1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
;                            short *diff, unsigned char *Predictor,
;                            int pitch);
;
; Computes a 4x4 block of 16-bit differences. For r, c in 0..3:
;   diff[r*pitch + c] = z[r*src_stride + c] - Predictor[r*pitch + c]
; The same pitch value steps both Predictor (in bytes) and diff (in shorts).
;
; Only XMM registers are used, so the x87/MMX state is left untouched and
; no emms is required before returning to the caller.
;
; Clobbers: rax, rcx, rdx, xmm0-xmm2 (rsi/rdi are saved and restored).
global sym(vp8_subtract_b_sse2_impl) PRIVATE
sym(vp8_subtract_b_sse2_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

    mov         rdi,        arg(2)          ;diff
    mov         rax,        arg(3)          ;Predictor
    mov         rsi,        arg(0)          ;z
    movsxd      rdx,        dword ptr arg(1);src_stride
    movsxd      rcx,        dword ptr arg(4);pitch
    pxor        xmm2,       xmm2            ; zero, used to widen bytes to words

    ; row 0
    movd        xmm0,       [rsi]           ; 4 source bytes
    movd        xmm1,       [rax]           ; 4 predictor bytes
    punpcklbw   xmm0,       xmm2            ; zero-extend to 16 bits
    punpcklbw   xmm1,       xmm2
    psubw       xmm0,       xmm1
    movq        [rdi],      xmm0            ; store 4 shorts

    ; row 1
    movd        xmm0,       [rsi+rdx]
    movd        xmm1,       [rax+rcx]
    punpcklbw   xmm0,       xmm2
    punpcklbw   xmm1,       xmm2
    psubw       xmm0,       xmm1
    movq        [rdi+rcx*2], xmm0           ; diff is short*, so pitch*2 bytes

    ; row 2
    movd        xmm0,       [rsi+rdx*2]
    movd        xmm1,       [rax+rcx*2]
    punpcklbw   xmm0,       xmm2
    punpcklbw   xmm1,       xmm2
    psubw       xmm0,       xmm1
    movq        [rdi+rcx*4], xmm0

    ; row 3: x86 addressing has no *3 scale, so pre-advance src by two rows
    ; and turn rcx into 3*pitch.
    lea         rsi,        [rsi+rdx*2]
    lea         rcx,        [rcx+rcx*2]

    movd        xmm0,       [rsi+rdx]       ; z + 3*src_stride
    movd        xmm1,       [rax+rcx]       ; Predictor + 3*pitch
    punpcklbw   xmm0,       xmm2
    punpcklbw   xmm1,       xmm2
    psubw       xmm0,       xmm1
    movq        [rdi+rcx*2], xmm0           ; diff + 3*pitch shorts

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
72
73
;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
;                           unsigned char *pred, int pred_stride)
;
; Writes src - pred as 16-bit values for 16 rows of 16 bytes each.
; diff is written contiguously (32 bytes per row); src and pred advance by
; their own strides. Every memory access is a movdqa, so diff, src and pred
; (and both strides) are assumed to be 16-byte aligned.
;
; Clobbers: rax, rcx, rdx, xmm0-xmm5 (rbx/rsi/rdi are saved and restored).
global sym(vp8_subtract_mby_sse2) PRIVATE
sym(vp8_subtract_mby_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Fetch the constant before rbx is repurposed: GET_GOT was handed rbx,
    ; so GLOBAL() may address through it (macros live in the included file
    ; -- confirm there).
    movdqa      xmm4,       [GLOBAL(t80)]
    push        rbx
    mov         rsi,        arg(1)          ; src
    mov         rax,        arg(3)          ; pred
    mov         rdi,        arg(0)          ; diff
    movsxd      rdx,        dword ptr arg(2); src_stride
    movsxd      rbx,        dword ptr arg(4); pred_stride
    mov         rcx,        8               ; 8 iterations x 2 rows

.row_pair:
    ; Load both rows up front; all loads precede all stores.
    movdqa      xmm0,       [rsi]           ; src,  row n
    movdqa      xmm1,       [rax]           ; pred, row n
    movdqa      xmm3,       [rsi + rdx]     ; src,  row n+1
    movdqa      xmm5,       [rax + rbx]     ; pred, row n+1

    ; ---- row n ----
    movdqa      xmm2,       xmm0            ; keep a copy of src
    psubb       xmm0,       xmm1            ; low 8 bits of src - pred
    pxor        xmm2,       xmm4            ; bias both operands so the
    pxor        xmm1,       xmm4            ;   signed compare acts unsigned
    pcmpgtb     xmm1,       xmm2            ; 0xff where pred > src (negative)

    movdqa      xmm2,       xmm0
    punpcklbw   xmm0,       xmm1            ; low 8 results  -> 8 shorts
    punpckhbw   xmm2,       xmm1            ; high 8 results -> 8 shorts

    movdqa      [rdi],      xmm0
    movdqa      [rdi +16],  xmm2

    ; ---- row n+1 ----
    movdqa      xmm1,       xmm3            ; keep a copy of src
    psubb       xmm3,       xmm5
    pxor        xmm1,       xmm4
    pxor        xmm5,       xmm4
    pcmpgtb     xmm5,       xmm1            ; 0xff where pred > src

    movdqa      xmm1,       xmm3
    punpcklbw   xmm3,       xmm5
    punpckhbw   xmm1,       xmm5

    movdqa      [rdi +32],  xmm3
    movdqa      [rdi +48],  xmm1

    ; advance two rows in each input, 64 bytes in the output
    lea         rsi,        [rsi+rdx*2]
    lea         rax,        [rax+rbx*2]
    add         rdi,        64
    dec         rcx
    jnz         .row_pair

    pop         rbx                         ; undo the push made after GET_GOT
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
145
; VP8_SUBTRACT_MBUV_PLANE_SSE2
; Subtracts one 8x8 plane, two 8-byte rows per iteration (4 iterations).
;   In:   rsi = src, rdx = src stride, rax = pred, rbx = pred stride,
;         rdi = output (16-byte aligned, written with movdqa),
;         xmm4 = sixteen 0x80 bytes
;   Out:  rdi advanced by 128 bytes, rsi/rax advanced by 8 rows, rcx = 0
;   Clobbers: rcx, xmm0-xmm3, flags
%macro VP8_SUBTRACT_MBUV_PLANE_SSE2 0
    mov         rcx,        4
%%row_pair:
    movq        xmm0,       [rsi]           ; src
    movq        xmm2,       [rsi+rdx]       ; src -- next line
    movq        xmm1,       [rax]           ; pred
    movq        xmm3,       [rax+rbx]       ; pred -- next line
    lea         rsi,        [rsi + rdx*2]
    lea         rax,        [rax + rbx*2]

    punpcklqdq  xmm0,       xmm2            ; two src rows in one register
    punpcklqdq  xmm1,       xmm3            ; two pred rows in one register

    movdqa      xmm2,       xmm0
    psubb       xmm0,       xmm1            ; low 8 bits of src - pred

    pxor        xmm1,       xmm4            ; bias so the signed compare
    pxor        xmm2,       xmm4            ;   orders bytes as unsigned
    pcmpgtb     xmm1,       xmm2            ; 0xff where pred > src (negative)

    movdqa      xmm2,       xmm0
    punpcklbw   xmm0,       xmm1            ; first row  -> 8 shorts
    punpckhbw   xmm2,       xmm1            ; second row -> 8 shorts
                                            ; (punpcklbw leaves xmm1 intact,
                                            ;  so no copy of the mask is needed)

    movdqa      [rdi],      xmm0            ; store difference
    movdqa      [rdi +16],  xmm2            ; store difference
    add         rdi,        32
    sub         rcx,        1
    jnz         %%row_pair
%endmacro

;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc,
;                         int src_stride, unsigned char *upred,
;                         unsigned char *vpred, int pred_stride)
;
; Writes usrc - upred (8x8) followed by vsrc - vpred (8x8) as 16-bit values,
; starting 256 shorts into diff. Both planes share src_stride and
; pred_stride. The output is stored with movdqa, so diff is assumed to be
; 16-byte aligned; the 8-byte source/predictor loads use movq and carry no
; alignment requirement.
;
; Clobbers: rax, rcx, rdx, xmm0-xmm4 (rbx/rsi/rdi are saved and restored).
global sym(vp8_subtract_mbuv_sse2) PRIVATE
sym(vp8_subtract_mbuv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

    ; Fetch the constant before rbx is repurposed: GET_GOT was handed rbx,
    ; so GLOBAL() may address through it (macros live in the included file
    ; -- confirm there).
    movdqa      xmm4,       [GLOBAL(t80)]
    mov         rdi,        arg(0)          ;diff
    mov         rsi,        arg(1)          ;usrc
    movsxd      rdx,        dword ptr arg(3);src_stride
    mov         rax,        arg(4)          ;upred
    add         rdi,        256*2           ;diff = diff + 256 (shorts)
    push        rbx
    movsxd      rbx,        dword ptr arg(6);pred_stride

    ;u
    VP8_SUBTRACT_MBUV_PLANE_SSE2

    ;v -- rdi already points just past the U differences
    mov         rsi,        arg(2)          ;vsrc
    mov         rax,        arg(5)          ;vpred
    VP8_SUBTRACT_MBUV_PLANE_SSE2

    pop         rbx                         ; undo the push made after GET_GOT
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
241
SECTION_RODATA
; Sixteen 0x80 bytes. The subtract routines XOR this into both operands
; before pcmpgtb: flipping the top bit of each byte makes the signed byte
; compare order the values as if they were unsigned.
; Loaded with movdqa, hence the 16-byte alignment.
align 16
t80:
    times 16 db 0x80
246