;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13%define _t0 0
14%define _t1 _t0 + 16
15%define _p3 _t1 + 16
16%define _p2 _p3 + 16
17%define _p1 _p2 + 16
18%define _p0 _p1 + 16
19%define _q0 _p0 + 16
20%define _q1 _q0 + 16
21%define _q2 _q1 + 16
22%define _q3 _q2 + 16
23%define lf_var_size 160
24
25; Use of pmaxub instead of psubusb to compute filter mask was seen
26; in ffvp8
27
28%macro LFH_FILTER_AND_HEV_MASK 1
29%if %1
30        movdqa      xmm2,                   [rdi+2*rax]       ; q3
31        movdqa      xmm1,                   [rsi+2*rax]       ; q2
32        movdqa      xmm4,                   [rsi+rax]         ; q1
33        movdqa      xmm5,                   [rsi]             ; q0
34        neg         rax                     ; negate pitch to deal with above border
35%else
36        movlps      xmm2,                   [rsi + rcx*2]     ; q3
37        movlps      xmm1,                   [rsi + rcx]       ; q2
38        movlps      xmm4,                   [rsi]             ; q1
39        movlps      xmm5,                   [rsi + rax]       ; q0
40
41        movhps      xmm2,                   [rdi + rcx*2]
42        movhps      xmm1,                   [rdi + rcx]
43        movhps      xmm4,                   [rdi]
44        movhps      xmm5,                   [rdi + rax]
45
46        lea         rsi,                    [rsi + rax*4]
47        lea         rdi,                    [rdi + rax*4]
48
49        movdqa      [rsp+_q2],              xmm1              ; store q2
50        movdqa      [rsp+_q1],              xmm4              ; store q1
51%endif
52        movdqa      xmm7,                   [rdx]             ;limit
53
54        movdqa      xmm6,                   xmm1              ; q2
55        movdqa      xmm3,                   xmm4              ; q1
56
57        psubusb     xmm1,                   xmm2              ; q2-=q3
58        psubusb     xmm2,                   xmm6              ; q3-=q2
59
60        psubusb     xmm4,                   xmm6              ; q1-=q2
61        psubusb     xmm6,                   xmm3              ; q2-=q1
62
63        por         xmm4,                   xmm6              ; abs(q2-q1)
64        por         xmm1,                   xmm2              ; abs(q3-q2)
65
66        movdqa      xmm0,                   xmm5              ; q0
67        pmaxub      xmm1,                   xmm4
68
69        psubusb     xmm5,                   xmm3              ; q0-=q1
70        psubusb     xmm3,                   xmm0              ; q1-=q0
71
72        por         xmm5,                   xmm3              ; abs(q0-q1)
73        movdqa      [rsp+_t0],              xmm5              ; save to t0
74
75        pmaxub      xmm1,                   xmm5
76
77%if %1
78        movdqa      xmm2,                   [rsi+4*rax]       ; p3
79        movdqa      xmm4,                   [rdi+4*rax]       ; p2
80        movdqa      xmm6,                   [rsi+2*rax]       ; p1
81%else
82        movlps      xmm2,                   [rsi + rax]       ; p3
83        movlps      xmm4,                   [rsi]             ; p2
84        movlps      xmm6,                   [rsi + rcx]       ; p1
85
86        movhps      xmm2,                   [rdi + rax]
87        movhps      xmm4,                   [rdi]
88        movhps      xmm6,                   [rdi + rcx]
89
90        movdqa      [rsp+_p2],              xmm4              ; store p2
91        movdqa      [rsp+_p1],              xmm6              ; store p1
92%endif
93
94        movdqa      xmm5,                   xmm4              ; p2
95        movdqa      xmm3,                   xmm6              ; p1
96
97        psubusb     xmm4,                   xmm2              ; p2-=p3
98        psubusb     xmm2,                   xmm5              ; p3-=p2
99
100        psubusb     xmm3,                   xmm5              ; p1-=p2
101        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)
102
103        psubusb     xmm5,                   xmm6              ; p2-=p1
104        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)
105
106        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
107        movdqa      xmm2,                   xmm6              ; p1
108
109        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
110%if %1
111        movdqa      xmm4,                   [rsi+rax]         ; p0
112        movdqa      xmm3,                   [rdi]             ; q1
113%else
114        movlps      xmm4,                   [rsi + rcx*2]     ; p0
115        movhps      xmm4,                   [rdi + rcx*2]
116        movdqa      xmm3,                   [rsp+_q1]                ; q1
117%endif
118
119        movdqa      xmm5,                   xmm4              ; p0
120        psubusb     xmm4,                   xmm6              ; p0-=p1
121
122        psubusb     xmm6,                   xmm5              ; p1-=p0
123
124        por         xmm6,                   xmm4              ; abs(p1 - p0)
125        mov         rdx,                    arg(2)            ; get blimit
126
127        movdqa     [rsp+_t1],               xmm6              ; save to t1
128
129        movdqa      xmm4,                   xmm3              ; q1
130        pmaxub      xmm1,                   xmm6
131
132        psubusb     xmm3,                   xmm2              ; q1-=p1
133        psubusb     xmm2,                   xmm4              ; p1-=q1
134
135        psubusb     xmm1,                   xmm7
136        por         xmm2,                   xmm3              ; abs(p1-q1)
137
138        movdqa      xmm7,                   [rdx]             ; blimit
139        mov         rdx,                    arg(4)            ; hev get thresh
140
141        movdqa      xmm3,                   xmm0              ; q0
142        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero
143
144        movdqa      xmm6,                   xmm5              ; p0
145        psrlw       xmm2,                   1                 ; abs(p1-q1)/2
146
147        psubusb     xmm5,                   xmm3              ; p0-=q0
148        psubusb     xmm3,                   xmm6              ; q0-=p0
149        por         xmm5,                   xmm3              ; abs(p0 - q0)
150
151        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
152
153        movdqa      xmm4,                   [rsp+_t0]                ; hev get abs (q1 - q0)
154        movdqa      xmm3,                   [rsp+_t1]                ; get abs (p1 - p0)
155
156        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
157
158        movdqa      xmm2,                   [rdx]             ; hev
159
160        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
161        psubusb     xmm4,                   xmm2              ; hev
162
163        psubusb     xmm3,                   xmm2              ; hev
164        por         xmm1,                   xmm5
165
166        pxor        xmm7,                   xmm7
167        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
168
169        pcmpeqb     xmm4,                   xmm5              ; hev
170        pcmpeqb     xmm3,                   xmm3              ; hev
171
172        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
173        pxor        xmm4,                   xmm3              ; hev
174%endmacro
175
176%macro B_FILTER 1
177        movdqa      xmm3,                   [GLOBAL(t80)]
178%if %1 == 0
179        movdqa      xmm2,                   [rsp+_p1]                ; p1
180        movdqa      xmm7,                   [rsp+_q1]                ; q1
181%elif %1 == 1
182        movdqa      xmm2,                   [rsi+2*rax]       ; p1
183        movdqa      xmm7,                   [rdi]             ; q1
184%elif %1 == 2
185        movdqa      xmm2,                   [rsp+_p1]         ; p1
186        movdqa      xmm6,                   [rsp+_p0]         ; p0
187        movdqa      xmm0,                   [rsp+_q0]         ; q0
188        movdqa      xmm7,                   [rsp+_q1]         ; q1
189%endif
190
191        pxor        xmm2,                   xmm3              ; p1 offset to convert to signed values
192        pxor        xmm7,                   xmm3              ; q1 offset to convert to signed values
193
194        psubsb      xmm2,                   xmm7              ; p1 - q1
195        pxor        xmm6,                   xmm3              ; offset to convert to signed values
196
197        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
198        pxor        xmm0,                   xmm3              ; offset to convert to signed values
199
200        movdqa      xmm3,                   xmm0              ; q0
201        psubsb      xmm0,                   xmm6              ; q0 - p0
202        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
203        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
204        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
205        pand        xmm1,                   xmm2              ; mask filter values we don't care about
206
207        movdqa      xmm2,                   xmm1
208        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
209        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
210
211        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
212        punpcklbw   xmm2,                   xmm2              ; exfxgxhx
213
214        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
215        psraw       xmm5,                   11                ; sign extended shift right by 3
216
217        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
218        psraw       xmm2,                   11                ; sign extended shift right by 3
219
220        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
221        psraw       xmm0,                   11                ; sign extended shift right by 3
222
223        psraw       xmm1,                   11                ; sign extended shift right by 3
224        movdqa      xmm5,                   xmm0              ; save results
225
226        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
227
228        paddsb      xmm6,                   xmm2              ; p0+= p0 add
229
230        movdqa      xmm2,                   [GLOBAL(ones)]
231        paddsw      xmm5,                   xmm2
232        paddsw      xmm1,                   xmm2
233        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap
234        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap
235        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
236        movdqa      xmm2,                   [GLOBAL(t80)]
237
238%if %1 == 0
239        movdqa      xmm1,                   [rsp+_p1]         ; p1
240        lea         rsi,                    [rsi + rcx*2]
241        lea         rdi,                    [rdi + rcx*2]
242%elif %1 == 1
243        movdqa      xmm1,                   [rsi+2*rax]       ; p1
244%elif %1 == 2
245        movdqa      xmm1,                   [rsp+_p1]         ; p1
246%endif
247
248        pandn       xmm4,                   xmm5              ; high edge variance additive
249        pxor        xmm6,                   xmm2              ; unoffset
250
251        pxor        xmm1,                   xmm2              ; reoffset
252        psubsb      xmm3,                   xmm0              ; q0-= q0 add
253
254        paddsb      xmm1,                   xmm4              ; p1+= p1 add
255        pxor        xmm3,                   xmm2              ; unoffset
256
257        pxor        xmm1,                   xmm2              ; unoffset
258        psubsb      xmm7,                   xmm4              ; q1-= q1 add
259
260        pxor        xmm7,                   xmm2              ; unoffset
261%if %1 == 0
262        movq        [rsi],                  xmm6              ; p0
263        movhps      [rdi],                  xmm6
264        movq        [rsi + rax],            xmm1              ; p1
265        movhps      [rdi + rax],            xmm1
266        movq        [rsi + rcx],            xmm3              ; q0
267        movhps      [rdi + rcx],            xmm3
268        movq        [rsi + rcx*2],          xmm7              ; q1
269        movhps      [rdi + rcx*2],          xmm7
270%elif %1 == 1
271        movdqa      [rsi+rax],              xmm6              ; write back
272        movdqa      [rsi+2*rax],            xmm1              ; write back
273        movdqa      [rsi],                  xmm3              ; write back
274        movdqa      [rdi],                  xmm7              ; write back
275%endif
276
277%endmacro
278
279SECTION .text
280
281%if ABI_IS_32BIT
282
283;void vp8_loop_filter_horizontal_edge_sse2
284;(
285;    unsigned char *src_ptr,
286;    int            src_pixel_step,
287;    const char    *blimit,
288;    const char    *limit,
289;    const char    *thresh,
290;)
291global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
292sym(vp8_loop_filter_horizontal_edge_sse2):
293    push        rbp
294    mov         rbp, rsp
295    SHADOW_ARGS_TO_STACK 5
296    SAVE_XMM 7
297    GET_GOT     rbx
298    push        rsi
299    push        rdi
300    ; end prolog
301
302    ALIGN_STACK 16, rax
303    sub         rsp, lf_var_size
304
305        mov         rsi,                    arg(0)           ;src_ptr
306        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step
307
308        mov         rdx,                    arg(3)           ;limit
309
310        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing
311
312        ; calculate breakout conditions and high edge variance
313        LFH_FILTER_AND_HEV_MASK 1
314        ; filter and write back the result
315        B_FILTER 1
316
317    add rsp, lf_var_size
318    pop rsp
319    ; begin epilog
320    pop rdi
321    pop rsi
322    RESTORE_GOT
323    RESTORE_XMM
324    UNSHADOW_ARGS
325    pop         rbp
326    ret
327
328%endif
329
330;void vp8_loop_filter_horizontal_edge_uv_sse2
331;(
332;    unsigned char *src_ptr,
333;    int            src_pixel_step,
334;    const char    *blimit,
335;    const char    *limit,
336;    const char    *thresh,
337;    int            count
338;)
339global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
340sym(vp8_loop_filter_horizontal_edge_uv_sse2):
341    push        rbp
342    mov         rbp, rsp
343    SHADOW_ARGS_TO_STACK 6
344    SAVE_XMM 7
345    GET_GOT     rbx
346    push        rsi
347    push        rdi
348    ; end prolog
349
350    ALIGN_STACK 16, rax
351    sub         rsp, lf_var_size
352
353        mov         rsi,                    arg(0)             ; u
354        mov         rdi,                    arg(5)             ; v
355        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
356        mov         rcx,                    rax
357        neg         rax                     ; negate pitch to deal with above border
358
359        mov         rdx,                    arg(3)             ;limit
360
361        lea         rsi,                    [rsi + rcx]
362        lea         rdi,                    [rdi + rcx]
363
364        ; calculate breakout conditions and high edge variance
365        LFH_FILTER_AND_HEV_MASK 0
366        ; filter and write back the result
367        B_FILTER 0
368
369    add rsp, lf_var_size
370    pop rsp
371    ; begin epilog
372    pop rdi
373    pop rsi
374    RESTORE_GOT
375    RESTORE_XMM
376    UNSHADOW_ARGS
377    pop         rbp
378    ret
379
380
381%macro MB_FILTER_AND_WRITEBACK 1
382        movdqa      xmm3,                   [GLOBAL(t80)]
383%if %1 == 0
384        movdqa      xmm2,                   [rsp+_p1]              ; p1
385        movdqa      xmm7,                   [rsp+_q1]              ; q1
386%elif %1 == 1
387        movdqa      xmm2,                   [rsi+2*rax]     ; p1
388        movdqa      xmm7,                   [rdi]           ; q1
389
390        mov         rcx,                    rax
391        neg         rcx
392%elif %1 == 2
393        movdqa      xmm2,                   [rsp+_p1]       ; p1
394        movdqa      xmm6,                   [rsp+_p0]       ; p0
395        movdqa      xmm0,                   [rsp+_q0]       ; q0
396        movdqa      xmm7,                   [rsp+_q1]       ; q1
397%endif
398
399        pxor        xmm2,                   xmm3            ; p1 offset to convert to signed values
400        pxor        xmm7,                   xmm3            ; q1 offset to convert to signed values
401        pxor        xmm6,                   xmm3            ; offset to convert to signed values
402        pxor        xmm0,                   xmm3            ; offset to convert to signed values
403
404        psubsb      xmm2,                   xmm7            ; p1 - q1
405
406        movdqa      xmm3,                   xmm0            ; q0
407        psubsb      xmm0,                   xmm6            ; q0 - p0
408        paddsb      xmm2,                   xmm0            ; 1 * (q0 - p0) + (p1 - q1)
409        paddsb      xmm2,                   xmm0            ; 2 * (q0 - p0)
410        paddsb      xmm2,                   xmm0            ; 3 * (q0 - p0) + (p1 - q1)
411        pand        xmm1,                   xmm2            ; mask filter values we don't care about
412
413        movdqa      xmm2,                   xmm1            ; vp8_filter
414
415        pand        xmm2,                   xmm4            ; Filter2 = vp8_filter & hev
416        pxor        xmm0,                   xmm0
417
418        pandn       xmm4,                   xmm1            ; vp8_filter&=~hev
419        pxor        xmm1,                   xmm1
420
421        punpcklbw   xmm0,                   xmm4            ; Filter 2 (hi)
422        punpckhbw   xmm1,                   xmm4            ; Filter 2 (lo)
423
424        movdqa      xmm5,                   xmm2
425
426        movdqa      xmm4,                   [GLOBAL(s9)]
427        paddsb      xmm5,                   [GLOBAL(t3)]    ; vp8_signed_char_clamp(Filter2 + 3)
428        paddsb      xmm2,                   [GLOBAL(t4)]    ; vp8_signed_char_clamp(Filter2 + 4)
429
430        pmulhw      xmm1,                   xmm4            ; Filter 2 (lo) * 9
431        pmulhw      xmm0,                   xmm4            ; Filter 2 (hi) * 9
432
433        punpckhbw   xmm7,                   xmm5            ; axbxcxdx
434        punpcklbw   xmm5,                   xmm5            ; exfxgxhx
435
436        psraw       xmm7,                   11              ; sign extended shift right by 3
437
438        psraw       xmm5,                   11              ; sign extended shift right by 3
439        punpckhbw   xmm4,                   xmm2            ; axbxcxdx
440
441        punpcklbw   xmm2,                   xmm2            ; exfxgxhx
442        psraw       xmm4,                   11              ; sign extended shift right by 3
443
444        packsswb    xmm5,                   xmm7            ; Filter2 >>=3;
445        psraw       xmm2,                   11              ; sign extended shift right by 3
446
447        packsswb    xmm2,                   xmm4            ; Filter1 >>=3;
448
449        paddsb      xmm6,                   xmm5            ; ps0 =ps0 + Fitler2
450
451        psubsb      xmm3,                   xmm2            ; qs0 =qs0 - Filter1
452        movdqa      xmm7,                   xmm1
453
454        movdqa      xmm4,                   [GLOBAL(s63)]
455        movdqa      xmm5,                   xmm0
456        movdqa      xmm2,                   xmm5
457        paddw       xmm0,                   xmm4            ; Filter 2 (hi) * 9 + 63
458        paddw       xmm1,                   xmm4            ; Filter 2 (lo) * 9 + 63
459        movdqa      xmm4,                   xmm7
460
461        paddw       xmm5,                   xmm5            ; Filter 2 (hi) * 18
462
463        paddw       xmm7,                   xmm7            ; Filter 2 (lo) * 18
464        paddw       xmm5,                   xmm0            ; Filter 2 (hi) * 27 + 63
465
466        paddw       xmm7,                   xmm1            ; Filter 2 (lo) * 27 + 63
467        paddw       xmm2,                   xmm0            ; Filter 2 (hi) * 18 + 63
468        psraw       xmm0,                   7               ; (Filter 2 (hi) * 9 + 63) >> 7
469
470        paddw       xmm4,                   xmm1            ; Filter 2 (lo) * 18 + 63
471        psraw       xmm1,                   7               ; (Filter 2 (lo) * 9 + 63) >> 7
472        psraw       xmm2,                   7               ; (Filter 2 (hi) * 18 + 63) >> 7
473
474        packsswb    xmm0,                   xmm1            ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
475
476        psraw       xmm4,                   7               ; (Filter 2 (lo) * 18 + 63) >> 7
477        psraw       xmm5,                   7               ; (Filter 2 (hi) * 27 + 63) >> 7
478        psraw       xmm7,                   7               ; (Filter 2 (lo) * 27 + 63) >> 7
479
480        packsswb    xmm5,                   xmm7            ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
481        packsswb    xmm2,                   xmm4            ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
482        movdqa      xmm7,                   [GLOBAL(t80)]
483
484%if %1 == 0
485        movdqa      xmm1,                   [rsp+_q1]       ; q1
486        movdqa      xmm4,                   [rsp+_p1]       ; p1
487        lea         rsi,                    [rsi+rcx*2]
488        lea         rdi,                    [rdi+rcx*2]
489
490%elif %1 == 1
491        movdqa      xmm1,                   [rdi]           ; q1
492        movdqa      xmm4,                   [rsi+rax*2]     ; p1
493%elif %1 == 2
494        movdqa      xmm4,                   [rsp+_p1]       ; p1
495        movdqa      xmm1,                   [rsp+_q1]       ; q1
496%endif
497
498        pxor        xmm1,                   xmm7
499        pxor        xmm4,                   xmm7
500
501        psubsb      xmm3,                   xmm5            ; sq = vp8_signed_char_clamp(qs0 - u3)
502        paddsb      xmm6,                   xmm5            ; sp = vp8_signed_char_clamp(ps0 - u3)
503        psubsb      xmm1,                   xmm2            ; sq = vp8_signed_char_clamp(qs1 - u2)
504        paddsb      xmm4,                   xmm2            ; sp = vp8_signed_char_clamp(ps1 - u2)
505
506%if %1 == 1
507        movdqa      xmm2,                   [rdi+rax*4]     ; p2
508        movdqa      xmm5,                   [rdi+rcx]       ; q2
509%else
510        movdqa      xmm2,                   [rsp+_p2]       ; p2
511        movdqa      xmm5,                   [rsp+_q2]       ; q2
512%endif
513
514        pxor        xmm1,                   xmm7            ; *oq1 = sq^0x80;
515        pxor        xmm4,                   xmm7            ; *op1 = sp^0x80;
516        pxor        xmm2,                   xmm7
517        pxor        xmm5,                   xmm7
518        paddsb      xmm2,                   xmm0            ; sp = vp8_signed_char_clamp(ps2 - u)
519        psubsb      xmm5,                   xmm0            ; sq = vp8_signed_char_clamp(qs2 - u)
520        pxor        xmm2,                   xmm7            ; *op2 = sp^0x80;
521        pxor        xmm5,                   xmm7            ; *oq2 = sq^0x80;
522        pxor        xmm3,                   xmm7            ; *oq0 = sq^0x80
523        pxor        xmm6,                   xmm7            ; *oq0 = sp^0x80
524%if %1 == 0
525        movq        [rsi],                  xmm6            ; p0
526        movhps      [rdi],                  xmm6
527        movq        [rsi + rcx],            xmm3            ; q0
528        movhps      [rdi + rcx],            xmm3
529        lea         rdx,                    [rcx + rcx*2]
530        movq        [rsi+rcx*2],            xmm1            ; q1
531        movhps      [rdi+rcx*2],            xmm1
532
533        movq        [rsi + rax],            xmm4            ; p1
534        movhps      [rdi + rax],            xmm4
535
536        movq        [rsi+rax*2],            xmm2            ; p2
537        movhps      [rdi+rax*2],            xmm2
538
539        movq        [rsi+rdx],              xmm5            ; q2
540        movhps      [rdi+rdx],              xmm5
541%elif %1 == 1
542        movdqa      [rdi+rcx],              xmm5            ; q2
543        movdqa      [rdi],                  xmm1            ; q1
544        movdqa      [rsi],                  xmm3            ; q0
545        movdqa      [rsi+rax  ],            xmm6            ; p0
546        movdqa      [rsi+rax*2],            xmm4            ; p1
547        movdqa      [rdi+rax*4],            xmm2            ; p2
548%elif %1 == 2
549        movdqa      [rsp+_p1],              xmm4            ; p1
550        movdqa      [rsp+_p0],              xmm6            ; p0
551        movdqa      [rsp+_q0],              xmm3            ; q0
552        movdqa      [rsp+_q1],              xmm1            ; q1
553%endif
554
555%endmacro
556
557
558;void vp8_mbloop_filter_horizontal_edge_sse2
559;(
560;    unsigned char *src_ptr,
561;    int            src_pixel_step,
562;    const char    *blimit,
563;    const char    *limit,
564;    const char    *thresh,
565;)
566global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
567sym(vp8_mbloop_filter_horizontal_edge_sse2):
568    push        rbp
569    mov         rbp, rsp
570    SHADOW_ARGS_TO_STACK 5
571    SAVE_XMM 7
572    GET_GOT     rbx
573    push        rsi
574    push        rdi
575    ; end prolog
576
577    ALIGN_STACK 16, rax
578    sub         rsp, lf_var_size
579
580        mov         rsi,                    arg(0)            ;src_ptr
581        movsxd      rax,                    dword ptr arg(1)  ;src_pixel_step
582        mov         rdx,                    arg(3)            ;limit
583
584        lea         rdi,                    [rsi+rax]         ; rdi points to row +1 for indirect addressing
585
586        ; calculate breakout conditions and high edge variance
587        LFH_FILTER_AND_HEV_MASK 1
588        ; filter and write back the results
589        MB_FILTER_AND_WRITEBACK 1
590
591    add rsp, lf_var_size
592    pop rsp
593    ; begin epilog
594    pop rdi
595    pop rsi
596    RESTORE_GOT
597    RESTORE_XMM
598    UNSHADOW_ARGS
599    pop         rbp
600    ret
601
602
603;void vp8_mbloop_filter_horizontal_edge_uv_sse2
604;(
605;    unsigned char *u,
606;    int            src_pixel_step,
607;    const char    *blimit,
608;    const char    *limit,
609;    const char    *thresh,
610;    unsigned char *v
611;)
612global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
613sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
614    push        rbp
615    mov         rbp, rsp
616    SHADOW_ARGS_TO_STACK 6
617    SAVE_XMM 7
618    GET_GOT     rbx
619    push        rsi
620    push        rdi
621    ; end prolog
622
623    ALIGN_STACK 16, rax
624    sub         rsp, lf_var_size
625
626        mov         rsi,                    arg(0)             ; u
627        mov         rdi,                    arg(5)             ; v
628        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
629        mov         rcx,                    rax
630        neg         rax                     ; negate pitch to deal with above border
631        mov         rdx,                    arg(3)             ;limit
632
633        lea         rsi,                    [rsi + rcx]
634        lea         rdi,                    [rdi + rcx]
635
636        ; calculate breakout conditions and high edge variance
637        LFH_FILTER_AND_HEV_MASK 0
638        ; filter and write back the results
639        MB_FILTER_AND_WRITEBACK 0
640
641    add rsp, lf_var_size
642    pop rsp
643    ; begin epilog
644    pop rdi
645    pop rsi
646    RESTORE_GOT
647    RESTORE_XMM
648    UNSHADOW_ARGS
649    pop         rbp
650    ret
651
652
653%macro TRANSPOSE_16X8 2
654        movq        xmm4,               [rsi]           ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
655        movq        xmm1,               [rdi]           ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
656        movq        xmm0,               [rsi+2*rax]     ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
657        movq        xmm7,               [rdi+2*rax]     ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
658        movq        xmm5,               [rsi+4*rax]     ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
659        movq        xmm2,               [rdi+4*rax]     ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
660
661        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
662
663        movq        xmm1,               [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
664
665        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
666        punpcklbw   xmm0,               xmm7            ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20
667
668        movq        xmm7,               [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
669
670        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
671%if %1
672        lea         rsi,                [rsi+rax*8]
673        lea         rdi,                [rdi+rax*8]
674%else
675        mov         rsi,                arg(5)          ; v_ptr
676%endif
677
678        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
679        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
680        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
681        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
682        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
683
684%if %1 == 0
685        lea         rdi,                [rsi + rax - 4] ; rdi points to row +1 for indirect addressing
686        lea         rsi,                [rsi - 4]
687%endif
688
689        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
690        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
691
692        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
693        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
694
695        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
696
697        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
698
699        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
700
701        movdqa      [rsp+_t0],          xmm2            ; save to free XMM2
702
703        movq        xmm2,               [rsi]           ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
704        movq        xmm6,               [rdi]           ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
705        movq        xmm0,               [rsi+2*rax]     ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
706        movq        xmm5,               [rdi+2*rax]     ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
707        movq        xmm1,               [rsi+4*rax]     ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
708
709        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
710
711        movq        xmm6,               [rdi+4*rax]     ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
712
713        punpcklbw   xmm0,               xmm5            ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
714
715        movq        xmm5,               [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
716
717        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0
718
719        movq        xmm6,               [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
720
721        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
722
723        movdqa      xmm6,               xmm1            ;
724        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
725
726        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
727        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
728
729        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
730
731        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
732
733        movdqa      xmm0,               xmm5
734        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
735
736        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
737        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
738
739        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
740
741        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
742        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
743
744        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
745
746        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
747
748%if %2 == 0
749        movdqa      [rsp+_q3],          xmm7            ; save 7
750        movdqa      [rsp+_q2],          xmm6            ; save 6
751%endif
752        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
753        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
754        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
755        movdqa      [rsp+_p1],          xmm2            ; save 2
756
757        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
758        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
759        movdqa      [rsp+_p0],          xmm3            ; save 3
760
761        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
762
763        movdqa      [rsp+_q0],          xmm4            ; save 4
764        movdqa      [rsp+_q1],          xmm5            ; save 5
765        movdqa      xmm1,               [rsp+_t0]
766
767        movdqa      xmm2,               xmm1            ;
768        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
769        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
770
771%if %2 == 0
772        movdqa      [rsp+_p2],          xmm1
773        movdqa      [rsp+_p3],          xmm2
774%endif
775
776%endmacro
777
;------------------------------------------------------------------------------
; LFV_FILTER_MASK_HEV_MASK
; Build the loop-filter mask and the high-edge-variance (hev) mask for a
; vertical edge, from the transposed pixel columns produced by TRANSPOSE_16X8
; (each xmm register / stack slot holds one column, one byte per image row).
;
; In:   xmm1 = p2, xmm2 = p3, xmm3 = p0, xmm5 = q1, xmm6 = q2, xmm7 = q3
;       [rsp+_p1] = p1, [rsp+_q0] = q0, [rsp+_q1] = q1
;       arg(2) = blimit, arg(3) = limit, arg(4) = thresh (16-byte vectors)
; Out:  xmm1 = filter mask: 0xff in byte lanes that pass the limit/blimit tests
;       xmm4 = hev mask:    0xff where |p1-p0| > thresh or |q1-q0| > thresh
;              (only meaningful in lanes where the filter mask is set; see the
;              note at the pcmpeqb below)
; Uses: rdx; clobbers xmm0, xmm2, xmm3, xmm5, xmm6, xmm7, flags
;------------------------------------------------------------------------------
%macro LFV_FILTER_MASK_HEV_MASK 0
        movdqa      xmm0,               xmm6            ; q2
        psubusb     xmm0,               xmm7            ; q2-q3

        psubusb     xmm7,               xmm6            ; q3-q2
        movdqa      xmm4,               xmm5            ; q1

        por         xmm7,               xmm0            ; abs (q3-q2)
        psubusb     xmm4,               xmm6            ; q1-q2

        movdqa      xmm0,               xmm1
        psubusb     xmm6,               xmm5            ; q2-q1

        por         xmm6,               xmm4            ; abs (q2-q1)
        psubusb     xmm0,               xmm2            ; p2 - p3;

        psubusb     xmm2,               xmm1            ; p3 - p2;
        por         xmm0,               xmm2            ; abs(p2-p3)

        movdqa      xmm5,               [rsp+_p1]       ; p1
        pmaxub      xmm0,               xmm7            ; running max of the abs differences

        movdqa      xmm2,               xmm5            ; p1
        psubusb     xmm5,               xmm1            ; p1-p2
        psubusb     xmm1,               xmm2            ; p2-p1

        movdqa      xmm7,               xmm3            ; p0
        psubusb     xmm7,               xmm2            ; p0-p1

        por         xmm1,               xmm5            ; abs(p2-p1)
        pmaxub      xmm0,               xmm6

        pmaxub      xmm0,               xmm1
        movdqa      xmm1,               xmm2            ; p1

        psubusb     xmm2,               xmm3            ; p1-p0

        por         xmm2,               xmm7            ; abs(p1-p0)

        pmaxub      xmm0,               xmm2

        movdqa      xmm5,               [rsp+_q0]       ; q0
        movdqa      xmm7,               [rsp+_q1]       ; q1

        mov         rdx,                arg(3)          ; limit

        movdqa      xmm6,               xmm5            ; q0
        movdqa      xmm4,               xmm7            ; q1

        psubusb     xmm5,               xmm7            ; q0-q1
        psubusb     xmm7,               xmm6            ; q1-q0

        por         xmm7,               xmm5            ; abs(q1-q0)

        pmaxub      xmm0,               xmm7            ; max of all inner abs differences

        psubusb     xmm0,               [rdx]           ; limit: non-zero lanes exceed it

        mov         rdx,                arg(2)          ; blimit
        movdqa      xmm5,               xmm4            ; q1

        psubusb     xmm5,               xmm1            ; q1-=p1
        psubusb     xmm1,               xmm4            ; p1-=q1

        por         xmm5,               xmm1            ; abs(p1-q1)
        movdqa      xmm1,               xmm3            ; p0

        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
        psubusb     xmm1,               xmm6            ; p0-q0

        movdqa      xmm4,               [rdx]           ; blimit
        mov         rdx,                arg(4)          ; get thresh

        psrlw       xmm5,               1               ; abs(p1-q1)/2
        psubusb     xmm6,               xmm3            ; q0-p0

        por         xmm1,               xmm6            ; abs(q0-p0)
        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
        movdqa      xmm3,               [rdx]           ; thresh

        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm2,               xmm3            ; abs(p1 - p0) > thresh (xmm2 = abs(p1-p0))

        psubusb     xmm7,               xmm3            ; abs(q1 - q0) > thresh (xmm7 = abs(q1-q0))

        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        por         xmm2,               xmm7            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        por         xmm1,               xmm0            ; mask
        ; NOTE: xmm0 (the limit-test residue) has not been zeroed yet, so this
        ; compares xmm2 against a possibly non-zero value.  That is safe:
        ; wherever xmm0 is non-zero the final mask lane becomes 0 (xmm1 |= xmm0
        ; above, then compare-to-zero below), so the hev lane is never
        ; consulted there; wherever filtering can occur xmm0 is 0 and this is
        ; an ordinary compare-to-zero producing ~hev.
        pcmpeqb     xmm2,               xmm0

        pxor        xmm0,               xmm0            ; zero for the mask compare
        pcmpeqb     xmm4,               xmm4            ; all 0xff

        pcmpeqb     xmm1,               xmm0            ; mask: 0xff where within limit and blimit
        pxor        xmm4,               xmm2            ; hev mask = ~(~hev)
%endmacro
875
;------------------------------------------------------------------------------
; BV_TRANSPOSE
; Re-transpose the four filtered middle columns (p1, p0, q0, q1) back into
; per-row 4-byte groups ready for BV_WRITEBACK.
; In:   xmm1 = p1, xmm6 = p0, xmm3 = q0, xmm7 = q1
;       (each register: one column, 16 rows, lowest byte = lowest row)
; Out:  xmm2 = rows 0-3, xmm6 = rows 4-7, xmm1 = rows 8-b, xmm5 = rows c-f
;       (each register: 4 bytes "q1 q0 p0 p1" per row, lowest dword = lowest
;       row of the group)
; Clobbers: xmm3, xmm4, flags
;------------------------------------------------------------------------------
%macro BV_TRANSPOSE 0
        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro
905
;------------------------------------------------------------------------------
; BV_WRITEBACK %1, %2
; Store the re-transposed filter output: 4 bytes per row (p1 p0 q0 q1, at
; byte offset +2 from the block-edge pointer), 8 rows total.
; %1 supplies rows 0-3 relative to rsi, %2 supplies rows 4-7.
; Assumes: rdi = rsi + rax, rcx = 3*rax (rax = src_pixel_step).
; Destroys %1 and %2 (shifted down in place to peel off each dword).
;------------------------------------------------------------------------------
%macro BV_WRITEBACK 2
        movd        [rsi+2],            %1              ; row 0
        movd        [rsi+4*rax+2],      %2              ; row 4
        psrldq      %1,                 4
        psrldq      %2,                 4
        movd        [rdi+2],            %1              ; row 1
        movd        [rdi+4*rax+2],      %2              ; row 5
        psrldq      %1,                 4
        psrldq      %2,                 4
        movd        [rsi+2*rax+2],      %1              ; row 2
        movd        [rsi+2*rcx+2],      %2              ; row 6 (rcx = 3*rax)
        psrldq      %1,                 4
        psrldq      %2,                 4
        movd        [rdi+2*rax+2],      %1              ; row 3
        movd        [rdi+2*rcx+2],      %2              ; row 7
%endmacro
922
%if ABI_IS_32BIT
; assembled only for 32-bit ABIs

;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
; Filters a 16-row vertical edge in place: transposes 16x8 pixels, builds the
; filter/hev masks, applies the 4-tap filter and writes p1 p0 q0 q1 back.
global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub             rsp, lf_var_size                    ; scratch for the transposed rows

        mov         rsi,        arg(0)                  ; src_ptr
        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

        lea         rsi,        [rsi - 4]               ; back up to p3 (4 bytes left of the edge)
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        lea         rcx,        [rax*2+rax]             ; rcx = 3 * pitch

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 1, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE
        ; store 16-line result

        lea         rdx,        [rax]
        neg         rdx                                 ; rdx = -pitch, to step back 8 rows below

        BV_WRITEBACK xmm1, xmm5                         ; rows 8-15 (rsi was advanced by the transpose)

        lea         rsi,        [rsi+rdx*8]             ; back to the first 8 rows
        lea         rdi,        [rdi+rdx*8]
        BV_WRITEBACK xmm2, xmm6                         ; rows 0-7

    add rsp, lf_var_size
    pop rsp                                             ; restore rsp saved by ALIGN_STACK
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%endif
988
;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
; Chroma variant: the 16 transposed rows are 8 rows of u followed by 8 rows
; of v (the v half is loaded inside TRANSPOSE_16X8 via arg(5)).
global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub             rsp, lf_var_size                    ; scratch for the transposed rows

        mov         rsi,        arg(0)                  ; u_ptr
        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

        lea         rsi,        [rsi - 4]               ; back up to p3 (4 bytes left of the edge)
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        lea         rcx,        [rax+2*rax]             ; rcx = 3 * pitch

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 0, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE

        ; rsi still points 4 bytes left of the v edge (set inside TRANSPOSE_16X8)
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing

        ; store 16-line result
        BV_WRITEBACK xmm1, xmm5                         ; transposed rows 8-15 = the v half

        mov         rsi,        arg(0)                  ; u_ptr
        lea         rsi,        [rsi - 4]
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        BV_WRITEBACK xmm2, xmm6                         ; transposed rows 0-7 = the u half

    add rsp, lf_var_size
    pop rsp                                             ; restore rsp saved by ALIGN_STACK
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1051
;------------------------------------------------------------------------------
; MBV_TRANSPOSE
; Transpose the 8 filtered columns (p3..q3) back towards row order for the
; macroblock vertical-edge writeback.  Produces rows 0-3 fully (xmm0, xmm6)
; and leaves the remaining rows half-combined; MBV_WRITEBACK_1/_2 finish and
; store them.
; In:   xmm2 = p2 column, xmm5 = q2 column (one byte per row);
;       [rsp+_p3], [rsp+_p1], [rsp+_p0], [rsp+_q0], [rsp+_q1], [rsp+_q3]
;       hold the other filtered columns.
; Out:  xmm0 = rows 0,1;  xmm6 = rows 2,3 (8 bytes p3..q3 per row);
;       xmm3/xmm2 = word-interleaved halves for rows 4-7;
;       xmm1/xmm4 = word-interleaved halves for rows 8-f;
;       xmm5 preserved for MBV_WRITEBACK_1.
;------------------------------------------------------------------------------
%macro MBV_TRANSPOSE 0
        movdqa      xmm0,               [rsp+_p3]           ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        movdqa      xmm1,               xmm0                ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        punpcklbw   xmm0,               xmm2                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1,               xmm2                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm7,               [rsp+_p1]           ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      xmm6,               xmm7                ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpcklbw   xmm7,               [rsp+_p0]           ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm6,               [rsp+_p0]           ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm3,               xmm0                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpcklwd   xmm0,               xmm7                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckhwd   xmm3,               xmm7                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        movdqa      xmm4,               xmm1                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklwd   xmm1,               xmm6                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm4,               xmm6                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movdqa      xmm7,               [rsp+_q0]           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpcklbw   xmm7,               [rsp+_q1]           ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        movdqa      xmm6,               xmm5                ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
        punpcklbw   xmm6,               [rsp+_q3]           ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

        movdqa      xmm2,               xmm7                ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpcklwd   xmm7,               xmm6                ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

        punpckhwd   xmm2,               xmm6                ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
        movdqa      xmm6,               xmm0                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckldq   xmm0,               xmm7                ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
        punpckhdq   xmm6,               xmm7                ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro
1089
;------------------------------------------------------------------------------
; MBV_WRITEBACK_1
; Store rows 0-7 (8 bytes p3..q3 each) of the macroblock vertical-edge result
; and assemble rows 8-b / a-b into xmm1/xmm5 for MBV_WRITEBACK_2.
; Assumes: rdi = rsi + rax, rcx = 3*rax (rax = src_pixel_step);
;          xmm5 = q2 column, [rsp+_q0], [rsp+_q1], [rsp+_q3] still valid.
;------------------------------------------------------------------------------
%macro MBV_WRITEBACK_1 0
        movq        [rsi],              xmm0                ; row 0
        movhps      [rdi],              xmm0                ; row 1

        movq        [rsi+2*rax],        xmm6                ; row 2
        movhps      [rdi+2*rax],        xmm6                ; row 3

        movdqa      xmm0,               xmm3                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        punpckldq   xmm0,               xmm2                ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
        punpckhdq   xmm3,               xmm2                ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

        movq        [rsi+4*rax],        xmm0                ; row 4
        movhps      [rdi+4*rax],        xmm0                ; row 5

        movq        [rsi+2*rcx],        xmm3                ; row 6 (rcx = 3*rax)
        movhps      [rdi+2*rcx],        xmm3                ; row 7

        ; now build the high (rows 8-f) halves for MBV_WRITEBACK_2
        movdqa      xmm7,               [rsp+_q0]           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm7,               [rsp+_q1]           ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
        punpckhbw   xmm5,               [rsp+_q3]           ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86

        movdqa      xmm0,               xmm7
        punpcklwd   xmm0,               xmm5                ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
        punpckhwd   xmm7,               xmm5                ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4

        movdqa      xmm5,               xmm1                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckldq   xmm1,               xmm0                ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
        punpckhdq   xmm5,               xmm0                ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro
1119
;------------------------------------------------------------------------------
; MBV_WRITEBACK_2
; Store rows 8-15 (8 bytes p3..q3 each) of the macroblock vertical-edge
; result; rsi/rdi must point at rows 8/9 (or the v plane for the uv variant).
; Assumes: rdi = rsi + rax, rcx = 3*rax; xmm1/xmm5/xmm4/xmm7 as produced by
; MBV_TRANSPOSE + MBV_WRITEBACK_1.
;------------------------------------------------------------------------------
%macro MBV_WRITEBACK_2 0
        movq        [rsi],              xmm1                ; row 8
        movhps      [rdi],              xmm1                ; row 9

        movq        [rsi+2*rax],        xmm5                ; row a
        movhps      [rdi+2*rax],        xmm5                ; row b

        movdqa      xmm1,               xmm4                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
        punpckldq   xmm1,               xmm7                ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
        punpckhdq   xmm4,               xmm7                ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0

        movq        [rsi+4*rax],        xmm1                ; row c
        movhps      [rdi+4*rax],        xmm1                ; row d

        movq        [rsi+2*rcx],        xmm4                ; row e (rcx = 3*rax)
        movhps      [rdi+2*rcx],        xmm4                ; row f
%endmacro
1137
1138
;void vp8_mbloop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
; Macroblock vertical-edge filter: transposes 16x8, builds the masks, applies
; the macroblock filter (MB_FILTER_AND_WRITEBACK), then transposes back and
; stores all 8 columns (p3..q3) for 16 rows.
global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, lf_var_size                       ; scratch for the transposed rows

        mov         rsi,                arg(0)              ; src_ptr
        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step

        lea         rsi,                [rsi - 4]           ; back up to p3 (4 bytes left of the edge)
        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
        lea         rcx,                [rax*2+rax]         ; rcx = 3 * pitch

        ; Transpose
        TRANSPOSE_16X8 1, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        neg         rax                                     ; negative pitch for the backwards stepping below
        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        lea         rsi,                [rsi+rax*8]         ; back to the first 8 rows
        lea         rdi,                [rdi+rax*8]

        ; transpose and write back
        MBV_TRANSPOSE

        neg         rax                                     ; pitch positive again for the writeback

        MBV_WRITEBACK_1                                     ; rows 0-7


        lea         rsi,                [rsi+rax*8]         ; advance to rows 8-15
        lea         rdi,                [rdi+rax*8]
        MBV_WRITEBACK_2                                     ; rows 8-15

    add rsp, lf_var_size
    pop rsp                                                 ; restore rsp saved by ALIGN_STACK
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1203
1204
;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
; Chroma macroblock vertical-edge filter: transposed rows 0-7 come from u,
; rows 8-15 from v (loaded inside TRANSPOSE_16X8 via arg(5)).
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, lf_var_size                       ; scratch for the transposed rows

        mov         rsi,                arg(0)              ; u_ptr
        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step

        lea         rsi,                [rsi - 4]           ; back up to p3 (4 bytes left of the edge)
        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
        lea         rcx,                [rax+2*rax]         ; rcx = 3 * pitch

        ; Transpose
        TRANSPOSE_16X8 0, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        ; transpose and write back
        MBV_TRANSPOSE

        mov         rsi,                arg(0)             ;u_ptr
        lea         rsi,                [rsi - 4]
        lea         rdi,                [rsi + rax]
        MBV_WRITEBACK_1                                    ; rows 0-7 -> u plane
        mov         rsi,                arg(5)             ;v_ptr
        lea         rsi,                [rsi - 4]
        lea         rdi,                [rsi + rax]
        MBV_WRITEBACK_2                                    ; rows 8-15 -> v plane

    add rsp, lf_var_size
    pop rsp                                                ; restore rsp saved by ALIGN_STACK
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1266
1267
;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit
;)
; "Simple" 4-tap horizontal loop filter: reads rows p1 p0 q0 q1 around a
; horizontal edge (src_ptr points at the q0 row), and updates p0 and q0 in
; place for 16 columns at once.  Only blimit is used (no limit/thresh/hev).
global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    ; end prolog

        mov         rcx, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?
        movdqa      xmm6, [GLOBAL(tfe)]
        lea         rdx, [rcx + rax]        ; rdx = q1 row
        neg         rax                     ; rax = -pitch, so rcx+rax = p0, rcx+2*rax = p1

        ; calculate mask
        movdqa      xmm0, [rdx]             ; q1
        mov         rdx, arg(2)             ;blimit
        movdqa      xmm1, [rcx+2*rax]       ; p1

        movdqa      xmm2, xmm1              ; keep p1 for the filter below
        movdqa      xmm3, xmm0              ; keep q1 for the filter below

        psubusb     xmm0, xmm1              ; q1-=p1
        psubusb     xmm1, xmm3              ; p1-=q1
        por         xmm1, xmm0              ; abs(p1-q1)
        pand        xmm1, xmm6              ; set lsb of each byte to zero
        psrlw       xmm1, 1                 ; abs(p1-q1)/2

        movdqa      xmm7, XMMWORD PTR [rdx] ; blimit

        movdqa      xmm5, [rcx+rax]         ; p0
        movdqa      xmm4, [rcx]             ; q0
        movdqa      xmm0, xmm4              ; q0
        movdqa      xmm6, xmm5              ; p0
        psubusb     xmm5, xmm4              ; p0-=q0
        psubusb     xmm4, xmm6              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)

        movdqa      xmm4, [GLOBAL(t80)]

        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm5, xmm7              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
        pxor        xmm7, xmm7
        pcmpeqb     xmm5, xmm7              ; mask: 0xff where the edge passes the blimit test


        ; start work on filters
        pxor        xmm2, xmm4     ; p1 offset to convert to signed values
        pxor        xmm3, xmm4     ; q1 offset to convert to signed values
        psubsb      xmm2, xmm3              ; p1 - q1

        pxor        xmm6, xmm4     ; offset to convert to signed values
        pxor        xmm0, xmm4     ; offset to convert to signed values
        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0
        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5, xmm2              ; mask filter values we don't care about

        movdqa      xmm0, xmm5
        paddsb      xmm5,        [GLOBAL(t3)]                  ; Filter2: 3* (q0 - p0) + (p1 - q1) + 3, for p0
        paddsb      xmm0,        [GLOBAL(t4)]                  ; Filter1: 3* (q0 - p0) + (p1 - q1) + 4, for q0

        movdqa      xmm1, [GLOBAL(te0)]
        movdqa      xmm2, [GLOBAL(t1f)]

        ; xmm7 is still zero here (zeroed for the mask compare above), so the
        ; pxor below is redundant and stays disabled:
;        pxor        xmm7, xmm7
        pcmpgtb     xmm7, xmm0              ;save sign
        pand        xmm7, xmm1              ;preserve the upper 3 bits
        psrlw       xmm0, 3                 ; >>3 in 16-bit lanes; per-byte sign repaired via te0/t1f
        pand        xmm0, xmm2              ;clear out upper 3 bits
        por         xmm0, xmm7              ;add sign
        psubsb      xmm3, xmm0              ; q0-= q0sz add (q0 -= Filter1)

        pxor        xmm7, xmm7
        pcmpgtb     xmm7, xmm5              ;save sign
        pand        xmm7, xmm1              ;preserve the upper 3 bits
        psrlw       xmm5, 3                 ; >>3 in 16-bit lanes, as above
        pand        xmm5, xmm2              ;clear out upper 3 bits
        por         xmm5, xmm7              ;add sign
        paddsb      xmm6, xmm5              ; p0+= p0 add (p0 += Filter2)

        pxor        xmm3, xmm4     ; unoffset
        movdqa      [rcx], xmm3             ; write back q0

        pxor        xmm6, xmm4     ; unoffset
        movdqa      [rcx+rax], xmm6         ; write back p0

    ; begin epilog
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1371
1372
1373;void vp8_loop_filter_simple_vertical_edge_sse2
1374;(
1375;    unsigned char *src_ptr,
1376;    int  src_pixel_step,
1377;    const char *blimit,
1378;)
1379global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
1380sym(vp8_loop_filter_simple_vertical_edge_sse2):
1381    push        rbp         ; save old base pointer value.
1382    mov         rbp, rsp    ; set new base pointer value.
1383    SHADOW_ARGS_TO_STACK 3
1384    SAVE_XMM 7
1385    GET_GOT     rbx         ; save callee-saved reg
1386    push        rsi
1387    push        rdi
1388    ; end prolog
1389
1390    ALIGN_STACK 16, rax
1391    sub         rsp, 32                         ; reserve 32 bytes
1392    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];

    ; NOTE(review): this is the tail of a function whose label and prologue sit
    ; above this view.  Arg usage (arg(0)=src_ptr, arg(1)=src_pixel_step,
    ; arg(2)=blimit) and the 16x4 transpose below match the "simple" vertical
    ; edge loop filter (filters p1/p0/q0/q1 across a vertical edge, 16 rows at
    ; a time) -- confirm against the full file.  t0/t1 are two 16-byte stack
    ; temporaries; t0 is %defined above this view, presumably [rsp + 0].

        mov         rsi, arg(0) ;src_ptr
        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?

        ; point rsi 2 bytes left of the edge so each 4-byte row load covers
        ; p1 p0 | q0 q1; set up four row pointers (rows 0,1,4,5), with rows
        ; 2,3,6,7 reached via +rax*2 from those.
        lea         rsi,        [rsi - 2 ]
        lea         rdi,        [rsi + rax]
        lea         rdx,        [rsi + rax*4]
        lea         rcx,        [rdx + rax]

        ; ---- transpose rows 0-7 (first 8x4 tile) ----
        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
        movd        xmm2,       [rdi]                   ; 13 12 11 10
        movd        xmm3,       [rcx]                   ; 53 52 51 50
        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10

        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30

        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20

        movdqa      xmm1,       xmm0
        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        movdqa      xmm2,       xmm0
        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        ; ---- transpose rows 8-15 (second 8x4 tile) ----
        lea         rsi,        [rsi + rax*8]           ; advance to row 8
        lea         rdi,        [rsi + rax]
        lea         rdx,        [rsi + rax*4]
        lea         rcx,        [rdx + rax]

        movd        xmm4,       [rsi]                   ; 83 82 81 80
        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
        movd        xmm6,       [rdi]                   ; 93 92 91 90
        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90

        movd        xmm1,       [rsi + rax*2]           ; a3 a2 a1 a0
        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
        movd        xmm3,       [rdi + rax*2]           ; b3 b2 b1 b0
        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
        punpckldq   xmm1,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
        punpckldq   xmm3,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0

        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
        punpcklbw   xmm1,       xmm3                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

        movdqa      xmm7,       xmm4
        punpcklwd   xmm4,       xmm1                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
        punpckhwd   xmm7,       xmm1                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

        movdqa      xmm6,       xmm4
        punpckldq   xmm4,       xmm7                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
        punpckhdq   xmm6,       xmm7                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

        ; merge the two tiles: one full 16-pixel column per register
        movdqa      xmm1,       xmm0
        movdqa      xmm3,       xmm2

        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        mov         rdx,        arg(2)                          ;blimit

        ; ---- filter mask: abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit, per column ----
        ; calculate mask
        movdqa      xmm6,       xmm0                            ; p1
        movdqa      xmm7,       xmm3                            ; q1
        psubusb     xmm7,       xmm0                            ; q1-=p1
        psubusb     xmm6,       xmm3                            ; p1-=q1
        por         xmm6,       xmm7                            ; abs(p1-q1) -- max of the two saturating subs
        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
        psrlw       xmm6,       1                               ; abs(p1-q1)/2 (word shift is safe: lsbs were cleared, so no psrlb needed)

        movdqa      xmm7, [rdx]                                 ; blimit, broadcast across 16 bytes

        movdqa      xmm5,       xmm1                            ; p0
        movdqa      xmm4,       xmm2                            ; q0
        psubusb     xmm5,       xmm2                            ; p0-=q0
        psubusb     xmm4,       xmm1                            ; q0-=p0
        por         xmm5,       xmm4                            ; abs(p0 - q0)
        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm4, [GLOBAL(t80)]                         ; 0x80 bias for unsigned<->signed conversion below

        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
        pxor        xmm7,        xmm7                           ; zero; also reused as zero at "save sign" below
        pcmpeqb     xmm5,        xmm7                           ; mm5 = mask (0xff where the edge passes the threshold)

        ; ---- filter: Filter = clamp(3*(q0-p0) + (p1-q1)), applied to p0/q0 ----
        ; start work on filters
        movdqa        t0,        xmm0                  ; spill unfiltered p1 (restored for write-back)
        movdqa        t1,        xmm3                  ; spill unfiltered q1

        pxor        xmm0,        xmm4                  ; p1 offset to convert to signed values
        pxor        xmm3,        xmm4                  ; q1 offset to convert to signed values
        psubsb      xmm0,        xmm3                           ; p1 - q1

        pxor        xmm1,        xmm4                  ; offset to convert to signed values
        pxor        xmm2,        xmm4                  ; offset to convert to signed values

        movdqa      xmm3,        xmm2                           ; offseted ; q0
        psubsb      xmm2,        xmm1                           ; q0 - p0
        paddsb      xmm0,        xmm2                           ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm0,        xmm2                           ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm0,        xmm2                           ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5,        xmm0                           ; mask filter values we don't care about

        ; Filter1 = (Filter+4)>>3 subtracted from q0; Filter2 = (Filter+3)>>3 added to p0
        movdqa      xmm0, xmm5
        paddsb      xmm5,        [GLOBAL(t3)]                  ;  3* (q0 - p0) + (p1 - q1) + 4
        paddsb      xmm0,        [GLOBAL(t4)]                  ; +3 instead of +4

        movdqa  xmm6, [GLOBAL(te0)]                            ; 0xe0: upper-3-bit mask for sign restore
        movdqa  xmm2, [GLOBAL(t1f)]                            ; 0x1f: lower-5-bit mask after >>3

        ; signed per-byte >>3 emulated with psrlw: shift words, mask off the
        ; 3 bits that leaked in from the neighboring byte (t1f), then OR the
        ; sign bits (te0) back in for negative values.
;        pxor        xmm7, xmm7                                 ; xmm7 is already zero here (from the mask compare above)
        pcmpgtb     xmm7, xmm0              ;save sign
        pand        xmm7, xmm6              ;preserve the upper 3 bits
        psrlw       xmm0, 3
        pand        xmm0, xmm2              ;clear out upper 3 bits
        por         xmm0, xmm7              ;add sign
        psubsb      xmm3, xmm0              ; q0-= q0sz add

        pxor        xmm7, xmm7
        pcmpgtb     xmm7, xmm5              ;save sign
        pand        xmm7, xmm6              ;preserve the upper 3 bits
        psrlw       xmm5, 3
        pand        xmm5, xmm2              ;clear out upper 3 bits
        por         xmm5, xmm7              ;add sign
        paddsb      xmm1, xmm5              ; p0+= p0 add

        pxor        xmm3,        xmm4                  ; unoffset   q0 (back to unsigned pixels)
        pxor        xmm1,        xmm4                  ; unoffset   p0

        movdqa      xmm0,        t0                             ; p1 (unfiltered, reloaded for the store transpose)
        movdqa      xmm4,        t1                             ; q1

        ; write out order: xmm0 xmm2 xmm1 xmm3
        lea         rdx,        [rsi + rax*4]

        ; ---- transpose back to 16 rows of 4 bytes and store ----
        ; transpose back to write out
        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        movdqa      xmm6,       xmm0
        punpcklbw   xmm0,       xmm1                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm6,       xmm1                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm5,       xmm3
        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm2,       xmm0
        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40

        movdqa      xmm3,       xmm6
        punpcklwd   xmm6,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        ; rsi still points at row 8, so store rows 8-15 first,
        ; peeling one dword off xmm6/xmm3 per pair of rows
        movd        [rsi],      xmm6                               ; write the second 8-line result
        movd        [rdx],      xmm3
        psrldq      xmm6,       4
        psrldq      xmm3,       4
        movd        [rdi],      xmm6
        movd        [rcx],      xmm3
        psrldq      xmm6,       4
        psrldq      xmm3,       4
        movd        [rsi + rax*2], xmm6
        movd        [rdx + rax*2], xmm3
        psrldq      xmm6,       4
        psrldq      xmm3,       4
        movd        [rdi + rax*2], xmm6
        movd        [rcx + rax*2], xmm3

        ; step back 8 rows (rax*8 subtracted via double negate; pitch may be
        ; either sign on entry) and store rows 0-7
        neg         rax
        lea         rsi,        [rsi + rax*8]
        neg         rax
        lea         rdi,        [rsi + rax]
        lea         rdx,        [rsi + rax*4]
        lea         rcx,        [rdx + rax]

        movd        [rsi],      xmm0                                ; write the first 8-line result
        movd        [rdx],      xmm2
        psrldq      xmm0,       4
        psrldq      xmm2,       4
        movd        [rdi],      xmm0
        movd        [rcx],      xmm2
        psrldq      xmm0,       4
        psrldq      xmm2,       4
        movd        [rsi + rax*2], xmm0
        movd        [rdx + rax*2], xmm2
        psrldq      xmm0,       4
        psrldq      xmm2,       4
        movd        [rdi + rax*2], xmm0
        movd        [rcx + rax*2], xmm2

    add rsp, 32             ; free the 32-byte t0/t1 scratch area
    pop rsp                 ; restore pre-alignment rsp (presumably saved by ALIGN_STACK in the unseen prologue -- verify)
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1611
SECTION_RODATA
; Constant pool for the loop filters.  All entries are 16-byte aligned so
; they can be used directly as movdqa/packed-op memory operands.
align 16
tfe:    ; 0xfe per byte: clears each byte's lsb so a word shift right by 1
        ; behaves as an unsigned per-byte >>1 (no psrlb instruction exists)
    times 16 db 0xfe
align 16
t80:    ; 0x80 per byte: xor bias converting unsigned pixels to signed and back
    times 16 db 0x80
align 16
t1s:    ; 0x01 per byte (not referenced in this chunk; presumably used by
        ; filter code above this view -- verify)
    times 16 db 0x01
align 16
t3:     ; +3 rounding term for the Filter2 = (Filter+3)>>3 path
    times 16 db 0x03
align 16
t4:     ; +4 rounding term for the Filter1 = (Filter+4)>>3 path
    times 16 db 0x04
align 16
ones:   ; 0x0001 per word (not referenced in this chunk -- likely the
        ; mbloop/simple-horizontal filters above; verify)
    times 8 dw 0x0001
align 16
s9:     ; 0x0900 per word (not referenced in this chunk -- verify usage above)
    times 8 dw 0x0900
align 16
s63:    ; 0x003f per word (not referenced in this chunk -- verify usage above)
    times 8 dw 0x003f
align 16
te0:    ; 0xe0 per byte: keeps the upper 3 bits, used to re-apply the sign
        ; after the emulated per-byte arithmetic >>3
    times 16 db 0xe0
align 16
t1f:    ; 0x1f per byte: keeps the lower 5 bits, masking cross-byte leakage
        ; from the word-wide >>3
    times 16 db 0x1f
1643