1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
; Compile-time constants.
;
; BLOCK_HEIGHT_WIDTH: not referenced in the routines visible in this part of
;   the file.
; VP8_FILTER_WEIGHT: equals 1 << VP8_FILTER_SHIFT; presumably the fixed-point
;   scale of the filter taps -- TODO confirm against the tap tables.
; VP8_FILTER_SHIFT: arithmetic right shift applied after the rounding constant
;   (rd) has been added, i.e. a divide by 128 (used as
;   "psraw reg, VP8_FILTER_SHIFT" by the bilinear code).
14%define BLOCK_HEIGHT_WIDTH 4
15%define VP8_FILTER_WEIGHT 128
16%define VP8_FILTER_SHIFT  7
17
18SECTION .text
19
20;/************************************************************************************
21; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
22; input pixel array has output_height rows. This routine assumes that output_height is an
23; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
24; row each iteration to take advantage of the 128-bit operations.
25;
26; This is an implementation of some of the SSE optimizations first seen in ffvp8
27;
28;*************************************************************************************/
29;void vp8_filter_block1d8_h6_ssse3
30;(
31;    unsigned char  *src_ptr,
32;    unsigned int    src_pixels_per_line,
33;    unsigned char *output_ptr,
34;    unsigned int    output_pitch,
35;    unsigned int    output_height,
36;    unsigned int    vp8_filter_index
37;)
;
; Horizontal 6-tap filter for an 8-pixel-wide block; one output row is
; produced per loop iteration.
;
; Per row: reads 8 bytes at src_ptr-2 and 8 bytes at src_ptr+3 (bytes
; src_ptr[-2..10]), writes 8 bytes at output_ptr, then steps the source by
; src_pixels_per_line and the output by output_pitch.
;
; vp8_filter_index selects a 16-byte entry of the k0_k5 table; the matching
; k1_k3 / k2_k4 entries are read at +128 / +256 bytes from that entry. When
; the first dword of the k0_k5 entry is zero the 4-tap path is taken, which
; skips the k0/k5 multiply.
;
; output_height must be non-zero: both row loops are dec/jnz (do-while).
;
; Register use inside the loops:
;   rsi = source row            rdi = output row
;   rax = source stride         rdx = output stride
;   rcx = rows remaining        xmm7 = rounding constant (rd)
;
; The 4-tap entry point is a local label (like the 4-tap entries of the other
; routines in this file) so that it neither splits this function's
; local-label scope nor adds a file-level symbol.
38global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
39sym(vp8_filter_block1d8_h6_ssse3):
40    push        rbp
41    mov         rbp, rsp
42    SHADOW_ARGS_TO_STACK 6
43    SAVE_XMM 7
44    GET_GOT     rbx
45    push        rsi
46    push        rdi
47    ; end prolog
48
49    movsxd      rdx, DWORD PTR arg(5)   ;table index
50    xor         rsi, rsi                ;esi = 0, compared with the outer taps below
51    shl         rdx, 4                  ;16 bytes per table entry
52
53    movdqa      xmm7, [GLOBAL(rd)]      ;rounding constant, kept for both paths
54
55    lea         rax, [GLOBAL(k0_k5)]
56    add         rax, rdx                ;rax -> selected k0_k5 entry
57    mov         rdi, arg(2)             ;output_ptr
58
59    cmp         esi, DWORD PTR [rax]    ;first dword of the k0_k5 entry == 0 ?
60    je          .vp8_filter_block1d8_h4_ssse3
61
62    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
63    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
64    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
65
66    mov         rsi, arg(0)             ;src_ptr
67    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
68    movsxd      rcx, dword ptr arg(4)   ;output_height
69
70    movsxd      rdx, dword ptr arg(3)   ;output_pitch
71
72    sub         rdi, rdx                ;pre-decrement: the loop advances rdi before each store
73;xmm3 free
74.filter_block1d8_h6_rowloop_ssse3:
75    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
76
77    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
78
79    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
80
81    movdqa      xmm1,   xmm0
82    pmaddubsw   xmm0,   xmm4                    ; k0/k5 products, one word per output pixel
83
84    movdqa      xmm2,   xmm1
85    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]   ; re-pair the bytes for the k2_k4 multiply
86
87    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]   ; re-pair the bytes for the k1_k3 multiply
88    pmaddubsw   xmm1,   xmm5
89
90    lea         rdi,    [rdi + rdx]             ; rdi -> output row for this iteration
91    pmaddubsw   xmm2,   xmm6
92
93    lea         rsi,    [rsi + rax]             ; next source row
94    dec         rcx                             ; ZF is consumed by the jnz below; nothing
                                                ; in between modifies EFLAGS
95
96    paddsw      xmm0,   xmm1
97    paddsw      xmm2,   xmm7                    ; + rounding
98
99    paddsw      xmm0,   xmm2
100
101    psraw       xmm0,   VP8_FILTER_SHIFT
102
103    packuswb    xmm0,   xmm0                    ; saturate to unsigned bytes
104
105    movq        MMWORD Ptr [rdi], xmm0          ; store 8 output pixels
106    jnz         .filter_block1d8_h6_rowloop_ssse3
107
108    ; begin epilog
109    pop rdi
110    pop rsi
111    RESTORE_GOT
112    RESTORE_XMM
113    UNSHADOW_ARGS
114    pop         rbp
115    ret
116
    ; 4-tap path: entered with rax -> k0_k5 entry, rdi = output_ptr,
    ; xmm7 = rounding constant.
117.vp8_filter_block1d8_h4_ssse3:
118    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
119    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
120
121    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
122    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
123
124    mov         rsi, arg(0)             ;src_ptr
125
126    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
127    movsxd      rcx, dword ptr arg(4)   ;output_height
128
129    movsxd      rdx, dword ptr arg(3)   ;output_pitch
130
131    sub         rdi, rdx                ;pre-decrement, see the loop's lea rdi
132
133.filter_block1d8_h4_rowloop_ssse3:
134    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
135
136    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
137
138    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
139
140    movdqa      xmm2,   xmm0
141    pshufb      xmm0,   xmm3                    ; pairs for the k2_k4 multiply
142
143    pshufb      xmm2,   xmm4                    ; pairs for the k1_k3 multiply
144    pmaddubsw   xmm0,   xmm5
145
146    lea         rdi,    [rdi + rdx]             ; rdi -> output row for this iteration
147    pmaddubsw   xmm2,   xmm6
148
149    lea         rsi,    [rsi + rax]             ; next source row
150    dec         rcx                             ; ZF is consumed by the jnz below
151
152    paddsw      xmm0,   xmm7                    ; + rounding
153
154    paddsw      xmm0,   xmm2
155
156    psraw       xmm0,   VP8_FILTER_SHIFT
157
158    packuswb    xmm0,   xmm0                    ; saturate to unsigned bytes
159
160    movq        MMWORD Ptr [rdi], xmm0          ; store 8 output pixels
161
162    jnz         .filter_block1d8_h4_rowloop_ssse3
163
164    ; begin epilog
165    pop rdi
166    pop rsi
167    RESTORE_GOT
168    RESTORE_XMM
169    UNSHADOW_ARGS
170    pop         rbp
171    ret
172;void vp8_filter_block1d16_h6_ssse3
173;(
174;    unsigned char  *src_ptr,
175;    unsigned int    src_pixels_per_line,
176;    unsigned char  *output_ptr,
177;    unsigned int    output_pitch,
178;    unsigned int    output_height,
179;    unsigned int    vp8_filter_index
180;)
;
; Horizontal 6-tap filter for a 16-pixel-wide block; one output row per loop
; iteration, computed as two 8-pixel halves.
;
; Per row: the left half reads 8 bytes at src_ptr-2 and at src_ptr+3, the
; right half reads 8 bytes at src_ptr+6 and at src_ptr+11 (bytes
; src_ptr[-2..18] in total). The 16 result bytes are written with a single
; aligned store (movdqa), so output_ptr and output_pitch must keep every
; output row 16-byte aligned.
;
; vp8_filter_index selects a 16-byte entry of the k0_k5 table; the matching
; k1_k3 / k2_k4 entries are read at +128 / +256 bytes from that entry. This
; routine has no 4-tap shortcut.
;
; output_height must be non-zero: the row loop is dec/jnz (do-while).
;
; Register use inside the loop:
;   rsi = source row            rdi = output row
;   rax = source stride         rdx = output stride
;   rcx = rows remaining        xmm4/xmm5/xmm6 = k0_k5 / k2_k4 / k1_k3
;   xmm7 is scratch here, so the rounding constant is added from memory.
181global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
182sym(vp8_filter_block1d16_h6_ssse3):
183    push        rbp
184    mov         rbp, rsp
185    SHADOW_ARGS_TO_STACK 6
186    SAVE_XMM 7
187    GET_GOT     rbx
188    push        rsi
189    push        rdi
190    ; end prolog
191
192    movsxd      rdx, DWORD PTR arg(5)           ;table index
194    shl         rdx, 4                          ;16 bytes per table entry
195
196    lea         rax, [GLOBAL(k0_k5)]
197    add         rax, rdx                        ;rax -> selected k0_k5 entry
198
199    mov         rdi, arg(2)                     ;output_ptr
200
201    mov         rsi, arg(0)                     ;src_ptr
202
203    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
204    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
205    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
206
207    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
208    movsxd      rcx, dword ptr arg(4)           ;output_height
209    movsxd      rdx, dword ptr arg(3)           ;output_pitch
210
211.filter_block1d16_h6_rowloop_ssse3:
    ; ---- left 8 pixels -> xmm0 (the right half's loads are interleaved) ----
212    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
213
214    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
215
216    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
217
218    movdqa      xmm1,   xmm0
219    pmaddubsw   xmm0,   xmm4                    ; k0/k5 products
220
221    movdqa      xmm2,   xmm1
222    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]   ; pairs for the k2_k4 multiply
223
224    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]   ; pairs for the k1_k3 multiply
225    movq        xmm3,   MMWORD PTR [rsi +  6]   ; right half: first 8 source bytes
226
227    pmaddubsw   xmm1,   xmm5
228    movq        xmm7,   MMWORD PTR [rsi + 11]   ; right half: second 8 source bytes
229
230    pmaddubsw   xmm2,   xmm6
231    punpcklbw   xmm3,   xmm7                    ; same interleave as above, 8 pixels further right
232
233    paddsw      xmm0,   xmm1
234    movdqa      xmm1,   xmm3
235
236    pmaddubsw   xmm3,   xmm4                    ; right half k0/k5 products
237    paddsw      xmm0,   xmm2
238
239    movdqa      xmm2,   xmm1
240    paddsw      xmm0,   [GLOBAL(rd)]            ; + rounding
241
242    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
243    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
244
245    psraw       xmm0,   VP8_FILTER_SHIFT
246    pmaddubsw   xmm1,   xmm5
247
248    pmaddubsw   xmm2,   xmm6
249    packuswb    xmm0,   xmm0                    ; left 8 result bytes
250
251    lea         rsi,    [rsi + rax]             ; next source row
252    paddsw      xmm3,   xmm1
253
254    paddsw      xmm3,   xmm2
255
256    paddsw      xmm3,   [GLOBAL(rd)]            ; + rounding
257
258    psraw       xmm3,   VP8_FILTER_SHIFT
259
260    packuswb    xmm3,   xmm3                    ; right 8 result bytes
261
262    punpcklqdq  xmm0,   xmm3                    ; left | right -> 16 bytes
263
264    movdqa      XMMWORD Ptr [rdi], xmm0         ; aligned 16-byte store
265
266    lea         rdi,    [rdi + rdx]             ; next output row
267    dec         rcx
268    jnz         .filter_block1d16_h6_rowloop_ssse3
269
270    ; begin epilog
271    pop rdi
272    pop rsi
273    RESTORE_GOT
274    RESTORE_XMM
275    UNSHADOW_ARGS
276    pop         rbp
277    ret
278
279;void vp8_filter_block1d4_h6_ssse3
280;(
281;    unsigned char  *src_ptr,
282;    unsigned int    src_pixels_per_line,
283;    unsigned char  *output_ptr,
284;    unsigned int    output_pitch,
285;    unsigned int    output_height,
286;    unsigned int    vp8_filter_index
287;)
;
; Horizontal 6-tap filter for a 4-pixel-wide block; one output row per loop
; iteration.
;
; Per row: performs one unaligned 16-byte load at src_ptr-2 (bytes
; src_ptr[-2..13] are read), writes 4 bytes at output_ptr, then steps the
; source by src_pixels_per_line and the output by output_pitch.
;
; vp8_filter_index selects a 16-byte entry of the k0_k5 table; the matching
; k1_k3 / k2_k4 entries are read at +128 / +256 bytes from that entry. When
; the first dword of the k0_k5 entry is zero the 4-tap path is taken, which
; skips the k0/k5 multiply.
;
; output_height must be non-zero: both row loops are dec/jnz (do-while).
;
; Register use inside the loops:
;   rsi = source row            rdi = output row
;   rax = source stride         rdx = output stride
;   rcx = rows remaining        xmm7 = rounding constant (rd)
288global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
289sym(vp8_filter_block1d4_h6_ssse3):
290    push        rbp
291    mov         rbp, rsp
292    SHADOW_ARGS_TO_STACK 6
293    SAVE_XMM 7
294    GET_GOT     rbx
295    push        rsi
296    push        rdi
297    ; end prolog
298
299    movsxd      rdx, DWORD PTR arg(5)   ;table index
300    xor         rsi, rsi                ;esi = 0, compared with the outer taps below
301    shl         rdx, 4                  ;16 bytes per table entry
302
303    lea         rax, [GLOBAL(k0_k5)]
304    add         rax, rdx                ;rax -> selected k0_k5 entry
305    movdqa      xmm7, [GLOBAL(rd)]      ;rounding constant, kept for both paths
306
307    cmp         esi, DWORD PTR [rax]    ;first dword of the k0_k5 entry == 0 ?
308    je          .vp8_filter_block1d4_h4_ssse3
309
310    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
311    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
312    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
313
314    mov         rsi, arg(0)             ;src_ptr
315    mov         rdi, arg(2)             ;output_ptr
316    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
317    movsxd      rcx, dword ptr arg(4)   ;output_height
318
319    movsxd      rdx, dword ptr arg(3)   ;output_pitch
320
321;xmm3 free
322.filter_block1d4_h6_rowloop_ssse3:
323    movdqu      xmm0,   XMMWORD PTR [rsi - 2]   ; 16 source bytes starting at column -2
324
325    movdqa      xmm1, xmm0
326    pshufb      xmm0, [GLOBAL(shuf1b)]          ; byte pairs for the k0_k5 multiply
327
328    movdqa      xmm2, xmm1
329    pshufb      xmm1, [GLOBAL(shuf2b)]          ; byte pairs for the k2_k4 multiply
330    pmaddubsw   xmm0, xmm4
331    pshufb      xmm2, [GLOBAL(shuf3b)]          ; byte pairs for the k1_k3 multiply
332    pmaddubsw   xmm1, xmm5
333
334;--
335    pmaddubsw   xmm2, xmm6
336
337    lea         rsi,    [rsi + rax]             ; next source row
338;--
339    paddsw      xmm0, xmm1
340    paddsw      xmm0, xmm7                      ; + rounding
342    paddsw      xmm0, xmm2
343    psraw       xmm0, VP8_FILTER_SHIFT
344    packuswb    xmm0, xmm0                      ; saturate to unsigned bytes
345
346    movd        DWORD PTR [rdi], xmm0           ; store 4 output pixels
347
348    add         rdi, rdx                        ; next output row
349    dec         rcx
350    jnz         .filter_block1d4_h6_rowloop_ssse3
351
352    ; begin epilog
353    pop rdi
354    pop rsi
355    RESTORE_GOT
356    RESTORE_XMM
357    UNSHADOW_ARGS
358    pop         rbp
359    ret
360
    ; 4-tap path: entered with rax -> k0_k5 entry and xmm7 = rounding constant.
361.vp8_filter_block1d4_h4_ssse3:
362    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
363    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
364    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
365    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
366
367    mov         rsi, arg(0)             ;src_ptr
368    mov         rdi, arg(2)             ;output_ptr
369    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
370    movsxd      rcx, dword ptr arg(4)   ;output_height
371
372    movsxd      rdx, dword ptr arg(3)   ;output_pitch
373
374.filter_block1d4_h4_rowloop_ssse3:
375    movdqu      xmm1,   XMMWORD PTR [rsi - 2]   ; 16 source bytes starting at column -2
376
377    movdqa      xmm2, xmm1
378    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
379    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
380    pmaddubsw   xmm1, xmm5
381
382;--
383    pmaddubsw   xmm2, xmm6
384
385    lea         rsi,    [rsi + rax]             ; next source row
386;--
387    paddsw      xmm1, xmm7                      ; + rounding
388    paddsw      xmm1, xmm2
389    psraw       xmm1, VP8_FILTER_SHIFT
390    packuswb    xmm1, xmm1                      ; saturate to unsigned bytes
391
392    movd        DWORD PTR [rdi], xmm1           ; store 4 output pixels
393
394    add         rdi, rdx                        ; next output row
395    dec         rcx
396    jnz         .filter_block1d4_h4_rowloop_ssse3
397
398    ; begin epilog
399    pop rdi
400    pop rsi
401    RESTORE_GOT
402    RESTORE_XMM
403    UNSHADOW_ARGS
404    pop         rbp
405    ret
406
407
408
409;void vp8_filter_block1d16_v6_ssse3
410;(
411;    unsigned char *src_ptr,
412;    unsigned int   src_pitch,
413;    unsigned char *output_ptr,
414;    unsigned int   out_pitch,
415;    unsigned int   output_height,
416;    unsigned int   vp8_filter_index
417;)
;
; Vertical 6-tap filter for a 16-pixel-wide block; one output row per loop
; iteration, computed as two 8-column halves (columns 0-7, then 8-15).
;
; Each output row combines six consecutive source rows A..F, where A is the
; row at src_ptr and F the row at src_ptr + 5*src_pitch. The rows are
; interleaved in pairs and multiplied with the tap pairs: (A,F) with k0_k5,
; (B,D) with k1_k3, (C,E) with k2_k4. The sums get the rounding constant rd
; added, are shifted right by 7 and packed to unsigned bytes.
;
; vp8_filter_index selects a 16-byte entry of the k0_k5 table; the matching
; k1_k3 / k2_k4 entries are read at +128 / +256 bytes from that entry. When
; the first dword of the k0_k5 entry is zero the 4-tap path is taken, which
; uses only rows B..E.
;
; Store width differs between the two paths: the 6-tap path writes each row
; with two 8-byte movq stores, the 4-tap path with one aligned 16-byte movdqa
; store, so the 4-tap path needs every output row 16-byte aligned.
;
; output_height must be non-zero: both row loops are dec/jnz (do-while).
;
; Register use inside the loops:
;   rsi = row A                 rax = rsi + src_pitch (row B)
;   rdx = src_pitch             rdi = output row
;   rcx = rows remaining        r8  = out_pitch (64-bit builds only; 32-bit
;                                     builds re-read it from the stack)
418global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
419sym(vp8_filter_block1d16_v6_ssse3):
420    push        rbp
421    mov         rbp, rsp
422    SHADOW_ARGS_TO_STACK 6
423    SAVE_XMM 7
424    GET_GOT     rbx
425    push        rsi
426    push        rdi
427    ; end prolog
428
429    movsxd      rdx, DWORD PTR arg(5)   ;table index
430    xor         rsi, rsi                ;esi = 0, compared with the outer taps below
431    shl         rdx, 4      ;16 bytes per table entry
432
433    lea         rax, [GLOBAL(k0_k5)]
434    add         rax, rdx                ;rax -> selected k0_k5 entry
435
436    cmp         esi, DWORD PTR [rax]    ;first dword of the k0_k5 entry == 0 ?
437    je          .vp8_filter_block1d16_v4_ssse3
438
439    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
440    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
441    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
442
443    mov         rsi, arg(0)             ;src_ptr
444    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
445    mov         rdi, arg(2)             ;output_ptr
446
447%if ABI_IS_32BIT=0
448    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
449%endif
450    mov         rax, rsi
451    movsxd      rcx, DWORD PTR arg(4)   ;output_height
452    add         rax, rdx                ;rax = src_ptr + pitch, used to reach rows D and F
453
454
455.vp8_filter_block1d16_v6_ssse3_loop:
    ; ---- columns 0-7 ----
456    movq        xmm1, MMWORD PTR [rsi]                  ;A
457    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
458    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
459    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
460    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
461
462    punpcklbw   xmm2, xmm4                  ;B D
463    punpcklbw   xmm3, xmm0                  ;C E
464
465    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
466
467    pmaddubsw   xmm3, xmm6                  ;(C,E) * k2_k4
468    punpcklbw   xmm1, xmm0                  ;A F
469    pmaddubsw   xmm2, xmm7                  ;(B,D) * k1_k3
470    pmaddubsw   xmm1, xmm5                  ;(A,F) * k0_k5
471
472    paddsw      xmm2, xmm3
473    paddsw      xmm2, xmm1
474    paddsw      xmm2, [GLOBAL(rd)]          ;+ rounding
475    psraw       xmm2, 7
476    packuswb    xmm2, xmm2                  ;saturate to unsigned bytes
477
478    movq        MMWORD PTR [rdi], xmm2          ;store the results
479
    ; ---- columns 8-15 ----
480    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
481    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
482    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
483    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
484    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
485
486    punpcklbw   xmm2, xmm4                  ;B D
487    punpcklbw   xmm3, xmm0                  ;C E
488
489    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
490    pmaddubsw   xmm3, xmm6
491    punpcklbw   xmm1, xmm0                  ;A F
492    pmaddubsw   xmm2, xmm7
493    pmaddubsw   xmm1, xmm5
494
495    add         rsi,  rdx                   ;slide the 6-row window down one row
496    add         rax,  rdx
497;--
498;--
499    paddsw      xmm2, xmm3
500    paddsw      xmm2, xmm1
501    paddsw      xmm2, [GLOBAL(rd)]          ;+ rounding
502    psraw       xmm2, 7
503    packuswb    xmm2, xmm2
504
505    movq        MMWORD PTR [rdi+8], xmm2    ;store columns 8-15
506
507%if ABI_IS_32BIT
508    add         rdi,        DWORD PTR arg(3) ;out_pitch
509%else
510    add         rdi,        r8
511%endif
512    dec         rcx
513    jnz         .vp8_filter_block1d16_v6_ssse3_loop
514
515    ; begin epilog
516    pop rdi
517    pop rsi
518    RESTORE_GOT
519    RESTORE_XMM
520    UNSHADOW_ARGS
521    pop         rbp
522    ret
523
    ; 4-tap path: entered with rax -> k0_k5 entry. Row A (at rsi) and row F are
    ; not read; rsi still addresses the same row as in the 6-tap path.
524.vp8_filter_block1d16_v4_ssse3:
525    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
526    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
527
528    mov         rsi, arg(0)             ;src_ptr
529    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
530    mov         rdi, arg(2)             ;output_ptr
531
532%if ABI_IS_32BIT=0
533    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
534%endif
535    mov         rax, rsi
536    movsxd      rcx, DWORD PTR arg(4)   ;output_height
537    add         rax, rdx                ;rax = src_ptr + pitch, used to reach row D
538
539.vp8_filter_block1d16_v4_ssse3_loop:
    ; columns 0-7 -> xmm2; the loads for columns 8-15 are interleaved
540    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
541    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
542    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
543    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
544
545    punpcklbw   xmm2, xmm4                  ;B D
546    punpcklbw   xmm3, xmm0                  ;C E
547
548    pmaddubsw   xmm3, xmm6                  ;(C,E) * k2_k4
549    pmaddubsw   xmm2, xmm7                  ;(B,D) * k1_k3
550    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
551    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
552    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
553    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
554
555    paddsw      xmm2, [GLOBAL(rd)]          ;+ rounding
556    paddsw      xmm2, xmm3
557    psraw       xmm2, 7
558    packuswb    xmm2, xmm2                  ;columns 0-7 as bytes
559
560    punpcklbw   xmm5, xmm4                  ;B D
561    punpcklbw   xmm1, xmm0                  ;C E
562
563    pmaddubsw   xmm1, xmm6
564    pmaddubsw   xmm5, xmm7
565
566    movdqa      xmm4, [GLOBAL(rd)]          ;rounding constant for columns 8-15
567    add         rsi,  rdx                   ;slide the row window down one row
568    add         rax,  rdx
569;--
570;--
571    paddsw      xmm5, xmm1
572    paddsw      xmm5, xmm4
573    psraw       xmm5, 7
574    packuswb    xmm5, xmm5                  ;columns 8-15 as bytes
575
576    punpcklqdq  xmm2, xmm5                  ;columns 0-7 | columns 8-15
577
578    movdqa       XMMWORD PTR [rdi], xmm2    ;aligned 16-byte store (6-tap path uses two movq)
579
580%if ABI_IS_32BIT
581    add         rdi,        DWORD PTR arg(3) ;out_pitch
582%else
583    add         rdi,        r8
584%endif
585    dec         rcx
586    jnz         .vp8_filter_block1d16_v4_ssse3_loop
587
588    ; begin epilog
589    pop rdi
590    pop rsi
591    RESTORE_GOT
592    RESTORE_XMM
593    UNSHADOW_ARGS
594    pop         rbp
595    ret
596
597;void vp8_filter_block1d8_v6_ssse3
598;(
599;    unsigned char *src_ptr,
600;    unsigned int   src_pitch,
601;    unsigned char *output_ptr,
602;    unsigned int   out_pitch,
603;    unsigned int   output_height,
604;    unsigned int   vp8_filter_index
605;)
;
; Vertical 6-tap filter for an 8-pixel-wide block; one output row (8 bytes,
; stored with movq) per loop iteration.
;
; Each output row combines six consecutive source rows A..F, where A is the
; row at src_ptr and F the row at src_ptr + 5*src_pitch. The rows are
; interleaved in pairs and multiplied with the tap pairs: (A,F) with k0_k5,
; (B,D) with k1_k3, (C,E) with k2_k4. The sums get the rounding constant rd
; added, are shifted right by 7 and packed to unsigned bytes.
;
; vp8_filter_index selects a 16-byte entry of the k0_k5 table; the matching
; k1_k3 / k2_k4 entries are read at +128 / +256 bytes from that entry. When
; the first dword of the k0_k5 entry is zero the 4-tap path is taken, which
; uses only rows B..E.
;
; output_height must be non-zero: both row loops are dec/jnz (do-while).
;
; Register use inside the loops:
;   rsi = row A                 rax = rsi + src_pitch (row B)
;   rdx = src_pitch             rdi = output row
;   rcx = rows remaining        r8  = out_pitch (64-bit builds only; 32-bit
;                                     builds re-read it from the stack)
606global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
607sym(vp8_filter_block1d8_v6_ssse3):
608    push        rbp
609    mov         rbp, rsp
610    SHADOW_ARGS_TO_STACK 6
611    SAVE_XMM 7
612    GET_GOT     rbx
613    push        rsi
614    push        rdi
615    ; end prolog
616
617    movsxd      rdx, DWORD PTR arg(5)   ;table index
618    xor         rsi, rsi                ;esi = 0, compared with the outer taps below
619    shl         rdx, 4      ;16 bytes per table entry
620
621    lea         rax, [GLOBAL(k0_k5)]
622    add         rax, rdx                ;rax -> selected k0_k5 entry
623
    ; arguments shared by both paths are loaded before the branch
624    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
625    mov         rdi, arg(2)             ;output_ptr
626%if ABI_IS_32BIT=0
627    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
628%endif
629    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
630
631    cmp         esi, DWORD PTR [rax]    ;first dword of the k0_k5 entry == 0 ?
632    je          .vp8_filter_block1d8_v4_ssse3
633
634    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
635    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
636    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
637
638    mov         rsi, arg(0)             ;src_ptr
639
640    mov         rax, rsi
641    add         rax, rdx                ;rax = src_ptr + pitch, used to reach rows D and F
642
643.vp8_filter_block1d8_v6_ssse3_loop:
644    movq        xmm1, MMWORD PTR [rsi]                  ;A
645    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
646    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
647    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
648    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
649
650    punpcklbw   xmm2, xmm4                  ;B D
651    punpcklbw   xmm3, xmm0                  ;C E
652
653    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
654    movdqa      xmm4, [GLOBAL(rd)]          ;rounding constant (xmm4 is free once D is interleaved)
655
656    pmaddubsw   xmm3, xmm6                  ;(C,E) * k2_k4
657    punpcklbw   xmm1, xmm0                  ;A F
658    pmaddubsw   xmm2, xmm7                  ;(B,D) * k1_k3
659    pmaddubsw   xmm1, xmm5                  ;(A,F) * k0_k5
660    add         rsi,  rdx                   ;slide the 6-row window down one row
661    add         rax,  rdx
662;--
663;--
664    paddsw      xmm2, xmm3
665    paddsw      xmm2, xmm1
666    paddsw      xmm2, xmm4                  ;+ rounding
667    psraw       xmm2, 7
668    packuswb    xmm2, xmm2                  ;saturate to unsigned bytes
669
670    movq        MMWORD PTR [rdi], xmm2      ;store 8 output pixels
671
672%if ABI_IS_32BIT
673    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
674%else
675    add         rdi,        r8
676%endif
677    dec         rcx
678    jnz         .vp8_filter_block1d8_v6_ssse3_loop
679
680    ; begin epilog
681    pop rdi
682    pop rsi
683    RESTORE_GOT
684    RESTORE_XMM
685    UNSHADOW_ARGS
686    pop         rbp
687    ret
688
    ; 4-tap path: entered with rax -> k0_k5 entry and rdx, rdi, rcx (and r8 on
    ; 64-bit builds) already loaded. Row A (at rsi) and row F are not read.
689.vp8_filter_block1d8_v4_ssse3:
690    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
691    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
692    movdqa      xmm5, [GLOBAL(rd)]              ;rounding constant, hoisted out of the loop
693
694    mov         rsi, arg(0)             ;src_ptr
695
696    mov         rax, rsi
697    add         rax, rdx                ;rax = src_ptr + pitch, used to reach row D
698
699.vp8_filter_block1d8_v4_ssse3_loop:
700    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
701    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
702    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
703    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
704
705    punpcklbw   xmm2, xmm4                  ;B D
706    punpcklbw   xmm3, xmm0                  ;C E
707
708    pmaddubsw   xmm3, xmm6                  ;(C,E) * k2_k4
709    pmaddubsw   xmm2, xmm7                  ;(B,D) * k1_k3
710    add         rsi,  rdx                   ;slide the row window down one row
711    add         rax,  rdx
712;--
713;--
714    paddsw      xmm2, xmm3
715    paddsw      xmm2, xmm5                  ;+ rounding
716    psraw       xmm2, 7
717    packuswb    xmm2, xmm2                  ;saturate to unsigned bytes
718
719    movq        MMWORD PTR [rdi], xmm2      ;store 8 output pixels
720
721%if ABI_IS_32BIT
722    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
723%else
724    add         rdi,        r8
725%endif
726    dec         rcx
727    jnz         .vp8_filter_block1d8_v4_ssse3_loop
728
729    ; begin epilog
730    pop rdi
731    pop rsi
732    RESTORE_GOT
733    RESTORE_XMM
734    UNSHADOW_ARGS
735    pop         rbp
736    ret
737;void vp8_filter_block1d4_v6_ssse3
738;(
739;    unsigned char *src_ptr,
740;    unsigned int   src_pitch,
741;    unsigned char *output_ptr,
742;    unsigned int   out_pitch,
743;    unsigned int   output_height,
744;    unsigned int   vp8_filter_index
745;)
;
; Vertical 6-tap filter for a 4-pixel-wide block; one output row (4 bytes,
; stored with movd) per loop iteration.
;
; Each output row combines six consecutive source rows A..F, where A is the
; row at src_ptr and F the row at src_ptr + 5*src_pitch. The rows are
; interleaved in pairs and multiplied with the tap pairs: (A,F) with k0_k5,
; (B,D) with k1_k3, (C,E) with k2_k4. The sums get the rounding constant rd
; added, are shifted right by 7 and packed to unsigned bytes.
;
; vp8_filter_index selects a 16-byte entry of the k0_k5 table; the matching
; k1_k3 / k2_k4 entries are read at +128 / +256 bytes from that entry. Only
; the low 8 bytes of each entry (and of rd) are loaded here. When the first
; dword of the k0_k5 entry is zero the 4-tap path is taken, which uses only
; rows B..E.
;
; Unlike the wider routines this one works entirely in the 64-bit MMX
; registers mm0-mm7 (hence no SAVE_XMM / RESTORE_XMM) and returns without
; executing emms.
; NOTE(review): presumably callers clear the MMX/x87 state before any
; floating-point code runs -- confirm.
;
; output_height must be non-zero: both row loops are dec/jnz (do-while).
;
; Register use inside the loops:
;   rsi = row A                 rax = rsi + src_pitch (row B)
;   rdx = src_pitch             rdi = output row
;   rcx = rows remaining        r8  = out_pitch (64-bit builds only; 32-bit
;                                     builds re-read it from the stack)
746global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
747sym(vp8_filter_block1d4_v6_ssse3):
748    push        rbp
749    mov         rbp, rsp
750    SHADOW_ARGS_TO_STACK 6
751    GET_GOT     rbx
752    push        rsi
753    push        rdi
754    ; end prolog
755
756    movsxd      rdx, DWORD PTR arg(5)   ;table index
757    xor         rsi, rsi                ;esi = 0, compared with the outer taps below
758    shl         rdx, 4      ;16 bytes per table entry
759
760    lea         rax, [GLOBAL(k0_k5)]
761    add         rax, rdx                ;rax -> selected k0_k5 entry
762
    ; arguments shared by both paths are loaded before the branch
763    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
764    mov         rdi, arg(2)             ;output_ptr
765%if ABI_IS_32BIT=0
766    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
767%endif
768    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
769
770    cmp         esi, DWORD PTR [rax]    ;first dword of the k0_k5 entry == 0 ?
771    je          .vp8_filter_block1d4_v4_ssse3
772
773    movq        mm5, MMWORD PTR [rax]         ;k0_k5
774    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
775    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
776
777    mov         rsi, arg(0)             ;src_ptr
778
779    mov         rax, rsi
780    add         rax, rdx                ;rax = src_ptr + pitch, used to reach rows D and F
781
782.vp8_filter_block1d4_v6_ssse3_loop:
783    movd        mm1, DWORD PTR [rsi]                  ;A
784    movd        mm2, DWORD PTR [rsi + rdx]            ;B
785    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
786    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
787    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
788
789    punpcklbw   mm2, mm4                  ;B D
790    punpcklbw   mm3, mm0                  ;C E
791
792    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
793
794    movq        mm4, [GLOBAL(rd)]         ;rounding constant (mm4 is free once D is interleaved)
795
796    pmaddubsw   mm3, mm6                  ;(C,E) * k2_k4
797    punpcklbw   mm1, mm0                  ;A F
798    pmaddubsw   mm2, mm7                  ;(B,D) * k1_k3
799    pmaddubsw   mm1, mm5                  ;(A,F) * k0_k5
800    add         rsi,  rdx                 ;slide the 6-row window down one row
801    add         rax,  rdx
802;--
803;--
804    paddsw      mm2, mm3
805    paddsw      mm2, mm1
806    paddsw      mm2, mm4                  ;+ rounding
807    psraw       mm2, 7
808    packuswb    mm2, mm2                  ;saturate to unsigned bytes
809
810    movd        DWORD PTR [rdi], mm2      ;store 4 output pixels
811
812%if ABI_IS_32BIT
813    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
814%else
815    add         rdi,        r8
816%endif
817    dec         rcx
818    jnz         .vp8_filter_block1d4_v6_ssse3_loop
819
820    ; begin epilog
821    pop rdi
822    pop rsi
823    RESTORE_GOT
824    UNSHADOW_ARGS
825    pop         rbp
826    ret
827
    ; 4-tap path: entered with rax -> k0_k5 entry and rdx, rdi, rcx (and r8 on
    ; 64-bit builds) already loaded. Row A (at rsi) and row F are not read.
828.vp8_filter_block1d4_v4_ssse3:
829    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
830    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
831    movq        mm5, MMWORD PTR [GLOBAL(rd)]  ;rounding constant, hoisted out of the loop
832
833    mov         rsi, arg(0)             ;src_ptr
834
835    mov         rax, rsi
836    add         rax, rdx                ;rax = src_ptr + pitch, used to reach row D
837
838.vp8_filter_block1d4_v4_ssse3_loop:
839    movd        mm2, DWORD PTR [rsi + rdx]            ;B
840    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
841    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
842    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
843
844    punpcklbw   mm2, mm4                  ;B D
845    punpcklbw   mm3, mm0                  ;C E
846
847    pmaddubsw   mm3, mm6                  ;(C,E) * k2_k4
848    pmaddubsw   mm2, mm7                  ;(B,D) * k1_k3
849    add         rsi,  rdx                 ;slide the row window down one row
850    add         rax,  rdx
851;--
852;--
853    paddsw      mm2, mm3
854    paddsw      mm2, mm5                  ;+ rounding
855    psraw       mm2, 7
856    packuswb    mm2, mm2                  ;saturate to unsigned bytes
857
858    movd        DWORD PTR [rdi], mm2      ;store 4 output pixels
859
860%if ABI_IS_32BIT
861    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
862%else
863    add         rdi,        r8
864%endif
865    dec         rcx
866    jnz         .vp8_filter_block1d4_v4_ssse3_loop
867
868    ; begin epilog
869    pop rdi
870    pop rsi
871    RESTORE_GOT
872    UNSHADOW_ARGS
873    pop         rbp
874    ret
875
876;void vp8_bilinear_predict16x16_ssse3
877;(
878;    unsigned char  *src_ptr,
879;    int   src_pixels_per_line,
880;    int  xoffset,
881;    int  yoffset,
882;    unsigned char *dst_ptr,
883;    int dst_pitch
884;)
885global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
886sym(vp8_bilinear_predict16x16_ssse3):
887    push        rbp
888    mov         rbp, rsp
889    SHADOW_ARGS_TO_STACK 6
890    SAVE_XMM 7
891    GET_GOT     rbx
892    push        rsi
893    push        rdi
894    ; end prolog
895
896        lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
897        movsxd      rax,        dword ptr arg(2)    ; xoffset
898
899        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
900        je          .b16x16_sp_only
901
902        shl         rax,        4
903        lea         rax,        [rax + rcx]         ; HFilter
904
905        mov         rdi,        arg(4)              ; dst_ptr
906        mov         rsi,        arg(0)              ; src_ptr
907        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
908
909        movdqa      xmm1,       [rax]
910
911        movsxd      rax,        dword ptr arg(3)    ; yoffset
912
913        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
914        je          .b16x16_fp_only
915
916        shl         rax,        4
917        lea         rax,        [rax + rcx]         ; VFilter
918
919        lea         rcx,        [rdi+rdx*8]
920        lea         rcx,        [rcx+rdx*8]
921        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line
922
923        movdqa      xmm2,       [rax]
924
925%if ABI_IS_32BIT=0
926        movsxd      r8,         dword ptr arg(5)    ; dst_pitch
927%endif
928        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
929        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
930
931        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
932        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
933
934        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
935
936        lea         rsi,        [rsi + rdx]         ; next line
937
938        pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14
939
940        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
941        pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15
942
943        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
944        psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
945
946        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
947        psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
948
949        movdqa      xmm7,       xmm3
950        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
951
952.next_row:
953        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
954        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
955
956        punpcklbw   xmm6,       xmm5
957        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
958
959        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
960        lea         rsi,        [rsi + rdx]         ; next line
961
962        pmaddubsw   xmm6,       xmm1
963
964        punpcklbw   xmm4,       xmm5
965        pmaddubsw   xmm4,       xmm1
966
967        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
968        psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
969
970        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
971        psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
972
973        packuswb    xmm6,       xmm4
974        movdqa      xmm5,       xmm7
975
976        punpcklbw   xmm5,       xmm6
977        pmaddubsw   xmm5,       xmm2
978
979        punpckhbw   xmm7,       xmm6
980        pmaddubsw   xmm7,       xmm2
981
982        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
983        psraw       xmm5,       VP8_FILTER_SHIFT    ; xmm5 /= 128
984
985        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
986        psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
987
988        packuswb    xmm5,       xmm7
989        movdqa      xmm7,       xmm6
990
991        movdqa      [rdi],      xmm5                ; store the results in the destination
992%if ABI_IS_32BIT
993        add         rdi,        DWORD PTR arg(5)    ; dst_pitch
994%else
995        add         rdi,        r8
996%endif
997
998        cmp         rdi,        rcx
999        jne         .next_row
1000
1001        jmp         .done
1002
1003.b16x16_sp_only:
1004        movsxd      rax,        dword ptr arg(3)    ; yoffset
1005        shl         rax,        4
1006        lea         rax,        [rax + rcx]         ; VFilter
1007
1008        mov         rdi,        arg(4)              ; dst_ptr
1009        mov         rsi,        arg(0)              ; src_ptr
1010        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
1011
1012        movdqa      xmm1,       [rax]               ; VFilter
1013
1014        lea         rcx,        [rdi+rdx*8]
1015        lea         rcx,        [rcx+rdx*8]
1016        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
1017
1018        ; get the first horizontal line done
1019        movq        xmm4,       [rsi]               ; load row 0
1020        movq        xmm2,       [rsi + 8]           ; load row 0
1021
1022        lea         rsi,        [rsi + rax]         ; next line
1023.next_row_sp:
1024        movq        xmm3,       [rsi]               ; load row + 1
1025        movq        xmm5,       [rsi + 8]           ; load row + 1
1026
1027        punpcklbw   xmm4,       xmm3
1028        punpcklbw   xmm2,       xmm5
1029
1030        pmaddubsw   xmm4,       xmm1
1031        movq        xmm7,       [rsi + rax]         ; load row + 2
1032
1033        pmaddubsw   xmm2,       xmm1
1034        movq        xmm6,       [rsi + rax + 8]     ; load row + 2
1035
1036        punpcklbw   xmm3,       xmm7
1037        punpcklbw   xmm5,       xmm6
1038
1039        pmaddubsw   xmm3,       xmm1
1040        paddw       xmm4,       [GLOBAL(rd)]
1041
1042        pmaddubsw   xmm5,       xmm1
1043        paddw       xmm2,       [GLOBAL(rd)]
1044
1045        psraw       xmm4,       VP8_FILTER_SHIFT
1046        psraw       xmm2,       VP8_FILTER_SHIFT
1047
1048        packuswb    xmm4,       xmm2
1049        paddw       xmm3,       [GLOBAL(rd)]
1050
1051        movdqa      [rdi],      xmm4                ; store row 0
1052        paddw       xmm5,       [GLOBAL(rd)]
1053
1054        psraw       xmm3,       VP8_FILTER_SHIFT
1055        psraw       xmm5,       VP8_FILTER_SHIFT
1056
1057        packuswb    xmm3,       xmm5
1058        movdqa      xmm4,       xmm7
1059
1060        movdqa      [rdi + rdx],xmm3                ; store row 1
1061        lea         rsi,        [rsi + 2*rax]
1062
1063        movdqa      xmm2,       xmm6
1064        lea         rdi,        [rdi + 2*rdx]
1065
1066        cmp         rdi,        rcx
1067        jne         .next_row_sp
1068
1069        jmp         .done
1070
1071.b16x16_fp_only:
1072        lea         rcx,        [rdi+rdx*8]
1073        lea         rcx,        [rcx+rdx*8]
1074        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
1075
1076.next_row_fp:
1077        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
1078        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
1079
1080        punpcklbw   xmm2,       xmm4
1081        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
1082
1083        pmaddubsw   xmm2,       xmm1
1084        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
1085
1086        lea         rsi,        [rsi + rax]         ; next line
1087        punpcklbw   xmm3,       xmm4
1088
1089        pmaddubsw   xmm3,       xmm1
1090        movq        xmm5,       [rsi]
1091
1092        paddw       xmm2,       [GLOBAL(rd)]
1093        movq        xmm7,       [rsi+1]
1094
1095        movq        xmm6,       [rsi+8]
1096        psraw       xmm2,       VP8_FILTER_SHIFT
1097
1098        punpcklbw   xmm5,       xmm7
1099        movq        xmm7,       [rsi+9]
1100
1101        paddw       xmm3,       [GLOBAL(rd)]
1102        pmaddubsw   xmm5,       xmm1
1103
1104        psraw       xmm3,       VP8_FILTER_SHIFT
1105        punpcklbw   xmm6,       xmm7
1106
1107        packuswb    xmm2,       xmm3
1108        pmaddubsw   xmm6,       xmm1
1109
1110        movdqa      [rdi],      xmm2                ; store the results in the destination
1111        paddw       xmm5,       [GLOBAL(rd)]
1112
1113        lea         rdi,        [rdi + rdx]         ; dst_pitch
1114        psraw       xmm5,       VP8_FILTER_SHIFT
1115
1116        paddw       xmm6,       [GLOBAL(rd)]
1117        psraw       xmm6,       VP8_FILTER_SHIFT
1118
1119        packuswb    xmm5,       xmm6
1120        lea         rsi,        [rsi + rax]         ; next line
1121
1122        movdqa      [rdi],      xmm5                ; store the results in the destination
1123        lea         rdi,        [rdi + rdx]         ; dst_pitch
1124
1125        cmp         rdi,        rcx
1126
1127        jne         .next_row_fp
1128
1129.done:
1130    ; begin epilog
1131    pop         rdi
1132    pop         rsi
1133    RESTORE_GOT
1134    RESTORE_XMM
1135    UNSHADOW_ARGS
1136    pop         rbp
1137    ret
1138
;void vp8_bilinear_predict8x8_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Bilinear prediction of one 8x8 block.
;
; xoffset / yoffset each select one 16-byte row of vp8_bilinear_filters_ssse3
; (address = table + (offset << 4)). The routine dispatches on them:
;
;   xoffset != 0, yoffset != 0 : horizontal pass followed by vertical pass
;   xoffset != 0, yoffset == 0 : horizontal pass only     (.b8x8_fp_only)
;   xoffset == 0, yoffset != 0 : vertical pass only       (.b8x8_sp_only)
;   xoffset == 0, yoffset == 0 : plain 8x8 copy           (.b8x8_copy)
;
; Filter row 0 (128, 0) is never used as a pmaddubsw multiplier: pmaddubsw
; reads the multiplier bytes as signed, so 128 would act as -128 and the
; final unsigned saturation would turn every pixel into 0. That is why the
; zero offsets are routed around the corresponding pass.
;
; Every filtered value is (a*w0 + b*w1 + 64) >> VP8_FILTER_SHIFT, packed to
; a byte with unsigned saturation (64 comes from the 'rd' constant).
;
; Stack usage: nine source rows of 16 bytes are loaded unaligned and parked
; in a 144-byte, 16-byte-aligned buffer at [rsp]. rsp itself is then used as
; the read cursor, so EVERY path must have advanced rsp by exactly 144 bytes
; when it reaches .done8x8, where 'pop rsp' reloads the stack pointer
; (presumably the value saved by ALIGN_STACK from x86_abi_support.asm).
;
; Eight rows of 8 bytes are written with movq. The loops stop when
; rdi == dst_ptr + 8*dst_pitch, so dst_pitch must be non-zero.
global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
sym(vp8_bilinear_predict8x8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                         ; reserve 9 rows * 16 bytes

        lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]

        mov         rsi,        arg(0) ;src_ptr
        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

    ;Read 9 lines of unaligned data and put them on the stack, so that all
    ;later row loads are aligned. rax = 3 * src_pixels_per_line is used to
    ;step rsi three rows at a time.
        movdqu      xmm0,       [rsi]
        lea         rax,        [rdx + rdx*2]
        movdqu      xmm1,       [rsi+rdx]
        movdqu      xmm2,       [rsi+rdx*2]
        add         rsi,        rax
        movdqu      xmm3,       [rsi]
        movdqu      xmm4,       [rsi+rdx]
        movdqu      xmm5,       [rsi+rdx*2]
        add         rsi,        rax
        movdqu      xmm6,       [rsi]
        movdqu      xmm7,       [rsi+rdx]

        movdqa      XMMWORD PTR [rsp],            xmm0

        movdqu      xmm0,       [rsi+rdx*2]         ; row 8 (xmm0 is free again)

        movdqa      XMMWORD PTR [rsp+16],         xmm1
        movdqa      XMMWORD PTR [rsp+32],         xmm2
        movdqa      XMMWORD PTR [rsp+48],         xmm3
        movdqa      XMMWORD PTR [rsp+64],         xmm4
        movdqa      XMMWORD PTR [rsp+80],         xmm5
        movdqa      XMMWORD PTR [rsp+96],         xmm6
        movdqa      XMMWORD PTR [rsp+112],        xmm7
        movdqa      XMMWORD PTR [rsp+128],        xmm0

        movsxd      rax,        dword ptr arg(2)    ; xoffset
        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
        je          .b8x8_sp_only

        shl         rax,        4
        add         rax,        rcx                 ; HFilter

        mov         rdi,        arg(4)              ; dst_ptr
        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

        movdqa      xmm0,       [rax]               ; xmm0 = HFilter

        movsxd      rax,        dword ptr arg(3)    ; yoffset
        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
        je          .b8x8_fp_only

        shl         rax,        4
        lea         rax,        [rax + rcx]         ; VFilter

        lea         rcx,        [rdi+rdx*8]         ; rcx = end of destination (8 rows)

        movdqa      xmm1,       [rax]               ; xmm1 = VFilter

        ; get the first horizontal line done
        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
        movdqa      xmm5,       xmm3

        psrldq      xmm5,       1                   ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
        lea         rsp,        [rsp + 16]          ; next line

        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
        pmaddubsw   xmm3,       xmm0                ; 8 words: pixel pairs weighted by HFilter

        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
        psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

        movdqa      xmm7,       xmm3
        packuswb    xmm7,       xmm7                ; xmm7 = 8 filtered bytes of the previous row

.next_row:
        ; Horizontal pass on the current row (xmm6), then vertical pass
        ; between the previous filtered row (xmm7) and the current one.
        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
        lea         rsp,        [rsp + 16]          ; next line

        movdqa      xmm5,       xmm6

        psrldq      xmm5,       1

        punpcklbw   xmm6,       xmm5
        pmaddubsw   xmm6,       xmm0

        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
        psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128

        packuswb    xmm6,       xmm6

        punpcklbw   xmm7,       xmm6                ; interleave previous / current filtered rows
        pmaddubsw   xmm7,       xmm1

        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
        psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128

        packuswb    xmm7,       xmm7

        movq        [rdi],      xmm7                ; store the results in the destination
        lea         rdi,        [rdi + rdx]

        movdqa      xmm7,       xmm6                ; current row becomes the previous row

        cmp         rdi,        rcx
        jne         .next_row

        ; rsp has advanced 16 (first line) + 8*16 (loop) = 144 bytes here.
        jmp         .done8x8

.b8x8_sp_only:
        ; xoffset == 0: no horizontal pass.
        movsxd      rax,        dword ptr arg(3)    ; yoffset

        mov         rdi,        arg(4) ;dst_ptr
        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

        ; yoffset == 0 as well would select filter row 0 (128, 0). pmaddubsw
        ; treats that 128 as -128, and the unsigned saturation in packuswb
        ; would then clamp every result to 0, so copy the block instead.
        test        rax,        rax
        jz          .b8x8_copy

        shl         rax,        4
        lea         rax,        [rax + rcx]         ; VFilter

        movdqa      xmm0,       [rax]               ; VFilter

        ; Load the low 8 bytes of rows 0..6 and interleave each row with the
        ; row below it, ready for the vertical pmaddubsw.
        movq        xmm1,       XMMWORD PTR [rsp]
        movq        xmm2,       XMMWORD PTR [rsp+16]

        movq        xmm3,       XMMWORD PTR [rsp+32]
        punpcklbw   xmm1,       xmm2

        movq        xmm4,       XMMWORD PTR [rsp+48]
        punpcklbw   xmm2,       xmm3

        movq        xmm5,       XMMWORD PTR [rsp+64]
        punpcklbw   xmm3,       xmm4

        movq        xmm6,       XMMWORD PTR [rsp+80]
        punpcklbw   xmm4,       xmm5

        movq        xmm7,       XMMWORD PTR [rsp+96]
        punpcklbw   xmm5,       xmm6

        pmaddubsw   xmm1,       xmm0
        pmaddubsw   xmm2,       xmm0

        pmaddubsw   xmm3,       xmm0
        pmaddubsw   xmm4,       xmm0

        pmaddubsw   xmm5,       xmm0
        punpcklbw   xmm6,       xmm7

        pmaddubsw   xmm6,       xmm0
        paddw       xmm1,       [GLOBAL(rd)]

        paddw       xmm2,       [GLOBAL(rd)]
        psraw       xmm1,       VP8_FILTER_SHIFT

        paddw       xmm3,       [GLOBAL(rd)]
        psraw       xmm2,       VP8_FILTER_SHIFT

        paddw       xmm4,       [GLOBAL(rd)]
        psraw       xmm3,       VP8_FILTER_SHIFT

        paddw       xmm5,       [GLOBAL(rd)]
        psraw       xmm4,       VP8_FILTER_SHIFT

        paddw       xmm6,       [GLOBAL(rd)]
        psraw       xmm5,       VP8_FILTER_SHIFT

        psraw       xmm6,       VP8_FILTER_SHIFT

        packuswb    xmm1,       xmm1

        packuswb    xmm2,       xmm2
        movq        [rdi],      xmm1                ; output row 0

        packuswb    xmm3,       xmm3
        movq        [rdi+rdx],  xmm2                ; output row 1

        packuswb    xmm4,       xmm4
        movq        xmm1,       XMMWORD PTR [rsp+112]   ; source row 7

        lea         rdi,        [rdi + 2*rdx]
        movq        xmm2,       XMMWORD PTR [rsp+128]   ; source row 8

        packuswb    xmm5,       xmm5
        movq        [rdi],      xmm3                ; output row 2

        packuswb    xmm6,       xmm6
        movq        [rdi+rdx],  xmm4                ; output row 3

        lea         rdi,        [rdi + 2*rdx]
        punpcklbw   xmm7,       xmm1                ; rows 6/7

        movq        [rdi],      xmm5                ; output row 4
        pmaddubsw   xmm7,       xmm0

        movq        [rdi+rdx],  xmm6                ; output row 5
        punpcklbw   xmm1,       xmm2                ; rows 7/8

        pmaddubsw   xmm1,       xmm0
        paddw       xmm7,       [GLOBAL(rd)]

        psraw       xmm7,       VP8_FILTER_SHIFT
        paddw       xmm1,       [GLOBAL(rd)]

        psraw       xmm1,       VP8_FILTER_SHIFT
        packuswb    xmm7,       xmm7

        packuswb    xmm1,       xmm1
        lea         rdi,        [rdi + 2*rdx]

        movq        [rdi],      xmm7                ; output row 6

        movq        [rdi+rdx],  xmm1                ; output row 7
        lea         rsp,        [rsp + 144]         ; skip the whole row buffer

        jmp         .done8x8

.b8x8_copy:
        ; xoffset == 0 && yoffset == 0: the prediction is the source block
        ; itself. Copy the low 8 bytes of buffered rows 0..7 to the
        ; destination (rdi = dst_ptr, rdx = dst_pitch).
        movq        xmm0,       XMMWORD PTR [rsp]
        movq        xmm1,       XMMWORD PTR [rsp+16]
        movq        xmm2,       XMMWORD PTR [rsp+32]
        movq        xmm3,       XMMWORD PTR [rsp+48]
        movq        xmm4,       XMMWORD PTR [rsp+64]
        movq        xmm5,       XMMWORD PTR [rsp+80]
        movq        xmm6,       XMMWORD PTR [rsp+96]
        movq        xmm7,       XMMWORD PTR [rsp+112]

        movq        [rdi],      xmm0                ; output row 0
        movq        [rdi+rdx],  xmm1                ; output row 1
        lea         rdi,        [rdi + 2*rdx]

        movq        [rdi],      xmm2                ; output row 2
        movq        [rdi+rdx],  xmm3                ; output row 3
        lea         rdi,        [rdi + 2*rdx]

        movq        [rdi],      xmm4                ; output row 4
        movq        [rdi+rdx],  xmm5                ; output row 5
        lea         rdi,        [rdi + 2*rdx]

        movq        [rdi],      xmm6                ; output row 6
        movq        [rdi+rdx],  xmm7                ; output row 7

        lea         rsp,        [rsp + 144]         ; skip the whole row buffer

        jmp         .done8x8

.b8x8_fp_only:
        ; yoffset == 0: horizontal pass only, four rows per iteration.
        ; xmm0 = HFilter, rdi = dst_ptr, rdx = dst_pitch (set up above).
        lea         rcx,        [rdi+rdx*8]         ; rcx = end of destination (8 rows)

.next_row_fp:
        movdqa      xmm1,       XMMWORD PTR [rsp]
        movdqa      xmm3,       XMMWORD PTR [rsp+16]

        movdqa      xmm2,       xmm1
        movdqa      xmm5,       XMMWORD PTR [rsp+32]

        psrldq      xmm2,       1
        movdqa      xmm7,       XMMWORD PTR [rsp+48]

        movdqa      xmm4,       xmm3
        psrldq      xmm4,       1

        movdqa      xmm6,       xmm5
        psrldq      xmm6,       1

        punpcklbw   xmm1,       xmm2
        pmaddubsw   xmm1,       xmm0

        punpcklbw   xmm3,       xmm4
        pmaddubsw   xmm3,       xmm0

        punpcklbw   xmm5,       xmm6
        pmaddubsw   xmm5,       xmm0

        movdqa      xmm2,       xmm7
        psrldq      xmm2,       1

        punpcklbw   xmm7,       xmm2
        pmaddubsw   xmm7,       xmm0

        paddw       xmm1,       [GLOBAL(rd)]
        psraw       xmm1,       VP8_FILTER_SHIFT

        paddw       xmm3,       [GLOBAL(rd)]
        psraw       xmm3,       VP8_FILTER_SHIFT

        paddw       xmm5,       [GLOBAL(rd)]
        psraw       xmm5,       VP8_FILTER_SHIFT

        paddw       xmm7,       [GLOBAL(rd)]
        psraw       xmm7,       VP8_FILTER_SHIFT

        packuswb    xmm1,       xmm1
        packuswb    xmm3,       xmm3

        packuswb    xmm5,       xmm5
        movq        [rdi],      xmm1

        packuswb    xmm7,       xmm7
        movq        [rdi+rdx],  xmm3

        lea         rdi,        [rdi + 2*rdx]
        movq        [rdi],      xmm5

        lea         rsp,        [rsp + 4*16]        ; four buffered rows consumed
        movq        [rdi+rdx],  xmm7

        lea         rdi,        [rdi + 2*rdx]
        cmp         rdi,        rcx

        jne         .next_row_fp

        lea         rsp,        [rsp + 16]          ; skip unused row 8: 2*64 + 16 = 144

.done8x8:
    ; rsp is now 144 bytes above the buffer base on every path (equivalent
    ; to 'add rsp, 144'), so the pop below reloads the saved stack pointer.
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1456
; Read-only constant tables shared by the routines in this file. Every table
; is a multiple of 16 bytes long, so a label that follows another table
; without its own 'align 16' still starts on a 16-byte boundary.
SECTION_RODATA
align 16
; Three 16-byte tables of byte indices. Their consumers are outside this
; section -- presumably pshufb control masks for the six-tap filters
; (TODO confirm against the callers).
shuf1b:
    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
shuf2b:
    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
shuf3b:
    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10

; Two more 16-byte index tables; judging by the names they derive the
; shuf2b / shuf3b arrangement from data already shuffled with shuf1b
; (NOTE(review): inferred from the labels only -- confirm at the use sites).
align 16
shuf2bfrom1:
    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
align 16
shuf3bfrom1:
    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11

; Rounding constant: eight words of 0x40 (64), i.e. half of
; 1 << VP8_FILTER_SHIFT. It is added to the filter sums right before the
; 'psraw ..., VP8_FILTER_SHIFT' so that the shift rounds to nearest.
align 16
rd:
    times 8 dw 0x40

; Six-tap filter coefficients, split over three tables of eight 16-byte rows.
; k0_k5 is addressed as table + (filter index << 4); k1_k3 and k2_k4 have the
; same row layout and are presumably indexed the same way. Going by the label
; names, row i of each table holds one tap pair of filter i, repeated eight
; times as signed bytes: (k0, k5), (k1, k3) and (k2, k4). For every row index
; the six values sum to 128 (VP8_FILTER_WEIGHT).
align 16
k0_k5:
    times 8 db 0, 0             ;placeholder
    times 8 db 0, 0
    times 8 db 2, 1
    times 8 db 0, 0
    times 8 db 3, 3
    times 8 db 0, 0
    times 8 db 1, 2
    times 8 db 0, 0
k1_k3:
    times 8 db  0,    0         ;placeholder
    times 8 db  -6,  12
    times 8 db -11,  36
    times 8 db  -9,  50
    times 8 db -16,  77
    times 8 db  -6,  93
    times 8 db  -8, 108
    times 8 db  -1, 123
k2_k4:
    times 8 db 128,    0        ;placeholder
    times 8 db 123,   -1
    times 8 db 108,   -8
    times 8 db  93,   -6
    times 8 db  77,  -16
    times 8 db  50,   -9
    times 8 db  36,  -11
    times 8 db  12,   -6
; Bilinear filter weights: eight 16-byte rows, row n selected by
; xoffset / yoffset as table + (n << 4). Each row repeats the byte pair
; (128 - 16*n, 16*n) eight times, matching the interleaved pixel pairs that
; pmaddubsw multiplies. pmaddubsw reads these bytes as signed, so the 128 in
; row 0 behaves as -128 there; row 0 is only meaningful as "no filtering".
align 16
vp8_bilinear_filters_ssse3:
    times 8 db 128, 0
    times 8 db 112, 16
    times 8 db 96,  32
    times 8 db 80,  48
    times 8 db 64,  64
    times 8 db 48,  80
    times 8 db 32,  96
    times 8 db 16,  112
1516