1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------
; VERTx4 <avg>
; Vertical 8-tap filter producing 4 output pixels per row.
;   avg = 0 : store the filtered pixels
;   avg = 1 : pavgb the filtered pixels with the 4 bytes already at
;             output_ptr, then store
; Reads arg(0)..arg(5): src_ptr, pixels_per_line, output_ptr, out_pitch,
; output_height, filter ptr. The invoking function must define the
; 16-byte aligned slots k0k1, k2k3, k4k5, k6k7 and krd (written with movdqa).
; Clobbers rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7 and, when ABI_IS_32BIT
; is 0, r8.
; The row loop is bottom-tested (dec/jnz), so output_height must be >= 1.
;-----------------------------------------------------------------------
%macro VERTx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;64 in each of the two low words (rounding)

    movdqa      xmm4, [rdx]                 ;load filters (must be 16-byte aligned)
    movq        xmm5, rcx                   ;movq, as in VERTx8/VERTx16: source is a 64-bit GPR
    packsswb    xmm4, xmm4                  ;taps narrowed to signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;replicate each pair across all 16 bytes
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;rounding value in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = src + pitch (rows B, D, F, H)

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    movd        xmm0, [rsi]                 ;A
    movd        xmm1, [rsi + rdx]           ;B
    movd        xmm2, [rsi + rdx * 2]       ;C
    movd        xmm3, [rax + rdx * 2]       ;D
    movd        xmm4, [rsi + rdx * 4]       ;E
    movd        xmm5, [rax + rdx * 4]       ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movd        xmm6, [rsi + rbx]           ;G
    movd        xmm7, [rax + rbx]           ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ;Saturating sum: outer pairs first, then the smaller of the two
    ;middle pairs, then the larger one.
    movdqa      xmm1, xmm2
    paddsw      xmm0, xmm6
    pmaxsw      xmm2, xmm4
    pminsw      xmm4, xmm1
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;+64
    psraw       xmm0, 7                     ;>>7
    packuswb    xmm0, xmm0                  ;clamp to unsigned bytes

    add         rsi,  rdx
    add         rax,  rdx
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm
100
;-----------------------------------------------------------------------
; VERTx8 <avg>
; Vertical 8-tap filter producing 8 output pixels per row.
;   avg = 0 : store the filtered pixels
;   avg = 1 : pavgb them with the 8 bytes already at output_ptr first
; Reads arg(0)..arg(5) and the caller-defined 16-byte aligned slots
; k0k1, k2k3, k4k5, k6k7, krd.
; Clobbers rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7 (and r8 when
; ABI_IS_32BIT is 0). The loop is bottom-tested: output_height >= 1.
;-----------------------------------------------------------------------
%macro VERTx8 1
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rdx, arg(5)                 ;filter ptr
    mov         rcx, 0x0400040              ;two 16-bit 64s: rounding term

    movdqa      xmm4, [rdx]                 ;eight 16-bit taps
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshufd      xmm5, xmm5, 0               ;64 in every word

    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
    movdqa      krd, xmm5

    ;widen each tap pair to the full register and park it on the stack
    punpcklqdq  xmm0, xmm0
    movdqa      k0k1, xmm0
    punpcklqdq  xmm1, xmm1
    movdqa      k2k3, xmm1
    punpcklqdq  xmm2, xmm2
    movdqa      k4k5, xmm2
    punpcklqdq  xmm3, xmm3
    movdqa      k6k7, xmm3

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif

    lea         rax, [rsi + rdx]            ;src + pitch: base for rows B, D, F, H
    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    punpcklbw   xmm0, xmm1                  ;A B
    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    punpcklbw   xmm2, xmm3                  ;C D
    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F
    punpcklbw   xmm4, xmm5                  ;E F
    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H
    punpcklbw   xmm6, xmm7                  ;G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm6, k6k7
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5

    paddsw      xmm0, xmm6                  ;outer tap pairs
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;larger middle pair
    pminsw      xmm4, xmm1                  ;smaller middle pair
    paddsw      xmm0, xmm4                  ;smaller one is added first
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;+64
    psraw       xmm0, 7
    packuswb    xmm0, xmm0

%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

    add         rsi,  rdx                   ;next source row
    add         rax,  rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm
187
188
;-----------------------------------------------------------------------
; VERTx16 <avg>
; Vertical 8-tap filter producing 16 output pixels per row, computed as
; two independent 8-pixel halves (byte offsets 0 and 8).
;   avg = 0 : store the filtered pixels
;   avg = 1 : pavgb each half with the bytes already at output_ptr first
; Reads arg(0)..arg(5) and the caller-defined 16-byte aligned slots
; k0k1, k2k3, k4k5, k6k7, krd.
; Clobbers rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7 (and r8 when
; ABI_IS_32BIT is 0). The loop is bottom-tested: output_height >= 1.
; Both halves accumulate the tap pairs in the same order (outer pairs,
; then the smaller middle pair, then the larger) so that saturation in
; paddsw affects them identically.
;-----------------------------------------------------------------------
%macro VERTx16 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;64 in each of the two low words (rounding)

    movdqa      xmm4, [rdx]                 ;load filters (must be 16-byte aligned)
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps narrowed to signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;rounding value in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = src + pitch (rows B, D, F, H)

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    ;---- left 8 pixels ----
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    paddsw      xmm0, xmm6
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4
    pminsw      xmm4, xmm1
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

    ;---- right 8 pixels ----
    movq        xmm0, [rsi + 8]             ;A
    movq        xmm1, [rsi + rdx + 8]       ;B
    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
    movq        xmm3, [rax + rdx * 2 + 8]   ;D
    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
    movq        xmm5, [rax + rdx * 4 + 8]   ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx + 8]       ;G
    movq        xmm7, [rax + rbx + 8]       ;H
    punpcklbw   xmm6, xmm7                  ;G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ;Same accumulation order as the left half; xmm1 is free here because
    ;row B was already merged into xmm0.
    paddsw      xmm0, xmm6
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4
    pminsw      xmm4, xmm1
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd
    psraw       xmm0, 7
    packuswb    xmm0, xmm0

    add         rsi,  rdx
    add         rax,  rdx
%if %1
    movq        xmm1, [rdi+8]
    pavgb       xmm0, xmm1
%endif

    movq        [rdi+8], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm
311
;void vp9_filter_block1d4_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; 4-pixel-wide vertical 8-tap filter; the body is VERTx4 0 (plain store).
global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx                         ;VERTx4 keeps pitch * 6 in rbx
    ; end prolog

    ; Scratch frame: five 16-byte slots addressed from the realigned rsp.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx4 0

    add         rsp, 16*5                   ;release the scratch slots
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK (macro defined elsewhere)
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
352
;void vp9_filter_block1d8_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; 8-pixel-wide vertical 8-tap filter; the body is VERTx8 0 (plain store).
global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx                         ;VERTx8 keeps pitch * 6 in rbx
    ; end prolog

    ; Scratch frame: five 16-byte slots addressed from the realigned rsp.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx8 0

    add         rsp, 16*5                   ;release the scratch slots
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK (macro defined elsewhere)
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
393
;void vp9_filter_block1d16_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; 16-pixel-wide vertical 8-tap filter; the body is VERTx16 0 (plain store).
global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx                         ;VERTx16 keeps pitch * 6 in rbx
    ; end prolog

    ; Scratch frame: five 16-byte slots addressed from the realigned rsp.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx16 0

    add         rsp, 16*5                   ;release the scratch slots
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK (macro defined elsewhere)
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
434
435;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
436
437
; vp9_filter_block1d4_v8_avg_ssse3
; Averaging variant of the 4-pixel-wide vertical 8-tap filter: the body is
; VERTx4 1, which pavgb's each filtered row with the bytes already at
; output_ptr. Reads the same six arguments, arg(0)..arg(5), as the
; non-averaging function.
global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx                         ;VERTx4 keeps pitch * 6 in rbx
    ; end prolog

    ; Scratch frame: five 16-byte slots addressed from the realigned rsp.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx4 1

    add         rsp, 16*5                   ;release the scratch slots
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK (macro defined elsewhere)
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
469
; vp9_filter_block1d8_v8_avg_ssse3
; Averaging variant of the 8-pixel-wide vertical 8-tap filter: the body is
; VERTx8 1, which pavgb's each filtered row with the bytes already at
; output_ptr. Reads the same six arguments, arg(0)..arg(5), as the
; non-averaging function.
global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx                         ;VERTx8 keeps pitch * 6 in rbx
    ; end prolog

    ; Scratch frame: five 16-byte slots addressed from the realigned rsp.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx8 1

    add         rsp, 16*5                   ;release the scratch slots
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK (macro defined elsewhere)
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
501
; vp9_filter_block1d16_v8_avg_ssse3
; Averaging variant of the 16-pixel-wide vertical 8-tap filter: the body
; is VERTx16 1, which pavgb's each filtered row with the bytes already at
; output_ptr. Reads the same six arguments, arg(0)..arg(5), as the
; non-averaging function.
global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx                         ;VERTx16 keeps pitch * 6 in rbx
    ; end prolog

    ; Scratch frame: five 16-byte slots addressed from the realigned rsp.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    VERTx16 1

    add         rsp, 16*5                   ;release the scratch slots
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK (macro defined elsewhere)
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
533
534;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;-----------------------------------------------------------------------
; HORIZx4_ROW <row>, <tmp>
; Filters one row for the 4-pixel-wide horizontal case.
;   row : xmm register holding 16 source bytes on entry; on exit its low
;         bytes hold the packed, rounded result
;   tmp : xmm scratch register
; Uses the slots k0k1k4k5, k2k3k6k7 and krd and the shuf_t0t1/shuf_t2t3
; tables. Also overwrites xmm4, xmm5 and xmm6, so neither argument may
; be one of those registers.
;-----------------------------------------------------------------------
%macro HORIZx4_ROW 2
    movdqa      %2,   %1
    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   %1,   k0k1k4k5              ;low half: k0k1 sums, high half: k4k5 sums
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   %2,   k2k3k6k7              ;low half: k2k3 sums, high half: k6k7 sums

    movdqa      xmm4, %1                    ;k0k1 sums in the low half
    psrldq      %1,   8                     ;k4k5 sums moved down
    movdqa      xmm5, %2                    ;k2k3 sums in the low half
    movdqa      xmm6, %2                    ;second copy for the min below
    psrldq      %2,   8                     ;k6k7 sums moved down

    paddsw      xmm4, %2                    ;k0k1 + k6k7
    pmaxsw      xmm5, %1                    ;max(k2k3, k4k5)
    pminsw      %1, xmm6                    ;min(k2k3, k4k5)
    paddsw      %1, xmm4
    paddsw      %1, xmm5

    paddsw      %1,   krd                   ;+64
    psraw       %1,   7
    packuswb    %1,   %1
%endm
558
;-----------------------------------------------------------------------
; HORIZx4 <avg>
; Horizontal 8-tap filter producing 4 output pixels per row. Rows are
; processed two at a time, followed by a single-row tail when
; output_height is odd.
;   avg = 0 : store the filtered pixels
;   avg = 1 : pavgb them with the 4 bytes already at output_ptr first
; Reads arg(0)..arg(5): src_ptr, src_pixels_per_line, output_ptr,
; output_pitch, output_height, filter ptr. The invoking function must
; define the 16-byte aligned slots k0k1k4k5, k2k3k6k7 and krd.
; Each row reads 16 source bytes starting at src - 3.
; Clobbers rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
; An output_height of 0 or 1 skips the pair loop entirely.
;-----------------------------------------------------------------------
%macro HORIZx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;64 in each of the two low words (rounding)

    movdqa      xmm4, [rdx]                 ;load filters (must be 16-byte aligned)
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes, duplicated in both halves
    pshuflw     xmm6, xmm4, 0b              ;k0_k1
    pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
    pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
    pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
    pshufd      xmm5, xmm5, 0               ;rounding

    movdqa      k0k1k4k5, xmm6
    movdqa      k2k3k6k7, xmm7
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;number of row pairs
    jz          .last_row                   ;fewer than two rows: the bottom-tested
                                            ;loop below would wrap rcx and run away
.loop:
    ;Do two rows once
    movq        xmm0,   [rsi - 3]           ;load src
    movq        xmm1,   [rsi + 5]
    movq        xmm2,   [rsi + rax - 3]
    movq        xmm3,   [rsi + rax + 5]
    punpcklqdq  xmm0,   xmm1
    punpcklqdq  xmm2,   xmm3

    HORIZx4_ROW xmm0,   xmm1
    HORIZx4_ROW xmm2,   xmm3
%if %1
    movd        xmm1,   [rdi]
    pavgb       xmm0,   xmm1
    movd        xmm3,   [rdi + rdx]
    pavgb       xmm2,   xmm3
%endif
    movd        [rdi],  xmm0
    movd        [rdi +rdx],  xmm2

    lea         rsi,    [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi,    [rsi + rax]
    lea         rdi,    [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]

    dec         rcx
    jnz         .loop

.last_row:
    ; Do last row if output_height is odd
    movsxd      rcx,    dword ptr arg(4)       ;output_height
    and         rcx,    1
    je          .done

    movq        xmm0,   [rsi - 3]    ; load src
    movq        xmm1,   [rsi + 5]
    punpcklqdq  xmm0,   xmm1

    HORIZx4_ROW xmm0, xmm1
%if %1
    movd        xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif
    movd        [rdi],  xmm0
.done:
%endm
628
;-----------------------------------------------------------------------
; HORIZx8_ROW <row>, <t1>, <t2>, <t3>
; Filters one row for the 8-pixel-wide horizontal case.
;   row        : xmm register holding 16 source bytes on entry; on exit
;                it holds the eight packed, rounded result bytes
;   t1, t2, t3 : xmm scratch registers (overwritten)
; Uses the slots k0k1, k2k3, k4k5, k6k7, krd and the four shuf_t* tables.
;-----------------------------------------------------------------------
%macro HORIZx8_ROW 4
    ;take the three copies before the source register is shuffled
    movdqa      %4,   %1
    movdqa      %3,   %1
    movdqa      %2,   %1

    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pmaddubsw   %1,   k0k1
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   %2,   k2k3
    pshufb      %3,   [GLOBAL(shuf_t4t5)]
    pmaddubsw   %3,   k4k5
    pshufb      %4,   [GLOBAL(shuf_t6t7)]
    pmaddubsw   %4,   k6k7

    paddsw      %1,   %4                    ;k0k1 + k6k7
    movdqa      %4,   %2
    pmaxsw      %2,   %3                    ;max(k2k3, k4k5)
    pminsw      %3,   %4                    ;min(k2k3, k4k5)
    paddsw      %1,   %3                    ;smaller middle pair first
    paddsw      %1,   %2

    paddsw      %1,   krd                   ;+64
    psraw       %1,   7
    packuswb    %1,   %1
%endm
655
;-----------------------------------------------------------------------
; HORIZx8 <avg>
; Horizontal 8-tap filter producing 8 output pixels per row. Rows are
; processed two at a time, followed by a single-row tail when
; output_height is odd.
;   avg = 0 : store the filtered pixels
;   avg = 1 : pavgb them with the 8 bytes already at output_ptr first
; Reads arg(0)..arg(5): src_ptr, src_pixels_per_line, output_ptr,
; output_pitch, output_height, filter ptr. The invoking function must
; define the 16-byte aligned slots k0k1, k2k3, k4k5, k6k7 and krd.
; Each row reads 16 source bytes starting at src - 3.
; Clobbers rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
; An output_height of 0 or 1 skips the pair loop entirely.
;-----------------------------------------------------------------------
%macro HORIZx8 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;64 in each of the two low words (rounding)

    movdqa      xmm4, [rdx]                 ;load filters (must be 16-byte aligned)
    movq        xmm5, rcx                   ;movq, as in HORIZx4/HORIZx16: source is a 64-bit GPR
    packsswb    xmm4, xmm4                  ;taps narrowed to signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;rounding value in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;number of row pairs
    jz          .last_row                   ;fewer than two rows: the bottom-tested
                                            ;loop below would wrap rcx and run away

.loop:
    movq        xmm0,   [rsi - 3]           ;load src
    movq        xmm3,   [rsi + 5]
    movq        xmm4,   [rsi + rax - 3]
    movq        xmm7,   [rsi + rax + 5]
    punpcklqdq  xmm0,   xmm3
    punpcklqdq  xmm4,   xmm7

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
    HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
%if %1
    movq        xmm1,   [rdi]
    movq        xmm2,   [rdi + rdx]
    pavgb       xmm0,   xmm1
    pavgb       xmm4,   xmm2
%endif
    movq        [rdi],  xmm0
    movq        [rdi + rdx],  xmm4

    lea         rsi,    [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi,    [rsi + rax]
    lea         rdi,    [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]
    dec         rcx
    jnz         .loop

.last_row:
    ;Do last row if output_height is odd
    movsxd      rcx,    dword ptr arg(4)    ;output_height
    and         rcx,    1
    je          .done

    movq        xmm0,   [rsi - 3]
    movq        xmm3,   [rsi + 5]
    punpcklqdq  xmm0,   xmm3

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
%if %1
    movq        xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif
    movq        [rdi],  xmm0
.done:
%endm
731
;-----------------------------------------------------------------------
; HORIZx16 <avg>
; Horizontal 8-tap filter producing 16 output pixels per row, one row
; per iteration, as two 8-pixel halves merged before the store.
;   avg = 0 : store the filtered pixels
;   avg = 1 : pavgb them with the 16 bytes already at output_ptr first
; Reads arg(0)..arg(5) and the caller-defined 16-byte aligned slots
; k0k1, k2k3, k4k5, k6k7, krd. Each row reads 24 source bytes starting
; at src - 3.
; The output row is accessed with movdqa, so output_ptr and output_pitch
; must keep every row 16-byte aligned.
; Clobbers rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
; The loop is bottom-tested: output_height must be >= 1.
;-----------------------------------------------------------------------
%macro HORIZx16 1
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rdx, arg(5)                 ;filter ptr
    mov         rcx, 0x0400040              ;two 16-bit 64s: rounding term

    movdqa      xmm4, [rdx]                 ;eight 16-bit taps
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshufd      xmm5, xmm5, 0               ;64 in every word
    movdqa      krd, xmm5

    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    movdqa      k0k1, xmm0
    punpcklqdq  xmm1, xmm1
    movdqa      k2k3, xmm1
    punpcklqdq  xmm2, xmm2
    movdqa      k4k5, xmm2
    punpcklqdq  xmm3, xmm3
    movdqa      k6k7, xmm3

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height

.loop:
    prefetcht0  [rsi + 2 * rax -3]

    movq        xmm0,   [rsi - 3]           ;source bytes -3..4
    movq        xmm4,   [rsi + 5]           ;source bytes 5..12
    movq        xmm7,   [rsi + 13]          ;source bytes 13..20
    punpcklqdq  xmm0,   xmm4                ;window for the left 8 pixels
    punpcklqdq  xmm4,   xmm7                ;window for the right 8 pixels

    ;---- left 8 pixels: xmm0-xmm3 ----
    movdqa      xmm1,   xmm0
    movdqa      xmm2,   xmm0
    movdqa      xmm3,   xmm0

    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
    pshufb      xmm3,   [GLOBAL(shuf_t6t7)]

    pmaddubsw   xmm0,   k0k1
    pmaddubsw   xmm1,   k2k3
    pmaddubsw   xmm2,   k4k5
    pmaddubsw   xmm3,   k6k7

    paddsw      xmm0,   xmm3                ;k0k1 + k6k7
    movdqa      xmm3,   xmm1
    pmaxsw      xmm1,   xmm2                ;max(k2k3, k4k5)
    pminsw      xmm2,   xmm3                ;min(k2k3, k4k5)
    paddsw      xmm0,   xmm2
    paddsw      xmm0,   xmm1

    ;---- right 8 pixels: xmm4-xmm7 ----
    movdqa      xmm5,   xmm4
    movdqa      xmm6,   xmm4
    movdqa      xmm7,   xmm4

    pshufb      xmm4,   [GLOBAL(shuf_t0t1)]
    pshufb      xmm5,   [GLOBAL(shuf_t2t3)]
    pshufb      xmm6,   [GLOBAL(shuf_t4t5)]
    pshufb      xmm7,   [GLOBAL(shuf_t6t7)]

    pmaddubsw   xmm4,   k0k1
    pmaddubsw   xmm5,   k2k3
    pmaddubsw   xmm6,   k4k5
    pmaddubsw   xmm7,   k6k7

    paddsw      xmm4,   xmm7
    movdqa      xmm7,   xmm5
    pmaxsw      xmm5,   xmm6
    pminsw      xmm6,   xmm7
    paddsw      xmm4,   xmm6
    paddsw      xmm4,   xmm5

    ;---- round, pack and merge the halves ----
    paddsw      xmm0,   krd
    psraw       xmm0,   7
    packuswb    xmm0,   xmm0
    paddsw      xmm4,   krd
    psraw       xmm4,   7
    packuswb    xmm4,   xmm4
    punpcklqdq  xmm0,   xmm4                ;16 result bytes
%if %1
    movdqa      xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif
    movdqa      [rdi],  xmm0

    lea         rsi,    [rsi + rax]         ;next source row
    lea         rdi,    [rdi + rdx]         ;next output row
    dec         rcx
    jnz         .loop
%endm
829
;void vp9_filter_block1d4_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
; 4-pixel-wide horizontal 8-tap filter; the body is HORIZx4 0 (plain store).
global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx                         ;needed by the GLOBAL() table references
    push        rsi
    push        rdi
    ; end prolog

    ; Scratch frame: three 16-byte slots addressed from the realigned rsp.
    %define k0k1k4k5 [rsp + 16 * 0]
    %define k2k3k6k7 [rsp + 16 * 1]
    %define krd      [rsp + 16 * 2]
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3

    HORIZx4 0

    add         rsp, 16 * 3                 ;release the scratch slots
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK (macro defined elsewhere)
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
868
;void vp9_filter_block1d8_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
; 8-pixel-wide horizontal 8-tap filter; the body is HORIZx8 0 (plain store).
global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx                         ;needed by the GLOBAL() table references
    push        rsi
    push        rdi
    ; end prolog

    ; Scratch frame: five 16-byte slots addressed from the realigned rsp.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx8 0

    add         rsp, 16*5                   ;release the scratch slots
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK (macro defined elsewhere)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
910
;void vp9_filter_block1d16_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
; 16-pixel-wide horizontal 8-tap filter; the body is HORIZx16 0 (plain store).
global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx                         ;needed by the GLOBAL() table references
    push        rsi
    push        rdi
    ; end prolog

    ; Scratch frame: five 16-byte slots addressed from the realigned rsp.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx16 0

    add         rsp, 16*5                   ;release the scratch slots
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK (macro defined elsewhere)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
952
; vp9_filter_block1d4_h8_avg_ssse3
; Averaging variant of the 4-pixel-wide horizontal 8-tap filter: the body
; is HORIZx4 1, which pavgb's each filtered row with the bytes already at
; output_ptr. Reads the same six arguments, arg(0)..arg(5), as the
; non-averaging function.
global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx                         ;needed by the GLOBAL() table references
    push        rsi
    push        rdi
    ; end prolog

    ; Scratch frame: three 16-byte slots addressed from the realigned rsp.
    %define k0k1k4k5 [rsp + 16 * 0]
    %define k2k3k6k7 [rsp + 16 * 1]
    %define krd      [rsp + 16 * 2]
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3

    HORIZx4 1

    add         rsp, 16 * 3                 ;release the scratch slots
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK (macro defined elsewhere)
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
982
; vp9_filter_block1d8_h8_avg_ssse3
; Averaging variant of the 8-pixel-wide horizontal 8-tap filter: the body
; is HORIZx8 1, which pavgb's each filtered row with the bytes already at
; output_ptr. Reads the same six arguments, arg(0)..arg(5), as the
; non-averaging function.
global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx                         ;needed by the GLOBAL() table references
    push        rsi
    push        rdi
    ; end prolog

    ; Scratch frame: five 16-byte slots addressed from the realigned rsp.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx8 1

    add         rsp, 16*5                   ;release the scratch slots
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK (macro defined elsewhere)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1015
; vp9_filter_block1d16_h8_avg_ssse3
; Averaging variant of the 16-pixel-wide horizontal 8-tap filter: the body
; is HORIZx16 1, which pavgb's each filtered row with the bytes already at
; output_ptr. Reads the same six arguments, arg(0)..arg(5), as the
; non-averaging function.
global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx                         ;needed by the GLOBAL() table references
    push        rsi
    push        rdi
    ; end prolog

    ; Scratch frame: five 16-byte slots addressed from the realigned rsp.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
    ALIGN_STACK 16, rax
    sub         rsp, 16*5

    HORIZx16 1

    add         rsp, 16*5                   ;release the scratch slots
    pop         rsp                         ;presumably the rsp saved by ALIGN_STACK (macro defined elsewhere)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
; pshufb control masks used by the horizontal filters. Each mask pairs
; source byte i+n with byte i+n+1 for eight consecutive i, so that
; pmaddubsw can multiply the pair by two adjacent taps (n = 0, 2, 4, 6).
align 16
shuf_t0t1:
    db  0, 1, 1, 2, 2, 3, 3, 4
    db  4, 5, 5, 6, 6, 7, 7, 8
align 16
shuf_t2t3:
    db  2, 3, 3, 4, 4, 5, 5, 6
    db  6, 7, 7, 8, 8, 9, 9, 10
align 16
shuf_t4t5:
    db  4, 5, 5, 6, 6, 7, 7, 8
    db  8, 9, 9, 10, 10, 11, 11, 12
align 16
shuf_t6t7:
    db  6, 7, 7, 8, 8, 9, 9, 10
    db  10, 11, 11, 12, 12, 13, 13, 14
1061