1;
2; Copyright (c) 2016, Alliance for Open Media. All rights reserved
3;
4; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10;
11
12;
13
14%include "third_party/x86inc/x86inc.asm"
15
SECTION_RODATA
; Eight words of 64: the rounding term added before the final "psraw 7".
pw_64:          dw 64, 64, 64, 64, 64, 64, 64, 64
; Eight words of 0x00ff: keeps the even (low) byte of every 16-bit lane.
even_byte_mask: dw 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff

; %define USE_PMULHRSW
; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
; when using this instruction.
;
; The partial products must be summed in the order below (based on ffav1) so
; that intermediate values do not go out of range:
;   x = k0k1 + k4k5
;   y = k2k3 + k6k7
;   z = signed SAT(x + y)

SECTION .text
; Stack scratch requested by every function that uses SETUP_LOCAL_VARS:
; six 16-byte slots, addressed as [rsp + 16*0] .. [rsp + 16*5].
%define LOCAL_VARS_SIZE 16*6
31
; SETUP_LOCAL_VARS
; Shared prologue for the functions that reserve LOCAL_VARS_SIZE bytes of
; stack.  Expects the filter taps in m4 (every caller in this file executes
; "mova m4, [filterq]" immediately before).  The taps are packed to signed
; bytes, each adjacent tap pair is replicated into all eight words of a
; register and stored to its stack slot (k0k1 .. k6k7) so it can be used as a
; pmaddubsw operand.  Also sets up krd (pw_64) and, on x86-64, the tmp0/tmp1
; scratch slots.
; Overwrites m0-m4, plus m12 on x86-64 or m6 on x86-32.
%macro SETUP_LOCAL_VARS 0
    ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
    ; pmaddubsw has a higher latency on some platforms, this might be eased by
    ; interleaving the instructions.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]

    packsswb    m4, m4                  ; 16-bit taps -> signed bytes

    ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
    ; some platforms.

    ; Word 0 of m4 (k0_k1) replicated to every word.
    pshuflw     m0, m4, 0b
    punpcklqdq  m0, m0
    mova        k0k1, m0

    ; Word 1 of m4 (k2_k3) replicated to every word.
    pshuflw     m1, m4, 01010101b
    punpcklqdq  m1, m1
    mova        k2k3, m1

    ; Word 2 of m4 (k4_k5) replicated to every word.
    pshuflw     m2, m4, 10101010b
    punpcklqdq  m2, m2
    mova        k4k5, m2

    ; Word 3 of m4 (k6_k7) replicated to every word.
    pshuflw     m3, m4, 11111111b
    punpcklqdq  m3, m3
    mova        k6k7, m3

%if ARCH_X86_64
    ; Rounding constant lives in a register; two more stack slots are named
    ; for callers that need to spill.
    %define krd  m12
    %define tmp0 [rsp + 16*4]
    %define tmp1 [rsp + 16*5]
    mova        krd, [GLOBAL(pw_64)]
%else
    ; Rounding constant lives on the stack.
    %define krd  [rsp + 16*4]
%if CONFIG_PIC
    ; build constants without accessing global memory
    pcmpeqb     m6, m6                  ; all ones
    psrlw       m6, 15                  ; 1 in every word
    psllw       m6, 6                   ; 64 in every word, aka pw_64
%else
    mova        m6, [GLOBAL(pw_64)]
%endif
    mova        krd, m6
%endif
%endm
73
74;-------------------------------------------------------------------------------
; Stack scratch for the 4-wide horizontal filter: none on x86-64, where the
; constants stay in m8-m10; 16*4 bytes on x86-32, where they are kept at
; [rsp + 16*0] .. [rsp + 16*2].
%if ARCH_X86_64
%define LOCAL_VARS_SIZE_H4 0
%else
%define LOCAL_VARS_SIZE_H4 16*4
%endif

; SUBPIX_HFILTER4 <variant>
; Emits filter_block1d4_<variant>(src, sstride, dst, dstride, height, filter):
; an 8-tap horizontal filter over the bytes [src - 3 .. src + 4] that writes
; 4 output bytes per row.  Rows are handled two at a time, with one trailing
; row when the height is odd.
; <variant> also selects optional code:
;   h8_avg      average (pavgb) the result with the bytes already at dst
;   h8_add_src  add the source pixels at [src] before the final pack
%macro SUBPIX_HFILTER4 1
cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
                            src, sstride, dst, dstride, height, filter
    mova        m4, [filterq]
    packsswb    m4, m4                          ; taps as bytes: k0..k7 in both halves
%if ARCH_X86_64
    %define k0k1k4k5 m8
    %define k2k3k6k7 m9
    %define krd      m10
    mova        krd, [GLOBAL(pw_64)]
    pshuflw     k0k1k4k5, m4, 0b                ; low 4 words  = k0_k1
    pshufhw     k0k1k4k5, k0k1k4k5, 10101010b   ; high 4 words = k4_k5
    pshuflw     k2k3k6k7, m4, 01010101b         ; low 4 words  = k2_k3
    pshufhw     k2k3k6k7, k2k3k6k7, 11111111b   ; high 4 words = k6_k7
%else
    %define k0k1k4k5 [rsp + 16*0]
    %define k2k3k6k7 [rsp + 16*1]
    %define krd      [rsp + 16*2]
    pshuflw     m6, m4, 0b                      ; low 4 words  = k0_k1
    pshufhw     m6, m6, 10101010b               ; high 4 words = k4_k5
    pshuflw     m7, m4, 01010101b               ; low 4 words  = k2_k3
    pshufhw     m7, m7, 11111111b               ; high 4 words = k6_k7
%if CONFIG_PIC=0
    mova        m1, [GLOBAL(pw_64)]
%else
    ; build constants without accessing global memory
    pcmpeqb     m1, m1                          ; all ones
    psrlw       m1, 15
    psllw       m1, 6                           ; aka pw_64
%endif
    ; Spill the three constants to their stack slots.
    mova        k0k1k4k5, m6
    mova        k2k3k6k7, m7
    mova        krd, m1
%endif
    ; Bias the counter by one so the flags after "sub heightd, 2" tell an
    ; even height (negative) from an odd one (zero).
    dec         heightd

.loop:
    ; Two rows per iteration: m4/m1 carry row 0, m5/m3 carry row 1.
    movu        m4, [srcq - 3]
    movu        m5, [srcq + sstrideq - 3]
    ; Duplicate every source byte so palignr can form (s[i], s[i+1]) pairs.
    punpckhbw   m1, m4, m4
    punpcklbw   m4, m4
    punpckhbw   m3, m5, m5
    punpcklbw   m5, m5
    palignr     m0, m1, m4, 1
    pmaddubsw   m0, k0k1k4k5                    ; row 0: k0k1 | k4k5 parts
    palignr     m1, m4, 5
    pmaddubsw   m1, k2k3k6k7                    ; row 0: k2k3 | k6k7 parts
    palignr     m2, m3, m5, 1
    pmaddubsw   m2, k0k1k4k5                    ; row 1: k0k1 | k4k5 parts
    palignr     m3, m5, 5
    pmaddubsw   m3, k2k3k6k7                    ; row 1: k2k3 | k6k7 parts
    ; Regroup so row 0 sits in the low half and row 1 in the high half.
    punpckhqdq  m4, m0, m2                      ; k4k5 parts of both rows
    punpcklqdq  m0, m2                          ; k0k1 parts of both rows
    punpckhqdq  m5, m1, m3                      ; k6k7 parts of both rows
    punpcklqdq  m1, m3                          ; k2k3 parts of both rows
    paddsw      m0, m4                          ; x = k0k1 + k4k5
    paddsw      m1, m5                          ; y = k2k3 + k6k7
%ifidn %1, h8_avg
    movd        m4, [dstq]
    movd        m5, [dstq + dstrideq]
%endif
    paddsw      m0, m1                          ; z = SAT(x + y)
    paddsw      m0, krd
    psraw       m0, 7
%ifidn %1, h8_add_src
    pxor        m3, m3
    movu        m4, [srcq]
    movu        m5, [srcq + sstrideq]
    punpckldq   m4, m5                          ; bytes 0-3 of row 0, then bytes 0-3 of row 1
    punpcklbw   m4, m3                          ; widen to words
    paddsw      m0, m4
%endif
    packuswb    m0, m0                          ; row 0 -> bytes 0-3, row 1 -> bytes 4-7
    psrldq      m1, m0, 4                       ; row 1 down to bytes 0-3

%ifidn %1, h8_avg
    pavgb       m0, m4
    pavgb       m1, m5
%endif
    movd        [dstq], m0
    movd        [dstq + dstrideq], m1

    ; Advance src by two rows (in two steps, prefetching in between) and
    ; dst by two rows.
    lea         srcq, [srcq + sstrideq]
    prefetcht0  [srcq + 4 * sstrideq - 3]
    lea         srcq, [srcq + sstrideq]
    lea         dstq, [dstq + 2 * dstrideq]
    prefetcht0  [srcq + 2 * sstrideq - 3]

    sub         heightd, 2
    jg          .loop

    ; Counter is zero only when one row is still pending (odd height).
    jne         .done

    movu        m4, [srcq - 3]
    punpckhbw   m1, m4, m4
    punpcklbw   m4, m4
    palignr     m0, m1, m4, 1
    palignr     m1, m4, 5
    pmaddubsw   m0, k0k1k4k5
    pmaddubsw   m1, k2k3k6k7
    psrldq      m2, m0, 8                       ; k4k5 part down to the low half
    psrldq      m3, m1, 8                       ; k6k7 part down to the low half
    paddsw      m0, m2                          ; x = k0k1 + k4k5
    paddsw      m1, m3                          ; y = k2k3 + k6k7
    paddsw      m0, m1
    paddsw      m0, krd
    psraw       m0, 7
%ifidn %1, h8_add_src
    pxor        m3, m3
    movu        m4, [srcq]
    punpcklbw   m4, m3
    paddsw      m0, m4
%endif
    packuswb    m0, m0
%ifidn %1, h8_avg
    movd        m4, [dstq]
    pavgb       m0, m4
%endif
    movd        [dstq], m0
.done:
    REP_RET
%endm
205
206;-------------------------------------------------------------------------------
; SUBPIX_HFILTER8 <variant>
; Emits filter_block1d8_<variant>(src, sstride, dst, dstride, height, filter):
; an 8-tap horizontal filter over the bytes [src - 3 .. src + 4] that writes
; 8 output bytes per row.  Rows are handled two at a time, with one trailing
; row when the height is odd.  Tap pairs and the rounding constant come from
; SETUP_LOCAL_VARS.
; <variant> also selects optional code:
;   h8_avg      average (pavgb) the result with the bytes already at dst
;   h8_add_src  add the source pixels at [src] before the final pack
%macro SUBPIX_HFILTER8 1
cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
                            src, sstride, dst, dstride, height, filter
    mova        m4, [filterq]
    SETUP_LOCAL_VARS
    ; Bias the counter by one so the flags after "sub heightd, 2" tell an
    ; even height (negative) from an odd one (zero).
    dec         heightd

.loop:
    ; Two rows per iteration; the work for both rows is interleaved.
    movu        m0, [srcq - 3]
    movu        m4, [srcq + sstrideq - 3]
    ; Row 0: duplicate every byte, then palignr forms the byte pairs that
    ; each tap pair multiplies.
    punpckhbw   m1, m0, m0
    punpcklbw   m0, m0
    palignr     m5, m1, m0, 13
    pmaddubsw   m5, k6k7
    palignr     m2, m1, m0, 5
    palignr     m3, m1, m0, 9
    palignr     m1, m0, 1
    pmaddubsw   m1, k0k1
    ; Row 1: start duplicating bytes while row 0 multiplies finish.
    punpckhbw   m6, m4, m4
    punpcklbw   m4, m4
    pmaddubsw   m2, k2k3
    pmaddubsw   m3, k4k5

    palignr     m7, m6, m4, 13
    palignr     m0, m6, m4, 5
    pmaddubsw   m7, k6k7
    paddsw      m1, m3                  ; row 0: x = k0k1 + k4k5
    paddsw      m2, m5                  ; row 0: y = k2k3 + k6k7
    paddsw      m1, m2                  ; row 0: SAT(x + y)
%ifidn %1, h8_avg
    ; Both destination rows in one register: row 0 low, row 1 high.
    movh        m2, [dstq]
    movhps      m2, [dstq + dstrideq]
%endif
    palignr     m5, m6, m4, 9
    palignr     m6, m4, 1
    pmaddubsw   m0, k2k3
    pmaddubsw   m6, k0k1
    paddsw      m1, krd
    pmaddubsw   m5, k4k5
    psraw       m1, 7
    paddsw      m0, m7                  ; row 1: y = k2k3 + k6k7
    paddsw      m6, m5                  ; row 1: x = k0k1 + k4k5
    paddsw      m6, m0                  ; row 1: SAT(x + y)
    paddsw      m6, krd
    psraw       m6, 7
%ifidn %1, h8_add_src
    pxor        m3, m3
    movu        m4, [srcq]
    movu        m5, [srcq + sstrideq]
    punpcklbw   m4, m3                  ; widen 8 source bytes of row 0
    punpcklbw   m5, m3                  ; widen 8 source bytes of row 1
    paddsw      m1, m4
    paddsw      m6, m5
%endif
    packuswb    m1, m6                  ; row 0 in the low half, row 1 in the high half
%ifidn %1, h8_avg
    pavgb       m1, m2
%endif
    movh        [dstq], m1
    movhps      [dstq + dstrideq], m1

    ; Advance src by two rows (in two steps, prefetching in between) and
    ; dst by two rows.
    lea         srcq, [srcq + sstrideq]
    prefetcht0  [srcq + 4 * sstrideq - 3]
    lea         srcq, [srcq + sstrideq]
    lea         dstq, [dstq + 2 * dstrideq]
    prefetcht0  [srcq + 2 * sstrideq - 3]
    sub         heightd, 2
    jg          .loop

    ; Counter is zero only when one row is still pending (odd height).
    jne         .done

    movu        m0, [srcq - 3]
    punpckhbw   m3, m0, m0
    punpcklbw   m0, m0
    palignr     m1, m3, m0, 1
    palignr     m2, m3, m0, 5
    palignr     m4, m3, m0, 13
    palignr     m3, m0, 9
    pmaddubsw   m1, k0k1
    pmaddubsw   m2, k2k3
    pmaddubsw   m3, k4k5
    pmaddubsw   m4, k6k7
    paddsw      m1, m3                  ; x = k0k1 + k4k5
    paddsw      m4, m2                  ; y = k2k3 + k6k7
    paddsw      m1, m4
    paddsw      m1, krd
    psraw       m1, 7
%ifidn %1, h8_add_src
    pxor        m6, m6
    movu        m5, [srcq]
    punpcklbw   m5, m6
    paddsw      m1, m5
%endif
    packuswb    m1, m1
%ifidn %1, h8_avg
    movh        m0, [dstq]
    pavgb       m1, m0
%endif
    movh        [dstq], m1
.done:
    REP_RET
%endm
311
312;-------------------------------------------------------------------------------
; SUBPIX_HFILTER16 <variant>
; Emits filter_block1d16_<variant>(src, sstride, dst, dstride, height, filter):
; an 8-tap horizontal filter over the bytes [src - 3 .. src + 4] that writes
; 16 output bytes per row, one row per loop iteration.  dst is accessed with
; aligned moves (mova / pavgb with a memory operand).
; <variant> also selects optional code:
;   h8_avg      average (pavgb) the result with the bytes already at dst
;   h8_add_src  add the source pixels at [src] before the final pack
%macro SUBPIX_HFILTER16 1
cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova        m4, [filterq]
    SETUP_LOCAL_VARS

.loop:
    prefetcht0  [srcq + 2 * sstrideq -3]

    ; Eight unaligned loads at byte offsets -3 .. +4.  Loads at offsets
    ; -3, -1, +1, +3 (m0-m3) feed the even output pixels; offsets
    ; -2, 0, +2, +4 (m4-m7) feed the odd output pixels.
    movu        m0, [srcq - 3]
    movu        m4, [srcq - 2]
    pmaddubsw   m0, k0k1
    pmaddubsw   m4, k0k1
    movu        m1, [srcq - 1]
    movu        m5, [srcq + 0]
    pmaddubsw   m1, k2k3
    pmaddubsw   m5, k2k3
    movu        m2, [srcq + 1]
    movu        m6, [srcq + 2]
    pmaddubsw   m2, k4k5
    pmaddubsw   m6, k4k5
    movu        m3, [srcq + 3]
    movu        m7, [srcq + 4]
    pmaddubsw   m3, k6k7
    pmaddubsw   m7, k6k7
    ; Even pixels.
    paddsw      m0, m2                  ; x = k0k1 + k4k5
    paddsw      m1, m3                  ; y = k2k3 + k6k7
    paddsw      m0, m1                  ; SAT(x + y)
    ; Odd pixels.
    paddsw      m4, m6                  ; x = k0k1 + k4k5
    paddsw      m5, m7                  ; y = k2k3 + k6k7
    paddsw      m4, m5                  ; SAT(x + y)
    paddsw      m0, krd
    paddsw      m4, krd
    psraw       m0, 7
    psraw       m4, 7
%ifidn %1, h8_add_src
%if ARCH_X86=1 && CONFIG_PIC=1
    ; build the mask without accessing global memory
    pcmpeqb     m2, m2                  ; all ones
    psrlw       m2, 8                   ; 0x00ff per word, aka even_byte_mask
%else
    mova        m2, [GLOBAL(even_byte_mask)]
%endif
    movu        m5, [srcq]
    mova        m7, m5
    pand        m5, m2                  ; even source bytes as words
    psrlw       m7, 8                   ; odd source bytes as words
    paddsw      m0, m5
    paddsw      m4, m7
%endif
    ; Pack both halves to bytes and interleave even/odd back into order.
    packuswb    m0, m0
    packuswb    m4, m4
    punpcklbw   m0, m4
%ifidn %1, h8_avg
    pavgb       m0, [dstq]
%endif
    lea         srcq, [srcq + sstrideq]
    mova        [dstq], m0
    lea         dstq, [dstq + dstrideq]
    dec         heightd
    jnz         .loop
    REP_RET
%endm
375
; Instantiate the horizontal filters for SSSE3 (plain h8 variant only).
INIT_XMM ssse3
SUBPIX_HFILTER16    h8
SUBPIX_HFILTER8     h8
SUBPIX_HFILTER4     h8

;-------------------------------------------------------------------------------

; TODO(Linfeng): Detect cpu type and choose the code with better performance.
%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1

; General-purpose register count handed to cglobal by the vertical filters.
; The code path selected by the define above uses r7 and r8 as extra
; registers on x86-64, so 9 are requested there; otherwise the 6 argument
; registers suffice.
%if X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON && ARCH_X86_64
%define NUM_GENERAL_REG_USED 9
%else
%define NUM_GENERAL_REG_USED 6
%endif
391
; SUBPIX_VFILTER <variant>, <width>
; Emits filter_block1d<width>_<variant>(src, sstride, dst, dstride, height,
; filter): an 8-tap vertical filter over the eight rows src + 0*sstride ..
; src + 7*sstride.  <width> is 8 (rows moved with movh) or anything else,
; used here with 4 (rows moved with movd).  Rows are handled two at a time,
; with one trailing row when the height is odd.
; <variant> also selects optional code:
;   v8_avg      average (pavgb) the result with the bytes already at dst
;   v8_add_src  add the source pixels of the row that lines up with tap k3
;               (first code path only)
; In the comments below A..H name the eight rows feeding the first output
; row of an iteration and I the row after H.
%macro SUBPIX_VFILTER 2
cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova        m4, [filterq]
    SETUP_LOCAL_VARS

%ifidn %2, 8
    %define movx movh
%else
    %define movx movd
%endif

    ; Bias the counter by one so the flags after "sub heightd, 2" tell an
    ; even height (negative) from an odd one (zero).
    dec         heightd

%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON

    ; This path reloads all source rows on every iteration.
%if ARCH_X86_64
    %define src1q      r7
    %define sstride6q  r8
    %define dst_stride dstrideq
%else
    ; No spare registers on x86-32: reuse filterq (the taps were already
    ; loaded above) and dstrideq; the destination stride is then taken from
    ; dstridemp instead.
    %define src1q      filterq
    %define sstride6q  dstrideq
    %define dst_stride dstridemp
%endif
    mov         src1q, srcq
    add         src1q, sstrideq                     ; src1q = src + pitch
    lea         sstride6q, [sstrideq + sstrideq * 4]
    add         sstride6q, sstrideq                 ; pitch * 6

.loop:
    ; Two output rows per iteration: m0 accumulates the first (rows A..H),
    ; m1 the second (rows B..I).
    movx        m0, [srcq]                          ; A
    movx        m1, [src1q]                         ; B
    punpcklbw   m0, m1                              ; A B
    movx        m2, [srcq + sstrideq * 2]           ; C
    pmaddubsw   m0, k0k1
    mova        m6, m2                              ; keep C for the second row
    movx        m3, [src1q + sstrideq * 2]          ; D
    punpcklbw   m2, m3                              ; C D
    pmaddubsw   m2, k2k3
    movx        m4, [srcq + sstrideq * 4]           ; E
    mova        m7, m4                              ; keep E for the second row
    movx        m5, [src1q + sstrideq * 4]          ; F
    punpcklbw   m4, m5                              ; E F
    pmaddubsw   m4, k4k5
    punpcklbw   m1, m6                              ; B C (second row)
    movx        m6, [srcq + sstride6q]              ; G
    punpcklbw   m5, m6                              ; F G (second row)
    punpcklbw   m3, m7                              ; D E (second row)
    pmaddubsw   m5, k4k5
    movx        m7, [src1q + sstride6q]             ; H
    punpcklbw   m6, m7                              ; G H
    pmaddubsw   m6, k6k7
    pmaddubsw   m3, k2k3
    pmaddubsw   m1, k0k1
    paddsw      m0, m4                              ; x = k0k1 + k4k5
    paddsw      m2, m6                              ; y = k2k3 + k6k7
    movx        m6, [srcq + sstrideq * 8]           ; I
    punpcklbw   m7, m6                              ; H I (second row)
    pmaddubsw   m7, k6k7
    paddsw      m0, m2                              ; SAT(x + y)
    paddsw      m0, krd
    psraw       m0, 7
    paddsw      m1, m5                              ; second row: x
%ifidn %1, v8_add_src
    ; Add the source row under tap k3.  For the first output row that is
    ; row D, three rows below srcq, which is also what SUBPIX_VFILTER16 adds.
    ; movx reads only the 4 or 8 bytes that are actually stored.
    pxor        m6, m6
    movx        m4, [src1q + sstrideq * 2]          ; D
    punpcklbw   m4, m6
    paddsw      m0, m4
%endif
    packuswb    m0, m0

    paddsw      m3, m7                              ; second row: y
    paddsw      m1, m3
    paddsw      m1, krd
    psraw       m1, 7
%ifidn %1, v8_add_src
    ; Second output row: tap k3 lines up with row E (m6 is still zero).
    movx        m4, [srcq + sstrideq * 4]           ; E
    punpcklbw   m4, m6
    paddsw      m1, m4
%endif
    lea         srcq, [srcq + sstrideq * 2]
    lea         src1q, [src1q + sstrideq * 2]
    packuswb    m1, m1

%ifidn %1, v8_avg
    movx        m2, [dstq]
    pavgb       m0, m2
%endif
    movx        [dstq], m0
    add         dstq, dst_stride
%ifidn %1, v8_avg
    movx        m3, [dstq]
    pavgb       m1, m3
%endif
    movx        [dstq], m1
    add         dstq, dst_stride
    sub         heightd, 2
    jg          .loop

    ; Counter is zero only when one row is still pending (odd height).
    jne         .done

    movx        m0, [srcq]                          ; A
    movx        m1, [srcq + sstrideq]               ; B
    movx        m6, [srcq + sstride6q]              ; G
    punpcklbw   m0, m1                              ; A B
    movx        m7, [src1q + sstride6q]             ; H
    pmaddubsw   m0, k0k1
    movx        m2, [srcq + sstrideq * 2]           ; C
    punpcklbw   m6, m7                              ; G H
    movx        m3, [src1q + sstrideq * 2]          ; D
    pmaddubsw   m6, k6k7
    movx        m4, [srcq + sstrideq * 4]           ; E
    punpcklbw   m2, m3                              ; C D
    movx        m5, [src1q + sstrideq * 4]          ; F
    punpcklbw   m4, m5                              ; E F
    pmaddubsw   m2, k2k3
    pmaddubsw   m4, k4k5
    paddsw      m2, m6                              ; y = k2k3 + k6k7
    paddsw      m0, m4                              ; x = k0k1 + k4k5
    paddsw      m0, m2
    paddsw      m0, krd
    psraw       m0, 7
%ifidn %1, v8_add_src
    ; Add row D, the row under tap k3.
    pxor        m6, m6
    movx        m4, [src1q + sstrideq * 2]          ; D
    punpcklbw   m4, m6
    paddsw      m0, m4
%endif
    packuswb    m0, m0
%ifidn %1, v8_avg
    movx        m1, [dstq]
    pavgb       m0, m1
%endif
    movx        [dstq], m0

%else
    ; ARCH_X86_64
    ; This path keeps the interleaved row pairs in registers and loads only
    ; two new rows per iteration.
    ; NOTE(review): there is no v8_add_src handling on this path.

    movx        m0, [srcq]                          ; A
    movx        m1, [srcq + sstrideq]               ; B
    lea         srcq, [srcq + sstrideq * 2]
    movx        m2, [srcq]                          ; C
    movx        m3, [srcq + sstrideq]               ; D
    lea         srcq, [srcq + sstrideq * 2]
    movx        m4, [srcq]                          ; E
    movx        m5, [srcq + sstrideq]               ; F
    lea         srcq, [srcq + sstrideq * 2]
    movx        m6, [srcq]                          ; G
    punpcklbw   m0, m1                              ; A B
    punpcklbw   m1, m2                              ; B C (second row)
    punpcklbw   m2, m3                              ; C D
    punpcklbw   m3, m4                              ; D E (second row)
    punpcklbw   m4, m5                              ; E F
    punpcklbw   m5, m6                              ; F G (second row)

.loop:
    ; Two output rows per iteration: m8 accumulates the first, m9 the second.
    movx        m7, [srcq + sstrideq]               ; H
    lea         srcq, [srcq + sstrideq * 2]
    movx        m14, [srcq]                         ; I
    punpcklbw   m6, m7                              ; G H
    punpcklbw   m7, m14                             ; H I (second row)
    pmaddubsw   m8, m0, k0k1
    pmaddubsw   m9, m1, k0k1
    ; Slide every row pair down one slot for the next iteration while the
    ; current products are formed.
    mova        m0, m2
    mova        m1, m3
    pmaddubsw   m10, m2, k2k3
    pmaddubsw   m11, m3, k2k3
    mova        m2, m4
    mova        m3, m5
    pmaddubsw   m4, k4k5
    pmaddubsw   m5, k4k5
    paddsw      m8, m4                              ; x = k0k1 + k4k5
    paddsw      m9, m5
    mova        m4, m6
    mova        m5, m7
    pmaddubsw   m6, k6k7
    pmaddubsw   m7, k6k7
    paddsw      m10, m6                             ; y = k2k3 + k6k7
    paddsw      m11, m7
    paddsw      m8, m10                             ; SAT(x + y)
    paddsw      m9, m11
    mova        m6, m14                             ; I becomes the next G
    paddsw      m8, krd
    paddsw      m9, krd
    psraw       m8, 7
    psraw       m9, 7
%ifidn %2, 4
    packuswb    m8, m8
    packuswb    m9, m9
%else
    packuswb    m8, m9                              ; first row low, second row high
%endif

%ifidn %1, v8_avg
    movx        m7, [dstq]
%ifidn %2, 4
    movx        m10, [dstq + dstrideq]
    pavgb       m9, m10
%else
    movhpd      m7, [dstq + dstrideq]
%endif
    pavgb       m8, m7
%endif
    movx        [dstq], m8
%ifidn %2, 4
    movx        [dstq + dstrideq], m9
%else
    movhpd      [dstq + dstrideq], m8
%endif

    lea         dstq, [dstq + dstrideq * 2]
    sub         heightd, 2
    jg          .loop

    ; Counter is zero only when one row is still pending (odd height).
    jne         .done

    movx        m7, [srcq + sstrideq]               ; H
    punpcklbw   m6, m7                              ; G H
    pmaddubsw   m0, k0k1
    pmaddubsw   m2, k2k3
    pmaddubsw   m4, k4k5
    pmaddubsw   m6, k6k7
    paddsw      m0, m4                              ; x = k0k1 + k4k5
    paddsw      m2, m6                              ; y = k2k3 + k6k7
    paddsw      m0, m2
    paddsw      m0, krd
    psraw       m0, 7
    packuswb    m0, m0
%ifidn %1, v8_avg
    movx        m1, [dstq]
    pavgb       m0, m1
%endif
    movx        [dstq], m0

%endif ; ARCH_X86_64

.done:
    REP_RET

%endm
637
638;-------------------------------------------------------------------------------
; SUBPIX_VFILTER16 <variant>
; Emits filter_block1d16_<variant>(src, sstride, dst, dstride, height,
; filter): an 8-tap vertical filter over the eight rows src + 0*sstride ..
; src + 7*sstride that writes 16 output bytes per row.  dst is accessed with
; aligned moves (mova / pavgb with a memory operand).
; <variant> also selects optional code:
;   v8_avg      average (pavgb) the result with the bytes already at dst
;   v8_add_src  add the source pixels from 3 rows below src (first code path
;               only)
; In the comments below A..H name the eight rows feeding the current output
; row.
%macro SUBPIX_VFILTER16 1
cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova        m4, [filterq]
    SETUP_LOCAL_VARS

%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON

    ; This path produces one output row per iteration and reloads all eight
    ; source rows each time, 8 columns at a time.
%if ARCH_X86_64
    %define src1q      r7
    %define sstride6q  r8
    %define dst_stride dstrideq
%else
    ; No spare registers on x86-32: reuse filterq (the taps were already
    ; loaded above) and dstrideq; the destination stride is then taken from
    ; dstridemp instead.
    %define src1q      filterq
    %define sstride6q  dstrideq
    %define dst_stride dstridemp
%endif
    lea         src1q, [srcq + sstrideq]            ; src1q = src + pitch
    lea         sstride6q, [sstrideq + sstrideq * 4]
    add         sstride6q, sstrideq                 ; pitch * 6

.loop:
    ; Columns 0-7 accumulate in m0; columns 8-15 in m3.
    movh        m0, [srcq]                          ; A
    movh        m1, [src1q]                         ; B
    movh        m2, [srcq + sstrideq * 2]           ; C
    movh        m3, [src1q + sstrideq * 2]          ; D
    movh        m4, [srcq + sstrideq * 4]           ; E
    movh        m5, [src1q + sstrideq * 4]          ; F

    punpcklbw   m0, m1                              ; A B
    movh        m6, [srcq + sstride6q]              ; G
    punpcklbw   m2, m3                              ; C D
    movh        m7, [src1q + sstride6q]             ; H
    punpcklbw   m4, m5                              ; E F
    pmaddubsw   m0, k0k1
    movh        m3, [srcq + 8]                      ; A, columns 8-15
    pmaddubsw   m2, k2k3
    punpcklbw   m6, m7                              ; G H
    movh        m5, [srcq + sstrideq + 8]           ; B, columns 8-15
    pmaddubsw   m4, k4k5
    punpcklbw   m3, m5                              ; A B
    movh        m7, [srcq + sstrideq * 2 + 8]       ; C, columns 8-15
    pmaddubsw   m6, k6k7
    movh        m5, [src1q + sstrideq * 2 + 8]      ; D, columns 8-15
    punpcklbw   m7, m5                              ; C D
    paddsw      m2, m6                              ; y = k2k3 + k6k7
    pmaddubsw   m3, k0k1
    movh        m1, [srcq + sstrideq * 4 + 8]       ; E, columns 8-15
    paddsw      m0, m4                              ; x = k0k1 + k4k5
    pmaddubsw   m7, k2k3
    movh        m6, [src1q + sstrideq * 4 + 8]      ; F, columns 8-15
    punpcklbw   m1, m6                              ; E F
    paddsw      m0, m2                              ; SAT(x + y)
    paddsw      m0, krd
    movh        m2, [srcq + sstride6q + 8]          ; G, columns 8-15
    pmaddubsw   m1, k4k5
    movh        m5, [src1q + sstride6q + 8]         ; H, columns 8-15
    psraw       m0, 7
    punpcklbw   m2, m5                              ; G H
    pmaddubsw   m2, k6k7
    paddsw      m7, m2                              ; y = k2k3 + k6k7
    paddsw      m3, m1                              ; x = k0k1 + k4k5
    paddsw      m3, m7                              ; SAT(x + y)
    paddsw      m3, krd
    psraw       m3, 7
%ifidn %1, v8_add_src
    pxor        m6, m6
    movu        m4, [src1q + 2 * sstrideq]          ; row D: 3 rows below srcq
    mova        m5, m4
    punpcklbw   m4, m6                              ; columns 0-7 as words
    punpckhbw   m5, m6                              ; columns 8-15 as words
    paddsw      m0, m4
    paddsw      m3, m5
%endif
    packuswb    m0, m3

    add         srcq, sstrideq
    add         src1q, sstrideq
%ifidn %1, v8_avg
    pavgb       m0, [dstq]
%endif
    mova        [dstq], m0
    add         dstq, dst_stride
    dec         heightd
    jnz         .loop
    REP_RET

%else
    ; ARCH_X86_64
    ; This path keeps the interleaved row pairs in registers (plus the tmp0
    ; and tmp1 stack slots) and loads only two new rows per iteration.  Two
    ; output rows are produced per iteration, with one trailing row when the
    ; height is odd.

    ; Bias the counter by one so the flags after "sub heightd, 2" tell an
    ; even height (negative) from an odd one (zero).
    dec         heightd

    movu        m1, [srcq]                          ; A
    movu        m3, [srcq + sstrideq]               ; B
    lea         srcq, [srcq + sstrideq * 2]
    punpcklbw   m0, m1, m3                          ; A B, low columns
    punpckhbw   m1, m3                              ; A B, high columns
    movu        m5, [srcq]                          ; C
    punpcklbw   m2, m3, m5                          ; B C, low columns (second row)
    punpckhbw   m3, m5                              ; B C, high columns (second row)
    mova        tmp0, m2                            ; store to stack
    mova        tmp1, m3                            ; store to stack
    movu        m7, [srcq + sstrideq]               ; D
    lea         srcq, [srcq + sstrideq * 2]
    punpcklbw   m4, m5, m7                          ; C D, low columns
    punpckhbw   m5, m7                              ; C D, high columns
    movu        m9, [srcq]                          ; E
    punpcklbw   m6, m7, m9                          ; D E, low columns (second row)
    punpckhbw   m7, m9                              ; D E, high columns (second row)
    movu        m11, [srcq + sstrideq]              ; F
    lea         srcq, [srcq + sstrideq * 2]
    punpcklbw   m8, m9, m11                         ; E F, low columns
    punpckhbw   m9, m11                             ; E F, high columns
    movu        m2, [srcq]                          ; G
    punpcklbw   m10, m11, m2                        ; F G, low columns (second row)
    punpckhbw   m11, m2                             ; F G, high columns (second row)

.loop:
    ; First output row, low columns -> m13.  Row pairs slide down one slot
    ; for the next iteration as they are consumed.
    pmaddubsw   m13, m0, k0k1
    mova        m0, m4
    pmaddubsw   m14, m8, k4k5
    pmaddubsw   m15, m4, k2k3
    mova        m4, m8
    paddsw      m13, m14                            ; x = k0k1 + k4k5
    movu        m3, [srcq + sstrideq]               ; H
    lea         srcq, [srcq + sstrideq * 2]
    punpcklbw   m14, m2, m3                         ; G H, low columns
    mova        m8, m14
    pmaddubsw   m14, k6k7
    paddsw      m15, m14                            ; y = k2k3 + k6k7
    paddsw      m13, m15                            ; SAT(x + y)
    paddsw      m13, krd
    psraw       m13, 7

    ; First output row, high columns -> m14.
    pmaddubsw   m14, m1, k0k1
    pmaddubsw   m1, m9, k4k5
    pmaddubsw   m15, m5, k2k3
    paddsw      m14, m1                             ; x = k0k1 + k4k5
    mova        m1, m5
    mova        m5, m9
    punpckhbw   m2, m3                              ; G H, high columns
    mova        m9, m2
    pmaddubsw   m2, k6k7
    paddsw      m15, m2                             ; y = k2k3 + k6k7
    paddsw      m14, m15                            ; SAT(x + y)
    paddsw      m14, krd
    psraw       m14, 7
    packuswb    m13, m14
%ifidn %1, v8_avg
    pavgb       m13, [dstq]
%endif
    mova        [dstq], m13

    ; Second output row, low columns -> m15.
    pmaddubsw   m15, tmp0, k0k1
    pmaddubsw   m14, m10, k4k5
    pmaddubsw   m13, m6, k2k3
    paddsw      m15, m14                            ; x = k0k1 + k4k5
    mova        tmp0, m6
    mova        m6, m10
    movu        m2, [srcq]                          ; row after H (G of the next iteration)
    punpcklbw   m14, m3, m2                         ; H and the row after it, low columns
    mova        m10, m14
    pmaddubsw   m14, k6k7
    paddsw      m13, m14                            ; y = k2k3 + k6k7
    paddsw      m15, m13                            ; SAT(x + y)
    paddsw      m15, krd
    psraw       m15, 7

    ; Second output row, high columns -> m14.
    pmaddubsw   m14, tmp1, k0k1
    mova        tmp1, m7
    pmaddubsw   m13, m7, k2k3
    mova        m7, m11
    pmaddubsw   m11, k4k5
    paddsw      m14, m11                            ; x = k0k1 + k4k5
    punpckhbw   m3, m2                              ; H and the row after it, high columns
    mova        m11, m3
    pmaddubsw   m3, k6k7
    paddsw      m13, m3                             ; y = k2k3 + k6k7
    paddsw      m14, m13                            ; SAT(x + y)
    paddsw      m14, krd
    psraw       m14, 7
    packuswb    m15, m14
%ifidn %1, v8_avg
    pavgb       m15, [dstq + dstrideq]
%endif
    mova        [dstq + dstrideq], m15
    lea         dstq, [dstq + dstrideq * 2]
    sub         heightd, 2
    jg          .loop

    ; Counter is zero only when one row is still pending (odd height).
    jne         .done

    movu        m3, [srcq + sstrideq]               ; H
    punpcklbw   m6, m2, m3                          ; G H, low columns
    punpckhbw   m2, m3                              ; G H, high columns
    pmaddubsw   m0, k0k1
    pmaddubsw   m1, k0k1
    pmaddubsw   m4, k2k3
    pmaddubsw   m5, k2k3
    pmaddubsw   m8, k4k5
    pmaddubsw   m9, k4k5
    pmaddubsw   m6, k6k7
    pmaddubsw   m2, k6k7
    paddsw      m0, m8                              ; x = k0k1 + k4k5
    paddsw      m1, m9
    paddsw      m4, m6                              ; y = k2k3 + k6k7
    paddsw      m5, m2
    paddsw      m0, m4                              ; SAT(x + y)
    paddsw      m1, m5
    paddsw      m0, krd
    paddsw      m1, krd
    psraw       m0, 7
    psraw       m1, 7
    packuswb    m0, m1
%ifidn %1, v8_avg
    pavgb       m0, [dstq]
%endif
    mova        [dstq], m0

.done:
    REP_RET

%endif ; ARCH_X86_64

%endm
866
; Instantiate the vertical filters for SSSE3 (plain v8 variant only):
; 16-wide, then 8-wide and 4-wide from the shared macro.
INIT_XMM ssse3
SUBPIX_VFILTER16    v8
SUBPIX_VFILTER      v8, 8
SUBPIX_VFILTER      v8, 4
871