1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "third_party/x86inc/x86inc.asm"
12
13SECTION_RODATA
; Rounding constant: eight words of 8. SUBPEL_VARIANCE uses it as filter_rnd,
; adding it to the weighted sum before the arithmetic shift right by 4.
14pw_8: times  8 dw  8
; SSE2 bilinear filter table (word coefficients, multiplied with pmullw).
; Eight entries; each entry is 32 bytes: a row of eight copies of the first
; tap, followed at +16 by a row of eight copies of the second tap. The two
; taps of an entry sum to 16. The entry is addressed as
; bilin_filter_m + (offset << 5) (filter_idx_shift == 5 for non-SSSE3).
15bilin_filter_m_sse2: times  8 dw 16     ; entry 0: taps (16,  0)
16                     times  8 dw  0
17                     times  8 dw 14     ; entry 1: taps (14,  2)
18                     times  8 dw  2
19                     times  8 dw 12     ; entry 2: taps (12,  4)
20                     times  8 dw  4
21                     times  8 dw 10     ; entry 3: taps (10,  6)
22                     times  8 dw  6
23                     times 16 dw  8     ; entry 4: taps ( 8,  8), both rows
24                     times  8 dw  6     ; entry 5: taps ( 6, 10)
25                     times  8 dw 10
26                     times  8 dw  4     ; entry 6: taps ( 4, 12)
27                     times  8 dw 12
28                     times  8 dw  2     ; entry 7: taps ( 2, 14)
29                     times  8 dw 14
30
; SSSE3 bilinear filter table (byte coefficients, multiplied with pmaddubsw).
; Eight entries of 16 bytes each: eight interleaved (first tap, second tap)
; byte pairs, matching pixel pairs that were interleaved with punpck[lh]bw.
; The entry is addressed as bilin_filter_m + (offset << 4)
; (filter_idx_shift == 4 for SSSE3).
31bilin_filter_m_ssse3: times  8 db 16,  0
32                      times  8 db 14,  2
33                      times  8 db 12,  4
34                      times  8 db 10,  6
35                      times 16 db  8    ; entry 4: taps (8, 8)
36                      times  8 db  6, 10
37                      times  8 db  4, 12
38                      times  8 db  2, 14
39
40SECTION .text
41
42; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
43;                               int x_offset, int y_offset,
44;                               const uint8_t *ref, ptrdiff_t ref_stride,
45;                               int height, unsigned int *sse);
46;
47; This function returns the sum of errors (SE, the signed sum of the
; src - ref differences) and stores the sum of squared errors (SSE)
; through the sse pointer.
48
49%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
  ; Accumulates one pair of src/ref word vectors into the running totals.
  ;   %1, %3 : source pixels as words; CLOBBERED (end up holding the squared
  ;            differences, pair-summed into dwords)
  ;   %2, %4 : reference pixels as words; only read
  ;   %5     : running sum of differences, kept as word lanes (paddw)
  ;   %6     : running sum of squared differences, kept as dword lanes (paddd)
  ; The two src/ref pairs are processed interleaved rather than one after
  ; the other.
50  psubw                %3, %4           ; %3 = src2 - ref2
51  psubw                %1, %2           ; %1 = src1 - ref1
52  paddw                %5, %3           ; sum += diff2 (before %3 is squared)
53  pmaddwd              %3, %3           ; %3 = diff2^2, adjacent words summed to dwords
54  paddw                %5, %1           ; sum += diff1 (before %1 is squared)
55  pmaddwd              %1, %1           ; %1 = diff1^2, adjacent words summed to dwords
56  paddd                %6, %3           ; sse += diff2^2
57  paddd                %6, %1           ; sse += diff1^2
58%endmacro
59
60%macro STORE_AND_RET 1
  ; Function epilogue: horizontally reduces the accumulators, writes the SSE
  ; through the sse pointer, leaves the sum in rax, and returns.
  ;   %1 : block width W (selects the 4-wide or the wider reduction)
  ; Inputs : m6 = sum of differences (word lanes)
  ;          m7 = sum of squared differences (dword lanes)
  ;          m5 = zero (SUBPEL_VARIANCE sets it up as the dedicated zero
  ;               register; the pcmpgtw below depends on it)
  ; Clobbers: m3, m4, m5, m6, m7, r1
  ; NOTE(review): the sse pointer is read from its ssem operand, not from a
  ; register; on x86-64 SUBPEL_VARIANCE defines bilin_filter as sseq, so that
  ; register name is used as the filter-table pointer there.
61%if %1 > 4
62  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
63  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
64  ; We have to sign-extend it before adding the words within the register
65  ; and outputting to a dword.
66  pcmpgtw              m5, m6           ; mask for 0 > x
67  movhlps              m3, m7           ; m3 low half = sse dwords 2,3
68  punpcklwd            m4, m6, m5       ; m4 = sum words 0..3 sign-extended to dwords
69  punpckhwd            m6, m5           ; sign-extend m6 word->dword
70  paddd                m7, m3           ; sse: dwords 0,1 += dwords 2,3
71  paddd                m6, m4           ; sum: four dword partial sums
72  pshufd               m3, m7, 0x1      ; bring sse dword 1 down to lane 0
73  movhlps              m4, m6           ; m4 low half = sum dwords 2,3
74  paddd                m7, m3           ; sse total now in lane 0
75  paddd                m6, m4           ; sum: dwords 0,1 += dwords 2,3
76  mov                  r1, ssem         ; r1 = unsigned int *sse
77  pshufd               m4, m6, 0x1      ; bring sum dword 1 down to lane 0
78  movd               [r1], m7           ; store sse
79  paddd                m6, m4           ; sum total now in lane 0
80  movd               raxd, m6           ; store sum as return value
81%else ; 4xh
  ; This branch reduces only the low 64 bits of m6 and m7 (pshuflw shuffles
  ; the low four words only, and only the low dword of each result is used).
82  pshuflw              m4, m6, 0xe      ; sum words 2,3 -> word positions 0,1
83  pshuflw              m3, m7, 0xe      ; sse dword 1 -> dword position 0
84  paddw                m6, m4           ; sum words 0,1 += words 2,3
85  paddd                m7, m3           ; sse total now in lane 0
86  pcmpgtw              m5, m6           ; mask for 0 > x
87  mov                  r1, ssem         ; r1 = unsigned int *sse
88  punpcklwd            m6, m5           ; sign-extend m6 word->dword
89  movd               [r1], m7           ; store sse
90  pshuflw              m4, m6, 0xe      ; sum dword 1 -> dword position 0
91  paddd                m6, m4           ; sum total now in lane 0
92  movd               raxd, m6           ; store sum as return value
93%endif
94  RET
95%endmacro
96
97%macro INC_SRC_BY_SRC_STRIDE  0
  ; Advances srcq by one source row. Takes no arguments.
  ; On 32-bit PIC builds the stride is added from the src_stridemp operand
  ; instead of the src_strideq register.
  ; NOTE(review): presumably the stride register is reused for something
  ; else in that configuration; no use of this macro is visible in this part
  ; of the file, so confirm against the call sites.
98%if ARCH_X86=1 && CONFIG_PIC=1
99  add                srcq, src_stridemp
100%else
101  add                srcq, src_strideq
102%endif
103%endmacro
104
105%macro SUBPEL_VARIANCE 1-2 0 ; W
106%if cpuflag(ssse3)
107%define bilin_filter_m bilin_filter_m_ssse3
108%define filter_idx_shift 4
109%else
110%define bilin_filter_m bilin_filter_m_sse2
111%define filter_idx_shift 5
112%endif
113; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
114; 11, not 13, if the registers are ordered correctly. May make a minor speed
115; difference on Win64
116
117%if ARCH_X86_64
118  %if %2 == 1 ; avg
119    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
120                                        x_offset, y_offset, ref, ref_stride, \
121                                        second_pred, second_stride, height, sse
122    %define second_str second_strideq
123  %else
124    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
125                                    x_offset, y_offset, ref, ref_stride, \
126                                    height, sse
127  %endif
128  %define block_height heightd
129  %define bilin_filter sseq
130%else
131  %if CONFIG_PIC=1
132    %if %2 == 1 ; avg
133      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
134                                          x_offset, y_offset, ref, ref_stride, \
135                                          second_pred, second_stride, height, sse
136      %define block_height dword heightm
137      %define second_str second_stridemp
138    %else
139      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
140                                      x_offset, y_offset, ref, ref_stride, \
141                                      height, sse
142      %define block_height heightd
143    %endif
144
145    ; reuse argument stack space
146    %define g_bilin_filterm x_offsetm
147    %define g_pw_8m y_offsetm
148
149    ;Store bilin_filter and pw_8 location in stack
150    %if GET_GOT_DEFINED == 1
151      GET_GOT eax
152      add esp, 4                ; restore esp
153    %endif
154
155    lea ecx, [GLOBAL(bilin_filter_m)]
156    mov g_bilin_filterm, ecx
157
158    lea ecx, [GLOBAL(pw_8)]
159    mov g_pw_8m, ecx
160
161    LOAD_IF_USED 0, 1         ; load eax, ecx back
162  %else
163    %if %2 == 1 ; avg
164      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
165                                          x_offset, y_offset, \
166                                          ref, ref_stride, second_pred, second_stride, \
167                                          height, sse
168      %define block_height dword heightm
169      %define second_str second_stridemp
170    %else
171      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
172                                      x_offset, y_offset, ref, ref_stride, \
173                                      height, sse
174      %define block_height heightd
175    %endif
176    %define bilin_filter bilin_filter_m
177  %endif
178%endif
179
180%if %1 == 4
181  %define movx movd
182%else
183  %define movx movh
184%endif
185
186  ASSERT               %1 <= 16         ; m6 overflows if w > 16
187  pxor                 m6, m6           ; sum
188  pxor                 m7, m7           ; sse
189  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
190  ; could perhaps use it for something more productive then
191  pxor                 m5, m5           ; dedicated zero register
192%if %1 < 16
193  sar                   block_height, 1
194%if %2 == 1 ; avg
195  shl             second_str, 1
196%endif
197%endif
198
199  ; FIXME(rbultje) replace by jumptable?
200  test          x_offsetd, x_offsetd
201  jnz .x_nonzero
202  ; x_offset == 0
203  test          y_offsetd, y_offsetd
204  jnz .x_zero_y_nonzero
205
206  ; x_offset == 0 && y_offset == 0
207.x_zero_y_zero_loop:
208%if %1 == 16
209  movu                 m0, [srcq]
210  mova                 m1, [refq]
211%if %2 == 1 ; avg
212  pavgb                m0, [second_predq]
213  punpckhbw            m3, m1, m5
214  punpcklbw            m1, m5
215%endif
216  punpckhbw            m2, m0, m5
217  punpcklbw            m0, m5
218
219%if %2 == 0 ; !avg
220  punpckhbw            m3, m1, m5
221  punpcklbw            m1, m5
222%endif
223  SUM_SSE              m0, m1, m2, m3, m6, m7
224
225  add                srcq, src_strideq
226  add                refq, ref_strideq
227%else ; %1 < 16
228  movx                 m0, [srcq]
229%if %2 == 1 ; avg
230%if %1 > 4
231  movhps               m0, [srcq+src_strideq]
232%else ; 4xh
233  movx                 m1, [srcq+src_strideq]
234  punpckldq            m0, m1
235%endif
236%else ; !avg
237  movx                 m2, [srcq+src_strideq]
238%endif
239
240  movx                 m1, [refq]
241  movx                 m3, [refq+ref_strideq]
242
243%if %2 == 1 ; avg
244%if %1 > 4
245  pavgb                m0, [second_predq]
246%else
247  movh                 m2, [second_predq]
248  pavgb                m0, m2
249%endif
250  punpcklbw            m3, m5
251  punpcklbw            m1, m5
252%if %1 > 4
253  punpckhbw            m2, m0, m5
254  punpcklbw            m0, m5
255%else ; 4xh
256  punpcklbw            m0, m5
257  movhlps              m2, m0
258%endif
259%else ; !avg
260  punpcklbw            m0, m5
261  punpcklbw            m2, m5
262  punpcklbw            m3, m5
263  punpcklbw            m1, m5
264%endif
265  SUM_SSE              m0, m1, m2, m3, m6, m7
266
267  lea                srcq, [srcq+src_strideq*2]
268  lea                refq, [refq+ref_strideq*2]
269%endif
270%if %2 == 1 ; avg
271  add                second_predq, second_str
272%endif
273  dec                   block_height
274  jg .x_zero_y_zero_loop
275  STORE_AND_RET %1
276
277.x_zero_y_nonzero:
278  cmp           y_offsetd, 4
279  jne .x_zero_y_nonhalf
280
281  ; x_offset == 0 && y_offset == 0.5
282.x_zero_y_half_loop:
283%if %1 == 16
284  movu                 m0, [srcq]
285  movu                 m4, [srcq+src_strideq]
286  mova                 m1, [refq]
287  pavgb                m0, m4
288  punpckhbw            m3, m1, m5
289%if %2 == 1 ; avg
290  pavgb                m0, [second_predq]
291%endif
292  punpcklbw            m1, m5
293  punpckhbw            m2, m0, m5
294  punpcklbw            m0, m5
295  SUM_SSE              m0, m1, m2, m3, m6, m7
296
297  add                srcq, src_strideq
298  add                refq, ref_strideq
299%else ; %1 < 16
300  movx                 m0, [srcq]
301  movx                 m2, [srcq+src_strideq]
302%if %2 == 1 ; avg
303%if %1 > 4
304  movhps               m2, [srcq+src_strideq*2]
305%else ; 4xh
306  movx                 m1, [srcq+src_strideq*2]
307  punpckldq            m2, m1
308%endif
309  movx                 m1, [refq]
310%if %1 > 4
311  movlhps              m0, m2
312%else ; 4xh
313  punpckldq            m0, m2
314%endif
315  movx                 m3, [refq+ref_strideq]
316  pavgb                m0, m2
317  punpcklbw            m1, m5
318%if %1 > 4
319  pavgb                m0, [second_predq]
320  punpcklbw            m3, m5
321  punpckhbw            m2, m0, m5
322  punpcklbw            m0, m5
323%else ; 4xh
324  movh                 m4, [second_predq]
325  pavgb                m0, m4
326  punpcklbw            m3, m5
327  punpcklbw            m0, m5
328  movhlps              m2, m0
329%endif
330%else ; !avg
331  movx                 m4, [srcq+src_strideq*2]
332  movx                 m1, [refq]
333  pavgb                m0, m2
334  movx                 m3, [refq+ref_strideq]
335  pavgb                m2, m4
336  punpcklbw            m0, m5
337  punpcklbw            m2, m5
338  punpcklbw            m3, m5
339  punpcklbw            m1, m5
340%endif
341  SUM_SSE              m0, m1, m2, m3, m6, m7
342
343  lea                srcq, [srcq+src_strideq*2]
344  lea                refq, [refq+ref_strideq*2]
345%endif
346%if %2 == 1 ; avg
347  add                second_predq, second_str
348%endif
349  dec                   block_height
350  jg .x_zero_y_half_loop
351  STORE_AND_RET %1
352
353.x_zero_y_nonhalf:
354  ; x_offset == 0 && y_offset == bilin interpolation
355%if ARCH_X86_64
356  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
357%endif
358  shl           y_offsetd, filter_idx_shift
359%if ARCH_X86_64 && %1 > 4
360  mova                 m8, [bilin_filter+y_offsetq]
361%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
362  mova                 m9, [bilin_filter+y_offsetq+16]
363%endif
364  mova                m10, [GLOBAL(pw_8)]
365%define filter_y_a m8
366%define filter_y_b m9
367%define filter_rnd m10
368%else ; x86-32 or mmx
369%if ARCH_X86=1 && CONFIG_PIC=1
370; x_offset == 0, reuse x_offset reg
371%define tempq x_offsetq
372  add y_offsetq, g_bilin_filterm
373%define filter_y_a [y_offsetq]
374%define filter_y_b [y_offsetq+16]
375  mov tempq, g_pw_8m
376%define filter_rnd [tempq]
377%else
378  add           y_offsetq, bilin_filter
379%define filter_y_a [y_offsetq]
380%define filter_y_b [y_offsetq+16]
381%define filter_rnd [GLOBAL(pw_8)]
382%endif
383%endif
384
385.x_zero_y_other_loop:
386%if %1 == 16
387  movu                 m0, [srcq]
388  movu                 m4, [srcq+src_strideq]
389  mova                 m1, [refq]
390%if cpuflag(ssse3)
391  punpckhbw            m2, m0, m4
392  punpcklbw            m0, m4
393  pmaddubsw            m2, filter_y_a
394  pmaddubsw            m0, filter_y_a
395  paddw                m2, filter_rnd
396  paddw                m0, filter_rnd
397%else
398  punpckhbw            m2, m0, m5
399  punpckhbw            m3, m4, m5
400  punpcklbw            m0, m5
401  punpcklbw            m4, m5
402  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
403  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
404  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
405  ; slightly faster because of pmullw latency. It would also cut our rodata
406  ; tables in half for this function, and save 1-2 registers on x86-64.
407  pmullw               m2, filter_y_a
408  pmullw               m3, filter_y_b
409  paddw                m2, filter_rnd
410  pmullw               m0, filter_y_a
411  pmullw               m4, filter_y_b
412  paddw                m0, filter_rnd
413  paddw                m2, m3
414  paddw                m0, m4
415%endif
416  psraw                m2, 4
417  psraw                m0, 4
418%if %2 == 1 ; avg
419  ; FIXME(rbultje) pipeline
420  packuswb             m0, m2
421  pavgb                m0, [second_predq]
422  punpckhbw            m2, m0, m5
423  punpcklbw            m0, m5
424%endif
425  punpckhbw            m3, m1, m5
426  punpcklbw            m1, m5
427  SUM_SSE              m0, m1, m2, m3, m6, m7
428
429  add                srcq, src_strideq
430  add                refq, ref_strideq
431%else ; %1 < 16
432  movx                 m0, [srcq]
433  movx                 m2, [srcq+src_strideq]
434  movx                 m4, [srcq+src_strideq*2]
435  movx                 m3, [refq+ref_strideq]
436%if cpuflag(ssse3)
437  movx                 m1, [refq]
438  punpcklbw            m0, m2
439  punpcklbw            m2, m4
440  pmaddubsw            m0, filter_y_a
441  pmaddubsw            m2, filter_y_a
442  punpcklbw            m3, m5
443  paddw                m2, filter_rnd
444  paddw                m0, filter_rnd
445%else
446  punpcklbw            m0, m5
447  punpcklbw            m2, m5
448  punpcklbw            m4, m5
449  pmullw               m0, filter_y_a
450  pmullw               m1, m2, filter_y_b
451  punpcklbw            m3, m5
452  paddw                m0, filter_rnd
453  pmullw               m2, filter_y_a
454  pmullw               m4, filter_y_b
455  paddw                m0, m1
456  paddw                m2, filter_rnd
457  movx                 m1, [refq]
458  paddw                m2, m4
459%endif
460  psraw                m0, 4
461  psraw                m2, 4
462%if %2 == 1 ; avg
463  ; FIXME(rbultje) pipeline
464%if %1 == 4
465  movlhps              m0, m2
466%endif
467  packuswb             m0, m2
468%if %1 > 4
469  pavgb                m0, [second_predq]
470  punpckhbw            m2, m0, m5
471  punpcklbw            m0, m5
472%else ; 4xh
473  movh                 m2, [second_predq]
474  pavgb                m0, m2
475  punpcklbw            m0, m5
476  movhlps              m2, m0
477%endif
478%endif
479  punpcklbw            m1, m5
480  SUM_SSE              m0, m1, m2, m3, m6, m7
481
482  lea                srcq, [srcq+src_strideq*2]
483  lea                refq, [refq+ref_strideq*2]
484%endif
485%if %2 == 1 ; avg
486  add                second_predq, second_str
487%endif
488  dec                   block_height
489  jg .x_zero_y_other_loop
490%undef filter_y_a
491%undef filter_y_b
492%undef filter_rnd
493  STORE_AND_RET %1
494
495.x_nonzero:
496  cmp           x_offsetd, 4
497  jne .x_nonhalf
498  ; x_offset == 0.5
499  test          y_offsetd, y_offsetd
500  jnz .x_half_y_nonzero
501
502  ; x_offset == 0.5 && y_offset == 0
503.x_half_y_zero_loop:
504%if %1 == 16
505  movu                 m0, [srcq]
506  movu                 m4, [srcq+1]
507  mova                 m1, [refq]
508  pavgb                m0, m4
509  punpckhbw            m3, m1, m5
510%if %2 == 1 ; avg
511  pavgb                m0, [second_predq]
512%endif
513  punpcklbw            m1, m5
514  punpckhbw            m2, m0, m5
515  punpcklbw            m0, m5
516  SUM_SSE              m0, m1, m2, m3, m6, m7
517
518  add                srcq, src_strideq
519  add                refq, ref_strideq
520%else ; %1 < 16
521  movx                 m0, [srcq]
522  movx                 m4, [srcq+1]
523%if %2 == 1 ; avg
524%if %1 > 4
525  movhps               m0, [srcq+src_strideq]
526  movhps               m4, [srcq+src_strideq+1]
527%else ; 4xh
528  movx                 m1, [srcq+src_strideq]
529  punpckldq            m0, m1
530  movx                 m2, [srcq+src_strideq+1]
531  punpckldq            m4, m2
532%endif
533  movx                 m1, [refq]
534  movx                 m3, [refq+ref_strideq]
535  pavgb                m0, m4
536  punpcklbw            m3, m5
537%if %1 > 4
538  pavgb                m0, [second_predq]
539  punpcklbw            m1, m5
540  punpckhbw            m2, m0, m5
541  punpcklbw            m0, m5
542%else ; 4xh
543  movh                 m2, [second_predq]
544  pavgb                m0, m2
545  punpcklbw            m1, m5
546  punpcklbw            m0, m5
547  movhlps              m2, m0
548%endif
549%else ; !avg
550  movx                 m2, [srcq+src_strideq]
551  movx                 m1, [refq]
552  pavgb                m0, m4
553  movx                 m4, [srcq+src_strideq+1]
554  movx                 m3, [refq+ref_strideq]
555  pavgb                m2, m4
556  punpcklbw            m0, m5
557  punpcklbw            m2, m5
558  punpcklbw            m3, m5
559  punpcklbw            m1, m5
560%endif
561  SUM_SSE              m0, m1, m2, m3, m6, m7
562
563  lea                srcq, [srcq+src_strideq*2]
564  lea                refq, [refq+ref_strideq*2]
565%endif
566%if %2 == 1 ; avg
567  add                second_predq, second_str
568%endif
569  dec                   block_height
570  jg .x_half_y_zero_loop
571  STORE_AND_RET %1
572
573.x_half_y_nonzero:
574  cmp           y_offsetd, 4
575  jne .x_half_y_nonhalf
576
577  ; x_offset == 0.5 && y_offset == 0.5
578%if %1 == 16
579  movu                 m0, [srcq]
580  movu                 m3, [srcq+1]
581  add                srcq, src_strideq
582  pavgb                m0, m3
583.x_half_y_half_loop:
584  movu                 m4, [srcq]
585  movu                 m3, [srcq+1]
586  mova                 m1, [refq]
587  pavgb                m4, m3
588  punpckhbw            m3, m1, m5
589  pavgb                m0, m4
590%if %2 == 1 ; avg
591  punpcklbw            m1, m5
592  pavgb                m0, [second_predq]
593  punpckhbw            m2, m0, m5
594  punpcklbw            m0, m5
595%else
596  punpckhbw            m2, m0, m5
597  punpcklbw            m0, m5
598  punpcklbw            m1, m5
599%endif
600  SUM_SSE              m0, m1, m2, m3, m6, m7
601  mova                 m0, m4
602
603  add                srcq, src_strideq
604  add                refq, ref_strideq
605%else ; %1 < 16
606  movx                 m0, [srcq]
607  movx                 m3, [srcq+1]
608  add                srcq, src_strideq
609  pavgb                m0, m3
610.x_half_y_half_loop:
611  movx                 m2, [srcq]
612  movx                 m3, [srcq+1]
613%if %2 == 1 ; avg
614%if %1 > 4
615  movhps               m2, [srcq+src_strideq]
616  movhps               m3, [srcq+src_strideq+1]
617%else
618  movx                 m1, [srcq+src_strideq]
619  punpckldq            m2, m1
620  movx                 m1, [srcq+src_strideq+1]
621  punpckldq            m3, m1
622%endif
623  pavgb                m2, m3
624%if %1 > 4
625  movlhps              m0, m2
626  movhlps              m4, m2
627%else ; 4xh
628  punpckldq            m0, m2
629  pshuflw              m4, m2, 0xe
630%endif
631  movx                 m1, [refq]
632  pavgb                m0, m2
633  movx                 m3, [refq+ref_strideq]
634%if %1 > 4
635  pavgb                m0, [second_predq]
636%else
637  movh                 m2, [second_predq]
638  pavgb                m0, m2
639%endif
640  punpcklbw            m3, m5
641  punpcklbw            m1, m5
642%if %1 > 4
643  punpckhbw            m2, m0, m5
644  punpcklbw            m0, m5
645%else
646  punpcklbw            m0, m5
647  movhlps              m2, m0
648%endif
649%else ; !avg
650  movx                 m4, [srcq+src_strideq]
651  movx                 m1, [srcq+src_strideq+1]
652  pavgb                m2, m3
653  pavgb                m4, m1
654  pavgb                m0, m2
655  pavgb                m2, m4
656  movx                 m1, [refq]
657  movx                 m3, [refq+ref_strideq]
658  punpcklbw            m0, m5
659  punpcklbw            m2, m5
660  punpcklbw            m3, m5
661  punpcklbw            m1, m5
662%endif
663  SUM_SSE              m0, m1, m2, m3, m6, m7
664  mova                 m0, m4
665
666  lea                srcq, [srcq+src_strideq*2]
667  lea                refq, [refq+ref_strideq*2]
668%endif
669%if %2 == 1 ; avg
670  add                second_predq, second_str
671%endif
672  dec                   block_height
673  jg .x_half_y_half_loop
674  STORE_AND_RET %1
675
676.x_half_y_nonhalf:
677  ; x_offset == 0.5 && y_offset == bilin interpolation
678%if ARCH_X86_64
679  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
680%endif
681  shl           y_offsetd, filter_idx_shift
682%if ARCH_X86_64 && %1 > 4
683  mova                 m8, [bilin_filter+y_offsetq]
684%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
685  mova                 m9, [bilin_filter+y_offsetq+16]
686%endif
687  mova                m10, [GLOBAL(pw_8)]
688%define filter_y_a m8
689%define filter_y_b m9
690%define filter_rnd m10
691%else  ;x86_32
692%if ARCH_X86=1 && CONFIG_PIC=1
693; x_offset == 0.5. We can reuse x_offset reg
694%define tempq x_offsetq
695  add y_offsetq, g_bilin_filterm
696%define filter_y_a [y_offsetq]
697%define filter_y_b [y_offsetq+16]
698  mov tempq, g_pw_8m
699%define filter_rnd [tempq]
700%else
701  add           y_offsetq, bilin_filter
702%define filter_y_a [y_offsetq]
703%define filter_y_b [y_offsetq+16]
704%define filter_rnd [GLOBAL(pw_8)]
705%endif
706%endif
707
708%if %1 == 16
709  movu                 m0, [srcq]
710  movu                 m3, [srcq+1]
711  add                srcq, src_strideq
712  pavgb                m0, m3
713.x_half_y_other_loop:
714  movu                 m4, [srcq]
715  movu                 m2, [srcq+1]
716  mova                 m1, [refq]
717  pavgb                m4, m2
718%if cpuflag(ssse3)
719  punpckhbw            m2, m0, m4
720  punpcklbw            m0, m4
721  pmaddubsw            m2, filter_y_a
722  pmaddubsw            m0, filter_y_a
723  paddw                m2, filter_rnd
724  paddw                m0, filter_rnd
725  psraw                m2, 4
726%else
727  punpckhbw            m2, m0, m5
728  punpckhbw            m3, m4, m5
729  pmullw               m2, filter_y_a
730  pmullw               m3, filter_y_b
731  paddw                m2, filter_rnd
732  punpcklbw            m0, m5
733  paddw                m2, m3
734  punpcklbw            m3, m4, m5
735  pmullw               m0, filter_y_a
736  pmullw               m3, filter_y_b
737  paddw                m0, filter_rnd
738  psraw                m2, 4
739  paddw                m0, m3
740%endif
741  punpckhbw            m3, m1, m5
742  psraw                m0, 4
743%if %2 == 1 ; avg
744  ; FIXME(rbultje) pipeline
745  packuswb             m0, m2
746  pavgb                m0, [second_predq]
747  punpckhbw            m2, m0, m5
748  punpcklbw            m0, m5
749%endif
750  punpcklbw            m1, m5
751  SUM_SSE              m0, m1, m2, m3, m6, m7
752  mova                 m0, m4
753
754  add                srcq, src_strideq
755  add                refq, ref_strideq
756%else ; %1 < 16
757  movx                 m0, [srcq]
758  movx                 m3, [srcq+1]
759  add                srcq, src_strideq
760  pavgb                m0, m3
761%if notcpuflag(ssse3)
762  punpcklbw            m0, m5
763%endif
764.x_half_y_other_loop:
765  movx                 m2, [srcq]
766  movx                 m1, [srcq+1]
767  movx                 m4, [srcq+src_strideq]
768  movx                 m3, [srcq+src_strideq+1]
769  pavgb                m2, m1
770  pavgb                m4, m3
771  movx                 m3, [refq+ref_strideq]
772%if cpuflag(ssse3)
773  movx                 m1, [refq]
774  punpcklbw            m0, m2
775  punpcklbw            m2, m4
776  pmaddubsw            m0, filter_y_a
777  pmaddubsw            m2, filter_y_a
778  punpcklbw            m3, m5
779  paddw                m0, filter_rnd
780  paddw                m2, filter_rnd
781%else
782  punpcklbw            m2, m5
783  punpcklbw            m4, m5
784  pmullw               m0, filter_y_a
785  pmullw               m1, m2, filter_y_b
786  punpcklbw            m3, m5
787  paddw                m0, filter_rnd
788  pmullw               m2, filter_y_a
789  paddw                m0, m1
790  pmullw               m1, m4, filter_y_b
791  paddw                m2, filter_rnd
792  paddw                m2, m1
793  movx                 m1, [refq]
794%endif
795  psraw                m0, 4
796  psraw                m2, 4
797%if %2 == 1 ; avg
798  ; FIXME(rbultje) pipeline
799%if %1 == 4
800  movlhps              m0, m2
801%endif
802  packuswb             m0, m2
803%if %1 > 4
804  pavgb                m0, [second_predq]
805  punpckhbw            m2, m0, m5
806  punpcklbw            m0, m5
807%else
808  movh                 m2, [second_predq]
809  pavgb                m0, m2
810  punpcklbw            m0, m5
811  movhlps              m2, m0
812%endif
813%endif
814  punpcklbw            m1, m5
815  SUM_SSE              m0, m1, m2, m3, m6, m7
816  mova                 m0, m4
817
818  lea                srcq, [srcq+src_strideq*2]
819  lea                refq, [refq+ref_strideq*2]
820%endif
821%if %2 == 1 ; avg
822  add                second_predq, second_str
823%endif
824  dec                   block_height
825  jg .x_half_y_other_loop
826%undef filter_y_a
827%undef filter_y_b
828%undef filter_rnd
829  STORE_AND_RET %1
830
831.x_nonhalf:
832  test          y_offsetd, y_offsetd
833  jnz .x_nonhalf_y_nonzero
834
835  ; x_offset == bilin interpolation && y_offset == 0
836%if ARCH_X86_64
837  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
838%endif
839  shl           x_offsetd, filter_idx_shift
840%if ARCH_X86_64 && %1 > 4
841  mova                 m8, [bilin_filter+x_offsetq]
842%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
843  mova                 m9, [bilin_filter+x_offsetq+16]
844%endif
845  mova                m10, [GLOBAL(pw_8)]
846%define filter_x_a m8
847%define filter_x_b m9
848%define filter_rnd m10
849%else    ; x86-32
850%if ARCH_X86=1 && CONFIG_PIC=1
851;y_offset == 0. We can reuse y_offset reg.
852%define tempq y_offsetq
853  add x_offsetq, g_bilin_filterm
854%define filter_x_a [x_offsetq]
855%define filter_x_b [x_offsetq+16]
856  mov tempq, g_pw_8m
857%define filter_rnd [tempq]
858%else
859  add           x_offsetq, bilin_filter
860%define filter_x_a [x_offsetq]
861%define filter_x_b [x_offsetq+16]
862%define filter_rnd [GLOBAL(pw_8)]
863%endif
864%endif
865
866.x_other_y_zero_loop:
867%if %1 == 16
868  movu                 m0, [srcq]
869  movu                 m4, [srcq+1]
870  mova                 m1, [refq]
871%if cpuflag(ssse3)
872  punpckhbw            m2, m0, m4
873  punpcklbw            m0, m4
874  pmaddubsw            m2, filter_x_a
875  pmaddubsw            m0, filter_x_a
876  paddw                m2, filter_rnd
877  paddw                m0, filter_rnd
878%else
879  punpckhbw            m2, m0, m5
880  punpckhbw            m3, m4, m5
881  punpcklbw            m0, m5
882  punpcklbw            m4, m5
883  pmullw               m2, filter_x_a
884  pmullw               m3, filter_x_b
885  paddw                m2, filter_rnd
886  pmullw               m0, filter_x_a
887  pmullw               m4, filter_x_b
888  paddw                m0, filter_rnd
889  paddw                m2, m3
890  paddw                m0, m4
891%endif
892  psraw                m2, 4
893  psraw                m0, 4
894%if %2 == 1 ; avg
895  ; FIXME(rbultje) pipeline
896  packuswb             m0, m2
897  pavgb                m0, [second_predq]
898  punpckhbw            m2, m0, m5
899  punpcklbw            m0, m5
900%endif
901  punpckhbw            m3, m1, m5
902  punpcklbw            m1, m5
903  SUM_SSE              m0, m1, m2, m3, m6, m7
904
905  add                srcq, src_strideq
906  add                refq, ref_strideq
907%else ; %1 < 16
908  movx                 m0, [srcq]
909  movx                 m1, [srcq+1]
910  movx                 m2, [srcq+src_strideq]
911  movx                 m4, [srcq+src_strideq+1]
912  movx                 m3, [refq+ref_strideq]
913%if cpuflag(ssse3)
914  punpcklbw            m0, m1
915  movx                 m1, [refq]
916  punpcklbw            m2, m4
917  pmaddubsw            m0, filter_x_a
918  pmaddubsw            m2, filter_x_a
919  punpcklbw            m3, m5
920  paddw                m0, filter_rnd
921  paddw                m2, filter_rnd
922%else
923  punpcklbw            m0, m5
924  punpcklbw            m1, m5
925  punpcklbw            m2, m5
926  punpcklbw            m4, m5
927  pmullw               m0, filter_x_a
928  pmullw               m1, filter_x_b
929  punpcklbw            m3, m5
930  paddw                m0, filter_rnd
931  pmullw               m2, filter_x_a
932  pmullw               m4, filter_x_b
933  paddw                m0, m1
934  paddw                m2, filter_rnd
935  movx                 m1, [refq]
936  paddw                m2, m4
937%endif
938  psraw                m0, 4
939  psraw                m2, 4
940%if %2 == 1 ; avg
941  ; FIXME(rbultje) pipeline
942%if %1 == 4
943  movlhps              m0, m2
944%endif
945  packuswb             m0, m2
946%if %1 > 4
947  pavgb                m0, [second_predq]
948  punpckhbw            m2, m0, m5
949  punpcklbw            m0, m5
950%else
951  movh                 m2, [second_predq]
952  pavgb                m0, m2
953  punpcklbw            m0, m5
954  movhlps              m2, m0
955%endif
956%endif
957  punpcklbw            m1, m5
958  SUM_SSE              m0, m1, m2, m3, m6, m7
959
960  lea                srcq, [srcq+src_strideq*2]
961  lea                refq, [refq+ref_strideq*2]
962%endif
963%if %2 == 1 ; avg
964  add                second_predq, second_str
965%endif
966  dec                   block_height
967  jg .x_other_y_zero_loop
968%undef filter_x_a
969%undef filter_x_b
970%undef filter_rnd
971  STORE_AND_RET %1
972
973.x_nonhalf_y_nonzero:
974  cmp           y_offsetd, 4
975  jne .x_nonhalf_y_nonhalf
976
977  ; x_offset == bilin interpolation && y_offset == 0.5
978%if ARCH_X86_64
979  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
980%endif
981  shl           x_offsetd, filter_idx_shift
982%if ARCH_X86_64 && %1 > 4
983  mova                 m8, [bilin_filter+x_offsetq]
984%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
985  mova                 m9, [bilin_filter+x_offsetq+16]
986%endif
987  mova                m10, [GLOBAL(pw_8)]
988%define filter_x_a m8
989%define filter_x_b m9
990%define filter_rnd m10
991%else    ; x86-32
992%if ARCH_X86=1 && CONFIG_PIC=1
993; y_offset == 0.5. We can reuse y_offset reg.
994%define tempq y_offsetq
995  add x_offsetq, g_bilin_filterm
996%define filter_x_a [x_offsetq]
997%define filter_x_b [x_offsetq+16]
998  mov tempq, g_pw_8m
999%define filter_rnd [tempq]
1000%else
1001  add           x_offsetq, bilin_filter
1002%define filter_x_a [x_offsetq]
1003%define filter_x_b [x_offsetq+16]
1004%define filter_rnd [GLOBAL(pw_8)]
1005%endif
1006%endif
1007
1008%if %1 == 16
1009  movu                 m0, [srcq]
1010  movu                 m1, [srcq+1]
1011%if cpuflag(ssse3)
1012  punpckhbw            m2, m0, m1
1013  punpcklbw            m0, m1
1014  pmaddubsw            m2, filter_x_a
1015  pmaddubsw            m0, filter_x_a
1016  paddw                m2, filter_rnd
1017  paddw                m0, filter_rnd
1018%else
1019  punpckhbw            m2, m0, m5
1020  punpckhbw            m3, m1, m5
1021  punpcklbw            m0, m5
1022  punpcklbw            m1, m5
1023  pmullw               m0, filter_x_a
1024  pmullw               m1, filter_x_b
1025  paddw                m0, filter_rnd
1026  pmullw               m2, filter_x_a
1027  pmullw               m3, filter_x_b
1028  paddw                m2, filter_rnd
1029  paddw                m0, m1
1030  paddw                m2, m3
1031%endif
1032  psraw                m0, 4
1033  psraw                m2, 4
1034  add                srcq, src_strideq
1035  packuswb             m0, m2
1036.x_other_y_half_loop:
1037  movu                 m4, [srcq]
1038  movu                 m3, [srcq+1]
1039%if cpuflag(ssse3)
1040  mova                 m1, [refq]
1041  punpckhbw            m2, m4, m3
1042  punpcklbw            m4, m3
1043  pmaddubsw            m2, filter_x_a
1044  pmaddubsw            m4, filter_x_a
1045  paddw                m2, filter_rnd
1046  paddw                m4, filter_rnd
1047  psraw                m2, 4
1048  psraw                m4, 4
1049  packuswb             m4, m2
1050  pavgb                m0, m4
1051  punpckhbw            m3, m1, m5
1052  punpcklbw            m1, m5
1053%else
1054  punpckhbw            m2, m4, m5
1055  punpckhbw            m1, m3, m5
1056  punpcklbw            m4, m5
1057  punpcklbw            m3, m5
1058  pmullw               m4, filter_x_a
1059  pmullw               m3, filter_x_b
1060  paddw                m4, filter_rnd
1061  pmullw               m2, filter_x_a
1062  pmullw               m1, filter_x_b
1063  paddw                m2, filter_rnd
1064  paddw                m4, m3
1065  paddw                m2, m1
1066  mova                 m1, [refq]
1067  psraw                m4, 4
1068  psraw                m2, 4
1069  punpckhbw            m3, m1, m5
1070  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
1071  ; have a 1-register shortage to be able to store the backup of the bilin
1072  ; filtered second line as words as cache for the next line. Packing into
1073  ; a byte costs 1 pack and 2 unpacks, but saves a register.
1074  packuswb             m4, m2
1075  punpcklbw            m1, m5
1076  pavgb                m0, m4
1077%endif
1078%if %2 == 1 ; avg
1079  ; FIXME(rbultje) pipeline
1080  pavgb                m0, [second_predq]
1081%endif
1082  punpckhbw            m2, m0, m5
1083  punpcklbw            m0, m5
1084  SUM_SSE              m0, m1, m2, m3, m6, m7
1085  mova                 m0, m4
1086
1087  add                srcq, src_strideq
1088  add                refq, ref_strideq
1089%else ; %1 < 16
1090  movx                 m0, [srcq]
1091  movx                 m1, [srcq+1]
1092%if cpuflag(ssse3)
1093  punpcklbw            m0, m1
1094  pmaddubsw            m0, filter_x_a
1095  paddw                m0, filter_rnd
1096%else
1097  punpcklbw            m0, m5
1098  punpcklbw            m1, m5
1099  pmullw               m0, filter_x_a
1100  pmullw               m1, filter_x_b
1101  paddw                m0, filter_rnd
1102  paddw                m0, m1
1103%endif
1104  add                srcq, src_strideq
1105  psraw                m0, 4
1106.x_other_y_half_loop:
1107  movx                 m2, [srcq]
1108  movx                 m1, [srcq+1]
1109  movx                 m4, [srcq+src_strideq]
1110  movx                 m3, [srcq+src_strideq+1]
1111%if cpuflag(ssse3)
1112  punpcklbw            m2, m1
1113  punpcklbw            m4, m3
1114  pmaddubsw            m2, filter_x_a
1115  pmaddubsw            m4, filter_x_a
1116  movx                 m1, [refq]
1117  movx                 m3, [refq+ref_strideq]
1118  paddw                m2, filter_rnd
1119  paddw                m4, filter_rnd
1120%else
1121  punpcklbw            m2, m5
1122  punpcklbw            m1, m5
1123  punpcklbw            m4, m5
1124  punpcklbw            m3, m5
1125  pmullw               m2, filter_x_a
1126  pmullw               m1, filter_x_b
1127  paddw                m2, filter_rnd
1128  pmullw               m4, filter_x_a
1129  pmullw               m3, filter_x_b
1130  paddw                m4, filter_rnd
1131  paddw                m2, m1
1132  movx                 m1, [refq]
1133  paddw                m4, m3
1134  movx                 m3, [refq+ref_strideq]
1135%endif
1136  psraw                m2, 4
1137  psraw                m4, 4
1138  pavgw                m0, m2
1139  pavgw                m2, m4
1140%if %2 == 1 ; avg
1141  ; FIXME(rbultje) pipeline - also consider going to bytes here
1142%if %1 == 4
1143  movlhps              m0, m2
1144%endif
1145  packuswb             m0, m2
1146%if %1 > 4
1147  pavgb                m0, [second_predq]
1148  punpckhbw            m2, m0, m5
1149  punpcklbw            m0, m5
1150%else
1151  movh                 m2, [second_predq]
1152  pavgb                m0, m2
1153  punpcklbw            m0, m5
1154  movhlps              m2, m0
1155%endif
1156%endif
1157  punpcklbw            m3, m5
1158  punpcklbw            m1, m5
1159  SUM_SSE              m0, m1, m2, m3, m6, m7
1160  mova                 m0, m4
1161
1162  lea                srcq, [srcq+src_strideq*2]
1163  lea                refq, [refq+ref_strideq*2]
1164%endif
1165%if %2 == 1 ; avg
1166  add                second_predq, second_str
1167%endif
1168  dec                   block_height
1169  jg .x_other_y_half_loop
1170%undef filter_x_a
1171%undef filter_x_b
1172%undef filter_rnd
1173  STORE_AND_RET %1
1174
1175.x_nonhalf_y_nonhalf:
1176%if ARCH_X86_64
1177  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
1178%endif
1179  shl           x_offsetd, filter_idx_shift
1180  shl           y_offsetd, filter_idx_shift
1181%if ARCH_X86_64 && %1 > 4
1182  mova                 m8, [bilin_filter+x_offsetq]
1183%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
1184  mova                 m9, [bilin_filter+x_offsetq+16]
1185%endif
1186  mova                m10, [bilin_filter+y_offsetq]
1187%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
1188  mova                m11, [bilin_filter+y_offsetq+16]
1189%endif
1190  mova                m12, [GLOBAL(pw_8)]
1191%define filter_x_a m8
1192%define filter_x_b m9
1193%define filter_y_a m10
1194%define filter_y_b m11
1195%define filter_rnd m12
1196%else   ; x86-32
1197%if ARCH_X86=1 && CONFIG_PIC=1
1198; In this case there is no unused register, so the src_stride register is
1199; reused as tempq. src_stride must be reloaded from the stack when it is needed.
1200%define tempq src_strideq
1201  mov tempq, g_bilin_filterm
1202  add           x_offsetq, tempq
1203  add           y_offsetq, tempq
1204%define filter_x_a [x_offsetq]
1205%define filter_x_b [x_offsetq+16]
1206%define filter_y_a [y_offsetq]
1207%define filter_y_b [y_offsetq+16]
1208
1209  mov tempq, g_pw_8m
1210%define filter_rnd [tempq]
1211%else
1212  add           x_offsetq, bilin_filter
1213  add           y_offsetq, bilin_filter
1214%define filter_x_a [x_offsetq]
1215%define filter_x_b [x_offsetq+16]
1216%define filter_y_a [y_offsetq]
1217%define filter_y_b [y_offsetq+16]
1218%define filter_rnd [GLOBAL(pw_8)]
1219%endif
1220%endif
1221
1222  ; x_offset == bilin interpolation && y_offset == bilin interpolation
1223%if %1 == 16
1224  movu                 m0, [srcq]
1225  movu                 m1, [srcq+1]
1226%if cpuflag(ssse3)
1227  punpckhbw            m2, m0, m1
1228  punpcklbw            m0, m1
1229  pmaddubsw            m2, filter_x_a
1230  pmaddubsw            m0, filter_x_a
1231  paddw                m2, filter_rnd
1232  paddw                m0, filter_rnd
1233%else
1234  punpckhbw            m2, m0, m5
1235  punpckhbw            m3, m1, m5
1236  punpcklbw            m0, m5
1237  punpcklbw            m1, m5
1238  pmullw               m0, filter_x_a
1239  pmullw               m1, filter_x_b
1240  paddw                m0, filter_rnd
1241  pmullw               m2, filter_x_a
1242  pmullw               m3, filter_x_b
1243  paddw                m2, filter_rnd
1244  paddw                m0, m1
1245  paddw                m2, m3
1246%endif
1247  psraw                m0, 4
1248  psraw                m2, 4
1249
1250  INC_SRC_BY_SRC_STRIDE
1251
1252  packuswb             m0, m2
1253.x_other_y_other_loop:
1254%if cpuflag(ssse3)
1255  movu                 m4, [srcq]
1256  movu                 m3, [srcq+1]
1257  mova                 m1, [refq]
1258  punpckhbw            m2, m4, m3
1259  punpcklbw            m4, m3
1260  pmaddubsw            m2, filter_x_a
1261  pmaddubsw            m4, filter_x_a
1262  punpckhbw            m3, m1, m5
1263  paddw                m2, filter_rnd
1264  paddw                m4, filter_rnd
1265  psraw                m2, 4
1266  psraw                m4, 4
1267  packuswb             m4, m2
1268  punpckhbw            m2, m0, m4
1269  punpcklbw            m0, m4
1270  pmaddubsw            m2, filter_y_a
1271  pmaddubsw            m0, filter_y_a
1272  punpcklbw            m1, m5
1273  paddw                m2, filter_rnd
1274  paddw                m0, filter_rnd
1275  psraw                m2, 4
1276  psraw                m0, 4
1277%else
1278  movu                 m3, [srcq]
1279  movu                 m4, [srcq+1]
1280  punpckhbw            m1, m3, m5
1281  punpckhbw            m2, m4, m5
1282  punpcklbw            m3, m5
1283  punpcklbw            m4, m5
1284  pmullw               m3, filter_x_a
1285  pmullw               m4, filter_x_b
1286  paddw                m3, filter_rnd
1287  pmullw               m1, filter_x_a
1288  pmullw               m2, filter_x_b
1289  paddw                m1, filter_rnd
1290  paddw                m3, m4
1291  paddw                m1, m2
1292  psraw                m3, 4
1293  psraw                m1, 4
1294  packuswb             m4, m3, m1
1295  punpckhbw            m2, m0, m5
1296  punpcklbw            m0, m5
1297  pmullw               m2, filter_y_a
1298  pmullw               m1, filter_y_b
1299  paddw                m2, filter_rnd
1300  pmullw               m0, filter_y_a
1301  pmullw               m3, filter_y_b
1302  paddw                m2, m1
1303  mova                 m1, [refq]
1304  paddw                m0, filter_rnd
1305  psraw                m2, 4
1306  paddw                m0, m3
1307  punpckhbw            m3, m1, m5
1308  psraw                m0, 4
1309  punpcklbw            m1, m5
1310%endif
1311%if %2 == 1 ; avg
1312  ; FIXME(rbultje) pipeline
1313  packuswb             m0, m2
1314  pavgb                m0, [second_predq]
1315  punpckhbw            m2, m0, m5
1316  punpcklbw            m0, m5
1317%endif
1318  SUM_SSE              m0, m1, m2, m3, m6, m7
1319  mova                 m0, m4
1320
1321  INC_SRC_BY_SRC_STRIDE
1322  add                refq, ref_strideq
1323%else ; %1 < 16
1324  movx                 m0, [srcq]
1325  movx                 m1, [srcq+1]
1326%if cpuflag(ssse3)
1327  punpcklbw            m0, m1
1328  pmaddubsw            m0, filter_x_a
1329  paddw                m0, filter_rnd
1330%else
1331  punpcklbw            m0, m5
1332  punpcklbw            m1, m5
1333  pmullw               m0, filter_x_a
1334  pmullw               m1, filter_x_b
1335  paddw                m0, filter_rnd
1336  paddw                m0, m1
1337%endif
1338  psraw                m0, 4
1339%if cpuflag(ssse3)
1340  packuswb             m0, m0
1341%endif
1342
1343  INC_SRC_BY_SRC_STRIDE
1344
1345.x_other_y_other_loop:
1346  movx                 m2, [srcq]
1347  movx                 m1, [srcq+1]
1348
1349  INC_SRC_BY_SRC_STRIDE
1350  movx                 m4, [srcq]
1351  movx                 m3, [srcq+1]
1352
1353%if cpuflag(ssse3)
1354  punpcklbw            m2, m1
1355  punpcklbw            m4, m3
1356  pmaddubsw            m2, filter_x_a
1357  pmaddubsw            m4, filter_x_a
1358  movx                 m3, [refq+ref_strideq]
1359  movx                 m1, [refq]
1360  paddw                m2, filter_rnd
1361  paddw                m4, filter_rnd
1362  psraw                m2, 4
1363  psraw                m4, 4
1364  packuswb             m2, m2
1365  packuswb             m4, m4
1366  punpcklbw            m0, m2
1367  punpcklbw            m2, m4
1368  pmaddubsw            m0, filter_y_a
1369  pmaddubsw            m2, filter_y_a
1370  punpcklbw            m3, m5
1371  paddw                m0, filter_rnd
1372  paddw                m2, filter_rnd
1373  psraw                m0, 4
1374  psraw                m2, 4
1375  punpcklbw            m1, m5
1376%else
1377  punpcklbw            m2, m5
1378  punpcklbw            m1, m5
1379  punpcklbw            m4, m5
1380  punpcklbw            m3, m5
1381  pmullw               m2, filter_x_a
1382  pmullw               m1, filter_x_b
1383  paddw                m2, filter_rnd
1384  pmullw               m4, filter_x_a
1385  pmullw               m3, filter_x_b
1386  paddw                m4, filter_rnd
1387  paddw                m2, m1
1388  paddw                m4, m3
1389  psraw                m2, 4
1390  psraw                m4, 4
1391  pmullw               m0, filter_y_a
1392  pmullw               m3, m2, filter_y_b
1393  paddw                m0, filter_rnd
1394  pmullw               m2, filter_y_a
1395  pmullw               m1, m4, filter_y_b
1396  paddw                m2, filter_rnd
1397  paddw                m0, m3
1398  movx                 m3, [refq+ref_strideq]
1399  paddw                m2, m1
1400  movx                 m1, [refq]
1401  psraw                m0, 4
1402  psraw                m2, 4
1403  punpcklbw            m3, m5
1404  punpcklbw            m1, m5
1405%endif
1406%if %2 == 1 ; avg
1407  ; FIXME(rbultje) pipeline
1408%if %1 == 4
1409  movlhps              m0, m2
1410%endif
1411  packuswb             m0, m2
1412%if %1 > 4
1413  pavgb                m0, [second_predq]
1414  punpckhbw            m2, m0, m5
1415  punpcklbw            m0, m5
1416%else
1417  movh                 m2, [second_predq]
1418  pavgb                m0, m2
1419  punpcklbw            m0, m5
1420  movhlps              m2, m0
1421%endif
1422%endif
1423  SUM_SSE              m0, m1, m2, m3, m6, m7
1424  mova                 m0, m4
1425
1426  INC_SRC_BY_SRC_STRIDE
1427  lea                refq, [refq+ref_strideq*2]
1428%endif
1429%if %2 == 1 ; avg
1430  add                second_predq, second_str
1431%endif
1432  dec                   block_height
1433  jg .x_other_y_other_loop
1434%undef filter_x_a
1435%undef filter_x_b
1436%undef filter_y_a
1437%undef filter_y_b
1438%undef filter_rnd
1439%undef movx
1440  STORE_AND_RET %1
1441%endmacro
1442
1443; FIXME(rbultje) the non-bilinear versions (i.e. x=0,4&&y=0,4, where 4 is the
1444; half-pel offset) are identical between the ssse3 and non-ssse3 versions. It
1445; may make sense to merge their code, in the sense that the ssse3 version
1446; would jump to the appropriate location in the sse2 version, rather than
1447; duplicating that code in the binary.
1448
1449INIT_XMM sse2 ; sse2 build: per x86inc, cpuflag(ssse3) is false here, so the pmullw-based filter paths are assembled
1450SUBPEL_VARIANCE  4 ; first argument is the value the macro body compares against 4 and 16 (presumably block width N -- confirm)
1451SUBPEL_VARIANCE  8 ; no second argument: the avg-only paths are expected to be left out (default lives in the macro header -- confirm)
1452SUBPEL_VARIANCE 16
1453
1454INIT_XMM ssse3 ; ssse3 build: the pmaddubsw-based filter paths are assembled instead
1455SUBPEL_VARIANCE  4
1456SUBPEL_VARIANCE  8
1457SUBPEL_VARIANCE 16
1458
1459INIT_XMM sse2 ; sse2 again, this time for the avg variants
1460SUBPEL_VARIANCE  4, 1 ; second argument 1 enables the avg paths: pavgb with [second_predq], then second_predq advances by second_str
1461SUBPEL_VARIANCE  8, 1
1462SUBPEL_VARIANCE 16, 1
1463
1464INIT_XMM ssse3 ; ssse3 avg variants
1465SUBPEL_VARIANCE  4, 1
1466SUBPEL_VARIANCE  8, 1
1467SUBPEL_VARIANCE 16, 1
1468