;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
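; Loads two 4-pixel rows from src and from each of four refs, packs the row
; pairs into qwords (punpckldq/movlhps) so one psadbw covers two refs at
; once: m6 accumulates the {ref1, ref2} SADs in its low/high qwords, m7 the
; {ref3, ref4} SADs. first == 1 seeds the accumulators; later calls add
; into them.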
%macro PROCESS_4x2x4 5-6 0
  movd                  m0, [srcq +%2]
%if %1 == 1
  movd                  m6, [ref1q+%3]
  movd                  m4, [ref2q+%3]
  movd                  m7, [ref3q+%3]
  movd                  m5, [ref4q+%3]
  movd                  m1, [srcq +%4]
  movd                  m2, [ref1q+%5]
  punpckldq             m0, m1
  punpckldq             m6, m2
  movd                  m1, [ref2q+%5]
  movd                  m2, [ref3q+%5]
  movd                  m3, [ref4q+%5]
  punpckldq             m4, m1
  punpckldq             m7, m2
  punpckldq             m5, m3
  movlhps               m0, m0
  movlhps               m6, m4
  movlhps               m7, m5
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movd                  m1, [ref1q+%3]
  movd                  m5, [ref1q+%5]
  movd                  m2, [ref2q+%3]
  movd                  m4, [ref2q+%5]
  punpckldq             m1, m5
  punpckldq             m2, m4
  movd                  m3, [ref3q+%3]
  movd                  m5, [ref3q+%5]
  punpckldq             m3, m5
  movd                  m4, [ref4q+%3]
  movd                  m5, [ref4q+%5]
  punpckldq             m4, m5
  movd                  m5, [srcq +%4]
  punpckldq             m0, m5
  movlhps               m0, m0
  movlhps               m1, m2
  movlhps               m3, m4
  psadbw                m1, m0
  psadbw                m3, m0
  paddd                 m6, m1
  paddd                 m7, m3
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
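; Same two-row scheme for 8-pixel rows: movh/movhps pack each row pair into
; a single register, and m4..m7 accumulate the SADs against ref1..ref4
; respectively.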
%macro PROCESS_8x2x4 5-6 0
  movh                  m0, [srcq +%2]
%if %1 == 1
  movh                  m4, [ref1q+%3]
  movh                  m5, [ref2q+%3]
  movh                  m6, [ref3q+%3]
  movh                  m7, [ref4q+%3]
  movhps                m0, [srcq +%4]
  movhps                m4, [ref1q+%5]
  movhps                m5, [ref2q+%5]
  movhps                m6, [ref3q+%5]
  movhps                m7, [ref4q+%5]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movh                  m1, [ref1q+%3]
  movh                  m2, [ref2q+%3]
  movh                  m3, [ref3q+%3]
  movhps                m0, [srcq +%4]
  movhps                m1, [ref1q+%5]
  movhps                m2, [ref2q+%5]
  movhps                m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movh                  m1, [ref4q+%3]
  movhps                m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
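; Full-register rows: each 16-pixel row is one aligned src load (mova) and
; four unaligned ref loads (movu). m4..m7 accumulate the per-ref SADs; the
; two rows are processed back to back rather than packed together.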
%macro PROCESS_16x2x4 5-6 0
  ; 1st 16 px
  mova                  m0, [srcq +%2]
%if %1 == 1
  movu                  m4, [ref1q+%3]
  movu                  m5, [ref2q+%3]
  movu                  m6, [ref3q+%3]
  movu                  m7, [ref4q+%3]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movu                  m1, [ref1q+%3]
  movu                  m2, [ref2q+%3]
  movu                  m3, [ref3q+%3]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%3]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif

  ; 2nd 16 px
  mova                  m0, [srcq +%4]
  movu                  m1, [ref1q+%5]
  movu                  m2, [ref2q+%5]
  movu                  m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
  psadbw                m1, m0
  paddd                 m7, m1
%endmacro

; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
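; 32-wide rows are handled as two 16-pixel halves: the first call covers
; both halves of the first row, the second call both halves of the second
; row plus the optional pointer advance.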
%macro PROCESS_32x2x4 5-6 0
  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6
%endmacro

; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
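; Likewise, 64-wide rows are two 32-pixel halves per row.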
%macro PROCESS_64x2x4 5-6 0
  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro

; void vpx_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         uint32_t res[4]);
; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8,
;             8x16, 8x8, 8x4, 4x8 and 4x4
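;
; Computes the SAD of one NxN src block against each of four NxN reference
; blocks in a single pass, writing the four sums to res. ref[] holds the
; four reference pointers; all four references share ref_stride.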
%macro SADNXN4D 2
%if UNIX64
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
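  ; On entry ref1q holds the ref[] array pointer; unpack the four reference
  ; pointers from it, overwriting ref1q last.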
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

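  ; Walk the block two rows per call: the first call seeds the accumulators
  ; and every call but the last advances the pointers.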
  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

%if %1 > 4
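  ; Each accumulator holds two dword partial sums, one per qword (psadbw).
  ; Merge ref1/ref2 into m4 and ref3/ref4 into m6, split the low and high
  ; qwords apart, then add to leave {ref1, ref2, ref3, ref4} in m4.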
  pslldq                m5, 4
  pslldq                m7, 4
  por                   m4, m5
  por                   m6, m7
  mova                  m5, m4
  mova                  m7, m6
  punpcklqdq            m4, m6
  punpckhqdq            m5, m7
  movifnidn             r4, r4mp
  paddd                 m4, m5
  movu                [r4], m4
  RET
%else
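  ; 4-wide blocks keep {ref1, ref2} in m6 and {ref3, ref4} in m7, one sum
  ; per qword; pshufd with 0x08 packs each pair into the low two dwords
  ; ahead of the movq stores.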
  movifnidn             r4, r4mp
  pshufd                m6, m6, 0x08
  pshufd                m7, m7, 0x08
  movq              [r4+0], m6
  movq              [r4+8], m7
  RET
%endif
%endmacro

INIT_XMM sse2
SADNXN4D 64, 64
SADNXN4D 64, 32
SADNXN4D 32, 64
SADNXN4D 32, 32
SADNXN4D 32, 16
SADNXN4D 16, 32
SADNXN4D 16, 16
SADNXN4D 16,  8
SADNXN4D  8, 16
SADNXN4D  8,  8
SADNXN4D  8,  4
SADNXN4D  4,  8
SADNXN4D  4,  4