/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/x86/mem_sse2.h"

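// Horizontally adds the four 32-bit lanes of |val| and returns the scalar
// result.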
static INLINE unsigned int add32x4_sse2(__m128i val) {
  val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
  val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
  return _mm_cvtsi128_si32(val);
}

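// Sums the squares of 256 consecutive int16_t values (e.g. the residual of a
// 16x16 macroblock), using _mm_madd_epi16() to square and pairwise-add into
// 32-bit lanes.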
unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) {
  __m128i vsum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 32; ++i) {
    const __m128i v = _mm_loadu_si128((const __m128i *)src_ptr);
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src_ptr += 8;
  }

  return add32x4_sse2(vsum);
}

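// Loads two 4-pixel rows of 8-bit pixels from |p| (rows |stride| apart) and
// zero-extends them to eight 16-bit lanes.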
static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
  const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride));
  const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride));
  const __m128i p01 = _mm_unpacklo_epi32(p0, p1);
  return _mm_unpacklo_epi8(p01, _mm_setzero_si128());
}

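// For eight 16-bit source/reference lanes, accumulates the differences into
// *sum (eight 16-bit lanes) and the squared differences into *sse (four
// 32-bit lanes).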
static INLINE void variance_kernel_sse2(const __m128i src_ptr,
                                        const __m128i ref_ptr,
                                        __m128i *const sse,
                                        __m128i *const sum) {
  const __m128i diff = _mm_sub_epi16(src_ptr, ref_ptr);
  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
  *sum = _mm_add_epi16(*sum, diff);
}

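// The variance_final_*_pel helpers below reduce the vector accumulators to
// scalars.  They differ in how much of the reduction is safe in 16-bit
// arithmetic: with 8-bit inputs each difference is within +/-255, so a 16-bit
// lane can hold at most 128 such differences (128 * 255 = 32640 < 32767).
// Hence the 128-pel version folds entirely in 16 bits, the 256-pel version
// stops one fold early and finishes in scalar code, and the 512/1024-pel
// versions sign-extend to 32 bits before the final additions.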
// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
// Slightly faster than variance_final_256_pel_sse2()
static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
                                               unsigned int *const sse,
                                               int *const sum) {
  *sse = add32x4_sse2(vsse);

  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
}

// Can handle 256 pixels' diff sum (such as 16x16)
static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
                                               unsigned int *const sse,
                                               int *const sum) {
  *sse = add32x4_sse2(vsse);

  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
  *sum += (int16_t)_mm_extract_epi16(vsum, 1);
}

// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
                                               unsigned int *const sse,
                                               int *const sum) {
  *sse = add32x4_sse2(vsse);

  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_unpacklo_epi16(vsum, vsum);
  vsum = _mm_srai_epi32(vsum, 16);
  *sum = add32x4_sse2(vsum);
}

static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
  const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
  const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
  return _mm_add_epi32(sum_lo, sum_hi);
}

// Can handle 1024 pixels' diff sum (such as 32x32)
static INLINE int sum_final_sse2(const __m128i sum) {
  const __m128i t = sum_to_32bit_sse2(sum);
  return add32x4_sse2(t);
}

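// The varianceN_sse2() helpers accumulate the diff sum and SSE for an N-pixel
// wide block over h rows.  The assert()ed height limits keep the total pixel
// count at 1024 or below so that no 16-bit lane of *sum overflows (see the
// note above the variance_final_*_pel helpers).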
static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride,
                                  const uint8_t *ref_ptr, const int ref_stride,
                                  const int h, __m128i *const sse,
                                  __m128i *const sum) {
  int i;

  assert(h <= 256);  // May overflow for larger height.
  *sse = _mm_setzero_si128();
  *sum = _mm_setzero_si128();

  for (i = 0; i < h; i += 2) {
    const __m128i s = load4x2_sse2(src_ptr, src_stride);
    const __m128i r = load4x2_sse2(ref_ptr, ref_stride);

    variance_kernel_sse2(s, r, sse, sum);
    src_ptr += 2 * src_stride;
    ref_ptr += 2 * ref_stride;
  }
}

static INLINE void variance8_sse2(const uint8_t *src_ptr, const int src_stride,
                                  const uint8_t *ref_ptr, const int ref_stride,
                                  const int h, __m128i *const sse,
                                  __m128i *const sum) {
  const __m128i zero = _mm_setzero_si128();
  int i;

  assert(h <= 128);  // May overflow for larger height.
  *sse = _mm_setzero_si128();
  *sum = _mm_setzero_si128();

  for (i = 0; i < h; i++) {
    const __m128i s =
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src_ptr), zero);
    const __m128i r =
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ref_ptr), zero);

    variance_kernel_sse2(s, r, sse, sum);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
}

static INLINE void variance16_kernel_sse2(const uint8_t *const src_ptr,
                                          const uint8_t *const ref_ptr,
                                          __m128i *const sse,
                                          __m128i *const sum) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i s = _mm_loadu_si128((const __m128i *)src_ptr);
  const __m128i r = _mm_loadu_si128((const __m128i *)ref_ptr);
  const __m128i src0 = _mm_unpacklo_epi8(s, zero);
  const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
  const __m128i src1 = _mm_unpackhi_epi8(s, zero);
  const __m128i ref1 = _mm_unpackhi_epi8(r, zero);

  variance_kernel_sse2(src0, ref0, sse, sum);
  variance_kernel_sse2(src1, ref1, sse, sum);
}

static INLINE void variance16_sse2(const uint8_t *src_ptr, const int src_stride,
                                   const uint8_t *ref_ptr, const int ref_stride,
                                   const int h, __m128i *const sse,
                                   __m128i *const sum) {
  int i;

  assert(h <= 64);  // May overflow for larger height.
  *sse = _mm_setzero_si128();
  *sum = _mm_setzero_si128();

  for (i = 0; i < h; ++i) {
    variance16_kernel_sse2(src_ptr, ref_ptr, sse, sum);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
}

static INLINE void variance32_sse2(const uint8_t *src_ptr, const int src_stride,
                                   const uint8_t *ref_ptr, const int ref_stride,
                                   const int h, __m128i *const sse,
                                   __m128i *const sum) {
  int i;

  assert(h <= 32);  // May overflow for larger height.
  // Don't initialize sse here since it's an accumulation.
  *sum = _mm_setzero_si128();

  for (i = 0; i < h; ++i) {
    variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum);
    variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
}

static INLINE void variance64_sse2(const uint8_t *src_ptr, const int src_stride,
                                   const uint8_t *ref_ptr, const int ref_stride,
                                   const int h, __m128i *const sse,
                                   __m128i *const sum) {
  int i;

  assert(h <= 16);  // May overflow for larger height.
  // Don't initialize sse here since it's an accumulation.
  *sum = _mm_setzero_si128();

  for (i = 0; i < h; ++i) {
    variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum);
    variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum);
    variance16_kernel_sse2(src_ptr + 32, ref_ptr + 32, sse, sum);
    variance16_kernel_sse2(src_ptr + 48, ref_ptr + 48, sse, sum);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
}

void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *ref_ptr, int ref_stride,
                        unsigned int *sse, int *sum) {
  __m128i vsse, vsum;
  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
  variance_final_128_pel_sse2(vsse, vsum, sse, sum);
}

void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *ref_ptr, int ref_stride,
                          unsigned int *sse, int *sum) {
  __m128i vsse, vsum;
  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
  variance_final_256_pel_sse2(vsse, vsum, sse, sum);
}

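// Each vpx_varianceWxH_sse2() below computes
//     variance = SSE - (sum * sum) / (W * H)
// with the division implemented as a right shift by log2(W * H).  As a rough
// scalar sketch of what the SIMD code accumulates (for illustration only, not
// part of the build; LOG2_WH stands for log2(W * H)):
//
//   int r, c, sum = 0;
//   int64_t sse64 = 0;
//   for (r = 0; r < H; ++r) {
//     for (c = 0; c < W; ++c) {
//       const int diff =
//           src_ptr[r * src_stride + c] - ref_ptr[r * ref_stride + c];
//       sum += diff;
//       sse64 += diff * diff;
//     }
//   }
//   *sse = (unsigned int)sse64;
//   return *sse - (unsigned int)(((int64_t)sum * sum) >> LOG2_WH);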
unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - ((sum * sum) >> 4);
}

unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - ((sum * sum) >> 5);
}

unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - ((sum * sum) >> 5);
}

unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - ((sum * sum) >> 6);
}

unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - ((sum * sum) >> 7);
}

unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - ((sum * sum) >> 7);
}

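// From 256 pixels (16x16) upward, |sum| can be as large as 256 * 255 = 65280,
// so sum * sum no longer fits in a signed 32-bit int; the product is formed
// in 64 bits before the shift.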
unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
  variance_final_256_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
}

unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m128i vsse, vsum;
  int sum;
  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
  variance_final_512_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m128i vsse = _mm_setzero_si128();
  __m128i vsum;
  int sum;
  variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
  variance_final_512_pel_sse2(vsse, vsum, sse, &sum);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m128i vsse = _mm_setzero_si128();
  __m128i vsum;
  int sum;
  variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
  *sse = add32x4_sse2(vsse);
  sum = sum_final_sse2(vsum);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}

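// For the 2048- and 4096-pixel block sizes below, a single pass would
// overflow the 16-bit row accumulators, so the block is processed in
// 1024-pixel slices whose partial sums are widened to 32 bits and accumulated
// in vsum.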
unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m128i vsse = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  int sum;
  int i = 0;

  for (i = 0; i < 2; i++) {
    __m128i vsum16;
    variance32_sse2(src_ptr + 32 * i * src_stride, src_stride,
                    ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse,
                    &vsum16);
    vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
  }
  *sse = add32x4_sse2(vsse);
  sum = add32x4_sse2(vsum);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}

unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m128i vsse = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  int sum;
  int i = 0;

  for (i = 0; i < 2; i++) {
    __m128i vsum16;
    variance64_sse2(src_ptr + 16 * i * src_stride, src_stride,
                    ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse,
                    &vsum16);
    vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
  }
  *sse = add32x4_sse2(vsse);
  sum = add32x4_sse2(vsum);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}

unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
  __m128i vsse = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  int sum;
  int i = 0;

  for (i = 0; i < 4; i++) {
    __m128i vsum16;
    variance64_sse2(src_ptr + 16 * i * src_stride, src_stride,
                    ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse,
                    &vsum16);
    vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
  }
  *sse = add32x4_sse2(vsse);
  sum = add32x4_sse2(vsum);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
}

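// The vpx_mse*_sse2() wrappers reuse the corresponding variance functions and
// return only the SSE output, ignoring the mean-corrected variance.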
unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             unsigned int *sse) {
  vpx_variance8x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride,
                              unsigned int *sse) {
  vpx_variance8x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride,
                              unsigned int *sse) {
  vpx_variance16x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *ref_ptr, int ref_stride,
                               unsigned int *sse) {
  vpx_variance16x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
  return *sse;
}

// The 2 unused parameters are placeholders for the PIC enabled build.
// These declarations are for functions defined in subpel_variance.asm.
#define DECL(w, opt)                                                          \
  int vpx_sub_pixel_variance##w##xh_##opt(                                    \
      const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset,             \
      int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, int height, \
      unsigned int *sse, void *unused0, void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt1);          \
  DECL(8, opt1);          \
  DECL(16, opt1)

DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL

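// FN() builds a full-block sub-pixel variance function on top of the wf-wide
// assembly kernels declared above: blocks wider than wf are handled as
// 16-pixel columns (offsets +16, +32, +48), the per-column sums of
// differences and SSEs are accumulated, and the variance is formed as
// SSE - (se * se) / (w * h), with cast_prod/cast choosing a product type wide
// enough for se * se.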
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                  \
  unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(                   \
      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
    unsigned int sse_tmp;                                                 \
    int se = vpx_sub_pixel_variance##wf##xh_##opt(                        \
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
        &sse_tmp, NULL, NULL);                                            \
    if (w > wf) {                                                         \
      unsigned int sse2;                                                  \
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                     \
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
          ref_stride, h, &sse2, NULL, NULL);                              \
      se += se2;                                                          \
      sse_tmp += sse2;                                                    \
      if (w > wf * 2) {                                                   \
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
            ref_stride, h, &sse2, NULL, NULL);                            \
        se += se2;                                                        \
        sse_tmp += sse2;                                                  \
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
            ref_stride, h, &sse2, NULL, NULL);                            \
        se += se2;                                                        \
        sse_tmp += sse2;                                                  \
      }                                                                   \
    }                                                                     \
    *sse = sse_tmp;                                                       \
    return sse_tmp -                                                      \
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
  }

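// For instance, FN(16, 8, 16, 4, 3, sse2, (int32_t), (int32_t)) expands to
// vpx_sub_pixel_variance16x8_sse2(), which makes a single 16-wide kernel call
// (w == wf) and returns roughly sse_tmp - (unsigned int)((se * se) >> 7).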
#define FNS(opt1, opt2)                              \
  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t));   \
  FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t));    \
  FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t));     \
  FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t));     \
  FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t));     \
  FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))

FNS(sse2, sse2);
FNS(ssse3, ssse3);

#undef FNS
#undef FN

// The 2 unused parameters are placeholders for the PIC enabled build.
#define DECL(w, opt)                                                   \
  int vpx_sub_pixel_avg_variance##w##xh_##opt(                         \
      const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset,      \
      int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride,      \
      const uint8_t *second_pred, ptrdiff_t second_stride, int height, \
      unsigned int *sse, void *unused0, void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt1);          \
  DECL(8, opt1);          \
  DECL(16, opt1)

DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS

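// Same column-splitting scheme as above, but each kernel call also averages
// in the matching 16-pixel column of second_pred; w is passed as the
// predictor stride, i.e. second_pred is treated as a contiguous w-wide block.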
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                  \
  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(               \
      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,          \
      const uint8_t *second_pred) {                                       \
    unsigned int sse_tmp;                                                 \
    int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(                    \
        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride,     \
        second_pred, w, h, &sse_tmp, NULL, NULL);                         \
    if (w > wf) {                                                         \
      unsigned int sse2;                                                  \
      int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                 \
          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
          ref_stride, second_pred + 16, w, h, &sse2, NULL, NULL);         \
      se += se2;                                                          \
      sse_tmp += sse2;                                                    \
      if (w > wf * 2) {                                                   \
        se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                   \
            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
            ref_stride, second_pred + 32, w, h, &sse2, NULL, NULL);       \
        se += se2;                                                        \
        sse_tmp += sse2;                                                  \
        se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                   \
            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
            ref_stride, second_pred + 48, w, h, &sse2, NULL, NULL);       \
        se += se2;                                                        \
        sse_tmp += sse2;                                                  \
      }                                                                   \
    }                                                                     \
    *sse = sse_tmp;                                                       \
    return sse_tmp -                                                      \
           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
  }

#define FNS(opt1, opt2)                              \
  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t));  \
  FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t));   \
  FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t));    \
  FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t));    \
  FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t));    \
  FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN