1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vpx_config.h"
12 
13 #include "vp9/encoder/vp9_variance.h"
14 #include "vp9/common/vp9_pragmas.h"
15 #include "vpx_ports/mem.h"
16 
/*
 * Prototypes for the block kernels used by the wrappers in this file.
 * No definition of these functions is visible here (presumably they are
 * implemented in assembly sources -- TODO confirm).
 */

/*
 * Fixed-size kernels. Each takes a source and a reference buffer with their
 * strides and writes its results through SSE and Sum. The callers in this
 * file ignore the return value.
 */
extern unsigned int vp9_get4x4var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE, int *Sum);

unsigned int vp9_get16x16var_sse2(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *SSE, int *Sum);

unsigned int vp9_get8x8var_sse2(const unsigned char *src_ptr,
                                int source_stride,
                                const unsigned char *ref_ptr,
                                int recon_stride,
                                unsigned int *SSE, int *Sum);

/*
 * Variable-height kernels for 8- and 16-wide columns. Results are written
 * through sum and sumsquared. NOTE(review): judging by the names these
 * cover the half-pixel horizontal+vertical, horizontal-only and
 * vertical-only cases -- confirm against the kernel sources.
 */
void vp9_half_horiz_vert_variance8x_h_sse2(const unsigned char *ref_ptr,
                                           int ref_pixels_per_line,
                                           const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           unsigned int Height,
                                           int *sum,
                                           unsigned int *sumsquared);

void vp9_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
                                            int ref_pixels_per_line,
                                            const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            unsigned int Height,
                                            int *sum,
                                            unsigned int *sumsquared);

void vp9_half_horiz_variance8x_h_sse2(const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum,
                                      unsigned int *sumsquared);

void vp9_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr,
                                       int ref_pixels_per_line,
                                       const unsigned char *src_ptr,
                                       int src_pixels_per_line,
                                       unsigned int Height,
                                       int *sum,
                                       unsigned int *sumsquared);

void vp9_half_vert_variance8x_h_sse2(const unsigned char *ref_ptr,
                                     int ref_pixels_per_line,
                                     const unsigned char *src_ptr,
                                     int src_pixels_per_line,
                                     unsigned int Height,
                                     int *sum,
                                     unsigned int *sumsquared);

void vp9_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum,
                                      unsigned int *sumsquared);
105 
/*
 * Pointer to a fixed-size block kernel: takes a source and a reference
 * buffer with their strides and writes its results through SSE and Sum.
 * This is the signature of vp9_get4x4var_mmx, vp9_get8x8var_sse2 and
 * vp9_get16x16var_sse2.
 */
typedef unsigned int (*get_var_sse2)(const unsigned char *src_ptr,
                                     int source_stride,
                                     const unsigned char *ref_ptr,
                                     int recon_stride,
                                     unsigned int *SSE,
                                     int *Sum);
114 
/*
 * Walks a w-wide, h-high area in block_size x block_size tiles, calls var_fn
 * on each tile and accumulates the values it writes into *sse and *sum.
 *
 * src_ptr / source_stride : first buffer and its row stride
 * ref_ptr / recon_stride  : second buffer and its row stride
 * w, h                    : extent of the area (tiles start at multiples of
 *                           block_size below w and h)
 * sse, sum                : outputs; zeroed first, then accumulated into
 * var_fn                  : per-tile kernel (its return value is ignored)
 * block_size              : tile edge length handled by var_fn
 */
static void variance_sse2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_sse2 var_fn, int block_size) {
  int row;

  *sse = 0;
  *sum = 0;

  for (row = 0; row < h; row += block_size) {
    /* Start of the current tile row in each buffer. */
    const unsigned char *const src_row = src_ptr + source_stride * row;
    const unsigned char *const ref_row = ref_ptr + recon_stride * row;
    int col;

    for (col = 0; col < w; col += block_size) {
      unsigned int tile_sse;
      int tile_sum;

      var_fn(src_row + col, source_stride, ref_row + col, recon_stride,
             &tile_sse, &tile_sum);
      *sse += tile_sse;
      *sum += tile_sum;
    }
  }
}
135 
vp9_variance4x4_sse2(const unsigned char * src_ptr,int source_stride,const unsigned char * ref_ptr,int recon_stride,unsigned int * sse)136 unsigned int vp9_variance4x4_sse2(
137   const unsigned char *src_ptr,
138   int  source_stride,
139   const unsigned char *ref_ptr,
140   int  recon_stride,
141   unsigned int *sse) {
142   unsigned int var;
143   int avg;
144 
145   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4,
146                   &var, &avg, vp9_get4x4var_mmx, 4);
147   *sse = var;
148   return (var - (((unsigned int)avg * avg) >> 4));
149 }
150 
vp9_variance8x4_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)151 unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr,
152                                   int  source_stride,
153                                   const uint8_t *ref_ptr,
154                                   int  recon_stride,
155                                   unsigned int *sse) {
156   unsigned int var;
157   int avg;
158 
159   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4,
160                   &var, &avg, vp9_get4x4var_mmx, 4);
161   *sse = var;
162   return (var - (((unsigned int)avg * avg) >> 5));
163 }
164 
vp9_variance4x8_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)165 unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr,
166                                   int  source_stride,
167                                   const uint8_t *ref_ptr,
168                                   int  recon_stride,
169                                   unsigned int *sse) {
170   unsigned int var;
171   int avg;
172 
173   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8,
174                   &var, &avg, vp9_get4x4var_mmx, 4);
175   *sse = var;
176   return (var - (((unsigned int)avg * avg) >> 5));
177 }
178 
vp9_variance8x8_sse2(const unsigned char * src_ptr,int source_stride,const unsigned char * ref_ptr,int recon_stride,unsigned int * sse)179 unsigned int vp9_variance8x8_sse2
180 (
181   const unsigned char *src_ptr,
182   int  source_stride,
183   const unsigned char *ref_ptr,
184   int  recon_stride,
185   unsigned int *sse) {
186   unsigned int var;
187   int avg;
188 
189   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8,
190                   &var, &avg, vp9_get8x8var_sse2, 8);
191   *sse = var;
192   return (var - (((unsigned int)avg * avg) >> 6));
193 }
194 
vp9_variance16x8_sse2(const unsigned char * src_ptr,int source_stride,const unsigned char * ref_ptr,int recon_stride,unsigned int * sse)195 unsigned int vp9_variance16x8_sse2
196 (
197   const unsigned char *src_ptr,
198   int  source_stride,
199   const unsigned char *ref_ptr,
200   int  recon_stride,
201   unsigned int *sse) {
202   unsigned int var;
203   int avg;
204 
205   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8,
206                   &var, &avg, vp9_get8x8var_sse2, 8);
207   *sse = var;
208   return (var - (((unsigned int)avg * avg) >> 7));
209 }
210 
vp9_variance8x16_sse2(const unsigned char * src_ptr,int source_stride,const unsigned char * ref_ptr,int recon_stride,unsigned int * sse)211 unsigned int vp9_variance8x16_sse2
212 (
213   const unsigned char *src_ptr,
214   int  source_stride,
215   const unsigned char *ref_ptr,
216   int  recon_stride,
217   unsigned int *sse) {
218   unsigned int var;
219   int avg;
220 
221   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16,
222                 &var, &avg, vp9_get8x8var_sse2, 8);
223   *sse = var;
224   return (var - (((unsigned int)avg * avg) >> 7));
225 }
226 
vp9_variance16x16_sse2(const unsigned char * src_ptr,int source_stride,const unsigned char * ref_ptr,int recon_stride,unsigned int * sse)227 unsigned int vp9_variance16x16_sse2
228 (
229   const unsigned char *src_ptr,
230   int  source_stride,
231   const unsigned char *ref_ptr,
232   int  recon_stride,
233   unsigned int *sse) {
234   unsigned int var;
235   int avg;
236 
237   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
238                 &var, &avg, vp9_get16x16var_sse2, 16);
239   *sse = var;
240   return (var - (((unsigned int)avg * avg) >> 8));
241 }
242 
/*
 * 16x16 block: calls vp9_get16x16var_sse2 once, stores its SSE output in
 * *sse and returns the same value. The kernel's Sum output is not used.
 */
unsigned int vp9_mse16x16_sse2(const unsigned char *src_ptr,
                               int source_stride,
                               const unsigned char *ref_ptr,
                               int recon_stride,
                               unsigned int *sse) {
  unsigned int block_sse;
  int ignored_sum;  /* required by the kernel signature only */

  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &block_sse, &ignored_sum);
  *sse = block_sse;
  return block_sse;
}
256 
vp9_variance32x32_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)257 unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr,
258                                     int  source_stride,
259                                     const uint8_t *ref_ptr,
260                                     int  recon_stride,
261                                     unsigned int *sse) {
262   unsigned int var;
263   int avg;
264 
265   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
266                 &var, &avg, vp9_get16x16var_sse2, 16);
267   *sse = var;
268   return (var - (((int64_t)avg * avg) >> 10));
269 }
270 
vp9_variance32x16_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)271 unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr,
272                                     int  source_stride,
273                                     const uint8_t *ref_ptr,
274                                     int  recon_stride,
275                                     unsigned int *sse) {
276   unsigned int var;
277   int avg;
278 
279   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
280                 &var, &avg, vp9_get16x16var_sse2, 16);
281   *sse = var;
282   return (var - (((int64_t)avg * avg) >> 9));
283 }
284 
vp9_variance16x32_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)285 unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr,
286                                     int  source_stride,
287                                     const uint8_t *ref_ptr,
288                                     int  recon_stride,
289                                     unsigned int *sse) {
290   unsigned int var;
291   int avg;
292 
293   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32,
294                 &var, &avg, vp9_get16x16var_sse2, 16);
295   *sse = var;
296   return (var - (((int64_t)avg * avg) >> 9));
297 }
298 
vp9_variance64x64_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)299 unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr,
300                                     int  source_stride,
301                                     const uint8_t *ref_ptr,
302                                     int  recon_stride,
303                                     unsigned int *sse) {
304   unsigned int var;
305   int avg;
306 
307   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
308                 &var, &avg, vp9_get16x16var_sse2, 16);
309   *sse = var;
310   return (var - (((int64_t)avg * avg) >> 12));
311 }
312 
vp9_variance64x32_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)313 unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr,
314                                     int  source_stride,
315                                     const uint8_t *ref_ptr,
316                                     int  recon_stride,
317                                     unsigned int *sse) {
318   unsigned int var;
319   int avg;
320 
321   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
322                 &var, &avg, vp9_get16x16var_sse2, 16);
323   *sse = var;
324   return (var - (((int64_t)avg * avg) >> 11));
325 }
326 
vp9_variance32x64_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)327 unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
328                                     int  source_stride,
329                                     const uint8_t *ref_ptr,
330                                     int  recon_stride,
331                                     unsigned int *sse) {
332   unsigned int var;
333   int avg;
334 
335   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64,
336                 &var, &avg, vp9_get16x16var_sse2, 16);
337   *sse = var;
338   return (var - (((int64_t)avg * avg) >> 11));
339 }
340 
/*
 * Prototypes for the variable-height column kernels
 * vp9_sub_pixel_variance{4,8,16}xh_<opt>. Each processes a column strip of
 * the given width and `height` rows, writes a value through `sse` and
 * returns an int that the wrappers below treat as the strip's sum term.
 * The 4-wide kernel uses the second option suffix (sse / ssse3), the 8- and
 * 16-wide kernels the first (sse2 / ssse3).
 */
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
                                        ptrdiff_t src_stride, \
                                        int x_offset, int y_offset, \
                                        const uint8_t *dst, \
                                        ptrdiff_t dst_stride, \
                                        int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL

/*
 * FN(w, h, wf, wlog2, hlog2, opt, cast) defines
 * vp9_sub_pixel_variance<w>x<h>_<opt>():
 *
 *   w, h         block width and height
 *   wf           width of the column kernel used to cover the block
 *   wlog2/hlog2  log2 of w and h; their sum is the shift that divides the
 *                squared sum by the pixel count w * h
 *   opt          suffix of the column kernel and of the generated function
 *   cast         type the sum is converted to before it is squared
 *                ((unsigned int) for the smaller blocks, (int64_t) from
 *                16x32 / 32x16 upwards)
 *
 * The block is covered by one kernel call per wf-wide column strip, at byte
 * offsets 0, wf, 2 * wf, ... below w. Offsets are derived from wf rather
 * than hard-coded as 16/32/48, so the macro stays correct for any wf that
 * divides w; for the instantiations below the calls are the same, in the
 * same order, as before. The summed SSE is stored in *sse_ptr and the
 * function returns SSE - (sum * sum) / (w * h).
 */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
                                                     int src_stride, \
                                                     int x_offset, \
                                                     int y_offset, \
                                                     const uint8_t *dst, \
                                                     int dst_stride, \
                                                     unsigned int *sse_ptr) { \
  unsigned int sse; \
  int col; \
  int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                y_offset, dst, dst_stride, \
                                                h, &sse); \
  for (col = wf; col < w; col += wf) { \
    unsigned int strip_sse; \
    const int strip_se = \
        vp9_sub_pixel_variance##wf##xh_##opt(src + col, src_stride, \
                                             x_offset, y_offset, \
                                             dst + col, dst_stride, \
                                             h, &strip_sse); \
    se += strip_se; \
    sse += strip_sse; \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

/*
 * No ';' after each FN(...) or after FNS(...): the expansion is a sequence
 * of function definitions, and a ';' following a function definition is a
 * stray empty declaration at file scope.
 */
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)) \
FN(16,  8, 16, 4, 3, opt1, (unsigned int)) \
FN(8,  16,  8, 3, 4, opt1, (unsigned int)) \
FN(8,   8,  8, 3, 3, opt1, (unsigned int)) \
FN(8,   4,  8, 3, 2, opt1, (unsigned int)) \
FN(4,   8,  4, 2, 3, opt2, (unsigned int)) \
FN(4,   4,  4, 2, 2, opt2, (unsigned int))

FNS(sse2, sse)
FNS(ssse3, ssse3)

#undef FNS
#undef FN
417 
/*
 * Prototypes for the variable-height column kernels
 * vp9_sub_pixel_avg_variance{4,8,16}xh_<opt>. Compared with the non-avg
 * kernels they take an additional buffer `sec` with its own stride.
 * Each writes a value through `sse` and returns an int that the wrappers
 * below treat as the strip's sum term. The 4-wide kernel uses the second
 * option suffix (sse / ssse3), the 8- and 16-wide kernels the first
 * (sse2 / ssse3).
 */
#define DECL(w, opt) \
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
                                            ptrdiff_t src_stride, \
                                            int x_offset, int y_offset, \
                                            const uint8_t *dst, \
                                            ptrdiff_t dst_stride, \
                                            const uint8_t *sec, \
                                            ptrdiff_t sec_stride, \
                                            int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS

/*
 * FN(w, h, wf, wlog2, hlog2, opt, cast) defines
 * vp9_sub_pixel_avg_variance<w>x<h>_<opt>():
 *
 *   w, h         block width and height; w is also passed to the kernel as
 *                the stride of `sec`
 *   wf           width of the column kernel used to cover the block
 *   wlog2/hlog2  log2 of w and h; their sum is the shift that divides the
 *                squared sum by the pixel count w * h
 *   opt          suffix of the column kernel and of the generated function
 *   cast         type the sum is converted to before it is squared
 *
 * The block is covered by one kernel call per wf-wide column strip, with
 * src, dst and sec all advanced by 0, wf, 2 * wf, ... below w. Offsets are
 * derived from wf rather than hard-coded as 16/32/48, so the macro stays
 * correct for any wf that divides w; for the instantiations below the calls
 * are the same, in the same order, as before. The summed SSE is stored in
 * *sseptr and the function returns SSE - (sum * sum) / (w * h).
 */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
                                                         int src_stride, \
                                                         int x_offset, \
                                                         int y_offset, \
                                                         const uint8_t *dst, \
                                                         int dst_stride, \
                                                         unsigned int *sseptr, \
                                                         const uint8_t *sec) { \
  unsigned int sse; \
  int col; \
  int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                    y_offset, dst, dst_stride, \
                                                    sec, w, h, &sse); \
  for (col = wf; col < w; col += wf) { \
    unsigned int strip_sse; \
    const int strip_se = \
        vp9_sub_pixel_avg_variance##wf##xh_##opt(src + col, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + col, dst_stride, \
                                                 sec + col, w, h, \
                                                 &strip_sse); \
    se += strip_se; \
    sse += strip_sse; \
  } \
  *sseptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

/*
 * No ';' after each FN(...) or after FNS(...): the expansion is a sequence
 * of function definitions, and a ';' following a function definition is a
 * stray empty declaration at file scope.
 */
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)) \
FN(16,  8, 16, 4, 3, opt1, (unsigned int)) \
FN(8,  16,  8, 3, 4, opt1, (unsigned int)) \
FN(8,   8,  8, 3, 3, opt1, (unsigned int)) \
FN(8,   4,  8, 3, 2, opt1, (unsigned int)) \
FN(4,   8,  4, 2, 3, opt2, (unsigned int)) \
FN(4,   4,  4, 2, 2, opt2, (unsigned int))

FNS(sse2, sse)
FNS(ssse3, ssse3)

#undef FNS
#undef FN
497 
/*
 * 16x16 wrapper around vp9_half_horiz_variance16x_h_sse2, called with a
 * height of 16. Stores the kernel's `sumsquared` output in *sse and returns
 * it minus (sum * sum) / 256, where 256 is the 16x16 pixel count.
 */
unsigned int vp9_variance_halfpixvar16x16_h_sse2(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse) {
  int sum;
  unsigned int sum_squared;
  unsigned int correction;

  vp9_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                    dst_ptr, dst_pixels_per_line,
                                    16, &sum, &sum_squared);

  /* The cast makes the multiply unsigned; >> 8 divides by 16 * 16. */
  correction = ((unsigned int)sum * sum) >> 8;
  *sse = sum_squared;
  return sum_squared - correction;
}
515 
516 
/*
 * 16x16 wrapper around vp9_half_vert_variance16x_h_sse2, called with a
 * height of 16. Stores the kernel's `sumsquared` output in *sse and returns
 * it minus (sum * sum) / 256, where 256 is the 16x16 pixel count.
 */
unsigned int vp9_variance_halfpixvar16x16_v_sse2(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse) {
  int sum;
  unsigned int sum_squared;
  unsigned int correction;

  vp9_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                   dst_ptr, dst_pixels_per_line,
                                   16, &sum, &sum_squared);

  /* The cast makes the multiply unsigned; >> 8 divides by 16 * 16. */
  correction = ((unsigned int)sum * sum) >> 8;
  *sse = sum_squared;
  return sum_squared - correction;
}
533 
534 
/*
 * 16x16 wrapper around vp9_half_horiz_vert_variance16x_h_sse2, called with
 * a height of 16. Stores the kernel's `sumsquared` output in *sse and
 * returns it minus (sum * sum) / 256, where 256 is the 16x16 pixel count.
 */
unsigned int vp9_variance_halfpixvar16x16_hv_sse2(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse) {
  int sum;
  unsigned int sum_squared;
  unsigned int correction;

  vp9_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &sum, &sum_squared);

  /* The cast makes the multiply unsigned; >> 8 divides by 16 * 16. */
  correction = ((unsigned int)sum * sum) >> 8;
  *sse = sum_squared;
  return sum_squared - correction;
}
552