1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "vpx_config.h"
12 #include "vp8/common/variance.h"
13 #include "vp8/common/pragmas.h"
14 #include "vpx_ports/mem.h"
15 #include "vp8/common/x86/filter_x86.h"
16 
/*
 * filter_block1d_* routines defined outside this file.  Nothing in the
 * visible part of this file calls them; the declarations are retained as-is.
 */
extern void filter_block1d_h6_mmx(
    const unsigned char *src_ptr,
    unsigned short *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *filter);
extern void filter_block1d_v6_mmx(
    const short *src_ptr,
    unsigned char *output_ptr,
    unsigned int pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *filter);
extern void filter_block1d8_h6_sse2(
    const unsigned char *src_ptr,
    unsigned short *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *filter);
extern void filter_block1d8_v6_sse2(
    const short *src_ptr,
    unsigned char *output_ptr,
    unsigned int pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *filter);
21 
/*
 * Low-level kernels used by the wrappers in this file.  Their bodies are not
 * part of this file; each one reports its results through the trailing
 * out-parameters (sum/sumsquared or SSE/Sum).
 *
 * NOTE(review): the wrappers below hand their own src_ptr to the parameter
 * named ref_ptr here, and their dst_ptr to the parameter named src_ptr.  The
 * parameter names in these prototypes therefore do not line up with the
 * callers' names -- confirm against the kernel implementations before
 * relying on either naming.
 */

/* 4x4 kernel taking explicit horizontal and vertical filter tables. */
extern void vp8_filter_block2d_bil4x4_var_mmx(
    const unsigned char *ref_ptr, int ref_pixels_per_line,
    const unsigned char *src_ptr, int src_pixels_per_line,
    const short *HFilter, const short *VFilter,
    int *sum, unsigned int *sumsquared);

/* 4x4 kernel without sub-pixel offsets. */
extern unsigned int vp8_get4x4var_mmx(
    const unsigned char *src_ptr, int  source_stride,
    const unsigned char *ref_ptr, int  recon_stride,
    unsigned int *SSE, int *Sum);

/* Not called from the visible part of this file. */
unsigned int vp8_get_mb_ss_sse2(const short *src_ptr);

/* 16x16 and 8x8 kernels without sub-pixel offsets. */
unsigned int vp8_get16x16var_sse2(
    const unsigned char *src_ptr, int source_stride,
    const unsigned char *ref_ptr, int recon_stride,
    unsigned int *SSE, int *Sum);
unsigned int vp8_get8x8var_sse2(
    const unsigned char *src_ptr, int source_stride,
    const unsigned char *ref_ptr, int recon_stride,
    unsigned int *SSE, int *Sum);

/*
 * General sub-pixel kernel taking xoffset/yoffset directly.  The 16-wide
 * wrappers in this file call it twice, 8 columns apart.
 */
void vp8_filter_block2d_bil_var_sse2(
    const unsigned char *ref_ptr, int ref_pixels_per_line,
    const unsigned char *src_ptr, int src_pixels_per_line,
    unsigned int Height,
    int  xoffset, int  yoffset,
    int *sum, unsigned int *sumsquared);

/* Kernels the wrappers select when both xoffset and yoffset equal 4. */
void vp8_half_horiz_vert_variance8x_h_sse2(
    const unsigned char *ref_ptr, int ref_pixels_per_line,
    const unsigned char *src_ptr, int src_pixels_per_line,
    unsigned int Height,
    int *sum, unsigned int *sumsquared);
void vp8_half_horiz_vert_variance16x_h_sse2(
    const unsigned char *ref_ptr, int ref_pixels_per_line,
    const unsigned char *src_ptr, int src_pixels_per_line,
    unsigned int Height,
    int *sum, unsigned int *sumsquared);

/* Kernels the wrappers select when xoffset is 4 and yoffset is 0. */
void vp8_half_horiz_variance8x_h_sse2(
    const unsigned char *ref_ptr, int ref_pixels_per_line,
    const unsigned char *src_ptr, int src_pixels_per_line,
    unsigned int Height,
    int *sum, unsigned int *sumsquared);
void vp8_half_horiz_variance16x_h_sse2(
    const unsigned char *ref_ptr, int ref_pixels_per_line,
    const unsigned char *src_ptr, int src_pixels_per_line,
    unsigned int Height,
    int *sum, unsigned int *sumsquared);

/* Kernels the wrappers select when xoffset is 0 and yoffset is 4. */
void vp8_half_vert_variance8x_h_sse2(
    const unsigned char *ref_ptr, int ref_pixels_per_line,
    const unsigned char *src_ptr, int src_pixels_per_line,
    unsigned int Height,
    int *sum, unsigned int *sumsquared);
void vp8_half_vert_variance16x_h_sse2(
    const unsigned char *ref_ptr, int ref_pixels_per_line,
    const unsigned char *src_ptr, int src_pixels_per_line,
    unsigned int Height,
    int *sum, unsigned int *sumsquared);
138 
/*
 * 4x4 variance wrapper.
 *
 * Runs vp8_get4x4var_mmx() on the two blocks, copies the value it reports
 * through its SSE out-parameter into *sse, and returns that value minus
 * (Sum * Sum) >> 4, where Sum is the helper's second out-parameter.
 */
unsigned int vp8_variance4x4_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_total;
    int diff_total;

    vp8_get4x4var_mmx(src_ptr, source_stride,
                      ref_ptr, recon_stride,
                      &sq_total, &diff_total);

    *sse = sq_total;

    /* The cast makes the squaring happen in unsigned arithmetic. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 4);
}
154 
/*
 * 8x8 variance wrapper.
 *
 * Runs vp8_get8x8var_sse2() on the two blocks, copies the value it reports
 * through its SSE out-parameter into *sse, and returns that value minus
 * (Sum * Sum) >> 6, where Sum is the helper's second out-parameter.
 */
unsigned int vp8_variance8x8_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_total;
    int diff_total;

    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sq_total, &diff_total);

    *sse = sq_total;

    /* The cast makes the squaring happen in unsigned arithmetic. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 6);
}
171 
172 
/*
 * 16x16 variance wrapper.
 *
 * Runs vp8_get16x16var_sse2() on the two blocks, copies the value it reports
 * through its SSE out-parameter into *sse, and returns that value minus
 * (Sum * Sum) >> 8, where Sum is the helper's second out-parameter.
 */
unsigned int vp8_variance16x16_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_total;
    int diff_total;

    vp8_get16x16var_sse2(src_ptr, source_stride,
                         ref_ptr, recon_stride,
                         &sq_total, &diff_total);

    *sse = sq_total;

    /* The cast makes the squaring happen in unsigned arithmetic. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 8);
}
/*
 * 16x16 wrapper that returns only the SSE value.
 *
 * Runs vp8_get16x16var_sse2() and hands back what it reports through its SSE
 * out-parameter, both in *sse and as the return value.  The Sum output has
 * to be received because the helper writes it, but it is not used here.
 */
unsigned int vp8_mse16x16_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_total;
    int unused_diff_total;

    vp8_get16x16var_sse2(src_ptr, source_stride,
                         ref_ptr, recon_stride,
                         &sq_total, &unused_diff_total);

    *sse = sq_total;
    return sq_total;
}
204 
205 
/*
 * 16x8 variance wrapper built from two vp8_get8x8var_sse2() calls.
 *
 * The first call starts at src_ptr/ref_ptr, the second 8 bytes further along
 * the same rows.  The two SSE outputs are added and stored in *sse; the two
 * Sum outputs are added, squared, shifted right by 7 and subtracted from the
 * combined SSE to form the return value.
 */
unsigned int vp8_variance16x8_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_left, sq_right, sq_total;
    int diff_left, diff_right, diff_total;

    /* Left 8 columns. */
    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sq_left, &diff_left);

    /* Right 8 columns. */
    vp8_get8x8var_sse2(src_ptr + 8, source_stride,
                       ref_ptr + 8, recon_stride,
                       &sq_right, &diff_right);

    sq_total = sq_left + sq_right;
    diff_total = diff_left + diff_right;

    *sse = sq_total;

    /* The cast makes the squaring happen in unsigned arithmetic. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 7);
}
226 
/*
 * 8x16 variance wrapper built from two vp8_get8x8var_sse2() calls.
 *
 * The first call starts at src_ptr/ref_ptr, the second 8 rows further down
 * (8 * stride for each buffer).  The two SSE outputs are added and stored in
 * *sse; the two Sum outputs are added, squared, shifted right by 7 and
 * subtracted from the combined SSE to form the return value.
 */
unsigned int vp8_variance8x16_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_top, sq_bottom, sq_total;
    int diff_top, diff_bottom, diff_total;

    /* Upper 8 rows. */
    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sq_top, &diff_top);

    /* Lower 8 rows. */
    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride,
                       ref_ptr + 8 * recon_stride, recon_stride,
                       &sq_bottom, &diff_bottom);

    sq_total = sq_top + sq_bottom;
    diff_total = diff_top + diff_bottom;

    *sse = sq_total;

    /* The cast makes the squaring happen in unsigned arithmetic. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 7);
}
247 
vp8_sub_pixel_variance4x4_wmt(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)248 unsigned int vp8_sub_pixel_variance4x4_wmt
249 (
250     const unsigned char  *src_ptr,
251     int  src_pixels_per_line,
252     int  xoffset,
253     int  yoffset,
254     const unsigned char *dst_ptr,
255     int dst_pixels_per_line,
256     unsigned int *sse
257 )
258 {
259     int xsum;
260     unsigned int xxsum;
261     vp8_filter_block2d_bil4x4_var_mmx(
262         src_ptr, src_pixels_per_line,
263         dst_ptr, dst_pixels_per_line,
264         vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
265         &xsum, &xxsum
266     );
267     *sse = xxsum;
268     return (xxsum - (((unsigned int)xsum * xsum) >> 4));
269 }
270 
271 
/*
 * 8x8 sub-pixel variance wrapper.
 *
 * Picks a kernel from the (xoffset, yoffset) pair:
 *   (4, 4) -> vp8_half_horiz_vert_variance8x_h_sse2
 *   (4, 0) -> vp8_half_horiz_variance8x_h_sse2
 *   (0, 4) -> vp8_half_vert_variance8x_h_sse2
 *   other  -> vp8_filter_block2d_bil_var_sse2 with the offsets passed on
 * Every kernel is run with a height of 8.  The kernel's sumsquared output is
 * stored in *sse and the function returns it minus (sum * sum) >> 6.
 */
unsigned int vp8_sub_pixel_variance8x8_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int diff_total;
    unsigned int sq_total;

    /* The three special cases are mutually exclusive, so order is free. */
    if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              8, &diff_total, &sq_total);
    }
    else if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &diff_total, &sq_total);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, &diff_total, &sq_total);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_total, &sq_total);
    }

    *sse = sq_total;

    /* The cast makes the squaring happen in unsigned arithmetic. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 6);
}
319 
/*
 * 16x16 sub-pixel variance wrapper.
 *
 * Picks a kernel from the (xoffset, yoffset) pair:
 *   (4, 4) -> vp8_half_horiz_vert_variance16x_h_sse2
 *   (4, 0) -> vp8_half_horiz_variance16x_h_sse2
 *   (0, 4) -> vp8_half_vert_variance16x_h_sse2
 *   other  -> vp8_filter_block2d_bil_var_sse2, called twice (at the given
 *             pointers and again 8 bytes further along) with the two results
 *             added together
 * Every kernel is run with a height of 16.  The resulting sumsquared value
 * is stored in *sse and the function returns it minus (sum * sum) >> 8.
 *
 * As the original author noted, this dispatch could be skipped if callers
 * invoked the appropriate kernel themselves.
 */
unsigned int vp8_sub_pixel_variance16x16_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int diff_total;
    unsigned int sq_total;

    /* The three special cases are mutually exclusive, so order is free. */
    if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               16, &diff_total, &sq_total);
    }
    else if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line,
                                          16, &diff_total, &sq_total);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &diff_total, &sq_total);
    }
    else
    {
        /* Results for the second call, 8 bytes to the right. */
        int diff_right;
        unsigned int sq_right;

        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &diff_total, &sq_total);

        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &diff_right, &sq_right);

        diff_total += diff_right;
        sq_total += sq_right;
    }

    *sse = sq_total;

    /* The cast makes the squaring happen in unsigned arithmetic. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 8);
}
381 
/*
 * 16x16 sub-pixel wrapper that returns only the SSE value.
 *
 * Delegates to vp8_sub_pixel_variance16x16_wmt() purely for the value it
 * writes to *sse; the variance it returns is discarded and *sse is returned
 * instead.
 */
unsigned int vp8_sub_pixel_mse16x16_wmt(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    (void)vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);

    return *sse;
}
395 
/*
 * 16x8 sub-pixel variance wrapper.
 *
 * Picks a kernel from the (xoffset, yoffset) pair:
 *   (4, 4) -> vp8_half_horiz_vert_variance16x_h_sse2
 *   (4, 0) -> vp8_half_horiz_variance16x_h_sse2
 *   (0, 4) -> vp8_half_vert_variance16x_h_sse2
 *   other  -> vp8_filter_block2d_bil_var_sse2, called twice (at the given
 *             pointers and again 8 bytes further along) with the two results
 *             added together
 * Every kernel is run with a height of 8.  The resulting sumsquared value is
 * stored in *sse and the function returns it minus (sum * sum) >> 7.
 */
unsigned int vp8_sub_pixel_variance16x8_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse

)
{
    int diff_total;
    unsigned int sq_total;

    /* The three special cases are mutually exclusive, so order is free. */
    if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               8, &diff_total, &sq_total);
    }
    else if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line,
                                          8, &diff_total, &sq_total);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &diff_total, &sq_total);
    }
    else
    {
        /* Results for the second call, 8 bytes to the right. */
        int diff_right;
        unsigned int sq_right;

        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_total, &sq_total);

        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_right, &sq_right);

        diff_total += diff_right;
        sq_total += sq_right;
    }

    *sse = sq_total;

    /* The cast makes the squaring happen in unsigned arithmetic. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 7);
}
452 
/*
 * 8x16 sub-pixel variance wrapper.
 *
 * Picks a kernel from the (xoffset, yoffset) pair:
 *   (4, 4) -> vp8_half_horiz_vert_variance8x_h_sse2
 *   (4, 0) -> vp8_half_horiz_variance8x_h_sse2
 *   (0, 4) -> vp8_half_vert_variance8x_h_sse2
 *   other  -> vp8_filter_block2d_bil_var_sse2 with the offsets passed on
 * Every kernel is run with a height of 16.  The kernel's sumsquared output
 * is stored in *sse and the function returns it minus (sum * sum) >> 7.
 */
unsigned int vp8_sub_pixel_variance8x16_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int diff_total;
    unsigned int sq_total;

    /* The three special cases are mutually exclusive, so order is free. */
    if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              16, &diff_total, &sq_total);
    }
    else if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &diff_total, &sq_total);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, &diff_total, &sq_total);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &diff_total, &sq_total);
    }

    *sse = sq_total;

    /* The cast makes the squaring happen in unsigned arithmetic. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 7);
}
500 
501 
/*
 * 16x16 wrapper that always uses vp8_half_horiz_variance16x_h_sse2().
 *
 * Runs the kernel with a height of 16, stores its sumsquared output in *sse
 * and returns it minus (sum * sum) >> 8.
 */
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int diff_total;
    unsigned int sq_total;

    vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                      dst_ptr, dst_pixels_per_line,
                                      16, &diff_total, &sq_total);

    *sse = sq_total;

    /* The cast makes the squaring happen in unsigned arithmetic. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 8);
}
520 
521 
/*
 * 16x16 wrapper that always uses vp8_half_vert_variance16x_h_sse2().
 *
 * Runs the kernel with a height of 16, stores its sumsquared output in *sse
 * and returns it minus (sum * sum) >> 8.
 */
unsigned int vp8_variance_halfpixvar16x16_v_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int diff_total;
    unsigned int sq_total;

    vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                     dst_ptr, dst_pixels_per_line,
                                     16, &diff_total, &sq_total);

    *sse = sq_total;

    /* The cast makes the squaring happen in unsigned arithmetic. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 8);
}
539 
540 
/*
 * 16x16 wrapper that always uses vp8_half_horiz_vert_variance16x_h_sse2().
 *
 * Runs the kernel with a height of 16, stores its sumsquared output in *sse
 * and returns it minus (sum * sum) >> 8.
 */
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int diff_total;
    unsigned int sq_total;

    vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                           dst_ptr, dst_pixels_per_line,
                                           16, &diff_total, &sq_total);

    *sse = sq_total;

    /* The cast makes the squaring happen in unsigned arithmetic. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 8);
}
559