1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "vpx_config.h"
12 #include "vp8/common/variance.h"
13 #include "vpx_ports/mem.h"
14 #include "vp8/common/x86/filter_x86.h"
15 
/*
 * Externally defined block-filter routines (MMX and SSE2 variants).
 * Nothing in the visible part of this file calls them.
 *
 * From the parameter types: the "h6" routines read 8-bit pixels
 * (const unsigned char *) and write 16-bit values (unsigned short *),
 * while the "v6" routines read 16-bit values (const short *) and write
 * 8-bit pixels (unsigned char *).
 * NOTE(review): presumably the horizontal and vertical passes of a
 * two-pass filter sharing an intermediate buffer -- confirm against the
 * implementation.
 */
16 extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
17 extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
18 extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
19 extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
20 
/*
 * Externally defined helper used by vp8_sub_pixel_variance4x4_wmt().
 *
 * That caller passes one row of vp8_bilinear_filters_x86_4 for HFilter
 * (indexed by xoffset) and one for VFilter (indexed by yoffset), then
 * reads the results back through sum and sumsquared.
 *
 * Note the naming: the caller hands its own src_ptr in as ref_ptr here
 * and its dst_ptr in as src_ptr.
 * NOTE(review): presumably ref_ptr is the buffer that gets filtered
 * before the comparison -- confirm against the implementation.
 */
21 extern void vp8_filter_block2d_bil4x4_var_mmx
22 (
23     const unsigned char *ref_ptr,
24     int ref_pixels_per_line,
25     const unsigned char *src_ptr,
26     int src_pixels_per_line,
27     const short *HFilter,
28     const short *VFilter,
29     int *sum,
30     unsigned int *sumsquared
31 );
32 
/*
 * Externally defined helper used by vp8_variance4x4_wmt().
 *
 * The caller reads two results back through the SSE and Sum output
 * pointers and combines them as SSE - ((Sum * Sum) >> 4).
 * The unsigned int return value is ignored by the caller in this file.
 * NOTE(review): presumably SSE is the sum of squared pixel differences
 * and Sum the sum of differences over a 4x4 block -- confirm against the
 * implementation.
 */
33 extern unsigned int vp8_get4x4var_mmx
34 (
35     const unsigned char *src_ptr,
36     int  source_stride,
37     const unsigned char *ref_ptr,
38     int  recon_stride,
39     unsigned int *SSE,
40     int *Sum
41 );
42 
/*
 * Externally defined routine taking a pointer to 16-bit values and
 * returning an unsigned int.  Not called in the visible part of this file.
 * NOTE(review): the name suggests a sum of squares over one macroblock
 * of coefficients -- confirm against the implementation.
 */
43 unsigned int vp8_get_mb_ss_sse2
44 (
45     const short *src_ptr
46 );
/*
 * Externally defined helper used by vp8_variance16x16_wmt() and
 * vp8_mse16x16_wmt().
 *
 * Both callers read results back through SSE and Sum; the variance
 * caller combines them as SSE - ((Sum * Sum) >> 8), the MSE caller uses
 * SSE only.  The unsigned int return value is ignored by both.
 * NOTE(review): presumably the accumulators cover a 16x16 block --
 * confirm against the implementation.
 */
47 unsigned int vp8_get16x16var_sse2
48 (
49     const unsigned char *src_ptr,
50     int source_stride,
51     const unsigned char *ref_ptr,
52     int recon_stride,
53     unsigned int *SSE,
54     int *Sum
55 );
/*
 * Externally defined helper used by vp8_variance8x8_wmt(), and called
 * twice each by vp8_variance16x8_wmt() (second call 8 columns to the
 * right) and vp8_variance8x16_wmt() (second call 8 rows down).
 *
 * Callers read results back through SSE and Sum and ignore the
 * unsigned int return value.
 * NOTE(review): presumably the accumulators cover one 8x8 block --
 * confirm against the implementation.
 */
56 unsigned int vp8_get8x8var_sse2
57 (
58     const unsigned char *src_ptr,
59     int source_stride,
60     const unsigned char *ref_ptr,
61     int recon_stride,
62     unsigned int *SSE,
63     int *Sum
64 );
/*
 * Externally defined helper used as the general (non half-pel) path of
 * the 8x8, 8x16, 16x8 and 16x16 sub-pixel variance functions in this file.
 *
 * Callers pass Height as 8 or 16 together with the raw xoffset/yoffset,
 * and read the results back through sum and sumsquared.  The 16-wide
 * callers invoke it twice, the second time with both pointers advanced
 * by 8, and add the two results -- so each call is treated as covering
 * an 8-pixel-wide strip.
 *
 * As with the 4x4 helper, callers pass their src_ptr as ref_ptr and
 * their dst_ptr as src_ptr.
 * NOTE(review): filtering details live in the implementation, which is
 * not visible here.
 */
65 void vp8_filter_block2d_bil_var_sse2
66 (
67     const unsigned char *ref_ptr,
68     int ref_pixels_per_line,
69     const unsigned char *src_ptr,
70     int src_pixels_per_line,
71     unsigned int Height,
72     int  xoffset,
73     int  yoffset,
74     int *sum,
75     unsigned int *sumsquared
76 );
/*
 * Six externally defined helpers sharing one signature.
 *
 * How this file uses them:
 *   - "half_horiz"      is selected when xoffset == 4 and yoffset == 0,
 *   - "half_vert"       is selected when xoffset == 0 and yoffset == 4,
 *   - "half_horiz_vert" is selected when xoffset == 4 and yoffset == 4.
 * The "8x" variants are called by the 8-pixel-wide sub-pixel functions
 * and the "16x" variants by the 16-pixel-wide ones, each exactly once
 * per block, with Height set to 8 or 16.  Results are read back through
 * sum and sumsquared.
 *
 * Callers pass their src_ptr as ref_ptr and their dst_ptr as src_ptr.
 * NOTE(review): presumably these are fast paths for half-pixel
 * interpolation -- implementations are not visible here, confirm there.
 */
77 void vp8_half_horiz_vert_variance8x_h_sse2
78 (
79     const unsigned char *ref_ptr,
80     int ref_pixels_per_line,
81     const unsigned char *src_ptr,
82     int src_pixels_per_line,
83     unsigned int Height,
84     int *sum,
85     unsigned int *sumsquared
86 );
87 void vp8_half_horiz_vert_variance16x_h_sse2
88 (
89     const unsigned char *ref_ptr,
90     int ref_pixels_per_line,
91     const unsigned char *src_ptr,
92     int src_pixels_per_line,
93     unsigned int Height,
94     int *sum,
95     unsigned int *sumsquared
96 );
97 void vp8_half_horiz_variance8x_h_sse2
98 (
99     const unsigned char *ref_ptr,
100     int ref_pixels_per_line,
101     const unsigned char *src_ptr,
102     int src_pixels_per_line,
103     unsigned int Height,
104     int *sum,
105     unsigned int *sumsquared
106 );
107 void vp8_half_horiz_variance16x_h_sse2
108 (
109     const unsigned char *ref_ptr,
110     int ref_pixels_per_line,
111     const unsigned char *src_ptr,
112     int src_pixels_per_line,
113     unsigned int Height,
114     int *sum,
115     unsigned int *sumsquared
116 );
117 void vp8_half_vert_variance8x_h_sse2
118 (
119     const unsigned char *ref_ptr,
120     int ref_pixels_per_line,
121     const unsigned char *src_ptr,
122     int src_pixels_per_line,
123     unsigned int Height,
124     int *sum,
125     unsigned int *sumsquared
126 );
127 void vp8_half_vert_variance16x_h_sse2
128 (
129     const unsigned char *ref_ptr,
130     int ref_pixels_per_line,
131     const unsigned char *src_ptr,
132     int src_pixels_per_line,
133     unsigned int Height,
134     int *sum,
135     unsigned int *sumsquared
136 );
137 
/*
 * Variance of a 4x4 block of src_ptr against ref_ptr.
 *
 * vp8_get4x4var_mmx() supplies the two accumulators (its SSE and Sum
 * outputs).  The SSE value is stored through sse, and the function
 * returns SSE - ((Sum * Sum) >> 4); the shift by 4 divides by the
 * 16 pixels of a 4x4 block.
 */
unsigned int vp8_variance4x4_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sq_err;
    unsigned int sum_sq_scaled;
    int diff_sum;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &sq_err, &diff_sum);
    *sse = sq_err;

    /* Square in unsigned arithmetic rather than as a signed int product. */
    sum_sq_scaled = ((unsigned int)diff_sum * diff_sum) >> 4;
    return sq_err - sum_sq_scaled;
}
153 
/*
 * Variance of an 8x8 block of src_ptr against ref_ptr.
 *
 * vp8_get8x8var_sse2() supplies the SSE and Sum accumulators.  The SSE
 * value is stored through sse, and the function returns
 * SSE - ((Sum * Sum) >> 6); the shift by 6 divides by the 64 pixels of
 * an 8x8 block.
 */
unsigned int vp8_variance8x8_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sq_err;
    unsigned int sum_sq_scaled;
    int diff_sum;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sq_err, &diff_sum);
    *sse = sq_err;

    /* Square in unsigned arithmetic rather than as a signed int product. */
    sum_sq_scaled = ((unsigned int)diff_sum * diff_sum) >> 6;
    return sq_err - sum_sq_scaled;
}
170 
171 
/*
 * Variance of a 16x16 block of src_ptr against ref_ptr.
 *
 * vp8_get16x16var_sse2() supplies the SSE and Sum accumulators.  The
 * SSE value is stored through sse, and the function returns
 * SSE - ((Sum * Sum) >> 8); the shift by 8 divides by the 256 pixels of
 * a 16x16 block.
 */
unsigned int vp8_variance16x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sq_err;
    unsigned int sum_sq_scaled;
    int diff_sum;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                         &sq_err, &diff_sum);
    *sse = sq_err;

    /* Square in unsigned arithmetic rather than as a signed int product. */
    sum_sq_scaled = ((unsigned int)diff_sum * diff_sum) >> 8;
    return sq_err - sum_sq_scaled;
}
/*
 * Squared-error measure for a 16x16 block of src_ptr against ref_ptr.
 *
 * Uses the same helper as vp8_variance16x16_wmt() but applies no
 * mean correction: the helper's SSE output is both stored through sse
 * and returned unchanged.
 */
unsigned int vp8_mse16x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sq_err;
    int ignored_sum;    /* the helper requires a Sum output; unused here */

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                         &sq_err, &ignored_sum);

    *sse = sq_err;
    return sq_err;
}
203 
204 
/*
 * Variance of a 16-wide, 8-high block of src_ptr against ref_ptr.
 *
 * The block is processed as two side-by-side 8x8 halves (columns 0-7,
 * then columns 8-15) via vp8_get8x8var_sse2(); the per-half SSE and Sum
 * values are added.  The combined SSE is stored through sse, and the
 * function returns SSE - ((Sum * Sum) >> 7); the shift by 7 divides by
 * the 128 pixels of the block.
 */
unsigned int vp8_variance16x8_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse_left, sse_right, sse_total;
    int sum_left, sum_right, sum_total;

    /* Left 8 columns. */
    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sse_left, &sum_left);

    /* Right 8 columns. */
    vp8_get8x8var_sse2(src_ptr + 8, source_stride,
                       ref_ptr + 8, recon_stride,
                       &sse_right, &sum_right);

    sse_total = sse_left + sse_right;
    sum_total = sum_left + sum_right;

    *sse = sse_total;
    return sse_total - (((unsigned int)sum_total * sum_total) >> 7);
}
225 
/*
 * Variance of an 8-wide, 16-high block of src_ptr against ref_ptr.
 *
 * The block is processed as two stacked 8x8 halves (rows 0-7, then
 * rows 8-15) via vp8_get8x8var_sse2(); the per-half SSE and Sum values
 * are added.  The combined SSE is stored through sse, and the function
 * returns SSE - ((Sum * Sum) >> 7); the shift by 7 divides by the
 * 128 pixels of the block.
 */
unsigned int vp8_variance8x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse_top, sse_bottom, sse_total;
    int sum_top, sum_bottom, sum_total;

    /* Upper 8 rows. */
    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sse_top, &sum_top);

    /* Lower 8 rows: step each pointer down by eight of its own strides. */
    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride,
                       ref_ptr + 8 * recon_stride, recon_stride,
                       &sse_bottom, &sum_bottom);

    sse_total = sse_top + sse_bottom;
    sum_total = sum_top + sum_bottom;

    *sse = sse_total;
    return sse_total - (((unsigned int)sum_total * sum_total) >> 7);
}
246 
vp8_sub_pixel_variance4x4_wmt(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)247 unsigned int vp8_sub_pixel_variance4x4_wmt
248 (
249     const unsigned char  *src_ptr,
250     int  src_pixels_per_line,
251     int  xoffset,
252     int  yoffset,
253     const unsigned char *dst_ptr,
254     int dst_pixels_per_line,
255     unsigned int *sse
256 )
257 {
258     int xsum;
259     unsigned int xxsum;
260     vp8_filter_block2d_bil4x4_var_mmx(
261         src_ptr, src_pixels_per_line,
262         dst_ptr, dst_pixels_per_line,
263         vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
264         &xsum, &xxsum
265     );
266     *sse = xxsum;
267     return (xxsum - (((unsigned int)xsum * xsum) >> 4));
268 }
269 
270 
/*
 * Sub-pixel variance for an 8x8 block.
 *
 * Offsets of exactly 4 in one or both directions (with the other
 * direction at 0 or 4) are routed to the dedicated 8-wide half-pel
 * helpers; every other offset pair goes through
 * vp8_filter_block2d_bil_var_sse2().  All helpers are called with a
 * height of 8.
 *
 * The helper's sum-of-squares output is stored through sse, and the
 * function returns sumsquared - ((sum * sum) >> 6).
 */
unsigned int vp8_sub_pixel_variance8x8_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int diff_sum;
    unsigned int sq_err;
    const int half_x = (xoffset == 4);
    const int half_y = (yoffset == 4);

    if (half_x && half_y)
    {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              8, &diff_sum, &sq_err);
    }
    else if (half_x && yoffset == 0)
    {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &diff_sum, &sq_err);
    }
    else if (half_y && xoffset == 0)
    {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, &diff_sum, &sq_err);
    }
    else
    {
        /* General case: pass the raw offsets to the bilinear helper. */
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_sum, &sq_err);
    }

    *sse = sq_err;
    return sq_err - (((unsigned int)diff_sum * diff_sum) >> 6);
}
318 
/*
 * Sub-pixel variance for a 16x16 block.
 *
 * The three offset pairs (4,0), (0,4) and (4,4) are handled by the
 * 16-wide half-pel helpers in a single call.  Any other pair uses
 * vp8_filter_block2d_bil_var_sse2() twice -- once at the block origin
 * and once 8 columns to the right -- and adds the two results.  All
 * helpers are called with a height of 16.
 *
 * The combined sum-of-squares is stored through sse, and the function
 * returns sumsquared - ((sum * sum) >> 8).
 *
 * The dispatch below could be dropped if callers invoked the matching
 * helper themselves.
 */
unsigned int vp8_sub_pixel_variance16x16_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int diff_sum;
    unsigned int sq_err;

    if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &diff_sum, &sq_err);
    }
    else if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line,
                                          16, &diff_sum, &sq_err);
    }
    else if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               16, &diff_sum, &sq_err);
    }
    else
    {
        int right_sum;
        unsigned int right_sq_err;

        /* Left strip (columns 0-7). */
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &diff_sum, &sq_err);

        /* Right strip (columns 8-15). */
        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &right_sum, &right_sq_err);

        diff_sum += right_sum;
        sq_err += right_sq_err;
    }

    *sse = sq_err;
    return sq_err - (((unsigned int)diff_sum * diff_sum) >> 8);
}
380 
/*
 * Sub-pixel squared-error measure for a 16x16 block.
 *
 * Delegates to vp8_sub_pixel_variance16x16_wmt(), discards the variance
 * it returns, and returns the value that call stored through sse.
 */
unsigned int vp8_sub_pixel_mse16x16_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    /* Only the side effect on *sse is wanted from this call. */
    (void)vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);
    return *sse;
}
394 
/*
 * Sub-pixel variance for a 16-wide, 8-high block.
 *
 * The three offset pairs (4,0), (0,4) and (4,4) are handled by the
 * 16-wide half-pel helpers in a single call.  Any other pair uses
 * vp8_filter_block2d_bil_var_sse2() twice -- once at the block origin
 * and once 8 columns to the right -- and adds the two results.  All
 * helpers are called with a height of 8.
 *
 * The combined sum-of-squares is stored through sse, and the function
 * returns sumsquared - ((sum * sum) >> 7).
 */
unsigned int vp8_sub_pixel_variance16x8_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int diff_sum;
    unsigned int sq_err;

    if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line,
                                          8, &diff_sum, &sq_err);
    }
    else if (yoffset == 4 && (xoffset == 0 || xoffset == 4))
    {
        if (xoffset == 0)
        {
            vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                             dst_ptr, dst_pixels_per_line,
                                             8, &diff_sum, &sq_err);
        }
        else
        {
            vp8_half_horiz_vert_variance16x_h_sse2(
                src_ptr, src_pixels_per_line,
                dst_ptr, dst_pixels_per_line,
                8, &diff_sum, &sq_err);
        }
    }
    else
    {
        int right_sum;
        unsigned int right_sq_err;

        /* Left strip (columns 0-7). */
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_sum, &sq_err);

        /* Right strip (columns 8-15). */
        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &right_sum, &right_sq_err);

        diff_sum += right_sum;
        sq_err += right_sq_err;
    }

    *sse = sq_err;
    return sq_err - (((unsigned int)diff_sum * diff_sum) >> 7);
}
451 
/*
 * Sub-pixel variance for an 8-wide, 16-high block.
 *
 * The three offset pairs (4,0), (0,4) and (4,4) are handled by the
 * 8-wide half-pel helpers; every other pair goes through
 * vp8_filter_block2d_bil_var_sse2().  All helpers are called once with
 * a height of 16.
 *
 * The helper's sum-of-squares output is stored through sse, and the
 * function returns sumsquared - ((sum * sum) >> 7).
 */
unsigned int vp8_sub_pixel_variance8x16_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int diff_sum;
    unsigned int sq_err;
    const int horiz_only = (xoffset == 4 && yoffset == 0);
    const int vert_only = (xoffset == 0 && yoffset == 4);
    const int both_half = (xoffset == 4 && yoffset == 4);

    if (both_half)
    {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              16, &diff_sum, &sq_err);
    }
    else if (vert_only)
    {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, &diff_sum, &sq_err);
    }
    else if (horiz_only)
    {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &diff_sum, &sq_err);
    }
    else
    {
        /* General case: pass the raw offsets to the bilinear helper. */
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &diff_sum, &sq_err);
    }

    *sse = sq_err;
    return sq_err - (((unsigned int)diff_sum * diff_sum) >> 7);
}
499 
500 
/*
 * 16x16 variance using the horizontal half-pel helper directly.
 *
 * Calls vp8_half_horiz_variance16x_h_sse2() with a height of 16 -- the
 * same helper vp8_sub_pixel_variance16x16_wmt() selects for
 * xoffset == 4, yoffset == 0 -- without any offset dispatch.  The
 * helper's sum-of-squares output is stored through sse, and the
 * function returns sumsquared - ((sum * sum) >> 8).
 */
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int diff_sum;
    unsigned int sq_err;
    unsigned int sum_sq_scaled;

    vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                      dst_ptr, dst_pixels_per_line,
                                      16, &diff_sum, &sq_err);
    *sse = sq_err;

    sum_sq_scaled = ((unsigned int)diff_sum * diff_sum) >> 8;
    return sq_err - sum_sq_scaled;
}
519 
520 
/*
 * 16x16 variance using the vertical half-pel helper directly.
 *
 * Calls vp8_half_vert_variance16x_h_sse2() with a height of 16 -- the
 * same helper vp8_sub_pixel_variance16x16_wmt() selects for
 * xoffset == 0, yoffset == 4 -- without any offset dispatch.  The
 * helper's sum-of-squares output is stored through sse, and the
 * function returns sumsquared - ((sum * sum) >> 8).
 */
unsigned int vp8_variance_halfpixvar16x16_v_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int diff_sum;
    unsigned int sq_err;
    unsigned int sum_sq_scaled;

    vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                     dst_ptr, dst_pixels_per_line,
                                     16, &diff_sum, &sq_err);
    *sse = sq_err;

    sum_sq_scaled = ((unsigned int)diff_sum * diff_sum) >> 8;
    return sq_err - sum_sq_scaled;
}
538 
539 
/*
 * 16x16 variance using the combined horizontal+vertical half-pel
 * helper directly.
 *
 * Calls vp8_half_horiz_vert_variance16x_h_sse2() with a height of 16 --
 * the same helper vp8_sub_pixel_variance16x16_wmt() selects for
 * xoffset == 4, yoffset == 4 -- without any offset dispatch.  The
 * helper's sum-of-squares output is stored through sse, and the
 * function returns sumsquared - ((sum * sum) >> 8).
 */
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int diff_sum;
    unsigned int sq_err;
    unsigned int sum_sq_scaled;

    vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                           dst_ptr, dst_pixels_per_line,
                                           16, &diff_sum, &sq_err);
    *sse = sq_err;

    sum_sq_scaled = ((unsigned int)diff_sum * diff_sum) >> 8;
    return sq_err - sum_sq_scaled;
}
558