1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "vpx_config.h"
12 #include "vp8/common/variance.h"
13 #include "vp8/common/pragmas.h"
14 #include "vpx_ports/mem.h"
15 #include "vp8/common/x86/filter_x86.h"
16
/*
 * 1-D filter routines (h6 / v6 variants, MMX and SSE2) with external
 * linkage.  They are declared here but neither defined nor called in the
 * visible part of this file.
 */
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr,
                                  unsigned short *output_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *filter);

extern void filter_block1d_v6_mmx(const short *src_ptr,
                                  unsigned char *output_ptr,
                                  unsigned int pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *filter);

extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr,
                                    unsigned short *output_ptr,
                                    unsigned int src_pixels_per_line,
                                    unsigned int pixel_step,
                                    unsigned int output_height,
                                    unsigned int output_width,
                                    short *filter);

extern void filter_block1d8_v6_sse2(const short *src_ptr,
                                    unsigned char *output_ptr,
                                    unsigned int pixels_per_line,
                                    unsigned int pixel_step,
                                    unsigned int output_height,
                                    unsigned int output_width,
                                    short *filter);
21
/*
 * 4x4 helpers defined outside this file.
 *
 * vp8_filter_block2d_bil4x4_var_mmx is called by
 * vp8_sub_pixel_variance4x4_wmt with two rows of
 * vp8_bilinear_filters_x86_4 as HFilter / VFilter; it reports its results
 * through *sum and *sumsquared.
 */
extern void vp8_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
                                              int ref_pixels_per_line,
                                              const unsigned char *src_ptr,
                                              int src_pixels_per_line,
                                              const short *HFilter,
                                              const short *VFilter,
                                              int *sum,
                                              unsigned int *sumsquared);

/*
 * Called by vp8_variance4x4_wmt, which reads the results back through
 * *SSE and *Sum.  The return value is not used in this file.
 */
extern unsigned int vp8_get4x4var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);
43
/*
 * Prototypes for the SSE2 helpers used by the wrappers below.  None of
 * them is defined in the visible part of this file.
 */

/* Declared only; not called in the visible part of this file. */
unsigned int vp8_get_mb_ss_sse2(const short *src_ptr);

/* Whole-pixel helpers: results are delivered through *SSE and *Sum. */
unsigned int vp8_get16x16var_sse2(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *SSE,
                                  int *Sum);

unsigned int vp8_get8x8var_sse2(const unsigned char *src_ptr,
                                int source_stride,
                                const unsigned char *ref_ptr,
                                int recon_stride,
                                unsigned int *SSE,
                                int *Sum);

/*
 * General sub-pixel helper: takes the x/y offsets directly and reports
 * through *sum and *sumsquared.
 */
void vp8_filter_block2d_bil_var_sse2(const unsigned char *ref_ptr,
                                     int ref_pixels_per_line,
                                     const unsigned char *src_ptr,
                                     int src_pixels_per_line,
                                     unsigned int Height,
                                     int xoffset,
                                     int yoffset,
                                     int *sum,
                                     unsigned int *sumsquared);

/*
 * Helpers selected by the wrappers when both offsets equal 4
 * (8x and 16x variants).
 */
void vp8_half_horiz_vert_variance8x_h_sse2(const unsigned char *ref_ptr,
                                           int ref_pixels_per_line,
                                           const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           unsigned int Height,
                                           int *sum,
                                           unsigned int *sumsquared);

void vp8_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
                                            int ref_pixels_per_line,
                                            const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            unsigned int Height,
                                            int *sum,
                                            unsigned int *sumsquared);

/* Helpers selected by the wrappers when xoffset == 4 and yoffset == 0. */
void vp8_half_horiz_variance8x_h_sse2(const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum,
                                      unsigned int *sumsquared);

void vp8_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr,
                                       int ref_pixels_per_line,
                                       const unsigned char *src_ptr,
                                       int src_pixels_per_line,
                                       unsigned int Height,
                                       int *sum,
                                       unsigned int *sumsquared);

/* Helpers selected by the wrappers when xoffset == 0 and yoffset == 4. */
void vp8_half_vert_variance8x_h_sse2(const unsigned char *ref_ptr,
                                     int ref_pixels_per_line,
                                     const unsigned char *src_ptr,
                                     int src_pixels_per_line,
                                     unsigned int Height,
                                     int *sum,
                                     unsigned int *sumsquared);

void vp8_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum,
                                      unsigned int *sumsquared);
138
/*
 * 4x4 variance wrapper.
 *
 * Calls vp8_get4x4var_mmx on (src_ptr, ref_ptr), stores the SSE value it
 * reports in *sse, and returns SSE - ((Sum * Sum) >> 4).
 */
unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr,
                                 int source_stride,
                                 const unsigned char *ref_ptr,
                                 int recon_stride,
                                 unsigned int *sse)
{
    unsigned int sq_err;
    int diff_sum;
    unsigned int sum_sq_scaled;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &sq_err, &diff_sum);
    *sse = sq_err;

    /* Square in unsigned arithmetic; >> 4 divides by 16 (4 * 4). */
    sum_sq_scaled = ((unsigned int)diff_sum * diff_sum) >> 4;

    return sq_err - sum_sq_scaled;
}
154
/*
 * 8x8 variance wrapper.
 *
 * Calls vp8_get8x8var_sse2 on (src_ptr, ref_ptr), stores the SSE value it
 * reports in *sse, and returns SSE - ((Sum * Sum) >> 6).
 */
unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr,
                                 int source_stride,
                                 const unsigned char *ref_ptr,
                                 int recon_stride,
                                 unsigned int *sse)
{
    unsigned int sq_err;
    int diff_sum;
    unsigned int sum_sq_scaled;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sq_err, &diff_sum);
    *sse = sq_err;

    /* Square in unsigned arithmetic; >> 6 divides by 64 (8 * 8). */
    sum_sq_scaled = ((unsigned int)diff_sum * diff_sum) >> 6;

    return sq_err - sum_sq_scaled;
}
171
172
/*
 * 16x16 variance wrapper.
 *
 * Calls vp8_get16x16var_sse2 on (src_ptr, ref_ptr), stores the SSE value
 * it reports in *sse, and returns SSE - ((Sum * Sum) >> 8).
 */
unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr,
                                   int source_stride,
                                   const unsigned char *ref_ptr,
                                   int recon_stride,
                                   unsigned int *sse)
{
    unsigned int sq_err;
    int diff_sum;
    unsigned int sum_sq_scaled;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                         &sq_err, &diff_sum);
    *sse = sq_err;

    /* Square in unsigned arithmetic; >> 8 divides by 256 (16 * 16). */
    sum_sq_scaled = ((unsigned int)diff_sum * diff_sum) >> 8;

    return sq_err - sum_sq_scaled;
}
/*
 * 16x16 MSE wrapper.
 *
 * Calls vp8_get16x16var_sse2 and both stores in *sse and returns the SSE
 * value it reports.  The Sum output is requested only because the helper
 * requires the pointer; it is otherwise ignored.
 */
unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr,
                              int source_stride,
                              const unsigned char *ref_ptr,
                              int recon_stride,
                              unsigned int *sse)
{
    unsigned int sq_err;
    int ignored_sum;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                         &sq_err, &ignored_sum);

    *sse = sq_err;
    return sq_err;
}
204
205
/*
 * 16x8 variance wrapper.
 *
 * Runs vp8_get8x8var_sse2 twice, at column offsets 0 and 8 of both
 * buffers, and adds the two SSE and the two Sum results.  The combined
 * SSE is stored in *sse; the return value is SSE - ((Sum * Sum) >> 7).
 */
unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *sse)
{
    unsigned int left_sse, right_sse, total_sse;
    int left_sum, right_sum, total_sum;

    /* Columns 0..7. */
    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &left_sse, &left_sum);

    /* Columns 8..15. */
    vp8_get8x8var_sse2(src_ptr + 8, source_stride,
                       ref_ptr + 8, recon_stride,
                       &right_sse, &right_sum);

    total_sse = left_sse + right_sse;
    total_sum = left_sum + right_sum;
    *sse = total_sse;

    /* >> 7 divides by 128 (16 * 8). */
    return total_sse - (((unsigned int)total_sum * total_sum) >> 7);
}
226
/*
 * 8x16 variance wrapper.
 *
 * Runs vp8_get8x8var_sse2 twice, at row offsets 0 and 8 of both buffers
 * (each advanced by 8 * its own stride), and adds the two SSE and the two
 * Sum results.  The combined SSE is stored in *sse; the return value is
 * SSE - ((Sum * Sum) >> 7).
 */
unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *sse)
{
    unsigned int top_sse, bottom_sse, total_sse;
    int top_sum, bottom_sum, total_sum;

    /* Rows 0..7. */
    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &top_sse, &top_sum);

    /* Rows 8..15. */
    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride,
                       ref_ptr + 8 * recon_stride, recon_stride,
                       &bottom_sse, &bottom_sum);

    total_sse = top_sse + bottom_sse;
    total_sum = top_sum + bottom_sum;
    *sse = total_sse;

    /* >> 7 divides by 128 (8 * 16). */
    return total_sse - (((unsigned int)total_sum * total_sum) >> 7);
}
247
vp8_sub_pixel_variance4x4_wmt(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)248 unsigned int vp8_sub_pixel_variance4x4_wmt
249 (
250 const unsigned char *src_ptr,
251 int src_pixels_per_line,
252 int xoffset,
253 int yoffset,
254 const unsigned char *dst_ptr,
255 int dst_pixels_per_line,
256 unsigned int *sse
257 )
258 {
259 int xsum;
260 unsigned int xxsum;
261 vp8_filter_block2d_bil4x4_var_mmx(
262 src_ptr, src_pixels_per_line,
263 dst_ptr, dst_pixels_per_line,
264 vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
265 &xsum, &xxsum
266 );
267 *sse = xxsum;
268 return (xxsum - (((unsigned int)xsum * xsum) >> 4));
269 }
270
271
/*
 * 8x8 sub-pixel variance wrapper.
 *
 * Offsets of exactly (4,0), (0,4) and (4,4) are routed to the dedicated
 * vp8_half_*_variance8x_h_sse2 helpers; every other combination goes to
 * vp8_filter_block2d_bil_var_sse2.  All helpers are called with a height
 * of 8.  The sum-of-squares result is stored in *sse; the return value is
 * sumsquared - ((sum * sum) >> 6).
 */
unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           int xoffset,
                                           int yoffset,
                                           const unsigned char *dst_ptr,
                                           int dst_pixels_per_line,
                                           unsigned int *sse)
{
    const int x_is_half = (xoffset == 4);
    const int y_is_half = (yoffset == 4);
    int diff_sum;
    unsigned int sq_sum;

    /* The four cases are mutually exclusive, so their order is free. */
    if (x_is_half && y_is_half)
    {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              8, &diff_sum, &sq_sum);
    }
    else if (x_is_half && yoffset == 0)
    {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &diff_sum, &sq_sum);
    }
    else if (y_is_half && xoffset == 0)
    {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, &diff_sum, &sq_sum);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_sum, &sq_sum);
    }

    *sse = sq_sum;

    /* >> 6 divides by 64 (8 * 8). */
    return sq_sum - (((unsigned int)diff_sum * diff_sum) >> 6);
}
319
/*
 * 16x16 sub-pixel variance wrapper.
 *
 * Offsets of exactly (4,0), (0,4) and (4,4) are routed to the dedicated
 * vp8_half_*_variance16x_h_sse2 helpers, each called once.  Every other
 * combination calls vp8_filter_block2d_bil_var_sse2 twice, at column
 * offsets 0 and 8, and adds the two results.  All helpers are called with
 * a height of 16.  The sum-of-squares result is stored in *sse; the
 * return value is sumsquared - ((sum * sum) >> 8).
 */
unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr,
                                             int src_pixels_per_line,
                                             int xoffset,
                                             int yoffset,
                                             const unsigned char *dst_ptr,
                                             int dst_pixels_per_line,
                                             unsigned int *sse)
{
    const int x_is_half = (xoffset == 4);
    const int y_is_half = (yoffset == 4);
    int diff_sum;
    unsigned int sq_sum;

    /*
     * This dispatch would be unnecessary if callers invoked the matching
     * specialised routine themselves.  The four cases are mutually
     * exclusive, so their order is free.
     */
    if (x_is_half && y_is_half)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               16, &diff_sum, &sq_sum);
    }
    else if (x_is_half && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line,
                                          16, &diff_sum, &sq_sum);
    }
    else if (y_is_half && xoffset == 0)
    {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &diff_sum, &sq_sum);
    }
    else
    {
        int right_sum;
        unsigned int right_sq_sum;

        /* Columns starting at offset 0. */
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &diff_sum, &sq_sum);

        /* Columns starting at offset 8. */
        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &right_sum, &right_sq_sum);

        diff_sum += right_sum;
        sq_sum += right_sq_sum;
    }

    *sse = sq_sum;

    /* >> 8 divides by 256 (16 * 16). */
    return sq_sum - (((unsigned int)diff_sum * diff_sum) >> 8);
}
381
/*
 * 16x16 sub-pixel MSE wrapper.
 *
 * Delegates to vp8_sub_pixel_variance16x16_wmt so that it fills in *sse,
 * discards the variance it returns, and returns the value left in *sse.
 */
unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr,
                                        int src_pixels_per_line,
                                        int xoffset,
                                        int yoffset,
                                        const unsigned char *dst_ptr,
                                        int dst_pixels_per_line,
                                        unsigned int *sse)
{
    /* Only the side effect on *sse is wanted here. */
    (void)vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);
    return *sse;
}
395
/*
 * 16x8 sub-pixel variance wrapper.
 *
 * Offsets of exactly (4,0), (0,4) and (4,4) are routed to the dedicated
 * vp8_half_*_variance16x_h_sse2 helpers, each called once.  Every other
 * combination calls vp8_filter_block2d_bil_var_sse2 twice, at column
 * offsets 0 and 8, and adds the two results.  All helpers are called with
 * a height of 8.  The sum-of-squares result is stored in *sse; the return
 * value is sumsquared - ((sum * sum) >> 7).
 */
unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            int xoffset,
                                            int yoffset,
                                            const unsigned char *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse)
{
    const int x_is_half = (xoffset == 4);
    const int y_is_half = (yoffset == 4);
    int diff_sum;
    unsigned int sq_sum;

    /* The four cases are mutually exclusive, so their order is free. */
    if (x_is_half && y_is_half)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               8, &diff_sum, &sq_sum);
    }
    else if (x_is_half && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line,
                                          8, &diff_sum, &sq_sum);
    }
    else if (y_is_half && xoffset == 0)
    {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &diff_sum, &sq_sum);
    }
    else
    {
        int right_sum;
        unsigned int right_sq_sum;

        /* Columns starting at offset 0. */
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_sum, &sq_sum);

        /* Columns starting at offset 8. */
        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &right_sum, &right_sq_sum);

        diff_sum += right_sum;
        sq_sum += right_sq_sum;
    }

    *sse = sq_sum;

    /* >> 7 divides by 128 (16 * 8). */
    return sq_sum - (((unsigned int)diff_sum * diff_sum) >> 7);
}
452
/*
 * 8x16 sub-pixel variance wrapper.
 *
 * Offsets of exactly (4,0), (0,4) and (4,4) are routed to the dedicated
 * vp8_half_*_variance8x_h_sse2 helpers; every other combination goes to
 * vp8_filter_block2d_bil_var_sse2.  All helpers are called with a height
 * of 16.  The sum-of-squares result is stored in *sse; the return value
 * is sumsquared - ((sum * sum) >> 7).
 */
unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            int xoffset,
                                            int yoffset,
                                            const unsigned char *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse)
{
    const int x_is_half = (xoffset == 4);
    const int y_is_half = (yoffset == 4);
    int diff_sum;
    unsigned int sq_sum;

    /* The four cases are mutually exclusive, so their order is free. */
    if (x_is_half && y_is_half)
    {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              16, &diff_sum, &sq_sum);
    }
    else if (x_is_half && yoffset == 0)
    {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &diff_sum, &sq_sum);
    }
    else if (y_is_half && xoffset == 0)
    {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, &diff_sum, &sq_sum);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &diff_sum, &sq_sum);
    }

    *sse = sq_sum;

    /* >> 7 divides by 128 (8 * 16). */
    return sq_sum - (((unsigned int)diff_sum * diff_sum) >> 7);
}
500
501
/*
 * 16x16 variance through vp8_half_horiz_variance16x_h_sse2.
 *
 * Calls the helper unconditionally with a height of 16, stores its
 * sum-of-squares result in *sse and returns
 * sumsquared - ((sum * sum) >> 8).
 */
unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr,
                                                int src_pixels_per_line,
                                                const unsigned char *dst_ptr,
                                                int dst_pixels_per_line,
                                                unsigned int *sse)
{
    int diff_sum;
    unsigned int sq_sum;
    unsigned int sum_sq_scaled;

    vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                      dst_ptr, dst_pixels_per_line,
                                      16, &diff_sum, &sq_sum);
    *sse = sq_sum;

    /* >> 8 divides by 256 (16 * 16). */
    sum_sq_scaled = ((unsigned int)diff_sum * diff_sum) >> 8;

    return sq_sum - sum_sq_scaled;
}
520
521
/*
 * 16x16 variance through vp8_half_vert_variance16x_h_sse2.
 *
 * Calls the helper unconditionally with a height of 16, stores its
 * sum-of-squares result in *sse and returns
 * sumsquared - ((sum * sum) >> 8).
 */
unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr,
                                                int src_pixels_per_line,
                                                const unsigned char *dst_ptr,
                                                int dst_pixels_per_line,
                                                unsigned int *sse)
{
    int diff_sum;
    unsigned int sq_sum;
    unsigned int sum_sq_scaled;

    vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                     dst_ptr, dst_pixels_per_line,
                                     16, &diff_sum, &sq_sum);
    *sse = sq_sum;

    /* >> 8 divides by 256 (16 * 16). */
    sum_sq_scaled = ((unsigned int)diff_sum * diff_sum) >> 8;

    return sq_sum - sum_sq_scaled;
}
539
540
/*
 * 16x16 variance through vp8_half_horiz_vert_variance16x_h_sse2.
 *
 * Calls the helper unconditionally with a height of 16, stores its
 * sum-of-squares result in *sse and returns
 * sumsquared - ((sum * sum) >> 8).
 */
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr,
                                                 int src_pixels_per_line,
                                                 const unsigned char *dst_ptr,
                                                 int dst_pixels_per_line,
                                                 unsigned int *sse)
{
    int diff_sum;
    unsigned int sq_sum;
    unsigned int sum_sq_scaled;

    vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                           dst_ptr, dst_pixels_per_line,
                                           16, &diff_sum, &sq_sum);
    *sse = sq_sum;

    /* >> 8 divides by 256 (16 * 16). */
    sum_sq_scaled = ((unsigned int)diff_sum * diff_sum) >> 8;

    return sq_sum - sum_sq_scaled;
}
559