1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "vpx_config.h"
12 #include "vp8/common/variance.h"
13 #include "vpx_ports/mem.h"
14 #include "vp8/common/x86/filter_x86.h"
15
/*
 * Filter primitives defined outside this file.
 * NOTE(review): no caller of these four functions appears in this file as
 * seen here; confirm against the rest of the tree before removing them.
 */
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr,
                                  unsigned short *output_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *filter);

extern void filter_block1d_v6_mmx(const short *src_ptr,
                                  unsigned char *output_ptr,
                                  unsigned int pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *filter);

extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr,
                                    unsigned short *output_ptr,
                                    unsigned int src_pixels_per_line,
                                    unsigned int pixel_step,
                                    unsigned int output_height,
                                    unsigned int output_width,
                                    short *filter);

extern void filter_block1d8_v6_sse2(const short *src_ptr,
                                    unsigned char *output_ptr,
                                    unsigned int pixels_per_line,
                                    unsigned int pixel_step,
                                    unsigned int output_height,
                                    unsigned int output_width,
                                    short *filter);
20
/*
 * Defined outside this file.  vp8_sub_pixel_variance4x4_wmt() calls it with
 * two rows of vp8_bilinear_filters_x86_4 as HFilter / VFilter and reads the
 * results back through sum and sumsquared.
 */
extern void vp8_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
                                              int ref_pixels_per_line,
                                              const unsigned char *src_ptr,
                                              int src_pixels_per_line,
                                              const short *HFilter,
                                              const short *VFilter,
                                              int *sum,
                                              unsigned int *sumsquared);
32
/*
 * Defined outside this file.  vp8_variance4x4_wmt() uses only the SSE and
 * Sum out-parameters and ignores the return value.
 */
extern unsigned int vp8_get4x4var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);
42
/*
 * Defined outside this file.
 * NOTE(review): not called anywhere in this file as seen here.
 */
unsigned int vp8_get_mb_ss_sse2(const short *src_ptr);
/*
 * Defined outside this file.  vp8_variance16x16_wmt() and vp8_mse16x16_wmt()
 * read its results through SSE and Sum and ignore the return value.
 */
unsigned int vp8_get16x16var_sse2(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *SSE,
                                  int *Sum);
/*
 * Defined outside this file.  Used by the 8x8, 16x8 and 8x16 variance
 * wrappers below, which read SSE and Sum and ignore the return value.
 */
unsigned int vp8_get8x8var_sse2(const unsigned char *src_ptr,
                                int source_stride,
                                const unsigned char *ref_ptr,
                                int recon_stride,
                                unsigned int *SSE,
                                int *Sum);
/*
 * Defined outside this file.  The sub-pixel wrappers below use it as the
 * general (xoffset, yoffset) path.  The 16-wide wrappers call it twice, at
 * column offsets 0 and 8, and add the two results.
 * Callers in this file pass their own src_ptr as ref_ptr and their dst_ptr
 * as src_ptr.
 */
void vp8_filter_block2d_bil_var_sse2(const unsigned char *ref_ptr,
                                     int ref_pixels_per_line,
                                     const unsigned char *src_ptr,
                                     int src_pixels_per_line,
                                     unsigned int Height,
                                     int xoffset,
                                     int yoffset,
                                     int *sum,
                                     unsigned int *sumsquared);
/*
 * Defined outside this file.  The sub-pixel wrappers below select these when
 * xoffset == 4 and yoffset == 4; Height is passed as 8 or 16.
 */
void vp8_half_horiz_vert_variance8x_h_sse2(const unsigned char *ref_ptr,
                                           int ref_pixels_per_line,
                                           const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           unsigned int Height,
                                           int *sum,
                                           unsigned int *sumsquared);

void vp8_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
                                            int ref_pixels_per_line,
                                            const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            unsigned int Height,
                                            int *sum,
                                            unsigned int *sumsquared);
/*
 * Defined outside this file.  The sub-pixel wrappers below select these when
 * xoffset == 4 and yoffset == 0; Height is passed as 8 or 16.
 */
void vp8_half_horiz_variance8x_h_sse2(const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum,
                                      unsigned int *sumsquared);

void vp8_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr,
                                       int ref_pixels_per_line,
                                       const unsigned char *src_ptr,
                                       int src_pixels_per_line,
                                       unsigned int Height,
                                       int *sum,
                                       unsigned int *sumsquared);
/*
 * Defined outside this file.  The sub-pixel wrappers below select these when
 * xoffset == 0 and yoffset == 4; Height is passed as 8 or 16.
 */
void vp8_half_vert_variance8x_h_sse2(const unsigned char *ref_ptr,
                                     int ref_pixels_per_line,
                                     const unsigned char *src_ptr,
                                     int src_pixels_per_line,
                                     unsigned int Height,
                                     int *sum,
                                     unsigned int *sumsquared);

void vp8_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum,
                                      unsigned int *sumsquared);
137
/*
 * 4x4 variance wrapper.
 *
 * vp8_get4x4var_mmx() fills in the values its prototype names SSE and Sum.
 * The SSE is stored in *sse and the function returns
 *     SSE - ((Sum * Sum) >> 4)
 * where the shift divides by 16 (4 * 4).
 */
unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr,
                                 int source_stride,
                                 const unsigned char *ref_ptr,
                                 int recon_stride,
                                 unsigned int *sse)
{
    unsigned int sq_err;
    unsigned int diff_sum_u;
    int diff_sum;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &sq_err, &diff_sum);

    /* The square is taken in unsigned arithmetic. */
    diff_sum_u = (unsigned int)diff_sum;

    *sse = sq_err;
    return sq_err - ((diff_sum_u * diff_sum_u) >> 4);
}
153
/*
 * 8x8 variance wrapper.
 *
 * vp8_get8x8var_sse2() fills in the values its prototype names SSE and Sum.
 * The SSE is stored in *sse and the function returns
 *     SSE - ((Sum * Sum) >> 6)
 * where the shift divides by 64 (8 * 8).
 */
unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr,
                                 int source_stride,
                                 const unsigned char *ref_ptr,
                                 int recon_stride,
                                 unsigned int *sse)
{
    unsigned int sq_err;
    unsigned int diff_sum_u;
    int diff_sum;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sq_err, &diff_sum);

    /* The square is taken in unsigned arithmetic. */
    diff_sum_u = (unsigned int)diff_sum;

    *sse = sq_err;
    return sq_err - ((diff_sum_u * diff_sum_u) >> 6);
}
170
171
/*
 * 16x16 variance wrapper.
 *
 * vp8_get16x16var_sse2() fills in the values its prototype names SSE and
 * Sum.  The SSE is stored in *sse and the function returns
 *     SSE - ((Sum * Sum) >> 8)
 * where the shift divides by 256 (16 * 16).
 */
unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr,
                                   int source_stride,
                                   const unsigned char *ref_ptr,
                                   int recon_stride,
                                   unsigned int *sse)
{
    unsigned int sq_err;
    unsigned int diff_sum_u;
    int diff_sum;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                         &sq_err, &diff_sum);

    /* The square is taken in unsigned arithmetic. */
    diff_sum_u = (unsigned int)diff_sum;

    *sse = sq_err;
    return sq_err - ((diff_sum_u * diff_sum_u) >> 8);
}
/*
 * 16x16 squared-error wrapper.
 *
 * Calls vp8_get16x16var_sse2() and both stores and returns the value it
 * writes to its SSE out-parameter.  The Sum output is received only because
 * the helper requires a destination; it is not used.
 */
unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr,
                              int source_stride,
                              const unsigned char *ref_ptr,
                              int recon_stride,
                              unsigned int *sse)
{
    unsigned int sq_err;
    int unused_sum;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                         &sq_err, &unused_sum);

    *sse = sq_err;
    return sq_err;
}
203
204
/*
 * 16x8 variance wrapper.
 *
 * Runs vp8_get8x8var_sse2() on two side-by-side blocks, at column offsets 0
 * and 8 in both buffers, and adds their SSE and Sum outputs.  The combined
 * SSE is stored in *sse and the function returns
 *     SSE - ((Sum * Sum) >> 7)
 * where the shift divides by 128 (16 * 8).
 */
unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *sse)
{
    unsigned int left_sq_err, right_sq_err;
    unsigned int total_sq_err, total_sum_u;
    int left_sum, right_sum;
    int total_sum;

    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &left_sq_err, &left_sum);
    vp8_get8x8var_sse2(src_ptr + 8, source_stride,
                       ref_ptr + 8, recon_stride,
                       &right_sq_err, &right_sum);

    total_sq_err = left_sq_err + right_sq_err;
    total_sum = left_sum + right_sum;

    /* The square is taken in unsigned arithmetic. */
    total_sum_u = (unsigned int)total_sum;

    *sse = total_sq_err;
    return total_sq_err - ((total_sum_u * total_sum_u) >> 7);
}
225
/*
 * 8x16 variance wrapper.
 *
 * Runs vp8_get8x8var_sse2() on two vertically stacked blocks, the second
 * starting 8 rows down in each buffer, and adds their SSE and Sum outputs.
 * The combined SSE is stored in *sse and the function returns
 *     SSE - ((Sum * Sum) >> 7)
 * where the shift divides by 128 (8 * 16).
 */
unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *sse)
{
    unsigned int top_sq_err, bottom_sq_err;
    unsigned int total_sq_err, total_sum_u;
    int top_sum, bottom_sum;
    int total_sum;

    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &top_sq_err, &top_sum);
    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride,
                       ref_ptr + 8 * recon_stride, recon_stride,
                       &bottom_sq_err, &bottom_sum);

    total_sq_err = top_sq_err + bottom_sq_err;
    total_sum = top_sum + bottom_sum;

    /* The square is taken in unsigned arithmetic. */
    total_sum_u = (unsigned int)total_sum;

    *sse = total_sq_err;
    return total_sq_err - ((total_sum_u * total_sum_u) >> 7);
}
246
vp8_sub_pixel_variance4x4_wmt(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)247 unsigned int vp8_sub_pixel_variance4x4_wmt
248 (
249 const unsigned char *src_ptr,
250 int src_pixels_per_line,
251 int xoffset,
252 int yoffset,
253 const unsigned char *dst_ptr,
254 int dst_pixels_per_line,
255 unsigned int *sse
256 )
257 {
258 int xsum;
259 unsigned int xxsum;
260 vp8_filter_block2d_bil4x4_var_mmx(
261 src_ptr, src_pixels_per_line,
262 dst_ptr, dst_pixels_per_line,
263 vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
264 &xsum, &xxsum
265 );
266 *sse = xxsum;
267 return (xxsum - (((unsigned int)xsum * xsum) >> 4));
268 }
269
270
/*
 * 8x8 sub-pixel variance wrapper.
 *
 * Helper selection by (xoffset, yoffset), all called with Height 8:
 *   (4, 4)     -> vp8_half_horiz_vert_variance8x_h_sse2
 *   (4, 0)     -> vp8_half_horiz_variance8x_h_sse2
 *   (0, 4)     -> vp8_half_vert_variance8x_h_sse2
 *   otherwise  -> vp8_filter_block2d_bil_var_sse2
 *
 * The helper's sumsquared output is stored in *sse and the function returns
 *     sumsquared - ((sum * sum) >> 6)
 * where the shift divides by 64 (8 * 8).
 */
unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           int xoffset,
                                           int yoffset,
                                           const unsigned char *dst_ptr,
                                           int dst_pixels_per_line,
                                           unsigned int *sse)
{
    const int x_is_half = (xoffset == 4);
    const int y_is_half = (yoffset == 4);
    int diff_sum;
    unsigned int sq_err;
    unsigned int diff_sum_u;

    /* The three special cases are mutually exclusive. */
    if (x_is_half && y_is_half) {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              8, &diff_sum, &sq_err);
    } else if (x_is_half && yoffset == 0) {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &diff_sum, &sq_err);
    } else if (y_is_half && xoffset == 0) {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, &diff_sum, &sq_err);
    } else {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_sum, &sq_err);
    }

    /* The square is taken in unsigned arithmetic. */
    diff_sum_u = (unsigned int)diff_sum;

    *sse = sq_err;
    return sq_err - ((diff_sum_u * diff_sum_u) >> 6);
}
318
/*
 * 16x16 sub-pixel variance wrapper.
 *
 * Helper selection by (xoffset, yoffset), all called with Height 16:
 *   (4, 4)     -> vp8_half_horiz_vert_variance16x_h_sse2, one call
 *   (4, 0)     -> vp8_half_horiz_variance16x_h_sse2, one call
 *   (0, 4)     -> vp8_half_vert_variance16x_h_sse2, one call
 *   otherwise  -> vp8_filter_block2d_bil_var_sse2, called at column offsets
 *                 0 and 8 with the two results added together
 *
 * The branching could be dropped if callers invoked the matching helper
 * themselves.
 *
 * The accumulated sumsquared is stored in *sse and the function returns
 *     sumsquared - ((sum * sum) >> 8)
 * where the shift divides by 256 (16 * 16).
 */
unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr,
                                             int src_pixels_per_line,
                                             int xoffset,
                                             int yoffset,
                                             const unsigned char *dst_ptr,
                                             int dst_pixels_per_line,
                                             unsigned int *sse)
{
    const int x_is_half = (xoffset == 4);
    const int y_is_half = (yoffset == 4);
    int diff_sum;
    unsigned int sq_err;
    unsigned int diff_sum_u;

    /* The three special cases are mutually exclusive. */
    if (x_is_half && y_is_half) {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               16, &diff_sum, &sq_err);
    } else if (x_is_half && yoffset == 0) {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line,
                                          16, &diff_sum, &sq_err);
    } else if (y_is_half && xoffset == 0) {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &diff_sum, &sq_err);
    } else {
        /* Results for the second call, 8 columns to the right. */
        int right_sum;
        unsigned int right_sq_err;

        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &diff_sum, &sq_err);

        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &right_sum, &right_sq_err);

        diff_sum += right_sum;
        sq_err += right_sq_err;
    }

    /* The square is taken in unsigned arithmetic. */
    diff_sum_u = (unsigned int)diff_sum;

    *sse = sq_err;
    return sq_err - ((diff_sum_u * diff_sum_u) >> 8);
}
380
/*
 * 16x16 sub-pixel squared-error wrapper.
 *
 * Delegates to vp8_sub_pixel_variance16x16_wmt() for its side effect of
 * writing *sse, discards the returned variance, and returns *sse.
 */
unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr,
                                        int src_pixels_per_line,
                                        int xoffset,
                                        int yoffset,
                                        const unsigned char *dst_ptr,
                                        int dst_pixels_per_line,
                                        unsigned int *sse)
{
    (void)vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);
    return *sse;
}
394
/*
 * 16x8 sub-pixel variance wrapper.
 *
 * Helper selection by (xoffset, yoffset), all called with Height 8:
 *   (4, 4)     -> vp8_half_horiz_vert_variance16x_h_sse2, one call
 *   (4, 0)     -> vp8_half_horiz_variance16x_h_sse2, one call
 *   (0, 4)     -> vp8_half_vert_variance16x_h_sse2, one call
 *   otherwise  -> vp8_filter_block2d_bil_var_sse2, called at column offsets
 *                 0 and 8 with the two results added together
 *
 * The accumulated sumsquared is stored in *sse and the function returns
 *     sumsquared - ((sum * sum) >> 7)
 * where the shift divides by 128 (16 * 8).
 */
unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            int xoffset,
                                            int yoffset,
                                            const unsigned char *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse)
{
    const int x_is_half = (xoffset == 4);
    const int y_is_half = (yoffset == 4);
    int diff_sum;
    unsigned int sq_err;
    unsigned int diff_sum_u;

    /* The three special cases are mutually exclusive. */
    if (x_is_half && y_is_half) {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               8, &diff_sum, &sq_err);
    } else if (x_is_half && yoffset == 0) {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line,
                                          8, &diff_sum, &sq_err);
    } else if (y_is_half && xoffset == 0) {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &diff_sum, &sq_err);
    } else {
        /* Results for the second call, 8 columns to the right. */
        int right_sum;
        unsigned int right_sq_err;

        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_sum, &sq_err);

        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &right_sum, &right_sq_err);

        diff_sum += right_sum;
        sq_err += right_sq_err;
    }

    /* The square is taken in unsigned arithmetic. */
    diff_sum_u = (unsigned int)diff_sum;

    *sse = sq_err;
    return sq_err - ((diff_sum_u * diff_sum_u) >> 7);
}
451
/*
 * 8x16 sub-pixel variance wrapper.
 *
 * Helper selection by (xoffset, yoffset), all called with Height 16:
 *   (4, 4)     -> vp8_half_horiz_vert_variance8x_h_sse2
 *   (4, 0)     -> vp8_half_horiz_variance8x_h_sse2
 *   (0, 4)     -> vp8_half_vert_variance8x_h_sse2
 *   otherwise  -> vp8_filter_block2d_bil_var_sse2
 *
 * The helper's sumsquared output is stored in *sse and the function returns
 *     sumsquared - ((sum * sum) >> 7)
 * where the shift divides by 128 (8 * 16).
 */
unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            int xoffset,
                                            int yoffset,
                                            const unsigned char *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse)
{
    const int x_is_half = (xoffset == 4);
    const int y_is_half = (yoffset == 4);
    int diff_sum;
    unsigned int sq_err;
    unsigned int diff_sum_u;

    /* The three special cases are mutually exclusive. */
    if (x_is_half && y_is_half) {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              16, &diff_sum, &sq_err);
    } else if (x_is_half && yoffset == 0) {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &diff_sum, &sq_err);
    } else if (y_is_half && xoffset == 0) {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, &diff_sum, &sq_err);
    } else {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &diff_sum, &sq_err);
    }

    /* The square is taken in unsigned arithmetic. */
    diff_sum_u = (unsigned int)diff_sum;

    *sse = sq_err;
    return sq_err - ((diff_sum_u * diff_sum_u) >> 7);
}
499
500
/*
 * 16x16 variance through vp8_half_horiz_variance16x_h_sse2(), called with
 * Height 16.
 *
 * The helper's sumsquared output is stored in *sse and the function returns
 *     sumsquared - ((sum * sum) >> 8)
 * where the shift divides by 256 (16 * 16).
 */
unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr,
                                                int src_pixels_per_line,
                                                const unsigned char *dst_ptr,
                                                int dst_pixels_per_line,
                                                unsigned int *sse)
{
    int diff_sum;
    unsigned int sq_err;
    unsigned int diff_sum_u;

    vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                      dst_ptr, dst_pixels_per_line,
                                      16, &diff_sum, &sq_err);

    /* The square is taken in unsigned arithmetic. */
    diff_sum_u = (unsigned int)diff_sum;

    *sse = sq_err;
    return sq_err - ((diff_sum_u * diff_sum_u) >> 8);
}
519
520
/*
 * 16x16 variance through vp8_half_vert_variance16x_h_sse2(), called with
 * Height 16.
 *
 * The helper's sumsquared output is stored in *sse and the function returns
 *     sumsquared - ((sum * sum) >> 8)
 * where the shift divides by 256 (16 * 16).
 */
unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr,
                                                int src_pixels_per_line,
                                                const unsigned char *dst_ptr,
                                                int dst_pixels_per_line,
                                                unsigned int *sse)
{
    int diff_sum;
    unsigned int sq_err;
    unsigned int diff_sum_u;

    vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                     dst_ptr, dst_pixels_per_line,
                                     16, &diff_sum, &sq_err);

    /* The square is taken in unsigned arithmetic. */
    diff_sum_u = (unsigned int)diff_sum;

    *sse = sq_err;
    return sq_err - ((diff_sum_u * diff_sum_u) >> 8);
}
538
539
/*
 * 16x16 variance through vp8_half_horiz_vert_variance16x_h_sse2(), called
 * with Height 16.
 *
 * The helper's sumsquared output is stored in *sse and the function returns
 *     sumsquared - ((sum * sum) >> 8)
 * where the shift divides by 256 (16 * 16).
 */
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr,
                                                 int src_pixels_per_line,
                                                 const unsigned char *dst_ptr,
                                                 int dst_pixels_per_line,
                                                 unsigned int *sse)
{
    int diff_sum;
    unsigned int sq_err;
    unsigned int diff_sum_u;

    vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                           dst_ptr, dst_pixels_per_line,
                                           16, &diff_sum, &sq_err);

    /* The square is taken in unsigned arithmetic. */
    diff_sum_u = (unsigned int)diff_sum;

    *sse = sq_err;
    return sq_err - ((diff_sum_u * diff_sum_u) >> 8);
}
558