1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "vpx_config.h"
12 #include "vp8/common/variance.h"
13 #include "vpx_ports/mem.h"
14 #include "vp8/common/x86/filter_x86.h"
15 
/*
 * Filter routines defined outside this file (presumably hand-written MMX
 * assembly, going by the _mmx suffix -- confirm against the build files).
 * Neither is called from the code visible in this file.
 */
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr,
                                  unsigned short *output_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *filter);

extern void filter_block1d_v6_mmx(const short *src_ptr,
                                  unsigned char *output_ptr,
                                  unsigned int pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *filter);
36 
/*
 * Block statistics helpers defined outside this file.
 *
 * vp8_get8x8var_mmx / vp8_get4x4var_mmx take a source block and a reference
 * block (pointer + stride each) and report two results through the SSE and
 * Sum out-parameters.  The wrappers in this file use them as
 * "SSE - (Sum * Sum) / pixel_count", i.e. presumably the sum of squared
 * differences and the sum of differences over the block.  Their return
 * value is ignored by every caller visible here.
 */
extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);

extern unsigned int vp8_get8x8var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);

extern unsigned int vp8_get4x4var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);
/*
 * Combined 2-D filter + statistics helpers defined outside this file.
 *
 * Both take a horizontal and a vertical filter (HFilter / VFilter) and
 * report their results through the sum and sumsquared out-parameters.  The
 * 4x4 variant has no height argument; the general variant takes the number
 * of rows in Height.  Callers in this file invoke the general variant once
 * per 8-pixel-wide column, so it presumably processes 8 pixels per row --
 * confirm against the implementation.
 */
extern void vp8_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
                                              int ref_pixels_per_line,
                                              const unsigned char *src_ptr,
                                              int src_pixels_per_line,
                                              const short *HFilter,
                                              const short *VFilter,
                                              int *sum,
                                              unsigned int *sumsquared);

extern void vp8_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
                                           int ref_pixels_per_line,
                                           const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           unsigned int Height,
                                           const short *HFilter,
                                           const short *VFilter,
                                           int *sum,
                                           unsigned int *sumsquared);
79 
80 
/*
 * Variance of a 4x4 block against a reference block.
 *
 * Obtains the SSE and Sum outputs of vp8_get4x4var_mmx(), stores the SSE
 * through sse, and returns SSE - (Sum * Sum) / 16.
 */
unsigned int vp8_variance4x4_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_total;
    int diff_total;
    unsigned int mean_term;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &sq_total, &diff_total);

    /* The cast makes the product unsigned arithmetic; >> 4 divides by the
     * 16 pixels of the block. */
    mean_term = ((unsigned int)diff_total * diff_total) >> 4;

    *sse = sq_total;
    return sq_total - mean_term;
}
96 
/*
 * Variance of an 8x8 block against a reference block.
 *
 * Obtains the SSE and Sum outputs of vp8_get8x8var_mmx(), stores the SSE
 * through sse, and returns SSE - (Sum * Sum) / 64.
 */
unsigned int vp8_variance8x8_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_total;
    int diff_total;
    unsigned int mean_term;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &sq_total, &diff_total);

    /* The cast makes the product unsigned arithmetic; >> 6 divides by the
     * 64 pixels of the block. */
    mean_term = ((unsigned int)diff_total * diff_total) >> 6;

    *sse = sq_total;
    return sq_total - mean_term;
}
113 
/*
 * Squared-error total of a 16x16 block against a reference block.
 *
 * The block is covered by four 8x8 calls to vp8_get8x8var_mmx() in the
 * order top-left, top-right, bottom-left, bottom-right.  The four SSE
 * outputs are added together; the result is both stored through sse and
 * returned.  The Sum outputs are not used.
 */
unsigned int vp8_mse16x16_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int total = 0;
    int quad;

    for (quad = 0; quad < 4; quad++)
    {
        /* quad bit 0 selects the column half, bit 1 the row half. */
        const int col = (quad & 1) * 8;
        const int row = (quad >> 1) * 8;
        unsigned int quad_sse;
        int quad_sum;   /* required by the helper's signature; ignored */

        vp8_get8x8var_mmx(src_ptr + row * source_stride + col, source_stride,
                          ref_ptr + row * recon_stride + col, recon_stride,
                          &quad_sse, &quad_sum);

        total += quad_sse;
    }

    *sse = total;
    return total;
}
134 
135 
/*
 * Variance of a 16x16 block against a reference block.
 *
 * The block is covered by four 8x8 calls to vp8_get8x8var_mmx() in the
 * order top-left, top-right, bottom-left, bottom-right.  Their SSE and Sum
 * outputs are accumulated; the combined SSE is stored through sse and the
 * function returns SSE - (Sum * Sum) / 256.
 */
unsigned int vp8_variance16x16_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_total = 0;
    int diff_total = 0;
    int quad;

    for (quad = 0; quad < 4; quad++)
    {
        /* quad bit 0 selects the column half, bit 1 the row half. */
        const int col = (quad & 1) * 8;
        const int row = (quad >> 1) * 8;
        unsigned int quad_sse;
        int quad_sum;

        vp8_get8x8var_mmx(src_ptr + row * source_stride + col, source_stride,
                          ref_ptr + row * recon_stride + col, recon_stride,
                          &quad_sse, &quad_sum);

        sq_total += quad_sse;
        diff_total += quad_sum;
    }

    *sse = sq_total;

    /* The cast makes the product unsigned arithmetic; >> 8 divides by the
     * 256 pixels of the block. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 8);
}
157 
/*
 * Variance of a 16-wide, 8-high block against a reference block.
 *
 * Two side-by-side 8x8 calls to vp8_get8x8var_mmx() (left half, then the
 * half starting 8 pixels to the right) are combined.  The summed SSE is
 * stored through sse and the function returns SSE - (Sum * Sum) / 128.
 */
unsigned int vp8_variance16x8_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int left_sse, right_sse;
    int left_sum, right_sum;
    unsigned int sq_total;
    int diff_total;

    vp8_get8x8var_mmx(src_ptr, source_stride,
                      ref_ptr, recon_stride,
                      &left_sse, &left_sum);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride,
                      ref_ptr + 8, recon_stride,
                      &right_sse, &right_sum);

    sq_total = left_sse + right_sse;
    diff_total = left_sum + right_sum;

    *sse = sq_total;

    /* The cast makes the product unsigned arithmetic; >> 7 divides by the
     * 128 pixels of the block. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 7);
}
177 
178 
/*
 * Variance of an 8-wide, 16-high block against a reference block.
 *
 * Two stacked 8x8 calls to vp8_get8x8var_mmx() (upper half, then the half
 * starting 8 rows down) are combined.  The summed SSE is stored through
 * sse and the function returns SSE - (Sum * Sum) / 128.
 */
unsigned int vp8_variance8x16_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int upper_sse, lower_sse;
    int upper_sum, lower_sum;
    unsigned int sq_total;
    int diff_total;

    vp8_get8x8var_mmx(src_ptr, source_stride,
                      ref_ptr, recon_stride,
                      &upper_sse, &upper_sum);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
                      ref_ptr + 8 * recon_stride, recon_stride,
                      &lower_sse, &lower_sum);

    sq_total = upper_sse + lower_sse;
    diff_total = upper_sum + lower_sum;

    *sse = sq_total;

    /* The cast makes the product unsigned arithmetic; >> 7 divides by the
     * 128 pixels of the block. */
    return sq_total - (((unsigned int)diff_total * diff_total) >> 7);
}
199 
200 
vp8_sub_pixel_variance4x4_mmx(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)201 unsigned int vp8_sub_pixel_variance4x4_mmx
202 (
203     const unsigned char  *src_ptr,
204     int  src_pixels_per_line,
205     int  xoffset,
206     int  yoffset,
207     const unsigned char *dst_ptr,
208     int dst_pixels_per_line,
209     unsigned int *sse)
210 
211 {
212     int xsum;
213     unsigned int xxsum;
214     vp8_filter_block2d_bil4x4_var_mmx(
215         src_ptr, src_pixels_per_line,
216         dst_ptr, dst_pixels_per_line,
217         vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
218         &xsum, &xxsum
219     );
220     *sse = xxsum;
221     return (xxsum - (((unsigned int)xsum * xsum) >> 4));
222 }
223 
224 
vp8_sub_pixel_variance8x8_mmx(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)225 unsigned int vp8_sub_pixel_variance8x8_mmx
226 (
227     const unsigned char  *src_ptr,
228     int  src_pixels_per_line,
229     int  xoffset,
230     int  yoffset,
231     const unsigned char *dst_ptr,
232     int dst_pixels_per_line,
233     unsigned int *sse
234 )
235 {
236 
237     int xsum;
238     unsigned int xxsum;
239     vp8_filter_block2d_bil_var_mmx(
240         src_ptr, src_pixels_per_line,
241         dst_ptr, dst_pixels_per_line, 8,
242         vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
243         &xsum, &xxsum
244     );
245     *sse = xxsum;
246     return (xxsum - (((unsigned int)xsum * xsum) >> 6));
247 }
248 
vp8_sub_pixel_variance16x16_mmx(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)249 unsigned int vp8_sub_pixel_variance16x16_mmx
250 (
251     const unsigned char  *src_ptr,
252     int  src_pixels_per_line,
253     int  xoffset,
254     int  yoffset,
255     const unsigned char *dst_ptr,
256     int dst_pixels_per_line,
257     unsigned int *sse
258 )
259 {
260 
261     int xsum0, xsum1;
262     unsigned int xxsum0, xxsum1;
263 
264 
265     vp8_filter_block2d_bil_var_mmx(
266         src_ptr, src_pixels_per_line,
267         dst_ptr, dst_pixels_per_line, 16,
268         vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
269         &xsum0, &xxsum0
270     );
271 
272 
273     vp8_filter_block2d_bil_var_mmx(
274         src_ptr + 8, src_pixels_per_line,
275         dst_ptr + 8, dst_pixels_per_line, 16,
276         vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
277         &xsum1, &xxsum1
278     );
279 
280     xsum0 += xsum1;
281     xxsum0 += xxsum1;
282 
283     *sse = xxsum0;
284     return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
285 
286 
287 }
288 
/*
 * Sub-pixel squared-error total of a 16x16 block.
 *
 * Runs vp8_sub_pixel_variance16x16_mmx() for its side effect of filling
 * *sse, discards the variance it returns, and returns the value left in
 * *sse instead.
 */
unsigned int vp8_sub_pixel_mse16x16_mmx(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    /* Only the *sse output is of interest here. */
    (void)vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);

    return *sse;
}
302 
vp8_sub_pixel_variance16x8_mmx(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)303 unsigned int vp8_sub_pixel_variance16x8_mmx
304 (
305     const unsigned char  *src_ptr,
306     int  src_pixels_per_line,
307     int  xoffset,
308     int  yoffset,
309     const unsigned char *dst_ptr,
310     int dst_pixels_per_line,
311     unsigned int *sse
312 )
313 {
314     int xsum0, xsum1;
315     unsigned int xxsum0, xxsum1;
316 
317 
318     vp8_filter_block2d_bil_var_mmx(
319         src_ptr, src_pixels_per_line,
320         dst_ptr, dst_pixels_per_line, 8,
321         vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
322         &xsum0, &xxsum0
323     );
324 
325 
326     vp8_filter_block2d_bil_var_mmx(
327         src_ptr + 8, src_pixels_per_line,
328         dst_ptr + 8, dst_pixels_per_line, 8,
329         vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
330         &xsum1, &xxsum1
331     );
332 
333     xsum0 += xsum1;
334     xxsum0 += xxsum1;
335 
336     *sse = xxsum0;
337     return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
338 }
339 
vp8_sub_pixel_variance8x16_mmx(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)340 unsigned int vp8_sub_pixel_variance8x16_mmx
341 (
342     const unsigned char  *src_ptr,
343     int  src_pixels_per_line,
344     int  xoffset,
345     int  yoffset,
346     const unsigned char *dst_ptr,
347     int dst_pixels_per_line,
348     unsigned int *sse
349 )
350 {
351     int xsum;
352     unsigned int xxsum;
353     vp8_filter_block2d_bil_var_mmx(
354         src_ptr, src_pixels_per_line,
355         dst_ptr, dst_pixels_per_line, 16,
356         vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
357         &xsum, &xxsum
358     );
359     *sse = xxsum;
360     return (xxsum - (((unsigned int)xsum * xsum) >> 7));
361 }
362 
363 
/*
 * 16x16 variance with a fixed horizontal-only offset.
 *
 * Forwards to vp8_sub_pixel_variance16x16_mmx() with xoffset = 4 and
 * yoffset = 0.  Going by the function name, 4 is the half-pixel step --
 * confirm against the layout of the bilinear filter table.
 */
unsigned int vp8_variance_halfpixvar16x16_h_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    const int x_frac = 4;
    const int y_frac = 0;

    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                           x_frac, y_frac,
                                           ref_ptr, recon_stride, sse);
}
374 
375 
/*
 * 16x16 variance with a fixed vertical-only offset.
 *
 * Forwards to vp8_sub_pixel_variance16x16_mmx() with xoffset = 0 and
 * yoffset = 4.  Going by the function name, 4 is the half-pixel step --
 * confirm against the layout of the bilinear filter table.
 */
unsigned int vp8_variance_halfpixvar16x16_v_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    const int x_frac = 0;
    const int y_frac = 4;

    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                           x_frac, y_frac,
                                           ref_ptr, recon_stride, sse);
}
386 
387 
/*
 * 16x16 variance with a fixed offset in both directions.
 *
 * Forwards to vp8_sub_pixel_variance16x16_mmx() with xoffset = 4 and
 * yoffset = 4.  Going by the function name, 4 is the half-pixel step --
 * confirm against the layout of the bilinear filter table.
 */
unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    const int x_frac = 4;
    const int y_frac = 4;

    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                           x_frac, y_frac,
                                           ref_ptr, recon_stride, sse);
}
398