/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_config.h"

#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"

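// The wrappers below compute block variance on top of two AVX2 kernels
// (16 and 32 pixels wide), using the standard identity
//   variance = sse - (sum * sum) / (w * h).
// Since w * h is always a power of two, the division is done with a shift:
//   variance = sse - ((sum * sum) >> log2(w * h)).

// Function-pointer type for the "get var" kernels, so variance_avx2() can
// dispatch to either the 16-wide or the 32-wide implementation.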
typedef void (*get_var_avx2)(const unsigned char *src_ptr, int source_stride,
                             const unsigned char *ref_ptr, int recon_stride,
                             unsigned int *SSE, int *Sum);

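// Prototypes for the AVX2 kernels; the definitions live in the separate
// AVX2 intrinsics sources, not in this file.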
void vp9_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum);

void vp9_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum);

unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse);

unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset, int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sseptr);

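// Accumulate SSE and sum over a w x h area by tiling it with the given
// kernel: each var_fn() call covers a block_size-wide, 16-row tile, and the
// partial results are summed into *sse and *sum.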
static void variance_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_avx2 var_fn, int block_size) {
  unsigned int sse0;
  int sum0;
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += 16) {
    for (j = 0; j < w; j += block_size) {
      // each call processes a block_size x 16 tile
      var_fn(src_ptr + source_stride * i + j, source_stride,
             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

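// 16x16: N = 256 pixels, so the mean correction divides by 256 (>> 8).
// The local variable avg actually holds the pixel-difference sum; it is at
// most 255 * 256 = 65280, so sum * sum still fits in 32 bits here. The
// larger block sizes below need a 64-bit product.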
unsigned int vp9_variance16x16_avx2(const unsigned char *src_ptr,
                                    int source_stride,
                                    const unsigned char *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
                &var, &avg, vp9_get16x16var_avx2, 16);
  *sse = var;
  return (var - (((unsigned int)avg * avg) >> 8));
}

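// MSE is the sum of squared errors without the mean correction, so the
// kernel's sum output is computed but intentionally ignored here.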
unsigned int vp9_mse16x16_avx2(const unsigned char *src_ptr,
                               int source_stride,
                               const unsigned char *ref_ptr,
                               int recon_stride,
                               unsigned int *sse) {
  unsigned int sse0;
  int sum0;
  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
                       &sum0);
  *sse = sse0;
  return sse0;
}

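// 32x32: N = 1024 (>> 10). The sum can reach 255 * 1024 = 261120, whose
// square overflows 32 bits, hence the int64_t cast before squaring.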
unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // process a 32-pixel-wide strip per kernel call
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 10));
}

unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // process a 32-pixel-wide strip per kernel call
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  // 32x16: N = 512 (>> 9)
  return (var - (((int64_t)avg * avg) >> 9));
}

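// The 64-wide blocks reuse the 32-wide kernel: variance_avx2() walks the two
// 32-pixel-wide column halves internally. Only the final shift changes:
// 64x64 has N = 4096 (>> 12) and 64x32 has N = 2048 (>> 11).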
unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // process a 32-pixel-wide strip per kernel call
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 12));
}

unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // process a 32-pixel-wide strip per kernel call
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
                &var, &avg, vp9_get32x32var_avx2, 32);

  *sse = var;
  return (var - (((int64_t)avg * avg) >> 11));
}

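// Sub-pixel variance for 64-wide blocks: the 32xh kernel handles one
// 32-pixel-wide column at a time, so it is run twice (at src and src + 32)
// and the partial sums are combined before the mean correction (N = 4096).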
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse, sse2;
  // left 32-pixel-wide column
  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                           y_offset, dst, dst_stride,
                                           64, &sse);
  // right 32-pixel-wide column
  int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
                                            x_offset, y_offset,
                                            dst + 32, dst_stride,
                                            64, &sse2);
  se += se2;
  sse += sse2;
  *sse_ptr = sse;
  return sse - (((int64_t)se * se) >> 12);
}

unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  // single 32-pixel-wide column; N = 1024 (>> 10)
  unsigned int sse;
  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                           y_offset, dst, dst_stride,
                                           32, &sse);
  *sse_ptr = sse;
  return sse - (((int64_t)se * se) >> 10);
}

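// The "avg" variants additionally average the filtered prediction with a
// second predictor (sec) before computing the difference, typically for
// compound prediction. Note that sec is passed with a stride equal to the
// block width (64 or 32), i.e. it is stored contiguously.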
unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  unsigned int sse, sse2;
  // left 32-pixel-wide column
  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                               y_offset, dst, dst_stride,
                                               sec, 64, 64, &sse);
  // right 32-pixel-wide column
  int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
                                                y_offset, dst + 32, dst_stride,
                                                sec + 32, 64, 64, &sse2);
  se += se2;
  sse += sse2;
  *sseptr = sse;

  return sse - (((int64_t)se * se) >> 12);
}

unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  // single 32-pixel-wide column; N = 1024 (>> 10)
  unsigned int sse;
  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                               y_offset, dst, dst_stride,
                                               sec, 32, 32, &sse);
  *sseptr = sse;
  return sse - (((int64_t)se * se) >> 10);
}