1 /*
2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 #include "./vpx_config.h"
11 
12 #include "vp9/encoder/vp9_variance.h"
13 #include "vp9/common/vp9_pragmas.h"
14 #include "vpx_ports/mem.h"
15 
// Callback signature for the block kernels handed to variance_avx2().
// The callee is given two byte buffers with their strides and reports its
// results through the SSE and Sum output pointers.
typedef void (*get_var_avx2)(const unsigned char *src_ptr, int source_stride,
                             const unsigned char *ref_ptr, int recon_stride,
                             unsigned int *SSE, int *Sum);
24 
// Kernel declaration only; the definition is not part of this section.
// Used in this file as a get_var_avx2 callback with a 16-column step and
// called directly for the 16x16 MSE. Results come back through SSE and Sum.
void vp9_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum);
34 
// Kernel declaration only; the definition is not part of this section.
// Used in this file as a get_var_avx2 callback with a 32-column step.
// Results come back through SSE and Sum.
void vp9_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum);
44 
// Kernel declaration only; the definition is not part of this section.
// Callers in this file treat the return value as a signed sum and read the
// squared-error total from *sse, for a 32-column strip of `height` rows.
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse);
56 
// Kernel declaration only; the definition is not part of this section.
// Same calling pattern as vp9_sub_pixel_variance32xh_avx2, with an extra
// `sec` buffer and its stride. Callers in this file treat the return value
// as a signed sum and read the squared-error total from *sseptr.
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset, int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sseptr);
70 
// Accumulates the per-tile results of var_fn over a w x h region.
//
// The region is walked in tiles of block_size columns by 16 rows; var_fn is
// invoked once per tile and is expected to cover exactly that tile.
//
//   src_ptr / source_stride  first buffer and its row stride
//   ref_ptr / recon_stride   second buffer and its row stride
//   w, h                     region size (stepped by block_size and by 16)
//   sse, sum                 outputs: totals of the values var_fn reports
//   var_fn                   tile kernel
//   block_size               horizontal step between kernel calls
static void variance_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_avx2 var_fn, int block_size) {
  int row, col;

  // Outputs are reset first, so an empty region yields zero totals.
  *sse = 0;
  *sum = 0;

  for (row = 0; row < h; row += 16) {
    for (col = 0; col < w; col += block_size) {
      const unsigned char *src_tile = src_ptr + source_stride * row + col;
      const unsigned char *ref_tile = ref_ptr + recon_stride * row + col;
      unsigned int tile_sse;
      int tile_sum;

      var_fn(src_tile, source_stride, ref_tile, recon_stride,
             &tile_sse, &tile_sum);
      *sse += tile_sse;
      *sum += tile_sum;
    }
  }
}
92 
vp9_variance16x16_avx2(const unsigned char * src_ptr,int source_stride,const unsigned char * ref_ptr,int recon_stride,unsigned int * sse)93 unsigned int vp9_variance16x16_avx2
94 (
95   const unsigned char *src_ptr,
96   int  source_stride,
97   const unsigned char *ref_ptr,
98   int  recon_stride,
99   unsigned int *sse) {
100   unsigned int var;
101   int avg;
102 
103   variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
104                 &var, &avg, vp9_get16x16var_avx2, 16);
105   *sse = var;
106   return (var - (((unsigned int)avg * avg) >> 8));
107 }
108 
// 16x16 squared-error total.
//
// Runs the 16x16 kernel once, writes the SSE it reports to *sse and returns
// the same value. The kernel's sum output is discarded.
unsigned int vp9_mse16x16_avx2(const unsigned char *src_ptr,
                               int source_stride,
                               const unsigned char *ref_ptr,
                               int recon_stride,
                               unsigned int *sse) {
  unsigned int block_sse;
  int unused_sum;  // the kernel always writes a sum; it is not needed here

  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &block_sse, &unused_sum);
  return (*sse = block_sse);
}
122 
vp9_variance32x32_avx2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)123 unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
124                                     int  source_stride,
125                                     const uint8_t *ref_ptr,
126                                     int  recon_stride,
127                                     unsigned int *sse) {
128   unsigned int var;
129   int avg;
130 
131   // processing 32 elements vertically in parallel
132   variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
133                 &var, &avg, vp9_get32x32var_avx2, 32);
134   *sse = var;
135   return (var - (((int64_t)avg * avg) >> 10));
136 }
137 
vp9_variance32x16_avx2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)138 unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
139                                     int  source_stride,
140                                     const uint8_t *ref_ptr,
141                                     int  recon_stride,
142                                     unsigned int *sse) {
143   unsigned int var;
144   int avg;
145 
146   // processing 32 elements vertically in parallel
147   variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
148                 &var, &avg, vp9_get32x32var_avx2, 32);
149   *sse = var;
150   return (var - (((int64_t)avg * avg) >> 9));
151 }
152 
153 
vp9_variance64x64_avx2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)154 unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
155                                     int  source_stride,
156                                     const uint8_t *ref_ptr,
157                                     int  recon_stride,
158                                     unsigned int *sse) {
159   unsigned int var;
160   int avg;
161 
162   // processing 32 elements vertically in parallel
163   variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
164                 &var, &avg, vp9_get32x32var_avx2, 32);
165   *sse = var;
166   return (var - (((int64_t)avg * avg) >> 12));
167 }
168 
vp9_variance64x32_avx2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)169 unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
170                                     int  source_stride,
171                                     const uint8_t *ref_ptr,
172                                     int  recon_stride,
173                                     unsigned int *sse) {
174   unsigned int var;
175   int avg;
176 
177   // processing 32 elements vertically in parallel
178   variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
179                 &var, &avg, vp9_get32x32var_avx2, 32);
180 
181   *sse = var;
182   return (var - (((int64_t)avg * avg) >> 11));
183 }
184 
// 64x64 sub-pixel variance.
//
// The 32-column kernel is run twice with height 64: once at column 0 and
// once at column 32 of both buffers. x_offset and y_offset are forwarded
// unchanged. The combined SSE is stored in *sse_ptr and the function returns
// SSE - (sum * sum) / 4096.
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse_left;
  unsigned int sse_right;
  unsigned int sse_total;
  int sum_left;
  int sum_right;
  int sum_total;

  // Left 32 columns.
  sum_left = vp9_sub_pixel_variance32xh_avx2(src, src_stride,
                                             x_offset, y_offset,
                                             dst, dst_stride,
                                             64, &sse_left);
  // Right 32 columns.
  sum_right = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
                                              x_offset, y_offset,
                                              dst + 32, dst_stride,
                                              64, &sse_right);

  sum_total = sum_left + sum_right;
  sse_total = sse_left + sse_right;
  *sse_ptr = sse_total;

  // 64 * 64 = 4096 samples -> shift by 12; the square is formed in 64 bits.
  return (unsigned int)(sse_total - (((int64_t)sum_total * sum_total) >> 12));
}
208 
// 32x32 sub-pixel variance.
//
// A single call to the 32-column kernel with height 32 covers the block.
// x_offset and y_offset are forwarded unchanged. The SSE is stored in
// *sse_ptr and the function returns SSE - (sum * sum) / 1024.
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int block_sse;
  int block_sum;

  block_sum = vp9_sub_pixel_variance32xh_avx2(src, src_stride,
                                              x_offset, y_offset,
                                              dst, dst_stride,
                                              32, &block_sse);
  *sse_ptr = block_sse;

  // 32 * 32 = 1024 samples -> shift by 10; the square is formed in 64 bits.
  return (unsigned int)(block_sse - (((int64_t)block_sum * block_sum) >> 10));
}
224 
// 64x64 sub-pixel variance with a second predictor buffer (`sec`).
//
// The 32-column averaging kernel is run twice with sec stride 64 and
// height 64: once at column 0 and once at column 32 of src, dst and sec.
// x_offset and y_offset are forwarded unchanged. The combined SSE is stored
// in *sseptr and the function returns SSE - (sum * sum) / 4096.
unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  unsigned int sse_left;
  unsigned int sse_right;
  unsigned int sse_total;
  int sum_left;
  int sum_right;
  int sum_total;

  // Left 32 columns.
  sum_left = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride,
                                                 x_offset, y_offset,
                                                 dst, dst_stride,
                                                 sec, 64, 64, &sse_left);
  // Right 32 columns.
  sum_right = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride,
                                                  x_offset, y_offset,
                                                  dst + 32, dst_stride,
                                                  sec + 32, 64, 64,
                                                  &sse_right);

  sum_total = sum_left + sum_right;
  sse_total = sse_left + sse_right;
  *sseptr = sse_total;

  // 64 * 64 = 4096 samples -> shift by 12; the square is formed in 64 bits.
  return (unsigned int)(sse_total - (((int64_t)sum_total * sum_total) >> 12));
}
250 
// 32x32 sub-pixel variance with a second predictor buffer (`sec`).
//
// A single call to the 32-column averaging kernel with sec stride 32 and
// height 32 covers the block. x_offset and y_offset are forwarded unchanged.
// The SSE is stored in *sseptr and the function returns
// SSE - (sum * sum) / 1024.
unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  unsigned int block_sse;
  int block_sum;

  block_sum = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride,
                                                  x_offset, y_offset,
                                                  dst, dst_stride,
                                                  sec, 32, 32, &block_sse);
  *sseptr = block_sse;

  // 32 * 32 = 1024 samples -> shift by 10; the square is formed in 64 bits.
  return (unsigned int)(block_sse - (((int64_t)block_sum * block_sum) >> 10));
}
267 
268 
269