1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #include <assert.h>
12 #include <stdlib.h>
13 #include <string.h>
14 
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 #include "config/av1_rtcd.h"
18 
19 #include "aom/aom_integer.h"
20 #include "aom_ports/mem.h"
21 
22 #include "aom_dsp/aom_filter.h"
23 #include "aom_dsp/blend.h"
24 #include "aom_dsp/variance.h"
25 
26 #include "av1/common/filter.h"
27 #include "av1/common/onyxc_int.h"
28 #include "av1/common/reconinter.h"
29 
// Returns the sum of squared differences between two 4x4 blocks of 8-bit
// samples. a_stride and b_stride are the distances, in samples, between the
// starts of consecutive rows of a and b.
uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride) {
  int sq_err = 0;

  for (int row = 0; row < 4; ++row) {
    const uint8_t *const a_row = a + row * a_stride;
    const uint8_t *const b_row = b + row * b_stride;

    for (int col = 0; col < 4; ++col) {
      const int delta = a_row[col] - b_row[col];
      sq_err += delta * delta;
    }
  }

  return sq_err;
}
47 
// Returns the sum of the squares of the 256 consecutive int16_t values
// starting at a.
uint32_t aom_get_mb_ss_c(const int16_t *a) {
  const int16_t *const end = a + 256;
  unsigned int total = 0;

  for (const int16_t *p = a; p != end; ++p) {
    total += (*p) * (*p);
  }

  return total;
}
57 
// Accumulates, over a w x h block, the per-sample differences a - b.
// On return *sum holds the sum of the differences and *sse the sum of their
// squares. Both outputs are 0 when w or h is not positive.
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int diff_total = 0;
  uint32_t sq_total = 0;

  for (int row = 0; row < h; ++row, a += a_stride, b += b_stride) {
    for (int col = 0; col < w; ++col) {
      const int delta = a[col] - b[col];
      diff_total += delta;
      sq_total += delta * delta;
    }
  }

  *sum = diff_total;
  *sse = sq_total;
}
76 
// Returns the sum of squared differences between two w x h blocks of 8-bit
// samples; w and h are passed straight through to variance().
uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  // variance() always reports the sum of differences as well; only the
  // squared-error total is wanted here, so that sum is discarded.
  int unused_sum;
  uint32_t sq_err;

  variance(a, a_stride, b, b_stride, w, h, &sq_err, &unused_sum);
  return sq_err;
}
84 
85 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
86 // or vertical direction to produce the filtered output block. Used to implement
87 // the first-pass of 2-D separable filter.
88 //
// Produces uint16_t output to retain precision for the next pass. Two filter
90 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
91 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
92 // It defines the offset required to move from one input to the next.
aom_var_filter_block2d_bil_first_pass_c(const uint8_t * a,uint16_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)93 void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
94                                              unsigned int src_pixels_per_line,
95                                              unsigned int pixel_step,
96                                              unsigned int output_height,
97                                              unsigned int output_width,
98                                              const uint8_t *filter) {
99   unsigned int i, j;
100 
101   for (i = 0; i < output_height; ++i) {
102     for (j = 0; j < output_width; ++j) {
103       b[j] = ROUND_POWER_OF_TWO(
104           (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
105 
106       ++a;
107     }
108 
109     a += src_pixels_per_line - output_width;
110     b += output_width;
111   }
112 }
113 
114 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
115 // or vertical direction to produce the filtered output block. Used to implement
116 // the second-pass of 2-D separable filter.
117 //
// Requires 16-bit input as produced by
// aom_var_filter_block2d_bil_first_pass_c. Two filter taps should sum to
// FILTER_WEIGHT. pixel_step defines whether the filter is applied horizontally
// (pixel_step = 1) or vertically (pixel_step = stride). It defines the offset
// required to move from one input to the next. Output is 8-bit.
aom_var_filter_block2d_bil_second_pass_c(const uint16_t * a,uint8_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)123 void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
124                                               unsigned int src_pixels_per_line,
125                                               unsigned int pixel_step,
126                                               unsigned int output_height,
127                                               unsigned int output_width,
128                                               const uint8_t *filter) {
129   unsigned int i, j;
130 
131   for (i = 0; i < output_height; ++i) {
132     for (j = 0; j < output_width; ++j) {
133       b[j] = ROUND_POWER_OF_TWO(
134           (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
135       ++a;
136     }
137 
138     a += src_pixels_per_line - output_width;
139     b += output_width;
140   }
141 }
142 
/* Defines aom_variance<W>x<H>_c(): stores the block's sum of squared
 * differences in *sse and returns *sse - sum^2 / (W * H), where sum is the
 * sum of per-sample differences reported by variance().
 */
#define VAR(W, H) \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) { \
    int diff_sum; \
    variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum); \
    /* The square is formed in 64 bits before dividing by the block area. */ \
    const int64_t sum_sq = (int64_t)diff_sum * diff_sum; \
    return *sse - (uint32_t)(sum_sq / (W * H)); \
  }
151 
/* Defines aom_sub_pixel_variance<W>x<H>_c(): runs the two bilinear passes
 * over the block at `a` (first pass with pixel_step 1 using the xoffset
 * filter, second pass with pixel_step W using the yoffset filter) and returns
 * the variance of the filtered W x H block against `b`.
 */
#define SUBPIX_VAR(W, H) \
  uint32_t aom_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *a, int a_stride, int xoffset, int yoffset, \
      const uint8_t *b, int b_stride, uint32_t *sse) { \
    /* H + 1 rows: the second pass reads each row and the row after it. */ \
    uint16_t first_pass_out[(H + 1) * W]; \
    uint8_t filtered[H * W]; \
    \
    aom_var_filter_block2d_bil_first_pass_c( \
        a, first_pass_out, a_stride, 1, H + 1, W, \
        bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_c( \
        first_pass_out, filtered, W, W, H, W, bilinear_filters_2t[yoffset]); \
    \
    return aom_variance##W##x##H##_c(filtered, W, b, b_stride, sse); \
  }
166 
/* Defines two functions for a W x H block:
 *  - aom_sub_pixel_avg_variance<W>x<H>_c(): bilinearly filters the block at
 *    `a`, combines it with second_pred via aom_comp_avg_pred(), and returns
 *    the variance of the result against `b`.
 *  - aom_dist_wtd_sub_pixel_avg_variance<W>x<H>_c(): same, but combines via
 *    aom_dist_wtd_comp_avg_pred() using jcp_param.
 */
#define SUBPIX_AVG_VAR(W, H) \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \
      const uint8_t *a, int a_stride, int xoffset, int yoffset, \
      const uint8_t *b, int b_stride, uint32_t *sse, \
      const uint8_t *second_pred) { \
    /* H + 1 rows: the second pass reads each row and the row after it. */ \
    uint16_t first_pass_out[(H + 1) * W]; \
    uint8_t filtered[H * W]; \
    DECLARE_ALIGNED(16, uint8_t, averaged[H * W]); \
    \
    aom_var_filter_block2d_bil_first_pass_c( \
        a, first_pass_out, a_stride, 1, H + 1, W, \
        bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_c( \
        first_pass_out, filtered, W, W, H, W, bilinear_filters_2t[yoffset]); \
    \
    aom_comp_avg_pred(averaged, second_pred, W, H, filtered, W); \
    \
    return aom_variance##W##x##H##_c(averaged, W, b, b_stride, sse); \
  } \
  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
      const uint8_t *a, int a_stride, int xoffset, int yoffset, \
      const uint8_t *b, int b_stride, uint32_t *sse, \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
    uint16_t first_pass_out[(H + 1) * W]; \
    uint8_t filtered[H * W]; \
    DECLARE_ALIGNED(16, uint8_t, averaged[H * W]); \
    \
    aom_var_filter_block2d_bil_first_pass_c( \
        a, first_pass_out, a_stride, 1, H + 1, W, \
        bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_c( \
        first_pass_out, filtered, W, W, H, W, bilinear_filters_2t[yoffset]); \
    \
    aom_dist_wtd_comp_avg_pred(averaged, second_pred, W, H, filtered, W, \
                               jcp_param); \
    \
    /* NOTE(review): this calls aom_variance<W>x<H> without the _c suffix, */ \
    /* unlike the function above - confirm that is intended.              */ \
    return aom_variance##W##x##H(averaged, W, b, b_stride, sse); \
  }
202 
/* Defines aom_get<W>x<H>var_c(). Identical to the variance call except that
 * it takes an additional parameter, sum, and returns nothing: the sum of
 * squared differences is written to *sse and the sum of differences to *sum,
 * instead of returning sse - sum^2 / (w * h).
 */
#define GET_VAR(W, H)                                                         \
  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride,                \
                               const uint8_t *b, int b_stride, uint32_t *sse, \
                               int *sum) {                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, sum);                       \
  }
213 
/* Defines aom_mse<W>x<H>_c(). Identical to the variance call except that it
 * does not calculate sse - sum^2 / (w * h); it returns sse in addition to
 * storing it in the passed-in variable.
 */
#define MSE(W, H) \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                                const uint8_t *b, int b_stride, \
                                uint32_t *sse) { \
    int unused_sum; /* Required by variance(); not part of the result. */ \
    variance(a, a_stride, b, b_stride, W, H, sse, &unused_sum); \
    return *sse; \
  }
226 
/* All three forms of the variance are available in the same sizes:
 * VARIANCES(W, H) expands to VAR, SUBPIX_VAR and SUBPIX_AVG_VAR for a single
 * W x H block size.
 */
#define VARIANCES(W, H) \
  VAR(W, H)             \
  SUBPIX_VAR(W, H)      \
  SUBPIX_AVG_VAR(W, H)

/* Define the variance, sub-pixel variance and sub-pixel average variance
 * functions for each block size listed here. */
VARIANCES(128, 128)
VARIANCES(128, 64)
VARIANCES(64, 128)
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)
VARIANCES(4, 2)
VARIANCES(2, 4)
VARIANCES(2, 2)
VARIANCES(4, 16)
VARIANCES(16, 4)
VARIANCES(8, 32)
VARIANCES(32, 8)
VARIANCES(16, 64)
VARIANCES(64, 16)

/* Defines aom_get16x16var_c() and aom_get8x8var_c(). */
GET_VAR(16, 16)
GET_VAR(8, 8)

/* Defines aom_mse16x16_c(), aom_mse16x8_c(), aom_mse8x16_c() and
 * aom_mse8x8_c(). */
MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
266 
// Writes to comp_pred the rounded average of pred and ref for a
// width x height block. comp_pred and pred are packed (row stride == width);
// ref is read with a row stride of ref_stride.
void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      const int pair_sum = pred[col] + ref[col];
      comp_pred[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
    }

    ref += ref_stride;
    pred += width;
    comp_pred += width;
  }
}
281 
// Get pred block from up-sampled reference.
//
// Writes a width x height prediction block to comp_pred, stored packed (row
// stride == width).
//
// When xd is non-NULL and the scale factors chosen for the block report
// scaling (av1_is_scaled()), the block is built by av1_make_inter_predictor()
// for plane 0 with the EIGHTTAP_REGULAR filter, using mv, mi_row and mi_col;
// subpel_x_q3, subpel_y_q3, ref, ref_stride and subpel_search are not used on
// that path.
//
// Otherwise the block is read from ref (row stride ref_stride) with the filter
// returned by av1_get_filter(subpel_search):
//   - both sub-pel offsets zero: plain row-by-row copy;
//   - only subpel_x_q3 non-zero: horizontal convolution;
//   - only subpel_y_q3 non-zero: vertical convolution;
//   - both non-zero: horizontal pass into a temporary buffer followed by a
//     vertical pass.
void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                          int mi_row, int mi_col, const MV *const mv,
                          uint8_t *comp_pred, int width, int height,
                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
                          int ref_stride, int subpel_search) {
  // expect xd == NULL only in tests
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    // Intra block copy uses the identity scale factors; otherwise use the
    // scale factors stored for reference 0 of this block.
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      // Note: This is mostly a copy from the >=8X8 case in
      // build_inter_predictors() function, with some small tweaks.

      // Some assumptions.
      const int plane = 0;

      // Get pre-requisites.
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const int ssx = pd->subsampling_x;
      const int ssy = pd->subsampling_y;
      assert(ssx == 0 && ssy == 0);
      const struct buf_2d *const dst_buf = &pd->dst;
      // Intra block copy reads from the destination buffer itself.
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;

      // Calculate subpel_x/y and x/y_step.
      const int row_start = 0;  // Because ss_y is 0.
      const int col_start = 0;  // Because ss_x is 0.
      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
      // Block position in sub-pel units plus the motion vector; with
      // ssx == ssy == 0 (asserted above) the vector components are doubled.
      int orig_pos_y = pre_y << SUBPEL_BITS;
      orig_pos_y += mv->row * (1 << (1 - ssy));
      int orig_pos_x = pre_x << SUBPEL_BITS;
      orig_pos_x += mv->col * (1 << (1 - ssx));
      int pos_y = sf->scale_value_y(orig_pos_y, sf);
      int pos_x = sf->scale_value_x(orig_pos_x, sf);
      pos_x += SCALE_EXTRA_OFF;
      pos_y += SCALE_EXTRA_OFF;

      // Clamp the scaled position to the reference buffer extended by the
      // margins below.
      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
                         << SCALE_SUBPEL_BITS;
      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
                        << SCALE_SUBPEL_BITS;
      pos_y = clamp(pos_y, top, bottom);
      pos_x = clamp(pos_x, left, right);

      // Integer part of the position selects the first source sample; the
      // fractional part goes into subpel_params.
      const uint8_t *const pre =
          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
          (pos_x >> SCALE_SUBPEL_BITS);

      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
                                           pos_x & SCALE_SUBPEL_MASK,
                                           pos_y & SCALE_SUBPEL_MASK };

      // Get warp types.
      const WarpedMotionParams *const wm =
          &xd->global_motion[mi->ref_frame[ref_num]];
      const int is_global = is_global_mv_block(mi, wm->wmtype);
      WarpTypesAllowed warp_types;
      warp_types.global_warp_allowed = is_global;
      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;

      // Get convolve parameters.
      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
      const InterpFilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);

      // Get the inter predictor.
      const int build_for_obmc = 0;
      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
                               &subpel_params, sf, width, height, &conv_params,
                               filters, &warp_types, mi_x >> pd->subsampling_x,
                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
                               build_for_obmc, xd, cm->allow_warped_motion);

      return;
    }
  }

  // Unscaled path (also taken whenever xd is NULL).
  const InterpFilterParams *filter = av1_get_filter(subpel_search);

  if (!subpel_x_q3 && !subpel_y_q3) {
    // No sub-pel offset: copy the block row by row.
    for (int i = 0; i < height; i++) {
      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
      comp_pred += width;
      ref += ref_stride;
    }
  } else if (!subpel_y_q3) {
    // Horizontal offset only. The offset is doubled before the kernel lookup
    // (presumably converting 1/8-pel to 1/16-pel units - TODO confirm).
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
                          -1, width, height);
  } else if (!subpel_x_q3) {
    // Vertical offset only.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
                         16, width, height);
  } else {
    // Both offsets non-zero: horizontal pass into temp (row stride
    // MAX_SB_SIZE), then vertical pass from temp into comp_pred.
    DECLARE_ALIGNED(16, uint8_t,
                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    // Number of horizontally filtered rows produced for the vertical pass.
    const int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
    // Start (taps / 2 - 1) rows above ref; the vertical pass below begins at
    // the matching row offset inside temp.
    aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
                          width, intermediate_height);
    aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
                         width, height);
  }
}
407 
// Builds the up-sampled prediction into comp_pred via aom_upsampled_pred(),
// then replaces every sample of it with the rounded average of that sample
// and the co-located sample of pred. Both comp_pred and pred are packed (row
// stride == width).
void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                   int mi_row, int mi_col, const MV *const mv,
                                   uint8_t *comp_pred, const uint8_t *pred,
                                   int width, int height, int subpel_x_q3,
                                   int subpel_y_q3, const uint8_t *ref,
                                   int ref_stride, int subpel_search) {
  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);

  for (int row = 0; row < height; ++row, comp_pred += width, pred += width) {
    for (int col = 0; col < width; ++col) {
      const int pair_sum = comp_pred[col] + pred[col];
      comp_pred[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
    }
  }
}
426 
aom_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred,const uint8_t * pred,int width,int height,const uint8_t * ref,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)427 void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
428                                   int width, int height, const uint8_t *ref,
429                                   int ref_stride,
430                                   const DIST_WTD_COMP_PARAMS *jcp_param) {
431   int i, j;
432   const int fwd_offset = jcp_param->fwd_offset;
433   const int bck_offset = jcp_param->bck_offset;
434 
435   for (i = 0; i < height; ++i) {
436     for (j = 0; j < width; ++j) {
437       int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
438       tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
439       comp_pred[j] = (uint8_t)tmp;
440     }
441     comp_pred += width;
442     pred += width;
443     ref += ref_stride;
444   }
445 }
446 
// Builds the up-sampled prediction into comp_pred via aom_upsampled_pred_c(),
// then replaces every sample of it with a weighted blend of pred (weight
// jcp_param->bck_offset) and that sample (weight jcp_param->fwd_offset),
// rounded down by DIST_PRECISION_BITS. comp_pred and pred are packed (row
// stride == width).
void aom_dist_wtd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
  const int upsampled_weight = jcp_param->fwd_offset;
  const int pred_weight = jcp_param->bck_offset;

  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
                       subpel_search);

  for (int row = 0; row < height; ++row, comp_pred += width, pred += width) {
    for (int col = 0; col < width; ++col) {
      const int weighted =
          pred[col] * pred_weight + comp_pred[col] * upsampled_weight;
      const int rounded = ROUND_POWER_OF_TWO(weighted, DIST_PRECISION_BITS);
      comp_pred[col] = (uint8_t)rounded;
    }
  }
}
470 
// Accumulates, over a w x h block of 16-bit samples, the per-sample
// differences a - b. a8 and b8 are converted to uint16_t pointers with
// CONVERT_TO_SHORTPTR(). On return *sum holds the 64-bit sum of the
// differences and *sse the 64-bit sum of their squares.
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  int64_t diff_total = 0;
  uint64_t sq_total = 0;

  for (int row = 0; row < h; ++row, a += a_stride, b += b_stride) {
    // Differences are summed per row in 32 bits, then folded into the
    // 64-bit total.
    int32_t row_diff = 0;

    for (int col = 0; col < w; ++col) {
      const int delta = a[col] - b[col];
      row_diff += delta;
      sq_total += (uint32_t)(delta * delta);
    }

    diff_total += row_diff;
  }

  *sum = diff_total;
  *sse = sq_total;
}
492 
// Returns the 64-bit sum of squared differences between two w x h
// high-bitdepth blocks, as computed by highbd_variance64().
uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  // highbd_variance64() also reports the sum of differences; it is not
  // needed here and is discarded.
  int64_t unused_sum;
  uint64_t sq_err;

  highbd_variance64(a, a_stride, b, b_stride, w, h, &sq_err, &unused_sum);
  return sq_err;
}
500 
// Computes the 64-bit statistics with highbd_variance64() and stores them,
// narrowed without any scaling, in *sse (uint32_t) and *sum (int).
static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  uint64_t wide_sse = 0;
  int64_t wide_sum = 0;

  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &wide_sse, &wide_sum);

  *sse = (uint32_t)wide_sse;
  *sum = (int)wide_sum;
}
510 
// Computes the 64-bit statistics with highbd_variance64(), then stores the
// SSE rounded down by 4 bits in *sse and the sum rounded down by 2 bits in
// *sum (presumably to bring 10-bit statistics to the 8-bit scale - confirm).
static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t wide_sse = 0;
  int64_t wide_sum = 0;

  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &wide_sse, &wide_sum);

  *sse = (uint32_t)ROUND_POWER_OF_TWO(wide_sse, 4);
  *sum = (int)ROUND_POWER_OF_TWO(wide_sum, 2);
}
520 
// Computes the 64-bit statistics with highbd_variance64(), then stores the
// SSE rounded down by 8 bits in *sse and the sum rounded down by 4 bits in
// *sum (presumably to bring 12-bit statistics to the 8-bit scale - confirm).
static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t wide_sse = 0;
  int64_t wide_sum = 0;

  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &wide_sse, &wide_sum);

  *sse = (uint32_t)ROUND_POWER_OF_TWO(wide_sse, 8);
  *sum = (int)ROUND_POWER_OF_TWO(wide_sum, 4);
}
530 
/* Defines aom_highbd_8_variance<W>x<H>_c(), aom_highbd_10_variance<W>x<H>_c()
 * and aom_highbd_12_variance<W>x<H>_c(). Each stores the SSE produced by the
 * matching highbd_*_variance() helper in *sse and returns
 * sse - sum^2 / (W * H). The 10- and 12-bit versions return 0 when that
 * difference is negative; the 8-bit version has no such check.
 */
#define HIGHBD_VAR(W, H) \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                              const uint8_t *b, int b_stride, \
                                              uint32_t *sse) { \
    int diff_sum; \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum); \
    const int64_t sum_sq = (int64_t)diff_sum * diff_sum; \
    return *sse - (uint32_t)(sum_sq / (W * H)); \
  } \
  \
  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) { \
    int diff_sum; \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum); \
    const int64_t sum_sq = (int64_t)diff_sum * diff_sum; \
    const int64_t var = (int64_t)(*sse) - (sum_sq / (W * H)); \
    if (var < 0) return 0; \
    return (uint32_t)var; \
  } \
  \
  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) { \
    int diff_sum; \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum); \
    const int64_t sum_sq = (int64_t)diff_sum * diff_sum; \
    const int64_t var = (int64_t)(*sse) - (sum_sq / (W * H)); \
    if (var < 0) return 0; \
    return (uint32_t)var; \
  }
559 
/* Defines aom_highbd_8_get<S>x<S>var_c(), aom_highbd_10_get<S>x<S>var_c() and
 * aom_highbd_12_get<S>x<S>var_c() for a square S x S block. Each forwards to
 * the matching highbd_*_variance() helper, which writes the sum of squared
 * differences to *sse and the sum of differences to *sum.
 */
#define HIGHBD_GET_VAR(S)                                                    \
  void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride,  \
                                        const uint8_t *ref, int ref_stride,  \
                                        uint32_t *sse, int *sum) {           \
    highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);     \
  }                                                                          \
                                                                             \
  void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) {          \
    highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }                                                                          \
                                                                             \
  void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) {          \
    highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }
578 
/* Defines aom_highbd_8_mse<W>x<H>_c(), aom_highbd_10_mse<W>x<H>_c() and
 * aom_highbd_12_mse<W>x<H>_c(). Each stores the SSE produced by the matching
 * highbd_*_variance() helper in *sse and returns that same value; the sum of
 * differences is computed by the helper but discarded.
 */
#define HIGHBD_MSE(W, H) \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse) { \
    int unused_sum; \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, \
                      &unused_sum); \
    return *sse; \
  } \
  \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) { \
    int unused_sum; \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, \
                       &unused_sum); \
    return *sse; \
  } \
  \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) { \
    int unused_sum; \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, \
                       &unused_sum); \
    return *sse; \
  }
603 
aom_highbd_var_filter_block2d_bil_first_pass(const uint8_t * src_ptr8,uint16_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)604 void aom_highbd_var_filter_block2d_bil_first_pass(
605     const uint8_t *src_ptr8, uint16_t *output_ptr,
606     unsigned int src_pixels_per_line, int pixel_step,
607     unsigned int output_height, unsigned int output_width,
608     const uint8_t *filter) {
609   unsigned int i, j;
610   uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
611   for (i = 0; i < output_height; ++i) {
612     for (j = 0; j < output_width; ++j) {
613       output_ptr[j] = ROUND_POWER_OF_TWO(
614           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
615           FILTER_BITS);
616 
617       ++src_ptr;
618     }
619 
620     // Next row...
621     src_ptr += src_pixels_per_line - output_width;
622     output_ptr += output_width;
623   }
624 }
625 
aom_highbd_var_filter_block2d_bil_second_pass(const uint16_t * src_ptr,uint16_t * output_ptr,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)626 void aom_highbd_var_filter_block2d_bil_second_pass(
627     const uint16_t *src_ptr, uint16_t *output_ptr,
628     unsigned int src_pixels_per_line, unsigned int pixel_step,
629     unsigned int output_height, unsigned int output_width,
630     const uint8_t *filter) {
631   unsigned int i, j;
632 
633   for (i = 0; i < output_height; ++i) {
634     for (j = 0; j < output_width; ++j) {
635       output_ptr[j] = ROUND_POWER_OF_TWO(
636           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
637           FILTER_BITS);
638       ++src_ptr;
639     }
640 
641     src_ptr += src_pixels_per_line - output_width;
642     output_ptr += output_width;
643   }
644 }
645 
/*
 * Emits aom_highbd_<BD>_sub_pixel_variance<W>x<H>_c for a single bit depth.
 *
 * The generated function bilinearly interpolates `src` at (xoffset, yoffset)
 * in two passes -- the first with a pixel step of 1 producing H + 1 rows, the
 * second with a pixel step of W (one row of the intermediate) producing H
 * rows -- and returns the result of the matching
 * aom_highbd_<BD>_variance<W>x<H>_c against `dst`.
 */
#define HIGHBD_SUBPIX_VAR_BD(BD, W, H)                                 \
  uint32_t aom_highbd_##BD##_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,    \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {             \
    uint16_t first_pass_out[(H + 1) * W];                              \
    uint16_t interp[H * W];                                            \
                                                                       \
    aom_highbd_var_filter_block2d_bil_first_pass(                      \
        src, first_pass_out, src_stride, 1, H + 1, W,                  \
        bilinear_filters_2t[xoffset]);                                 \
    aom_highbd_var_filter_block2d_bil_second_pass(                     \
        first_pass_out, interp, W, W, H, W,                            \
        bilinear_filters_2t[yoffset]);                                 \
                                                                       \
    return aom_highbd_##BD##_variance##W##x##H##_c(                    \
        CONVERT_TO_BYTEPTR(interp), W, dst, dst_stride, sse);          \
  }

/* Sub-pixel variance for the 8-, 10- and 12-bit variants of a WxH block. */
#define HIGHBD_SUBPIX_VAR(W, H)  \
  HIGHBD_SUBPIX_VAR_BD(8, W, H)  \
  HIGHBD_SUBPIX_VAR_BD(10, W, H) \
  HIGHBD_SUBPIX_VAR_BD(12, W, H)
691 
/*
 * Emits aom_highbd_<BD>_sub_pixel_avg_variance<W>x<H>_c for one bit depth.
 *
 * The generated function bilinearly interpolates `src` at (xoffset, yoffset)
 * in two passes, averages the result with `second_pred` through
 * aom_highbd_comp_avg_pred_c, and returns
 * aom_highbd_<BD>_variance<W>x<H>_c of that average against `dst`.
 */
#define HIGHBD_SUBPIX_AVG_VAR_BD(BD, W, H)                                   \
  uint32_t aom_highbd_##BD##_sub_pixel_avg_variance##W##x##H##_c(            \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t first_pass_out[(H + 1) * W];                                    \
    uint16_t interp[H * W];                                                  \
    DECLARE_ALIGNED(16, uint16_t, averaged[H * W]);                          \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, first_pass_out, src_stride, 1, H + 1, W,                        \
        bilinear_filters_2t[xoffset]);                                       \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        first_pass_out, interp, W, W, H, W,                                  \
        bilinear_filters_2t[yoffset]);                                       \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(averaged), second_pred, W, \
                               H, CONVERT_TO_BYTEPTR(interp), W);            \
                                                                             \
    return aom_highbd_##BD##_variance##W##x##H##_c(                          \
        CONVERT_TO_BYTEPTR(averaged), W, dst, dst_stride, sse);              \
  }

/*
 * Emits aom_highbd_<BD>_dist_wtd_sub_pixel_avg_variance<W>x<H>_c for one bit
 * depth.
 *
 * Same interpolation as above, but the compound step uses
 * aom_highbd_dist_wtd_comp_avg_pred with `jcp_param`. Note that this variant
 * calls aom_highbd_dist_wtd_comp_avg_pred and aom_highbd_<BD>_variance<W>x<H>
 * without the _c suffix, unlike the plain-average variant.
 */
#define HIGHBD_DIST_WTD_SUBPIX_AVG_VAR_BD(BD, W, H)                          \
  uint32_t aom_highbd_##BD##_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(   \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {   \
    uint16_t first_pass_out[(H + 1) * W];                                    \
    uint16_t interp[H * W];                                                  \
    DECLARE_ALIGNED(16, uint16_t, averaged[H * W]);                          \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, first_pass_out, src_stride, 1, H + 1, W,                        \
        bilinear_filters_2t[xoffset]);                                       \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        first_pass_out, interp, W, W, H, W,                                  \
        bilinear_filters_2t[yoffset]);                                       \
                                                                             \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(averaged),          \
                                      second_pred, W, H,                     \
                                      CONVERT_TO_BYTEPTR(interp), W,         \
                                      jcp_param);                            \
                                                                             \
    return aom_highbd_##BD##_variance##W##x##H(                              \
        CONVERT_TO_BYTEPTR(averaged), W, dst, dst_stride, sse);              \
  }

/*
 * Compound (averaged) sub-pixel variance for a WxH block: plain-average and
 * distance-weighted variants, each at 8, 10 and 12 bits.
 */
#define HIGHBD_SUBPIX_AVG_VAR(W, H)            \
  HIGHBD_SUBPIX_AVG_VAR_BD(8, W, H)            \
  HIGHBD_SUBPIX_AVG_VAR_BD(10, W, H)           \
  HIGHBD_SUBPIX_AVG_VAR_BD(12, W, H)           \
  HIGHBD_DIST_WTD_SUBPIX_AVG_VAR_BD(8, W, H)   \
  HIGHBD_DIST_WTD_SUBPIX_AVG_VAR_BD(10, W, H)  \
  HIGHBD_DIST_WTD_SUBPIX_AVG_VAR_BD(12, W, H)
815 
/*
 * All three forms of the variance are available in the same sizes.
 *
 * HIGHBD_VARIANCES(W, H) expands, for one WxH block size, to the functions
 * generated by HIGHBD_VAR, HIGHBD_SUBPIX_VAR and HIGHBD_SUBPIX_AVG_VAR.
 */
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H)             \
  HIGHBD_SUBPIX_VAR(W, H)      \
  HIGHBD_SUBPIX_AVG_VAR(W, H)
821 
/*
 * Instantiate the high-bitdepth variance functions for every supported block
 * size. Each line defines the functions produced by HIGHBD_VAR,
 * HIGHBD_SUBPIX_VAR and HIGHBD_SUBPIX_AVG_VAR for that WxH.
 */
/* Square, 2:1 and 1:2 sizes from 128x128 down to 2x2. */
HIGHBD_VARIANCES(128, 128)
HIGHBD_VARIANCES(128, 64)
HIGHBD_VARIANCES(64, 128)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)
HIGHBD_VARIANCES(4, 2)
HIGHBD_VARIANCES(2, 4)
HIGHBD_VARIANCES(2, 2)
/* 4:1 and 1:4 sizes. */
HIGHBD_VARIANCES(4, 16)
HIGHBD_VARIANCES(16, 4)
HIGHBD_VARIANCES(8, 32)
HIGHBD_VARIANCES(32, 8)
HIGHBD_VARIANCES(16, 64)
HIGHBD_VARIANCES(64, 16)

/*
 * HIGHBD_GET_VAR is defined earlier in this file; presumably it emits the
 * get-variance helpers for 8x8 and 16x16 blocks -- confirm against the macro.
 */
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)

/* MSE functions (aom_highbd_*_mse<W>x<H>_c) for the four sizes below. */
HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
855 
/*
 * Averages two high-bitdepth predictions sample by sample.
 *
 * All three buffer arguments are "byte pointer" aliases converted back to
 * uint16_t with CONVERT_TO_SHORTPTR. `pred8` and `comp_pred8` are packed
 * (row stride == width); `ref8` uses ref_stride. Each output sample is
 * ROUND_POWER_OF_TWO(pred + ref, 1).
 */
void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);

  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      const int pair_sum = pred_row[col] + ref_row[col];
      out_row[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
    }
    out_row += width;
    pred_row += width;
    ref_row += ref_stride;
  }
}
873 
/*
 * Builds a width x height high-bitdepth prediction into comp_pred8.
 *
 * Two paths:
 *  - Scaled reference: when xd is non-NULL and the scale factors selected
 *    for the block report scaling (av1_is_scaled), the prediction is built
 *    with av1_make_inter_predictor from the luma plane's reference buffer
 *    using the block position (mi_row, mi_col) and motion vector `mv`. In
 *    this path ref8, ref_stride, subpel_x_q3, subpel_y_q3, bd and
 *    subpel_search are not used.
 *  - Unscaled reference (or xd == NULL, expected only in tests): the block
 *    is read from ref8 / ref_stride. With both sub-pixel offsets zero the
 *    rows are copied; with only one non-zero offset a single horizontal or
 *    vertical 8-tap convolve call is made; otherwise a horizontal pass into
 *    a temporary buffer is followed by a vertical pass.
 *
 * The copy path writes the output packed (row stride == width), and the
 * convolve / inter-predictor calls are passed `width` in the destination
 * stride position. The filter is chosen by av1_get_filter(subpel_search) and
 * the kernel index is the q3 offset shifted left by one.
 */
void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
                                 const struct AV1Common *const cm, int mi_row,
                                 int mi_col, const MV *const mv,
                                 uint8_t *comp_pred8, int width, int height,
                                 int subpel_x_q3, int subpel_y_q3,
                                 const uint8_t *ref8, int ref_stride, int bd,
                                 int subpel_search) {
  // expect xd == NULL only in tests
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      // Note: This is mostly a copy from the >=8X8 case in
      // build_inter_predictors() function, with some small tweaks.
      // Some assumptions.
      const int plane = 0;

      // Get pre-requisites.
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const int ssx = pd->subsampling_x;
      const int ssy = pd->subsampling_y;
      assert(ssx == 0 && ssy == 0);
      const struct buf_2d *const dst_buf = &pd->dst;
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;

      // Calculate subpel_x/y and x/y_step.
      const int row_start = 0;  // Because ss_y is 0.
      const int col_start = 0;  // Because ss_x is 0.
      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
      int orig_pos_y = pre_y << SUBPEL_BITS;
      orig_pos_y += mv->row * (1 << (1 - ssy));
      int orig_pos_x = pre_x << SUBPEL_BITS;
      orig_pos_x += mv->col * (1 << (1 - ssx));
      int pos_y = sf->scale_value_y(orig_pos_y, sf);
      int pos_x = sf->scale_value_x(orig_pos_x, sf);
      pos_x += SCALE_EXTRA_OFF;
      pos_y += SCALE_EXTRA_OFF;

      // Clamp the scaled position to the reference frame plus its margins.
      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
                         << SCALE_SUBPEL_BITS;
      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
                        << SCALE_SUBPEL_BITS;
      pos_y = clamp(pos_y, top, bottom);
      pos_x = clamp(pos_x, left, right);

      // Integer part of the position selects the sample; the fractional part
      // is carried in subpel_params below.
      const uint8_t *const pre =
          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
          (pos_x >> SCALE_SUBPEL_BITS);

      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
                                           pos_x & SCALE_SUBPEL_MASK,
                                           pos_y & SCALE_SUBPEL_MASK };

      // Get warp types.
      const WarpedMotionParams *const wm =
          &xd->global_motion[mi->ref_frame[ref_num]];
      const int is_global = is_global_mv_block(mi, wm->wmtype);
      WarpTypesAllowed warp_types;
      warp_types.global_warp_allowed = is_global;
      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;

      // Get convolve parameters.
      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
      const InterpFilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);

      // Get the inter predictor.
      const int build_for_obmc = 0;
      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
                               &subpel_params, sf, width, height, &conv_params,
                               filters, &warp_types, mi_x >> pd->subsampling_x,
                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
                               build_for_obmc, xd, cm->allow_warped_motion);

      return;
    }
  }

  const InterpFilterParams *filter = av1_get_filter(subpel_search);

  if (!subpel_x_q3 && !subpel_y_q3) {
    // Integer position: plain row-by-row copy into the packed output.
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
    for (int i = 0; i < height; i++) {
      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
      comp_pred += width;
      ref += ref_stride;
    }
  } else if (!subpel_y_q3) {
    // Horizontal-only interpolation.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel,
                                 16, NULL, -1, width, height, bd);
  } else if (!subpel_x_q3) {
    // Vertical-only interpolation.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1,
                                kernel, 16, width, height, bd);
  } else {
    // Two-pass interpolation through an intermediate buffer whose rows have a
    // stride of MAX_SB_SIZE.
    DECLARE_ALIGNED(16, uint16_t,
                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    const int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
    // The horizontal pass writes intermediate_height rows of MAX_SB_SIZE
    // samples into temp, so bound the row count by the buffer that is
    // actually declared above. The earlier bound of
    // (MAX_SB_SIZE * 2 + 16) + 16 rows was larger than temp and therefore did
    // not protect it.
    assert(intermediate_height > 0 &&
           (size_t)intermediate_height * MAX_SB_SIZE <=
               sizeof(temp) / sizeof(temp[0]));
    // Start (taps / 2 - 1) rows above the block so the vertical filter has
    // the rows it needs above the first output row.
    aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1),
                                 ref_stride, CONVERT_TO_BYTEPTR(temp),
                                 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
                                 intermediate_height, bd);
    aom_highbd_convolve8_vert_c(
        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
        bd);
  }
}
1003 
/*
 * Compound-average variant of the high-bitdepth upsampled prediction.
 *
 * First fills comp_pred8 through aom_highbd_upsampled_pred, then replaces
 * every sample with ROUND_POWER_OF_TWO(pred + comp_pred, 1). Both `pred8` and
 * `comp_pred8` are traversed with a row stride of `width`.
 */
void aom_highbd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, int bd, int subpel_search) {
  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                            bd, subpel_search);

  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);
  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      // Average in place with the prediction just written.
      out_row[col] = ROUND_POWER_OF_TWO(pred_row[col] + out_row[col], 1);
    }
    out_row += width;
    pred_row += width;
  }
}
1024 
aom_highbd_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred8,const uint8_t * pred8,int width,int height,const uint8_t * ref8,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)1025 void aom_highbd_dist_wtd_comp_avg_pred_c(
1026     uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
1027     const uint8_t *ref8, int ref_stride,
1028     const DIST_WTD_COMP_PARAMS *jcp_param) {
1029   int i, j;
1030   const int fwd_offset = jcp_param->fwd_offset;
1031   const int bck_offset = jcp_param->bck_offset;
1032   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
1033   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
1034   uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
1035 
1036   for (i = 0; i < height; ++i) {
1037     for (j = 0; j < width; ++j) {
1038       int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
1039       tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
1040       comp_pred[j] = (uint16_t)tmp;
1041     }
1042     comp_pred += width;
1043     pred += width;
1044     ref += ref_stride;
1045   }
1046 }
1047 
/*
 * Distance-weighted compound variant of the high-bitdepth upsampled
 * prediction.
 *
 * Fills comp_pred8 through aom_highbd_upsampled_pred_c, then replaces every
 * sample with
 *   ROUND_POWER_OF_TWO(pred * bck_offset + comp_pred * fwd_offset,
 *                      DIST_PRECISION_BITS)
 * using the offsets in jcp_param. Both `pred8` and `comp_pred8` are traversed
 * with a row stride of `width`.
 */
void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
    int subpel_search) {
  const int upsampled_weight = jcp_param->fwd_offset;
  const int pred_weight = jcp_param->bck_offset;

  aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                              height, subpel_x_q3, subpel_y_q3, ref8,
                              ref_stride, bd, subpel_search);

  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);
  for (int row = 0; row < height; row++) {
    for (int col = 0; col < width; col++) {
      // Blend in place with the prediction just written.
      const int weighted =
          pred_row[col] * pred_weight + out_row[col] * upsampled_weight;
      out_row[col] =
          (uint16_t)ROUND_POWER_OF_TWO(weighted, DIST_PRECISION_BITS);
    }
    out_row += width;
    pred_row += width;
  }
}
1073 
/*
 * Blends `ref` and `pred` under a per-pixel mask into comp_pred.
 *
 * Every output pixel is AOM_BLEND_A64(mask, a, b), where (a, b) is
 * (ref, pred) normally and (pred, ref) when invert_mask is non-zero. `pred`
 * and `comp_pred` are packed (row stride == width); `ref` uses ref_stride and
 * `mask` uses mask_stride.
 */
void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  // Default operand order; swapped below when the mask is inverted.
  const uint8_t *first = ref;
  const uint8_t *second = pred;
  int first_stride = ref_stride;
  int second_stride = width;
  if (invert_mask) {
    first = pred;
    second = ref;
    first_stride = width;
    second_stride = ref_stride;
  }

  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      comp_pred[col] = AOM_BLEND_A64(mask[col], first[col], second[col]);
    }
    comp_pred += width;
    first += first_stride;
    second += second_stride;
    mask += mask_stride;
  }
}
1093 
/*
 * Masked compound prediction with optional sub-pixel upsampling.
 *
 * When either sub-pixel offset is non-zero, the reference is first
 * upsampled into comp_pred via aom_upsampled_pred_c, and that packed result
 * (stride == width) is used as the reference for the blend. Otherwise `ref`
 * is blended directly. The blend itself is aom_comp_mask_pred_c.
 */
void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                    int mi_row, int mi_col, const MV *const mv,
                                    uint8_t *comp_pred, const uint8_t *pred,
                                    int width, int height, int subpel_x_q3,
                                    int subpel_y_q3, const uint8_t *ref,
                                    int ref_stride, const uint8_t *mask,
                                    int mask_stride, int invert_mask,
                                    int subpel_search) {
  const uint8_t *blend_ref = ref;
  int blend_ref_stride = ref_stride;

  if ((subpel_x_q3 | subpel_y_q3) != 0) {
    aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
                         subpel_search);
    // Blend in place from the upsampled block.
    blend_ref = comp_pred;
    blend_ref_stride = width;
  }

  aom_comp_mask_pred_c(comp_pred, pred, width, height, blend_ref,
                       blend_ref_stride, mask, mask_stride, invert_mask);
}
1112 
/*
 * Emits aom_masked_sub_pixel_variance<W>x<H>_c.
 *
 * The generated function bilinearly interpolates `src` at (xoffset, yoffset)
 * in two passes, blends the result with `second_pred` under `msk` through
 * aom_comp_mask_pred_c, and returns aom_variance<W>x<H>_c of the blend
 * against `ref`.
 */
#define MASK_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t first_pass_out[(H + 1) * W];                                     \
    uint8_t interp[H * W];                                                    \
    DECLARE_ALIGNED(16, uint8_t, blended[H * W]);                             \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(src, first_pass_out, src_stride,  \
                                            1, H + 1, W,                      \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(first_pass_out, interp, W, W, H, \
                                             W,                               \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_comp_mask_pred_c(blended, second_pred, W, H, interp, W, msk,          \
                         msk_stride, invert_mask);                            \
    return aom_variance##W##x##H##_c(blended, W, ref, ref_stride, sse);       \
  }
1132 
/*
 * Instantiate aom_masked_sub_pixel_variance<W>x<H>_c for each block size.
 */
/* Square, 2:1 and 1:2 sizes from 4x4 up to 128x128. */
MASK_SUBPIX_VAR(4, 4)
MASK_SUBPIX_VAR(4, 8)
MASK_SUBPIX_VAR(8, 4)
MASK_SUBPIX_VAR(8, 8)
MASK_SUBPIX_VAR(8, 16)
MASK_SUBPIX_VAR(16, 8)
MASK_SUBPIX_VAR(16, 16)
MASK_SUBPIX_VAR(16, 32)
MASK_SUBPIX_VAR(32, 16)
MASK_SUBPIX_VAR(32, 32)
MASK_SUBPIX_VAR(32, 64)
MASK_SUBPIX_VAR(64, 32)
MASK_SUBPIX_VAR(64, 64)
MASK_SUBPIX_VAR(64, 128)
MASK_SUBPIX_VAR(128, 64)
MASK_SUBPIX_VAR(128, 128)
/* 4:1 and 1:4 sizes. */
MASK_SUBPIX_VAR(4, 16)
MASK_SUBPIX_VAR(16, 4)
MASK_SUBPIX_VAR(8, 32)
MASK_SUBPIX_VAR(32, 8)
MASK_SUBPIX_VAR(16, 64)
MASK_SUBPIX_VAR(64, 16)
1155 
// C reference for the high-bitdepth masked compound predictor.
//
// comp_pred8, pred8 and ref8 are byte-pointer handles that are turned into
// 16-bit sample pointers with CONVERT_TO_SHORTPTR. Every output sample is
// AOM_BLEND_A64(mask, ref, pred); a non-zero invert_mask swaps the roles of
// ref and pred in that expression.
//
// Row pitches: the output and pred advance by `width` samples per row, ref by
// ref_stride and mask by mask_stride.
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  uint16_t *dst = CONVERT_TO_SHORTPTR(comp_pred8);
  // Decide once which buffer is passed as the second and which as the third
  // AOM_BLEND_A64 argument, instead of testing invert_mask for every sample.
  const uint16_t *first =
      invert_mask ? CONVERT_TO_SHORTPTR(pred8) : CONVERT_TO_SHORTPTR(ref8);
  const uint16_t *second =
      invert_mask ? CONVERT_TO_SHORTPTR(ref8) : CONVERT_TO_SHORTPTR(pred8);
  const int first_stride = invert_mask ? width : ref_stride;
  const int second_stride = invert_mask ? ref_stride : width;
  int row, col;

  for (row = 0; row < height; ++row) {
    for (col = 0; col < width; ++col) {
      dst[col] = AOM_BLEND_A64(mask[col], first[col], second[col]);
    }
    dst += width;
    first += first_stride;
    second += second_stride;
    mask += mask_stride;
  }
}
1177 
// Builds a masked compound prediction from a sub-pixel high-bitdepth
// reference, in two steps:
//   1. aom_highbd_upsampled_pred() is called with comp_pred8 as its output
//      buffer (presumably it writes the interpolated width x height reference
//      block there - confirm against its definition).
//   2. aom_highbd_comp_mask_pred() blends in place: comp_pred8 is passed both
//      as the destination and as the ref8 argument, with a ref stride of
//      `width`, together with pred8 and the mask.
//
// xd, cm, mi_row, mi_col, mv, subpel_x_q3, subpel_y_q3, ref8, ref_stride, bd
// and subpel_search are only forwarded to aom_highbd_upsampled_pred().
// mask, mask_stride and invert_mask are only forwarded to
// aom_highbd_comp_mask_pred().
void aom_highbd_comp_mask_upsampled_pred(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
    int bd, int subpel_search) {
  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                            bd, subpel_search);
  // The upsampled reference now lives in comp_pred8; blend it with pred8 and
  // overwrite it with the result.
  aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width,
                            mask, mask_stride, invert_mask);
}
1190 
// HIGHBD_MASK_SUBPIX_VAR_BD(BD, W, H) defines
//   aom_highbd_<BD>_masked_sub_pixel_variance<W>x<H>_c()
// for one bit-depth token BD (8, 10 or 12) and one block size.
//
// The generated function:
//   1. runs the first bilinear pass over H + 1 rows of src with the xoffset
//      filter, then the second pass with the yoffset filter, producing a
//      W x H block in temp2;
//   2. blends temp2 with second_pred through msk (optionally inverted) into
//      temp3 using aom_highbd_comp_mask_pred_c();
//   3. returns aom_highbd_<BD>_variance<W>x<H>_c() of temp3 against ref, which
//      also fills *sse.
//
// All intermediate buffers hold 16-bit samples and are passed on through
// CONVERT_TO_BYTEPTR handles.
#define HIGHBD_MASK_SUBPIX_VAR_BD(BD, W, H)                                    \
  unsigned int aom_highbd_##BD##_masked_sub_pixel_variance##W##x##H##_c(       \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
      const uint8_t *msk, int msk_stride, int invert_mask,                     \
      unsigned int *sse) {                                                     \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint16_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
                                                                               \
    aom_highbd_var_filter_block2d_bil_first_pass(                              \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    aom_highbd_var_filter_block2d_bil_second_pass(                             \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                               \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                  \
                                                                               \
    return aom_highbd_##BD##_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3),  \
                                                   W, ref, ref_stride, sse);   \
  }

// Emits the 8-, 10- and 12-bit masked sub-pixel variance functions for one
// block size. The three bodies differ only in the bit-depth token, so they
// share the single definition above.
#define HIGHBD_MASK_SUBPIX_VAR(W, H)  \
  HIGHBD_MASK_SUBPIX_VAR_BD(8, W, H)  \
  HIGHBD_MASK_SUBPIX_VAR_BD(10, W, H) \
  HIGHBD_MASK_SUBPIX_VAR_BD(12, W, H)
1257 
// Instantiate the 8-, 10- and 12-bit masked sub-pixel variance functions for
// every supported block size (W, H). Each expansion calls the matching
// aom_highbd_{8,10,12}_variance<W>x<H>_c function.
HIGHBD_MASK_SUBPIX_VAR(4, 4)
HIGHBD_MASK_SUBPIX_VAR(4, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 16)
HIGHBD_MASK_SUBPIX_VAR(32, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 32)
HIGHBD_MASK_SUBPIX_VAR(64, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 64)
HIGHBD_MASK_SUBPIX_VAR(128, 128)
// Block sizes with a 1:4 or 4:1 width-to-height ratio.
HIGHBD_MASK_SUBPIX_VAR(4, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 16)
1280 
obmc_variance(const uint8_t * pre,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1281 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
1282                                  const int32_t *wsrc, const int32_t *mask,
1283                                  int w, int h, unsigned int *sse, int *sum) {
1284   int i, j;
1285 
1286   *sse = 0;
1287   *sum = 0;
1288 
1289   for (i = 0; i < h; i++) {
1290     for (j = 0; j < w; j++) {
1291       int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1292       *sum += diff;
1293       *sse += diff * diff;
1294     }
1295 
1296     pre += pre_stride;
1297     wsrc += w;
1298     mask += w;
1299   }
1300 }
1301 
// OBMC_VAR(W, H) defines aom_obmc_variance<W>x<H>_c(): it collects the error
// sum and sum of squares with obmc_variance(), stores the latter in *sse and
// returns *sse minus sum^2 / (W * H). The square is formed in 64 bits before
// the division and narrowed to unsigned int afterwards.
#define OBMC_VAR(W, H)                                           \
  unsigned int aom_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,   \
      const int32_t *mask, unsigned int *sse) {                  \
    int sum;                                                     \
    unsigned int mean_sq;                                        \
                                                                 \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    mean_sq = (unsigned int)(((int64_t)sum * sum) / (W * H));    \
    return *sse - mean_sq;                                       \
  }
1310 
// OBMC_SUBPIX_VAR(W, H) defines aom_obmc_sub_pixel_variance<W>x<H>_c().
// The prediction `pre` is run through the two bilinear filter passes (xoffset
// filter over H + 1 rows first, then the yoffset filter), and the resulting
// packed W x H block is handed to aom_obmc_variance<W>x<H>_c() with a stride
// of W.
#define OBMC_SUBPIX_VAR(W, H)                                                  \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                       \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    uint16_t first_pass[(H + 1) * W];                                          \
    uint8_t filtered[H * W];                                                   \
                                                                               \
    aom_var_filter_block2d_bil_first_pass_c(                                   \
        pre, first_pass, pre_stride, 1, H + 1, W,                              \
        bilinear_filters_2t[xoffset]);                                         \
    aom_var_filter_block2d_bil_second_pass_c(                                  \
        first_pass, filtered, W, W, H, W, bilinear_filters_2t[yoffset]);       \
                                                                               \
    return aom_obmc_variance##W##x##H##_c(filtered, W, wsrc, mask, sse);       \
  }
1325 
// Instantiate the OBMC variance functions for every supported block size.
// OBMC_VAR(W, H) defines aom_obmc_variance<W>x<H>_c and OBMC_SUBPIX_VAR(W, H)
// defines aom_obmc_sub_pixel_variance<W>x<H>_c, which calls the former; each
// pair is therefore emitted together, plain variance first.
OBMC_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 4)

OBMC_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 8)

OBMC_VAR(8, 4)
OBMC_SUBPIX_VAR(8, 4)

OBMC_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 8)

OBMC_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 16)

OBMC_VAR(16, 8)
OBMC_SUBPIX_VAR(16, 8)

OBMC_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 16)

OBMC_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 32)

OBMC_VAR(32, 16)
OBMC_SUBPIX_VAR(32, 16)

OBMC_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 32)

OBMC_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 64)

OBMC_VAR(64, 32)
OBMC_SUBPIX_VAR(64, 32)

OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)

OBMC_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 128)

OBMC_VAR(128, 64)
OBMC_SUBPIX_VAR(128, 64)

OBMC_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 128)

// Block sizes with a 1:4 or 4:1 width-to-height ratio.
OBMC_VAR(4, 16)
OBMC_SUBPIX_VAR(4, 16)
OBMC_VAR(16, 4)
OBMC_SUBPIX_VAR(16, 4)
OBMC_VAR(8, 32)
OBMC_SUBPIX_VAR(8, 32)
OBMC_VAR(32, 8)
OBMC_SUBPIX_VAR(32, 8)
OBMC_VAR(16, 64)
OBMC_SUBPIX_VAR(16, 64)
OBMC_VAR(64, 16)
OBMC_SUBPIX_VAR(64, 16)
1386 
highbd_obmc_variance64(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,uint64_t * sse,int64_t * sum)1387 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
1388                                           const int32_t *wsrc,
1389                                           const int32_t *mask, int w, int h,
1390                                           uint64_t *sse, int64_t *sum) {
1391   int i, j;
1392   uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
1393 
1394   *sse = 0;
1395   *sum = 0;
1396 
1397   for (i = 0; i < h; i++) {
1398     for (j = 0; j < w; j++) {
1399       int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1400       *sum += diff;
1401       *sse += diff * diff;
1402     }
1403 
1404     pre += pre_stride;
1405     wsrc += w;
1406     mask += w;
1407   }
1408 }
1409 
highbd_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1410 static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
1411                                         const int32_t *wsrc,
1412                                         const int32_t *mask, int w, int h,
1413                                         unsigned int *sse, int *sum) {
1414   int64_t sum64;
1415   uint64_t sse64;
1416   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1417   *sum = (int)sum64;
1418   *sse = (unsigned int)sse64;
1419 }
1420 
highbd_10_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1421 static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
1422                                            const int32_t *wsrc,
1423                                            const int32_t *mask, int w, int h,
1424                                            unsigned int *sse, int *sum) {
1425   int64_t sum64;
1426   uint64_t sse64;
1427   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1428   *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
1429   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
1430 }
1431 
highbd_12_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1432 static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
1433                                            const int32_t *wsrc,
1434                                            const int32_t *mask, int w, int h,
1435                                            unsigned int *sse, int *sum) {
1436   int64_t sum64;
1437   uint64_t sse64;
1438   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1439   *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
1440   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
1441 }
1442 
// HIGHBD_OBMC_VAR(W, H) defines the three high-bitdepth OBMC variance
// functions for one block size:
//   aom_highbd_obmc_variance<W>x<H>_c     - uses highbd_obmc_variance()
//   aom_highbd_10_obmc_variance<W>x<H>_c  - uses highbd_10_obmc_variance()
//   aom_highbd_12_obmc_variance<W>x<H>_c  - uses highbd_12_obmc_variance()
// Each stores the sum of squared errors in *sse and returns
// *sse - sum^2 / (W * H). The 10- and 12-bit variants do the subtraction in
// 64 bits and return 0 when the result is negative; the first variant
// subtracts directly in unsigned int.
#define HIGHBD_OBMC_VAR(W, H)                                              \
  unsigned int aom_highbd_obmc_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    unsigned int mean_sq;                                                  \
                                                                           \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    mean_sq = (unsigned int)(((int64_t)sum * sum) / (W * H));              \
    return *sse - mean_sq;                                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t mean_sq;                                                       \
    int64_t var;                                                           \
                                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    mean_sq = ((int64_t)sum * sum) / (W * H);                              \
    var = (int64_t)(*sse) - mean_sq;                                       \
    if (var < 0) return 0;                                                 \
    return (uint32_t)var;                                                  \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t mean_sq;                                                       \
    int64_t var;                                                           \
                                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    mean_sq = ((int64_t)sum * sum) / (W * H);                              \
    var = (int64_t)(*sse) - mean_sq;                                       \
    if (var < 0) return 0;                                                 \
    return (uint32_t)var;                                                  \
  }
1471 
// HIGHBD_OBMC_SUBPIX_VAR_IMPL(SUBPIX_FN, VAR_FN, W, H) defines
//   SUBPIX_FN<W>x<H>_c()
// which runs `pre` through the two high-bitdepth bilinear filter passes
// (xoffset filter over H + 1 rows first, then the yoffset filter) and returns
// VAR_FN<W>x<H>_c() of the resulting packed W x H block (stride W).
// SUBPIX_FN and VAR_FN are function-name stems; the block-size suffix and
// "_c" are pasted on here.
#define HIGHBD_OBMC_SUBPIX_VAR_IMPL(SUBPIX_FN, VAR_FN, W, H)                   \
  unsigned int SUBPIX_FN##W##x##H##_c(                                         \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint16_t temp2[H * W];                                                     \
                                                                               \
    aom_highbd_var_filter_block2d_bil_first_pass(                              \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    aom_highbd_var_filter_block2d_bil_second_pass(                             \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                               \
    return VAR_FN##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, wsrc, mask, sse); \
  }

// Emits the three high-bitdepth OBMC sub-pixel variance functions for one
// block size. Their bodies differ only in the names of the function being
// defined and of the variance function it calls, so they share the single
// definition above.
#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                 \
  HIGHBD_OBMC_SUBPIX_VAR_IMPL(aom_highbd_obmc_sub_pixel_variance,    \
                              aom_highbd_obmc_variance, W, H)        \
  HIGHBD_OBMC_SUBPIX_VAR_IMPL(aom_highbd_10_obmc_sub_pixel_variance, \
                              aom_highbd_10_obmc_variance, W, H)     \
  HIGHBD_OBMC_SUBPIX_VAR_IMPL(aom_highbd_12_obmc_sub_pixel_variance, \
                              aom_highbd_12_obmc_variance, W, H)
1517 
// Instantiate the high-bitdepth OBMC variance functions for every supported
// block size. HIGHBD_OBMC_VAR(W, H) defines the plain variance functions and
// HIGHBD_OBMC_SUBPIX_VAR(W, H) the sub-pixel ones that call them; each pair is
// therefore emitted together, plain variance first.
HIGHBD_OBMC_VAR(4, 4)
HIGHBD_OBMC_SUBPIX_VAR(4, 4)

HIGHBD_OBMC_VAR(4, 8)
HIGHBD_OBMC_SUBPIX_VAR(4, 8)

HIGHBD_OBMC_VAR(8, 4)
HIGHBD_OBMC_SUBPIX_VAR(8, 4)

HIGHBD_OBMC_VAR(8, 8)
HIGHBD_OBMC_SUBPIX_VAR(8, 8)

HIGHBD_OBMC_VAR(8, 16)
HIGHBD_OBMC_SUBPIX_VAR(8, 16)

HIGHBD_OBMC_VAR(16, 8)
HIGHBD_OBMC_SUBPIX_VAR(16, 8)

HIGHBD_OBMC_VAR(16, 16)
HIGHBD_OBMC_SUBPIX_VAR(16, 16)

HIGHBD_OBMC_VAR(16, 32)
HIGHBD_OBMC_SUBPIX_VAR(16, 32)

HIGHBD_OBMC_VAR(32, 16)
HIGHBD_OBMC_SUBPIX_VAR(32, 16)

HIGHBD_OBMC_VAR(32, 32)
HIGHBD_OBMC_SUBPIX_VAR(32, 32)

HIGHBD_OBMC_VAR(32, 64)
HIGHBD_OBMC_SUBPIX_VAR(32, 64)

HIGHBD_OBMC_VAR(64, 32)
HIGHBD_OBMC_SUBPIX_VAR(64, 32)

HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)

HIGHBD_OBMC_VAR(64, 128)
HIGHBD_OBMC_SUBPIX_VAR(64, 128)

HIGHBD_OBMC_VAR(128, 64)
HIGHBD_OBMC_SUBPIX_VAR(128, 64)

HIGHBD_OBMC_VAR(128, 128)
HIGHBD_OBMC_SUBPIX_VAR(128, 128)

// Block sizes with a 1:4 or 4:1 width-to-height ratio.
HIGHBD_OBMC_VAR(4, 16)
HIGHBD_OBMC_SUBPIX_VAR(4, 16)
HIGHBD_OBMC_VAR(16, 4)
HIGHBD_OBMC_SUBPIX_VAR(16, 4)
HIGHBD_OBMC_VAR(8, 32)
HIGHBD_OBMC_SUBPIX_VAR(8, 32)
HIGHBD_OBMC_VAR(32, 8)
HIGHBD_OBMC_SUBPIX_VAR(32, 8)
HIGHBD_OBMC_VAR(16, 64)
HIGHBD_OBMC_SUBPIX_VAR(16, 64)
HIGHBD_OBMC_VAR(64, 16)
HIGHBD_OBMC_SUBPIX_VAR(64, 16)
1578