/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/variance.h"

#include "av1/common/filter.h"
#include "av1/common/onyxc_int.h"
#include "av1/common/reconinter.h"

uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride) {
  int distortion = 0;
  int r, c;

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      int diff = a[c] - b[c];
      distortion += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }

  return distortion;
}

uint32_t aom_get_mb_ss_c(const int16_t *a) {
  unsigned int i, sum = 0;

  for (i = 0; i < 256; ++i) {
    sum += a[i] * a[i];
  }

  return sum;
}

static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int i, j;

  *sum = 0;
  *sse = 0;

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }
}

uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  uint32_t sse;
  int sum;
  variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
  return sse;
}

// Applies a 1-D 2-tap bilinear filter to the source block in either the
// horizontal or the vertical direction to produce the filtered output block.
// Used to implement the first pass of the 2-D separable filter.
//
// Produces uint16_t output to retain precision for the next pass. The two
// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step = 1) or vertically
// (pixel_step = stride). It defines the offset required to move from one
// input to the next.
void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
                                             unsigned int src_pixels_per_line,
                                             unsigned int pixel_step,
                                             unsigned int output_height,
                                             unsigned int output_width,
                                             const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      b[j] = ROUND_POWER_OF_TWO(
          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);

      ++a;
    }

    a += src_pixels_per_line - output_width;
    b += output_width;
  }
}

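// For illustration: with FILTER_BITS == 7 the two taps sum to
// FILTER_WEIGHT == 128, and the half-pel kernel bilinear_filters_2t[4] is
// { 64, 64 }, so the first-pass expression reduces to
//   b[j] = (64 * a[0] + 64 * a[pixel_step] + 64) >> 7
//        = (a[0] + a[pixel_step] + 1) >> 1,
// i.e. a rounded average of two neighboring inputs. The full-pel kernel
// { 128, 0 } passes the input through unchanged.
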
// Applies a 1-D 2-tap bilinear filter to the source block in either the
// horizontal or the vertical direction to produce the filtered output block.
// Used to implement the second pass of the 2-D separable filter.
//
// Requires 16-bit input as produced by filter_block2d_bil_first_pass. The two
// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step = 1) or vertically
// (pixel_step = stride). It defines the offset required to move from one
// input to the next. Output is 8-bit.
void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
                                              unsigned int src_pixels_per_line,
                                              unsigned int pixel_step,
                                              unsigned int output_height,
                                              unsigned int output_width,
                                              const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      b[j] = ROUND_POWER_OF_TWO(
          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
      ++a;
    }

    a += src_pixels_per_line - output_width;
    b += output_width;
  }
}

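// Usage sketch (illustrative only, kept out of the build): composing the two
// passes above for an 8x8 block at a half-pel offset in both directions,
// mirroring what SUBPIX_VAR(8, 8) below expands to. The function name and the
// dst buffer are hypothetical.
#if 0
static void example_bilinear_filter_8x8(const uint8_t *src, int src_stride,
                                        uint8_t *dst /* 8x8, stride 8 */) {
  // The first pass emits H + 1 = 9 rows so the vertical pass has the extra
  // row of context it needs.
  uint16_t fdata3[(8 + 1) * 8];
  // Horizontal pass: pixel_step = 1, xoffset = 4 (half pel).
  aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, 9, 8,
                                          bilinear_filters_2t[4]);
  // Vertical pass: pixel_step equals the intermediate stride (8),
  // yoffset = 4 (half pel).
  aom_var_filter_block2d_bil_second_pass_c(fdata3, dst, 8, 8, 8, 8,
                                           bilinear_filters_2t[4]);
}
#endif
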
#define VAR(W, H)                                                    \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) {                \
    int sum;                                                         \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
  }

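// The identity used by VAR() above: for the N = W * H pixel differences d_i,
//   *sse - sum * sum / N == sum(d_i^2) - (sum(d_i))^2 / N,
// which is N times the sample variance of the differences. For example, the
// four differences { 1, 2, 3, 2 } give sum = 8 and sse = 18, so a 2x2 block
// would return 18 - 64 / 4 = 2.
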
#define SUBPIX_VAR(W, H)                                                      \
  uint32_t aom_sub_pixel_variance##W##x##H##_c(                               \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse) {                        \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);             \
  }

#define SUBPIX_AVG_VAR(W, H)                                                  \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                           \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse,                          \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                    \
                                                                              \
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);             \
  }                                                                           \
  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(                  \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse,                          \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W,            \
                               jcp_param);                                    \
                                                                              \
    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);                 \
  }

/* Identical to the variance call except it takes an additional parameter,
 * sum, and returns that value using pass-by-reference instead of returning
 * sse - sum^2 / (w*h)
 */
#define GET_VAR(W, H)                                                         \
  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride,                \
                               const uint8_t *b, int b_stride, uint32_t *sse, \
                               int *sum) {                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, sum);                       \
  }

/* Identical to the variance call except it does not calculate the
 * sse - sum^2 / (w*h) term: it returns sse in addition to modifying the
 * passed-in variable.
 */
#define MSE(W, H)                                               \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                                const uint8_t *b, int b_stride, \
                                uint32_t *sse) {                \
    int sum;                                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);        \
    return *sse;                                                \
  }

/* All three forms of the variance are available in the same sizes. */
#define VARIANCES(W, H) \
  VAR(W, H)             \
  SUBPIX_VAR(W, H)      \
  SUBPIX_AVG_VAR(W, H)

VARIANCES(128, 128)
VARIANCES(128, 64)
VARIANCES(64, 128)
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)
VARIANCES(4, 2)
VARIANCES(2, 4)
VARIANCES(2, 2)
VARIANCES(4, 16)
VARIANCES(16, 4)
VARIANCES(8, 32)
VARIANCES(32, 8)
VARIANCES(16, 64)
VARIANCES(64, 16)

GET_VAR(16, 16)
GET_VAR(8, 8)

MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)

void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  int i, j;

  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}

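// In aom_comp_avg_pred_c() above, ROUND_POWER_OF_TWO(tmp, 1) is the rounded
// average (pred[j] + ref[j] + 1) >> 1; e.g. pred[j] = 5, ref[j] = 8 averages
// to 7 rather than truncating to 6.
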
// Get pred block from up-sampled reference.
void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                          int mi_row, int mi_col, const MV *const mv,
                          uint8_t *comp_pred, int width, int height,
                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
                          int ref_stride, int subpel_search) {
  // expect xd == NULL only in tests
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      // Note: This is mostly a copy from the >=8X8 case in
      // build_inter_predictors() function, with some small tweaks.

      // Some assumptions.
      const int plane = 0;

      // Get pre-requisites.
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const int ssx = pd->subsampling_x;
      const int ssy = pd->subsampling_y;
      assert(ssx == 0 && ssy == 0);
      const struct buf_2d *const dst_buf = &pd->dst;
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;

      // Calculate subpel_x/y and x/y_step.
      const int row_start = 0;  // Because ss_y is 0.
      const int col_start = 0;  // Because ss_x is 0.
      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
      int orig_pos_y = pre_y << SUBPEL_BITS;
      orig_pos_y += mv->row * (1 << (1 - ssy));
      int orig_pos_x = pre_x << SUBPEL_BITS;
      orig_pos_x += mv->col * (1 << (1 - ssx));
      int pos_y = sf->scale_value_y(orig_pos_y, sf);
      int pos_x = sf->scale_value_x(orig_pos_x, sf);
      pos_x += SCALE_EXTRA_OFF;
      pos_y += SCALE_EXTRA_OFF;

      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
                         << SCALE_SUBPEL_BITS;
      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
                        << SCALE_SUBPEL_BITS;
      pos_y = clamp(pos_y, top, bottom);
      pos_x = clamp(pos_x, left, right);

      const uint8_t *const pre =
          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
          (pos_x >> SCALE_SUBPEL_BITS);

      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
                                           pos_x & SCALE_SUBPEL_MASK,
                                           pos_y & SCALE_SUBPEL_MASK };

      // Get warp types.
      const WarpedMotionParams *const wm =
          &xd->global_motion[mi->ref_frame[ref_num]];
      const int is_global = is_global_mv_block(mi, wm->wmtype);
      WarpTypesAllowed warp_types;
      warp_types.global_warp_allowed = is_global;
      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;

      // Get convolve parameters.
      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
      const InterpFilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);

      // Get the inter predictor.
      const int build_for_obmc = 0;
      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
                               &subpel_params, sf, width, height, &conv_params,
                               filters, &warp_types, mi_x >> pd->subsampling_x,
                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
                               build_for_obmc, xd, cm->allow_warped_motion);

      return;
    }
  }

  const InterpFilterParams *filter = av1_get_filter(subpel_search);

  if (!subpel_x_q3 && !subpel_y_q3) {
    for (int i = 0; i < height; i++) {
      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
      comp_pred += width;
      ref += ref_stride;
    }
  } else if (!subpel_y_q3) {
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
                          -1, width, height);
  } else if (!subpel_x_q3) {
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
                         16, width, height);
  } else {
    DECLARE_ALIGNED(16, uint8_t,
                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    const int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
    aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL,
                          -1, width, intermediate_height);
    aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
                         width, height);
  }
}

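// Sizing note for the two-pass path in aom_upsampled_pred_c() above: the
// vertical pass needs filter->taps extra rows of horizontally filtered data,
// e.g. height = 32, subpel_y_q3 = 4 and an 8-tap filter give
// intermediate_height = ((31 * 8 + 4) >> 3) + 8 = 39 rows.
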
void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                   int mi_row, int mi_col, const MV *const mv,
                                   uint8_t *comp_pred, const uint8_t *pred,
                                   int width, int height, int subpel_x_q3,
                                   int subpel_y_q3, const uint8_t *ref,
                                   int ref_stride, int subpel_search) {
  int i, j;

  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
    }
    comp_pred += width;
    pred += width;
  }
}

void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
                                  int width, int height, const uint8_t *ref,
                                  int ref_stride,
                                  const DIST_WTD_COMP_PARAMS *jcp_param) {
  int i, j;
  const int fwd_offset = jcp_param->fwd_offset;
  const int bck_offset = jcp_param->bck_offset;

  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
      comp_pred[j] = (uint8_t)tmp;
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}

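// A note on the dist_wtd weighting (here and in the upsampled variant below):
// fwd_offset and bck_offset are expected to sum to 1 << DIST_PRECISION_BITS
// (e.g. fwd_offset = 9, bck_offset = 7 with DIST_PRECISION_BITS == 4), so the
// shift renormalizes the weighted sum back to pixel range: pred[j] = 5,
// ref[j] = 8 gives (5 * 7 + 8 * 9 + 8) >> 4 == 7.
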
void aom_dist_wtd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
  int i, j;
  const int fwd_offset = jcp_param->fwd_offset;
  const int bck_offset = jcp_param->bck_offset;

  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
                       subpel_search);

  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
      comp_pred[j] = (uint8_t)tmp;
    }
    comp_pred += width;
    pred += width;
  }
}

static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  int64_t tsum = 0;
  uint64_t tsse = 0;
  for (int i = 0; i < h; ++i) {
    int32_t lsum = 0;
    for (int j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      lsum += diff;
      tsse += (uint32_t)(diff * diff);
    }
    tsum += lsum;
    a += a_stride;
    b += b_stride;
  }
  *sum = tsum;
  *sse = tsse;
}

uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  uint64_t sse;
  int64_t sum;
  highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum);
  return sse;
}

static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)sse_long;
  *sum = (int)sum_long;
}

static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}

static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}

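// The ROUND_POWER_OF_TWO() shifts in the 10- and 12-bit helpers above bring
// the statistics back to the 8-bit scale: a 10-bit difference is 4x its 8-bit
// equivalent, so the sum drops 2 bits and the squared term drops 4, while
// 12-bit input drops 4 and 8 bits, keeping the 32-bit outputs in the same
// range as the 8-bit path.
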
#define HIGHBD_VAR(W, H)                                                \
  uint32_t aom_highbd_8_variance##W##x##H##_c(                          \
      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,   \
      uint32_t *sse) {                                                  \
    int sum;                                                            \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);       \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));           \
  }                                                                     \
                                                                        \
  uint32_t aom_highbd_10_variance##W##x##H##_c(                         \
      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,   \
      uint32_t *sse) {                                                  \
    int sum;                                                            \
    int64_t var;                                                        \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);      \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));           \
    return (var >= 0) ? (uint32_t)var : 0;                              \
  }                                                                     \
                                                                        \
  uint32_t aom_highbd_12_variance##W##x##H##_c(                         \
      const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,   \
      uint32_t *sse) {                                                  \
    int sum;                                                            \
    int64_t var;                                                        \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);      \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));           \
    return (var >= 0) ? (uint32_t)var : 0;                              \
  }

#define HIGHBD_GET_VAR(S)                                                    \
  void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride,  \
                                        const uint8_t *ref, int ref_stride,  \
                                        uint32_t *sse, int *sum) {           \
    highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);     \
  }                                                                          \
                                                                             \
  void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) {          \
    highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }                                                                          \
                                                                             \
  void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) {          \
    highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }

#define HIGHBD_MSE(W, H)                                                      \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
                                         const uint8_t *ref, int ref_stride,  \
                                         uint32_t *sse) {                     \
    int sum;                                                                  \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);     \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }

void aom_highbd_var_filter_block2d_bil_first_pass(
    const uint8_t *src_ptr8, uint16_t *output_ptr,
    unsigned int src_pixels_per_line, int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const uint8_t *filter) {
  unsigned int i, j;
  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      output_ptr[j] = ROUND_POWER_OF_TWO(
          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
          FILTER_BITS);

      ++src_ptr;
    }

    // Next row...
    src_ptr += src_pixels_per_line - output_width;
    output_ptr += output_width;
  }
}

void aom_highbd_var_filter_block2d_bil_second_pass(
    const uint16_t *src_ptr, uint16_t *output_ptr,
    unsigned int src_pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      output_ptr[j] = ROUND_POWER_OF_TWO(
          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
          FILTER_BITS);
      ++src_ptr;
    }

    src_ptr += src_pixels_per_line - output_width;
    output_ptr += output_width;
  }
}

#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }

#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                           \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                  \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                              dst, dst_stride, sse);          \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               dst, dst_stride, sse);         \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               dst, dst_stride, sse);         \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(         \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,  \
                                          dst_stride, sse);                   \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse);                  \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse);                  \
  }

/* All three forms of the variance are available in the same sizes. */
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H)             \
  HIGHBD_SUBPIX_VAR(W, H)      \
  HIGHBD_SUBPIX_AVG_VAR(W, H)

HIGHBD_VARIANCES(128, 128)
HIGHBD_VARIANCES(128, 64)
HIGHBD_VARIANCES(64, 128)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)
HIGHBD_VARIANCES(4, 2)
HIGHBD_VARIANCES(2, 4)
HIGHBD_VARIANCES(2, 2)
HIGHBD_VARIANCES(4, 16)
HIGHBD_VARIANCES(16, 4)
HIGHBD_VARIANCES(8, 32)
HIGHBD_VARIANCES(32, 8)
HIGHBD_VARIANCES(16, 64)
HIGHBD_VARIANCES(64, 16)

HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)

HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)

void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}

void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
                                 const struct AV1Common *const cm, int mi_row,
                                 int mi_col, const MV *const mv,
                                 uint8_t *comp_pred8, int width, int height,
                                 int subpel_x_q3, int subpel_y_q3,
                                 const uint8_t *ref8, int ref_stride, int bd,
                                 int subpel_search) {
  // expect xd == NULL only in tests
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      // Note: This is mostly a copy from the >=8X8 case in
      // build_inter_predictors() function, with some small tweaks.
      // Some assumptions.
      const int plane = 0;

      // Get pre-requisites.
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const int ssx = pd->subsampling_x;
      const int ssy = pd->subsampling_y;
      assert(ssx == 0 && ssy == 0);
      const struct buf_2d *const dst_buf = &pd->dst;
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;

      // Calculate subpel_x/y and x/y_step.
      const int row_start = 0;  // Because ss_y is 0.
      const int col_start = 0;  // Because ss_x is 0.
      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
      int orig_pos_y = pre_y << SUBPEL_BITS;
      orig_pos_y += mv->row * (1 << (1 - ssy));
      int orig_pos_x = pre_x << SUBPEL_BITS;
      orig_pos_x += mv->col * (1 << (1 - ssx));
      int pos_y = sf->scale_value_y(orig_pos_y, sf);
      int pos_x = sf->scale_value_x(orig_pos_x, sf);
      pos_x += SCALE_EXTRA_OFF;
      pos_y += SCALE_EXTRA_OFF;

      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
                         << SCALE_SUBPEL_BITS;
      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
                        << SCALE_SUBPEL_BITS;
      pos_y = clamp(pos_y, top, bottom);
      pos_x = clamp(pos_x, left, right);

      const uint8_t *const pre =
          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
          (pos_x >> SCALE_SUBPEL_BITS);

      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
                                           pos_x & SCALE_SUBPEL_MASK,
                                           pos_y & SCALE_SUBPEL_MASK };

      // Get warp types.
      const WarpedMotionParams *const wm =
          &xd->global_motion[mi->ref_frame[ref_num]];
      const int is_global = is_global_mv_block(mi, wm->wmtype);
      WarpTypesAllowed warp_types;
      warp_types.global_warp_allowed = is_global;
      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;

      // Get convolve parameters.
      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
      const InterpFilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);

      // Get the inter predictor.
      const int build_for_obmc = 0;
      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
                               &subpel_params, sf, width, height, &conv_params,
                               filters, &warp_types, mi_x >> pd->subsampling_x,
                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
                               build_for_obmc, xd, cm->allow_warped_motion);

      return;
    }
  }

  const InterpFilterParams *filter = av1_get_filter(subpel_search);

  if (!subpel_x_q3 && !subpel_y_q3) {
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
    for (int i = 0; i < height; i++) {
      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
      comp_pred += width;
      ref += ref_stride;
    }
  } else if (!subpel_y_q3) {
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel,
                                 16, NULL, -1, width, height, bd);
  } else if (!subpel_x_q3) {
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1,
                                kernel, 16, width, height, bd);
  } else {
    DECLARE_ALIGNED(16, uint16_t,
                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    const int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
    aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1),
                                 ref_stride, CONVERT_TO_BYTEPTR(temp),
                                 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
                                 intermediate_height, bd);
    aom_highbd_convolve8_vert_c(
        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
        bd);
  }
}

void aom_highbd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, int bd, int subpel_search) {
  int i, j;

  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                            bd, subpel_search);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
    }
    comp_pred += width;
    pred += width;
  }
}

void aom_highbd_dist_wtd_comp_avg_pred_c(
    uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
    const uint8_t *ref8, int ref_stride,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  int i, j;
  const int fwd_offset = jcp_param->fwd_offset;
  const int bck_offset = jcp_param->bck_offset;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);

  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
      comp_pred[j] = (uint16_t)tmp;
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}

void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
    int subpel_search) {
  int i, j;
  const int fwd_offset = jcp_param->fwd_offset;
  const int bck_offset = jcp_param->bck_offset;
  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                              height, subpel_x_q3, subpel_y_q3, ref8,
                              ref_stride, bd, subpel_search);

  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
      comp_pred[j] = (uint16_t)tmp;
    }
    comp_pred += width;
    pred += width;
  }
}

void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  int i, j;
  const uint8_t *src0 = invert_mask ? pred : ref;
  const uint8_t *src1 = invert_mask ? ref : pred;
  const int stride0 = invert_mask ? width : ref_stride;
  const int stride1 = invert_mask ? ref_stride : width;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
    }
    comp_pred += width;
    src0 += stride0;
    src1 += stride1;
    mask += mask_stride;
  }
}

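// AOM_BLEND_A64(a, v0, v1) used above computes
// (a * v0 + (64 - a) * v1 + 32) >> 6, a 6-bit alpha blend; e.g. mask[j] = 48,
// src0[j] = 100, src1[j] = 20 blends to (48 * 100 + 16 * 20 + 32) >> 6 = 80.
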
void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd,
                                    const AV1_COMMON *const cm, int mi_row,
                                    int mi_col, const MV *const mv,
                                    uint8_t *comp_pred, const uint8_t *pred,
                                    int width, int height, int subpel_x_q3,
                                    int subpel_y_q3, const uint8_t *ref,
                                    int ref_stride, const uint8_t *mask,
                                    int mask_stride, int invert_mask,
                                    int subpel_search) {
  if (subpel_x_q3 | subpel_y_q3) {
    aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
                         subpel_search);
    ref = comp_pred;
    ref_stride = width;
  }
  aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
                       mask_stride, invert_mask);
}

#define MASK_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1,       \
                                            H + 1, W,                         \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
                         invert_mask);                                        \
    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
  }

MASK_SUBPIX_VAR(4, 4)
MASK_SUBPIX_VAR(4, 8)
MASK_SUBPIX_VAR(8, 4)
MASK_SUBPIX_VAR(8, 8)
MASK_SUBPIX_VAR(8, 16)
MASK_SUBPIX_VAR(16, 8)
MASK_SUBPIX_VAR(16, 16)
MASK_SUBPIX_VAR(16, 32)
MASK_SUBPIX_VAR(32, 16)
MASK_SUBPIX_VAR(32, 32)
MASK_SUBPIX_VAR(32, 64)
MASK_SUBPIX_VAR(64, 32)
MASK_SUBPIX_VAR(64, 64)
MASK_SUBPIX_VAR(64, 128)
MASK_SUBPIX_VAR(128, 64)
MASK_SUBPIX_VAR(128, 128)
MASK_SUBPIX_VAR(4, 16)
MASK_SUBPIX_VAR(16, 4)
MASK_SUBPIX_VAR(8, 32)
MASK_SUBPIX_VAR(32, 8)
MASK_SUBPIX_VAR(16, 64)
MASK_SUBPIX_VAR(64, 16)

void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      if (!invert_mask)
        comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]);
      else
        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
    mask += mask_stride;
  }
}

void aom_highbd_comp_mask_upsampled_pred(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
    int bd, int subpel_search) {
  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                            bd, subpel_search);
  aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8,
                            width, mask, mask_stride, invert_mask);
}

#define HIGHBD_MASK_SUBPIX_VAR(W, H)                                         \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,        \
      const uint8_t *msk, int msk_stride, int invert_mask,                   \
      unsigned int *sse) {                                                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W,   \
                                H, CONVERT_TO_BYTEPTR(temp2), W, msk,        \
                                msk_stride, invert_mask);                    \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                              ref, ref_stride, sse);         \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(         \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,        \
      const uint8_t *msk, int msk_stride, int invert_mask,                   \
      unsigned int *sse) {                                                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W,   \
                                H, CONVERT_TO_BYTEPTR(temp2), W, msk,        \
                                msk_stride, invert_mask);                    \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               ref, ref_stride, sse);        \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(         \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,        \
      const uint8_t *msk, int msk_stride, int invert_mask,                   \
      unsigned int *sse) {                                                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W,   \
                                H, CONVERT_TO_BYTEPTR(temp2), W, msk,        \
                                msk_stride, invert_mask);                    \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               ref, ref_stride, sse);        \
  }

HIGHBD_MASK_SUBPIX_VAR(4, 4)
HIGHBD_MASK_SUBPIX_VAR(4, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 16)
HIGHBD_MASK_SUBPIX_VAR(32, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 32)
HIGHBD_MASK_SUBPIX_VAR(64, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 64)
HIGHBD_MASK_SUBPIX_VAR(128, 128)
HIGHBD_MASK_SUBPIX_VAR(4, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 16)

static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
                                 const int32_t *wsrc, const int32_t *mask,
                                 int w, int h, unsigned int *sse, int *sum) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i++) {
    for (j = 0; j < w; j++) {
      int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
      *sum += diff;
      *sse += diff * diff;
    }

    pre += pre_stride;
    wsrc += w;
    mask += w;
  }
}

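// A note on the shift of 12 in obmc_variance() above (this reading of the
// buffers is inferred, not stated in this file): wsrc holds the source
// pre-scaled by OBMC blending weights carrying
// 2 * AOM_BLEND_A64_ROUND_BITS == 12 fractional bits, and mask holds the
// matching weight for the predictor, so
// ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12) yields a
// pixel-scale difference.
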
#define OBMC_VAR(W, H)                                            \
  unsigned int aom_obmc_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *mask, unsigned int *sse) {                   \
    int sum;                                                      \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
  }

#define OBMC_SUBPIX_VAR(W, H)                                               \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,         \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {        \
    uint16_t fdata3[(H + 1) * W];                                           \
    uint8_t temp2[H * W];                                                   \
                                                                            \
    aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1,     \
                                            H + 1, W,                       \
                                            bilinear_filters_2t[xoffset]);  \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,     \
                                             bilinear_filters_2t[yoffset]); \
                                                                            \
    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);       \
  }

OBMC_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 4)

OBMC_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 8)

OBMC_VAR(8, 4)
OBMC_SUBPIX_VAR(8, 4)

OBMC_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 8)

OBMC_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 16)

OBMC_VAR(16, 8)
OBMC_SUBPIX_VAR(16, 8)

OBMC_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 16)

OBMC_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 32)

OBMC_VAR(32, 16)
OBMC_SUBPIX_VAR(32, 16)

OBMC_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 32)

OBMC_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 64)

OBMC_VAR(64, 32)
OBMC_SUBPIX_VAR(64, 32)

OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)

OBMC_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 128)

OBMC_VAR(128, 64)
OBMC_SUBPIX_VAR(128, 64)

OBMC_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 128)

OBMC_VAR(4, 16)
OBMC_SUBPIX_VAR(4, 16)
OBMC_VAR(16, 4)
OBMC_SUBPIX_VAR(16, 4)
OBMC_VAR(8, 32)
OBMC_SUBPIX_VAR(8, 32)
OBMC_VAR(32, 8)
OBMC_SUBPIX_VAR(32, 8)
OBMC_VAR(16, 64)
OBMC_SUBPIX_VAR(16, 64)
OBMC_VAR(64, 16)
OBMC_SUBPIX_VAR(64, 16)

static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
                                          const int32_t *wsrc,
                                          const int32_t *mask, int w, int h,
                                          uint64_t *sse, int64_t *sum) {
  int i, j;
  uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i++) {
    for (j = 0; j < w; j++) {
      int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
      *sum += diff;
      *sse += diff * diff;
    }

    pre += pre_stride;
    wsrc += w;
    mask += w;
  }
}

static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
                                        const int32_t *wsrc,
                                        const int32_t *mask, int w, int h,
                                        unsigned int *sse, int *sum) {
  int64_t sum64;
  uint64_t sse64;
  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
  *sum = (int)sum64;
  *sse = (unsigned int)sse64;
}

static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64;
  uint64_t sse64;
  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
}

static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64;
  uint64_t sse64;
  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
}

#define HIGHBD_OBMC_VAR(W, H)                                              \
  unsigned int aom_highbd_obmc_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }

#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                         \
  unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c(              \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),  \
                                                 W, wsrc, mask, sse);        \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_obmc_variance##W##x##H##_c(                         \
        CONVERT_TO_BYTEPTR(temp2), W, wsrc, mask, sse);                      \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_obmc_variance##W##x##H##_c(                         \
        CONVERT_TO_BYTEPTR(temp2), W, wsrc, mask, sse);                      \
  }

HIGHBD_OBMC_VAR(4, 4)
HIGHBD_OBMC_SUBPIX_VAR(4, 4)

HIGHBD_OBMC_VAR(4, 8)
HIGHBD_OBMC_SUBPIX_VAR(4, 8)

HIGHBD_OBMC_VAR(8, 4)
HIGHBD_OBMC_SUBPIX_VAR(8, 4)

HIGHBD_OBMC_VAR(8, 8)
HIGHBD_OBMC_SUBPIX_VAR(8, 8)

HIGHBD_OBMC_VAR(8, 16)
HIGHBD_OBMC_SUBPIX_VAR(8, 16)

HIGHBD_OBMC_VAR(16, 8)
HIGHBD_OBMC_SUBPIX_VAR(16, 8)

HIGHBD_OBMC_VAR(16, 16)
HIGHBD_OBMC_SUBPIX_VAR(16, 16)

HIGHBD_OBMC_VAR(16, 32)
HIGHBD_OBMC_SUBPIX_VAR(16, 32)

HIGHBD_OBMC_VAR(32, 16)
HIGHBD_OBMC_SUBPIX_VAR(32, 16)

HIGHBD_OBMC_VAR(32, 32)
HIGHBD_OBMC_SUBPIX_VAR(32, 32)

HIGHBD_OBMC_VAR(32, 64)
HIGHBD_OBMC_SUBPIX_VAR(32, 64)

HIGHBD_OBMC_VAR(64, 32)
HIGHBD_OBMC_SUBPIX_VAR(64, 32)

HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)

HIGHBD_OBMC_VAR(64, 128)
HIGHBD_OBMC_SUBPIX_VAR(64, 128)

HIGHBD_OBMC_VAR(128, 64)
HIGHBD_OBMC_SUBPIX_VAR(128, 64)

HIGHBD_OBMC_VAR(128, 128)
HIGHBD_OBMC_SUBPIX_VAR(128, 128)

HIGHBD_OBMC_VAR(4, 16)
HIGHBD_OBMC_SUBPIX_VAR(4, 16)
HIGHBD_OBMC_VAR(16, 4)
HIGHBD_OBMC_SUBPIX_VAR(16, 4)
HIGHBD_OBMC_VAR(8, 32)
HIGHBD_OBMC_SUBPIX_VAR(8, 32)
HIGHBD_OBMC_VAR(32, 8)
HIGHBD_OBMC_SUBPIX_VAR(32, 8)
HIGHBD_OBMC_VAR(16, 64)
HIGHBD_OBMC_SUBPIX_VAR(16, 64)
HIGHBD_OBMC_VAR(64, 16)
HIGHBD_OBMC_SUBPIX_VAR(64, 16)