1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <string.h>
14 
15 #include "config/aom_dsp_rtcd.h"
16 #include "config/av1_rtcd.h"
17 
18 #include "av1/common/av1_common_int.h"
19 #include "av1/common/blockd.h"
20 #include "av1/common/convolve.h"
21 #include "av1/common/filter.h"
22 #include "av1/common/resize.h"
23 #include "aom_dsp/aom_dsp_common.h"
24 #include "aom_ports/mem.h"
25 
av1_convolve_horiz_rs_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn)26 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
27                              int dst_stride, int w, int h,
28                              const int16_t *x_filters, int x0_qn,
29                              int x_step_qn) {
30   src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
31   for (int y = 0; y < h; ++y) {
32     int x_qn = x0_qn;
33     for (int x = 0; x < w; ++x) {
34       const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
35       const int x_filter_idx =
36           (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
37       assert(x_filter_idx <= RS_SUBPEL_MASK);
38       const int16_t *const x_filter =
39           &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
40       int sum = 0;
41       for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
42         sum += src_x[k] * x_filter[k];
43       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
44       x_qn += x_step_qn;
45     }
46     src += src_stride;
47     dst += dst_stride;
48   }
49 }
50 
av1_highbd_convolve_horiz_rs_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn,int bd)51 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
52                                     uint16_t *dst, int dst_stride, int w, int h,
53                                     const int16_t *x_filters, int x0_qn,
54                                     int x_step_qn, int bd) {
55   src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
56   for (int y = 0; y < h; ++y) {
57     int x_qn = x0_qn;
58     for (int x = 0; x < w; ++x) {
59       const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
60       const int x_filter_idx =
61           (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
62       assert(x_filter_idx <= RS_SUBPEL_MASK);
63       const int16_t *const x_filter =
64           &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
65       int sum = 0;
66       for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
67         sum += src_x[k] * x_filter[k];
68       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
69       x_qn += x_step_qn;
70     }
71     src += src_stride;
72     dst += dst_stride;
73   }
74 }
75 
av1_convolve_2d_sobel_y_c(const uint8_t * src,int src_stride,double * dst,int dst_stride,int w,int h,int dir,double norm)76 void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
77                                int dst_stride, int w, int h, int dir,
78                                double norm) {
79   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
80   DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 };
81   DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 };
82   const int taps = 3;
83   int im_h = h + taps - 1;
84   int im_stride = w;
85   const int fo_vert = 1;
86   const int fo_horiz = 1;
87 
88   // horizontal filter
89   const uint8_t *src_horiz = src - fo_vert * src_stride;
90   const int16_t *x_filter = dir ? sobel_a : sobel_b;
91   for (int y = 0; y < im_h; ++y) {
92     for (int x = 0; x < w; ++x) {
93       int16_t sum = 0;
94       for (int k = 0; k < taps; ++k) {
95         sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
96       }
97       im_block[y * im_stride + x] = sum;
98     }
99   }
100 
101   // vertical filter
102   int16_t *src_vert = im_block + fo_vert * im_stride;
103   const int16_t *y_filter = dir ? sobel_b : sobel_a;
104   for (int y = 0; y < h; ++y) {
105     for (int x = 0; x < w; ++x) {
106       int16_t sum = 0;
107       for (int k = 0; k < taps; ++k) {
108         sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
109       }
110       dst[y * dst_stride + x] = sum * norm;
111     }
112   }
113 }
114 
// Two-pass (horizontal then vertical) sub-pixel interpolation for 8-bit
// pixels, writing clipped pixels to dst.
//
// The horizontal pass filters h + taps_y - 1 rows into a 16-bit intermediate
// block, rounding by conv_params->round_0. The vertical pass filters that
// block, rounds by conv_params->round_1, removes the offsets that were added
// to keep the intermediate sums non-negative, applies the remaining
// 2 * FILTER_BITS - round_0 - round_1 bits of rounding and clips.
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          const InterpFilterParams *filter_params_x,
                          const InterpFilterParams *filter_params_y,
                          const int subpel_x_qn, const int subpel_y_qn,
                          ConvolveParams *conv_params) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int bd = 8;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  // Number of samples the kernels reach to the left of / above the output.
  const int lead_cols = taps_x / 2 - 1;
  const int lead_rows = taps_y / 2 - 1;
  const int mid_rows = h + taps_y - 1;
  const int final_shift = FILTER_BITS * 2 - round_0 - round_1;
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

  // Horizontal pass into im_block (row stride w).
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int r = 0; r < mid_rows; ++r) {
    const uint8_t *const in = src + (r - lead_rows) * src_stride - lead_cols;
    int16_t *const mid = im_block + r * w;
    for (int c = 0; c < w; ++c) {
      // Bias so the sum stays non-negative before rounding.
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int t = 0; t < taps_x; ++t) {
        sum += kernel_x[t] * in[c + t];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      mid[c] = (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
    }
  }

  // Vertical pass from im_block into dst.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  // Combined contribution of both biases after rounding by round_1.
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int r = 0; r < h; ++r) {
    const int16_t *const top = im_block + r * w;
    uint8_t *const out = dst + r * dst_stride;
    for (int c = 0; c < w; ++c) {
      int32_t sum = 1 << offset_bits;
      for (int t = 0; t < taps_y; ++t) {
        sum += kernel_y[t] * top[t * w + c];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      const int16_t res = ROUND_POWER_OF_TWO(sum, round_1) - round_offset;
      out[c] = clip_pixel(ROUND_POWER_OF_TWO(res, final_shift));
    }
  }
}
166 
// Vertical-only sub-pixel interpolation for 8-bit pixels.
//
// Applies the kernel selected by subpel_y_qn down each column, rounds by
// FILTER_BITS and clips into dst. filter_params_x and subpel_x_qn are
// ignored; conv_params is only consulted by the asserts.
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams *filter_params_x,
                         const InterpFilterParams *filter_params_y,
                         const int subpel_x_qn, const int subpel_y_qn,
                         ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)conv_params;  // Only read inside assert().

  assert(conv_params->round_0 <= FILTER_BITS);
  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

  const int taps = filter_params_y->taps;
  // Rows the kernel reaches above the output row.
  const int rows_above = taps / 2 - 1;
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    const uint8_t *const top = src + (r - rows_above) * src_stride;
    uint8_t *const out = dst + r * dst_stride;
    for (int c = 0; c < w; ++c) {
      int32_t acc = 0;
      for (int t = 0; t < taps; ++t) {
        acc += kernel[t] * top[t * src_stride + c];
      }
      out[c] = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
    }
  }
}
196 
// Horizontal-only sub-pixel interpolation for 8-bit pixels.
//
// Applies the kernel selected by subpel_x_qn along each row. The sum is
// rounded in two steps -- first by conv_params->round_0, then by the
// remaining FILTER_BITS - round_0 bits -- and clipped into dst.
// filter_params_y and subpel_y_qn are ignored.
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams *filter_params_x,
                         const InterpFilterParams *filter_params_y,
                         const int subpel_x_qn, const int subpel_y_qn,
                         ConvolveParams *conv_params) {
  (void)filter_params_y;
  (void)subpel_y_qn;
  (void)conv_params;

  const int taps = filter_params_x->taps;
  // Columns the kernel reaches to the left of the output column.
  const int cols_left = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int bits = FILTER_BITS - round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    const uint8_t *const in = src + r * src_stride - cols_left;
    uint8_t *const out = dst + r * dst_stride;
    for (int c = 0; c < w; ++c) {
      int32_t acc = 0;
      for (int t = 0; t < taps; ++t) {
        acc += kernel[t] * in[c + t];
      }
      acc = ROUND_POWER_OF_TWO(acc, round_0);
      out[c] = clip_pixel(ROUND_POWER_OF_TWO(acc, bits));
    }
  }
}
228 
// Copies a w x h block of 8-bit pixels from src to dst, one row at a time.
//
// No filtering is performed; the filter, sub-pixel and conv_params arguments
// are ignored. memmove() is used, so a source and destination row may
// overlap.
void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_x_qn, const int subpel_y_qn,
                               ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_qn;
  (void)subpel_y_qn;
  (void)conv_params;

  const uint8_t *from = src;
  uint8_t *to = dst;
  for (int rows_left = h; rows_left > 0; --rows_left) {
    memmove(to, from, w * sizeof(src[0]));
    from += src_stride;
    to += dst_stride;
  }
}
245 
// Two-pass (horizontal then vertical) sub-pixel interpolation for compound
// prediction on 8-bit pixels.
//
// The horizontal pass filters h + taps_y - 1 rows into a 16-bit intermediate
// block, rounding by conv_params->round_0; the vertical pass filters that
// block and rounds by conv_params->round_1, leaving an offset-biased value.
//
// What happens to that value depends on conv_params:
//  - do_average == 0: it is stored in conv_params->dst (stride
//    conv_params->dst_stride) and dst is left untouched.
//  - do_average != 0: it is combined with the value already in
//    conv_params->dst -- weighted by fwd_offset / bck_offset and shifted by
//    DIST_PRECISION_BITS when use_dist_wtd_comp_avg is set, a plain average
//    otherwise -- then the bias is removed and the rounded, clipped pixel is
//    written to dst.
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params) {
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  int dst16_stride = conv_params->dst_stride;
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = w;
  // im_block is a fixed-size stack buffer that receives im_h * w entries;
  // guard the block size the same way av1_convolve_2d_sr_c does.
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int bd = 8;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

  // horizontal filter
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      // Bias so the sum stays non-negative before rounding.
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
  }

  // vertical filter
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // Combined contribution of both biases after rounding by round_1.
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->do_average) {
        int32_t tmp = dst16[y * dst16_stride + x];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
          tmp = tmp >> DIST_PRECISION_BITS;
        } else {
          tmp += res;
          tmp = tmp >> 1;
        }
        tmp -= round_offset;
        dst[y * dst_stride + x] =
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
      } else {
        dst16[y * dst16_stride + x] = res;
      }
    }
  }
}
311 
// Vertical-only sub-pixel interpolation for compound prediction on 8-bit
// pixels.
//
// The filtered column sum is scaled up by FILTER_BITS - round_0 bits,
// rounded by conv_params->round_1 and biased by round_offset. With
// do_average == 0 that value is stored in conv_params->dst. Otherwise it is
// combined with the value already there (weighted by fwd_offset / bck_offset
// when use_dist_wtd_comp_avg is set, plain average otherwise), the bias is
// removed, and the rounded, clipped pixel is written to dst.
// filter_params_x and subpel_x_qn are ignored.
void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_x_qn, const int subpel_y_qn,
                               ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)subpel_x_qn;

  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bd = 8;
  const int taps = filter_params_y->taps;
  // Rows the kernel reaches above the output row.
  const int rows_above = taps / 2 - 1;
  const int upshift = FILTER_BITS - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int out_shift = 2 * FILTER_BITS - round_0 - round_1;

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    const uint8_t *const top = src + (r - rows_above) * src_stride;
    for (int c = 0; c < w; ++c) {
      int32_t res = 0;
      for (int t = 0; t < taps; ++t) {
        res += kernel[t] * top[t * src_stride + c];
      }
      res *= (1 << upshift);
      res = ROUND_POWER_OF_TWO(res, round_1) + round_offset;

      const int comp_idx = r * comp_stride + c;
      if (!conv_params->do_average) {
        // First prediction: keep the biased intermediate for later.
        comp_buf[comp_idx] = res;
        continue;
      }

      int32_t blended = comp_buf[comp_idx];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended =
            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst[r * dst_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, out_shift));
    }
  }
}
361 
// Horizontal-only sub-pixel interpolation for compound prediction on 8-bit
// pixels.
//
// The filtered row sum is rounded by conv_params->round_0, scaled up by
// FILTER_BITS - round_1 bits and biased by round_offset. With
// do_average == 0 that value is stored in conv_params->dst. Otherwise it is
// combined with the value already there (weighted by fwd_offset / bck_offset
// when use_dist_wtd_comp_avg is set, plain average otherwise), the bias is
// removed, and the rounded, clipped pixel is written to dst.
// filter_params_y and subpel_y_qn are ignored.
void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_x_qn, const int subpel_y_qn,
                               ConvolveParams *conv_params) {
  (void)filter_params_y;
  (void)subpel_y_qn;

  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bd = 8;
  const int taps = filter_params_x->taps;
  // Columns the kernel reaches to the left of the output column.
  const int cols_left = taps / 2 - 1;
  const int upshift = FILTER_BITS - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int out_shift = 2 * FILTER_BITS - round_0 - round_1;

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    const uint8_t *const in = src + r * src_stride - cols_left;
    for (int c = 0; c < w; ++c) {
      int32_t res = 0;
      for (int t = 0; t < taps; ++t) {
        res += kernel[t] * in[c + t];
      }
      res = (1 << upshift) * ROUND_POWER_OF_TWO(res, round_0);
      res += round_offset;

      const int comp_idx = r * comp_stride + c;
      if (!conv_params->do_average) {
        // First prediction: keep the biased intermediate for later.
        comp_buf[comp_idx] = res;
        continue;
      }

      int32_t blended = comp_buf[comp_idx];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended =
            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst[r * dst_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, out_shift));
    }
  }
}
411 
// Unfiltered (integer-position) path for compound prediction on 8-bit pixels.
//
// Each source pixel is shifted left by 2 * FILTER_BITS - round_1 - round_0
// bits and biased by round_offset. With do_average == 0 that value is stored
// in conv_params->dst. Otherwise it is combined with the value already there
// (weighted by fwd_offset / bck_offset when use_dist_wtd_comp_avg is set,
// plain average otherwise), the bias is removed, and the rounded, clipped
// pixel is written to dst. The filter and sub-pixel arguments are ignored.
void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
                                     const int subpel_x_qn,
                                     const int subpel_y_qn,
                                     ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_qn;
  (void)subpel_y_qn;

  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bd = 8;
  const int shift = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));

  for (int r = 0; r < h; ++r) {
    const uint8_t *const in = src + r * src_stride;
    for (int c = 0; c < w; ++c) {
      CONV_BUF_TYPE res = in[c] << shift;
      res += round_offset;

      const int comp_idx = r * comp_stride + c;
      if (!conv_params->do_average) {
        // First prediction: keep the biased intermediate for later.
        comp_buf[comp_idx] = res;
        continue;
      }

      int32_t blended = comp_buf[comp_idx];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended =
            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst[r * dst_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(blended, shift));
    }
  }
}
454 
// Two-pass interpolation with per-pixel stepping (scaled reference) for 8-bit
// pixels.
//
// The source position advances by x_step_qn per output column and y_step_qn
// per output row, starting from subpel_x_qn / subpel_y_qn. At each position
// the bits above SCALE_SUBPEL_BITS give the integer sample offset and the
// fractional bits (after dropping SCALE_EXTRA_BITS) choose the kernel.
//
// The horizontal pass fills a 16-bit intermediate block; the vertical pass
// walks it column by column. The vertical result is then handled as follows:
//  - is_compound && !do_average: stored in conv_params->dst.
//  - is_compound && do_average: combined with the value in conv_params->dst
//    (weighted when use_dist_wtd_comp_avg is set, plain average otherwise),
//    un-biased, rounded and clipped into dst.
//  - !is_compound: un-biased, rounded and clipped into dst directly.
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int bd = 8;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  // Samples the kernels reach to the left of / above the sampled position.
  const int lead_cols = taps_x / 2 - 1;
  const int lead_rows = taps_y / 2 - 1;
  // Intermediate rows: the integer row reached by the last output row, plus
  // one full kernel height.
  const int mid_rows =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int mid_stride = w;
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // Horizontal pass: one intermediate row per source row.
  for (int r = 0; r < mid_rows; ++r) {
    const uint8_t *const in_row = src + (r - lead_rows) * src_stride;
    int16_t *const mid = im_block + r * mid_stride;
    int x_qn = subpel_x_qn;
    for (int c = 0; c < w; ++c, x_qn += x_step_qn) {
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
      const uint8_t *const window =
          in_row + (x_qn >> SCALE_SUBPEL_BITS) - lead_cols;
      // Bias so the sum stays non-negative before rounding.
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < taps_x; ++k) {
        sum += kernel[k] * window[k];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      mid[c] = (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
    }
  }

  // Vertical pass: columns outermost, rows innermost.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  // Combined contribution of both biases after rounding by round_1.
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int c = 0; c < w; ++c) {
    int y_qn = subpel_y_qn;
    for (int r = 0; r < h; ++r, y_qn += y_step_qn) {
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(y_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
      // First tap of the kernel inside this intermediate column.
      const int16_t *const first_tap =
          im_block + (y_qn >> SCALE_SUBPEL_BITS) * mid_stride + c;
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < taps_y; ++k) {
        sum += kernel[k] * first_tap[k * mid_stride];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, round_1);

      if (conv_params->is_compound && !conv_params->do_average) {
        // First compound prediction: keep the biased intermediate.
        dst16[r * dst16_stride + c] = res;
        continue;
      }

      int32_t tmp = res;
      if (conv_params->is_compound) {
        tmp = dst16[r * dst16_stride + c];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
          tmp = tmp >> DIST_PRECISION_BITS;
        } else {
          tmp += res;
          tmp = tmp >> 1;
        }
      }
      /* Subtract round offset and convolve round */
      tmp -= round_offset;
      dst[r * dst_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
    }
  }
}
540 
// Thin wrapper around av1_convolve_2d_scale(): in debug builds it checks that
// a compound prediction has an intermediate buffer (conv_params->dst) before
// forwarding every argument unchanged.
static void convolve_2d_scale_wrapper(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    ConvolveParams *conv_params) {
  const int needs_compound_buffer = conv_params->is_compound;
  if (needs_compound_buffer) {
    assert(conv_params->dst != NULL);
  }

  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
                        filter_params_x, filter_params_y,
                        subpel_x_qn, x_step_qn,
                        subpel_y_qn, y_step_qn,
                        conv_params);
}
554 
// Dispatches an 8-bit prediction to the appropriate convolve routine.
//
// interp_filters[0] is the horizontal filter and interp_filters[1] the
// vertical one.
//  - If either filter has 2 taps (both must, and scaling is not allowed), the
//    C reference single-reference routines are called directly for whichever
//    sub-pixel offsets are non-zero. With both offsets zero, control falls
//    through to the table lookup below.
//  - If scaled is set, the scaling path is used; x_step_q4 / y_step_q4 are
//    passed on as its step sizes.
//  - Otherwise the routine is taken from sf->convolve, indexed by whether
//    each sub-pixel offset is non-zero and by conv_params->is_compound.
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *interp_filters[2],
                            const int subpel_x_qn, int x_step_q4,
                            const int subpel_y_qn, int y_step_q4, int scaled,
                            ConvolveParams *conv_params,
                            const struct scale_factors *sf) {
  const InterpFilterParams *filter_params_x = interp_filters[0];
  const InterpFilterParams *filter_params_y = interp_filters[1];

  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
  // Do we have SIMD support to 4-tap case?
  // 2-tap filter indicates that it is for IntraBC.
  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
    assert(!scaled);
    if (subpel_x_qn && subpel_y_qn) {
      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                           filter_params_x, filter_params_y, subpel_x_qn,
                           subpel_y_qn, conv_params);
      return;
    } else if (subpel_x_qn) {
      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_x, filter_params_y, subpel_x_qn,
                          subpel_y_qn, conv_params);
      return;
    } else if (subpel_y_qn) {
      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_x, filter_params_y, subpel_x_qn,
                          subpel_y_qn, conv_params);
      return;
    }
    // Both offsets are zero: fall through to the generic dispatch.
  }

  if (scaled) {
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
  } else {
    sf->convolve[subpel_x_qn != 0][subpel_y_qn != 0][conv_params->is_compound](
        src, src_stride, dst, dst_stride, w, h, filter_params_x,
        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
  }
}
604 
605 #if CONFIG_AV1_HIGHBITDEPTH
// Copies a w x h block of 16-bit pixels from src to dst, one row at a time.
//
// No filtering is performed; the filter, sub-pixel, conv_params and bd
// arguments are ignored. memmove() is used, so a source and destination row
// may overlap.
void av1_highbd_convolve_2d_copy_sr_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_qn;
  (void)subpel_y_qn;
  (void)conv_params;
  (void)bd;

  const uint16_t *from = src;
  uint16_t *to = dst;
  for (int rows_left = h; rows_left > 0; --rows_left) {
    memmove(to, from, w * sizeof(src[0]));
    from += src_stride;
    to += dst_stride;
  }
}
622 
// Horizontal-only sub-pixel interpolation for 16-bit pixel storage.
//
// Applies the kernel selected by subpel_x_qn along each row. The sum is
// rounded in two steps -- first by conv_params->round_0, then by the
// remaining FILTER_BITS - round_0 bits -- and clipped for bit depth bd with
// clip_pixel_highbd(). filter_params_y and subpel_y_qn are ignored.
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params, int bd) {
  (void)filter_params_y;
  (void)subpel_y_qn;

  const int taps = filter_params_x->taps;
  // Columns the kernel reaches to the left of the output column.
  const int cols_left = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int bits = FILTER_BITS - round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    const uint16_t *const in = src + r * src_stride - cols_left;
    uint16_t *const out = dst + r * dst_stride;
    for (int c = 0; c < w; ++c) {
      int32_t acc = 0;
      for (int t = 0; t < taps; ++t) {
        acc += kernel[t] * in[c + t];
      }
      acc = ROUND_POWER_OF_TWO(acc, round_0);
      out[c] = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, bits), bd);
    }
  }
}
653 
// Vertical-only, single-reference interpolation of 16-bit samples.
// For every output position the kernel selected by
// (subpel_y_qn & SUBPEL_MASK) is applied down a column of
// filter_params_y->taps source rows starting (taps / 2 - 1) rows above that
// position. The sum is rounded by FILTER_BITS and clipped with
// clip_pixel_highbd() for bit depth bd.
// filter_params_x and subpel_x_qn are unused; conv_params is only inspected
// by the asserts.
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)conv_params;  // keeps release builds (asserts compiled out) quiet

  assert(conv_params->round_0 <= FILTER_BITS);
  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

  const int num_taps = filter_params_y->taps;
  const int rows_above = num_taps / 2 - 1;
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    // First source row touched by the kernel for this output row.
    const uint16_t *const top = src + (row - rows_above) * src_stride;
    uint16_t *const out_row = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * top[tap * src_stride + col];
      }
      out_row[col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
    }
  }
}
682 
// Separable 2-D single-reference interpolation of 16-bit samples.
// Pass 1 filters h + taps_y - 1 source rows horizontally into the local
// int16_t buffer im_block (stride w), adding 1 << (bd + FILTER_BITS - 1)
// before rounding by conv_params->round_0. Pass 2 filters im_block vertically
// with an added 1 << offset_bits, rounds by conv_params->round_1, subtracts
// the combined offset, rounds by the remaining
// 2 * FILTER_BITS - round_0 - round_1 bits and clips for bit depth bd.
// Requires w and h to be at most MAX_SB_SIZE (asserted).
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_x_qn, const int subpel_y_qn,
                                 ConvolveParams *conv_params, int bd) {
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int lead_x = taps_x / 2 - 1;
  const int lead_y = taps_y / 2 - 1;
  const int round0 = conv_params->round_0;
  const int round1 = conv_params->round_1;
  const int final_shift = FILTER_BITS * 2 - round0 - round1;
  assert(final_shift >= 0);

  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;

  // Pass 1: horizontal filter, starting lead_y rows above the block.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int r = 0; r < im_rows; ++r) {
    const uint16_t *const in_row = src + (r - lead_y) * src_stride;
    int16_t *const im_row = im_block + r * im_stride;
    for (int c = 0; c < w; ++c) {
      const uint16_t *const window = in_row + c - lead_x;
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] * window[tap];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_row[c] = (int16_t)ROUND_POWER_OF_TWO(acc, round0);
    }
  }

  // Pass 2: vertical filter over the intermediate rows.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round0;
  // Offset still present after rounding by round1; removed before clipping.
  const int32_t offset_removal =
      (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1));
  for (int r = 0; r < h; ++r) {
    uint16_t *const out_row = dst + r * dst_stride;
    for (int c = 0; c < w; ++c) {
      // Output row r consumes intermediate rows r .. r + taps_y - 1.
      const int16_t *const column = im_block + r * im_stride + c;
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * column[tap * im_stride];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const int32_t centred =
          ROUND_POWER_OF_TWO(acc, round1) - offset_removal;
      out_row[c] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(centred, final_shift), bd);
    }
  }
}
735 
// Separable 2-D interpolation of 16-bit samples for compound prediction.
// The horizontal and vertical passes mirror the single-reference 2-D filter
// and leave an offset intermediate value rounded by conv_params->round_1.
//   - If conv_params->do_average is zero, that value is stored into
//     conv_params->dst (stride conv_params->dst_stride) and dst is untouched.
//   - Otherwise it is combined with the value already in conv_params->dst:
//     weighted by fwd_offset / bck_offset and shifted by DIST_PRECISION_BITS
//     when use_dist_wtd_comp_avg is set, or averaged with a shift by 1
//     otherwise. The offset is then removed, the result rounded by
//     2 * FILTER_BITS - round_0 - round_1 bits, clipped for bit depth bd and
//     written to dst.
void av1_highbd_dist_wtd_convolve_2d_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int lead_x = taps_x / 2 - 1;
  const int lead_y = taps_y / 2 - 1;
  const int round0 = conv_params->round_0;
  const int round1 = conv_params->round_1;
  const int round_bits = 2 * FILTER_BITS - round0 - round1;
  assert(round_bits >= 0);

  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;

  // Pass 1: horizontal filter, starting lead_y rows above the block.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int r = 0; r < im_rows; ++r) {
    const uint16_t *const in_row = src + (r - lead_y) * src_stride;
    int16_t *const im_row = im_block + r * im_stride;
    for (int c = 0; c < w; ++c) {
      const uint16_t *const window = in_row + c - lead_x;
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] * window[tap];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_row[c] = (int16_t)ROUND_POWER_OF_TWO(acc, round0);
    }
  }

  // Pass 2: vertical filter, then store or blend.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round0;
  const int32_t offset_removal =
      (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1));
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      // Output row r consumes intermediate rows r .. r + taps_y - 1.
      const int16_t *const column = im_block + r * im_stride + c;
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * column[tap * im_stride];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round1);

      if (!conv_params->do_average) {
        dst16[r * dst16_stride + c] = res;
        continue;
      }

      int32_t blend = dst16[r * dst16_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = blend * conv_params->fwd_offset + res * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + res) >> 1;
      }
      blend -= offset_removal;
      dst[r * dst_stride + c] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, round_bits), bd);
    }
  }
}
802 
// Horizontal-only interpolation of 16-bit samples for compound prediction.
// The filtered sum is rounded by conv_params->round_0, scaled up by
// 1 << (FILTER_BITS - round_1) and biased by round_offset.
//   - If conv_params->do_average is zero, the biased value is stored into
//     conv_params->dst and dst is untouched.
//   - Otherwise it is combined with the value already in conv_params->dst
//     (fwd_offset / bck_offset weighting when use_dist_wtd_comp_avg is set,
//     plain average otherwise), round_offset is removed, and the result is
//     rounded by 2 * FILTER_BITS - round_0 - round_1 bits, clipped for bit
//     depth bd and written to dst.
// filter_params_y and subpel_y_qn are unused.
void av1_highbd_dist_wtd_convolve_x_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  (void)filter_params_y;
  (void)subpel_y_qn;

  const int num_taps = filter_params_x->taps;
  const int lead = num_taps / 2 - 1;
  const int round0 = conv_params->round_0;
  const int round1 = conv_params->round_1;
  const int upshift = FILTER_BITS - round1;
  const int offset_bits = bd + 2 * FILTER_BITS - round0;
  const int round_offset =
      (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1));
  const int round_bits = 2 * FILTER_BITS - round0 - round1;
  assert(round_bits >= 0);
  assert(upshift >= 0);

  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    const uint16_t *const in_row = src + row * src_stride;
    for (int col = 0; col < w; ++col) {
      const uint16_t *const window = in_row + col - lead;
      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * window[tap];
      }
      const int32_t res =
          (1 << upshift) * ROUND_POWER_OF_TWO(acc, round0) + round_offset;

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blend = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = blend * conv_params->fwd_offset + res * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + res) >> 1;
      }
      blend -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, round_bits), bd);
    }
  }
}
851 
// Vertical-only interpolation of 16-bit samples for compound prediction.
// The filtered sum is scaled up by 1 << (FILTER_BITS - round_0), rounded by
// conv_params->round_1 and biased by round_offset.
//   - If conv_params->do_average is zero, the biased value is stored into
//     conv_params->dst and dst is untouched.
//   - Otherwise it is combined with the value already in conv_params->dst
//     (fwd_offset / bck_offset weighting when use_dist_wtd_comp_avg is set,
//     plain average otherwise), round_offset is removed, and the result is
//     rounded by 2 * FILTER_BITS - round_0 - round_1 bits, clipped for bit
//     depth bd and written to dst.
// filter_params_x and subpel_x_qn are unused.
void av1_highbd_dist_wtd_convolve_y_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)subpel_x_qn;

  const int num_taps = filter_params_y->taps;
  const int rows_above = num_taps / 2 - 1;
  const int round0 = conv_params->round_0;
  const int round1 = conv_params->round_1;
  const int upshift = FILTER_BITS - round0;
  const int offset_bits = bd + 2 * FILTER_BITS - round0;
  const int round_offset =
      (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1));
  const int round_bits = 2 * FILTER_BITS - round0 - round1;
  assert(round_bits >= 0);
  assert(upshift >= 0);

  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    // First source row touched by the kernel for this output row.
    const uint16_t *const top = src + (row - rows_above) * src_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * top[tap * src_stride + col];
      }
      const int32_t scaled = acc * (1 << upshift);
      const int32_t res = ROUND_POWER_OF_TWO(scaled, round1) + round_offset;

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blend = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = blend * conv_params->fwd_offset + res * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + res) >> 1;
      }
      blend -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, round_bits), bd);
    }
  }
}
900 
// Unfiltered (integer-position) path of compound prediction for 16-bit
// samples. Each source sample is shifted left by
// 2 * FILTER_BITS - round_0 - round_1 bits and biased by round_offset, with
// both steps held in a CONV_BUF_TYPE.
//   - If conv_params->do_average is zero, the biased value is stored into
//     conv_params->dst and dst is untouched.
//   - Otherwise it is combined with the value already in conv_params->dst
//     (fwd_offset / bck_offset weighting when use_dist_wtd_comp_avg is set,
//     plain average otherwise), round_offset is removed, and the result is
//     rounded by the same bit count, clipped for bit depth bd and written
//     to dst.
// The filter parameters and sub-pixel offsets are unused.
void av1_highbd_dist_wtd_convolve_2d_copy_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_qn;
  (void)subpel_y_qn;

  const int round0 = conv_params->round_0;
  const int round1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round1 - round0;
  const int offset_bits = bd + 2 * FILTER_BITS - round0;
  const int round_offset =
      (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1));
  assert(bits >= 0);

  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  for (int row = 0; row < h; ++row) {
    const uint16_t *const in_row = src + row * src_stride;
    for (int col = 0; col < w; ++col) {
      // Kept as two CONV_BUF_TYPE steps so each narrows exactly as before.
      CONV_BUF_TYPE res = in_row[col] << bits;
      res += round_offset;

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blend = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = blend * conv_params->fwd_offset + res * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + res) >> 1;
      }
      blend -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, bits), bd);
    }
  }
}
941 
// Separable 2-D interpolation of 16-bit samples with per-sample stepping
// (scaled reference). The source position advances by x_step_qn per output
// column and y_step_qn per output row, starting from subpel_x_qn /
// subpel_y_qn; the integer part (>> SCALE_SUBPEL_BITS) selects the source
// sample and the fractional part selects the kernel.
// Pass 1 filters horizontally into the local int16_t buffer im_block
// (stride w). Pass 2 filters im_block vertically, column by column, and then
//   - non-compound: removes the offset, rounds, clips and writes dst;
//   - compound, do_average == 0: stores the offset value in conv_params->dst;
//   - compound, do_average != 0: blends with conv_params->dst (weighted when
//     use_dist_wtd_comp_avg is set, plain average otherwise), removes the
//     offset, rounds, clips and writes dst.
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int lead_x = taps_x / 2 - 1;
  const int lead_y = taps_y / 2 - 1;
  const int round0 = conv_params->round_0;
  const int round1 = conv_params->round_1;
  const int final_shift = FILTER_BITS * 2 - round0 - round1;
  assert(final_shift >= 0);

  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  // Rows spanned by the last output row's kernel, counted from the first.
  const int im_rows =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int im_stride = w;
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  // Pass 1: horizontal filter, starting lead_y rows above the block.
  for (int r = 0; r < im_rows; ++r) {
    const uint16_t *const in_row = src + (r - lead_y) * src_stride;
    int16_t *const im_row = im_block + r * im_stride;
    int pos_qn = subpel_x_qn;
    for (int c = 0; c < w; ++c, pos_qn += x_step_qn) {
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_x, kernel_idx);
      const uint16_t *const window =
          in_row + (pos_qn >> SCALE_SUBPEL_BITS) - lead_x;
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel[tap] * window[tap];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_row[c] = (int16_t)ROUND_POWER_OF_TWO(acc, round0);
    }
  }

  // Pass 2: vertical filter, one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - round0;
  // Offset still present after rounding by round1.
  const int32_t offset_removal =
      (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1));
  for (int c = 0; c < w; ++c) {
    int pos_qn = subpel_y_qn;
    for (int r = 0; r < h; ++r, pos_qn += y_step_qn) {
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_y, kernel_idx);
      // First intermediate row used by the kernel at this position.
      const int16_t *const column =
          im_block + (pos_qn >> SCALE_SUBPEL_BITS) * im_stride + c;
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel[tap] * column[tap * im_stride];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round1);

      if (!conv_params->is_compound) {
        /* Subtract round offset and convolve round */
        const int32_t centred = res - offset_removal;
        dst[r * dst_stride + c] =
            clip_pixel_highbd(ROUND_POWER_OF_TWO(centred, final_shift), bd);
        continue;
      }

      if (!conv_params->do_average) {
        dst16[r * dst16_stride + c] = res;
        continue;
      }

      int32_t blend = dst16[r * dst16_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = blend * conv_params->fwd_offset + res * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + res) >> 1;
      }
      /* Subtract round offset and convolve round */
      blend = blend - offset_removal;
      dst[r * dst_stride + c] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, final_shift), bd);
    }
  }
}
1027 
// Entry point that selects the high-bitdepth convolve routine for a block.
// src8 and dst8 are converted to 16-bit sample pointers with
// CONVERT_TO_SHORTPTR. A direction's filter parameters are taken from
// interp_filters[0] (x) / interp_filters[1] (y) only when that direction has
// a non-zero sub-pixel offset or the block is scaled; otherwise NULL is
// passed for it.
//   - scaled != 0: calls av1_highbd_convolve_2d_scale() with the step sizes.
//   - scaled == 0: calls the function stored in
//     sf->highbd_convolve[subpel_x_qn != 0][subpel_y_qn != 0][is_compound].
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                   uint8_t *dst8, int dst_stride, int w, int h,
                                   const InterpFilterParams *interp_filters[2],
                                   const int subpel_x_qn, int x_step_q4,
                                   const int subpel_y_qn, int y_step_q4,
                                   int scaled, ConvolveParams *conv_params,
                                   const struct scale_factors *sf, int bd) {
  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);

  const int has_subpel_x = subpel_x_qn != 0;
  const int has_subpel_y = subpel_y_qn != 0;

  const InterpFilterParams *filter_params_x = NULL;
  const InterpFilterParams *filter_params_y = NULL;
  if (has_subpel_x | scaled) filter_params_x = interp_filters[0];
  if (has_subpel_y | scaled) filter_params_y = interp_filters[1];

  if (!scaled) {
    sf->highbd_convolve[has_subpel_x][has_subpel_y][conv_params->is_compound](
        src, src_stride, dst, dst_stride, w, h, filter_params_x,
        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
    return;
  }

  // Compound prediction needs the intermediate buffer to be set.
  assert(!conv_params->is_compound || conv_params->dst != NULL);
  av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, filter_params_y, subpel_x_qn,
                               x_step_q4, subpel_y_qn, y_step_q4, conv_params,
                               bd);
}
1065 #endif  // CONFIG_AV1_HIGHBITDEPTH
1066 
// Note: Fixed-size intermediate buffers place limits on the parameters
// of some functions. 2d filtering proceeds in 2 steps:
//   (1) Interpolate horizontally into an intermediate buffer, temp.
//   (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (263):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 128x128 pixels.
// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
//   original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --ceil(((128 - 1) * 32 + 15) / 16) + 8 = 263.
#define WIENER_MAX_EXT_SIZE 263
1080 
horz_scalar_product(const uint8_t * a,const int16_t * b)1081 static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
1082   int sum = 0;
1083   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1084   return sum;
1085 }
1086 
1087 #if CONFIG_AV1_HIGHBITDEPTH
highbd_horz_scalar_product(const uint16_t * a,const int16_t * b)1088 static INLINE int highbd_horz_scalar_product(const uint16_t *a,
1089                                              const int16_t *b) {
1090   int sum = 0;
1091   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1092   return sum;
1093 }
1094 #endif
1095 
highbd_vert_scalar_product(const uint16_t * a,ptrdiff_t a_stride,const int16_t * b)1096 static INLINE int highbd_vert_scalar_product(const uint16_t *a,
1097                                              ptrdiff_t a_stride,
1098                                              const int16_t *b) {
1099   int sum = 0;
1100   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
1101   return sum;
1102 }
1103 
get_filter_base(const int16_t * filter)1104 static const InterpKernel *get_filter_base(const int16_t *filter) {
1105   // NOTE: This assumes that the filter table is 256-byte aligned.
1106   // TODO(agrange) Modify to make independent of table alignment.
1107   return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
1108 }
1109 
// Returns the index of the kernel at f within the table starting at base,
// counted in whole InterpKernel elements.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}
1113 
convolve_add_src_horiz_hip(const uint8_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits)1114 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
1115                                        uint16_t *dst, ptrdiff_t dst_stride,
1116                                        const InterpKernel *x_filters, int x0_q4,
1117                                        int x_step_q4, int w, int h,
1118                                        int round0_bits) {
1119   const int bd = 8;
1120   src -= SUBPEL_TAPS / 2 - 1;
1121   for (int y = 0; y < h; ++y) {
1122     int x_q4 = x0_q4;
1123     for (int x = 0; x < w; ++x) {
1124       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1125       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1126       const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1127                            (1 << (bd + FILTER_BITS - 1));
1128       const int sum = horz_scalar_product(src_x, x_filter) + rounding;
1129       dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1130                                WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
1131       x_q4 += x_step_q4;
1132     }
1133     src += src_stride;
1134     dst += dst_stride;
1135   }
1136 }
1137 
convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits)1138 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
1139                                       uint8_t *dst, ptrdiff_t dst_stride,
1140                                       const InterpKernel *y_filters, int y0_q4,
1141                                       int y_step_q4, int w, int h,
1142                                       int round1_bits) {
1143   const int bd = 8;
1144   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1145 
1146   for (int x = 0; x < w; ++x) {
1147     int y_q4 = y0_q4;
1148     for (int y = 0; y < h; ++y) {
1149       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1150       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1151       const int rounding =
1152           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1153           (1 << (bd + round1_bits - 1));
1154       const int sum =
1155           highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1156       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
1157       y_q4 += y_step_q4;
1158     }
1159     ++src;
1160     ++dst;
1161   }
1162 }
1163 
av1_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const ConvolveParams * conv_params)1164 void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1165                                    uint8_t *dst, ptrdiff_t dst_stride,
1166                                    const int16_t *filter_x, int x_step_q4,
1167                                    const int16_t *filter_y, int y_step_q4,
1168                                    int w, int h,
1169                                    const ConvolveParams *conv_params) {
1170   const InterpKernel *const filters_x = get_filter_base(filter_x);
1171   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1172 
1173   const InterpKernel *const filters_y = get_filter_base(filter_y);
1174   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1175 
1176   uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1177   const int intermediate_height =
1178       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
1179   memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
1180 
1181   assert(w <= MAX_SB_SIZE);
1182   assert(h <= MAX_SB_SIZE);
1183   assert(y_step_q4 <= 32);
1184   assert(x_step_q4 <= 32);
1185 
1186   convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1187                              src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
1188                              x_step_q4, w, intermediate_height,
1189                              conv_params->round_0);
1190   convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1191                             MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
1192                             y_step_q4, w, h, conv_params->round_1);
1193 }
1194 
1195 #if CONFIG_AV1_HIGHBITDEPTH
highbd_convolve_add_src_horiz_hip(const uint8_t * src8,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits,int bd)1196 static void highbd_convolve_add_src_horiz_hip(
1197     const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1198     ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1199     int x_step_q4, int w, int h, int round0_bits, int bd) {
1200   const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
1201   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1202   src -= SUBPEL_TAPS / 2 - 1;
1203   for (int y = 0; y < h; ++y) {
1204     int x_q4 = x0_q4;
1205     for (int x = 0; x < w; ++x) {
1206       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1207       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1208       const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1209                            (1 << (bd + FILTER_BITS - 1));
1210       const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1211       dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1212                                extraprec_clamp_limit - 1);
1213       x_q4 += x_step_q4;
1214     }
1215     src += src_stride;
1216     dst += dst_stride;
1217   }
1218 }
1219 
highbd_convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits,int bd)1220 static void highbd_convolve_add_src_vert_hip(
1221     const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1222     ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1223     int y_step_q4, int w, int h, int round1_bits, int bd) {
1224   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1225   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1226   for (int x = 0; x < w; ++x) {
1227     int y_q4 = y0_q4;
1228     for (int y = 0; y < h; ++y) {
1229       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1230       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1231       const int rounding =
1232           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1233           (1 << (bd + round1_bits - 1));
1234       const int sum =
1235           highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1236       dst[y * dst_stride] =
1237           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
1238       y_q4 += y_step_q4;
1239     }
1240     ++src;
1241     ++dst;
1242   }
1243 }
1244 
av1_highbd_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const ConvolveParams * conv_params,int bd)1245 void av1_highbd_wiener_convolve_add_src_c(
1246     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1247     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1248     const int16_t *filter_y, int y_step_q4, int w, int h,
1249     const ConvolveParams *conv_params, int bd) {
1250   const InterpKernel *const filters_x = get_filter_base(filter_x);
1251   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1252 
1253   const InterpKernel *const filters_y = get_filter_base(filter_y);
1254   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1255 
1256   uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1257   const int intermediate_height =
1258       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1259 
1260   assert(w <= MAX_SB_SIZE);
1261   assert(h <= MAX_SB_SIZE);
1262   assert(y_step_q4 <= 32);
1263   assert(x_step_q4 <= 32);
1264   assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
1265 
1266   highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1267                                     src_stride, temp, MAX_SB_SIZE, filters_x,
1268                                     x0_q4, x_step_q4, w, intermediate_height,
1269                                     conv_params->round_0, bd);
1270   highbd_convolve_add_src_vert_hip(
1271       temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
1272       filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
1273 }
1274 #endif  // CONFIG_AV1_HIGHBITDEPTH
1275