1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <string.h>
14 
15 #include "config/aom_dsp_rtcd.h"
16 #include "config/av1_rtcd.h"
17 
18 #include "av1/common/blockd.h"
19 #include "av1/common/convolve.h"
20 #include "av1/common/filter.h"
21 #include "av1/common/onyxc_int.h"
22 #include "av1/common/resize.h"
23 #include "aom_dsp/aom_dsp_common.h"
24 #include "aom_ports/mem.h"
25 
av1_convolve_horiz_rs_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn)26 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
27                              int dst_stride, int w, int h,
28                              const int16_t *x_filters, int x0_qn,
29                              int x_step_qn) {
30   src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
31   for (int y = 0; y < h; ++y) {
32     int x_qn = x0_qn;
33     for (int x = 0; x < w; ++x) {
34       const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
35       const int x_filter_idx =
36           (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
37       assert(x_filter_idx <= RS_SUBPEL_MASK);
38       const int16_t *const x_filter =
39           &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
40       int sum = 0;
41       for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
42         sum += src_x[k] * x_filter[k];
43       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
44       x_qn += x_step_qn;
45     }
46     src += src_stride;
47     dst += dst_stride;
48   }
49 }
50 
av1_highbd_convolve_horiz_rs_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn,int bd)51 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
52                                     uint16_t *dst, int dst_stride, int w, int h,
53                                     const int16_t *x_filters, int x0_qn,
54                                     int x_step_qn, int bd) {
55   src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
56   for (int y = 0; y < h; ++y) {
57     int x_qn = x0_qn;
58     for (int x = 0; x < w; ++x) {
59       const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
60       const int x_filter_idx =
61           (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
62       assert(x_filter_idx <= RS_SUBPEL_MASK);
63       const int16_t *const x_filter =
64           &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
65       int sum = 0;
66       for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
67         sum += src_x[k] * x_filter[k];
68       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
69       x_qn += x_step_qn;
70     }
71     src += src_stride;
72     dst += dst_stride;
73   }
74 }
75 
av1_convolve_2d_sobel_y_c(const uint8_t * src,int src_stride,double * dst,int dst_stride,int w,int h,int dir,double norm)76 void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
77                                int dst_stride, int w, int h, int dir,
78                                double norm) {
79   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
80   DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 };
81   DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 };
82   const int taps = 3;
83   int im_h = h + taps - 1;
84   int im_stride = w;
85   const int fo_vert = 1;
86   const int fo_horiz = 1;
87 
88   // horizontal filter
89   const uint8_t *src_horiz = src - fo_vert * src_stride;
90   const int16_t *x_filter = dir ? sobel_a : sobel_b;
91   for (int y = 0; y < im_h; ++y) {
92     for (int x = 0; x < w; ++x) {
93       int16_t sum = 0;
94       for (int k = 0; k < taps; ++k) {
95         sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
96       }
97       im_block[y * im_stride + x] = sum;
98     }
99   }
100 
101   // vertical filter
102   int16_t *src_vert = im_block + fo_vert * im_stride;
103   const int16_t *y_filter = dir ? sobel_b : sobel_a;
104   for (int y = 0; y < h; ++y) {
105     for (int x = 0; x < w; ++x) {
106       int16_t sum = 0;
107       for (int k = 0; k < taps; ++k) {
108         sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
109       }
110       dst[y * dst_stride + x] = sum * norm;
111     }
112   }
113 }
114 
// Two-pass (horizontal then vertical) sub-pixel convolution producing 8-bit
// pixels (C reference version).
//
// Pass 1 filters h + taps_y - 1 rows horizontally, rounds by
// conv_params->round_0 and stores the result in a 16-bit intermediate block.
// Pass 2 filters that block vertically, rounds by conv_params->round_1,
// removes the offsets that were added to keep the intermediate sums
// non-negative, applies the remaining rounding and clamps with clip_pixel().
//
//   filter_params_x/y  tap count and kernel table for each direction
//   subpel_x_q4/y_q4   sub-pixel positions; masked with SUBPEL_MASK to pick
//                      the kernel
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          const InterpFilterParams *filter_params_x,
                          const InterpFilterParams *filter_params_y,
                          const int subpel_x_q4, const int subpel_y_q4,
                          ConvolveParams *conv_params) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int bd = 8;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int lead_x = taps_x / 2 - 1;  // taps left of the centre sample
  const int lead_y = taps_y / 2 - 1;  // taps above the centre sample
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int mid_rows = h + taps_y - 1;
  const int mid_stride = w;
  const int final_shift = FILTER_BITS * 2 - round_0 - round_1;

  // Pass 1: horizontal filter. The input window starts lead_y rows above and
  // lead_x columns left of the block.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  const uint8_t *const in_origin = src - lead_y * src_stride - lead_x;
  for (int r = 0; r < mid_rows; ++r) {
    const uint8_t *const in_row = in_origin + r * src_stride;
    int16_t *const mid_row = im_block + r * mid_stride;
    for (int c = 0; c < w; ++c) {
      // The bias keeps the sum non-negative (checked by the assert below).
      int32_t acc = 1 << (bd + FILTER_BITS - 1);
      for (int t = 0; t < taps_x; ++t) {
        acc += kernel_x[t] * in_row[c + t];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      mid_row[c] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Pass 2: vertical filter. Intermediate row r + t corresponds to source
  // row (r + t - lead_y).
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  // Combined contribution of the pass-1 and pass-2 biases after round_1.
  const int offset_removal = (1 << (offset_bits - round_1)) +
                             (1 << (offset_bits - round_1 - 1));
  for (int r = 0; r < h; ++r) {
    uint8_t *const out_row = dst + r * dst_stride;
    for (int c = 0; c < w; ++c) {
      int32_t acc = 1 << offset_bits;
      for (int t = 0; t < taps_y; ++t) {
        acc += kernel_y[t] * im_block[(r + t) * mid_stride + c];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const int16_t res = ROUND_POWER_OF_TWO(acc, round_1) - offset_removal;
      out_row[c] = clip_pixel(ROUND_POWER_OF_TWO(res, final_shift));
    }
  }
}
165 
// Vertical-only sub-pixel convolution producing 8-bit pixels (C reference
// version).
//
// Each output pixel is the kernel-weighted sum of filter_params_y->taps
// vertically adjacent source pixels, rounded by FILTER_BITS and clamped with
// clip_pixel(). The horizontal arguments are ignored; conv_params is only
// inspected by the asserts.
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams *filter_params_x,
                         const InterpFilterParams *filter_params_y,
                         const int subpel_x_q4, const int subpel_y_q4,
                         ConvolveParams *conv_params) {
  const int num_taps = filter_params_y->taps;
  const int lead = num_taps / 2 - 1;  // taps above the centre sample
  (void)filter_params_x;
  (void)subpel_x_q4;
  (void)conv_params;  // only referenced by the asserts below

  assert(conv_params->round_0 <= FILTER_BITS);
  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      // Topmost source sample that contributes to output pixel (r, c).
      const uint8_t *const column = src + (r - lead) * src_stride + c;
      int32_t acc = 0;
      for (int t = 0; t < num_taps; ++t) {
        acc += kernel[t] * column[t * src_stride];
      }
      dst[r * dst_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
    }
  }
}
195 
// Horizontal-only sub-pixel convolution producing 8-bit pixels (C reference
// version).
//
// Each output pixel is the kernel-weighted sum of filter_params_x->taps
// horizontally adjacent source pixels. The sum is rounded in two steps:
// first by conv_params->round_0, then by FILTER_BITS - round_0, and finally
// clamped with clip_pixel(). The vertical arguments are ignored.
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams *filter_params_x,
                         const InterpFilterParams *filter_params_y,
                         const int subpel_x_q4, const int subpel_y_q4,
                         ConvolveParams *conv_params) {
  const int num_taps = filter_params_x->taps;
  const int lead = num_taps / 2 - 1;  // taps left of the centre sample
  const int round_0 = conv_params->round_0;
  const int second_shift = FILTER_BITS - round_0;
  (void)filter_params_y;
  (void)subpel_y_q4;

  assert(second_shift >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    // First sample of the filter window for column 0 of this row.
    const uint8_t *const in_row = src + r * src_stride - lead;
    uint8_t *const out_row = dst + r * dst_stride;
    for (int c = 0; c < w; ++c) {
      int32_t acc = 0;
      for (int t = 0; t < num_taps; ++t) {
        acc += kernel[t] * in_row[c + t];
      }
      acc = ROUND_POWER_OF_TWO(acc, round_0);
      out_row[c] = clip_pixel(ROUND_POWER_OF_TWO(acc, second_shift));
    }
  }
}
227 
// Copies a w x h block of 8-bit pixels from src to dst, one row at a time.
//
// The filter, sub-pixel and conv_params arguments are ignored. memmove() is
// used for each row, so a source row may overlap its destination row.
void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_x_q4, const int subpel_y_q4,
                               ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;
  (void)conv_params;

  const size_t row_bytes = w * sizeof(src[0]);
  for (int row = 0; row < h; ++row) {
    const uint8_t *const from = src + row * src_stride;
    uint8_t *const to = dst + row * dst_stride;
    memmove(to, from, row_bytes);
  }
}
244 
// Two-pass (horizontal then vertical) sub-pixel convolution for compound
// prediction, 8-bit input (C reference version).
//
// The filtered value, still carrying its rounding offset, is handled in one
// of two ways depending on conv_params->do_average:
//   - 0: it is stored in the intermediate buffer conv_params->dst.
//   - 1: it is combined with the value already in conv_params->dst (weighted
//        by fwd_offset / bck_offset when use_dist_wtd_comp_avg is set,
//        otherwise a plain average), the offset is removed, and the rounded,
//        clamped pixel is written to dst8.
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
                                uint8_t *dst8, int dst8_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_q4, const int subpel_y_q4,
                                ConvolveParams *conv_params) {
  CONV_BUF_TYPE *const buf16 = conv_params->dst;
  const int buf16_stride = conv_params->dst_stride;
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int bd = 8;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int lead_x = taps_x / 2 - 1;  // taps left of the centre sample
  const int lead_y = taps_y / 2 - 1;  // taps above the centre sample
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int mid_rows = h + taps_y - 1;
  const int mid_stride = w;
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;

  // Pass 1: horizontal filter into the 16-bit intermediate block. The input
  // window starts lead_y rows above and lead_x columns left of the block.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  const uint8_t *const in_origin = src - lead_y * src_stride - lead_x;
  for (int r = 0; r < mid_rows; ++r) {
    const uint8_t *const in_row = in_origin + r * src_stride;
    int16_t *const mid_row = im_block + r * mid_stride;
    for (int c = 0; c < w; ++c) {
      // The bias keeps the sum non-negative (checked by the assert below).
      int32_t acc = 1 << (bd + FILTER_BITS - 1);
      for (int t = 0; t < taps_x; ++t) {
        acc += kernel_x[t] * in_row[c + t];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      mid_row[c] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Pass 2: vertical filter. Intermediate row r + t corresponds to source
  // row (r + t - lead_y).
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int offset_shift = offset_bits - round_1;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      int32_t acc = 1 << offset_bits;
      for (int t = 0; t < taps_y; ++t) {
        acc += kernel_y[t] * im_block[(r + t) * mid_stride + c];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1);

      if (!conv_params->do_average) {
        // First prediction: keep the offset value for a later call.
        buf16[r * buf16_stride + c] = res;
        continue;
      }

      // Second prediction: blend with the stored first prediction.
      int32_t blend = buf16[r * buf16_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = blend * conv_params->fwd_offset + res * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + res) >> 1;
      }
      // Remove the rounding offset carried by both predictions.
      blend -= (1 << offset_shift) + (1 << (offset_shift - 1));
      dst8[r * dst8_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(blend, round_bits));
    }
  }
}
310 
// Vertical-only sub-pixel convolution for compound prediction, 8-bit input
// (C reference version).
//
// The filter sum is scaled by 1 << (FILTER_BITS - round_0), rounded by
// round_1 and biased by round_offset. Depending on conv_params->do_average
// the result is either stored in conv_params->dst, or blended with the value
// already there (weighted by fwd_offset / bck_offset when
// use_dist_wtd_comp_avg is set, otherwise a plain average), un-biased,
// rounded, clamped and written to dst8. The horizontal arguments are ignored.
void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride,
                               uint8_t *dst8, int dst8_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_x_q4, const int subpel_y_q4,
                               ConvolveParams *conv_params) {
  CONV_BUF_TYPE *const buf16 = conv_params->dst;
  const int buf16_stride = conv_params->dst_stride;
  const int num_taps = filter_params_y->taps;
  const int lead = num_taps / 2 - 1;  // taps above the centre sample
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_0;
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  (void)filter_params_x;
  (void)subpel_x_q4;

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      // Topmost source sample that contributes to output pixel (r, c).
      const uint8_t *const column = src + (r - lead) * src_stride + c;
      int32_t acc = 0;
      for (int t = 0; t < num_taps; ++t) {
        acc += kernel[t] * column[t * src_stride];
      }
      acc *= (1 << bits);
      acc = ROUND_POWER_OF_TWO(acc, round_1) + round_offset;

      if (!conv_params->do_average) {
        // First prediction: keep the offset value for a later call.
        buf16[r * buf16_stride + c] = acc;
        continue;
      }

      // Second prediction: blend with the stored first prediction.
      int32_t blend = buf16[r * buf16_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = blend * conv_params->fwd_offset + acc * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + acc) >> 1;
      }
      blend -= round_offset;
      dst8[r * dst8_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(blend, round_bits));
    }
  }
}
360 
// Horizontal-only sub-pixel convolution for compound prediction, 8-bit input
// (C reference version).
//
// The filter sum is rounded by round_0, scaled by
// 1 << (FILTER_BITS - round_1) and biased by round_offset. Depending on
// conv_params->do_average the result is either stored in conv_params->dst, or
// blended with the value already there (weighted by fwd_offset / bck_offset
// when use_dist_wtd_comp_avg is set, otherwise a plain average), un-biased,
// rounded, clamped and written to dst8. The vertical arguments are ignored.
void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride,
                               uint8_t *dst8, int dst8_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_x_q4, const int subpel_y_q4,
                               ConvolveParams *conv_params) {
  CONV_BUF_TYPE *const buf16 = conv_params->dst;
  const int buf16_stride = conv_params->dst_stride;
  const int num_taps = filter_params_x->taps;
  const int lead = num_taps / 2 - 1;  // taps left of the centre sample
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_1;
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  (void)filter_params_y;
  (void)subpel_y_q4;

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    // First sample of the filter window for column 0 of this row.
    const uint8_t *const in_row = src + r * src_stride - lead;
    for (int c = 0; c < w; ++c) {
      int32_t acc = 0;
      for (int t = 0; t < num_taps; ++t) {
        acc += kernel[t] * in_row[c + t];
      }
      acc = (1 << bits) * ROUND_POWER_OF_TWO(acc, round_0);
      acc += round_offset;

      if (!conv_params->do_average) {
        // First prediction: keep the offset value for a later call.
        buf16[r * buf16_stride + c] = acc;
        continue;
      }

      // Second prediction: blend with the stored first prediction.
      int32_t blend = buf16[r * buf16_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = blend * conv_params->fwd_offset + acc * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + acc) >> 1;
      }
      blend -= round_offset;
      dst8[r * dst8_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(blend, round_bits));
    }
  }
}
410 
// Unfiltered (copy) path for compound prediction, 8-bit input (C reference
// version).
//
// Every source pixel is shifted left by
// 2 * FILTER_BITS - round_0 - round_1 bits and biased by round_offset.
// Depending on conv_params->do_average the result is either stored in
// conv_params->dst, or blended with the value already there (weighted by
// fwd_offset / bck_offset when use_dist_wtd_comp_avg is set, otherwise a
// plain average), un-biased, rounded, clamped and written to dst8. The
// filter and sub-pixel arguments are ignored.
void av1_dist_wtd_convolve_2d_copy_c(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params) {
  CONV_BUF_TYPE *const buf16 = conv_params->dst;
  const int buf16_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;

  for (int r = 0; r < h; ++r) {
    const uint8_t *const in_row = src + r * src_stride;
    for (int c = 0; c < w; ++c) {
      CONV_BUF_TYPE scaled = in_row[c] << bits;
      scaled += round_offset;

      if (!conv_params->do_average) {
        // First prediction: keep the offset value for a later call.
        buf16[r * buf16_stride + c] = scaled;
        continue;
      }

      // Second prediction: blend with the stored first prediction.
      int32_t blend = buf16[r * buf16_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend =
            blend * conv_params->fwd_offset + scaled * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + scaled) >> 1;
      }
      blend -= round_offset;
      dst8[r * dst8_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(blend, bits));
    }
  }
}
451 
// Two-pass sub-pixel convolution with scaling, 8-bit input (C reference
// version).
//
// The source position advances by x_step_qn per output column and y_step_qn
// per output row, both in fixed point, starting from subpel_x_qn /
// subpel_y_qn. For every output sample the integer part of the position
// selects the input sample and the fractional part selects the kernel.
//
// Output handling after the vertical pass:
//   - not compound: the rounding offset is removed and the rounded, clamped
//     pixel is written to dst8;
//   - compound, do_average == 0: the offset value is stored in
//     conv_params->dst;
//   - compound, do_average == 1: the value is blended with the one stored in
//     conv_params->dst (weighted by fwd_offset / bck_offset when
//     use_dist_wtd_comp_avg is set, otherwise a plain average), the offset is
//     removed and the rounded, clamped pixel is written to dst8.
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
                             int dst8_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  // Number of input rows needed to cover the last output row's window.
  const int mid_rows =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  CONV_BUF_TYPE *const buf16 = conv_params->dst;
  const int buf16_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);
  const int mid_stride = w;
  const int lead_y = taps_y / 2 - 1;  // taps above the centre sample
  const int lead_x = taps_x / 2 - 1;  // taps left of the centre sample
  const int bd = 8;

  // Pass 1: horizontal filter into the 16-bit intermediate block. The input
  // window starts lead_y rows above and lead_x columns left of the block.
  const uint8_t *const in_origin = src - lead_y * src_stride - lead_x;
  for (int r = 0; r < mid_rows; ++r) {
    const uint8_t *const in_row = in_origin + r * src_stride;
    int16_t *const mid_row = im_block + r * mid_stride;
    int x_qn = subpel_x_qn;
    for (int c = 0; c < w; ++c, x_qn += x_step_qn) {
      const uint8_t *const window = in_row + (x_qn >> SCALE_SUBPEL_BITS);
      const int phase_x = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(phase_x < SUBPEL_SHIFTS);
      const int16_t *const kernel_x =
          av1_get_interp_filter_subpel_kernel(filter_params_x, phase_x);
      // The bias keeps the sum non-negative (checked by the assert below).
      int32_t acc = 1 << (bd + FILTER_BITS - 1);
      for (int t = 0; t < taps_x; ++t) {
        acc += kernel_x[t] * window[t];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      mid_row[c] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Pass 2: vertical filter, walking the output column by column. The first
  // intermediate row corresponds to source row -lead_y, so the window for
  // integer position p starts at intermediate row p.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int offset_shift = offset_bits - round_1;
  for (int c = 0; c < w; ++c) {
    const int16_t *const mid_col = im_block + c;
    int y_qn = subpel_y_qn;
    for (int r = 0; r < h; ++r, y_qn += y_step_qn) {
      const int16_t *const window =
          mid_col + (y_qn >> SCALE_SUBPEL_BITS) * mid_stride;
      const int phase_y = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(phase_y < SUBPEL_SHIFTS);
      const int16_t *const kernel_y =
          av1_get_interp_filter_subpel_kernel(filter_params_y, phase_y);
      int32_t acc = 1 << offset_bits;
      for (int t = 0; t < taps_y; ++t) {
        acc += kernel_y[t] * window[t * mid_stride];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1);

      if (!conv_params->is_compound) {
        /* Subtract round offset and convolve round */
        const int32_t px =
            res - ((1 << offset_shift) + (1 << (offset_shift - 1)));
        dst8[r * dst8_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(px, bits));
      } else if (!conv_params->do_average) {
        // First prediction: keep the offset value for a later call.
        buf16[r * buf16_stride + c] = res;
      } else {
        // Second prediction: blend with the stored first prediction.
        int32_t blend = buf16[r * buf16_stride + c];
        if (conv_params->use_dist_wtd_comp_avg) {
          blend =
              blend * conv_params->fwd_offset + res * conv_params->bck_offset;
          blend = blend >> DIST_PRECISION_BITS;
        } else {
          blend = (blend + res) >> 1;
        }
        /* Subtract round offset and convolve round */
        blend = blend - ((1 << offset_shift) + (1 << (offset_shift - 1)));
        dst8[r * dst8_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(blend, bits));
      }
    }
  }
}
537 
// Forwards all arguments unchanged to av1_convolve_2d_scale(). In debug
// builds it first checks that a compound prediction has an intermediate
// buffer (conv_params->dst) to work with.
static void convolve_2d_scale_wrapper(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    ConvolveParams *conv_params) {
  assert(!conv_params->is_compound || conv_params->dst != NULL);

  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
                        y_step_qn, conv_params);
}
551 
552 // TODO(huisu@google.com): bilinear filtering only needs 2 taps in general. So
553 // we may create optimized code to do 2-tap filtering for all bilinear filtering
554 // usages, not just IntraBC.
convolve_2d_for_intrabc(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,int subpel_x_q4,int subpel_y_q4,ConvolveParams * conv_params)555 static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride,
556                                     uint8_t *dst, int dst_stride, int w, int h,
557                                     int subpel_x_q4, int subpel_y_q4,
558                                     ConvolveParams *conv_params) {
559   const InterpFilterParams *filter_params_x =
560       subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
561   const InterpFilterParams *filter_params_y =
562       subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
563   if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
564     av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
565                          filter_params_x, filter_params_y, 0, 0, conv_params);
566   } else if (subpel_x_q4 != 0) {
567     av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
568                         filter_params_y, 0, 0, conv_params);
569   } else {
570     av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
571                         filter_params_y, 0, 0, conv_params);
572   }
573 }
574 
// Entry point that selects the convolution routine for a prediction block.
//
//   - IntraBC with a non-zero sub-pixel offset in either direction goes to
//     convolve_2d_for_intrabc().
//   - Scaled prediction goes to convolve_2d_scale_wrapper().
//   - Everything else is dispatched through the sf->convolve table, indexed
//     by [x offset non-zero][y offset non-zero][is_compound].
//
// Filter parameters are only looked up for a direction that has a non-zero
// sub-pixel offset, or for both directions when scaled is set; otherwise NULL
// is passed for that direction.
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            InterpFilters interp_filters, const int subpel_x_q4,
                            int x_step_q4, const int subpel_y_q4, int y_step_q4,
                            int scaled, ConvolveParams *conv_params,
                            const struct scale_factors *sf, int is_intrabc) {
  assert(IMPLIES(is_intrabc, !scaled));

  const int frac_x = (subpel_x_q4 != 0);
  const int frac_y = (subpel_y_q4 != 0);

  if (is_intrabc && (frac_x || frac_y)) {
    convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4,
                            subpel_y_q4, conv_params);
    return;
  }

  const int want_x = frac_x | scaled;
  const int want_y = frac_y | scaled;

  InterpFilter kind_x = 0;
  InterpFilter kind_y = 0;
  if (want_x) {
    kind_x = av1_extract_interp_filter(interp_filters, 1);
  }
  if (want_y) {
    kind_y = av1_extract_interp_filter(interp_filters, 0);
  }

  const InterpFilterParams *x_params = NULL;
  const InterpFilterParams *y_params = NULL;
  if (want_x) {
    x_params = av1_get_interp_filter_params_with_block_size(kind_x, w);
  }
  if (want_y) {
    y_params = av1_get_interp_filter_params_with_block_size(kind_y, h);
  }

  if (scaled) {
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, x_params,
                              y_params, subpel_x_q4, x_step_q4, subpel_y_q4,
                              y_step_q4, conv_params);
    return;
  }

  sf->convolve[frac_x][frac_y][conv_params->is_compound](
      src, src_stride, dst, dst_stride, w, h, x_params, y_params, subpel_x_q4,
      subpel_y_q4, conv_params);
}
620 
// Copies a w x h block of 16-bit pixels from src to dst, one row at a time.
//
// The filter, sub-pixel, conv_params and bd arguments are ignored. memmove()
// is used for each row, so a source row may overlap its destination row.
void av1_highbd_convolve_2d_copy_sr_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;
  (void)conv_params;
  (void)bd;

  const size_t row_bytes = w * sizeof(src[0]);
  for (int row = 0; row < h; ++row) {
    const uint16_t *const from = src + row * src_stride;
    uint16_t *const to = dst + row * dst_stride;
    memmove(to, from, row_bytes);
  }
}
637 
/* Horizontal-only single-reference convolution for 16-bit samples.
 *
 * For every output position the kernel selected from filter_params_x by
 * (subpel_x_q4 & SUBPEL_MASK) is applied along the row, starting
 * taps / 2 - 1 samples to the left of the output column. The sum is rounded
 * by conv_params->round_0, then by FILTER_BITS - round_0, and finally passed
 * through clip_pixel_highbd() with bd.
 *
 * filter_params_y and subpel_y_q4 are not used. */
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_q4, const int subpel_y_q4,
                                ConvolveParams *conv_params, int bd) {
  (void)subpel_y_q4;
  (void)filter_params_y;

  const int num_taps = filter_params_x->taps;
  const int left_ext = num_taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int bits = FILTER_BITS - round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    uint16_t *const out_row = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      // First sample covered by the filter window for this output column.
      const uint16_t *const window = src + row * src_stride + col - left_ext;
      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * window[tap];
      }
      // Two-stage rounding: round_0 first, then the remaining bits.
      acc = ROUND_POWER_OF_TWO(acc, round_0);
      out_row[col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, bits), bd);
    }
  }
}
668 
/* Vertical-only single-reference convolution for 16-bit samples.
 *
 * For every output position the kernel selected from filter_params_y by
 * (subpel_y_q4 & SUBPEL_MASK) is applied down the column, starting
 * taps / 2 - 1 rows above the output row. The sum is rounded by FILTER_BITS
 * and passed through clip_pixel_highbd() with bd.
 *
 * filter_params_x and subpel_x_q4 are not used; conv_params is only read by
 * the assertions. */
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_q4, const int subpel_y_q4,
                                ConvolveParams *conv_params, int bd) {
  (void)conv_params;  // referenced only inside assert()
  (void)subpel_x_q4;
  (void)filter_params_x;

  const int num_taps = filter_params_y->taps;
  const int top_ext = num_taps / 2 - 1;

  assert(conv_params->round_0 <= FILTER_BITS);
  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    uint16_t *const out_row = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      // Topmost sample covered by the filter window for this output sample.
      const uint16_t *const column = src + (row - top_ext) * src_stride + col;
      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * column[tap * src_stride];
      }
      out_row[col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
    }
  }
}
697 
/* Separable 2-D single-reference convolution for 16-bit samples.
 *
 * Pass 1 filters horizontally into the on-stack im_block buffer, producing
 * h + taps_y - 1 rows of w values (stride w), each rounded by
 * conv_params->round_0 after adding an offset of 1 << (bd + FILTER_BITS - 1).
 * Pass 2 filters im_block vertically, rounds by conv_params->round_1,
 * subtracts the accumulated offsets, rounds by the remaining
 * 2 * FILTER_BITS - round_0 - round_1 bits and passes the value through
 * clip_pixel_highbd() with bd. */
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_x_q4, const int subpel_y_q4,
                                 ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int im_h = h + taps_y - 1;
  const int im_stride = w;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // Pass 1: horizontal filter into im_block.
  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  // Address of the first tap for the top-left intermediate sample.
  const uint16_t *const first_tap = src - fo_vert * src_stride - fo_horiz;
  const int horiz_offset = 1 << (bd + FILTER_BITS - 1);
  for (int row = 0; row < im_h; ++row) {
    const uint16_t *const in_row = first_tap + row * src_stride;
    int16_t *const out_row = im_block + row * im_stride;
    for (int col = 0; col < w; ++col) {
      int32_t sum = horiz_offset;
      for (int tap = 0; tap < taps_x; ++tap) {
        sum += x_filter[tap] * in_row[col + tap];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      out_row[col] = ROUND_POWER_OF_TWO(sum, round_0);
    }
  }

  // Pass 2: vertical filter over im_block.
  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int vert_offset = 1 << offset_bits;
  // Combined offset to remove after the round_1 rounding step.
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int row = 0; row < h; ++row) {
    uint16_t *const out_row = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      // Output row `row` consumes intermediate rows row .. row + taps_y - 1.
      const int16_t *const column = im_block + row * im_stride + col;
      int32_t sum = vert_offset;
      for (int tap = 0; tap < taps_y; ++tap) {
        sum += y_filter[tap] * column[tap * im_stride];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      const int32_t res = ROUND_POWER_OF_TWO(sum, round_1) - round_offset;
      out_row[col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
    }
  }
}
749 
/* Separable 2-D compound convolution for 16-bit samples.
 *
 * Pass 1 filters horizontally into the on-stack im_block buffer
 * (h + taps_y - 1 rows, stride w). Pass 2 filters vertically and rounds by
 * conv_params->round_1, giving an offset intermediate value `res`.
 *
 * If conv_params->do_average is zero, `res` is stored to conv_params->dst.
 * Otherwise `res` is combined with the value already in conv_params->dst,
 * either weighted by fwd_offset / bck_offset and shifted by
 * DIST_PRECISION_BITS (use_dist_wtd_comp_avg set) or as a plain (a + b) >> 1;
 * the offset is then removed, the value rounded by
 * 2 * FILTER_BITS - round_0 - round_1, passed through clip_pixel_highbd()
 * and written to dst16. */
void av1_highbd_dist_wtd_convolve_2d_c(
    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
    int w, int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  CONV_BUF_TYPE *const dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int im_h = h + taps_y - 1;
  const int im_stride = w;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);

  // Pass 1: horizontal filter into im_block.
  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  // Address of the first tap for the top-left intermediate sample.
  const uint16_t *const first_tap = src - fo_vert * src_stride - fo_horiz;
  const int horiz_offset = 1 << (bd + FILTER_BITS - 1);
  for (int row = 0; row < im_h; ++row) {
    const uint16_t *const in_row = first_tap + row * src_stride;
    int16_t *const out_row = im_block + row * im_stride;
    for (int col = 0; col < w; ++col) {
      int32_t sum = horiz_offset;
      for (int tap = 0; tap < taps_x; ++tap) {
        sum += x_filter[tap] * in_row[col + tap];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      out_row[col] = (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
    }
  }

  // Pass 2: vertical filter over im_block, then store or average.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  const int vert_offset = 1 << offset_bits;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      // Output row `row` consumes intermediate rows row .. row + taps_y - 1.
      const int16_t *const column = im_block + row * im_stride + col;
      int32_t sum = vert_offset;
      for (int tap = 0; tap < taps_y; ++tap) {
        sum += y_filter[tap] * column[tap * im_stride];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, round_1);

      CONV_BUF_TYPE *const slot = &dst[row * dst_stride + col];
      if (!conv_params->do_average) {
        // First prediction: keep the offset intermediate for later averaging.
        *slot = res;
        continue;
      }

      int32_t tmp = *slot;
      if (conv_params->use_dist_wtd_comp_avg) {
        tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
        tmp = tmp >> DIST_PRECISION_BITS;
      } else {
        tmp += res;
        tmp = tmp >> 1;
      }
      tmp -= round_offset;
      dst16[row * dst16_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
    }
  }
}
816 
/* Horizontal-only compound convolution for 16-bit samples.
 *
 * The filtered sum is rounded by conv_params->round_0, scaled up by
 * 1 << (FILTER_BITS - round_1) and biased by round_offset. With
 * conv_params->do_average clear the value is stored to conv_params->dst;
 * otherwise it is averaged (plain or fwd_offset / bck_offset weighted) with
 * the value already stored there, the bias is removed, and the rounded,
 * clip_pixel_highbd()-limited result is written to dst16.
 *
 * filter_params_y and subpel_y_q4 are not used. */
void av1_highbd_dist_wtd_convolve_x_c(
    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
    int w, int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  (void)subpel_y_q4;
  (void)filter_params_y;

  CONV_BUF_TYPE *const dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  const int num_taps = filter_params_x->taps;
  const int fo_horiz = num_taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  assert(bits >= 0);

  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      // First sample covered by the filter window for this output column.
      const uint16_t *const window = src + row * src_stride + col - fo_horiz;
      int32_t res = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        res += x_filter[tap] * window[tap];
      }
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, round_0);
      res += round_offset;

      CONV_BUF_TYPE *const slot = &dst[row * dst_stride + col];
      if (!conv_params->do_average) {
        *slot = res;
        continue;
      }

      int32_t tmp = *slot;
      if (conv_params->use_dist_wtd_comp_avg) {
        tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
        tmp = tmp >> DIST_PRECISION_BITS;
      } else {
        tmp += res;
        tmp = tmp >> 1;
      }
      tmp -= round_offset;
      dst16[row * dst16_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
    }
  }
}
865 
/* Vertical-only compound convolution for 16-bit samples.
 *
 * The filtered sum is scaled up by 1 << (FILTER_BITS - round_0), rounded by
 * conv_params->round_1 and biased by round_offset. With
 * conv_params->do_average clear the value is stored to conv_params->dst;
 * otherwise it is averaged (plain or fwd_offset / bck_offset weighted) with
 * the value already stored there, the bias is removed, and the rounded,
 * clip_pixel_highbd()-limited result is written to dst16.
 *
 * filter_params_x and subpel_x_q4 are not used. */
void av1_highbd_dist_wtd_convolve_y_c(
    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
    int w, int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  (void)subpel_x_q4;
  (void)filter_params_x;

  CONV_BUF_TYPE *const dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  const int num_taps = filter_params_y->taps;
  const int fo_vert = num_taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  assert(bits >= 0);

  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      // Topmost sample covered by the filter window for this output sample.
      const uint16_t *const column = src + (row - fo_vert) * src_stride + col;
      int32_t res = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        res += y_filter[tap] * column[tap * src_stride];
      }
      res *= (1 << bits);
      res = ROUND_POWER_OF_TWO(res, round_1) + round_offset;

      CONV_BUF_TYPE *const slot = &dst[row * dst_stride + col];
      if (!conv_params->do_average) {
        *slot = res;
        continue;
      }

      int32_t tmp = *slot;
      if (conv_params->use_dist_wtd_comp_avg) {
        tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
        tmp = tmp >> DIST_PRECISION_BITS;
      } else {
        tmp += res;
        tmp = tmp >> 1;
      }
      tmp -= round_offset;
      dst16[row * dst16_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
    }
  }
}
914 
/* Compound "copy" kernel for 16-bit samples (no filtering).
 *
 * Each source sample is shifted left by
 * 2 * FILTER_BITS - round_1 - round_0 and biased by round_offset. With
 * conv_params->do_average clear the value is stored to conv_params->dst;
 * otherwise it is averaged (plain or fwd_offset / bck_offset weighted) with
 * the value already stored there, the bias is removed, and the rounded,
 * clip_pixel_highbd()-limited result is written to dst16.
 *
 * The filter and sub-pixel arguments are not used. */
void av1_highbd_dist_wtd_convolve_2d_copy_c(
    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
    int w, int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  (void)subpel_y_q4;
  (void)subpel_x_q4;
  (void)filter_params_y;
  (void)filter_params_x;

  CONV_BUF_TYPE *const dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  assert(bits >= 0);

  for (int row = 0; row < h; ++row) {
    const uint16_t *const in_row = src + row * src_stride;
    for (int col = 0; col < w; ++col) {
      // Kept as two statements so each step is narrowed to CONV_BUF_TYPE.
      CONV_BUF_TYPE res = in_row[col] << bits;
      res += round_offset;

      CONV_BUF_TYPE *const slot = &dst[row * dst_stride + col];
      if (!conv_params->do_average) {
        *slot = res;
        continue;
      }

      int32_t tmp = *slot;
      if (conv_params->use_dist_wtd_comp_avg) {
        tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
        tmp = tmp >> DIST_PRECISION_BITS;
      } else {
        tmp += res;
        tmp = tmp >> 1;
      }
      tmp -= round_offset;
      dst16[row * dst16_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
    }
  }
}
955 
/* Separable 2-D convolution with scaling for 16-bit samples.
 *
 * The source position advances by x_step_qn / y_step_qn per output sample
 * (starting at subpel_x_qn / subpel_y_qn); the integer part
 * (>> SCALE_SUBPEL_BITS) selects the source sample and the fractional part
 * selects the kernel.
 *
 * Pass 1 filters horizontally into the on-stack im_block buffer (stride w).
 * Pass 2 filters vertically, column by column, and then:
 *   - non-compound: removes the offset, rounds, clips and writes dst;
 *   - compound without do_average: stores the offset intermediate to
 *     conv_params->dst;
 *   - compound with do_average: averages (plain or weighted) with the value
 *     in conv_params->dst, removes the offset, rounds, clips and writes dst.
 */
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int im_h =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int im_stride = w;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // Pass 1: horizontal filter into im_block.
  const int horiz_offset = 1 << (bd + FILTER_BITS - 1);
  for (int row = 0; row < im_h; ++row) {
    const uint16_t *const in_row = src + (row - fo_vert) * src_stride;
    int16_t *const out_row = im_block + row * im_stride;
    int x_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col, x_qn += x_step_qn) {
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter =
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
      // First tap sits fo_horiz samples left of the integer source position.
      const uint16_t *const window =
          in_row + (x_qn >> SCALE_SUBPEL_BITS) - fo_horiz;
      int32_t sum = horiz_offset;
      for (int tap = 0; tap < taps_x; ++tap) {
        sum += x_filter[tap] * window[tap];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      out_row[col] = (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
    }
  }

  // Pass 2: vertical filter over im_block, one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int vert_offset = 1 << offset_bits;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int col = 0; col < w; ++col) {
    const int16_t *const im_col = im_block + col;
    int y_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row, y_qn += y_step_qn) {
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(y_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const y_filter =
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
      // First tap is the intermediate row at the integer source position.
      const int16_t *const column =
          im_col + (y_qn >> SCALE_SUBPEL_BITS) * im_stride;
      int32_t sum = vert_offset;
      for (int tap = 0; tap < taps_y; ++tap) {
        sum += y_filter[tap] * column[tap * im_stride];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, round_1);

      if (conv_params->is_compound && !conv_params->do_average) {
        // First compound prediction: keep the offset intermediate value.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t tmp;
      if (conv_params->is_compound) {
        tmp = dst16[row * dst16_stride + col];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
          tmp = tmp >> DIST_PRECISION_BITS;
        } else {
          tmp += res;
          tmp = tmp >> 1;
        }
      } else {
        tmp = res;
      }
      /* Subtract round offset and convolve round */
      tmp -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
    }
  }
}
1041 
/* Dispatches an intra block copy prediction to the C single-reference
 * kernels using av1_intrabc_filter_params.
 *
 * A direction gets the intrabc filter only when its sub-pixel argument is
 * nonzero; otherwise its filter pointer is NULL. The kernels themselves are
 * invoked with sub-pixel arguments of 0. When both sub-pixel arguments are
 * zero the vertical kernel would be handed a NULL filter, so callers must
 * pass at least one nonzero offset (the facade below checks this). */
static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride,
                                           uint16_t *dst, int dst_stride, int w,
                                           int h, int subpel_x_q4,
                                           int subpel_y_q4,
                                           ConvolveParams *conv_params,
                                           int bd) {
  const int filter_horiz = subpel_x_q4 != 0;
  const int filter_vert = subpel_y_q4 != 0;
  const InterpFilterParams *const params_x =
      filter_horiz ? &av1_intrabc_filter_params : NULL;
  const InterpFilterParams *const params_y =
      filter_vert ? &av1_intrabc_filter_params : NULL;

  if (filter_horiz && filter_vert) {
    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                                params_x, params_y, 0, 0, conv_params, bd);
    return;
  }
  if (filter_horiz) {
    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                               params_x, params_y, 0, 0, conv_params, bd);
    return;
  }
  av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, params_x,
                             params_y, 0, 0, conv_params, bd);
}
1066 
/* Entry point that selects a high bit-depth convolve implementation.
 *
 * src8 / dst8 are converted with CONVERT_TO_SHORTPTR() before use.
 *   - intrabc with a nonzero sub-pixel offset: highbd_convolve_2d_for_intrabc.
 *   - scaled: av1_highbd_convolve_2d_scale with the step sizes.
 *   - otherwise: the sf->highbd_convolve table entry indexed by
 *     [subpel_x != 0][subpel_y != 0][conv_params->is_compound].
 * Filter parameters are looked up for a direction only when that direction
 * has a nonzero sub-pixel offset or the prediction is scaled; otherwise NULL
 * is passed. */
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                   uint8_t *dst8, int dst_stride, int w, int h,
                                   InterpFilters interp_filters,
                                   const int subpel_x_q4, int x_step_q4,
                                   const int subpel_y_q4, int y_step_q4,
                                   int scaled, ConvolveParams *conv_params,
                                   const struct scale_factors *sf,
                                   int is_intrabc, int bd) {
  assert(IMPLIES(is_intrabc, !scaled));

  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
  const int has_subpel_x = subpel_x_q4 != 0;
  const int has_subpel_y = subpel_y_q4 != 0;

  if (is_intrabc && (has_subpel_x || has_subpel_y)) {
    highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h,
                                   subpel_x_q4, subpel_y_q4, conv_params, bd);
    return;
  }

  // Second argument of av1_extract_interp_filter: 1 for x, 0 for y.
  const InterpFilterParams *filter_params_x = NULL;
  if (has_subpel_x || scaled) {
    const InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
    filter_params_x = av1_get_interp_filter_params_with_block_size(filter_x, w);
  }
  const InterpFilterParams *filter_params_y = NULL;
  if (has_subpel_y || scaled) {
    const InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
    filter_params_y = av1_get_interp_filter_params_with_block_size(filter_y, h);
  }

  if (!scaled) {
    sf->highbd_convolve[has_subpel_x][has_subpel_y][conv_params->is_compound](
        src, src_stride, dst, dst_stride, w, h, filter_params_x,
        filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
    return;
  }

  // The scaled compound path writes through conv_params->dst.
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, filter_params_y, subpel_x_q4,
                               x_step_q4, subpel_y_q4, y_step_q4, conv_params,
                               bd);
}
1123 
// Note: Fixed-size intermediate buffers place limits on the parameters
// of some functions. 2d filtering proceeds in 2 steps:
//   (1) Interpolate horizontally into an intermediate buffer, temp.
//   (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (263):
1129 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1130 // --Largest block size is 128x128 pixels.
1131 // --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
1132 //   original frame (in 1/16th pixel units).
1133 // --Must round-up because block may be located at sub-pixel position.
1134 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1135 // --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
1136 #define WIENER_MAX_EXT_SIZE 263
1137 
horz_scalar_product(const uint8_t * a,const int16_t * b)1138 static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
1139   int sum = 0;
1140   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1141   return sum;
1142 }
1143 
highbd_horz_scalar_product(const uint16_t * a,const int16_t * b)1144 static INLINE int highbd_horz_scalar_product(const uint16_t *a,
1145                                              const int16_t *b) {
1146   int sum = 0;
1147   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1148   return sum;
1149 }
1150 
highbd_vert_scalar_product(const uint16_t * a,ptrdiff_t a_stride,const int16_t * b)1151 static INLINE int highbd_vert_scalar_product(const uint16_t *a,
1152                                              ptrdiff_t a_stride,
1153                                              const int16_t *b) {
1154   int sum = 0;
1155   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
1156   return sum;
1157 }
1158 
get_filter_base(const int16_t * filter)1159 static const InterpKernel *get_filter_base(const int16_t *filter) {
1160   // NOTE: This assumes that the filter table is 256-byte aligned.
1161   // TODO(agrange) Modify to make independent of table alignment.
1162   return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
1163 }
1164 
/* Returns the index of the kernel f within the InterpKernel table that
 * starts at base (pointer difference in units of InterpKernel). */
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  const ptrdiff_t index = kernel - base;
  return (int)index;
}
1168 
convolve_add_src_horiz_hip(const uint8_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits)1169 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
1170                                        uint16_t *dst, ptrdiff_t dst_stride,
1171                                        const InterpKernel *x_filters, int x0_q4,
1172                                        int x_step_q4, int w, int h,
1173                                        int round0_bits) {
1174   const int bd = 8;
1175   src -= SUBPEL_TAPS / 2 - 1;
1176   for (int y = 0; y < h; ++y) {
1177     int x_q4 = x0_q4;
1178     for (int x = 0; x < w; ++x) {
1179       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1180       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1181       const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1182                            (1 << (bd + FILTER_BITS - 1));
1183       const int sum = horz_scalar_product(src_x, x_filter) + rounding;
1184       dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1185                                WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
1186       x_q4 += x_step_q4;
1187     }
1188     src += src_stride;
1189     dst += dst_stride;
1190   }
1191 }
1192 
convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits)1193 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
1194                                       uint8_t *dst, ptrdiff_t dst_stride,
1195                                       const InterpKernel *y_filters, int y0_q4,
1196                                       int y_step_q4, int w, int h,
1197                                       int round1_bits) {
1198   const int bd = 8;
1199   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1200 
1201   for (int x = 0; x < w; ++x) {
1202     int y_q4 = y0_q4;
1203     for (int y = 0; y < h; ++y) {
1204       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1205       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1206       const int rounding =
1207           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1208           (1 << (bd + round1_bits - 1));
1209       const int sum =
1210           highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1211       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
1212       y_q4 += y_step_q4;
1213     }
1214     ++src;
1215     ++dst;
1216   }
1217 }
1218 
av1_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const ConvolveParams * conv_params)1219 void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1220                                    uint8_t *dst, ptrdiff_t dst_stride,
1221                                    const int16_t *filter_x, int x_step_q4,
1222                                    const int16_t *filter_y, int y_step_q4,
1223                                    int w, int h,
1224                                    const ConvolveParams *conv_params) {
1225   const InterpKernel *const filters_x = get_filter_base(filter_x);
1226   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1227 
1228   const InterpKernel *const filters_y = get_filter_base(filter_y);
1229   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1230 
1231   uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1232   const int intermediate_height =
1233       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
1234   memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
1235 
1236   assert(w <= MAX_SB_SIZE);
1237   assert(h <= MAX_SB_SIZE);
1238   assert(y_step_q4 <= 32);
1239   assert(x_step_q4 <= 32);
1240 
1241   convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1242                              src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
1243                              x_step_q4, w, intermediate_height,
1244                              conv_params->round_0);
1245   convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1246                             MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
1247                             y_step_q4, w, h, conv_params->round_1);
1248 }
1249 
highbd_convolve_add_src_horiz_hip(const uint8_t * src8,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits,int bd)1250 static void highbd_convolve_add_src_horiz_hip(
1251     const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1252     ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1253     int x_step_q4, int w, int h, int round0_bits, int bd) {
1254   const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
1255   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1256   src -= SUBPEL_TAPS / 2 - 1;
1257   for (int y = 0; y < h; ++y) {
1258     int x_q4 = x0_q4;
1259     for (int x = 0; x < w; ++x) {
1260       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1261       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1262       const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1263                            (1 << (bd + FILTER_BITS - 1));
1264       const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1265       dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1266                                extraprec_clamp_limit - 1);
1267       x_q4 += x_step_q4;
1268     }
1269     src += src_stride;
1270     dst += dst_stride;
1271   }
1272 }
1273 
highbd_convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits,int bd)1274 static void highbd_convolve_add_src_vert_hip(
1275     const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1276     ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1277     int y_step_q4, int w, int h, int round1_bits, int bd) {
1278   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1279   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1280   for (int x = 0; x < w; ++x) {
1281     int y_q4 = y0_q4;
1282     for (int y = 0; y < h; ++y) {
1283       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1284       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1285       const int rounding =
1286           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1287           (1 << (bd + round1_bits - 1));
1288       const int sum =
1289           highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1290       dst[y * dst_stride] =
1291           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
1292       y_q4 += y_step_q4;
1293     }
1294     ++src;
1295     ++dst;
1296   }
1297 }
1298 
av1_highbd_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const ConvolveParams * conv_params,int bd)1299 void av1_highbd_wiener_convolve_add_src_c(
1300     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1301     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1302     const int16_t *filter_y, int y_step_q4, int w, int h,
1303     const ConvolveParams *conv_params, int bd) {
1304   const InterpKernel *const filters_x = get_filter_base(filter_x);
1305   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1306 
1307   const InterpKernel *const filters_y = get_filter_base(filter_y);
1308   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1309 
1310   uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1311   const int intermediate_height =
1312       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1313 
1314   assert(w <= MAX_SB_SIZE);
1315   assert(h <= MAX_SB_SIZE);
1316   assert(y_step_q4 <= 32);
1317   assert(x_step_q4 <= 32);
1318   assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
1319 
1320   highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1321                                     src_stride, temp, MAX_SB_SIZE, filters_x,
1322                                     x0_q4, x_step_q4, w, intermediate_height,
1323                                     conv_params->round_0, bd);
1324   highbd_convolve_add_src_vert_hip(
1325       temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
1326       filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
1327 }
1328