1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <string.h>
13 
14 #include "./vpx_config.h"
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx/vpx_integer.h"
17 #include "vpx_dsp/vpx_convolve.h"
18 #include "vpx_dsp/vpx_dsp_common.h"
19 #include "vpx_dsp/vpx_filter.h"
20 #include "vpx_ports/mem.h"
21 
convolve_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)22 static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
23                            uint8_t *dst, ptrdiff_t dst_stride,
24                            const InterpKernel *x_filters, int x0_q4,
25                            int x_step_q4, int w, int h) {
26   int x, y;
27   src -= SUBPEL_TAPS / 2 - 1;
28 
29   for (y = 0; y < h; ++y) {
30     int x_q4 = x0_q4;
31     for (x = 0; x < w; ++x) {
32       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
33       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
34       int k, sum = 0;
35       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
36       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
37       x_q4 += x_step_q4;
38     }
39     src += src_stride;
40     dst += dst_stride;
41   }
42 }
43 
convolve_avg_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)44 static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
45                                uint8_t *dst, ptrdiff_t dst_stride,
46                                const InterpKernel *x_filters, int x0_q4,
47                                int x_step_q4, int w, int h) {
48   int x, y;
49   src -= SUBPEL_TAPS / 2 - 1;
50 
51   for (y = 0; y < h; ++y) {
52     int x_q4 = x0_q4;
53     for (x = 0; x < w; ++x) {
54       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
55       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
56       int k, sum = 0;
57       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
58       dst[x] = ROUND_POWER_OF_TWO(
59           dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
60       x_q4 += x_step_q4;
61     }
62     src += src_stride;
63     dst += dst_stride;
64   }
65 }
66 
convolve_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)67 static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
68                           uint8_t *dst, ptrdiff_t dst_stride,
69                           const InterpKernel *y_filters, int y0_q4,
70                           int y_step_q4, int w, int h) {
71   int x, y;
72   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
73 
74   for (x = 0; x < w; ++x) {
75     int y_q4 = y0_q4;
76     for (y = 0; y < h; ++y) {
77       const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
78       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
79       int k, sum = 0;
80       for (k = 0; k < SUBPEL_TAPS; ++k)
81         sum += src_y[k * src_stride] * y_filter[k];
82       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
83       y_q4 += y_step_q4;
84     }
85     ++src;
86     ++dst;
87   }
88 }
89 
convolve_avg_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)90 static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
91                               uint8_t *dst, ptrdiff_t dst_stride,
92                               const InterpKernel *y_filters, int y0_q4,
93                               int y_step_q4, int w, int h) {
94   int x, y;
95   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
96 
97   for (x = 0; x < w; ++x) {
98     int y_q4 = y0_q4;
99     for (y = 0; y < h; ++y) {
100       const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
101       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
102       int k, sum = 0;
103       for (k = 0; k < SUBPEL_TAPS; ++k)
104         sum += src_y[k * src_stride] * y_filter[k];
105       dst[y * dst_stride] = ROUND_POWER_OF_TWO(
106           dst[y * dst_stride] +
107               clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
108           1);
109       y_q4 += y_step_q4;
110     }
111     ++src;
112     ++dst;
113   }
114 }
115 
convolve(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_q4,int x_step_q4,const InterpKernel * const y_filters,int y0_q4,int y_step_q4,int w,int h)116 static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
117                      ptrdiff_t dst_stride, const InterpKernel *const x_filters,
118                      int x0_q4, int x_step_q4,
119                      const InterpKernel *const y_filters, int y0_q4,
120                      int y_step_q4, int w, int h) {
121   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
122   // 2d filtering proceeds in 2 steps:
123   //   (1) Interpolate horizontally into an intermediate buffer, temp.
124   //   (2) Interpolate temp vertically to derive the sub-pixel result.
125   // Deriving the maximum number of rows in the temp buffer (135):
126   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
127   // --Largest block size is 64x64 pixels.
128   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
129   //   original frame (in 1/16th pixel units).
130   // --Must round-up because block may be located at sub-pixel position.
131   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
132   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
133   uint8_t temp[64 * 135];
134   const int intermediate_height =
135       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
136 
137   assert(w <= 64);
138   assert(h <= 64);
139   assert(y_step_q4 <= 32);
140   assert(x_step_q4 <= 32);
141 
142   convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
143                  x_filters, x0_q4, x_step_q4, w, intermediate_height);
144   convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
145                 y_filters, y0_q4, y_step_q4, w, h);
146 }
147 
vpx_convolve8_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)148 void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
149                            uint8_t *dst, ptrdiff_t dst_stride,
150                            const int16_t *filter_x, int x_step_q4,
151                            const int16_t *filter_y, int y_step_q4, int w,
152                            int h) {
153   const InterpKernel *const filters_x = get_filter_base(filter_x);
154   const int x0_q4 = get_filter_offset(filter_x, filters_x);
155 
156   (void)filter_y;
157   (void)y_step_q4;
158 
159   convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
160                  w, h);
161 }
162 
vpx_convolve8_avg_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)163 void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
164                                uint8_t *dst, ptrdiff_t dst_stride,
165                                const int16_t *filter_x, int x_step_q4,
166                                const int16_t *filter_y, int y_step_q4, int w,
167                                int h) {
168   const InterpKernel *const filters_x = get_filter_base(filter_x);
169   const int x0_q4 = get_filter_offset(filter_x, filters_x);
170 
171   (void)filter_y;
172   (void)y_step_q4;
173 
174   convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
175                      x_step_q4, w, h);
176 }
177 
vpx_convolve8_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)178 void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
179                           uint8_t *dst, ptrdiff_t dst_stride,
180                           const int16_t *filter_x, int x_step_q4,
181                           const int16_t *filter_y, int y_step_q4, int w,
182                           int h) {
183   const InterpKernel *const filters_y = get_filter_base(filter_y);
184   const int y0_q4 = get_filter_offset(filter_y, filters_y);
185 
186   (void)filter_x;
187   (void)x_step_q4;
188 
189   convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
190                 w, h);
191 }
192 
vpx_convolve8_avg_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)193 void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
194                               uint8_t *dst, ptrdiff_t dst_stride,
195                               const int16_t *filter_x, int x_step_q4,
196                               const int16_t *filter_y, int y_step_q4, int w,
197                               int h) {
198   const InterpKernel *const filters_y = get_filter_base(filter_y);
199   const int y0_q4 = get_filter_offset(filter_y, filters_y);
200 
201   (void)filter_x;
202   (void)x_step_q4;
203 
204   convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
205                     y_step_q4, w, h);
206 }
207 
vpx_convolve8_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)208 void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
209                      ptrdiff_t dst_stride, const int16_t *filter_x,
210                      int x_step_q4, const int16_t *filter_y, int y_step_q4,
211                      int w, int h) {
212   const InterpKernel *const filters_x = get_filter_base(filter_x);
213   const int x0_q4 = get_filter_offset(filter_x, filters_x);
214   const InterpKernel *const filters_y = get_filter_base(filter_y);
215   const int y0_q4 = get_filter_offset(filter_y, filters_y);
216 
217   convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
218            filters_y, y0_q4, y_step_q4, w, h);
219 }
220 
vpx_convolve8_avg_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)221 void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
222                          ptrdiff_t dst_stride, const int16_t *filter_x,
223                          int x_step_q4, const int16_t *filter_y, int y_step_q4,
224                          int w, int h) {
225   // Fixed size intermediate buffer places limits on parameters.
226   DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
227   assert(w <= 64);
228   assert(h <= 64);
229 
230   vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,
231                   y_step_q4, w, h);
232   vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
233 }
234 
// Copies an h-row by w-byte block from src to dst, one memcpy() per row.
// src and dst advance by their own strides after each row.  The four filter
// arguments are not used.  Nothing is copied when h <= 0.
void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int filter_x_stride, const int16_t *filter_y,
                         int filter_y_stride, int w, int h) {
  const uint8_t *in_row = src;
  uint8_t *out_row = dst;
  int row;

  (void)filter_y_stride;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_x;

  for (row = 0; row < h; ++row) {
    memcpy(out_row, in_row, (size_t)w);
    in_row += src_stride;
    out_row += dst_stride;
  }
}
252 
// Merges an h-row by w-pixel block of src into dst in place: every dst pixel
// becomes ROUND_POWER_OF_TWO(dst + src, 1) (presumably the rounded average of
// the two -- confirm against the macro).  The four filter arguments are not
// used.
void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int filter_x_stride, const int16_t *filter_y,
                        int filter_y_stride, int w, int h) {
  const uint8_t *in_row = src;
  uint8_t *out_row = dst;
  int row;

  (void)filter_y_stride;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_x;

  for (row = 0; row < h; ++row) {
    int col;
    for (col = 0; col < w; ++col) {
      const int pair_sum = out_row[col] + in_row[col];
      out_row[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
    }
    in_row += src_stride;
    out_row += dst_stride;
  }
}
270 
// Scaled horizontal prediction: passes every argument through unchanged to
// vpx_convolve8_horiz_c().
void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4, filter_y, y_step_q4,
                        w, h);
}
278 
// Scaled vertical prediction: passes every argument through unchanged to
// vpx_convolve8_vert_c().
void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
  vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
                       filter_x, x_step_q4, filter_y, y_step_q4,
                       w, h);
}
286 
// Scaled 2-D prediction: passes every argument through unchanged to
// vpx_convolve8_c().
void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  vpx_convolve8_c(src, src_stride, dst, dst_stride,
                  filter_x, x_step_q4, filter_y, y_step_q4,
                  w, h);
}
294 
// Scaled horizontal prediction blended into dst: passes every argument
// through unchanged to vpx_convolve8_avg_horiz_c().
void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4, filter_y, y_step_q4,
                            w, h);
}
303 
// Scaled vertical prediction blended into dst: passes every argument through
// unchanged to vpx_convolve8_avg_vert_c().
void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                           filter_x, x_step_q4, filter_y, y_step_q4,
                           w, h);
}
312 
// Scaled 2-D prediction blended into dst: passes every argument through
// unchanged to vpx_convolve8_avg_c().
void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
                      filter_x, x_step_q4, filter_y, y_step_q4,
                      w, h);
}
320 
321 #if CONFIG_VP9_HIGHBITDEPTH
highbd_convolve_horiz(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int bd)322 static void highbd_convolve_horiz(const uint16_t *src, ptrdiff_t src_stride,
323                                   uint16_t *dst, ptrdiff_t dst_stride,
324                                   const InterpKernel *x_filters, int x0_q4,
325                                   int x_step_q4, int w, int h, int bd) {
326   int x, y;
327   src -= SUBPEL_TAPS / 2 - 1;
328 
329   for (y = 0; y < h; ++y) {
330     int x_q4 = x0_q4;
331     for (x = 0; x < w; ++x) {
332       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
333       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
334       int k, sum = 0;
335       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
336       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
337       x_q4 += x_step_q4;
338     }
339     src += src_stride;
340     dst += dst_stride;
341   }
342 }
343 
highbd_convolve_avg_horiz(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int bd)344 static void highbd_convolve_avg_horiz(const uint16_t *src, ptrdiff_t src_stride,
345                                       uint16_t *dst, ptrdiff_t dst_stride,
346                                       const InterpKernel *x_filters, int x0_q4,
347                                       int x_step_q4, int w, int h, int bd) {
348   int x, y;
349   src -= SUBPEL_TAPS / 2 - 1;
350 
351   for (y = 0; y < h; ++y) {
352     int x_q4 = x0_q4;
353     for (x = 0; x < w; ++x) {
354       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
355       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
356       int k, sum = 0;
357       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
358       dst[x] = ROUND_POWER_OF_TWO(
359           dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
360           1);
361       x_q4 += x_step_q4;
362     }
363     src += src_stride;
364     dst += dst_stride;
365   }
366 }
367 
highbd_convolve_vert(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)368 static void highbd_convolve_vert(const uint16_t *src, ptrdiff_t src_stride,
369                                  uint16_t *dst, ptrdiff_t dst_stride,
370                                  const InterpKernel *y_filters, int y0_q4,
371                                  int y_step_q4, int w, int h, int bd) {
372   int x, y;
373   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
374 
375   for (x = 0; x < w; ++x) {
376     int y_q4 = y0_q4;
377     for (y = 0; y < h; ++y) {
378       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
379       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
380       int k, sum = 0;
381       for (k = 0; k < SUBPEL_TAPS; ++k)
382         sum += src_y[k * src_stride] * y_filter[k];
383       dst[y * dst_stride] =
384           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
385       y_q4 += y_step_q4;
386     }
387     ++src;
388     ++dst;
389   }
390 }
391 
highbd_convolve_avg_vert(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)392 static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride,
393                                      uint16_t *dst, ptrdiff_t dst_stride,
394                                      const InterpKernel *y_filters, int y0_q4,
395                                      int y_step_q4, int w, int h, int bd) {
396   int x, y;
397   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
398 
399   for (x = 0; x < w; ++x) {
400     int y_q4 = y0_q4;
401     for (y = 0; y < h; ++y) {
402       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
403       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
404       int k, sum = 0;
405       for (k = 0; k < SUBPEL_TAPS; ++k)
406         sum += src_y[k * src_stride] * y_filter[k];
407       dst[y * dst_stride] = ROUND_POWER_OF_TWO(
408           dst[y * dst_stride] +
409               clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
410           1);
411       y_q4 += y_step_q4;
412     }
413     ++src;
414     ++dst;
415   }
416 }
417 
highbd_convolve(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_q4,int x_step_q4,const InterpKernel * const y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)418 static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride,
419                             uint16_t *dst, ptrdiff_t dst_stride,
420                             const InterpKernel *const x_filters, int x0_q4,
421                             int x_step_q4, const InterpKernel *const y_filters,
422                             int y0_q4, int y_step_q4, int w, int h, int bd) {
423   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
424   // 2d filtering proceeds in 2 steps:
425   //   (1) Interpolate horizontally into an intermediate buffer, temp.
426   //   (2) Interpolate temp vertically to derive the sub-pixel result.
427   // Deriving the maximum number of rows in the temp buffer (135):
428   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
429   // --Largest block size is 64x64 pixels.
430   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
431   //   original frame (in 1/16th pixel units).
432   // --Must round-up because block may be located at sub-pixel position.
433   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
434   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
435   uint16_t temp[64 * 135];
436   const int intermediate_height =
437       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
438 
439   assert(w <= 64);
440   assert(h <= 64);
441   assert(y_step_q4 <= 32);
442   assert(x_step_q4 <= 32);
443 
444   highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
445                         temp, 64, x_filters, x0_q4, x_step_q4, w,
446                         intermediate_height, bd);
447   highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
448                        y_filters, y0_q4, y_step_q4, w, h, bd);
449 }
450 
vpx_highbd_convolve8_horiz_c(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)451 void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
452                                   uint16_t *dst, ptrdiff_t dst_stride,
453                                   const int16_t *filter_x, int x_step_q4,
454                                   const int16_t *filter_y, int y_step_q4, int w,
455                                   int h, int bd) {
456   const InterpKernel *const filters_x = get_filter_base(filter_x);
457   const int x0_q4 = get_filter_offset(filter_x, filters_x);
458 
459   (void)filter_y;
460   (void)y_step_q4;
461 
462   highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
463                         x_step_q4, w, h, bd);
464 }
465 
vpx_highbd_convolve8_avg_horiz_c(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)466 void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
467                                       uint16_t *dst, ptrdiff_t dst_stride,
468                                       const int16_t *filter_x, int x_step_q4,
469                                       const int16_t *filter_y, int y_step_q4,
470                                       int w, int h, int bd) {
471   const InterpKernel *const filters_x = get_filter_base(filter_x);
472   const int x0_q4 = get_filter_offset(filter_x, filters_x);
473 
474   (void)filter_y;
475   (void)y_step_q4;
476 
477   highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
478                             x_step_q4, w, h, bd);
479 }
480 
vpx_highbd_convolve8_vert_c(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)481 void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride,
482                                  uint16_t *dst, ptrdiff_t dst_stride,
483                                  const int16_t *filter_x, int x_step_q4,
484                                  const int16_t *filter_y, int y_step_q4, int w,
485                                  int h, int bd) {
486   const InterpKernel *const filters_y = get_filter_base(filter_y);
487   const int y0_q4 = get_filter_offset(filter_y, filters_y);
488 
489   (void)filter_x;
490   (void)x_step_q4;
491 
492   highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
493                        y_step_q4, w, h, bd);
494 }
495 
vpx_highbd_convolve8_avg_vert_c(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)496 void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride,
497                                      uint16_t *dst, ptrdiff_t dst_stride,
498                                      const int16_t *filter_x, int x_step_q4,
499                                      const int16_t *filter_y, int y_step_q4,
500                                      int w, int h, int bd) {
501   const InterpKernel *const filters_y = get_filter_base(filter_y);
502   const int y0_q4 = get_filter_offset(filter_y, filters_y);
503 
504   (void)filter_x;
505   (void)x_step_q4;
506 
507   highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
508                            y_step_q4, w, h, bd);
509 }
510 
vpx_highbd_convolve8_c(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)511 void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride,
512                             uint16_t *dst, ptrdiff_t dst_stride,
513                             const int16_t *filter_x, int x_step_q4,
514                             const int16_t *filter_y, int y_step_q4, int w,
515                             int h, int bd) {
516   const InterpKernel *const filters_x = get_filter_base(filter_x);
517   const int x0_q4 = get_filter_offset(filter_x, filters_x);
518   const InterpKernel *const filters_y = get_filter_base(filter_y);
519   const int y0_q4 = get_filter_offset(filter_y, filters_y);
520 
521   highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
522                   filters_y, y0_q4, y_step_q4, w, h, bd);
523 }
524 
vpx_highbd_convolve8_avg_c(const uint16_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)525 void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride,
526                                 uint16_t *dst, ptrdiff_t dst_stride,
527                                 const int16_t *filter_x, int x_step_q4,
528                                 const int16_t *filter_y, int y_step_q4, int w,
529                                 int h, int bd) {
530   // Fixed size intermediate buffer places limits on parameters.
531   DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
532   assert(w <= 64);
533   assert(h <= 64);
534 
535   vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4,
536                          filter_y, y_step_q4, w, h, bd);
537   vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h,
538                             bd);
539 }
540 
// Copies an h-row by w-sample block of 16-bit samples from src to dst, one
// memcpy() per row.  Strides are in samples: src and dst advance by their own
// strides after each row.  The filter arguments and bd are not used.  Nothing
// is copied when h <= 0.
void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,
                                uint16_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  const uint16_t *in_row = src;
  uint16_t *out_row = dst;
  int row;

  (void)bd;
  (void)filter_y_stride;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_x;

  for (row = 0; row < h; ++row) {
    memcpy(out_row, in_row, w * sizeof(*out_row));
    in_row += src_stride;
    out_row += dst_stride;
  }
}
560 
// Merges an h-row by w-sample block of 16-bit src samples into dst in place:
// every dst sample becomes ROUND_POWER_OF_TWO(dst + src, 1) (presumably the
// rounded average of the two -- confirm against the macro).  The filter
// arguments and bd are not used.
void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride,
                               uint16_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  const uint16_t *in_row = src;
  uint16_t *out_row = dst;
  int row;

  (void)bd;
  (void)filter_y_stride;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_x;

  for (row = 0; row < h; ++row) {
    int col;
    for (col = 0; col < w; ++col) {
      const int pair_sum = out_row[col] + in_row[col];
      out_row[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
    }
    in_row += src_stride;
    out_row += dst_stride;
  }
}
580 #endif
581