1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <string.h>
13 
14 #include "./vpx_config.h"
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx/vpx_integer.h"
17 #include "vpx_dsp/vpx_convolve.h"
18 #include "vpx_dsp/vpx_dsp_common.h"
19 #include "vpx_dsp/vpx_filter.h"
20 #include "vpx_ports/mem.h"
21 
convolve_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)22 static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
23                            uint8_t *dst, ptrdiff_t dst_stride,
24                            const InterpKernel *x_filters,
25                            int x0_q4, int x_step_q4, int w, int h) {
26   int x, y;
27   src -= SUBPEL_TAPS / 2 - 1;
28   for (y = 0; y < h; ++y) {
29     int x_q4 = x0_q4;
30     for (x = 0; x < w; ++x) {
31       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
32       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
33       int k, sum = 0;
34       for (k = 0; k < SUBPEL_TAPS; ++k)
35         sum += src_x[k] * x_filter[k];
36       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
37       x_q4 += x_step_q4;
38     }
39     src += src_stride;
40     dst += dst_stride;
41   }
42 }
43 
convolve_avg_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)44 static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
45                                uint8_t *dst, ptrdiff_t dst_stride,
46                                const InterpKernel *x_filters,
47                                int x0_q4, int x_step_q4, int w, int h) {
48   int x, y;
49   src -= SUBPEL_TAPS / 2 - 1;
50   for (y = 0; y < h; ++y) {
51     int x_q4 = x0_q4;
52     for (x = 0; x < w; ++x) {
53       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
54       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
55       int k, sum = 0;
56       for (k = 0; k < SUBPEL_TAPS; ++k)
57         sum += src_x[k] * x_filter[k];
58       dst[x] = ROUND_POWER_OF_TWO(dst[x] +
59           clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
60       x_q4 += x_step_q4;
61     }
62     src += src_stride;
63     dst += dst_stride;
64   }
65 }
66 
convolve_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)67 static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
68                           uint8_t *dst, ptrdiff_t dst_stride,
69                           const InterpKernel *y_filters,
70                           int y0_q4, int y_step_q4, int w, int h) {
71   int x, y;
72   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
73 
74   for (x = 0; x < w; ++x) {
75     int y_q4 = y0_q4;
76     for (y = 0; y < h; ++y) {
77       const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
78       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
79       int k, sum = 0;
80       for (k = 0; k < SUBPEL_TAPS; ++k)
81         sum += src_y[k * src_stride] * y_filter[k];
82       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
83       y_q4 += y_step_q4;
84     }
85     ++src;
86     ++dst;
87   }
88 }
89 
convolve_avg_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)90 static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
91                               uint8_t *dst, ptrdiff_t dst_stride,
92                               const InterpKernel *y_filters,
93                               int y0_q4, int y_step_q4, int w, int h) {
94   int x, y;
95   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
96 
97   for (x = 0; x < w; ++x) {
98     int y_q4 = y0_q4;
99     for (y = 0; y < h; ++y) {
100       const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
101       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
102       int k, sum = 0;
103       for (k = 0; k < SUBPEL_TAPS; ++k)
104         sum += src_y[k * src_stride] * y_filter[k];
105       dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
106           clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
107       y_q4 += y_step_q4;
108     }
109     ++src;
110     ++dst;
111   }
112 }
113 
convolve(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_q4,int x_step_q4,const InterpKernel * const y_filters,int y0_q4,int y_step_q4,int w,int h)114 static void convolve(const uint8_t *src, ptrdiff_t src_stride,
115                      uint8_t *dst, ptrdiff_t dst_stride,
116                      const InterpKernel *const x_filters,
117                      int x0_q4, int x_step_q4,
118                      const InterpKernel *const y_filters,
119                      int y0_q4, int y_step_q4,
120                      int w, int h) {
121   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
122   // 2d filtering proceeds in 2 steps:
123   //   (1) Interpolate horizontally into an intermediate buffer, temp.
124   //   (2) Interpolate temp vertically to derive the sub-pixel result.
125   // Deriving the maximum number of rows in the temp buffer (135):
126   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
127   // --Largest block size is 64x64 pixels.
128   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
129   //   original frame (in 1/16th pixel units).
130   // --Must round-up because block may be located at sub-pixel position.
131   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
132   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
133   uint8_t temp[135 * 64];
134   int intermediate_height =
135           (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
136 
137   assert(w <= 64);
138   assert(h <= 64);
139   assert(y_step_q4 <= 32);
140   assert(x_step_q4 <= 32);
141 
142   convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
143                  x_filters, x0_q4, x_step_q4, w, intermediate_height);
144   convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
145                 y_filters, y0_q4, y_step_q4, w, h);
146 }
147 
get_filter_base(const int16_t * filter)148 static const InterpKernel *get_filter_base(const int16_t *filter) {
149   // NOTE: This assumes that the filter table is 256-byte aligned.
150   // TODO(agrange) Modify to make independent of table alignment.
151   return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
152 }
153 
// Returns the index of kernel `f` within the table starting at `base`,
// i.e. the pointer difference measured in whole InterpKernel elements.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  const ptrdiff_t index = kernel - base;
  return (int)index;
}
157 
vpx_convolve8_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)158 void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
159                            uint8_t *dst, ptrdiff_t dst_stride,
160                            const int16_t *filter_x, int x_step_q4,
161                            const int16_t *filter_y, int y_step_q4,
162                            int w, int h) {
163   const InterpKernel *const filters_x = get_filter_base(filter_x);
164   const int x0_q4 = get_filter_offset(filter_x, filters_x);
165 
166   (void)filter_y;
167   (void)y_step_q4;
168 
169   convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
170                  x0_q4, x_step_q4, w, h);
171 }
172 
vpx_convolve8_avg_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)173 void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
174                                uint8_t *dst, ptrdiff_t dst_stride,
175                                const int16_t *filter_x, int x_step_q4,
176                                const int16_t *filter_y, int y_step_q4,
177                                int w, int h) {
178   const InterpKernel *const filters_x = get_filter_base(filter_x);
179   const int x0_q4 = get_filter_offset(filter_x, filters_x);
180 
181   (void)filter_y;
182   (void)y_step_q4;
183 
184   convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
185                      x0_q4, x_step_q4, w, h);
186 }
187 
vpx_convolve8_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)188 void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
189                           uint8_t *dst, ptrdiff_t dst_stride,
190                           const int16_t *filter_x, int x_step_q4,
191                           const int16_t *filter_y, int y_step_q4,
192                           int w, int h) {
193   const InterpKernel *const filters_y = get_filter_base(filter_y);
194   const int y0_q4 = get_filter_offset(filter_y, filters_y);
195 
196   (void)filter_x;
197   (void)x_step_q4;
198 
199   convolve_vert(src, src_stride, dst, dst_stride, filters_y,
200                 y0_q4, y_step_q4, w, h);
201 }
202 
vpx_convolve8_avg_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)203 void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
204                               uint8_t *dst, ptrdiff_t dst_stride,
205                               const int16_t *filter_x, int x_step_q4,
206                               const int16_t *filter_y, int y_step_q4,
207                               int w, int h) {
208   const InterpKernel *const filters_y = get_filter_base(filter_y);
209   const int y0_q4 = get_filter_offset(filter_y, filters_y);
210 
211   (void)filter_x;
212   (void)x_step_q4;
213 
214   convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
215                     y0_q4, y_step_q4, w, h);
216 }
217 
vpx_convolve8_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)218 void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
219                      uint8_t *dst, ptrdiff_t dst_stride,
220                      const int16_t *filter_x, int x_step_q4,
221                      const int16_t *filter_y, int y_step_q4,
222                      int w, int h) {
223   const InterpKernel *const filters_x = get_filter_base(filter_x);
224   const int x0_q4 = get_filter_offset(filter_x, filters_x);
225 
226   const InterpKernel *const filters_y = get_filter_base(filter_y);
227   const int y0_q4 = get_filter_offset(filter_y, filters_y);
228 
229   convolve(src, src_stride, dst, dst_stride,
230            filters_x, x0_q4, x_step_q4,
231            filters_y, y0_q4, y_step_q4, w, h);
232 }
233 
vpx_convolve8_avg_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)234 void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
235                          uint8_t *dst, ptrdiff_t dst_stride,
236                          const int16_t *filter_x, int x_step_q4,
237                          const int16_t *filter_y, int y_step_q4,
238                          int w, int h) {
239   /* Fixed size intermediate buffer places limits on parameters. */
240   DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
241   assert(w <= 64);
242   assert(h <= 64);
243 
244   vpx_convolve8_c(src, src_stride, temp, 64,
245                   filter_x, x_step_q4, filter_y, y_step_q4, w, h);
246   vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
247 }
248 
// Copies a w x h block of bytes from src to dst, one row at a time.
// src_stride / dst_stride are the distances between consecutive rows.
// The filter arguments are unused.
void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int filter_x_stride,
                         const int16_t *filter_y, int filter_y_stride,
                         int w, int h) {
  int row;

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  for (row = 0; row < h; ++row) {
    memcpy(dst, src, w);
    src += src_stride;
    dst += dst_stride;
  }
}
265 
// Averages a w x h block of src into dst: every destination pixel becomes
// ROUND_POWER_OF_TWO(dst + src, 1). The filter arguments are unused.
void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *filter_x, int filter_x_stride,
                        const int16_t *filter_y, int filter_y_stride,
                        int w, int h) {
  int row;

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
    int col;
    for (col = 0; col < w; ++col) {
      dst[col] = ROUND_POWER_OF_TWO(dst[col] + src[col], 1);
    }
  }
}
284 
// Thin wrapper: forwards all arguments unchanged to vpx_convolve8_horiz_c.
void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *filter_x, int x_step_q4,
                        const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  vpx_convolve8_horiz_c(src, src_stride,
                        dst, dst_stride,
                        filter_x, x_step_q4,
                        filter_y, y_step_q4,
                        w, h);
}
293 
// Thin wrapper: forwards all arguments unchanged to vpx_convolve8_vert_c.
void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                       uint8_t *dst, ptrdiff_t dst_stride,
                       const int16_t *filter_x, int x_step_q4,
                       const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
  vpx_convolve8_vert_c(src, src_stride,
                       dst, dst_stride,
                       filter_x, x_step_q4,
                       filter_y, y_step_q4,
                       w, h);
}
302 
// Thin wrapper: forwards all arguments unchanged to vpx_convolve8_c.
void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride,
                     uint8_t *dst, ptrdiff_t dst_stride,
                     const int16_t *filter_x, int x_step_q4,
                     const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  vpx_convolve8_c(src, src_stride,
                  dst, dst_stride,
                  filter_x, x_step_q4,
                  filter_y, y_step_q4,
                  w, h);
}
311 
// Thin wrapper: forwards all arguments unchanged to
// vpx_convolve8_avg_horiz_c.
void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h) {
  vpx_convolve8_avg_horiz_c(src, src_stride,
                            dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
}
320 
// Thin wrapper: forwards all arguments unchanged to
// vpx_convolve8_avg_vert_c.
void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h) {
  vpx_convolve8_avg_vert_c(src, src_stride,
                           dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);
}
329 
// Thin wrapper: forwards all arguments unchanged to vpx_convolve8_avg_c.
void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  vpx_convolve8_avg_c(src, src_stride,
                      dst, dst_stride,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, h);
}
338 
339 #if CONFIG_VP9_HIGHBITDEPTH
highbd_convolve_horiz(const uint8_t * src8,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int bd)340 static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
341                                   uint8_t *dst8, ptrdiff_t dst_stride,
342                                   const InterpKernel *x_filters,
343                                   int x0_q4, int x_step_q4,
344                                   int w, int h, int bd) {
345   int x, y;
346   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
347   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
348   src -= SUBPEL_TAPS / 2 - 1;
349   for (y = 0; y < h; ++y) {
350     int x_q4 = x0_q4;
351     for (x = 0; x < w; ++x) {
352       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
353       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
354       int k, sum = 0;
355       for (k = 0; k < SUBPEL_TAPS; ++k)
356         sum += src_x[k] * x_filter[k];
357       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
358       x_q4 += x_step_q4;
359     }
360     src += src_stride;
361     dst += dst_stride;
362   }
363 }
364 
highbd_convolve_avg_horiz(const uint8_t * src8,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int bd)365 static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
366                                       uint8_t *dst8, ptrdiff_t dst_stride,
367                                       const InterpKernel *x_filters,
368                                       int x0_q4, int x_step_q4,
369                                       int w, int h, int bd) {
370   int x, y;
371   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
372   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
373   src -= SUBPEL_TAPS / 2 - 1;
374   for (y = 0; y < h; ++y) {
375     int x_q4 = x0_q4;
376     for (x = 0; x < w; ++x) {
377       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
378       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
379       int k, sum = 0;
380       for (k = 0; k < SUBPEL_TAPS; ++k)
381         sum += src_x[k] * x_filter[k];
382       dst[x] = ROUND_POWER_OF_TWO(dst[x] +
383           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
384       x_q4 += x_step_q4;
385     }
386     src += src_stride;
387     dst += dst_stride;
388   }
389 }
390 
highbd_convolve_vert(const uint8_t * src8,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)391 static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
392                                  uint8_t *dst8, ptrdiff_t dst_stride,
393                                  const InterpKernel *y_filters,
394                                  int y0_q4, int y_step_q4, int w, int h,
395                                  int bd) {
396   int x, y;
397   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
398   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
399   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
400   for (x = 0; x < w; ++x) {
401     int y_q4 = y0_q4;
402     for (y = 0; y < h; ++y) {
403       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
404       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
405       int k, sum = 0;
406       for (k = 0; k < SUBPEL_TAPS; ++k)
407         sum += src_y[k * src_stride] * y_filter[k];
408       dst[y * dst_stride] = clip_pixel_highbd(
409           ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
410       y_q4 += y_step_q4;
411     }
412     ++src;
413     ++dst;
414   }
415 }
416 
highbd_convolve_avg_vert(const uint8_t * src8,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)417 static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
418                                      uint8_t *dst8, ptrdiff_t dst_stride,
419                                      const InterpKernel *y_filters,
420                                      int y0_q4, int y_step_q4, int w, int h,
421                                      int bd) {
422   int x, y;
423   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
424   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
425   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
426   for (x = 0; x < w; ++x) {
427     int y_q4 = y0_q4;
428     for (y = 0; y < h; ++y) {
429       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
430       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
431       int k, sum = 0;
432       for (k = 0; k < SUBPEL_TAPS; ++k)
433         sum += src_y[k * src_stride] * y_filter[k];
434       dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
435           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
436       y_q4 += y_step_q4;
437     }
438     ++src;
439     ++dst;
440   }
441 }
442 
highbd_convolve(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_q4,int x_step_q4,const InterpKernel * const y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)443 static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
444                             uint8_t *dst, ptrdiff_t dst_stride,
445                             const InterpKernel *const x_filters,
446                             int x0_q4, int x_step_q4,
447                             const InterpKernel *const y_filters,
448                             int y0_q4, int y_step_q4,
449                             int w, int h, int bd) {
450   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
451   // 2d filtering proceeds in 2 steps:
452   //   (1) Interpolate horizontally into an intermediate buffer, temp.
453   //   (2) Interpolate temp vertically to derive the sub-pixel result.
454   // Deriving the maximum number of rows in the temp buffer (135):
455   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
456   // --Largest block size is 64x64 pixels.
457   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
458   //   original frame (in 1/16th pixel units).
459   // --Must round-up because block may be located at sub-pixel position.
460   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
461   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
462   uint16_t temp[64 * 135];
463   int intermediate_height =
464           (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
465 
466   assert(w <= 64);
467   assert(h <= 64);
468   assert(y_step_q4 <= 32);
469   assert(x_step_q4 <= 32);
470 
471   highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
472                         src_stride, CONVERT_TO_BYTEPTR(temp), 64,
473                         x_filters, x0_q4, x_step_q4, w,
474                         intermediate_height, bd);
475   highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
476                        64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
477                        w, h, bd);
478 }
479 
480 
vpx_highbd_convolve8_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)481 void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
482                                   uint8_t *dst, ptrdiff_t dst_stride,
483                                   const int16_t *filter_x, int x_step_q4,
484                                   const int16_t *filter_y, int y_step_q4,
485                                   int w, int h, int bd) {
486   const InterpKernel *const filters_x = get_filter_base(filter_x);
487   const int x0_q4 = get_filter_offset(filter_x, filters_x);
488   (void)filter_y;
489   (void)y_step_q4;
490 
491   highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
492                         x0_q4, x_step_q4, w, h, bd);
493 }
494 
vpx_highbd_convolve8_avg_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)495 void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
496                                       uint8_t *dst, ptrdiff_t dst_stride,
497                                       const int16_t *filter_x, int x_step_q4,
498                                       const int16_t *filter_y, int y_step_q4,
499                                       int w, int h, int bd) {
500   const InterpKernel *const filters_x = get_filter_base(filter_x);
501   const int x0_q4 = get_filter_offset(filter_x, filters_x);
502   (void)filter_y;
503   (void)y_step_q4;
504 
505   highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
506                             x0_q4, x_step_q4, w, h, bd);
507 }
508 
vpx_highbd_convolve8_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)509 void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
510                                  uint8_t *dst, ptrdiff_t dst_stride,
511                                  const int16_t *filter_x, int x_step_q4,
512                                  const int16_t *filter_y, int y_step_q4,
513                                  int w, int h, int bd) {
514   const InterpKernel *const filters_y = get_filter_base(filter_y);
515   const int y0_q4 = get_filter_offset(filter_y, filters_y);
516   (void)filter_x;
517   (void)x_step_q4;
518 
519   highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y,
520                        y0_q4, y_step_q4, w, h, bd);
521 }
522 
vpx_highbd_convolve8_avg_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)523 void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
524                                      uint8_t *dst, ptrdiff_t dst_stride,
525                                      const int16_t *filter_x, int x_step_q4,
526                                      const int16_t *filter_y, int y_step_q4,
527                                      int w, int h, int bd) {
528   const InterpKernel *const filters_y = get_filter_base(filter_y);
529   const int y0_q4 = get_filter_offset(filter_y, filters_y);
530   (void)filter_x;
531   (void)x_step_q4;
532 
533   highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
534                            y0_q4, y_step_q4, w, h, bd);
535 }
536 
vpx_highbd_convolve8_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)537 void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
538                             uint8_t *dst, ptrdiff_t dst_stride,
539                             const int16_t *filter_x, int x_step_q4,
540                             const int16_t *filter_y, int y_step_q4,
541                             int w, int h, int bd) {
542   const InterpKernel *const filters_x = get_filter_base(filter_x);
543   const int x0_q4 = get_filter_offset(filter_x, filters_x);
544 
545   const InterpKernel *const filters_y = get_filter_base(filter_y);
546   const int y0_q4 = get_filter_offset(filter_y, filters_y);
547 
548   highbd_convolve(src, src_stride, dst, dst_stride,
549                   filters_x, x0_q4, x_step_q4,
550                   filters_y, y0_q4, y_step_q4, w, h, bd);
551 }
552 
vpx_highbd_convolve8_avg_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)553 void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
554                                 uint8_t *dst, ptrdiff_t dst_stride,
555                                 const int16_t *filter_x, int x_step_q4,
556                                 const int16_t *filter_y, int y_step_q4,
557                                 int w, int h, int bd) {
558   // Fixed size intermediate buffer places limits on parameters.
559   DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
560   assert(w <= 64);
561   assert(h <= 64);
562 
563   vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
564                          filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
565   vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
566                             NULL, 0, NULL, 0, w, h, bd);
567 }
568 
// Copies a w x h block of 16-bit samples from src8 to dst8, row by row.
// Both pointers are turned into uint16_t pointers with CONVERT_TO_SHORTPTR,
// and the strides are applied in units of uint16_t samples. The filter
// arguments and bd are unused.
void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                uint8_t *dst8, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  int row;
  uint16_t *in = CONVERT_TO_SHORTPTR(src8);
  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  (void)bd;

  for (row = 0; row < h; ++row) {
    // Each sample occupies sizeof(uint16_t) bytes.
    memcpy(out, in, w * sizeof(uint16_t));
    in += src_stride;
    out += dst_stride;
  }
}
589 
// Averages a w x h block of 16-bit samples from src8 into dst8: every
// destination sample becomes ROUND_POWER_OF_TWO(dst + src, 1).
// Both pointers are turned into uint16_t pointers with CONVERT_TO_SHORTPTR,
// and the strides are applied in units of uint16_t samples. The filter
// arguments and bd are unused.
void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  int row;
  uint16_t *in = CONVERT_TO_SHORTPTR(src8);
  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  (void)bd;

  for (row = 0; row < h; ++row, in += src_stride, out += dst_stride) {
    int col;
    for (col = 0; col < w; ++col) {
      out[col] = ROUND_POWER_OF_TWO(out[col] + in[col], 1);
    }
  }
}
612 #endif
613