/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <string.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

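// Sub-pixel addressing convention used throughout this file: positions are
// tracked in q4 fixed point (1/16-pel units), so a step of 16 advances one
// full source pixel per output pixel, and larger steps correspond to a
// downscaled reference. For a running position p_q4, (p_q4 >> SUBPEL_BITS)
// selects the integer source sample and (p_q4 & SUBPEL_MASK) selects the
// SUBPEL_TAPS-tap interpolation kernel for that sub-pixel phase. Each output
// pixel is the kernel dot product, rounded with
// ROUND_POWER_OF_TWO(sum, FILTER_BITS) and clipped to the valid pixel range.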
static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *x_filters,
                           int x0_q4, int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;
  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_x[k] * x_filter[k];
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

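// The _avg variants below blend the filtered result into the existing
// destination: dst = ROUND_POWER_OF_TWO(dst + filtered, 1), i.e. a rounding
// average of the current dst pixel and the newly interpolated pixel.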
static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *x_filters,
                               int x0_q4, int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;
  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_x[k] * x_filter[k];
      dst[x] = ROUND_POWER_OF_TWO(dst[x] +
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const InterpKernel *y_filters,
                          int y0_q4, int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const uint8_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *y_filters,
                              int y0_q4, int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const uint8_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void convolve(const uint8_t *src, ptrdiff_t src_stride,
                     uint8_t *dst, ptrdiff_t dst_stride,
                     const InterpKernel *const x_filters,
                     int x0_q4, int x_step_q4,
                     const InterpKernel *const y_filters,
                     int y0_q4, int y_step_q4,
                     int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  uint8_t temp[135 * 64];
  int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
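  // Worked example for the worst case permitted by the asserts below
  // (h = 64, y_step_q4 = 32) with the maximum phase y0_q4 = 15:
  // ((63 * 32 + 15) >> 4) + 8 = 126 + 8 = 134 rows, within the 135-row temp.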

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
                 x_filters, x0_q4, x_step_q4, w, intermediate_height);
  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
                y_filters, y0_q4, y_step_q4, w, h);
}

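// Recover the kernel table and sub-pixel phase from a raw filter pointer,
// assuming the 256-byte aligned, 16-entry kernel table required by the NOTE
// below (16 kernels x 8 taps x 2 bytes = 256 bytes). For example, a pointer
// 48 bytes into such a table (kernel index 3) yields the table start from
// get_filter_base() and 3 from get_filter_offset(), i.e. the q4 phase.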
static const InterpKernel *get_filter_base(const int16_t *filter) {
  // NOTE: This assumes that the filter table is 256-byte aligned.
  // TODO(agrange) Modify to make independent of table alignment.
  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
}

static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  return (int)((const InterpKernel *)(intptr_t)f - base);
}

void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  (void)filter_y;
  (void)y_step_q4;

  convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
                 x0_q4, x_step_q4, w, h);
}

void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  (void)filter_y;
  (void)y_step_q4;

  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
                     x0_q4, x_step_q4, w, h);
}

void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  (void)filter_x;
  (void)x_step_q4;

  convolve_vert(src, src_stride, dst, dst_stride, filters_y,
                y0_q4, y_step_q4, w, h);
}

void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  (void)filter_x;
  (void)x_step_q4;

  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
                    y0_q4, y_step_q4, w, h);
}

void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                     uint8_t *dst, ptrdiff_t dst_stride,
                     const int16_t *filter_x, int x_step_q4,
                     const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  convolve(src, src_stride, dst, dst_stride,
           filters_x, x0_q4, x_step_q4,
           filters_y, y0_q4, y_step_q4, w, h);
}

void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  vpx_convolve8_c(src, src_stride, temp, 64,
                  filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
}

void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int filter_x_stride,
                         const int16_t *filter_y, int filter_y_stride,
                         int w, int h) {
  int r;

  (void)filter_x; (void)filter_x_stride;
  (void)filter_y; (void)filter_y_stride;

  for (r = h; r > 0; --r) {
    memcpy(dst, src, w);
    src += src_stride;
    dst += dst_stride;
  }
}

void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *filter_x, int filter_x_stride,
                        const int16_t *filter_y, int filter_y_stride,
                        int w, int h) {
  int x, y;

  (void)filter_x; (void)filter_x_stride;
  (void)filter_y; (void)filter_y_stride;

  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x)
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);

    src += src_stride;
    dst += dst_stride;
  }
}

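// The vpx_scaled_* entry points below are thin wrappers: in the C
// implementation, scaled and unscaled prediction share the convolve8 code
// paths above.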
void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *filter_x, int x_step_q4,
                        const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
}

void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                       uint8_t *dst, ptrdiff_t dst_stride,
                       const int16_t *filter_x, int x_step_q4,
                       const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
  vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                       filter_y, y_step_q4, w, h);
}

void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride,
                     uint8_t *dst, ptrdiff_t dst_stride,
                     const int16_t *filter_x, int x_step_q4,
                     const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                  filter_y, y_step_q4, w, h);
}

void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h) {
  vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                            x_step_q4, filter_y, y_step_q4, w, h);
}

void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h) {
  vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
                           x_step_q4, filter_y, y_step_q4, w, h);
}

void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                      filter_y, y_step_q4, w, h);
}

#if CONFIG_VP9_HIGHBITDEPTH
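// The high bit-depth variants mirror the 8-bit functions above but operate on
// uint16_t samples: the uint8_t * buffer parameters are converted with
// CONVERT_TO_SHORTPTR / CONVERT_TO_BYTEPTR, and results are clipped to the
// bit depth bd via clip_pixel_highbd().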
static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                  const InterpKernel *x_filters,
                                  int x0_q4, int x_step_q4,
                                  int w, int h, int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_x[k] * x_filter[k];
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                      uint8_t *dst8, ptrdiff_t dst_stride,
                                      const InterpKernel *x_filters,
                                      int x0_q4, int x_step_q4,
                                      int w, int h, int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_x[k] * x_filter[k];
      dst[x] = ROUND_POWER_OF_TWO(dst[x] +
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                 uint8_t *dst8, ptrdiff_t dst_stride,
                                 const InterpKernel *y_filters,
                                 int y0_q4, int y_step_q4, int w, int h,
                                 int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] = clip_pixel_highbd(
          ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                     uint8_t *dst8, ptrdiff_t dst_stride,
                                     const InterpKernel *y_filters,
                                     int y0_q4, int y_step_q4, int w, int h,
                                     int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *const x_filters,
                            int x0_q4, int x_step_q4,
                            const InterpKernel *const y_filters,
                            int y0_q4, int y_step_q4,
                            int w, int h, int bd) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  uint16_t temp[64 * 135];
  int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                        src_stride, CONVERT_TO_BYTEPTR(temp), 64,
                        x_filters, x0_q4, x_step_q4, w,
                        intermediate_height, bd);
  highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
                       64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
                       w, h, bd);
}

void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h, int bd) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;

  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
                        x0_q4, x_step_q4, w, h, bd);
}

void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h, int bd) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;

  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
                            x0_q4, x_step_q4, w, h, bd);
}

void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h, int bd) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;

  highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y,
                       y0_q4, y_step_q4, w, h, bd);
}

void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
                                     int w, int h, int bd) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;

  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
                           y0_q4, y_step_q4, w, h, bd);
}

void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h, int bd) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  highbd_convolve(src, src_stride, dst, dst_stride,
                  filters_x, x0_q4, x_step_q4,
                  filters_y, y0_q4, y_step_q4, w, h, bd);
}

void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h, int bd) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
                            NULL, 0, NULL, 0, w, h, bd);
}

void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                uint8_t *dst8, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  int r;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  for (r = h; r > 0; --r) {
    memcpy(dst, src, w * sizeof(uint16_t));
    src += src_stride;
    dst += dst_stride;
  }
}

void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
    }
    src += src_stride;
    dst += dst_stride;
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH