/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_ports/mem.h"

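// Load 4 rows of 4 int16_t samples from s, stepping by the stride p.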
static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p,
                            int16x4_t *const s0, int16x4_t *const s1,
                            int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

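// Load 4 rows of 8 uint16_t pixels from s, stepping by the stride p.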
static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p,
                            uint16x8_t *const s0, uint16x8_t *const s1,
                            uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
}

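// Load 8 rows of 8 int16_t samples from s, stepping by the stride p.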
static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p,
                            int16x8_t *const s0, int16x8_t *const s1,
                            int16x8_t *const s2, int16x8_t *const s3,
                            int16x8_t *const s4, int16x8_t *const s5,
                            int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

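// Store 8 rows of 8 uint16_t pixels to s, stepping by the stride p.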
static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p,
                             const uint16x8_t s0, const uint16x8_t s1,
                             const uint16x8_t s2, const uint16x8_t s3,
                             const uint16x8_t s4, const uint16x8_t s5,
                             const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += p;
  vst1q_u16(s, s1);
  s += p;
  vst1q_u16(s, s2);
  s += p;
  vst1q_u16(s, s3);
  s += p;
  vst1q_u16(s, s4);
  s += p;
  vst1q_u16(s, s5);
  s += p;
  vst1q_u16(s, s6);
  s += p;
  vst1q_u16(s, s7);
}

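// Apply the 8-tap filter to 4 samples. The 32-bit accumulator is returned
// unrounded; rounding, narrowing and clamping are left to the caller.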
static INLINE int32x4_t highbd_convolve8_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int32x4_t sum;

  sum = vmull_lane_s16(s0, filters_lo, 0);
  sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
  sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
  sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
  sum = vmlal_lane_s16(sum, s4, filters_hi, 0);
  sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
  sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
  sum = vmlal_lane_s16(sum, s7, filters_hi, 3);
  return sum;
}

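// Apply the 8-tap filter to 8 samples, then round and narrow with a shift of
// 7, saturate to unsigned 16 bits and clamp to the bit-depth maximum 'max'.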
static INLINE uint16x8_t
highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
                   const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
                   const int16x8_t s6, const int16x8_t s7,
                   const int16x8_t filters, const uint16x8_t max) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int32x4_t sum0, sum1;
  uint16x8_t d;

  sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filters_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);
  sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filters_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);
  d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7));
  d = vminq_u16(d, max);
  return d;
}

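// Horizontal 8-tap convolution. Scaled filtering (x_step_q4 != 16) falls back
// to the C implementation; otherwise blocks of rows are loaded, transposed,
// filtered with highbd_convolve8_4/8 and transposed back before storing.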
void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
                                     uint16_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *filter, int x0_q4,
                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h, int bd) {
  if (x_step_q4 != 16) {
    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[x0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    uint16x8_t t0, t1, t2, t3;

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3;

    if (h == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23;

      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u16_8x4(&t0, &t1, &t2, &t3);
      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      src += 7;

      do {
        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
        transpose_s16_4x4d(&s7, &s8, &s9, &s10);

        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        d01 = vminq_u16(d01, max);
        d23 = vminq_u16(d23, max);
        transpose_u16_4x4q(&d01, &d23);

        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        src += 4;
        dst += 4;
        w -= 4;
      } while (w > 0);
    } else {
      int16x8_t t4, t5, t6, t7;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3;

      if (w == 4) {
        do {
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
                   &t4, &t5, &t6, &t7);
          src += 8 * src_stride;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);
          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);

          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          d3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          transpose_u16_8x4(&d0, &d1, &d2, &d3);
          vst1_u16(dst, vget_low_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d3));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d3));
          dst += dst_stride;
          h -= 8;
        } while (h > 0);
      } else {
        int width;
        const uint16_t *s;
        uint16_t *d;
        int16x8_t s11, s12, s13, s14;
        uint16x8_t d4, d5, d6, d7;

        do {
          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          width = w;
          s = src + 7;
          d = dst;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);

          do {
            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
                     &s12, &s13, &s14);
            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);

            d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
                                    max);
            d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
                                    max);
            d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
                                    max);
            d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
                                    max);
            d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
                                    max);
            d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
                                    max);
            d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
                                    max);
            d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
                                    filters, max);

            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

            s0 = s8;
            s1 = s9;
            s2 = s10;
            s3 = s11;
            s4 = s12;
            s5 = s13;
            s6 = s14;
            s += 8;
            d += 8;
            width -= 8;
          } while (width > 0);
          src += 8 * src_stride;
          dst += 8 * dst_stride;
          h -= 8;
        } while (h > 0);
      }
    }
  }
}

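// Horizontal 8-tap convolution with rounding average: the filtered result is
// averaged with the pixels already in dst (vrhaddq_u16) before being stored.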
void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
                                         ptrdiff_t src_stride, uint16_t *dst,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *filter, int x0_q4,
                                         int x_step_q4, int y0_q4,
                                         int y_step_q4, int w, int h, int bd) {
  if (x_step_q4 != 16) {
    vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                     bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[x0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    uint16x8_t t0, t1, t2, t3;

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3;

    if (h == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23, t01, t23;

      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u16_8x4(&t0, &t1, &t2, &t3);
      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      src += 7;

      do {
        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
        transpose_s16_4x4d(&s7, &s8, &s9, &s10);

        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        t01 = vminq_u16(t01, max);
        t23 = vminq_u16(t23, max);
        transpose_u16_4x4q(&t01, &t23);

        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                           vld1_u16(dst + 2 * dst_stride));
        d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
                           vld1_u16(dst + 3 * dst_stride));
        d01 = vrhaddq_u16(d01, t01);
        d23 = vrhaddq_u16(d23, t23);

        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        src += 4;
        dst += 4;
        w -= 4;
      } while (w > 0);
    } else {
      int16x8_t t4, t5, t6, t7;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;

      if (w == 4) {
        do {
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
                   &t4, &t5, &t6, &t7);
          src += 8 * src_stride;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);
          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);

          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          t3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
          transpose_u16_8x4(&t0, &t1, &t2, &t3);

          d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                            vld1_u16(dst + 4 * dst_stride));
          d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
                            vld1_u16(dst + 5 * dst_stride));
          d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
                            vld1_u16(dst + 6 * dst_stride));
          d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride),
                            vld1_u16(dst + 7 * dst_stride));
          d0 = vrhaddq_u16(d0, t0);
          d1 = vrhaddq_u16(d1, t1);
          d2 = vrhaddq_u16(d2, t2);
          d3 = vrhaddq_u16(d3, t3);

          vst1_u16(dst, vget_low_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d3));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d3));
          dst += dst_stride;
          h -= 8;
        } while (h > 0);
      } else {
        int width;
        const uint16_t *s;
        uint16_t *d;
        int16x8_t s11, s12, s13, s14;
        uint16x8_t d4, d5, d6, d7;

        do {
          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          width = w;
          s = src + 7;
          d = dst;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);

          do {
            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
                     &s12, &s13, &s14);
            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);

            d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
                                    max);
            d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
                                    max);
            d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
                                    max);
            d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
                                    max);
            d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
                                    max);
            d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
                                    max);
            d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
                                    max);
            d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
                                    filters, max);

            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

            d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
            d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
            d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
            d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
            d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride));
            d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride));
            d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride));
            d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride));

            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

            s0 = s8;
            s1 = s9;
            s2 = s10;
            s3 = s11;
            s4 = s12;
            s5 = s13;
            s6 = s14;
            s += 8;
            d += 8;
            width -= 8;
          } while (width > 0);
          src += 8 * src_stride;
          dst += 8 * dst_stride;
          h -= 8;
        } while (h > 0);
      }
    }
  }
}

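// Vertical 8-tap convolution. No transposes are needed; a sliding window of
// eight source rows is kept in registers and shifted down by four rows per
// loop iteration.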
void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
                                    uint16_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *filter, int x0_q4,
                                    int x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                                x_step_q4, y0_q4, y_step_q4, w, h, bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[y0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3 * src_stride;

    if (w == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23;

      s0 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s1 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s2 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s3 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s4 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s5 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s6 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;

      do {
        s7 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s8 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s9 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s10 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;

        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        d01 = vminq_u16(d01, max);
        d23 = vminq_u16(d23, max);
        vst1_u16(dst, vget_low_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_low_u16(d23));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d23));
        dst += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        h -= 4;
      } while (h > 0);
    } else {
      int height;
      const uint16_t *s;
      uint16_t *d;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        s = src;
        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        d = dst;
        height = h;

        do {
          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;

          __builtin_prefetch(d + 0 * dst_stride);
          __builtin_prefetch(d + 1 * dst_stride);
          __builtin_prefetch(d + 2 * dst_stride);
          __builtin_prefetch(d + 3 * dst_stride);
          __builtin_prefetch(s + 0 * src_stride);
          __builtin_prefetch(s + 1 * src_stride);
          __builtin_prefetch(s + 2 * src_stride);
          __builtin_prefetch(s + 3 * src_stride);
          d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          d3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          vst1q_u16(d, d0);
          d += dst_stride;
          vst1q_u16(d, d1);
          d += dst_stride;
          vst1q_u16(d, d2);
          d += dst_stride;
          vst1q_u16(d, d3);
          d += dst_stride;

          s0 = s4;
          s1 = s5;
          s2 = s6;
          s3 = s7;
          s4 = s8;
          s5 = s9;
          s6 = s10;
          height -= 4;
        } while (height > 0);
        src += 8;
        dst += 8;
        w -= 8;
      } while (w > 0);
    }
  }
}

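// Vertical 8-tap convolution with rounding average against the existing
// contents of dst (vrhaddq_u16).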
void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
                                        ptrdiff_t src_stride, uint16_t *dst,
                                        ptrdiff_t dst_stride,
                                        const InterpKernel *filter, int x0_q4,
                                        int x_step_q4, int y0_q4, int y_step_q4,
                                        int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                    bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[y0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3 * src_stride;

    if (w == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23, t01, t23;

      s0 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s1 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s2 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s3 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s4 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s5 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s6 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;

      do {
        s7 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s8 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s9 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s10 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;

        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        t01 = vminq_u16(t01, max);
        t23 = vminq_u16(t23, max);

        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                           vld1_u16(dst + 1 * dst_stride));
        d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
                           vld1_u16(dst + 3 * dst_stride));
        d01 = vrhaddq_u16(d01, t01);
        d23 = vrhaddq_u16(d23, t23);

        vst1_u16(dst, vget_low_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_low_u16(d23));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d23));
        dst += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        h -= 4;
      } while (h > 0);
    } else {
      int height;
      const uint16_t *s;
      uint16_t *d;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        s = src;
        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        d = dst;
        height = h;

        do {
          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;

          __builtin_prefetch(d + 0 * dst_stride);
          __builtin_prefetch(d + 1 * dst_stride);
          __builtin_prefetch(d + 2 * dst_stride);
          __builtin_prefetch(d + 3 * dst_stride);
          __builtin_prefetch(s + 0 * src_stride);
          __builtin_prefetch(s + 1 * src_stride);
          __builtin_prefetch(s + 2 * src_stride);
          __builtin_prefetch(s + 3 * src_stride);
          t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          t3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          d0 = vld1q_u16(d + 0 * dst_stride);
          d1 = vld1q_u16(d + 1 * dst_stride);
          d2 = vld1q_u16(d + 2 * dst_stride);
          d3 = vld1q_u16(d + 3 * dst_stride);
          d0 = vrhaddq_u16(d0, t0);
          d1 = vrhaddq_u16(d1, t1);
          d2 = vrhaddq_u16(d2, t2);
          d3 = vrhaddq_u16(d3, t3);

          vst1q_u16(d, d0);
          d += dst_stride;
          vst1q_u16(d, d1);
          d += dst_stride;
          vst1q_u16(d, d2);
          d += dst_stride;
          vst1q_u16(d, d3);
          d += dst_stride;

          s0 = s4;
          s1 = s5;
          s2 = s6;
          s3 = s7;
          s4 = s8;
          s5 = s9;
          s6 = s10;
          height -= 4;
        } while (height > 0);
        src += 8;
        dst += 8;
        w -= 8;
      } while (w > 0);
    }
  }
}