/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_ports/mem.h"
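// Load four rows of 4 int16 samples from a buffer with stride p.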
static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p,
                            int16x4_t *const s0, int16x4_t *const s1,
                            int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}
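// Load four rows of 8 uint16 samples from a buffer with stride p.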
static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p,
                            uint16x8_t *const s0, uint16x8_t *const s1,
                            uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
}
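// Load eight rows of 8 int16 samples from a buffer with stride p.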
static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p,
                            int16x8_t *const s0, int16x8_t *const s1,
                            int16x8_t *const s2, int16x8_t *const s3,
                            int16x8_t *const s4, int16x8_t *const s5,
                            int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}
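// Store eight rows of 8 uint16 samples to a buffer with stride p.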
static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p,
                             const uint16x8_t s0, const uint16x8_t s1,
                             const uint16x8_t s2, const uint16x8_t s3,
                             const uint16x8_t s4, const uint16x8_t s5,
                             const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += p;
  vst1q_u16(s, s1);
  s += p;
  vst1q_u16(s, s2);
  s += p;
  vst1q_u16(s, s3);
  s += p;
  vst1q_u16(s, s4);
  s += p;
  vst1q_u16(s, s5);
  s += p;
  vst1q_u16(s, s6);
  s += p;
  vst1q_u16(s, s7);
}
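// Apply the 8-tap filter to one set of 4 pixels. The unrounded 32-bit
// accumulators are returned; the caller performs the rounding shift and the
// clamp to the bit-depth maximum.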
static INLINE int32x4_t highbd_convolve8_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int32x4_t sum;

  sum = vmull_lane_s16(s0, filters_lo, 0);
  sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
  sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
  sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
  sum = vmlal_lane_s16(sum, s4, filters_hi, 0);
  sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
  sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
  sum = vmlal_lane_s16(sum, s7, filters_hi, 3);
  return sum;
}
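// Apply the 8-tap filter to one set of 8 pixels, narrow with a rounding shift
// by 7 (FILTER_BITS), saturate to uint16 and clamp to the bit-depth maximum
// 'max'.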
static INLINE uint16x8_t
highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
                   const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
                   const int16x8_t s6, const int16x8_t s7,
                   const int16x8_t filters, const uint16x8_t max) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int32x4_t sum0, sum1;
  uint16x8_t d;

  sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filters_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);
  sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filters_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);
  d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7));
  d = vminq_u16(d, max);
  return d;
}
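// Horizontal 8-tap convolution. Only the unscaled case (x_step_q4 == 16) is
// handled with NEON; other step sizes fall back to the C implementation.
// Source rows are transposed so that horizontally adjacent samples end up in
// separate vectors, allowing the same per-lane multiply-accumulate pattern as
// the vertical filter; the results are transposed back before storing.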
void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
                                     uint16_t *dst, ptrdiff_t dst_stride,
                                     const InterpKernel *filter, int x0_q4,
                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h, int bd) {
  if (x_step_q4 != 16) {
    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[x0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    uint16x8_t t0, t1, t2, t3;

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3;

    if (h == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23;

      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u16_8x4(&t0, &t1, &t2, &t3);
      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      src += 7;

      do {
        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
        transpose_s16_4x4d(&s7, &s8, &s9, &s10);

        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        d01 = vminq_u16(d01, max);
        d23 = vminq_u16(d23, max);
        transpose_u16_4x4q(&d01, &d23);

        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        src += 4;
        dst += 4;
        w -= 4;
      } while (w > 0);
    } else {
      int16x8_t t4, t5, t6, t7;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3;

      if (w == 4) {
        do {
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
                   &t4, &t5, &t6, &t7);
          src += 8 * src_stride;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);
          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);

          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          d3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          transpose_u16_8x4(&d0, &d1, &d2, &d3);
          vst1_u16(dst, vget_low_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d3));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d3));
          dst += dst_stride;
          h -= 8;
        } while (h > 0);
      } else {
        int width;
        const uint16_t *s;
        uint16_t *d;
        int16x8_t s11, s12, s13, s14;
        uint16x8_t d4, d5, d6, d7;

        do {
          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          width = w;
          s = src + 7;
          d = dst;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);

          do {
            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
                     &s12, &s13, &s14);
            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);

            d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
                                    max);
            d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
                                    max);
            d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
                                    max);
            d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
                                    max);
            d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
                                    max);
            d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
                                    max);
            d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
                                    max);
            d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
                                    filters, max);

            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

            s0 = s8;
            s1 = s9;
            s2 = s10;
            s3 = s11;
            s4 = s12;
            s5 = s13;
            s6 = s14;
            s += 8;
            d += 8;
            width -= 8;
          } while (width > 0);
          src += 8 * src_stride;
          dst += 8 * dst_stride;
          h -= 8;
        } while (h > 0);
      }
    }
  }
}
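// As vpx_highbd_convolve8_horiz_neon(), but the filtered result is averaged
// with the existing contents of dst using a rounding halving add
// (vrhaddq_u16).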
void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
                                         ptrdiff_t src_stride, uint16_t *dst,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *filter, int x0_q4,
                                         int x_step_q4, int y0_q4,
                                         int y_step_q4, int w, int h, int bd) {
  if (x_step_q4 != 16) {
    vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                     bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[x0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    uint16x8_t t0, t1, t2, t3;

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3;

    if (h == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23, t01, t23;

      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u16_8x4(&t0, &t1, &t2, &t3);
      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      src += 7;

      do {
        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
        transpose_s16_4x4d(&s7, &s8, &s9, &s10);

        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        t01 = vminq_u16(t01, max);
        t23 = vminq_u16(t23, max);
        transpose_u16_4x4q(&t01, &t23);

        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                           vld1_u16(dst + 2 * dst_stride));
        d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
                           vld1_u16(dst + 3 * dst_stride));
        d01 = vrhaddq_u16(d01, t01);
        d23 = vrhaddq_u16(d23, t23);

        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        src += 4;
        dst += 4;
        w -= 4;
      } while (w > 0);
    } else {
      int16x8_t t4, t5, t6, t7;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;

      if (w == 4) {
        do {
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
                   &t4, &t5, &t6, &t7);
          src += 8 * src_stride;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);
          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);

          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          t3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
          transpose_u16_8x4(&t0, &t1, &t2, &t3);

          d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                            vld1_u16(dst + 4 * dst_stride));
          d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
                            vld1_u16(dst + 5 * dst_stride));
          d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
                            vld1_u16(dst + 6 * dst_stride));
          d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride),
                            vld1_u16(dst + 7 * dst_stride));
          d0 = vrhaddq_u16(d0, t0);
          d1 = vrhaddq_u16(d1, t1);
          d2 = vrhaddq_u16(d2, t2);
          d3 = vrhaddq_u16(d3, t3);

          vst1_u16(dst, vget_low_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d3));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d3));
          dst += dst_stride;
          h -= 8;
        } while (h > 0);
      } else {
        int width;
        const uint16_t *s;
        uint16_t *d;
        int16x8_t s11, s12, s13, s14;
        uint16x8_t d4, d5, d6, d7;

        do {
          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          width = w;
          s = src + 7;
          d = dst;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);

          do {
            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
                     &s12, &s13, &s14);
            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);

            d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
                                    max);
            d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
                                    max);
            d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
                                    max);
            d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
                                    max);
            d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
                                    max);
            d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
                                    max);
            d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
                                    max);
            d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
                                    filters, max);

            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

            d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
            d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
            d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
            d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
            d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride));
            d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride));
            d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride));
            d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride));

            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

            s0 = s8;
            s1 = s9;
            s2 = s10;
            s3 = s11;
            s4 = s12;
            s5 = s13;
            s6 = s14;
            s += 8;
            d += 8;
            width -= 8;
          } while (width > 0);
          src += 8 * src_stride;
          dst += 8 * dst_stride;
          h -= 8;
        } while (h > 0);
      }
    }
  }
}
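// Vertical 8-tap convolution for the unscaled case (y_step_q4 == 16); other
// step sizes fall back to the C implementation. No transposes are needed:
// each output row is the filtered combination of eight consecutive source
// rows, with seven rows of context carried between loop iterations.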
void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
                                    uint16_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *filter, int x0_q4,
                                    int x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                                x_step_q4, y0_q4, y_step_q4, w, h, bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[y0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3 * src_stride;

    if (w == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23;

      s0 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s1 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s2 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s3 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s4 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s5 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s6 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;

      do {
        s7 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s8 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s9 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s10 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;

        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        d01 = vminq_u16(d01, max);
        d23 = vminq_u16(d23, max);
        vst1_u16(dst, vget_low_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_low_u16(d23));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d23));
        dst += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        h -= 4;
      } while (h > 0);
    } else {
      int height;
      const uint16_t *s;
      uint16_t *d;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        s = src;
        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        d = dst;
        height = h;

        do {
          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;

          __builtin_prefetch(d + 0 * dst_stride);
          __builtin_prefetch(d + 1 * dst_stride);
          __builtin_prefetch(d + 2 * dst_stride);
          __builtin_prefetch(d + 3 * dst_stride);
          __builtin_prefetch(s + 0 * src_stride);
          __builtin_prefetch(s + 1 * src_stride);
          __builtin_prefetch(s + 2 * src_stride);
          __builtin_prefetch(s + 3 * src_stride);
          d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          d3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          vst1q_u16(d, d0);
          d += dst_stride;
          vst1q_u16(d, d1);
          d += dst_stride;
          vst1q_u16(d, d2);
          d += dst_stride;
          vst1q_u16(d, d3);
          d += dst_stride;

          s0 = s4;
          s1 = s5;
          s2 = s6;
          s3 = s7;
          s4 = s8;
          s5 = s9;
          s6 = s10;
          height -= 4;
        } while (height > 0);
        src += 8;
        dst += 8;
        w -= 8;
      } while (w > 0);
    }
  }
}
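// As vpx_highbd_convolve8_vert_neon(), but the filtered result is averaged
// with the existing contents of dst using a rounding halving add
// (vrhaddq_u16).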
void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
                                        ptrdiff_t src_stride, uint16_t *dst,
                                        ptrdiff_t dst_stride,
                                        const InterpKernel *filter, int x0_q4,
                                        int x_step_q4, int y0_q4, int y_step_q4,
                                        int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
                                    bd);
  } else {
    const int16x8_t filters = vld1q_s16(filter[y0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3 * src_stride;

    if (w == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23, t01, t23;

      s0 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s1 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s2 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s3 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s4 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s5 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s6 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;

      do {
        s7 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s8 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s9 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s10 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;

        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        t01 = vminq_u16(t01, max);
        t23 = vminq_u16(t23, max);

        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                           vld1_u16(dst + 1 * dst_stride));
        d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
                           vld1_u16(dst + 3 * dst_stride));
        d01 = vrhaddq_u16(d01, t01);
        d23 = vrhaddq_u16(d23, t23);

        vst1_u16(dst, vget_low_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_low_u16(d23));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d23));
        dst += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        h -= 4;
      } while (h > 0);
    } else {
      int height;
      const uint16_t *s;
      uint16_t *d;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        s = src;
        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        d = dst;
        height = h;

        do {
          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;

          __builtin_prefetch(d + 0 * dst_stride);
          __builtin_prefetch(d + 1 * dst_stride);
          __builtin_prefetch(d + 2 * dst_stride);
          __builtin_prefetch(d + 3 * dst_stride);
          __builtin_prefetch(s + 0 * src_stride);
          __builtin_prefetch(s + 1 * src_stride);
          __builtin_prefetch(s + 2 * src_stride);
          __builtin_prefetch(s + 3 * src_stride);
          t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          t3 =
              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          d0 = vld1q_u16(d + 0 * dst_stride);
          d1 = vld1q_u16(d + 1 * dst_stride);
          d2 = vld1q_u16(d + 2 * dst_stride);
          d3 = vld1q_u16(d + 3 * dst_stride);
          d0 = vrhaddq_u16(d0, t0);
          d1 = vrhaddq_u16(d1, t1);
          d2 = vrhaddq_u16(d2, t2);
          d3 = vrhaddq_u16(d3, t3);

          vst1q_u16(d, d0);
          d += dst_stride;
          vst1q_u16(d, d1);
          d += dst_stride;
          vst1q_u16(d, d2);
          d += dst_stride;
          vst1q_u16(d, d3);
          d += dst_stride;

          s0 = s4;
          s1 = s5;
          s2 = s6;
          s3 = s7;
          s4 = s8;
          s5 = s9;
          s6 = s10;
          height -= 4;
        } while (height > 0);
        src += 8;
        dst += 8;
        w -= 8;
      } while (w > 0);
    }
  }
}