/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_ports/mem.h"

// Note:
// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src).
// 2. After refactoring the shared code in the kernel loops into inline
// functions, decoder speed dropped significantly when built with gcc, so
// those parts have not been refactored for now.
// 3. For the horizontal convolution there is an alternative optimization that
// convolves a single row per loop iteration. For each row, 8 sample banks of
// 4 or 8 samples each are either read from memory at src, (src + 1), ...,
// (src + 7), or prepared with vector extract instructions (see the sketch
// below). This optimization is much faster in the speed unit test, but slows
// the whole decoder down by 5%.
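// For reference, an illustrative sketch only (not used in this file): the
// shifted sample banks mentioned in note 3 can be built from two adjacent
// 8-byte loads combined with vector extracts, e.g.
//   const uint8x8_t a = vld1_u8(src);          // src[0..7]
//   const uint8x8_t b = vld1_u8(src + 8);      // src[8..15]
//   const uint8x8_t bank1 = vext_u8(a, b, 1);  // src[1..8]
//   const uint8x8_t bank2 = vext_u8(a, b, 2);  // src[2..9], and so on.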

static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
                                const uint8x8_t s0, const uint8x8_t s1,
                                const uint8x8_t s2, const uint8x8_t s3,
                                const uint8x8_t s4, const uint8x8_t s5,
                                const uint8x8_t s6, const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int x0_q4,
                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h) {
  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
  uint8x8_t t0, t1, t2, t3;

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;
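  // When h == 4 the strip is filtered in the transposed domain, four output
  // columns per iteration across the width; otherwise rows are processed in
  // bands of eight, as a single 4-wide column when w == 4 or as 8x8 blocks.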

  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
        d1, d2, d3;
    int16x8_t tt0, tt1, tt2, tt3;

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);
    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    s0 = vget_low_s16(tt0);
    s1 = vget_low_s16(tt1);
    s2 = vget_low_s16(tt2);
    s3 = vget_low_s16(tt3);
    s4 = vget_high_s16(tt0);
    s5 = vget_high_s16(tt1);
    s6 = vget_high_s16(tt2);
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);
      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s7 = vget_low_s16(tt0);
      s8 = vget_low_s16(tt1);
      s9 = vget_low_s16(tt2);
      s10 = vget_low_s16(tt3);

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      transpose_u8_4x4(&d01, &d23);
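      // The 4x4 transpose leaves output rows 0 and 2 in d01 and rows 1 and 3
      // in d23, hence the vector/lane pairing in the stores below.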

      vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),
                    vreinterpret_u32_u8(d01), 0);
      vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),
                    vreinterpret_u32_u8(d23), 0);
      vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),
                    vreinterpret_u32_u8(d01), 1);
      vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),
                    vreinterpret_u32_u8(d23), 1);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int width;
    const uint8_t *s;
    uint8x8_t t4, t5, t6, t7;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    if (w == 4) {
      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);
        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        transpose_u8_8x4(&t0, &t1, &t2, &t3);
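        // After the 8x4 transpose each of t0..t3 holds two output rows: row i
        // in its low half and row i + 4 in its high half, hence lane 0 for the
        // first four stores below and lane 1 for the last four.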
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1);
        dst += dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      uint8_t *d;
      int16x8_t s11, s12, s13, s14;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                           filter4);
          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                           filter4);
          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                           filter4);
          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                           filter4);
          t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
                           filter4);
          t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
                           filter4);
          t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
                           filter4);
          t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
                           filter3, filter4);

          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}

void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h) {
  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
  uint8x8_t t0, t1, t2, t3;

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;

  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
        d1, d2, d3;
    int16x8_t tt0, tt1, tt2, tt3;
    uint32x4_t d0123 = vdupq_n_u32(0);

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);
    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    s0 = vget_low_s16(tt0);
    s1 = vget_low_s16(tt1);
    s2 = vget_low_s16(tt2);
    s3 = vget_low_s16(tt3);
    s4 = vget_high_s16(tt0);
    s5 = vget_high_s16(tt1);
    s6 = vget_high_s16(tt2);
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);
      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s7 = vget_low_s16(tt0);
      s8 = vget_low_s16(tt1);
      s9 = vget_low_s16(tt2);
      s10 = vget_low_s16(tt3);

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      transpose_u8_4x4(&d01, &d23);
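      // The 4x4 transpose above leaves output rows 0 and 2 in d01 and rows 1
      // and 3 in d23, so the dst rows are gathered below in that interleaved
      // order before the rounding average with vrhaddq_u8.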

      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
      d0123 = vreinterpretq_u32_u8(
          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));

      vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
      vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
      vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int width;
    const uint8_t *s;
    uint8x8_t t4, t5, t6, t7;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    if (w == 4) {
      uint32x4_t d0415 = vdupq_n_u32(0);
      uint32x4_t d2637 = vdupq_n_u32(0);
      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);
        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        transpose_u8_8x4(&t0, &t1, &t2, &t3);
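        // After the transpose, t0..t3 each hold output rows i and i + 4.
        // d0415 gathers dst rows 0, 4, 1 and 5 and d2637 rows 2, 6, 3 and 7 in
        // matching lane order, so one vrhaddq_u8 per vector averages four rows.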

        d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3);
        d0415 = vreinterpretq_u32_u8(
            vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1)));
        d2637 = vreinterpretq_u32_u8(
            vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3)));

        vst1q_lane_u32((uint32_t *)dst, d0415, 0);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 2);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 0);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 2);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 1);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 3);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 1);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 3);
        dst += dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      uint8_t *d;
      int16x8_t s11, s12, s13, s14;
      uint8x16_t d01, d23, d45, d67;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                           filter4);
          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                           filter4);
          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                           filter4);
          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                           filter4);
          t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
                           filter4);
          t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
                           filter4);
          t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
                           filter4);
          t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
                           filter3, filter4);

          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

          d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
                            vld1_u8(d + 1 * dst_stride));
          d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
                            vld1_u8(d + 3 * dst_stride));
          d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride),
                            vld1_u8(d + 5 * dst_stride));
          d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride),
                            vld1_u8(d + 7 * dst_stride));
          d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1));
          d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3));
          d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5));
          d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7));

          store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01),
                       vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45),
                       vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67));

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}

void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;
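  // For w == 4 the whole block is filtered four pixels wide, four rows per
  // iteration; wider blocks are processed in 8-pixel-wide columns, four rows
  // at a time.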

  if (w == 4) {
    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
      dst += dst_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
    } while (h > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x8_t t0, t1, t2, t3;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        vst1_u8(d, t0);
        d += dst_stride;
        vst1_u8(d, t1);
        d += dst_stride;
        vst1_u8(d, t2);
        d += dst_stride;
        vst1_u8(d, t3);
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}

void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;

  if (w == 4) {
    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
    uint32x4_t d0123 = vdupq_n_u32(0);

    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);

      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
      d0123 = vreinterpretq_u32_u8(
          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));

      vst1q_lane_u32((uint32_t *)dst, d0123, 0);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 1);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 2);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 3);
      dst += dst_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
    } while (h > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x8_t t0, t1, t2, t3;
    uint8x16_t d01, d23, dd01, dd23;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);
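        // vrhaddq_u8 below gives the rounding average of the new rows with the
        // rows already in dst, which is the "avg" part of this variant.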

        d01 = vcombine_u8(t0, t1);
        d23 = vcombine_u8(t2, t3);
        dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
                           vld1_u8(d + 1 * dst_stride));
        dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
                           vld1_u8(d + 3 * dst_stride));
        dd01 = vrhaddq_u8(dd01, d01);
        dd23 = vrhaddq_u8(dd23, d23);

        vst1_u8(d, vget_low_u8(dd01));
        d += dst_stride;
        vst1_u8(d, vget_high_u8(dd01));
        d += dst_stride;
        vst1_u8(d, vget_low_u8(dd23));
        d += dst_stride;
        vst1_u8(d, vget_high_u8(dd23));
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}