/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_ports/mem.h"

// Note:
// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src).
// 2. After refactoring the shared code in the kernel loops into inline
// functions, decoder speed dropped considerably when built with gcc, so those
// parts are left unrefactored for now.
// 3. For the horizontal convolve there is an alternative optimization that
// convolves a single row per loop iteration. For each row, 8 sample banks with
// 4 or 8 samples each are read from memory: src, (src+1), (src+2), (src+3),
// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract
// instructions (see the sketch below). This is much faster in the speed unit
// test, but slowed the whole decoder down by 5%.

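// A minimal sketch of the single-row scheme described in note 3 above; it is
// not used by the kernels in this file. The 8 sample banks for one row of 8
// output pixels are built with vector extract instructions from one pair of
// 8-byte loads instead of 8 separate loads. The function name is hypothetical,
// and the loads read 16 bytes even though only src[0..14] are required.
static INLINE uint8x8_t convolve8_8_single_row(const uint8_t *src,
                                               const int16x8_t filters,
                                               const int16x8_t filter3,
                                               const int16x8_t filter4) {
  // Widen src[0..7] and src[8..15] to 16 bits so vextq_s16 can slide an
  // 8-sample window across them.
  const int16x8_t a = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src)));
  const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 8)));
  const int16x8_t s0 = a;                   // src[0..7]
  const int16x8_t s1 = vextq_s16(a, b, 1);  // src[1..8]
  const int16x8_t s2 = vextq_s16(a, b, 2);  // src[2..9]
  const int16x8_t s3 = vextq_s16(a, b, 3);  // src[3..10]
  const int16x8_t s4 = vextq_s16(a, b, 4);  // src[4..11]
  const int16x8_t s5 = vextq_s16(a, b, 5);  // src[5..12]
  const int16x8_t s6 = vextq_s16(a, b, 6);  // src[6..13]
  const int16x8_t s7 = vextq_s16(a, b, 7);  // src[7..14]
  return convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                     filter4);
}
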
static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
                                const uint8x8_t s0, const uint8x8_t s1,
                                const uint8x8_t s2, const uint8x8_t s3,
                                const uint8x8_t s4, const uint8x8_t s5,
                                const uint8x8_t s6, const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

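// Horizontal 8-tap convolution. Each block of input rows is transposed so
// that the eight taps of every output pixel land in separate vectors, filtered
// with convolve8_4()/convolve8_8(), then transposed back before being stored.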
void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int x0_q4,
                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h) {
  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
  uint8x8_t t0, t1, t2, t3;

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

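  // The 8-tap filter for output pixel x reads src[x - 3] .. src[x + 4], so
  // start 3 samples to the left.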
  src -= 3;

  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
        d1, d2, d3;
    int16x8_t tt0, tt1, tt2, tt3;

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);
    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    s0 = vget_low_s16(tt0);
    s1 = vget_low_s16(tt1);
    s2 = vget_low_s16(tt2);
    s3 = vget_low_s16(tt3);
    s4 = vget_high_s16(tt0);
    s5 = vget_high_s16(tt1);
    s6 = vget_high_s16(tt2);
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);
      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s7 = vget_low_s16(tt0);
      s8 = vget_low_s16(tt1);
      s9 = vget_low_s16(tt2);
      s10 = vget_low_s16(tt3);

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      transpose_u8_4x4(&d01, &d23);

      vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),
                    vreinterpret_u32_u8(d01), 0);
      vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),
                    vreinterpret_u32_u8(d23), 0);
      vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),
                    vreinterpret_u32_u8(d01), 1);
      vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),
                    vreinterpret_u32_u8(d23), 1);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int width;
    const uint8_t *s;
    uint8x8_t t4, t5, t6, t7;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    if (w == 4) {
      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);
        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        transpose_u8_8x4(&t0, &t1, &t2, &t3);
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1);
        dst += dst_stride;
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1);
        dst += dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      uint8_t *d;
      int16x8_t s11, s12, s13, s14;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                           filter4);
          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                           filter4);
          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                           filter4);
          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                           filter4);
          t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
                           filter4);
          t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
                           filter4);
          t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
                           filter4);
          t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
                           filter3, filter4);

          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}

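// Identical to vpx_convolve8_horiz_neon() except that the filtered result is
// combined with the pixels already in dst using a rounding average
// (vrhaddq_u8).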
void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h) {
  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
  uint8x8_t t0, t1, t2, t3;

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;

  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
        d1, d2, d3;
    int16x8_t tt0, tt1, tt2, tt3;
    uint32x4_t d0123 = vdupq_n_u32(0);

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);
    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    s0 = vget_low_s16(tt0);
    s1 = vget_low_s16(tt1);
    s2 = vget_low_s16(tt2);
    s3 = vget_low_s16(tt3);
    s4 = vget_high_s16(tt0);
    s5 = vget_high_s16(tt1);
    s6 = vget_high_s16(tt2);
    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);
      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s7 = vget_low_s16(tt0);
      s8 = vget_low_s16(tt1);
      s9 = vget_low_s16(tt2);
      s10 = vget_low_s16(tt3);

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      transpose_u8_4x4(&d01, &d23);

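      // Gather the dst rows in lane order 0, 2, 1, 3: after transpose_u8_4x4()
      // d01 holds output rows 0 and 2 and d23 holds rows 1 and 3, so
      // vcombine_u8(d01, d23) is ordered {row 0, row 2, row 1, row 3}.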
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
      d0123 = vreinterpretq_u32_u8(
          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));

      vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
      vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
      vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int width;
    const uint8_t *s;
    uint8x8_t t4, t5, t6, t7;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    if (w == 4) {
      uint32x4_t d0415 = vdupq_n_u32(0);
      uint32x4_t d2637 = vdupq_n_u32(0);
      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);
        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        transpose_u8_8x4(&t0, &t1, &t2, &t3);

        d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1);
        d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1);
        d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3);
        d0415 = vreinterpretq_u32_u8(
            vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1)));
        d2637 = vreinterpretq_u32_u8(
            vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3)));

        vst1q_lane_u32((uint32_t *)dst, d0415, 0);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 2);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 0);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 2);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 1);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d0415, 3);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 1);
        dst += dst_stride;
        vst1q_lane_u32((uint32_t *)dst, d2637, 3);
        dst += dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      uint8_t *d;
      int16x8_t s11, s12, s13, s14;
      uint8x16_t d01, d23, d45, d67;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                           filter4);
          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                           filter4);
          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                           filter4);
          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                           filter4);
          t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
                           filter4);
          t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
                           filter4);
          t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
                           filter4);
          t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
                           filter3, filter4);

          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

          d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
                            vld1_u8(d + 1 * dst_stride));
          d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
                            vld1_u8(d + 3 * dst_stride));
          d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride),
                            vld1_u8(d + 5 * dst_stride));
          d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride),
                            vld1_u8(d + 7 * dst_stride));
          d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1));
          d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3));
          d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5));
          d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7));

          store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01),
                       vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45),
                       vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67));

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width > 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}

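// Vertical 8-tap convolution. The taps of each output pixel come from eight
// consecutive rows, so each loaded row feeds one tap input directly and no
// transposes are needed.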
void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

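  // The 8-tap filter for output row y reads rows y - 3 .. y + 4, so start
  // 3 rows above.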
  src -= 3 * src_stride;

  if (w == 4) {
    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
      dst += dst_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
    } while (h > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x8_t t0, t1, t2, t3;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        vst1_u8(d, t0);
        d += dst_stride;
        vst1_u8(d, t1);
        d += dst_stride;
        vst1_u8(d, t2);
        d += dst_stride;
        vst1_u8(d, t3);
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}

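// Identical to vpx_convolve8_vert_neon() except that the filtered result is
// combined with the pixels already in dst using a rounding average
// (vrhaddq_u8).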
void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;

  if (w == 4) {
    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
    uint32x4_t d0123 = vdupq_n_u32(0);

    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
    src += src_stride;

    do {
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
      src += src_stride;

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                       filter4);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                       filter4);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                       filter4);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                       filter4);

      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);

      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2);
      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
      d0123 = vreinterpretq_u32_u8(
          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));

      vst1q_lane_u32((uint32_t *)dst, d0123, 0);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 1);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 2);
      dst += dst_stride;
      vst1q_lane_u32((uint32_t *)dst, d0123, 3);
      dst += dst_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      h -= 4;
    } while (h > 0);
  } else {
    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x8_t t0, t1, t2, t3;
    uint8x16_t d01, d23, dd01, dd23;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);
      s = src;
      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
      s += src_stride;
      d = dst;
      height = h;

      do {
        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;
        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
        s += src_stride;

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);
        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
                         filter4);
        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
                         filter4);
        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
                         filter4);
        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
                         filter4);

        d01 = vcombine_u8(t0, t1);
        d23 = vcombine_u8(t2, t3);
        dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
                           vld1_u8(d + 1 * dst_stride));
        dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
                           vld1_u8(d + 3 * dst_stride));
        dd01 = vrhaddq_u8(dd01, d01);
        dd23 = vrhaddq_u8(dd23, d23);

        vst1_u8(d, vget_low_u8(dd01));
        d += dst_stride;
        vst1_u8(d, vget_high_u8(dd01));
        d += dst_stride;
        vst1_u8(d, vget_low_u8(dd23));
        d += dst_stride;
        vst1_u8(d, vget_high_u8(dd23));
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      } while (height > 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w > 0);
  }
}