/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <string.h>
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_ports/mem.h"

static const int8_t vp8_sub_pel_filters[8][8] = {
  { 0, 0, 128, 0, 0, 0, 0, 0 },     /* note that 1/8 pel positions are */
  { 0, -6, 123, 12, -1, 0, 0, 0 },  /*    just as per alpha -0.5 bicubic */
  { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
  { 0, -9, 93, 50, -6, 0, 0, 0 },
  { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
  { 0, -6, 50, 93, -9, 0, 0, 0 },
  { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
  { 0, -1, 12, 123, -6, 0, 0, 0 },
};

// This table is derived from vp8/common/filter.c:vp8_sub_pel_filters.
// Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive.
// Elements 1 and 4 are either 0 or negative. The code accounts for this with
// multiply/accumulates which either add or subtract as needed. The other
// functions will be updated to use this table later.
// It is also expanded to 8 elements to allow loading into 64 bit neon
// registers.
static const uint8_t abs_filters[8][8] = {
  { 0, 0, 128, 0, 0, 0, 0, 0 },   { 0, 6, 123, 12, 1, 0, 0, 0 },
  { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 },
  { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 },
  { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 },
};
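
// For reference, every NEON path in this file computes the same plain 6-tap
// filter arithmetic. A minimal scalar sketch is kept below under #if 0 so it
// is never compiled; the helper name and its 'stride' parameter are purely
// illustrative and not part of the library.
#if 0
static unsigned char sixtap_scalar(const unsigned char *s, int stride,
                                   const uint8_t *f) {  // f: abs_filters row
  // Taps 1 and 4 of vp8_sub_pel_filters are non-positive, so with the
  // absolute-value table they are subtracted rather than added.
  int sum = s[-2 * stride] * f[0] - s[-1 * stride] * f[1] + s[0] * f[2] +
            s[1 * stride] * f[3] - s[2 * stride] * f[4] + s[3 * stride] * f[5];
  sum = (sum + 64) >> 7;  // Round and shift, matching vqrshrun_n_s16(x, 7).
  if (sum < 0) sum = 0;   // Saturate to the 8-bit pixel range.
  if (sum > 255) sum = 255;
  return (unsigned char)sum;
}
#endif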
42 
static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
  return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
}
46 
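// Helpers for the 4x4 first pass: 'a' and 'b' each hold two 8-byte rows.
// vzip_u32() gathers the low 4 pixels of both rows into one d-register so a
// single vmlal_u8/vmlsl_u8 applies one filter tap to two rows per
// accumulator (four rows across *c and *d).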
static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
                                         const uint8x8_t filter, uint16x8_t *c,
                                         uint16x8_t *d) {
50   const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
51                                        vreinterpret_u32_u8(vget_high_u8(a)));
52   const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
53                                        vreinterpret_u32_u8(vget_high_u8(b)));
54   *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
55   *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
56 }
57 
static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b,
                                         const uint8x8_t filter, uint16x8_t *c,
                                         uint16x8_t *d) {
61   const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
62                                        vreinterpret_u32_u8(vget_high_u8(a)));
63   const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
64                                        vreinterpret_u32_u8(vget_high_u8(b)));
65   *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
66   *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
67 }
68 
static INLINE void yonly4x4(const unsigned char *src, int src_stride,
                            int filter_offset, unsigned char *dst,
                            int dst_stride) {
72   uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
73   uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
74   uint16x8_t c0, c1, c2, c3;
75   int16x8_t d0, d1;
76   uint8x8_t e0, e1;
77 
78   const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]);
79   const uint8x8_t filter0 = vdup_lane_u8(filter, 0);
80   const uint8x8_t filter1 = vdup_lane_u8(filter, 1);
81   const uint8x8_t filter2 = vdup_lane_u8(filter, 2);
82   const uint8x8_t filter3 = vdup_lane_u8(filter, 3);
83   const uint8x8_t filter4 = vdup_lane_u8(filter, 4);
84   const uint8x8_t filter5 = vdup_lane_u8(filter, 5);
85 
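  // Step back two rows: the 6-tap vertical filter needs rows -2..+3 around
  // each output row, i.e. 9 source rows for 4 output rows.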
86   src -= src_stride * 2;
  // Shift the even rows to allow using 'vext' to combine the vectors. armv8
  // has vcopy_lane which would be interesting. This started as just a
  // horrible workaround for clang adding alignment hints to 32-bit loads:
  // https://llvm.org/bugs/show_bug.cgi?id=24421
  // But it turns out to be almost identical to casting the loads.
92   a0 = load_and_shift(src);
93   src += src_stride;
94   a1 = vld1_u8(src);
95   src += src_stride;
96   a2 = load_and_shift(src);
97   src += src_stride;
98   a3 = vld1_u8(src);
99   src += src_stride;
100   a4 = load_and_shift(src);
101   src += src_stride;
102   a5 = vld1_u8(src);
103   src += src_stride;
104   a6 = load_and_shift(src);
105   src += src_stride;
106   a7 = vld1_u8(src);
107   src += src_stride;
108   a8 = vld1_u8(src);
109 
110   // Combine the rows so we can operate on 8 at a time.
111   b0 = vext_u8(a0, a1, 4);
112   b2 = vext_u8(a2, a3, 4);
113   b4 = vext_u8(a4, a5, 4);
114   b6 = vext_u8(a6, a7, 4);
115   b8 = a8;
116 
117   // To keep with the 8-at-a-time theme, combine *alternate* rows. This
118   // allows combining the odd rows with the even.
119   b1 = vext_u8(b0, b2, 4);
120   b3 = vext_u8(b2, b4, 4);
121   b5 = vext_u8(b4, b6, 4);
122   b7 = vext_u8(b6, b8, 4);
123 
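  // c0 and c2 together accumulate output rows 0 and 1, c1 and c3 rows 2 and
  // 3; splitting each output across two accumulators lets the final combine
  // use saturating adds.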
124   // Multiply and expand to 16 bits.
125   c0 = vmull_u8(b0, filter0);
126   c1 = vmull_u8(b2, filter0);
127   c2 = vmull_u8(b5, filter5);
128   c3 = vmull_u8(b7, filter5);
129 
130   // Multiply, subtract and accumulate for filters 1 and 4 (the negative
131   // ones).
132   c0 = vmlsl_u8(c0, b4, filter4);
133   c1 = vmlsl_u8(c1, b6, filter4);
134   c2 = vmlsl_u8(c2, b1, filter1);
135   c3 = vmlsl_u8(c3, b3, filter1);
136 
  // Add the remaining positive taps. vmlal should really return a signed
  // type. It's doing signed math internally, as evidenced by the fact we can
  // do subtractions followed by more additions. Ideally we could use
  // vqmlal/vqmlsl but those instructions don't exist. Might be able to
  // shoehorn vqdmlal/vqdmlsl in here but it would take some effort.
142   c0 = vmlal_u8(c0, b2, filter2);
143   c1 = vmlal_u8(c1, b4, filter2);
144   c2 = vmlal_u8(c2, b3, filter3);
145   c3 = vmlal_u8(c3, b5, filter3);
146 
147   // Use signed saturation math because vmlsl may have left some negative
148   // numbers in there.
149   d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
150   d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
151 
152   // Use signed again because numbers like -200 need to be saturated to 0.
153   e0 = vqrshrun_n_s16(d0, 7);
154   e1 = vqrshrun_n_s16(d1, 7);
155 
156   store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1));
157 }
158 
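// 4x4 six-tap prediction. xoffset selects the horizontal (first-pass) filter
// and yoffset the vertical (second-pass) filter; a zero offset skips that
// pass. When both passes run, the first pass filters 4 + 5 = 9 rows into
// registers and the second pass filters them vertically, mirroring
// yonly4x4() above.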
void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
                                int xoffset, int yoffset,
                                unsigned char *dst_ptr, int dst_pitch) {
162   uint8x16_t s0, s1, s2, s3, s4;
163   uint64x2_t s01, s23;
164   // Variables to hold src[] elements for the given filter[]
165   uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5;
166   uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4;
167   uint8x16_t s01_f0, s23_f0;
168   uint64x2_t s01_f3, s23_f3;
169   uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q;
170   // Accumulator variables.
171   uint16x8_t d0123, d4567, d89;
172   uint16x8_t d0123_a, d4567_a, d89_a;
173   int16x8_t e0123, e4567, e89;
174   // Second pass intermediates.
175   uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
176   uint16x8_t c0, c1, c2, c3;
177   int16x8_t d0, d1;
178   uint8x8_t e0, e1;
179   uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5;
180 
181   if (xoffset == 0) {  // Second pass only.
182     yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch);
183     return;
184   }
185 
186   if (yoffset == 0) {  // First pass only.
187     src_ptr -= 2;
188   } else {  // Add context for the second pass. 2 extra lines on top.
189     src_ptr -= 2 + (src_pixels_per_line * 2);
190   }
191 
192   filter = vld1_u8(abs_filters[xoffset]);
193   filter0 = vdup_lane_u8(filter, 0);
194   filter1 = vdup_lane_u8(filter, 1);
195   filter2 = vdup_lane_u8(filter, 2);
196   filter3 = vdup_lane_u8(filter, 3);
197   filter4 = vdup_lane_u8(filter, 4);
198   filter5 = vdup_lane_u8(filter, 5);
199 
200   // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of
201   // garbage. So much effort for that last single bit.
202   // The low values of each pair are for filter0.
203   s0 = vld1q_u8(src_ptr);
204   src_ptr += src_pixels_per_line;
205   s1 = vld1q_u8(src_ptr);
206   src_ptr += src_pixels_per_line;
207   s2 = vld1q_u8(src_ptr);
208   src_ptr += src_pixels_per_line;
209   s3 = vld1q_u8(src_ptr);
210   src_ptr += src_pixels_per_line;
211 
212   // Shift to extract values for filter[5]
213   // If src[] is 0, this puts:
214   // 3 4 5 6 7 8 9 10 in s0_f5
215   // Can't use vshr.u64 because it crosses the double word boundary.
216   s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
217   s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
218   s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
219   s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
220 
221   s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
222   s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
223 
224   s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
225   s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
226   d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
227   d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
228 
229   // Keep original src data as 64 bits to simplify shifting and extracting.
230   s01 = vreinterpretq_u64_u8(s01_f0);
231   s23 = vreinterpretq_u64_u8(s23_f0);
232 
233   // 3 4 5 6 * filter0
234   filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
235 
236   // Shift over one to use -1, 0, 1, 2 for filter1
237   // -1 0 1 2 * filter1
238   filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
239                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
240                         &d0123, &d4567);
241 
242   // 2 3 4 5 * filter4
243   filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
244                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
245                         &d0123, &d4567);
246 
247   // 0 1 2 3 * filter2
248   filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
249                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
250                         &d0123, &d4567);
251 
252   // 1 2 3 4 * filter3
253   s01_f3 = vshrq_n_u64(s01, 24);
254   s23_f3 = vshrq_n_u64(s23, 24);
255   s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
256                       vreinterpret_u32_u64(vget_high_u64(s01_f3)));
257   s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
258                       vreinterpret_u32_u64(vget_high_u64(s23_f3)));
259   // Accumulate into different registers so it can use saturated addition.
260   d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
261   d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
262 
263   e0123 =
264       vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
265   e4567 =
266       vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
267 
268   // Shift and narrow.
269   b0 = vqrshrun_n_s16(e0123, 7);
270   b2 = vqrshrun_n_s16(e4567, 7);
271 
272   if (yoffset == 0) {  // firstpass_filter4x4_only
273     store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2));
274     return;
275   }
276 
277   // Load additional context when doing both filters.
278   s0 = vld1q_u8(src_ptr);
279   src_ptr += src_pixels_per_line;
280   s1 = vld1q_u8(src_ptr);
281   src_ptr += src_pixels_per_line;
282   s2 = vld1q_u8(src_ptr);
283   src_ptr += src_pixels_per_line;
284   s3 = vld1q_u8(src_ptr);
285   src_ptr += src_pixels_per_line;
286   s4 = vld1q_u8(src_ptr);
287 
288   s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
289   s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
290   s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
291   s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
292   s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5);
293 
294   // 3 4 5 6 * filter0
295   s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
296   s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
297 
298   s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
299   s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
300   // But this time instead of 16 pixels to filter, there are 20. So an extra
301   // run with a doubleword register.
302   d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
303   d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
304   d89 = vmull_u8(s4_f5, filter5);
305 
306   // Save a copy as u64 for shifting.
307   s01 = vreinterpretq_u64_u8(s01_f0);
308   s23 = vreinterpretq_u64_u8(s23_f0);
309 
310   filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
311   d89 = vmlal_u8(d89, vget_low_u8(s4), filter0);
312 
313   filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
314                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
315                         &d0123, &d4567);
316   s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1);
317   d89 = vmlsl_u8(d89, s4_f1, filter1);
318 
319   filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
320                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
321                         &d0123, &d4567);
322   s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4);
323   d89 = vmlsl_u8(d89, s4_f4, filter4);
324 
325   filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
326                         vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
327                         &d0123, &d4567);
328   s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2);
329   d89 = vmlal_u8(d89, s4_f2, filter2);
330 
331   s01_f3 = vshrq_n_u64(s01, 24);
332   s23_f3 = vshrq_n_u64(s23, 24);
333   s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
334                       vreinterpret_u32_u64(vget_high_u64(s01_f3)));
335   s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
336                       vreinterpret_u32_u64(vget_high_u64(s23_f3)));
337   s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3);
338   d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
339   d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
340   d89_a = vmull_u8(s4_f3, filter3);
341 
342   e0123 =
343       vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
344   e4567 =
345       vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
346   e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a));
347 
348   b4 = vqrshrun_n_s16(e0123, 7);
349   b6 = vqrshrun_n_s16(e4567, 7);
350   b8 = vqrshrun_n_s16(e89, 7);
351 
352   // Second pass: 4x4
353   filter = vld1_u8(abs_filters[yoffset]);
354   filter0 = vdup_lane_u8(filter, 0);
355   filter1 = vdup_lane_u8(filter, 1);
356   filter2 = vdup_lane_u8(filter, 2);
357   filter3 = vdup_lane_u8(filter, 3);
358   filter4 = vdup_lane_u8(filter, 4);
359   filter5 = vdup_lane_u8(filter, 5);
360 
361   b1 = vext_u8(b0, b2, 4);
362   b3 = vext_u8(b2, b4, 4);
363   b5 = vext_u8(b4, b6, 4);
364   b7 = vext_u8(b6, b8, 4);
365 
366   c0 = vmull_u8(b0, filter0);
367   c1 = vmull_u8(b2, filter0);
368   c2 = vmull_u8(b5, filter5);
369   c3 = vmull_u8(b7, filter5);
370 
371   c0 = vmlsl_u8(c0, b4, filter4);
372   c1 = vmlsl_u8(c1, b6, filter4);
373   c2 = vmlsl_u8(c2, b1, filter1);
374   c3 = vmlsl_u8(c3, b3, filter1);
375 
376   c0 = vmlal_u8(c0, b2, filter2);
377   c1 = vmlal_u8(c1, b4, filter2);
378   c2 = vmlal_u8(c2, b3, filter3);
379   c3 = vmlal_u8(c3, b5, filter3);
380 
381   d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
382   d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
383 
384   e0 = vqrshrun_n_s16(d0, 7);
385   e1 = vqrshrun_n_s16(d1, 7);
386 
387   store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
388 }
389 
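// 8x4 six-tap prediction. Each row fills a full d-register here, so the
// first pass forms the shifted taps with vext_u8() on 16-byte loads rather
// than the 4x4 packing tricks above.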
void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
                                int xoffset, int yoffset,
                                unsigned char *dst_ptr, int dst_pitch) {
393   unsigned char *src;
394   uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
395   uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
396   uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
397   int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
398   uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
399   uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
400   int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
401   int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
402   uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
403 
404   if (xoffset == 0) {  // secondpass_filter8x4_only
405     // load second_pass filter
406     dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
407     d0s8 = vdup_lane_s8(dtmps8, 0);
408     d1s8 = vdup_lane_s8(dtmps8, 1);
409     d2s8 = vdup_lane_s8(dtmps8, 2);
410     d3s8 = vdup_lane_s8(dtmps8, 3);
411     d4s8 = vdup_lane_s8(dtmps8, 4);
412     d5s8 = vdup_lane_s8(dtmps8, 5);
413     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
414     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
415     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
416     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
417     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
418     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
419 
420     // load src data
421     src = src_ptr - src_pixels_per_line * 2;
422     d22u8 = vld1_u8(src);
423     src += src_pixels_per_line;
424     d23u8 = vld1_u8(src);
425     src += src_pixels_per_line;
426     d24u8 = vld1_u8(src);
427     src += src_pixels_per_line;
428     d25u8 = vld1_u8(src);
429     src += src_pixels_per_line;
430     d26u8 = vld1_u8(src);
431     src += src_pixels_per_line;
432     d27u8 = vld1_u8(src);
433     src += src_pixels_per_line;
434     d28u8 = vld1_u8(src);
435     src += src_pixels_per_line;
436     d29u8 = vld1_u8(src);
437     src += src_pixels_per_line;
438     d30u8 = vld1_u8(src);
439 
440     q3u16 = vmull_u8(d22u8, d0u8);
441     q4u16 = vmull_u8(d23u8, d0u8);
442     q5u16 = vmull_u8(d24u8, d0u8);
443     q6u16 = vmull_u8(d25u8, d0u8);
444 
445     q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
446     q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
447     q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
448     q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
449 
450     q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
451     q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
452     q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
453     q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
454 
455     q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
456     q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
457     q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
458     q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
459 
460     q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
461     q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
462     q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
463     q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
464 
465     q7u16 = vmull_u8(d25u8, d3u8);
466     q8u16 = vmull_u8(d26u8, d3u8);
467     q9u16 = vmull_u8(d27u8, d3u8);
468     q10u16 = vmull_u8(d28u8, d3u8);
469 
470     q3s16 = vreinterpretq_s16_u16(q3u16);
471     q4s16 = vreinterpretq_s16_u16(q4u16);
472     q5s16 = vreinterpretq_s16_u16(q5u16);
473     q6s16 = vreinterpretq_s16_u16(q6u16);
474     q7s16 = vreinterpretq_s16_u16(q7u16);
475     q8s16 = vreinterpretq_s16_u16(q8u16);
476     q9s16 = vreinterpretq_s16_u16(q9u16);
477     q10s16 = vreinterpretq_s16_u16(q10u16);
478 
479     q7s16 = vqaddq_s16(q7s16, q3s16);
480     q8s16 = vqaddq_s16(q8s16, q4s16);
481     q9s16 = vqaddq_s16(q9s16, q5s16);
482     q10s16 = vqaddq_s16(q10s16, q6s16);
483 
484     d6u8 = vqrshrun_n_s16(q7s16, 7);
485     d7u8 = vqrshrun_n_s16(q8s16, 7);
486     d8u8 = vqrshrun_n_s16(q9s16, 7);
487     d9u8 = vqrshrun_n_s16(q10s16, 7);
488 
489     vst1_u8(dst_ptr, d6u8);
490     dst_ptr += dst_pitch;
491     vst1_u8(dst_ptr, d7u8);
492     dst_ptr += dst_pitch;
493     vst1_u8(dst_ptr, d8u8);
494     dst_ptr += dst_pitch;
495     vst1_u8(dst_ptr, d9u8);
496     return;
497   }
498 
499   // load first_pass filter
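  // Taps 1 and 4 are non-positive, so take abs() of the signed filter and
  // apply those two taps with vmlsl_u8 (multiply-subtract) below.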
500   dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
501   d0s8 = vdup_lane_s8(dtmps8, 0);
502   d1s8 = vdup_lane_s8(dtmps8, 1);
503   d2s8 = vdup_lane_s8(dtmps8, 2);
504   d3s8 = vdup_lane_s8(dtmps8, 3);
505   d4s8 = vdup_lane_s8(dtmps8, 4);
506   d5s8 = vdup_lane_s8(dtmps8, 5);
507   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
508   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
509   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
510   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
511   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
512   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
513 
  // First pass: output_height lines x output_width columns (9x8 when a
  // second pass follows).
  if (yoffset == 0)  // firstpass_filter8x4_only
    src = src_ptr - 2;
517   else
518     src = src_ptr - 2 - (src_pixels_per_line * 2);
519   q3u8 = vld1q_u8(src);
520   src += src_pixels_per_line;
521   q4u8 = vld1q_u8(src);
522   src += src_pixels_per_line;
523   q5u8 = vld1q_u8(src);
524   src += src_pixels_per_line;
525   q6u8 = vld1q_u8(src);
526 
527   q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
528   q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
529   q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
530   q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
531 
532   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
533   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
534   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
535   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
536 
537   q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
538   q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
539   q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
540   q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
541 
542   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
543   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
544   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
545   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
546 
547   q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
548   q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
549   q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
550   q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
551 
552   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
553   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
554   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
555   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
556 
557   q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
558   q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
559   q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
560   q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
561 
562   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
563   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
564   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
565   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
566 
567   q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
568   q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
569   q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
570   q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
571 
572   d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
573   d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
574   d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
575   d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
576 
577   q3u16 = vmull_u8(d28u8, d3u8);
578   q4u16 = vmull_u8(d29u8, d3u8);
579   q5u16 = vmull_u8(d30u8, d3u8);
580   q6u16 = vmull_u8(d31u8, d3u8);
581 
582   q3s16 = vreinterpretq_s16_u16(q3u16);
583   q4s16 = vreinterpretq_s16_u16(q4u16);
584   q5s16 = vreinterpretq_s16_u16(q5u16);
585   q6s16 = vreinterpretq_s16_u16(q6u16);
586   q7s16 = vreinterpretq_s16_u16(q7u16);
587   q8s16 = vreinterpretq_s16_u16(q8u16);
588   q9s16 = vreinterpretq_s16_u16(q9u16);
589   q10s16 = vreinterpretq_s16_u16(q10u16);
590 
591   q7s16 = vqaddq_s16(q7s16, q3s16);
592   q8s16 = vqaddq_s16(q8s16, q4s16);
593   q9s16 = vqaddq_s16(q9s16, q5s16);
594   q10s16 = vqaddq_s16(q10s16, q6s16);
595 
596   d22u8 = vqrshrun_n_s16(q7s16, 7);
597   d23u8 = vqrshrun_n_s16(q8s16, 7);
598   d24u8 = vqrshrun_n_s16(q9s16, 7);
599   d25u8 = vqrshrun_n_s16(q10s16, 7);
600 
601   if (yoffset == 0) {  // firstpass_filter8x4_only
602     vst1_u8(dst_ptr, d22u8);
603     dst_ptr += dst_pitch;
604     vst1_u8(dst_ptr, d23u8);
605     dst_ptr += dst_pitch;
606     vst1_u8(dst_ptr, d24u8);
607     dst_ptr += dst_pitch;
608     vst1_u8(dst_ptr, d25u8);
609     return;
610   }
611 
  // First pass on the remaining 5 lines of data.
613   src += src_pixels_per_line;
614   q3u8 = vld1q_u8(src);
615   src += src_pixels_per_line;
616   q4u8 = vld1q_u8(src);
617   src += src_pixels_per_line;
618   q5u8 = vld1q_u8(src);
619   src += src_pixels_per_line;
620   q6u8 = vld1q_u8(src);
621   src += src_pixels_per_line;
622   q7u8 = vld1q_u8(src);
623 
624   q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
625   q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
626   q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
627   q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
628   q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
629 
630   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
631   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
632   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
633   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
634   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
635 
636   q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
637   q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
638   q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
639   q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
640   q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
641 
642   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
643   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
644   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
645   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
646   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
647 
648   q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
649   q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
650   q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
651   q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
652   q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
653 
654   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
655   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
656   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
657   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
658   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
659 
660   q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
661   q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
662   q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
663   q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
664   q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
665 
666   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
667   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
668   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
669   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
670   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
671 
672   q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
673   q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
674   q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
675   q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
676   q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
677 
678   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
679   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
680   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
681   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
682   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
683 
684   q3u16 = vmull_u8(d27u8, d3u8);
685   q4u16 = vmull_u8(d28u8, d3u8);
686   q5u16 = vmull_u8(d29u8, d3u8);
687   q6u16 = vmull_u8(d30u8, d3u8);
688   q7u16 = vmull_u8(d31u8, d3u8);
689 
690   q3s16 = vreinterpretq_s16_u16(q3u16);
691   q4s16 = vreinterpretq_s16_u16(q4u16);
692   q5s16 = vreinterpretq_s16_u16(q5u16);
693   q6s16 = vreinterpretq_s16_u16(q6u16);
694   q7s16 = vreinterpretq_s16_u16(q7u16);
695   q8s16 = vreinterpretq_s16_u16(q8u16);
696   q9s16 = vreinterpretq_s16_u16(q9u16);
697   q10s16 = vreinterpretq_s16_u16(q10u16);
698   q11s16 = vreinterpretq_s16_u16(q11u16);
699   q12s16 = vreinterpretq_s16_u16(q12u16);
700 
701   q8s16 = vqaddq_s16(q8s16, q3s16);
702   q9s16 = vqaddq_s16(q9s16, q4s16);
703   q10s16 = vqaddq_s16(q10s16, q5s16);
704   q11s16 = vqaddq_s16(q11s16, q6s16);
705   q12s16 = vqaddq_s16(q12s16, q7s16);
706 
707   d26u8 = vqrshrun_n_s16(q8s16, 7);
708   d27u8 = vqrshrun_n_s16(q9s16, 7);
709   d28u8 = vqrshrun_n_s16(q10s16, 7);
710   d29u8 = vqrshrun_n_s16(q11s16, 7);
711   d30u8 = vqrshrun_n_s16(q12s16, 7);
712 
713   // Second pass: 8x4
714   dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
715   d0s8 = vdup_lane_s8(dtmps8, 0);
716   d1s8 = vdup_lane_s8(dtmps8, 1);
717   d2s8 = vdup_lane_s8(dtmps8, 2);
718   d3s8 = vdup_lane_s8(dtmps8, 3);
719   d4s8 = vdup_lane_s8(dtmps8, 4);
720   d5s8 = vdup_lane_s8(dtmps8, 5);
721   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
722   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
723   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
724   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
725   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
726   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
727 
728   q3u16 = vmull_u8(d22u8, d0u8);
729   q4u16 = vmull_u8(d23u8, d0u8);
730   q5u16 = vmull_u8(d24u8, d0u8);
731   q6u16 = vmull_u8(d25u8, d0u8);
732 
733   q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
734   q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
735   q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
736   q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
737 
738   q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
739   q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
740   q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
741   q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
742 
743   q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
744   q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
745   q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
746   q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
747 
748   q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
749   q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
750   q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
751   q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
752 
753   q7u16 = vmull_u8(d25u8, d3u8);
754   q8u16 = vmull_u8(d26u8, d3u8);
755   q9u16 = vmull_u8(d27u8, d3u8);
756   q10u16 = vmull_u8(d28u8, d3u8);
757 
758   q3s16 = vreinterpretq_s16_u16(q3u16);
759   q4s16 = vreinterpretq_s16_u16(q4u16);
760   q5s16 = vreinterpretq_s16_u16(q5u16);
761   q6s16 = vreinterpretq_s16_u16(q6u16);
762   q7s16 = vreinterpretq_s16_u16(q7u16);
763   q8s16 = vreinterpretq_s16_u16(q8u16);
764   q9s16 = vreinterpretq_s16_u16(q9u16);
765   q10s16 = vreinterpretq_s16_u16(q10u16);
766 
767   q7s16 = vqaddq_s16(q7s16, q3s16);
768   q8s16 = vqaddq_s16(q8s16, q4s16);
769   q9s16 = vqaddq_s16(q9s16, q5s16);
770   q10s16 = vqaddq_s16(q10s16, q6s16);
771 
772   d6u8 = vqrshrun_n_s16(q7s16, 7);
773   d7u8 = vqrshrun_n_s16(q8s16, 7);
774   d8u8 = vqrshrun_n_s16(q9s16, 7);
775   d9u8 = vqrshrun_n_s16(q10s16, 7);
776 
777   vst1_u8(dst_ptr, d6u8);
778   dst_ptr += dst_pitch;
779   vst1_u8(dst_ptr, d7u8);
780   dst_ptr += dst_pitch;
781   vst1_u8(dst_ptr, d8u8);
782   dst_ptr += dst_pitch;
783   vst1_u8(dst_ptr, d9u8);
784   return;
785 }
786 
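// 8x8 six-tap prediction. When both passes run, the first pass writes an 8x8
// intermediate block to tmp[] and keeps the remaining 5 filtered rows in
// registers; the second pass then filters vertically from tmp[] plus those
// registers.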
void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
                                int xoffset, int yoffset,
                                unsigned char *dst_ptr, int dst_pitch) {
790   unsigned char *src, *tmpp;
791   unsigned char tmp[64];
792   int i;
793   uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
794   uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
795   uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
796   int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
797   uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
798   uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
799   int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
800   int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
801   uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
802 
803   if (xoffset == 0) {  // secondpass_filter8x8_only
804     // load second_pass filter
805     dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
806     d0s8 = vdup_lane_s8(dtmps8, 0);
807     d1s8 = vdup_lane_s8(dtmps8, 1);
808     d2s8 = vdup_lane_s8(dtmps8, 2);
809     d3s8 = vdup_lane_s8(dtmps8, 3);
810     d4s8 = vdup_lane_s8(dtmps8, 4);
811     d5s8 = vdup_lane_s8(dtmps8, 5);
812     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
813     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
814     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
815     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
816     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
817     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
818 
819     // load src data
820     src = src_ptr - src_pixels_per_line * 2;
821     d18u8 = vld1_u8(src);
822     src += src_pixels_per_line;
823     d19u8 = vld1_u8(src);
824     src += src_pixels_per_line;
825     d20u8 = vld1_u8(src);
826     src += src_pixels_per_line;
827     d21u8 = vld1_u8(src);
828     src += src_pixels_per_line;
829     d22u8 = vld1_u8(src);
830     src += src_pixels_per_line;
831     d23u8 = vld1_u8(src);
832     src += src_pixels_per_line;
833     d24u8 = vld1_u8(src);
834     src += src_pixels_per_line;
835     d25u8 = vld1_u8(src);
836     src += src_pixels_per_line;
837     d26u8 = vld1_u8(src);
838     src += src_pixels_per_line;
839     d27u8 = vld1_u8(src);
840     src += src_pixels_per_line;
841     d28u8 = vld1_u8(src);
842     src += src_pixels_per_line;
843     d29u8 = vld1_u8(src);
844     src += src_pixels_per_line;
845     d30u8 = vld1_u8(src);
846 
847     for (i = 2; i > 0; i--) {
848       q3u16 = vmull_u8(d18u8, d0u8);
849       q4u16 = vmull_u8(d19u8, d0u8);
850       q5u16 = vmull_u8(d20u8, d0u8);
851       q6u16 = vmull_u8(d21u8, d0u8);
852 
853       q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
854       q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
855       q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
856       q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
857 
858       q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
859       q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
860       q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
861       q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
862 
863       q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
864       q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
865       q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
866       q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
867 
868       q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
869       q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
870       q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
871       q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
872 
873       q7u16 = vmull_u8(d21u8, d3u8);
874       q8u16 = vmull_u8(d22u8, d3u8);
875       q9u16 = vmull_u8(d23u8, d3u8);
876       q10u16 = vmull_u8(d24u8, d3u8);
877 
878       q3s16 = vreinterpretq_s16_u16(q3u16);
879       q4s16 = vreinterpretq_s16_u16(q4u16);
880       q5s16 = vreinterpretq_s16_u16(q5u16);
881       q6s16 = vreinterpretq_s16_u16(q6u16);
882       q7s16 = vreinterpretq_s16_u16(q7u16);
883       q8s16 = vreinterpretq_s16_u16(q8u16);
884       q9s16 = vreinterpretq_s16_u16(q9u16);
885       q10s16 = vreinterpretq_s16_u16(q10u16);
886 
887       q7s16 = vqaddq_s16(q7s16, q3s16);
888       q8s16 = vqaddq_s16(q8s16, q4s16);
889       q9s16 = vqaddq_s16(q9s16, q5s16);
890       q10s16 = vqaddq_s16(q10s16, q6s16);
891 
892       d6u8 = vqrshrun_n_s16(q7s16, 7);
893       d7u8 = vqrshrun_n_s16(q8s16, 7);
894       d8u8 = vqrshrun_n_s16(q9s16, 7);
895       d9u8 = vqrshrun_n_s16(q10s16, 7);
896 
897       d18u8 = d22u8;
898       d19u8 = d23u8;
899       d20u8 = d24u8;
900       d21u8 = d25u8;
901       d22u8 = d26u8;
902       d23u8 = d27u8;
903       d24u8 = d28u8;
904       d25u8 = d29u8;
905       d26u8 = d30u8;
906 
907       vst1_u8(dst_ptr, d6u8);
908       dst_ptr += dst_pitch;
909       vst1_u8(dst_ptr, d7u8);
910       dst_ptr += dst_pitch;
911       vst1_u8(dst_ptr, d8u8);
912       dst_ptr += dst_pitch;
913       vst1_u8(dst_ptr, d9u8);
914       dst_ptr += dst_pitch;
915     }
916     return;
917   }
918 
919   // load first_pass filter
920   dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
921   d0s8 = vdup_lane_s8(dtmps8, 0);
922   d1s8 = vdup_lane_s8(dtmps8, 1);
923   d2s8 = vdup_lane_s8(dtmps8, 2);
924   d3s8 = vdup_lane_s8(dtmps8, 3);
925   d4s8 = vdup_lane_s8(dtmps8, 4);
926   d5s8 = vdup_lane_s8(dtmps8, 5);
927   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
928   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
929   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
930   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
931   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
932   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
933 
  // First pass: output_height lines x output_width columns (13x8 when a
  // second pass follows).
  if (yoffset == 0)  // firstpass_filter8x8_only
    src = src_ptr - 2;
937   else
938     src = src_ptr - 2 - (src_pixels_per_line * 2);
939 
940   tmpp = tmp;
941   for (i = 2; i > 0; i--) {
942     q3u8 = vld1q_u8(src);
943     src += src_pixels_per_line;
944     q4u8 = vld1q_u8(src);
945     src += src_pixels_per_line;
946     q5u8 = vld1q_u8(src);
947     src += src_pixels_per_line;
948     q6u8 = vld1q_u8(src);
949     src += src_pixels_per_line;
950 
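    // Prefetch the next three source rows while this block is being filtered.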
951     __builtin_prefetch(src);
952     __builtin_prefetch(src + src_pixels_per_line);
953     __builtin_prefetch(src + src_pixels_per_line * 2);
954 
955     q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
956     q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
957     q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
958     q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
959 
960     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
961     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
962     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
963     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
964 
965     q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
966     q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
967     q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
968     q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
969 
970     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
971     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
972     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
973     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
974 
975     q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
976     q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
977     q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
978     q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
979 
980     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
981     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
982     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
983     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
984 
985     q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
986     q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
987     q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
988     q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
989 
990     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
991     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
992     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
993     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
994 
995     q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
996     q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
997     q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
998     q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
999 
1000     d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
1001     d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
1002     d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
1003     d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
1004 
1005     q3u16 = vmull_u8(d28u8, d3u8);
1006     q4u16 = vmull_u8(d29u8, d3u8);
1007     q5u16 = vmull_u8(d30u8, d3u8);
1008     q6u16 = vmull_u8(d31u8, d3u8);
1009 
1010     q3s16 = vreinterpretq_s16_u16(q3u16);
1011     q4s16 = vreinterpretq_s16_u16(q4u16);
1012     q5s16 = vreinterpretq_s16_u16(q5u16);
1013     q6s16 = vreinterpretq_s16_u16(q6u16);
1014     q7s16 = vreinterpretq_s16_u16(q7u16);
1015     q8s16 = vreinterpretq_s16_u16(q8u16);
1016     q9s16 = vreinterpretq_s16_u16(q9u16);
1017     q10s16 = vreinterpretq_s16_u16(q10u16);
1018 
1019     q7s16 = vqaddq_s16(q7s16, q3s16);
1020     q8s16 = vqaddq_s16(q8s16, q4s16);
1021     q9s16 = vqaddq_s16(q9s16, q5s16);
1022     q10s16 = vqaddq_s16(q10s16, q6s16);
1023 
1024     d22u8 = vqrshrun_n_s16(q7s16, 7);
1025     d23u8 = vqrshrun_n_s16(q8s16, 7);
1026     d24u8 = vqrshrun_n_s16(q9s16, 7);
1027     d25u8 = vqrshrun_n_s16(q10s16, 7);
1028 
    if (yoffset == 0) {  // firstpass_filter8x8_only
1030       vst1_u8(dst_ptr, d22u8);
1031       dst_ptr += dst_pitch;
1032       vst1_u8(dst_ptr, d23u8);
1033       dst_ptr += dst_pitch;
1034       vst1_u8(dst_ptr, d24u8);
1035       dst_ptr += dst_pitch;
1036       vst1_u8(dst_ptr, d25u8);
1037       dst_ptr += dst_pitch;
1038     } else {
1039       vst1_u8(tmpp, d22u8);
1040       tmpp += 8;
1041       vst1_u8(tmpp, d23u8);
1042       tmpp += 8;
1043       vst1_u8(tmpp, d24u8);
1044       tmpp += 8;
1045       vst1_u8(tmpp, d25u8);
1046       tmpp += 8;
1047     }
1048   }
1049   if (yoffset == 0) return;
1050 
  // First pass on the remaining 5 lines of data.
1052   q3u8 = vld1q_u8(src);
1053   src += src_pixels_per_line;
1054   q4u8 = vld1q_u8(src);
1055   src += src_pixels_per_line;
1056   q5u8 = vld1q_u8(src);
1057   src += src_pixels_per_line;
1058   q6u8 = vld1q_u8(src);
1059   src += src_pixels_per_line;
1060   q7u8 = vld1q_u8(src);
1061 
1062   q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
1063   q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
1064   q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
1065   q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
1066   q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
1067 
1068   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
1069   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
1070   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
1071   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
1072   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
1073 
1074   q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
1075   q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
1076   q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
1077   q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
1078   q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
1079 
1080   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
1081   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
1082   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
1083   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
1084   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
1085 
1086   q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
1087   q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
1088   q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
1089   q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
1090   q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
1091 
1092   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
1093   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
1094   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
1095   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
1096   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
1097 
1098   q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
1099   q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
1100   q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
1101   q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
1102   q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
1103 
1104   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
1105   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
1106   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
1107   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
1108   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
1109 
1110   q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
1111   q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
1112   q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
1113   q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
1114   q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
1115 
1116   d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
1117   d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
1118   d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
1119   d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
1120   d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
1121 
1122   q3u16 = vmull_u8(d27u8, d3u8);
1123   q4u16 = vmull_u8(d28u8, d3u8);
1124   q5u16 = vmull_u8(d29u8, d3u8);
1125   q6u16 = vmull_u8(d30u8, d3u8);
1126   q7u16 = vmull_u8(d31u8, d3u8);
1127 
1128   q3s16 = vreinterpretq_s16_u16(q3u16);
1129   q4s16 = vreinterpretq_s16_u16(q4u16);
1130   q5s16 = vreinterpretq_s16_u16(q5u16);
1131   q6s16 = vreinterpretq_s16_u16(q6u16);
1132   q7s16 = vreinterpretq_s16_u16(q7u16);
1133   q8s16 = vreinterpretq_s16_u16(q8u16);
1134   q9s16 = vreinterpretq_s16_u16(q9u16);
1135   q10s16 = vreinterpretq_s16_u16(q10u16);
1136   q11s16 = vreinterpretq_s16_u16(q11u16);
1137   q12s16 = vreinterpretq_s16_u16(q12u16);
1138 
1139   q8s16 = vqaddq_s16(q8s16, q3s16);
1140   q9s16 = vqaddq_s16(q9s16, q4s16);
1141   q10s16 = vqaddq_s16(q10s16, q5s16);
1142   q11s16 = vqaddq_s16(q11s16, q6s16);
1143   q12s16 = vqaddq_s16(q12s16, q7s16);
1144 
1145   d26u8 = vqrshrun_n_s16(q8s16, 7);
1146   d27u8 = vqrshrun_n_s16(q9s16, 7);
1147   d28u8 = vqrshrun_n_s16(q10s16, 7);
1148   d29u8 = vqrshrun_n_s16(q11s16, 7);
1149   d30u8 = vqrshrun_n_s16(q12s16, 7);
1150 
1151   // Second pass: 8x8
1152   dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1153   d0s8 = vdup_lane_s8(dtmps8, 0);
1154   d1s8 = vdup_lane_s8(dtmps8, 1);
1155   d2s8 = vdup_lane_s8(dtmps8, 2);
1156   d3s8 = vdup_lane_s8(dtmps8, 3);
1157   d4s8 = vdup_lane_s8(dtmps8, 4);
1158   d5s8 = vdup_lane_s8(dtmps8, 5);
1159   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1160   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1161   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1162   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1163   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1164   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1165 
1166   tmpp = tmp;
1167   q9u8 = vld1q_u8(tmpp);
1168   tmpp += 16;
1169   q10u8 = vld1q_u8(tmpp);
1170   tmpp += 16;
1171   q11u8 = vld1q_u8(tmpp);
1172   tmpp += 16;
1173   q12u8 = vld1q_u8(tmpp);
1174 
1175   d18u8 = vget_low_u8(q9u8);
1176   d19u8 = vget_high_u8(q9u8);
1177   d20u8 = vget_low_u8(q10u8);
1178   d21u8 = vget_high_u8(q10u8);
1179   d22u8 = vget_low_u8(q11u8);
1180   d23u8 = vget_high_u8(q11u8);
1181   d24u8 = vget_low_u8(q12u8);
1182   d25u8 = vget_high_u8(q12u8);
1183 
1184   for (i = 2; i > 0; i--) {
1185     q3u16 = vmull_u8(d18u8, d0u8);
1186     q4u16 = vmull_u8(d19u8, d0u8);
1187     q5u16 = vmull_u8(d20u8, d0u8);
1188     q6u16 = vmull_u8(d21u8, d0u8);
1189 
1190     q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1191     q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1192     q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1193     q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1194 
1195     q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1196     q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1197     q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1198     q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1199 
1200     q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1201     q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1202     q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1203     q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1204 
1205     q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1206     q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1207     q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1208     q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1209 
1210     q7u16 = vmull_u8(d21u8, d3u8);
1211     q8u16 = vmull_u8(d22u8, d3u8);
1212     q9u16 = vmull_u8(d23u8, d3u8);
1213     q10u16 = vmull_u8(d24u8, d3u8);
1214 
1215     q3s16 = vreinterpretq_s16_u16(q3u16);
1216     q4s16 = vreinterpretq_s16_u16(q4u16);
1217     q5s16 = vreinterpretq_s16_u16(q5u16);
1218     q6s16 = vreinterpretq_s16_u16(q6u16);
1219     q7s16 = vreinterpretq_s16_u16(q7u16);
1220     q8s16 = vreinterpretq_s16_u16(q8u16);
1221     q9s16 = vreinterpretq_s16_u16(q9u16);
1222     q10s16 = vreinterpretq_s16_u16(q10u16);
1223 
1224     q7s16 = vqaddq_s16(q7s16, q3s16);
1225     q8s16 = vqaddq_s16(q8s16, q4s16);
1226     q9s16 = vqaddq_s16(q9s16, q5s16);
1227     q10s16 = vqaddq_s16(q10s16, q6s16);
1228 
1229     d6u8 = vqrshrun_n_s16(q7s16, 7);
1230     d7u8 = vqrshrun_n_s16(q8s16, 7);
1231     d8u8 = vqrshrun_n_s16(q9s16, 7);
1232     d9u8 = vqrshrun_n_s16(q10s16, 7);
1233 
1234     d18u8 = d22u8;
1235     d19u8 = d23u8;
1236     d20u8 = d24u8;
1237     d21u8 = d25u8;
1238     d22u8 = d26u8;
1239     d23u8 = d27u8;
1240     d24u8 = d28u8;
1241     d25u8 = d29u8;
1242     d26u8 = d30u8;
1243 
1244     vst1_u8(dst_ptr, d6u8);
1245     dst_ptr += dst_pitch;
1246     vst1_u8(dst_ptr, d7u8);
1247     dst_ptr += dst_pitch;
1248     vst1_u8(dst_ptr, d8u8);
1249     dst_ptr += dst_pitch;
1250     vst1_u8(dst_ptr, d9u8);
1251     dst_ptr += dst_pitch;
1252   }
1253   return;
1254 }
1255 
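// 16x16 six-tap prediction. The second-pass paths process the block as two
// 8-pixel-wide columns; when both passes run, tmp[336] holds the first-pass
// intermediate (16 output rows plus 5 rows of context, 21 x 16 bytes).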
void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
                                  int src_pixels_per_line, int xoffset,
                                  int yoffset, unsigned char *dst_ptr,
                                  int dst_pitch) {
1260   unsigned char *src, *src_tmp, *dst, *tmpp;
1261   unsigned char tmp[336];
1262   int i, j;
1263   uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
1264   uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
1265   uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
1266   uint8x8_t d28u8, d29u8, d30u8, d31u8;
1267   int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
1268   uint8x16_t q3u8, q4u8;
1269   uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
1270   uint16x8_t q11u16, q12u16, q13u16, q15u16;
1271   int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
1272   int16x8_t q11s16, q12s16, q13s16, q15s16;
1273 
  if (xoffset == 0) {  // secondpass_filter16x16_only
1275     // load second_pass filter
1276     dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1277     d0s8 = vdup_lane_s8(dtmps8, 0);
1278     d1s8 = vdup_lane_s8(dtmps8, 1);
1279     d2s8 = vdup_lane_s8(dtmps8, 2);
1280     d3s8 = vdup_lane_s8(dtmps8, 3);
1281     d4s8 = vdup_lane_s8(dtmps8, 4);
1282     d5s8 = vdup_lane_s8(dtmps8, 5);
1283     d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1284     d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1285     d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1286     d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1287     d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1288     d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1289 
1290     // load src data
1291     src_tmp = src_ptr - src_pixels_per_line * 2;
1292     for (i = 0; i < 2; ++i) {
1293       src = src_tmp + i * 8;
1294       dst = dst_ptr + i * 8;
1295       d18u8 = vld1_u8(src);
1296       src += src_pixels_per_line;
1297       d19u8 = vld1_u8(src);
1298       src += src_pixels_per_line;
1299       d20u8 = vld1_u8(src);
1300       src += src_pixels_per_line;
1301       d21u8 = vld1_u8(src);
1302       src += src_pixels_per_line;
1303       d22u8 = vld1_u8(src);
1304       src += src_pixels_per_line;
1305       for (j = 0; j < 4; ++j) {
1306         d23u8 = vld1_u8(src);
1307         src += src_pixels_per_line;
1308         d24u8 = vld1_u8(src);
1309         src += src_pixels_per_line;
1310         d25u8 = vld1_u8(src);
1311         src += src_pixels_per_line;
1312         d26u8 = vld1_u8(src);
1313         src += src_pixels_per_line;
1314 
1315         q3u16 = vmull_u8(d18u8, d0u8);
1316         q4u16 = vmull_u8(d19u8, d0u8);
1317         q5u16 = vmull_u8(d20u8, d0u8);
1318         q6u16 = vmull_u8(d21u8, d0u8);
1319 
1320         q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1321         q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1322         q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1323         q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1324 
1325         q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1326         q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1327         q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1328         q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1329 
1330         q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1331         q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1332         q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1333         q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1334 
1335         q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1336         q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1337         q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1338         q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1339 
1340         q7u16 = vmull_u8(d21u8, d3u8);
1341         q8u16 = vmull_u8(d22u8, d3u8);
1342         q9u16 = vmull_u8(d23u8, d3u8);
1343         q10u16 = vmull_u8(d24u8, d3u8);
1344 
1345         q3s16 = vreinterpretq_s16_u16(q3u16);
1346         q4s16 = vreinterpretq_s16_u16(q4u16);
1347         q5s16 = vreinterpretq_s16_u16(q5u16);
1348         q6s16 = vreinterpretq_s16_u16(q6u16);
1349         q7s16 = vreinterpretq_s16_u16(q7u16);
1350         q8s16 = vreinterpretq_s16_u16(q8u16);
1351         q9s16 = vreinterpretq_s16_u16(q9u16);
1352         q10s16 = vreinterpretq_s16_u16(q10u16);
1353 
1354         q7s16 = vqaddq_s16(q7s16, q3s16);
1355         q8s16 = vqaddq_s16(q8s16, q4s16);
1356         q9s16 = vqaddq_s16(q9s16, q5s16);
1357         q10s16 = vqaddq_s16(q10s16, q6s16);
1358 
1359         d6u8 = vqrshrun_n_s16(q7s16, 7);
1360         d7u8 = vqrshrun_n_s16(q8s16, 7);
1361         d8u8 = vqrshrun_n_s16(q9s16, 7);
1362         d9u8 = vqrshrun_n_s16(q10s16, 7);
1363 
1364         d18u8 = d22u8;
1365         d19u8 = d23u8;
1366         d20u8 = d24u8;
1367         d21u8 = d25u8;
1368         d22u8 = d26u8;
1369 
1370         vst1_u8(dst, d6u8);
1371         dst += dst_pitch;
1372         vst1_u8(dst, d7u8);
1373         dst += dst_pitch;
1374         vst1_u8(dst, d8u8);
1375         dst += dst_pitch;
1376         vst1_u8(dst, d9u8);
1377         dst += dst_pitch;
1378       }
1379     }
1380     return;
1381   }
1382 
1383   // load first_pass filter
1384   dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
1385   d0s8 = vdup_lane_s8(dtmps8, 0);
1386   d1s8 = vdup_lane_s8(dtmps8, 1);
1387   d2s8 = vdup_lane_s8(dtmps8, 2);
1388   d3s8 = vdup_lane_s8(dtmps8, 3);
1389   d4s8 = vdup_lane_s8(dtmps8, 4);
1390   d5s8 = vdup_lane_s8(dtmps8, 5);
1391   d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1392   d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1393   d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1394   d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1395   d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1396   d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1397 
  // First pass: output_height lines x output_width columns (21x16 when a
  // second pass follows).
  if (yoffset == 0) {  // firstpass_filter16x16_only
    src = src_ptr - 2;
1401     dst = dst_ptr;
1402     for (i = 0; i < 8; ++i) {
1403       d6u8 = vld1_u8(src);
1404       d7u8 = vld1_u8(src + 8);
1405       d8u8 = vld1_u8(src + 16);
1406       src += src_pixels_per_line;
1407       d9u8 = vld1_u8(src);
1408       d10u8 = vld1_u8(src + 8);
1409       d11u8 = vld1_u8(src + 16);
1410       src += src_pixels_per_line;
1411 
1412       __builtin_prefetch(src);
1413       __builtin_prefetch(src + src_pixels_per_line);
1414 
1415       q6u16 = vmull_u8(d6u8, d0u8);
1416       q7u16 = vmull_u8(d7u8, d0u8);
1417       q8u16 = vmull_u8(d9u8, d0u8);
1418       q9u16 = vmull_u8(d10u8, d0u8);
1419 
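      // vext shifts each 24-byte source row left by 1..5 bytes, producing
      // the pixels under the remaining five filter taps.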
      d20u8 = vext_u8(d6u8, d7u8, 1);
      d21u8 = vext_u8(d9u8, d10u8, 1);
      d22u8 = vext_u8(d7u8, d8u8, 1);
      d23u8 = vext_u8(d10u8, d11u8, 1);
      d24u8 = vext_u8(d6u8, d7u8, 4);
      d25u8 = vext_u8(d9u8, d10u8, 4);
      d26u8 = vext_u8(d7u8, d8u8, 4);
      d27u8 = vext_u8(d10u8, d11u8, 4);
      d28u8 = vext_u8(d6u8, d7u8, 5);
      d29u8 = vext_u8(d9u8, d10u8, 5);

      q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
      q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
      q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
      q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
      q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
      q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
      q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
      q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
      q8u16 = vmlal_u8(q8u16, d29u8, d5u8);

      d20u8 = vext_u8(d7u8, d8u8, 5);
      d21u8 = vext_u8(d10u8, d11u8, 5);
      d22u8 = vext_u8(d6u8, d7u8, 2);
      d23u8 = vext_u8(d9u8, d10u8, 2);
      d24u8 = vext_u8(d7u8, d8u8, 2);
      d25u8 = vext_u8(d10u8, d11u8, 2);
      d26u8 = vext_u8(d6u8, d7u8, 3);
      d27u8 = vext_u8(d9u8, d10u8, 3);
      d28u8 = vext_u8(d7u8, d8u8, 3);
      d29u8 = vext_u8(d10u8, d11u8, 3);

      q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
      q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
      q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
      q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
      q9u16 = vmlal_u8(q9u16, d25u8, d2u8);

      q10u16 = vmull_u8(d26u8, d3u8);
      q11u16 = vmull_u8(d27u8, d3u8);
      q12u16 = vmull_u8(d28u8, d3u8);
      q15u16 = vmull_u8(d29u8, d3u8);

      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);
      q11s16 = vreinterpretq_s16_u16(q11u16);
      q12s16 = vreinterpretq_s16_u16(q12u16);
      q15s16 = vreinterpretq_s16_u16(q15u16);

      q6s16 = vqaddq_s16(q6s16, q10s16);
      q8s16 = vqaddq_s16(q8s16, q11s16);
      q7s16 = vqaddq_s16(q7s16, q12s16);
      q9s16 = vqaddq_s16(q9s16, q15s16);

      d6u8 = vqrshrun_n_s16(q6s16, 7);
      d7u8 = vqrshrun_n_s16(q7s16, 7);
      d8u8 = vqrshrun_n_s16(q8s16, 7);
      d9u8 = vqrshrun_n_s16(q9s16, 7);

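      // Combine the 8-pixel halves into two 16-pixel rows and store them.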
      q3u8 = vcombine_u8(d6u8, d7u8);
      q4u8 = vcombine_u8(d8u8, d9u8);
      vst1q_u8(dst, q3u8);
      dst += dst_pitch;
      vst1q_u8(dst, q4u8);
      dst += dst_pitch;
    }
    return;
  }

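  // Two-pass path: filter 21 rows of 16 pixels horizontally (16 output rows
  // plus 5 extra needed by the 6-tap vertical filter) into the intermediate
  // buffer tmp, starting two rows above and two columns left of src_ptr.
  // The second pass below then filters tmp vertically.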
  src = src_ptr - 2 - src_pixels_per_line * 2;
  tmpp = tmp;
  for (i = 0; i < 7; ++i) {
    d6u8 = vld1_u8(src);
    d7u8 = vld1_u8(src + 8);
    d8u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;
    d9u8 = vld1_u8(src);
    d10u8 = vld1_u8(src + 8);
    d11u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;
    d12u8 = vld1_u8(src);
    d13u8 = vld1_u8(src + 8);
    d14u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;

    __builtin_prefetch(src);
    __builtin_prefetch(src + src_pixels_per_line);
    __builtin_prefetch(src + src_pixels_per_line * 2);

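    // Three rows are filtered per iteration: q8/q9, q10/q11 and q12/q13
    // accumulate the two 8-pixel halves of rows 0, 1 and 2 respectively,
    // starting with the tap-0 contribution.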
    q8u16 = vmull_u8(d6u8, d0u8);
    q9u16 = vmull_u8(d7u8, d0u8);
    q10u16 = vmull_u8(d9u8, d0u8);
    q11u16 = vmull_u8(d10u8, d0u8);
    q12u16 = vmull_u8(d12u8, d0u8);
    q13u16 = vmull_u8(d13u8, d0u8);

    d28u8 = vext_u8(d6u8, d7u8, 1);
    d29u8 = vext_u8(d9u8, d10u8, 1);
    d30u8 = vext_u8(d12u8, d13u8, 1);
    q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
    d28u8 = vext_u8(d7u8, d8u8, 1);
    d29u8 = vext_u8(d10u8, d11u8, 1);
    d30u8 = vext_u8(d13u8, d14u8, 1);
    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
    q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
    q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);

    d28u8 = vext_u8(d6u8, d7u8, 4);
    d29u8 = vext_u8(d9u8, d10u8, 4);
    d30u8 = vext_u8(d12u8, d13u8, 4);
    q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
    d28u8 = vext_u8(d7u8, d8u8, 4);
    d29u8 = vext_u8(d10u8, d11u8, 4);
    d30u8 = vext_u8(d13u8, d14u8, 4);
    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
    q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
    q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);

    d28u8 = vext_u8(d6u8, d7u8, 5);
    d29u8 = vext_u8(d9u8, d10u8, 5);
    d30u8 = vext_u8(d12u8, d13u8, 5);
    q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
    d28u8 = vext_u8(d7u8, d8u8, 5);
    d29u8 = vext_u8(d10u8, d11u8, 5);
    d30u8 = vext_u8(d13u8, d14u8, 5);
    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
    q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
    q13u16 = vmlal_u8(q13u16, d30u8, d5u8);

    d28u8 = vext_u8(d6u8, d7u8, 2);
    d29u8 = vext_u8(d9u8, d10u8, 2);
    d30u8 = vext_u8(d12u8, d13u8, 2);
    q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
    d28u8 = vext_u8(d7u8, d8u8, 2);
    d29u8 = vext_u8(d10u8, d11u8, 2);
    d30u8 = vext_u8(d13u8, d14u8, 2);
    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
    q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
    q13u16 = vmlal_u8(q13u16, d30u8, d2u8);

    d28u8 = vext_u8(d6u8, d7u8, 3);
    d29u8 = vext_u8(d9u8, d10u8, 3);
    d30u8 = vext_u8(d12u8, d13u8, 3);
    d15u8 = vext_u8(d7u8, d8u8, 3);
    d31u8 = vext_u8(d10u8, d11u8, 3);
    d6u8 = vext_u8(d13u8, d14u8, 3);
    q4u16 = vmull_u8(d28u8, d3u8);
    q5u16 = vmull_u8(d29u8, d3u8);
    q6u16 = vmull_u8(d30u8, d3u8);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q12s16 = vreinterpretq_s16_u16(q12u16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q10s16 = vqaddq_s16(q10s16, q5s16);
    q12s16 = vqaddq_s16(q12s16, q6s16);

    q6u16 = vmull_u8(d15u8, d3u8);
    q7u16 = vmull_u8(d31u8, d3u8);
    q3u16 = vmull_u8(d6u8, d3u8);
    q3s16 = vreinterpretq_s16_u16(q3u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q11s16 = vreinterpretq_s16_u16(q11u16);
    q13s16 = vreinterpretq_s16_u16(q13u16);
    q9s16 = vqaddq_s16(q9s16, q6s16);
    q11s16 = vqaddq_s16(q11s16, q7s16);
    q13s16 = vqaddq_s16(q13s16, q3s16);

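    // Round, narrow to 8 bits and append the three filtered 16-pixel rows
    // (48 bytes) to the intermediate buffer.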
    d6u8 = vqrshrun_n_s16(q8s16, 7);
    d7u8 = vqrshrun_n_s16(q9s16, 7);
    d8u8 = vqrshrun_n_s16(q10s16, 7);
    d9u8 = vqrshrun_n_s16(q11s16, 7);
    d10u8 = vqrshrun_n_s16(q12s16, 7);
    d11u8 = vqrshrun_n_s16(q13s16, 7);

    vst1_u8(tmpp, d6u8);
    tmpp += 8;
    vst1_u8(tmpp, d7u8);
    tmpp += 8;
    vst1_u8(tmpp, d8u8);
    tmpp += 8;
    vst1_u8(tmpp, d9u8);
    tmpp += 8;
    vst1_u8(tmpp, d10u8);
    tmpp += 8;
    vst1_u8(tmpp, d11u8);
    tmpp += 8;
  }

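  // For reference only (illustrative sketch, not part of the original code):
  // with tmp laid out as 21 rows of 16 bytes, the vertical pass below is
  // equivalent to, for each output pixel (x, y) with 0 <= x, y < 16,
  //
  //   int sum = 0;
  //   for (k = 0; k < 6; ++k)
  //     sum += tmp[(y + k) * 16 + x] * vp8_sub_pel_filters[yoffset][k];
  //   dst_ptr[y * dst_pitch + x] = clamp((sum + 64) >> 7, 0, 255);
  //
  // The block is processed as two 8-pixel-wide columns (the i loop), four
  // output rows at a time (the j loop).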
  // Second pass: 16x16
  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  for (i = 0; i < 2; ++i) {
    dst = dst_ptr + 8 * i;
    tmpp = tmp + 8 * i;
    d18u8 = vld1_u8(tmpp);
    tmpp += 16;
    d19u8 = vld1_u8(tmpp);
    tmpp += 16;
    d20u8 = vld1_u8(tmpp);
    tmpp += 16;
    d21u8 = vld1_u8(tmpp);
    tmpp += 16;
    d22u8 = vld1_u8(tmpp);
    tmpp += 16;
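    // Five rows of this 8-wide column are pre-loaded; each pass of the inner
    // loop loads four more and produces four output rows.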
    for (j = 0; j < 4; ++j) {
      d23u8 = vld1_u8(tmpp);
      tmpp += 16;
      d24u8 = vld1_u8(tmpp);
      tmpp += 16;
      d25u8 = vld1_u8(tmpp);
      tmpp += 16;
      d26u8 = vld1_u8(tmpp);
      tmpp += 16;

      q3u16 = vmull_u8(d18u8, d0u8);
      q4u16 = vmull_u8(d19u8, d0u8);
      q5u16 = vmull_u8(d20u8, d0u8);
      q6u16 = vmull_u8(d21u8, d0u8);

      q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
      q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
      q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

      q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
      q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
      q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
      q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

      q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
      q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
      q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
      q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

      q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
      q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
      q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

      q7u16 = vmull_u8(d21u8, d3u8);
      q8u16 = vmull_u8(d22u8, d3u8);
      q9u16 = vmull_u8(d23u8, d3u8);
      q10u16 = vmull_u8(d24u8, d3u8);

      q3s16 = vreinterpretq_s16_u16(q3u16);
      q4s16 = vreinterpretq_s16_u16(q4u16);
      q5s16 = vreinterpretq_s16_u16(q5u16);
      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);

      q7s16 = vqaddq_s16(q7s16, q3s16);
      q8s16 = vqaddq_s16(q8s16, q4s16);
      q9s16 = vqaddq_s16(q9s16, q5s16);
      q10s16 = vqaddq_s16(q10s16, q6s16);

      d6u8 = vqrshrun_n_s16(q7s16, 7);
      d7u8 = vqrshrun_n_s16(q8s16, 7);
      d8u8 = vqrshrun_n_s16(q9s16, 7);
      d9u8 = vqrshrun_n_s16(q10s16, 7);

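      // Slide the five retained rows down by four for the next group of
      // output rows.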
      d18u8 = d22u8;
      d19u8 = d23u8;
      d20u8 = d24u8;
      d21u8 = d25u8;
      d22u8 = d26u8;

      vst1_u8(dst, d6u8);
      dst += dst_pitch;
      vst1_u8(dst, d7u8);
      dst += dst_pitch;
      vst1_u8(dst, d8u8);
      dst += dst_pitch;
      vst1_u8(dst, d9u8);
      dst += dst_pitch;
    }
  }
  return;
}