1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12 #include "vpx_ports/mem.h"
13
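/*
 * Six-tap sub-pel filter coefficients for the eight 1/8-pel positions.
 * Each row is padded to 8 entries so a whole row can be loaded with a
 * single vld1_s8. The taps of every row sum to 128, so each filtered
 * sample is normalized with a rounded shift by 7; conceptually, for a
 * horizontal pass with filter f over source s:
 *
 *   out[x] = clip_u8((f[0]*s[x-2] + f[1]*s[x-1] + f[2]*s[x+0] +
 *                     f[3]*s[x+1] + f[4]*s[x+2] + f[5]*s[x+3] + 64) >> 7)
 *
 * which is what vqrshrun_n_s16(sum, 7) computes below. Negative taps are
 * applied by taking absolute values and accumulating with vmlsl_u8.
 */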
14 static const int8_t vp8_sub_pel_filters[8][8] = {
15 {0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positions are */
16 {0, -6, 123, 12, -1, 0, 0, 0}, /* just as per alpha -0.5 bicubic */
17 {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */
18 {0, -9, 93, 50, -6, 0, 0, 0},
19 {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */
20 {0, -6, 50, 93, -9, 0, 0, 0},
21 {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */
22 {0, -1, 12, 123, -6, 0, 0, 0},
23 };
24
25 void vp8_sixtap_predict4x4_neon(
26 unsigned char *src_ptr,
27 int src_pixels_per_line,
28 int xoffset,
29 int yoffset,
30 unsigned char *dst_ptr,
31 int dst_pitch) {
32 unsigned char *src;
33 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8;
34 uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
35 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
36 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
37 uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
38 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
39 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
40 uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8;
41 uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64;
42 uint32x2x2_t d0u32x2, d1u32x2;
43
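/*
 * Three paths: xoffset == 0 applies only the vertical (second-pass)
 * filter, yoffset == 0 applies only the horizontal (first-pass) filter,
 * and the general case runs the horizontal pass over 9 rows and feeds
 * the result straight into the vertical pass, entirely in registers.
 */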
44 if (xoffset == 0) { // secondpass_filter4x4_only
45 uint32x2_t d27u32 = vdup_n_u32(0);
46 uint32x2_t d28u32 = vdup_n_u32(0);
47 uint32x2_t d29u32 = vdup_n_u32(0);
48 uint32x2_t d30u32 = vdup_n_u32(0);
49 uint32x2_t d31u32 = vdup_n_u32(0);
50
51 // load second_pass filter
52 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
53 d0s8 = vdup_lane_s8(dtmps8, 0);
54 d1s8 = vdup_lane_s8(dtmps8, 1);
55 d2s8 = vdup_lane_s8(dtmps8, 2);
56 d3s8 = vdup_lane_s8(dtmps8, 3);
57 d4s8 = vdup_lane_s8(dtmps8, 4);
58 d5s8 = vdup_lane_s8(dtmps8, 5);
59 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
60 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
61 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
62 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
63 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
64 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
65
66 // load src data
67 src = src_ptr - src_pixels_per_line * 2;
68 d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0);
69 src += src_pixels_per_line;
70 d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1);
71 src += src_pixels_per_line;
72 d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0);
73 src += src_pixels_per_line;
74 d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1);
75 src += src_pixels_per_line;
76 d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0);
77 src += src_pixels_per_line;
78 d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1);
79 src += src_pixels_per_line;
80 d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0);
81 src += src_pixels_per_line;
82 d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1);
83 src += src_pixels_per_line;
84 d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0);
85
86 d27u8 = vreinterpret_u8_u32(d27u32);
87 d28u8 = vreinterpret_u8_u32(d28u32);
88 d29u8 = vreinterpret_u8_u32(d29u32);
89 d30u8 = vreinterpret_u8_u32(d30u32);
90 d31u8 = vreinterpret_u8_u32(d31u32);
91
92 d23u8 = vext_u8(d27u8, d28u8, 4);
93 d24u8 = vext_u8(d28u8, d29u8, 4);
94 d25u8 = vext_u8(d29u8, d30u8, 4);
95 d26u8 = vext_u8(d30u8, d31u8, 4);
96
97 q3u16 = vmull_u8(d27u8, d0u8);
98 q4u16 = vmull_u8(d28u8, d0u8);
99 q5u16 = vmull_u8(d25u8, d5u8);
100 q6u16 = vmull_u8(d26u8, d5u8);
101
102 q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
103 q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
104 q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
105 q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
106
107 q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
108 q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
109 q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
110 q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
111
112 q3s16 = vreinterpretq_s16_u16(q3u16);
113 q4s16 = vreinterpretq_s16_u16(q4u16);
114 q5s16 = vreinterpretq_s16_u16(q5u16);
115 q6s16 = vreinterpretq_s16_u16(q6u16);
116
117 q5s16 = vqaddq_s16(q5s16, q3s16);
118 q6s16 = vqaddq_s16(q6s16, q4s16);
119
120 d3u8 = vqrshrun_n_s16(q5s16, 7);
121 d4u8 = vqrshrun_n_s16(q6s16, 7);
122
123 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
124 dst_ptr += dst_pitch;
125 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
126 dst_ptr += dst_pitch;
127 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
128 dst_ptr += dst_pitch;
129 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
130 return;
131 }
132
133 // load first_pass filter
134 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
135 d0s8 = vdup_lane_s8(dtmps8, 0);
136 d1s8 = vdup_lane_s8(dtmps8, 1);
137 d2s8 = vdup_lane_s8(dtmps8, 2);
138 d3s8 = vdup_lane_s8(dtmps8, 3);
139 d4s8 = vdup_lane_s8(dtmps8, 4);
140 d5s8 = vdup_lane_s8(dtmps8, 5);
141 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
142 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
143 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
144 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
145 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
146 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
147
148 // First pass: output_height lines x output_width columns (9x4)
149
150 if (yoffset == 0) // firstpass_filter4x4_only
151 src = src_ptr - 2;
152 else
153 src = src_ptr - 2 - (src_pixels_per_line * 2);
154
155 q3u8 = vld1q_u8(src);
156 src += src_pixels_per_line;
157 q4u8 = vld1q_u8(src);
158 src += src_pixels_per_line;
159 q5u8 = vld1q_u8(src);
160 src += src_pixels_per_line;
161 q6u8 = vld1q_u8(src);
162 src += src_pixels_per_line;
163
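// Because only 4 output pixels per row are needed, two rows share one
// q register: vext_u8(.., 5) grabs the pixels at offset 5, the low
// halves of adjacent rows are packed together, and the remaining filter
// taps are reached by shifting the packed 64-bit lanes right by
// 8/16/24/32 bits instead of issuing separate vext operations. vzip_u32
// then gathers the matching 4-pixel groups of both rows into one
// d register for each vmlal/vmlsl.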
164 d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
165 d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
166 d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
167 d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
168
169 // vswp in the original assembly: pack the low 8 bytes of two adjacent rows into one q register
170 q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
171 q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
172
173 d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19
174 vreinterpret_u32_u8(d19u8));
175 d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21
176 vreinterpret_u32_u8(d21u8));
177 q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
178 q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
179
180 // keep original src data in q4 q6
181 q4u64 = vreinterpretq_u64_u8(q3u8);
182 q6u64 = vreinterpretq_u64_u8(q5u8);
183
184 d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7
185 vreinterpret_u32_u8(vget_high_u8(q3u8)));
186 d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11
187 vreinterpret_u32_u8(vget_high_u8(q5u8)));
188 q9u64 = vshrq_n_u64(q4u64, 8);
189 q10u64 = vshrq_n_u64(q6u64, 8);
190 q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
191 q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
192
193 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19
194 vreinterpret_u32_u64(vget_high_u64(q9u64)));
195 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d21
196 vreinterpret_u32_u64(vget_high_u64(q10u64)));
197 q3u64 = vshrq_n_u64(q4u64, 32);
198 q5u64 = vshrq_n_u64(q6u64, 32);
199 q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
200 q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
201
202 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7
203 vreinterpret_u32_u64(vget_high_u64(q3u64)));
204 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11
205 vreinterpret_u32_u64(vget_high_u64(q5u64)));
206 q9u64 = vshrq_n_u64(q4u64, 16);
207 q10u64 = vshrq_n_u64(q6u64, 16);
208 q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
209 q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
210
211 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19
212 vreinterpret_u32_u64(vget_high_u64(q9u64)));
213 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d21
214 vreinterpret_u32_u64(vget_high_u64(q10u64)));
215 q3u64 = vshrq_n_u64(q4u64, 24);
216 q5u64 = vshrq_n_u64(q6u64, 24);
217 q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
218 q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
219
220 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7
221 vreinterpret_u32_u64(vget_high_u64(q3u64)));
222 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11
223 vreinterpret_u32_u64(vget_high_u64(q5u64)));
224 q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
225 q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
226
227 q7s16 = vreinterpretq_s16_u16(q7u16);
228 q8s16 = vreinterpretq_s16_u16(q8u16);
229 q9s16 = vreinterpretq_s16_u16(q9u16);
230 q10s16 = vreinterpretq_s16_u16(q10u16);
231 q7s16 = vqaddq_s16(q7s16, q9s16);
232 q8s16 = vqaddq_s16(q8s16, q10s16);
233
234 d27u8 = vqrshrun_n_s16(q7s16, 7);
235 d28u8 = vqrshrun_n_s16(q8s16, 7);
236
237 if (yoffset == 0) { // firstpass_filter4x4_only
238 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0);
239 dst_ptr += dst_pitch;
240 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1);
241 dst_ptr += dst_pitch;
242 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
243 dst_ptr += dst_pitch;
244 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
245 return;
246 }
247
248 // First pass on the remaining 5 rows (9 horizontally filtered rows are needed for the 6-tap vertical pass)
249 q3u8 = vld1q_u8(src);
250 src += src_pixels_per_line;
251 q4u8 = vld1q_u8(src);
252 src += src_pixels_per_line;
253 q5u8 = vld1q_u8(src);
254 src += src_pixels_per_line;
255 q6u8 = vld1q_u8(src);
256 src += src_pixels_per_line;
257 q11u8 = vld1q_u8(src);
258
259 d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
260 d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
261 d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
262 d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
263
264 // vswp in the original assembly: pack the low 8 bytes of two adjacent rows into one q register
265 q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
266 q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
267
268 d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19
269 vreinterpret_u32_u8(d19u8));
270 d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21
271 vreinterpret_u32_u8(d21u8));
272 d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5);
273 q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
274 q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
275 q12u16 = vmull_u8(d31u8, d5u8);
276
277 q4u64 = vreinterpretq_u64_u8(q3u8);
278 q6u64 = vreinterpretq_u64_u8(q5u8);
279
280 d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7
281 vreinterpret_u32_u8(vget_high_u8(q3u8)));
282 d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11
283 vreinterpret_u32_u8(vget_high_u8(q5u8)));
284 q9u64 = vshrq_n_u64(q4u64, 8);
285 q10u64 = vshrq_n_u64(q6u64, 8);
286 q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
287 q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
288 q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8);
289
290 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19
291 vreinterpret_u32_u64(vget_high_u64(q9u64)));
292 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d21
293 vreinterpret_u32_u64(vget_high_u64(q10u64)));
294 q3u64 = vshrq_n_u64(q4u64, 32);
295 q5u64 = vshrq_n_u64(q6u64, 32);
296 d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1);
297 q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
298 q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
299 q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
300
301 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7
302 vreinterpret_u32_u64(vget_high_u64(q3u64)));
303 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11
304 vreinterpret_u32_u64(vget_high_u64(q5u64)));
305 q9u64 = vshrq_n_u64(q4u64, 16);
306 q10u64 = vshrq_n_u64(q6u64, 16);
307 d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4);
308 q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
309 q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
310 q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
311
312 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19
313 vreinterpret_u32_u64(vget_high_u64(q9u64)));
314 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d21
315 vreinterpret_u32_u64(vget_high_u64(q10u64)));
316 q3u64 = vshrq_n_u64(q4u64, 24);
317 q5u64 = vshrq_n_u64(q6u64, 24);
318 d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2);
319 q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
320 q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
321 q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
322
323 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7
324 vreinterpret_u32_u64(vget_high_u64(q3u64)));
325 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11
326 vreinterpret_u32_u64(vget_high_u64(q5u64)));
327 d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3);
328 q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
329 q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
330 q11u16 = vmull_u8(d31u8, d3u8);
331
332 q7s16 = vreinterpretq_s16_u16(q7u16);
333 q8s16 = vreinterpretq_s16_u16(q8u16);
334 q9s16 = vreinterpretq_s16_u16(q9u16);
335 q10s16 = vreinterpretq_s16_u16(q10u16);
336 q11s16 = vreinterpretq_s16_u16(q11u16);
337 q12s16 = vreinterpretq_s16_u16(q12u16);
338 q7s16 = vqaddq_s16(q7s16, q9s16);
339 q8s16 = vqaddq_s16(q8s16, q10s16);
340 q12s16 = vqaddq_s16(q12s16, q11s16);
341
342 d29u8 = vqrshrun_n_s16(q7s16, 7);
343 d30u8 = vqrshrun_n_s16(q8s16, 7);
344 d31u8 = vqrshrun_n_s16(q12s16, 7);
345
346 // Second pass: 4x4
347 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
348 d0s8 = vdup_lane_s8(dtmps8, 0);
349 d1s8 = vdup_lane_s8(dtmps8, 1);
350 d2s8 = vdup_lane_s8(dtmps8, 2);
351 d3s8 = vdup_lane_s8(dtmps8, 3);
352 d4s8 = vdup_lane_s8(dtmps8, 4);
353 d5s8 = vdup_lane_s8(dtmps8, 5);
354 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
355 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
356 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
357 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
358 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
359 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
360
361 d23u8 = vext_u8(d27u8, d28u8, 4);
362 d24u8 = vext_u8(d28u8, d29u8, 4);
363 d25u8 = vext_u8(d29u8, d30u8, 4);
364 d26u8 = vext_u8(d30u8, d31u8, 4);
365
366 q3u16 = vmull_u8(d27u8, d0u8);
367 q4u16 = vmull_u8(d28u8, d0u8);
368 q5u16 = vmull_u8(d25u8, d5u8);
369 q6u16 = vmull_u8(d26u8, d5u8);
370
371 q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
372 q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
373 q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
374 q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
375
376 q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
377 q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
378 q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
379 q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
380
381 q3s16 = vreinterpretq_s16_u16(q3u16);
382 q4s16 = vreinterpretq_s16_u16(q4u16);
383 q5s16 = vreinterpretq_s16_u16(q5u16);
384 q6s16 = vreinterpretq_s16_u16(q6u16);
385
386 q5s16 = vqaddq_s16(q5s16, q3s16);
387 q6s16 = vqaddq_s16(q6s16, q4s16);
388
389 d3u8 = vqrshrun_n_s16(q5s16, 7);
390 d4u8 = vqrshrun_n_s16(q6s16, 7);
391
392 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
393 dst_ptr += dst_pitch;
394 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
395 dst_ptr += dst_pitch;
396 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
397 dst_ptr += dst_pitch;
398 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
399 return;
400 }
401
402 void vp8_sixtap_predict8x4_neon(
403 unsigned char *src_ptr,
404 int src_pixels_per_line,
405 int xoffset,
406 int yoffset,
407 unsigned char *dst_ptr,
408 int dst_pitch) {
409 unsigned char *src;
410 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
411 uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
412 uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
413 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
414 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
415 uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
416 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
417 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
418 uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
419
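/*
 * Same structure as the 4x4 case, but each row is a full 8-pixel
 * d register, so the horizontal taps are reached with plain vext_u8
 * offsets and no row packing is required.
 */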
420 if (xoffset == 0) { // secondpass_filter8x4_only
421 // load second_pass filter
422 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
423 d0s8 = vdup_lane_s8(dtmps8, 0);
424 d1s8 = vdup_lane_s8(dtmps8, 1);
425 d2s8 = vdup_lane_s8(dtmps8, 2);
426 d3s8 = vdup_lane_s8(dtmps8, 3);
427 d4s8 = vdup_lane_s8(dtmps8, 4);
428 d5s8 = vdup_lane_s8(dtmps8, 5);
429 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
430 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
431 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
432 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
433 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
434 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
435
436 // load src data
437 src = src_ptr - src_pixels_per_line * 2;
438 d22u8 = vld1_u8(src);
439 src += src_pixels_per_line;
440 d23u8 = vld1_u8(src);
441 src += src_pixels_per_line;
442 d24u8 = vld1_u8(src);
443 src += src_pixels_per_line;
444 d25u8 = vld1_u8(src);
445 src += src_pixels_per_line;
446 d26u8 = vld1_u8(src);
447 src += src_pixels_per_line;
448 d27u8 = vld1_u8(src);
449 src += src_pixels_per_line;
450 d28u8 = vld1_u8(src);
451 src += src_pixels_per_line;
452 d29u8 = vld1_u8(src);
453 src += src_pixels_per_line;
454 d30u8 = vld1_u8(src);
455
456 q3u16 = vmull_u8(d22u8, d0u8);
457 q4u16 = vmull_u8(d23u8, d0u8);
458 q5u16 = vmull_u8(d24u8, d0u8);
459 q6u16 = vmull_u8(d25u8, d0u8);
460
461 q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
462 q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
463 q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
464 q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
465
466 q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
467 q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
468 q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
469 q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
470
471 q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
472 q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
473 q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
474 q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
475
476 q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
477 q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
478 q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
479 q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
480
481 q7u16 = vmull_u8(d25u8, d3u8);
482 q8u16 = vmull_u8(d26u8, d3u8);
483 q9u16 = vmull_u8(d27u8, d3u8);
484 q10u16 = vmull_u8(d28u8, d3u8);
485
486 q3s16 = vreinterpretq_s16_u16(q3u16);
487 q4s16 = vreinterpretq_s16_u16(q4u16);
488 q5s16 = vreinterpretq_s16_u16(q5u16);
489 q6s16 = vreinterpretq_s16_u16(q6u16);
490 q7s16 = vreinterpretq_s16_u16(q7u16);
491 q8s16 = vreinterpretq_s16_u16(q8u16);
492 q9s16 = vreinterpretq_s16_u16(q9u16);
493 q10s16 = vreinterpretq_s16_u16(q10u16);
494
495 q7s16 = vqaddq_s16(q7s16, q3s16);
496 q8s16 = vqaddq_s16(q8s16, q4s16);
497 q9s16 = vqaddq_s16(q9s16, q5s16);
498 q10s16 = vqaddq_s16(q10s16, q6s16);
499
500 d6u8 = vqrshrun_n_s16(q7s16, 7);
501 d7u8 = vqrshrun_n_s16(q8s16, 7);
502 d8u8 = vqrshrun_n_s16(q9s16, 7);
503 d9u8 = vqrshrun_n_s16(q10s16, 7);
504
505 vst1_u8(dst_ptr, d6u8);
506 dst_ptr += dst_pitch;
507 vst1_u8(dst_ptr, d7u8);
508 dst_ptr += dst_pitch;
509 vst1_u8(dst_ptr, d8u8);
510 dst_ptr += dst_pitch;
511 vst1_u8(dst_ptr, d9u8);
512 return;
513 }
514
515 // load first_pass filter
516 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
517 d0s8 = vdup_lane_s8(dtmps8, 0);
518 d1s8 = vdup_lane_s8(dtmps8, 1);
519 d2s8 = vdup_lane_s8(dtmps8, 2);
520 d3s8 = vdup_lane_s8(dtmps8, 3);
521 d4s8 = vdup_lane_s8(dtmps8, 4);
522 d5s8 = vdup_lane_s8(dtmps8, 5);
523 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
524 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
525 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
526 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
527 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
528 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
529
530 // First pass: output_height lines x output_width columns (9x8)
531 if (yoffset == 0) // firstpass_filter8x4_only
532 src = src_ptr - 2;
533 else
534 src = src_ptr - 2 - (src_pixels_per_line * 2);
535 q3u8 = vld1q_u8(src);
536 src += src_pixels_per_line;
537 q4u8 = vld1q_u8(src);
538 src += src_pixels_per_line;
539 q5u8 = vld1q_u8(src);
540 src += src_pixels_per_line;
541 q6u8 = vld1q_u8(src);
542
543 q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
544 q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
545 q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
546 q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
547
548 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
549 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
550 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
551 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
552
553 q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
554 q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
555 q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
556 q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
557
558 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
559 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
560 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
561 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
562
563 q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
564 q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
565 q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
566 q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
567
568 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
569 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
570 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
571 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
572
573 q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
574 q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
575 q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
576 q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
577
578 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
579 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
580 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
581 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
582
583 q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
584 q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
585 q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
586 q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
587
588 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
589 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
590 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
591 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
592
593 q3u16 = vmull_u8(d28u8, d3u8);
594 q4u16 = vmull_u8(d29u8, d3u8);
595 q5u16 = vmull_u8(d30u8, d3u8);
596 q6u16 = vmull_u8(d31u8, d3u8);
597
598 q3s16 = vreinterpretq_s16_u16(q3u16);
599 q4s16 = vreinterpretq_s16_u16(q4u16);
600 q5s16 = vreinterpretq_s16_u16(q5u16);
601 q6s16 = vreinterpretq_s16_u16(q6u16);
602 q7s16 = vreinterpretq_s16_u16(q7u16);
603 q8s16 = vreinterpretq_s16_u16(q8u16);
604 q9s16 = vreinterpretq_s16_u16(q9u16);
605 q10s16 = vreinterpretq_s16_u16(q10u16);
606
607 q7s16 = vqaddq_s16(q7s16, q3s16);
608 q8s16 = vqaddq_s16(q8s16, q4s16);
609 q9s16 = vqaddq_s16(q9s16, q5s16);
610 q10s16 = vqaddq_s16(q10s16, q6s16);
611
612 d22u8 = vqrshrun_n_s16(q7s16, 7);
613 d23u8 = vqrshrun_n_s16(q8s16, 7);
614 d24u8 = vqrshrun_n_s16(q9s16, 7);
615 d25u8 = vqrshrun_n_s16(q10s16, 7);
616
617 if (yoffset == 0) { // firstpass_filter8x4_only
618 vst1_u8(dst_ptr, d22u8);
619 dst_ptr += dst_pitch;
620 vst1_u8(dst_ptr, d23u8);
621 dst_ptr += dst_pitch;
622 vst1_u8(dst_ptr, d24u8);
623 dst_ptr += dst_pitch;
624 vst1_u8(dst_ptr, d25u8);
625 return;
626 }
627
628 // First pass on the remaining 5 rows (9 horizontally filtered rows are needed for the 6-tap vertical pass)
629 src += src_pixels_per_line;
630 q3u8 = vld1q_u8(src);
631 src += src_pixels_per_line;
632 q4u8 = vld1q_u8(src);
633 src += src_pixels_per_line;
634 q5u8 = vld1q_u8(src);
635 src += src_pixels_per_line;
636 q6u8 = vld1q_u8(src);
637 src += src_pixels_per_line;
638 q7u8 = vld1q_u8(src);
639
640 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
641 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
642 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
643 q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
644 q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
645
646 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
647 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
648 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
649 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
650 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
651
652 q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
653 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
654 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
655 q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
656 q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
657
658 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
659 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
660 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
661 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
662 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
663
664 q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
665 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
666 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
667 q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
668 q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
669
670 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
671 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
672 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
673 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
674 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
675
676 q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
677 q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
678 q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
679 q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
680 q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
681
682 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
683 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
684 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
685 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
686 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
687
688 q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
689 q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
690 q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
691 q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
692 q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
693
694 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
695 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
696 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
697 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
698 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
699
700 q3u16 = vmull_u8(d27u8, d3u8);
701 q4u16 = vmull_u8(d28u8, d3u8);
702 q5u16 = vmull_u8(d29u8, d3u8);
703 q6u16 = vmull_u8(d30u8, d3u8);
704 q7u16 = vmull_u8(d31u8, d3u8);
705
706 q3s16 = vreinterpretq_s16_u16(q3u16);
707 q4s16 = vreinterpretq_s16_u16(q4u16);
708 q5s16 = vreinterpretq_s16_u16(q5u16);
709 q6s16 = vreinterpretq_s16_u16(q6u16);
710 q7s16 = vreinterpretq_s16_u16(q7u16);
711 q8s16 = vreinterpretq_s16_u16(q8u16);
712 q9s16 = vreinterpretq_s16_u16(q9u16);
713 q10s16 = vreinterpretq_s16_u16(q10u16);
714 q11s16 = vreinterpretq_s16_u16(q11u16);
715 q12s16 = vreinterpretq_s16_u16(q12u16);
716
717 q8s16 = vqaddq_s16(q8s16, q3s16);
718 q9s16 = vqaddq_s16(q9s16, q4s16);
719 q10s16 = vqaddq_s16(q10s16, q5s16);
720 q11s16 = vqaddq_s16(q11s16, q6s16);
721 q12s16 = vqaddq_s16(q12s16, q7s16);
722
723 d26u8 = vqrshrun_n_s16(q8s16, 7);
724 d27u8 = vqrshrun_n_s16(q9s16, 7);
725 d28u8 = vqrshrun_n_s16(q10s16, 7);
726 d29u8 = vqrshrun_n_s16(q11s16, 7);
727 d30u8 = vqrshrun_n_s16(q12s16, 7);
728
729 // Second pass: 8x4
730 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
731 d0s8 = vdup_lane_s8(dtmps8, 0);
732 d1s8 = vdup_lane_s8(dtmps8, 1);
733 d2s8 = vdup_lane_s8(dtmps8, 2);
734 d3s8 = vdup_lane_s8(dtmps8, 3);
735 d4s8 = vdup_lane_s8(dtmps8, 4);
736 d5s8 = vdup_lane_s8(dtmps8, 5);
737 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
738 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
739 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
740 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
741 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
742 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
743
744 q3u16 = vmull_u8(d22u8, d0u8);
745 q4u16 = vmull_u8(d23u8, d0u8);
746 q5u16 = vmull_u8(d24u8, d0u8);
747 q6u16 = vmull_u8(d25u8, d0u8);
748
749 q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
750 q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
751 q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
752 q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
753
754 q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
755 q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
756 q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
757 q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
758
759 q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
760 q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
761 q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
762 q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
763
764 q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
765 q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
766 q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
767 q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
768
769 q7u16 = vmull_u8(d25u8, d3u8);
770 q8u16 = vmull_u8(d26u8, d3u8);
771 q9u16 = vmull_u8(d27u8, d3u8);
772 q10u16 = vmull_u8(d28u8, d3u8);
773
774 q3s16 = vreinterpretq_s16_u16(q3u16);
775 q4s16 = vreinterpretq_s16_u16(q4u16);
776 q5s16 = vreinterpretq_s16_u16(q5u16);
777 q6s16 = vreinterpretq_s16_u16(q6u16);
778 q7s16 = vreinterpretq_s16_u16(q7u16);
779 q8s16 = vreinterpretq_s16_u16(q8u16);
780 q9s16 = vreinterpretq_s16_u16(q9u16);
781 q10s16 = vreinterpretq_s16_u16(q10u16);
782
783 q7s16 = vqaddq_s16(q7s16, q3s16);
784 q8s16 = vqaddq_s16(q8s16, q4s16);
785 q9s16 = vqaddq_s16(q9s16, q5s16);
786 q10s16 = vqaddq_s16(q10s16, q6s16);
787
788 d6u8 = vqrshrun_n_s16(q7s16, 7);
789 d7u8 = vqrshrun_n_s16(q8s16, 7);
790 d8u8 = vqrshrun_n_s16(q9s16, 7);
791 d9u8 = vqrshrun_n_s16(q10s16, 7);
792
793 vst1_u8(dst_ptr, d6u8);
794 dst_ptr += dst_pitch;
795 vst1_u8(dst_ptr, d7u8);
796 dst_ptr += dst_pitch;
797 vst1_u8(dst_ptr, d8u8);
798 dst_ptr += dst_pitch;
799 vst1_u8(dst_ptr, d9u8);
800 return;
801 }
802
803 void vp8_sixtap_predict8x8_neon(
804 unsigned char *src_ptr,
805 int src_pixels_per_line,
806 int xoffset,
807 int yoffset,
808 unsigned char *dst_ptr,
809 int dst_pitch) {
810 unsigned char *src, *tmpp;
811 unsigned char tmp[64];
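// Scratch buffer for the two-pass case: the first-pass loop stores its
// first 8 rows (8x8 bytes) here, while the last 5 of the 13 horizontally
// filtered rows are computed afterwards and kept in registers (d26u8-d30u8).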
812 int i;
813 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
814 uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
815 uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
816 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
817 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
818 uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
819 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
820 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
821 uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
822
823 if (xoffset == 0) { // secondpass_filter8x8_only
824 // load second_pass filter
825 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
826 d0s8 = vdup_lane_s8(dtmps8, 0);
827 d1s8 = vdup_lane_s8(dtmps8, 1);
828 d2s8 = vdup_lane_s8(dtmps8, 2);
829 d3s8 = vdup_lane_s8(dtmps8, 3);
830 d4s8 = vdup_lane_s8(dtmps8, 4);
831 d5s8 = vdup_lane_s8(dtmps8, 5);
832 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
833 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
834 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
835 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
836 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
837 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
838
839 // load src data
840 src = src_ptr - src_pixels_per_line * 2;
841 d18u8 = vld1_u8(src);
842 src += src_pixels_per_line;
843 d19u8 = vld1_u8(src);
844 src += src_pixels_per_line;
845 d20u8 = vld1_u8(src);
846 src += src_pixels_per_line;
847 d21u8 = vld1_u8(src);
848 src += src_pixels_per_line;
849 d22u8 = vld1_u8(src);
850 src += src_pixels_per_line;
851 d23u8 = vld1_u8(src);
852 src += src_pixels_per_line;
853 d24u8 = vld1_u8(src);
854 src += src_pixels_per_line;
855 d25u8 = vld1_u8(src);
856 src += src_pixels_per_line;
857 d26u8 = vld1_u8(src);
858 src += src_pixels_per_line;
859 d27u8 = vld1_u8(src);
860 src += src_pixels_per_line;
861 d28u8 = vld1_u8(src);
862 src += src_pixels_per_line;
863 d29u8 = vld1_u8(src);
864 src += src_pixels_per_line;
865 d30u8 = vld1_u8(src);
866
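// d18-d30 now hold the 13 source rows needed to produce 8 output rows;
// each loop iteration filters 4 rows and slides the 9-row window down by 4.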
867 for (i = 2; i > 0; i--) {
868 q3u16 = vmull_u8(d18u8, d0u8);
869 q4u16 = vmull_u8(d19u8, d0u8);
870 q5u16 = vmull_u8(d20u8, d0u8);
871 q6u16 = vmull_u8(d21u8, d0u8);
872
873 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
874 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
875 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
876 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
877
878 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
879 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
880 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
881 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
882
883 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
884 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
885 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
886 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
887
888 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
889 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
890 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
891 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
892
893 q7u16 = vmull_u8(d21u8, d3u8);
894 q8u16 = vmull_u8(d22u8, d3u8);
895 q9u16 = vmull_u8(d23u8, d3u8);
896 q10u16 = vmull_u8(d24u8, d3u8);
897
898 q3s16 = vreinterpretq_s16_u16(q3u16);
899 q4s16 = vreinterpretq_s16_u16(q4u16);
900 q5s16 = vreinterpretq_s16_u16(q5u16);
901 q6s16 = vreinterpretq_s16_u16(q6u16);
902 q7s16 = vreinterpretq_s16_u16(q7u16);
903 q8s16 = vreinterpretq_s16_u16(q8u16);
904 q9s16 = vreinterpretq_s16_u16(q9u16);
905 q10s16 = vreinterpretq_s16_u16(q10u16);
906
907 q7s16 = vqaddq_s16(q7s16, q3s16);
908 q8s16 = vqaddq_s16(q8s16, q4s16);
909 q9s16 = vqaddq_s16(q9s16, q5s16);
910 q10s16 = vqaddq_s16(q10s16, q6s16);
911
912 d6u8 = vqrshrun_n_s16(q7s16, 7);
913 d7u8 = vqrshrun_n_s16(q8s16, 7);
914 d8u8 = vqrshrun_n_s16(q9s16, 7);
915 d9u8 = vqrshrun_n_s16(q10s16, 7);
916
917 d18u8 = d22u8;
918 d19u8 = d23u8;
919 d20u8 = d24u8;
920 d21u8 = d25u8;
921 d22u8 = d26u8;
922 d23u8 = d27u8;
923 d24u8 = d28u8;
924 d25u8 = d29u8;
925 d26u8 = d30u8;
926
927 vst1_u8(dst_ptr, d6u8);
928 dst_ptr += dst_pitch;
929 vst1_u8(dst_ptr, d7u8);
930 dst_ptr += dst_pitch;
931 vst1_u8(dst_ptr, d8u8);
932 dst_ptr += dst_pitch;
933 vst1_u8(dst_ptr, d9u8);
934 dst_ptr += dst_pitch;
935 }
936 return;
937 }
938
939 // load first_pass filter
940 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
941 d0s8 = vdup_lane_s8(dtmps8, 0);
942 d1s8 = vdup_lane_s8(dtmps8, 1);
943 d2s8 = vdup_lane_s8(dtmps8, 2);
944 d3s8 = vdup_lane_s8(dtmps8, 3);
945 d4s8 = vdup_lane_s8(dtmps8, 4);
946 d5s8 = vdup_lane_s8(dtmps8, 5);
947 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
948 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
949 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
950 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
951 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
952 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
953
954 // First pass: output_height lines x output_width columns (13x8)
955 if (yoffset == 0) // firstpass_filter8x8_only
956 src = src_ptr - 2;
957 else
958 src = src_ptr - 2 - (src_pixels_per_line * 2);
959
960 tmpp = tmp;
961 for (i = 2; i > 0; i--) {
962 q3u8 = vld1q_u8(src);
963 src += src_pixels_per_line;
964 q4u8 = vld1q_u8(src);
965 src += src_pixels_per_line;
966 q5u8 = vld1q_u8(src);
967 src += src_pixels_per_line;
968 q6u8 = vld1q_u8(src);
969 src += src_pixels_per_line;
970
971 __builtin_prefetch(src);
972 __builtin_prefetch(src + src_pixels_per_line);
973 __builtin_prefetch(src + src_pixels_per_line * 2);
974
975 q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
976 q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
977 q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
978 q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
979
980 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
981 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
982 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
983 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
984
985 q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
986 q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
987 q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
988 q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
989
990 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
991 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
992 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
993 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
994
995 q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
996 q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
997 q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
998 q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
999
1000 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
1001 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
1002 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
1003 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
1004
1005 q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
1006 q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
1007 q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
1008 q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
1009
1010 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
1011 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
1012 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
1013 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
1014
1015 q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
1016 q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
1017 q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
1018 q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
1019
1020 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
1021 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
1022 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
1023 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
1024
1025 q3u16 = vmull_u8(d28u8, d3u8);
1026 q4u16 = vmull_u8(d29u8, d3u8);
1027 q5u16 = vmull_u8(d30u8, d3u8);
1028 q6u16 = vmull_u8(d31u8, d3u8);
1029
1030 q3s16 = vreinterpretq_s16_u16(q3u16);
1031 q4s16 = vreinterpretq_s16_u16(q4u16);
1032 q5s16 = vreinterpretq_s16_u16(q5u16);
1033 q6s16 = vreinterpretq_s16_u16(q6u16);
1034 q7s16 = vreinterpretq_s16_u16(q7u16);
1035 q8s16 = vreinterpretq_s16_u16(q8u16);
1036 q9s16 = vreinterpretq_s16_u16(q9u16);
1037 q10s16 = vreinterpretq_s16_u16(q10u16);
1038
1039 q7s16 = vqaddq_s16(q7s16, q3s16);
1040 q8s16 = vqaddq_s16(q8s16, q4s16);
1041 q9s16 = vqaddq_s16(q9s16, q5s16);
1042 q10s16 = vqaddq_s16(q10s16, q6s16);
1043
1044 d22u8 = vqrshrun_n_s16(q7s16, 7);
1045 d23u8 = vqrshrun_n_s16(q8s16, 7);
1046 d24u8 = vqrshrun_n_s16(q9s16, 7);
1047 d25u8 = vqrshrun_n_s16(q10s16, 7);
1048
1049 if (yoffset == 0) { // firstpass_filter8x8_only
1050 vst1_u8(dst_ptr, d22u8);
1051 dst_ptr += dst_pitch;
1052 vst1_u8(dst_ptr, d23u8);
1053 dst_ptr += dst_pitch;
1054 vst1_u8(dst_ptr, d24u8);
1055 dst_ptr += dst_pitch;
1056 vst1_u8(dst_ptr, d25u8);
1057 dst_ptr += dst_pitch;
1058 } else {
1059 vst1_u8(tmpp, d22u8);
1060 tmpp += 8;
1061 vst1_u8(tmpp, d23u8);
1062 tmpp += 8;
1063 vst1_u8(tmpp, d24u8);
1064 tmpp += 8;
1065 vst1_u8(tmpp, d25u8);
1066 tmpp += 8;
1067 }
1068 }
1069 if (yoffset == 0)
1070 return;
1071
1072 // First pass on the remaining 5 rows (13 horizontally filtered rows are needed for the 6-tap vertical pass)
1073 q3u8 = vld1q_u8(src);
1074 src += src_pixels_per_line;
1075 q4u8 = vld1q_u8(src);
1076 src += src_pixels_per_line;
1077 q5u8 = vld1q_u8(src);
1078 src += src_pixels_per_line;
1079 q6u8 = vld1q_u8(src);
1080 src += src_pixels_per_line;
1081 q7u8 = vld1q_u8(src);
1082
1083 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
1084 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
1085 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
1086 q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
1087 q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
1088
1089 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
1090 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
1091 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
1092 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
1093 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
1094
1095 q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
1096 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
1097 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
1098 q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
1099 q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
1100
1101 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
1102 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
1103 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
1104 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
1105 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
1106
1107 q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
1108 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
1109 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
1110 q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
1111 q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
1112
1113 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
1114 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
1115 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
1116 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
1117 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
1118
1119 q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
1120 q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
1121 q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
1122 q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
1123 q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
1124
1125 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
1126 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
1127 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
1128 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
1129 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
1130
1131 q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
1132 q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
1133 q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
1134 q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
1135 q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
1136
1137 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
1138 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
1139 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
1140 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
1141 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
1142
1143 q3u16 = vmull_u8(d27u8, d3u8);
1144 q4u16 = vmull_u8(d28u8, d3u8);
1145 q5u16 = vmull_u8(d29u8, d3u8);
1146 q6u16 = vmull_u8(d30u8, d3u8);
1147 q7u16 = vmull_u8(d31u8, d3u8);
1148
1149 q3s16 = vreinterpretq_s16_u16(q3u16);
1150 q4s16 = vreinterpretq_s16_u16(q4u16);
1151 q5s16 = vreinterpretq_s16_u16(q5u16);
1152 q6s16 = vreinterpretq_s16_u16(q6u16);
1153 q7s16 = vreinterpretq_s16_u16(q7u16);
1154 q8s16 = vreinterpretq_s16_u16(q8u16);
1155 q9s16 = vreinterpretq_s16_u16(q9u16);
1156 q10s16 = vreinterpretq_s16_u16(q10u16);
1157 q11s16 = vreinterpretq_s16_u16(q11u16);
1158 q12s16 = vreinterpretq_s16_u16(q12u16);
1159
1160 q8s16 = vqaddq_s16(q8s16, q3s16);
1161 q9s16 = vqaddq_s16(q9s16, q4s16);
1162 q10s16 = vqaddq_s16(q10s16, q5s16);
1163 q11s16 = vqaddq_s16(q11s16, q6s16);
1164 q12s16 = vqaddq_s16(q12s16, q7s16);
1165
1166 d26u8 = vqrshrun_n_s16(q8s16, 7);
1167 d27u8 = vqrshrun_n_s16(q9s16, 7);
1168 d28u8 = vqrshrun_n_s16(q10s16, 7);
1169 d29u8 = vqrshrun_n_s16(q11s16, 7);
1170 d30u8 = vqrshrun_n_s16(q12s16, 7);
1171
1172 // Second pass: 8x8
1173 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1174 d0s8 = vdup_lane_s8(dtmps8, 0);
1175 d1s8 = vdup_lane_s8(dtmps8, 1);
1176 d2s8 = vdup_lane_s8(dtmps8, 2);
1177 d3s8 = vdup_lane_s8(dtmps8, 3);
1178 d4s8 = vdup_lane_s8(dtmps8, 4);
1179 d5s8 = vdup_lane_s8(dtmps8, 5);
1180 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1181 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1182 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1183 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1184 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1185 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1186
1187 tmpp = tmp;
1188 q9u8 = vld1q_u8(tmpp);
1189 tmpp += 16;
1190 q10u8 = vld1q_u8(tmpp);
1191 tmpp += 16;
1192 q11u8 = vld1q_u8(tmpp);
1193 tmpp += 16;
1194 q12u8 = vld1q_u8(tmpp);
1195
1196 d18u8 = vget_low_u8(q9u8);
1197 d19u8 = vget_high_u8(q9u8);
1198 d20u8 = vget_low_u8(q10u8);
1199 d21u8 = vget_high_u8(q10u8);
1200 d22u8 = vget_low_u8(q11u8);
1201 d23u8 = vget_high_u8(q11u8);
1202 d24u8 = vget_low_u8(q12u8);
1203 d25u8 = vget_high_u8(q12u8);
1204
1205 for (i = 2; i > 0; i--) {
1206 q3u16 = vmull_u8(d18u8, d0u8);
1207 q4u16 = vmull_u8(d19u8, d0u8);
1208 q5u16 = vmull_u8(d20u8, d0u8);
1209 q6u16 = vmull_u8(d21u8, d0u8);
1210
1211 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1212 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1213 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1214 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1215
1216 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1217 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1218 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1219 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1220
1221 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1222 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1223 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1224 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1225
1226 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1227 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1228 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1229 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1230
1231 q7u16 = vmull_u8(d21u8, d3u8);
1232 q8u16 = vmull_u8(d22u8, d3u8);
1233 q9u16 = vmull_u8(d23u8, d3u8);
1234 q10u16 = vmull_u8(d24u8, d3u8);
1235
1236 q3s16 = vreinterpretq_s16_u16(q3u16);
1237 q4s16 = vreinterpretq_s16_u16(q4u16);
1238 q5s16 = vreinterpretq_s16_u16(q5u16);
1239 q6s16 = vreinterpretq_s16_u16(q6u16);
1240 q7s16 = vreinterpretq_s16_u16(q7u16);
1241 q8s16 = vreinterpretq_s16_u16(q8u16);
1242 q9s16 = vreinterpretq_s16_u16(q9u16);
1243 q10s16 = vreinterpretq_s16_u16(q10u16);
1244
1245 q7s16 = vqaddq_s16(q7s16, q3s16);
1246 q8s16 = vqaddq_s16(q8s16, q4s16);
1247 q9s16 = vqaddq_s16(q9s16, q5s16);
1248 q10s16 = vqaddq_s16(q10s16, q6s16);
1249
1250 d6u8 = vqrshrun_n_s16(q7s16, 7);
1251 d7u8 = vqrshrun_n_s16(q8s16, 7);
1252 d8u8 = vqrshrun_n_s16(q9s16, 7);
1253 d9u8 = vqrshrun_n_s16(q10s16, 7);
1254
1255 d18u8 = d22u8;
1256 d19u8 = d23u8;
1257 d20u8 = d24u8;
1258 d21u8 = d25u8;
1259 d22u8 = d26u8;
1260 d23u8 = d27u8;
1261 d24u8 = d28u8;
1262 d25u8 = d29u8;
1263 d26u8 = d30u8;
1264
1265 vst1_u8(dst_ptr, d6u8);
1266 dst_ptr += dst_pitch;
1267 vst1_u8(dst_ptr, d7u8);
1268 dst_ptr += dst_pitch;
1269 vst1_u8(dst_ptr, d8u8);
1270 dst_ptr += dst_pitch;
1271 vst1_u8(dst_ptr, d9u8);
1272 dst_ptr += dst_pitch;
1273 }
1274 return;
1275 }
1276
1277 void vp8_sixtap_predict16x16_neon(
1278 unsigned char *src_ptr,
1279 int src_pixels_per_line,
1280 int xoffset,
1281 int yoffset,
1282 unsigned char *dst_ptr,
1283 int dst_pitch) {
1284 unsigned char *src, *src_tmp, *dst, *tmpp;
1285 unsigned char tmp[336];
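// Intermediate buffer for the two-pass case: 21 rows x 16 columns of
// horizontally filtered data (16 output rows plus the 5 extra rows the
// 6-tap vertical filter needs).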
1286 int i, j;
1287 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
1288 uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
1289 uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
1290 uint8x8_t d28u8, d29u8, d30u8, d31u8;
1291 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
1292 uint8x16_t q3u8, q4u8;
1293 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
1294 uint16x8_t q11u16, q12u16, q13u16, q15u16;
1295 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
1296 int16x8_t q11s16, q12s16, q13s16, q15s16;
1297
1298 if (xoffset == 0) { // secondpass_filter16x16_only
1299 // load second_pass filter
1300 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1301 d0s8 = vdup_lane_s8(dtmps8, 0);
1302 d1s8 = vdup_lane_s8(dtmps8, 1);
1303 d2s8 = vdup_lane_s8(dtmps8, 2);
1304 d3s8 = vdup_lane_s8(dtmps8, 3);
1305 d4s8 = vdup_lane_s8(dtmps8, 4);
1306 d5s8 = vdup_lane_s8(dtmps8, 5);
1307 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1308 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1309 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1310 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1311 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1312 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1313
1314 // load src data
1315 src_tmp = src_ptr - src_pixels_per_line * 2;
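// Filter the 16-wide block as two 8-pixel-wide halves; within each half
// the inner loop produces 4 output rows at a time from a sliding 9-row
// window held in d18-d26.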
1316 for (i = 0; i < 2; i++) {
1317 src = src_tmp + i * 8;
1318 dst = dst_ptr + i * 8;
1319 d18u8 = vld1_u8(src);
1320 src += src_pixels_per_line;
1321 d19u8 = vld1_u8(src);
1322 src += src_pixels_per_line;
1323 d20u8 = vld1_u8(src);
1324 src += src_pixels_per_line;
1325 d21u8 = vld1_u8(src);
1326 src += src_pixels_per_line;
1327 d22u8 = vld1_u8(src);
1328 src += src_pixels_per_line;
1329 for (j = 0; j < 4; j++) {
1330 d23u8 = vld1_u8(src);
1331 src += src_pixels_per_line;
1332 d24u8 = vld1_u8(src);
1333 src += src_pixels_per_line;
1334 d25u8 = vld1_u8(src);
1335 src += src_pixels_per_line;
1336 d26u8 = vld1_u8(src);
1337 src += src_pixels_per_line;
1338
1339 q3u16 = vmull_u8(d18u8, d0u8);
1340 q4u16 = vmull_u8(d19u8, d0u8);
1341 q5u16 = vmull_u8(d20u8, d0u8);
1342 q6u16 = vmull_u8(d21u8, d0u8);
1343
1344 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1345 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1346 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1347 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1348
1349 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1350 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1351 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1352 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1353
1354 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1355 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1356 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1357 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1358
1359 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1360 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1361 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1362 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1363
1364 q7u16 = vmull_u8(d21u8, d3u8);
1365 q8u16 = vmull_u8(d22u8, d3u8);
1366 q9u16 = vmull_u8(d23u8, d3u8);
1367 q10u16 = vmull_u8(d24u8, d3u8);
1368
1369 q3s16 = vreinterpretq_s16_u16(q3u16);
1370 q4s16 = vreinterpretq_s16_u16(q4u16);
1371 q5s16 = vreinterpretq_s16_u16(q5u16);
1372 q6s16 = vreinterpretq_s16_u16(q6u16);
1373 q7s16 = vreinterpretq_s16_u16(q7u16);
1374 q8s16 = vreinterpretq_s16_u16(q8u16);
1375 q9s16 = vreinterpretq_s16_u16(q9u16);
1376 q10s16 = vreinterpretq_s16_u16(q10u16);
1377
1378 q7s16 = vqaddq_s16(q7s16, q3s16);
1379 q8s16 = vqaddq_s16(q8s16, q4s16);
1380 q9s16 = vqaddq_s16(q9s16, q5s16);
1381 q10s16 = vqaddq_s16(q10s16, q6s16);
1382
1383 d6u8 = vqrshrun_n_s16(q7s16, 7);
1384 d7u8 = vqrshrun_n_s16(q8s16, 7);
1385 d8u8 = vqrshrun_n_s16(q9s16, 7);
1386 d9u8 = vqrshrun_n_s16(q10s16, 7);
1387
1388 d18u8 = d22u8;
1389 d19u8 = d23u8;
1390 d20u8 = d24u8;
1391 d21u8 = d25u8;
1392 d22u8 = d26u8;
1393
1394 vst1_u8(dst, d6u8);
1395 dst += dst_pitch;
1396 vst1_u8(dst, d7u8);
1397 dst += dst_pitch;
1398 vst1_u8(dst, d8u8);
1399 dst += dst_pitch;
1400 vst1_u8(dst, d9u8);
1401 dst += dst_pitch;
1402 }
1403 }
1404 return;
1405 }
1406
1407 // load first_pass filter
1408 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
1409 d0s8 = vdup_lane_s8(dtmps8, 0);
1410 d1s8 = vdup_lane_s8(dtmps8, 1);
1411 d2s8 = vdup_lane_s8(dtmps8, 2);
1412 d3s8 = vdup_lane_s8(dtmps8, 3);
1413 d4s8 = vdup_lane_s8(dtmps8, 4);
1414 d5s8 = vdup_lane_s8(dtmps8, 5);
1415 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1416 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1417 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1418 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1419 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1420 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1421
1422 // First pass: output_height lines x output_width columns (21x16)
1423 if (yoffset == 0) { // firstpass_filter16x16_only
1424 src = src_ptr - 2;
1425 dst = dst_ptr;
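// Horizontal-only path: each iteration filters two rows; 24 source bytes
// are loaded per row to cover the 21 source pixels the 6-tap filter needs
// for 16 outputs.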
1426 for (i = 0; i < 8; i++) {
1427 d6u8 = vld1_u8(src);
1428 d7u8 = vld1_u8(src + 8);
1429 d8u8 = vld1_u8(src + 16);
1430 src += src_pixels_per_line;
1431 d9u8 = vld1_u8(src);
1432 d10u8 = vld1_u8(src + 8);
1433 d11u8 = vld1_u8(src + 16);
1434 src += src_pixels_per_line;
1435
1436 __builtin_prefetch(src);
1437 __builtin_prefetch(src + src_pixels_per_line);
1438
1439 q6u16 = vmull_u8(d6u8, d0u8);
1440 q7u16 = vmull_u8(d7u8, d0u8);
1441 q8u16 = vmull_u8(d9u8, d0u8);
1442 q9u16 = vmull_u8(d10u8, d0u8);
1443
1444 d20u8 = vext_u8(d6u8, d7u8, 1);
1445 d21u8 = vext_u8(d9u8, d10u8, 1);
1446 d22u8 = vext_u8(d7u8, d8u8, 1);
1447 d23u8 = vext_u8(d10u8, d11u8, 1);
1448 d24u8 = vext_u8(d6u8, d7u8, 4);
1449 d25u8 = vext_u8(d9u8, d10u8, 4);
1450 d26u8 = vext_u8(d7u8, d8u8, 4);
1451 d27u8 = vext_u8(d10u8, d11u8, 4);
1452 d28u8 = vext_u8(d6u8, d7u8, 5);
1453 d29u8 = vext_u8(d9u8, d10u8, 5);
1454
1455 q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
1456 q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
1457 q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
1458 q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
1459 q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
1460 q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
1461 q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
1462 q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
1463 q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
1464 q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
1465
1466 d20u8 = vext_u8(d7u8, d8u8, 5);
1467 d21u8 = vext_u8(d10u8, d11u8, 5);
1468 d22u8 = vext_u8(d6u8, d7u8, 2);
1469 d23u8 = vext_u8(d9u8, d10u8, 2);
1470 d24u8 = vext_u8(d7u8, d8u8, 2);
1471 d25u8 = vext_u8(d10u8, d11u8, 2);
1472 d26u8 = vext_u8(d6u8, d7u8, 3);
1473 d27u8 = vext_u8(d9u8, d10u8, 3);
1474 d28u8 = vext_u8(d7u8, d8u8, 3);
1475 d29u8 = vext_u8(d10u8, d11u8, 3);
1476
1477 q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
1478 q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
1479 q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
1480 q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
1481 q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
1482 q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
1483
1484 q10u16 = vmull_u8(d26u8, d3u8);
1485 q11u16 = vmull_u8(d27u8, d3u8);
1486 q12u16 = vmull_u8(d28u8, d3u8);
1487 q15u16 = vmull_u8(d29u8, d3u8);
1488
1489 q6s16 = vreinterpretq_s16_u16(q6u16);
1490 q7s16 = vreinterpretq_s16_u16(q7u16);
1491 q8s16 = vreinterpretq_s16_u16(q8u16);
1492 q9s16 = vreinterpretq_s16_u16(q9u16);
1493 q10s16 = vreinterpretq_s16_u16(q10u16);
1494 q11s16 = vreinterpretq_s16_u16(q11u16);
1495 q12s16 = vreinterpretq_s16_u16(q12u16);
1496 q15s16 = vreinterpretq_s16_u16(q15u16);
1497
1498 q6s16 = vqaddq_s16(q6s16, q10s16);
1499 q8s16 = vqaddq_s16(q8s16, q11s16);
1500 q7s16 = vqaddq_s16(q7s16, q12s16);
1501 q9s16 = vqaddq_s16(q9s16, q15s16);
1502
1503 d6u8 = vqrshrun_n_s16(q6s16, 7);
1504 d7u8 = vqrshrun_n_s16(q7s16, 7);
1505 d8u8 = vqrshrun_n_s16(q8s16, 7);
1506 d9u8 = vqrshrun_n_s16(q9s16, 7);
1507
1508 q3u8 = vcombine_u8(d6u8, d7u8);
1509 q4u8 = vcombine_u8(d8u8, d9u8);
1510 vst1q_u8(dst, q3u8);
1511 dst += dst_pitch;
1512 vst1q_u8(dst, q4u8);
1513 dst += dst_pitch;
1514 }
1515 return;
1516 }
1517
1518 src = src_ptr - 2 - src_pixels_per_line * 2;
1519 tmpp = tmp;
1520 for (i = 0; i < 7; i++) {
1521 d6u8 = vld1_u8(src);
1522 d7u8 = vld1_u8(src + 8);
1523 d8u8 = vld1_u8(src + 16);
1524 src += src_pixels_per_line;
1525 d9u8 = vld1_u8(src);
1526 d10u8 = vld1_u8(src + 8);
1527 d11u8 = vld1_u8(src + 16);
1528 src += src_pixels_per_line;
1529 d12u8 = vld1_u8(src);
1530 d13u8 = vld1_u8(src + 8);
1531 d14u8 = vld1_u8(src + 16);
1532 src += src_pixels_per_line;
1533
1534 __builtin_prefetch(src);
1535 __builtin_prefetch(src + src_pixels_per_line);
1536 __builtin_prefetch(src + src_pixels_per_line * 2);
1537
1538 q8u16 = vmull_u8(d6u8, d0u8);
1539 q9u16 = vmull_u8(d7u8, d0u8);
1540 q10u16 = vmull_u8(d9u8, d0u8);
1541 q11u16 = vmull_u8(d10u8, d0u8);
1542 q12u16 = vmull_u8(d12u8, d0u8);
1543 q13u16 = vmull_u8(d13u8, d0u8);
1544
1545 d28u8 = vext_u8(d6u8, d7u8, 1);
1546 d29u8 = vext_u8(d9u8, d10u8, 1);
1547 d30u8 = vext_u8(d12u8, d13u8, 1);
1548 q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
1549 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
1550 q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
1551 d28u8 = vext_u8(d7u8, d8u8, 1);
1552 d29u8 = vext_u8(d10u8, d11u8, 1);
1553 d30u8 = vext_u8(d13u8, d14u8, 1);
1554 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
1555 q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
1556 q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
1557
1558 d28u8 = vext_u8(d6u8, d7u8, 4);
1559 d29u8 = vext_u8(d9u8, d10u8, 4);
1560 d30u8 = vext_u8(d12u8, d13u8, 4);
1561 q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
1562 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
1563 q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
1564 d28u8 = vext_u8(d7u8, d8u8, 4);
1565 d29u8 = vext_u8(d10u8, d11u8, 4);
1566 d30u8 = vext_u8(d13u8, d14u8, 4);
1567 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
1568 q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
1569 q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
1570
1571 d28u8 = vext_u8(d6u8, d7u8, 5);
1572 d29u8 = vext_u8(d9u8, d10u8, 5);
1573 d30u8 = vext_u8(d12u8, d13u8, 5);
1574 q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
1575 q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
1576 q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
1577 d28u8 = vext_u8(d7u8, d8u8, 5);
1578 d29u8 = vext_u8(d10u8, d11u8, 5);
1579 d30u8 = vext_u8(d13u8, d14u8, 5);
1580 q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
1581 q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
1582 q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
1583
1584 d28u8 = vext_u8(d6u8, d7u8, 2);
1585 d29u8 = vext_u8(d9u8, d10u8, 2);
1586 d30u8 = vext_u8(d12u8, d13u8, 2);
1587 q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
1588 q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
1589 q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
1590 d28u8 = vext_u8(d7u8, d8u8, 2);
1591 d29u8 = vext_u8(d10u8, d11u8, 2);
1592 d30u8 = vext_u8(d13u8, d14u8, 2);
1593 q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
1594 q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
1595 q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
1596
1597 d28u8 = vext_u8(d6u8, d7u8, 3);
1598 d29u8 = vext_u8(d9u8, d10u8, 3);
1599 d30u8 = vext_u8(d12u8, d13u8, 3);
1600 d15u8 = vext_u8(d7u8, d8u8, 3);
1601 d31u8 = vext_u8(d10u8, d11u8, 3);
1602 d6u8 = vext_u8(d13u8, d14u8, 3);
1603 q4u16 = vmull_u8(d28u8, d3u8);
1604 q5u16 = vmull_u8(d29u8, d3u8);
1605 q6u16 = vmull_u8(d30u8, d3u8);
1606 q4s16 = vreinterpretq_s16_u16(q4u16);
1607 q5s16 = vreinterpretq_s16_u16(q5u16);
1608 q6s16 = vreinterpretq_s16_u16(q6u16);
1609 q8s16 = vreinterpretq_s16_u16(q8u16);
1610 q10s16 = vreinterpretq_s16_u16(q10u16);
1611 q12s16 = vreinterpretq_s16_u16(q12u16);
1612 q8s16 = vqaddq_s16(q8s16, q4s16);
1613 q10s16 = vqaddq_s16(q10s16, q5s16);
1614 q12s16 = vqaddq_s16(q12s16, q6s16);
1615
1616 q6u16 = vmull_u8(d15u8, d3u8);
1617 q7u16 = vmull_u8(d31u8, d3u8);
1618 q3u16 = vmull_u8(d6u8, d3u8);
1619 q3s16 = vreinterpretq_s16_u16(q3u16);
1620 q6s16 = vreinterpretq_s16_u16(q6u16);
1621 q7s16 = vreinterpretq_s16_u16(q7u16);
1622 q9s16 = vreinterpretq_s16_u16(q9u16);
1623 q11s16 = vreinterpretq_s16_u16(q11u16);
1624 q13s16 = vreinterpretq_s16_u16(q13u16);
1625 q9s16 = vqaddq_s16(q9s16, q6s16);
1626 q11s16 = vqaddq_s16(q11s16, q7s16);
1627 q13s16 = vqaddq_s16(q13s16, q3s16);
1628
1629 d6u8 = vqrshrun_n_s16(q8s16, 7);
1630 d7u8 = vqrshrun_n_s16(q9s16, 7);
1631 d8u8 = vqrshrun_n_s16(q10s16, 7);
1632 d9u8 = vqrshrun_n_s16(q11s16, 7);
1633 d10u8 = vqrshrun_n_s16(q12s16, 7);
1634 d11u8 = vqrshrun_n_s16(q13s16, 7);
1635
1636 vst1_u8(tmpp, d6u8);
1637 tmpp += 8;
1638 vst1_u8(tmpp, d7u8);
1639 tmpp += 8;
1640 vst1_u8(tmpp, d8u8);
1641 tmpp += 8;
1642 vst1_u8(tmpp, d9u8);
1643 tmpp += 8;
1644 vst1_u8(tmpp, d10u8);
1645 tmpp += 8;
1646 vst1_u8(tmpp, d11u8);
1647 tmpp += 8;
1648 }
1649
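// tmp now holds 21 rows x 16 bytes of horizontally filtered data; the
// vertical pass reads it back as two 8-wide halves with a 16-byte row
// stride.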
1650 // Second pass: 16x16
1651 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1652 d0s8 = vdup_lane_s8(dtmps8, 0);
1653 d1s8 = vdup_lane_s8(dtmps8, 1);
1654 d2s8 = vdup_lane_s8(dtmps8, 2);
1655 d3s8 = vdup_lane_s8(dtmps8, 3);
1656 d4s8 = vdup_lane_s8(dtmps8, 4);
1657 d5s8 = vdup_lane_s8(dtmps8, 5);
1658 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1659 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1660 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1661 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1662 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1663 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1664
1665 for (i = 0; i < 2; i++) {
1666 dst = dst_ptr + 8 * i;
1667 tmpp = tmp + 8 * i;
1668 d18u8 = vld1_u8(tmpp);
1669 tmpp += 16;
1670 d19u8 = vld1_u8(tmpp);
1671 tmpp += 16;
1672 d20u8 = vld1_u8(tmpp);
1673 tmpp += 16;
1674 d21u8 = vld1_u8(tmpp);
1675 tmpp += 16;
1676 d22u8 = vld1_u8(tmpp);
1677 tmpp += 16;
1678 for (j = 0; j < 4; j++) {
1679 d23u8 = vld1_u8(tmpp);
1680 tmpp += 16;
1681 d24u8 = vld1_u8(tmpp);
1682 tmpp += 16;
1683 d25u8 = vld1_u8(tmpp);
1684 tmpp += 16;
1685 d26u8 = vld1_u8(tmpp);
1686 tmpp += 16;
1687
1688 q3u16 = vmull_u8(d18u8, d0u8);
1689 q4u16 = vmull_u8(d19u8, d0u8);
1690 q5u16 = vmull_u8(d20u8, d0u8);
1691 q6u16 = vmull_u8(d21u8, d0u8);
1692
1693 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1694 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1695 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1696 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1697
1698 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1699 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1700 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1701 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1702
1703 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1704 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1705 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1706 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1707
1708 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1709 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1710 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1711 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1712
1713 q7u16 = vmull_u8(d21u8, d3u8);
1714 q8u16 = vmull_u8(d22u8, d3u8);
1715 q9u16 = vmull_u8(d23u8, d3u8);
1716 q10u16 = vmull_u8(d24u8, d3u8);
1717
1718 q3s16 = vreinterpretq_s16_u16(q3u16);
1719 q4s16 = vreinterpretq_s16_u16(q4u16);
1720 q5s16 = vreinterpretq_s16_u16(q5u16);
1721 q6s16 = vreinterpretq_s16_u16(q6u16);
1722 q7s16 = vreinterpretq_s16_u16(q7u16);
1723 q8s16 = vreinterpretq_s16_u16(q8u16);
1724 q9s16 = vreinterpretq_s16_u16(q9u16);
1725 q10s16 = vreinterpretq_s16_u16(q10u16);
1726
1727 q7s16 = vqaddq_s16(q7s16, q3s16);
1728 q8s16 = vqaddq_s16(q8s16, q4s16);
1729 q9s16 = vqaddq_s16(q9s16, q5s16);
1730 q10s16 = vqaddq_s16(q10s16, q6s16);
1731
1732 d6u8 = vqrshrun_n_s16(q7s16, 7);
1733 d7u8 = vqrshrun_n_s16(q8s16, 7);
1734 d8u8 = vqrshrun_n_s16(q9s16, 7);
1735 d9u8 = vqrshrun_n_s16(q10s16, 7);
1736
1737 d18u8 = d22u8;
1738 d19u8 = d23u8;
1739 d20u8 = d24u8;
1740 d21u8 = d25u8;
1741 d22u8 = d26u8;
1742
1743 vst1_u8(dst, d6u8);
1744 dst += dst_pitch;
1745 vst1_u8(dst, d7u8);
1746 dst += dst_pitch;
1747 vst1_u8(dst, d8u8);
1748 dst += dst_pitch;
1749 vst1_u8(dst, d9u8);
1750 dst += dst_pitch;
1751 }
1752 }
1753 return;
1754 }
1755