1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
/* MSVC has no __builtin_prefetch(); define it away so prefetch hints
 * compile to nothing there. */
#ifdef _MSC_VER
#define __builtin_prefetch(x)
#endif
16
/* VP8 six-tap sub-pixel filter coefficients, one row per 1/8-pel position.
 * Only six taps are meaningful; the last two entries pad each row to eight
 * bytes so a whole row can be fetched with a single vld1_s8. Each row's
 * taps sum to 128, i.e. results carry 7 fractional bits. */
static const int8_t vp8_sub_pel_filters[8][8] = {
    {0, 0, 128, 0, 0, 0, 0, 0},     /* note that 1/8 pel positions are */
    {0, -6, 123, 12, -1, 0, 0, 0},  /* just as per alpha -0.5 bicubic */
    {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */
    {0, -9, 93, 50, -6, 0, 0, 0},
    {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */
    {0, -6, 50, 93, -9, 0, 0, 0},
    {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */
    {0, -1, 12, 123, -6, 0, 0, 0},
};
27
/* 4x4 six-tap sub-pixel prediction.
 *
 * Filters a 4x4 block with a horizontal six-tap filter selected by
 * xoffset, then a vertical six-tap filter selected by yoffset (both
 * index vp8_sub_pel_filters). An offset of 0 on either axis skips that
 * pass entirely (the dedicated early-return paths below).
 *
 *   src_ptr             - top-left source pixel of the block
 *   src_pixels_per_line - source stride in bytes
 *   xoffset, yoffset    - 1/8-pel filter indices, 0..7
 *   dst_ptr             - destination, written 4 bytes per row
 *   dst_pitch           - destination stride in bytes
 *
 * Filter taps are stored signed; the code takes their magnitudes
 * (vabs_s8) and applies the sign by choosing vmlal (add) for positive
 * taps and vmlsl (subtract) for the negative taps 1 and 4. Partial sums
 * are kept in two u16 accumulator groups and combined with a saturating
 * s16 add; vqrshrun_n_s16(.., 7) then divides by 128 with rounding and
 * saturates back to u8. */
void vp8_sixtap_predict4x4_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    unsigned char *src;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8;
    uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
    uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8;
    uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64;
    uint32x2x2_t d0u32x2, d1u32x2;

    if (xoffset == 0) { // secondpass_filter4x4_only
        // Zero-init before vld1_lane: loading a single lane into an
        // otherwise-uninitialized vector would be undefined.
        uint32x2_t d27u32 = vdup_n_u32(0);
        uint32x2_t d28u32 = vdup_n_u32(0);
        uint32x2_t d29u32 = vdup_n_u32(0);
        uint32x2_t d30u32 = vdup_n_u32(0);
        uint32x2_t d31u32 = vdup_n_u32(0);

        // load second_pass filter
        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
        d0s8 = vdup_lane_s8(dtmps8, 0);
        d1s8 = vdup_lane_s8(dtmps8, 1);
        d2s8 = vdup_lane_s8(dtmps8, 2);
        d3s8 = vdup_lane_s8(dtmps8, 3);
        d4s8 = vdup_lane_s8(dtmps8, 4);
        d5s8 = vdup_lane_s8(dtmps8, 5);
        // Tap magnitudes; signs are applied via vmlal/vmlsl below.
        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

        // load src data: 9 rows of 4 pixels starting two rows above the
        // block (the vertical 6-tap needs 2 rows of context either side),
        // packed two 4-pixel rows per 64-bit register.
        src = src_ptr - src_pixels_per_line * 2;
        d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0);
        src += src_pixels_per_line;
        d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1);
        src += src_pixels_per_line;
        d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0);
        src += src_pixels_per_line;
        d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1);
        src += src_pixels_per_line;
        d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0);
        src += src_pixels_per_line;
        d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1);
        src += src_pixels_per_line;
        d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0);
        src += src_pixels_per_line;
        d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1);
        src += src_pixels_per_line;
        d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0);

        d27u8 = vreinterpret_u8_u32(d27u32);
        d28u8 = vreinterpret_u8_u32(d28u32);
        d29u8 = vreinterpret_u8_u32(d29u32);
        d30u8 = vreinterpret_u8_u32(d30u32);
        d31u8 = vreinterpret_u8_u32(d31u32);

        // Shift by 4 bytes = one 4-pixel row: d23..d26 hold the rows
        // offset by one relative to d27..d30, giving the odd taps'
        // source rows.
        d23u8 = vext_u8(d27u8, d28u8, 4);
        d24u8 = vext_u8(d28u8, d29u8, 4);
        d25u8 = vext_u8(d29u8, d30u8, 4);
        d26u8 = vext_u8(d30u8, d31u8, 4);

        // Taps 0 and 5 start the two accumulator groups.
        q3u16 = vmull_u8(d27u8, d0u8);
        q4u16 = vmull_u8(d28u8, d0u8);
        q5u16 = vmull_u8(d25u8, d5u8);
        q6u16 = vmull_u8(d26u8, d5u8);

        // Taps 4 and 1 have negative coefficients: subtract.
        q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
        q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
        q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
        q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);

        // Taps 2 and 3: add.
        q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
        q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
        q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
        q6u16 = vmlal_u8(q6u16, d25u8, d3u8);

        q3s16 = vreinterpretq_s16_u16(q3u16);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);

        // Combine the two partial-sum groups with saturation.
        q5s16 = vqaddq_s16(q5s16, q3s16);
        q6s16 = vqaddq_s16(q6s16, q4s16);

        // Round, shift by 7 (taps sum to 128) and narrow to u8.
        d3u8 = vqrshrun_n_s16(q5s16, 7);
        d4u8 = vqrshrun_n_s16(q6s16, 7);

        // Store 4 bytes per destination row (two rows per D register).
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
        dst_ptr += dst_pitch;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
        dst_ptr += dst_pitch;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
        dst_ptr += dst_pitch;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
        return;
    }

    // load first_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // First pass: output_height lines x output_width columns (9x4)

    // -2 gives two pixels of left context for the horizontal 6-tap;
    // when a second pass follows, also back up two rows for its
    // vertical context.
    if (yoffset == 0) // firstpass_filter4x4_only
        src = src_ptr - 2;
    else
        src = src_ptr - 2 - (src_pixels_per_line * 2);

    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);
    src += src_pixels_per_line;

    // Pixels at horizontal offset 5 for tap 5.
    d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);

    // vswp here: pack rows 0/1 into q3 and rows 2/3 into q5, one
    // 8-pixel row per 64-bit lane.
    q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
    q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));

    // vzip's val[0] packs the low 4 pixels of two rows into one D
    // register, so each vmull/vmlal below filters two rows at once.
    d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
                       vreinterpret_u32_u8(d19u8));
    d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
                       vreinterpret_u32_u8(d21u8));
    q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
    q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);

    // keep original src data in q4 q6
    q4u64 = vreinterpretq_u64_u8(q3u8);
    q6u64 = vreinterpretq_u64_u8(q5u8);

    // Offset 0 (tap 0).
    d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
                       vreinterpret_u32_u8(vget_high_u8(q3u8)));
    d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
                       vreinterpret_u32_u8(vget_high_u8(q5u8)));
    // vshrq_n_u64 by 8*k bits shifts each packed row right by k pixels
    // (little-endian), producing the pixels at horizontal offset k.
    q9u64 = vshrq_n_u64(q4u64, 8);    // offset 1
    q10u64 = vshrq_n_u64(q6u64, 8);
    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);

    // Offset 1 (tap 1, negative).
    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),  // d18 d19
                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
    q3u64 = vshrq_n_u64(q4u64, 32);   // offset 4
    q5u64 = vshrq_n_u64(q6u64, 32);
    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);

    // Offset 4 (tap 4, negative).
    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
    q9u64 = vshrq_n_u64(q4u64, 16);   // offset 2
    q10u64 = vshrq_n_u64(q6u64, 16);
    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);

    // Offset 2 (tap 2).
    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),  // d18 d19
                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
    q3u64 = vshrq_n_u64(q4u64, 24);   // offset 3
    q5u64 = vshrq_n_u64(q6u64, 24);
    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);

    // Offset 3 (tap 3) starts the second accumulator group.
    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
    q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
    q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);

    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q7s16 = vqaddq_s16(q7s16, q9s16);
    q8s16 = vqaddq_s16(q8s16, q10s16);

    // First-pass rows 0..3, two rows per D register.
    d27u8 = vqrshrun_n_s16(q7s16, 7);
    d28u8 = vqrshrun_n_s16(q8s16, 7);

    if (yoffset == 0) { // firstpass_filter4x4_only
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0);
        dst_ptr += dst_pitch;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1);
        dst_ptr += dst_pitch;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
        dst_ptr += dst_pitch;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
        return;
    }

    // First Pass on rest 5-line data
    // (same horizontal filter on the remaining rows; the 9th row is
    // handled separately in q11u8/q12u16 since rows pair up unevenly).
    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q11u8 = vld1q_u8(src);

    d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);

    // vswp here
    q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
    q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));

    d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
                       vreinterpret_u32_u8(d19u8));
    d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
                       vreinterpret_u32_u8(d21u8));
    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5);
    q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
    q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
    q12u16 = vmull_u8(d31u8, d5u8);

    q4u64 = vreinterpretq_u64_u8(q3u8);
    q6u64 = vreinterpretq_u64_u8(q5u8);

    d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
                       vreinterpret_u32_u8(vget_high_u8(q3u8)));
    d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
                       vreinterpret_u32_u8(vget_high_u8(q5u8)));
    q9u64 = vshrq_n_u64(q4u64, 8);
    q10u64 = vshrq_n_u64(q6u64, 8);
    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
    q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8);

    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),  // d18 d19
                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
    q3u64 = vshrq_n_u64(q4u64, 32);
    q5u64 = vshrq_n_u64(q6u64, 32);
    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1);
    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);

    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
    q9u64 = vshrq_n_u64(q4u64, 16);
    q10u64 = vshrq_n_u64(q6u64, 16);
    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4);
    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);

    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),  // d18 d19
                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
    q3u64 = vshrq_n_u64(q4u64, 24);
    q5u64 = vshrq_n_u64(q6u64, 24);
    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2);
    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);

    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3);
    q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
    q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
    q11u16 = vmull_u8(d31u8, d3u8);

    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q11s16 = vreinterpretq_s16_u16(q11u16);
    q12s16 = vreinterpretq_s16_u16(q12u16);
    q7s16 = vqaddq_s16(q7s16, q9s16);
    q8s16 = vqaddq_s16(q8s16, q10s16);
    q12s16 = vqaddq_s16(q12s16, q11s16);

    // First-pass rows 4..8: d29/d30 hold two rows each, d31 the last.
    d29u8 = vqrshrun_n_s16(q7s16, 7);
    d30u8 = vqrshrun_n_s16(q8s16, 7);
    d31u8 = vqrshrun_n_s16(q12s16, 7);

    // Second pass: 4x4 vertical filter over the 9 intermediate rows
    // held in d27..d31 (same scheme as the secondpass-only path above).
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    d23u8 = vext_u8(d27u8, d28u8, 4);
    d24u8 = vext_u8(d28u8, d29u8, 4);
    d25u8 = vext_u8(d29u8, d30u8, 4);
    d26u8 = vext_u8(d30u8, d31u8, 4);

    q3u16 = vmull_u8(d27u8, d0u8);
    q4u16 = vmull_u8(d28u8, d0u8);
    q5u16 = vmull_u8(d25u8, d5u8);
    q6u16 = vmull_u8(d26u8, d5u8);

    q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
    q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
    q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
    q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);

    q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
    q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
    q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
    q6u16 = vmlal_u8(q6u16, d25u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);

    q5s16 = vqaddq_s16(q5s16, q3s16);
    q6s16 = vqaddq_s16(q6s16, q4s16);

    d3u8 = vqrshrun_n_s16(q5s16, 7);
    d4u8 = vqrshrun_n_s16(q6s16, 7);

    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
    dst_ptr += dst_pitch;
    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
    dst_ptr += dst_pitch;
    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
    dst_ptr += dst_pitch;
    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
    return;
}
404
/* 8x4 six-tap sub-pixel prediction.
 *
 * Same two-pass scheme as the 4x4 variant: horizontal six-tap filter
 * selected by xoffset, then vertical six-tap filter selected by
 * yoffset; an offset of 0 skips that pass. Rows are a full 8 pixels
 * wide here, so each D register holds one row and the horizontal
 * neighbors at offsets 1..5 come directly from vext_u8 — no u64-shift
 * packing tricks are needed.
 *
 *   src_ptr             - top-left source pixel of the block
 *   src_pixels_per_line - source stride in bytes
 *   xoffset, yoffset    - 1/8-pel filter indices, 0..7
 *   dst_ptr             - destination, written 8 bytes per row
 *   dst_pitch           - destination stride in bytes */
void vp8_sixtap_predict8x4_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    unsigned char *src;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
    uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;

    if (xoffset == 0) { // secondpass_filter8x4_only
        // load second_pass filter
        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
        d0s8 = vdup_lane_s8(dtmps8, 0);
        d1s8 = vdup_lane_s8(dtmps8, 1);
        d2s8 = vdup_lane_s8(dtmps8, 2);
        d3s8 = vdup_lane_s8(dtmps8, 3);
        d4s8 = vdup_lane_s8(dtmps8, 4);
        d5s8 = vdup_lane_s8(dtmps8, 5);
        // Tap magnitudes; signs are applied via vmlal/vmlsl below.
        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

        // load src data: 9 rows of 8 pixels starting two rows above the
        // block (vertical 6-tap context).
        src = src_ptr - src_pixels_per_line * 2;
        d22u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d23u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d24u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d25u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d26u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d27u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d28u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d29u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d30u8 = vld1_u8(src);

        // Vertical filter for output rows 0..3: tap 0 starts the sums,
        q3u16 = vmull_u8(d22u8, d0u8);
        q4u16 = vmull_u8(d23u8, d0u8);
        q5u16 = vmull_u8(d24u8, d0u8);
        q6u16 = vmull_u8(d25u8, d0u8);

        // taps 1 and 4 have negative coefficients: subtract,
        q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
        q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
        q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
        q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);

        q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
        q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
        q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
        q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);

        // taps 2 and 5: add,
        q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
        q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
        q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
        q6u16 = vmlal_u8(q6u16, d27u8, d2u8);

        q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
        q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
        q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
        q6u16 = vmlal_u8(q6u16, d30u8, d5u8);

        // and tap 3 goes into a second accumulator group.
        q7u16 = vmull_u8(d25u8, d3u8);
        q8u16 = vmull_u8(d26u8, d3u8);
        q9u16 = vmull_u8(d27u8, d3u8);
        q10u16 = vmull_u8(d28u8, d3u8);

        q3s16 = vreinterpretq_s16_u16(q3u16);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);

        // Combine the two groups with saturation.
        q7s16 = vqaddq_s16(q7s16, q3s16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q9s16 = vqaddq_s16(q9s16, q5s16);
        q10s16 = vqaddq_s16(q10s16, q6s16);

        // Round, shift by 7 (taps sum to 128) and narrow to u8.
        d6u8 = vqrshrun_n_s16(q7s16, 7);
        d7u8 = vqrshrun_n_s16(q8s16, 7);
        d8u8 = vqrshrun_n_s16(q9s16, 7);
        d9u8 = vqrshrun_n_s16(q10s16, 7);

        vst1_u8(dst_ptr, d6u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d7u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d8u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d9u8);
        return;
    }

    // load first_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // First pass: output_height lines x output_width columns (9x4)
    // -2 gives two pixels of left context for the horizontal 6-tap;
    // when a second pass follows, also back up two rows for it.
    if (yoffset == 0) // firstpass_filter8x4_only
        src = src_ptr - 2;
    else
        src = src_ptr - 2 - (src_pixels_per_line * 2);
    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);

    // Horizontal filter on 4 rows: offset 0 (tap 0) starts the sums.
    q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);

    // Offset 1 (tap 1, negative).
    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);

    q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
    q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
    q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);

    // Offset 4 (tap 4, negative).
    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);

    q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
    q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
    q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);

    // Offset 2 (tap 2).
    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);

    q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
    q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
    q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d31u8, d2u8);

    // Offset 5 (tap 5).
    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);

    q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
    q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
    q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d31u8, d5u8);

    // Offset 3 (tap 3) starts the second accumulator group.
    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);

    q3u16 = vmull_u8(d28u8, d3u8);
    q4u16 = vmull_u8(d29u8, d3u8);
    q5u16 = vmull_u8(d30u8, d3u8);
    q6u16 = vmull_u8(d31u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    // First-pass rows 0..3.
    d22u8 = vqrshrun_n_s16(q7s16, 7);
    d23u8 = vqrshrun_n_s16(q8s16, 7);
    d24u8 = vqrshrun_n_s16(q9s16, 7);
    d25u8 = vqrshrun_n_s16(q10s16, 7);

    if (yoffset == 0) { // firstpass_filter8x4_only
        vst1_u8(dst_ptr, d22u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d23u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d24u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d25u8);
        return;
    }

    // First Pass on rest 5-line data
    // (same horizontal filter, producing intermediate rows 4..8).
    src += src_pixels_per_line;
    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q7u8 = vld1q_u8(src);

    q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);

    q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);

    q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);

    q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);

    q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
    q12u16 = vmlal_u8(q12u16, d31u8, d5u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);

    q3u16 = vmull_u8(d27u8, d3u8);
    q4u16 = vmull_u8(d28u8, d3u8);
    q5u16 = vmull_u8(d29u8, d3u8);
    q6u16 = vmull_u8(d30u8, d3u8);
    q7u16 = vmull_u8(d31u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q11s16 = vreinterpretq_s16_u16(q11u16);
    q12s16 = vreinterpretq_s16_u16(q12u16);

    q8s16 = vqaddq_s16(q8s16, q3s16);
    q9s16 = vqaddq_s16(q9s16, q4s16);
    q10s16 = vqaddq_s16(q10s16, q5s16);
    q11s16 = vqaddq_s16(q11s16, q6s16);
    q12s16 = vqaddq_s16(q12s16, q7s16);

    // First-pass rows 4..8.
    d26u8 = vqrshrun_n_s16(q8s16, 7);
    d27u8 = vqrshrun_n_s16(q9s16, 7);
    d28u8 = vqrshrun_n_s16(q10s16, 7);
    d29u8 = vqrshrun_n_s16(q11s16, 7);
    d30u8 = vqrshrun_n_s16(q12s16, 7);

    // Second pass: 8x4 vertical filter over the 9 intermediate rows
    // d22..d30 (same tap scheme as the secondpass-only path above).
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    q3u16 = vmull_u8(d22u8, d0u8);
    q4u16 = vmull_u8(d23u8, d0u8);
    q5u16 = vmull_u8(d24u8, d0u8);
    q6u16 = vmull_u8(d25u8, d0u8);

    q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
    q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
    q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
    q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);

    q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
    q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
    q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
    q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);

    q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
    q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
    q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
    q6u16 = vmlal_u8(q6u16, d27u8, d2u8);

    q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
    q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
    q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
    q6u16 = vmlal_u8(q6u16, d30u8, d5u8);

    q7u16 = vmull_u8(d25u8, d3u8);
    q8u16 = vmull_u8(d26u8, d3u8);
    q9u16 = vmull_u8(d27u8, d3u8);
    q10u16 = vmull_u8(d28u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d6u8 = vqrshrun_n_s16(q7s16, 7);
    d7u8 = vqrshrun_n_s16(q8s16, 7);
    d8u8 = vqrshrun_n_s16(q9s16, 7);
    d9u8 = vqrshrun_n_s16(q10s16, 7);

    vst1_u8(dst_ptr, d6u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d7u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d8u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d9u8);
    return;
}
805
vp8_sixtap_predict8x8_neon(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)806 void vp8_sixtap_predict8x8_neon(
807 unsigned char *src_ptr,
808 int src_pixels_per_line,
809 int xoffset,
810 int yoffset,
811 unsigned char *dst_ptr,
812 int dst_pitch) {
813 unsigned char *src, *tmpp;
814 unsigned char tmp[64];
815 int i;
816 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
817 uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
818 uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
819 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
820 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
821 uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
822 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
823 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
824 uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
825
826 if (xoffset == 0) { // secondpass_filter8x8_only
827 // load second_pass filter
828 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
829 d0s8 = vdup_lane_s8(dtmps8, 0);
830 d1s8 = vdup_lane_s8(dtmps8, 1);
831 d2s8 = vdup_lane_s8(dtmps8, 2);
832 d3s8 = vdup_lane_s8(dtmps8, 3);
833 d4s8 = vdup_lane_s8(dtmps8, 4);
834 d5s8 = vdup_lane_s8(dtmps8, 5);
835 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
836 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
837 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
838 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
839 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
840 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
841
842 // load src data
843 src = src_ptr - src_pixels_per_line * 2;
844 d18u8 = vld1_u8(src);
845 src += src_pixels_per_line;
846 d19u8 = vld1_u8(src);
847 src += src_pixels_per_line;
848 d20u8 = vld1_u8(src);
849 src += src_pixels_per_line;
850 d21u8 = vld1_u8(src);
851 src += src_pixels_per_line;
852 d22u8 = vld1_u8(src);
853 src += src_pixels_per_line;
854 d23u8 = vld1_u8(src);
855 src += src_pixels_per_line;
856 d24u8 = vld1_u8(src);
857 src += src_pixels_per_line;
858 d25u8 = vld1_u8(src);
859 src += src_pixels_per_line;
860 d26u8 = vld1_u8(src);
861 src += src_pixels_per_line;
862 d27u8 = vld1_u8(src);
863 src += src_pixels_per_line;
864 d28u8 = vld1_u8(src);
865 src += src_pixels_per_line;
866 d29u8 = vld1_u8(src);
867 src += src_pixels_per_line;
868 d30u8 = vld1_u8(src);
869
870 for (i = 2; i > 0; i--) {
871 q3u16 = vmull_u8(d18u8, d0u8);
872 q4u16 = vmull_u8(d19u8, d0u8);
873 q5u16 = vmull_u8(d20u8, d0u8);
874 q6u16 = vmull_u8(d21u8, d0u8);
875
876 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
877 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
878 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
879 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
880
881 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
882 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
883 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
884 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
885
886 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
887 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
888 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
889 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
890
891 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
892 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
893 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
894 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
895
896 q7u16 = vmull_u8(d21u8, d3u8);
897 q8u16 = vmull_u8(d22u8, d3u8);
898 q9u16 = vmull_u8(d23u8, d3u8);
899 q10u16 = vmull_u8(d24u8, d3u8);
900
901 q3s16 = vreinterpretq_s16_u16(q3u16);
902 q4s16 = vreinterpretq_s16_u16(q4u16);
903 q5s16 = vreinterpretq_s16_u16(q5u16);
904 q6s16 = vreinterpretq_s16_u16(q6u16);
905 q7s16 = vreinterpretq_s16_u16(q7u16);
906 q8s16 = vreinterpretq_s16_u16(q8u16);
907 q9s16 = vreinterpretq_s16_u16(q9u16);
908 q10s16 = vreinterpretq_s16_u16(q10u16);
909
910 q7s16 = vqaddq_s16(q7s16, q3s16);
911 q8s16 = vqaddq_s16(q8s16, q4s16);
912 q9s16 = vqaddq_s16(q9s16, q5s16);
913 q10s16 = vqaddq_s16(q10s16, q6s16);
914
915 d6u8 = vqrshrun_n_s16(q7s16, 7);
916 d7u8 = vqrshrun_n_s16(q8s16, 7);
917 d8u8 = vqrshrun_n_s16(q9s16, 7);
918 d9u8 = vqrshrun_n_s16(q10s16, 7);
919
920 d18u8 = d22u8;
921 d19u8 = d23u8;
922 d20u8 = d24u8;
923 d21u8 = d25u8;
924 d22u8 = d26u8;
925 d23u8 = d27u8;
926 d24u8 = d28u8;
927 d25u8 = d29u8;
928 d26u8 = d30u8;
929
930 vst1_u8(dst_ptr, d6u8);
931 dst_ptr += dst_pitch;
932 vst1_u8(dst_ptr, d7u8);
933 dst_ptr += dst_pitch;
934 vst1_u8(dst_ptr, d8u8);
935 dst_ptr += dst_pitch;
936 vst1_u8(dst_ptr, d9u8);
937 dst_ptr += dst_pitch;
938 }
939 return;
940 }
941
942 // load first_pass filter
943 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
944 d0s8 = vdup_lane_s8(dtmps8, 0);
945 d1s8 = vdup_lane_s8(dtmps8, 1);
946 d2s8 = vdup_lane_s8(dtmps8, 2);
947 d3s8 = vdup_lane_s8(dtmps8, 3);
948 d4s8 = vdup_lane_s8(dtmps8, 4);
949 d5s8 = vdup_lane_s8(dtmps8, 5);
950 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
951 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
952 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
953 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
954 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
955 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
956
957 // First pass: output_height lines x output_width columns (9x4)
958 if (yoffset == 0) // firstpass_filter4x4_only
959 src = src_ptr - 2;
960 else
961 src = src_ptr - 2 - (src_pixels_per_line * 2);
962
963 tmpp = tmp;
964 for (i = 2; i > 0; i--) {
965 q3u8 = vld1q_u8(src);
966 src += src_pixels_per_line;
967 q4u8 = vld1q_u8(src);
968 src += src_pixels_per_line;
969 q5u8 = vld1q_u8(src);
970 src += src_pixels_per_line;
971 q6u8 = vld1q_u8(src);
972 src += src_pixels_per_line;
973
974 __builtin_prefetch(src);
975 __builtin_prefetch(src + src_pixels_per_line);
976 __builtin_prefetch(src + src_pixels_per_line * 2);
977
978 q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
979 q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
980 q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
981 q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
982
983 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
984 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
985 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
986 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
987
988 q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
989 q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
990 q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
991 q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
992
993 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
994 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
995 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
996 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
997
998 q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
999 q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
1000 q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
1001 q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
1002
1003 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
1004 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
1005 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
1006 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
1007
1008 q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
1009 q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
1010 q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
1011 q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
1012
1013 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
1014 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
1015 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
1016 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
1017
1018 q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
1019 q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
1020 q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
1021 q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
1022
1023 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
1024 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
1025 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
1026 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
1027
1028 q3u16 = vmull_u8(d28u8, d3u8);
1029 q4u16 = vmull_u8(d29u8, d3u8);
1030 q5u16 = vmull_u8(d30u8, d3u8);
1031 q6u16 = vmull_u8(d31u8, d3u8);
1032
1033 q3s16 = vreinterpretq_s16_u16(q3u16);
1034 q4s16 = vreinterpretq_s16_u16(q4u16);
1035 q5s16 = vreinterpretq_s16_u16(q5u16);
1036 q6s16 = vreinterpretq_s16_u16(q6u16);
1037 q7s16 = vreinterpretq_s16_u16(q7u16);
1038 q8s16 = vreinterpretq_s16_u16(q8u16);
1039 q9s16 = vreinterpretq_s16_u16(q9u16);
1040 q10s16 = vreinterpretq_s16_u16(q10u16);
1041
1042 q7s16 = vqaddq_s16(q7s16, q3s16);
1043 q8s16 = vqaddq_s16(q8s16, q4s16);
1044 q9s16 = vqaddq_s16(q9s16, q5s16);
1045 q10s16 = vqaddq_s16(q10s16, q6s16);
1046
1047 d22u8 = vqrshrun_n_s16(q7s16, 7);
1048 d23u8 = vqrshrun_n_s16(q8s16, 7);
1049 d24u8 = vqrshrun_n_s16(q9s16, 7);
1050 d25u8 = vqrshrun_n_s16(q10s16, 7);
1051
1052 if (yoffset == 0) { // firstpass_filter8x4_only
1053 vst1_u8(dst_ptr, d22u8);
1054 dst_ptr += dst_pitch;
1055 vst1_u8(dst_ptr, d23u8);
1056 dst_ptr += dst_pitch;
1057 vst1_u8(dst_ptr, d24u8);
1058 dst_ptr += dst_pitch;
1059 vst1_u8(dst_ptr, d25u8);
1060 dst_ptr += dst_pitch;
1061 } else {
1062 vst1_u8(tmpp, d22u8);
1063 tmpp += 8;
1064 vst1_u8(tmpp, d23u8);
1065 tmpp += 8;
1066 vst1_u8(tmpp, d24u8);
1067 tmpp += 8;
1068 vst1_u8(tmpp, d25u8);
1069 tmpp += 8;
1070 }
1071 }
1072 if (yoffset == 0)
1073 return;
1074
1075 // First Pass on rest 5-line data
1076 q3u8 = vld1q_u8(src);
1077 src += src_pixels_per_line;
1078 q4u8 = vld1q_u8(src);
1079 src += src_pixels_per_line;
1080 q5u8 = vld1q_u8(src);
1081 src += src_pixels_per_line;
1082 q6u8 = vld1q_u8(src);
1083 src += src_pixels_per_line;
1084 q7u8 = vld1q_u8(src);
1085
1086 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
1087 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
1088 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
1089 q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
1090 q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
1091
1092 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
1093 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
1094 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
1095 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
1096 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
1097
1098 q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
1099 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
1100 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
1101 q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
1102 q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
1103
1104 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
1105 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
1106 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
1107 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
1108 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
1109
1110 q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
1111 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
1112 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
1113 q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
1114 q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
1115
1116 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
1117 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
1118 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
1119 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
1120 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
1121
1122 q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
1123 q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
1124 q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
1125 q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
1126 q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
1127
1128 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
1129 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
1130 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
1131 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
1132 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
1133
1134 q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
1135 q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
1136 q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
1137 q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
1138 q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
1139
1140 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
1141 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
1142 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
1143 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
1144 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
1145
1146 q3u16 = vmull_u8(d27u8, d3u8);
1147 q4u16 = vmull_u8(d28u8, d3u8);
1148 q5u16 = vmull_u8(d29u8, d3u8);
1149 q6u16 = vmull_u8(d30u8, d3u8);
1150 q7u16 = vmull_u8(d31u8, d3u8);
1151
1152 q3s16 = vreinterpretq_s16_u16(q3u16);
1153 q4s16 = vreinterpretq_s16_u16(q4u16);
1154 q5s16 = vreinterpretq_s16_u16(q5u16);
1155 q6s16 = vreinterpretq_s16_u16(q6u16);
1156 q7s16 = vreinterpretq_s16_u16(q7u16);
1157 q8s16 = vreinterpretq_s16_u16(q8u16);
1158 q9s16 = vreinterpretq_s16_u16(q9u16);
1159 q10s16 = vreinterpretq_s16_u16(q10u16);
1160 q11s16 = vreinterpretq_s16_u16(q11u16);
1161 q12s16 = vreinterpretq_s16_u16(q12u16);
1162
1163 q8s16 = vqaddq_s16(q8s16, q3s16);
1164 q9s16 = vqaddq_s16(q9s16, q4s16);
1165 q10s16 = vqaddq_s16(q10s16, q5s16);
1166 q11s16 = vqaddq_s16(q11s16, q6s16);
1167 q12s16 = vqaddq_s16(q12s16, q7s16);
1168
1169 d26u8 = vqrshrun_n_s16(q8s16, 7);
1170 d27u8 = vqrshrun_n_s16(q9s16, 7);
1171 d28u8 = vqrshrun_n_s16(q10s16, 7);
1172 d29u8 = vqrshrun_n_s16(q11s16, 7);
1173 d30u8 = vqrshrun_n_s16(q12s16, 7);
1174
1175 // Second pass: 8x8
1176 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1177 d0s8 = vdup_lane_s8(dtmps8, 0);
1178 d1s8 = vdup_lane_s8(dtmps8, 1);
1179 d2s8 = vdup_lane_s8(dtmps8, 2);
1180 d3s8 = vdup_lane_s8(dtmps8, 3);
1181 d4s8 = vdup_lane_s8(dtmps8, 4);
1182 d5s8 = vdup_lane_s8(dtmps8, 5);
1183 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1184 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1185 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1186 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1187 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1188 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1189
1190 tmpp = tmp;
1191 q9u8 = vld1q_u8(tmpp);
1192 tmpp += 16;
1193 q10u8 = vld1q_u8(tmpp);
1194 tmpp += 16;
1195 q11u8 = vld1q_u8(tmpp);
1196 tmpp += 16;
1197 q12u8 = vld1q_u8(tmpp);
1198
1199 d18u8 = vget_low_u8(q9u8);
1200 d19u8 = vget_high_u8(q9u8);
1201 d20u8 = vget_low_u8(q10u8);
1202 d21u8 = vget_high_u8(q10u8);
1203 d22u8 = vget_low_u8(q11u8);
1204 d23u8 = vget_high_u8(q11u8);
1205 d24u8 = vget_low_u8(q12u8);
1206 d25u8 = vget_high_u8(q12u8);
1207
1208 for (i = 2; i > 0; i--) {
1209 q3u16 = vmull_u8(d18u8, d0u8);
1210 q4u16 = vmull_u8(d19u8, d0u8);
1211 q5u16 = vmull_u8(d20u8, d0u8);
1212 q6u16 = vmull_u8(d21u8, d0u8);
1213
1214 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1215 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1216 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1217 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1218
1219 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1220 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1221 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1222 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1223
1224 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1225 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1226 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1227 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1228
1229 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1230 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1231 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1232 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1233
1234 q7u16 = vmull_u8(d21u8, d3u8);
1235 q8u16 = vmull_u8(d22u8, d3u8);
1236 q9u16 = vmull_u8(d23u8, d3u8);
1237 q10u16 = vmull_u8(d24u8, d3u8);
1238
1239 q3s16 = vreinterpretq_s16_u16(q3u16);
1240 q4s16 = vreinterpretq_s16_u16(q4u16);
1241 q5s16 = vreinterpretq_s16_u16(q5u16);
1242 q6s16 = vreinterpretq_s16_u16(q6u16);
1243 q7s16 = vreinterpretq_s16_u16(q7u16);
1244 q8s16 = vreinterpretq_s16_u16(q8u16);
1245 q9s16 = vreinterpretq_s16_u16(q9u16);
1246 q10s16 = vreinterpretq_s16_u16(q10u16);
1247
1248 q7s16 = vqaddq_s16(q7s16, q3s16);
1249 q8s16 = vqaddq_s16(q8s16, q4s16);
1250 q9s16 = vqaddq_s16(q9s16, q5s16);
1251 q10s16 = vqaddq_s16(q10s16, q6s16);
1252
1253 d6u8 = vqrshrun_n_s16(q7s16, 7);
1254 d7u8 = vqrshrun_n_s16(q8s16, 7);
1255 d8u8 = vqrshrun_n_s16(q9s16, 7);
1256 d9u8 = vqrshrun_n_s16(q10s16, 7);
1257
1258 d18u8 = d22u8;
1259 d19u8 = d23u8;
1260 d20u8 = d24u8;
1261 d21u8 = d25u8;
1262 d22u8 = d26u8;
1263 d23u8 = d27u8;
1264 d24u8 = d28u8;
1265 d25u8 = d29u8;
1266 d26u8 = d30u8;
1267
1268 vst1_u8(dst_ptr, d6u8);
1269 dst_ptr += dst_pitch;
1270 vst1_u8(dst_ptr, d7u8);
1271 dst_ptr += dst_pitch;
1272 vst1_u8(dst_ptr, d8u8);
1273 dst_ptr += dst_pitch;
1274 vst1_u8(dst_ptr, d9u8);
1275 dst_ptr += dst_pitch;
1276 }
1277 return;
1278 }
1279
/*
 * 6-tap sub-pixel prediction of a 16x16 block (NEON intrinsics).
 *
 * src_ptr             - source pixels; when an offset is non-zero the filter
 *                       reads up to 2 rows/columns before and 3 after the
 *                       nominal block.
 * src_pixels_per_line - source stride in bytes.
 * xoffset             - horizontal sub-pel position, index into
 *                       vp8_sub_pel_filters (0 = no horizontal filtering).
 * yoffset             - vertical sub-pel position, index into
 *                       vp8_sub_pel_filters (0 = no vertical filtering).
 * dst_ptr             - destination block.
 * dst_pitch           - destination stride in bytes.
 *
 * When both offsets are non-zero, the horizontal first pass writes 21 rows x
 * 16 columns of intermediate values to tmp[] (21 * 16 = 336 bytes), and the
 * vertical second pass reduces tmp[] to the 16x16 output, processed as two
 * 8-pixel-wide halves.
 *
 * As in the other block sizes, taps 1 and 4 are negative in
 * vp8_sub_pel_filters, so |tap| is taken (vabs) and applied with vmlsl;
 * the center tap (tap 3) is kept as a separate product and combined with a
 * saturating add (vqaddq_s16) to avoid intermediate overflow.
 */
void vp8_sixtap_predict16x16_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    unsigned char *src, *src_tmp, *dst, *tmpp;
    unsigned char tmp[336];  /* 21 rows x 16 cols of first-pass output */
    int i, j;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
    uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
    uint8x8_t d28u8, d29u8, d30u8, d31u8;
    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    uint8x16_t q3u8, q4u8;
    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
    uint16x8_t q11u16, q12u16, q13u16, q15u16;
    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
    int16x8_t q11s16, q12s16, q13s16, q15s16;

    if (xoffset == 0) {  // secondpass_filter16x16_only
        // load second_pass filter; d0u8..d5u8 hold |tap| broadcast per lane
        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
        d0s8 = vdup_lane_s8(dtmps8, 0);
        d1s8 = vdup_lane_s8(dtmps8, 1);
        d2s8 = vdup_lane_s8(dtmps8, 2);
        d3s8 = vdup_lane_s8(dtmps8, 3);
        d4s8 = vdup_lane_s8(dtmps8, 4);
        d5s8 = vdup_lane_s8(dtmps8, 5);
        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

        // load src data: filter each 8-pixel-wide half independently,
        // starting 2 rows above the block
        src_tmp = src_ptr - src_pixels_per_line * 2;
        for (i = 0; i < 2; i++) {  // left half, then right half
            src = src_tmp + i * 8;
            dst = dst_ptr + i * 8;
            // prime the 5-row sliding window
            d18u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d19u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d20u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d21u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d22u8 = vld1_u8(src);
            src += src_pixels_per_line;
            for (j = 0; j < 4; j++) {  // 4 output rows per iteration
                d23u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d24u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d25u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d26u8 = vld1_u8(src);
                src += src_pixels_per_line;

                q3u16 = vmull_u8(d18u8, d0u8);
                q4u16 = vmull_u8(d19u8, d0u8);
                q5u16 = vmull_u8(d20u8, d0u8);
                q6u16 = vmull_u8(d21u8, d0u8);

                q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
                q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
                q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
                q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

                q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
                q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
                q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
                q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

                q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
                q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
                q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
                q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

                q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
                q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
                q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
                q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

                // center tap product, merged with a saturating add below
                q7u16 = vmull_u8(d21u8, d3u8);
                q8u16 = vmull_u8(d22u8, d3u8);
                q9u16 = vmull_u8(d23u8, d3u8);
                q10u16 = vmull_u8(d24u8, d3u8);

                q3s16 = vreinterpretq_s16_u16(q3u16);
                q4s16 = vreinterpretq_s16_u16(q4u16);
                q5s16 = vreinterpretq_s16_u16(q5u16);
                q6s16 = vreinterpretq_s16_u16(q6u16);
                q7s16 = vreinterpretq_s16_u16(q7u16);
                q8s16 = vreinterpretq_s16_u16(q8u16);
                q9s16 = vreinterpretq_s16_u16(q9u16);
                q10s16 = vreinterpretq_s16_u16(q10u16);

                q7s16 = vqaddq_s16(q7s16, q3s16);
                q8s16 = vqaddq_s16(q8s16, q4s16);
                q9s16 = vqaddq_s16(q9s16, q5s16);
                q10s16 = vqaddq_s16(q10s16, q6s16);

                // round, shift by 7, saturate to unsigned 8-bit
                d6u8 = vqrshrun_n_s16(q7s16, 7);
                d7u8 = vqrshrun_n_s16(q8s16, 7);
                d8u8 = vqrshrun_n_s16(q9s16, 7);
                d9u8 = vqrshrun_n_s16(q10s16, 7);

                // rotate the row window down by 4 rows
                d18u8 = d22u8;
                d19u8 = d23u8;
                d20u8 = d24u8;
                d21u8 = d25u8;
                d22u8 = d26u8;

                vst1_u8(dst, d6u8);
                dst += dst_pitch;
                vst1_u8(dst, d7u8);
                dst += dst_pitch;
                vst1_u8(dst, d8u8);
                dst += dst_pitch;
                vst1_u8(dst, d9u8);
                dst += dst_pitch;
            }
        }
        return;
    }

    // load first_pass (horizontal) filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // First pass only (yoffset == 0): horizontally filter 16 rows x 16
    // columns straight into dst, 2 rows per iteration
    if (yoffset == 0) {  // firstpass_filter16x16_only
        src = src_ptr - 2;
        dst = dst_ptr;
        for (i = 0; i < 8; i++) {
            // 24 source bytes per row cover 16 outputs + 5 extra taps
            d6u8 = vld1_u8(src);
            d7u8 = vld1_u8(src + 8);
            d8u8 = vld1_u8(src + 16);
            src += src_pixels_per_line;
            d9u8 = vld1_u8(src);
            d10u8 = vld1_u8(src + 8);
            d11u8 = vld1_u8(src + 16);
            src += src_pixels_per_line;

            __builtin_prefetch(src);
            __builtin_prefetch(src + src_pixels_per_line);

            q6u16 = vmull_u8(d6u8, d0u8);
            q7u16 = vmull_u8(d7u8, d0u8);
            q8u16 = vmull_u8(d9u8, d0u8);
            q9u16 = vmull_u8(d10u8, d0u8);

            d20u8 = vext_u8(d6u8, d7u8, 1);
            d21u8 = vext_u8(d9u8, d10u8, 1);
            d22u8 = vext_u8(d7u8, d8u8, 1);
            d23u8 = vext_u8(d10u8, d11u8, 1);
            d24u8 = vext_u8(d6u8, d7u8, 4);
            d25u8 = vext_u8(d9u8, d10u8, 4);
            d26u8 = vext_u8(d7u8, d8u8, 4);
            d27u8 = vext_u8(d10u8, d11u8, 4);
            d28u8 = vext_u8(d6u8, d7u8, 5);
            d29u8 = vext_u8(d9u8, d10u8, 5);

            q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
            q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
            q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
            q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
            q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
            q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
            q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
            q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
            q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
            q8u16 = vmlal_u8(q8u16, d29u8, d5u8);

            d20u8 = vext_u8(d7u8, d8u8, 5);
            d21u8 = vext_u8(d10u8, d11u8, 5);
            d22u8 = vext_u8(d6u8, d7u8, 2);
            d23u8 = vext_u8(d9u8, d10u8, 2);
            d24u8 = vext_u8(d7u8, d8u8, 2);
            d25u8 = vext_u8(d10u8, d11u8, 2);
            d26u8 = vext_u8(d6u8, d7u8, 3);
            d27u8 = vext_u8(d9u8, d10u8, 3);
            d28u8 = vext_u8(d7u8, d8u8, 3);
            d29u8 = vext_u8(d10u8, d11u8, 3);

            q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
            q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
            q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
            q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
            q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
            q9u16 = vmlal_u8(q9u16, d25u8, d2u8);

            // center tap products, combined with saturating adds
            q10u16 = vmull_u8(d26u8, d3u8);
            q11u16 = vmull_u8(d27u8, d3u8);
            q12u16 = vmull_u8(d28u8, d3u8);
            q15u16 = vmull_u8(d29u8, d3u8);

            q6s16 = vreinterpretq_s16_u16(q6u16);
            q7s16 = vreinterpretq_s16_u16(q7u16);
            q8s16 = vreinterpretq_s16_u16(q8u16);
            q9s16 = vreinterpretq_s16_u16(q9u16);
            q10s16 = vreinterpretq_s16_u16(q10u16);
            q11s16 = vreinterpretq_s16_u16(q11u16);
            q12s16 = vreinterpretq_s16_u16(q12u16);
            q15s16 = vreinterpretq_s16_u16(q15u16);

            q6s16 = vqaddq_s16(q6s16, q10s16);
            q8s16 = vqaddq_s16(q8s16, q11s16);
            q7s16 = vqaddq_s16(q7s16, q12s16);
            q9s16 = vqaddq_s16(q9s16, q15s16);

            d6u8 = vqrshrun_n_s16(q6s16, 7);
            d7u8 = vqrshrun_n_s16(q7s16, 7);
            d8u8 = vqrshrun_n_s16(q8s16, 7);
            d9u8 = vqrshrun_n_s16(q9s16, 7);

            // two full 16-pixel output rows
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);
            vst1q_u8(dst, q3u8);
            dst += dst_pitch;
            vst1q_u8(dst, q4u8);
            dst += dst_pitch;
        }
        return;
    }

    // First pass for the two-pass case: horizontally filter 21 rows x 16
    // columns into tmp[], 3 rows per iteration (7 * 3 = 21)
    src = src_ptr - 2 - src_pixels_per_line * 2;
    tmpp = tmp;
    for (i = 0; i < 7; i++) {
        d6u8 = vld1_u8(src);
        d7u8 = vld1_u8(src + 8);
        d8u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;
        d9u8 = vld1_u8(src);
        d10u8 = vld1_u8(src + 8);
        d11u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;
        d12u8 = vld1_u8(src);
        d13u8 = vld1_u8(src + 8);
        d14u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;

        __builtin_prefetch(src);
        __builtin_prefetch(src + src_pixels_per_line);
        __builtin_prefetch(src + src_pixels_per_line * 2);

        q8u16 = vmull_u8(d6u8, d0u8);
        q9u16 = vmull_u8(d7u8, d0u8);
        q10u16 = vmull_u8(d9u8, d0u8);
        q11u16 = vmull_u8(d10u8, d0u8);
        q12u16 = vmull_u8(d12u8, d0u8);
        q13u16 = vmull_u8(d13u8, d0u8);

        d28u8 = vext_u8(d6u8, d7u8, 1);
        d29u8 = vext_u8(d9u8, d10u8, 1);
        d30u8 = vext_u8(d12u8, d13u8, 1);
        q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
        q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
        q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
        d28u8 = vext_u8(d7u8, d8u8, 1);
        d29u8 = vext_u8(d10u8, d11u8, 1);
        d30u8 = vext_u8(d13u8, d14u8, 1);
        q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
        q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
        q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);

        d28u8 = vext_u8(d6u8, d7u8, 4);
        d29u8 = vext_u8(d9u8, d10u8, 4);
        d30u8 = vext_u8(d12u8, d13u8, 4);
        q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
        q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
        q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
        d28u8 = vext_u8(d7u8, d8u8, 4);
        d29u8 = vext_u8(d10u8, d11u8, 4);
        d30u8 = vext_u8(d13u8, d14u8, 4);
        q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
        q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
        q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);

        d28u8 = vext_u8(d6u8, d7u8, 5);
        d29u8 = vext_u8(d9u8, d10u8, 5);
        d30u8 = vext_u8(d12u8, d13u8, 5);
        q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
        q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
        q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
        d28u8 = vext_u8(d7u8, d8u8, 5);
        d29u8 = vext_u8(d10u8, d11u8, 5);
        d30u8 = vext_u8(d13u8, d14u8, 5);
        q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
        q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
        q13u16 = vmlal_u8(q13u16, d30u8, d5u8);

        d28u8 = vext_u8(d6u8, d7u8, 2);
        d29u8 = vext_u8(d9u8, d10u8, 2);
        d30u8 = vext_u8(d12u8, d13u8, 2);
        q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
        q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
        q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
        d28u8 = vext_u8(d7u8, d8u8, 2);
        d29u8 = vext_u8(d10u8, d11u8, 2);
        d30u8 = vext_u8(d13u8, d14u8, 2);
        q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
        q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
        q13u16 = vmlal_u8(q13u16, d30u8, d2u8);

        // center tap windows; d6u8 is reused as scratch here since the
        // row data it held has been fully consumed
        d28u8 = vext_u8(d6u8, d7u8, 3);
        d29u8 = vext_u8(d9u8, d10u8, 3);
        d30u8 = vext_u8(d12u8, d13u8, 3);
        d15u8 = vext_u8(d7u8, d8u8, 3);
        d31u8 = vext_u8(d10u8, d11u8, 3);
        d6u8 = vext_u8(d13u8, d14u8, 3);
        q4u16 = vmull_u8(d28u8, d3u8);
        q5u16 = vmull_u8(d29u8, d3u8);
        q6u16 = vmull_u8(d30u8, d3u8);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);
        q12s16 = vreinterpretq_s16_u16(q12u16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q10s16 = vqaddq_s16(q10s16, q5s16);
        q12s16 = vqaddq_s16(q12s16, q6s16);

        q6u16 = vmull_u8(d15u8, d3u8);
        q7u16 = vmull_u8(d31u8, d3u8);
        q3u16 = vmull_u8(d6u8, d3u8);
        q3s16 = vreinterpretq_s16_u16(q3u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q11s16 = vreinterpretq_s16_u16(q11u16);
        q13s16 = vreinterpretq_s16_u16(q13u16);
        q9s16 = vqaddq_s16(q9s16, q6s16);
        q11s16 = vqaddq_s16(q11s16, q7s16);
        q13s16 = vqaddq_s16(q13s16, q3s16);

        d6u8 = vqrshrun_n_s16(q8s16, 7);
        d7u8 = vqrshrun_n_s16(q9s16, 7);
        d8u8 = vqrshrun_n_s16(q10s16, 7);
        d9u8 = vqrshrun_n_s16(q11s16, 7);
        d10u8 = vqrshrun_n_s16(q12s16, 7);
        d11u8 = vqrshrun_n_s16(q13s16, 7);

        // store 3 filtered 16-pixel rows (left+right halves) to tmp[]
        vst1_u8(tmpp, d6u8);
        tmpp += 8;
        vst1_u8(tmpp, d7u8);
        tmpp += 8;
        vst1_u8(tmpp, d8u8);
        tmpp += 8;
        vst1_u8(tmpp, d9u8);
        tmpp += 8;
        vst1_u8(tmpp, d10u8);
        tmpp += 8;
        vst1_u8(tmpp, d11u8);
        tmpp += 8;
    }

    // Second pass: vertically filter the 21x16 intermediate down to 16x16
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    for (i = 0; i < 2; i++) {  // left half, then right half (stride 16 in tmp)
        dst = dst_ptr + 8 * i;
        tmpp = tmp + 8 * i;
        // prime the 5-row sliding window
        d18u8 = vld1_u8(tmpp);
        tmpp += 16;
        d19u8 = vld1_u8(tmpp);
        tmpp += 16;
        d20u8 = vld1_u8(tmpp);
        tmpp += 16;
        d21u8 = vld1_u8(tmpp);
        tmpp += 16;
        d22u8 = vld1_u8(tmpp);
        tmpp += 16;
        for (j = 0; j < 4; j++) {  // 4 output rows per iteration
            d23u8 = vld1_u8(tmpp);
            tmpp += 16;
            d24u8 = vld1_u8(tmpp);
            tmpp += 16;
            d25u8 = vld1_u8(tmpp);
            tmpp += 16;
            d26u8 = vld1_u8(tmpp);
            tmpp += 16;

            q3u16 = vmull_u8(d18u8, d0u8);
            q4u16 = vmull_u8(d19u8, d0u8);
            q5u16 = vmull_u8(d20u8, d0u8);
            q6u16 = vmull_u8(d21u8, d0u8);

            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

            q7u16 = vmull_u8(d21u8, d3u8);
            q8u16 = vmull_u8(d22u8, d3u8);
            q9u16 = vmull_u8(d23u8, d3u8);
            q10u16 = vmull_u8(d24u8, d3u8);

            q3s16 = vreinterpretq_s16_u16(q3u16);
            q4s16 = vreinterpretq_s16_u16(q4u16);
            q5s16 = vreinterpretq_s16_u16(q5u16);
            q6s16 = vreinterpretq_s16_u16(q6u16);
            q7s16 = vreinterpretq_s16_u16(q7u16);
            q8s16 = vreinterpretq_s16_u16(q8u16);
            q9s16 = vreinterpretq_s16_u16(q9u16);
            q10s16 = vreinterpretq_s16_u16(q10u16);

            q7s16 = vqaddq_s16(q7s16, q3s16);
            q8s16 = vqaddq_s16(q8s16, q4s16);
            q9s16 = vqaddq_s16(q9s16, q5s16);
            q10s16 = vqaddq_s16(q10s16, q6s16);

            d6u8 = vqrshrun_n_s16(q7s16, 7);
            d7u8 = vqrshrun_n_s16(q8s16, 7);
            d8u8 = vqrshrun_n_s16(q9s16, 7);
            d9u8 = vqrshrun_n_s16(q10s16, 7);

            // rotate the row window down by 4 rows
            d18u8 = d22u8;
            d19u8 = d23u8;
            d20u8 = d24u8;
            d21u8 = d25u8;
            d22u8 = d26u8;

            vst1_u8(dst, d6u8);
            dst += dst_pitch;
            vst1_u8(dst, d7u8);
            dst += dst_pitch;
            vst1_u8(dst, d8u8);
            dst += dst_pitch;
            vst1_u8(dst, d9u8);
            dst += dst_pitch;
        }
    }
    return;
}
1758