/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"

// Per-lane clamp((63 + w * weight) >> 7): widen to 16 bits, multiply and
// accumulate onto the rounding term 63, then shift right by 7 with a
// saturating narrow back to signed 8 bits.
static INLINE int8x16_t mbloop_filter_tap_neon(int8x16_t w, int8x8_t weight) {
    const int16x8_t rounding = vdupq_n_s16(63);
    const int16x8_t lo = vmlal_s8(rounding, vget_low_s8(w), weight);
    const int16x8_t hi = vmlal_s8(rounding, vget_high_s8(w), weight);

    return vcombine_s8(vqshrn_n_s16(lo, 7), vqshrn_n_s16(hi, 7));
}

// Macroblock-edge loop filter for 16 independent pixel columns.
//
// Each vector holds one tap position for 16 lanes; the edge lies between
// q6 (p0) and q7 (q0).  q3 (p3) and q10 (q3) only contribute to the filter
// mask and are never modified.  The six filtered taps are written through
// q4r..q9r, in the order q9r, q8r, q7r, q6r, q5r, q4r.
//
// Lanes whose mask is zero receive their input value back unchanged,
// because the filter value is ANDed with the mask before being applied.
static INLINE void vp8_mbloop_filter_neon(
        uint8x16_t qblimit,  // mblimit
        uint8x16_t qlimit,   // limit
        uint8x16_t qthresh,  // thresh
        uint8x16_t q3,       // p3
        uint8x16_t q4,       // p2
        uint8x16_t q5,       // p1
        uint8x16_t q6,       // p0
        uint8x16_t q7,       // q0
        uint8x16_t q8,       // q1
        uint8x16_t q9,       // q2
        uint8x16_t q10,      // q3
        uint8x16_t *q4r,     // p2
        uint8x16_t *q5r,     // p1
        uint8x16_t *q6r,     // p0
        uint8x16_t *q7r,     // q0
        uint8x16_t *q8r,     // q1
        uint8x16_t *q9r) {   // q2
    const uint8x16_t sign_bit = vdupq_n_u8(0x80);

    // Absolute differences between neighbouring taps.
    const uint8x16_t abs_p3_p2 = vabdq_u8(q3, q4);
    const uint8x16_t abs_p2_p1 = vabdq_u8(q4, q5);
    const uint8x16_t abs_p1_p0 = vabdq_u8(q5, q6);
    const uint8x16_t abs_q1_q0 = vabdq_u8(q8, q7);
    const uint8x16_t abs_q2_q1 = vabdq_u8(q9, q8);
    const uint8x16_t abs_q3_q2 = vabdq_u8(q10, q9);
    const uint8x16_t abs_p0_q0 = vabdq_u8(q6, q7);
    const uint8x16_t abs_p1_q1 = vabdq_u8(q5, q8);

    // Filter mask: every neighbour step is <= limit, and
    // |p0 - q0| * 2 + |p1 - q1| / 2 (saturating) is <= mblimit.
    const uint8x16_t max_step =
        vmaxq_u8(vmaxq_u8(vmaxq_u8(abs_p3_p2, abs_p2_p1),
                          vmaxq_u8(abs_p1_p0, abs_q1_q0)),
                 vmaxq_u8(abs_q2_q1, abs_q3_q2));
    const uint8x16_t edge_step =
        vqaddq_u8(vqaddq_u8(abs_p0_q0, abs_p0_q0),
                  vshrq_n_u8(abs_p1_q1, 1));
    const uint8x16_t mask = vandq_u8(vcgeq_u8(qlimit, max_step),
                                     vcgeq_u8(qblimit, edge_step));

    // vp8_hevmask: |p1 - p0| > thresh or |q1 - q0| > thresh.
    const uint8x16_t hev = vorrq_u8(vcgtq_u8(abs_p1_p0, qthresh),
                                    vcgtq_u8(abs_q1_q0, qthresh));

    // Convert the six filtered taps to signed by flipping the top bit.
    const int8x16_t ps2 = vreinterpretq_s8_u8(veorq_u8(q4, sign_bit));
    const int8x16_t ps1 = vreinterpretq_s8_u8(veorq_u8(q5, sign_bit));
    const int8x16_t ps0 = vreinterpretq_s8_u8(veorq_u8(q6, sign_bit));
    const int8x16_t qs0 = vreinterpretq_s8_u8(veorq_u8(q7, sign_bit));
    const int8x16_t qs1 = vreinterpretq_s8_u8(veorq_u8(q8, sign_bit));
    const int8x16_t qs2 = vreinterpretq_s8_u8(veorq_u8(q9, sign_bit));

    // vp8_filter = clamp(clamp(ps1 - qs1) + 3 * (qs0 - ps0)), computed in
    // 16 bits so the multiply cannot overflow, then masked.
    const int8x16_t p1_minus_q1 = vqsubq_s8(ps1, qs1);
    const int16x8_t three = vdupq_n_s16(3);
    const int16x8_t sum_lo =
        vaddw_s8(vmulq_s16(vsubl_s8(vget_low_s8(qs0), vget_low_s8(ps0)),
                           three),
                 vget_low_s8(p1_minus_q1));
    const int16x8_t sum_hi =
        vaddw_s8(vmulq_s16(vsubl_s8(vget_high_s8(qs0), vget_high_s8(ps0)),
                           three),
                 vget_high_s8(p1_minus_q1));
    const int8x16_t filter =
        vandq_s8(vcombine_s8(vqmovn_s16(sum_lo), vqmovn_s16(sum_hi)),
                 vreinterpretq_s8_u8(mask));

    // hev lanes: adjust only p0/q0, by (f + 3) >> 3 and (f + 4) >> 3.
    const int8x16_t filter_hev = vandq_s8(filter, vreinterpretq_s8_u8(hev));
    const int8x16_t q0_step =
        vshrq_n_s8(vqaddq_s8(filter_hev, vdupq_n_s8(4)), 3);
    const int8x16_t p0_step =
        vshrq_n_s8(vqaddq_s8(filter_hev, vdupq_n_s8(3)), 3);
    const int8x16_t qs0_hev = vqsubq_s8(qs0, q0_step);
    const int8x16_t ps0_hev = vqaddq_s8(ps0, p0_step);

    // Non-hev lanes: spread the filter over three taps per side with
    // weights 27, 18 and 9 (each out of 128, rounded).
    const int8x16_t filter_wide = vbicq_s8(filter, vreinterpretq_s8_u8(hev));
    const int8x16_t u9  = mbloop_filter_tap_neon(filter_wide, vdup_n_s8(9));
    const int8x16_t u18 = mbloop_filter_tap_neon(filter_wide, vdup_n_s8(18));
    const int8x16_t u27 = mbloop_filter_tap_neon(filter_wide, vdup_n_s8(27));

    // Apply the adjustments and convert back to unsigned.
    *q9r = veorq_u8(vreinterpretq_u8_s8(vqsubq_s8(qs2, u9)), sign_bit);
    *q8r = veorq_u8(vreinterpretq_u8_s8(vqsubq_s8(qs1, u18)), sign_bit);
    *q7r = veorq_u8(vreinterpretq_u8_s8(vqsubq_s8(qs0_hev, u27)), sign_bit);
    *q6r = veorq_u8(vreinterpretq_u8_s8(vqaddq_s8(ps0_hev, u27)), sign_bit);
    *q5r = veorq_u8(vreinterpretq_u8_s8(vqaddq_s8(ps1, u18)), sign_bit);
    *q4r = veorq_u8(vreinterpretq_u8_s8(vqaddq_s8(ps2, u9)), sign_bit);
}

// Filters a horizontal macroblock edge across 16 adjacent pixels.
//
// src points at the first pixel of the row just below the edge and pitch is
// the byte distance between rows.  Eight rows of 16 bytes are read (four
// above src, four starting at src); the middle six are overwritten with the
// filtered result.  blimit, limit and thresh are broadcast to every lane.
void vp8_mbloop_filter_horizontal_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    const uint8x16_t qblimit = vdupq_n_u8(blimit);
    const uint8x16_t qlimit = vdupq_n_u8(limit);
    const uint8x16_t qthresh = vdupq_n_u8(thresh);
    // Multiply rather than shift: a left shift is undefined for negative
    // values, and the result is the same for non-negative pitch.
    unsigned char *const top = src - 4 * pitch;

    const uint8x16_t q3 = vld1q_u8(top);
    uint8x16_t q4 = vld1q_u8(top + pitch);
    uint8x16_t q5 = vld1q_u8(top + 2 * pitch);
    uint8x16_t q6 = vld1q_u8(top + 3 * pitch);
    uint8x16_t q7 = vld1q_u8(top + 4 * pitch);
    uint8x16_t q8 = vld1q_u8(top + 5 * pitch);
    uint8x16_t q9 = vld1q_u8(top + 6 * pitch);
    const uint8x16_t q10 = vld1q_u8(top + 7 * pitch);

    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                           q5, q6, q7, q8, q9, q10,
                           &q4, &q5, &q6, &q7, &q8, &q9);

    // The outermost rows (q3, q10) are inputs only and are not written back.
    vst1q_u8(top + pitch, q4);
    vst1q_u8(top + 2 * pitch, q5);
    vst1q_u8(top + 3 * pitch, q6);
    vst1q_u8(top + 4 * pitch, q7);
    vst1q_u8(top + 5 * pitch, q8);
    vst1q_u8(top + 6 * pitch, q9);
}

// Filters a horizontal macroblock edge in two 8-pixel-wide planes at once.
//
// u and v each point at the first pixel of the row just below the edge in
// their plane; both planes use the same pitch.  For every row the 8 bytes
// from u fill the low half of a vector and the 8 bytes from v the high half,
// so one call to the 16-lane filter handles both planes.  Eight rows are
// read per plane and the middle six are overwritten.
void vp8_mbloop_filter_horizontal_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    const uint8x16_t qblimit = vdupq_n_u8(blimit);
    const uint8x16_t qlimit = vdupq_n_u8(limit);
    const uint8x16_t qthresh = vdupq_n_u8(thresh);
    // Multiply rather than shift: a left shift is undefined for negative
    // values, and the result is the same for non-negative pitch.
    unsigned char *const u_top = u - 4 * pitch;
    unsigned char *const v_top = v - 4 * pitch;

    const uint8x16_t q3 = vcombine_u8(vld1_u8(u_top), vld1_u8(v_top));
    uint8x16_t q4 = vcombine_u8(vld1_u8(u_top + pitch),
                                vld1_u8(v_top + pitch));
    uint8x16_t q5 = vcombine_u8(vld1_u8(u_top + 2 * pitch),
                                vld1_u8(v_top + 2 * pitch));
    uint8x16_t q6 = vcombine_u8(vld1_u8(u_top + 3 * pitch),
                                vld1_u8(v_top + 3 * pitch));
    uint8x16_t q7 = vcombine_u8(vld1_u8(u_top + 4 * pitch),
                                vld1_u8(v_top + 4 * pitch));
    uint8x16_t q8 = vcombine_u8(vld1_u8(u_top + 5 * pitch),
                                vld1_u8(v_top + 5 * pitch));
    uint8x16_t q9 = vcombine_u8(vld1_u8(u_top + 6 * pitch),
                                vld1_u8(v_top + 6 * pitch));
    const uint8x16_t q10 = vcombine_u8(vld1_u8(u_top + 7 * pitch),
                                       vld1_u8(v_top + 7 * pitch));

    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                           q5, q6, q7, q8, q9, q10,
                           &q4, &q5, &q6, &q7, &q8, &q9);

    // Write back row by row, u half then v half; q3 and q10 are not stored.
    vst1_u8(u_top + pitch, vget_low_u8(q4));
    vst1_u8(v_top + pitch, vget_high_u8(q4));
    vst1_u8(u_top + 2 * pitch, vget_low_u8(q5));
    vst1_u8(v_top + 2 * pitch, vget_high_u8(q5));
    vst1_u8(u_top + 3 * pitch, vget_low_u8(q6));
    vst1_u8(v_top + 3 * pitch, vget_high_u8(q6));
    vst1_u8(u_top + 4 * pitch, vget_low_u8(q7));
    vst1_u8(v_top + 4 * pitch, vget_high_u8(q7));
    vst1_u8(u_top + 5 * pitch, vget_low_u8(q8));
    vst1_u8(v_top + 5 * pitch, vget_high_u8(q8));
    vst1_u8(u_top + 6 * pitch, vget_low_u8(q9));
    vst1_u8(v_top + 6 * pitch, vget_high_u8(q9));
}

// In-place transpose of eight 16-byte vectors, treating the low and the high
// 8-byte halves as two independent 8x8 byte matrices.  Done as three rounds
// of vtrn at 32-, 16- and 8-bit granularity.  All inputs are read before any
// output is written.
static INLINE void mbloop_transpose_y_neon(
        uint8x16_t *r0, uint8x16_t *r1, uint8x16_t *r2, uint8x16_t *r3,
        uint8x16_t *r4, uint8x16_t *r5, uint8x16_t *r6, uint8x16_t *r7) {
    const uint32x4x2_t w04 = vtrnq_u32(vreinterpretq_u32_u8(*r0),
                                       vreinterpretq_u32_u8(*r4));
    const uint32x4x2_t w15 = vtrnq_u32(vreinterpretq_u32_u8(*r1),
                                       vreinterpretq_u32_u8(*r5));
    const uint32x4x2_t w26 = vtrnq_u32(vreinterpretq_u32_u8(*r2),
                                       vreinterpretq_u32_u8(*r6));
    const uint32x4x2_t w37 = vtrnq_u32(vreinterpretq_u32_u8(*r3),
                                       vreinterpretq_u32_u8(*r7));

    const uint16x8x2_t h0 = vtrnq_u16(vreinterpretq_u16_u32(w04.val[0]),
                                      vreinterpretq_u16_u32(w26.val[0]));
    const uint16x8x2_t h1 = vtrnq_u16(vreinterpretq_u16_u32(w15.val[0]),
                                      vreinterpretq_u16_u32(w37.val[0]));
    const uint16x8x2_t h2 = vtrnq_u16(vreinterpretq_u16_u32(w04.val[1]),
                                      vreinterpretq_u16_u32(w26.val[1]));
    const uint16x8x2_t h3 = vtrnq_u16(vreinterpretq_u16_u32(w15.val[1]),
                                      vreinterpretq_u16_u32(w37.val[1]));

    const uint8x16x2_t b0 = vtrnq_u8(vreinterpretq_u8_u16(h0.val[0]),
                                     vreinterpretq_u8_u16(h1.val[0]));
    const uint8x16x2_t b1 = vtrnq_u8(vreinterpretq_u8_u16(h0.val[1]),
                                     vreinterpretq_u8_u16(h1.val[1]));
    const uint8x16x2_t b2 = vtrnq_u8(vreinterpretq_u8_u16(h2.val[0]),
                                     vreinterpretq_u8_u16(h3.val[0]));
    const uint8x16x2_t b3 = vtrnq_u8(vreinterpretq_u8_u16(h2.val[1]),
                                     vreinterpretq_u8_u16(h3.val[1]));

    *r0 = b0.val[0];
    *r1 = b0.val[1];
    *r2 = b1.val[0];
    *r3 = b1.val[1];
    *r4 = b2.val[0];
    *r5 = b2.val[1];
    *r6 = b3.val[0];
    *r7 = b3.val[1];
}

// Filters a vertical macroblock edge over 16 consecutive rows.
//
// src points at the first pixel to the right of the edge in the top row and
// pitch is the byte distance between rows.  From each of the 16 rows the
// 8 bytes starting at src - 4 are read: rows 0-7 fill the low vector halves,
// rows 8-15 the high halves.  The data is transposed so each vector holds
// one tap column, filtered, transposed back, and all 8 bytes of every row
// are stored again (the two outermost columns come back unchanged).
void vp8_mbloop_filter_vertical_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    const uint8x16_t qblimit = vdupq_n_u8(blimit);
    const uint8x16_t qlimit = vdupq_n_u8(limit);
    const uint8x16_t qthresh = vdupq_n_u8(thresh);
    unsigned char *const upper = src - 4;            // rows 0..7
    unsigned char *const lower = upper + 8 * pitch;  // rows 8..15

    uint8x16_t q3 = vcombine_u8(vld1_u8(upper), vld1_u8(lower));
    uint8x16_t q4 = vcombine_u8(vld1_u8(upper + pitch),
                                vld1_u8(lower + pitch));
    uint8x16_t q5 = vcombine_u8(vld1_u8(upper + 2 * pitch),
                                vld1_u8(lower + 2 * pitch));
    uint8x16_t q6 = vcombine_u8(vld1_u8(upper + 3 * pitch),
                                vld1_u8(lower + 3 * pitch));
    uint8x16_t q7 = vcombine_u8(vld1_u8(upper + 4 * pitch),
                                vld1_u8(lower + 4 * pitch));
    uint8x16_t q8 = vcombine_u8(vld1_u8(upper + 5 * pitch),
                                vld1_u8(lower + 5 * pitch));
    uint8x16_t q9 = vcombine_u8(vld1_u8(upper + 6 * pitch),
                                vld1_u8(lower + 6 * pitch));
    uint8x16_t q10 = vcombine_u8(vld1_u8(upper + 7 * pitch),
                                 vld1_u8(lower + 7 * pitch));

    // Rows -> tap columns.
    mbloop_transpose_y_neon(&q3, &q4, &q5, &q6, &q7, &q8, &q9, &q10);

    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                           q5, q6, q7, q8, q9, q10,
                           &q4, &q5, &q6, &q7, &q8, &q9);

    // Tap columns -> rows.
    mbloop_transpose_y_neon(&q3, &q4, &q5, &q6, &q7, &q8, &q9, &q10);

    vst1_u8(upper, vget_low_u8(q3));
    vst1_u8(lower, vget_high_u8(q3));
    vst1_u8(upper + pitch, vget_low_u8(q4));
    vst1_u8(lower + pitch, vget_high_u8(q4));
    vst1_u8(upper + 2 * pitch, vget_low_u8(q5));
    vst1_u8(lower + 2 * pitch, vget_high_u8(q5));
    vst1_u8(upper + 3 * pitch, vget_low_u8(q6));
    vst1_u8(lower + 3 * pitch, vget_high_u8(q6));
    vst1_u8(upper + 4 * pitch, vget_low_u8(q7));
    vst1_u8(lower + 4 * pitch, vget_high_u8(q7));
    vst1_u8(upper + 5 * pitch, vget_low_u8(q8));
    vst1_u8(lower + 5 * pitch, vget_high_u8(q8));
    vst1_u8(upper + 6 * pitch, vget_low_u8(q9));
    vst1_u8(lower + 6 * pitch, vget_high_u8(q9));
    vst1_u8(upper + 7 * pitch, vget_low_u8(q10));
    vst1_u8(lower + 7 * pitch, vget_high_u8(q10));
}

// In-place transpose of eight 16-byte vectors, treating the low and the high
// 8-byte halves as two independent 8x8 byte matrices.  Done as three rounds
// of vtrn at 32-, 16- and 8-bit granularity.  All inputs are read before any
// output is written.
static INLINE void mbloop_transpose_uv_neon(
        uint8x16_t *r0, uint8x16_t *r1, uint8x16_t *r2, uint8x16_t *r3,
        uint8x16_t *r4, uint8x16_t *r5, uint8x16_t *r6, uint8x16_t *r7) {
    const uint32x4x2_t w04 = vtrnq_u32(vreinterpretq_u32_u8(*r0),
                                       vreinterpretq_u32_u8(*r4));
    const uint32x4x2_t w15 = vtrnq_u32(vreinterpretq_u32_u8(*r1),
                                       vreinterpretq_u32_u8(*r5));
    const uint32x4x2_t w26 = vtrnq_u32(vreinterpretq_u32_u8(*r2),
                                       vreinterpretq_u32_u8(*r6));
    const uint32x4x2_t w37 = vtrnq_u32(vreinterpretq_u32_u8(*r3),
                                       vreinterpretq_u32_u8(*r7));

    const uint16x8x2_t h0 = vtrnq_u16(vreinterpretq_u16_u32(w04.val[0]),
                                      vreinterpretq_u16_u32(w26.val[0]));
    const uint16x8x2_t h1 = vtrnq_u16(vreinterpretq_u16_u32(w15.val[0]),
                                      vreinterpretq_u16_u32(w37.val[0]));
    const uint16x8x2_t h2 = vtrnq_u16(vreinterpretq_u16_u32(w04.val[1]),
                                      vreinterpretq_u16_u32(w26.val[1]));
    const uint16x8x2_t h3 = vtrnq_u16(vreinterpretq_u16_u32(w15.val[1]),
                                      vreinterpretq_u16_u32(w37.val[1]));

    const uint8x16x2_t b0 = vtrnq_u8(vreinterpretq_u8_u16(h0.val[0]),
                                     vreinterpretq_u8_u16(h1.val[0]));
    const uint8x16x2_t b1 = vtrnq_u8(vreinterpretq_u8_u16(h0.val[1]),
                                     vreinterpretq_u8_u16(h1.val[1]));
    const uint8x16x2_t b2 = vtrnq_u8(vreinterpretq_u8_u16(h2.val[0]),
                                     vreinterpretq_u8_u16(h3.val[0]));
    const uint8x16x2_t b3 = vtrnq_u8(vreinterpretq_u8_u16(h2.val[1]),
                                     vreinterpretq_u8_u16(h3.val[1]));

    *r0 = b0.val[0];
    *r1 = b0.val[1];
    *r2 = b1.val[0];
    *r3 = b1.val[1];
    *r4 = b2.val[0];
    *r5 = b2.val[1];
    *r6 = b3.val[0];
    *r7 = b3.val[1];
}

// Filters a vertical macroblock edge in two planes, 8 rows each.
//
// u and v point at the first pixel to the right of the edge in the top row
// of their plane; both planes use the same pitch.  From every row the 8 bytes
// starting 4 bytes before the pointer are read, u into the low vector halves
// and v into the high halves.  The data is transposed so each vector holds
// one tap column, filtered, transposed back, and stored: first all eight
// u rows, then all eight v rows.
void vp8_mbloop_filter_vertical_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    const uint8x16_t qblimit = vdupq_n_u8(blimit);
    const uint8x16_t qlimit = vdupq_n_u8(limit);
    const uint8x16_t qthresh = vdupq_n_u8(thresh);
    unsigned char *const u_left = u - 4;
    unsigned char *const v_left = v - 4;

    uint8x16_t q3 = vcombine_u8(vld1_u8(u_left), vld1_u8(v_left));
    uint8x16_t q4 = vcombine_u8(vld1_u8(u_left + pitch),
                                vld1_u8(v_left + pitch));
    uint8x16_t q5 = vcombine_u8(vld1_u8(u_left + 2 * pitch),
                                vld1_u8(v_left + 2 * pitch));
    uint8x16_t q6 = vcombine_u8(vld1_u8(u_left + 3 * pitch),
                                vld1_u8(v_left + 3 * pitch));
    uint8x16_t q7 = vcombine_u8(vld1_u8(u_left + 4 * pitch),
                                vld1_u8(v_left + 4 * pitch));
    uint8x16_t q8 = vcombine_u8(vld1_u8(u_left + 5 * pitch),
                                vld1_u8(v_left + 5 * pitch));
    uint8x16_t q9 = vcombine_u8(vld1_u8(u_left + 6 * pitch),
                                vld1_u8(v_left + 6 * pitch));
    uint8x16_t q10 = vcombine_u8(vld1_u8(u_left + 7 * pitch),
                                 vld1_u8(v_left + 7 * pitch));

    // Rows -> tap columns.
    mbloop_transpose_uv_neon(&q3, &q4, &q5, &q6, &q7, &q8, &q9, &q10);

    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                           q5, q6, q7, q8, q9, q10,
                           &q4, &q5, &q6, &q7, &q8, &q9);

    // Tap columns -> rows.
    mbloop_transpose_uv_neon(&q3, &q4, &q5, &q6, &q7, &q8, &q9, &q10);

    vst1_u8(u_left, vget_low_u8(q3));
    vst1_u8(u_left + pitch, vget_low_u8(q4));
    vst1_u8(u_left + 2 * pitch, vget_low_u8(q5));
    vst1_u8(u_left + 3 * pitch, vget_low_u8(q6));
    vst1_u8(u_left + 4 * pitch, vget_low_u8(q7));
    vst1_u8(u_left + 5 * pitch, vget_low_u8(q8));
    vst1_u8(u_left + 6 * pitch, vget_low_u8(q9));
    vst1_u8(u_left + 7 * pitch, vget_low_u8(q10));

    vst1_u8(v_left, vget_high_u8(q3));
    vst1_u8(v_left + pitch, vget_high_u8(q4));
    vst1_u8(v_left + 2 * pitch, vget_high_u8(q5));
    vst1_u8(v_left + 3 * pitch, vget_high_u8(q6));
    vst1_u8(v_left + 4 * pitch, vget_high_u8(q7));
    vst1_u8(v_left + 5 * pitch, vget_high_u8(q8));
    vst1_u8(v_left + 6 * pitch, vget_high_u8(q9));
    vst1_u8(v_left + 7 * pitch, vget_high_u8(q10));
}