1 // Copyright 2012 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // ARM NEON version of speed-critical encoding functions.
11 //
12 // adapted from libvpx (http://www.webmproject.org/code/)
13 
14 #include "./dsp.h"
15 
16 #if defined(WEBP_USE_NEON)
17 
18 #include <assert.h>
19 
20 #include "./neon.h"
21 #include "../enc/vp8i_enc.h"
22 
23 //------------------------------------------------------------------------------
24 // Transforms (Paragraph 14.4)
25 
26 // Inverse transform.
27 // This code is pretty much the same as TransformOne in the dec_neon.c, except
28 // for subtraction to *ref. See the comments there for algorithmic explanations.
29 
// Fixed-point multipliers of the inverse transform.
// kC1 is combined with an extra "+ x" in the code below, i.e. the value
// actually applied is x + ((x * kC1) >> 16).
// kC2 stores half of the nominal multiplier: it is applied through the
// doubling multiply vqdmulh, which restores the missing factor of two.
static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;
32 
// The intrinsics version works but is *slower* than the inlined-asm version
// below (with gcc-4.6), so it is disabled unless WEBP_USE_INTRINSICS is
// defined. With gcc-4.8, it is a little faster than the inlined assembly.
37 #if defined(WEBP_USE_INTRINSICS)
38 
39 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
ConvertU8ToS16(uint32x2_t v)40 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
41   return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
42 }
43 
44 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
45 // to the corresponding rows of 'dst'.
SaturateAndStore4x4(uint8_t * const dst,const int16x8_t dst01,const int16x8_t dst23)46 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
47                                             const int16x8_t dst01,
48                                             const int16x8_t dst23) {
49   // Unsigned saturate to 8b.
50   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
51   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
52 
53   // Store the results.
54   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
55   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
56   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
57   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
58 }
59 
Add4x4(const int16x8_t row01,const int16x8_t row23,const uint8_t * const ref,uint8_t * const dst)60 static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
61                                const uint8_t* const ref, uint8_t* const dst) {
62   uint32x2_t dst01 = vdup_n_u32(0);
63   uint32x2_t dst23 = vdup_n_u32(0);
64 
65   // Load the source pixels.
66   dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
67   dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
68   dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
69   dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
70 
71   {
72     // Convert to 16b.
73     const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
74     const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
75 
76     // Descale with rounding.
77     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
78     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
79     // Add the inverse transform.
80     SaturateAndStore4x4(dst, out01, out23);
81   }
82 }
83 
Transpose8x2(const int16x8_t in0,const int16x8_t in1,int16x8x2_t * const out)84 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
85                                      int16x8x2_t* const out) {
86   // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
87   // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
88   const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
89                                                   // b0 d0 b1 d1 b2 d2 ...
90   *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
91 }
92 
TransformPass(int16x8x2_t * const rows)93 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
94   // {rows} = in0 | in4
95   //          in8 | in12
96   // B1 = in4 | in12
97   const int16x8_t B1 =
98       vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
99   // C0 = kC1 * in4 | kC1 * in12
100   // C1 = kC2 * in4 | kC2 * in12
101   const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
102   const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
103   const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
104                                 vget_low_s16(rows->val[1]));   // in0 + in8
105   const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
106                                 vget_low_s16(rows->val[1]));   // in0 - in8
107   // c = kC2 * in4 - kC1 * in12
108   // d = kC1 * in4 + kC2 * in12
109   const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
110   const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
111   const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
112   const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
113   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
114   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
115   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
116   Transpose8x2(E0, E1, rows);
117 }
118 
ITransformOne(const uint8_t * ref,const int16_t * in,uint8_t * dst)119 static void ITransformOne(const uint8_t* ref,
120                           const int16_t* in, uint8_t* dst) {
121   int16x8x2_t rows;
122   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
123   TransformPass(&rows);
124   TransformPass(&rows);
125   Add4x4(rows.val[0], rows.val[1], ref, dst);
126 }
127 
128 #else
129 
ITransformOne(const uint8_t * ref,const int16_t * in,uint8_t * dst)130 static void ITransformOne(const uint8_t* ref,
131                           const int16_t* in, uint8_t* dst) {
132   const int kBPS = BPS;
133   const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
134 
135   __asm__ volatile (
136     "vld1.16         {q1, q2}, [%[in]]           \n"
137     "vld1.16         {d0}, [%[kC1C2]]            \n"
138 
139     // d2: in[0]
140     // d3: in[8]
141     // d4: in[4]
142     // d5: in[12]
143     "vswp            d3, d4                      \n"
144 
145     // q8 = {in[4], in[12]} * kC1 * 2 >> 16
146     // q9 = {in[4], in[12]} * kC2 >> 16
147     "vqdmulh.s16     q8, q2, d0[0]               \n"
148     "vqdmulh.s16     q9, q2, d0[1]               \n"
149 
150     // d22 = a = in[0] + in[8]
151     // d23 = b = in[0] - in[8]
152     "vqadd.s16       d22, d2, d3                 \n"
153     "vqsub.s16       d23, d2, d3                 \n"
154 
155     //  q8 = in[4]/[12] * kC1 >> 16
156     "vshr.s16        q8, q8, #1                  \n"
157 
158     // Add {in[4], in[12]} back after the multiplication.
159     "vqadd.s16       q8, q2, q8                  \n"
160 
161     // d20 = c = in[4]*kC2 - in[12]*kC1
162     // d21 = d = in[4]*kC1 + in[12]*kC2
163     "vqsub.s16       d20, d18, d17               \n"
164     "vqadd.s16       d21, d19, d16               \n"
165 
166     // d2 = tmp[0] = a + d
167     // d3 = tmp[1] = b + c
168     // d4 = tmp[2] = b - c
169     // d5 = tmp[3] = a - d
170     "vqadd.s16       d2, d22, d21                \n"
171     "vqadd.s16       d3, d23, d20                \n"
172     "vqsub.s16       d4, d23, d20                \n"
173     "vqsub.s16       d5, d22, d21                \n"
174 
175     "vzip.16         q1, q2                      \n"
176     "vzip.16         q1, q2                      \n"
177 
178     "vswp            d3, d4                      \n"
179 
180     // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
181     // q9 = {tmp[4], tmp[12]} * kC2 >> 16
182     "vqdmulh.s16     q8, q2, d0[0]               \n"
183     "vqdmulh.s16     q9, q2, d0[1]               \n"
184 
185     // d22 = a = tmp[0] + tmp[8]
186     // d23 = b = tmp[0] - tmp[8]
187     "vqadd.s16       d22, d2, d3                 \n"
188     "vqsub.s16       d23, d2, d3                 \n"
189 
190     "vshr.s16        q8, q8, #1                  \n"
191     "vqadd.s16       q8, q2, q8                  \n"
192 
193     // d20 = c = in[4]*kC2 - in[12]*kC1
194     // d21 = d = in[4]*kC1 + in[12]*kC2
195     "vqsub.s16       d20, d18, d17               \n"
196     "vqadd.s16       d21, d19, d16               \n"
197 
198     // d2 = tmp[0] = a + d
199     // d3 = tmp[1] = b + c
200     // d4 = tmp[2] = b - c
201     // d5 = tmp[3] = a - d
202     "vqadd.s16       d2, d22, d21                \n"
203     "vqadd.s16       d3, d23, d20                \n"
204     "vqsub.s16       d4, d23, d20                \n"
205     "vqsub.s16       d5, d22, d21                \n"
206 
207     "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
208     "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
209     "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
210     "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"
211 
212     "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"
213 
214     // (val) + 4 >> 3
215     "vrshr.s16       d2, d2, #3                  \n"
216     "vrshr.s16       d3, d3, #3                  \n"
217     "vrshr.s16       d4, d4, #3                  \n"
218     "vrshr.s16       d5, d5, #3                  \n"
219 
220     "vzip.16         q1, q2                      \n"
221     "vzip.16         q1, q2                      \n"
222 
223     // Must accumulate before saturating
224     "vmovl.u8        q8, d6                      \n"
225     "vmovl.u8        q9, d7                      \n"
226 
227     "vqadd.s16       q1, q1, q8                  \n"
228     "vqadd.s16       q2, q2, q9                  \n"
229 
230     "vqmovun.s16     d0, q1                      \n"
231     "vqmovun.s16     d1, q2                      \n"
232 
233     "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
234     "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
235     "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
236     "vst1.32         d1[1], [%[dst]]             \n"
237 
238     : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
239     : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
240     : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
241   );
242 }
243 
244 #endif    // WEBP_USE_INTRINSICS
245 
// Inverse-transforms one 4x4 block, or two horizontally adjacent ones when
// 'do_two' is non-zero. The second block uses the next 16 coefficients of
// 'in' and starts 4 pixels to the right in 'ref' and 'dst'.
static void ITransform(const uint8_t* ref,
                       const int16_t* in, uint8_t* dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (!do_two) return;
  ITransformOne(ref + 4, in + 16, dst + 4);
}
253 
254 // Load all 4x4 pixels into a single uint8x16_t variable.
Load4x4(const uint8_t * src)255 static uint8x16_t Load4x4(const uint8_t* src) {
256   uint32x4_t out = vdupq_n_u32(0);
257   out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
258   out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
259   out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
260   out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
261   return vreinterpretq_u8_u32(out);
262 }
263 
264 // Forward transform.
265 
266 #if defined(WEBP_USE_INTRINSICS)
267 
Transpose4x4_S16(const int16x4_t A,const int16x4_t B,const int16x4_t C,const int16x4_t D,int16x8_t * const out01,int16x8_t * const out32)268 static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
269                                          const int16x4_t C, const int16x4_t D,
270                                          int16x8_t* const out01,
271                                          int16x8_t* const out32) {
272   const int16x4x2_t AB = vtrn_s16(A, B);
273   const int16x4x2_t CD = vtrn_s16(C, D);
274   const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
275                                      vreinterpret_s32_s16(CD.val[0]));
276   const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
277                                      vreinterpret_s32_s16(CD.val[1]));
278   *out01 = vreinterpretq_s16_s64(
279       vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
280                    vreinterpret_s64_s32(tmp13.val[0])));
281   *out32 = vreinterpretq_s16_s64(
282       vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
283                    vreinterpret_s64_s32(tmp02.val[1])));
284 }
285 
DiffU8ToS16(const uint8x8_t a,const uint8x8_t b)286 static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
287                                          const uint8x8_t b) {
288   return vreinterpretq_s16_u16(vsubl_u8(a, b));
289 }
290 
FTransform(const uint8_t * src,const uint8_t * ref,int16_t * out)291 static void FTransform(const uint8_t* src, const uint8_t* ref,
292                        int16_t* out) {
293   int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
294   {
295     const uint8x16_t S0 = Load4x4(src);
296     const uint8x16_t R0 = Load4x4(ref);
297     const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
298     const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
299     const int16x4_t D0 = vget_low_s16(D0D1);
300     const int16x4_t D1 = vget_high_s16(D0D1);
301     const int16x4_t D2 = vget_low_s16(D2D3);
302     const int16x4_t D3 = vget_high_s16(D2D3);
303     Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
304   }
305   {    // 1rst pass
306     const int32x4_t kCst937 = vdupq_n_s32(937);
307     const int32x4_t kCst1812 = vdupq_n_s32(1812);
308     const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
309     const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
310     const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
311     const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
312                                     vget_high_s16(a0a1_2));
313     const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
314                                     vget_high_s16(a0a1_2));
315     const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
316     const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
317     const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
318     const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
319     const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
320     const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
321     Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
322   }
323   {    // 2nd pass
324     // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
325     const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
326     const int32x4_t kCst51000 = vdupq_n_s32(51000);
327     const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
328     const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
329     const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
330     const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
331     const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
332     const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
333     const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
334     const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
335     const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
336     const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
337     const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
338     const int16x4_t a3_eq_0 =
339         vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
340     const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
341     vst1_s16(out +  0, out0);
342     vst1_s16(out +  4, out1);
343     vst1_s16(out +  8, out2);
344     vst1_s16(out + 12, out3);
345   }
346 }
347 
348 #else
349 
// Constant tables for the inline-assembly forward transform, which is
// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm.

// 16-bit multipliers, each replicated over four lanes.
static const int16_t kCoeff16[8] = {
  5352, 5352, 5352, 5352,
  2217, 2217, 2217, 2217
};

// 32-bit rounding constants, each replicated over four lanes: the first two
// rows are used by the first pass, the last two by the second pass.
static const int32_t kCoeff32[16] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};
360 
FTransform(const uint8_t * src,const uint8_t * ref,int16_t * out)361 static void FTransform(const uint8_t* src, const uint8_t* ref,
362                        int16_t* out) {
363   const int kBPS = BPS;
364   const uint8_t* src_ptr = src;
365   const uint8_t* ref_ptr = ref;
366   const int16_t* coeff16 = kCoeff16;
367   const int32_t* coeff32 = kCoeff32;
368 
369   __asm__ volatile (
370     // load src into q4, q5 in high half
371     "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
372     "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
373     "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
374     "vld1.8 {d11}, [%[src_ptr]]               \n"
375 
376     // load ref into q6, q7 in high half
377     "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
378     "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
379     "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
380     "vld1.8 {d15}, [%[ref_ptr]]               \n"
381 
382     // Pack the high values in to q4 and q6
383     "vtrn.32     q4, q5                       \n"
384     "vtrn.32     q6, q7                       \n"
385 
386     // d[0-3] = src - ref
387     "vsubl.u8    q0, d8, d12                  \n"
388     "vsubl.u8    q1, d9, d13                  \n"
389 
390     // load coeff16 into q8(d16=5352, d17=2217)
391     "vld1.16     {q8}, [%[coeff16]]           \n"
392 
393     // load coeff32 high half into q9 = 1812, q10 = 937
394     "vld1.32     {q9, q10}, [%[coeff32]]!     \n"
395 
396     // load coeff32 low half into q11=12000, q12=51000
397     "vld1.32     {q11,q12}, [%[coeff32]]      \n"
398 
399     // part 1
400     // Transpose. Register dN is the same as dN in C
401     "vtrn.32         d0, d2                   \n"
402     "vtrn.32         d1, d3                   \n"
403     "vtrn.16         d0, d1                   \n"
404     "vtrn.16         d2, d3                   \n"
405 
406     "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
407     "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
408     "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
409     "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3
410 
411     "vadd.s16        d0, d4, d5               \n" // a0 + a1
412     "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
413     "vsub.s16        d2, d4, d5               \n" // a0 - a1
414     "vshl.s16        d2, d2, #3               \n" // (temp[2+i*4] = (a0-a1) << 3
415 
416     "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
417     "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
418     "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
419     "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352
420 
421     // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
422     // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
423     "vshrn.s32       d1, q9, #9               \n"
424     "vshrn.s32       d3, q10, #9              \n"
425 
426     // part 2
427     // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
428     "vtrn.32         d0, d2                   \n"
429     "vtrn.32         d1, d3                   \n"
430     "vtrn.16         d0, d1                   \n"
431     "vtrn.16         d2, d3                   \n"
432 
433     "vmov.s16        d26, #7                  \n"
434 
435     "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
436     "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
437     "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
438     "vadd.s16        d4, d4, d26              \n" // a1 + 7
439     "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]
440 
441     "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
442     "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7
443 
444     "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
445     "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000
446 
447     "vceq.s16        d4, d7, #0               \n"
448 
449     "vshr.s16        d0, d0, #4               \n"
450     "vshr.s16        d2, d2, #4               \n"
451 
452     "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
453     "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000
454 
455     "vmvn            d4, d4                   \n" // !(d1 == 0)
456     // op[4] = (c1*2217 + d1*5352 + 12000)>>16
457     "vshrn.s32       d1, q11, #16             \n"
458     // op[4] += (d1!=0)
459     "vsub.s16        d1, d1, d4               \n"
460     // op[12]= (d1*2217 - c1*5352 + 51000)>>16
461     "vshrn.s32       d3, q12, #16             \n"
462 
463     // set result to out array
464     "vst1.16         {q0, q1}, [%[out]]   \n"
465     : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
466       [coeff32] "+r"(coeff32)          // modified registers
467     : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
468       [out] "r"(out)                   // constants
469     : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
470       "q10", "q11", "q12", "q13"       // clobbered
471   );
472 }
473 
474 #endif
475 
476 #define LOAD_LANE_16b(VALUE, LANE) do {             \
477   (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
478   src += stride;                                    \
479 } while (0)
480 
FTransformWHT(const int16_t * src,int16_t * out)481 static void FTransformWHT(const int16_t* src, int16_t* out) {
482   const int stride = 16;
483   const int16x4_t zero = vdup_n_s16(0);
484   int32x4x4_t tmp0;
485   int16x4x4_t in;
486   INIT_VECTOR4(in, zero, zero, zero, zero);
487   LOAD_LANE_16b(in.val[0], 0);
488   LOAD_LANE_16b(in.val[1], 0);
489   LOAD_LANE_16b(in.val[2], 0);
490   LOAD_LANE_16b(in.val[3], 0);
491   LOAD_LANE_16b(in.val[0], 1);
492   LOAD_LANE_16b(in.val[1], 1);
493   LOAD_LANE_16b(in.val[2], 1);
494   LOAD_LANE_16b(in.val[3], 1);
495   LOAD_LANE_16b(in.val[0], 2);
496   LOAD_LANE_16b(in.val[1], 2);
497   LOAD_LANE_16b(in.val[2], 2);
498   LOAD_LANE_16b(in.val[3], 2);
499   LOAD_LANE_16b(in.val[0], 3);
500   LOAD_LANE_16b(in.val[1], 3);
501   LOAD_LANE_16b(in.val[2], 3);
502   LOAD_LANE_16b(in.val[3], 3);
503 
504   {
505     // a0 = in[0 * 16] + in[2 * 16]
506     // a1 = in[1 * 16] + in[3 * 16]
507     // a2 = in[1 * 16] - in[3 * 16]
508     // a3 = in[0 * 16] - in[2 * 16]
509     const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
510     const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
511     const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
512     const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
513     tmp0.val[0] = vaddq_s32(a0, a1);
514     tmp0.val[1] = vaddq_s32(a3, a2);
515     tmp0.val[2] = vsubq_s32(a3, a2);
516     tmp0.val[3] = vsubq_s32(a0, a1);
517   }
518   {
519     const int32x4x4_t tmp1 = Transpose4x4(tmp0);
520     // a0 = tmp[0 + i] + tmp[ 8 + i]
521     // a1 = tmp[4 + i] + tmp[12 + i]
522     // a2 = tmp[4 + i] - tmp[12 + i]
523     // a3 = tmp[0 + i] - tmp[ 8 + i]
524     const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
525     const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
526     const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
527     const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
528     const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
529     const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
530     const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
531     const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
532     const int16x4_t out0 = vmovn_s32(b0);
533     const int16x4_t out1 = vmovn_s32(b1);
534     const int16x4_t out2 = vmovn_s32(b2);
535     const int16x4_t out3 = vmovn_s32(b3);
536 
537     vst1_s16(out +  0, out0);
538     vst1_s16(out +  4, out1);
539     vst1_s16(out +  8, out2);
540     vst1_s16(out + 12, out3);
541   }
542 }
543 #undef LOAD_LANE_16b
544 
545 //------------------------------------------------------------------------------
546 // Texture distortion
547 //
548 // We try to match the spectral content (weighted) between source and
549 // reconstructed samples.
550 
551 // a 0123, b 0123
552 // a 4567, b 4567
553 // a 89ab, b 89ab
554 // a cdef, b cdef
555 //
556 // transpose
557 //
558 // a 048c, b 048c
559 // a 159d, b 159d
560 // a 26ae, b 26ae
561 // a 37bf, b 37bf
562 //
DistoTranspose4x4S16(int16x8x4_t q4_in)563 static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
564   const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
565   const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
566   const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
567                                         vreinterpretq_s32_s16(q2_tmp1.val[0]));
568   const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
569                                         vreinterpretq_s32_s16(q2_tmp1.val[1]));
570   q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
571   q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
572   q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
573   q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
574   return q4_in;
575 }
576 
DistoHorizontalPass(const int16x8x4_t q4_in)577 static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
578   // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
579   // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
580   const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
581   const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
582   const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
583   const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
584   int16x8x4_t q4_out;
585   // tmp[0] = a0 + a1
586   // tmp[1] = a3 + a2
587   // tmp[2] = a3 - a2
588   // tmp[3] = a0 - a1
589   INIT_VECTOR4(q4_out,
590                vabsq_s16(vaddq_s16(q_a0, q_a1)),
591                vabsq_s16(vaddq_s16(q_a3, q_a2)),
592                vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
593   return q4_out;
594 }
595 
DistoVerticalPass(const uint8x8x4_t q4_in)596 static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
597   const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
598                                                         q4_in.val[2]));
599   const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
600                                                         q4_in.val[3]));
601   const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
602                                                         q4_in.val[3]));
603   const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
604                                                         q4_in.val[2]));
605   int16x8x4_t q4_out;
606 
607   INIT_VECTOR4(q4_out,
608                vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
609                vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
610   return q4_out;
611 }
612 
DistoLoadW(const uint16_t * w)613 static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
614   const uint16x8_t q_w07 = vld1q_u16(&w[0]);
615   const uint16x8_t q_w8f = vld1q_u16(&w[8]);
616   int16x4x4_t d4_w;
617   INIT_VECTOR4(d4_w,
618                vget_low_s16(vreinterpretq_s16_u16(q_w07)),
619                vget_high_s16(vreinterpretq_s16_u16(q_w07)),
620                vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
621                vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
622   return d4_w;
623 }
624 
DistoSum(const int16x8x4_t q4_in,const int16x4x4_t d4_w)625 static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
626                                       const int16x4x4_t d4_w) {
627   int32x2_t d_sum;
628   // sum += w[ 0] * abs(b0);
629   // sum += w[ 4] * abs(b1);
630   // sum += w[ 8] * abs(b2);
631   // sum += w[12] * abs(b3);
632   int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
633   int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
634   int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
635   int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
636   q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
637   q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
638   q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
639   q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
640 
641   q_sum0 = vaddq_s32(q_sum0, q_sum1);
642   q_sum2 = vaddq_s32(q_sum2, q_sum3);
643   q_sum2 = vaddq_s32(q_sum0, q_sum2);
644   d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
645   d_sum = vpadd_s32(d_sum, d_sum);
646   return d_sum;
647 }
648 
649 #define LOAD_LANE_32b(src, VALUE, LANE) \
650     (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
651 
652 // Hadamard transform
653 // Returns the weighted sum of the absolute value of transformed coefficients.
654 // w[] contains a row-major 4 by 4 symmetric matrix.
Disto4x4(const uint8_t * const a,const uint8_t * const b,const uint16_t * const w)655 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
656                     const uint16_t* const w) {
657   uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
658   uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
659   uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
660   uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
661   uint8x8x4_t d4_in;
662 
663   // load data a, b
664   LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
665   LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
666   LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
667   LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
668   LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
669   LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
670   LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
671   LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
672   INIT_VECTOR4(d4_in,
673                vreinterpret_u8_u32(d_in_ab_0123),
674                vreinterpret_u8_u32(d_in_ab_4567),
675                vreinterpret_u8_u32(d_in_ab_89ab),
676                vreinterpret_u8_u32(d_in_ab_cdef));
677 
678   {
679     // Vertical pass first to avoid a transpose (vertical and horizontal passes
680     // are commutative because w/kWeightY is symmetric) and subsequent
681     // transpose.
682     const int16x8x4_t q4_v = DistoVerticalPass(d4_in);
683     const int16x4x4_t d4_w = DistoLoadW(w);
684     // horizontal pass
685     const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v);
686     const int16x8x4_t q4_h = DistoHorizontalPass(q4_t);
687     int32x2_t d_sum = DistoSum(q4_h, d4_w);
688 
689     // abs(sum2 - sum1) >> 5
690     d_sum = vabs_s32(d_sum);
691     d_sum = vshr_n_s32(d_sum, 5);
692     return vget_lane_s32(d_sum, 0);
693   }
694 }
695 #undef LOAD_LANE_32b
696 
// Accumulates Disto4x4() over the sixteen 4x4 sub-blocks of a 16x16 area,
// visiting them row by row. 'a' and 'b' are the two pixel blocks (rows are
// BPS bytes apart); 'w' is forwarded unchanged to every Disto4x4() call.
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
                      const uint16_t* const w) {
  int total = 0;
  int row, col;
  for (row = 0; row < 4; ++row) {
    const int row_offset = row * 4 * BPS;
    for (col = 0; col < 4; ++col) {
      const int offset = row_offset + col * 4;
      total += Disto4x4(a + offset, b + offset, w);
    }
  }
  return total;
}
708 
709 //------------------------------------------------------------------------------
710 
// Builds a histogram of transform-coefficient magnitudes for the blocks
// [start_block, end_block). For each block, the FTransform() output of
// 'ref' vs. 'pred' (at offset VP8DspScan[block]) is mapped to the bin
// min(abs(coeff) >> 3, MAX_COEFF_THRESH). The per-bin counts are then handed
// to VP8SetHistogramData() together with 'histo'.
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  int bin_counts[MAX_COEFF_THRESH + 1] = { 0 };
  const uint16x8_t bin_limit = vdupq_n_u16(MAX_COEFF_THRESH);
  int block;
  for (block = start_block; block < end_block; ++block) {
    int16_t coeffs[16];
    int i;
    FTransform(ref + VP8DspScan[block], pred + VP8DspScan[block], coeffs);
    {
      // abs() -> '>> 3' -> clamp, eight coefficients per vector. The bins
      // are written back over 'coeffs' so they can be read as scalars.
      const uint16x8_t abs_lo =
          vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(coeffs + 0)));
      const uint16x8_t abs_hi =
          vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(coeffs + 8)));
      const uint16x8_t bins_lo = vminq_u16(vshrq_n_u16(abs_lo, 3), bin_limit);
      const uint16x8_t bins_hi = vminq_u16(vshrq_n_u16(abs_hi, 3), bin_limit);
      vst1q_s16(coeffs + 0, vreinterpretq_s16_u16(bins_lo));
      vst1q_s16(coeffs + 8, vreinterpretq_s16_u16(bins_hi));
    }
    // Count one occurrence per coefficient in its bin.
    for (i = 0; i < 16; ++i) {
      ++bin_counts[coeffs[i]];
    }
  }
  VP8SetHistogramData(bin_counts, histo);
}
740 
741 //------------------------------------------------------------------------------
742 
// Adds to '*sum' the squared differences between the 16 bytes at 'a' and the
// 16 bytes at 'b'. The contribution is spread over the four 32-bit lanes of
// '*sum'; adding the lanes together yields the complete total.
static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
                                        const uint8_t* const b,
                                        uint32x4_t* const sum) {
  const uint8x16_t diff = vabdq_u8(vld1q_u8(a), vld1q_u8(b));
  const uint8x8_t diff_lo = vget_low_u8(diff);
  const uint8x8_t diff_hi = vget_high_u8(diff);
  // |a - b|^2, widened to 16 bits.
  const uint16x8_t sq_lo = vmull_u8(diff_lo, diff_lo);
  const uint16x8_t sq_hi = vmull_u8(diff_hi, diff_hi);
  // Pair-wise add the squares, widen to 32 bits and accumulate.
  *sum = vpadalq_u16(vpadalq_u16(*sum, sq_lo), sq_hi);
}
758 
// Returns the sum of the four uint32_t lanes of 'sum'. The lanes are added
// with 64-bit precision and the result is then converted to int.
static int SumToInt(uint32x4_t sum) {
  const uint64x2_t pairs = vpaddlq_u32(sum);
  const uint64x1_t total = vadd_u64(vget_low_u64(pairs), vget_high_u64(pairs));
  return (int)vget_lane_u64(total, 0);
}
765 
// Sum of squared differences between two blocks of 16 rows x 16 bytes, with
// consecutive rows BPS bytes apart in both 'a' and 'b'.
static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t acc = vdupq_n_u32(0);
  int offset = 0;
  int rows_left;
  for (rows_left = 16; rows_left > 0; --rows_left) {
    AccumulateSSE16(a + offset, b + offset, &acc);
    offset += BPS;
  }
  return SumToInt(acc);
}
774 
// Sum of squared differences between two blocks of 8 rows x 16 bytes, with
// consecutive rows BPS bytes apart in both 'a' and 'b'.
static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t acc = vdupq_n_u32(0);
  int row = 0;
  while (row < 8) {
    const int offset = row * BPS;
    AccumulateSSE16(a + offset, b + offset, &acc);
    ++row;
  }
  return SumToInt(acc);
}
783 
// Sum of squared differences between two blocks of 8 rows x 8 bytes, with
// consecutive rows BPS bytes apart in both 'a' and 'b'.
static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
  uint32x4_t acc = vdupq_n_u32(0);
  int row;
  for (row = 0; row < 8; ++row) {
    const int offset = row * BPS;
    const uint8x8_t diff = vabd_u8(vld1_u8(a + offset), vld1_u8(b + offset));
    // Square, pair-wise add with widening to 32 bits, then accumulate.
    acc = vaddq_u32(acc, vpaddlq_u16(vmull_u8(diff, diff)));
  }
  return SumToInt(acc);
}
796 
// Sum of squared differences between the 16 bytes that Load4x4() returns for
// 'a' and the 16 bytes it returns for 'b'.
static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
  const uint8x16_t diff = vabdq_u8(Load4x4(a), Load4x4(b));
  const uint8x8_t diff_lo = vget_low_u8(diff);
  const uint8x8_t diff_hi = vget_high_u8(diff);
  // |a - b|^2, widened to 16 bits.
  const uint16x8_t sq_lo = vmull_u8(diff_lo, diff_lo);
  const uint16x8_t sq_hi = vmull_u8(diff_hi, diff_hi);
  // Pair-wise add both halves into 32-bit lanes before the horizontal sum.
  return SumToInt(vpadalq_u16(vpaddlq_u16(sq_lo), sq_hi));
}
810 
811 //------------------------------------------------------------------------------
812 
813 // Compilation with gcc-4.6.x is problematic for now.
814 #if !defined(WORK_AROUND_GCC)
815 
// Quantizes the eight coefficients in[offset .. offset + 7] with the tables
// of 'mtx':
//   level = min(((abs(in) + sharpen) * iq + bias) >> QFIX, MAX_LEVEL)
// and re-applies the sign of 'in'. The input coefficients are overwritten
// with level * q, and the signed levels are returned.
// The shift is split as '>> 1' then '>> 16', hence the QFIX == 17 assertion.
static int16x8_t Quantize(int16_t* const in,
                          const VP8Matrix* const mtx, int offset) {
  const int16x8_t coeffs = vld1q_s16(in + offset);
  // All-ones lanes for negative coefficients, zero lanes otherwise.
  const int16x8_t sign_mask = vshrq_n_s16(coeffs, 15);
  const uint16x8_t magnitude = vreinterpretq_u16_s16(vabsq_s16(coeffs));
  const uint16x8_t sharpened =
      vaddq_u16(magnitude, vld1q_u16(&mtx->sharpen_[offset]));
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
  // 32-bit products coeff * iQ for the low and high four lanes.
  const uint32x4_t prod_lo =
      vmull_u16(vget_low_u16(sharpened), vget_low_u16(iq));
  const uint32x4_t prod_hi =
      vmull_u16(vget_high_u16(sharpened), vget_high_u16(iq));
  // Halving add: (coeff * iQ + bias) >> 1.
  const uint32x4_t half_lo =
      vhaddq_u32(prod_lo, vld1q_u32(&mtx->bias_[offset + 0]));
  const uint32x4_t half_hi =
      vhaddq_u32(prod_hi, vld1q_u32(&mtx->bias_[offset + 4]));
  // The narrowing '>> 16' completes the shift by QFIX (17 = 16 + 1).
  const uint16x8_t shifted = vcombine_u16(vshrn_n_u32(half_lo, 16),
                                          vshrn_n_u32(half_hi, 16));
  const uint16x8_t clamped = vminq_u16(shifted, vdupq_n_u16(MAX_LEVEL));
  // (x ^ sign) - sign negates x in the lanes where 'sign' is all-ones.
  const int16x8_t level =
      vsubq_s16(veorq_s16(vreinterpretq_s16_u16(clamped), sign_mask),
                sign_mask);
  const int16x8_t scaled =
      vmulq_s16(level, vreinterpretq_s16_u16(vld1q_u16(&mtx->q_[offset])));
  vst1q_s16(in + offset, scaled);
  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
  return level;
}
842 
// Byte-level table-lookup indices used by QuantizeBlock() to reorder the 16
// quantized int16_t levels (32 bytes) into zigzag order. Each row produces
// four output coefficients; COEFF_BYTES(i) expands to the two byte indices
// occupied by source coefficient 'i'.
#define COEFF_BYTES(i) (2 * (i)), (2 * (i) + 1)
static const uint8_t kShuffles[4][8] = {
  { COEFF_BYTES(0), COEFF_BYTES(1),  COEFF_BYTES(4),  COEFF_BYTES(8)  },
  { COEFF_BYTES(5), COEFF_BYTES(2),  COEFF_BYTES(3),  COEFF_BYTES(6)  },
  { COEFF_BYTES(9), COEFF_BYTES(12), COEFF_BYTES(13), COEFF_BYTES(10) },
  { COEFF_BYTES(7), COEFF_BYTES(11), COEFF_BYTES(14), COEFF_BYTES(15) }
};
#undef COEFF_BYTES
849 
// Quantizes the 16 coefficients of 'in' with 'mtx' (two Quantize() calls of
// eight coefficients each, which also overwrite 'in') and stores the
// resulting levels to 'out' reordered through the kShuffles tables.
// Returns 1 if at least one stored level is non-zero, 0 otherwise.
static int QuantizeBlock(int16_t in[16], int16_t out[16],
                         const VP8Matrix* const mtx) {
  const int16x8_t out0 = Quantize(in, mtx, 0);
  const int16x8_t out1 = Quantize(in, mtx, 8);
  uint8x8x4_t shuffles;
  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
  // non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \
    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
  uint8x16x2_t all_out;
  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
  INIT_VECTOR4(shuffles,
               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
#else
  uint8x8x4_t all_out;
  INIT_VECTOR4(all_out,
               vreinterpret_u8_s16(vget_low_s16(out0)),
               vreinterpret_u8_s16(vget_high_s16(out0)),
               vreinterpret_u8_s16(vget_low_s16(out1)),
               vreinterpret_u8_s16(vget_high_s16(out1)));
  INIT_VECTOR4(shuffles,
               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
#endif
  // Zigzag reordering
  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
  // Test for non-zero levels on the vectors that were just stored. Reading
  // 'out' back through a uint64_t* would break the strict-aliasing rule and
  // rely on an 8-byte alignment that an int16_t array does not guarantee.
  {
    const uint8x8_t any01 = vorr_u8(shuffles.val[0], shuffles.val[1]);
    const uint8x8_t any23 = vorr_u8(shuffles.val[2], shuffles.val[3]);
    const uint64x1_t any = vreinterpret_u64_u8(vorr_u8(any01, any23));
    return (vget_lane_u64(any, 0) != 0);
  }
}
891 
// Runs QuantizeBlock() on two consecutive 16-coefficient blocks of 'in' /
// 'out', first block first. The result packs both return values: bit 0 holds
// the value for the first block, bit 1 the value for the second.
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
                           const VP8Matrix* const mtx) {
  const int nz_first = QuantizeBlock(in, out, mtx);
  const int nz_second = QuantizeBlock(in + 16, out + 16, mtx);
  return nz_first | (nz_second << 1);
}
899 
900 #endif   // !WORK_AROUND_GCC
901 
902 //------------------------------------------------------------------------------
903 // Entry point
904 
extern void VP8EncDspInitNEON(void);

// Points the encoder's DSP function pointers at the NEON implementations
// defined in this file. The two quantization pointers are only set when
// WORK_AROUND_GCC is not defined, because QuantizeBlock() and
// Quantize2Blocks() are compiled out otherwise.
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  // Transforms.
  VP8FTransform = FTransform;
  VP8FTransformWHT = FTransformWHT;
  VP8ITransform = ITransform;

  // Sums of squared errors.
  VP8SSE4x4 = SSE4x4_NEON;
  VP8SSE8x8 = SSE8x8_NEON;
  VP8SSE16x8 = SSE16x8_NEON;
  VP8SSE16x16 = SSE16x16_NEON;

  // Distortion and histogram collection.
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
  VP8CollectHistogram = CollectHistogram;

#if !defined(WORK_AROUND_GCC)
  // Quantization.
  VP8EncQuantize2Blocks = Quantize2Blocks;
  VP8EncQuantizeBlock = QuantizeBlock;
#endif
}
927 
928 #else  // !WEBP_USE_NEON
929 
// Built without WEBP_USE_NEON: none of the NEON code above is compiled.
// NOTE(review): WEBP_DSP_INIT_STUB presumably expands to an empty
// VP8EncDspInitNEON() definition so that callers still link -- confirm
// against its definition in dsp.h.
WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
931 
932 #endif  // WEBP_USE_NEON
933