/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H

namespace android {

// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h

#if USE_NEON

// use intrinsics if inline arm32 assembly is not possible
#if !USE_INLINE_ASSEMBLY
#define USE_INTRINSIC
#endif

// following intrinsics available only on ARM 64 bit ACLE
#ifndef __aarch64__
#undef vld1q_f32_x2
#undef vld1q_s32_x2
#endif

#define TO_STRING2(x) #x
#define TO_STRING(x) TO_STRING2(x)
// uncomment to print GCC version, may be relevant for intrinsic optimizations
/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
        "." TO_STRING(__GNUC_MINOR__) \
        "." TO_STRING(__GNUC_PATCHLEVEL__)) */

//
// NEON specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h.
//
// Two variants are presented here:
// ARM NEON inline assembly, which appears up to 10-15% faster than intrinsics (gcc 4.9) for arm32.
// ARM NEON intrinsics, which can also be used by arm64 and x86/x86-64 with a NEON compatibility header.
//
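
// Illustrative scalar sketch (reference only; not used by the resampler, and the helper name
// below is made up for this comment-level example): roughly what one call to the mono,
// fixed-coefficient kernels in this file computes. The NEON versions keep four partial sums per
// vector and saturate the volume multiply and output accumulation (vqrdmulh/vqadd); saturation is
// elided here, and the production scalar path lives in AudioResamplerFirProcess.h.
static inline void ProcessScalarReferenceSketch(int32_t* out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* volumeLR)
{
    int32_t accum = 0;
    for (int i = 0; i < count; ++i) {
        accum += int32_t(sP[-i]) * coefsP[i]    // positive side: samples run backwards in memory
               + int32_t(sN[i]) * coefsN[i];    // negative side: samples run forwards
    }
    for (int ch = 0; ch < 2; ++ch) {            // a mono accumulator feeds both L and R outputs
        // Q31 rounding-doubling high multiply by volume, then accumulate into the output frame
        out[ch] += int32_t(((int64_t)accum * volumeLR[ch] * 2 + (1LL << 31)) >> 32);
    }
}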

// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
// These are only used for inline assembly.
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0 */\
        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4 */\
        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(int32_t* out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* volumeLR,
        uint32_t lerpP,
        const int16_t* coefsP1,
        const int16_t* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const int16_t*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const int16_t*)__builtin_assume_aligned(coefsN, 16);

    int16x4_t interp;
    if (!FIXED) {
        interp = vdup_n_s16(lerpP);
        //interp = (int16x4_t)vset_lane_s32 ((int32x2_t)lerpP, interp, 0);
        coefsP1 = (const int16_t*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const int16_t*)__builtin_assume_aligned(coefsN1, 16);
    }
    int32x4_t accum, accum2;
    // warning uninitialized if we use veorq_s32
    // (alternative to below) accum = veorq_s32(accum, accum);
    accum = vdupq_n_s32(0);
    if (CHANNELS == 2) {
        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
        accum2 = vdupq_n_s32(0);
    }
    do {
        int16x8_t posCoef = vld1q_s16(coefsP);
        coefsP += 8;
        int16x8_t negCoef = vld1q_s16(coefsN);
        coefsN += 8;
        if (!FIXED) { // interpolate
            int16x8_t posCoef1 = vld1q_s16(coefsP1);
            coefsP1 += 8;
            int16x8_t negCoef1 = vld1q_s16(coefsN1);
            coefsN1 += 8;

            posCoef1 = vsubq_s16(posCoef1, posCoef);
            negCoef = vsubq_s16(negCoef, negCoef1);

            posCoef1 = vqrdmulhq_lane_s16(posCoef1, interp, 0);
            negCoef = vqrdmulhq_lane_s16(negCoef, interp, 0);

            posCoef = vaddq_s16(posCoef, posCoef1);
            negCoef = vaddq_s16(negCoef, negCoef1);
        }
        switch (CHANNELS) {
        case 1: {
            int16x8_t posSamp = vld1q_s16(sP);
            int16x8_t negSamp = vld1q_s16(sN);
            sN += 8;
            posSamp = vrev64q_s16(posSamp);

            // dot product
            accum = vmlal_s16(accum, vget_low_s16(posSamp), vget_high_s16(posCoef)); // reversed
            accum = vmlal_s16(accum, vget_high_s16(posSamp), vget_low_s16(posCoef)); // reversed
            accum = vmlal_s16(accum, vget_low_s16(negSamp), vget_low_s16(negCoef));
            accum = vmlal_s16(accum, vget_high_s16(negSamp), vget_high_s16(negCoef));
            sP -= 8;
        } break;
        case 2: {
            int16x8x2_t posSamp = vld2q_s16(sP);
            int16x8x2_t negSamp = vld2q_s16(sN);
            sN += 16;
            posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
            posSamp.val[1] = vrev64q_s16(posSamp.val[1]);

            // dot product
            accum = vmlal_s16(accum, vget_low_s16(posSamp.val[0]), vget_high_s16(posCoef)); // r
            accum = vmlal_s16(accum, vget_high_s16(posSamp.val[0]), vget_low_s16(posCoef)); // r
            accum2 = vmlal_s16(accum2, vget_low_s16(posSamp.val[1]), vget_high_s16(posCoef)); // r
            accum2 = vmlal_s16(accum2, vget_high_s16(posSamp.val[1]), vget_low_s16(posCoef)); // r
            accum = vmlal_s16(accum, vget_low_s16(negSamp.val[0]), vget_low_s16(negCoef));
            accum = vmlal_s16(accum, vget_high_s16(negSamp.val[0]), vget_high_s16(negCoef));
            accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef));
            accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef));
            sP -= 16;
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
    int32x2_t vLR = vld1_s32(volumeLR);
    int32x2_t outSamp = vld1_s32(out);
    // combine and funnel down accumulator
    int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_s32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        outAccum = vpadd_s32(outAccum, outAccum2);
    }
    outAccum = vqrdmulh_s32(outAccum, vLR);
    outSamp = vqadd_s32(outSamp, outAccum);
    vst1_s32(out, outSamp);
}

template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(int32_t* out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* volumeLR,
        uint32_t lerpP,
        const int32_t* coefsP1,
        const int32_t* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);

    int32x2_t interp;
    if (!FIXED) {
        interp = vdup_n_s32(lerpP);
        coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
    }
    int32x4_t accum, accum2;
    // warning uninitialized if we use veorq_s32
    // (alternative to below) accum = veorq_s32(accum, accum);
    accum = vdupq_n_s32(0);
    if (CHANNELS == 2) {
        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
        accum2 = vdupq_n_s32(0);
    }
    do {
#ifdef vld1q_s32_x2
        int32x4x2_t posCoef = vld1q_s32_x2(coefsP);
        coefsP += 8;
        int32x4x2_t negCoef = vld1q_s32_x2(coefsN);
        coefsN += 8;
#else
        int32x4x2_t posCoef;
        posCoef.val[0] = vld1q_s32(coefsP);
        coefsP += 4;
        posCoef.val[1] = vld1q_s32(coefsP);
        coefsP += 4;
        int32x4x2_t negCoef;
        negCoef.val[0] = vld1q_s32(coefsN);
        coefsN += 4;
        negCoef.val[1] = vld1q_s32(coefsN);
        coefsN += 4;
#endif
        if (!FIXED) { // interpolate
#ifdef vld1q_s32_x2
            int32x4x2_t posCoef1 = vld1q_s32_x2(coefsP1);
            coefsP1 += 8;
            int32x4x2_t negCoef1 = vld1q_s32_x2(coefsN1);
            coefsN1 += 8;
#else
            int32x4x2_t posCoef1;
            posCoef1.val[0] = vld1q_s32(coefsP1);
            coefsP1 += 4;
            posCoef1.val[1] = vld1q_s32(coefsP1);
            coefsP1 += 4;
            int32x4x2_t negCoef1;
            negCoef1.val[0] = vld1q_s32(coefsN1);
            coefsN1 += 4;
            negCoef1.val[1] = vld1q_s32(coefsN1);
            coefsN1 += 4;
#endif

            posCoef1.val[0] = vsubq_s32(posCoef1.val[0], posCoef.val[0]);
            posCoef1.val[1] = vsubq_s32(posCoef1.val[1], posCoef.val[1]);
            negCoef.val[0] = vsubq_s32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vsubq_s32(negCoef.val[1], negCoef1.val[1]);

            posCoef1.val[0] = vqrdmulhq_lane_s32(posCoef1.val[0], interp, 0);
            posCoef1.val[1] = vqrdmulhq_lane_s32(posCoef1.val[1], interp, 0);
            negCoef.val[0] = vqrdmulhq_lane_s32(negCoef.val[0], interp, 0);
            negCoef.val[1] = vqrdmulhq_lane_s32(negCoef.val[1], interp, 0);

            posCoef.val[0] = vaddq_s32(posCoef.val[0], posCoef1.val[0]);
            posCoef.val[1] = vaddq_s32(posCoef.val[1], posCoef1.val[1]);
            negCoef.val[0] = vaddq_s32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vaddq_s32(negCoef.val[1], negCoef1.val[1]);
        }
        switch (CHANNELS) {
        case 1: {
            int16x8_t posSamp = vld1q_s16(sP);
            int16x8_t negSamp = vld1q_s16(sN);
            sN += 8;
            posSamp = vrev64q_s16(posSamp);

            int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp), 15);
            int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp), 15);
            int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp), 15);
            int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum = vaddq_s32(accum, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum = vaddq_s32(accum, posSamp1);
            accum = vaddq_s32(accum, negSamp0);

            sP -= 8;
        } break;
        case 2: {
            int16x8x2_t posSamp = vld2q_s16(sP);
            int16x8x2_t negSamp = vld2q_s16(sN);
            sN += 16;
            posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
            posSamp.val[1] = vrev64q_s16(posSamp.val[1]);

            // left
            int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[0]), 15);
            int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[0]), 15);
            int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[0]), 15);
            int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[0]), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum = vaddq_s32(accum, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum = vaddq_s32(accum, posSamp1);
            accum = vaddq_s32(accum, negSamp0);

            // right
            posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[1]), 15);
            posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[1]), 15);
            negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[1]), 15);
            negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[1]), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum2 = vaddq_s32(accum2, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum2 = vaddq_s32(accum2, posSamp1);
            accum2 = vaddq_s32(accum2, negSamp0);

            sP -= 16;
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
    int32x2_t vLR = vld1_s32(volumeLR);
    int32x2_t outSamp = vld1_s32(out);
    // combine and funnel down accumulator
    int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_s32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        outAccum = vpadd_s32(outAccum, outAccum2);
    }
    outAccum = vqrdmulh_s32(outAccum, vLR);
    outSamp = vqadd_s32(outSamp, outAccum);
    vst1_s32(out, outSamp);
}

template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(float* out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* volumeLR,
        float lerpP,
        const float* coefsP1,
        const float* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const float*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const float*)__builtin_assume_aligned(coefsN, 16);

    float32x2_t interp;
    if (!FIXED) {
        interp = vdup_n_f32(lerpP);
        coefsP1 = (const float*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const float*)__builtin_assume_aligned(coefsN1, 16);
    }
    float32x4_t accum, accum2;
    // warning uninitialized if we use veorq_s32
    // (alternative to below) accum = veorq_s32(accum, accum);
    accum = vdupq_n_f32(0);
    if (CHANNELS == 2) {
        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
        accum2 = vdupq_n_f32(0);
    }
    do {
#ifdef vld1q_f32_x2
        float32x4x2_t posCoef = vld1q_f32_x2(coefsP);
        coefsP += 8;
        float32x4x2_t negCoef = vld1q_f32_x2(coefsN);
        coefsN += 8;
#else
        float32x4x2_t posCoef;
        posCoef.val[0] = vld1q_f32(coefsP);
        coefsP += 4;
        posCoef.val[1] = vld1q_f32(coefsP);
        coefsP += 4;
        float32x4x2_t negCoef;
        negCoef.val[0] = vld1q_f32(coefsN);
        coefsN += 4;
        negCoef.val[1] = vld1q_f32(coefsN);
        coefsN += 4;
#endif
        if (!FIXED) { // interpolate
#ifdef vld1q_f32_x2
            float32x4x2_t posCoef1 = vld1q_f32_x2(coefsP1);
            coefsP1 += 8;
            float32x4x2_t negCoef1 = vld1q_f32_x2(coefsN1);
            coefsN1 += 8;
#else
            float32x4x2_t posCoef1;
            posCoef1.val[0] = vld1q_f32(coefsP1);
            coefsP1 += 4;
            posCoef1.val[1] = vld1q_f32(coefsP1);
            coefsP1 += 4;
            float32x4x2_t negCoef1;
            negCoef1.val[0] = vld1q_f32(coefsN1);
            coefsN1 += 4;
            negCoef1.val[1] = vld1q_f32(coefsN1);
            coefsN1 += 4;
#endif
            posCoef1.val[0] = vsubq_f32(posCoef1.val[0], posCoef.val[0]);
            posCoef1.val[1] = vsubq_f32(posCoef1.val[1], posCoef.val[1]);
            negCoef.val[0] = vsubq_f32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vsubq_f32(negCoef.val[1], negCoef1.val[1]);

            posCoef.val[0] = vmlaq_lane_f32(posCoef.val[0], posCoef1.val[0], interp, 0);
            posCoef.val[1] = vmlaq_lane_f32(posCoef.val[1], posCoef1.val[1], interp, 0);
            negCoef.val[0] = vmlaq_lane_f32(negCoef1.val[0], negCoef.val[0], interp, 0); // rev
            negCoef.val[1] = vmlaq_lane_f32(negCoef1.val[1], negCoef.val[1], interp, 0); // rev
        }
        switch (CHANNELS) {
        case 1: {
#ifdef vld1q_f32_x2
            float32x4x2_t posSamp = vld1q_f32_x2(sP);
            float32x4x2_t negSamp = vld1q_f32_x2(sN);
            sN += 8;
            sP -= 8;
#else
            float32x4x2_t posSamp;
            posSamp.val[0] = vld1q_f32(sP);
            sP += 4;
            posSamp.val[1] = vld1q_f32(sP);
            sP -= 12;
            float32x4x2_t negSamp;
            negSamp.val[0] = vld1q_f32(sN);
            sN += 4;
            negSamp.val[1] = vld1q_f32(sN);
            sN += 4;
#endif
            // effectively we want a vrev128q_f32()
            posSamp.val[0] = vrev64q_f32(posSamp.val[0]);
            posSamp.val[1] = vrev64q_f32(posSamp.val[1]);
            posSamp.val[0] = vcombine_f32(
                    vget_high_f32(posSamp.val[0]), vget_low_f32(posSamp.val[0]));
            posSamp.val[1] = vcombine_f32(
                    vget_high_f32(posSamp.val[1]), vget_low_f32(posSamp.val[1]));

            accum = vmlaq_f32(accum, posSamp.val[0], posCoef.val[1]);
            accum = vmlaq_f32(accum, posSamp.val[1], posCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp.val[0], negCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp.val[1], negCoef.val[1]);
        } break;
        case 2: {
            float32x4x2_t posSamp0 = vld2q_f32(sP);
            sP += 8;
            float32x4x2_t negSamp0 = vld2q_f32(sN);
            sN += 8;
            posSamp0.val[0] = vrev64q_f32(posSamp0.val[0]);
            posSamp0.val[1] = vrev64q_f32(posSamp0.val[1]);
            posSamp0.val[0] = vcombine_f32(
                    vget_high_f32(posSamp0.val[0]), vget_low_f32(posSamp0.val[0]));
            posSamp0.val[1] = vcombine_f32(
                    vget_high_f32(posSamp0.val[1]), vget_low_f32(posSamp0.val[1]));

            float32x4x2_t posSamp1 = vld2q_f32(sP);
            sP -= 24;
            float32x4x2_t negSamp1 = vld2q_f32(sN);
            sN += 8;
            posSamp1.val[0] = vrev64q_f32(posSamp1.val[0]);
            posSamp1.val[1] = vrev64q_f32(posSamp1.val[1]);
            posSamp1.val[0] = vcombine_f32(
                    vget_high_f32(posSamp1.val[0]), vget_low_f32(posSamp1.val[0]));
            posSamp1.val[1] = vcombine_f32(
                    vget_high_f32(posSamp1.val[1]), vget_low_f32(posSamp1.val[1]));

            // Note: speed is affected by accumulation order.
            // Also, speed appears slower using vmul/vadd instead of vmla for
            // stereo case, comparable for mono.

            accum = vmlaq_f32(accum, negSamp0.val[0], negCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp1.val[0], negCoef.val[1]);
            accum2 = vmlaq_f32(accum2, negSamp0.val[1], negCoef.val[0]);
            accum2 = vmlaq_f32(accum2, negSamp1.val[1], negCoef.val[1]);

            accum = vmlaq_f32(accum, posSamp0.val[0], posCoef.val[1]); // reversed
            accum = vmlaq_f32(accum, posSamp1.val[0], posCoef.val[0]); // reversed
            accum2 = vmlaq_f32(accum2, posSamp0.val[1], posCoef.val[1]); // reversed
            accum2 = vmlaq_f32(accum2, posSamp1.val[1], posCoef.val[0]); // reversed
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const float*)__builtin_assume_aligned(volumeLR, 8);
    float32x2_t vLR = vld1_f32(volumeLR);
    float32x2_t outSamp = vld1_f32(out);
    // combine and funnel down accumulator
    float32x2_t outAccum = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_f32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        float32x2_t outAccum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
        outAccum = vpadd_f32(outAccum, outAccum2);
    }
    outSamp = vmla_f32(outSamp, outAccum, vLR);
    vst1_f32(out, outSamp);
}

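// Illustrative usage sketch (the variable names are placeholders, not actual call sites): the
// specializations below override the generic Process()/ProcessL() templates declared in
// AudioResamplerFirProcess.h and are invoked the same way, e.g.:
//
//     int32_t out[2] = {0, 0};  // interleaved L/R output accumulator for one frame
//     ProcessL<2, 16>(out, 16, coefsP, coefsN, sP, sN, volumeLR);                 // fixed phase
//     Process<2, 16>(out, 16, coefsP, coefsN, coefsP1, coefsN1, sP, sN, lerpP, volumeLR);
//
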
template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed)samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed)samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

         ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q10"
    );
#endif
}

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse positive right

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM instructions before the NEON ones seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q10"
     );
#endif
}

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else

    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11"
    );
#endif
}

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
        "vrev64.16      q3, q3                   \n"// (1) reverse 8 samples of positive right

        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM instructions before the NEON ones seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11"
    );
#endif
}

template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
        "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// accumulate result
        "vadd.s32       q4, q4, q13                   \n"// accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
        "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// accumulate result
        "vadd.s32       q4, q4, q13                   \n"// accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template<>
inline void ProcessL<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
}

template<>
inline void ProcessL<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
}

template<>
inline void Process<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
}

template<>
inline void Process<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
}

#endif //USE_NEON

} // namespace android

#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/
1215