/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H

namespace android {

// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h

#if USE_NEON
//
// NEON specializations are enabled for Process() and ProcessL()
//
// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary)
// and looping stride 16 (or vice versa). This has some polyphase coef data alignment
// issues with S16 coefs. Consider this later.
// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0 */\
        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4 */\
        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */
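
// Rough scalar equivalent of the accumulate macros above (an illustrative
// sketch only, not part of the build; hsum(), mulhr() and sat_add() are ad hoc
// names for the vpadd reduction, the doubling/rounding high-half multiply of
// vqrdmulh, and the saturating add of vqadd):
//
//     int32_t sumL = hsum(q0);                  // reduce the 4 partial sums
//     int32_t sumR = stereo ? hsum(q4) : sumL;  // mono replicates L into R
//     out[0] = sat_add(out[0], mulhr(sumL, volumeLR[0]));
//     out[1] = sat_add(out[1], mulhr(sumR, volumeLR[1]));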

template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed) samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q10"
    );
}

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse right positive

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q10"
    );
}
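
// Note on the stereo variants: vld2.16 de-interleaves the sample stream, so
// after the loads above q2 (d4, d5) holds the left-channel positive samples
// and q3 (d6, d7) the right-channel ones, with q5/q6 playing the same roles
// for the negative side.  This lets the left accumulator q0 and the right
// accumulator q4 each consume whole D registers in the vmlal steps.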

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11"
    );
}
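
// Coefficient interpolation in the Process() specializations (a hedged
// reading of the code): lerpP carries the inter-phase fraction, Q15 in the
// low half-word for the 16-bit coefficient paths and Q31 for the 32-bit
// paths.  Since vqrdmulh is a doubling, rounding high-half multiply, the
// three steps amount to roughly
//
//     coef = coefP0 + (((coefP1 - coefP0) * lerpP) >> 15)   // S16 coefs
//
// i.e. a linear interpolation between two adjacent polyphase filters, and
// likewise with a 31-bit shift for the S32 coefficient variants further down.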

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
        "vrev64.16      q3, q3                   \n"// (1) reverse 8 frames of the right positive

        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11"
    );
}

template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}
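
// For the 32-bit coefficient paths: each 16-bit sample is widened to a
// Q31-style value with vshll #15, multiplied by the coefficient with
// vqrdmulh.s32 (which keeps the high 32 bits of the doubled, rounded 64-bit
// product), and the per-lane products are then summed into the q0/q4
// accumulators with plain vadd.s32 instead of the widening vmlal used by the
// 16-bit coefficient paths.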

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo samples
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result

        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo samples
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result

        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}
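
// The <CHANNELS, 8> specializations below mirror the <CHANNELS, 16> variants
// above with half the unroll: 4 taps are processed per loop iteration instead
// of 8, so the sample loads use D registers where the stride-16 code uses
// full Q registers.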

template <>
inline void ProcessL<1, 8>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples
        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples
        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs

        "vrev64.16      d4, d4                   \n"// (1) reverse s3, s2, s1, s0

        // reordering the vmlal to do d6 before d4 is slower(?)
        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs           %[count], %[count], #4   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #8         \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q10"
    );
}

template <>
inline void ProcessL<2, 8>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {d4, d5}, [%[sP]]        \n"// (2+0d) load 4 16-bits stereo samples
        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (2) load 4 16-bits stereo samples
        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 4 frames of the positive side

        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs           %[count], %[count], #4   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q10"
    );
}

template <>
inline void Process<1, 8>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples
        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples
        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation

        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      d4, d4                   \n"// (1) reverse s3, s2, s1, s0

        "vadd.s16       d16, d16, d17            \n"// (1+2d) interpolate (step3) 1st set
        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set

        // reordering the vmlal to do d6 before d4 is slower(?)
        "vmlal.s16      q0, d4, d16              \n"// (1+0d) multiply (reversed) samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs           %[count], %[count], #4   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #8         \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11"
    );
}

template <>
inline void Process<2, 8>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {d4, d5}, [%[sP]]        \n"// (3+0d) load 4 16-bits stereo samples
        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (3) load 4 16-bits stereo samples
        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation

        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 4 frames of the positive side

        "vadd.s16       d16, d16, d17            \n"// (1+1d) interpolate (step3) 1st set
        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set

        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs           %[count], %[count], #4   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11"
    );
}

template <>
inline void ProcessL<1, 8>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// result, initialize to 0

        "1:                                      \n"

        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs

        "vrev64.16      d4, d4                   \n"// reverse 4 frames of the positive side

        "vshll.s16      q12, d4, #15             \n"// (stall) extend samples to 31 bits
        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q0, q0, q14              \n"// (stall) accumulate result

        "subs           %[count], %[count], #4   \n"// update loop counter
        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q14"
    );
}

template <>
inline void ProcessL<2, 8>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// result, initialize to 0
        "veor           q4, q4, q4               \n"// result, initialize to 0

        "1:                                      \n"

        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples
        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples
        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs

        "vrev64.16      q2, q2                   \n"// reverse 4 frames of the positive side

        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q4, q4, q13              \n"// accumulate result
        "vadd.s32       q0, q0, q14              \n"// accumulate result
        "vadd.s32       q4, q4, q15              \n"// accumulate result

        "subs           %[count], %[count], #4   \n"// update loop counter
        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3", "q4",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

template <>
inline void Process<1, 8>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// result, initialize to 0

        "1:                                      \n"

        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation

        "vrev64.16      d4, d4                   \n"// reverse 4 frames of the positive side

        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coefs
        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits

        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
        "vqrdmulh.s32   q11, q11, d2[0]          \n"// interpolate (step2) 2nd set of coefs
        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits

        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
        "vadd.s32       q10, q10, q11            \n"// interpolate (step3) 2nd set

        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q0, q0, q14              \n"// accumulate result

        "subs           %[count], %[count], #4   \n"// update loop counter
        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsP1] "+r" (coefsP1),
          [coefsN0] "+r" (coefsN),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q14"
    );
}

template <>
inline
void Process<2, 8>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// result, initialize to 0
        "veor           q4, q4, q4               \n"// result, initialize to 0

        "1:                                      \n"
        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples
        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples
        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation

        "vrev64.16      q2, q2                   \n"// reverse 4 frames of the positive side

        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coefs
        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits

        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
        "vqrdmulh.s32   q11, q11, d2[0]          \n"// interpolate (step2) 2nd set of coefs
        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits

        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
        "vadd.s32       q10, q10, q11            \n"// interpolate (step3) 2nd set

        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q4, q4, q13              \n"// accumulate result
        "vadd.s32       q0, q0, q14              \n"// accumulate result
        "vadd.s32       q4, q4, q15              \n"// accumulate result

        "subs           %[count], %[count], #4   \n"// update loop counter
        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsP1] "+r" (coefsP1),
          [coefsN0] "+r" (coefsN),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3", "q4",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

#endif //USE_NEON

}; // namespace android

#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/