1 /*
2  * Copyright (C) 2007 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "AudioResamplerSinc"
18 //#define LOG_NDEBUG 0
19 
20 #define __STDC_CONSTANT_MACROS
21 #include <malloc.h>
22 #include <string.h>
23 #include <stdlib.h>
24 #include <dlfcn.h>
25 
26 #include <cutils/compiler.h>
27 #include <cutils/properties.h>
28 
29 #include <utils/Log.h>
30 #include <audio_utils/primitives.h>
31 
32 #include "AudioResamplerSinc.h"
33 
// When building with a clang that does not provide __builtin_assume_aligned,
// emulate it: return the pointer unchanged and mark the misaligned case as
// unreachable so the optimizer may assume the requested alignment.
#if defined(__clang__) && !__has_builtin(__builtin_assume_aligned)
#define __builtin_assume_aligned(p, a) \
	(((uintptr_t(p) % (a)) == 0) ? (p) : (__builtin_unreachable(), (p)))
#endif

// USE_INLINE_ASSEMBLY selects the hand-written ARM instructions in the
// mulRL()/mulAdd()/mulAddRL() helpers below. It is only enabled for 32-bit ARM
// when not compiling in Thumb mode; every other target uses the C fallback.
#if defined(__arm__) && !defined(__thumb__)
#define USE_INLINE_ASSEMBLY (true)
#else
#define USE_INLINE_ASSEMBLY (false)
#endif

// USE_NEON selects the NEON-intrinsic implementation in filterCoefficient().
// It is defined for AArch64 and for 32-bit ARM builds with NEON enabled.
#if defined(__aarch64__) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define USE_NEON
#else
#undef USE_NEON
#endif

// Marks a variable as intentionally unused to silence compiler warnings.
#define UNUSED(x) ((void)(x))
53 
54 namespace android {
55 // ----------------------------------------------------------------------------
56 
57 
/*
 * These coefficients are computed with the "fir" utility found in
 * tools/resampler_tools
 * cmd-line: fir -l 7 -s 48000 -c 20478
 *
 * Selected by resample() when the input rate is <= the output rate.
 */
const uint32_t AudioResamplerSinc::mFirCoefsUp[] __attribute__ ((aligned (32))) = {
#include "AudioResamplerSincUp.h"
};

/*
 * These coefficients are optimized for 48KHz -> 44.1KHz
 * cmd-line: fir -l 7 -s 48000 -c 17189
 *
 * Selected by resample() when the input rate is > the output rate.
 */
const uint32_t AudioResamplerSinc::mFirCoefsDown[] __attribute__ ((aligned (32))) = {
#include "AudioResamplerSincDown.h"
};

// We use 15 bits to interpolate between adjacent coefficient rows.
// This cannot change because the multiplies below rely on it.
static const int pLerpBits = 15;

// Ensures init_routine() runs only once, no matter how many resamplers are constructed.
static pthread_once_t once_control = PTHREAD_ONCE_INIT;
// Coefficient accessor resolved from libaudio-resampler.so by init_routine();
// stays NULL when the library or one of its symbols is unavailable.
static readCoefficientsFn readResampleCoefficients = NULL;

// Filter parameters shared by all instances; both are filled in by init_routine().
/*static*/ AudioResamplerSinc::Constants AudioResamplerSinc::highQualityConstants;
/*static*/ AudioResamplerSinc::Constants AudioResamplerSinc::veryHighQualityConstants;
84 
init_routine()85 void AudioResamplerSinc::init_routine()
86 {
87     // for high quality resampler, the parameters for coefficients are compile-time constants
88     Constants *c = &highQualityConstants;
89     c->coefsBits = RESAMPLE_FIR_LERP_INT_BITS;
90     c->cShift = kNumPhaseBits - c->coefsBits;
91     c->cMask = ((1<< c->coefsBits)-1) << c->cShift;
92     c->pShift = kNumPhaseBits - c->coefsBits - pLerpBits;
93     c->pMask = ((1<< pLerpBits)-1) << c->pShift;
94     c->halfNumCoefs = RESAMPLE_FIR_NUM_COEF;
95 
96     // for very high quality resampler, the parameters are load-time constants
97     veryHighQualityConstants = highQualityConstants;
98 
99     // Open the dll to get the coefficients for VERY_HIGH_QUALITY
100     void *resampleCoeffLib = dlopen("libaudio-resampler.so", RTLD_NOW);
101     ALOGV("Open libaudio-resampler library = %p", resampleCoeffLib);
102     if (resampleCoeffLib == NULL) {
103         ALOGE("Could not open audio-resampler library: %s", dlerror());
104         return;
105     }
106 
107     readResampleFirNumCoeffFn readResampleFirNumCoeff;
108     readResampleFirLerpIntBitsFn readResampleFirLerpIntBits;
109 
110     readResampleCoefficients = (readCoefficientsFn)
111             dlsym(resampleCoeffLib, "readResamplerCoefficients");
112     readResampleFirNumCoeff = (readResampleFirNumCoeffFn)
113             dlsym(resampleCoeffLib, "readResampleFirNumCoeff");
114     readResampleFirLerpIntBits = (readResampleFirLerpIntBitsFn)
115             dlsym(resampleCoeffLib, "readResampleFirLerpIntBits");
116 
117     if (!readResampleCoefficients || !readResampleFirNumCoeff || !readResampleFirLerpIntBits) {
118         readResampleCoefficients = NULL;
119         dlclose(resampleCoeffLib);
120         resampleCoeffLib = NULL;
121         ALOGE("Could not find symbol: %s", dlerror());
122         return;
123     }
124 
125     c = &veryHighQualityConstants;
126     c->coefsBits = readResampleFirLerpIntBits();
127     c->cShift = kNumPhaseBits - c->coefsBits;
128     c->cMask = ((1<<c->coefsBits)-1) << c->cShift;
129     c->pShift = kNumPhaseBits - c->coefsBits - pLerpBits;
130     c->pMask = ((1<<pLerpBits)-1) << c->pShift;
131     // number of zero-crossing on each side
132     c->halfNumCoefs = readResampleFirNumCoeff();
133     ALOGV("coefsBits = %d", c->coefsBits);
134     ALOGV("halfNumCoefs = %d", c->halfNumCoefs);
135     // note that we "leak" resampleCoeffLib until the process exits
136 }
137 
138 // ----------------------------------------------------------------------------
139 
/**
 * Scale a 32-bit value by one signed 16-bit half of a packed volume word.
 *
 * @param left  non-zero selects the low 16 bits of vRL, zero selects the high 16 bits
 * @param in    value to scale
 * @param vRL   two signed 16-bit volumes packed into one 32-bit word
 * @return      portable path: (in * volume) >> 16.
 *
 * NOTE(review): the ARM path uses SMULTB/SMULTT, which multiply only the top
 * halfword of `in` by the selected volume, so its low-order bits can differ
 * from the portable path.
 */
static inline
int32_t mulRL(int left, int32_t in, uint32_t vRL)
{
#if USE_INLINE_ASSEMBLY
    int32_t out;
    // The two inputs are NOT interchangeable (each contributes a specific
    // halfword), so they must not carry the '%' commutative modifier.
    if (left) {
        asm( "smultb %[out], %[in], %[vRL] \n"
             : [out]"=r"(out)
             : [in]"r"(in), [vRL]"r"(vRL)
             : );
    } else {
        asm( "smultt %[out], %[in], %[vRL] \n"
             : [out]"=r"(out)
             : [in]"r"(in), [vRL]"r"(vRL)
             : );
    }
    return out;
#else
    // pick the requested half and reinterpret it as a signed 16-bit volume
    int16_t v = left ? int16_t(vRL) : int16_t(vRL>>16);
    // widen to 64 bits so the product cannot overflow before the shift
    return int32_t((int64_t(in) * v) >> 16);
#endif
}
162 
/**
 * Multiply-accumulate: a + ((v * in) >> 16).
 *
 * @param in  signed 16-bit factor
 * @param v   signed 32-bit factor
 * @param a   accumulator value to add to
 * @return    the accumulated result
 */
static inline
int32_t mulAdd(int16_t in, int32_t v, int32_t a)
{
#if USE_INLINE_ASSEMBLY
    int32_t out;
    // SMLAWB multiplies the full word `v` by the bottom halfword of `in`; the
    // two factors are not interchangeable, so no '%' commutative modifier.
    asm( "smlawb %[out], %[v], %[in], %[a] \n"
         : [out]"=r"(out)
         : [in]"r"(in), [v]"r"(v), [a]"r"(a)
         : );
    return out;
#else
    // widen to 64 bits so the 32x16 product cannot overflow before the shift
    return a + int32_t((int64_t(v) * in) >> 16);
#endif
}
177 
/**
 * Multiply-accumulate using one signed 16-bit half of a packed sample pair:
 * a + ((v * sample) >> 16).
 *
 * @param left  non-zero selects the low 16 bits of inRL, zero selects the high 16 bits
 * @param inRL  two signed 16-bit samples packed into one 32-bit word
 * @param v     signed 32-bit factor
 * @param a     accumulator value to add to
 * @return      the accumulated result
 */
static inline
int32_t mulAddRL(int left, uint32_t inRL, int32_t v, int32_t a)
{
#if USE_INLINE_ASSEMBLY
    int32_t out;
    // SMLAWB/SMLAWT multiply the full word `v` by one halfword of `inRL`; the
    // two factors are not interchangeable, so no '%' commutative modifier.
    if (left) {
        asm( "smlawb %[out], %[v], %[inRL], %[a] \n"
             : [out]"=r"(out)
             : [inRL]"r"(inRL), [v]"r"(v), [a]"r"(a)
             : );
    } else {
        asm( "smlawt %[out], %[v], %[inRL], %[a] \n"
             : [out]"=r"(out)
             : [inRL]"r"(inRL), [v]"r"(v), [a]"r"(a)
             : );
    }
    return out;
#else
    // pick the requested half and reinterpret it as a signed 16-bit sample
    int16_t s = left ? int16_t(inRL) : int16_t(inRL>>16);
    // widen to 64 bits so the 32x16 product cannot overflow before the shift
    return a + int32_t((int64_t(v) * s) >> 16);
#endif
}
200 
201 // ----------------------------------------------------------------------------
202 
AudioResamplerSinc(int inChannelCount,int32_t sampleRate,src_quality quality)203 AudioResamplerSinc::AudioResamplerSinc(
204         int inChannelCount, int32_t sampleRate, src_quality quality)
205     : AudioResampler(inChannelCount, sampleRate, quality),
206     mState(0), mImpulse(0), mRingFull(0), mFirCoefs(0)
207 {
208     /*
209      * Layout of the state buffer for 32 tap:
210      *
211      * "present" sample            beginning of 2nd buffer
212      *                 v                v
213      *  0              01               2              23              3
214      *  0              F0               0              F0              F
215      * [pppppppppppppppInnnnnnnnnnnnnnnnpppppppppppppppInnnnnnnnnnnnnnnn]
216      *                 ^               ^ head
217      *
218      * p = past samples, convoluted with the (p)ositive side of sinc()
219      * n = future samples, convoluted with the (n)egative side of sinc()
220      * r = extra space for implementing the ring buffer
221      *
222      */
223 
224     mVolumeSIMD[0] = 0;
225     mVolumeSIMD[1] = 0;
226 
227     // Load the constants for coefficients
228     int ok = pthread_once(&once_control, init_routine);
229     if (ok != 0) {
230         ALOGE("%s pthread_once failed: %d", __func__, ok);
231     }
232     mConstants = (quality == VERY_HIGH_QUALITY) ?
233             &veryHighQualityConstants : &highQualityConstants;
234 }
235 
236 
~AudioResamplerSinc()237 AudioResamplerSinc::~AudioResamplerSinc() {
238     free(mState);
239 }
240 
init()241 void AudioResamplerSinc::init() {
242     const Constants& c(*mConstants);
243     const size_t numCoefs = 2 * c.halfNumCoefs;
244     const size_t stateSize = numCoefs * mChannelCount * 2;
245     mState = (int16_t*)memalign(32, stateSize*sizeof(int16_t));
246     memset(mState, 0, sizeof(int16_t)*stateSize);
247     mImpulse  = mState   + (c.halfNumCoefs-1)*mChannelCount;
248     mRingFull = mImpulse + (numCoefs+1)*mChannelCount;
249 }
250 
setVolume(float left,float right)251 void AudioResamplerSinc::setVolume(float left, float right) {
252     AudioResampler::setVolume(left, right);
253     // convert to U4_28 (rounding down).
254     // integer volume values are clamped to 0 to UNITY_GAIN.
255     mVolumeSIMD[0] = u4_28_from_float(clampFloatVol(left));
256     mVolumeSIMD[1] = u4_28_from_float(clampFloatVol(right));
257 }
258 
resample(int32_t * out,size_t outFrameCount,AudioBufferProvider * provider)259 size_t AudioResamplerSinc::resample(int32_t* out, size_t outFrameCount,
260             AudioBufferProvider* provider)
261 {
262     // FIXME store current state (up or down sample) and only load the coefs when the state
263     // changes. Or load two pointers one for up and one for down in the init function.
264     // Not critical now since the read functions are fast, but would be important if read was slow.
265     if (mConstants == &veryHighQualityConstants && readResampleCoefficients) {
266         mFirCoefs = readResampleCoefficients( mInSampleRate <= mSampleRate );
267     } else {
268         mFirCoefs = (const int32_t *)
269                 ((mInSampleRate <= mSampleRate) ? mFirCoefsUp : mFirCoefsDown);
270     }
271 
272     // select the appropriate resampler
273     switch (mChannelCount) {
274     case 1:
275         return resample<1>(out, outFrameCount, provider);
276     case 2:
277         return resample<2>(out, outFrameCount, provider);
278     default:
279         LOG_ALWAYS_FATAL("invalid channel count: %d", mChannelCount);
280         return 0;
281     }
282 }
283 
284 
285 template<int CHANNELS>
resample(int32_t * out,size_t outFrameCount,AudioBufferProvider * provider)286 size_t AudioResamplerSinc::resample(int32_t* out, size_t outFrameCount,
287         AudioBufferProvider* provider)
288 {
289     const Constants& c(*mConstants);
290     const size_t headOffset = c.halfNumCoefs*CHANNELS;
291     int16_t* impulse = mImpulse;
292     uint32_t vRL = mVolumeRL;
293     size_t inputIndex = mInputIndex;
294     uint32_t phaseFraction = mPhaseFraction;
295     uint32_t phaseIncrement = mPhaseIncrement;
296     size_t outputIndex = 0;
297     size_t outputSampleCount = outFrameCount * 2;
298     size_t inFrameCount = getInFrameCountRequired(outFrameCount);
299 
300     while (outputIndex < outputSampleCount) {
301         // buffer is empty, fetch a new one
302         while (mBuffer.frameCount == 0) {
303             mBuffer.frameCount = inFrameCount;
304             provider->getNextBuffer(&mBuffer,
305                                     calculateOutputPTS(outputIndex / 2));
306             if (mBuffer.raw == NULL) {
307                 goto resample_exit;
308             }
309             const uint32_t phaseIndex = phaseFraction >> kNumPhaseBits;
310             if (phaseIndex == 1) {
311                 // read one frame
312                 read<CHANNELS>(impulse, phaseFraction, mBuffer.i16, inputIndex);
313             } else if (phaseIndex == 2) {
314                 // read 2 frames
315                 read<CHANNELS>(impulse, phaseFraction, mBuffer.i16, inputIndex);
316                 inputIndex++;
317                 if (inputIndex >= mBuffer.frameCount) {
318                     inputIndex -= mBuffer.frameCount;
319                     provider->releaseBuffer(&mBuffer);
320                 } else {
321                     read<CHANNELS>(impulse, phaseFraction, mBuffer.i16, inputIndex);
322                 }
323             }
324         }
325         int16_t const * const in = mBuffer.i16;
326         const size_t frameCount = mBuffer.frameCount;
327 
328         // Always read-in the first samples from the input buffer
329         int16_t* head = impulse + headOffset;
330         for (size_t i=0 ; i<CHANNELS ; i++) {
331             head[i] = in[inputIndex*CHANNELS + i];
332         }
333 
334         // handle boundary case
335         while (CC_LIKELY(outputIndex < outputSampleCount)) {
336             filterCoefficient<CHANNELS>(&out[outputIndex], phaseFraction, impulse, vRL);
337             outputIndex += 2;
338 
339             phaseFraction += phaseIncrement;
340             const size_t phaseIndex = phaseFraction >> kNumPhaseBits;
341             for (size_t i=0 ; i<phaseIndex ; i++) {
342                 inputIndex++;
343                 if (inputIndex >= frameCount) {
344                     goto done;  // need a new buffer
345                 }
346                 read<CHANNELS>(impulse, phaseFraction, in, inputIndex);
347             }
348         }
349 done:
350         // if done with buffer, save samples
351         if (inputIndex >= frameCount) {
352             inputIndex -= frameCount;
353             provider->releaseBuffer(&mBuffer);
354         }
355     }
356 
357 resample_exit:
358     mImpulse = impulse;
359     mInputIndex = inputIndex;
360     mPhaseFraction = phaseFraction;
361     return outputIndex / CHANNELS;
362 }
363 
364 template<int CHANNELS>
365 /***
366 * read()
367 *
368 * This function reads only one frame from input buffer and writes it in
369 * state buffer
370 *
371 **/
read(int16_t * & impulse,uint32_t & phaseFraction,const int16_t * in,size_t inputIndex)372 void AudioResamplerSinc::read(
373         int16_t*& impulse, uint32_t& phaseFraction,
374         const int16_t* in, size_t inputIndex)
375 {
376     impulse += CHANNELS;
377     phaseFraction -= 1LU<<kNumPhaseBits;
378 
379     const Constants& c(*mConstants);
380     if (CC_UNLIKELY(impulse >= mRingFull)) {
381         const size_t stateSize = (c.halfNumCoefs*2)*CHANNELS;
382         memcpy(mState, mState+stateSize, sizeof(int16_t)*stateSize);
383         impulse -= stateSize;
384     }
385 
386     int16_t* head = impulse + c.halfNumCoefs*CHANNELS;
387     for (size_t i=0 ; i<CHANNELS ; i++) {
388         head[i] = in[inputIndex*CHANNELS + i];
389     }
390 }
391 
/**
 * Computes one output frame and accumulates it into out[0] (left) and
 * out[1] (right).
 *
 * The phase selects a row of the coefficient table plus a 15-bit fraction used
 * to interpolate linearly between that row and the next one. Samples at and
 * before `samples` are paired with the "positive" coefficients (walking
 * backwards); samples after it are paired with the "negative" coefficients
 * (walking forwards).
 *
 * @param out      two accumulators (left, right) that the result is added to
 * @param phase    current phase; only the bits covered by cMask/pMask are used
 * @param samples  pointer to the "present" frame inside the state buffer
 * @param vRL      packed 16-bit volumes, used by the scalar path only; the
 *                 NEON path reads mVolumeSIMD instead
 */
template<int CHANNELS>
void AudioResamplerSinc::filterCoefficient(int32_t* out, uint32_t phase,
         const int16_t *samples, uint32_t vRL)
{
    // NOTE: be very careful when modifying the code here. register
    // pressure is very high and a small change might cause the compiler
    // to generate far less efficient code.
    // Always sanity check the result with objdump or test-resample.

    // compute the index of the coefficient on the positive side and
    // negative side
    const Constants& c(*mConstants);
    // all bits of the fractional phase set; ONE-phase mirrors the phase for
    // the negative side
    const int32_t ONE = c.cMask | c.pMask;
    uint32_t indexP = ( phase & c.cMask) >> c.cShift;
    uint32_t lerpP  = ( phase & c.pMask) >> c.pShift;
    uint32_t indexN = ((ONE-phase) & c.cMask) >> c.cShift;
    uint32_t lerpN  = ((ONE-phase) & c.pMask) >> c.pShift;

    // each table row holds halfNumCoefs coefficients, so scale the row index
    // into an element offset
    const size_t offset = c.halfNumCoefs;
    indexP *= offset;
    indexN *= offset;

    int32_t const* coefsP = mFirCoefs + indexP;
    int32_t const* coefsN = mFirCoefs + indexN;
    // sP walks backwards from the present frame, sN forwards from the next one
    int16_t const* sP = samples;
    int16_t const* sN = samples + CHANNELS;

    size_t count = offset;

#ifndef USE_NEON
    // Scalar path: one tap from each side per iteration.
    int32_t l = 0;
    int32_t r = 0;
    for (size_t i=0 ; i<count ; i++) {
        interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
        sP -= CHANNELS;
        interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
        sN += CHANNELS;
    }
    // apply the per-channel volume (low half of vRL for left, high half for right)
    out[0] += 2 * mulRL(1, l, vRL);
    out[1] += 2 * mulRL(0, r, vRL);
#else
    UNUSED(vRL);
    if (CHANNELS == 1) {
        // row following the selected one, used as the interpolation target
        int32_t const* coefsP1 = coefsP + offset;
        int32_t const* coefsN1 = coefsN + offset;
        // back up 3 frames so that a 4-frame load ends on the present frame
        sP -= CHANNELS*3;

        int32x4_t sum;
        int32x2_t lerpPN;
        // lane 0 = positive-side lerp, lane 1 = negative-side lerp; shifting
        // by 16 moves the fraction into the upper bits expected by the
        // doubling high-half multiply below
        lerpPN = vdup_n_s32(0);
        lerpPN = vld1_lane_s32((int32_t *)&lerpP, lerpPN, 0);
        lerpPN = vld1_lane_s32((int32_t *)&lerpN, lerpPN, 1);
        lerpPN = vshl_n_s32(lerpPN, 16);
        sum = vdupq_n_s32(0);

        int16x4_t sampleP, sampleN;
        int32x4_t samplePExt, sampleNExt;
        int32x4_t coefsPV0, coefsPV1, coefsNV0, coefsNV1;

        // NOTE(review): this promises the compiler that every coefficient row
        // start is 16-byte aligned -- confirm for tables supplied by the
        // dynamically loaded library.
        coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
        coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
        coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
        // 4 taps per side per iteration.
        // NOTE(review): count is unsigned, so this loop only terminates
        // correctly when halfNumCoefs is a multiple of 4.
        for (; count > 0; count -= 4) {
            sampleP = vld1_s16(sP);
            sampleN = vld1_s16(sN);
            coefsPV0 = vld1q_s32(coefsP);
            coefsNV0 = vld1q_s32(coefsN);
            coefsPV1 = vld1q_s32(coefsP1);
            coefsNV1 = vld1q_s32(coefsN1);
            sP -= 4;
            sN += 4;
            coefsP += 4;
            coefsN += 4;
            coefsP1 += 4;
            coefsN1 += 4;

            // past samples were loaded in ascending address order; reverse
            // them so they line up with the coefficient order
            sampleP = vrev64_s16(sampleP);

            // interpolate (step1)
            coefsPV1 = vsubq_s32(coefsPV1, coefsPV0);
            coefsNV1 = vsubq_s32(coefsNV1, coefsNV0);
            samplePExt = vshll_n_s16(sampleP, 15);
            // interpolate (step2)
            coefsPV1 = vqrdmulhq_lane_s32(coefsPV1, lerpPN, 0);
            coefsNV1 = vqrdmulhq_lane_s32(coefsNV1, lerpPN, 1);
            sampleNExt = vshll_n_s16(sampleN, 15);
            // interpolate (step3)
            coefsPV0 = vaddq_s32(coefsPV0, coefsPV1);
            coefsNV0 = vaddq_s32(coefsNV0, coefsNV1);

            // multiply samples by the interpolated coefficients and accumulate
            samplePExt = vqrdmulhq_s32(samplePExt, coefsPV0);
            sampleNExt = vqrdmulhq_s32(sampleNExt, coefsNV0);
            sum = vaddq_s32(sum, samplePExt);
            sum = vaddq_s32(sum, sampleNExt);
        }
        int32x2_t volumesV, outV;
        volumesV = vld1_s32(mVolumeSIMD);
        outV = vld1_s32(out);

        //add all 4 partial sums
        int32x2_t sumLow, sumHigh;
        sumLow = vget_low_s32(sum);
        sumHigh = vget_high_s32(sum);
        sumLow = vpadd_s32(sumLow, sumHigh);
        sumLow = vpadd_s32(sumLow, sumLow);

        // both lanes hold the same mono sum; scale each by its own channel
        // volume and accumulate into the left/right outputs
        sumLow = vqrdmulh_s32(sumLow, volumesV);
        outV = vadd_s32(outV, sumLow);
        vst1_s32(out, outV);
    } else if (CHANNELS == 2) {
        // row following the selected one, used as the interpolation target
        int32_t const* coefsP1 = coefsP + offset;
        int32_t const* coefsN1 = coefsN + offset;
        // back up 3 frames so that a 4-frame load ends on the present frame
        sP -= CHANNELS*3;

        // sum0 accumulates the first interleaved channel, sum1 the second
        int32x4_t sum0, sum1;
        int32x2_t lerpPN;

        // lane 0 = positive-side lerp, lane 1 = negative-side lerp
        lerpPN = vdup_n_s32(0);
        lerpPN = vld1_lane_s32((int32_t *)&lerpP, lerpPN, 0);
        lerpPN = vld1_lane_s32((int32_t *)&lerpN, lerpPN, 1);
        lerpPN = vshl_n_s32(lerpPN, 16);
        sum0 = vdupq_n_s32(0);
        sum1 = vdupq_n_s32(0);

        int16x4x2_t sampleP, sampleN;
        int32x4x2_t samplePExt, sampleNExt;
        int32x4_t coefsPV0, coefsPV1, coefsNV0, coefsNV1;

        // NOTE(review): same 16-byte alignment promise as in the mono branch.
        coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
        coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
        coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
        // 4 frames per side per iteration.
        // NOTE(review): requires halfNumCoefs to be a multiple of 4 (unsigned count).
        for (; count > 0; count -= 4) {
            // de-interleaving loads: val[0] and val[1] each get one channel
            sampleP = vld2_s16(sP);
            sampleN = vld2_s16(sN);
            coefsPV0 = vld1q_s32(coefsP);
            coefsNV0 = vld1q_s32(coefsN);
            coefsPV1 = vld1q_s32(coefsP1);
            coefsNV1 = vld1q_s32(coefsN1);
            sP -= 8;
            sN += 8;
            coefsP += 4;
            coefsN += 4;
            coefsP1 += 4;
            coefsN1 += 4;

            // reverse the past samples so they line up with the coefficient order
            sampleP.val[0] = vrev64_s16(sampleP.val[0]);
            sampleP.val[1] = vrev64_s16(sampleP.val[1]);

            // interpolate (step1)
            coefsPV1 = vsubq_s32(coefsPV1, coefsPV0);
            coefsNV1 = vsubq_s32(coefsNV1, coefsNV0);
            samplePExt.val[0] = vshll_n_s16(sampleP.val[0], 15);
            samplePExt.val[1] = vshll_n_s16(sampleP.val[1], 15);
            // interpolate (step2)
            coefsPV1 = vqrdmulhq_lane_s32(coefsPV1, lerpPN, 0);
            coefsNV1 = vqrdmulhq_lane_s32(coefsNV1, lerpPN, 1);
            sampleNExt.val[0] = vshll_n_s16(sampleN.val[0], 15);
            sampleNExt.val[1] = vshll_n_s16(sampleN.val[1], 15);
            // interpolate (step3)
            coefsPV0 = vaddq_s32(coefsPV0, coefsPV1);
            coefsNV0 = vaddq_s32(coefsNV0, coefsNV1);

            // the same interpolated coefficients are applied to both channels
            samplePExt.val[0] = vqrdmulhq_s32(samplePExt.val[0], coefsPV0);
            samplePExt.val[1] = vqrdmulhq_s32(samplePExt.val[1], coefsPV0);
            sampleNExt.val[0] = vqrdmulhq_s32(sampleNExt.val[0], coefsNV0);
            sampleNExt.val[1] = vqrdmulhq_s32(sampleNExt.val[1], coefsNV0);
            sum0 = vaddq_s32(sum0, samplePExt.val[0]);
            sum1 = vaddq_s32(sum1, samplePExt.val[1]);
            sum0 = vaddq_s32(sum0, sampleNExt.val[0]);
            sum1 = vaddq_s32(sum1, sampleNExt.val[1]);
        }
        int32x2_t volumesV, outV;
        volumesV = vld1_s32(mVolumeSIMD);
        outV = vld1_s32(out);

        //add all 4 partial sums
        int32x2_t sumLow0, sumHigh0, sumLow1, sumHigh1;
        sumLow0 = vget_low_s32(sum0);
        sumHigh0 = vget_high_s32(sum0);
        sumLow1 = vget_low_s32(sum1);
        sumHigh1 = vget_high_s32(sum1);
        sumLow0 = vpadd_s32(sumLow0, sumHigh0);
        sumLow0 = vpadd_s32(sumLow0, sumLow0);
        sumLow1 = vpadd_s32(sumLow1, sumHigh1);
        sumLow1 = vpadd_s32(sumLow1, sumLow1);

        // combine into (channel 0 total, channel 1 total), apply the
        // per-channel volumes and accumulate into the output pair
        sumLow0 = vtrn_s32(sumLow0, sumLow1).val[0];
        sumLow0 = vqrdmulh_s32(sumLow0, volumesV);
        outV = vadd_s32(outV, sumLow0);
        vst1_s32(out, outV);
    }
#endif
}
587 
588 template<int CHANNELS>
interpolate(int32_t & l,int32_t & r,const int32_t * coefs,size_t offset,int32_t lerp,const int16_t * samples)589 void AudioResamplerSinc::interpolate(
590         int32_t& l, int32_t& r,
591         const int32_t* coefs, size_t offset,
592         int32_t lerp, const int16_t* samples)
593 {
594     int32_t c0 = coefs[0];
595     int32_t c1 = coefs[offset];
596     int32_t sinc = mulAdd(lerp, (c1-c0)<<1, c0);
597     if (CHANNELS == 2) {
598         uint32_t rl = *reinterpret_cast<const uint32_t*>(samples);
599         l = mulAddRL(1, rl, sinc, l);
600         r = mulAddRL(0, rl, sinc, r);
601     } else {
602         r = l = mulAdd(samples[0], sinc, l);
603     }
604 }
605 // ----------------------------------------------------------------------------
606 } // namespace android
607