1 /*
2  * Copyright (C) 2016 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
18 #define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
19 
20 namespace android {
21 
22 // depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
23 
24 #if USE_SSE
25 
// Two-level stringification: TO_STRING() lets its argument be macro-expanded first and
// then passes the result to TO_STRING2(), which applies the # operator to it.
#define TO_STRING2(token) #token
#define TO_STRING(token) TO_STRING2(token)
28 // uncomment to print GCC version, may be relevant for intrinsic optimizations
29 /* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
30         "." TO_STRING(__GNUC_MINOR__) \
31         "." TO_STRING(__GNUC_PATCHLEVEL__)) */
32 
33 //
34 // SSEx specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
35 //
36 
37 template <int CHANNELS, int STRIDE, bool FIXED>
ProcessSSEIntrinsic(float * out,int count,const float * coefsP,const float * coefsN,const float * sP,const float * sN,const float * volumeLR,float lerpP,const float * coefsP1,const float * coefsN1)38 static inline void ProcessSSEIntrinsic(float* out,
39         int count,
40         const float* coefsP,
41         const float* coefsN,
42         const float* sP,
43         const float* sN,
44         const float* volumeLR,
45         float lerpP,
46         const float* coefsP1,
47         const float* coefsN1)
48 {
49     ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
50     static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");
51 
52     sP -= CHANNELS*(4-1);   // adjust sP for a loop iteration of four
53 
54     __m128 interp;
55     if (!FIXED) {
56         interp = _mm_set1_ps(lerpP);
57     }
58 
59     __m128 accL, accR;
60     accL = _mm_setzero_ps();
61     if (CHANNELS == 2) {
62         accR = _mm_setzero_ps();
63     }
64 
65     do {
66         __m128 posCoef = _mm_load_ps(coefsP);
67         __m128 negCoef = _mm_load_ps(coefsN);
68         coefsP += 4;
69         coefsN += 4;
70 
71         if (!FIXED) { // interpolate
72             __m128 posCoef1 = _mm_load_ps(coefsP1);
73             __m128 negCoef1 = _mm_load_ps(coefsN1);
74             coefsP1 += 4;
75             coefsN1 += 4;
76 
77             // Calculate the final coefficient for interpolation
78             // posCoef = interp * (posCoef1 - posCoef) + posCoef
79             // negCoef = interp * (negCoef - negCoef1) + negCoef1
80             posCoef1 = _mm_sub_ps(posCoef1, posCoef);
81             negCoef = _mm_sub_ps(negCoef, negCoef1);
82 
83 
84             #if USE_AVX2
85             posCoef = _mm_fmadd_ps(posCoef1, interp, posCoef);
86             negCoef = _mm_fmadd_ps(negCoef, interp, negCoef1);
87             #else
88             posCoef1 = _mm_mul_ps(posCoef1, interp);
89             negCoef = _mm_mul_ps(negCoef, interp);
90             posCoef = _mm_add_ps(posCoef1, posCoef);
91             negCoef = _mm_add_ps(negCoef, negCoef1);
92             #endif //USE_AVX2
93         }
94         switch (CHANNELS) {
95         case 1: {
96             __m128 posSamp = _mm_loadu_ps(sP);
97             __m128 negSamp = _mm_loadu_ps(sN);
98             sP -= 4;
99             sN += 4;
100 
101             posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);
102 
103             #if USE_AVX2
104             accL = _mm_fmadd_ps(posSamp, posCoef, accL);
105             accL = _mm_fmadd_ps(negSamp, negCoef, accL);
106             #else
107             posSamp = _mm_mul_ps(posSamp, posCoef);
108             negSamp = _mm_mul_ps(negSamp, negCoef);
109             accL = _mm_add_ps(accL, posSamp);
110             accL = _mm_add_ps(accL, negSamp);
111             #endif
112 
113         } break;
114         case 2: {
115             __m128 posSamp0 = _mm_loadu_ps(sP);
116             __m128 posSamp1 = _mm_loadu_ps(sP+4);
117             __m128 negSamp0 = _mm_loadu_ps(sN);
118             __m128 negSamp1 = _mm_loadu_ps(sN+4);
119             sP -= 8;
120             sN += 8;
121 
122             // deinterleave everything and reverse the positives
123             __m128 posSampL = _mm_shuffle_ps(posSamp1, posSamp0, 0x22);
124             __m128 posSampR = _mm_shuffle_ps(posSamp1, posSamp0, 0x77);
125             __m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
126             __m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);
127 
128            #if USE_AVX2
129            accL = _mm_fmadd_ps(posSampL, posCoef, accL);
130            accR = _mm_fmadd_ps(posSampR, posCoef, accR);
131            accL = _mm_fmadd_ps(negSampL, negCoef, accL);
132            accR = _mm_fmadd_ps(negSampR, negCoef, accR);
133            #else
134            posSampL = _mm_mul_ps(posSampL, posCoef);
135            posSampR = _mm_mul_ps(posSampR, posCoef);
136            negSampL = _mm_mul_ps(negSampL, negCoef);
137            negSampR = _mm_mul_ps(negSampR, negCoef);
138 
139            accL = _mm_add_ps(accL, posSampL);
140            accR = _mm_add_ps(accR, posSampR);
141            accL = _mm_add_ps(accL, negSampL);
142            accR = _mm_add_ps(accR, negSampR);
143            #endif
144 
145         } break;
146         }
147     } while (count -= 4);
148 
149     // multiply by volume and save
150     __m128 vLR = _mm_setzero_ps();
151     __m128 outSamp;
152     vLR = _mm_loadl_pi(vLR, reinterpret_cast<const __m64*>(volumeLR));
153     outSamp = _mm_loadl_pi(vLR, reinterpret_cast<__m64*>(out));
154 
155     // combine and funnel down accumulator
156     __m128 outAccum = _mm_setzero_ps();
157     if (CHANNELS == 1) {
158         // duplicate accL to both L and R
159         outAccum = _mm_add_ps(accL, _mm_movehl_ps(accL, accL));
160         outAccum = _mm_add_ps(outAccum, _mm_shuffle_ps(outAccum, outAccum, 0x11));
161     } else if (CHANNELS == 2) {
162         // accR contains R, fold in
163         outAccum = _mm_hadd_ps(accL, accR);
164         outAccum = _mm_hadd_ps(outAccum, outAccum);
165     }
166     #if USE_AVX2
167     outSamp = _mm_fmadd_ps(outAccum, vLR,outSamp);
168     #else
169     outAccum = _mm_mul_ps(outAccum, vLR);
170     outSamp = _mm_add_ps(outSamp, outAccum);
171     #endif
172 
173     _mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
174 }
175 
176 template<>
177 inline void ProcessL<1, 16>(float* const out,
178         int count,
179         const float* coefsP,
180         const float* coefsN,
181         const float* sP,
182         const float* sN,
183         const float* const volumeLR)
184 {
185     ProcessSSEIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
186             0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
187 }
188 
189 template<>
190 inline void ProcessL<2, 16>(float* const out,
191         int count,
192         const float* coefsP,
193         const float* coefsN,
194         const float* sP,
195         const float* sN,
196         const float* const volumeLR)
197 {
198     ProcessSSEIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
199             0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
200 }
201 
202 template<>
203 inline void Process<1, 16>(float* const out,
204         int count,
205         const float* coefsP,
206         const float* coefsN,
207         const float* coefsP1,
208         const float* coefsN1,
209         const float* sP,
210         const float* sN,
211         float lerpP,
212         const float* const volumeLR)
213 {
214     ProcessSSEIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
215             lerpP, coefsP1, coefsN1);
216 }
217 
218 template<>
219 inline void Process<2, 16>(float* const out,
220         int count,
221         const float* coefsP,
222         const float* coefsN,
223         const float* coefsP1,
224         const float* coefsN1,
225         const float* sP,
226         const float* sN,
227         float lerpP,
228         const float* const volumeLR)
229 {
230     ProcessSSEIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
231             lerpP, coefsP1, coefsN1);
232 }
233 
234 #endif //USE_SSE
235 
236 } // namespace android
237 
238 #endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H*/
239