1 /*
2 * Copyright (C) 2016 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
18 #define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
19
20 namespace android {
21
22 // depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
23
24 #if USE_SSE
25
// Two-level stringification helpers.
// TO_STRING2(x) quotes its argument verbatim; TO_STRING(x) lets the
// preprocessor macro-expand x first and then quotes the expansion, so
// TO_STRING(__GNUC__) yields the compiler's major version as a string literal.
#define TO_STRING2(token) #token
#define TO_STRING(token) TO_STRING2(token)
// Uncomment to print the GCC version; it may be relevant for intrinsic optimizations.
/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
        "." TO_STRING(__GNUC_MINOR__) \
        "." TO_STRING(__GNUC_PATCHLEVEL__)) */
32
33 //
34 // SSEx specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
35 //
36
37 template <int CHANNELS, int STRIDE, bool FIXED>
ProcessSSEIntrinsic(float * out,int count,const float * coefsP,const float * coefsN,const float * sP,const float * sN,const float * volumeLR,float lerpP,const float * coefsP1,const float * coefsN1)38 static inline void ProcessSSEIntrinsic(float* out,
39 int count,
40 const float* coefsP,
41 const float* coefsN,
42 const float* sP,
43 const float* sN,
44 const float* volumeLR,
45 float lerpP,
46 const float* coefsP1,
47 const float* coefsN1)
48 {
49 ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
50 static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");
51
52 sP -= CHANNELS*(4-1); // adjust sP for a loop iteration of four
53
54 __m128 interp;
55 if (!FIXED) {
56 interp = _mm_set1_ps(lerpP);
57 }
58
59 __m128 accL, accR;
60 accL = _mm_setzero_ps();
61 if (CHANNELS == 2) {
62 accR = _mm_setzero_ps();
63 }
64
65 do {
66 __m128 posCoef = _mm_load_ps(coefsP);
67 __m128 negCoef = _mm_load_ps(coefsN);
68 coefsP += 4;
69 coefsN += 4;
70
71 if (!FIXED) { // interpolate
72 __m128 posCoef1 = _mm_load_ps(coefsP1);
73 __m128 negCoef1 = _mm_load_ps(coefsN1);
74 coefsP1 += 4;
75 coefsN1 += 4;
76
77 // Calculate the final coefficient for interpolation
78 // posCoef = interp * (posCoef1 - posCoef) + posCoef
79 // negCoef = interp * (negCoef - negCoef1) + negCoef1
80 posCoef1 = _mm_sub_ps(posCoef1, posCoef);
81 negCoef = _mm_sub_ps(negCoef, negCoef1);
82
83 posCoef1 = _mm_mul_ps(posCoef1, interp);
84 negCoef = _mm_mul_ps(negCoef, interp);
85
86 posCoef = _mm_add_ps(posCoef1, posCoef);
87 negCoef = _mm_add_ps(negCoef, negCoef1);
88 }
89 switch (CHANNELS) {
90 case 1: {
91 __m128 posSamp = _mm_loadu_ps(sP);
92 __m128 negSamp = _mm_loadu_ps(sN);
93 sP -= 4;
94 sN += 4;
95
96 posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);
97 posSamp = _mm_mul_ps(posSamp, posCoef);
98 negSamp = _mm_mul_ps(negSamp, negCoef);
99
100 accL = _mm_add_ps(accL, posSamp);
101 accL = _mm_add_ps(accL, negSamp);
102 } break;
103 case 2: {
104 __m128 posSamp0 = _mm_loadu_ps(sP);
105 __m128 posSamp1 = _mm_loadu_ps(sP+4);
106 __m128 negSamp0 = _mm_loadu_ps(sN);
107 __m128 negSamp1 = _mm_loadu_ps(sN+4);
108 sP -= 8;
109 sN += 8;
110
111 // deinterleave everything and reverse the positives
112 __m128 posSampL = _mm_shuffle_ps(posSamp1, posSamp0, 0x22);
113 __m128 posSampR = _mm_shuffle_ps(posSamp1, posSamp0, 0x77);
114 __m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
115 __m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);
116
117 posSampL = _mm_mul_ps(posSampL, posCoef);
118 posSampR = _mm_mul_ps(posSampR, posCoef);
119 negSampL = _mm_mul_ps(negSampL, negCoef);
120 negSampR = _mm_mul_ps(negSampR, negCoef);
121
122 accL = _mm_add_ps(accL, posSampL);
123 accR = _mm_add_ps(accR, posSampR);
124 accL = _mm_add_ps(accL, negSampL);
125 accR = _mm_add_ps(accR, negSampR);
126 } break;
127 }
128 } while (count -= 4);
129
130 // multiply by volume and save
131 __m128 vLR = _mm_setzero_ps();
132 __m128 outSamp;
133 vLR = _mm_loadl_pi(vLR, reinterpret_cast<const __m64*>(volumeLR));
134 outSamp = _mm_loadl_pi(vLR, reinterpret_cast<__m64*>(out));
135
136 // combine and funnel down accumulator
137 __m128 outAccum = _mm_setzero_ps();
138 if (CHANNELS == 1) {
139 // duplicate accL to both L and R
140 outAccum = _mm_add_ps(accL, _mm_movehl_ps(accL, accL));
141 outAccum = _mm_add_ps(outAccum, _mm_shuffle_ps(outAccum, outAccum, 0x11));
142 } else if (CHANNELS == 2) {
143 // accR contains R, fold in
144 outAccum = _mm_hadd_ps(accL, accR);
145 outAccum = _mm_hadd_ps(outAccum, outAccum);
146 }
147
148 outAccum = _mm_mul_ps(outAccum, vLR);
149 outSamp = _mm_add_ps(outSamp, outAccum);
150 _mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
151 }
152
153 template<>
154 inline void ProcessL<1, 16>(float* const out,
155 int count,
156 const float* coefsP,
157 const float* coefsN,
158 const float* sP,
159 const float* sN,
160 const float* const volumeLR)
161 {
162 ProcessSSEIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
163 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
164 }
165
166 template<>
167 inline void ProcessL<2, 16>(float* const out,
168 int count,
169 const float* coefsP,
170 const float* coefsN,
171 const float* sP,
172 const float* sN,
173 const float* const volumeLR)
174 {
175 ProcessSSEIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
176 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
177 }
178
179 template<>
180 inline void Process<1, 16>(float* const out,
181 int count,
182 const float* coefsP,
183 const float* coefsN,
184 const float* coefsP1,
185 const float* coefsN1,
186 const float* sP,
187 const float* sN,
188 float lerpP,
189 const float* const volumeLR)
190 {
191 ProcessSSEIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
192 lerpP, coefsP1, coefsN1);
193 }
194
195 template<>
196 inline void Process<2, 16>(float* const out,
197 int count,
198 const float* coefsP,
199 const float* coefsN,
200 const float* coefsP1,
201 const float* coefsN1,
202 const float* sP,
203 const float* sN,
204 float lerpP,
205 const float* const volumeLR)
206 {
207 ProcessSSEIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
208 lerpP, coefsP1, coefsN1);
209 }
210
211 #endif //USE_SSE
212
213 } // namespace android
214
215 #endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H*/
216