1 /*
2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "aecm_core.h"
12
13 #include <arm_neon.h>
14 #include <assert.h>
15
16
17 // Square root of Hanning window in Q14.
18 static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) = {
19 16384, 16373, 16354, 16325,
20 16286, 16237, 16179, 16111,
21 16034, 15947, 15851, 15746,
22 15631, 15506, 15373, 15231,
23 15079, 14918, 14749, 14571,
24 14384, 14189, 13985, 13773,
25 13553, 13325, 13089, 12845,
26 12594, 12335, 12068, 11795,
27 11514, 11227, 10933, 10633,
28 10326, 10013, 9695, 9370,
29 9040, 8705, 8364, 8019,
30 7668, 7313, 6954, 6591,
31 6224, 5853, 5478, 5101,
32 4720, 4337, 3951, 3562,
33 3172, 2780, 2386, 1990,
34 1594, 1196, 798, 399
35 };
36
// Applies the square-root Hanning window to a block of 2 * PART_LEN time
// samples and transforms it to the frequency domain.
//
// fft                  Work buffer. Receives the windowed samples as
//                      interleaved (real, 0) pairs in fft[0 .. PART_LEN4 - 1]
//                      and is then transformed in place.
// time_signal          Input samples; indices 0 .. 2 * PART_LEN - 1 are read.
// freq_signal          Output. Entries 0 .. PART_LEN - 1 receive the first
//                      PART_LEN2 values of the FFT with the imaginary part
//                      negated.
// time_signal_scaling  Per-sample shift applied before windowing (a vector
//                      shift, so a negative count shifts right).
//
// The routine uses NEON intrinsics only. Earlier revisions passed values
// between separate inline-asm statements through fixed registers (d20/d21),
// which the compiler is free to reuse in between; with intrinsics the
// compiler tracks both the registers and the memory accesses.
static void WindowAndFFTNeon(WebRtc_Word16* fft,
                             const WebRtc_Word16* time_signal,
                             complex16_t* freq_signal,
                             int time_signal_scaling) {
  int i, j;

  const int16x4_t scaling = vdup_n_s16(time_signal_scaling);
  int16x4x2_t windowed;

  // The imaginary part of every time-domain sample is zero.
  windowed.val[1] = vdup_n_s16(0);

  for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
    int16x4_t samples;
    int32x4_t product;

    // Rising half of the window:
    // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
    //     (time_signal[i] << time_signal_scaling),
    //     WebRtcAecm_kSqrtHanning[i], 14);
    samples = vshl_s16(vld1_s16(&time_signal[i]), scaling);
    product = vmull_s16(samples, vld1_s16(&WebRtcAecm_kSqrtHanning[i]));
    windowed.val[0] = vshrn_n_s32(product, 14);
    vst2_s16(&fft[j], windowed);  // Interleave (real, 0).

    // Falling half of the window:
    // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
    //     (time_signal[PART_LEN + i] << time_signal_scaling),
    //     WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
    samples = vshl_s16(vld1_s16(&time_signal[i + PART_LEN]), scaling);
    product = vmull_s16(samples, vld1_s16(&kSqrtHanningReversed[i]));
    windowed.val[0] = vshrn_n_s32(product, 14);
    vst2_s16(&fft[PART_LEN2 + j], windowed);
  }

  WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
  WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);

  // Take only the first PART_LEN2 samples, and switch the sign of the
  // imaginary part. Each iteration handles 8 complex values.
  for (i = 0, j = 0; j < PART_LEN2; i += 8, j += 16) {
    int16x8x2_t bins = vld2q_s16(&fft[j]);  // val[0] = real, val[1] = imag.
    bins.val[1] = vnegq_s16(bins.val[1]);
    vst2q_s16(&freq_signal[i].real, bins);
  }
}
88
// Transforms the spectrum |efw| back to the time domain, windows it,
// overlap-adds it with the previous block and shifts the history buffers.
//
// aecm          State. outBuf is read (overlap from the previous block) and
//               then overwritten with the windowed second half of this block;
//               dfaCleanQDomain sets the output scaling; the second halves of
//               xBuf, dBufNoisy and (if nearendClean != NULL) dBufClean are
//               copied to their first halves.
// fft           Work buffer. Must hold at least PART_LEN4 + 2 elements, since
//               the mirrored store of bin 0 writes fft[PART_LEN4] and
//               fft[PART_LEN4 + 1].
// efw           Input spectrum; entries 0 .. PART_LEN are read.
// output        Receives PART_LEN saturated output samples.
// nearendClean  Only tested against NULL here.
//
// The routine uses NEON intrinsics only. The earlier inline-asm version
// modified an operand declared as input-only (the vadd before saturation),
// under-declared clobbers in the block copies and passed values between asm
// statements through fixed registers without declaring any memory access.
static void InverseFFTAndWindowNeon(AecmCore_t* aecm,
                                    WebRtc_Word16* fft,
                                    complex16_t* efw,
                                    WebRtc_Word16* output,
                                    const WebRtc_Word16* nearendClean) {
  int i, j, outCFFT;
  int32x4_t q_shift;

  // Synthesis: build the conjugate-symmetric input of the inverse FFT.
  for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) {
    const int16x4x2_t bins = vld2_s16(&efw[i].real);  // val[0]=re, val[1]=im
    int16x4x2_t conjugated;
    int16x4x2_t mirrored;

    // fft[j] = efw[i].real; fft[j + 1] = -efw[i].imag;
    conjugated.val[0] = bins.val[0];
    conjugated.val[1] = vneg_s16(bins.val[1]);
    vst2_s16(&fft[j], conjugated);

    // Mirrored data: fft[PART_LEN4 - j] = efw[i].real;
    //                fft[PART_LEN4 - j + 1] = efw[i].imag;
    // The four bins are reversed so one store covers them. For j == 0 this
    // writes two elements past PART_LEN4, which the buffer must allow.
    mirrored.val[0] = vrev64_s16(bins.val[0]);
    mirrored.val[1] = vrev64_s16(bins.val[1]);
    vst2_s16(&fft[PART_LEN4 - j - 6], mirrored);
  }

  fft[PART_LEN2] = efw[PART_LEN].real;
  fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;

  // Inverse FFT, result should be scaled with outCFFT.
  WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
  outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);

  // Take only the real values, compacting them to the start of fft[]. The
  // read position is never behind the write position, and each load
  // completes before the matching store.
  for (i = 0, j = 0; i < PART_LEN2; i += 8, j += 16) {
    const int16x8x2_t bins = vld2q_s16(&fft[j]);
    vst1q_s16(&fft[i], bins.val[0]);
  }

  // Per-lane shift count; vshl with a negative count shifts right.
  q_shift = vdupq_n_s32((WebRtc_Word32)(outCFFT - aecm->dfaCleanQDomain));

  for (i = 0; i < PART_LEN; i += 4) {
    int32x4_t acc;
    int16x4_t result;

    // fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
    //     fft[i], WebRtcAecm_kSqrtHanning[i], 14);
    acc = vmull_s16(vld1_s16(&fft[i]), vld1_s16(&WebRtcAecm_kSqrtHanning[i]));
    acc = vrshrq_n_s32(acc, 14);

    // tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
    //                                 outCFFT - aecm->dfaCleanQDomain);
    acc = vshlq_s32(acc, q_shift);

    // fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
    //     tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN);
    // output[i] = fft[i];
    acc = vaddq_s32(acc, vmovl_s16(vld1_s16(&aecm->outBuf[i])));
    result = vqmovn_s32(acc);
    vst1_s16(&fft[i], result);
    vst1_s16(&output[i], result);

    // tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
    //     fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
    acc = vmull_s16(vld1_s16(&fft[PART_LEN + i]),
                    vld1_s16(&kSqrtHanningReversed[i]));
    acc = vshrq_n_s32(acc, 14);

    // tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
    //                                 outCFFT - aecm->dfaCleanQDomain);
    acc = vshlq_s32(acc, q_shift);

    // outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
    //     WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
    vst1_s16(&aecm->outBuf[i], vqmovn_s32(acc));
  }

  // Copy the current block to the old position (outBuf is shifted elsewhere).
  // 16 samples per iteration, as two 8-lane copies.
  for (i = 0; i < PART_LEN; i += 16) {
    vst1q_s16(&aecm->xBuf[i], vld1q_s16(&aecm->xBuf[i + PART_LEN]));
    vst1q_s16(&aecm->xBuf[i + 8], vld1q_s16(&aecm->xBuf[i + PART_LEN + 8]));
  }
  for (i = 0; i < PART_LEN; i += 16) {
    vst1q_s16(&aecm->dBufNoisy[i],
              vld1q_s16(&aecm->dBufNoisy[i + PART_LEN]));
    vst1q_s16(&aecm->dBufNoisy[i + 8],
              vld1q_s16(&aecm->dBufNoisy[i + PART_LEN + 8]));
  }
  if (nearendClean != NULL) {
    for (i = 0; i < PART_LEN; i += 16) {
      vst1q_s16(&aecm->dBufClean[i],
                vld1q_s16(&aecm->dBufClean[i + PART_LEN]));
      vst1q_s16(&aecm->dBufClean[i + 8],
                vld1q_s16(&aecm->dBufClean[i + PART_LEN + 8]));
    }
  }
}
189
// Computes the echo estimate for the stored channel and three linear
// energies over the PART_LEN + 1 frequency bins.
//
// aecm               State; channelStored and channelAdapt16 are read.
// far_spectrum       Far-end magnitude spectrum, PART_LEN + 1 values.
// echo_est           Output: channelStored[i] * far_spectrum[i] per bin.
// far_energy         Output: sum of far_spectrum[i].
// echo_energy_adapt  Output: sum of channelAdapt16[i] * far_spectrum[i].
// echo_energy_stored Output: sum of echo_est[i].
//
// Bins 0 .. PART_LEN - 1 are processed eight at a time; the last bin is
// handled in scalar code. All sums wrap modulo 2^32.
//
// NOTE(review): the vector path multiplies the channel values as unsigned
// 16-bit numbers (as the previous assembly did), while the scalar tail uses
// WEBRTC_SPL_MUL_16_U16 for channelStored. The two agree only if channel
// values are non-negative - confirm against the callers.
//
// The routine uses NEON intrinsics only. The earlier inline-asm version kept
// its accumulators in fixed registers (q8, q9, q14) across independent asm
// statements and named a wrong clobber (q15), none of which the compiler is
// required to honour between statements.
static void CalcLinearEnergiesNeon(AecmCore_t* aecm,
                                   const WebRtc_UWord16* far_spectrum,
                                   WebRtc_Word32* echo_est,
                                   WebRtc_UWord32* far_energy,
                                   WebRtc_UWord32* echo_energy_adapt,
                                   WebRtc_UWord32* echo_energy_stored) {
  int i;

  WebRtc_UWord32 far_energy_r;
  WebRtc_UWord32 echo_energy_stored_r;
  WebRtc_UWord32 echo_energy_adapt_r;

  uint32x4_t far_acc = vdupq_n_u32(0);
  uint32x4_t stored_acc = vdupq_n_u32(0);
  uint32x4_t adapt_acc = vdupq_n_u32(0);
  uint32x2_t pair_sum;

  for (i = 0; i < PART_LEN - 7; i += 8) {
    const uint16x8_t spectrum = vld1q_u16(&far_spectrum[i]);
    const uint16x4_t spectrum_lo = vget_low_u16(spectrum);
    const uint16x4_t spectrum_hi = vget_high_u16(spectrum);
    uint16x8_t channel;
    uint32x4_t est_lo;
    uint32x4_t est_hi;

    // far_energy += (WebRtc_UWord32)(far_spectrum[i]);
    far_acc = vaddw_u16(far_acc, spectrum_lo);
    far_acc = vaddw_u16(far_acc, spectrum_hi);

    // Get estimated echo energies for adaptive channel and stored channel.
    // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
    //                                    far_spectrum[i]);
    channel = vreinterpretq_u16_s16(vld1q_s16(&aecm->channelStored[i]));
    est_lo = vmull_u16(spectrum_lo, vget_low_u16(channel));
    est_hi = vmull_u16(spectrum_hi, vget_high_u16(channel));
    vst1q_s32(&echo_est[i], vreinterpretq_s32_u32(est_lo));
    vst1q_s32(&echo_est[i + 4], vreinterpretq_s32_u32(est_hi));

    // echo_energy_stored += (WebRtc_UWord32)echoEst[i];
    stored_acc = vaddq_u32(stored_acc, est_lo);
    stored_acc = vaddq_u32(stored_acc, est_hi);

    // echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(
    //     aecm->channelAdapt16[i], far_spectrum[i]);
    channel = vreinterpretq_u16_s16(vld1q_s16(&aecm->channelAdapt16[i]));
    adapt_acc = vmlal_u16(adapt_acc, spectrum_lo, vget_low_u16(channel));
    adapt_acc = vmlal_u16(adapt_acc, spectrum_hi, vget_high_u16(channel));
  }

  // Horizontal sums: fold the four lanes of each accumulator into one value.
  pair_sum = vadd_u32(vget_low_u32(far_acc), vget_high_u32(far_acc));
  pair_sum = vpadd_u32(pair_sum, pair_sum);
  far_energy_r = vget_lane_u32(pair_sum, 0);

  pair_sum = vadd_u32(vget_low_u32(adapt_acc), vget_high_u32(adapt_acc));
  pair_sum = vpadd_u32(pair_sum, pair_sum);
  echo_energy_adapt_r = vget_lane_u32(pair_sum, 0);

  pair_sum = vadd_u32(vget_low_u32(stored_acc), vget_high_u32(stored_acc));
  pair_sum = vpadd_u32(pair_sum, pair_sum);
  echo_energy_stored_r = vget_lane_u32(pair_sum, 0);

  // Last bin (i == PART_LEN here), handled in scalar code.
  // Get estimated echo energies for adaptive channel and stored channel.
  echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
  *echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echo_est[i];
  *far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]);
  *echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16(
      aecm->channelAdapt16[i], far_spectrum[i]);
}
253
// Copies the adaptive channel into the stored channel and recomputes the
// echo estimate from it, for all PART_LEN + 1 bins.
//
// aecm          State; channelAdapt16 is read, channelStored is written.
// far_spectrum  Far-end magnitude spectrum, PART_LEN + 1 values.
// echo_est      Output: channelStored[i] * far_spectrum[i] per bin.
//
// Bins 0 .. PART_LEN - 1 are processed eight at a time; the last bin is
// handled in scalar code.
//
// NOTE(review): the vector path multiplies the channel as unsigned 16-bit
// values (as the previous assembly did), the scalar tail uses
// WEBRTC_SPL_MUL_16_U16. They agree only for non-negative channel values -
// confirm against the callers.
//
// The routine uses NEON intrinsics only. The earlier inline-asm version
// passed values between separate asm statements through fixed registers and
// declared no memory access, so it depended on the compiler not touching
// those registers or reordering the accesses.
static void StoreAdaptiveChannelNeon(AecmCore_t* aecm,
                                     const WebRtc_UWord16* far_spectrum,
                                     WebRtc_Word32* echo_est) {
  int i;

  // During startup we store the channel every block.
  // Recalculate echo estimate.
  for (i = 0; i < PART_LEN - 7; i += 8) {
    const uint16x8_t spectrum = vld1q_u16(&far_spectrum[i]);
    const int16x8_t channel = vld1q_s16(&aecm->channelAdapt16[i]);
    const uint16x8_t channel_u = vreinterpretq_u16_s16(channel);
    uint32x4_t est_lo;
    uint32x4_t est_hi;

    // aecm->channelStored[i] = aecm->channelAdapt16[i];
    vst1q_s16(&aecm->channelStored[i], channel);

    // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
    //                                     far_spectrum[i]);
    est_lo = vmull_u16(vget_low_u16(spectrum), vget_low_u16(channel_u));
    est_hi = vmull_u16(vget_high_u16(spectrum), vget_high_u16(channel_u));
    vst1q_s32(&echo_est[i], vreinterpretq_s32_u32(est_lo));
    vst1q_s32(&echo_est[i + 4], vreinterpretq_s32_u32(est_hi));
  }

  // Last bin (i == PART_LEN here).
  aecm->channelStored[i] = aecm->channelAdapt16[i];
  echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
}
275
// Resets the adaptive channel to the stored channel for all PART_LEN + 1
// bins: channelAdapt16 receives a copy of channelStored, and channelAdapt32
// receives the same values shifted left by 16 bits.
//
// aecm  State; channelStored is read, channelAdapt16 and channelAdapt32 are
//       written.
//
// Bins 0 .. PART_LEN - 1 are processed eight at a time; the last bin is
// handled in scalar code.
//
// The routine uses NEON intrinsics only. The earlier inline-asm version
// passed values between separate asm statements through fixed registers and
// declared no memory access, so it depended on the compiler not touching
// those registers or reordering the accesses.
static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) {
  int i;

  for (i = 0; i < PART_LEN - 7; i += 8) {
    const int16x8_t channel = vld1q_s16(&aecm->channelStored[i]);
    int32x4_t wide_lo;
    int32x4_t wide_hi;

    // aecm->channelAdapt16[i] = aecm->channelStored[i];
    vst1q_s16(&aecm->channelAdapt16[i], channel);

    // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)
    //                           aecm->channelStored[i], 16);
    // Sign-extend to 32 bits first, then shift within the 32-bit lanes.
    wide_lo = vshlq_n_s32(vmovl_s16(vget_low_s16(channel)), 16);
    wide_hi = vshlq_n_s32(vmovl_s16(vget_high_s16(channel)), 16);
    vst1q_s32(&aecm->channelAdapt32[i], wide_lo);
    vst1q_s32(&aecm->channelAdapt32[i + 4], wide_hi);
  }

  // Last bin (i == PART_LEN here).
  aecm->channelAdapt16[i] = aecm->channelStored[i];
  aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
      (WebRtc_Word32)aecm->channelStored[i], 16);
}
296
WebRtcAecm_InitNeon(void)297 void WebRtcAecm_InitNeon(void) {
298 WebRtcAecm_WindowAndFFT = WindowAndFFTNeon;
299 WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowNeon;
300 WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesNeon;
301 WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelNeon;
302 WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelNeon;
303 }
304