1 /*
2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "nsx_core.h"
12
13 #include <arm_neon.h>
14 #include <assert.h>
15
16 // Update the noise estimation information.
UpdateNoiseEstimateNeon(NsxInst_t * inst,int offset)17 static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset) {
18 int i = 0;
19 const int16_t kExp2Const = 11819; // Q13
20 int16_t* ptr_noiseEstLogQuantile = NULL;
21 int16_t* ptr_noiseEstQuantile = NULL;
22 int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const);
23 int32x4_t twentyOne32x4 = vdupq_n_s32(21);
24 int32x4_t constA32x4 = vdupq_n_s32(0x1fffff);
25 int32x4_t constB32x4 = vdupq_n_s32(0x200000);
26
27 int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
28 inst->magnLen);
29
30 // Guarantee a Q-domain as high as possible and still fit in int16
31 inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const,
32 tmp16,
33 21);
34
35 int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise);
36
37 for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset],
38 ptr_noiseEstQuantile = &inst->noiseEstQuantile[0];
39 ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3];
40 ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) {
41
42 // tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
43 // inst->noiseEstLogQuantile[offset + i]);
44 int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile);
45 int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4);
46
47 // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
48 int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4);
49 v32x4A = vorrq_s32(v32x4A, constB32x4);
50
51 // tmp16 = (int16_t) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
52 v32x4B = vshrq_n_s32(v32x4B, 21);
53
54 // tmp16 -= 21;// shift 21 to get result in Q0
55 v32x4B = vsubq_s32(v32x4B, twentyOne32x4);
56
57 // tmp16 += (int16_t) inst->qNoise;
58 // shift to get result in Q(qNoise)
59 v32x4B = vaddq_s32(v32x4B, qNoise32x4);
60
61 // if (tmp16 < 0) {
62 // tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
63 // } else {
64 // tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
65 // }
66 v32x4B = vshlq_s32(v32x4A, v32x4B);
67
68 // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1);
69 v16x4 = vqmovn_s32(v32x4B);
70
71 //inst->noiseEstQuantile[i] = tmp16;
72 vst1_s16(ptr_noiseEstQuantile, v16x4);
73 }
74
75 // Last iteration:
76
77 // inst->quantile[i]=exp(inst->lquantile[offset+i]);
78 // in Q21
79 int32_t tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const,
80 *ptr_noiseEstLogQuantile);
81 int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
82
83 tmp16 = (int16_t) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21);
84 tmp16 -= 21;// shift 21 to get result in Q0
85 tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise)
86 if (tmp16 < 0) {
87 tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16);
88 } else {
89 tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16);
90 }
91 *ptr_noiseEstQuantile = WebRtcSpl_SatW32ToW16(tmp32no1);
92 }
93
94 // Noise Estimation
NoiseEstimationNeon(NsxInst_t * inst,uint16_t * magn,uint32_t * noise,int16_t * q_noise)95 static void NoiseEstimationNeon(NsxInst_t* inst,
96 uint16_t* magn,
97 uint32_t* noise,
98 int16_t* q_noise) {
99 int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
100 int16_t countProd, delta, zeros, frac;
101 int16_t log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
102 const int16_t log2_const = 22713;
103 const int16_t width_factor = 21845;
104
105 int i, s, offset;
106
107 tabind = inst->stages - inst->normData;
108 assert(tabind < 9);
109 assert(tabind > -9);
110 if (tabind < 0) {
111 logval = -WebRtcNsx_kLogTable[-tabind];
112 } else {
113 logval = WebRtcNsx_kLogTable[tabind];
114 }
115
116 int16x8_t logval_16x8 = vdupq_n_s16(logval);
117
118 // lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
119 // magn is in Q(-stages), and the real lmagn values are:
120 // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
121 // lmagn in Q8
122 for (i = 0; i < inst->magnLen; i++) {
123 if (magn[i]) {
124 zeros = WebRtcSpl_NormU32((uint32_t)magn[i]);
125 frac = (int16_t)((((uint32_t)magn[i] << zeros)
126 & 0x7FFFFFFF) >> 23);
127 assert(frac < 256);
128 // log2(magn(i))
129 log2 = (int16_t)(((31 - zeros) << 8)
130 + WebRtcNsx_kLogTableFrac[frac]);
131 // log2(magn(i))*log(2)
132 lmagn[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
133 // + log(2^stages)
134 lmagn[i] += logval;
135 } else {
136 lmagn[i] = logval;
137 }
138 }
139
140 int16x4_t Q3_16x4 = vdup_n_s16(3);
141 int16x8_t WIDTHQ8_16x8 = vdupq_n_s16(WIDTH_Q8);
142 int16x8_t WIDTHFACTOR_16x8 = vdupq_n_s16(width_factor);
143
144 int16_t factor = FACTOR_Q7;
145 if (inst->blockIndex < END_STARTUP_LONG)
146 factor = FACTOR_Q7_STARTUP;
147
148 // Loop over simultaneous estimates
149 for (s = 0; s < SIMULT; s++) {
150 offset = s * inst->magnLen;
151
152 // Get counter values from state
153 counter = inst->noiseEstCounter[s];
154 assert(counter < 201);
155 countDiv = WebRtcNsx_kCounterDiv[counter];
156 countProd = (int16_t)WEBRTC_SPL_MUL_16_16(counter, countDiv);
157
158 // quant_est(...)
159 int16_t deltaBuff[8];
160 int16x4_t tmp16x4_0;
161 int16x4_t tmp16x4_1;
162 int16x4_t countDiv_16x4 = vdup_n_s16(countDiv);
163 int16x8_t countProd_16x8 = vdupq_n_s16(countProd);
164 int16x8_t tmp16x8_0 = vdupq_n_s16(countDiv);
165 int16x8_t prod16x8 = vqrdmulhq_s16(WIDTHFACTOR_16x8, tmp16x8_0);
166 int16x8_t tmp16x8_1;
167 int16x8_t tmp16x8_2;
168 int16x8_t tmp16x8_3;
169 int16x8_t tmp16x8_4;
170 int16x8_t tmp16x8_5;
171 int32x4_t tmp32x4;
172
173 for (i = 0; i < inst->magnLen - 7; i += 8) {
174 // Compute delta.
175 // Smaller step size during startup. This prevents from using
176 // unrealistic values causing overflow.
177 tmp16x8_0 = vdupq_n_s16(factor);
178 vst1q_s16(deltaBuff, tmp16x8_0);
179
180 int j;
181 for (j = 0; j < 8; j++) {
182 if (inst->noiseEstDensity[offset + i + j] > 512) {
183 // Get values for deltaBuff by shifting intead of dividing.
184 int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i + j]);
185 deltaBuff[j] = (int16_t)(FACTOR_Q16 >> (14 - factor));
186 }
187 }
188
189 // Update log quantile estimate
190
191 // tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
192 tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[0]), countDiv_16x4);
193 tmp16x4_1 = vshrn_n_s32(tmp32x4, 14);
194 tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[4]), countDiv_16x4);
195 tmp16x4_0 = vshrn_n_s32(tmp32x4, 14);
196 tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // Keep for several lines.
197
198 // prepare for the "if" branch
199 // tmp16 += 2;
200 // tmp16_1 = (Word16)(tmp16>>2);
201 tmp16x8_1 = vrshrq_n_s16(tmp16x8_0, 2);
202
203 // inst->noiseEstLogQuantile[offset+i] + tmp16_1;
204 tmp16x8_2 = vld1q_s16(&inst->noiseEstLogQuantile[offset + i]); // Keep
205 tmp16x8_1 = vaddq_s16(tmp16x8_2, tmp16x8_1); // Keep for several lines
206
207 // Prepare for the "else" branch
208 // tmp16 += 1;
209 // tmp16_1 = (Word16)(tmp16>>1);
210 tmp16x8_0 = vrshrq_n_s16(tmp16x8_0, 1);
211
212 // tmp16_2 = (Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16_1,3,1);
213 tmp32x4 = vmull_s16(vget_low_s16(tmp16x8_0), Q3_16x4);
214 tmp16x4_1 = vshrn_n_s32(tmp32x4, 1);
215
216 // tmp16_2 = (Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16_1,3,1);
217 tmp32x4 = vmull_s16(vget_high_s16(tmp16x8_0), Q3_16x4);
218 tmp16x4_0 = vshrn_n_s32(tmp32x4, 1);
219
220 // inst->noiseEstLogQuantile[offset + i] - tmp16_2;
221 tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // keep
222 tmp16x8_0 = vsubq_s16(tmp16x8_2, tmp16x8_0);
223
224 // logval is the smallest fixed point representation we can have. Values
225 // below that will correspond to values in the interval [0, 1], which
226 // can't possibly occur.
227 tmp16x8_0 = vmaxq_s16(tmp16x8_0, logval_16x8);
228
229 // Do the if-else branches:
230 tmp16x8_3 = vld1q_s16(&lmagn[i]); // keep for several lines
231 tmp16x8_5 = vsubq_s16(tmp16x8_3, tmp16x8_2);
232 __asm__("vcgt.s16 %q0, %q1, #0"::"w"(tmp16x8_4), "w"(tmp16x8_5));
233 __asm__("vbit %q0, %q1, %q2"::
234 "w"(tmp16x8_2), "w"(tmp16x8_1), "w"(tmp16x8_4));
235 __asm__("vbif %q0, %q1, %q2"::
236 "w"(tmp16x8_2), "w"(tmp16x8_0), "w"(tmp16x8_4));
237 vst1q_s16(&inst->noiseEstLogQuantile[offset + i], tmp16x8_2);
238
239 // Update density estimate
240 // tmp16_1 + tmp16_2
241 tmp16x8_1 = vld1q_s16(&inst->noiseEstDensity[offset + i]);
242 tmp16x8_0 = vqrdmulhq_s16(tmp16x8_1, countProd_16x8);
243 tmp16x8_0 = vaddq_s16(tmp16x8_0, prod16x8);
244
245 // lmagn[i] - inst->noiseEstLogQuantile[offset + i]
246 tmp16x8_3 = vsubq_s16(tmp16x8_3, tmp16x8_2);
247 tmp16x8_3 = vabsq_s16(tmp16x8_3);
248 tmp16x8_4 = vcgtq_s16(WIDTHQ8_16x8, tmp16x8_3);
249 __asm__("vbit %q0, %q1, %q2"::
250 "w"(tmp16x8_1), "w"(tmp16x8_0), "w"(tmp16x8_4));
251 vst1q_s16(&inst->noiseEstDensity[offset + i], tmp16x8_1);
252 } // End loop over magnitude spectrum
253
254 // Last iteration over magnitude spectrum:
255 // compute delta
256 if (inst->noiseEstDensity[offset + i] > 512) {
257 // Get values for deltaBuff by shifting intead of dividing.
258 int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i]);
259 delta = (int16_t)(FACTOR_Q16 >> (14 - factor));
260 } else {
261 delta = FACTOR_Q7;
262 if (inst->blockIndex < END_STARTUP_LONG) {
263 // Smaller step size during startup. This prevents from using
264 // unrealistic values causing overflow.
265 delta = FACTOR_Q7_STARTUP;
266 }
267 }
268 // update log quantile estimate
269 tmp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14);
270 if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
271 // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
272 // CounterDiv=1/(inst->counter[s]+1) in Q15
273 tmp16 += 2;
274 tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2);
275 inst->noiseEstLogQuantile[offset + i] += tmp16no1;
276 } else {
277 tmp16 += 1;
278 tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1);
279 // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
280 tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1);
281 inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
282 if (inst->noiseEstLogQuantile[offset + i] < logval) {
283 // logval is the smallest fixed point representation we can have.
284 // Values below that will correspond to values in the interval
285 // [0, 1], which can't possibly occur.
286 inst->noiseEstLogQuantile[offset + i] = logval;
287 }
288 }
289
290 // update density estimate
291 if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
292 < WIDTH_Q8) {
293 tmp16no1 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
294 inst->noiseEstDensity[offset + i], countProd, 15);
295 tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
296 width_factor, countDiv, 15);
297 inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
298 }
299
300
301 if (counter >= END_STARTUP_LONG) {
302 inst->noiseEstCounter[s] = 0;
303 if (inst->blockIndex >= END_STARTUP_LONG) {
304 UpdateNoiseEstimateNeon(inst, offset);
305 }
306 }
307 inst->noiseEstCounter[s]++;
308
309 } // end loop over simultaneous estimates
310
311 // Sequentially update the noise during startup
312 if (inst->blockIndex < END_STARTUP_LONG) {
313 UpdateNoiseEstimateNeon(inst, offset);
314 }
315
316 for (i = 0; i < inst->magnLen; i++) {
317 noise[i] = (uint32_t)(inst->noiseEstQuantile[i]); // Q(qNoise)
318 }
319 (*q_noise) = (int16_t)inst->qNoise;
320 }
321
322 // Filter the data in the frequency domain, and create spectrum.
PrepareSpectrumNeon(NsxInst_t * inst,int16_t * freq_buf)323 static void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) {
324
325 // (1) Filtering.
326
327 // Fixed point C code for the next block is as follows:
328 // for (i = 0; i < inst->magnLen; i++) {
329 // inst->real[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
330 // (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
331 // inst->imag[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i],
332 // (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
333 // }
334
335 int16_t* ptr_real = &inst->real[0];
336 int16_t* ptr_imag = &inst->imag[0];
337 uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0];
338
339 // Filter the rest in the frequency domain.
340 for (; ptr_real < &inst->real[inst->magnLen - 1];) {
341 // Loop unrolled once. Both pointers are incremented by 4 twice.
342 __asm__ __volatile__(
343 "vld1.16 d20, [%[ptr_real]]\n\t"
344 "vld1.16 d22, [%[ptr_imag]]\n\t"
345 "vld1.16 d23, [%[ptr_noiseSupFilter]]!\n\t"
346 "vmull.s16 q10, d20, d23\n\t"
347 "vmull.s16 q11, d22, d23\n\t"
348 "vshrn.s32 d20, q10, #14\n\t"
349 "vshrn.s32 d22, q11, #14\n\t"
350 "vst1.16 d20, [%[ptr_real]]!\n\t"
351 "vst1.16 d22, [%[ptr_imag]]!\n\t"
352
353 "vld1.16 d18, [%[ptr_real]]\n\t"
354 "vld1.16 d24, [%[ptr_imag]]\n\t"
355 "vld1.16 d25, [%[ptr_noiseSupFilter]]!\n\t"
356 "vmull.s16 q9, d18, d25\n\t"
357 "vmull.s16 q12, d24, d25\n\t"
358 "vshrn.s32 d18, q9, #14\n\t"
359 "vshrn.s32 d24, q12, #14\n\t"
360 "vst1.16 d18, [%[ptr_real]]!\n\t"
361 "vst1.16 d24, [%[ptr_imag]]!\n\t"
362
363 // Specify constraints.
364 :[ptr_imag]"+r"(ptr_imag),
365 [ptr_real]"+r"(ptr_real),
366 [ptr_noiseSupFilter]"+r"(ptr_noiseSupFilter)
367 :
368 :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
369 "q9", "q10", "q11", "q12"
370 );
371 }
372
373 // Filter the last pair of elements in the frequency domain.
374 *ptr_real = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_real,
375 (int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages)
376 *ptr_imag = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_imag,
377 (int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages)
378
379 // (2) Create spectrum.
380
381 // Fixed point C code for the rest of the function is as follows:
382 // freq_buf[0] = inst->real[0];
383 // freq_buf[1] = -inst->imag[0];
384 // for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
385 // tmp16 = (inst->anaLen << 1) - j;
386 // freq_buf[j] = inst->real[i];
387 // freq_buf[j + 1] = -inst->imag[i];
388 // freq_buf[tmp16] = inst->real[i];
389 // freq_buf[tmp16 + 1] = inst->imag[i];
390 // }
391 // freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
392 // freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
393
394 freq_buf[0] = inst->real[0];
395 freq_buf[1] = -inst->imag[0];
396
397 int offset = -16;
398 int16_t* ptr_realImag1 = &freq_buf[2];
399 int16_t* ptr_realImag2 = ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8];
400 ptr_real = &inst->real[1];
401 ptr_imag = &inst->imag[1];
402 for (; ptr_real < &inst->real[inst->anaLen2 - 11];) {
403 // Loop unrolled once. All pointers are incremented twice.
404 __asm__ __volatile__(
405 "vld1.16 d22, [%[ptr_real]]!\n\t"
406 "vld1.16 d23, [%[ptr_imag]]!\n\t"
407 // Negate and interleave:
408 "vmov.s16 d20, d22\n\t"
409 "vneg.s16 d21, d23\n\t"
410 "vzip.16 d20, d21\n\t"
411 // Write 8 elements to &freq_buf[j]
412 "vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t"
413 // Interleave and reverse elements:
414 "vzip.16 d22, d23\n\t"
415 "vrev64.32 d18, d23\n\t"
416 "vrev64.32 d19, d22\n\t"
417 // Write 8 elements to &freq_buf[tmp16]
418 "vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t"
419
420 "vld1.16 d22, [%[ptr_real]]!\n\t"
421 "vld1.16 d23, [%[ptr_imag]]!\n\t"
422 // Negate and interleave:
423 "vmov.s16 d20, d22\n\t"
424 "vneg.s16 d21, d23\n\t"
425 "vzip.16 d20, d21\n\t"
426 // Write 8 elements to &freq_buf[j]
427 "vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t"
428 // Interleave and reverse elements:
429 "vzip.16 d22, d23\n\t"
430 "vrev64.32 d18, d23\n\t"
431 "vrev64.32 d19, d22\n\t"
432 // Write 8 elements to &freq_buf[tmp16]
433 "vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t"
434
435 // Specify constraints.
436 :[ptr_imag]"+r"(ptr_imag),
437 [ptr_real]"+r"(ptr_real),
438 [ptr_realImag1]"+r"(ptr_realImag1),
439 [ptr_realImag2]"+r"(ptr_realImag2)
440 :[offset]"r"(offset)
441 :"d18", "d19", "d20", "d21", "d22", "d23"
442 );
443 }
444 for (ptr_realImag2 += 6;
445 ptr_real <= &inst->real[inst->anaLen2];
446 ptr_real += 1, ptr_imag += 1, ptr_realImag1 += 2, ptr_realImag2 -= 2) {
447 *ptr_realImag1 = *ptr_real;
448 *(ptr_realImag1 + 1) = -(*ptr_imag);
449 *ptr_realImag2 = *ptr_real;
450 *(ptr_realImag2 + 1) = *ptr_imag;
451 }
452
453 freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
454 freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
455 }
456
457 // Denormalize the input buffer.
DenormalizeNeon(NsxInst_t * inst,int16_t * in,int factor)458 static __inline void DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor) {
459 int16_t* ptr_real = &inst->real[0];
460 int16_t* ptr_in = &in[0];
461
462 __asm__ __volatile__("vdup.32 q10, %0" ::
463 "r"((int32_t)(factor - inst->normData)) : "q10");
464 for (; ptr_real < &inst->real[inst->anaLen];) {
465
466 // Loop unrolled once. Both pointers are incremented.
467 __asm__ __volatile__(
468 // tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j],
469 // factor - inst->normData);
470 "vld2.16 {d24, d25}, [%[ptr_in]]!\n\t"
471 "vmovl.s16 q12, d24\n\t"
472 "vshl.s32 q12, q10\n\t"
473 // inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
474 "vqmovn.s32 d24, q12\n\t"
475 "vst1.16 d24, [%[ptr_real]]!\n\t"
476
477 // tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j],
478 // factor - inst->normData);
479 "vld2.16 {d22, d23}, [%[ptr_in]]!\n\t"
480 "vmovl.s16 q11, d22\n\t"
481 "vshl.s32 q11, q10\n\t"
482 // inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
483 "vqmovn.s32 d22, q11\n\t"
484 "vst1.16 d22, [%[ptr_real]]!\n\t"
485
486 // Specify constraints.
487 :[ptr_in]"+r"(ptr_in),
488 [ptr_real]"+r"(ptr_real)
489 :
490 :"d22", "d23", "d24", "d25"
491 );
492 }
493 }
494
495 // For the noise supress process, synthesis, read out fully processed segment,
496 // and update synthesis buffer.
SynthesisUpdateNeon(NsxInst_t * inst,int16_t * out_frame,int16_t gain_factor)497 static void SynthesisUpdateNeon(NsxInst_t* inst,
498 int16_t* out_frame,
499 int16_t gain_factor) {
500 int16_t* ptr_real = &inst->real[0];
501 int16_t* ptr_syn = &inst->synthesisBuffer[0];
502 int16_t* ptr_window = &inst->window[0];
503
504 // synthesis
505 __asm__ __volatile__("vdup.16 d24, %0" : : "r"(gain_factor) : "d24");
506 // Loop unrolled once. All pointers are incremented in the assembly code.
507 for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) {
508 __asm__ __volatile__(
509 // Load variables.
510 "vld1.16 d22, [%[ptr_real]]!\n\t"
511 "vld1.16 d23, [%[ptr_window]]!\n\t"
512 "vld1.16 d25, [%[ptr_syn]]\n\t"
513 // tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
514 // inst->window[i], inst->real[i], 14); // Q0, window in Q14
515 "vmull.s16 q11, d22, d23\n\t"
516 "vrshrn.i32 d22, q11, #14\n\t"
517 // tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13);
518 "vmull.s16 q11, d24, d22\n\t"
519 // tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
520 "vqrshrn.s32 d22, q11, #13\n\t"
521 // inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(
522 // inst->synthesisBuffer[i], tmp16b); // Q0
523 "vqadd.s16 d25, d22\n\t"
524 "vst1.16 d25, [%[ptr_syn]]!\n\t"
525
526 // Load variables.
527 "vld1.16 d26, [%[ptr_real]]!\n\t"
528 "vld1.16 d27, [%[ptr_window]]!\n\t"
529 "vld1.16 d28, [%[ptr_syn]]\n\t"
530 // tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
531 // inst->window[i], inst->real[i], 14); // Q0, window in Q14
532 "vmull.s16 q13, d26, d27\n\t"
533 "vrshrn.i32 d26, q13, #14\n\t"
534 // tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13);
535 "vmull.s16 q13, d24, d26\n\t"
536 // tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
537 "vqrshrn.s32 d26, q13, #13\n\t"
538 // inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(
539 // inst->synthesisBuffer[i], tmp16b); // Q0
540 "vqadd.s16 d28, d26\n\t"
541 "vst1.16 d28, [%[ptr_syn]]!\n\t"
542
543 // Specify constraints.
544 :[ptr_real]"+r"(ptr_real),
545 [ptr_window]"+r"(ptr_window),
546 [ptr_syn]"+r"(ptr_syn)
547 :
548 :"d22", "d23", "d24", "d25", "d26", "d27", "d28", "q11", "q12", "q13"
549 );
550 }
551
552 int16_t* ptr_out = &out_frame[0];
553 ptr_syn = &inst->synthesisBuffer[0];
554 // read out fully processed segment
555 for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms];) {
556 // Loop unrolled once. Both pointers are incremented in the assembly code.
557 __asm__ __volatile__(
558 // out_frame[i] = inst->synthesisBuffer[i]; // Q0
559 "vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t"
560 "vld1.16 {d24, d25}, [%[ptr_syn]]!\n\t"
561 "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
562 "vst1.16 {d24, d25}, [%[ptr_out]]!\n\t"
563 :[ptr_syn]"+r"(ptr_syn),
564 [ptr_out]"+r"(ptr_out)
565 :
566 :"d22", "d23", "d24", "d25"
567 );
568 }
569
570 // Update synthesis buffer.
571 // C code:
572 // WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
573 // inst->synthesisBuffer + inst->blockLen10ms,
574 // inst->anaLen - inst->blockLen10ms);
575 ptr_out = &inst->synthesisBuffer[0],
576 ptr_syn = &inst->synthesisBuffer[inst->blockLen10ms];
577 for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) {
578 // Loop unrolled once. Both pointers are incremented in the assembly code.
579 __asm__ __volatile__(
580 "vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t"
581 "vld1.16 {d24, d25}, [%[ptr_syn]]!\n\t"
582 "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
583 "vst1.16 {d24, d25}, [%[ptr_out]]!\n\t"
584 :[ptr_syn]"+r"(ptr_syn),
585 [ptr_out]"+r"(ptr_out)
586 :
587 :"d22", "d23", "d24", "d25"
588 );
589 }
590
591 // C code:
592 // WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
593 // + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
594 __asm__ __volatile__("vdup.16 q10, %0" : : "r"(0) : "q10");
595 for (; ptr_out < &inst->synthesisBuffer[inst->anaLen];) {
596 // Loop unrolled once. Pointer is incremented in the assembly code.
597 __asm__ __volatile__(
598 "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
599 "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
600 :[ptr_out]"+r"(ptr_out)
601 :
602 :"d20", "d21"
603 );
604 }
605 }
606
607 // Update analysis buffer for lower band, and window data before FFT.
AnalysisUpdateNeon(NsxInst_t * inst,int16_t * out,int16_t * new_speech)608 static void AnalysisUpdateNeon(NsxInst_t* inst,
609 int16_t* out,
610 int16_t* new_speech) {
611
612 int16_t* ptr_ana = &inst->analysisBuffer[inst->blockLen10ms];
613 int16_t* ptr_out = &inst->analysisBuffer[0];
614
615 // For lower band update analysis buffer.
616 // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
617 // inst->analysisBuffer + inst->blockLen10ms,
618 // inst->anaLen - inst->blockLen10ms);
619 for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms];) {
620 // Loop unrolled once, so both pointers are incremented by 8 twice.
621 __asm__ __volatile__(
622 "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
623 "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
624 "vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t"
625 "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
626 :[ptr_ana]"+r"(ptr_ana),
627 [ptr_out]"+r"(ptr_out)
628 :
629 :"d20", "d21", "d22", "d23"
630 );
631 }
632
633 // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
634 // + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
635 for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen];) {
636 // Loop unrolled once, so both pointers are incremented by 8 twice.
637 __asm__ __volatile__(
638 "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
639 "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t"
640 "vld1.16 {d22, d23}, [%[ptr_ana]]!\n\t"
641 "vst1.16 {d22, d23}, [%[ptr_out]]!\n\t"
642 :[ptr_ana]"+r"(ptr_ana),
643 [ptr_out]"+r"(ptr_out)
644 :
645 :"d20", "d21", "d22", "d23"
646 );
647 }
648
649 // Window data before FFT
650 int16_t* ptr_window = &inst->window[0];
651 ptr_out = &out[0];
652 ptr_ana = &inst->analysisBuffer[0];
653 for (; ptr_out < &out[inst->anaLen];) {
654
655 // Loop unrolled once, so all pointers are incremented by 4 twice.
656 __asm__ __volatile__(
657 "vld1.16 d20, [%[ptr_ana]]!\n\t"
658 "vld1.16 d21, [%[ptr_window]]!\n\t"
659 // out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
660 // inst->window[i], inst->analysisBuffer[i], 14); // Q0
661 "vmull.s16 q10, d20, d21\n\t"
662 "vrshrn.i32 d20, q10, #14\n\t"
663 "vst1.16 d20, [%[ptr_out]]!\n\t"
664
665 "vld1.16 d22, [%[ptr_ana]]!\n\t"
666 "vld1.16 d23, [%[ptr_window]]!\n\t"
667 // out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
668 // inst->window[i], inst->analysisBuffer[i], 14); // Q0
669 "vmull.s16 q11, d22, d23\n\t"
670 "vrshrn.i32 d22, q11, #14\n\t"
671 "vst1.16 d22, [%[ptr_out]]!\n\t"
672
673 // Specify constraints.
674 :[ptr_ana]"+r"(ptr_ana),
675 [ptr_window]"+r"(ptr_window),
676 [ptr_out]"+r"(ptr_out)
677 :
678 :"d20", "d21", "d22", "d23", "q10", "q11"
679 );
680 }
681 }
682
683 // Create a complex number buffer (out[]) as the intput (in[]) interleaved with
684 // zeros, and normalize it.
CreateComplexBufferNeon(NsxInst_t * inst,int16_t * in,int16_t * out)685 static __inline void CreateComplexBufferNeon(NsxInst_t* inst,
686 int16_t* in,
687 int16_t* out) {
688 int16_t* ptr_out = &out[0];
689 int16_t* ptr_in = &in[0];
690
691 __asm__ __volatile__("vdup.16 d25, %0" : : "r"(0) : "d25");
692 __asm__ __volatile__("vdup.16 q10, %0" : : "r"(inst->normData) : "q10");
693 for (; ptr_in < &in[inst->anaLen];) {
694
695 // Loop unrolled once, so ptr_in is incremented by 8 twice,
696 // and ptr_out is incremented by 8 four times.
697 __asm__ __volatile__(
698 // out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
699 "vld1.16 {d22, d23}, [%[ptr_in]]!\n\t"
700 "vshl.s16 q11, q10\n\t"
701 "vmov d24, d23\n\t"
702
703 // out[j + 1] = 0; // Insert zeros in imaginary part
704 "vmov d23, d25\n\t"
705 "vst2.16 {d22, d23}, [%[ptr_out]]!\n\t"
706 "vst2.16 {d24, d25}, [%[ptr_out]]!\n\t"
707
708 // out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
709 "vld1.16 {d22, d23}, [%[ptr_in]]!\n\t"
710 "vshl.s16 q11, q10\n\t"
711 "vmov d24, d23\n\t"
712
713 // out[j + 1] = 0; // Insert zeros in imaginary part
714 "vmov d23, d25\n\t"
715 "vst2.16 {d22, d23}, [%[ptr_out]]!\n\t"
716 "vst2.16 {d24, d25}, [%[ptr_out]]!\n\t"
717
718 // Specify constraints.
719 :[ptr_in]"+r"(ptr_in),
720 [ptr_out]"+r"(ptr_out)
721 :
722 :"d22", "d23", "d24", "d25", "q10", "q11"
723 );
724 }
725 }
726
WebRtcNsx_InitNeon(void)727 void WebRtcNsx_InitNeon(void) {
728 WebRtcNsx_NoiseEstimation = NoiseEstimationNeon;
729 WebRtcNsx_PrepareSpectrum = PrepareSpectrumNeon;
730 WebRtcNsx_SynthesisUpdate = SynthesisUpdateNeon;
731 WebRtcNsx_AnalysisUpdate = AnalysisUpdateNeon;
732 WebRtcNsx_Denormalize = DenormalizeNeon;
733 WebRtcNsx_CreateComplexBuffer = CreateComplexBufferNeon;
734 }
735