1 /*
2  * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
3  * Copyright Takuya OOURA, 1996-2001
4  *
5  * You may use, copy, modify and distribute this code for any purpose (include
6  * commercial use) and without fee. Please refer to this package when you modify
7  * this code.
8  *
9  * Changes by the WebRTC authors:
10  *    - Trivial type modifications.
11  *    - Minimal code subset to do rdft of length 128.
12  *    - Optimizations because of known length.
13  *
14  *  All changes are covered by the WebRTC license and IP grant:
15  *  Use of this source code is governed by a BSD-style license
16  *  that can be found in the LICENSE file in the root of the source
17  *  tree. An additional intellectual property rights grant can be found
18  *  in the file PATENTS.  All contributing project authors may
19  *  be found in the AUTHORS file in the root of the source tree.
20  */
21 
22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h"
23 
24 #include <math.h>
25 
26 #include "webrtc/system_wrappers/include/cpu_features_wrapper.h"
27 #include "webrtc/typedefs.h"
28 
// These tables used to be computed at run-time. For example, refer to:
// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/aec/aec_rdft.c?r=6564
// to see the initialization code.
//
// rdft_w holds the twiddle factors for the length-128 transform.
//  - Entries [0, 32) are read two at a time as (real, imaginary) pairs by
//    cft1st_128_C and cftmdl_128_C (rdft_w[k + 0], rdft_w[k + 1]).
//  - Entries [32, 64) are read through the pointer `rdft_w + 32` by
//    rftfsub_128_C and rftbsub_128_C.
const float rdft_w[64] = {
    1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f,
    0.9238795638f, 0.3826834559f, 0.3826834559f, 0.9238795638f,
    0.9807852507f, 0.1950903237f, 0.5555702448f, 0.8314695954f,
    0.8314695954f, 0.5555702448f, 0.1950903237f, 0.9807852507f,
    0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f,
    0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f,
    0.9569403529f, 0.2902846634f, 0.4713967443f, 0.8819212914f,
    0.7730104327f, 0.6343933344f, 0.0980171412f, 0.9951847196f,
    0.7071067691f, 0.4993977249f, 0.4975923598f, 0.4945882559f,
    0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f,
    0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f,
    0.4157347977f, 0.4016037583f, 0.3865052164f, 0.3704755902f,
    0.3535533845f, 0.3357794881f, 0.3171966672f, 0.2978496552f,
    0.2777851224f, 0.2570513785f, 0.2356983721f, 0.2137775421f,
    0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f,
    0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f,
};
// Third twiddle factor, stored as (real, imaginary) pairs. cft1st_128_C and
// cftmdl_128_C read rdft_wk3ri_first[k1 + 0] / [k1 + 1] as wk3r / wk3i for the
// first half of each loop iteration (k1 = 2, 4, ...).
const float rdft_wk3ri_first[16] = {
    1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f,
    0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f,
    0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f,
    0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f,
};
// Companion of rdft_wk3ri_first with the same (real, imaginary) pair layout.
// cft1st_128_C and cftmdl_128_C read rdft_wk3ri_second[k1 + 0] / [k1 + 1] as
// wk3r / wk3i for the second half of each loop iteration.
const float rdft_wk3ri_second[16] = {
    -0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f,
    -0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f,
    -0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f,
    -0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f,
};
62 ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
63     1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f,
64     0.923879564f, 0.923879564f, 0.382683456f, 0.382683456f,
65     0.980785251f, 0.980785251f, 0.555570245f, 0.555570245f,
66     0.831469595f, 0.831469595f, 0.195090324f, 0.195090324f,
67     0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
68     0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f,
69     0.956940353f, 0.956940353f, 0.471396744f, 0.471396744f,
70     0.773010433f, 0.773010433f, 0.098017141f, 0.098017141f,
71 };
72 ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
73     1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f,
74     0.707106769f, 0.707106769f, -0.707106769f, -0.707106769f,
75     0.923879564f, 0.923879564f, -0.382683456f, -0.382683456f,
76     0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
77     0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f,
78     0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f,
79     0.831469595f, 0.831469595f, -0.555570245f, -0.555570245f,
80     0.195090324f, 0.195090324f, -0.980785251f, -0.980785251f,
81 };
82 ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
83     1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f,
84     0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
85     0.831469536f, 0.831469536f, -0.980785251f, -0.980785251f,
86     -0.195090353f, -0.195090353f, -0.555570245f, -0.555570245f,
87     0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f,
88     0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f,
89     0.634393334f, 0.634393334f, -0.995184720f, -0.995184720f,
90     -0.471396863f, -0.471396863f, -0.290284693f, -0.290284693f,
91 };
92 ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
93     -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
94     -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
95     -0.195090324f, 0.195090324f, -0.831469595f, 0.831469595f,
96     -0.555570245f, 0.555570245f, -0.980785251f, 0.980785251f,
97     -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f,
98     -0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f,
99     -0.290284663f, 0.290284663f, -0.881921291f, 0.881921291f,
100     -0.634393334f, 0.634393334f, -0.995184720f, 0.995184720f,
101 };
102 ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
103     -0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f,
104     -0.707106769f, 0.707106769f, -0.707106769f, 0.707106769f,
105     -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
106     -0.923879564f, 0.923879564f, -0.382683456f, 0.382683456f,
107     -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f,
108     -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f,
109     -0.555570245f, 0.555570245f, -0.831469595f, 0.831469595f,
110     -0.980785251f, 0.980785251f, -0.195090324f, 0.195090324f,
111 };
112 ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
113     -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
114     -0.923879564f, 0.923879564f, 0.382683456f, -0.382683456f,
115     -0.555570245f, 0.555570245f, -0.195090353f, 0.195090353f,
116     -0.980785251f, 0.980785251f, 0.831469536f, -0.831469536f,
117     -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f,
118     -0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f,
119     -0.773010492f, 0.773010492f, 0.098017156f, -0.098017156f,
120     -0.881921172f, 0.881921172f, 0.956940353f, -0.956940353f,
121 };
122 ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
123     0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f,
124 };
125 
// Exchanges the two-float (real, imaginary) pair starting at a[p] with the
// pair starting at a[q]. The callers below always pass p != q.
static void bitrv2_128_swap(float* a, unsigned int p, unsigned int q) {
  const float xr = a[p + 0];
  const float xi = a[p + 1];
  const float yr = a[q + 0];
  const float yi = a[q + 1];
  a[p + 0] = yr;
  a[p + 1] = yi;
  a[q + 0] = xr;
  a[q + 1] = xi;
}

// Reorders, in place, the 64 (real, imaginary) float pairs stored in a[0..127]
// so that the pair at position i ends up at the position whose 6-bit index is
// the bit-reversal of i. Pairs whose index is its own bit-reversal are left
// untouched; the remaining 56 pairs are exchanged by 28 swaps.
//
// a: array of 128 floats, modified in place.
static void bitrv2_128_C(float* a) {
  /*
      Following things have been attempted but are no faster:
      (a) Storing the swap indexes in a LUT (index calculations are done
          for 'free' while waiting on memory/L1).
      (b) Consolidate the load/store of two consecutive floats by a 64 bit
          integer (execution is memory/L1 bound).
      (c) Do a mix of floats and 64 bit integer to maximize register
          utilization (execution is memory/L1 bound).
      (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
      (e) Hard-coding of the offsets to completely eliminates index
          calculations.
  */

  unsigned int j, j1, k, k1;

  // Float offsets used as the bit-reversed base for k = 0..3.
  static const int ip[4] = {0, 64, 32, 96};
  for (k = 0; k < 4; k++) {
    for (j = 0; j < k; j++) {
      // Four swaps per (j, k) combination; the offsets advance exactly as in
      // the original unrolled code.
      j1 = 2 * j + ip[k];
      k1 = 2 * k + ip[j];
      bitrv2_128_swap(a, j1, k1);
      j1 += 8;
      k1 += 16;
      bitrv2_128_swap(a, j1, k1);
      j1 += 8;
      k1 -= 8;
      bitrv2_128_swap(a, j1, k1);
      j1 += 8;
      k1 += 16;
      bitrv2_128_swap(a, j1, k1);
    }
    // One extra swap per k inside the "diagonal" block (j == k).
    j1 = 2 * k + 8 + ip[k];
    k1 = j1 + 8;
    bitrv2_128_swap(a, j1, k1);
  }
}
199 
cft1st_128_C(float * a)200 static void cft1st_128_C(float* a) {
201   const int n = 128;
202   int j, k1, k2;
203   float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
204   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
205 
206   // The processing of the first set of elements was simplified in C to avoid
207   // some operations (multiplication by zero or one, addition of two elements
208   // multiplied by the same weight, ...).
209   x0r = a[0] + a[2];
210   x0i = a[1] + a[3];
211   x1r = a[0] - a[2];
212   x1i = a[1] - a[3];
213   x2r = a[4] + a[6];
214   x2i = a[5] + a[7];
215   x3r = a[4] - a[6];
216   x3i = a[5] - a[7];
217   a[0] = x0r + x2r;
218   a[1] = x0i + x2i;
219   a[4] = x0r - x2r;
220   a[5] = x0i - x2i;
221   a[2] = x1r - x3i;
222   a[3] = x1i + x3r;
223   a[6] = x1r + x3i;
224   a[7] = x1i - x3r;
225   wk1r = rdft_w[2];
226   x0r = a[8] + a[10];
227   x0i = a[9] + a[11];
228   x1r = a[8] - a[10];
229   x1i = a[9] - a[11];
230   x2r = a[12] + a[14];
231   x2i = a[13] + a[15];
232   x3r = a[12] - a[14];
233   x3i = a[13] - a[15];
234   a[8] = x0r + x2r;
235   a[9] = x0i + x2i;
236   a[12] = x2i - x0i;
237   a[13] = x0r - x2r;
238   x0r = x1r - x3i;
239   x0i = x1i + x3r;
240   a[10] = wk1r * (x0r - x0i);
241   a[11] = wk1r * (x0r + x0i);
242   x0r = x3i + x1r;
243   x0i = x3r - x1i;
244   a[14] = wk1r * (x0i - x0r);
245   a[15] = wk1r * (x0i + x0r);
246   k1 = 0;
247   for (j = 16; j < n; j += 16) {
248     k1 += 2;
249     k2 = 2 * k1;
250     wk2r = rdft_w[k1 + 0];
251     wk2i = rdft_w[k1 + 1];
252     wk1r = rdft_w[k2 + 0];
253     wk1i = rdft_w[k2 + 1];
254     wk3r = rdft_wk3ri_first[k1 + 0];
255     wk3i = rdft_wk3ri_first[k1 + 1];
256     x0r = a[j + 0] + a[j + 2];
257     x0i = a[j + 1] + a[j + 3];
258     x1r = a[j + 0] - a[j + 2];
259     x1i = a[j + 1] - a[j + 3];
260     x2r = a[j + 4] + a[j + 6];
261     x2i = a[j + 5] + a[j + 7];
262     x3r = a[j + 4] - a[j + 6];
263     x3i = a[j + 5] - a[j + 7];
264     a[j + 0] = x0r + x2r;
265     a[j + 1] = x0i + x2i;
266     x0r -= x2r;
267     x0i -= x2i;
268     a[j + 4] = wk2r * x0r - wk2i * x0i;
269     a[j + 5] = wk2r * x0i + wk2i * x0r;
270     x0r = x1r - x3i;
271     x0i = x1i + x3r;
272     a[j + 2] = wk1r * x0r - wk1i * x0i;
273     a[j + 3] = wk1r * x0i + wk1i * x0r;
274     x0r = x1r + x3i;
275     x0i = x1i - x3r;
276     a[j + 6] = wk3r * x0r - wk3i * x0i;
277     a[j + 7] = wk3r * x0i + wk3i * x0r;
278     wk1r = rdft_w[k2 + 2];
279     wk1i = rdft_w[k2 + 3];
280     wk3r = rdft_wk3ri_second[k1 + 0];
281     wk3i = rdft_wk3ri_second[k1 + 1];
282     x0r = a[j + 8] + a[j + 10];
283     x0i = a[j + 9] + a[j + 11];
284     x1r = a[j + 8] - a[j + 10];
285     x1i = a[j + 9] - a[j + 11];
286     x2r = a[j + 12] + a[j + 14];
287     x2i = a[j + 13] + a[j + 15];
288     x3r = a[j + 12] - a[j + 14];
289     x3i = a[j + 13] - a[j + 15];
290     a[j + 8] = x0r + x2r;
291     a[j + 9] = x0i + x2i;
292     x0r -= x2r;
293     x0i -= x2i;
294     a[j + 12] = -wk2i * x0r - wk2r * x0i;
295     a[j + 13] = -wk2i * x0i + wk2r * x0r;
296     x0r = x1r - x3i;
297     x0i = x1i + x3r;
298     a[j + 10] = wk1r * x0r - wk1i * x0i;
299     a[j + 11] = wk1r * x0i + wk1i * x0r;
300     x0r = x1r + x3i;
301     x0i = x1i - x3r;
302     a[j + 14] = wk3r * x0r - wk3i * x0i;
303     a[j + 15] = wk3r * x0i + wk3i * x0r;
304   }
305 }
306 
cftmdl_128_C(float * a)307 static void cftmdl_128_C(float* a) {
308   const int l = 8;
309   const int n = 128;
310   const int m = 32;
311   int j0, j1, j2, j3, k, k1, k2, m2;
312   float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
313   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
314 
315   for (j0 = 0; j0 < l; j0 += 2) {
316     j1 = j0 + 8;
317     j2 = j0 + 16;
318     j3 = j0 + 24;
319     x0r = a[j0 + 0] + a[j1 + 0];
320     x0i = a[j0 + 1] + a[j1 + 1];
321     x1r = a[j0 + 0] - a[j1 + 0];
322     x1i = a[j0 + 1] - a[j1 + 1];
323     x2r = a[j2 + 0] + a[j3 + 0];
324     x2i = a[j2 + 1] + a[j3 + 1];
325     x3r = a[j2 + 0] - a[j3 + 0];
326     x3i = a[j2 + 1] - a[j3 + 1];
327     a[j0 + 0] = x0r + x2r;
328     a[j0 + 1] = x0i + x2i;
329     a[j2 + 0] = x0r - x2r;
330     a[j2 + 1] = x0i - x2i;
331     a[j1 + 0] = x1r - x3i;
332     a[j1 + 1] = x1i + x3r;
333     a[j3 + 0] = x1r + x3i;
334     a[j3 + 1] = x1i - x3r;
335   }
336   wk1r = rdft_w[2];
337   for (j0 = m; j0 < l + m; j0 += 2) {
338     j1 = j0 + 8;
339     j2 = j0 + 16;
340     j3 = j0 + 24;
341     x0r = a[j0 + 0] + a[j1 + 0];
342     x0i = a[j0 + 1] + a[j1 + 1];
343     x1r = a[j0 + 0] - a[j1 + 0];
344     x1i = a[j0 + 1] - a[j1 + 1];
345     x2r = a[j2 + 0] + a[j3 + 0];
346     x2i = a[j2 + 1] + a[j3 + 1];
347     x3r = a[j2 + 0] - a[j3 + 0];
348     x3i = a[j2 + 1] - a[j3 + 1];
349     a[j0 + 0] = x0r + x2r;
350     a[j0 + 1] = x0i + x2i;
351     a[j2 + 0] = x2i - x0i;
352     a[j2 + 1] = x0r - x2r;
353     x0r = x1r - x3i;
354     x0i = x1i + x3r;
355     a[j1 + 0] = wk1r * (x0r - x0i);
356     a[j1 + 1] = wk1r * (x0r + x0i);
357     x0r = x3i + x1r;
358     x0i = x3r - x1i;
359     a[j3 + 0] = wk1r * (x0i - x0r);
360     a[j3 + 1] = wk1r * (x0i + x0r);
361   }
362   k1 = 0;
363   m2 = 2 * m;
364   for (k = m2; k < n; k += m2) {
365     k1 += 2;
366     k2 = 2 * k1;
367     wk2r = rdft_w[k1 + 0];
368     wk2i = rdft_w[k1 + 1];
369     wk1r = rdft_w[k2 + 0];
370     wk1i = rdft_w[k2 + 1];
371     wk3r = rdft_wk3ri_first[k1 + 0];
372     wk3i = rdft_wk3ri_first[k1 + 1];
373     for (j0 = k; j0 < l + k; j0 += 2) {
374       j1 = j0 + 8;
375       j2 = j0 + 16;
376       j3 = j0 + 24;
377       x0r = a[j0 + 0] + a[j1 + 0];
378       x0i = a[j0 + 1] + a[j1 + 1];
379       x1r = a[j0 + 0] - a[j1 + 0];
380       x1i = a[j0 + 1] - a[j1 + 1];
381       x2r = a[j2 + 0] + a[j3 + 0];
382       x2i = a[j2 + 1] + a[j3 + 1];
383       x3r = a[j2 + 0] - a[j3 + 0];
384       x3i = a[j2 + 1] - a[j3 + 1];
385       a[j0 + 0] = x0r + x2r;
386       a[j0 + 1] = x0i + x2i;
387       x0r -= x2r;
388       x0i -= x2i;
389       a[j2 + 0] = wk2r * x0r - wk2i * x0i;
390       a[j2 + 1] = wk2r * x0i + wk2i * x0r;
391       x0r = x1r - x3i;
392       x0i = x1i + x3r;
393       a[j1 + 0] = wk1r * x0r - wk1i * x0i;
394       a[j1 + 1] = wk1r * x0i + wk1i * x0r;
395       x0r = x1r + x3i;
396       x0i = x1i - x3r;
397       a[j3 + 0] = wk3r * x0r - wk3i * x0i;
398       a[j3 + 1] = wk3r * x0i + wk3i * x0r;
399     }
400     wk1r = rdft_w[k2 + 2];
401     wk1i = rdft_w[k2 + 3];
402     wk3r = rdft_wk3ri_second[k1 + 0];
403     wk3i = rdft_wk3ri_second[k1 + 1];
404     for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
405       j1 = j0 + 8;
406       j2 = j0 + 16;
407       j3 = j0 + 24;
408       x0r = a[j0 + 0] + a[j1 + 0];
409       x0i = a[j0 + 1] + a[j1 + 1];
410       x1r = a[j0 + 0] - a[j1 + 0];
411       x1i = a[j0 + 1] - a[j1 + 1];
412       x2r = a[j2 + 0] + a[j3 + 0];
413       x2i = a[j2 + 1] + a[j3 + 1];
414       x3r = a[j2 + 0] - a[j3 + 0];
415       x3i = a[j2 + 1] - a[j3 + 1];
416       a[j0 + 0] = x0r + x2r;
417       a[j0 + 1] = x0i + x2i;
418       x0r -= x2r;
419       x0i -= x2i;
420       a[j2 + 0] = -wk2i * x0r - wk2r * x0i;
421       a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
422       x0r = x1r - x3i;
423       x0i = x1i + x3r;
424       a[j1 + 0] = wk1r * x0r - wk1i * x0i;
425       a[j1 + 1] = wk1r * x0i + wk1i * x0r;
426       x0r = x1r + x3i;
427       x0i = x1i - x3r;
428       a[j3 + 0] = wk3r * x0r - wk3i * x0i;
429       a[j3 + 1] = wk3r * x0i + wk3i * x0r;
430     }
431   }
432 }
433 
// Forward complex transform stage: runs the first and middle passes through
// the cft1st_128 / cftmdl_128 dispatch pointers, then performs the final
// twiddle-free butterfly pass in place, combining (real, imaginary) pairs that
// are 32 floats apart (j, j + 32, j + 64, j + 96).
//
// a: array of 128 floats, modified in place.
static void cftfsub_128_C(float* a) {
  int j, j1, j2, j3, l;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

  cft1st_128(a);
  cftmdl_128(a);
  l = 32;
  for (j = 0; j < l; j += 2) {
    j1 = j + l;
    j2 = j1 + l;
    j3 = j2 + l;
    x0r = a[j] + a[j1];
    x0i = a[j + 1] + a[j1 + 1];
    x1r = a[j] - a[j1];
    x1i = a[j + 1] - a[j1 + 1];
    x2r = a[j2] + a[j3];
    x2i = a[j2 + 1] + a[j3 + 1];
    x3r = a[j2] - a[j3];
    x3i = a[j2 + 1] - a[j3 + 1];
    a[j] = x0r + x2r;
    a[j + 1] = x0i + x2i;
    a[j2] = x0r - x2r;
    a[j2 + 1] = x0i - x2i;
    a[j1] = x1r - x3i;
    a[j1 + 1] = x1i + x3r;
    a[j3] = x1r + x3i;
    a[j3 + 1] = x1i - x3r;
  }
}
463 
// Backward counterpart of cftfsub_128_C: runs the same first and middle passes
// through the cft1st_128 / cftmdl_128 dispatch pointers, then performs the
// final butterfly pass in place on pairs 32 floats apart. Compared with the
// forward version, the imaginary parts read from a[j + 1] and a[j1 + 1] are
// negated and the signs applied to x2i / x3r in the imaginary outputs are
// flipped.
//
// a: array of 128 floats, modified in place.
static void cftbsub_128_C(float* a) {
  int j, j1, j2, j3, l;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

  cft1st_128(a);
  cftmdl_128(a);
  l = 32;

  for (j = 0; j < l; j += 2) {
    j1 = j + l;
    j2 = j1 + l;
    j3 = j2 + l;
    x0r = a[j] + a[j1];
    x0i = -a[j + 1] - a[j1 + 1];
    x1r = a[j] - a[j1];
    x1i = -a[j + 1] + a[j1 + 1];
    x2r = a[j2] + a[j3];
    x2i = a[j2 + 1] + a[j3 + 1];
    x3r = a[j2] - a[j3];
    x3i = a[j2 + 1] - a[j3 + 1];
    a[j] = x0r + x2r;
    a[j + 1] = x0i - x2i;
    a[j2] = x0r - x2r;
    a[j2 + 1] = x0i + x2i;
    a[j1] = x1r - x3i;
    a[j1 + 1] = x1i - x3r;
    a[j3] = x1r + x3i;
    a[j3 + 1] = x1i + x3r;
  }
}
494 
rftfsub_128_C(float * a)495 static void rftfsub_128_C(float* a) {
496   const float* c = rdft_w + 32;
497   int j1, j2, k1, k2;
498   float wkr, wki, xr, xi, yr, yi;
499 
500   for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
501     k2 = 128 - j2;
502     k1 = 32 - j1;
503     wkr = 0.5f - c[k1];
504     wki = c[j1];
505     xr = a[j2 + 0] - a[k2 + 0];
506     xi = a[j2 + 1] + a[k2 + 1];
507     yr = wkr * xr - wki * xi;
508     yi = wkr * xi + wki * xr;
509     a[j2 + 0] -= yr;
510     a[j2 + 1] -= yi;
511     a[k2 + 0] += yr;
512     a[k2 + 1] -= yi;
513   }
514 }
515 
rftbsub_128_C(float * a)516 static void rftbsub_128_C(float* a) {
517   const float* c = rdft_w + 32;
518   int j1, j2, k1, k2;
519   float wkr, wki, xr, xi, yr, yi;
520 
521   a[1] = -a[1];
522   for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
523     k2 = 128 - j2;
524     k1 = 32 - j1;
525     wkr = 0.5f - c[k1];
526     wki = c[j1];
527     xr = a[j2 + 0] - a[k2 + 0];
528     xi = a[j2 + 1] + a[k2 + 1];
529     yr = wkr * xr + wki * xi;
530     yi = wkr * xi - wki * xr;
531     a[j2 + 0] = a[j2 + 0] - yr;
532     a[j2 + 1] = yi - a[j2 + 1];
533     a[k2 + 0] = yr + a[k2 + 0];
534     a[k2 + 1] = yi - a[k2 + 1];
535   }
536   a[65] = -a[65];
537 }
538 
// Forward real DFT of length 128, computed in place.
// Runs the bit-reversal, complex-transform and real post-processing stages
// through the dispatch pointers (so aec_rdft_init() must have been called
// first), then replaces a[0] with a[0] + a[1] and a[1] with a[0] - a[1]
// (using the values from before the update).
// NOTE(review): in Ooura's rdft output convention those two slots presumably
// hold the DC and Nyquist terms -- confirm against the package documentation.
//
// a: array of 128 floats; input samples on entry, transform output on return.
void aec_rdft_forward_128(float* a) {
  float xi;
  bitrv2_128(a);
  cftfsub_128(a);
  rftfsub_128(a);
  xi = a[0] - a[1];
  a[0] += a[1];
  a[1] = xi;
}
548 
// Inverse real DFT of length 128, computed in place.
// First undoes the a[0] / a[1] packing (a[1] becomes half the difference and
// a[0] the half-sum of the incoming values), then runs the real
// pre-processing, bit-reversal and backward complex-transform stages through
// the dispatch pointers (so aec_rdft_init() must have been called first).
// No output scaling is applied in this function.
//
// a: array of 128 floats, modified in place.
void aec_rdft_inverse_128(float* a) {
  a[1] = 0.5f * (a[0] - a[1]);
  a[0] -= a[1];
  rftbsub_128(a);
  bitrv2_128(a);
  cftbsub_128(a);
}
556 
557 // code path selection
558 RftSub128 cft1st_128;
559 RftSub128 cftmdl_128;
560 RftSub128 rftfsub_128;
561 RftSub128 rftbsub_128;
562 RftSub128 cftfsub_128;
563 RftSub128 cftbsub_128;
564 RftSub128 bitrv2_128;
565 
aec_rdft_init(void)566 void aec_rdft_init(void) {
567   cft1st_128 = cft1st_128_C;
568   cftmdl_128 = cftmdl_128_C;
569   rftfsub_128 = rftfsub_128_C;
570   rftbsub_128 = rftbsub_128_C;
571   cftfsub_128 = cftfsub_128_C;
572   cftbsub_128 = cftbsub_128_C;
573   bitrv2_128 = bitrv2_128_C;
574 #if defined(WEBRTC_ARCH_X86_FAMILY)
575   if (WebRtc_GetCPUInfo(kSSE2)) {
576     aec_rdft_init_sse2();
577   }
578 #endif
579 #if defined(MIPS_FPU_LE)
580   aec_rdft_init_mips();
581 #endif
582 #if defined(WEBRTC_HAS_NEON)
583   aec_rdft_init_neon();
584 #elif defined(WEBRTC_DETECT_NEON)
585   if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
586     aec_rdft_init_neon();
587   }
588 #endif
589 }
590