/******************************************************************************
 *                                                                            *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
20 #include <stdlib.h>
21 #include <stdio.h>
22 
23 #include <ixheaacd_type_def.h>
24 #include "ixheaacd_interface.h"
25 #include "ixheaacd_constants.h"
26 #include <ixheaacd_basic_ops32.h>
27 #include "ixheaacd_basic_ops40.h"
28 #include "ixheaacd_function_selector.h"
29 
30 extern const WORD32 ixheaacd_twiddle_table_fft_32x32[514];
31 extern const WORD32 ixheaacd_twiddle_table_3pr[1155];
32 extern const WORD32 ixheaacd_twiddle_table_3pi[1155];
33 extern const WORD8 ixheaacd_mps_dig_rev[16];
34 
35 #define PLATFORM_INLINE __inline
36 
/*
 * DIG_REV(i, m, j)
 *
 * Permutes the bits of `i` (taken as an unsigned int) in three steps and
 * stores the result, shifted right by `m`, into the lvalue `j`:
 *   1. swap the two 2-bit groups inside every nibble,
 *   2. swap the two nibbles inside every byte,
 *   3. swap the two bytes inside every 16-bit half.
 * There is no 16-bit swap, so each 16-bit half is permuted independently.
 *
 * `i` and `m` are each evaluated exactly once. `m` must be in the range
 * 0..31; any other shift count is undefined behaviour in C.
 */
#define DIG_REV(i, m, j)                                              \
  do {                                                                \
    unsigned int dig_rev_v_ = (unsigned int)(i);                      \
    dig_rev_v_ = ((dig_rev_v_ & 0x33333333u) << 2) |                  \
                 ((dig_rev_v_ & ~0x33333333u) >> 2);                  \
    dig_rev_v_ = ((dig_rev_v_ & 0x0F0F0F0Fu) << 4) |                  \
                 ((dig_rev_v_ & ~0x0F0F0F0Fu) >> 4);                  \
    dig_rev_v_ = ((dig_rev_v_ & 0x00FF00FFu) << 8) |                  \
                 ((dig_rev_v_ & ~0x00FF00FFu) >> 8);                  \
    (j) = dig_rev_v_ >> (m);                                          \
  } while (0)
45 
ixheaacd_mult32_sat(WORD32 a,WORD32 b)46 static PLATFORM_INLINE WORD32 ixheaacd_mult32_sat(WORD32 a, WORD32 b) {
47   WORD32 result;
48   WORD64 temp_result;
49 
50   temp_result = (WORD64)a * (WORD64)b;
51   result = ixheaacd_sat64_32(temp_result >> 31);
52 
53   return (result);
54 }
55 
ixheaacd_mac32_sat(WORD32 a,WORD32 b,WORD32 c)56 static PLATFORM_INLINE WORD32 ixheaacd_mac32_sat(WORD32 a, WORD32 b, WORD32 c) {
57   WORD32 result;
58 
59   result = ixheaacd_add32_sat(a, ixheaacd_mult32_sat(b, c));
60 
61   return (result);
62 }
63 
64 
ixheaacd_mps_complex_fft_64_dec(WORD32 * ptr_x,WORD32 * fin_re,WORD32 * fin_im,WORD32 nlength)65 VOID ixheaacd_mps_complex_fft_64_dec(WORD32 *ptr_x, WORD32 *fin_re,
66                                      WORD32 *fin_im, WORD32 nlength) {
67   WORD32 i, j, k, n_stages;
68   WORD32 h2, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
69   WORD32 del, nodespacing, in_loop_cnt;
70   WORD32 y[128];
71   WORD32 npoints = nlength;
72   WORD32 *ptr_y = y;
73   const WORD32 *ptr_w;
74   n_stages = 30 - ixheaacd_norm32(npoints);
75 
76   n_stages = n_stages >> 1;
77 
78   ptr_w = ixheaacd_twiddle_table_fft_32x32;
79 
80   for (i = 0; i < npoints; i += 4) {
81     WORD32 *inp = ptr_x;
82     h2 = ixheaacd_mps_dig_rev[i >> 2];
83     inp += (h2);
84 
85     x0r = *inp;
86     x0i = *(inp + 1);
87     inp += (npoints >> 1);
88 
89     x1r = *inp;
90     x1i = *(inp + 1);
91     inp += (npoints >> 1);
92 
93     x2r = *inp;
94     x2i = *(inp + 1);
95     inp += (npoints >> 1);
96 
97     x3r = *inp;
98     x3i = *(inp + 1);
99 
100     x0r = ixheaacd_add32_sat(x0r, x2r);
101     x0i = ixheaacd_add32_sat(x0i, x2i);
102     x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
103     x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
104     x1r = ixheaacd_add32_sat(x1r, x3r);
105     x1i = ixheaacd_add32_sat(x1i, x3i);
106     x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
107     x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
108 
109     x0r = ixheaacd_add32_sat(x0r, x1r);
110     x0i = ixheaacd_add32_sat(x0i, x1i);
111     x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
112     x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
113     x2r = ixheaacd_add32_sat(x2r, x3i);
114     x2i = ixheaacd_sub32_sat(x2i, x3r);
115     x3i = ixheaacd_sub32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
116     x3r = ixheaacd_add32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
117 
118     *ptr_y++ = x0r;
119     *ptr_y++ = x0i;
120     *ptr_y++ = x2r;
121     *ptr_y++ = x2i;
122     *ptr_y++ = x1r;
123     *ptr_y++ = x1i;
124     *ptr_y++ = x3i;
125     *ptr_y++ = x3r;
126   }
127   ptr_y -= 2 * npoints;
128   del = 4;
129   nodespacing = 64;
130   in_loop_cnt = npoints >> 4;
131   for (i = n_stages - 1; i > 0; i--) {
132     const WORD32 *twiddles = ptr_w;
133     WORD32 *data = ptr_y;
134     WORD32 w1h, w2h, w3h, w1l, w2l, w3l;
135     WORD32 sec_loop_cnt;
136 
137     for (k = in_loop_cnt; k != 0; k--) {
138       x0r = (*data);
139       x0i = (*(data + 1));
140       data += (del << 1);
141 
142       x1r = (*data);
143       x1i = (*(data + 1));
144       data += (del << 1);
145 
146       x2r = (*data);
147       x2i = (*(data + 1));
148       data += (del << 1);
149 
150       x3r = (*data);
151       x3i = (*(data + 1));
152       data -= 3 * (del << 1);
153 
154       x0r = ixheaacd_add32_sat(x0r, x2r);
155       x0i = ixheaacd_add32_sat(x0i, x2i);
156       x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
157       x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
158       x1r = ixheaacd_add32_sat(x1r, x3r);
159       x1i = ixheaacd_add32_sat(x1i, x3i);
160       x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
161       x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
162 
163       x0r = ixheaacd_add32_sat(x0r, x1r);
164       x0i = ixheaacd_add32_sat(x0i, x1i);
165       x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
166       x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
167       x2r = ixheaacd_add32_sat(x2r, x3i);
168       x2i = ixheaacd_sub32_sat(x2i, x3r);
169       x3i = ixheaacd_sub32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
170       x3r = ixheaacd_add32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
171 
172       *data = x0r;
173       *(data + 1) = x0i;
174       data += (del << 1);
175 
176       *data = x2r;
177       *(data + 1) = x2i;
178       data += (del << 1);
179 
180       *data = x1r;
181       *(data + 1) = x1i;
182       data += (del << 1);
183 
184       *data = x3i;
185       *(data + 1) = x3r;
186       data += (del << 1);
187     }
188     data = ptr_y + 2;
189 
190     sec_loop_cnt = (nodespacing * del);
191     sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) -
192                    (sec_loop_cnt / 16) + (sec_loop_cnt / 32) -
193                    (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
194                    (sec_loop_cnt / 256);
195     j = nodespacing;
196 
197     for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
198       w1h = *(twiddles + 2 * j);
199       w1l = *(twiddles + 2 * j + 1);
200       w2h = *(twiddles + 2 * (j << 1));
201       w2l = *(twiddles + 2 * (j << 1) + 1);
202       w3h = *(twiddles + 2 * j + 2 * (j << 1));
203       w3l = *(twiddles + 2 * j + 2 * (j << 1) + 1);
204 
205       for (k = in_loop_cnt; k != 0; k--) {
206         WORD32 tmp;
207         WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
208 
209         data += (del << 1);
210 
211         x1r = *data;
212         x1i = *(data + 1);
213         data += (del << 1);
214 
215         x2r = *data;
216         x2i = *(data + 1);
217         data += (del << 1);
218 
219         x3r = *data;
220         x3i = *(data + 1);
221         data -= 3 * (del << 1);
222 
223         tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x1r, w1l),
224                                  ixheaacd_mult32_sat(x1i, w1h));
225         x1i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x1r, w1h), x1i, w1l);
226         x1r = tmp;
227 
228         tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x2r, w2l),
229                                  ixheaacd_mult32_sat(x2i, w2h));
230         x2i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x2r, w2h), x2i, w2l);
231         x2r = tmp;
232 
233         tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x3r, w3l),
234                                  ixheaacd_mult32_sat(x3i, w3h));
235         x3i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x3r, w3h), x3i, w3l);
236         x3r = tmp;
237 
238         x0r = (*data);
239         x0i = (*(data + 1));
240 
241         x0r = ixheaacd_add32_sat(x0r, x2r);
242         x0i = ixheaacd_add32_sat(x0i, x2i);
243         x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
244         x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
245         x1r = ixheaacd_add32_sat(x1r, x3r);
246         x1i = ixheaacd_add32_sat(x1i, x3i);
247         x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
248         x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
249 
250         x0r = ixheaacd_add32_sat(x0r, x1r);
251         x0i = ixheaacd_add32_sat(x0i, x1i);
252         x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
253         x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
254         x2r = ixheaacd_add32_sat(x2r, x3i);
255         x2i = ixheaacd_sub32_sat(x2i, x3r);
256         x3i = ixheaacd_sub32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
257         x3r = ixheaacd_add32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
258 
259         *data = x0r;
260         *(data + 1) = x0i;
261         data += (del << 1);
262 
263         *data = x2r;
264         *(data + 1) = x2i;
265         data += (del << 1);
266 
267         *data = x1r;
268         *(data + 1) = x1i;
269         data += (del << 1);
270 
271         *data = x3i;
272         *(data + 1) = x3r;
273         data += (del << 1);
274       }
275       data -= 2 * npoints;
276       data += 2;
277     }
278     for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
279       w1h = *(twiddles + 2 * j);
280       w2h = *(twiddles + 2 * (j << 1));
281       w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
282       w1l = *(twiddles + 2 * j + 1);
283       w2l = *(twiddles + 2 * (j << 1) + 1);
284       w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
285 
286       for (k = in_loop_cnt; k != 0; k--) {
287         WORD32 tmp;
288         WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
289 
290         data += (del << 1);
291 
292         x1r = *data;
293         x1i = *(data + 1);
294         data += (del << 1);
295 
296         x2r = *data;
297         x2i = *(data + 1);
298         data += (del << 1);
299 
300         x3r = *data;
301         x3i = *(data + 1);
302         data -= 3 * (del << 1);
303 
304         tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x1r, w1l),
305                                  ixheaacd_mult32_sat(x1i, w1h));
306         x1i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x1r, w1h), x1i, w1l);
307         x1r = tmp;
308 
309         tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x2r, w2l),
310                                  ixheaacd_mult32_sat(x2i, w2h));
311         x2i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x2r, w2h), x2i, w2l);
312         x2r = tmp;
313 
314         tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x3r, w3h),
315                                  ixheaacd_mult32_sat(x3i, w3l));
316         x3i = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x3i, w3h),
317                                  ixheaacd_mult32_sat(x3r, w3l));
318         x3r = tmp;
319 
320         x0r = (*data);
321         x0i = (*(data + 1));
322 
323         x0r = ixheaacd_add32_sat(x0r, x2r);
324         x0i = ixheaacd_add32_sat(x0i, x2i);
325         x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
326         x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
327         x1r = ixheaacd_add32_sat(x1r, x3r);
328         x1i = ixheaacd_add32_sat(x1i, x3i);
329         x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
330         x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
331 
332         x0r = ixheaacd_add32_sat(x0r, x1r);
333         x0i = ixheaacd_add32_sat(x0i, x1i);
334         x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
335         x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
336         x2r = ixheaacd_add32_sat(x2r, x3i);
337         x2i = ixheaacd_sub32_sat(x2i, x3r);
338         x3i = ixheaacd_sub32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
339         x3r = ixheaacd_add32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
340 
341         *data = x0r;
342         *(data + 1) = x0i;
343         data += (del << 1);
344 
345         *data = x2r;
346         *(data + 1) = x2i;
347         data += (del << 1);
348 
349         *data = x1r;
350         *(data + 1) = x1i;
351         data += (del << 1);
352 
353         *data = x3i;
354         *(data + 1) = x3r;
355         data += (del << 1);
356       }
357       data -= 2 * npoints;
358       data += 2;
359     }
360     for (; j <= sec_loop_cnt * 2; j += nodespacing) {
361       w1h = *(twiddles + 2 * j);
362       w2h = *(twiddles + 2 * (j << 1) - 512);
363       w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
364       w1l = *(twiddles + 2 * j + 1);
365       w2l = *(twiddles + 2 * (j << 1) - 511);
366       w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
367 
368       for (k = in_loop_cnt; k != 0; k--) {
369         WORD32 tmp;
370         WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
371 
372         data += (del << 1);
373 
374         x1r = *data;
375         x1i = *(data + 1);
376         data += (del << 1);
377 
378         x2r = *data;
379         x2i = *(data + 1);
380         data += (del << 1);
381 
382         x3r = *data;
383         x3i = *(data + 1);
384         data -= 3 * (del << 1);
385 
386         tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x1r, w1l),
387                                  ixheaacd_mult32_sat(x1i, w1h));
388         x1i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x1r, w1h), x1i, w1l);
389         x1r = tmp;
390 
391         tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x2r, w2h),
392                                  ixheaacd_mult32_sat(x2i, w2l));
393         x2i = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x2i, w2h),
394                                  ixheaacd_mult32_sat(x2r, w2l));
395         x2r = tmp;
396 
397         tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x3r, w3h),
398                                  ixheaacd_mult32_sat(x3i, w3l));
399         x3i = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x3i, w3h),
400                                  ixheaacd_mult32_sat(x3r, w3l));
401         x3r = tmp;
402 
403         x0r = (*data);
404         x0i = (*(data + 1));
405 
406         x0r = ixheaacd_add32_sat(x0r, x2r);
407         x0i = ixheaacd_add32_sat(x0i, x2i);
408         x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
409         x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
410         x1r = ixheaacd_add32_sat(x1r, x3r);
411         x1i = ixheaacd_add32_sat(x1i, x3i);
412         x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
413         x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
414 
415         x0r = ixheaacd_add32_sat(x0r, x1r);
416         x0i = ixheaacd_add32_sat(x0i, x1i);
417         x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
418         x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
419         x2r = ixheaacd_add32_sat(x2r, x3i);
420         x2i = ixheaacd_sub32_sat(x2i, x3r);
421         x3i = ixheaacd_sub32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
422         x3r = ixheaacd_add32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
423 
424         *data = x0r;
425         *(data + 1) = x0i;
426         data += (del << 1);
427 
428         *data = x2r;
429         *(data + 1) = x2i;
430         data += (del << 1);
431 
432         *data = x1r;
433         *(data + 1) = x1i;
434         data += (del << 1);
435 
436         *data = x3i;
437         *(data + 1) = x3r;
438         data += (del << 1);
439       }
440       data -= 2 * npoints;
441       data += 2;
442     }
443     for (; j < nodespacing * del; j += nodespacing) {
444       w1h = *(twiddles + 2 * j);
445       w2h = *(twiddles + 2 * (j << 1) - 512);
446       w3h = *(twiddles + 2 * j + 2 * (j << 1) - 1024);
447       w1l = *(twiddles + 2 * j + 1);
448       w2l = *(twiddles + 2 * (j << 1) - 511);
449       w3l = *(twiddles + 2 * j + 2 * (j << 1) - 1023);
450 
451       for (k = in_loop_cnt; k != 0; k--) {
452         WORD32 tmp;
453         WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
454 
455         data += (del << 1);
456 
457         x1r = *data;
458         x1i = *(data + 1);
459         data += (del << 1);
460 
461         x2r = *data;
462         x2i = *(data + 1);
463         data += (del << 1);
464 
465         x3r = *data;
466         x3i = *(data + 1);
467         data -= 3 * (del << 1);
468 
469         tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x1r, w1l),
470                                  ixheaacd_mult32_sat(x1i, w1h));
471         x1i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x1r, w1h), x1i, w1l);
472         x1r = tmp;
473 
474         tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x2r, w2h),
475                                  ixheaacd_mult32_sat(x2i, w2l));
476         x2i = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x2i, w2h),
477                                  ixheaacd_mult32_sat(x2r, w2l));
478         x2r = tmp;
479 
480         tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x3i, w3h),
481                                  ixheaacd_mult32_sat(x3r, w3l));
482         x3i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x3r, w3h), x3i, w3l);
483         x3r = tmp;
484 
485         x0r = (*data);
486         x0i = (*(data + 1));
487 
488         x0r = ixheaacd_add32_sat(x0r, x2r);
489         x0i = ixheaacd_add32_sat(x0i, x2i);
490         x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
491         x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
492         x1r = ixheaacd_add32_sat(x1r, x3r);
493         x1i = ixheaacd_sub32_sat(x1i, x3i);
494         x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
495         x3i = ixheaacd_add32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
496 
497         x0r = ixheaacd_add32_sat(x0r, x1r);
498         x0i = ixheaacd_add32_sat(x0i, x1i);
499         x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
500         x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
501         x2r = ixheaacd_add32_sat(x2r, x3i);
502         x2i = ixheaacd_sub32_sat(x2i, x3r);
503         x3i = ixheaacd_sub32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
504         x3r = ixheaacd_add32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
505 
506         *data = x0r;
507         *(data + 1) = x0i;
508         data += (del << 1);
509 
510         *data = x2r;
511         *(data + 1) = x2i;
512         data += (del << 1);
513 
514         *data = x1r;
515         *(data + 1) = x1i;
516         data += (del << 1);
517 
518         *data = x3i;
519         *(data + 1) = x3r;
520         data += (del << 1);
521       }
522       data -= 2 * npoints;
523       data += 2;
524     }
525     nodespacing >>= 2;
526     del <<= 2;
527     in_loop_cnt >>= 2;
528   }
529 
530   for (i = 0; i < 2 * nlength; i += 2) {
531     fin_re[i] = y[i];
532     fin_im[i] = y[i + 1];
533   }
534 
535   return;
536 }
537 
ixheaacd_complex_fft_p2_dec(WORD32 * xr,WORD32 * xi,WORD32 nlength,WORD32 fft_mode,WORD32 * preshift)538 VOID ixheaacd_complex_fft_p2_dec(WORD32 *xr, WORD32 *xi, WORD32 nlength,
539                                  WORD32 fft_mode, WORD32 *preshift) {
540   WORD32 i, j, k, n_stages;
541   WORD32 h2, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
542   WORD32 del, nodespacing, in_loop_cnt;
543   WORD32 not_power_4;
544   WORD32 npts, shift;
545   WORD32 dig_rev_shift;
546   WORD32 ptr_x[1024];
547   WORD32 y[1024];
548   WORD32 npoints = nlength;
549   WORD32 n = 0;
550   WORD32 *ptr_y = y;
551   const WORD32 *ptr_w;
552   dig_rev_shift = ixheaacd_norm32(npoints) + 1 - 16;
553   n_stages = 30 - ixheaacd_norm32(npoints);
554   not_power_4 = n_stages & 1;
555 
556   n_stages = n_stages >> 1;
557 
558   npts = npoints;
559   while (npts >> 1) {
560     n++;
561     npts = npts >> 1;
562   }
563 
564   if (n % 2 == 0)
565     shift = ((n + 4)) / 2;
566   else
567     shift = ((n + 3) / 2);
568 
569   for (i = 0; i < nlength; i++) {
570     ptr_x[2 * i] = (xr[i] / (1 << (shift)));
571     ptr_x[2 * i + 1] = (xi[i] / (1 << (shift)));
572   }
573 
574   if (fft_mode == -1) {
575     ptr_w = ixheaacd_twiddle_table_fft_32x32;
576 
577     for (i = 0; i < npoints; i += 4) {
578       WORD32 *inp = ptr_x;
579 
580       DIG_REV(i, dig_rev_shift, h2);
581       if (not_power_4) {
582         h2 += 1;
583         h2 &= ~1;
584       }
585       inp += (h2);
586 
587       x0r = *inp;
588       x0i = *(inp + 1);
589       inp += (npoints >> 1);
590 
591       x1r = *inp;
592       x1i = *(inp + 1);
593       inp += (npoints >> 1);
594 
595       x2r = *inp;
596       x2i = *(inp + 1);
597       inp += (npoints >> 1);
598 
599       x3r = *inp;
600       x3i = *(inp + 1);
601 
602       x0r = ixheaacd_add32_sat(x0r, x2r);
603       x0i = ixheaacd_add32_sat(x0i, x2i);
604       x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
605       x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
606       x1r = ixheaacd_add32_sat(x1r, x3r);
607       x1i = ixheaacd_add32_sat(x1i, x3i);
608       x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
609       x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
610 
611       x0r = ixheaacd_add32_sat(x0r, x1r);
612       x0i = ixheaacd_add32_sat(x0i, x1i);
613       x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
614       x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
615       x2r = ixheaacd_add32_sat(x2r, x3i);
616       x2i = ixheaacd_sub32_sat(x2i, x3r);
617       x3i = ixheaacd_sub32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
618       x3r = ixheaacd_add32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
619 
620       *ptr_y++ = x0r;
621       *ptr_y++ = x0i;
622       *ptr_y++ = x2r;
623       *ptr_y++ = x2i;
624       *ptr_y++ = x1r;
625       *ptr_y++ = x1i;
626       *ptr_y++ = x3i;
627       *ptr_y++ = x3r;
628     }
629     ptr_y -= 2 * npoints;
630     del = 4;
631     nodespacing = 64;
632     in_loop_cnt = npoints >> 4;
633     for (i = n_stages - 1; i > 0; i--) {
634       const WORD32 *twiddles = ptr_w;
635       WORD32 *data = ptr_y;
636       WORD32 w1h, w2h, w3h, w1l, w2l, w3l;
637       WORD32 sec_loop_cnt;
638 
639       for (k = in_loop_cnt; k != 0; k--) {
640         x0r = (*data);
641         x0i = (*(data + 1));
642         data += (del << 1);
643 
644         x1r = (*data);
645         x1i = (*(data + 1));
646         data += (del << 1);
647 
648         x2r = (*data);
649         x2i = (*(data + 1));
650         data += (del << 1);
651 
652         x3r = (*data);
653         x3i = (*(data + 1));
654         data -= 3 * (del << 1);
655 
656         x0r = ixheaacd_add32_sat(x0r, x2r);
657         x0i = ixheaacd_add32_sat(x0i, x2i);
658         x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
659         x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
660         x1r = ixheaacd_add32_sat(x1r, x3r);
661         x1i = ixheaacd_add32_sat(x1i, x3i);
662         x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
663         x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
664 
665         x0r = ixheaacd_add32_sat(x0r, x1r);
666         x0i = ixheaacd_add32_sat(x0i, x1i);
667         x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
668         x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
669         x2r = ixheaacd_add32_sat(x2r, x3i);
670         x2i = ixheaacd_sub32_sat(x2i, x3r);
671         x3i = ixheaacd_sub32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
672         x3r = ixheaacd_add32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
673 
674         *data = x0r;
675         *(data + 1) = x0i;
676         data += (del << 1);
677 
678         *data = x2r;
679         *(data + 1) = x2i;
680         data += (del << 1);
681 
682         *data = x1r;
683         *(data + 1) = x1i;
684         data += (del << 1);
685 
686         *data = x3i;
687         *(data + 1) = x3r;
688         data += (del << 1);
689       }
690       data = ptr_y + 2;
691 
692       sec_loop_cnt = (nodespacing * del);
693       sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) -
694                      (sec_loop_cnt / 16) + (sec_loop_cnt / 32) -
695                      (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
696                      (sec_loop_cnt / 256);
697       j = nodespacing;
698 
699       for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
700         w1h = *(twiddles + 2 * j);
701         w1l = *(twiddles + 2 * j + 1);
702         w2h = *(twiddles + 2 * (j << 1));
703         w2l = *(twiddles + 2 * (j << 1) + 1);
704         w3h = *(twiddles + 2 * j + 2 * (j << 1));
705         w3l = *(twiddles + 2 * j + 2 * (j << 1) + 1);
706 
707         for (k = in_loop_cnt; k != 0; k--) {
708           WORD32 tmp;
709           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
710 
711           data += (del << 1);
712 
713           x1r = *data;
714           x1i = *(data + 1);
715           data += (del << 1);
716 
717           x2r = *data;
718           x2i = *(data + 1);
719           data += (del << 1);
720 
721           x3r = *data;
722           x3i = *(data + 1);
723           data -= 3 * (del << 1);
724 
725           tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x1r, w1l),
726                                    ixheaacd_mult32_sat(x1i, w1h));
727           x1i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x1r, w1h), x1i, w1l);
728           x1r = tmp;
729 
730           tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x2r, w2l),
731                                    ixheaacd_mult32_sat(x2i, w2h));
732           x2i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x2r, w2h), x2i, w2l);
733           x2r = tmp;
734 
735           tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x3r, w3l),
736                                    ixheaacd_mult32_sat(x3i, w3h));
737           x3i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x3r, w3h), x3i, w3l);
738           x3r = tmp;
739 
740           x0r = (*data);
741           x0i = (*(data + 1));
742 
743           x0r = ixheaacd_add32_sat(x0r, x2r);
744           x0i = ixheaacd_add32_sat(x0i, x2i);
745           x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
746           x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
747           x1r = ixheaacd_add32_sat(x1r, x3r);
748           x1i = ixheaacd_add32_sat(x1i, x3i);
749           x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
750           x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
751 
752           x0r = ixheaacd_add32_sat(x0r, x1r);
753           x0i = ixheaacd_add32_sat(x0i, x1i);
754           x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
755           x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
756           x2r = ixheaacd_add32_sat(x2r, x3i);
757           x2i = ixheaacd_sub32_sat(x2i, x3r);
758           x3i = ixheaacd_sub32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
759           x3r = ixheaacd_add32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
760 
761           *data = x0r;
762           *(data + 1) = x0i;
763           data += (del << 1);
764 
765           *data = x2r;
766           *(data + 1) = x2i;
767           data += (del << 1);
768 
769           *data = x1r;
770           *(data + 1) = x1i;
771           data += (del << 1);
772 
773           *data = x3i;
774           *(data + 1) = x3r;
775           data += (del << 1);
776         }
777         data -= 2 * npoints;
778         data += 2;
779       }
780       for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
781         w1h = *(twiddles + 2 * j);
782         w2h = *(twiddles + 2 * (j << 1));
783         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
784         w1l = *(twiddles + 2 * j + 1);
785         w2l = *(twiddles + 2 * (j << 1) + 1);
786         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
787 
788         for (k = in_loop_cnt; k != 0; k--) {
789           WORD32 tmp;
790           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
791           data += (del << 1);
792 
793           x1r = *data;
794           x1i = *(data + 1);
795           data += (del << 1);
796 
797           x2r = *data;
798           x2i = *(data + 1);
799           data += (del << 1);
800 
801           x3r = *data;
802           x3i = *(data + 1);
803           data -= 3 * (del << 1);
804 
805           tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x1r, w1l),
806                                    ixheaacd_mult32_sat(x1i, w1h));
807           x1i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x1r, w1h), x1i, w1l);
808           x1r = tmp;
809 
810           tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x2r, w2l),
811                                    ixheaacd_mult32_sat(x2i, w2h));
812           x2i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x2r, w2h), x2i, w2l);
813           x2r = tmp;
814 
815           tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x3r, w3h),
816                                    ixheaacd_mult32_sat(x3i, w3l));
817           x3i = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x3i, w3h),
818                                    ixheaacd_mult32_sat(x3r, w3l));
819           x3r = tmp;
820 
821           x0r = (*data);
822           x0i = (*(data + 1));
823 
824           x0r = ixheaacd_add32_sat(x0r, x2r);
825           x0i = ixheaacd_add32_sat(x0i, x2i);
826           x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
827           x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
828           x1r = ixheaacd_add32_sat(x1r, x3r);
829           x1i = ixheaacd_add32_sat(x1i, x3i);
830           x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
831           x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
832 
833           x0r = ixheaacd_add32_sat(x0r, x1r);
834           x0i = ixheaacd_add32_sat(x0i, x1i);
835           x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
836           x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
837           x2r = ixheaacd_add32_sat(x2r, x3i);
838           x2i = ixheaacd_sub32_sat(x2i, x3r);
839           x3i = ixheaacd_sub32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
840           x3r = ixheaacd_add32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
841 
842           *data = x0r;
843           *(data + 1) = x0i;
844           data += (del << 1);
845 
846           *data = x2r;
847           *(data + 1) = x2i;
848           data += (del << 1);
849 
850           *data = x1r;
851           *(data + 1) = x1i;
852           data += (del << 1);
853 
854           *data = x3i;
855           *(data + 1) = x3r;
856           data += (del << 1);
857         }
858         data -= 2 * npoints;
859         data += 2;
860       }
861       for (; j <= sec_loop_cnt * 2; j += nodespacing) {
862         w1h = *(twiddles + 2 * j);
863         w2h = *(twiddles + 2 * (j << 1) - 512);
864         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
865         w1l = *(twiddles + 2 * j + 1);
866         w2l = *(twiddles + 2 * (j << 1) - 511);
867         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
868 
869         for (k = in_loop_cnt; k != 0; k--) {
870           WORD32 tmp;
871           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
872 
873           data += (del << 1);
874 
875           x1r = *data;
876           x1i = *(data + 1);
877           data += (del << 1);
878 
879           x2r = *data;
880           x2i = *(data + 1);
881           data += (del << 1);
882 
883           x3r = *data;
884           x3i = *(data + 1);
885           data -= 3 * (del << 1);
886 
887           tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x1r, w1l),
888                                    ixheaacd_mult32_sat(x1i, w1h));
889           x1i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x1r, w1h), x1i, w1l);
890           x1r = tmp;
891 
892           tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x2r, w2h),
893                                    ixheaacd_mult32_sat(x2i, w2l));
894           x2i = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x2i, w2h),
895                                    ixheaacd_mult32_sat(x2r, w2l));
896           x2r = tmp;
897 
898           tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x3r, w3h),
899                                    ixheaacd_mult32_sat(x3i, w3l));
900           x3i = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x3i, w3h),
901                                    ixheaacd_mult32_sat(x3r, w3l));
902           x3r = tmp;
903 
904           x0r = (*data);
905           x0i = (*(data + 1));
906 
907           x0r = ixheaacd_add32_sat(x0r, x2r);
908           x0i = ixheaacd_add32_sat(x0i, x2i);
909           x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
910           x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
911           x1r = ixheaacd_add32_sat(x1r, x3r);
912           x1i = ixheaacd_add32_sat(x1i, x3i);
913           x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
914           x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
915 
916           x0r = ixheaacd_add32_sat(x0r, x1r);
917           x0i = ixheaacd_add32_sat(x0i, x1i);
918           x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
919           x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
920           x2r = ixheaacd_add32_sat(x2r, x3i);
921           x2i = ixheaacd_sub32_sat(x2i, x3r);
922           x3i = ixheaacd_sub32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
923           x3r = ixheaacd_add32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
924 
925           *data = x0r;
926           *(data + 1) = x0i;
927           data += (del << 1);
928 
929           *data = x2r;
930           *(data + 1) = x2i;
931           data += (del << 1);
932 
933           *data = x1r;
934           *(data + 1) = x1i;
935           data += (del << 1);
936 
937           *data = x3i;
938           *(data + 1) = x3r;
939           data += (del << 1);
940         }
941         data -= 2 * npoints;
942         data += 2;
943       }
944       for (; j < nodespacing * del; j += nodespacing) {
945         w1h = *(twiddles + 2 * j);
946         w2h = *(twiddles + 2 * (j << 1) - 512);
947         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 1024);
948         w1l = *(twiddles + 2 * j + 1);
949         w2l = *(twiddles + 2 * (j << 1) - 511);
950         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 1023);
951 
952         for (k = in_loop_cnt; k != 0; k--) {
953           WORD32 tmp;
954           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
955 
956           data += (del << 1);
957 
958           x1r = *data;
959           x1i = *(data + 1);
960           data += (del << 1);
961 
962           x2r = *data;
963           x2i = *(data + 1);
964           data += (del << 1);
965 
966           x3r = *data;
967           x3i = *(data + 1);
968           data -= 3 * (del << 1);
969 
970           tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x1r, w1l),
971                                    ixheaacd_mult32_sat(x1i, w1h));
972           x1i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x1r, w1h), x1i, w1l);
973           x1r = tmp;
974 
975           tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x2r, w2h),
976                                    ixheaacd_mult32_sat(x2i, w2l));
977           x2i = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x2i, w2h),
978                                    ixheaacd_mult32_sat(x2r, w2l));
979           x2r = tmp;
980 
981           tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x3i, w3h),
982                                    ixheaacd_mult32_sat(x3r, w3l));
983           x3i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x3r, w3h), x3i, w3l);
984           x3r = tmp;
985 
986           x0r = (*data);
987           x0i = (*(data + 1));
988 
989           x0r = ixheaacd_add32_sat(x0r, x2r);
990           x0i = ixheaacd_add32_sat(x0i, x2i);
991           x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
992           x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
993           x1r = ixheaacd_add32_sat(x1r, x3r);
994           x1i = ixheaacd_sub32_sat(x1i, x3i);
995           x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
996           x3i = ixheaacd_add32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
997 
998           x0r = ixheaacd_add32_sat(x0r, x1r);
999           x0i = ixheaacd_add32_sat(x0i, x1i);
1000           x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
1001           x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
1002           x2r = ixheaacd_add32_sat(x2r, x3i);
1003           x2i = ixheaacd_sub32_sat(x2i, x3r);
1004           x3i = ixheaacd_sub32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
1005           x3r = ixheaacd_add32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
1006 
1007           *data = x0r;
1008           *(data + 1) = x0i;
1009           data += (del << 1);
1010 
1011           *data = x2r;
1012           *(data + 1) = x2i;
1013           data += (del << 1);
1014 
1015           *data = x1r;
1016           *(data + 1) = x1i;
1017           data += (del << 1);
1018 
1019           *data = x3i;
1020           *(data + 1) = x3r;
1021           data += (del << 1);
1022         }
1023         data -= 2 * npoints;
1024         data += 2;
1025       }
1026       nodespacing >>= 2;
1027       del <<= 2;
1028       in_loop_cnt >>= 2;
1029     }
1030     if (not_power_4) {
1031       const WORD32 *twiddles = ptr_w;
1032       nodespacing <<= 1;
1033       shift += 1;
1034 
1035       for (j = del / 2; j != 0; j--) {
1036         WORD32 w1h = *twiddles;
1037         WORD32 w1l = *(twiddles + 1);
1038         WORD32 tmp;
1039         twiddles += nodespacing * 2;
1040 
1041         x0r = *ptr_y;
1042         x0i = *(ptr_y + 1);
1043         ptr_y += (del << 1);
1044 
1045         x1r = *ptr_y;
1046         x1i = *(ptr_y + 1);
1047 
1048         tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x1r, w1l),
1049                                  ixheaacd_mult32_sat(x1i, w1h));
1050         x1i = ixheaacd_mac32_sat(ixheaacd_mult32_sat(x1r, w1h), x1i, w1l);
1051         x1r = tmp;
1052 
1053         *ptr_y = (x0r) / 2 - (x1r) / 2;
1054         *(ptr_y + 1) = (x0i) / 2 - (x1i) / 2;
1055         ptr_y -= (del << 1);
1056 
1057         *ptr_y = (x0r) / 2 + (x1r) / 2;
1058         *(ptr_y + 1) = (x0i) / 2 + (x1i) / 2;
1059         ptr_y += 2;
1060       }
1061       twiddles = ptr_w;
1062       for (j = del / 2; j != 0; j--) {
1063         WORD32 w1h = *twiddles;
1064         WORD32 w1l = *(twiddles + 1);
1065         WORD32 tmp;
1066         twiddles += nodespacing * 2;
1067 
1068         x0r = *ptr_y;
1069         x0i = *(ptr_y + 1);
1070         ptr_y += (del << 1);
1071 
1072         x1r = *ptr_y;
1073         x1i = *(ptr_y + 1);
1074 
1075         tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x1r, w1h),
1076                                  ixheaacd_mult32_sat(x1i, w1l));
1077         x1i = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x1i, w1h),
1078                                  ixheaacd_mult32_sat(x1r, w1l));
1079         x1r = tmp;
1080 
1081         *ptr_y = (x0r) / 2 - (x1r) / 2;
1082         *(ptr_y + 1) = (x0i) / 2 - (x1i) / 2;
1083         ptr_y -= (del << 1);
1084 
1085         *ptr_y = (x0r) / 2 + (x1r) / 2;
1086         *(ptr_y + 1) = (x0i) / 2 + (x1i) / 2;
1087         ptr_y += 2;
1088       }
1089     }
1090 
1091   }
1092 
1093   else {
1094     ptr_w = ixheaacd_twiddle_table_fft_32x32;
1095 
1096     for (i = 0; i < npoints; i += 4) {
1097       WORD32 *inp = ptr_x;
1098 
1099       DIG_REV(i, dig_rev_shift, h2);
1100       if (not_power_4) {
1101         h2 += 1;
1102         h2 &= ~1;
1103       }
1104       inp += (h2);
1105 
1106       x0r = *inp;
1107       x0i = *(inp + 1);
1108       inp += (npoints >> 1);
1109 
1110       x1r = *inp;
1111       x1i = *(inp + 1);
1112       inp += (npoints >> 1);
1113 
1114       x2r = *inp;
1115       x2i = *(inp + 1);
1116       inp += (npoints >> 1);
1117 
1118       x3r = *inp;
1119       x3i = *(inp + 1);
1120 
1121       x0r = ixheaacd_add32_sat(x0r, x2r);
1122       x0i = ixheaacd_add32_sat(x0i, x2i);
1123       x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
1124       x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
1125       x1r = ixheaacd_add32_sat(x1r, x3r);
1126       x1i = ixheaacd_add32_sat(x1i, x3i);
1127       x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
1128       x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
1129 
1130       x0r = ixheaacd_add32_sat(x0r, x1r);
1131       x0i = ixheaacd_add32_sat(x0i, x1i);
1132       x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
1133       x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
1134       x2r = ixheaacd_sub32_sat(x2r, x3i);
1135       x2i = ixheaacd_add32_sat(x2i, x3r);
1136       x3i = ixheaacd_add32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
1137       x3r = ixheaacd_sub32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
1138 
1139       *ptr_y++ = x0r;
1140       *ptr_y++ = x0i;
1141       *ptr_y++ = x2r;
1142       *ptr_y++ = x2i;
1143       *ptr_y++ = x1r;
1144       *ptr_y++ = x1i;
1145       *ptr_y++ = x3i;
1146       *ptr_y++ = x3r;
1147     }
1148     ptr_y -= 2 * npoints;
1149     del = 4;
1150     nodespacing = 64;
1151     in_loop_cnt = npoints >> 4;
1152     for (i = n_stages - 1; i > 0; i--) {
1153       const WORD32 *twiddles = ptr_w;
1154       WORD32 *data = ptr_y;
1155       WORD32 w1h, w2h, w3h, w1l, w2l, w3l;
1156       WORD32 sec_loop_cnt;
1157 
1158       for (k = in_loop_cnt; k != 0; k--) {
1159         x0r = (*data);
1160         x0i = (*(data + 1));
1161         data += (del << 1);
1162 
1163         x1r = (*data);
1164         x1i = (*(data + 1));
1165         data += (del << 1);
1166 
1167         x2r = (*data);
1168         x2i = (*(data + 1));
1169         data += (del << 1);
1170 
1171         x3r = (*data);
1172         x3i = (*(data + 1));
1173         data -= 3 * (del << 1);
1174 
1175         x0r = ixheaacd_add32_sat(x0r, x2r);
1176         x0i = ixheaacd_add32_sat(x0i, x2i);
1177         x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
1178         x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
1179         x1r = ixheaacd_add32_sat(x1r, x3r);
1180         x1i = ixheaacd_add32_sat(x1i, x3i);
1181         x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
1182         x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
1183 
1184         x0r = ixheaacd_add32_sat(x0r, x1r);
1185         x0i = ixheaacd_add32_sat(x0i, x1i);
1186         x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
1187         x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
1188         x2r = ixheaacd_sub32_sat(x2r, x3i);
1189         x2i = ixheaacd_add32_sat(x2i, x3r);
1190         x3i = ixheaacd_add32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
1191         x3r = ixheaacd_sub32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
1192 
1193         *data = x0r;
1194         *(data + 1) = x0i;
1195         data += (del << 1);
1196 
1197         *data = x2r;
1198         *(data + 1) = x2i;
1199         data += (del << 1);
1200 
1201         *data = x1r;
1202         *(data + 1) = x1i;
1203         data += (del << 1);
1204 
1205         *data = x3i;
1206         *(data + 1) = x3r;
1207         data += (del << 1);
1208       }
1209       data = ptr_y + 2;
1210 
1211       sec_loop_cnt = (nodespacing * del);
1212       sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) -
1213                      (sec_loop_cnt / 16) + (sec_loop_cnt / 32) -
1214                      (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
1215                      (sec_loop_cnt / 256);
1216       j = nodespacing;
1217 
1218       for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
1219         w1h = *(twiddles + 2 * j);
1220         w2h = *(twiddles + 2 * (j << 1));
1221         w3h = *(twiddles + 2 * j + 2 * (j << 1));
1222         w1l = *(twiddles + 2 * j + 1);
1223         w2l = *(twiddles + 2 * (j << 1) + 1);
1224         w3l = *(twiddles + 2 * j + 2 * (j << 1) + 1);
1225 
1226         for (k = in_loop_cnt; k != 0; k--) {
1227           WORD32 tmp;
1228           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
1229 
1230           data += (del << 1);
1231 
1232           x1r = *data;
1233           x1i = *(data + 1);
1234           data += (del << 1);
1235 
1236           x2r = *data;
1237           x2i = *(data + 1);
1238           data += (del << 1);
1239 
1240           x3r = *data;
1241           x3i = *(data + 1);
1242           data -= 3 * (del << 1);
1243 
1244           tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x1r, w1l),
1245                                    ixheaacd_mult32_sat(x1i, w1h));
1246           x1i = ixheaacd_mac32_sat(-ixheaacd_mult32_sat(x1r, w1h), x1i, w1l);
1247           x1r = tmp;
1248 
1249           tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x2r, w2l),
1250                                    ixheaacd_mult32_sat(x2i, w2h));
1251           x2i = ixheaacd_mac32_sat(-ixheaacd_mult32_sat(x2r, w2h), x2i, w2l);
1252           x2r = tmp;
1253 
1254           tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x3r, w3l),
1255                                    ixheaacd_mult32_sat(x3i, w3h));
1256           x3i = ixheaacd_mac32_sat(-ixheaacd_mult32_sat(x3r, w3h), x3i, w3l);
1257           x3r = tmp;
1258 
1259           x0r = (*data);
1260           x0i = (*(data + 1));
1261 
1262           x0r = ixheaacd_add32_sat(x0r, x2r);
1263           x0i = ixheaacd_add32_sat(x0i, x2i);
1264           x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
1265           x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
1266           x1r = ixheaacd_add32_sat(x1r, x3r);
1267           x1i = ixheaacd_add32_sat(x1i, x3i);
1268           x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
1269           x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
1270 
1271           x0r = ixheaacd_add32_sat(x0r, x1r);
1272           x0i = ixheaacd_add32_sat(x0i, x1i);
1273           x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
1274           x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
1275           x2r = ixheaacd_sub32_sat(x2r, x3i);
1276           x2i = ixheaacd_add32_sat(x2i, x3r);
1277           x3i = ixheaacd_add32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
1278           x3r = ixheaacd_sub32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
1279 
1280           *data = x0r;
1281           *(data + 1) = x0i;
1282           data += (del << 1);
1283 
1284           *data = x2r;
1285           *(data + 1) = x2i;
1286           data += (del << 1);
1287 
1288           *data = x1r;
1289           *(data + 1) = x1i;
1290           data += (del << 1);
1291 
1292           *data = x3i;
1293           *(data + 1) = x3r;
1294           data += (del << 1);
1295         }
1296         data -= 2 * npoints;
1297         data += 2;
1298       }
1299       for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
1300         w1h = *(twiddles + 2 * j);
1301         w2h = *(twiddles + 2 * (j << 1));
1302         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
1303         w1l = *(twiddles + 2 * j + 1);
1304         w2l = *(twiddles + 2 * (j << 1) + 1);
1305         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
1306 
1307         for (k = in_loop_cnt; k != 0; k--) {
1308           WORD32 tmp;
1309           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
1310 
1311           data += (del << 1);
1312 
1313           x1r = *data;
1314           x1i = *(data + 1);
1315           data += (del << 1);
1316 
1317           x2r = *data;
1318           x2i = *(data + 1);
1319           data += (del << 1);
1320 
1321           x3r = *data;
1322           x3i = *(data + 1);
1323           data -= 3 * (del << 1);
1324 
1325           tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x1r, w1l),
1326                                    ixheaacd_mult32_sat(x1i, w1h));
1327           x1i = ixheaacd_mac32_sat(-ixheaacd_mult32_sat(x1r, w1h), x1i, w1l);
1328           x1r = tmp;
1329 
1330           tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x2r, w2l),
1331                                    ixheaacd_mult32_sat(x2i, w2h));
1332           x2i = ixheaacd_mac32_sat(-ixheaacd_mult32_sat(x2r, w2h), x2i, w2l);
1333           x2r = tmp;
1334 
1335           tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x3r, w3h),
1336                                    ixheaacd_mult32_sat(x3i, w3l));
1337           x3i = ixheaacd_add32_sat(ixheaacd_mult32_sat(x3r, w3l),
1338                                    ixheaacd_mult32_sat(x3i, w3h));
1339           x3r = tmp;
1340 
1341           x0r = (*data);
1342           x0i = (*(data + 1));
1343 
1344           x0r = ixheaacd_add32_sat(x0r, x2r);
1345           x0i = ixheaacd_add32_sat(x0i, x2i);
1346           x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
1347           x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
1348           x1r = ixheaacd_add32_sat(x1r, x3r);
1349           x1i = ixheaacd_add32_sat(x1i, x3i);
1350           x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
1351           x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
1352 
1353           x0r = ixheaacd_add32_sat(x0r, x1r);
1354           x0i = ixheaacd_add32_sat(x0i, x1i);
1355           x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
1356           x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
1357           x2r = ixheaacd_sub32_sat(x2r, x3i);
1358           x2i = ixheaacd_add32_sat(x2i, x3r);
1359           x3i = ixheaacd_add32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
1360           x3r = ixheaacd_sub32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
1361 
1362           *data = x0r;
1363           *(data + 1) = x0i;
1364           data += (del << 1);
1365 
1366           *data = x2r;
1367           *(data + 1) = x2i;
1368           data += (del << 1);
1369 
1370           *data = x1r;
1371           *(data + 1) = x1i;
1372           data += (del << 1);
1373 
1374           *data = x3i;
1375           *(data + 1) = x3r;
1376           data += (del << 1);
1377         }
1378         data -= 2 * npoints;
1379         data += 2;
1380       }
1381       for (; j <= sec_loop_cnt * 2; j += nodespacing) {
1382         w1h = *(twiddles + 2 * j);
1383         w2h = *(twiddles + 2 * (j << 1) - 512);
1384         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
1385         w1l = *(twiddles + 2 * j + 1);
1386         w2l = *(twiddles + 2 * (j << 1) - 511);
1387         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
1388 
1389         for (k = in_loop_cnt; k != 0; k--) {
1390           WORD32 tmp;
1391           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
1392 
1393           data += (del << 1);
1394 
1395           x1r = *data;
1396           x1i = *(data + 1);
1397           data += (del << 1);
1398 
1399           x2r = *data;
1400           x2i = *(data + 1);
1401           data += (del << 1);
1402 
1403           x3r = *data;
1404           x3i = *(data + 1);
1405           data -= 3 * (del << 1);
1406 
1407           tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x1r, w1l),
1408                                    ixheaacd_mult32_sat(x1i, w1h));
1409           x1i = ixheaacd_mac32_sat(-ixheaacd_mult32_sat(x1r, w1h), x1i, w1l);
1410           x1r = tmp;
1411 
1412           tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x2r, w2h),
1413                                    ixheaacd_mult32_sat(x2i, w2l));
1414           x2i = ixheaacd_add32_sat(ixheaacd_mult32_sat(x2r, w2l),
1415                                    ixheaacd_mult32_sat(x2i, w2h));
1416           x2r = tmp;
1417 
1418           tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x3r, w3h),
1419                                    ixheaacd_mult32_sat(x3i, w3l));
1420           x3i = ixheaacd_add32_sat(ixheaacd_mult32_sat(x3r, w3l),
1421                                    ixheaacd_mult32_sat(x3i, w3h));
1422           x3r = tmp;
1423 
1424           x0r = (*data);
1425           x0i = (*(data + 1));
1426 
1427           x0r = ixheaacd_add32_sat(x0r, x2r);
1428           x0i = ixheaacd_add32_sat(x0i, x2i);
1429           x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
1430           x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
1431           x1r = ixheaacd_add32_sat(x1r, x3r);
1432           x1i = ixheaacd_add32_sat(x1i, x3i);
1433           x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
1434           x3i = ixheaacd_sub32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
1435 
1436           x0r = ixheaacd_add32_sat(x0r, x1r);
1437           x0i = ixheaacd_add32_sat(x0i, x1i);
1438           x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
1439           x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
1440           x2r = ixheaacd_sub32_sat(x2r, x3i);
1441           x2i = ixheaacd_add32_sat(x2i, x3r);
1442           x3i = ixheaacd_add32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
1443           x3r = ixheaacd_sub32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
1444 
1445           *data = x0r;
1446           *(data + 1) = x0i;
1447           data += (del << 1);
1448 
1449           *data = x2r;
1450           *(data + 1) = x2i;
1451           data += (del << 1);
1452 
1453           *data = x1r;
1454           *(data + 1) = x1i;
1455           data += (del << 1);
1456 
1457           *data = x3i;
1458           *(data + 1) = x3r;
1459           data += (del << 1);
1460         }
1461         data -= 2 * npoints;
1462         data += 2;
1463       }
1464       for (; j < nodespacing * del; j += nodespacing) {
1465         w1h = *(twiddles + 2 * j);
1466         w2h = *(twiddles + 2 * (j << 1) - 512);
1467         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 1024);
1468         w1l = *(twiddles + 2 * j + 1);
1469         w2l = *(twiddles + 2 * (j << 1) - 511);
1470         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 1023);
1471 
1472         for (k = in_loop_cnt; k != 0; k--) {
1473           WORD32 tmp;
1474           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
1475 
1476           data += (del << 1);
1477 
1478           x1r = *data;
1479           x1i = *(data + 1);
1480           data += (del << 1);
1481 
1482           x2r = *data;
1483           x2i = *(data + 1);
1484           data += (del << 1);
1485 
1486           x3r = *data;
1487           x3i = *(data + 1);
1488           data -= 3 * (del << 1);
1489 
1490           tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x1r, w1l),
1491                                    ixheaacd_mult32_sat(x1i, w1h));
1492           x1i = ixheaacd_mac32_sat(-ixheaacd_mult32_sat(x1r, w1h), x1i, w1l);
1493           x1r = tmp;
1494 
1495           tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x2r, w2h),
1496                                    ixheaacd_mult32_sat(x2i, w2l));
1497           x2i = ixheaacd_add32_sat(ixheaacd_mult32_sat(x2r, w2l),
1498                                    ixheaacd_mult32_sat(x2i, w2h));
1499           x2r = tmp;
1500 
1501           tmp = -ixheaacd_add32_sat(ixheaacd_mult32_sat(x3r, w3l),
1502                                     ixheaacd_mult32_sat(x3i, w3h));
1503           x3i = ixheaacd_mac32_sat(-ixheaacd_mult32_sat(x3r, w3h), x3i, w3l);
1504           x3r = tmp;
1505 
1506           x0r = (*data);
1507           x0i = (*(data + 1));
1508 
1509           x0r = ixheaacd_add32_sat(x0r, x2r);
1510           x0i = ixheaacd_add32_sat(x0i, x2i);
1511           x2r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x2r, 1));
1512           x2i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x2i, 1));
1513           x1r = ixheaacd_add32_sat(x1r, x3r);
1514           x1i = ixheaacd_sub32_sat(x1i, x3i);
1515           x3r = ixheaacd_sub32_sat(x1r, ixheaacd_shl32_sat(x3r, 1));
1516           x3i = ixheaacd_add32_sat(x1i, ixheaacd_shl32_sat(x3i, 1));
1517 
1518           x0r = ixheaacd_add32_sat(x0r, x1r);
1519           x0i = ixheaacd_add32_sat(x0i, x1i);
1520           x1r = ixheaacd_sub32_sat(x0r, ixheaacd_shl32_sat(x1r, 1));
1521           x1i = ixheaacd_sub32_sat(x0i, ixheaacd_shl32_sat(x1i, 1));
1522           x2r = ixheaacd_sub32_sat(x2r, x3i);
1523           x2i = ixheaacd_add32_sat(x2i, x3r);
1524           x3i = ixheaacd_add32_sat(x2r, ixheaacd_shl32_sat(x3i, 1));
1525           x3r = ixheaacd_sub32_sat(x2i, ixheaacd_shl32_sat(x3r, 1));
1526 
1527           *data = x0r;
1528           *(data + 1) = x0i;
1529           data += (del << 1);
1530 
1531           *data = x2r;
1532           *(data + 1) = x2i;
1533           data += (del << 1);
1534 
1535           *data = x1r;
1536           *(data + 1) = x1i;
1537           data += (del << 1);
1538 
1539           *data = x3i;
1540           *(data + 1) = x3r;
1541           data += (del << 1);
1542         }
1543         data -= 2 * npoints;
1544         data += 2;
1545       }
1546       nodespacing >>= 2;
1547       del <<= 2;
1548       in_loop_cnt >>= 2;
1549     }
1550     if (not_power_4) {
1551       const WORD32 *twiddles = ptr_w;
1552       nodespacing <<= 1;
1553       shift += 1;
1554       for (j = del / 2; j != 0; j--) {
1555         WORD32 w1h = *twiddles;
1556         WORD32 w1l = *(twiddles + 1);
1557 
1558         WORD32 tmp;
1559         twiddles += nodespacing * 2;
1560 
1561         x0r = *ptr_y;
1562         x0i = *(ptr_y + 1);
1563         ptr_y += (del << 1);
1564 
1565         x1r = *ptr_y;
1566         x1i = *(ptr_y + 1);
1567 
1568         tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(x1r, w1l),
1569                                  ixheaacd_mult32_sat(x1i, w1h));
1570         x1i = ixheaacd_mac32_sat(-ixheaacd_mult32_sat(x1r, w1h), x1i, w1l);
1571         x1r = tmp;
1572 
1573         *ptr_y = (x0r) / 2 - (x1r) / 2;
1574         *(ptr_y + 1) = (x0i) / 2 - (x1i) / 2;
1575         ptr_y -= (del << 1);
1576 
1577         *ptr_y = (x0r) / 2 + (x1r) / 2;
1578         *(ptr_y + 1) = (x0i) / 2 + (x1i) / 2;
1579         ptr_y += 2;
1580       }
1581       twiddles = ptr_w;
1582       for (j = del / 2; j != 0; j--) {
1583         WORD32 w1h = *twiddles;
1584         WORD32 w1l = *(twiddles + 1);
1585         WORD32 tmp;
1586         twiddles += nodespacing * 2;
1587 
1588         x0r = *ptr_y;
1589         x0i = *(ptr_y + 1);
1590         ptr_y += (del << 1);
1591 
1592         x1r = *ptr_y;
1593         x1i = *(ptr_y + 1);
1594 
1595         tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(x1r, w1h),
1596                                  ixheaacd_mult32_sat(x1i, w1l));
1597         x1i = ixheaacd_add32_sat(ixheaacd_mult32_sat(x1r, w1l),
1598                                  ixheaacd_mult32_sat(x1i, w1h));
1599         x1r = tmp;
1600 
1601         *ptr_y = (x0r) / 2 - (x1r) / 2;
1602         *(ptr_y + 1) = (x0i) / 2 - (x1i) / 2;
1603         ptr_y -= (del << 1);
1604 
1605         *ptr_y = (x0r) / 2 + (x1r) / 2;
1606         *(ptr_y + 1) = (x0i) / 2 + (x1i) / 2;
1607         ptr_y += 2;
1608       }
1609     }
1610   }
1611 
1612   for (i = 0; i < nlength; i++) {
1613     xr[i] = y[2 * i];
1614     xi[i] = y[2 * i + 1];
1615   }
1616 
1617   *preshift = shift - *preshift;
1618   return;
1619 }
1620 
ixheaacd_complex_3point_fft(WORD32 * inp,WORD32 * op,WORD32 sign_dir)1621 static PLATFORM_INLINE void ixheaacd_complex_3point_fft(WORD32 *inp, WORD32 *op,
1622                                                         WORD32 sign_dir) {
1623   WORD32 add_r, sub_r;
1624   WORD32 add_i, sub_i;
1625   WORD32 temp_real, temp_imag, temp;
1626 
1627   WORD32 p1, p2, p3, p4;
1628 
1629   WORD32 sinmu;
1630   sinmu = -1859775393 * sign_dir;
1631 
1632   temp_real = ixheaacd_add32_sat(inp[0], inp[2]);
1633   temp_imag = ixheaacd_add32_sat(inp[1], inp[3]);
1634 
1635   add_r = ixheaacd_add32_sat(inp[2], inp[4]);
1636   add_i = ixheaacd_add32_sat(inp[3], inp[5]);
1637 
1638   sub_r = ixheaacd_sub32_sat(inp[2], inp[4]);
1639   sub_i = ixheaacd_sub32_sat(inp[3], inp[5]);
1640 
1641   p1 = add_r >> 1;
1642   p4 = add_i >> 1;
1643   p2 = ixheaacd_mult32_shl(sub_i, sinmu);
1644   p3 = ixheaacd_mult32_shl(sub_r, sinmu);
1645 
1646   temp = ixheaacd_sub32(inp[0], p1);
1647 
1648   op[0] = ixheaacd_add32_sat(temp_real, inp[4]);
1649   op[1] = ixheaacd_add32_sat(temp_imag, inp[5]);
1650   op[2] = ixheaacd_add32_sat(temp, p2);
1651   op[3] = ixheaacd_sub32_sat(ixheaacd_sub32_sat(inp[1], p3), p4);
1652   op[4] = ixheaacd_sub32_sat(temp, p2);
1653   op[5] = ixheaacd_sub32_sat(ixheaacd_add32_sat(inp[1], p3), p4);
1654 
1655   return;
1656 }
1657 
ixheaacd_complex_fft_p3(WORD32 * xr,WORD32 * xi,WORD32 nlength,WORD32 fft_mode,WORD32 * preshift)1658 VOID ixheaacd_complex_fft_p3(WORD32 *xr, WORD32 *xi, WORD32 nlength,
1659                              WORD32 fft_mode, WORD32 *preshift) {
1660   WORD32 i, j;
1661   WORD32 shift = 0;
1662   WORD32 xr_3[384];
1663   WORD32 xi_3[384];
1664   WORD32 x[1024];
1665   WORD32 y[1024];
1666   WORD32 cnfac, npts;
1667   WORD32 mpass = nlength;
1668   WORD32 n = 0;
1669   WORD32 *ptr_x = x;
1670   WORD32 *ptr_y = y;
1671 
1672   cnfac = 0;
1673   while (mpass % 3 == 0) {
1674     mpass /= 3;
1675     cnfac++;
1676   }
1677   npts = mpass;
1678 
1679   for (i = 0; i < 3 * cnfac; i++) {
1680     for (j = 0; j < mpass; j++) {
1681       xr_3[j] = xr[3 * j + i];
1682       xi_3[j] = xi[3 * j + i];
1683     }
1684 
1685     (*ixheaacd_complex_fft_p2)(xr_3, xi_3, mpass, fft_mode, &shift);
1686 
1687     for (j = 0; j < mpass; j++) {
1688       xr[3 * j + i] = xr_3[j];
1689       xi[3 * j + i] = xi_3[j];
1690     }
1691   }
1692 
1693   while (npts >> 1) {
1694     n++;
1695     npts = npts >> 1;
1696   }
1697 
1698   if (n % 2 == 0)
1699     shift = ((n + 4)) / 2;
1700   else
1701     shift = ((n + 5) / 2);
1702 
1703   *preshift = shift - *preshift + 1;
1704 
1705   for (i = 0; i < nlength; i++) {
1706     ptr_x[2 * i] = (xr[i] >> 1);
1707     ptr_x[2 * i + 1] = (xi[i] >> 1);
1708   }
1709 
1710   {
1711     const WORD32 *w1r, *w1i;
1712     WORD32 tmp;
1713     w1r = ixheaacd_twiddle_table_3pr;
1714     w1i = ixheaacd_twiddle_table_3pi;
1715 
1716     if (fft_mode < 0) {
1717       for (i = 0; i < nlength; i += 3) {
1718         tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(ptr_x[2 * i], (*w1r)),
1719                                  ixheaacd_mult32_sat(ptr_x[2 * i + 1], (*w1i)));
1720         ptr_x[2 * i + 1] =
1721             ixheaacd_add32_sat(ixheaacd_mult32_sat(ptr_x[2 * i], (*w1i)),
1722                                ixheaacd_mult32_sat(ptr_x[2 * i + 1], (*w1r)));
1723         ptr_x[2 * i] = tmp;
1724 
1725         w1r++;
1726         w1i++;
1727 
1728         tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(ptr_x[2 * i + 2], (*w1r)),
1729                                  ixheaacd_mult32_sat(ptr_x[2 * i + 3], (*w1i)));
1730         ptr_x[2 * i + 3] =
1731             ixheaacd_add32_sat(ixheaacd_mult32_sat(ptr_x[2 * i + 2], (*w1i)),
1732                                ixheaacd_mult32_sat(ptr_x[2 * i + 3], (*w1r)));
1733         ptr_x[2 * i + 2] = tmp;
1734 
1735         w1r++;
1736         w1i++;
1737 
1738         tmp = ixheaacd_sub32_sat(ixheaacd_mult32_sat(ptr_x[2 * i + 4], (*w1r)),
1739                                  ixheaacd_mult32_sat(ptr_x[2 * i + 5], (*w1i)));
1740         ptr_x[2 * i + 5] =
1741             ixheaacd_add32_sat(ixheaacd_mult32_sat(ptr_x[2 * i + 4], (*w1i)),
1742                                ixheaacd_mult32_sat(ptr_x[2 * i + 5], (*w1r)));
1743         ptr_x[2 * i + 4] = tmp;
1744 
1745         w1r += 3 * (128 / mpass - 1) + 1;
1746         w1i += 3 * (128 / mpass - 1) + 1;
1747       }
1748     }
1749 
1750     else {
1751       for (i = 0; i < nlength; i += 3) {
1752         tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(ptr_x[2 * i], (*w1r)),
1753                                  ixheaacd_mult32_sat(ptr_x[2 * i + 1], (*w1i)));
1754         ptr_x[2 * i + 1] =
1755             ixheaacd_sub32_sat(ixheaacd_mult32_sat(ptr_x[2 * i + 1], (*w1r)),
1756                                ixheaacd_mult32_sat(ptr_x[2 * i], (*w1i)));
1757         ptr_x[2 * i] = tmp;
1758 
1759         w1r++;
1760         w1i++;
1761 
1762         tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(ptr_x[2 * i + 2], (*w1r)),
1763                                  ixheaacd_mult32_sat(ptr_x[2 * i + 3], (*w1i)));
1764         ptr_x[2 * i + 3] =
1765             ixheaacd_sub32_sat(ixheaacd_mult32_sat(ptr_x[2 * i + 3], (*w1r)),
1766                                ixheaacd_mult32_sat(ptr_x[2 * i + 2], (*w1i)));
1767         ptr_x[2 * i + 2] = tmp;
1768 
1769         w1r++;
1770         w1i++;
1771 
1772         tmp = ixheaacd_add32_sat(ixheaacd_mult32_sat(ptr_x[2 * i + 4], (*w1r)),
1773                                  ixheaacd_mult32_sat(ptr_x[2 * i + 5], (*w1i)));
1774         ptr_x[2 * i + 5] =
1775             ixheaacd_sub32_sat(ixheaacd_mult32_sat(ptr_x[2 * i + 5], (*w1r)),
1776                                ixheaacd_mult32_sat(ptr_x[2 * i + 4], (*w1i)));
1777         ptr_x[2 * i + 4] = tmp;
1778 
1779         w1r += 3 * (128 / mpass - 1) + 1;
1780         w1i += 3 * (128 / mpass - 1) + 1;
1781       }
1782     }
1783   }
1784 
1785   for (i = 0; i < mpass; i++) {
1786     ixheaacd_complex_3point_fft(ptr_x, ptr_y, fft_mode);
1787 
1788     ptr_x = ptr_x + 6;
1789     ptr_y = ptr_y + 6;
1790   }
1791 
1792   for (i = 0; i < mpass; i++) {
1793     xr[i] = y[6 * i];
1794     xi[i] = y[6 * i + 1];
1795   }
1796 
1797   for (i = 0; i < mpass; i++) {
1798     xr[mpass + i] = y[6 * i + 2];
1799     xi[mpass + i] = y[6 * i + 3];
1800   }
1801 
1802   for (i = 0; i < mpass; i++) {
1803     xr[2 * mpass + i] = y[6 * i + 4];
1804     xi[2 * mpass + i] = y[6 * i + 5];
1805   }
1806   return;
1807 }
1808 
ixheaacd_complex_fft(WORD32 * data_r,WORD32 * data_i,WORD32 nlength,WORD32 fft_mode,WORD32 * preshift)1809 VOID ixheaacd_complex_fft(WORD32 *data_r, WORD32 *data_i, WORD32 nlength,
1810                           WORD32 fft_mode, WORD32 *preshift) {
1811   if (nlength & (nlength - 1)) {
1812     if ((nlength != 24) && (nlength != 48) && (nlength != 96) &&
1813         (nlength != 192) && (nlength != 384)) {
1814       printf("%d point FFT not supported", nlength);
1815       exit(0);
1816     }
1817     ixheaacd_complex_fft_p3(data_r, data_i, nlength, fft_mode, preshift);
1818   } else
1819     (*ixheaacd_complex_fft_p2)(data_r, data_i, nlength, fft_mode, preshift);
1820 
1821   return;
1822 }
1823