.text
.p2align 2
.global ixheaacd_mps_complex_fft_64_asm
.type ixheaacd_mps_complex_fft_64_asm, %function
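@ 64-point complex FFT: one digit-reversed radix-4 first stage into the
@ output buffer, then in-place radix-4 stages over it. Data are
@ interleaved re/im WORD32; twiddle multiplies keep bits 62..31 of the
@ 64-bit product (a Q31-style fractional multiply).
@
@ Hedged reading of the interface (names are descriptive, not taken
@ from a header):
@   r0   -> twiddle table (pairs of WORD32 per index)
@   r1   -> npoints (64)
@   r2   -> input buffer (re, im interleaved)
@   r3   -> output/work buffer ptr_y
@   [sp] -> digit-reversal table (bytes)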

ixheaacd_mps_complex_fft_64_asm:
    @LDR    r4,[sp]
    STMFD           sp!, {r0-r12, lr}
    LDR             r4, [sp, #0x38]     @digit-reversal table (first stack arg)@
    SUB             sp, sp, #0x44
    LDR             r0, [sp, #0x48]     @npoints@
    EOR             r0, r0, r0, ASR #31
    CLZ             r0, r0
    SUB             r12, r0, #16        @dig_rev_shift = norm32(npoints) + 1 - 16@
    SUB             r0, r0, #1          @norm32(npoints)@
    RSB             r0, r0, #0x1e       @r0 = log2(npoints)@
    AND             r1, r0, #1          @1 if a trailing radix-2 stage would be needed@
    STR             r1, [sp, #0x30]
    MOV             r1, r0, ASR #1      @number of radix-4 stages@
    LDR             r0, [sp, #0x48]     @npoints@
    STR             r1, [sp, #0x18]
    MOV             lr, r0, LSL #1      @(npoints >> 1) * 4@
    MOV             r0, #0
    MOV             r12, r4             @r12 = digit-reversal table@
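
@ First stage: 16 radix-4 butterflies on digit-reversed input, results
@ streamed out via STMIA. Per butterfly (pseudocode matching the
@ comments below; the final pair is the +/-j rotation, which swaps the
@ re and im parts of d):
@   a = x0 + x2          b = x0 - x2
@   c = x1 + x3          d = x1 - x3
@   y0 = a + c           y1 = a - c
@   y2 = b - j*d         y3 = b + j*d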
FIRST_STAGE_R4:
    LDRB            r10, [r12, r0, LSR #2]

    ADD             r1, r2, r10, LSL #2
    LDRD            r4, [r1]            @r4=x0r,  r5=x0i
    ADD             r1, r1, lr
    LDRD            r8, [r1]            @r8=x1r,  r9=x1i
    ADD             r1, r1, lr
    LDRD            r6, [r1]            @r6=x2r,  r7=x2i
    ADD             r1, r1, lr
    LDRD            r10, [r1]           @r10=x3r, r11=x3i
    ADD             r0, r0, #4
    CMP             r0, lr, ASR #1

    ADD             r4, r4, r6          @x0r = x0r + x2r@
    ADD             r5, r5, r7          @x0i = x0i + x2i@
    SUB             r6, r4, r6, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r7, r5, r7, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r8, r8, r10         @x1r = x1r + x3r@
    ADD             r9, r9, r11         @x1i = x1i + x3i@
    SUB             r1, r8, r10, lsl#1  @x3r = x1r - (x3r << 1)@
    SUB             r11, r9, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r8          @x0r = x0r + x1r@
    ADD             r5, r5, r9          @x0i = x0i + x1i@
    SUB             r8, r4, r8, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r9, r5, r9, lsl#1   @x1i = x0i - (x1i << 1)@
    ADD             r6, r6, r11         @x2r = x2r + x3i@
    SUB             r7, r7, r1          @x2i = x2i - x3r@
    SUB             r10, r6, r11, lsl#1 @x3i = x2r - (x3i << 1)@
    ADD             r11, r7, r1, lsl#1  @x3r = x2i + (x3r << 1)@

    STMIA           r3!, {r4-r11}
    BLT             FIRST_STAGE_R4
    LDR             r1, [sp, #0x18]
    LDR             r0, [sp, #0x48]
    MOV             r12, #0x40          @nodespacing = 64@
    STR             r12, [sp, #0x38]
    LDR             r12, [sp, #0x48]
    SUB             r3, r3, r0, LSL #3  @rewind ptr_y to the start@
    SUBS            r1, r1, #1          @radix-4 stages remaining after the first@
    STR             r3, [sp, #0x50]
    MOV             r4, r12, ASR #4     @butterflies per twiddle index = npoints >> 4@
    MOV             r0, #4              @del = 4@
    STR             r4, [sp, #0x34]
    STR             r1, [sp, #0x3c]
    BLE             EXIT
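
@ Remaining stages. Per stage: r0 = del (group spacing in complex
@ elements), [sp,#0x38] = nodespacing (twiddle index step, starts at
@ 64), [sp,#0x34] = butterflies per twiddle index, and [sp,#0x3c]
@ counts the stages left.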
OUTER_LOOP:
    LDR             r1, [sp, #0x44]
    LDR             r12, [sp, #0x50]    @WORD32 *data = ptr_y@
    STR             r1, [sp, #0x2c]
    LDR             r1, [sp, #0x34]

    MOV             r0, r0, LSL #3      @(del<<1) * 4@
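@ j = 0 butterflies of this stage: the twiddle is (1, 0), so the
@ radix-4 butterfly needs no multiplies. The commented-out ASR #1
@ lines are a disabled per-pass scaling option.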
LOOP_TRIVIAL_TWIDDLE:
    LDRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0
    LDRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    LDRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    LDRD            r10, [r12]          @r10=x3r, r11=x3i

@MOV    r4,r4,ASR #1
@MOV    r5,r5,ASR #1
@MOV    r6,r6,ASR #1
@MOV    r7,r7,ASR #1
@MOV    r8,r8,ASR #1
@MOV    r9,r9,ASR #1
@MOV    r10,r10,ASR #1
@MOV    r11,r11,ASR #1

    ADD             r4, r4, r8          @x0r = x0r + x2r@
    ADD             r5, r5, r9          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl #1  @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl #1  @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r2, r6, r10, lsl #1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl #1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
@MOV    r4,r4,ASR #1
@MOV    r5,r5,ASR #1
    SUB             r6, r4, r6, lsl #1  @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl #1  @x1i = x0i - (x1i << 1)@
    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r2          @x2i = x2i - x3r@
    SUB             r10, r8, r11, lsl#1 @x3i = x2r - (x3i << 1)@
    ADD             r11, r9, r2, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r10, [r12]          @r10=x3i, r11=x3r
    SUB             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    SUB             r12, r12, r0
    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    SUB             r12, r12, r0
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0, lsl #2

    SUBS            r1, r1, #1
    BNE             LOOP_TRIVIAL_TWIDDLE
    MOV             r0, r0, ASR #3      @back to del@
    LDR             r4, [sp, #0x38]
    LDR             r3, [sp, #0x50]
    MUL             r1, r0, r4          @r1 = del * nodespacing@
    ADD             r12, r3, #8         @data = ptr_y + one complex element@
    STR             r1, [sp, #0x40]
    MOV             r3, r1, ASR #2
    ADD             r3, r3, r1, ASR #3
    SUB             r3, r3, r1, ASR #4
    ADD             r3, r3, r1, ASR #5
    SUB             r3, r3, r1, ASR #6
    ADD             r3, r3, r1, ASR #7
    SUB             r3, r3, r1, ASR #8  @r3 ~= r1 / 3@
    STR             r3, [sp, #0x18]
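
@ j != 0 butterflies. The shift/add series above forms
@ [sp,#0x18] ~= (del * nodespacing) / 3: the point where the w3 index
@ 3*j runs off the end of the twiddle table. SECOND_LOOP ..
@ SECOND_LOOP_4 split the j range at r1/3, r1/2 and 2*r1/3 so each
@ variant can rewind the table base (SUB #2048) where 3*j, and later
@ 2*j, have wrapped, and fix up the product signs to match.
@
@ Fractional multiply used throughout (hedged C equivalent):
@   (WORD32)(((WORD64)a * b) >> 31)
@ realised as SMULL lo, hi, a, b ; LSR lo, lo, #31 ;
@ ORR hi, lo, hi, LSL #1. Each complex multiply x*W is four such
@ multiplies plus one add and one subtract.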
SECOND_LOOP:
    LDR             r3, [sp, #0x2c]
    LDR             r14, [sp, #0x34]
    MOV             r0, r0, LSL #3      @(del<<1) * 4@
    LDR             r1, [r3, r4, LSL #3]! @w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #0x24]
    STR             r1, [sp, #0x14]
    STR             r2, [sp, #0x10]
    STR             r5, [sp, #0x0c]
    STR             r6, [sp, #0x08]
    STR             r7, [sp, #0x04]
    STR             r8, [sp]
RADIX4_BFLY:

    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #0x14]
    LDR             r2, [sp, #0x10]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h), x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6          @x1i*w1l + x1r*w1h@
    SUB             r6, r4, r5          @x1r*w1l - x1i*w1h@

    LDR             r1, [sp, #0x0c]
    LDR             r2, [sp, #0x08]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h), x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r9, r9, r8          @x2i*w2l + x2r*w2h@
    SUB             r8, r4, r5          @x2r*w2l - x2i*w2h@

    LDR             r1, [sp, #0x04]
    LDR             r2, [sp]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h), x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r11, r11, r10       @x3i*w3l + x3r*w3h@
    SUB             r10, r4, r5         @x3r*w3l - x3i*w3h@

    @SUB   r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]! @r4=x0r (rewind to x0)@
    LDR             r5, [r12, #0x04]    @r5=x0i@

    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i,  r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY
    MOV             r0, r0, ASR #3      @back to del@

    LDR             r1, [sp, #0x48]
    LDR             r4, [sp, #0x24]
    SUB             r1, r12, r1, LSL #3 @rewind data past the npoints just processed@
    LDR             r6, [sp, #0x38]
    ADD             r12, r1, #8         @advance one complex element to the next j column@
    LDR             r7, [sp, #0x18]
    ADD             r4, r4, r6          @j += nodespacing@
    CMP             r4, r7              @while j <= (del * nodespacing) / 3@
    BLE             SECOND_LOOP
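
@ Second j range (r1/3 < j <= r1/2): the w3 index 3*j has passed the
@ end of the twiddle table, so the base is pulled back 2048 bytes
@ before the w3 load, and the x3 products below are recombined with
@ swapped/negated roles to compensate.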
SECOND_LOOP_2:
    LDR             r3, [sp, #0x2c]
    LDR             r14, [sp, #0x34]
    MOV             r0, r0, LSL #3      @(del<<1) * 4@

    LDR             r1, [r3, r4, LSL #3]! @w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    SUB             r3, r3, #2048       @rewind table base (512 * 4 bytes)@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #0x24]

    STR             r1, [sp, #0x14]
    STR             r2, [sp, #0x10]
    STR             r5, [sp, #0x0c]
    STR             r6, [sp, #0x08]
    STR             r7, [sp, #0x04]
    STR             r8, [sp]
RADIX4_BFLY_2:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1
    LDR             r1, [sp, #0x14]
    LDR             r2, [sp, #0x10]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h), x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6          @x1i*w1l + x1r*w1h@
    SUB             r6, r4, r5          @x1r*w1l - x1i*w1h@

    LDR             r1, [sp, #0x0c]
    LDR             r2, [sp, #0x08]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h), x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r9, r9, r8          @x2i*w2l + x2r*w2h@
    SUB             r8, r4, r5          @x2r*w2l - x2i*w2h@

    LDR             r1, [sp, #0x04]
    LDR             r2, [sp]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h), x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r10, r11, r10       @x3i*w3l + x3r*w3h (w3 wrapped)@
    SUB             r11, r5, r4         @x3i*w3h - x3r*w3l@

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]! @r4=x0r (rewind to x0)@
    LDR             r5, [r12, #0x04]    @r5=x0i@

    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i,  r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_2
    MOV             r0, r0, ASR #3      @back to del@

    LDR             r1, [sp, #0x48]
    LDR             r4, [sp, #0x24]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x38]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x40]
    ADD             r4, r4, r6          @j += nodespacing@
    CMP             r4, r7, ASR #1      @while j <= (del * nodespacing) / 2@
    BLE             SECOND_LOOP_2
    LDR             r7, [sp, #0x18]
    CMP             r4, r7, LSL #1
    BGT             SECOND_LOOP_4       @skip SECOND_LOOP_3 if j is already past the 2/3 point@
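
@ Third j range (r1/2 < j <= 2*r1/3): now the w2 index 2*j has also
@ wrapped, so the base is rewound once, before the w2 load; the x2 and
@ x3 recombines below are adjusted to match.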
SECOND_LOOP_3:
    LDR             r3, [sp, #0x2c]
    LDR             r14, [sp, #0x34]
    MOV             r0, r0, LSL #3      @(del<<1) * 4@

    LDR             r1, [r3, r4, LSL #3]! @w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    SUB             r3, r3, #2048       @rewind table base (512 * 4 bytes)@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #0x24]
    STR             r1, [sp, #0x14]
    STR             r2, [sp, #0x10]
    STR             r5, [sp, #0x0c]
    STR             r6, [sp, #0x08]
    STR             r7, [sp, #0x04]
    STR             r8, [sp]

RADIX4_BFLY_3:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #0x14]
    LDR             r2, [sp, #0x10]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h), x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6          @x1i*w1l + x1r*w1h@
    SUB             r6, r4, r5          @x1r*w1l - x1i*w1h@

    LDR             r1, [sp, #0x0c]
    LDR             r2, [sp, #0x08]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h), x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r8, r9, r8          @x2i*w2l + x2r*w2h (w2 wrapped)@
    SUB             r9, r5, r4          @x2i*w2h - x2r*w2l@

    LDR             r1, [sp, #0x04]
    LDR             r2, [sp]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h), x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r10, r11, r10       @x3i*w3l + x3r*w3h (w3 wrapped)@
    SUB             r11, r5, r4         @x3i*w3h - x3r*w3l@

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]! @r4=x0r (rewind to x0)@
    LDR             r5, [r12, #0x04]    @r5=x0i@

    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i,  r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_3
    MOV             r0, r0, ASR #3      @back to del@

    LDR             r1, [sp, #0x48]
    LDR             r4, [sp, #0x24]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x38]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x18]
    ADD             r4, r4, r6          @j += nodespacing@
    CMP             r4, r7, LSL #1      @while j <= 2 * (del * nodespacing) / 3@
    BLE             SECOND_LOOP_3

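@ Fourth j range (j > 2*r1/3): both the w2 and w3 indices have wrapped,
@ so the base is rewound before each of the w2 and w3 loads; note the
@ sign changes on the x1/x3 terms in the butterfly below.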
SECOND_LOOP_4:
    LDR             r3, [sp, #0x2c]
    LDR             r14, [sp, #0x34]
    MOV             r0, r0, LSL #3      @(del<<1) * 4@

    LDR             r1, [r3, r4, LSL #3]! @w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    SUB             r3, r3, #2048       @rewind table base (512 * 4 bytes)@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    SUB             r3, r3, #2048       @rewind again for the w3 index@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #0x24]
    STR             r1, [sp, #0x14]
    STR             r2, [sp, #0x10]
    STR             r5, [sp, #0x0c]
    STR             r6, [sp, #0x08]
    STR             r7, [sp, #0x04]
    STR             r8, [sp]
RADIX4_BFLY_4:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #0x14]
    LDR             r2, [sp, #0x10]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h), x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6          @x1i*w1l + x1r*w1h@
    SUB             r6, r4, r5          @x1r*w1l - x1i*w1h@

    LDR             r1, [sp, #0x0c]
    LDR             r2, [sp, #0x08]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h), x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r8, r9, r8          @x2i*w2l + x2r*w2h (w2 wrapped)@
    SUB             r9, r5, r4          @x2i*w2h - x2r*w2l@

    LDR             r1, [sp, #0x04]
    LDR             r2, [sp]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h), x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r11, r11, r10       @x3i*w3l + x3r*w3h@
    SUB             r10, r5, r4         @x3i*w3h - x3r*w3l (w3 wrapped)@

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]! @r4=x0r (rewind to x0)@
    LDR             r5, [r12, #0x04]    @r5=x0i@

    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    SUB             r7, r7, r11         @x1i = x1i - x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    ADD             r11, r7, r11, lsl#1 @x3i = x1i + (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0
    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i,  r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_4
    MOV             r0, r0, ASR #3      @back to del@

    LDR             r1, [sp, #0x48]
    LDR             r4, [sp, #0x24]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x38]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x40]
    ADD             r4, r4, r6          @j += nodespacing@
    CMP             r4, r7              @while j < del * nodespacing@
    BLT             SECOND_LOOP_4
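
@ End of stage: del grows by 4 while nodespacing and the per-twiddle
@ butterfly count shrink by 4; loop until all radix-4 stages are done.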
    LDR             r1, [sp, #0x38]
    MOV             r0, r0, LSL #2      @del <<= 2@
    MOV             r1, r1, ASR #2
    STR             r1, [sp, #0x38]     @nodespacing >>= 2@
    LDR             r1, [sp, #0x34]
    MOV             r1, r1, ASR #2
    STR             r1, [sp, #0x34]     @butterflies per twiddle index >>= 2@
    LDR             r1, [sp, #0x3c]
    SUBS            r1, r1, #1
    STR             r1, [sp, #0x3c]     @stages left@
    BGT             OUTER_LOOP
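
@ A trailing radix-2 stage would only be needed when log2(npoints) is
@ odd. For npoints = 64 the flag at [sp,#0x30] is 0 and control falls
@ through to EXIT; the setup below appears to be retained from a
@ generic power-of-2 version, with no radix-2 butterfly body following
@ it here.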
    LDR             r1, [sp, #0x30]
    CMP             r1, #0
    BEQ             EXIT
    LDR             r12, [sp, #0x38]
    LDR             r1, [sp, #0x44]
    CMP             r12, #0
    MOVEQ           r4, #1
    MOVNE           r4, r12, LSL #1
    MOVS            r3, r0
    BEQ             EXIT

    MOV             r3, r3, ASR #1
    LDR             r5, [sp, #0x50]
    MOV             r0, r0, LSL #3      @(del<<1) * 4@
    STR             r1, [sp, #0x18]

EXIT:
    ADD             sp, sp, #0x54       @0x44 locals + saved r0-r3 still on the stack@
    LDMFD           sp!, {r4-r12, pc}