.text
.p2align 2
.global ixheaacd_complex_ifft_p2_asm
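
@ ixheaacd_complex_ifft_p2_asm: in-place complex IFFT for power-of-two
@ lengths, built from radix-4 stages plus one final radix-2 stage when
@ log2(npoints) is odd. The register roles below are inferred from how the
@ code uses them, not from a published prototype:
@   r0 = twiddle table        (saved, re-read from [sp, #0x44])
@   r1 = npoints              (saved, re-read from [sp, #0x48])
@   r2 = input buffer         (read digit-reversed in FIRST_STAGE_R4)
@   r3 = output buffer ptr_y  (saved, re-read from [sp, #0x50])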

ixheaacd_complex_ifft_p2_asm:
    STMFD           sp!, {r0-r12, lr}
    SUB             sp, sp, #0x44
    LDR             r0, [sp, #0x48]
    EOR             r0, r0, r0, ASR #31
    CLZ             r0, r0
    SUB             r12, r0, #16        @dig_rev_shift = norm32(npoints) + 1 - 16@
    SUB             r0, r0, #1
    RSB             r0, r0, #0x1e
    AND             r1, r0, #1
    STR             r1, [sp, #0x30]
    MOV             r1, r0, ASR #1
    LDR             r0, [sp, #0x48]     @npoints@
    STR             r1, [sp, #0x18]
    MOV             lr, r0, LSL #1      @(npoints >> 1) * 4
    MOV             r0, #0

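@ FIRST_STAGE_R4 walks the input in radix-4 digit-reversed order. The mask
@ sequence below appears to reverse the 2-bit digits of the loop counter
@ within a 16-bit field (swap adjacent 2-bit pairs, then nibbles, then
@ bytes) and shift the result down by dig_rev_shift; when the stage count
@ is odd, the index is additionally rounded down to an even value.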
FIRST_STAGE_R4:
    MOVW            r4, #0x3333
    MOVT            r4, #0x3333
    MOVW            r5, #0x0F0F
    MOVT            r5, #0x0F0F
    AND             r6, r4, r0
    AND             r7, r4, r0, LSR #2
    ORR             r4, r7, r6, LSL #2
    AND             r6, r5, r4
    AND             r7, r5, r4, LSR #4
    ORR             r4, r7, r6, LSL #4
    BIC             r6, r4, #0x0000FF00
    BIC             r7, r4, #0x00FF0000
    MOV             r7, r7, LSR #8
    ORR             r4, r7, r6, LSL #8
    LDR             r5, [sp, #0x30]
    MOV             r10, r4, LSR r12
    CMP             r5, #0
    ADDNE           r10, r10, #1
    BICNE           r10, r10, #1

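    @ One radix-4 butterfly per iteration: the four legs sit npoints/4
    @ complex samples apart, i.e. lr = npoints*2 bytes. The first stage
    @ needs no twiddle multiplies; the x2/x3 cross terms below supply the
    @ multiply by j with the sign of the inverse transform.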
    ADD             r1, r2, r10, LSL #2
    LDRD            r4, [r1]            @r4=x0r,  r5=x0i
    ADD             r1, r1, lr
    LDRD            r8, [r1]            @r8=x1r,  r9=x1i
    ADD             r1, r1, lr
    LDRD            r6, [r1]            @r6=x2r,  r7=x2i
    ADD             r1, r1, lr
    LDRD            r10, [r1]           @r10=x3r, r11=x3i
    ADD             r0, r0, #4
    CMP             r0, lr, ASR #1

    ADD             r4, r4, r6          @x0r = x0r + x2r@
    ADD             r5, r5, r7          @x0i = x0i + x2i@
    SUB             r6, r4, r6, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r7, r5, r7, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r8, r8, r10         @x1r = x1r + x3r@
    ADD             r9, r9, r11         @x1i = x1i + x3i@
    SUB             r1, r8, r10, lsl#1  @x3r = x1r - (x3r << 1)@
    SUB             r11, r9, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r8          @x0r = x0r + x1r@
    ADD             r5, r5, r9          @x0i = x0i + x1i@
    SUB             r8, r4, r8, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r9, r5, r9, lsl#1   @x1i = x0i - (x1i << 1)@
    SUB             r6, r6, r11         @x2r = x2r - x3i@
    ADD             r7, r7, r1          @x2i = x2i + x3r@
    ADD             r10, r6, r11, lsl#1 @x3i = x2r + (x3i << 1)@
    SUB             r11, r7, r1, lsl#1  @x3r = x2i - (x3r << 1)@

    STMIA           r3!, {r4-r11}
    BLT             FIRST_STAGE_R4
    LDR             r1, [sp, #0x18]
    LDR             r0, [sp, #0x48]
    MOV             r12, #0x40          @nodespacing = 64@
    STR             r12, [sp, #0x38]
    LDR             r12, [sp, #0x48]
    SUB             r3, r3, r0, LSL #3
    SUBS            r1, r1, #1
    STR             r3, [sp, #0x50]
    MOV             r4, r12, ASR #4
    MOV             r0, #4
    STR             r4, [sp, #0x34]
    STR             r1, [sp, #0x3c]
    BLE             RADIX2
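
@ Each OUTER_LOOP pass performs one more radix-4 stage over the work
@ buffer. Per stage: r0 holds del, the butterfly spacing (x4 every stage),
@ [sp, #0x38] holds nodespacing (/4 every stage), [sp, #0x34] the number of
@ butterflies per twiddle index, and [sp, #0x3c] the remaining stage count.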
OUTER_LOOP:
    LDR             r1, [sp, #0x44]
    LDR             r12, [sp, #0x50]    @WORD32 *data = ptr_y@
    STR             r1, [sp, #0x2c]
    LDR             r1, [sp, #0x34]

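    @ LOOP_TRIVIAL_TWIDDLE handles the j = 0 butterflies of the stage,
    @ whose twiddle factors are all 1, so the radix-4 butterfly runs
    @ without multiplies; r0 is rescaled to (del<<1)*4, the byte stride
    @ between the four legs.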
    MOV             r0, r0, LSL #3      @(del<<1) * 4
LOOP_TRIVIAL_TWIDDLE:
    LDRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0
    LDRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    LDRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    LDRD            r10, [r12]          @r10=x3r, r11=x3i

@MOV    r4,r4,ASR #1
@MOV    r5,r5,ASR #1
@MOV    r6,r6,ASR #1
@MOV    r7,r7,ASR #1
@MOV    r8,r8,ASR #1
@MOV    r9,r9,ASR #1
@MOV    r10,r10,ASR #1
@MOV    r11,r11,ASR #1

    ADD             r4, r4, r8          @x0r = x0r + x2r@
    ADD             r5, r5, r9          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl #1  @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl #1  @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r2, r6, r10, lsl #1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl #1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
@MOV    r4,r4,ASR #1
@MOV    r5,r5,ASR #1
    SUB             r6, r4, r6, lsl #1  @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl #1  @x1i = x0i - (x1i << 1)@
    SUB             r8, r8, r11         @x2r = x2r - x3i@
    ADD             r9, r9, r2          @x2i = x2i + x3r@
    ADD             r10, r8, r11, lsl#1 @x3i = x2r + (x3i << 1)@
    SUB             r11, r9, r2, lsl#1  @x3r = x2i - (x3r << 1)@

    STRD            r10, [r12]          @r10=x3r, r11=x3i
    SUB             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    SUB             r12, r12, r0
    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    SUB             r12, r12, r0
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0, lsl #2

    SUBS            r1, r1, #1
    BNE             LOOP_TRIVIAL_TWIDDLE

    MOV             r0, r0, ASR #3
    LDR             r4, [sp, #0x38]
    LDR             r3, [sp, #0x50]
    MUL             r1, r0, r4
    ADD             r12, r3, #8
    STR             r1, [sp, #0x40]
    MOV             r3, r1, ASR #2
    ADD             r3, r3, r1, ASR #3
    SUB             r3, r3, r1, ASR #4
    ADD             r3, r3, r1, ASR #5
    SUB             r3, r3, r1, ASR #6
    ADD             r3, r3, r1, ASR #7
    SUB             r3, r3, r1, ASR #8
    STR             r3, [sp, #0x18]
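@ [sp, #0x40] now holds del*nodespacing. The shift-add chain above scales
@ it by 1/4 + 1/8 - 1/16 + 1/32 - 1/64 + 1/128 - 1/256 = 85/256, i.e.
@ roughly one third; the twiddle offset in r4 is compared against this 1/3
@ point and against 1/2 and 2/3 of the range to choose between the four
@ SECOND_LOOP variants below.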
SECOND_LOOP:
    LDR             r3, [sp, #0x2c]
    LDR             r14, [sp, #0x34]
    MOV             r0, r0, LSL #3      @(del<<1) * 4
    LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #0x24]
    STR             r1, [sp, #0x14]
    STR             r2, [sp, #0x10]
    STR             r5, [sp, #0x0c]
    STR             r6, [sp, #0x08]
    STR             r7, [sp, #0x04]
    STR             r8, [sp]

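@ Each SMULL / LSR #31 / ORR ..., LSL #1 triple below keeps bits [62:31]
@ of the 64-bit product, i.e. a 32x32 fractional multiply. A minimal C
@ sketch, assuming Q31 operands (mul32_sh31 is an illustrative name, not
@ one from this codebase):
@     WORD32 mul32_sh31(WORD32 a, WORD32 b) {
@         return (WORD32)(((WORD64)a * (WORD64)b) >> 31);
@     }
@ The four products per leg then combine as
@     re = x_r*wl + x_i*wh,   im = x_i*wl - x_r*wh
@ which is multiplication by the conjugated twiddle, as expected when an
@ inverse FFT is driven from a forward-FFT twiddle table.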
RADIX4_BFLY:

    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #0x14]
    LDR             r2, [sp, #0x10]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h), x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    SUB             r7, r7, r6
    ADD             r6, r4, r5          @

    LDR             r1, [sp, #0x0c]
    LDR             r2, [sp, #0x08]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h), x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    SUB             r9, r9, r8
    ADD             r8, r4, r5          @

    LDR             r1, [sp, #0x04]
    LDR             r2, [sp]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h), x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    SUB             r11, r11, r10
    ADD             r10, r4, r5         @

    @SUB   r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]! @
    LDR             r5, [r12, #0x04]


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    SUB             r8, r8, r11         @x2r = x2r - x3i@
    ADD             r9, r9, r10         @x2i = x2i + x3r@
    ADD             r4, r8, r11, lsl#1  @x3i = x2r + (x3i << 1)@
    SUB             r5, r9, r10, lsl#1  @x3r = x2i - (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i,  r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x48]
    LDR             r4, [sp, #0x24]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x38]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x18]
    ADD             r4, r4, r6
    CMP             r4, r7
    BLE             SECOND_LOOP

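@ Beyond this point j has grown enough that the higher twiddle indices run
@ off the end of the packed 512-word table. Each SECOND_LOOP variant below
@ rewinds the table pointer by 2048 bytes (512 words) before the affected
@ loads and recombines those products with swapped or negated terms.
@ SECOND_LOOP_2 covers the range where only the w3 index has overrun.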
SECOND_LOOP_2:
    LDR             r3, [sp, #0x2c]
    LDR             r14, [sp, #0x34]
    MOV             r0, r0, LSL #3      @(del<<1) * 4

    LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    SUB             r3, r3, #2048       @ 512 * 4
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #0x24]

    STR             r1, [sp, #0x14]
    STR             r2, [sp, #0x10]
    STR             r5, [sp, #0x0c]
    STR             r6, [sp, #0x08]
    STR             r7, [sp, #0x04]
    STR             r8, [sp]

RADIX4_BFLY_2:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1
    LDR             r1, [sp, #0x14]
    LDR             r2, [sp, #0x10]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h), x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    SUB             r7, r7, r6
    ADD             r6, r4, r5          @

    LDR             r1, [sp, #0x0c]
    LDR             r2, [sp, #0x08]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h), x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    SUB             r9, r9, r8
    ADD             r8, r4, r5          @

    LDR             r1, [sp, #0x04]
    LDR             r2, [sp]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h), x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    SUB             r10, r10, r11
    ADD             r11, r5, r4         @

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]! @
    LDR             r5, [r12, #0x04]


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    SUB             r8, r8, r11         @x2r = x2r - x3i@
    ADD             r9, r9, r10         @x2i = x2i + x3r@
    ADD             r4, r8, r11, lsl#1  @x3i = x2r + (x3i << 1)@
    SUB             r5, r9, r10, lsl#1  @x3r = x2i - (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i,  r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_2
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x48]
    LDR             r4, [sp, #0x24]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x38]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x40]
    ADD             r4, r4, r6
    CMP             r4, r7, ASR #1
    BLE             SECOND_LOOP_2
    LDR             r7, [sp, #0x18]
    CMP             r4, r7, LSL #1
    BGT             SECOND_LOOP_4

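@ SECOND_LOOP_3 covers the middle range where the w2 index has overrun as
@ well: the table pointer is rewound before the w2 load and the x2
@ products, like the x3 ones, are recombined with swapped terms.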
SECOND_LOOP_3:
    LDR             r3, [sp, #0x2c]
    LDR             r14, [sp, #0x34]
    MOV             r0, r0, LSL #3      @(del<<1) * 4

    LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    SUB             r3, r3, #2048       @ 512 * 4
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #0x24]
    STR             r1, [sp, #0x14]
    STR             r2, [sp, #0x10]
    STR             r5, [sp, #0x0c]
    STR             r6, [sp, #0x08]
    STR             r7, [sp, #0x04]
    STR             r8, [sp]


RADIX4_BFLY_3:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #0x14]
    LDR             r2, [sp, #0x10]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h), x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    SUB             r7, r7, r6
    ADD             r6, r4, r5          @

    LDR             r1, [sp, #0x0c]
    LDR             r2, [sp, #0x08]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h), x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    SUB             r8, r8, r9
    ADD             r9, r5, r4          @

    LDR             r1, [sp, #0x04]
    LDR             r2, [sp]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h), x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    SUB             r10, r10, r11
    ADD             r11, r5, r4         @

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]! @
    LDR             r5, [r12, #0x04]


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    SUB             r8, r8, r11         @x2r = x2r - x3i@
    ADD             r9, r9, r10         @x2i = x2i + x3r@
    ADD             r4, r8, r11, lsl#1  @x3i = x2r + (x3i << 1)@
    SUB             r5, r9, r10, lsl#1  @x3r = x2i - (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i,  r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_3
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x48]
    LDR             r4, [sp, #0x24]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x38]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x18]
    ADD             r4, r4, r6
    CMP             r4, r7, LSL #1
    BLE             SECOND_LOOP_3

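@ SECOND_LOOP_4 covers the top of the range: the table pointer is rewound
@ before both the w2 and the w3 loads, the x3 product is negated outright
@ (the RSB below), and the butterfly folds x3i in with opposite signs.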
SECOND_LOOP_4:
    LDR             r3, [sp, #0x2c]
    LDR             r14, [sp, #0x34]
    MOV             r0, r0, LSL #3      @(del<<1) * 4

    LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    SUB             r3, r3, #2048       @ 512 * 4
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    SUB             r3, r3, #2048       @ 512 * 4
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@


    STR             r4, [sp, #0x24]
    STR             r1, [sp, #0x14]
    STR             r2, [sp, #0x10]
    STR             r5, [sp, #0x0c]
    STR             r6, [sp, #0x08]
    STR             r7, [sp, #0x04]
    STR             r8, [sp]

RADIX4_BFLY_4:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #0x14]
    LDR             r2, [sp, #0x10]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h), x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    SUB             r7, r7, r6
    ADD             r6, r4, r5          @

    LDR             r1, [sp, #0x0c]
    LDR             r2, [sp, #0x08]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h), x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    SUB             r8, r8, r9
    ADD             r9, r5, r4          @

    LDR             r1, [sp, #0x04]
    LDR             r2, [sp]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h), x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    SUB             r11, r11, r10
    ADD             r10, r5, r4         @
    RSB             r10, r10, #0

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]! @
    LDR             r5, [r12, #0x04]


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    SUB             r7, r7, r11         @x1i = x1i - x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    ADD             r11, r7, r11, lsl#1 @x3i = x1i + (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    SUB             r8, r8, r11         @x2r = x2r - x3i@
    ADD             r9, r9, r10         @x2i = x2i + x3r@
    ADD             r4, r8, r11, lsl#1  @x3i = x2r + (x3i << 1)@
    SUB             r5, r9, r10, lsl#1  @x3r = x2i - (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i,  r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_4
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x48]
    LDR             r4, [sp, #0x24]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x38]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x40]
    ADD             r4, r4, r6
    CMP             r4, r7
    BLT             SECOND_LOOP_4

    LDR             r1, [sp, #0x38]
    MOV             r0, r0, LSL #2
    MOV             r1, r1, ASR #2
    STR             r1, [sp, #0x38]
    LDR             r1, [sp, #0x34]
    MOV             r1, r1, ASR #2
    STR             r1, [sp, #0x34]
    LDR             r1, [sp, #0x3c]
    SUBS            r1, r1, #1
    STR             r1, [sp, #0x3c]
    BGT             OUTER_LOOP

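@ When log2(npoints) is odd the radix-4 stages leave one factor of two,
@ and this final radix-2 stage (entered only when the flag at [sp, #0x30]
@ is set) finishes the transform; both halves scale their outputs by 1/2
@ via the ASR #1 pairs before recombining.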
RADIX2:
    LDR             r1, [sp, #0x30]
    CMP             r1, #0
    BEQ             EXIT
    LDR             r12, [sp, #0x38]
    LDR             r1, [sp, #0x44]
    CMP             r12, #0
    MOVEQ           r4, #1
    MOVNE           r4, r12, LSL #1
    MOVS            r3, r0
    BEQ             EXIT

    MOV             r3, r3, ASR #1
    LDR             r5, [sp, #0x50]
    MOV             r0, r0, LSL #3      @(del<<1) * 4
    STR             r1, [sp, #0x18]
RADIX2_BFLY:
    LDR             r1, [sp, #0x18]
    LDRD            r6, [r5]            @r6=x0r,  r7=x0i
    ADD             r5, r5, r0
    LDRD            r8, [r5]            @r8=x1r,  r9=x1i

    LDR             r2, [r1]
    SUBS            r3, r3, #1


    SMULL           r1, r11, r8, r2     @mult32x16hin32(x1r,W1h)
    LSR             r1, r1, #31
    ORR             r11, r1, r11, LSL#1
    SMULL           r1, r10, r9, r2     @mult32x16hin32(x1i,W1h)
    LSR             r1, r1, #31
    ORR             r10, r1, r10, LSL#1


    LDR             r1, [sp, #0x18]
    LDR             r2, [r1, #0x04]
    ADD             r1, r1, r4, LSL #3
    STR             r1, [sp, #0x18]

    SMULL           r1, r8, r8, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r1, r1, #31
    ORR             r8, r1, r8, LSL#1
    SMULL           r1, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h), x1i,w1l)
    LSR             r1, r1, #31
    ORR             r9, r1, r9, LSL#1

    ADD             r8, r8, r10
    SUB             r9, r9, r11

    ASR             r8, r8, #1
    ASR             r6, r6, #1
    ASR             r9, r9, #1
    ASR             r7, r7, #1
    ADD             r10, r8, r6         @(x0r/2) + (x1r/2)@
    ADD             r11, r9, r7         @(x0i/2) + (x1i/2)@
    SUB             r8, r6, r8          @(x0r/2) - (x1r/2)@
    SUB             r9, r7, r9          @(x0i/2) - (x1i/2)@

    STRD            r8, [r5]
    SUB             r5, r5, r0
    STRD            r10, [r5], #8

    BNE             RADIX2_BFLY

    LDR             r1, [sp, #0x44]
    MOV             r3, r0, ASR #4
    STR             r1, [sp, #0x18]
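@ Second half of the radix-2 pass: the same twiddle walk, but the products
@ are recombined with swapped signs, which appears to fold in the extra
@ quarter-turn rotation needed on the second half of the circle.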
RADIX2_BFLY_2:
    LDR             r1, [sp, #0x18]
    LDRD            r6, [r5]            @r6=x0r,  r7=x0i
    ADD             r5, r5, r0
    LDRD            r8, [r5]            @r8=x1r,  r9=x1i

    LDR             r2, [r1]
    SUBS            r3, r3, #1


    SMULL           r1, r11, r8, r2     @mult32x16hin32(x1r,W1h)
    LSR             r1, r1, #31
    ORR             r11, r1, r11, LSL#1
    SMULL           r1, r10, r9, r2     @mult32x16hin32(x1i,W1h)
    LSR             r1, r1, #31
    ORR             r10, r1, r10, LSL#1


    LDR             r1, [sp, #0x18]
    LDR             r2, [r1, #0x04]
    ADD             r1, r1, r4, LSL #3
    STR             r1, [sp, #0x18]

    SMULL           r1, r8, r8, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r1, r1, #31
    ORR             r8, r1, r8, LSL#1
    SMULL           r1, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h), x1i,w1l)
    LSR             r1, r1, #31
    ORR             r9, r1, r9, LSL#1

    SUB             r11, r11, r9
    ADD             r9, r10, r8         @
    MOV             r8, r11

    ASR             r8, r8, #1
    ASR             r6, r6, #1
    ASR             r9, r9, #1
    ASR             r7, r7, #1
    ADD             r10, r8, r6         @(x0r>>1) + (x1r>>1)@
    ADD             r11, r9, r7         @(x0i>>1) + (x1i>>1)@
    SUB             r8, r6, r8          @(x0r>>1) - (x1r>>1)@
    SUB             r9, r7, r9          @(x0i>>1) - (x1i>>1)@

    STRD            r8, [r5]
    SUB             r5, r5, r0
    STRD            r10, [r5], #8

    BNE             RADIX2_BFLY_2

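@ 0x54 = 0x44 bytes of local scratch plus the 16 bytes of r0-r3 that were
@ pushed in the prologue but are not restored; popping pc returns.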
EXIT:
    ADD             sp, sp, #0x54
    LDMFD           sp!, {r4-r12, pc}