.text
.p2align 2
.global ixheaacd_mps_complex_fft_64_asm

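@ ixheaacd_mps_complex_fft_64_asm: fixed-point complex radix-4 FFT used by
@ the MPEG Surround path of the decoder. The register/stack contract below
@ is inferred from the code itself, not from a C prototype: r0 appears to
@ carry the twiddle table, r1 the point count npoints (64 here), r2 the
@ input buffer, r3 the output buffer, and the fifth (stacked) argument a
@ byte-wide digit-reversal table. Data is interleaved re/im WORD32 pairs.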
ixheaacd_mps_complex_fft_64_asm:
    @LDR    r4,[sp]
    STMFD           sp!, {r0-r12, lr}
    LDR             r4, [sp, #0x38]
    SUB             sp, sp, #0x28
@       LDR     r4,[sp,#0x30]
    LDR             r0, [sp, #0x2c]
    @LDR      r12,[sp,#0x5c+4]
    EOR             r0, r0, r0, ASR #31
    CLZ             r0, r0
    SUB             r12, r0, #16        @dig_rev_shift = norm32(npoints) + 1 - 16@
    SUB             r0, r0, #1
    RSB             r0, r0, #0x1e
    AND             r1, r0, #1
    STR             r1, [sp, #0x14]
    MOV             r1, r0, ASR #1
    LDR             r0, [sp, #0x2c]     @npoints@
    STR             r1, [sp, #-4]!
    MOV             lr, r0, LSL #1      @(npoints >> 1) * 4@
    MOV             r0, #0
    MOV             r12, r4
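@ Stage 1 (no twiddles): butterfly indices come from the byte-wide
@ digit-reversal table in r12; lr = 2*npoints bytes, i.e. npoints/4 complex
@ points, is the stride between the four legs of each butterfly.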
FIRST_STAGE_R4:
    LDRB            r10, [r12, r0, LSR #2]

    ADD             r1, r2, r10, LSL #2
    LDRD            r4, [r1]            @r4=x0r,  r5=x0i
    ADD             r1, r1, lr
    LDRD            r8, [r1]            @r8=x1r,  r9=x1i
    ADD             r1, r1, lr
    LDRD            r6, [r1]            @r6=x2r,  r7=x2i
    ADD             r1, r1, lr
    LDRD            r10, [r1]           @r10=x3r, r11=x3i
    ADD             r0, r0, #4
    CMP             r0, lr, ASR #1

    ADD             r4, r4, r6          @x0r = x0r + x2r@
    ADD             r5, r5, r7          @x0i = x0i + x2i@
    SUB             r6, r4, r6, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r7, r5, r7, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r8, r8, r10         @x1r = x1r + x3r@
    ADD             r9, r9, r11         @x1i = x1i + x3i@
    SUB             r1, r8, r10, lsl#1  @x3r = x1r - (x3r << 1)@
    SUB             r11, r9, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r8          @x0r = x0r + x1r@
    ADD             r5, r5, r9          @x0i = x0i + x1i@
    SUB             r8, r4, r8, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r9, r5, r9, lsl#1   @x1i = x0i - (x1i << 1)@
    ADD             r6, r6, r11         @x2r = x2r + x3i@
    SUB             r7, r7, r1          @x2i = x2i - x3r@
    SUB             r10, r6, r11, lsl#1 @x3i = x2r - (x3i << 1)@
    ADD             r11, r7, r1, lsl#1  @x3r = x2i + (x3r << 1)@

    STMIA           r3!, {r4-r11}
    BLT             FIRST_STAGE_R4
    LDR             r1, [sp], #4
    LDR             r0, [sp, #0x2c]
    MOV             r12, #0x40          @nodespacing = 64@
    STR             r12, [sp, #0x1c]
    LDR             r12, [sp, #0x2c]
    SUB             r3, r3, r0, LSL #3
    SUBS            r1, r1, #1
    STR             r3, [sp, #0x34]
    MOV             r4, r12, ASR #4
    MOV             r0, #4
    STR             r4, [sp, #0x18]
    STR             r1, [sp, #0x20]
    BLE             EXIT
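@ Remaining stages: each OUTER_LOOP pass quadruples del (r0) and quarters
@ nodespacing [sp, #0x1c]. Within a stage, the j = 0 butterflies run in
@ LOOP_TRIVIAL_TWIDDLE and the twiddled ones in SECOND_LOOP.._4 below.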
OUTER_LOOP:
    LDR             r1, [sp, #0x28]
    LDR             r12, [sp, #0x34]    @WORD32 *data = ptr_y@
    STR             r1, [sp, #0x10]
    LDR             r1, [sp, #0x18]

    MOV             r0, r0, LSL #3      @(del << 1) * 4@
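@ j = 0 butterflies: the twiddle factor is 1, so this is a pure add/subtract
@ radix-4 combine. r0 now holds the leg stride in bytes, (del << 1) * 4.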
LOOP_TRIVIAL_TWIDDLE:
    LDRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0
    LDRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    LDRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    LDRD            r10, [r12]          @r10=x3r, r11=x3i

@MOV    r4,r4,ASR #1
@MOV    r5,r5,ASR #1
@MOV    r6,r6,ASR #1
@MOV    r7,r7,ASR #1
@MOV    r8,r8,ASR #1
@MOV    r9,r9,ASR #1
@MOV    r10,r10,ASR #1
@MOV    r11,r11,ASR #1

    ADD             r4, r4, r8          @x0r = x0r + x2r@
    ADD             r5, r5, r9          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl #1  @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl #1  @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r2, r6, r10, lsl #1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl #1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
@MOV    r4,r4,ASR #1
@MOV    r5,r5,ASR #1
    SUB             r6, r4, r6, lsl #1  @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl #1  @x1i = x0i - (x1i << 1)@
    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r2          @x2i = x2i - x3r@
    SUB             r10, r8, r11, lsl#1 @x3i = x2r - (x3i << 1)@
    ADD             r11, r9, r2, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r10, [r12]          @r10=x3i, r11=x3r
    SUB             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    SUB             r12, r12, r0
    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    SUB             r12, r12, r0
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0, lsl #2

    SUBS            r1, r1, #1
    BNE             LOOP_TRIVIAL_TWIDDLE

    MOV             r0, r0, ASR #3
    LDR             r4, [sp, #0x1c]
    LDR             r3, [sp, #0x34]
    MUL             r1, r0, r4
    ADD             r12, r3, #8
    STR             r1, [sp, #0x24]
    MOV             r3, r1, ASR #2
    ADD             r3, r3, r1, ASR #3
    SUB             r3, r3, r1, ASR #4
    ADD             r3, r3, r1, ASR #5
    SUB             r3, r3, r1, ASR #6
    ADD             r3, r3, r1, ASR #7
    SUB             r3, r3, r1, ASR #8
    STR             r3, [sp, #-4]!
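@ The shift/add chain above approximates r1/3 (0.25 + 0.125 - 0.0625 + ...),
@ presumably the j bound of the first twiddle quadrant; it is parked at [sp].
@ SECOND_LOOP handles that first range of j. The three twiddle word pairs
@ (w1h/w1l, w2h/w2l, w3h/w3l) are stashed just below sp and reread inside
@ RADIX4_BFLY.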
SECOND_LOOP:
    LDR             r3, [sp, #0x10+4]
    LDR             r14, [sp, #0x18+4]
    MOV             r0, r0, LSL #3      @(del << 1) * 4@
    LDR             r1, [r3, r4, LSL #3]! @w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #4]        @w1l = *(twiddles + 2*j + 1)@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #4]        @w2l = *(twiddles + 2*(j<<1) + 1)@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #4]        @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #8+4]
    STR             r1, [sp, #-4]
    STR             r2, [sp, #-8]
    STR             r5, [sp, #-12]
    STR             r6, [sp, #-16]
    STR             r7, [sp, #-20]
    STR             r8, [sp, #-24]
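@ Twiddled radix-4 butterfly. Each SMULL/LSR/ORR triple keeps the low 32
@ bits of (a * b) >> 31, i.e. a Q31 x Q31 fractional multiply; the ADD/SUB
@ pair after each group assembles the real and imaginary parts of the
@ complex product.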
RADIX4_BFLY:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #-4]
    LDR             r2, [sp, #-8]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h), x1i, w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6
    SUB             r6, r4, r5

    LDR             r1, [sp, #-12]
    LDR             r2, [sp, #-16]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h), x2i, w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r9, r9, r8
    SUB             r8, r4, r5

    LDR             r1, [sp, #-20]
    LDR             r2, [sp, #-24]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h), x3i, w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r11, r11, r10
    SUB             r10, r4, r5

    @SUB   r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]!
    LDR             r5, [r12, #4]

    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i,  r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x2c+4]
    LDR             r4, [sp, #8+4]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x1c+4]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0]
    ADD             r4, r4, r6
    CMP             r4, r7
    BLE             SECOND_LOOP

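@ Second j range: the twiddle pointer is rewound by 2048 bytes (512 words)
@ before w3 so reads stay inside the stored table, and RADIX4_BFLY_2 swaps
@ the sign/role of the w3 product halves to fold in the quadrant rotation.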
SECOND_LOOP_2:
    LDR             r3, [sp, #0x10+4]
    LDR             r14, [sp, #0x18+4]
    MOV             r0, r0, LSL #3      @(del << 1) * 4@

    LDR             r1, [r3, r4, LSL #3]! @w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #4]        @w1l = *(twiddles + 2*j + 1)@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #4]        @w2l = *(twiddles + 2*(j<<1) + 1)@
    SUB             r3, r3, #2048       @512 * 4@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #4]        @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #8+4]

    STR             r1, [sp, #-4]
    STR             r2, [sp, #-8]
    STR             r5, [sp, #-12]
    STR             r6, [sp, #-16]
    STR             r7, [sp, #-20]
    STR             r8, [sp, #-24]

RADIX4_BFLY_2:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1
    LDR             r1, [sp, #-4]
    LDR             r2, [sp, #-8]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h), x1i, w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6
    SUB             r6, r4, r5

    LDR             r1, [sp, #-12]
    LDR             r2, [sp, #-16]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h), x2i, w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r9, r9, r8
    SUB             r8, r4, r5

    LDR             r1, [sp, #-20]
    LDR             r2, [sp, #-24]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h), x3i, w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r10, r11, r10
    SUB             r11, r5, r4

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]!
    LDR             r5, [r12, #4]

    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i,  r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_2
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x2c+4]
    LDR             r4, [sp, #8+4]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x1c+4]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x24+4]
    ADD             r4, r4, r6
    CMP             r4, r7, ASR #1
    BLE             SECOND_LOOP_2
    LDR             r7, [sp, #0]
    CMP             r4, r7, LSL #1
    BGT             SECOND_LOOP_4

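@ Third j range: the 2048-byte rewind now happens before w2, so both the
@ w2 and w3 combines in RADIX4_BFLY_3 use the quadrant-adjusted signs.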
SECOND_LOOP_3:
    LDR             r3, [sp, #0x10+4]
    LDR             r14, [sp, #0x18+4]
    MOV             r0, r0, LSL #3      @(del << 1) * 4@

    LDR             r1, [r3, r4, LSL #3]! @w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #4]        @w1l = *(twiddles + 2*j + 1)@
    SUB             r3, r3, #2048       @512 * 4@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #4]        @w2l = *(twiddles + 2*(j<<1) + 1)@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #4]        @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #8+4]
    STR             r1, [sp, #-4]
    STR             r2, [sp, #-8]
    STR             r5, [sp, #-12]
    STR             r6, [sp, #-16]
    STR             r7, [sp, #-20]
    STR             r8, [sp, #-24]

RADIX4_BFLY_3:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #-4]
    LDR             r2, [sp, #-8]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h), x1i, w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6
    SUB             r6, r4, r5

    LDR             r1, [sp, #-12]
    LDR             r2, [sp, #-16]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h), x2i, w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r8, r9, r8
    SUB             r9, r5, r4

    LDR             r1, [sp, #-20]
    LDR             r2, [sp, #-24]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h), x3i, w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r10, r11, r10
    SUB             r11, r5, r4

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]!
    LDR             r5, [r12, #4]

    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i,  r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_3
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x2c+4]
    LDR             r4, [sp, #8+4]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x1c+4]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0]
    ADD             r4, r4, r6
    CMP             r4, r7, LSL #1
    BLE             SECOND_LOOP_3

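@ Final j range: the twiddle pointer is rewound before both w2 and w3, and
@ RADIX4_BFLY_4 also flips the x1/x3 combine (x1i - x3i) for this quadrant.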
SECOND_LOOP_4:
    LDR             r3, [sp, #0x10+4]
    LDR             r14, [sp, #0x18+4]
    MOV             r0, r0, LSL #3      @(del << 1) * 4@

    LDR             r1, [r3, r4, LSL #3]! @w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #4]        @w1l = *(twiddles + 2*j + 1)@
    SUB             r3, r3, #2048       @512 * 4@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #4]        @w2l = *(twiddles + 2*(j<<1) + 1)@
    SUB             r3, r3, #2048       @512 * 4@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #4]        @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #8+4]
    STR             r1, [sp, #-4]
    STR             r2, [sp, #-8]
    STR             r5, [sp, #-12]
    STR             r6, [sp, #-16]
    STR             r7, [sp, #-20]
    STR             r8, [sp, #-24]

RADIX4_BFLY_4:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #-4]
    LDR             r2, [sp, #-8]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h), x1i, w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6
    SUB             r6, r4, r5

    LDR             r1, [sp, #-12]
    LDR             r2, [sp, #-16]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h), x2i, w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r8, r9, r8
    SUB             r9, r5, r4

    LDR             r1, [sp, #-20]
    LDR             r2, [sp, #-24]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h), x3i, w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r11, r11, r10
    SUB             r10, r5, r4

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]!
    LDR             r5, [r12, #4]

    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    SUB             r7, r7, r11         @x1i = x1i - x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    ADD             r11, r7, r11, lsl#1 @x3i = x1i + (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0
    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3i,  r5=x3r
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_4
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x2c+4]
    LDR             r4, [sp, #8+4]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x1c+4]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x24+4]
    ADD             r4, r4, r6
    CMP             r4, r7
    BLT             SECOND_LOOP_4
    ADD             sp, sp, #4

    LDR             r1, [sp, #0x1c]
    MOV             r0, r0, LSL #2
    MOV             r1, r1, ASR #2
    STR             r1, [sp, #0x1c]
    LDR             r1, [sp, #0x18]
    MOV             r1, r1, ASR #2
    STR             r1, [sp, #0x18]
    LDR             r1, [sp, #0x20]
    SUBS            r1, r1, #1
    STR             r1, [sp, #0x20]
    BGT             OUTER_LOOP

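@ Tail taken only when log2(npoints) is odd (flag saved at [sp, #0x14]);
@ it appears to set up a final radix-2 pass. For the 64-point case the
@ flag is 0 and control branches straight to EXIT.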
    LDR             r1, [sp, #0x14]
    CMP             r1, #0
    BEQ             EXIT
    LDR             r12, [sp, #0x1c]
    LDR             r1, [sp, #0x28]
    CMP             r12, #0
    LDRNE           r12, [sp, #0x1c]
    MOVEQ           r4, #1
    MOVNE           r4, r12, LSL #1
    MOVS            r3, r0
    BEQ             EXIT

    MOV             r3, r3, ASR #1
    LDR             r5, [sp, #0x34]
    MOV             r0, r0, LSL #3      @(del << 1) * 4@
    STR             r1, [sp, #-4]

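@ Epilogue: 0x38 = the 0x28-byte scratch area plus the saved r0-r3; the
@ pop then restores r4-r12 and returns via pc.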
EXIT:
    ADD             sp, sp, #0x38
    LDMFD           sp!, {r4-r12, pc}