.text
.p2align 2
.global ixheaacd_complex_fft_p2_asm
.type ixheaacd_complex_fft_p2_asm, %function
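
@ ixheaacd_complex_fft_p2_asm: in-place complex FFT for power-of-2
@ lengths, built from radix-4 stages plus one trailing radix-2 stage
@ when log2(npoints) is odd.
@
@ The argument mapping below is inferred from the stack offsets used
@ after the prologue (an annotation, not taken from a header):
@   r0 = twiddle table pointer   (saved r0 at [sp, #0x44])
@   r1 = npoints                 (saved r1 at [sp, #0x48])
@   r2 = input  buffer ptr_x     (interleaved re/im WORD32 pairs)
@   r3 = output buffer ptr_y     (saved r3 at [sp, #0x50])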

ixheaacd_complex_fft_p2_asm:
    STMFD           sp!, {r0-r12, lr}
    SUB             sp, sp, #0x44
    LDR             r0, [sp, #0x48]
    EOR             r0, r0, r0, ASR #31
    CLZ             r0, r0
    SUB             r12, r0, #16        @dig_rev_shift = norm32(npoints) + 1 - 16@
    SUB             r0, r0, #1
    RSB             r0, r0, #0x1e
    AND             r1, r0, #1
    STR             r1, [sp, #0x30]
    MOV             r1, r0, ASR #1
    LDR             r0, [sp, #0x48]     @npoints
    STR             r1, [sp, #0x18]
    MOV             lr, r0, LSL #1      @(npoints >> 1) * 4
    MOV             r0, #0

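@ FIRST_STAGE_R4: first radix-4 pass, reading the input in
@ digit-reversed order.  The 0x33333333 and 0x0F0F0F0F mask pairs swap
@ adjacent 2-bit groups and nibbles, the BIC/ORR pair swaps the low
@ bytes, and the shift by r12 keeps only the significant digits.  A C
@ sketch of the same index computation (illustrative only; names are
@ not from this file):
@
@   h   = ((h & 0x33333333) << 2) | ((h >> 2) & 0x33333333);
@   h   = ((h & 0x0F0F0F0F) << 4) | ((h >> 4) & 0x0F0F0F0F);
@   h   = ((h & ~0x00FF0000) >> 8) | ((h & ~0x0000FF00) << 8);
@   idx = h >> dig_rev_shift;
@
@ When log2(npoints) is odd ([sp, #0x30] != 0) an odd index is rounded
@ up to the next even one (ADDNE/BICNE below).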
FIRST_STAGE_R4:
    MOVW            r4, #0x3333
    MOVT            r4, #0x3333
    MOVW            r5, #0x0F0F
    MOVT            r5, #0x0F0F
    AND             r6, r4, r0
    AND             r7, r4, r0, LSR #2
    ORR             r4, r7, r6, LSL #2
    AND             r6, r5, r4
    AND             r7, r5, r4, LSR #4
    ORR             r4, r7, r6, LSL #4
    BIC             r6, r4, #0x0000FF00
    BIC             r7, r4, #0x00FF0000
    MOV             r7, r7, LSR #8
    ORR             r4, r7, r6, LSL #8
    LDR             r5, [sp, #0x30]
    MOV             r10, r4, LSR r12
    CMP             r5, #0
    ADDNE           r10, r10, #1
    BICNE           r10, r10, #1

    ADD             r1, r2, r10, LSL #2
    LDRD            r4, [r1]            @r4=x0r,  r5=x0i
    ADD             r1, r1, lr
    LDRD            r8, [r1]            @r8=x1r,  r9=x1i
    ADD             r1, r1, lr
    LDRD            r6, [r1]            @r6=x2r,  r7=x2i
    ADD             r1, r1, lr
    LDRD            r10, [r1]           @r10=x3r, r11=x3i
    ADD             r0, r0, #4
    CMP             r0, lr, ASR #1

    ADD             r4, r4, r6          @x0r = x0r + x2r@
    ADD             r5, r5, r7          @x0i = x0i + x2i@
    SUB             r6, r4, r6, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r7, r5, r7, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r8, r8, r10         @x1r = x1r + x3r@
    ADD             r9, r9, r11         @x1i = x1i + x3i@
    SUB             r1, r8, r10, lsl#1  @x3r = x1r - (x3r << 1)@
    SUB             r11, r9, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r8          @x0r = x0r + x1r@
    ADD             r5, r5, r9          @x0i = x0i + x1i@
    SUB             r8, r4, r8, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r9, r5, r9, lsl#1   @x1i = x0i - (x1i << 1)@
    ADD             r6, r6, r11         @x2r = x2r + x3i@
    SUB             r7, r7, r1          @x2i = x2i - x3r@
    SUB             r10, r6, r11, lsl#1 @x3i = x2r - (x3i << 1)@
    ADD             r11, r7, r1, lsl#1  @x3r = x2i + (x3r << 1)@

    STMIA           r3!, {r4-r11}
    BLT             FIRST_STAGE_R4
    LDR             r1, [sp, #0x18]
    LDR             r0, [sp, #0x48]
    MOV             r12, #0x40          @nodespacing = 64@
    STR             r12, [sp, #0x38]
    LDR             r12, [sp, #0x48]
    SUB             r3, r3, r0, LSL #3
    SUBS            r1, r1, #1
    STR             r3, [sp, #0x50]
    MOV             r4, r12, ASR #4
    MOV             r0, #4
    STR             r4, [sp, #0x34]
    STR             r1, [sp, #0x3c]
    BLE             RADIX2
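
@ OUTER_LOOP: one iteration per remaining radix-4 stage.  Each stage
@ first runs the j == 0 butterflies (LOOP_TRIVIAL_TWIDDLE, where all
@ twiddles are 1), then walks the twiddle table in four loops
@ (SECOND_LOOP .. SECOND_LOOP_4) that differ only in how the wrapped
@ table pointer and the product signs are handled.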
OUTER_LOOP:
    LDR             r1, [sp, #0x44]
    LDR             r12, [sp, #0x50]    @WORD32 *data = ptr_y@
    STR             r1, [sp, #0x2c]
    LDR             r1, [sp, #0x34]

    MOV             r0, r0, LSL #3      @(del << 1) * 4
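@ LOOP_TRIVIAL_TWIDDLE: radix-4 butterflies for twiddle index 0; the
@ three twiddles are all 1, so the complex multiplies drop out.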
LOOP_TRIVIAL_TWIDDLE:
    LDRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0
    LDRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    LDRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    LDRD            r10, [r12]          @r10=x3r, r11=x3i

@MOV    r4,r4,ASR #1
@MOV    r5,r5,ASR #1
@MOV    r6,r6,ASR #1
@MOV    r7,r7,ASR #1
@MOV    r8,r8,ASR #1
@MOV    r9,r9,ASR #1
@MOV    r10,r10,ASR #1
@MOV    r11,r11,ASR #1

    ADD             r4, r4, r8          @x0r = x0r + x2r@
    ADD             r5, r5, r9          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl #1  @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl #1  @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r2, r6, r10, lsl #1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl #1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
@MOV    r4,r4,ASR #1
@MOV    r5,r5,ASR #1
    SUB             r6, r4, r6, lsl #1  @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl #1  @x1i = x0i - (x1i << 1)@
    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r2          @x2i = x2i - x3r@
    SUB             r10, r8, r11, lsl#1 @x3i = x2r - (x3i << 1)@
    ADD             r11, r9, r2, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r10, [r12]          @r10=x3r, r11=x3i
    SUB             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    SUB             r12, r12, r0
    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    SUB             r12, r12, r0
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0, lsl #2

    SUBS            r1, r1, #1
    BNE             LOOP_TRIVIAL_TWIDDLE

    MOV             r0, r0, ASR #3
    LDR             r4, [sp, #0x38]
    LDR             r3, [sp, #0x50]
    MUL             r1, r0, r4
    ADD             r12, r3, #8
    STR             r1, [sp, #0x40]
    MOV             r3, r1, ASR #2
    ADD             r3, r3, r1, ASR #3
    SUB             r3, r3, r1, ASR #4
    ADD             r3, r3, r1, ASR #5
    SUB             r3, r3, r1, ASR #6
    ADD             r3, r3, r1, ASR #7
    SUB             r3, r3, r1, ASR #8
    STR             r3, [sp, #0x18]
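@ The alternating ASR series above builds r3 as
@   r1 * (1/4 + 1/8 - 1/16 + 1/32 - 1/64 + 1/128 - 1/256),
@ i.e. approximately r1 / 3, without a divide; it bounds the first
@ twiddle quadrant handled by SECOND_LOOP.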
SECOND_LOOP:
    LDR             r3, [sp, #0x2c]
    LDR             r14, [sp, #0x34]
    MOV             r0, r0, LSL #3      @(del << 1) * 4
    LDR             r1, [r3, r4, LSL #3]! @w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #0x24]
    STR             r1, [sp, #0x14]
    STR             r2, [sp, #0x10]
    STR             r5, [sp, #0x0c]
    STR             r6, [sp, #0x08]
    STR             r7, [sp, #0x04]
    STR             r8, [sp]

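@ RADIX4_BFLY: the recurring three-instruction pattern
@     SMULL   r3, rX, rX, rW
@     LSR     r3, r3, #31
@     ORR     rX, r3, rX, LSL #1
@ keeps bits 62..31 of the 64-bit product, i.e. the Q1.31 result
@ (a * b) >> 31 -- the ixheaacd_mult32()/mult32x16hin32() helpers
@ named in the comments.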
RADIX4_BFLY:

    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #0x14]
    LDR             r2, [sp, #0x10]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6
    SUB             r6, r4, r5

    LDR             r1, [sp, #0x0c]
    LDR             r2, [sp, #0x08]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h) ,x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r9, r9, r8
    SUB             r8, r4, r5

    LDR             r1, [sp, #0x04]
    LDR             r2, [sp]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h) ,x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r11, r11, r10
    SUB             r10, r4, r5

    @SUB   r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]!
    LDR             r5, [r12, #0x04]


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @store x3 (r4, r5)
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x48]
    LDR             r4, [sp, #0x24]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x38]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x18]
    ADD             r4, r4, r6
    CMP             r4, r7
    BLE             SECOND_LOOP

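@ SECOND_LOOP_2: same butterfly as SECOND_LOOP, but j is now large
@ enough that the w3 fetch would run past the stored quadrant, so the
@ table pointer is pulled back by 2048 bytes (512 words) first and the
@ x3 products below are recombined with adjusted signs.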
SECOND_LOOP_2:
    LDR             r3, [sp, #0x2c]
    LDR             r14, [sp, #0x34]
    MOV             r0, r0, LSL #3      @(del << 1) * 4

    LDR             r1, [r3, r4, LSL #3]! @w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    SUB             r3, r3, #2048       @512 * 4
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #0x24]

    STR             r1, [sp, #0x14]
    STR             r2, [sp, #0x10]
    STR             r5, [sp, #0x0c]
    STR             r6, [sp, #0x08]
    STR             r7, [sp, #0x04]
    STR             r8, [sp]

RADIX4_BFLY_2:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1
    LDR             r1, [sp, #0x14]
    LDR             r2, [sp, #0x10]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6
    SUB             r6, r4, r5

    LDR             r1, [sp, #0x0c]
    LDR             r2, [sp, #0x08]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h) ,x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r9, r9, r8
    SUB             r8, r4, r5

    LDR             r1, [sp, #0x04]
    LDR             r2, [sp]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h) ,x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r10, r11, r10
    SUB             r11, r5, r4

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]!
    LDR             r5, [r12, #0x04]


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @store x3 (r4, r5)
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_2
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x48]
    LDR             r4, [sp, #0x24]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x38]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x40]
    ADD             r4, r4, r6
    CMP             r4, r7, ASR #1
    BLE             SECOND_LOOP_2
    LDR             r7, [sp, #0x18]
    CMP             r4, r7, LSL #1
    BGT             SECOND_LOOP_4

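@ SECOND_LOOP_3: both the w2 and w3 fetches have wrapped here; the
@ pointer is rewound once before the w2 fetch, and the x2 and x3
@ products below are recombined with their signs adjusted for the
@ quadrant change.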
SECOND_LOOP_3:
    LDR             r3, [sp, #0x2c]
    LDR             r14, [sp, #0x34]
    MOV             r0, r0, LSL #3      @(del << 1) * 4

    LDR             r1, [r3, r4, LSL #3]! @w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    SUB             r3, r3, #2048       @512 * 4
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #0x24]
    STR             r1, [sp, #0x14]
    STR             r2, [sp, #0x10]
    STR             r5, [sp, #0x0c]
    STR             r6, [sp, #0x08]
    STR             r7, [sp, #0x04]
    STR             r8, [sp]


RADIX4_BFLY_3:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #0x14]
    LDR             r2, [sp, #0x10]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6
    SUB             r6, r4, r5

    LDR             r1, [sp, #0x0c]
    LDR             r2, [sp, #0x08]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h) ,x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r8, r9, r8
    SUB             r9, r5, r4

    LDR             r1, [sp, #0x04]
    LDR             r2, [sp]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h) ,x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r10, r11, r10
    SUB             r11, r5, r4

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]!
    LDR             r5, [r12, #0x04]


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @store x3 (r4, r5)
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_3
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x48]
    LDR             r4, [sp, #0x24]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x38]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x18]
    ADD             r4, r4, r6
    CMP             r4, r7, LSL #1
    BLE             SECOND_LOOP_3

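@ SECOND_LOOP_4: last quadrant; the pointer is rewound before both the
@ w2 and the w3 fetch, and note that the x1/x3 combine below uses
@ x1i - x3i where the earlier loops used x1i + x3i.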
SECOND_LOOP_4:
    LDR             r3, [sp, #0x2c]
    LDR             r14, [sp, #0x34]
    MOV             r0, r0, LSL #3      @(del << 1) * 4

    LDR             r1, [r3, r4, LSL #3]! @w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    SUB             r3, r3, #2048       @512 * 4
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    SUB             r3, r3, #2048       @512 * 4
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@


    STR             r4, [sp, #0x24]
    STR             r1, [sp, #0x14]
    STR             r2, [sp, #0x10]
    STR             r5, [sp, #0x0c]
    STR             r6, [sp, #0x08]
    STR             r7, [sp, #0x04]
    STR             r8, [sp]

RADIX4_BFLY_4:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #0x14]
    LDR             r2, [sp, #0x10]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    ADD             r7, r7, r6
    SUB             r6, r4, r5

    LDR             r1, [sp, #0x0c]
    LDR             r2, [sp, #0x08]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h) ,x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    ADD             r8, r9, r8
    SUB             r9, r5, r4

    LDR             r1, [sp, #0x04]
    LDR             r2, [sp]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h) ,x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    ADD             r11, r11, r10
    SUB             r10, r5, r4

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]!
    LDR             r5, [r12, #0x04]


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    SUB             r7, r7, r11         @x1i = x1i - x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    ADD             r11, r7, r11, lsl#1 @x3i = x1i + (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    ADD             r8, r8, r11         @x2r = x2r + x3i@
    SUB             r9, r9, r10         @x2i = x2i - x3r@
    SUB             r4, r8, r11, lsl#1  @x3i = x2r - (x3i << 1)@
    ADD             r5, r9, r10, lsl#1  @x3r = x2i + (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @store x3 (r4, r5)
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_4
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x48]
    LDR             r4, [sp, #0x24]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x38]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x40]
    ADD             r4, r4, r6
    CMP             r4, r7
    BLT             SECOND_LOOP_4

    LDR             r1, [sp, #0x38]
    MOV             r0, r0, LSL #2
    MOV             r1, r1, ASR #2
    STR             r1, [sp, #0x38]
    LDR             r1, [sp, #0x34]
    MOV             r1, r1, ASR #2
    STR             r1, [sp, #0x34]
    LDR             r1, [sp, #0x3c]
    SUBS            r1, r1, #1
    STR             r1, [sp, #0x3c]
    BGT             OUTER_LOOP

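@ RADIX2: reached only when log2(npoints) is odd ([sp, #0x30] != 0);
@ one final radix-2 stage completes the transform, with the outputs
@ halved (ASR #1) on the way out.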
RADIX2:
    LDR             r1, [sp, #0x30]
    CMP             r1, #0
    BEQ             EXIT
    LDR             r12, [sp, #0x38]
    LDR             r1, [sp, #0x44]
    CMP             r12, #0
    MOVEQ           r4, #1
    MOVNE           r4, r12, LSL #1
    MOVS            r3, r0
    BEQ             EXIT

    MOV             r3, r3, ASR #1
    LDR             r5, [sp, #0x50]
    MOV             r0, r0, LSL #3      @(del << 1) * 4
    STR             r1, [sp, #0x18]
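@ RADIX2_BFLY: first half of the radix-2 pass.  x1 is rotated by the
@ twiddle with the same SMULL/LSR/ORR Q1.31 idiom, then
@ x0' = (x0 + x1) >> 1 and x1' = (x0 - x1) >> 1 are stored back.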
RADIX2_BFLY:
    LDR             r1, [sp, #0x18]
    LDRD            r6, [r5]            @r6=x0r, r7=x0i
    ADD             r5, r5, r0
    LDRD            r8, [r5]            @r8=x1r, r9=x1i

    LDR             r2, [r1]
    SUBS            r3, r3, #1


    SMULL           r1, r11, r8, r2     @mult32x16hin32(x1r,W1h)
    LSR             r1, r1, #31
    ORR             r11, r1, r11, LSL#1
    SMULL           r1, r10, r9, r2     @mult32x16hin32(x1i,W1h)
    LSR             r1, r1, #31
    ORR             r10, r1, r10, LSL#1


    LDR             r1, [sp, #0x18]
    LDR             r2, [r1, #0x04]
    ADD             r1, r1, r4, LSL #3
    STR             r1, [sp, #0x18]

    SMULL           r1, r8, r8, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r1, r1, #31
    ORR             r8, r1, r8, LSL#1
    SMULL           r1, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r1, r1, #31
    ORR             r9, r1, r9, LSL#1

    SUB             r8, r8, r10
    ADD             r9, r9, r11


    ADD             r10, r8, r6         @x0r + x1r
    ASR             r10, r10, #1        @(x0r + x1r) >> 1
    ADD             r11, r9, r7         @x0i + x1i
    ASR             r11, r11, #1        @(x0i + x1i) >> 1
    SUB             r8, r6, r8          @x0r - x1r
    ASR             r8, r8, #1          @(x0r - x1r) >> 1
    SUB             r9, r7, r9          @x0i - x1i
    ASR             r9, r9, #1          @(x0i - x1i) >> 1

    STRD            r8, [r5]
    SUB             r5, r5, r0
    STRD            r10, [r5], #8

    BNE             RADIX2_BFLY

    LDR             r1, [sp, #0x44]
    MOV             r3, r0, ASR #4
    STR             r1, [sp, #0x18]
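@ RADIX2_BFLY_2: second half of the radix-2 pass; the twiddle here is
@ applied with the real/imaginary products recombined the other way
@ round (the ADD/SUB/MOV shuffle below) before the same half-and-store
@ write-back.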
RADIX2_BFLY_2:
    LDR             r1, [sp, #0x18]
    LDRD            r6, [r5]            @r6=x0r, r7=x0i
    ADD             r5, r5, r0
    LDRD            r8, [r5]            @r8=x1r, r9=x1i

    LDR             r2, [r1]
    SUBS            r3, r3, #1


    SMULL           r1, r11, r8, r2     @mult32x16hin32(x1r,W1h)
    LSR             r1, r1, #31
    ORR             r11, r1, r11, LSL#1
    SMULL           r1, r10, r9, r2     @mult32x16hin32(x1i,W1h)
    LSR             r1, r1, #31
    ORR             r10, r1, r10, LSL#1


    LDR             r1, [sp, #0x18]
    LDR             r2, [r1, #0x04]
    ADD             r1, r1, r4, LSL #3
    STR             r1, [sp, #0x18]

    SMULL           r1, r8, r8, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r1, r1, #31
    ORR             r8, r1, r8, LSL#1
    SMULL           r1, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r1, r1, #31
    ORR             r9, r1, r9, LSL#1

    ADD             r11, r11, r9
    SUB             r9, r10, r8
    MOV             r8, r11

    ADD             r10, r8, r6         @x0r + x1r
    ASR             r10, r10, #1        @(x0r + x1r) >> 1
    ADD             r11, r9, r7         @x0i + x1i
    ASR             r11, r11, #1        @(x0i + x1i) >> 1
    SUB             r8, r6, r8          @x0r - x1r
    ASR             r8, r8, #1          @(x0r - x1r) >> 1
    SUB             r9, r7, r9          @x0i - x1i
    ASR             r9, r9, #1          @(x0i - x1i) >> 1

    STRD            r8, [r5]
    SUB             r5, r5, r0
    STRD            r10, [r5], #8

    BNE             RADIX2_BFLY_2

EXIT:
    ADD             sp, sp, #0x54
    LDMFD           sp!, {r4-r12, pc}