@ ---------------------------------------------------------------------------
@ ixheaacd_fft_15_ld_armv7
@
@ 15-point complex FFT on 32-bit fixed-point data, computed as three 5-point
@ butterflies (stage 1) followed by five 3-point butterflies (stage 2).
@
@ Target  : ARM state; uses MOVW/MOVT, LDRD/STRD and SMULWB/SMULWT.
@ In      : r0 = input base. Complex sample k (re word, then im word) is
@                read from byte offset 128*k, k = 0..14.
@           r1 = output base. Every result is one (re, im) word pair stored
@                at r1 + 8*index.
@           r2 = scratch buffer for the stage-1 results (30 words written,
@                then read back by stage 2).
@           r3 = byte table. 15 entries are consumed in order; each one is
@                the output index of the next stage-2 result.
@ Out     : no meaningful register result (r0-r3 hold leftovers).
@ Saved   : r4-r12 are pushed on entry and restored on exit; the saved lr
@           is popped straight into pc to return.
@ Notes   : all additions/subtractions are plain ADD/SUB (no saturation).
@           NOTE(review): the argument roles above are derived from how the
@           registers are used in this file - confirm against the C caller.
@ ---------------------------------------------------------------------------

.text
.p2align 2
.global ixheaacd_fft_15_ld_armv7

@ ---- input addressing (bytes) ---------------------------------------------
.equ IN_STEP,    384                    @ 3 samples: step inside a 5-point group
.equ IN_WRAP,    1536                   @ 15*128 - IN_STEP: step that wraps modulo 15 samples

@ ---- stage-1 twiddles, packed two per register as (top:bottom) halfwords --
@ Q15 values are applied as (x * c >> 16) << 1, Q14 values as (x * c >> 16) << 2.
.equ C54_Q15,    0x478E                 @  18318 / 32768 ~  0.55902 (top half,    SMULWT)
.equ C55_Q14,    0xB000                 @ -20480 / 16384 = -1.25    (bottom half, SMULWB)
.equ C51_Q15,    0x79BC                 @  31164 / 32768 ~  0.95105 (top half,    SMULWT)
.equ C52_Q14,    0x9D84                 @ -25212 / 16384 ~ -1.53882 (bottom half, SMULWB)
.equ C53_Q15,    0xD180                 @ -11904 / 32768 ~ -0.36328 (bottom half, SMULWB)

@ ---- stage-2 constants ----------------------------------------------------
.equ SIN60_Q15,  28378                  @ 28378 / 32768 ~ 0.86603
.equ GROUP_BYTES, 40                    @ one 5-point group = 5 complex = 10 words

@ ---------------------------------------------------------------------------
@ FFT5_BUTTERFLY
@   In     : (r2,r3) = x0, (r4,r5) = x1, (r6,r7) = x2, (r8,r9) = x3,
@            (r10,r11) = x4, each as (re, im)
@            lr = write pointer into the scratch buffer
@   Out    : 10 words stored at lr, lr advanced by 40 bytes
@   Clobber: r1-r12 (r0 is left untouched)
@ ---------------------------------------------------------------------------
.macro FFT5_BUTTERFLY
    @ ---- real parts ----
    ADD             r1, r4, r10         @ r1 = x1.re + x4.re
    SUB             r4, r4, r10         @ r4 = x1.re - x4.re
    MOVW            r10, #C55_Q14
    MOVT            r10, #C54_Q15
    ADD             r12, r6, r8         @ r3' = x2.re + x3.re      (kept in r12)
    SUB             r8, r6, r8          @ r2' = x2.re - x3.re      (kept in r8)

    SUB             r6, r1, r12         @ (r1 - r3')
    SMULWT          r6, r6, r10         @ t/2 = ((r1 - r3') * C54) >> 16
    ADD             r1, r1, r12         @ r1 = r1 + r3'
    ADD             r2, r2, r1          @ temp1 = x0.re + r1
    SMULWB          r1, r1, r10         @ (r1 * C55) >> 16
    ADD             r1, r2, r1, lsl #2  @ r1 = temp1 + r1 * C55
    MOVW            r10, #C52_Q14
    MOVT            r10, #C51_Q15
    STR             r2, [lr], #4        @ scratch: y0.re = temp1

    SUB             r12, r1, r6, LSL #1 @ r3' = r1 - t
    ADD             r1, r1, r6, LSL #1  @ r1  = r1 + t

    ADD             r2, r4, r8          @ (r4 + r2')
    SMULWT          r2, r2, r10         @ ((r4 + r2') * C51) >> 16
    MOV             r2, r2, LSL #1      @ t = (r4 + r2') * C51

    SMULWB          r4, r4, r10         @ (r4 * C52) >> 16
    MOVW            r10, #C53_Q15
    MOVT            r10, #0xFFFF        @ sign extension only; SMULWB reads the bottom halfword
    ADD             r4, r2, r4, LSL #2  @ r4 = t + r4 * C52

    SMULWB          r8, r8, r10         @ (r2' * C53) >> 16
    ADD             r2, r2, r8, LSL #1  @ r2 = t + r2' * C53

    @ ---- imaginary parts ----
    ADD             r6, r5, r11         @ s1 = x1.im + x4.im
    SUB             r8, r5, r11         @ s4 = x1.im - x4.im
    MOVW            r10, #C55_Q14
    MOVT            r10, #C54_Q15
    ADD             r5, r7, r9          @ s3 = x2.im + x3.im
    SUB             r7, r7, r9          @ s2 = x2.im - x3.im

    SUB             r9, r6, r5          @ (s1 - s3)
    SMULWT          r9, r9, r10         @ t/2 = ((s1 - s3) * C54) >> 16
    ADD             r6, r6, r5          @ s1 = s1 + s3
    ADD             r3, r3, r6          @ temp2 = x0.im + s1
    SMULWB          r6, r6, r10         @ (s1 * C55) >> 16
    ADD             r6, r3, r6, lsl #2  @ s1 = temp2 + s1 * C55
    MOVW            r10, #C52_Q14
    MOVT            r10, #C51_Q15
    STR             r3, [lr], #4        @ scratch: y0.im = temp2

    SUB             r5, r6, r9, LSL #1  @ s3 = s1 - t
    ADD             r6, r6, r9, LSL #1  @ s1 = s1 + t

    ADD             r11, r7, r8         @ (s4 + s2)
    SMULWT          r11, r11, r10       @ ((s4 + s2) * C51) >> 16
    MOV             r11, r11, LSL #1    @ t = (s4 + s2) * C51

    SMULWB          r8, r8, r10         @ (s4 * C52) >> 16
    MOVW            r10, #C53_Q15
    MOVT            r10, #0xFFFF        @ sign extension only; SMULWB reads the bottom halfword
    ADD             r8, r11, r8, LSL #2 @ s4 = t + s4 * C52

    SMULWB          r7, r7, r10         @ (s2 * C53) >> 16
    ADD             r7, r11, r7, LSL #1 @ s2 = t + s2 * C53

    @ ---- combine and store the remaining four complex outputs ----
    ADD             r3, r1, r7          @ word 2 = r1  + s2
    SUB             r9, r6, r2          @ word 3 = s1  - r2
    SUB             r10, r12, r8        @ word 4 = r3' - s4
    ADD             r11, r5, r4         @ word 5 = s3  + r4
    ADD             r12, r12, r8        @ word 6 = r3' + s4
    SUB             r4, r5, r4          @ word 7 = s3  - r4
    SUB             r5, r1, r7          @ word 8 = r1  - s2
    ADD             r6, r6, r2          @ word 9 = s1  + r2
    STMIA           lr!, {r3, r9-r12}   @ words 2..6
    STMIA           lr!, {r4-r6}        @ words 7..9
.endm

@ ---------------------------------------------------------------------------
@ FFT3_BUTTERFLY
@   In     : lr  = pointer to complex a in group 0 of the scratch buffer;
@                  b and c are the same slot in groups 1 and 2
@            r10 = read pointer into the byte index table
@            r11 = output base
@            r12 = SIN60_Q15 in its bottom halfword
@   Out    : three (re, im) pairs stored at r11 + 8*table[0..2];
@            r10 advanced by 3 bytes, lr advanced by one complex (8 bytes)
@   Clobber: r0-r9
@ ---------------------------------------------------------------------------
.macro FFT3_BUTTERFLY
    LDRD            r0, [lr]                    @ (r0,r1) = a
    LDRD            r2, [lr, #GROUP_BYTES]      @ (r2,r3) = b
    LDRD            r4, [lr, #(2*GROUP_BYTES)]  @ (r4,r5) = c
    ADD             lr, lr, #8                  @ next complex slot

    ADD             r6, r0, r2          @ X01r  = a.re + b.re
    ADD             r7, r1, r3          @ X01i  = a.im + b.im

    ADD             r8, r2, r4          @ add_r = b.re + c.re
    ADD             r9, r3, r5          @ add_i = b.im + c.im

    SUB             r2, r2, r4          @ sub_r = b.re - c.re
    SUB             r3, r3, r5          @ sub_i = b.im - c.im

    MOV             r8, r8, ASR #1      @ p1 = add_r >> 1
    MOV             r9, r9, ASR #1      @ p4 = add_i >> 1

    SMULWB          r3, r3, r12         @ p2/2 = (sub_i * sin60) >> 16
    SMULWB          r2, r2, r12         @ p3/2 = (sub_r * sin60) >> 16

    SUB             r0, r0, r8          @ temp  = a.re - p1
    ADD             r8, r1, r2, LSL #1  @ temp1 = a.im + p3
    SUB             r2, r1, r2, LSL #1  @ temp2 = a.im - p3

    ADD             r4, r6, r4          @ y0.re = X01r + c.re
    ADD             r5, r7, r5          @ y0.im = X01i + c.im
    ADD             r6, r0, r3, LSL #1  @ y1.re = temp  + p2
    SUB             r7, r2, r9          @ y1.im = temp2 - p4
    SUB             r9, r8, r9          @ y2.im = temp1 - p4
    SUB             r8, r0, r3, LSL #1  @ y2.re = temp  - p2

    LDRB            r0, [r10], #1       @ output index for y0
    LDRB            r1, [r10], #1       @ output index for y1
    LDRB            r2, [r10], #1       @ output index for y2
    ADD             r0, r11, r0, lsl #3 @ 8 bytes per complex output
    ADD             r1, r11, r1, lsl #3
    ADD             r2, r11, r2, lsl #3
    STRD            r4, [r0]            @ store y0
    STRD            r6, [r1]            @ store y1
    STRD            r8, [r2]            @ store y2
.endm


ixheaacd_fft_15_ld_armv7:

    STMFD           r13!, {r4 - r12, r14}
    STR             r1  , [r13, #-4]!   @ park output base  (reloaded into r11 for stage 2)
    STR             r3  , [r13, #-4]!   @ park index table  (reloaded into r10 for stage 2)
    MOV             lr, r2              @ lr = scratch write pointer

    @ ======================= stage 1: three 5-point FFTs ====================
    @ Group 0 uses samples 0, 3, 6, 9, 12.
    LDRD            r2, [r0]            @ x0 = sample 0
    ADD             r0, r0, #IN_STEP
    LDRD            r4, [r0]            @ x1 = sample 3
    ADD             r0, r0, #IN_STEP
    LDRD            r6, [r0]            @ x2 = sample 6
    ADD             r0, r0, #IN_STEP
    LDRD            r8, [r0]            @ x3 = sample 9
    ADD             r0, r0, #IN_STEP
    LDRD            r10, [r0]           @ x4 = sample 12
    SUB             r0, r0, #896        @ sample 12 -> sample 5 (byte offset 640)
    FFT5_BUTTERFLY

    @ Group 1 uses samples 5, 8, 11, 14, 2.
    LDRD            r2, [r0]            @ x0 = sample 5
    ADD             r0, r0, #IN_STEP
    LDRD            r4, [r0]            @ x1 = sample 8
    ADD             r0, r0, #IN_STEP
    LDRD            r6, [r0]            @ x2 = sample 11
    ADD             r0, r0, #IN_STEP
    LDRD            r8, [r0]            @ x3 = sample 14
    SUB             r0, r0, #IN_WRAP
    LDRD            r10, [r0]           @ x4 = sample 2
    ADD             r0, r0, #1024       @ sample 2 -> sample 10 (byte offset 1280)
    FFT5_BUTTERFLY

    @ Group 2 uses samples 10, 13, 1, 4, 7.
    LDRD            r2, [r0]            @ x0 = sample 10
    ADD             r0, r0, #IN_STEP
    LDRD            r4, [r0]            @ x1 = sample 13
    SUB             r0, r0, #IN_WRAP
    LDRD            r6, [r0]            @ x2 = sample 1
    ADD             r0, r0, #IN_STEP
    LDRD            r8, [r0]            @ x3 = sample 4
    ADD             r0, r0, #IN_STEP
    LDRD            r10, [r0]           @ x4 = sample 7 (r0 is not needed afterwards)
    FFT5_BUTTERFLY

    @ ======================= stage 2: five 3-point FFTs =====================
    SUB             lr, lr, #(3*GROUP_BYTES)    @ rewind to the start of the scratch buffer
    MOVW            r12, #SIN60_Q15
    LDMFD           r13!, {r10, r11}            @ r10 = index table, r11 = output base

    FFT3_BUTTERFLY                      @ slot 0 of every group
    FFT3_BUTTERFLY                      @ slot 1
    FFT3_BUTTERFLY                      @ slot 2
    FFT3_BUTTERFLY                      @ slot 3
    FFT3_BUTTERFLY                      @ slot 4 (the final lr advance is unused)

    LDMFD           r13!, {r4 - r12, r15}       @ restore and return