// push_v_regs: spill x8-x22, x24, x29 and x30 into a 144-byte frame below sp.
// x23 is not part of the frame. Frame layout, as offsets from the new sp:
//   #0   x29,x30    #16  x22,x24    #32  x20,x21
//   #48  x18,x19    #64  x16,x17    #80  x14,x15
//   #96  x12,x13    #112 x10,x11    #128 x8,x9
// sp moves by a multiple of 16. Undo with pop_v_regs.
.macro push_v_regs
    sub             sp, sp, #144            // reserve the nine 16-byte slots at once
    stp             x8, x9, [sp, #128]
    stp             x10, x11, [sp, #112]
    stp             x12, x13, [sp, #96]
    stp             x14, x15, [sp, #80]
    stp             x16, x17, [sp, #64]
    stp             x18, x19, [sp, #48]
    stp             x20, x21, [sp, #32]
    stp             x22, x24, [sp, #16]
    stp             x29, x30, [sp]
.endm
12
// pop_v_regs: reload x8-x22, x24, x29 and x30 from the 144-byte frame that
// push_v_regs left at sp, then release the frame. Slot offsets from sp:
//   #0   x29,x30    #16  x22,x24    #32  x20,x21
//   #48  x18,x19    #64  x16,x17    #80  x14,x15
//   #96  x12,x13    #112 x10,x11    #128 x8,x9
.macro pop_v_regs
    ldp             x29, x30, [sp]
    ldp             x22, x24, [sp, #16]
    ldp             x20, x21, [sp, #32]
    ldp             x18, x19, [sp, #48]
    ldp             x16, x17, [sp, #64]
    ldp             x14, x15, [sp, #80]
    ldp             x12, x13, [sp, #96]
    ldp             x10, x11, [sp, #112]
    ldp             x8, x9, [sp, #128]
    add             sp, sp, #144            // drop the nine 16-byte slots at once
.endm
24
25
.text
.p2align 2
.global ixheaacd_fft32x32_ld2_armv8

//------------------------------------------------------------------------------
// ixheaacd_fft32x32_ld2_armv8
//
// Straight-line, two-stage DIT radix-4 butterfly network over 32 32-bit words.
// The existing comments index the data as x[0..31] / y[0..31] and pair
// even/odd words as real/imaginary ("qr"/"qi") parts.
//
// In:    x2 = x : 32 signed 32-bit words; read, then overwritten in place by
//                 the first stage, then read again by the second stage
//        x3 = y : 32 32-bit words that receive the second-stage results
//        x0, x1 : overwritten on entry without being read
// Out:   y[0..31] written. On return x0 = x and x1 = y (the copied pointers).
// Regs:  x2-x7 are left modified. x8-x12, x14, x20-x22 and x24 are used as
//        scratch / twiddle registers and restored before returning.
// Stack: 160 bytes (144 from push_v_regs plus one extra x19/x20 pair).
//
// Twiddles are 16-bit constants sign-extended into w11/w12/w14 (positive) and
// w21/w22/w24 (their negations). Each SMULL + ASR #16 pair stands in for the
// ARMv7 SMULWB/SMULWT/SMLAWB instructions kept in the comments.
// NOTE(review): the constants look like Q15 cos/sin values, so each product is
// presumably at half scale, which would explain the later "lsl #1" - confirm.
//------------------------------------------------------------------------------
ixheaacd_fft32x32_ld2_armv8:

    // STMFD sp!, {x4-x12,x14}
    push_v_regs
    stp             x19, x20, [sp, #-16]!   // x19/x20 are also saved by push_v_regs

    //DIT Radix-4 FFT First Stage
    //First Butterfly
    MOV             x0, x2              //x0 = x (in/out buffer of the first stage)
    MOV             x1, x3              //x1 = y (output buffer of the second stage)
    LDR             w2, [x0]            //x_0 = x[0 ]
    sxtw            x2, w2
    LDR             w3, [x0, #32]       //x_2 = x[8 ]
    sxtw            x3, w3
    LDR             w4, [x0, #64]       //x_4 = x[16]
    sxtw            x4, w4
    LDR             w5, [x0, #96]       //x_6 = x[24]
    sxtw            x5, w5
    ADD             w6, w2, w4          //xh0_0 = x_0 + x_4
    SUB             w7, w2, w4          //xl0_0 = x_0 - x_4
    ADD             w8, w3, w5          //xh0_1 = x_2 + x_6
    SUB             w9, w3, w5          //xl0_1 = x_2 - x_6

    LDR             w2, [x0, #4]        //x_1 = x[0 +1]
    sxtw            x2, w2
    LDR             w3, [x0, #36]       //x_3 = x[8 +1]
    sxtw            x3, w3
    LDR             w4, [x0, #68]       //x_5 = x[16+1]
    sxtw            x4, w4
    LDR             w5, [x0, #100]      //x_7 = x[24+1]
    sxtw            x5, w5
    ADD             w10, w2, w4         //xh1_0 = x_1 + x_5
    SUB             w11, w2, w4         //xl1_0 = x_1 - x_5
    ADD             w12, w3, w5         //xh1_1 = x_3 + x_7
    SUB             w14, w3, w5         //xl1_1 = x_3 - x_7

    ADD             w2, w6, w8          //n00 = xh0_0 + xh0_1
    ADD             w3, w7, w14         //n10 = xl0_0 + xl1_1
    SUB             w4, w6, w8          //n20 = xh0_0 - xh0_1
    SUB             w5, w7, w14         //n30 = xl0_0 - xl1_1
    STR             w2, [x0]            //x[0 ] = n00
    STR             w3, [x0, #32]       //x[8 ] = n10
    STR             w4, [x0, #64]       //x[16] = n20
    STR             w5, [x0, #96]       //x[24] = n30

    ADD             w2, w10, w12        //n01 = xh1_0 + xh1_1
    SUB             w3, w11, w9         //n11 = xl1_0 - xl0_1
    SUB             w4, w10, w12        //n21 = xh1_0 - xh1_1
    ADD             w5, w11, w9         //n31 = xl1_0 + xl0_1
    STR             w2, [x0, #4]        //x[1   ] = n01
    STR             w3, [x0, #36]       //x[8+1 ] = n11
    STR             w4, [x0, #68]       //x[16+1] = n21
    STR             w5, [x0, #100]      //x[24+1] = n31

    //Second Butterfly
    LDR             w2, [x0, #8]        //x_0 = x[2 ]
    sxtw            x2, w2
    LDR             w3, [x0, #40]       //x_2 = x[10]
    sxtw            x3, w3
    LDR             w4, [x0, #72]       //x_4 = x[18]
    sxtw            x4, w4
    LDR             w5, [x0, #104]      //x_6 = x[26]
    sxtw            x5, w5
    ADD             w6, w2, w4          //xh0_0 = x_0 + x_4
    SUB             w7, w2, w4          //xl0_0 = x_0 - x_4
    ADD             w8, w3, w5          //xh0_1 = x_2 + x_6
    SUB             w9, w3, w5          //xl0_1 = x_2 - x_6

    LDR             w2, [x0, #12]       //x_1 = x[2 +1]
    sxtw            x2, w2
    LDR             w3, [x0, #44]       //x_3 = x[10+1]
    sxtw            x3, w3
    LDR             w4, [x0, #76]       //x_5 = x[18+1]
    sxtw            x4, w4
    LDR             w5, [x0, #108]      //x_7 = x[26+1]
    sxtw            x5, w5
    ADD             w10, w2, w4         //xh1_0 = x_1 + x_5
    SUB             w11, w2, w4         //xl1_0 = x_1 - x_5
    ADD             w12, w3, w5         //xh1_1 = x_3 + x_7
    SUB             w14, w3, w5         //xl1_1 = x_3 - x_7

    ADD             w2, w6, w8          //n00 = xh0_0 + xh0_1
    ADD             w3, w7, w14         //n10 = xl0_0 + xl1_1
    SUB             w4, w6, w8          //n20 = xh0_0 - xh0_1
    SUB             w5, w7, w14         //n30 = xl0_0 - xl1_1
    STR             w2, [x0, #8]        //x[2 ] = n00
    STR             w3, [x0, #40]       //x[10] = n10
    STR             w4, [x0, #72]       //x[18] = n20
    STR             w5, [x0, #104]      //x[26] = n30

    ADD             w2, w10, w12        //n01 = xh1_0 + xh1_1
    SUB             w3, w11, w9         //n11 = xl1_0 - xl0_1
    SUB             w4, w10, w12        //n21 = xh1_0 - xh1_1
    ADD             w5, w11, w9         //n31 = xl1_0 + xl0_1
    STR             w2, [x0, #12]       //x[2 +1] = n01
    STR             w3, [x0, #44]       //x[10+1] = n11
    STR             w4, [x0, #76]       //x[18+1] = n21
    STR             w5, [x0, #108]      //x[26+1] = n31

    //Third Butterfly
    LDR             w2, [x0, #16]       //x_0 = x[4 ]
    sxtw            x2, w2
    LDR             w3, [x0, #48]       //x_2 = x[12]
    sxtw            x3, w3
    LDR             w4, [x0, #80]       //x_4 = x[20]
    sxtw            x4, w4
    LDR             w5, [x0, #112]      //x_6 = x[28]
    sxtw            x5, w5
    ADD             w6, w2, w4          //xh0_0 = x_0 + x_4
    SUB             w7, w2, w4          //xl0_0 = x_0 - x_4
    ADD             w8, w3, w5          //xh0_1 = x_2 + x_6
    SUB             w9, w3, w5          //xl0_1 = x_2 - x_6

    LDR             w2, [x0, #20]       //x_1 = x[4 +1]
    sxtw            x2, w2
    LDR             w3, [x0, #52]       //x_3 = x[12+1]
    sxtw            x3, w3
    LDR             w4, [x0, #84]       //x_5 = x[20+1]
    sxtw            x4, w4
    LDR             w5, [x0, #116]      //x_7 = x[28+1]
    sxtw            x5, w5
    ADD             w10, w2, w4         //xh1_0 = x_1 + x_5
    SUB             w11, w2, w4         //xl1_0 = x_1 - x_5
    ADD             w12, w3, w5         //xh1_1 = x_3 + x_7
    SUB             w14, w3, w5         //xl1_1 = x_3 - x_7

    ADD             w2, w6, w8          //n00 = xh0_0 + xh0_1
    ADD             w3, w7, w14         //n10 = xl0_0 + xl1_1
    SUB             w4, w6, w8          //n20 = xh0_0 - xh0_1
    SUB             w5, w7, w14         //n30 = xl0_0 - xl1_1
    STR             w2, [x0, #16]       //x[4 ] = n00
    STR             w3, [x0, #48]       //x[12] = n10
    STR             w4, [x0, #80]       //x[20] = n20
    STR             w5, [x0, #112]      //x[28] = n30

    ADD             w2, w10, w12        //n01 = xh1_0 + xh1_1
    SUB             w3, w11, w9         //n11 = xl1_0 - xl0_1
    SUB             w4, w10, w12        //n21 = xh1_0 - xh1_1
    ADD             w5, w11, w9         //n31 = xl1_0 + xl0_1
    STR             w2, [x0, #20]       //x[4 +1] = n01
    STR             w3, [x0, #52]       //x[12+1] = n11
    STR             w4, [x0, #84]       //x[20+1] = n21
    STR             w5, [x0, #116]      //x[28+1] = n31

    //Fourth Butterfly
    LDR             w2, [x0, #24]       //x_0 = x[6 ]
    sxtw            x2, w2
    LDR             w3, [x0, #56]       //x_2 = x[14]
    sxtw            x3, w3
    LDR             w4, [x0, #88]       //x_4 = x[22]
    sxtw            x4, w4
    LDR             w5, [x0, #120]      //x_6 = x[30]
    sxtw            x5, w5
    ADD             w6, w2, w4          //xh0_0 = x_0 + x_4
    SUB             w7, w2, w4          //xl0_0 = x_0 - x_4
    ADD             w8, w3, w5          //xh0_1 = x_2 + x_6
    SUB             w9, w3, w5          //xl0_1 = x_2 - x_6

    LDR             w2, [x0, #28]       //x_1 = x[6 +1]
    sxtw            x2, w2
    LDR             w3, [x0, #60]       //x_3 = x[14+1]
    sxtw            x3, w3
    LDR             w4, [x0, #92]       //x_5 = x[22+1]
    sxtw            x4, w4
    LDR             w5, [x0, #124]      //x_7 = x[30+1]
    sxtw            x5, w5
    ADD             w10, w2, w4         //xh1_0 = x_1 + x_5
    SUB             w11, w2, w4         //xl1_0 = x_1 - x_5
    ADD             w12, w3, w5         //xh1_1 = x_3 + x_7
    SUB             w14, w3, w5         //xl1_1 = x_3 - x_7

    ADD             w2, w6, w8          //n00 = xh0_0 + xh0_1
    ADD             w3, w7, w14         //n10 = xl0_0 + xl1_1
    SUB             w4, w6, w8          //n20 = xh0_0 - xh0_1
    SUB             w5, w7, w14         //n30 = xl0_0 - xl1_1
    STR             w2, [x0, #24]       //x[6 ] = n00
    STR             w3, [x0, #56]       //x[14] = n10
    STR             w4, [x0, #88]       //x[22] = n20
    STR             w5, [x0, #120]      //x[30] = n30

    ADD             w2, w10, w12        //n01 = xh1_0 + xh1_1
    SUB             w3, w11, w9         //n11 = xl1_0 - xl0_1
    SUB             w4, w10, w12        //n21 = xh1_0 - xh1_1
    ADD             w5, w11, w9         //n31 = xl1_0 + xl0_1
    STR             w2, [x0, #28]       //x[6 +1] = n01
    STR             w3, [x0, #60]       //x[14+1] = n11
    STR             w4, [x0, #92]       //x[22+1] = n21
    STR             w5, [x0, #124]      //x[30+1] = n31


    //DIT Radix-4 FFT Second Stage
    //First Butterfly
    LDR             w2, [x0]            //inp_0qr = x[0]
    sxtw            x2, w2
    LDR             w3, [x0, #8]        //inp_1qr = x[2]
    sxtw            x3, w3
    LDR             w4, [x0, #16]       //inp_2qr = x[4]
    sxtw            x4, w4
    LDR             w5, [x0, #24]       //inp_3qr = x[6]
    sxtw            x5, w5
    ADD             w6, w2, w4          //sum_0qr  = mul_0qr + mul_2qr
    SUB             w7, w2, w4          //sum_1qr  = mul_0qr - mul_2qr
    ADD             w8, w3, w5          //sum_2qr  = mul_1qr + mul_3qr
    SUB             w9, w3, w5          //sum_3qr  = mul_1qr - mul_3qr

    LDR             w2, [x0, #4]        //inp_0qi = x[1]
    sxtw            x2, w2
    LDR             w3, [x0, #12]       //inp_1qi = x[3]
    sxtw            x3, w3
    LDR             w4, [x0, #20]       //inp_2qi = x[5]
    sxtw            x4, w4
    LDR             w5, [x0, #28]       //inp_3qi = x[7]
    sxtw            x5, w5
    ADD             w10, w2, w4         //sum_0qi  = mul_0qi + mul_2qi
    SUB             w11, w2, w4         //sum_1qi  = mul_0qi - mul_2qi
    ADD             w12, w3, w5         //sum_2qi  = mul_1qi + mul_3qi
    SUB             w14, w3, w5         //sum_3qi  = mul_1qi - mul_3qi

    ADD             w2, w6, w8          //sum_0qr + sum_2qr
    ADD             w3, w7, w14         //sum_1qr + sum_3qi
    SUB             w4, w6, w8          //sum_0qr - sum_2qr
    SUB             w5, w7, w14         //sum_1qr - sum_3qi
    STR             w2, [x1]            //y[0 ] = sum_0qr + sum_2qr
    STR             w3, [x1, #32]       //y[8 ] = sum_1qr + sum_3qi
    STR             w4, [x1, #64]       //y[16] = sum_0qr - sum_2qr
    STR             w5, [x1, #96]       //y[24] = sum_1qr - sum_3qi

    ADD             w2, w10, w12        //sum_0qi + sum_2qi
    SUB             w3, w11, w9         //sum_1qi - sum_3qr
    SUB             w4, w10, w12        //sum_0qi - sum_2qi
    ADD             w5, w11, w9         //sum_1qi + sum_3qr
    STR             w2, [x1, #4]        //y[0 +1] = sum_0qi + sum_2qi
    STR             w3, [x1, #36]       //y[8 +1] = sum_1qi - sum_3qr
    STR             w4, [x1, #68]       //y[16+1] = sum_0qi - sum_2qi
    STR             w5, [x1, #100]      //y[24+1] = sum_1qi + sum_3qr


    //Load twiddle factors
    //Each 16-bit constant is sign-extended with sxth; w21/w22/w24 hold the
    //negated counterparts of w11/w12/w14.
//    LDR w11,  =2310960706            //0x89BE7642
    MOV             w11, #0x7642
    sxth            w11, w11
    MOV             w21, #0x89BE        //-0x7642
    sxth            w21, w21
//    LDR w12,  =3473158396            //0xCF0430FC
    MOV             w12, #0x30FC
    sxth            w12, w12
    MOV             w22, #0xCF04        //-0x30FC
    sxth            w22, w22
//    LDR w14,  =2776455811            //0xA57D5A83
    MOV             w14, #0x5A83
    sxth            w14, w14
    MOV             w24, #0xA57D        //-0x5A83
    sxth            w24, w24

    //Second Butterfly
    LDR             w2, [x0, #32]       //mul_0qr = inp_0qr = x[8]
    sxtw            x2, w2
    LDR             w3, [x0, #36]       //mul_0qi = inp_0qi = x[9]
    sxtw            x3, w3

    LDR             w5, [x0, #40]       //inp_1qr = x[10]
    sxtw            x5, w5
    LDR             w6, [x0, #44]       //inp_1qi = x[11]
    sxtw            x6, w6

    SMULL           x4, w5, w11
    ASR             x4, x4, #16
//    SMULWB      x4, x5, x11                 //mul_1qr = mpy_16_32_ns( 0x7642 , inp_1qr)

    SMULL           x20, w6, w12
    ASR             x20, x20, #16
    ADD             w4, w4, w20
//    SMLAWB      x4, x6, x12, x4             //mul_1qr -= mpy_16_32_ns(-0x30FC , inp_1qi)

    SMULL           x5, w5, w22
    ASR             x5, x5, #16
//  SMULWT      x5, x5, x12                 //mul_1qi = mpy_16_32_ns(-0x30FC , inp_1qr)

    LDR             w7, [x0, #48]       //inp_2qr = x[12]
    sxtw            x7, w7
    LDR             w8, [x0, #52]       //inp_2qi = x[13]
    sxtw            x8, w8

    //Moved for delay slot
    SMULL           x20, w6, w11
    ASR             x20, x20, #16
    ADD             w5, w5, w20
//    SMLAWB      x5, x6, x11, x5             //mul_1qi += mpy_16_32_ns( 0x7642 , inp_1qi)

    ADD             w6, w7, w8          //(inp_2qr + inp_2qi)

    SMULL           x6, w6, w14
    ASR             x6, x6, #16
//    SMULWB      x6, x6, x14                 //mul_2qr = mpy_16_32_ns(0x5A83 , (inp_2qr + inp_2qi))

    SUB             w7, w8, w7          //(-inp_2qr + inp_2qi)

    SMULL           x7, w7, w14
    ASR             x7, x7, #16
//  SMULWB      x7, x7, x14                 //mul_2qi = mpy_16_32_ns(0x5A83 , (-inp_2qr + inp_2qi))

    //32-bit load: only x[14] is wanted. A 64-bit load would also fetch x[15]
    //and yield x[14] in the low word on little-endian targets only.
    LDR             w9 , [x0, #56]      //inp_3qr = x[14]
    sxtw            x9, w9
    LDR             w10, [x0, #60]      //inp_3qi = x[15]
    sxtw            x10, w10

    SMULL           x8, w9, w12
    ASR             x8, x8, #16
//    SMULWB      x8, x9 , x12                //mul_3qr = mpy_16_32_ns( 0x30FC , inp_3qr)

    SMULL           x20, w10, w11
    ASR             x20, x20, #16
    ADD             w8, w8, w20
//  SMLAWB      x8, x10, x11, x8            //mul_3qr -= mpy_16_32_ns(-0x7642 , inp_3qi)//

    SMULL           x9, w9 , w21
    ASR             x9, x9, #16
//  SMULWT      x9, x9 , x11                //mul_3qi = mpy_16_32_ns(-0x7642 , inp_3qr)

    SMULL           x20, w10, w12
    ASR             x20, x20, #16
    ADD             w9, w9, w20
//    SMLAWB      x9, x10, x12, x9            //mul_3qi += mpy_16_32_ns( 0x30FC , inp_3qi)

    ADD             w10, w2, w6, lsl #1 //sum_0qr  = mul_0qr + (mul_2qr << 1)
    SUB             w2 , w2, w6, lsl #1 //sum_1qr  = mul_0qr - (mul_2qr << 1)
    ADD             w6 , w4, w8         //sum_2qr  = mul_1qr + mul_3qr
    SUB             w4 , w4, w8         //sum_3qr  = mul_1qr - mul_3qr

    ADD             w8 , w3, w7, lsl #1 //sum_0qi  = mul_0qi + (mul_2qi << 1)
    SUB             w3 , w3, w7, lsl #1 //sum_1qi  = mul_0qi - (mul_2qi << 1)
    ADD             w7 , w5, w9         //sum_2qi  = mul_1qi + mul_3qi
    SUB             w5 , w5, w9         //sum_3qi  = mul_1qi - mul_3qi

    ADD             w9 , w10, w6, lsl #1 //sum_0qr + (sum_2qr << 1)
    SUB             w10, w10, w6, lsl #1 //sum_0qr - (sum_2qr << 1)
    ADD             w6 , w2 , w5, lsl #1 //sum_1qr + (sum_3qi << 1)
    SUB             w2 , w2 , w5, lsl #1 //sum_1qr - (sum_3qi << 1)
    STR             w9 , [x1, #8]       //y[2 ] = sum_0qr + (sum_2qr << 1)
    STR             w10, [x1, #72]      //y[18] = sum_0qr - (sum_2qr << 1)
    STR             w6 , [x1, #40]      //y[10] = sum_1qr + (sum_3qi << 1)
    STR             w2 , [x1, #104]     //y[26] = sum_1qr - (sum_3qi << 1)

    ADD             w5 , w8 , w7, lsl #1 //sum_0qi + (sum_2qi << 1)
    SUB             w8 , w8 , w7, lsl #1 //sum_0qi - (sum_2qi << 1)
    SUB             w7 , w3 , w4, lsl #1 //sum_1qi - (sum_3qr << 1)
    ADD             w3 , w3 , w4, lsl #1 //sum_1qi + (sum_3qr << 1)
    STR             w5 , [x1, #12]      //y[2 +1] = sum_0qi + (sum_2qi << 1)
    STR             w8 , [x1, #76]      //y[18+1] = sum_0qi - (sum_2qi << 1)
    STR             w7 , [x1, #44]      //y[10+1] = sum_1qi - (sum_3qr << 1)
    STR             w3 , [x1, #108]     //y[26+1] = sum_1qi + (sum_3qr << 1)

    //Third Butterfly
    LDR             w2, [x0, #64]       //mul_0qr = inp_0qr = x[16]
    sxtw            x2, w2
    LDR             w5, [x0, #72]       //inp_1qr = x[18]
    sxtw            x5, w5
    LDR             w6, [x0, #76]       //inp_1qi = x[19]
    sxtw            x6, w6
    //Moved for delay slot
    LDR             w3, [x0, #68]       //mul_0qi = inp_0qi = x[17]
    sxtw            x3, w3

    ADD             w4, w5, w6          //(inp_1qr + inp_1qi)

    SMULL           x4, w4, w14
    ASR             x4, x4, #16
//    SMULWB      x4, x4, x14                 //mul_1qr = mpy_16_32_ns(0x5A83 , (inp_1qr + inp_1qi))
    SUB             w5, w6, w5          //(-inp_1qr + inp_1qi)

    SMULL           x5, w5, w14
    ASR             x5, x5, #16
//  SMULWB      x5, x5, x14                 //mul_1qi = mpy_16_32_ns(0x5A83 , (-inp_1qr + inp_1qi))

    LDR             w6, [x0, #84]       //mul_2qr = inp_2qi = x[21]
    sxtw            x6, w6
    //32-bit load: only x[22] is wanted. A 64-bit load would also fetch x[23]
    //and yield x[22] in the low word on little-endian targets only.
    LDR             w9 , [x0, #88]      //inp_3qr = x[22]
    sxtw            x9, w9
    LDR             w10, [x0, #92]      //inp_3qi = x[23]
    sxtw            x10, w10
    //Moved for delay slot
    LDR             w7, [x0, #80]       //mul_2qi = inp_2qr = x[20]
    sxtw            x7, w7

    SUB             w8 , w10, w9        //(-inp_3qr + inp_3qi)

    SMULL           x8, w8, w14
    ASR             x8, x8, #16
//    SMULWB      x8 , x8 , x14               //mul_3qr = mpy_16_32_ns( 0x5A83 , (-inp_3qr + inp_3qi))

    ADD             w9 , w9 , w10       //(inp_3qr + inp_3qi)

    SMULL           x9, w9, w24
    ASR             x9, x9, #16
//    SMULWT      x9 , x9 , x14               //mul_3qi = mpy_16_32_ns(-0x5A83 , (inp_3qr + inp_3qi))

    ADD             w10, w2, w6         //sum_0qr  = mul_0qr + mul_2qr
    SUB             w2 , w2, w6         //sum_1qr  = mul_0qr - mul_2qr
    ADD             w6 , w4, w8         //sum_2qr  = mul_1qr + mul_3qr
    SUB             w4 , w4, w8         //sum_3qr  = mul_1qr - mul_3qr

    SUB             w8 , w3, w7         //sum_0qi  = mul_0qi - mul_2qi
    ADD             w3 , w3, w7         //sum_1qi  = mul_0qi + mul_2qi
    ADD             w7 , w5, w9         //sum_2qi  = mul_1qi + mul_3qi
    SUB             w5 , w5, w9         //sum_3qi  = mul_1qi - mul_3qi

    ADD             w9 , w10, w6, lsl #1 //sum_0qr + (sum_2qr << 1)
    SUB             w10, w10, w6, lsl #1 //sum_0qr - (sum_2qr << 1)
    ADD             w6 , w2 , w5, lsl #1 //sum_1qr + (sum_3qi << 1)
    SUB             w2 , w2 , w5, lsl #1 //sum_1qr - (sum_3qi << 1)
    STR             w9 , [x1, #16]      //y[4 ] = sum_0qr + (sum_2qr << 1)
    STR             w10, [x1, #80]      //y[20] = sum_0qr - (sum_2qr << 1)
    STR             w6 , [x1, #48]      //y[12] = sum_1qr + (sum_3qi << 1)
    STR             w2 , [x1, #112]     //y[28] = sum_1qr - (sum_3qi << 1)

    ADD             w5, w8, w7, lsl #1  //sum_0qi + (sum_2qi << 1)
    SUB             w8, w8, w7, lsl #1  //sum_0qi - (sum_2qi << 1)
    SUB             w7, w3, w4, lsl #1  //sum_1qi - (sum_3qr << 1)
    ADD             w3, w3, w4, lsl #1  //sum_1qi + (sum_3qr << 1)
    STR             w5 , [x1, #20]      //y[4 +1] = sum_0qi + (sum_2qi << 1)
    STR             w8 , [x1, #84]      //y[20+1] = sum_0qi - (sum_2qi << 1)
    STR             w7 , [x1, #52]      //y[12+1] = sum_1qi - (sum_3qr << 1)
    STR             w3 , [x1, #116]     //y[28+1] = sum_1qi + (sum_3qr << 1)

    //Fourth Butterfly
    LDR             w2, [x0, #96]       //mul_0qr = inp_0qr = x[24]
    sxtw            x2, w2
    LDR             w3, [x0, #100]      //mul_0qi = inp_0qi = x[25]
    sxtw            x3, w3

    LDR             w5, [x0, #104]      //inp_1qr = x[26]
    sxtw            x5, w5
    LDR             w6, [x0, #108]      //inp_1qi = x[27]
    sxtw            x6, w6

    SMULL           x4, w5, w12
    ASR             x4, x4, #16
//    SMULWB      x4, x5, x12                 //mul_1qr = mpy_16_32_ns( 0x30FC , inp_1qr)

    SMULL           x20, w6, w11
    ASR             x20, x20, #16
    ADD             w4, w4, w20
//  SMLAWB      x4, x6, x11, x4             //mul_1qr -= mpy_16_32_ns(-0x7642 , inp_1qi)

    SMULL           x5, w5, w21
    ASR             x5, x5, #16
//  SMULWT      x5, x5, x11                 //mul_1qi = mpy_16_32_ns(-0x7642 , inp_1qr)

    LDR             w7, [x0, #112]      //inp_2qr = x[28]
    sxtw            x7, w7
    LDR             w8, [x0, #116]      //inp_2qi = x[29]
    sxtw            x8, w8

    //Moved for delay slot
    SMULL           x20, w6, w12
    ASR             x20, x20, #16
    ADD             w5, w5, w20
//    SMLAWB      x5, x6, x12, x5             //mul_1qi += mpy_16_32_ns( 0x30FC , inp_1qi)

    SUB             w6, w8, w7          //(-inp_2qr + inp_2qi)

    SMULL           x6, w6, w14
    ASR             x6, x6, #16
//    SMULWB      x6, x6, x14                 //mul_2qr = mpy_16_32_ns( 0x5A83 , (-inp_2qr + inp_2qi))
    ADD             w7, w8, w7          //(inp_2qr + inp_2qi)

    SMULL           x7, w7, w24
    ASR             x7, x7, #16
//   SMULWT      x7, x7, x14                 //mul_2qi = mpy_16_32_ns(-0x5A83 , (inp_2qr + inp_2qi))

    LDR             w9 , [x0, #120]     //inp_3qr = x[30]
    sxtw            x9, w9
    LDR             w10, [x0, #124]     //inp_3qi = x[31]
    sxtw            x10, w10

    SMULL           x8, w9, w21
    ASR             x8, x8, #16
//    SMULWT      x8, x9 , x11                //mul_3qr = mpy_16_32_ns(-0x7642 , inp_3qr)

    SMULL           x20, w10, w22
    ASR             x20, x20, #16
    ADD             w8, w8, w20
//  SMLAWT      x8, x10, x12, x8            //mul_3qr -= mpy_16_32_ns( 0x30FC , inp_3qi)//

    SMULL           x9, w9, w12
    ASR             x9, x9, #16
//  SMULWB      x9, x9 , x12                //mul_3qi = mpy_16_32_ns( 0x30FC , inp_3qr)

    SMULL           x20, w10, w21
    ASR             x20, x20, #16
    ADD             w9, w9, w20
//  SMLAWT      x9, x10, x11, x9            //mul_3qi += mpy_16_32_ns(-0x7642 , inp_3qi)

    ADD             w10, w2, w6, lsl #1 //sum_0qr  = mul_0qr + (mul_2qr << 1)
    SUB             w2 , w2, w6, lsl #1 //sum_1qr  = mul_0qr - (mul_2qr << 1)
    ADD             w6 , w4, w8         //sum_2qr  = mul_1qr + mul_3qr
    SUB             w4 , w4, w8         //sum_3qr  = mul_1qr - mul_3qr

    ADD             w8 , w3, w7, lsl #1 //sum_0qi  = mul_0qi + (mul_2qi << 1)
    SUB             w3 , w3, w7, lsl #1 //sum_1qi  = mul_0qi - (mul_2qi << 1)
    ADD             w7 , w5, w9         //sum_2qi  = mul_1qi + mul_3qi
    SUB             w5 , w5, w9         //sum_3qi  = mul_1qi - mul_3qi

    ADD             w9 , w10, w6, lsl #1 //sum_0qr + (sum_2qr << 1)
    SUB             w10, w10, w6, lsl #1 //sum_0qr - (sum_2qr << 1)
    ADD             w6 , w2 , w5, lsl #1 //sum_1qr + (sum_3qi << 1)
    SUB             w2 , w2 , w5, lsl #1 //sum_1qr - (sum_3qi << 1)
    STR             w9 , [x1, #24]      //y[6 ] = sum_0qr + (sum_2qr << 1)
    STR             w10, [x1, #88]      //y[22] = sum_0qr - (sum_2qr << 1)
    STR             w6 , [x1, #56]      //y[14] = sum_1qr + (sum_3qi << 1)
    STR             w2 , [x1, #120]     //y[30] = sum_1qr - (sum_3qi << 1)

    ADD             w5 , w8 , w7, lsl #1 //sum_0qi + (sum_2qi << 1)
    SUB             w8 , w8 , w7, lsl #1 //sum_0qi - (sum_2qi << 1)
    SUB             w7 , w3 , w4, lsl #1 //sum_1qi - (sum_3qr << 1)
    ADD             w3 , w3 , w4, lsl #1 //sum_1qi + (sum_3qr << 1)
    STR             w5 , [x1, #28]      //y[6 +1] = sum_0qi + (sum_2qi << 1)
    STR             w8 , [x1, #92]      //y[22+1] = sum_0qi - (sum_2qi << 1)
    STR             w7 , [x1, #60]      //y[14+1] = sum_1qi - (sum_3qr << 1)
    STR             w3 , [x1, #124]     //y[30+1] = sum_1qi + (sum_3qr << 1)

    // LDMFD sp!, {x4-x12,x15}
    ldp             x19, x20, [sp], #16     // undo the extra x19/x20 push from the prologue
    pop_v_regs
    ret
555
556