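// C prototype of the routine implemented in this file
// (exported below under the symbol ixheaacd_inv_dit_fft_8pt_armv8):
//
// VOID ixheaacd_inv_dit_fft_8pt(WORD32 *y,
//                               WORD32 *real,
//                               WORD32 *imag)
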
// push_v_regs
// Saves q8-q15 (the full 128-bit registers) on the stack with four
// pre-decrementing pair stores. sp drops by 4 * 32 = 128 bytes in total,
// so its alignment modulo 16 is unchanged.
// Resulting layout, from the new sp upwards:
//   [sp, #0]  q14, q15
//   [sp, #32] q12, q13
//   [sp, #64] q10, q11
//   [sp, #96] q8,  q9
// Takes no arguments; modifies sp and memory only.
.macro push_v_regs
    stp             q8, q9, [sp, #-32]!
    stp             q10, q11, [sp, #-32]!
    stp             q12, q13, [sp, #-32]!
    stp             q14, q15, [sp, #-32]!
.endm
// pop_v_regs
// Reloads q8-q15 with four post-incrementing pair loads, in the reverse
// order of the stores in push_v_regs, and raises sp by 4 * 32 = 128 bytes.
// Must be paired with exactly one preceding push_v_regs with sp unchanged
// in between, otherwise the registers are reloaded from the wrong slots.
.macro pop_v_regs
    ldp             q14, q15, [sp], #32
    ldp             q12, q13, [sp], #32
    ldp             q10, q11, [sp], #32
    ldp             q8, q9, [sp], #32
.endm


.text
.global ixheaacd_inv_dit_fft_8pt_armv8

//-----------------------------------------------------------------------
// ixheaacd_inv_dit_fft_8pt_armv8
//
// 8-point butterfly network on interleaved 32-bit data, using saturating
// adds/subtracts (SQADD/SQSUB) throughout.
//
// In:    x0 = y     base of 16 consecutive 32-bit words (y[0]..y[15]), read
//        x1 = real  base of  6 consecutive 32-bit words, written
//        x2 = imag  base of  6 consecutive 32-bit words, written
// Out:   nothing is returned in registers
// Clobb: x0 and x6 (advanced by 64), x1 and x2 (advanced by 24),
//        w3, w4, x5, v0-v7.
//        v8-v15 are saved and restored by push_v_regs / pop_v_regs.
// Stack: 128 bytes, used only by push_v_regs / pop_v_regs.
//
// Lane convention: every vector holds two 32-bit values; a name such as
// x0_8 means lane 0 = x0 and lane 1 = x8. For the final real_imagK
// vectors lane 0 is stored to real[] and lane 1 to imag[].
//-----------------------------------------------------------------------
ixheaacd_inv_dit_fft_8pt_armv8:
    push_v_regs

    // Twiddle constant: 0x5A82 << 16, with 0x5A82 = 23170 ~= 2^15 / sqrt(2).
    MOV             w3, #0x5A820000
    DUP             v0.2s, w3

    // x0 walks the even-indexed words, x6 the odd-indexed words; both
    // advance by 8 bytes (two words) per load.
    MOV             x5, #8
    ADD             x6, x0, #4

    LD1             {v1.s}[0], [x0], x5 // y[0]
    LD1             {v2.s}[0], [x6], x5 // y[1]
    LD1             {v1.s}[1], [x0], x5 // y[2]
    LD1             {v2.s}[1], [x6], x5 // y[3]
    LD1             {v3.s}[0], [x0], x5 // y[4]
    LD1             {v4.s}[0], [x6], x5 // y[5]
    LD1             {v3.s}[1], [x0], x5 // y[6]
    LD1             {v4.s}[1], [x6], x5 // y[7]
    LD1             {v5.s}[0], [x0], x5 // y[8]
    LD1             {v6.s}[0], [x6], x5 // y[9]
    LD1             {v5.s}[1], [x0], x5 // y[10]
    LD1             {v6.s}[1], [x6], x5 // y[11]
    LD1             {v7.s}[0], [x0], x5 // y[12]
    LD1             {v8.s}[0], [x6], x5 // y[13]
    LD1             {v7.s}[1], [x0], x5 // y[14]
    LD1             {v8.s}[1], [x6], x5 // y[15]

    //v1 - y0_2
    //v2 - y1_3
    //v3 - y4_6
    //v4 - y5_7
    //v5 - y8_10
    //v6 - y9_11
    //v7 - y12_14
    //v8 - y13_15

    // First butterfly stage: sums, then differences.
    SQADD           v9.2s, v1.2s, v5.2s //a00_v = vqadd_s32(y0_2,y8_10);
    SQADD           v10.2s, v2.2s, v6.2s //a20_v = vqadd_s32(y1_3,y9_11);
    SQADD           v11.2s, v3.2s, v7.2s //a10_v = vqadd_s32(y4_6,y12_14);
    SQADD           v12.2s, v4.2s, v8.2s //a30_v = vqadd_s32(y5_7,y13_15);

    SQSUB           v1.2s, v1.2s, v5.2s //a0_v  = vqsub_s32(y0_2,y8_10);
    SQSUB           v5.2s, v2.2s, v6.2s //a3_v  = vqsub_s32(y1_3,y9_11);
    SQSUB           v2.2s, v3.2s, v7.2s //a2_v  = vqsub_s32(y4_6,y12_14);
    SQSUB           v6.2s, v4.2s, v8.2s //a1_v  = vqsub_s32(y5_7,y13_15);

    // Second butterfly stage.
    SQADD           v3.2s, v9.2s, v11.2s //x0_8  = vqadd_s32(a00_v,a10_v);
    SQADD           v7.2s, v10.2s, v12.2s //x1_9  = vqadd_s32(a20_v,a30_v);

    SQSUB           v4.2s, v9.2s, v11.2s //x4_12 = vqsub_s32(a00_v,a10_v);
    SQSUB           v8.2s, v10.2s, v12.2s //x5_13 = vqsub_s32(a20_v,a30_v);

    SQADD           v9.2s, v1.2s, v6.2s //x6_14 = vqadd_s32(a0_v,a1_v);
    SQADD           v11.2s, v5.2s, v2.2s //x3_11 = vqadd_s32(a3_v,a2_v);
    SQSUB           v10.2s, v1.2s, v6.2s //x2_10 = vqsub_s32(a0_v,a1_v);
    SQSUB           v13.2s, v5.2s, v2.2s //x7_15 = vqsub_s32(a3_v,a2_v);

    // Regroup lanes so that each vector holds one (re, im)-style pair.
    UZP1            v1.2s, v3.2s, v7.2s //x0_1 = vuzp1_s32(x0_8,x1_9);
    UZP2            v5.2s, v3.2s, v7.2s //x8_9 = vuzp2_s32(x0_8,x1_9);

    UZP1            v6.2s, v4.2s, v8.2s //x4_5      = vuzp1_s32(x4_12,x5_13);
    UZP2            v7.2s, v4.2s, v8.2s //x12_13    = vuzp2_s32(x4_12,x5_13);
    REV64           v7.2s, v7.2s        //x13_12    = vrev64_s32(x12_13);

    SQADD           v3.2s, v1.2s, v5.2s //real_imag0 = vqadd_s32(x0_1,x8_9);
    SQSUB           v8.2s, v1.2s, v5.2s //a00_10_v = vqsub_s32(x0_1,x8_9);

    SQADD           v12.2s, v6.2s, v7.2s //real_imag4 = vqadd_s32(x4_5,x13_12);
    SQSUB           v14.2s, v6.2s, v7.2s //a0_1_v    = vqsub_s32(x4_5,x13_12);

    // Swap lane 1 between v12 and v14 so that
    //   v12 = (x4 + x13, x5 - x12)   real_imag4
    //   v14 = (x4 - x13, x5 + x12)   a0_1_v
    MOV             w4, v12.s[1]
    MOV             v12.s[1], v14.s[1]
    MOV             v14.s[1], w4

    UZP1            v6.2s, v10.2s, v11.2s //x2_3

    // Only lane 1 of the next two results (x10 -/+ x11) is consumed below.
    SQSUB           v1.2s, v10.2s, v11.2s //tempr = vqsub_s32(x2_10,x3_11)
    SQADD           v5.2s, v10.2s, v11.2s //tempi = vqadd_s32(x2_10,x3_11)

    // Scale by the twiddle constant: widen to a 64-bit product, keep the
    // high 32 bits, then double it: ((a * c) >> 32) << 1.
    SMULL           v7.2d, v1.2s, v0.2s
    SMULL           v10.2d, v5.2s, v0.2s

    SSHR            v7.2d, v7.2d, #32   //tempr_q
    SSHR            v10.2d, v10.2d, #32 //tempi_q

    SHL             v7.4s, v7.4s, #1
    SHL             v10.4s, v10.4s, #1

    // s[2] is the low word of 64-bit lane 1, i.e. the scaled product that
    // came from input lane 1.
    MOV             v1.s[0], v7.s[2]
    MOV             v1.s[1], v10.s[2]   //vr_i

    SQSUB           v7.2s, v6.2s, v1.2s //a2_3_v = vqsub_s32(x2_3,vr_i);
    SQADD           v4.2s, v6.2s, v1.2s //real_imag1 = vqadd_s32(x2_3,vr_i);
    SQADD           v5.2s, v14.2s, v7.2s //real_imag2 = vqadd_s32(a0_1_v,a2_3_v);

    UZP1            v1.2s, v9.2s, v13.2s //x6_7
    // Again only lane 1 (x14 +/- x15) is consumed.
    SQADD           v6.2s, v9.2s, v13.2s //tempr = vqadd_s32(x6_14,x7_15);
    SQSUB           v14.2s, v9.2s, v13.2s //tempi = vqsub_s32(x6_14,x7_15);

    // Same scaling sequence as above.
    SMULL           v9.2d, v6.2s, v0.2s
    SMULL           v13.2d, v14.2s, v0.2s

    SSHR            v9.2d, v9.2d, #32
    SSHR            v13.2d, v13.2d, #32

    SHL             v9.4s, v9.4s, #1
    SHL             v13.4s, v13.4s, #1

    // v0 is reused for (vr, vi); the constant is not needed after this.
    MOV             v0.s[0], v9.s[2]
    MOV             v0.s[1], v13.s[2]

    SQSUB           v9.2s, v1.2s, v0.2s // a20_30_v
    SQADD           v13.2s, v1.2s, v0.2s //real_imag5

    // Swap lane 1 between v9 and v13 so that
    //   v9  = (x6 - vr, x7 + vi)   a20_30_v
    //   v13 = (x6 + vr, x7 - vi)   real_imag5
    MOV             w4, v9.s[1]
    MOV             v9.s[1], v13.s[1]
    MOV             v13.s[1], w4

    SQADD           v6.2s, v9.2s, v8.2s //real_imag3

    // Lane 0 of real_imag0..3, 4, 5 -> real[0..5]
    ST1             {v3.s}[0], [x1], #4
    ST1             {v4.s}[0], [x1], #4
    ST1             {v5.s}[0], [x1], #4
    ST1             {v6.s}[0], [x1], #4
    ST1             {v12.s}[0], [x1], #4
    ST1             {v13.s}[0], [x1], #4

    // Lane 1 of the same vectors -> imag[0..5]
    ST1             {v3.s}[1], [x2], #4
    ST1             {v4.s}[1], [x2], #4
    ST1             {v5.s}[1], [x2], #4
    ST1             {v6.s}[1], [x2], #4
    ST1             {v12.s}[1], [x2], #4
    ST1             {v13.s}[1], [x2], #4

    pop_v_regs
    ret