1//.include "ihevc_neon_macros.s"
// push_v_regs
//   Spills x8-x15, x29 (frame pointer) and x30 (link register) to the stack.
//   Despite the macro name, only general-purpose registers are saved.
//   Uses 80 bytes of stack; sp stays 16-byte aligned.
//   Resulting frame, relative to the new sp:
//     [sp, #0]  x29, x30
//     [sp, #16] x14, x15
//     [sp, #32] x12, x13
//     [sp, #48] x10, x11
//     [sp, #64] x8,  x9
.macro push_v_regs
    stp             x29, x30, [sp, #-80]!   // allocate the whole frame in one step
    stp             x14, x15, [sp, #16]
    stp             x12, x13, [sp, #32]
    stp             x10, x11, [sp, #48]
    stp             x8, x9, [sp, #64]
.endm
// pop_v_regs
//   Reloads x8-x15, x29 and x30 from the 80-byte frame whose lowest slot
//   (at sp) holds x29/x30 and whose highest slot (at sp + 64) holds x8/x9,
//   then releases the frame. Only general-purpose registers are restored.
.macro pop_v_regs
    ldp             x8, x9, [sp, #64]
    ldp             x10, x11, [sp, #48]
    ldp             x12, x13, [sp, #32]
    ldp             x14, x15, [sp, #16]
    ldp             x29, x30, [sp], #80     // last load also frees the frame
.endm
16
// ---------------------------------------------------------------------------
// ixheaacd_postradixcompute4
//
// Target : AArch64, GNU assembler syntax, AAPCS64 calling convention.
//
// In  : x0 = destination pointer (32-bit words are stored through it)
//       x1 = source pointer (32-bit words are loaded through it)
//       x2 = not referenced by this routine
//       w3 = npoints; the routine is hard coded for a value of 16
//            (NOTE(review): presumably a signed 32-bit count - confirm
//            against the C prototype)
// Out : none in registers; 32 words are written starting at the incoming x0.
// Clobbers: x0, x1, x3, x4, w5-w12, w14, NZCV flags.
// Stack: 80 bytes, used by push_v_regs / pop_v_regs.
//
// The loop body runs twice. Each pass reads eight words from the first
// source pointer and eight words from a second source pointer located
// npoints * 2 bytes above it, applies the same add/sub butterfly to each
// group, and scatters the eight results as four pairs placed
// POSTRADIX4_ROW_BYTES apart in the destination.
// ---------------------------------------------------------------------------

.equ POSTRADIX4_PAIR_BYTES, 8           // two 32-bit results stored side by side
.equ POSTRADIX4_ROW_BYTES, 32           // distance between consecutive output rows
.equ POSTRADIX4_IN_BLOCK_BYTES, 32      // eight 32-bit words consumed per butterfly

// postradix4_butterfly src
//   Loads eight words x_0..x_7 through the register named by src
//   (post-incremented by 32 bytes in total), combines them with adds and
//   subtracts only, and stores four result pairs through x0 at offsets
//   0, 32, 64 and 96 from its entry value. On exit x0 is 8 bytes above its
//   entry value, i.e. at the next pair slot of the first row.
//   Clobbers w5-w12 and w14; flags are untouched.
.macro postradix4_butterfly src
    ldp             w5, w6, [\src], #8      // x_0 : x_1
    ldp             w7, w8, [\src], #8      // x_2 : x_3
    ldp             w9, w10, [\src], #8     // x_4 : x_5
    ldp             w11, w12, [\src], #8    // x_6 : x_7

    add             w14, w5, w9             // xh0_0 = x_0 + x_4
    sub             w5, w5, w9              // xl0_0 = x_0 - x_4
    add             w9, w6, w10             // xh1_0 = x_1 + x_5
    sub             w6, w6, w10             // xl1_0 = x_1 - x_5
    add             w10, w7, w11            // xh0_1 = x_2 + x_6
    sub             w7, w7, w11             // xl0_1 = x_2 - x_6
    add             w11, w8, w12            // xh1_1 = x_3 + x_7
    sub             w8, w8, w12             // xl1_1 = x_3 - x_7

    add             w12, w14, w10           // n00 = xh0_0 + xh0_1
    sub             w14, w14, w10           // n20 = xh0_0 - xh0_1
    add             w10, w9, w11            // n01 = xh1_0 + xh1_1
    sub             w9, w9, w11             // n21 = xh1_0 - xh1_1
    add             w11, w5, w8             // n10 = xl0_0 + xl1_1
    sub             w5, w5, w8              // n30 = xl0_0 - xl1_1
    add             w8, w6, w7              // n31 = xl1_0 + xl0_1
    sub             w6, w6, w7              // n11 = xl1_0 - xl0_1

    // Each stp writes one pair and steps x0 to the same column of the next row.
    stp             w12, w10, [x0], #POSTRADIX4_ROW_BYTES   // row 0: n00, n01
    stp             w11, w6, [x0], #POSTRADIX4_ROW_BYTES    // row 1: n10, n11
    stp             w14, w9, [x0], #POSTRADIX4_ROW_BYTES    // row 2: n20, n21
    // After the last pair, rewind three rows and advance one pair so that x0
    // points at the next free pair slot in row 0.
    stp             w5, w8, [x0], #(POSTRADIX4_PAIR_BYTES - 3 * POSTRADIX4_ROW_BYTES)   // row 3: n30, n31
.endm

.text
.p2align 2
        .global ixheaacd_postradixcompute4

ixheaacd_postradixcompute4:

    // Saves x8-x15, x29 and x30. Under AAPCS64 x8-x15 are caller-saved and
    // this routine never writes x29/x30; the save/restore is retained so
    // that register preservation seen by existing callers does not change.
    push_v_regs

    // Second source pointer = first source pointer + npoints * 2 bytes.
    // npoints is a 32-bit argument, and AAPCS64 leaves bits [63:32] of x3
    // unspecified, so w3 is sign-extended here instead of reading all of x3.
    add             x4, x1, w3, sxtw #1
    mov             w3, #2                  // loop counter: two passes

POSTRADIX4_START:

    postradix4_butterfly x1                 // eight words from the first source
    postradix4_butterfly x4                 // eight words from the second source

    // Each pointer has already moved past the block it consumed; skip the
    // block that the other pointer handled during this pass.
    add             x1, x1, #POSTRADIX4_IN_BLOCK_BYTES
    add             x4, x4, #POSTRADIX4_IN_BLOCK_BYTES

    subs            w3, w3, #1
    b.gt            POSTRADIX4_START

    pop_v_regs
    ret
147
148
149