//----------------------------------------------------------------------
// push_v_regs
//
// Spills q8-q15 (all 128 bits of each) followed by x8-x21, x29 and x30
// onto the stack with pre-indexed store-pair instructions.
//
// Stack use: 4 * 32 + 8 * 16 = 256 bytes.  Every adjustment is a multiple
// of 16, so sp keeps whatever 16-byte alignment it had on entry.
//
// Must be matched by pop_v_regs, which reloads the same pairs in exactly
// the reverse order.  Do not reorder or drop a pair here without making
// the mirror-image change there.
//
// NOTE(review): AAPCS64 only requires a callee to preserve x19-x28 and
// the low 64 bits of v8-v15; x8-x17 are caller-saved and x18 is the
// platform register.  The wider save set is kept as-is because other
// users of this macro may rely on it - confirm before trimming.
//----------------------------------------------------------------------
.macro push_v_regs
    stp             q8,  q9,  [sp, #-32]!
    stp             q10, q11, [sp, #-32]!
    stp             q12, q13, [sp, #-32]!
    stp             q14, q15, [sp, #-32]!
    stp             x8,  x9,  [sp, #-16]!
    stp             x10, x11, [sp, #-16]!
    stp             x12, x13, [sp, #-16]!
    stp             x14, x15, [sp, #-16]!
    stp             x16, x17, [sp, #-16]!
    stp             x18, x19, [sp, #-16]!
    stp             x20, x21, [sp, #-16]!
    stp             x29, x30, [sp, #-16]!
.endm
15
//----------------------------------------------------------------------
// pop_v_regs
//
// Reloads the registers spilled by push_v_regs using post-indexed
// load-pair instructions: x29/x30 first, then x20/x21 down to x8/x9,
// then q14/q15 down to q8/q9.  This is the exact reverse of the store
// order in push_v_regs, so the two macros must always be edited together.
//
// Releases 8 * 16 + 4 * 32 = 256 bytes of stack; sp ends up at the value
// it had immediately before the matching push_v_regs.
//----------------------------------------------------------------------
.macro pop_v_regs
    ldp             x29, x30, [sp], #16
    ldp             x20, x21, [sp], #16
    ldp             x18, x19, [sp], #16
    ldp             x16, x17, [sp], #16
    ldp             x14, x15, [sp], #16
    ldp             x12, x13, [sp], #16
    ldp             x10, x11, [sp], #16
    ldp             x8,  x9,  [sp], #16
    ldp             q14, q15, [sp], #32
    ldp             q12, q13, [sp], #32
    ldp             q10, q11, [sp], #32
    ldp             q8,  q9,  [sp], #32
.endm
30
.text
.p2align 2

// Every 32-bit word read or written at address p is accompanied by a
// second word at p + CSM_UPPER_OFFSET (bytes).
.equ CSM_UPPER_OFFSET, 256

//----------------------------------------------------------------------
// csm_load_mul  (private helper of ixheaacd_cos_sin_mod_loop1)
//
// One load-and-multiply step.
//
// Reads   c0, c1 : two consecutive 16-bit coefficients at [x2]
//         a0, a1 : 32-bit words at [x0] and [x0 + CSM_UPPER_OFFSET]
//         b0, b1 : 32-bit words at [x4] and [x4 + CSM_UPPER_OFFSET]
// Updates x2 += 4, x0 += 4, x4 -= 4; x7 is scratch.
//
// Produces (each 64-bit product arithmetically shifted right by 16):
//         v4  = { c0*a0, c1*a1 }
//         v6  = { c0*b0, c1*b1 }
//         v8  = { c1*a0, c0*a1 }
//         v10 = { c1*b0, c0*b1 }
// Callers consume these through the .4s view, i.e. lanes 0 and 2 hold the
// low 32 bits of the two results.
//
// NOTE(review): v0 is cleared first and the coefficients are inserted into
// halfword lanes 0 and 2 only, so each coefficient is zero-extended to
// 32 bits before the signed multiply.  A coefficient with bit 15 set is
// therefore treated as a positive value - confirm the table never
// contains negative entries.
//----------------------------------------------------------------------
.macro csm_load_mul
    movi            v0.16b, #0                      // lanes 1 and 3 must stay zero
    ld1             {v0.h}[0], [x2], #2             // c0
    ld1             {v0.h}[2], [x2], #2             // c1
    rev64           v1.2s, v0.2s                    // v1 = { c1, c0 }

    ld1             {v2.s}[0], [x0], #4             // a0, then advance x0
    add             x7, x0, #(CSM_UPPER_OFFSET - 4) // x0 already moved by 4
    ld1             {v2.s}[1], [x7]                 // a1
    ld1             {v3.s}[0], [x4]                 // b0
    add             x7, x4, #CSM_UPPER_OFFSET
    ld1             {v3.s}[1], [x7]                 // b1
    sub             x4, x4, #4

    smull           v4.2d,  v0.2s, v2.2s
    sshr            v4.2d,  v4.2d,  #16
    smull           v6.2d,  v0.2s, v3.2s
    sshr            v6.2d,  v6.2d,  #16
    smull           v8.2d,  v1.2s, v2.2s
    sshr            v8.2d,  v8.2d,  #16
    smull           v10.2d, v1.2s, v3.2s
    sshr            v10.2d, v10.2d, #16
.endm

//----------------------------------------------------------------------
// csm_store_up  (private helper of ixheaacd_cos_sin_mod_loop1)
//
// With S = v0 and D = v2 (as .4s), writes
//         [x3]                        = S[0], D[0]
//         [x3 + CSM_UPPER_OFFSET]     = D[2], S[2]
// and then advances x3 by 8.  ST2 needs consecutive registers, hence the
// copies into v1 (= D) and v3 (= S).  x7 is scratch.
//----------------------------------------------------------------------
.macro csm_store_up
    mov             v3.16b, v0.16b
    mov             v1.16b, v2.16b
    st2             {v0.s, v1.s}[0], [x3], #8
    add             x7, x3, #(CSM_UPPER_OFFSET - 8) // x3 already moved by 8
    st2             {v2.s, v3.s}[2], [x7]
.endm

//----------------------------------------------------------------------
// csm_store_down  (private helper of ixheaacd_cos_sin_mod_loop1)
//
// Same layout as csm_store_up but through x5, which then moves back by 8:
//         [x5]                        = S[0], D[0]
//         [x5 + CSM_UPPER_OFFSET]     = D[2], S[2]
//----------------------------------------------------------------------
.macro csm_store_down
    mov             v3.16b, v0.16b
    mov             v1.16b, v2.16b
    st2             {v0.s, v1.s}[0], [x5]
    add             x7, x5, #CSM_UPPER_OFFSET
    st2             {v2.s, v3.s}[2], [x7]
    sub             x5, x5, #8
.endm

//----------------------------------------------------------------------
// ixheaacd_cos_sin_mod_loop1
//
// In:   x0 = source buffer of 32-bit words, read ascending
//       x1 = element count (see notes)
//       x2 = table of 16-bit coefficients, read ascending, two per step
//       x3 = destination buffer of 32-bit words, written ascending
// Out:  nothing is returned in a general register; results are stored
//       through x3 and x5.
// Clobbers: x0, x2-x7, v0-v4, v6, flags.  x19 and v8/v10 are used by the
//       save/restore macros' contract and come back unchanged.
//
// Derived pointers:
//       x4 = x0 + 8*x1 - 4   second source pointer, read descending
//       x5 = x3 + 8*x1 - 8   second destination pointer, written descending
//       x6 = x1 >> 2         loop trip count
//
// Each loop pass runs four steps (up, down, up, down).  With the products
// named as in csm_load_mul:
//       "up" step    S = v8  + v6        D = sat(v10 - v4)   -> via x3
//       "down" step  S = v10 + v4        D = sat(v8  - v6)   -> via x5
// The sum wraps, the difference saturates (ADD vs SQSUB on 32-bit lanes).
//
// Notes:
//  * The loop is bottom-tested, so one full pass (four steps) always
//    executes even when x1 < 4.
//  * NOTE(review): all 64 bits of x1 are used.  If the C prototype
//    declares this argument as a 32-bit integer, AAPCS64 does not define
//    the upper half - confirm against the caller.
//----------------------------------------------------------------------
    .global ixheaacd_cos_sin_mod_loop1
ixheaacd_cos_sin_mod_loop1:
    push_v_regs                                     // preserves v8/v10 used below

    add             x4, x0, x1, lsl #3
    sub             x4, x4, #4                      // last 32-bit word of the 8*x1 span
    add             x5, x3, x1, lsl #3
    sub             x5, x5, #8                      // last 2-word slot of the 8*x1 span
    asr             x6, x1, #2

LOOP1:
    // step 1: ascending store
    csm_load_mul
    add             v0.4s, v8.4s,  v6.4s
    sqsub           v2.4s, v10.4s, v4.4s
    csm_store_up

    // step 2: descending store
    csm_load_mul
    add             v0.4s, v10.4s, v4.4s
    sqsub           v2.4s, v8.4s,  v6.4s
    csm_store_down

    // step 3: ascending store
    csm_load_mul
    add             v0.4s, v8.4s,  v6.4s
    sqsub           v2.4s, v10.4s, v4.4s
    csm_store_up

    // step 4: descending store
    csm_load_mul
    add             v0.4s, v10.4s, v4.4s
    sqsub           v2.4s, v8.4s,  v6.4s
    csm_store_down

    subs            x6, x6, #1
    b.gt            LOOP1

    movi            v0.16b, #0                      // v0 is left cleared on return
    pop_v_regs
    ret
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232