//------------------------------------------------------------------------------
// push_v_regs
//
// Reserves a 256-byte frame and spills q8-q15, x8-x21, x29 and x30 into it.
// Despite the name, general-purpose registers are saved as well as the
// vector registers.
//
// Frame layout after expansion (byte offsets from the new sp):
//   +224 q8,  q9     +192 q10, q11    +160 q12, q13    +128 q14, q15
//   +112 x8,  x9     + 96 x10, x11    + 80 x12, x13    + 64 x14, x15
//   + 48 x16, x17    + 32 x18, x19    + 16 x20, x21    +  0 x29, x30
//
// pop_v_regs reads these exact slots, so the two macros must be changed
// together. sp is lowered by 256, which keeps it 16-byte aligned.
//------------------------------------------------------------------------------
.macro push_v_regs
    sub             sp, sp, #256            // reserve the whole frame in one step
    stp             q8, q9, [sp, #224]
    stp             q10, q11, [sp, #192]
    stp             q12, q13, [sp, #160]
    stp             q14, q15, [sp, #128]
    stp             x8, x9, [sp, #112]
    stp             x10, x11, [sp, #96]
    stp             x12, x13, [sp, #80]
    stp             x14, x15, [sp, #64]
    stp             x16, x17, [sp, #48]
    stp             x18, x19, [sp, #32]
    stp             x20, x21, [sp, #16]
    stp             x29, x30, [sp]
.endm
15
//------------------------------------------------------------------------------
// pop_v_regs
//
// Reloads x29, x30, x8-x21 and q8-q15 from the 256-byte frame written by
// push_v_regs, then releases the frame.
//
// Slots read (byte offsets from sp on entry to the macro):
//   +  0 x29, x30    + 16 x20, x21    + 32 x18, x19    + 48 x16, x17
//   + 64 x14, x15    + 80 x12, x13    + 96 x10, x11    +112 x8,  x9
//   +128 q14, q15    +160 q12, q13    +192 q10, q11    +224 q8,  q9
//
// All loads happen before sp is raised, so nothing below sp is ever read.
//------------------------------------------------------------------------------
.macro pop_v_regs
    ldp             x29, x30, [sp]
    ldp             x20, x21, [sp, #16]
    ldp             x18, x19, [sp, #32]
    ldp             x16, x17, [sp, #48]
    ldp             x14, x15, [sp, #64]
    ldp             x12, x13, [sp, #80]
    ldp             x10, x11, [sp, #96]
    ldp             x8, x9, [sp, #112]
    ldp             q14, q15, [sp, #128]
    ldp             q12, q13, [sp, #160]
    ldp             q10, q11, [sp, #192]
    ldp             q8, q9, [sp, #224]
    add             sp, sp, #256            // release the whole frame in one step
.endm
30
.text
.p2align 2

//------------------------------------------------------------------------------
// ixheaacd_cos_sin_mod_loop2
//
// In-place butterfly pass over two regions of 32-bit words: one starting at
// x0 and one starting at x0 + 256 bytes. Each region is walked from both ends
// towards the middle. Every half-iteration consumes one pair of 16-bit
// coefficients from x1 and forms (coefficient * sample) >> 16 products.
//
// Inputs
//   x0 = subband  : base of the first region; the second region is x0 + 256
//   x1 = coefficient table: interleaved 16-bit pairs, 4 bytes read per pair
//   w2 = M        : treated as a signed 32-bit count
//                   NOTE(review): width inferred from the original
//                   "ixheaacd_shx32(M, 1)" comment - confirm against the
//                   C prototype.
//
// Register roles
//   x0  ascending pointer, first region      x3  descending pointer, first region
//   x10 ascending pointer, second region     x11 descending pointer, second region
//   x1  coefficient pointer (post-incremented by 4 per pair)
//   x4  loop counter, initialised to (M >> 1) - 1
//   x8  constant -4, used as the post-index stride for descending stores
//   v0 / v1  first / second coefficient of the current pair, sign-extended to
//            32 bits and replicated in both .2s lanes
//   v2 / v3  sample pairs; lane 0 = first region, lane 1 = second region
//
// Arithmetic notes
//   * Products are kept as 64-bit lanes; the saturating subtracts therefore
//     saturate at 64 bits, and each store writes only the low 32 bits of a
//     lane (.s[0] for lane 0, .s[2] for lane 1).
//   * SQNEG is applied on .4s, i.e. to the low 32-bit word that gets stored.
//
// Loop count
//   The loop is bottom-tested: the body always runs once and repeats while
//   the decremented counter is > 0.
//   NOTE(review): for (M >> 1) - 1 <= 0 the body still executes one time;
//   callers presumably pass M >= 4 - confirm.
//
// Modified on return: x0-x7, v0-v4, v6, v16, NZCV flags.
// (x8-x11 and v8-v15 are also used but are restored by pop_v_regs.)
//------------------------------------------------------------------------------
    .global ixheaacd_cos_sin_mod_loop2
ixheaacd_cos_sin_mod_loop2:

    push_v_regs

    // The upper 32 bits of a 32-bit argument are unspecified under AAPCS64;
    // sign-extend M before it is used in 64-bit address arithmetic.
    sxtw            x2, w2

    // ---- address generation -------------------------------------------------
    add             x3, x0, x2, lsl #3      // psubband1 = &subband[2 * M - 1]
    sub             x3, x3, #4
    add             x10, x0, #256           // second region base
    add             x11, x10, x2, lsl #3    // last word of second region
    sub             x11, x11, #4
    mov             x8, #-4                 // stride for descending stores

    // ---- first region: boundary samples -------------------------------------
    ldr             w6, [x0]
    asr             x4, x2, #1              // M_2 = M >> 1
    sub             x4, x4, #1              // loop counter = M_2 - 1

    asr             w6, w6, #1              // *psubband = *psubband >> 1
    ld1             {v2.s}[0], [x3]         // re = *psubband1 (read before it is overwritten)

    str             w6, [x0], #4            // psubband++
    ldr             w7, [x0]
    asr             w7, w7, #1
    neg             w6, w7                  // -(*psubband >> 1)
    str             w6, [x3], #-4           // *psubband1-- = negated value
    ld1             {v3.s}[0], [x3]         // im = *psubband1

    // ---- first coefficient pair ---------------------------------------------
    // Only halfword lane 0 is loaded. SXTL followed by DUP from .s[0] defines
    // every lane that is consumed later, so no prior zeroing is required.
    ld2             {v0.h, v1.h}[0], [x1], #4
    sxtl            v0.4s, v0.4h
    sxtl            v1.4s, v1.4h
    dup             v0.2s, v0.s[0]
    dup             v1.2s, v1.s[0]

    // ---- second region: boundary samples ------------------------------------
    ld1             {v2.s}[1], [x11]        // re for second region (read before overwrite)

    ld1             {v4.s}[0], [x10]
    sshr            v4.2s, v4.2s, #1
    movi            v6.2s, #0
    sqsub           v4.2s, v6.2s, v4.2s     // saturating 0 - (x >> 1); only lane 0 is stored
    st1             {v4.s}[0], [x11]
    sub             x11, x11, #4

    ldr             w6, [x10, #4]           // next word of second region
    asr             w6, w6, #1
    str             w6, [x10], #4           // stored one word earlier, then advance

    ld1             {v3.s}[1], [x11]        // im for second region

    // ---- boundary butterfly -------------------------------------------------
    smull           v4.2d, v0.2s, v2.2s     // c0 * re
    sshr            v4.2d, v4.2d, #16
    smull           v6.2d, v0.2s, v3.2s     // c0 * im
    sshr            v6.2d, v6.2d, #16
    smull           v8.2d, v1.2s, v2.2s     // c1 * re
    sshr            v8.2d, v8.2d, #16
    smull           v10.2d, v1.2s, v3.2s    // c1 * im
    sshr            v10.2d, v10.2d, #16

    add             v12.2d, v8.2d, v6.2d
    sqsub           v14.2d, v10.2d, v4.2d
    sqsub           v16.2d, v4.2d, v10.2d

    st1             {v12.s}[0], [x3], x8
    st1             {v14.s}[0], [x0], #4

    sqneg           v12.4s, v12.4s

    st1             {v12.s}[2], [x10], #4
    st1             {v16.s}[2], [x11], x8

LOOP1:
    // ---- first half: pair taken from the ascending pointers -----------------
    ld1             {v2.2s}, [x0]
    ld1             {v3.2s}, [x10]
    ldr             w5, [x3]                // descending sample, first region (used in second half)
    ldr             w6, [x11]               // descending sample, second region (used in second half)

    // Transpose so that v2 = {region1[0], region2[0]}, v3 = {region1[1], region2[1]}.
    trn1            v4.2s, v2.2s, v3.2s
    trn2            v3.2s, v2.2s, v3.2s
    mov             v2.8b, v4.8b

    smull           v4.2d, v0.2s, v2.2s
    sshr            v4.2d, v4.2d, #16
    smull           v6.2d, v0.2s, v3.2s
    sshr            v6.2d, v6.2d, #16
    smull           v8.2d, v1.2s, v2.2s
    sshr            v8.2d, v8.2d, #16
    smull           v10.2d, v1.2s, v3.2s
    sshr            v10.2d, v10.2d, #16

    add             v12.2d, v8.2d, v6.2d
    sqsub           v14.2d, v4.2d, v10.2d
    sqsub           v16.2d, v10.2d, v4.2d

    st1             {v12.s}[0], [x0], #4
    st1             {v14.s}[0], [x3], x8
    sqneg           v12.4s, v12.4s

    st1             {v12.s}[2], [x11], x8
    st1             {v16.s}[2], [x10], #4

    // ---- second half: next coefficient pair, descending pointers ------------
    ld2             {v0.h, v1.h}[0], [x1], #4
    sxtl            v0.4s, v0.4h
    sxtl            v1.4s, v1.4h
    dup             v0.2s, v0.s[0]
    dup             v1.2s, v1.s[0]

    mov             v3.s[0], w5             // samples captured before the first-half stores
    mov             v3.s[1], w6
    ld1             {v2.s}[0], [x3]
    ld1             {v2.s}[1], [x11]

    smull           v4.2d, v0.2s, v2.2s
    sshr            v4.2d, v4.2d, #16
    smull           v6.2d, v0.2s, v3.2s
    sshr            v6.2d, v6.2d, #16
    smull           v8.2d, v1.2s, v2.2s
    sshr            v8.2d, v8.2d, #16
    smull           v10.2d, v1.2s, v3.2s
    sshr            v10.2d, v10.2d, #16

    add             v12.2d, v4.2d, v10.2d
    sqsub           v14.2d, v8.2d, v6.2d
    sqsub           v16.2d, v6.2d, v8.2d

    st1             {v12.s}[0], [x3], x8
    st1             {v14.s}[0], [x0], #4

    sqneg           v12.4s, v12.4s

    subs            x4, x4, #1              // flags survive the stores below
    st1             {v12.s}[2], [x10], #4
    st1             {v16.s}[2], [x11], x8

    bgt             LOOP1

    pop_v_regs
    ret
214