1///******************************************************************************
2// *
3// * Copyright (C) 2018 The Android Open Source Project
4// *
5// * Licensed under the Apache License, Version 2.0 (the "License");
6// * you may not use this file except in compliance with the License.
7// * You may obtain a copy of the License at:
8// *
9// * http://www.apache.org/licenses/LICENSE-2.0
10// *
11// * Unless required by applicable law or agreed to in writing, software
12// * distributed under the License is distributed on an "AS IS" BASIS,
13// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// * See the License for the specific language governing permissions and
15// * limitations under the License.
16// *
17// *****************************************************************************
18// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20
21
// push_v_regs: open a 160-byte stack frame and spill q8-q15 (all 128 bits)
// plus x21-x24 into it.
// Frame layout, from the new sp upward:
//   +0   x23, x24      +16  x21, x22
//   +32  q14, q15      +64  q12, q13
//   +96  q10, q11      +128 q8,  q9
// sp moves by a multiple of 16, so its alignment is unchanged.
// The routine visible in this file only uses v8-v15 from this set; it does
// not reference x21-x24.
.macro push_v_regs
    sub             sp, sp, #160
    stp             x23, x24, [sp]
    stp             x21, x22, [sp, #16]
    stp             q14, q15, [sp, #32]
    stp             q12, q13, [sp, #64]
    stp             q10, q11, [sp, #96]
    stp             q8, q9, [sp, #128]
.endm
// pop_v_regs: reload x21-x24 and q8-q15 from the 160-byte frame at sp and
// release it. Expects the layout (from sp upward):
//   +0   x23, x24      +16  x21, x22
//   +32  q14, q15      +64  q12, q13
//   +96  q10, q11      +128 q8,  q9
.macro pop_v_regs
    ldp             q8, q9, [sp, #128]
    ldp             q10, q11, [sp, #96]
    ldp             q12, q13, [sp, #64]
    ldp             q14, q15, [sp, #32]
    ldp             x21, x22, [sp, #16]
    ldp             x23, x24, [sp]
    add             sp, sp, #160
.endm
38
// swp reg1, reg2: exchange the contents of two X registers.
// x16 serves as the temporary and is left holding the old value of reg1,
// so neither operand may itself be x16.
.macro swp reg1, reg2
    mov             x16, \reg1
    mov             \reg1, \reg2
    mov             \reg2, x16
.endm
.text
.global ixheaacd_sbr_qmfsyn64_winadd

//------------------------------------------------------------------------------
// ixheaacd_sbr_qmfsyn64_winadd
//
// Produces 64 16-bit outputs, four at a time (16 groups). Every output is
//     acc  = 0x8000 shifted right by `shift`
//     acc += ten signed 16x16 -> 32-bit products
//     out  = upper halfword of saturating_shift_left(acc, shift)
//
// In (AAPCS64 argument registers):
//   x0 = first 16-bit input; per group, five 4-lane reads 512 bytes apart
//   x1 = second 16-bit input; same read pattern, starting 256 bytes past x1
//   x2 = third 16-bit input, the other factor of every product; reads are
//        256 bytes apart, one set starting at x2 (paired with the x0 reads)
//        and one starting at x2 + 128 (paired with the x1 reads)
//   x3 = output pointer; one halfword is stored per output
//   w4 = shift amount
//   w5 = distance between consecutive outputs, in 16-bit elements
//        NOTE(review): sign-extended from 32 bits below on the assumption
//        that the C prototype declares it as a 32-bit int, like the shift
//        in w4 -- confirm against the caller's declaration.
// All four pointers step 8 bytes (four halfwords) from one group to the next.
//
// Out:      nothing is returned in a register.
// Clobbers: x0-x3, x5-x12, v0-v7, v16-v20, v22, v26, v28, v30, NZCV.
//           q8-q15 are used as scratch but saved/restored by
//           push_v_regs/pop_v_regs.
//
// The code is software-pipelined: while one group is being accumulated the
// previous group's results are stored and the next group's operands are
// loaded. x10/x11 hold the "next group" addresses of whichever pointer pair
// is currently walking across its five reads.
//------------------------------------------------------------------------------
ixheaacd_sbr_qmfsyn64_winadd:

    push_v_regs

    // ---- Constant setup, interleaved with the loads for group 0 -------------
    mov             w7, #0x8000
    ld1             {v0.4h}, [x0], #8
    mov             x12, x2                     // x12 will walk the x2 + 128 read set

    dup             v30.4s, w7
    ld1             {v1.4h}, [x2], #8
    dup             v22.4s, w4                  // v22 = shift in every lane

    mov             x10, x0                     // remember where the next group starts
    mov             x11, x2
    add             x0, x0, #504                // 8 + 504 = 512 bytes to the second read
    add             x2, x2, #248                // 8 + 248 = 256 bytes to the second read

    neg             v28.4s, v22.4s
    sshl            v20.4s, v30.4s, v28.4s      // v20 = 0x8000 >> shift, seeds every accumulator
    add             x12, x12, #128
    add             x1, x1, #256
    mov             x6, #16                     // number of four-output groups
    mov             x9, #256                    // byte spacing of the x2 / x12 reads
    mov             x8, #512                    // byte spacing of the x0 / x1 reads

    // Upper bits of a 32-bit argument register are unspecified; extend before
    // using the value in 64-bit address arithmetic.
    sxtw            x5, w5
    lsl             x5, x5, #1                  // elements -> bytes
    ld1             {v2.4h}, [x0], x8
    mov             v26.16b, v20.16b


    smlal           v26.4s, v0.4h, v1.4h
    ld1             {v3.4h}, [x2], x9

    ld1             {v4.4h}, [x0], x8
    smlal           v26.4s, v2.4h, v3.4h

    ld1             {v5.4h}, [x2], x9

    ld1             {v6.4h}, [x0], x8
    smlal           v26.4s, v5.4h, v4.4h

    ld1             {v7.4h}, [x2], x9

    ld1             {v8.4h}, [x0], x8
    smlal           v26.4s, v7.4h, v6.4h

    ld1             {v9.4h}, [x2], x9
    mov             x0, x10                     // rewind x0/x2 to the next group


    mov             x2, x11
    ld1             {v10.4h}, [x1], #8
    smlal           v26.4s, v9.4h, v8.4h

    mov             x10, x1
    ld1             {v11.4h}, [x12], #8
    add             x1, x1, #504



    mov             x11, x12
    ld1             {v12.4h}, [x1], x8
    add             x12, x12, #248

    smlal           v26.4s, v10.4h, v11.4h
    ld1             {v13.4h}, [x12], x9

    ld1             {v14.4h}, [x1], x8
    smlal           v26.4s, v12.4h, v13.4h

    ld1             {v15.4h}, [x12], x9

    ld1             {v16.4h}, [x1], x8
    smlal           v26.4s, v15.4h, v14.4h

    ld1             {v17.4h}, [x12], x9

    ld1             {v18.4h}, [x1], x8
    smlal           v26.4s, v17.4h, v16.4h

    ld1             {v19.4h}, [x12], x9

    // ---- Group 0 complete; start loading group 1 ----------------------------
    smlal           v26.4s, v19.4h, v18.4h
    ld1             {v0.4h}, [x0], #8
    mov             x12, x11                    // rewind x1/x12 to the next group

    mov             x1, x10
    ld1             {v1.4h}, [x2], #8
    mov             x10, x0

    sqshl           v26.4s, v26.4s, v22.4s      // saturating shift by `shift`

    add             x0, x0, #504

    mov             x11, x2
    ld1             {v2.4h}, [x0], x8
    add             x2, x2, #248

    sshr            v28.4s, v26.4s, #16         // keep the upper halfword of each lane
    ld1             {v3.4h}, [x2], x9


    uzp1            v28.8h, v28.8h, v28.8h      // pack the four results into v28.h[0..3]
    mov             v26.16b, v20.16b            // re-seed the accumulator




    ld1             {v4.4h}, [x0], x8
    ld1             {v5.4h}, [x2], x9

    ld1             {v6.4h}, [x0], x8
    ld1             {v7.4h}, [x2], x9

    ld1             {v8.4h}, [x0], x8
    ld1             {v9.4h}, [x2], x9
    mov             x0, x10


    mov             x2, x11
    ld1             {v10.4h}, [x1], #8

    mov             x10, x1
    ld1             {v11.4h}, [x12], #8
    add             x1, x1, #504


    mov             x11, x12
    ld1             {v12.4h}, [x1], x8
    add             x12, x12, #248


    ld1             {v13.4h}, [x12], x9

    ld1             {v14.4h}, [x1], x8
    ld1             {v15.4h}, [x12], x9

    ld1             {v16.4h}, [x1], x8
    ld1             {v17.4h}, [x12], x9

    ld1             {v18.4h}, [x1], x8
    sub             x6, x6, #2                  // group 0 done here, one more handled after the loop
    ld1             {v19.4h}, [x12], x9
    mov             x1, x10

    mov             x12, x11

    // ---- Main loop: each pass stores two groups and accumulates two groups --
    // On entry: v28.h[0..3] = finished results not yet stored,
    //           v0-v19      = operands of the next group, v26 = rounding seed.
LOOP_1:

    smlal           v26.4s, v0.4h, v1.4h
    st1             {v28.h}[0], [x3], x5

    smlal           v26.4s, v2.4h, v3.4h
    ld1             {v0.4h}, [x0], #8
    smlal           v26.4s, v5.4h, v4.4h

    smlal           v26.4s, v7.4h, v6.4h
    st1             {v28.h}[1], [x3], x5


    mov             x10, x0
    ld1             {v1.4h}, [x2], #8
    add             x0, x0, #504

    smlal           v26.4s, v9.4h, v8.4h
    st1             {v28.h}[2], [x3], x5

    smlal           v26.4s, v10.4h, v11.4h
    st1             {v28.h}[3], [x3], x5

    mov             x11, x2
    ld1             {v2.4h}, [x0], x8
    add             x2, x2, #248

    smlal           v26.4s, v12.4h, v13.4h
    ld1             {v3.4h}, [x2], x9
    smlal           v26.4s, v15.4h, v14.4h

    smlal           v26.4s, v17.4h, v16.4h
    ld1             {v4.4h}, [x0], x8
    smlal           v26.4s, v19.4h, v18.4h

    ld1             {v5.4h}, [x2], x9

    ld1             {v6.4h}, [x0], x8
    sqshl           v26.4s, v26.4s, v22.4s

    sshr            v28.4s, v26.4s, #16
    ld1             {v7.4h}, [x2], x9
    mov             v26.16b, v20.16b


    uzp1            v28.8h, v28.8h, v28.8h
    // Second half of the pass: accumulate the group just loaded.
    smlal           v26.4s, v0.4h, v1.4h

    smlal           v26.4s, v2.4h, v3.4h
    ld1             {v8.4h}, [x0], x8
    smlal           v26.4s, v5.4h, v4.4h

    smlal           v26.4s, v7.4h, v6.4h
    ld1             {v9.4h}, [x2], x9


    ld1             {v10.4h}, [x1], #8
    smlal           v26.4s, v9.4h, v8.4h

    mov             x2, x11
    ld1             {v11.4h}, [x12], #8
    mov             x0, x10

    mov             x10, x1

    add             x1, x1, #504

    mov             x11, x12
    ld1             {v12.4h}, [x1], x8
    add             x12, x12, #248

    ld1             {v13.4h}, [x12], x9
    smlal           v26.4s, v10.4h, v11.4h

    ld1             {v14.4h}, [x1], x8
    smlal           v26.4s, v12.4h, v13.4h

    ld1             {v15.4h}, [x12], x9

    ld1             {v16.4h}, [x1], x8
    smlal           v26.4s, v15.4h, v14.4h

    ld1             {v17.4h}, [x12], x9

    ld1             {v18.4h}, [x1], x8
    smlal           v26.4s, v17.4h, v16.4h

    ld1             {v19.4h}, [x12], x9
    mov             x1, x10

    smlal           v26.4s, v19.4h, v18.4h
    st1             {v28.h}[0], [x3], x5

    mov             x12, x11
    ld1             {v0.4h}, [x0], #8

    ld1             {v1.4h}, [x2], #8
    sqshl           v26.4s, v26.4s, v22.4s


    st1             {v28.h}[1], [x3], x5
    mov             x10, x0

    st1             {v28.h}[2], [x3], x5
    add             x0, x0, #504

    st1             {v28.h}[3], [x3], x5        // last use of the old v28 before it is overwritten
    mov             x11, x2

    sshr            v28.4s, v26.4s, #16
    ld1             {v2.4h}, [x0], x8
    add             x2, x2, #248

    ld1             {v3.4h}, [x2], x9
    ld1             {v4.4h}, [x0], x8
    ld1             {v5.4h}, [x2], x9
    ld1             {v6.4h}, [x0], x8
    ld1             {v7.4h}, [x2], x9
    ld1             {v8.4h}, [x0], x8
    ld1             {v9.4h}, [x2], x9

    uzp1            v28.8h, v28.8h, v28.8h
    mov             v26.16b, v20.16b




    mov             x0, x10
    ld1             {v10.4h}, [x1], #8
    mov             x2, x11

    mov             x10, x1
    ld1             {v11.4h}, [x12], #8
    add             x1, x1, #504


    mov             x11, x12
    ld1             {v12.4h}, [x1], x8
    add             x12, x12, #248


    ld1             {v13.4h}, [x12], x9

    ld1             {v14.4h}, [x1], x8
    ld1             {v15.4h}, [x12], x9

    ld1             {v16.4h}, [x1], x8
    ld1             {v17.4h}, [x12], x9

    subs            x6, x6, #2                  // two groups consumed per pass
    ld1             {v18.4h}, [x1], x8

    mov             x1, x10
    ld1             {v19.4h}, [x12], x9

    mov             x12, x11


    bgt             LOOP_1

    // ---- Drain: store the pending group, then finish and store the last one -
    smlal           v26.4s, v0.4h, v1.4h
    st1             {v28.h}[0], [x3], x5
    smlal           v26.4s, v2.4h, v3.4h

    smlal           v26.4s, v5.4h, v4.4h
    st1             {v28.h}[1], [x3], x5
    smlal           v26.4s, v7.4h, v6.4h

    smlal           v26.4s, v9.4h, v8.4h
    st1             {v28.h}[2], [x3], x5
    smlal           v26.4s, v10.4h, v11.4h

    smlal           v26.4s, v12.4h, v13.4h
    st1             {v28.h}[3], [x3], x5
    smlal           v26.4s, v15.4h, v14.4h



    smlal           v26.4s, v17.4h, v16.4h

    smlal           v26.4s, v19.4h, v18.4h

    sqshl           v26.4s, v26.4s, v22.4s

    sshr            v28.4s, v26.4s, #16

    uzp1            v28.8h, v28.8h, v28.8h


    st1             {v28.h}[0], [x3], x5
    st1             {v28.h}[1], [x3], x5
    st1             {v28.h}[2], [x3], x5
    st1             {v28.h}[3], [x3], x5


    pop_v_regs
    ret
403
404