///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

// push_v_regs
//
// Function-entry save area. Despite the name it stores general-purpose
// registers as well as vector registers:
//   q8-q15          (full 128 bits each, 4 x 32 bytes)
//   x8-x17, x29, x30 (6 x 16 bytes)
// Total frame: 224 bytes. Every step moves sp by a multiple of 16, so sp keeps
// its 16-byte alignment throughout.
//
// This is a superset of what AAPCS64 asks a callee to preserve (only the low
// 64 bits of v8-v15 are callee-saved, and x8-x17 are caller-saved). The set is
// left unchanged so the frame layout stays exactly the mirror image of
// pop_v_regs, which must be used to undo it.
.macro push_v_regs
    stp             q8, q9, [sp, #-32]!
    stp             q10, q11, [sp, #-32]!
    stp             q12, q13, [sp, #-32]!
    stp             q14, q15, [sp, #-32]!
    stp             x8, x9, [sp, #-16]!
    stp             x10, x11, [sp, #-16]!
    stp             x12, x13, [sp, #-16]!
    stp             x14, x15, [sp, #-16]!
    stp             x16, x17, [sp, #-16]!
    stp             x29, x30, [sp, #-16]!
.endm
// pop_v_regs
//
// Function-exit counterpart of push_v_regs: reloads x29/x30, x16-x8 and
// q15-q8 in exactly the reverse order of the stores and releases the 224-byte
// frame with post-indexed loads. sp must hold the value push_v_regs left it
// with when this macro is expanded.
.macro pop_v_regs
    ldp             x29, x30, [sp], #16
    ldp             x16, x17, [sp], #16
    ldp             x14, x15, [sp], #16
    ldp             x12, x13, [sp], #16
    ldp             x10, x11, [sp], #16
    ldp             x8, x9, [sp], #16
    ldp             q14, q15, [sp], #32
    ldp             q12, q13, [sp], #32
    ldp             q10, q11, [sp], #32
    ldp             q8, q9, [sp], #32
.endm

//------------------------------------------------------------------------------
// ixheaacd_over_lap_add1_armv8
//
// Windowed overlap-add that produces two runs of 16-bit results per call: one
// written backwards from just below the middle of the output, one written
// forwards from the middle.
//
// Register inputs, as the code below uses them. The C prototype is not visible
// in this file; NOTE(review): presumably (coef, prev, out, window, q_shift,
// size, ch_fac) - confirm against the caller.
//   X0  base of a 32-bit array, read backwards starting at word [2*X5 - 4]
//   X1  base of a 32-bit array, read forwards four words at a time
//   X2  base of the 16-bit output array
//   X3  base of a 16-bit table, read backwards as de-interleaved even/odd
//       pairs starting at halfword [2*X5 - 8]
//   W4  shift count applied with SQSHL (register form)
//   W5  element count N
//   W6  output stride S, in 16-bit elements
// X5 and X6 are also consumed as full 64-bit values (LSL X10, X5 / LSL X4, X6),
// so the upper bits are assumed to be clean - TODO confirm the caller extends
// these arguments.
//
// For lane i (0-based), with c = X0[2N-1-i], p = X1[i],
// we = X3[2N-2-2i] (even entry) and wo = X3[2N-1-2i] (odd entry):
//   mul(a, w) = ((lo16(a) * w) >> 16, both unsigned) + hi16(a) * w (signed)
//   r1 = sqsub(sqshl(mul(c, we), W4),        sqadd(sat32(p * wo), -0x2000))
//   r2 = sqsub(sqshl(mul(sqneg(c), wo), W4), sqadd(sat32(p * we), -0x2000))
//   out[S * (N-1-i)] = low16(sqshl(r1, 2) >> 16)
//   out[S * (N + i)] = low16(sqshl(r2, 2) >> 16)
// UMULL treats the table entry as unsigned in the low-half product, so table
// values are presumably non-negative - verify against the window tables.
//
// The loop is software-pipelined: the prologue computes the first group of
// four lanes, every loop pass stores and computes two groups, and the tail
// finishes the last group. The routine therefore always processes
// 8 + 8 * passes lanes with at least one pass, i.e. N must be a multiple of 8
// and at least 16, otherwise it reads and writes past the intended range.
//
// Clobbers X4-X12, V0-V4, V6, V7, V16-V18, V26-V31 and the flags; V8-V15 are
// used but restored by pop_v_regs.
//------------------------------------------------------------------------------
.text
.global ixheaacd_over_lap_add1_armv8
ixheaacd_over_lap_add1_armv8:
    push_v_regs

    // X10 -> last four words of the X0 array: X0 + 4*(2N-1) - 12.
    LSL             X10, X5, #1
    SUB             X11, X10, #1
    LSL             X10, X11, #2
    ADD             X10, X0, X10
    SUB             X10, X10, #12
    // X8 -> last eight halfwords of the X3 table: X3 + 2*(2N-1) - 14.
    LSL             X8, X11, #1
    ADD             X8, X8, X3
    SUB             X8, X8, #14
    // Both backward streams step by -16 bytes per load.
    MOV             X12, #-16
    // Shift count for SQSHL Vd.4S, Vn.4S, V11.4S. SQSHL (register) only reads
    // the low byte of each 32-bit lane, so a 16-bit replicate is sufficient.
    DUP             V11.8H, W4
    LD1             {V3.4S}, [X10], X12
    MOV             W7, #0x2000

    // V10 = -0x2000 in every lane (added to the X1-side product).
    NEG             W7, W7
    SQNEG           V0.4S, V3.4S
    DUP             V10.4S, W7
    // Split the negated words into low (V31) / high (V30) 16-bit halves and
    // reverse lane order, because the block was loaded from ascending memory
    // but is consumed from the end of the array downwards.
    UZP1            V31.8H, V0.8H, V0.8H
    UZP2            V30.8H, V0.8H, V0.8H
    REV64           V31.8H, V31.8H
    REV64           V30.8H, V30.8H
    SUB             X11, X5, #1
    // Same split for the non-negated words: V7 = low halves, V6 = high halves.
    UZP1            V7.8H, V3.8H, V3.8H
    UZP2            V6.8H, V3.8H, V3.8H
    REV64           V7.8H, V7.8H
    REV64           V6.8H, V6.8H
    // X11 = X2 + 2 * (S * (N-1)): start of the backward output run. The
    // product uses only lane 0 of a signed 16x16 SMULL.
    MOV             V16.S[0], W6
    MOV             V17.S[0], W11
    SMULL           V17.4S, V16.4H, V17.4H
    MOV             W11, V17.S[0]
    LSL             X11, X11, #1

    // V2 = even table entries, V3 = odd table entries, lane order reversed.
    LD2             {V2.4H, V3.4H}, [X8], X12
    ADD             X11, X11, X2
    REV64           V2.4H, V2.4H
    REV64           V3.4H, V3.4H
    // X4 = -2*S (byte step of the backward run), X9 = +2*S (forward run).
    // Both are taken from X6 before X6 is reused as the forward pointer.
    LSL             X4, X6, #1
    NEG             X4, X4
    LSL             X9, X6, #1
    // X6 = X2 + 2 * (N * S): start of the forward output run.
    MOV             V16.S[0], W5
    MOV             V17.S[0], W6
    SMULL           V17.4S, V16.4H, V17.4H
    MOV             W6, V17.S[0]
    LSL             W6, W6, #1
    ADD             X6, X6, X2

    // ---- Prologue: first group of four lanes ----
    // V15 = sqshl(mul(c, we), W4)
    UMULL           V15.4S, V7.4H, V2.4H
    LD1             {V4.4S}, [X1], #16
    USHR            V15.4S, V15.4S, #16

    SMLAL           V15.4S, V6.4H, V2.4H
    SQSHL           V15.4S, V15.4S, V11.4S
    // V14 = sat32(p * wo): widen the table entry, 32x32->64, saturating narrow.
    SSHLL           V27.4S, V3.4H, #0
    SMULL           V28.2D, V27.2S, V4.2S
    SMULL2          V29.2D, V27.4S, V4.4S
    SQXTN           V28.2S, V28.2D
    SQXTN2          V28.4S, V29.2D
    MOV             V14.16B, V28.16B

    SQADD           V14.4S, V14.4S, V10.4S
    SQSUB           V13.4S, V15.4S, V14.4S
    SQSHL           V13.4S, V13.4S, #2
    SSHR            V13.4S, V13.4S, #16
    // V26 lanes 0-3 = low halfwords of the backward-run results.
    UZP1            V26.8H, V13.8H, V13.8H

    // V12 = sqshl(mul(sqneg(c), wo), W4)
    UMULL           V12.4S, V31.4H, V3.4H
    USHR            V12.4S, V12.4S, #16
    SMLAL           V12.4S, V30.4H, V3.4H
    SQSHL           V12.4S, V12.4S, V11.4S
    // Preload the next block of X0 words; V3 (odd table entries) is dead here.
    LD1             {V3.4S}, [X10], X12

    // V8 = sat32(p * we)
    SSHLL           V27.4S, V2.4H, #0
    SMULL           V28.2D, V27.2S, V4.2S
    SMULL2          V29.2D, V27.4S, V4.4S
    SQXTN           V28.2S, V28.2D
    SQXTN2          V28.4S, V29.2D
    MOV             V8.16B, V28.16B

    SQADD           V8.4S, V8.4S, V10.4S

    // Prepare halves for the next group: V1/V0 = low/high of negated words,
    // V7/V6 = low/high of the plain words.
    SQNEG           V0.4S, V3.4S
    UZP1            V1.8H, V0.8H, V0.8H
    UZP2            V0.8H, V0.8H, V0.8H
    REV64           V1.8H, V1.8H
    REV64           V0.8H, V0.8H
    SQSUB           V9.4S, V12.4S, V8.4S
    UZP1            V7.8H, V3.8H, V3.8H
    UZP2            V6.8H, V3.8H, V3.8H
    REV64           V7.8H, V7.8H
    REV64           V6.8H, V6.8H
    SQSHL           V9.4S, V9.4S, #2
    LD2             {V2.4H, V3.4H}, [X8], X12
    SSHR            V9.4S, V9.4S, #16
    REV64           V2.4H, V2.4H
    REV64           V3.4H, V3.4H
    // V18 lanes 0-3 = low halfwords of the forward-run results.
    UZP1            V18.8H, V9.8H, V9.8H

    LD1             {V4.4S}, [X1], #16
    // Loop counter = N - 8, kept in W5. The decrement at the bottom of the
    // loop uses the same 32-bit width: the original mixed a 32-bit SUB with a
    // 64-bit SUBS, which turned a count below 8 into a huge positive value.
    SUB             W5, W5, #8


LOOP_1:

    // ---- First half of the pass: store the pending group while computing
    // the next one (stores interleaved with arithmetic). ----
    ST1             {V26.H}[0], [X11], X4
    UMULL           V15.4S, V7.4H, V2.4H
    ST1             {V26.H}[1], [X11], X4
    UMULL           V12.4S, V1.4H, V3.4H
    ST1             {V26.H}[2], [X11], X4
    USHR            V15.4S, V15.4S, #16
    ST1             {V26.H}[3], [X11], X4
    USHR            V12.4S, V12.4S, #16
    ST1             {V18.H}[0], [X6], X9
    SMLAL           V15.4S, V6.4H, V2.4H
    ST1             {V18.H}[1], [X6], X9
    SMLAL           V12.4S, V0.4H, V3.4H
    ST1             {V18.H}[2], [X6], X9
    SQSHL           V15.4S, V15.4S, V11.4S
    ST1             {V18.H}[3], [X6], X9
    SQSHL           V12.4S, V12.4S, V11.4S
    // Next block of X0 words goes to V6 (its high-half contents are consumed).
    LD1             {V6.4S}, [X10], X12

    // V14 = sat32(p * wo)
    SSHLL           V27.4S, V3.4H, #0
    SMULL           V28.2D, V27.2S, V4.2S
    SMULL2          V29.2D, V27.4S, V4.4S
    SQXTN           V28.2S, V28.2D
    SQXTN2          V28.4S, V29.2D
    MOV             V14.16B, V28.16B

    // V8 = sat32(p * we)
    SSHLL           V27.4S, V2.4H, #0
    SMULL           V28.2D, V27.2S, V4.2S
    SMULL2          V29.2D, V27.4S, V4.4S
    SQXTN           V28.2S, V28.2D
    SQXTN2          V28.4S, V29.2D
    MOV             V8.16B, V28.16B

    LD2             {V2.4H, V3.4H}, [X8], X12

    SQNEG           V0.4S, V6.4S

    LD1             {V4.4S}, [X1], #16

    SQADD           V14.4S, V14.4S, V10.4S
    UZP1            V1.8H, V0.8H, V0.8H
    UZP2            V0.8H, V0.8H, V0.8H
    REV64           V1.8H, V1.8H
    REV64           V0.8H, V0.8H
    SQADD           V8.4S, V8.4S, V10.4S
    UZP1            V7.8H, V6.8H, V6.8H
    UZP2            V6.8H, V6.8H, V6.8H
    REV64           V7.8H, V7.8H
    REV64           V6.8H, V6.8H
    SQSUB           V13.4S, V15.4S, V14.4S
    REV64           V2.4H, V2.4H
    REV64           V3.4H, V3.4H
    SQSUB           V9.4S, V12.4S, V8.4S
    SQSHL           V13.4S, V13.4S, #2
    SQSHL           V9.4S, V9.4S, #2
    // ---- Second half of the pass: same pattern for the following group. ----
    UMULL           V15.4S, V7.4H, V2.4H
    SSHR            V13.4S, V13.4S, #16
    UZP1            V26.8H, V13.8H, V13.8H
    SSHR            V9.4S, V9.4S, #16
    ST1             {V26.H}[0], [X11], X4
    UMULL           V12.4S, V1.4H, V3.4H
    UZP1            V18.8H, V9.8H, V9.8H
    USHR            V15.4S, V15.4S, #16
    ST1             {V26.H}[1], [X11], X4
    SMLAL           V15.4S, V6.4H, V2.4H
    ST1             {V26.H}[2], [X11], X4
    USHR            V12.4S, V12.4S, #16
    ST1             {V26.H}[3], [X11], X4
    SMLAL           V12.4S, V0.4H, V3.4H
    ST1             {V18.H}[0], [X6], X9
    SQSHL           V15.4S, V15.4S, V11.4S
    ST1             {V18.H}[1], [X6], X9
    SQSHL           V12.4S, V12.4S, V11.4S
    ST1             {V18.H}[2], [X6], X9

    // V14 = sat32(p * wo)
    SSHLL           V27.4S, V3.4H, #0
    SMULL           V28.2D, V27.2S, V4.2S
    SMULL2          V29.2D, V27.4S, V4.4S
    SQXTN           V28.2S, V28.2D
    SQXTN2          V28.4S, V29.2D
    MOV             V14.16B, V28.16B

    ST1             {V18.H}[3], [X6], X9


    // V8 = sat32(p * we)
    SSHLL           V27.4S, V2.4H, #0
    SMULL           V28.2D, V27.2S, V4.2S
    SMULL2          V29.2D, V27.4S, V4.4S
    SQXTN           V28.2S, V28.2D
    SQXTN2          V28.4S, V29.2D
    MOV             V8.16B, V28.16B

    // Preload for the next pass (or for the tail): V3 is dead at this point.
    LD1             {V3.4S}, [X10], X12
    SQADD           V14.4S, V14.4S, V10.4S

    SQNEG           V0.4S, V3.4S
    UZP1            V1.8H, V0.8H, V0.8H
    UZP2            V0.8H, V0.8H, V0.8H
    REV64           V1.8H, V1.8H
    REV64           V0.8H, V0.8H
    SQSUB           V13.4S, V15.4S, V14.4S
    UZP1            V7.8H, V3.8H, V3.8H
    UZP2            V6.8H, V3.8H, V3.8H
    REV64           V7.8H, V7.8H
    REV64           V6.8H, V6.8H
    SQADD           V8.4S, V8.4S, V10.4S
    LD2             {V2.4H, V3.4H}, [X8], X12
    SQSUB           V9.4S, V12.4S, V8.4S
    REV64           V2.4H, V2.4H
    REV64           V3.4H, V3.4H
    SQSHL           V13.4S, V13.4S, #2
    LD1             {V4.4S}, [X1], #16

    SQSHL           V9.4S, V9.4S, #2
    SSHR            V13.4S, V13.4S, #16
    // Eight lanes were consumed by this pass; 32-bit to match the SUB above.
    SUBS            W5, W5, #8
    SSHR            V9.4S, V9.4S, #16
    UZP1            V26.8H, V13.8H, V13.8H
    UZP1            V18.8H, V9.8H, V9.8H

    BGT             LOOP_1

    // ---- Tail: store the pending group, then compute and store the last
    // group from the data preloaded by the final pass. No further loads. ----
    ST1             {V26.H}[0], [X11], X4
    UMULL           V15.4S, V7.4H, V2.4H
    ST1             {V26.H}[1], [X11], X4
    UMULL           V12.4S, V1.4H, V3.4H
    ST1             {V26.H}[2], [X11], X4
    USHR            V15.4S, V15.4S, #16
    ST1             {V26.H}[3], [X11], X4
    USHR            V12.4S, V12.4S, #16

    ST1             {V18.H}[0], [X6], X9
    SMLAL           V15.4S, V6.4H, V2.4H
    ST1             {V18.H}[1], [X6], X9
    SMLAL           V12.4S, V0.4H, V3.4H
    ST1             {V18.H}[2], [X6], X9
    SQSHL           V15.4S, V15.4S, V11.4S
    ST1             {V18.H}[3], [X6], X9
    SQSHL           V12.4S, V12.4S, V11.4S


    // V14 = sat32(p * wo)
    SSHLL           V27.4S, V3.4H, #0
    SMULL           V28.2D, V27.2S, V4.2S
    SMULL2          V29.2D, V27.4S, V4.4S
    SQXTN           V28.2S, V28.2D
    SQXTN2          V28.4S, V29.2D
    MOV             V14.16B, V28.16B

    // V8 = sat32(p * we)
    SSHLL           V27.4S, V2.4H, #0
    SMULL           V28.2D, V27.2S, V4.2S
    SMULL2          V29.2D, V27.4S, V4.4S
    SQXTN           V28.2S, V28.2D
    SQXTN2          V28.4S, V29.2D
    MOV             V8.16B, V28.16B

    SQADD           V14.4S, V14.4S, V10.4S
    SQADD           V8.4S, V8.4S, V10.4S
    SQSUB           V13.4S, V15.4S, V14.4S
    SQSUB           V9.4S, V12.4S, V8.4S
    SQSHL           V13.4S, V13.4S, #2
    SQSHL           V9.4S, V9.4S, #2
    SSHR            V13.4S, V13.4S, #16
    SSHR            V9.4S, V9.4S, #16
    UZP1            V26.8H, V13.8H, V13.8H

    UZP1            V18.8H, V9.8H, V9.8H


    ST1             {V26.H}[0], [X11], X4
    ST1             {V26.H}[1], [X11], X4
    ST1             {V26.H}[2], [X11], X4
    ST1             {V26.H}[3], [X11], X4

    ST1             {V18.H}[0], [X6], X9
    ST1             {V18.H}[1], [X6], X9
    ST1             {V18.H}[2], [X6], X9
    ST1             {V18.H}[3], [X6], X9
    pop_v_regs
    RET