1///******************************************************************************
2// *
3// * Copyright (C) 2018 The Android Open Source Project
4// *
5// * Licensed under the Apache License, Version 2.0 (the "License");
6// * you may not use this file except in compliance with the License.
7// * You may obtain a copy of the License at:
8// *
9// * http://www.apache.org/licenses/LICENSE-2.0
10// *
11// * Unless required by applicable law or agreed to in writing, software
12// * distributed under the License is distributed on an "AS IS" BASIS,
13// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// * See the License for the specific language governing permissions and
15// * limitations under the License.
16// *
17// *****************************************************************************
18// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20
21
// push_v_regs
// Reserves one 224-byte frame and spills q8-q15 together with x8-x13, x16,
// x17 and x20-x23 into it. Slot assignment (offsets from the new sp):
//   #192 q8/q9     #160 q10/q11   #128 q12/q13   #96 q14/q15
//   #80  x8/x9     #64  x10/x11   #48  x12/x13   #32 x22/x23
//   #16  x16/x17   #0   x20/x21
// sp moves by a multiple of 16, so its alignment is unchanged.
.macro push_v_regs
    sub             sp, sp, #224
    stp             q8, q9, [sp, #192]
    stp             q10, q11, [sp, #160]
    stp             q12, q13, [sp, #128]
    stp             q14, q15, [sp, #96]
    stp             x8, x9, [sp, #80]
    stp             x10, x11, [sp, #64]
    stp             x12, x13, [sp, #48]
    stp             x22, x23, [sp, #32]
    stp             x16, x17, [sp, #16]
    stp             x20, x21, [sp]
.endm
// pop_v_regs
// Reloads every register spilled by push_v_regs from its slot in the
// 224-byte frame at sp, then releases the frame. All loads are issued
// before sp is raised, so nothing is read from below the stack pointer.
.macro pop_v_regs
    ldp             x20, x21, [sp]
    ldp             x16, x17, [sp, #16]
    ldp             x22, x23, [sp, #32]
    ldp             x12, x13, [sp, #48]
    ldp             x10, x11, [sp, #64]
    ldp             x8, x9, [sp, #80]
    ldp             q14, q15, [sp, #96]
    ldp             q12, q13, [sp, #128]
    ldp             q10, q11, [sp, #160]
    ldp             q8, q9, [sp, #192]
    add             sp, sp, #224
.endm
46
// swp reg1, reg2
// Exchanges the contents of two 64-bit general-purpose registers.
//   reg1, reg2 : X registers to exchange (any pair except x16)
//   Clobbers   : x16, used as the temporary
// x16 must not be passed as an operand: the first move would overwrite the
// temporary with itself and the exchange would silently degrade into a
// plain copy. That misuse is rejected at assembly time.
.macro swp reg1, reg2
    .ifc \reg1,x16
    .error "swp: x16 is the scratch register and cannot be an operand"
    .endif
    .ifc \reg1,X16
    .error "swp: x16 is the scratch register and cannot be an operand"
    .endif
    .ifc \reg2,x16
    .error "swp: x16 is the scratch register and cannot be an operand"
    .endif
    .ifc \reg2,X16
    .error "swp: x16 is the scratch register and cannot be an operand"
    .endif
    mov             x16, \reg1
    mov             \reg1, \reg2
    mov             \reg2, x16
.endm
// Byte offset added to the x3 argument before any coefficient is fetched.
// NOTE(review): presumably the offset of a twiddle table inside a larger
// tables structure - confirm against the C declaration.
.equ PRETWIDDLE_TABLE_BYTE_OFFSET, 7500

.text
.global ixheaacd_pretwiddle_compute_armv8

//------------------------------------------------------------------------------
// ixheaacd_pretwiddle_compute_armv8
//
// Argument registers as they are consumed by the code in this file:
//   x0 : input pointer, read upwards as 32-bit elements
//   x1 : input pointer, read downwards as 32-bit elements
//   x2 : output pointer; results are stored upwards from x2 and downwards
//        from the end of a 16 * x4 byte region starting at x2
//   x3 : table base; 16-bit coefficient pairs are read starting at
//        x3 + PRETWIDDLE_TABLE_BYTE_OFFSET
//   x4 : element count (sets the output extent and the loop trip count)
//   w5 : shift control; negated below, after which a positive value means
//        "shift results left" and a non-positive value "shift right"
//
// Registers saved on entry and restored before return: see push_v_regs.
//------------------------------------------------------------------------------
ixheaacd_pretwiddle_compute_armv8:

    push_v_regs

    // x4 is used as a full 64-bit quantity from here on.
    // NOTE(review): w5 is handled as a 32-bit value, so x4 is presumably a
    // 32-bit int in the C prototype as well (confirm). AAPCS64 leaves bits
    // 63:32 of such an argument unspecified, hence the explicit widening.
    sxtw            x4, w4

    // x7 = x2 + 16 * x4 - 4 : just below the end of the output region
    add             x7, x2, x4, lsl #4
    sub             x7, x7, #4

    // Step x3 to the coefficient table; the offset does not fit an ADD
    // immediate, so it goes through x22 (saved by push_v_regs).
    mov             x22, #PRETWIDDLE_TABLE_BYTE_OFFSET
    add             x3, x3, x22

    // w5 = -w5
    neg             w5, w5
66
67
68
69
70
// Scalar pass: produces the first output pair with integer instructions and
// then prepares the table stride, trip count and descending output pointer
// for the vector code that follows.
//
// With a = [x0], b = [x1] and (c0, c1) the first coefficient pair at x3:
//   w9  = -((a * c0 >> 16) + (b * c1 >> 16))
//   w11 =   (a * c1 >> 16) - (b * c0 >> 16)
// both then shifted by w5 (left when w5 > 0, arithmetic right by -w5
// otherwise) and stored as two consecutive words at x2.
// NOTE(review): c0/c1 look like a cosine/sine twiddle pair - confirm.
ARM_PROLOGUE:
    // Place each 16-bit coefficient in the top half of a word so that
    // (SMULL >> 32) yields (value * coefficient) >> 16.
    ldrh            w21, [x3]
    ldrh            w22, [x3, #2]
    lsl             w21, w21, #16
    lsl             w22, w22, #16
    add             x3, x3, #4              // step past the pair just read

    ldr             w9, [x0], #4            // a, x0 moves forward
    ldr             w10, [x1], #-4          // b, x1 moves backward

    smull           x12, w9, w21            // a * c0
    smull           x11, w9, w22            // a * c1
    smull           x23, w10, w22           // b * c1
    smull           x6, w10, w21            // b * c0
    asr             x12, x12, #32
    asr             x11, x11, #32
    asr             x23, x23, #32
    asr             x6, x6, #32

    add             w9, w12, w23
    neg             w9, w9                  // w9  = -(a*c0 + b*c1)
    sub             w11, w11, w6            // w11 =   a*c1 - b*c0

    // Apply the shift held in w5.
    cmp             w5, #0
    b.gt            NEXT
    neg             w8, w5                  // right-shift amount = -w5
    asr             w11, w11, w8
    asr             w9, w9, w8
    b               NEXT1

NEXT:
    lsl             w11, w11, w5
    lsl             w9, w9, w5

NEXT1:
    stp             w9, w11, [x2], #8       // two words, x2 moves forward

    // Table stride in x6: 4 bytes when x4 == 0x100; otherwise 32 bytes,
    // with x3 additionally advanced by 28.
    cmp             x4, #0x100
    b.ne            NXT
    mov             x6, #4
    b               NXT1
NXT:
    mov             x6, #32
    add             x3, x3, #28

NXT1:
    // x4 = (x4 - 1) >> 2
    sub             x4, x4, #1
    asr             x4, x4, #2
    // Together with the -4 applied at entry, x7 now sits 32 bytes below the
    // end of the output region, ready for 32-byte descending stores.
    sub             x7, x7, #28
136
137
138
139
140
141
142
143
144
145
146
147
148
149
// First vector pass. Computes one group of results without storing them and
// pre-loads the operands of the next group, so that the pass which follows
// starts with results pending in v16/v18 and v20/v22.
//
// Vector register roles established here:
//   v14      : w5 replicated to all four lanes (SSHL shift amount)
//   v8, v9   : first / second halfword of four coefficient pairs from x3
//   v10, v11 : v8 / v9 with their four halfwords in reverse order
//   v0..v3   : 32 bytes from x0, de-interleaved by halfword (LD4)
//   v4..v7   : 32 bytes from x1, de-interleaved by halfword (LD4)
// Each product is built as (UMULL >> 16) on one halfword plane plus SMLAL on
// the neighbouring plane.
// NOTE(review): for little-endian 32-bit samples this pairs the low and high
// halves of each word - confirm the intended data layout.
NEON_PROLOGUE:

    mov             x8, #-32                // step for the descending pointers

    dup             v14.4s, w5

    sub             x1, x1, #28             // x1 -> start of its next 32-byte block

    // Gather four coefficient pairs, x6 bytes apart.
    ld2             {v8.h, v9.h}[0], [x3], x6
    ld2             {v8.h, v9.h}[1], [x3], x6
    ld2             {v8.h, v9.h}[2], [x3], x6
    ld2             {v8.h, v9.h}[3], [x3], x6

    rev64           v10.4h, v8.4h
    rev64           v11.4h, v9.4h

    ld4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32

    ld4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8

    rev64           v0.4h, v0.4h
    rev64           v1.4h, v1.4h
    rev64           v4.4h, v4.4h
    rev64           v5.4h, v5.4h

    // Products of (v2,v3) and (v4,v5) with v8/v9.
    umull           v30.4s, v2.4h, v9.4h
    umull           v28.4s, v4.4h, v9.4h
    umull           v26.4s, v2.4h, v8.4h
    umull           v24.4s, v4.4h, v8.4h

    ushr            v30.4s, v30.4s, #16
    ushr            v28.4s, v28.4s, #16
    ushr            v26.4s, v26.4s, #16
    ushr            v24.4s, v24.4s, #16

    smlal           v30.4s, v3.4h, v9.4h
    smlal           v28.4s, v5.4h, v9.4h
    smlal           v26.4s, v3.4h, v8.4h
    smlal           v24.4s, v5.4h, v8.4h

    add             v28.4s, v26.4s, v28.4s
    neg             v28.4s, v28.4s          // v28 = -(v26 + v28)
    sub             v30.4s, v30.4s, v24.4s  // v30 = v30 - v24

    // Products of (v0,v1) and (v6,v7) with the reversed coefficients.
    umull           v22.4s, v0.4h, v11.4h
    umull           v20.4s, v6.4h, v11.4h
    umull           v18.4s, v0.4h, v10.4h
    umull           v16.4s, v6.4h, v10.4h

    ushr            v22.4s, v22.4s, #16
    ushr            v20.4s, v20.4s, #16
    ushr            v18.4s, v18.4s, #16
    ushr            v16.4s, v16.4s, #16

    // v8/v9 are no longer read by this pass, so the next four coefficient
    // pairs are fetched in between the accumulates.
    smlal           v22.4s, v1.4h, v11.4h
    ld2             {v8.h, v9.h}[0], [x3], x6

    smlal           v20.4s, v7.4h, v11.4h
    ld2             {v8.h, v9.h}[1], [x3], x6

    smlal           v18.4s, v1.4h, v10.4h
    ld2             {v8.h, v9.h}[2], [x3], x6

    smlal           v16.4s, v7.4h, v10.4h
    ld2             {v8.h, v9.h}[3], [x3], x6

    add             v20.4s, v20.4s, v18.4s

    neg             v20.4s, v20.4s          // v20 = -(v20 + v18)
    rev64           v10.4h, v8.4h
    rev64           v11.4h, v9.4h
    sub             v22.4s, v16.4s, v22.4s  // v22 = v16 - v22
    ld4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32

    // Scale the four result vectors by the per-lane shift in v14.
    sshl            v20.4s, v20.4s, v14.4s
    ld4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8

    rev64           v0.4h, v0.4h
    rev64           v1.4h, v1.4h
    sshl            v22.4s, v22.4s, v14.4s

    rev64           v4.4h, v4.4h
    rev64           v5.4h, v5.4h
    sshl            v18.4s, v30.4s, v14.4s

    sshl            v16.4s, v28.4s, v14.4s

    // Trip count of the loop that follows is x4 - 2.
    sub             x4, x4, #2
254
// Steady-state loop. Each trip
//   * stores the results left pending by the previous pass: v16/v18
//     interleaved at x2 (ascending) and v20/v22 interleaved at x7
//     (descending by 32 bytes),
//   * computes the next group from the operands already loaded in v0..v11,
//   * fetches the coefficients and inputs for the group after that.
// Loads and stores are interleaved with the arithmetic; the instruction
// order is kept as scheduled.
// Runs until x4 reaches zero.
CORE_LOOP:
    umull           v30.4s, v2.4h, v9.4h
    mov             v17.16b, v18.16b        // ST2 needs consecutive registers
    st2             {v16.4s, v17.4s}, [x2], #32
    umull           v28.4s, v4.4h, v9.4h

    umull           v26.4s, v2.4h, v8.4h
    mov             v21.16b, v22.16b
    st2             {v20.4s, v21.4s}, [x7], x8
    umull           v24.4s, v4.4h, v8.4h

    ushr            v30.4s, v30.4s, #16
    ushr            v28.4s, v28.4s, #16
    ushr            v26.4s, v26.4s, #16
    ushr            v24.4s, v24.4s, #16

    smlal           v30.4s, v3.4h, v9.4h
    smlal           v28.4s, v5.4h, v9.4h
    smlal           v26.4s, v3.4h, v8.4h
    smlal           v24.4s, v5.4h, v8.4h

    add             v28.4s, v26.4s, v28.4s
    neg             v28.4s, v28.4s          // v28 = -(v26 + v28)
    sub             v30.4s, v30.4s, v24.4s  // v30 = v30 - v24

    // v8/v9 are free from here on: refill them lane by lane.
    umull           v22.4s, v0.4h, v11.4h
    ld2             {v8.h, v9.h}[0], [x3], x6
    umull           v20.4s, v6.4h, v11.4h

    umull           v18.4s, v0.4h, v10.4h
    ld2             {v8.h, v9.h}[1], [x3], x6
    umull           v16.4s, v6.4h, v10.4h

    ushr            v22.4s, v22.4s, #16
    ld2             {v8.h, v9.h}[2], [x3], x6
    ushr            v20.4s, v20.4s, #16

    ushr            v18.4s, v18.4s, #16
    ld2             {v8.h, v9.h}[3], [x3], x6
    ushr            v16.4s, v16.4s, #16

    smlal           v22.4s, v1.4h, v11.4h

    smlal           v20.4s, v7.4h, v11.4h

    smlal           v18.4s, v1.4h, v10.4h

    smlal           v16.4s, v7.4h, v10.4h
    ld4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
    add             v20.4s, v20.4s, v18.4s

    neg             v20.4s, v20.4s          // v20 = -(v20 + v18)
    rev64           v10.4h, v8.4h
    rev64           v11.4h, v9.4h

    sub             v22.4s, v16.4s, v22.4s  // v22 = v16 - v22
    ld4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], x8
    sshl            v20.4s, v20.4s, v14.4s

    sshl            v22.4s, v22.4s, v14.4s

    rev64           v0.4h, v0.4h
    rev64           v1.4h, v1.4h
    sshl            v18.4s, v30.4s, v14.4s

    rev64           v4.4h, v4.4h
    rev64           v5.4h, v5.4h
    sshl            v16.4s, v28.4s, v14.4s

    subs            x4, x4, #1
    b.ne            CORE_LOOP
332
333
334
335
336
337
// Drain pass. Stores the results still pending from the last loop trip,
// computes one more group from the operands already held in v0..v11 and
// stores that group as well. Nothing new is loaded here.
// On exit x2 has advanced by 64 bytes and x7 has stepped down by 64 bytes.
NEON_EPILOGUE:
    umull           v30.4s, v2.4h, v9.4h
    mov             v17.16b, v18.16b        // ST2 needs consecutive registers
    st2             {v16.4s, v17.4s}, [x2], #32
    umull           v28.4s, v4.4h, v9.4h

    umull           v26.4s, v2.4h, v8.4h
    mov             v21.16b, v22.16b

    st2             {v20.4s, v21.4s}, [x7], x8
    umull           v24.4s, v4.4h, v8.4h

    ushr            v30.4s, v30.4s, #16
    ushr            v28.4s, v28.4s, #16
    ushr            v26.4s, v26.4s, #16
    ushr            v24.4s, v24.4s, #16

    smlal           v30.4s, v3.4h, v9.4h
    smlal           v28.4s, v5.4h, v9.4h
    smlal           v26.4s, v3.4h, v8.4h
    smlal           v24.4s, v5.4h, v8.4h

    add             v28.4s, v26.4s, v28.4s
    neg             v28.4s, v28.4s          // v28 = -(v26 + v28)
    sub             v30.4s, v30.4s, v24.4s  // v30 = v30 - v24

    umull           v22.4s, v0.4h, v11.4h
    umull           v20.4s, v6.4h, v11.4h
    umull           v18.4s, v0.4h, v10.4h
    umull           v16.4s, v6.4h, v10.4h

    ushr            v22.4s, v22.4s, #16
    ushr            v20.4s, v20.4s, #16
    ushr            v18.4s, v18.4s, #16
    ushr            v16.4s, v16.4s, #16

    smlal           v22.4s, v1.4h, v11.4h
    smlal           v20.4s, v7.4h, v11.4h
    smlal           v18.4s, v1.4h, v10.4h
    smlal           v16.4s, v7.4h, v10.4h

    add             v20.4s, v20.4s, v18.4s
    neg             v20.4s, v20.4s          // v20 = -(v20 + v18)
    sub             v22.4s, v16.4s, v22.4s  // v22 = v16 - v22

    // Scale by the per-lane shift in v14, then write both result pairs.
    sshl            v20.4s, v20.4s, v14.4s
    sshl            v22.4s, v22.4s, v14.4s
    sshl            v18.4s, v30.4s, v14.4s
    sshl            v16.4s, v28.4s, v14.4s
    mov             v17.16b, v18.16b
    st2             {v16.4s, v17.4s}, [x2], #32
    mov             v21.16b, v22.16b
    st2             {v20.4s, v21.4s}, [x7], x8
394
395
// Tail pass. Handles the final, partial group: seven words are read from
// x0 and seven from x1 (the missing eighth is taken as zero), the same
// multiply / combine / shift sequence as the full passes is applied, and
//   * all four v20/v22 result lanes are stored interleaved at x7,
//   * only lanes 0, 1 and 2 of v16/v18 are stored at x2 (24 bytes).
// The routine then restores the saved registers and returns.
//
// The partial rows are assembled with 32-bit lane loads and split into
// halfword planes with UZP1/UZP2, giving the same v0..v7 layout that the
// LD4 loads produce in the full passes.
RESIDUE_NEON:
    // Lanes that are never loaded below must read as zero.
    movi            v3.2s, #0x00000000
    movi            v4.2s, #0x00000000

    // x0 side: v21 = words 0 and 2, v22 = words 1 and 3, then words 4, 5, 6.
    ld2             {v21.2s, v22.2s}, [x0], #16
    ld1             {v1.s}[0], [x0], #4
    ld1             {v3.s}[0], [x0], #4
    ld1             {v1.s}[1], [x0]

    // v0/v1 = even/odd halfwords of (v21 : v1); v2/v3 likewise for (v22 : v3).
    uzp1            v0.4h, v21.4h, v1.4h
    uzp2            v1.4h, v21.4h, v1.4h
    uzp1            v2.4h, v22.4h, v3.4h
    uzp2            v3.4h, v22.4h, v3.4h

    // x1 side: three single words first, then four de-interleaved words.
    add             x1, x1, #4

    ld1             {v6.s}[0], [x1], #4
    ld1             {v4.s}[1], [x1], #4
    ld1             {v6.s}[1], [x1], #4

    ld2             {v21.2s, v22.2s}, [x1], #16

    // v4/v5 = even/odd halfwords of (v4 : v21); v6/v7 likewise for
    // (v6 : v22). The odd plane is extracted first because it does not
    // overwrite the source that both extractions read.
    uzp2            v5.4h, v4.4h, v21.4h
    uzp1            v4.4h, v4.4h, v21.4h
    uzp2            v7.4h, v6.4h, v22.4h
    uzp1            v6.4h, v6.4h, v22.4h

    rev64           v0.4h, v0.4h
    rev64           v1.4h, v1.4h
    rev64           v4.4h, v4.4h
    rev64           v5.4h, v5.4h

    // Last four coefficient pairs, x6 bytes apart.
    ld2             {v8.h, v9.h}[0], [x3], x6
    ld2             {v8.h, v9.h}[1], [x3], x6
    ld2             {v8.h, v9.h}[2], [x3], x6
    ld2             {v8.h, v9.h}[3], [x3], x6

    rev64           v10.4h, v8.4h
    rev64           v11.4h, v9.4h

    // Products of (v2,v3) and (v4,v5) with v8/v9.
    umull           v30.4s, v2.4h, v9.4h
    umull           v28.4s, v4.4h, v9.4h
    umull           v26.4s, v2.4h, v8.4h
    umull           v24.4s, v4.4h, v8.4h

    ushr            v30.4s, v30.4s, #16
    ushr            v28.4s, v28.4s, #16
    ushr            v26.4s, v26.4s, #16
    ushr            v24.4s, v24.4s, #16

    smlal           v30.4s, v3.4h, v9.4h
    smlal           v28.4s, v5.4h, v9.4h
    smlal           v26.4s, v3.4h, v8.4h
    smlal           v24.4s, v5.4h, v8.4h

    add             v28.4s, v26.4s, v28.4s
    neg             v28.4s, v28.4s          // v28 = -(v26 + v28)
    sub             v30.4s, v30.4s, v24.4s  // v30 = v30 - v24

    // Products of (v0,v1) and (v6,v7) with the reversed coefficients.
    umull           v22.4s, v0.4h, v11.4h
    umull           v20.4s, v6.4h, v11.4h
    umull           v18.4s, v0.4h, v10.4h
    umull           v16.4s, v6.4h, v10.4h

    ushr            v22.4s, v22.4s, #16
    ushr            v20.4s, v20.4s, #16
    ushr            v18.4s, v18.4s, #16
    ushr            v16.4s, v16.4s, #16

    smlal           v22.4s, v1.4h, v11.4h
    smlal           v20.4s, v7.4h, v11.4h
    smlal           v18.4s, v1.4h, v10.4h
    smlal           v16.4s, v7.4h, v10.4h

    add             v20.4s, v20.4s, v18.4s
    neg             v20.4s, v20.4s          // v20 = -(v20 + v18)
    sub             v22.4s, v16.4s, v22.4s  // v22 = v16 - v22

    // Scale by the per-lane shift in v14.
    sshl            v20.4s, v20.4s, v14.4s
    sshl            v22.4s, v22.4s, v14.4s
    sshl            v18.4s, v30.4s, v14.4s
    sshl            v16.4s, v28.4s, v14.4s

    // Descending side: a full 32-byte interleaved store.
    mov             v21.16b, v22.16b        // ST2 needs consecutive registers
    st2             {v20.4s, v21.4s}, [x7]

    // Ascending side: lanes 0-1 as a pair store, then lane 2 on its own.
    mov             v17.16b, v18.16b
    st2             {v16.2s, v17.2s}, [x2], #16
    st2             {v16.s, v17.s}[2], [x2], #8

END1:
    pop_v_regs
    ret
510
511
512
513