1///******************************************************************************
2// *
3// * Copyright (C) 2018 The Android Open Source Project
4// *
5// * Licensed under the Apache License, Version 2.0 (the "License");
6// * you may not use this file except in compliance with the License.
7// * You may obtain a copy of the License at:
8// *
9// * http://www.apache.org/licenses/LICENSE-2.0
10// *
11// * Unless required by applicable law or agreed to in writing, software
12// * distributed under the License is distributed on an "AS IS" BASIS,
13// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// * See the License for the specific language governing permissions and
15// * limitations under the License.
16// *
17// *****************************************************************************
18// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20
21
// push_v_regs: spill q8-q15 and X8-X17, X29, X30 to the stack.
// The 224-byte save area is reserved with one SP adjustment and then filled
// with offset stores. Layout, ascending from the new SP (this is the layout
// pop_v_regs reads back):
//   +0   X29, X30      +16  X16, X17      +32  X14, X15
//   +48  X12, X13      +64  X10, X11      +80  X8,  X9
//   +96  q14, q15      +128 q12, q13      +160 q10, q11     +192 q8, q9
// 224 is a multiple of 16, so SP keeps its 16-byte alignment.
.macro push_v_regs
    sub             sp, sp, #224
    stp             q8, q9, [sp, #192]
    stp             q10, q11, [sp, #160]
    stp             q12, q13, [sp, #128]
    stp             q14, q15, [sp, #96]
    stp             x8, x9, [sp, #80]
    stp             x10, x11, [sp, #64]
    stp             x12, x13, [sp, #48]
    stp             x14, x15, [sp, #32]
    stp             x16, x17, [sp, #16]
    stp             x29, x30, [sp]
.endm
// pop_v_regs: reload the registers spilled by push_v_regs and release the
// 224-byte save area. Slots are read with offset loads from the current SP
// (X29/X30 at +0 up to q8/q9 at +192), then SP is moved back up in one step.
.macro pop_v_regs
    ldp             x29, x30, [sp]
    ldp             x16, x17, [sp, #16]
    ldp             x14, x15, [sp, #32]
    ldp             x12, x13, [sp, #48]
    ldp             x10, x11, [sp, #64]
    ldp             x8, x9, [sp, #80]
    ldp             q14, q15, [sp, #96]
    ldp             q12, q13, [sp, #128]
    ldp             q10, q11, [sp, #160]
    ldp             q8, q9, [sp, #192]
    add             sp, sp, #224
.endm
46
// swp reg1, reg2: exchange the contents of \reg1 and \reg2.
// x16 is used as the temporary and is overwritten with the old value of
// \reg1. Every use in this file passes 64-bit vector elements (vN.D[i]),
// so the three MOVs move a doubleword lane to x16, lane to lane, and x16
// back into a lane.
.macro swp reg1, reg2
    // x16 = old reg1
    MOv             x16, \reg1
    // reg1 = reg2
    MOv             \reg1, \reg2
    // reg2 = old reg1
    MOv             \reg2, x16
.endm
//------------------------------------------------------------------------------
// ixheaacd_imdct_using_fft_armv8
//
// Register inputs, as used by the code below:
//   X0  base address of a table block. Byte tables are read at offsets
//       11600, 11856, 11920 and 11936; a table of 16-bit pairs is read
//       starting at offset 8528.
//   X1  length selector; compared with 0x400, 0x200, 0x100, 0x80, 0x40.
//       Any other value takes the COND_6 path.
//   X2  source buffer, read as pairs of 32-bit words (8 bytes per pair).
//   X3  destination buffer. The first stage writes X1*8 bytes here and the
//       later passes update that data in place.
//
// Structure (stage names follow the labels):
//   1. One first stage, RADIX_8_FIRST_* or RADIX_4_*, gathers source pairs
//      through the selected byte table, combines them with add/sub
//      butterflies, scales by << 3 (radix 8) or << 2 (radix 4) and stores to
//      the destination.
//   2. X8 passes of OUTER_LOOP_R4 / MIDDLE_LOOP_R4 / INNER_LOOP_R4 multiply
//      three of every four inputs by 16-bit coefficient pairs and apply a
//      4-point butterfly in place.
//
// Register usage: q8-q15, X8-X17, X29 and X30 are saved on entry and restored
// on exit by push_v_regs / pop_v_regs. X29 and X30 are used as ordinary
// scratch registers in between. X0-X7, v0-v7 and v16-v31 are modified and not
// restored (X0 is left advanced by 8528).
//
// NOTE(review): the CMP instructions test all 64 bits of X1 while the rest of
// the code uses W1. If the caller passes a 32-bit length, confirm that the
// upper half of X1 is guaranteed to be zero.
//------------------------------------------------------------------------------
.text
.p2align 2
.global ixheaacd_imdct_using_fft_armv8
ixheaacd_imdct_using_fft_armv8:
    push_v_regs

    // Addresses of the four byte tables inside the block at X0.
    // X29 is only a temporary for the offsets.
    MOV             X29, #11600
    ADD             X4, X0, X29
    MOV             X29, #11856
    ADD             X5, X0, X29
    MOV             X29, #11920
    ADD             X6, X0, X29
    MOV             X29, #11936
    ADD             X7, X0, X29

    // Dispatch on X1. Each case sets X8 (number of in-place passes) and X4
    // (byte table used by the first stage), then picks the first-stage kind.
    //   0x400 -> X8 = 4, table +11600, radix-4 first stage
    //   0x200 -> X8 = 3, table +11856, radix-8 first stage
    //   0x100 -> X8 = 3, table +11856, radix-4 first stage
    //   0x80  -> X8 = 2, table +11920, radix-8 first stage
    //   0x40  -> X8 = 2, table +11920, radix-4 first stage
    //   other -> X8 = 1, table +11936, radix-8 first stage (fall through)
COND_1: CMP         X1, #0x400
    BNE             COND_2
    MOv             X8, #4
    B               RADIX_4_FIRST_START


COND_2: CMP         X1, #0x200
    BNE             COND_3
    MOv             X8, #3
    MOv             X4, X5
    B               RADIX_8_FIRST_START

COND_3: CMP         X1, #0x100
    BNE             COND_4
    MOv             X8, #3
    MOv             X4, X5
    B               RADIX_4_FIRST_START

COND_4: CMP         X1, #0x80
    BNE             COND_5
    MOv             X8, #2
    MOv             X4, X6
    B               RADIX_8_FIRST_START

COND_5: CMP         X1, #0x40
    BNE             COND_6
    MOv             X8, #2
    MOv             X4, X6
    B               RADIX_4_FIRST_START
COND_6:
    MOv             X8, #1
    MOv             X4, X7



    // ---------------------------------------------------------------------
    // Radix-8 first stage.
    // W9 = X1 / 32 loop iterations. W1 is doubled and used as the byte
    // stride between gathered samples; the W-register writes also clear the
    // upper halves of X9 and X1.
    // ---------------------------------------------------------------------
RADIX_8_FIRST_START:
    LSR             W9 , W1, #5
    LSL             W1, W1, #1

RADIX_8_FIRST_LOOP:

    // Four gather pointers, one per vector lane, all starting at the source.
    MOv             X5 , X2
    MOv             X6 , X2
    MOv             X7 , X2
    MOv             X11 , X2

    // Per lane: add 8 * table byte to the pointer, then load one 32-bit pair
    // into that lane at byte offsets 0, 2*X1, 1*X1 and 3*X1 from it:
    //   v0/v1 <- +0      v4/v5 <- +2*X1
    //   v2/v3 <- +1*X1   v6/v7 <- +3*X1
    // Each LD2 post-increments by X1; the ADD/SUB around it re-position the
    // pointer, and the final SUB (LSL #2) returns it to the lane start.

    // Lane 0, table byte [X4 + 0].
    LDRB            W12, [X4]
    ADD             X5, X5, X12, LSL #3
    LD2             {v0.S, v1.S}[0], [X5], X1
    ADD             X5, X5, X1
    LD2             {v4.S, v5.S}[0], [X5], X1
    SUB             X5, X5, X1, LSL #1
    LD2             {v2.S, v3.S}[0], [X5], X1
    ADD             X5, X5, X1
    LD2             {v6.S, v7.S}[0], [X5], X1
    SUB             X5, X5, X1, LSL #2

    // Lane 1, table byte [X4 + 1].
    LDRB            W12, [X4, #1]
    ADD             X6, X6, X12, LSL #3
    LD2             {v0.S, v1.S}[1], [X6] , X1
    ADD             X6, X6, X1
    LD2             {v4.S, v5.S}[1], [X6] , X1
    SUB             X6, X6, X1, LSL #1
    LD2             {v2.S, v3.S}[1], [X6] , X1
    ADD             X6, X6, X1
    LD2             {v6.S, v7.S}[1], [X6], X1
    SUB             X6, X6, X1, LSL #2


    // Lane 2, table byte [X4 + 2]: only the +0 and +2*X1 samples for now;
    // the other two loads are interleaved with arithmetic further down.
    LDRB            W12, [X4, #2]
    ADD             X7, X7, X12, LSL #3
    LD2             {v0.S, v1.S}[2], [X7] , X1
    ADD             X7, X7, X1
    LD2             {v4.S, v5.S}[2], [X7] , X1
    SUB             X7, X7, X1, LSL #1

    // Lane 3, table byte [X4 + 3]: same partial load as lane 2.
    LDRB            W12, [X4, #3]
    ADD             X11, X11, X12, LSL #3
    LD2             {v0.S, v1.S}[3], [X11] , X1
    ADD             X11, X11, X1
    LD2             {v4.S, v5.S}[3], [X11] , X1
    SUB             X11, X11, X1, LSL #1


    // v0/v1 and v4/v5 are complete in all lanes: form their sums and
    // differences while the remaining lane-2/lane-3 loads finish v2/v3 and
    // v6/v7.
    ADD             v8.4S, v0.4S, v4.4S
    LD2             {v2.S, v3.S}[2], [X7] , X1
    ADD             X7, X7, X1


    SUB             v9.4S, v0.4S, v4.4S
    LD2             {v6.S, v7.S}[2], [X7], X1
    SUB             X7, X7, X1, LSL #2


    ADD             v0.4S, v1.4S, v5.4S
    LD2             {v2.S, v3.S}[3], [X11] , X1
    ADD             X11, X11, X1

    SUB             v4.4S, v1.4S, v5.4S
    LD2             {v6.S, v7.S}[3], [X11], X1
    SUB             X11, X11, X1, LSL #2

    // Four table bytes consumed in this iteration.
    ADD             X4, X4, #4

    // Move every lane pointer forward by half a stride (X1 / 2) to reach the
    // second set of four samples, at +0.5, +1.5, +2.5 and +3.5 strides.
    ADD             X5, X5, X1, LSR #1
    ADD             X6, X6, X1, LSR #1
    ADD             X7, X7, X1, LSR #1
    ADD             X11, X11, X1, LSR #1


    // Continue the butterflies on the first sample set while gathering the
    // second set: v14/v15 <- +0.5, v10/v11 <- +1.5, v12/v13 <- +2.5 strides.
    ADD             v1.4S, v2.4S, v6.4S
    LD2             {v14.S, v15.S}[0], [X5] , X1


    SUB             v5.4S, v2.4S, v6.4S
    LD2             {v10.S, v11.S}[0], [X5] , X1


    ADD             v2.4S, v3.4S, v7.4S
    LD2             {v12.S, v13.S}[0], [X5] , X1


    SUB             v6.4S, v3.4S, v7.4S
    LD2             {v14.S, v15.S}[1], [X6] , X1

    ADD             v3.4S, v9.4S, v6.4S
    LD2             {v10.S, v11.S}[1], [X6] , X1

    SUB             v7.4S, v9.4S, v6.4S
    LD2             {v12.S, v13.S}[1], [X6] , X1

    SUB             v6.4S, v4.4S, v5.4S
    LD2             {v14.S, v15.S}[2], [X7] , X1

    ADD             v9.4S, v4.4S, v5.4S
    LD2             {v10.S, v11.S}[2], [X7] , X1

    ADD             v4.4S, v8.4S, v1.4S
    LD2             {v12.S, v13.S}[2], [X7] , X1

    SUB             v5.4S, v8.4S, v1.4S
    LD2             {v14.S, v15.S}[3], [X11] , X1

    ADD             v8.4S, v0.4S, v2.4S
    LD2             {v10.S, v11.S}[3], [X11] , X1

    SUB             v0.4S, v0.4S, v2.4S
    LD2             {v12.S, v13.S}[3], [X11] , X1

    // v1/v2 have been consumed above, so they are reused for the last
    // gathered sample (+3.5 strides). The second set then gets the same
    // sum/difference treatment as the first.
    LD2             {v1.S, v2.S}[0], [X5], X1

    ADD             v17.4S, v14.4S, v12.4S

    LD2             {v1.S, v2.S}[1], [X6] , X1

    SUB             v16.4S, v14.4S, v12.4S

    LD2             {v1.S, v2.S}[2], [X7] , X1

    ADD             v14.4S, v15.4S, v13.4S

    LD2             {v1.S, v2.S}[3], [X11] , X1

    SUB             v12.4S, v15.4S, v13.4S

    ADD             v15.4S, v10.4S, v1.4S
    SUB             v13.4S, v10.4S, v1.4S
    ADD             v10.4S, v11.4S, v2.4S
    SUB             v1.4S, v11.4S, v2.4S

    ADD             v11.4S, v17.4S, v15.4S
    SUB             v2.4S, v17.4S, v15.4S
    ADD             v17.4S, v14.4S, v10.4S
    SUB             v15.4S, v14.4S, v10.4S

    ADD             v14.4S, v16.4S, v12.4S
    SUB             v10.4S, v16.4S, v12.4S
    ADD             v16.4S, v13.4S, v1.4S
    SUB             v12.4S, v13.4S, v1.4S

    ADD             v1.4S , v14.4S, v12.4S
    SUB             v13.4S, v14.4S, v12.4S
    SUB             v12.4S, v16.4S, v10.4S


    // Split each 32-bit value that is about to be multiplied by the constant
    // into 16-bit halves: UZP1 collects the even halfwords (low halves),
    // UZP2 the odd halfwords (high halves). Only the lower 4H of each result
    // is used by the multiplies below.
    UZP1            v22.8H, v1.8H, v1.8H
    UZP2            v23.8H, v1.8H, v1.8H
    ADD             v14.4S, v16.4S, v10.4S

    UZP1            v26.8H, v13.8H, v13.8H
    UZP2            v27.8H, v13.8H, v13.8H
    ADD             v16.4S, v4.4S, v11.4S

    UZP1            v24.8H, v12.8H, v12.8H
    UZP2            v25.8H, v12.8H, v12.8H
    SUB             v10.4S, v4.4S, v11.4S

    UZP1            v28.8H, v14.8H, v14.8H
    UZP2            v29.8H, v14.8H, v14.8H
    ADD             v4.4S, v8.4S, v17.4S

    // 0x5a82 = 23170, i.e. 2^15 / sqrt(2) rounded down
    // (presumably the Q15 value of cos(pi/4) - confirm).
    MOv             W14, #0x5a82

    SUB             v11.4S, v8.4S, v17.4S

    ADD             v8.4S, v5.4S, v15.4S
    SUB             v17.4S, v5.4S, v15.4S
    SUB             v5.4S, v0.4S, v2.4S
    ADD             v15.4S, v0.4S, v2.4S

    // 32-bit x 16-bit multiply built from halves:
    //   result = ((low16 * c) >> 15) + 2 * (high16 * c)
    // UMULL handles the unsigned low half, SQDMLAL adds the doubled signed
    // high-half product with saturation.
    DUP             v31.4H, W14

    UMULL           v19.4S, v26.4H, v31.4H
    UMULL           v18.4S, v28.4H, v31.4H
    SSHR            v19.4S, v19.4S, #15
    SSHR            v18.4S, v18.4S, #15


    SQDMLAL         v19.4S, v27.4H, v31.4H
    SQDMLAL         v18.4S, v29.4H, v31.4H


    UMULL           v13.4S, v24.4H, v31.4H
    UMULL           v14.4S, v22.4H, v31.4H

    // Combine the scaled terms with the first-set results.
    ADD             v20.4S, v3.4S, v19.4S
    SUB             v21.4S, v3.4S, v19.4S
    ADD             v30.4S, v6.4S, v18.4S
    SUB             v6.4S, v6.4S, v18.4S

    SSHR            v13.4S, v13.4S, #15
    SSHR            v14.4S, v14.4S, #15

    SQDMLAL         v13.4S, v25.4H, v31.4H
    SQDMLAL         v14.4S, v23.4H, v31.4H

    ADD             v3.4S, v7.4S, v13.4S
    SUB             v19.4S, v7.4S, v13.4S
    ADD             v1.4S, v9.4S, v14.4S
    SUB             v18.4S, v9.4S, v14.4S

    // Exchange whole registers (both 64-bit halves) v17 <-> v8 and
    // v4 <-> v16 so the TRN pairs below pick up the intended operands.
    // Each swp overwrites x16.
    swp             v17.D[0], v8.D[0]
    swp             v17.D[1], v8.D[1]
    swp             v4.D[0], v16.D[0]
    swp             v4.D[1], v16.D[1]

    // Reorder for storing: TRN1/TRN2 interleave 32-bit elements of register
    // pairs, SHL #3 scales every result by 8, and the 64-bit half swaps that
    // follow finish the rearrangement.
    TRN1            v12.4S, v4.4S, v20.4S
    TRN2            v22.4S, v4.4S, v20.4S

    SHL             v12.4S, v12.4S, #3
    TRN1            v9.4S, v17.4S, v3.4S
    TRN2            v2.4S, v17.4S, v3.4S
    SHL             v22.4S, v22.4S, #3

    SHL             v9.4S, v9.4S, #3
    TRN1            v24.4S, v10.4S, v21.4S
    TRN2            v7.4S, v10.4S, v21.4S
    SHL             v2.4S, v2.4S, #3

    SHL             v24.4S, v24.4S, #3
    TRN1            v13.4S, v16.4S, v6.4S
    TRN2            v23.4S, v16.4S, v6.4S
    SHL             v7.4S, v7.4S, #3

    SHL             v13.4S, v13.4S, #3
    TRN1            v10.4S, v5.4S, v18.4S
    TRN2            v3.4S, v5.4S, v18.4S
    SHL             v23.4S, v23.4S, #3

    SHL             v10.4S, v10.4S, #3
    TRN1            v26.4S, v8.4S, v19.4S
    TRN2            v4.4S, v8.4S, v19.4S
    SHL             v3.4S, v3.4S, #3

    SHL             v26.4S, v26.4S, #3
    TRN1            v25.4S, v11.4S, v30.4S
    TRN2            v8.4S, v11.4S, v30.4S
    SHL             v4.4S, v4.4S, #3

    SHL             v25.4S, v25.4S, #3
    TRN1            v27.4S, v15.4S, v1.4S
    TRN2            v5.4S, v15.4S, v1.4S
    SHL             v8.4S, v8.4S, #3

    SHL             v27.4S, v27.4S, #3
    swp             v9.D[0], v12.D[1]
    SHL             v5.4S, v5.4S, #3
    swp             v2.D[0], v22.D[1]

    swp             v24.D[1], v26.D[0]
    swp             v7.D[1], v4.D[0]
    swp             v10.D[0], v13.D[1]
    swp             v3.D[0], v23.D[1]
    swp             v27.D[0], v25.D[1]
    swp             v5.D[0], v8.D[1]

    // Eight interleaving stores of 32 bytes each: 256 bytes per iteration.
    MOv             X15, #32
    ST2             {v12.4S, v13.4S}, [X3], X15
    ST2             {v24.4S, v25.4S}, [X3], X15
    ST2             {v22.4S, v23.4S}, [X3], X15
    ST2             {v7.4S, v8.4S}, [X3], X15
    ST2             {v9.4S, v10.4S}, [X3], X15
    ST2             {v26.4S, v27.4S}, [X3], X15
    ST2             {v2.4S, v3.4S}, [X3], X15
    ST2             {v4.4S, v5.4S}, [X3], X15


    SUBS            X9, X9, #1
    BNE             RADIX_8_FIRST_LOOP

    // Undo the doubling of X1 and rewind X3 by the X1 * 8 bytes just written.
    LSR             X1, X1, #1
    LSL             X15, X1, #3
    SUB             X3, X3, X15

    // Parameters for the in-place passes after a radix-8 first stage:
    // X5 = 8 (shifted right by 2 at RADIX_4_FIRST_ENDS), X4 = 32 (coefficient
    // step, used as X4 << 2 bytes), X6 = X1 / 32 (inner-loop count).
    MOv             X5, #8
    MOv             X4, #32
    LSR             X15, X1, #5
    MOv             X6, X15
    B               RADIX_4_FIRST_ENDS
RADIX_8_FIRST_ENDS:

    // ---------------------------------------------------------------------
    // Radix-4 first stage.
    // W9 = X1 / 16 loop iterations. W1 is doubled and used as the byte
    // stride between gathered samples.
    // ---------------------------------------------------------------------
RADIX_4_FIRST_START:

    LSR             W9, W1, #4
    LSL             W1, W1, #1
RADIX_4_LOOP:

    // Four gather pointers, one per vector lane, all starting at the source.
    MOv             X5 , X2
    MOv             X6 , X2
    MOv             X7 , X2
    MOv             X11 , X2

    // Per lane: add 8 * table byte, then load one 32-bit pair at byte
    // offsets 0, 2*X1, 1*X1 and 3*X1:
    //   v0/v1 <- +0      v8/v9   <- +2*X1
    //   v4/v5 <- +1*X1   v12/v13 <- +3*X1

    // Lane 0, table byte [X4 + 0].
    LDRB            W12, [X4, #0]
    ADD             X5, X5, X12, LSL #3

    LD2             {v0.S, v1.S}[0], [X5] , X1
    ADD             X5, X5, X1
    LD2             {v8.S, v9.S}[0], [X5] , X1
    SUB             X5, X5, X1, LSL #1
    LD2             {v4.S, v5.S}[0], [X5] , X1
    ADD             X5, X5, X1
    LD2             {v12.S, v13.S}[0], [X5] , X1

    // Lane 1, table byte [X4 + 1].
    LDRB            W12, [X4, #1]
    ADD             X6, X6, X12, LSL #3
    LD2             {v0.S, v1.S}[1], [X6] , X1
    ADD             X6, X6, X1
    LD2             {v8.S, v9.S}[1], [X6] , X1
    SUB             X6, X6, X1, LSL #1
    LD2             {v4.S, v5.S}[1], [X6] , X1
    ADD             X6, X6, X1
    LD2             {v12.S, v13.S}[1], [X6] , X1

    // Lane 2, table byte [X4 + 2]: +0 and +2*X1 samples first.
    LDRB            W12, [X4, #2]
    ADD             X7, X7, X12, LSL #3

    LD2             {v0.S, v1.S}[2], [X7] , X1
    ADD             X7, X7, X1
    LD2             {v8.S, v9.S}[2], [X7] , X1


    // Lane 3, table byte [X4 + 3]: +0 and +2*X1 samples first.
    LDRB            W12, [X4, #3]
    ADD             X11, X11, X12 , LSL #3


    LD2             {v0.S, v1.S}[3], [X11] , X1
    ADD             X11, X11, X1
    LD2             {v8.S, v9.S}[3], [X11] , X1

    // v0/v1 and v8/v9 are complete: form their sums (v16, v18) and
    // differences (v20, v22) while the remaining lane-2/lane-3 loads fill
    // v4/v5 and v12/v13.
    SUB             X7, X7, X1, LSL #1
    ADD             v16.4S, v0.4S, v8.4S
    LD2             {v4.S, v5.S}[2], [X7] , X1
    ADD             X7, X7, X1
    ADD             v18.4S, v1.4S, v9.4S
    LD2             {v12.S, v13.S}[2], [X7] , X1

    SUB             X11, X11, X1, LSL #1
    SUB             v20.4S, v0.4S, v8.4S
    LD2             {v4.S, v5.S}[3], [X11] , X1
    ADD             X11, X11, X1
    SUB             v22.4S, v1.4S, v9.4S
    LD2             {v12.S, v13.S}[3], [X11] , X1

    // Four table bytes consumed in this iteration.
    ADD             X4, X4, #4

    // Sums and differences of the +1*X1 and +3*X1 samples.
    ADD             v24.4S, v4.4S, v12.4S
    ADD             v26.4S, v5.4S, v13.4S
    SUB             v28.4S, v4.4S, v12.4S
    SUB             v30.4S, v5.4S, v13.4S

    // Second butterfly level. Outputs as register pairs:
    //   v17/v11 = sums + sums, v19/v15 = sums - sums,
    //   v8/v9 and v12/v13 cross-combine the two difference pairs.
    ADD             v17.4S, v16.4S, v24.4S
    ADD             v11.4S, v18.4S, v26.4S
    SUB             v19.4S, v16.4S, v24.4S
    SUB             v15.4S, v18.4S, v26.4S

    ADD             v8.4S, v20.4S, v30.4S
    SUB             v9.4S, v22.4S, v28.4S
    ADD             v13.4S, v22.4S, v28.4S
    SUB             v12.4S, v20.4S, v30.4S

    // Reorder for storing (TRN1/TRN2 plus 64-bit half swaps) and scale every
    // result by 4 (SHL #2).
    TRN1            v0.4S, v17.4S, v8.4S
    TRN2            v8.4S, v17.4S, v8.4S

    SHL             v0.4S, v0.4S, #2
    TRN1            v4.4S, v19.4S, v12.4S
    TRN2            v12.4S, v19.4S, v12.4S
    SHL             v8.4S, v8.4S, #2

    SHL             v4.4S, v4.4S, #2
    TRN1            v1.4S, v11.4S, v9.4S
    TRN2            v9.4S, v11.4S, v9.4S
    SHL             v12.4S, v12.4S, #2

    SHL             v1.4S, v1.4S, #2
    TRN1            v5.4S, v15.4S, v13.4S
    TRN2            v13.4S, v15.4S, v13.4S
    SHL             v9.4S, v9.4S, #2

    SHL             v5.4S, v5.4S, #2
    swp             v4.D[0], v0.D[1]
    SHL             v13.4S, v13.4S, #2

    swp             v12.D[0], v8.D[1]
    swp             v5.D[0], v1.D[1]
    swp             v13.D[0], v9.D[1]

    // Four interleaving stores of 32 bytes each: 128 bytes per iteration.
    MOv             X15, #32
    ST2             {v0.4S, v1.4S}, [X3], X15
    ST2             {v8.4S, v9.4S}, [X3], X15
    ST2             {v4.4S, v5.4S}, [X3], X15
    ST2             {v12.4S, v13.4S}, [X3], X15


    SUBS            W9, W9, #1
    BNE             RADIX_4_LOOP

    // Undo the doubling of X1 and rewind X3 by the X1 * 8 bytes just written.
    // Parameters for the in-place passes after a radix-4 first stage:
    // X5 = 4 (shifted right by 2 just below), X4 = 64, X6 = X1 / 16.
    LSR             X1, X1, #1
    SUB             X3, X3, X1, LSL #3
    MOv             X5, #4
    MOv             X4, #64
    LSR             X6, X1, #4


    // ---------------------------------------------------------------------
    // In-place passes. State on entry:
    //   X3 = start of the destination data, X8 = number of passes,
    //   X5 = 4 or 8, X4 = coefficient step, X6 = inner-loop count.
    // ---------------------------------------------------------------------
RADIX_4_FIRST_ENDS:

    // Keep the data base in x30 (the saved link register is restored by
    // pop_v_regs); X3 becomes a scratch register from here on.
    MOv             x30, X3
    // X5 becomes 1 (after radix 4) or 2 (after radix 8).
    LSR             X5, X5, #2

    // X0 now addresses the table of 16-bit coefficient pairs.
    MOV             X14, #8528
    ADD             X0, X0, X14

OUTER_LOOP_R4:

    // Per pass: X14 = data pointer, X7 = middle-loop counter (X5 iterations),
    // X2 = byte offset into the coefficient table, X9 = coefficient pointer,
    // X12 = X5 * 32 = byte distance between the four butterfly inputs.
    MOv             X14, x30

    MOv             X7, X5
    MOv             X2, #0
    MOv             X9, X0
    LSL             X12, X5, #5
MIDDLE_LOOP_R4:

    // Load the coefficient pairs for four adjacent positions, one per lane.
    // For a lane with table offset "off", three 16-bit pairs are read from
    // X0 + off, X0 + 2*off and X0 + 3*off into v20/v21, v22/v23 and v24/v25.
    // "off" grows by X4 * 4 bytes from one lane to the next, and X2/X9 are
    // left ready for the next middle-loop iteration.
    LD2             {v20.H, v21.H}[0], [X9], X2
    LD2             {v22.H, v23.H}[0], [X9], X2
    ADD             X11, X2, X4, LSL #2
    LD2             {v24.H, v25.H}[0], [X9]
    ADD             X10, X0, X11

    LD2             {v20.H, v21.H}[1], [X10], X11
    LD2             {v22.H, v23.H}[1], [X10], X11
    ADD             X2, X11, X4, LSL #2
    LD2             {v24.H, v25.H}[1], [X10]
    ADD             X9, X0, X2

    LD2             {v20.H, v21.H}[2], [X9], X2
    LD2             {v22.H, v23.H}[2], [X9], X2
    ADD             X11, X2, X4, LSL #2
    LD2             {v24.H, v25.H}[2], [X9]
    ADD             X10, X0, X11

    LD2             {v20.H, v21.H}[3], [X10], X11
    LD2             {v22.H, v23.H}[3], [X10], X11
    ADD             X2, X11, X4, LSL #2
    LD2             {v24.H, v25.H}[3], [X10]
    ADD             X9, X0, X2

    // Inner-loop counter.
    MOv             X10, X6
INNER_LOOP_R4:

    // Input 0 (v30/v31) is loaded as full 32-bit words and halved.
    // Inputs 1..3 are loaded with LD4 on halfwords, which splits every
    // 32-bit word of each pair into separate 16-bit registers; with
    // little-endian data that is
    //   {low(first), high(first), low(second), high(second)}
    //   input 1 -> v16..v19, input 2 -> v26..v29, input 3 -> v0..v3
    LD2             {v30.4S, v31.4S}, [X14], X12
    SSHR            v30.4S, v30.4S, #1
    LD4             {v16.4H, v17.4H, v18.4H, v19.4H}, [X14], X12
    SSHR            v31.4S, v31.4S, #1

    // Low halves are treated as unsigned and halved before multiplying.
    USHR            v16.4H, v16.4H, #1
    LD4             {v26.4H, v27.4H, v28.4H, v29.4H}, [X14], X12
    USHR            v18.4H, v18.4H, #1

    // Multiply each input pair (a, b) by its coefficient pair (c, d):
    //   first result  = a*c - b*d
    //   second result = a*d + b*c
    // Every 32x16 product is assembled as
    //   ((low16 >> 1) * coef >> 15) + high16 * coef.
    // The low-half and high-half steps of the three inputs are interleaved.
    //   input 1 -> v11 / v12, input 2 -> v13 / v14, input 3 -> v15 / v5
    SMULL           v11.4S, v16.4H, v20.4H
    SMLSL           v11.4S, v18.4H, v21.4H

    LD4             {v0.4H, v1.4H, v2.4H, v3.4H}, [X14], X12
    SMULL           v12.4S, v16.4H, v21.4H
    SMLAL           v12.4S, v18.4H, v20.4H

    USHR            v26.4H, v26.4H, #1
    USHR            v28.4H, v28.4H, #1

    // x29 = 4 * X12 (used at the end of the first-iteration path below);
    // X14 goes back to input 0.
    LSL             x29, X12, #2
    SUB             X14, X14, X12, LSL #2

    USHR            v0.4H, v0.4H, #1
    USHR            v2.4H, v2.4H, #1

    SMULL           v13.4S, v26.4H, v22.4H
    SMLSL           v13.4S, v28.4H, v23.4H

    SSHR            v11.4S, v11.4S, #15

    SMULL           v14.4S, v26.4H, v23.4H
    SMLAL           v14.4S, v28.4H, v22.4H

    SMULL           v15.4S, v0.4H, v24.4H
    SMLSL           v15.4S, v2.4H, v25.4H

    SMLAL           v11.4S, v17.4H, v20.4H
    SMLSL           v11.4S, v19.4H, v21.4H

    SSHR            v12.4S, v12.4S, #15
    SSHR            v13.4S, v13.4S, #15
    SSHR            v14.4S, v14.4S, #15
    SSHR            v15.4S, v15.4S, #15

    SMLAL           v12.4S, v17.4H, v21.4H
    SMLAL           v12.4S, v19.4H, v20.4H

    SMULL           v5.4S, v0.4H, v25.4H
    SMLAL           v5.4S, v2.4H, v24.4H

    SMLAL           v13.4S, v27.4H, v22.4H
    SMLSL           v13.4S, v29.4H, v23.4H

    SMLAL           v14.4S, v27.4H, v23.4H
    SMLAL           v14.4S, v29.4H, v22.4H

    SMLAL           v15.4S, v1.4H, v24.4H
    SMLSL           v15.4S, v3.4H, v25.4H

    SSHR            v5.4S, v5.4S, #15

    SMLAL           v5.4S, v1.4H, v25.4H
    SMLAL           v5.4S, v3.4H, v24.4H

    // On the first middle-loop iteration (X7 == X5) lane 0 uses the
    // coefficients at table offset 0. For that lane the products are
    // replaced with the raw inputs arithmetically shifted right by 1.
    // NOTE(review): presumably because the offset-0 coefficient is unity,
    // which the 16-bit format cannot represent exactly - confirm.
    SUBS            x17, X7, X5
    BNE             BYPASS_IF

    // First words of inputs 1, 2, 3 -> lane 0 of v11, v13, v15.
    ADD             X14, X14, X12

    LDR             W3, [X14]
    ADD             X14, X14, X12
    ASR             W3, W3, #1

    MOv             v11.S[0], W3

    LDR             W3, [X14]
    ADD             X14, X14, X12
    ASR             W3, W3, #1
    MOv             v13.S[0], W3

    LDR             W3, [X14]
    ASR             W3, W3, #1
    MOv             v15.S[0], W3

    // Back to input 1, then 4 bytes on: second words of inputs 1, 2, 3 ->
    // lane 0 of v12, v14, v5.
    SUB             X14, X14, X12, LSL #1
    ADD             X14, X14, #4

    LDR             W3, [X14]
    ADD             X14, X14, X12
    ASR             W3, W3, #1
    MOv             v12.S[0], W3

    LDR             W3, [X14]
    ADD             X14, X14, X12
    ASR             W3, W3, #1
    MOv             v14.S[0], W3

    LDR             W3, [X14]
    ADD             X14, X14, X12
    ASR             W3, W3, #1
    MOv             v5.S[0], W3

    // Undo the +4 and the four X12 steps: X14 is back at input 0.
    SUB             X14, X14, #4

    SUB             X14, X14, x29

BYPASS_IF:

    // 4-point butterfly on in0 = v30/v31, in1 = v11/v12, in2 = v13/v14,
    // in3 = v15/v5 (first/second word of each pair):
    //   v6/v7   = in0 + in2      v30/v31 = in0 - in2
    //   v8/v9   = in1 + in3      v15/v14 = in1 - in3
    ADD             v6.4S, v30.4S, v13.4S
    ADD             v7.4S, v31.4S, v14.4S
    SUB             v30.4S, v30.4S, v13.4S
    SUB             v31.4S, v31.4S, v14.4S
    ADD             v8.4S, v11.4S, v15.4S
    ADD             v9.4S, v12.4S, v5.4S

    SUB             v15.4S, v11.4S, v15.4S
    SUB             v14.4S, v12.4S, v5.4S


    // Results are stored back over the four inputs, X12 bytes apart:
    //   out0 = (v6 + v8,   v7 + v9)
    //   out1 = (v30 + v14, v31 - v15)
    //   out2 = (v6 - v8,   v7 - v9)
    //   out3 = (v30 - v14, v31 + v15)
    ADD             v10.4S, v6.4S, v8.4S
    ADD             v11.4S, v7.4S, v9.4S
    ADD             v12.4S, v30.4S, v14.4S
    SUB             v13.4S, v31.4S, v15.4S

    SUB             v6.4S, v6.4S, v8.4S
    ST2             {v10.4S, v11.4S}, [X14], X12
    SUB             v7.4S, v7.4S, v9.4S

    SUB             v8.4S, v30.4S, v14.4S
    ST2             {v12.4S, v13.4S}, [X14], X12
    ADD             v9.4S, v31.4S, v15.4S

    ST2             {v6.4S, v7.4S}, [X14], X12
    ST2             {v8.4S, v9.4S}, [X14], X12
    SUBS            X10, X10, #1
    BNE             INNER_LOOP_R4

    // Rewind X14 by X1 * 8 bytes and step 32 bytes forward to the next
    // group of four pairs.
    SUB             X14, X14, X1, LSL #3
    ADD             X14, X14, #32

    SUBS            X7, X7, #1
    BNE             MIDDLE_LOOP_R4

    // Next pass: coefficient step / 4, butterfly span * 4, inner count / 4.
    LSR             X4, X4, #2
    LSL             X5, X5, #2
    LSR             X6, X6, #2
    SUBS            X8, X8, #1
    BNE             OUTER_LOOP_R4
END_LOOPS:
    // Restore the saved registers (including X29/X30) and return.
    pop_v_regs
    RET
817
818
819
820