1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19// *******************************************************************************
20// * @file
21// *  ihevc_itrans_recon_8x8_neon.s
22// *
23// * @brief
24// *  contains function definitions for single stage  inverse transform
25// *
26// * @author
27// * anand s
28// *
29// * @par list of functions:
30// *  - ihevc_itrans_recon_32x32()
31// *
32// * @remarks
33// *  the input buffer is being corrupted
34// *
35// *******************************************************************************
36//*/
37
38///**
39// *******************************************************************************
40// *
41// * @brief
42// *  this function performs inverse transform and reconstruction for a 32x32
43// * input block
44// *
45// * @par description:
46// *  performs inverse transform and adds the prediction  data and clips output
47// * to 8 bit
48// *
49// * @param[in] pi2_src
50// *  input 32x32 coefficients
51// *
52// * @param[in] pi2_tmp
53// *  temporary 32x32 buffer for storing inverse
54// *
55// *  transform
56// *  1st stage output
57// *
58// * @param[in] pu1_pred
59// *  prediction 32x32 block
60// *
61// * @param[out] pu1_dst
62// *  output 32x32 block
63// *
64// * @param[in] src_strd
65// *  input stride
66// *
67// * @param[in] pred_strd
68// *  prediction stride
69// *
70// * @param[in] dst_strd
71// *  output stride
72// *
73// * @param[in] x12
74// *  zero columns in pi2_src (zero_cols)
75// *
76// * @param[in] x11
77// *  zero rows in pi2_src (zero_rows)
78// *
79// * @returns  void
80// *
81// * @remarks
82// *  none
83// *
84// *******************************************************************************
85// */
86
87//void ihevc_itrans_recon_32x32(word16 *pi2_src,
88//                            word16 *pi2_tmp,
89//                            uword8 *pu1_pred,
90//                            uword8 *pu1_dst,
91//                            word32 src_strd,
92//                            word32 pred_strd,
93//                            word32 dst_strd,
94//                            word32 x12,
95//                            word32 x11)
96
97//**************variables vs registers*************************
98//    x0 => *pi2_src
99//    x1 => *pi2_tmp
100//    x2 => *pu1_pred
101//    x3 => *pu1_dst
102//    x4 => src_strd
103//    x5 => pred_strd
104//    x6 => dst_strd
105//    x7 => zero_cols (copied to x12)
106//    [sp] => zero_rows (loaded into w11)
107
108
109//d0[0]=    64        d2[0]=83
110//d0[1]= 90        d2[1]=82
111//d0[2]= 90        d2[2]=80
112//d0[3]= 90        d2[3]=78
113//d1[0]= 89         d3[0]=75
114//d1[1]= 88        d3[1]=73
115//d1[2]= 87        d3[2]=70
116//d1[3]= 85        d3[3]=67
117
118//d4[0]=    64        d6[0]=36
119//d4[1]= 61        d6[1]=31
120//d4[2]= 57        d6[2]=25
121//d4[3]= 54        d6[3]=22
122//d5[0]= 50         d7[0]=18
123//d5[1]= 46        d7[1]=13
124//d5[2]= 43        d7[2]=9
125//d5[3]= 38        d7[3]=4
126
127.text                                    // everything below is emitted into the code section
128.align 4                                 // NOTE(review): on AArch64 GNU as this is a power-of-two alignment (2^4 = 16 bytes) - confirm for the toolchain in use
129.include "ihevc_neon_macros.s"           // presumably supplies the push_v_regs macro used in the prologue - verify against that file
130
131
132
133
134.set shift_stage1_idct ,   7             // immediate for the sqrshrn (rounding, saturating, narrowing) shift applied to the stage-1 sums
135.set shift_stage2_idct ,   12            // NOTE(review): not referenced in the visible part of this file; presumably the stage-2 shift - confirm
136
137//#define zero_cols      x12      (x12 is copied from x7, the 8th argument, at function entry)
138//#define zero_rows     x11      (w11 is loaded from [sp], the first stack argument, at function entry)
139
140.globl ihevc_itrans_recon_32x32_av8        // export the entry point so it can be linked from other objects
141
142.extern g_ai2_ihevc_trans_32_transpose    // coefficient table defined elsewhere; its first 64 bytes are loaded into v0-v7 at function entry
143
144.type ihevc_itrans_recon_32x32_av8, %function   // mark the symbol as a function in the object file
145
146ihevc_itrans_recon_32x32_av8:
147
148    ldr         w11, [sp]
149
150// stmfd sp!,{x0-x12,x14}
151    push_v_regs
152    stp         x19, x20,[sp,#-16]!
153    stp         x0, x1,[sp,#-16]!
154    stp         x5, x6,[sp,#-16]!
155
156//ldr            x8,[sp,#56]     @ prediction stride
157//ldr            x7,[sp,#64]     @ destination stride
158    mov         x6, x4 // src stride
159    mov         x12, x7
160    lsl         x6, x6, #1                  // x sizeof(word16)
161    add         x10,x6,x6, lsl #1           // 3 rows
162
163
164    mov         x8,x0
165
166    adrp        x14, :got:g_ai2_ihevc_trans_32_transpose
167    ldr         x14, [x14, #:got_lo12:g_ai2_ihevc_trans_32_transpose]
168
169    ld1         {v0.4h, v1.4h, v2.4h, v3.4h},[x14],#32
170    ld1         {v4.4h, v5.4h, v6.4h, v7.4h},[x14],#32
171
172//registers which are free
173//  x10,x9,x11,x12
174    mov         x9,#0xffffff00
175    mov         x10,#0xfffffff0
176    mov         w5,#0xfffff000
177    mov         w7,#0xffff0000
178    cmp         x12,x10
179    mov         x20,#1
180    csel        x14, x20, x14,hs
181    bhs         stage1
182
183
184    cmp         x12,x9
185    mov         x20,#2
186    csel        x14, x20, x14,hs
187    bhs         stage1
188
189    cmp         x12,x5
190    mov         x20,#3
191    csel        x14, x20, x14,hs
192    bhs         stage1
193
194    cmp         x12,x7
195    mov         x20,#4
196    csel        x14, x20, x14,hs
197
198    mov         x14,#8
199    b           stage1
200//.ltorg
201
202
203dct_stage1:
204    add         x8,x8,#8
205    mov         x0,x8
206
207stage1:
208    ld1         {v10.4h},[x0],x6
209    ld1         {v8.4h},[x0],x6
210    ld1         {v11.4h},[x0],x6
211    ld1         {v9.4h},[x0],x6
212
213    smull       v24.4s, v8.4h, v0.h[1]     //// y1 * cos1(part of b0)
214    smull       v26.4s, v8.4h, v0.h[3]     //// y1 * cos3(part of b1)
215    smull       v28.4s, v8.4h, v1.h[1]     //// y1 * sin3(part of b2)
216    smull       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
217
218    smlal       v24.4s, v9.4h, v0.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
219    smlal       v26.4s, v9.4h, v2.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
220    smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
221    smlal       v30.4s, v9.4h, v5.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
222
223
224
225
226
227    smull       v20.4s, v10.4h, v0.h[0]
228    smlal       v20.4s, v11.4h, v0.h[2]
229
230
231    smull       v22.4s, v10.4h, v0.h[0]
232    smlal       v22.4s, v11.4h, v1.h[2]
233
234    smull       v16.4s, v10.4h, v0.h[0]
235    smlal       v16.4s, v11.4h, v2.h[2]
236
237    smull       v18.4s, v10.4h, v0.h[0]
238    smlal       v18.4s, v11.4h, v3.h[2]
239    cmp         x11,x10
240    bhs         shift1
241
242    ld1         {v12.4h},[x0],x6
243    ld1         {v14.4h},[x0],x6
244    ld1         {v13.4h},[x0],x6
245    ld1         {v15.4h},[x0],x6
246
247
248
249
250
251
252
253    smlal       v24.4s, v14.4h, v1.h[1]
254    smlal       v26.4s, v14.4h, v3.h[3]
255    smlal       v28.4s, v14.4h, v6.h[1]
256    smlsl       v30.4s, v14.4h, v7.h[1]
257
258
259    smlal       v24.4s, v15.4h, v1.h[3]
260    smlal       v26.4s, v15.4h, v5.h[1]
261    smlsl       v28.4s, v15.4h, v7.h[1]
262    smlsl       v30.4s, v15.4h, v3.h[3]
263
264
265    smlal       v20.4s, v12.4h, v1.h[0]
266    smlal       v20.4s, v13.4h, v1.h[2]
267    smlal       v22.4s, v12.4h, v3.h[0]
268    smlal       v22.4s, v13.4h, v4.h[2]
269    smlal       v16.4s, v12.4h, v5.h[0]
270    smlal       v16.4s, v13.4h, v7.h[2]
271    smlal       v18.4s, v12.4h, v7.h[0]
272    smlsl       v18.4s, v13.4h, v5.h[2]
273
274    cmp         x11,x9
275    bhs         shift1
276
277    ld1         {v10.4h},[x0],x6
278    ld1         {v8.4h},[x0],x6
279    ld1         {v11.4h},[x0],x6
280    ld1         {v9.4h},[x0],x6
281
282
283    smlal       v24.4s, v8.4h, v2.h[1]     //// y1 * cos1(part of b0)
284    smlal       v26.4s, v8.4h, v6.h[3]     //// y1 * cos3(part of b1)
285    smlsl       v28.4s, v8.4h, v4.h[3]     //// y1 * sin3(part of b2)
286    smlsl       v30.4s, v8.4h, v0.h[1]     //// y1 * sin1(part of b3)
287
288    smlal       v24.4s, v9.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
289    smlsl       v26.4s, v9.4h, v7.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
290    smlsl       v28.4s, v9.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
291    smlsl       v30.4s, v9.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
292
293
294
295
296
297    smlal       v20.4s, v10.4h, v2.h[0]
298    smlal       v20.4s, v11.4h, v2.h[2]
299
300
301    smlal       v22.4s, v10.4h, v6.h[0]
302    smlal       v22.4s, v11.4h, v7.h[2]
303
304    smlsl       v16.4s, v10.4h, v6.h[0]
305    smlsl       v16.4s, v11.4h, v3.h[2]
306
307    smlsl       v18.4s, v10.4h, v2.h[0]
308    smlsl       v18.4s, v11.4h, v1.h[2]
309
310    cmp         x11,x5
311    bhs         shift1
312
313
314    ld1         {v12.4h},[x0],x6
315    ld1         {v14.4h},[x0],x6
316    ld1         {v13.4h},[x0],x6
317    ld1         {v15.4h},[x0],x6
318
319
320
321
322
323
324
325
326
327    smlal       v24.4s, v14.4h, v3.h[1]
328    smlsl       v26.4s, v14.4h, v6.h[1]
329    smlsl       v28.4s, v14.4h, v0.h[1]
330    smlsl       v30.4s, v14.4h, v6.h[3]
331
332
333    smlal       v24.4s, v15.4h, v3.h[3]
334    smlsl       v26.4s, v15.4h, v4.h[3]
335    smlsl       v28.4s, v15.4h, v2.h[3]
336    smlal       v30.4s, v15.4h, v5.h[3]
337
338
339    smlal       v20.4s, v12.4h, v3.h[0]
340    smlal       v20.4s, v13.4h, v3.h[2]
341    smlsl       v22.4s, v12.4h, v7.h[0]
342    smlsl       v22.4s, v13.4h, v5.h[2]
343    smlsl       v16.4s, v12.4h, v1.h[0]
344    smlsl       v16.4s, v13.4h, v1.h[2]
345    smlsl       v18.4s, v12.4h, v5.h[0]
346    smlal       v18.4s, v13.4h, v7.h[2]
347
348    cmp         x11,x7
349    bhs         shift1
350
351
352    ld1         {v10.4h},[x0],x6
353    ld1         {v8.4h},[x0],x6
354    ld1         {v11.4h},[x0],x6
355    ld1         {v9.4h},[x0],x6
356
357
358
359    smlal       v24.4s, v8.4h, v4.h[1]     //// y1 * cos1(part of b0)
360    smlsl       v26.4s, v8.4h, v3.h[1]     //// y1 * cos3(part of b1)
361    smlsl       v28.4s, v8.4h, v5.h[1]     //// y1 * sin3(part of b2)
362    smlal       v30.4s, v8.4h, v2.h[1]     //// y1 * sin1(part of b3)
363
364    smlal       v24.4s, v9.4h, v4.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
365    smlsl       v26.4s, v9.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
366    smlsl       v28.4s, v9.4h, v7.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
367    smlal       v30.4s, v9.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
368
369
370
371
372
373    smlal       v20.4s, v10.4h, v0.h[0]
374    smlal       v20.4s, v11.4h, v4.h[2]
375
376
377    smlsl       v22.4s, v10.4h, v0.h[0]
378    smlsl       v22.4s, v11.4h, v2.h[2]
379
380    smlsl       v16.4s, v10.4h, v0.h[0]
381    smlsl       v16.4s, v11.4h, v6.h[2]
382
383    smlal       v18.4s, v10.4h, v0.h[0]
384    smlal       v18.4s, v11.4h, v0.h[2]
385
386
387
388    ld1         {v12.4h},[x0],x6
389    ld1         {v14.4h},[x0],x6
390    ld1         {v13.4h},[x0],x6
391    ld1         {v15.4h},[x0],x6
392
393
394
395
396    smlal       v24.4s, v14.4h, v5.h[1]
397    smlsl       v26.4s, v14.4h, v0.h[2]
398    smlal       v28.4s, v14.4h, v5.h[3]
399    smlal       v30.4s, v14.4h, v4.h[3]
400
401
402    smlal       v24.4s, v15.4h, v5.h[3]
403    smlsl       v26.4s, v15.4h, v1.h[1]
404    smlal       v28.4s, v15.4h, v3.h[1]
405    smlsl       v30.4s, v15.4h, v7.h[3]
406
407
408    smlal       v20.4s, v12.4h, v5.h[0]
409    smlal       v20.4s, v13.4h, v5.h[2]
410    smlsl       v22.4s, v12.4h, v1.h[0]
411    smlsl       v22.4s, v13.4h, v0.h[2]
412    smlal       v16.4s, v12.4h, v7.h[0]
413    smlal       v16.4s, v13.4h, v4.h[2]
414    smlal       v18.4s, v12.4h, v3.h[0]
415    smlal       v18.4s, v13.4h, v6.h[2]
416
417
418    ld1         {v10.4h},[x0],x6
419    ld1         {v8.4h},[x0],x6
420    ld1         {v11.4h},[x0],x6
421    ld1         {v9.4h},[x0],x6
422
423
424
425
426
427
428
429    smlal       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
430    smlsl       v26.4s, v8.4h, v2.h[3]     //// y1 * cos3(part of b1)
431    smlal       v28.4s, v8.4h, v0.h[1]     //// y1 * sin3(part of b2)
432    smlsl       v30.4s, v8.4h, v4.h[1]     //// y1 * sin1(part of b3)
433
434    smlal       v24.4s, v9.4h, v6.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
435    smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
436    smlal       v28.4s, v9.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
437    smlsl       v30.4s, v9.4h, v0.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
438
439
440
441
442
443    smlal       v20.4s, v10.4h, v6.h[0]
444    smlal       v20.4s, v11.4h, v6.h[2]
445
446
447    smlsl       v22.4s, v10.4h, v2.h[0]
448    smlsl       v22.4s, v11.4h, v3.h[2]
449
450    smlal       v16.4s, v10.4h, v2.h[0]
451    smlal       v16.4s, v11.4h, v0.h[2]
452
453    smlsl       v18.4s, v10.4h, v6.h[0]
454    smlsl       v18.4s, v11.4h, v2.h[2]
455
456    ld1         {v12.4h},[x0],x6
457    ld1         {v14.4h},[x0],x6
458    ld1         {v13.4h},[x0],x6
459    ld1         {v15.4h},[x0],x6
460
461
462    smlal       v24.4s, v14.4h, v7.h[1]
463    smlsl       v26.4s, v14.4h, v5.h[3]
464    smlal       v28.4s, v14.4h, v4.h[1]
465    smlsl       v30.4s, v14.4h, v2.h[3]
466
467
468    smlal       v24.4s, v15.4h, v7.h[3]
469    smlsl       v26.4s, v15.4h, v7.h[1]
470    smlal       v28.4s, v15.4h, v6.h[3]
471    smlsl       v30.4s, v15.4h, v6.h[1]
472
473
474    smlal       v20.4s, v12.4h, v7.h[0]
475    smlal       v20.4s, v13.4h, v7.h[2]
476    smlsl       v22.4s, v12.4h, v5.h[0]
477    smlsl       v22.4s, v13.4h, v6.h[2]
478    smlal       v16.4s, v12.4h, v3.h[0]
479    smlal       v16.4s, v13.4h, v5.h[2]
480    smlsl       v18.4s, v12.4h, v1.h[0]
481    smlsl       v18.4s, v13.4h, v4.h[2]
482
483
484
485shift1:
486    add         v8.4s,  v20.4s ,  v24.4s
487    sub         v10.4s,  v20.4s ,  v24.4s
488
489    add         v12.4s,  v22.4s ,  v26.4s
490    sub         v24.4s,  v22.4s ,  v26.4s
491
492    add         v14.4s,  v16.4s ,  v28.4s
493    sub         v26.4s,  v16.4s ,  v28.4s
494
495
496    add         v16.4s,  v18.4s ,  v30.4s
497    sub         v28.4s,  v18.4s ,  v30.4s
498
499
500    sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
501    sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
502    sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
503    sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
504    sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
505    sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
506    sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
507    sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
508
509
510    // registers used q15,q14,q6,q7
511
512    umov        x15,v24.d[0]
513    umov        x16,v25.d[0]
514    umov        x19,v26.d[0]
515    umov        x20,v27.d[0]
516
517    trn1        v24.4h, v30.4h, v12.4h
518    trn2        v25.4h, v30.4h, v12.4h
519    trn1        v26.4h, v31.4h, v13.4h
520    trn2        v27.4h, v31.4h, v13.4h
521
522    trn1        v30.2s, v24.2s, v26.2s
523    trn2        v31.2s, v24.2s, v26.2s
524    trn1        v12.2s, v25.2s, v27.2s
525    trn2        v13.2s, v25.2s, v27.2s
526
527    trn1        v24.4h, v14.4h, v18.4h
528    trn2        v25.4h, v14.4h, v18.4h
529    trn1        v26.4h, v15.4h, v19.4h
530    trn2        v27.4h, v15.4h, v19.4h
531
532    trn1        v14.2s, v24.2s, v26.2s
533    trn2        v15.2s, v24.2s, v26.2s
534    trn1        v18.2s, v25.2s, v27.2s
535    trn2        v19.2s, v25.2s, v27.2s
536
537    mov         v24.d[0],x15
538    mov         v25.d[0],x16
539    mov         v26.d[0],x19
540    mov         v27.d[0],x20
541
542// d30 =x0 1- 4 values
543// d31 =x2 1- 4 values
544// d12=x1 1- 4 values
545// d13=x3 1- 4 values
546// d14 =x0 28-31 values
547// d15 =x2 28- 31 values
548// d18=x1 28- 31 values
549// d19=x3 28- 31 values
550
551
552
553    st1         { v30.4h, v31.4h},[x1],#16
554    st1         { v12.4h, v13.4h},[x1],#16
555    add         x1,x1,#192
556    st1         { v14.4h, v15.4h},[x1],#16
557    st1         { v18.4h, v19.4h},[x1],#16
558    sub         x1,x1,#224
559
560    mov         x0,x8
561
562
563
564
565
566    ld1         {v10.4h},[x0],x6
567    ld1         {v8.4h},[x0],x6
568    ld1         {v11.4h},[x0],x6
569    ld1         {v9.4h},[x0],x6
570
571
572
573
574    smull       v24.4s, v8.4h, v2.h[1]     //// y1 * cos1(part of b0)
575    smull       v26.4s, v8.4h, v2.h[3]     //// y1 * cos3(part of b1)
576    smull       v28.4s, v8.4h, v3.h[1]     //// y1 * sin3(part of b2)
577    smull       v30.4s, v8.4h, v3.h[3]     //// y1 * sin1(part of b3)
578
579    smlal       v24.4s, v9.4h, v6.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
580    smlsl       v26.4s, v9.4h, v7.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
581    smlsl       v28.4s, v9.4h, v6.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
582    smlsl       v30.4s, v9.4h, v4.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
583
584
585
586
587
588    smull       v20.4s, v10.4h, v0.h[0]
589    smlal       v20.4s, v11.4h, v4.h[2]
590
591
592    smull       v22.4s, v10.4h, v0.h[0]
593    smlal       v22.4s, v11.4h, v5.h[2]
594
595    smull       v16.4s, v10.4h, v0.h[0]
596    smlal       v16.4s, v11.4h, v6.h[2]
597
598    smull       v18.4s, v10.4h, v0.h[0]
599    smlal       v18.4s, v11.4h, v7.h[2]
600    cmp         x11,x10
601    bhs         shift2
602
603    ld1         {v12.4h},[x0],x6
604    ld1         {v14.4h},[x0],x6
605    ld1         {v13.4h},[x0],x6
606    ld1         {v15.4h},[x0],x6
607
608
609    smlsl       v24.4s, v14.4h, v4.h[3]
610    smlsl       v26.4s, v14.4h, v2.h[1]
611    smlsl       v28.4s, v14.4h, v0.h[1]
612    smlsl       v30.4s, v14.4h, v2.h[3]
613
614
615    smlsl       v24.4s, v15.4h, v0.h[3]
616    smlsl       v26.4s, v15.4h, v3.h[1]
617    smlsl       v28.4s, v15.4h, v6.h[3]
618    smlal       v30.4s, v15.4h, v5.h[3]
619
620
621    smlsl       v20.4s, v12.4h, v7.h[0]
622    smlsl       v20.4s, v13.4h, v2.h[2]
623    smlsl       v22.4s, v12.4h, v5.h[0]
624    smlsl       v22.4s, v13.4h, v0.h[2]
625    smlsl       v16.4s, v12.4h, v3.h[0]
626    smlsl       v16.4s, v13.4h, v3.h[2]
627    smlsl       v18.4s, v12.4h, v1.h[0]
628    smlsl       v18.4s, v13.4h, v6.h[2]
629
630    cmp         x11,x9
631    bhs         shift2
632
633
634    ld1         {v10.4h},[x0],x6
635    ld1         {v8.4h},[x0],x6
636    ld1         {v11.4h},[x0],x6
637    ld1         {v9.4h},[x0],x6
638
639
640
641
642
643
644
645    smlsl       v24.4s, v8.4h, v4.h[1]     //// y1 * cos1(part of b0)
646    smlal       v26.4s, v8.4h, v7.h[1]     //// y1 * cos3(part of b1)
647    smlal       v28.4s, v8.4h, v2.h[3]     //// y1 * sin3(part of b2)
648    smlal       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
649
650    smlal       v24.4s, v9.4h, v7.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
651    smlal       v26.4s, v9.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
652    smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
653    smlsl       v30.4s, v9.4h, v6.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
654
655
656
657
658
659    smlsl       v20.4s, v10.4h, v2.h[0]
660    smlsl       v20.4s, v11.4h, v6.h[2]
661
662
663    smlsl       v22.4s, v10.4h, v6.h[0]
664    smlal       v22.4s, v11.4h, v4.h[2]
665
666    smlal       v16.4s, v10.4h, v6.h[0]
667    smlal       v16.4s, v11.4h, v0.h[2]
668
669    smlal       v18.4s, v10.4h, v2.h[0]
670    smlal       v18.4s, v11.4h, v5.h[2]
671
672    cmp         x11,x5
673    bhs         shift2
674
675
676    ld1         {v12.4h},[x0],x6
677    ld1         {v14.4h},[x0],x6
678    ld1         {v13.4h},[x0],x6
679    ld1         {v15.4h},[x0],x6
680
681
682
683
684
685    smlal       v24.4s, v14.4h, v2.h[3]
686    smlal       v26.4s, v14.4h, v3.h[3]
687    smlsl       v28.4s, v14.4h, v5.h[3]
688    smlsl       v30.4s, v14.4h, v0.h[3]
689
690
691    smlal       v24.4s, v15.4h, v1.h[3]
692    smlsl       v26.4s, v15.4h, v6.h[3]
693    smlsl       v28.4s, v15.4h, v0.h[3]
694    smlal       v30.4s, v15.4h, v7.h[3]
695
696
697    smlal       v20.4s, v12.4h, v5.h[0]
698    smlal       v20.4s, v13.4h, v0.h[2]
699    smlal       v22.4s, v12.4h, v1.h[0]
700    smlal       v22.4s, v13.4h, v6.h[2]
701    smlal       v16.4s, v12.4h, v7.h[0]
702    smlsl       v16.4s, v13.4h, v2.h[2]
703    smlsl       v18.4s, v12.4h, v3.h[0]
704    smlsl       v18.4s, v13.4h, v4.h[2]
705
706
707    cmp         x11,x7
708    bhs         shift2
709
710
711    ld1         {v10.4h},[x0],x6
712    ld1         {v8.4h},[x0],x6
713    ld1         {v11.4h},[x0],x6
714    ld1         {v9.4h},[x0],x6
715
716
717
718
719
720
721
722    smlal       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
723    smlsl       v26.4s, v8.4h, v1.h[1]     //// y1 * cos3(part of b1)
724    smlsl       v28.4s, v8.4h, v7.h[1]     //// y1 * sin3(part of b2)
725    smlal       v30.4s, v8.4h, v0.h[3]     //// y1 * sin1(part of b3)
726
727    smlsl       v24.4s, v9.4h, v5.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
728    smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
729    smlal       v28.4s, v9.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
730    smlal       v30.4s, v9.4h, v7.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
731
732
733
734
735
736    smlal       v20.4s, v10.4h, v0.h[0]
737    smlsl       v20.4s, v11.4h, v7.h[2]
738
739
740    smlsl       v22.4s, v10.4h, v0.h[0]
741    smlsl       v22.4s, v11.4h, v1.h[2]
742
743    smlsl       v16.4s, v10.4h, v0.h[0]
744    smlal       v16.4s, v11.4h, v5.h[2]
745
746    smlal       v18.4s, v10.4h, v0.h[0]
747    smlal       v18.4s, v11.4h, v3.h[2]
748
749
750
751    ld1         {v12.4h},[x0],x6
752    ld1         {v14.4h},[x0],x6
753    ld1         {v13.4h},[x0],x6
754    ld1         {v15.4h},[x0],x6
755
756
757    smlsl       v24.4s, v14.4h, v0.h[1]
758    smlal       v26.4s, v14.4h, v6.h[1]
759    smlal       v28.4s, v14.4h, v4.h[1]
760    smlsl       v30.4s, v14.4h, v1.h[1]
761
762
763    smlsl       v24.4s, v15.4h, v3.h[3]
764    smlal       v26.4s, v15.4h, v0.h[1]
765    smlsl       v28.4s, v15.4h, v5.h[1]
766    smlsl       v30.4s, v15.4h, v6.h[1]
767
768
769    smlsl       v20.4s, v12.4h, v3.h[0]
770    smlsl       v20.4s, v13.4h, v1.h[2]
771    smlsl       v22.4s, v12.4h, v7.h[0]
772    smlal       v22.4s, v13.4h, v3.h[2]
773    smlal       v16.4s, v12.4h, v1.h[0]
774    smlal       v16.4s, v13.4h, v7.h[2]
775    smlsl       v18.4s, v12.4h, v5.h[0]
776    smlsl       v18.4s, v13.4h, v2.h[2]
777
778    ld1         {v10.4h},[x0],x6
779    ld1         {v8.4h},[x0],x6
780    ld1         {v11.4h},[x0],x6
781    ld1         {v9.4h},[x0],x6
782
783
784
785
786    smlal       v24.4s, v8.4h, v7.h[3]     //// y1 * cos1(part of b0)
787    smlal       v26.4s, v8.4h, v4.h[3]     //// y1 * cos3(part of b1)
788    smlsl       v28.4s, v8.4h, v1.h[1]     //// y1 * sin3(part of b2)
789    smlal       v30.4s, v8.4h, v2.h[1]     //// y1 * sin1(part of b3)
790
791    smlal       v24.4s, v9.4h, v3.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
792    smlsl       v26.4s, v9.4h, v5.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
793    smlsl       v28.4s, v9.4h, v7.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
794    smlal       v30.4s, v9.4h, v5.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
795
796
797
798
799
800    smlsl       v20.4s, v10.4h, v6.h[0]
801    smlal       v20.4s, v11.4h, v5.h[2]
802
803
804    smlal       v22.4s, v10.4h, v2.h[0]
805    smlal       v22.4s, v11.4h, v7.h[2]
806
807    smlsl       v16.4s, v10.4h, v2.h[0]
808    smlsl       v16.4s, v11.4h, v4.h[2]
809
810    smlal       v18.4s, v10.4h, v6.h[0]
811    smlal       v18.4s, v11.4h, v1.h[2]
812
813
814    ld1         {v12.4h},[x0],x6
815    ld1         {v14.4h},[x0],x6
816    ld1         {v13.4h},[x0],x6
817    ld1         {v15.4h},[x0],x6
818
819
820
821
822
823    smlal       v24.4s, v14.4h, v1.h[1]
824    smlsl       v26.4s, v14.4h, v0.h[3]
825    smlal       v28.4s, v14.4h, v1.h[3]
826    smlsl       v30.4s, v14.4h, v3.h[1]
827
828
829    smlal       v24.4s, v15.4h, v5.h[3]
830    smlsl       v26.4s, v15.4h, v5.h[1]
831    smlal       v28.4s, v15.4h, v4.h[3]
832    smlsl       v30.4s, v15.4h, v4.h[1]
833
834
835    smlal       v20.4s, v12.4h, v1.h[0]
836    smlal       v20.4s, v13.4h, v3.h[2]
837    smlsl       v22.4s, v12.4h, v3.h[0]
838    smlsl       v22.4s, v13.4h, v2.h[2]
839    smlal       v16.4s, v12.4h, v5.h[0]
840    smlal       v16.4s, v13.4h, v1.h[2]
841    smlsl       v18.4s, v12.4h, v7.h[0]
842    smlsl       v18.4s, v13.4h, v0.h[2]
843
844shift2:
845    add         v8.4s,  v20.4s ,  v24.4s
846    sub         v10.4s,  v20.4s ,  v24.4s
847
848    add         v12.4s,  v22.4s ,  v26.4s
849    sub         v24.4s,  v22.4s ,  v26.4s
850
851    add         v14.4s,  v16.4s ,  v28.4s
852    sub         v26.4s,  v16.4s ,  v28.4s
853
854
855    add         v16.4s,  v18.4s ,  v30.4s
856    sub         v28.4s,  v18.4s ,  v30.4s
857
858
859    sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
860    sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
861    sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
862    sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
863    sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
864    sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
865    sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
866    sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
867
868    umov        x15,v24.d[0]
869    umov        x16,v25.d[0]
870    umov        x19,v26.d[0]
871    umov        x20,v27.d[0]
872
873    trn1        v24.4h, v30.4h, v12.4h
874    trn2        v25.4h, v30.4h, v12.4h
875    trn1        v26.4h, v31.4h, v13.4h
876    trn2        v27.4h, v31.4h, v13.4h
877
878    trn1        v30.2s, v24.2s, v26.2s
879    trn2        v31.2s, v24.2s, v26.2s
880    trn1        v12.2s, v25.2s, v27.2s
881    trn2        v13.2s, v25.2s, v27.2s
882
883    trn1        v24.4h, v14.4h, v18.4h
884    trn2        v25.4h, v14.4h, v18.4h
885    trn1        v26.4h, v15.4h, v19.4h
886    trn2        v27.4h, v15.4h, v19.4h
887
888    trn1        v14.2s, v24.2s, v26.2s
889    trn2        v15.2s, v24.2s, v26.2s
890    trn1        v18.2s, v25.2s, v27.2s
891    trn2        v19.2s, v25.2s, v27.2s
892
893    mov         v24.d[0],x15
894    mov         v25.d[0],x16
895    mov         v26.d[0],x19
896    mov         v27.d[0],x20
897
898    st1         { v30.4h, v31.4h},[x1],#16
899    st1         { v12.4h, v13.4h},[x1],#16
900    add         x1,x1,#128
901    st1         { v14.4h, v15.4h},[x1],#16
902    st1         { v18.4h, v19.4h},[x1],#16
903    sub         x1,x1,#160
904    mov         x0,x8
905
906
907
908    ld1         {v10.4h},[x0],x6
909    ld1         {v8.4h},[x0],x6
910    ld1         {v11.4h},[x0],x6
911    ld1         {v9.4h},[x0],x6
912
913
914    smull       v24.4s, v8.4h, v4.h[1]     //// y1 * cos1(part of b0)
915    smull       v26.4s, v8.4h, v4.h[3]     //// y1 * cos3(part of b1)
916    smull       v28.4s, v8.4h, v5.h[1]     //// y1 * sin3(part of b2)
917    smull       v30.4s, v8.4h, v5.h[3]     //// y1 * sin1(part of b3)
918
919    smlsl       v24.4s, v9.4h, v3.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
920    smlsl       v26.4s, v9.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
921    smlsl       v28.4s, v9.4h, v0.h[2]     //// y1 * sin3 - y3 * cos1(part of b2)
922    smlsl       v30.4s, v9.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
923
924
925
926
927
928    smull       v20.4s, v10.4h, v0.h[0]
929    smlsl       v20.4s, v11.4h, v7.h[2]
930
931
932    smull       v22.4s, v10.4h, v0.h[0]
933    smlsl       v22.4s, v11.4h, v6.h[2]
934
935    smull       v16.4s, v10.4h, v0.h[0]
936    smlsl       v16.4s, v11.4h, v5.h[2]
937
938    smull       v18.4s, v10.4h, v0.h[0]
939    smlsl       v18.4s, v11.4h, v4.h[2]
940
941    cmp         x11,x10
942    bhs         shift3
943
944    ld1         {v12.4h},[x0],x6
945    ld1         {v14.4h},[x0],x6
946    ld1         {v13.4h},[x0],x6
947    ld1         {v15.4h},[x0],x6
948
949
950
951
952    smlsl       v24.4s, v14.4h, v5.h[1]
953    smlsl       v26.4s, v14.4h, v7.h[3]
954    smlal       v28.4s, v14.4h, v5.h[3]
955    smlal       v30.4s, v14.4h, v3.h[1]
956
957
958    smlal       v24.4s, v15.4h, v2.h[1]
959    smlal       v26.4s, v15.4h, v1.h[1]
960    smlal       v28.4s, v15.4h, v4.h[3]
961    smlsl       v30.4s, v15.4h, v7.h[3]
962
963
964    smlsl       v20.4s, v12.4h, v1.h[0]
965    smlal       v20.4s, v13.4h, v6.h[2]
966    smlsl       v22.4s, v12.4h, v3.h[0]
967    smlal       v22.4s, v13.4h, v3.h[2]
968    smlsl       v16.4s, v12.4h, v5.h[0]
969    smlal       v16.4s, v13.4h, v0.h[2]
970    smlsl       v18.4s, v12.4h, v7.h[0]
971    smlal       v18.4s, v13.4h, v2.h[2]
972
973    cmp         x11,x9
974    bhs         shift3
975
976    ld1         {v10.4h},[x0],x6
977    ld1         {v8.4h},[x0],x6
978    ld1         {v11.4h},[x0],x6
979    ld1         {v9.4h},[x0],x6
980
981    smlal       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
982    smlsl       v26.4s, v8.4h, v5.h[1]     //// y1 * cos3(part of b1)
983    smlsl       v28.4s, v8.4h, v0.h[3]     //// y1 * sin3(part of b2)
984    smlsl       v30.4s, v8.4h, v3.h[3]     //// y1 * sin1(part of b3)
985
986    smlsl       v24.4s, v9.4h, v1.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
987    smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
988    smlal       v28.4s, v9.4h, v6.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
989    smlal       v30.4s, v9.4h, v0.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
990
991
992
993
994
995    smlal       v20.4s, v10.4h, v2.h[0]
996    smlsl       v20.4s, v11.4h, v5.h[2]
997
998
999    smlal       v22.4s, v10.4h, v6.h[0]
1000    smlsl       v22.4s, v11.4h, v0.h[2]
1001
1002    smlsl       v16.4s, v10.4h, v6.h[0]
1003    smlsl       v16.4s, v11.4h, v4.h[2]
1004
1005    smlsl       v18.4s, v10.4h, v2.h[0]
1006    smlal       v18.4s, v11.4h, v6.h[2]
1007
1008    cmp         x11,x5
1009    bhs         shift3
1010
1011
1012    ld1         {v12.4h},[x0],x6
1013    ld1         {v14.4h},[x0],x6
1014    ld1         {v13.4h},[x0],x6
1015    ld1         {v15.4h},[x0],x6
1016
1017
1018
1019
1020
1021
1022    smlsl       v24.4s, v14.4h, v7.h[1]
1023    smlal       v26.4s, v14.4h, v2.h[1]
1024    smlal       v28.4s, v14.4h, v4.h[1]
1025    smlsl       v30.4s, v14.4h, v5.h[1]
1026
1027
1028    smlal       v24.4s, v15.4h, v0.h[3]
1029    smlal       v26.4s, v15.4h, v7.h[1]
1030    smlsl       v28.4s, v15.4h, v1.h[1]
1031    smlsl       v30.4s, v15.4h, v6.h[1]
1032
1033
1034    smlsl       v20.4s, v12.4h, v3.h[0]
1035    smlal       v20.4s, v13.4h, v4.h[2]
1036    smlal       v22.4s, v12.4h, v7.h[0]
1037    smlal       v22.4s, v13.4h, v2.h[2]
1038    smlal       v16.4s, v12.4h, v1.h[0]
1039    smlsl       v16.4s, v13.4h, v6.h[2]
1040    smlal       v18.4s, v12.4h, v5.h[0]
1041    smlsl       v18.4s, v13.4h, v0.h[2]
1042
1043
1044    cmp         x11,x7
1045    bhs         shift3
1046
1047
1048    ld1         {v10.4h},[x0],x6
1049    ld1         {v8.4h},[x0],x6
1050    ld1         {v11.4h},[x0],x6
1051    ld1         {v9.4h},[x0],x6
1052
1053
1054    smlsl       v24.4s, v8.4h, v7.h[3]     //// y1 * cos1(part of b0)
1055    smlsl       v26.4s, v8.4h, v0.h[1]     //// y1 * cos3(part of b1)
1056    smlal       v28.4s, v8.4h, v6.h[3]     //// y1 * sin3(part of b2)
1057    smlal       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
1058
1059    smlsl       v24.4s, v9.4h, v0.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
1060    smlal       v26.4s, v9.4h, v5.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
1061    smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
1062    smlsl       v30.4s, v9.4h, v2.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
1063
1064
1065
1066
1067
1068    smlal       v20.4s, v10.4h, v0.h[0]
1069    smlsl       v20.4s, v11.4h, v3.h[2]
1070
1071
1072    smlsl       v22.4s, v10.4h, v0.h[0]
1073    smlsl       v22.4s, v11.4h, v5.h[2]
1074
1075    smlsl       v16.4s, v10.4h, v0.h[0]
1076    smlal       v16.4s, v11.4h, v1.h[2]
1077
1078    smlal       v18.4s, v10.4h, v0.h[0]
1079    smlal       v18.4s, v11.4h, v7.h[2]
1080
1081
1082    ld1         {v12.4h},[x0],x6
1083    ld1         {v14.4h},[x0],x6
1084    ld1         {v13.4h},[x0],x6
1085    ld1         {v15.4h},[x0],x6
1086
1087
1088
1089    smlal       v24.4s, v14.4h, v6.h[3]
1090    smlal       v26.4s, v14.4h, v3.h[3]
1091    smlsl       v28.4s, v14.4h, v1.h[3]
1092    smlal       v30.4s, v14.4h, v7.h[1]
1093
1094
1095    smlal       v24.4s, v15.4h, v1.h[3]
1096    smlsl       v26.4s, v15.4h, v2.h[3]
1097    smlal       v28.4s, v15.4h, v7.h[1]
1098    smlal       v30.4s, v15.4h, v4.h[1]
1099
1100
1101    smlsl       v20.4s, v12.4h, v5.h[0]
1102    smlal       v20.4s, v13.4h, v2.h[2]
1103    smlal       v22.4s, v12.4h, v1.h[0]
1104    smlsl       v22.4s, v13.4h, v7.h[2]
1105    smlsl       v16.4s, v12.4h, v7.h[0]
1106    smlsl       v16.4s, v13.4h, v3.h[2]
1107    smlsl       v18.4s, v12.4h, v3.h[0]
1108    smlal       v18.4s, v13.4h, v1.h[2]
1109
1110
1111
1112    ld1         {v10.4h},[x0],x6
1113    ld1         {v8.4h},[x0],x6
1114    ld1         {v11.4h},[x0],x6
1115    ld1         {v9.4h},[x0],x6
1116
1117
1118
1119
1120    smlsl       v24.4s, v8.4h, v5.h[3]     //// y1 * cos1(part of b0)
1121    smlsl       v26.4s, v8.4h, v6.h[3]     //// y1 * cos3(part of b1)
1122    smlal       v28.4s, v8.4h, v3.h[1]     //// y1 * sin3(part of b2)
1123    smlsl       v30.4s, v8.4h, v0.h[1]     //// y1 * sin1(part of b3)
1124
1125    smlsl       v24.4s, v9.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
1126    smlal       v26.4s, v9.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
1127    smlsl       v28.4s, v9.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
1128    smlal       v30.4s, v9.4h, v4.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
1129
1130
1131
1132
1133
1134    smlal       v20.4s, v10.4h, v6.h[0]
1135    smlsl       v20.4s, v11.4h, v1.h[2]
1136
1137
1138    smlsl       v22.4s, v10.4h, v2.h[0]
1139    smlal       v22.4s, v11.4h, v4.h[2]
1140
1141    smlal       v16.4s, v10.4h, v2.h[0]
1142    smlsl       v16.4s, v11.4h, v7.h[2]
1143
1144    smlsl       v18.4s, v10.4h, v6.h[0]
1145    smlsl       v18.4s, v11.4h, v5.h[2]
1146
1147
1148    ld1         {v12.4h},[x0],x6
1149    ld1         {v14.4h},[x0],x6
1150    ld1         {v13.4h},[x0],x6
1151    ld1         {v15.4h},[x0],x6
1152
1153    smlal       v24.4s, v14.4h, v4.h[3]
1154    smlsl       v26.4s, v14.4h, v6.h[1]
1155    smlal       v28.4s, v14.4h, v7.h[3]
1156    smlal       v30.4s, v14.4h, v6.h[3]
1157
1158
1159    smlal       v24.4s, v15.4h, v3.h[3]
1160    smlsl       v26.4s, v15.4h, v3.h[1]
1161    smlal       v28.4s, v15.4h, v2.h[3]
1162    smlsl       v30.4s, v15.4h, v2.h[1]
1163
1164
1165    smlsl       v20.4s, v12.4h, v7.h[0]
1166    smlal       v20.4s, v13.4h, v0.h[2]
1167    smlal       v22.4s, v12.4h, v5.h[0]
1168    smlsl       v22.4s, v13.4h, v1.h[2]
1169    smlsl       v16.4s, v12.4h, v3.h[0]
1170    smlal       v16.4s, v13.4h, v2.h[2]
1171    smlal       v18.4s, v12.4h, v1.h[0]
1172    smlsl       v18.4s, v13.4h, v3.h[2]
1173
shift3:
    // ------------------------------------------------------------------
    // Stage 1, third pass: butterfly, narrow, transpose and store.
    // On entry v20/v22/v16/v18 hold the even-part sums (a0..a3) and
    // v24/v26/v28/v30 the odd-part sums (b0..b3) accumulated before this
    // label.  Each pair yields one sum (a + b) and one difference (a - b).
    // ------------------------------------------------------------------
    add         v8.4s,  v20.4s ,  v24.4s
    sub         v10.4s,  v20.4s ,  v24.4s

    add         v12.4s,  v22.4s ,  v26.4s
    sub         v24.4s,  v22.4s ,  v26.4s

    add         v14.4s,  v16.4s ,  v28.4s
    sub         v26.4s,  v16.4s ,  v28.4s

    add         v16.4s,  v18.4s ,  v30.4s
    sub         v28.4s,  v18.4s ,  v30.4s

    // Rounding right shift with signed saturation, narrowing 32 -> 16 bit.
    sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> shift_stage1_idct
    sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> shift_stage1_idct
    sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> shift_stage1_idct
    sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> shift_stage1_idct
    sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> shift_stage1_idct
    sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> shift_stage1_idct
    sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> shift_stage1_idct
    sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> shift_stage1_idct

    // v24-v27 serve as transpose scratch below, so their low 64 bits are
    // parked in general registers and put back afterwards.
    // NOTE(review): x19/x20 are callee-saved under AAPCS64 - presumably
    // preserved by the prologue, which is outside this excerpt.
    umov        x15,v24.d[0]
    umov        x16,v25.d[0]
    umov        x19,v26.d[0]
    umov        x20,v27.d[0]

    // 4x4 halfword transpose of (x0, x1, x2, x3).  Afterwards
    // v30 = lane 0 of each, v31 = lane 2, v12 = lane 1, v13 = lane 3.
    trn1        v24.4h, v30.4h, v12.4h
    trn2        v25.4h, v30.4h, v12.4h
    trn1        v26.4h, v31.4h, v13.4h
    trn2        v27.4h, v31.4h, v13.4h

    trn1        v30.2s, v24.2s, v26.2s
    trn2        v31.2s, v24.2s, v26.2s
    trn1        v12.2s, v25.2s, v27.2s
    trn2        v13.2s, v25.2s, v27.2s

    // Same transpose for (x4, x5, x6, x7): v14 = lane 0, v15 = lane 2,
    // v18 = lane 1, v19 = lane 3.
    trn1        v24.4h, v14.4h, v18.4h
    trn2        v25.4h, v14.4h, v18.4h
    trn1        v26.4h, v15.4h, v19.4h
    trn2        v27.4h, v15.4h, v19.4h

    trn1        v14.2s, v24.2s, v26.2s
    trn2        v15.2s, v24.2s, v26.2s
    trn1        v18.2s, v25.2s, v27.2s
    trn2        v19.2s, v25.2s, v27.2s

    mov         v24.d[0],x15
    mov         v25.d[0],x16
    mov         v26.d[0],x19
    mov         v27.d[0],x20

    // Write 32 bytes of sums at x1, skip 64 bytes, write 32 bytes of
    // differences, then rewind: x1 ends 32 bytes past its entry value.
    // The lane order 0, 2, 1, 3 matches the way stage 2 reads the buffer
    // back (even pair into v10/v11 first, odd pair into v8/v9 second).
    st1         { v30.4h, v31.4h},[x1],#16
    st1         { v12.4h, v13.4h},[x1],#16
    add         x1,x1,#64
    st1         { v14.4h, v15.4h},[x1],#16
    st1         { v18.4h, v19.4h},[x1],#16
    sub         x1,x1,#96

    // ------------------------------------------------------------------
    // Stage 1, fourth pass: accumulate a0..a3 / b0..b3 afresh.
    // x0 restarts from x8 (presumably the saved source address of the
    // current column group - it is set outside this excerpt).
    // Every group below loads four 4-halfword vectors at stride x6
    // (assumed to be the byte distance between coefficient rows - TODO
    // confirm).  The load order v10, v8, v11, v9 (or v12, v14, v13, v15)
    // puts the 1st/3rd row of a group in v10/v11 (v12/v13), feeding the
    // even part, and the 2nd/4th row in v8/v9 (v14/v15), feeding the
    // odd part.  v0-v7 hold the coefficient table loaded earlier.
    // ------------------------------------------------------------------
    mov         x0,x8

    // Group 1 (rows 0-3 relative to x8): smull initialises every sum.
    ld1         {v10.4h},[x0],x6
    ld1         {v8.4h},[x0],x6
    ld1         {v11.4h},[x0],x6
    ld1         {v9.4h},[x0],x6

    smull       v24.4s, v8.4h, v6.h[1]     //// b0: 1st odd row of group
    smull       v26.4s, v8.4h, v6.h[3]     //// b1: 1st odd row of group
    smull       v28.4s, v8.4h, v7.h[1]     //// b2: 1st odd row of group
    smull       v30.4s, v8.4h, v7.h[3]     //// b3: 1st odd row of group

    smlsl       v24.4s, v9.4h, v2.h[3]     //// b0: 2nd odd row of group
    smlsl       v26.4s, v9.4h, v4.h[1]     //// b1: 2nd odd row of group
    smlsl       v28.4s, v9.4h, v5.h[3]     //// b2: 2nd odd row of group
    smlsl       v30.4s, v9.4h, v7.h[1]     //// b3: 2nd odd row of group

    smull       v20.4s, v10.4h, v0.h[0]    // a0
    smlsl       v20.4s, v11.4h, v3.h[2]

    smull       v22.4s, v10.4h, v0.h[0]    // a1
    smlsl       v22.4s, v11.4h, v2.h[2]

    smull       v16.4s, v10.4h, v0.h[0]    // a2
    smlsl       v16.4s, v11.4h, v1.h[2]

    smull       v18.4s, v10.4h, v0.h[0]    // a3
    smlsl       v18.4s, v11.4h, v0.h[2]

    // Early exit: when x11 >= x10 (unsigned) the remaining groups are
    // skipped.  x11 and the thresholds x10/x9/x5/x7 come from outside
    // this excerpt - presumably a zero-row mask and its cut-offs.
    cmp         x11,x10
    bhs         shift4

    // Group 2 (rows 4-7).
    ld1         {v12.4h},[x0],x6
    ld1         {v14.4h},[x0],x6
    ld1         {v13.4h},[x0],x6
    ld1         {v15.4h},[x0],x6

    // Lane corrected from v0.h[1]: row 5 times 25 gives cos(125*pi/64)
    // = cos(3*pi/64), i.e. table index 3.  This assumes v0-v7 hold c0..c31
    // in index order, as every other multiply here implies.  HEVC's c1 and
    // c3 are both 90, so the computed value does not change.
    smlal       v24.4s, v14.4h, v0.h[3]
    smlal       v26.4s, v14.4h, v1.h[3]
    smlal       v28.4s, v14.4h, v4.h[1]
    smlal       v30.4s, v14.4h, v6.h[3]

    smlsl       v24.4s, v15.4h, v4.h[1]
    smlsl       v26.4s, v15.4h, v0.h[3]
    smlsl       v28.4s, v15.4h, v2.h[3]
    smlsl       v30.4s, v15.4h, v6.h[1]

    smlal       v20.4s, v12.4h, v7.h[0]
    smlal       v20.4s, v13.4h, v5.h[2]
    smlal       v22.4s, v12.4h, v5.h[0]
    smlsl       v22.4s, v13.4h, v7.h[2]
    smlal       v16.4s, v12.4h, v3.h[0]
    smlsl       v16.4s, v13.4h, v4.h[2]
    smlal       v18.4s, v12.4h, v1.h[0]
    smlsl       v18.4s, v13.4h, v1.h[2]

    cmp         x11,x9
    bhs         shift4

    // Group 3 (rows 8-11).
    ld1         {v10.4h},[x0],x6
    ld1         {v8.4h},[x0],x6
    ld1         {v11.4h},[x0],x6
    ld1         {v9.4h},[x0],x6

    smlal       v24.4s, v8.4h, v7.h[3]     //// b0: 1st odd row of group
    smlal       v26.4s, v8.4h, v3.h[1]     //// b1: 1st odd row of group
    smlal       v28.4s, v8.4h, v1.h[1]     //// b2: 1st odd row of group
    smlal       v30.4s, v8.4h, v5.h[3]     //// b3: 1st odd row of group

    smlal       v24.4s, v9.4h, v4.h[3]     //// b0: 2nd odd row of group
    smlsl       v26.4s, v9.4h, v5.h[3]     //// b1: 2nd odd row of group
    smlsl       v28.4s, v9.4h, v0.h[1]     //// b2: 2nd odd row of group
    smlsl       v30.4s, v9.4h, v5.h[1]     //// b3: 2nd odd row of group

    smlsl       v20.4s, v10.4h, v2.h[0]
    smlal       v20.4s, v11.4h, v1.h[2]

    smlsl       v22.4s, v10.4h, v6.h[0]
    smlal       v22.4s, v11.4h, v3.h[2]

    smlal       v16.4s, v10.4h, v6.h[0]
    smlsl       v16.4s, v11.4h, v7.h[2]

    smlal       v18.4s, v10.4h, v2.h[0]
    smlsl       v18.4s, v11.4h, v2.h[2]

    cmp         x11,x5
    bhs         shift4

    // Group 4 (rows 12-15).
    ld1         {v12.4h},[x0],x6
    ld1         {v14.4h},[x0],x6
    ld1         {v13.4h},[x0],x6
    ld1         {v15.4h},[x0],x6

    smlsl       v24.4s, v14.4h, v1.h[1]
    smlsl       v26.4s, v14.4h, v7.h[3]
    smlal       v28.4s, v14.4h, v1.h[3]
    smlal       v30.4s, v14.4h, v4.h[3]

    smlal       v24.4s, v15.4h, v2.h[1]
    smlal       v26.4s, v15.4h, v5.h[1]
    smlsl       v28.4s, v15.4h, v3.h[1]
    smlsl       v30.4s, v15.4h, v4.h[1]

    smlsl       v20.4s, v12.4h, v5.h[0]
    smlsl       v20.4s, v13.4h, v7.h[2]
    smlsl       v22.4s, v12.4h, v1.h[0]
    smlal       v22.4s, v13.4h, v1.h[2]
    smlsl       v16.4s, v12.4h, v7.h[0]
    smlal       v16.4s, v13.4h, v5.h[2]
    smlal       v18.4s, v12.4h, v3.h[0]
    smlsl       v18.4s, v13.4h, v3.h[2]

    // Last early exit; past this point rows 16-31 are always processed.
    cmp         x11,x7
    bhs         shift4

    // Group 5 (rows 16-19).
    ld1         {v10.4h},[x0],x6
    ld1         {v8.4h},[x0],x6
    ld1         {v11.4h},[x0],x6
    ld1         {v9.4h},[x0],x6

    smlsl       v24.4s, v8.4h, v5.h[3]     //// b0: 1st odd row of group
    smlsl       v26.4s, v8.4h, v2.h[3]     //// b1: 1st odd row of group
    smlal       v28.4s, v8.4h, v4.h[3]     //// b2: 1st odd row of group
    smlal       v30.4s, v8.4h, v3.h[3]     //// b3: 1st odd row of group

    smlsl       v24.4s, v9.4h, v6.h[3]     //// b0: 2nd odd row of group
    // Lane corrected from v0.h[3]: row 19 times 27 gives cos(513*pi/64)
    // = cos(pi/64), i.e. table index 1 (same value, 90, in HEVC).
    smlal       v26.4s, v9.4h, v0.h[1]     //// b1: 2nd odd row of group
    smlsl       v28.4s, v9.4h, v6.h[1]     //// b2: 2nd odd row of group
    smlsl       v30.4s, v9.4h, v3.h[1]     //// b3: 2nd odd row of group

    smlal       v20.4s, v10.4h, v0.h[0]
    smlsl       v20.4s, v11.4h, v0.h[2]

    smlsl       v22.4s, v10.4h, v0.h[0]
    smlal       v22.4s, v11.4h, v6.h[2]

    smlsl       v16.4s, v10.4h, v0.h[0]
    smlal       v16.4s, v11.4h, v2.h[2]

    smlal       v18.4s, v10.4h, v0.h[0]
    smlsl       v18.4s, v11.4h, v4.h[2]

    // Group 6 (rows 20-23).
    ld1         {v12.4h},[x0],x6
    ld1         {v14.4h},[x0],x6
    ld1         {v13.4h},[x0],x6
    ld1         {v15.4h},[x0],x6

    smlal       v24.4s, v14.4h, v3.h[1]
    smlsl       v26.4s, v14.4h, v2.h[1]
    smlal       v28.4s, v14.4h, v7.h[3]
    smlal       v30.4s, v14.4h, v2.h[3]

    // Lane corrected from v0.h[3]: row 23 times 25 gives cos(575*pi/64)
    // = -cos(pi/64), i.e. minus table index 1 (same value in HEVC).
    smlsl       v24.4s, v15.4h, v0.h[1]
    smlal       v26.4s, v15.4h, v4.h[3]
    smlal       v28.4s, v15.4h, v6.h[3]
    smlsl       v30.4s, v15.4h, v2.h[1]

    smlal       v20.4s, v12.4h, v3.h[0]
    smlsl       v20.4s, v13.4h, v6.h[2]
    smlal       v22.4s, v12.4h, v7.h[0]
    smlsl       v22.4s, v13.4h, v4.h[2]
    smlsl       v16.4s, v12.4h, v1.h[0]
    smlal       v16.4s, v13.4h, v0.h[2]
    smlal       v18.4s, v12.4h, v5.h[0]
    smlsl       v18.4s, v13.4h, v5.h[2]

    // Group 7 (rows 24-27).
    ld1         {v10.4h},[x0],x6
    ld1         {v8.4h},[x0],x6
    ld1         {v11.4h},[x0],x6
    ld1         {v9.4h},[x0],x6

    smlal       v24.4s, v8.4h, v3.h[3]     //// b0: 1st odd row of group
    smlsl       v26.4s, v8.4h, v7.h[1]     //// b1: 1st odd row of group
    smlsl       v28.4s, v8.4h, v5.h[1]     //// b2: 1st odd row of group
    smlal       v30.4s, v8.4h, v1.h[3]     //// b3: 1st odd row of group

    smlsl       v24.4s, v9.4h, v7.h[1]     //// b0: 2nd odd row of group
    smlsl       v26.4s, v9.4h, v6.h[1]     //// b1: 2nd odd row of group
    smlal       v28.4s, v9.4h, v3.h[3]     //// b2: 2nd odd row of group
    smlsl       v30.4s, v9.4h, v1.h[1]     //// b3: 2nd odd row of group

    smlsl       v20.4s, v10.4h, v6.h[0]
    smlal       v20.4s, v11.4h, v2.h[2]

    smlal       v22.4s, v10.4h, v2.h[0]
    smlsl       v22.4s, v11.4h, v0.h[2]

    smlsl       v16.4s, v10.4h, v2.h[0]
    smlal       v16.4s, v11.4h, v3.h[2]

    smlal       v18.4s, v10.4h, v6.h[0]
    smlsl       v18.4s, v11.4h, v6.h[2]

    // Group 8 (rows 28-31).
    ld1         {v12.4h},[x0],x6
    ld1         {v14.4h},[x0],x6
    ld1         {v13.4h},[x0],x6
    ld1         {v15.4h},[x0],x6

    smlsl       v24.4s, v14.4h, v5.h[1]
    smlal       v26.4s, v14.4h, v3.h[3]
    smlsl       v28.4s, v14.4h, v2.h[1]
    smlal       v30.4s, v14.4h, v0.h[3]

    smlal       v24.4s, v15.4h, v1.h[3]
    smlsl       v26.4s, v15.4h, v1.h[1]
    smlal       v28.4s, v15.4h, v0.h[3]
    smlsl       v30.4s, v15.4h, v0.h[1]

    smlsl       v20.4s, v12.4h, v1.h[0]
    smlal       v20.4s, v13.4h, v4.h[2]
    smlal       v22.4s, v12.4h, v3.h[0]
    smlsl       v22.4s, v13.4h, v5.h[2]
    smlsl       v16.4s, v12.4h, v5.h[0]
    smlal       v16.4s, v13.4h, v6.h[2]
    smlal       v18.4s, v12.4h, v7.h[0]
    smlsl       v18.4s, v13.4h, v7.h[2]
1514
shift4:
    // Stage 1, fourth pass: butterfly, narrow, transpose and store, then
    // close the stage-1 loop.  On entry v20/v22/v16/v18 hold the even-part
    // sums (a0..a3) and v24/v26/v28/v30 the odd-part sums (b0..b3).
    add         v8.4s,  v20.4s ,  v24.4s
    sub         v10.4s,  v20.4s ,  v24.4s

    add         v12.4s,  v22.4s ,  v26.4s
    sub         v24.4s,  v22.4s ,  v26.4s

    add         v14.4s,  v16.4s ,  v28.4s
    sub         v26.4s,  v16.4s ,  v28.4s


    add         v16.4s,  v18.4s ,  v30.4s
    sub         v28.4s,  v18.4s ,  v30.4s


    // Rounding right shift with signed saturation, narrowing 32 -> 16 bit.
    sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> shift_stage1_idct
    sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> shift_stage1_idct
    sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> shift_stage1_idct
    sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> shift_stage1_idct
    sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> shift_stage1_idct
    sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> shift_stage1_idct
    sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> shift_stage1_idct
    sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> shift_stage1_idct

    // v24-v27 are transpose scratch below; park their low 64 bits in
    // general registers and restore them afterwards.
    // NOTE(review): x19/x20 are callee-saved under AAPCS64 - presumably
    // preserved by the prologue, which is outside this excerpt.
    umov        x15,v24.d[0]
    umov        x16,v25.d[0]
    umov        x19,v26.d[0]
    umov        x20,v27.d[0]

    // 4x4 halfword transpose of (x0, x1, x2, x3), step 1: interleave pairs.
    trn1        v24.4h, v30.4h, v12.4h
    trn2        v25.4h, v30.4h, v12.4h
    trn1        v26.4h, v31.4h, v13.4h
    trn2        v27.4h, v31.4h, v13.4h

    // Step 2: v30 = lane 0 of each input, v31 = lane 2, v12 = lane 1,
    // v13 = lane 3.
    trn1        v30.2s, v24.2s, v26.2s
    trn2        v31.2s, v24.2s, v26.2s
    trn1        v12.2s, v25.2s, v27.2s
    trn2        v13.2s, v25.2s, v27.2s

    // Same transpose for (x4, x5, x6, x7), step 1.
    trn1        v24.4h, v14.4h, v18.4h
    trn2        v25.4h, v14.4h, v18.4h
    trn1        v26.4h, v15.4h, v19.4h
    trn2        v27.4h, v15.4h, v19.4h

    // Step 2: v14 = lane 0, v15 = lane 2, v18 = lane 1, v19 = lane 3.
    trn1        v14.2s, v24.2s, v26.2s
    trn2        v15.2s, v24.2s, v26.2s
    trn1        v18.2s, v25.2s, v27.2s
    trn2        v19.2s, v25.2s, v27.2s

    // Restore the parked halves of v24-v27.
    mov         v24.d[0],x15
    mov         v25.d[0],x16
    mov         v26.d[0],x19
    mov         v27.d[0],x20

    // Unlike the third pass, sums and differences are written back to
    // back: 64 contiguous bytes at x1.
    st1         { v30.4h, v31.4h},[x1],#16
    st1         { v12.4h, v13.4h},[x1],#16
    st1         { v14.4h, v15.4h},[x1],#16
    st1         { v18.4h, v19.4h},[x1],#16

    // Skip a further 96 bytes, so x1 ends 160 bytes past its value at
    // shift4, ready for the next stage-1 iteration.
    add         x1,x1,#96

    // x14 counts the remaining stage-1 iterations; loop while non-zero
    // (dct_stage1 is defined outside this excerpt).
    subs        x14,x14,#1
    bne         dct_stage1
second_stage_dct:
    // Set-up for stage 2 of the transform.
//    mov        x0,x1
    // Pop the two register pairs (32 bytes) pushed earlier; the push site
    // is outside this excerpt.  Per the register notes further down:
    // x8 = prediction stride, x7 = destination stride, x0 = scratch
    // buffer, x1 = temp buffer.
    ldp         x8, x7,[sp],#16
    ldp         x0, x1,[sp],#16

//    add x4,x2,x8, lsl #1    @ x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
//    add x5,x8,x8, lsl #1    @
//    sub   x0,x0,#512
    // Thresholds for the stage-2 early exits: x12 is compared (unsigned,
    // 64-bit) against x11, x5, x6 and x9 in turn.  Writing w6/w9 clears
    // the upper 32 bits, so all four values are zero-extended.
    mov         x11,#0xfffffff0
    mov         x5, #0xffffff00
    mov         w6,#0xfffff000
    mov         w9,#0xffff0000
//    sub         x1,x1,#2048
    mov         x4,x1                      // x4 keeps the temp-buffer base; stage 2 re-reads from it
    mov         x10,#240                   // post-index after a 16-byte load => 256 bytes per load pair
    mov         x14,#8                     // loop counter (decremented outside this excerpt)
    b           stage2                     // first iteration skips the x4 bump at dct_stage2
1595
// registers free :

// arm registers used
// x8 : prediction stride
// x7 : destination stride
// x1 : temp buffer
// x2 : pred buffer
// x3 : destination buffer
// x14 : loop counter
// x0 : scratch buffer
// x10 : used as stride
// x4 : used to store the initial address
// x12 : zero cols
// x11 : 0xfffffff0
// x5 : 0xffffff00
// x6 : 0xfffff000
// x9 : 0xffff0000
dct_stage2:
    // Loop re-entry for stage 2: advance the saved base x4 by 32 bytes
    // and restart the read pointer x1 from it.
    add         x4,x4,#32
    mov         x1,x4
stage2:
    // ------------------------------------------------------------------
    // Stage 2, first pass: accumulate the even-part sums a0..a3 in
    // v20/v22/v16/v18 and the odd-part sums b0..b3 in v24/v26/v28/v30.
    //
    // Each group reads 16 bytes into v10/v11 (or v12/v13), which feed the
    // even part, then 16 bytes into v8/v9 (or v14/v15), which feed the odd
    // part.  The second load post-indexes by x10 (240, set in
    // second_stage_dct), so x1 moves 256 bytes per group.  This matches
    // the lane order 0, 2, 1, 3 in which stage 1 stored its transposed
    // results.  v0-v7 hold the coefficient table loaded earlier.
    //
    // Early exits: x12 ("zero cols" in the register notes) is compared
    // unsigned against 0xfffffff0, 0xffffff00, 0xfffff000 and 0xffff0000
    // (x11, x5, x6, x9); when x12 >= the threshold the remaining groups
    // are skipped.  Presumably x12 is a mask of all-zero input columns -
    // TODO confirm against the caller.
    // ------------------------------------------------------------------

    // Group 1 (inputs 0-3): smull initialises every sum.
    ld1         {v10.4h, v11.4h},[x1],#16
    ld1         {v8.4h, v9.4h},[x1],x10

    smull       v24.4s, v8.4h, v0.h[1]     //// b0: 1st odd input of group
    smull       v26.4s, v8.4h, v0.h[3]     //// b1: 1st odd input of group
    smull       v28.4s, v8.4h, v1.h[1]     //// b2: 1st odd input of group
    smull       v30.4s, v8.4h, v1.h[3]     //// b3: 1st odd input of group

    smlal       v24.4s, v9.4h, v0.h[3]     //// b0: 2nd odd input of group
    smlal       v26.4s, v9.4h, v2.h[1]     //// b1: 2nd odd input of group
    smlal       v28.4s, v9.4h, v3.h[3]     //// b2: 2nd odd input of group
    smlal       v30.4s, v9.4h, v5.h[1]     //// b3: 2nd odd input of group

    smull       v20.4s, v10.4h, v0.h[0]    // a0
    smlal       v20.4s, v11.4h, v0.h[2]

    smull       v22.4s, v10.4h, v0.h[0]    // a1
    smlal       v22.4s, v11.4h, v1.h[2]

    smull       v16.4s, v10.4h, v0.h[0]    // a2
    smlal       v16.4s, v11.4h, v2.h[2]

    smull       v18.4s, v10.4h, v0.h[0]    // a3
    smlal       v18.4s, v11.4h, v3.h[2]
    cmp         x12,x11
    bhs         stage2_shift1

    // Group 2 (inputs 4-7).
    ld1         {v12.4h, v13.4h},[x1],#16
    ld1         {v14.4h, v15.4h},[x1],x10

    smlal       v24.4s, v14.4h, v1.h[1]
    smlal       v26.4s, v14.4h, v3.h[3]
    smlal       v28.4s, v14.4h, v6.h[1]
    smlsl       v30.4s, v14.4h, v7.h[1]

    smlal       v24.4s, v15.4h, v1.h[3]
    smlal       v26.4s, v15.4h, v5.h[1]
    smlsl       v28.4s, v15.4h, v7.h[1]
    smlsl       v30.4s, v15.4h, v3.h[3]

    smlal       v20.4s, v12.4h, v1.h[0]
    smlal       v20.4s, v13.4h, v1.h[2]
    smlal       v22.4s, v12.4h, v3.h[0]
    smlal       v22.4s, v13.4h, v4.h[2]
    smlal       v16.4s, v12.4h, v5.h[0]
    smlal       v16.4s, v13.4h, v7.h[2]
    smlal       v18.4s, v12.4h, v7.h[0]
    smlsl       v18.4s, v13.4h, v5.h[2]
    cmp         x12,x5
    bhs         stage2_shift1

    // Group 3 (inputs 8-11).
    ld1         {v10.4h, v11.4h},[x1],#16
    ld1         {v8.4h, v9.4h},[x1],x10

    smlal       v24.4s, v8.4h, v2.h[1]     //// b0: 1st odd input of group
    smlal       v26.4s, v8.4h, v6.h[3]     //// b1: 1st odd input of group
    smlsl       v28.4s, v8.4h, v4.h[3]     //// b2: 1st odd input of group
    smlsl       v30.4s, v8.4h, v0.h[1]     //// b3: 1st odd input of group

    smlal       v24.4s, v9.4h, v2.h[3]     //// b0: 2nd odd input of group
    smlsl       v26.4s, v9.4h, v7.h[3]     //// b1: 2nd odd input of group
    smlsl       v28.4s, v9.4h, v2.h[1]     //// b2: 2nd odd input of group
    smlsl       v30.4s, v9.4h, v3.h[1]     //// b3: 2nd odd input of group

    smlal       v20.4s, v10.4h, v2.h[0]
    smlal       v20.4s, v11.4h, v2.h[2]

    smlal       v22.4s, v10.4h, v6.h[0]
    smlal       v22.4s, v11.4h, v7.h[2]

    smlsl       v16.4s, v10.4h, v6.h[0]
    smlsl       v16.4s, v11.4h, v3.h[2]

    smlsl       v18.4s, v10.4h, v2.h[0]
    smlsl       v18.4s, v11.4h, v1.h[2]

    cmp         x12,x6
    bhs         stage2_shift1

    // Group 4 (inputs 12-15).
    ld1         {v12.4h, v13.4h},[x1],#16
    ld1         {v14.4h, v15.4h},[x1],x10

    smlal       v24.4s, v14.4h, v3.h[1]
    smlsl       v26.4s, v14.4h, v6.h[1]
    smlsl       v28.4s, v14.4h, v0.h[1]
    smlsl       v30.4s, v14.4h, v6.h[3]

    smlal       v24.4s, v15.4h, v3.h[3]
    smlsl       v26.4s, v15.4h, v4.h[3]
    smlsl       v28.4s, v15.4h, v2.h[3]
    smlal       v30.4s, v15.4h, v5.h[3]

    smlal       v20.4s, v12.4h, v3.h[0]
    smlal       v20.4s, v13.4h, v3.h[2]
    smlsl       v22.4s, v12.4h, v7.h[0]
    smlsl       v22.4s, v13.4h, v5.h[2]
    smlsl       v16.4s, v12.4h, v1.h[0]
    smlsl       v16.4s, v13.4h, v1.h[2]
    smlsl       v18.4s, v12.4h, v5.h[0]
    smlal       v18.4s, v13.4h, v7.h[2]

    // Last early exit; past this point inputs 16-31 are always processed.
    cmp         x12,x9
    bhs         stage2_shift1

    // Group 5 (inputs 16-19).
    ld1         {v10.4h, v11.4h},[x1],#16
    ld1         {v8.4h, v9.4h},[x1],x10

    smlal       v24.4s, v8.4h, v4.h[1]     //// b0: 1st odd input of group
    smlsl       v26.4s, v8.4h, v3.h[1]     //// b1: 1st odd input of group
    smlsl       v28.4s, v8.4h, v5.h[1]     //// b2: 1st odd input of group
    smlal       v30.4s, v8.4h, v2.h[1]     //// b3: 1st odd input of group

    smlal       v24.4s, v9.4h, v4.h[3]     //// b0: 2nd odd input of group
    smlsl       v26.4s, v9.4h, v1.h[3]     //// b1: 2nd odd input of group
    smlsl       v28.4s, v9.4h, v7.h[3]     //// b2: 2nd odd input of group
    smlal       v30.4s, v9.4h, v1.h[1]     //// b3: 2nd odd input of group

    smlal       v20.4s, v10.4h, v0.h[0]
    smlal       v20.4s, v11.4h, v4.h[2]

    smlsl       v22.4s, v10.4h, v0.h[0]
    smlsl       v22.4s, v11.4h, v2.h[2]

    smlsl       v16.4s, v10.4h, v0.h[0]
    smlsl       v16.4s, v11.4h, v6.h[2]

    smlal       v18.4s, v10.4h, v0.h[0]
    smlal       v18.4s, v11.4h, v0.h[2]

    // Group 6 (inputs 20-23).
    ld1         {v12.4h, v13.4h},[x1],#16
    ld1         {v14.4h, v15.4h},[x1],x10

    smlal       v24.4s, v14.4h, v5.h[1]
    // Lane corrected from v0.h[2], an even-part lane: input 21 times 3
    // gives cos(63*pi/64) = -cos(pi/64), i.e. minus table index 1.  This
    // assumes v0-v7 hold c0..c31 in index order, as every other multiply
    // here implies.  HEVC's c1, c2 and c3 are all 90, so the computed
    // value does not change.
    smlsl       v26.4s, v14.4h, v0.h[1]
    smlal       v28.4s, v14.4h, v5.h[3]
    smlal       v30.4s, v14.4h, v4.h[3]

    smlal       v24.4s, v15.4h, v5.h[3]
    smlsl       v26.4s, v15.4h, v1.h[1]
    smlal       v28.4s, v15.4h, v3.h[1]
    smlsl       v30.4s, v15.4h, v7.h[3]

    smlal       v20.4s, v12.4h, v5.h[0]
    smlal       v20.4s, v13.4h, v5.h[2]
    smlsl       v22.4s, v12.4h, v1.h[0]
    smlsl       v22.4s, v13.4h, v0.h[2]
    smlal       v16.4s, v12.4h, v7.h[0]
    smlal       v16.4s, v13.4h, v4.h[2]
    smlal       v18.4s, v12.4h, v3.h[0]
    smlal       v18.4s, v13.4h, v6.h[2]

    // Group 7 (inputs 24-27).
    ld1         {v10.4h, v11.4h},[x1],#16
    ld1         {v8.4h, v9.4h},[x1],x10

    smlal       v24.4s, v8.4h, v6.h[1]     //// b0: 1st odd input of group
    smlsl       v26.4s, v8.4h, v2.h[3]     //// b1: 1st odd input of group
    // Lane corrected from v0.h[1]: input 25 times 5 gives
    // cos(125*pi/64) = cos(3*pi/64), i.e. table index 3 (same value in HEVC).
    smlal       v28.4s, v8.4h, v0.h[3]     //// b2: 1st odd input of group
    smlsl       v30.4s, v8.4h, v4.h[1]     //// b3: 1st odd input of group

    smlal       v24.4s, v9.4h, v6.h[3]     //// b0: 2nd odd input of group
    smlsl       v26.4s, v9.4h, v4.h[1]     //// b1: 2nd odd input of group
    smlal       v28.4s, v9.4h, v1.h[3]     //// b2: 2nd odd input of group
    // Lane corrected from v0.h[1]: input 27 times 7 gives
    // cos(189*pi/64) = -cos(3*pi/64), i.e. minus table index 3.
    smlsl       v30.4s, v9.4h, v0.h[3]     //// b3: 2nd odd input of group

    smlal       v20.4s, v10.4h, v6.h[0]
    smlal       v20.4s, v11.4h, v6.h[2]

    smlsl       v22.4s, v10.4h, v2.h[0]
    smlsl       v22.4s, v11.4h, v3.h[2]

    smlal       v16.4s, v10.4h, v2.h[0]
    smlal       v16.4s, v11.4h, v0.h[2]

    smlsl       v18.4s, v10.4h, v6.h[0]
    smlsl       v18.4s, v11.4h, v2.h[2]

    // Group 8 (inputs 28-31).
    ld1         {v12.4h, v13.4h},[x1],#16
    ld1         {v14.4h, v15.4h},[x1],x10

    smlal       v24.4s, v14.4h, v7.h[1]
    smlsl       v26.4s, v14.4h, v5.h[3]
    smlal       v28.4s, v14.4h, v4.h[1]
    smlsl       v30.4s, v14.4h, v2.h[3]

    smlal       v24.4s, v15.4h, v7.h[3]
    smlsl       v26.4s, v15.4h, v7.h[1]
    smlal       v28.4s, v15.4h, v6.h[3]
    smlsl       v30.4s, v15.4h, v6.h[1]

    smlal       v20.4s, v12.4h, v7.h[0]
    smlal       v20.4s, v13.4h, v7.h[2]
    smlsl       v22.4s, v12.4h, v5.h[0]
    smlsl       v22.4s, v13.4h, v6.h[2]
    smlal       v16.4s, v12.4h, v3.h[0]
    smlal       v16.4s, v13.4h, v5.h[2]
    smlsl       v18.4s, v12.4h, v1.h[0]
    smlsl       v18.4s, v13.4h, v4.h[2]
1858
1859stage2_shift1:
1860    add         v8.4s,  v20.4s ,  v24.4s
1861    sub         v10.4s,  v20.4s ,  v24.4s
1862
1863    add         v12.4s,  v22.4s ,  v26.4s
1864    sub         v24.4s,  v22.4s ,  v26.4s
1865
1866    add         v14.4s,  v16.4s ,  v28.4s
1867    sub         v26.4s,  v16.4s ,  v28.4s
1868
1869
1870    add         v16.4s,  v18.4s ,  v30.4s
1871    sub         v28.4s,  v18.4s ,  v30.4s
1872
1873
1874    sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
1875    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
1876    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
1877    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
1878    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
1879    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
1880    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
1881    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
1882
1883
1884    umov        x15,v24.d[0]
1885    umov        x16,v25.d[0]
1886    umov        x19,v26.d[0]
1887    umov        x20,v27.d[0]
1888
1889    trn1        v24.4h, v30.4h, v12.4h
1890    trn2        v25.4h, v30.4h, v12.4h
1891    trn1        v26.4h, v31.4h, v13.4h
1892    trn2        v27.4h, v31.4h, v13.4h
1893
1894    trn1        v30.2s, v24.2s, v26.2s
1895    trn2        v31.2s, v24.2s, v26.2s
1896    trn1        v12.2s, v25.2s, v27.2s
1897    trn2        v13.2s, v25.2s, v27.2s
1898
1899    trn1        v24.4h, v14.4h, v18.4h
1900    trn2        v25.4h, v14.4h, v18.4h
1901    trn1        v26.4h, v15.4h, v19.4h
1902    trn2        v27.4h, v15.4h, v19.4h
1903
1904    trn1        v14.2s, v24.2s, v26.2s
1905    trn2        v15.2s, v24.2s, v26.2s
1906    trn1        v18.2s, v25.2s, v27.2s
1907    trn2        v19.2s, v25.2s, v27.2s
1908
1909    mov         v24.d[0],x15
1910    mov         v25.d[0],x16
1911    mov         v26.d[0],x19
1912    mov         v27.d[0],x20
1913
1914    st1         { v30.4h, v31.4h},[x0],#16
1915    st1         { v12.4h, v13.4h},[x0],#16
1916    st1         { v14.4h, v15.4h},[x0],#16
1917    st1         { v18.4h, v19.4h},[x0],#16
1918
1919    mov         x1,x4
1920
1921
1922
1923
1924
1925
1926    ld1         {v10.4h, v11.4h},[x1],#16
1927    ld1         {v8.4h, v9.4h},[x1],x10
1928
1929
1930    smull       v24.4s, v8.4h, v2.h[1]     //// y1 * cos1(part of b0)
1931    smull       v26.4s, v8.4h, v2.h[3]     //// y1 * cos3(part of b1)
1932    smull       v28.4s, v8.4h, v3.h[1]     //// y1 * sin3(part of b2)
1933    smull       v30.4s, v8.4h, v3.h[3]     //// y1 * sin1(part of b3)
1934
1935    smlal       v24.4s, v9.4h, v6.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
1936    smlsl       v26.4s, v9.4h, v7.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
1937    smlsl       v28.4s, v9.4h, v6.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
1938    smlsl       v30.4s, v9.4h, v4.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
1939
1940
1941
1942
1943
1944    smull       v20.4s, v10.4h, v0.h[0]
1945    smlal       v20.4s, v11.4h, v4.h[2]
1946
1947
1948    smull       v22.4s, v10.4h, v0.h[0]
1949    smlal       v22.4s, v11.4h, v5.h[2]
1950
1951    smull       v16.4s, v10.4h, v0.h[0]
1952    smlal       v16.4s, v11.4h, v6.h[2]
1953
1954    smull       v18.4s, v10.4h, v0.h[0]
1955    smlal       v18.4s, v11.4h, v7.h[2]
1956
1957    cmp         x12,x11
1958    bhs         stage2_shift2
1959
1960    ld1         {v12.4h, v13.4h},[x1],#16
1961    ld1         {v14.4h, v15.4h},[x1],x10
1962
1963
1964    smlsl       v24.4s, v14.4h, v4.h[3]
1965    smlsl       v26.4s, v14.4h, v2.h[1]
1966    smlsl       v28.4s, v14.4h, v0.h[1]
1967    smlsl       v30.4s, v14.4h, v2.h[3]
1968
1969
1970    smlsl       v24.4s, v15.4h, v0.h[3]
1971    smlsl       v26.4s, v15.4h, v3.h[1]
1972    smlsl       v28.4s, v15.4h, v6.h[3]
1973    smlal       v30.4s, v15.4h, v5.h[3]
1974
1975
1976    smlsl       v20.4s, v12.4h, v7.h[0]
1977    smlsl       v20.4s, v13.4h, v2.h[2]
1978    smlsl       v22.4s, v12.4h, v5.h[0]
1979    smlsl       v22.4s, v13.4h, v0.h[2]
1980    smlsl       v16.4s, v12.4h, v3.h[0]
1981    smlsl       v16.4s, v13.4h, v3.h[2]
1982    smlsl       v18.4s, v12.4h, v1.h[0]
1983    smlsl       v18.4s, v13.4h, v6.h[2]
1984
1985    cmp         x12,x5
1986    bhs         stage2_shift2
1987
1988    ld1         {v10.4h, v11.4h},[x1],#16
1989    ld1         {v8.4h, v9.4h},[x1],x10
1990
1991
1992
1993
1994
1995    smlsl       v24.4s, v8.4h, v4.h[1]     //// y1 * cos1(part of b0)
1996    smlal       v26.4s, v8.4h, v7.h[1]     //// y1 * cos3(part of b1)
1997    smlal       v28.4s, v8.4h, v2.h[3]     //// y1 * sin3(part of b2)
1998    smlal       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
1999
2000    smlal       v24.4s, v9.4h, v7.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
2001    smlal       v26.4s, v9.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
2002    smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
2003    smlsl       v30.4s, v9.4h, v6.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
2004
2005
2006
2007
2008
2009    smlsl       v20.4s, v10.4h, v2.h[0]
2010    smlsl       v20.4s, v11.4h, v6.h[2]
2011
2012
2013    smlsl       v22.4s, v10.4h, v6.h[0]
2014    smlal       v22.4s, v11.4h, v4.h[2]
2015
2016    smlal       v16.4s, v10.4h, v6.h[0]
2017    smlal       v16.4s, v11.4h, v0.h[2]
2018
2019    smlal       v18.4s, v10.4h, v2.h[0]
2020    smlal       v18.4s, v11.4h, v5.h[2]
2021
2022    cmp         x12,x6
2023    bhs         stage2_shift2
2024
2025
2026    ld1         {v12.4h, v13.4h},[x1],#16
2027    ld1         {v14.4h, v15.4h},[x1],x10
2028
2029
2030
2031
2032
2033
2034    smlal       v24.4s, v14.4h, v2.h[3]
2035    smlal       v26.4s, v14.4h, v3.h[3]
2036    smlsl       v28.4s, v14.4h, v5.h[3]
2037    smlsl       v30.4s, v14.4h, v0.h[3]
2038
2039
2040    smlal       v24.4s, v15.4h, v1.h[3]
2041    smlsl       v26.4s, v15.4h, v6.h[3]
2042    smlsl       v28.4s, v15.4h, v0.h[3]
2043    smlal       v30.4s, v15.4h, v7.h[3]
2044
2045
2046    smlal       v20.4s, v12.4h, v5.h[0]
2047    smlal       v20.4s, v13.4h, v0.h[2]
2048    smlal       v22.4s, v12.4h, v1.h[0]
2049    smlal       v22.4s, v13.4h, v6.h[2]
2050    smlal       v16.4s, v12.4h, v7.h[0]
2051    smlsl       v16.4s, v13.4h, v2.h[2]
2052    smlsl       v18.4s, v12.4h, v3.h[0]
2053    smlsl       v18.4s, v13.4h, v4.h[2]
2054
2055    cmp         x12,x9
2056    bhs         stage2_shift2
2057
2058
2059    ld1         {v10.4h, v11.4h},[x1],#16
2060    ld1         {v8.4h, v9.4h},[x1],x10
2061
2062
2063
2064    smlal       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
2065    smlsl       v26.4s, v8.4h, v1.h[1]     //// y1 * cos3(part of b1)
2066    smlsl       v28.4s, v8.4h, v7.h[1]     //// y1 * sin3(part of b2)
2067    smlal       v30.4s, v8.4h, v0.h[3]     //// y1 * sin1(part of b3)
2068
2069    smlsl       v24.4s, v9.4h, v5.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
2070    smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
2071    smlal       v28.4s, v9.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
2072    smlal       v30.4s, v9.4h, v7.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
2073
2074
2075
2076
2077
2078    smlal       v20.4s, v10.4h, v0.h[0]
2079    smlsl       v20.4s, v11.4h, v7.h[2]
2080
2081
2082    smlsl       v22.4s, v10.4h, v0.h[0]
2083    smlsl       v22.4s, v11.4h, v1.h[2]
2084
2085    smlsl       v16.4s, v10.4h, v0.h[0]
2086    smlal       v16.4s, v11.4h, v5.h[2]
2087
2088    smlal       v18.4s, v10.4h, v0.h[0]
2089    smlal       v18.4s, v11.4h, v3.h[2]
2090
2091    ld1         {v12.4h, v13.4h},[x1],#16
2092    ld1         {v14.4h, v15.4h},[x1],x10
2093
2094
2095
2096
2097    smlsl       v24.4s, v14.4h, v0.h[1]
2098    smlal       v26.4s, v14.4h, v6.h[1]
2099    smlal       v28.4s, v14.4h, v4.h[1]
2100    smlsl       v30.4s, v14.4h, v1.h[1]
2101
2102
2103    smlsl       v24.4s, v15.4h, v3.h[3]
2104    smlal       v26.4s, v15.4h, v0.h[1]
2105    smlsl       v28.4s, v15.4h, v5.h[1]
2106    smlsl       v30.4s, v15.4h, v6.h[1]
2107
2108
2109    smlsl       v20.4s, v12.4h, v3.h[0]
2110    smlsl       v20.4s, v13.4h, v1.h[2]
2111    smlsl       v22.4s, v12.4h, v7.h[0]
2112    smlal       v22.4s, v13.4h, v3.h[2]
2113    smlal       v16.4s, v12.4h, v1.h[0]
2114    smlal       v16.4s, v13.4h, v7.h[2]
2115    smlsl       v18.4s, v12.4h, v5.h[0]
2116    smlsl       v18.4s, v13.4h, v2.h[2]
2117
2118
2119    ld1         {v10.4h, v11.4h},[x1],#16
2120    ld1         {v8.4h, v9.4h},[x1],x10
2121
2122
2123    smlal       v24.4s, v8.4h, v7.h[3]     //// y1 * cos1(part of b0)
2124    smlal       v26.4s, v8.4h, v4.h[3]     //// y1 * cos3(part of b1)
2125    smlsl       v28.4s, v8.4h, v1.h[1]     //// y1 * sin3(part of b2)
2126    smlal       v30.4s, v8.4h, v2.h[1]     //// y1 * sin1(part of b3)
2127
2128    smlal       v24.4s, v9.4h, v3.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
2129    smlsl       v26.4s, v9.4h, v5.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
2130    smlsl       v28.4s, v9.4h, v7.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
2131    smlal       v30.4s, v9.4h, v5.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
2132
2133
2134
2135
2136
2137    smlsl       v20.4s, v10.4h, v6.h[0]
2138    smlal       v20.4s, v11.4h, v5.h[2]
2139
2140
2141    smlal       v22.4s, v10.4h, v2.h[0]
2142    smlal       v22.4s, v11.4h, v7.h[2]
2143
2144    smlsl       v16.4s, v10.4h, v2.h[0]
2145    smlsl       v16.4s, v11.4h, v4.h[2]
2146
2147    smlal       v18.4s, v10.4h, v6.h[0]
2148    smlal       v18.4s, v11.4h, v1.h[2]
2149
2150
2151    ld1         {v12.4h, v13.4h},[x1],#16
2152    ld1         {v14.4h, v15.4h},[x1],x10
2153
2154
2155
2156    smlal       v24.4s, v14.4h, v1.h[1]
2157    smlsl       v26.4s, v14.4h, v0.h[3]
2158    smlal       v28.4s, v14.4h, v1.h[3]
2159    smlsl       v30.4s, v14.4h, v3.h[1]
2160
2161
2162    smlal       v24.4s, v15.4h, v5.h[3]
2163    smlsl       v26.4s, v15.4h, v5.h[1]
2164    smlal       v28.4s, v15.4h, v4.h[3]
2165    smlsl       v30.4s, v15.4h, v4.h[1]
2166
2167
2168    smlal       v20.4s, v12.4h, v1.h[0]
2169    smlal       v20.4s, v13.4h, v3.h[2]
2170    smlsl       v22.4s, v12.4h, v3.h[0]
2171    smlsl       v22.4s, v13.4h, v2.h[2]
2172    smlal       v16.4s, v12.4h, v5.h[0]
2173    smlal       v16.4s, v13.4h, v1.h[2]
2174    smlsl       v18.4s, v12.4h, v7.h[0]
2175    smlsl       v18.4s, v13.4h, v0.h[2]
2176
2177stage2_shift2:
// Combine the even-row sums a0..a3 (v20, v22, v16, v18) with the odd-row
// sums b0..b3 (v24, v26, v28, v30) built by the multiply-accumulate code
// that reaches this label: ai + bi and ai - bi for i = 0..3.
// v24, v26, v16 and v28 are reused as destinations as soon as their old
// value has been consumed, so the order of these add/sub pairs matters.
2178    add         v8.4s,  v20.4s ,  v24.4s
2179    sub         v10.4s,  v20.4s ,  v24.4s
2180
2181    add         v12.4s,  v22.4s ,  v26.4s
2182    sub         v24.4s,  v22.4s ,  v26.4s
2183
2184    add         v14.4s,  v16.4s ,  v28.4s
2185    sub         v26.4s,  v16.4s ,  v28.4s
2186
2187
2188    add         v16.4s,  v18.4s ,  v30.4s
2189    sub         v28.4s,  v18.4s ,  v30.4s
2190
2191
// Narrow each 32-bit sum to 16 bit: rounding shift right by
// shift_stage2_idct with signed saturation.
2192    sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// a0 + b0
2193    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// a0 - b0
2194    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// a2 + b2
2195    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// a2 - b2
2196    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// a1 + b1
2197    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// a1 - b1
2198    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// a3 + b3
2199    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// a3 - b3
2200
// v24..v27 serve as scratch for the transposes below, so their low 64 bits
// are parked in general registers first and put back afterwards (the upper
// 64 bits are not preserved by this save/restore).
// NOTE(review): x19/x20 are callee-saved under AAPCS64; presumably the
// routine prologue (outside this section) preserves them - confirm.
2201    umov        x15,v24.d[0]
2202    umov        x16,v25.d[0]
2203    umov        x19,v26.d[0]
2204    umov        x20,v27.d[0]
2205
// 4x4 transpose of the 16-bit lanes of v30, v12, v31, v13 (the four "sum"
// results); the transposed rows are left in v30, v12, v31, v13.
2206    trn1        v24.4h, v30.4h, v12.4h
2207    trn2        v25.4h, v30.4h, v12.4h
2208    trn1        v26.4h, v31.4h, v13.4h
2209    trn2        v27.4h, v31.4h, v13.4h
2210
2211    trn1        v30.2s, v24.2s, v26.2s
2212    trn2        v31.2s, v24.2s, v26.2s
2213    trn1        v12.2s, v25.2s, v27.2s
2214    trn2        v13.2s, v25.2s, v27.2s
2215
// Same 4x4 transpose for v14, v18, v15, v19 (the four "difference"
// results); the transposed rows are left in v14, v18, v15, v19.
2216    trn1        v24.4h, v14.4h, v18.4h
2217    trn2        v25.4h, v14.4h, v18.4h
2218    trn1        v26.4h, v15.4h, v19.4h
2219    trn2        v27.4h, v15.4h, v19.4h
2220
2221    trn1        v14.2s, v24.2s, v26.2s
2222    trn2        v15.2s, v24.2s, v26.2s
2223    trn1        v18.2s, v25.2s, v27.2s
2224    trn2        v19.2s, v25.2s, v27.2s
2225
// Put back the low halves of the scratch registers.
2226    mov         v24.d[0],x15
2227    mov         v25.d[0],x16
2228    mov         v26.d[0],x19
2229    mov         v27.d[0],x20
2230
// Store the eight transposed 4x16-bit vectors; x0 advances by 64 bytes.
2231    st1         { v30.4h, v31.4h},[x0],#16
2232    st1         { v12.4h, v13.4h},[x0],#16
2233    st1         { v14.4h, v15.4h},[x0],#16
2234    st1         { v18.4h, v19.4h},[x0],#16
2235
2236
// Start the next group of outputs: rewind the input pointer to x4 and seed
// all eight 32-bit accumulators from the first four input vectors.
//   v8, v9   -> odd-row sums  b0..b3 in v24, v26, v28, v30
//   v10, v11 -> even-row sums a0..a3 in v20, v22, v16, v18
// Coefficients are taken by lane from v0..v7. In this routine the odd-row
// inputs (v8, v9, v14, v15) use lanes 1 and 3, v10/v12 use lane 0 and
// v11/v13 use lane 2.
2237    mov         x1,x4
2238
2239
2240
2241
// Two post-indexed loads: +16 bytes, then +x10 to reach the next set.
2242    ld1         {v10.4h, v11.4h},[x1],#16
2243    ld1         {v8.4h, v9.4h},[x1],x10
2244
2245    smull       v24.4s, v8.4h, v4.h[1]     //// b0 seeded from v8
2246    smull       v26.4s, v8.4h, v4.h[3]     //// b1 seeded from v8
2247    smull       v28.4s, v8.4h, v5.h[1]     //// b2 seeded from v8
2248    smull       v30.4s, v8.4h, v5.h[3]     //// b3 seeded from v8
2249
2250    smlsl       v24.4s, v9.4h, v3.h[1]     //// part of b0
2251    smlsl       v26.4s, v9.4h, v1.h[3]     //// part of b1
2252    smlsl       v28.4s, v9.4h, v0.h[1]     //// part of b2 (odd lane, like every other v9 product)
2253    smlsl       v30.4s, v9.4h, v1.h[1]     //// part of b3
2254
2255
2256
2257
2258
// Every even-row sum starts from v10 * v0.h[0]; v11 then contributes with
// a different lane-2 coefficient per sum.
2259    smull       v20.4s, v10.4h, v0.h[0]
2260    smlsl       v20.4s, v11.4h, v7.h[2]
2261
2262
2263    smull       v22.4s, v10.4h, v0.h[0]
2264    smlsl       v22.4s, v11.4h, v6.h[2]
2265
2266    smull       v16.4s, v10.4h, v0.h[0]
2267    smlsl       v16.4s, v11.4h, v5.h[2]
2268
2269    smull       v18.4s, v10.4h, v0.h[0]
2270    smlsl       v18.4s, v11.4h, v4.h[2]
2271
// Remaining multiply-accumulate steps for the output group whose sums are
// finished at stage2_shift3. Each step loads four more input vectors and
// adds or subtracts their products with per-lane coefficients from v0..v7:
//   v8, v9, v14, v15 -> odd-row sums  b0..b3 in v24, v26, v28, v30
//   v10..v13         -> even-row sums a0..a3 in v20, v22, v16, v18
// The first four steps are guarded: when x12 >= the compared register
// (unsigned) the rest of the input is skipped and the sums built so far go
// straight to stage2_shift3.
// NOTE(review): x12, x11, x5, x6 and x9 are set outside this section;
// presumably they describe how many input rows are non-zero - confirm.
2272    cmp         x12,x11
2273    bhs         stage2_shift3
2274
2275    ld1         {v12.4h, v13.4h},[x1],#16
2276    ld1         {v14.4h, v15.4h},[x1],x10
2277
2278    smlsl       v24.4s, v14.4h, v5.h[1]
2279    smlsl       v26.4s, v14.4h, v7.h[3]
2280    smlal       v28.4s, v14.4h, v5.h[3]
2281    smlal       v30.4s, v14.4h, v3.h[1]
2282
2283
2284    smlal       v24.4s, v15.4h, v2.h[1]
2285    smlal       v26.4s, v15.4h, v1.h[1]
2286    smlal       v28.4s, v15.4h, v4.h[3]
2287    smlsl       v30.4s, v15.4h, v7.h[3]
2288
2289
2290    smlsl       v20.4s, v12.4h, v1.h[0]
2291    smlal       v20.4s, v13.4h, v6.h[2]
2292    smlsl       v22.4s, v12.4h, v3.h[0]
2293    smlal       v22.4s, v13.4h, v3.h[2]
2294    smlsl       v16.4s, v12.4h, v5.h[0]
2295    smlal       v16.4s, v13.4h, v0.h[2]
2296    smlsl       v18.4s, v12.4h, v7.h[0]
2297    smlal       v18.4s, v13.4h, v2.h[2]
2298
2299    cmp         x12,x5
2300    bhs         stage2_shift3
2301
2302    ld1         {v10.4h, v11.4h},[x1],#16
2303    ld1         {v8.4h, v9.4h},[x1],x10
2304
2305
2306
2307    smlal       v24.4s, v8.4h, v6.h[1]     //// part of b0
2308    smlsl       v26.4s, v8.4h, v5.h[1]     //// part of b1
2309    smlsl       v28.4s, v8.4h, v0.h[3]     //// part of b2
2310    smlsl       v30.4s, v8.4h, v3.h[3]     //// part of b3
2311
2312    smlsl       v24.4s, v9.4h, v1.h[1]     //// part of b0
2313    smlsl       v26.4s, v9.4h, v4.h[1]     //// part of b1
2314    smlal       v28.4s, v9.4h, v6.h[1]     //// part of b2
2315    smlal       v30.4s, v9.4h, v0.h[1]     //// part of b3
2316
2317
2318
2319
2320
2321    smlal       v20.4s, v10.4h, v2.h[0]
2322    smlsl       v20.4s, v11.4h, v5.h[2]
2323
2324
2325    smlal       v22.4s, v10.4h, v6.h[0]
2326    smlsl       v22.4s, v11.4h, v0.h[2]
2327
2328    smlsl       v16.4s, v10.4h, v6.h[0]
2329    smlsl       v16.4s, v11.4h, v4.h[2]
2330
2331    smlsl       v18.4s, v10.4h, v2.h[0]
2332    smlal       v18.4s, v11.4h, v6.h[2]
2333
2334    cmp         x12,x6
2335    bhs         stage2_shift3
2336
2337    ld1         {v12.4h, v13.4h},[x1],#16
2338    ld1         {v14.4h, v15.4h},[x1],x10
2339
2340
2341
2342
2343
2344    smlsl       v24.4s, v14.4h, v7.h[1]
2345    smlal       v26.4s, v14.4h, v2.h[1]
2346    smlal       v28.4s, v14.4h, v4.h[1]
2347    smlsl       v30.4s, v14.4h, v5.h[1]
2348
2349
2350    smlal       v24.4s, v15.4h, v0.h[3]
2351    smlal       v26.4s, v15.4h, v7.h[1]
2352    smlsl       v28.4s, v15.4h, v1.h[1]
2353    smlsl       v30.4s, v15.4h, v6.h[1]
2354
2355
2356    smlsl       v20.4s, v12.4h, v3.h[0]
2357    smlal       v20.4s, v13.4h, v4.h[2]
2358    smlal       v22.4s, v12.4h, v7.h[0]
2359    smlal       v22.4s, v13.4h, v2.h[2]
2360    smlal       v16.4s, v12.4h, v1.h[0]
2361    smlsl       v16.4s, v13.4h, v6.h[2]
2362    smlal       v18.4s, v12.4h, v5.h[0]
2363    smlsl       v18.4s, v13.4h, v0.h[2]
2364
// Last guarded exit; the four steps after it always run together.
2365    cmp         x12,x9
2366    bhs         stage2_shift3
2367
2368
2369    ld1         {v10.4h, v11.4h},[x1],#16
2370    ld1         {v8.4h, v9.4h},[x1],x10
2371
2372
2373    smlsl       v24.4s, v8.4h, v7.h[3]     //// part of b0
2374    smlsl       v26.4s, v8.4h, v0.h[1]     //// part of b1
2375    smlal       v28.4s, v8.4h, v6.h[3]     //// part of b2
2376    smlal       v30.4s, v8.4h, v1.h[3]     //// part of b3
2377
2378    smlsl       v24.4s, v9.4h, v0.h[1]     //// part of b0
2379    smlal       v26.4s, v9.4h, v5.h[3]     //// part of b1
2380    smlal       v28.4s, v9.4h, v3.h[3]     //// part of b2
2381    smlsl       v30.4s, v9.4h, v2.h[3]     //// part of b3
2382
2383
2384
2385
2386
2387    smlal       v20.4s, v10.4h, v0.h[0]
2388    smlsl       v20.4s, v11.4h, v3.h[2]
2389
2390
2391    smlsl       v22.4s, v10.4h, v0.h[0]
2392    smlsl       v22.4s, v11.4h, v5.h[2]
2393
2394    smlsl       v16.4s, v10.4h, v0.h[0]
2395    smlal       v16.4s, v11.4h, v1.h[2]
2396
2397    smlal       v18.4s, v10.4h, v0.h[0]
2398    smlal       v18.4s, v11.4h, v7.h[2]
2399
2400    ld1         {v12.4h, v13.4h},[x1],#16
2401    ld1         {v14.4h, v15.4h},[x1],x10
2402
2403
2404
2405
2406    smlal       v24.4s, v14.4h, v6.h[3]
2407    smlal       v26.4s, v14.4h, v3.h[3]
2408    smlsl       v28.4s, v14.4h, v1.h[3]
2409    smlal       v30.4s, v14.4h, v7.h[1]
2410
2411
2412    smlal       v24.4s, v15.4h, v1.h[3]
2413    smlsl       v26.4s, v15.4h, v2.h[3]
2414    smlal       v28.4s, v15.4h, v7.h[1]
2415    smlal       v30.4s, v15.4h, v4.h[1]
2416
2417
2418    smlsl       v20.4s, v12.4h, v5.h[0]
2419    smlal       v20.4s, v13.4h, v2.h[2]
2420    smlal       v22.4s, v12.4h, v1.h[0]
2421    smlsl       v22.4s, v13.4h, v7.h[2]
2422    smlsl       v16.4s, v12.4h, v7.h[0]
2423    smlsl       v16.4s, v13.4h, v3.h[2]
2424    smlsl       v18.4s, v12.4h, v3.h[0]
2425    smlal       v18.4s, v13.4h, v1.h[2]
2426
2427
2428    ld1         {v10.4h, v11.4h},[x1],#16
2429    ld1         {v8.4h, v9.4h},[x1],x10
2430
2431
2432    smlsl       v24.4s, v8.4h, v5.h[3]     //// part of b0
2433    smlsl       v26.4s, v8.4h, v6.h[3]     //// part of b1
2434    smlal       v28.4s, v8.4h, v3.h[1]     //// part of b2
2435    smlsl       v30.4s, v8.4h, v0.h[1]     //// part of b3
2436
2437    smlsl       v24.4s, v9.4h, v2.h[3]     //// part of b0
2438    smlal       v26.4s, v9.4h, v0.h[1]     //// part of b1
2439    smlsl       v28.4s, v9.4h, v2.h[1]     //// part of b2
2440    smlal       v30.4s, v9.4h, v4.h[3]     //// part of b3
2441
2442
2443
2444
2445
2446    smlal       v20.4s, v10.4h, v6.h[0]
2447    smlsl       v20.4s, v11.4h, v1.h[2]
2448
2449
2450    smlsl       v22.4s, v10.4h, v2.h[0]
2451    smlal       v22.4s, v11.4h, v4.h[2]
2452
2453    smlal       v16.4s, v10.4h, v2.h[0]
2454    smlsl       v16.4s, v11.4h, v7.h[2]
2455
2456    smlsl       v18.4s, v10.4h, v6.h[0]
2457    smlsl       v18.4s, v11.4h, v5.h[2]
2458
2459    ld1         {v12.4h, v13.4h},[x1],#16
2460    ld1         {v14.4h, v15.4h},[x1],x10
2461
2462
2463
2464    smlal       v24.4s, v14.4h, v4.h[3]
2465    smlsl       v26.4s, v14.4h, v6.h[1]
2466    smlal       v28.4s, v14.4h, v7.h[3]
2467    smlal       v30.4s, v14.4h, v6.h[3]
2468
2469
2470    smlal       v24.4s, v15.4h, v3.h[3]
2471    smlsl       v26.4s, v15.4h, v3.h[1]
2472    smlal       v28.4s, v15.4h, v2.h[3]
2473    smlsl       v30.4s, v15.4h, v2.h[1]
2474
2475
2476    smlsl       v20.4s, v12.4h, v7.h[0]
2477    smlal       v20.4s, v13.4h, v0.h[2]
2478    smlal       v22.4s, v12.4h, v5.h[0]
2479    smlsl       v22.4s, v13.4h, v1.h[2]
2480    smlsl       v16.4s, v12.4h, v3.h[0]
2481    smlal       v16.4s, v13.4h, v2.h[2]
2482    smlal       v18.4s, v12.4h, v1.h[0]
2483    smlsl       v18.4s, v13.4h, v3.h[2]
2484
2485stage2_shift3:
// Combine the even-row sums a0..a3 (v20, v22, v16, v18) with the odd-row
// sums b0..b3 (v24, v26, v28, v30) built by the multiply-accumulate code
// that reaches this label: ai + bi and ai - bi for i = 0..3.
// v24, v26, v16 and v28 are reused as destinations as soon as their old
// value has been consumed, so the order of these add/sub pairs matters.
2486    add         v8.4s,  v20.4s ,  v24.4s
2487    sub         v10.4s,  v20.4s ,  v24.4s
2488
2489    add         v12.4s,  v22.4s ,  v26.4s
2490    sub         v24.4s,  v22.4s ,  v26.4s
2491
2492    add         v14.4s,  v16.4s ,  v28.4s
2493    sub         v26.4s,  v16.4s ,  v28.4s
2494
2495
2496    add         v16.4s,  v18.4s ,  v30.4s
2497    sub         v28.4s,  v18.4s ,  v30.4s
2498
2499
// Narrow each 32-bit sum to 16 bit: rounding shift right by
// shift_stage2_idct with signed saturation.
2500    sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// a0 + b0
2501    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// a0 - b0
2502    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// a2 + b2
2503    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// a2 - b2
2504    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// a1 + b1
2505    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// a1 - b1
2506    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// a3 + b3
2507    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// a3 - b3
2508
// v24..v27 serve as scratch for the transposes below, so their low 64 bits
// are parked in general registers first and put back afterwards (the upper
// 64 bits are not preserved by this save/restore).
// NOTE(review): x19/x20 are callee-saved under AAPCS64; presumably the
// routine prologue (outside this section) preserves them - confirm.
2509    umov        x15,v24.d[0]
2510    umov        x16,v25.d[0]
2511    umov        x19,v26.d[0]
2512    umov        x20,v27.d[0]
2513
// 4x4 transpose of the 16-bit lanes of v30, v12, v31, v13 (the four "sum"
// results); the transposed rows are left in v30, v12, v31, v13.
2514    trn1        v24.4h, v30.4h, v12.4h
2515    trn2        v25.4h, v30.4h, v12.4h
2516    trn1        v26.4h, v31.4h, v13.4h
2517    trn2        v27.4h, v31.4h, v13.4h
2518
2519    trn1        v30.2s, v24.2s, v26.2s
2520    trn2        v31.2s, v24.2s, v26.2s
2521    trn1        v12.2s, v25.2s, v27.2s
2522    trn2        v13.2s, v25.2s, v27.2s
2523
// Same 4x4 transpose for v14, v18, v15, v19 (the four "difference"
// results); the transposed rows are left in v14, v18, v15, v19.
2524    trn1        v24.4h, v14.4h, v18.4h
2525    trn2        v25.4h, v14.4h, v18.4h
2526    trn1        v26.4h, v15.4h, v19.4h
2527    trn2        v27.4h, v15.4h, v19.4h
2528
2529    trn1        v14.2s, v24.2s, v26.2s
2530    trn2        v15.2s, v24.2s, v26.2s
2531    trn1        v18.2s, v25.2s, v27.2s
2532    trn2        v19.2s, v25.2s, v27.2s
2533
// Put back the low halves of the scratch registers.
2534    mov         v24.d[0],x15
2535    mov         v25.d[0],x16
2536    mov         v26.d[0],x19
2537    mov         v27.d[0],x20
2538
// Store the eight transposed 4x16-bit vectors; x0 advances by 64 bytes.
2539    st1         { v30.4h, v31.4h},[x0],#16
2540    st1         { v12.4h, v13.4h},[x0],#16
2541    st1         { v14.4h, v15.4h},[x0],#16
2542    st1         { v18.4h, v19.4h},[x0],#16
2543
2544
2545
// Multiply-accumulate pass for the output group whose sums are finished at
// stage2_shift4. The input pointer is rewound to x4, then each step loads
// four input vectors (two post-indexed loads: +16 bytes, then +x10) and
// adds or subtracts their products with per-lane coefficients from v0..v7:
//   v8, v9, v14, v15 -> odd-row sums  b0..b3 in v24, v26, v28, v30
//   v10..v13         -> even-row sums a0..a3 in v20, v22, v16, v18
// The first four steps are followed by a guard: when x12 >= the compared
// register (unsigned) the rest of the input is skipped and the sums built
// so far go straight to stage2_shift4.
// NOTE(review): x12, x11, x5, x6 and x9 are set outside this section;
// presumably they describe how many input rows are non-zero - confirm.
2546    mov         x1,x4
2547
2548
2549
2550
2551    ld1         {v10.4h, v11.4h},[x1],#16
2552    ld1         {v8.4h, v9.4h},[x1],x10
2553
2554
2555    smull       v24.4s, v8.4h, v6.h[1]     //// b0 seeded from v8
2556    smull       v26.4s, v8.4h, v6.h[3]     //// b1 seeded from v8
2557    smull       v28.4s, v8.4h, v7.h[1]     //// b2 seeded from v8
2558    smull       v30.4s, v8.4h, v7.h[3]     //// b3 seeded from v8
2559
2560    smlsl       v24.4s, v9.4h, v2.h[3]     //// part of b0
2561    smlsl       v26.4s, v9.4h, v4.h[1]     //// part of b1
2562    smlsl       v28.4s, v9.4h, v5.h[3]     //// part of b2
2563    smlsl       v30.4s, v9.4h, v7.h[1]     //// part of b3
2564
2565
2566
2567
2568
// Every even-row sum starts from v10 * v0.h[0].
2569    smull       v20.4s, v10.4h, v0.h[0]
2570    smlsl       v20.4s, v11.4h, v3.h[2]
2571
2572
2573    smull       v22.4s, v10.4h, v0.h[0]
2574    smlsl       v22.4s, v11.4h, v2.h[2]
2575
2576    smull       v16.4s, v10.4h, v0.h[0]
2577    smlsl       v16.4s, v11.4h, v1.h[2]
2578
2579    smull       v18.4s, v10.4h, v0.h[0]
2580    smlsl       v18.4s, v11.4h, v0.h[2]
2581
2582    cmp         x12,x11
2583    bhs         stage2_shift4
2584    ld1         {v12.4h, v13.4h},[x1],#16
2585    ld1         {v14.4h, v15.4h},[x1],x10
2586
2587
2588
2589
2590
2591
2592    smlal       v24.4s, v14.4h, v0.h[1]
2593    smlal       v26.4s, v14.4h, v1.h[3]
2594    smlal       v28.4s, v14.4h, v4.h[1]
2595    smlal       v30.4s, v14.4h, v6.h[3]
2596
2597
2598    smlsl       v24.4s, v15.4h, v4.h[1]
2599    smlsl       v26.4s, v15.4h, v0.h[3]
2600    smlsl       v28.4s, v15.4h, v2.h[3]
2601    smlsl       v30.4s, v15.4h, v6.h[1]
2602
2603
2604    smlal       v20.4s, v12.4h, v7.h[0]
2605    smlal       v20.4s, v13.4h, v5.h[2]
2606    smlal       v22.4s, v12.4h, v5.h[0]
2607    smlsl       v22.4s, v13.4h, v7.h[2]
2608    smlal       v16.4s, v12.4h, v3.h[0]
2609    smlsl       v16.4s, v13.4h, v4.h[2]
2610    smlal       v18.4s, v12.4h, v1.h[0]
2611    smlsl       v18.4s, v13.4h, v1.h[2]
2612
2613    cmp         x12,x5
2614    bhs         stage2_shift4
2615
2616    ld1         {v10.4h, v11.4h},[x1],#16
2617    ld1         {v8.4h, v9.4h},[x1],x10
2618
2619
2620
2621    smlal       v24.4s, v8.4h, v7.h[3]     //// part of b0
2622    smlal       v26.4s, v8.4h, v3.h[1]     //// part of b1
2623    smlal       v28.4s, v8.4h, v1.h[1]     //// part of b2
2624    smlal       v30.4s, v8.4h, v5.h[3]     //// part of b3
2625
2626    smlal       v24.4s, v9.4h, v4.h[3]     //// part of b0
2627    smlsl       v26.4s, v9.4h, v5.h[3]     //// part of b1
2628    smlsl       v28.4s, v9.4h, v0.h[1]     //// part of b2
2629    smlsl       v30.4s, v9.4h, v5.h[1]     //// part of b3
2630
2631
2632
2633
2634
2635    smlsl       v20.4s, v10.4h, v2.h[0]
2636    smlal       v20.4s, v11.4h, v1.h[2]
2637
2638
2639    smlsl       v22.4s, v10.4h, v6.h[0]
2640    smlal       v22.4s, v11.4h, v3.h[2]
2641
2642    smlal       v16.4s, v10.4h, v6.h[0]
2643    smlsl       v16.4s, v11.4h, v7.h[2]
2644
2645    smlal       v18.4s, v10.4h, v2.h[0]
2646    smlsl       v18.4s, v11.4h, v2.h[2]
2647
2648    cmp         x12,x6
2649    bhs         stage2_shift4
2650
2651
2652    ld1         {v12.4h, v13.4h},[x1],#16
2653    ld1         {v14.4h, v15.4h},[x1],x10
2654
2655
2656
2657
2658
2659
2660    smlsl       v24.4s, v14.4h, v1.h[1]
2661    smlsl       v26.4s, v14.4h, v7.h[3]
2662    smlal       v28.4s, v14.4h, v1.h[3]
2663    smlal       v30.4s, v14.4h, v4.h[3]
2664
2665
2666    smlal       v24.4s, v15.4h, v2.h[1]
2667    smlal       v26.4s, v15.4h, v5.h[1]
2668    smlsl       v28.4s, v15.4h, v3.h[1]
2669    smlsl       v30.4s, v15.4h, v4.h[1]
2670
2671
2672    smlsl       v20.4s, v12.4h, v5.h[0]
2673    smlsl       v20.4s, v13.4h, v7.h[2]
2674    smlsl       v22.4s, v12.4h, v1.h[0]
2675    smlal       v22.4s, v13.4h, v1.h[2]
2676    smlsl       v16.4s, v12.4h, v7.h[0]
2677    smlal       v16.4s, v13.4h, v5.h[2]
2678    smlal       v18.4s, v12.4h, v3.h[0]
2679    smlsl       v18.4s, v13.4h, v3.h[2]
2680
// Last guarded exit; the four steps after it always run together.
2681    cmp         x12,x9
2682    bhs         stage2_shift4
2683
2684
2685    ld1         {v10.4h, v11.4h},[x1],#16
2686    ld1         {v8.4h, v9.4h},[x1],x10
2687
2688
2689    smlsl       v24.4s, v8.4h, v5.h[3]     //// part of b0
2690    smlsl       v26.4s, v8.4h, v2.h[3]     //// part of b1
2691    smlal       v28.4s, v8.4h, v4.h[3]     //// part of b2
2692    smlal       v30.4s, v8.4h, v3.h[3]     //// part of b3
2693
2694    smlsl       v24.4s, v9.4h, v6.h[3]     //// part of b0
2695    smlal       v26.4s, v9.4h, v0.h[3]     //// part of b1
2696    smlsl       v28.4s, v9.4h, v6.h[1]     //// part of b2
2697    smlsl       v30.4s, v9.4h, v3.h[1]     //// part of b3
2698
2699
2700
2701
2702
2703    smlal       v20.4s, v10.4h, v0.h[0]
2704    smlsl       v20.4s, v11.4h, v0.h[2]
2705
2706
2707    smlsl       v22.4s, v10.4h, v0.h[0]
2708    smlal       v22.4s, v11.4h, v6.h[2]
2709
2710    smlsl       v16.4s, v10.4h, v0.h[0]
2711    smlal       v16.4s, v11.4h, v2.h[2]
2712
2713    smlal       v18.4s, v10.4h, v0.h[0]
2714    smlsl       v18.4s, v11.4h, v4.h[2]
2715
2716    ld1         {v12.4h, v13.4h},[x1],#16
2717    ld1         {v14.4h, v15.4h},[x1],x10
2718
2719
2720
2721
2722    smlal       v24.4s, v14.4h, v3.h[1]
2723    smlsl       v26.4s, v14.4h, v2.h[1]
2724    smlal       v28.4s, v14.4h, v7.h[3]
2725    smlal       v30.4s, v14.4h, v2.h[3]
2726
2727
2728    smlsl       v24.4s, v15.4h, v0.h[3]
2729    smlal       v26.4s, v15.4h, v4.h[3]
2730    smlal       v28.4s, v15.4h, v6.h[3]
2731    smlsl       v30.4s, v15.4h, v2.h[1]
2732
2733
2734    smlal       v20.4s, v12.4h, v3.h[0]
2735    smlsl       v20.4s, v13.4h, v6.h[2]
2736    smlal       v22.4s, v12.4h, v7.h[0]
2737    smlsl       v22.4s, v13.4h, v4.h[2]
2738    smlsl       v16.4s, v12.4h, v1.h[0]
2739    smlal       v16.4s, v13.4h, v0.h[2]
2740    smlal       v18.4s, v12.4h, v5.h[0]
2741    smlsl       v18.4s, v13.4h, v5.h[2]
2742
2743
2744    ld1         {v10.4h, v11.4h},[x1],#16
2745    ld1         {v8.4h, v9.4h},[x1],x10
2746
2747
2748
2749
2750    smlal       v24.4s, v8.4h, v3.h[3]     //// part of b0
2751    smlsl       v26.4s, v8.4h, v7.h[1]     //// part of b1
2752    smlsl       v28.4s, v8.4h, v5.h[1]     //// part of b2
2753    smlal       v30.4s, v8.4h, v1.h[3]     //// part of b3
2754
2755    smlsl       v24.4s, v9.4h, v7.h[1]     //// part of b0
2756    smlsl       v26.4s, v9.4h, v6.h[1]     //// part of b1
2757    smlal       v28.4s, v9.4h, v3.h[3]     //// part of b2
2758    smlsl       v30.4s, v9.4h, v1.h[1]     //// part of b3
2759
2760
2761
2762
2763
2764    smlsl       v20.4s, v10.4h, v6.h[0]
2765    smlal       v20.4s, v11.4h, v2.h[2]
2766
2767
2768    smlal       v22.4s, v10.4h, v2.h[0]
2769    smlsl       v22.4s, v11.4h, v0.h[2]
2770
2771    smlsl       v16.4s, v10.4h, v2.h[0]
2772    smlal       v16.4s, v11.4h, v3.h[2]
2773
2774    smlal       v18.4s, v10.4h, v6.h[0]
2775    smlsl       v18.4s, v11.4h, v6.h[2]
2776
2777
2778    ld1         {v12.4h, v13.4h},[x1],#16
2779    ld1         {v14.4h, v15.4h},[x1],x10
2780
2781
2782
2783    smlsl       v24.4s, v14.4h, v5.h[1]
2784    smlal       v26.4s, v14.4h, v3.h[3]
2785    smlsl       v28.4s, v14.4h, v2.h[1]
2786    smlal       v30.4s, v14.4h, v0.h[3]
2787
2788
2789    smlal       v24.4s, v15.4h, v1.h[3]
2790    smlsl       v26.4s, v15.4h, v1.h[1]
2791    smlal       v28.4s, v15.4h, v0.h[3]
2792    smlsl       v30.4s, v15.4h, v0.h[1]
2793
2794
2795    smlsl       v20.4s, v12.4h, v1.h[0]
2796    smlal       v20.4s, v13.4h, v4.h[2]
2797    smlal       v22.4s, v12.4h, v3.h[0]
2798    smlsl       v22.4s, v13.4h, v5.h[2]
2799    smlsl       v16.4s, v12.4h, v5.h[0]
2800    smlal       v16.4s, v13.4h, v6.h[2]
2801    smlal       v18.4s, v12.4h, v7.h[0]
2802    smlsl       v18.4s, v13.4h, v7.h[2]
2803
2804stage2_shift4:
// Combine the even-row sums a0..a3 (v20, v22, v16, v18) with the odd-row
// sums b0..b3 (v24, v26, v28, v30) built by the multiply-accumulate code
// that reaches this label: ai + bi and ai - bi for i = 0..3.
// v24, v26, v16 and v28 are reused as destinations as soon as their old
// value has been consumed, so the order of these add/sub pairs matters.
2805    add         v8.4s,  v20.4s ,  v24.4s
2806    sub         v10.4s,  v20.4s ,  v24.4s
2807
2808    add         v12.4s,  v22.4s ,  v26.4s
2809    sub         v24.4s,  v22.4s ,  v26.4s
2810
2811    add         v14.4s,  v16.4s ,  v28.4s
2812    sub         v26.4s,  v16.4s ,  v28.4s
2813
2814
2815    add         v16.4s,  v18.4s ,  v30.4s
2816    sub         v28.4s,  v18.4s ,  v30.4s
2817
2818
// Narrow each 32-bit sum to 16 bit: rounding shift right by
// shift_stage2_idct with signed saturation.
2819    sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// a0 + b0
2820    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// a0 - b0
2821    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// a2 + b2
2822    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// a2 - b2
2823    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// a1 + b1
2824    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// a1 - b1
2825    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// a3 + b3
2826    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// a3 - b3
2827
2828
2829
// v24..v27 serve as scratch for the transposes below, so their low 64 bits
// are parked in general registers first and put back afterwards (the upper
// 64 bits are not preserved by this save/restore).
// NOTE(review): x19/x20 are callee-saved under AAPCS64; presumably the
// routine prologue (outside this section) preserves them - confirm.
2830    umov        x15,v24.d[0]
2831    umov        x16,v25.d[0]
2832    umov        x19,v26.d[0]
2833    umov        x20,v27.d[0]
2834
// 4x4 transpose of the 16-bit lanes of v30, v12, v31, v13 (the four "sum"
// results); the transposed rows are left in v30, v12, v31, v13.
2835    trn1        v24.4h, v30.4h, v12.4h
2836    trn2        v25.4h, v30.4h, v12.4h
2837    trn1        v26.4h, v31.4h, v13.4h
2838    trn2        v27.4h, v31.4h, v13.4h
2839
2840    trn1        v30.2s, v24.2s, v26.2s
2841    trn2        v31.2s, v24.2s, v26.2s
2842    trn1        v12.2s, v25.2s, v27.2s
2843    trn2        v13.2s, v25.2s, v27.2s
2844
// Same 4x4 transpose for v14, v18, v15, v19 (the four "difference"
// results); the transposed rows are left in v14, v18, v15, v19.
2845    trn1        v24.4h, v14.4h, v18.4h
2846    trn2        v25.4h, v14.4h, v18.4h
2847    trn1        v26.4h, v15.4h, v19.4h
2848    trn2        v27.4h, v15.4h, v19.4h
2849
2850    trn1        v14.2s, v24.2s, v26.2s
2851    trn2        v15.2s, v24.2s, v26.2s
2852    trn1        v18.2s, v25.2s, v27.2s
2853    trn2        v19.2s, v25.2s, v27.2s
2854
// Put back the low halves of the scratch registers.
2855    mov         v24.d[0],x15
2856    mov         v25.d[0],x16
2857    mov         v26.d[0],x19
2858    mov         v27.d[0],x20
2859
// Store the eight transposed 4x16-bit vectors; x0 advances by 64 bytes.
2860    st1         { v30.4h, v31.4h},[x0],#16
2861    st1         { v12.4h, v13.4h},[x0],#16
2862    st1         { v14.4h, v15.4h},[x0],#16
2863    st1         { v18.4h, v19.4h},[x0],#16
2864
2865
2866
2867
// Rewind x0 by 256 bytes so the code that follows can read the stored
// results back. Each store group like the one above advances x0 by 64
// bytes; presumably four such groups were written, the first of them
// outside this section - confirm.
2868    sub         x0,x0,#256
2869prediction_buffer:
2870
2871
2872    ld1         {v12.8h},[x0],#16
2873    ld1         {v14.8h},[x0],#16
2874
2875    add         x0,x0,#32
2876
2877    ld1         {v16.8h},[x0],#16
2878    ld1         {v18.8h},[x0],#16
2879    add         x0,x0,#32
2880
2881    ld1         {v20.8h},[x0],#16
2882    ld1         {v22.8h},[x0],#16
2883
2884
2885    add         x0,x0,#32
2886
2887    ld1         {v24.8h},[x0],#16
2888    ld1         {v26.8h},[x0],#16
2889
2890
2891
2892
2893
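// Layout of the eight vectors just loaded. The dN names are the ARMv7
// d-register names; in this AArch64 code d(2k) and d(2k+1) are presumably
// the low and high 64-bit halves of v(2k), and x0..x3 name output rows,
// not general-purpose registers - TODO confirm.
2894// d12 = x0 values 1-4
2895// d13 = x2 values 1-4
2896// d14 = x1 values 1-4
2897// d15 = x3 values 1-4
2898
2899// d16 = x0 values 5-8
2900// d17 = x2 values 5-8
2901// d18 = x1 values 5-8
2902// d19 = x3 values 5-8
2903
2904// d20 = x0 values 9-12
2905// d21 = x2 values 9-12
2906// d22 = x1 values 9-12
2907// d23 = x3 values 9-12
2908
2909// d24 = x0 values 13-16
2910// d25 = x2 values 13-16
2911// d26 = x1 values 13-16
2912// d27 = x3 values 13-16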
2913
2914    // swapping v12 upper and v16 lower 64bits
2915    mov         v13.d[0], v12.d[1]
2916    mov         v12.d[1], v16.d[0]
2917    mov         v16.d[0], v13.d[0]
2918    // swapping v20 upper and v24 lower 64bits
2919    mov         v21.d[0], v20.d[1]
2920    mov         v20.d[1], v24.d[0]
2921    mov         v24.d[0], v21.d[0]
2922    // swapping v14 upper and v18 lower 64bits
2923    mov         v15.d[0], v14.d[1]
2924    mov         v14.d[1], v18.d[0]
2925    mov         v18.d[0], v15.d[0]
2926    // swapping v22 upper and v26 lower 64bits
2927    mov         v23.d[0], v22.d[1]
2928    mov         v22.d[1], v26.d[0]
2929    mov         v26.d[0], v23.d[0]
2930
2931
2932    ld1         {v8.8b, v9.8b},[x2],x8
2933    ld1         {v10.8b, v11.8b},[x2],x8
2934    ld1         {v28.8b, v29.8b},[x2],x8
2935    ld1         {v30.8b, v31.8b},[x2],x8
2936
2937
2938    uaddw       v12.8h,  v12.8h ,  v8.8b
2939    uaddw       v20.8h,  v20.8h ,  v9.8b
2940    uaddw       v14.8h,  v14.8h ,  v10.8b
2941    uaddw       v22.8h,  v22.8h ,  v11.8b
2942    uaddw       v16.8h,  v16.8h ,  v28.8b
2943    uaddw       v24.8h,  v24.8h ,  v29.8b
2944    uaddw       v18.8h,  v18.8h ,  v30.8b
2945    uaddw       v26.8h,  v26.8h ,  v31.8b
2946    sub         x2,x2,x8,lsl #2
2947    add         x2,x2,#16
2948    sqxtun      v12.8b, v12.8h
2949    sqxtun      v13.8b, v20.8h
2950    sqxtun      v20.8b, v14.8h
2951    sqxtun      v21.8b, v22.8h
2952    sqxtun      v14.8b, v16.8h
2953    sqxtun      v15.8b, v24.8h
2954    sqxtun      v22.8b, v18.8h
2955    sqxtun      v23.8b, v26.8h
2956
2957
2958    st1         {v12.8b, v13.8b},[x3],x7
2959    st1         {v20.8b, v21.8b},[x3],x7
2960    st1         {v14.8b, v15.8b},[x3],x7
2961    st1         {v22.8b, v23.8b},[x3],x7
2962
2963
2964    sub         x3,x3,x7,lsl #2
2965    add         x3,x3,#16
2966
2967    ld1         {v12.8h},[x0],#16
2968    ld1         {v14.8h},[x0],#16
2969
2970    sub         x0,x0,#96
2971
2972    ld1         {v16.8h},[x0],#16
2973    ld1         {v18.8h},[x0],#16
2974    sub         x0,x0,#96
2975
2976    ld1         {v20.8h},[x0],#16
2977    ld1         {v22.8h},[x0],#16
2978
2979
2980    sub         x0,x0,#96
2981
2982    ld1         {v24.8h},[x0],#16
2983    ld1         {v26.8h},[x0],#16
2984
2985
2986    sub         x0,x0,#64
2987
2988
2989    // swapping v12 upper and v16 lower 64bits
2990    mov         v13.d[0], v12.d[1]
2991    mov         v12.d[1], v16.d[0]
2992    mov         v16.d[0], v13.d[0]
2993    // swapping v20 upper and v24 lower 64bits
2994    mov         v21.d[0], v20.d[1]
2995    mov         v20.d[1], v24.d[0]
2996    mov         v24.d[0], v21.d[0]
2997    // swapping v14 upper and v18 lower 64bits
2998    mov         v15.d[0], v14.d[1]
2999    mov         v14.d[1], v18.d[0]
3000    mov         v18.d[0], v15.d[0]
3001    // swapping v22 upper and v26 lower 64bits
3002    mov         v23.d[0], v22.d[1]
3003    mov         v22.d[1], v26.d[0]
3004    mov         v26.d[0], v23.d[0]
3005
3006
3007    ld1         {v8.8b, v9.8b},[x2],x8
3008    ld1         {v10.8b, v11.8b},[x2],x8
3009    ld1         {v28.8b, v29.8b},[x2],x8
3010    ld1         {v30.8b, v31.8b},[x2],x8
3011
3012
3013    uaddw       v12.8h,  v12.8h ,  v8.8b
3014    uaddw       v20.8h,  v20.8h ,  v9.8b
3015    uaddw       v14.8h,  v14.8h ,  v10.8b
3016    uaddw       v22.8h,  v22.8h ,  v11.8b
3017    uaddw       v16.8h,  v16.8h ,  v28.8b
3018    uaddw       v24.8h,  v24.8h ,  v29.8b
3019    uaddw       v18.8h,  v18.8h ,  v30.8b
3020    uaddw       v26.8h,  v26.8h ,  v31.8b
3021    sub         x2,x2,#16
3022
3023    sqxtun      v12.8b, v12.8h
3024    sqxtun      v13.8b, v20.8h
3025    sqxtun      v20.8b, v14.8h
3026    sqxtun      v21.8b, v22.8h
3027    sqxtun      v14.8b, v16.8h
3028    sqxtun      v15.8b, v24.8h
3029    sqxtun      v22.8b, v18.8h
3030    sqxtun      v23.8b, v26.8h
3031
3032
3033    st1         {v12.8b, v13.8b},[x3],x7
3034    st1         {v20.8b, v21.8b},[x3],x7
3035    st1         {v14.8b, v15.8b},[x3],x7
3036    st1         {v22.8b, v23.8b},[x3],x7
3037
3038    sub         x3,x3,#16
3039
3040    subs        x14,x14,#1
3041    bne         dct_stage2
3042    // epilogue, in place of the commented-out 'ldmfd sp!,{x0-x12,pc}': reload x19/x20, run pop_v_regs, return
3043    ldp         x19, x20,[sp],#16
3044    pop_v_regs
3045    ret
3046
3047
3048
3049
3050
3051