1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19// *******************************************************************************
20// * @file
21// *  ihevc_itrans_recon_8x8_neon.s
22// *
23// * @brief
24// *  contains function definitions for single stage  inverse transform
25// *
26// * @author
27// * anand s
28// *
29// * @par list of functions:
30// *  - ihevc_itrans_recon_16x16_av8()
31// *
32// * @remarks
33// *  none
34// *
35// *******************************************************************************
36//*/
37
38///**
39// *******************************************************************************
40// *
41// * @brief
42// *  this function performs inverse transform  and reconstruction for 16x16
43// * input block
44// *
45// * @par description:
46// *  performs inverse transform and adds the prediction  data and clips output
47// * to 8 bit
48// *
49// * @param[in] pi2_src
50// *  input 16x16 coefficients
51// *
52// * @param[in] pi2_tmp
53// *  temporary 16x16 buffer for storing inverse
54// *
55// *  transform
56// *  1st stage output
57// *
58// * @param[in] pu1_pred
59// *  prediction 16x16 block
60// *
61// * @param[out] pu1_dst
62// *  output 16x16 block
63// *
64// * @param[in] src_strd
65// *  input stride
66// *
67// * @param[in] pred_strd
68// *  prediction stride
69// *
70// * @param[in] dst_strd
71// *  output stride
72// *
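73// * @param[in] shift
74// *  output shift (not passed in; fixed by shift_stage1_idct and shift_stage2_idct)
75// *
76// * @param[in] x12
77// *  zero columns in pi2_src (zero_cols); the last argument, x11, is zero_rows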
78// *
79// * @returns  void
80// *
81// * @remarks
82// *  none
83// *
84// *******************************************************************************
85// */
86
87//void ihevc_itrans_recon_16x16(word16 *pi2_src,
88//                            word16 *pi2_tmp,
89//                            uword8 *pu1_pred,
90//                            uword8 *pu1_dst,
91//                            word32 src_strd,
92//                            word32 pred_strd,
93//                            word32 dst_strd,
94//                            word32 x12,
95//                            word32 x11)
96
97//**************variables vs registers*************************
98//    x0 => *pi2_src
99//    x1 => *pi2_tmp
100//    x2 => *pu1_pred
101//    x3 => *pu1_dst
102//    x4 => src_strd
103//    x5 => pred_strd
104//    x6 => dst_strd
105//    x7 => zero_cols (copied to x12)
106//    [sp] => zero_rows (loaded into x11)
107
108.text                                // code below is emitted into the text section
109.align 4                             // power-of-two alignment on AArch64 gas: 2^4 = 16 bytes
110
111.include "ihevc_neon_macros.s"       // presumably supplies push_v_regs used in the prologue - TODO confirm
112
113
114
115
116.set shift_stage1_idct ,   7         // rounding right shift used by the sqrshrn narrowing at the end of the first stage
117.set shift_stage2_idct ,   12        // rounding right shift used by the sqrshrn narrowing at the end of the second stage
118//#define zero_cols         x12      (x12 is copied from x7 and masked to 16 bits at function entry)
119//#define zero_rows         x11      (w11 is loaded from [sp] and masked to 16 bits at function entry)
120.globl ihevc_itrans_recon_16x16_av8  // export the entry point
121
122.extern g_ai2_ihevc_trans_16_transpose // table defined elsewhere; its first 16 halfwords are loaded into v0-v3 via the GOT
123
124.type ihevc_itrans_recon_16x16_av8, %function // mark the symbol as a function
125
126ihevc_itrans_recon_16x16_av8:
127
128    ldr         w11, [sp]
129    // stmfd sp!,{x4-x12,x14}
130    push_v_regs
131    stp         x19, x20,[sp,#-16]!
132    stp         x5, x6,[sp,#-16]!
133//    add             sp,sp,#40
134
135
136
137//    ldr            x8,[sp,#4]     @ prediction stride
138//    ldr            x7,[sp,#8]     @ destination stride
139    mov         x6, x4 // src stride
140    mov         x12, x7
141
142
143
144    adrp        x14, :got:g_ai2_ihevc_trans_16_transpose
145    ldr         x14, [x14, #:got_lo12:g_ai2_ihevc_trans_16_transpose]
146    ld1         {v0.4h, v1.4h, v2.4h, v3.4h},[x14] ////d0,d1 are used for storing the constant data
147    mov         x7,#0xffff
148    and         x12,x12,x7
149    and         x11,x11,x7
150    lsl         x6, x6, #1                  // x sizeof(word16)
151    add         x9,x0,x6, lsl #1            // 2 rows
152
153    add         x10,x6,x6, lsl #1           // 3 rows
154    add         x5,x6,x6,lsl #2
155    mov         x7,#0xfff0
156
157    cmp         x12,x7
158    bge         zero_12cols_decision
159
160    mov         x19,#0xff00
161    cmp         x12,x19
162    bge         zero_8cols_decision
163
164
165
166
167    mov         x14,#4
168    cmp         x11,x7
169    sub         x20,x6,#0
170    neg         x20, x20
171    csel        x10,x20,x10,ge
172
173    mov         x19,#0xff00
174    cmp         x11,x19
175    csel        x8, x5, x8,ge
176    sub         x20,x8,#0
177    neg         x20, x20
178    csel        x8,x20,x8,ge
179    csel        x8, x10, x8,lt
180    add         x5,x5,x6,lsl #3
181    sub         x20,x5,#0
182    neg         x5, x20
183
184    b           first_stage_top_four_bottom_four
185
186zero_12cols_decision:
187    mov         x14,#1
188    mov         x19,#0xff00
189    cmp         x11,x19
190    csel        x8, x5, x8,ge
191    csel        x8, x10, x8,lt
192    add         x5,x5,x6,lsl #3
193    sub         x20,x5,#0
194    neg         x5, x20
195
196    b           first_stage_top_four_bottom_four
197
198zero_8cols_decision:
199    mov         x14,#2
200    mov         x8,x5
201    sub         x20,x8,#0
202    neg         x8, x20
203    mov         x19,#0xff00
204    cmp         x11,x19
205    csel        x8, x10, x8,lt
206    add         x5,x5,x6,lsl #3
207    sub         x20,x5,#0
208    neg         x5, x20
209    cmp         x11,x7
210    sub         x20,x6,#0
211    neg         x20, x20
212    csel        x10,x20,x10,ge
213
214
215    b           first_stage_top_four_bottom_four
216
217
218//d0[0]=    64        d2[0]=64
219//d0[1]= 90        d2[1]=57
220//d0[2]= 89        d2[2]=50
221//d0[3]= 87        d2[3]=43
222//d1[0]= 83         d3[0]=36
223//d1[1]= 80        d3[1]=25
224//d1[2]= 75        d3[2]=18
225//d1[3]= 70        d3[3]=9
226
227
228
229first_stage:
230    add         x0,x0,#8
231    add         x9,x9,#8
232
233first_stage_top_four_bottom_four:
234
235    ld1         {v10.4h},[x0],x6
236    ld1         {v11.4h},[x9],x6
237    ld1         {v6.4h},[x0],x10
238    ld1         {v7.4h},[x9],x10
239    cmp         x11,x7
240    bge         skip_load4rows
241
242    ld1         {v4.4h},[x0],x6
243    ld1         {v5.4h},[x9],x6
244    ld1         {v8.4h},[x0],x8
245    ld1         {v9.4h},[x9],x8
246
247// registers used: q0,q1,q3,q5,q2,q4
248
249// d10 =x0
250//d6= x1
251//d11=x2
252//d7=x3
253
254skip_load4rows:
255    smull       v24.4s, v6.4h, v0.h[1]      //// y1 * cos1(part of b0)
256    smull       v26.4s, v6.4h, v0.h[3]      //// y1 * cos3(part of b1)
257    smull       v28.4s, v6.4h, v1.h[1]      //// y1 * sin3(part of b2)
258    smull       v30.4s, v6.4h, v1.h[3]      //// y1 * sin1(part of b3)
259
260    smlal       v24.4s, v7.4h, v0.h[3]      //// y1 * cos1 + y3 * cos3(part of b0)
261    smlal       v26.4s, v7.4h, v2.h[1]      //// y1 * cos3 - y3 * sin1(part of b1)
262    smlal       v28.4s, v7.4h, v3.h[3]      //// y1 * sin3 - y3 * cos1(part of b2)
263    smlsl       v30.4s, v7.4h, v2.h[3]      //// y1 * sin1 - y3 * sin3(part of b3)
264
265
266
267
268
269
270    smull       v12.4s, v10.4h, v0.h[0]
271    smlal       v12.4s, v11.4h, v0.h[2]
272    smull       v14.4s, v10.4h, v0.h[0]
273    smlal       v14.4s, v11.4h, v1.h[2]
274    smull       v16.4s, v10.4h, v0.h[0]
275    smlal       v16.4s, v11.4h, v2.h[2]
276    smull       v18.4s, v10.4h, v0.h[0]
277    smlal       v18.4s, v11.4h, v3.h[2]
278
279    bge         skip_last12rows_kernel1
280
281
282    smlal       v24.4s, v8.4h, v1.h[1]
283    smlal       v26.4s, v8.4h, v3.h[3]
284    smlsl       v28.4s, v8.4h, v1.h[3]
285    smlsl       v30.4s, v8.4h, v0.h[3]
286
287
288    smlal       v24.4s, v9.4h, v1.h[3]
289    smlsl       v26.4s, v9.4h, v2.h[3]
290    smlsl       v28.4s, v9.4h, v0.h[3]
291    smlal       v30.4s, v9.4h, v3.h[3]
292
293
294
295
296
297    smlal       v12.4s, v4.4h, v1.h[0]
298    smlal       v12.4s, v5.4h, v1.h[2]
299    smlal       v14.4s, v4.4h, v3.h[0]
300    smlsl       v14.4s, v5.4h, v3.h[2]
301    smlsl       v16.4s, v4.4h, v3.h[0]
302    smlsl       v16.4s, v5.4h, v0.h[2]
303    smlsl       v18.4s, v4.4h, v1.h[0]
304    smlsl       v18.4s, v5.4h, v2.h[2]
305
306//d0[0]=    64        d2[0]=64
307//d0[1]= 90        d2[1]=57
308//d0[2]= 89        d2[2]=50
309//d0[3]= 87        d2[3]=43
310//d1[0]= 83         d3[0]=36
311//d1[1]= 80        d3[1]=25
312//d1[2]= 75        d3[2]=18
313//d1[3]= 70        d3[3]=9
314    mov         x19,#0xff00
315    cmp         x11,x19
316    bge         skip_last12rows_kernel1
317
318
319    ld1         {v10.4h},[x0],x6
320    ld1         {v11.4h},[x9],x6
321    ld1         {v6.4h},[x0],x10
322    ld1         {v7.4h},[x9],x10
323    ld1         {v4.4h},[x0],x6
324    ld1         {v5.4h},[x9],x6
325    ld1         {v8.4h},[x0],x5
326    ld1         {v9.4h},[x9],x5
327
328
329
330
331    smlal       v24.4s, v6.4h, v2.h[1]      //// y1 * cos1(part of b0)
332    smlsl       v26.4s, v6.4h, v1.h[1]      //// y1 * cos3(part of b1)
333    smlsl       v28.4s, v6.4h, v3.h[1]      //// y1 * sin3(part of b2)
334    smlal       v30.4s, v6.4h, v0.h[1]      //// y1 * sin1(part of b3)
335
336    smlal       v24.4s, v7.4h, v2.h[3]      //// y1 * cos1 + y3 * cos3(part of b0)
337    smlsl       v26.4s, v7.4h, v0.h[1]      //// y1 * cos3 - y3 * sin1(part of b1)
338    smlal       v28.4s, v7.4h, v2.h[1]      //// y1 * sin3 - y3 * cos1(part of b2)
339    smlal       v30.4s, v7.4h, v3.h[1]      //// y1 * sin1 - y3 * sin3(part of b3)
340
341
342
343    smlal       v24.4s, v8.4h, v3.h[1]
344    smlsl       v26.4s, v8.4h, v1.h[3]
345    smlal       v28.4s, v8.4h, v0.h[1]
346    smlsl       v30.4s, v8.4h, v1.h[1]
347
348
349    smlal       v24.4s, v9.4h, v3.h[3]
350    smlsl       v26.4s, v9.4h, v3.h[1]
351    smlal       v28.4s, v9.4h, v2.h[3]
352    smlsl       v30.4s, v9.4h, v2.h[1]
353
354
355
356
357
358    smlal       v12.4s, v10.4h, v0.h[0]
359    smlal       v12.4s, v11.4h, v2.h[2]
360    smlal       v12.4s, v4.4h, v3.h[0]
361    smlal       v12.4s, v5.4h, v3.h[2]
362
363
364
365
366    smlsl       v14.4s, v10.4h, v0.h[0]
367    smlsl       v14.4s, v11.4h, v0.h[2]
368    smlsl       v14.4s, v4.4h, v1.h[0]
369    smlsl       v14.4s, v5.4h, v2.h[2]
370
371
372    smlsl       v16.4s, v10.4h, v0.h[0]
373    smlal       v16.4s, v11.4h, v3.h[2]
374    smlal       v16.4s, v4.4h, v1.h[0]
375    smlal       v16.4s, v5.4h, v1.h[2]
376
377
378    smlal       v18.4s, v10.4h, v0.h[0]
379    smlal       v18.4s, v11.4h, v1.h[2]
380    smlsl       v18.4s, v4.4h, v3.h[0]
381    smlsl       v18.4s, v5.4h, v0.h[2]
382
383skip_last12rows_kernel1:
384    add         v20.4s,  v12.4s ,  v24.4s
385    sub         v22.4s,  v12.4s ,  v24.4s
386
387    add         v12.4s,  v14.4s ,  v26.4s
388    sub         v24.4s,  v14.4s ,  v26.4s
389
390    add         v14.4s,  v16.4s ,  v28.4s
391    sub         v26.4s,  v16.4s ,  v28.4s
392
393
394    add         v16.4s,  v18.4s ,  v30.4s
395    sub         v28.4s,  v18.4s ,  v30.4s
396
397
398
399
400
401
402
403    sqrshrn     v30.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
404    sqrshrn     v19.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
405    sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
406    sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
407    sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
408    sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
409    sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
410    sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
411
412    st1         {v30.4h, v31.4h},[x1],#16
413    st1         {v18.4h, v19.4h},[x1],#16
414    sub         x1,x1,#32
415
416    bge         skip_stage1_kernel_load
417
418first_stage_middle_eight:
419
420
421
422    ld1         {v10.4h},[x0],x6
423    ld1         {v11.4h},[x9],x6
424    ld1         {v6.4h},[x0],x10
425    ld1         {v7.4h},[x9],x10
426    ld1         {v4.4h},[x0],x6
427    ld1         {v5.4h},[x9],x6
428    ld1         {v8.4h},[x0],x8
429    ld1         {v9.4h},[x9],x8
430
431
432skip_stage1_kernel_load:
433    smull       v24.4s, v6.4h, v2.h[1]     //// y1 * cos1(part of b0)
434    smull       v26.4s, v6.4h, v2.h[3]     //// y1 * cos3(part of b1)
435    smull       v28.4s, v6.4h, v3.h[1]     //// y1 * sin3(part of b2)
436    smull       v30.4s, v6.4h, v3.h[3]     //// y1 * sin1(part of b3)
437
438    smlsl       v24.4s, v7.4h, v1.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
439    smlsl       v26.4s, v7.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
440    smlsl       v28.4s, v7.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
441    smlsl       v30.4s, v7.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
442
443
444
445
446
447
448    smull       v22.4s, v10.4h, v0.h[0]
449    smlsl       v22.4s, v11.4h, v3.h[2]
450    smull       v20.4s, v10.4h, v0.h[0]
451    smlsl       v20.4s, v11.4h, v2.h[2]
452    smull       v16.4s, v10.4h, v0.h[0]
453    smlsl       v16.4s, v11.4h, v1.h[2]
454    smull       v18.4s, v10.4h, v0.h[0]
455    smlsl       v18.4s, v11.4h, v0.h[2]
456
457
458    cmp         x11,x7
459    bge         skip_last12rows_kernel2
460
461    smlsl       v24.4s, v8.4h, v3.h[1]
462    smlal       v26.4s, v8.4h, v2.h[1]
463    smlal       v28.4s, v8.4h, v0.h[1]
464    smlal       v30.4s, v8.4h, v2.h[3]
465
466
467    smlal       v24.4s, v9.4h, v0.h[1]
468    smlal       v26.4s, v9.4h, v3.h[1]
469    smlsl       v28.4s, v9.4h, v1.h[1]
470    smlsl       v30.4s, v9.4h, v2.h[1]
471
472
473
474    smlsl       v22.4s, v4.4h, v1.h[0]
475    smlal       v22.4s, v5.4h, v2.h[2]
476    smlsl       v20.4s, v4.4h, v3.h[0]
477    smlal       v20.4s, v5.4h, v0.h[2]
478    smlal       v16.4s, v4.4h, v3.h[0]
479    smlal       v16.4s, v5.4h, v3.h[2]
480    smlal       v18.4s, v4.4h, v1.h[0]
481    smlsl       v18.4s, v5.4h, v1.h[2]
482
483//d0[0]=    64        d2[0]=64
484//d0[1]= 90        d2[1]=57
485//d0[2]= 89        d2[2]=50
486//d0[3]= 87        d2[3]=43
487//d1[0]= 83         d3[0]=36
488//d1[1]= 80        d3[1]=25
489//d1[2]= 75        d3[2]=18
490//d1[3]= 70        d3[3]=9
491    mov         x19,#0xff00
492    cmp         x11,x19
493    bge         skip_last12rows_kernel2
494
495    ld1         {v10.4h},[x0],x6
496    ld1         {v11.4h},[x9],x6
497    ld1         {v6.4h},[x0],x10
498    ld1         {v7.4h},[x9],x10
499    ld1         {v4.4h},[x0],x6
500    ld1         {v5.4h},[x9],x6
501    ld1         {v8.4h},[x0],x5
502    ld1         {v9.4h},[x9],x5
503
504
505    smlsl       v24.4s, v6.4h, v3.h[3]     //// y1 * cos1(part of b0)
506    smlsl       v26.4s, v6.4h, v0.h[3]     //// y1 * cos3(part of b1)
507    smlal       v28.4s, v6.4h, v2.h[3]     //// y1 * sin3(part of b2)
508    smlal       v30.4s, v6.4h, v1.h[3]     //// y1 * sin1(part of b3)
509
510    smlsl       v24.4s, v7.4h, v0.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
511    smlal       v26.4s, v7.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
512    smlal       v28.4s, v7.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
513    smlsl       v30.4s, v7.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
514
515
516    smlal       v24.4s, v8.4h, v2.h[3]
517    smlal       v26.4s, v8.4h, v3.h[3]
518    smlsl       v28.4s, v8.4h, v2.h[1]
519    smlal       v30.4s, v8.4h, v0.h[3]
520
521
522    smlal       v24.4s, v9.4h, v1.h[3]
523    smlsl       v26.4s, v9.4h, v1.h[1]
524    smlal       v28.4s, v9.4h, v0.h[3]
525    smlsl       v30.4s, v9.4h, v0.h[1]
526
527
528
529
530    smlal       v22.4s, v10.4h, v0.h[0]
531    smlsl       v22.4s, v11.4h, v1.h[2]
532    smlsl       v22.4s, v4.4h, v3.h[0]
533    smlal       v22.4s, v5.4h, v0.h[2]
534
535
536
537    smlsl       v20.4s, v10.4h, v0.h[0]
538    smlsl       v20.4s, v11.4h, v3.h[2]
539    smlal       v20.4s, v4.4h, v1.h[0]
540    smlsl       v20.4s, v5.4h, v1.h[2]
541
542
543    smlsl       v16.4s, v10.4h, v0.h[0]
544    smlal       v16.4s, v11.4h, v0.h[2]
545    smlsl       v16.4s, v4.4h, v1.h[0]
546    smlal       v16.4s, v5.4h, v2.h[2]
547
548
549
550    smlal       v18.4s, v10.4h, v0.h[0]
551    smlsl       v18.4s, v11.4h, v2.h[2]
552    smlal       v18.4s, v4.4h, v3.h[0]
553    smlsl       v18.4s, v5.4h, v3.h[2]
554
555skip_last12rows_kernel2:
556
557    add         v4.4s,  v22.4s ,  v24.4s
558    sub         v22.4s,  v22.4s ,  v24.4s
559
560    add         v6.4s,  v20.4s ,  v26.4s
561    sub         v24.4s,  v20.4s ,  v26.4s
562
563    add         v10.4s,  v16.4s ,  v28.4s
564    sub         v26.4s,  v16.4s ,  v28.4s
565
566
567    add         v16.4s,  v18.4s ,  v30.4s
568    sub         v28.4s,  v18.4s ,  v30.4s
569
570
571    sqrshrn     v18.4h, v4.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
572    sqrshrn     v31.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
573    sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
574    sqrshrn     v30.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
575    sqrshrn     v20.4h, v6.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
576    sqrshrn     v23.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
577    sqrshrn     v21.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
578    sqrshrn     v22.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
579
580
581    // registers used:    {q2,q4,q6,q7}, {q9,q15,q10,q11}
582
583
584
585
586
587
588    ld1         {v4.4h, v5.4h},[x1],#16
589    ld1         {v8.4h, v9.4h},[x1],#16
590    sub         x1,x1,#32
591
592//d4=x0
593//d12=x1
594//d5=x2
595//d13=x3
596
597//d18=x4
598//d20=x5
599//d19=x6
600//d21=x7
601
602//d22=x8
603//d30=x9
604//d23=x10
605//d31=x11
606
607//d14=x12
608//d8=x13
609//d15=x14
610//d9=x15
611
612    umov        x15,v26.d[0]
613    umov        x16,v27.d[0]
614    umov        x19,v28.d[0]
615    umov        x20,v29.d[0]
616
617    trn1        v26.4h, v4.4h, v12.4h
618    trn2        v27.4h, v4.4h, v12.4h
619    trn1        v28.4h, v5.4h, v13.4h
620    trn2        v29.4h, v5.4h, v13.4h
621
622    trn1        v4.2s, v26.2s, v28.2s
623    trn2        v5.2s, v26.2s, v28.2s
624    trn1        v12.2s, v27.2s, v29.2s
625    trn2        v13.2s, v27.2s, v29.2s
626
627    trn1        v26.4h, v18.4h, v20.4h
628    trn2        v27.4h, v18.4h, v20.4h
629    trn1        v28.4h, v19.4h, v21.4h
630    trn2        v29.4h, v19.4h, v21.4h
631
632    trn1        v18.2s, v26.2s, v28.2s
633    trn2        v19.2s, v26.2s, v28.2s
634    trn1        v20.2s, v27.2s, v29.2s
635    trn2        v21.2s, v27.2s, v29.2s
636
637    trn1        v26.4h, v22.4h, v30.4h
638    trn2        v27.4h, v22.4h, v30.4h
639    trn1        v28.4h, v23.4h, v31.4h
640    trn2        v29.4h, v23.4h, v31.4h
641
642    trn1        v22.2s, v26.2s, v28.2s
643    trn2        v23.2s, v26.2s, v28.2s
644    trn1        v30.2s, v27.2s, v29.2s
645    trn2        v31.2s, v27.2s, v29.2s
646
647    trn1        v26.4h, v14.4h, v8.4h
648    trn2        v27.4h, v14.4h, v8.4h
649    trn1        v28.4h, v15.4h, v9.4h
650    trn2        v29.4h, v15.4h, v9.4h
651
652    trn1        v14.2s, v26.2s, v28.2s
653    trn2        v15.2s, v26.2s, v28.2s
654    trn1        v8.2s, v27.2s, v29.2s
655    trn2        v9.2s, v27.2s, v29.2s
656
657    mov         v26.d[0],x15
658    mov         v27.d[0],x16
659    mov         v28.d[0],x19
660    mov         v29.d[0],x20
661
662// d4 =x0 1- 4 values
663// d5 =x2 1- 4 values
664// d12=x1 1- 4 values
665// d13=x3 1- 4 values
666
667// d18 =x0 5- 8 values
668// d19 =x2 5- 8 values
669// d20=x1 5- 8 values
670// d21=x3 5- 8 values
671
672// d22 =x0 9- 12 values
673// d23 =x2 9- 12 values
674// d30=x1 9- 12 values
675// d31=x3 9- 12 values
676
677// d14 =x0 13-16 values
678// d15 =x2 13- 16 values
679// d8=x1 13- 16 values
680// d9=x3 13- 16 values
681
682
683    st1         { v4.4h, v5.4h},[x1],#16
684    st1         { v12.4h, v13.4h},[x1],#16
685
686    st1         { v18.4h, v19.4h},[x1],#16
687    st1         { v20.4h, v21.4h},[x1],#16
688    st1         { v22.4h, v23.4h},[x1],#16
689    st1         { v30.4h, v31.4h},[x1],#16
690    st1         { v14.4h, v15.4h},[x1],#16
691    st1         { v8.4h, v9.4h},[x1],#16
692
693
694    subs        x14,x14,#1
695    bne         first_stage
696
697
698
699
700
701
702
703
704
705
706    mov         x6,x7
707
708    ldp         x8, x7,[sp],#16
709
710    mov         x10,#16
711
712    cmp         x12,x6
713    sub         x20,x1,#128
714    csel        x1, x20, x1,ge
715    bge         label1
716
717    mov         x19,#0xff00
718    cmp         x12,x19
719    sub         x20,x1,#256
720    csel        x1, x20, x1,ge
721    bge         label_2
722
723    sub         x1,x1,#512
724    sub         x20,x10,#0
725    neg         x10, x20
726
727label_2:
728    add         x9,x1,#128
729    add         x11,x9,#128
730    add         x0,x11,#128
731
732
733
734label1:
735//    mov   x6,x1
736
737
738    mov         x14,#4
739    add         x4,x2,x8, lsl #1            // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
740    add         x5,x8,x8, lsl #1            //
741//    add x0,x3,x7, lsl #1    @ x0 points to 3rd row of dest data
742//    add x10,x7,x7, lsl #1    @
743
744
745
746
747second_stage:
748    ld1         {v10.4h, v11.4h},[x1],#16
749    ld1         {v6.4h, v7.4h},[x1],x10
750    cmp         x12,x6
751    bge         second_stage_process
752    ld1         {v4.4h, v5.4h},[x9],#16
753    ld1         {v8.4h, v9.4h},[x9],x10
754
755second_stage_process:
756
757
758    smull       v24.4s, v6.4h, v0.h[1]     //// y1 * cos1(part of b0)
759    smull       v26.4s, v6.4h, v0.h[3]     //// y1 * cos3(part of b1)
760    smull       v28.4s, v6.4h, v1.h[1]     //// y1 * sin3(part of b2)
761    smull       v30.4s, v6.4h, v1.h[3]     //// y1 * sin1(part of b3)
762
763    smlal       v24.4s, v7.4h, v0.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
764    smlal       v26.4s, v7.4h, v2.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
765    smlal       v28.4s, v7.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
766    smlsl       v30.4s, v7.4h, v2.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
767
768
769    smull       v12.4s, v10.4h, v0.h[0]
770    smlal       v12.4s, v11.4h, v0.h[2]
771    smull       v14.4s, v10.4h, v0.h[0]
772    smlal       v14.4s, v11.4h, v1.h[2]
773    smull       v16.4s, v10.4h, v0.h[0]
774    smlal       v16.4s, v11.4h, v2.h[2]
775    smull       v18.4s, v10.4h, v0.h[0]
776    smlal       v18.4s, v11.4h, v3.h[2]
777
778    bge         skip_last8rows_stage2_kernel1
779
780    smlal       v24.4s, v8.4h, v1.h[1]
781    smlal       v26.4s, v8.4h, v3.h[3]
782    smlsl       v28.4s, v8.4h, v1.h[3]
783    smlsl       v30.4s, v8.4h, v0.h[3]
784
785
786    smlal       v24.4s, v9.4h, v1.h[3]
787    smlsl       v26.4s, v9.4h, v2.h[3]
788    smlsl       v28.4s, v9.4h, v0.h[3]
789    smlal       v30.4s, v9.4h, v3.h[3]
790
791
792    smlal       v12.4s, v4.4h, v1.h[0]
793    smlal       v12.4s, v5.4h, v1.h[2]
794    smlal       v14.4s, v4.4h, v3.h[0]
795    smlsl       v14.4s, v5.4h, v3.h[2]
796    smlsl       v16.4s, v4.4h, v3.h[0]
797    smlsl       v16.4s, v5.4h, v0.h[2]
798    smlsl       v18.4s, v4.4h, v1.h[0]
799    smlsl       v18.4s, v5.4h, v2.h[2]
800
801    mov         x19,#0xff00
802    cmp         x12,x19
803    bge         skip_last8rows_stage2_kernel1
804
805
806    ld1         {v10.4h, v11.4h},[x11],#16
807    ld1         {v6.4h, v7.4h},[x11],x10
808    ld1         {v4.4h, v5.4h},[x0],#16
809    ld1         {v8.4h, v9.4h},[x0],x10
810
811
812
813
814
815    smlal       v24.4s, v6.4h, v2.h[1]     //// y1 * cos1(part of b0)
816    smlsl       v26.4s, v6.4h, v1.h[1]     //// y1 * cos3(part of b1)
817    smlsl       v28.4s, v6.4h, v3.h[1]     //// y1 * sin3(part of b2)
818    smlal       v30.4s, v6.4h, v0.h[1]     //// y1 * sin1(part of b3)
819
820    smlal       v24.4s, v7.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
821    smlsl       v26.4s, v7.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
822    smlal       v28.4s, v7.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
823    smlal       v30.4s, v7.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
824
825
826
827    smlal       v24.4s, v8.4h, v3.h[1]
828    smlsl       v26.4s, v8.4h, v1.h[3]
829    smlal       v28.4s, v8.4h, v0.h[1]
830    smlsl       v30.4s, v8.4h, v1.h[1]
831
832
833    smlal       v24.4s, v9.4h, v3.h[3]
834    smlsl       v26.4s, v9.4h, v3.h[1]
835    smlal       v28.4s, v9.4h, v2.h[3]
836    smlsl       v30.4s, v9.4h, v2.h[1]
837
838
839
840
841
842    smlal       v12.4s, v10.4h, v0.h[0]
843    smlal       v12.4s, v11.4h, v2.h[2]
844    smlal       v12.4s, v4.4h, v3.h[0]
845    smlal       v12.4s, v5.4h, v3.h[2]
846
847
848
849
850    smlsl       v14.4s, v10.4h, v0.h[0]
851    smlsl       v14.4s, v11.4h, v0.h[2]
852    smlsl       v14.4s, v4.4h, v1.h[0]
853    smlsl       v14.4s, v5.4h, v2.h[2]
854
855
856    smlsl       v16.4s, v10.4h, v0.h[0]
857    smlal       v16.4s, v11.4h, v3.h[2]
858    smlal       v16.4s, v4.4h, v1.h[0]
859    smlal       v16.4s, v5.4h, v1.h[2]
860
861
862    smlal       v18.4s, v10.4h, v0.h[0]
863    smlal       v18.4s, v11.4h, v1.h[2]
864    smlsl       v18.4s, v4.4h, v3.h[0]
865    smlsl       v18.4s, v5.4h, v0.h[2]
866
867
868
869
870
871
872skip_last8rows_stage2_kernel1:
873
874
875
876    add         v20.4s,  v12.4s ,  v24.4s
877    sub         v22.4s,  v12.4s ,  v24.4s
878
879    add         v12.4s,  v14.4s ,  v26.4s
880    sub         v24.4s,  v14.4s ,  v26.4s
881
882    add         v14.4s,  v16.4s ,  v28.4s
883    sub         v26.4s,  v16.4s ,  v28.4s
884
885
886    add         v16.4s,  v18.4s ,  v30.4s
887    sub         v28.4s,  v18.4s ,  v30.4s
888
889
890
891
892
893
894
895    sqrshrn     v30.4h, v20.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
896    sqrshrn     v19.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
897    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
898    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
899    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
900    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
901    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
902    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
903
904    bge         skip_stage2_kernel_load
905
906    //q2,q4,q6,q7 is used
907    ld1         {v10.4h, v11.4h},[x1],#16
908    ld1         {v6.4h, v7.4h},[x1],#16
909    ld1         {v4.4h, v5.4h},[x9],#16
910    ld1         {v8.4h, v9.4h},[x9],#16
911skip_stage2_kernel_load:
912    sub         x1,x1,#32
913    st1         {v30.4h, v31.4h},[x1],#16
914    st1         {v18.4h, v19.4h},[x1],#16
915    sub         x1,x1,#32
916
917    smull       v24.4s, v6.4h, v2.h[1]     //// y1 * cos1(part of b0)
918    smull       v26.4s, v6.4h, v2.h[3]     //// y1 * cos3(part of b1)
919    smull       v28.4s, v6.4h, v3.h[1]     //// y1 * sin3(part of b2)
920    smull       v30.4s, v6.4h, v3.h[3]     //// y1 * sin1(part of b3)
921
922    smlsl       v24.4s, v7.4h, v1.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
923    smlsl       v26.4s, v7.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
924    smlsl       v28.4s, v7.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
925    smlsl       v30.4s, v7.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
926
927
928    smull       v22.4s, v10.4h, v0.h[0]
929    smlsl       v22.4s, v11.4h, v3.h[2]
930    smull       v20.4s, v10.4h, v0.h[0]
931    smlsl       v20.4s, v11.4h, v2.h[2]
932    smull       v16.4s, v10.4h, v0.h[0]
933    smlsl       v16.4s, v11.4h, v1.h[2]
934    smull       v18.4s, v10.4h, v0.h[0]
935    smlsl       v18.4s, v11.4h, v0.h[2]
936
937
938
939    cmp         x12,x6
940    bge         skip_last8rows_stage2_kernel2
941
942
943    smlsl       v24.4s, v8.4h, v3.h[1]
944    smlal       v26.4s, v8.4h, v2.h[1]
945    smlal       v28.4s, v8.4h, v0.h[1]
946    smlal       v30.4s, v8.4h, v2.h[3]
947
948
949    smlal       v24.4s, v9.4h, v0.h[1]
950    smlal       v26.4s, v9.4h, v3.h[1]
951    smlsl       v28.4s, v9.4h, v1.h[1]
952    smlsl       v30.4s, v9.4h, v2.h[1]
953
954
955
956    smlsl       v22.4s, v4.4h, v1.h[0]
957    smlal       v22.4s, v5.4h, v2.h[2]
958    smlsl       v20.4s, v4.4h, v3.h[0]
959    smlal       v20.4s, v5.4h, v0.h[2]
960    smlal       v16.4s, v4.4h, v3.h[0]
961    smlal       v16.4s, v5.4h, v3.h[2]
962    smlal       v18.4s, v4.4h, v1.h[0]
963    smlsl       v18.4s, v5.4h, v1.h[2]
964    mov         x19,#0xff00
965    cmp         x12,x19
966    bge         skip_last8rows_stage2_kernel2
967
968    ld1         {v10.4h, v11.4h},[x11],#16
969    ld1         {v6.4h, v7.4h},[x11],#16
970    ld1         {v4.4h, v5.4h},[x0],#16
971    ld1         {v8.4h, v9.4h},[x0],#16
972
973    smlsl       v24.4s, v6.4h, v3.h[3]     //// y1 * cos1(part of b0)
974    smlsl       v26.4s, v6.4h, v0.h[3]     //// y1 * cos3(part of b1)
975    smlal       v28.4s, v6.4h, v2.h[3]     //// y1 * sin3(part of b2)
976    smlal       v30.4s, v6.4h, v1.h[3]     //// y1 * sin1(part of b3)
977
978    smlsl       v24.4s, v7.4h, v0.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
979    smlal       v26.4s, v7.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
980    smlal       v28.4s, v7.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
981    smlsl       v30.4s, v7.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
982
983
984    smlal       v24.4s, v8.4h, v2.h[3]
985    smlal       v26.4s, v8.4h, v3.h[3]
986    smlsl       v28.4s, v8.4h, v2.h[1]
987    smlal       v30.4s, v8.4h, v0.h[3]
988
989
990    smlal       v24.4s, v9.4h, v1.h[3]
991    smlsl       v26.4s, v9.4h, v1.h[1]
992    smlal       v28.4s, v9.4h, v0.h[3]
993    smlsl       v30.4s, v9.4h, v0.h[1]
994
995
996
997
998    smlal       v22.4s, v10.4h, v0.h[0]
999    smlsl       v22.4s, v11.4h, v1.h[2]
1000    smlsl       v22.4s, v4.4h, v3.h[0]
1001    smlal       v22.4s, v5.4h, v0.h[2]
1002
1003
1004
1005    smlsl       v20.4s, v10.4h, v0.h[0]
1006    smlsl       v20.4s, v11.4h, v3.h[2]
1007    smlal       v20.4s, v4.4h, v1.h[0]
1008    smlsl       v20.4s, v5.4h, v1.h[2]
1009
1010
1011    smlsl       v16.4s, v10.4h, v0.h[0]
1012    smlal       v16.4s, v11.4h, v0.h[2]
1013    smlsl       v16.4s, v4.4h, v1.h[0]
1014    smlal       v16.4s, v5.4h, v2.h[2]
1015
1016
1017
1018    smlal       v18.4s, v10.4h, v0.h[0]
1019    smlsl       v18.4s, v11.4h, v2.h[2]
1020    smlal       v18.4s, v4.4h, v3.h[0]
1021    smlsl       v18.4s, v5.4h, v3.h[2]
1022
1023
skip_last8rows_stage2_kernel2:
// Final part of one second-stage iteration:
//   1. butterfly the 32-bit accumulators (v22/v20/v16/v18 with v24/v26/v28/v30),
//   2. round/shift/narrow the eight results to 16 bit,
//   3. transpose four 4x4 groups of 16-bit values,
//   4. add four rows of 8-bit data loaded via x2/x4, saturate to u8 and
//      store four 16-byte rows through x3,
//   5. loop back to second_stage while the counter in x14 is non-zero,
//      otherwise restore registers and return.
// This label is entered by fall-through from the accumulation code directly
// above; any branches that target it are outside this part of the file.



    // Butterfly: sum and difference of each accumulator pair.
    // The original comments name the first operand aN and the second bN.
    add         v4.4s,  v22.4s ,  v24.4s
    sub         v22.4s,  v22.4s ,  v24.4s

    add         v6.4s,  v20.4s ,  v26.4s
    sub         v24.4s,  v20.4s ,  v26.4s

    add         v10.4s,  v16.4s ,  v28.4s
    sub         v26.4s,  v16.4s ,  v28.4s


    add         v16.4s,  v18.4s ,  v30.4s
    sub         v28.4s,  v18.4s ,  v30.4s


    // sqrshrn = signed saturating rounding shift right narrow (32 -> 16 bit),
    // so the "+ rnd" and the saturation are done by the instruction itself.
    // The shift amount used here is shift_stage2_idct.
    sqrshrn     v18.4h, v4.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> shift_stage2_idct
    sqrshrn     v31.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> shift_stage2_idct
    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> shift_stage2_idct
    sqrshrn     v30.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> shift_stage2_idct
    sqrshrn     v20.4h, v6.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> shift_stage2_idct
    sqrshrn     v23.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> shift_stage2_idct
    sqrshrn     v21.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> shift_stage2_idct
    sqrshrn     v22.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> shift_stage2_idct

    // Load four vectors of 4 x 16-bit values from [x1]; x1 advances by
    // 16 bytes per instruction (32 bytes in total).
    // NOTE(review): presumably these are results written to the temporary
    // buffer earlier in the function - confirm against the code above.
    ld1         {v4.4h, v5.4h},[x1],#16
    ld1         {v8.4h, v9.4h},[x1],#16



    // The register lists below use the ARMv7 NEON names kept from the 32-bit
    // version of this routine: dN corresponds to the low 64 bits of vN in the
    // code that follows (e.g. d4/d12/d5/d13 are v4/v12/v5/v13).
    // registers used:    {q2,q4,q6,q7}, {q9,q15,q10,q11}

//d4=x0
//d12=x1
//d5=x2
//d13=x3

//d18=x4
//d20=x5
//d19=x6
//d21=x7

//d22=x8
//d30=x9
//d23=x10
//d31=x11

//d14=x12
//d8=x13
//d15=x14
//d9=x15

    // v26-v29 are used as scratch by the transposes below, so their low
    // 64 bits are parked in general-purpose registers first.
    // NOTE(review): within this block v26-v29 are fully overwritten by the
    // ld1 instructions further down before anything reads them, so this
    // save (and the matching restore) looks redundant here - confirm that
    // nothing at second_stage depends on x15/x16/x19/x20 before removing.
    umov        x15,v26.d[0]
    umov        x16,v27.d[0]
    umov        x19,v28.d[0]
    umov        x20,v29.d[0]

    // 4x4 transpose of 16-bit elements, rows v4, v12, v5, v13:
    // first interleave 16-bit lanes of row pairs, then 32-bit lanes.
    trn1        v26.4h, v4.4h, v12.4h
    trn2        v27.4h, v4.4h, v12.4h
    trn1        v28.4h, v5.4h, v13.4h
    trn2        v29.4h, v5.4h, v13.4h

    trn1        v4.2s, v26.2s, v28.2s
    trn2        v5.2s, v26.2s, v28.2s
    trn1        v12.2s, v27.2s, v29.2s
    trn2        v13.2s, v27.2s, v29.2s

    // Same 4x4 transpose for rows v18, v20, v19, v21.
    trn1        v26.4h, v18.4h, v20.4h
    trn2        v27.4h, v18.4h, v20.4h
    trn1        v28.4h, v19.4h, v21.4h
    trn2        v29.4h, v19.4h, v21.4h

    trn1        v18.2s, v26.2s, v28.2s
    trn2        v19.2s, v26.2s, v28.2s
    trn1        v20.2s, v27.2s, v29.2s
    trn2        v21.2s, v27.2s, v29.2s

    // Same 4x4 transpose for rows v22, v30, v23, v31.
    trn1        v26.4h, v22.4h, v30.4h
    trn2        v27.4h, v22.4h, v30.4h
    trn1        v28.4h, v23.4h, v31.4h
    trn2        v29.4h, v23.4h, v31.4h

    trn1        v22.2s, v26.2s, v28.2s
    trn2        v23.2s, v26.2s, v28.2s
    trn1        v30.2s, v27.2s, v29.2s
    trn2        v31.2s, v27.2s, v29.2s

    // Same 4x4 transpose for rows v14, v8, v15, v9.
    trn1        v26.4h, v14.4h, v8.4h
    trn2        v27.4h, v14.4h, v8.4h
    trn1        v28.4h, v15.4h, v9.4h
    trn2        v29.4h, v15.4h, v9.4h

    trn1        v14.2s, v26.2s, v28.2s
    trn2        v15.2s, v26.2s, v28.2s
    trn1        v8.2s, v27.2s, v29.2s
    trn2        v9.2s, v27.2s, v29.2s

    // Put back the low 64 bits of v26-v29 that were parked before the
    // transposes (see the review note at the save above).
    mov         v26.d[0],x15
    mov         v27.d[0],x16
    mov         v28.d[0],x19
    mov         v29.d[0],x20

// Register contents after the transposes (dN = low 64 bits of vN):
// d4 =x0 1- 4 values
// d5 =x2 1- 4 values
// d12=x1 1- 4 values
// d13=x3 1- 4 values

// d18 =x0 5- 8 values
// d19 =x2 5- 8 values
// d20=x1 5- 8 values
// d21=x3 5- 8 values

// d22 =x0 9- 12 values
// d23 =x2 9- 12 values
// d30=x1 9- 12 values
// d31=x3 9- 12 values

// d14 =x0 13-16 values
// d15 =x2 13- 16 values
// d8=x1 13- 16 values
// d9=x3 13- 16 values

    // Each swap below exchanges the low 64 bits of two registers, using the
    // upper half (d[1]) of the first register as the temporary.
    // swapping v5 and v18 (low halves)
    mov         v5.d[1],v5.d[0]
    mov         v5.d[0],v18.d[0]
    mov         v18.d[0],v5.d[1]
    // swapping v23 and v14 (low halves)
    mov         v23.d[1],v23.d[0]
    mov         v23.d[0],v14.d[0]
    mov         v14.d[0],v23.d[1]
    // swapping v13 and v20 (low halves)
    mov         v13.d[1],v13.d[0]
    mov         v13.d[0],v20.d[0]
    mov         v20.d[0],v13.d[1]
    // swapping v31 and v8 (low halves)
    mov         v31.d[1],v31.d[0]
    mov         v31.d[0],v8.d[0]
    mov         v8.d[0],v31.d[1]

// ARMv7 q-register view of the layout after the swaps. In this file the pair
// qN = {d(2N), d(2N+1)} is formed in v(2N) by the "mov vX.d[1], vY.d[0]"
// inserts further down (q2 -> v4, q11 -> v22, q9 -> v18, q7 -> v14,
// q6 -> v12, q10 -> v20, q15 -> v30, q4 -> v8).
// q2: x0 1-8 values
// q11: x0 9-16 values
// q9 : x2 1-8 values
// q7 : x2 9-16 values
// q6 : x1 1- 8 values
// q10: x3 1-8 values
// q15: x1 9-16 values
// q4:  x3 9-16 values


//    registers free: q8,q14,q12,q13


    // Load four rows of 16 bytes: two rows through x2 and two through x4,
    // each pointer post-incremented by x8 and then by x5.
    // NOTE(review): presumably these are prediction rows and x8/x5 are
    // stride-derived increments set up earlier - confirm against the setup
    // code, which is outside this view.
    ld1         {v16.8b, v17.8b},[x2],x8
    ld1         {v28.8b, v29.8b},[x2],x5
    ld1         {v24.8b, v25.8b},[x4],x8
    ld1         {v26.8b, v27.8b},[x4],x5

    // Pack pairs of 64-bit halves into full 128-bit registers (8 x 16 bit)
    // so each uaddw below handles eight values at once.
    mov         v4.d[1] ,v5.d[0]
    mov         v22.d[1] ,v23.d[0]
    mov         v12.d[1] ,v13.d[0]
    mov         v30.d[1] ,v31.d[0]
    mov         v18.d[1] ,v19.d[0]
    mov         v14.d[1] ,v15.d[0]
    mov         v20.d[1] ,v21.d[0]
    mov         v8.d[1] ,v9.d[0]

    // Widening add: zero-extend the loaded bytes to 16 bit and add them to
    // the 16-bit transform results.
    uaddw       v4.8h,  v4.8h ,  v16.8b
    uaddw       v22.8h,  v22.8h ,  v17.8b
    uaddw       v12.8h,  v12.8h ,  v28.8b
    uaddw       v30.8h,  v30.8h ,  v29.8b
    uaddw       v18.8h,  v18.8h ,  v24.8b
    uaddw       v14.8h,  v14.8h ,  v25.8b
    uaddw       v20.8h,  v20.8h ,  v26.8b
    uaddw       v8.8h,  v8.8h ,  v27.8b


    // Signed 16-bit -> unsigned 8-bit with saturation (clips to 0..255).
    sqxtun      v16.8b, v4.8h
    sqxtun      v17.8b, v22.8h
    sqxtun      v28.8b, v12.8h
    sqxtun      v29.8b, v30.8h
    sqxtun      v24.8b, v18.8h
    sqxtun      v25.8b, v14.8h
    sqxtun      v26.8b, v20.8h
    sqxtun      v27.8b, v8.8h



    // Store four rows of 16 bytes through x3, advancing x3 by x7 per row.
    st1         {v16.8b, v17.8b},[x3],x7
    st1         {v28.8b, v29.8b},[x3],x7
    st1         {v24.8b, v25.8b},[x3],x7
    st1         {v26.8b, v27.8b},[x3],x7

    // x14 is the remaining-iteration counter; subs sets the flags for bne.
    subs        x14,x14,#1



    bne         second_stage


    // Legacy ARMv7 epilogue, kept only as commented-out reference:
//    sub         sp,sp,#40
    // ldmfd sp!,{x4-x12,pc}
    // Restore x19/x20 (used as scratch above) from the stack; sp += 16.
    ldp         x19, x20,[sp],#16
    // pop_v_regs is a macro defined outside this view.
    // NOTE(review): presumably it restores the SIMD registers saved in the
    // prologue - confirm it matches the corresponding push.
    pop_v_regs
    ret
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241