1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21///*******************************************************************************
22// * //file
23// *  ih264_iquant_itrans_recon_a9.s
24// *
25// * //brief
26// *  Contains function definitions for single stage  inverse transform
27// *
28// * //author
29// *  Parthiban V
30// *     Mohit
31// *  Harinarayanaan
32// *
33// * //par List of Functions:
34// *  - ih264_iquant_itrans_recon_4x4_av8()
35// *     - ih264_iquant_itrans_recon_8x8_av8()
36// *     - ih264_iquant_itrans_recon_chroma_4x4_av8()
37// *
38// * //remarks
39// *  None
40// *
41// *******************************************************************************
42
43.text
44.p2align 2
45.include "ih264_neon_macros.s"
46
47///*
48// *******************************************************************************
49// *
50// * //brief
51// *  This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
52// *
53// * //par Description:
54// *  Performs inverse transform Ci4 and adds the residue to get the
55// *  reconstructed block
56// *
57// * //param[in] pi2_src
58// *  Input 4x4 coefficients
59// *
60// * //param[in] pu1_pred
61// *  Prediction 4x4 block
62// *
63// * //param[out] pu1_out
64// *  Output 4x4 block
65// *
66// * //param[in] u4_qp_div_6
67// *     QP
68// *
69// * //param[in] pu2_weigh_mat
70// * Pointer to weight matrix
71// *
72// * //param[in] pred_strd,
73// *  Prediction stride
74// *
75// * //param[in] out_strd
76// *  Output Stride
77// *
78// *//param[in] pi2_tmp
79// * temporary buffer of size 1*16
80// *
81// * //param[in] pu2_iscal_mat
82// * Pointer to the inverse quantization matrix
83// *
84// * //returns  Void
85// *
86// * //remarks
87// *  None
88// *
89// *******************************************************************************
90// */
91//void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
92//                                    UWORD8 *pu1_pred,
93//                                    UWORD8 *pu1_out,
94//                                    WORD32 pred_strd,
95//                                    WORD32 out_strd,
96//                                    const UWORD16 *pu2_iscal_mat,
97//                                    const UWORD16 *pu2_weigh_mat,
98//                                    UWORD32 u4_qp_div_6,
99//                                    WORD32 *pi4_tmp,
100//                                    WORD32 iq_start_idx
101//                                    WORD16 *pi2_dc_ld_addr)
102//**************Variables Vs Registers*****************************************
103//x0 => *pi2_src
104//x1 => *pu1_pred
105//x2 => *pu1_out
106//w3 =>  pred_strd
107//w4 =>  out_strd
108//x5 => *pu2_iscal_mat
109//x6 => *pu2_weigh_mat
110//w7 =>  u4_qp_div_6
111//   =>  pi4_tmp
112//   =>  iq_start_idx
113//   =>  pi2_dc_ld_addr
//Only one shift is done in the horizontal inverse transform because:
//if u4_qp_div_6 is less than 4, the shift value is negative (a negative left shift, i.e. a right shift) and rnd_factor is non-zero;
//if u4_qp_div_6 is greater than 4, the shift value is positive (a left shift) and rnd_factor is 0
117
118    .global ih264_iquant_itrans_recon_4x4_av8
119ih264_iquant_itrans_recon_4x4_av8:
120
121    push_v_regs
122    sxtw      x3, w3
123    sxtw      x4, w4
124
125    dup       v30.4s, w7                //Populate the u4_qp_div_6 in Q15
126
127    ldr       w8, [sp, #72]             //Loads iq_start_idx
128    sxtw      x8, w8
129
130    ldr       x10, [sp, #80]            //Load alternate dc address
131
132    subs      x8, x8, #1                // if x8 == 1 => intra case , so result of subtraction is zero and z flag is set
133
134
135//=======================DEQUANT FROM HERE===================================
136
137    ld4       {v20.4h - v23.4h}, [x5]   // load pu2_iscal_mat[i], i =0..15
138    ld4       {v26.4h - v29.4h}, [x6]   // pu2_weigh_mat[i], i =0..15
139    ld4       {v16.4h - v19.4h}, [x0]   // pi2_src_tmp[i], i =0..15
140
141
142    mul       v20.4h, v20.4h, v26.4h    // x[i]=(scale[i] * dequant[i]) where i = 0..3
143    mul       v21.4h, v21.4h, v27.4h    // x[i]=(scale[i] * dequant[i]) where i = 4..7
144    mul       v22.4h, v22.4h, v28.4h    // x[i]=(scale[i] * dequant[i]) where i = 8..11
145    mul       v23.4h, v23.4h, v29.4h    // x[i]=(scale[i] * dequant[i]) where i = 12..14
146
147    smull     v0.4s, v16.4h, v20.4h     // q0  = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
148    smull     v2.4s, v17.4h, v21.4h     // q1  = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
149    smull     v4.4s, v18.4h, v22.4h     // q2  = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
150    smull     v6.4s, v19.4h, v23.4h     // q3  = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
151
152    sshl      v0.4s, v0.4s, v30.4s      // q0  = q[i] = (p[i] << (qp/6)) where i = 0..3
153    sshl      v2.4s, v2.4s, v30.4s      // q1  = q[i] = (p[i] << (qp/6)) where i = 4..7
154    sshl      v4.4s, v4.4s, v30.4s      // q2  = q[i] = (p[i] << (qp/6)) where i = 8..11
155    sshl      v6.4s, v6.4s, v30.4s      // q3  = q[i] = (p[i] << (qp/6)) where i = 12..15
156
157    sqrshrn   v0.4h, v0.4s, #0x4        // d0  = c[i] = ((q[i] + 32) >> 4) where i = 0..3
158    sqrshrn   v1.4h, v2.4s, #0x4        // d1  = c[i] = ((q[i] + 32) >> 4) where i = 4..7
159    sqrshrn   v2.4h, v4.4s, #0x4        // d2  = c[i] = ((q[i] + 32) >> 4) where i = 8..11
160    sqrshrn   v3.4h, v6.4s, #0x4        // d3  = c[i] = ((q[i] + 32) >> 4) where i = 12..15
161
162    bne       skip_loading_luma_dc_src
163    ld1       {v0.h}[0], [x10]          // loads signed halfword pi2_dc_ld_addr[0], if x8==1
164skip_loading_luma_dc_src:
165
166    //========= PROCESS IDCT FROM HERE =======
167    //Steps for Stage 1:
168    //------------------
169    ld1       {v30.s}[0], [x1], x3      // i row load pu1_pred buffer
170
171    sshr      v8.4h, v1.4h, #1          // d1>>1
172    sshr      v9.4h, v3.4h, #1          // d3>>1
173
174    add       v4.4h, v0.4h, v2.4h       // x0 = d0 + d2//
175    sub       v5.4h, v0.4h, v2.4h       // x1 = d0 - d2//
176    sub       v6.4h, v8.4h, v3.4h       // x2 = (d1 >> 1) -  d3//
177    add       v7.4h, v1.4h, v9.4h       // x3 =  d1  + (d3 >>  1)//
178
179    ld1       {v30.s}[1], [x1], x3      // ii row load pu1_pred buffer
180
181    add       v10.4h, v4.4h , v7.4h     // x0+x3
182    add       v11.4h, v5.4h , v6.4h     // x1+x2
183    sub       v12.4h, v5.4h , v6.4h     // x1-x2
184    sub       v13.4h, v4.4h , v7.4h
185
186    ld1       {v31.s}[0], [x1], x3      // iii row load pu1_pred buf
187
188
189    //Steps for Stage 2:
190    //transopose
191    trn1      v4.4h, v10.4h, v11.4h
192    trn2      v5.4h, v10.4h, v11.4h
193    trn1      v6.4h, v12.4h, v13.4h
194    trn2      v7.4h, v12.4h, v13.4h
195
196    trn1      v10.2s, v4.2s, v6.2s      // 0
197    trn1      v11.2s, v5.2s, v7.2s      // 8
198    trn2      v12.2s, v4.2s, v6.2s      // 4
199    trn2      v13.2s, v5.2s, v7.2s
200    //end transpose
201
202    sshr      v18.4h, v11.4h, #1        // q0>>1
203    sshr      v19.4h, v13.4h, #1        // q1>>1
204
205    add       v14.4h, v10.4h, v12.4h    // x0 = q0 + q2//
206    sub       v15.4h, v10.4h, v12.4h    // x1 = q0 - q2//
207    sub       v16.4h, v18.4h, v13.4h    // x2 = (q1 >> 1) -  q3//
208    add       v17.4h, v11.4h, v19.4h    // x3 = q1+ (q3 >> 3)//
209
210
211    ld1       {v31.s}[1], [x1], x3      // iv row load pu1_pred buffer
212
213    add       v20.4h, v14.4h, v17.4h    // x0 + x3
214    add       v21.4h, v15.4h, v16.4h    // x1 + x2
215    sub       v22.4h, v15.4h, v16.4h    // x1 - x2
216    sub       v23.4h, v14.4h, v17.4h    // x0 - x3
217
218    mov       v20.d[1], v21.d[0]
219    mov       v22.d[1], v23.d[0]
220
221    srshr     v20.8h, v20.8h, #6
222    srshr     v22.8h, v22.8h, #6
223
224    uaddw     v20.8h, v20.8h , v30.8b
225    uaddw     v22.8h, v22.8h , v31.8b
226
227    sqxtun    v0.8b, v20.8h
228    sqxtun    v1.8b, v22.8h
229
230    st1       {v0.s}[0], [x2], x4       //i row store the value
231    st1       {v0.s}[1], [x2], x4       //ii row store the value
232    st1       {v1.s}[0], [x2], x4       //iii row store the value
233    st1       {v1.s}[1], [x2]           //iv row store the value
234
235    pop_v_regs
236    ret
237
238
239///**
240// *******************************************************************************
241// *
242// * @brief
243// *  This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
244// *
245// * @par Description:
246// *  Performs inverse transform Ci4 and adds the residue to get the
247// *  reconstructed block
248// *
249// * @param[in] pi2_src
250// *  Input 4x4 coefficients
251// *
252// * @param[in] pu1_pred
253// *  Prediction 4x4 block
254// *
255// * @param[out] pu1_out
256// *  Output 4x4 block
257// *
258// * @param[in] u4_qp_div_6
259// *     QP
260// *
261// * @param[in] pu2_weigh_mat
262// * Pointer to weight matrix
263// *
264// * @param[in] pred_strd,
265// *  Prediction stride
266// *
267// * @param[in] out_strd
268// *  Output Stride
269// *
270// *@param[in] pi2_tmp
271// * temporary buffer of size 1*16
272// *
273// * @param[in] pu2_iscal_mat
274// * Pointer to the inverse quantization matrix
275// *
276// * @returns  Void
277// *
278// * @remarks
279// *  None
280// *
281// *******************************************************************************
282// */
283//void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
284//                                          UWORD8 *pu1_pred,
285//                                          UWORD8 *pu1_out,
286//                                          WORD32 pred_strd,
287//                                          WORD32 out_strd,
288//                                          const UWORD16 *pu2_iscal_mat,
289//                                          const UWORD16 *pu2_weigh_mat,
290//                                          UWORD32 u4_qp_div_6,
291//                                          WORD32 *pi4_tmp
292//                                          WORD16 *pi2_dc_src)
293//**************Variables Vs Registers*****************************************
294//x0 => *pi2_src
295//x1 => *pu1_pred
296//x2 => *pu1_out
297//w3 =>  pred_strd
298//w4 =>  out_strd
299//x5 => *pu2_iscal_mat
300//x6 => *pu2_weigh_mat
301//w7 =>  u4_qp_div_6
302//sp =>  pi4_tmp
303//sp#8 => *pi2_dc_src
304
305    .global ih264_iquant_itrans_recon_chroma_4x4_av8
306ih264_iquant_itrans_recon_chroma_4x4_av8:
307
308//VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
309//If the macro value changes need to change the instruction according to it.
310//Only one shift is done in horizontal inverse because,
311//if u4_qp_div_6 is lesser than 4 then shift value will be neagative and do negative left shift, in this case rnd_factor has value
312//if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0
313
314//at the end of the fucntion, we could have moved 64 bits into heigher 64 bits of register and done further processing
315//but it seem to give only reduce the number of instruction by 1. [Since a15 we saw add and sub to be very high throughput
316//all instructions were taken as equal
317
318    //reduce sp by 64
319    push_v_regs
320    sxtw      x3, w3
321    sxtw      x4, w4
322
323    dup       v30.4s, w7                //Populate the u4_qp_div_6 in Q15
324
325    //was at sp + 8, hence now at sp+64+8 = sp+72
326    ldr       x10, [sp, #72]            //Load alternate dc address
327
328//=======================DEQUANT FROM HERE===================================
329
330    ld4       {v20.4h - v23.4h}, [x5]   // load pu2_iscal_mat[i], i =0..15
331    ld4       {v26.4h - v29.4h}, [x6]   // pu2_weigh_mat[i], i =0..15
332    ld4       {v16.4h - v19.4h}, [x0]   // pi2_src_tmp[i], i =0..15
333
334
335    mul       v20.4h, v20.4h, v26.4h    // x[i]=(scale[i] * dequant[i]) where i = 0..3
336    mul       v21.4h, v21.4h, v27.4h    // x[i]=(scale[i] * dequant[i]) where i = 4..7
337    mul       v22.4h, v22.4h, v28.4h    // x[i]=(scale[i] * dequant[i]) where i = 8..11
338    mul       v23.4h, v23.4h, v29.4h    // x[i]=(scale[i] * dequant[i]) where i = 12..14
339
340    smull     v0.4s, v16.4h, v20.4h     // q0  = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
341    smull     v2.4s, v17.4h, v21.4h     // q1  = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
342    smull     v4.4s, v18.4h, v22.4h     // q2  = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
343    smull     v6.4s, v19.4h, v23.4h     // q3  = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
344
345    sshl      v0.4s, v0.4s, v30.4s      // q0  = q[i] = (p[i] << (qp/6)) where i = 0..3
346    sshl      v2.4s, v2.4s, v30.4s      // q1  = q[i] = (p[i] << (qp/6)) where i = 4..7
347    sshl      v4.4s, v4.4s, v30.4s      // q2  = q[i] = (p[i] << (qp/6)) where i = 8..11
348    sshl      v6.4s, v6.4s, v30.4s      // q3  = q[i] = (p[i] << (qp/6)) where i = 12..15
349
350    sqrshrn   v0.4h, v0.4s, #0x4        // d0  = c[i] = ((q[i] + 32) >> 4) where i = 0..3
351    sqrshrn   v1.4h, v2.4s, #0x4        // d1  = c[i] = ((q[i] + 32) >> 4) where i = 4..7
352    sqrshrn   v2.4h, v4.4s, #0x4        // d2  = c[i] = ((q[i] + 32) >> 4) where i = 8..11
353    sqrshrn   v3.4h, v6.4s, #0x4        // d3  = c[i] = ((q[i] + 32) >> 4) where i = 12..15
354
355    ld1       {v0.h}[0], [x10]          // loads signed halfword pi2_dc_src[0]
356
357    //========= PROCESS IDCT FROM HERE =======
358    //Steps for Stage 1:
359    //------------------
360
361    sshr      v8.4h, v1.4h, #1          // d1>>1
362    sshr      v9.4h, v3.4h, #1          // d3>>1
363
364    add       v4.4h, v0.4h, v2.4h       // x0 = d0 + d2//
365    sub       v5.4h, v0.4h, v2.4h       // x1 = d0 - d2//
366    sub       v6.4h, v8.4h, v3.4h       // x2 = (d1 >> 1) -  d3//
367    add       v7.4h, v1.4h, v9.4h       // x3 =  d1  + (d3 >>  1)//
368
369
370    add       v10.4h, v4.4h , v7.4h     // x0+x3
371    add       v11.4h, v5.4h , v6.4h     // x1+x2
372    sub       v12.4h, v5.4h , v6.4h     // x1-x2
373    sub       v13.4h, v4.4h , v7.4h
374
375    ld1       {v26.8b}, [x1], x3        // i row load pu1_pred buffer
376    ld1       {v27.8b}, [x1], x3        // ii row load pu1_pred buffer
377    ld1       {v28.8b}, [x1], x3        // iii row load pu1_pred buf
378    ld1       {v29.8b}, [x1], x3        // iv row load pu1_pred buffer
379
380    //Steps for Stage 2:
381    //transopose
382    trn1      v4.4h, v10.4h, v11.4h
383    trn2      v5.4h, v10.4h, v11.4h
384    trn1      v6.4h, v12.4h, v13.4h
385    trn2      v7.4h, v12.4h, v13.4h
386
387    trn1      v10.2s, v4.2s, v6.2s      // 0
388    trn1      v11.2s, v5.2s, v7.2s      // 8
389    trn2      v12.2s, v4.2s, v6.2s      // 4
390    trn2      v13.2s, v5.2s, v7.2s
391    //end transpose
392
393    sshr      v18.4h, v11.4h, #1        // q0>>1
394    sshr      v19.4h, v13.4h, #1        // q1>>1
395
396    add       v14.4h, v10.4h, v12.4h    // x0 = q0 + q2//
397    sub       v15.4h, v10.4h, v12.4h    // x1 = q0 - q2//
398    sub       v16.4h, v18.4h, v13.4h    // x2 = (q1 >> 1) -  q3//
399    add       v17.4h, v11.4h, v19.4h    // x3 = q1+ (q3 >> 3)//
400
401    //Backup the output addr
402    mov       x0, x2
403
404    //load outpt buufer for interleaving
405    ld1       {v10.8b}, [x2], x4
406    ld1       {v11.8b}, [x2], x4
407    ld1       {v12.8b}, [x2], x4
408    ld1       {v13.8b}, [x2]
409
410    add       v20.4h, v14.4h, v17.4h    // x0 + x3
411    add       v21.4h, v15.4h, v16.4h    // x1 + x2
412    sub       v22.4h, v15.4h, v16.4h    // x1 - x2
413    sub       v23.4h, v14.4h, v17.4h    // x0 - x3
414
415    srshr     v20.4h, v20.4h, #6
416    srshr     v21.4h, v21.4h, #6
417    srshr     v22.4h, v22.4h, #6
418    srshr     v23.4h, v23.4h, #6
419
420    //nop       v30.8b                            //dummy for deinterleaving
421    movi      v31.4h, #0x00ff           //mask for interleaving [copy lower 8 bits]
422
423    //Extract u/v plane from interleaved data
424    uzp1      v26.8b, v26.8b, v30.8b
425    uzp1      v27.8b, v27.8b, v30.8b
426    uzp1      v28.8b, v28.8b, v30.8b
427    uzp1      v29.8b, v29.8b, v30.8b
428
429    uaddw     v20.8h, v20.8h, v26.8b
430    uaddw     v21.8h, v21.8h, v27.8b
431    uaddw     v22.8h, v22.8h, v28.8b
432    uaddw     v23.8h, v23.8h, v29.8b
433
434    sqxtun    v0.8b, v20.8h
435    sqxtun    v1.8b, v21.8h
436    sqxtun    v2.8b, v22.8h
437    sqxtun    v3.8b, v23.8h
438
439    //long the output so that we have 0 at msb and value at lsb
440    uxtl      v6.8h, v0.8b
441    uxtl      v7.8h, v1.8b
442    uxtl      v8.8h, v2.8b
443    uxtl      v9.8h, v3.8b
444
445    //select lsbs from proceesd data and msbs from pu1_out loaded data
446    bit       v10.8b, v6.8b, v31.8b
447    bit       v11.8b, v7.8b, v31.8b
448    bit       v12.8b, v8.8b, v31.8b
449    bit       v13.8b, v9.8b, v31.8b
450
451    //store the interleaved result
452    st1       {v10.8b}, [x0], x4
453    st1       {v11.8b}, [x0], x4
454    st1       {v12.8b}, [x0], x4
455    st1       {v13.8b}, [x0]
456
457    pop_v_regs
458    ret
459
460///*
461// *******************************************************************************
462// *
463// * //brief
// *  This function performs inverse quant and Inverse transform type Ci8 for 8*8 block
465// *
466// * //par Description:
467// *  Performs inverse transform Ci8 and adds the residue to get the
468// *  reconstructed block
469// *
// * //param[in] pi2_src
// *  Input 8x8 coefficients
// *
// * //param[in] pu1_pred
// *  Prediction 8x8 block
// *
// * //param[out] pu1_out
// *  Output 8x8 block
478// *
479// * //param[in] u4_qp_div_6
480// *     QP
481// *
482// * //param[in] pu2_weigh_mat
483// * Pointer to weight matrix
484// *
485// * //param[in] pred_strd,
486// *  Prediction stride
487// *
488// * //param[in] out_strd
489// *  Output Stride
490// *
491// *//param[in] pi2_tmp
492// * temporary buffer of size 1*64
493// *
494// * //param[in] pu2_iscal_mat
495// * Pointer to the inverse quantization matrix
496// *
497// * //returns  Void
498// *
499// * //remarks
500// *  None
501// *
502// *******************************************************************************
503// */
504//void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
505//                                   UWORD8 *pu1_pred,
506//                                   UWORD8 *pu1_out,
507//                                   WORD32 pred_strd,
508//                                   WORD32 out_strd,
509//                                   const UWORD16 *pu2_iscal_mat,
510//                                   const UWORD16 *pu2_weigh_mat,
511//                                   UWORD32 u4_qp_div_6,
512//                                   WORD32 *pi4_tmp,
513//                                   WORD32 iq_start_idx
514//                                   WORD16 *pi2_dc_ld_addr)
515//**************Variables Vs Registers*****************************************
516//x0       => *pi2_src
517//x1       => *pu1_pred
518//x2       => *pu1_out
519//w3       =>  pred_strd
520//w4       =>  out_strd
521//x5       =>  *pu2_iscal_mat
522//x6       =>  *pu2_weigh_mat
523//w7       =>  u4_qp_div_6
524//NOT USED =>  pi4_tmp
525//NOT USED =>  iq_start_idx
526//NOT USED =>  pi2_dc_ld_addr
527
528    .global ih264_iquant_itrans_recon_8x8_av8
529ih264_iquant_itrans_recon_8x8_av8:
530
531    push_v_regs
532    sxtw      x3, w3
533    sxtw      x4, w4
534
535    ld1       {v8.8h -v11.8h}, [x5], #64
536    ld1       {v12.8h-v15.8h}, [x5]
537
538    ld1       {v16.8h -v19.8h}, [x6], #64
539    ld1       {v20.8h -v23.8h}, [x6]
540
541    mov       x8, #16
542    ld1       {v0.8h}, [x0], x8
543    ld1       {v1.8h}, [x0], x8
544    ld1       {v2.8h}, [x0], x8
545    ld1       {v3.8h}, [x0], x8
546    ld1       {v4.8h}, [x0], x8
547    ld1       {v5.8h}, [x0], x8
548    ld1       {v6.8h}, [x0], x8
549    ld1       {v7.8h}, [x0]
550
551    mul       v8.8h, v8.8h, v16.8h
552    mul       v9.8h, v9.8h, v17.8h
553    mul       v10.8h, v10.8h, v18.8h
554    mul       v11.8h, v11.8h, v19.8h
555    mul       v12.8h, v12.8h, v20.8h
556    mul       v13.8h, v13.8h, v21.8h
557    mul       v14.8h, v14.8h, v22.8h
558    mul       v15.8h, v15.8h, v23.8h
559
560    smull     v16.4s, v0.4h, v8.4h
561    smull2    v17.4s, v0.8h, v8.8h
562    smull     v18.4s, v1.4h, v9.4h
563    smull2    v19.4s, v1.8h, v9.8h
564    smull     v20.4s, v2.4h, v10.4h
565    smull2    v21.4s, v2.8h, v10.8h
566    smull     v22.4s, v3.4h, v11.4h
567    smull2    v23.4s, v3.8h, v11.8h
568    smull     v24.4s, v4.4h, v12.4h
569    smull2    v25.4s, v4.8h, v12.8h
570    smull     v26.4s, v5.4h, v13.4h
571    smull2    v27.4s, v5.8h, v13.8h
572    smull     v28.4s, v6.4h, v14.4h
573    smull2    v29.4s, v6.8h, v14.8h
574    smull     v30.4s, v7.4h, v15.4h
575    smull2    v31.4s, v7.8h, v15.8h
576
577    dup       v0.4s, w7
578
579    sshl      v16.4s, v16.4s, v0.4s
580    sshl      v17.4s, v17.4s, v0.4s
581    sshl      v18.4s, v18.4s, v0.4s
582    sshl      v19.4s, v19.4s, v0.4s
583    sshl      v20.4s, v20.4s, v0.4s
584    sshl      v21.4s, v21.4s, v0.4s
585    sshl      v22.4s, v22.4s, v0.4s
586    sshl      v23.4s, v23.4s, v0.4s
587    sshl      v24.4s, v24.4s, v0.4s
588    sshl      v25.4s, v25.4s, v0.4s
589    sshl      v26.4s, v26.4s, v0.4s
590    sshl      v27.4s, v27.4s, v0.4s
591    sshl      v28.4s, v28.4s, v0.4s
592    sshl      v29.4s, v29.4s, v0.4s
593    sshl      v30.4s, v30.4s, v0.4s
594    sshl      v31.4s, v31.4s, v0.4s
595
596    sqrshrn   v0.4h, v16.4s, #6
597    sqrshrn2  v0.8h, v17.4s, #6
598    sqrshrn   v1.4h, v18.4s, #6
599    sqrshrn2  v1.8h, v19.4s, #6
600    sqrshrn   v2.4h, v20.4s, #6
601    sqrshrn2  v2.8h, v21.4s, #6
602    sqrshrn   v3.4h, v22.4s, #6
603    sqrshrn2  v3.8h, v23.4s, #6
604    sqrshrn   v4.4h, v24.4s, #6
605    sqrshrn2  v4.8h, v25.4s, #6
606    sqrshrn   v5.4h, v26.4s, #6
607    sqrshrn2  v5.8h, v27.4s, #6
608    sqrshrn   v6.4h, v28.4s, #6
609    sqrshrn2  v6.8h, v29.4s, #6
610    sqrshrn   v7.4h, v30.4s, #6
611    sqrshrn2  v7.8h, v31.4s, #6
612
613    //loop counter
614    mov       x8, #2
615//1x8 transofORM
616trans_1x8_1d:
617
618    //transpose 8x8
619    trn1      v8.8h, v0.8h, v1.8h
620    trn2      v9.8h, v0.8h, v1.8h
621    trn1      v10.8h, v2.8h, v3.8h
622    trn2      v11.8h, v2.8h, v3.8h
623    trn1      v12.8h, v4.8h, v5.8h
624    trn2      v13.8h, v4.8h, v5.8h
625    trn1      v14.8h, v6.8h, v7.8h
626    trn2      v15.8h, v6.8h, v7.8h
627
628    trn1      v0.4s, v8.4s, v10.4s
629    trn2      v2.4s, v8.4s, v10.4s
630    trn1      v1.4s, v9.4s, v11.4s
631    trn2      v3.4s, v9.4s, v11.4s
632    trn1      v4.4s, v12.4s, v14.4s
633    trn2      v6.4s, v12.4s, v14.4s
634    trn1      v5.4s, v13.4s, v15.4s
635    trn2      v7.4s, v13.4s, v15.4s
636
637    trn1      v8.2d, v0.2d, v4.2d       //0
638    trn2      v12.2d, v0.2d, v4.2d      //1
639    trn1      v9.2d, v1.2d, v5.2d       //2
640    trn2      v13.2d, v1.2d, v5.2d      //3
641    trn1      v10.2d, v2.2d, v6.2d      //4
642    trn2      v14.2d, v2.2d, v6.2d      //5
643    trn1      v11.2d, v3.2d, v7.2d      //6
644    trn2      v15.2d, v3.2d, v7.2d      //7
645
646    // 1 3 5 6 7
647    sshr      v16.8h, v9.8h, #1         //(pi2_tmp_ptr[1] >> 1)
648    sshr      v17.8h, v10.8h, #1        //(pi2_tmp_ptr[2] >> 1)
649    sshr      v18.8h, v11.8h, #1        //(pi2_tmp_ptr[3] >> 1)
650    sshr      v19.8h, v13.8h, #1        //(pi2_tmp_ptr[5] >> 1)
651    sshr      v20.8h, v14.8h, #1        //(pi2_tmp_ptr[6] >> 1)
652    sshr      v21.8h, v15.8h, #1        //(pi2_tmp_ptr[7] >> 1)
653
654    add       v0.8h, v8.8h, v12.8h      // i_y0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[4] );
655    sub       v2.8h, v8.8h, v12.8h      // i_y2 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[4] );
656
657    sub       v4.8h, v17.8h, v14.8h     //i_y4 = ((pi2_tmp_ptr[2] >> 1) - pi2_tmp_ptr[6] );
658    add       v6.8h, v10.8h, v20.8h     //i_y6 = (pi2_tmp_ptr[2] + (pi2_tmp_ptr[6] >> 1));
659
660    //-w3 + w5
661    ssubl     v22.4s, v13.4h, v11.4h
662    ssubl2    v23.4s, v13.8h, v11.8h
663    //w3 + w5
664    saddl     v24.4s, v13.4h, v11.4h
665    saddl2    v25.4s, v13.8h, v11.8h
666    //-w1 + w7
667    ssubl     v26.4s, v15.4h, v9.4h
668    ssubl2    v27.4s, v15.8h, v9.8h
669    //w1 + w7
670    saddl     v28.4s, v15.4h, v9.4h
671    saddl2    v29.4s, v15.8h, v9.8h
672
673    //-w3 + w5 - w7
674    ssubw     v22.4s, v22.4s, v15.4h
675    ssubw2    v23.4s, v23.4s, v15.8h
676    //w3 + w5 + w1
677    saddw     v24.4s, v24.4s, v9.4h
678    saddw2    v25.4s, v25.4s, v9.8h
679    //-w1 + w7 + w5
680    saddw     v26.4s, v26.4s, v13.4h
681    saddw2    v27.4s, v27.4s, v13.8h
682    //w1 + w7 - w3
683    ssubw     v28.4s, v28.4s, v11.4h
684    ssubw2    v29.4s, v29.4s, v11.8h
685
686    //-w3 + w5 - w7 - (w7 >> 1)
687    ssubw     v22.4s, v22.4s, v21.4h
688    ssubw2    v23.4s, v23.4s, v21.8h
689    //w3 + w5 + w1 + (w1 >> 1)
690    saddw     v24.4s, v24.4s, v16.4h
691    saddw2    v25.4s, v25.4s, v16.8h
692    //-w1 + w7 + w5 + (w5 >> 1)
693    saddw     v26.4s, v26.4s, v19.4h
694    saddw2    v27.4s, v27.4s, v19.8h
695    //w1 + w7 - w3 - (w3 >> 1)
696    ssubw     v28.4s, v28.4s, v18.4h
697    ssubw2    v29.4s, v29.4s, v18.8h
698
699    xtn       v1.4h, v22.4s
700    xtn2      v1.8h, v23.4s
701    xtn       v3.4h, v28.4s
702    xtn2      v3.8h, v29.4s
703    xtn       v5.4h, v26.4s
704    xtn2      v5.8h, v27.4s
705    xtn       v7.4h, v24.4s
706    xtn2      v7.8h, v25.4s
707
708    sshr      v16.8h, v1.8h, #2         //(y1 >> 2)
709    sshr      v17.8h, v3.8h, #2         //(y3 >> 2)
710    sshr      v18.8h, v5.8h, #2         //(y5 >> 2)
711    sshr      v19.8h, v7.8h, #2         //(y7 >> 2)
712
713    add       v8.8h, v0.8h, v6.8h
714    add       v9.8h, v1.8h, v19.8h
715    add       v10.8h, v2.8h, v4.8h
716    add       v11.8h, v3.8h, v18.8h
717    sub       v12.8h, v2.8h, v4.8h
718    sub       v13.8h, v17.8h, v5.8h
719    sub       v14.8h, v0.8h, v6.8h
720    sub       v15.8h, v7.8h, v16.8h
721
722    add       v0.8h, v8.8h, v15.8h
723    add       v1.8h, v10.8h, v13.8h
724    add       v2.8h, v12.8h, v11.8h
725    add       v3.8h, v14.8h, v9.8h
726    sub       v4.8h, v14.8h, v9.8h
727    sub       v5.8h, v12.8h, v11.8h
728    sub       v6.8h, v10.8h, v13.8h
729    sub       v7.8h, v8.8h, v15.8h
730
731    subs      x8, x8, #1
732    bne       trans_1x8_1d
733
734    ld1       {v22.8b}, [x1], x3
735    ld1       {v23.8b}, [x1], x3
736    ld1       {v24.8b}, [x1], x3
737    ld1       {v25.8b}, [x1], x3
738    ld1       {v26.8b}, [x1], x3
739    ld1       {v27.8b}, [x1], x3
740    ld1       {v28.8b}, [x1], x3
741    ld1       {v29.8b}, [x1]
742
743    srshr     v0.8h, v0.8h, #6
744    srshr     v1.8h, v1.8h, #6
745    srshr     v2.8h, v2.8h, #6
746    srshr     v3.8h, v3.8h, #6
747    srshr     v4.8h, v4.8h, #6
748    srshr     v5.8h, v5.8h, #6
749    srshr     v6.8h, v6.8h, #6
750    srshr     v7.8h, v7.8h, #6
751
752    uaddw     v0.8h, v0.8h, v22.8b
753    uaddw     v1.8h, v1.8h, v23.8b
754    uaddw     v2.8h, v2.8h, v24.8b
755    uaddw     v3.8h, v3.8h, v25.8b
756    uaddw     v4.8h, v4.8h, v26.8b
757    uaddw     v5.8h, v5.8h, v27.8b
758    uaddw     v6.8h, v6.8h, v28.8b
759    uaddw     v7.8h, v7.8h, v29.8b
760
761    sqxtun    v0.8b, v0.8h
762    sqxtun    v1.8b, v1.8h
763    sqxtun    v2.8b, v2.8h
764    sqxtun    v3.8b, v3.8h
765    sqxtun    v4.8b, v4.8h
766    sqxtun    v5.8b, v5.8h
767    sqxtun    v6.8b, v6.8h
768    sqxtun    v7.8b, v7.8h
769
770    st1       {v0.8b}, [x2], x4
771    st1       {v1.8b}, [x2], x4
772    st1       {v2.8b}, [x2], x4
773    st1       {v3.8b}, [x2], x4
774    st1       {v4.8b}, [x2], x4
775    st1       {v5.8b}, [x2], x4
776    st1       {v6.8b}, [x2], x4
777    st1       {v7.8b}, [x2]
778
779    pop_v_regs
780    ret
781
782
783
784
785