1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21// *******************************************************************************
22// * @file
23// *  ih264_ihadamard_scaling_av8.s
24// *
25// * @brief
26// *  Contains function definitions for inverse hadamard transform on 4x4 DC outputs
27// *  of 16x16 intra-prediction
28// *
29// * @author
30// *  Mohit
31// *
32// * @par List of Functions:
33// *  - ih264_ihadamard_scaling_4x4_av8()
34// *
35// * @remarks
36// *  None
37// *
38.include "ih264_neon_macros.s"
39
40// *******************************************************************************
41// */
// * @brief This function performs a 4x4 inverse Hadamard transform on the 4x4 DC coefficients
// * of a 16x16 intra-predicted macroblock, and then scales the result.
45// *
46// * @par Description:
47// *  The DC coefficients pass through a 2-stage inverse hadamard transform.
// *  The inverse-transformed content is then scaled based on the Qp value.
49// *
50// * @param[in] pi2_src
51// *  input 4x4 block of DC coefficients
52// *
53// * @param[out] pi2_out
54// *  output 4x4 block
55// *
56// * @param[in] pu2_iscal_mat
57// *  pointer to scaling list
58// *
59// * @param[in] pu2_weigh_mat
60// *  pointer to weight matrix
61// *
62// * @param[in] u4_qp_div_6
63// *  Floor (qp/6)
64// *
65// * @param[in] pi4_tmp
66// * temporary buffer of size 1*16
67// *
68// * @returns none
69// *
70// * @remarks none
71// *
72// *******************************************************************************
73// */
74// *
75// *******************************************************************************
76// */
77// void ih264_ihadamard_scaling_4x4(word16* pi2_src,
78//        word16* pi2_out,
79//        const uword16 *pu2_iscal_mat,
80//        const uword16 *pu2_weigh_mat,
81//        uword32 u4_qp_div_6,
82//        word32* pi4_tmp)
83//**************variables vs registers*****************************************
84//x0 => *pi2_src
85//x1 => *pi2_out
86//x2 => *pu2_iscal_mat
87//x3 => *pu2_weigh_mat
88//x4=>   u4_qp_div_6
89
90.text
91.p2align 2
92
93    .global ih264_ihadamard_scaling_4x4_av8
94ih264_ihadamard_scaling_4x4_av8:
95
96//only one shift is done in horizontal inverse because,
97//if u4_qp_div_6 is lesser than 4 then shift value will be neagative and do negative left shift, in this case rnd_factor has value
98//if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0
99    push_v_regs
100
101//=======================inverse hadamard transform================================
102
103    ld4       {v0.4h-v3.4h}, [x0]       //load x4,x5,x6,x7
104
105    dup       v14.4s, w4                // populate the u4_qp_div_6
106    ld1       {v15.h}[0], [x3]          // pu2_weigh_mat
107    ld1       {v16.h}[0], [x2]          //pu2_iscal_mat
108
109    saddl     v4.4s, v0.4h, v3.4h       //x0 = x4 + x7
110    saddl     v5.4s, v1.4h, v2.4h       //x1 = x5 + x6
111    ssubl     v6.4s, v1.4h, v2.4h       //x2 = x5 - x6
112    ssubl     v7.4s, v0.4h, v3.4h       //x3 = x4 - x7
113
114    add       v0.4s, v4.4s, v5.4s       //pi4_tmp_ptr[0] = x0 + x1
115    add       v1.4s, v7.4s, v6.4s       //pi4_tmp_ptr[1] = x3 + x2
116    sub       v2.4s, v4.4s, v5.4s       //pi4_tmp_ptr[2] = x0 - x1
117    sub       v3.4s, v7.4s, v6.4s       //pi4_tmp_ptr[3] = x3 - x2
118
119    umull     v15.4s, v15.4h, v16.4h
120    dup       v15.4s, v15.s[0]          //pu2_weigh_mat[0]*pu2_iscal_mat[0]
121
122    //transpose
123    trn1      v4.4s, v0.4s, v1.4s
124    trn2      v5.4s, v0.4s, v1.4s
125    trn1      v6.4s, v2.4s, v3.4s
126    trn2      v7.4s, v2.4s, v3.4s
127
128    trn1      v0.2d, v4.2d, v6.2d
129    trn2      v2.2d, v4.2d, v6.2d
130    trn1      v1.2d, v5.2d, v7.2d
131    trn2      v3.2d, v5.2d, v7.2d
132    //end transpose
133
134    add       v4.4s, v0.4s, v3.4s       //x0 = x4+x7
135    add       v5.4s, v1.4s, v2.4s       //x1 = x5+x6
136    sub       v6.4s, v1.4s, v2.4s       //x2 = x5-x6
137    sub       v7.4s, v0.4s, v3.4s       //x3 = x4-x7
138
139    add       v0.4s, v4.4s, v5.4s       //pi4_tmp_ptr[0] = x0 + x1
140    add       v1.4s, v7.4s, v6.4s       //pi4_tmp_ptr[1] = x3 + x2
141    sub       v2.4s, v4.4s, v5.4s       //pi4_tmp_ptr[2] = x0 - x1
142    sub       v3.4s, v7.4s, v6.4s       //pi4_tmp_ptr[3] = x3 - x2
143
144    mul       v0.4s, v0.4s, v15.4s      // q0  = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
145    mul       v1.4s, v1.4s, v15.4s      // q1  = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
146    mul       v2.4s, v2.4s, v15.4s      // q2  = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
147    mul       v3.4s, v3.4s, v15.4s      // q3  = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
148
149    sshl      v0.4s, v0.4s, v14.4s      // q0  = q[i] = (p[i] << (qp/6)) where i = 0..3
150    sshl      v1.4s, v1.4s, v14.4s      // q1  = q[i] = (p[i] << (qp/6)) where i = 4..7
151    sshl      v2.4s, v2.4s, v14.4s      // q2  = q[i] = (p[i] << (qp/6)) where i = 8..11
152    sshl      v3.4s, v3.4s, v14.4s      // q3  = q[i] = (p[i] << (qp/6)) where i = 12..15
153
154    sqrshrn   v0.4h, v0.4s, #6          // d0  = c[i] = ((q[i] + 32) >> 4) where i = 0..3
155    sqrshrn   v1.4h, v1.4s, #6          // d1  = c[i] = ((q[i] + 32) >> 4) where i = 4..7
156    sqrshrn   v2.4h, v2.4s, #6          // d2  = c[i] = ((q[i] + 32) >> 4) where i = 8..11
157    sqrshrn   v3.4h, v3.4s, #6          // d3  = c[i] = ((q[i] + 32) >> 4) where i = 12..15
158
159    st1       {v0.4h-v3.4h}, [x1]       //store the result
160
161    pop_v_regs
162    ret
163
164
165// *******************************************************************************
166// */
167// * @brief This function performs a 2x2 inverse hadamard transform for chroma block
168// *
169// * @par Description:
170// *  The DC coefficients pass through a 2-stage inverse hadamard transform.
// *  The inverse-transformed content is then scaled based on the Qp value.
// *  The DC blocks of both U and V are processed.
173// *
174// * @param[in] pi2_src
// *  input 1x8 block of coeffs. The first 4 are from U and the next 4 from V
176// *
177// * @param[out] pi2_out
178// *  output 1x8 block
179// *
180// * @param[in] pu2_iscal_mat
181// *  pointer to scaling list
182// *
183// * @param[in] pu2_weigh_mat
184// *  pointer to weight matrix
185// *
186// * @param[in] u4_qp_div_6
187// *  Floor (qp/6)
188// *
189// * @returns none
190// *
191// * @remarks none
192// *
193// *******************************************************************************
194// */
195// *
196// *******************************************************************************
197// */
198// void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
199//                                  WORD16* pi2_out,
200//                                  const UWORD16 *pu2_iscal_mat,
201//                                  const UWORD16 *pu2_weigh_mat,
202//                                  UWORD32 u4_qp_div_6,
203
204    .global ih264_ihadamard_scaling_2x2_uv_av8
205ih264_ihadamard_scaling_2x2_uv_av8:
206
207//Registers used
208//   x0 : *pi2_src
209//   x1 : *pi2_out
210//   x2 : *pu2_iscal_mat
211//   x3 : *pu2_weigh_mat
212//   x4 : u4_qp_div_6
213    push_v_regs
214    ld1       {v26.h}[0], [x2]
215    ld1       {v27.h}[0], [x3]
216
217    sub       w4, w4, #5                //qp/6 - 4
218    dup       v28.4s, w4                //load qp/6
219
220    ld2       {v0.4h, v1.4h}, [x0]      //load 8 dc coeffs
221                                        //i2_x4,i2_x6,i2_y4,i1_y6 -> d0
222                                        //i2_x5,i2_x7,i2_y5,i1_y6 -> d1
223
224    saddl     v2.4s, v0.4h, v1.4h       //i4_x0 = i4_x4 + i4_x5;...x2
225    ssubl     v4.4s, v0.4h, v1.4h       //i4_x1 = i4_x4 - i4_x5;...x3
226
227    umull     v30.4s, v26.4h, v27.4h    //pu2_iscal_mat[0]*pu2_weigh_mat[0]
228    dup       v30.4s, v30.s[0]
229
230    trn1      v0.4s, v2.4s, v4.4s
231    trn2      v1.4s, v2.4s, v4.4s       //i4_x0 i4_x1 -> q1
232
233    add       v2.4s, v0.4s, v1.4s       //i4_x4 = i4_x0+i4_x2;.. i4_x5
234    sub       v3.4s, v0.4s, v1.4s       //i4_x6 = i4_x0-i4_x2;.. i4_x7
235
236    mul       v2.4s, v2.4s, v30.4s
237    mul       v3.4s, v3.4s, v30.4s
238
239    sshl      v2.4s, v2.4s, v28.4s
240    sshl      v3.4s, v3.4s, v28.4s
241
242    xtn       v0.4h, v2.4s              //i4_x4 i4_x5 i4_y4 i4_y5
243    xtn       v1.4h, v3.4s              //i4_x6 i4_x7 i4_y6 i4_y7
244
245    st2       {v0.4s-v1.4s}, [x1]
246    pop_v_regs
247    ret
248
249
250
251