//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// * ih264_ihadamard_scaling_av8.s
// *
// * @brief
// * Contains function definitions for inverse hadamard transform on 4x4 DC outputs
// * of 16x16 intra-prediction
// *
// * @author
// * Mohit
// *
// * @par List of Functions:
// * - ih264_ihadamard_scaling_4x4_av8()
// * - ih264_ihadamard_scaling_2x2_uv_av8()
// *
// * @remarks
// * None
// *
.include "ih264_neon_macros.s"

// *******************************************************************************
// */
// * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
// * of a 16x16 intra prediction macroblock, and then performs scaling.
// *
// * @par Description:
// * The DC coefficients pass through a 2-stage inverse hadamard transform.
// * The inverse transformed content is then scaled based on the Qp value.
// *
// * @param[in] pi2_src
// * input 4x4 block of DC coefficients (sixteen 16-bit values, read with one ld4)
// *
// * @param[out] pi2_out
// * output 4x4 block (sixteen 16-bit values, written with one st1)
// *
// * @param[in] pu2_iscal_mat
// * pointer to scaling list; only element [0] is read
// *
// * @param[in] pu2_weigh_mat
// * pointer to weight matrix; only element [0] is read
// *
// * @param[in] u4_qp_div_6
// * Floor (qp/6); broadcast and used as the per-lane shift count
// *
// * @param[in] pi4_tmp
// * temporary buffer of size 1*16; not referenced by this NEON implementation
// *
// * @returns none
// *
// * @remarks none
// *
// *******************************************************************************
// */
// void ih264_ihadamard_scaling_4x4(word16* pi2_src,
//                                  word16* pi2_out,
//                                  const uword16 *pu2_iscal_mat,
//                                  const uword16 *pu2_weigh_mat,
//                                  uword32 u4_qp_div_6,
//                                  word32* pi4_tmp)
//
//************** variables vs registers ****************************************
// x0       => *pi2_src
// x1       => *pi2_out
// x2       => *pu2_iscal_mat
// x3       => *pu2_weigh_mat
// w4       => u4_qp_div_6
// v0 - v3  => block data (input columns, then transformed rows, then results)
// v4 - v7  => butterfly / transpose temporaries
// v14      => u4_qp_div_6 replicated into four 32-bit lanes
// v15      => pu2_weigh_mat[0] * pu2_iscal_mat[0] replicated into four lanes
// v16      => pu2_iscal_mat[0] in lane 0
//
// Per output coefficient the code computes
//     out = sat16( (((h * scale) << u4_qp_div_6) + 32) >> 6 )
// where h is the 2-D inverse hadamard value and the "+32, >>6, saturate"
// part is done by a single sqrshrn #6.

.text
.p2align 2

    .global ih264_ihadamard_scaling_4x4_av8
ih264_ihadamard_scaling_4x4_av8:

    push_v_regs                                         // v14/v15 are written below

//======================= scale factor and shift count ===========================

    ld1       {v16.h}[0], [x2]                          // lane 0 <- pu2_iscal_mat[0]
    ld1       {v15.h}[0], [x3]                          // lane 0 <- pu2_weigh_mat[0]
    dup       v14.4s, w4                                // shift count in every lane
    umull     v15.4s, v15.4h, v16.4h                    // only lane 0 of the product is meaningful
    dup       v15.4s, v15.s[0]                          // scale = weigh[0] * iscal[0], all lanes

//======================= first pass: one row per lane ===========================

    ld4       {v0.4h, v1.4h, v2.4h, v3.4h}, [x0]        // de-interleave: vN = element N of each row

    saddl     v4.4s, v0.4h, v3.4h                       // a = e0 + e3   (widened to 32 bit)
    ssubl     v7.4s, v0.4h, v3.4h                       // d = e0 - e3
    saddl     v5.4s, v1.4h, v2.4h                       // b = e1 + e2
    ssubl     v6.4s, v1.4h, v2.4h                       // c = e1 - e2

    add       v0.4s, v4.4s, v5.4s                       // t0 = a + b
    sub       v2.4s, v4.4s, v5.4s                       // t2 = a - b
    add       v1.4s, v7.4s, v6.4s                       // t1 = d + c
    sub       v3.4s, v7.4s, v6.4s                       // t3 = d - c

//======================= 4x4 transpose of 32-bit lanes ==========================

    trn1      v4.4s, v0.4s, v1.4s
    trn1      v6.4s, v2.4s, v3.4s
    trn2      v5.4s, v0.4s, v1.4s
    trn2      v7.4s, v2.4s, v3.4s

    trn1      v0.2d, v4.2d, v6.2d
    trn1      v1.2d, v5.2d, v7.2d
    trn2      v2.2d, v4.2d, v6.2d
    trn2      v3.2d, v5.2d, v7.2d

//======================= second pass: same butterfly across rows ================

    add       v4.4s, v0.4s, v3.4s                       // a = r0 + r3
    sub       v7.4s, v0.4s, v3.4s                       // d = r0 - r3
    add       v5.4s, v1.4s, v2.4s                       // b = r1 + r2
    sub       v6.4s, v1.4s, v2.4s                       // c = r1 - r2

    add       v0.4s, v4.4s, v5.4s                       // row 0 = a + b
    sub       v2.4s, v4.4s, v5.4s                       // row 2 = a - b
    add       v1.4s, v7.4s, v6.4s                       // row 1 = d + c
    sub       v3.4s, v7.4s, v6.4s                       // row 3 = d - c

//======================= scaling ================================================

    mul       v0.4s, v0.4s, v15.4s                      // p = h * scale          (coeffs 0..3)
    mul       v1.4s, v1.4s, v15.4s                      //                        (coeffs 4..7)
    mul       v2.4s, v2.4s, v15.4s                      //                        (coeffs 8..11)
    mul       v3.4s, v3.4s, v15.4s                      //                        (coeffs 12..15)

    sshl      v0.4s, v0.4s, v14.4s                      // q = p << (qp/6)
    sshl      v1.4s, v1.4s, v14.4s
    sshl      v2.4s, v2.4s, v14.4s
    sshl      v3.4s, v3.4s, v14.4s

    sqrshrn   v0.4h, v0.4s, #6                          // out = sat16((q + 32) >> 6)
    sqrshrn   v1.4h, v1.4s, #6
    sqrshrn   v2.4h, v2.4s, #6
    sqrshrn   v3.4h, v3.4s, #6

    st1       {v0.4h, v1.4h, v2.4h, v3.4h}, [x1]        // 16 results, row after row

    pop_v_regs
    ret

// *******************************************************************************
// */
// * @brief This function performs a 2x2 inverse hadamard transform for chroma blocks
// *
// * @par Description:
// * The DC coefficients pass through a 2-stage inverse hadamard transform.
// * The inverse transformed content is then scaled based on the Qp value.
// * The DC blocks of both U and V are processed in one call.
// *
// * @param[in] pi2_src
// * input 1x8 block of coeffs. First 4 are from U and next 4 from V
// *
// * @param[out] pi2_out
// * output 1x8 block (exactly eight 16-bit values = 16 bytes are written)
// *
// * @param[in] pu2_iscal_mat
// * pointer to scaling list; only element [0] is read
// *
// * @param[in] pu2_weigh_mat
// * pointer to weight matrix; only element [0] is read
// *
// * @param[in] u4_qp_div_6
// * Floor (qp/6)
// *
// * @returns none
// *
// * @remarks none
// *
// *******************************************************************************
// */
// void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
//                                     WORD16* pi2_out,
//                                     const UWORD16 *pu2_iscal_mat,
//                                     const UWORD16 *pu2_weigh_mat,
//                                     UWORD32 u4_qp_div_6, ...)
// (the prototype is truncated in the original header; this routine reads
//  only the five arguments listed below)
//
// Per output coefficient the code computes
//     out = (int16)( (h * scale) shifted by (u4_qp_div_6 - 5) )
// where h is the 2x2 inverse hadamard value, scale is
// pu2_iscal_mat[0] * pu2_weigh_mat[0], and sshl treats a negative count as a
// truncating right shift.  xtn narrows without saturation.

    .global ih264_ihadamard_scaling_2x2_uv_av8
ih264_ihadamard_scaling_2x2_uv_av8:

//Registers used
//    x0 : *pi2_src
//    x1 : *pi2_out
//    x2 : *pu2_iscal_mat
//    x3 : *pu2_weigh_mat
//    w4 : u4_qp_div_6 (overwritten with u4_qp_div_6 - 5)
//    v0 - v4       : coefficients and butterfly temporaries
//    v26, v27, v30 : scaling list entry, weight entry, their product
//    v28           : per-lane shift count
//
// NOTE(review): only v0-v4, v26-v28 and v30 are written here; whether the
// push_v_regs/pop_v_regs pair is strictly needed depends on the macro
// definition, which is not visible in this file. It is kept unchanged.
    push_v_regs
    ld1       {v26.h}[0], [x2]                          // lane 0 <- pu2_iscal_mat[0]
    ld1       {v27.h}[0], [x3]                          // lane 0 <- pu2_weigh_mat[0]

    sub       w4, w4, #5                                // qp/6 - 5 : folds the final ">> 5" into the shift
    dup       v28.4s, w4                                // shift count in every lane

    ld2       {v0.4h, v1.4h}, [x0]                      // load 8 dc coeffs, de-interleaved:
                                                        //   v0 = U0, U2, V0, V2
                                                        //   v1 = U1, U3, V1, V3

    saddl     v2.4s, v0.4h, v1.4h                       // U0+U1, U2+U3, V0+V1, V2+V3
    ssubl     v4.4s, v0.4h, v1.4h                       // U0-U1, U2-U3, V0-V1, V2-V3

    umull     v30.4s, v26.4h, v27.4h                    // pu2_iscal_mat[0]*pu2_weigh_mat[0] in lane 0
    dup       v30.4s, v30.s[0]                          // replicate the scale to all lanes

    trn1      v0.4s, v2.4s, v4.4s                       // U0+U1, U0-U1, V0+V1, V0-V1
    trn2      v1.4s, v2.4s, v4.4s                       // U2+U3, U2-U3, V2+V3, V2-V3

    add       v2.4s, v0.4s, v1.4s                       // results 0,1 of U then 0,1 of V
    sub       v3.4s, v0.4s, v1.4s                       // results 2,3 of U then 2,3 of V

    mul       v2.4s, v2.4s, v30.4s                      // apply scale
    mul       v3.4s, v3.4s, v30.4s

    sshl      v2.4s, v2.4s, v28.4s                      // shift by (qp/6 - 5); negative => right shift
    sshl      v3.4s, v3.4s, v28.4s

    xtn       v0.4h, v2.4s                              // u0 u1 v0 v1  (low 64 bits; upper half zeroed)
    xtn       v1.4h, v3.4s                              // u2 u3 v2 v3  (low 64 bits; upper half zeroed)

    // Interleave 32-bit pairs so memory reads u0 u1 u2 u3 v0 v1 v2 v3.
    // Only the low two .s lanes of v0/v1 carry results, so store the .2s
    // arrangement (16 bytes). The former .4s form stored 32 bytes and wrote
    // 16 bytes of zeros past the end of the 1x8 output block.
    st2       {v0.2s, v1.2s}, [x1]
    pop_v_regs
    ret
