//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
///*******************************************************************************
// * @file
// *  ih264_iquant_itrans_recon_a9.s
// *  NOTE(review): the name above carries an _a9 suffix while every routine in
// *  this file is suffixed _av8 - confirm against the real file name.
// *
// * @brief
// *  Contains function definitions for single stage inverse transform
// *
// * @author
// *  Parthiban V
// *  Mohit
// *  Harinarayanaan
// *
// * @par List of Functions:
// *  - ih264_iquant_itrans_recon_4x4_av8()
// *  - ih264_iquant_itrans_recon_8x8_av8()
// *  - ih264_iquant_itrans_recon_chroma_4x4_av8()
// *
// * @remarks
// *  Syntax: GNU as, AArch64 (A64) with Advanced SIMD.
// *
// *******************************************************************************

.text
.p2align 2
// push_v_regs / pop_v_regs used by the routines below are not defined in this
// file; they are expected to come from this include.
.include "ih264_neon_macros.s"

///*
// *******************************************************************************
// *
// * @brief
// * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
// *
// * @par Description:
// * Performs inverse transform Ci4 and adds the residue to get the
// * reconstructed block
// *
// * @param[in] pi2_src
// * Input 4x4 coefficients
// *
// * @param[in] pu1_pred
// * Prediction 4x4 block
// *
// * @param[out] pu1_out
// * Output 4x4 block
// *
// * @param[in] u4_qp_div_6
// * QP / 6; used as the left-shift count applied to the dequantised products
// *
// * @param[in] pu2_weigh_mat
// * Pointer to weight matrix
// *
// * @param[in] pred_strd,
// * Prediction stride
// *
// * @param[in] out_strd
// * Output Stride
// *
// * @param[in] pi4_tmp
// * temporary buffer of size 1*16 (never read or written by this routine)
// *
// * @param[in] pu2_iscal_mat
// * Pointer to the inverse quantization matrix
// *
// * @param[in] iq_start_idx
// * When equal to 1, coefficient 0 is taken from pi2_dc_ld_addr[0] instead of
// * the dequantised pi2_src[0]
// *
// * @param[in] pi2_dc_ld_addr
// * Pointer to the alternate DC coefficient (read only when iq_start_idx == 1)
// *
// * @returns Void
// *
// * @remarks
// * None
// *
// *******************************************************************************
// */
//void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
//                                   UWORD8 *pu1_pred,
//                                   UWORD8 *pu1_out,
//                                   WORD32 pred_strd,
//                                   WORD32 out_strd,
//                                   const UWORD16 *pu2_iscal_mat,
//                                   const UWORD16 *pu2_weigh_mat,
//                                   UWORD32 u4_qp_div_6,
//                                   WORD32 *pi4_tmp,
//                                   WORD32 iq_start_idx,
//                                   WORD16 *pi2_dc_ld_addr)
//**************Variables Vs Registers*****************************************
//x0          => *pi2_src
//x1          => *pu1_pred
//x2          => *pu1_out
//w3          => pred_strd
//w4          => out_strd
//x5          => *pu2_iscal_mat
//x6          => *pu2_weigh_mat
//w7          => u4_qp_div_6
//[sp]        => pi4_tmp           (at entry; not read)
//[sp, #8]    => iq_start_idx      (at entry; read at sp+72 after push_v_regs)
//[sp, #16]   => pi2_dc_ld_addr    (at entry; read at sp+80 after push_v_regs)
//The +64 in the offsets above assumes push_v_regs lowers sp by 64 bytes.
//
//Only one shift is done in horizontal inverse because,
//if u4_qp_div_6 is lesser than 4 then shift value will be negative and do negative left shift, in this case rnd_factor has value
//if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0
//In this implementation that is realised as: shift left by u4_qp_div_6 (sshl),
//then a rounding, saturating right shift by 4 (sqrshrn #4).

    .global ih264_iquant_itrans_recon_4x4_av8
ih264_iquant_itrans_recon_4x4_av8:

    push_v_regs                           // macro from ih264_neon_macros.s (presumably saves the callee-saved SIMD regs; v8-v13 are written below)
    sxtw      x3, w3                      // pred_strd -> 64 bit, used as post-index increment
    sxtw      x4, w4                      // out_strd  -> 64 bit, used as post-index increment

    dup       v30.4s, w7                  // broadcast u4_qp_div_6 to every 32-bit lane (shift count for sshl)

    ldr       w8, [sp, #72]               // Loads iq_start_idx
    sxtw      x8, w8

    ldr       x10, [sp, #80]              // Load alternate dc address

    subs      x8, x8, #1                  // Z set iff iq_start_idx == 1 (intra case); the flags stay live
                                          // until the bne below - only SIMD instructions sit in between


//=======================DEQUANT FROM HERE===================================

    // ld4 de-interleaves with a stride of 4 elements: register k of each group
    // receives elements k, k+4, k+8, k+12, i.e. column k of the 4x4 matrix,
    // with the row index in the lane.
    ld4       {v20.4h - v23.4h}, [x5]     // v20..v23 = columns 0..3 of pu2_iscal_mat
    ld4       {v26.4h - v29.4h}, [x6]     // v26..v29 = columns 0..3 of pu2_weigh_mat
    ld4       {v16.4h - v19.4h}, [x0]     // v16..v19 = columns 0..3 of pi2_src


    mul       v20.4h, v20.4h, v26.4h      // x = iscal * weigh, column 0 (low 16 bits kept)
    mul       v21.4h, v21.4h, v27.4h      // x = iscal * weigh, column 1
    mul       v22.4h, v22.4h, v28.4h      // x = iscal * weigh, column 2
    mul       v23.4h, v23.4h, v29.4h      // x = iscal * weigh, column 3

    smull     v0.4s, v16.4h, v20.4h       // p = src * x, column 0, widened to 32 bit
    smull     v2.4s, v17.4h, v21.4h       // p = src * x, column 1
    smull     v4.4s, v18.4h, v22.4h       // p = src * x, column 2
    smull     v6.4s, v19.4h, v23.4h       // p = src * x, column 3

    sshl      v0.4s, v0.4s, v30.4s        // q = p << u4_qp_div_6, column 0
    sshl      v2.4s, v2.4s, v30.4s        // q = p << u4_qp_div_6, column 1
    sshl      v4.4s, v4.4s, v30.4s        // q = p << u4_qp_div_6, column 2
    sshl      v6.4s, v6.4s, v30.4s        // q = p << u4_qp_div_6, column 3

    sqrshrn   v0.4h, v0.4s, #0x4          // d0 = sat16((q + 8) >> 4), column 0
    sqrshrn   v1.4h, v2.4s, #0x4          // d1 = sat16((q + 8) >> 4), column 1
    sqrshrn   v2.4h, v4.4s, #0x4          // d2 = sat16((q + 8) >> 4), column 2
    sqrshrn   v3.4h, v6.4s, #0x4          // d3 = sat16((q + 8) >> 4), column 3

    bne       skip_loading_luma_dc_src    // iq_start_idx != 1: keep the dequantised coefficient 0
    ld1       {v0.h}[0], [x10]            // iq_start_idx == 1: coefficient 0 = pi2_dc_ld_addr[0]
skip_loading_luma_dc_src:

    //========= PROCESS IDCT FROM HERE =======
    //Steps for Stage 1:
    //------------------
    // Registers hold columns and lanes hold rows, so combining registers
    // transforms along each row. v30 is free again after the sshl group and is
    // reused for the prediction pixels.
    ld1       {v30.s}[0], [x1], x3        // i row load pu1_pred buffer (4 bytes)

    sshr      v8.4h, v1.4h, #1            // d1>>1
    sshr      v9.4h, v3.4h, #1            // d3>>1

    add       v4.4h, v0.4h, v2.4h         // x0 = d0 + d2
    sub       v5.4h, v0.4h, v2.4h         // x1 = d0 - d2
    sub       v6.4h, v8.4h, v3.4h         // x2 = (d1 >> 1) - d3
    add       v7.4h, v1.4h, v9.4h         // x3 = d1 + (d3 >> 1)

    ld1       {v30.s}[1], [x1], x3        // ii row load pu1_pred buffer

    add       v10.4h, v4.4h, v7.4h        // x0+x3
    add       v11.4h, v5.4h, v6.4h        // x1+x2
    sub       v12.4h, v5.4h, v6.4h        // x1-x2
    sub       v13.4h, v4.4h, v7.4h        // x0-x3

    ld1       {v31.s}[0], [x1], x3        // iii row load pu1_pred buffer


    //Steps for Stage 2:
    //transpose 4x4 (16-bit elements)
    trn1      v4.4h, v10.4h, v11.4h
    trn2      v5.4h, v10.4h, v11.4h
    trn1      v6.4h, v12.4h, v13.4h
    trn2      v7.4h, v12.4h, v13.4h

    trn1      v10.2s, v4.2s, v6.2s        // q0 = row 0
    trn1      v11.2s, v5.2s, v7.2s        // q1 = row 1
    trn2      v12.2s, v4.2s, v6.2s        // q2 = row 2
    trn2      v13.2s, v5.2s, v7.2s        // q3 = row 3
    //end transpose

    sshr      v18.4h, v11.4h, #1          // q1>>1
    sshr      v19.4h, v13.4h, #1          // q3>>1

    add       v14.4h, v10.4h, v12.4h      // x0 = q0 + q2
    sub       v15.4h, v10.4h, v12.4h      // x1 = q0 - q2
    sub       v16.4h, v18.4h, v13.4h      // x2 = (q1 >> 1) - q3
    add       v17.4h, v11.4h, v19.4h      // x3 = q1 + (q3 >> 1)


    ld1       {v31.s}[1], [x1], x3        // iv row load pu1_pred buffer

    add       v20.4h, v14.4h, v17.4h      // row 0 = x0 + x3
    add       v21.4h, v15.4h, v16.4h      // row 1 = x1 + x2
    sub       v22.4h, v15.4h, v16.4h      // row 2 = x1 - x2
    sub       v23.4h, v14.4h, v17.4h      // row 3 = x0 - x3

    mov       v20.d[1], v21.d[0]          // v20 = rows 0,1
    mov       v22.d[1], v23.d[0]          // v22 = rows 2,3

    srshr     v20.8h, v20.8h, #6          // residue = (x + 32) >> 6
    srshr     v22.8h, v22.8h, #6

    uaddw     v20.8h, v20.8h, v30.8b      // + prediction rows 0,1 (low 8 bytes of v30)
    uaddw     v22.8h, v22.8h, v31.8b      // + prediction rows 2,3 (low 8 bytes of v31)

    sqxtun    v0.8b, v20.8h               // saturate to unsigned 8 bit
    sqxtun    v1.8b, v22.8h

    st1       {v0.s}[0], [x2], x4         //i row store the value
    st1       {v0.s}[1], [x2], x4         //ii row store the value
    st1       {v1.s}[0], [x2], x4         //iii row store the value
    st1       {v1.s}[1], [x2]             //iv row store the value

    pop_v_regs
    ret


///**
// *******************************************************************************
// *
// * @brief
// * This function
// * performs inverse quant and Inverse transform type Ci4 for 4*4 block
// *
// * @par Description:
// * Performs inverse transform Ci4 and adds the residue to get the
// * reconstructed block
// *
// * @param[in] pi2_src
// * Input 4x4 coefficients
// *
// * @param[in] pu1_pred
// * Prediction 4x4 block; 8 bytes are read per row and only the even-indexed
// * bytes are used
// *
// * @param[out] pu1_out
// * Output 4x4 block; 8 bytes are read and written back per row, only the
// * even-indexed bytes are replaced
// *
// * @param[in] u4_qp_div_6
// * QP / 6; used as the left-shift count applied to the dequantised products
// *
// * @param[in] pu2_weigh_mat
// * Pointer to weight matrix
// *
// * @param[in] pred_strd,
// * Prediction stride
// *
// * @param[in] out_strd
// * Output Stride
// *
// * @param[in] pi4_tmp
// * temporary buffer of size 1*16 (never read or written by this routine)
// *
// * @param[in] pu2_iscal_mat
// * Pointer to the inverse quantization matrix
// *
// * @param[in] pi2_dc_src
// * Pointer to the DC coefficient; pi2_dc_src[0] always replaces the
// * dequantised coefficient 0
// *
// * @returns Void
// *
// * @remarks
// * None
// *
// *******************************************************************************
// */
//void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
//                                          UWORD8 *pu1_pred,
//                                          UWORD8 *pu1_out,
//                                          WORD32 pred_strd,
//                                          WORD32 out_strd,
//                                          const UWORD16 *pu2_iscal_mat,
//                                          const UWORD16 *pu2_weigh_mat,
//                                          UWORD32 u4_qp_div_6,
//                                          WORD32 *pi4_tmp,
//                                          WORD16 *pi2_dc_src)
//**************Variables Vs Registers*****************************************
//x0          => *pi2_src
//x1          => *pu1_pred
//x2          => *pu1_out
//w3          => pred_strd
//w4          => out_strd
//x5          => *pu2_iscal_mat
//x6          => *pu2_weigh_mat
//w7          => u4_qp_div_6
//[sp]        => pi4_tmp        (at entry; not read)
//[sp, #8]    => *pi2_dc_src    (at entry)

    .global ih264_iquant_itrans_recon_chroma_4x4_av8
ih264_iquant_itrans_recon_chroma_4x4_av8:

//ld4 (VLD4.S16 in the ARMv7 original) is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
//If the macro value changes need to change the instruction according to it.
//Only one shift is done in horizontal inverse because,
//if u4_qp_div_6 is lesser than 4 then shift value will be negative and do negative left shift, in this case rnd_factor has value
//if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0

//at the end of the function, we could have moved 64 bits into higher 64 bits of register and done further processing
//but it seems to reduce the number of instructions only by 1. [Since a15 we saw add and sub to be very high throughput
//all instructions were taken as equal

    //reduce sp by 64
    push_v_regs                           // macro from ih264_neon_macros.s; v8-v13 are written below
    sxtw      x3, w3                      // pred_strd -> 64 bit, used as post-index increment
    sxtw      x4, w4                      // out_strd  -> 64 bit, used as post-index increment

    dup       v30.4s, w7                  // broadcast u4_qp_div_6 to every 32-bit lane (shift count for sshl)

    //was at sp + 8, hence now at sp+64+8 = sp+72
    ldr       x10, [sp, #72]              //Load alternate dc address

//=======================DEQUANT FROM HERE===================================

    // ld4 de-interleaves with a stride of 4 elements: register k of each group
    // receives elements k, k+4, k+8, k+12, i.e. column k of the 4x4 matrix,
    // with the row index in the lane.
    ld4       {v20.4h - v23.4h}, [x5]     // v20..v23 = columns 0..3 of pu2_iscal_mat
    ld4       {v26.4h - v29.4h}, [x6]     // v26..v29 = columns 0..3 of pu2_weigh_mat
    ld4       {v16.4h - v19.4h}, [x0]     // v16..v19 = columns 0..3 of pi2_src


    mul       v20.4h, v20.4h, v26.4h      // x = iscal * weigh, column 0 (low 16 bits kept)
    mul       v21.4h, v21.4h, v27.4h      // x = iscal * weigh, column 1
    mul       v22.4h, v22.4h, v28.4h      // x = iscal * weigh, column 2
    mul       v23.4h, v23.4h, v29.4h      // x = iscal * weigh, column 3

    smull     v0.4s, v16.4h, v20.4h       // p = src * x, column 0, widened to 32 bit
    smull     v2.4s, v17.4h, v21.4h       // p = src * x, column 1
    smull     v4.4s, v18.4h, v22.4h       // p = src * x, column 2
    smull     v6.4s, v19.4h, v23.4h       // p = src * x, column 3

    sshl      v0.4s, v0.4s, v30.4s        // q = p << u4_qp_div_6, column 0
    sshl      v2.4s, v2.4s, v30.4s        // q = p << u4_qp_div_6, column 1
    sshl      v4.4s, v4.4s, v30.4s        // q = p << u4_qp_div_6, column 2
    sshl      v6.4s, v6.4s, v30.4s        // q = p << u4_qp_div_6, column 3

    sqrshrn   v0.4h, v0.4s, #0x4          // d0 = sat16((q + 8) >> 4), column 0
    sqrshrn   v1.4h, v2.4s, #0x4          // d1 = sat16((q + 8) >> 4), column 1
    sqrshrn   v2.4h, v4.4s, #0x4          // d2 = sat16((q + 8) >> 4), column 2
    sqrshrn   v3.4h, v6.4s, #0x4          // d3 = sat16((q + 8) >> 4), column 3

    ld1       {v0.h}[0], [x10]            // coefficient 0 = pi2_dc_src[0] (unconditional)

    //========= PROCESS IDCT FROM HERE =======
    //Steps for Stage 1:
    //------------------
    // Registers hold columns and lanes hold rows, so combining registers
    // transforms along each row.

    sshr      v8.4h, v1.4h, #1            // d1>>1
    sshr      v9.4h, v3.4h, #1            // d3>>1

    add       v4.4h, v0.4h, v2.4h         // x0 = d0 + d2
    sub       v5.4h, v0.4h, v2.4h         // x1 = d0 - d2
    sub       v6.4h, v8.4h, v3.4h         // x2 = (d1 >> 1) - d3
    add       v7.4h, v1.4h, v9.4h         // x3 = d1 + (d3 >> 1)


    add       v10.4h, v4.4h, v7.4h        // x0+x3
    add       v11.4h, v5.4h, v6.4h        // x1+x2
    sub       v12.4h, v5.4h, v6.4h        // x1-x2
    sub       v13.4h, v4.4h, v7.4h        // x0-x3

    // v26-v29 (weights) are dead after the mul group and are reused here.
    ld1       {v26.8b}, [x1], x3          // i row load pu1_pred buffer (8 bytes)
    ld1       {v27.8b}, [x1], x3          // ii row load pu1_pred buffer
    ld1       {v28.8b}, [x1], x3          // iii row load pu1_pred buffer
    ld1       {v29.8b}, [x1], x3          // iv row load pu1_pred buffer

    //Steps for Stage 2:
    //transpose 4x4 (16-bit elements)
    trn1      v4.4h, v10.4h, v11.4h
    trn2      v5.4h, v10.4h, v11.4h
    trn1      v6.4h, v12.4h, v13.4h
    trn2      v7.4h, v12.4h, v13.4h

    trn1      v10.2s, v4.2s, v6.2s        // q0 = row 0
    trn1      v11.2s, v5.2s, v7.2s        // q1 = row 1
    trn2      v12.2s, v4.2s, v6.2s        // q2 = row 2
    trn2      v13.2s, v5.2s, v7.2s        // q3 = row 3
    //end transpose

    sshr      v18.4h, v11.4h, #1          // q1>>1
    sshr      v19.4h, v13.4h, #1          // q3>>1

    add       v14.4h, v10.4h, v12.4h      // x0 = q0 + q2
    sub       v15.4h, v10.4h, v12.4h      // x1 = q0 - q2
    sub       v16.4h, v18.4h, v13.4h      // x2 = (q1 >> 1) - q3
    add       v17.4h, v11.4h, v19.4h      // x3 = q1 + (q3 >> 1)

    //Backup the output addr (x0 no longer needed as pi2_src)
    mov       x0, x2

    //load output buffer for interleaving
    ld1       {v10.8b}, [x2], x4
    ld1       {v11.8b}, [x2], x4
    ld1       {v12.8b}, [x2], x4
    ld1       {v13.8b}, [x2]

    add       v20.4h, v14.4h, v17.4h      // row 0 = x0 + x3
    add       v21.4h, v15.4h, v16.4h      // row 1 = x1 + x2
    sub       v22.4h, v15.4h, v16.4h      // row 2 = x1 - x2
    sub       v23.4h, v14.4h, v17.4h      // row 3 = x0 - x3

    srshr     v20.4h, v20.4h, #6          // residue = (x + 32) >> 6
    srshr     v21.4h, v21.4h, #6
    srshr     v22.4h, v22.4h, #6
    srshr     v23.4h, v23.4h, #6

    // v30 only serves as a filler second operand for uzp1: it ends up in the
    // upper four result bytes, which never reach the stored output because the
    // final bit/st1 use the low 64 bits only.
    movi      v31.4h, #0x00ff             //mask for interleaving [copy lower 8 bits of each halfword]

    //Extract one plane (even-indexed bytes) from the interleaved prediction
    uzp1      v26.8b, v26.8b, v30.8b
    uzp1      v27.8b, v27.8b, v30.8b
    uzp1      v28.8b, v28.8b, v30.8b
    uzp1      v29.8b, v29.8b, v30.8b

    uaddw     v20.8h, v20.8h, v26.8b      // residue + prediction, row 0
    uaddw     v21.8h, v21.8h, v27.8b      // row 1
    uaddw     v22.8h, v22.8h, v28.8b      // row 2
    uaddw     v23.8h, v23.8h, v29.8b      // row 3

    sqxtun    v0.8b, v20.8h               // saturate to unsigned 8 bit
    sqxtun    v1.8b, v21.8h
    sqxtun    v2.8b, v22.8h
    sqxtun    v3.8b, v23.8h

    //widen the output so that each halfword has 0 in its msb and the pixel in its lsb
    uxtl      v6.8h, v0.8b
    uxtl      v7.8h, v1.8b
    uxtl      v8.8h, v2.8b
    uxtl      v9.8h, v3.8b

    //select lsbs from processed data and msbs from pu1_out loaded data
    bit       v10.8b, v6.8b, v31.8b
    bit       v11.8b, v7.8b, v31.8b
    bit       v12.8b, v8.8b, v31.8b
    bit       v13.8b, v9.8b, v31.8b

    //store the interleaved result
    st1       {v10.8b}, [x0], x4
    st1       {v11.8b}, [x0], x4
    st1       {v12.8b}, [x0], x4
    st1       {v13.8b}, [x0]

    pop_v_regs
    ret

///*
// *******************************************************************************
// *
// * @brief
// * This function performs inverse quant and Inverse transform type Ci8 for 8*8 block
// *
// * @par Description:
// * Performs inverse transform Ci8 and adds the residue to get the
// * reconstructed block
// *
// * @param[in] pi2_src
// * Input 8x8 coefficients
// *
// * @param[in] pu1_pred
// * Prediction 8x8 block
// *
// * @param[out] pu1_out
// * Output 8x8 block
// *
// * @param[in]
// * u4_qp_div_6
// * QP / 6; used as the left-shift count applied to the dequantised products
// *
// * @param[in] pu2_weigh_mat
// * Pointer to weight matrix
// *
// * @param[in] pred_strd,
// * Prediction stride
// *
// * @param[in] out_strd
// * Output Stride
// *
// * @param[in] pi4_tmp
// * temporary buffer of size 1*64 (not used by this routine)
// *
// * @param[in] pu2_iscal_mat
// * Pointer to the inverse quantization matrix
// *
// * @returns Void
// *
// * @remarks
// * None
// *
// *******************************************************************************
// */
//void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
//                                   UWORD8 *pu1_pred,
//                                   UWORD8 *pu1_out,
//                                   WORD32 pred_strd,
//                                   WORD32 out_strd,
//                                   const UWORD16 *pu2_iscal_mat,
//                                   const UWORD16 *pu2_weigh_mat,
//                                   UWORD32 u4_qp_div_6,
//                                   WORD32 *pi4_tmp,
//                                   WORD32 iq_start_idx,
//                                   WORD16 *pi2_dc_ld_addr)
//**************Variables Vs Registers*****************************************
//x0       => *pi2_src
//x1       => *pu1_pred
//x2       => *pu1_out
//w3       => pred_strd
//w4       => out_strd
//x5       => *pu2_iscal_mat
//x6       => *pu2_weigh_mat
//w7       => u4_qp_div_6
//NOT USED => pi4_tmp
//NOT USED => iq_start_idx
//NOT USED => pi2_dc_ld_addr

    .global ih264_iquant_itrans_recon_8x8_av8
ih264_iquant_itrans_recon_8x8_av8:

    push_v_regs                           // macro from ih264_neon_macros.s; v8-v15 are written below
    sxtw      x3, w3                      // pred_strd -> 64 bit, used as post-index increment
    sxtw      x4, w4                      // out_strd  -> 64 bit, used as post-index increment

    //=============== DEQUANT ===============
    ld1       {v8.8h - v11.8h}, [x5], #64  // pu2_iscal_mat rows 0..3
    ld1       {v12.8h - v15.8h}, [x5]      // pu2_iscal_mat rows 4..7

    ld1       {v16.8h - v19.8h}, [x6], #64 // pu2_weigh_mat rows 0..3
    ld1       {v20.8h - v23.8h}, [x6]      // pu2_weigh_mat rows 4..7

    mov       x8, #16                     // byte stride between coefficient rows (8 x 16 bit)
    ld1       {v0.8h}, [x0], x8           // pi2_src row 0
    ld1       {v1.8h}, [x0], x8           // row 1
    ld1       {v2.8h}, [x0], x8           // row 2
    ld1       {v3.8h}, [x0], x8           // row 3
    ld1       {v4.8h}, [x0], x8           // row 4
    ld1       {v5.8h}, [x0], x8           // row 5
    ld1       {v6.8h}, [x0], x8           // row 6
    ld1       {v7.8h}, [x0]               // row 7

    // x = iscal * weigh per row (low 16 bits kept)
    mul       v8.8h, v8.8h, v16.8h
    mul       v9.8h, v9.8h, v17.8h
    mul       v10.8h, v10.8h, v18.8h
    mul       v11.8h, v11.8h, v19.8h
    mul       v12.8h, v12.8h, v20.8h
    mul       v13.8h, v13.8h, v21.8h
    mul       v14.8h, v14.8h, v22.8h
    mul       v15.8h, v15.8h, v23.8h

    // p = src * x, widened to 32 bit; each row takes two registers (low/high half)
    smull     v16.4s, v0.4h, v8.4h
    smull2    v17.4s, v0.8h, v8.8h
    smull     v18.4s, v1.4h, v9.4h
    smull2    v19.4s, v1.8h, v9.8h
    smull     v20.4s, v2.4h, v10.4h
    smull2    v21.4s, v2.8h, v10.8h
    smull     v22.4s, v3.4h, v11.4h
    smull2    v23.4s, v3.8h, v11.8h
    smull     v24.4s, v4.4h, v12.4h
    smull2    v25.4s, v4.8h, v12.8h
    smull     v26.4s, v5.4h, v13.4h
    smull2    v27.4s, v5.8h, v13.8h
    smull     v28.4s, v6.4h, v14.4h
    smull2    v29.4s, v6.8h, v14.8h
    smull     v30.4s, v7.4h, v15.4h
    smull2    v31.4s, v7.8h, v15.8h

    dup       v0.4s, w7                   // broadcast u4_qp_div_6 (v0 is free after the smull group)

    // q = p << u4_qp_div_6
    sshl      v16.4s, v16.4s, v0.4s
    sshl      v17.4s, v17.4s, v0.4s
    sshl      v18.4s, v18.4s, v0.4s
    sshl      v19.4s, v19.4s, v0.4s
    sshl      v20.4s, v20.4s, v0.4s
    sshl      v21.4s, v21.4s, v0.4s
    sshl      v22.4s, v22.4s, v0.4s
    sshl      v23.4s, v23.4s, v0.4s
    sshl      v24.4s, v24.4s, v0.4s
    sshl      v25.4s, v25.4s, v0.4s
    sshl      v26.4s, v26.4s, v0.4s
    sshl      v27.4s, v27.4s, v0.4s
    sshl      v28.4s, v28.4s, v0.4s
    sshl      v29.4s, v29.4s, v0.4s
    sshl      v30.4s, v30.4s, v0.4s
    sshl      v31.4s, v31.4s, v0.4s

    // c = sat16((q + 32) >> 6); v0..v7 = dequantised rows 0..7
    sqrshrn   v0.4h, v16.4s, #6
    sqrshrn2  v0.8h, v17.4s, #6
    sqrshrn   v1.4h, v18.4s, #6
    sqrshrn2  v1.8h, v19.4s, #6
    sqrshrn   v2.4h, v20.4s, #6
    sqrshrn2  v2.8h, v21.4s, #6
    sqrshrn   v3.4h, v22.4s, #6
    sqrshrn2  v3.8h, v23.4s, #6
    sqrshrn   v4.4h, v24.4s, #6
    sqrshrn2  v4.8h, v25.4s, #6
    sqrshrn   v5.4h, v26.4s, #6
    sqrshrn2  v5.8h, v27.4s, #6
    sqrshrn   v6.4h, v28.4s, #6
    sqrshrn2  v6.8h, v29.4s, #6
    sqrshrn   v7.4h, v30.4s, #6
    sqrshrn2  v7.8h, v31.4s, #6

    //loop counter: the body (transpose + 1-D transform across registers) runs
    //twice, giving the horizontal and then the vertical pass; the two
    //transposes cancel, so v0..v7 end up as rows again.
    mov       x8, #2
//1x8 transform
trans_1x8_1d:

    //transpose 8x8
    trn1      v8.8h, v0.8h, v1.8h
    trn2      v9.8h, v0.8h, v1.8h
    trn1      v10.8h, v2.8h, v3.8h
    trn2      v11.8h, v2.8h, v3.8h
    trn1      v12.8h, v4.8h, v5.8h
    trn2      v13.8h, v4.8h, v5.8h
    trn1      v14.8h, v6.8h, v7.8h
    trn2      v15.8h, v6.8h, v7.8h

    trn1      v0.4s, v8.4s, v10.4s
    trn2      v2.4s, v8.4s, v10.4s
    trn1      v1.4s, v9.4s, v11.4s
    trn2      v3.4s, v9.4s, v11.4s
    trn1      v4.4s, v12.4s, v14.4s
    trn2      v6.4s, v12.4s, v14.4s
    trn1      v5.4s, v13.4s, v15.4s
    trn2      v7.4s, v13.4s, v15.4s

    // after this step v8..v15 hold pi2_tmp_ptr[0]..[7] in register order
    trn1      v8.2d, v0.2d, v4.2d         // pi2_tmp_ptr[0]
    trn2      v12.2d, v0.2d, v4.2d        // pi2_tmp_ptr[4]
    trn1      v9.2d, v1.2d, v5.2d         // pi2_tmp_ptr[1]
    trn2      v13.2d, v1.2d, v5.2d        // pi2_tmp_ptr[5]
    trn1      v10.2d, v2.2d, v6.2d        // pi2_tmp_ptr[2]
    trn2      v14.2d, v2.2d, v6.2d        // pi2_tmp_ptr[6]
    trn1      v11.2d, v3.2d, v7.2d        // pi2_tmp_ptr[3]
    trn2      v15.2d, v3.2d, v7.2d        // pi2_tmp_ptr[7]

    // halves of inputs 1, 2, 3, 5, 6, 7
    sshr      v16.8h, v9.8h, #1           //(pi2_tmp_ptr[1] >> 1)
    sshr      v17.8h, v10.8h, #1          //(pi2_tmp_ptr[2] >> 1)
    sshr      v18.8h, v11.8h, #1          //(pi2_tmp_ptr[3] >> 1)
    sshr      v19.8h, v13.8h, #1          //(pi2_tmp_ptr[5] >> 1)
    sshr      v20.8h, v14.8h, #1          //(pi2_tmp_ptr[6] >> 1)
    sshr      v21.8h, v15.8h, #1          //(pi2_tmp_ptr[7] >> 1)

    add       v0.8h, v8.8h, v12.8h        // i_y0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[4] );
    sub       v2.8h, v8.8h, v12.8h        // i_y2 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[4] );

    sub       v4.8h, v17.8h, v14.8h       // i_y4 = ((pi2_tmp_ptr[2] >> 1) - pi2_tmp_ptr[6] );
    add       v6.8h, v10.8h, v20.8h       // i_y6 = (pi2_tmp_ptr[2] + (pi2_tmp_ptr[6] >> 1));

    // odd part, accumulated in 32 bit (wN = pi2_tmp_ptr[N])
    //-w3 + w5
    ssubl     v22.4s, v13.4h, v11.4h
    ssubl2    v23.4s, v13.8h, v11.8h
    //w3 + w5
    saddl     v24.4s, v13.4h, v11.4h
    saddl2    v25.4s, v13.8h, v11.8h
    //-w1 + w7
    ssubl     v26.4s, v15.4h, v9.4h
    ssubl2    v27.4s, v15.8h, v9.8h
    //w1 + w7
    saddl     v28.4s, v15.4h, v9.4h
    saddl2    v29.4s, v15.8h, v9.8h

    //-w3 + w5 - w7
    ssubw     v22.4s, v22.4s, v15.4h
    ssubw2    v23.4s, v23.4s, v15.8h
    //w3 + w5 + w1
    saddw     v24.4s, v24.4s, v9.4h
    saddw2    v25.4s, v25.4s, v9.8h
    //-w1 + w7 + w5
    saddw     v26.4s, v26.4s, v13.4h
    saddw2    v27.4s, v27.4s, v13.8h
    //w1 + w7 - w3
    ssubw     v28.4s, v28.4s, v11.4h
    ssubw2    v29.4s, v29.4s, v11.8h

    //-w3 + w5 - w7 - (w7 >> 1)
    ssubw     v22.4s, v22.4s, v21.4h
    ssubw2    v23.4s, v23.4s, v21.8h
    //w3 + w5 + w1 + (w1 >> 1)
    saddw     v24.4s, v24.4s, v16.4h
    saddw2    v25.4s, v25.4s, v16.8h
    //-w1 + w7 + w5 + (w5 >> 1)
    saddw     v26.4s, v26.4s, v19.4h
    saddw2    v27.4s, v27.4s, v19.8h
    //w1 + w7 - w3 - (w3 >> 1)
    ssubw     v28.4s, v28.4s, v18.4h
    ssubw2    v29.4s, v29.4s, v18.8h

    // truncate the odd sums back to 16 bit (no saturation)
    xtn       v1.4h, v22.4s               // y1
    xtn2      v1.8h, v23.4s
    xtn       v3.4h, v28.4s               // y3
    xtn2      v3.8h, v29.4s
    xtn       v5.4h, v26.4s               // y5
    xtn2      v5.8h, v27.4s
    xtn       v7.4h, v24.4s               // y7
    xtn2      v7.8h, v25.4s

    sshr      v16.8h, v1.8h, #2           //(y1 >> 2)
    sshr      v17.8h, v3.8h, #2           //(y3 >> 2)
    sshr      v18.8h, v5.8h, #2           //(y5 >> 2)
    sshr      v19.8h, v7.8h, #2           //(y7 >> 2)

    add       v8.8h, v0.8h, v6.8h         // z0 = y0 + y6
    add       v9.8h, v1.8h, v19.8h        // z1 = y1 + (y7 >> 2)
    add       v10.8h, v2.8h, v4.8h        // z2 = y2 + y4
    add       v11.8h, v3.8h, v18.8h       // z3 = y3 + (y5 >> 2)
    sub       v12.8h, v2.8h, v4.8h        // z4 = y2 - y4
    sub       v13.8h, v17.8h, v5.8h       // z5 = (y3 >> 2) - y5
    sub       v14.8h, v0.8h, v6.8h        // z6 = y0 - y6
    sub       v15.8h, v7.8h, v16.8h       // z7 = y7 - (y1 >> 2)

    add       v0.8h, v8.8h, v15.8h        // out0 = z0 + z7
    add       v1.8h, v10.8h, v13.8h       // out1 = z2 + z5
    add       v2.8h, v12.8h, v11.8h       // out2 = z4 + z3
    add       v3.8h, v14.8h, v9.8h        // out3 = z6 + z1
    sub       v4.8h, v14.8h, v9.8h        // out4 = z6 - z1
    sub       v5.8h, v12.8h, v11.8h       // out5 = z4 - z3
    sub       v6.8h, v10.8h, v13.8h       // out6 = z2 - z5
    sub       v7.8h, v8.8h, v15.8h        // out7 = z0 - z7

    subs      x8, x8, #1
    bne       trans_1x8_1d                // second pass

    //=============== RECON ===============
    ld1       {v22.8b}, [x1], x3          // pu1_pred row 0
    ld1       {v23.8b}, [x1], x3          // row 1
    ld1       {v24.8b}, [x1], x3          // row 2
    ld1       {v25.8b}, [x1], x3          // row 3
    ld1       {v26.8b}, [x1], x3          // row 4
    ld1       {v27.8b}, [x1], x3          // row 5
    ld1       {v28.8b}, [x1], x3          // row 6
    ld1       {v29.8b}, [x1]              // row 7

    // residue = (x + 32) >> 6
    srshr     v0.8h, v0.8h, #6
    srshr     v1.8h, v1.8h, #6
    srshr     v2.8h, v2.8h, #6
    srshr     v3.8h, v3.8h, #6
    srshr     v4.8h, v4.8h, #6
    srshr     v5.8h, v5.8h, #6
    srshr     v6.8h, v6.8h, #6
    srshr     v7.8h, v7.8h, #6

    // residue + prediction
    uaddw     v0.8h, v0.8h, v22.8b
    uaddw     v1.8h, v1.8h, v23.8b
    uaddw     v2.8h, v2.8h, v24.8b
    uaddw     v3.8h, v3.8h, v25.8b
    uaddw     v4.8h, v4.8h, v26.8b
    uaddw     v5.8h, v5.8h, v27.8b
    uaddw     v6.8h, v6.8h, v28.8b
    uaddw     v7.8h, v7.8h, v29.8b

    // saturate to unsigned 8 bit
    sqxtun    v0.8b, v0.8h
    sqxtun    v1.8b, v1.8h
    sqxtun    v2.8b, v2.8h
    sqxtun    v3.8b, v3.8h
    sqxtun    v4.8b, v4.8h
    sqxtun    v5.8b, v5.8h
    sqxtun    v6.8b, v6.8h
    sqxtun    v7.8b, v7.8h

    st1       {v0.8b}, [x2], x4           // pu1_out row 0
    st1       {v1.8b}, [x2], x4           // row 1
    st1       {v2.8b}, [x2], x4           // row 2
    st1       {v3.8b}, [x2], x4           // row 3
    st1       {v4.8b}, [x2], x4           // row 4
    st1       {v5.8b}, [x2], x4           // row 5
    st1       {v6.8b}, [x2], x4           // row 6
    st1       {v7.8b}, [x2]               // row 7

    pop_v_regs
    ret