@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@ *******************************************************************************
@ * @file
@ *  ih264_iquant_itrans_recon_a9.s
@ *
@ * @brief
@ *  Contains function definitions for single-stage inverse transform
@ *
@ * @author
@ *  Mohit
@ *  Harinarayanaan
@ *
@ * @par List of Functions:
@ *  - ih264_iquant_itrans_recon_4x4_a9()
@ *  - ih264_iquant_itrans_recon_8x8_a9()
@ *  - ih264_iquant_itrans_recon_chroma_4x4_a9()
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@*
@**
@ *******************************************************************************
@ *
@ * @brief
@ *  This function performs inverse quantization and the inverse transform of
@ *  type Ci4 on a 4x4 block
@ *
@ * @par Description:
@ *  Performs the inverse transform Ci4 and adds the prediction to the residue
@ *  to get the reconstructed block
@ *
@ * @param[in] pi2_src
@ *  Input 4x4 coefficients
@ *
@ * @param[in] pu1_pred
@ *  Prediction 4x4 block
@ *
@ * @param[out] pu1_out
@ *  Output 4x4 block
@ *
@ * @param[in] pred_strd
@ *  Prediction stride
@ *
@ * @param[in] out_strd
@ *  Output stride
@ *
@ * @param[in] pu2_iscal_mat
@ *  Pointer to the inverse quantization matrix
@ *
@ * @param[in] pu2_weigh_mat
@ *  Pointer to the weight matrix
@ *
@ * @param[in] u4_qp_div_6
@ *  QP divided by 6
@ *
@ * @param[in] pi4_tmp
@ *  Temporary buffer of size 1x16
@ *
@ * @param[in] iq_start_idx
@ *  1 => intra case, where the DC coefficient is reloaded from pi2_dc_ld_addr
@ *
@ * @param[in] pi2_dc_ld_addr
@ *  Address from which the DC coefficient is loaded in the intra case
@ *
@ * @returns  Void
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@ *
@void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
@                                   UWORD8 *pu1_pred,
@                                   UWORD8 *pu1_out,
@                                   WORD32 pred_strd,
@                                   WORD32 out_strd,
@                                   const UWORD16 *pu2_iscal_mat,
@                                   const UWORD16 *pu2_weigh_mat,
@                                   UWORD32 u4_qp_div_6,
@                                   WORD32 *pi4_tmp,
@                                   WORD32 iq_start_idx,
@                                   WORD16 *pi2_dc_ld_addr)
@**************Variables Vs Registers*****************************************
@r0  => *pi2_src
@r1  => *pu1_pred
@r2  => *pu1_out
@r3  => pred_strd
@r4  => out_strd
@r5  => *pu2_iscal_mat
@r6  => *pu2_weigh_mat
@r7  => u4_qp_div_6
@r8  => iq_start_idx
@r10 => pi2_dc_ld_addr

.text
.syntax unified
.p2align 2

    .global ih264_iquant_itrans_recon_4x4_a9

ih264_iquant_itrans_recon_4x4_a9:

@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4.
@If that macro value changes, this instruction must be changed accordingly.
@Only one rounding shift is done in the horizontal inverse transform because,
@in the reference flow, the effective shift amount is (u4_qp_div_6 - 4): if
@u4_qp_div_6 is less than 4 the shift amount is negative, so a rounded right
@shift with a non-zero rnd_factor is required, and if u4_qp_div_6 is greater
@than or equal to 4 it is a plain left shift with rnd_factor 0. Here this is
@implemented as a left shift by u4_qp_div_6 followed by a rounded right shift
@by 4 (VSHL, then VQRSHRN #4).

    stmfd         sp!, {r4-r12, r14}    @stack stores the values of the arguments
    ldr           r7, [sp, #52]         @Loads u4_qp_div_6
    ldr           r4, [sp, #40]         @Loads out_strd
    vdup.s32      q15, r7               @Populate u4_qp_div_6 in Q15
    ldr           r5, [sp, #44]         @Loads *pu2_iscal_mat

    ldr           r6, [sp, #48]         @Loads *pu2_weigh_mat

    ldr           r8, [sp, #60]         @Loads iq_start_idx

    ldr           r10, [sp, #64]        @Loads the alternate DC address

    vpush         {d8-d15}
@=======================DEQUANT FROM HERE===================================

    vld4.s16      {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i = 0..15
    vld4.s16      {d26, d27, d28, d29}, [r6] @Load pu2_weigh_mat[i], i = 0..15
    vmul.s16      q10, q10, q13         @x[i] = (scale[i] * dequant[i]) where i = 0..7
    vld4.s16      {d16, d17, d18, d19}, [r0] @Load pi2_src[i], i = 0..15

    vmul.s16      q11, q11, q14         @x[i] = (scale[i] * dequant[i]) where i = 8..15

    subs          r8, r8, #1            @if r8 == 1 => intra case, so result of subtraction is zero and Z flag is set
    ldrsheq       r9, [r10]             @Loads signed halfword pi2_dc_ld_addr[0], if r8 == 1

    vmull.s16     q0, d16, d20          @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
    vmull.s16     q1, d17, d21          @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
    vmull.s16     q2, d18, d22          @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
    vmull.s16     q3, d19, d23          @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15

    vshl.s32      q0, q0, q15           @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
    vshl.s32      q1, q1, q15           @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
    vshl.s32      q2, q2, q15           @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
    vshl.s32      q3, q3, q15           @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15

    vqrshrn.s32   d0, q0, #0x4          @ D0 = c[i] = ((q[i] + 8) >> 4) where i = 0..3
    vqrshrn.s32   d1, q1, #0x4          @ D1 = c[i] = ((q[i] + 8) >> 4) where i = 4..7
    vqrshrn.s32   d2, q2, #0x4          @ D2 = c[i] = ((q[i] + 8) >> 4) where i = 8..11
    vqrshrn.s32   d3, q3, #0x4          @ D3 = c[i] = ((q[i] + 8) >> 4) where i = 12..15

    vmoveq.16     d0[0], r9             @ Restore the DC value in the intra case, i.e. r8 == 1
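
@ The dequant above follows the H.264 scaling flow documented in the comments:
@ each coefficient is multiplied by the inverse-scale and weight matrices,
@ shifted left by QP/6, then rounded down by 4 bits. A minimal C reference
@ sketch of this step is given below for clarity; it assumes the codec's
@ WORD16/UWORD16/WORD32 typedefs, and the helper name iquant_4x4_ref is
@ illustrative only (not part of this file).
@
@ static void iquant_4x4_ref(const WORD16 *pi2_src, WORD16 *pi2_coeff,
@                            const UWORD16 *pu2_iscal_mat,
@                            const UWORD16 *pu2_weigh_mat,
@                            UWORD32 u4_qp_div_6,
@                            WORD32 iq_start_idx,
@                            const WORD16 *pi2_dc_ld_addr)
@ {
@     WORD32 i;
@     for(i = 0; i < 16; i++)
@     {
@         /* p[i] = x[i] * trns_coeff[i], then q[i] = p[i] << (qP/6) */
@         WORD32 q = (pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i])
@                        << u4_qp_div_6;
@         /* c[i] = (q[i] + 8) >> 4, matching VQRSHRN #4 above */
@         pi2_coeff[i] = (WORD16)((q + 8) >> 4);
@     }
@     if(iq_start_idx == 1) /* intra: DC comes from the alternate address */
@         pi2_coeff[0] = pi2_dc_ld_addr[0];
@ }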

@========= PROCESS IDCT FROM HERE =======
@Steps for Stage 1:
@------------------
    vld1.32       d30[0], [r1], r3      @I row Load pu1_pred buffer
    vadd.s16      d4, d0, d2            @x0 = q0 + q1;

    vsub.s16      d5, d0, d2            @x1 = q0 - q1;

    vshr.s16      d8, d1, #1            @q0 >> 1
    vshr.s16      d9, d3, #1            @q1 >> 1

    vsub.s16      d6, d8, d3            @x2 = (q0 >> 1) - q1;
    vadd.s16      d7, d1, d9            @x3 = q0 + (q1 >> 1);
    vld1.32       d30[1], [r1], r3      @II row Load pu1_pred buffer

    vswp          d6, d7                @Reverse positions of x2 and x3

    vsub.s16      q6, q2, q3            @x0 - x3 and x1 - x2 combined
    vadd.s16      q5, q2, q3            @x0 + x3 and x1 + x2 combined

    vld1.32       d31[0], [r1], r3      @III row Load pu1_pred buffer

    vswp          d12, d13
@Steps for Stage 2:
@------------------
    vtrn.16       d10, d11
    vtrn.16       d12, d13
    vtrn.32       d10, d12
    vtrn.32       d11, d13
    vadd.s16      d14, d10, d12         @x0 = q0 + q1;

    vsub.s16      d15, d10, d12         @x1 = q0 - q1;

    vshr.s16      d18, d11, #1          @q0 >> 1
    vshr.s16      d19, d13, #1          @q1 >> 1

    vsub.s16      d16, d18, d13         @x2 = (q0 >> 1) - q1;
    vadd.s16      d17, d11, d19         @x3 = q0 + (q1 >> 1);

    vld1.32       d31[1], [r1], r3      @IV row Load pu1_pred buffer
    vswp          d16, d17              @Reverse positions of x2 and x3

    vsub.s16      q11, q7, q8           @x0 - x3 and x1 - x2 combined
    vadd.s16      q10, q7, q8           @x0 + x3 and x1 + x2 combined

    vswp          d22, d23

    vrshr.s16     q10, q10, #6          @Rounding right shift by 6
    vrshr.s16     q11, q11, #6

    vaddw.u8      q10, q10, d30         @Add the prediction rows
    vaddw.u8      q11, q11, d31

    vqmovun.s16   d0, q10               @Saturate to unsigned 8 bit
    vqmovun.s16   d1, q11

    vst1.32       d0[0], [r2], r4       @I row store the value
    vst1.32       d0[1], [r2], r4       @II row store the value
    vst1.32       d1[0], [r2], r4       @III row store the value
    vst1.32       d1[1], [r2]           @IV row store the value

    vpop          {d8-d15}
    ldmfd         sp!, {r4-r12, r15}    @Reload the registers from sp and return
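
@ Each 4-point stage above computes the standard H.264 Ci4 inverse-transform
@ butterfly that the x0..x3 comments describe. As a reference, a hedged C
@ sketch of one 4-point stage over inputs c0..c3 is shown below (the helper
@ name itrans_4_ref is illustrative only; WORD32 is used internally to avoid
@ questions of 16-bit overflow in the sketch):
@
@ static void itrans_4_ref(WORD32 c0, WORD32 c1, WORD32 c2, WORD32 c3,
@                          WORD32 *out /* out[0..3] */)
@ {
@     WORD32 x0 = c0 + c2;              /* even part              */
@     WORD32 x1 = c0 - c2;
@     WORD32 x2 = (c1 >> 1) - c3;       /* odd part               */
@     WORD32 x3 = c1 + (c3 >> 1);
@     out[0] = x0 + x3;                 /* recombination          */
@     out[1] = x1 + x2;
@     out[2] = x1 - x2;
@     out[3] = x0 - x3;
@ }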

@**
@ *******************************************************************************
@ *
@ * @brief
@ *  This function performs inverse quantization and the inverse transform of
@ *  type Ci4 on a 4x4 chroma block
@ *
@ * @par Description:
@ *  Performs the inverse transform Ci4 and adds the prediction to the residue
@ *  to get the reconstructed block
@ *
@ * @param[in] pi2_src
@ *  Input 4x4 coefficients
@ *
@ * @param[in] pu1_pred
@ *  Prediction 4x4 block
@ *
@ * @param[out] pu1_out
@ *  Output 4x4 block
@ *
@ * @param[in] pred_strd
@ *  Prediction stride
@ *
@ * @param[in] out_strd
@ *  Output stride
@ *
@ * @param[in] pu2_iscal_mat
@ *  Pointer to the inverse quantization matrix
@ *
@ * @param[in] pu2_weigh_mat
@ *  Pointer to the weight matrix
@ *
@ * @param[in] u4_qp_div_6
@ *  QP divided by 6
@ *
@ * @param[in] pi4_tmp
@ *  Temporary buffer of size 1x16
@ *
@ * @param[in] pi2_dc_src
@ *  Address from which the separately processed DC coefficient is loaded
@ *
@ * @returns  Void
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@ *
@void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
@                                          UWORD8 *pu1_pred,
@                                          UWORD8 *pu1_out,
@                                          WORD32 pred_strd,
@                                          WORD32 out_strd,
@                                          const UWORD16 *pu2_iscal_mat,
@                                          const UWORD16 *pu2_weigh_mat,
@                                          UWORD32 u4_qp_div_6,
@                                          WORD32 *pi4_tmp,
@                                          WORD16 *pi2_dc_src)
@**************Variables Vs Registers*****************************************
@r0 => *pi2_src
@r1 => *pu1_pred
@r2 => *pu1_out
@r3 => pred_strd
@r4 => out_strd
@r5 => *pu2_iscal_mat
@r6 => *pu2_weigh_mat
@r7 => u4_qp_div_6
@r8 => *pi2_dc_src

    .global ih264_iquant_itrans_recon_chroma_4x4_a9
ih264_iquant_itrans_recon_chroma_4x4_a9:

@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4.
@If that macro value changes, this instruction must be changed accordingly.
@Only one rounding shift is done in the horizontal inverse transform, for the
@same reason as in ih264_iquant_itrans_recon_4x4_a9 above.

    stmfd         sp!, {r4-r12, r14}    @stack stores the values of the arguments
    ldr           r7, [sp, #52]         @Loads u4_qp_div_6
    ldr           r4, [sp, #40]         @Loads out_strd
    vdup.s32      q15, r7               @Populate u4_qp_div_6 in Q15
    ldr           r5, [sp, #44]         @Loads *pu2_iscal_mat
    ldr           r6, [sp, #48]         @Loads *pu2_weigh_mat
    ldr           r8, [sp, #60]         @Loads *pi2_dc_src

    vpush         {d8-d15}
@=======================DEQUANT FROM HERE===================================

    vld4.s16      {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i = 0..15
    vld4.s16      {d26, d27, d28, d29}, [r6] @Load pu2_weigh_mat[i], i = 0..15
    vmul.s16      q10, q10, q13         @x[i] = (scale[i] * dequant[i]) where i = 0..7
    vld4.s16      {d16, d17, d18, d19}, [r0] @Load pi2_src[i], i = 0..15

    vmul.s16      q11, q11, q14         @x[i] = (scale[i] * dequant[i]) where i = 8..15

    vmull.s16     q0, d16, d20          @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
    vmull.s16     q1, d17, d21          @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
    vmull.s16     q2, d18, d22          @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
    vmull.s16     q3, d19, d23          @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15

    vshl.s32      q0, q0, q15           @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
    vshl.s32      q1, q1, q15           @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
    vshl.s32      q2, q2, q15           @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
    vshl.s32      q3, q3, q15           @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15

    vqrshrn.s32   d0, q0, #0x4          @ D0 = c[i] = ((q[i] + 8) >> 4) where i = 0..3
    vqrshrn.s32   d1, q1, #0x4          @ D1 = c[i] = ((q[i] + 8) >> 4) where i = 4..7
    vqrshrn.s32   d2, q2, #0x4          @ D2 = c[i] = ((q[i] + 8) >> 4) where i = 8..11
    vqrshrn.s32   d3, q3, #0x4          @ D3 = c[i] = ((q[i] + 8) >> 4) where i = 12..15

    ldrsh         r9, [r8]              @ Loads signed halfword pi2_dc_src[0]
    vmov.16       d0[0], r9             @ Restore the DC value, since this is the chroma iq-it path
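
@ Unlike the luma 4x4 path, the chroma DC coefficient is never produced by the
@ AC dequant above: in the H.264 chroma flow it is handled by a separate DC
@ transform path, and its already-processed value is read back from
@ pi2_dc_src[0] unconditionally. In C terms this is just the following, shown
@ as a hedged sketch (illustrative helper name; pi2_coeff is the dequant
@ output buffer of the sketch above):
@
@ static void chroma_dc_restore_ref(WORD16 *pi2_coeff,
@                                   const WORD16 *pi2_dc_src)
@ {
@     /* c[0] comes from the separately processed chroma DC path */
@     pi2_coeff[0] = pi2_dc_src[0];
@ }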

@========= PROCESS IDCT FROM HERE =======
@Steps for Stage 1:
@------------------
    vld2.8        {d28, d29}, [r1], r3  @I row Load pu1_pred buffer (de-interleave U/V)
    vadd.s16      d4, d0, d2            @x0 = q0 + q1;

    vsub.s16      d5, d0, d2            @x1 = q0 - q1;

    vshr.s16      d8, d1, #1            @q0 >> 1
    vshr.s16      d9, d3, #1            @q1 >> 1

    vsub.s16      d6, d8, d3            @x2 = (q0 >> 1) - q1;
    vadd.s16      d7, d1, d9            @x3 = q0 + (q1 >> 1);
    vld2.8        {d29, d30}, [r1], r3  @II row Load pu1_pred buffer

    vswp          d6, d7                @Reverse positions of x2 and x3

    vsub.s16      q6, q2, q3            @x0 - x3 and x1 - x2 combined
    vtrn.32       d28, d29              @D28 -- rows I and II of pu1_pred buffer
    vadd.s16      q5, q2, q3            @x0 + x3 and x1 + x2 combined

    vld2.8        {d29, d30}, [r1], r3  @III row Load pu1_pred buffer

    vswp          d12, d13
@Steps for Stage 2:
@------------------
    vtrn.16       d10, d11
    vtrn.16       d12, d13
    vtrn.32       d10, d12
    vtrn.32       d11, d13
    vadd.s16      d14, d10, d12         @x0 = q0 + q1;

    vsub.s16      d15, d10, d12         @x1 = q0 - q1;

    vshr.s16      d18, d11, #1          @q0 >> 1
    vshr.s16      d19, d13, #1          @q1 >> 1

    vsub.s16      d16, d18, d13         @x2 = (q0 >> 1) - q1;
    vadd.s16      d17, d11, d19         @x3 = q0 + (q1 >> 1);

    vld2.8        {d30, d31}, [r1], r3  @IV row Load pu1_pred buffer
    vswp          d16, d17              @Reverse positions of x2 and x3

    vsub.s16      q11, q7, q8           @x0 - x3 and x1 - x2 combined
    vtrn.32       d29, d30              @D29 -- rows III and IV of pu1_pred buffer
    vadd.s16      q10, q7, q8           @x0 + x3 and x1 + x2 combined

    vswp          d22, d23

    vrshr.s16     q10, q10, #6          @Rounding right shift by 6
    vrshr.s16     q11, q11, #6

    vaddw.u8      q10, q10, d28         @Add the prediction rows
    vaddw.u8      q11, q11, d29

    vld1.u8       d0, [r2], r4          @Load the out buffer (interleaved U/V rows)
    vld1.u8       d1, [r2], r4
    vld1.u8       d2, [r2], r4
    vld1.u8       d3, [r2], r4

    sub           r2, r2, r4, lsl #2    @Rewind the out pointer by four rows

    vqmovun.s16   d20, q10              @Saturate the reconstructed samples to 8 bit
    vqmovun.s16   d22, q11

    vmovl.u8      q10, d20              @Widen the samples back to 16 bit
    vmovl.u8      q11, d22              @so that VBIT can copy alternate bytes

    vmov.u16      q14, #0x00ff          @Mask selecting the LSB (this plane's bytes)

    vbit.u8       q0, q10, q14          @Insert this plane's bytes into the interleaved rows
    vbit.u8       q1, q11, q14

    vst1.u8       d0, [r2], r4
    vst1.u8       d1, [r2], r4
    vst1.u8       d2, [r2], r4
    vst1.u8       d3, [r2]

    vpop          {d8-d15}
    ldmfd         sp!, {r4-r12, r15}    @Reload the registers from sp and return
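
@ Chroma is stored interleaved (Cb and Cr alternate bytes), so the block above
@ reconstructs one plane and merges it into the existing output rows: the rows
@ are loaded, the new samples are widened so each occupies the low byte of a
@ 16-bit lane, and VBIT with an 0x00ff lane mask overwrites only this plane's
@ bytes. A hedged C sketch of the same merge (illustrative helper name; the
@ caller points pu1_out at byte 0 or 1 to pick the plane):
@
@ static void recon_chroma_merge_ref(UWORD8 *pu1_out, WORD32 out_strd,
@                                    const UWORD8 *pu1_rec /* 4x4 samples */)
@ {
@     WORD32 i, j;
@     for(i = 0; i < 4; i++)
@         for(j = 0; j < 4; j++)
@             /* write every other byte; the interleaved partner plane
@                (at the other offsets) is left untouched */
@             pu1_out[i * out_strd + 2 * j] = pu1_rec[i * 4 + j];
@ }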

@*
@ *******************************************************************************
@ *
@ * @brief
@ *  This function performs inverse quantization and the inverse transform of
@ *  type Ci8 on an 8x8 block
@ *
@ * @par Description:
@ *  Performs the inverse transform Ci8 and adds the prediction to the residue
@ *  to get the reconstructed block
@ *
@ * @param[in] pi2_src
@ *  Input 8x8 coefficients
@ *
@ * @param[in] pu1_pred
@ *  Prediction 8x8 block
@ *
@ * @param[out] pu1_out
@ *  Output 8x8 block
@ *
@ * @param[in] pred_strd
@ *  Prediction stride
@ *
@ * @param[in] out_strd
@ *  Output stride
@ *
@ * @param[in] pu2_iscal_mat
@ *  Pointer to the inverse quantization matrix
@ *
@ * @param[in] pu2_weigh_mat
@ *  Pointer to the weight matrix
@ *
@ * @param[in] u4_qp_div_6
@ *  QP divided by 6
@ *
@ * @param[in] pi4_tmp
@ *  Temporary buffer of size 1x64
@ *
@ * @returns  Void
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@ *
@void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
@                                   UWORD8 *pu1_pred,
@                                   UWORD8 *pu1_out,
@                                   WORD32 pred_strd,
@                                   WORD32 out_strd,
@                                   const UWORD16 *pu2_iscal_mat,
@                                   const UWORD16 *pu2_weigh_mat,
@                                   UWORD32 u4_qp_div_6,
@                                   WORD32 *pi4_tmp,
@                                   WORD32 iq_start_idx)
@**************Variables Vs Registers*****************************************
@r0 => *pi2_src
@r1 => *pu1_pred
@r2 => *pu1_out
@r3 => pred_strd
@r4 => out_strd
@r5 => *pu2_iscal_mat
@r6 => *pu2_weigh_mat
@r7 => u4_qp_div_6


    .global ih264_iquant_itrans_recon_8x8_a9
ih264_iquant_itrans_recon_8x8_a9:

    stmfd         sp!, {r4-r12, r14}    @stack stores the values of the arguments
    ldr           r7, [sp, #52]         @Loads u4_qp_div_6
    ldr           r4, [sp, #40]         @Loads out_strd

    ldr           r5, [sp, #44]         @Loads *pu2_iscal_mat
    ldr           r6, [sp, #48]         @Loads *pu2_weigh_mat
    vdup.s32      q15, r7               @Populate u4_qp_div_6 in Q15
    vpush         {d8-d15}

idct_8x8_begin:

@========= DEQUANT FROM HERE ===========

    vld1.32       {q13}, [r5]!          @ Q13 = inverse scale values row 0
    vld1.32       {q10}, [r6]!          @ Q10 = weight matrix row 0
    vld1.32       {q14}, [r5]!          @ Q14 = inverse scale values row 1
    vmul.s16      q10, q10, q13         @ Q10 = x[i] = (scale[i] * dequant[i]) where i = 0..7
    vld1.32       {q11}, [r6]!          @ Q11 = weight matrix row 1
    vld1.32       {q8}, [r0]!           @ Q8 = Source row 0
    vmul.s16      q11, q11, q14         @ Q11 = x[i] = (scale[i] * dequant[i]) where i = 8..15
    vmull.s16     q0, d16, d20          @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
    vld1.32       {q9}, [r0]!           @ Q9 = Source row 1
    vmull.s16     q1, d17, d21          @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
    vmull.s16     q2, d18, d22          @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
    vld1.32       {q13}, [r6]!          @ Weight matrix row 2
    vmull.s16     q3, d19, d23          @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
    vld1.32       {q14}, [r6]!          @ Weight matrix row 3
    vshl.s32      q0, q0, q15           @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3
    vld1.32       {q10}, [r5]!          @ Q10 = inverse scale values row 2
    vshl.s32      q1, q1, q15           @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7
    vld1.32       {q8}, [r0]!           @ Source row 2
    vshl.s32      q2, q2, q15           @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11
    vld1.32       {q11}, [r5]!          @ Q11 = inverse scale values row 3
    vshl.s32      q3, q3, q15           @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15
    vld1.32       {q9}, [r0]!           @ Source row 3
    vmul.s16      q10, q10, q13         @ Scale row 2 * weight row 2
    vmul.s16      q11, q11, q14         @ Scale row 3 * weight row 3
    vld1.32       {q4}, [r6]!           @ Weight matrix row 4
    vqrshrn.s32   d0, q0, #0x6          @ D0 = c[i] = ((q[i] + 32) >> 6) where i = 0..3
    vqrshrn.s32   d1, q1, #0x6          @ D1 = c[i] = ((q[i] + 32) >> 6) where i = 4..7
    vld1.32       {q5}, [r6]!           @ Weight matrix row 5
    vqrshrn.s32   d2, q2, #0x6          @ D2 = c[i] = ((q[i] + 32) >> 6) where i = 8..11
    vqrshrn.s32   d3, q3, #0x6          @ D3 = c[i] = ((q[i] + 32) >> 6) where i = 12..15
    vld1.32       {q13}, [r5]!          @ Q13 = inverse scale values row 4
    vmull.s16     q2, d16, d20          @ p[i] = (x[i] * trns_coeff[i]) where i = 16..19
    vmull.s16     q3, d17, d21          @ p[i] = (x[i] * trns_coeff[i]) where i = 20..23
    vld1.32       {q12}, [r5]!          @ Q12 = inverse scale values row 5
    vmull.s16     q6, d18, d22          @ p[i] = (x[i] * trns_coeff[i]) where i = 24..27
    vmull.s16     q7, d19, d23          @ p[i] = (x[i] * trns_coeff[i]) where i = 28..31

    vld1.32       {q14}, [r0]!          @ Source row 4
    vmul.s16      q10, q4, q13          @ Scale row 4 * weight row 4
    vmul.s16      q11, q5, q12          @ Scale row 5 * weight row 5
    vld1.32       {q9}, [r0]!           @ Source row 5
    vshl.s32      q2, q2, q15           @
    vshl.s32      q3, q3, q15           @
    vld1.32       {q13}, [r6]!          @ Weight matrix row 6
    vshl.s32      q6, q6, q15           @
    vshl.s32      q7, q7, q15           @
    vmull.s16     q4, d28, d20          @ i = 32..35
    vqrshrn.s32   d4, q2, #0x6          @ D4 = c[i] = ((q[i] + 32) >> 6) where i = 16..19
    vqrshrn.s32   d5, q3, #0x6          @ D5 = c[i] = ((q[i] + 32) >> 6) where i = 20..23
    vmull.s16     q5, d29, d21          @ i = 36..39
    vld1.32       {q10}, [r5]!          @ Inverse scale values row 6
    vqrshrn.s32   d6, q6, #0x6          @ D6 = c[i] = ((q[i] + 32) >> 6) where i = 24..27
    vqrshrn.s32   d7, q7, #0x6          @ D7 = c[i] = ((q[i] + 32) >> 6) where i = 28..31
    vld1.32       {q14}, [r6]!          @ Weight matrix row 7
    vmull.s16     q6, d18, d22          @
    vld1.32       {q8}, [r0]!           @ Source row 6
    vmull.s16     q7, d19, d23          @
    vld1.32       {q11}, [r5]!          @ Inverse scale values row 7
    vshl.s32      q4, q4, q15           @
    vld1.32       {q9}, [r0]!           @ Source row 7
    vshl.s32      q5, q5, q15           @

    vshl.s32      q6, q6, q15           @
    vshl.s32      q7, q7, q15           @
    vmul.s16      q10, q10, q13         @ Scale row 6 * weight row 6
    vmul.s16      q11, q11, q14         @ Scale row 7 * weight row 7
    vqrshrn.s32   d8, q4, #0x6          @ D8 = c[i] = ((q[i] + 32) >> 6) where i = 32..35
    vqrshrn.s32   d9, q5, #0x6          @ D9 = c[i] = ((q[i] + 32) >> 6) where i = 36..39
    vqrshrn.s32   d10, q6, #0x6         @ D10 = c[i] = ((q[i] + 32) >> 6) where i = 40..43
    vqrshrn.s32   d11, q7, #0x6         @ D11 = c[i] = ((q[i] + 32) >> 6) where i = 44..47
    vmull.s16     q6, d16, d20          @ i = 48..51
    vmull.s16     q7, d17, d21          @ i = 52..55
    vmull.s16     q8, d18, d22          @ i = 56..59
    vmull.s16     q9, d19, d23          @ i = 60..63
    vshl.s32      q6, q6, q15           @
    vzip.s16      q0, q1                @Transpose
    vshl.s32      q7, q7, q15           @
    vshl.s32      q8, q8, q15           @
    vzip.s16      q2, q3                @
    vshl.s32      q9, q9, q15           @
    vqrshrn.s32   d12, q6, #0x6         @ D12 = c[i] = ((q[i] + 32) >> 6) where i = 48..51
    vzip.s16      q4, q5                @Transpose
    vqrshrn.s32   d13, q7, #0x6         @ D13 = c[i] = ((q[i] + 32) >> 6) where i = 52..55
    vqrshrn.s32   d14, q8, #0x6         @ D14 = c[i] = ((q[i] + 32) >> 6) where i = 56..59
    vzip.s32      q0, q2                @Transpose
    vqrshrn.s32   d15, q9, #0x6         @ D15 = c[i] = ((q[i] + 32) >> 6) where i = 60..63
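
@ The 8x8 dequant uses the same scale*weight multiply and QP/6 left shift as
@ the 4x4 path, but rounds with (q + 32) >> 6 (VQRSHRN #6). A minimal C
@ sketch, assuming the codec typedefs and an illustrative helper name:
@
@ static void iquant_8x8_ref(const WORD16 *pi2_src, WORD16 *pi2_coeff,
@                            const UWORD16 *pu2_iscal_mat,
@                            const UWORD16 *pu2_weigh_mat,
@                            UWORD32 u4_qp_div_6)
@ {
@     WORD32 i;
@     for(i = 0; i < 64; i++)
@     {
@         WORD32 q = (pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i])
@                        << u4_qp_div_6;
@         pi2_coeff[i] = (WORD16)((q + 32) >> 6);
@     }
@ }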

@========= PROCESS IDCT FROM HERE =======

@ TRANSPOSE 8x8 coeffs to actual (row) order

    vzip.s16      q6, q7                @

    vzip.s32      q1, q3                @
    vzip.s32      q4, q6                @
    vzip.s32      q5, q7                @

    vswp          d1, d8                @ Q0/Q1 = Row order x0/x1
    vswp          d3, d10               @ Q2/Q3 = Row order x2/x3
    vswp          d5, d12               @ Q4/Q5 = Row order x4/x5
    vswp          d7, d14               @ Q6/Q7 = Row order x6/x7

    vswp          q1, q4                @
    vshr.s16      q10, q2, #0x1         @
    vswp          q3, q6                @

@Steps for Stage 1:
@------------------

    vadd.s16      q8, q0, q4            @ Q8 = y0
    vsub.s16      q9, q0, q4            @ Q9 = y2

    vsra.s16      q2, q6, #0x1          @ Q2 = y6
    vsub.s16      q6, q10, q6           @ Q6 = y4

    vaddl.s16     q12, d14, d2          @ y3 (0-3) 1+7
    vaddl.s16     q13, d15, d3          @ y3 (4-7) 1+7

    vsubl.s16     q10, d14, d2          @ y5 (0-3) 7-1
    vsubl.s16     q11, d15, d3          @ y5 (4-7) 7-1

    vadd.s16      q0, q8, q2            @ Q0 = z0
    vsub.s16      q4, q8, q2            @ Q4 = z6

    vadd.s16      q8, q9, q6            @ Q8 = z2
    vsub.s16      q2, q9, q6            @ Q2 = z4

    vsubw.s16     q12, q12, d6          @ y3 (0-3) 1+7-3
    vsubw.s16     q13, q13, d7          @ y3 (4-7) 1+7-3

    vshr.s16      q6, q3, #0x1          @

    vaddw.s16     q10, q10, d10         @
    vaddw.s16     q11, q11, d11         @

    vshr.s16      q9, q5, #0x1          @

    vsubw.s16     q12, q12, d12         @
    vsubw.s16     q13, q13, d13         @

    vaddw.s16     q10, q10, d18         @
    vaddw.s16     q11, q11, d19         @

    vqmovn.s32    d12, q12              @
    vaddl.s16     q12, d10, d6          @
    vqmovn.s32    d13, q13              @ Q6 = y3
    vaddl.s16     q13, d11, d7          @
    vqmovn.s32    d18, q10              @
    vsubl.s16     q10, d10, d6          @
    vqmovn.s32    d19, q11              @ Q9 = y5
    vsubl.s16     q11, d11, d7          @

    vshr.s16      q3, q6, #0x2          @

    vsra.s16      q6, q9, #0x2          @ Q6 = z3

    vaddw.s16     q12, q12, d2          @
    vaddw.s16     q13, q13, d3          @

    vshr.s16      q1, #0x1              @

    vsub.s16      q5, q3, q9            @ Q5 = z5

    vsubw.s16     q10, q10, d14         @
    vsubw.s16     q11, q11, d15         @

    vshr.s16      q7, #0x1              @

    vaddw.s16     q12, q12, d2          @
    vaddw.s16     q13, q13, d3          @

    vsubw.s16     q10, q10, d14         @
    vsubw.s16     q11, q11, d15         @


    vqmovn.s32    d14, q12              @
    vadd.s16      q1, q8, q5            @ Q1 = x1
    vqmovn.s32    d15, q13              @ Q7 = y7
    vsub.s16      q3, q8, q5            @ Q3 = x6
    vqmovn.s32    d18, q10              @
    vsub.s16      q5, q2, q6            @ Q5 = x5
    vqmovn.s32    d19, q11              @ Q9 = y1
    vadd.s16      q2, q2, q6            @ Q2 = x2

    vshr.s16      q12, q9, #0x2         @
    vsra.s16      q9, q7, #0x2          @ Q9 = z1

    vsub.s16      q11, q7, q12          @ Q11 = z7

    vadd.s16      q6, q4, q9            @ Q6 = x3
    vsub.s16      q4, q4, q9            @ Q4 = x4

    vsub.s16      q7, q0, q11           @ Q7 = x7
    vadd.s16      q0, q0, q11           @ Q0 = x0

    vswp.s16      q3, q6                @ Q3 = x3, Q6 = x6


@Steps for Stage 2:
@------------------

@ TRANSPOSE 8x8 coeffs to actual order

    vzip.s16      q0, q1                @
    vzip.s16      q2, q3                @
    vzip.s16      q4, q5                @
    vzip.s16      q6, q7                @

    vzip.s32      q0, q2                @
    vzip.s32      q1, q3                @
    vzip.s32      q4, q6                @
    vzip.s32      q5, q7                @

    vswp          d1, d8                @ Q0/Q1 = Row order x0/x1
    vswp          d3, d10               @ Q2/Q3 = Row order x2/x3
    vswp          d5, d12               @ Q4/Q5 = Row order x4/x5
    vswp          d7, d14               @ Q6/Q7 = Row order x6/x7

    vswp          q1, q4                @
    vshr.s16      q10, q2, #0x1         @
    vswp          q3, q6                @

@Steps for Stage 3:
@------------------

@Repeat the Stage 1 butterfly for the vertical transform

    vadd.s16      q8, q0, q4            @ Q8 = y0
    vld1.32       d28, [r1], r3         @ Load pu1_pred row 1
    vsub.s16      q9, q0, q4            @ Q9 = y2

    vsra.s16      q2, q6, #0x1          @ Q2 = y6
    vsub.s16      q6, q10, q6           @ Q6 = y4

    vaddl.s16     q12, d14, d2          @
    vld1.32       d29, [r1], r3         @ Load pu1_pred row 2
    vaddl.s16     q13, d15, d3          @

    vsubl.s16     q10, d14, d2          @
    vld1.32       d30, [r1], r3         @ Load pu1_pred row 3
    vsubl.s16     q11, d15, d3          @

    vadd.s16      q0, q8, q2            @ Q0 = z0
    vld1.32       d31, [r1], r3         @ Load pu1_pred row 4
    vsub.s16      q4, q8, q2            @ Q4 = z6

    vadd.s16      q8, q9, q6            @ Q8 = z2
    vsub.s16      q2, q9, q6            @ Q2 = z4

    vsubw.s16     q12, q12, d6          @
    vsubw.s16     q13, q13, d7          @

    vshr.s16      q6, q3, #0x1          @

    vaddw.s16     q10, q10, d10         @
    vaddw.s16     q11, q11, d11         @

    vshr.s16      q9, q5, #0x1          @

    vsubw.s16     q12, q12, d12         @
    vsubw.s16     q13, q13, d13         @

    vaddw.s16     q10, q10, d18         @
    vaddw.s16     q11, q11, d19         @

    vqmovn.s32    d12, q12              @
    vaddl.s16     q12, d10, d6          @
    vqmovn.s32    d13, q13              @ Q6 = y3
    vaddl.s16     q13, d11, d7          @
    vqmovn.s32    d18, q10              @
    vsubl.s16     q10, d10, d6          @
    vqmovn.s32    d19, q11              @ Q9 = y5
    vsubl.s16     q11, d11, d7          @

    vshr.s16      q3, q6, #0x2          @

    vsra.s16      q6, q9, #0x2          @ Q6 = z3

    vaddw.s16     q12, q12, d2          @
    vaddw.s16     q13, q13, d3          @

    vshr.s16      q1, #0x1              @

    vsub.s16      q5, q3, q9            @ Q5 = z5

    vsubw.s16     q10, q10, d14         @
    vsubw.s16     q11, q11, d15         @

    vshr.s16      q7, #0x1              @

    vaddw.s16     q12, q12, d2          @
    vaddw.s16     q13, q13, d3          @

    vsubw.s16     q10, q10, d14         @
    vsubw.s16     q11, q11, d15         @

    vqmovn.s32    d14, q12              @
    vadd.s16      q1, q8, q5            @ Q1 = x1
    vqmovn.s32    d15, q13              @ Q7 = y7
    vsub.s16      q3, q8, q5            @ Q3 = x6
    vqmovn.s32    d18, q10              @
    vsub.s16      q5, q2, q6            @ Q5 = x5
    vqmovn.s32    d19, q11              @ Q9 = y1
    vadd.s16      q2, q2, q6            @ Q2 = x2

    vshr.s16      q12, q9, #0x2         @
    vsra.s16      q9, q7, #0x2          @ Q9 = z1

    vsub.s16      q11, q7, q12          @ Q11 = z7

    vadd.s16      q6, q4, q9            @ Q6 = x3
    vsub.s16      q4, q4, q9            @ Q4 = x4

    vsub.s16      q7, q0, q11           @ Q7 = x7
    vadd.s16      q0, q0, q11           @ Q0 = x0

    vswp.s16      q3, q6                @ Q3 <-> Q6
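
@ Each 8-point stage above implements the H.264 Ci8 inverse-transform
@ butterfly: an even half built from inputs 0/2/4/6 and an odd half built
@ from inputs 1/3/5/7 with >>1 and >>2 correction terms (the y*/z* values in
@ the comments), recombined into x0..x7. A hedged C sketch of one 8-point
@ stage over inputs c[0..7] is given below; the helper name itrans_8_ref is
@ illustrative, and the intermediate names follow the spec flow rather than
@ this file's y/z register naming:
@
@ static void itrans_8_ref(const WORD32 *c, WORD32 *x)
@ {
@     /* even part */
@     WORD32 e0 = c[0] + c[4];
@     WORD32 e2 = c[0] - c[4];
@     WORD32 e4 = (c[2] >> 1) - c[6];
@     WORD32 e6 = c[2] + (c[6] >> 1);
@     WORD32 f0 = e0 + e6, f6 = e0 - e6;
@     WORD32 f2 = e2 + e4, f4 = e2 - e4;
@     /* odd part */
@     WORD32 e1 = -c[3] + c[5] - c[7] - (c[7] >> 1);
@     WORD32 e3 =  c[1] + c[7] - c[3] - (c[3] >> 1);
@     WORD32 e5 = -c[1] + c[7] + c[5] + (c[5] >> 1);
@     WORD32 e7 =  c[3] + c[5] + c[1] + (c[1] >> 1);
@     WORD32 f1 = e1 + (e7 >> 2), f7 = e7 - (e1 >> 2);
@     WORD32 f3 = e3 + (e5 >> 2), f5 = (e3 >> 2) - e5;
@     /* recombination */
@     x[0] = f0 + f7;  x[7] = f0 - f7;
@     x[1] = f2 + f5;  x[6] = f2 - f5;
@     x[2] = f4 + f3;  x[5] = f4 - f3;
@     x[3] = f6 + f1;  x[4] = f6 - f1;
@ }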

    vrshr.s16     q1, q1, #6            @ Rounding right shift by 6
    vld1.32       d16, [r1], r3         @ Load pu1_pred row 5
    vrshr.s16     q2, q2, #6            @
    vrshr.s16     q4, q4, #6            @
    vld1.32       d17, [r1], r3         @ Load pu1_pred row 6
    vrshr.s16     q5, q5, #6            @
    vrshr.s16     q7, q7, #6            @
    vld1.32       d18, [r1], r3         @ Load pu1_pred row 7
    vrshr.s16     q0, q0, #6            @
    vrshr.s16     q3, q3, #6            @
    vld1.32       d19, [r1], r3         @ Load pu1_pred row 8
    vrshr.s16     q6, q6, #6            @

@ Add the prediction rows and saturate to unsigned 8 bit

    vaddw.u8      q0, q0, d28
    vaddw.u8      q1, q1, d29
    vaddw.u8      q2, q2, d30
    vaddw.u8      q3, q3, d31
    vqmovun.s16   d0, q0
    vaddw.u8      q4, q4, d16
    vqmovun.s16   d1, q1
    vaddw.u8      q5, q5, d17
    vqmovun.s16   d2, q2
    vaddw.u8      q6, q6, d18
    vqmovun.s16   d3, q3
    vaddw.u8      q7, q7, d19

    vqmovun.s16   d4, q4
    vst1.32       d0, [r2], r4          @ Store row 1 of the reconstructed block
    vqmovun.s16   d5, q5
    vst1.32       d1, [r2], r4          @ Store row 2 of the reconstructed block
    vqmovun.s16   d6, q6
    vst1.32       d2, [r2], r4          @ Store row 3 of the reconstructed block
    vqmovun.s16   d7, q7
    vst1.32       d3, [r2], r4          @ Store row 4 of the reconstructed block
    vst1.32       d4, [r2], r4          @ Store row 5 of the reconstructed block

    vst1.32       d5, [r2], r4          @ Store row 6 of the reconstructed block

    vst1.32       d6, [r2], r4          @ Store row 7 of the reconstructed block

    vst1.32       d7, [r2], r4          @ Store row 8 of the reconstructed block

idct_8x8_end:

    vpop          {d8-d15}
    ldmfd         sp!, {r4-r12, r15}
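
@ The tail of the function rounds the doubly transformed residue with
@ (r + 32) >> 6, adds the prediction, and saturates each sample to [0, 255]
@ (VADDW.U8 followed by VQMOVUN). A hedged C sketch of this reconstruction
@ step for one 8x8 block (illustrative helper name, codec typedefs assumed):
@
@ static void recon_8x8_ref(const WORD32 *pi4_res /* 64 residue values */,
@                           const UWORD8 *pu1_pred, WORD32 pred_strd,
@                           UWORD8 *pu1_out, WORD32 out_strd)
@ {
@     WORD32 i, j;
@     for(i = 0; i < 8; i++)
@     {
@         for(j = 0; j < 8; j++)
@         {
@             WORD32 val = ((pi4_res[i * 8 + j] + 32) >> 6)
@                              + pu1_pred[i * pred_strd + j];
@             /* clip to the 8-bit sample range */
@             if(val < 0)   val = 0;
@             if(val > 255) val = 255;
@             pu1_out[i * out_strd + j] = (UWORD8)val;
@         }
@     }
@ }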