;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;TODO(cd): adjust these constants to be able to use vqdmulh for faster
;          dct_const_round_shift(a * b) within butterfly calculations.
cospi_1_64  EQU 16364
cospi_2_64  EQU 16305
cospi_3_64  EQU 16207
cospi_4_64  EQU 16069
cospi_5_64  EQU 15893
cospi_6_64  EQU 15679
cospi_7_64  EQU 15426
cospi_8_64  EQU 15137
cospi_9_64  EQU 14811
cospi_10_64 EQU 14449
cospi_11_64 EQU 14053
cospi_12_64 EQU 13623
cospi_13_64 EQU 13160
cospi_14_64 EQU 12665
cospi_15_64 EQU 12140
cospi_16_64 EQU 11585
cospi_17_64 EQU 11003
cospi_18_64 EQU 10394
cospi_19_64 EQU  9760
cospi_20_64 EQU  9102
cospi_21_64 EQU  8423
cospi_22_64 EQU  7723
cospi_23_64 EQU  7005
cospi_24_64 EQU  6270
cospi_25_64 EQU  5520
cospi_26_64 EQU  4756
cospi_27_64 EQU  3981
cospi_28_64 EQU  3196
cospi_29_64 EQU  2404
cospi_30_64 EQU  1606
cospi_31_64 EQU   804


    EXPORT  |vp9_idct32x32_1024_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    AREA Block, CODE, READONLY

; --------------------------------------------------------------------------
; Load from transposed_buffer
;   q14 = transposed_buffer[first_offset]
;   q13 = transposed_buffer[second_offset]
; for proper address calculation, the last offset used when manipulating
; transposed_buffer must be passed in. use 0 for first use.
    MACRO
    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
    ; address calculation with proper stride and loading
    add r0, #($first_offset - $prev_offset )*8*2
    vld1.s16 {q14}, [r0]
    add r0, #($second_offset - $first_offset)*8*2
    vld1.s16 {q13}, [r0]
    ; (used) two registers (q14, q13)
    MEND
; --------------------------------------------------------------------------
; Load from output (used as temporary storage)
;   reg1 = output[first_offset]
;   reg2 = output[second_offset]
; for proper address calculation, the last offset used when manipulating
; output (whether reading or storing) must be passed in. use 0 for first
; use.
    MACRO
    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and loading
    add r1, #($first_offset - $prev_offset )*32*2
    vld1.s16 {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vld1.s16 {$reg2}, [r1]
    ; (used) two registers ($reg1, $reg2)
    MEND
; --------------------------------------------------------------------------
; Store into output (sometimes as temporary storage)
;   output[first_offset] = reg1
;   output[second_offset] = reg2
; for proper address calculation, the last offset used when manipulating
; output (whether reading or storing) must be passed in. use 0 for first
; use.
    MACRO
    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and storing
    add r1, #($first_offset - $prev_offset )*32*2
    vst1.16 {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vst1.16 {$reg2}, [r1]
    MEND
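; --------------------------------------------------------------------------
; In C terms, the delta addressing shared by the three macros above is
; roughly the following sketch (row_size is 8 for transposed_buffer and 32
; for output; the pointer is left on the last row touched, which is why
; $prev_offset must be threaded from one invocation to the next):
;   int16_t *p = &buffer[prev_offset * row_size];
;   p += (first_offset  - prev_offset ) * row_size;  /* now at first_offset  */
;   p += (second_offset - first_offset) * row_size;  /* now at second_offset */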
; --------------------------------------------------------------------------
; Combine-add results with current destination content
;   q6-q9 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_CENTER_RESULTS
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d8}, [r10], r2
    vld1.s16 {d11}, [r9], r11
    vld1.s16 {d9}, [r10]
    vld1.s16 {d10}, [r9]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q7, q7, #6
    vrshr.s16 q8, q8, #6
    vrshr.s16 q9, q9, #6
    vrshr.s16 q6, q6, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q7, q7, d9
    vaddw.u8 q8, q8, d10
    vaddw.u8 q9, q9, d11
    vaddw.u8 q6, q6, d8
    ; clip pixel
    vqmovun.s16 d9, q7
    vqmovun.s16 d10, q8
    vqmovun.s16 d11, q9
    vqmovun.s16 d8, q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d9}, [r10], r11
    vst1.16 {d10}, [r9], r2
    vst1.16 {d8}, [r10]
    vst1.16 {d11}, [r9]
    ; update pointers (by dest_stride * 2)
    sub r9, r9, r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
; --------------------------------------------------------------------------
; Combine-add results with current destination content
;   q6-q9 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_CENTER_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d8}, [r10], r2
    vld1.s16 {d11}, [r9], r11
    vld1.s16 {d9}, [r10]
    vld1.s16 {d10}, [r9]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q7, q7, #6
    vrshr.s16 q8, q8, #6
    vrshr.s16 q9, q9, #6
    vrshr.s16 q6, q6, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q7, q7, d9
    vaddw.u8 q8, q8, d10
    vaddw.u8 q9, q9, d11
    vaddw.u8 q6, q6, d8
    ; clip pixel
    vqmovun.s16 d9, q7
    vqmovun.s16 d10, q8
    vqmovun.s16 d11, q9
    vqmovun.s16 d8, q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d9}, [r10], r11
    vst1.16 {d10}, [r9], r2
    vst1.16 {d8}, [r10]!
    vst1.16 {d11}, [r9]!
    ; update pointers (by dest_stride * 2)
    sub r9, r9, r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
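; --------------------------------------------------------------------------
; A minimal C sketch of the combine-add done by the STORE_COMBINE_* macros
; (the two above and the two *_EXTREME_* variants below, which follow the
; same pattern with different registers). vrshr, vaddw.u8 and vqmovun
; implement the rounding, widening add and clipping in one instruction
; each; per invocation this runs over two 16-pixel destination rows:
;   for (i = 0; i < 16; i++) {
;     int rounded = (out[j * 32 + i] + (1 << 5)) >> 6;  /* ROUND_POWER_OF_TWO(x, 6) */
;     int sum = dest[j * dest_stride + i] + rounded;
;     dest[j * dest_stride + i] = sum < 0 ? 0 : (sum > 255 ? 255 : sum);  /* clip */
;   }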
; --------------------------------------------------------------------------
; Combine-add results with current destination content
;   q4-q7 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_EXTREME_RESULTS
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d4}, [r7], r2
    vld1.s16 {d7}, [r6], r11
    vld1.s16 {d5}, [r7]
    vld1.s16 {d6}, [r6]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q5, q5, #6
    vrshr.s16 q6, q6, #6
    vrshr.s16 q7, q7, #6
    vrshr.s16 q4, q4, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q5, q5, d5
    vaddw.u8 q6, q6, d6
    vaddw.u8 q7, q7, d7
    vaddw.u8 q4, q4, d4
    ; clip pixel
    vqmovun.s16 d5, q5
    vqmovun.s16 d6, q6
    vqmovun.s16 d7, q7
    vqmovun.s16 d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d5}, [r7], r11
    vst1.16 {d6}, [r6], r2
    vst1.16 {d7}, [r6]
    vst1.16 {d4}, [r7]
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
; --------------------------------------------------------------------------
; Combine-add results with current destination content
;   q4-q7 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d4}, [r7], r2
    vld1.s16 {d7}, [r6], r11
    vld1.s16 {d5}, [r7]
    vld1.s16 {d6}, [r6]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q5, q5, #6
    vrshr.s16 q6, q6, #6
    vrshr.s16 q7, q7, #6
    vrshr.s16 q4, q4, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q5, q5, d5
    vaddw.u8 q6, q6, d6
    vaddw.u8 q7, q7, d7
    vaddw.u8 q4, q4, d4
    ; clip pixel
    vqmovun.s16 d5, q5
    vqmovun.s16 d6, q6
    vqmovun.s16 d7, q7
    vqmovun.s16 d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d5}, [r7], r11
    vst1.16 {d6}, [r6], r2
    vst1.16 {d7}, [r6]!
    vst1.16 {d4}, [r7]!
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
; --------------------------------------------------------------------------
; Touches q8-q12, q15 (q13-q14 are preserved)
; valid output registers are anything but q8-q11
    MACRO
    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    ; TODO(cd): have special case to re-use constants when they are similar
    ; for consecutive butterflies
    ; TODO(cd): have special case when both constants are the same, do the
    ; additions/subtractions before the multiplies.
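    ; A single ARM-mode mov immediate can only encode an 8-bit value
    ; rotated by an even amount, so each 16-bit constant is materialized
    ; below with a mov of its high byte followed by an add of its low byte.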
    ; generate the constants
    ; generate scalar constants
    mov r8, #$first_constant & 0xFF00
    mov r12, #$second_constant & 0xFF00
    add r8, #$first_constant & 0x00FF
    add r12, #$second_constant & 0x00FF
    ; generate vector constants
    vdup.16 d30, r8
    vdup.16 d31, r12
    ; (used) two for inputs (regA-regD), one for constants (q15)
    ; do some multiplications (ordered for maximum latency hiding)
    vmull.s16 q8, $regC, d30
    vmull.s16 q10, $regA, d31
    vmull.s16 q9, $regD, d30
    vmull.s16 q11, $regB, d31
    vmull.s16 q12, $regC, d31
    ; (used) five for intermediate (q8-q12), one for constants (q15)
    ; do some additions/subtractions (to get back to two registers)
    vsub.s32 q8, q8, q10
    vsub.s32 q9, q9, q11
    ; do more multiplications (ordered for maximum latency hiding)
    vmull.s16 q10, $regD, d31
    vmull.s16 q11, $regA, d30
    vmull.s16 q15, $regB, d30
    ; (used) six for intermediate (q8-q12, q15)
    ; do more additions/subtractions
    vadd.s32 q11, q12, q11
    vadd.s32 q10, q10, q15
    ; (used) four for intermediate (q8-q11)
    ; dct_const_round_shift
    vqrshrn.s32 $reg1, q8, #14
    vqrshrn.s32 $reg2, q9, #14
    vqrshrn.s32 $reg3, q11, #14
    vqrshrn.s32 $reg4, q10, #14
    ; (used) two q registers for results (i.e. four d registers)
    MEND
; --------------------------------------------------------------------------
; Touches q8-q12, q15 (q13-q14 are preserved)
; valid output registers are anything but q8-q11
    MACRO
    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    MEND
; --------------------------------------------------------------------------
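; In C terms, each DO_BUTTERFLY invocation is the usual cross-multiply pair
; followed by dct_const_round_shift; a sketch, where in1 stands for the
; regC:regD input and in2 for the regA:regB input (illustrative names, not
; from the C source):
;   int32_t temp1 = in1 * first_constant  - in2 * second_constant;
;   int32_t temp2 = in1 * second_constant + in2 * first_constant;
;   out1 = dct_const_round_shift(temp1);  /* (temp1 + (1 << 13)) >> 14 */
;   out2 = dct_const_round_shift(temp2);  /* done by vqrshrn.s32 #14,  */
;                                         /* which saturates on narrowing */
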
;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
;
; r0  int16_t *input,
; r1  uint8_t *dest,
; r2  int dest_stride)
; loop counters
; r4  bands loop counter
; r5  pass loop counter
; r8  transpose loop counter
; combine-add pointers
; r6  dest + 31 * dest_stride, descending (30, 29, 28, ...)
; r7  dest +  0 * dest_stride, ascending (1, 2, 3, ...)
; r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...)

|vp9_idct32x32_1024_add_neon| PROC
    ; This function does one pass of idct32x32 transform.
    ;
    ; This is done by transposing the input and then doing a 1d transform on
    ; columns. In the first pass, the transposed columns are the original
    ; rows. In the second pass, after the transposition, the columns are the
    ; original columns.
    ; The 1d transform is done by looping over bands of eight columns (the
    ; idct32_bands loop). For each band, the transform input transposition
    ; is done on demand, one band of four 8x8 matrices at a time. The four
    ; matrices are transposed by pairs (the idct32_transpose_pair loop).
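    ;
    ; In C terms, the control flow below is roughly the following sketch
    ; (function names are illustrative only; the labels in the trailing
    ; comments are the real loops):
    ;   for (pass = 0; pass < 2; pass++) {         /* idct32_pass_loop */
    ;     for (band = 0; band < 4; band++) {       /* idct32_bands_loop */
    ;       for (pair = 0; pair < 2; pair++)       /* idct32_transpose_pair_loop */
    ;         transpose_8x8_pair(input, transpose_buffer);
    ;       idct32_1d_band(transpose_buffer, output);  /* blocks A-D below */
    ;     }
    ;     input = pass1; output = pass2;           /* pass 2 consumes pass 1 */
    ;   }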
    push {r4-r11}
    vpush {d8-d15}
    ; stack operation
    ; internal buffer to transpose 8 lines into before transforming them
    ; int16_t transpose_buffer[32 * 8];
    ; at sp + [4096, 4607]
    ; results of the first pass (transpose and transform rows)
    ; int16_t pass1[32 * 32];
    ; at sp + [0, 2047]
    ; results of the second pass (transpose and transform columns)
    ; int16_t pass2[32 * 32];
    ; at sp + [2048, 4095]
    sub sp, sp, #512+2048+2048

    ; r6  = dest + 31 * dest_stride
    ; r7  = dest +  0 * dest_stride
    ; r9  = dest + 15 * dest_stride
    ; r10 = dest + 16 * dest_stride
    rsb r6, r2, r2, lsl #5
    rsb r9, r2, r2, lsl #4
    add r10, r1, r2, lsl #4
    mov r7, r1
    add r6, r6, r1
    add r9, r9, r1
    ; r11 = -dest_stride
    neg r11, r2
    ; r3 = input
    mov r3, r0
    ; parameters for first pass
    ; r0 = transpose_buffer[32 * 8]
    add r0, sp, #4096
    ; r1 = pass1[32 * 32]
    mov r1, sp

    mov r5, #0 ; initialize pass loop counter
idct32_pass_loop
    mov r4, #4 ; initialize bands loop counter
idct32_bands_loop
    mov r8, #2 ; initialize transpose loop counter
idct32_transpose_pair_loop
    ; Load two horizontally consecutive 8x8 16-bit data matrices. The first
    ; one into q0-q7 and the second one into q8-q15. There is a stride of
    ; 64, adjusted to 32 because of the two post-increments.
    vld1.s16 {q8}, [r3]!
    vld1.s16 {q0}, [r3]!
    add r3, #32
    vld1.s16 {q9}, [r3]!
    vld1.s16 {q1}, [r3]!
    add r3, #32
    vld1.s16 {q10}, [r3]!
    vld1.s16 {q2}, [r3]!
    add r3, #32
    vld1.s16 {q11}, [r3]!
    vld1.s16 {q3}, [r3]!
    add r3, #32
    vld1.s16 {q12}, [r3]!
    vld1.s16 {q4}, [r3]!
    add r3, #32
    vld1.s16 {q13}, [r3]!
    vld1.s16 {q5}, [r3]!
    add r3, #32
    vld1.s16 {q14}, [r3]!
    vld1.s16 {q6}, [r3]!
    add r3, #32
    vld1.s16 {q15}, [r3]!
    vld1.s16 {q7}, [r3]!

    ; Transpose the two 8x8 16-bit data matrices.
    vswp d17, d24
    vswp d23, d30
    vswp d21, d28
    vswp d19, d26
    vswp d1, d8
    vswp d7, d14
    vswp d5, d12
    vswp d3, d10
    vtrn.32 q8, q10
    vtrn.32 q9, q11
    vtrn.32 q12, q14
    vtrn.32 q13, q15
    vtrn.32 q0, q2
    vtrn.32 q1, q3
    vtrn.32 q4, q6
    vtrn.32 q5, q7
    vtrn.16 q8, q9
    vtrn.16 q10, q11
    vtrn.16 q12, q13
    vtrn.16 q14, q15
    vtrn.16 q0, q1
    vtrn.16 q2, q3
    vtrn.16 q4, q5
    vtrn.16 q6, q7
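    ; In C terms, each 8x8 transpose above computes the plain
    ;   for (r = 0; r < 8; r++)
    ;     for (c = r + 1; c < 8; c++)
    ;       SWAP(m[r][c], m[c][r]);
    ; but hierarchically: vswp exchanges the off-diagonal 4x4 blocks (as
    ; 64-bit half-rows), vtrn.32 transposes the 2x2 blocks of 32-bit pairs,
    ; and vtrn.16 finishes at the 16-bit element level.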

    ; Store both matrices after each other. There is a stride of 32, which
    ; adjusts to nothing because of the post-increments.
    vst1.16 {q8}, [r0]!
    vst1.16 {q9}, [r0]!
    vst1.16 {q10}, [r0]!
    vst1.16 {q11}, [r0]!
    vst1.16 {q12}, [r0]!
    vst1.16 {q13}, [r0]!
    vst1.16 {q14}, [r0]!
    vst1.16 {q15}, [r0]!
    vst1.16 {q0}, [r0]!
    vst1.16 {q1}, [r0]!
    vst1.16 {q2}, [r0]!
    vst1.16 {q3}, [r0]!
    vst1.16 {q4}, [r0]!
    vst1.16 {q5}, [r0]!
    vst1.16 {q6}, [r0]!
    vst1.16 {q7}, [r0]!

    ; increment pointers by adjusted stride (not necessary for r0/out)
    ; go back by 7*32 for the seven lines moved fully by read and add
    ; go back by 32 for the eighth line that was only read
    ; advance by 16*2 to go to the next pair
    sub r3, r3, #7*32*2 + 32 - 16*2
    ; transpose pair loop processing
    subs r8, r8, #1
    bne idct32_transpose_pair_loop

    ; restore r0/input to its original value
    sub r0, r0, #32*8*2

    ; Instead of doing the transforms stage by stage, it is done by loading
    ; some input values and doing as many stages as possible to minimize the
    ; storing/loading of intermediate results. To fit within registers, the
    ; final coefficients are cut into four blocks:
    ; BLOCK A: 16-19,28-31
    ; BLOCK B: 20-23,24-27
    ; BLOCK C: 8-10,11-15
    ; BLOCK D: 0-3,4-7
    ; Blocks A and C are straight calculation through the various stages. In
    ; block B, further calculations are performed using the results from
    ; block A. In block D, further calculations are performed using the
    ; results from block C, and then the final calculations are done using
    ; results from blocks A and B, which have been combined at the end of
    ; block B.

; --------------------------------------------------------------------------
; BLOCK A: 16-19,28-31
; --------------------------------------------------------------------------
; generate 16,17,30,31
; --------------------------------------------------------------------------
; part of stage 1
;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64;
;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64;
;step1b[16][i] = dct_const_round_shift(temp1);
;step1b[31][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 0, 1, 31
    DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
; --------------------------------------------------------------------------
; part of stage 1
;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
;step1b[17][i] = dct_const_round_shift(temp1);
;step1b[30][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 31, 17, 15
    DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
; --------------------------------------------------------------------------
; part of stage 2
;step2[16] = step1b[16][i] + step1b[17][i];
;step2[17] = step1b[16][i] - step1b[17][i];
;step2[30] = -step1b[30][i] + step1b[31][i];
;step2[31] = step1b[30][i] + step1b[31][i];
    vadd.s16 q4, q0, q1
    vsub.s16 q13, q0, q1
    vadd.s16 q6, q2, q3
    vsub.s16 q14, q2, q3
; --------------------------------------------------------------------------
; part of stage 3
;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64;
;step3[17] = dct_const_round_shift(temp1);
;step3[30] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
; --------------------------------------------------------------------------
; generate 18,19,28,29
; --------------------------------------------------------------------------
; part of stage 1
;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64;
;step1b[18][i] = dct_const_round_shift(temp1);
;step1b[29][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 15, 9, 23
    DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
; --------------------------------------------------------------------------
; part of stage 1
;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64;
;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
;step1b[19][i] = dct_const_round_shift(temp1);
;step1b[28][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 23, 25, 7
    DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
; --------------------------------------------------------------------------
; part of stage 2
;step2[18] = -step1b[18][i] + step1b[19][i];
;step2[19] = step1b[18][i] + step1b[19][i];
;step2[28] = step1b[28][i] + step1b[29][i];
;step2[29] = step1b[28][i] - step1b[29][i];
    vsub.s16 q13, q3, q2
    vadd.s16 q3, q3, q2
    vsub.s16 q14, q1, q0
    vadd.s16 q2, q1, q0
; --------------------------------------------------------------------------
; part of stage 3
;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64);
;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
;step3[29] = dct_const_round_shift(temp1);
;step3[18] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
; --------------------------------------------------------------------------
; combine 16-19,28-31
; --------------------------------------------------------------------------
; part of stage 4
;step1[16] = step1b[16][i] + step1b[19][i];
;step1[17] = step1b[17][i] + step1b[18][i];
;step1[18] = step1b[17][i] - step1b[18][i];
;step1[29] = step1b[30][i] - step1b[29][i];
;step1[30] = step1b[30][i] + step1b[29][i];
;step1[31] = step1b[31][i] + step1b[28][i];
    vadd.s16 q8, q4, q2
    vadd.s16 q9, q5, q0
    vadd.s16 q10, q7, q1
    vadd.s16 q15, q6, q3
    vsub.s16 q13, q5, q0
    vsub.s16 q14, q7, q1
    STORE_IN_OUTPUT 0, 16, 31, q8, q15
    STORE_IN_OUTPUT 31, 17, 30, q9, q10
; --------------------------------------------------------------------------
; part of stage 5
;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64;
;step2[18] = dct_const_round_shift(temp1);
;step2[29] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
    STORE_IN_OUTPUT 30, 29, 18, q1, q0
; --------------------------------------------------------------------------
; part of stage 4
;step1[19] = step1b[16][i] - step1b[19][i];
;step1[28] = step1b[31][i] - step1b[28][i];
    vsub.s16 q13, q4, q2
    vsub.s16 q14, q6, q3
; --------------------------------------------------------------------------
; part of stage 5
;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64;
;step2[19] = dct_const_round_shift(temp1);
;step2[28] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
    STORE_IN_OUTPUT 18, 19, 28, q4, q6
; --------------------------------------------------------------------------


; --------------------------------------------------------------------------
; BLOCK B: 20-23,24-27
; --------------------------------------------------------------------------
; generate 20,21,26,27
; --------------------------------------------------------------------------
; part of stage 1
;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64;
;step1b[20][i] = dct_const_round_shift(temp1);
;step1b[27][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 7, 5, 27
    DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
; --------------------------------------------------------------------------
; part of stage 1
;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
;step1b[21][i] = dct_const_round_shift(temp1);
;step1b[26][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 27, 21, 11
    DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
; --------------------------------------------------------------------------
; part of stage 2
;step2[20] = step1b[20][i] + step1b[21][i];
;step2[21] = step1b[20][i] - step1b[21][i];
;step2[26] = -step1b[26][i] + step1b[27][i];
;step2[27] = step1b[26][i] + step1b[27][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
; --------------------------------------------------------------------------
; part of stage 3
;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
;step3[21] = dct_const_round_shift(temp1);
;step3[26] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
; --------------------------------------------------------------------------
; generate 22,23,24,25
; --------------------------------------------------------------------------
; part of stage 1
;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
;step1b[22][i] = dct_const_round_shift(temp1);
;step1b[25][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 11, 13, 19
    DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
; --------------------------------------------------------------------------
; part of stage 1
;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64;
;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
;step1b[23][i] = dct_const_round_shift(temp1);
;step1b[24][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 19, 29, 3
    DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
; --------------------------------------------------------------------------
; part of stage 2
;step2[22] = -step1b[22][i] + step1b[23][i];
;step2[23] = step1b[22][i] + step1b[23][i];
;step2[24] = step1b[24][i] + step1b[25][i];
;step2[25] = step1b[24][i] - step1b[25][i];
    vsub.s16 q14, q4, q5
    vadd.s16 q5, q4, q5
    vsub.s16 q13, q6, q7
    vadd.s16 q6, q6, q7
; --------------------------------------------------------------------------
; part of stage 3
;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
;step3[25] = dct_const_round_shift(temp1);
;step3[22] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
; --------------------------------------------------------------------------
; combine 20-23,24-27
; --------------------------------------------------------------------------
; part of stage 4
;step1[22] = step1b[22][i] + step1b[21][i];
;step1[23] = step1b[23][i] + step1b[20][i];
    vadd.s16 q10, q7, q1
    vadd.s16 q11, q5, q0
;step1[24] = step1b[24][i] + step1b[27][i];
;step1[25] = step1b[25][i] + step1b[26][i];
    vadd.s16 q12, q6, q2
    vadd.s16 q15, q4, q3
; --------------------------------------------------------------------------
; part of stage 6
;step3[16] = step1b[16][i] + step1b[23][i];
;step3[17] = step1b[17][i] + step1b[22][i];
;step3[22] = step1b[17][i] - step1b[22][i];
;step3[23] = step1b[16][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
    vadd.s16 q8, q14, q11
    vadd.s16 q9, q13, q10
    vsub.s16 q13, q13, q10
    vsub.s16 q11, q14, q11
    STORE_IN_OUTPUT 17, 17, 16, q9, q8
; --------------------------------------------------------------------------
; part of stage 6
;step3[24] = step1b[31][i] - step1b[24][i];
;step3[25] = step1b[30][i] - step1b[25][i];
;step3[30] = step1b[30][i] + step1b[25][i];
;step3[31] = step1b[31][i] + step1b[24][i];
    LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
    vsub.s16 q8, q9, q12
    vadd.s16 q10, q14, q15
    vsub.s16 q14, q14, q15
    vadd.s16 q12, q9, q12
    STORE_IN_OUTPUT 31, 30, 31, q10, q12
; --------------------------------------------------------------------------
; TODO(cd) do some register allocation change to remove these push/pop
    vpush {q8}  ; [24]
    vpush {q11} ; [23]
; --------------------------------------------------------------------------
; part of stage 7
;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
;step1[22] = dct_const_round_shift(temp1);
;step1[25] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 31, 25, 22, q14, q13
; --------------------------------------------------------------------------
; part of stage 7
;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
;step1[23] = dct_const_round_shift(temp1);
;step1[24] = dct_const_round_shift(temp2);
; TODO(cd) do some register allocation change to remove these push/pop
    vpop {q13} ; [23]
    vpop {q14} ; [24]
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 22, 24, 23, q14, q13
; --------------------------------------------------------------------------
; part of stage 4
;step1[20] = step1b[23][i] - step1b[20][i];
;step1[27] = step1b[24][i] - step1b[27][i];
    vsub.s16 q14, q5, q0
    vsub.s16 q13, q6, q2
; --------------------------------------------------------------------------
; part of stage 5
;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64);
;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
;step2[27] = dct_const_round_shift(temp1);
;step2[20] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
; --------------------------------------------------------------------------
; part of stage 4
;step1[21] = step1b[22][i] - step1b[21][i];
;step1[26] = step1b[25][i] - step1b[26][i];
    vsub.s16 q14, q7, q1
    vsub.s16 q13, q4, q3
; --------------------------------------------------------------------------
; part of stage 5
;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64);
;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
;step2[26] = dct_const_round_shift(temp1);
;step2[21] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
; --------------------------------------------------------------------------
; part of stage 6
;step3[18] = step1b[18][i] + step1b[21][i];
;step3[19] = step1b[19][i] + step1b[20][i];
;step3[20] = step1b[19][i] - step1b[20][i];
;step3[21] = step1b[18][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
    vadd.s16 q8, q14, q1
    vadd.s16 q9, q13, q6
    vsub.s16 q13, q13, q6
    vsub.s16 q1, q14, q1
    STORE_IN_OUTPUT 19, 18, 19, q8, q9
; --------------------------------------------------------------------------
; part of stage 6
;step3[27] = step1b[28][i] - step1b[27][i];
;step3[28] = step1b[28][i] + step1b[27][i];
;step3[29] = step1b[29][i] + step1b[26][i];
;step3[26] = step1b[29][i] - step1b[26][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
    vsub.s16 q14, q8, q5
    vadd.s16 q10, q8, q5
    vadd.s16 q11, q9, q0
    vsub.s16 q0, q9, q0
    STORE_IN_OUTPUT 29, 28, 29, q10, q11
; --------------------------------------------------------------------------
; part of stage 7
;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
;step1[20] = dct_const_round_shift(temp1);
;step1[27] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 29, 20, 27, q13, q14
; --------------------------------------------------------------------------
; part of stage 7
;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
;step1[21] = dct_const_round_shift(temp1);
;step1[26] = dct_const_round_shift(temp2);
    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
    STORE_IN_OUTPUT 27, 21, 26, q1, q0
; --------------------------------------------------------------------------


; --------------------------------------------------------------------------
; BLOCK C: 8-10,11-15
; --------------------------------------------------------------------------
; generate 8,9,14,15
; --------------------------------------------------------------------------
; part of stage 2
;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
;step2[8] = dct_const_round_shift(temp1);
;step2[15] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 3, 2, 30
    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
; --------------------------------------------------------------------------
; part of stage 2
;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
;step2[9] = dct_const_round_shift(temp1);
;step2[14] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 30, 18, 14
    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
; --------------------------------------------------------------------------
; part of stage 3
;step3[8] = step1b[8][i] + step1b[9][i];
;step3[9] = step1b[8][i] - step1b[9][i];
;step3[14] = step1b[15][i] - step1b[14][i];
;step3[15] = step1b[15][i] + step1b[14][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
; --------------------------------------------------------------------------
; part of stage 4
;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64;
;step1[9] = dct_const_round_shift(temp1);
;step1[14] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
; --------------------------------------------------------------------------
; generate 10,11,12,13
; --------------------------------------------------------------------------
; part of stage 2
;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
;step2[10] = dct_const_round_shift(temp1);
;step2[13] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 14, 10, 22
    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
; --------------------------------------------------------------------------
; part of stage 2
;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
;step2[11] = dct_const_round_shift(temp1);
;step2[12] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 22, 26, 6
    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
; --------------------------------------------------------------------------
; part of stage 3
;step3[10] = step1b[11][i] - step1b[10][i];
;step3[11] = step1b[11][i] + step1b[10][i];
;step3[12] = step1b[12][i] + step1b[13][i];
;step3[13] = step1b[12][i] - step1b[13][i];
    vsub.s16 q14, q4, q5
    vadd.s16 q5, q4, q5
    vsub.s16 q13, q6, q7
    vadd.s16 q6, q6, q7
; --------------------------------------------------------------------------
; part of stage 4
;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64);
;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
;step1[13] = dct_const_round_shift(temp1);
;step1[10] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
; --------------------------------------------------------------------------
; combine 8-10,11-15
; --------------------------------------------------------------------------
; part of stage 5
;step2[8] = step1b[8][i] + step1b[11][i];
;step2[9] = step1b[9][i] + step1b[10][i];
;step2[10] = step1b[9][i] - step1b[10][i];
    vadd.s16 q8, q0, q5
    vadd.s16 q9, q1, q7
    vsub.s16 q13, q1, q7
;step2[13] = step1b[14][i] - step1b[13][i];
;step2[14] = step1b[14][i] + step1b[13][i];
;step2[15] = step1b[15][i] + step1b[12][i];
    vsub.s16 q14, q3, q4
    vadd.s16 q10, q3, q4
    vadd.s16 q15, q2, q6
    STORE_IN_OUTPUT 26, 8, 15, q8, q15
    STORE_IN_OUTPUT 15, 9, 14, q9, q10
; --------------------------------------------------------------------------
; part of stage 6
;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
;step3[10] = dct_const_round_shift(temp1);
;step3[13] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 14, 13, 10, q3, q1
; --------------------------------------------------------------------------
; part of stage 5
;step2[11] = step1b[8][i] - step1b[11][i];
;step2[12] = step1b[15][i] - step1b[12][i];
    vsub.s16 q13, q0, q5
    vsub.s16 q14, q2, q6
; --------------------------------------------------------------------------
; part of stage 6
;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
;step3[11] = dct_const_round_shift(temp1);
;step3[12] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 10, 11, 12, q1, q3
; --------------------------------------------------------------------------


; --------------------------------------------------------------------------
; BLOCK D: 0-3,4-7
; --------------------------------------------------------------------------
; generate 4,5,6,7
; --------------------------------------------------------------------------
; part of stage 3
;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
;step3[4] = dct_const_round_shift(temp1);
;step3[7] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 6, 4, 28
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
; --------------------------------------------------------------------------
; part of stage 3
;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
;step3[5] = dct_const_round_shift(temp1);
;step3[6] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 28, 20, 12
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
; --------------------------------------------------------------------------
; part of stage 4
;step1[4] = step1b[4][i] + step1b[5][i];
;step1[5] = step1b[4][i] - step1b[5][i];
;step1[6] = step1b[7][i] - step1b[6][i];
;step1[7] = step1b[7][i] + step1b[6][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
; --------------------------------------------------------------------------
; part of stage 5
;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
;step2[5] = dct_const_round_shift(temp1);
;step2[6] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
; --------------------------------------------------------------------------
; generate 0,1,2,3
; --------------------------------------------------------------------------
; part of stage 4
;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
;step1[1] = dct_const_round_shift(temp1);
;step1[0] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 12, 0, 16
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
; --------------------------------------------------------------------------
; part of stage 4
;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
;step1[2] = dct_const_round_shift(temp1);
;step1[3] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 16, 8, 24
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
; --------------------------------------------------------------------------
; part of stage 5
;step2[0] = step1b[0][i] + step1b[3][i];
;step2[1] = step1b[1][i] + step1b[2][i];
;step2[2] = step1b[1][i] - step1b[2][i];
;step2[3] = step1b[0][i] - step1b[3][i];
    vadd.s16 q4, q7, q6
    vsub.s16 q7, q7, q6
    vsub.s16 q6, q5, q14
    vadd.s16 q5, q5, q14
; --------------------------------------------------------------------------
; combine 0-3,4-7
; --------------------------------------------------------------------------
; part of stage 6
;step3[0] = step1b[0][i] + step1b[7][i];
;step3[1] = step1b[1][i] + step1b[6][i];
;step3[2] = step1b[2][i] + step1b[5][i];
;step3[3] = step1b[3][i] + step1b[4][i];
    vadd.s16 q8, q4, q2
    vadd.s16 q9, q5, q3
    vadd.s16 q10, q6, q1
    vadd.s16 q11, q7, q0
;step3[4] = step1b[3][i] - step1b[4][i];
;step3[5] = step1b[2][i] - step1b[5][i];
;step3[6] = step1b[1][i] - step1b[6][i];
;step3[7] = step1b[0][i] - step1b[7][i];
    vsub.s16 q12, q7, q0
    vsub.s16 q13, q6, q1
    vsub.s16 q14, q5, q3
    vsub.s16 q15, q4, q2
; --------------------------------------------------------------------------
; part of stage 7
;step1[0] = step1b[0][i] + step1b[15][i];
;step1[1] = step1b[1][i] + step1b[14][i];
;step1[14] = step1b[1][i] - step1b[14][i];
;step1[15] = step1b[0][i] - step1b[15][i];
    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
    vadd.s16 q2, q8, q1
    vadd.s16 q3, q9, q0
    vsub.s16 q4, q9, q0
    vsub.s16 q5, q8, q1
; --------------------------------------------------------------------------
; part of final stage
;output[14 * 32] = step1b[14][i] + step1b[17][i];
;output[15 * 32] = step1b[15][i] + step1b[16][i];
;output[16 * 32] = step1b[15][i] - step1b[16][i];
;output[17 * 32] = step1b[14][i] - step1b[17][i];
    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1

    cmp r5, #0
    bgt idct32_bands_end_2nd_pass

idct32_bands_end_1st_pass
    STORE_IN_OUTPUT 17, 16, 17, q6, q7
    STORE_IN_OUTPUT 17, 14, 15, q8, q9
; --------------------------------------------------------------------------
; part of final stage
;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
;output[30 * 32] = step1b[1][i] - step1b[30][i];
;output[31 * 32] = step1b[0][i] - step1b[31][i];
    LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 31, 30, 31, q6, q7
    STORE_IN_OUTPUT 31, 0, 1, q4, q5
; --------------------------------------------------------------------------
; part of stage 7
;step1[2] = step1b[2][i] + step1b[13][i];
;step1[3] = step1b[3][i] + step1b[12][i];
;step1[12] = step1b[3][i] - step1b[12][i];
;step1[13] = step1b[2][i] - step1b[13][i];
    LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
    vadd.s16 q2, q10, q1
    vadd.s16 q3, q11, q0
    vsub.s16 q4, q11, q0
    vsub.s16 q5, q10, q1
; --------------------------------------------------------------------------
; part of final stage
;output[12 * 32] = step1b[12][i] + step1b[19][i];
;output[13 * 32] = step1b[13][i] + step1b[18][i];
;output[18 * 32] = step1b[13][i] - step1b[18][i];
;output[19 * 32] = step1b[12][i] - step1b[19][i];
    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_IN_OUTPUT 19, 18, 19, q6, q7
    STORE_IN_OUTPUT 19, 12, 13, q8, q9
; --------------------------------------------------------------------------
; part of final stage
;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
;output[28 * 32] = step1b[3][i] - step1b[28][i];
;output[29 * 32] = step1b[2][i] - step1b[29][i];
    LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 29, 28, 29, q6, q7
    STORE_IN_OUTPUT 29, 2, 3, q4, q5
; --------------------------------------------------------------------------
; part of stage 7
;step1[4] = step1b[4][i] + step1b[11][i];
;step1[5] = step1b[5][i] + step1b[10][i];
;step1[10] = step1b[5][i] - step1b[10][i];
;step1[11] = step1b[4][i] - step1b[11][i];
    LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
    vadd.s16 q2, q12, q1
    vadd.s16 q3, q13, q0
    vsub.s16 q4, q13, q0
    vsub.s16 q5, q12, q1
; --------------------------------------------------------------------------
; part of final stage
;output[10 * 32] = step1b[10][i] + step1b[21][i];
;output[11 * 32] = step1b[11][i] + step1b[20][i];
;output[20 * 32] = step1b[11][i] - step1b[20][i];
;output[21 * 32] = step1b[10][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_IN_OUTPUT 21, 20, 21, q6, q7
    STORE_IN_OUTPUT 21, 10, 11, q8, q9
; --------------------------------------------------------------------------
; part of final stage
;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
;output[26 * 32] = step1b[5][i] - step1b[26][i];
;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 27, 26, 27, q6, q7
    STORE_IN_OUTPUT 27, 4, 5, q4, q5
; --------------------------------------------------------------------------
; part of stage 7
;step1[6] = step1b[6][i] + step1b[9][i];
;step1[7] = step1b[7][i] + step1b[8][i];
;step1[8] = step1b[7][i] - step1b[8][i];
;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
    vadd.s16 q2, q14, q1
    vadd.s16 q3, q15, q0
    vsub.s16 q4, q15, q0
    vsub.s16 q5, q14, q1
; --------------------------------------------------------------------------
; part of final stage
;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
;output[22 * 32] = step1b[9][i] - step1b[22][i];
;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_IN_OUTPUT 23, 22, 23, q6, q7
    STORE_IN_OUTPUT 23, 8, 9, q8, q9
; --------------------------------------------------------------------------
; part of final stage
;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
;output[24 * 32] = step1b[7][i] - step1b[24][i];
;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 25, 24, 25, q6, q7
    STORE_IN_OUTPUT 25, 6, 7, q4, q5

    ; restore r0 by removing the last offset from the last
    ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ; operation (STORE_IN_OUTPUT 25, 6, 7) => 7*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #7*32*2 - 8*2
    ; advance by 8 lines (8*32*2)
    ; go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2

    ; bands loop processing
    subs r4, r4, #1
    bne idct32_bands_loop

    ; parameters for second pass
    ; the input of pass2 is the result of pass1. we have to remove the
    ; offset of 32 columns induced by the above idct32_bands_loop
    sub r3, r1, #32*2
    ; r1 = pass2[32 * 32]
    add r1, sp, #2048

    ; pass loop processing
    add r5, r5, #1
    b idct32_pass_loop

idct32_bands_end_2nd_pass
    STORE_COMBINE_CENTER_RESULTS
; --------------------------------------------------------------------------
; part of final stage
;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
;output[30 * 32] = step1b[1][i] - step1b[30][i];
;output[31 * 32] = step1b[0][i] - step1b[31][i];
    LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
; --------------------------------------------------------------------------
; part of stage 7
;step1[2] = step1b[2][i] + step1b[13][i];
;step1[3] = step1b[3][i] + step1b[12][i];
;step1[12] = step1b[3][i] - step1b[12][i];
;step1[13] = step1b[2][i] - step1b[13][i];
    LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
    vadd.s16 q2, q10, q1
    vadd.s16 q3, q11, q0
    vsub.s16 q4, q11, q0
    vsub.s16 q5, q10, q1
; --------------------------------------------------------------------------
; part of final stage
;output[12 * 32] = step1b[12][i] + step1b[19][i];
;output[13 * 32] = step1b[13][i] + step1b[18][i];
;output[18 * 32] = step1b[13][i] - step1b[18][i];
;output[19 * 32] = step1b[12][i] - step1b[19][i];
    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
; --------------------------------------------------------------------------
; part of final stage
;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
;output[28 * 32] = step1b[3][i] - step1b[28][i];
;output[29 * 32] = step1b[2][i] - step1b[29][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
; --------------------------------------------------------------------------
; part of stage 7
;step1[4] = step1b[4][i] + step1b[11][i];
;step1[5] = step1b[5][i] + step1b[10][i];
;step1[10] = step1b[5][i] - step1b[10][i];
;step1[11] = step1b[4][i] - step1b[11][i];
    LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
    vadd.s16 q2, q12, q1
    vadd.s16 q3, q13, q0
    vsub.s16 q4, q13, q0
    vsub.s16 q5, q12, q1
; --------------------------------------------------------------------------
; part of final stage
;output[10 * 32] = step1b[10][i] + step1b[21][i];
;output[11 * 32] = step1b[11][i] + step1b[20][i];
;output[20 * 32] = step1b[11][i] - step1b[20][i];
;output[21 * 32] = step1b[10][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
; --------------------------------------------------------------------------
; part of final stage
;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
;output[26 * 32] = step1b[5][i] - step1b[26][i];
;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
; --------------------------------------------------------------------------
; part of stage 7
;step1[6] = step1b[6][i] + step1b[9][i];
;step1[7] = step1b[7][i] + step1b[8][i];
;step1[8] = step1b[7][i] - step1b[8][i];
;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
    vadd.s16 q2, q14, q1
    vadd.s16 q3, q15, q0
    vsub.s16 q4, q15, q0
    vsub.s16 q5, q14, q1
; --------------------------------------------------------------------------
; part of final stage
;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
;output[22 * 32] = step1b[9][i] - step1b[22][i];
;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS_LAST
; --------------------------------------------------------------------------
; part of final stage
;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
;output[24 * 32] = step1b[7][i] - step1b[24][i];
;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS_LAST
; --------------------------------------------------------------------------
    ; restore pointers to their initial indices for next band pass by
    ; removing/adding dest_stride * 8. The actual increment by eight
    ; is taken care of within the _LAST macros.
    add r6, r6, r2, lsl #3
    add r9, r9, r2, lsl #3
    sub r7, r7, r2, lsl #3
    sub r10, r10, r2, lsl #3

    ; restore r0 by removing the last offset from the last
    ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #25*32*2 - 8*2
    ; advance by 8 lines (8*32*2)
    ; go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2

    ; bands loop processing
    subs r4, r4, #1
    bne idct32_bands_loop

    ; stack operation
    add sp, sp, #512+2048+2048
    vpop {d8-d15}
    pop {r4-r11}
    bx lr
    ENDP  ; |vp9_idct32x32_1024_add_neon|
    END