/*
 * MIPS DSPr2 optimizations for libjpeg-turbo
 *
 * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
 * All rights reserved.
 * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
 *           Darko Laus       (darko.laus@imgtec.com)
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
23 */ 24 25#include "jsimd_mips_dspr2_asm.h" 26 27/*****************************************************************************/ 28LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2) 29/* 30 * a0 - cinfo->image_width 31 * a1 - input_buf 32 * a2 - output_buf 33 * a3 - output_row 34 * 16(sp) - num_rows 35 * 20(sp) - cinfo->num_components 36 * 37 * Null conversion for compression 38 */ 39 40 SAVE_REGS_ON_STACK 8, s0, s1 41 42 lw t9, 24(sp) // t9 = num_rows 43 lw s0, 28(sp) // s0 = cinfo->num_components 44 andi t0, a0, 3 // t0 = cinfo->image_width & 3 45 beqz t0, 4f // no residual 46 nop 470: 48 addiu t9, t9, -1 49 bltz t9, 7f 50 li t1, 0 511: 52 sll t3, t1, 2 53 lwx t5, t3(a2) // t5 = outptr = output_buf[ci] 54 lw t2, 0(a1) // t2 = inptr = *input_buf 55 sll t4, a3, 2 56 lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row] 57 addu t2, t2, t1 58 addu s1, t5, a0 59 addu t6, t5, t0 602: 61 lbu t3, 0(t2) 62 addiu t5, t5, 1 63 sb t3, -1(t5) 64 bne t6, t5, 2b 65 addu t2, t2, s0 663: 67 lbu t3, 0(t2) 68 addu t4, t2, s0 69 addu t7, t4, s0 70 addu t8, t7, s0 71 addu t2, t8, s0 72 lbu t4, 0(t4) 73 lbu t7, 0(t7) 74 lbu t8, 0(t8) 75 addiu t5, t5, 4 76 sb t3, -4(t5) 77 sb t4, -3(t5) 78 sb t7, -2(t5) 79 bne s1, t5, 3b 80 sb t8, -1(t5) 81 addiu t1, t1, 1 82 bne t1, s0, 1b 83 nop 84 addiu a1, a1, 4 85 bgez t9, 0b 86 addiu a3, a3, 1 87 b 7f 88 nop 894: 90 addiu t9, t9, -1 91 bltz t9, 7f 92 li t1, 0 935: 94 sll t3, t1, 2 95 lwx t5, t3(a2) // t5 = outptr = output_buf[ci] 96 lw t2, 0(a1) // t2 = inptr = *input_buf 97 sll t4, a3, 2 98 lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row] 99 addu t2, t2, t1 100 addu s1, t5, a0 101 addu t6, t5, t0 1026: 103 lbu t3, 0(t2) 104 addu t4, t2, s0 105 addu t7, t4, s0 106 addu t8, t7, s0 107 addu t2, t8, s0 108 lbu t4, 0(t4) 109 lbu t7, 0(t7) 110 lbu t8, 0(t8) 111 addiu t5, t5, 4 112 sb t3, -4(t5) 113 sb t4, -3(t5) 114 sb t7, -2(t5) 115 bne s1, t5, 6b 116 sb t8, -1(t5) 117 addiu t1, t1, 1 118 bne t1, s0, 5b 119 nop 120 addiu a1, a1, 4 121 
bgez t9, 4b 122 addiu a3, a3, 1 1237: 124 RESTORE_REGS_FROM_STACK 8, s0, s1 125 126 j ra 127 nop 128 129END(jsimd_c_null_convert_mips_dspr2) 130 131/*****************************************************************************/ 132/* 133 * jsimd_extrgb_ycc_convert_mips_dspr2 134 * jsimd_extbgr_ycc_convert_mips_dspr2 135 * jsimd_extrgbx_ycc_convert_mips_dspr2 136 * jsimd_extbgrx_ycc_convert_mips_dspr2 137 * jsimd_extxbgr_ycc_convert_mips_dspr2 138 * jsimd_extxrgb_ycc_convert_mips_dspr2 139 * 140 * Colorspace conversion RGB -> YCbCr 141 */ 142 143.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs 144 145.macro DO_RGB_TO_YCC r, \ 146 g, \ 147 b, \ 148 inptr 149 lbu \r, \r_offs(\inptr) 150 lbu \g, \g_offs(\inptr) 151 lbu \b, \b_offs(\inptr) 152 addiu \inptr, \pixel_size 153.endm 154 155LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2) 156/* 157 * a0 - cinfo->image_width 158 * a1 - input_buf 159 * a2 - output_buf 160 * a3 - output_row 161 * 16(sp) - num_rows 162 */ 163 164 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 165 166 lw t7, 48(sp) // t7 = num_rows 167 li s0, 0x4c8b // FIX(0.29900) 168 li s1, 0x9646 // FIX(0.58700) 169 li s2, 0x1d2f // FIX(0.11400) 170 li s3, 0xffffd4cd // -FIX(0.16874) 171 li s4, 0xffffab33 // -FIX(0.33126) 172 li s5, 0x8000 // FIX(0.50000) 173 li s6, 0xffff94d1 // -FIX(0.41869) 174 li s7, 0xffffeb2f // -FIX(0.08131) 175 li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1 176 1770: 178 addiu t7, -1 // --num_rows 179 lw t6, 0(a1) // t6 = input_buf[0] 180 lw t0, 0(a2) 181 lw t1, 4(a2) 182 lw t2, 8(a2) 183 sll t3, a3, 2 184 lwx t0, t3(t0) // t0 = output_buf[0][output_row] 185 lwx t1, t3(t1) // t1 = output_buf[1][output_row] 186 lwx t2, t3(t2) // t2 = output_buf[2][output_row] 187 188 addu t9, t2, a0 // t9 = end address 189 addiu a3, 1 190 1911: 192 DO_RGB_TO_YCC t3, t4, t5, t6 193 194 mtlo s5, $ac0 195 mtlo t8, $ac1 196 mtlo t8, $ac2 197 maddu $ac0, s2, t5 198 maddu $ac1, s5, t5 199 maddu $ac2, s5, 
t3 200 maddu $ac0, s0, t3 201 maddu $ac1, s3, t3 202 maddu $ac2, s6, t4 203 maddu $ac0, s1, t4 204 maddu $ac1, s4, t4 205 maddu $ac2, s7, t5 206 extr.w t3, $ac0, 16 207 extr.w t4, $ac1, 16 208 extr.w t5, $ac2, 16 209 sb t3, 0(t0) 210 sb t4, 0(t1) 211 sb t5, 0(t2) 212 addiu t0, 1 213 addiu t2, 1 214 bne t2, t9, 1b 215 addiu t1, 1 216 bgtz t7, 0b 217 addiu a1, 4 218 219 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 220 221 j ra 222 nop 223END(jsimd_\colorid\()_ycc_convert_mips_dspr2) 224 225.purgem DO_RGB_TO_YCC 226 227.endm 228 229/*------------------------------------------id -- pix R G B */ 230GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2 231GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0 232GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2 233GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0 234GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1 235GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3 236 237/*****************************************************************************/ 238/* 239 * jsimd_ycc_extrgb_convert_mips_dspr2 240 * jsimd_ycc_extbgr_convert_mips_dspr2 241 * jsimd_ycc_extrgbx_convert_mips_dspr2 242 * jsimd_ycc_extbgrx_convert_mips_dspr2 243 * jsimd_ycc_extxbgr_convert_mips_dspr2 244 * jsimd_ycc_extxrgb_convert_mips_dspr2 245 * 246 * Colorspace conversion YCbCr -> RGB 247 */ 248 249.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs 250 251.macro STORE_YCC_TO_RGB scratch0 \ 252 scratch1 \ 253 scratch2 \ 254 outptr 255 sb \scratch0, \r_offs(\outptr) 256 sb \scratch1, \g_offs(\outptr) 257 sb \scratch2, \b_offs(\outptr) 258.if (\pixel_size == 4) 259 li t0, 0xFF 260 sb t0, \a_offs(\outptr) 261.endif 262 addiu \outptr, \pixel_size 263.endm 264 265LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2) 266/* 267 * a0 - cinfo->image_width 268 * a1 - input_buf 269 * a2 - input_row 270 * a3 - output_buf 271 * 16(sp) - 
num_rows 272 */ 273 274 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 275 276 lw s1, 48(sp) 277 li t3, 0x8000 278 li t4, 0x166e9 // FIX(1.40200) 279 li t5, 0x1c5a2 // FIX(1.77200) 280 li t6, 0xffff492e // -FIX(0.71414) 281 li t7, 0xffffa7e6 // -FIX(0.34414) 282 repl.ph t8, 128 283 2840: 285 lw s0, 0(a3) 286 lw t0, 0(a1) 287 lw t1, 4(a1) 288 lw t2, 8(a1) 289 sll s5, a2, 2 290 addiu s1, -1 291 lwx s2, s5(t0) 292 lwx s3, s5(t1) 293 lwx s4, s5(t2) 294 addu t9, s2, a0 295 addiu a2, 1 296 2971: 298 lbu s7, 0(s4) // cr 299 lbu s6, 0(s3) // cb 300 lbu s5, 0(s2) // y 301 addiu s2, 1 302 addiu s4, 1 303 addiu s7, -128 304 addiu s6, -128 305 mul t2, t7, s6 306 mul t0, t6, s7 // Crgtab[cr] 307 sll s7, 15 308 mulq_rs.w t1, t4, s7 // Crrtab[cr] 309 sll s6, 15 310 addu t2, t3 // Cbgtab[cb] 311 addu t2, t0 312 313 mulq_rs.w t0, t5, s6 // Cbbtab[cb] 314 sra t2, 16 315 addu t1, s5 316 addu t2, s5 // add y 317 ins t2, t1, 16, 16 318 subu.ph t2, t2, t8 319 addu t0, s5 320 shll_s.ph t2, t2, 8 321 subu t0, 128 322 shra.ph t2, t2, 8 323 shll_s.w t0, t0, 24 324 addu.ph t2, t2, t8 // clip & store 325 sra t0, t0, 24 326 sra t1, t2, 16 327 addiu t0, 128 328 329 STORE_YCC_TO_RGB t1, t2, t0, s0 330 331 bne s2, t9, 1b 332 addiu s3, 1 333 bgtz s1, 0b 334 addiu a3, 4 335 336 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 337 338 j ra 339 nop 340END(jsimd_ycc_\colorid\()_convert_mips_dspr2) 341 342.purgem STORE_YCC_TO_RGB 343 344.endm 345 346/*------------------------------------------id -- pix R G B A */ 347GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2, 3 348GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0, 3 349GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3 350GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3 351GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0 352GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0 353 
354/*****************************************************************************/ 355/* 356 * jsimd_extrgb_gray_convert_mips_dspr2 357 * jsimd_extbgr_gray_convert_mips_dspr2 358 * jsimd_extrgbx_gray_convert_mips_dspr2 359 * jsimd_extbgrx_gray_convert_mips_dspr2 360 * jsimd_extxbgr_gray_convert_mips_dspr2 361 * jsimd_extxrgb_gray_convert_mips_dspr2 362 * 363 * Colorspace conversion RGB -> GRAY 364 */ 365 366.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs 367 368.macro DO_RGB_TO_GRAY r, \ 369 g, \ 370 b, \ 371 inptr 372 lbu \r, \r_offs(\inptr) 373 lbu \g, \g_offs(\inptr) 374 lbu \b, \b_offs(\inptr) 375 addiu \inptr, \pixel_size 376.endm 377 378LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2) 379/* 380 * a0 - cinfo->image_width 381 * a1 - input_buf 382 * a2 - output_buf 383 * a3 - output_row 384 * 16(sp) - num_rows 385 */ 386 387 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 388 389 li s0, 0x4c8b // s0 = FIX(0.29900) 390 li s1, 0x9646 // s1 = FIX(0.58700) 391 li s2, 0x1d2f // s2 = FIX(0.11400) 392 li s7, 0x8000 // s7 = FIX(0.50000) 393 lw s6, 48(sp) 394 andi t7, a0, 3 395 3960: 397 addiu s6, -1 // s6 = num_rows 398 lw t0, 0(a1) 399 lw t1, 0(a2) 400 sll t3, a3, 2 401 lwx t1, t3(t1) 402 addiu a3, 1 403 addu t9, t1, a0 404 subu t8, t9, t7 405 beq t1, t8, 2f 406 nop 407 4081: 409 DO_RGB_TO_GRAY t3, t4, t5, t0 410 DO_RGB_TO_GRAY s3, s4, s5, t0 411 412 mtlo s7, $ac0 413 maddu $ac0, s2, t5 414 maddu $ac0, s1, t4 415 maddu $ac0, s0, t3 416 mtlo s7, $ac1 417 maddu $ac1, s2, s5 418 maddu $ac1, s1, s4 419 maddu $ac1, s0, s3 420 extr.w t6, $ac0, 16 421 422 DO_RGB_TO_GRAY t3, t4, t5, t0 423 DO_RGB_TO_GRAY s3, s4, s5, t0 424 425 mtlo s7, $ac0 426 maddu $ac0, s2, t5 427 maddu $ac0, s1, t4 428 extr.w t2, $ac1, 16 429 maddu $ac0, s0, t3 430 mtlo s7, $ac1 431 maddu $ac1, s2, s5 432 maddu $ac1, s1, s4 433 maddu $ac1, s0, s3 434 extr.w t5, $ac0, 16 435 sb t6, 0(t1) 436 sb t2, 1(t1) 437 extr.w t3, $ac1, 16 438 addiu t1, 4 
439 sb t5, -2(t1) 440 sb t3, -1(t1) 441 bne t1, t8, 1b 442 nop 443 4442: 445 beqz t7, 4f 446 nop 447 4483: 449 DO_RGB_TO_GRAY t3, t4, t5, t0 450 451 mtlo s7, $ac0 452 maddu $ac0, s2, t5 453 maddu $ac0, s1, t4 454 maddu $ac0, s0, t3 455 extr.w t6, $ac0, 16 456 sb t6, 0(t1) 457 addiu t1, 1 458 bne t1, t9, 3b 459 nop 460 4614: 462 bgtz s6, 0b 463 addiu a1, 4 464 465 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 466 467 j ra 468 nop 469END(jsimd_\colorid\()_gray_convert_mips_dspr2) 470 471.purgem DO_RGB_TO_GRAY 472 473.endm 474 475/*------------------------------------------id -- pix R G B */ 476GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2 477GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0 478GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2 479GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0 480GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1 481GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3 482/*****************************************************************************/ 483/* 484 * jsimd_h2v2_merged_upsample_mips_dspr2 485 * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2 486 * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2 487 * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2 488 * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2 489 * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2 490 * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2 491 * 492 * Merged h2v2 upsample routines 493 */ 494.macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \ 495 pixel_size, \ 496 r1_offs, \ 497 g1_offs, \ 498 b1_offs, \ 499 a1_offs, \ 500 r2_offs, \ 501 g2_offs, \ 502 b2_offs, \ 503 a2_offs 504 505.macro STORE_H2V2_2_PIXELS scratch0 \ 506 scratch1 \ 507 scratch2 \ 508 scratch3 \ 509 scratch4 \ 510 scratch5 \ 511 outptr 512 sb \scratch0, \r1_offs(\outptr) 513 sb \scratch1, \g1_offs(\outptr) 514 sb \scratch2, \b1_offs(\outptr) 515 sb \scratch3, \r2_offs(\outptr) 516 sb \scratch4, 
\g2_offs(\outptr) 517 sb \scratch5, \b2_offs(\outptr) 518.if (\pixel_size == 8) 519 li \scratch0, 0xFF 520 sb \scratch0, \a1_offs(\outptr) 521 sb \scratch0, \a2_offs(\outptr) 522.endif 523 addiu \outptr, \pixel_size 524.endm 525 526.macro STORE_H2V2_1_PIXEL scratch0 \ 527 scratch1 \ 528 scratch2 \ 529 outptr 530 sb \scratch0, \r1_offs(\outptr) 531 sb \scratch1, \g1_offs(\outptr) 532 sb \scratch2, \b1_offs(\outptr) 533 534.if (\pixel_size == 8) 535 li t0, 0xFF 536 sb t0, \a1_offs(\outptr) 537.endif 538.endm 539 540LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2) 541/* 542 * a0 - cinfo->output_width 543 * a1 - input_buf 544 * a2 - in_row_group_ctr 545 * a3 - output_buf 546 * 16(sp) - cinfo->sample_range_limit 547 */ 548 549 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra 550 551 lw t9, 56(sp) // cinfo->sample_range_limit 552 lw v0, 0(a1) 553 lw v1, 4(a1) 554 lw t0, 8(a1) 555 sll t1, a2, 3 556 addiu t2, t1, 4 557 sll t3, a2, 2 558 lw t4, 0(a3) // t4 = output_buf[0] 559 lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2] 560 lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1] 561 lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr] 562 lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr] 563 lw t7, 4(a3) // t7 = output_buf[1] 564 li s1, 0xe6ea 565 addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)] 566 addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)] 567 addiu s1, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)] 568 xori s2, s1, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)] 569 srl t3, a0, 1 570 blez t3, 2f 571 addu t0, t5, t3 // t0 = end address 572 1: 573 lbu t3, 0(t5) 574 lbu s3, 0(t6) 575 addiu t5, t5, 1 576 addiu t3, t3, -128 // (cb - 128) 577 addiu s3, s3, -128 // (cr - 128) 578 mult $ac1, s1, t3 579 madd $ac1, s2, s3 580 sll s3, s3, 15 581 sll t3, t3, 15 582 mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS 583 extr_r.w s5, $ac1, 16 584 mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS 585 lbu v0, 
0(t1) 586 addiu t6, t6, 1 587 addiu t1, t1, 2 588 addu t3, v0, s4 // y+cred 589 addu s3, v0, s5 // y+cgreen 590 addu v1, v0, s6 // y+cblue 591 addu t3, t9, t3 // y+cred 592 addu s3, t9, s3 // y+cgreen 593 addu v1, t9, v1 // y+cblue 594 lbu AT, 0(t3) 595 lbu s7, 0(s3) 596 lbu ra, 0(v1) 597 lbu v0, -1(t1) 598 addu t3, v0, s4 // y+cred 599 addu s3, v0, s5 // y+cgreen 600 addu v1, v0, s6 // y+cblue 601 addu t3, t9, t3 // y+cred 602 addu s3, t9, s3 // y+cgreen 603 addu v1, t9, v1 // y+cblue 604 lbu t3, 0(t3) 605 lbu s3, 0(s3) 606 lbu v1, 0(v1) 607 lbu v0, 0(t2) 608 609 STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4 610 611 addu t3, v0, s4 // y+cred 612 addu s3, v0, s5 // y+cgreen 613 addu v1, v0, s6 // y+cblue 614 addu t3, t9, t3 // y+cred 615 addu s3, t9, s3 // y+cgreen 616 addu v1, t9, v1 // y+cblue 617 lbu AT, 0(t3) 618 lbu s7, 0(s3) 619 lbu ra, 0(v1) 620 lbu v0, 1(t2) 621 addiu t2, t2, 2 622 addu t3, v0, s4 // y+cred 623 addu s3, v0, s5 // y+cgreen 624 addu v1, v0, s6 // y+cblue 625 addu t3, t9, t3 // y+cred 626 addu s3, t9, s3 // y+cgreen 627 addu v1, t9, v1 // y+cblue 628 lbu t3, 0(t3) 629 lbu s3, 0(s3) 630 lbu v1, 0(v1) 631 632 STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7 633 634 bne t0, t5, 1b 635 nop 6362: 637 andi t0, a0, 1 638 beqz t0, 4f 639 lbu t3, 0(t5) 640 lbu s3, 0(t6) 641 addiu t3, t3, -128 // (cb - 128) 642 addiu s3, s3, -128 // (cr - 128) 643 mult $ac1, s1, t3 644 madd $ac1, s2, s3 645 sll s3, s3, 15 646 sll t3, t3, 15 647 lbu v0, 0(t1) 648 extr_r.w s5, $ac1, 16 649 mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS 650 mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS 651 addu t3, v0, s4 // y+cred 652 addu s3, v0, s5 // y+cgreen 653 addu v1, v0, s6 // y+cblue 654 addu t3, t9, t3 // y+cred 655 addu s3, t9, s3 // y+cgreen 656 addu v1, t9, v1 // y+cblue 657 lbu t3, 0(t3) 658 lbu s3, 0(s3) 659 lbu v1, 0(v1) 660 lbu v0, 0(t2) 661 662 STORE_H2V2_1_PIXEL t3, s3, v1, t4 663 664 addu t3, v0, s4 // y+cred 665 addu s3, v0, s5 // 
y+cgreen 666 addu v1, v0, s6 // y+cblue 667 addu t3, t9, t3 // y+cred 668 addu s3, t9, s3 // y+cgreen 669 addu v1, t9, v1 // y+cblue 670 lbu t3, 0(t3) 671 lbu s3, 0(s3) 672 lbu v1, 0(v1) 673 674 STORE_H2V2_1_PIXEL t3, s3, v1, t7 6754: 676 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra 677 678 j ra 679 nop 680 681END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2) 682 683.purgem STORE_H2V2_1_PIXEL 684.purgem STORE_H2V2_2_PIXELS 685.endm 686 687/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ 688GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 689GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 690GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 691GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 692GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 693GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 694/*****************************************************************************/ 695/* 696 * jsimd_h2v1_merged_upsample_mips_dspr2 697 * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2 698 * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2 699 * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2 700 * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2 701 * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2 702 * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2 703 * 704 * Merged h2v1 upsample routines 705 */ 706 707.macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \ 708 pixel_size, \ 709 r1_offs, \ 710 g1_offs, \ 711 b1_offs, \ 712 a1_offs, \ 713 r2_offs, \ 714 g2_offs, \ 715 b2_offs, \ 716 a2_offs 717 718.macro STORE_H2V1_2_PIXELS scratch0 \ 719 scratch1 \ 720 scratch2 \ 721 scratch3 \ 722 scratch4 \ 723 scratch5 \ 724 outptr 725 sb \scratch0, \r1_offs(\outptr) 726 sb \scratch1, \g1_offs(\outptr) 727 sb \scratch2, \b1_offs(\outptr) 728 sb \scratch3, \r2_offs(\outptr) 729 sb 
\scratch4, \g2_offs(\outptr) 730 sb \scratch5, \b2_offs(\outptr) 731.if (\pixel_size == 8) 732 li t0, 0xFF 733 sb t0, \a1_offs(\outptr) 734 sb t0, \a2_offs(\outptr) 735.endif 736 addiu \outptr, \pixel_size 737.endm 738 739.macro STORE_H2V1_1_PIXEL scratch0 \ 740 scratch1 \ 741 scratch2 \ 742 outptr 743 sb \scratch0, \r1_offs(\outptr) 744 sb \scratch1, \g1_offs(\outptr) 745 sb \scratch2, \b1_offs(\outptr) 746.if (\pixel_size == 8) 747 li t0, 0xFF 748 sb t0, \a1_offs(\outptr) 749.endif 750.endm 751 752LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2) 753/* 754 * a0 - cinfo->output_width 755 * a1 - input_buf 756 * a2 - in_row_group_ctr 757 * a3 - output_buf 758 * 16(sp) - range_limit 759 */ 760 761 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra 762 763 li t0, 0xe6ea 764 lw t1, 0(a1) // t1 = input_buf[0] 765 lw t2, 4(a1) // t2 = input_buf[1] 766 lw t3, 8(a1) // t3 = input_buf[2] 767 lw t8, 56(sp) // t8 = range_limit 768 addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)] 769 addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)] 770 addiu s0, t0, 0x9916 // s0 = 0x8000 771 addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)] 772 xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)] 773 srl t0, a0, 1 774 sll t4, a2, 2 775 lwx s5, t4(t1) // s5 = inptr0 776 lwx s6, t4(t2) // s6 = inptr1 777 lwx s7, t4(t3) // s7 = inptr2 778 lw t7, 0(a3) // t7 = outptr 779 blez t0, 2f 780 addu t9, s6, t0 // t9 = end address 7811: 782 lbu t2, 0(s6) // t2 = cb 783 lbu t0, 0(s7) // t0 = cr 784 lbu t1, 0(s5) // t1 = y 785 addiu t2, t2, -128 // t2 = cb - 128 786 addiu t0, t0, -128 // t0 = cr - 128 787 mult $ac1, s4, t2 788 madd $ac1, s3, t0 789 sll t0, t0, 15 790 sll t2, t2, 15 791 mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS 792 extr_r.w t5, $ac1, 16 793 mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS 794 addiu s7, s7, 1 795 addiu s6, s6, 1 796 addu t2, t1, t0 // t2 = y + cred 797 addu t3, t1, t5 // t3 = y + cgreen 798 addu t4, t1, t6 // 
t4 = y + cblue 799 addu t2, t8, t2 800 addu t3, t8, t3 801 addu t4, t8, t4 802 lbu t1, 1(s5) 803 lbu v0, 0(t2) 804 lbu v1, 0(t3) 805 lbu ra, 0(t4) 806 addu t2, t1, t0 807 addu t3, t1, t5 808 addu t4, t1, t6 809 addu t2, t8, t2 810 addu t3, t8, t3 811 addu t4, t8, t4 812 lbu t2, 0(t2) 813 lbu t3, 0(t3) 814 lbu t4, 0(t4) 815 816 STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7 817 818 bne t9, s6, 1b 819 addiu s5, s5, 2 8202: 821 andi t0, a0, 1 822 beqz t0, 4f 823 nop 8243: 825 lbu t2, 0(s6) 826 lbu t0, 0(s7) 827 lbu t1, 0(s5) 828 addiu t2, t2, -128 //(cb - 128) 829 addiu t0, t0, -128 //(cr - 128) 830 mul t3, s4, t2 831 mul t4, s3, t0 832 sll t0, t0, 15 833 sll t2, t2, 15 834 mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS 835 mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS 836 addu t3, t3, s0 837 addu t3, t4, t3 838 sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS 839 addu t2, t1, t0 // y + cred 840 addu t3, t1, t5 // y + cgreen 841 addu t4, t1, t6 // y + cblue 842 addu t2, t8, t2 843 addu t3, t8, t3 844 addu t4, t8, t4 845 lbu t2, 0(t2) 846 lbu t3, 0(t3) 847 lbu t4, 0(t4) 848 849 STORE_H2V1_1_PIXEL t2, t3, t4, t7 8504: 851 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra 852 853 j ra 854 nop 855 856END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2) 857 858.purgem STORE_H2V1_1_PIXEL 859.purgem STORE_H2V1_2_PIXELS 860.endm 861 862/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ 863GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 864GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 865GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 866GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 867GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 868GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 
869/*****************************************************************************/ 870/* 871 * jsimd_h2v2_fancy_upsample_mips_dspr2 872 * 873 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. 874 */ 875LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2) 876/* 877 * a0 - cinfo->max_v_samp_factor 878 * a1 - downsampled_width 879 * a2 - input_data 880 * a3 - output_data_ptr 881 */ 882 883 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 884 885 li s4, 0 886 lw s2, 0(a3) // s2 = *output_data_ptr 8870: 888 li t9, 2 889 lw s1, -4(a2) // s1 = inptr1 890 8911: 892 lw s0, 0(a2) // s0 = inptr0 893 lwx s3, s4(s2) 894 addiu s5, a1, -2 // s5 = downsampled_width - 2 895 srl t4, s5, 1 896 sll t4, t4, 1 897 lbu t0, 0(s0) 898 lbu t1, 1(s0) 899 lbu t2, 0(s1) 900 lbu t3, 1(s1) 901 addiu s0, 2 902 addiu s1, 2 903 addu t8, s0, t4 // t8 = end address 904 andi s5, s5, 1 // s5 = residual 905 sll t4, t0, 1 906 sll t6, t1, 1 907 addu t0, t0, t4 // t0 = (*inptr0++) * 3 908 addu t1, t1, t6 // t1 = (*inptr0++) * 3 909 addu t7, t0, t2 // t7 = thiscolsum 910 addu t6, t1, t3 // t5 = nextcolsum 911 sll t0, t7, 2 // t0 = thiscolsum * 4 912 subu t1, t0, t7 // t1 = thiscolsum * 3 913 shra_r.w t0, t0, 4 914 addiu t1, 7 915 addu t1, t1, t6 916 srl t1, t1, 4 917 sb t0, 0(s3) 918 sb t1, 1(s3) 919 beq t8, s0, 22f // skip to final iteration if width == 3 920 addiu s3, 2 9212: 922 lh t0, 0(s0) // t0 = A3|A2 923 lh t2, 0(s1) // t2 = B3|B2 924 addiu s0, 2 925 addiu s1, 2 926 preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2 927 preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2 928 shll.ph t1, t0, 1 929 sll t3, t6, 1 930 addu.ph t0, t1, t0 // t0 = A3*3|A2*3 931 addu t3, t3, t6 // t3 = this * 3 932 addu.ph t0, t0, t2 // t0 = next2|next1 933 addu t1, t3, t7 934 andi t7, t0, 0xFFFF // t7 = next1 935 sll t2, t7, 1 936 addu t2, t7, t2 // t2 = next1*3 937 addu t4, t2, t6 938 srl t6, t0, 16 // t6 = next2 939 shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4 940 addu t0, t3, t7 941 addiu t0, 7 942 srl t0, t0, 4 // 
t0 = (this*3 + next1 + 7) >> 4 943 shra_r.w t4, t4, 4 // t3 = (next1*3 + this + 8) >> 4 944 addu t2, t2, t6 945 addiu t2, 7 946 srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4 947 sb t1, 0(s3) 948 sb t0, 1(s3) 949 sb t4, 2(s3) 950 sb t2, 3(s3) 951 bne t8, s0, 2b 952 addiu s3, 4 95322: 954 beqz s5, 4f 955 addu t8, s0, s5 9563: 957 lbu t0, 0(s0) 958 lbu t2, 0(s1) 959 addiu s0, 1 960 addiu s1, 1 961 sll t3, t6, 1 962 sll t1, t0, 1 963 addu t1, t0, t1 // t1 = inptr0 * 3 964 addu t3, t3, t6 // t3 = thiscolsum * 3 965 addu t5, t1, t2 966 addu t1, t3, t7 967 shra_r.w t1, t1, 4 968 addu t0, t3, t5 969 addiu t0, 7 970 srl t0, t0, 4 971 sb t1, 0(s3) 972 sb t0, 1(s3) 973 addiu s3, 2 974 move t7, t6 975 bne t8, s0, 3b 976 move t6, t5 9774: 978 sll t0, t6, 2 // t0 = thiscolsum * 4 979 subu t1, t0, t6 // t1 = thiscolsum * 3 980 addu t1, t1, t7 981 addiu s4, 4 982 shra_r.w t1, t1, 4 983 addiu t0, 7 984 srl t0, t0, 4 985 sb t1, 0(s3) 986 sb t0, 1(s3) 987 addiu t9, -1 988 addiu s3, 2 989 bnez t9, 1b 990 lw s1, 4(a2) 991 srl t0, s4, 2 992 subu t0, a0, t0 993 bgtz t0, 0b 994 addiu a2, 4 995 996 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 997 998 j ra 999 nop 1000END(jsimd_h2v2_fancy_upsample_mips_dspr2) 1001 1002/*****************************************************************************/ 1003LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2) 1004/* 1005 * a0 - cinfo->max_v_samp_factor 1006 * a1 - downsampled_width 1007 * a2 - input_data 1008 * a3 - output_data_ptr 1009 */ 1010 1011 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 1012 1013 .set at 1014 1015 beqz a0, 3f 1016 sll t0, a0, 2 1017 lw s1, 0(a3) 1018 li s3, 0x10001 1019 addu s0, s1, t0 10200: 1021 addiu t8, a1, -2 1022 srl t9, t8, 2 1023 lw t7, 0(a2) 1024 lw s2, 0(s1) 1025 lbu t0, 0(t7) 1026 lbu t1, 1(t7) // t1 = inptr[1] 1027 sll t2, t0, 1 1028 addu t2, t2, t0 // t2 = invalue*3 1029 addu t2, t2, t1 1030 shra_r.w t2, t2, 2 1031 sb t0, 0(s2) 1032 sb t2, 1(s2) 1033 beqz t9, 11f 1034 addiu s2, 2 10351: 1036 ulw t0, 0(t7) // 
t0 = |P3|P2|P1|P0| 1037 ulw t1, 1(t7) 1038 ulh t2, 4(t7) // t2 = |0|0|P5|P4| 1039 preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2| 1040 preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0| 1041 preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4| 1042 preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3| 1043 preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1| 1044 shll.ph t5, t4, 1 1045 shll.ph t6, t1, 1 1046 addu.ph t5, t5, t4 // t5 = |P4*3|P3*3| 1047 addu.ph t6, t6, t1 // t6 = |P2*3|P1*3| 1048 addu.ph t4, t3, s3 1049 addu.ph t0, t0, s3 1050 addu.ph t4, t4, t5 1051 addu.ph t0, t0, t6 1052 shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2| 1053 shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0| 1054 addu.ph t2, t2, t5 1055 addu.ph t3, t3, t6 1056 shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4| 1057 shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2| 1058 shll.ph t2, t2, 8 1059 shll.ph t3, t3, 8 1060 or t2, t4, t2 1061 or t3, t3, t0 1062 addiu t9, -1 1063 usw t3, 0(s2) 1064 usw t2, 4(s2) 1065 addiu s2, 8 1066 bgtz t9, 1b 1067 addiu t7, 4 106811: 1069 andi t8, 3 1070 beqz t8, 22f 1071 addiu t7, 1 1072 10732: 1074 lbu t0, 0(t7) 1075 addiu t7, 1 1076 sll t1, t0, 1 1077 addu t2, t0, t1 // t2 = invalue 1078 lbu t3, -2(t7) 1079 lbu t4, 0(t7) 1080 addiu t3, 1 1081 addiu t4, 2 1082 addu t3, t3, t2 1083 addu t4, t4, t2 1084 srl t3, 2 1085 srl t4, 2 1086 sb t3, 0(s2) 1087 sb t4, 1(s2) 1088 addiu t8, -1 1089 bgtz t8, 2b 1090 addiu s2, 2 1091 109222: 1093 lbu t0, 0(t7) 1094 lbu t2, -1(t7) 1095 sll t1, t0, 1 1096 addu t1, t1, t0 // t1 = invalue * 3 1097 addu t1, t1, t2 1098 addiu t1, 1 1099 srl t1, t1, 2 1100 sb t1, 0(s2) 1101 sb t0, 1(s2) 1102 addiu s1, 4 1103 bne s1, s0, 0b 1104 addiu a2, 4 11053: 1106 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 1107 1108 j ra 1109 nop 1110END(jsimd_h2v1_fancy_upsample_mips_dspr2) 1111 1112/*****************************************************************************/ 1113LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2) 1114/* 1115 * a0 - cinfo->image_width 1116 * a1 - cinfo->max_v_samp_factor 1117 * a2 - compptr->v_samp_factor 1118 * a3 - 
compptr->width_in_blocks 1119 * 16(sp) - input_data 1120 * 20(sp) - output_data 1121 */ 1122 .set at 1123 1124 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4 1125 1126 beqz a2, 7f 1127 lw s1, 44(sp) // s1 = output_data 1128 lw s0, 40(sp) // s0 = input_data 1129 srl s2, a0, 2 1130 andi t9, a0, 2 1131 srl t7, t9, 1 1132 addu s2, t7, s2 1133 sll t0, a3, 3 // t0 = width_in_blocks*DCT 1134 srl t7, t0, 1 1135 subu s2, t7, s2 11360: 1137 andi t6, a0, 1 // t6 = temp_index 1138 addiu t6, -1 1139 lw t4, 0(s1) // t4 = outptr 1140 lw t5, 0(s0) // t5 = inptr0 1141 li s3, 0 // s3 = bias 1142 srl t7, a0, 1 // t7 = image_width1 1143 srl s4, t7, 2 1144 andi t8, t7, 3 11451: 1146 ulhu t0, 0(t5) 1147 ulhu t1, 2(t5) 1148 ulhu t2, 4(t5) 1149 ulhu t3, 6(t5) 1150 raddu.w.qb t0, t0 1151 raddu.w.qb t1, t1 1152 raddu.w.qb t2, t2 1153 raddu.w.qb t3, t3 1154 shra.ph t0, t0, 1 1155 shra_r.ph t1, t1, 1 1156 shra.ph t2, t2, 1 1157 shra_r.ph t3, t3, 1 1158 sb t0, 0(t4) 1159 sb t1, 1(t4) 1160 sb t2, 2(t4) 1161 sb t3, 3(t4) 1162 addiu s4, -1 1163 addiu t4, 4 1164 bgtz s4, 1b 1165 addiu t5, 8 1166 beqz t8, 3f 1167 addu s4, t4, t8 11682: 1169 ulhu t0, 0(t5) 1170 raddu.w.qb t0, t0 1171 addqh.w t0, t0, s3 1172 xori s3, s3, 1 1173 sb t0, 0(t4) 1174 addiu t4, 1 1175 bne t4, s4, 2b 1176 addiu t5, 2 11773: 1178 lbux t1, t6(t5) 1179 sll t1, 1 1180 addqh.w t2, t1, s3 // t2 = pixval1 1181 xori s3, s3, 1 1182 addqh.w t3, t1, s3 // t3 = pixval2 1183 blez s2, 5f 1184 append t3, t2, 8 1185 addu t5, t4, s2 // t5 = loop_end2 11864: 1187 ush t3, 0(t4) 1188 addiu s2, -1 1189 bgtz s2, 4b 1190 addiu t4, 2 11915: 1192 beqz t9, 6f 1193 nop 1194 sb t2, 0(t4) 11956: 1196 addiu s1, 4 1197 addiu a2, -1 1198 bnez a2, 0b 1199 addiu s0, 4 12007: 1201 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4 1202 1203 j ra 1204 nop 1205END(jsimd_h2v1_downsample_mips_dspr2) 1206 1207/*****************************************************************************/ 1208LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2) 1209 1210/* 1211 * a0 - 
cinfo->image_width 1212 * a1 - cinfo->max_v_samp_factor 1213 * a2 - compptr->v_samp_factor 1214 * a3 - compptr->width_in_blocks 1215 * 16(sp) - input_data 1216 * 20(sp) - output_data 1217 */ 1218 .set at 1219 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1220 1221 beqz a2, 8f 1222 lw s1, 52(sp) // s1 = output_data 1223 lw s0, 48(sp) // s0 = input_data 1224 1225 andi t6, a0, 1 // t6 = temp_index 1226 addiu t6, -1 1227 srl t7, a0, 1 // t7 = image_width1 1228 srl s4, t7, 2 1229 andi t8, t7, 3 1230 andi t9, a0, 2 1231 srl s2, a0, 2 1232 srl t7, t9, 1 1233 addu s2, t7, s2 1234 sll t0, a3, 3 // s2 = width_in_blocks*DCT 1235 srl t7, t0, 1 1236 subu s2, t7, s2 12370: 1238 lw t4, 0(s1) // t4 = outptr 1239 lw t5, 0(s0) // t5 = inptr0 1240 lw s7, 4(s0) // s7 = inptr1 1241 li s6, 1 // s6 = bias 12422: 1243 ulw t0, 0(t5) // t0 = |P3|P2|P1|P0| 1244 ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0| 1245 ulw t2, 4(t5) 1246 ulw t3, 4(s7) 1247 precrq.ph.w t7, t0, t1 // t2 = |P3|P2|Q3|Q2| 1248 ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0| 1249 raddu.w.qb t1, t7 1250 raddu.w.qb t0, t0 1251 shra_r.w t1, t1, 2 1252 addiu t0, 1 1253 srl t0, 2 1254 precrq.ph.w t7, t2, t3 1255 ins t2, t3, 16, 16 1256 raddu.w.qb t7, t7 1257 raddu.w.qb t2, t2 1258 shra_r.w t7, t7, 2 1259 addiu t2, 1 1260 srl t2, 2 1261 sb t0, 0(t4) 1262 sb t1, 1(t4) 1263 sb t2, 2(t4) 1264 sb t7, 3(t4) 1265 addiu t4, 4 1266 addiu t5, 8 1267 addiu s4, s4, -1 1268 bgtz s4, 2b 1269 addiu s7, 8 1270 beqz t8, 4f 1271 addu t8, t4, t8 12723: 1273 ulhu t0, 0(t5) 1274 ulhu t1, 0(s7) 1275 ins t0, t1, 16, 16 1276 raddu.w.qb t0, t0 1277 addu t0, t0, s6 1278 srl t0, 2 1279 xori s6, s6, 3 1280 sb t0, 0(t4) 1281 addiu t5, 2 1282 addiu t4, 1 1283 bne t8, t4, 3b 1284 addiu s7, 2 12854: 1286 lbux t1, t6(t5) 1287 sll t1, 1 1288 lbux t0, t6(s7) 1289 sll t0, 1 1290 addu t1, t1, t0 1291 addu t3, t1, s6 1292 srl t0, t3, 2 // t2 = pixval1 1293 xori s6, s6, 3 1294 addu t2, t1, s6 1295 srl t1, t2, 2 // t3 = pixval2 1296 blez s2, 6f 1297 append t1, t0, 8 12985: 
1299 ush t1, 0(t4) 1300 addiu s2, -1 1301 bgtz s2, 5b 1302 addiu t4, 2 13036: 1304 beqz t9, 7f 1305 nop 1306 sb t0, 0(t4) 13077: 1308 addiu s1, 4 1309 addiu a2, -1 1310 bnez a2, 0b 1311 addiu s0, 8 13128: 1313 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1314 1315 j ra 1316 nop 1317END(jsimd_h2v2_downsample_mips_dspr2) 1318/*****************************************************************************/ 1319LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2) 1320/* 1321 * a0 - input_data 1322 * a1 - output_data 1323 * a2 - compptr->v_samp_factor 1324 * a3 - cinfo->max_v_samp_factor 1325 * 16(sp) - cinfo->smoothing_factor 1326 * 20(sp) - compptr->width_in_blocks 1327 * 24(sp) - cinfo->image_width 1328 */ 1329 1330 .set at 1331 1332 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1333 1334 lw s7, 52(sp) // compptr->width_in_blocks 1335 lw s0, 56(sp) // cinfo->image_width 1336 lw s6, 48(sp) // cinfo->smoothing_factor 1337 sll s7, 3 // output_cols = width_in_blocks * DCTSIZE 1338 sll v0, s7, 1 1339 subu v0, v0, s0 1340 blez v0, 2f 1341 move v1, zero 1342 addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2 13430: 1344 addiu t1, a0, -4 1345 sll t2, v1, 2 1346 lwx t1, t2(t1) 1347 move t3, v0 1348 addu t1, t1, s0 1349 lbu t2, -1(t1) 13501: 1351 addiu t3, t3, -1 1352 sb t2, 0(t1) 1353 bgtz t3, 1b 1354 addiu t1, t1, 1 1355 addiu v1, v1, 1 1356 bne v1, t0, 0b 1357 nop 13582: 1359 li v0, 80 1360 mul v0, s6, v0 1361 li v1, 16384 1362 move t4, zero 1363 move t5, zero 1364 subu t6, v1, v0 // t6 = 16384 - tmp_smoot_f * 80 1365 sll t7, s6, 4 // t7 = tmp_smoot_f * 16 13663: 1367/* Special case for first column: pretend column -1 is same as column 0 */ 1368 sll v0, t4, 2 1369 lwx t8, v0(a1) // outptr = output_data[outrow] 1370 sll v1, t5, 2 1371 addiu t9, v1, 4 1372 addiu s0, v1, -4 1373 addiu s1, v1, 8 1374 lwx s2, v1(a0) // inptr0 = input_data[inrow] 1375 lwx t9, t9(a0) // inptr1 = input_data[inrow+1] 1376 lwx s0, s0(a0) // above_ptr = input_data[inrow-1] 
1377 lwx s1, s1(a0) // below_ptr = input_data[inrow+2] 1378 lh v0, 0(s2) 1379 lh v1, 0(t9) 1380 lh t0, 0(s0) 1381 lh t1, 0(s1) 1382 ins v0, v1, 16, 16 1383 ins t0, t1, 16, 16 1384 raddu.w.qb t2, v0 1385 raddu.w.qb s3, t0 1386 lbu v0, 0(s2) 1387 lbu v1, 2(s2) 1388 lbu t0, 0(t9) 1389 lbu t1, 2(t9) 1390 addu v0, v0, v1 1391 mult $ac1,t2, t6 1392 addu t0, t0, t1 1393 lbu t2, 2(s0) 1394 addu t0, t0, v0 1395 lbu t3, 2(s1) 1396 addu s3, t0, s3 1397 lbu v0, 0(s0) 1398 lbu t0, 0(s1) 1399 sll s3, s3, 1 1400 addu v0, v0, t2 1401 addu t0, t0, t3 1402 addu t0, t0, v0 1403 addu s3, t0, s3 1404 madd $ac1,s3, t7 1405 extr_r.w v0, $ac1, 16 1406 addiu t8, t8, 1 1407 addiu s2, s2, 2 1408 addiu t9, t9, 2 1409 addiu s0, s0, 2 1410 addiu s1, s1, 2 1411 sb v0, -1(t8) 1412 addiu s4, s7, -2 1413 and s4, s4, 3 1414 addu s5, s4, t8 //end adress 14154: 1416 lh v0, 0(s2) 1417 lh v1, 0(t9) 1418 lh t0, 0(s0) 1419 lh t1, 0(s1) 1420 ins v0, v1, 16, 16 1421 ins t0, t1, 16, 16 1422 raddu.w.qb t2, v0 1423 raddu.w.qb s3, t0 1424 lbu v0, -1(s2) 1425 lbu v1, 2(s2) 1426 lbu t0, -1(t9) 1427 lbu t1, 2(t9) 1428 addu v0, v0, v1 1429 mult $ac1, t2, t6 1430 addu t0, t0, t1 1431 lbu t2, 2(s0) 1432 addu t0, t0, v0 1433 lbu t3, 2(s1) 1434 addu s3, t0, s3 1435 lbu v0, -1(s0) 1436 lbu t0, -1(s1) 1437 sll s3, s3, 1 1438 addu v0, v0, t2 1439 addu t0, t0, t3 1440 addu t0, t0, v0 1441 addu s3, t0, s3 1442 madd $ac1, s3, t7 1443 extr_r.w t2, $ac1, 16 1444 addiu t8, t8, 1 1445 addiu s2, s2, 2 1446 addiu t9, t9, 2 1447 addiu s0, s0, 2 1448 sb t2, -1(t8) 1449 bne s5, t8, 4b 1450 addiu s1, s1, 2 1451 addiu s5, s7, -2 1452 subu s5, s5, s4 1453 addu s5, s5, t8 //end adress 14545: 1455 lh v0, 0(s2) 1456 lh v1, 0(t9) 1457 lh t0, 0(s0) 1458 lh t1, 0(s1) 1459 ins v0, v1, 16, 16 1460 ins t0, t1, 16, 16 1461 raddu.w.qb t2, v0 1462 raddu.w.qb s3, t0 1463 lbu v0, -1(s2) 1464 lbu v1, 2(s2) 1465 lbu t0, -1(t9) 1466 lbu t1, 2(t9) 1467 addu v0, v0, v1 1468 mult $ac1, t2, t6 1469 addu t0, t0, t1 1470 lbu t2, 2(s0) 1471 addu t0, t0, v0 
1472 lbu t3, 2(s1) 1473 addu s3, t0, s3 1474 lbu v0, -1(s0) 1475 lbu t0, -1(s1) 1476 sll s3, s3, 1 1477 addu v0, v0, t2 1478 addu t0, t0, t3 1479 lh v1, 2(t9) 1480 addu t0, t0, v0 1481 lh v0, 2(s2) 1482 addu s3, t0, s3 1483 lh t0, 2(s0) 1484 lh t1, 2(s1) 1485 madd $ac1, s3, t7 1486 extr_r.w t2, $ac1, 16 1487 ins t0, t1, 16, 16 1488 ins v0, v1, 16, 16 1489 raddu.w.qb s3, t0 1490 lbu v1, 4(s2) 1491 lbu t0, 1(t9) 1492 lbu t1, 4(t9) 1493 sb t2, 0(t8) 1494 raddu.w.qb t3, v0 1495 lbu v0, 1(s2) 1496 addu t0, t0, t1 1497 mult $ac1, t3, t6 1498 addu v0, v0, v1 1499 lbu t2, 4(s0) 1500 addu t0, t0, v0 1501 lbu v0, 1(s0) 1502 addu s3, t0, s3 1503 lbu t0, 1(s1) 1504 lbu t3, 4(s1) 1505 addu v0, v0, t2 1506 sll s3, s3, 1 1507 addu t0, t0, t3 1508 lh v1, 4(t9) 1509 addu t0, t0, v0 1510 lh v0, 4(s2) 1511 addu s3, t0, s3 1512 lh t0, 4(s0) 1513 lh t1, 4(s1) 1514 madd $ac1, s3, t7 1515 extr_r.w t2, $ac1, 16 1516 ins t0, t1, 16, 16 1517 ins v0, v1, 16, 16 1518 raddu.w.qb s3, t0 1519 lbu v1, 6(s2) 1520 lbu t0, 3(t9) 1521 lbu t1, 6(t9) 1522 sb t2, 1(t8) 1523 raddu.w.qb t3, v0 1524 lbu v0, 3(s2) 1525 addu t0, t0,t1 1526 mult $ac1, t3, t6 1527 addu v0, v0, v1 1528 lbu t2, 6(s0) 1529 addu t0, t0, v0 1530 lbu v0, 3(s0) 1531 addu s3, t0, s3 1532 lbu t0, 3(s1) 1533 lbu t3, 6(s1) 1534 addu v0, v0, t2 1535 sll s3, s3, 1 1536 addu t0, t0, t3 1537 lh v1, 6(t9) 1538 addu t0, t0, v0 1539 lh v0, 6(s2) 1540 addu s3, t0, s3 1541 lh t0, 6(s0) 1542 lh t1, 6(s1) 1543 madd $ac1, s3, t7 1544 extr_r.w t3, $ac1, 16 1545 ins t0, t1, 16, 16 1546 ins v0, v1, 16, 16 1547 raddu.w.qb s3, t0 1548 lbu v1, 8(s2) 1549 lbu t0, 5(t9) 1550 lbu t1, 8(t9) 1551 sb t3, 2(t8) 1552 raddu.w.qb t2, v0 1553 lbu v0, 5(s2) 1554 addu t0, t0, t1 1555 mult $ac1, t2, t6 1556 addu v0, v0, v1 1557 lbu t2, 8(s0) 1558 addu t0, t0, v0 1559 lbu v0, 5(s0) 1560 addu s3, t0, s3 1561 lbu t0, 5(s1) 1562 lbu t3, 8(s1) 1563 addu v0, v0, t2 1564 sll s3, s3, 1 1565 addu t0, t0, t3 1566 addiu t8, t8, 4 1567 addu t0, t0, v0 1568 addiu s2, s2, 8 1569 
addu s3, t0, s3 1570 addiu t9, t9, 8 1571 madd $ac1, s3, t7 1572 extr_r.w t1, $ac1, 16 1573 addiu s0, s0, 8 1574 addiu s1, s1, 8 1575 bne s5, t8, 5b 1576 sb t1, -1(t8) 1577/* Special case for last column */ 1578 lh v0, 0(s2) 1579 lh v1, 0(t9) 1580 lh t0, 0(s0) 1581 lh t1, 0(s1) 1582 ins v0, v1, 16, 16 1583 ins t0, t1, 16, 16 1584 raddu.w.qb t2, v0 1585 raddu.w.qb s3, t0 1586 lbu v0, -1(s2) 1587 lbu v1, 1(s2) 1588 lbu t0, -1(t9) 1589 lbu t1, 1(t9) 1590 addu v0, v0, v1 1591 mult $ac1, t2, t6 1592 addu t0, t0, t1 1593 lbu t2, 1(s0) 1594 addu t0, t0, v0 1595 lbu t3, 1(s1) 1596 addu s3, t0, s3 1597 lbu v0, -1(s0) 1598 lbu t0, -1(s1) 1599 sll s3, s3, 1 1600 addu v0, v0, t2 1601 addu t0, t0, t3 1602 addu t0, t0, v0 1603 addu s3, t0, s3 1604 madd $ac1, s3, t7 1605 extr_r.w t0, $ac1, 16 1606 addiu t5, t5, 2 1607 sb t0, 0(t8) 1608 addiu t4, t4, 1 1609 bne t4, a2, 3b 1610 addiu t5, t5, 2 1611 1612 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1613 1614 j ra 1615 nop 1616 1617END(jsimd_h2v2_smooth_downsample_mips_dspr2) 1618 1619/*****************************************************************************/ 1620LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2) 1621/* 1622 * a0 - upsample->h_expand[compptr->component_index] 1623 * a1 - upsample->v_expand[compptr->component_index] 1624 * a2 - input_data 1625 * a3 - output_data_ptr 1626 * 16(sp) - cinfo->output_width 1627 * 20(sp) - cinfo->max_v_samp_factor 1628 */ 1629 .set at 1630 1631 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 1632 1633 lw s0, 0(a3) // s0 = output_data 1634 lw s1, 32(sp) // s1 = cinfo->output_width 1635 lw s2, 36(sp) // s2 = cinfo->max_v_samp_factor 1636 li t6, 0 // t6 = inrow 1637 beqz s2, 10f 1638 li s3, 0 // s3 = outrow 16390: 1640 addu t0, a2, t6 1641 addu t7, s0, s3 1642 lw t3, 0(t0) // t3 = inptr 1643 lw t8, 0(t7) // t8 = outptr 1644 beqz s1, 4f 1645 addu t5, t8, s1 // t5 = outend 16461: 1647 lb t2, 0(t3) // t2 = invalue = *inptr++ 1648 addiu t3, 1 1649 beqz a0, 3f 1650 move t0, a0 // t0 = h_expand 
2:
        /* Inner pixel-replication loop (jsimd_int_upsample tail): store the
         * current input byte h_expand times; t0 counts down from h_expand,
         * t8 is the output cursor.  Branch-delay slots intentionally hold the
         * pointer/counter updates. */
        sb      t2, 0(t8)
        addiu   t0, -1
        bgtz    t0, 2b
        addiu   t8, 1                   // (delay slot) outptr++
3:
        bgt     t5, t8, 1b              // continue until outptr reaches outend (t5)
        nop
4:
        /* Vertical replication: duplicate the just-expanded row into the next
         * v_expand-1 output rows. */
        addiu   t9, a1, -1              // t9 = v_expand - 1
        blez    t9, 9f                  // nothing to duplicate if v_expand <= 1
        nop
5:
        lw      t3, 0(s0)               // t3 = source row pointer
        lw      t4, 4(s0)               // t4 = destination row pointer (next row)
        subu    t0, s1, 0xF
        blez    t0, 7f                  // width < 16: byte-copy only
        addu    t5, t3, s1              // (delay slot) t5 = end address
        andi    t7, s1, 0xF             // t7 = residual
        subu    t8, t5, t7              // t8 = end of the 16-byte-aligned part
6:
        /* Unaligned 16-bytes-per-iteration row copy (ulw/usw tolerate any
         * alignment on DSPr2). */
        ulw     t0, 0(t3)
        ulw     t1, 4(t3)
        ulw     t2, 8(t3)
        usw     t0, 0(t4)
        ulw     t0, 12(t3)
        usw     t1, 4(t4)
        usw     t2, 8(t4)
        usw     t0, 12(t4)
        addiu   t3, 16
        bne     t3, t8, 6b
        addiu   t4, 16                  // (delay slot) advance destination
        beqz    t7, 8f                  // no residual bytes left
        nop
7:
        /* Residual byte-by-byte copy up to the true row end (t5). */
        lbu     t0, 0(t3)
        sb      t0, 0(t4)
        addiu   t3, 1
        bne     t3, t5, 7b
        addiu   t4, 1                   // (delay slot)
8:
        addiu   t9, -1
        bgtz    t9, 5b
        addiu   s0, 8                   // (delay slot) advance two row pointers
9:
        addu    s3, s3, a1              // outrow += v_expand
        bne     s3, s2, 0b              // until outrow == max_v_samp_factor
        addiu   t6, 1                   // (delay slot) inrow++
10:
        RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

        j       ra
        nop
END(jsimd_int_upsample_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
/*
 * Fast 2:1 horizontal upsampling (no vertical expansion): each input pixel
 * is written twice to the output row.  Eight input pixels are expanded to
 * sixteen output pixels per main-loop iteration using ins/srl byte doubling.
 *
 * a0 - cinfo->max_v_samp_factor
 * a1 - cinfo->output_width
 * a2 - input_data
 * a3 - output_data_ptr
 */
        lw      t7, 0(a3)               // t7 = output_data
        andi    t8, a1, 0xf             // t8 = residual (output_width % 16)
        sll     t0, a0, 2
        blez    a0, 4f                  // no rows to process
        addu    t9, t7, t0              // (delay slot) t9 = output_data end address
0:
        lw      t5, 0(t7)               // t5 = outptr
        lw      t6, 0(a2)               // t6 = inptr
        addu    t3, t5, a1              // t3 = outptr + output_width (end address)
        subu    t3, t8                  // t3 = end address - residual
        beq     t5, t3, 2f              // width < 16: go straight to byte loop
        move    t4, t8                  // (delay slot) t4 = residual output count
1:
        /* Duplicate 8 pixels -> 16 output bytes using unaligned word I/O. */
        ulw     t0, 0(t6)               // t0 = |P3|P2|P1|P0|
        ulw     t2, 4(t6)               // t2 = |P7|P6|P5|P4|
        srl     t1, t0, 16              // t1 = |X|X|P3|P2|
        ins     t0, t0, 16, 16          // t0 = |P1|P0|P1|P0|
        ins     t1, t1, 16, 16          // t1 = |P3|P2|P3|P2|
        ins     t0, t0, 8, 16           // t0 = |P1|P1|P0|P0|
        ins     t1, t1, 8, 16           // t1 = |P3|P3|P2|P2|
        usw     t0, 0(t5)
        usw     t1, 4(t5)
        srl     t0, t2, 16              // t0 = |X|X|P7|P6|
        ins     t2, t2, 16, 16          // t2 = |P5|P4|P5|P4|
        ins     t0, t0, 16, 16          // t0 = |P7|P6|P7|P6|
        ins     t2, t2, 8, 16           // t2 = |P5|P5|P4|P4|
        ins     t0, t0, 8, 16           // t0 = |P7|P7|P6|P6|
        usw     t2, 8(t5)
        usw     t0, 12(t5)
        addiu   t5, 16
        bne     t5, t3, 1b
        addiu   t6, 8                   // (delay slot) inptr += 8
        beqz    t8, 3f                  // no residual pixels
        move    t4, t8                  // (delay slot) reload residual counter
2:
        /* Residual loop: one input byte -> two output bytes; t4 counts
         * remaining output bytes, stepping down by 2. */
        lbu     t1, 0(t6)
        sb      t1, 0(t5)
        sb      t1, 1(t5)
        addiu   t4, -2
        addiu   t6, 1
        bgtz    t4, 2b
        addiu   t5, 2                   // (delay slot)
3:
        addiu   t7, 4                   // next output row pointer
        bne     t9, t7, 0b
        addiu   a2, 4                   // (delay slot) next input row pointer
4:
        j       ra
        nop
END(jsimd_h2v1_upsample_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
/*
 * Fast 2:2 upsampling: each input pixel is doubled horizontally (same
 * ins/srl trick as h2v1 above), then the finished output row is copied to
 * the following row, giving 2x vertical replication.
 *
 * a0 - cinfo->max_v_samp_factor
 * a1 - cinfo->output_width
 * a2 - input_data
 * a3 - output_data_ptr
 */
        lw      t7, 0(a3)               // t7 = output_data
        blez    a0, 7f                  // no rows to process
        andi    t9, a1, 0xf             // (delay slot) t9 = residual
0:
        lw      t6, 0(a2)               // t6 = inptr
        lw      t5, 0(t7)               // t5 = outptr
        addu    t8, t5, a1              // t8 = outptr end address
        subu    t8, t9                  // t8 = end address - residual
        beq     t5, t8, 2f              // width < 16: byte loop only
        move    t4, t9                  // (delay slot) t4 = residual output count
1:
        /* Horizontal doubling, 8 input pixels -> 16 output bytes. */
        ulw     t0, 0(t6)
        srl     t1, t0, 16
        ins     t0, t0, 16, 16
        ins     t0, t0, 8, 16
        ins     t1, t1, 16, 16
        ins     t1, t1, 8, 16
        ulw     t2, 4(t6)
        usw     t0, 0(t5)
        usw     t1, 4(t5)
        srl     t3, t2, 16
        ins     t2, t2, 16, 16
        ins     t2, t2, 8, 16
        ins     t3, t3, 16, 16
        ins     t3, t3, 8, 16
        usw     t2, 8(t5)
        usw     t3, 12(t5)
        addiu   t5, 16
        bne     t5, t8, 1b
        addiu   t6, 8                   // (delay slot) inptr += 8
        beqz    t9, 3f                  // no residual pixels
        move    t4, t9                  // (delay slot)
2:
        /* Residual: one input byte -> two output bytes. */
        lbu     t0, 0(t6)
        sb      t0, 0(t5)
        sb      t0, 1(t5)
        addiu   t4, -2
        addiu   t6, 1
        bgtz    t4, 2b
        addiu   t5, 2                   // (delay slot)
3:
        /* Vertical replication: copy finished row outptr[0] into outptr[1]. */
        lw      t6, 0(t7)               // t6 = outptr[0]
        lw      t5, 4(t7)               // t5 = outptr[1]
        addu    t4, t6, a1              // t4 = new end address
        beq     a1, t9, 5f              // width < 16: byte copy only
        subu    t8, t4, t9              // (delay slot) end of 16-byte-aligned part
4:
        /* Unaligned 16-bytes-per-iteration row copy. */
        ulw     t0, 0(t6)
        ulw     t1, 4(t6)
        ulw     t2, 8(t6)
        usw     t0, 0(t5)
        ulw     t0, 12(t6)
        usw     t1, 4(t5)
        usw     t2, 8(t5)
        usw     t0, 12(t5)
        addiu   t6, 16
        bne     t6, t8, 4b
        addiu   t5, 16                  // (delay slot) advance destination row
        beqz    t9, 6f                  // no residual bytes left
        nop
5:
        /* Residual byte-by-byte row copy up to the true end address (t4). */
        lbu     t0, 0(t6)
        sb      t0, 0(t5)
        addiu   t6, 1
        bne     t6, t4, 5b
        addiu   t5, 1                   // (delay slot)
6:
        addiu   t7, 8                   // advance two output row pointers
        addiu   a0, -2                  // two output rows consumed per iteration
        bgtz    a0, 0b
        addiu   a2, 4                   // (delay slot) one input row consumed
7:
        j       ra
        nop
END(jsimd_h2v2_upsample_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
/*
 * Accurate (slow) integer IDCT.  Column pass writes 32-bit intermediates to a
 * 256-byte workspace carved from the stack; the row pass (below, label 4:)
 * reads the workspace, range-limits via the a3 lookup table, and stores the
 * final 8x8 sample block.
 *
 * a0 - coef_block
 * a1 - compptr->dcttable
 * a2 - output
 * a3 - range_limit
 */

        SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

        addiu   sp, sp, -256            // 8x8 x int32 workspace on the stack
        move    v0, sp                  // v0 = wsptr
        addiu   v1, zero, 8             // v1 = DCTSIZE = 8 (column counter)
1:
        /* Gather the 7 AC coefficients of this column and OR them together:
         * if all are zero the column reduces to a scaled DC term. */
        lh      s4, 32(a0)              // s4 = inptr[16]
        lh      s5, 64(a0)              // s5 = inptr[32]
        lh      s6, 96(a0)              // s6 = inptr[48]
        lh      t1, 112(a0)             // t1 = inptr[56]
        lh      t7, 16(a0)              // t7 = inptr[8]
        lh      t5, 80(a0)              // t5 = inptr[40]
        lh      t3, 48(a0)              // t3 = inptr[24]
        or      s4, s4, t1
        or      s4, s4, t3
        or      s4, s4, t5
        or      s4, s4, t7
        or      s4, s4, s5
        or      s4, s4, s6
        bnez    s4, 2f                  // some AC term nonzero: full column IDCT
        addiu   v1, v1, -1              // (delay slot) one column done
        /* AC-free shortcut: replicate the dequantized, left-shifted DC value
         * into all 8 workspace rows of this column. */
        lh      s5, 0(a1)               // quantptr[DCTSIZE*0]
        lh      s6, 0(a0)               // inptr[DCTSIZE*0]
        mul     s5, s5, s6              // DEQUANTIZE(inptr[0], quantptr[0])
        sll     s5, s5, 2
        sw      s5, 0(v0)
        sw      s5, 32(v0)
        sw      s5, 64(v0)
        sw      s5, 96(v0)
        sw      s5, 128(v0)
        sw      s5, 160(v0)
        sw      s5, 192(v0)
        b       3f
        sw      s5, 224(v0)             // (delay slot) last row of the column
2:
        /* Full column pass: dequantize the odd coefficients ... */
        lh      t0, 112(a1)
        lh      t2, 48(a1)
        lh      t4, 80(a1)
        lh      t6, 16(a1)
        mul     t0, t0, t1              // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
        mul     t1, t2, t3              // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
        mul     t2, t4, t5              // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
        mul     t3, t6, t7              // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
        lh      t4, 32(a1)
        lh      t5, 32(a0)
        lh      t6, 96(a1)
        lh      t7, 96(a0)
        addu    s0, t0, t1              // z3 = tmp0 + tmp2
        addu    s1, t1, t2              //
z2 = tmp1 + tmp2 1907 addu s2, t2, t3 // z4 = tmp1 + tmp3 1908 addu s3, s0, s2 // z3 + z4 1909 addiu t9, zero, 9633 // FIX_1_175875602 1910 mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) 1911 addu t8, t0, t3 // z1 = tmp0 + tmp3 1912 addiu t9, zero, 2446 // FIX_0_298631336 1913 mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) 1914 addiu t9, zero, 16819 // FIX_2_053119869 1915 mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) 1916 addiu t9, zero, 25172 // FIX_3_072711026 1917 mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) 1918 addiu t9, zero, 12299 // FIX_1_501321110 1919 mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) 1920 addiu t9, zero, 16069 // FIX_1_961570560 1921 mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560) 1922 addiu t9, zero, 3196 // FIX_0_390180644 1923 mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644) 1924 addiu t9, zero, 7373 // FIX_0_899976223 1925 mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223) 1926 addiu t9, zero, 20995 // FIX_2_562915447 1927 mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447) 1928 subu s0, s3, s0 // z3 += z5 1929 addu t0, t0, s0 // tmp0 += z3 1930 addu t1, t1, s0 // tmp2 += z3 1931 subu s2, s3, s2 // z4 += z5 1932 addu t2, t2, s2 // tmp1 += z4 1933 addu t3, t3, s2 // tmp3 += z4 1934 subu t0, t0, t8 // tmp0 += z1 1935 subu t1, t1, s1 // tmp2 += z2 1936 subu t2, t2, s1 // tmp1 += z2 1937 subu t3, t3, t8 // tmp3 += z1 1938 mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2]) 1939 addiu t9, zero, 6270 // FIX_0_765366865 1940 mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6]) 1941 lh t4, 0(a1) 1942 lh t5, 0(a0) 1943 lh t6, 64(a1) 1944 lh t7, 64(a0) 1945 mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865) 1946 mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0]) 1947 mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4]) 1948 addiu t9, zero, 4433 // FIX_0_541196100 1949 addu s3, s0, s1 // z2 + z3 1950 mul s3, s3, t9 // z1 = MULTIPLY(z2 
+ z3, FIX_0_541196100) 1951 addiu t9, zero, 15137 // FIX_1_847759065 1952 mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065) 1953 addu t4, t5, t6 1954 subu t5, t5, t6 1955 sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS 1956 sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS 1957 addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) 1958 subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065) 1959 addu s0, t4, t7 1960 subu s1, t4, t7 1961 addu s2, t5, t6 1962 subu s3, t5, t6 1963 addu t4, s0, t3 1964 subu s0, s0, t3 1965 addu t3, s2, t1 1966 subu s2, s2, t1 1967 addu t1, s3, t2 1968 subu s3, s3, t2 1969 addu t2, s1, t0 1970 subu s1, s1, t0 1971 shra_r.w t4, t4, 11 1972 shra_r.w t3, t3, 11 1973 shra_r.w t1, t1, 11 1974 shra_r.w t2, t2, 11 1975 shra_r.w s1, s1, 11 1976 shra_r.w s3, s3, 11 1977 shra_r.w s2, s2, 11 1978 shra_r.w s0, s0, 11 1979 sw t4, 0(v0) 1980 sw t3, 32(v0) 1981 sw t1, 64(v0) 1982 sw t2, 96(v0) 1983 sw s1, 128(v0) 1984 sw s3, 160(v0) 1985 sw s2, 192(v0) 1986 sw s0, 224(v0) 19873: 1988 addiu a1, a1, 2 1989 addiu a0, a0, 2 1990 bgtz v1, 1b 1991 addiu v0, v0, 4 1992 move v0, sp 1993 addiu v1, zero, 8 19944: 1995 lw t0, 8(v0) // z2 = (INT32) wsptr[2] 1996 lw t1, 24(v0) // z3 = (INT32) wsptr[6] 1997 lw t2, 0(v0) // (INT32) wsptr[0] 1998 lw t3, 16(v0) // (INT32) wsptr[4] 1999 lw s4, 4(v0) // (INT32) wsptr[1] 2000 lw s5, 12(v0) // (INT32) wsptr[3] 2001 lw s6, 20(v0) // (INT32) wsptr[5] 2002 lw s7, 28(v0) // (INT32) wsptr[7] 2003 or s4, s4, t0 2004 or s4, s4, t1 2005 or s4, s4, t3 2006 or s4, s4, s7 2007 or s4, s4, s5 2008 or s4, s4, s6 2009 bnez s4, 5f 2010 addiu v1, v1, -1 2011 shra_r.w s5, t2, 5 2012 andi s5, s5, 0x3ff 2013 lbux s5, s5(a3) 2014 lw s1, 0(a2) 2015 replv.qb s5, s5 2016 usw s5, 0(s1) 2017 usw s5, 4(s1) 2018 b 6f 2019 nop 20205: 2021 addu t4, t0, t1 // z2 + z3 2022 addiu t8, zero, 4433 // FIX_0_541196100 2023 mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) 2024 addiu t8, zero, 15137 // FIX_1_847759065 2025 mul t1, t1, t8 // 
MULTIPLY(z3, FIX_1_847759065) 2026 addiu t8, zero, 6270 // FIX_0_765366865 2027 mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865) 2028 addu t4, t2, t3 // (INT32) wsptr[0] + (INT32) wsptr[4] 2029 subu t2, t2, t3 // (INT32) wsptr[0] - (INT32) wsptr[4] 2030 sll t4, t4, 13 // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS 2031 sll t2, t2, 13 // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS 2032 subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065) 2033 subu t3, t2, t1 // tmp12 = tmp1 - tmp2 2034 addu t2, t2, t1 // tmp11 = tmp1 + tmp2 2035 addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) 2036 subu t1, t4, t5 // tmp13 = tmp0 - tmp3 2037 addu t0, t4, t5 // tmp10 = tmp0 + tmp3 2038 lw t4, 28(v0) // tmp0 = (INT32) wsptr[7] 2039 lw t6, 12(v0) // tmp2 = (INT32) wsptr[3] 2040 lw t5, 20(v0) // tmp1 = (INT32) wsptr[5] 2041 lw t7, 4(v0) // tmp3 = (INT32) wsptr[1] 2042 addu s0, t4, t6 // z3 = tmp0 + tmp2 2043 addiu t8, zero, 9633 // FIX_1_175875602 2044 addu s1, t5, t7 // z4 = tmp1 + tmp3 2045 addu s2, s0, s1 // z3 + z4 2046 mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) 2047 addu s3, t4, t7 // z1 = tmp0 + tmp3 2048 addu t9, t5, t6 // z2 = tmp1 + tmp2 2049 addiu t8, zero, 16069 // FIX_1_961570560 2050 mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560) 2051 addiu t8, zero, 3196 // FIX_0_390180644 2052 mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644) 2053 addiu t8, zero, 2446 // FIX_0_298631336 2054 mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) 2055 addiu t8, zero, 7373 // FIX_0_899976223 2056 mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223) 2057 addiu t8, zero, 16819 // FIX_2_053119869 2058 mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) 2059 addiu t8, zero, 20995 // FIX_2_562915447 2060 mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447) 2061 addiu t8, zero, 25172 // FIX_3_072711026 2062 mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) 2063 addiu t8, zero, 12299 // FIX_1_501321110 2064 mul t7, t7, t8 // tmp3 = 
MULTIPLY(tmp3, FIX_1_501321110) 2065 subu s0, s2, s0 // z3 += z5 2066 subu s1, s2, s1 // z4 += z5 2067 addu t4, t4, s0 2068 subu t4, t4, s3 // tmp0 2069 addu t5, t5, s1 2070 subu t5, t5, t9 // tmp1 2071 addu t6, t6, s0 2072 subu t6, t6, t9 // tmp2 2073 addu t7, t7, s1 2074 subu t7, t7, s3 // tmp3 2075 addu s0, t0, t7 2076 subu t0, t0, t7 2077 addu t7, t2, t6 2078 subu t2, t2, t6 2079 addu t6, t3, t5 2080 subu t3, t3, t5 2081 addu t5, t1, t4 2082 subu t1, t1, t4 2083 shra_r.w s0, s0, 18 2084 shra_r.w t7, t7, 18 2085 shra_r.w t6, t6, 18 2086 shra_r.w t5, t5, 18 2087 shra_r.w t1, t1, 18 2088 shra_r.w t3, t3, 18 2089 shra_r.w t2, t2, 18 2090 shra_r.w t0, t0, 18 2091 andi s0, s0, 0x3ff 2092 andi t7, t7, 0x3ff 2093 andi t6, t6, 0x3ff 2094 andi t5, t5, 0x3ff 2095 andi t1, t1, 0x3ff 2096 andi t3, t3, 0x3ff 2097 andi t2, t2, 0x3ff 2098 andi t0, t0, 0x3ff 2099 lw s1, 0(a2) 2100 lbux s0, s0(a3) 2101 lbux t7, t7(a3) 2102 lbux t6, t6(a3) 2103 lbux t5, t5(a3) 2104 lbux t1, t1(a3) 2105 lbux t3, t3(a3) 2106 lbux t2, t2(a3) 2107 lbux t0, t0(a3) 2108 sb s0, 0(s1) 2109 sb t7, 1(s1) 2110 sb t6, 2(s1) 2111 sb t5, 3(s1) 2112 sb t1, 4(s1) 2113 sb t3, 5(s1) 2114 sb t2, 6(s1) 2115 sb t0, 7(s1) 21166: 2117 addiu v0, v0, 32 2118 bgtz v1, 4b 2119 addiu a2, a2, 4 2120 addiu sp, sp, 256 2121 2122 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 2123 2124 j ra 2125 nop 2126 2127END(jsimd_idct_islow_mips_dspr2) 2128 2129/*****************************************************************************/ 2130LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2) 2131/* 2132 * a0 - inptr 2133 * a1 - quantptr 2134 * a2 - wsptr 2135 * a3 - mips_idct_ifast_coefs 2136 */ 2137 2138 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 2139 2140 addiu t9, a0, 16 // end address 2141 or AT, a3, zero 2142 21430: 2144 lw s0, 0(a1) // quantptr[DCTSIZE*0] 2145 lw t0, 0(a0) // inptr[DCTSIZE*0] 2146 lw t1, 16(a0) // inptr[DCTSIZE*1] 2147 muleq_s.w.phl v0, t0, s0 // tmp0 ... 
2148 lw t2, 32(a0) // inptr[DCTSIZE*2] 2149 lw t3, 48(a0) // inptr[DCTSIZE*3] 2150 lw t4, 64(a0) // inptr[DCTSIZE*4] 2151 lw t5, 80(a0) // inptr[DCTSIZE*5] 2152 muleq_s.w.phr t0, t0, s0 // ... tmp0 ... 2153 lw t6, 96(a0) // inptr[DCTSIZE*6] 2154 lw t7, 112(a0) // inptr[DCTSIZE*7] 2155 or s4, t1, t2 2156 or s5, t3, t4 2157 bnez s4, 1f 2158 ins t0, v0, 16, 16 // ... tmp0 2159 bnez s5, 1f 2160 or s6, t5, t6 2161 or s6, s6, t7 2162 bnez s6, 1f 2163 sw t0, 0(a2) // wsptr[DCTSIZE*0] 2164 sw t0, 16(a2) // wsptr[DCTSIZE*1] 2165 sw t0, 32(a2) // wsptr[DCTSIZE*2] 2166 sw t0, 48(a2) // wsptr[DCTSIZE*3] 2167 sw t0, 64(a2) // wsptr[DCTSIZE*4] 2168 sw t0, 80(a2) // wsptr[DCTSIZE*5] 2169 sw t0, 96(a2) // wsptr[DCTSIZE*6] 2170 sw t0, 112(a2) // wsptr[DCTSIZE*7] 2171 addiu a0, a0, 4 2172 b 2f 2173 addiu a1, a1, 4 2174 21751: 2176 lw s1, 32(a1) // quantptr[DCTSIZE*2] 2177 lw s2, 64(a1) // quantptr[DCTSIZE*4] 2178 muleq_s.w.phl v0, t2, s1 // tmp1 ... 2179 muleq_s.w.phr t2, t2, s1 // ... tmp1 ... 2180 lw s0, 16(a1) // quantptr[DCTSIZE*1] 2181 lw s1, 48(a1) // quantptr[DCTSIZE*3] 2182 lw s3, 96(a1) // quantptr[DCTSIZE*6] 2183 muleq_s.w.phl v1, t4, s2 // tmp2 ... 2184 muleq_s.w.phr t4, t4, s2 // ... tmp2 ... 2185 lw s2, 80(a1) // quantptr[DCTSIZE*5] 2186 lw t8, 4(AT) // FIX(1.414213562) 2187 ins t2, v0, 16, 16 // ... tmp1 2188 muleq_s.w.phl v0, t6, s3 // tmp3 ... 2189 muleq_s.w.phr t6, t6, s3 // ... tmp3 ... 2190 ins t4, v1, 16, 16 // ... tmp2 2191 addq.ph s4, t0, t4 // tmp10 2192 subq.ph s5, t0, t4 // tmp11 2193 ins t6, v0, 16, 16 // ... tmp3 2194 subq.ph s6, t2, t6 // tmp12 ... 2195 addq.ph s7, t2, t6 // tmp13 2196 mulq_s.ph s6, s6, t8 // ... tmp12 ... 2197 addq.ph t0, s4, s7 // tmp0 2198 subq.ph t6, s4, s7 // tmp3 2199 muleq_s.w.phl v0, t1, s0 // tmp4 ... 2200 muleq_s.w.phr t1, t1, s0 // ... tmp4 ... 2201 shll_s.ph s6, s6, 1 // x2 2202 lw s3, 112(a1) // quantptr[DCTSIZE*7] 2203 subq.ph s6, s6, s7 // ... tmp12 2204 muleq_s.w.phl v1, t7, s3 // tmp7 ... 
2205 muleq_s.w.phr t7, t7, s3 // ... tmp7 ... 2206 ins t1, v0, 16, 16 // ... tmp4 2207 addq.ph t2, s5, s6 // tmp1 2208 subq.ph t4, s5, s6 // tmp2 2209 muleq_s.w.phl v0, t5, s2 // tmp6 ... 2210 muleq_s.w.phr t5, t5, s2 // ... tmp6 ... 2211 ins t7, v1, 16, 16 // ... tmp7 2212 addq.ph s5, t1, t7 // z11 2213 subq.ph s6, t1, t7 // z12 2214 muleq_s.w.phl v1, t3, s1 // tmp5 ... 2215 muleq_s.w.phr t3, t3, s1 // ... tmp5 ... 2216 ins t5, v0, 16, 16 // ... tmp6 2217 ins t3, v1, 16, 16 // ... tmp5 2218 addq.ph s7, t5, t3 // z13 2219 subq.ph v0, t5, t3 // z10 2220 addq.ph t7, s5, s7 // tmp7 2221 subq.ph s5, s5, s7 // tmp11 ... 2222 addq.ph v1, v0, s6 // z5 ... 2223 mulq_s.ph s5, s5, t8 // ... tmp11 2224 lw t8, 8(AT) // FIX(1.847759065) 2225 lw s4, 0(AT) // FIX(1.082392200) 2226 addq.ph s0, t0, t7 2227 subq.ph s1, t0, t7 2228 mulq_s.ph v1, v1, t8 // ... z5 2229 shll_s.ph s5, s5, 1 // x2 2230 lw t8, 12(AT) // FIX(-2.613125930) 2231 sw s0, 0(a2) // wsptr[DCTSIZE*0] 2232 shll_s.ph v0, v0, 1 // x4 2233 mulq_s.ph v0, v0, t8 // tmp12 ... 2234 mulq_s.ph s4, s6, s4 // tmp10 ... 2235 shll_s.ph v1, v1, 1 // x2 2236 addiu a0, a0, 4 2237 addiu a1, a1, 4 2238 sw s1, 112(a2) // wsptr[DCTSIZE*7] 2239 shll_s.ph s6, v0, 1 // x4 2240 shll_s.ph s4, s4, 1 // x2 2241 addq.ph s6, s6, v1 // ... tmp12 2242 subq.ph t5, s6, t7 // tmp6 2243 subq.ph s4, s4, v1 // ... 
tmp10 2244 subq.ph t3, s5, t5 // tmp5 2245 addq.ph s2, t2, t5 2246 addq.ph t1, s4, t3 // tmp4 2247 subq.ph s3, t2, t5 2248 sw s2, 16(a2) // wsptr[DCTSIZE*1] 2249 sw s3, 96(a2) // wsptr[DCTSIZE*6] 2250 addq.ph v0, t4, t3 2251 subq.ph v1, t4, t3 2252 sw v0, 32(a2) // wsptr[DCTSIZE*2] 2253 sw v1, 80(a2) // wsptr[DCTSIZE*5] 2254 addq.ph v0, t6, t1 2255 subq.ph v1, t6, t1 2256 sw v0, 64(a2) // wsptr[DCTSIZE*4] 2257 sw v1, 48(a2) // wsptr[DCTSIZE*3] 2258 22592: 2260 bne a0, t9, 0b 2261 addiu a2, a2, 4 2262 2263 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 2264 2265 j ra 2266 nop 2267 2268END(jsimd_idct_ifast_cols_mips_dspr2) 2269 2270/*****************************************************************************/ 2271LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2) 2272/* 2273 * a0 - wsptr 2274 * a1 - output_buf 2275 * a2 - output_col 2276 * a3 - mips_idct_ifast_coefs 2277 */ 2278 2279 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 2280 2281 addiu t9, a0, 128 // end address 2282 lui s8, 0x8080 2283 ori s8, s8, 0x8080 2284 22850: 2286 lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs) 2287 lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a 2288 lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A 2289 lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c 2290 lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C 2291 lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e 2292 lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E 2293 lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g 2294 lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G 2295 precrq.ph.w t1, s0, t0 // B b 2296 ins t0, s0, 16, 16 // A a 2297 bnez t1, 1f 2298 or s0, t2, s2 2299 bnez s0, 1f 2300 or s0, t4, s4 2301 bnez s0, 1f 2302 or s0, t6, s6 2303 bnez s0, 1f 2304 shll_s.ph s0, t0, 2 // A a 2305 lw a3, 0(a1) 2306 lw AT, 4(a1) 2307 precrq.ph.w t0, s0, s0 // A A 2308 ins s0, s0, 16, 16 // a a 2309 addu a3, a3, a2 2310 addu AT, AT, a2 2311 precrq.qb.ph t0, t0, t0 // A A A A 2312 precrq.qb.ph s0, s0, s0 // a a a a 2313 addu.qb s0, s0, s8 2314 addu.qb t0, t0, s8 2315 
sw s0, 0(a3) 2316 sw s0, 4(a3) 2317 sw t0, 0(AT) 2318 sw t0, 4(AT) 2319 addiu a0, a0, 32 2320 bne a0, t9, 0b 2321 addiu a1, a1, 8 2322 b 2f 2323 nop 2324 23251: 2326 precrq.ph.w t3, s2, t2 2327 ins t2, s2, 16, 16 2328 precrq.ph.w t5, s4, t4 2329 ins t4, s4, 16, 16 2330 precrq.ph.w t7, s6, t6 2331 ins t6, s6, 16, 16 2332 lw t8, 4(AT) // FIX(1.414213562) 2333 addq.ph s4, t0, t4 // tmp10 2334 subq.ph s5, t0, t4 // tmp11 2335 subq.ph s6, t2, t6 // tmp12 ... 2336 addq.ph s7, t2, t6 // tmp13 2337 mulq_s.ph s6, s6, t8 // ... tmp12 ... 2338 addq.ph t0, s4, s7 // tmp0 2339 subq.ph t6, s4, s7 // tmp3 2340 shll_s.ph s6, s6, 1 // x2 2341 subq.ph s6, s6, s7 // ... tmp12 2342 addq.ph t2, s5, s6 // tmp1 2343 subq.ph t4, s5, s6 // tmp2 2344 addq.ph s5, t1, t7 // z11 2345 subq.ph s6, t1, t7 // z12 2346 addq.ph s7, t5, t3 // z13 2347 subq.ph v0, t5, t3 // z10 2348 addq.ph t7, s5, s7 // tmp7 2349 subq.ph s5, s5, s7 // tmp11 ... 2350 addq.ph v1, v0, s6 // z5 ... 2351 mulq_s.ph s5, s5, t8 // ... tmp11 2352 lw t8, 8(AT) // FIX(1.847759065) 2353 lw s4, 0(AT) // FIX(1.082392200) 2354 addq.ph s0, t0, t7 // tmp0 + tmp7 2355 subq.ph s7, t0, t7 // tmp0 - tmp7 2356 mulq_s.ph v1, v1, t8 // ... z5 2357 lw a3, 0(a1) 2358 lw t8, 12(AT) // FIX(-2.613125930) 2359 shll_s.ph s5, s5, 1 // x2 2360 addu a3, a3, a2 2361 shll_s.ph v0, v0, 1 // x4 2362 mulq_s.ph v0, v0, t8 // tmp12 ... 2363 mulq_s.ph s4, s6, s4 // tmp10 ... 2364 shll_s.ph v1, v1, 1 // x2 2365 addiu a0, a0, 32 2366 addiu a1, a1, 8 2367 shll_s.ph s6, v0, 1 // x4 2368 shll_s.ph s4, s4, 1 // x2 2369 addq.ph s6, s6, v1 // ... tmp12 2370 shll_s.ph s0, s0, 2 2371 subq.ph t5, s6, t7 // tmp6 2372 subq.ph s4, s4, v1 // ... 
tmp10 2373 subq.ph t3, s5, t5 // tmp5 2374 shll_s.ph s7, s7, 2 2375 addq.ph t1, s4, t3 // tmp4 2376 addq.ph s1, t2, t5 // tmp1 + tmp6 2377 subq.ph s6, t2, t5 // tmp1 - tmp6 2378 addq.ph s2, t4, t3 // tmp2 + tmp5 2379 subq.ph s5, t4, t3 // tmp2 - tmp5 2380 addq.ph s4, t6, t1 // tmp3 + tmp4 2381 subq.ph s3, t6, t1 // tmp3 - tmp4 2382 shll_s.ph s1, s1, 2 2383 shll_s.ph s2, s2, 2 2384 shll_s.ph s3, s3, 2 2385 shll_s.ph s4, s4, 2 2386 shll_s.ph s5, s5, 2 2387 shll_s.ph s6, s6, 2 2388 precrq.ph.w t0, s1, s0 // B A 2389 ins s0, s1, 16, 16 // b a 2390 precrq.ph.w t2, s3, s2 // D C 2391 ins s2, s3, 16, 16 // d c 2392 precrq.ph.w t4, s5, s4 // F E 2393 ins s4, s5, 16, 16 // f e 2394 precrq.ph.w t6, s7, s6 // H G 2395 ins s6, s7, 16, 16 // h g 2396 precrq.qb.ph t0, t2, t0 // D C B A 2397 precrq.qb.ph s0, s2, s0 // d c b a 2398 precrq.qb.ph t4, t6, t4 // H G F E 2399 precrq.qb.ph s4, s6, s4 // h g f e 2400 addu.qb s0, s0, s8 2401 addu.qb s4, s4, s8 2402 sw s0, 0(a3) // outptr[0/1/2/3] d c b a 2403 sw s4, 4(a3) // outptr[4/5/6/7] h g f e 2404 lw a3, -4(a1) 2405 addu.qb t0, t0, s8 2406 addu a3, a3, a2 2407 addu.qb t4, t4, s8 2408 sw t0, 0(a3) // outptr[0/1/2/3] D C B A 2409 bne a0, t9, 0b 2410 sw t4, 4(a3) // outptr[4/5/6/7] H G F E 2411 24122: 2413 2414 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 2415 2416 j ra 2417 nop 2418 2419END(jsimd_idct_ifast_rows_mips_dspr2) 2420 2421/*****************************************************************************/ 2422LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2) 2423/* 2424 * a0 - data 2425 */ 2426 2427 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 2428 2429 lui t0, 6437 2430 ori t0, 2260 2431 lui t1, 9633 2432 ori t1, 11363 2433 lui t2, 0xd39e 2434 ori t2, 0xe6dc 2435 lui t3, 0xf72d 2436 ori t3, 9633 2437 lui t4, 2261 2438 ori t4, 9633 2439 lui t5, 0xd39e 2440 ori t5, 6437 2441 lui t6, 9633 2442 ori t6, 0xd39d 2443 lui t7, 0xe6dc 2444 ori t7, 2260 2445 lui t8, 4433 2446 ori t8, 10703 2447 lui t9, 0xd630 
2448 ori t9, 4433 2449 li s8, 8 2450 move a1, a0 24511: 2452 lw s0, 0(a1) // tmp0 = 1|0 2453 lw s1, 4(a1) // tmp1 = 3|2 2454 lw s2, 8(a1) // tmp2 = 5|4 2455 lw s3, 12(a1) // tmp3 = 7|6 2456 packrl.ph s1, s1, s1 // tmp1 = 2|3 2457 packrl.ph s3, s3, s3 // tmp3 = 6|7 2458 subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4 2459 subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7 2460 mult $0, $0 // ac0 = 0 2461 dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260 2462 dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363 2463 mult $ac1, $0, $0 // ac1 = 0 2464 dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436 2465 dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633 2466 mult $ac2, $0, $0 // ac2 = 0 2467 dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633 2468 dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437 2469 mult $ac3, $0, $0 // ac3 = 0 2470 dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363 2471 dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260 2472 addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3 2473 addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0 2474 extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 2475 extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 2476 extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11 2477 extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11 2478 addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10 2479 subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13 2480 sh s0, 2(a1) 2481 sh s1, 6(a1) 2482 sh s2, 10(a1) 2483 sh s3, 14(a1) 2484 mult $0, $0 // ac0 = 0 2485 dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703 2486 mult $ac1, $0, $0 // ac1 = 0 2487 dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433 2488 sra s4, s5, 16 // tmp4 = t11 2489 addiu a1, a1, 16 2490 addiu s8, s8, -1 2491 extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 2492 extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 2493 addu s2, s5, s4 // tmp2 = t10 + t11 2494 subu s3, s5, s4 // tmp3 = t10 - t11 2495 sll s2, s2, 2 // tmp2 = (t10 + t11) << 2 2496 sll s3, s3, 2 // tmp3 = 
/* jsimd_fdct_islow: even row outputs, then pass 2 (columns) begins. */
        sh         s2, -16(a1)             // positions 0,4 of the row just finished
        sh         s3, -8(a1)
        sh         s0, -12(a1)
        bgtz       s8, 1b
        sh         s1, -4(a1)              // branch delay slot

        // Column-pass coefficients (scalar this time; c0..c10 per the comments).
        li         t0, 2260
        li         t1, 11363
        li         t2, 9633
        li         t3, 6436
        li         t4, 6437
        li         t5, 2261
        li         t6, 11362
        li         t7, 2259
        li         t8, 4433
        li         t9, 10703
        li         a1, 10704
        li         s8, 8                   // column counter

2:
        lh         a2, 0(a0)               // 0
        lh         a3, 16(a0)              // 8
        lh         v0, 32(a0)              // 16
        lh         v1, 48(a0)              // 24
        lh         s4, 64(a0)              // 32
        lh         s5, 80(a0)              // 40
        lh         s6, 96(a0)              // 48
        lh         s7, 112(a0)             // 56
        addu       s2, v0, s5              // tmp2 = 16 + 40
        subu       s5, v0, s5              // tmp5 = 16 - 40
        addu       s3, v1, s4              // tmp3 = 24 + 32
        subu       s4, v1, s4              // tmp4 = 24 - 32
        addu       s0, a2, s7              // tmp0 = 0 + 56
        subu       s7, a2, s7              // tmp7 = 0 - 56
        addu       s1, a3, s6              // tmp1 = 8 + 48
        subu       s6, a3, s6              // tmp6 = 8 - 48
        addu       a2, s0, s3              // tmp10 = tmp0 + tmp3
        subu       v1, s0, s3              // tmp13 = tmp0 - tmp3
        addu       a3, s1, s2              // tmp11 = tmp1 + tmp2
        subu       v0, s1, s2              // tmp12 = tmp1 - tmp2
        mult       s7, t1                  // ac0 = tmp7 * c1
        madd       s4, t0                  // ac0 += tmp4 * c0
        madd       s5, t4                  // ac0 += tmp5 * c4
        madd       s6, t2                  // ac0 += tmp6 * c2
        mult       $ac1, s7, t2            // ac1 = tmp7 * c2
        msub       $ac1, s4, t3            // ac1 -= tmp4 * c3
        msub       $ac1, s5, t6            // ac1 -= tmp5 * c6
        msub       $ac1, s6, t7            // ac1 -= tmp6 * c7
        mult       $ac2, s7, t4            // ac2 = tmp7 * c4
        madd       $ac2, s4, t2            // ac2 += tmp4 * c2
        madd       $ac2, s5, t5            // ac2 += tmp5 * c5
        msub       $ac2, s6, t6            // ac2 -= tmp6 * c6
        mult       $ac3, s7, t0            // ac3 = tmp7 * c0
        msub       $ac3, s4, t1            // ac3 -= tmp4 * c1
        madd       $ac3, s5, t2            // ac3 += tmp5 * c2
        msub       $ac3, s6, t3            // ac3 -= tmp6 * c3
        extr_r.w   s0, $ac0, 15            // tmp0 = (ac0 + 16384) >> 15
        extr_r.w   s1, $ac1, 15            // tmp1 = (ac1 + 16384) >> 15
        extr_r.w   s2, $ac2, 15            // tmp2 = (ac2 + 16384) >> 15
        extr_r.w   s3, $ac3, 15            // tmp3 = (ac3 + 16384) >> 15
        addiu      s8, s8, -1
        addu       s4, a2, a3              // tmp4 = tmp10 + tmp11
        subu       s5, a2, a3              // tmp5 = tmp10 - tmp11
/* jsimd_fdct_islow column pass: store odd outputs, finish even outputs. */
        sh         s0, 16(a0)              // rows 1,3,5,7 of this column
        sh         s1, 48(a0)
        sh         s2, 80(a0)
        sh         s3, 112(a0)
        mult       v0, t8                  // ac0 = tmp12 * c8
        madd       v1, t9                  // ac0 += tmp13 * c9
        mult       $ac1, v1, t8            // ac1 = tmp13 * c8
        msub       $ac1, v0, a1            // ac1 -= tmp12 * c10
        addiu      a0, a0, 2               // next column
        extr_r.w   s6, $ac0, 15            // tmp6 = (ac0 + 16384) >> 15
        extr_r.w   s7, $ac1, 15            // tmp7 = (ac1 + 16384) >> 15
        shra_r.w   s4, s4, 2               // tmp4 = (tmp4 + 2) >> 2
        shra_r.w   s5, s5, 2               // tmp5 = (tmp5 + 2) >> 2
        sh         s4, -2(a0)              // rows 0,4,2,6 (a0 already advanced)
        sh         s5, 62(a0)
        sh         s6, 30(a0)
        bgtz       s8, 2b
        sh         s7, 94(a0)              // branch delay slot

        RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8

        jr         ra
        nop

END(jsimd_fdct_islow_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
/*
 * Forward DCT on an 8x8 block of 16-bit samples, in place at a0
 * (fast/approximate fixed-point variant; 8.8-style constants below).
 *
 * a0 - data
 */
        .set at
        SAVE_REGS_ON_STACK 8, s0, s1

        li         a1, 0x014e014e          // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
        li         a2, 0x008b008b          // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
        li         a3, 0x00620062          // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
        li         s1, 0x00b500b5          // FIX_0_707106781 (181 << 16)|(181 & 0xffff)

        move       v0, a0
        addiu      v1, v0, 128             // end address (8 rows * 16 bytes)

0:      /* Pass 1: rows. */
        lw         t0, 0(v0)               // tmp0 = 1|0
        lw         t1, 4(v0)               // tmp1 = 3|2
        lw         t2, 8(v0)               // tmp2 = 5|4
        lw         t3, 12(v0)              // tmp3 = 7|6
        packrl.ph  t1, t1, t1              // tmp1 = 2|3
        packrl.ph  t3, t3, t3              // tmp3 = 6|7
        subq.ph    t7, t1, t2              // tmp7 = 2-5|3-4 = t5|t4
        subq.ph    t5, t0, t3              // tmp5 = 1-6|0-7 = t6|t7
        addq.ph    t6, t1, t2              // tmp6 = 2+5|3+4 = t2|t3
        addq.ph    t4, t0, t3              // tmp4 = 1+6|0+7 = t1|t0
        addq.ph    t8, t4, t6              // tmp5 = t1+t2|t0+t3 = t11|t10
        subq.ph    t9, t4, t6              // tmp7 = t1-t2|t0-t3 = t12|t13
        sra        t4, t8, 16              // tmp4 = t11
        mult       $0, $0                  // ac0 = 0
        dpa.w.ph   $ac0, t9, s1
        mult       $ac1, $0, $0            // ac1 = 0
        dpa.w.ph   $ac1, t7, a3            // ac1 += t4*98 + t5*98
        dpsx.w.ph  $ac1, t5, a3            // ac1 += t6*98 + t7*98
        mult       $ac2, $0, $0            // ac2 = 0
        dpa.w.ph   $ac2, t7, a2            // ac2 += t4*139 + t5*139
        mult       $ac3, $0, $0            // ac3 = 0
        dpa.w.ph   $ac3, t5, a1            // ac3 += t6*334 + t7*334
        precrq.ph.w t0, t5, t7             // t0 = t5|t6
        addq.ph    t2, t8, t4              // tmp2 = t10 + t11
        subq.ph    t3, t8, t4              // tmp3 = t10 - t11
        extr.w     t4, $ac0, 8
        mult       $0, $0                  // ac0 = 0
        dpa.w.ph   $ac0, t0, s1            // ac0 += t5*181 + t6*181
        extr.w     t0, $ac1, 8             // t0 = z5
        extr.w     t1, $ac2, 8             // t1 = MULTIPLY(tmp10, 139)
        extr.w     t7, $ac3, 8             // t2 = MULTIPLY(tmp12, 334)
        extr.w     t8, $ac0, 8             // t8 = z3 = MULTIPLY(tmp11, 181)
        add        t6, t1, t0              // t6 = z2
        add        t7, t7, t0              // t7 = z4
        subq.ph    t0, t5, t8              // t0 = z13 = tmp7 - z3
        addq.ph    t8, t5, t8              // t9 = z11 = tmp7 + z3
        addq.ph    t1, t0, t6              // t1 = z13 + z2
        subq.ph    t6, t0, t6              // t6 = z13 - z2
        addq.ph    t0, t8, t7              // t0 = z11 + z4
        subq.ph    t7, t8, t7              // t7 = z11 - z4
        addq.ph    t5, t4, t9
        subq.ph    t4, t9, t4
        sh         t2, 0(v0)
        sh         t5, 4(v0)
        sh         t3, 8(v0)
        sh         t4, 12(v0)
        sh         t1, 10(v0)
        sh         t6, 6(v0)
        sh         t0, 2(v0)
        sh         t7, 14(v0)
        addiu      v0, 16
        bne        v1, v0, 0b
        nop

        move       v0, a0
        addiu      v1, v0, 16              // pass 2 end: 8 columns * 2 bytes

1:      /* Pass 2: columns (stride 16 bytes between rows). */
        lh         t0, 0(v0)               // 0
        lh         t1, 16(v0)              // 8
        lh         t2, 32(v0)              // 16
        lh         t3, 48(v0)              // 24
        lh         t4, 64(v0)              // 32
        lh         t5, 80(v0)              // 40
        lh         t6, 96(v0)              // 48
        lh         t7, 112(v0)             // 56
        add        t8, t0, t7              // t8 = tmp0
        sub        t7, t0, t7              // t7 = tmp7
        add        t0, t1, t6              // t0 = tmp1
        sub        t1, t1, t6              // t1 = tmp6
        add        t6, t2, t5              // t6 = tmp2
        sub        t5, t2, t5              // t5 = tmp5
        add        t2, t3, t4              // t2 = tmp3
        sub        t3, t3, t4              // t3 = tmp4
        add        t4, t8, t2              // t4 = tmp10 = tmp0 + tmp3
        sub        t8, t8, t2              // t8 = tmp13 = tmp0 - tmp3
        sub        s0, t0, t6              // s0 = tmp12 = tmp1 - tmp2
        ins        t8, s0, 16, 16          // t8 = tmp12|tmp13
        add        t2, t0, t6              // t2 = tmp11 = tmp1 + tmp2
        mult       $0, $0                  // ac0 = 0
        dpa.w.ph   $ac0, t8, s1            // ac0 += t12*181 + t13*181
        add        s0, t4, t2              // t8 = tmp10+tmp11
        sub        t4, t4, t2              // t4 = tmp10-tmp11
        sh         s0, 0(v0)
        sh         t4, 64(v0)
        extr.w     t2, $ac0, 8             // z1 = MULTIPLY(tmp12+tmp13, FIX_0_707106781)
        addq.ph    t4, t8, t2              // t9 = tmp13 + z1
        subq.ph    t8, t8, t2              // t2 = tmp13 - z1
        sh         t4, 32(v0)
        sh         t8, 96(v0)
        add        t3, t3, t5              // t3 = tmp10 = tmp4 + tmp5
        add        t0, t5, t1              // t0 = tmp11 = tmp5 + tmp6
        add        t1, t1, t7              // t1 = tmp12 = tmp6 + tmp7
        andi       t4, a1, 0xffff          // low half of packed constant
        mul        s0, t1, t4
        sra        s0, s0, 8               // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
        ins        t1, t3, 16, 16          // t1 = tmp10|tmp12
        mult       $0, $0                  // ac0 = 0
        mulsa.w.ph $ac0, t1, a3            // ac0 += t10*98 - t12*98
        extr.w     t8, $ac0, 8             // z5 = MULTIPLY(tmp10-tmp12, FIX_0_382683433)
        add        t2, t7, t8              // t2 = tmp7 + z5
        sub        t7, t7, t8              // t7 = tmp7 - z5
        andi       t4, a2, 0xffff
        mul        t8, t3, t4
        sra        t8, t8, 8               // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
        andi       t4, s1, 0xffff
        mul        t6, t0, t4
        sra        t6, t6, 8               // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
        add        t0, t6, t8              // t0 = z3 + z2
        sub        t1, t6, t8              // t1 = z3 - z2
        add        t3, t6, s0              // t3 = z3 + z4
        sub        t4, t6, s0              // t4 = z3 - z4
        sub        t5, t2, t1              // t5 = dataptr[5]
        sub        t6, t7, t0              // t6 = dataptr[3]
        add        t3, t2, t3              // t3 = dataptr[1]
        add        t4, t7, t4              // t4 = dataptr[7]
        sh         t5, 80(v0)
        sh         t6, 48(v0)
        sh         t3, 16(v0)
        sh         t4, 112(v0)
        addiu      v0, 2
        bne        v0, v1, 1b
        nop

        RESTORE_REGS_FROM_STACK 8, s0, s1

        j          ra
        nop
END(jsimd_fdct_ifast_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
/*
 * Quantize a block of 64 coefficients, two per loop iteration.
 * NOTE(review): the divisors-table offsets used below (+0 reciprocal,
 * +128 correction, +384 shift) look like libjpeg-turbo's 4-plane divisor
 * layout — confirm against the table builder.
 *
 * a0 - coef_block
 * a1 - divisors
 * a2 - workspace
 */

        .set at

        SAVE_REGS_ON_STACK 16, s0, s1, s2

        addiu      v0, a2, 124             // v0 = workspace_end
        lh         t0, 0(a2)
        lh         t1, 0(a1)
        lh         t2, 128(a1)
        sra        t3, t0, 15              // t3 = sign(t0) ? -1 : 1 ...
        sll        t3, t3, 1
        addiu      t3, t3, 1               // ... built as 2*sign + 1
        mul        t0, t0, t3              // t0 = |t0|
        lh         t4, 384(a1)
        lh         t5, 130(a1)
        lh         t6, 2(a2)
        lh         t7, 2(a1)
        lh         t8, 386(a1)

1:
        andi       t1, 0xffff
        add        t9, t0, t2              // |coef| + correction
        andi       t9, 0xffff
        mul        v1, t9, t1              // * reciprocal
        sra        s0, t6, 15              // sign factor for the second coef
        sll        s0, s0, 1
        addiu      s0, s0, 1
        addiu      t9, t4, 16
        srav       v1, v1, t9              // >> (shift + 16)
        mul        v1, v1, t3              // restore sign
        mul        t6, t6, s0
        andi       t7, 0xffff
        addiu      a2, a2, 4
        addiu      a1, a1, 4
        add        s1, t6, t5
        andi       s1, 0xffff
        sh         v1, 0(a0)

        mul        s2, s1, t7
        addiu      s1, t8, 16
        srav       s2, s2, s1
        mul        s2, s2, s0
        lh         t0, 0(a2)               // prefetch next pair's inputs
        lh         t1, 0(a1)
        sra        t3, t0, 15
        sll        t3, t3, 1
        addiu      t3, t3, 1
        mul        t0, t0, t3
        lh         t2, 128(a1)
        lh         t4, 384(a1)
        lh         t5, 130(a1)
        lh         t8, 386(a1)
        lh         t6, 2(a2)
        lh         t7, 2(a1)
        sh         s2, 2(a0)
        lh         t0, 0(a2)
        sra        t3, t0, 15
        sll        t3, t3, 1
        addiu      t3, t3, 1
        mul        t0, t0, t3
        bne        a2, v0, 1b
        addiu      a0, a0, 4               // branch delay slot

        /* Epilogue: final coefficient pair (loop exits one pair early). */
        andi       t1, 0xffff
        add        t9, t0, t2
        andi       t9, 0xffff
        mul        v1, t9, t1
        sra        s0, t6, 15
        sll        s0, s0, 1
        addiu      s0, s0, 1
        addiu      t9, t4, 16
        srav       v1, v1, t9
        mul        v1, v1, t3
        mul        t6, t6, s0
        andi       t7, 0xffff
        sh         v1, 0(a0)
        add        s1, t6, t5
        andi       s1, 0xffff
        mul        s2, s1, t7
        addiu      s1, t8, 16
        addiu      a2, a2, 4
        addiu      a1, a1, 4
        srav       s2, s2, s1
        mul        s2, s2, s0
        sh         s2, 2(a0)

        RESTORE_REGS_FROM_STACK 16, s0, s1, s2

        j          ra
        nop

END(jsimd_quantize_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
/*
 * Float quantization: out[i] = (int)(workspace[i] * divisors[i] + 16384.5) - 16384,
 * 8 coefficients per iteration.
 *
 * a0 - coef_block
 * a1 - divisors
 * a2 - workspace
 */

        .set at

        li         t1, 0x46800100          // IEEE-754 bits of 16384.5 (rounding bias)
        mtc1       t1, f0
        li         t0, 63                  // 64 coefficients, 8 per pass
0:
        lwc1       f1, 0(a2)
        lwc1       f5, 0(a1)
        lwc1       f2, 4(a2)
        lwc1       f6, 4(a1)
        lwc1       f3, 8(a2)
        lwc1       f7, 8(a1)
        lwc1       f4, 12(a2)
        lwc1       f8, 12(a1)
        madd.s     f1, f0, f1, f5          // f1 = ws*div + 16384.5
        madd.s     f2, f0, f2, f6
        madd.s     f3, f0, f3, f7
        madd.s     f4, f0, f4, f8
        lwc1       f5, 16(a1)
        lwc1       f6, 20(a1)
        trunc.w.s  f1, f1
        trunc.w.s  f2, f2
        trunc.w.s  f3, f3
        trunc.w.s  f4, f4
        lwc1       f7, 24(a1)
        lwc1       f8, 28(a1)
        mfc1       t1, f1
        mfc1       t2, f2
        mfc1       t3, f3
        mfc1       t4, f4
        lwc1       f1, 16(a2)
        lwc1       f2, 20(a2)
        lwc1       f3, 24(a2)
        lwc1       f4, 28(a2)
        madd.s     f1, f0, f1, f5
        madd.s     f2, f0, f2, f6
        madd.s     f3, f0, f3, f7
        madd.s     f4, f0, f4, f8
        addiu      t1, t1, -16384          // remove the rounding bias
        addiu      t2, t2, -16384
        addiu      t3, t3, -16384
        addiu      t4, t4, -16384
        trunc.w.s  f1, f1
        trunc.w.s  f2, f2
        trunc.w.s  f3, f3
        trunc.w.s  f4, f4
        sh         t1, 0(a0)
        sh         t2, 2(a0)
        sh         t3, 4(a0)
        sh         t4, 6(a0)
        mfc1       t1, f1
        mfc1       t2, f2
        mfc1       t3, f3
        mfc1       t4, f4
        addiu      t0, t0, -8
        addiu      a2, a2, 32
        addiu      a1, a1, 32
        addiu      t1, t1, -16384
        addiu      t2, t2, -16384
        addiu      t3, t3, -16384
        addiu      t4, t4, -16384
        sh         t1, 8(a0)
        sh         t2, 10(a0)
        sh         t3, 12(a0)
        sh         t4, 14(a0)
        bgez       t0, 0b
        addiu      a0, a0, 16              // branch delay slot

        j          ra
        nop

END(jsimd_quantize_float_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
/*
 * Reduced-size inverse DCT producing a 2x2 output block.
 *
 * a0 - compptr->dct_table
 * a1 - coef_block
 * a2 - output_buf
 * a3 - output_col
 */
        .set at

        SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

        addiu      sp, sp, -40             // 40-byte local workspace
        move       v0, sp
        addiu      s2, zero, 29692
        addiu      s3, zero, -10426
        addiu      s4, zero, 6967
        addiu      s5, zero, -5906
        lh         t0, 0(a1)               // t0 = inptr[DCTSIZE*0]
        lh         t5, 0(a0)               // t5 = quantptr[DCTSIZE*0]
        lh         t1, 48(a1)              // t1 = inptr[DCTSIZE*3]
        lh         t6, 48(a0)              // t6 = quantptr[DCTSIZE*3]
        mul        t4, t5, t0
        lh         t0, 16(a1)              // t0 = inptr[DCTSIZE*1]
        lh         t5, 16(a0)              // t5 = quantptr[DCTSIZE*1]
        mul        t6, t6, t1
        mul        t5, t5, t0
        lh         t2, 80(a1)              // t2 = inptr[DCTSIZE*5]
        lh         t7, 80(a0)              // t7 = quantptr[DCTSIZE*5]
        lh         t3, 112(a1)             // t3 = inptr[DCTSIZE*7]
        lh         t8, 112(a0)             // t8 = quantptr[DCTSIZE*7]
        mul        t7, t7, t2
        mult       zero, zero
        mul        t8, t8, t3
        li         s0, 0x73FCD746          // s0 = (29692 << 16) | (-10426 & 0xffff)
        li         s1, 0x1B37E8EE          // s1 = (6967 << 16) | (-5906 & 0xffff)
        ins        t6, t5, 16, 16          // t6 = t5|t6
        sll        t4, t4, 15
        dpa.w.ph   $ac0, t6, s0
        lh         t1, 2(a1)
        lh         t6, 2(a0)
        ins        t8, t7, 16, 16          // t8 = t7|t8
        dpa.w.ph   $ac0, t8, s1
        mflo       t0, $ac0
        mul        t5, t6, t1
        lh         t1, 18(a1)
        lh         t6, 18(a0)
        lh         t2, 50(a1)
        lh         t7, 50(a0)
        mul        t6, t6, t1
        subu       t8, t4, t0
        mul        t7, t7, t2
        addu       t0, t4, t0
        shra_r.w   t0, t0, 13
        lh         t1, 82(a1)
        lh         t2, 82(a0)
        lh         t3, 114(a1)
        lh         t4, 114(a0)
        shra_r.w   t8, t8, 13
        mul        t1, t1, t2
        mul        t3, t3, t4
        sw         t0, 0(v0)
        sw         t8, 20(v0)
        /* Column 1 */
        sll        t4, t5, 15
        ins        t7, t6, 16, 16
        mult       zero, zero
        dpa.w.ph   $ac0, t7, s0
        ins        t3, t1, 16, 16
        lh         t1, 6(a1)
        lh         t6, 6(a0)
        dpa.w.ph   $ac0, t3, s1
        mflo       t0, $ac0
        mul        t5, t6, t1
        lh         t1, 22(a1)
        lh         t6, 22(a0)
        lh         t2, 54(a1)
        lh         t7, 54(a0)
        mul        t6, t6, t1
        subu       t8, t4, t0
        mul        t7, t7, t2
        addu       t0, t4, t0
        shra_r.w   t0, t0, 13
        lh         t1, 86(a1)
        lh         t2, 86(a0)
        lh         t3, 118(a1)
        lh         t4, 118(a0)
        shra_r.w   t8, t8, 13
        mul        t1, t1, t2
        mul        t3, t3, t4
        sw         t0, 4(v0)
        sw         t8, 24(v0)
        /* Column 2 */
        sll        t4, t5, 15
        ins        t7, t6, 16, 16
        mult       zero, zero
        dpa.w.ph   $ac0, t7, s0
        ins        t3, t1, 16, 16
        lh         t1, 10(a1)
        lh         t6, 10(a0)
        dpa.w.ph   $ac0, t3, s1
        mflo       t0, $ac0
        mul        t5, t6, t1
        lh         t1, 26(a1)
        lh         t6, 26(a0)
        lh         t2, 58(a1)
        lh         t7, 58(a0)
        mul        t6, t6, t1
        subu       t8, t4, t0
/* jsimd_idct_2x2 continued: columns 3 and 4 of pass 1, then pass 2. */
        mul        t7, t7, t2
        addu       t0, t4, t0
        shra_r.w   t0, t0, 13
        lh         t1, 90(a1)
        lh         t2, 90(a0)
        lh         t3, 122(a1)
        lh         t4, 122(a0)
        shra_r.w   t8, t8, 13
        mul        t1, t1, t2
        mul        t3, t3, t4
        sw         t0, 8(v0)
        sw         t8, 28(v0)
        /* Column 3 */
        sll        t4, t5, 15
        ins        t7, t6, 16, 16
        mult       zero, zero
        dpa.w.ph   $ac0, t7, s0
        ins        t3, t1, 16, 16
        lh         t1, 14(a1)
        lh         t6, 14(a0)
        dpa.w.ph   $ac0, t3, s1
        mflo       t0, $ac0
        mul        t5, t6, t1
        lh         t1, 30(a1)
        lh         t6, 30(a0)
        lh         t2, 62(a1)
        lh         t7, 62(a0)
        mul        t6, t6, t1
        subu       t8, t4, t0
        mul        t7, t7, t2
        addu       t0, t4, t0
        shra_r.w   t0, t0, 13
        lh         t1, 94(a1)
        lh         t2, 94(a0)
        lh         t3, 126(a1)
        lh         t4, 126(a0)
        shra_r.w   t8, t8, 13
        mul        t1, t1, t2
        mul        t3, t3, t4
        sw         t0, 12(v0)
        sw         t8, 32(v0)
        /* Column 4 */
        sll        t4, t5, 15
        ins        t7, t6, 16, 16
        mult       zero, zero
        dpa.w.ph   $ac0, t7, s0
        ins        t3, t1, 16, 16
        dpa.w.ph   $ac0, t3, s1
        mflo       t0, $ac0
        lw         t9, 0(a2)               // output_buf[0]
        lw         t3, 0(v0)
        lw         t7, 4(v0)
        lw         t1, 8(v0)
        addu       t9, t9, a3              // + output_col
        sll        t3, t3, 15
        subu       t8, t4, t0
        addu       t0, t4, t0
        shra_r.w   t0, t0, 13
        shra_r.w   t8, t8, 13
        sw         t0, 16(v0)
        sw         t8, 36(v0)
        /* Pass 2: rows of the workspace -> 2x2 output samples. */
        lw         t5, 12(v0)
        lw         t6, 16(v0)
        mult       t7, s2
        madd       t1, s3
        madd       t5, s4
        madd       t6, s5
        lw         t5, 24(v0)
        lw         t7, 28(v0)
        mflo       t0, $ac0
        lw         t8, 32(v0)
        lw         t2, 36(v0)
        mult       $ac1, t5, s2
        madd       $ac1, t7, s3
        madd       $ac1, t8, s4
        madd       $ac1, t2, s5
        addu       t1, t3, t0
        subu       t6, t3, t0
        shra_r.w   t1, t1, 20
        shra_r.w   t6, t6, 20
        mflo       t4, $ac1
        shll_s.w   t1, t1, 24              // saturate to 8 bits (<<24 then >>24)
        shll_s.w   t6, t6, 24
        sra        t1, t1, 24
        sra        t6, t6, 24
        addiu      t1, t1, 128             // center on 128
        addiu      t6, t6, 128
        lw         t0, 20(v0)
        sb         t1, 0(t9)
        sb         t6, 1(t9)
        sll        t0, t0, 15
        lw         t9, 4(a2)               // output_buf[1]
        addu       t1, t0, t4
        subu       t6, t0, t4
        addu       t9, t9, a3
        shra_r.w   t1, t1, 20
        shra_r.w   t6, t6, 20
        shll_s.w   t1, t1, 24
        shll_s.w   t6, t6, 24
        sra        t1, t1, 24
        sra        t6, t6, 24
        addiu      t1, t1, 128
        addiu      t6, t6, 128
        sb         t1, 0(t9)
        sb         t6, 1(t9)
        addiu      sp, sp, 40              // release local workspace

        RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

        j          ra
        nop

END(jsimd_idct_2x2_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2)
/*
 * Reduced-size inverse DCT producing a 4x4 output block.
 *
 * a0 - compptr->dct_table
 * a1 - coef_block
 * a2 - output_buf
 * a3 - output_col
 * 16(sp) - workspace[DCTSIZE*4]; // buffers data between passes
 */

        .set at
        SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

        lw         v1, 48(sp)              // workspace (16(sp) before SAVE_REGS)
        move       t0, a1
        move       t1, v1
        li         t9, 4
        li         s0, 0x2e75f93e          // packed dpa.w.ph coefficient pairs
        li         s1, 0x21f9ba79
        li         s2, 0xecc2efb0
        li         s3, 0x52031ccd

0:      /* Pass 1, first 4 columns. */
        lh         s6, 32(t0)              // inptr[DCTSIZE*2]
        lh         t6, 32(a0)              // quantptr[DCTSIZE*2]
        lh         s7, 96(t0)              // inptr[DCTSIZE*6]
        lh         t7, 96(a0)              // quantptr[DCTSIZE*6]
        mul        t6, s6, t6              // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
        lh         s4, 0(t0)               // inptr[DCTSIZE*0]
        mul        t7, s7, t7              // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
        lh         s5, 0(a0)               // quantptr[0]
        li         s6, 15137
        li         s7, 6270
        mul        t2, s4, s5              // tmp0 = (inptr[0] * quantptr[0])
        mul        t6, s6, t6              // z2 * 15137
        lh         t5, 112(t0)             // inptr[DCTSIZE*7]
        mul        t7, s7, t7              // z3 * 6270
        lh         s4, 112(a0)             // quantptr[DCTSIZE*7]
        lh         v0, 80(t0)              // inptr[DCTSIZE*5]
        lh         s5, 80(a0)              // quantptr[DCTSIZE*5]
        lh         s6, 48(a0)              // quantptr[DCTSIZE*3]
        sll        t2, t2, 14              // tmp0 <<= (CONST_BITS+1)
        lh         s7, 16(a0)              // quantptr[DCTSIZE*1]
        lh         t8, 16(t0)              // inptr[DCTSIZE*1]
        subu       t6, t6, t7              // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
        lh         t7, 48(t0)              // inptr[DCTSIZE*3]
        mul        t5, s4, t5              // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
        mul        v0, s5, v0              // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
        mul        t7, s6, t7              // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
        mul        t8, s7, t8              // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
        addu       t3, t2, t6              // tmp10 = tmp0 + z2
        subu       t4, t2, t6              // tmp10 = tmp0 - z2
        mult       $ac0, zero, zero
        mult       $ac1, zero, zero
        ins        t5, v0, 16, 16
        ins        t7, t8, 16, 16
        addiu      t9, t9, -1
        dpa.w.ph   $ac0, t5, s0
        dpa.w.ph   $ac0, t7, s1
        dpa.w.ph   $ac1, t5, s2
        dpa.w.ph   $ac1, t7, s3
        mflo       s4, $ac0
        mflo       s5, $ac1
        addiu      a0, a0, 2
        addiu      t1, t1, 4
        addiu      t0, t0, 2
        addu       t6, t4, s4
        subu       t5, t4, s4
        addu       s6, t3, s5
        subu       s7, t3, s5
        shra_r.w   t6, t6, 12              // DESCALE(tmp12 + temp1, 12)
        shra_r.w   t5, t5, 12              // DESCALE(tmp12 - temp1, 12)
        shra_r.w   s6, s6, 12              // DESCALE(tmp10 + temp2, 12)
        shra_r.w   s7, s7, 12              // DESCALE(tmp10 - temp2, 12)
        sw         t6, 28(t1)
        sw         t5, 60(t1)
        sw         s6, -4(t1)
        bgtz       t9, 0b
        sw         s7, 92(t1)              // branch delay slot
        // second loop: three passes
        li         t9, 3
1:
        lh         s6, 34(t0)              // inptr[DCTSIZE*2]
        lh         t6, 34(a0)              // quantptr[DCTSIZE*2]
        lh         s7, 98(t0)              // inptr[DCTSIZE*6]
        lh         t7, 98(a0)              // quantptr[DCTSIZE*6]
        mul        t6, s6, t6              // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
        lh         s4, 2(t0)               // inptr[DCTSIZE*0]
        mul        t7, s7, t7              // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
        lh         s5, 2(a0)               // quantptr[DCTSIZE*0]
        li         s6, 15137
        li         s7, 6270
        mul        t2, s4, s5              // tmp0 = (inptr[0] * quantptr[0])
        mul        v0, s6, t6              // z2 * 15137
        lh         t5, 114(t0)             // inptr[DCTSIZE*7]
        mul        t7, s7, t7              // z3 * 6270
        lh         s4, 114(a0)             // quantptr[DCTSIZE*7]
        lh         s5, 82(a0)              // quantptr[DCTSIZE*5]
        lh         t6, 82(t0)              // inptr[DCTSIZE*5]
        sll        t2, t2, 14              // tmp0 <<= (CONST_BITS+1)
        lh         s6, 50(a0)              // quantptr[DCTSIZE*3]
        lh         t8, 18(t0)              // inptr[DCTSIZE*1]
        subu       v0, v0, t7              // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
/* jsimd_idct_4x4 pass-1 second loop body, then start of pass 2 (row 1). */
        lh         t7, 50(t0)              // inptr[DCTSIZE*3]
        lh         s7, 18(a0)              // quantptr[DCTSIZE*1]
        mul        t5, s4, t5              // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
        mul        t6, s5, t6              // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
        mul        t7, s6, t7              // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
        mul        t8, s7, t8              // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
        addu       t3, t2, v0              // tmp10 = tmp0 + z2
        subu       t4, t2, v0              // tmp10 = tmp0 - z2
        mult       $ac0, zero, zero
        mult       $ac1, zero, zero
        ins        t5, t6, 16, 16
        ins        t7, t8, 16, 16
        dpa.w.ph   $ac0, t5, s0
        dpa.w.ph   $ac0, t7, s1
        dpa.w.ph   $ac1, t5, s2
        dpa.w.ph   $ac1, t7, s3
        mflo       t5, $ac0
        mflo       t6, $ac1
        addiu      t9, t9, -1
        addiu      t0, t0, 2
        addiu      a0, a0, 2
        addiu      t1, t1, 4
        addu       s5, t4, t5
        subu       s4, t4, t5
        addu       s6, t3, t6
        subu       s7, t3, t6
        shra_r.w   s5, s5, 12              // DESCALE(tmp12 + temp1, 12)
        shra_r.w   s4, s4, 12              // DESCALE(tmp12 - temp1, 12)
        shra_r.w   s6, s6, 12              // DESCALE(tmp10 + temp2, 12)
        shra_r.w   s7, s7, 12              // DESCALE(tmp10 - temp2, 12)
        sw         s5, 32(t1)
        sw         s4, 64(t1)
        sw         s6, 0(t1)
        bgtz       t9, 1b
        sw         s7, 96(t1)              // branch delay slot

        /* Pass 2, row 1 of 4. */
        move       t1, v1
        li         s4, 15137
        lw         s6, 8(t1)               // wsptr[2]
        li         s5, 6270
        lw         s7, 24(t1)              // wsptr[6]
        mul        s4, s4, s6              // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
        lw         t2, 0(t1)               // wsptr[0]
        mul        s5, s5, s7              // MULTIPLY((INT32) wsptr[6], -FIX_0_765366865)
        lh         t5, 28(t1)              // wsptr[7]
        lh         t6, 20(t1)              // wsptr[5]
        lh         t7, 12(t1)              // wsptr[3]
        lh         t8, 4(t1)               // wsptr[1]
        ins        t5, t6, 16, 16
        ins        t7, t8, 16, 16
        mult       $ac0, zero, zero
        dpa.w.ph   $ac0, t5, s0
        dpa.w.ph   $ac0, t7, s1
        mult       $ac1, zero, zero
        dpa.w.ph   $ac1, t5, s2
        dpa.w.ph   $ac1, t7, s3
        sll        t2, t2, 14              // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
        mflo       s6, $ac0
        // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
        subu       s4, s4, s5
/* jsimd_idct_4x4 pass 2: finish row 1, then row 2. */
        addu       t3, t2, s4              // tmp10 = tmp0 + z2
        mflo       s7, $ac1
        subu       t4, t2, s4              // tmp10 = tmp0 - z2
        addu       t7, t4, s6
        subu       t8, t4, s6
        addu       t5, t3, s7
        subu       t6, t3, s7
        shra_r.w   t5, t5, 19              // DESCALE(tmp10 + temp2, 19)
        shra_r.w   t6, t6, 19              // DESCALE(tmp10 - temp2, 19)
        shra_r.w   t7, t7, 19              // DESCALE(tmp12 + temp1, 19)
        shra_r.w   t8, t8, 19              // DESCALE(tmp12 - temp1, 19)
        sll        s4, t9, 2
        lw         v0, 0(a2)               // output_buf[ctr]
        shll_s.w   t5, t5, 24              // saturate to 8 bits
        shll_s.w   t6, t6, 24
        shll_s.w   t7, t7, 24
        shll_s.w   t8, t8, 24
        sra        t5, t5, 24
        sra        t6, t6, 24
        sra        t7, t7, 24
        sra        t8, t8, 24
        addu       v0, v0, a3              // outptr = output_buf[ctr] + output_col
        addiu      t5, t5, 128
        addiu      t6, t6, 128
        addiu      t7, t7, 128
        addiu      t8, t8, 128
        sb         t5, 0(v0)
        sb         t7, 1(v0)
        sb         t8, 2(v0)
        sb         t6, 3(v0)
        // row 2
        li         s4, 15137
        lw         s6, 40(t1)              // wsptr[2]
        li         s5, 6270
        lw         s7, 56(t1)              // wsptr[6]
        mul        s4, s4, s6              // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
        lw         t2, 32(t1)              // wsptr[0]
        mul        s5, s5, s7              // MULTIPLY((INT32) wsptr[6], -FIX_0_765366865)
        lh         t5, 60(t1)              // wsptr[7]
        lh         t6, 52(t1)              // wsptr[5]
        lh         t7, 44(t1)              // wsptr[3]
        lh         t8, 36(t1)              // wsptr[1]
        ins        t5, t6, 16, 16
        ins        t7, t8, 16, 16
        mult       $ac0, zero, zero
        dpa.w.ph   $ac0, t5, s0
        dpa.w.ph   $ac0, t7, s1
        mult       $ac1, zero, zero
        dpa.w.ph   $ac1, t5, s2
        dpa.w.ph   $ac1, t7, s3
        sll        t2, t2, 14              // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
        mflo       s6, $ac0
        // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
        subu       s4, s4, s5
        addu       t3, t2, s4              // tmp10 = tmp0 + z2
        mflo       s7, $ac1
        subu       t4, t2, s4              // tmp10 = tmp0 - z2
        addu       t7, t4, s6
        subu       t8, t4, s6
        addu       t5, t3, s7
        subu       t6, t3, s7
        shra_r.w   t5, t5, 19              // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
        shra_r.w   t6, t6, 19              // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
/* jsimd_idct_4x4 pass 2: finish row 2, then row 3. */
        shra_r.w   t7, t7, 19              // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
        shra_r.w   t8, t8, 19              // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
        sll        s4, t9, 2
        lw         v0, 4(a2)               // output_buf[ctr]
        shll_s.w   t5, t5, 24              // saturate to 8 bits
        shll_s.w   t6, t6, 24
        shll_s.w   t7, t7, 24
        shll_s.w   t8, t8, 24
        sra        t5, t5, 24
        sra        t6, t6, 24
        sra        t7, t7, 24
        sra        t8, t8, 24
        addu       v0, v0, a3              // outptr = output_buf[ctr] + output_col
        addiu      t5, t5, 128
        addiu      t6, t6, 128
        addiu      t7, t7, 128
        addiu      t8, t8, 128
        sb         t5, 0(v0)
        sb         t7, 1(v0)
        sb         t8, 2(v0)
        sb         t6, 3(v0)
        // row 3
        li         s4, 15137
        lw         s6, 72(t1)              // wsptr[2]
        li         s5, 6270
        lw         s7, 88(t1)              // wsptr[6]
        mul        s4, s4, s6              // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
        lw         t2, 64(t1)              // wsptr[0]
        mul        s5, s5, s7              // MULTIPLY((INT32) wsptr[6], -FIX_0_765366865)
        lh         t5, 92(t1)              // wsptr[7]
        lh         t6, 84(t1)              // wsptr[5]
        lh         t7, 76(t1)              // wsptr[3]
        lh         t8, 68(t1)              // wsptr[1]
        ins        t5, t6, 16, 16
        ins        t7, t8, 16, 16
        mult       $ac0, zero, zero
        dpa.w.ph   $ac0, t5, s0
        dpa.w.ph   $ac0, t7, s1
        mult       $ac1, zero, zero
        dpa.w.ph   $ac1, t5, s2
        dpa.w.ph   $ac1, t7, s3
        sll        t2, t2, 14              // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
        mflo       s6, $ac0
        // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
        subu       s4, s4, s5
        addu       t3, t2, s4              // tmp10 = tmp0 + z2
        mflo       s7, $ac1
        subu       t4, t2, s4              // tmp10 = tmp0 - z2
        addu       t7, t4, s6
        subu       t8, t4, s6
        addu       t5, t3, s7
        subu       t6, t3, s7
        shra_r.w   t5, t5, 19              // DESCALE(tmp10 + temp2, 19)
        shra_r.w   t6, t6, 19              // DESCALE(tmp10 - temp2, 19)
        shra_r.w   t7, t7, 19              // DESCALE(tmp12 + temp1, 19)
        shra_r.w   t8, t8, 19              // DESCALE(tmp12 - temp1, 19)
        sll        s4, t9, 2
        lw         v0, 8(a2)               // output_buf[ctr]
        shll_s.w   t5, t5, 24
        shll_s.w   t6, t6, 24
        shll_s.w   t7, t7, 24
        shll_s.w   t8, t8, 24
        sra        t5, t5, 24
/* jsimd_idct_4x4 pass 2: finish row 3, then row 4; then jsimd_idct_6x6. */
        sra        t6, t6, 24
        sra        t7, t7, 24
        sra        t8, t8, 24
        addu       v0, v0, a3              // outptr = output_buf[ctr] + output_col
        addiu      t5, t5, 128
        addiu      t6, t6, 128
        addiu      t7, t7, 128
        addiu      t8, t8, 128
        sb         t5, 0(v0)
        sb         t7, 1(v0)
        sb         t8, 2(v0)
        sb         t6, 3(v0)
        // row 4
        li         s4, 15137
        lw         s6, 104(t1)             // wsptr[2]
        li         s5, 6270
        lw         s7, 120(t1)             // wsptr[6]
        mul        s4, s4, s6              // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
        lw         t2, 96(t1)              // wsptr[0]
        mul        s5, s5, s7              // MULTIPLY((INT32) wsptr[6], -FIX_0_765366865)
        lh         t5, 124(t1)             // wsptr[7]
        lh         t6, 116(t1)             // wsptr[5]
        lh         t7, 108(t1)             // wsptr[3]
        lh         t8, 100(t1)             // wsptr[1]
        ins        t5, t6, 16, 16
        ins        t7, t8, 16, 16
        mult       $ac0, zero, zero
        dpa.w.ph   $ac0, t5, s0
        dpa.w.ph   $ac0, t7, s1
        mult       $ac1, zero, zero
        dpa.w.ph   $ac1, t5, s2
        dpa.w.ph   $ac1, t7, s3
        sll        t2, t2, 14              // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
        mflo       s6, $ac0
        // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
        subu       s4, s4, s5
        addu       t3, t2, s4              // tmp10 = tmp0 + z2;
        mflo       s7, $ac1
        subu       t4, t2, s4              // tmp10 = tmp0 - z2;
        addu       t7, t4, s6
        subu       t8, t4, s6
        addu       t5, t3, s7
        subu       t6, t3, s7
        shra_r.w   t5, t5, 19              // DESCALE(tmp10 + temp2, 19)
        shra_r.w   t6, t6, 19              // DESCALE(tmp10 - temp2, 19)
        shra_r.w   t7, t7, 19              // DESCALE(tmp12 + temp1, 19)
        shra_r.w   t8, t8, 19              // DESCALE(tmp12 - temp1, 19)
        sll        s4, t9, 2
        lw         v0, 12(a2)              // output_buf[ctr]
        shll_s.w   t5, t5, 24
        shll_s.w   t6, t6, 24
        shll_s.w   t7, t7, 24
        shll_s.w   t8, t8, 24
        sra        t5, t5, 24
        sra        t6, t6, 24
        sra        t7, t7, 24
        sra        t8, t8, 24
        addu       v0, v0, a3              // outptr = output_buf[ctr] + output_col
        addiu      t5, t5, 128
        addiu      t6, t6, 128
        addiu      t7, t7, 128
        addiu      t8, t8, 128
        sb         t5, 0(v0)
        sb         t7, 1(v0)
        sb         t8, 2(v0)
        sb         t6, 3(v0)

        RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

        j          ra
        nop
END(jsimd_idct_4x4_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
/*
 * Reduced-size inverse DCT producing a 6x6 output block.
 *
 * a0 - compptr->dct_table
 * a1 - coef_block
 * a2 - output_buf
 * a3 - output_col
 */
        .set at

        SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

        addiu      sp, sp, -144            // 144-byte local workspace (6x6 words)
        move       v0, sp
        addiu      v1, v0, 24
        addiu      t9, zero, 5793
        addiu      s0, zero, 10033
        addiu      s1, zero, 2998

1:      /* Pass 1: columns -> workspace. */
        lh         s2, 0(a0)               // q0 = quantptr[ 0]
        lh         s3, 32(a0)              // q1 = quantptr[16]
        lh         s4, 64(a0)              // q2 = quantptr[32]
        lh         t2, 64(a1)              // tmp2 = inptr[32]
        lh         t1, 32(a1)              // tmp1 = inptr[16]
        lh         t0, 0(a1)               // tmp0 = inptr[ 0]
        mul        t2, t2, s4              // tmp2 = tmp2 * q2
        mul        t1, t1, s3              // tmp1 = tmp1 * q1
        mul        t0, t0, s2              // tmp0 = tmp0 * q0
        lh         t6, 16(a1)              // z1 = inptr[ 8]
        lh         t8, 80(a1)              // z3 = inptr[40]
        lh         t7, 48(a1)              // z2 = inptr[24]
        lh         s2, 16(a0)              // q0 = quantptr[ 8]
        lh         s4, 80(a0)              // q2 = quantptr[40]
        lh         s3, 48(a0)              // q1 = quantptr[24]
        mul        t2, t2, t9              // tmp2 = tmp2 * 5793
        mul        t1, t1, s0              // tmp1 = tmp1 * 10033
        sll        t0, t0, 13              // tmp0 = tmp0 << 13
        mul        t6, t6, s2              // z1 = z1 * q0
        mul        t8, t8, s4              // z3 = z3 * q2
        mul        t7, t7, s3              // z2 = z2 * q1
        addu       t3, t0, t2              // tmp10 = tmp0 + tmp2
        sll        t2, t2, 1               // tmp2 = tmp2 << 1 (so tmp11 = tmp0 - 2*tmp2)
        subu       t4, t0, t2              // tmp11 = tmp0 - tmp2
        subu       t5, t3, t1              // tmp12 = tmp10 - tmp1
        addu       t3, t3, t1              // tmp10 = tmp10 + tmp1
        addu       t1, t6, t8              // tmp1 = z1 + z3
        mul        t1, t1, s1              // tmp1 = tmp1 * 2998
        shra_r.w   t4, t4, 11              // tmp11 = (tmp11 + 1024) >> 11
        subu       t2, t6, t8              // tmp2 = z1 - z3
        subu       t2, t2, t7              // tmp2 = tmp2 - z2
        sll        t2, t2, 2               // tmp2 = tmp2 << 2
        addu       t0, t6, t7              // tmp0 = z1 + z2
        sll        t0, t0, 13              // tmp0 = tmp0 << 13
        subu       s2, t8, t7              // q0 = z3 - z2
        sll        s2, s2, 13              // q0 = q0 << 13
/* jsimd_idct_6x6: finish pass 1 butterfly, pass 2, then start of 12x12 pass 1. */
        addu       t0, t0, t1              // tmp0 = tmp0 + tmp1
        addu       t1, s2, t1              // tmp1 = q0 + tmp1
        addu       s2, t4, t2              // q0 = tmp11 + tmp2
        subu       s3, t4, t2              // q1 = tmp11 - tmp2
        addu       t6, t3, t0              // z1 = tmp10 + tmp0
        subu       t7, t3, t0              // z2 = tmp10 - tmp0
        addu       t4, t5, t1              // tmp11 = tmp12 + tmp1
        subu       t5, t5, t1              // tmp12 = tmp12 - tmp1
        shra_r.w   t6, t6, 11              // z1 = (z1 + 1024) >> 11
        shra_r.w   t7, t7, 11              // z2 = (z2 + 1024) >> 11
        shra_r.w   t4, t4, 11              // tmp11 = (tmp11 + 1024) >> 11
        shra_r.w   t5, t5, 11              // tmp12 = (tmp12 + 1024) >> 11
        sw         s2, 24(v0)
        sw         s3, 96(v0)
        sw         t6, 0(v0)
        sw         t7, 120(v0)
        sw         t4, 48(v0)
        sw         t5, 72(v0)
        addiu      v0, v0, 4
        addiu      a1, a1, 2
        bne        v0, v1, 1b
        addiu      a0, a0, 2               // branch delay slot

        /* Pass 2: process 6 rows from work array, store into output array. */
        move       v0, sp
        addiu      v1, v0, 144

2:
        lw         t0, 0(v0)
        lw         t2, 16(v0)
        lw         s5, 0(a2)
        addiu      t0, t0, 16              // rounding bias before << 13
        sll        t0, t0, 13
        mul        t3, t2, t9
        lw         t6, 4(v0)
        lw         t8, 20(v0)
        lw         t7, 12(v0)
        addu       s5, s5, a3              // outptr = output_buf[row] + output_col
        addu       s6, t6, t8
        mul        s6, s6, s1
        addu       t1, t0, t3
        subu       t4, t0, t3
        subu       t4, t4, t3
        lw         t3, 8(v0)
        mul        t0, t3, s0
        addu       s7, t6, t7
        sll        s7, s7, 13
        addu       s7, s6, s7
        subu       t2, t8, t7
        sll        t2, t2, 13
        addu       t2, s6, t2
        subu       s6, t6, t7
        subu       s6, s6, t8
        sll        s6, s6, 13
        addu       t3, t1, t0
        subu       t5, t1, t0
        addu       t6, t3, s7
        subu       t3, t3, s7
        addu       t7, t4, s6
        subu       t4, t4, s6
        addu       t8, t5, t2
        subu       t5, t5, t2
        shll_s.w   t6, t6, 6               // saturating descale to 8-bit range
        shll_s.w   t3, t3, 6
        shll_s.w   t7, t7, 6
        shll_s.w   t4, t4, 6
        shll_s.w   t8, t8, 6
        shll_s.w   t5, t5, 6
        sra        t6, t6, 24
        addiu      t6, t6, 128             // center on 128
        sra        t3, t3, 24
        addiu      t3, t3, 128
        sb         t6, 0(s5)
        sra        t7, t7, 24
        addiu      t7, t7, 128
        sb         t3, 5(s5)
        sra        t4, t4, 24
        addiu      t4, t4, 128
        sb         t7, 1(s5)
        sra        t8, t8, 24
        addiu      t8, t8, 128
        sb         t4, 4(s5)
        addiu      v0, v0, 24              // next workspace row
        sra        t5, t5, 24
        addiu      t5, t5, 128
        sb         t8, 2(s5)
        addiu      a2, a2, 4
        bne        v0, v1, 2b
        sb         t5, 3(s5)               // branch delay slot

        addiu      sp, sp, 144             // release local workspace

        RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

        j          ra
        nop

END(jsimd_idct_6x6_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
/*
 * First pass (columns) of the 12x12 scaled inverse DCT.
 *
 * a0 - compptr->dct_table
 * a1 - coef_block
 * a2 - workspace
 */

        SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

        li         a3, 8                   // column counter

1:
        // odd part
        lh         t0, 48(a1)
        lh         t1, 48(a0)
        lh         t2, 16(a1)
        lh         t3, 16(a0)
        lh         t4, 80(a1)
        lh         t5, 80(a0)
        lh         t6, 112(a1)
        lh         t7, 112(a0)
        mul        t0, t0, t1              // z2
        mul        t1, t2, t3              // z1
        mul        t2, t4, t5              // z3
        mul        t3, t6, t7              // z4
        li         t4, 10703               // FIX(1.306562965)
        li         t5, 4433                // FIX_0_541196100
        li         t6, 7053                // FIX(0.860918669)
        mul        t4, t0, t4              // tmp11
        mul        t5, t0, t5              // -tmp14
        addu       t7, t1, t2              // tmp10
        addu       t8, t7, t3              // tmp10 + z4
        mul        t6, t6, t8              // tmp15
        li         t8, 2139                // FIX(0.261052384)
        mul        t8, t7, t8              // MULTIPLY(tmp10, FIX(0.261052384))
        li         t7, 2295                // FIX(0.280143716)
        mul        t7, t1, t7              // MULTIPLY(z1, FIX(0.280143716))
        addu       t9, t2, t3              // z3 + z4
        li         s0, 8565                // FIX(1.045510580)
        mul        t9, t9, s0              // -tmp13
        li         s0, 12112               // FIX(1.478575242)
        mul        s0, t2, s0              // MULTIPLY(z3, FIX(1.478575242))
        li         s1, 12998               // FIX(1.586706681)
        mul        s1, t3, s1              // MULTIPLY(z4, FIX(1.586706681))
        li         s2, 5540                // FIX(0.676326758)
        mul        s2, t1, s2              // MULTIPLY(z1, FIX(0.676326758))
        li         s3, 16244               // FIX(1.982889723)
        mul        s3, t3, s3              // MULTIPLY(z4, FIX(1.982889723))
        subu       t1, t1, t3              // z1 -= z4
        subu       t0, t0, t2              // z2 -= z3
        addu       t2, t0, t1              // z1 + z2
        li         t3, 4433                // FIX_0_541196100
        mul        t2, t2, t3              // z3
        li         t3, 6270                // FIX_0_765366865
FIX_0_765366865 3699 mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) 3700 li t3, 15137 // FIX_0_765366865 3701 mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) 3702 addu t8, t6, t8 // tmp12 3703 addu t3, t8, t4 // tmp12 + tmp11 3704 addu t3, t3, t7 // tmp10 3705 subu t8, t8, t9 // tmp12 + tmp13 3706 addu s0, t5, s0 3707 subu t8, t8, s0 // tmp12 3708 subu t9, t6, t9 3709 subu s1, s1, t4 3710 addu t9, t9, s1 // tmp13 3711 subu t6, t6, t5 3712 subu t6, t6, s2 3713 subu t6, t6, s3 // tmp15 3714 // even part start 3715 lh t4, 64(a1) 3716 lh t5, 64(a0) 3717 lh t7, 32(a1) 3718 lh s0, 32(a0) 3719 lh s1, 0(a1) 3720 lh s2, 0(a0) 3721 lh s3, 96(a1) 3722 lh v0, 96(a0) 3723 mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4]) 3724 mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2]) 3725 mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0]) 3726 mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6]) 3727 // odd part end 3728 addu t1, t2, t1 // tmp11 3729 subu t0, t2, t0 // tmp14 3730 // update counter and pointers 3731 addiu a3, a3, -1 3732 addiu a0, a0, 2 3733 addiu a1, a1, 2 3734 // even part rest 3735 li s1, 10033 3736 li s2, 11190 3737 mul t4, t4, s1 // z4 3738 mul s1, t5, s2 // z4 3739 sll t5, t5, 13 // z1 3740 sll t7, t7, 13 3741 addiu t7, t7, 1024 // z3 3742 sll s0, s0, 13 // z2 3743 addu s2, t7, t4 // tmp10 3744 subu t4, t7, t4 // tmp11 3745 subu s3, t5, s0 // tmp12 3746 addu t2, t7, s3 // tmp21 3747 subu s3, t7, s3 // tmp24 3748 addu t7, s1, s0 // tmp12 3749 addu v0, s2, t7 // tmp20 3750 subu s2, s2, t7 // tmp25 3751 subu s1, s1, t5 // z4 - z1 3752 subu s1, s1, s0 // tmp12 3753 addu s0, t4, s1 // tmp22 3754 subu t4, t4, s1 // tmp23 3755 // final output stage 3756 addu t5, v0, t3 3757 subu v0, v0, t3 3758 addu t3, t2, t1 3759 subu t2, t2, t1 3760 addu t1, s0, t8 3761 subu s0, s0, t8 3762 addu t8, t4, t9 3763 subu t4, t4, t9 3764 addu t9, s3, t0 3765 subu s3, s3, t0 3766 addu t0, s2, t6 3767 subu s2, s2, t6 3768 sra t5, 
t5, 11 3769 sra t3, t3, 11 3770 sra t1, t1, 11 3771 sra t8, t8, 11 3772 sra t9, t9, 11 3773 sra t0, t0, 11 3774 sra s2, s2, 11 3775 sra s3, s3, 11 3776 sra t4, t4, 11 3777 sra s0, s0, 11 3778 sra t2, t2, 11 3779 sra v0, v0, 11 3780 sw t5, 0(a2) 3781 sw t3, 32(a2) 3782 sw t1, 64(a2) 3783 sw t8, 96(a2) 3784 sw t9, 128(a2) 3785 sw t0, 160(a2) 3786 sw s2, 192(a2) 3787 sw s3, 224(a2) 3788 sw t4, 256(a2) 3789 sw s0, 288(a2) 3790 sw t2, 320(a2) 3791 sw v0, 352(a2) 3792 bgtz a3, 1b 3793 addiu a2, a2, 4 3794 3795 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 3796 3797 j ra 3798 nop 3799 3800END(jsimd_idct_12x12_pass1_mips_dspr2) 3801 3802/*****************************************************************************/ 3803LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2) 3804/* 3805 * a0 - workspace 3806 * a1 - output 3807 */ 3808 3809 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 3810 3811 li a3, 12 3812 38131: 3814 // Odd part 3815 lw t0, 12(a0) 3816 lw t1, 4(a0) 3817 lw t2, 20(a0) 3818 lw t3, 28(a0) 3819 li t4, 10703 // FIX(1.306562965) 3820 li t5, 4433 // FIX_0_541196100 3821 mul t4, t0, t4 // tmp11 3822 mul t5, t0, t5 // -tmp14 3823 addu t6, t1, t2 // tmp10 3824 li t7, 2139 // FIX(0.261052384) 3825 mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384)) 3826 addu t6, t6, t3 // tmp10 + z4 3827 li t8, 7053 // FIX(0.860918669) 3828 mul t6, t6, t8 // tmp15 3829 li t8, 2295 // FIX(0.280143716) 3830 mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716)) 3831 addu t9, t2, t3 // z3 + z4 3832 li s0, 8565 // FIX(1.045510580) 3833 mul t9, t9, s0 // -tmp13 3834 li s0, 12112 // FIX(1.478575242) 3835 mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242)) 3836 li s1, 12998 // FIX(1.586706681) 3837 mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) 3838 li s2, 5540 // FIX(0.676326758) 3839 mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) 3840 li s3, 16244 // FIX(1.982889723) 3841 mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) 3842 subu t1, t1, t3 // z1 -= z4 3843 subu t0, t0, t2 // z2 -= z3 3844 addu t2, t1, t0 
// z1 + z2 3845 li t3, 4433 // FIX_0_541196100 3846 mul t2, t2, t3 // z3 3847 li t3, 6270 // FIX_0_765366865 3848 mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) 3849 li t3, 15137 // FIX_1_847759065 3850 mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) 3851 addu t3, t6, t7 // tmp12 3852 addu t7, t3, t4 3853 addu t7, t7, t8 // tmp10 3854 subu t3, t3, t9 3855 subu t3, t3, t5 3856 subu t3, t3, s0 // tmp12 3857 subu t9, t6, t9 3858 subu t9, t9, t4 3859 addu t9, t9, s1 // tmp13 3860 subu t6, t6, t5 3861 subu t6, t6, s2 3862 subu t6, t6, s3 // tmp15 3863 addu t1, t2, t1 // tmp11 3864 subu t0, t2, t0 // tmp14 3865 // even part 3866 lw t2, 16(a0) // z4 3867 lw t4, 8(a0) // z1 3868 lw t5, 0(a0) // z3 3869 lw t8, 24(a0) // z2 3870 li s0, 10033 // FIX(1.224744871) 3871 li s1, 11190 // FIX(1.366025404) 3872 mul t2, t2, s0 // z4 3873 mul s0, t4, s1 // z4 3874 addiu t5, t5, 0x10 3875 sll t5, t5, 13 // z3 3876 sll t4, t4, 13 // z1 3877 sll t8, t8, 13 // z2 3878 subu s1, t4, t8 // tmp12 3879 addu s2, t5, t2 // tmp10 3880 subu t2, t5, t2 // tmp11 3881 addu s3, t5, s1 // tmp21 3882 subu s1, t5, s1 // tmp24 3883 addu t5, s0, t8 // tmp12 3884 addu v0, s2, t5 // tmp20 3885 subu t5, s2, t5 // tmp25 3886 subu t4, s0, t4 3887 subu t4, t4, t8 // tmp12 3888 addu t8, t2, t4 // tmp22 3889 subu t2, t2, t4 // tmp23 3890 // increment counter and pointers 3891 addiu a3, a3, -1 3892 addiu a0, a0, 32 3893 // Final stage 3894 addu t4, v0, t7 3895 subu v0, v0, t7 3896 addu t7, s3, t1 3897 subu s3, s3, t1 3898 addu t1, t8, t3 3899 subu t8, t8, t3 3900 addu t3, t2, t9 3901 subu t2, t2, t9 3902 addu t9, s1, t0 3903 subu s1, s1, t0 3904 addu t0, t5, t6 3905 subu t5, t5, t6 3906 sll t4, t4, 4 3907 sll t7, t7, 4 3908 sll t1, t1, 4 3909 sll t3, t3, 4 3910 sll t9, t9, 4 3911 sll t0, t0, 4 3912 sll t5, t5, 4 3913 sll s1, s1, 4 3914 sll t2, t2, 4 3915 sll t8, t8, 4 3916 sll s3, s3, 4 3917 sll v0, v0, 4 3918 shll_s.w t4, t4, 2 3919 shll_s.w t7, t7, 2 3920 shll_s.w t1, t1, 2 3921 shll_s.w t3, t3, 2 3922 shll_s.w 
t9, t9, 2 3923 shll_s.w t0, t0, 2 3924 shll_s.w t5, t5, 2 3925 shll_s.w s1, s1, 2 3926 shll_s.w t2, t2, 2 3927 shll_s.w t8, t8, 2 3928 shll_s.w s3, s3, 2 3929 shll_s.w v0, v0, 2 3930 srl t4, t4, 24 3931 srl t7, t7, 24 3932 srl t1, t1, 24 3933 srl t3, t3, 24 3934 srl t9, t9, 24 3935 srl t0, t0, 24 3936 srl t5, t5, 24 3937 srl s1, s1, 24 3938 srl t2, t2, 24 3939 srl t8, t8, 24 3940 srl s3, s3, 24 3941 srl v0, v0, 24 3942 lw t6, 0(a1) 3943 addiu t4, t4, 0x80 3944 addiu t7, t7, 0x80 3945 addiu t1, t1, 0x80 3946 addiu t3, t3, 0x80 3947 addiu t9, t9, 0x80 3948 addiu t0, t0, 0x80 3949 addiu t5, t5, 0x80 3950 addiu s1, s1, 0x80 3951 addiu t2, t2, 0x80 3952 addiu t8, t8, 0x80 3953 addiu s3, s3, 0x80 3954 addiu v0, v0, 0x80 3955 sb t4, 0(t6) 3956 sb t7, 1(t6) 3957 sb t1, 2(t6) 3958 sb t3, 3(t6) 3959 sb t9, 4(t6) 3960 sb t0, 5(t6) 3961 sb t5, 6(t6) 3962 sb s1, 7(t6) 3963 sb t2, 8(t6) 3964 sb t8, 9(t6) 3965 sb s3, 10(t6) 3966 sb v0, 11(t6) 3967 bgtz a3, 1b 3968 addiu a1, a1, 4 3969 3970 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 3971 3972 jr ra 3973 nop 3974 3975END(jsimd_idct_12x12_pass2_mips_dspr2) 3976 3977/*****************************************************************************/ 3978LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2) 3979/* 3980 * a0 - sample_data 3981 * a1 - start_col 3982 * a2 - workspace 3983 */ 3984 3985 lw t0, 0(a0) 3986 li t7, 0xff80ff80 3987 addu t0, t0, a1 3988 ulw t1, 0(t0) 3989 ulw t2, 4(t0) 3990 preceu.ph.qbr t3, t1 3991 preceu.ph.qbl t4, t1 3992 lw t0, 4(a0) 3993 preceu.ph.qbr t5, t2 3994 preceu.ph.qbl t6, t2 3995 addu t0, t0, a1 3996 addu.ph t3, t3, t7 3997 addu.ph t4, t4, t7 3998 ulw t1, 0(t0) 3999 ulw t2, 4(t0) 4000 addu.ph t5, t5, t7 4001 addu.ph t6, t6, t7 4002 usw t3, 0(a2) 4003 usw t4, 4(a2) 4004 preceu.ph.qbr t3, t1 4005 preceu.ph.qbl t4, t1 4006 usw t5, 8(a2) 4007 usw t6, 12(a2) 4008 4009 lw t0, 8(a0) 4010 preceu.ph.qbr t5, t2 4011 preceu.ph.qbl t6, t2 4012 addu t0, t0, a1 4013 addu.ph t3, t3, t7 4014 addu.ph t4, t4, t7 4015 ulw t1, 0(t0) 
4016 ulw t2, 4(t0) 4017 addu.ph t5, t5, t7 4018 addu.ph t6, t6, t7 4019 usw t3, 16(a2) 4020 usw t4, 20(a2) 4021 preceu.ph.qbr t3, t1 4022 preceu.ph.qbl t4, t1 4023 usw t5, 24(a2) 4024 usw t6, 28(a2) 4025 4026 lw t0, 12(a0) 4027 preceu.ph.qbr t5, t2 4028 preceu.ph.qbl t6, t2 4029 addu t0, t0, a1 4030 addu.ph t3, t3, t7 4031 addu.ph t4, t4, t7 4032 ulw t1, 0(t0) 4033 ulw t2, 4(t0) 4034 addu.ph t5, t5, t7 4035 addu.ph t6, t6, t7 4036 usw t3, 32(a2) 4037 usw t4, 36(a2) 4038 preceu.ph.qbr t3, t1 4039 preceu.ph.qbl t4, t1 4040 usw t5, 40(a2) 4041 usw t6, 44(a2) 4042 4043 lw t0, 16(a0) 4044 preceu.ph.qbr t5, t2 4045 preceu.ph.qbl t6, t2 4046 addu t0, t0, a1 4047 addu.ph t3, t3, t7 4048 addu.ph t4, t4, t7 4049 ulw t1, 0(t0) 4050 ulw t2, 4(t0) 4051 addu.ph t5, t5, t7 4052 addu.ph t6, t6, t7 4053 usw t3, 48(a2) 4054 usw t4, 52(a2) 4055 preceu.ph.qbr t3, t1 4056 preceu.ph.qbl t4, t1 4057 usw t5, 56(a2) 4058 usw t6, 60(a2) 4059 4060 lw t0, 20(a0) 4061 preceu.ph.qbr t5, t2 4062 preceu.ph.qbl t6, t2 4063 addu t0, t0, a1 4064 addu.ph t3, t3, t7 4065 addu.ph t4, t4, t7 4066 ulw t1, 0(t0) 4067 ulw t2, 4(t0) 4068 addu.ph t5, t5, t7 4069 addu.ph t6, t6, t7 4070 usw t3, 64(a2) 4071 usw t4, 68(a2) 4072 preceu.ph.qbr t3, t1 4073 preceu.ph.qbl t4, t1 4074 usw t5, 72(a2) 4075 usw t6, 76(a2) 4076 4077 lw t0, 24(a0) 4078 preceu.ph.qbr t5, t2 4079 preceu.ph.qbl t6, t2 4080 addu t0, t0, a1 4081 addu.ph t3, t3, t7 4082 addu.ph t4, t4, t7 4083 ulw t1, 0(t0) 4084 ulw t2, 4(t0) 4085 addu.ph t5, t5, t7 4086 addu.ph t6, t6, t7 4087 usw t3, 80(a2) 4088 usw t4, 84(a2) 4089 preceu.ph.qbr t3, t1 4090 preceu.ph.qbl t4, t1 4091 usw t5, 88(a2) 4092 usw t6, 92(a2) 4093 4094 lw t0, 28(a0) 4095 preceu.ph.qbr t5, t2 4096 preceu.ph.qbl t6, t2 4097 addu t0, t0, a1 4098 addu.ph t3, t3, t7 4099 addu.ph t4, t4, t7 4100 ulw t1, 0(t0) 4101 ulw t2, 4(t0) 4102 addu.ph t5, t5, t7 4103 addu.ph t6, t6, t7 4104 usw t3, 96(a2) 4105 usw t4, 100(a2) 4106 preceu.ph.qbr t3, t1 4107 preceu.ph.qbl t4, t1 4108 usw t5, 104(a2) 
4109 usw t6, 108(a2) 4110 preceu.ph.qbr t5, t2 4111 preceu.ph.qbl t6, t2 4112 addu.ph t3, t3, t7 4113 addu.ph t4, t4, t7 4114 addu.ph t5, t5, t7 4115 addu.ph t6, t6, t7 4116 usw t3, 112(a2) 4117 usw t4, 116(a2) 4118 usw t5, 120(a2) 4119 usw t6, 124(a2) 4120 4121 j ra 4122 nop 4123 4124END(jsimd_convsamp_mips_dspr2) 4125 4126/*****************************************************************************/ 4127LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2) 4128/* 4129 * a0 - sample_data 4130 * a1 - start_col 4131 * a2 - workspace 4132 */ 4133 4134 .set at 4135 4136 lw t0, 0(a0) 4137 addu t0, t0, a1 4138 lbu t1, 0(t0) 4139 lbu t2, 1(t0) 4140 lbu t3, 2(t0) 4141 lbu t4, 3(t0) 4142 lbu t5, 4(t0) 4143 lbu t6, 5(t0) 4144 lbu t7, 6(t0) 4145 lbu t8, 7(t0) 4146 addiu t1, t1, -128 4147 addiu t2, t2, -128 4148 addiu t3, t3, -128 4149 addiu t4, t4, -128 4150 addiu t5, t5, -128 4151 addiu t6, t6, -128 4152 addiu t7, t7, -128 4153 addiu t8, t8, -128 4154 mtc1 t1, f1 4155 mtc1 t2, f2 4156 mtc1 t3, f3 4157 mtc1 t4, f4 4158 mtc1 t5, f5 4159 mtc1 t6, f6 4160 mtc1 t7, f7 4161 mtc1 t8, f8 4162 cvt.s.w f1, f1 4163 cvt.s.w f2, f2 4164 cvt.s.w f3, f3 4165 cvt.s.w f4, f4 4166 cvt.s.w f5, f5 4167 cvt.s.w f6, f6 4168 cvt.s.w f7, f7 4169 cvt.s.w f8, f8 4170 lw t0, 4(a0) 4171 swc1 f1, 0(a2) 4172 swc1 f2, 4(a2) 4173 swc1 f3, 8(a2) 4174 addu t0, t0, a1 4175 swc1 f4, 12(a2) 4176 swc1 f5, 16(a2) 4177 swc1 f6, 20(a2) 4178 swc1 f7, 24(a2) 4179 swc1 f8, 28(a2) 4180 //elemr 1 4181 lbu t1, 0(t0) 4182 lbu t2, 1(t0) 4183 lbu t3, 2(t0) 4184 lbu t4, 3(t0) 4185 lbu t5, 4(t0) 4186 lbu t6, 5(t0) 4187 lbu t7, 6(t0) 4188 lbu t8, 7(t0) 4189 addiu t1, t1, -128 4190 addiu t2, t2, -128 4191 addiu t3, t3, -128 4192 addiu t4, t4, -128 4193 addiu t5, t5, -128 4194 addiu t6, t6, -128 4195 addiu t7, t7, -128 4196 addiu t8, t8, -128 4197 mtc1 t1, f1 4198 mtc1 t2, f2 4199 mtc1 t3, f3 4200 mtc1 t4, f4 4201 mtc1 t5, f5 4202 mtc1 t6, f6 4203 mtc1 t7, f7 4204 mtc1 t8, f8 4205 cvt.s.w f1, f1 4206 cvt.s.w f2, f2 4207 cvt.s.w f3, 
f3 4208 cvt.s.w f4, f4 4209 cvt.s.w f5, f5 4210 cvt.s.w f6, f6 4211 cvt.s.w f7, f7 4212 cvt.s.w f8, f8 4213 lw t0, 8(a0) 4214 swc1 f1, 32(a2) 4215 swc1 f2, 36(a2) 4216 swc1 f3, 40(a2) 4217 addu t0, t0, a1 4218 swc1 f4, 44(a2) 4219 swc1 f5, 48(a2) 4220 swc1 f6, 52(a2) 4221 swc1 f7, 56(a2) 4222 swc1 f8, 60(a2) 4223 //elemr 2 4224 lbu t1, 0(t0) 4225 lbu t2, 1(t0) 4226 lbu t3, 2(t0) 4227 lbu t4, 3(t0) 4228 lbu t5, 4(t0) 4229 lbu t6, 5(t0) 4230 lbu t7, 6(t0) 4231 lbu t8, 7(t0) 4232 addiu t1, t1, -128 4233 addiu t2, t2, -128 4234 addiu t3, t3, -128 4235 addiu t4, t4, -128 4236 addiu t5, t5, -128 4237 addiu t6, t6, -128 4238 addiu t7, t7, -128 4239 addiu t8, t8, -128 4240 mtc1 t1, f1 4241 mtc1 t2, f2 4242 mtc1 t3, f3 4243 mtc1 t4, f4 4244 mtc1 t5, f5 4245 mtc1 t6, f6 4246 mtc1 t7, f7 4247 mtc1 t8, f8 4248 cvt.s.w f1, f1 4249 cvt.s.w f2, f2 4250 cvt.s.w f3, f3 4251 cvt.s.w f4, f4 4252 cvt.s.w f5, f5 4253 cvt.s.w f6, f6 4254 cvt.s.w f7, f7 4255 cvt.s.w f8, f8 4256 lw t0, 12(a0) 4257 swc1 f1, 64(a2) 4258 swc1 f2, 68(a2) 4259 swc1 f3, 72(a2) 4260 addu t0, t0, a1 4261 swc1 f4, 76(a2) 4262 swc1 f5, 80(a2) 4263 swc1 f6, 84(a2) 4264 swc1 f7, 88(a2) 4265 swc1 f8, 92(a2) 4266 //elemr 3 4267 lbu t1, 0(t0) 4268 lbu t2, 1(t0) 4269 lbu t3, 2(t0) 4270 lbu t4, 3(t0) 4271 lbu t5, 4(t0) 4272 lbu t6, 5(t0) 4273 lbu t7, 6(t0) 4274 lbu t8, 7(t0) 4275 addiu t1, t1, -128 4276 addiu t2, t2, -128 4277 addiu t3, t3, -128 4278 addiu t4, t4, -128 4279 addiu t5, t5, -128 4280 addiu t6, t6, -128 4281 addiu t7, t7, -128 4282 addiu t8, t8, -128 4283 mtc1 t1, f1 4284 mtc1 t2, f2 4285 mtc1 t3, f3 4286 mtc1 t4, f4 4287 mtc1 t5, f5 4288 mtc1 t6, f6 4289 mtc1 t7, f7 4290 mtc1 t8, f8 4291 cvt.s.w f1, f1 4292 cvt.s.w f2, f2 4293 cvt.s.w f3, f3 4294 cvt.s.w f4, f4 4295 cvt.s.w f5, f5 4296 cvt.s.w f6, f6 4297 cvt.s.w f7, f7 4298 cvt.s.w f8, f8 4299 lw t0, 16(a0) 4300 swc1 f1, 96(a2) 4301 swc1 f2, 100(a2) 4302 swc1 f3, 104(a2) 4303 addu t0, t0, a1 4304 swc1 f4, 108(a2) 4305 swc1 f5, 112(a2) 4306 swc1 f6, 116(a2) 
4307 swc1 f7, 120(a2) 4308 swc1 f8, 124(a2) 4309 //elemr 4 4310 lbu t1, 0(t0) 4311 lbu t2, 1(t0) 4312 lbu t3, 2(t0) 4313 lbu t4, 3(t0) 4314 lbu t5, 4(t0) 4315 lbu t6, 5(t0) 4316 lbu t7, 6(t0) 4317 lbu t8, 7(t0) 4318 addiu t1, t1, -128 4319 addiu t2, t2, -128 4320 addiu t3, t3, -128 4321 addiu t4, t4, -128 4322 addiu t5, t5, -128 4323 addiu t6, t6, -128 4324 addiu t7, t7, -128 4325 addiu t8, t8, -128 4326 mtc1 t1, f1 4327 mtc1 t2, f2 4328 mtc1 t3, f3 4329 mtc1 t4, f4 4330 mtc1 t5, f5 4331 mtc1 t6, f6 4332 mtc1 t7, f7 4333 mtc1 t8, f8 4334 cvt.s.w f1, f1 4335 cvt.s.w f2, f2 4336 cvt.s.w f3, f3 4337 cvt.s.w f4, f4 4338 cvt.s.w f5, f5 4339 cvt.s.w f6, f6 4340 cvt.s.w f7, f7 4341 cvt.s.w f8, f8 4342 lw t0, 20(a0) 4343 swc1 f1, 128(a2) 4344 swc1 f2, 132(a2) 4345 swc1 f3, 136(a2) 4346 addu t0, t0, a1 4347 swc1 f4, 140(a2) 4348 swc1 f5, 144(a2) 4349 swc1 f6, 148(a2) 4350 swc1 f7, 152(a2) 4351 swc1 f8, 156(a2) 4352 //elemr 5 4353 lbu t1, 0(t0) 4354 lbu t2, 1(t0) 4355 lbu t3, 2(t0) 4356 lbu t4, 3(t0) 4357 lbu t5, 4(t0) 4358 lbu t6, 5(t0) 4359 lbu t7, 6(t0) 4360 lbu t8, 7(t0) 4361 addiu t1, t1, -128 4362 addiu t2, t2, -128 4363 addiu t3, t3, -128 4364 addiu t4, t4, -128 4365 addiu t5, t5, -128 4366 addiu t6, t6, -128 4367 addiu t7, t7, -128 4368 addiu t8, t8, -128 4369 mtc1 t1, f1 4370 mtc1 t2, f2 4371 mtc1 t3, f3 4372 mtc1 t4, f4 4373 mtc1 t5, f5 4374 mtc1 t6, f6 4375 mtc1 t7, f7 4376 mtc1 t8, f8 4377 cvt.s.w f1, f1 4378 cvt.s.w f2, f2 4379 cvt.s.w f3, f3 4380 cvt.s.w f4, f4 4381 cvt.s.w f5, f5 4382 cvt.s.w f6, f6 4383 cvt.s.w f7, f7 4384 cvt.s.w f8, f8 4385 lw t0, 24(a0) 4386 swc1 f1, 160(a2) 4387 swc1 f2, 164(a2) 4388 swc1 f3, 168(a2) 4389 addu t0, t0, a1 4390 swc1 f4, 172(a2) 4391 swc1 f5, 176(a2) 4392 swc1 f6, 180(a2) 4393 swc1 f7, 184(a2) 4394 swc1 f8, 188(a2) 4395 //elemr 6 4396 lbu t1, 0(t0) 4397 lbu t2, 1(t0) 4398 lbu t3, 2(t0) 4399 lbu t4, 3(t0) 4400 lbu t5, 4(t0) 4401 lbu t6, 5(t0) 4402 lbu t7, 6(t0) 4403 lbu t8, 7(t0) 4404 addiu t1, t1, -128 4405 addiu t2, t2, 
-128 4406 addiu t3, t3, -128 4407 addiu t4, t4, -128 4408 addiu t5, t5, -128 4409 addiu t6, t6, -128 4410 addiu t7, t7, -128 4411 addiu t8, t8, -128 4412 mtc1 t1, f1 4413 mtc1 t2, f2 4414 mtc1 t3, f3 4415 mtc1 t4, f4 4416 mtc1 t5, f5 4417 mtc1 t6, f6 4418 mtc1 t7, f7 4419 mtc1 t8, f8 4420 cvt.s.w f1, f1 4421 cvt.s.w f2, f2 4422 cvt.s.w f3, f3 4423 cvt.s.w f4, f4 4424 cvt.s.w f5, f5 4425 cvt.s.w f6, f6 4426 cvt.s.w f7, f7 4427 cvt.s.w f8, f8 4428 lw t0, 28(a0) 4429 swc1 f1, 192(a2) 4430 swc1 f2, 196(a2) 4431 swc1 f3, 200(a2) 4432 addu t0, t0, a1 4433 swc1 f4, 204(a2) 4434 swc1 f5, 208(a2) 4435 swc1 f6, 212(a2) 4436 swc1 f7, 216(a2) 4437 swc1 f8, 220(a2) 4438 //elemr 7 4439 lbu t1, 0(t0) 4440 lbu t2, 1(t0) 4441 lbu t3, 2(t0) 4442 lbu t4, 3(t0) 4443 lbu t5, 4(t0) 4444 lbu t6, 5(t0) 4445 lbu t7, 6(t0) 4446 lbu t8, 7(t0) 4447 addiu t1, t1, -128 4448 addiu t2, t2, -128 4449 addiu t3, t3, -128 4450 addiu t4, t4, -128 4451 addiu t5, t5, -128 4452 addiu t6, t6, -128 4453 addiu t7, t7, -128 4454 addiu t8, t8, -128 4455 mtc1 t1, f1 4456 mtc1 t2, f2 4457 mtc1 t3, f3 4458 mtc1 t4, f4 4459 mtc1 t5, f5 4460 mtc1 t6, f6 4461 mtc1 t7, f7 4462 mtc1 t8, f8 4463 cvt.s.w f1, f1 4464 cvt.s.w f2, f2 4465 cvt.s.w f3, f3 4466 cvt.s.w f4, f4 4467 cvt.s.w f5, f5 4468 cvt.s.w f6, f6 4469 cvt.s.w f7, f7 4470 cvt.s.w f8, f8 4471 swc1 f1, 224(a2) 4472 swc1 f2, 228(a2) 4473 swc1 f3, 232(a2) 4474 swc1 f4, 236(a2) 4475 swc1 f5, 240(a2) 4476 swc1 f6, 244(a2) 4477 swc1 f7, 248(a2) 4478 swc1 f8, 252(a2) 4479 4480 j ra 4481 nop 4482 4483END(jsimd_convsamp_float_mips_dspr2) 4484 4485/*****************************************************************************/ 4486 4487