;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;-----------------------------------------------------------------------
; Syntax : ARM armasm (|label| PROC / ENDP, EXPORT, AREA).
; Target : ARMv6 -- the routines use pkhbt, smuad/smlad, usat and pld.
; Every routine saves r4-r11 and lr on entry and returns by popping the
; saved lr into pc; stack arguments are read relative to that 36-byte
; save area.
;
; Filter format used throughout: vp8_filter points at three 32-bit words,
; each holding two packed 16-bit taps (loaded into r4, r5, r6).
; Rounding used throughout: (sum + 0x40) >> 7, saturated to 0..255.
;-----------------------------------------------------------------------

    EXPORT  |vp8_filter_block2d_first_pass_armv6|
    EXPORT  |vp8_filter_block2d_first_pass_16x16_armv6|
    EXPORT  |vp8_filter_block2d_first_pass_8x8_armv6|
    EXPORT  |vp8_filter_block2d_second_pass_armv6|
    EXPORT  |vp8_filter4_block2d_second_pass_armv6|
    EXPORT  |vp8_filter_block2d_first_pass_only_armv6|
    EXPORT  |vp8_filter_block2d_second_pass_only_armv6|

    AREA    |.text|, CODE, READONLY         ; name this block of code

;-------------------------------------
; r0    unsigned char *src_ptr
; r1    short         *output_ptr
; r2    unsigned int   src_pixels_per_line
; r3    unsigned int   output_width
; stack unsigned int   output_height
; stack const short   *vp8_filter
;-------------------------------------
; Filter the input horizontally and put the result in the output array.
; Applies the 6 tap FIR filter with the output being a 2 byte value and
; the input being a 1 byte value.  Two output samples are produced per
; inner-loop iteration.
;
; Loop counter (r7): output height in the top 16 bits, width / 2 in the
; low 8 bits.
; Output layout: consecutive samples of one source row are stored
; r12 = 2 * output_width + 16 bytes apart, and each new source row starts
; 2 bytes further on, i.e. the result is written transposed.
|vp8_filter_block2d_first_pass_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes

    ldr     r11, [sp, #40]                  ; vp8_filter address
    ldr     r7, [sp, #36]                   ; output height

    sub     r2, r2, r3                      ; inside loop increments input array,
                                            ; so the height loop only needs to add
                                            ; r2 - width to the input pointer

    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
    add     r12, r3, #16                    ; square off the output
    sub     sp, sp, #4                      ; one word of scratch for the dst pointer

    ldr     r4, [r11]                       ; load up packed filter coefficients
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]

    str     r1, [sp]                        ; push destination to stack
    mov     r7, r7, lsl #16                 ; height is top part of counter

; six tap filter
|height_loop_1st_6|
    ldrb    r8, [r0, #-2]                   ; load source data
    ldrb    r9, [r0, #-1]
    ldrb    r10, [r0], #2
    orr     r7, r7, r3, lsr #2              ; construct loop counter (low byte = width / 2)

|width_loop_1st_6|
    ldrb    r11, [r0, #-1]

    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9

    ldrb    r9, [r0]

    smuad   lr, lr, r4                      ; apply the filter
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    smuad   r8, r8, r4
    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11

    smlad   lr, r10, r5, lr
    ldrb    r10, [r0, #1]
    smlad   r8, r11, r5, r8
    ldrb    r11, [r0, #2]

    sub     r7, r7, #1

    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10

    smlad   lr, r9, r6, lr
    smlad   r11, r10, r6, r8

    ands    r10, r7, #0xff                  ; test loop counter; the flags set here
                                            ; drive the ldrneb/bne below

    add     lr, lr, #0x40                   ; round_shift_and_clamp
    ldrneb  r8, [r0, #-2]                   ; load data for next loop
    usat    lr, #8, lr, asr #7
    add     r11, r11, #0x40
    ldrneb  r9, [r0, #-1]
    usat    r11, #8, r11, asr #7

    strh    lr, [r1], r12                   ; result is transposed and stored, which
                                            ; will make second pass filtering easier.
    ldrneb  r10, [r0], #2
    strh    r11, [r1], r12

    bne     width_loop_1st_6

    ldr     r1, [sp]                        ; load and update dst address
    subs    r7, r7, #0x10000                ; one row done
    add     r0, r0, r2                      ; move to next input line

    add     r1, r1, #2                      ; move over to next column
    str     r1, [sp]

    bne     height_loop_1st_6

    add     sp, sp, #4
    ldmia   sp!, {r4 - r11, pc}

    ENDP

; --------------------------
; 16x16 version
; -----------------------------
; Same arguments and same filtering loop as
; |vp8_filter_block2d_first_pass_armv6|; the only difference is a pld of
; the following source row before the first row and after every row.
; The pld offsets (18 and 34) are written for a block width of 16.
|vp8_filter_block2d_first_pass_16x16_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes

    ldr     r11, [sp, #40]                  ; vp8_filter address
    ldr     r7, [sp, #36]                   ; output height

    add     r4, r2, #18                     ; preload next row
    pld     [r0, r4]                        ; (src + src_pixels_per_line + 18)

    sub     r2, r2, r3                      ; inside loop increments input array,
                                            ; so the height loop only needs to add
                                            ; r2 - width to the input pointer

    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
    add     r12, r3, #16                    ; square off the output
    sub     sp, sp, #4                      ; one word of scratch for the dst pointer

    ldr     r4, [r11]                       ; load up packed filter coefficients
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]

    str     r1, [sp]                        ; push destination to stack
    mov     r7, r7, lsl #16                 ; height is top part of counter

; six tap filter
|height_loop_1st_16_6|
    ldrb    r8, [r0, #-2]                   ; load source data
    ldrb    r9, [r0, #-1]
    ldrb    r10, [r0], #2
    orr     r7, r7, r3, lsr #2              ; construct loop counter (low byte = width / 2)

|width_loop_1st_16_6|
    ldrb    r11, [r0, #-1]

    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9

    ldrb    r9, [r0]

    smuad   lr, lr, r4                      ; apply the filter
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    smuad   r8, r8, r4
    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11

    smlad   lr, r10, r5, lr
    ldrb    r10, [r0, #1]
    smlad   r8, r11, r5, r8
    ldrb    r11, [r0, #2]

    sub     r7, r7, #1

    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10

    smlad   lr, r9, r6, lr
    smlad   r11, r10, r6, r8

    ands    r10, r7, #0xff                  ; test loop counter; the flags set here
                                            ; drive the ldrneb/bne below

    add     lr, lr, #0x40                   ; round_shift_and_clamp
    ldrneb  r8, [r0, #-2]                   ; load data for next loop
    usat    lr, #8, lr, asr #7
    add     r11, r11, #0x40
    ldrneb  r9, [r0, #-1]
    usat    r11, #8, r11, asr #7

    strh    lr, [r1], r12                   ; result is transposed and stored, which
                                            ; will make second pass filtering easier.
    ldrneb  r10, [r0], #2
    strh    r11, [r1], r12

    bne     width_loop_1st_16_6

    ldr     r1, [sp]                        ; load and update dst address
    subs    r7, r7, #0x10000                ; one row done
    add     r0, r0, r2                      ; move to next input line

    add     r11, r2, #34                    ; r2 is stride - width here, so add
                                            ; back the block width (=16): 18 + 16
    pld     [r0, r11]                       ; preload next row

    add     r1, r1, #2                      ; move over to next column
    str     r1, [sp]

    bne     height_loop_1st_16_6            ; pld/add/str leave the subs flags intact

    add     sp, sp, #4
    ldmia   sp!, {r4 - r11, pc}

    ENDP

; --------------------------
; 8x8 version
; -----------------------------
; Same arguments and same filtering loop as
; |vp8_filter_block2d_first_pass_armv6|; the only difference is a pld of
; the following source row before the first row and after every row.
; The pld offsets (10 and 18) are written for a block width of 8.
|vp8_filter_block2d_first_pass_8x8_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes

    ldr     r11, [sp, #40]                  ; vp8_filter address
    ldr     r7, [sp, #36]                   ; output height

    add     r4, r2, #10                     ; preload next row
    pld     [r0, r4]                        ; (src + src_pixels_per_line + 10)

    sub     r2, r2, r3                      ; inside loop increments input array,
                                            ; so the height loop only needs to add
                                            ; r2 - width to the input pointer

    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
    add     r12, r3, #16                    ; square off the output
    sub     sp, sp, #4                      ; one word of scratch for the dst pointer

    ldr     r4, [r11]                       ; load up packed filter coefficients
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]

    str     r1, [sp]                        ; push destination to stack
    mov     r7, r7, lsl #16                 ; height is top part of counter

; six tap filter
|height_loop_1st_8_6|
    ldrb    r8, [r0, #-2]                   ; load source data
    ldrb    r9, [r0, #-1]
    ldrb    r10, [r0], #2
    orr     r7, r7, r3, lsr #2              ; construct loop counter (low byte = width / 2)

|width_loop_1st_8_6|
    ldrb    r11, [r0, #-1]

    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9

    ldrb    r9, [r0]

    smuad   lr, lr, r4                      ; apply the filter
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    smuad   r8, r8, r4
    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11

    smlad   lr, r10, r5, lr
    ldrb    r10, [r0, #1]
    smlad   r8, r11, r5, r8
    ldrb    r11, [r0, #2]

    sub     r7, r7, #1

    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10

    smlad   lr, r9, r6, lr
    smlad   r11, r10, r6, r8

    ands    r10, r7, #0xff                  ; test loop counter; the flags set here
                                            ; drive the ldrneb/bne below

    add     lr, lr, #0x40                   ; round_shift_and_clamp
    ldrneb  r8, [r0, #-2]                   ; load data for next loop
    usat    lr, #8, lr, asr #7
    add     r11, r11, #0x40
    ldrneb  r9, [r0, #-1]
    usat    r11, #8, r11, asr #7

    strh    lr, [r1], r12                   ; result is transposed and stored, which
                                            ; will make second pass filtering easier.
    ldrneb  r10, [r0], #2
    strh    r11, [r1], r12

    bne     width_loop_1st_8_6

    ldr     r1, [sp]                        ; load and update dst address
    subs    r7, r7, #0x10000                ; one row done
    add     r0, r0, r2                      ; move to next input line

    add     r11, r2, #18                    ; r2 is stride - width here, so add
                                            ; back the block width (=8): 10 + 8
    pld     [r0, r11]                       ; preload next row

    add     r1, r1, #2                      ; move over to next column
    str     r1, [sp]

    bne     height_loop_1st_8_6             ; pld/add/str leave the subs flags intact

    add     sp, sp, #4
    ldmia   sp!, {r4 - r11, pc}

    ENDP

;---------------------------------
; r0    short         *src_ptr,
; r1    unsigned char *output_ptr,
; r2    unsigned int   output_pitch,
; r3    unsigned int   cnt,
; stack const short   *vp8_filter
;---------------------------------
; Second (6 tap) pass over the 16-bit intermediate data.  Two output
; bytes are produced per inner-loop iteration and each is stored with a
; post-increment of output_pitch; after every outer iteration the
; destination restarts one byte further on, so the data is transposed
; back.
;
; Loop counter (r7): cnt in the top 16 bits (outer loop) and cnt / 2 in
; the low 8 bits (inner loop).
|vp8_filter_block2d_second_pass_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes

    ldr     r11, [sp, #36]                  ; vp8_filter address
    sub     sp, sp, #4                      ; one word of scratch for the dst pointer
    mov     r7, r3, lsl #16                 ; height is top part of counter
    str     r1, [sp]                        ; push destination to stack

    ldr     r4, [r11]                       ; load up packed filter coefficients
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]

    pkhbt   r12, r5, r4                     ; pack the filter differently:
    pkhbt   r11, r6, r5                     ; re-paired taps for the odd output (smladx)

    sub     r0, r0, #4                      ; offset input buffer (back up two shorts)

|height_loop_2nd|
    ldr     r8, [r0]                        ; load the data
    ldr     r9, [r0, #4]
    orr     r7, r7, r3, lsr #1              ; loop counter (low byte = cnt / 2)

|width_loop_2nd|
    smuad   lr, r4, r8                      ; apply filter
    sub     r7, r7, #1
    smulbt  r8, r4, r8

    ldr     r10, [r0, #8]

    smlad   lr, r5, r9, lr
    smladx  r8, r12, r9, r8

    ldrh    r9, [r0, #12]

    smlad   lr, r6, r10, lr
    smladx  r8, r11, r10, r8

    add     r0, r0, #4                      ; advance two shorts
    smlatb  r10, r6, r9, r8

    add     lr, lr, #0x40                   ; round_shift_and_clamp
    ands    r8, r7, #0xff                   ; test loop counter
    usat    lr, #8, lr, asr #7
    add     r10, r10, #0x40
    strb    lr, [r1], r2                    ; the result is transposed back and stored
    usat    r10, #8, r10, asr #7

    ldrne   r8, [r0]                        ; load data for next loop
    ldrne   r9, [r0, #4]
    strb    r10, [r1], r2

    bne     width_loop_2nd

    ldr     r1, [sp]                        ; update dst for next loop
    subs    r7, r7, #0x10000
    add     r0, r0, #16                     ; update src for next loop
    add     r1, r1, #1
    str     r1, [sp]

    bne     height_loop_2nd

    add     sp, sp, #4
    ldmia   sp!, {r4 - r11, pc}

    ENDP

;---------------------------------
; r0    short         *src_ptr,
; r1    unsigned char *output_ptr,
; r2    unsigned int   output_pitch,
; r3    unsigned int   cnt,
; stack const short   *vp8_filter
;---------------------------------
; Second pass variant in which only the re-packed tap pairs r12 and r11
; are multiplied (two smlad/smladx steps per output instead of three).
; The rounding constant 0x40 is kept in r4 and supplied as the initial
; accumulator.  No stack scratch is used: lr holds output_ptr + cnt and
; the destination for the next outer iteration is recomputed from it.
|vp8_filter4_block2d_second_pass_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes

    ldr     r11, [sp, #36]                  ; vp8_filter address
    mov     r7, r3, lsl #16                 ; height is top part of counter

    ldr     r4, [r11]                       ; load up packed filter coefficients
    add     lr, r1, r3                      ; save final destination pointer
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]

    pkhbt   r12, r5, r4                     ; pack the filter differently
    pkhbt   r11, r6, r5
    mov     r4, #0x40                       ; rounding factor (for smlad{x})

|height_loop_2nd_4|
    ldrd    r8, r9, [r0, #-4]               ; load the data
    orr     r7, r7, r3, lsr #1              ; loop counter (low byte = cnt / 2)

|width_loop_2nd_4|
    ldr     r10, [r0, #4]!                  ; pre-indexed: r0 advances two shorts
    smladx  r6, r9, r12, r4                 ; apply filter
    pkhbt   r8, r9, r8
    smlad   r5, r8, r12, r4
    pkhbt   r8, r10, r9
    smladx  r6, r10, r11, r6
    sub     r7, r7, #1
    smlad   r5, r8, r11, r5

    mov     r8, r9                          ; shift the data for the next loop
    mov     r9, r10

    usat    r6, #8, r6, asr #7              ; shift and clamp
    usat    r5, #8, r5, asr #7

    strb    r5, [r1], r2                    ; the result is transposed back and stored
    tst     r7, #0xff                       ; test loop counter
    strb    r6, [r1], r2

    bne     width_loop_2nd_4

    subs    r7, r7, #0x10000
    add     r0, r0, #16                     ; update src for next loop
    sub     r1, lr, r7, lsr #16             ; update dst for next loop:
                                            ; (output_ptr + cnt) - remaining count

    bne     height_loop_2nd_4

    ldmia   sp!, {r4 - r11, pc}

    ENDP

;------------------------------------
; r0    unsigned char *src_ptr
; r1    unsigned char *output_ptr,
; r2    unsigned int   src_pixels_per_line
; r3    unsigned int   cnt,
; stack unsigned int   output_pitch,
; stack const short   *vp8_filter
;------------------------------------
; Horizontal 6 tap filter only: bytes in, bytes out, no transpose.
; cnt is used both as the row count (r7) and, halved, as the per-row
; iteration count (r12); two output bytes are produced per iteration.
;
; Stack scratch: [sp]    = output_pitch - cnt
;                [sp, 4] = src_pixels_per_line - cnt
|vp8_filter_block2d_first_pass_only_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes

    add     r7, r2, r3                      ; preload next row
    add     r7, r7, #2
    pld     [r0, r7]

    ldr     r4, [sp, #36]                   ; output pitch
    ldr     r11, [sp, #40]                  ; HFilter address
    sub     sp, sp, #8                      ; two words of scratch

    mov     r7, r3                          ; row counter
    sub     r2, r2, r3                      ; inside loop increments input array,
                                            ; so the height loop only needs to add
                                            ; r2 - width to the input pointer

    sub     r4, r4, r3
    str     r4, [sp]                        ; save modified output pitch
    str     r2, [sp, #4]                    ; save modified source stride

    mov     r2, #0x40                       ; rounding constant, used as the
                                            ; initial accumulator below

    ldr     r4, [r11]                       ; load up packed filter coefficients
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]

; six tap filter
|height_loop_1st_only_6|
    ldrb    r8, [r0, #-2]                   ; load data
    ldrb    r9, [r0, #-1]
    ldrb    r10, [r0], #2

    mov     r12, r3, lsr #1                 ; loop counter

|width_loop_1st_only_6|
    ldrb    r11, [r0, #-1]

    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9

    ldrb    r9, [r0]

    ; The first multiply of each sum is an smlad accumulating onto r2
    ; (0x40), so no separate rounding add is needed before the usat.
    smlad   lr, lr, r4, r2
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    smlad   r8, r8, r4, r2
    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11

    smlad   lr, r10, r5, lr
    ldrb    r10, [r0, #1]
    smlad   r8, r11, r5, r8
    ldrb    r11, [r0, #2]

    subs    r12, r12, #1                    ; flags set here drive the
                                            ; ldrneb/bne below

    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10

    smlad   lr, r9, r6, lr
    smlad   r10, r10, r6, r8

    ldrneb  r8, [r0, #-2]                   ; load data for next loop
    usat    lr, #8, lr, asr #7              ; shift and clamp (rounding already added)
    strb    lr, [r1], #1                    ; store the result
    usat    r10, #8, r10, asr #7

    ldrneb  r9, [r0, #-1]
    strb    r10, [r1], #1
    ldrneb  r10, [r0], #2

    bne     width_loop_1st_only_6

    ldr     lr, [sp]                        ; load back modified output pitch
    ldr     r12, [sp, #4]                   ; load back modified source stride
    subs    r7, r7, #1
    add     r0, r0, r12                     ; update src for next loop

    add     r11, r12, r3                    ; preload next row
    add     r11, r11, #2
    pld     [r0, r11]

    add     r1, r1, lr                      ; update dst for next loop

    bne     height_loop_1st_only_6          ; add/pld leave the subs flags intact

    add     sp, sp, #8
    ldmia   sp!, {r4 - r11, pc}
    ENDP  ; |vp8_filter_block2d_first_pass_only_armv6|


;------------------------------------
; r0    unsigned char *src_ptr,
; r1    unsigned char *output_ptr,
; r2    unsigned int   src_pixels_per_line
; r3    unsigned int   cnt,
; stack unsigned int   output_pitch,
; stack const short   *vp8_filter
;------------------------------------
; Vertical 6 tap filter only: bytes in, bytes out.  The inner loop walks
; down one column, producing two output bytes per iteration; the outer
; loop then moves source and destination one byte to the right.
;
; Loop counter (r7): cnt in the top 16 bits (columns) and cnt in the low
; 8 bits (rows, decremented by 2 per inner iteration).
; Stack scratch: [sp] = column source pointer, [sp, 4] = column dst pointer.
|vp8_filter_block2d_second_pass_only_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes

    ldr     r11, [sp, #40]                  ; VFilter address
    ldr     r12, [sp, #36]                  ; output pitch

    mov     r7, r3, lsl #16                 ; height is top part of counter
    sub     r0, r0, r2, lsl #1              ; need 6 elements for filtering, 2 before, 3 after
                                            ; (start two rows above src_ptr)

    sub     sp, sp, #8                      ; two words of scratch

    ldr     r4, [r11]                       ; load up packed filter coefficients
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]

    str     r0, [sp]                        ; save r0 to stack
    str     r1, [sp, #4]                    ; save dst to stack

; six tap filter
|width_loop_2nd_only_6|
    ldrb    r8, [r0], r2                    ; load data
    orr     r7, r7, r3                      ; loop counter
    ldrb    r9, [r0], r2
    ldrb    r10, [r0], r2

|height_loop_2nd_only_6|
    ; filter one column in this inner loop, then move to the next column.
    ldrb    r11, [r0], r2

    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9

    ldrb    r9, [r0], r2

    smuad   lr, lr, r4
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
    smuad   r8, r8, r4
    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11

    smlad   lr, r10, r5, lr
    ldrb    r10, [r0], r2
    smlad   r8, r11, r5, r8
    ldrb    r11, [r0]

    sub     r7, r7, #2                      ; two rows produced per iteration
    sub     r0, r0, r2, lsl #2              ; step back four rows

    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10

    smlad   lr, r9, r6, lr
    smlad   r10, r10, r6, r8

    ands    r9, r7, #0xff                   ; test loop counter

    add     lr, lr, #0x40                   ; round_shift_and_clamp
    ldrneb  r8, [r0], r2                    ; load data for next loop
    usat    lr, #8, lr, asr #7
    add     r10, r10, #0x40
    strb    lr, [r1], r12                   ; store the result for the column
    usat    r10, #8, r10, asr #7

    ldrneb  r9, [r0], r2
    strb    r10, [r1], r12
    ldrneb  r10, [r0], r2

    bne     height_loop_2nd_only_6

    ldr     r0, [sp]
    ldr     r1, [sp, #4]
    subs    r7, r7, #0x10000
    add     r0, r0, #1                      ; move to filter next column
    str     r0, [sp]
    add     r1, r1, #1
    str     r1, [sp, #4]

    bne     width_loop_2nd_only_6

    add     sp, sp, #8

    ldmia   sp!, {r4 - r11, pc}
    ENDP  ; |vp8_filter_block2d_second_pass_only_armv6|

    END