;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_build_intra_predictors_mby_neon_func|
    EXPORT  |vp8_build_intra_predictors_mby_s_neon_func|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
; r0    unsigned char *y_buffer
; r1    unsigned char *ypred_ptr
; r2    int y_stride
; r3    int mode
; stack int Up
; stack int Left

|vp8_build_intra_predictors_mby_neon_func| PROC
    push            {r4-r8, lr}

    cmp             r3, #0
    beq             case_dc_pred
    cmp             r3, #1
    beq             case_v_pred
    cmp             r3, #2
    beq             case_h_pred
    cmp             r3, #3
    beq             case_tm_pred

case_dc_pred
    ldr             r4, [sp, #24]       ; Up
    ldr             r5, [sp, #28]       ; Left

    ; Default the DC average to 128
    mov             r12, #128
    vdup.u8         q0, r12

    ; Zero out running sum
    mov             r12, #0

    ; compute shift and jump
    adds            r7, r4, r5
    beq             skip_dc_pred_up_left

    ; Load above row, if it exists
    cmp             r4, #0
    beq             skip_dc_pred_up

    sub             r6, r0, r2
    vld1.8          {q1}, [r6]
    vpaddl.u8       q2, q1
    vpaddl.u16      q3, q2
    vpaddl.u32      q4, q3

    vmov.32         r4, d8[0]
    vmov.32         r6, d9[0]

    add             r12, r4, r6

    ; Move back to integer registers

skip_dc_pred_up

    cmp             r5, #0
    beq             skip_dc_pred_left

    sub             r0, r0, #1

    ; Load left row, if it exists
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0]

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

skip_dc_pred_left
    add             r7, r7, #3                  ; Shift = 3 + Up + Left
    sub             r4, r7, #1
    mov             r5, #1
    add             r12, r12, r5, lsl r4        ; sum += 1 << (shift - 1)
    mov             r5, r12, lsr r7             ; expected_dc = sum >> shift

    vdup.u8         q0, r5

skip_dc_pred_up_left
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!

    pop             {r4-r8,pc}
case_v_pred
    ; Copy down above row
    sub             r6, r0, r2
    vld1.8          {q0}, [r6]

    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    pop             {r4-r8,pc}

case_h_pred
    ; Load 4x yleft_col
    sub             r0, r0, #1

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    pop             {r4-r8,pc}

case_tm_pred
    ; Load yabove_row
    sub             r3, r0, r2
    vld1.8          {q8}, [r3]

    ; Load ytop_left
    sub             r3, r3, #1
    ldrb            r7, [r3]

    vdup.u16        q7, r7

    ; Compute yabove_row - ytop_left
    mov             r3, #1
    vdup.u8         q0, r3

    vmull.u8        q4, d16, d0
    vmull.u8        q5, d17, d0

    vsub.s16        q4, q4, q7
    vsub.s16        q5, q5, q7

    ; Load 4x yleft_col
    sub             r0, r0, #1
    mov             r12, #4

case_tm_pred_loop
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u16        q0, r3
    vdup.u16        q1, r4
    vdup.u16        q2, r5
    vdup.u16        q3, r6

    vqadd.s16       q8, q0, q4
    vqadd.s16       q9, q0, q5

    vqadd.s16       q10, q1, q4
    vqadd.s16       q11, q1, q5

    vqadd.s16       q12, q2, q4
    vqadd.s16       q13, q2, q5

    vqadd.s16       q14, q3, q4
    vqadd.s16       q15, q3, q5

    vqshrun.s16     d0, q8, #0
    vqshrun.s16     d1, q9, #0

    vqshrun.s16     d2, q10, #0
    vqshrun.s16     d3, q11, #0

    vqshrun.s16     d4, q12, #0
    vqshrun.s16     d5, q13, #0

    vqshrun.s16     d6, q14, #0
    vqshrun.s16     d7, q15, #0

    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    subs            r12, r12, #1
    bne             case_tm_pred_loop

    pop             {r4-r8,pc}

    ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r0    unsigned char *y_buffer
; r1    unsigned char *ypred_ptr
; r2    int y_stride
; r3    int mode
; stack int Up
; stack int Left

|vp8_build_intra_predictors_mby_s_neon_func| PROC
    push            {r4-r8, lr}

    mov             r1, r0      ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;

    cmp             r3, #0
    beq             case_dc_pred_s
    cmp             r3, #1
    beq             case_v_pred_s
    cmp             r3, #2
    beq             case_h_pred_s
    cmp             r3, #3
    beq             case_tm_pred_s

case_dc_pred_s
    ldr             r4, [sp, #24]       ; Up
    ldr             r5, [sp, #28]       ; Left

    ; Default the DC average to 128
    mov             r12, #128
    vdup.u8         q0, r12

    ; Zero out running sum
    mov             r12, #0

    ; compute shift and jump
    adds            r7, r4, r5
    beq             skip_dc_pred_up_left_s

    ; Load above row, if it exists
    cmp             r4, #0
    beq             skip_dc_pred_up_s

    sub             r6, r0, r2
    vld1.8          {q1}, [r6]
    vpaddl.u8       q2, q1
    vpaddl.u16      q3, q2
    vpaddl.u32      q4, q3

    vmov.32         r4, d8[0]
    vmov.32         r6, d9[0]

    add             r12, r4, r6

    ; Move back to integer registers

skip_dc_pred_up_s

    cmp             r5, #0
    beq             skip_dc_pred_left_s

    sub             r0, r0, #1

    ; Load left row, if it exists
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0]

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

skip_dc_pred_left_s
    add             r7, r7, #3                  ; Shift = 3 + Up + Left
    sub             r4, r7, #1
    mov             r5, #1
    add             r12, r12, r5, lsl r4        ; sum += 1 << (shift - 1)
    mov             r5, r12, lsr r7             ; expected_dc = sum >> shift

    vdup.u8         q0, r5

skip_dc_pred_up_left_s
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2

    pop             {r4-r8,pc}
case_v_pred_s
    ; Copy down above row
    sub             r6, r0, r2
    vld1.8          {q0}, [r6]

    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    pop             {r4-r8,pc}

case_h_pred_s
    ; Load 4x yleft_col
    sub             r0, r0, #1

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    pop             {r4-r8,pc}

case_tm_pred_s
    ; Load yabove_row
    sub             r3, r0, r2
    vld1.8          {q8}, [r3]

    ; Load ytop_left
    sub             r3, r3, #1
    ldrb            r7, [r3]

    vdup.u16        q7, r7

    ; Compute yabove_row - ytop_left
    mov             r3, #1
    vdup.u8         q0, r3

    vmull.u8        q4, d16, d0
    vmull.u8        q5, d17, d0

    vsub.s16        q4, q4, q7
    vsub.s16        q5, q5, q7

    ; Load 4x yleft_col
    sub             r0, r0, #1
    mov             r12, #4

case_tm_pred_loop_s
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u16        q0, r3
    vdup.u16        q1, r4
    vdup.u16        q2, r5
    vdup.u16        q3, r6

    vqadd.s16       q8, q0, q4
    vqadd.s16       q9, q0, q5

    vqadd.s16       q10, q1, q4
    vqadd.s16       q11, q1, q5

    vqadd.s16       q12, q2, q4
    vqadd.s16       q13, q2, q5

    vqadd.s16       q14, q3, q4
    vqadd.s16       q15, q3, q5

    vqshrun.s16     d0, q8, #0
    vqshrun.s16     d1, q9, #0

    vqshrun.s16     d2, q10, #0
    vqshrun.s16     d3, q11, #0

    vqshrun.s16     d4, q12, #0
    vqshrun.s16     d5, q13, #0

    vqshrun.s16     d6, q14, #0
    vqshrun.s16     d7, q15, #0

    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    subs            r12, r12, #1
    bne             case_tm_pred_loop_s

    pop             {r4-r8,pc}

    ENDP


    END