@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@*
@* @brief
@*  This file contains definitions of routines that compute distortion
@*  between two macro/sub blocks of identical dimensions
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*  - ime_compute_sad_16x16_a9q()
@*  - ime_compute_sad_16x16_fast_a9q()
@*  - ime_compute_sad_16x8_a9q()
@*  - ime_compute_sad_16x16_ea8_a9q()
@*  - ime_calculate_sad2_prog_a9q()
@*  - ime_calculate_sad3_prog_a9q()
@*  - ime_calculate_sad4_prog_a9q()
@*  - ime_sub_pel_compute_sad_16x16_a9q()
@*  - ime_compute_satqd_16x16_lumainter_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@

@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
@*
@* @par Description
@*  This function estimates the SAD between 2 16x16 blocks by sampling only
@*  every alternate row (8 rows with doubled stride) and doubling the
@*  accumulated sum at the end.  To compute the distortion of the entire
@*  block set u4_max_sad to USHRT_MAX.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] i4_max_sad
@*  integer maximum allowed distortion (unused here; full block is summed)
@*
@* @param[in] pi4_mb_distortion
@*  pointer through which the evaluated SAD is returned
@*
@* @remarks
@*
@******************************************************************************
@*
.text
.p2align 2

    .global ime_compute_sad_16x16_fast_a9q

ime_compute_sad_16x16_fast_a9q:

    stmfd         sp!, {r12, lr}
    vpush         {d8-d15}              @ AAPCS: d8-d15 are callee-saved
    lsl           r2, r2, #1            @ double src stride -> visit alternate rows
    lsl           r3, r3, #1            @ double dst stride

    @for bringing buffer2 into cache..., dummy load instructions
    @LDR r12,[r1]

    vld1.8        {d4, d5}, [r0], r2    @ src row 0
    vld1.8        {d6, d7}, [r1], r3    @ ref row 0
    mov           r12, #6               @ 6 more row-pairs handled in the loop
    vld1.8        {d8, d9}, [r0], r2    @ src row 2
    vabdl.u8      q0, d6, d4            @ q0/q1 accumulate |src - ref| as u16
    vabdl.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3  @ ref row 2

loop_sad_16x16_fast:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d10, d8           @ accumulate previous row pair
    vabal.u8      q1, d11, d9
    vld1.8        {d6, d7}, [r1], r3
    subs          r12, #2               @ two sampled rows per iteration
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

    bne           loop_sad_16x16_fast

    vabal.u8      q0, d10, d8           @ flush the last prefetched row pair
    vabal.u8      q1, d11, d9

    vadd.i16      q0, q0, q1            @ horizontal reduction of 16 u16 lanes
    vadd.i16      d0, d1, d0
    vpop          {d8-d15}
    ldr           r12, [sp, #12]        @ pi4_mb_distortion (2nd stack arg after {r12,lr})
    vpaddl.u16    d0, d0
    vpaddl.u32    d0, d0
    vshl.u32      d0, d0, #1            @ x2: only half the rows were visited
    vst1.32       {d0[0]}, [r12]

    ldmfd         sp!, {r12, pc}


@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x8 blocks
@*
@*
@* @par Description
@*  This function computes SAD between 2 16x8 blocks.
@*  There is a provision
@* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
@* compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] u4_max_sad
@*  integer maximum allowed distortion (unused here; full block is summed)
@*
@* @param[in] pi4_mb_distortion
@*  pointer through which the evaluated SAD is returned
@*
@* @remarks
@*
@******************************************************************************
@*
@
    .global ime_compute_sad_16x8_a9q

ime_compute_sad_16x8_a9q:

    stmfd         sp!, {r12, lr}

    @for bringing buffer2 into cache..., dummy load instructions
    @LDR r12,[r1]

    vld1.8        {d4, d5}, [r0], r2    @ src row 0
    vld1.8        {d6, d7}, [r1], r3    @ ref row 0
    mov           r12, #6               @ 6 more rows handled in the loop (8 total)
    vpush         {d8-d15}              @ AAPCS: d8-d15 are callee-saved
    vld1.8        {d8, d9}, [r0], r2    @ src row 1
    vabdl.u8      q0, d6, d4            @ q0/q1 accumulate |src - ref| as u16
    vabdl.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3  @ ref row 1

loop_sad_16x8:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d10, d8           @ accumulate previous row
    vabal.u8      q1, d11, d9
    vld1.8        {d6, d7}, [r1], r3
    subs          r12, #2               @ two rows per iteration
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

    bne           loop_sad_16x8

    vabal.u8      q0, d10, d8           @ flush the last prefetched row
    vabal.u8      q1, d11, d9

    vadd.i16      q0, q0, q1            @ horizontal reduction of 16 u16 lanes
    vadd.i16      d0, d1, d0
    vpop          {d8-d15}
    ldr           r12, [sp, #12]        @ pi4_mb_distortion (2nd stack arg after {r12,lr})
    vpaddl.u16    d0, d0
    vpaddl.u32    d0, d0

    vst1.32       {d0[0]}, [r12]

    ldmfd         sp!, {r12, pc}


@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
@*
@* @par Description
@*  This function computes SAD between 2 16x16 blocks.
@*  There is a provision
@* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
@* compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] i4_max_sad
@*  integer maximum allowed distortion; checked once at the half-way point
@*
@* @param[in] pi4_mb_distortion
@*  pointer through which the evaluated SAD is returned
@*
@* @remarks
@*  Rows are visited in interleaved order (even rows first, then odd rows)
@*  so the half-way early-exit test has seen samples spread over the whole
@*  block rather than only its top half.
@*
@******************************************************************************
@*

    .global ime_compute_sad_16x16_ea8_a9q

ime_compute_sad_16x16_ea8_a9q:

    stmfd         sp!, {r5-r7, lr}
    lsl           r2, r2, #1            @ double strides -> alternate rows per pass
    lsl           r3, r3, #1

    @for bringing buffer2 into cache..., dummy load instructions
    @LDR r12,[r1]

    vld1.8        {d4, d5}, [r0], r2
    vld1.8        {d6, d7}, [r1], r3
    mov           r5, #6
    ldrd          r6, r7, [sp, #16]     @ stack args after 4 pushed regs:
                                        @ r6 = i4_max_sad, r7 = pi4_mb_distortion
    vpush         {d8-d15}              @ AAPCS: d8-d15 are callee-saved
    vld1.8        {d8, d9}, [r0], r2
    vabdl.u8      q0, d6, d4            @ q0/q1 accumulate |src - ref| as u16
    vabdl.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

    @ First pass: even rows (0,2,...,14)

loop_sad_16x16_ea8_1:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d10, d8
    vabal.u8      q1, d11, d9
    vld1.8        {d6, d7}, [r1], r3
    subs          r5, #2
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

    bne           loop_sad_16x16_ea8_1

    vabal.u8      q0, d10, d8
    sub           r0, r0, r2, lsl #3    @ rewind 8 (doubled-stride) rows
    vabal.u8      q1, d11, d9
    sub           r1, r1, r3, lsl #3

    vadd.i16      q6, q0, q1            @ partial SAD of the even rows
    add           r0, r0, r2, asr #1    @ step down one original row -> odd rows
    vadd.i16      d12, d12, d13
    add           r1, r1, r3, asr #1

    vpaddl.u16    d12, d12
    vld1.8        {d4, d5}, [r0], r2
    vld1.8        {d6, d7}, [r1], r3
    vpaddl.u32    d12, d12
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5

    vst1.32       {d12[0]}, [r7]        @ publish the partial SAD
    ldr           r5, [r7]
    cmp           r5, r6                @ early exit once partial SAD > i4_max_sad
    bgt           end_func_16x16_ea8

    vld1.8        {d10, d11}, [r1], r3
    mov           r5, #6

    @ Second pass: odd rows (1,3,...,15)

loop_sad_16x16_ea8_2:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d10, d8
    vabal.u8      q1, d11, d9
    vld1.8        {d6, d7}, [r1], r3
    subs          r5, #2
    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5
    vld1.8        {d10, d11}, [r1], r3

    bne           loop_sad_16x16_ea8_2

    vabal.u8      q0, d10, d8
    vabal.u8      q1, d11, d9

    vadd.i16      q0, q0, q1
    vadd.i16      d0, d1, d0

    vpaddl.u16    d0, d0
    vpaddl.u32    d0, d0

    vst1.32       {d0[0]}, [r7]         @ full-block SAD

end_func_16x16_ea8:
    vpop          {d8-d15}
    ldmfd         sp!, {r5-r7, pc}


@*
@//---------------------------------------------------------------------------
@// Function Name      : Calculate_Mad2_prog()
@//
@// Detail Description : Computes the SAD of the source block against two
@//                      reference blocks (progressive) in one pass.
@//
@// Platform           : CortexA8/NEON .
@//
@//-----------------------------------------------------------------------------
@*

    .global ime_calculate_sad2_prog_a9q

ime_calculate_sad2_prog_a9q:

    @ r0 = ref1 <UWORD8 *>
    @ r1 = ref2 <UWORD8 *>
    @ r2 = src <UWORD8 *>
    @ r3 = RefBufferWidth <UWORD32>
    @ stack = CurBufferWidth <UWORD32>, psad <UWORD32 *>

    stmfd         sp!, {r4-r5, lr}

    @ BUGFIX: 3 registers (12 bytes) were pushed above, so the first stack
    @ argument lives at [sp, #12]; the previous offset #8 read the saved lr.
    ldr           r4, [sp, #12]         @ load src stride (CurBufferWidth) to r4
    mov           r5, #14
    vpush         {d8-d15}              @ AAPCS: d8-d15 are callee-saved
    @Row 1
    vld1.8        {d0, d1}, [r2], r4    @ load src Row 1
    vld1.8        {d2, d3}, [r0], r3    @ load ref1 Row 1
    vld1.8        {d4, d5}, [r1], r3    @ load ref2 Row 1

    @Row 2
    vld1.8        {d6, d7}, [r2], r4    @ load src Row 2
    vabdl.u8      q6, d2, d0            @ q6/q7 accumulate |src - ref1|
    vabdl.u8      q7, d3, d1
    vld1.8        {d8, d9}, [r0], r3    @ load ref1 Row 2
    vabdl.u8      q8, d4, d0            @ q8/q9 accumulate |src - ref2|
    vabdl.u8      q9, d5, d1
    vld1.8        {d10, d11}, [r1], r3  @ load ref2 Row 2

loop_sad2_prog:

    subs          r5, #2                @ two rows per iteration
    @Row 1
    vld1.8        {d0, d1}, [r2], r4    @ load src Row 1
    vabal.u8      q6, d8, d6
    vabal.u8      q7, d9, d7
    vld1.8        {d2, d3}, [r0], r3    @ load ref1 Row 1
    vabal.u8      q8, d10, d6
    vabal.u8      q9, d11, d7
    vld1.8        {d4, d5}, [r1], r3    @ load ref2 Row 1

    @Row 2
    vld1.8        {d6, d7}, [r2], r4    @ load src Row 2
    vabal.u8      q6, d2, d0
    vabal.u8      q7, d3, d1
    vld1.8        {d8, d9}, [r0], r3    @ load ref1 Row 2
    vabal.u8      q8, d4, d0
    vabal.u8      q9, d5, d1
    vld1.8        {d10, d11}, [r1], r3  @ load ref2 Row 2

    bne           loop_sad2_prog

    vabal.u8      q6, d8, d6            @ flush the last prefetched row pair
    vabal.u8      q7, d9, d7
    vabal.u8      q8, d10, d6
    vabal.u8      q9, d11, d7

    @ Compute SAD

    vadd.u16      q6, q6, q7            @ Q6 : sad_ref1
    vadd.u16      q8, q8, q9            @ Q8 : sad_ref2

    vadd.u16      d12, d12, d13
    @ BUGFIX: this load executes after the 64-byte vpush, so psad (2nd stack
    @ argument) is at 12 + 64 + 4 = [sp, #80]; the previous offset #16 read
    @ into the saved d8-d15 area.
    ldr           r5, [sp, #80]         @ loading pi4_sad to r5
    vadd.u16      d16, d16, d17

    vpadd.u16     d12, d12, d16
    vpaddl.u16    d12, d12

    vst1.64       {d12}, [r5]!          @ psad[0] = sad_ref1, psad[1] = sad_ref2
    vpop          {d8-d15}
    ldmfd         sp!, {r4-r5, pc}


@*
@//---------------------------------------------------------------------------
@// Function Name      : Calculate_Mad3_prog()
@//
@// Detail Description : Computes the SAD of the source block against three
@//                      reference blocks (progressive) in one pass.
@//
@// Platform           : CortexA8/NEON .
@//
@//-----------------------------------------------------------------------------
@*

    .global ime_calculate_sad3_prog_a9q

ime_calculate_sad3_prog_a9q:

    @ r0 = ref1 <UWORD8 *>
    @ r1 = ref2 <UWORD8 *>
    @ r2 = ref3 <UWORD8 *>
    @ r3 = src <UWORD8 *>
    @ stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *>

    stmfd         sp!, {r4-r6, lr}

    ldrd          r4, r5, [sp, #16]     @ r4 = ref stride, r5 = src stride
    mov           r6, #14
    vpush         {d8-d15}              @ AAPCS: d8-d15 are callee-saved
    @Row 1
    vld1.8        {d0, d1}, [r3], r5    @ load src Row 1
    vld1.8        {d2, d3}, [r0], r4    @ load ref1 Row 1
    vld1.8        {d4, d5}, [r1], r4    @ load ref2 Row 1
    vabdl.u8      q8, d2, d0            @ q8/q9   accumulate |src - ref1|
    vabdl.u8      q9, d3, d1
    vld1.8        {d6, d7}, [r2], r4    @ load ref3 Row 1
    vabdl.u8      q10, d4, d0           @ q10/q11 accumulate |src - ref2|
    vabdl.u8      q11, d5, d1

    @Row 2
    vld1.8        {d8, d9}, [r3], r5    @ load src Row 2
    vabdl.u8      q12, d6, d0           @ q12/q13 accumulate |src - ref3|
    vabdl.u8      q13, d7, d1
    vld1.8        {d10, d11}, [r0], r4  @ load ref1 Row 2
    vld1.8        {d12, d13}, [r1], r4  @ load ref2 Row 2
    vabal.u8      q8, d10, d8
    vabal.u8      q9, d11, d9
    vld1.8        {d14, d15}, [r2], r4  @ load ref3 Row 2
    vabal.u8      q10, d12, d8
    vabal.u8      q11, d13, d9

loop_sad3_prog:

    @Row 1
    vld1.8        {d0, d1}, [r3], r5    @ load src Row 1
    vabal.u8      q12, d14, d8
    vabal.u8      q13, d15, d9
    vld1.8        {d2, d3}, [r0], r4    @ load ref1 Row 1
    vld1.8        {d4, d5}, [r1], r4    @ load ref2 Row 1
    vabal.u8      q8, d2, d0
    vabal.u8      q9, d3, d1
    vld1.8        {d6, d7}, [r2], r4    @ load ref3 Row 1
    vabal.u8      q10, d4, d0
    vabal.u8      q11, d5, d1

    @Row 2
    vld1.8        {d8, d9}, [r3], r5    @ load src Row 2
    vabal.u8      q12, d6, d0
    vabal.u8      q13, d7, d1
    vld1.8        {d10, d11}, [r0], r4  @ load ref1 Row 2
    subs          r6, #2                @ two rows per iteration
    vld1.8        {d12, d13}, [r1], r4  @ load ref2 Row 2
    vabal.u8      q8, d10, d8
    vabal.u8      q9, d11, d9
    vld1.8        {d14, d15}, [r2], r4  @ load ref3 Row 2
    vabal.u8      q10, d12, d8
    vabal.u8      q11, d13, d9

    bne           loop_sad3_prog

    vabal.u8      q12, d14, d8          @ flush last prefetched ref3 row
    vabal.u8      q13, d15, d9

    @ Compute SAD

    vadd.u16      q8, q8, q9            @ Q8  : sad_ref1
    vadd.u16      q10, q10, q11         @ Q10 : sad_ref2
    vadd.u16      q12, q12, q13         @ Q12 : sad_ref3

    vadd.u16      d16, d16, d17
    vadd.u16      d20, d20, d21
    vadd.u16      d24, d24, d25

    vpadd.u16     d16, d16, d20
    vpadd.u16     d24, d24, d24

    @ BUGFIX: this load executes after the 64-byte vpush, so psad (3rd stack
    @ argument) is at 16 + 64 + 8 = [sp, #88]; the previous offset #24 read
    @ into the saved d8-d15 area.
    ldr           r6, [sp, #88]         @ loading pi4_sad to r6
    vpaddl.u16    d16, d16
    vpaddl.u16    d24, d24

    vst1.64       {d16}, [r6]!          @ psad[0] = sad_ref1, psad[1] = sad_ref2
    vst1.32       {d24[0]}, [r6]        @ psad[2] = sad_ref3
    vpop          {d8-d15}
    ldmfd         sp!, {r4-r6, pc}


@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) for sub-pel motion estimation
@*
@* @par Description
@*  This function computes SAD for all the 8 half pel points
@*
@* @param[out] pi4_sad
@*  integer evaluated sad
@*  pi4_sad[0] - half x
@*  pi4_sad[1] - half x - 1
@*  pi4_sad[2] - half y
@*  pi4_sad[3] - half y - 1
@*  pi4_sad[4] - half xy
@*  pi4_sad[5] - half xy - 1
@*  pi4_sad[6] - half xy - strd
@*  pi4_sad[7] - half xy - 1 - strd
@*
@* @remarks
@*
@******************************************************************************
@*

.text
.p2align 2

    .global ime_sub_pel_compute_sad_16x16_a9q

ime_sub_pel_compute_sad_16x16_a9q:

    stmfd         sp!, {r4-r11, lr}     @ store register values to stack

    ldr           r9, [sp, #36]         @ r9  = src stride (1st stack arg)
    ldr           r10, [sp, #40]        @ r10 = ref stride (2nd stack arg)
    vpush         {d8-d15}              @ AAPCS: d8-d15 are callee-saved
    sub           r4, r1, #1            @ x left
    sub           r5, r2, r10           @ y top

    sub           r6, r3, #1            @ xy left
    sub           r7, r3, r10           @ xy top

    sub           r8, r7, #1            @ xy top-left
    mov           r11, #15

    @for bringing buffer2 into cache..., dummy load instructions
    @ LDR r12,[r1]
    @ LDR r12,[sp,#12]

    vld1.8        {d0, d1}, [r0], r9    @ src
    vld1.8        {d2, d3}, [r5], r10   @ y top LOAD
    vld1.8        {d4, d5}, [r7], r10   @ xy top LOAD
    vld1.8        {d6, d7}, [r8], r10   @ xy top-left LOAD

    vabdl.u8      q6, d2, d0            @ y top ABS1
    vabdl.u8      q7, d4, d0            @ xy top ABS1
    vld1.8        {d8, d9}, [r1], r10   @ x LOAD
    vabdl.u8      q8, d6, d0            @ xy top-left ABS1
    vabdl.u8      q9, d8, d0            @ x ABS1
    vld1.8        {d10, d11}, [r4], r10 @ x left LOAD

    vabal.u8      q6, d3, d1            @ y top ABS2
    vabal.u8      q7, d5, d1            @ xy top ABS2
    vld1.8        {d2, d3}, [r2], r10   @ y LOAD
    vabal.u8      q8, d7, d1            @ xy top-left ABS2
    vabal.u8      q9, d9, d1            @ x ABS2
    vld1.8        {d4, d5}, [r3], r10   @ xy LOAD

    vabdl.u8      q10, d10, d0          @ x left ABS1
    vabdl.u8      q11, d2, d0           @ y ABS1
    vld1.8        {d6, d7}, [r6], r10   @ xy left LOAD
    vabdl.u8      q12, d4, d0           @ xy ABS1
    vabdl.u8      q13, d6, d0           @ xy left ABS1

loop_sub_pel_16x16:

    vabal.u8      q10, d11, d1          @ x left ABS2
    vabal.u8      q11, d3, d1           @ y ABS2
    subs          r11, #1               @ one row per iteration
    vabal.u8      q12, d5, d1           @ xy ABS2
    vabal.u8      q13, d7, d1           @ xy left ABS2

    vld1.8        {d0, d1}, [r0], r9    @ src
    vabal.u8      q6, d2, d0            @ y top ABS1
    vabal.u8      q7, d4, d0            @ xy top ABS1
    vld1.8        {d8, d9}, [r1], r10   @ x LOAD
    vabal.u8      q8, d6, d0            @ xy top-left ABS1
    vabal.u8      q9, d8, d0            @ x ABS1
    vld1.8        {d10, d11}, [r4], r10 @ x left LOAD

    vabal.u8      q6, d3, d1            @ y top ABS2
    vabal.u8      q7, d5, d1            @ xy top ABS2
    vld1.8        {d2, d3}, [r2], r10   @ y LOAD
    vabal.u8      q8, d7, d1            @ xy top-left ABS2
    vabal.u8      q9, d9, d1            @ x ABS2
    vld1.8        {d4, d5}, [r3], r10   @ xy LOAD

    vabal.u8      q10, d10, d0          @ x left ABS1
    vabal.u8      q11, d2, d0           @ y ABS1
    vld1.8        {d6, d7}, [r6], r10   @ xy left LOAD
    vabal.u8      q12, d4, d0           @ xy ABS1
    vabal.u8      q13, d6, d0           @ xy left ABS1

    bne           loop_sub_pel_16x16

    vabal.u8      q10, d11, d1          @ x left ABS2
    vabal.u8      q11, d3, d1           @ y ABS2
    vabal.u8      q12, d5, d1           @ xy ABS2
    vabal.u8      q13, d7, d1           @ xy left ABS2

    vadd.i16      d0, d18, d19          @ x
    vadd.i16      d3, d12, d13          @ y top
    vadd.i16      d6, d14, d15          @ xy top
    vadd.i16      d5, d26, d27          @ xy left
    vadd.i16      d1, d20, d21          @ x left
    vadd.i16      d2, d22, d23          @ y
    vadd.i16      d4, d24, d25          @ xy
    vadd.i16      d7, d16, d17          @ xy top left

    vpadd.i16     d0, d0, d1
    vpadd.i16     d2, d2, d3
    vpadd.i16     d4, d4, d5
    vpadd.i16     d6, d6, d7

    vpaddl.u16    d0, d0
    vpaddl.u16    d2, d2
    vpop          {d8-d15}
    ldr           r11, [sp, #44]        @ pi4_sad (3rd stack arg, after vpop)
    vpaddl.u16    d4, d4
    vpaddl.u16    d6, d6

    vst1.32       {d0}, [r11]!
    vst1.32       {d2}, [r11]!
    vst1.32       {d4}, [r11]!
    vst1.32       {d6}, [r11]!

    ldmfd         sp!, {r4-r11, pc}     @ Restoring registers from stack


@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks
@*
@* @par Description
@*  This function computes SAD between 2 16x16 blocks. There is a provision
@* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
@* compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] i4_max_sad
@*  integer maximum allowed distortion (unused here; full block is summed)
@*
@* @param[in] pi4_mb_distortion
@*  pointer through which the evaluated SAD is returned
@*
@* @remarks
@*
@******************************************************************************
@*

.text
.p2align 2

    .global ime_compute_sad_16x16_a9q

ime_compute_sad_16x16_a9q:

    stmfd         sp!, {r12, r14}       @ store register values to stack

    @for bringing buffer2 into cache..., dummy load instructions
    @ LDR r12,[r1]
    @ LDR r12,[sp,#12]

    vld1.8        {d4, d5}, [r0], r2
    vld1.8        {d6, d7}, [r1], r3
    vpush         {d8-d15}              @ AAPCS: d8-d15 are callee-saved
    mov           r12, #14
    vld1.8        {d8, d9}, [r0], r2
    vabdl.u8      q0, d4, d6            @ q0/q1 accumulate |src - ref| as u16
    vld1.8        {d10, d11}, [r1], r3
    vabdl.u8      q1, d5, d7

loop_sad_16x16:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d8, d10
    vld1.8        {d6, d7}, [r1], r3
    vabal.u8      q1, d9, d11

    vld1.8        {d8, d9}, [r0], r2
    vabal.u8      q0, d4, d6
    subs          r12, #2               @ two rows per iteration
    vld1.8        {d10, d11}, [r1], r3
    vabal.u8      q1, d5, d7

    bne           loop_sad_16x16

    vabal.u8      q0, d8, d10           @ flush the last prefetched row pair
    vabal.u8      q1, d9, d11

    vadd.i16      q0, q0, q1            @ horizontal reduction of 16 u16 lanes
    vadd.i16      d0, d1, d0
    vpop          {d8-d15}
    ldr           r12, [sp, #12]        @ pi4_mb_distortion (2nd stack arg)

    vpaddl.u16    d0, d0
    vpaddl.u32    d0, d0
    vst1.32       {d0[0]}, [r12]

    ldmfd         sp!, {r12, pc}        @ Restoring registers from stack


@*
@//---------------------------------------------------------------------------
@// Function Name      : Calculate_Mad4_prog()
@//
@// Detail Description : Computes the SAD of the source block against the four
@//                      single-pel neighbours (left/right/top/bottom) of a
@//                      reference position in one pass.
@//
@// Platform           : CortexA8/NEON .
@//
@//-----------------------------------------------------------------------------
@*

    .global ime_calculate_sad4_prog_a9q

ime_calculate_sad4_prog_a9q:
    @ r0 = temp_frame <UWORD8 *>
    @ r1 = buffer_ptr <UWORD8 *>
    @ r2 = RefBufferWidth <UWORD32>
    @ r3 = CurBufferWidth <UWORD32>
    @ stack = psad <UWORD32 *>

    stmfd         sp!, {r4-r7, lr}

    @UWORD8 *left_ptr  = temp_frame - 1;
    @UWORD8 *right_ptr = temp_frame + 1;
    @UWORD8 *top_ptr   = temp_frame - RefBufferWidth;
    @UWORD8 *bot_ptr   = temp_frame + RefBufferWidth;

    mov           r7, #14
    sub           r4, r0, #0x01         @ r4 = left_ptr
    add           r5, r0, #0x1          @ r5 = right_ptr
    sub           r6, r0, r2            @ r6 = top_ptr
    add           r0, r0, r2            @ r0 = bot_ptr
    @r1 = buffer_ptr
    vpush         {d8-d15}              @ AAPCS: d8-d15 are callee-saved

    @D0:D1 : buffer
    @D2:D3 : top
    @D4:D5 : left
    @D6:D7 : right
    @D8:D9 : bottom

    @Row 1
    vld1.8        {d0, d1}, [r1], r3    @ load src Row 1
    vld1.8        {d2, d3}, [r6], r2    @ load top Row 1
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 1

    vabdl.u8      q5, d2, d0            @ q5/q6   : |src - top|
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 1
    vabdl.u8      q6, d3, d1

    vabdl.u8      q7, d0, d4            @ q7/q8   : |src - left|
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 1
    vabdl.u8      q8, d1, d5

    @Row 2
    vabdl.u8      q9, d0, d6            @ q9/q10  : |src - right|
    vld1.8        {d26, d27}, [r1], r3  @ load src Row 2
    vabdl.u8      q10, d1, d7

    vabdl.u8      q11, d0, d8           @ q11/q12 : |src - bottom|
    vld1.8        {d2, d3}, [r6], r2    @ load top Row 2
    vabdl.u8      q12, d1, d9

loop_sad4_prog:

    vabal.u8      q5, d26, d2
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 2
    vabal.u8      q6, d27, d3

    vabal.u8      q7, d26, d4
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 2
    vabal.u8      q8, d27, d5

    vabal.u8      q9, d26, d6
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 2
    vabal.u8      q10, d27, d7

    @Row 1
    vabal.u8      q11, d26, d8
    vld1.8        {d0, d1}, [r1], r3    @ load src Row 1
    vabal.u8      q12, d27, d9

    vld1.8        {d2, d3}, [r6], r2    @ load top Row 1
    subs          r7, #2                @ two rows per iteration
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 1

    vabal.u8      q5, d0, d2
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 1
    vabal.u8      q6, d1, d3

    vabal.u8      q7, d0, d4
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 1
    vabal.u8      q8, d1, d5

    @Row 2
    vabal.u8      q9, d0, d6
    vld1.8        {d26, d27}, [r1], r3  @ load src Row 2
    vabal.u8      q10, d1, d7

    vabal.u8      q11, d0, d8
    vld1.8        {d2, d3}, [r6], r2    @ load top Row 2
    vabal.u8      q12, d1, d9

    bne           loop_sad4_prog

    vabal.u8      q5, d26, d2           @ flush the last prefetched row pair
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 2
    vabal.u8      q6, d27, d3

    vabal.u8      q7, d26, d4
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 2
    vabal.u8      q8, d27, d5

    vabal.u8      q9, d26, d6
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 2
    vabal.u8      q10, d27, d7

    vabal.u8      q11, d26, d8
    vabal.u8      q12, d27, d9

    @;Q5:Q6   : sad_top
    @;Q7:Q8   : sad_left
    @;Q9:Q10  : sad_right
    @;Q11:Q12 : sad_bot

    vadd.u16      q5, q5, q6
    vadd.u16      q7, q7, q8
    vadd.u16      q9, q9, q10
    vadd.u16      q11, q11, q12

    @; Free :- Q6,Q8,Q10,Q12

    vadd.u16      d10, d10, d11
    vadd.u16      d14, d14, d15
    vadd.u16      d18, d18, d19
    vadd.u16      d22, d22, d23

    @;D10 : sad_top
    @;D14 : sad_left
    @;D18 : sad_right
    @;D22 : sad_bot

    vpaddl.u16    d11, d10
    vpaddl.u16    d15, d14
    vpaddl.u16    d19, d18
    vpaddl.u16    d23, d22

    vpaddl.u32    d10, d11
    vpaddl.u32    d22, d23
    vpaddl.u32    d14, d15
    vpaddl.u32    d18, d19

    ldr           r4, [sp, #84]         @ psad: 20 (stmfd) + 64 (vpush) above it

    vsli.64       d10, d22, #32         @ d10 = {sad_top, sad_bot}
    vsli.64       d14, d18, #32         @ d14 = {sad_left, sad_right}

    vst1.64       {d14}, [r4]!          @ psad[0..1] = left, right
    vst1.64       {d10}, [r4]!          @ psad[2..3] = top, bot
    vpop          {d8-d15}
    ldmfd         sp!, {r4-r7, pc}


@*****************************************************************************
@*
@* Function Name      : ime_compute_satqd_16x16_lumainter_a9
@* Description        : This function computes SAD for a 16x16 block.
@                     : It also computes if any 4x4 block will have a nonzero
@                       coefficient after transform and quant
@
@ Arguments           : R0 :pointer to src buffer
@                       R1 :pointer to est buffer
@                       R2 :source stride
@                       R3 :est stride
@                       STACK :Threshold,distortion,is_nonzero
@*
@* Values Returned    : NONE
@*
@* Register Usage     : R0-R11
@* Stack Usage        :
@* Cycles             : Around
@* Interruptibility   : Interruptible
@*
@* Known Limitations
@*   \Assumptions     :
@*
@* Revision History   :
@*   DD MM YYYY    Author(s)           Changes
@*   14 04 2014    Harinarayanan K K   First version
@*
@*****************************************************************************
    .global ime_compute_satqd_16x16_lumainter_a9q
ime_compute_satqd_16x16_lumainter_a9q:
    @R0 :pointer to src buffer
    @R1 :pointer to est buffer
    @R2 :Source stride
    @R3 :Pred stride
    @R4 :Threshold pointer
    @R5 :Distortion, ie SAD
    @R6 :is nonzero

    push          {r4-r12, lr}          @ push all the variables first
    ldr           r4, [sp, #40]         @ load the threshold address
    vpush
{d8-d15} 989 mov r8, #8 @Number of 4x8 blocks to be processed 990 mov r10, #0 @Sad 991 mov r7, #0 @Nonzero info 992 @---------------------------------------------------- 993 994 vld1.u8 d30, [r0], r2 @I load 8 pix src row 1 995 996 vld1.u8 d31, [r1], r3 @I load 8 pix pred row 1 997 998 vld1.u8 d28, [r0], r2 @I load 8 pix src row 2 999 1000 vld1.u8 d29, [r1], r3 @I load 8 pix pred row 2 1001 1002 vld1.u8 d26, [r0], r2 @I load 8 pix src row 3 1003 vabdl.u8 q0, d30, d31 @I Abs diff r1 blk 12 1004 1005 vld1.u8 d27, [r1], r3 @I load 8 pix pred row 3 1006 1007 vld1.u8 d24, [r0], r2 @I load 8 pix src row 4 1008 1009 vld1.u8 d25, [r1], r3 @I load 8 pix pred row 4 1010 vabdl.u8 q1, d28, d29 @I Abs diff r1 blk 12 1011 1012 vld1.u16 {q11}, [r4] @I load the threhold 1013 vabdl.u8 q2, d26, d27 @I Abs diff r1 blk 12 1014 1015 vabdl.u8 q3, d24, d25 @I Abs diff r1 blk 12 1016 1017 1018 1019core_loop: 1020 @S1 S2 S3 S4 A1 A2 A3 A4 1021 @S5 S6 S7 S8 A5 A6 A7 A8 1022 @S9 S10 S11 S12 A9 A10 A11 A12 1023 @S13 S14 S15 S16 A13 A14 A15 A16 1024 ands r11, r8, #1 @II See if we are at even or odd block 1025 vadd.u16 q4 , q0, q3 @I Add r1 r4 1026 lsl r11, r2, #2 @II Move back src 4 rows 1027 1028 subeq r0, r0, r11 @II Move back src 4 rows if we are at even block 1029 vadd.u16 q5 , q1, q2 @I Add r2 r3 1030 addeq r0, r0, #8 @II Move src 8 cols forward if we are at even block 1031 1032 lsl r11, r3, #2 @II Move back pred 4 rows 1033 vtrn.16 d8 , d10 @I trnspse 1 1034 subeq r1, r1, r11 @II Move back pred 4 rows if we are at even block 1035 1036 addeq r1, r1, #8 @II Move pred 8 cols forward if we are at even block 1037 vtrn.16 d9 , d11 @I trnspse 2 1038 subne r0, r0, #8 @II Src 8clos back for odd rows 1039 1040 subne r1, r1, #8 @II Pred 8 cols back for odd rows 1041 vtrn.32 d10, d11 @I trnspse 4 1042 1043 1044 vtrn.32 d8 , d9 @I trnspse 3 1045 vswp d10, d11 @I rearrange so that the q4 and q5 add properly 1046 @D8 S1 S4 A1 A4 1047 @D9 S2 S3 A2 A3 1048 @D11 S1 S4 A1 A4 1049 @D10 S2 S3 A2 A3 1050 1051 
vadd.s16 q6, q4, q5 @I Get s1 s4 1052 vld1.u8 d30, [r0], r2 @II load first 8 pix src row 1 1053 1054 vtrn.s16 d12, d13 @I Get s2 s3 1055 @D12 S1 S4 A1 A4 1056 @D13 S2 S3 A2 A3 1057 1058 vshl.s16 q7, q6 , #1 @I si = si<<1 1059 vld1.u8 d31, [r1], r3 @II load first 8 pix pred row 1 1060 1061 vpadd.s16 d16, d12, d13 @I (s1 + s4) (s2 + s3) 1062 vld1.u8 d28, [r0], r2 @II load first 8 pix src row 2 1063 @ D16 S14 A14 S23 A23 1064 vrev32.16 d0, d16 @I 1065 vuzp.s16 d16, d0 @I 1066 @D16 S14 S23 A14 A23 1067 vadd.s16 d17, d12, d13 @I (s1 + s2) (s3 + s4) 1068 vld1.u8 d29, [r1], r3 @II load first 8 pix pred row 2 1069 @D17 S12 S34 A12 A34 1070 1071 vrev32.16 q9, q7 @I Rearrange si's 1072 @Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2 1073 1074 @D12 S1 S4 A1 A4 1075 @D19 Z3 Z2 Y3 Y2 1076 vsub.s16 d8, d12, d19 @I (s1 - (s3<<1)) (s4 - (s2<<1)) 1077 vld1.u8 d26, [r0], r2 @II load first 8 pix src row 3 1078 @D13 S2 S3 A2 A3 1079 @D18 Z4 Z1 Y4 Y1 1080 vsub.s16 d9, d13, d18 @I (s2 - (s4<<1)) (s3 - (s1<<1)) 1081 vld1.u8 d27, [r1], r3 @II load first 8 pix pred row 3 1082 @Q10 S8 S5 A8 A5 S7 S4 A7 A4 1083 1084 @D16 S14 S23 A14 A23 1085 vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4 1086 vld1.u8 d24, [r0], r2 @II load first 8 pix src row 4 1087 @D22 SAD1 SAD2 junk junk 1088 1089 1090 @Q8 S2 S1 A2 A1 S6 S3 A6 A3 1091 @Q10 S8 S5 A8 A5 S7 S4 A7 A4 1092 vtrn.32 q8, q4 @I Rearrange to make ls of each block togather 1093 @Q8 S2 S1 S8 S5 S6 S3 S7 S4 1094 @Q10 A2 A1 A8 A5 A6 A3 A7 A4 1095 1096 1097 ldrh r11, [r4, #16] @I Load the threshold for DC val blk 1 1098 vdup.s16 q6, d10[0] @I Get the sad blk 1 1099 vabdl.u8 q0, d30, d31 @II Abs diff r1 blk 12 1100 1101 vshl.s16 q7, q6, #1 @I sad_2 = sad_1<<1 1102 vmov.s16 r9, d10[0] @I Get the sad for block 1 1103 1104 vsub.s16 q9, q7, q8 @I Add to the lss 1105 vmov.s16 r5, d10[1] @I Get the sad for block 2 1106 1107 vcle.s16 q7, q11, q9 @I Add to the lss 1108 vld1.u8 d25, [r1], r3 @II load first 8 pix pred row 4 1109 1110 vdup.s16 q15, d10[1] @I Get the sad blk 
@ ---------------------------------------------------------------------------
@ Tail of ime_compute_satqd_16x16_lumainter_a9q (entry and prologue are above
@ this fragment).  Register roles as visible from this code:
@   r0/r1 = src/pred pointers, r2/r3 = src/pred strides,
@   r4    = pointer to threshold table (8 halfwords, DC threshold at +16),
@   r7    = "non-zero block" flag accumulator, r8 = 8x8 block counter,
@   r10   = running global SAD, r9/r5/r11 = scratch / per-block SADs.
@ NOTE(review): roles are inferred from this fragment only — confirm against
@ the function prologue, which is outside this view.
@ ---------------------------------------------------------------------------

    vabdl.u8 q1, d28, d29           @II Abs diff of row 1, blocks 1-2

    vshl.s16 q14, q15, #1           @I sad_2 = sad_1 << 1
    vsub.s16 q3, q14, q4            @I lss terms: (sad << 1) - ls
    vcle.s16 q15, q11, q3           @I per-lane compare vs threshold

    ADD R10, R10, R9                @I add blk 1 SAD to the global SAD
    vtrn.u8 q15, q7                 @I gather all comparison bits into one reg
    vabdl.u8 q2, d26, d27           @II Abs diff of row 1, blocks 1-2

    ADD R10, R10, R5                @I add blk 2 SAD to the global SAD
    vshr.u8 q14, q15, #7            @I shift the bits down so the adds cannot overflow
    cmp r11, r9                     @I compare blk 1 SAD with the DC threshold

    movle r7, #0xf                  @I threshold not met: mark blk 1 by making r7 non-zero
    vadd.u8 d28, d28, d29           @I add the comparison bits
    cmp r11, r5                     @I compare blk 2 SAD with the DC threshold

    movle r7, #0xf                  @I threshold not met: mark blk 2 by making r7 non-zero
    vpadd.u8 d28, d28, d29          @I add the comparison bits

    vmov.u32 r11, d28[0]            @I a set bit now marks an unsatisfied condition; store it in r11
    vabdl.u8 q3, d24, d25           @II Abs diff of row 1, blocks 1-2

    orr r7, r7, r11                 @I fold the NEON comparison result into the flag reg

    sub r8, r8, #1                  @I decrement the 8x8 block count

    cmp r7, #0                      @I do we have at least one non-zero block?
    bne compute_sad_only            @I if a non-zero block is there, from now on compute SAD only

    cmp r8, #1                      @I see if we are at the last block
    bne core_loop                   @I blocks are zero so far: continue the satqd loop

    @EPILOGUE for core loop
    @S1  S2  S3  S4    A1  A2  A3  A4
    @S5  S6  S7  S8    A5  A6  A7  A8
    @S9  S10 S11 S12   A9  A10 A11 A12
    @S13 S14 S15 S16   A13 A14 A15 A16
    vadd.u16 q4 , q0, q3            @add rows 1 and 4
    vadd.u16 q5 , q1, q2            @add rows 2 and 3
    @D8  S1 S2 S2 S1
    @D10 S4 S3 S3 S4
    @D9  A1 A2 A2 A1
    @D11 A4 A3 A3 A4
    vtrn.16 d8 , d10                @I transpose step 1
    vtrn.16 d9 , d11                @I transpose step 2
    vtrn.32 d8 , d9                 @I transpose step 3
    vtrn.32 d10, d11                @I transpose step 4

    vswp d10, d11                   @I rearrange so that q4 and q5 add lane-to-lane
    @D8  S1 S4 A1 A4
    @D9  S2 S3 A2 A3
    @D11 S1 S4 A1 A4
    @D10 S2 S3 A2 A3
    vadd.s16 q6, q4, q5             @get s1 s4
    vtrn.s16 d12, d13               @get s2 s3
    @D12 S1 S4 A1 A4
    @D13 S2 S3 A2 A3

    vshl.s16 q7, q6 , #1            @si = si << 1
    vmov.s16 r9, d10[0]             @get the SAD for block 1
                                    @NOTE(review): d10 is recomputed below before r9/r5 are
                                    @re-read; this early read looks stale — confirm intent

    vpadd.s16 d16, d12, d13         @(s1 + s4) (s2 + s3)
    vmov.s16 r5, d10[1]             @get the SAD for block 2 (same stale-read note as above)
    @D16 S14 A14 S23 A23
    vrev32.16 d30, d16              @
    vuzp.s16 d16, d30               @
    @D16 S14 S23 A14 A23
    vadd.s16 d17, d12, d13          @(s1 + s2) (s3 + s4)
    @D17 S12 S34 A12 A34

    vrev32.16 q9, q7                @rearrange the si's
    @Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2

    @D12 S1 S4 A1 A4
    @D19 Z3 Z2 Y3 Y2
    vsub.s16 d8, d12, d19           @(s1 - (s3<<1)) (s4 - (s2<<1))
    @D13 S2 S3 A2 A3
    @D18 Z4 Z1 Y4 Y1
    vsub.s16 d9, d13, d18           @(s2 - (s4<<1)) (s3 - (s1<<1))
    @Q10 S8 S5 A8 A5 S7 S4 A7 A4

    @D16 S14 S23 A14 A23
    vpadd.s16 d10, d16, d17         @I get SAD by adding s1 s2 s3 s4
    @D22 SAD1 SAD2 junk junk
    vmov.u16 r9, d10[0]             @get the SAD for block 1
    vmov.u16 r5, d10[1]             @get the SAD for block 2

    @Q8  S2 S1 A2 A1 S6 S3 A6 A3
    @Q10 S8 S5 A8 A5 S7 S4 A7 A4
    ldrh r11, [r4, #16]             @load the threshold for the DC value, blk 1
    vtrn.32 q8, q4                  @rearrange so the ls of each block are together
    ADD R10, R10, R9                @add blk 1 SAD to the global SAD

    @Q8  S2 S1 S8 S5 S6 S3 S7 S4
    @Q10 A2 A1 A8 A5 A6 A3 A7 A4

    vld1.u16 {q11}, [r4]            @load the threshold table
    ADD R10, R10, R5                @add blk 2 SAD to the global SAD

    vdup.u16 q6, d10[0]             @broadcast the blk 1 SAD

    cmp r11, r9                     @compare blk 1 SAD with the DC threshold
    vshl.u16 q7, q6, #1             @sad_2 = sad_1 << 1

    vsub.s16 q9, q7, q8             @lss terms, blk 1

    vcle.s16 q15, q11, q9           @per-lane threshold compare, blk 1
    movle r7, #0xf                  @threshold not met: mark blk 1 by making r7 non-zero

    cmp r11, r5                     @compare blk 2 SAD with the DC threshold
    vdup.u16 q14, d10[1]            @broadcast the blk 2 SAD

    vshl.u16 q13, q14, #1           @sad_2 = sad_1 << 1
    vsub.s16 q12, q13, q4           @lss terms, blk 2
    vcle.s16 q14, q11, q12          @per-lane threshold compare, blk 2
    movle r7, #0xf                  @threshold not met: mark blk 2 by making r7 non-zero

    vtrn.u8 q14, q15                @gather all comparison bits into one reg
    vshr.u8 q14, q14, #7            @shift the bits down so the adds cannot overflow
    vadd.u8 d28, d28, d29           @add the bits
    vpadd.u8 d28, d28, d29          @add the bits
    vmov.u32 r11, d28[0]            @a set bit now marks an unsatisfied condition; store it in r11
    orr r7, r7, r11                 @fold the NEON comparison result into the flag reg

    b funcend_sad_16x16             @all blocks are processed now, go to the end

compute_sad_only:                   @This path computes SAD only, so it is lighter.
                                    @It may start processing at an odd block: it first
                                    @computes SAD for the odd block, then proceeds two
                                    @blocks at a time.  r8 blocks remain to be processed.

    and r11, r8, #1                 @get the last bit of the counter
    cmp r11, #0                     @see if we are at an even or an odd block
                                    @if the block is even we just have to set the pointers
                                    @back to the start of the current row

    lsleq r11, r2, #2               @I src stride * 4
    subeq r0, r0, r11               @I move src back 4 rows if we are at an even block

    lsleq r11, r3, #2               @I pred stride * 4
    subeq r1, r1, r11               @I move pred back 4 rows if we are at an even block
    @ADDEQ R8,R8,#2 ;Inc counter
    beq skip_odd_blk                @if the block is odd we have to compute its SAD first

    vadd.u16 q4, q0, q1             @add SAD of rows 1 and 2
    vadd.u16 q5, q2, q3             @add SAD of rows 3 and 4
    vadd.u16 q6, q4, q5             @add SAD of rows 1-4
    vadd.u16 d14, d12, d13          @add blk 1 and blk 2
    vpadd.u16 d16, d14, d15         @add cols 1-2 and 3-4
    vpadd.u16 d18, d16, d17         @add cols 12-34

    vmov.u16 r9, d18[0]             @move the SAD to an ARM reg
    ADD R10, R10, R9                @add to the global SAD

    sub r8, r8, #1                  @decrement the counter
    cmp r8, #0                      @see if we processed the last block
    beq funcend_sad_16x16           @if we processed the last block, go to the end of the function

    sub r0, r0, #8                  @odd block processed: move src back 8 columns
    sub r1, r1, #8                  @odd block processed: move pred back 8 columns

skip_odd_blk:

    vmov.s16 q0, #0                 @initialize the accumulator
    vmov.s16 q1, #0                 @initialize the accumulator

    vld1.u8 {q15}, [r0], r2         @load src row 1
    vld1.u8 {q14}, [r1], r3         @load pred row 1

    vld1.u8 {q13}, [r0], r2         @load src row 2
    vld1.u8 {q12}, [r1], r3         @load pred row 2

    vld1.u8 {q11}, [r0], r2         @load src row 3
    vld1.u8 {q10}, [r1], r3         @load pred row 3

    vld1.u8 {q9}, [r0], r2          @load src row 4
    vld1.u8 {q8}, [r1], r3          @load pred row 4

    cmp r8, #2                      @only two blocks left: skip the pipelined loop
    beq sad_epilouge

sad_loop:

    vabal.u8 q0, d30, d28           @I accumulate abs diff row 1
    vabal.u8 q1, d31, d29           @I accumulate abs diff row 1

    vld1.u8 {q15}, [r0], r2         @II load src row 1
    vabal.u8 q0, d26, d24           @I accumulate abs diff row 2

    vld1.u8 {q14}, [r1], r3         @II load pred row 1
    vabal.u8 q1, d27, d25           @I accumulate abs diff row 2

    vld1.u8 {q13}, [r0], r2         @II load src row 2
    vabal.u8 q0, d22, d20           @I accumulate abs diff row 3

    vld1.u8 {q12}, [r1], r3         @II load pred row 2
    vabal.u8 q1, d23, d21           @I accumulate abs diff row 3

    vld1.u8 {q11}, [r0], r2         @II load src row 3
    vabal.u8 q0, d18, d16           @I accumulate abs diff row 4

    sub r8, r8, #2                  @we process two 8x8 blocks (16 pixels) at a time, dec by 2
    vld1.u8 {q10}, [r1], r3         @II load pred row 3
    vabal.u8 q1, d19, d17           @I accumulate abs diff row 4

    cmp r8, #2                      @check if this is the last loop
    vld1.u8 {q9}, [r0], r2          @II load src row 4
    vld1.u8 {q8}, [r1], r3          @II load pred row 4

    bne sad_loop                    @go back to the SAD computation

sad_epilouge:                       @(label spelling kept as-is; it is referenced above)
    vabal.u8 q0, d30, d28           @accumulate abs diff row 1
    vabal.u8 q1, d31, d29           @accumulate abs diff row 1

    vabal.u8 q0, d26, d24           @accumulate abs diff row 2
    vabal.u8 q1, d27, d25           @accumulate abs diff row 2

    vabal.u8 q0, d22, d20           @accumulate abs diff row 3
    vabal.u8 q1, d23, d21           @accumulate abs diff row 3

    vabal.u8 q0, d18, d16           @accumulate abs diff row 4
    vabal.u8 q1, d19, d17           @accumulate abs diff row 4

    vadd.u16 q2, q0, q1             @add the two accumulators
    vadd.u16 d6, d4, d5             @add the two block SADs
    vpadd.u16 d8, d6, d7            @add cols 1-2 and 3-4 SAD
    vpadd.u16 d10, d8, d9           @add cols 12-34 SAD

    vmov.u16 r9, d10[0]             @move the SAD to an ARM reg
    ADD R10, R10, R9                @add to the global SAD

funcend_sad_16x16:                  @end of function processing

    vpop {d8-d15}
    ldr r5, [sp, #44]               @reload stacked arg: SAD output pointer
                                    @NOTE(review): offsets #44/#48 assume the prologue pushed
                                    @r4-r12,lr — confirm against the (unseen) function entry
    ldr r6, [sp, #48]               @reload stacked arg: is-zero flag output pointer

    str r7, [r6]                    @store the is-zero flag
    str r10, [r5]                   @store the accumulated SAD

    @SUB SP,SP,#40
    pop {r4-r12, pc}