//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
//**

///**
//******************************************************************************
//*
//*
//* @brief
//*  This file contains definitions of routines that compute distortion
//*  between two macro/sub blocks of identical dimensions
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*  - ime_compute_sad_16x16()
//*  - ime_compute_sad_8x8()
//*  - ime_compute_sad_4x4()
//*  - ime_compute_sad_16x8()
//*  - ime_compute_satqd_16x16_lumainter_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//


///**
//******************************************************************************
//*
//* @brief estimates distortion (SAD) between 2 16x16 blocks (fast mode)
//*
//* @par Description
//*  Both strides are doubled, so only every other row is visited (8 rows in
//*  total). The accumulated sum is then doubled before it is stored.
//*  i4_max_sad is part of the argument list but is never read here, so this
//*  routine has no early exit.
//*
//* @param[in] pu1_src (x0)
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_dst (x1)
//*  UWORD8 pointer to the block the source is compared against
//*
//* @param[in] src_strd (w2)
//*  integer source stride
//*
//* @param[in] dst_strd (w3)
//*  integer destination stride
//*
//* @param[in] i4_max_sad (w4)
//*  integer maximum allowed distortion (unused)
//*
//* @param[out] pi4_mb_distortion (x5)
//*  receives the 32-bit SAD estimate
//*
//* @remarks
//*
//******************************************************************************
//*/
.text
.p2align 2

// Spill d8-d15 to the stack as four 16-byte pairs.
.macro push_v_regs
    stp       d8, d9, [sp, #-16]!
    stp       d10, d11, [sp, #-16]!
    stp       d12, d13, [sp, #-16]!
    stp       d14, d15, [sp, #-16]!
.endm
// Reload d8-d15 in the reverse order of push_v_regs.
.macro pop_v_regs
    ldp       d14, d15, [sp], #16
    ldp       d12, d13, [sp], #16
    ldp       d10, d11, [sp], #16
    ldp       d8, d9, [sp], #16
.endm

    .global ime_compute_sad_16x16_fast_av8
ime_compute_sad_16x16_fast_av8:
    push_v_regs
    movi      v30.8h, #0                // eight 16-bit running sums
    mov       x6, #2                    // 2 passes x 4 sampled rows
    sbfiz     x2, x2, #1, #32           // x2 = 2 * sign-extended src_strd
    sbfiz     x3, x3, #1, #32           // x3 = 2 * sign-extended dst_strd

1:
    // fetch four sampled rows from each block
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    // fold |src - dst| of every byte into v30
    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b
    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    subs      x6, x6, #1
    b.ne      1b

    uaddlv    s30, v30.8h               // add the eight partial sums
    shl       v30.2s, v30.2s, #1        // double: only half the rows were read
    str       s30, [x5]
    pop_v_regs
    ret


///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) between 2 16x8 blocks
//*
//*
//* @par Description
//*  Adds up the absolute differences of all 8 rows of 16 pixels.
//*  u4_max_sad is part of the argument list but is never read here, so this
//*  routine has no early exit.
//*
//* @param[in] pu1_src (x0)
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_dst (x1)
//*  UWORD8 pointer to the block the source is compared against
//*
//* @param[in] src_strd (w2)
//*  integer source stride
//*
//* @param[in] dst_strd (w3)
//*  integer destination stride
//*
//* @param[in] u4_max_sad (w4)
//*  integer maximum allowed distortion (unused)
//*
//* @param[out] pi4_mb_distortion (x5)
//*  receives the 32-bit SAD
//*
//* @remarks
//*
//******************************************************************************
//*/
//
    .global ime_compute_sad_16x8_av8
ime_compute_sad_16x8_av8:

    // Strides are used as passed in (no doubling): every row is visited.
    push_v_regs
    movi      v30.8h, #0                // eight 16-bit running sums
    mov       x6, #2                    // 2 passes x 4 rows = 8 rows
    sxtw      x2, w2
    sxtw      x3, w3

1:
    // fetch four consecutive rows from each block
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    // fold |src - dst| of every byte into v30
    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b
    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    subs      x6, x6, #1
    b.ne      1b

    uaddlv    s30, v30.8h               // add the eight partial sums
    str       s30, [x5]
    pop_v_regs
    ret

///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) between 2 16x16 blocks with
//* early exit
//*
//* @par Description
//*  The SAD of the 8 even rows is computed first and compared (signed)
//*  against i4_max_sad. If it is greater, that partial SAD is stored and the
//*  routine returns. Otherwise the 8 odd rows are added and the SAD of all
//*  16 rows is stored.
//*
//* @param[in] pu1_src (x0)
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_dst (x1)
//*  UWORD8 pointer to the block the source is compared against
//*
//* @param[in] src_strd (w2)
//*  integer source stride
//*
//* @param[in] dst_strd (w3)
//*  integer destination stride
//*
//* @param[in] i4_max_sad (w4)
//*  integer maximum allowed distortion
//*
//* @param[out] pi4_mb_distortion (x5)
//*  receives the 32-bit SAD (partial on early exit)
//*
//* @remarks
//*
//******************************************************************************
//*/

// One pass over 8 rows read through \src_ptr / \ref_ptr (post-incremented by
// x2 / x3). Absolute differences are added into v30 and the running total is
// left in v31.s[0].
.macro ea8_sad_pass_8rows src_ptr, ref_ptr
    ld1       {v0.16b}, [\src_ptr], x2
    ld1       {v1.16b}, [\ref_ptr], x3
    ld1       {v2.16b}, [\src_ptr], x2
    ld1       {v3.16b}, [\ref_ptr], x3
    ld1       {v8.16b}, [\src_ptr], x2
    ld1       {v9.16b}, [\ref_ptr], x3
    ld1       {v10.16b}, [\src_ptr], x2
    ld1       {v11.16b}, [\ref_ptr], x3
    ld1       {v12.16b}, [\src_ptr], x2
    ld1       {v13.16b}, [\ref_ptr], x3
    ld1       {v14.16b}, [\src_ptr], x2
    ld1       {v15.16b}, [\ref_ptr], x3
    ld1       {v16.16b}, [\src_ptr], x2
    ld1       {v17.16b}, [\ref_ptr], x3
    ld1       {v18.16b}, [\src_ptr], x2
    ld1       {v19.16b}, [\ref_ptr], x3

    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    uabal     v30.8h, v8.8b, v9.8b
    uabal2    v30.8h, v8.16b, v9.16b
    uabal     v30.8h, v10.8b, v11.8b
    uabal2    v30.8h, v10.16b, v11.16b
    uabal     v30.8h, v12.8b, v13.8b
    uabal2    v30.8h, v12.16b, v13.16b
    uabal     v30.8h, v14.8b, v15.8b
    uabal2    v30.8h, v14.16b, v15.16b
    uabal     v30.8h, v16.8b, v17.8b
    uabal2    v30.8h, v16.16b, v17.16b
    uabal     v30.8h, v18.8b, v19.8b
    uabal2    v30.8h, v18.16b, v19.16b

    addp      v31.8h, v30.8h, v30.8h
    uaddlp    v31.4s, v31.8h
    addp      v31.2s, v31.2s, v31.2s
.endm

    .global ime_compute_sad_16x16_ea8_av8
ime_compute_sad_16x16_ea8_av8:

    push_v_regs                         // v8-v15 are used below
    movi      v30.8h, #0

    add       x7, x0, w2, sxtw          // x7 = second source row
    add       x8, x1, w3, sxtw          // x8 = second row of the other block
    sbfiz     x2, x2, #1, #32           // step two rows at a time from here on
    sbfiz     x3, x3, #1, #32

    // even rows
    ea8_sad_pass_8rows x0, x1
    mov       w6, v31.s[0]
    cmp       w6, w4
    b.gt      1f                        // partial SAD already above the limit

    // odd rows, added on top of the even-row sums in v30
    ea8_sad_pass_8rows x7, x8

1:
    st1       {v31.s}[0], [x5]
    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name      : ime_calculate_sad2_prog_av8()
////
//// Detail Description : This function finds the SAD values of one source
////                      block against 2 reference blocks at one shot
////
//// Platform           : CortexAv8/NEON .
////
////-----------------------------------------------------------------------------
//*/

    .global ime_calculate_sad2_prog_av8
ime_calculate_sad2_prog_av8:

    // Computes the SAD of one source block against two reference blocks.
    //
    // x0    = ref1     <UWORD8 *>
    // x1    = ref2     <UWORD8 *>
    // x2    = src      <UWORD8 *>
    // w3    = RefBufferWidth <UWORD32>   (stride of ref1 and ref2)
    // w4    = CurBufferWidth <UWORD32>   (stride of src)
    // x5    = psad <UWORD32 *>           psad[0] = ref1 result, psad[1] = ref2 result
    //
    // Fixes relative to the previous revision:
    //  - src rows are read through x2; they used to be read through x3,
    //    which holds the reference stride and was also being post-incremented
    //  - the last ref2 accumulation uses v10 (row 4 of ref2) instead of v0
    //  - the final reduction keeps the ref1 and ref2 totals apart; before,
    //    only the ref1 lanes were folded and written to both output words
    //
    // NOTE(review): the loop visits 8 x 4 = 32 rows and the totals are
    // doubled before the store; both are carried over unchanged. Confirm the
    // intended block height and whether the doubling is wanted.
    push_v_regs                         // v8-v11 are used below
    sxtw      x3, w3
    sxtw      x4, w4
    mov       x6, #8
    movi      v30.8h, #0                // ref1 partial sums
    movi      v31.8h, #0                // ref2 partial sums

core_loop_ime_calculate_sad2_prog_av8:

    ld1       {v0.16b}, [x0], x3
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x2], x4

    ld1       {v3.16b}, [x0], x3
    ld1       {v4.16b}, [x1], x3
    ld1       {v5.16b}, [x2], x4


    uabal     v30.8h, v0.8b, v2.8b
    uabal2    v30.8h, v0.16b, v2.16b
    uabal     v31.8h, v1.8b, v2.8b
    uabal2    v31.8h, v1.16b, v2.16b

    uabal     v30.8h, v3.8b, v5.8b
    uabal2    v30.8h, v3.16b, v5.16b
    uabal     v31.8h, v4.8b, v5.8b
    uabal2    v31.8h, v4.16b, v5.16b


    ld1       {v6.16b}, [x0], x3
    ld1       {v7.16b}, [x1], x3
    ld1       {v8.16b}, [x2], x4

    ld1       {v9.16b}, [x0], x3
    ld1       {v10.16b}, [x1], x3
    ld1       {v11.16b}, [x2], x4

    uabal     v30.8h, v6.8b, v8.8b
    uabal2    v30.8h, v6.16b, v8.16b
    uabal     v31.8h, v7.8b, v8.8b
    uabal2    v31.8h, v7.16b, v8.16b

    uabal     v30.8h, v9.8b, v11.8b
    uabal2    v30.8h, v9.16b, v11.16b
    uabal     v31.8h, v10.8b, v11.8b
    uabal2    v31.8h, v10.16b, v11.16b

    subs      x6, x6, #1
    bne       core_loop_ime_calculate_sad2_prog_av8

    addp      v30.8h, v30.8h, v31.8h    // [ref1 x4 | ref2 x4]
    uaddlp    v30.4s, v30.8h            // [ref1 lo, ref1 hi, ref2 lo, ref2 hi]
    addp      v30.4s, v30.4s, v30.4s    // [ref1, ref2, ref1, ref2]
    shl       v30.2s, v30.2s, #1

    st1       {v30.2s}, [x5]
    pop_v_regs
    ret

///*
////---------------------------------------------------------------------------
//// Function Name      : ime_calculate_sad3_prog_av8()
////
//// Detail Description : This function finds the SAD values of one source
////                      block against 3 reference blocks at one shot
////
//// Platform           : CortexAv8/NEON .
////
////-----------------------------------------------------------------------------
//*/

    .global ime_calculate_sad3_prog_av8
ime_calculate_sad3_prog_av8:

    // Computes the SAD of one source block against three reference blocks.
    //
    // x0    = ref1     <UWORD8 *>
    // x1    = ref2     <UWORD8 *>
    // x2    = ref3     <UWORD8 *>
    // x3    = src      <UWORD8 *>
    // w4    = RefBufferWidth <UWORD32>   (stride of ref1, ref2 and ref3)
    // w5    = CurBufferWidth <UWORD32>   (stride of src)
    // x6    = psad <UWORD32 *>           psad[0..2] = ref1, ref2, ref3 results
    //
    // Fix relative to the previous revision: all three accumulators are now
    // reduced and exactly three words are stored. Before, the ref1
    // accumulator (v29) was never reduced, only the ref2 lanes survived the
    // final pairwise add, and that single value was written to psad[0] and
    // psad[1] while psad[2] was left untouched.
    //
    // NOTE(review): the loop visits 16 x 2 = 32 rows and the totals are
    // doubled before the store; both are carried over unchanged. Confirm the
    // intended block height and whether the doubling is wanted.

    push_v_regs
    sxtw      x4, w4
    sxtw      x5, w5
    mov       x7, #16
    movi      v29.8h, #0                // ref1 partial sums
    movi      v30.8h, #0                // ref2 partial sums
    movi      v31.8h, #0                // ref3 partial sums

core_loop_ime_calculate_sad3_prog_av8:

    ld1       {v0.16b}, [x0], x4
    ld1       {v1.16b}, [x1], x4
    ld1       {v2.16b}, [x2], x4
    ld1       {v3.16b}, [x3], x5

    uabal     v29.8h, v0.8b, v3.8b
    uabal2    v29.8h, v0.16b, v3.16b
    uabal     v30.8h, v1.8b, v3.8b
    uabal2    v30.8h, v1.16b, v3.16b
    uabal     v31.8h, v2.8b, v3.8b
    uabal2    v31.8h, v2.16b, v3.16b

    ld1       {v4.16b}, [x0], x4
    ld1       {v5.16b}, [x1], x4
    ld1       {v6.16b}, [x2], x4
    ld1       {v7.16b}, [x3], x5

    uabal     v29.8h, v4.8b, v7.8b
    uabal2    v29.8h, v4.16b, v7.16b
    uabal     v30.8h, v5.8b, v7.8b
    uabal2    v30.8h, v5.16b, v7.16b
    uabal     v31.8h, v6.8b, v7.8b
    uabal2    v31.8h, v6.16b, v7.16b

    subs      x7, x7, #1
    bne       core_loop_ime_calculate_sad3_prog_av8

    addp      v29.8h, v29.8h, v30.8h    // [ref1 x4 | ref2 x4]
    addp      v31.8h, v31.8h, v31.8h    // [ref3 x4 | ref3 x4]
    uaddlp    v29.4s, v29.8h            // [ref1 lo, ref1 hi, ref2 lo, ref2 hi]
    uaddlp    v31.4s, v31.8h            // [ref3 lo, ref3 hi, ref3 lo, ref3 hi]
    addp      v29.4s, v29.4s, v31.4s    // [ref1, ref2, ref3, ref3]
    shl       v29.4s, v29.4s, #1

    st1       {v29.2s}, [x6], #8        // psad[0], psad[1]
    st1       {v29.s}[2], [x6]          // psad[2]; nothing is written past it
    pop_v_regs
    ret




///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) for sub-pel motion estimation
//*
//* @par Description
//*   This functions computes SAD for all the 8 half pel points
//*
//* @param[out] pi4_sad
//*  integer evaluated sad
//*        pi4_sad[0] - half x
//*        pi4_sad[1] - half x - 1
//*        pi4_sad[2] - half y
//*        pi4_sad[3] - half y - 1
//*        pi4_sad[4] - half xy
//*        pi4_sad[5] - half xy - 1
//*        pi4_sad[6] - half xy - strd
//*        pi4_sad[7] - half xy - 1 - strd
//*
//* @remarks
//*  Register usage as read by the code:
//*   x0 = source, w4 = source stride
//*   x1 = half x plane, x2 = half y plane, x3 = half xy plane,
//*   w5 = stride shared by the three half-pel planes
//*   x6 = pi4_sad (eight 32-bit words are written)
//*
//******************************************************************************
//*/

.text
.p2align 2

    .global ime_sub_pel_compute_sad_16x16_av8
ime_sub_pel_compute_sad_16x16_av8:
    push_v_regs                         // v9-v13 are used below
    sxtw      x4, w4
    sxtw      x5, w5

    // remaining five candidate positions, derived from x1/x2/x3
    sub       x7, x1, #1                // half x, one pixel to the left
    sub       x8, x2, x5                // half y, one row up
    sub       x9, x3, #1                // half xy, one pixel to the left
    sub       x10, x3, x5               // half xy, one row up
    sub       x11, x10, #1              // half xy, one row up and one pixel left

    // one accumulator per candidate, in output order
    movi      v24.8h, #0
    movi      v25.8h, #0
    movi      v26.8h, #0
    movi      v27.8h, #0
    movi      v28.8h, #0
    movi      v29.8h, #0
    movi      v30.8h, #0
    movi      v31.8h, #0

    mov       x12, #16                  // one row per pass
1:
    ld1       {v0.16b}, [x0], x4        // source row

    ld1       {v1.16b}, [x1], x5        // half x
    uabal     v24.8h, v0.8b, v1.8b
    uabal2    v24.8h, v0.16b, v1.16b

    ld1       {v2.16b}, [x7], x5        // half x, left
    uabal     v25.8h, v0.8b, v2.8b
    uabal2    v25.8h, v0.16b, v2.16b

    ld1       {v3.16b}, [x2], x5        // half y
    uabal     v26.8h, v0.8b, v3.8b
    uabal2    v26.8h, v0.16b, v3.16b

    ld1       {v9.16b}, [x8], x5        // half y, top
    uabal     v27.8h, v0.8b, v9.8b
    uabal2    v27.8h, v0.16b, v9.16b

    ld1       {v10.16b}, [x3], x5       // half xy
    uabal     v28.8h, v0.8b, v10.8b
    uabal2    v28.8h, v0.16b, v10.16b

    ld1       {v11.16b}, [x9], x5       // half xy, left
    uabal     v29.8h, v0.8b, v11.8b
    uabal2    v29.8h, v0.16b, v11.16b

    ld1       {v12.16b}, [x10], x5      // half xy, top
    uabal     v30.8h, v0.8b, v12.8b
    uabal2    v30.8h, v0.16b, v12.16b

    ld1       {v13.16b}, [x11], x5      // half xy, top left
    uabal     v31.8h, v0.8b, v13.8b
    uabal2    v31.8h, v0.16b, v13.16b

    subs      x12, x12, #1
    b.ne      1b

    // fold each pair of accumulators, widening to 32 bits on the way
    addp      v24.8h, v24.8h, v25.8h
    uaddlp    v24.4s, v24.8h
    addp      v26.8h, v26.8h, v27.8h
    uaddlp    v26.4s, v26.8h
    addp      v28.8h, v28.8h, v29.8h
    uaddlp    v28.4s, v28.8h
    addp      v30.8h, v30.8h, v31.8h
    uaddlp    v30.4s, v30.8h

    addp      v24.4s, v24.4s, v26.4s    // pi4_sad[0..3]
    addp      v25.4s, v28.4s, v30.4s    // pi4_sad[4..7]

    st1       {v24.4s, v25.4s}, [x6]

    pop_v_regs
    ret


///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) between 2 16x16 blocks
//*
//* @par Description
//*  Adds up the absolute differences of all 16 rows of 16 pixels.
//*  i4_max_sad is part of the argument list but is never read here, so this
//*  routine has no early exit.
//*
//* @param[in] pu1_src (x0)
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_dst (x1)
//*  UWORD8 pointer to the block the source is compared against
//*
//* @param[in] src_strd (w2)
//*  integer source stride
//*
//* @param[in] dst_strd (w3)
//*  integer destination stride
//*
//* @param[in] i4_max_sad (w4)
//*  integer maximum allowed distortion (unused)
//*
//* @param[out] pi4_mb_distortion (x5)
//*  receives the 32-bit SAD
//*
//* @remarks
//*
//******************************************************************************
//*/
    .global ime_compute_sad_16x16_av8
ime_compute_sad_16x16_av8:
    push_v_regs
    movi      v30.8h, #0                // eight 16-bit running sums
    mov       x6, #4                    // 4 passes x 4 rows = 16 rows
    sxtw      x2, w2
    sxtw      x3, w3

2:
    // fetch four consecutive rows from each block
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    // fold |src - dst| of every byte into v30
    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b
    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    subs      x6, x6, #1
    b.ne      2b

    uaddlv    s30, v30.8h               // add the eight partial sums
    str       s30, [x5]
    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name      : ime_calculate_sad4_prog_av8()
////
//// Detail Description : Computes four SADs of one 16x16 source block at one
////                      shot: against the reference block shifted one pixel
////                      left, one pixel right, one row up and one row down.
////
////                      x0 = reference block, w2 = reference stride
////                      x1 = source block,    w3 = source stride
////                      x4 = output, four 32-bit words in the order
////                           left, right, top, bottom
////
//// Platform           : CortexAv8/NEON .
////
////-----------------------------------------------------------------------------
//*/

    .global ime_calculate_sad4_prog_av8
ime_calculate_sad4_prog_av8:
    push_v_regs                         // v9 is used below
    sxtw      x2, w2
    sxtw      x3, w3

    // one accumulator per neighbour, in output order
    movi      v28.8h, #0
    movi      v29.8h, #0
    movi      v30.8h, #0
    movi      v31.8h, #0

    // the four neighbouring positions around x0
    sub       x5, x0, #1                // left
    add       x6, x0, #1                // right
    sub       x7, x0, x2                // top
    add       x8, x0, x2                // bottom

    mov       x9, #16                   // one row per pass
1:
    ld1       {v0.16b}, [x1], x3        // source row

    ld1       {v1.16b}, [x5], x2
    uabal     v28.8h, v0.8b, v1.8b
    uabal2    v28.8h, v0.16b, v1.16b

    ld1       {v2.16b}, [x6], x2
    uabal     v29.8h, v0.8b, v2.8b
    uabal2    v29.8h, v0.16b, v2.16b

    ld1       {v3.16b}, [x7], x2
    uabal     v30.8h, v0.8b, v3.8b
    uabal2    v30.8h, v0.16b, v3.16b

    ld1       {v9.16b}, [x8], x2
    uabal     v31.8h, v0.8b, v9.8b
    uabal2    v31.8h, v0.16b, v9.16b

    subs      x9, x9, #1
    b.ne      1b

    // fold each pair of accumulators, widening to 32 bits on the way
    addp      v28.8h, v28.8h, v29.8h
    uaddlp    v28.4s, v28.8h
    addp      v30.8h, v30.8h, v31.8h
    uaddlp    v30.4s, v30.8h

    addp      v28.4s, v28.4s, v30.4s    // [left, right, top, bottom]
    st1       {v28.4s}, [x4]
    pop_v_regs
    ret



//*****************************************************************************
//*
//*  Function Name   : ime_compute_satqd_16x16_lumainter_av8
//*  Description     : This function computes SAD for a 16x16 block.
//                   : It also computes whether any 4x4 block will have a
//                     nonzero coefficient after transform and quant
//
//  Arguments        : x0 : pointer to src buffer
//                     x1 : pointer to est buffer
//                     w2 : source stride
//                     w3 : est stride
//                     x4 : pointer to the thresholds (nine halfwords are read)
//                     x5 : pointer to the distortion (SAD) output word
//                     x6 : pointer to the is_nonzero output word
//*
//*  Values Returned : NONE
//*
//*  Register Usage  : x0-x7, v0-v31
//*  Stack Usage     : 64 bytes (d8-d15, saved once by push_v_regs)
//*  Cycles          : Around
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*  \Assumptions    :
//*
//*  Revision History  :
//*         DD MM YYYY    Author(s)          Changes
//*         14 04 2014    Harinarayanan K K  First version
//*
//*****************************************************************************
    .global ime_compute_satqd_16x16_lumainter_av8
ime_compute_satqd_16x16_lumainter_av8:
    //x0 :pointer to src buffer
    //x1 :pointer to est buffer
    //w2 :Source stride
    //w3 :Pred stride
    //x4 :Threshold pointer (reused as the "nonzero found" flag once the
    //    thresholds have been loaded)
    //x5 :Distortion, ie SAD (output pointer)
    //x6 :is nonzero (output pointer)
    //x7 :loop counter, one count per group of four rows
    //
    // d8-d15 are saved by push_v_regs only; the previous revision spilled
    // and reloaded them a second time with explicit stp/ldp pairs.
    push_v_regs
    sxtw      x2, w2
    sxtw      x3, w3

    ld1       {v30.8h}, [x4]

    dup       v20.4h, v30.h[1]          //ls1
    dup       v24.4h, v30.h[0]          //ls2
    dup       v21.4h, v30.h[5]          //ls3
    dup       v25.4h, v30.h[7]          //ls4
    dup       v22.4h, v30.h[3]          //ls5
    dup       v26.4h, v30.h[4]          //ls6
    dup       v23.4h, v30.h[6]          //ls7
    dup       v27.4h, v30.h[2]          //ls8

    // pack two thresholds per 128-bit register: v20..v23 = {lsN, lsN+1}
    mov       v20.d[1], v24.d[0]
    mov       v21.d[1], v25.d[0]
    mov       v22.d[1], v26.d[0]
    mov       v23.d[1], v27.d[0]

    // ninth halfword: the value each per-4x4 SAD is compared against
    add       x4, x4, #16
    ld1       {v29.h}[0], [x4]
    dup       v29.4h, v29.h[0]

    movi      v31.8h, #0

    mov       x7, #4
core_loop_satqd_ime_compute_satqd_16x16_lumainter:
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabdl     v10.8h, v0.8b, v1.8b
    uabdl2    v15.8h, v0.16b, v1.16b
    uabdl     v11.8h, v2.8b, v3.8b
    uabdl2    v16.8h, v2.16b, v3.16b
    uabdl     v12.8h, v4.8b, v5.8b
    uabdl2    v17.8h, v4.16b, v5.16b
    uabdl     v13.8h, v6.8b, v7.8b
    uabdl2    v18.8h, v6.16b, v7.16b

    add       v0.8h, v10.8h, v13.8h
    add       v1.8h, v11.8h, v12.8h
    add       v2.8h, v15.8h, v18.8h
    add       v3.8h, v16.8h, v17.8h

    //v0 : S1 S4 S4 S1 A1 A4 A4 A1
    //v1 : S2 S3 S3 S2 A2 A3 A3 A2
    //v2 : B1 B4 B4 B1 X1 X4 X4 X1
    //v3 : B3 B2 B2 B3 X3 X2 X2 X3

    trn1      v4.8h, v0.8h, v1.8h
    trn2      v5.8h, v0.8h, v1.8h
    trn1      v6.8h, v2.8h, v3.8h
    trn2      v7.8h, v2.8h, v3.8h

    trn1      v0.4s, v4.4s, v6.4s
    trn2      v2.4s, v4.4s, v6.4s
    trn1      v1.4s, v5.4s, v7.4s
    trn2      v3.4s, v5.4s, v7.4s

    add       v4.8h, v0.8h, v3.8h
    add       v5.8h, v1.8h, v2.8h
    //v4 : S1 S2 B1 B2 A1 A2 X1 X2
    //v5 : S4 S3 B4 B3 A4 A3 X4 X3

    //compute sad for each 4x4 block
    add       v6.8h, v4.8h, v5.8h
    addp      v19.8h, v6.8h, v6.8h
    //v19 now holds the four block sads in both 64-bit halves, so they can
    //be compared using 128-bit operations; the low half is added to the
    //running total
    add       v31.4h, v31.4h, v19.4h

    //sad_2 = sad_1<<1;
    shl       v28.8h, v19.8h, #1

    //sad_2 - pu2_thrsh
    sub       v24.8h, v28.8h, v20.8h
    sub       v25.8h, v28.8h, v21.8h
    sub       v26.8h, v28.8h, v22.8h
    sub       v27.8h, v28.8h, v23.8h

    trn1      v0.4s, v4.4s, v5.4s
    trn2      v1.4s, v4.4s, v5.4s
    //v0 : S1 S2 S4 S3 A1 A2 A4 A3
    //v1 : B1 B2 B4 B3 X1 X2 X4 X3

    trn1      v4.8h, v0.8h, v1.8h
    trn2      v5.8h, v0.8h, v1.8h
    //v4 : S1 B1 S4 B4 A1 X1 A4 X4
    //v5 : S2 B2 S3 B3 A2 X2 A3 X3

    mov       v7.s[0], v4.s[1]
    mov       v7.s[1], v4.s[3]
    mov       v6.s[0], v5.s[1]          // V4 //S1 B1 A1 X1
    mov       v6.s[1], v5.s[3]          // V5 //S2 B2 A2 X2
    mov       v4.s[1], v4.s[2]          // V6 //S3 B3 A3 X3
    mov       v5.s[1], v5.s[2]          // V7 //S4 B4 A4 X4

    shl       v0.4h, v4.4h, #1          //S1<<1
    shl       v1.4h, v5.4h, #1          //S2<<1
    shl       v2.4h, v6.4h, #1          //S3<<1
    shl       v3.4h, v7.4h, #1          //S4<<1

    add       v8.4h, v5.4h, v6.4h       //(s2[j] + s3[j]))
    add       v9.4h, v4.4h, v7.4h       //(s1[j] + s4[j]))
    add       v10.4h, v6.4h, v7.4h      //(s3[j] + s4[j]))
    sub       v11.4h, v6.4h, v0.4h      //(s3[j] - (s1[j]<<1))
    sub       v12.4h, v7.4h, v1.4h      //(s4[j] - (s2[j]<<1))
    add       v13.4h, v4.4h, v5.4h      //(s1[j] + s2[j]))
    sub       v14.4h, v5.4h, v3.4h      //(s2[j] - (s4[j]<<1)))
    sub       v15.4h, v4.4h, v2.4h      //(s1[j] - (s3[j]<<1)))

    mov       v8.d[1], v9.d[0]
    mov       v10.d[1], v11.d[0]
    mov       v12.d[1], v13.d[0]
    mov       v14.d[1], v15.d[0]

    cmge      v0.8h, v24.8h, v8.8h      //ls1 ls2
    cmge      v1.8h, v25.8h, v10.8h     //ls3 ls4
    cmge      v2.8h, v26.8h, v12.8h     //ls5 ls6
    cmge      v3.8h, v27.8h, v14.8h     //ls7 ls8
    cmge      v4.4h, v19.4h, v29.4h     //sad

    orr       v0.16b, v0.16b, v1.16b
    orr       v2.16b, v2.16b, v3.16b
    orr       v2.16b, v0.16b, v2.16b
    xtn       v2.8b, v2.8h
    orr       v2.8b, v2.8b, v4.8b

    //if any comparison result is non zero, stop the threshold checks and
    //finish the remaining rows with the plain sad loop
    mov       x4, v2.d[0]
    cmp       x4, #0
    bne       core_loop_compute_sad_pre

    subs      x7, x7, #1
    bne       core_loop_satqd_ime_compute_satqd_16x16_lumainter
    b         satdq_end_func


core_loop_compute_sad:
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3

    uabal     v31.8h, v0.8b, v1.8b
    uabal2    v31.8h, v0.16b, v1.16b

    uabal     v31.8h, v2.8b, v3.8b
    uabal2    v31.8h, v2.16b, v3.16b

    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabal     v31.8h, v4.8b, v5.8b
    uabal2    v31.8h, v4.16b, v5.16b

    uabal     v31.8h, v6.8b, v7.8b
    uabal2    v31.8h, v6.16b, v7.16b

core_loop_compute_sad_pre:
    //the group of four rows that triggered the exit is already in v31
    subs      x7, x7, #1
    bne       core_loop_compute_sad

satdq_end_func:

    //is_nonzero = (x4 != 0) ? 1 : 0
    mov       x7, #1
    cmp       x4, #0
    csel      x7, x4, x7, eq
    str       w7, [x6]

    addp      v31.8h, v31.8h, v31.8h
    uaddlp    v31.4s, v31.8h
    addp      v31.2s, v31.2s, v31.2s
    st1       {v31.s}[0], [x5]

    pop_v_regs
    ret