//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
//**

///**
//******************************************************************************
//*
//*
//* @brief
//*  This file contains definitions of routines that compute distortion
//*  between two macro/sub blocks of identical dimensions
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*  - ime_compute_sad_16x16()
//*  - ime_compute_sad_8x8()
//*  - ime_compute_sad_4x4()
//*  - ime_compute_sad_16x8()
//*  - ime_compute_satqd_16x16_lumainter_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//


///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
//*
//* @par Description
//*   This function computes SAD between 2 16x16 blocks. There is a provision
//*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
//*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_dst
//*  UWORD8 pointer to the destination (only read by this routine)
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] i4_max_sad
//*  integer maximum allowed distortion
//*
//* @param[out] pi4_mb_distortion
//*  integer evaluated sad
//*
//* @remarks
//*  The implementation below reads only every second row (8 of the 16 rows)
//*  and doubles the partial sum. It never reads i4_max_sad, so no early exit
//*  takes place in this variant.
//*
//******************************************************************************
//*/
.text
.p2align 2

// Save the low 64 bits of v8-v15 (d8-d15) on the stack. Functions that
// overwrite any of v8-v15 invoke this on entry; 64 bytes are pushed, so sp
// keeps its 16-byte alignment.
.macro push_v_regs
    stp       d8, d9, [sp, #-16]!
    stp       d10, d11, [sp, #-16]!
    stp       d12, d13, [sp, #-16]!
    stp       d14, d15, [sp, #-16]!
.endm
// Restore d8-d15 in the reverse order of push_v_regs.
.macro pop_v_regs
    ldp       d14, d15, [sp], #16
    ldp       d12, d13, [sp], #16
    ldp       d10, d11, [sp], #16
    ldp       d8, d9, [sp], #16
.endm

    // Register usage
    //   x0 : pu1_src            x1 : pu1_dst
    //   w2 : src_strd           w3 : dst_strd
    //   x4 : i4_max_sad (not read)
    //   x5 : pi4_mb_distortion
    //   x6 : loop counter       v0-v7, v30 : scratch (all caller-saved)
    .global ime_compute_sad_16x16_fast_av8
ime_compute_sad_16x16_fast_av8:
    // The strides are documented as 32-bit integers; the upper halves of
    // x2/x3 are therefore not guaranteed by the caller and must be
    // sign-extended before they are used as 64-bit post-index offsets.
    sxtw      x2, w2
    sxtw      x3, w3
    lsl       x2, x2, #1                // step two rows at a time
    lsl       x3, x3, #1

    mov       x6, #2                    // 2 iterations x 4 sampled rows
    movi      v30.8h, #0                // 8 x u16 running SAD

core_loop_ime_compute_sad_16x16_fast_av8:

    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3

    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b

    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b

    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b

    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    subs      x6, x6, #1
    bne       core_loop_ime_compute_sad_16x16_fast_av8

    // Horizontal reduction of the 8 lanes to one 32-bit sum.
    addp      v30.8h, v30.8h, v30.8h
    uaddlp    v30.4s, v30.8h
    addp      v30.2s, v30.2s, v30.2s
    shl       v30.2s, v30.2s, #1        // scale the half-sampled sum by 2

    st1       {v30.s}[0], [x5]
    ret


///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) between 2 16x8 blocks
//*
//*
//* @par
//*   Description
//*   This function computes SAD between 2 16x8 blocks. There is a provision
//*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
//*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_dst
//*  UWORD8 pointer to the destination (only read by this routine)
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] u4_max_sad
//*  integer maximum allowed distortion
//*
//* @param[out] pi4_mb_distortion
//*  integer evaluated sad
//*
//* @remarks
//*  The implementation below always processes all 8 rows; it never reads
//*  u4_max_sad, so no early exit takes place.
//*
//******************************************************************************
//*/
//
    // Register usage
    //   x0 : pu1_src            x1 : pu1_dst
    //   w2 : src_strd           w3 : dst_strd
    //   x4 : u4_max_sad (not read)
    //   x5 : pi4_mb_distortion
    //   x6 : loop counter       v0-v7, v30 : scratch (all caller-saved)
    .global ime_compute_sad_16x8_av8
ime_compute_sad_16x8_av8:

    // Every row is visited, so the strides are used unscaled. They are
    // documented as 32-bit integers, so sign-extend them before using them
    // as 64-bit post-index offsets.
    sxtw      x2, w2
    sxtw      x3, w3
    mov       x6, #2                    // 2 iterations x 4 rows = 8 rows
    movi      v30.8h, #0                // 8 x u16 running SAD

core_loop_ime_compute_sad_16x8_av8:

    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3

    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b

    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b

    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b

    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    subs      x6, x6, #1
    bne       core_loop_ime_compute_sad_16x8_av8

    // Horizontal reduction of the 8 lanes to one 32-bit sum.
    addp      v30.8h, v30.8h, v30.8h
    uaddlp    v30.4s, v30.8h
    addp      v30.2s, v30.2s, v30.2s

    st1       {v30.s}[0], [x5]
    ret

///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
//*
//* @par Description
//*   This
//*   function computes SAD between 2 16x16 blocks. There is a provision
//*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
//*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_dst
//*  UWORD8 pointer to the destination (only read by this routine)
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] i4_max_sad
//*  integer maximum allowed distortion
//*
//* @param[out] pi4_mb_distortion
//*  integer evaluated sad
//*
//* @remarks
//*  The even rows (0, 2, ..., 14) are summed first. If that partial SAD is
//*  greater than i4_max_sad (signed compare) the partial SAD is stored and
//*  the routine returns; otherwise the odd rows are added and the full SAD
//*  is stored.
//*
//******************************************************************************
//*/

// Adds to v30.8h the absolute differences of 8 rows of 16 pixels, reading
// through \psrc and \pest and post-incrementing them by x2 / x3 per row.
// Clobbers v0-v7 and v16-v23 (all caller-saved).
.macro ime_sad_16x16_ea8_accum_8rows psrc, pest
    ld1       {v0.16b}, [\psrc], x2
    ld1       {v1.16b}, [\pest], x3
    ld1       {v2.16b}, [\psrc], x2
    ld1       {v3.16b}, [\pest], x3
    ld1       {v4.16b}, [\psrc], x2
    ld1       {v5.16b}, [\pest], x3
    ld1       {v6.16b}, [\psrc], x2
    ld1       {v7.16b}, [\pest], x3
    ld1       {v16.16b}, [\psrc], x2
    ld1       {v17.16b}, [\pest], x3
    ld1       {v18.16b}, [\psrc], x2
    ld1       {v19.16b}, [\pest], x3
    ld1       {v20.16b}, [\psrc], x2
    ld1       {v21.16b}, [\pest], x3
    ld1       {v22.16b}, [\psrc], x2
    ld1       {v23.16b}, [\pest], x3

    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b

    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b

    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b

    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    uabal     v30.8h, v16.8b, v17.8b
    uabal2    v30.8h, v16.16b, v17.16b

    uabal     v30.8h, v18.8b, v19.8b
    uabal2    v30.8h, v18.16b, v19.16b

    uabal     v30.8h, v20.8b, v21.8b
    uabal2    v30.8h, v20.16b, v21.16b

    uabal     v30.8h, v22.8b, v23.8b
    uabal2    v30.8h, v22.16b, v23.16b
.endm

    // Register usage
    //   x0 : pu1_src (even rows)    x1 : pu1_dst (even rows)
    //   w2 : src_strd               w3 : dst_strd
    //   w4 : i4_max_sad             x5 : pi4_mb_distortion
    //   w6 : partial SAD            x7 / x8 : src / dst odd-row pointers
    //   v30 : running SAD lanes     v31 : reduced SAD that gets stored
    .global ime_compute_sad_16x16_ea8_av8
ime_compute_sad_16x16_ea8_av8:

    // The strides are documented as 32-bit integers: sign-extend them
    // before any 64-bit address arithmetic.
    sxtw      x2, w2
    sxtw      x3, w3
    movi      v30.8h, #0

    add       x7, x0, x2                // row 1 of src
    add       x8, x1, x3                // row 1 of dst

    lsl       x2, x2, #1                // step two rows at a time
    lsl       x3, x3, #1

    // Pass 1: even rows.
    ime_sad_16x16_ea8_accum_8rows x0, x1

    addp      v31.8h, v30.8h, v30.8h
    uaddlp    v31.4s, v31.8h
    addp      v31.2s, v31.2s, v31.2s
    mov       w6, v31.s[0]
    cmp       w6, w4
    bgt       end_func_16x16            // already above the limit: stop here

    // Pass 2: odd rows, accumulated on top of the even-row lanes in v30.
    ime_sad_16x16_ea8_accum_8rows x7, x8

    addp      v31.8h, v30.8h, v30.8h
    uaddlp    v31.4s, v31.8h
    addp      v31.2s, v31.2s, v31.2s

end_func_16x16:
    st1       {v31.s}[0], [x5]
    ret


///*
////---------------------------------------------------------------------------
//// Function Name      : ime_calculate_sad2_prog_av8()
////
//// Detail Description : This function finds the sad values of 2 Progressive MBs
////                      at one shot
////
//// Platform           : CortexAv8/NEON .
////
////-----------------------------------------------------------------------------
//*/

    // Register usage (AArch64: all six arguments arrive in registers)
    //   x0 = ref1                <UWORD8 *>
    //   x1 = ref2                <UWORD8 *>
    //   x2 = src                 <UWORD8 *>
    //   w3 = RefBufferWidth      (ref stride, 32-bit)
    //   w4 = CurBufferWidth      (src stride, 32-bit)
    //   x5 = psad                <UWORD32 *>  psad[0] = SAD(src, ref1)
    //                                         psad[1] = SAD(src, ref2)
    //   x6 = loop counter        v0-v5, v30, v31 : scratch (caller-saved)
    //
    // Fixes relative to the previous revision of this routine:
    //   - src rows were loaded through x3 (the stride) instead of x2;
    //   - one uabal2 paired src with a ref1 row instead of the ref2 row;
    //   - the final pairwise add dropped the ref2 sum, so psad[0] was
    //     stored twice;
    //   - the strides were used without widening to 64 bits.
    // NOTE(review): the previous revision also walked 32 rows and doubled
    // the result. This version covers 16 rows with no scaling, matching
    // ime_calculate_sad4_prog_av8 and the 16x16 macroblock size - confirm
    // against the C reference implementation.
    .global ime_calculate_sad2_prog_av8
ime_calculate_sad2_prog_av8:

    sxtw      x3, w3
    sxtw      x4, w4
    mov       x6, #8                    // 8 iterations x 2 rows = 16 rows
    movi      v30.8h, #0                // SAD lanes against ref1
    movi      v31.8h, #0                // SAD lanes against ref2

core_loop_ime_calculate_sad2_prog_av8:

    ld1       {v0.16b}, [x0], x3        // ref1 row
    ld1       {v1.16b}, [x1], x3        // ref2 row
    ld1       {v2.16b}, [x2], x4        // src row

    ld1       {v3.16b}, [x0], x3
    ld1       {v4.16b}, [x1], x3
    ld1       {v5.16b}, [x2], x4

    uabal     v30.8h, v0.8b, v2.8b
    uabal2    v30.8h, v0.16b, v2.16b
    uabal     v31.8h, v1.8b, v2.8b
    uabal2    v31.8h, v1.16b, v2.16b

    uabal     v30.8h, v3.8b, v5.8b
    uabal2    v30.8h, v3.16b, v5.16b
    uabal     v31.8h, v4.8b, v5.8b
    uabal2    v31.8h, v4.16b, v5.16b

    subs      x6, x6, #1
    bne       core_loop_ime_calculate_sad2_prog_av8

    // v30.8h <- [4 pair sums of ref1 | 4 pair sums of ref2]
    addp      v30.8h, v30.8h, v31.8h
    // v30.4s <- [ref1 a, ref1 b, ref2 a, ref2 b]
    uaddlp    v30.4s, v30.8h
    // v30.4s <- [SAD ref1, SAD ref2, SAD ref1, SAD ref2]
    addp      v30.4s, v30.4s, v30.4s

    st1       {v30.2s}, [x5]            // psad[0], psad[1]
    ret

///*
////---------------------------------------------------------------------------
//// Function Name      : ime_calculate_sad3_prog_av8()
////
//// Detail Description : This function finds the sad values of 3 Progressive MBs
////                      at one shot
////
//// Platform           : CortexA8/NEON .
////
////-----------------------------------------------------------------------------
//*/

    // Register usage (AArch64: all seven arguments arrive in registers)
    //   x0 = ref1                <UWORD8 *>
    //   x1 = ref2                <UWORD8 *>
    //   x2 = ref3                <UWORD8 *>
    //   x3 = src                 <UWORD8 *>
    //   w4 = RefBufferWidth      (ref stride, 32-bit)
    //   w5 = CurBufferWidth      (src stride, 32-bit)
    //   x6 = psad                <UWORD32 *>  psad[k] = SAD(src, ref(k+1)),
    //                                         k = 0..2 (three words written)
    //   x7 = loop counter        v0-v7, v29-v31 : scratch (caller-saved)
    //
    // Fixes relative to the previous revision of this routine:
    //   - the loop branched back to the label of ime_calculate_sad2_prog_av8;
    //   - the loop counter lived in x6 and destroyed the psad pointer;
    //   - the result was stored through x5 (the src stride);
    //   - the ref1 accumulator (v29) was never reduced or stored and the
    //     ref3 sum was dropped by the final pairwise add;
    //   - the strides were used without widening to 64 bits.
    // NOTE(review): the previous revision also walked 32 rows and doubled
    // the result. This version covers 16 rows with no scaling, matching
    // ime_calculate_sad4_prog_av8 and the 16x16 macroblock size - confirm
    // against the C reference implementation.
    .global ime_calculate_sad3_prog_av8
ime_calculate_sad3_prog_av8:

    sxtw      x4, w4
    sxtw      x5, w5
    mov       x7, #8                    // 8 iterations x 2 rows = 16 rows
    movi      v29.8h, #0                // SAD lanes against ref1
    movi      v30.8h, #0                // SAD lanes against ref2
    movi      v31.8h, #0                // SAD lanes against ref3

core_loop_ime_calculate_sad3_prog_av8:

    ld1       {v0.16b}, [x0], x4
    ld1       {v1.16b}, [x1], x4
    ld1       {v2.16b}, [x2], x4
    ld1       {v3.16b}, [x3], x5

    uabal     v29.8h, v0.8b, v3.8b
    uabal2    v29.8h, v0.16b, v3.16b
    uabal     v30.8h, v1.8b, v3.8b
    uabal2    v30.8h, v1.16b, v3.16b
    uabal     v31.8h, v2.8b, v3.8b
    uabal2    v31.8h, v2.16b, v3.16b

    ld1       {v4.16b}, [x0], x4
    ld1       {v5.16b}, [x1], x4
    ld1       {v6.16b}, [x2], x4
    ld1       {v7.16b}, [x3], x5

    uabal     v29.8h, v4.8b, v7.8b
    uabal2    v29.8h, v4.16b, v7.16b
    uabal     v30.8h, v5.8b, v7.8b
    uabal2    v30.8h, v5.16b, v7.16b
    uabal     v31.8h, v6.8b, v7.8b
    uabal2    v31.8h, v6.16b, v7.16b

    subs      x7, x7, #1
    bne       core_loop_ime_calculate_sad3_prog_av8

    // v29.8h <- [ref1 pair sums | ref2 pair sums], v31.8h <- ref3 pair sums
    addp      v29.8h, v29.8h, v30.8h
    addp      v31.8h, v31.8h, v31.8h

    uaddlp    v29.4s, v29.8h
    uaddlp    v31.4s, v31.8h

    // v29.4s <- [SAD ref1, SAD ref2, SAD ref3, SAD ref3]
    addp      v29.4s, v29.4s, v31.4s

    st1       {v29.2s}, [x6], #8        // psad[0], psad[1]
    st1       {v29.s}[2], [x6]          // psad[2]
    ret




///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) for sub-pel motion estimation
//*
//* @par Description
//*   This function computes SAD for all the 8 half pel points
//*
//* @param[out] pi4_sad
//*  integer evaluated sad
//*      pi4_sad[0] - half x
//*      pi4_sad[1] - half x - 1
//*      pi4_sad[2] - half y
//*      pi4_sad[3] - half y - strd
//*      pi4_sad[4] - half xy
//*      pi4_sad[5] - half xy - 1
//*      pi4_sad[6] - half xy - strd
//*      pi4_sad[7] - half xy - 1 - strd
//*
//* @remarks
//*  Register usage of the implementation below:
//*      x0 : source block            w4 : source stride
//*      x1 : half-x plane            w5 : stride shared by the three
//*      x2 : half-y plane                 half-pel planes
//*      x3 : half-xy plane           x6 : pi4_sad (8 words written)
//*  "- 1" is one pixel to the left, "- strd" is one row above.
//*
//******************************************************************************
//*/

.text
.p2align 2

    .global ime_sub_pel_compute_sad_16x16_av8
ime_sub_pel_compute_sad_16x16_av8:
    // The strides are 32-bit integers: sign-extend them before they are
    // used in 64-bit pointer arithmetic and post-index addressing.
    sxtw      x4, w4
    sxtw      x5, w5

    sub       x7, x1, #1                //x left
    sub       x8, x2, x5                //y top
    sub       x9, x3, #1                //xy left
    sub       x10, x3, x5               //xy top
    sub       x11, x10, #1              //xy top left

    // One 8 x u16 accumulator per half-pel candidate, in pi4_sad order.
    movi      v24.8h, #0
    movi      v25.8h, #0
    movi      v26.8h, #0
    movi      v27.8h, #0
    movi      v28.8h, #0
    movi      v29.8h, #0
    movi      v30.8h, #0
    movi      v31.8h, #0

    mov       x12, #16                  // one row per iteration
core_loop_ime_sub_pel_compute_sad_16x16_av8:

    // Only caller-saved vector registers are used, so nothing has to be
    // preserved on the stack.
    ld1       {v0.16b}, [x0], x4        //src
    ld1       {v1.16b}, [x1], x5        //x
    ld1       {v2.16b}, [x7], x5        //x left
    ld1       {v3.16b}, [x2], x5        //y
    ld1       {v4.16b}, [x8], x5        //y top
    ld1       {v5.16b}, [x3], x5        //xy
    ld1       {v6.16b}, [x9], x5        //xy left
    ld1       {v7.16b}, [x10], x5       //xy top
    ld1       {v16.16b}, [x11], x5      //xy top left

    uabal     v24.8h, v0.8b, v1.8b
    uabal2    v24.8h, v0.16b, v1.16b
    uabal     v25.8h, v0.8b, v2.8b
    uabal2    v25.8h, v0.16b, v2.16b
    uabal     v26.8h, v0.8b, v3.8b
    uabal2    v26.8h, v0.16b, v3.16b
    uabal     v27.8h, v0.8b, v4.8b
    uabal2    v27.8h, v0.16b, v4.16b
    uabal     v28.8h, v0.8b, v5.8b
    uabal2    v28.8h, v0.16b, v5.16b
    uabal     v29.8h, v0.8b, v6.8b
    uabal2    v29.8h, v0.16b, v6.16b
    uabal     v30.8h, v0.8b, v7.8b
    uabal2    v30.8h, v0.16b, v7.16b
    uabal     v31.8h, v0.8b, v16.8b
    uabal2    v31.8h, v0.16b, v16.16b

    subs      x12, x12, #1
    bne       core_loop_ime_sub_pel_compute_sad_16x16_av8

    // Pairwise reduction; after the last step
    //   v24.4s = [x, x left, y, y top]
    //   v25.4s = [xy, xy left, xy top, xy top left]
    addp      v24.8h, v24.8h, v25.8h
    addp      v26.8h, v26.8h, v27.8h
    addp      v28.8h, v28.8h, v29.8h
    addp      v30.8h, v30.8h, v31.8h

    uaddlp    v24.4s, v24.8h
    uaddlp    v26.4s, v26.8h
    uaddlp    v28.4s, v28.8h
    uaddlp    v30.4s, v30.8h

    addp      v24.4s, v24.4s, v26.4s
    addp      v25.4s, v28.4s, v30.4s

    st1       {v24.4s-v25.4s}, [x6]

    ret


///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) between 2 16x16 blocks
//*
//* @par Description
//*   This function computes SAD between 2 16x16 blocks. There is a provision
//*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
//*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_dst
//*  UWORD8 pointer to the destination (only read by this routine)
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] i4_max_sad
//*  integer maximum allowed distortion
//*
//* @param[out] pi4_mb_distortion
//*  integer evaluated sad
//*
//* @remarks
//*  The implementation below always processes all 16 rows; it never reads
//*  i4_max_sad, so no early exit takes place.
//*
//******************************************************************************
//*/
    // Register usage
    //   x0 : pu1_src            x1 : pu1_dst
    //   w2 : src_strd           w3 : dst_strd
    //   x4 : i4_max_sad (not read)
    //   x5 : pi4_mb_distortion
    //   x6 : loop counter       v0-v7, v30 : scratch (all caller-saved)
    .global ime_compute_sad_16x16_av8
ime_compute_sad_16x16_av8:
    // Strides are 32-bit integers: widen before using them as 64-bit
    // post-index offsets.
    sxtw      x2, w2
    sxtw      x3, w3
    mov       x6, #4                    // 4 iterations x 4 rows = 16 rows
    movi      v30.8h, #0                // 8 x u16 running SAD

core_loop_ime_compute_sad_16x16_av8:

    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3

    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b

    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b

    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b

    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    subs      x6, x6, #1
    bne       core_loop_ime_compute_sad_16x16_av8

    // Horizontal reduction of the 8 lanes to one 32-bit sum.
    addp      v30.8h, v30.8h, v30.8h
    uaddlp    v30.4s, v30.8h
    addp      v30.2s, v30.2s, v30.2s

    st1       {v30.s}[0], [x5]
    ret


///*
////---------------------------------------------------------------------------
//// Function Name      : ime_calculate_sad4_prog_av8()
////
//// Detail Description : This function finds the sad values of 4 Progressive MBs
////                      at one shot: the blocks one pixel to the left and
////                      right of, and one row above and below, the block
////                      addressed by x0
////
//// Platform           : CortexA8/NEON .
////
////-----------------------------------------------------------------------------
//*/

    // Register usage
    //   x0 : centre of the four candidate blocks (presumably the reference
    //        pointer - confirm against the C prototype)
    //   x1 : block every candidate is compared with (presumably the source)
    //   w2 : stride of the x0 buffer       w3 : stride of the x1 buffer
    //   x4 : output, 4 words: [left, right, top, bottom]
    //   x5-x8 : candidate pointers         x9 : loop counter
    //   v0-v4, v28-v31 : scratch (all caller-saved)
    .global ime_calculate_sad4_prog_av8
ime_calculate_sad4_prog_av8:
    // Strides are 32-bit integers: sign-extend them before they take part
    // in 64-bit pointer arithmetic and post-index addressing.
    sxtw      x2, w2
    sxtw      x3, w3

    sub       x5, x0, #1                //left
    add       x6, x0, #1                //right
    sub       x7, x0, x2                //top
    add       x8, x0, x2                //bottom

    movi      v28.8h, #0
    movi      v29.8h, #0
    movi      v30.8h, #0
    movi      v31.8h, #0

    mov       x9, #16                   // one row per iteration
core_loop_ime_calculate_sad4_prog_av8:

    ld1       {v0.16b}, [x1], x3
    ld1       {v1.16b}, [x5], x2
    ld1       {v2.16b}, [x6], x2
    ld1       {v3.16b}, [x7], x2
    ld1       {v4.16b}, [x8], x2

    uabal     v28.8h, v0.8b, v1.8b
    uabal2    v28.8h, v0.16b, v1.16b
    uabal     v29.8h, v0.8b, v2.8b
    uabal2    v29.8h, v0.16b, v2.16b
    uabal     v30.8h, v0.8b, v3.8b
    uabal2    v30.8h, v0.16b, v3.16b
    uabal     v31.8h, v0.8b, v4.8b
    uabal2    v31.8h, v0.16b, v4.16b

    subs      x9, x9, #1
    bne       core_loop_ime_calculate_sad4_prog_av8

    // Pairwise reduction down to v28.4s = [left, right, top, bottom].
    addp      v28.8h, v28.8h, v29.8h
    addp      v30.8h, v30.8h, v31.8h

    uaddlp    v28.4s, v28.8h
    uaddlp    v30.4s, v30.8h

    addp      v28.4s, v28.4s, v30.4s
    st1       {v28.4s}, [x4]
    ret



//*****************************************************************************
//*
//* Function Name    : ime_compute_satqd_16x16_lumainter_av8
//* Description      : This function computes SAD for a 16x16 block.
//                   : It also computes if any 4x4 block will have a nonzero coefficient after transform and quant
//
//  Arguments        : x0 :pointer to src buffer
//                     x1 :pointer to est buffer
//                     w2 :source stride
//                     w3 :est stride
//                     x4 :Threshold pointer (nine 16-bit values are read)
//                     x5 :distortion (SAD) output, one 32-bit word
//                     x6 :is_nonzero output, one 32-bit word (0 or 1)
//*
//* Values Returned   : NONE
//*
//* Register Usage    : x0-x7, v0-v31
//* Stack Usage       : 64 bytes (d8-d15 are saved and restored)
//* Cycles            : Around
//* Interruptibility  : Interruptible
//*
//* Known Limitations
//*   \Assumptions    :
//*
//* Revision History  :
//*         DD MM YYYY    Author(s)          Changes
//*         14 04 2014    Harinarayanan K K  First version
//*
//*****************************************************************************
    .global ime_compute_satqd_16x16_lumainter_av8
ime_compute_satqd_16x16_lumainter_av8:
    //x0 :pointer to src buffer
    //x1 :pointer to est buffer
    //x2 :Source stride
    //x3 :Pred stride
    //x4 :Threshold pointer
    //x5 :Distortion,ie SAD
    //x6 :is nonzero
    //x7 :loop counter

    // v8-v15 are overwritten below, so their low halves are saved once
    // (the previous revision saved and restored them twice).
    stp       d8, d9, [sp, #-16]!
    stp       d10, d11, [sp, #-16]!
    stp       d12, d13, [sp, #-16]!
    stp       d14, d15, [sp, #-16]!

    // Strides are 32-bit integers: widen before using them as 64-bit
    // post-index offsets.
    sxtw      x2, w2
    sxtw      x3, w3

    ld1       {v30.8h}, [x4]            // thresholds 0..7

    dup       v20.4h, v30.h[1]          //ls1
    dup       v24.4h, v30.h[0]          //ls2
    dup       v21.4h, v30.h[5]          //ls3
    dup       v25.4h, v30.h[7]          //ls4
    dup       v22.4h, v30.h[3]          //ls5
    dup       v26.4h, v30.h[4]          //ls6
    dup       v23.4h, v30.h[6]          //ls7
    dup       v27.4h, v30.h[2]          //ls8

    // Pack two thresholds per 128-bit register: v20 = ls1|ls2, v21 = ls3|ls4,
    // v22 = ls5|ls6, v23 = ls7|ls8.
    mov       v20.d[1], v24.d[0]
    mov       v21.d[1], v25.d[0]
    mov       v22.d[1], v26.d[0]
    mov       v23.d[1], v27.d[0]

    add       x4, x4, #16
    ld1       {v29.h}[0], [x4]          // threshold 8, compared with the 4x4 SAD
    dup       v29.4h, v29.h[0]

    movi      v31.8h, #0                // running SAD

    mov       x7, #4                    // four bands of 4 rows
core_loop_satqd_ime_compute_satqd_16x16_lumainter:
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabdl     v10.8h, v0.8b, v1.8b
    uabdl2    v15.8h, v0.16b, v1.16b
    uabdl     v11.8h, v2.8b, v3.8b
    uabdl2    v16.8h, v2.16b, v3.16b
    uabdl     v12.8h, v4.8b, v5.8b
    uabdl2    v17.8h, v4.16b, v5.16b
    uabdl     v13.8h, v6.8b, v7.8b
    uabdl2    v18.8h, v6.16b, v7.16b

    add       v0.8h, v10.8h, v13.8h
    add       v1.8h, v11.8h, v12.8h
    add       v2.8h, v15.8h, v18.8h
    add       v3.8h, v16.8h, v17.8h

    //v0 : S1 S4 S4 S1 A1 A4 A4 A1
    //v1 : S2 S3 S3 S2 A2 A3 A3 A2
    //v2 : B1 B4 B4 B1 X1 X4 X4 X1
    //v3 : B3 B2 B2 B3 X3 X2 X2 X3

    trn1      v4.8h, v0.8h, v1.8h
    trn2      v5.8h, v0.8h, v1.8h
    trn1      v6.8h, v2.8h, v3.8h
    trn2      v7.8h, v2.8h, v3.8h

    trn1      v0.4s, v4.4s, v6.4s
    trn2      v2.4s, v4.4s, v6.4s
    trn1      v1.4s, v5.4s, v7.4s
    trn2      v3.4s, v5.4s, v7.4s

    add       v4.8h, v0.8h, v3.8h
    add       v5.8h, v1.8h, v2.8h
    //v4 : S1 S2 B1 B2 A1 A2 X1 X2
    //v5 : S4 S3 B4 B3 A4 A3 X4 X3

    //compute sad for each 4x4 block
    add       v6.8h, v4.8h, v5.8h
    addp      v19.8h, v6.8h, v6.8h
    //duplicate the sad into 128 bit so that we can compare using 128bit
    // (the 64-bit add also clears the upper half of v31)
    add       v31.4h, v31.4h, v19.4h

    //sad_2 = sad_1<<1;
    shl       v28.8h, v19.8h, #1

    //sad_2 - pu2_thrsh
    sub       v24.8h, v28.8h, v20.8h
    sub       v25.8h, v28.8h, v21.8h
    sub       v26.8h, v28.8h, v22.8h
    sub       v27.8h, v28.8h, v23.8h

    trn1      v0.4s, v4.4s, v5.4s
    trn2      v1.4s, v4.4s, v5.4s
    //v0 : S1 S2 S4 S3 A1 A2 A4 A3
    //v1 : B1 B2 B4 B3 X1 X2 X4 X3

    trn1      v4.8h, v0.8h, v1.8h
    trn2      v5.8h, v0.8h, v1.8h
    //v4 : S1 B1 S4 B4 A1 X1 A4 X4
    //v5 : S2 B2 S3 B3 A2 X2 A3 X3

    mov       v7.s[0], v4.s[1]
    mov       v7.s[1], v4.s[3]
    mov       v6.s[0], v5.s[1]          // V4 //S1 B1 A1 X1
    mov       v6.s[1], v5.s[3]          // V5 //S2 B2 A2 X2
    mov       v4.s[1], v4.s[2]          // V6 //S3 B3 A3 X3
    mov       v5.s[1], v5.s[2]          // V7 //S4 B4 A4 X4

    shl       v0.4h, v4.4h, #1          //S1<<1
    shl       v1.4h, v5.4h, #1          //S2<<1
    shl       v2.4h, v6.4h, #1          //S3<<1
    shl       v3.4h, v7.4h, #1          //S4<<1

    add       v8.4h, v5.4h, v6.4h       //(s2[j] + s3[j]))
    add       v9.4h, v4.4h, v7.4h       //(s1[j] + s4[j]))
    add       v10.4h, v6.4h, v7.4h      //(s3[j] + s4[j]))
    sub       v11.4h, v6.4h, v0.4h      //(s3[j] - (s1[j]<<1))
    sub       v12.4h, v7.4h, v1.4h      //(s4[j] - (s2[j]<<1))
    add       v13.4h, v4.4h, v5.4h      //(s1[j] + s2[j]))
    sub       v14.4h, v5.4h, v3.4h      //(s2[j] - (s4[j]<<1)))
    sub       v15.4h, v4.4h, v2.4h      //(s1[j] - (s3[j]<<1)))

    mov       v8.d[1], v9.d[0]
    mov       v10.d[1], v11.d[0]
    mov       v12.d[1], v13.d[0]
    mov       v14.d[1], v15.d[0]

    cmge      v0.8h, v24.8h, v8.8h      //ls1 ls2
    cmge      v1.8h, v25.8h, v10.8h     //ls3 ls4
    cmge      v2.8h, v26.8h, v12.8h     //ls5 ls6
    cmge      v3.8h, v27.8h, v14.8h     //ls7 ls8
    cmge      v4.4h, v19.4h, v29.4h     //sad

    orr       v0.16b, v0.16b, v1.16b
    orr       v2.16b, v2.16b, v3.16b
    orr       v2.16b, v0.16b, v2.16b
    xtn       v2.8b, v2.8h
    orr       v2.8b, v2.8b, v4.8b

    //if the comparison is non zero, out
    // x4 keeps the combined mask; it is tested again at satdq_end_func.
    mov       x4, v2.d[0]
    cmp       x4, #0
    bne       core_loop_compute_sad_pre

    subs      x7, x7, #1
    bne       core_loop_satqd_ime_compute_satqd_16x16_lumainter
    b         satdq_end_func


    // Plain SAD over the remaining 4-row bands once a band has been flagged.
core_loop_compute_sad:
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3

    uabal     v31.8h, v0.8b, v1.8b
    uabal2    v31.8h, v0.16b, v1.16b

    uabal     v31.8h, v2.8b, v3.8b
    uabal2    v31.8h, v2.16b, v3.16b

    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabal     v31.8h, v4.8b, v5.8b
    uabal2    v31.8h, v4.16b, v5.16b

    uabal     v31.8h, v6.8b, v7.8b
    uabal2    v31.8h, v6.16b, v7.16b

core_loop_compute_sad_pre:
    subs      x7, x7, #1                // account for the band just processed
    bne       core_loop_compute_sad

satdq_end_func:

    // is_nonzero = (mask != 0) ? 1 : 0
    cmp       x4, #0
    cset      w7, ne
    str       w7, [x6]

    addp      v31.8h, v31.8h, v31.8h
    uaddlp    v31.4s, v31.8h
    addp      v31.2s, v31.2s, v31.2s
    st1       {v31.s}[0], [x5]


    ldp       d14, d15, [sp], #16
    ldp       d12, d13, [sp], #16
    ldp       d10, d11, [sp], #16
    ldp       d8, d9, [sp], #16
    ret