//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

///**
//******************************************************************************
//*
//* @brief :Evaluate best intra 16x16 mode (among VERT, HORZ and DC)
//*         and do the prediction.
//*
//* @par Description
//*  Computes the SAD of the 16x16 source block against the vertical,
//*  horizontal and DC predictions, picks the mode with the smallest SAD and
//*  writes that prediction to the destination buffer.
//*
//*  Syntax: GNU as, AArch64.  ABI: AAPCS64.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_ngbr_pels_i16
//*  UWORD8 pointer to neighbouring pels.  As read by this routine:
//*  bytes 0..15 are the left column, stored bottom-up (byte 15 pairs with
//*  row 0, byte 0 with row 15); bytes 17..32 are the top row.  Byte 16 is not
//*  read here (presumably the top-left pel - confirm against the caller).
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination (16 rows of 16 bytes are written)
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] u4_n_avblty
//*  availability of neighbouring pixels: bit 0 = left, bit 2 = top.
//*  Only these two bits are examined, and only for the DC value.
//*
//* @param[out] u4_intra_mode
//*  Pointer to the variable in which best mode is returned
//*  (0 = vertical, 1 = horizontal, 2 = DC)
//*
//* @param[out] pu4_sadmin
//*  Pointer to the variable in which minimum sad is returned
//*
//* @param[in] u4_valid_intra_modes
//*  Says what all modes are valid: bit 0 = vertical, bit 1 = horizontal,
//*  bit 2 = DC.  The SAD of a mode that is not valid is replaced by 1 << 30
//*  before the comparison.  If no mode is valid, vertical is selected and
//*  1 << 30 is stored as the SAD.
//*
//* @remarks
//*  Ties are resolved in the order vertical, horizontal, DC.
//*
//* @return      none
//*
//******************************************************************************
//*/
//
//void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
//                                      UWORD8 *pu1_ngbr_pels_i16,
//                                      UWORD8 *pu1_dst,
//                                      UWORD32 src_strd,
//                                      UWORD32 dst_strd,
//                                      WORD32 u4_n_avblty,
//                                      UWORD32 *u4_intra_mode,
//                                      WORD32 *pu4_sadmin,
//                                      UWORD32 u4_valid_intra_modes)
//
.text
.p2align 2
.include "ih264_neon_macros.s"

//------------------------------------------------------------------------------
// i16x16_sad_row op, lo, hi, lane
//
// Adds one 16-pixel source row to the three running SADs.
//   op   : uabdl for the first row (initialises the accumulators),
//          uabal for every following row
//   lo   : register holding source pixels 0..7  of the row (as vN.8b)
//   hi   : register holding source pixels 8..15 of the row (as vN.8b)
//   lane : byte index in v9 of the left neighbour belonging to this row
//
// Fixed register roles:
//   v9        left neighbour column          v10/v11  top neighbour row
//   v30       DC value in every byte         v20      scratch (left pel splat)
//   v16/v18   vertical SAD accumulators      v26/v28  horizontal SAD accumulators
//   v22/v24   DC SAD accumulators
// Every 16-bit accumulator lane gathers at most 16 * 255 = 4080.
//------------------------------------------------------------------------------
.macro i16x16_sad_row op, lo, hi, lane
    dup       v20.8b, v9.b[\lane]

    \op       v16.8h, \lo, v10.8b
    \op       v18.8h, \hi, v11.8b

    \op       v26.8h, \lo, v20.8b
    \op       v28.8h, \hi, v20.8b

    \op       v22.8h, \lo, v30.8b
    \op       v24.8h, \hi, v30.8b
.endm

.global ih264e_evaluate_intra16x16_modes_av8

ih264e_evaluate_intra16x16_modes_av8:

//x0  = pu1_src,
//x1  = pu1_ngbr_pels_i16,
//x2  = pu1_dst,
//w3  = src_strd,
//w4  = dst_strd,
//w5  = u4_n_avblty,
//x6  = u4_intra_mode,
//x7  = pu4_sadmin
//[sp] = u4_valid_intra_modes
//
//scratch: x8 = vertical SAD, x9 = horizontal SAD, x10 = DC rounding term and
//         later DC SAD, x11 = DC shift and later the "invalid" SAD, x12 = temp,
//         x16 = u4_valid_intra_modes

    // The ninth argument arrives on the stack.  Fetch it before anything is
    // pushed so the offset does not depend on how many bytes push_v_regs
    // (defined in ih264_neon_macros.s) moves sp by.  Only bits 0..2 of the
    // loaded value are looked at.
    ldr       x16, [sp]

    // v9-v11 are used below and their low halves are callee-saved under
    // AAPCS64; push_v_regs / pop_v_regs are expected to preserve them.
    push_v_regs

    // The strides are 32-bit arguments: AAPCS64 leaves bits 63..32 of the
    // argument registers unspecified, so widen before using them as
    // 64-bit post-index increments.
    uxtw      x3, w3
    uxtw      x4, w4

    //--------------------------------------------------------------------------
    // DC value
    //   both edges : (sum(left) + sum(top) + 16) >> 5
    //   one edge   : (sum(edge) + 8) >> 4
    //   no edge    : 128
    //--------------------------------------------------------------------------
    movi      v0.16b, #0                // left column, stays 0 if unavailable
    movi      v1.16b, #0                // top row, stays 0 if unavailable
    mov       w10, #0                   // rounding term: 8 per available edge
    mov       w11, #3                   // shift: 3 + number of available edges

    tbz       w5, #0, top_available     // LEFT NOT AVAILABLE
    ld1       {v0.16b}, [x1]
    add       w10, w10, #8
    add       w11, w11, #1
top_available:                          // (despite the name: test the top edge)
    tbz       w5, #2, none_available    // TOP NOT AVAILABLE
    add       x12, x1, #17
    ld1       {v1.16b}, [x12]
    add       w10, w10, #8
    add       w11, w11, #1
    b         summation
none_available:
    // Top is missing.  w10 is non-zero exactly when the left edge was loaded.
    cbnz      w10, summation
    movi      v30.16b, #0x80            // neither edge: DC = 128
    b         sad_comp
summation:
    uaddl     v2.8h, v0.8b, v1.8b
    uaddl2    v3.8h, v0.16b, v1.16b
    add       v2.8h, v2.8h, v3.8h
    addv      h2, v2.8h                 // <= 32 * 255, fits in 16 bits
    umov      w12, v2.h[0]
    add       w12, w12, w10
    lsr       w12, w12, w11
    dup       v30.16b, w12

    //--------------------------------------------------------------------------
    // SAD of the source block against all three predictions.
    // Source rows are fetched three rows ahead of their use, rotating through
    // v0/v1, v2/v3, v4/v5 and v6/v7.
    //--------------------------------------------------------------------------
sad_comp:
    add       x12, x1, #17
    ld1       {v10.8b, v11.8b}, [x12]   // top row    -> vertical prediction
    ld1       {v9.16b}, [x1]            // left column -> horizontal prediction

    ld1       {v0.8b, v1.8b}, [x0], x3  // row 0
    ld1       {v2.8b, v3.8b}, [x0], x3  // row 1
    ld1       {v4.8b, v5.8b}, [x0], x3  // row 2
    ld1       {v6.8b, v7.8b}, [x0], x3  // row 3

    i16x16_sad_row uabdl, v0.8b, v1.8b, 15      // row 0
    i16x16_sad_row uabal, v2.8b, v3.8b, 14      // row 1
    ld1       {v0.8b, v1.8b}, [x0], x3  // row 4
    i16x16_sad_row uabal, v4.8b, v5.8b, 13      // row 2
    ld1       {v2.8b, v3.8b}, [x0], x3  // row 5
    i16x16_sad_row uabal, v6.8b, v7.8b, 12      // row 3
    ld1       {v4.8b, v5.8b}, [x0], x3  // row 6

    i16x16_sad_row uabal, v0.8b, v1.8b, 11      // row 4
    ld1       {v6.8b, v7.8b}, [x0], x3  // row 7
    i16x16_sad_row uabal, v2.8b, v3.8b, 10      // row 5
    ld1       {v0.8b, v1.8b}, [x0], x3  // row 8
    i16x16_sad_row uabal, v4.8b, v5.8b, 9       // row 6
    ld1       {v2.8b, v3.8b}, [x0], x3  // row 9
    i16x16_sad_row uabal, v6.8b, v7.8b, 8       // row 7
    ld1       {v4.8b, v5.8b}, [x0], x3  // row 10

    i16x16_sad_row uabal, v0.8b, v1.8b, 7       // row 8
    ld1       {v6.8b, v7.8b}, [x0], x3  // row 11
    i16x16_sad_row uabal, v2.8b, v3.8b, 6       // row 9
    ld1       {v0.8b, v1.8b}, [x0], x3  // row 12
    i16x16_sad_row uabal, v4.8b, v5.8b, 5       // row 10
    ld1       {v2.8b, v3.8b}, [x0], x3  // row 13
    i16x16_sad_row uabal, v6.8b, v7.8b, 4       // row 11
    ld1       {v4.8b, v5.8b}, [x0], x3  // row 14

    i16x16_sad_row uabal, v0.8b, v1.8b, 3       // row 12
    ld1       {v6.8b, v7.8b}, [x0], x3  // row 15
    i16x16_sad_row uabal, v2.8b, v3.8b, 2       // row 13
    i16x16_sad_row uabal, v4.8b, v5.8b, 1       // row 14
    i16x16_sad_row uabal, v6.8b, v7.8b, 0       // row 15

    //--------------------------------------------------------------------------
    // Reduce each accumulator pair to a scalar.  Merging a pair gives at most
    // 8160 per lane; uaddlv then sums the eight lanes into 32 bits.
    //--------------------------------------------------------------------------
    add       v16.8h, v16.8h, v18.8h
    uaddlv    s16, v16.8h
    umov      w8, v16.s[0]              // vertical SAD

    add       v26.8h, v26.8h, v28.8h
    uaddlv    s26, v26.8h
    umov      w9, v26.s[0]              // horizontal SAD

    add       v22.8h, v22.8h, v24.8h
    uaddlv    s22, v22.8h
    umov      w10, v22.s[0]             // DC SAD

    //--------------------------------------------------------------------------
    // A mode that is not flagged valid gets a SAD of 1 << 30.
    //--------------------------------------------------------------------------
    mov       x11, #0x40000000

    tst       x16, #0x01                // vert mode valid?
    csel      x8, x11, x8, eq

    tst       x16, #0x02                // horz mode valid?
    csel      x9, x11, x9, eq

    tst       x16, #0x04                // dc mode valid?
    csel      x10, x11, x10, eq

    //--------------------------------------------------------------------------
    // Pick the minimum.  Ties go to vertical, then horizontal, then DC.
    //--------------------------------------------------------------------------
    cmp       x8, x9
    bgt       not_vert
    cmp       x8, x10
    bgt       do_dc

    //DO VERTICAL PREDICTION
    str       w8, [x7]                  // MIN SAD
    str       wzr, [x6]                 // MODE = 0
    add       x12, x1, #17
    ld1       {v30.16b}, [x12]          // every output row is the top row
    b         do_dc_vert

not_vert:
    cmp       x9, x10
    bgt       do_dc

    //DO HORIZONTAL PREDICTION
    str       w9, [x7]                  // MIN SAD
    mov       w9, #1
    str       w9, [x6]                  // MODE = 1

    // v9 still holds the left column; row r is filled with byte 15 - r.
    .irp lane, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
    dup       v0.16b, v9.b[\lane]
    st1       {v0.16b}, [x2], x4
    .endr

    b         end_func

do_dc:
    //DO DC PREDICTION
    str       w10, [x7]                 // MIN SAD
    mov       w10, #2
    str       w10, [x6]                 // MODE = 2
do_dc_vert:
    // Shared store loop: v30 is either the DC splat or the top row.
    .rept 16
    st1       {v30.16b}, [x2], x4
    .endr

end_func:
    pop_v_regs
    ret