//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

///**
//******************************************************************************
//*
//* @brief :Evaluate best intra chroma mode (among VERT, HORZ and DC)
//*  and do the prediction.
//*
//* @par Description
//*  This function evaluates the first three intra chroma modes, computes the
//*  corresponding SADs and writes the block predicted with the best mode to
//*  the destination buffer.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_ngbr_pels
//*  UWORD8 pointer to neighbouring pels
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] u4_n_avblty
//*  availability of neighbouring pixels
//*
//* @param[out] u4_intra_mode
//*  Pointer to the variable in which best mode is returned
//*
//* @param[out] pu4_sadmin
//*  Pointer to the variable in which minimum sad is returned
//*
//* @param[in] u4_valid_intra_modes
//*  Says what all modes are valid
//*
//* @return      none
//*
//* @remarks
//*  - Eight rows of 16 bytes are read from pu1_src and written to pu1_dst.
//*  - Neighbour buffer as read by this routine:
//*      bytes  0..15 : left neighbours, one halfword per row, stored in
//*                     reverse row order (halfword 7 predicts row 0,
//*                     halfword 0 predicts row 7)
//*      bytes 16..17 : not read here
//*      bytes 18..33 : top neighbours (one 16 byte row)
//*    Each halfword is presumably one interleaved U/V sample pair - confirm
//*    against the caller.
//*  - u4_n_avblty     : only bit 0 (left) and bit 2 (top) are examined and
//*                      they only select how the DC predictor is built.
//*  - u4_valid_intra_modes : bit 0 = DC, bit 1 = HORZ, bit 2 = VERT. The SAD
//*                      of a mode whose bit is clear is replaced by 1 << 30.
//*  - Mode written to *u4_intra_mode : 0 = DC, 1 = HORZ, 2 = VERT. On equal
//*    SAD, DC is preferred over HORZ and HORZ over VERT.
//*  - *pu4_sadmin is overwritten unconditionally with the winning SAD.
//*
//******************************************************************************
//*/
//
//void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
//                                        UWORD8 *pu1_ngbr_pels_i16,
//                                        UWORD8 *pu1_dst,
//                                        UWORD32 src_strd,
//                                        UWORD32 dst_strd,
//                                        WORD32 u4_n_avblty,
//                                        UWORD32 *u4_intra_mode,
//                                        WORD32 *pu4_sadmin,
//                                        UWORD32 u4_valid_intra_modes)
//
.text
.p2align 2
.include "ih264_neon_macros.s"

    .global ih264e_evaluate_intra_chroma_modes_av8

ih264e_evaluate_intra_chroma_modes_av8:

//------------------------------------------------------------------------------
// Register usage
//   x0  = pu1_src              (post-incremented by src_strd per row)
//   x1  = pu1_ngbr_pels_i16
//   x2  = pu1_dst              (post-incremented by dst_strd per row)
//   x3  = src_strd             (sign-extended from w3)
//   x4  = dst_strd             (sign-extended from w4)
//   w5  = u4_n_avblty
//   x6  = u4_intra_mode
//   x7  = pu4_sadmin
//   x8  = VERT SAD, x9 = HORZ SAD, x10 = DC SAD
//   x11 = SAD substituted for a mode that is not allowed (1 << 30)
//   x12 = address of the top neighbours (x1 + 18)
//   w13 = scratch
//   w16 = u4_valid_intra_modes (ninth argument, passed on the stack)
//
//   v28/v29 = DC predictor for rows 0..3 (left / right 8 bytes)
//   v30/v31 = DC predictor for rows 4..7 (left / right 8 bytes)
//
// Only caller-saved general purpose registers are written. In particular the
// platform register x18 is left untouched.
//------------------------------------------------------------------------------

    push_v_regs                           // v10-v15 are written below
    // NOTE(review): push_v_regs is defined in ih264_neon_macros.s. The offset
    // below assumes it lowers sp by 64 bytes, exactly as the previous
    // "[sp, #80]" following an additional 16 byte push did - confirm if the
    // macro ever changes.
    ldr       w16, [sp, #64]              // w16 = u4_valid_intra_modes
    sxtw      x3, w3
    sxtw      x4, w4
    add       x12, x1, #18                // x12 -> top neighbours

    // Keep only the left (bit 0) and top (bit 2) availability flags. 5 cannot
    // be encoded as a logical immediate, hence the scratch register.
    mov       w13, #5
    ands      w13, w5, w13
    beq       none_available
    cmp       w13, #1
    beq       left_only_available
    cmp       w13, #4
    beq       top_only_available

//------------------------------------------------------------------------------
// DC predictor construction.
// After uxtl every 32 bit lane holds one widened pair of samples. Two addp
// passes on .4s lanes leave, in lane 0, the sum of four pairs with the two
// components still separated in the low and high halfword (each sum is at
// most 4 * 255, so the halves never carry into each other). rshrn then gives
// the rounded average as two adjacent bytes, i.e. one halfword to replicate.
//------------------------------------------------------------------------------
all_available:
    ld1       {v0.8b, v1.8b}, [x1]        // v0 = left rows 7..4, v1 = left rows 3..0
    ld1       {v2.8b, v3.8b}, [x12]       // v2 = top bytes 0..7, v3 = top bytes 8..15
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s         // v0 = sum of left rows 4..7
    addp      v1.4s, v1.4s, v1.4s         // v1 = sum of left rows 0..3
    uxtl      v2.8h, v2.8b
    uxtl      v3.8h, v3.8b
    addp      v2.4s, v2.4s, v2.4s
    addp      v3.4s, v3.4s, v3.4s
    addp      v2.4s, v2.4s, v2.4s         // v2 = sum of top left half
    addp      v3.4s, v3.4s, v3.4s         // v3 = sum of top right half
    rshrn     v5.8b, v0.8h, #2
    dup       v21.8h, v5.h[0]             // bottom-left  : left rows 4..7 only
    rshrn     v6.8b, v3.8h, #2
    dup       v20.8h, v6.h[0]             // top-right    : top right half only
    add       v1.8h, v1.8h, v2.8h
    rshrn     v1.8b, v1.8h, #3
    dup       v23.8h, v1.h[0]             // top-left     : left rows 0..3 + top left
    mov       v20.d[0], v23.d[0]          // v20 = {top-left, top-right}
    add       v0.8h, v0.8h, v3.8h
    rshrn     v0.8b, v0.8h, #3
    dup       v23.8h, v0.h[0]             // bottom-right : left rows 4..7 + top right
    mov       v31.d[0], v23.d[0]
    mov       v28.d[0], v20.d[0]
    mov       v29.d[0], v20.d[1]
    mov       v30.d[0], v21.d[0]
    b         sad_comp

left_only_available:
    ld1       {v0.8b, v1.8b}, [x1]        // v0 = left rows 7..4, v1 = left rows 3..0
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2

    dup       v28.8h, v1.h[0]             // rows 0..3 use the upper left average
    dup       v29.8h, v1.h[0]
    dup       v30.8h, v0.h[0]             // rows 4..7 use the lower left average
    dup       v31.8h, v0.h[0]
    b         sad_comp

top_only_available:
    ld1       {v0.8b, v1.8b}, [x12]       // v0 = top bytes 0..7, v1 = top bytes 8..15
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v28.8h, v0.h[0]             // left half  : top left average
    dup       v30.8h, v1.h[0]             // (temporarily) top right average
    mov       v29.d[0], v30.d[1]          // right half : top right average
    mov       v30.d[0], v28.d[0]          // rows 4..7 repeat rows 0..3
    mov       v31.d[0], v30.d[1]
    b         sad_comp

none_available:
    mov       w13, #128                   // no neighbours: predictor is 128
    dup       v28.16b, w13
    dup       v29.16b, w13
    dup       v30.16b, w13
    dup       v31.16b, w13

//------------------------------------------------------------------------------
// SAD of the source block against the three candidate predictions.
//   v16/v18 accumulate VERT, v26/v14 accumulate HORZ, v22/v24 accumulate DC
//   (first register = left 8 bytes of the row, second = right 8 bytes).
// All three SADs are computed regardless of availability; modes that must not
// win are masked out afterwards through u4_valid_intra_modes.
// Loads are interleaved with the arithmetic of the previous row.
//------------------------------------------------------------------------------
sad_comp:
    ld1       {v10.8b, v11.8b}, [x12]     // VERT predictor (top row)

    ld1       {v27.8h}, [x1]              // left column, reverse row order

    dup       v20.8h, v27.h[7]            // HORZ predictor, row 0
    dup       v21.8h, v27.h[7]

    ld1       {v0.8b, v1.8b}, [x0], x3    // source row 0

    // VERT row 0 (uabdl initialises the accumulators)
    uabdl     v16.8h, v0.8b, v10.8b
    uabdl     v18.8h, v1.8b, v11.8b

    // HORZ row 0
    uabdl     v26.8h, v0.8b, v20.8b
    uabdl     v14.8h, v1.8b, v21.8b

    ld1       {v2.8b, v3.8b}, [x0], x3    // source row 1

    // DC row 0
    uabdl     v22.8h, v0.8b, v28.8b
    uabdl     v24.8h, v1.8b, v29.8b

    dup       v20.8h, v27.h[6]            // HORZ predictor, row 1
    dup       v21.8h, v27.h[6]

    // VERT row 1
    uabal     v16.8h, v2.8b, v10.8b
    uabal     v18.8h, v3.8b, v11.8b

    ld1       {v4.8b, v5.8b}, [x0], x3    // source row 2

    // HORZ row 1
    uabal     v26.8h, v2.8b, v20.8b
    uabal     v14.8h, v3.8b, v21.8b

    // DC row 1
    uabal     v22.8h, v2.8b, v28.8b
    uabal     v24.8h, v3.8b, v29.8b

    dup       v20.8h, v27.h[5]            // HORZ predictor, row 2
    dup       v21.8h, v27.h[5]

    // VERT row 2
    uabal     v16.8h, v4.8b, v10.8b
    uabal     v18.8h, v5.8b, v11.8b

    ld1       {v6.8b, v7.8b}, [x0], x3    // source row 3

    // HORZ row 2
    uabal     v26.8h, v4.8b, v20.8b
    uabal     v14.8h, v5.8b, v21.8b

    // DC row 2
    uabal     v22.8h, v4.8b, v28.8b
    uabal     v24.8h, v5.8b, v29.8b

    dup       v20.8h, v27.h[4]            // HORZ predictor, row 3
    dup       v21.8h, v27.h[4]

    // VERT row 3
    uabal     v16.8h, v6.8b, v10.8b
    uabal     v18.8h, v7.8b, v11.8b

    // HORZ row 3
    uabal     v26.8h, v6.8b, v20.8b
    uabal     v14.8h, v7.8b, v21.8b

    // DC row 3
    uabal     v22.8h, v6.8b, v28.8b
    uabal     v24.8h, v7.8b, v29.8b

    //-------------------------------------------------------------------------
    // Rows 4..7: the DC predictor switches to v30/v31.
    //-------------------------------------------------------------------------
    ld1       {v0.8b, v1.8b}, [x0], x3    // source row 4

    dup       v20.8h, v27.h[3]            // HORZ predictor, row 4
    dup       v21.8h, v27.h[3]

    // VERT row 4
    uabal     v16.8h, v0.8b, v10.8b
    uabal     v18.8h, v1.8b, v11.8b

    // HORZ row 4
    uabal     v26.8h, v0.8b, v20.8b
    uabal     v14.8h, v1.8b, v21.8b

    ld1       {v2.8b, v3.8b}, [x0], x3    // source row 5

    // DC row 4
    uabal     v22.8h, v0.8b, v30.8b
    uabal     v24.8h, v1.8b, v31.8b

    dup       v20.8h, v27.h[2]            // HORZ predictor, row 5
    dup       v21.8h, v27.h[2]

    // VERT row 5
    uabal     v16.8h, v2.8b, v10.8b
    uabal     v18.8h, v3.8b, v11.8b

    // HORZ row 5
    uabal     v26.8h, v2.8b, v20.8b
    uabal     v14.8h, v3.8b, v21.8b

    ld1       {v4.8b, v5.8b}, [x0], x3    // source row 6

    // DC row 5
    uabal     v22.8h, v2.8b, v30.8b
    uabal     v24.8h, v3.8b, v31.8b

    dup       v20.8h, v27.h[1]            // HORZ predictor, row 6
    dup       v21.8h, v27.h[1]

    // VERT row 6
    uabal     v16.8h, v4.8b, v10.8b
    uabal     v18.8h, v5.8b, v11.8b

    // HORZ row 6
    uabal     v26.8h, v4.8b, v20.8b
    uabal     v14.8h, v5.8b, v21.8b

    ld1       {v6.8b, v7.8b}, [x0], x3    // source row 7

    // DC row 6
    uabal     v22.8h, v4.8b, v30.8b
    uabal     v24.8h, v5.8b, v31.8b

    dup       v20.8h, v27.h[0]            // HORZ predictor, row 7
    dup       v21.8h, v27.h[0]

    // VERT row 7
    uabal     v16.8h, v6.8b, v10.8b
    uabal     v18.8h, v7.8b, v11.8b

    // HORZ row 7
    uabal     v26.8h, v6.8b, v20.8b
    uabal     v14.8h, v7.8b, v21.8b

    // DC row 7
    uabal     v22.8h, v6.8b, v30.8b
    uabal     v24.8h, v7.8b, v31.8b

//------------------------------------------------------------------------------
// Horizontal reduction of the three accumulator pairs to scalars.
// Each halfword lane holds at most 8 * 255, so the two 16 bit adds below
// cannot overflow before uaddlp widens to 32 bits.
//------------------------------------------------------------------------------
    // VERT sum -> x8
    add       v16.8h, v16.8h, v18.8h
    mov       v18.d[0], v16.d[1]
    add       v16.4h, v16.4h, v18.4h
    uaddlp    v16.2s, v16.4h
    addp      v16.2s, v16.2s, v16.2s
    smov      x8, v16.s[0]

    // HORZ sum -> x9
    add       v26.8h, v26.8h, v14.8h
    mov       v14.d[0], v26.d[1]
    add       v26.4h, v26.4h, v14.4h
    uaddlp    v26.2s, v26.4h
    addp      v26.2s, v26.2s, v26.2s
    smov      x9, v26.s[0]

    // DC sum -> x10
    add       v24.8h, v22.8h, v24.8h
    mov       v25.d[0], v24.d[1]
    add       v24.4h, v24.4h, v25.4h
    uaddlp    v24.2s, v24.4h
    addp      v24.2s, v24.2s, v24.2s
    smov      x10, v24.s[0]

//------------------------------------------------------------------------------
// A mode whose bit is clear in u4_valid_intra_modes gets a SAD of 1 << 30,
// larger than any SAD this routine can produce, so it cannot beat a valid one.
//------------------------------------------------------------------------------
    mov       x11, #0x40000000            // 1 << 30

    tst       w16, #4                     // VERT allowed?
    csel      x8, x11, x8, eq

    tst       w16, #2                     // HORZ allowed?
    csel      x9, x11, x9, eq

    tst       w16, #1                     // DC allowed?
    csel      x10, x11, x10, eq

//------------------------------------------------------------------------------
// Pick the smallest SAD. The strict "greater than" branches make DC win a tie
// against HORZ or VERT, and HORZ win a tie against VERT.
//------------------------------------------------------------------------------
    cmp       x10, x9
    bgt       not_dc
    cmp       x10, x8
    bgt       do_vert

    // DC prediction: v28-v31 already hold the predictor
    str       w10, [x7]                   // *pu4_sadmin    = DC SAD
    str       wzr, [x6]                   // *u4_intra_mode = 0 (DC)
    b         do_dc_vert

not_dc:
    cmp       x9, x8
    bgt       do_vert

    // HORZ prediction: every row is its left neighbour halfword replicated
    str       w9, [x7]                    // *pu4_sadmin    = HORZ SAD
    mov       w13, #1
    str       w13, [x6]                   // *u4_intra_mode = 1 (HORZ)
    ld1       {v0.8h}, [x1]

    dup       v10.8h, v0.h[7]             // row 0
    dup       v11.8h, v0.h[6]             // row 1
    dup       v12.8h, v0.h[5]             // row 2
    dup       v13.8h, v0.h[4]             // row 3
    st1       {v10.8h}, [x2], x4
    dup       v14.8h, v0.h[3]             // row 4
    st1       {v11.8h}, [x2], x4
    dup       v15.8h, v0.h[2]             // row 5
    st1       {v12.8h}, [x2], x4
    dup       v16.8h, v0.h[1]             // row 6
    st1       {v13.8h}, [x2], x4
    dup       v17.8h, v0.h[0]             // row 7
    st1       {v14.8h}, [x2], x4
    st1       {v15.8h}, [x2], x4
    st1       {v16.8h}, [x2], x4
    st1       {v17.8h}, [x2], x4

    b         end_func

do_vert:
    // VERT prediction: load the top row into both predictor pairs so the
    // store sequence shared with DC can be reused.
    str       w8, [x7]                    // *pu4_sadmin    = VERT SAD
    mov       w13, #2
    str       w13, [x6]                   // *u4_intra_mode = 2 (VERT)
    ld1       {v28.8b, v29.8b}, [x12]
    ld1       {v30.8b, v31.8b}, [x12]

do_dc_vert:
    st1       {v28.2s, v29.2s}, [x2], x4  // row 0
    st1       {v28.2s, v29.2s}, [x2], x4  // row 1
    st1       {v28.2s, v29.2s}, [x2], x4  // row 2
    st1       {v28.2s, v29.2s}, [x2], x4  // row 3
    st1       {v30.2s, v31.2s}, [x2], x4  // row 4
    st1       {v30.2s, v31.2s}, [x2], x4  // row 5
    st1       {v30.2s, v31.2s}, [x2], x4  // row 6
    st1       {v30.2s, v31.2s}, [x2], x4  // row 7

end_func:
    pop_v_regs
    ret