///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_filters_dc.s
//*
//* @brief
//*  contains function definitions for intra prediction dc filtering.
//*  functions are written in AArch64 (ARMv8) NEON assembly
//*
//* @author
//*  akshaya mukund
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_dc_av8
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  luma intraprediction filter for dc input
//*
//* @par description:
//*  dc_val = (nt + sum(pu1_ref[nt .. 2nt-1]) + sum(pu1_ref[2nt+1 .. 3nt]))
//*                                                        >> (log2(nt) + 1)
//*  nt == 32     : every output byte is dc_val
//*  nt == 4/8/16 : dst[0][0]   = (ref[2nt-1] + 2*dc_val + ref[2nt+1] + 2) >> 2
//*                 dst[0][col] = (ref[2nt+1+col] + 3*dc_val + 2) >> 2
//*                 dst[row][0] = (ref[2nt-1-row] + 3*dc_val + 2) >> 2
//*                 everything else is dc_val
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride (not read by this routine)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  size of tranform block; the code has dedicated paths for 4 and 32 and
//*  steps in units of 8 otherwise
//*
//* @param[in] mode
//*  type of filtering (not read by this routine; x5 is reused as scratch)
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
//                              word32 src_strd,
//                              uword8 *pu1_dst,
//                              word32 dst_strd,
//                              word32 nt,
//                              word32 mode)
//
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode
//
//registers written: x0, x2-x12, x14, x20 (x19/x20 are saved and restored),
//                   v0-v7, v16-v30, condition flags

.text
.align 4
.include "ihevc_neon_macros.s"


.globl ihevc_intra_pred_luma_dc_av8

.type ihevc_intra_pred_luma_dc_av8, %function

ihevc_intra_pred_luma_dc_av8:

    stp         x19, x20, [sp, #-16]!       //x20 is used as scratch below

    //nt and dst_strd are 32-bit arguments; the upper halves of x4/x3 are not
    //guaranteed by the procedure call standard, and both registers are used
    //as 64-bit operands (address arithmetic, post-index strides) below.
    sxtw        x3, w3                      //dst_strd
    sxtw        x4, w4                      //nt

    mov         x11, #2                     //rounding constant (added to 2dc_val & 3dc_val terms)
    mov         x9, #0
    mov         v17.s[0], w11
    mov         v17.s[1], w9                //d17 = 2

    clz         w5, w4

    add         x6, x0, x4                  //&src[nt]
    sub         x20, x5, #32                //clz(nt) - 32
    neg         x5, x20                     //32 - clz(nt) = log2nt + 1
    add         x7, x0, x4, lsl #1          //&src[2nt]

    add         x8, x7, #1                  //&src[2nt+1]
    mvn         x5, x5
    add         x5, x5, #1                  //-(log2nt + 1)
    dup         v7.2s, w5

    ldrb        w14, [x8]                   //src[2nt+1]
    sxtw        x14, w14
    shl         d7, d7, #32

    sub         x9, x7, #1                  //&src[2nt-1]
    sshr        d7, d7, #32                 //d7 = sign-extended -(log2nt+1), shift count for sshl

    mov         x7, x8                      //x7 also stores &src[2nt+1]

    ldrb        w12, [x9]                   //src[2nt-1]
    sxtw        x12, w12
    add         x14, x14, x12               //src[2nt+1] + src[2nt-1]
    add         x14, x14, x11               //src[2nt+1] + src[2nt-1] + 2

    cmp         x4, #4
    beq         dc_4

    mov         x10, x4                     //nt (bytes left to accumulate per side)

add_loop:
    ld1         {v0.8b}, [x6], #8           //load from src[nt]
    mov         x5, #0
    ld1         {v1.8b}, [x8], #8           //load from src[2nt+1]

    uaddlp      v2.4h, v0.8b

    mov         v6.s[0], w4
    mov         v6.s[1], w5                 //d6 = nt, the accumulator start value
    uaddlp      v3.4h, v1.8b

    ld1         {v0.8b}, [x6], #8           //load from src[nt] (extra load for 8)

    ld1         {v1.8b}, [x8], #8           //load from src[2nt+1] (extra load for 8)
    add         v4.4h, v2.4h, v3.4h

    uaddlp      v5.2s, v4.4h

    uadalp      v6.1d, v5.2s                //accumulate all inp into d6 (end for nt==8)

    subs        x10, x10, #8
    beq         epil_add_loop

core_loop_add:
    uaddlp      v2.4h, v0.8b
    subs        x10, x10, #8
    uaddlp      v3.4h, v1.8b

    add         v4.4h, v2.4h, v3.4h
    ld1         {v0.8b}, [x6], #8           //load from src[nt] (extra load for 16)

    uaddlp      v5.2s, v4.4h
    ld1         {v1.8b}, [x8], #8           //load from src[2nt+1] (extra load for 16)

    uadalp      v6.1d, v5.2s                //accumulate all inp into d6
    bne         core_loop_add

epil_add_loop:

    sshl        d18, d6, d7                 //dc_val = d6 >> (log2nt+1)
    cmp         x4, #32

    mov         v28.s[0], w14
    mov         v28.s[1], w5                //d28 = src[2nt+1] + 2 + src[2nt-1]
    mov         x20, #128
    csel        x6, x20, x6, eq             //x6 is recomputed in prologue_cpy_32 before any use

    dup         v16.8b, v18.b[0]            //dc_val in every byte
    shl         d25, d18, #1                //2*dc

    beq         prologue_cpy_32             //nt == 32: plain fill with dc_val

    //flags are still "ne" from the cmp above for the csel instructions below
    add         d27, d25, d28               //src[2nt+1] + 2 + src[2nt-1] + 2dc_val
    mov         x20, #0
    csel        x6, x20, x6, ne             //x6 = 0 (not read again on this path)

    ushr        v29.4h, v27.4h, #2          //final dst[0]'s value in d29 (lane 0)
    csel        x10, x4, x10, ne            //x10 = nt, counter for cols

    add         d23, d25, d18               //3*dc
    sub         x12, x3, x3, lsl #3         //-7*strd

    add         d23, d23, d17               //3*dc + 2
    add         x12, x12, #8                //offset after one 8x8 block (-7*strd + 8)

    dup         v24.8h, v23.h[0]            //3*dc + 2 (moved to all lanes)
    sub         x0, x3, x4                  //strd - nt

prologue_col:
    //0th column and 0-7 rows done here
    //x8 and x9 (2nt+1+col 2nt-1-row)

    mov         x8, x7                      //&src[2nt+1]

    add         x0, x0, #8                  //strd - nt + 8
    ld1         {v0.8b}, [x8], #8           //col 0::7 load (prol)
    sub         x9, x9, #7                  //&src[2nt-1-row], row = 7

    ld1         {v1.8b}, [x9]               //row 7::0 load (prol)
    sub         x9, x9, #8

    uxtl        v20.8h, v0.8b

    ld1         {v6.8b}, [x8]               //col 8::15 load (prol extra)
    add         v20.8h, v20.8h, v24.8h      //col 0::7 add 3dc+2 (prol)

    uxtl        v22.8h, v1.8b
    sqshrun     v2.8b, v20.8h, #2           //columns >> 2, narrowed (prol)

    uxtl        v26.8h, v6.8b
    add         v22.8h, v22.8h, v24.8h      //row 0::7 add 3dc+2 (prol)

    movi        d19, #0x00000000000000ff    //byte mask selecting lane 0
    sqshrun     v3.8b, v22.8h, #2           //rows >> 2, narrowed (prol)

    bsl         v19.8b, v29.8b, v2.8b       //first row with dst[0] in lane 0
    add         v26.8h, v26.8h, v24.8h      //col 8::15 add 3dc+2 (prol extra)

    rev64       v3.8b, v3.8b                //byte 0 now holds row 0's value

    st1         {v19.8b}, [x2], x3          //store row 0 (prol)
    sshr        d3, d3, #8                  //row 0 shift (prol) (first value to be ignored)

    movi        d20, #0x00000000000000ff    //byte mask row 1 (prol)

loop_again_col_row:
    //each row: lane 0 comes from d3 (filtered row value), lanes 1-7 are dc_val

    bsl         v20.8b, v3.8b, v16.8b       //row 1 (prol)

    movi        d21, #0x00000000000000ff    //byte mask row 2 (prol)
    sshr        d3, d3, #8                  //row 1 shift (prol)

    st1         {v20.8b}, [x2], x3          //store row 1 (prol)
    sqshrun     v4.8b, v26.8h, #2           //columns 8::15 >> 2, narrowed (prol extra)

    bsl         v21.8b, v3.8b, v16.8b       //row 2 (prol)

    movi        d20, #0x00000000000000ff    //byte mask row 3 (prol)
    sshr        d3, d3, #8                  //row 2 shift (prol)

    st1         {v21.8b}, [x2], x3          //store row 2 (prol)

    bsl         v20.8b, v3.8b, v16.8b       //row 3 (prol)

    movi        d21, #0x00000000000000ff    //byte mask row 4 (prol)
    sshr        d3, d3, #8                  //row 3 shift (prol)

    st1         {v20.8b}, [x2], x3          //store row 3 (prol)

    bsl         v21.8b, v3.8b, v16.8b       //row 4 (prol)

    movi        d20, #0x00000000000000ff    //byte mask row 5 (prol)
    sshr        d3, d3, #8                  //row 4 shift (prol)

    st1         {v21.8b}, [x2], x3          //store row 4 (prol)

    bsl         v20.8b, v3.8b, v16.8b       //row 5 (prol)

    movi        d21, #0x00000000000000ff    //byte mask row 6 (prol)
    sshr        d3, d3, #8                  //row 5 shift (prol)

    st1         {v20.8b}, [x2], x3          //store row 5 (prol)

    ld1         {v1.8b}, [x9]               //row 8::15 load (prol extra)

    bsl         v21.8b, v3.8b, v16.8b       //row 6 (prol)

    uxtl        v22.8h, v1.8b

    movi        d20, #0x00000000000000ff    //byte mask row 7 (prol)
    sshr        d3, d3, #8                  //row 6 shift (prol)

    st1         {v21.8b}, [x2], x3          //store row 6 (prol)

    bsl         v20.8b, v3.8b, v16.8b       //row 7 (prol)
    add         v22.8h, v22.8h, v24.8h      //row 8::15 add 3dc+2 (prol extra)

    sshr        d3, d3, #8                  //row 7 shift (prol)
    st1         {v20.8b}, [x2], x12         //store row 7 (prol), move up 7 rows and right 8 bytes

    subs        x10, x10, #8                //counter for cols

    beq         end_func                    //counter hit zero: block complete
    blt         copy_16                     //second pass done: fill the remaining 8x8 with dc_val

    movi        d20, #0x00000000000000ff    //byte mask row 8 (prol)
    sqshrun     v3.8b, v22.8h, #2           //rows 8::15 >> 2, narrowed (prol)

    rev64       v3.8b, v3.8b

    st1         {v4.8b}, [x2], x3           //store 2nd col (for 16x16): row 0, cols 8::15

    st1         {v16.8b}, [x2], x3          //rows 1::7, cols 8::15 = dc_val
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x0          //go to next row for 16 (advance by strd - nt + 8)

    bsl         v20.8b, v3.8b, v16.8b       //row 8 (prol)
    subs        x10, x10, #8

    st1         {v20.8b}, [x2], x3          //store row 8 (prol)
    sshr        d3, d3, #8                  //row 8 shift (prol)

    movi        d20, #0x00000000000000ff    //byte mask row 9 (prol)

    b           loop_again_col_row


copy_16:
    //eight rows of dc_val, 8 bytes each
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2]

    b           end_func

prologue_cpy_32:
    //nt == 32: four row pointers (x2, x5, x8, x10) each write 2 x 16 bytes of
    //dc_val per row, then advance by four rows
    mov         x9, #128                    //loop counter, decremented by 32 per step
    add         x5, x2, x3                  //row 1
    add         x8, x5, x3                  //row 2
    add         x10, x8, x3                 //row 3
    dup         v20.16b, v16.b[0]           //dc_val in all 16 bytes
    lsl         x6, x3, #2
    sub         x6, x6, #16                 //4*strd - 16: from second half of a row to 4 rows down

    st1         {v20.16b}, [x2], #16
    st1         {v20.16b}, [x5], #16
    st1         {v20.16b}, [x8], #16
    st1         {v20.16b}, [x10], #16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    sub         x9, x9, #32                 //32x32 prol/epil counter dec

kernel_copy:
    st1         {v20.16b}, [x2], #16
    st1         {v20.16b}, [x5], #16
    st1         {v20.16b}, [x8], #16
    st1         {v20.16b}, [x10], #16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    subs        x9, x9, #32

    st1         {v20.16b}, [x2], #16
    st1         {v20.16b}, [x5], #16
    st1         {v20.16b}, [x8], #16
    st1         {v20.16b}, [x10], #16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    bne         kernel_copy

epilogue_copy:
    st1         {v20.16b}, [x2], #16
    st1         {v20.16b}, [x5], #16
    st1         {v20.16b}, [x8], #16
    st1         {v20.16b}, [x10], #16

    st1         {v20.16b}, [x2]
    st1         {v20.16b}, [x5]
    st1         {v20.16b}, [x8]
    st1         {v20.16b}, [x10]

    b           end_func


dc_4:
    //nt == 4: the 8-byte loads fetch more than the 4 bytes needed per side;
    //the surplus partial sum is masked off before accumulation
    ld1         {v0.8b}, [x6], #8           //load from src[nt]
    ld1         {v1.8b}, [x8], #8           //load from src[2nt+1]

    uaddlp      v2.4h, v0.8b
    mov         x5, #0
    mov         v6.s[0], w4
    mov         v6.s[1], w5                 //d6 = nt, the accumulator start value
    uaddlp      v3.4h, v1.8b

    add         v4.4h, v2.4h, v3.4h

    uaddlp      v5.2s, v4.4h
    movi        d30, #0x00000000ffffffff

    and         v5.8b, v5.8b, v30.8b        //keep only the sum of the first 4 bytes of each side

    mov         v28.s[0], w14
    mov         v28.s[1], w5                //d28 = src[2nt+1] + 2 + src[2nt-1]
    add         d6, d6, d5                  //accumulate all inp into d6 (end for nt==4)

    sshl        d18, d6, d7                 //dc_val = d6 >> (log2nt+1)
    mov         x8, x7                      //&src[2nt+1]

    shl         d25, d18, #1                //2*dc
    sub         x9, x9, #3                  //&src[2nt-1-row], row = 3

    dup         v16.8b, v18.b[0]            //dc_val in every byte
    add         d27, d25, d28               //src[2nt+1] + 2 + src[2nt-1] + 2dc_val

    ushr        v29.4h, v27.4h, #2          //final dst[0]'s value in d29 (lane 0)
    sub         x12, x3, x3, lsl #2         //-3*strd
    add         d23, d25, d18               //3*dc

    add         d23, d23, d17               //3*dc + 2
    add         x12, x12, #4                //offset after one 4x4 block (-3*strd + 4)

    dup         v24.8h, v23.h[0]            //3*dc + 2 (moved to all lanes)
    sub         x0, x3, x4                  //strd - nt


    ld1         {v0.8b}, [x8]               //col 0::3 load (prol)
    ld1         {v1.8b}, [x9]               //row 3::0 load (prol)

    uxtl        v20.8h, v0.8b

    uxtl        v22.8h, v1.8b
    add         v20.8h, v20.8h, v24.8h      //col 0::3 add 3dc+2 (prol)

    add         v22.8h, v22.8h, v24.8h      //row 0::3 add 3dc+2 (prol)

    movi        d19, #0x00000000000000ff    //byte mask selecting lane 0
    sqshrun     v2.8b, v20.8h, #2           //columns >> 2, narrowed (prol)

    movi        d20, #0x00000000000000ff    //byte mask row 1 (prol)
    sqshrun     v3.8b, v22.8h, #2           //rows >> 2, narrowed (prol)


    bsl         v19.8b, v29.8b, v2.8b       //first row with dst[0] in lane 0

    rev64       v3.8b, v3.8b                //rows 0::3 now sit in bytes 4::7

    st1         {v19.s}[0], [x2], x3        //store row 0 (prol)
    sshr        d3, d3, #40                 //row 0 shift (prol): skip 4 surplus bytes and row 0

    movi        d21, #0x00000000000000ff    //byte mask row 2 (prol)

    bsl         v20.8b, v3.8b, v16.8b       //row 1 (prol)
    sshr        d3, d3, #8                  //row 1 shift (prol)

    st1         {v20.s}[0], [x2], x3        //store row 1 (prol)

    bsl         v21.8b, v3.8b, v16.8b       //row 2 (prol)

    movi        d20, #0x00000000000000ff    //byte mask row 3 (prol)

    sshr        d3, d3, #8                  //row 2 shift (prol)
    st1         {v21.s}[0], [x2], x3        //store row 2 (prol)

    bsl         v20.8b, v3.8b, v16.8b       //row 3 (prol)
    st1         {v20.s}[0], [x2]            //store row 3 (prol)

epilogue_end:
end_func:
    ldp         x19, x20, [sp], #16         //restore the callee-saved pair pushed at entry

    ret