@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_filters_planar.s
@*
@* @brief
@*  contains function definitions for intra prediction planar filtering.
@*  functions are coded in arm neon assembly (gnu assembler syntax:
@*  .text / .globl / .type directives, '@' comments)
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_planar_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    luma intraprediction filter for planar input
@*
@* @par description:
@*    for every (row, col) of the nt x nt block the routine accumulates
@*        (row+1)    * src[nt-1]
@*      + (col+1)    * src[3nt+1]
@*      + (nt-1-row) * src[2nt+1+col]
@*      + (nt-1-col) * src[2nt-1-row]
@*      + nt
@*    in 16 bits, shifts it right by (32 - clz(nt)) bits, narrows it to 8 bits
@*    and stores it to the destination. nt == 4 is handled by a dedicated
@*    row loop (tf_sz_4); every other size goes through a software-pipelined
@*    path that produces 8x8 tiles (prolog / kernel_plnr / epilog).
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source (reference samples)
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride (not read by this routine: r1 is overwritten)
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the planar coefficients (not read by this routine)
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering (not read by this routine)
@*
@* @returns
@*
@* @remarks
@*  r4-r12, r14 and d8-d15 are saved on entry and restored on exit; d8-d15
@*  (q4-q7) are used as scratch by the routine and are callee-saved under
@*  the arm procedure call standard, so they have to be preserved here.
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
@                                  word32 src_strd,
@                                  uword8* pu1_dst,
@                                  word32 dst_strd,
@                                  word32 nt,
@                                  word32 mode,
@                                  word32 pi1_coeff)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #104 (after the r4-r12, r14 and d8-d15 pushes)
@   nt
@   mode
@   pi1_coeff

.text
.align 4




.globl ihevc_intra_pred_luma_planar_a9q
.extern gau1_ihevc_planar_factor
.extern gau1_ihevc_planar_factor_1

@ offset of the first stack argument (nt) once the prologue has run:
@ 10 core registers (40 bytes) + d8-d15 (64 bytes)
.equ nt_offset, 104

@ pc-relative offsets of the two external tables; ulbl1/ulbl2 mark the
@ "add rX, rX, pc" instructions, and pc reads as (instruction address + 8)
gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl1 - 8

gau1_ihevc_planar_factor_1_addr:
.long gau1_ihevc_planar_factor_1 - ulbl2 - 8


.type ihevc_intra_pred_luma_planar_a9q, %function

ihevc_intra_pred_luma_planar_a9q:

    stmfd       sp!, {r4-r12, r14}          @save the callee-saved core registers and lr
    vpush       {d8-d15}                    @save the callee-saved neon registers (q4-q7 are used below)

    ldr         r4,[sp,#nt_offset]          @loads nt
    ldr         r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
ulbl1:
    add         r11,r11,pc

    clz         r5, r4
    rsb         r5, r5, #32                 @32 - clz(nt): bit length of nt (log2(nt) + 1 for a power of two)
    vdup.16     q7, r5
    vneg.s16    q7, q7                      @shr value (so vneg)
    vdup.8      d2, r4                      @nt
    vdup.s16    q8, r4                      @nt

    sub         r6, r4, #1                  @nt-1
    add         r6, r6, r0
    ldr         r7, [r6]                    @word load; only the low byte is replicated below
    vdup.s8     d0, r7                      @src[nt-1]

    add         r6, r4, r4,lsl #1           @3nt
    add         r6, r6, #1                  @3nt + 1
    add         r6, r6, r0
    ldr         r7, [r6]                    @word load; only the low byte is replicated below
    vdup.s8     d1, r7                      @src[3nt+1]

    add         r6, r4, r4                  @2nt
    add         r14, r6, #1                 @2nt+1
    sub         r6, r6, #1                  @2nt-1
    add         r6, r6, r0                  @&src[2nt-1]
    add         r14, r14, r0                @&src[2nt+1]

    mov         r8, #1                      @row+1 (row is first 0)
    sub         r9, r4, r8                  @nt-1-row (row is first 0)

    vdup.s8     d5, r8                      @row + 1
    vdup.s8     d6, r9                      @nt - 1 - row
    vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row

    add         r12, r11, #1                @coeffs (to be reloaded after every row)
    mov         r1, r4                      @nt (row counter) (dec after every row)
    mov         r5, r2                      @dst (to be reloaded after every row and inc by dst_strd)
    mov         r10, #8                     @increment for the coeffs
    mov         r0, r14                     @&src[2nt+1] (to be reloaded after every row)

    cmp         r4, #4
    beq         tf_sz_4

@@ ========== ***************** =====================
prolog:
tf_sz_8_16_32:

    mov         r7, r4                      @column counter (set to no of cols)
    mov         r9, r4, lsr #3              @divide nt by 8
    mul         r7, r7, r9                  @multiply width * height
    ldr         r5, gau1_ihevc_planar_factor_1_addr @loads table of coeffs
ulbl2:
    add         r5,r5,pc
    sub         r6, r6, #7                  @&src[2nt-8]: 8 bytes ending at src[2nt-1]
    mov         r8, r2
    lsl         r9, r3, #3                  @8*stride
    rsb         r9, r9, #8                  @8-8*stride
    mov         r10, r4                     @nt
    sub         r10, r10, #8                @nt - 8

col_loop_8_16_32:

    vld1.s8     d8, [r12]                   @(1-8)load 8 coeffs [col+1]
    vdup.16     q6, r4                      @(1)
    vld1.s8     d4, [r6]                    @(1-8)src[2nt-1-row]
    vsub.s8     d9, d2, d8                  @(1-8)[nt-1-col]


    vmlal.u8    q6, d5, d0                  @(1)(row+1) * src[nt-1]

    vld1.s8     d3, [r14]                   @(1-8)load 8 src[2nt+1+col]
    vmlal.u8    q6, d8, d1                  @(1)(col+1) * src[3nt+1]

    vdup.s8     d20, d4[7]                  @(1)
    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row) * src[2nt+1+col]

    vdup.s8     d21, d4[6]                  @(2)
    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col) * src[2nt-1-row]

    vdup.16     q15, r4                     @(2)
    vadd.s8     d5, d5, d7                  @(1)

    vsub.s8     d6, d6, d7                  @(1)

    vdup.s8     d22, d4[5]                  @(3)
    vmlal.u8    q15, d5, d0                 @(2)

    vdup.16     q14, r4                     @(3)
    vmlal.u8    q15, d8, d1                 @(2)

    vmlal.u8    q15, d6, d3                 @(2)
    vmlal.u8    q15, d9, d21                @(2)

    vshl.s16    q6, q6, q7                  @(1)shr

    vadd.s8     d5, d5, d7                  @(2)
    vsub.s8     d6, d6, d7                  @(2)

    vmovn.i16   d12, q6                     @(1)
    vmlal.u8    q14, d5, d0                 @(3)

    vdup.8      d23, d4[4]                  @(4)
    vmlal.u8    q14, d8, d1                 @(3)

    vdup.16     q5, r4                      @(4)
    vmlal.u8    q14, d6, d3                 @(3)

    vst1.s8     d12, [r2], r3               @(1)str 8 values
    vmlal.u8    q14, d9, d22                @(3)

    vshl.s16    q15, q15, q7                @(2)shr

    vadd.s8     d5, d5, d7                  @(3)
    vsub.s8     d6, d6, d7                  @(3)

    vmovn.i16   d30, q15                    @(2)
    vmlal.u8    q5, d5, d0                  @(4)

    vdup.8      d20, d4[3]                  @(5)
    vmlal.u8    q5, d8, d1                  @(4)

    vdup.16     q8, r4                      @(5)
    vmlal.u8    q5, d6, d3                  @(4)

    vst1.s8     d30, [r2], r3               @(2)str 8 values
    vmlal.u8    q5, d9, d23                 @(4)

    vshl.s16    q14, q14, q7                @(3)shr

    vadd.s8     d5, d5, d7                  @(4)
    vsub.s8     d6, d6, d7                  @(4)

    vmovn.i16   d28, q14                    @(3)
    vmlal.u8    q8, d5, d0                  @(5)

    vdup.8      d21, d4[2]                  @(6)
    vmlal.u8    q8, d8, d1                  @(5)

    vdup.16     q9, r4                      @(6)
    vmlal.u8    q8, d6, d3                  @(5)

    vst1.s8     d28, [r2], r3               @(3)str 8 values
    vmlal.u8    q8, d9, d20                 @(5)

    vshl.s16    q5, q5, q7                  @(4)shr
    vadd.s8     d5, d5, d7                  @(5)
    vsub.s8     d6, d6, d7                  @(5)

    vmovn.i16   d10, q5                     @(4)
    vmlal.u8    q9, d5, d0                  @(6)

    vdup.8      d22, d4[1]                  @(7)
    vmlal.u8    q9, d8, d1                  @(6)

    vdup.16     q13, r4                     @(7)
    vmlal.u8    q9, d6, d3                  @(6)

    vst1.s8     d10, [r2], r3               @(4)str 8 values
    vmlal.u8    q9, d9, d21                 @(6)

    vshl.s16    q8, q8, q7                  @(5)shr

    vadd.s8     d5, d5, d7                  @(6)
    vsub.s8     d6, d6, d7                  @(6)

    vmovn.i16   d16, q8                     @(5)
    vmlal.u8    q13, d5, d0                 @(7)

    vdup.8      d23, d4[0]                  @(8)
    vmlal.u8    q13, d8, d1                 @(7)

    vdup.16     q12, r4                     @(8)
    vmlal.u8    q13, d6, d3                 @(7)

    vst1.s8     d16, [r2], r3               @(5)str 8 values
    vmlal.u8    q13, d9, d22                @(7)

    vshl.s16    q9, q9, q7                  @(6)shr

    vadd.s8     d5, d5, d7                  @(7)
    vsub.s8     d6, d6, d7                  @(7)

    vmovn.i16   d18, q9                     @(6)
    vmlal.u8    q12, d5, d0                 @(8)


    vmlal.u8    q12, d8, d1                 @(8)

    vmlal.u8    q12, d6, d3                 @(8)

    vst1.s8     d18, [r2], r3               @(6)str 8 values
    vmlal.u8    q12, d9, d23                @(8)

    vshl.s16    q13, q13, q7                @(7)shr

    subs        r7, r7, #8                  @one 8x8 tile accounted for

    beq         epilog                      @only one tile in total: flush rows 7 and 8 and leave

    subs        r1, r1, #8                  @row counter
    addgt       r12, r12, #8                @col inc
    addgt       r14, r14, #8                @also for col inc
    movle       r1, r4                      @nt reloaded (refresh the value)
    addle       r12, r11, #1                @r12 reset

    movle       r14, r0                     @r14 reset
    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]

    suble       r6, r6, #8                  @for next set of rows
    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]

    addle       r5, r5, #8
    vdup.16     q6, r4                      @(1n)(1)

    vld1.s8     d5, [r5]                    @(row+1) values for the next tile, from the second table

    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]

    vdup.s8     d20, d4[7]                  @(1n)(1)
    vsub.s8     d6, d2, d5                  @(nt-1-row) values for the next tile

    beq         epilog                      @flags still come from "subs r1, r1, #8" above

kernel_plnr:

    cmp         r1, #0                      @ (cond loop)
    vshl.s16    q12, q12, q7                @(8)shr

    vmovn.i16   d26, q13                    @(7)
    vmlal.u8    q6, d5, d0                  @(1)(row+1) * src[nt-1]

    vmovn.i16   d24, q12                    @(8)
    vmlal.u8    q6, d8, d1                  @(1)(col+1) * src[3nt+1]

    vdup.s8     d21, d4[6]                  @(2)
    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row) * src[2nt+1+col]

    vdup.16     q15, r4                     @(2)
    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col) * src[2nt-1-row]

    vst1.s8     d26, [r2], r3               @(7)str 8 values
    vadd.s8     d5, d5, d7                  @(1)

    vst1.s8     d24, [r2], r3               @(8)str 8 values
    vsub.s8     d6, d6, d7                  @(1)

    addgt       r2, r2, r9                  @since more cols to fill, dst + 8 - 8*strd (cond loop)
    vmlal.u8    q15, d5, d0                 @(2)

    suble       r2, r2, r10                 @else go to next set of rows, dst - (nt-8) (cond loop)
    vmlal.u8    q15, d8, d1                 @(2)

    vdup.s8     d22, d4[5]                  @(3)
    vmlal.u8    q15, d6, d3                 @(2)

    vdup.16     q14, r4                     @(3)
    vmlal.u8    q15, d9, d21                @(2)

    vshl.s16    q6, q6, q7                  @(1)shr

    vadd.s8     d5, d5, d7                  @(2)
    movle       r1, r4                      @nt reloaded (refresh the value) (cond loop)

    vsub.s8     d6, d6, d7                  @(2)
    subs        r1, r1, #8                  @row counter (loop)

    vmovn.i16   d12, q6                     @(1)
    vmlal.u8    q14, d5, d0                 @(3)

    vdup.8      d23, d4[4]                  @(4)
    vmlal.u8    q14, d8, d1                 @(3)

    vdup.16     q5, r4                      @(4)
    vmlal.u8    q14, d6, d3                 @(3)

    vst1.s8     d12, [r2], r3               @(1)str 8 values
    vmlal.u8    q14, d9, d22                @(3)

    vshl.s16    q15, q15, q7                @(2)shr

    vadd.s8     d5, d5, d7                  @(3)

    vsub.s8     d6, d6, d7                  @(3)

    vmovn.i16   d30, q15                    @(2)
    vmlal.u8    q5, d5, d0                  @(4)

    vdup.8      d20, d4[3]                  @(5)
    vmlal.u8    q5, d8, d1                  @(4)

    vdup.16     q8, r4                      @(5)
    vmlal.u8    q5, d6, d3                  @(4)

    vst1.s8     d30, [r2], r3               @(2)str 8 values
    vmlal.u8    q5, d9, d23                 @(4)

    vshl.s16    q14, q14, q7                @(3)shr

    vadd.s8     d5, d5, d7                  @(4)

    vsub.s8     d6, d6, d7                  @(4)

    vmovn.i16   d28, q14                    @(3)
    vmlal.u8    q8, d5, d0                  @(5)

    vdup.8      d21, d4[2]                  @(6)
    vmlal.u8    q8, d8, d1                  @(5)

    vdup.16     q9, r4                      @(6)
    vmlal.u8    q8, d6, d3                  @(5)

    vst1.s8     d28, [r2], r3               @(3)str 8 values
    vmlal.u8    q8, d9, d20                 @(5)

    addle       r12, r11, #1                @r12 reset (cond loop)
    vshl.s16    q5, q5, q7                  @(4)shr

    addgt       r12, r12, #8                @col inc (cond loop)
    vadd.s8     d5, d5, d7                  @(5)

    addgt       r14, r14, #8                @also for col inc (cond loop)
    vsub.s8     d6, d6, d7                  @(5)

    vmovn.i16   d10, q5                     @(4)
    vmlal.u8    q9, d5, d0                  @(6)

    vdup.8      d22, d4[1]                  @(7)
    vmlal.u8    q9, d8, d1                  @(6)

    vdup.16     q13, r4                     @(7)
    vmlal.u8    q9, d6, d3                  @(6)

    vst1.s8     d10, [r2], r3               @(4)str 8 values
    vmlal.u8    q9, d9, d21                 @(6)

    movle       r14, r0                     @r14 reset (cond loop)
    vshl.s16    q8, q8, q7                  @(5)shr

    suble       r6, r6, #8                  @for next set of rows (cond loop)
    vadd.s8     d5, d5, d7                  @(6)

    addle       r5, r5, #8                  @ (cond loop)
    vsub.s8     d6, d6, d7                  @(6)

    vmovn.i16   d16, q8                     @(5)
    vmlal.u8    q13, d5, d0                 @(7)

    vdup.8      d23, d4[0]                  @(8)
    vmlal.u8    q13, d8, d1                 @(7)

    vdup.16     q12, r4                     @(8)
    vmlal.u8    q13, d6, d3                 @(7)

    vst1.s8     d16, [r2], r3               @(5)str 8 values
    vmlal.u8    q13, d9, d22                @(7)

    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    vshl.s16    q9, q9, q7                  @(6)shr

    vadd.s8     d5, d5, d7                  @(7)

    vsub.s8     d6, d6, d7                  @(7)

    vmovn.i16   d18, q9                     @(6)
    vmlal.u8    q12, d5, d0                 @(8)

    vld1.s8     d5, [r5]                    @(row+1 value)
    vmlal.u8    q12, d8, d1                 @(8)

    vdup.s8     d20, d4[7]                  @(1n)(1)
    vmlal.u8    q12, d6, d3                 @(8)

    vst1.s8     d18, [r2], r3               @(6)str 8 values
    vmlal.u8    q12, d9, d23                @(8)

    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]
    vsub.s8     d6, d2, d5                  @(nt-1-row) value

    subs        r7, r7, #8                  @col counter

    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]
    vshl.s16    q13, q13, q7                @(7)shr

    vdup.16     q6, r4                      @(1n)(1)
    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]

    bne         kernel_plnr                 @neon ops above leave the flags of "subs r7" intact

epilog:

    vmovn.i16   d26, q13                    @(7)
    vst1.s8     d26, [r2], r3               @(7)str 8 values

    vshl.s16    q12, q12, q7                @(8)shr
    vmovn.i16   d24, q12                    @(8)
    vst1.s8     d24, [r2], r3               @(8)str 8 values

@@ ========== ***************** =====================

    beq         end_loop                    @every branch into epilog arrives with z set, so this skips tf_sz_4

tf_sz_4:
    vld1.s8     d10, [r14]                  @load src[2nt+1+col]
    vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
loop_sz_4:
    mov         r10, #4                     @reduce inc to #4 for 4x4
    ldr         r7, [r6], #-1               @src[2nt-1-row] (dec to take into account row)
    vdup.s8     d4, r7                      @src[2nt-1-row]

    vsub.s8     d9, d2, d8                  @[nt-1-col]

    vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
    vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
    vmlal.u8    q6, d9, d4                  @(nt-1-col) *   src[2nt-1-row]
@   vadd.i16    q6, q6, q8                  @add (nt)
@   vshl.s16    q6, q6, q7                  @shr
@   vmovn.i16   d12, q6
    vrshrn.s16  d12,q6,#3                   @rounding shift right by 3 and narrow (replaces the three lines above)
    vst1.s32    {d12[0]}, [r2], r3          @store the 4 result bytes of this row

    vadd.s8     d5, d5, d7                  @row++      [(row+1)++]
    vsub.s8     d6, d6, d7                  @[nt-1-row]--
    subs        r1, r1, #1

    bne         loop_sz_4

end_loop:
    vpop        {d8-d15}                    @restore the callee-saved neon registers
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp (loading pc returns to the caller)