@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/*******************************************************************************
@* @file
@*  ihevc_deblk_luma_vert.s
@*
@* @brief
@*  contains function definitions for luma deblocking across a horizontal edge.
@*  functions are coded in neon assembly using gnu assembler (gas) syntax
@*
@* @author
@*  anand s
@*
@* @par list of functions:
@*  - ihevc_deblk_luma_horz_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************/

.text
.align 4

.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table
.globl ihevc_deblk_luma_horz_a9q

@ Link-time constants used to address the two lookup tables pc-relatively.
@ Each word is (table - label - 8): the `add rX,rX,pc` placed at ulbl1/ulbl2
@ reads pc as (label address + 8) in ARM state, so the sum is the table address.
gai4_ihevc_tc_table_addr:
.long gai4_ihevc_tc_table - ulbl1 - 8

gai4_ihevc_beta_table_addr:
.long gai4_ihevc_beta_table - ulbl2 - 8

@ Offsets of the stack-passed arguments from sp once the prologue has run.
@ The prologue pushes r3-r12,lr (11 words = 44 bytes) and d8-d15 (64 bytes),
@ so the first stack argument sits at sp + 108.
.equ qp_q_offset,               108
.equ beta_offset_div2_offset,   112
.equ tc_offset_div2_offset,     116
.equ filter_flag_p_offset,      120
.equ filter_flag_q_offset,      124

.type ihevc_deblk_luma_horz_a9q, %function

@/**
@*******************************************************************************
@*
@* @brief
@*  deblocks a 4-pixel-wide segment of a horizontal luma edge
@*
@* @par description:
@*  rows are addressed relative to pu1_src with stride src_strd. below,
@*  pN is the row at pu1_src - (N + 1) * src_strd and qN is the row at
@*  pu1_src + N * src_strd (p0 is just above the edge, q0 just below it).
@*
@*    qp_luma   = (quant_param_p + quant_param_q + 1) >> 1
@*    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)
@*    tc_indx   = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)
@*    beta      = gai4_ihevc_beta_table[beta_indx]
@*    tc        = gai4_ihevc_tc_table[tc_indx]
@*
@*  nothing is written when tc == 0 or when d >= beta, where d is built from
@*  columns 0 and 3 of the segment. otherwise either the strong filter (up to
@*  three rows on each side) or the weak filter (up to two rows on each side)
@*  is applied to all four columns.
@*
@* @param[in,out] r0        pu1_src            first pixel of row q0
@* @param[in]     r1        src_strd           distance in bytes between rows
@* @param[in]     r2        bs                 only (bs >> 1) is used
@* @param[in]     r3        quant_param_p
@* @param[in]     [sp,#0]   quant_param_q      (r3 and this are only summed, so
@*                                              their order is taken from the
@*                                              original comments - confirm)
@* @param[in]     [sp,#4]   beta_offset_div2
@* @param[in]     [sp,#8]   tc_offset_div2
@* @param[in]     [sp,#12]  filter_flag_p      1 enables filtering of the p side
@* @param[in]     [sp,#16]  filter_flag_q      1 enables filtering of the q side
@*
@* @returns
@*  nothing; r3-r12 and d8-d15 are restored on exit. r0-r2, d0-d7, d16-d31 and
@*  the flags are clobbered.
@*
@* @remarks
@*  - each row is read with a 32-bit ldr and written with vst1.32, so the code
@*    presumably relies on pu1_src/src_strd making those accesses legal on the
@*    target (aligned, or unaligned access enabled) - confirm against callers.
@*  - the table entries are read as 32-bit words (index scaled by 4).
@*  - must be assembled as arm (not thumb) code: the pc-relative table
@*    addressing assumes pc reads as the instruction address + 8.
@*
@*******************************************************************************
@*/
ihevc_deblk_luma_horz_a9q:
    stmfd       sp!, {r3-r12,lr}
    vpush       {d8-d15}                    @ q4-q7 are used below; d8-d15 are callee-saved in the aapcs
    ldr         r4,[sp,#qp_q_offset]        @ quant_param_q
    ldr         r5,[sp,#beta_offset_div2_offset]

    add         r3,r3,r4
    add         r3,r3,#1
    ldr         r6,[sp,#tc_offset_div2_offset]
    asr         r3,r3,#1                    @ r3 = qp_luma
    add         r7,r3,r5,lsl #1             @ r7 = qp_luma + (beta_offset_div2 << 1)
    add         r3,r3,r6,lsl #1             @ r3 = qp_luma + (tc_offset_div2 << 1)
    cmp         r7,#0x33
    movgt       r7,#0x33
    bgt         l1.1532
    cmp         r7,#0x0
    movlt       r7,#0x0                     @ r7 has the beta_index value (clipped to 0..51)
l1.1532:
    asr         r2,r2,#1                    @ r2 = bs >> 1

    add         r3,r3,r2,lsl #1
    cmp         r3,#0x35
    movgt       r3,#0x35
    bgt         l1.1564
    cmp         r3,#0x0
    movlt       r3,#0x0                     @ r3 has the tc_index value (clipped to 0..53)

    @ qp_luma   = (quant_param_p + quant_param_q + 1) >> 1
    @ beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)
    @ tc_indx   = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)

l1.1564:
    ldr         r2,gai4_ihevc_beta_table_addr
ulbl2:
    add         r2,r2,pc                    @ r2 = &gai4_ihevc_beta_table[0]
    ldr         r4,gai4_ihevc_tc_table_addr
ulbl1:
    add         r4,r4,pc                    @ r4 = &gai4_ihevc_tc_table[0]

    ldr         r5,[r2,r7,lsl #2]           @ beta
    ldr         r6,[r4,r3,lsl #2]           @ tc

    cmp         r6,#0
    beq         l1.2404                     @ tc == 0 : nothing to filter

    @ Load 4 pixels of rows p2..q2 and replicate each row in a d register.
    @ The low byte of each word (column 0) is kept in a core register for
    @ the dp0/dq0 computation.
    vmov.i16    d0,#0x2                     @ d0 = 2 in every 16-bit lane
    lsl         r7,r6,#1                    @ r7 = 2 * tc
    add         r14,r1,r1,lsl #1            @ r14 = 3 * src_strd
    ldr         r8,[r0,-r14]                @ row -3 (p2)
    vdup.8      d1,r7                       @ d1 = 2 * tc in every byte lane
    ldr         r10,[r0,-r1,lsl #1]         @ row -2 (p1)
    vdup.32     d23,r8                      @ d23 = p2
    ldr         r11,[r0,-r1]                @ row -1 (p0)
    vdup.32     d24,r10                     @ d24 = p1
    and         r8,#0xff
    ldr         r12,[r0,#0]                 @ row 0 (q0)
    vdup.32     d25, r11                    @ d25 = p0
    and         r10,#0xff
    ldr         r9,[r0,r1]                  @ row 1 (q1)
    vdup.32     d26,r12                     @ d26 = q0
    and         r11,#0xff
    ldr         r2,[r0,r1,lsl #1]           @ row 2 (q2)
    vdup.32     d27,r9                      @ d27 = q1
    and         r12,#0xff
    vdup.32     d28,r2                      @ d28 = q2
    and         r9,#0xff
    and         r2,#0xff

    add         r12,r12,r2
    subs        r9,r12,r9,lsl #1
    rsbmi       r9,r9,#0                    @ r9 = dq0
    @ dq0 = abs(q2 - 2 * q1 + q0) at column 0

    add         r8,r8,r11
    subs        r8,r8,r10,lsl #1
    rsbmi       r8,r8,#0                    @ r8 = dp0
    @ dp0 = abs(p2 - 2 * p1 + p0) at column 0

    add         r3,r1,r1,lsl #1             @ r3 = 3 * src_strd
    add         r14,r0,#3                   @ r14 -> column 3 of row q0

    ldrb        r2,[r14,-r3]                @ p2 at column 3
    ldrb        r10,[r14,-r1,lsl #1]        @ p1 at column 3
    ldrb        r11,[r14,-r1]               @ p0 at column 3
    ldrb        r12,[r14,#0]                @ q0 at column 3
    ldrb        r3,[r14,r1]                 @ q1 at column 3
    ldrb        r4,[r14,r1,lsl #1]          @ q2 at column 3

    add         r12,r12,r4
    subs        r12,r12,r3,lsl #1
    rsbmi       r12,r12,#0                  @ r12 = dq3
    @ dq3 = abs(q2 - 2 * q1 + q0) at column 3

    add         r2,r2,r11
    subs        r11,r2,r10,lsl #1
    rsbmi       r11,r11,#0                  @ r11 = dp3
    @ dp3 = abs(p2 - 2 * p1 + p0) at column 3

    add         r3,r8,r9                    @ r3 = d0 = dp0 + dq0
    add         r4,r11,r12                  @ r4 = d3 = dp3 + dq3

    add         r14,r8,r11                  @ r14 = dp = dp0 + dp3
    add         r12,r12,r9                  @ r12 = dq = dq0 + dq3

    add         r11, r3, r4                 @ r11 = d = d0 + d3

    cmp         r11,r5
    bge         l1.2404                     @ d >= beta : edge is left untouched

    @ if(d < beta)
    @
    @ live from here on : r0 (src), r1 (stride), r3 (d0), r4 (d3), r5 (beta),
    @                     r6 (tc), r12 (dq), r14 (dp)
    @ scratch           : r2, r7, r8, r9, r10
    @
    @ The strong-filter decision is evaluated on the core registers, first
    @ for column 0 and then for column 3:
    @   2 * dN < (beta >> 2)
    @   && abs(q3 - q0) + abs(p0 - p3) < (beta >> 3)
    @   && abs(q0 - p0) < ((5 * tc + 1) >> 1)
    @ Any failing test branches to l1.1840 (weak filter). The neon
    @ instructions interleaved with the tests speculatively compute the
    @ strong-filter outputs, each clamped to (input +/- 2 * tc):
    @   q0' = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3      -> d4
    @   q1' = (p0 + q0 + q1 + q2 + 2) >> 2                 -> d5
    @   q2' = (p0 + q0 + q1 + 3*q2 + 2*q3 + 4) >> 3        -> d3
    @   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3      -> d2
    @ p1' and p2' are finished after end_dep_deq_decision_horz.

    asr         r10,r5,#2                   @ r10 = beta >> 2
    vqadd.u8    d30,d26,d1                  @ d30 = q0 + 2tc
    cmp         r10,r3,lsl #1
    vqsub.u8    d31,d26,d1                  @ d31 = q0 - 2tc
    ble         l1.1840
    add         r10,r1,r1,lsl #1            @ r10 = 3 * src_strd
    vaddl.u8    q3,d25,d26                  @ q3 = p0 + q0
    ldr         r2,[r0,-r1,lsl #2]          @ row -4 (p3)
    ldrb        r7,[r0,-r1]                 @ p0 at column 0
    vdup.32     d22,r2                      @ d22 = p3
    vaddw.u8    q4,q3,d27                   @ q4 = p0 + q0 + q1
    ldrb        r3,[r0,#0]                  @ r3 = q0 at column 0 (d0 is no longer needed)
    vqadd.u8    d16,d27,d1                  @ d16 = q1 + 2tc
    and         r2,#0xff                    @ r2 = p3 at column 0
    vmul.i16    q6,q4,d0[0]                 @ q6 = 2 * (p0 + q0 + q1)
    ldr         r8,[r0,r10]                 @ row 3 (q3)
    vaddl.u8    q5,d24,d28                  @ q5 = p1 + q2
    subs        r2,r2,r7
    vqsub.u8    d17,d27,d1                  @ d17 = q1 - 2tc
    vdup.32     d29,r8                      @ d29 = q3
    and         r8,#0xff                    @ r8 = q3 at column 0
    vadd.i16    q6,q6,q5
    rsbmi       r2,r2,#0                    @ r2 = abs(p3 - p0)
    vrshrn.i16  d20,q6,#3
    subs        r8,r8,r3
    rsbmi       r8,r8,#0                    @ r8 = abs(q3 - q0)
    vmin.u8     d18,d20,d30
    add         r8,r8,r2

    cmp         r8,r5,asr #3
    bge         l1.1840
    vaddw.u8    q7,q4,d28                   @ q7 = p0 + q0 + q1 + q2
    subs        r7,r3,r7
    vmax.u8     d4,d18,d31                  @ d4 = q0'
    rsbmi       r7,r7,#0                    @ r7 = abs(q0 - p0)
    vqadd.u8    d30,d28,d1                  @ d30 = q2 + 2tc
    mov         r10,#5
    vrshrn.i16  d21,q7,#2
    mul         r10,r10,r6
    vqsub.u8    d31,d28,d1                  @ d31 = q2 - 2tc
    add         r10,#1
    cmp         r7,r10,asr #1               @ against (5 * tc + 1) >> 1
    vmin.u8     d18,d21,d16
    bge         l1.1840

    @ column 0 passed; repeat the three tests for column 3 using d3

    vmax.u8     d5,d18,d17                  @ d5 = q1'
    asr         r10,r5,#2
    vaddl.u8    q8,d29,d28                  @ q8 = q3 + q2
    cmp         r10,r4,lsl #1
    ble         l1.1840

    add         r10,r1,r1,lsl #1            @ r10 = 3 * src_strd
    vmul.i16    q8,q8,d0[0]                 @ q8 = 2 * (q3 + q2)
    add         r4,r0,#3                    @ r4 -> column 3 of row q0 (d3 is no longer needed)

    ldrb        r2,[r4,-r1,lsl #2]          @ p3 at column 3
    vadd.i16    q8,q8,q7
    ldrb        r7,[r4,-r1]                 @ p0 at column 3
    vrshrn.i16  d19,q8,#3
    ldrb        r3,[r4,#0]                  @ q0 at column 3
    ldrb        r8,[r4,r10]                 @ q3 at column 3

    subs        r8,r8,r3
    vmin.u8     d18,d19,d30
    rsbmi       r8,r8,#0                    @ r8 = abs(q3 - q0)
    vaddl.u8    q3,d25,d24                  @ q3 = p0 + p1
    subs        r2,r2,r7
    vmax.u8     d3,d18,d31                  @ d3 = q2'
    rsbmi       r2,r2,#0                    @ r2 = abs(p3 - p0)
    vaddw.u8    q4,q3,d26                   @ q4 = p0 + p1 + q0
    add         r8,r8,r2
    vqadd.u8    d30,d25,d1                  @ d30 = p0 + 2tc
    cmp         r8,r5,asr #3
    vqsub.u8    d31,d25,d1                  @ d31 = p0 - 2tc
    bge         l1.1840
    vmul.i16    q6,q4,d0[0]                 @ q6 = 2 * (p0 + p1 + q0)
    subs        r7,r3,r7
    vqadd.u8    d16,d24,d1                  @ d16 = p1 + 2tc
    rsbmi       r7,r7,#0                    @ r7 = abs(q0 - p0)
    vaddl.u8    q5,d23,d27                  @ q5 = p2 + q1
    mov         r10,#5
    vqsub.u8    d17,d24,d1                  @ d17 = p1 - 2tc
    mul         r10,r10,r6
    vadd.i16    q6,q6,q5
    add         r10,#1
    vrshrn.i16  d20,q6,#3
    cmp         r7,r10,asr #1               @ against (5 * tc + 1) >> 1
    vaddw.u8    q7,q4,d23                   @ q7 = p2 + p1 + p0 + q0
    bge         l1.1840
    vmin.u8     d18,d20,d30
    mov         r2,#2                       @ de = 2 : strong filter selected
    vqadd.u8    d30,d23,d1                  @ d30 = p2 + 2tc
    ldr         r4,[sp,#filter_flag_p_offset]   @ loading the filter_flag_p
    vmax.u8     d2,d18,d31                  @ d2 = p0'
    ldr         r5,[sp,#filter_flag_q_offset]   @ loading the filter_flag_q
    vrshrn.i16  d21,q7,#2
    b           end_dep_deq_decision_horz

    @ Weak-filter set-up. On entry:
    @   r5  = beta, r6 = tc, r14 = dp, r12 = dq, r0 = source, r1 = stride
    @ Produces de (r2) = 1 and the side flags
    @   dep (r9)  = dp < ((beta + (beta >> 1)) >> 3)
    @   deq (r10) = dq < ((beta + (beta >> 1)) >> 3)
    @ Both are forced to 0 when tc == 1, and a flag is forced to 0 without
    @ testing when the matching side is not selected by the filter flags.
l1.1840:
    mov         r2,#1                       @ de = 1 : weak filter selected

    mov         r11,r5                      @ keep beta; r5 is reused for filter_flag_q
    ldr         r4,[sp,#filter_flag_p_offset]   @ loading the filter_flag_p
    ldr         r5,[sp,#filter_flag_q_offset]   @ loading the filter_flag_q

    cmp         r6,#1
    moveq       r9,#0
    moveq       r10,#0
    beq         end_dep_deq_decision_horz

    and         r7,r4,r5
    cmp         r7,#1
    beq         both_flags_set_horz
    cmp         r4,#0
    beq         set_flag_dep_zero_horz

    @ filter_flag_p is non-zero and (flag_p & flag_q) != 1 : only dep is evaluated
    add         r8,r11,r11,asr #1
    mov         r10,#0
    asr         r8,#3
    cmp         r8,r14
    movgt       r9,#1
    movle       r9,#0
    b           end_dep_deq_decision_horz
set_flag_dep_zero_horz:
    @ filter_flag_p == 0 : only deq is evaluated
    add         r8,r11,r11,asr #1
    mov         r9,#0
    asr         r8,#3
    cmp         r8,r12
    movgt       r10,#1
    movle       r10,#0
    b           end_dep_deq_decision_horz

both_flags_set_horz:
    add         r8,r11,r11,asr #1
    asr         r8,#3
    cmp         r8,r14
    movgt       r9,#1
    movle       r9,#0
    cmp         r8,r12
    movgt       r10,#1
    movle       r10,#0
end_dep_deq_decision_horz:

    @ r0 = source address
    @ r1 = stride
    @ r2 = de (1 = weak, 2 = strong)
    @ r4 = filter_flag_p
    @ r5 = filter_flag_q
    @ r6 = tc
    @ r9 = dep, r10 = deq (only set on the weak path)

    @ The two neon instructions below belong to the strong filter (start of
    @ p1' and the lower clamp for p2'); on the weak path their results are
    @ overwritten before being used.
    vmin.u8     d18,d21,d16
    cmp         r2,#1
    vqsub.u8    d31,d23,d1                  @ d31 = p2 - 2tc
    beq         l1.2408                     @ weak filter
    vaddl.u8    q4,d23,d22                  @ q4 = p2 + p3
    cmp         r5,#1

    @ NOTE(review): when filter_flag_q != 1 the p side is written without
    @ testing filter_flag_p - presumably callers never pass both flags as 0;
    @ confirm against the c reference.
    bne         strong_filtering_p

strong_filtering_q:
    mov         r12,r0
    vst1.32     d4[0],[r12],r1              @ q0'
    vst1.32     d5[0],[r12],r1              @ q1'
    vst1.32     d3[0],[r12]                 @ q2'
    cmp         r4,#1
    bne         l1.2404
strong_filtering_p:
    @ p1' = (p2 + p1 + p0 + q0 + 2) >> 2            clamped to p1 +/- 2tc -> d5
    @ p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3   clamped to p2 +/- 2tc -> d3
    vmax.u8     d5,d18,d17
    mov         r12,r0
    vmul.i16    q4,q4,d0[0]                 @ q4 = 2 * (p2 + p3)
    rsb         r11,r1,#0                   @ r11 = -src_strd
    vadd.i16    q8,q4,q7
    add         r12,r12,r11                 @ r12 -> row p0
    vrshrn.i16  d19,q8,#3
    vst1.32     d2[0],[r12],r11             @ p0'
    vmin.u8     d18,d19,d30
    vst1.32     d5[0],[r12],r11             @ p1'
    vmax.u8     d3,d18,d31
    vst1.32     d3[0],[r12]                 @ p2'

l1.2404:
    vpop        {d8-d15}
    ldmfd       sp!, {r3-r12,pc}

    @ Weak filter. Register contents on entry:
    @   r4 = filter_flag_p, r5 = filter_flag_q, r6 = tc, r9 = dep, r10 = deq
    @   d23 = p2, d24 = p1, d25 = p0, d26 = q0, d27 = q1, d28 = q2
l1.2408:

    vmov.i16    d0,#0x9

    vsubl.u8    q5,d26,d25                  @ q5 = q0 - p0

    vmul.i16    q5,q5,d0[0]

    vmov.i16    d0,#0x3

    vsubl.u8    q6,d27,d24                  @ q6 = q1 - p1
    vmul.i16    q6,q6,d0[0]

    vdup.8      d30,r6                      @ duplicating the +tc value

    rsb         r12,r6,#0
    vdup.8      d31,r12                     @ duplicating the -tc value

    vsub.i16    q5,q5,q6

    vrshr.s16   q5,q5,#4
    @ delta = ( 9 * (q0 - p0) - 3 * (q1 - p1) + 8 ) >> 4

    vabs.s16    q4,q5
    vmovn.i16   d9,q4
    @ d9 = abs(delta)

    vqmovn.s16  d10,q5
    @ d10 = delta saturated to 8 bits

    vmin.s8     d11,d10,d30
    vmax.s8     d8,d31,d11                  @ d8 has the value delta = clip3(delta, -tc, tc)

    vmovl.u8    q3,d25

    vaddw.s8    q2,q3,d8

    vqmovun.s16 d12,q2                      @ d12 = temp_p0 = clip_u8(p0 + delta)
    vmovl.u8    q3,d26
    vsubw.s8    q2,q3,d8
    vqmovun.s16 d13,q2                      @ d13 = temp_q0 = clip_u8(q0 - delta)

    mov         r11,#0xa
    mul         r12,r11,r6
    vdup.8      d2,r12                      @ d2 has the 10*tc value
    vmov        d18,d24                     @ default p1 output is the unmodified p1
    vdup.8      d0,r6
    vshr.s8     d0,#1                       @ d0 = tc >> 1
    vneg.s8     d1,d0                       @ d1 = -(tc >> 1)

    cmp         r4,#1
    bne         l1.2724                     @ p side not selected
    cmp         r9,#1
    bne         l1.2700                     @ dep == 0 : only p0 is modified

    @ delta_p = clip3((((p2 + p0 + 1) >> 1) - p1 + delta) >> 1, -(tc >> 1), tc >> 1)
    vaddl.u8    q7,d23,d25
    vrshrn.u16  d14,q7,#1
    vsubl.u8    q7,d14,d24
    vaddw.s8    q7,q7,d8
    vqshrn.s16  d14,q7,#1
    vmin.s8     d15,d14,d0
    vmax.s8     d14,d1,d15

    @ d14 has the delta_p value
    vmovl.u8    q8,d24
    vaddw.s8    q8,q8,d14
    vqmovun.s16 d14,q8

    @ d14 = tmp_p1 = clip_u8(p1 + delta_p)
    vcge.u8     d18,d9,d2                   @ lanes with abs(delta) >= 10 * tc keep the original pixel
    vbsl        d18,d24,d14

l1.2700:
    mov         r12,r0
    rsb         r11,r1,#0                   @ r11 = -src_strd
    add         r12,r11                     @ r12 -> row p0
    vcge.u8     d19,d9,d2
    vbsl        d19,d25,d12                 @ abs(delta) >= 10 * tc ? p0 : temp_p0
    vst1.32     {d19[0]},[r12],r11          @ store row p0
    vst1.32     {d18[0]},[r12]              @ store row p1
l1.2724:
    cmp         r5,#1
    bne         l1.2404                     @ q side not selected
    cmp         r10,#1
    vmov        d18, d27                    @ default q1 output is the unmodified q1 (flags untouched)
    bne         l1.2852                     @ deq == 0 : only q0 is modified

    @ delta_q = clip3((((q0 + q2 + 1) >> 1) - q1 - delta) >> 1, -(tc >> 1), tc >> 1)
    vaddl.u8    q7,d26,d28
    vrshrn.u16  d14,q7,#1
    vsubl.u8    q7,d14,d27
    vsubw.s8    q7,q7,d8
    vqshrn.s16  d14,q7,#1
    vmin.s8     d15,d14,d0
    vmax.s8     d14,d1,d15
    @ d14 has the delta_q value
    vmovl.u8    q8,d27
    vaddw.s8    q8,q8,d14
    vqmovun.s16 d14,q8                      @ d14 = tmp_q1 = clip_u8(q1 + delta_q)
    vcge.u8     d18,d9,d2
    vbsl        d18,d27,d14
l1.2852:
    mov         r12,r0
    vcge.u8     d19,d9,d2
    vbsl        d19,d26,d13                 @ abs(delta) >= 10 * tc ? q0 : temp_q0
    vst1.32     {d19[0]},[r12],r1           @ store row q0
    vst1.32     {d18[0]},[r12]              @ store row q1
    vpop        {d8-d15}
    ldmfd       sp!, {r3-r12,r15}