@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_mode_3_to_9.s
@*
@* @brief
@*  contains function definitions for intra prediction chroma angular modes
@*  3 to 9. functions are coded using neon assembly and can be compiled using
@*  rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  chroma intra prediction interpolation filter for angular modes 3 to 9
@*
@* @par description:
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  intra prediction mode (3 to 9)
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_intra_pred_chroma_mode_3_to_9(uword8 *pu1_ref,
@                                         word32 src_strd,
@                                         uword8 *pu1_dst,
@                                         word32 dst_strd,
@                                         word32 nt,
@                                         word32 mode)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #40
@   nt
@   mode

.text
.align 4




.globl ihevc_intra_pred_chroma_mode_3_to_9_a9q
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_chroma
.extern idx_neg_idx_chroma_3_9

gai4_ihevc_ang_table_addr:
.long gai4_ihevc_ang_table - ulbl1 - 8

gai4_ihevc_inv_ang_table_addr:
.long gai4_ihevc_inv_ang_table - ulbl2 - 8

idx_neg_idx_chroma_3_9_addr:
.long idx_neg_idx_chroma_3_9 - ulbl3 - 8

col_for_intra_chroma_addr_1:
.long col_for_intra_chroma - ulbl4 - 8

col_for_intra_chroma_addr_2:
.long col_for_intra_chroma - ulbl5 - 8

col_for_intra_chroma_addr_3:
.long col_for_intra_chroma - ulbl6 - 8

.type ihevc_intra_pred_chroma_mode_3_to_9_a9q, %function

ihevc_intra_pred_chroma_mode_3_to_9_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4, [sp, #40]               @loads nt
    ldr         r7, gai4_ihevc_ang_table_addr
ulbl1:
    add         r7, r7, pc

    ldr         r5, [sp, #44]               @mode (3 to 9)
    ldr         r8, gai4_ihevc_inv_ang_table_addr
ulbl2:
    add         r8, r8, pc

    add         r7, r7, r5, lsl #2          @gai4_ihevc_ang_table[mode]
    ldr         r7, [r7]                    @intra_pred_ang
    vdup.8      d30, r7                     @intra_pred_ang
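@ note (assuming the standard hevc angular-prediction formulation): the code
@ below splits the per-column displacement (col + 1) * intra_pred_ang into
@     idx   = ((col + 1) * intra_pred_ang) >> 5    (integer sample offset)
@     fract = ((col + 1) * intra_pred_ang) & 31    (5-bit filter weight)
@ the idx values are then doubled (vshl #1) because the chroma reference is
@ interleaved u/v, so one chroma sample spans two bytes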

    ldr         r14, col_for_intra_chroma_addr_1
ulbl4:
    add         r14, r14, pc

prologue_8_16_32:
    lsr         r10, r4, #3
    vld1.8      d31, [r14]!
    mul         r10, r4, r10                @block counter (dec by #4 per pass)

    mov         r11, r4, lsl #1             @col counter to be inc/dec by #8
    vmull.s8    q11, d30, d31               @(col+1)*intra_pred_angle [0:7](col)

    sub         r7, r5, #3
    ldr         r12, idx_neg_idx_chroma_3_9_addr @load most idx table
ulbl3:
    add         r12, r12, pc

    add         r12, r12, r7, lsl #4
    mov         r8, r12

    mov         r7, #8
    sub         r7, r7, r3, lsl #3          @r7 = 8-8r3

    ldr         r9, [r8]
    mov         r9, r9, lsl #1
    add         r1, r0, r4, lsl #2          @pu1_ref + 4*nt

    vmovn.s16   d6, q11
    vdup.8      d26, r9                     @most idx added to final idx values
    sub         r1, r1, #26                 @ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row

    sub         r6, r1, r9

    vld1.8      {d0,d1,d2,d3}, [r6]         @stores the 32 values reqd based on indices values (from most idx)
    vshr.s16    q11, q11, #5

    vmov.i8     d29, #31                    @contains #31 for vand operation

    vmov.i8     d28, #32

    vqmovn.s16  d8, q11
    vshl.s8     d8, d8, #1                  @2 * idx

    vand        d6, d6, d29                 @fract values in d6
    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1

    movw        r0, #0x302                  @idx value for v is +1 of u
    vdup.u16    d27, r0
    mov         r0, #0

    vmov.i8     d9, #22                     @row 0 to 7

    vsub.s8     d8, d8, d27                 @ref_main_idx (sub row)
    vsub.s8     d8, d26, d8                 @ref_main_idx (row 0)
    vadd.s8     d8, d8, d9                  @to compensate the pu1_src idx incremented by 8
    vsub.s8     d9, d8, d29                 @ref_main_idx + 1 (row 0)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
    vsub.s8     d7, d28, d6                 @32-fract

    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
    vsub.s8     d4, d8, d29                 @ref_main_idx (row 1)
    vsub.s8     d5, d9, d29                 @ref_main_idx + 1 (row 1)

    vmov.i8     d29, #4

    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
    vmull.u8    q12, d12, d7                @mul (row 0)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 2)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 2)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)

    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
    vmull.u8    q11, d16, d7                @mul (row 1)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 3)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 3)

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
    vmull.u8    q10, d14, d7                @mul (row 2)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 4)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 4)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
    vmull.u8    q9, d10, d7                 @mul (row 3)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 5)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 5)

    vst1.8      d20, [r2], r3               @st (row 2)
    vrshrn.i16  d18, q9, #5                 @round shft (row 3)

    vtbl.8      d16, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
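@ each row is the usual two-tap interpolation (a sketch of the intent):
@     pred = ((32 - fract) * ref[idx] + fract * ref[idx +/- 2] + 16) >> 5
@ where the second tap is the neighbouring sample of the same plane, two
@ bytes away in the interleaved u/v buffer; vmull supplies the (32 - fract)
@ term, vmlal accumulates the fract term and vrshrn.i16 #5 applies the
@ +16 rounding and shift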
    vmull.u8    q12, d12, d7                @mul (row 4)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
    vsub.s8     d8, d8, d29                 @ref_main_idx (row 6)
    vsub.s8     d9, d9, d29                 @ref_main_idx + 1 (row 6)

    vst1.8      d18, [r2], r3               @st (row 3)
    cmp         r4, #4
    beq         end_func
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
    vmull.u8    q11, d16, d7                @mul (row 5)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
    vsub.s8     d4, d4, d29                 @ref_main_idx (row 7)
    vsub.s8     d5, d5, d29                 @ref_main_idx + 1 (row 7)

    vst1.8      d24, [r2], r3               @st (row 4)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
    vmull.u8    q10, d14, d7                @mul (row 6)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d22, [r2], r3               @st (row 5)
    vrshrn.i16  d20, q10, #5                @round shft (row 6)
    vrshrn.i16  d18, q9, #5                 @round shft (row 7)

    vst1.8      d20, [r2], r3               @st (row 6)

    subs        r10, r10, #4                @subtract 4 from the block counter and go to end when done

    vst1.8      d18, [r2], r3               @st (row 7)

    beq         end_func

    subs        r11, r11, #8                @decrement the processed col
    addgt       r8, r8, #4
    addgt       r2, r2, r7
    movle       r8, r12
    suble       r2, r2, r4
    addle       r2, r2, #8
    movle       r11, r4, lsl #1
    ldrle       r14, col_for_intra_chroma_addr_2
ulbl5:
    addle       r14, r14, pc
    addle       r0, r0, #8

    vld1.8      d31, [r14]!
    vmull.s8    q6, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    vmovn.s16   d10, q6
    vshr.s16    q6, q6, #5
    vqmovn.s16  d11, q6
    vshl.s8     d11, d11, #1
    movw        r5, #0x302                  @idx value for v is +1 of u
    vdup.u16    d27, r5                     @row value inc or reset accordingly
    ldr         r9, [r8]                    @loads index value
    mov         r9, r9, lsl #1
    mov         r5, #22
    sub         r5, r5, r0, lsl #1
    vdup.8      d16, r5
    vdup.8      d26, r9

    mov         r5, r2
    vsub.s8     d11, d11, d27               @ref_main_idx (sub row)

kernel_8_16_32:
    vmov.i8     d29, #2                     @contains #2 for adding to get ref_main_idx + 1
    vsub.s8     d8, d26, d11                @ref_main_idx
    vmov        d26, d10

    subs        r11, r11, #8
    sub         r6, r1, r9
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)
    vadd.s8     d8, d8, d16                 @to compensate the pu1_src idx incremented by 8

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx - 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    addle       r0, r0, #8
    vsub.s8     d9, d8, d29                 @ref_main_idx - 2
    addgt       r8, r8, #4

    vld1.8      {d0,d1,d2,d3}, [r6]         @stores the 32 values reqd based on indices values (from most idx)
    vrshrn.i16  d22, q11, #5                @round shft (row 5)

    ldrle       r14, col_for_intra_chroma_addr_3
ulbl6:
    addle       r14, r14, pc
    vst1.8      d24, [r5], r3               @st (row 4)
    movle       r8, r12

    movw        r9, #0x302
    vdup.16     d27, r9                     @row value inc or reset accordingly
    vsub.s8     d4, d8, d29                 @ref_main_idx (row 1)

    vsub.s8     d5, d9, d29                 @ref_main_idx - 1 (row 1)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 0)
    vmov.i8     d29, #31                    @contains #31 for vand operation

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 0)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vld1.8      d31, [r14]!
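@ d31 preloads the column offsets of the next 8 output bytes (4 interleaved
@ u/v columns) from col_for_intra_chroma; their (col+1)*intra_pred_ang
@ products (vmull.s8 q7 below) are formed while the current block is still
@ being filtered, i.e. the loop is software pipelined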
    vand        d6, d29, d26                @fract values in d6

    movle       r11, r4, lsl #1
    vmov.i8     d29, #4                     @contains #4 for decrementing ref_main_idx
    ldr         r9, [r8]

    vst1.8      d22, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 2)
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 1)
    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 2)

    mov         r9, r9, lsl #1
    vsub.s8     d7, d28, d6                 @32-fract

    vmull.u8    q12, d12, d7                @mul (row 0)
    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 1)
    vmlal.u8    q12, d13, d6                @mul (row 0)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 3)
    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 2)
    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 3)

    vmull.u8    q11, d10, d7                @mul (row 1)
    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 2)
    vmlal.u8    q11, d17, d6                @mul (row 1)

    vrshrn.i16  d24, q12, #5                @round shft (row 0)
    vst1.8      d18, [r5], r3               @(from previous loop)st (row 7)

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 4)
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 3)
    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 4)

    vmull.u8    q10, d14, d7                @mul (row 2)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 3)
    vmlal.u8    q10, d15, d6                @mul (row 2)

    add         r5, r2, r3, lsl #2
    vmull.s8    q7, d30, d31                @(col+1)*intra_pred_angle [0:7](col)
    add         r9, r9, r0, lsl #1

    vst1.8      d24, [r2], r3               @st (row 0)
    vrshrn.i16  d22, q11, #5                @round shft (row 1)

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 5)
    vtbl.8      d12, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 4)
    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 3)
    vtbl.8      d13, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 4)
    vmlal.u8    q9, d11, d6                 @mul (row 3)

    vst1.8      d22, [r2], r3               @st (row 1)
    vrshrn.i16  d20, q10, #5                @round shft (row 2)

    vmovn.s16   d10, q7
    vshr.s16    q7, q7, #5

    vsub.s8     d8, d8, d29                 @ref_main_idx (row 6)
    vtbl.8      d21, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 5)
    vsub.s8     d9, d9, d29                 @ref_main_idx - 1 (row 6)

    vmull.u8    q12, d12, d7                @mul (row 4)
    vtbl.8      d17, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 5)
    vqmovn.s16  d11, q7

    vst1.8      d20, [r2], r3               @st (row 2)
    vmlal.u8    q12, d13, d6                @mul (row 4)

    vrshrn.i16  d18, q9, #5                 @round shft (row 3)
    vdup.8      d26, r9

    vsub.s8     d4, d4, d29                 @ref_main_idx (row 7)
    vtbl.8      d14, {d0,d1,d2,d3}, d8      @load from ref_main_idx (row 6)
    vsub.s8     d5, d5, d29                 @ref_main_idx - 1 (row 7)

    mov         r6, #22                     @to compensate the 2*row value
    vshl.u8     d11, #1
    sub         r6, r6, r0, lsl #1

    vmull.u8    q11, d21, d7                @mul (row 5)
    vtbl.8      d15, {d0,d1,d2,d3}, d9      @load from ref_main_idx + 1 (row 6)
    vmlal.u8    q11, d17, d6                @mul (row 5)

    vst1.8      d18, [r2], r3               @st (row 3)
    vrshrn.i16  d24, q12, #5                @round shft (row 4)

    add         r2, r2, r3, lsl #2
    vdup.8      d16, r6
    addgt       r2, r7, r2

    suble       r2, r2, r4
    vsub.s8     d11, d11, d27               @ref_main_idx (add row)
    suble       r2, r2, #8

    subs        r10, r10, #4                @subtract 4 from the block counter

    bne         kernel_8_16_32

epil_8_16_32:
    vtbl.8      d10, {d0,d1,d2,d3}, d4      @load from ref_main_idx (row 7)

    vmull.u8    q10, d14, d7                @mul (row 6)
    vtbl.8      d11, {d0,d1,d2,d3}, d5      @load from ref_main_idx + 1 (row 7)
    vmlal.u8    q10, d15, d6                @mul (row 6)

    vst1.8      d24, [r5], r3               @st (row 4)
    vrshrn.i16  d24, q11, #5                @round shft (row 5)

    vmull.u8    q9, d10, d7                 @mul (row 7)
    vmlal.u8    q9, d11, d6                 @mul (row 7)

    vst1.8      d24, [r5], r3               @(from previous loop)st (row 5)
    vrshrn.i16  d20, q10, #5                @(from previous loop)round shft (row 6)

    vst1.8      d20, [r5], r3               @(from previous loop)st (row 6)
    vrshrn.i16  d18, q9, #5                 @(from previous loop)round shft (row 7)

    vst1.8      d18, [r5], r3               @st (row 7)

end_func:
    ldmfd       sp!, {r4-r12, r15}          @reload the registers from sp

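@/**
@* illustrative scalar model of the per-sample filtering performed above.
@* this is a sketch under assumptions, not the library's c reference: the
@* helper name two_tap_chroma, its parameters and the step convention are
@* hypothetical, and the exact reference index per (row, col), derived from
@* the mode's angle and idx_neg_idx_chroma_3_9, is not reproduced here.
@*
@*  #include <stdint.h>
@*
@*  /* fract = ((col + 1) * intra_pred_ang) & 31, computed per column.
@*     ref points into the interleaved u/v reference buffer; step is the
@*     byte distance to the neighbouring sample of the same plane, +/-2
@*     for chroma (sign assumed to depend on the mode's angle). */
@*  static inline uint8_t two_tap_chroma(const uint8_t *ref, int32_t idx,
@*                                       int32_t step, int32_t fract)
@*  {
@*      return (uint8_t)(((32 - fract) * ref[idx]
@*                        + fract * ref[idx + step] + 16) >> 5);
@*  }
@*/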