@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_weighted_pred_bi_default.s
@*
@* @brief
@*  contains function definitions for weighted prediction used in inter
@*  prediction
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_weighted_pred_bi_default()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  does default bi-weighted prediction on the arrays pointed to by pi2_src1
@*  and pi2_src2 and stores the result at the location pointed to by pu1_dst.
@*  assumptions: the function is optimized assuming width and height are
@*  multiples of 2.
@*
@* @par description:
@*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
@*        >> shift, where shift = 15 - bitdepth
@*
@* @param[in] pi2_src1
@*  pointer to source 1
@*
@* @param[in] pi2_src2
@*  pointer to source 2
@*
@* @param[out] pu1_dst
@*  pointer to destination
@*
@* @param[in] src_strd1
@*  source stride 1
@*
@* @param[in] src_strd2
@*  source stride 2
@*
@* @param[in] dst_strd
@*  destination stride
@*
@* @param[in] lvl_shift1
@*  added before shift and offset
@*
@* @param[in] lvl_shift2
@*  added before shift and offset
@*
@* @param[in] ht
@*  height of the source
@*
@* @param[in] wd
@*  width of the source
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
@                                    word16 *pi2_src2,
@                                    uword8 *pu1_dst,
@                                    word32 src_strd1,
@                                    word32 src_strd2,
@                                    word32 dst_strd,
@                                    word32 lvl_shift1,
@                                    word32 lvl_shift2,
@                                    word32 ht,
@                                    word32 wd)

@**************variables vs registers*****************************************
@   r0 => *pi2_src1
@   r1 => *pi2_src2
@   r2 => *pu1_dst
@   r3 =>  src_strd1
@   r4 =>  src_strd2
@   r5 =>  dst_strd
@   r6 =>  lvl_shift1
@   r7 =>  lvl_shift2
@   r8 =>  ht
@   r9 =>  wd
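@
@ for reference, a minimal scalar C sketch of what this routine computes (not
@ part of the build); the helper name bi_default_ref is made up for
@ illustration, and an 8-bit output is assumed, i.e. shift = 15 - 8 = 7 with
@ the result clipped to [0, 255]:
@
@   #include <stdint.h>
@
@   static void bi_default_ref(const int16_t *pi2_src1, const int16_t *pi2_src2,
@                              uint8_t *pu1_dst, int32_t src_strd1,
@                              int32_t src_strd2, int32_t dst_strd,
@                              int32_t lvl_shift1, int32_t lvl_shift2,
@                              int32_t ht, int32_t wd)
@   {
@       const int32_t shift = 7;                /* 15 - bitdepth, bitdepth = 8 */
@       const int32_t rnd   = 1 << (shift - 1); /* 64, the vmov.i16 q0,#0x40 below */
@       for(int32_t row = 0; row < ht; row++)
@       {
@           for(int32_t col = 0; col < wd; col++)
@           {
@               int32_t val = (pi2_src1[row * src_strd1 + col] + lvl_shift1)
@                           + (pi2_src2[row * src_strd2 + col] + lvl_shift2)
@                           + rnd;
@               val >>= shift;
@               pu1_dst[row * dst_strd + col] =
@                   (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
@           }
@       }
@   }
@
@ the assembly below folds lvl_shift1 + lvl_shift2 + rnd into the single
@ constant register q0 and uses saturating 16-bit adds, which matches the
@ per-pixel adds above for inputs in the legal intermediate range
@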
.text
.syntax unified
.align 4




.globl ihevc_weighted_pred_bi_default_a9q

.type ihevc_weighted_pred_bi_default_a9q, %function

ihevc_weighted_pred_bi_default_a9q:

    stmfd sp!, {r4-r12, r14}        @stack stores the values of the arguments
    ldr r4,[sp,#40]                 @load src_strd2
    lsl r3,r3,#1
    ldr r5,[sp,#44]                 @load dst_strd
    ldr r6,[sp,#48]                 @load lvl_shift1
    lsl r4,r4,#1
    ldr r7,[sp,#52]                 @load lvl_shift2
    ldr r8,[sp,#56]                 @load ht
    ldr r9,[sp,#60]                 @load wd
    vdup.16 q2,r6                   @lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1)
    vdup.16 q3,r7                   @lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2)
    vmov.i16 q0,#0x40               @tmp_lvl_shift = 1 << (shift - 1)
    vadd.i16 q2,q3
    vadd.s16 q0,q0,q2
@   vmvn.i32 q1,#0x6                @vmovq_n_s32(tmp_shift)
    lsl r6,r9,#1
    rsb r7,r6,r3,lsl #2             @4*src_strd1 - wd
    rsb r10,r6,r4,lsl #2            @4*src_strd2 - wd
    @asr r6,#1
    @rsb r6,r6,r5,lsl #2            @4*dst_strd - wd

    cmp r8,#0                       @check ht == 0
    beq end_loops                   @if equal, then end the function

chroma_decision:
    orr r14,r8,r9
    cmp r14,#10
    beq outer_loop_chroma_8x2

    cmp r14,#6
    beq outer_loop_chroma_4x2


luma_decision:
    cmp r9,#24
    beq outer_loop_8

    cmp r9,#16
    bge outer_loop_16

    cmp r9,#12
    beq outer_loop_4

    cmp r9,#8
    bge outer_loop_8
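
@ hedged summary of the dispatch above (for chroma, wd presumably counts
@ interleaved cb/cr samples):
@   (ht | wd) == 10  ->  outer_loop_chroma_8x2  (wd = 8,  ht = 2)
@   (ht | wd) == 6   ->  outer_loop_chroma_4x2  (wd = 4,  ht = 2)
@   wd == 24         ->  outer_loop_8           (three 8-wide strips per row)
@   wd >= 16         ->  outer_loop_16
@   wd == 12         ->  outer_loop_4
@   wd >= 8          ->  outer_loop_8
@   otherwise        ->  fall through to outer_loop_4 below
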
outer_loop_4:
    cmp r9,#0                       @check wd == 0
    beq end_loops                   @if equal, then end the function

core_loop_4:
    add r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    vld1.s16 {d6},[r0]!             @load and increment the pi2_src1
    add r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
    vld1.s16 {d7},[r1]!             @load and increment the pi2_src2
    vld1.s16 {d8},[r11],r3          @load and increment the pi2_src1 ii iteration
    vqadd.s16 d18,d6,d7
    vqadd.s16 d18,d18,d0            @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    vld1.s16 {d9},[r12],r4          @load and increment the pi2_src2 ii iteration
    vqadd.s16 d20,d8,d9             @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    vqadd.s16 d19,d20,d0            @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    vqshrun.s16 d20,q9,#7
    vld1.s16 {d22},[r11],r3         @load and increment the pi2_src1 iii iteration
    vld1.s16 {d23},[r12],r4         @load and increment the pi2_src2 iii iteration
    vqadd.s16 d30,d22,d23
    vqadd.s16 d30,d30,d0            @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
    vld1.s16 {d24},[r11],r3         @load and increment the pi2_src1 iv iteration
    vld1.s16 {d25},[r12],r4         @load and increment the pi2_src2 iv iteration
    vqadd.s16 d18,d24,d25           @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    vqadd.s16 d31,d18,d0
    vst1.32 {d20[0]},[r2]!          @store pu1_dst i iteration
    vst1.32 {d20[1]},[r14],r5       @store pu1_dst ii iteration
    vqshrun.s16 d30,q15,#7
    vst1.32 {d30[0]},[r14],r5       @store pu1_dst iii iteration @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
    subs r9,r9,#4                   @decrement wd by 4 and check for 0
    vst1.32 {d30[1]},[r14],r5       @store pu1_dst iv iteration
    bgt core_loop_4                 @if greater than 0 repeat the core loop again

end_core_loop_4:

    subs r8,r8,#4                   @decrement the ht by 4

    add r0,r0,r7                    @pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr r9,r6,#1
    add r1,r1,r10                   @pi2_src2 + 4*src_strd2 - 2*wd
    rsb r14,r9,r5,lsl #2            @4*dst_strd - wd
    add r2,r2,r14                   @pu1_dst + 4*dst_strd - wd
    bgt core_loop_4                 @if ht is greater than 0 go back to core_loop_4

    b end_loops


@ this is only for chroma module with input 2x2
outer_loop_chroma_4x2:
    cmp r9,#0                       @check wd == 0
    beq end_loops                   @if equal, then end the function
    rsb r7,r6,r3,lsl #1             @2*src_strd1 - wd
    rsb r10,r6,r4,lsl #1            @2*src_strd2 - wd
core_loop_chroma_4x2:
    add r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    vld1.s16 {d6},[r0]!             @load and increment the pi2_src1
    add r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
    vld1.s16 {d7},[r1]!             @load and increment the pi2_src2
    vld1.s16 {d8},[r11],r3          @load and increment the pi2_src1 ii iteration
    vqadd.s16 d18,d6,d7
    vqadd.s16 d18,d18,d0            @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    vld1.s16 {d9},[r12],r4          @load and increment the pi2_src2 ii iteration
    vqadd.s16 d20,d8,d9             @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    vqadd.s16 d19,d20,d0            @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    vqshrun.s16 d20,q9,#7
    vst1.32 {d20[0]},[r2]!          @store pu1_dst i iteration
    vst1.32 {d20[1]},[r14],r5       @store pu1_dst ii iteration

    subs r9,r9,#4                   @decrement wd by 4 and check for 0

    bgt core_loop_chroma_4x2        @if greater than 0 repeat the core loop again

end_core_loop_chroma_4x2:

    subs r8,r8,#2                   @decrement the ht by 2

    add r0,r0,r7                    @pi2_src1 + 2*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr r9,r6,#1
    add r1,r1,r10                   @pi2_src2 + 2*src_strd2 - 2*wd
    rsb r14,r9,r5,lsl #1            @2*dst_strd - wd
    add r2,r2,r14                   @pu1_dst + 2*dst_strd - wd
    bgt core_loop_chroma_4x2        @if ht is greater than 0 go back to core_loop_chroma_4x2

    b end_loops
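
@ the 8-sample loops below (and the 4-sample loops above) all perform the same
@ per-row work: a saturating 16-bit add of the two sources, a saturating add of
@ the pre-folded constant in q0, then a saturating narrowing right shift by 7
@ to unsigned 8-bit. a hedged intrinsics sketch of one 8-sample step (the
@ helper name bi_default_step8 is made up for illustration, 8-bit output
@ assumed):
@
@   #include <arm_neon.h>
@   #include <stdint.h>
@
@   /* tmp_lvl_shift holds 64 + lvl_shift1 + lvl_shift2, i.e. the value built
@      into q0 in the prologue */
@   static inline void bi_default_step8(const int16_t *src1, const int16_t *src2,
@                                       uint8_t *dst, int16x8_t tmp_lvl_shift)
@   {
@       int16x8_t sum = vqaddq_s16(vld1q_s16(src1), vld1q_s16(src2));
@       sum = vqaddq_s16(sum, tmp_lvl_shift);       /* vqadd.s16 with q0   */
@       vst1_u8(dst, vqshrun_n_s16(sum, 7));        /* vqshrun.s16 ..., #7 */
@   }
@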
outer_loop_8:
    cmp r9,#0                       @check wd == 0
    beq end_loops                   @if equal, then end the function
    add r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
core_loop_8:

    vld1.s16 {q12},[r0]!            @load and increment the pi2_src1
    add r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
    vld1.s16 {q13},[r1]!            @load and increment the pi2_src2
    vqadd.s16 q12,q12,q13
    vld1.s16 {q14},[r11],r3         @load and increment the pi2_src1 ii iteration
    vqadd.s16 q12,q12,q0            @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    vld1.s16 {q15},[r12],r4         @load and increment the pi2_src2 ii iteration
    vld1.s16 {q8},[r11],r3          @load and increment the pi2_src1 iii iteration
    vqadd.s16 q11,q14,q15           @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    vld1.s16 {q9},[r12],r4          @load and increment the pi2_src2 iii iteration
    vqadd.s16 q11,q11,q0            @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    vqshrun.s16 d20,q12,#7
    vld1.s16 {q6},[r11],r3          @load and increment the pi2_src1 iv iteration
    vqadd.s16 q15,q8,q9
    vqshrun.s16 d21,q11,#7
    vld1.s16 {q7},[r12],r4          @load and increment the pi2_src2 iv iteration
    vqadd.s16 q15,q15,q0            @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
    vst1.32 {d20},[r2]!             @store pu1_dst i iteration
    vqadd.s16 q4,q6,q7              @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    vst1.32 {d21},[r14],r5          @store pu1_dst ii iteration
    vqadd.s16 q4,q4,q0
    vqshrun.s16 d30,q15,#7
    vqshrun.s16 d31,q4,#7
    add r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    vst1.32 {d30},[r14],r5          @store pu1_dst iii iteration @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
    subs r9,r9,#8                   @decrement wd by 8 and check for 0
    vst1.32 {d31},[r14],r5          @store pu1_dst iv iteration
    bgt core_loop_8                 @if greater than 0 repeat the core loop again

end_core_loop_8:

    subs r8,r8,#4                   @decrement the ht by 4

    add r0,r0,r7                    @pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr r9,r6,#1
    add r1,r1,r10                   @pi2_src2 + 4*src_strd2 - 2*wd
    rsb r14,r9,r5,lsl #2            @4*dst_strd - wd
    add r2,r2,r14                   @pu1_dst + 4*dst_strd - wd
    add r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)

    bgt core_loop_8
    b end_loops


@ this is only for chroma module with input 4x2
outer_loop_chroma_8x2:
    cmp r9,#0                       @check wd == 0
    beq end_loops                   @if equal, then end the function
    add r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    rsb r7,r6,r3,lsl #1             @2*src_strd1 - wd
    rsb r10,r6,r4,lsl #1            @2*src_strd2 - wd
core_loop_chroma_8x2:

    vld1.s16 {q12},[r0]!            @load and increment the pi2_src1
    add r14,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
    vld1.s16 {q13},[r1]!            @load and increment the pi2_src2
    vqadd.s16 q12,q12,q13
    vld1.s16 {q14},[r11],r3         @load and increment the pi2_src1 ii iteration
    vqadd.s16 q12,q12,q0            @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    vld1.s16 {q15},[r12],r4         @load and increment the pi2_src2 ii iteration
    vld1.s16 {q8},[r11],r3          @load and increment the pi2_src1 iii iteration
    vqadd.s16 q11,q14,q15           @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    vqadd.s16 q11,q11,q0            @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    vqshrun.s16 d20,q12,#7
    vqshrun.s16 d21,q11,#7
    vst1.32 {d20},[r2]!             @store pu1_dst i iteration
    vst1.32 {d21},[r14],r5          @store pu1_dst ii iteration

    add r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)

    subs r9,r9,#8                   @decrement wd by 8 and check for 0

    bgt core_loop_chroma_8x2        @if greater than 0 repeat the core loop again

end_core_loop_chroma_8x2:

    subs r8,r8,#2                   @decrement the ht by 2

    add r0,r0,r7                    @pi2_src1 + 2*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr r9,r6,#1
    add r1,r1,r10                   @pi2_src2 + 2*src_strd2 - 2*wd
    rsb r14,r9,r5,lsl #1            @2*dst_strd - wd
    add r2,r2,r14                   @pu1_dst + 2*dst_strd - wd
    add r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)

    bgt core_loop_chroma_8x2

    b end_loops
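
@ note on the 16-wide path below (a hedged reading of the code): it is
@ software pipelined. prolog_16 issues the loads for the first pair of rows
@ and starts their saturating adds, core_loop_16 overlaps the add/narrow/store
@ work of the current 2x16 block with the loads of the next one, and epilog_16
@ drains the last pending results. each pass over core_loop_16 stores two rows
@ of 16 pixels; when the block is just two rows of 16 (the "chroma with 8x2
@ block size" case noted in prolog_16), prolog_16 branches straight to
@ epilog_16.
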
outer_loop_16:
    cmp r9,#0                       @check wd == 0
    beq end_loops                   @if equal, then end the function
    add r11,r0,r3                   @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add r12,r1,r4                   @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    rsb r7,r6,r3,lsl #1             @2*src_strd1 - wd
    mov r14,#16
    sub r10,r14,r5
    sub r11,r3,r14
    sub r12,r14,r3

    rsb r14,r9,r5,lsl #1            @2*dst_strd - wd



prolog_16:


    vld1.s16 {q1},[r0]!             @load and increment the pi2_src1
    vld1.s16 {q2},[r1]!             @load and increment the pi2_src2
    vld1.s16 {q5},[r0],r11          @load and increment the pi2_src1
    vld1.s16 {q6},[r1],r11          @load and increment the pi2_src2
    vld1.s16 {q3},[r0]!             @load and increment the pi2_src1 ii iteration
    subs r9,r9,#16
    vld1.s16 {q4},[r1]!             @load and increment the pi2_src2 ii iteration
    subeq r8,r8,#2
    vqadd.s16 q11,q1,q2
    vld1.s16 {q7},[r0],r12          @load and increment the pi2_src1 ii iteration
    vqadd.s16 q14,q5,q6
    vld1.s16 {q8},[r1],r12          @load and increment the pi2_src2 ii iteration
    addeq r0,r0,r7
    addeq r1,r1,r7
    vqadd.s16 q12,q3,q4
    vld1.s16 {q1},[r0]!
    vqadd.s16 q13,q7,q8
@ if the input is chroma with 8x2 block size
    cmp r8,#0
    beq epilog_16

    vld1.s16 {q2},[r1]!             @load and increment the pi2_src2
    vqadd.s16 q11,q11,q0
    vld1.s16 {q5},[r0],r11          @load and increment the pi2_src1
    vqadd.s16 q14,q14,q0
    vld1.s16 {q6},[r1],r11          @load and increment the pi2_src2
    vqadd.s16 q12,q12,q0
    vld1.s16 {q3},[r0]!             @load and increment the pi2_src1 ii iteration
    vqadd.s16 q15,q13,q0
    vqshrun.s16 d20,q11,#7
    vld1.s16 {q4},[r1]!             @load and increment the pi2_src2 ii iteration
    vqshrun.s16 d21,q14,#7
    vld1.s16 {q7},[r0],r12          @load and increment the pi2_src1 ii iteration
    vqshrun.s16 d26,q12,#7
    vld1.s16 {q8},[r1],r12          @load and increment the pi2_src2 ii iteration
    vqshrun.s16 d27,q15,#7



core_loop_16:

    cmp r9,#0
    vqadd.s16 q11,q1,q2
    asreq r9,r6,#1
    vst1.32 {q10},[r2],r5
    vqadd.s16 q14,q5,q6
    vst1.32 {q13},[r2],r10
    addeq r2,r2,r14
    vqadd.s16 q12,q3,q4
    subs r9,r9,#16
    addeq r0,r0,r7
    vqadd.s16 q13,q7,q8

    addeq r1,r1,r7
    subseq r8,r8,#2                 @decrement the ht by 2
    beq epilog_16


    vqadd.s16 q11,q11,q0
    vld1.s16 {q1},[r0]!             @load and increment the pi2_src1
    vqadd.s16 q14,q14,q0
    vld1.s16 {q2},[r1]!             @load and increment the pi2_src2
    vqadd.s16 q12,q12,q0
    vld1.s16 {q5},[r0],r11          @load and increment the pi2_src1
    vqadd.s16 q15,q13,q0
    vld1.s16 {q6},[r1],r11          @load and increment the pi2_src2
    vqshrun.s16 d20,q11,#7
    vld1.s16 {q3},[r0]!             @load and increment the pi2_src1 ii iteration
    vqshrun.s16 d21,q14,#7
    vld1.s16 {q4},[r1]!             @load and increment the pi2_src2 ii iteration
    vqshrun.s16 d26,q12,#7
    vld1.s16 {q7},[r0],r12          @load and increment the pi2_src1 ii iteration
    vqshrun.s16 d27,q15,#7
    vld1.s16 {q8},[r1],r12          @load and increment the pi2_src2 ii iteration


    b core_loop_16

epilog_16:

    vqadd.s16 q11,q11,q0
    vqadd.s16 q14,q14,q0
    vqadd.s16 q12,q12,q0
    vqadd.s16 q15,q13,q0
    vqshrun.s16 d20,q11,#7
    vqshrun.s16 d21,q14,#7
    vqshrun.s16 d26,q12,#7
    vqshrun.s16 d27,q15,#7
    vst1.32 {q10},[r2],r5
    vst1.32 {q13},[r2]


end_core_loop_16:


end_loops:
    ldmfd sp!,{r4-r12,r15}          @reload the registers from sp
