1@/****************************************************************************** 2@ * 3@ * Copyright (C) 2015 The Android Open Source Project 4@ * 5@ * Licensed under the Apache License, Version 2.0 (the "License"); 6@ * you may not use this file except in compliance with the License. 7@ * You may obtain a copy of the License at: 8@ * 9@ * http://www.apache.org/licenses/LICENSE-2.0 10@ * 11@ * Unless required by applicable law or agreed to in writing, software 12@ * distributed under the License is distributed on an "AS IS" BASIS, 13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14@ * See the License for the specific language governing permissions and 15@ * limitations under the License. 16@ * 17@ ***************************************************************************** 18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19@*/ 20@** 21@****************************************************************************** 22@* @file 23@* ih264_weighted_bi_pred_a9q.s 24@* 25@* @brief 26@* Contains function definitions for weighted biprediction. 27@* 28@* @author 29@* Kaushik Senthoor R 30@* 31@* @par List of Functions: 32@* 33@* - ih264_weighted_bi_pred_luma_a9q() 34@* - ih264_weighted_bi_pred_chroma_a9q() 35@* 36@* @remarks 37@* None 38@* 39@******************************************************************************* 40@* 41@******************************************************************************* 42@* @function 43@* ih264_weighted_bi_pred_luma_a9q() 44@* 45@* @brief 46@* This routine performs the weighted biprediction as described in sec 47@* 8.4.2.3.2 titled "Weighted sample prediction process" for luma. 48@* 49@* @par Description: 50@* This function gets two ht x wd blocks, calculates the weighted samples, 51@* rounds off, adds offset and stores it in the destination block. 52@* 53@* @param[in] pu1_src1 54@* UWORD8 Pointer to the buffer containing the input block 1. 
@*
@* @param[in] pu1_src2
@*  UWORD8 Pointer to the buffer containing the input block 2.
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination where the output block is stored.
@*
@* @param[in] src_strd1
@*  Stride of the input buffer 1
@*
@* @param[in] src_strd2
@*  Stride of the input buffer 2
@*
@* @param[in] dst_strd
@*  Stride of the destination buffer
@*
@* @param[in] log_wd
@*  number of bits to be rounded off; the weighted sum of the two sources is
@*  rounded and shifted right by (log_wd + 1) bits
@*
@* @param[in] wt1
@*  weight for input block 1 (only the low 8 bits are used, sign-extended)
@*
@* @param[in] wt2
@*  weight for input block 2 (only the low 8 bits are used, sign-extended)
@*
@* @param[in] ofst1
@*  offset 1 used after rounding off (low 8 bits, sign-extended)
@*
@* @param[in] ofst2
@*  offset 2 used after rounding off (low 8 bits, sign-extended)
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  None
@*
@* @remarks
@*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
@*  wd == 16 and wd == 8 select dedicated loops; any other wd falls through to
@*  the 4-wide loop. Every loop handles four rows per iteration and runs at
@*  least once, so ht is expected to be a positive multiple of 4.
@*  The products and their sum are kept in 16-bit lanes.
@*
@*******************************************************************************
@*
@void ih264_weighted_bi_pred_luma_a9q(UWORD8 *pu1_src1,
@                                     UWORD8 *pu1_src2,
@                                     UWORD8 *pu1_dst,
@                                     WORD32 src_strd1,
@                                     WORD32 src_strd2,
@                                     WORD32 dst_strd,
@                                     WORD32 log_wd,
@                                     WORD32 wt1,
@                                     WORD32 wt2,
@                                     WORD32 ofst1,
@                                     WORD32 ofst2,
@                                     WORD32 ht,
@                                     WORD32 wd)
@
@**************Variables Vs Registers*****************************************
@   r0      => pu1_src1
@   r1      => pu1_src2
@   r2      => pu1_dst
@   r3      => src_strd1
@   [sp]    => src_strd2 (r4)
@   [sp+4]  => dst_strd  (r5)
@   [sp+8]  => log_wd    (r6)
@   [sp+12] => wt1       (r7)
@   [sp+16] => wt2       (r8)
@   [sp+20] => ofst1     (r9)
@   [sp+24] => ofst2     (r10)
@   [sp+28] => ht        (r11)
@   [sp+32] => wd        (r12)
@
@   The stack offsets above are as seen at function entry. The prologue pushes
@   ten core registers (40 bytes), so the code below reads each argument at
@   (entry offset + 40). All stack arguments are loaded before d8-d15 are
@   pushed, so the offsets never have to account for the vpush.
@
@   NEON register roles (valid in all three loops):
@   q0      => -(log_wd + 1) in every 16-bit lane (shift count for vrshl)
@   d2      => {wt1, wt2} as two 32-bit words; lanes d2[0] / d2[2] are their
@              low 16 bits and are used as the scalar multipliers
@   d3      => ofst = (ofst1 + ofst2 + 1) >> 1 in every 8-bit lane
@
.text
.p2align 2

    .global ih264_weighted_bi_pred_luma_a9q

ih264_weighted_bi_pred_luma_a9q:

    stmfd         sp!, {r4-r12, r14}    @save r4-r12 and lr (40 bytes); stack arguments now start at [sp, #40]
    ldr           r6, [sp, #48]         @Load log_wd in r6
    ldr           r7, [sp, #52]         @Load wt1 in r7
    ldr           r8, [sp, #56]         @Load wt2 in r8
    ldr           r9, [sp, #60]         @Load ofst1 in r9

    add           r6, r6, #1            @r6  = log_wd + 1
    @ NOTE(review): sxtb keeps only bits 0-7 of wt1 (and of wt2 below), while the
    @ multiplies read a full 16-bit lane. A weight outside [-128, 127] would be
    @ altered here - confirm the range callers can pass.
    sxtb          r7, r7                @sign-extend the low 8 bits of wt1 to 32-bit
    ldr           r4, [sp, #40]         @Load src_strd2 in r4
    ldr           r5, [sp, #44]         @Load dst_strd in r5
    sxtb          r9, r9                @sign-extend 8-bit ofst1 to 32-bit
    rsb           r10, r6, #0           @r10 = -(log_wd + 1)
    ldr           r11, [sp, #68]        @Load ht in r11
    ldr           r12, [sp, #72]        @Load wd in r12
    vdup.16       q0, r10               @Q0  = -(log_wd + 1) (16-bit lanes); negative count => right shift in vrshl
    add           r9, r9, #1            @r9 = ofst1 + 1

    ldr           r10, [sp, #64]        @Load ofst2 in r10 (last stack argument read; must precede the vpush)
    sxtb          r8, r8                @sign-extend the low 8 bits of wt2 to 32-bit
    cmp           r12, #16              @check if wd is 16 (flags are consumed by the beq below; nothing in between sets flags)
    vpush         {d8-d15}              @save the callee-saved NEON registers used by the loops
    sxtb          r10, r10              @sign-extend 8-bit ofst2 to 32-bit
    add           r9, r9, r10           @r9 = ofst1 + ofst2 + 1
    vmov          d2, r7, r8            @D2 = {wt1(32-bit), wt2(32-bit)}
    asr           r9, r9, #1            @r9 = ofst = (ofst1 + ofst2 + 1) >> 1
    vdup.8        d3, r9                @D3 = ofst (8-bit)
    beq           loop_16               @branch if wd is 16

    cmp           r12, #8               @check if wd is 8
    beq           loop_8                @branch if wd is 8

loop_4:                                 @each iteration processes four rows (4 bytes per row, two rows per D register)

    vld1.32       d4[0], [r0], r3       @load row 1 in source 1
    vld1.32       d4[1], [r0], r3       @load row 2 in source 1
    vld1.32       d6[0], [r1], r4       @load row 1 in source 2
    vld1.32       d6[1], [r1], r4       @load row 2 in source 2

    vmovl.u8      q2, d4                @converting rows 1,2 in source 1 to 16-bit
    vld1.32       d8[0], [r0], r3       @load row 3 in source 1
    vld1.32       d8[1], [r0], r3       @load row 4 in source 1
    vmovl.u8      q3, d6                @converting rows 1,2 in source 2 to 16-bit
    vld1.32       d10[0], [r1], r4      @load row 3 in source 2
    vld1.32       d10[1], [r1], r4      @load row 4 in source 2

    vmovl.u8      q4, d8                @converting rows 3,4 in source 1 to 16-bit
    vmovl.u8      q5, d10               @converting rows 3,4 in source 2 to 16-bit

    vmul.s16      q2, q2, d2[0]         @weight 1 mult. for rows 1,2
    vmla.s16      q2, q3, d2[2]         @weight 2 mult. for rows 1,2
    vmul.s16      q4, q4, d2[0]         @weight 1 mult. for rows 3,4
    vmla.s16      q4, q5, d2[2]         @weight 2 mult. for rows 3,4

    subs          r11, r11, #4          @decrement ht by 4 (flags are consumed by the bgt at the end of the loop)
    vrshl.s16     q2, q2, q0            @rounds off the weighted samples from rows 1,2
    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from rows 3,4

    vaddw.s8      q2, q2, d3            @adding offset for rows 1,2
    vaddw.s8      q4, q4, d3            @adding offset for rows 3,4

    vqmovun.s16   d4, q2                @saturating rows 1,2 to unsigned 8-bit
    vqmovun.s16   d8, q4                @saturating rows 3,4 to unsigned 8-bit

    vst1.32       d4[0], [r2], r5       @store row 1 in destination
    vst1.32       d4[1], [r2], r5       @store row 2 in destination
    vst1.32       d8[0], [r2], r5       @store row 3 in destination
    vst1.32       d8[1], [r2], r5       @store row 4 in destination

    bgt           loop_4                @if greater than 0 repeat the loop again

    b             end_loops

loop_8:                                 @each iteration processes four rows (8 bytes per row)

    vld1.8        d4, [r0], r3          @load row 1 in source 1
    vld1.8        d6, [r1], r4          @load row 1 in source 2
    vld1.8        d8, [r0], r3          @load row 2 in source 1
    vld1.8        d10, [r1], r4         @load row 2 in source 2
    vmovl.u8      q2, d4                @converting row 1 in source 1 to 16-bit
    vld1.8        d12, [r0], r3         @load row 3 in source 1
    vld1.8        d14, [r1], r4         @load row 3 in source 2
    vmovl.u8      q3, d6                @converting row 1 in source 2 to 16-bit
    vld1.8        d16, [r0], r3         @load row 4 in source 1
    vld1.8        d18, [r1], r4         @load row 4 in source 2

    vmovl.u8      q4, d8                @converting row 2 in source 1 to 16-bit
    vmovl.u8      q5, d10               @converting row 2 in source 2 to 16-bit

    vmul.s16      q2, q2, d2[0]         @weight 1 mult. for row 1
    vmla.s16      q2, q3, d2[2]         @weight 2 mult. for row 1
    vmovl.u8      q6, d12               @converting row 3 in source 1 to 16-bit
    vmovl.u8      q7, d14               @converting row 3 in source 2 to 16-bit
    vmul.s16      q4, q4, d2[0]         @weight 1 mult. for row 2
    vmla.s16      q4, q5, d2[2]         @weight 2 mult. for row 2
    vmovl.u8      q8, d16               @converting row 4 in source 1 to 16-bit
    vmovl.u8      q9, d18               @converting row 4 in source 2 to 16-bit

    vmul.s16      q6, q6, d2[0]         @weight 1 mult. for row 3
    vmla.s16      q6, q7, d2[2]         @weight 2 mult. for row 3
    vmul.s16      q8, q8, d2[0]         @weight 1 mult. for row 4
    vmla.s16      q8, q9, d2[2]         @weight 2 mult. for row 4

    vrshl.s16     q2, q2, q0            @rounds off the weighted samples from row 1
    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 2
    vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 3
    vaddw.s8      q2, q2, d3            @adding offset for row 1
    vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 4
    vaddw.s8      q4, q4, d3            @adding offset for row 2

    vaddw.s8      q6, q6, d3            @adding offset for row 3
    vqmovun.s16   d4, q2                @saturating row 1 to unsigned 8-bit
    vaddw.s8      q8, q8, d3            @adding offset for row 4
    vqmovun.s16   d8, q4                @saturating row 2 to unsigned 8-bit

    vqmovun.s16   d12, q6               @saturating row 3 to unsigned 8-bit
    vqmovun.s16   d16, q8               @saturating row 4 to unsigned 8-bit

    vst1.8        d4, [r2], r5          @store row 1 in destination
    vst1.8        d8, [r2], r5          @store row 2 in destination
    subs          r11, r11, #4          @decrement ht by 4
    vst1.8        d12, [r2], r5         @store row 3 in destination
    vst1.8        d16, [r2], r5         @store row 4 in destination

    bgt           loop_8                @if greater than 0 repeat the loop again

    b             end_loops

loop_16:                                @each iteration processes four rows (16 bytes per row, split into L/H halves)

    vld1.8        {q2}, [r0], r3        @load row 1 in source 1
    vld1.8        {q3}, [r1], r4        @load row 1 in source 2
    vld1.8        {q4}, [r0], r3        @load row 2 in source 1
    vld1.8        {q5}, [r1], r4        @load row 2 in source 2
    vmovl.u8      q10, d4               @converting row 1L in source 1 to 16-bit
    vld1.8        {q6}, [r0], r3        @load row 3 in source 1
    vld1.8        {q7}, [r1], r4        @load row 3 in source 2
    vmovl.u8      q11, d6               @converting row 1L in source 2 to 16-bit
    vld1.8        {q8}, [r0], r3        @load row 4 in source 1
    vld1.8        {q9}, [r1], r4        @load row 4 in source 2

    vmovl.u8      q2, d5                @converting row 1H in source 1 to 16-bit
    vmovl.u8      q3, d7                @converting row 1H in source 2 to 16-bit

    vmul.s16      q10, q10, d2[0]       @weight 1 mult. for row 1L
    vmla.s16      q10, q11, d2[2]       @weight 2 mult. for row 1L
    vmovl.u8      q12, d8               @converting row 2L in source 1 to 16-bit
    vmovl.u8      q13, d10              @converting row 2L in source 2 to 16-bit

    vmul.s16      q2, q2, d2[0]         @weight 1 mult. for row 1H
    vmla.s16      q2, q3, d2[2]         @weight 2 mult. for row 1H
    vmovl.u8      q4, d9                @converting row 2H in source 1 to 16-bit
    vmovl.u8      q5, d11               @converting row 2H in source 2 to 16-bit

    vmul.s16      q12, q12, d2[0]       @weight 1 mult. for row 2L
    vmla.s16      q12, q13, d2[2]       @weight 2 mult. for row 2L
    vmovl.u8      q14, d12              @converting row 3L in source 1 to 16-bit
    vmovl.u8      q15, d14              @converting row 3L in source 2 to 16-bit

    vmul.s16      q4, q4, d2[0]         @weight 1 mult. for row 2H
    vmla.s16      q4, q5, d2[2]         @weight 2 mult. for row 2H
    vmovl.u8      q6, d13               @converting row 3H in source 1 to 16-bit
    vmovl.u8      q7, d15               @converting row 3H in source 2 to 16-bit

    vmul.s16      q14, q14, d2[0]       @weight 1 mult. for row 3L
    vmla.s16      q14, q15, d2[2]       @weight 2 mult. for row 3L
    vmovl.u8      q11, d16              @converting row 4L in source 1 to 16-bit (q11 is free: row 1L of source 2 already consumed)
    vmovl.u8      q3, d18               @converting row 4L in source 2 to 16-bit (q3 is free: row 1H of source 2 already consumed)

    vmul.s16      q6, q6, d2[0]         @weight 1 mult. for row 3H
    vmla.s16      q6, q7, d2[2]         @weight 2 mult. for row 3H
    vmovl.u8      q8, d17               @converting row 4H in source 1 to 16-bit
    vmovl.u8      q9, d19               @converting row 4H in source 2 to 16-bit

    vmul.s16      q11, q11, d2[0]       @weight 1 mult. for row 4L
    vmla.s16      q11, q3, d2[2]        @weight 2 mult. for row 4L
    vrshl.s16     q10, q10, q0          @rounds off the weighted samples from row 1L

    vmul.s16      q8, q8, d2[0]         @weight 1 mult. for row 4H
    vmla.s16      q8, q9, d2[2]         @weight 2 mult. for row 4H
    vrshl.s16     q2, q2, q0            @rounds off the weighted samples from row 1H

    vrshl.s16     q12, q12, q0          @rounds off the weighted samples from row 2L
    vaddw.s8      q10, q10, d3          @adding offset for row 1L
    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 2H
    vaddw.s8      q2, q2, d3            @adding offset for row 1H
    vrshl.s16     q14, q14, q0          @rounds off the weighted samples from row 3L
    vaddw.s8      q12, q12, d3          @adding offset for row 2L
    vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 3H
    vaddw.s8      q4, q4, d3            @adding offset for row 2H
    vrshl.s16     q11, q11, q0          @rounds off the weighted samples from row 4L
    vaddw.s8      q14, q14, d3          @adding offset for row 3L
    vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 4H
    vaddw.s8      q6, q6, d3            @adding offset for row 3H

    vqmovun.s16   d26, q10              @saturating row 1L to unsigned 8-bit (d26/d27 = q13, stored as row 1)
    vaddw.s8      q11, q11, d3          @adding offset for row 4L
    vqmovun.s16   d27, q2               @saturating row 1H to unsigned 8-bit
    vaddw.s8      q8, q8, d3            @adding offset for row 4H

    vqmovun.s16   d10, q12              @saturating row 2L to unsigned 8-bit (d10/d11 = q5)
    vqmovun.s16   d11, q4               @saturating row 2H to unsigned 8-bit
    vqmovun.s16   d30, q14              @saturating row 3L to unsigned 8-bit (d30/d31 = q15)
    vqmovun.s16   d31, q6               @saturating row 3H to unsigned 8-bit
    vst1.8        {q13}, [r2], r5       @store row 1 in destination
    vqmovun.s16   d14, q11              @saturating row 4L to unsigned 8-bit (d14/d15 = q7)
    vqmovun.s16   d15, q8               @saturating row 4H to unsigned 8-bit

    vst1.8        {q5}, [r2], r5        @store row 2 in destination
    subs          r11, r11, #4          @decrement ht by 4
    vst1.8        {q15}, [r2], r5       @store row 3 in destination
    vst1.8        {q7}, [r2], r5        @store row 4 in destination

    bgt           loop_16               @if greater than 0 repeat the loop again

end_loops:

    vpop          {d8-d15}              @restore the callee-saved NEON registers
    ldmfd         sp!, {r4-r12, r15}    @restore r4-r12 and return (saved lr is popped into pc)


@*******************************************************************************
@* @function
@*  ih264_weighted_bi_pred_chroma_a9q()
@*
@* @brief
@*  This routine performs the weighted biprediction as described in sec
@* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
@*
@* @par Description:
@*  This function gets two ht x wd blocks, calculates the weighted samples,
@* rounds off, adds offset and stores it in the destination block for U and V.
@*
@* @param[in] pu1_src1
@*  UWORD8 Pointer to the buffer containing the input block 1.
@*
@* @param[in] pu1_src2
@*  UWORD8 Pointer to the buffer containing the input block 2.
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination where the output block is stored.
@*
@* @param[in] src_strd1
@*  Stride of the input buffer 1
@*
@* @param[in] src_strd2
@*  Stride of the input buffer 2
@*
@* @param[in] dst_strd
@*  Stride of the destination buffer
@*
@* @param[in] log_wd
@*  number of bits to be rounded off; the weighted sum of the two sources is
@*  rounded and shifted right by (log_wd + 1) bits
@*
@* @param[in] wt1
@*  weights for the weighted prediction in U and V. The 32-bit word is
@*  replicated across the vector and used as 16-bit lanes: bits 0-15 multiply
@*  the even-indexed bytes of each row and bits 16-31 the odd-indexed bytes
@*  (U and V respectively, assuming U-first interleaving - confirm with caller).
@*
@* @param[in] wt2
@*  weights for the weighted prediction in U and V (same packing as wt1)
@*
@* @param[in] ofst1
@*  offset 1 used after rounding off for U and V. Bits 0-7 are sign-extended
@*  and applied to the even-indexed bytes, bits 8-15 to the odd-indexed bytes.
@*
@* @param[in] ofst2
@*  offset 2 used after rounding off for U and V (same packing as ofst1)
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array; each unit of wd is two bytes per row
@*  (wd == 8 rows are loaded as 16 bytes)
@*
@* @returns
@*  None
@*
@* @remarks
@*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
@*  wd == 8 and wd == 4 select dedicated loops; any other wd falls through to
@*  the 2-wide loop. The 2- and 4-wide loops handle two rows per iteration, the
@*  8-wide loop handles four rows per iteration.
@*
@*******************************************************************************
@*
@void ih264_weighted_bi_pred_chroma_a9q(UWORD8 *pu1_src1,
@                                       UWORD8 *pu1_src2,
@                                       UWORD8 *pu1_dst,
@                                       WORD32 src_strd1,
@                                       WORD32 src_strd2,
@                                       WORD32 dst_strd,
@                                       WORD32 log_wd,
@                                       WORD32 wt1,
@                                       WORD32 wt2,
@                                       WORD32 ofst1,
@                                       WORD32 ofst2,
@                                       WORD32 ht,
@                                       WORD32 wd)
@
@**************Variables Vs Registers*****************************************
@   r0      => pu1_src1
@   r1      => pu1_src2
@   r2      => pu1_dst
@   r3      => src_strd1
@   [sp]    => src_strd2 (r4)
@   [sp+4]  => dst_strd  (r5)
@   [sp+8]  => log_wd    (r6)
@   [sp+12] => wt1       (r7)
@   [sp+16] => wt2       (r8)
@   [sp+20] => ofst1     (r9)
@   [sp+24] => ofst2     (r10)
@   [sp+28] => ht        (r11)
@   [sp+32] => wd        (r12)
@
@   The stack offsets above are as seen at function entry. The prologue pushes
@   ten core registers (40 bytes), so the code below reads each argument at
@   (entry offset + 40). All stack arguments are loaded before d8-d15 are
@   pushed, so the offsets never have to account for the vpush.
@
@   NEON register roles (valid in all three loops):
@   q0      => -(log_wd + 1) in every 16-bit lane (shift count for vrshl)
@   q1      => wt1 replicated in every 32-bit lane
@   q2      => wt2 replicated in every 32-bit lane
@   q3      => {ofst_u, ofst_v} (16-bit each) replicated in every 32-bit lane
@


    .global ih264_weighted_bi_pred_chroma_a9q

ih264_weighted_bi_pred_chroma_a9q:

    stmfd         sp!, {r4-r12, r14}    @save r4-r12 and lr (40 bytes); stack arguments now start at [sp, #40]

    ldr           r6, [sp, #48]         @Load log_wd in r6
    ldr           r7, [sp, #52]         @Load wt1 in r7
    ldr           r8, [sp, #56]         @Load wt2 in r8
    add           r6, r6, #1            @r6  = log_wd + 1
    ldr           r9, [sp, #60]         @Load ofst1 in r9
    ldr           r10, [sp, #64]        @Load ofst2 in r10

    rsb           r12, r6, #0           @r12 = -(log_wd + 1)
    ldr           r4, [sp, #40]         @Load src_strd2 in r4
    ldr           r5, [sp, #44]         @Load dst_strd in r5
    vdup.16       q0, r12               @Q0  = -(log_wd + 1) (16-bit); negative count => right shift in vrshl

    ldr           r11, [sp, #68]        @Load ht in r11
    vdup.32       q1, r7                @Q1  = (wt1_u, wt1_v) (32-bit)
    ldr           r12, [sp, #72]        @Load wd in r12 (last stack argument read; must precede the vpush)
    vdup.32       q2, r8                @Q2  = (wt2_u, wt2_v) (32-bit)
    asr           r7, r9, #8            @r7  = ofst1_v (bits 8 and up of ofst1; narrowed to 8 bits by the sxtb below)
    asr           r8, r10, #8           @r8  = ofst2_v (bits 8 and up of ofst2; narrowed to 8 bits by the sxtb below)
    vpush         {d8-d15}              @save the callee-saved NEON registers used by the loops
    sxtb          r9, r9                @sign-extend 8-bit ofst1_u to 32-bit
    sxtb          r10, r10              @sign-extend 8-bit ofst2_u to 32-bit
    sxtb          r7, r7                @sign-extend 8-bit ofst1_v to 32-bit
    sxtb          r8, r8                @sign-extend 8-bit ofst2_v to 32-bit

    add           r9, r9, #1            @r9 = ofst1_u + 1
    add           r7, r7, #1            @r7 = ofst1_v + 1
    add           r9, r9, r10           @r9 = ofst1_u + ofst2_u + 1
    add           r7, r7, r8            @r7 = ofst1_v + ofst2_v + 1
    asr           r9, r9, #1            @r9 = ofst_u = (ofst1_u + ofst2_u + 1) >> 1
    asr           r7, r7, #1            @r7 = ofst_v = (ofst1_v + ofst2_v + 1) >> 1
    cmp           r12, #8               @check if wd is 8 (flags are consumed by the beq below; pkhbt and vdup do not set flags)
    pkhbt         r9, r9, r7, lsl #16   @r9 = {ofst_u(16-bit), ofst_v(16-bit)}: low half from r9, high half from r7
    vdup.32       q3, r9                @Q3 = {ofst_u(16-bit), ofst_v(16-bit)}
    beq           loop_8_uv             @branch if wd is 8

    cmp           r12, #4               @check if wd is 4
    beq           loop_4_uv             @branch if wd is 4

loop_2_uv:                              @each iteration processes two rows (4 bytes per row, both rows in one D register)

    vld1.32       d8[0], [r0], r3       @load row 1 in source 1
    vld1.32       d8[1], [r0], r3       @load row 2 in source 1
    vld1.32       d10[0], [r1], r4      @load row 1 in source 2
    vld1.32       d10[1], [r1], r4      @load row 2 in source 2

    vmovl.u8      q4, d8                @converting rows 1,2 in source 1 to 16-bit
    vmovl.u8      q5, d10               @converting rows 1,2 in source 2 to 16-bit

    vmul.s16      q4, q4, q1            @weight 1 mult. for rows 1,2
    vmla.s16      q4, q5, q2            @weight 2 mult. for rows 1,2

    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from rows 1,2

    vadd.s16      q4, q4, q3            @adding offset for rows 1,2

    vqmovun.s16   d8, q4                @saturating rows 1,2 to unsigned 8-bit

    vst1.32       d8[0], [r2], r5       @store row 1 in destination
    vst1.32       d8[1], [r2], r5       @store row 2 in destination

    subs          r11, r11, #2          @decrement ht by 2
    bgt           loop_2_uv             @if greater than 0 repeat the loop again

    b             end_loops_uv

loop_4_uv:                              @each iteration processes two rows (8 bytes per row)

    vld1.8        d8, [r0], r3          @load row 1 in source 1
    vld1.8        d10, [r1], r4         @load row 1 in source 2
    vmovl.u8      q4, d8                @converting row 1 in source 1 to 16-bit
    vld1.8        d12, [r0], r3         @load row 2 in source 1
    vmovl.u8      q5, d10               @converting row 1 in source 2 to 16-bit
    vld1.8        d14, [r1], r4         @load row 2 in source 2

    vmovl.u8      q6, d12               @converting row 2 in source 1 to 16-bit
    vmul.s16      q4, q4, q1            @weight 1 mult. for row 1
    vmla.s16      q4, q5, q2            @weight 2 mult. for row 1
    vmovl.u8      q7, d14               @converting row 2 in source 2 to 16-bit

    vmul.s16      q6, q6, q1            @weight 1 mult. for row 2
    vmla.s16      q6, q7, q2            @weight 2 mult. for row 2

    subs          r11, r11, #2          @decrement ht by 2 (flags are consumed by the bgt at the end of the loop)
    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 1
    vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 2
    vadd.s16      q4, q4, q3            @adding offset for row 1
    vadd.s16      q6, q6, q3            @adding offset for row 2

    vqmovun.s16   d8, q4                @saturating row 1 to unsigned 8-bit
    vqmovun.s16   d12, q6               @saturating row 2 to unsigned 8-bit

    vst1.8        d8, [r2], r5          @store row 1 in destination
    vst1.8        d12, [r2], r5         @store row 2 in destination

    bgt           loop_4_uv             @if greater than 0 repeat the loop again

    b             end_loops_uv

loop_8_uv:                              @each iteration processes four rows (16 bytes per row, split into L/H halves)

    vld1.8        {q4}, [r0], r3        @load row 1 in source 1
    vld1.8        {q5}, [r1], r4        @load row 1 in source 2
    vld1.8        {q6}, [r0], r3        @load row 2 in source 1
    vld1.8        {q7}, [r1], r4        @load row 2 in source 2
    vmovl.u8      q12, d8               @converting row 1L in source 1 to 16-bit
    vld1.8        {q8}, [r0], r3        @load row 3 in source 1
    vld1.8        {q9}, [r1], r4        @load row 3 in source 2
    vmovl.u8      q13, d10              @converting row 1L in source 2 to 16-bit
    vld1.8        {q10}, [r0], r3       @load row 4 in source 1
    vld1.8        {q11}, [r1], r4       @load row 4 in source 2

    vmovl.u8      q4, d9                @converting row 1H in source 1 to 16-bit
    vmovl.u8      q5, d11               @converting row 1H in source 2 to 16-bit

    vmul.s16      q12, q12, q1          @weight 1 mult. for row 1L
    vmla.s16      q12, q13, q2          @weight 2 mult. for row 1L
    vmovl.u8      q14, d12              @converting row 2L in source 1 to 16-bit
    vmovl.u8      q15, d14              @converting row 2L in source 2 to 16-bit

    vmul.s16      q4, q4, q1            @weight 1 mult. for row 1H
    vmla.s16      q4, q5, q2            @weight 2 mult. for row 1H
    vmovl.u8      q6, d13               @converting row 2H in source 1 to 16-bit
    vmovl.u8      q7, d15               @converting row 2H in source 2 to 16-bit

    vmul.s16      q14, q14, q1          @weight 1 mult. for row 2L
    vmla.s16      q14, q15, q2          @weight 2 mult. for row 2L
    vmovl.u8      q13, d16              @converting row 3L in source 1 to 16-bit (q13 is free: row 1L of source 2 already consumed)
    vmovl.u8      q5, d18               @converting row 3L in source 2 to 16-bit (q5 is free: row 1H of source 2 already consumed)

    vmul.s16      q6, q6, q1            @weight 1 mult. for row 2H
    vmla.s16      q6, q7, q2            @weight 2 mult. for row 2H
    vmovl.u8      q8, d17               @converting row 3H in source 1 to 16-bit
    vmovl.u8      q9, d19               @converting row 3H in source 2 to 16-bit

    vmul.s16      q13, q13, q1          @weight 1 mult. for row 3L
    vmla.s16      q13, q5, q2           @weight 2 mult. for row 3L
    vmovl.u8      q15, d20              @converting row 4L in source 1 to 16-bit (q15 is free: row 2L of source 2 already consumed)
    vmovl.u8      q7, d22               @converting row 4L in source 2 to 16-bit (q7 is free: row 2H of source 2 already consumed)

    vmul.s16      q8, q8, q1            @weight 1 mult. for row 3H
    vmla.s16      q8, q9, q2            @weight 2 mult. for row 3H
    vmovl.u8      q10, d21              @converting row 4H in source 1 to 16-bit
    vmovl.u8      q11, d23              @converting row 4H in source 2 to 16-bit

    vmul.s16      q15, q15, q1          @weight 1 mult. for row 4L
    vmla.s16      q15, q7, q2           @weight 2 mult. for row 4L
    vrshl.s16     q12, q12, q0          @rounds off the weighted samples from row 1L

    vmul.s16      q10, q10, q1          @weight 1 mult. for row 4H
    vmla.s16      q10, q11, q2          @weight 2 mult. for row 4H
    vrshl.s16     q4, q4, q0            @rounds off the weighted samples from row 1H

    vrshl.s16     q14, q14, q0          @rounds off the weighted samples from row 2L
    vadd.s16      q12, q12, q3          @adding offset for row 1L
    vrshl.s16     q6, q6, q0            @rounds off the weighted samples from row 2H
    vadd.s16      q4, q4, q3            @adding offset for row 1H
    vrshl.s16     q13, q13, q0          @rounds off the weighted samples from row 3L
    vadd.s16      q14, q14, q3          @adding offset for row 2L
    vrshl.s16     q8, q8, q0            @rounds off the weighted samples from row 3H
    vadd.s16      q6, q6, q3            @adding offset for row 2H
    vrshl.s16     q15, q15, q0          @rounds off the weighted samples from row 4L
    vadd.s16      q13, q13, q3          @adding offset for row 3L
    vrshl.s16     q10, q10, q0          @rounds off the weighted samples from row 4H
    vadd.s16      q8, q8, q3            @adding offset for row 3H

    vqmovun.s16   d10, q12              @saturating row 1L to unsigned 8-bit (d10/d11 = q5, stored as row 1)
    vadd.s16      q15, q15, q3          @adding offset for row 4L
    vqmovun.s16   d11, q4               @saturating row 1H to unsigned 8-bit
    vadd.s16      q10, q10, q3          @adding offset for row 4H

    vqmovun.s16   d18, q14              @saturating row 2L to unsigned 8-bit (d18/d19 = q9)
    vqmovun.s16   d19, q6               @saturating row 2H to unsigned 8-bit
    vqmovun.s16   d14, q13              @saturating row 3L to unsigned 8-bit (d14/d15 = q7)
    vqmovun.s16   d15, q8               @saturating row 3H to unsigned 8-bit
    vst1.8        {q5}, [r2], r5        @store row 1 in destination
    vqmovun.s16   d22, q15              @saturating row 4L to unsigned 8-bit (d22/d23 = q11)
    vqmovun.s16   d23, q10              @saturating row 4H to unsigned 8-bit

    vst1.8        {q9}, [r2], r5        @store row 2 in destination
    subs          r11, r11, #4          @decrement ht by 4
    vst1.8        {q7}, [r2], r5        @store row 3 in destination
    vst1.8        {q11}, [r2], r5       @store row 4 in destination

    bgt           loop_8_uv             @if greater than 0 repeat the loop again

end_loops_uv:

    vpop          {d8-d15}              @restore the callee-saved NEON registers
    ldmfd         sp!, {r4-r12, r15}    @restore r4-r12 and return (saved lr is popped into pc)


