@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_horz_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded in neon assembly and can be assembled using
@*  rvct
@*
@* @author
@*  yogeswaran rs / akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_inter_pred_chroma_horz_w16out_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  chroma inter prediction filter to store the horizontal 16bit output
@*
@* @par description:
@*  applies a horizontal 4-tap filter with coefficients pointed to by
@*  'pi1_coeff' to the elements pointed to by 'pu1_src' and writes the result
@*  to the location pointed to by 'pi2_dst'. no downshifting or clipping is
@*  done, so the 16bit output can be used as an input for vertical filtering
@*  or weighted prediction
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_chroma_horz_w16out(uword8 *pu1_src,
@                                         word16 *pi2_dst,
@                                         word32 src_strd,
@                                         word32 dst_strd,
@                                         word8 *pi1_coeff,
@                                         word32 ht,
@                                         word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pi2_dst
@r2 =>  src_strd
@r3 =>  dst_strd
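@**************reference behaviour********************************************
@ a minimal c sketch of the arithmetic this routine implements, added purely
@ as documentation (the function name and the plain c types below are
@ illustrative, not part of the codebase). it assumes the usual hevc 4-tap
@ chroma filter layout: samples of one chroma component are interleaved
@ 2 bytes apart, the taps span src[-1..+2] in component units, and the
@ negative taps are realised in the assembly below through vabs on the
@ coefficients plus vmlsl:
@
@ static void ref_chroma_horz_w16out(const unsigned char *pu1_src,
@                                    short *pi2_dst,
@                                    int src_strd, int dst_strd,
@                                    const signed char *pi1_coeff,
@                                    int ht, int wd)
@ {
@     for(int row = 0; row < ht; row++)
@     {
@         for(int col = 0; col < 2 * wd; col++)
@         {
@             int sum = 0;
@             for(int tap = 0; tap < 4; tap++)          /* 4-tap filter */
@                 sum += pi1_coeff[tap] * pu1_src[col + (tap - 1) * 2];
@             pi2_dst[col] = (short)sum;                /* no shift, no clip */
@         }
@         pu1_src += src_strd;
@         pi2_dst += dst_strd;
@     }
@ }
@*****************************************************************************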

.text
.align 4

.globl ihevc_inter_pred_chroma_horz_w16out_a9q

.type ihevc_inter_pred_chroma_horz_w16out_a9q, %function

ihevc_inter_pred_chroma_horz_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}      @stack stores the values of the arguments

    ldr         r4,[sp,#40]             @loads pi1_coeff
    ldr         r6,[sp,#44]             @loads ht
    ldr         r10,[sp,#48]            @loads wd

    vld1.8      {d0},[r4]               @coeff = vld1_s8(pi1_coeff)
    subs        r14,r6,#0               @checks for ht == 0
    vabs.s8     d2,d0                   @coeffabs = vabs_s8(coeff)

    mov         r11,#2                  @src pointer increment (interleaved chroma, 2 bytes per sample)

    ble         end_loops

    vdup.8      d24,d2[0]               @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         r12,r0,#2               @pu1_src - 2
    vdup.8      d25,d2[1]               @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         r4,r12,r2               @pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8      d26,d2[2]               @coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst         r10,#3                  @checks whether wd is a multiple of 4
    mov         r5,r10,lsl #1           @2wd

    vdup.8      d27,d2[3]               @coeffabs_3 = vdup_lane_u8(coeffabs, 3)

    and         r7,r14,#1               @ht_residue = ht & 1
    sub         r14,r14,r7              @decrement height by ht_residue (the residue row is handled separately)

    bne         outer_loop_4            @taken when wd is 2 or 6 (not a multiple of 4)

    cmp         r10,#12
    beq         skip_16

    cmp         r10,#8
    bge         outer_loop_16

skip_16:
    tst         r6,#3

    sub         r9,r0,#2
    beq         outer_loop_ht_4         @taken when ht is a multiple of 4

    b           outer_loop_8

outer_loop_16:
    add         r4,r12,r2

    and         r0, r12, #31
    pld         [r12, r2, lsl #1]

    vld1.u32    {q0},[r12],r11          @vector load pu1_src
    mov         r10,r5                  @2wd
    mul         r14,r14,r10             @r14 = ht * 2wd, total number of outputs
    vld1.u32    {q1},[r12],r11          @vector load pu1_src
    pld         [r4, r2, lsl #1]
    mov         r9,#10                  @last load of a chunk advances src to the next 16 bytes (16 - 3*2)
    vld1.u32    {q2},[r12],r11          @vector load pu1_src
    rsb         r6,r3,#8
    sub         r8,r3,#8
    vld1.u32    {q3},[r12],r9           @vector load pu1_src

    vmull.u8    q15,d2,d25              @mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vld1.u32    {q4},[r4],r11           @vector load pu1_src
    vmlsl.u8    q15,d0,d24              @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vld1.u32    {q5},[r4],r11           @vector load pu1_src
    vmlal.u8    q15,d4,d26              @mul_res = vmlal_u8(src[0_2], coeffabs_2)@
    vld1.u32    {q6},[r4],r11           @vector load pu1_src
    vmlsl.u8    q15,d6,d27              @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@
    vld1.u32    {q7},[r4],r9            @vector load pu1_src
    vmull.u8    q14,d3,d25
    lsl         r6,#1                   @r6 = 16 - 2*dst_strd, steps back to row 0 of the next chunk
    rsb         r3,r5,r3,lsl #1         @r3 = 2*dst_strd - 2wd (in 16bit words)
    vmlsl.u8    q14,d1,d24
    lsl         r8,#1                   @r8 = 2*dst_strd - 16, steps down to row 1
    rsb         r7,r5,r2,lsl #1         @r7 = 2*src_strd - 2wd
    vmlal.u8    q14,d5,d26

    vmlsl.u8    q14,d7,d27
    cmp         r14,#32
    beq         epilog_end
    sub         r14,#64

inner_loop_16:

    pld         [r12, r2, lsl #2]
    pld         [r4, r2, lsl #2]

    subs        r10,r10,#16

    vmull.u8    q11,d10,d25             @mul_res = vmull_u8(src[1_1], coeffabs_1)@

    addeq       r12,r12,r7              @advance pu1_src to the next row pair
    addeq       r4,r12,r2

    vst1.16     {q15}, [r1]!
    vmlsl.u8    q11,d8,d24              @mul_res = vmlsl_u8(src[1_0], coeffabs_0)@

    vld1.u32    {q0},[r12],r11          @vector load pu1_src
    vmlal.u8    q11,d12,d26             @mul_res = vmlal_u8(src[1_2], coeffabs_2)@

    vld1.u32    {q1},[r12],r11          @vector load pu1_src
    vmlsl.u8    q11,d14,d27             @mul_res = vmlsl_u8(src[1_3], coeffabs_3)@

    vld1.u32    {q2},[r12],r11          @vector load pu1_src
    vmull.u8    q10,d11,d25             @mul_res = vmull_u8(src[1_1], coeffabs_1)@

    vst1.16     {q14}, [r1],r8
    vmlsl.u8    q10,d9,d24              @mul_res = vmlsl_u8(src[1_0], coeffabs_0)@

    vld1.u32    {q3},[r12],r9           @vector load pu1_src
    vmlal.u8    q10,d13,d26             @mul_res = vmlal_u8(src[1_2], coeffabs_2)@

    vld1.u32    {q4},[r4],r11           @vector load pu1_src
    vmlsl.u8    q10,d15,d27             @mul_res = vmlsl_u8(src[1_3], coeffabs_3)@

    vld1.u32    {q5},[r4],r11           @vector load pu1_src
    vmull.u8    q15,d2,d25              @mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vld1.u32    {q6},[r4],r11           @vector load pu1_src
    vmlsl.u8    q15,d0,d24              @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {q7},[r4],r9            @vector load pu1_src
    vmlal.u8    q15,d4,d26              @mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    vst1.16     {q11},[r1]!             @store the result pu1_dst
    vmlsl.u8    q15,d6,d27              @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    moveq       r10,r5                  @2wd
    vmull.u8    q14,d3,d25

    vmlsl.u8    q14,d1,d24
    vst1.16     {q10},[r1],r6           @store the result pu1_dst

    addeq       r1,r1,r3,lsl #1         @advance pu1_dst to the next row pair
    vmlal.u8    q14,d5,d26

    subs        r14,r14,#32             @decrement the ht loop
    vmlsl.u8    q14,d7,d27

    bgt         inner_loop_16

    add         r14,r14,#64
    cmp         r14,#32
    beq         epilog_end

epilog:

    vst1.16     {q15}, [r1]!
    vmull.u8    q11,d10,d25             @mul_res = vmull_u8(src[1_1], coeffabs_1)@
    vst1.16     {q14}, [r1],r8

    vmlsl.u8    q11,d8,d24              @mul_res = vmlsl_u8(src[1_0], coeffabs_0)@
    subs        r10,r10,#16             @decrement the wd loop
    vmlal.u8    q11,d12,d26             @mul_res = vmlal_u8(src[1_2], coeffabs_2)@
    addeq       r12,r12,r7              @advance pu1_src to the next row pair
    vmlsl.u8    q11,d14,d27             @mul_res = vmlsl_u8(src[1_3], coeffabs_3)@
    moveq       r10,r5                  @2wd
    addeq       r4,r12,r2
    vmull.u8    q10,d11,d25             @mul_res = vmull_u8(src[1_1], coeffabs_1)@
    vld1.u32    {q0},[r12],r11          @vector load pu1_src
    vmlsl.u8    q10,d9,d24              @mul_res = vmlsl_u8(src[1_0], coeffabs_0)@
    vld1.u32    {q1},[r12],r11          @vector load pu1_src
    vmlal.u8    q10,d13,d26             @mul_res = vmlal_u8(src[1_2], coeffabs_2)@
    vld1.u32    {q2},[r12],r11          @vector load pu1_src
    vmlsl.u8    q10,d15,d27             @mul_res = vmlsl_u8(src[1_3], coeffabs_3)@
    vld1.u32    {q3},[r12],r9           @vector load pu1_src
    vmull.u8    q15,d2,d25              @mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vld1.u32    {q4},[r4],r11           @vector load pu1_src
    vmlsl.u8    q15,d0,d24              @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vld1.u32    {q5},[r4],r11           @vector load pu1_src
    vmlal.u8    q15,d4,d26              @mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    vmlsl.u8    q15,d6,d27              @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vld1.u32    {q6},[r4],r11           @vector load pu1_src
    vmull.u8    q14,d3,d25
    vld1.u32    {q7},[r4],r9            @vector load pu1_src
    vmlsl.u8    q14,d1,d24
    vst1.16     {q11},[r1]!             @store the result pu1_dst
    vmlal.u8    q14,d5,d26
    vst1.16     {q10},[r1],r6           @store the result pu1_dst
    vmlsl.u8    q14,d7,d27
    addeq       r1,r1,r3,lsl #1         @advance pu1_dst to the next row pair

epilog_end:

    vmull.u8    q11,d10,d25             @mul_res = vmull_u8(src[1_1], coeffabs_1)@
    vmlsl.u8    q11,d8,d24              @mul_res = vmlsl_u8(src[1_0], coeffabs_0)@
    vmlal.u8    q11,d12,d26             @mul_res = vmlal_u8(src[1_2], coeffabs_2)@
    vmlsl.u8    q11,d14,d27             @mul_res = vmlsl_u8(src[1_3], coeffabs_3)@

    vmull.u8    q10,d11,d25             @mul_res = vmull_u8(src[1_1], coeffabs_1)@
    vmlsl.u8    q10,d9,d24              @mul_res = vmlsl_u8(src[1_0], coeffabs_0)@
    vmlal.u8    q10,d13,d26             @mul_res = vmlal_u8(src[1_2], coeffabs_2)@
    vmlsl.u8    q10,d15,d27             @mul_res = vmlsl_u8(src[1_3], coeffabs_3)@

    vst1.16     {q15}, [r1]!
    vst1.16     {q14}, [r1],r8
    vst1.16     {q11},[r1]!             @store the result pu1_dst
    vst1.16     {q10},[r1],r6           @store the result pu1_dst

    ldr         r6,[sp,#44]             @loads ht

    and         r7,r6,#1                @ht_residue = ht & 1

    cmp         r7,#0
    mov         r10,r5
    addne       r12,r12,r2,lsl #1
    subne       r12,r12,r5
    addne       r1,r1,r3,lsl #1

    bgt         loop_residue_4

    b           end_loops

outer_loop_8:

    add         r6,r1,r3,lsl #1         @pu1_dst + dst_strd
    mov         r10,r5                  @2wd
    add         r4,r12,r2               @pu1_src + src_strd

inner_loop_8:
    vld1.u32    {d0},[r12],r11          @vector load pu1_src
    vld1.u32    {d1},[r12],r11          @vector load pu1_src
    vld1.u32    {d2},[r12],r11          @vector load pu1_src
    vld1.u32    {d3},[r12],r11          @vector load pu1_src

    vmull.u8    q4,d1,d25               @mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vmlsl.u8    q4,d0,d24               @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vmlal.u8    q4,d2,d26               @mul_res = vmlal_u8(src[0_2], coeffabs_2)@
    vmlsl.u8    q4,d3,d27               @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vld1.u32    {d4},[r4],r11           @vector load pu1_src
    vld1.u32    {d5},[r4],r11           @vector load pu1_src
    vld1.u32    {d6},[r4],r11           @vector load pu1_src
    vld1.u32    {d7},[r4],r11           @vector load pu1_src

    vmull.u8    q5,d5,d25               @mul_res = vmull_u8(src[1_1], coeffabs_1)@
    vmlsl.u8    q5,d4,d24               @mul_res = vmlsl_u8(src[1_0], coeffabs_0)@
    vmlal.u8    q5,d6,d26               @mul_res = vmlal_u8(src[1_2], coeffabs_2)@
    vmlsl.u8    q5,d7,d27               @mul_res = vmlsl_u8(src[1_3], coeffabs_3)@

    vst1.16     {d8, d9}, [r1]!

    subs        r10,r10,#8              @decrement the wd loop
    vst1.16     {d10, d11},[r6]!        @store the result pu1_dst
    bgt         inner_loop_8

    sub         r12,r12,r5              @restore pu1_src to the start of the row pair
    subs        r14,r14,#2              @decrement the ht loop
    sub         r1,r1,r5,lsl #1
    add         r12,r12,r2,lsl #1
    add         r1,r1,r3,lsl #2
    bgt         outer_loop_8

    cmp         r7,#0
    mov         r10,r5
    bgt         loop_residue_4

    b           end_loops

@handles ht that is a multiple of 4
outer_loop_ht_4:

    mov         r10,r5

prologue_ht_4:
    mov         r8,r3,lsl #1            @2*dst_strd in bytes (row step for the 16bit dst)

inner_loop_ht_4:

    mov         r12,r9
    mov         r4,r1

    sub         r0, r2, #6              @ not sure if r0 needs to be preserved

    vld1.u32    {d0},[r12],r11          @(1)vector load pu1_src
    vld1.u32    {d1},[r12],r11          @(1)vector load pu1_src
    vld1.u32    {d2},[r12],r11          @(1)vector load pu1_src
    vld1.u32    {d3},[r12],r0           @(1)vector load pu1_src

    vld1.u32    {d4},[r12],r11          @(2)vector load pu1_src
    vld1.u32    {d5},[r12],r11          @(2)vector load pu1_src
    vld1.u32    {d6},[r12],r11          @(2)vector load pu1_src
    vld1.u32    {d7},[r12],r0           @(2)vector load pu1_src

    vld1.u32    {d14},[r12],r11         @(3)vector load pu1_src
    vmull.u8    q4,d1,d25               @(1)mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vld1.u32    {d15},[r12],r11         @(3)vector load pu1_src
    vmlsl.u8    q4,d0,d24               @(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d16},[r12],r11         @(3)vector load pu1_src
    vmlal.u8    q4,d2,d26               @(1)mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    vld1.u32    {d17},[r12],r0          @(3)vector load pu1_src
    vmlsl.u8    q4,d3,d27               @(1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vld1.u32    {d18},[r12],r11         @(4)vector load pu1_src
    vmull.u8    q5,d5,d25               @(2)mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vld1.u32    {d19},[r12],r11         @(4)vector load pu1_src
    vmlsl.u8    q5,d4,d24               @(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d20},[r12],r11         @(4)vector load pu1_src
    vmlal.u8    q5,d6,d26               @(2)mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    vld1.u32    {d21},[r12],r2          @(4)vector load pu1_src
    vmlsl.u8    q5,d7,d27               @(2)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    add         r9,r9,#8                @(core loop)

    subs        r10,r10,#8              @(prologue)decrement the wd loop
    beq         epilogue

core_loop:
    vst1.16     {d8, d9},[r4],r8        @(1)store the result pu1_dst
    mov         r12,r9

    vld1.u32    {d0},[r12],r11          @(1_1)vector load pu1_src
    vmull.u8    q6,d15,d25              @(3)mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vld1.u32    {d1},[r12],r11          @(1_1)vector load pu1_src
    vmlsl.u8    q6,d14,d24              @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d2},[r12],r11          @(1_1)vector load pu1_src
    vmlal.u8    q6,d16,d26              @(3)mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    vld1.u32    {d3},[r12],r0           @(1_1)vector load pu1_src
    vmlsl.u8    q6,d17,d27              @(3)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vst1.16     {d10, d11},[r4],r8      @(2)store the result pu1_dst
    add         r9,r9,#8                @(core loop)

    vld1.u32    {d4},[r12],r11          @(2_1)vector load pu1_src
    vmull.u8    q11,d19,d25             @(4)mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vld1.u32    {d5},[r12],r11          @(2_1)vector load pu1_src
    vmlsl.u8    q11,d18,d24             @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d6},[r12],r11          @(2_1)vector load pu1_src
    vmlal.u8    q11,d20,d26             @(4)mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    vld1.u32    {d7},[r12],r0           @(2_1)vector load pu1_src
    vmlsl.u8    q11,d21,d27             @(4)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vst1.16     {d12, d13},[r4],r8      @(3)store the result pu1_dst
    add         r1,r1,#16               @(core loop)

    vld1.u32    {d14},[r12],r11         @(3_1)vector load pu1_src
    vmull.u8    q4,d1,d25               @(1_1)mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vld1.u32    {d15},[r12],r11         @(3_1)vector load pu1_src
    vmlsl.u8    q4,d0,d24               @(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d16},[r12],r11         @(3_1)vector load pu1_src
    vmlal.u8    q4,d2,d26               @(1_1)mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    vld1.u32    {d17},[r12],r0          @(3_1)vector load pu1_src
    vmlsl.u8    q4,d3,d27               @(1_1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vst1.16     {d22, d23}, [r4], r8    @(4)store the result pu1_dst
    subs        r10,r10,#8              @(core loop)

    vmull.u8    q5,d5,d25               @(2_1)mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vld1.u32    {d18},[r12],r11         @(4_1)vector load pu1_src

    vld1.u32    {d19},[r12],r11         @(4_1)vector load pu1_src
    vmlsl.u8    q5,d4,d24               @(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d20},[r12],r11         @(4_1)vector load pu1_src
    vmlal.u8    q5,d6,d26               @(2_1)mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    mov         r4, r1                  @(core loop)

    vld1.u32    {d21},[r12],r0          @(4_1)vector load pu1_src
    vmlsl.u8    q5,d7,d27               @(2_1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    bgt         core_loop               @loopback

epilogue:
    vmull.u8    q6,d15,d25              @(3)mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vmlsl.u8    q6,d14,d24              @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vmlal.u8    q6,d16,d26              @(3)mul_res = vmlal_u8(src[0_2], coeffabs_2)@
    vmlsl.u8    q6,d17,d27              @(3)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vst1.16     {d8, d9},[r4], r8       @(1)store the result pu1_dst

    vmull.u8    q11,d19,d25             @(4)mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vmlsl.u8    q11,d18,d24             @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vmlal.u8    q11,d20,d26             @(4)mul_res = vmlal_u8(src[0_2], coeffabs_2)@
    vmlsl.u8    q11,d21,d27             @(4)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vst1.16     {d10, d11},[r4], r8     @(2)store the result pu1_dst
    vst1.16     {d12, d13},[r4], r8     @(3)store the result pu1_dst

    add         r1,r1,#16               @(core loop)

    vst1.16     {d22, d23},[r4], r8     @(4)store the result pu1_dst

    sub         r9,r9,r5                @restore pu1_src to the start of the row block
    subs        r14,r14,#4              @decrement the ht loop
    sub         r1,r1,r5,lsl #1
    add         r9,r9,r2,lsl #2
    add         r1,r1,r3,lsl #3
    bgt         outer_loop_ht_4

    cmp         r7,#0
    mov         r10,r5
    movgt       r12,r9
    movgt       r4,r1
    bgt         loop_residue_4

    b           end_loops

outer_loop_4:
    add         r6,r1,r3,lsl #1         @pu1_dst + dst_strd
    mov         r10,r5
    add         r4,r12,r2               @pu1_src + src_strd

inner_loop_4:
    vld1.u32    {d0},[r12],r11          @vector load pu1_src
    vld1.u32    {d1},[r12],r11          @vector load pu1_src
    vld1.u32    {d2},[r12],r11          @vector load pu1_src
    vld1.u32    {d3},[r12]              @vector load pu1_src

    sub         r12,r12,#2              @net src advance of 4 for this iteration's 4 outputs

    vld1.u32    {d4},[r4],r11           @vector load pu1_src
    vld1.u32    {d5},[r4],r11           @vector load pu1_src
    vld1.u32    {d6},[r4],r11           @vector load pu1_src
    vld1.u32    {d7},[r4]               @vector load pu1_src

    sub         r4,r4,#2
    vzip.32     d0,d4                   @vector zip the i and ii row iterations into a single register
    vzip.32     d1,d5
    vzip.32     d2,d6
    vzip.32     d3,d7

    vmull.u8    q4,d1,d25               @arithmetic for both row iterations at the same time
    vmlsl.u8    q4,d0,d24
    vmlal.u8    q4,d2,d26
    vmlsl.u8    q4,d3,d27

    vst1.32     {d8},[r1]!              @store the i row result (lower half of the register)
    subs        r10,r10,#4              @decrement the wd by 4

    vst1.32     {d9},[r6]!              @store the ii row result (upper half of the register)

    bgt         inner_loop_4

    sub         r12,r12,r5
    subs        r14,r14,#2              @decrement the ht by 2
    sub         r1,r1,r5,lsl #1
    add         r12,r12,r2,lsl #1
    add         r1,r1,r3,lsl #2
    bgt         outer_loop_4

    cmp         r7,#0
    mov         r10,r5
    beq         end_loops

loop_residue_4:

    mov         r10,r5                  @2wd

loop_residue:

    vld1.u32    {d0},[r12],r11          @vector load pu1_src
    vld1.u32    {d1},[r12],r11          @vector load pu1_src
    vld1.u32    {d2},[r12],r11          @vector load pu1_src
    vld1.u32    {d3},[r12]              @vector load pu1_src

    sub         r12, r12, #2            @net src advance of 4 for this iteration's 4 outputs

    vmull.u8    q4,d1,d25               @mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vmlsl.u8    q4,d0,d24               @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vmlal.u8    q4,d2,d26               @mul_res = vmlal_u8(src[0_2], coeffabs_2)@
    vmlsl.u8    q4,d3,d27               @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vst1.64     {d8},[r1]               @store the result pu1_dst
    subs        r10,r10,#4              @decrement the wd loop
    add         r1,r1,#8                @pi2_dst + 8

    bgt         loop_residue            @loop again

end_loops:

    ldmfd       sp!,{r4-r12,r15}        @restore the registers from sp and return (pop into pc)