@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_luma_bilinear_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction interpolation.
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_luma_bilinear_a9q()
@*
@* @remarks
@*  Syntax: GNU as, ARM (AArch32) with NEON; '@' starts a comment.
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@**
@*******************************************************************************
@* function: ih264_inter_pred_luma_bilinear
@*
@* @brief
@*  This routine applies the bilinear filter to the predictors.
@*  The filtering operation is described in
@*  sec 8.4.2.2.1 titled "Luma sample interpolation process"
@*
@* @par Description:
@*  This function is called to obtain pixels lying at the following
@*  locations: (1/4,1), (3/4,1), (1,1/4), (1,3/4), (1/4,1/2), (3/4,1/2),
@*  (1/2,1/4), (1/2,3/4), (3/4,1/4), (1/4,3/4), (3/4,3/4) and (1/4,1/4).
@*  Every output pixel is the rounded average of the co-located pixels of
@*  the two input arrays:
@*      dst[y][x] = (src1[y][x] + src2[y][x] + 1) >> 1
@*  (vaddl.u8 widens and adds, vqrshrun.s16 #1 rounds, halves and narrows).
@*
@* @param[in] pu1_src1
@*  UWORD8 pointer to the buffer containing the first input array.
@*
@* @param[in] pu1_src2
@*  UWORD8 pointer to the buffer containing the second input array.
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination where the output of the bilinear
@*  filter is stored.
@*
@* @param[in] src_strd1
@*  Stride of the first input buffer.
@*
@* @param[in] src_strd2
@*  Stride of the second input buffer.
@*
@* @param[in] dst_strd
@*  Integer destination stride of pu1_dst.
@*
@* @param[in] ht
@*  Integer height of the array (passed as 'height' in the prototype).
@*
@* @param[in] wd
@*  Integer width of the array (passed as 'width' in the prototype).
@*
@* @returns
@*  None. The result is written through pu1_dst.
@*
@* @remarks
@*  The code is fully unrolled and only distinguishes these cases:
@*    wd == 4       : 4 rows when ht == 4, otherwise 8 rows
@*    wd == 8       : 4 rows when ht == 4, 8 rows when ht == 8,
@*                    otherwise 16 rows
@*    any other wd  : treated as 16; 8 rows when ht == 8, otherwise 16 rows
@*  No source row beyond the processed rows is read.
@*
@*******************************************************************************
@*

@void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
@                                    UWORD8 *pu1_src2,
@                                    UWORD8 *pu1_dst,
@                                    WORD32 src_strd1,
@                                    WORD32 src_strd2,
@                                    WORD32 dst_strd,
@                                    WORD32 height,
@                                    WORD32 width)
@
@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src1   (post-incremented by src_strd1 after every row load)
@   r1 => *pu1_src2   (post-incremented by src_strd2 after every row load)
@   r2 => *pu1_dst    (post-incremented by dst_strd after every row store)
@   r3 => src_strd1
@   r4 => src_strd2   (loaded from the stack)
@   r5 => dst_strd    (loaded from the stack)
@   r6 => height      (loaded from the stack)
@   r7 => width       (loaded from the stack)
@
@   q0-q7   => source rows (q0,q1,q4,q5 from src1; q2,q3,q6,q7 from src2)
@   q8-q13  => 16-bit sums src1 + src2
@   q14,q15 => narrowed 8-bit results waiting to be stored
@
.text
.p2align 2

    .global ih264_inter_pred_luma_bilinear_a9q

ih264_inter_pred_luma_bilinear_a9q:

    stmfd         sp!, {r4-r12, r14}    @ save 10 core registers (40 bytes)
    vstmdb        sp!, {d8-d15}         @ save d8-d15 (64 bytes); q4-q7 are used below
    @ The stack-passed arguments now start at sp + 40 + 64 = sp + 104.
    ldr           r4, [sp, #104]        @ r4 = src_strd2
    ldr           r5, [sp, #108]        @ r5 = dst_strd
    ldr           r6, [sp, #112]        @ r6 = ht
    ldr           r7, [sp, #116]        @ r7 = wd

    cmp           r7, #4                @ wd == 4 ?
    beq           loop_4
    cmp           r7, #8                @ wd == 8 ?
    beq           loop_8

loop_16:                                @ wd == 16 (any wd other than 4 or 8)

    vld1.8        {q0}, [r0], r3        @ load row0  ;src1
    vld1.8        {q2}, [r1], r4        @ load row0  ;src2
    vld1.8        {q1}, [r0], r3        @ load row1  ;src1
    vaddl.u8      q10, d0, d4
    vld1.8        {q3}, [r1], r4        @ load row1  ;src2
    vaddl.u8      q11, d1, d5
    vld1.8        {q4}, [r0], r3        @ load row2  ;src1
    vaddl.u8      q12, d2, d6
    vld1.8        {q5}, [r0], r3        @ load row3  ;src1
    vaddl.u8      q13, d3, d7
    vld1.8        {q6}, [r1], r4        @ load row2  ;src2
    vaddl.u8      q8, d8, d12
    vld1.8        {q7}, [r1], r4        @ load row3  ;src2
    vaddl.u8      q9, d9, d13
    vqrshrun.s16  d28, q10, #1
    vqrshrun.s16  d29, q11, #1
    vaddl.u8      q10, d10, d14
    vqrshrun.s16  d30, q12, #1
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q14}, [r2], r5       @ store dest row0
    vaddl.u8      q11, d11, d15
    vst1.8        {q15}, [r2], r5       @ store dest row1
    vqrshrun.s16  d28, q8, #1
    vld1.8        {q0}, [r0], r3        @ load row4  ;src1
    vqrshrun.s16  d29, q9, #1
    vld1.8        {q1}, [r0], r3        @ load row5  ;src1
    vqrshrun.s16  d30, q10, #1
    vld1.8        {q2}, [r1], r4        @ load row4  ;src2
    vqrshrun.s16  d31, q11, #1
    vld1.8        {q3}, [r1], r4        @ load row5  ;src2
    vaddl.u8      q10, d0, d4
    vst1.8        {q14}, [r2], r5       @ store dest row2
    vaddl.u8      q13, d3, d7
    vst1.8        {q15}, [r2], r5       @ store dest row3
    vaddl.u8      q11, d1, d5
    vld1.8        {q4}, [r0], r3        @ load row6  ;src1
    vaddl.u8      q12, d2, d6
    vld1.8        {q5}, [r0], r3        @ load row7  ;src1
    vqrshrun.s16  d28, q10, #1
    vld1.8        {q6}, [r1], r4        @ load row6  ;src2
    vqrshrun.s16  d29, q11, #1
    vld1.8        {q7}, [r1], r4        @ load row7  ;src2
    vaddl.u8      q8, d8, d12
    vaddl.u8      q9, d9, d13
    vaddl.u8      q10, d10, d14
    vqrshrun.s16  d30, q12, #1
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q14}, [r2], r5       @ store dest row4
    vaddl.u8      q11, d11, d15
    vst1.8        {q15}, [r2], r5       @ store dest row5
    vqrshrun.s16  d28, q8, #1
    vqrshrun.s16  d30, q10, #1
    vqrshrun.s16  d29, q9, #1
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q14}, [r2], r5       @ store dest row6
    cmp           r6, #8                @ NEON stores do not touch the flags
    vst1.8        {q15}, [r2], r5       @ store dest row7

    beq           end_func              @ end function if ht == 8

    @ Row 8 of src2 is fetched only once it is known that ht != 8, so a
    @ 16x8 block never reads past its last source row.
    vld1.8        {q2}, [r1], r4        @ load row8  ;src2
    vld1.8        {q0}, [r0], r3        @ load row8  ;src1
    vaddl.u8      q10, d0, d4
    vld1.8        {q1}, [r0], r3        @ load row9  ;src1
    vaddl.u8      q11, d1, d5
    vld1.8        {q3}, [r1], r4        @ load row9  ;src2
    vqrshrun.s16  d28, q10, #1
    vld1.8        {q4}, [r0], r3        @ load row10 ;src1
    vqrshrun.s16  d29, q11, #1
    vld1.8        {q5}, [r0], r3        @ load row11 ;src1
    vaddl.u8      q12, d2, d6
    vld1.8        {q6}, [r1], r4        @ load row10 ;src2
    vaddl.u8      q13, d3, d7
    vld1.8        {q7}, [r1], r4        @ load row11 ;src2
    vaddl.u8      q8, d8, d12
    vaddl.u8      q9, d9, d13
    vaddl.u8      q10, d10, d14
    vqrshrun.s16  d30, q12, #1
    vst1.8        {q14}, [r2], r5       @ store dest row8
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q15}, [r2], r5       @ store dest row9
    vqrshrun.s16  d28, q8, #1
    vld1.8        {q0}, [r0], r3        @ load row12 ;src1
    vaddl.u8      q11, d11, d15
    vld1.8        {q1}, [r0], r3        @ load row13 ;src1
    vqrshrun.s16  d29, q9, #1
    vld1.8        {q2}, [r1], r4        @ load row12 ;src2
    vqrshrun.s16  d30, q10, #1
    vld1.8        {q3}, [r1], r4        @ load row13 ;src2
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q14}, [r2], r5       @ store dest row10
    vaddl.u8      q10, d0, d4
    vst1.8        {q15}, [r2], r5       @ store dest row11
    vaddl.u8      q11, d1, d5
    vld1.8        {q4}, [r0], r3        @ load row14 ;src1
    vaddl.u8      q13, d3, d7
    vld1.8        {q5}, [r0], r3        @ load row15 ;src1
    vaddl.u8      q12, d2, d6
    vld1.8        {q6}, [r1], r4        @ load row14 ;src2
    vaddl.u8      q8, d8, d12
    vld1.8        {q7}, [r1], r4        @ load row15 ;src2
    vaddl.u8      q9, d9, d13
    vqrshrun.s16  d28, q10, #1
    vqrshrun.s16  d29, q11, #1
    vaddl.u8      q10, d10, d14
    vst1.8        {q14}, [r2], r5       @ store dest row12
    vqrshrun.s16  d30, q12, #1
    vqrshrun.s16  d31, q13, #1
    vaddl.u8      q11, d11, d15
    vst1.8        {q15}, [r2], r5       @ store dest row13
    vqrshrun.s16  d28, q8, #1
    vqrshrun.s16  d29, q9, #1
    vqrshrun.s16  d30, q10, #1
    vst1.8        {q14}, [r2], r5       @ store dest row14
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q15}, [r2], r5       @ store dest row15
    b             end_func



loop_8:                                 @ wd == 8; one d register per row
    vld1.8        {d0}, [r0], r3        @ load row0  ;src1
    vld1.8        {d4}, [r1], r4        @ load row0  ;src2
    vld1.8        {d1}, [r0], r3        @ load row1  ;src1
    vaddl.u8      q10, d0, d4
    vld1.8        {d5}, [r1], r4        @ load row1  ;src2
    vld1.8        {d2}, [r0], r3        @ load row2  ;src1
    vqrshrun.s16  d28, q10, #1
    vld1.8        {d6}, [r1], r4        @ load row2  ;src2
    vaddl.u8      q11, d1, d5
    vld1.8        {d3}, [r0], r3        @ load row3  ;src1
    vaddl.u8      q12, d2, d6
    vst1.8        {d28}, [r2], r5       @ store dest row0
    vqrshrun.s16  d29, q11, #1
    vld1.8        {d7}, [r1], r4        @ load row3  ;src2
    vqrshrun.s16  d30, q12, #1
    vst1.8        {d29}, [r2], r5       @ store dest row1
    vaddl.u8      q13, d3, d7
    vst1.8        {d30}, [r2], r5       @ store dest row2
    vqrshrun.s16  d31, q13, #1
    cmp           r6, #4
    vst1.8        {d31}, [r2], r5       @ store dest row3
    beq           end_func              @ end function if ht == 4

    vld1.8        {d12}, [r1], r4       @ load row4  ;src2
    vld1.8        {d8}, [r0], r3        @ load row4  ;src1
    vld1.8        {d9}, [r0], r3        @ load row5  ;src1
    vaddl.u8      q8, d8, d12
    vld1.8        {d13}, [r1], r4       @ load row5  ;src2
    vld1.8        {d10}, [r0], r3       @ load row6  ;src1
    vaddl.u8      q9, d9, d13
    vld1.8        {d14}, [r1], r4       @ load row6  ;src2
    vqrshrun.s16  d28, q8, #1
    vld1.8        {d11}, [r0], r3       @ load row7  ;src1
    vqrshrun.s16  d29, q9, #1
    vst1.8        {d28}, [r2], r5       @ store dest row4
    vaddl.u8      q10, d10, d14
    vst1.8        {d29}, [r2], r5       @ store dest row5
    vqrshrun.s16  d30, q10, #1
    vld1.8        {d15}, [r1], r4       @ load row7  ;src2
    vaddl.u8      q11, d11, d15
    vst1.8        {d30}, [r2], r5       @ store dest row6
    vqrshrun.s16  d31, q11, #1
    cmp           r6, #8
    vst1.8        {d31}, [r2], r5       @ store dest row7
    beq           end_func              @ end function if ht == 8

    vld1.8        {d0}, [r0], r3        @ load row8  ;src1
    vld1.8        {d4}, [r1], r4        @ load row8  ;src2
    vld1.8        {d1}, [r0], r3        @ load row9  ;src1
    vaddl.u8      q10, d0, d4
    vld1.8        {d5}, [r1], r4        @ load row9  ;src2
    vld1.8        {d2}, [r0], r3        @ load row10 ;src1
    vaddl.u8      q11, d1, d5
    vld1.8        {d6}, [r1], r4        @ load row10 ;src2
    vqrshrun.s16  d28, q10, #1
    vld1.8        {d3}, [r0], r3        @ load row11 ;src1
    vaddl.u8      q12, d2, d6
    vld1.8        {d7}, [r1], r4        @ load row11 ;src2
    vqrshrun.s16  d29, q11, #1
    vld1.8        {d8}, [r0], r3        @ load row12 ;src1
    vaddl.u8      q13, d3, d7
    vst1.8        {d28}, [r2], r5       @ store dest row8
    vqrshrun.s16  d30, q12, #1
    vld1.8        {d12}, [r1], r4       @ load row12 ;src2
    vqrshrun.s16  d31, q13, #1
    vst1.8        {d29}, [r2], r5       @ store dest row9
    vaddl.u8      q8, d8, d12
    vld1.8        {d9}, [r0], r3        @ load row13 ;src1
    vqrshrun.s16  d28, q8, #1
    vld1.8        {d13}, [r1], r4       @ load row13 ;src2
    vld1.8        {d10}, [r0], r3       @ load row14 ;src1
    vaddl.u8      q9, d9, d13
    vld1.8        {d11}, [r0], r3       @ load row15 ;src1
    vld1.8        {d14}, [r1], r4       @ load row14 ;src2
    vqrshrun.s16  d29, q9, #1
    vld1.8        {d15}, [r1], r4       @ load row15 ;src2
    vaddl.u8      q10, d10, d14
    vst1.8        {d30}, [r2], r5       @ store dest row10
    vaddl.u8      q11, d11, d15
    vst1.8        {d31}, [r2], r5       @ store dest row11
    vqrshrun.s16  d30, q10, #1
    vst1.8        {d28}, [r2], r5       @ store dest row12
    vqrshrun.s16  d31, q11, #1
    vst1.8        {d29}, [r2], r5       @ store dest row13
    vst1.8        {d30}, [r2], r5       @ store dest row14
    vst1.8        {d31}, [r2], r5       @ store dest row15

    b             end_func



loop_4:                                 @ wd == 4; only 32-bit lane 0 is loaded and stored
    vld1.32       d0[0], [r0], r3       @ load row0  ;src1
    vld1.32       d4[0], [r1], r4       @ load row0  ;src2
    vld1.32       d1[0], [r0], r3       @ load row1  ;src1
    vaddl.u8      q10, d0, d4
    vld1.32       d5[0], [r1], r4       @ load row1  ;src2
    vld1.32       d2[0], [r0], r3       @ load row2  ;src1
    vqrshrun.s16  d28, q10, #1
    vld1.32       d6[0], [r1], r4       @ load row2  ;src2
    vaddl.u8      q11, d1, d5
    vld1.32       d3[0], [r0], r3       @ load row3  ;src1
    vaddl.u8      q12, d2, d6
    vst1.32       d28[0], [r2], r5      @ store dest row0
    vqrshrun.s16  d29, q11, #1
    vld1.32       d7[0], [r1], r4       @ load row3  ;src2
    vqrshrun.s16  d30, q12, #1
    vst1.32       d29[0], [r2], r5      @ store dest row1
    vaddl.u8      q13, d3, d7
    vst1.32       d30[0], [r2], r5      @ store dest row2
    vqrshrun.s16  d31, q13, #1
    cmp           r6, #4
    vst1.32       d31[0], [r2], r5      @ store dest row3
    beq           end_func              @ end function if ht == 4

    vld1.32       d12[0], [r1], r4      @ load row4  ;src2
    vld1.32       d8[0], [r0], r3       @ load row4  ;src1
    vld1.32       d9[0], [r0], r3       @ load row5  ;src1
    vaddl.u8      q8, d8, d12
    vld1.32       d13[0], [r1], r4      @ load row5  ;src2
    vld1.32       d10[0], [r0], r3      @ load row6  ;src1
    vaddl.u8      q9, d9, d13
    vld1.32       d14[0], [r1], r4      @ load row6  ;src2
    vqrshrun.s16  d28, q8, #1
    vld1.32       d11[0], [r0], r3      @ load row7  ;src1
    vqrshrun.s16  d29, q9, #1
    vst1.32       d28[0], [r2], r5      @ store dest row4
    vaddl.u8      q10, d10, d14
    vst1.32       d29[0], [r2], r5      @ store dest row5
    vqrshrun.s16  d30, q10, #1
    vld1.32       d15[0], [r1], r4      @ load row7  ;src2
    vaddl.u8      q11, d11, d15
    vst1.32       d30[0], [r2], r5      @ store dest row6
    vqrshrun.s16  d31, q11, #1
    vst1.32       d31[0], [r2], r5      @ store dest row7

end_func:

    vldmia        sp!, {d8-d15}         @ restore the NEON registers saved on entry
    ldmfd         sp!, {r4-r12, pc}     @ restore core registers and return (saved lr -> pc)