///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_vert_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded using neon intrinsics and can be compiled using
//*  rvct
//*
//* //author
//*  yogeswaran rs
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*  chroma interprediction filter for vertical input
//*
//* //par description:
//*  applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
//*  the elements pointed by 'pu1_src' and writes to the location pointed by
//*  'pu1_dst'. the output is down shifted by 6 and clipped to 8 bits
//*  assumptions : the function is optimized considering the fact width is
//*  multiple of 2,4 or 8.
//*  and also considering height should be multiple of 2
//*  width 4,8 is optimized further
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_vert(uword8 *pu1_src,
//                                  uword8 *pu1_dst,
//                                  word32 src_strd,
//                                  word32 dst_strd,
//                                  word8 *pi1_coeff,
//                                  word32 ht,
//                                  word32 wd)
//
//**************variables vs registers*****************************************
//x0 => *pu1_src
//x1 => *pu1_dst
//x2 =>  src_strd   (32-bit int, sign-extended on entry)
//x3 =>  dst_strd   (32-bit int, sign-extended on entry)
//x4 => *pi1_coeff
//x5 =>  ht         (32-bit int, sign-extended on entry)
//x6 =>  wd         (32-bit int, sign-extended on entry)
//
// Filter applied per output byte (c0..c3 are the absolute values of the four
// coefficients, r0..r3 are four consecutive source rows starting one row
// above pu1_src):
//      out = sat_u8( round( (-c0*r0 + c1*r1 + c2*r2 - c3*r3) >> 6 ) )
//
// Internal register roles after setup:
//      x4  = rows remaining (2-row paths) / ht (8-row path setup)
//      x10 = 2*wd, the number of bytes processed per row
//      x8, x9 = pointer adjustments applied when a band of rows is finished
//      v0..v3 = c0..c3 replicated across 8 byte lanes
// x20 is used as scratch, hence the x19/x20 save/restore pair.

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_vert_av8

.type ihevc_inter_pred_chroma_vert_av8, %function

ihevc_inter_pred_chroma_vert_av8:

    // stmfd sp!,{x4-x12,x14}                //stack stores the values of the arguments

    stp         x19, x20,[sp,#-16]!          //x20 is clobbered below (callee-saved)

    // The four int arguments arrive in the low 32 bits of their registers;
    // the upper halves are unspecified by the procedure call standard, so
    // widen them before any 64-bit arithmetic or addressing uses them.
    sxtw        x2,w2                        //src_strd
    sxtw        x3,w3                        //dst_strd

    mov         x15,x4                       //pi1_coeff
    sxtw        x16,w5                       //ht
    sxtw        x17,w6                       //wd

    mov         x4,x16                       //x4 = ht
    mov         x12,x15                      //x12 = pi1_coeff
    cmp         x4,#0                        //ht <= 0 ?
    mov         x6,x17                       //x6 = wd
    sub         x0,x0,x2                     //pu1_src - src_strd (first filter tap row)
    // Only coefficient bytes 0..3 are consumed below, so load exactly four
    // bytes instead of eight to avoid reading past the 4-tap coefficient set.
    ld1         {v0.s}[0],[x12]              //loads the 4 filter coefficients

    ble         end_loops                    //nothing to do for ht <= 0

    tst         x6,#3                        //wd & 3
    abs         v3.8b, v0.8b                 //vabs_s8(coeff)
    lsl         x10,x6,#1                    //2*wd
    dup         v0.8b, v3.b[0]               //coeffabs_0
    dup         v1.8b, v3.b[1]               //coeffabs_1
    dup         v2.8b, v3.b[2]               //coeffabs_2
    dup         v3.8b, v3.b[3]               //coeffabs_3

    bgt         outer_loop_wd_2              //(wd & 3) != 0 : 4-bytes-at-a-time path

    tst         x4,#7                        //ht & 7
    beq         core_loop_ht_8               //ht multiple of 8 : software pipelined path

    lsl         x7,x3,#1                     //2*dst_strd
    sub         x9,x7,x10                    //2*dst_strd - 2wd
    lsl         x12,x2,#1                    //2*src_strd
    sub         x8,x12,x10                   //2*src_strd - 2wd
    mov         x5,x10                       //bytes left in the current row pair = 2wd

inner_loop_ht_2:                             //wd multiple of 4, ht not multiple of 8: 8 bytes x 2 rows per pass

    add         x6,x0,x2                     //pu1_src + src_strd
    ld1         {v17.8b},[x6],x2             //row 1
    subs        x5,x5,#8                     //2wd - 8 (flags consumed by the bgt below)
    ld1         {v5.8b},[x0],#8              //row 0, advance src by 8 bytes
    umull       v6.8h, v17.8b, v1.8b         //out0  = row1 * coeffabs_1
    ld1         {v4.8b},[x6],x2              //row 2
    umlsl       v6.8h, v5.8b, v0.8b          //out0 -= row0 * coeffabs_0
    ld1         {v16.8b},[x6],x2             //row 3
    umlal       v6.8h, v4.8b, v2.8b          //out0 += row2 * coeffabs_2
    umull       v4.8h, v4.8b, v1.8b          //out1  = row2 * coeffabs_1
    umlsl       v6.8h, v16.8b, v3.8b         //out0 -= row3 * coeffabs_3
    umlsl       v4.8h, v17.8b, v0.8b         //out1 -= row1 * coeffabs_0
    ld1         {v18.8b},[x6]                //row 4
    umlal       v4.8h, v16.8b, v2.8b         //out1 += row3 * coeffabs_2
    sqrshrun    v6.8b, v6.8h,#6              //round, shift right by 6, saturate to u8
    umlsl       v4.8h, v18.8b, v3.8b         //out1 -= row4 * coeffabs_3
    add         x6,x1,x3                     //pu1_dst + dst_strd
    sqrshrun    v4.8b, v4.8h,#6              //round, shift right by 6, saturate to u8
    st1         {v6.8b},[x1],#8              //store output row 0, advance dst by 8 bytes

    st1         {v4.8b},[x6]                 //store output row 1

    bgt         inner_loop_ht_2              //more bytes left in this row pair

    subs        x4,x4,#2                     //ht - 2
    add         x1,x1,x9                     //pu1_dst += (2*dst_strd - 2wd)
    mov         x5,x10                       //reload byte counter with 2wd
    add         x0,x0,x8                     //pu1_src += (2*src_strd - 2wd)

    bgt         inner_loop_ht_2              //next pair of rows

    b           end_loops                    //done

outer_loop_wd_2:                             //wd not a multiple of 4: 4 bytes x 2 rows per pass
    lsl         x5,x3,#1                     //2*dst_strd
    mov         x12,x10                      //bytes left in the current row pair = 2wd
    sub         x9,x5,x10                    //2*dst_strd - 2wd
    lsl         x7,x2,#1                     //2*src_strd
    sub         x8,x7,x10                    //2*src_strd - 2wd

inner_loop_wd_2:

    // Two output rows are computed at once: the low 32-bit lane of each
    // operand feeds output row 0 and the high lane feeds output row 1.
    add         x6,x0,x2                     //pu1_src + src_strd
    ld1         {v6.s}[0],[x0]               //v6 lane0 = row 0
    subs        x12,x12,#4                   //2wd - 4 (flags consumed by the bgt below)
    add         x0,x0,#4                     //pu1_src += 4
    ld1         {v6.s}[1],[x6],x2            //v6 lane1 = row 1
    dup         v7.2s, v6.s[1]               //v7 lane0 = row 1
    ld1         {v7.s}[1],[x6],x2            //v7 lane1 = row 2
    umull       v4.8h, v7.8b, v1.8b          //out  = {row1,row2} * coeffabs_1
    dup         v7.2s, v7.s[1]               //v7 lane0 = row 2
    ld1         {v7.s}[1],[x6],x2            //v7 lane1 = row 3
    umlsl       v4.8h, v6.8b, v0.8b          //out -= {row0,row1} * coeffabs_0
    umlal       v4.8h, v7.8b, v2.8b          //out += {row2,row3} * coeffabs_2
    dup         v7.2s, v7.s[1]               //v7 lane0 = row 3
    ld1         {v7.s}[1],[x6]               //v7 lane1 = row 4
    add         x6,x1,x3                     //pu1_dst + dst_strd
    umlsl       v4.8h, v7.8b, v3.8b          //out -= {row3,row4} * coeffabs_3
    sqrshrun    v4.8b, v4.8h,#6              //round, shift right by 6, saturate to u8
    st1         {v4.s}[0],[x1]               //store output row 0
    add         x1,x1,#4                     //pu1_dst += 4
    st1         {v4.s}[1],[x6]               //store output row 1

    bgt         inner_loop_wd_2              //more bytes left in this row pair

    //inner loop ends
    subs        x4,x4,#2                     //ht - 2
    add         x1,x1,x9                     //pu1_dst += 2*dst_strd - 2*wd
    mov         x12,x10                      //reload byte counter with 2wd
    add         x0,x0,x8                     //pu1_src += 2*src_strd - 2*wd

    bgt         inner_loop_wd_2              //next pair of rows

    b           end_loops                    //done

core_loop_ht_8:                              //wd multiple of 4 and ht multiple of 8: 8 bytes x 4 rows per pass

    lsl         x12,x3,#2                    //4*dst_strd
    sub         x8,x12,x10                   //x8 = 4*dst_strd - 2wd
    lsl         x12,x2,#2                    //4*src_strd
    sub         x9,x12,x10                   //x9 = 4*src_strd - 2wd

    bic         x5,x10,#7                    //x5 = 2wd rounded down to a multiple of 8 (bytes left in band)
    lsr         x14, x10, #3                 //2wd / 8 = 8-byte columns per row
    mul         x12, x4 , x14                //ht * columns
    sub         x12, x12,#4                  //x12 = ht*columns - 4; each pass below subtracts 4 more

prolog:
    add         x6,x0,x2                     //pu1_src + src_strd
    ld1         {v5.8b},[x6],x2              //row 1
    subs        x5,x5,#8                     //band bytes - 8; flags drive the 'le' selects below
    ld1         {v4.8b},[x0],#8              //row 0, advance src by 8 bytes
    ld1         {v6.8b},[x6],x2              //row 2
    umull       v30.8h, v5.8b, v1.8b         //out0  = row1 * coeffabs_1
    ld1         {v7.8b},[x6],x2              //row 3
    umlsl       v30.8h, v4.8b, v0.8b         //out0 -= row0 * coeffabs_0
    add         x7,x1,x3                     //pu1_dst + dst_strd
    umlal       v30.8h, v6.8b, v2.8b         //out0 += row2 * coeffabs_2
    umlsl       v30.8h, v7.8b, v3.8b         //out0 -= row3 * coeffabs_3
    ld1         {v16.8b},[x6],x2             //row 4

    umull       v28.8h, v6.8b, v1.8b         //out1  = row2 * coeffabs_1
    add         x20,x0,x9                    //candidate: pu1_src + (4*src_strd - 2wd)
    csel        x0, x20, x0,le               //band finished -> move src to the next 4-row band
    umlsl       v28.8h, v5.8b, v0.8b         //out1 -= row1 * coeffabs_0
    bic         x20,x10,#7                   //candidate: full band byte count
    csel        x5, x20, x5,le               //band finished -> reload the band byte counter
    umlal       v28.8h, v7.8b, v2.8b         //out1 += row3 * coeffabs_2
    ld1         {v17.8b},[x6],x2             //row 5
    umlsl       v28.8h, v16.8b, v3.8b        //out1 -= row4 * coeffabs_3
    sqrshrun    v30.8b, v30.8h,#6            //round, shift right by 6, saturate to u8

    ld1         {v18.8b},[x6],x2             //row 6
    umull       v26.8h, v7.8b, v1.8b         //out2  = row3 * coeffabs_1
    add         x6,x0,x2                     //next column: pu1_src + src_strd
    umlsl       v26.8h, v6.8b, v0.8b         //out2 -= row2 * coeffabs_0
    st1         {v30.8b},[x1],#8             //store output row 0, advance dst by 8 bytes
    umlal       v26.8h, v16.8b, v2.8b        //out2 += row4 * coeffabs_2
    ld1         {v4.8b},[x0],#8              //next column: row 0
    umlsl       v26.8h, v17.8b, v3.8b        //out2 -= row5 * coeffabs_3
    sqrshrun    v28.8b, v28.8h,#6            //round, shift right by 6, saturate to u8

    add         x20,x1,x8                    //candidate: pu1_dst + (4*dst_strd - 2wd)
    csel        x1, x20, x1,le               //band finished -> move dst to the next 4-row band
    umull       v24.8h, v16.8b, v1.8b        //out3  = row4 * coeffabs_1
    ld1         {v5.8b},[x6],x2              //next column: row 1
    umlsl       v24.8h, v7.8b, v0.8b         //out3 -= row3 * coeffabs_0
    subs        x12,x12,#4                   //remaining work counter (flags for the ble below)
    ld1         {v6.8b},[x6],x2              //next column: row 2
    umlal       v24.8h, v17.8b, v2.8b        //out3 += row5 * coeffabs_2
    ld1         {v7.8b},[x6],x2              //next column: row 3
    umlsl       v24.8h, v18.8b, v3.8b        //out3 -= row6 * coeffabs_3

    st1         {v28.8b},[x7],x3             //store output row 1
    sqrshrun    v26.8b, v26.8h,#6            //round, shift right by 6, saturate to u8
    sub         x20,x2,x2,lsl #3             //-7*src_strd
    neg         x11, x20                     //x11 = 7*src_strd : initial prefetch offset
    add         x14,x2,x2,lsl #1             //3*src_strd
    add         x14,x14,x11                  //x14 = 10*src_strd : prefetch offset limit
    ble         epilog                       //no full kernel iteration left

kernel_8:

    // Steady state: finishes/stores rows 2 and 3 of the previous column
    // while computing the current column and loading the next one.
    umull       v30.8h, v5.8b, v1.8b         //out0  = row1 * coeffabs_1
    subs        x5,x5,#8                     //band bytes - 8; flags drive the 'le' selects below
    umlsl       v30.8h, v4.8b, v0.8b         //out0 -= row0 * coeffabs_0
    add         x20,x0,x9                    //candidate: pu1_src + (4*src_strd - 2wd)
    csel        x0, x20, x0,le               //band finished -> move src to the next 4-row band
    umlal       v30.8h, v6.8b, v2.8b         //out0 += row2 * coeffabs_2
    lsl         x20,x2,#3
    sub         x20,x20,x2                   //7*src_strd
    csel        x11,x20,x11,le               //band finished -> reset prefetch offset
    //rsble x11,x2,x2,lsl #3
    umlsl       v30.8h, v7.8b, v3.8b         //out0 -= row3 * coeffabs_3
    st1         {v26.8b},[x7],x3             //store previous column's output row 2
    sqrshrun    v24.8b, v24.8h,#6            //previous column's row 3: round, shift, saturate

    ld1         {v16.8b},[x6],x2             //row 4

    umull       v28.8h, v6.8b, v1.8b         //out1  = row2 * coeffabs_1
    bic         x20,x10,#7                   //candidate: full band byte count
    csel        x5, x20, x5,le               //band finished -> reload the band byte counter
    umlsl       v28.8h, v5.8b, v0.8b         //out1 -= row1 * coeffabs_0
    st1         {v24.8b},[x7],x3             //store previous column's output row 3

    umlal       v28.8h, v7.8b, v2.8b         //out1 += row3 * coeffabs_2

    ld1         {v17.8b},[x6],x2             //row 5
    sqrshrun    v30.8b, v30.8h,#6            //round, shift right by 6, saturate to u8

    umlsl       v28.8h, v16.8b, v3.8b        //out1 -= row4 * coeffabs_3
    ld1         {v18.8b},[x6],x2             //row 6
    add         x7,x1,x3                     //pu1_dst + dst_strd
    umull       v26.8h, v7.8b, v1.8b         //out2  = row3 * coeffabs_1
    add         x6,x0,x2                     //next column: pu1_src + src_strd

    add         x20,x0, x11
    prfm        PLDL1KEEP,[x20]              //prefetch source data x11 bytes ahead


    umlsl       v26.8h, v6.8b, v0.8b         //out2 -= row2 * coeffabs_0
    ld1         {v4.8b},[x0],#8              //next column: row 0

    umlal       v26.8h, v16.8b, v2.8b        //out2 += row4 * coeffabs_2
    st1         {v30.8b},[x1],#8             //store output row 0, advance dst by 8 bytes

    umlsl       v26.8h, v17.8b, v3.8b        //out2 -= row5 * coeffabs_3
    ld1         {v5.8b},[x6],x2              //next column: row 1

    add         x11,x11,x2                   //advance prefetch offset by one row
    sqrshrun    v28.8b, v28.8h,#6            //round, shift right by 6, saturate to u8

    umull       v24.8h, v16.8b, v1.8b        //out3  = row4 * coeffabs_1
    ld1         {v6.8b},[x6],x2              //next column: row 2
    add         x20,x1,x8                    //candidate: pu1_dst + (4*dst_strd - 2wd)
    csel        x1, x20, x1,le               //band finished -> move dst to the next 4-row band

    cmp         x11,x14                      //prefetch offset beyond 10*src_strd ?
    lsl         x20,x2,#3
    sub         x20,x20,x2                   //7*src_strd
    csel        x11,x20,x11,gt               //yes -> wrap it back to 7*src_strd
    //rsbgt x11,x2,x2,lsl #3

    umlsl       v24.8h, v7.8b, v0.8b         //out3 -= row3 * coeffabs_0
    subs        x12,x12,#4                   //remaining work counter (flags for the bgt below)

    umlal       v24.8h, v17.8b, v2.8b        //out3 += row5 * coeffabs_2
    ld1         {v7.8b},[x6],x2              //next column: row 3

    umlsl       v24.8h, v18.8b, v3.8b        //out3 -= row6 * coeffabs_3
    st1         {v28.8b},[x7],x3             //store output row 1
    sqrshrun    v26.8b, v26.8h,#6            //round, shift right by 6, saturate to u8

    bgt         kernel_8                     //more columns/bands to process

epilog:

    // Drain: finish the previous column's rows 2 and 3, then compute and
    // store the last column without loading a further one.
    umull       v30.8h, v5.8b, v1.8b         //out0  = row1 * coeffabs_1
    umlsl       v30.8h, v4.8b, v0.8b         //out0 -= row0 * coeffabs_0
    umlal       v30.8h, v6.8b, v2.8b         //out0 += row2 * coeffabs_2
    umlsl       v30.8h, v7.8b, v3.8b         //out0 -= row3 * coeffabs_3
    st1         {v26.8b},[x7],x3             //store previous column's output row 2
    sqrshrun    v24.8b, v24.8h,#6            //previous column's row 3: round, shift, saturate

    ld1         {v16.8b},[x6],x2             //row 4
    umull       v28.8h, v6.8b, v1.8b         //out1  = row2 * coeffabs_1
    umlsl       v28.8h, v5.8b, v0.8b         //out1 -= row1 * coeffabs_0
    umlal       v28.8h, v7.8b, v2.8b         //out1 += row3 * coeffabs_2
    umlsl       v28.8h, v16.8b, v3.8b        //out1 -= row4 * coeffabs_3
    st1         {v24.8b},[x7],x3             //store previous column's output row 3
    sqrshrun    v30.8b, v30.8h,#6            //round, shift right by 6, saturate to u8

    ld1         {v17.8b},[x6],x2             //row 5
    umull       v26.8h, v7.8b, v1.8b         //out2  = row3 * coeffabs_1
    add         x7,x1,x3                     //pu1_dst + dst_strd
    umlsl       v26.8h, v6.8b, v0.8b         //out2 -= row2 * coeffabs_0
    st1         {v30.8b},[x1],#8             //store output row 0

    sqrshrun    v28.8b, v28.8h,#6            //round, shift right by 6, saturate to u8
    umlal       v26.8h, v16.8b, v2.8b        //out2 += row4 * coeffabs_2
    ld1         {v18.8b},[x6],x2             //row 6
    umlsl       v26.8h, v17.8b, v3.8b        //out2 -= row5 * coeffabs_3

    umull       v24.8h, v16.8b, v1.8b        //out3  = row4 * coeffabs_1
    sqrshrun    v26.8b, v26.8h,#6            //round, shift right by 6, saturate to u8
    st1         {v28.8b},[x7],x3             //store output row 1
    umlsl       v24.8h, v7.8b, v0.8b         //out3 -= row3 * coeffabs_0
    umlal       v24.8h, v17.8b, v2.8b        //out3 += row5 * coeffabs_2
    st1         {v26.8b},[x7],x3             //store output row 2
    umlsl       v24.8h, v18.8b, v3.8b        //out3 -= row6 * coeffabs_3

    sqrshrun    v24.8b, v24.8h,#6            //round, shift right by 6, saturate to u8
    st1         {v24.8b},[x7],x3             //store output row 3
end_loops:
    // ldmfd sp!,{x4-x12,x15}                //reload the registers from sp
    ldp         x19, x20,[sp],#16            //restore callee-saved registers

    ret


