///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_vert_neon_w16inp_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded using neon intrinsics and can be compiled using
//*  rvct
//*
//* //author
//*  yogeswaran rs / parthiban
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*  chroma interprediction filter for 16bit vertical input.
//*
//* //par description:
//*  applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
//*  the elements pointed to by 'pi2_src' and writes to the location pointed to
//*  by 'pu1_dst'. the input is 16 bits; the filter output is downshifted by 12
//*  and clipped to lie between 0 and 255.
//*  assumptions : the function is optimized considering the fact that width
//*  and height are multiples of 2.
//*
///**
//*******************************************************************************
//*
//* ihevc_inter_pred_chroma_vert_w16inp_av8
//*
//* 4-tap vertical interpolation over 16-bit input samples producing 8-bit
//* output. For every output sample the four vertically adjacent inputs
//* (rows y-1 .. y+2) are multiplied by the four coefficients and summed in
//* 32 bits; the sum is narrowed with a saturating >>6 to 16 bits and then
//* with a rounding, unsigned-saturating >>6 to 8 bits (12 bits in total,
//* result clipped to 0..255).
//*
//* //param[in] pi2_src
//*  word16 pointer to the source
//*
//* //param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride (in 16-bit elements; scaled to bytes below)
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array (each row processed is 2*wd samples wide)
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_vert_w16inp(word16 *pi2_src,
//                                         uword8 *pu1_dst,
//                                         word32 src_strd,
//                                         word32 dst_strd,
//                                         word8 *pi1_coeff,
//                                         word32 ht,
//                                         word32 wd)
//**************variables vs registers*****************************************
//x0 => *pi2_src   (later: scratch row pointer walking down the 4/7 input rows)
//x1 => *pu1_dst   (write pointer for the first output row of a tile)
//x2 => src_strd   (converted to a byte stride on entry)
//x3 => dst_strd
//x4 => *pi1_coeff (later: pointer to the row above the current output row)
//x5 => ht
//x6 => wd
//x7, x8, x9, x11, x12, x14, x20 => scratch (x20 is callee-saved, hence the
//                                  stp/ldp pair)
//v16..v19 => coeff[0]..coeff[3] broadcast to 4 x s16

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_vert_w16inp_av8

.type ihevc_inter_pred_chroma_vert_w16inp_av8, %function

ihevc_inter_pred_chroma_vert_w16inp_av8:

    stp         x19, x20, [sp, #-16]!       //x20 is used as scratch below; pair keeps sp 16-byte aligned

    // The four integer arguments are 32-bit (word32). AAPCS64 leaves bits
    // [63:32] of their registers unspecified, and they are used in 64-bit
    // address arithmetic below, so sign-extend them first.
    sxtw        x2, w2                      //src_strd
    sxtw        x3, w3                      //dst_strd
    sxtw        x5, w5                      //ht
    sxtw        x6, w6                      //wd

    lsl         x2, x2, #1                  //src_strd in bytes (16-bit samples)
    // NOTE(review): this reads 8 bytes from pi1_coeff although only the
    // first four are used -- assumes 8 readable bytes there; confirm.
    ld1         {v0.8b}, [x4]               //loads pi1_coeff
    sub         x4, x0, x2                  //pi2_src - src_strd : row above the first output row
    sxtl        v0.8h, v0.8b                //widen coefficients to 16 bit

    tst         x6, #3                      //wd multiple of 4 ?
    dup         v16.4h, v0.h[0]             //coeff_0
    dup         v17.4h, v0.h[1]             //coeff_1
    dup         v18.4h, v0.h[2]             //coeff_2
    dup         v19.4h, v0.h[3]             //coeff_3

    bne         core_loop_ht_2              //wd not a multiple of 4 -> 2-row loop
                                            //(was bgt: identical after tst, which leaves N = V = 0)

    tst         x5, #3                      //ht multiple of 4 ?
    beq         core_loop_ht_4              //yes -> pipelined 4-row loop

//-----------------------------------------------------------------------------
// 2-row path: every inner iteration produces 4 samples for 2 output rows.
//-----------------------------------------------------------------------------
core_loop_ht_2:
    lsl         x7, x2, #1                  //2 source rows in bytes
    lsl         x12, x3, #1                 //2*dst_strd
    lsl         x9, x6, #2                  //4*wd = bytes of source consumed per row
    sub         x6, x12, x6, lsl #1         //2*dst_strd - 2*wd : dst rewind + 2 rows down
    sub         x8, x7, x9                  //2 source rows - 4*wd : src rewind + 2 rows down
    mov         x12, x9                     //source bytes left in this row pair

inner_loop_ht_2:
    add         x0, x4, x2                  //x0 -> row 0
    ld1         {v0.4h}, [x4], #8           //row -1, advance 4 samples
    smull       v0.4s, v0.4h, v16.4h        //out0  = row-1 * coeff_0
    subs        x12, x12, #8                //4 samples (8 bytes) consumed; flags used by bgt below
    ld1         {v2.4h}, [x0], x2           //row 0
    smull       v7.4s, v2.4h, v16.4h        //out1  = row0 * coeff_0
    ld1         {v3.4h}, [x0], x2           //row 1
    smlal       v0.4s, v2.4h, v17.4h        //out0 += row0 * coeff_1
    ld1         {v6.4h}, [x0], x2           //row 2
    smlal       v7.4s, v3.4h, v17.4h        //out1 += row1 * coeff_1
    ld1         {v2.4h}, [x0]               //row 3
    add         x7, x1, x3                  //pu1_dst + dst_strd
    smlal       v0.4s, v3.4h, v18.4h        //out0 += row1 * coeff_2
    smlal       v7.4s, v6.4h, v18.4h        //out1 += row2 * coeff_2
    smlal       v0.4s, v6.4h, v19.4h        //out0 += row2 * coeff_3
    smlal       v7.4s, v2.4h, v19.4h        //out1 += row3 * coeff_3
    sqshrn      v0.4h, v0.4s, #6            //saturating >>6 to 16 bit
    sqshrn      v30.4h, v7.4s, #6           //saturating >>6 to 16 bit
    sqrshrun    v0.8b, v0.8h, #6            //rounding >>6, clip to 0..255
    sqrshrun    v30.8b, v30.8h, #6          //rounding >>6, clip to 0..255
    st1         {v0.s}[0], [x1], #4         //4 output bytes, row 0
    st1         {v30.s}[0], [x7]            //4 output bytes, row 1
    bgt         inner_loop_ht_2             //more columns in this row pair

    //row pair done
    subs        x5, x5, #2                  //2 rows finished
    add         x1, x1, x6                  //pu1_dst += 2*dst_strd - 2*wd
    mov         x12, x9                     //reload per-row byte counter
    add         x4, x4, x8                  //src += 2 rows - 4*wd bytes
    bgt         inner_loop_ht_2             //more rows

    b           end_loops

//-----------------------------------------------------------------------------
// 4-row path: software pipelined. Each tile is 4 samples x 4 output rows and
// needs 7 input rows (v0..v6). Accumulators: v30/v28/v26/v24 = output rows
// 0/1/2/3. The narrowing and stores of a tile overlap the loads and
// multiplies of the next one, so instruction order here is significant.
//-----------------------------------------------------------------------------
core_loop_ht_4:
    lsl         x7, x2, #2                  //4 source rows in bytes
    lsl         x12, x3, #2                 //4*dst_strd
    lsr         x11, x6, #1                 //wd/2
    sub         x14, x12, x6, lsl #1        //4*dst_strd - 2*wd : dst rewind + 4 rows down
    sub         x8, x7, x6, lsl #2          //4 source rows - 4*wd : src rewind + 4 rows down

    mul         x12, x5, x11                //ht*wd/2 : 4 per tile
    sub         x12, x12, #4                //last tile is handled by the epilog
    lsl         x11, x6, #1                 //2*wd : samples left in the current row band

prolog:
    add         x0, x4, x2                  //x0 -> row 0
    ld1         {v0.4h}, [x4], #8           //row -1, advance 4 samples
    ld1         {v1.4h}, [x0], x2           //row 0
    subs        x11, x11, #4                //4 samples consumed; 'le' below = end of row band
    ld1         {v2.4h}, [x0], x2           //row 1
    smull       v30.4s, v0.4h, v16.4h       //out0  = row-1 * coeff_0
    ld1         {v3.4h}, [x0], x2           //row 2
    smlal       v30.4s, v1.4h, v17.4h
    smlal       v30.4s, v2.4h, v18.4h
    add         x9, x1, x3                  //pu1_dst + dst_strd
    smlal       v30.4s, v3.4h, v19.4h

    ld1         {v4.4h}, [x0], x2           //row 3
    smull       v28.4s, v1.4h, v16.4h       //out1  = row0 * coeff_0
    add         x20, x4, x8
    csel        x4, x20, x4, le             //end of band: src moves down 4 rows
    smlal       v28.4s, v2.4h, v17.4h
    ld1         {v5.4h}, [x0], x2           //row 4
    smlal       v28.4s, v3.4h, v18.4h
    ld1         {v6.4h}, [x0], x2           //row 5
    smlal       v28.4s, v4.4h, v19.4h
    lsl         x20, x6, #1
    csel        x11, x20, x11, le           //end of band: reload sample counter with 2*wd

    sqshrn      v30.4h, v30.4s, #6          //saturating >>6 to 16 bit

    smull       v26.4s, v2.4h, v16.4h       //out2  = row1 * coeff_0
    add         x0, x4, x2                  //x0 -> row 0 of the next tile
    smlal       v26.4s, v3.4h, v17.4h
    smlal       v26.4s, v4.4h, v18.4h
    ld1         {v0.4h}, [x4], #8           //next tile: row -1
    smlal       v26.4s, v5.4h, v19.4h

    sqrshrun    v30.8b, v30.8h, #6          //rounding >>6, clip to 0..255
    sqshrn      v28.4h, v28.4s, #6          //saturating >>6 to 16 bit

    ld1         {v1.4h}, [x0], x2           //next tile: row 0
    smull       v24.4s, v3.4h, v16.4h       //out3  = row2 * coeff_0 (v3 of the current tile)
    st1         {v30.s}[0], [x1], #4        //store output row 0
    smlal       v24.4s, v4.4h, v17.4h
    ld1         {v2.4h}, [x0], x2           //next tile: row 1
    smlal       v24.4s, v5.4h, v18.4h
    ld1         {v3.4h}, [x0], x2           //next tile: row 2
    smlal       v24.4s, v6.4h, v19.4h
    add         x20, x1, x14
    csel        x1, x20, x1, le             //end of band: dst moves down 4 rows

    sqshrn      v26.4h, v26.4s, #6          //saturating >>6 to 16 bit
    subs        x12, x12, #4                //one tile accounted for
    sqrshrun    v28.8b, v28.8h, #6          //rounding >>6, clip to 0..255

    beq         epilog                      //only the last tile remains

kernel_4:
    smull       v30.4s, v0.4h, v16.4h       //out0  = row-1 * coeff_0
    subs        x11, x11, #4                //4 samples consumed; 'le' below = end of row band
    smlal       v30.4s, v1.4h, v17.4h
    st1         {v28.s}[0], [x9], x3        //previous tile: store output row 1
    smlal       v30.4s, v2.4h, v18.4h
    smlal       v30.4s, v3.4h, v19.4h

    sqshrn      v24.4h, v24.4s, #6          //previous tile: out3 >>6 to 16 bit
    sqrshrun    v26.8b, v26.8h, #6          //previous tile: out2 rounding >>6, clip

    ld1         {v4.4h}, [x0], x2           //row 3
    smull       v28.4s, v1.4h, v16.4h       //out1  = row0 * coeff_0
    smlal       v28.4s, v2.4h, v17.4h
    smlal       v28.4s, v3.4h, v18.4h
    smlal       v28.4s, v4.4h, v19.4h
    st1         {v26.s}[0], [x9], x3        //previous tile: store output row 2
    add         x20, x4, x8
    csel        x4, x20, x4, le             //end of band: src moves down 4 rows
    lsl         x20, x6, #1
    csel        x11, x20, x11, le           //end of band: reload sample counter with 2*wd

    sqshrn      v30.4h, v30.4s, #6          //saturating >>6 to 16 bit
    sqrshrun    v24.8b, v24.8h, #6          //previous tile: out3 rounding >>6, clip

    ld1         {v5.4h}, [x0], x2           //row 4
    smull       v26.4s, v2.4h, v16.4h       //out2  = row1 * coeff_0
    ld1         {v6.4h}, [x0], x2           //row 5
    smlal       v26.4s, v3.4h, v17.4h
    st1         {v24.s}[0], [x9]            //previous tile: store output row 3
    add         x0, x4, x2                  //x0 -> row 0 of the next tile
    smlal       v26.4s, v4.4h, v18.4h
    ld1         {v0.4h}, [x4], #8           //next tile: row -1
    smlal       v26.4s, v5.4h, v19.4h

    sqshrn      v28.4h, v28.4s, #6          //saturating >>6 to 16 bit
    sqrshrun    v30.8b, v30.8h, #6          //rounding >>6, clip to 0..255

    ld1         {v1.4h}, [x0], x2           //next tile: row 0
    smull       v24.4s, v3.4h, v16.4h       //out3  = row2 * coeff_0 (v3 of the current tile)
    add         x9, x1, x3                  //pu1_dst + dst_strd
    ld1         {v2.4h}, [x0], x2           //next tile: row 1
    smlal       v24.4s, v4.4h, v17.4h
    ld1         {v3.4h}, [x0], x2           //next tile: row 2
    smlal       v24.4s, v5.4h, v18.4h

    st1         {v30.s}[0], [x1], #4        //store output row 0
    smlal       v24.4s, v6.4h, v19.4h

    sqshrn      v26.4h, v26.4s, #6          //saturating >>6 to 16 bit
    sqrshrun    v28.8b, v28.8h, #6          //rounding >>6, clip to 0..255
    add         x20, x1, x14
    csel        x1, x20, x1, le             //end of band: dst moves down 4 rows

    subs        x12, x12, #4                //one tile accounted for

    bgt         kernel_4                    //more tiles before the last one

epilog:
    smull       v30.4s, v0.4h, v16.4h       //out0  = row-1 * coeff_0
    st1         {v28.s}[0], [x9], x3        //previous tile: store output row 1
    smlal       v30.4s, v1.4h, v17.4h
    smlal       v30.4s, v2.4h, v18.4h
    smlal       v30.4s, v3.4h, v19.4h

    sqshrn      v24.4h, v24.4s, #6          //previous tile: out3 >>6 to 16 bit
    sqrshrun    v26.8b, v26.8h, #6          //previous tile: out2 rounding >>6, clip

    smull       v28.4s, v1.4h, v16.4h       //out1  = row0 * coeff_0
    ld1         {v4.4h}, [x0], x2           //row 3
    smlal       v28.4s, v2.4h, v17.4h
    st1         {v26.s}[0], [x9], x3        //previous tile: store output row 2
    smlal       v28.4s, v3.4h, v18.4h
    smlal       v28.4s, v4.4h, v19.4h

    sqshrn      v30.4h, v30.4s, #6          //saturating >>6 to 16 bit
    sqrshrun    v24.8b, v24.8h, #6          //previous tile: out3 rounding >>6, clip

    smull       v26.4s, v2.4h, v16.4h       //out2  = row1 * coeff_0
    ld1         {v5.4h}, [x0], x2           //row 4
    smlal       v26.4s, v3.4h, v17.4h
    smlal       v26.4s, v4.4h, v18.4h
    smlal       v26.4s, v5.4h, v19.4h

    sqshrn      v28.4h, v28.4s, #6          //saturating >>6 to 16 bit
    sqrshrun    v30.8b, v30.8h, #6          //rounding >>6, clip to 0..255

    st1         {v24.s}[0], [x9]            //previous tile: store output row 3
    smull       v24.4s, v3.4h, v16.4h       //out3  = row2 * coeff_0
    smlal       v24.4s, v4.4h, v17.4h
    add         x9, x1, x3                  //pu1_dst + dst_strd
    ld1         {v6.4h}, [x0], x2           //row 5
    smlal       v24.4s, v5.4h, v18.4h
    smlal       v24.4s, v6.4h, v19.4h
    st1         {v30.s}[0], [x1], #4        //store output row 0

    sqrshrun    v28.8b, v28.8h, #6          //rounding >>6, clip to 0..255
    sqshrn      v26.4h, v26.4s, #6          //saturating >>6 to 16 bit

    st1         {v28.s}[0], [x9], x3        //store output row 1
    sqrshrun    v26.8b, v26.8h, #6          //rounding >>6, clip to 0..255

    sqshrn      v24.4h, v24.4s, #6          //saturating >>6 to 16 bit
    st1         {v26.s}[0], [x9], x3        //store output row 2
    sqrshrun    v24.8b, v24.8h, #6          //rounding >>6, clip to 0..255

    st1         {v24.s}[0], [x9]            //store output row 3

end_loops:
    ldp         x19, x20, [sp], #16         //restore callee-saved registers

    ret