///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_weighted_pred_bi.s
//*
//* //brief
//*  contains function definitions for weighted prediction used in inter
//*  prediction
//*
//* //author
//*  parthiban v
//*
//* //par list of functions:
//*  - ihevc_weighted_pred_bi()
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*  does bi-weighted prediction on the arrays pointed by pi2_src1 and
//*  pi2_src2 and stores it at location pointed by pi2_dst assumptions : the
//*  function is optimized considering the fact width and height are multiple
//*  of 2.
46//* 47//* //par description: 48//* dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 + 49//* off1 + 1) << (shift - 1) ) >> shift 50//* 51//* //param[in] pi2_src1 52//* pointer to source 1 53//* 54//* //param[in] pi2_src2 55//* pointer to source 2 56//* 57//* //param[out] pu1_dst 58//* pointer to destination 59//* 60//* //param[in] src_strd1 61//* source stride 1 62//* 63//* //param[in] src_strd2 64//* source stride 2 65//* 66//* //param[in] dst_strd 67//* destination stride 68//* 69//* //param[in] wgt0 70//* weight to be multiplied to source 1 71//* 72//* //param[in] off0 73//* offset 0 74//* 75//* //param[in] wgt1 76//* weight to be multiplied to source 2 77//* 78//* //param[in] off1 79//* offset 1 80//* 81//* //param[in] shift 82//* (14 bit depth) + log2_weight_denominator 83//* 84//* //param[in] lvl_shift1 85//* added before shift and offset 86//* 87//* //param[in] lvl_shift2 88//* added before shift and offset 89//* 90//* //param[in] ht 91//* height of the source 92//* 93//* //param[in] wd 94//* width of the source 95//* 96//* //returns 97//* 98//* //remarks 99//* none 100//* 101//******************************************************************************* 102//*/ 103 104//void ihevc_weighted_pred_bi(word16 *pi2_src1, 105// word16 *pi2_src2, 106// uword8 *pu1_dst, 107// word32 src_strd1, 108// word32 src_strd2, 109// word32 dst_strd, 110// word32 wgt0, 111// word32 off0, 112// word32 wgt1, 113// word32 off1, 114// word32 shift, 115// word32 lvl_shift1, 116// word32 lvl_shift2, 117// word32 ht, 118// word32 wd) 119 120//**************variables vs registers***************************************** 121// x0 => *pi2_src1 122// x1 => *pi2_src2 123// x2 => *pu1_dst 124// x3 => src_strd1 125// x4 => src_strd2 126// x5 => dst_strd 127// x6 => wgt0 128// x7 => off0 129// x8 => wgt1 130// x9 => off1 131// x10 => shift 132// x11 => lvl_shift1 133// x12 => lvl_shift2 134// x14 => ht 135// x7 => wd 136 137.text 138.align 4 139 140.include 
"ihevc_neon_macros.s" 141 142.globl ihevc_weighted_pred_bi_av8 143 144.type ihevc_weighted_pred_bi_av8, %function 145 146ihevc_weighted_pred_bi_av8: 147 148 // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments 149 150 ldr w8,[sp,#0] 151 ldr w9,[sp,#8] 152 ldr w10,[sp,#16] 153 ldr w11,[sp,#24] 154 ldr w12,[sp,#32] 155 ldr w13,[sp,#40] 156 ldr w14,[sp,#48] 157 158 sxtw x8,w8 159 sxtw x9,w9 160 sxtw x10,w10 161 sxtw x11,w11 162 sxtw x12,w12 163 164 165 stp x19, x20,[sp,#-16]! 166 stp x21, x22,[sp,#-16]! 167 stp x23, x24,[sp,#-16]! 168 stp x25, x26,[sp,#-16]! 169 170 mov x15,x4 // src_strd2 40 171 mov x16,x5 // dst_strd 44 172 mov x17,x6 // wgt0 48 173 mov x19,x7 // off0 52 174 mov x20,x8 // wgt1 56 175 mov x21,x9 // off1 60 176 mov x22,x10 // shift 64 177 mov x23,x11 // lvl_shift1 68 178 mov x24,x12 // lvl_shift2 72 179 mov x25,x13 // ht 76 180 mov x26,x14 // wd 80 181 182 mov x6,x17 //load wgt0 183 mov x11,x23 //load lvl_shift1 184 mov x12,x24 //load lvl_shift2 185 mov v7.h[0],w6 //moved for scalar multiplication 186 mul x4, x11 , x6 //lvl_shift1 * wgt0 187 mov x8,x20 //load wgt1 188 mov x7,x19 //load off0 189 mov v7.h[1],w8 //moved for scalar multiplication 190 madd x4,x12,x8,x4 //(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1) 191 mov x9,x21 //load off1 192 add x5,x7,x9 //off0 + off1 193 mov x10,x22 //load shift 194 add x5,x5,#1 //off0 + off1 + 1 195 sub x14,x10,#1 //shift - 1 196 mov x7,x26 //load wd 197 lsl x5,x5,x14 //((off0 + off1 + 1) << (shift - 1)) 198 dup v28.4s,w10 //vmovq_n_s32(0-shift) 199 add x4,x4,x5 //tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1)) 200 dup v30.4s,w4 //vmovq_n_s32(tmp_lvl_shift) 201 neg v28.4s, v28.4s 202 mov x4,x15 //load src_strd2 203 lsl x9,x7,#1 204 mov x5,x16 //load dst_strd 205 lsl x3,x3,#1 206 mov x14,x25 //load ht 207 lsl x4,x4,#1 208 209 cmp x14,#0 //check ht == 0 210 beq end_loops //if equal, then end the function 211 212outer_loop: 213 cmp x7,#0 //check wd == 0 214 beq end_loops //if equal, then end the 
function 215 216core_loop: 217 add x6,x0,x3 //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer) 218 add x8,x1,x4 //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer) 219 ld1 {v0.4h},[x0],#8 //load and increment the pi2_src1 220 add x10,x2,x5 //pu1_dst_tmp = pu1_dst + dst_strd 221 ld1 {v1.4h},[x1],#8 //load and increment the pi2_src2 222 smull v4.4s, v0.4h, v7.h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) 223 ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 ii iteration 224 smull v5.4s, v1.4h, v7.h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) 225 ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 ii iteration 226 add v4.4s, v4.4s , v5.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) 227 228 ld1 {v0.4h},[x6],x3 //load and increment the pi2_src1 iii iteration 229 smull v6.4s, v2.4h, v7.h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration 230 231 ld1 {v1.4h},[x8],x4 //load and increment the pi2_src2 iii iteration 232 add v4.4s, v4.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) 233 smull v19.4s, v0.4h, v7.h[0] //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration 234 235 ld1 {v2.4h},[x6],x3 //load and increment the pi2_src_tmp1 iv iteration 236 smull v17.4s, v3.4h, v7.h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration 237 sshl v4.4s,v4.4s,v28.4s //vshlq_s32(i4_tmp1_t1, tmp_shift_t) 238 239 ld1 {v3.4h},[x8],x4 //load and increment the pi2_src_tmp1 iv iteration 240 add v6.4s, v6.4s , v17.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration 241 242 sqxtun v4.4h, v4.4s //vqmovun_s32(sto_res_tmp1) 243 smull v16.4s, v1.4h, v7.h[1] //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration 244 245 add v6.4s, v6.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration 246 //mov v5, v4 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) 247 add v19.4s, v19.4s , v16.4s //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration 248 249 sshl v6.4s,v6.4s,v28.4s 250 //vshl.s32 q5,q5,q14 
//vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration 251 smull v18.4s, v2.4h, v7.h[0] //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration 252 uqxtn v4.8b,v4.8h 253 //vqmovn.u16 d4,q2 //vqmovn_u16(sto_res_tmp3) 254 add v19.4s, v19.4s , v30.4s //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration 255 256 sqxtun v6.4h, v6.4s //vqmovun_s32(sto_res_tmp1) ii iteration 257 smull v20.4s, v3.4h, v7.h[1] //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration 258 259 sshl v19.4s,v19.4s,v28.4s 260 //vshl.s32 q7,q7,q14 //vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration 261 //mov v11, v10 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration 262 263 add v18.4s, v18.4s , v20.4s //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration 264 sqxtun v19.4h, v19.4s //vqmovun_s32(sto_res_tmp1) iii iteration 265 266 add v18.4s, v18.4s , v30.4s //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration 267 st1 {v4.s}[0],[x2],#4 //store pu1_dst i iteration 268 269 uqxtn v6.8b,v6.8h 270 //vqmovn.u16 d10,q5 //vqmovn_u16(sto_res_tmp3) ii iteration 271 sshl v18.4s,v18.4s,v28.4s 272 //vshl.s32 q9,q9,q14 //vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration 273 st1 {v6.s}[0],[x10],x5 //store pu1_dst ii iteration 274 275 276 //mov v15, v14 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration 277 uqxtn v19.8b,v19.8h 278 //vqmovn.u16 d14,q7 //vqmovn_u16(sto_res_tmp3) iii iteration 279 sqxtun v18.4h, v18.4s //vqmovun_s32(sto_res_tmp1) iv iteration 280 //mov v19, v18 //vcombine_u16(sto_res_tmp2, sto_res_tmp2) 281 st1 {v19.s}[0],[x10],x5 //store pu1_dst iii iteration 282 uqxtn v18.8b,v18.8h 283 //vqmovn.u16 d18,q9 //vqmovn_u16(sto_res_tmp3) iv iteration 284 subs x7,x7,#4 //decrement wd by 4 and check for 0 285 st1 {v18.s}[0],[x10],x5 //store pu1_dst iv iteration 286 287 bgt core_loop //if greater than 0 repeat the core loop again 288 289end_core_loop: 290 sub x20,x9,x3,lsl #2 //2*src_strd1 - wd 291 neg x11, x20 292 subs x14,x14,#4 //decrement the ht by 4 293 sub x20,x9,x4,lsl #2 //2*src_strd2 - wd 294 neg 
x12, x20 295 add x0,x0,x11 //pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement) 296 asr x7,x9,#1 297 add x1,x1,x12 //pi2_src2 + 4*src_strd2 - 2*wd 298 sub x20,x7,x5,lsl #2 //2*dst_strd - wd 299 neg x10, x20 300 add x2,x2,x10 //pu1_dst + dst_std - wd 301 bgt core_loop //if ht is greater than 0 goto outer_loop 302 303end_loops: 304 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp 305 ldp x25, x26,[sp],#16 306 ldp x23, x24,[sp],#16 307 ldp x21, x22,[sp],#16 308 ldp x19, x20,[sp],#16 309 310 ret 311 312 313 314 315 316 317