//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_default_weighted_pred_av8.s
//*
//* @brief
//*  Contains function definitions for default weighted prediction.
//*
//* @author
//*  Kaushik Senthoor R
//*
//* @par List of Functions:
//*
//*  - ih264_default_weighted_pred_luma_av8()
//*  - ih264_default_weighted_pred_chroma_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//*******************************************************************************
//* @function
//*  ih264_default_weighted_pred_luma_av8()
//*
//* @brief
//*  This routine performs the default weighted prediction as described in sec
//* 8.4.2.3.1 titled "Default weighted sample prediction process" for luma.
//*
//* @par Description:
//*  This function gets two ht x wd blocks, calculates their rounded-average and
//* stores it in the destination block.
//*
//* @param[in] puc_src1:
//*  UWORD8 Pointer to the buffer containing the first input block.
//*
//* @param[in] puc_src2:
//*  UWORD8 Pointer to the buffer containing the second input block.
//*
//* @param[out] puc_dst
//*  UWORD8 pointer to the destination where the output block is stored.
//*
//* @param[in] src_strd1
//*  Stride of the first input buffer
//*
//* @param[in] src_strd2
//*  Stride of the second input buffer
//*
//* @param[in] dst_strd
//*  Stride of the destination buffer
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*  None
//*
//* @remarks
//*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
//*  Any wd other than 8 or 16 is handled by the 4-wide path.
//*  Rows are consumed in groups of four (wd = 4, 8) or eight (wd = 16); ht is
//*  expected to be a multiple of the group size.
//*
//*******************************************************************************
//*/
//void ih264_default_weighted_pred_luma_av8(UWORD8 *puc_src1,
//                                          UWORD8 *puc_src2,
//                                          UWORD8 *puc_dst,
//                                          WORD32 src_strd1,
//                                          WORD32 src_strd2,
//                                          WORD32 dst_strd,
//                                          UWORD8 ht,
//                                          UWORD8 wd)
//
//**************Variables Vs Registers*****************************************
//    x0      => puc_src1
//    x1      => puc_src2
//    x2      => puc_dst
//    w3      => src_strd1 (sign-extended to x3 on entry)
//    w4      => src_strd2 (sign-extended to x4 on entry)
//    w5      => dst_strd  (sign-extended to x5 on entry)
//    w6      => ht        (zero-extended from 8 bits on entry)
//    w7      => wd        (zero-extended from 8 bits on entry)
//
.text
.p2align 2
.include "ih264_neon_macros.s"



    .global ih264_default_weighted_pred_luma_av8

ih264_default_weighted_pred_luma_av8:

    //loop_16 overwrites v8-v15; push_v_regs/pop_v_regs are macros defined
    //outside this file view and presumably spill/restore those registers
    push_v_regs

    //The strides are 32-bit signed arguments but are used below as 64-bit
    //post-index registers; the upper halves of x3-x5 are not guaranteed by
    //the procedure call standard, so widen them explicitly.
    sxtw      x3, w3                      //x3 = (WORD64)src_strd1
    sxtw      x4, w4                      //x4 = (WORD64)src_strd2
    sxtw      x5, w5                      //x5 = (WORD64)dst_strd

    //ht and wd are 8-bit arguments; bits 8-31 of w6/w7 are unspecified on
    //entry, so clear them before comparing / counting down.
    uxtb      w6, w6                      //w6 = ht
    uxtb      w7, w7                      //w7 = wd

    cmp       w7, #16
    beq       loop_16                     //branch if wd is 16
    cmp       w7, #8
    beq       loop_8                      //branch if wd is 8

loop_4:                                   //each iteration processes four rows

    ld1       {v0.s}[0], [x0], x3         //load row 1 in source 1
    ld1       {v0.s}[1], [x0], x3         //load row 2 in source 1
    ld1       {v2.s}[0], [x1], x4         //load row 1 in source 2
    ld1       {v2.s}[1], [x1], x4         //load row 2 in source 2
    ld1       {v1.s}[0], [x0], x3         //load row 3 in source 1
    ld1       {v1.s}[1], [x0], x3         //load row 4 in source 1
    urhadd    v0.8b, v0.8b, v2.8b         //rows 1-2: (src1 + src2 + 1) >> 1
    ld1       {v3.s}[0], [x1], x4         //load row 3 in source 2
    ld1       {v3.s}[1], [x1], x4         //load row 4 in source 2
    subs      w6, w6, #4                  //decrement ht by 4 (flags used by bgt below)
    st1       {v0.s}[0], [x2], x5         //store row 1 in destination
    st1       {v0.s}[1], [x2], x5         //store row 2 in destination
    urhadd    v1.8b, v1.8b, v3.8b         //rows 3-4: (src1 + src2 + 1) >> 1
    st1       {v1.s}[0], [x2], x5         //store row 3 in destination
    st1       {v1.s}[1], [x2], x5         //store row 4 in destination
    bgt       loop_4                      //if greater than 0 repeat the loop again
    b         end_loops

loop_8:                                   //each iteration processes four rows

    ld1       {v0.8b}, [x0], x3           //load row 1 in source 1
    ld1       {v4.8b}, [x1], x4           //load row 1 in source 2
    ld1       {v1.8b}, [x0], x3           //load row 2 in source 1
    ld1       {v5.8b}, [x1], x4           //load row 2 in source 2
    ld1       {v2.8b}, [x0], x3           //load row 3 in source 1
    urhadd    v0.8b, v0.8b, v4.8b         //row 1 rounded average
    urhadd    v1.8b, v1.8b, v5.8b         //row 2 rounded average
    ld1       {v6.8b}, [x1], x4           //load row 3 in source 2
    ld1       {v3.8b}, [x0], x3           //load row 4 in source 1
    urhadd    v2.8b, v2.8b, v6.8b         //row 3 rounded average
    ld1       {v7.8b}, [x1], x4           //load row 4 in source 2
    subs      w6, w6, #4                  //decrement ht by 4 (flags used by bgt below)
    st1       {v0.8b}, [x2], x5           //store row 1 in destination
    urhadd    v3.8b, v3.8b, v7.8b         //row 4 rounded average
    st1       {v1.8b}, [x2], x5           //store row 2 in destination
    st1       {v2.8b}, [x2], x5           //store row 3 in destination
    st1       {v3.8b}, [x2], x5           //store row 4 in destination
    bgt       loop_8                      //if greater than 0 repeat the loop again
    b         end_loops

loop_16:                                  //each iteration processes eight rows

    //Each 16-byte row is held as two 8-byte halves in a register pair:
    //source 1 rows in v0-v15, source 2 rows in v16-v31.
    ld1       {v0.8b, v1.8b}, [x0], x3    //load row 1 in source 1
    ld1       {v16.8b, v17.8b}, [x1], x4  //load row 1 in source 2
    ld1       {v2.8b, v3.8b}, [x0], x3    //load row 2 in source 1
    ld1       {v18.8b, v19.8b}, [x1], x4  //load row 2 in source 2
    urhadd    v0.8b, v0.8b, v16.8b        //row 1 rounded average
    urhadd    v1.8b, v1.8b, v17.8b
    ld1       {v4.8b, v5.8b}, [x0], x3    //load row 3 in source 1
    ld1       {v20.8b, v21.8b}, [x1], x4  //load row 3 in source 2
    urhadd    v2.8b, v2.8b, v18.8b        //row 2 rounded average
    urhadd    v3.8b, v3.8b, v19.8b
    ld1       {v6.8b, v7.8b}, [x0], x3    //load row 4 in source 1
    ld1       {v22.8b, v23.8b}, [x1], x4  //load row 4 in source 2
    urhadd    v4.8b, v4.8b, v20.8b        //row 3 rounded average
    urhadd    v5.8b, v5.8b, v21.8b
    ld1       {v8.8b, v9.8b}, [x0], x3    //load row 5 in source 1
    ld1       {v24.8b, v25.8b}, [x1], x4  //load row 5 in source 2
    urhadd    v6.8b, v6.8b, v22.8b        //row 4 rounded average
    urhadd    v7.8b, v7.8b, v23.8b
    ld1       {v10.8b, v11.8b}, [x0], x3  //load row 6 in source 1
    ld1       {v26.8b, v27.8b}, [x1], x4  //load row 6 in source 2
    urhadd    v8.8b, v8.8b, v24.8b        //row 5 rounded average
    urhadd    v9.8b, v9.8b, v25.8b
    ld1       {v12.8b, v13.8b}, [x0], x3  //load row 7 in source 1
    ld1       {v28.8b, v29.8b}, [x1], x4  //load row 7 in source 2
    urhadd    v10.8b, v10.8b, v26.8b      //row 6 rounded average
    urhadd    v11.8b, v11.8b, v27.8b
    ld1       {v14.8b, v15.8b}, [x0], x3  //load row 8 in source 1
    ld1       {v30.8b, v31.8b}, [x1], x4  //load row 8 in source 2
    urhadd    v12.8b, v12.8b, v28.8b      //row 7 rounded average
    urhadd    v13.8b, v13.8b, v29.8b
    st1       {v0.8b, v1.8b}, [x2], x5    //store row 1 in destination
    st1       {v2.8b, v3.8b}, [x2], x5    //store row 2 in destination
    urhadd    v14.8b, v14.8b, v30.8b      //row 8 rounded average
    urhadd    v15.8b, v15.8b, v31.8b
    st1       {v4.8b, v5.8b}, [x2], x5    //store row 3 in destination
    st1       {v6.8b, v7.8b}, [x2], x5    //store row 4 in destination
    subs      w6, w6, #8                  //decrement ht by 8 (flags used by bgt below)
    st1       {v8.8b, v9.8b}, [x2], x5    //store row 5 in destination
    st1       {v10.8b, v11.8b}, [x2], x5  //store row 6 in destination
    st1       {v12.8b, v13.8b}, [x2], x5  //store row 7 in destination
    st1       {v14.8b, v15.8b}, [x2], x5  //store row 8 in destination
    bgt       loop_16                     //if greater than 0 repeat the loop again

end_loops:

    pop_v_regs                            //matches push_v_regs at entry
    ret


//*******************************************************************************
//* @function
//*  ih264_default_weighted_pred_chroma_av8()
//*
//* @brief
//*  This routine performs the default weighted prediction as described in sec
//* 8.4.2.3.1 titled "Default weighted sample prediction process" for chroma.
//*
//* @par Description:
//*  This function gets two ht x wd blocks, calculates their rounded-average and
//* stores it in the destination block for U and V.
//*
//* @param[in] puc_src1:
//*  UWORD8 Pointer to the buffer containing the first input block.
//*
//* @param[in] puc_src2:
//*  UWORD8 Pointer to the buffer containing the second input block.
//*
//* @param[out] puc_dst
//*  UWORD8 pointer to the destination where the output block is stored.
//*
//* @param[in] src_strd1
//*  Stride of the first input buffer
//*
//* @param[in] src_strd2
//*  Stride of the second input buffer
//*
//* @param[in] dst_strd
//*  Stride of the destination buffer
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*  None
//*
//* @remarks
//*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
//*  Any wd other than 4 or 8 is handled by the 2-wide path.
//*  Each row moves 2 * wd bytes (4, 8 or 16), presumably wd interleaved U/V
//*  pairs. Rows are consumed in groups of two (wd = 2, 4) or four (wd = 8);
//*  ht is expected to be a multiple of the group size.
//*
//*******************************************************************************
//*/
//void ih264_default_weighted_pred_chroma_av8(UWORD8 *puc_src1,
//                                            UWORD8 *puc_src2,
//                                            UWORD8 *puc_dst,
//                                            WORD32 src_strd1,
//                                            WORD32 src_strd2,
//                                            WORD32 dst_strd,
//                                            UWORD8 ht,
//                                            UWORD8 wd)
//
//**************Variables Vs Registers*****************************************
//    x0      => puc_src1
//    x1      => puc_src2
//    x2      => puc_dst
//    w3      => src_strd1 (sign-extended to x3 on entry)
//    w4      => src_strd2 (sign-extended to x4 on entry)
//    w5      => dst_strd  (sign-extended to x5 on entry)
//    w6      => ht        (zero-extended from 8 bits on entry)
//    w7      => wd        (zero-extended from 8 bits on entry)
//




    .global ih264_default_weighted_pred_chroma_av8

ih264_default_weighted_pred_chroma_av8:

    //loop_8_uv overwrites v8-v15; push_v_regs/pop_v_regs are macros defined
    //outside this file view and presumably spill/restore those registers
    push_v_regs

    //The strides are 32-bit signed arguments but are used below as 64-bit
    //post-index registers; the upper halves of x3-x5 are not guaranteed by
    //the procedure call standard, so widen them explicitly.
    sxtw      x3, w3                      //x3 = (WORD64)src_strd1
    sxtw      x4, w4                      //x4 = (WORD64)src_strd2
    sxtw      x5, w5                      //x5 = (WORD64)dst_strd

    //ht and wd are 8-bit arguments; bits 8-31 of w6/w7 are unspecified on
    //entry, so clear them before comparing / counting down.
    uxtb      w6, w6                      //w6 = ht
    uxtb      w7, w7                      //w7 = wd

    cmp       w7, #8
    beq       loop_8_uv                   //branch if wd is 8
    cmp       w7, #4
    beq       loop_4_uv                   //branch if wd is 4

loop_2_uv:                                //each iteration processes two rows

    ld1       {v0.s}[0], [x0], x3         //load row 1 in source 1
    ld1       {v0.s}[1], [x0], x3         //load row 2 in source 1
    ld1       {v1.s}[0], [x1], x4         //load row 1 in source 2
    ld1       {v1.s}[1], [x1], x4         //load row 2 in source 2
    urhadd    v0.8b, v0.8b, v1.8b         //rows 1-2: (src1 + src2 + 1) >> 1
    subs      w6, w6, #2                  //decrement ht by 2 (flags used by bgt below)
    st1       {v0.s}[0], [x2], x5         //store row 1 in destination
    st1       {v0.s}[1], [x2], x5         //store row 2 in destination
    bgt       loop_2_uv                   //if greater than 0 repeat the loop again
    b         end_loops_uv

loop_4_uv:                                //each iteration processes two rows

    ld1       {v0.8b}, [x0], x3           //load row 1 in source 1
    ld1       {v2.8b}, [x1], x4           //load row 1 in source 2
    ld1       {v1.8b}, [x0], x3           //load row 2 in source 1
    urhadd    v0.8b, v0.8b, v2.8b         //row 1 rounded average
    ld1       {v3.8b}, [x1], x4           //load row 2 in source 2
    urhadd    v1.8b, v1.8b, v3.8b         //row 2 rounded average
    st1       {v0.8b}, [x2], x5           //store row 1 in destination
    subs      w6, w6, #2                  //decrement ht by 2 (flags used by bgt below)
    st1       {v1.8b}, [x2], x5           //store row 2 in destination
    bgt       loop_4_uv                   //if greater than 0 repeat the loop again
    b         end_loops_uv

loop_8_uv:                                //each iteration processes four rows

    //Each 16-byte row is held as two 8-byte halves in a register pair:
    //source 1 rows in v0-v7, source 2 rows in v8-v15.
    ld1       {v0.8b, v1.8b}, [x0], x3    //load row 1 in source 1
    ld1       {v8.8b, v9.8b}, [x1], x4    //load row 1 in source 2
    ld1       {v2.8b, v3.8b}, [x0], x3    //load row 2 in source 1
    urhadd    v0.8b, v0.8b, v8.8b         //row 1 rounded average
    urhadd    v1.8b, v1.8b, v9.8b
    ld1       {v10.8b, v11.8b}, [x1], x4  //load row 2 in source 2
    ld1       {v4.8b, v5.8b}, [x0], x3    //load row 3 in source 1
    urhadd    v2.8b, v2.8b, v10.8b        //row 2 rounded average
    urhadd    v3.8b, v3.8b, v11.8b
    ld1       {v12.8b, v13.8b}, [x1], x4  //load row 3 in source 2
    ld1       {v6.8b, v7.8b}, [x0], x3    //load row 4 in source 1
    urhadd    v4.8b, v4.8b, v12.8b        //row 3 rounded average
    urhadd    v5.8b, v5.8b, v13.8b
    ld1       {v14.8b, v15.8b}, [x1], x4  //load row 4 in source 2
    st1       {v0.8b, v1.8b}, [x2], x5    //store row 1 in destination
    urhadd    v6.8b, v6.8b, v14.8b        //row 4 rounded average
    urhadd    v7.8b, v7.8b, v15.8b
    st1       {v2.8b, v3.8b}, [x2], x5    //store row 2 in destination
    subs      w6, w6, #4                  //decrement ht by 4 (flags used by bgt below)
    st1       {v4.8b, v5.8b}, [x2], x5    //store row 3 in destination
    st1       {v6.8b, v7.8b}, [x2], x5    //store row 4 in destination
    bgt       loop_8_uv                   //if greater than 0 repeat the loop again

end_loops_uv:

    pop_v_regs                            //matches push_v_regs at entry
    ret


