//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_inter_pred_luma_horz_qpel_av8.s
//*
//* @brief
//*  Contains function definitions for inter prediction horizontal quarter pel interpolation.
//*
//* @author
//*  Mohit
//*
//* @par List of Functions:
//*
//*  - ih264_inter_pred_luma_horz_qpel_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

///* All the functions here are replicated from ih264_inter_pred_filters.c
//

///**
//*******************************************************************************
//*
//* @brief
//*     Quarter pel inter prediction luma filter for horizontal input
//*
//* @par Description:
//*     Applies the 6 tap horizontal filter (1, -5, 20, 20, -5, 1) to each row,
//*     rounds and clips the result to 8 bits ((sum + 16) >> 5), and then takes
//*     the rounded average of that half-pel value with the source sample at
//*     pu1_src + (x_offset >> 1), where x_offset = dydx & 3.
//*     sec 8.4.2.2.1 titled "Luma sample interpolation process"
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*  (wd == 8 and wd == 4 take dedicated paths; every other value falls
//*   through to the 16-wide path)
//*
//* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
//*
//* @param[in] dydx: x and y reference offset for qpel calculations.
//*  Only bits [1:0] (the x offset) are read by this function.
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

//void ih264_inter_pred_luma_horz_qpel_av8 (
//                            UWORD8 *pu1_src,
//                            UWORD8 *pu1_dst,
//                            WORD32 src_strd,
//                            WORD32 dst_strd,
//                            WORD32 ht,
//                            WORD32 wd,
//                            UWORD8* pu1_tmp,
//                            UWORD32 dydx)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src   (rewound by 2 so that each load starts at tap a0)
//    x1 => *pu1_dst
//    x2 => src_strd
//    x3 => dst_strd
//    x4 => ht
//    x5 => wd
//    x7 => dydx, then reused as pointer to the samples averaged with the
//          filter output
//    x14 => (ht - 16) + number of 8-row passes started (16/8-wide loop control)
//    v0 => filter coefficient 5,  v1 => filter coefficient 20
//
// Tap naming used in the comments below: a0..a5 are the six consecutive
// source bytes starting at (pu1_src - 2); "ext ..., #k" of a loaded row
// yields the vector of a_k values.  "lo"/"hi" are the left/right 8 output
// columns of a 16-wide row.

.text
.p2align 2
.include "ih264_neon_macros.s"




    .global ih264_inter_pred_luma_horz_qpel_av8

ih264_inter_pred_luma_horz_qpel_av8:


    push_v_regs
    // x19/x20 are not referenced by the code below; the save/restore pair is
    // kept unchanged from the original.
    stp       x19, x20, [sp, #-16]!

    // The prototype passes the strides, ht and wd as 32-bit WORD32 values.
    // AAPCS64 leaves bits [63:32] of those argument registers unspecified, so
    // widen them before they are used as 64-bit post-index offsets and in
    // 64-bit compares.
    sxtw      x2, w2                            // src_strd
    sxtw      x3, w3                            // dst_strd
    sxtw      x4, w4                            // ht
    sxtw      x5, w5                            // wd

    and       x7, x7, #3                        // x_offset = dydx & 3 (also clears the upper bits of dydx)
    add       x7, x0, x7, lsr #1                // x7 = pu1_src + (x_offset >> 1)
    sub       x0, x0, #2                        // x0 = pu1_src - 2 (position of tap a0)
    sub       x14, x4, #16                      // x14 = ht - 16, loop control for the 16/8-wide paths
    movi      v0.16b, #5                        // filter coeff 5
    subs      x12, x5, #8                       // wd == 8 ?
    movi      v1.16b, #20                       // filter coeff 20

    beq       loop_8

    subs      x12, x5, #4                       // wd == 4 ?
    beq       loop_4

// ---------------------------------------------------------------------------
// 16-wide path.  One pass produces 8 rows; the pass is repeated once more
// only when ht == 16 (see the loop test at the bottom).
// Even rows live in v2/v3/v4 with accumulators v8 (lo) / v10 (hi);
// odd rows live in v5/v6/v7 with accumulators v14 (lo) / v16 (hi).
// ---------------------------------------------------------------------------
loop_16:
    // Processing row0 and row1
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2   // load row0 (24 bytes)
    add       x14, x14, #1                      // count this pass
    ext       v31.8b, v2.8b, v3.8b, #5
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2   // load row1
    ext       v30.8b, v3.8b, v4.8b, #5
    uaddl     v8.8h, v31.8b, v2.8b              // a0 + a5                              (row0, lo)
    ext       v28.8b, v5.8b, v6.8b, #5
    uaddl     v10.8h, v30.8b, v3.8b             // a0 + a5                              (row0, hi)
    ext       v27.8b, v6.8b, v7.8b, #5
    uaddl     v14.8h, v28.8b, v5.8b             // a0 + a5                              (row1, lo)
    ext       v31.8b, v2.8b, v3.8b, #2
    uaddl     v16.8h, v27.8b, v6.8b             // a0 + a5                              (row1, hi)
    ext       v30.8b, v3.8b, v4.8b, #2
    umlal     v8.8h, v31.8b, v1.8b              // a0 + a5 + 20a2                       (row0, lo)
    ext       v28.8b, v5.8b, v6.8b, #2
    umlal     v10.8h, v30.8b, v1.8b             // a0 + a5 + 20a2                       (row0, hi)
    ext       v27.8b, v6.8b, v7.8b, #2
    umlal     v14.8h, v28.8b, v1.8b             // a0 + a5 + 20a2                       (row1, lo)
    ext       v31.8b, v2.8b, v3.8b, #3
    umlal     v16.8h, v27.8b, v1.8b             // a0 + a5 + 20a2                       (row1, hi)
    ext       v30.8b, v3.8b, v4.8b, #3
    umlal     v8.8h, v31.8b, v1.8b              // a0 + a5 + 20a2 + 20a3                (row0, lo)
    ext       v28.8b, v5.8b, v6.8b, #3
    umlal     v10.8h, v30.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row0, hi)
    ext       v27.8b, v6.8b, v7.8b, #3
    umlal     v14.8h, v28.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row1, lo)
    ext       v31.8b, v2.8b, v3.8b, #1
    umlal     v16.8h, v27.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row1, hi)
    ext       v30.8b, v3.8b, v4.8b, #1
    umlsl     v8.8h, v31.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1          (row0, lo)
    ext       v28.8b, v5.8b, v6.8b, #1
    umlsl     v10.8h, v30.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row0, hi)
    ext       v27.8b, v6.8b, v7.8b, #1
    umlsl     v14.8h, v28.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row1, lo)
    ext       v31.8b, v2.8b, v3.8b, #4
    umlsl     v16.8h, v27.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row1, hi)
    ext       v30.8b, v3.8b, v4.8b, #4
    umlsl     v8.8h, v31.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row0, lo)
    ext       v28.8b, v5.8b, v6.8b, #4
    umlsl     v10.8h, v30.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row0, hi)
    ext       v27.8b, v6.8b, v7.8b, #4
    umlsl     v14.8h, v28.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row1, lo)
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2   // load row2
    umlsl     v16.8h, v27.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row1, hi)

    ld1       {v12.2s, v13.2s}, [x7], x2        // samples to average with (row0)
    sqrshrun  v20.8b, v8.8h, #5                 // clip((sum + 16) >> 5)                (row0, lo)
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2   // load row3
    sqrshrun  v21.8b, v10.8h, #5                // clip((sum + 16) >> 5)                (row0, hi)
    ext       v31.8b, v2.8b, v3.8b, #5
    urhadd    v20.16b, v12.16b, v20.16b         // qpel: rounded average with the source sample
    urhadd    v21.16b, v13.16b, v21.16b

    sqrshrun  v18.8b, v14.8h, #5                // clip((sum + 16) >> 5)                (row1, lo)
    st1       {v20.8b, v21.8b}, [x1], x3        // store dest row0
    ext       v30.8b, v3.8b, v4.8b, #5
    sqrshrun  v19.8b, v16.8h, #5                // clip((sum + 16) >> 5)                (row1, hi)



    // Processing row2 and row3
    ld1       {v12.2s, v13.2s}, [x7], x2        // samples to average with (row1)
    ext       v28.8b, v5.8b, v6.8b, #5
    urhadd    v18.16b, v12.16b, v18.16b         // qpel average (row1)
    urhadd    v19.16b, v13.16b, v19.16b

    uaddl     v8.8h, v31.8b, v2.8b              // a0 + a5                              (row2, lo)
    st1       {v18.8b, v19.8b}, [x1], x3        // store dest row1
    uaddl     v10.8h, v30.8b, v3.8b             // a0 + a5                              (row2, hi)
    ext       v27.8b, v6.8b, v7.8b, #5
    uaddl     v14.8h, v28.8b, v5.8b             // a0 + a5                              (row3, lo)
    ext       v31.8b, v2.8b, v3.8b, #2
    uaddl     v16.8h, v27.8b, v6.8b             // a0 + a5                              (row3, hi)
    ext       v30.8b, v3.8b, v4.8b, #2
    umlal     v8.8h, v31.8b, v1.8b              // a0 + a5 + 20a2                       (row2, lo)
    ext       v27.8b, v6.8b, v7.8b, #2
    umlal     v10.8h, v30.8b, v1.8b             // a0 + a5 + 20a2                       (row2, hi)
    ext       v28.8b, v5.8b, v6.8b, #2
    umlal     v14.8h, v28.8b, v1.8b             // a0 + a5 + 20a2                       (row3, lo)
    ext       v31.8b, v2.8b, v3.8b, #3
    umlal     v16.8h, v27.8b, v1.8b             // a0 + a5 + 20a2                       (row3, hi)
    ext       v30.8b, v3.8b, v4.8b, #3
    umlal     v8.8h, v31.8b, v1.8b              // a0 + a5 + 20a2 + 20a3                (row2, lo)
    ext       v28.8b, v5.8b, v6.8b, #3
    umlal     v10.8h, v30.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row2, hi)
    ext       v27.8b, v6.8b, v7.8b, #3
    umlal     v14.8h, v28.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row3, lo)
    ext       v31.8b, v2.8b, v3.8b, #1
    umlal     v16.8h, v27.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row3, hi)
    ext       v30.8b, v3.8b, v4.8b, #1
    umlsl     v8.8h, v31.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1          (row2, lo)
    ext       v28.8b, v5.8b, v6.8b, #1
    umlsl     v10.8h, v30.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row2, hi)
    ext       v27.8b, v6.8b, v7.8b, #1
    umlsl     v14.8h, v28.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row3, lo)
    ext       v31.8b, v2.8b, v3.8b, #4
    umlsl     v16.8h, v27.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row3, hi)
    ext       v30.8b, v3.8b, v4.8b, #4
    umlsl     v8.8h, v31.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row2, lo)
    ext       v28.8b, v5.8b, v6.8b, #4
    umlsl     v10.8h, v30.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row2, hi)
    ext       v27.8b, v6.8b, v7.8b, #4
    umlsl     v14.8h, v28.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row3, lo)
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2   // load row4
    umlsl     v16.8h, v27.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row3, hi)

    ld1       {v12.2s, v13.2s}, [x7], x2        // samples to average with (row2)
    sqrshrun  v20.8b, v8.8h, #5                 // clip((sum + 16) >> 5)                (row2, lo)
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2   // load row5
    sqrshrun  v21.8b, v10.8h, #5                // clip((sum + 16) >> 5)                (row2, hi)
    ext       v31.8b, v2.8b, v3.8b, #5
    urhadd    v20.16b, v12.16b, v20.16b         // qpel average (row2)
    urhadd    v21.16b, v13.16b, v21.16b

    sqrshrun  v18.8b, v14.8h, #5                // clip((sum + 16) >> 5)                (row3, lo)
    ext       v30.8b, v3.8b, v4.8b, #5
    st1       {v20.8b, v21.8b}, [x1], x3        // store dest row2
    sqrshrun  v19.8b, v16.8h, #5                // clip((sum + 16) >> 5)                (row3, hi)
    ld1       {v12.2s, v13.2s}, [x7], x2        // samples to average with (row3)

    // Processing row4 and row5
    ext       v28.8b, v5.8b, v6.8b, #5
    urhadd    v18.16b, v12.16b, v18.16b         // qpel average (row3)
    urhadd    v19.16b, v13.16b, v19.16b

    uaddl     v8.8h, v31.8b, v2.8b              // a0 + a5                              (row4, lo)
    st1       {v18.8b, v19.8b}, [x1], x3        // store dest row3
    uaddl     v10.8h, v30.8b, v3.8b             // a0 + a5                              (row4, hi)
    ext       v27.8b, v6.8b, v7.8b, #5
    uaddl     v14.8h, v28.8b, v5.8b             // a0 + a5                              (row5, lo)
    ext       v31.8b, v2.8b, v3.8b, #2
    uaddl     v16.8h, v27.8b, v6.8b             // a0 + a5                              (row5, hi)
    ext       v30.8b, v3.8b, v4.8b, #2
    umlal     v8.8h, v31.8b, v1.8b              // a0 + a5 + 20a2                       (row4, lo)
    ext       v27.8b, v6.8b, v7.8b, #2
    umlal     v10.8h, v30.8b, v1.8b             // a0 + a5 + 20a2                       (row4, hi)
    ext       v28.8b, v5.8b, v6.8b, #2
    umlal     v14.8h, v28.8b, v1.8b             // a0 + a5 + 20a2                       (row5, lo)
    ext       v31.8b, v2.8b, v3.8b, #3
    umlal     v16.8h, v27.8b, v1.8b             // a0 + a5 + 20a2                       (row5, hi)
    ext       v30.8b, v3.8b, v4.8b, #3
    umlal     v8.8h, v31.8b, v1.8b              // a0 + a5 + 20a2 + 20a3                (row4, lo)
    ext       v28.8b, v5.8b, v6.8b, #3
    umlal     v10.8h, v30.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row4, hi)
    ext       v27.8b, v6.8b, v7.8b, #3
    umlal     v14.8h, v28.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row5, lo)
    ext       v31.8b, v2.8b, v3.8b, #1
    umlal     v16.8h, v27.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row5, hi)
    ext       v30.8b, v3.8b, v4.8b, #1
    umlsl     v8.8h, v31.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1          (row4, lo)
    ext       v28.8b, v5.8b, v6.8b, #1
    umlsl     v10.8h, v30.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row4, hi)
    ext       v27.8b, v6.8b, v7.8b, #1
    umlsl     v14.8h, v28.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row5, lo)
    ext       v31.8b, v2.8b, v3.8b, #4
    umlsl     v16.8h, v27.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row5, hi)
    ext       v30.8b, v3.8b, v4.8b, #4
    umlsl     v8.8h, v31.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row4, lo)
    ext       v28.8b, v5.8b, v6.8b, #4
    umlsl     v10.8h, v30.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row4, hi)
    ext       v27.8b, v6.8b, v7.8b, #4
    umlsl     v14.8h, v28.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row5, lo)
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2   // load row6
    umlsl     v16.8h, v27.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row5, hi)
    ld1       {v12.2s, v13.2s}, [x7], x2        // samples to average with (row4)
    sqrshrun  v20.8b, v8.8h, #5                 // clip((sum + 16) >> 5)                (row4, lo)
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2   // load row7
    sqrshrun  v21.8b, v10.8h, #5                // clip((sum + 16) >> 5)                (row4, hi)
    ext       v31.8b, v2.8b, v3.8b, #5
    urhadd    v20.16b, v12.16b, v20.16b         // qpel average (row4)
    urhadd    v21.16b, v13.16b, v21.16b

    sqrshrun  v18.8b, v14.8h, #5                // clip((sum + 16) >> 5)                (row5, lo)
    st1       {v20.8b, v21.8b}, [x1], x3        // store dest row4
    ext       v30.8b, v3.8b, v4.8b, #5
    sqrshrun  v19.8b, v16.8h, #5                // clip((sum + 16) >> 5)                (row5, hi)
    ld1       {v12.2s, v13.2s}, [x7], x2        // samples to average with (row5)


    // Processing row6 and row7

    ext       v28.8b, v5.8b, v6.8b, #5
    urhadd    v18.16b, v12.16b, v18.16b         // qpel average (row5)
    urhadd    v19.16b, v13.16b, v19.16b

    uaddl     v8.8h, v31.8b, v2.8b              // a0 + a5                              (row6, lo)
    st1       {v18.8b, v19.8b}, [x1], x3        // store dest row5
    uaddl     v10.8h, v30.8b, v3.8b             // a0 + a5                              (row6, hi)
    ext       v27.8b, v6.8b, v7.8b, #5
    uaddl     v14.8h, v28.8b, v5.8b             // a0 + a5                              (row7, lo)
    ext       v31.8b, v2.8b, v3.8b, #2
    uaddl     v16.8h, v27.8b, v6.8b             // a0 + a5                              (row7, hi)
    ext       v30.8b, v3.8b, v4.8b, #2
    umlal     v8.8h, v31.8b, v1.8b              // a0 + a5 + 20a2                       (row6, lo)
    ext       v27.8b, v6.8b, v7.8b, #2
    umlal     v10.8h, v30.8b, v1.8b             // a0 + a5 + 20a2                       (row6, hi)
    ext       v28.8b, v5.8b, v6.8b, #2
    umlal     v14.8h, v28.8b, v1.8b             // a0 + a5 + 20a2                       (row7, lo)
    ext       v31.8b, v2.8b, v3.8b, #3
    umlal     v16.8h, v27.8b, v1.8b             // a0 + a5 + 20a2                       (row7, hi)
    ext       v30.8b, v3.8b, v4.8b, #3
    umlal     v8.8h, v31.8b, v1.8b              // a0 + a5 + 20a2 + 20a3                (row6, lo)
    ext       v28.8b, v5.8b, v6.8b, #3
    umlal     v10.8h, v30.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row6, hi)
    ext       v27.8b, v6.8b, v7.8b, #3
    umlal     v14.8h, v28.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row7, lo)
    ext       v31.8b, v2.8b, v3.8b, #1
    umlal     v16.8h, v27.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row7, hi)
    ext       v30.8b, v3.8b, v4.8b, #1
    umlsl     v8.8h, v31.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1          (row6, lo)
    ext       v28.8b, v5.8b, v6.8b, #1
    umlsl     v10.8h, v30.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row6, hi)
    ext       v27.8b, v6.8b, v7.8b, #1
    umlsl     v14.8h, v28.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row7, lo)
    ext       v31.8b, v2.8b, v3.8b, #4
    umlsl     v16.8h, v27.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row7, hi)
    ext       v30.8b, v3.8b, v4.8b, #4
    umlsl     v8.8h, v31.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row6, lo)
    ext       v28.8b, v5.8b, v6.8b, #4
    umlsl     v10.8h, v30.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row6, hi)
    ext       v27.8b, v6.8b, v7.8b, #4
    ld1       {v12.2s, v13.2s}, [x7], x2        // samples to average with (row6)
    sqrshrun  v20.8b, v8.8h, #5                 // clip((sum + 16) >> 5)                (row6, lo)
    umlsl     v14.8h, v28.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row7, lo)
    sqrshrun  v21.8b, v10.8h, #5                // clip((sum + 16) >> 5)                (row6, hi)
    umlsl     v16.8h, v27.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row7, hi)
    urhadd    v20.16b, v12.16b, v20.16b         // qpel average (row6)
    urhadd    v21.16b, v13.16b, v21.16b

    ld1       {v12.2s, v13.2s}, [x7], x2        // samples to average with (row7)
    sqrshrun  v18.8b, v14.8h, #5                // clip((sum + 16) >> 5)                (row7, lo)
    st1       {v20.8b, v21.8b}, [x1], x3        // store dest row6
    sqrshrun  v19.8b, v16.8h, #5                // clip((sum + 16) >> 5)                (row7, hi)
    urhadd    v18.16b, v12.16b, v18.16b         // qpel average (row7)
    urhadd    v19.16b, v13.16b, v19.16b

    // x14 == 1 here only on the first pass of a ht == 16 block
    // ((16 - 16) + 1); in that case run one more 8-row pass.
    subs      x12, x14, #1
    st1       {v18.8b, v19.8b}, [x1], x3        // store dest row7



    beq       loop_16
    b         end_func

// ---------------------------------------------------------------------------
// 8-wide path.  One pass produces up to 8 rows, with an early exit after
// 4 rows when ht == 4; the pass is repeated once more only when ht == 16.
// In the first row pair v5/v6 holds row0 and v2/v3 holds row1; from the
// second pair on v2/v3 holds the even row and v5/v6 the odd row.
// Rows fed through v5/v6 accumulate in v14, rows fed through v2/v3 in v8.
// ---------------------------------------------------------------------------
loop_8:
    // Processing row0 and row1

    ld1       {v5.8b, v6.8b}, [x0], x2          // load row0 (16 bytes)
    add       x14, x14, #1                      // count this pass
    ext       v28.8b, v5.8b, v6.8b, #5
    ld1       {v2.8b, v3.8b}, [x0], x2          // load row1
    ext       v25.8b, v5.8b, v6.8b, #2
    ext       v31.8b, v2.8b, v3.8b, #5
    ext       v24.8b, v5.8b, v6.8b, #3
    ext       v23.8b, v5.8b, v6.8b, #1
    ext       v22.8b, v5.8b, v6.8b, #4
    uaddl     v14.8h, v28.8b, v5.8b             // a0 + a5                              (row0)
    ext       v29.8b, v2.8b, v3.8b, #3
    umlal     v14.8h, v25.8b, v1.8b             // a0 + a5 + 20a2                       (row0)
    umlal     v14.8h, v24.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row0)
    umlsl     v14.8h, v23.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row0)
    umlsl     v14.8h, v22.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row0)
    ext       v30.8b, v2.8b, v3.8b, #2
    uaddl     v8.8h, v31.8b, v2.8b              // a0 + a5                              (row1)
    ext       v27.8b, v2.8b, v3.8b, #1
    ext       v26.8b, v2.8b, v3.8b, #4
    ld1       {v2.8b, v3.8b}, [x0], x2          // load row2
    umlal     v8.8h, v29.8b, v1.8b              // a0 + a5 + 20a3                       (row1)
    umlal     v8.8h, v30.8b, v1.8b              // a0 + a5 + 20a3 + 20a2                (row1)
    umlsl     v8.8h, v27.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1          (row1)
    umlsl     v8.8h, v26.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row1)
    ld1       {v5.8b, v6.8b}, [x0], x2          // load row3
    sqrshrun  v18.8b, v14.8h, #5                // clip((sum + 16) >> 5)                (row0)

    // Processing row2 and row3
    ext       v28.8b, v5.8b, v6.8b, #5
    ext       v25.8b, v5.8b, v6.8b, #2
    ext       v31.8b, v2.8b, v3.8b, #5
    uaddl     v14.8h, v28.8b, v5.8b             // a0 + a5                              (row3)
    ld1       {v12.2s}, [x7], x2                // samples to average with (row0)
    ld1       {v13.2s}, [x7], x2                // samples to average with (row1)
    ext       v24.8b, v5.8b, v6.8b, #3
    ext       v23.8b, v5.8b, v6.8b, #1
    sqrshrun  v19.8b, v8.8h, #5                 // clip((sum + 16) >> 5)                (row1)
    ext       v22.8b, v5.8b, v6.8b, #4
    ext       v29.8b, v2.8b, v3.8b, #3
    umlal     v14.8h, v25.8b, v1.8b             // a0 + a5 + 20a2                       (row3)
    umlal     v14.8h, v24.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row3)
    umlsl     v14.8h, v23.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row3)
    umlsl     v14.8h, v22.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row3)
    urhadd    v18.16b, v12.16b, v18.16b         // qpel average (row0)
    urhadd    v19.16b, v13.16b, v19.16b         // qpel average (row1)

    st1       {v18.8b}, [x1], x3                // store dest row0
    st1       {v19.8b}, [x1], x3                // store dest row1
    uaddl     v8.8h, v31.8b, v2.8b              // a0 + a5                              (row2)
    ext       v30.8b, v2.8b, v3.8b, #2
    ext       v27.8b, v2.8b, v3.8b, #1
    ext       v26.8b, v2.8b, v3.8b, #4
    // NOTE(review): rows 4 and 5 are loaded here before the ht == 4 exit
    // below, so an 8x4 block reads two source rows past its last row.
    ld1       {v2.8b, v3.8b}, [x0], x2          // load row4
    umlal     v8.8h, v29.8b, v1.8b              // a0 + a5 + 20a3                       (row2)
    umlal     v8.8h, v30.8b, v1.8b              // a0 + a5 + 20a3 + 20a2                (row2)
    umlsl     v8.8h, v27.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1          (row2)
    umlsl     v8.8h, v26.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row2)
    ld1       {v5.8b, v6.8b}, [x0], x2          // load row5
    subs      x9, x4, #4                        // sets Z when ht == 4; consumed by the beq below
    sqrshrun  v19.8b, v14.8h, #5                // clip((sum + 16) >> 5)                (row3)
    ld1       {v12.2s}, [x7], x2                // samples to average with (row2)
    ld1       {v13.2s}, [x7], x2                // samples to average with (row3)
    ext       v28.8b, v5.8b, v6.8b, #5
    ext       v25.8b, v5.8b, v6.8b, #2
    ext       v31.8b, v2.8b, v3.8b, #5
    uaddl     v14.8h, v28.8b, v5.8b             // a0 + a5                              (row5)
    ext       v24.8b, v5.8b, v6.8b, #3
    sqrshrun  v18.8b, v8.8h, #5                 // clip((sum + 16) >> 5)                (row2)
    ext       v22.8b, v5.8b, v6.8b, #4
    ext       v29.8b, v2.8b, v3.8b, #3
    urhadd    v18.16b, v12.16b, v18.16b         // qpel average (row2)
    urhadd    v19.16b, v13.16b, v19.16b         // qpel average (row3)

    st1       {v18.8b}, [x1], x3                // store dest row2
    ext       v30.8b, v2.8b, v3.8b, #2
    uaddl     v8.8h, v31.8b, v2.8b              // a0 + a5                              (row4)
    st1       {v19.8b}, [x1], x3                // store dest row3
    beq       end_func                          // done if ht == 4

    // Processing row4 and row5
    ext       v23.8b, v5.8b, v6.8b, #1
    umlal     v14.8h, v25.8b, v1.8b             // a0 + a5 + 20a2                       (row5)
    umlal     v14.8h, v24.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row5)
    umlsl     v14.8h, v23.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row5)
    umlsl     v14.8h, v22.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row5)
    ext       v27.8b, v2.8b, v3.8b, #1
    ext       v26.8b, v2.8b, v3.8b, #4
    ld1       {v2.8b, v3.8b}, [x0], x2          // load row6
    umlal     v8.8h, v29.8b, v1.8b              // a0 + a5 + 20a3                       (row4)
    umlal     v8.8h, v30.8b, v1.8b              // a0 + a5 + 20a3 + 20a2                (row4)
    umlsl     v8.8h, v27.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1          (row4)
    umlsl     v8.8h, v26.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row4)
    sqrshrun  v19.8b, v14.8h, #5                // clip((sum + 16) >> 5)                (row5)
    ld1       {v5.8b, v6.8b}, [x0], x2          // load row7
    ext       v31.8b, v2.8b, v3.8b, #5
    ext       v28.8b, v5.8b, v6.8b, #5
    ld1       {v12.2s}, [x7], x2                // samples to average with (row4)
    ld1       {v13.2s}, [x7], x2                // samples to average with (row5)
    ext       v25.8b, v5.8b, v6.8b, #2
    uaddl     v14.8h, v28.8b, v5.8b             // a0 + a5                              (row7)
    ext       v24.8b, v5.8b, v6.8b, #3
    ext       v22.8b, v5.8b, v6.8b, #4
    sqrshrun  v18.8b, v8.8h, #5                 // clip((sum + 16) >> 5)                (row4)
    ext       v29.8b, v2.8b, v3.8b, #3
    ext       v30.8b, v2.8b, v3.8b, #2
    urhadd    v18.16b, v12.16b, v18.16b         // qpel average (row4)
    urhadd    v19.16b, v13.16b, v19.16b         // qpel average (row5)

    st1       {v18.8b}, [x1], x3                // store dest row4
    ext       v27.8b, v2.8b, v3.8b, #1
    uaddl     v8.8h, v31.8b, v2.8b              // a0 + a5                              (row6)
    ext       v26.8b, v2.8b, v3.8b, #4
    umlal     v8.8h, v29.8b, v1.8b              // a0 + a5 + 20a3                       (row6)
    umlal     v8.8h, v30.8b, v1.8b              // a0 + a5 + 20a3 + 20a2                (row6)
    umlsl     v8.8h, v27.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1          (row6)
    umlsl     v8.8h, v26.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row6)
    // Processing row6 and row7
    st1       {v19.8b}, [x1], x3                // store dest row5
    ext       v23.8b, v5.8b, v6.8b, #1
    umlal     v14.8h, v25.8b, v1.8b             // a0 + a5 + 20a2                       (row7)
    umlal     v14.8h, v24.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row7)
    umlsl     v14.8h, v23.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row7)
    umlsl     v14.8h, v22.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row7)
    ld1       {v12.2s}, [x7], x2                // samples to average with (row6)
    ld1       {v13.2s}, [x7], x2                // samples to average with (row7)
    sqrshrun  v18.8b, v8.8h, #5                 // clip((sum + 16) >> 5)                (row6)
    subs      x12, x14, #1                      // Z set only on the first pass of a ht == 16 block
    sqrshrun  v19.8b, v14.8h, #5                // clip((sum + 16) >> 5)                (row7)
    urhadd    v18.16b, v12.16b, v18.16b         // qpel average (row6)
    urhadd    v19.16b, v13.16b, v19.16b         // qpel average (row7)

    st1       {v18.8b}, [x1], x3                // store dest row6
    st1       {v19.8b}, [x1], x3                // store dest row7

    beq       loop_8                            // one more 8-row pass when ht == 16

    b         end_func

// ---------------------------------------------------------------------------
// 4-wide path.  One pass produces 4 rows (only the low 4 bytes of each result
// vector are stored); the pass is repeated once more only when ht == 8.
// Register roles match the 8-wide path: v5/v6 holds row0 and v2/v3 row1 in
// the first pair, then v2/v3 holds row2 and v5/v6 row3.
// ---------------------------------------------------------------------------
loop_4:
    ld1       {v5.8b, v6.8b}, [x0], x2          // load row0 (16 bytes)
    ext       v28.8b, v5.8b, v6.8b, #5
    ld1       {v2.8b, v3.8b}, [x0], x2          // load row1
    ext       v25.8b, v5.8b, v6.8b, #2
    ext       v31.8b, v2.8b, v3.8b, #5
    uaddl     v14.8h, v28.8b, v5.8b             // a0 + a5                              (row0)
    ext       v24.8b, v5.8b, v6.8b, #3
    ext       v23.8b, v5.8b, v6.8b, #1
    ext       v22.8b, v5.8b, v6.8b, #4
    ext       v29.8b, v2.8b, v3.8b, #3
    umlal     v14.8h, v25.8b, v1.8b             // a0 + a5 + 20a2                       (row0)
    umlal     v14.8h, v24.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row0)
    umlsl     v14.8h, v23.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row0)
    umlsl     v14.8h, v22.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row0)
    uaddl     v8.8h, v31.8b, v2.8b              // a0 + a5                              (row1)
    ext       v30.8b, v2.8b, v3.8b, #2
    ld1       {v12.2s}, [x7], x2                // samples to average with (row0)
    ld1       {v13.2s}, [x7], x2                // samples to average with (row1)
    ext       v27.8b, v2.8b, v3.8b, #1
    ext       v26.8b, v2.8b, v3.8b, #4
    ld1       {v2.8b, v3.8b}, [x0], x2          // load row2
    umlal     v8.8h, v29.8b, v1.8b              // a0 + a5 + 20a3                       (row1)
    umlal     v8.8h, v30.8b, v1.8b              // a0 + a5 + 20a3 + 20a2                (row1)
    umlsl     v8.8h, v27.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1          (row1)
    umlsl     v8.8h, v26.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row1)
    ld1       {v5.8b, v6.8b}, [x0], x2          // load row3
    ext       v28.8b, v5.8b, v6.8b, #5
    ext       v25.8b, v5.8b, v6.8b, #2
    sqrshrun  v18.8b, v14.8h, #5                // clip((sum + 16) >> 5)                (row0)
    ext       v31.8b, v2.8b, v3.8b, #5
    ext       v24.8b, v5.8b, v6.8b, #3

    ext       v23.8b, v5.8b, v6.8b, #1
    ext       v22.8b, v5.8b, v6.8b, #4
    ext       v29.8b, v2.8b, v3.8b, #3
    sqrshrun  v19.8b, v8.8h, #5                 // clip((sum + 16) >> 5)                (row1)
    ext       v30.8b, v2.8b, v3.8b, #2
    ext       v27.8b, v2.8b, v3.8b, #1

    // Processing row2 and row3
    urhadd    v18.16b, v12.16b, v18.16b         // qpel average (row0)
    urhadd    v19.16b, v13.16b, v19.16b         // qpel average (row1)

    st1       {v18.s}[0], [x1], x3              // store dest row0 (4 bytes)
    st1       {v19.s}[0], [x1], x3              // store dest row1 (4 bytes)
    uaddl     v14.8h, v28.8b, v5.8b             // a0 + a5                              (row3)
    ext       v26.8b, v2.8b, v3.8b, #4
    ld1       {v12.2s}, [x7], x2                // samples to average with (row2)
    ld1       {v13.2s}, [x7], x2                // samples to average with (row3)

    umlal     v14.8h, v25.8b, v1.8b             // a0 + a5 + 20a2                       (row3)
    umlal     v14.8h, v24.8b, v1.8b             // a0 + a5 + 20a2 + 20a3                (row3)
    umlsl     v14.8h, v23.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1          (row3)
    umlsl     v14.8h, v22.8b, v0.8b             // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row3)
    uaddl     v8.8h, v31.8b, v2.8b              // a0 + a5                              (row2)
    umlal     v8.8h, v29.8b, v1.8b              // a0 + a5 + 20a3                       (row2)
    umlal     v8.8h, v30.8b, v1.8b              // a0 + a5 + 20a3 + 20a2                (row2)
    umlsl     v8.8h, v27.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1          (row2)
    umlsl     v8.8h, v26.8b, v0.8b              // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row2)
    sqrshrun  v19.8b, v14.8h, #5                // clip((sum + 16) >> 5)                (row3)
    sqrshrun  v18.8b, v8.8h, #5                 // clip((sum + 16) >> 5)                (row2)
    urhadd    v18.16b, v12.16b, v18.16b         // qpel average (row2)
    urhadd    v19.16b, v13.16b, v19.16b         // qpel average (row3)

    st1       {v18.s}[0], [x1], x3              // store dest row2 (4 bytes)
    subs      x4, x4, #8                        // Z set only on the first pass of a ht == 8 block
    st1       {v19.s}[0], [x1], x3              // store dest row3 (4 bytes)

    beq       loop_4                            // one more 4-row pass when ht == 8

end_func:

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


