//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_inter_pred_luma_vert_qpel_av8.s
//*
//* @brief
//*  Contains function definitions for inter prediction vertical quarter pel interpolation.
//*
//* @author
//*  Mohit
//*
//* @par List of Functions:
//*
//*  - ih264_inter_pred_luma_vert_qpel_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

///* All the functions here are replicated from ih264_inter_pred_filters.c
//

///**
//*******************************************************************************
//*
//* @brief
//*     Quarter pel interprediction luma filter for vertical input
//*
//* @par Description:
//*    Applies a 6 tap vertical filter (1, -5, 20, 20, -5, 1) down each column,
//*    rounds and clips the result to 8 bits ((sum + 16) >> 5), and then takes
//*    the rounded average of that half-pel value with the full-pel sample of
//*    the row selected by dydx (pu1_src + ((dydx & 12) >> 3) * src_strd).
//*    sec 8.4.2.2.1 titled "Luma sample interpolation process"
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
//*
//* @param[in] dydx: x and y reference offset for qpel calculations.
//*
//* @returns
//*
//* @remarks
//*  - wd == 8 and wd == 4 take dedicated paths; every other width runs the
//*    16 pixel wide path.
//*  - The 16 and 8 wide paths write 8 rows per pass and run a second pass only
//*    when ht == 16. The 8 and 4 wide paths stop after 4 rows when ht == 4.
//*    The 4 wide path writes at most 8 rows.
//*  - The source is read starting two rows above pu1_src (pu1_src - 2*src_strd).
//*  - The WORD32 arguments are sign-extended on entry: AAPCS64 leaves the upper
//*    32 bits of a 32-bit argument register unspecified, and the strides, height
//*    and width are used below as full 64-bit registers.
//*
//*******************************************************************************
//*/

//void ih264_inter_pred_luma_vert_qpel_av8 (
//                            UWORD8 *pu1_src,
//                            UWORD8 *pu1_dst,
//                            WORD32 src_strd,
//                            WORD32 dst_strd,
//                            WORD32 ht,
//                            WORD32 wd,
//                            UWORD8* pu1_tmp,
//                            UWORD32 dydx)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src   (advanced row by row while loading filter taps)
//    x1 => *pu1_dst   (advanced row by row while storing)
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x4 =>  ht
//    x5 =>  wd
//    x6 => *pu1_tmp   (unused)
//    x7 =>  dydx, then pointer to the full-pel rows used for the final average
//    x9, x12 => scratch destinations of flag-setting subtractions
//    x14 => pass counter derived from ht
//    v22 => filter coefficient 20 in every 16-bit lane
//    v24 => filter coefficient 5 in every 16-bit lane

.text
.p2align 2
.include "ih264_neon_macros.s"



    .global ih264_inter_pred_luma_vert_qpel_av8

ih264_inter_pred_luma_vert_qpel_av8:

    // push_v_regs/pop_v_regs and swp are macros from ih264_neon_macros.s
    push_v_regs
    stp       x19, x20, [sp, #-16]!

    // 32-bit arguments: make the full X registers well defined before use
    sxtw      x2, w2                    // src_strd
    sxtw      x3, w3                    // dst_strd
    sxtw      x4, w4                    // ht
    sxtw      x5, w5                    // wd

    and       x7, x7, #12               // Finds y-offset (also clears the upper bits of dydx)
    lsr       x7, x7, #3                // dydx>>3 : 0 or 1
    mul       x7, x2, x7
    add       x7, x0, x7                // pu1_src + (y_offset>>1)*src_strd
    sub       x14, x4, #16              // x14 = ht - 16
    movi      v22.8h, #20               // Filter coeff 20 (0x14) into v22
    sub       x0, x0, x2, lsl #1        // pu1_src-2*src_strd
    subs      x12, x5, #8               // if wd=8 branch to loop_8
    movi      v24.8h, #5                // Filter coeff 5 into v24
    beq       loop_8_start

    subs      x12, x5, #4               // if wd=4 branch to loop_4
    beq       loop_4_start

    // wd = 16: each row is held as two 8-byte halves (even reg = left, odd reg = right)
    ld1       {v0.2s, v1.2s}, [x0], x2  // Vector load from src[0_0]
    ld1       {v2.2s, v3.2s}, [x0], x2  // Vector load from src[1_0]
    ld1       {v4.2s, v5.2s}, [x0], x2  // Vector load from src[2_0]
    ld1       {v6.2s, v7.2s}, [x0], x2  // Vector load from src[3_0]
    add       x14, x14, #1              // for checking loop: x14 = ht - 15
    ld1       {v8.2s, v9.2s}, [x0], x2  // Vector load from src[4_0]
    uaddl     v12.8h, v4.8b, v6.8b      // temp1 = src[2_0] + src[3_0]
    ld1       {v10.2s, v11.2s}, [x0], x2 // Vector load from src[5_0]

loop_16:                                // when wd=16, 8 rows per pass

    uaddl     v14.8h, v0.8b, v10.8b     // temp = src[0_0] + src[5_0]
    uaddl     v16.8h, v2.8b, v8.8b      // temp2 = src[1_0] + src[4_0]
    mla       v14.8h, v12.8h, v22.8h    // temp += temp1 * 20
    uaddl     v20.8h, v1.8b, v11.8b     // temp4 = src[0_8] + src[5_8]
    uaddl     v18.8h, v5.8b, v7.8b      // temp3 = src[2_8] + src[3_8]
    mla       v20.8h, v18.8h, v22.8h    // temp4 += temp3 * 20
    ld1       {v0.2s, v1.2s}, [x0], x2
    uaddl     v26.8h, v3.8b, v9.8b      // temp5 = src[1_8] + src[4_8]
    uaddl     v12.8h, v6.8b, v8.8b
    mls       v14.8h, v16.8h, v24.8h    // temp -= temp2 * 5
    uaddl     v16.8h, v2.8b, v0.8b
    uaddl     v18.8h, v4.8b, v10.8b
    mla       v16.8h, v12.8h, v22.8h
    mls       v20.8h, v26.8h, v24.8h    // temp4 -= temp5 * 5
    uaddl     v26.8h, v5.8b, v11.8b
    uaddl     v12.8h, v7.8b, v9.8b
    sqrshrun  v30.8b, v14.8h, #5        // dst[0_0] = CLIP_U8((temp +16) >> 5)
    uaddl     v14.8h, v3.8b, v1.8b
    ld1       {v2.2s, v3.2s}, [x0], x2
    mla       v14.8h, v12.8h, v22.8h
    mls       v16.8h, v18.8h, v24.8h
    sqrshrun  v31.8b, v20.8h, #5        // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
    ld1       {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 0
    urhadd    v30.16b, v20.16b, v30.16b // Interpolation to obtain qpel value
    urhadd    v31.16b, v21.16b, v31.16b // Interpolation to obtain qpel value
    uaddl     v18.8h, v4.8b, v2.8b
    uaddl     v12.8h, v8.8b, v10.8b
    st1       {v30.2s, v31.2s}, [x1], x3 // Vector store to dst[0_0]
    mla       v18.8h, v12.8h, v22.8h
    uaddl     v20.8h, v6.8b, v0.8b
    mls       v14.8h, v26.8h, v24.8h
    sqrshrun  v30.8b, v16.8h, #5
    uaddl     v12.8h, v9.8b, v11.8b
    uaddl     v16.8h, v5.8b, v3.8b
    uaddl     v26.8h, v7.8b, v1.8b
    mla       v16.8h, v12.8h, v22.8h
    mls       v18.8h, v20.8h, v24.8h
    ld1       {v4.2s, v5.2s}, [x0], x2
    sqrshrun  v31.8b, v14.8h, #5
    ld1       {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 1
    uaddl     v12.8h, v10.8b, v0.8b
    urhadd    v30.16b, v14.16b, v30.16b // Interpolation to obtain qpel value
    urhadd    v31.16b, v15.16b, v31.16b // Interpolation to obtain qpel value
    uaddl     v14.8h, v6.8b, v4.8b
    uaddl     v20.8h, v8.8b, v2.8b
    mla       v14.8h, v12.8h, v22.8h
    mls       v16.8h, v26.8h, v24.8h
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 1
    sqrshrun  v30.8b, v18.8h, #5
    uaddl     v18.8h, v7.8b, v5.8b
    uaddl     v12.8h, v11.8b, v1.8b
    mla       v18.8h, v12.8h, v22.8h
    uaddl     v26.8h, v9.8b, v3.8b
    mls       v14.8h, v20.8h, v24.8h
    ld1       {v6.2s, v7.2s}, [x0], x2
    sqrshrun  v31.8b, v16.8h, #5
    ld1       {v16.2s, v17.2s}, [x7], x2 // Load for interpolation row 2
    mls       v18.8h, v26.8h, v24.8h
    urhadd    v30.16b, v16.16b, v30.16b // Interpolation to obtain qpel value
    urhadd    v31.16b, v17.16b, v31.16b // Interpolation to obtain qpel value
    uaddl     v12.8h, v0.8b, v2.8b      // temp1 = src[2_0] + src[3_0]
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 2
    uaddl     v16.8h, v10.8b, v4.8b     // temp2 = src[1_0] + src[4_0]
    uaddl     v20.8h, v9.8b, v7.8b      // temp4 = src[0_8] + src[5_8]
    sqrshrun  v30.8b, v14.8h, #5
    uaddl     v26.8h, v5.8b, v11.8b     // temp5 = src[1_8] + src[4_8]
    uaddl     v14.8h, v8.8b, v6.8b      // temp = src[0_0] + src[5_0]
    sqrshrun  v31.8b, v18.8h, #5
    ld1       {v18.2s, v19.2s}, [x7], x2 // Load for interpolation row 3
    mla       v14.8h, v12.8h, v22.8h    // temp += temp1 * 20
    urhadd    v30.16b, v18.16b, v30.16b // Interpolation to obtain qpel value
    urhadd    v31.16b, v19.16b, v31.16b // Interpolation to obtain qpel value
    uaddl     v18.8h, v1.8b, v3.8b      // temp3 = src[2_8] + src[3_8]
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 3
    // 4 rows processed
    mla       v20.8h, v18.8h, v22.8h    // temp4 += temp3 * 20
    ld1       {v8.2s, v9.2s}, [x0], x2
    uaddl     v12.8h, v2.8b, v4.8b
    uaddl     v18.8h, v3.8b, v5.8b
    mls       v14.8h, v16.8h, v24.8h    // temp -= temp2 * 5
    uaddl     v28.8h, v9.8b, v11.8b
    uaddl     v16.8h, v6.8b, v0.8b
    mla       v28.8h, v18.8h, v22.8h    // temp4 += temp3 * 20
    mls       v20.8h, v26.8h, v24.8h    // temp4 -= temp5 * 5
    uaddl     v26.8h, v1.8b, v7.8b
    uaddl     v18.8h, v5.8b, v7.8b
    sqrshrun  v30.8b, v14.8h, #5        // dst[0_0] = CLIP_U8((temp +16) >> 5)
    uaddl     v14.8h, v8.8b, v10.8b
    sqrshrun  v31.8b, v20.8h, #5        // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
    ld1       {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 4
    ld1       {v10.2s, v11.2s}, [x0], x2
    urhadd    v30.16b, v20.16b, v30.16b // Interpolation to obtain qpel value
    urhadd    v31.16b, v21.16b, v31.16b // Interpolation to obtain qpel value
    mls       v28.8h, v26.8h, v24.8h    // temp4 -= temp5 * 5
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 4
    mla       v14.8h, v12.8h, v22.8h    // temp += temp1 * 20
    uaddl     v20.8h, v11.8b, v1.8b
    uaddl     v26.8h, v3.8b, v9.8b
    mla       v20.8h, v18.8h, v22.8h    // temp4 += temp3 * 20
    uaddl     v12.8h, v6.8b, v4.8b
    uaddl     v18.8h, v7.8b, v9.8b
    sqrshrun  v31.8b, v28.8h, #5        // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
    mls       v14.8h, v16.8h, v24.8h    // temp -= temp2 * 5
    uaddl     v16.8h, v8.8b, v2.8b
    sqrshrun  v30.8b, v14.8h, #5        // dst[0_0] = CLIP_U8((temp +16) >> 5)
    ld1       {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 5
    mls       v20.8h, v26.8h, v24.8h    // temp4 -= temp5 * 5
    urhadd    v30.16b, v14.16b, v30.16b // Interpolation to obtain qpel value
    urhadd    v31.16b, v15.16b, v31.16b // Interpolation to obtain qpel value
    uaddl     v14.8h, v10.8b, v0.8b
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 5
    mla       v14.8h, v12.8h, v22.8h    // temp += temp1 * 20
    ld1       {v0.2s, v1.2s}, [x0], x2
    uaddl     v26.8h, v5.8b, v11.8b
    uaddl     v12.8h, v8.8b, v6.8b
    uaddl     v28.8h, v0.8b, v2.8b
    sqrshrun  v31.8b, v20.8h, #5        // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
    mla       v28.8h, v12.8h, v22.8h    // temp += temp1 * 20
    uaddl     v20.8h, v1.8b, v3.8b
    mls       v14.8h, v16.8h, v24.8h    // temp -= temp2 * 5
    mla       v20.8h, v18.8h, v22.8h    // temp4 += temp3 * 20
    uaddl     v16.8h, v10.8b, v4.8b
    sqrshrun  v30.8b, v14.8h, #5        // dst[0_0] = CLIP_U8((temp +16) >> 5)
    ld1       {v14.2s, v15.2s}, [x7], x2 // Load for interpolation row 6
    mov       v2.8b, v6.8b
    mov       v3.8b, v7.8b
    urhadd    v30.16b, v14.16b, v30.16b // Interpolation to obtain qpel value
    urhadd    v31.16b, v15.16b, v31.16b // Interpolation to obtain qpel value

    mls       v28.8h, v16.8h, v24.8h    // temp -= temp2 * 5
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 6
    sqrshrun  v30.8b, v28.8h, #5        // dst[0_0] = CLIP_U8((temp +16) >> 5)
    swp       v0.8b, v4.8b              // swapping registers to put it in order
    swp       v1.8b, v5.8b              // swapping registers to put it in order

    mls       v20.8h, v26.8h, v24.8h    // temp4 -= temp5 * 5
    mov       v6.8b, v10.8b
    mov       v7.8b, v11.8b
    subs      x12, x14, #1              // zero only on the first pass when ht == 16
    swp       v4.8b, v8.8b
    swp       v5.8b, v9.8b
    sqrshrun  v31.8b, v20.8h, #5        // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
    ld1       {v20.2s, v21.2s}, [x7], x2 // Load for interpolation row 7
    urhadd    v30.16b, v20.16b, v30.16b // Interpolation to obtain qpel value
    urhadd    v31.16b, v21.16b, v31.16b // Interpolation to obtain qpel value
    st1       {v30.2s, v31.2s}, [x1], x3 // store row 7
    bne       end_func                  // if height = 8 (or second pass done) end function
    add       x14, x14, #1              // for checking loop: makes the next subs non-zero
    ld1       {v10.2s, v11.2s}, [x0], x2
    uaddl     v12.8h, v4.8b, v6.8b      // temp1 = src[2_0] + src[3_0]

    b         loop_16                   // looping if height = 16

loop_8_start:
//// Processing row0 and row1

    ld1       {v0.2s}, [x0], x2         // Vector load from src[0_0]
    ld1       {v1.2s}, [x0], x2         // Vector load from src[1_0]
    ld1       {v2.2s}, [x0], x2         // Vector load from src[2_0]
    ld1       {v3.2s}, [x0], x2         // Vector load from src[3_0]
    add       x14, x14, #1              // for checking loop: x14 = ht - 15
    ld1       {v4.2s}, [x0], x2         // Vector load from src[4_0]
    ld1       {v5.2s}, [x0], x2         // Vector load from src[5_0]

loop_8:                                 // when wd=8, 8 rows per pass
    uaddl     v6.8h, v2.8b, v3.8b       // temp1 = src[2_0] + src[3_0]
    uaddl     v8.8h, v0.8b, v5.8b       // temp = src[0_0] + src[5_0]
    uaddl     v10.8h, v1.8b, v4.8b      // temp2 = src[1_0] + src[4_0]
    mla       v8.8h, v6.8h, v22.8h      // temp += temp1 * 20
    ld1       {v6.2s}, [x0], x2
    uaddl     v14.8h, v3.8b, v4.8b
    uaddl     v16.8h, v1.8b, v6.8b
    uaddl     v18.8h, v2.8b, v5.8b
    mls       v8.8h, v10.8h, v24.8h     // temp -= temp2 * 5
    mla       v16.8h, v14.8h, v22.8h
    ld1       {v7.2s}, [x0], x2
    uaddl     v20.8h, v4.8b, v5.8b
    uaddl     v12.8h, v2.8b, v7.8b
    uaddl     v10.8h, v3.8b, v6.8b
    mls       v16.8h, v18.8h, v24.8h
    sqrshrun  v26.8b, v8.8h, #5         // dst[0_0] = CLIP_U8( (temp + 16) >> 5)
    mla       v12.8h, v20.8h, v22.8h
    ld1       {v8.2s}, [x7], x2         // Load value for interpolation (row0)
    ld1       {v9.2s}, [x7], x2         // Load value for interpolation (row1)
    ld1       {v0.2s}, [x0], x2
    uaddl     v14.8h, v5.8b, v6.8b
    sqrshrun  v27.8b, v16.8h, #5
    urhadd    v26.16b, v8.16b, v26.16b  // Interpolation step for qpel calculation
    urhadd    v27.16b, v9.16b, v27.16b  // Interpolation step for qpel calculation

    uaddl     v20.8h, v3.8b, v0.8b
    mls       v12.8h, v10.8h, v24.8h
    st1       {v26.2s}, [x1], x3        // Vector store to dst[0_0]
    uaddl     v18.8h, v4.8b, v7.8b
    mla       v20.8h, v14.8h, v22.8h
    st1       {v27.2s}, [x1], x3        // Vector store to dst[1_0]
    sqrshrun  v28.8b, v12.8h, #5
    mls       v20.8h, v18.8h, v24.8h
    ld1       {v12.2s}, [x7], x2        // Load value for interpolation (row2)
    ld1       {v13.2s}, [x7], x2        // Load value for interpolation (row3)
    ld1       {v1.2s}, [x0], x2
    sqrshrun  v29.8b, v20.8h, #5
    subs      x9, x4, #4                // ht == 4 ?
    urhadd    v28.16b, v12.16b, v28.16b
    urhadd    v29.16b, v13.16b, v29.16b
    st1       {v28.2s}, [x1], x3        // store row 2
    st1       {v29.2s}, [x1], x3        // store row 3
    beq       end_func                  // Branch if height==4
    uaddl     v14.8h, v6.8b, v7.8b      // temp1 = src[2_0] + src[3_0]
    uaddl     v16.8h, v0.8b, v5.8b      // temp2 = src[1_0] + src[4_0]
    uaddl     v18.8h, v1.8b, v4.8b      // temp = src[0_0] + src[5_0]
    mla       v18.8h, v14.8h, v22.8h    // temp += temp1 * 20
    ld1       {v2.2s}, [x0], x2
    mls       v18.8h, v16.8h, v24.8h    // temp -= temp2 * 5
    uaddl     v8.8h, v0.8b, v7.8b
    uaddl     v10.8h, v1.8b, v6.8b
    uaddl     v12.8h, v2.8b, v5.8b
    sqrshrun  v26.8b, v18.8h, #5
    mla       v12.8h, v8.8h, v22.8h
    ld1       {v18.2s}, [x7], x2        // Load value for interpolation (row4)
    ld1       {v19.2s}, [x7], x2        // Load value for interpolation (row5)
    ld1       {v3.2s}, [x0], x2
    mls       v12.8h, v10.8h, v24.8h
    sqrshrun  v27.8b, v12.8h, #5
    urhadd    v26.16b, v18.16b, v26.16b // Interpolation step for qpel calculation
    urhadd    v27.16b, v19.16b, v27.16b // Interpolation step for qpel calculation

    st1       {v26.2s}, [x1], x3        // store row 4
    st1       {v27.2s}, [x1], x3        // store row 5
    uaddl     v14.8h, v0.8b, v1.8b      // temp1 = src[2_0] + src[3_0]
    uaddl     v16.8h, v2.8b, v7.8b      // temp2 = src[1_0] + src[4_0]
    uaddl     v18.8h, v3.8b, v6.8b      // temp = src[0_0] + src[5_0]
    mla       v18.8h, v14.8h, v22.8h    // temp += temp1 * 20
    ld1       {v4.2s}, [x0], x2
    mls       v18.8h, v16.8h, v24.8h    // temp -= temp2 * 5
    uaddl     v8.8h, v2.8b, v1.8b
    uaddl     v10.8h, v3.8b, v0.8b
    uaddl     v12.8h, v4.8b, v7.8b
    sqrshrun  v26.8b, v18.8h, #5
    mla       v12.8h, v8.8h, v22.8h
    ld1       {v18.2s}, [x7], x2        // Load value for interpolation (row6)
    ld1       {v19.2s}, [x7], x2        // Load value for interpolation (row7)
    ld1       {v5.2s}, [x0], x2
    mls       v12.8h, v10.8h, v24.8h
    sqrshrun  v27.8b, v12.8h, #5
    urhadd    v26.16b, v18.16b, v26.16b // Interpolation step for qpel calculation
    urhadd    v27.16b, v19.16b, v27.16b // Interpolation step for qpel calculation

    subs      x12, x14, #1              // zero only on the first pass when ht == 16
    st1       {v26.2s}, [x1], x3        // store row 6
    st1       {v27.2s}, [x1], x3        // store row 7
    add       x14, x14, #1              // does not touch the flags set by subs
    beq       loop_8                    // looping if height ==16

    b         end_func


loop_4_start:
//// Processing row0 and row1
//// wd = 4: every row load/store touches only lane s[0] (4 bytes)

    ld1       {v0.s}[0], [x0], x2       // Vector load from src[0_0]
    ld1       {v1.s}[0], [x0], x2       // Vector load from src[1_0]
    ld1       {v2.s}[0], [x0], x2       // Vector load from src[2_0]
    ld1       {v3.s}[0], [x0], x2       // Vector load from src[3_0]
    ld1       {v4.s}[0], [x0], x2       // Vector load from src[4_0]
    ld1       {v5.s}[0], [x0], x2       // Vector load from src[5_0]

    uaddl     v6.8h, v2.8b, v3.8b       // temp1 = src[2_0] + src[3_0]
    uaddl     v8.8h, v0.8b, v5.8b       // temp = src[0_0] + src[5_0]
    uaddl     v10.8h, v1.8b, v4.8b      // temp2 = src[1_0] + src[4_0]
    mla       v8.8h, v6.8h, v22.8h      // temp += temp1 * 20
    ld1       {v6.s}[0], [x0], x2       // 4-byte load like the other rows (was an 8-byte load)
    uaddl     v14.8h, v3.8b, v4.8b
    uaddl     v16.8h, v1.8b, v6.8b
    uaddl     v18.8h, v2.8b, v5.8b
    mls       v8.8h, v10.8h, v24.8h     // temp -= temp2 * 5
    ld1       {v7.s}[0], [x0], x2
    mla       v16.8h, v14.8h, v22.8h
    uaddl     v20.8h, v4.8b, v5.8b
    uaddl     v12.8h, v2.8b, v7.8b
    uaddl     v10.8h, v3.8b, v6.8b
    mls       v16.8h, v18.8h, v24.8h
    sqrshrun  v26.8b, v8.8h, #5         // dst[0_0] = CLIP_U8( (temp + 16) >> 5)
    ld1       {v8.s}[0], [x7], x2       // Load value for interpolation - row 0
    ld1       {v9.s}[0], [x7], x2       // Load value for interpolation - row 1
    mla       v12.8h, v20.8h, v22.8h
    ld1       {v0.s}[0], [x0], x2
    uaddl     v14.8h, v5.8b, v6.8b
    sqrshrun  v27.8b, v16.8h, #5
    uaddl     v20.8h, v3.8b, v0.8b
    urhadd    v26.16b, v26.16b, v8.16b  // Interpolation step for qpel calculation
    urhadd    v27.16b, v27.16b, v9.16b  // Interpolation step for qpel calculation

    mls       v12.8h, v10.8h, v24.8h
    st1       {v26.s}[0], [x1], x3      // Vector store to dst[0_0]
    uaddl     v18.8h, v4.8b, v7.8b
    mla       v20.8h, v14.8h, v22.8h
    st1       {v27.s}[0], [x1], x3      // store row 1
    sqrshrun  v28.8b, v12.8h, #5
    ld1       {v12.s}[0], [x7], x2      // Load value for interpolation - row 2
    ld1       {v13.s}[0], [x7], x2      // Load value for interpolation - row 3

    mls       v20.8h, v18.8h, v24.8h
    ld1       {v1.s}[0], [x0], x2
    sqrshrun  v29.8b, v20.8h, #5
    urhadd    v28.16b, v12.16b, v28.16b // Interpolation step for qpel calculation
    urhadd    v29.16b, v13.16b, v29.16b // Interpolation step for qpel calculation

    st1       {v28.s}[0], [x1], x3      // store row 2
    st1       {v29.s}[0], [x1], x3      // store row 3

    subs      x9, x4, #4                // ht == 4 ?
    beq       end_func                  // Branch if height==4


    uaddl     v14.8h, v6.8b, v7.8b      // temp1 = src[2_0] + src[3_0]
    uaddl     v16.8h, v0.8b, v5.8b      // temp2 = src[1_0] + src[4_0]
    uaddl     v18.8h, v1.8b, v4.8b      // temp = src[0_0] + src[5_0]
    mla       v18.8h, v14.8h, v22.8h    // temp += temp1 * 20
    ld1       {v2.s}[0], [x0], x2
    mls       v18.8h, v16.8h, v24.8h    // temp -= temp2 * 5
    uaddl     v8.8h, v0.8b, v7.8b
    uaddl     v10.8h, v1.8b, v6.8b
    uaddl     v12.8h, v2.8b, v5.8b
    sqrshrun  v26.8b, v18.8h, #5
    ld1       {v18.s}[0], [x7], x2      // Load value for interpolation - row 4
    ld1       {v19.s}[0], [x7], x2      // Load value for interpolation - row 5
    mla       v12.8h, v8.8h, v22.8h
    ld1       {v3.s}[0], [x0], x2
    mls       v12.8h, v10.8h, v24.8h
    sqrshrun  v27.8b, v12.8h, #5
    urhadd    v26.16b, v18.16b, v26.16b // Interpolation step for qpel calculation
    urhadd    v27.16b, v27.16b, v19.16b // Interpolation step for qpel calculation

    st1       {v26.s}[0], [x1], x3      // store row 4
    st1       {v27.s}[0], [x1], x3      // store row 5
    uaddl     v14.8h, v0.8b, v1.8b      // temp1 = src[2_0] + src[3_0]
    uaddl     v16.8h, v2.8b, v7.8b      // temp2 = src[1_0] + src[4_0]
    uaddl     v18.8h, v3.8b, v6.8b      // temp = src[0_0] + src[5_0]
    mla       v18.8h, v14.8h, v22.8h    // temp += temp1 * 20
    ld1       {v4.s}[0], [x0], x2
    mls       v18.8h, v16.8h, v24.8h    // temp -= temp2 * 5
    uaddl     v8.8h, v2.8b, v1.8b
    uaddl     v10.8h, v3.8b, v0.8b
    uaddl     v12.8h, v4.8b, v7.8b
    sqrshrun  v26.8b, v18.8h, #5
    ld1       {v18.s}[0], [x7], x2      // Load value for interpolation - row 6
    ld1       {v19.s}[0], [x7], x2      // Load value for interpolation - row 7
    mla       v12.8h, v8.8h, v22.8h
    ld1       {v5.s}[0], [x0], x2
    mls       v12.8h, v10.8h, v24.8h
    sqrshrun  v27.8b, v12.8h, #5
    urhadd    v26.16b, v18.16b, v26.16b // Interpolation step for qpel calculation
    urhadd    v27.16b, v19.16b, v27.16b // Interpolation step for qpel calculation

    st1       {v26.s}[0], [x1], x3      // store row 6
    st1       {v27.s}[0], [x1], x3      // store row 7


end_func:
    // Restore what the prologue saved, in reverse order
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret