1//****************************************************************************** 2//* 3//* Copyright (C) 2015 The Android Open Source Project 4//* 5//* Licensed under the Apache License, Version 2.0 (the "License"); 6//* you may not use this file except in compliance with the License. 7//* You may obtain a copy of the License at: 8//* 9//* http://www.apache.org/licenses/LICENSE-2.0 10//* 11//* Unless required by applicable law or agreed to in writing, software 12//* distributed under the License is distributed on an "AS IS" BASIS, 13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14//* See the License for the specific language governing permissions and 15//* limitations under the License. 16//* 17//***************************************************************************** 18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19//*/ 20///** 21// ******************************************************************************* 22// * @file 23// * ih264e_half_pel.s 24// * 25// * @brief 26// * 27// * 28// * @author 29// * Ittiam 30// * 31// * @par List of Functions: 32// * ih264e_sixtapfilter_horz 33// * ih264e_sixtap_filter_2dvh_vert 34// 35// * 36// * @remarks 37// * None 38// * 39// ******************************************************************************* 40// */ 41 42 43.text 44.p2align 2 45.include "ih264_neon_macros.s" 46 47///******************************************************************************* 48//* 49//* @brief 50//* Interprediction luma filter for horizontal input(Filter run for width = 17 and height =16) 51//* 52//* @par Description: 53//* Applies a 6 tap horizontal filter .The output is clipped to 8 bits 54//* sec 8.4.2.2.1 titled "Luma sample interpolation process" 55//* 56//* @param[in] pu1_src 57//* UWORD8 pointer to the source 58//* 59//* @param[out] pu1_dst 60//* UWORD8 pointer to the destination 61//* 62//* @param[in] src_strd 63//* integer source stride 64//* 65//* @param[in] 
dst_strd 66//* integer destination stride 67//* 68//* 69//* @returns 70//* 71//* @remarks 72//* None 73//* 74//******************************************************************************* 75//*/ 76//void ih264e_sixtapfilter_horz(UWORD8 *pu1_src, 77// UWORD8 *pu1_dst, 78// WORD32 src_strd, 79// WORD32 dst_strd); 80 81 82.equ halfpel_width , 17 + 1 //( make it even, two rows are processed at a time) 83 84 85 .global ih264e_sixtapfilter_horz_av8 86ih264e_sixtapfilter_horz_av8: 87 // STMFD sp!,{x14} 88 push_v_regs 89 sxtw x2, w2 90 sxtw x3, w3 91 stp x19, x20, [sp, #-16]! 92 93 movi v0.8b, #5 94 sub x0, x0, #2 95 sub x3, x3, #16 96 movi v1.8b, #20 97 mov x14, #16 98 99filter_horz_loop: 100 101 102 ld1 {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row0 103 ld1 {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1 104 105 //// Processing row0 and row1 106 107 ext v31.8b, v2.8b , v3.8b , #5 108 ext v30.8b, v3.8b , v4.8b , #5 109 110 uaddl v8.8h, v31.8b, v2.8b //// a0 + a5 (column1,row0) 111 ext v29.8b, v4.8b , v4.8b , #5 112 uaddl v10.8h, v30.8b, v3.8b //// a0 + a5 (column2,row0) 113 ext v28.8b, v5.8b , v6.8b , #5 114 uaddl v12.8h, v29.8b, v4.8b //// a0 + a5 (column3,row0) 115 ext v27.8b, v6.8b , v7.8b , #5 116 uaddl v14.8h, v28.8b, v5.8b //// a0 + a5 (column1,row1) 117 ext v26.8b, v7.8b , v7.8b , #5 118 119 uaddl v16.8h, v27.8b, v6.8b //// a0 + a5 (column2,row1) 120 ext v31.8b, v2.8b , v3.8b , #2 121 uaddl v18.8h, v26.8b, v7.8b //// a0 + a5 (column3,row1) 122 ext v30.8b, v3.8b , v4.8b , #2 123 umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) 124 ext v29.8b, v4.8b , v4.8b , #2 125 umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) 126 ext v28.8b, v5.8b , v6.8b , #2 127 umlal v12.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) 128 ext v27.8b, v6.8b , v7.8b , #2 129 umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 (column1,row1) 130 ext v26.8b, v7.8b , v7.8b , #2 131 132 umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 (column2,row1) 133 ext v31.8b, 
v2.8b , v3.8b , #3 134 umlal v18.8h, v26.8b, v1.8b //// a0 + a5 + 20a2 (column3,row1) 135 ext v30.8b, v3.8b , v4.8b , #3 136 umlal v8.8h, v31.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) 137 ext v29.8b, v4.8b , v4.8b , #3 138 umlal v10.8h, v30.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) 139 ext v28.8b, v5.8b , v6.8b , #3 140 umlal v12.8h, v29.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) 141 ext v27.8b, v6.8b , v7.8b , #3 142 umlal v14.8h, v28.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row1) 143 ext v26.8b, v7.8b , v7.8b , #3 144 145 umlal v16.8h, v27.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row1) 146 ext v31.8b, v2.8b , v3.8b , #1 147 umlal v18.8h, v26.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row1) 148 ext v30.8b, v3.8b , v4.8b , #1 149 umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) 150 ext v29.8b, v4.8b , v4.8b , #1 151 umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) 152 ext v28.8b, v5.8b , v6.8b , #1 153 umlsl v12.8h, v29.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) 154 ext v27.8b, v6.8b , v7.8b , #1 155 umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) 156 ext v26.8b, v7.8b , v7.8b , #1 157 158 umlsl v16.8h, v27.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1) 159 ext v31.8b, v2.8b , v3.8b , #4 160 umlsl v18.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row1) 161 ext v30.8b, v3.8b , v4.8b , #4 162 umlsl v8.8h, v31.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) 163 ext v29.8b, v4.8b , v4.8b , #4 164 umlsl v10.8h, v30.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) 165 ext v28.8b, v5.8b , v6.8b , #4 166 umlsl v12.8h, v29.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) 167 ext v27.8b, v6.8b , v7.8b , #4 168 umlsl v14.8h, v28.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) 169 ext v26.8b, v7.8b , v7.8b , #4 170 171 umlsl v16.8h, v27.8b, v0.8b //// a0 + 
a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1) 172 umlsl v18.8h, v26.8b, v0.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row1) 173 174 sqrshrun v20.8b, v8.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) 175 sqrshrun v21.8b, v10.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) 176 sqrshrun v22.8b, v12.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) 177 sqrshrun v23.8b, v14.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) 178 sqrshrun v24.8b, v16.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1) 179 sqrshrun v25.8b, v18.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row1) 180 181 st1 {v20.8b, v21.8b}, [x1], #16 ////Store dest row0 182 st1 {v22.h}[0], [x1], x3 183 st1 {v23.8b, v24.8b}, [x1], #16 ////Store dest row1 184 st1 {v25.h}[0], [x1], x3 185 186 subs x14, x14, #2 // decrement counter 187 188 bne filter_horz_loop 189 190 191 // LDMFD sp!,{pc} 192 ldp x19, x20, [sp], #16 193 pop_v_regs 194 ret 195 196 197 198 199 200 201 202 203 204///** 205//******************************************************************************* 206//* 207//* @brief 208//* This function implements a two stage cascaded six tap filter. It 209//* applies the six tap filter in the vertical direction on the 210//* predictor values, followed by applying the same filter in the 211//* horizontal direction on the output of the first stage. The six tap 212//* filtering operation is described in sec 8.4.2.2.1 titled "Luma sample 213//* interpolation process" 214//* (Filter run for width = 17 and height =17) 215//* @par Description: 216//* The function interpolates 217//* the predictors first in the vertical direction and then in the 218//* horizontal direction to output the (1/2,1/2). The output of the first 219//* stage of the filter is stored in the buffer pointed to by pi16_pred1(only in C) 220//* in 16 bit precision. 
221//* 222//* 223//* @param[in] pu1_src 224//* UWORD8 pointer to the source 225//* 226//* @param[out] pu1_dst1 227//* UWORD8 pointer to the destination(vertical filtered output) 228//* 229//* @param[out] pu1_dst2 230//* UWORD8 pointer to the destination(out put after applying horizontal filter to the intermediate vertical output) 231//* 232//* @param[in] src_strd 233//* integer source stride 234//* 235//* @param[in] dst_strd 236//* integer destination stride of pu1_dst 237//* 238//* @param[in]pi16_pred1 239//* Pointer to 16bit intermediate buffer(used only in c) 240//* 241//* @param[in] pi16_pred1_strd 242//* integer destination stride of pi16_pred1 243//* 244//* 245//* @returns 246//* 247//* @remarks 248//* None 249//* 250//******************************************************************************* 251//*/ 252//void ih264e_sixtap_filter_2dvh_vert(UWORD8 *pu1_src, 253// UWORD8 *pu1_dst1, 254// UWORD8 *pu1_dst2, 255// WORD32 src_strd, 256// WORD32 dst_strd, 257// WORD32 *pi16_pred1,/* Pointer to 16bit intermmediate buffer (used only in c)*/ 258// WORD32 pi16_pred1_strd) 259 260 261 262 263 .global ih264e_sixtap_filter_2dvh_vert_av8 264 265ih264e_sixtap_filter_2dvh_vert_av8: 266 // STMFD sp!,{x10,x11,x12,x14} 267 push_v_regs 268 sxtw x3, w3 269 sxtw x4, w4 270 stp x19, x20, [sp, #-16]! 
271 272////x0 - pu1_ref 273////x3 - u4_ref_width 274 275 //// Load six rows for vertical interpolation 276 lsl x12, x3, #1 277 sub x0, x0, x12 278 sub x0, x0, #2 279 ld1 {v2.8b, v3.8b, v4.8b}, [x0], x3 280 ld1 {v5.8b, v6.8b, v7.8b}, [x0], x3 281 ld1 {v8.8b, v9.8b, v10.8b}, [x0], x3 282 mov x12, #5 283 ld1 {v11.8b, v12.8b, v13.8b}, [x0], x3 284 mov x14, #20 285 ld1 {v14.8b, v15.8b, v16.8b}, [x0], x3 286 mov v0.h[0], w12 287 mov v0.h[1], w14 288 ld1 {v17.8b, v18.8b, v19.8b}, [x0], x3 289 movi v1.8b, #20 290 291//// x12 - u2_buff1_width 292//// x14 - u2_buff2_width 293 mov x12, x4 294 add x11, x1, #16 295 296 mov x14, x12 297 298 mov x10, #3 //loop counter 299 sub x16 , x12, #8 300 sub x19, x14, #16 301filter_2dvh_loop: 302 303 //// ////////////// ROW 1 /////////////////////// 304 305//// Process first vertical interpolated row 306//// each column is 307 uaddl v20.8h, v2.8b, v17.8b //// a0 + a5 (column1,row0) 308 movi v31.8b, #5 309 umlal v20.8h, v8.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) 310 umlal v20.8h, v11.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) 311 umlsl v20.8h, v5.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) 312 umlsl v20.8h, v14.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) 313 mov v21.d[0], v20.d[1] 314 315 uaddl v22.8h, v3.8b, v18.8b //// a0 + a5 (column2,row0) 316 umlal v22.8h, v9.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) 317 umlal v22.8h, v12.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) 318 umlsl v22.8h, v6.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) 319 umlsl v22.8h, v15.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) 320 ext v30.8b, v20.8b , v21.8b , #4 321 mov v23.d[0], v22.d[1] 322 323 324 uaddl v24.8h, v4.8b, v19.8b //// a0 + a5 (column3,row0) 325 ext v29.8b, v20.8b , v21.8b , #6 326 umlal v24.8h, v10.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) 327 umlal v24.8h, v13.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) 328 umlsl v24.8h, v7.8b, v31.8b //// a0 + a5 
+ 20a2 + 20a3 - 5a1 (column3,row0) 329 umlsl v24.8h, v16.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) 330 mov v25.d[0], v24.d[1] 331 332 sqrshrun v2.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) 333 ext v31.8b, v21.8b , v22.8b , #2 334 sqrshrun v3.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) 335 ext v28.8b, v20.8b , v21.8b , #2 336 337 saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) 338 ext v31.8b, v22.8b , v23.8b , #2 339 smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1) 340 smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1) 341 smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) 342 smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) 343 ext v30.8b, v21.8b , v22.8b , #4 344 345 sqrshrun v4.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) 346 ext v29.8b, v21.8b , v22.8b , #6 347 348 ext v28.8b, v21.8b , v22.8b , #2 349 saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) 350 smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2) 351 smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2) 352 smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) 353 smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) 354 ext v31.8b, v23.8b , v24.8b , #2 355 mov v21.d[0], v20.d[1] 356 ext v2.8b, v2.8b , v3.8b , #2 357 ext v3.8b, v3.8b , v4.8b , #2 358 ext v4.8b, v4.8b , v4.8b , #2 359 360 st1 {v2.8b, v3.8b}, [x1], x12 //// store row1 - 1,1/2 grid 361 st1 {v4.h}[0], [x11], x12 //// store row1 - 1,1/2 grid 362 363 ext v30.8b, v22.8b , v23.8b , #4 364 ext v29.8b, v22.8b , v23.8b , #6 365 366 saddl v2.4s, v31.4h, v22.4h //// a0 + a5 (set3) 367 ext v28.8b, v22.8b , v23.8b , #2 368 smlal v2.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3) 369 smlal v2.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3) 370 smlsl v2.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 
- 5a1 (set3) 371 smlsl v2.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) 372 ext v31.8b, v24.8b , v25.8b , #2 373 374 shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2) 375 ext v30.8b, v23.8b , v24.8b , #4 376 shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) 377 ext v29.8b, v23.8b , v24.8b , #6 378 379 saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4) 380 ext v28.8b, v23.8b , v24.8b , #2 381 ext v31.8b, v25.8b , v25.8b , #2 382 smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4) 383 smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4) 384 smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) 385 smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) 386 ext v30.8b, v24.8b , v25.8b , #4 387 388 saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5) 389 ext v29.8b, v24.8b , v25.8b , #6 390 391 ext v31.8b, v24.8b , v25.8b , #2 392 shrn v28.4h, v2.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) 393 394 ld1 {v2.8b, v3.8b, v4.8b}, [x0], x3 //// Load next Row data 395 smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5) 396 smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5) 397 smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) 398 smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) 399 shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4) 400 mov v20.d[1], v21.d[0] 401 sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 402 403 404 ////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 405 ////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) 406 407 ////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 408 409 ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values 410 //// ////////////// ROW 2 /////////////////////// 411 412//// Process 
first vertical interpolated row 413//// each column is 414 uaddl v20.8h, v5.8b, v2.8b //// a0 + a5 (column1,row0) 415 movi v31.8b, #5 416 umlal v20.8h, v11.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) 417 umlal v20.8h, v14.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) 418 umlsl v20.8h, v8.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) 419 umlsl v20.8h, v17.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) 420 mov v21.d[0], v20.d[1] 421 422 mov v28.d[1], v29.d[0] 423 sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 424 425 shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) 426 427 uaddl v22.8h, v6.8b, v3.8b //// a0 + a5 (column2,row0) 428 umlal v22.8h, v12.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) 429 umlal v22.8h, v15.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) 430 umlsl v22.8h, v9.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) 431 umlsl v22.8h, v18.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) 432 mov v23.d[0], v22.d[1] 433 434 sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 435 ext v30.8b, v20.8b , v21.8b , #4 436 437 uaddl v24.8h, v7.8b, v4.8b //// a0 + a5 (column3,row0) 438 ext v29.8b, v20.8b , v21.8b , #6 439 umlal v24.8h, v13.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) 440 umlal v24.8h, v16.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) 441 umlsl v24.8h, v10.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) 442 umlsl v24.8h, v19.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) 443 mov v25.d[0], v24.d[1] 444 445 st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values 446 st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values 447 448 sqrshrun v5.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) 449 ext v31.8b, v21.8b , v22.8b , #2 450 sqrshrun v6.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) 451 ext v28.8b, v20.8b , v21.8b , #2 
452 453 saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) 454 ext v31.8b, v22.8b , v23.8b , #2 455 smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1) 456 smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1) 457 smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) 458 smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) 459 ext v30.8b, v21.8b , v22.8b , #4 460 461 sqrshrun v7.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) 462 ext v29.8b, v21.8b , v22.8b , #6 463 464 ext v28.8b, v21.8b , v22.8b , #2 465 saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) 466 smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2) 467 smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2) 468 smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) 469 smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) 470 ext v31.8b, v23.8b , v24.8b , #2 471 472 ext v5.8b, v5.8b , v6.8b , #2 473 ext v6.8b, v6.8b , v7.8b , #2 474 ext v7.8b, v7.8b , v7.8b , #2 475 476 st1 {v5.8b, v6.8b}, [x1], x12 //// store row1 - 1,1/2 grid 477 st1 {v7.h}[0], [x11], x12 //// store row1 - 1,1/2 grid 478 479 ext v30.8b, v22.8b , v23.8b , #4 480 ext v29.8b, v22.8b , v23.8b , #6 481 482 saddl v6.4s, v31.4h, v22.4h //// a0 + a5 (set3) 483 ext v28.8b, v22.8b , v23.8b , #2 484 smlal v6.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3) 485 smlal v6.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3) 486 smlsl v6.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) 487 smlsl v6.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) 488 ext v31.8b, v24.8b , v25.8b , #2 489 490 shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2) 491 ext v30.8b, v23.8b , v24.8b , #4 492 shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) 493 ext v29.8b, v23.8b , v24.8b , #6 494 495 saddl v26.4s, v31.4h, v23.4h 
//// a0 + a5 (set4) 496 ext v28.8b, v23.8b , v24.8b , #2 497 ext v31.8b, v25.8b , v25.8b , #2 498 smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4) 499 smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4) 500 smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) 501 smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) 502 ext v30.8b, v24.8b , v25.8b , #4 503 504 saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5) 505 ext v29.8b, v24.8b , v25.8b , #6 506 507 ext v31.8b, v24.8b , v25.8b , #2 508 shrn v28.4h, v6.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) 509 510 ld1 {v5.8b, v6.8b, v7.8b}, [x0], x3 //// Load next Row data 511 smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5) 512 smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5) 513 smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) 514 smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) 515 shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4) 516 mov v20.d[1], v21.d[0] 517 sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 518 519 520 ////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 521 ////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) 522 523 ////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 524 525 ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values 526 //// ////////////// ROW 3 /////////////////////// 527 528//// Process first vertical interpolated row 529//// each column is 530 uaddl v20.8h, v8.8b, v5.8b //// a0 + a5 (column1,row0) 531 movi v31.8b, #5 532 umlal v20.8h, v14.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) 533 umlal v20.8h, v17.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) 534 umlsl v20.8h, v11.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) 535 umlsl v20.8h, v2.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) 536 
mov v21.d[0], v20.d[1] 537 538 mov v28.d[1], v29.d[0] 539 sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 540 shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) 541 542 uaddl v22.8h, v9.8b, v6.8b //// a0 + a5 (column2,row0) 543 umlal v22.8h, v15.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) 544 umlal v22.8h, v18.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) 545 umlsl v22.8h, v12.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) 546 umlsl v22.8h, v3.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) 547 mov v23.d[0], v22.d[1] 548 549 sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 550 ext v30.8b, v20.8b , v21.8b , #4 551 552 uaddl v24.8h, v10.8b, v7.8b //// a0 + a5 (column3,row0) 553 ext v29.8b, v20.8b , v21.8b , #6 554 umlal v24.8h, v16.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) 555 umlal v24.8h, v19.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) 556 umlsl v24.8h, v13.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) 557 umlsl v24.8h, v4.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) 558 mov v25.d[0], v24.d[1] 559 560 st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values 561 st1 { v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values 562 563 sqrshrun v8.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) 564 ext v31.8b, v21.8b , v22.8b , #2 565 sqrshrun v9.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) 566 ext v28.8b, v20.8b , v21.8b , #2 567 568 saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) 569 ext v31.8b, v22.8b , v23.8b , #2 570 smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1) 571 smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1) 572 smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) 573 smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) 574 ext v30.8b, v21.8b , v22.8b , #4 575 576 sqrshrun v10.8b, v24.8h, #5 
//// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) 577 ext v29.8b, v21.8b , v22.8b , #6 578 579 ext v28.8b, v21.8b , v22.8b , #2 580 saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) 581 smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2) 582 smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2) 583 smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) 584 smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) 585 ext v31.8b, v23.8b , v24.8b , #2 586 587 ext v8.8b, v8.8b , v9.8b , #2 588 ext v9.8b, v9.8b , v10.8b , #2 589 ext v10.8b, v10.8b , v10.8b , #2 590 591 st1 {v8.8b, v9.8b}, [x1], x12 //// store row1 - 1,1/2 grid 592 st1 {v10.h}[0], [x11], x12 //// store row1 - 1,1/2 grid 593 594 ext v30.8b, v22.8b , v23.8b , #4 595 ext v29.8b, v22.8b , v23.8b , #6 596 597 saddl v8.4s, v31.4h, v22.4h //// a0 + a5 (set3) 598 ext v28.8b, v22.8b , v23.8b , #2 599 smlal v8.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3) 600 smlal v8.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3) 601 smlsl v8.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) 602 smlsl v8.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) 603 ext v31.8b, v24.8b , v25.8b , #2 604 605 shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2) 606 ext v30.8b, v23.8b , v24.8b , #4 607 shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) 608 ext v29.8b, v23.8b , v24.8b , #6 609 610 saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4) 611 ext v28.8b, v23.8b , v24.8b , #2 612 ext v31.8b, v25.8b , v25.8b , #2 613 smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4) 614 smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4) 615 smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) 616 smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) 617 ext v30.8b, v24.8b , v25.8b , #4 618 619 saddl v22.4s, v31.4h, 
v24.4h //// a0 + a5 (set5) 620 ext v29.8b, v24.8b , v25.8b , #6 621 622 ext v31.8b, v24.8b , v25.8b , #2 623 shrn v28.4h, v8.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) 624 625 ld1 {v8.8b, v9.8b, v10.8b}, [x0], x3 //// Load next Row data 626 smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5) 627 smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5) 628 smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) 629 smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) 630 shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4) 631 mov v20.d[1], v21.d[0] 632 sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 633 634 635 ////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 636 ////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) 637 638 ////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 639 640 ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values 641 //// ////////////// ROW 4 /////////////////////// 642 643//// Process first vertical interpolated row 644//// each column is 645 uaddl v20.8h, v11.8b, v8.8b //// a0 + a5 (column1,row0) 646 movi v31.8b, #5 647 umlal v20.8h, v17.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) 648 umlal v20.8h, v2.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) 649 umlsl v20.8h, v14.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) 650 umlsl v20.8h, v5.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) 651 mov v21.d[0], v20.d[1] 652 mov v28.d[1], v29.d[0] 653 sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 654 shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) 655 656 uaddl v22.8h, v12.8b, v9.8b //// a0 + a5 (column2,row0) 657 umlal v22.8h, v18.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) 658 umlal v22.8h, v3.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) 659 umlsl v22.8h, v15.8b, 
v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) 660 umlsl v22.8h, v6.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) 661 mov v23.d[0], v22.d[1] 662 663 sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 664 ext v30.8b, v20.8b , v21.8b , #4 665 666 uaddl v24.8h, v13.8b, v10.8b //// a0 + a5 (column3,row0) 667 ext v29.8b, v20.8b , v21.8b , #6 668 umlal v24.8h, v19.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) 669 umlal v24.8h, v4.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) 670 umlsl v24.8h, v16.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) 671 umlsl v24.8h, v7.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) 672 mov v25.d[0], v24.d[1] 673 674 st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values 675 st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values 676 677 sqrshrun v11.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) 678 ext v31.8b, v21.8b , v22.8b , #2 679 sqrshrun v12.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) 680 ext v28.8b, v20.8b , v21.8b , #2 681 682 saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) 683 ext v31.8b, v22.8b , v23.8b , #2 684 smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1) 685 smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1) 686 smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) 687 smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) 688 ext v30.8b, v21.8b , v22.8b , #4 689 690 sqrshrun v13.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) 691 ext v29.8b, v21.8b , v22.8b , #6 692 693 ext v28.8b, v21.8b , v22.8b , #2 694 saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) 695 smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2) 696 smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2) 697 smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) 698 smlsl v20.4s, v22.4h, v0.h[0] //// a0 
//// NOTE(review): this span is the software-pipelined interior of
//// ih264e_sixtap_filter_2dvh_vert_av8 — rows 5 and 6 of each loop
//// iteration, the loop close, the epilogue, and the 17th-row skip path.
//// The function entry, register setup and loop head lie above this span.
//// Register roles as used below (inferred from this span — confirm against
//// the loop head): x0 = source row cursor, x1/x11 = (1,1/2)-grid dest
//// cursors, x2 = (1/2,1/2)-grid dest cursor, x3 = source stride step,
//// x10 = remaining-row counter, x12/x19 = dest stride steps.
//// Filter taps: v0.h[0] = 5, v0.h[1] = 20 (halfword lanes for the 32-bit
//// accumulation), v1.8b = 20 and v31.8b = 5 (byte lanes for the 16-bit
//// vertical pass). "a0..a5" in the comments are the six taps of the H.264
//// 6-tap filter (1,-5,20,20,-5,1) of sec 8.4.2.2.1.
    ext     v31.8b, v23.8b, v24.8b, #2

    //// pack the finished (1,1/2)-grid row into place before storing
    ext     v11.8b, v11.8b, v12.8b, #2
    ext     v12.8b, v12.8b, v13.8b, #2
    ext     v13.8b, v13.8b, v13.8b, #2

    st1     {v11.8b, v12.8b}, [x1], x12         //// store row - (1,1/2) grid
    st1     {v13.h}[0], [x11], x12              //// store row - (1,1/2) grid (last columns)

    ext     v30.8b, v22.8b, v23.8b, #4
    ext     v29.8b, v22.8b, v23.8b, #6

    saddl   v12.4s, v31.4h, v22.4h              //// a0 + a5 (set3)
    ext     v28.8b, v22.8b, v23.8b, #2
    smlal   v12.4s, v30.4h, v0.h[1]             //// a0 + a5 + 20a2 (set3)
    smlal   v12.4s, v29.4h, v0.h[1]             //// a0 + a5 + 20a2 + 20a3 (set3)
    smlsl   v12.4s, v28.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
    smlsl   v12.4s, v23.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
    ext     v31.8b, v24.8b, v25.8b, #2

    shrn    v21.4h, v20.4s, #8                  //// shift by 8; 2 more (with rounding) later (set2)
    ext     v30.8b, v23.8b, v24.8b, #4
    shrn    v20.4h, v26.4s, #8                  //// shift by 8; 2 more (with rounding) later (set1)
    ext     v29.8b, v23.8b, v24.8b, #6

    saddl   v26.4s, v31.4h, v23.4h              //// a0 + a5 (set4)
    ext     v28.8b, v23.8b, v24.8b, #2
    ext     v31.8b, v25.8b, v25.8b, #2
    smlal   v26.4s, v30.4h, v0.h[1]             //// a0 + a5 + 20a2 (set4)
    smlal   v26.4s, v29.4h, v0.h[1]             //// a0 + a5 + 20a2 + 20a3 (set4)
    smlsl   v26.4s, v28.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
    smlsl   v26.4s, v24.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
    ext     v30.8b, v24.8b, v25.8b, #4

    saddl   v22.4s, v31.4h, v24.4h              //// a0 + a5 (set5)
    ext     v29.8b, v24.8b, v25.8b, #6

    ext     v31.8b, v24.8b, v25.8b, #2
    shrn    v28.4h, v12.4s, #8                  //// shift by 8; 2 more (with rounding) later (set3)

    ld1     {v11.8b, v12.8b, v13.8b}, [x0], x3  //// load next row of source data
    smlal   v22.4s, v30.4h, v0.h[1]             //// a0 + a5 + 20a2 (set5)
    smlal   v22.4s, v29.4h, v0.h[1]             //// a0 + a5 + 20a2 + 20a3 (set5)
    smlsl   v22.4s, v31.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
    smlsl   v22.4s, v25.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
    shrn    v29.4h, v26.4s, #8                  //// shift by 8; 2 more (with rounding) later (set4)
    mov     v20.d[1], v21.d[0]
    sqrshrun v26.8b, v20.8h, #2                 //// half,half grid set1,2

    //// legacy ARMv7 sequence kept for reference; deferred into the next
    //// iteration by the software pipeline below:
    ////VQRSHRUN.s16 D27,Q14,#2                 ;// half,half grid set3,4
    ////VSHRN.s32 D28,Q11,#8                    ;// shift by 8 and later we will shift by 2 more with rounding (set5)
    ////VQRSHRUN.s16 D28,Q14,#2                 ;// half,half grid set5
    ////VST1.8 {D26,D27,D28},[x2],x14           ;// store 1/2,1/2 grid values
    //// ////////////// ROW 5 ///////////////////////

//// Process the next vertically interpolated row (6-tap vertical filter
//// across the six most recent horizontally buffered rows), interleaved
//// with the deferred stores and narrowing of the previous row's results.
    uaddl   v20.8h, v14.8b, v11.8b              //// a0 + a5 (column1,row0)
    movi    v31.8b, #5
    umlal   v20.8h, v2.8b, v1.8b                //// a0 + a5 + 20a2 (column1,row0)
    umlal   v20.8h, v5.8b, v1.8b                //// a0 + a5 + 20a2 + 20a3 (column1,row0)
    umlsl   v20.8h, v17.8b, v31.8b              //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    umlsl   v20.8h, v8.8b, v31.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    mov     v21.d[0], v20.d[1]
    mov     v28.d[1], v29.d[0]
    sqrshrun v27.8b, v28.8h, #2                 //// half,half grid set3,4
    shrn    v28.4h, v22.4s, #8                  //// shift by 8; 2 more (with rounding) later (set5)

    uaddl   v22.8h, v15.8b, v12.8b              //// a0 + a5 (column2,row0)
    umlal   v22.8h, v3.8b, v1.8b                //// a0 + a5 + 20a2 (column2,row0)
    umlal   v22.8h, v6.8b, v1.8b                //// a0 + a5 + 20a2 + 20a3 (column2,row0)
    umlsl   v22.8h, v18.8b, v31.8b              //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
    umlsl   v22.8h, v9.8b, v31.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
    mov     v23.d[0], v22.d[1]

    sqrshrun v28.8b, v28.8h, #2                 //// half,half grid set5
    ext     v30.8b, v20.8b, v21.8b, #4

    uaddl   v24.8h, v16.8b, v13.8b              //// a0 + a5 (column3,row0)
    ext     v29.8b, v20.8b, v21.8b, #6
    umlal   v24.8h, v4.8b, v1.8b                //// a0 + a5 + 20a2 (column3,row0)
    umlal   v24.8h, v7.8b, v1.8b                //// a0 + a5 + 20a2 + 20a3 (column3,row0)
    umlsl   v24.8h, v19.8b, v31.8b              //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
    umlsl   v24.8h, v10.8b, v31.8b              //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
    mov     v25.d[0], v24.d[1]

    st1     {v26.8b, v27.8b}, [x2], #16         //// store 1/2,1/2 grid values
    st1     {v28.h}[0], [x2], x19               //// store 1/2,1/2 grid values (last columns)

    sqrshrun v14.8b, v20.8h, #5                 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
    ext     v31.8b, v21.8b, v22.8b, #2
    sqrshrun v15.8b, v22.8h, #5                 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
    ext     v28.8b, v20.8b, v21.8b, #2

    //// horizontal 6-tap pass over the 16-bit vertical results (sets 1..5)
    saddl   v26.4s, v31.4h, v20.4h              //// a0 + a5 (set1)
    ext     v31.8b, v22.8b, v23.8b, #2
    smlal   v26.4s, v30.4h, v0.h[1]             //// a0 + a5 + 20a2 (set1)
    smlal   v26.4s, v29.4h, v0.h[1]             //// a0 + a5 + 20a2 + 20a3 (set1)
    smlsl   v26.4s, v28.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
    smlsl   v26.4s, v21.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
    ext     v30.8b, v21.8b, v22.8b, #4

    sqrshrun v16.8b, v24.8h, #5                 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
    ext     v29.8b, v21.8b, v22.8b, #6

    ext     v28.8b, v21.8b, v22.8b, #2
    saddl   v20.4s, v31.4h, v21.4h              //// a0 + a5 (set2)
    smlal   v20.4s, v30.4h, v0.h[1]             //// a0 + a5 + 20a2 (set2)
    smlal   v20.4s, v29.4h, v0.h[1]             //// a0 + a5 + 20a2 + 20a3 (set2)
    smlsl   v20.4s, v28.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
    smlsl   v20.4s, v22.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
    ext     v31.8b, v23.8b, v24.8b, #2

    ext     v14.8b, v14.8b, v15.8b, #2
    ext     v15.8b, v15.8b, v16.8b, #2
    ext     v16.8b, v16.8b, v16.8b, #2

    st1     {v14.8b, v15.8b}, [x1], x12         //// store row - (1,1/2) grid
    st1     {v16.h}[0], [x11], x12              //// store row - (1,1/2) grid (last columns)

    ext     v30.8b, v22.8b, v23.8b, #4
    ext     v29.8b, v22.8b, v23.8b, #6

    saddl   v14.4s, v31.4h, v22.4h              //// a0 + a5 (set3)
    ext     v28.8b, v22.8b, v23.8b, #2
    smlal   v14.4s, v30.4h, v0.h[1]             //// a0 + a5 + 20a2 (set3)
    smlal   v14.4s, v29.4h, v0.h[1]             //// a0 + a5 + 20a2 + 20a3 (set3)
    smlsl   v14.4s, v28.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
    smlsl   v14.4s, v23.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
    ext     v31.8b, v24.8b, v25.8b, #2

    shrn    v21.4h, v20.4s, #8                  //// shift by 8; 2 more (with rounding) later (set2)
    ext     v30.8b, v23.8b, v24.8b, #4
    shrn    v20.4h, v26.4s, #8                  //// shift by 8; 2 more (with rounding) later (set1)
    ext     v29.8b, v23.8b, v24.8b, #6

    saddl   v26.4s, v31.4h, v23.4h              //// a0 + a5 (set4)
    ext     v28.8b, v23.8b, v24.8b, #2
    ext     v31.8b, v25.8b, v25.8b, #2
    smlal   v26.4s, v30.4h, v0.h[1]             //// a0 + a5 + 20a2 (set4)
    smlal   v26.4s, v29.4h, v0.h[1]             //// a0 + a5 + 20a2 + 20a3 (set4)
    smlsl   v26.4s, v28.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
    smlsl   v26.4s, v24.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
    ext     v30.8b, v24.8b, v25.8b, #4

    saddl   v22.4s, v31.4h, v24.4h              //// a0 + a5 (set5)
    ext     v29.8b, v24.8b, v25.8b, #6

    ext     v31.8b, v24.8b, v25.8b, #2
    shrn    v28.4h, v14.4s, #8                  //// shift by 8; 2 more (with rounding) later (set3)

    ld1     {v14.8b, v15.8b, v16.8b}, [x0], x3  //// load next row of source data
    smlal   v22.4s, v30.4h, v0.h[1]             //// a0 + a5 + 20a2 (set5)
    smlal   v22.4s, v29.4h, v0.h[1]             //// a0 + a5 + 20a2 + 20a3 (set5)
    smlsl   v22.4s, v31.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
    smlsl   v22.4s, v25.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
    shrn    v29.4h, v26.4s, #8                  //// shift by 8; 2 more (with rounding) later (set4)
    mov     v20.d[1], v21.d[0]
    sqrshrun v26.8b, v20.8h, #2                 //// half,half grid set1,2

    //// legacy ARMv7 sequence kept for reference (deferred by the pipeline):
    ////VQRSHRUN.s16 D27,Q14,#2                 ;// half,half grid set3,4
    ////VSHRN.s32 D28,Q11,#8                    ;// shift by 8 and later we will shift by 2 more with rounding (set5)
    ////VQRSHRUN.s16 D28,Q14,#2                 ;// half,half grid set5
    ////VST1.8 {D26,D27,D28},[x2],x14           ;// store 1/2,1/2 grid values
    //// ////////////// ROW 6 ///////////////////////

//// Process the next vertically interpolated row. On the final (17th) row
//// only the stores pending from the software pipeline remain, so branch to
//// the tail instead of filtering another row.
    cmp     x10, #1                             //// if all 17 rows are complete, skip the extra row
    beq     filter_2dvh_skip_row
    uaddl   v20.8h, v17.8b, v14.8b              //// a0 + a5 (column1,row0)
    movi    v31.8b, #5
    umlal   v20.8h, v5.8b, v1.8b                //// a0 + a5 + 20a2 (column1,row0)
    umlal   v20.8h, v8.8b, v1.8b                //// a0 + a5 + 20a2 + 20a3 (column1,row0)
    umlsl   v20.8h, v2.8b, v31.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    umlsl   v20.8h, v11.8b, v31.8b              //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    mov     v21.d[0], v20.d[1]
    mov     v28.d[1], v29.d[0]
    sqrshrun v27.8b, v28.8h, #2                 //// half,half grid set3,4
    shrn    v28.4h, v22.4s, #8                  //// shift by 8; 2 more (with rounding) later (set5)

    uaddl   v22.8h, v18.8b, v15.8b              //// a0 + a5 (column2,row0)
    umlal   v22.8h, v6.8b, v1.8b                //// a0 + a5 + 20a2 (column2,row0)
    umlal   v22.8h, v9.8b, v1.8b                //// a0 + a5 + 20a2 + 20a3 (column2,row0)
    umlsl   v22.8h, v3.8b, v31.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
    umlsl   v22.8h, v12.8b, v31.8b              //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
    mov     v23.d[0], v22.d[1]

    sqrshrun v28.8b, v28.8h, #2                 //// half,half grid set5
    ext     v30.8b, v20.8b, v21.8b, #4

    uaddl   v24.8h, v19.8b, v16.8b              //// a0 + a5 (column3,row0)
    ext     v29.8b, v20.8b, v21.8b, #6
    umlal   v24.8h, v7.8b, v1.8b                //// a0 + a5 + 20a2 (column3,row0)
    umlal   v24.8h, v10.8b, v1.8b               //// a0 + a5 + 20a2 + 20a3 (column3,row0)
    umlsl   v24.8h, v4.8b, v31.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
    umlsl   v24.8h, v13.8b, v31.8b              //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
    mov     v25.d[0], v24.d[1]

    st1     {v26.8b, v27.8b}, [x2], #16         //// store 1/2,1/2 grid values
    st1     {v28.h}[0], [x2], x19               //// store 1/2,1/2 grid values (last columns)

    sqrshrun v17.8b, v20.8h, #5                 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
    ext     v31.8b, v21.8b, v22.8b, #2
    sqrshrun v18.8b, v22.8h, #5                 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
    ext     v28.8b, v20.8b, v21.8b, #2

    //// horizontal 6-tap pass over the 16-bit vertical results (sets 1..5)
    saddl   v26.4s, v31.4h, v20.4h              //// a0 + a5 (set1)
    ext     v31.8b, v22.8b, v23.8b, #2
    smlal   v26.4s, v30.4h, v0.h[1]             //// a0 + a5 + 20a2 (set1)
    smlal   v26.4s, v29.4h, v0.h[1]             //// a0 + a5 + 20a2 + 20a3 (set1)
    smlsl   v26.4s, v28.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
    smlsl   v26.4s, v21.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
    ext     v30.8b, v21.8b, v22.8b, #4

    sqrshrun v19.8b, v24.8h, #5                 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
    ext     v29.8b, v21.8b, v22.8b, #6

    ext     v28.8b, v21.8b, v22.8b, #2
    saddl   v20.4s, v31.4h, v21.4h              //// a0 + a5 (set2)
    smlal   v20.4s, v30.4h, v0.h[1]             //// a0 + a5 + 20a2 (set2)
    smlal   v20.4s, v29.4h, v0.h[1]             //// a0 + a5 + 20a2 + 20a3 (set2)
    smlsl   v20.4s, v28.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
    smlsl   v20.4s, v22.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
    ext     v31.8b, v23.8b, v24.8b, #2

    ext     v17.8b, v17.8b, v18.8b, #2
    ext     v18.8b, v18.8b, v19.8b, #2
    ext     v19.8b, v19.8b, v19.8b, #2

    st1     {v17.8b, v18.8b}, [x1], x12         //// store row - (1,1/2) grid
    st1     {v19.h}[0], [x11], x12              //// store row - (1,1/2) grid (last columns)

    ext     v30.8b, v22.8b, v23.8b, #4
    ext     v29.8b, v22.8b, v23.8b, #6

    saddl   v18.4s, v31.4h, v22.4h              //// a0 + a5 (set3)
    ext     v28.8b, v22.8b, v23.8b, #2
    smlal   v18.4s, v30.4h, v0.h[1]             //// a0 + a5 + 20a2 (set3)
    smlal   v18.4s, v29.4h, v0.h[1]             //// a0 + a5 + 20a2 + 20a3 (set3)
    smlsl   v18.4s, v28.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
    smlsl   v18.4s, v23.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
    ext     v31.8b, v24.8b, v25.8b, #2

    shrn    v21.4h, v20.4s, #8                  //// shift by 8; 2 more (with rounding) later (set2)
    ext     v30.8b, v23.8b, v24.8b, #4
    shrn    v20.4h, v26.4s, #8                  //// shift by 8; 2 more (with rounding) later (set1)
    ext     v29.8b, v23.8b, v24.8b, #6

    saddl   v26.4s, v31.4h, v23.4h              //// a0 + a5 (set4)
    ext     v28.8b, v23.8b, v24.8b, #2
    ext     v31.8b, v25.8b, v25.8b, #2
    smlal   v26.4s, v30.4h, v0.h[1]             //// a0 + a5 + 20a2 (set4)
    smlal   v26.4s, v29.4h, v0.h[1]             //// a0 + a5 + 20a2 + 20a3 (set4)
    smlsl   v26.4s, v28.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
    smlsl   v26.4s, v24.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
    ext     v30.8b, v24.8b, v25.8b, #4

    saddl   v22.4s, v31.4h, v24.4h              //// a0 + a5 (set5)
    ext     v29.8b, v24.8b, v25.8b, #6

    ext     v31.8b, v24.8b, v25.8b, #2
    shrn    v28.4h, v18.4s, #8                  //// shift by 8; 2 more (with rounding) later (set3)

    ld1     {v17.8b, v18.8b, v19.8b}, [x0], x3  //// load next row of source data
    smlal   v22.4s, v30.4h, v0.h[1]             //// a0 + a5 + 20a2 (set5)
    smlal   v22.4s, v29.4h, v0.h[1]             //// a0 + a5 + 20a2 + 20a3 (set5)
    smlsl   v22.4s, v31.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
    smlsl   v22.4s, v25.4h, v0.h[0]             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
    shrn    v29.4h, v26.4s, #8                  //// shift by 8; 2 more (with rounding) later (set4)
    mov     v20.d[1], v21.d[0]
    sqrshrun v26.8b, v20.8h, #2                 //// half,half grid set1,2

    mov     v28.d[1], v29.d[0]
    sqrshrun v27.8b, v28.8h, #2                 //// half,half grid set3,4
    shrn    v28.4h, v22.4s, #8                  //// shift by 8; 2 more (with rounding) below (set5)

    sqrshrun v28.8b, v28.8h, #2                 //// half,half grid set5

    st1     {v26.8b, v27.8b}, [x2], #16         //// store 1/2,1/2 grid values
    st1     {v28.h}[0], [x2], x19               //// store 1/2,1/2 grid values (last columns)

    subs    x10, x10, #1                        //// decrement remaining-row counter

    bne     filter_2dvh_loop                    //// iterate until all rows are processed

//// //////////////  (end of loop; e.g. ROW 13 of the original schedule) //////////////

//// epilogue: restore callee-saved registers and return
    // LDMFD sp!,{x10,x11,x12,pc}               (original ARMv7 epilogue, kept for reference)
    ldp     x19, x20, [sp], #16
    pop_v_regs
    ret

//// Tail for the 17th row: only the (1/2,1/2)-grid stores pending from the
//// software pipeline remain; flush them, then restore and return.
filter_2dvh_skip_row:
    mov     v28.d[1], v29.d[0]
    sqrshrun v27.8b, v28.8h, #2                 //// half,half grid set3,4
    shrn    v28.4h, v22.4s, #8                  //// shift by 8; 2 more (with rounding) below (set5)

    sqrshrun v28.8b, v28.8h, #2                 //// half,half grid set5

    st1     {v26.8b, v27.8b}, [x2], #16         //// store 1/2,1/2 grid values
    st1     {v28.h}[0], [x2], x19               //// store 1/2,1/2 grid values (last columns)
    // LDMFD sp!,{x10,x11,x12,pc}               (original ARMv7 epilogue, kept for reference)
    ldp     x19, x20, [sp], #16
    pop_v_regs
    ret


///*****************************************