///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_chroma_mode_27_to_33.s
//*
//* @brief
//*  contains function definition for intra prediction interpolation filters
//*
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_intra_pred_chroma_mode_27_to_33()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

///**
//*******************************************************************************
//*
//* @brief
//*  intra prediction for modes 27 to 33 (positive angle, vertical modes), with
//*  the neighboring samples pointed to by 'pu1_ref' and the tu block
//*  destination pointed to by 'pu1_dst'
//*
//* @par description:
//*
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source (neighboring reference samples)
//*
//* @param[in] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  integer transform block size
//*
//* @param[in] mode
//*  integer intraprediction mode
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//.if intra_pred_chroma_27_to_33 == c
//void ihevc_intra_pred_chroma_mode_27_to_33(uword8 *pu1_ref,
//                                           word32 src_strd,
//                                           uword8 *pu1_dst,
//                                           word32 dst_strd,
//                                           word32 nt,
//                                           word32 mode)
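//a rough scalar equivalent of the assembly below, for orientation only --
//this is a sketch, not the library's reference c code (boundary handling and
//exact reference offsets are simplified). u and v samples are interleaved,
//so each reference step moves by 2 bytes and each row writes 2*nt bytes:
//
//    for(row = 0; row < nt; row++)
//    {
//        word32 pos   = (row + 1) * intra_pred_ang; //from gai4_ihevc_ang_table[mode]
//        word32 idx   = pos >> 5;                   //integer step along the ref row
//        word32 fract = pos & 31;                   //1/32-sample phase between uv pairs
//        for(col = 0; col < 2 * nt; col++)
//        {
//            uword8 *ref = pu1_ref + four_nt + 2 * idx + col + 2;
//            pu1_dst[row * dst_strd + col] =
//                (uword8)(((32 - fract) * ref[0] + fract * ref[2] + 16) >> 5);
//        }
//    }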
.text
.align 4
.include "ihevc_neon_macros.s"


.globl ihevc_intra_pred_chroma_mode_27_to_33_av8
.extern gai4_ihevc_ang_table
.extern gau1_ihevc_planar_factor

.type ihevc_intra_pred_chroma_mode_27_to_33_av8, %function

ihevc_intra_pred_chroma_mode_27_to_33_av8:

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

    stp         d9,d10,[sp,#-16]!
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!

    adrp        x6, :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35]
    ldr         x6, [x6, #:got_lo12:gai4_ihevc_ang_table]

    lsl         x7,x4,#2                    //four_nt

    add         x8,x6,x5,lsl #2             //&gai4_ihevc_ang_table[mode]
    ldr         w9, [x8]                    //intra_pred_ang = gai4_ihevc_ang_table[mode]
    sxtw        x9,w9
    adrp        x1, :got:gau1_ihevc_planar_factor //used for ((row + 1) * intra_pred_ang) row values
    ldr         x1, [x1, #:got_lo12:gau1_ihevc_planar_factor]
    add         x6,x1,#1

    tst         x4,#7                       //nt not a multiple of 8 -> 4x4 path
    add         x8,x0,x7                    //pu1_ref + four_nt
    mov         x14,#0                      //fract_prev = 0
    mov         x12,x4
    bne         core_loop_4
    lsl         x4,x4,#1                    //chroma rows are 2*nt bytes (interleaved u and v)
    b           core_loop_8

core_loop_8:
    add         x8,x8,#2                    //pu1_ref_main_idx += (four_nt + 2)
    dup         v0.8b,w9                    //intra_pred_ang
    lsr         x12, x4, #4                 //divide by 16 -> number of 8-byte column blocks

    movi        v1.8b, #32
    mul         x7, x4, x12                 //loop counter: 8 per block of 8 rows x 8 bytes

    movi        v6.8h, #31

    mov         x1,x8                       //save the ref base for row rewinds
    mov         x5,x4                       //save 2*nt
    mov         x11,#2                      //ref step: one interleaved uv pair
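//the 8x8 path is software pipelined: eight rows, tagged (i)..(viii) in the
//comments, are kept in flight so that the loads for later rows overlap the
//multiplies and stores of earlier ones. 'prologue' primes the pipeline,
//'kernel_8_rows' runs it in steady state, and 'epilogue' drains it.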
prologue:
    ld1         {v3.8b},[x6]                //loads the row value
    umull       v2.8h, v3.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
    and         v4.16b, v2.16b , v6.16b     //dup_const_fract(fract = pos & (31))
    xtn         v4.8b, v4.8h
    shrn        v5.8b, v2.8h,#5             //idx = pos >> 5

    dup         v31.8b, v4.b[0]             //(i)
    add         x0,x2,x3                    //pu1_dst + dst_strd

    smov        x14, v5.s[0]                //(i row)extract idx to the r register
    lsl         x14,x14,#1                  //idx *= 2 (uv pairs)

    dup         v29.8b, v4.b[1]             //(ii)
    and         x9,x14,#0xff                //(i row) get the last byte

    add         x10,x8,x9                   //(i row)*pu1_ref[ref_main_idx]

    asr         x14,x14,#8                  //(ii)shift by 8
    ld1         {v23.8b},[x10],x11          //(i row)ref_main_idx
    and         x9,x14,#0xff                //(ii)get the last byte

    asr         x14,x14,#8                  //(iii)
    ld1         {v9.8b},[x10]               //(i row)ref_main_idx_1
    add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]

    and         x9,x14,#0xff                //(iii)
    sub         v30.8b, v1.8b , v31.8b      //32-fract(dup_const_32_fract)
    add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]

    ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
    umull       v10.8h, v23.8b, v30.8b      //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)

    ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
    umlal       v10.8h, v9.8b, v31.8b       //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
    asr         x14,x14,#8                  //(iv)

    dup         v27.8b, v4.b[2]             //(iii)
    sub         v28.8b, v1.8b , v29.8b      //(ii)32-fract(dup_const_32_fract)
    and         x9,x14,#0xff                //(iv)

    dup         v25.8b, v4.b[3]             //(iv)
    umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]

    ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
    umlal       v14.8h, v13.8b, v29.8b      //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
    rshrn       v10.8b, v10.8h,#5           //(i row)shift_res = vrshrn_n_u16(add_res, 5)

    ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
    sub         v26.8b, v1.8b , v27.8b      //(iii)32-fract(dup_const_32_fract)

    ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1

    dup         v31.8b, v4.b[4]             //(v)
    umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)

    smov        x14, v5.s[1]                //extract idx to the r register
    umlal       v18.8h, v17.8b, v27.8b      //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
    lsl         x14,x14,#1                  //idx *= 2 (uv pairs)

    st1         {v10.8b},[x2],#8            //(i row)
    rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)

    and         x9,x14,#0xff                //(v)
    dup         v29.8b, v4.b[5]             //(vi)
    add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]

    ld1         {v23.8b},[x10],x11          //(v)ref_main_idx
    sub         v24.8b, v1.8b , v25.8b      //(iv)32-fract(dup_const_32_fract)

    asr         x14,x14,#8                  //(vi)
    umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    and         x9,x14,#0xff                //(vi)

    ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
    umlal       v22.8h, v21.8b, v25.8b      //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)

    st1         {v14.8b},[x0],x3            //(ii)
    rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)

    add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]
    dup         v27.8b, v4.b[6]             //(vii)
    asr         x14,x14,#8                  //(vii)

    and         x9,x14,#0xff                //(vii)
    sub         v30.8b, v1.8b , v31.8b      //(v)32-fract(dup_const_32_fract)
    add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]

    ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
    umull       v10.8h, v23.8b, v30.8b      //(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
    umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)

    st1         {v18.8b},[x0],x3            //(iii)
    rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)

    asr         x14,x14,#8                  //(viii)
    dup         v25.8b, v4.b[7]             //(viii)
    and         x9,x14,#0xff                //(viii)

    ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
    sub         v28.8b, v1.8b , v29.8b      //(vi)32-fract(dup_const_32_fract)

    ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
    umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]
    umlal       v14.8h, v13.8b, v29.8b      //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    subs        x7,x7,#8

    st1         {v22.8b},[x0],x3            //(iv)
    rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)

    ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
    sub         v26.8b, v1.8b , v27.8b      //(vii)32-fract(dup_const_32_fract)

    ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
    umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    add         x20,x8,#8
    csel        x8, x20, x8,gt              //advance the ref pointer to the next 8 columns
    umlal       v18.8h, v17.8b, v27.8b      //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    sub         x20,x4,#8
    csel        x4, x20, x4,gt

    st1         {v10.8b},[x0],x3            //(v)
    rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)

    beq         epilogue

    ld1         {v5.8b},[x6]                //loads the row value
    umull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
    and         v4.16b, v2.16b , v6.16b     //dup_const_fract(fract = pos & (31))
    xtn         v4.8b, v4.8h
    shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
    smov        x14, v3.s[0]                //(i)extract idx to the r register
    lsl         x14,x14,#1                  //idx *= 2 (uv pairs)
    and         x9,x14,#0xff                //(i)
    add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]
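//steady-state loop: each iteration finishes one block of eight rows x eight
//bytes while priming the next. x7 steps down by 8 per block; on a row-group
//boundary (le) the ref pointer, row counter, and dst pointer are rewound,
//otherwise (gt) the ref pointer advances to the next 8 columns.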
kernel_8_rows:
    asr         x14,x14,#8                  //(ii)
    dup         v31.8b, v4.b[0]             //(i)
    subs        x4,x4,#8

    ld1         {v23.8b},[x10],x11          //(i)ref_main_idx
    sub         v24.8b, v1.8b , v25.8b      //(viii)32-fract(dup_const_32_fract)
    and         x9,x14,#0xff                //(ii)
    add         x20,x6,#8                   //increment the row value
    csel        x6, x20, x6,le

    ld1         {v9.8b},[x10]               //(i)ref_main_idx_1
    umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]

    ld1         {v5.8b},[x6]                //loads the row value
    umlal       v22.8h, v21.8b, v25.8b      //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
    asr         x14,x14,#8                  //(iii)

    dup         v29.8b, v4.b[1]             //(ii)
    rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)
    and         x9,x14,#0xff                //(iii)

    st1         {v14.8b},[x0],x3            //(vi)
    sub         v30.8b, v1.8b , v31.8b      //(i)32-fract(dup_const_32_fract)
    add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]

    ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
    umull       v10.8h, v23.8b, v30.8b      //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
    asr         x14,x14,#8                  //(iv)

    ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
    umlal       v10.8h, v9.8b, v31.8b       //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
    and         x9,x14,#0xff                //(iv)

    smov        x14, v3.s[1]                //extract idx to the r register
    rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)

    dup         v27.8b, v4.b[2]             //(iii)
    sub         v28.8b, v1.8b , v29.8b      //(ii)32-fract(dup_const_32_fract)
    csel        x4, x5, x4,le               //reload the column counter (2*nt)

    ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
    umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]

    st1         {v18.8b},[x0],x3            //(vii)
    umlal       v14.8h, v13.8b, v29.8b      //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
    rshrn       v10.8b, v10.8h,#5           //(i)shift_res = vrshrn_n_u16(add_res, 5)

    dup         v25.8b, v4.b[3]             //(iv)
    umull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)

    st1         {v22.8b},[x0]               //(viii)
    sub         v26.8b, v1.8b , v27.8b      //(iii)32-fract(dup_const_32_fract)

    ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
    umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
    lsl         x14,x14,#1                  //idx *= 2 (uv pairs)

    ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1
    umlal       v18.8h, v17.8b, v27.8b      //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         x0,x2,x3                    //pu1_dst + dst_strd

    dup         v31.8b, v4.b[4]             //(v)
    rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)
    and         x9,x14,#0xff                //(v)

    st1         {v10.8b},[x2],#8            //(i)
    sub         v24.8b, v1.8b , v25.8b      //(iv)32-fract(dup_const_32_fract)
    add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]

    dup         v29.8b, v4.b[5]             //(vi)
    umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    asr         x14,x14,#8                  //(vi)

    dup         v27.8b, v4.b[6]             //(vii)
    umlal       v22.8h, v21.8b, v25.8b      //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
    and         x9,x14,#0xff                //(vi)

    dup         v25.8b, v4.b[7]             //(viii)
    rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)
    add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]

    ld1         {v23.8b},[x10],x11          //(v)ref_main_idx
    and         v4.16b, v2.16b , v6.16b     //dup_const_fract(fract = pos & (31))
    asr         x14,x14,#8                  //(vii)

    ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
    shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
    and         x9,x14,#0xff                //(vii)

    st1         {v14.8b},[x0],x3            //(ii)
    rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)
    asr         x14,x14,#8                  //(viii)

    ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
    sub         v30.8b, v1.8b , v31.8b      //(v)32-fract(dup_const_32_fract)
    add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]

    ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
    umull       v10.8h, v23.8b, v30.8b      //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
    and         x9,x14,#0xff                //(viii)

    smov        x14, v3.s[0]                //(i)extract idx to the r register
    umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]

    ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
    sub         v28.8b, v1.8b , v29.8b      //(vi)32-fract(dup_const_32_fract)

    st1         {v18.8b},[x0],x3            //(iii)
    umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
    csel        x8, x1, x8,le               //rewind the ref pointer to the saved row base

    ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
    umlal       v14.8h, v13.8b, v29.8b      //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         x20,x8,#8                   //increment the source to the next set of 8 columns in the same row
    csel        x8, x20, x8,gt

    ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
    rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)

    ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
    sub         v26.8b, v1.8b , v27.8b      //(vii)32-fract(dup_const_32_fract)
    lsl         x20, x3,#3                  //8*dst_strd
    csel        x12,x20,x12,le

    st1         {v22.8b},[x0],x3            //(iv)
    umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
    sub         x20,x12,x5                  //8*dst_strd - 2*nt
    csel        x12, x20, x12,le

    st1         {v10.8b},[x0],x3            //(v)
    umlal       v18.8h, v17.8b, v27.8b      //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         x20,x2,x12                  //increment the dst pointer by 8*dst_strd - 2*nt
    csel        x2, x20, x2,le

    xtn         v4.8b, v4.8h
    rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)
    lsl         x14,x14,#1                  //idx *= 2 (uv pairs)

    and         x9,x14,#0xff                //(i)
    subs        x7,x7,#8
    add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]

    bne         kernel_8_rows
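//the loop exits with rows (vi)..(viii) still in flight: flush their
//remaining multiply, round, and store stages before returning.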
epilogue:
    st1         {v14.8b},[x0],x3            //(vi)
    rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)

    sub         v24.8b, v1.8b , v25.8b      //(viii)32-fract(dup_const_32_fract)
    umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    umlal       v22.8h, v21.8b, v25.8b      //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)

    st1         {v18.8b},[x0],x3            //(vii)
    rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)

    st1         {v22.8b},[x0],x3            //(viii)
    b           end_loops
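//4x4 path (nt = 4): no software pipelining. each row loads 8 bytes (four uv
//pairs), and the ref pointers advance by one uv pair only when the integer
//part of pos steps, detected by the fractional part wrapping
//(fract_prev > fract).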
core_loop_4:
    add         x10,x8,#2                   //pu1_ref_main_idx += (four_nt + 2)
    add         x11,x8,#4                   //pu1_ref_main_idx_1 += (four_nt + 4)
    mov         x8,#0                       //row

    add         x5,x8,#1                    //row + 1
    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    and         x5,x5,#31                   //fract = pos & (31)
    cmp         x14,x5                      //if(fract_prev > fract)
    add         x20,x10,#2                  //pu1_ref_main_idx += 2
    csel        x10, x20, x10,gt
    add         x11,x10,#2                  //pu1_ref_main_idx_1 += 2
    dup         v0.8b,w5                    //dup_const_fract
    sub         x20,x5,#32
    neg         x4, x20
    dup         v1.8b,w4                    //dup_const_32_fract

//inner_loop_4
    ld1         {v2.8b},[x10]               //ref_main_idx
    add         x8,x8,#1
    mov         x14,x5                      //fract_prev = fract

    ld1         {v3.8b},[x11]               //ref_main_idx_1
    add         x5,x8,#1                    //row + 1
    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    and         x5,x5,#31                   //fract = pos & (31)
    cmp         x14,x5                      //if(fract_prev > fract)
    add         x20,x10,#2                  //pu1_ref_main_idx += 2
    csel        x10, x20, x10,gt
    add         x11,x10,#2                  //pu1_ref_main_idx_1 += 2

    dup         v6.8b,w5                    //dup_const_fract
    umull       v4.8h, v2.8b, v1.8b         //vmull_u8(ref_main_idx, dup_const_32_fract)

    sub         x20,x5,#32
    neg         x4, x20
    dup         v7.8b,w4                    //dup_const_32_fract
    umlal       v4.8h, v3.8b, v0.8b         //vmull_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v23.8b},[x10]              //ref_main_idx
    add         x8,x8,#1

    ld1         {v9.8b},[x11]               //ref_main_idx_1
    rshrn       v4.8b, v4.8h,#5             //shift_res = vrshrn_n_u16(add_res, 5)

    mov         x14,x5                      //fract_prev = fract
    add         x5,x8,#1                    //row + 1
    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    and         x5,x5,#31                   //fract = pos & (31)
    cmp         x14,x5                      //if(fract_prev > fract)
    add         x20,x10,#2                  //pu1_ref_main_idx += 2
    csel        x10, x20, x10,gt
    add         x11,x10,#2                  //pu1_ref_main_idx_1 += 2

    dup         v12.8b,w5                   //dup_const_fract
    umull       v10.8h, v23.8b, v7.8b       //vmull_u8(ref_main_idx, dup_const_32_fract)

    sub         x20,x5,#32
    neg         x4, x20
    dup         v13.8b,w4                   //dup_const_32_fract
    umlal       v10.8h, v9.8b, v6.8b        //vmull_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v14.8b},[x10]              //ref_main_idx
    add         x8,x8,#1

    st1         {v4.8b},[x2],x3
    rshrn       v10.8b, v10.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)

    ld1         {v15.8b},[x11]              //ref_main_idx_1
    mov         x14,x5                      //fract_prev = fract
    add         x5,x8,#1                    //row + 1
    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    and         x5,x5,#31                   //fract = pos & (31)
    cmp         x14,x5                      //if(fract_prev > fract)
    add         x20,x10,#2                  //pu1_ref_main_idx += 2
    csel        x10, x20, x10,gt
    add         x11,x10,#2                  //pu1_ref_main_idx_1 += 2

    dup         v18.8b,w5                   //dup_const_fract
    umull       v16.8h, v14.8b, v13.8b      //vmull_u8(ref_main_idx, dup_const_32_fract)

    sub         x20,x5,#32
    neg         x4, x20
    dup         v19.8b,w4                   //dup_const_32_fract
    umlal       v16.8h, v15.8b, v12.8b      //vmull_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v20.8b},[x10]              //ref_main_idx

    st1         {v10.8b},[x2],x3
    rshrn       v16.8b, v16.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
    ld1         {v21.8b},[x11]              //ref_main_idx_1

    umull       v22.8h, v20.8b, v19.8b      //vmull_u8(ref_main_idx, dup_const_32_fract)
    umlal       v22.8h, v21.8b, v18.8b      //vmull_u8(ref_main_idx_1, dup_const_fract)

    st1         {v16.8b},[x2],x3
    rshrn       v22.8b, v22.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)

    st1         {v22.8b},[x2],x3

end_loops:
    // ldmfd sp!,{x4-x12,x15}               //reload the registers from sp
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d9,d10,[sp],#16
    ret