///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_luma_mode_3_to_9.s
//*
//* @brief
//*  contains function definitions for intra prediction dc filtering.
//*  functions are coded using neon intrinsics and can be compiled using
//*  rvct
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_mode_3_to_9_av8
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  luma intra prediction for modes 3 to 9 (AArch64 / NEON)
//*
//* @par description:
//*  Every output pixel is a two-tap interpolation between two neighbouring
//*  reference samples:
//*
//*      dst = (ref_a * (32 - fract) + ref_b * fract + 16) >> 5
//*
//*  where fract = ((col + 1) * intra_pred_ang) & 31 and the two samples are
//*  picked with TBL from a 16-byte window of pu1_ref. intra_pred_ang is read
//*  from gai4_ihevc_ang_table[mode]; the window position comes from the
//*  idx_neg_idx_3_9 table (one 16-byte row per mode, indexed by mode - 3).
//*  nt == 4 is handled by sz_4_proc; every other nt goes through a
//*  software-pipelined loop that emits one 8x8 tile per iteration.
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source
//*
//* @param[in] src_strd
//*  integer source stride (not read by this routine; x1 is reused as scratch)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  size of transform block
//*
//* @param[in] mode
//*  prediction mode; used as index into gai4_ihevc_ang_table and, as
//*  (mode - 3), into idx_neg_idx_3_9
//*
//* @returns
//*  nothing (result is written through pu1_dst)
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_luma_mode_3_to_9(uword8* pu1_ref,
//                                       word32 src_strd,
//                                       uword8* pu1_dst,
//                                       word32 dst_strd,
//                                       word32 nt,
//                                       word32 mode)
//
//**************variables vs registers*****************************************
// arguments (all six arrive in registers, nothing is read from the stack)
//x0 =>  *pu1_ref     (reused as the row offset value after the first load)
//x1 =>  src_strd     (overwritten with pu1_ref + 2*nt - 9)
//x2 =>  *pu1_dst
//x3 =>  dst_strd
//x4 =>  nt
//x5 =>  mode         (reused as the second destination pointer in the loop)
//
// working registers
//x6        address of the current 16-byte reference window
//x7        scratch, then 8 - 8*dst_strd
//x8        running pointer into the idx_neg_idx_3_9 row, x12 = row start
//x9        least idx value read through x8
//x10       block counter, (nt / 8) * nt, decremented by 8 per 8x8 tile
//x11       column counter, decremented by 8
//x14       running pointer into col_for_intra_luma
//x20       scratch for the csel sequences (callee-saved, spilled below)
//v0        16 reference bytes used as the TBL table
//v2/v3     constants 1 / 2          v28/v29   constants 32 / 31
//v6/v7     fract / 32 - fract       v30       intra_pred_ang (broadcast)
//v12-v15   callee-saved (low 64 bits), spilled below

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_luma_mode_3_to_9_av8
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_luma
.extern idx_neg_idx_3_9


.type ihevc_intra_pred_luma_mode_3_to_9_av8, %function

ihevc_intra_pred_luma_mode_3_to_9_av8:

    // spill the callee-saved registers this routine writes
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!

    // The word32 arguments are passed in w3/w4/w5 and AAPCS64 leaves bits
    // [63:32] unspecified. They are used below as 64-bit offsets, strides
    // and counters, so widen them once here.
    sxtw        x3, w3                      //dst_strd
    sxtw        x4, w4                      //nt
    sxtw        x5, w5                      //mode

    adrp        x7, :got:gai4_ihevc_ang_table
    ldr         x7, [x7, #:got_lo12:gai4_ihevc_ang_table]

    add         x7, x7, x5, lsl #2          //&gai4_ihevc_ang_table[mode]
    ldr         w7, [x7]                    //intra_pred_ang
    sxtw        x7,w7
    dup         v30.8b,w7                   //intra_pred_ang in every lane

    adrp        x14, :got:col_for_intra_luma
    ldr         x14, [x14, #:got_lo12:col_for_intra_luma]

    cmp         x4, #4

    beq         sz_4_proc
                                            //nt != 4 : fall through

prologue_8_16_32:
    lsr         x10, x4, #3
    ld1         {v31.8b},[x14],#8
    mul         x10, x4, x10                //block counter (dec by #8)

    mov         x11, x4                     //col counter to be inc/dec by #8
    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)

    sub         x7, x5, #3                  //mode - 3
    movi        v2.8b, #1                   //contains #1 for adding to get ref_main_idx + 1
    adrp        x12, :got:idx_neg_idx_3_9   //load least idx table
    ldr         x12, [x12, #:got_lo12:idx_neg_idx_3_9]
    movi        v3.8b, #2

    add         x12, x12, x7, lsl #4        //row for this mode (16 bytes per mode)
    mov         x8, x12

    mov         x7, #8
    sub         x7, x7, x3, lsl #3          //x7 = 8 - 8*dst_strd

    ldr         w9, [x8]
    sxtw        x9,w9
    add         x1, x0, x4, lsl #1          //pu1_ref + 2nt

    xtn         v6.8b, v22.8h               //low byte of each product
    dup         v26.8b,w9                   //least idx added to final idx values
    sub         x1, x1, #9                  //ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row

    sub         x6, x1, x9

    ld1         {v0.16b}, [x6]              //16 reference bytes starting at the least idx (TBL table)
    sshr        v22.8h, v22.8h,#5

    movi        v29.8b, #31                 //contains #31 for the and operation

    movi        v28.8b, #32

    sqxtn       v1.8b, v22.8h               //idx = product >> 5

    and         v6.8b, v6.8b , v29.8b       //fract = product & 31 (v6), idx values in v1

    mov         x0, #1

    movi        v27.8b, #7                  //row 0 to 7

    sub         v1.8b, v1.8b , v2.8b        //ref_main_idx (sub row)
    sub         v1.8b, v26.8b , v1.8b       //ref_main_idx (row 0)
    add         v1.8b, v1.8b , v27.8b       //to compensate the pu1_src idx incremented by 8
    sub         v19.8b, v1.8b , v2.8b       //ref_main_idx + 1 (row 0)
    tbl         v12.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 0)
    sub         v7.8b, v28.8b , v6.8b       //32-fract

    tbl         v13.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 0)
    sub         v4.8b, v1.8b , v2.8b        //ref_main_idx (row 1)
    sub         v5.8b, v19.8b , v2.8b       //ref_main_idx + 1 (row 1)

    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 1)
    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
    sub         v1.8b, v1.8b , v3.8b        //ref_main_idx (row 2)
    sub         v19.8b, v19.8b , v3.8b      //ref_main_idx + 1 (row 2)

    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)

    tbl         v14.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 2)
    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    tbl         v15.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 2)
    sub         v4.8b, v4.8b , v3.8b        //ref_main_idx (row 3)
    sub         v5.8b, v5.8b , v3.8b        //ref_main_idx + 1 (row 3)

    st1         {v24.8b},[x2], x3           //st (row 0)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)

    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)

    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
    sub         v1.8b, v1.8b , v3.8b        //ref_main_idx (row 4)
    sub         v19.8b, v19.8b , v3.8b      //ref_main_idx + 1 (row 4)

    st1         {v22.8b},[x2], x3           //st (row 1)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)

    tbl         v12.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 4)
    umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)

    tbl         v13.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 4)
    sub         v4.8b, v4.8b , v3.8b        //ref_main_idx (row 5)
    sub         v5.8b, v5.8b , v3.8b        //ref_main_idx + 1 (row 5)

    st1         {v20.8b},[x2], x3           //st (row 2)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)

    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 5)
    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)

    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 5)
    sub         v1.8b, v1.8b , v3.8b        //ref_main_idx (row 6)
    sub         v19.8b, v19.8b , v3.8b      //ref_main_idx + 1 (row 6)

    st1         {v18.8b},[x2], x3           //st (row 3)
    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)

    tbl         v14.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 6)
    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)

    tbl         v15.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 6)
    sub         v4.8b, v4.8b , v3.8b        //ref_main_idx (row 7)
    sub         v5.8b, v5.8b , v3.8b        //ref_main_idx + 1 (row 7)

    st1         {v24.8b},[x2], x3           //st (row 4)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)

    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    st1         {v22.8b},[x2], x3           //st (row 5)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)

    st1         {v20.8b},[x2], x3           //st (row 6)

    subs        x10, x10, #8                //subtract 8 and go to end if 8x8

    st1         {v18.8b},[x2], x3           //st (row 7)

    beq         end_func

    // Set-up for the pipelined loop. Every csel up to lbl284 (and the one
    // right after it) consumes the flags of this subs.
    subs        x11, x11, #8
    add         x20, x8, #4
    csel        x8, x20, x8,gt              //gt: next entry of the idx row
    add         x20, x2, x7
    csel        x2, x20, x2,gt              //gt: dst += 8 - 8*dst_strd
    csel        x8, x12, x8,le              //le: back to the start of the idx row
    sub         x20, x2, x4
    csel        x2, x20, x2,le              //le: dst -= nt
    add         x20, x2, #8
    csel        x2, x20, x2,le              //le: dst += 8
    csel        x11, x4, x11,le             //le: reload the col counter
    bgt         lbl284
    adrp        x14, :got:col_for_intra_luma    //le: rewind the col table pointer
    ldr         x14, [x14, #:got_lo12:col_for_intra_luma]
lbl284:
    add         x20, x0, #8
    csel        x0, x20, x0,le              //le: row offset value += 8

    mov         x5,x2                       //x5 = dst pointer (mode is no longer needed)
    ld1         {v31.8b},[x14],#8
    smull       v12.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    xtn         v23.8b, v12.8h
    sshr        v12.8h, v12.8h,#5
    sqxtn       v25.8b, v12.8h
    ldr         w9, [x8]
    sxtw        x9,w9
    add         x9, x0, x9
    sub         x9, x9, #1
    dup         v26.8b,w9
    movi        v16.8b, #8

    sub         x4,x4,#8                    //x4 = nt - 8 from here on

    // Software-pipelined loop: rows 4..7 of the previous tile are finished
    // and stored through x5 while rows 0..3 of the current tile are
    // computed and stored through x2. Flags set by the subs on x11 stay
    // live until the subs on x10 at the bottom of the loop.
kernel_8_16_32:

    sub         v1.8b, v26.8b , v25.8b      //ref_main_idx
    mov         v26.8b, v23.8b

    subs        x11, x11, #8
    sub         x6, x1, x9
    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
    add         v1.8b, v1.8b , v16.8b       //to compensate the pu1_src idx incremented by 8

    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx - 1 (row 7)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    sub         v19.8b, v1.8b , v2.8b       //ref_main_idx - 1
    add         x20, x0, #8
    csel        x0, x20, x0,le
    add         x20, x8, #4
    csel        x8, x20, x8,gt
    ld1         {v0.16b}, [x6]              //16 reference bytes starting at the least idx (TBL table)

    st1         {v24.8b},[x5], x3           //st (row 4)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)

    bgt         lbl323
    adrp        x14, :got:col_for_intra_luma
    ldr         x14, [x14, #:got_lo12:col_for_intra_luma]
lbl323:
    csel        x8, x12, x8,le
    dup         v27.8b,w0                   //row value inc or reset accordingly

    sub         v4.8b, v1.8b , v2.8b        //ref_main_idx (row 1)
    tbl         v12.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 0)
    sub         v5.8b, v19.8b , v2.8b       //ref_main_idx - 1 (row 1)


    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    tbl         v13.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 0)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    ld1         {v31.8b},[x14],#8
    and         v6.8b, v29.8b , v26.8b      //fract for the current tile

    st1         {v22.8b},[x5], x3           //(from previous loop)st (row 5)
    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)

    sub         v1.8b, v1.8b , v3.8b        //ref_main_idx (row 2)
    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 1)
    sub         v19.8b, v19.8b , v3.8b      //ref_main_idx - 1 (row 2)

    add         x20, x4, #8
    csel        x11, x20, x11,le
    ldr         w9, [x8]
    sxtw        x9,w9
    sub         v7.8b, v28.8b , v6.8b       //32-fract

    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)

    sub         v4.8b, v4.8b , v3.8b        //ref_main_idx (row 3)
    tbl         v14.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 2)
    sub         v5.8b, v5.8b , v3.8b        //ref_main_idx - 1 (row 3)

    umull       v22.8h, v23.8b, v7.8b       //mul (row 1)
    tbl         v15.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 2)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
    st1         {v18.8b},[x5], x3           //(from previous loop)st (row 7)

    sub         v1.8b, v1.8b , v3.8b        //ref_main_idx (row 4)
    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
    sub         v19.8b, v19.8b , v3.8b      //ref_main_idx - 1 (row 4)

    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)

    smull       v14.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    add         x5,x2,x3,lsl#2              //x5 = address of row 4 of this tile
    add         x9, x0, x9

    st1         {v24.8b},[x2], x3           //st (row 0)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)

    sub         v4.8b, v4.8b , v3.8b        //ref_main_idx (row 5)
    tbl         v12.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 4)
    sub         v5.8b, v5.8b , v3.8b        //ref_main_idx - 1 (row 5)

    umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
    tbl         v13.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 4)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)

    st1         {v22.8b},[x2], x3           //st (row 1)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)

    xtn         v23.8b, v14.8h
    sshr        v14.8h, v14.8h,#5

    sub         v1.8b, v1.8b , v3.8b        //ref_main_idx (row 6)
    tbl         v21.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 5)
    sub         v19.8b, v19.8b , v3.8b      //ref_main_idx - 1 (row 6)

    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 5)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)

    st1         {v20.8b},[x2], x3           //st (row 2)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)

    sub         x9, x9, #1
    sqxtn       v25.8b, v14.8h

    sub         v4.8b, v4.8b , v3.8b        //ref_main_idx (row 7)
    tbl         v14.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 6)
    sub         v5.8b, v5.8b , v3.8b        //ref_main_idx - 1 (row 7)

    umull       v22.8h, v21.8b, v7.8b       //mul (row 5)
    tbl         v15.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 6)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)

    add         v25.8b, v27.8b , v25.8b     //ref_main_idx (add row)
    dup         v26.8b,w9

    st1         {v18.8b},[x2], x3           //st (row 3)
    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)

    add         x2, x2, x3, lsl #2          //skip rows 4..7 (they are stored through x5)
    sub         v25.8b, v25.8b , v2.8b      //ref_main_idx -1 (sub 1)
    add         x20, x7, x2
    csel        x2, x20, x2,gt

    sub         x20, x2, x4
    csel        x2, x20, x2,le

    subs        x10, x10, #8                //subtract 8 and go to end if 8x8

    bne         kernel_8_16_32

    // Drain the pipeline: rows 4..7 of the last tile.
epil_8_16_32:
    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)

    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    st1         {v24.8b},[x5], x3           //st (row 4)
    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)

    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)

    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)

    st1         {v18.8b},[x5], x3           //st (row 7)

    b           end_func

    // nt == 4: four rows, only the low 4 bytes of each result are stored.
sz_4_proc:
    ld1         {v31.8b},[x14]
    movi        v2.8b, #1                   //contains #1 for adding to get ref_main_idx - 1

    movi        v3.8b, #2
    adrp        x12, :got:idx_neg_idx_3_9   //load least idx table
    ldr         x12, [x12, #:got_lo12:idx_neg_idx_3_9]

    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    sub         x7, x5, #3                  //mode - 3

    add         x12, x12, x7, lsl #4        //row for this mode (16 bytes per mode)
    mov         x8, x12

    ldr         w9, [x8]
    sxtw        x9,w9

    dup         v26.8b,w9                   //least idx added to final idx values
    add         x6, x0, x4, lsl #1          //pu1_ref + 2nt

    xtn         v6.8b, v22.8h               //low byte of each product
    sub         x6, x6, #9                  //ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row
    sub         x6, x6, x9

    ld1         {v0.16b}, [x6]              //16 reference bytes starting at the least idx (TBL table)

    movi        v29.8b, #31                 //contains #31 for the and operation

    movi        v28.8b, #32

    sshr        v22.8h, v22.8h,#5
    sqxtn       v1.8b, v22.8h               //idx = product >> 5

    and         v6.8b, v6.8b , v29.8b       //fract = product & 31 (v6), idx values in v1
    sub         v7.8b, v28.8b , v6.8b       //32-fract

    movi        v27.8b, #7                  //row 0 to 7(row-1)
    sub         v1.8b, v1.8b , v2.8b        //ref_main_idx (add 1)
    sub         v1.8b, v26.8b , v1.8b       //ref_main_idx
    add         v1.8b, v1.8b , v27.8b       //to compensate the pu1_src idx incremented by 8
    sub         v19.8b, v1.8b , v2.8b       //ref_main_idx - 1

    sub         v4.8b, v1.8b , v2.8b        //row 1 ref_main_idx
    sub         v5.8b, v19.8b , v2.8b

    tbl         v12.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 0)
    tbl         v13.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 0)


    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 1)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    sub         v1.8b, v1.8b , v3.8b        //idx (row 2)
    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
    sub         v19.8b, v19.8b , v3.8b      //idx+1 (row 2)

    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    tbl         v12.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 2)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    rshrn       v24.8b, v24.8h,#5           //round shift (row 0)

    sub         v4.8b, v4.8b , v3.8b        //idx (row 3)
    tbl         v13.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 2)
    sub         v5.8b, v5.8b , v3.8b        //idx+1 (row 3)

    umull       v20.8h, v12.8b, v7.8b       //mul (row 2)
    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
    umlal       v20.8h, v13.8b, v6.8b       //mul (row 2)

    st1         {v24.s}[0],[x2], x3         //st row 0
    rshrn       v22.8b, v22.8h,#5           //round shift (row 1)

    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)

    umull       v18.8h, v16.8b, v7.8b       //mul (row 3)
    umlal       v18.8h, v17.8b, v6.8b       //mul (row 3)

    st1         {v22.s}[0],[x2], x3         //st row 1
    rshrn       v20.8b, v20.8h,#5           //round shift (row 2)

    st1         {v20.s}[0],[x2], x3         //st row 2

    rshrn       v18.8b, v18.8h,#5           //round shift (row 3)

    st1         {v18.s}[0],[x2], x3         //st (row 3)

end_func:
    // restore the callee-saved registers in reverse order of the spills
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ret



