///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
// *******************************************************************************
// * @file
// *  ihevc_itrans_recon_16x16_neon.s
// *
// * @brief
// *  contains function definitions for single stage inverse transform
// *
// * @author
// *  anand s
// *
// * @par list of functions:
// *  - ihevc_itrans_recon_16x16()
// *
// * @remarks
// *  none
// *
// *******************************************************************************
//*/

///**
// *******************************************************************************
// *
// * @brief
// *  this function performs inverse transform and reconstruction for 16x16
// * input block
// *
// * @par description:
// *  performs inverse transform and adds the prediction data and clips output
// * to 8 bit
// *
// * @param[in] pi2_src
// *  input 16x16 coefficients
// *
// * @param[in] pi2_tmp
// *  temporary 16x16 buffer for storing inverse transform 1st stage output
// *
// * @param[in] pu1_pred
// *  prediction 16x16 block
// *
// * @param[out] pu1_dst
// *  output 16x16 block
// *
// * @param[in] src_strd
// *  input stride
// *
// * @param[in] pred_strd
// *  prediction stride
// *
// * @param[in] dst_strd
// *  output stride
// *
// * @param[in] x12
// *  zero columns in pi2_src
// *
// * @param[in] x11
// *  zero rows in pi2_src
// *
// * @returns void
// *
// * @remarks
// *  none
// *
// *******************************************************************************
// */

//void ihevc_itrans_recon_16x16(word16 *pi2_src,
//                              word16 *pi2_tmp,
//                              uword8 *pu1_pred,
//                              uword8 *pu1_dst,
//                              word32 src_strd,
//                              word32 pred_strd,
//                              word32 dst_strd,
//                              word32 x12,
//                              word32 x11)

//**************variables vs registers*************************
//    x0 => *pi2_src
//    x1 => *pi2_tmp
//    x2 => *pu1_pred
//    x3 => *pu1_dst
//    src_strd
//    pred_strd
//    dst_strd
//    x12
//    x11

.text
.align 4

.include "ihevc_neon_macros.s"

.set shift_stage1_idct , 7
.set shift_stage2_idct , 12
//#define zero_cols  x12
//#define zero_rows  x11
.globl ihevc_itrans_recon_16x16_av8

.extern g_ai2_ihevc_trans_16_transpose

.type ihevc_itrans_recon_16x16_av8, %function

ihevc_itrans_recon_16x16_av8:

    ldr w11, [sp]                       // zero_rows, passed on the stack
    // stmfd sp!,{x4-x12,x14}
    push_v_regs
    stp x19, x20,[sp,#-16]!
    stp x5, x6,[sp,#-16]!               // save pred_strd and dst_strd
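// note: per aapcs64, w4-w6 carry src_strd/pred_strd/dst_strd, w7 carries
// zero_cols, and zero_rows arrives on the stack (hence the ldr w11,[sp]
// above); pred_strd/dst_strd (x5,x6) are spilled here and reloaded into
// x8/x7 once stage 1 is done, so both registers can serve as row offsets
// in the meantime. stage 1 runs the 16-point inverse transform over the
// input in up to four passes of four columns each (x14 counts the passes;
// zero_cols trims them to one or two), writing a transposed intermediate
// to pi2_tmp. stage 2 repeats the kernel on that intermediate, then adds
// the prediction block and clips the result to 8 bit.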
// add sp,sp,#40


// ldr x8,[sp,#4]  @ prediction stride
// ldr x7,[sp,#8]  @ destination stride
    mov x6, x4 // src stride
    mov x12, x7 // zero_cols


    adrp x14, :got:g_ai2_ihevc_trans_16_transpose
    ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_16_transpose]
    ld1 {v0.4h, v1.4h, v2.4h, v3.4h},[x14] ////d0,d1 are used for storing the constant data
    mov x7,#0xffff
    and x12,x12,x7 // keep only the low 16 bits of zero_cols
    and x11,x11,x7 // keep only the low 16 bits of zero_rows
    lsl x6, x6, #1 // src_strd * sizeof(word16)
    add x9,x0,x6, lsl #1 // 2 rows

    add x10,x6,x6, lsl #1 // 3 rows
    add x5,x6,x6,lsl #2 // 5 rows
    mov x7,#0xfff0

    cmp x12,x7 // only the first 4 columns are non-zero?
    bge zero_12cols_decision

    mov x19,#0xff00
    cmp x12,x19 // only the first 8 columns are non-zero?
    bge zero_8cols_decision


    mov x14,#4
    cmp x11,x7
    sub x20,x6,#0
    neg x20, x20
    csel x10,x20,x10,ge

    mov x19,#0xff00
    cmp x11,x19
    csel x8, x5, x8,ge
    sub x20,x8,#0
    neg x20, x20
    csel x8,x20,x8,ge
    csel x8, x10, x8,lt
    add x5,x5,x6,lsl #3
    sub x20,x5,#0
    neg x5, x20

    b first_stage_top_four_bottom_four

zero_12cols_decision:
    mov x14,#1
    mov x19,#0xff00
    cmp x11,x19
    csel x8, x5, x8,ge
    csel x8, x10, x8,lt
    add x5,x5,x6,lsl #3
    sub x20,x5,#0
    neg x5, x20

    b first_stage_top_four_bottom_four

zero_8cols_decision:
    mov x14,#2
    mov x8,x5
    sub x20,x8,#0
    neg x8, x20
    mov x19,#0xff00
    cmp x11,x19
    csel x8, x10, x8,lt
    add x5,x5,x6,lsl #3
    sub x20,x5,#0
    neg x5, x20
    cmp x11,x7
    sub x20,x6,#0
    neg x20, x20
    csel x10,x20,x10,ge


    b first_stage_top_four_bottom_four


//d0[0]= 64 d2[0]=64
//d0[1]= 90 d2[1]=57
//d0[2]= 89 d2[2]=50
//d0[3]= 87 d2[3]=43
//d1[0]= 83 d3[0]=36
//d1[1]= 80 d3[1]=25
//d1[2]= 75 d3[2]=18
//d1[3]= 70 d3[3]=9
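// note: the table above lists the distinct coefficient magnitudes of the
// hevc 16-point inverse dct, loaded into d0-d3 from
// g_ai2_ihevc_trans_16_transpose. the kernels below build, per output group,
// an even partial sum a[n] (even input rows, accumulators v12-v18) and an
// odd partial sum b[n] (odd input rows, accumulators v24-v30), then combine
// them as a[n]+b[n] / a[n]-b[n]. a scalar c model of one 16-point inverse
// transform, for reference only (coefficient table as in the hevc 16-point
// transform matrix; function and variable names are illustrative):
//
//  #include <stdint.h>
//  static const int16_t t16[16][8] = { /* left half; right half by symmetry */
//      { 64, 64, 64, 64, 64, 64, 64, 64 },
//      { 90, 87, 80, 70, 57, 43, 25,  9 },
//      { 89, 75, 50, 18,-18,-50,-75,-89 },
//      { 87, 57,  9,-43,-80,-90,-70,-25 },
//      { 83, 36,-36,-83,-83,-36, 36, 83 },
//      { 80,  9,-70,-87,-25, 57, 90, 43 },
//      { 75,-18,-89,-50, 50, 89, 18,-75 },
//      { 70,-43,-87,  9, 90, 25,-80,-57 },
//      { 64,-64,-64, 64, 64,-64,-64, 64 },
//      { 57,-80,-25, 90, -9,-87, 43, 70 },
//      { 50,-89, 18, 75,-75,-18, 89,-50 },
//      { 43,-90, 57, 25,-87, 70,  9,-80 },
//      { 36,-83, 83,-36,-36, 83,-83, 36 },
//      { 25,-70, 90,-80, 43,  9,-57, 87 },
//      { 18,-50, 75,-89, 89,-75, 50,-18 },
//      {  9,-25, 43,-57, 70,-80, 87,-90 },
//  };
//  static void inv_dct16_1d(const int16_t *src, int stride, int16_t *dst,
//                           int shift)
//  {
//      const int32_t rnd = 1 << (shift - 1);
//      for (int n = 0; n < 8; n++) {
//          int32_t a = 0, b = 0;                 /* even / odd partial sums */
//          for (int k = 0; k < 8; k++) {
//              a += src[2 * k * stride] * t16[2 * k][n];
//              b += src[(2 * k + 1) * stride] * t16[2 * k + 1][n];
//          }
//          int32_t lo = (a + b + rnd) >> shift;  /* output n               */
//          int32_t hi = (a - b + rnd) >> shift;  /* output 15-n: odd rows  */
//          /* saturate to int16, as sqrshrn does  flip sign at column 15-n */
//          dst[n]      = lo < -32768 ? -32768 : lo > 32767 ? 32767 : lo;
//          dst[15 - n] = hi < -32768 ? -32768 : hi > 32767 ? 32767 : hi;
//      }
//  }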
first_stage:
    add x0,x0,#8
    add x9,x9,#8

first_stage_top_four_bottom_four:

    ld1 {v10.4h},[x0],x6
    ld1 {v11.4h},[x9],x6
    ld1 {v6.4h},[x0],x10
    ld1 {v7.4h},[x9],x10
    cmp x11,x7
    bge skip_load4rows

    ld1 {v4.4h},[x0],x6
    ld1 {v5.4h},[x9],x6
    ld1 {v8.4h},[x0],x8
    ld1 {v9.4h},[x9],x8

// registers used: q0,q1,q3,q5,q2,q4

//d10=x0
//d6 =x1
//d11=x2
//d7 =x3

skip_load4rows:
    smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
    smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
    smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)

    smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlal v26.4s, v7.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
    smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)


    smull v12.4s, v10.4h, v0.h[0]
    smlal v12.4s, v11.4h, v0.h[2]
    smull v14.4s, v10.4h, v0.h[0]
    smlal v14.4s, v11.4h, v1.h[2]
    smull v16.4s, v10.4h, v0.h[0]
    smlal v16.4s, v11.4h, v2.h[2]
    smull v18.4s, v10.4h, v0.h[0]
    smlal v18.4s, v11.4h, v3.h[2]

    bge skip_last12rows_kernel1


    smlal v24.4s, v8.4h, v1.h[1]
    smlal v26.4s, v8.4h, v3.h[3]
    smlsl v28.4s, v8.4h, v1.h[3]
    smlsl v30.4s, v8.4h, v0.h[3]


    smlal v24.4s, v9.4h, v1.h[3]
    smlsl v26.4s, v9.4h, v2.h[3]
    smlsl v28.4s, v9.4h, v0.h[3]
    smlal v30.4s, v9.4h, v3.h[3]


    smlal v12.4s, v4.4h, v1.h[0]
    smlal v12.4s, v5.4h, v1.h[2]
    smlal v14.4s, v4.4h, v3.h[0]
    smlsl v14.4s, v5.4h, v3.h[2]
    smlsl v16.4s, v4.4h, v3.h[0]
    smlsl v16.4s, v5.4h, v0.h[2]
    smlsl v18.4s, v4.4h, v1.h[0]
    smlsl v18.4s, v5.4h, v2.h[2]

//d0[0]= 64 d2[0]=64
//d0[1]= 90 d2[1]=57
//d0[2]= 89 d2[2]=50
//d0[3]= 87 d2[3]=43
//d1[0]= 83 d3[0]=36
//d1[1]= 80 d3[1]=25
//d1[2]= 75 d3[2]=18
//d1[3]= 70 d3[3]=9
    mov x19,#0xff00
    cmp x11,x19
    bge skip_last12rows_kernel1


    ld1 {v10.4h},[x0],x6
    ld1 {v11.4h},[x9],x6
    ld1 {v6.4h},[x0],x10
    ld1 {v7.4h},[x9],x10
    ld1 {v4.4h},[x0],x6
    ld1 {v5.4h},[x9],x6
    ld1 {v8.4h},[x0],x5
    ld1 {v9.4h},[x9],x5


    smlal v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
    smlsl v26.4s, v6.4h, v1.h[1] //// y1 * cos3(part of b1)
    smlsl v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
    smlal v30.4s, v6.4h, v0.h[1] //// y1 * sin1(part of b3)

    smlal v24.4s, v7.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
    smlal v28.4s, v7.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlal v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)


    smlal v24.4s, v8.4h, v3.h[1]
    smlsl v26.4s, v8.4h, v1.h[3]
    smlal v28.4s, v8.4h, v0.h[1]
    smlsl v30.4s, v8.4h, v1.h[1]


    smlal v24.4s, v9.4h, v3.h[3]
    smlsl v26.4s, v9.4h, v3.h[1]
    smlal v28.4s, v9.4h, v2.h[3]
    smlsl v30.4s, v9.4h, v2.h[1]


    smlal v12.4s, v10.4h, v0.h[0]
    smlal v12.4s, v11.4h, v2.h[2]
    smlal v12.4s, v4.4h, v3.h[0]
    smlal v12.4s, v5.4h, v3.h[2]


    smlsl v14.4s, v10.4h, v0.h[0]
    smlsl v14.4s, v11.4h, v0.h[2]
    smlsl v14.4s, v4.4h, v1.h[0]
    smlsl v14.4s, v5.4h, v2.h[2]


    smlsl v16.4s, v10.4h, v0.h[0]
    smlal v16.4s, v11.4h, v3.h[2]
    smlal v16.4s, v4.4h, v1.h[0]
    smlal v16.4s, v5.4h, v1.h[2]


    smlal v18.4s, v10.4h, v0.h[0]
    smlal v18.4s, v11.4h, v1.h[2]
    smlsl v18.4s, v4.4h, v3.h[0]
    smlsl v18.4s, v5.4h, v0.h[2]

skip_last12rows_kernel1:
    add v20.4s, v12.4s , v24.4s
    sub v22.4s, v12.4s , v24.4s

    add v12.4s, v14.4s , v26.4s
    sub v24.4s, v14.4s , v26.4s

    add v14.4s, v16.4s , v28.4s
    sub v26.4s, v16.4s , v28.4s


    add v16.4s, v18.4s , v30.4s
    sub v28.4s, v18.4s , v30.4s


    sqrshrn v30.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v19.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)

    st1 {v30.4h, v31.4h},[x1],#16
    st1 {v18.4h, v19.4h},[x1],#16
    sub x1,x1,#32
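// note: the sqrshrn narrowing above does the whole "add rounding constant,
// shift right, saturate to int16" step in one instruction; a c model of one
// lane (helper name is illustrative):
//
//  #include <stdint.h>
//  static inline int16_t sqrshrn_lane(int32_t x, int shift)
//  {
//      int32_t v = (x + (1 << (shift - 1))) >> shift;  /* round to nearest */
//      return v < -32768 ? -32768 : v > 32767 ? 32767 : (int16_t)v;
//  }
//
// with shift = shift_stage1_idct (7) here and shift_stage2_idct (12) in the
// second stage, matching the hevc inverse-transform scaling.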
    bge skip_stage1_kernel_load

first_stage_middle_eight:

    ld1 {v10.4h},[x0],x6
    ld1 {v11.4h},[x9],x6
    ld1 {v6.4h},[x0],x10
    ld1 {v7.4h},[x9],x10
    ld1 {v4.4h},[x0],x6
    ld1 {v5.4h},[x9],x6
    ld1 {v8.4h},[x0],x8
    ld1 {v9.4h},[x9],x8


skip_stage1_kernel_load:
    smull v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
    smull v26.4s, v6.4h, v2.h[3] //// y1 * cos3(part of b1)
    smull v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
    smull v30.4s, v6.4h, v3.h[3] //// y1 * sin1(part of b3)

    smlsl v24.4s, v7.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v7.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)


    smull v22.4s, v10.4h, v0.h[0]
    smlsl v22.4s, v11.4h, v3.h[2]
    smull v20.4s, v10.4h, v0.h[0]
    smlsl v20.4s, v11.4h, v2.h[2]
    smull v16.4s, v10.4h, v0.h[0]
    smlsl v16.4s, v11.4h, v1.h[2]
    smull v18.4s, v10.4h, v0.h[0]
    smlsl v18.4s, v11.4h, v0.h[2]


    cmp x11,x7
    bge skip_last12rows_kernel2

    smlsl v24.4s, v8.4h, v3.h[1]
    smlal v26.4s, v8.4h, v2.h[1]
    smlal v28.4s, v8.4h, v0.h[1]
    smlal v30.4s, v8.4h, v2.h[3]


    smlal v24.4s, v9.4h, v0.h[1]
    smlal v26.4s, v9.4h, v3.h[1]
    smlsl v28.4s, v9.4h, v1.h[1]
    smlsl v30.4s, v9.4h, v2.h[1]


    smlsl v22.4s, v4.4h, v1.h[0]
    smlal v22.4s, v5.4h, v2.h[2]
    smlsl v20.4s, v4.4h, v3.h[0]
    smlal v20.4s, v5.4h, v0.h[2]
    smlal v16.4s, v4.4h, v3.h[0]
    smlal v16.4s, v5.4h, v3.h[2]
    smlal v18.4s, v4.4h, v1.h[0]
    smlsl v18.4s, v5.4h, v1.h[2]

//d0[0]= 64 d2[0]=64
//d0[1]= 90 d2[1]=57
//d0[2]= 89 d2[2]=50
//d0[3]= 87 d2[3]=43
//d1[0]= 83 d3[0]=36
//d1[1]= 80 d3[1]=25
//d1[2]= 75 d3[2]=18
//d1[3]= 70 d3[3]=9
    mov x19,#0xff00
    cmp x11,x19
    bge skip_last12rows_kernel2

    ld1 {v10.4h},[x0],x6
    ld1 {v11.4h},[x9],x6
    ld1 {v6.4h},[x0],x10
    ld1 {v7.4h},[x9],x10
    ld1 {v4.4h},[x0],x6
    ld1 {v5.4h},[x9],x6
    ld1 {v8.4h},[x0],x5
    ld1 {v9.4h},[x9],x5


    smlsl v24.4s, v6.4h, v3.h[3] //// y1 * cos1(part of b0)
    smlsl v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    smlal v28.4s, v6.4h, v2.h[3] //// y1 * sin3(part of b2)
    smlal v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)

    smlsl v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlal v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)


    smlal v24.4s, v8.4h, v2.h[3]
    smlal v26.4s, v8.4h, v3.h[3]
    smlsl v28.4s, v8.4h, v2.h[1]
    smlal v30.4s, v8.4h, v0.h[3]


    smlal v24.4s, v9.4h, v1.h[3]
    smlsl v26.4s, v9.4h, v1.h[1]
    smlal v28.4s, v9.4h, v0.h[3]
    smlsl v30.4s, v9.4h, v0.h[1]


    smlal v22.4s, v10.4h, v0.h[0]
    smlsl v22.4s, v11.4h, v1.h[2]
    smlsl v22.4s, v4.4h, v3.h[0]
    smlal v22.4s, v5.4h, v0.h[2]


    smlsl v20.4s, v10.4h, v0.h[0]
    smlsl v20.4s, v11.4h, v3.h[2]
    smlal v20.4s, v4.4h, v1.h[0]
    smlsl v20.4s, v5.4h, v1.h[2]


    smlsl v16.4s, v10.4h, v0.h[0]
    smlal v16.4s, v11.4h, v0.h[2]
    smlsl v16.4s, v4.4h, v1.h[0]
    smlal v16.4s, v5.4h, v2.h[2]


    smlal v18.4s, v10.4h, v0.h[0]
    smlsl v18.4s, v11.4h, v2.h[2]
    smlal v18.4s, v4.4h, v3.h[0]
    smlsl v18.4s, v5.4h, v3.h[2]

skip_last12rows_kernel2:

    add v4.4s, v22.4s , v24.4s
    sub v22.4s, v22.4s , v24.4s

    add v6.4s, v20.4s , v26.4s
    sub v24.4s, v20.4s , v26.4s

    add v10.4s, v16.4s , v28.4s
    sub v26.4s, v16.4s , v28.4s


    add v16.4s, v18.4s , v30.4s
    sub v28.4s, v18.4s , v30.4s


    sqrshrn v18.4h, v4.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v31.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v30.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v20.4h, v6.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v23.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v21.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v22.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)


    // registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11}

    ld1 {v4.4h, v5.4h},[x1],#16
    ld1 {v8.4h, v9.4h},[x1],#16
    sub x1,x1,#32

//d4=x0
//d12=x1
//d5=x2
//d13=x3

//d18=x4
//d20=x5
//d19=x6
//d21=x7

//d22=x8
//d30=x9
//d23=x10
//d31=x11

//d14=x12
//d8=x13
//d15=x14
//d9=x15

    umov x15,v26.d[0]
    umov x16,v27.d[0]
    umov x19,v28.d[0]
    umov x20,v29.d[0]

    trn1 v26.4h, v4.4h, v12.4h
    trn2 v27.4h, v4.4h, v12.4h
    trn1 v28.4h, v5.4h, v13.4h
    trn2 v29.4h, v5.4h, v13.4h

    trn1 v4.2s, v26.2s, v28.2s
    trn2 v5.2s, v26.2s, v28.2s
    trn1 v12.2s, v27.2s, v29.2s
    trn2 v13.2s, v27.2s, v29.2s

    trn1 v26.4h, v18.4h, v20.4h
    trn2 v27.4h, v18.4h, v20.4h
    trn1 v28.4h, v19.4h, v21.4h
    trn2 v29.4h, v19.4h, v21.4h

    trn1 v18.2s, v26.2s, v28.2s
    trn2 v19.2s, v26.2s, v28.2s
    trn1 v20.2s, v27.2s, v29.2s
    trn2 v21.2s, v27.2s, v29.2s

    trn1 v26.4h, v22.4h, v30.4h
    trn2 v27.4h, v22.4h, v30.4h
    trn1 v28.4h, v23.4h, v31.4h
    trn2 v29.4h, v23.4h, v31.4h

    trn1 v22.2s, v26.2s, v28.2s
    trn2 v23.2s, v26.2s, v28.2s
    trn1 v30.2s, v27.2s, v29.2s
    trn2 v31.2s, v27.2s, v29.2s

    trn1 v26.4h, v14.4h, v8.4h
    trn2 v27.4h, v14.4h, v8.4h
    trn1 v28.4h, v15.4h, v9.4h
    trn2 v29.4h, v15.4h, v9.4h

    trn1 v14.2s, v26.2s, v28.2s
    trn2 v15.2s, v26.2s, v28.2s
    trn1 v8.2s, v27.2s, v29.2s
    trn2 v9.2s, v27.2s, v29.2s

    mov v26.d[0],x15
    mov v27.d[0],x16
    mov v28.d[0],x19
    mov v29.d[0],x20

// d4 =x0 1- 4 values
// d5 =x2 1- 4 values
// d12=x1 1- 4 values
// d13=x3 1- 4 values

// d18=x0 5- 8 values
// d19=x2 5- 8 values
// d20=x1 5- 8 values
// d21=x3 5- 8 values

// d22=x0 9-12 values
// d23=x2 9-12 values
// d30=x1 9-12 values
// d31=x3 9-12 values

// d14=x0 13-16 values
// d15=x2 13-16 values
// d8 =x1 13-16 values
// d9 =x3 13-16 values


    st1 {v4.4h, v5.4h},[x1],#16
    st1 {v12.4h, v13.4h},[x1],#16

    st1 {v18.4h, v19.4h},[x1],#16
    st1 {v20.4h, v21.4h},[x1],#16
    st1 {v22.4h, v23.4h},[x1],#16
    st1 {v30.4h, v31.4h},[x1],#16
    st1 {v14.4h, v15.4h},[x1],#16
    st1 {v8.4h, v9.4h},[x1],#16


    subs x14,x14,#1
    bne first_stage
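// note: in the loop above, the trn1/trn2 pairs (first on .4h, then on .2s)
// transpose each 4x4 tile of stage-1 results before they are stored, so
// pi2_tmp ends up holding the transposed intermediate and stage 2 can again
// read its inputs along rows. a c model of the 4x4 transpose (function name
// is illustrative):
//
//  #include <stdint.h>
//  static void transpose_4x4(int16_t m[4][4])
//  {
//      for (int i = 0; i < 4; i++)
//          for (int j = i + 1; j < 4; j++) {
//              int16_t t = m[i][j];
//              m[i][j] = m[j][i];
//              m[j][i] = t;
//          }
//  }
//
// the umov/mov pairs around each trn sequence only spill v26-v29 to general
// registers and restore them afterwards, since those vectors are live but
// needed as transpose scratch.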
    mov x6,x7 // x6 = 0xfff0 (x7 still holds the stage-1 mask constant)

    ldp x8, x7,[sp],#16 // restore pred_strd into x8 and dst_strd into x7

    mov x10,#16

    cmp x12,x6
    sub x20,x1,#128
    csel x1, x20, x1,ge
    bge label1

    mov x19,#0xff00
    cmp x12,x19
    sub x20,x1,#256
    csel x1, x20, x1,ge
    bge label_2

    sub x1,x1,#512
    sub x20,x10,#0
    neg x10, x20

label_2:
    add x9,x1,#128
    add x11,x9,#128
    add x0,x11,#128


label1:
// mov x6,x1


    mov x14,#4
    add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
    add x5,x8,x8, lsl #1 // x5 = pred_strd * 3
// add x0,x3,x7, lsl #1  @ x0 points to 3rd row of dest data
// add x10,x7,x7, lsl #1 @


second_stage:
    ld1 {v10.4h, v11.4h},[x1],#16
    ld1 {v6.4h, v7.4h},[x1],x10
    cmp x12,x6
    bge second_stage_process
    ld1 {v4.4h, v5.4h},[x9],#16
    ld1 {v8.4h, v9.4h},[x9],x10

second_stage_process:

    smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
    smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
    smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)

    smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlal v26.4s, v7.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
    smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)


    smull v12.4s, v10.4h, v0.h[0]
    smlal v12.4s, v11.4h, v0.h[2]
    smull v14.4s, v10.4h, v0.h[0]
    smlal v14.4s, v11.4h, v1.h[2]
    smull v16.4s, v10.4h, v0.h[0]
    smlal v16.4s, v11.4h, v2.h[2]
    smull v18.4s, v10.4h, v0.h[0]
    smlal v18.4s, v11.4h, v3.h[2]

    bge skip_last8rows_stage2_kernel1

    smlal v24.4s, v8.4h, v1.h[1]
    smlal v26.4s, v8.4h, v3.h[3]
    smlsl v28.4s, v8.4h, v1.h[3]
    smlsl v30.4s, v8.4h, v0.h[3]


    smlal v24.4s, v9.4h, v1.h[3]
    smlsl v26.4s, v9.4h, v2.h[3]
    smlsl v28.4s, v9.4h, v0.h[3]
    smlal v30.4s, v9.4h, v3.h[3]


    smlal v12.4s, v4.4h, v1.h[0]
    smlal v12.4s, v5.4h, v1.h[2]
    smlal v14.4s, v4.4h, v3.h[0]
    smlsl v14.4s, v5.4h, v3.h[2]
    smlsl v16.4s, v4.4h, v3.h[0]
    smlsl v16.4s, v5.4h, v0.h[2]
    smlsl v18.4s, v4.4h, v1.h[0]
    smlsl v18.4s, v5.4h, v2.h[2]

    mov x19,#0xff00
    cmp x12,x19
    bge skip_last8rows_stage2_kernel1


    ld1 {v10.4h, v11.4h},[x11],#16
    ld1 {v6.4h, v7.4h},[x11],x10
    ld1 {v4.4h, v5.4h},[x0],#16
    ld1 {v8.4h, v9.4h},[x0],x10


    smlal v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
    smlsl v26.4s, v6.4h, v1.h[1] //// y1 * cos3(part of b1)
    smlsl v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
    smlal v30.4s, v6.4h, v0.h[1] //// y1 * sin1(part of b3)

    smlal v24.4s, v7.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
    smlal v28.4s, v7.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlal v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)


    smlal v24.4s, v8.4h, v3.h[1]
    smlsl v26.4s, v8.4h, v1.h[3]
    smlal v28.4s, v8.4h, v0.h[1]
    smlsl v30.4s, v8.4h, v1.h[1]


    smlal v24.4s, v9.4h, v3.h[3]
    smlsl v26.4s, v9.4h, v3.h[1]
    smlal v28.4s, v9.4h, v2.h[3]
    smlsl v30.4s, v9.4h, v2.h[1]


    smlal v12.4s, v10.4h, v0.h[0]
    smlal v12.4s, v11.4h, v2.h[2]
    smlal v12.4s, v4.4h, v3.h[0]
    smlal v12.4s, v5.4h, v3.h[2]


    smlsl v14.4s, v10.4h, v0.h[0]
    smlsl v14.4s, v11.4h, v0.h[2]
    smlsl v14.4s, v4.4h, v1.h[0]
    smlsl v14.4s, v5.4h, v2.h[2]


    smlsl v16.4s, v10.4h, v0.h[0]
    smlal v16.4s, v11.4h, v3.h[2]
    smlal v16.4s, v4.4h, v1.h[0]
    smlal v16.4s, v5.4h, v1.h[2]


    smlal v18.4s, v10.4h, v0.h[0]
    smlal v18.4s, v11.4h, v1.h[2]
    smlsl v18.4s, v4.4h, v3.h[0]
    smlsl v18.4s, v5.4h, v0.h[2]
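// note: the bge branches above key off the zero-group masks. as used in this
// routine, a set bit in zero_cols (x12) / zero_rows (x11) marks an all-zero
// column / row of pi2_src, so after the low-16-bit masking in the prologue
// the compares reduce to (illustrative c, not the real interface):
//
//  if ((mask & 0xfff0) == 0xfff0) {
//      /* only entries 0..3 present: skip the last 12 loads/macs */
//  } else if ((mask & 0xff00) == 0xff00) {
//      /* only entries 0..7 present: skip the last 8 loads/macs */
//  }
//
// whole groups of ld1/smlal/smlsl are branched around when the corresponding
// inputs are known to be zero.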
skip_last8rows_stage2_kernel1:

    add v20.4s, v12.4s , v24.4s
    sub v22.4s, v12.4s , v24.4s

    add v12.4s, v14.4s , v26.4s
    sub v24.4s, v14.4s , v26.4s

    add v14.4s, v16.4s , v28.4s
    sub v26.4s, v16.4s , v28.4s


    add v16.4s, v18.4s , v30.4s
    sub v28.4s, v18.4s , v30.4s


    sqrshrn v30.4h, v20.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v19.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)

    bge skip_stage2_kernel_load

    //q2,q4,q6,q7 is used
    ld1 {v10.4h, v11.4h},[x1],#16
    ld1 {v6.4h, v7.4h},[x1],#16
    ld1 {v4.4h, v5.4h},[x9],#16
    ld1 {v8.4h, v9.4h},[x9],#16
skip_stage2_kernel_load:
    sub x1,x1,#32
    st1 {v30.4h, v31.4h},[x1],#16
    st1 {v18.4h, v19.4h},[x1],#16
    sub x1,x1,#32

    smull v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
    smull v26.4s, v6.4h, v2.h[3] //// y1 * cos3(part of b1)
    smull v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
    smull v30.4s, v6.4h, v3.h[3] //// y1 * sin1(part of b3)

    smlsl v24.4s, v7.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v7.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)


    smull v22.4s, v10.4h, v0.h[0]
    smlsl v22.4s, v11.4h, v3.h[2]
    smull v20.4s, v10.4h, v0.h[0]
    smlsl v20.4s, v11.4h, v2.h[2]
    smull v16.4s, v10.4h, v0.h[0]
    smlsl v16.4s, v11.4h, v1.h[2]
    smull v18.4s, v10.4h, v0.h[0]
    smlsl v18.4s, v11.4h, v0.h[2]


    cmp x12,x6
    bge skip_last8rows_stage2_kernel2


    smlsl v24.4s, v8.4h, v3.h[1]
    smlal v26.4s, v8.4h, v2.h[1]
    smlal v28.4s, v8.4h, v0.h[1]
    smlal v30.4s, v8.4h, v2.h[3]


    smlal v24.4s, v9.4h, v0.h[1]
    smlal v26.4s, v9.4h, v3.h[1]
    smlsl v28.4s, v9.4h, v1.h[1]
    smlsl v30.4s, v9.4h, v2.h[1]


    smlsl v22.4s, v4.4h, v1.h[0]
    smlal v22.4s, v5.4h, v2.h[2]
    smlsl v20.4s, v4.4h, v3.h[0]
    smlal v20.4s, v5.4h, v0.h[2]
    smlal v16.4s, v4.4h, v3.h[0]
    smlal v16.4s, v5.4h, v3.h[2]
    smlal v18.4s, v4.4h, v1.h[0]
    smlsl v18.4s, v5.4h, v1.h[2]
    mov x19,#0xff00
    cmp x12,x19
    bge skip_last8rows_stage2_kernel2

    ld1 {v10.4h, v11.4h},[x11],#16
    ld1 {v6.4h, v7.4h},[x11],#16
    ld1 {v4.4h, v5.4h},[x0],#16
    ld1 {v8.4h, v9.4h},[x0],#16

    smlsl v24.4s, v6.4h, v3.h[3] //// y1 * cos1(part of b0)
    smlsl v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    smlal v28.4s, v6.4h, v2.h[3] //// y1 * sin3(part of b2)
    smlal v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)

    smlsl v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlal v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)


    smlal v24.4s, v8.4h, v2.h[3]
    smlal v26.4s, v8.4h, v3.h[3]
    smlsl v28.4s, v8.4h, v2.h[1]
    smlal v30.4s, v8.4h, v0.h[3]


    smlal v24.4s, v9.4h, v1.h[3]
    smlsl v26.4s, v9.4h, v1.h[1]
    smlal v28.4s, v9.4h, v0.h[3]
    smlsl v30.4s, v9.4h, v0.h[1]


    smlal v22.4s, v10.4h, v0.h[0]
    smlsl v22.4s, v11.4h, v1.h[2]
    smlsl v22.4s, v4.4h, v3.h[0]
    smlal v22.4s, v5.4h, v0.h[2]


    smlsl v20.4s, v10.4h, v0.h[0]
    smlsl v20.4s, v11.4h, v3.h[2]
    smlal v20.4s, v4.4h, v1.h[0]
    smlsl v20.4s, v5.4h, v1.h[2]


    smlsl v16.4s, v10.4h, v0.h[0]
    smlal v16.4s, v11.4h, v0.h[2]
    smlsl v16.4s, v4.4h, v1.h[0]
    smlal v16.4s, v5.4h, v2.h[2]


    smlal v18.4s, v10.4h, v0.h[0]
    smlsl v18.4s, v11.4h, v2.h[2]
    smlal v18.4s, v4.4h, v3.h[0]
    smlsl v18.4s, v5.4h, v3.h[2]


skip_last8rows_stage2_kernel2:

    add v4.4s, v22.4s , v24.4s
    sub v22.4s, v22.4s , v24.4s

    add v6.4s, v20.4s , v26.4s
    sub v24.4s, v20.4s , v26.4s

    add v10.4s, v16.4s , v28.4s
    sub v26.4s, v16.4s , v28.4s


    add v16.4s, v18.4s , v30.4s
    sub v28.4s, v18.4s , v30.4s


    sqrshrn v18.4h, v4.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v31.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v30.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v20.4h, v6.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v23.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v21.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v22.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)

    ld1 {v4.4h, v5.4h},[x1],#16
    ld1 {v8.4h, v9.4h},[x1],#16


    // registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11}

//d4=x0
//d12=x1
//d5=x2
//d13=x3

//d18=x4
//d20=x5
//d19=x6
//d21=x7

//d22=x8
//d30=x9
//d23=x10
//d31=x11

//d14=x12
//d8=x13
//d15=x14
//d9=x15

    umov x15,v26.d[0]
    umov x16,v27.d[0]
    umov x19,v28.d[0]
    umov x20,v29.d[0]

    trn1 v26.4h, v4.4h, v12.4h
    trn2 v27.4h, v4.4h, v12.4h
    trn1 v28.4h, v5.4h, v13.4h
    trn2 v29.4h, v5.4h, v13.4h

    trn1 v4.2s, v26.2s, v28.2s
    trn2 v5.2s, v26.2s, v28.2s
    trn1 v12.2s, v27.2s, v29.2s
    trn2 v13.2s, v27.2s, v29.2s

    trn1 v26.4h, v18.4h, v20.4h
    trn2 v27.4h, v18.4h, v20.4h
    trn1 v28.4h, v19.4h, v21.4h
    trn2 v29.4h, v19.4h, v21.4h

    trn1 v18.2s, v26.2s, v28.2s
    trn2 v19.2s, v26.2s, v28.2s
    trn1 v20.2s, v27.2s, v29.2s
    trn2 v21.2s, v27.2s, v29.2s

    trn1 v26.4h, v22.4h, v30.4h
    trn2 v27.4h, v22.4h, v30.4h
    trn1 v28.4h, v23.4h, v31.4h
    trn2 v29.4h, v23.4h, v31.4h

    trn1 v22.2s, v26.2s, v28.2s
    trn2 v23.2s, v26.2s, v28.2s
    trn1 v30.2s, v27.2s, v29.2s
    trn2 v31.2s, v27.2s, v29.2s

    trn1 v26.4h, v14.4h, v8.4h
    trn2 v27.4h, v14.4h, v8.4h
    trn1 v28.4h, v15.4h, v9.4h
    trn2 v29.4h, v15.4h, v9.4h

    trn1 v14.2s, v26.2s, v28.2s
    trn2 v15.2s, v26.2s, v28.2s
    trn1 v8.2s, v27.2s, v29.2s
    trn2 v9.2s, v27.2s, v29.2s

    mov v26.d[0],x15
    mov v27.d[0],x16
    mov v28.d[0],x19
    mov v29.d[0],x20

// d4 =x0 1- 4 values
// d5 =x2 1- 4 values
// d12=x1 1- 4 values
// d13=x3 1- 4 values

// d18=x0 5- 8 values
// d19=x2 5- 8 values
// d20=x1 5- 8 values
// d21=x3 5- 8 values

// d22=x0 9-12 values
// d23=x2 9-12 values
// d30=x1 9-12 values
// d31=x3 9-12 values

// d14=x0 13-16 values
// d15=x2 13-16 values
// d8 =x1 13-16 values
// d9 =x3 13-16 values

    // swapping v5 and v18
    mov v5.d[1],v5.d[0]
    mov v5.d[0],v18.d[0]
    mov v18.d[0],v5.d[1]
    // swapping v23 and v14
    mov v23.d[1],v23.d[0]
    mov v23.d[0],v14.d[0]
    mov v14.d[0],v23.d[1]
    // swapping v13 and v20
    mov v13.d[1],v13.d[0]
    mov v13.d[0],v20.d[0]
    mov v20.d[0],v13.d[1]
    // swapping v31 and v8
    mov v31.d[1],v31.d[0]
    mov v31.d[0],v8.d[0]
    mov v8.d[0],v31.d[1]

// q2 : x0 1-8 values
// q11: x0 9-16 values
// q9 : x2 1-8 values
// q7 : x2 9-16 values
// q6 : x1 1-8 values
// q10: x3 1-8 values
// q15: x1 9-16 values
// q4 : x3 9-16 values


// registers free: q8,q14,q12,q13
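// note: the block below is the final reconstruction step: uaddw widens each
// predicted pixel to 16 bit and adds the idct residue, and sqxtun narrows
// back to bytes with unsigned saturation, i.e. the usual clip to [0,255].
// a per-pixel c model (helper name is illustrative):
//
//  #include <stdint.h>
//  static inline uint8_t recon_pixel(int16_t residue, uint8_t pred)
//  {
//      int32_t v = (int32_t)residue + pred;            /* uaddw  */
//      return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;  /* sqxtun */
//  }
//
// the mov vX.d[1],vY.d[0] merges below just pair up the 4h half-vectors into
// full 8h vectors before the widening add.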
    ld1 {v16.8b, v17.8b},[x2],x8
    ld1 {v28.8b, v29.8b},[x2],x5
    ld1 {v24.8b, v25.8b},[x4],x8
    ld1 {v26.8b, v27.8b},[x4],x5

    mov v4.d[1] ,v5.d[0]
    mov v22.d[1] ,v23.d[0]
    mov v12.d[1] ,v13.d[0]
    mov v30.d[1] ,v31.d[0]
    mov v18.d[1] ,v19.d[0]
    mov v14.d[1] ,v15.d[0]
    mov v20.d[1] ,v21.d[0]
    mov v8.d[1] ,v9.d[0]

    uaddw v4.8h, v4.8h , v16.8b
    uaddw v22.8h, v22.8h , v17.8b
    uaddw v12.8h, v12.8h , v28.8b
    uaddw v30.8h, v30.8h , v29.8b
    uaddw v18.8h, v18.8h , v24.8b
    uaddw v14.8h, v14.8h , v25.8b
    uaddw v20.8h, v20.8h , v26.8b
    uaddw v8.8h, v8.8h , v27.8b


    sqxtun v16.8b, v4.8h
    sqxtun v17.8b, v22.8h
    sqxtun v28.8b, v12.8h
    sqxtun v29.8b, v30.8h
    sqxtun v24.8b, v18.8h
    sqxtun v25.8b, v14.8h
    sqxtun v26.8b, v20.8h
    sqxtun v27.8b, v8.8h


    st1 {v16.8b, v17.8b},[x3],x7
    st1 {v28.8b, v29.8b},[x3],x7
    st1 {v24.8b, v25.8b},[x3],x7
    st1 {v26.8b, v27.8b},[x3],x7

    subs x14,x14,#1
    bne second_stage


// sub sp,sp,#40
    // ldmfd sp!,{x4-x12,pc}
    ldp x19, x20,[sp],#16
    pop_v_regs
    ret