1///***************************************************************************** 2//* 3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4//* 5//* Licensed under the Apache License, Version 2.0 (the "License"); 6//* you may not use this file except in compliance with the License. 7//* You may obtain a copy of the License at: 8//* 9//* http://www.apache.org/licenses/LICENSE-2.0 10//* 11//* Unless required by applicable law or agreed to in writing, software 12//* distributed under the License is distributed on an "AS IS" BASIS, 13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14//* See the License for the specific language governing permissions and 15//* limitations under the License. 16//* 17//*****************************************************************************/ 18///** 19// ******************************************************************************* 20// * @file 21// * ihevc_itrans_recon_8x8_neon.s 22// * 23// * @brief 24// * contains function definitions for single stage inverse transform 25// * 26// * @author 27// * anand s 28// * 29// * @par list of functions: 30// * - ihevc_itrans_recon_32x32() 31// * 32// * @remarks 33// * the input buffer is being corrupted 34// * 35// ******************************************************************************* 36//*/ 37 38///** 39// ******************************************************************************* 40// * 41// * @brief 42// * this function performs inverse transform and reconstruction for 32x32 43// * input block 44// * 45// * @par description: 46// * performs inverse transform and adds the prediction data and clips output 47// * to 8 bit 48// * 49// * @param[in] pi2_src 50// * input 32x32 coefficients 51// * 52// * @param[in] pi2_tmp 53// * temporary 32x32 buffer for storing inverse 54// * 55// * transform 56// * 1st stage output 57// * 58// * @param[in] pu1_pred 59// * prediction 32x32 block 60// * 61// * @param[out] pu1_dst 62// * output 32x32 block 63// * 64// 
* @param[in] src_strd 65// * input stride 66// * 67// * @param[in] pred_strd 68// * prediction stride 69// * 70// * @param[in] dst_strd 71// * output stride 72// * 73// * @param[in] shift 74// * output shift 75// * 76// * @param[in] x12 77// * zero columns in pi2_src 78// * 79// * @returns void 80// * 81// * @remarks 82// * none 83// * 84// ******************************************************************************* 85// */ 86 87//void ihevc_itrans_recon_32x32(word16 *pi2_src, 88// word16 *pi2_tmp, 89// uword8 *pu1_pred, 90// uword8 *pu1_dst, 91// word32 src_strd, 92// word32 pred_strd, 93// word32 dst_strd, 94// word32 x12 95// word32 x11 ) 96 97//**************variables vs registers************************* 98// x0 => *pi2_src 99// x1 => *pi2_tmp 100// x2 => *pu1_pred 101// x3 => *pu1_dst 102// src_strd 103// pred_strd 104// dst_strd 105// x12 106// x11 107 108 109//d0[0]= 64 d2[0]=83 110//d0[1]= 90 d2[1]=82 111//d0[2]= 90 d2[2]=80 112//d0[3]= 90 d2[3]=78 113//d1[0]= 89 d3[0]=75 114//d1[1]= 88 d3[1]=73 115//d1[2]= 87 d3[2]=70 116//d1[3]= 85 d3[3]=67 117 118//d4[0]= 64 d6[0]=36 119//d4[1]= 61 d6[1]=31 120//d4[2]= 57 d6[2]=25 121//d4[3]= 54 d6[3]=22 122//d5[0]= 50 d7[0]=18 123//d5[1]= 46 d7[1]=13 124//d5[2]= 43 d7[2]=9 125//d5[3]= 38 d7[3]=4 126 127.text 128.align 4 129.include "ihevc_neon_macros.s" 130 131 132 133 134.set shift_stage1_idct , 7 135.set shift_stage2_idct , 12 136 137//#define zero_cols x12 138//#define zero_rows x11 139 140.globl ihevc_itrans_recon_32x32_av8 141 142.extern g_ai2_ihevc_trans_32_transpose 143 144.type ihevc_itrans_recon_32x32_av8, %function 145 146ihevc_itrans_recon_32x32_av8: 147 148 ldr w11, [sp] 149 150// stmfd sp!,{x0-x12,x14} 151 push_v_regs 152 stp x19, x20,[sp,#-16]! 153 stp x0, x1,[sp,#-16]! 154 stp x5, x6,[sp,#-16]! 
155 156//ldr x8,[sp,#56] @ prediction stride 157//ldr x7,[sp,#64] @ destination stride 158 mov x6, x4 // src stride 159 mov x12, x7 160 lsl x6, x6, #1 // x sizeof(word16) 161 add x10,x6,x6, lsl #1 // 3 rows 162 163 164 mov x8,x0 165 166 adrp x14, :got:g_ai2_ihevc_trans_32_transpose 167 ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_32_transpose] 168 169 ld1 {v0.4h, v1.4h, v2.4h, v3.4h},[x14],#32 170 ld1 {v4.4h, v5.4h, v6.4h, v7.4h},[x14],#32 171 172//registers which are free 173// x10,x9,x11,x12 174 mov x9,#0xffffff00 175 mov x10,#0xfffffff0 176 mov w5,#0xfffff000 177 mov w7,#0xffff0000 178 cmp x12,x10 179 mov x20,#1 180 csel x14, x20, x14,hs 181 bhs stage1 182 183 184 cmp x12,x9 185 mov x20,#2 186 csel x14, x20, x14,hs 187 bhs stage1 188 189 cmp x12,x5 190 mov x20,#3 191 csel x14, x20, x14,hs 192 bhs stage1 193 194 cmp x12,x7 195 mov x20,#4 196 csel x14, x20, x14,hs 197 198 mov x14,#8 199 b stage1 200//.ltorg 201 202 203dct_stage1: 204 add x8,x8,#8 205 mov x0,x8 206 207stage1: 208 ld1 {v10.4h},[x0],x6 209 ld1 {v8.4h},[x0],x6 210 ld1 {v11.4h},[x0],x6 211 ld1 {v9.4h},[x0],x6 212 213 smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0) 214 smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1) 215 smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) 216 smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) 217 218 smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 219 smlal v26.4s, v9.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1) 220 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 221 smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 222 223 224 225 226 227 smull v20.4s, v10.4h, v0.h[0] 228 smlal v20.4s, v11.4h, v0.h[2] 229 230 231 smull v22.4s, v10.4h, v0.h[0] 232 smlal v22.4s, v11.4h, v1.h[2] 233 234 smull v16.4s, v10.4h, v0.h[0] 235 smlal v16.4s, v11.4h, v2.h[2] 236 237 smull v18.4s, v10.4h, v0.h[0] 238 smlal v18.4s, v11.4h, v3.h[2] 239 cmp x11,x10 240 bhs shift1 241 242 ld1 {v12.4h},[x0],x6 243 ld1 
{v14.4h},[x0],x6 244 ld1 {v13.4h},[x0],x6 245 ld1 {v15.4h},[x0],x6 246 247 248 249 250 251 252 253 smlal v24.4s, v14.4h, v1.h[1] 254 smlal v26.4s, v14.4h, v3.h[3] 255 smlal v28.4s, v14.4h, v6.h[1] 256 smlsl v30.4s, v14.4h, v7.h[1] 257 258 259 smlal v24.4s, v15.4h, v1.h[3] 260 smlal v26.4s, v15.4h, v5.h[1] 261 smlsl v28.4s, v15.4h, v7.h[1] 262 smlsl v30.4s, v15.4h, v3.h[3] 263 264 265 smlal v20.4s, v12.4h, v1.h[0] 266 smlal v20.4s, v13.4h, v1.h[2] 267 smlal v22.4s, v12.4h, v3.h[0] 268 smlal v22.4s, v13.4h, v4.h[2] 269 smlal v16.4s, v12.4h, v5.h[0] 270 smlal v16.4s, v13.4h, v7.h[2] 271 smlal v18.4s, v12.4h, v7.h[0] 272 smlsl v18.4s, v13.4h, v5.h[2] 273 274 cmp x11,x9 275 bhs shift1 276 277 ld1 {v10.4h},[x0],x6 278 ld1 {v8.4h},[x0],x6 279 ld1 {v11.4h},[x0],x6 280 ld1 {v9.4h},[x0],x6 281 282 283 smlal v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0) 284 smlal v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) 285 smlsl v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2) 286 smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3) 287 288 smlal v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 289 smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 290 smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 291 smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 292 293 294 295 296 297 smlal v20.4s, v10.4h, v2.h[0] 298 smlal v20.4s, v11.4h, v2.h[2] 299 300 301 smlal v22.4s, v10.4h, v6.h[0] 302 smlal v22.4s, v11.4h, v7.h[2] 303 304 smlsl v16.4s, v10.4h, v6.h[0] 305 smlsl v16.4s, v11.4h, v3.h[2] 306 307 smlsl v18.4s, v10.4h, v2.h[0] 308 smlsl v18.4s, v11.4h, v1.h[2] 309 310 cmp x11,x5 311 bhs shift1 312 313 314 ld1 {v12.4h},[x0],x6 315 ld1 {v14.4h},[x0],x6 316 ld1 {v13.4h},[x0],x6 317 ld1 {v15.4h},[x0],x6 318 319 320 321 322 323 324 325 326 327 smlal v24.4s, v14.4h, v3.h[1] 328 smlsl v26.4s, v14.4h, v6.h[1] 329 smlsl v28.4s, v14.4h, v0.h[1] 330 smlsl v30.4s, v14.4h, v6.h[3] 331 332 333 smlal v24.4s, v15.4h, v3.h[3] 334 
smlsl v26.4s, v15.4h, v4.h[3] 335 smlsl v28.4s, v15.4h, v2.h[3] 336 smlal v30.4s, v15.4h, v5.h[3] 337 338 339 smlal v20.4s, v12.4h, v3.h[0] 340 smlal v20.4s, v13.4h, v3.h[2] 341 smlsl v22.4s, v12.4h, v7.h[0] 342 smlsl v22.4s, v13.4h, v5.h[2] 343 smlsl v16.4s, v12.4h, v1.h[0] 344 smlsl v16.4s, v13.4h, v1.h[2] 345 smlsl v18.4s, v12.4h, v5.h[0] 346 smlal v18.4s, v13.4h, v7.h[2] 347 348 cmp x11,x7 349 bhs shift1 350 351 352 ld1 {v10.4h},[x0],x6 353 ld1 {v8.4h},[x0],x6 354 ld1 {v11.4h},[x0],x6 355 ld1 {v9.4h},[x0],x6 356 357 358 359 smlal v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) 360 smlsl v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1) 361 smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) 362 smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3) 363 364 smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 365 smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 366 smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 367 smlal v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 368 369 370 371 372 373 smlal v20.4s, v10.4h, v0.h[0] 374 smlal v20.4s, v11.4h, v4.h[2] 375 376 377 smlsl v22.4s, v10.4h, v0.h[0] 378 smlsl v22.4s, v11.4h, v2.h[2] 379 380 smlsl v16.4s, v10.4h, v0.h[0] 381 smlsl v16.4s, v11.4h, v6.h[2] 382 383 smlal v18.4s, v10.4h, v0.h[0] 384 smlal v18.4s, v11.4h, v0.h[2] 385 386 387 388 ld1 {v12.4h},[x0],x6 389 ld1 {v14.4h},[x0],x6 390 ld1 {v13.4h},[x0],x6 391 ld1 {v15.4h},[x0],x6 392 393 394 395 396 smlal v24.4s, v14.4h, v5.h[1] 397 smlsl v26.4s, v14.4h, v0.h[2] 398 smlal v28.4s, v14.4h, v5.h[3] 399 smlal v30.4s, v14.4h, v4.h[3] 400 401 402 smlal v24.4s, v15.4h, v5.h[3] 403 smlsl v26.4s, v15.4h, v1.h[1] 404 smlal v28.4s, v15.4h, v3.h[1] 405 smlsl v30.4s, v15.4h, v7.h[3] 406 407 408 smlal v20.4s, v12.4h, v5.h[0] 409 smlal v20.4s, v13.4h, v5.h[2] 410 smlsl v22.4s, v12.4h, v1.h[0] 411 smlsl v22.4s, v13.4h, v0.h[2] 412 smlal v16.4s, v12.4h, v7.h[0] 413 smlal v16.4s, v13.4h, v4.h[2] 414 smlal 
v18.4s, v12.4h, v3.h[0] 415 smlal v18.4s, v13.4h, v6.h[2] 416 417 418 ld1 {v10.4h},[x0],x6 419 ld1 {v8.4h},[x0],x6 420 ld1 {v11.4h},[x0],x6 421 ld1 {v9.4h},[x0],x6 422 423 424 425 426 427 428 429 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) 430 smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) 431 smlal v28.4s, v8.4h, v0.h[1] //// y1 * sin3(part of b2) 432 smlsl v30.4s, v8.4h, v4.h[1] //// y1 * sin1(part of b3) 433 434 smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 435 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) 436 smlal v28.4s, v9.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 437 smlsl v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 438 439 440 441 442 443 smlal v20.4s, v10.4h, v6.h[0] 444 smlal v20.4s, v11.4h, v6.h[2] 445 446 447 smlsl v22.4s, v10.4h, v2.h[0] 448 smlsl v22.4s, v11.4h, v3.h[2] 449 450 smlal v16.4s, v10.4h, v2.h[0] 451 smlal v16.4s, v11.4h, v0.h[2] 452 453 smlsl v18.4s, v10.4h, v6.h[0] 454 smlsl v18.4s, v11.4h, v2.h[2] 455 456 ld1 {v12.4h},[x0],x6 457 ld1 {v14.4h},[x0],x6 458 ld1 {v13.4h},[x0],x6 459 ld1 {v15.4h},[x0],x6 460 461 462 smlal v24.4s, v14.4h, v7.h[1] 463 smlsl v26.4s, v14.4h, v5.h[3] 464 smlal v28.4s, v14.4h, v4.h[1] 465 smlsl v30.4s, v14.4h, v2.h[3] 466 467 468 smlal v24.4s, v15.4h, v7.h[3] 469 smlsl v26.4s, v15.4h, v7.h[1] 470 smlal v28.4s, v15.4h, v6.h[3] 471 smlsl v30.4s, v15.4h, v6.h[1] 472 473 474 smlal v20.4s, v12.4h, v7.h[0] 475 smlal v20.4s, v13.4h, v7.h[2] 476 smlsl v22.4s, v12.4h, v5.h[0] 477 smlsl v22.4s, v13.4h, v6.h[2] 478 smlal v16.4s, v12.4h, v3.h[0] 479 smlal v16.4s, v13.4h, v5.h[2] 480 smlsl v18.4s, v12.4h, v1.h[0] 481 smlsl v18.4s, v13.4h, v4.h[2] 482 483 484 485shift1: 486 add v8.4s, v20.4s , v24.4s 487 sub v10.4s, v20.4s , v24.4s 488 489 add v12.4s, v22.4s , v26.4s 490 sub v24.4s, v22.4s , v26.4s 491 492 add v14.4s, v16.4s , v28.4s 493 sub v26.4s, v16.4s , v28.4s 494 495 496 add v16.4s, v18.4s , v30.4s 497 sub v28.4s, v18.4s , v30.4s 
498 499 500 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) 501 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) 502 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) 503 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) 504 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) 505 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) 506 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) 507 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) 508 509 510 // registers used q15,q14,q6,q7 511 512 umov x15,v24.d[0] 513 umov x16,v25.d[0] 514 umov x19,v26.d[0] 515 umov x20,v27.d[0] 516 517 trn1 v24.4h, v30.4h, v12.4h 518 trn2 v25.4h, v30.4h, v12.4h 519 trn1 v26.4h, v31.4h, v13.4h 520 trn2 v27.4h, v31.4h, v13.4h 521 522 trn1 v30.2s, v24.2s, v26.2s 523 trn2 v31.2s, v24.2s, v26.2s 524 trn1 v12.2s, v25.2s, v27.2s 525 trn2 v13.2s, v25.2s, v27.2s 526 527 trn1 v24.4h, v14.4h, v18.4h 528 trn2 v25.4h, v14.4h, v18.4h 529 trn1 v26.4h, v15.4h, v19.4h 530 trn2 v27.4h, v15.4h, v19.4h 531 532 trn1 v14.2s, v24.2s, v26.2s 533 trn2 v15.2s, v24.2s, v26.2s 534 trn1 v18.2s, v25.2s, v27.2s 535 trn2 v19.2s, v25.2s, v27.2s 536 537 mov v24.d[0],x15 538 mov v25.d[0],x16 539 mov v26.d[0],x19 540 mov v27.d[0],x20 541 542// d30 =x0 1- 4 values 543// d31 =x2 1- 4 values 544// d12=x1 1- 4 values 545// d13=x3 1- 4 values 546// d14 =x0 28-31 values 547// d15 =x2 28- 31 values 548// d18=x1 28- 31 values 549// d19=x3 28- 31 values 550 551 552 553 st1 { v30.4h, v31.4h},[x1],#16 554 st1 { v12.4h, v13.4h},[x1],#16 555 add x1,x1,#192 556 st1 { v14.4h, v15.4h},[x1],#16 557 st1 { v18.4h, v19.4h},[x1],#16 558 sub x1,x1,#224 559 560 mov x0,x8 561 562 563 564 565 566 ld1 {v10.4h},[x0],x6 567 
ld1 {v8.4h},[x0],x6 568 ld1 {v11.4h},[x0],x6 569 ld1 {v9.4h},[x0],x6 570 571 572 573 574 smull v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0) 575 smull v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) 576 smull v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2) 577 smull v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) 578 579 smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 580 smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 581 smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 582 smlsl v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3) 583 584 585 586 587 588 smull v20.4s, v10.4h, v0.h[0] 589 smlal v20.4s, v11.4h, v4.h[2] 590 591 592 smull v22.4s, v10.4h, v0.h[0] 593 smlal v22.4s, v11.4h, v5.h[2] 594 595 smull v16.4s, v10.4h, v0.h[0] 596 smlal v16.4s, v11.4h, v6.h[2] 597 598 smull v18.4s, v10.4h, v0.h[0] 599 smlal v18.4s, v11.4h, v7.h[2] 600 cmp x11,x10 601 bhs shift2 602 603 ld1 {v12.4h},[x0],x6 604 ld1 {v14.4h},[x0],x6 605 ld1 {v13.4h},[x0],x6 606 ld1 {v15.4h},[x0],x6 607 608 609 smlsl v24.4s, v14.4h, v4.h[3] 610 smlsl v26.4s, v14.4h, v2.h[1] 611 smlsl v28.4s, v14.4h, v0.h[1] 612 smlsl v30.4s, v14.4h, v2.h[3] 613 614 615 smlsl v24.4s, v15.4h, v0.h[3] 616 smlsl v26.4s, v15.4h, v3.h[1] 617 smlsl v28.4s, v15.4h, v6.h[3] 618 smlal v30.4s, v15.4h, v5.h[3] 619 620 621 smlsl v20.4s, v12.4h, v7.h[0] 622 smlsl v20.4s, v13.4h, v2.h[2] 623 smlsl v22.4s, v12.4h, v5.h[0] 624 smlsl v22.4s, v13.4h, v0.h[2] 625 smlsl v16.4s, v12.4h, v3.h[0] 626 smlsl v16.4s, v13.4h, v3.h[2] 627 smlsl v18.4s, v12.4h, v1.h[0] 628 smlsl v18.4s, v13.4h, v6.h[2] 629 630 cmp x11,x9 631 bhs shift2 632 633 634 ld1 {v10.4h},[x0],x6 635 ld1 {v8.4h},[x0],x6 636 ld1 {v11.4h},[x0],x6 637 ld1 {v9.4h},[x0],x6 638 639 640 641 642 643 644 645 smlsl v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) 646 smlal v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1) 647 smlal v28.4s, v8.4h, v2.h[3] //// y1 * sin3(part of b2) 648 smlal v30.4s, 
v8.4h, v1.h[3] //// y1 * sin1(part of b3) 649 650 smlal v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0) 651 smlal v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 652 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 653 smlsl v30.4s, v9.4h, v6.h[3] //// y1 * sin1 - y3 * sin3(part of b3) 654 655 656 657 658 659 smlsl v20.4s, v10.4h, v2.h[0] 660 smlsl v20.4s, v11.4h, v6.h[2] 661 662 663 smlsl v22.4s, v10.4h, v6.h[0] 664 smlal v22.4s, v11.4h, v4.h[2] 665 666 smlal v16.4s, v10.4h, v6.h[0] 667 smlal v16.4s, v11.4h, v0.h[2] 668 669 smlal v18.4s, v10.4h, v2.h[0] 670 smlal v18.4s, v11.4h, v5.h[2] 671 672 cmp x11,x5 673 bhs shift2 674 675 676 ld1 {v12.4h},[x0],x6 677 ld1 {v14.4h},[x0],x6 678 ld1 {v13.4h},[x0],x6 679 ld1 {v15.4h},[x0],x6 680 681 682 683 684 685 smlal v24.4s, v14.4h, v2.h[3] 686 smlal v26.4s, v14.4h, v3.h[3] 687 smlsl v28.4s, v14.4h, v5.h[3] 688 smlsl v30.4s, v14.4h, v0.h[3] 689 690 691 smlal v24.4s, v15.4h, v1.h[3] 692 smlsl v26.4s, v15.4h, v6.h[3] 693 smlsl v28.4s, v15.4h, v0.h[3] 694 smlal v30.4s, v15.4h, v7.h[3] 695 696 697 smlal v20.4s, v12.4h, v5.h[0] 698 smlal v20.4s, v13.4h, v0.h[2] 699 smlal v22.4s, v12.4h, v1.h[0] 700 smlal v22.4s, v13.4h, v6.h[2] 701 smlal v16.4s, v12.4h, v7.h[0] 702 smlsl v16.4s, v13.4h, v2.h[2] 703 smlsl v18.4s, v12.4h, v3.h[0] 704 smlsl v18.4s, v13.4h, v4.h[2] 705 706 707 cmp x11,x7 708 bhs shift2 709 710 711 ld1 {v10.4h},[x0],x6 712 ld1 {v8.4h},[x0],x6 713 ld1 {v11.4h},[x0],x6 714 ld1 {v9.4h},[x0],x6 715 716 717 718 719 720 721 722 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) 723 smlsl v26.4s, v8.4h, v1.h[1] //// y1 * cos3(part of b1) 724 smlsl v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2) 725 smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1(part of b3) 726 727 smlsl v24.4s, v9.4h, v5.h[1] //// y1 * cos1 + y3 * cos3(part of b0) 728 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) 729 smlal v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 730 
smlal v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 731 732 733 734 735 736 smlal v20.4s, v10.4h, v0.h[0] 737 smlsl v20.4s, v11.4h, v7.h[2] 738 739 740 smlsl v22.4s, v10.4h, v0.h[0] 741 smlsl v22.4s, v11.4h, v1.h[2] 742 743 smlsl v16.4s, v10.4h, v0.h[0] 744 smlal v16.4s, v11.4h, v5.h[2] 745 746 smlal v18.4s, v10.4h, v0.h[0] 747 smlal v18.4s, v11.4h, v3.h[2] 748 749 750 751 ld1 {v12.4h},[x0],x6 752 ld1 {v14.4h},[x0],x6 753 ld1 {v13.4h},[x0],x6 754 ld1 {v15.4h},[x0],x6 755 756 757 smlsl v24.4s, v14.4h, v0.h[1] 758 smlal v26.4s, v14.4h, v6.h[1] 759 smlal v28.4s, v14.4h, v4.h[1] 760 smlsl v30.4s, v14.4h, v1.h[1] 761 762 763 smlsl v24.4s, v15.4h, v3.h[3] 764 smlal v26.4s, v15.4h, v0.h[1] 765 smlsl v28.4s, v15.4h, v5.h[1] 766 smlsl v30.4s, v15.4h, v6.h[1] 767 768 769 smlsl v20.4s, v12.4h, v3.h[0] 770 smlsl v20.4s, v13.4h, v1.h[2] 771 smlsl v22.4s, v12.4h, v7.h[0] 772 smlal v22.4s, v13.4h, v3.h[2] 773 smlal v16.4s, v12.4h, v1.h[0] 774 smlal v16.4s, v13.4h, v7.h[2] 775 smlsl v18.4s, v12.4h, v5.h[0] 776 smlsl v18.4s, v13.4h, v2.h[2] 777 778 ld1 {v10.4h},[x0],x6 779 ld1 {v8.4h},[x0],x6 780 ld1 {v11.4h},[x0],x6 781 ld1 {v9.4h},[x0],x6 782 783 784 785 786 smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) 787 smlal v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1) 788 smlsl v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) 789 smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3) 790 791 smlal v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0) 792 smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 793 smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 794 smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 795 796 797 798 799 800 smlsl v20.4s, v10.4h, v6.h[0] 801 smlal v20.4s, v11.4h, v5.h[2] 802 803 804 smlal v22.4s, v10.4h, v2.h[0] 805 smlal v22.4s, v11.4h, v7.h[2] 806 807 smlsl v16.4s, v10.4h, v2.h[0] 808 smlsl v16.4s, v11.4h, v4.h[2] 809 810 smlal v18.4s, v10.4h, v6.h[0] 811 smlal v18.4s, 
v11.4h, v1.h[2] 812 813 814 ld1 {v12.4h},[x0],x6 815 ld1 {v14.4h},[x0],x6 816 ld1 {v13.4h},[x0],x6 817 ld1 {v15.4h},[x0],x6 818 819 820 821 822 823 smlal v24.4s, v14.4h, v1.h[1] 824 smlsl v26.4s, v14.4h, v0.h[3] 825 smlal v28.4s, v14.4h, v1.h[3] 826 smlsl v30.4s, v14.4h, v3.h[1] 827 828 829 smlal v24.4s, v15.4h, v5.h[3] 830 smlsl v26.4s, v15.4h, v5.h[1] 831 smlal v28.4s, v15.4h, v4.h[3] 832 smlsl v30.4s, v15.4h, v4.h[1] 833 834 835 smlal v20.4s, v12.4h, v1.h[0] 836 smlal v20.4s, v13.4h, v3.h[2] 837 smlsl v22.4s, v12.4h, v3.h[0] 838 smlsl v22.4s, v13.4h, v2.h[2] 839 smlal v16.4s, v12.4h, v5.h[0] 840 smlal v16.4s, v13.4h, v1.h[2] 841 smlsl v18.4s, v12.4h, v7.h[0] 842 smlsl v18.4s, v13.4h, v0.h[2] 843 844shift2: 845 add v8.4s, v20.4s , v24.4s 846 sub v10.4s, v20.4s , v24.4s 847 848 add v12.4s, v22.4s , v26.4s 849 sub v24.4s, v22.4s , v26.4s 850 851 add v14.4s, v16.4s , v28.4s 852 sub v26.4s, v16.4s , v28.4s 853 854 855 add v16.4s, v18.4s , v30.4s 856 sub v28.4s, v18.4s , v30.4s 857 858 859 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) 860 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) 861 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) 862 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) 863 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) 864 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) 865 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) 866 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) 867 868 umov x15,v24.d[0] 869 umov x16,v25.d[0] 870 umov x19,v26.d[0] 871 umov x20,v27.d[0] 872 873 trn1 v24.4h, v30.4h, v12.4h 874 trn2 v25.4h, v30.4h, v12.4h 875 trn1 v26.4h, v31.4h, v13.4h 876 trn2 v27.4h, v31.4h, v13.4h 877 878 
trn1 v30.2s, v24.2s, v26.2s 879 trn2 v31.2s, v24.2s, v26.2s 880 trn1 v12.2s, v25.2s, v27.2s 881 trn2 v13.2s, v25.2s, v27.2s 882 883 trn1 v24.4h, v14.4h, v18.4h 884 trn2 v25.4h, v14.4h, v18.4h 885 trn1 v26.4h, v15.4h, v19.4h 886 trn2 v27.4h, v15.4h, v19.4h 887 888 trn1 v14.2s, v24.2s, v26.2s 889 trn2 v15.2s, v24.2s, v26.2s 890 trn1 v18.2s, v25.2s, v27.2s 891 trn2 v19.2s, v25.2s, v27.2s 892 893 mov v24.d[0],x15 894 mov v25.d[0],x16 895 mov v26.d[0],x19 896 mov v27.d[0],x20 897 898 st1 { v30.4h, v31.4h},[x1],#16 899 st1 { v12.4h, v13.4h},[x1],#16 900 add x1,x1,#128 901 st1 { v14.4h, v15.4h},[x1],#16 902 st1 { v18.4h, v19.4h},[x1],#16 903 sub x1,x1,#160 904 mov x0,x8 905 906 907 908 ld1 {v10.4h},[x0],x6 909 ld1 {v8.4h},[x0],x6 910 ld1 {v11.4h},[x0],x6 911 ld1 {v9.4h},[x0],x6 912 913 914 smull v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) 915 smull v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1) 916 smull v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) 917 smull v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3) 918 919 smlsl v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0) 920 smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 921 smlsl v28.4s, v9.4h, v0.h[2] //// y1 * sin3 - y3 * cos1(part of b2) 922 smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 923 924 925 926 927 928 smull v20.4s, v10.4h, v0.h[0] 929 smlsl v20.4s, v11.4h, v7.h[2] 930 931 932 smull v22.4s, v10.4h, v0.h[0] 933 smlsl v22.4s, v11.4h, v6.h[2] 934 935 smull v16.4s, v10.4h, v0.h[0] 936 smlsl v16.4s, v11.4h, v5.h[2] 937 938 smull v18.4s, v10.4h, v0.h[0] 939 smlsl v18.4s, v11.4h, v4.h[2] 940 941 cmp x11,x10 942 bhs shift3 943 944 ld1 {v12.4h},[x0],x6 945 ld1 {v14.4h},[x0],x6 946 ld1 {v13.4h},[x0],x6 947 ld1 {v15.4h},[x0],x6 948 949 950 951 952 smlsl v24.4s, v14.4h, v5.h[1] 953 smlsl v26.4s, v14.4h, v7.h[3] 954 smlal v28.4s, v14.4h, v5.h[3] 955 smlal v30.4s, v14.4h, v3.h[1] 956 957 958 smlal v24.4s, v15.4h, v2.h[1] 959 smlal v26.4s, v15.4h, v1.h[1] 
960 smlal v28.4s, v15.4h, v4.h[3] 961 smlsl v30.4s, v15.4h, v7.h[3] 962 963 964 smlsl v20.4s, v12.4h, v1.h[0] 965 smlal v20.4s, v13.4h, v6.h[2] 966 smlsl v22.4s, v12.4h, v3.h[0] 967 smlal v22.4s, v13.4h, v3.h[2] 968 smlsl v16.4s, v12.4h, v5.h[0] 969 smlal v16.4s, v13.4h, v0.h[2] 970 smlsl v18.4s, v12.4h, v7.h[0] 971 smlal v18.4s, v13.4h, v2.h[2] 972 973 cmp x11,x9 974 bhs shift3 975 976 ld1 {v10.4h},[x0],x6 977 ld1 {v8.4h},[x0],x6 978 ld1 {v11.4h},[x0],x6 979 ld1 {v9.4h},[x0],x6 980 981 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) 982 smlsl v26.4s, v8.4h, v5.h[1] //// y1 * cos3(part of b1) 983 smlsl v28.4s, v8.4h, v0.h[3] //// y1 * sin3(part of b2) 984 smlsl v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) 985 986 smlsl v24.4s, v9.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0) 987 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) 988 smlal v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 989 smlal v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 990 991 992 993 994 995 smlal v20.4s, v10.4h, v2.h[0] 996 smlsl v20.4s, v11.4h, v5.h[2] 997 998 999 smlal v22.4s, v10.4h, v6.h[0] 1000 smlsl v22.4s, v11.4h, v0.h[2] 1001 1002 smlsl v16.4s, v10.4h, v6.h[0] 1003 smlsl v16.4s, v11.4h, v4.h[2] 1004 1005 smlsl v18.4s, v10.4h, v2.h[0] 1006 smlal v18.4s, v11.4h, v6.h[2] 1007 1008 cmp x11,x5 1009 bhs shift3 1010 1011 1012 ld1 {v12.4h},[x0],x6 1013 ld1 {v14.4h},[x0],x6 1014 ld1 {v13.4h},[x0],x6 1015 ld1 {v15.4h},[x0],x6 1016 1017 1018 1019 1020 1021 1022 smlsl v24.4s, v14.4h, v7.h[1] 1023 smlal v26.4s, v14.4h, v2.h[1] 1024 smlal v28.4s, v14.4h, v4.h[1] 1025 smlsl v30.4s, v14.4h, v5.h[1] 1026 1027 1028 smlal v24.4s, v15.4h, v0.h[3] 1029 smlal v26.4s, v15.4h, v7.h[1] 1030 smlsl v28.4s, v15.4h, v1.h[1] 1031 smlsl v30.4s, v15.4h, v6.h[1] 1032 1033 1034 smlsl v20.4s, v12.4h, v3.h[0] 1035 smlal v20.4s, v13.4h, v4.h[2] 1036 smlal v22.4s, v12.4h, v7.h[0] 1037 smlal v22.4s, v13.4h, v2.h[2] 1038 smlal v16.4s, v12.4h, v1.h[0] 1039 
smlsl v16.4s, v13.4h, v6.h[2] 1040 smlal v18.4s, v12.4h, v5.h[0] 1041 smlsl v18.4s, v13.4h, v0.h[2] 1042 1043 1044 cmp x11,x7 1045 bhs shift3 1046 1047 1048 ld1 {v10.4h},[x0],x6 1049 ld1 {v8.4h},[x0],x6 1050 ld1 {v11.4h},[x0],x6 1051 ld1 {v9.4h},[x0],x6 1052 1053 1054 smlsl v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) 1055 smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3(part of b1) 1056 smlal v28.4s, v8.4h, v6.h[3] //// y1 * sin3(part of b2) 1057 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) 1058 1059 smlsl v24.4s, v9.4h, v0.h[1] //// y1 * cos1 + y3 * cos3(part of b0) 1060 smlal v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 1061 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 1062 smlsl v30.4s, v9.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3) 1063 1064 1065 1066 1067 1068 smlal v20.4s, v10.4h, v0.h[0] 1069 smlsl v20.4s, v11.4h, v3.h[2] 1070 1071 1072 smlsl v22.4s, v10.4h, v0.h[0] 1073 smlsl v22.4s, v11.4h, v5.h[2] 1074 1075 smlsl v16.4s, v10.4h, v0.h[0] 1076 smlal v16.4s, v11.4h, v1.h[2] 1077 1078 smlal v18.4s, v10.4h, v0.h[0] 1079 smlal v18.4s, v11.4h, v7.h[2] 1080 1081 1082 ld1 {v12.4h},[x0],x6 1083 ld1 {v14.4h},[x0],x6 1084 ld1 {v13.4h},[x0],x6 1085 ld1 {v15.4h},[x0],x6 1086 1087 1088 1089 smlal v24.4s, v14.4h, v6.h[3] 1090 smlal v26.4s, v14.4h, v3.h[3] 1091 smlsl v28.4s, v14.4h, v1.h[3] 1092 smlal v30.4s, v14.4h, v7.h[1] 1093 1094 1095 smlal v24.4s, v15.4h, v1.h[3] 1096 smlsl v26.4s, v15.4h, v2.h[3] 1097 smlal v28.4s, v15.4h, v7.h[1] 1098 smlal v30.4s, v15.4h, v4.h[1] 1099 1100 1101 smlsl v20.4s, v12.4h, v5.h[0] 1102 smlal v20.4s, v13.4h, v2.h[2] 1103 smlal v22.4s, v12.4h, v1.h[0] 1104 smlsl v22.4s, v13.4h, v7.h[2] 1105 smlsl v16.4s, v12.4h, v7.h[0] 1106 smlsl v16.4s, v13.4h, v3.h[2] 1107 smlsl v18.4s, v12.4h, v3.h[0] 1108 smlal v18.4s, v13.4h, v1.h[2] 1109 1110 1111 1112 ld1 {v10.4h},[x0],x6 1113 ld1 {v8.4h},[x0],x6 1114 ld1 {v11.4h},[x0],x6 1115 ld1 {v9.4h},[x0],x6 1116 1117 1118 1119 1120 smlsl v24.4s, 
v8.4h, v5.h[3] //// y1 * cos1(part of b0) 1121 smlsl v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) 1122 smlal v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2) 1123 smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3) 1124 1125 smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1126 smlal v26.4s, v9.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1) 1127 smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 1128 smlal v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3) 1129 1130 1131 1132 1133 1134 smlal v20.4s, v10.4h, v6.h[0] 1135 smlsl v20.4s, v11.4h, v1.h[2] 1136 1137 1138 smlsl v22.4s, v10.4h, v2.h[0] 1139 smlal v22.4s, v11.4h, v4.h[2] 1140 1141 smlal v16.4s, v10.4h, v2.h[0] 1142 smlsl v16.4s, v11.4h, v7.h[2] 1143 1144 smlsl v18.4s, v10.4h, v6.h[0] 1145 smlsl v18.4s, v11.4h, v5.h[2] 1146 1147 1148 ld1 {v12.4h},[x0],x6 1149 ld1 {v14.4h},[x0],x6 1150 ld1 {v13.4h},[x0],x6 1151 ld1 {v15.4h},[x0],x6 1152 1153 smlal v24.4s, v14.4h, v4.h[3] 1154 smlsl v26.4s, v14.4h, v6.h[1] 1155 smlal v28.4s, v14.4h, v7.h[3] 1156 smlal v30.4s, v14.4h, v6.h[3] 1157 1158 1159 smlal v24.4s, v15.4h, v3.h[3] 1160 smlsl v26.4s, v15.4h, v3.h[1] 1161 smlal v28.4s, v15.4h, v2.h[3] 1162 smlsl v30.4s, v15.4h, v2.h[1] 1163 1164 1165 smlsl v20.4s, v12.4h, v7.h[0] 1166 smlal v20.4s, v13.4h, v0.h[2] 1167 smlal v22.4s, v12.4h, v5.h[0] 1168 smlsl v22.4s, v13.4h, v1.h[2] 1169 smlsl v16.4s, v12.4h, v3.h[0] 1170 smlal v16.4s, v13.4h, v2.h[2] 1171 smlal v18.4s, v12.4h, v1.h[0] 1172 smlsl v18.4s, v13.4h, v3.h[2] 1173 1174shift3: 1175 add v8.4s, v20.4s , v24.4s 1176 sub v10.4s, v20.4s , v24.4s 1177 1178 add v12.4s, v22.4s , v26.4s 1179 sub v24.4s, v22.4s , v26.4s 1180 1181 add v14.4s, v16.4s , v28.4s 1182 sub v26.4s, v16.4s , v28.4s 1183 1184 1185 add v16.4s, v18.4s , v30.4s 1186 sub v28.4s, v18.4s , v30.4s 1187 1188 1189 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) 1190 sqrshrn v19.4h, v10.4s,#shift_stage1_idct 
//// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) 1191 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) 1192 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) 1193 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) 1194 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) 1195 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) 1196 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) 1197 1198 umov x15,v24.d[0] 1199 umov x16,v25.d[0] 1200 umov x19,v26.d[0] 1201 umov x20,v27.d[0] 1202 1203 trn1 v24.4h, v30.4h, v12.4h 1204 trn2 v25.4h, v30.4h, v12.4h 1205 trn1 v26.4h, v31.4h, v13.4h 1206 trn2 v27.4h, v31.4h, v13.4h 1207 1208 trn1 v30.2s, v24.2s, v26.2s 1209 trn2 v31.2s, v24.2s, v26.2s 1210 trn1 v12.2s, v25.2s, v27.2s 1211 trn2 v13.2s, v25.2s, v27.2s 1212 1213 trn1 v24.4h, v14.4h, v18.4h 1214 trn2 v25.4h, v14.4h, v18.4h 1215 trn1 v26.4h, v15.4h, v19.4h 1216 trn2 v27.4h, v15.4h, v19.4h 1217 1218 trn1 v14.2s, v24.2s, v26.2s 1219 trn2 v15.2s, v24.2s, v26.2s 1220 trn1 v18.2s, v25.2s, v27.2s 1221 trn2 v19.2s, v25.2s, v27.2s 1222 1223 mov v24.d[0],x15 1224 mov v25.d[0],x16 1225 mov v26.d[0],x19 1226 mov v27.d[0],x20 1227 st1 { v30.4h, v31.4h},[x1],#16 1228 st1 { v12.4h, v13.4h},[x1],#16 1229 add x1,x1,#64 1230 st1 { v14.4h, v15.4h},[x1],#16 1231 st1 { v18.4h, v19.4h},[x1],#16 1232 sub x1,x1,#96 1233 1234 mov x0,x8 1235 1236 1237 1238 ld1 {v10.4h},[x0],x6 1239 ld1 {v8.4h},[x0],x6 1240 ld1 {v11.4h},[x0],x6 1241 ld1 {v9.4h},[x0],x6 1242 1243 1244 smull v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) 1245 smull v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) 1246 smull v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2) 1247 smull v30.4s, v8.4h, v7.h[3] //// y1 * sin1(part of b3) 1248 1249 smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + 
y3 * cos3(part of b0) 1250 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) 1251 smlsl v28.4s, v9.4h, v5.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 1252 smlsl v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1253 1254 1255 1256 1257 1258 smull v20.4s, v10.4h, v0.h[0] 1259 smlsl v20.4s, v11.4h, v3.h[2] 1260 1261 1262 smull v22.4s, v10.4h, v0.h[0] 1263 smlsl v22.4s, v11.4h, v2.h[2] 1264 1265 smull v16.4s, v10.4h, v0.h[0] 1266 smlsl v16.4s, v11.4h, v1.h[2] 1267 1268 smull v18.4s, v10.4h, v0.h[0] 1269 smlsl v18.4s, v11.4h, v0.h[2] 1270 1271 cmp x11,x10 1272 bhs shift4 1273 1274 ld1 {v12.4h},[x0],x6 1275 ld1 {v14.4h},[x0],x6 1276 ld1 {v13.4h},[x0],x6 1277 ld1 {v15.4h},[x0],x6 1278 1279 1280 1281 1282 1283 1284 smlal v24.4s, v14.4h, v0.h[1] 1285 smlal v26.4s, v14.4h, v1.h[3] 1286 smlal v28.4s, v14.4h, v4.h[1] 1287 smlal v30.4s, v14.4h, v6.h[3] 1288 1289 1290 smlsl v24.4s, v15.4h, v4.h[1] 1291 smlsl v26.4s, v15.4h, v0.h[3] 1292 smlsl v28.4s, v15.4h, v2.h[3] 1293 smlsl v30.4s, v15.4h, v6.h[1] 1294 1295 1296 smlal v20.4s, v12.4h, v7.h[0] 1297 smlal v20.4s, v13.4h, v5.h[2] 1298 smlal v22.4s, v12.4h, v5.h[0] 1299 smlsl v22.4s, v13.4h, v7.h[2] 1300 smlal v16.4s, v12.4h, v3.h[0] 1301 smlsl v16.4s, v13.4h, v4.h[2] 1302 smlal v18.4s, v12.4h, v1.h[0] 1303 smlsl v18.4s, v13.4h, v1.h[2] 1304 1305 cmp x11,x9 1306 bhs shift4 1307 1308 ld1 {v10.4h},[x0],x6 1309 ld1 {v8.4h},[x0],x6 1310 ld1 {v11.4h},[x0],x6 1311 ld1 {v9.4h},[x0],x6 1312 1313 1314 1315 smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) 1316 smlal v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1) 1317 smlal v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) 1318 smlal v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3) 1319 1320 smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1321 smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 1322 smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 1323 smlsl v30.4s, v9.4h, v5.h[1] //// y1 * sin1 
- y3 * sin3(part of b3) 1324 1325 1326 1327 1328 1329 smlsl v20.4s, v10.4h, v2.h[0] 1330 smlal v20.4s, v11.4h, v1.h[2] 1331 1332 1333 smlsl v22.4s, v10.4h, v6.h[0] 1334 smlal v22.4s, v11.4h, v3.h[2] 1335 1336 smlal v16.4s, v10.4h, v6.h[0] 1337 smlsl v16.4s, v11.4h, v7.h[2] 1338 1339 smlal v18.4s, v10.4h, v2.h[0] 1340 smlsl v18.4s, v11.4h, v2.h[2] 1341 1342 cmp x11,x5 1343 bhs shift4 1344 1345 1346 ld1 {v12.4h},[x0],x6 1347 ld1 {v14.4h},[x0],x6 1348 ld1 {v13.4h},[x0],x6 1349 ld1 {v15.4h},[x0],x6 1350 1351 1352 1353 1354 1355 1356 smlsl v24.4s, v14.4h, v1.h[1] 1357 smlsl v26.4s, v14.4h, v7.h[3] 1358 smlal v28.4s, v14.4h, v1.h[3] 1359 smlal v30.4s, v14.4h, v4.h[3] 1360 1361 1362 smlal v24.4s, v15.4h, v2.h[1] 1363 smlal v26.4s, v15.4h, v5.h[1] 1364 smlsl v28.4s, v15.4h, v3.h[1] 1365 smlsl v30.4s, v15.4h, v4.h[1] 1366 1367 1368 smlsl v20.4s, v12.4h, v5.h[0] 1369 smlsl v20.4s, v13.4h, v7.h[2] 1370 smlsl v22.4s, v12.4h, v1.h[0] 1371 smlal v22.4s, v13.4h, v1.h[2] 1372 smlsl v16.4s, v12.4h, v7.h[0] 1373 smlal v16.4s, v13.4h, v5.h[2] 1374 smlal v18.4s, v12.4h, v3.h[0] 1375 smlsl v18.4s, v13.4h, v3.h[2] 1376 1377 cmp x11,x7 1378 bhs shift4 1379 1380 1381 ld1 {v10.4h},[x0],x6 1382 ld1 {v8.4h},[x0],x6 1383 ld1 {v11.4h},[x0],x6 1384 ld1 {v9.4h},[x0],x6 1385 1386 1387 smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0) 1388 smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) 1389 smlal v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2) 1390 smlal v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) 1391 1392 smlsl v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1393 smlal v26.4s, v9.4h, v0.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 1394 smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 1395 smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1396 1397 1398 1399 1400 1401 smlal v20.4s, v10.4h, v0.h[0] 1402 smlsl v20.4s, v11.4h, v0.h[2] 1403 1404 1405 smlsl v22.4s, v10.4h, v0.h[0] 1406 smlal v22.4s, v11.4h, v6.h[2] 1407 1408 
smlsl v16.4s, v10.4h, v0.h[0] 1409 smlal v16.4s, v11.4h, v2.h[2] 1410 1411 smlal v18.4s, v10.4h, v0.h[0] 1412 smlsl v18.4s, v11.4h, v4.h[2] 1413 1414 1415 1416 1417 ld1 {v12.4h},[x0],x6 1418 ld1 {v14.4h},[x0],x6 1419 ld1 {v13.4h},[x0],x6 1420 ld1 {v15.4h},[x0],x6 1421 1422 1423 1424 1425 1426 1427 smlal v24.4s, v14.4h, v3.h[1] 1428 smlsl v26.4s, v14.4h, v2.h[1] 1429 smlal v28.4s, v14.4h, v7.h[3] 1430 smlal v30.4s, v14.4h, v2.h[3] 1431 1432 1433 smlsl v24.4s, v15.4h, v0.h[3] 1434 smlal v26.4s, v15.4h, v4.h[3] 1435 smlal v28.4s, v15.4h, v6.h[3] 1436 smlsl v30.4s, v15.4h, v2.h[1] 1437 1438 1439 smlal v20.4s, v12.4h, v3.h[0] 1440 smlsl v20.4s, v13.4h, v6.h[2] 1441 smlal v22.4s, v12.4h, v7.h[0] 1442 smlsl v22.4s, v13.4h, v4.h[2] 1443 smlsl v16.4s, v12.4h, v1.h[0] 1444 smlal v16.4s, v13.4h, v0.h[2] 1445 smlal v18.4s, v12.4h, v5.h[0] 1446 smlsl v18.4s, v13.4h, v5.h[2] 1447 1448 1449 ld1 {v10.4h},[x0],x6 1450 ld1 {v8.4h},[x0],x6 1451 ld1 {v11.4h},[x0],x6 1452 ld1 {v9.4h},[x0],x6 1453 1454 1455 1456 1457 1458 smlal v24.4s, v8.4h, v3.h[3] //// y1 * cos1(part of b0) 1459 smlsl v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1) 1460 smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) 1461 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) 1462 1463 smlsl v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0) 1464 smlsl v26.4s, v9.4h, v6.h[1] //// y1 * cos3 - y3 * sin1(part of b1) 1465 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 1466 smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1467 1468 1469 1470 1471 1472 smlsl v20.4s, v10.4h, v6.h[0] 1473 smlal v20.4s, v11.4h, v2.h[2] 1474 1475 1476 smlal v22.4s, v10.4h, v2.h[0] 1477 smlsl v22.4s, v11.4h, v0.h[2] 1478 1479 smlsl v16.4s, v10.4h, v2.h[0] 1480 smlal v16.4s, v11.4h, v3.h[2] 1481 1482 smlal v18.4s, v10.4h, v6.h[0] 1483 smlsl v18.4s, v11.4h, v6.h[2] 1484 1485 1486 ld1 {v12.4h},[x0],x6 1487 ld1 {v14.4h},[x0],x6 1488 ld1 {v13.4h},[x0],x6 1489 ld1 {v15.4h},[x0],x6 1490 
1491 1492 1493 1494 smlsl v24.4s, v14.4h, v5.h[1] 1495 smlal v26.4s, v14.4h, v3.h[3] 1496 smlsl v28.4s, v14.4h, v2.h[1] 1497 smlal v30.4s, v14.4h, v0.h[3] 1498 1499 1500 smlal v24.4s, v15.4h, v1.h[3] 1501 smlsl v26.4s, v15.4h, v1.h[1] 1502 smlal v28.4s, v15.4h, v0.h[3] 1503 smlsl v30.4s, v15.4h, v0.h[1] 1504 1505 1506 smlsl v20.4s, v12.4h, v1.h[0] 1507 smlal v20.4s, v13.4h, v4.h[2] 1508 smlal v22.4s, v12.4h, v3.h[0] 1509 smlsl v22.4s, v13.4h, v5.h[2] 1510 smlsl v16.4s, v12.4h, v5.h[0] 1511 smlal v16.4s, v13.4h, v6.h[2] 1512 smlal v18.4s, v12.4h, v7.h[0] 1513 smlsl v18.4s, v13.4h, v7.h[2] 1514 1515shift4: 1516 add v8.4s, v20.4s , v24.4s 1517 sub v10.4s, v20.4s , v24.4s 1518 1519 add v12.4s, v22.4s , v26.4s 1520 sub v24.4s, v22.4s , v26.4s 1521 1522 add v14.4s, v16.4s , v28.4s 1523 sub v26.4s, v16.4s , v28.4s 1524 1525 1526 add v16.4s, v18.4s , v30.4s 1527 sub v28.4s, v18.4s , v30.4s 1528 1529 1530 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct) 1531 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct) 1532 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct) 1533 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct) 1534 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct) 1535 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct) 1536 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct) 1537 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct) 1538 1539 umov x15,v24.d[0] 1540 umov x16,v25.d[0] 1541 umov x19,v26.d[0] 1542 umov x20,v27.d[0] 1543 1544 trn1 v24.4h, v30.4h, v12.4h 1545 trn2 v25.4h, v30.4h, v12.4h 1546 trn1 v26.4h, v31.4h, v13.4h 1547 trn2 v27.4h, v31.4h, v13.4h 1548 1549 trn1 v30.2s, v24.2s, v26.2s 1550 trn2 v31.2s, v24.2s, v26.2s 1551 trn1 
v12.2s, v25.2s, v27.2s 1552 trn2 v13.2s, v25.2s, v27.2s 1553 1554 trn1 v24.4h, v14.4h, v18.4h 1555 trn2 v25.4h, v14.4h, v18.4h 1556 trn1 v26.4h, v15.4h, v19.4h 1557 trn2 v27.4h, v15.4h, v19.4h 1558 1559 trn1 v14.2s, v24.2s, v26.2s 1560 trn2 v15.2s, v24.2s, v26.2s 1561 trn1 v18.2s, v25.2s, v27.2s 1562 trn2 v19.2s, v25.2s, v27.2s 1563 1564 mov v24.d[0],x15 1565 mov v25.d[0],x16 1566 mov v26.d[0],x19 1567 mov v27.d[0],x20 1568 1569 st1 { v30.4h, v31.4h},[x1],#16 1570 st1 { v12.4h, v13.4h},[x1],#16 1571 st1 { v14.4h, v15.4h},[x1],#16 1572 st1 { v18.4h, v19.4h},[x1],#16 1573 1574 add x1,x1,#96 1575 1576 subs x14,x14,#1 1577 bne dct_stage1 1578second_stage_dct: 1579// mov x0,x1 1580 ldp x8, x7,[sp],#16 1581 ldp x0, x1,[sp],#16 1582 1583// add x4,x2,x8, lsl #1 @ x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data 1584// add x5,x8,x8, lsl #1 @ 1585// sub x0,x0,#512 1586 mov x11,#0xfffffff0 1587 mov x5, #0xffffff00 1588 mov w6,#0xfffff000 1589 mov w9,#0xffff0000 1590// sub x1,x1,#2048 1591 mov x4,x1 1592 mov x10,#240 1593 mov x14,#8 1594 b stage2 1595 1596// registers free : 1597 1598// arm registers used 1599// x8 : predicition stride 1600// x7 : destination stride 1601// x1: temp buffer 1602// x2 : pred buffer 1603// x3 : destination buffer 1604// x14 : loop counter 1605//x0 : scratch buffer 1606//x10 : used as stride 1607// x4 : used to store the initial address 1608//x12 : zero cols 1609// x11 : 0xfffffff0 1610// x5 : 0xffffff00 1611dct_stage2: 1612 add x4,x4,#32 1613 mov x1,x4 1614stage2: 1615 ld1 {v10.4h, v11.4h},[x1],#16 1616 ld1 {v8.4h, v9.4h},[x1],x10 1617 1618 smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0) 1619 smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1) 1620 smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) 1621 smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) 1622 1623 smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1624 smlal v26.4s, v9.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1) 1625 smlal 
v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 1626 smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1627 1628 1629 1630 smull v20.4s, v10.4h, v0.h[0] 1631 smlal v20.4s, v11.4h, v0.h[2] 1632 1633 1634 smull v22.4s, v10.4h, v0.h[0] 1635 smlal v22.4s, v11.4h, v1.h[2] 1636 1637 smull v16.4s, v10.4h, v0.h[0] 1638 smlal v16.4s, v11.4h, v2.h[2] 1639 1640 smull v18.4s, v10.4h, v0.h[0] 1641 smlal v18.4s, v11.4h, v3.h[2] 1642 cmp x12,x11 1643 bhs stage2_shift1 1644 1645 ld1 {v12.4h, v13.4h},[x1],#16 1646 ld1 {v14.4h, v15.4h},[x1],x10 1647 1648 1649 1650 1651 1652 1653 smlal v24.4s, v14.4h, v1.h[1] 1654 smlal v26.4s, v14.4h, v3.h[3] 1655 smlal v28.4s, v14.4h, v6.h[1] 1656 smlsl v30.4s, v14.4h, v7.h[1] 1657 1658 1659 smlal v24.4s, v15.4h, v1.h[3] 1660 smlal v26.4s, v15.4h, v5.h[1] 1661 smlsl v28.4s, v15.4h, v7.h[1] 1662 smlsl v30.4s, v15.4h, v3.h[3] 1663 1664 1665 smlal v20.4s, v12.4h, v1.h[0] 1666 smlal v20.4s, v13.4h, v1.h[2] 1667 smlal v22.4s, v12.4h, v3.h[0] 1668 smlal v22.4s, v13.4h, v4.h[2] 1669 smlal v16.4s, v12.4h, v5.h[0] 1670 smlal v16.4s, v13.4h, v7.h[2] 1671 smlal v18.4s, v12.4h, v7.h[0] 1672 smlsl v18.4s, v13.4h, v5.h[2] 1673 cmp x12,x5 1674 bhs stage2_shift1 1675 1676 ld1 {v10.4h, v11.4h},[x1],#16 1677 ld1 {v8.4h, v9.4h},[x1],x10 1678 1679 smlal v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0) 1680 smlal v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) 1681 smlsl v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2) 1682 smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3) 1683 1684 smlal v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1685 smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 1686 smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 1687 smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1688 1689 1690 1691 1692 1693 smlal v20.4s, v10.4h, v2.h[0] 1694 smlal v20.4s, v11.4h, v2.h[2] 1695 1696 1697 smlal v22.4s, v10.4h, v6.h[0] 1698 smlal v22.4s, v11.4h, 
v7.h[2] 1699 1700 smlsl v16.4s, v10.4h, v6.h[0] 1701 smlsl v16.4s, v11.4h, v3.h[2] 1702 1703 smlsl v18.4s, v10.4h, v2.h[0] 1704 smlsl v18.4s, v11.4h, v1.h[2] 1705 1706 cmp x12,x6 1707 bhs stage2_shift1 1708 1709 1710 ld1 {v12.4h, v13.4h},[x1],#16 1711 ld1 {v14.4h, v15.4h},[x1],x10 1712 1713 1714 1715 1716 1717 smlal v24.4s, v14.4h, v3.h[1] 1718 smlsl v26.4s, v14.4h, v6.h[1] 1719 smlsl v28.4s, v14.4h, v0.h[1] 1720 smlsl v30.4s, v14.4h, v6.h[3] 1721 1722 1723 smlal v24.4s, v15.4h, v3.h[3] 1724 smlsl v26.4s, v15.4h, v4.h[3] 1725 smlsl v28.4s, v15.4h, v2.h[3] 1726 smlal v30.4s, v15.4h, v5.h[3] 1727 1728 1729 smlal v20.4s, v12.4h, v3.h[0] 1730 smlal v20.4s, v13.4h, v3.h[2] 1731 smlsl v22.4s, v12.4h, v7.h[0] 1732 smlsl v22.4s, v13.4h, v5.h[2] 1733 smlsl v16.4s, v12.4h, v1.h[0] 1734 smlsl v16.4s, v13.4h, v1.h[2] 1735 smlsl v18.4s, v12.4h, v5.h[0] 1736 smlal v18.4s, v13.4h, v7.h[2] 1737 1738 cmp x12,x9 1739 bhs stage2_shift1 1740 1741 1742 ld1 {v10.4h, v11.4h},[x1],#16 1743 ld1 {v8.4h, v9.4h},[x1],x10 1744 1745 1746 smlal v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) 1747 smlsl v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1) 1748 smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) 1749 smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3) 1750 1751 smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1752 smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 1753 smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 1754 smlal v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1755 1756 1757 1758 1759 1760 smlal v20.4s, v10.4h, v0.h[0] 1761 smlal v20.4s, v11.4h, v4.h[2] 1762 1763 1764 smlsl v22.4s, v10.4h, v0.h[0] 1765 smlsl v22.4s, v11.4h, v2.h[2] 1766 1767 smlsl v16.4s, v10.4h, v0.h[0] 1768 smlsl v16.4s, v11.4h, v6.h[2] 1769 1770 smlal v18.4s, v10.4h, v0.h[0] 1771 smlal v18.4s, v11.4h, v0.h[2] 1772 1773 ld1 {v12.4h, v13.4h},[x1],#16 1774 ld1 {v14.4h, v15.4h},[x1],x10 1775 1776 1777 1778 1779 1780 smlal 
v24.4s, v14.4h, v5.h[1] 1781 smlsl v26.4s, v14.4h, v0.h[2] 1782 smlal v28.4s, v14.4h, v5.h[3] 1783 smlal v30.4s, v14.4h, v4.h[3] 1784 1785 1786 smlal v24.4s, v15.4h, v5.h[3] 1787 smlsl v26.4s, v15.4h, v1.h[1] 1788 smlal v28.4s, v15.4h, v3.h[1] 1789 smlsl v30.4s, v15.4h, v7.h[3] 1790 1791 1792 smlal v20.4s, v12.4h, v5.h[0] 1793 smlal v20.4s, v13.4h, v5.h[2] 1794 smlsl v22.4s, v12.4h, v1.h[0] 1795 smlsl v22.4s, v13.4h, v0.h[2] 1796 smlal v16.4s, v12.4h, v7.h[0] 1797 smlal v16.4s, v13.4h, v4.h[2] 1798 smlal v18.4s, v12.4h, v3.h[0] 1799 smlal v18.4s, v13.4h, v6.h[2] 1800 1801 1802 ld1 {v10.4h, v11.4h},[x1],#16 1803 ld1 {v8.4h, v9.4h},[x1],x10 1804 1805 1806 1807 1808 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) 1809 smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) 1810 smlal v28.4s, v8.4h, v0.h[1] //// y1 * sin3(part of b2) 1811 smlsl v30.4s, v8.4h, v4.h[1] //// y1 * sin1(part of b3) 1812 1813 smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1814 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) 1815 smlal v28.4s, v9.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 1816 smlsl v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 1817 1818 1819 1820 1821 1822 smlal v20.4s, v10.4h, v6.h[0] 1823 smlal v20.4s, v11.4h, v6.h[2] 1824 1825 1826 smlsl v22.4s, v10.4h, v2.h[0] 1827 smlsl v22.4s, v11.4h, v3.h[2] 1828 1829 smlal v16.4s, v10.4h, v2.h[0] 1830 smlal v16.4s, v11.4h, v0.h[2] 1831 1832 smlsl v18.4s, v10.4h, v6.h[0] 1833 smlsl v18.4s, v11.4h, v2.h[2] 1834 1835 ld1 {v12.4h, v13.4h},[x1],#16 1836 ld1 {v14.4h, v15.4h},[x1],x10 1837 1838 smlal v24.4s, v14.4h, v7.h[1] 1839 smlsl v26.4s, v14.4h, v5.h[3] 1840 smlal v28.4s, v14.4h, v4.h[1] 1841 smlsl v30.4s, v14.4h, v2.h[3] 1842 1843 1844 smlal v24.4s, v15.4h, v7.h[3] 1845 smlsl v26.4s, v15.4h, v7.h[1] 1846 smlal v28.4s, v15.4h, v6.h[3] 1847 smlsl v30.4s, v15.4h, v6.h[1] 1848 1849 1850 smlal v20.4s, v12.4h, v7.h[0] 1851 smlal v20.4s, v13.4h, v7.h[2] 1852 smlsl 
v22.4s, v12.4h, v5.h[0] 1853 smlsl v22.4s, v13.4h, v6.h[2] 1854 smlal v16.4s, v12.4h, v3.h[0] 1855 smlal v16.4s, v13.4h, v5.h[2] 1856 smlsl v18.4s, v12.4h, v1.h[0] 1857 smlsl v18.4s, v13.4h, v4.h[2] 1858 1859stage2_shift1: 1860 add v8.4s, v20.4s , v24.4s 1861 sub v10.4s, v20.4s , v24.4s 1862 1863 add v12.4s, v22.4s , v26.4s 1864 sub v24.4s, v22.4s , v26.4s 1865 1866 add v14.4s, v16.4s , v28.4s 1867 sub v26.4s, v16.4s , v28.4s 1868 1869 1870 add v16.4s, v18.4s , v30.4s 1871 sub v28.4s, v18.4s , v30.4s 1872 1873 1874 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct) 1875 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct) 1876 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct) 1877 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct) 1878 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct) 1879 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct) 1880 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct) 1881 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct) 1882 1883 1884 umov x15,v24.d[0] 1885 umov x16,v25.d[0] 1886 umov x19,v26.d[0] 1887 umov x20,v27.d[0] 1888 1889 trn1 v24.4h, v30.4h, v12.4h 1890 trn2 v25.4h, v30.4h, v12.4h 1891 trn1 v26.4h, v31.4h, v13.4h 1892 trn2 v27.4h, v31.4h, v13.4h 1893 1894 trn1 v30.2s, v24.2s, v26.2s 1895 trn2 v31.2s, v24.2s, v26.2s 1896 trn1 v12.2s, v25.2s, v27.2s 1897 trn2 v13.2s, v25.2s, v27.2s 1898 1899 trn1 v24.4h, v14.4h, v18.4h 1900 trn2 v25.4h, v14.4h, v18.4h 1901 trn1 v26.4h, v15.4h, v19.4h 1902 trn2 v27.4h, v15.4h, v19.4h 1903 1904 trn1 v14.2s, v24.2s, v26.2s 1905 trn2 v15.2s, v24.2s, v26.2s 1906 trn1 v18.2s, v25.2s, v27.2s 1907 trn2 v19.2s, v25.2s, v27.2s 1908 1909 mov v24.d[0],x15 1910 mov v25.d[0],x16 1911 mov 
v26.d[0],x19 1912 mov v27.d[0],x20 1913 1914 st1 { v30.4h, v31.4h},[x0],#16 1915 st1 { v12.4h, v13.4h},[x0],#16 1916 st1 { v14.4h, v15.4h},[x0],#16 1917 st1 { v18.4h, v19.4h},[x0],#16 1918 1919 mov x1,x4 1920 1921 1922 1923 1924 1925 1926 ld1 {v10.4h, v11.4h},[x1],#16 1927 ld1 {v8.4h, v9.4h},[x1],x10 1928 1929 1930 smull v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0) 1931 smull v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) 1932 smull v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2) 1933 smull v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) 1934 1935 smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 1936 smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 1937 smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 1938 smlsl v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3) 1939 1940 1941 1942 1943 1944 smull v20.4s, v10.4h, v0.h[0] 1945 smlal v20.4s, v11.4h, v4.h[2] 1946 1947 1948 smull v22.4s, v10.4h, v0.h[0] 1949 smlal v22.4s, v11.4h, v5.h[2] 1950 1951 smull v16.4s, v10.4h, v0.h[0] 1952 smlal v16.4s, v11.4h, v6.h[2] 1953 1954 smull v18.4s, v10.4h, v0.h[0] 1955 smlal v18.4s, v11.4h, v7.h[2] 1956 1957 cmp x12,x11 1958 bhs stage2_shift2 1959 1960 ld1 {v12.4h, v13.4h},[x1],#16 1961 ld1 {v14.4h, v15.4h},[x1],x10 1962 1963 1964 smlsl v24.4s, v14.4h, v4.h[3] 1965 smlsl v26.4s, v14.4h, v2.h[1] 1966 smlsl v28.4s, v14.4h, v0.h[1] 1967 smlsl v30.4s, v14.4h, v2.h[3] 1968 1969 1970 smlsl v24.4s, v15.4h, v0.h[3] 1971 smlsl v26.4s, v15.4h, v3.h[1] 1972 smlsl v28.4s, v15.4h, v6.h[3] 1973 smlal v30.4s, v15.4h, v5.h[3] 1974 1975 1976 smlsl v20.4s, v12.4h, v7.h[0] 1977 smlsl v20.4s, v13.4h, v2.h[2] 1978 smlsl v22.4s, v12.4h, v5.h[0] 1979 smlsl v22.4s, v13.4h, v0.h[2] 1980 smlsl v16.4s, v12.4h, v3.h[0] 1981 smlsl v16.4s, v13.4h, v3.h[2] 1982 smlsl v18.4s, v12.4h, v1.h[0] 1983 smlsl v18.4s, v13.4h, v6.h[2] 1984 1985 cmp x12,x5 1986 bhs stage2_shift2 1987 1988 ld1 {v10.4h, v11.4h},[x1],#16 1989 ld1 {v8.4h, 
v9.4h},[x1],x10 1990 1991 1992 1993 1994 1995 smlsl v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) 1996 smlal v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1) 1997 smlal v28.4s, v8.4h, v2.h[3] //// y1 * sin3(part of b2) 1998 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) 1999 2000 smlal v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0) 2001 smlal v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 2002 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 2003 smlsl v30.4s, v9.4h, v6.h[3] //// y1 * sin1 - y3 * sin3(part of b3) 2004 2005 2006 2007 2008 2009 smlsl v20.4s, v10.4h, v2.h[0] 2010 smlsl v20.4s, v11.4h, v6.h[2] 2011 2012 2013 smlsl v22.4s, v10.4h, v6.h[0] 2014 smlal v22.4s, v11.4h, v4.h[2] 2015 2016 smlal v16.4s, v10.4h, v6.h[0] 2017 smlal v16.4s, v11.4h, v0.h[2] 2018 2019 smlal v18.4s, v10.4h, v2.h[0] 2020 smlal v18.4s, v11.4h, v5.h[2] 2021 2022 cmp x12,x6 2023 bhs stage2_shift2 2024 2025 2026 ld1 {v12.4h, v13.4h},[x1],#16 2027 ld1 {v14.4h, v15.4h},[x1],x10 2028 2029 2030 2031 2032 2033 2034 smlal v24.4s, v14.4h, v2.h[3] 2035 smlal v26.4s, v14.4h, v3.h[3] 2036 smlsl v28.4s, v14.4h, v5.h[3] 2037 smlsl v30.4s, v14.4h, v0.h[3] 2038 2039 2040 smlal v24.4s, v15.4h, v1.h[3] 2041 smlsl v26.4s, v15.4h, v6.h[3] 2042 smlsl v28.4s, v15.4h, v0.h[3] 2043 smlal v30.4s, v15.4h, v7.h[3] 2044 2045 2046 smlal v20.4s, v12.4h, v5.h[0] 2047 smlal v20.4s, v13.4h, v0.h[2] 2048 smlal v22.4s, v12.4h, v1.h[0] 2049 smlal v22.4s, v13.4h, v6.h[2] 2050 smlal v16.4s, v12.4h, v7.h[0] 2051 smlsl v16.4s, v13.4h, v2.h[2] 2052 smlsl v18.4s, v12.4h, v3.h[0] 2053 smlsl v18.4s, v13.4h, v4.h[2] 2054 2055 cmp x12,x9 2056 bhs stage2_shift2 2057 2058 2059 ld1 {v10.4h, v11.4h},[x1],#16 2060 ld1 {v8.4h, v9.4h},[x1],x10 2061 2062 2063 2064 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) 2065 smlsl v26.4s, v8.4h, v1.h[1] //// y1 * cos3(part of b1) 2066 smlsl v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2) 2067 smlal v30.4s, v8.4h, v0.h[3] //// 
y1 * sin1(part of b3) 2068 2069 smlsl v24.4s, v9.4h, v5.h[1] //// y1 * cos1 + y3 * cos3(part of b0) 2070 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) 2071 smlal v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 2072 smlal v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2073 2074 2075 2076 2077 2078 smlal v20.4s, v10.4h, v0.h[0] 2079 smlsl v20.4s, v11.4h, v7.h[2] 2080 2081 2082 smlsl v22.4s, v10.4h, v0.h[0] 2083 smlsl v22.4s, v11.4h, v1.h[2] 2084 2085 smlsl v16.4s, v10.4h, v0.h[0] 2086 smlal v16.4s, v11.4h, v5.h[2] 2087 2088 smlal v18.4s, v10.4h, v0.h[0] 2089 smlal v18.4s, v11.4h, v3.h[2] 2090 2091 ld1 {v12.4h, v13.4h},[x1],#16 2092 ld1 {v14.4h, v15.4h},[x1],x10 2093 2094 2095 2096 2097 smlsl v24.4s, v14.4h, v0.h[1] 2098 smlal v26.4s, v14.4h, v6.h[1] 2099 smlal v28.4s, v14.4h, v4.h[1] 2100 smlsl v30.4s, v14.4h, v1.h[1] 2101 2102 2103 smlsl v24.4s, v15.4h, v3.h[3] 2104 smlal v26.4s, v15.4h, v0.h[1] 2105 smlsl v28.4s, v15.4h, v5.h[1] 2106 smlsl v30.4s, v15.4h, v6.h[1] 2107 2108 2109 smlsl v20.4s, v12.4h, v3.h[0] 2110 smlsl v20.4s, v13.4h, v1.h[2] 2111 smlsl v22.4s, v12.4h, v7.h[0] 2112 smlal v22.4s, v13.4h, v3.h[2] 2113 smlal v16.4s, v12.4h, v1.h[0] 2114 smlal v16.4s, v13.4h, v7.h[2] 2115 smlsl v18.4s, v12.4h, v5.h[0] 2116 smlsl v18.4s, v13.4h, v2.h[2] 2117 2118 2119 ld1 {v10.4h, v11.4h},[x1],#16 2120 ld1 {v8.4h, v9.4h},[x1],x10 2121 2122 2123 smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) 2124 smlal v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1) 2125 smlsl v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) 2126 smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3) 2127 2128 smlal v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0) 2129 smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 2130 smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 2131 smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2132 2133 2134 2135 2136 2137 smlsl v20.4s, v10.4h, 
v6.h[0] 2138 smlal v20.4s, v11.4h, v5.h[2] 2139 2140 2141 smlal v22.4s, v10.4h, v2.h[0] 2142 smlal v22.4s, v11.4h, v7.h[2] 2143 2144 smlsl v16.4s, v10.4h, v2.h[0] 2145 smlsl v16.4s, v11.4h, v4.h[2] 2146 2147 smlal v18.4s, v10.4h, v6.h[0] 2148 smlal v18.4s, v11.4h, v1.h[2] 2149 2150 2151 ld1 {v12.4h, v13.4h},[x1],#16 2152 ld1 {v14.4h, v15.4h},[x1],x10 2153 2154 2155 2156 smlal v24.4s, v14.4h, v1.h[1] 2157 smlsl v26.4s, v14.4h, v0.h[3] 2158 smlal v28.4s, v14.4h, v1.h[3] 2159 smlsl v30.4s, v14.4h, v3.h[1] 2160 2161 2162 smlal v24.4s, v15.4h, v5.h[3] 2163 smlsl v26.4s, v15.4h, v5.h[1] 2164 smlal v28.4s, v15.4h, v4.h[3] 2165 smlsl v30.4s, v15.4h, v4.h[1] 2166 2167 2168 smlal v20.4s, v12.4h, v1.h[0] 2169 smlal v20.4s, v13.4h, v3.h[2] 2170 smlsl v22.4s, v12.4h, v3.h[0] 2171 smlsl v22.4s, v13.4h, v2.h[2] 2172 smlal v16.4s, v12.4h, v5.h[0] 2173 smlal v16.4s, v13.4h, v1.h[2] 2174 smlsl v18.4s, v12.4h, v7.h[0] 2175 smlsl v18.4s, v13.4h, v0.h[2] 2176 2177stage2_shift2: 2178 add v8.4s, v20.4s , v24.4s 2179 sub v10.4s, v20.4s , v24.4s 2180 2181 add v12.4s, v22.4s , v26.4s 2182 sub v24.4s, v22.4s , v26.4s 2183 2184 add v14.4s, v16.4s , v28.4s 2185 sub v26.4s, v16.4s , v28.4s 2186 2187 2188 add v16.4s, v18.4s , v30.4s 2189 sub v28.4s, v18.4s , v30.4s 2190 2191 2192 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct) 2193 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct) 2194 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct) 2195 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct) 2196 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct) 2197 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct) 2198 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct) 2199 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = 
(a3 - b3 + rnd) >> 7(shift_stage2_idct) 2200 2201 umov x15,v24.d[0] 2202 umov x16,v25.d[0] 2203 umov x19,v26.d[0] 2204 umov x20,v27.d[0] 2205 2206 trn1 v24.4h, v30.4h, v12.4h 2207 trn2 v25.4h, v30.4h, v12.4h 2208 trn1 v26.4h, v31.4h, v13.4h 2209 trn2 v27.4h, v31.4h, v13.4h 2210 2211 trn1 v30.2s, v24.2s, v26.2s 2212 trn2 v31.2s, v24.2s, v26.2s 2213 trn1 v12.2s, v25.2s, v27.2s 2214 trn2 v13.2s, v25.2s, v27.2s 2215 2216 trn1 v24.4h, v14.4h, v18.4h 2217 trn2 v25.4h, v14.4h, v18.4h 2218 trn1 v26.4h, v15.4h, v19.4h 2219 trn2 v27.4h, v15.4h, v19.4h 2220 2221 trn1 v14.2s, v24.2s, v26.2s 2222 trn2 v15.2s, v24.2s, v26.2s 2223 trn1 v18.2s, v25.2s, v27.2s 2224 trn2 v19.2s, v25.2s, v27.2s 2225 2226 mov v24.d[0],x15 2227 mov v25.d[0],x16 2228 mov v26.d[0],x19 2229 mov v27.d[0],x20 2230 2231 st1 { v30.4h, v31.4h},[x0],#16 2232 st1 { v12.4h, v13.4h},[x0],#16 2233 st1 { v14.4h, v15.4h},[x0],#16 2234 st1 { v18.4h, v19.4h},[x0],#16 2235 2236 2237 mov x1,x4 2238 2239 2240 2241 2242 ld1 {v10.4h, v11.4h},[x1],#16 2243 ld1 {v8.4h, v9.4h},[x1],x10 2244 2245 smull v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0) 2246 smull v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1) 2247 smull v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) 2248 smull v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3) 2249 2250 smlsl v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0) 2251 smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 2252 smlsl v28.4s, v9.4h, v0.h[2] //// y1 * sin3 - y3 * cos1(part of b2) 2253 smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2254 2255 2256 2257 2258 2259 smull v20.4s, v10.4h, v0.h[0] 2260 smlsl v20.4s, v11.4h, v7.h[2] 2261 2262 2263 smull v22.4s, v10.4h, v0.h[0] 2264 smlsl v22.4s, v11.4h, v6.h[2] 2265 2266 smull v16.4s, v10.4h, v0.h[0] 2267 smlsl v16.4s, v11.4h, v5.h[2] 2268 2269 smull v18.4s, v10.4h, v0.h[0] 2270 smlsl v18.4s, v11.4h, v4.h[2] 2271 2272 cmp x12,x11 2273 bhs stage2_shift3 2274 2275 ld1 {v12.4h, v13.4h},[x1],#16 
2276 ld1 {v14.4h, v15.4h},[x1],x10 2277 2278 smlsl v24.4s, v14.4h, v5.h[1] 2279 smlsl v26.4s, v14.4h, v7.h[3] 2280 smlal v28.4s, v14.4h, v5.h[3] 2281 smlal v30.4s, v14.4h, v3.h[1] 2282 2283 2284 smlal v24.4s, v15.4h, v2.h[1] 2285 smlal v26.4s, v15.4h, v1.h[1] 2286 smlal v28.4s, v15.4h, v4.h[3] 2287 smlsl v30.4s, v15.4h, v7.h[3] 2288 2289 2290 smlsl v20.4s, v12.4h, v1.h[0] 2291 smlal v20.4s, v13.4h, v6.h[2] 2292 smlsl v22.4s, v12.4h, v3.h[0] 2293 smlal v22.4s, v13.4h, v3.h[2] 2294 smlsl v16.4s, v12.4h, v5.h[0] 2295 smlal v16.4s, v13.4h, v0.h[2] 2296 smlsl v18.4s, v12.4h, v7.h[0] 2297 smlal v18.4s, v13.4h, v2.h[2] 2298 2299 cmp x12,x5 2300 bhs stage2_shift3 2301 2302 ld1 {v10.4h, v11.4h},[x1],#16 2303 ld1 {v8.4h, v9.4h},[x1],x10 2304 2305 2306 2307 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) 2308 smlsl v26.4s, v8.4h, v5.h[1] //// y1 * cos3(part of b1) 2309 smlsl v28.4s, v8.4h, v0.h[3] //// y1 * sin3(part of b2) 2310 smlsl v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) 2311 2312 smlsl v24.4s, v9.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0) 2313 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) 2314 smlal v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 2315 smlal v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2316 2317 2318 2319 2320 2321 smlal v20.4s, v10.4h, v2.h[0] 2322 smlsl v20.4s, v11.4h, v5.h[2] 2323 2324 2325 smlal v22.4s, v10.4h, v6.h[0] 2326 smlsl v22.4s, v11.4h, v0.h[2] 2327 2328 smlsl v16.4s, v10.4h, v6.h[0] 2329 smlsl v16.4s, v11.4h, v4.h[2] 2330 2331 smlsl v18.4s, v10.4h, v2.h[0] 2332 smlal v18.4s, v11.4h, v6.h[2] 2333 2334 cmp x12,x6 2335 bhs stage2_shift3 2336 2337 ld1 {v12.4h, v13.4h},[x1],#16 2338 ld1 {v14.4h, v15.4h},[x1],x10 2339 2340 2341 2342 2343 2344 smlsl v24.4s, v14.4h, v7.h[1] 2345 smlal v26.4s, v14.4h, v2.h[1] 2346 smlal v28.4s, v14.4h, v4.h[1] 2347 smlsl v30.4s, v14.4h, v5.h[1] 2348 2349 2350 smlal v24.4s, v15.4h, v0.h[3] 2351 smlal v26.4s, v15.4h, v7.h[1] 2352 smlsl 
v28.4s, v15.4h, v1.h[1] 2353 smlsl v30.4s, v15.4h, v6.h[1] 2354 2355 2356 smlsl v20.4s, v12.4h, v3.h[0] 2357 smlal v20.4s, v13.4h, v4.h[2] 2358 smlal v22.4s, v12.4h, v7.h[0] 2359 smlal v22.4s, v13.4h, v2.h[2] 2360 smlal v16.4s, v12.4h, v1.h[0] 2361 smlsl v16.4s, v13.4h, v6.h[2] 2362 smlal v18.4s, v12.4h, v5.h[0] 2363 smlsl v18.4s, v13.4h, v0.h[2] 2364 2365 cmp x12,x9 2366 bhs stage2_shift3 2367 2368 2369 ld1 {v10.4h, v11.4h},[x1],#16 2370 ld1 {v8.4h, v9.4h},[x1],x10 2371 2372 2373 smlsl v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) 2374 smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3(part of b1) 2375 smlal v28.4s, v8.4h, v6.h[3] //// y1 * sin3(part of b2) 2376 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) 2377 2378 smlsl v24.4s, v9.4h, v0.h[1] //// y1 * cos1 + y3 * cos3(part of b0) 2379 smlal v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 2380 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 2381 smlsl v30.4s, v9.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3) 2382 2383 2384 2385 2386 2387 smlal v20.4s, v10.4h, v0.h[0] 2388 smlsl v20.4s, v11.4h, v3.h[2] 2389 2390 2391 smlsl v22.4s, v10.4h, v0.h[0] 2392 smlsl v22.4s, v11.4h, v5.h[2] 2393 2394 smlsl v16.4s, v10.4h, v0.h[0] 2395 smlal v16.4s, v11.4h, v1.h[2] 2396 2397 smlal v18.4s, v10.4h, v0.h[0] 2398 smlal v18.4s, v11.4h, v7.h[2] 2399 2400 ld1 {v12.4h, v13.4h},[x1],#16 2401 ld1 {v14.4h, v15.4h},[x1],x10 2402 2403 2404 2405 2406 smlal v24.4s, v14.4h, v6.h[3] 2407 smlal v26.4s, v14.4h, v3.h[3] 2408 smlsl v28.4s, v14.4h, v1.h[3] 2409 smlal v30.4s, v14.4h, v7.h[1] 2410 2411 2412 smlal v24.4s, v15.4h, v1.h[3] 2413 smlsl v26.4s, v15.4h, v2.h[3] 2414 smlal v28.4s, v15.4h, v7.h[1] 2415 smlal v30.4s, v15.4h, v4.h[1] 2416 2417 2418 smlsl v20.4s, v12.4h, v5.h[0] 2419 smlal v20.4s, v13.4h, v2.h[2] 2420 smlal v22.4s, v12.4h, v1.h[0] 2421 smlsl v22.4s, v13.4h, v7.h[2] 2422 smlsl v16.4s, v12.4h, v7.h[0] 2423 smlsl v16.4s, v13.4h, v3.h[2] 2424 smlsl v18.4s, v12.4h, v3.h[0] 2425 smlal 
v18.4s, v13.4h, v1.h[2] 2426 2427 2428 ld1 {v10.4h, v11.4h},[x1],#16 2429 ld1 {v8.4h, v9.4h},[x1],x10 2430 2431 2432 smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0) 2433 smlsl v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) 2434 smlal v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2) 2435 smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3) 2436 2437 smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 2438 smlal v26.4s, v9.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1) 2439 smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 2440 smlal v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3) 2441 2442 2443 2444 2445 2446 smlal v20.4s, v10.4h, v6.h[0] 2447 smlsl v20.4s, v11.4h, v1.h[2] 2448 2449 2450 smlsl v22.4s, v10.4h, v2.h[0] 2451 smlal v22.4s, v11.4h, v4.h[2] 2452 2453 smlal v16.4s, v10.4h, v2.h[0] 2454 smlsl v16.4s, v11.4h, v7.h[2] 2455 2456 smlsl v18.4s, v10.4h, v6.h[0] 2457 smlsl v18.4s, v11.4h, v5.h[2] 2458 2459 ld1 {v12.4h, v13.4h},[x1],#16 2460 ld1 {v14.4h, v15.4h},[x1],x10 2461 2462 2463 2464 smlal v24.4s, v14.4h, v4.h[3] 2465 smlsl v26.4s, v14.4h, v6.h[1] 2466 smlal v28.4s, v14.4h, v7.h[3] 2467 smlal v30.4s, v14.4h, v6.h[3] 2468 2469 2470 smlal v24.4s, v15.4h, v3.h[3] 2471 smlsl v26.4s, v15.4h, v3.h[1] 2472 smlal v28.4s, v15.4h, v2.h[3] 2473 smlsl v30.4s, v15.4h, v2.h[1] 2474 2475 2476 smlsl v20.4s, v12.4h, v7.h[0] 2477 smlal v20.4s, v13.4h, v0.h[2] 2478 smlal v22.4s, v12.4h, v5.h[0] 2479 smlsl v22.4s, v13.4h, v1.h[2] 2480 smlsl v16.4s, v12.4h, v3.h[0] 2481 smlal v16.4s, v13.4h, v2.h[2] 2482 smlal v18.4s, v12.4h, v1.h[0] 2483 smlsl v18.4s, v13.4h, v3.h[2] 2484 2485stage2_shift3: 2486 add v8.4s, v20.4s , v24.4s 2487 sub v10.4s, v20.4s , v24.4s 2488 2489 add v12.4s, v22.4s , v26.4s 2490 sub v24.4s, v22.4s , v26.4s 2491 2492 add v14.4s, v16.4s , v28.4s 2493 sub v26.4s, v16.4s , v28.4s 2494 2495 2496 add v16.4s, v18.4s , v30.4s 2497 sub v28.4s, v18.4s , v30.4s 2498 2499 2500 sqrshrn v30.4h, 
v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct) 2501 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x11 = (a0 - b0 + rnd) >> 7(shift_stage2_idct) 2502 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct) 2503 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct) 2504 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct) 2505 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct) 2506 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct) 2507 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct) 2508 2509 umov x15,v24.d[0] 2510 umov x16,v25.d[0] 2511 umov x19,v26.d[0] 2512 umov x20,v27.d[0] 2513 2514 trn1 v24.4h, v30.4h, v12.4h 2515 trn2 v25.4h, v30.4h, v12.4h 2516 trn1 v26.4h, v31.4h, v13.4h 2517 trn2 v27.4h, v31.4h, v13.4h 2518 2519 trn1 v30.2s, v24.2s, v26.2s 2520 trn2 v31.2s, v24.2s, v26.2s 2521 trn1 v12.2s, v25.2s, v27.2s 2522 trn2 v13.2s, v25.2s, v27.2s 2523 2524 trn1 v24.4h, v14.4h, v18.4h 2525 trn2 v25.4h, v14.4h, v18.4h 2526 trn1 v26.4h, v15.4h, v19.4h 2527 trn2 v27.4h, v15.4h, v19.4h 2528 2529 trn1 v14.2s, v24.2s, v26.2s 2530 trn2 v15.2s, v24.2s, v26.2s 2531 trn1 v18.2s, v25.2s, v27.2s 2532 trn2 v19.2s, v25.2s, v27.2s 2533 2534 mov v24.d[0],x15 2535 mov v25.d[0],x16 2536 mov v26.d[0],x19 2537 mov v27.d[0],x20 2538 2539 st1 { v30.4h, v31.4h},[x0],#16 2540 st1 { v12.4h, v13.4h},[x0],#16 2541 st1 { v14.4h, v15.4h},[x0],#16 2542 st1 { v18.4h, v19.4h},[x0],#16 2543 2544 2545 2546 mov x1,x4 2547 2548 2549 2550 2551 ld1 {v10.4h, v11.4h},[x1],#16 2552 ld1 {v8.4h, v9.4h},[x1],x10 2553 2554 2555 smull v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0) 2556 smull v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1) 2557 smull v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2) 2558 smull v30.4s, v8.4h, v7.h[3] //// y1 * 
sin1(part of b3) 2559 2560 smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 2561 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1) 2562 smlsl v28.4s, v9.4h, v5.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 2563 smlsl v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2564 2565 2566 2567 2568 2569 smull v20.4s, v10.4h, v0.h[0] 2570 smlsl v20.4s, v11.4h, v3.h[2] 2571 2572 2573 smull v22.4s, v10.4h, v0.h[0] 2574 smlsl v22.4s, v11.4h, v2.h[2] 2575 2576 smull v16.4s, v10.4h, v0.h[0] 2577 smlsl v16.4s, v11.4h, v1.h[2] 2578 2579 smull v18.4s, v10.4h, v0.h[0] 2580 smlsl v18.4s, v11.4h, v0.h[2] 2581 2582 cmp x12,x11 2583 bhs stage2_shift4 2584 ld1 {v12.4h, v13.4h},[x1],#16 2585 ld1 {v14.4h, v15.4h},[x1],x10 2586 2587 2588 2589 2590 2591 2592 smlal v24.4s, v14.4h, v0.h[1] 2593 smlal v26.4s, v14.4h, v1.h[3] 2594 smlal v28.4s, v14.4h, v4.h[1] 2595 smlal v30.4s, v14.4h, v6.h[3] 2596 2597 2598 smlsl v24.4s, v15.4h, v4.h[1] 2599 smlsl v26.4s, v15.4h, v0.h[3] 2600 smlsl v28.4s, v15.4h, v2.h[3] 2601 smlsl v30.4s, v15.4h, v6.h[1] 2602 2603 2604 smlal v20.4s, v12.4h, v7.h[0] 2605 smlal v20.4s, v13.4h, v5.h[2] 2606 smlal v22.4s, v12.4h, v5.h[0] 2607 smlsl v22.4s, v13.4h, v7.h[2] 2608 smlal v16.4s, v12.4h, v3.h[0] 2609 smlsl v16.4s, v13.4h, v4.h[2] 2610 smlal v18.4s, v12.4h, v1.h[0] 2611 smlsl v18.4s, v13.4h, v1.h[2] 2612 2613 cmp x12,x5 2614 bhs stage2_shift4 2615 2616 ld1 {v10.4h, v11.4h},[x1],#16 2617 ld1 {v8.4h, v9.4h},[x1],x10 2618 2619 2620 2621 smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0) 2622 smlal v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1) 2623 smlal v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2) 2624 smlal v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3) 2625 2626 smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 2627 smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 2628 smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 2629 smlsl v30.4s, v9.4h, v5.h[1] 
//// y1 * sin1 - y3 * sin3(part of b3) 2630 2631 2632 2633 2634 2635 smlsl v20.4s, v10.4h, v2.h[0] 2636 smlal v20.4s, v11.4h, v1.h[2] 2637 2638 2639 smlsl v22.4s, v10.4h, v6.h[0] 2640 smlal v22.4s, v11.4h, v3.h[2] 2641 2642 smlal v16.4s, v10.4h, v6.h[0] 2643 smlsl v16.4s, v11.4h, v7.h[2] 2644 2645 smlal v18.4s, v10.4h, v2.h[0] 2646 smlsl v18.4s, v11.4h, v2.h[2] 2647 2648 cmp x12,x6 2649 bhs stage2_shift4 2650 2651 2652 ld1 {v12.4h, v13.4h},[x1],#16 2653 ld1 {v14.4h, v15.4h},[x1],x10 2654 2655 2656 2657 2658 2659 2660 smlsl v24.4s, v14.4h, v1.h[1] 2661 smlsl v26.4s, v14.4h, v7.h[3] 2662 smlal v28.4s, v14.4h, v1.h[3] 2663 smlal v30.4s, v14.4h, v4.h[3] 2664 2665 2666 smlal v24.4s, v15.4h, v2.h[1] 2667 smlal v26.4s, v15.4h, v5.h[1] 2668 smlsl v28.4s, v15.4h, v3.h[1] 2669 smlsl v30.4s, v15.4h, v4.h[1] 2670 2671 2672 smlsl v20.4s, v12.4h, v5.h[0] 2673 smlsl v20.4s, v13.4h, v7.h[2] 2674 smlsl v22.4s, v12.4h, v1.h[0] 2675 smlal v22.4s, v13.4h, v1.h[2] 2676 smlsl v16.4s, v12.4h, v7.h[0] 2677 smlal v16.4s, v13.4h, v5.h[2] 2678 smlal v18.4s, v12.4h, v3.h[0] 2679 smlsl v18.4s, v13.4h, v3.h[2] 2680 2681 cmp x12,x9 2682 bhs stage2_shift4 2683 2684 2685 ld1 {v10.4h, v11.4h},[x1],#16 2686 ld1 {v8.4h, v9.4h},[x1],x10 2687 2688 2689 smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0) 2690 smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1) 2691 smlal v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2) 2692 smlal v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3) 2693 2694 smlsl v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0) 2695 smlal v26.4s, v9.4h, v0.h[3] //// y1 * cos3 - y3 * sin1(part of b1) 2696 smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2) 2697 smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2698 2699 2700 2701 2702 2703 smlal v20.4s, v10.4h, v0.h[0] 2704 smlsl v20.4s, v11.4h, v0.h[2] 2705 2706 2707 smlsl v22.4s, v10.4h, v0.h[0] 2708 smlal v22.4s, v11.4h, v6.h[2] 2709 2710 smlsl v16.4s, v10.4h, v0.h[0] 2711 smlal 
v16.4s, v11.4h, v2.h[2] 2712 2713 smlal v18.4s, v10.4h, v0.h[0] 2714 smlsl v18.4s, v11.4h, v4.h[2] 2715 2716 ld1 {v12.4h, v13.4h},[x1],#16 2717 ld1 {v14.4h, v15.4h},[x1],x10 2718 2719 2720 2721 2722 smlal v24.4s, v14.4h, v3.h[1] 2723 smlsl v26.4s, v14.4h, v2.h[1] 2724 smlal v28.4s, v14.4h, v7.h[3] 2725 smlal v30.4s, v14.4h, v2.h[3] 2726 2727 2728 smlsl v24.4s, v15.4h, v0.h[3] 2729 smlal v26.4s, v15.4h, v4.h[3] 2730 smlal v28.4s, v15.4h, v6.h[3] 2731 smlsl v30.4s, v15.4h, v2.h[1] 2732 2733 2734 smlal v20.4s, v12.4h, v3.h[0] 2735 smlsl v20.4s, v13.4h, v6.h[2] 2736 smlal v22.4s, v12.4h, v7.h[0] 2737 smlsl v22.4s, v13.4h, v4.h[2] 2738 smlsl v16.4s, v12.4h, v1.h[0] 2739 smlal v16.4s, v13.4h, v0.h[2] 2740 smlal v18.4s, v12.4h, v5.h[0] 2741 smlsl v18.4s, v13.4h, v5.h[2] 2742 2743 2744 ld1 {v10.4h, v11.4h},[x1],#16 2745 ld1 {v8.4h, v9.4h},[x1],x10 2746 2747 2748 2749 2750 smlal v24.4s, v8.4h, v3.h[3] //// y1 * cos1(part of b0) 2751 smlsl v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1) 2752 smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2) 2753 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3) 2754 2755 smlsl v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0) 2756 smlsl v26.4s, v9.4h, v6.h[1] //// y1 * cos3 - y3 * sin1(part of b1) 2757 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2) 2758 smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3) 2759 2760 2761 2762 2763 2764 smlsl v20.4s, v10.4h, v6.h[0] 2765 smlal v20.4s, v11.4h, v2.h[2] 2766 2767 2768 smlal v22.4s, v10.4h, v2.h[0] 2769 smlsl v22.4s, v11.4h, v0.h[2] 2770 2771 smlsl v16.4s, v10.4h, v2.h[0] 2772 smlal v16.4s, v11.4h, v3.h[2] 2773 2774 smlal v18.4s, v10.4h, v6.h[0] 2775 smlsl v18.4s, v11.4h, v6.h[2] 2776 2777 2778 ld1 {v12.4h, v13.4h},[x1],#16 2779 ld1 {v14.4h, v15.4h},[x1],x10 2780 2781 2782 2783 smlsl v24.4s, v14.4h, v5.h[1] 2784 smlal v26.4s, v14.4h, v3.h[3] 2785 smlsl v28.4s, v14.4h, v2.h[1] 2786 smlal v30.4s, v14.4h, v0.h[3] 2787 2788 2789 smlal 
v24.4s, v15.4h, v1.h[3] 2790 smlsl v26.4s, v15.4h, v1.h[1] 2791 smlal v28.4s, v15.4h, v0.h[3] 2792 smlsl v30.4s, v15.4h, v0.h[1] 2793 2794 2795 smlsl v20.4s, v12.4h, v1.h[0] 2796 smlal v20.4s, v13.4h, v4.h[2] 2797 smlal v22.4s, v12.4h, v3.h[0] 2798 smlsl v22.4s, v13.4h, v5.h[2] 2799 smlsl v16.4s, v12.4h, v5.h[0] 2800 smlal v16.4s, v13.4h, v6.h[2] 2801 smlal v18.4s, v12.4h, v7.h[0] 2802 smlsl v18.4s, v13.4h, v7.h[2] 2803 2804stage2_shift4: 2805 add v8.4s, v20.4s , v24.4s 2806 sub v10.4s, v20.4s , v24.4s 2807 2808 add v12.4s, v22.4s , v26.4s 2809 sub v24.4s, v22.4s , v26.4s 2810 2811 add v14.4s, v16.4s , v28.4s 2812 sub v26.4s, v16.4s , v28.4s 2813 2814 2815 add v16.4s, v18.4s , v30.4s 2816 sub v28.4s, v18.4s , v30.4s 2817 2818 2819 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct) 2820 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x11 = (a0 - b0 + rnd) >> 7(shift_stage2_idct) 2821 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct) 2822 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct) 2823 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct) 2824 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct) 2825 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct) 2826 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct) 2827 2828 2829 2830 umov x15,v24.d[0] 2831 umov x16,v25.d[0] 2832 umov x19,v26.d[0] 2833 umov x20,v27.d[0] 2834 2835 trn1 v24.4h, v30.4h, v12.4h 2836 trn2 v25.4h, v30.4h, v12.4h 2837 trn1 v26.4h, v31.4h, v13.4h 2838 trn2 v27.4h, v31.4h, v13.4h 2839 2840 trn1 v30.2s, v24.2s, v26.2s 2841 trn2 v31.2s, v24.2s, v26.2s 2842 trn1 v12.2s, v25.2s, v27.2s 2843 trn2 v13.2s, v25.2s, v27.2s 2844 2845 trn1 v24.4h, v14.4h, v18.4h 2846 trn2 v25.4h, v14.4h, v18.4h 2847 trn1 v26.4h, v15.4h, v19.4h 
2848 trn2 v27.4h, v15.4h, v19.4h 2849 2850 trn1 v14.2s, v24.2s, v26.2s 2851 trn2 v15.2s, v24.2s, v26.2s 2852 trn1 v18.2s, v25.2s, v27.2s 2853 trn2 v19.2s, v25.2s, v27.2s 2854 2855 mov v24.d[0],x15 2856 mov v25.d[0],x16 2857 mov v26.d[0],x19 2858 mov v27.d[0],x20 2859 2860 st1 { v30.4h, v31.4h},[x0],#16 2861 st1 { v12.4h, v13.4h},[x0],#16 2862 st1 { v14.4h, v15.4h},[x0],#16 2863 st1 { v18.4h, v19.4h},[x0],#16 2864 2865 2866 2867 2868 sub x0,x0,#256 2869prediction_buffer: 2870 2871 2872 ld1 {v12.8h},[x0],#16 2873 ld1 {v14.8h},[x0],#16 2874 2875 add x0,x0,#32 2876 2877 ld1 {v16.8h},[x0],#16 2878 ld1 {v18.8h},[x0],#16 2879 add x0,x0,#32 2880 2881 ld1 {v20.8h},[x0],#16 2882 ld1 {v22.8h},[x0],#16 2883 2884 2885 add x0,x0,#32 2886 2887 ld1 {v24.8h},[x0],#16 2888 ld1 {v26.8h},[x0],#16 2889 2890 2891 2892 2893 2894// d12 =x0 1- 4 values 2895// d13 =x2 1- 4 values 2896// d14=x1 1- 4 values 2897// d15=x3 1- 4 values 2898 2899// d16 =x0 5- 8 values 2900// d17 =x2 5- 8 values 2901// d18=x1 5- 8 values 2902// d19=x3 5- 8 values 2903 2904// d20 =x0 9- 12 values 2905// d21 =x2 9- 12 values 2906// d22=x1 9- 12 values 2907// d23=x3 9- 12 values 2908 2909// d24 =x0 13-16 values 2910// d25 =x2 13- 16 values 2911// d26=x1 13- 16 values 2912// d27=x3 13- 16 values 2913 2914 // swapping v12 upper and v16 lower 64bits 2915 mov v13.d[0], v12.d[1] 2916 mov v12.d[1], v16.d[0] 2917 mov v16.d[0], v13.d[0] 2918 // swapping v20 upper and v24 lower 64bits 2919 mov v21.d[0], v20.d[1] 2920 mov v20.d[1], v24.d[0] 2921 mov v24.d[0], v21.d[0] 2922 // swapping v14 uppper and v18 lower 64bits 2923 mov v15.d[0], v14.d[1] 2924 mov v14.d[1], v18.d[0] 2925 mov v18.d[0], v15.d[0] 2926 // swapping v22 upper and v26 lower 64bits 2927 mov v23.d[0], v22.d[1] 2928 mov v22.d[1], v26.d[0] 2929 mov v26.d[0], v23.d[0] 2930 2931 2932 ld1 {v8.8b, v9.8b},[x2],x8 2933 ld1 {v10.8b, v11.8b},[x2],x8 2934 ld1 {v28.8b, v29.8b},[x2],x8 2935 ld1 {v30.8b, v31.8b},[x2],x8 2936 2937 2938 uaddw v12.8h, v12.8h , v8.8b 2939 uaddw 
v20.8h, v20.8h , v9.8b 2940 uaddw v14.8h, v14.8h , v10.8b 2941 uaddw v22.8h, v22.8h , v11.8b 2942 uaddw v16.8h, v16.8h , v28.8b 2943 uaddw v24.8h, v24.8h , v29.8b 2944 uaddw v18.8h, v18.8h , v30.8b 2945 uaddw v26.8h, v26.8h , v31.8b 2946 sub x2,x2,x8,lsl #2 2947 add x2,x2,#16 2948 sqxtun v12.8b, v12.8h 2949 sqxtun v13.8b, v20.8h 2950 sqxtun v20.8b, v14.8h 2951 sqxtun v21.8b, v22.8h 2952 sqxtun v14.8b, v16.8h 2953 sqxtun v15.8b, v24.8h 2954 sqxtun v22.8b, v18.8h 2955 sqxtun v23.8b, v26.8h 2956 2957 2958 st1 {v12.8b, v13.8b},[x3],x7 2959 st1 {v20.8b, v21.8b},[x3],x7 2960 st1 {v14.8b, v15.8b},[x3],x7 2961 st1 {v22.8b, v23.8b},[x3],x7 2962 2963 2964 sub x3,x3,x7,lsl #2 2965 add x3,x3,#16 2966 2967 ld1 {v12.8h},[x0],#16 2968 ld1 {v14.8h},[x0],#16 2969 2970 sub x0,x0,#96 2971 2972 ld1 {v16.8h},[x0],#16 2973 ld1 {v18.8h},[x0],#16 2974 sub x0,x0,#96 2975 2976 ld1 {v20.8h},[x0],#16 2977 ld1 {v22.8h},[x0],#16 2978 2979 2980 sub x0,x0,#96 2981 2982 ld1 {v24.8h},[x0],#16 2983 ld1 {v26.8h},[x0],#16 2984 2985 2986 sub x0,x0,#64 2987 2988 2989 // swapping v12 upper and v16 lower 64bits 2990 mov v13.d[0], v12.d[1] 2991 mov v12.d[1], v16.d[0] 2992 mov v16.d[0], v13.d[0] 2993 // swapping v20 upper and v24 lower 64bits 2994 mov v21.d[0], v20.d[1] 2995 mov v20.d[1], v24.d[0] 2996 mov v24.d[0], v21.d[0] 2997 // swapping v14 uppper and v18 lower 64bits 2998 mov v15.d[0], v14.d[1] 2999 mov v14.d[1], v18.d[0] 3000 mov v18.d[0], v15.d[0] 3001 // swapping v22 upper and v26 lower 64bits 3002 mov v23.d[0], v22.d[1] 3003 mov v22.d[1], v26.d[0] 3004 mov v26.d[0], v23.d[0] 3005 3006 3007 ld1 {v8.8b, v9.8b},[x2],x8 3008 ld1 {v10.8b, v11.8b},[x2],x8 3009 ld1 {v28.8b, v29.8b},[x2],x8 3010 ld1 {v30.8b, v31.8b},[x2],x8 3011 3012 3013 uaddw v12.8h, v12.8h , v8.8b 3014 uaddw v20.8h, v20.8h , v9.8b 3015 uaddw v14.8h, v14.8h , v10.8b 3016 uaddw v22.8h, v22.8h , v11.8b 3017 uaddw v16.8h, v16.8h , v28.8b 3018 uaddw v24.8h, v24.8h , v29.8b 3019 uaddw v18.8h, v18.8h , v30.8b 3020 uaddw v26.8h, v26.8h , 
v31.8b 3021 sub x2,x2,#16 3022 3023 sqxtun v12.8b, v12.8h 3024 sqxtun v13.8b, v20.8h 3025 sqxtun v20.8b, v14.8h 3026 sqxtun v21.8b, v22.8h 3027 sqxtun v14.8b, v16.8h 3028 sqxtun v15.8b, v24.8h 3029 sqxtun v22.8b, v18.8h 3030 sqxtun v23.8b, v26.8h 3031 3032 3033 st1 {v12.8b, v13.8b},[x3],x7 3034 st1 {v20.8b, v21.8b},[x3],x7 3035 st1 {v14.8b, v15.8b},[x3],x7 3036 st1 {v22.8b, v23.8b},[x3],x7 3037 3038 sub x3,x3,#16 3039 3040 subs x14,x14,#1 3041 bne dct_stage2 3042 // ldmfd sp!,{x0-x12,pc} 3043 ldp x19, x20,[sp],#16 3044 pop_v_regs 3045 ret 3046 3047 3048 3049 3050 3051