Lines Matching refs:h
172 ld1 {v0.4h, v1.4h, v2.4h, v3.4h},[x14],#32
173 ld1 {v4.4h, v5.4h, v6.4h, v7.4h},[x14],#32
211 ld1 {v10.4h},[x0],x6
212 ld1 {v8.4h},[x0],x6
213 ld1 {v11.4h},[x0],x6
214 ld1 {v9.4h},[x0],x6
216 smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0)
217 smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1)
218 smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
219 smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
221 smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
222 smlal v26.4s, v9.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
223 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
224 smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
230 smull v20.4s, v10.4h, v0.h[0]
231 smlal v20.4s, v11.4h, v0.h[2]
234 smull v22.4s, v10.4h, v0.h[0]
235 smlal v22.4s, v11.4h, v1.h[2]
237 smull v16.4s, v10.4h, v0.h[0]
238 smlal v16.4s, v11.4h, v2.h[2]
240 smull v18.4s, v10.4h, v0.h[0]
241 smlal v18.4s, v11.4h, v3.h[2]
245 ld1 {v12.4h},[x0],x6
246 ld1 {v14.4h},[x0],x6
247 ld1 {v13.4h},[x0],x6
248 ld1 {v15.4h},[x0],x6
256 smlal v24.4s, v14.4h, v1.h[1]
257 smlal v26.4s, v14.4h, v3.h[3]
258 smlal v28.4s, v14.4h, v6.h[1]
259 smlsl v30.4s, v14.4h, v7.h[1]
262 smlal v24.4s, v15.4h, v1.h[3]
263 smlal v26.4s, v15.4h, v5.h[1]
264 smlsl v28.4s, v15.4h, v7.h[1]
265 smlsl v30.4s, v15.4h, v3.h[3]
268 smlal v20.4s, v12.4h, v1.h[0]
269 smlal v20.4s, v13.4h, v1.h[2]
270 smlal v22.4s, v12.4h, v3.h[0]
271 smlal v22.4s, v13.4h, v4.h[2]
272 smlal v16.4s, v12.4h, v5.h[0]
273 smlal v16.4s, v13.4h, v7.h[2]
274 smlal v18.4s, v12.4h, v7.h[0]
275 smlsl v18.4s, v13.4h, v5.h[2]
280 ld1 {v10.4h},[x0],x6
281 ld1 {v8.4h},[x0],x6
282 ld1 {v11.4h},[x0],x6
283 ld1 {v9.4h},[x0],x6
286 smlal v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0)
287 smlal v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
288 smlsl v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2)
289 smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3)
291 smlal v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
292 smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
293 smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
294 smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
300 smlal v20.4s, v10.4h, v2.h[0]
301 smlal v20.4s, v11.4h, v2.h[2]
304 smlal v22.4s, v10.4h, v6.h[0]
305 smlal v22.4s, v11.4h, v7.h[2]
307 smlsl v16.4s, v10.4h, v6.h[0]
308 smlsl v16.4s, v11.4h, v3.h[2]
310 smlsl v18.4s, v10.4h, v2.h[0]
311 smlsl v18.4s, v11.4h, v1.h[2]
317 ld1 {v12.4h},[x0],x6
318 ld1 {v14.4h},[x0],x6
319 ld1 {v13.4h},[x0],x6
320 ld1 {v15.4h},[x0],x6
330 smlal v24.4s, v14.4h, v3.h[1]
331 smlsl v26.4s, v14.4h, v6.h[1]
332 smlsl v28.4s, v14.4h, v0.h[1]
333 smlsl v30.4s, v14.4h, v6.h[3]
336 smlal v24.4s, v15.4h, v3.h[3]
337 smlsl v26.4s, v15.4h, v4.h[3]
338 smlsl v28.4s, v15.4h, v2.h[3]
339 smlal v30.4s, v15.4h, v5.h[3]
342 smlal v20.4s, v12.4h, v3.h[0]
343 smlal v20.4s, v13.4h, v3.h[2]
344 smlsl v22.4s, v12.4h, v7.h[0]
345 smlsl v22.4s, v13.4h, v5.h[2]
346 smlsl v16.4s, v12.4h, v1.h[0]
347 smlsl v16.4s, v13.4h, v1.h[2]
348 smlsl v18.4s, v12.4h, v5.h[0]
349 smlal v18.4s, v13.4h, v7.h[2]
355 ld1 {v10.4h},[x0],x6
356 ld1 {v8.4h},[x0],x6
357 ld1 {v11.4h},[x0],x6
358 ld1 {v9.4h},[x0],x6
362 smlal v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
363 smlsl v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1)
364 smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
365 smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3)
367 smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
368 smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
369 smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
370 smlal v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
376 smlal v20.4s, v10.4h, v0.h[0]
377 smlal v20.4s, v11.4h, v4.h[2]
380 smlsl v22.4s, v10.4h, v0.h[0]
381 smlsl v22.4s, v11.4h, v2.h[2]
383 smlsl v16.4s, v10.4h, v0.h[0]
384 smlsl v16.4s, v11.4h, v6.h[2]
386 smlal v18.4s, v10.4h, v0.h[0]
387 smlal v18.4s, v11.4h, v0.h[2]
391 ld1 {v12.4h},[x0],x6
392 ld1 {v14.4h},[x0],x6
393 ld1 {v13.4h},[x0],x6
394 ld1 {v15.4h},[x0],x6
399 smlal v24.4s, v14.4h, v5.h[1]
400 smlsl v26.4s, v14.4h, v0.h[2]
401 smlal v28.4s, v14.4h, v5.h[3]
402 smlal v30.4s, v14.4h, v4.h[3]
405 smlal v24.4s, v15.4h, v5.h[3]
406 smlsl v26.4s, v15.4h, v1.h[1]
407 smlal v28.4s, v15.4h, v3.h[1]
408 smlsl v30.4s, v15.4h, v7.h[3]
411 smlal v20.4s, v12.4h, v5.h[0]
412 smlal v20.4s, v13.4h, v5.h[2]
413 smlsl v22.4s, v12.4h, v1.h[0]
414 smlsl v22.4s, v13.4h, v0.h[2]
415 smlal v16.4s, v12.4h, v7.h[0]
416 smlal v16.4s, v13.4h, v4.h[2]
417 smlal v18.4s, v12.4h, v3.h[0]
418 smlal v18.4s, v13.4h, v6.h[2]
421 ld1 {v10.4h},[x0],x6
422 ld1 {v8.4h},[x0],x6
423 ld1 {v11.4h},[x0],x6
424 ld1 {v9.4h},[x0],x6
432 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
433 smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
434 smlal v28.4s, v8.4h, v0.h[1] //// y1 * sin3(part of b2)
435 smlsl v30.4s, v8.4h, v4.h[1] //// y1 * sin1(part of b3)
437 smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
438 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
439 smlal v28.4s, v9.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
440 smlsl v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
446 smlal v20.4s, v10.4h, v6.h[0]
447 smlal v20.4s, v11.4h, v6.h[2]
450 smlsl v22.4s, v10.4h, v2.h[0]
451 smlsl v22.4s, v11.4h, v3.h[2]
453 smlal v16.4s, v10.4h, v2.h[0]
454 smlal v16.4s, v11.4h, v0.h[2]
456 smlsl v18.4s, v10.4h, v6.h[0]
457 smlsl v18.4s, v11.4h, v2.h[2]
459 ld1 {v12.4h},[x0],x6
460 ld1 {v14.4h},[x0],x6
461 ld1 {v13.4h},[x0],x6
462 ld1 {v15.4h},[x0],x6
465 smlal v24.4s, v14.4h, v7.h[1]
466 smlsl v26.4s, v14.4h, v5.h[3]
467 smlal v28.4s, v14.4h, v4.h[1]
468 smlsl v30.4s, v14.4h, v2.h[3]
471 smlal v24.4s, v15.4h, v7.h[3]
472 smlsl v26.4s, v15.4h, v7.h[1]
473 smlal v28.4s, v15.4h, v6.h[3]
474 smlsl v30.4s, v15.4h, v6.h[1]
477 smlal v20.4s, v12.4h, v7.h[0]
478 smlal v20.4s, v13.4h, v7.h[2]
479 smlsl v22.4s, v12.4h, v5.h[0]
480 smlsl v22.4s, v13.4h, v6.h[2]
481 smlal v16.4s, v12.4h, v3.h[0]
482 smlal v16.4s, v13.4h, v5.h[2]
483 smlsl v18.4s, v12.4h, v1.h[0]
484 smlsl v18.4s, v13.4h, v4.h[2]
503 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
504 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
505 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
506 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
507 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
508 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
509 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
510 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
520 trn1 v24.4h, v30.4h, v12.4h
521 trn2 v25.4h, v30.4h, v12.4h
522 trn1 v26.4h, v31.4h, v13.4h
523 trn2 v27.4h, v31.4h, v13.4h
530 trn1 v24.4h, v14.4h, v18.4h
531 trn2 v25.4h, v14.4h, v18.4h
532 trn1 v26.4h, v15.4h, v19.4h
533 trn2 v27.4h, v15.4h, v19.4h
556 st1 { v30.4h, v31.4h},[x1],#16
557 st1 { v12.4h, v13.4h},[x1],#16
559 st1 { v14.4h, v15.4h},[x1],#16
560 st1 { v18.4h, v19.4h},[x1],#16
569 ld1 {v10.4h},[x0],x6
570 ld1 {v8.4h},[x0],x6
571 ld1 {v11.4h},[x0],x6
572 ld1 {v9.4h},[x0],x6
577 smull v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0)
578 smull v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
579 smull v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2)
580 smull v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
582 smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
583 smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
584 smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
585 smlsl v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
591 smull v20.4s, v10.4h, v0.h[0]
592 smlal v20.4s, v11.4h, v4.h[2]
595 smull v22.4s, v10.4h, v0.h[0]
596 smlal v22.4s, v11.4h, v5.h[2]
598 smull v16.4s, v10.4h, v0.h[0]
599 smlal v16.4s, v11.4h, v6.h[2]
601 smull v18.4s, v10.4h, v0.h[0]
602 smlal v18.4s, v11.4h, v7.h[2]
606 ld1 {v12.4h},[x0],x6
607 ld1 {v14.4h},[x0],x6
608 ld1 {v13.4h},[x0],x6
609 ld1 {v15.4h},[x0],x6
612 smlsl v24.4s, v14.4h, v4.h[3]
613 smlsl v26.4s, v14.4h, v2.h[1]
614 smlsl v28.4s, v14.4h, v0.h[1]
615 smlsl v30.4s, v14.4h, v2.h[3]
618 smlsl v24.4s, v15.4h, v0.h[3]
619 smlsl v26.4s, v15.4h, v3.h[1]
620 smlsl v28.4s, v15.4h, v6.h[3]
621 smlal v30.4s, v15.4h, v5.h[3]
624 smlsl v20.4s, v12.4h, v7.h[0]
625 smlsl v20.4s, v13.4h, v2.h[2]
626 smlsl v22.4s, v12.4h, v5.h[0]
627 smlsl v22.4s, v13.4h, v0.h[2]
628 smlsl v16.4s, v12.4h, v3.h[0]
629 smlsl v16.4s, v13.4h, v3.h[2]
630 smlsl v18.4s, v12.4h, v1.h[0]
631 smlsl v18.4s, v13.4h, v6.h[2]
637 ld1 {v10.4h},[x0],x6
638 ld1 {v8.4h},[x0],x6
639 ld1 {v11.4h},[x0],x6
640 ld1 {v9.4h},[x0],x6
648 smlsl v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
649 smlal v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1)
650 smlal v28.4s, v8.4h, v2.h[3] //// y1 * sin3(part of b2)
651 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
653 smlal v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
654 smlal v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
655 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
656 smlsl v30.4s, v9.4h, v6.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
662 smlsl v20.4s, v10.4h, v2.h[0]
663 smlsl v20.4s, v11.4h, v6.h[2]
666 smlsl v22.4s, v10.4h, v6.h[0]
667 smlal v22.4s, v11.4h, v4.h[2]
669 smlal v16.4s, v10.4h, v6.h[0]
670 smlal v16.4s, v11.4h, v0.h[2]
672 smlal v18.4s, v10.4h, v2.h[0]
673 smlal v18.4s, v11.4h, v5.h[2]
679 ld1 {v12.4h},[x0],x6
680 ld1 {v14.4h},[x0],x6
681 ld1 {v13.4h},[x0],x6
682 ld1 {v15.4h},[x0],x6
688 smlal v24.4s, v14.4h, v2.h[3]
689 smlal v26.4s, v14.4h, v3.h[3]
690 smlsl v28.4s, v14.4h, v5.h[3]
691 smlsl v30.4s, v14.4h, v0.h[3]
694 smlal v24.4s, v15.4h, v1.h[3]
695 smlsl v26.4s, v15.4h, v6.h[3]
696 smlsl v28.4s, v15.4h, v0.h[3]
697 smlal v30.4s, v15.4h, v7.h[3]
700 smlal v20.4s, v12.4h, v5.h[0]
701 smlal v20.4s, v13.4h, v0.h[2]
702 smlal v22.4s, v12.4h, v1.h[0]
703 smlal v22.4s, v13.4h, v6.h[2]
704 smlal v16.4s, v12.4h, v7.h[0]
705 smlsl v16.4s, v13.4h, v2.h[2]
706 smlsl v18.4s, v12.4h, v3.h[0]
707 smlsl v18.4s, v13.4h, v4.h[2]
714 ld1 {v10.4h},[x0],x6
715 ld1 {v8.4h},[x0],x6
716 ld1 {v11.4h},[x0],x6
717 ld1 {v9.4h},[x0],x6
725 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
726 smlsl v26.4s, v8.4h, v1.h[1] //// y1 * cos3(part of b1)
727 smlsl v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2)
728 smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1(part of b3)
730 smlsl v24.4s, v9.4h, v5.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
731 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
732 smlal v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
733 smlal v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
739 smlal v20.4s, v10.4h, v0.h[0]
740 smlsl v20.4s, v11.4h, v7.h[2]
743 smlsl v22.4s, v10.4h, v0.h[0]
744 smlsl v22.4s, v11.4h, v1.h[2]
746 smlsl v16.4s, v10.4h, v0.h[0]
747 smlal v16.4s, v11.4h, v5.h[2]
749 smlal v18.4s, v10.4h, v0.h[0]
750 smlal v18.4s, v11.4h, v3.h[2]
754 ld1 {v12.4h},[x0],x6
755 ld1 {v14.4h},[x0],x6
756 ld1 {v13.4h},[x0],x6
757 ld1 {v15.4h},[x0],x6
760 smlsl v24.4s, v14.4h, v0.h[1]
761 smlal v26.4s, v14.4h, v6.h[1]
762 smlal v28.4s, v14.4h, v4.h[1]
763 smlsl v30.4s, v14.4h, v1.h[1]
766 smlsl v24.4s, v15.4h, v3.h[3]
767 smlal v26.4s, v15.4h, v0.h[1]
768 smlsl v28.4s, v15.4h, v5.h[1]
769 smlsl v30.4s, v15.4h, v6.h[1]
772 smlsl v20.4s, v12.4h, v3.h[0]
773 smlsl v20.4s, v13.4h, v1.h[2]
774 smlsl v22.4s, v12.4h, v7.h[0]
775 smlal v22.4s, v13.4h, v3.h[2]
776 smlal v16.4s, v12.4h, v1.h[0]
777 smlal v16.4s, v13.4h, v7.h[2]
778 smlsl v18.4s, v12.4h, v5.h[0]
779 smlsl v18.4s, v13.4h, v2.h[2]
781 ld1 {v10.4h},[x0],x6
782 ld1 {v8.4h},[x0],x6
783 ld1 {v11.4h},[x0],x6
784 ld1 {v9.4h},[x0],x6
789 smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
790 smlal v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1)
791 smlsl v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
792 smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3)
794 smlal v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
795 smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
796 smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
797 smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
803 smlsl v20.4s, v10.4h, v6.h[0]
804 smlal v20.4s, v11.4h, v5.h[2]
807 smlal v22.4s, v10.4h, v2.h[0]
808 smlal v22.4s, v11.4h, v7.h[2]
810 smlsl v16.4s, v10.4h, v2.h[0]
811 smlsl v16.4s, v11.4h, v4.h[2]
813 smlal v18.4s, v10.4h, v6.h[0]
814 smlal v18.4s, v11.4h, v1.h[2]
817 ld1 {v12.4h},[x0],x6
818 ld1 {v14.4h},[x0],x6
819 ld1 {v13.4h},[x0],x6
820 ld1 {v15.4h},[x0],x6
826 smlal v24.4s, v14.4h, v1.h[1]
827 smlsl v26.4s, v14.4h, v0.h[3]
828 smlal v28.4s, v14.4h, v1.h[3]
829 smlsl v30.4s, v14.4h, v3.h[1]
832 smlal v24.4s, v15.4h, v5.h[3]
833 smlsl v26.4s, v15.4h, v5.h[1]
834 smlal v28.4s, v15.4h, v4.h[3]
835 smlsl v30.4s, v15.4h, v4.h[1]
838 smlal v20.4s, v12.4h, v1.h[0]
839 smlal v20.4s, v13.4h, v3.h[2]
840 smlsl v22.4s, v12.4h, v3.h[0]
841 smlsl v22.4s, v13.4h, v2.h[2]
842 smlal v16.4s, v12.4h, v5.h[0]
843 smlal v16.4s, v13.4h, v1.h[2]
844 smlsl v18.4s, v12.4h, v7.h[0]
845 smlsl v18.4s, v13.4h, v0.h[2]
862 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
863 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
864 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
865 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
866 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
867 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
868 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
869 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
876 trn1 v24.4h, v30.4h, v12.4h
877 trn2 v25.4h, v30.4h, v12.4h
878 trn1 v26.4h, v31.4h, v13.4h
879 trn2 v27.4h, v31.4h, v13.4h
886 trn1 v24.4h, v14.4h, v18.4h
887 trn2 v25.4h, v14.4h, v18.4h
888 trn1 v26.4h, v15.4h, v19.4h
889 trn2 v27.4h, v15.4h, v19.4h
901 st1 { v30.4h, v31.4h},[x1],#16
902 st1 { v12.4h, v13.4h},[x1],#16
904 st1 { v14.4h, v15.4h},[x1],#16
905 st1 { v18.4h, v19.4h},[x1],#16
911 ld1 {v10.4h},[x0],x6
912 ld1 {v8.4h},[x0],x6
913 ld1 {v11.4h},[x0],x6
914 ld1 {v9.4h},[x0],x6
917 smull v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
918 smull v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1)
919 smull v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
920 smull v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3)
922 smlsl v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
923 smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
924 smlsl v28.4s, v9.4h, v0.h[2] //// y1 * sin3 - y3 * cos1(part of b2)
925 smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
931 smull v20.4s, v10.4h, v0.h[0]
932 smlsl v20.4s, v11.4h, v7.h[2]
935 smull v22.4s, v10.4h, v0.h[0]
936 smlsl v22.4s, v11.4h, v6.h[2]
938 smull v16.4s, v10.4h, v0.h[0]
939 smlsl v16.4s, v11.4h, v5.h[2]
941 smull v18.4s, v10.4h, v0.h[0]
942 smlsl v18.4s, v11.4h, v4.h[2]
947 ld1 {v12.4h},[x0],x6
948 ld1 {v14.4h},[x0],x6
949 ld1 {v13.4h},[x0],x6
950 ld1 {v15.4h},[x0],x6
955 smlsl v24.4s, v14.4h, v5.h[1]
956 smlsl v26.4s, v14.4h, v7.h[3]
957 smlal v28.4s, v14.4h, v5.h[3]
958 smlal v30.4s, v14.4h, v3.h[1]
961 smlal v24.4s, v15.4h, v2.h[1]
962 smlal v26.4s, v15.4h, v1.h[1]
963 smlal v28.4s, v15.4h, v4.h[3]
964 smlsl v30.4s, v15.4h, v7.h[3]
967 smlsl v20.4s, v12.4h, v1.h[0]
968 smlal v20.4s, v13.4h, v6.h[2]
969 smlsl v22.4s, v12.4h, v3.h[0]
970 smlal v22.4s, v13.4h, v3.h[2]
971 smlsl v16.4s, v12.4h, v5.h[0]
972 smlal v16.4s, v13.4h, v0.h[2]
973 smlsl v18.4s, v12.4h, v7.h[0]
974 smlal v18.4s, v13.4h, v2.h[2]
979 ld1 {v10.4h},[x0],x6
980 ld1 {v8.4h},[x0],x6
981 ld1 {v11.4h},[x0],x6
982 ld1 {v9.4h},[x0],x6
984 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
985 smlsl v26.4s, v8.4h, v5.h[1] //// y1 * cos3(part of b1)
986 smlsl v28.4s, v8.4h, v0.h[3] //// y1 * sin3(part of b2)
987 smlsl v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
989 smlsl v24.4s, v9.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
990 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
991 smlal v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
992 smlal v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
998 smlal v20.4s, v10.4h, v2.h[0]
999 smlsl v20.4s, v11.4h, v5.h[2]
1002 smlal v22.4s, v10.4h, v6.h[0]
1003 smlsl v22.4s, v11.4h, v0.h[2]
1005 smlsl v16.4s, v10.4h, v6.h[0]
1006 smlsl v16.4s, v11.4h, v4.h[2]
1008 smlsl v18.4s, v10.4h, v2.h[0]
1009 smlal v18.4s, v11.4h, v6.h[2]
1015 ld1 {v12.4h},[x0],x6
1016 ld1 {v14.4h},[x0],x6
1017 ld1 {v13.4h},[x0],x6
1018 ld1 {v15.4h},[x0],x6
1025 smlsl v24.4s, v14.4h, v7.h[1]
1026 smlal v26.4s, v14.4h, v2.h[1]
1027 smlal v28.4s, v14.4h, v4.h[1]
1028 smlsl v30.4s, v14.4h, v5.h[1]
1031 smlal v24.4s, v15.4h, v0.h[3]
1032 smlal v26.4s, v15.4h, v7.h[1]
1033 smlsl v28.4s, v15.4h, v1.h[1]
1034 smlsl v30.4s, v15.4h, v6.h[1]
1037 smlsl v20.4s, v12.4h, v3.h[0]
1038 smlal v20.4s, v13.4h, v4.h[2]
1039 smlal v22.4s, v12.4h, v7.h[0]
1040 smlal v22.4s, v13.4h, v2.h[2]
1041 smlal v16.4s, v12.4h, v1.h[0]
1042 smlsl v16.4s, v13.4h, v6.h[2]
1043 smlal v18.4s, v12.4h, v5.h[0]
1044 smlsl v18.4s, v13.4h, v0.h[2]
1051 ld1 {v10.4h},[x0],x6
1052 ld1 {v8.4h},[x0],x6
1053 ld1 {v11.4h},[x0],x6
1054 ld1 {v9.4h},[x0],x6
1057 smlsl v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
1058 smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3(part of b1)
1059 smlal v28.4s, v8.4h, v6.h[3] //// y1 * sin3(part of b2)
1060 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
1062 smlsl v24.4s, v9.4h, v0.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
1063 smlal v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1064 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1065 smlsl v30.4s, v9.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
1071 smlal v20.4s, v10.4h, v0.h[0]
1072 smlsl v20.4s, v11.4h, v3.h[2]
1075 smlsl v22.4s, v10.4h, v0.h[0]
1076 smlsl v22.4s, v11.4h, v5.h[2]
1078 smlsl v16.4s, v10.4h, v0.h[0]
1079 smlal v16.4s, v11.4h, v1.h[2]
1081 smlal v18.4s, v10.4h, v0.h[0]
1082 smlal v18.4s, v11.4h, v7.h[2]
1085 ld1 {v12.4h},[x0],x6
1086 ld1 {v14.4h},[x0],x6
1087 ld1 {v13.4h},[x0],x6
1088 ld1 {v15.4h},[x0],x6
1092 smlal v24.4s, v14.4h, v6.h[3]
1093 smlal v26.4s, v14.4h, v3.h[3]
1094 smlsl v28.4s, v14.4h, v1.h[3]
1095 smlal v30.4s, v14.4h, v7.h[1]
1098 smlal v24.4s, v15.4h, v1.h[3]
1099 smlsl v26.4s, v15.4h, v2.h[3]
1100 smlal v28.4s, v15.4h, v7.h[1]
1101 smlal v30.4s, v15.4h, v4.h[1]
1104 smlsl v20.4s, v12.4h, v5.h[0]
1105 smlal v20.4s, v13.4h, v2.h[2]
1106 smlal v22.4s, v12.4h, v1.h[0]
1107 smlsl v22.4s, v13.4h, v7.h[2]
1108 smlsl v16.4s, v12.4h, v7.h[0]
1109 smlsl v16.4s, v13.4h, v3.h[2]
1110 smlsl v18.4s, v12.4h, v3.h[0]
1111 smlal v18.4s, v13.4h, v1.h[2]
1115 ld1 {v10.4h},[x0],x6
1116 ld1 {v8.4h},[x0],x6
1117 ld1 {v11.4h},[x0],x6
1118 ld1 {v9.4h},[x0],x6
1123 smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0)
1124 smlsl v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
1125 smlal v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2)
1126 smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3)
1128 smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1129 smlal v26.4s, v9.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1130 smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1131 smlal v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
1137 smlal v20.4s, v10.4h, v6.h[0]
1138 smlsl v20.4s, v11.4h, v1.h[2]
1141 smlsl v22.4s, v10.4h, v2.h[0]
1142 smlal v22.4s, v11.4h, v4.h[2]
1144 smlal v16.4s, v10.4h, v2.h[0]
1145 smlsl v16.4s, v11.4h, v7.h[2]
1147 smlsl v18.4s, v10.4h, v6.h[0]
1148 smlsl v18.4s, v11.4h, v5.h[2]
1151 ld1 {v12.4h},[x0],x6
1152 ld1 {v14.4h},[x0],x6
1153 ld1 {v13.4h},[x0],x6
1154 ld1 {v15.4h},[x0],x6
1156 smlal v24.4s, v14.4h, v4.h[3]
1157 smlsl v26.4s, v14.4h, v6.h[1]
1158 smlal v28.4s, v14.4h, v7.h[3]
1159 smlal v30.4s, v14.4h, v6.h[3]
1162 smlal v24.4s, v15.4h, v3.h[3]
1163 smlsl v26.4s, v15.4h, v3.h[1]
1164 smlal v28.4s, v15.4h, v2.h[3]
1165 smlsl v30.4s, v15.4h, v2.h[1]
1168 smlsl v20.4s, v12.4h, v7.h[0]
1169 smlal v20.4s, v13.4h, v0.h[2]
1170 smlal v22.4s, v12.4h, v5.h[0]
1171 smlsl v22.4s, v13.4h, v1.h[2]
1172 smlsl v16.4s, v12.4h, v3.h[0]
1173 smlal v16.4s, v13.4h, v2.h[2]
1174 smlal v18.4s, v12.4h, v1.h[0]
1175 smlsl v18.4s, v13.4h, v3.h[2]
1192 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
1193 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
1194 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
1195 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
1196 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
1197 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
1198 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
1199 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
1206 trn1 v24.4h, v30.4h, v12.4h
1207 trn2 v25.4h, v30.4h, v12.4h
1208 trn1 v26.4h, v31.4h, v13.4h
1209 trn2 v27.4h, v31.4h, v13.4h
1216 trn1 v24.4h, v14.4h, v18.4h
1217 trn2 v25.4h, v14.4h, v18.4h
1218 trn1 v26.4h, v15.4h, v19.4h
1219 trn2 v27.4h, v15.4h, v19.4h
1230 st1 { v30.4h, v31.4h},[x1],#16
1231 st1 { v12.4h, v13.4h},[x1],#16
1233 st1 { v14.4h, v15.4h},[x1],#16
1234 st1 { v18.4h, v19.4h},[x1],#16
1241 ld1 {v10.4h},[x0],x6
1242 ld1 {v8.4h},[x0],x6
1243 ld1 {v11.4h},[x0],x6
1244 ld1 {v9.4h},[x0],x6
1247 smull v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
1248 smull v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
1249 smull v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2)
1250 smull v30.4s, v8.4h, v7.h[3] //// y1 * sin1(part of b3)
1252 smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1253 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1254 smlsl v28.4s, v9.4h, v5.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1255 smlsl v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1261 smull v20.4s, v10.4h, v0.h[0]
1262 smlsl v20.4s, v11.4h, v3.h[2]
1265 smull v22.4s, v10.4h, v0.h[0]
1266 smlsl v22.4s, v11.4h, v2.h[2]
1268 smull v16.4s, v10.4h, v0.h[0]
1269 smlsl v16.4s, v11.4h, v1.h[2]
1271 smull v18.4s, v10.4h, v0.h[0]
1272 smlsl v18.4s, v11.4h, v0.h[2]
1277 ld1 {v12.4h},[x0],x6
1278 ld1 {v14.4h},[x0],x6
1279 ld1 {v13.4h},[x0],x6
1280 ld1 {v15.4h},[x0],x6
1287 smlal v24.4s, v14.4h, v0.h[1]
1288 smlal v26.4s, v14.4h, v1.h[3]
1289 smlal v28.4s, v14.4h, v4.h[1]
1290 smlal v30.4s, v14.4h, v6.h[3]
1293 smlsl v24.4s, v15.4h, v4.h[1]
1294 smlsl v26.4s, v15.4h, v0.h[3]
1295 smlsl v28.4s, v15.4h, v2.h[3]
1296 smlsl v30.4s, v15.4h, v6.h[1]
1299 smlal v20.4s, v12.4h, v7.h[0]
1300 smlal v20.4s, v13.4h, v5.h[2]
1301 smlal v22.4s, v12.4h, v5.h[0]
1302 smlsl v22.4s, v13.4h, v7.h[2]
1303 smlal v16.4s, v12.4h, v3.h[0]
1304 smlsl v16.4s, v13.4h, v4.h[2]
1305 smlal v18.4s, v12.4h, v1.h[0]
1306 smlsl v18.4s, v13.4h, v1.h[2]
1311 ld1 {v10.4h},[x0],x6
1312 ld1 {v8.4h},[x0],x6
1313 ld1 {v11.4h},[x0],x6
1314 ld1 {v9.4h},[x0],x6
1318 smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
1319 smlal v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1)
1320 smlal v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
1321 smlal v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3)
1323 smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1324 smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1325 smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1326 smlsl v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1332 smlsl v20.4s, v10.4h, v2.h[0]
1333 smlal v20.4s, v11.4h, v1.h[2]
1336 smlsl v22.4s, v10.4h, v6.h[0]
1337 smlal v22.4s, v11.4h, v3.h[2]
1339 smlal v16.4s, v10.4h, v6.h[0]
1340 smlsl v16.4s, v11.4h, v7.h[2]
1342 smlal v18.4s, v10.4h, v2.h[0]
1343 smlsl v18.4s, v11.4h, v2.h[2]
1349 ld1 {v12.4h},[x0],x6
1350 ld1 {v14.4h},[x0],x6
1351 ld1 {v13.4h},[x0],x6
1352 ld1 {v15.4h},[x0],x6
1359 smlsl v24.4s, v14.4h, v1.h[1]
1360 smlsl v26.4s, v14.4h, v7.h[3]
1361 smlal v28.4s, v14.4h, v1.h[3]
1362 smlal v30.4s, v14.4h, v4.h[3]
1365 smlal v24.4s, v15.4h, v2.h[1]
1366 smlal v26.4s, v15.4h, v5.h[1]
1367 smlsl v28.4s, v15.4h, v3.h[1]
1368 smlsl v30.4s, v15.4h, v4.h[1]
1371 smlsl v20.4s, v12.4h, v5.h[0]
1372 smlsl v20.4s, v13.4h, v7.h[2]
1373 smlsl v22.4s, v12.4h, v1.h[0]
1374 smlal v22.4s, v13.4h, v1.h[2]
1375 smlsl v16.4s, v12.4h, v7.h[0]
1376 smlal v16.4s, v13.4h, v5.h[2]
1377 smlal v18.4s, v12.4h, v3.h[0]
1378 smlsl v18.4s, v13.4h, v3.h[2]
1384 ld1 {v10.4h},[x0],x6
1385 ld1 {v8.4h},[x0],x6
1386 ld1 {v11.4h},[x0],x6
1387 ld1 {v9.4h},[x0],x6
1390 smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0)
1391 smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
1392 smlal v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2)
1393 smlal v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
1395 smlsl v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1396 smlal v26.4s, v9.4h, v0.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1397 smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1398 smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1404 smlal v20.4s, v10.4h, v0.h[0]
1405 smlsl v20.4s, v11.4h, v0.h[2]
1408 smlsl v22.4s, v10.4h, v0.h[0]
1409 smlal v22.4s, v11.4h, v6.h[2]
1411 smlsl v16.4s, v10.4h, v0.h[0]
1412 smlal v16.4s, v11.4h, v2.h[2]
1414 smlal v18.4s, v10.4h, v0.h[0]
1415 smlsl v18.4s, v11.4h, v4.h[2]
1420 ld1 {v12.4h},[x0],x6
1421 ld1 {v14.4h},[x0],x6
1422 ld1 {v13.4h},[x0],x6
1423 ld1 {v15.4h},[x0],x6
1430 smlal v24.4s, v14.4h, v3.h[1]
1431 smlsl v26.4s, v14.4h, v2.h[1]
1432 smlal v28.4s, v14.4h, v7.h[3]
1433 smlal v30.4s, v14.4h, v2.h[3]
1436 smlsl v24.4s, v15.4h, v0.h[3]
1437 smlal v26.4s, v15.4h, v4.h[3]
1438 smlal v28.4s, v15.4h, v6.h[3]
1439 smlsl v30.4s, v15.4h, v2.h[1]
1442 smlal v20.4s, v12.4h, v3.h[0]
1443 smlsl v20.4s, v13.4h, v6.h[2]
1444 smlal v22.4s, v12.4h, v7.h[0]
1445 smlsl v22.4s, v13.4h, v4.h[2]
1446 smlsl v16.4s, v12.4h, v1.h[0]
1447 smlal v16.4s, v13.4h, v0.h[2]
1448 smlal v18.4s, v12.4h, v5.h[0]
1449 smlsl v18.4s, v13.4h, v5.h[2]
1452 ld1 {v10.4h},[x0],x6
1453 ld1 {v8.4h},[x0],x6
1454 ld1 {v11.4h},[x0],x6
1455 ld1 {v9.4h},[x0],x6
1461 smlal v24.4s, v8.4h, v3.h[3] //// y1 * cos1(part of b0)
1462 smlsl v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1)
1463 smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
1464 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
1466 smlsl v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
1467 smlsl v26.4s, v9.4h, v6.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1468 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1469 smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1475 smlsl v20.4s, v10.4h, v6.h[0]
1476 smlal v20.4s, v11.4h, v2.h[2]
1479 smlal v22.4s, v10.4h, v2.h[0]
1480 smlsl v22.4s, v11.4h, v0.h[2]
1482 smlsl v16.4s, v10.4h, v2.h[0]
1483 smlal v16.4s, v11.4h, v3.h[2]
1485 smlal v18.4s, v10.4h, v6.h[0]
1486 smlsl v18.4s, v11.4h, v6.h[2]
1489 ld1 {v12.4h},[x0],x6
1490 ld1 {v14.4h},[x0],x6
1491 ld1 {v13.4h},[x0],x6
1492 ld1 {v15.4h},[x0],x6
1497 smlsl v24.4s, v14.4h, v5.h[1]
1498 smlal v26.4s, v14.4h, v3.h[3]
1499 smlsl v28.4s, v14.4h, v2.h[1]
1500 smlal v30.4s, v14.4h, v0.h[3]
1503 smlal v24.4s, v15.4h, v1.h[3]
1504 smlsl v26.4s, v15.4h, v1.h[1]
1505 smlal v28.4s, v15.4h, v0.h[3]
1506 smlsl v30.4s, v15.4h, v0.h[1]
1509 smlsl v20.4s, v12.4h, v1.h[0]
1510 smlal v20.4s, v13.4h, v4.h[2]
1511 smlal v22.4s, v12.4h, v3.h[0]
1512 smlsl v22.4s, v13.4h, v5.h[2]
1513 smlsl v16.4s, v12.4h, v5.h[0]
1514 smlal v16.4s, v13.4h, v6.h[2]
1515 smlal v18.4s, v12.4h, v7.h[0]
1516 smlsl v18.4s, v13.4h, v7.h[2]
1533 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
1534 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
1535 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
1536 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
1537 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
1538 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
1539 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
1540 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
1547 trn1 v24.4h, v30.4h, v12.4h
1548 trn2 v25.4h, v30.4h, v12.4h
1549 trn1 v26.4h, v31.4h, v13.4h
1550 trn2 v27.4h, v31.4h, v13.4h
1557 trn1 v24.4h, v14.4h, v18.4h
1558 trn2 v25.4h, v14.4h, v18.4h
1559 trn1 v26.4h, v15.4h, v19.4h
1560 trn2 v27.4h, v15.4h, v19.4h
1572 st1 { v30.4h, v31.4h},[x1],#16
1573 st1 { v12.4h, v13.4h},[x1],#16
1574 st1 { v14.4h, v15.4h},[x1],#16
1575 st1 { v18.4h, v19.4h},[x1],#16
1618 ld1 {v10.4h, v11.4h},[x1],#16
1619 ld1 {v8.4h, v9.4h},[x1],x10
1621 smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0)
1622 smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1)
1623 smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
1624 smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
1626 smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1627 smlal v26.4s, v9.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1628 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1629 smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1633 smull v20.4s, v10.4h, v0.h[0]
1634 smlal v20.4s, v11.4h, v0.h[2]
1637 smull v22.4s, v10.4h, v0.h[0]
1638 smlal v22.4s, v11.4h, v1.h[2]
1640 smull v16.4s, v10.4h, v0.h[0]
1641 smlal v16.4s, v11.4h, v2.h[2]
1643 smull v18.4s, v10.4h, v0.h[0]
1644 smlal v18.4s, v11.4h, v3.h[2]
1648 ld1 {v12.4h, v13.4h},[x1],#16
1649 ld1 {v14.4h, v15.4h},[x1],x10
1656 smlal v24.4s, v14.4h, v1.h[1]
1657 smlal v26.4s, v14.4h, v3.h[3]
1658 smlal v28.4s, v14.4h, v6.h[1]
1659 smlsl v30.4s, v14.4h, v7.h[1]
1662 smlal v24.4s, v15.4h, v1.h[3]
1663 smlal v26.4s, v15.4h, v5.h[1]
1664 smlsl v28.4s, v15.4h, v7.h[1]
1665 smlsl v30.4s, v15.4h, v3.h[3]
1668 smlal v20.4s, v12.4h, v1.h[0]
1669 smlal v20.4s, v13.4h, v1.h[2]
1670 smlal v22.4s, v12.4h, v3.h[0]
1671 smlal v22.4s, v13.4h, v4.h[2]
1672 smlal v16.4s, v12.4h, v5.h[0]
1673 smlal v16.4s, v13.4h, v7.h[2]
1674 smlal v18.4s, v12.4h, v7.h[0]
1675 smlsl v18.4s, v13.4h, v5.h[2]
1679 ld1 {v10.4h, v11.4h},[x1],#16
1680 ld1 {v8.4h, v9.4h},[x1],x10
1682 smlal v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0)
1683 smlal v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
1684 smlsl v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2)
1685 smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3)
1687 smlal v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1688 smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1689 smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1690 smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1696 smlal v20.4s, v10.4h, v2.h[0]
1697 smlal v20.4s, v11.4h, v2.h[2]
1700 smlal v22.4s, v10.4h, v6.h[0]
1701 smlal v22.4s, v11.4h, v7.h[2]
1703 smlsl v16.4s, v10.4h, v6.h[0]
1704 smlsl v16.4s, v11.4h, v3.h[2]
1706 smlsl v18.4s, v10.4h, v2.h[0]
1707 smlsl v18.4s, v11.4h, v1.h[2]
1713 ld1 {v12.4h, v13.4h},[x1],#16
1714 ld1 {v14.4h, v15.4h},[x1],x10
1720 smlal v24.4s, v14.4h, v3.h[1]
1721 smlsl v26.4s, v14.4h, v6.h[1]
1722 smlsl v28.4s, v14.4h, v0.h[1]
1723 smlsl v30.4s, v14.4h, v6.h[3]
1726 smlal v24.4s, v15.4h, v3.h[3]
1727 smlsl v26.4s, v15.4h, v4.h[3]
1728 smlsl v28.4s, v15.4h, v2.h[3]
1729 smlal v30.4s, v15.4h, v5.h[3]
1732 smlal v20.4s, v12.4h, v3.h[0]
1733 smlal v20.4s, v13.4h, v3.h[2]
1734 smlsl v22.4s, v12.4h, v7.h[0]
1735 smlsl v22.4s, v13.4h, v5.h[2]
1736 smlsl v16.4s, v12.4h, v1.h[0]
1737 smlsl v16.4s, v13.4h, v1.h[2]
1738 smlsl v18.4s, v12.4h, v5.h[0]
1739 smlal v18.4s, v13.4h, v7.h[2]
1745 ld1 {v10.4h, v11.4h},[x1],#16
1746 ld1 {v8.4h, v9.4h},[x1],x10
1749 smlal v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
1750 smlsl v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1)
1751 smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
1752 smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3)
1754 smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1755 smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1756 smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1757 smlal v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1763 smlal v20.4s, v10.4h, v0.h[0]
1764 smlal v20.4s, v11.4h, v4.h[2]
1767 smlsl v22.4s, v10.4h, v0.h[0]
1768 smlsl v22.4s, v11.4h, v2.h[2]
1770 smlsl v16.4s, v10.4h, v0.h[0]
1771 smlsl v16.4s, v11.4h, v6.h[2]
1773 smlal v18.4s, v10.4h, v0.h[0]
1774 smlal v18.4s, v11.4h, v0.h[2]
1776 ld1 {v12.4h, v13.4h},[x1],#16
1777 ld1 {v14.4h, v15.4h},[x1],x10
1783 smlal v24.4s, v14.4h, v5.h[1]
1784 smlsl v26.4s, v14.4h, v0.h[2]
1785 smlal v28.4s, v14.4h, v5.h[3]
1786 smlal v30.4s, v14.4h, v4.h[3]
1789 smlal v24.4s, v15.4h, v5.h[3]
1790 smlsl v26.4s, v15.4h, v1.h[1]
1791 smlal v28.4s, v15.4h, v3.h[1]
1792 smlsl v30.4s, v15.4h, v7.h[3]
1795 smlal v20.4s, v12.4h, v5.h[0]
1796 smlal v20.4s, v13.4h, v5.h[2]
1797 smlsl v22.4s, v12.4h, v1.h[0]
1798 smlsl v22.4s, v13.4h, v0.h[2]
1799 smlal v16.4s, v12.4h, v7.h[0]
1800 smlal v16.4s, v13.4h, v4.h[2]
1801 smlal v18.4s, v12.4h, v3.h[0]
1802 smlal v18.4s, v13.4h, v6.h[2]
1805 ld1 {v10.4h, v11.4h},[x1],#16
1806 ld1 {v8.4h, v9.4h},[x1],x10
1811 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
1812 smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
1813 smlal v28.4s, v8.4h, v0.h[1] //// y1 * sin3(part of b2)
1814 smlsl v30.4s, v8.4h, v4.h[1] //// y1 * sin1(part of b3)
1816 smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1817 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1818 smlal v28.4s, v9.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1819 smlsl v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1825 smlal v20.4s, v10.4h, v6.h[0]
1826 smlal v20.4s, v11.4h, v6.h[2]
1829 smlsl v22.4s, v10.4h, v2.h[0]
1830 smlsl v22.4s, v11.4h, v3.h[2]
1832 smlal v16.4s, v10.4h, v2.h[0]
1833 smlal v16.4s, v11.4h, v0.h[2]
1835 smlsl v18.4s, v10.4h, v6.h[0]
1836 smlsl v18.4s, v11.4h, v2.h[2]
1838 ld1 {v12.4h, v13.4h},[x1],#16
1839 ld1 {v14.4h, v15.4h},[x1],x10
1841 smlal v24.4s, v14.4h, v7.h[1]
1842 smlsl v26.4s, v14.4h, v5.h[3]
1843 smlal v28.4s, v14.4h, v4.h[1]
1844 smlsl v30.4s, v14.4h, v2.h[3]
1847 smlal v24.4s, v15.4h, v7.h[3]
1848 smlsl v26.4s, v15.4h, v7.h[1]
1849 smlal v28.4s, v15.4h, v6.h[3]
1850 smlsl v30.4s, v15.4h, v6.h[1]
1853 smlal v20.4s, v12.4h, v7.h[0]
1854 smlal v20.4s, v13.4h, v7.h[2]
1855 smlsl v22.4s, v12.4h, v5.h[0]
1856 smlsl v22.4s, v13.4h, v6.h[2]
1857 smlal v16.4s, v12.4h, v3.h[0]
1858 smlal v16.4s, v13.4h, v5.h[2]
1859 smlsl v18.4s, v12.4h, v1.h[0]
1860 smlsl v18.4s, v13.4h, v4.h[2]
1877 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
1878 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
1879 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
1880 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
1881 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
1882 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
1883 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
1884 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
1892 trn1 v24.4h, v30.4h, v12.4h
1893 trn2 v25.4h, v30.4h, v12.4h
1894 trn1 v26.4h, v31.4h, v13.4h
1895 trn2 v27.4h, v31.4h, v13.4h
1902 trn1 v24.4h, v14.4h, v18.4h
1903 trn2 v25.4h, v14.4h, v18.4h
1904 trn1 v26.4h, v15.4h, v19.4h
1905 trn2 v27.4h, v15.4h, v19.4h
1917 st1 { v30.4h, v31.4h},[x0],#16
1918 st1 { v12.4h, v13.4h},[x0],#16
1919 st1 { v14.4h, v15.4h},[x0],#16
1920 st1 { v18.4h, v19.4h},[x0],#16
1929 ld1 {v10.4h, v11.4h},[x1],#16
1930 ld1 {v8.4h, v9.4h},[x1],x10
1933 smull v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0)
1934 smull v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
1935 smull v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2)
1936 smull v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
1938 smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1939 smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1940 smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1941 smlsl v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
1947 smull v20.4s, v10.4h, v0.h[0]
1948 smlal v20.4s, v11.4h, v4.h[2]
1951 smull v22.4s, v10.4h, v0.h[0]
1952 smlal v22.4s, v11.4h, v5.h[2]
1954 smull v16.4s, v10.4h, v0.h[0]
1955 smlal v16.4s, v11.4h, v6.h[2]
1957 smull v18.4s, v10.4h, v0.h[0]
1958 smlal v18.4s, v11.4h, v7.h[2]
1963 ld1 {v12.4h, v13.4h},[x1],#16
1964 ld1 {v14.4h, v15.4h},[x1],x10
1967 smlsl v24.4s, v14.4h, v4.h[3]
1968 smlsl v26.4s, v14.4h, v2.h[1]
1969 smlsl v28.4s, v14.4h, v0.h[1]
1970 smlsl v30.4s, v14.4h, v2.h[3]
1973 smlsl v24.4s, v15.4h, v0.h[3]
1974 smlsl v26.4s, v15.4h, v3.h[1]
1975 smlsl v28.4s, v15.4h, v6.h[3]
1976 smlal v30.4s, v15.4h, v5.h[3]
1979 smlsl v20.4s, v12.4h, v7.h[0]
1980 smlsl v20.4s, v13.4h, v2.h[2]
1981 smlsl v22.4s, v12.4h, v5.h[0]
1982 smlsl v22.4s, v13.4h, v0.h[2]
1983 smlsl v16.4s, v12.4h, v3.h[0]
1984 smlsl v16.4s, v13.4h, v3.h[2]
1985 smlsl v18.4s, v12.4h, v1.h[0]
1986 smlsl v18.4s, v13.4h, v6.h[2]
1991 ld1 {v10.4h, v11.4h},[x1],#16
1992 ld1 {v8.4h, v9.4h},[x1],x10
1998 smlsl v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
1999 smlal v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1)
2000 smlal v28.4s, v8.4h, v2.h[3] //// y1 * sin3(part of b2)
2001 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
2003 smlal v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2004 smlal v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2005 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2006 smlsl v30.4s, v9.4h, v6.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
2012 smlsl v20.4s, v10.4h, v2.h[0]
2013 smlsl v20.4s, v11.4h, v6.h[2]
2016 smlsl v22.4s, v10.4h, v6.h[0]
2017 smlal v22.4s, v11.4h, v4.h[2]
2019 smlal v16.4s, v10.4h, v6.h[0]
2020 smlal v16.4s, v11.4h, v0.h[2]
2022 smlal v18.4s, v10.4h, v2.h[0]
2023 smlal v18.4s, v11.4h, v5.h[2]
2029 ld1 {v12.4h, v13.4h},[x1],#16
2030 ld1 {v14.4h, v15.4h},[x1],x10
2037 smlal v24.4s, v14.4h, v2.h[3]
2038 smlal v26.4s, v14.4h, v3.h[3]
2039 smlsl v28.4s, v14.4h, v5.h[3]
2040 smlsl v30.4s, v14.4h, v0.h[3]
2043 smlal v24.4s, v15.4h, v1.h[3]
2044 smlsl v26.4s, v15.4h, v6.h[3]
2045 smlsl v28.4s, v15.4h, v0.h[3]
2046 smlal v30.4s, v15.4h, v7.h[3]
2049 smlal v20.4s, v12.4h, v5.h[0]
2050 smlal v20.4s, v13.4h, v0.h[2]
2051 smlal v22.4s, v12.4h, v1.h[0]
2052 smlal v22.4s, v13.4h, v6.h[2]
2053 smlal v16.4s, v12.4h, v7.h[0]
2054 smlsl v16.4s, v13.4h, v2.h[2]
2055 smlsl v18.4s, v12.4h, v3.h[0]
2056 smlsl v18.4s, v13.4h, v4.h[2]
2062 ld1 {v10.4h, v11.4h},[x1],#16
2063 ld1 {v8.4h, v9.4h},[x1],x10
2067 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
2068 smlsl v26.4s, v8.4h, v1.h[1] //// y1 * cos3(part of b1)
2069 smlsl v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2)
2070 smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1(part of b3)
2072 smlsl v24.4s, v9.4h, v5.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2073 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2074 smlal v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2075 smlal v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2081 smlal v20.4s, v10.4h, v0.h[0]
2082 smlsl v20.4s, v11.4h, v7.h[2]
2085 smlsl v22.4s, v10.4h, v0.h[0]
2086 smlsl v22.4s, v11.4h, v1.h[2]
2088 smlsl v16.4s, v10.4h, v0.h[0]
2089 smlal v16.4s, v11.4h, v5.h[2]
2091 smlal v18.4s, v10.4h, v0.h[0]
2092 smlal v18.4s, v11.4h, v3.h[2]
2094 ld1 {v12.4h, v13.4h},[x1],#16
2095 ld1 {v14.4h, v15.4h},[x1],x10
2100 smlsl v24.4s, v14.4h, v0.h[1]
2101 smlal v26.4s, v14.4h, v6.h[1]
2102 smlal v28.4s, v14.4h, v4.h[1]
2103 smlsl v30.4s, v14.4h, v1.h[1]
2106 smlsl v24.4s, v15.4h, v3.h[3]
2107 smlal v26.4s, v15.4h, v0.h[1]
2108 smlsl v28.4s, v15.4h, v5.h[1]
2109 smlsl v30.4s, v15.4h, v6.h[1]
2112 smlsl v20.4s, v12.4h, v3.h[0]
2113 smlsl v20.4s, v13.4h, v1.h[2]
2114 smlsl v22.4s, v12.4h, v7.h[0]
2115 smlal v22.4s, v13.4h, v3.h[2]
2116 smlal v16.4s, v12.4h, v1.h[0]
2117 smlal v16.4s, v13.4h, v7.h[2]
2118 smlsl v18.4s, v12.4h, v5.h[0]
2119 smlsl v18.4s, v13.4h, v2.h[2]
2122 ld1 {v10.4h, v11.4h},[x1],#16
2123 ld1 {v8.4h, v9.4h},[x1],x10
2126 smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
2127 smlal v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1)
2128 smlsl v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
2129 smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3)
2131 smlal v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2132 smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2133 smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2134 smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2140 smlsl v20.4s, v10.4h, v6.h[0]
2141 smlal v20.4s, v11.4h, v5.h[2]
2144 smlal v22.4s, v10.4h, v2.h[0]
2145 smlal v22.4s, v11.4h, v7.h[2]
2147 smlsl v16.4s, v10.4h, v2.h[0]
2148 smlsl v16.4s, v11.4h, v4.h[2]
2150 smlal v18.4s, v10.4h, v6.h[0]
2151 smlal v18.4s, v11.4h, v1.h[2]
2154 ld1 {v12.4h, v13.4h},[x1],#16
2155 ld1 {v14.4h, v15.4h},[x1],x10
2159 smlal v24.4s, v14.4h, v1.h[1]
2160 smlsl v26.4s, v14.4h, v0.h[3]
2161 smlal v28.4s, v14.4h, v1.h[3]
2162 smlsl v30.4s, v14.4h, v3.h[1]
2165 smlal v24.4s, v15.4h, v5.h[3]
2166 smlsl v26.4s, v15.4h, v5.h[1]
2167 smlal v28.4s, v15.4h, v4.h[3]
2168 smlsl v30.4s, v15.4h, v4.h[1]
2171 smlal v20.4s, v12.4h, v1.h[0]
2172 smlal v20.4s, v13.4h, v3.h[2]
2173 smlsl v22.4s, v12.4h, v3.h[0]
2174 smlsl v22.4s, v13.4h, v2.h[2]
2175 smlal v16.4s, v12.4h, v5.h[0]
2176 smlal v16.4s, v13.4h, v1.h[2]
2177 smlsl v18.4s, v12.4h, v7.h[0]
2178 smlsl v18.4s, v13.4h, v0.h[2]
2195 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
2196 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
2197 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
2198 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
2199 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
2200 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
2201 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
2202 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
2209 trn1 v24.4h, v30.4h, v12.4h
2210 trn2 v25.4h, v30.4h, v12.4h
2211 trn1 v26.4h, v31.4h, v13.4h
2212 trn2 v27.4h, v31.4h, v13.4h
2219 trn1 v24.4h, v14.4h, v18.4h
2220 trn2 v25.4h, v14.4h, v18.4h
2221 trn1 v26.4h, v15.4h, v19.4h
2222 trn2 v27.4h, v15.4h, v19.4h
2234 st1 { v30.4h, v31.4h},[x0],#16
2235 st1 { v12.4h, v13.4h},[x0],#16
2236 st1 { v14.4h, v15.4h},[x0],#16
2237 st1 { v18.4h, v19.4h},[x0],#16
2245 ld1 {v10.4h, v11.4h},[x1],#16
2246 ld1 {v8.4h, v9.4h},[x1],x10
2248 smull v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
2249 smull v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1)
2250 smull v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
2251 smull v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3)
2253 smlsl v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2254 smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2255 smlsl v28.4s, v9.4h, v0.h[2] //// y1 * sin3 - y3 * cos1(part of b2)
2256 smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2262 smull v20.4s, v10.4h, v0.h[0]
2263 smlsl v20.4s, v11.4h, v7.h[2]
2266 smull v22.4s, v10.4h, v0.h[0]
2267 smlsl v22.4s, v11.4h, v6.h[2]
2269 smull v16.4s, v10.4h, v0.h[0]
2270 smlsl v16.4s, v11.4h, v5.h[2]
2272 smull v18.4s, v10.4h, v0.h[0]
2273 smlsl v18.4s, v11.4h, v4.h[2]
2278 ld1 {v12.4h, v13.4h},[x1],#16
2279 ld1 {v14.4h, v15.4h},[x1],x10
2281 smlsl v24.4s, v14.4h, v5.h[1]
2282 smlsl v26.4s, v14.4h, v7.h[3]
2283 smlal v28.4s, v14.4h, v5.h[3]
2284 smlal v30.4s, v14.4h, v3.h[1]
2287 smlal v24.4s, v15.4h, v2.h[1]
2288 smlal v26.4s, v15.4h, v1.h[1]
2289 smlal v28.4s, v15.4h, v4.h[3]
2290 smlsl v30.4s, v15.4h, v7.h[3]
2293 smlsl v20.4s, v12.4h, v1.h[0]
2294 smlal v20.4s, v13.4h, v6.h[2]
2295 smlsl v22.4s, v12.4h, v3.h[0]
2296 smlal v22.4s, v13.4h, v3.h[2]
2297 smlsl v16.4s, v12.4h, v5.h[0]
2298 smlal v16.4s, v13.4h, v0.h[2]
2299 smlsl v18.4s, v12.4h, v7.h[0]
2300 smlal v18.4s, v13.4h, v2.h[2]
2305 ld1 {v10.4h, v11.4h},[x1],#16
2306 ld1 {v8.4h, v9.4h},[x1],x10
2310 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
2311 smlsl v26.4s, v8.4h, v5.h[1] //// y1 * cos3(part of b1)
2312 smlsl v28.4s, v8.4h, v0.h[3] //// y1 * sin3(part of b2)
2313 smlsl v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
2315 smlsl v24.4s, v9.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2316 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2317 smlal v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2318 smlal v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2324 smlal v20.4s, v10.4h, v2.h[0]
2325 smlsl v20.4s, v11.4h, v5.h[2]
2328 smlal v22.4s, v10.4h, v6.h[0]
2329 smlsl v22.4s, v11.4h, v0.h[2]
2331 smlsl v16.4s, v10.4h, v6.h[0]
2332 smlsl v16.4s, v11.4h, v4.h[2]
2334 smlsl v18.4s, v10.4h, v2.h[0]
2335 smlal v18.4s, v11.4h, v6.h[2]
2340 ld1 {v12.4h, v13.4h},[x1],#16
2341 ld1 {v14.4h, v15.4h},[x1],x10
2347 smlsl v24.4s, v14.4h, v7.h[1]
2348 smlal v26.4s, v14.4h, v2.h[1]
2349 smlal v28.4s, v14.4h, v4.h[1]
2350 smlsl v30.4s, v14.4h, v5.h[1]
2353 smlal v24.4s, v15.4h, v0.h[3]
2354 smlal v26.4s, v15.4h, v7.h[1]
2355 smlsl v28.4s, v15.4h, v1.h[1]
2356 smlsl v30.4s, v15.4h, v6.h[1]
2359 smlsl v20.4s, v12.4h, v3.h[0]
2360 smlal v20.4s, v13.4h, v4.h[2]
2361 smlal v22.4s, v12.4h, v7.h[0]
2362 smlal v22.4s, v13.4h, v2.h[2]
2363 smlal v16.4s, v12.4h, v1.h[0]
2364 smlsl v16.4s, v13.4h, v6.h[2]
2365 smlal v18.4s, v12.4h, v5.h[0]
2366 smlsl v18.4s, v13.4h, v0.h[2]
2372 ld1 {v10.4h, v11.4h},[x1],#16
2373 ld1 {v8.4h, v9.4h},[x1],x10
2376 smlsl v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
2377 smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3(part of b1)
2378 smlal v28.4s, v8.4h, v6.h[3] //// y1 * sin3(part of b2)
2379 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
2381 smlsl v24.4s, v9.4h, v0.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2382 smlal v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2383 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2384 smlsl v30.4s, v9.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
2390 smlal v20.4s, v10.4h, v0.h[0]
2391 smlsl v20.4s, v11.4h, v3.h[2]
2394 smlsl v22.4s, v10.4h, v0.h[0]
2395 smlsl v22.4s, v11.4h, v5.h[2]
2397 smlsl v16.4s, v10.4h, v0.h[0]
2398 smlal v16.4s, v11.4h, v1.h[2]
2400 smlal v18.4s, v10.4h, v0.h[0]
2401 smlal v18.4s, v11.4h, v7.h[2]
2403 ld1 {v12.4h, v13.4h},[x1],#16
2404 ld1 {v14.4h, v15.4h},[x1],x10
2409 smlal v24.4s, v14.4h, v6.h[3]
2410 smlal v26.4s, v14.4h, v3.h[3]
2411 smlsl v28.4s, v14.4h, v1.h[3]
2412 smlal v30.4s, v14.4h, v7.h[1]
2415 smlal v24.4s, v15.4h, v1.h[3]
2416 smlsl v26.4s, v15.4h, v2.h[3]
2417 smlal v28.4s, v15.4h, v7.h[1]
2418 smlal v30.4s, v15.4h, v4.h[1]
2421 smlsl v20.4s, v12.4h, v5.h[0]
2422 smlal v20.4s, v13.4h, v2.h[2]
2423 smlal v22.4s, v12.4h, v1.h[0]
2424 smlsl v22.4s, v13.4h, v7.h[2]
2425 smlsl v16.4s, v12.4h, v7.h[0]
2426 smlsl v16.4s, v13.4h, v3.h[2]
2427 smlsl v18.4s, v12.4h, v3.h[0]
2428 smlal v18.4s, v13.4h, v1.h[2]
2431 ld1 {v10.4h, v11.4h},[x1],#16
2432 ld1 {v8.4h, v9.4h},[x1],x10
2435 smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0)
2436 smlsl v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
2437 smlal v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2)
2438 smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3)
2440 smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
2441 smlal v26.4s, v9.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2442 smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2443 smlal v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
2449 smlal v20.4s, v10.4h, v6.h[0]
2450 smlsl v20.4s, v11.4h, v1.h[2]
2453 smlsl v22.4s, v10.4h, v2.h[0]
2454 smlal v22.4s, v11.4h, v4.h[2]
2456 smlal v16.4s, v10.4h, v2.h[0]
2457 smlsl v16.4s, v11.4h, v7.h[2]
2459 smlsl v18.4s, v10.4h, v6.h[0]
2460 smlsl v18.4s, v11.4h, v5.h[2]
2462 ld1 {v12.4h, v13.4h},[x1],#16
2463 ld1 {v14.4h, v15.4h},[x1],x10
2467 smlal v24.4s, v14.4h, v4.h[3]
2468 smlsl v26.4s, v14.4h, v6.h[1]
2469 smlal v28.4s, v14.4h, v7.h[3]
2470 smlal v30.4s, v14.4h, v6.h[3]
2473 smlal v24.4s, v15.4h, v3.h[3]
2474 smlsl v26.4s, v15.4h, v3.h[1]
2475 smlal v28.4s, v15.4h, v2.h[3]
2476 smlsl v30.4s, v15.4h, v2.h[1]
2479 smlsl v20.4s, v12.4h, v7.h[0]
2480 smlal v20.4s, v13.4h, v0.h[2]
2481 smlal v22.4s, v12.4h, v5.h[0]
2482 smlsl v22.4s, v13.4h, v1.h[2]
2483 smlsl v16.4s, v12.4h, v3.h[0]
2484 smlal v16.4s, v13.4h, v2.h[2]
2485 smlal v18.4s, v12.4h, v1.h[0]
2486 smlsl v18.4s, v13.4h, v3.h[2]
2503 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
2504 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x11 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
2505 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
2506 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
2507 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
2508 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
2509 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
2510 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
2517 trn1 v24.4h, v30.4h, v12.4h
2518 trn2 v25.4h, v30.4h, v12.4h
2519 trn1 v26.4h, v31.4h, v13.4h
2520 trn2 v27.4h, v31.4h, v13.4h
2527 trn1 v24.4h, v14.4h, v18.4h
2528 trn2 v25.4h, v14.4h, v18.4h
2529 trn1 v26.4h, v15.4h, v19.4h
2530 trn2 v27.4h, v15.4h, v19.4h
2542 st1 { v30.4h, v31.4h},[x0],#16
2543 st1 { v12.4h, v13.4h},[x0],#16
2544 st1 { v14.4h, v15.4h},[x0],#16
2545 st1 { v18.4h, v19.4h},[x0],#16
2554 ld1 {v10.4h, v11.4h},[x1],#16
2555 ld1 {v8.4h, v9.4h},[x1],x10
2558 smull v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
2559 smull v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
2560 smull v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2)
2561 smull v30.4s, v8.4h, v7.h[3] //// y1 * sin1(part of b3)
2563 smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
2564 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2565 smlsl v28.4s, v9.4h, v5.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2566 smlsl v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2572 smull v20.4s, v10.4h, v0.h[0]
2573 smlsl v20.4s, v11.4h, v3.h[2]
2576 smull v22.4s, v10.4h, v0.h[0]
2577 smlsl v22.4s, v11.4h, v2.h[2]
2579 smull v16.4s, v10.4h, v0.h[0]
2580 smlsl v16.4s, v11.4h, v1.h[2]
2582 smull v18.4s, v10.4h, v0.h[0]
2583 smlsl v18.4s, v11.4h, v0.h[2]
2587 ld1 {v12.4h, v13.4h},[x1],#16
2588 ld1 {v14.4h, v15.4h},[x1],x10
2595 smlal v24.4s, v14.4h, v0.h[1]
2596 smlal v26.4s, v14.4h, v1.h[3]
2597 smlal v28.4s, v14.4h, v4.h[1]
2598 smlal v30.4s, v14.4h, v6.h[3]
2601 smlsl v24.4s, v15.4h, v4.h[1]
2602 smlsl v26.4s, v15.4h, v0.h[3]
2603 smlsl v28.4s, v15.4h, v2.h[3]
2604 smlsl v30.4s, v15.4h, v6.h[1]
2607 smlal v20.4s, v12.4h, v7.h[0]
2608 smlal v20.4s, v13.4h, v5.h[2]
2609 smlal v22.4s, v12.4h, v5.h[0]
2610 smlsl v22.4s, v13.4h, v7.h[2]
2611 smlal v16.4s, v12.4h, v3.h[0]
2612 smlsl v16.4s, v13.4h, v4.h[2]
2613 smlal v18.4s, v12.4h, v1.h[0]
2614 smlsl v18.4s, v13.4h, v1.h[2]
2619 ld1 {v10.4h, v11.4h},[x1],#16
2620 ld1 {v8.4h, v9.4h},[x1],x10
2624 smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
2625 smlal v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1)
2626 smlal v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
2627 smlal v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3)
2629 smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
2630 smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2631 smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2632 smlsl v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2638 smlsl v20.4s, v10.4h, v2.h[0]
2639 smlal v20.4s, v11.4h, v1.h[2]
2642 smlsl v22.4s, v10.4h, v6.h[0]
2643 smlal v22.4s, v11.4h, v3.h[2]
2645 smlal v16.4s, v10.4h, v6.h[0]
2646 smlsl v16.4s, v11.4h, v7.h[2]
2648 smlal v18.4s, v10.4h, v2.h[0]
2649 smlsl v18.4s, v11.4h, v2.h[2]
2655 ld1 {v12.4h, v13.4h},[x1],#16
2656 ld1 {v14.4h, v15.4h},[x1],x10
2663 smlsl v24.4s, v14.4h, v1.h[1]
2664 smlsl v26.4s, v14.4h, v7.h[3]
2665 smlal v28.4s, v14.4h, v1.h[3]
2666 smlal v30.4s, v14.4h, v4.h[3]
2669 smlal v24.4s, v15.4h, v2.h[1]
2670 smlal v26.4s, v15.4h, v5.h[1]
2671 smlsl v28.4s, v15.4h, v3.h[1]
2672 smlsl v30.4s, v15.4h, v4.h[1]
2675 smlsl v20.4s, v12.4h, v5.h[0]
2676 smlsl v20.4s, v13.4h, v7.h[2]
2677 smlsl v22.4s, v12.4h, v1.h[0]
2678 smlal v22.4s, v13.4h, v1.h[2]
2679 smlsl v16.4s, v12.4h, v7.h[0]
2680 smlal v16.4s, v13.4h, v5.h[2]
2681 smlal v18.4s, v12.4h, v3.h[0]
2682 smlsl v18.4s, v13.4h, v3.h[2]
2688 ld1 {v10.4h, v11.4h},[x1],#16
2689 ld1 {v8.4h, v9.4h},[x1],x10
2692 smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0)
2693 smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
2694 smlal v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2)
2695 smlal v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
2697 smlsl v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
2698 smlal v26.4s, v9.4h, v0.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2699 smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2700 smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2706 smlal v20.4s, v10.4h, v0.h[0]
2707 smlsl v20.4s, v11.4h, v0.h[2]
2710 smlsl v22.4s, v10.4h, v0.h[0]
2711 smlal v22.4s, v11.4h, v6.h[2]
2713 smlsl v16.4s, v10.4h, v0.h[0]
2714 smlal v16.4s, v11.4h, v2.h[2]
2716 smlal v18.4s, v10.4h, v0.h[0]
2717 smlsl v18.4s, v11.4h, v4.h[2]
2719 ld1 {v12.4h, v13.4h},[x1],#16
2720 ld1 {v14.4h, v15.4h},[x1],x10
2725 smlal v24.4s, v14.4h, v3.h[1]
2726 smlsl v26.4s, v14.4h, v2.h[1]
2727 smlal v28.4s, v14.4h, v7.h[3]
2728 smlal v30.4s, v14.4h, v2.h[3]
2731 smlsl v24.4s, v15.4h, v0.h[3]
2732 smlal v26.4s, v15.4h, v4.h[3]
2733 smlal v28.4s, v15.4h, v6.h[3]
2734 smlsl v30.4s, v15.4h, v2.h[1]
2737 smlal v20.4s, v12.4h, v3.h[0]
2738 smlsl v20.4s, v13.4h, v6.h[2]
2739 smlal v22.4s, v12.4h, v7.h[0]
2740 smlsl v22.4s, v13.4h, v4.h[2]
2741 smlsl v16.4s, v12.4h, v1.h[0]
2742 smlal v16.4s, v13.4h, v0.h[2]
2743 smlal v18.4s, v12.4h, v5.h[0]
2744 smlsl v18.4s, v13.4h, v5.h[2]
2747 ld1 {v10.4h, v11.4h},[x1],#16
2748 ld1 {v8.4h, v9.4h},[x1],x10
2753 smlal v24.4s, v8.4h, v3.h[3] //// y1 * cos1(part of b0)
2754 smlsl v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1)
2755 smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
2756 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
2758 smlsl v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2759 smlsl v26.4s, v9.4h, v6.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2760 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2761 smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2767 smlsl v20.4s, v10.4h, v6.h[0]
2768 smlal v20.4s, v11.4h, v2.h[2]
2771 smlal v22.4s, v10.4h, v2.h[0]
2772 smlsl v22.4s, v11.4h, v0.h[2]
2774 smlsl v16.4s, v10.4h, v2.h[0]
2775 smlal v16.4s, v11.4h, v3.h[2]
2777 smlal v18.4s, v10.4h, v6.h[0]
2778 smlsl v18.4s, v11.4h, v6.h[2]
2781 ld1 {v12.4h, v13.4h},[x1],#16
2782 ld1 {v14.4h, v15.4h},[x1],x10
2786 smlsl v24.4s, v14.4h, v5.h[1]
2787 smlal v26.4s, v14.4h, v3.h[3]
2788 smlsl v28.4s, v14.4h, v2.h[1]
2789 smlal v30.4s, v14.4h, v0.h[3]
2792 smlal v24.4s, v15.4h, v1.h[3]
2793 smlsl v26.4s, v15.4h, v1.h[1]
2794 smlal v28.4s, v15.4h, v0.h[3]
2795 smlsl v30.4s, v15.4h, v0.h[1]
2798 smlsl v20.4s, v12.4h, v1.h[0]
2799 smlal v20.4s, v13.4h, v4.h[2]
2800 smlal v22.4s, v12.4h, v3.h[0]
2801 smlsl v22.4s, v13.4h, v5.h[2]
2802 smlsl v16.4s, v12.4h, v5.h[0]
2803 smlal v16.4s, v13.4h, v6.h[2]
2804 smlal v18.4s, v12.4h, v7.h[0]
2805 smlsl v18.4s, v13.4h, v7.h[2]
2822 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
2823 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x11 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
2824 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
2825 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
2826 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
2827 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
2828 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
2829 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
2838 trn1 v24.4h, v30.4h, v12.4h
2839 trn2 v25.4h, v30.4h, v12.4h
2840 trn1 v26.4h, v31.4h, v13.4h
2841 trn2 v27.4h, v31.4h, v13.4h
2848 trn1 v24.4h, v14.4h, v18.4h
2849 trn2 v25.4h, v14.4h, v18.4h
2850 trn1 v26.4h, v15.4h, v19.4h
2851 trn2 v27.4h, v15.4h, v19.4h
2863 st1 { v30.4h, v31.4h},[x0],#16
2864 st1 { v12.4h, v13.4h},[x0],#16
2865 st1 { v14.4h, v15.4h},[x0],#16
2866 st1 { v18.4h, v19.4h},[x0],#16
2875 ld1 {v12.8h},[x0],#16
2876 ld1 {v14.8h},[x0],#16
2880 ld1 {v16.8h},[x0],#16
2881 ld1 {v18.8h},[x0],#16
2884 ld1 {v20.8h},[x0],#16
2885 ld1 {v22.8h},[x0],#16
2890 ld1 {v24.8h},[x0],#16
2891 ld1 {v26.8h},[x0],#16
2941 uaddw v12.8h, v12.8h , v8.8b
2942 uaddw v20.8h, v20.8h , v9.8b
2943 uaddw v14.8h, v14.8h , v10.8b
2944 uaddw v22.8h, v22.8h , v11.8b
2945 uaddw v16.8h, v16.8h , v28.8b
2946 uaddw v24.8h, v24.8h , v29.8b
2947 uaddw v18.8h, v18.8h , v30.8b
2948 uaddw v26.8h, v26.8h , v31.8b
2951 sqxtun v12.8b, v12.8h
2952 sqxtun v13.8b, v20.8h
2953 sqxtun v20.8b, v14.8h
2954 sqxtun v21.8b, v22.8h
2955 sqxtun v14.8b, v16.8h
2956 sqxtun v15.8b, v24.8h
2957 sqxtun v22.8b, v18.8h
2958 sqxtun v23.8b, v26.8h
2970 ld1 {v12.8h},[x0],#16
2971 ld1 {v14.8h},[x0],#16
2975 ld1 {v16.8h},[x0],#16
2976 ld1 {v18.8h},[x0],#16
2979 ld1 {v20.8h},[x0],#16
2980 ld1 {v22.8h},[x0],#16
2985 ld1 {v24.8h},[x0],#16
2986 ld1 {v26.8h},[x0],#16
3016 uaddw v12.8h, v12.8h , v8.8b
3017 uaddw v20.8h, v20.8h , v9.8b
3018 uaddw v14.8h, v14.8h , v10.8b
3019 uaddw v22.8h, v22.8h , v11.8b
3020 uaddw v16.8h, v16.8h , v28.8b
3021 uaddw v24.8h, v24.8h , v29.8b
3022 uaddw v18.8h, v18.8h , v30.8b
3023 uaddw v26.8h, v26.8h , v31.8b
3026 sqxtun v12.8b, v12.8h
3027 sqxtun v13.8b, v20.8h
3028 sqxtun v20.8b, v14.8h
3029 sqxtun v21.8b, v22.8h
3030 sqxtun v14.8b, v16.8h
3031 sqxtun v15.8b, v24.8h
3032 sqxtun v22.8b, v18.8h
3033 sqxtun v23.8b, v26.8h