Lines Matching full:h
169 ld1 {v0.4h, v1.4h, v2.4h, v3.4h},[x14],#32
170 ld1 {v4.4h, v5.4h, v6.4h, v7.4h},[x14],#32
208 ld1 {v10.4h},[x0],x6
209 ld1 {v8.4h},[x0],x6
210 ld1 {v11.4h},[x0],x6
211 ld1 {v9.4h},[x0],x6
213 smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0)
214 smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1)
215 smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
216 smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
218 smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
219 smlal v26.4s, v9.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
220 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
221 smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
227 smull v20.4s, v10.4h, v0.h[0]
228 smlal v20.4s, v11.4h, v0.h[2]
231 smull v22.4s, v10.4h, v0.h[0]
232 smlal v22.4s, v11.4h, v1.h[2]
234 smull v16.4s, v10.4h, v0.h[0]
235 smlal v16.4s, v11.4h, v2.h[2]
237 smull v18.4s, v10.4h, v0.h[0]
238 smlal v18.4s, v11.4h, v3.h[2]
242 ld1 {v12.4h},[x0],x6
243 ld1 {v14.4h},[x0],x6
244 ld1 {v13.4h},[x0],x6
245 ld1 {v15.4h},[x0],x6
253 smlal v24.4s, v14.4h, v1.h[1]
254 smlal v26.4s, v14.4h, v3.h[3]
255 smlal v28.4s, v14.4h, v6.h[1]
256 smlsl v30.4s, v14.4h, v7.h[1]
259 smlal v24.4s, v15.4h, v1.h[3]
260 smlal v26.4s, v15.4h, v5.h[1]
261 smlsl v28.4s, v15.4h, v7.h[1]
262 smlsl v30.4s, v15.4h, v3.h[3]
265 smlal v20.4s, v12.4h, v1.h[0]
266 smlal v20.4s, v13.4h, v1.h[2]
267 smlal v22.4s, v12.4h, v3.h[0]
268 smlal v22.4s, v13.4h, v4.h[2]
269 smlal v16.4s, v12.4h, v5.h[0]
270 smlal v16.4s, v13.4h, v7.h[2]
271 smlal v18.4s, v12.4h, v7.h[0]
272 smlsl v18.4s, v13.4h, v5.h[2]
277 ld1 {v10.4h},[x0],x6
278 ld1 {v8.4h},[x0],x6
279 ld1 {v11.4h},[x0],x6
280 ld1 {v9.4h},[x0],x6
283 smlal v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0)
284 smlal v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
285 smlsl v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2)
286 smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3)
288 smlal v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
289 smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
290 smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
291 smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
297 smlal v20.4s, v10.4h, v2.h[0]
298 smlal v20.4s, v11.4h, v2.h[2]
301 smlal v22.4s, v10.4h, v6.h[0]
302 smlal v22.4s, v11.4h, v7.h[2]
304 smlsl v16.4s, v10.4h, v6.h[0]
305 smlsl v16.4s, v11.4h, v3.h[2]
307 smlsl v18.4s, v10.4h, v2.h[0]
308 smlsl v18.4s, v11.4h, v1.h[2]
314 ld1 {v12.4h},[x0],x6
315 ld1 {v14.4h},[x0],x6
316 ld1 {v13.4h},[x0],x6
317 ld1 {v15.4h},[x0],x6
327 smlal v24.4s, v14.4h, v3.h[1]
328 smlsl v26.4s, v14.4h, v6.h[1]
329 smlsl v28.4s, v14.4h, v0.h[1]
330 smlsl v30.4s, v14.4h, v6.h[3]
333 smlal v24.4s, v15.4h, v3.h[3]
334 smlsl v26.4s, v15.4h, v4.h[3]
335 smlsl v28.4s, v15.4h, v2.h[3]
336 smlal v30.4s, v15.4h, v5.h[3]
339 smlal v20.4s, v12.4h, v3.h[0]
340 smlal v20.4s, v13.4h, v3.h[2]
341 smlsl v22.4s, v12.4h, v7.h[0]
342 smlsl v22.4s, v13.4h, v5.h[2]
343 smlsl v16.4s, v12.4h, v1.h[0]
344 smlsl v16.4s, v13.4h, v1.h[2]
345 smlsl v18.4s, v12.4h, v5.h[0]
346 smlal v18.4s, v13.4h, v7.h[2]
352 ld1 {v10.4h},[x0],x6
353 ld1 {v8.4h},[x0],x6
354 ld1 {v11.4h},[x0],x6
355 ld1 {v9.4h},[x0],x6
359 smlal v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
360 smlsl v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1)
361 smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
362 smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3)
364 smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
365 smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
366 smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
367 smlal v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
373 smlal v20.4s, v10.4h, v0.h[0]
374 smlal v20.4s, v11.4h, v4.h[2]
377 smlsl v22.4s, v10.4h, v0.h[0]
378 smlsl v22.4s, v11.4h, v2.h[2]
380 smlsl v16.4s, v10.4h, v0.h[0]
381 smlsl v16.4s, v11.4h, v6.h[2]
383 smlal v18.4s, v10.4h, v0.h[0]
384 smlal v18.4s, v11.4h, v0.h[2]
388 ld1 {v12.4h},[x0],x6
389 ld1 {v14.4h},[x0],x6
390 ld1 {v13.4h},[x0],x6
391 ld1 {v15.4h},[x0],x6
396 smlal v24.4s, v14.4h, v5.h[1]
397 smlsl v26.4s, v14.4h, v0.h[2]
398 smlal v28.4s, v14.4h, v5.h[3]
399 smlal v30.4s, v14.4h, v4.h[3]
402 smlal v24.4s, v15.4h, v5.h[3]
403 smlsl v26.4s, v15.4h, v1.h[1]
404 smlal v28.4s, v15.4h, v3.h[1]
405 smlsl v30.4s, v15.4h, v7.h[3]
408 smlal v20.4s, v12.4h, v5.h[0]
409 smlal v20.4s, v13.4h, v5.h[2]
410 smlsl v22.4s, v12.4h, v1.h[0]
411 smlsl v22.4s, v13.4h, v0.h[2]
412 smlal v16.4s, v12.4h, v7.h[0]
413 smlal v16.4s, v13.4h, v4.h[2]
414 smlal v18.4s, v12.4h, v3.h[0]
415 smlal v18.4s, v13.4h, v6.h[2]
418 ld1 {v10.4h},[x0],x6
419 ld1 {v8.4h},[x0],x6
420 ld1 {v11.4h},[x0],x6
421 ld1 {v9.4h},[x0],x6
429 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
430 smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
431 smlal v28.4s, v8.4h, v0.h[1] //// y1 * sin3(part of b2)
432 smlsl v30.4s, v8.4h, v4.h[1] //// y1 * sin1(part of b3)
434 smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
435 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
436 smlal v28.4s, v9.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
437 smlsl v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
443 smlal v20.4s, v10.4h, v6.h[0]
444 smlal v20.4s, v11.4h, v6.h[2]
447 smlsl v22.4s, v10.4h, v2.h[0]
448 smlsl v22.4s, v11.4h, v3.h[2]
450 smlal v16.4s, v10.4h, v2.h[0]
451 smlal v16.4s, v11.4h, v0.h[2]
453 smlsl v18.4s, v10.4h, v6.h[0]
454 smlsl v18.4s, v11.4h, v2.h[2]
456 ld1 {v12.4h},[x0],x6
457 ld1 {v14.4h},[x0],x6
458 ld1 {v13.4h},[x0],x6
459 ld1 {v15.4h},[x0],x6
462 smlal v24.4s, v14.4h, v7.h[1]
463 smlsl v26.4s, v14.4h, v5.h[3]
464 smlal v28.4s, v14.4h, v4.h[1]
465 smlsl v30.4s, v14.4h, v2.h[3]
468 smlal v24.4s, v15.4h, v7.h[3]
469 smlsl v26.4s, v15.4h, v7.h[1]
470 smlal v28.4s, v15.4h, v6.h[3]
471 smlsl v30.4s, v15.4h, v6.h[1]
474 smlal v20.4s, v12.4h, v7.h[0]
475 smlal v20.4s, v13.4h, v7.h[2]
476 smlsl v22.4s, v12.4h, v5.h[0]
477 smlsl v22.4s, v13.4h, v6.h[2]
478 smlal v16.4s, v12.4h, v3.h[0]
479 smlal v16.4s, v13.4h, v5.h[2]
480 smlsl v18.4s, v12.4h, v1.h[0]
481 smlsl v18.4s, v13.4h, v4.h[2]
500 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
501 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
502 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
503 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
504 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
505 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
506 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
507 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
517 trn1 v24.4h, v30.4h, v12.4h
518 trn2 v25.4h, v30.4h, v12.4h
519 trn1 v26.4h, v31.4h, v13.4h
520 trn2 v27.4h, v31.4h, v13.4h
527 trn1 v24.4h, v14.4h, v18.4h
528 trn2 v25.4h, v14.4h, v18.4h
529 trn1 v26.4h, v15.4h, v19.4h
530 trn2 v27.4h, v15.4h, v19.4h
553 st1 { v30.4h, v31.4h},[x1],#16
554 st1 { v12.4h, v13.4h},[x1],#16
556 st1 { v14.4h, v15.4h},[x1],#16
557 st1 { v18.4h, v19.4h},[x1],#16
566 ld1 {v10.4h},[x0],x6
567 ld1 {v8.4h},[x0],x6
568 ld1 {v11.4h},[x0],x6
569 ld1 {v9.4h},[x0],x6
574 smull v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0)
575 smull v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
576 smull v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2)
577 smull v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
579 smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
580 smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
581 smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
582 smlsl v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
588 smull v20.4s, v10.4h, v0.h[0]
589 smlal v20.4s, v11.4h, v4.h[2]
592 smull v22.4s, v10.4h, v0.h[0]
593 smlal v22.4s, v11.4h, v5.h[2]
595 smull v16.4s, v10.4h, v0.h[0]
596 smlal v16.4s, v11.4h, v6.h[2]
598 smull v18.4s, v10.4h, v0.h[0]
599 smlal v18.4s, v11.4h, v7.h[2]
603 ld1 {v12.4h},[x0],x6
604 ld1 {v14.4h},[x0],x6
605 ld1 {v13.4h},[x0],x6
606 ld1 {v15.4h},[x0],x6
609 smlsl v24.4s, v14.4h, v4.h[3]
610 smlsl v26.4s, v14.4h, v2.h[1]
611 smlsl v28.4s, v14.4h, v0.h[1]
612 smlsl v30.4s, v14.4h, v2.h[3]
615 smlsl v24.4s, v15.4h, v0.h[3]
616 smlsl v26.4s, v15.4h, v3.h[1]
617 smlsl v28.4s, v15.4h, v6.h[3]
618 smlal v30.4s, v15.4h, v5.h[3]
621 smlsl v20.4s, v12.4h, v7.h[0]
622 smlsl v20.4s, v13.4h, v2.h[2]
623 smlsl v22.4s, v12.4h, v5.h[0]
624 smlsl v22.4s, v13.4h, v0.h[2]
625 smlsl v16.4s, v12.4h, v3.h[0]
626 smlsl v16.4s, v13.4h, v3.h[2]
627 smlsl v18.4s, v12.4h, v1.h[0]
628 smlsl v18.4s, v13.4h, v6.h[2]
634 ld1 {v10.4h},[x0],x6
635 ld1 {v8.4h},[x0],x6
636 ld1 {v11.4h},[x0],x6
637 ld1 {v9.4h},[x0],x6
645 smlsl v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
646 smlal v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1)
647 smlal v28.4s, v8.4h, v2.h[3] //// y1 * sin3(part of b2)
648 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
650 smlal v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
651 smlal v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
652 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
653 smlsl v30.4s, v9.4h, v6.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
659 smlsl v20.4s, v10.4h, v2.h[0]
660 smlsl v20.4s, v11.4h, v6.h[2]
663 smlsl v22.4s, v10.4h, v6.h[0]
664 smlal v22.4s, v11.4h, v4.h[2]
666 smlal v16.4s, v10.4h, v6.h[0]
667 smlal v16.4s, v11.4h, v0.h[2]
669 smlal v18.4s, v10.4h, v2.h[0]
670 smlal v18.4s, v11.4h, v5.h[2]
676 ld1 {v12.4h},[x0],x6
677 ld1 {v14.4h},[x0],x6
678 ld1 {v13.4h},[x0],x6
679 ld1 {v15.4h},[x0],x6
685 smlal v24.4s, v14.4h, v2.h[3]
686 smlal v26.4s, v14.4h, v3.h[3]
687 smlsl v28.4s, v14.4h, v5.h[3]
688 smlsl v30.4s, v14.4h, v0.h[3]
691 smlal v24.4s, v15.4h, v1.h[3]
692 smlsl v26.4s, v15.4h, v6.h[3]
693 smlsl v28.4s, v15.4h, v0.h[3]
694 smlal v30.4s, v15.4h, v7.h[3]
697 smlal v20.4s, v12.4h, v5.h[0]
698 smlal v20.4s, v13.4h, v0.h[2]
699 smlal v22.4s, v12.4h, v1.h[0]
700 smlal v22.4s, v13.4h, v6.h[2]
701 smlal v16.4s, v12.4h, v7.h[0]
702 smlsl v16.4s, v13.4h, v2.h[2]
703 smlsl v18.4s, v12.4h, v3.h[0]
704 smlsl v18.4s, v13.4h, v4.h[2]
711 ld1 {v10.4h},[x0],x6
712 ld1 {v8.4h},[x0],x6
713 ld1 {v11.4h},[x0],x6
714 ld1 {v9.4h},[x0],x6
722 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
723 smlsl v26.4s, v8.4h, v1.h[1] //// y1 * cos3(part of b1)
724 smlsl v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2)
725 smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1(part of b3)
727 smlsl v24.4s, v9.4h, v5.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
728 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
729 smlal v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
730 smlal v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
736 smlal v20.4s, v10.4h, v0.h[0]
737 smlsl v20.4s, v11.4h, v7.h[2]
740 smlsl v22.4s, v10.4h, v0.h[0]
741 smlsl v22.4s, v11.4h, v1.h[2]
743 smlsl v16.4s, v10.4h, v0.h[0]
744 smlal v16.4s, v11.4h, v5.h[2]
746 smlal v18.4s, v10.4h, v0.h[0]
747 smlal v18.4s, v11.4h, v3.h[2]
751 ld1 {v12.4h},[x0],x6
752 ld1 {v14.4h},[x0],x6
753 ld1 {v13.4h},[x0],x6
754 ld1 {v15.4h},[x0],x6
757 smlsl v24.4s, v14.4h, v0.h[1]
758 smlal v26.4s, v14.4h, v6.h[1]
759 smlal v28.4s, v14.4h, v4.h[1]
760 smlsl v30.4s, v14.4h, v1.h[1]
763 smlsl v24.4s, v15.4h, v3.h[3]
764 smlal v26.4s, v15.4h, v0.h[1]
765 smlsl v28.4s, v15.4h, v5.h[1]
766 smlsl v30.4s, v15.4h, v6.h[1]
769 smlsl v20.4s, v12.4h, v3.h[0]
770 smlsl v20.4s, v13.4h, v1.h[2]
771 smlsl v22.4s, v12.4h, v7.h[0]
772 smlal v22.4s, v13.4h, v3.h[2]
773 smlal v16.4s, v12.4h, v1.h[0]
774 smlal v16.4s, v13.4h, v7.h[2]
775 smlsl v18.4s, v12.4h, v5.h[0]
776 smlsl v18.4s, v13.4h, v2.h[2]
778 ld1 {v10.4h},[x0],x6
779 ld1 {v8.4h},[x0],x6
780 ld1 {v11.4h},[x0],x6
781 ld1 {v9.4h},[x0],x6
786 smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
787 smlal v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1)
788 smlsl v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
789 smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3)
791 smlal v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
792 smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
793 smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
794 smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
800 smlsl v20.4s, v10.4h, v6.h[0]
801 smlal v20.4s, v11.4h, v5.h[2]
804 smlal v22.4s, v10.4h, v2.h[0]
805 smlal v22.4s, v11.4h, v7.h[2]
807 smlsl v16.4s, v10.4h, v2.h[0]
808 smlsl v16.4s, v11.4h, v4.h[2]
810 smlal v18.4s, v10.4h, v6.h[0]
811 smlal v18.4s, v11.4h, v1.h[2]
814 ld1 {v12.4h},[x0],x6
815 ld1 {v14.4h},[x0],x6
816 ld1 {v13.4h},[x0],x6
817 ld1 {v15.4h},[x0],x6
823 smlal v24.4s, v14.4h, v1.h[1]
824 smlsl v26.4s, v14.4h, v0.h[3]
825 smlal v28.4s, v14.4h, v1.h[3]
826 smlsl v30.4s, v14.4h, v3.h[1]
829 smlal v24.4s, v15.4h, v5.h[3]
830 smlsl v26.4s, v15.4h, v5.h[1]
831 smlal v28.4s, v15.4h, v4.h[3]
832 smlsl v30.4s, v15.4h, v4.h[1]
835 smlal v20.4s, v12.4h, v1.h[0]
836 smlal v20.4s, v13.4h, v3.h[2]
837 smlsl v22.4s, v12.4h, v3.h[0]
838 smlsl v22.4s, v13.4h, v2.h[2]
839 smlal v16.4s, v12.4h, v5.h[0]
840 smlal v16.4s, v13.4h, v1.h[2]
841 smlsl v18.4s, v12.4h, v7.h[0]
842 smlsl v18.4s, v13.4h, v0.h[2]
859 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
860 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
861 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
862 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
863 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
864 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
865 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
866 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
873 trn1 v24.4h, v30.4h, v12.4h
874 trn2 v25.4h, v30.4h, v12.4h
875 trn1 v26.4h, v31.4h, v13.4h
876 trn2 v27.4h, v31.4h, v13.4h
883 trn1 v24.4h, v14.4h, v18.4h
884 trn2 v25.4h, v14.4h, v18.4h
885 trn1 v26.4h, v15.4h, v19.4h
886 trn2 v27.4h, v15.4h, v19.4h
898 st1 { v30.4h, v31.4h},[x1],#16
899 st1 { v12.4h, v13.4h},[x1],#16
901 st1 { v14.4h, v15.4h},[x1],#16
902 st1 { v18.4h, v19.4h},[x1],#16
908 ld1 {v10.4h},[x0],x6
909 ld1 {v8.4h},[x0],x6
910 ld1 {v11.4h},[x0],x6
911 ld1 {v9.4h},[x0],x6
914 smull v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
915 smull v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1)
916 smull v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
917 smull v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3)
919 smlsl v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
920 smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
921 smlsl v28.4s, v9.4h, v0.h[2] //// y1 * sin3 - y3 * cos1(part of b2)
922 smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
928 smull v20.4s, v10.4h, v0.h[0]
929 smlsl v20.4s, v11.4h, v7.h[2]
932 smull v22.4s, v10.4h, v0.h[0]
933 smlsl v22.4s, v11.4h, v6.h[2]
935 smull v16.4s, v10.4h, v0.h[0]
936 smlsl v16.4s, v11.4h, v5.h[2]
938 smull v18.4s, v10.4h, v0.h[0]
939 smlsl v18.4s, v11.4h, v4.h[2]
944 ld1 {v12.4h},[x0],x6
945 ld1 {v14.4h},[x0],x6
946 ld1 {v13.4h},[x0],x6
947 ld1 {v15.4h},[x0],x6
952 smlsl v24.4s, v14.4h, v5.h[1]
953 smlsl v26.4s, v14.4h, v7.h[3]
954 smlal v28.4s, v14.4h, v5.h[3]
955 smlal v30.4s, v14.4h, v3.h[1]
958 smlal v24.4s, v15.4h, v2.h[1]
959 smlal v26.4s, v15.4h, v1.h[1]
960 smlal v28.4s, v15.4h, v4.h[3]
961 smlsl v30.4s, v15.4h, v7.h[3]
964 smlsl v20.4s, v12.4h, v1.h[0]
965 smlal v20.4s, v13.4h, v6.h[2]
966 smlsl v22.4s, v12.4h, v3.h[0]
967 smlal v22.4s, v13.4h, v3.h[2]
968 smlsl v16.4s, v12.4h, v5.h[0]
969 smlal v16.4s, v13.4h, v0.h[2]
970 smlsl v18.4s, v12.4h, v7.h[0]
971 smlal v18.4s, v13.4h, v2.h[2]
976 ld1 {v10.4h},[x0],x6
977 ld1 {v8.4h},[x0],x6
978 ld1 {v11.4h},[x0],x6
979 ld1 {v9.4h},[x0],x6
981 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
982 smlsl v26.4s, v8.4h, v5.h[1] //// y1 * cos3(part of b1)
983 smlsl v28.4s, v8.4h, v0.h[3] //// y1 * sin3(part of b2)
984 smlsl v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
986 smlsl v24.4s, v9.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
987 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
988 smlal v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
989 smlal v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
995 smlal v20.4s, v10.4h, v2.h[0]
996 smlsl v20.4s, v11.4h, v5.h[2]
999 smlal v22.4s, v10.4h, v6.h[0]
1000 smlsl v22.4s, v11.4h, v0.h[2]
1002 smlsl v16.4s, v10.4h, v6.h[0]
1003 smlsl v16.4s, v11.4h, v4.h[2]
1005 smlsl v18.4s, v10.4h, v2.h[0]
1006 smlal v18.4s, v11.4h, v6.h[2]
1012 ld1 {v12.4h},[x0],x6
1013 ld1 {v14.4h},[x0],x6
1014 ld1 {v13.4h},[x0],x6
1015 ld1 {v15.4h},[x0],x6
1022 smlsl v24.4s, v14.4h, v7.h[1]
1023 smlal v26.4s, v14.4h, v2.h[1]
1024 smlal v28.4s, v14.4h, v4.h[1]
1025 smlsl v30.4s, v14.4h, v5.h[1]
1028 smlal v24.4s, v15.4h, v0.h[3]
1029 smlal v26.4s, v15.4h, v7.h[1]
1030 smlsl v28.4s, v15.4h, v1.h[1]
1031 smlsl v30.4s, v15.4h, v6.h[1]
1034 smlsl v20.4s, v12.4h, v3.h[0]
1035 smlal v20.4s, v13.4h, v4.h[2]
1036 smlal v22.4s, v12.4h, v7.h[0]
1037 smlal v22.4s, v13.4h, v2.h[2]
1038 smlal v16.4s, v12.4h, v1.h[0]
1039 smlsl v16.4s, v13.4h, v6.h[2]
1040 smlal v18.4s, v12.4h, v5.h[0]
1041 smlsl v18.4s, v13.4h, v0.h[2]
1048 ld1 {v10.4h},[x0],x6
1049 ld1 {v8.4h},[x0],x6
1050 ld1 {v11.4h},[x0],x6
1051 ld1 {v9.4h},[x0],x6
1054 smlsl v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
1055 smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3(part of b1)
1056 smlal v28.4s, v8.4h, v6.h[3] //// y1 * sin3(part of b2)
1057 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
1059 smlsl v24.4s, v9.4h, v0.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
1060 smlal v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1061 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1062 smlsl v30.4s, v9.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
1068 smlal v20.4s, v10.4h, v0.h[0]
1069 smlsl v20.4s, v11.4h, v3.h[2]
1072 smlsl v22.4s, v10.4h, v0.h[0]
1073 smlsl v22.4s, v11.4h, v5.h[2]
1075 smlsl v16.4s, v10.4h, v0.h[0]
1076 smlal v16.4s, v11.4h, v1.h[2]
1078 smlal v18.4s, v10.4h, v0.h[0]
1079 smlal v18.4s, v11.4h, v7.h[2]
1082 ld1 {v12.4h},[x0],x6
1083 ld1 {v14.4h},[x0],x6
1084 ld1 {v13.4h},[x0],x6
1085 ld1 {v15.4h},[x0],x6
1089 smlal v24.4s, v14.4h, v6.h[3]
1090 smlal v26.4s, v14.4h, v3.h[3]
1091 smlsl v28.4s, v14.4h, v1.h[3]
1092 smlal v30.4s, v14.4h, v7.h[1]
1095 smlal v24.4s, v15.4h, v1.h[3]
1096 smlsl v26.4s, v15.4h, v2.h[3]
1097 smlal v28.4s, v15.4h, v7.h[1]
1098 smlal v30.4s, v15.4h, v4.h[1]
1101 smlsl v20.4s, v12.4h, v5.h[0]
1102 smlal v20.4s, v13.4h, v2.h[2]
1103 smlal v22.4s, v12.4h, v1.h[0]
1104 smlsl v22.4s, v13.4h, v7.h[2]
1105 smlsl v16.4s, v12.4h, v7.h[0]
1106 smlsl v16.4s, v13.4h, v3.h[2]
1107 smlsl v18.4s, v12.4h, v3.h[0]
1108 smlal v18.4s, v13.4h, v1.h[2]
1112 ld1 {v10.4h},[x0],x6
1113 ld1 {v8.4h},[x0],x6
1114 ld1 {v11.4h},[x0],x6
1115 ld1 {v9.4h},[x0],x6
1120 smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0)
1121 smlsl v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
1122 smlal v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2)
1123 smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3)
1125 smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1126 smlal v26.4s, v9.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1127 smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1128 smlal v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
1134 smlal v20.4s, v10.4h, v6.h[0]
1135 smlsl v20.4s, v11.4h, v1.h[2]
1138 smlsl v22.4s, v10.4h, v2.h[0]
1139 smlal v22.4s, v11.4h, v4.h[2]
1141 smlal v16.4s, v10.4h, v2.h[0]
1142 smlsl v16.4s, v11.4h, v7.h[2]
1144 smlsl v18.4s, v10.4h, v6.h[0]
1145 smlsl v18.4s, v11.4h, v5.h[2]
1148 ld1 {v12.4h},[x0],x6
1149 ld1 {v14.4h},[x0],x6
1150 ld1 {v13.4h},[x0],x6
1151 ld1 {v15.4h},[x0],x6
1153 smlal v24.4s, v14.4h, v4.h[3]
1154 smlsl v26.4s, v14.4h, v6.h[1]
1155 smlal v28.4s, v14.4h, v7.h[3]
1156 smlal v30.4s, v14.4h, v6.h[3]
1159 smlal v24.4s, v15.4h, v3.h[3]
1160 smlsl v26.4s, v15.4h, v3.h[1]
1161 smlal v28.4s, v15.4h, v2.h[3]
1162 smlsl v30.4s, v15.4h, v2.h[1]
1165 smlsl v20.4s, v12.4h, v7.h[0]
1166 smlal v20.4s, v13.4h, v0.h[2]
1167 smlal v22.4s, v12.4h, v5.h[0]
1168 smlsl v22.4s, v13.4h, v1.h[2]
1169 smlsl v16.4s, v12.4h, v3.h[0]
1170 smlal v16.4s, v13.4h, v2.h[2]
1171 smlal v18.4s, v12.4h, v1.h[0]
1172 smlsl v18.4s, v13.4h, v3.h[2]
1189 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
1190 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
1191 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
1192 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
1193 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
1194 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
1195 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
1196 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
1203 trn1 v24.4h, v30.4h, v12.4h
1204 trn2 v25.4h, v30.4h, v12.4h
1205 trn1 v26.4h, v31.4h, v13.4h
1206 trn2 v27.4h, v31.4h, v13.4h
1213 trn1 v24.4h, v14.4h, v18.4h
1214 trn2 v25.4h, v14.4h, v18.4h
1215 trn1 v26.4h, v15.4h, v19.4h
1216 trn2 v27.4h, v15.4h, v19.4h
1227 st1 { v30.4h, v31.4h},[x1],#16
1228 st1 { v12.4h, v13.4h},[x1],#16
1230 st1 { v14.4h, v15.4h},[x1],#16
1231 st1 { v18.4h, v19.4h},[x1],#16
1238 ld1 {v10.4h},[x0],x6
1239 ld1 {v8.4h},[x0],x6
1240 ld1 {v11.4h},[x0],x6
1241 ld1 {v9.4h},[x0],x6
1244 smull v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
1245 smull v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
1246 smull v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2)
1247 smull v30.4s, v8.4h, v7.h[3] //// y1 * sin1(part of b3)
1249 smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1250 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1251 smlsl v28.4s, v9.4h, v5.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1252 smlsl v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1258 smull v20.4s, v10.4h, v0.h[0]
1259 smlsl v20.4s, v11.4h, v3.h[2]
1262 smull v22.4s, v10.4h, v0.h[0]
1263 smlsl v22.4s, v11.4h, v2.h[2]
1265 smull v16.4s, v10.4h, v0.h[0]
1266 smlsl v16.4s, v11.4h, v1.h[2]
1268 smull v18.4s, v10.4h, v0.h[0]
1269 smlsl v18.4s, v11.4h, v0.h[2]
1274 ld1 {v12.4h},[x0],x6
1275 ld1 {v14.4h},[x0],x6
1276 ld1 {v13.4h},[x0],x6
1277 ld1 {v15.4h},[x0],x6
1284 smlal v24.4s, v14.4h, v0.h[1]
1285 smlal v26.4s, v14.4h, v1.h[3]
1286 smlal v28.4s, v14.4h, v4.h[1]
1287 smlal v30.4s, v14.4h, v6.h[3]
1290 smlsl v24.4s, v15.4h, v4.h[1]
1291 smlsl v26.4s, v15.4h, v0.h[3]
1292 smlsl v28.4s, v15.4h, v2.h[3]
1293 smlsl v30.4s, v15.4h, v6.h[1]
1296 smlal v20.4s, v12.4h, v7.h[0]
1297 smlal v20.4s, v13.4h, v5.h[2]
1298 smlal v22.4s, v12.4h, v5.h[0]
1299 smlsl v22.4s, v13.4h, v7.h[2]
1300 smlal v16.4s, v12.4h, v3.h[0]
1301 smlsl v16.4s, v13.4h, v4.h[2]
1302 smlal v18.4s, v12.4h, v1.h[0]
1303 smlsl v18.4s, v13.4h, v1.h[2]
1308 ld1 {v10.4h},[x0],x6
1309 ld1 {v8.4h},[x0],x6
1310 ld1 {v11.4h},[x0],x6
1311 ld1 {v9.4h},[x0],x6
1315 smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
1316 smlal v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1)
1317 smlal v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
1318 smlal v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3)
1320 smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1321 smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1322 smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1323 smlsl v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1329 smlsl v20.4s, v10.4h, v2.h[0]
1330 smlal v20.4s, v11.4h, v1.h[2]
1333 smlsl v22.4s, v10.4h, v6.h[0]
1334 smlal v22.4s, v11.4h, v3.h[2]
1336 smlal v16.4s, v10.4h, v6.h[0]
1337 smlsl v16.4s, v11.4h, v7.h[2]
1339 smlal v18.4s, v10.4h, v2.h[0]
1340 smlsl v18.4s, v11.4h, v2.h[2]
1346 ld1 {v12.4h},[x0],x6
1347 ld1 {v14.4h},[x0],x6
1348 ld1 {v13.4h},[x0],x6
1349 ld1 {v15.4h},[x0],x6
1356 smlsl v24.4s, v14.4h, v1.h[1]
1357 smlsl v26.4s, v14.4h, v7.h[3]
1358 smlal v28.4s, v14.4h, v1.h[3]
1359 smlal v30.4s, v14.4h, v4.h[3]
1362 smlal v24.4s, v15.4h, v2.h[1]
1363 smlal v26.4s, v15.4h, v5.h[1]
1364 smlsl v28.4s, v15.4h, v3.h[1]
1365 smlsl v30.4s, v15.4h, v4.h[1]
1368 smlsl v20.4s, v12.4h, v5.h[0]
1369 smlsl v20.4s, v13.4h, v7.h[2]
1370 smlsl v22.4s, v12.4h, v1.h[0]
1371 smlal v22.4s, v13.4h, v1.h[2]
1372 smlsl v16.4s, v12.4h, v7.h[0]
1373 smlal v16.4s, v13.4h, v5.h[2]
1374 smlal v18.4s, v12.4h, v3.h[0]
1375 smlsl v18.4s, v13.4h, v3.h[2]
1381 ld1 {v10.4h},[x0],x6
1382 ld1 {v8.4h},[x0],x6
1383 ld1 {v11.4h},[x0],x6
1384 ld1 {v9.4h},[x0],x6
1387 smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0)
1388 smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
1389 smlal v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2)
1390 smlal v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
1392 smlsl v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1393 smlal v26.4s, v9.4h, v0.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1394 smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1395 smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1401 smlal v20.4s, v10.4h, v0.h[0]
1402 smlsl v20.4s, v11.4h, v0.h[2]
1405 smlsl v22.4s, v10.4h, v0.h[0]
1406 smlal v22.4s, v11.4h, v6.h[2]
1408 smlsl v16.4s, v10.4h, v0.h[0]
1409 smlal v16.4s, v11.4h, v2.h[2]
1411 smlal v18.4s, v10.4h, v0.h[0]
1412 smlsl v18.4s, v11.4h, v4.h[2]
1417 ld1 {v12.4h},[x0],x6
1418 ld1 {v14.4h},[x0],x6
1419 ld1 {v13.4h},[x0],x6
1420 ld1 {v15.4h},[x0],x6
1427 smlal v24.4s, v14.4h, v3.h[1]
1428 smlsl v26.4s, v14.4h, v2.h[1]
1429 smlal v28.4s, v14.4h, v7.h[3]
1430 smlal v30.4s, v14.4h, v2.h[3]
1433 smlsl v24.4s, v15.4h, v0.h[3]
1434 smlal v26.4s, v15.4h, v4.h[3]
1435 smlal v28.4s, v15.4h, v6.h[3]
1436 smlsl v30.4s, v15.4h, v2.h[1]
1439 smlal v20.4s, v12.4h, v3.h[0]
1440 smlsl v20.4s, v13.4h, v6.h[2]
1441 smlal v22.4s, v12.4h, v7.h[0]
1442 smlsl v22.4s, v13.4h, v4.h[2]
1443 smlsl v16.4s, v12.4h, v1.h[0]
1444 smlal v16.4s, v13.4h, v0.h[2]
1445 smlal v18.4s, v12.4h, v5.h[0]
1446 smlsl v18.4s, v13.4h, v5.h[2]
1449 ld1 {v10.4h},[x0],x6
1450 ld1 {v8.4h},[x0],x6
1451 ld1 {v11.4h},[x0],x6
1452 ld1 {v9.4h},[x0],x6
1458 smlal v24.4s, v8.4h, v3.h[3] //// y1 * cos1(part of b0)
1459 smlsl v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1)
1460 smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
1461 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
1463 smlsl v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
1464 smlsl v26.4s, v9.4h, v6.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1465 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1466 smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1472 smlsl v20.4s, v10.4h, v6.h[0]
1473 smlal v20.4s, v11.4h, v2.h[2]
1476 smlal v22.4s, v10.4h, v2.h[0]
1477 smlsl v22.4s, v11.4h, v0.h[2]
1479 smlsl v16.4s, v10.4h, v2.h[0]
1480 smlal v16.4s, v11.4h, v3.h[2]
1482 smlal v18.4s, v10.4h, v6.h[0]
1483 smlsl v18.4s, v11.4h, v6.h[2]
1486 ld1 {v12.4h},[x0],x6
1487 ld1 {v14.4h},[x0],x6
1488 ld1 {v13.4h},[x0],x6
1489 ld1 {v15.4h},[x0],x6
1494 smlsl v24.4s, v14.4h, v5.h[1]
1495 smlal v26.4s, v14.4h, v3.h[3]
1496 smlsl v28.4s, v14.4h, v2.h[1]
1497 smlal v30.4s, v14.4h, v0.h[3]
1500 smlal v24.4s, v15.4h, v1.h[3]
1501 smlsl v26.4s, v15.4h, v1.h[1]
1502 smlal v28.4s, v15.4h, v0.h[3]
1503 smlsl v30.4s, v15.4h, v0.h[1]
1506 smlsl v20.4s, v12.4h, v1.h[0]
1507 smlal v20.4s, v13.4h, v4.h[2]
1508 smlal v22.4s, v12.4h, v3.h[0]
1509 smlsl v22.4s, v13.4h, v5.h[2]
1510 smlsl v16.4s, v12.4h, v5.h[0]
1511 smlal v16.4s, v13.4h, v6.h[2]
1512 smlal v18.4s, v12.4h, v7.h[0]
1513 smlsl v18.4s, v13.4h, v7.h[2]
1530 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
1531 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
1532 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
1533 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
1534 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
1535 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
1536 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
1537 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
1544 trn1 v24.4h, v30.4h, v12.4h
1545 trn2 v25.4h, v30.4h, v12.4h
1546 trn1 v26.4h, v31.4h, v13.4h
1547 trn2 v27.4h, v31.4h, v13.4h
1554 trn1 v24.4h, v14.4h, v18.4h
1555 trn2 v25.4h, v14.4h, v18.4h
1556 trn1 v26.4h, v15.4h, v19.4h
1557 trn2 v27.4h, v15.4h, v19.4h
1569 st1 { v30.4h, v31.4h},[x1],#16
1570 st1 { v12.4h, v13.4h},[x1],#16
1571 st1 { v14.4h, v15.4h},[x1],#16
1572 st1 { v18.4h, v19.4h},[x1],#16
1615 ld1 {v10.4h, v11.4h},[x1],#16
1616 ld1 {v8.4h, v9.4h},[x1],x10
1618 smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0)
1619 smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1)
1620 smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
1621 smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
1623 smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1624 smlal v26.4s, v9.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1625 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1626 smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1630 smull v20.4s, v10.4h, v0.h[0]
1631 smlal v20.4s, v11.4h, v0.h[2]
1634 smull v22.4s, v10.4h, v0.h[0]
1635 smlal v22.4s, v11.4h, v1.h[2]
1637 smull v16.4s, v10.4h, v0.h[0]
1638 smlal v16.4s, v11.4h, v2.h[2]
1640 smull v18.4s, v10.4h, v0.h[0]
1641 smlal v18.4s, v11.4h, v3.h[2]
1645 ld1 {v12.4h, v13.4h},[x1],#16
1646 ld1 {v14.4h, v15.4h},[x1],x10
1653 smlal v24.4s, v14.4h, v1.h[1]
1654 smlal v26.4s, v14.4h, v3.h[3]
1655 smlal v28.4s, v14.4h, v6.h[1]
1656 smlsl v30.4s, v14.4h, v7.h[1]
1659 smlal v24.4s, v15.4h, v1.h[3]
1660 smlal v26.4s, v15.4h, v5.h[1]
1661 smlsl v28.4s, v15.4h, v7.h[1]
1662 smlsl v30.4s, v15.4h, v3.h[3]
1665 smlal v20.4s, v12.4h, v1.h[0]
1666 smlal v20.4s, v13.4h, v1.h[2]
1667 smlal v22.4s, v12.4h, v3.h[0]
1668 smlal v22.4s, v13.4h, v4.h[2]
1669 smlal v16.4s, v12.4h, v5.h[0]
1670 smlal v16.4s, v13.4h, v7.h[2]
1671 smlal v18.4s, v12.4h, v7.h[0]
1672 smlsl v18.4s, v13.4h, v5.h[2]
1676 ld1 {v10.4h, v11.4h},[x1],#16
1677 ld1 {v8.4h, v9.4h},[x1],x10
1679 smlal v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0)
1680 smlal v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
1681 smlsl v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2)
1682 smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3)
1684 smlal v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1685 smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1686 smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1687 smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1693 smlal v20.4s, v10.4h, v2.h[0]
1694 smlal v20.4s, v11.4h, v2.h[2]
1697 smlal v22.4s, v10.4h, v6.h[0]
1698 smlal v22.4s, v11.4h, v7.h[2]
1700 smlsl v16.4s, v10.4h, v6.h[0]
1701 smlsl v16.4s, v11.4h, v3.h[2]
1703 smlsl v18.4s, v10.4h, v2.h[0]
1704 smlsl v18.4s, v11.4h, v1.h[2]
1710 ld1 {v12.4h, v13.4h},[x1],#16
1711 ld1 {v14.4h, v15.4h},[x1],x10
1717 smlal v24.4s, v14.4h, v3.h[1]
1718 smlsl v26.4s, v14.4h, v6.h[1]
1719 smlsl v28.4s, v14.4h, v0.h[1]
1720 smlsl v30.4s, v14.4h, v6.h[3]
1723 smlal v24.4s, v15.4h, v3.h[3]
1724 smlsl v26.4s, v15.4h, v4.h[3]
1725 smlsl v28.4s, v15.4h, v2.h[3]
1726 smlal v30.4s, v15.4h, v5.h[3]
1729 smlal v20.4s, v12.4h, v3.h[0]
1730 smlal v20.4s, v13.4h, v3.h[2]
1731 smlsl v22.4s, v12.4h, v7.h[0]
1732 smlsl v22.4s, v13.4h, v5.h[2]
1733 smlsl v16.4s, v12.4h, v1.h[0]
1734 smlsl v16.4s, v13.4h, v1.h[2]
1735 smlsl v18.4s, v12.4h, v5.h[0]
1736 smlal v18.4s, v13.4h, v7.h[2]
1742 ld1 {v10.4h, v11.4h},[x1],#16
1743 ld1 {v8.4h, v9.4h},[x1],x10
1746 smlal v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
1747 smlsl v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1)
1748 smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
1749 smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3)
1751 smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1752 smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1753 smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1754 smlal v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1760 smlal v20.4s, v10.4h, v0.h[0]
1761 smlal v20.4s, v11.4h, v4.h[2]
1764 smlsl v22.4s, v10.4h, v0.h[0]
1765 smlsl v22.4s, v11.4h, v2.h[2]
1767 smlsl v16.4s, v10.4h, v0.h[0]
1768 smlsl v16.4s, v11.4h, v6.h[2]
1770 smlal v18.4s, v10.4h, v0.h[0]
1771 smlal v18.4s, v11.4h, v0.h[2]
1773 ld1 {v12.4h, v13.4h},[x1],#16
1774 ld1 {v14.4h, v15.4h},[x1],x10
1780 smlal v24.4s, v14.4h, v5.h[1]
1781 smlsl v26.4s, v14.4h, v0.h[2]
1782 smlal v28.4s, v14.4h, v5.h[3]
1783 smlal v30.4s, v14.4h, v4.h[3]
1786 smlal v24.4s, v15.4h, v5.h[3]
1787 smlsl v26.4s, v15.4h, v1.h[1]
1788 smlal v28.4s, v15.4h, v3.h[1]
1789 smlsl v30.4s, v15.4h, v7.h[3]
1792 smlal v20.4s, v12.4h, v5.h[0]
1793 smlal v20.4s, v13.4h, v5.h[2]
1794 smlsl v22.4s, v12.4h, v1.h[0]
1795 smlsl v22.4s, v13.4h, v0.h[2]
1796 smlal v16.4s, v12.4h, v7.h[0]
1797 smlal v16.4s, v13.4h, v4.h[2]
1798 smlal v18.4s, v12.4h, v3.h[0]
1799 smlal v18.4s, v13.4h, v6.h[2]
1802 ld1 {v10.4h, v11.4h},[x1],#16
1803 ld1 {v8.4h, v9.4h},[x1],x10
1808 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
1809 smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
1810 smlal v28.4s, v8.4h, v0.h[1] //// y1 * sin3(part of b2)
1811 smlsl v30.4s, v8.4h, v4.h[1] //// y1 * sin1(part of b3)
1813 smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1814 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1815 smlal v28.4s, v9.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1816 smlsl v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1822 smlal v20.4s, v10.4h, v6.h[0]
1823 smlal v20.4s, v11.4h, v6.h[2]
1826 smlsl v22.4s, v10.4h, v2.h[0]
1827 smlsl v22.4s, v11.4h, v3.h[2]
1829 smlal v16.4s, v10.4h, v2.h[0]
1830 smlal v16.4s, v11.4h, v0.h[2]
1832 smlsl v18.4s, v10.4h, v6.h[0]
1833 smlsl v18.4s, v11.4h, v2.h[2]
1835 ld1 {v12.4h, v13.4h},[x1],#16
1836 ld1 {v14.4h, v15.4h},[x1],x10
1838 smlal v24.4s, v14.4h, v7.h[1]
1839 smlsl v26.4s, v14.4h, v5.h[3]
1840 smlal v28.4s, v14.4h, v4.h[1]
1841 smlsl v30.4s, v14.4h, v2.h[3]
1844 smlal v24.4s, v15.4h, v7.h[3]
1845 smlsl v26.4s, v15.4h, v7.h[1]
1846 smlal v28.4s, v15.4h, v6.h[3]
1847 smlsl v30.4s, v15.4h, v6.h[1]
1850 smlal v20.4s, v12.4h, v7.h[0]
1851 smlal v20.4s, v13.4h, v7.h[2]
1852 smlsl v22.4s, v12.4h, v5.h[0]
1853 smlsl v22.4s, v13.4h, v6.h[2]
1854 smlal v16.4s, v12.4h, v3.h[0]
1855 smlal v16.4s, v13.4h, v5.h[2]
1856 smlsl v18.4s, v12.4h, v1.h[0]
1857 smlsl v18.4s, v13.4h, v4.h[2]
1874 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
1875 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
1876 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
1877 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
1878 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
1879 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
1880 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
1881 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
1889 trn1 v24.4h, v30.4h, v12.4h
1890 trn2 v25.4h, v30.4h, v12.4h
1891 trn1 v26.4h, v31.4h, v13.4h
1892 trn2 v27.4h, v31.4h, v13.4h
1899 trn1 v24.4h, v14.4h, v18.4h
1900 trn2 v25.4h, v14.4h, v18.4h
1901 trn1 v26.4h, v15.4h, v19.4h
1902 trn2 v27.4h, v15.4h, v19.4h
1914 st1 { v30.4h, v31.4h},[x0],#16
1915 st1 { v12.4h, v13.4h},[x0],#16
1916 st1 { v14.4h, v15.4h},[x0],#16
1917 st1 { v18.4h, v19.4h},[x0],#16
1926 ld1 {v10.4h, v11.4h},[x1],#16
1927 ld1 {v8.4h, v9.4h},[x1],x10
1930 smull v24.4s, v8.4h, v2.h[1] //// y1 * cos1(part of b0)
1931 smull v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
1932 smull v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2)
1933 smull v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
1935 smlal v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1936 smlsl v26.4s, v9.4h, v7.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1937 smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1938 smlsl v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
1944 smull v20.4s, v10.4h, v0.h[0]
1945 smlal v20.4s, v11.4h, v4.h[2]
1948 smull v22.4s, v10.4h, v0.h[0]
1949 smlal v22.4s, v11.4h, v5.h[2]
1951 smull v16.4s, v10.4h, v0.h[0]
1952 smlal v16.4s, v11.4h, v6.h[2]
1954 smull v18.4s, v10.4h, v0.h[0]
1955 smlal v18.4s, v11.4h, v7.h[2]
1960 ld1 {v12.4h, v13.4h},[x1],#16
1961 ld1 {v14.4h, v15.4h},[x1],x10
1964 smlsl v24.4s, v14.4h, v4.h[3]
1965 smlsl v26.4s, v14.4h, v2.h[1]
1966 smlsl v28.4s, v14.4h, v0.h[1]
1967 smlsl v30.4s, v14.4h, v2.h[3]
1970 smlsl v24.4s, v15.4h, v0.h[3]
1971 smlsl v26.4s, v15.4h, v3.h[1]
1972 smlsl v28.4s, v15.4h, v6.h[3]
1973 smlal v30.4s, v15.4h, v5.h[3]
1976 smlsl v20.4s, v12.4h, v7.h[0]
1977 smlsl v20.4s, v13.4h, v2.h[2]
1978 smlsl v22.4s, v12.4h, v5.h[0]
1979 smlsl v22.4s, v13.4h, v0.h[2]
1980 smlsl v16.4s, v12.4h, v3.h[0]
1981 smlsl v16.4s, v13.4h, v3.h[2]
1982 smlsl v18.4s, v12.4h, v1.h[0]
1983 smlsl v18.4s, v13.4h, v6.h[2]
1988 ld1 {v10.4h, v11.4h},[x1],#16
1989 ld1 {v8.4h, v9.4h},[x1],x10
1995 smlsl v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
1996 smlal v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1)
1997 smlal v28.4s, v8.4h, v2.h[3] //// y1 * sin3(part of b2)
1998 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
2000 smlal v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2001 smlal v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2002 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2003 smlsl v30.4s, v9.4h, v6.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
2009 smlsl v20.4s, v10.4h, v2.h[0]
2010 smlsl v20.4s, v11.4h, v6.h[2]
2013 smlsl v22.4s, v10.4h, v6.h[0]
2014 smlal v22.4s, v11.4h, v4.h[2]
2016 smlal v16.4s, v10.4h, v6.h[0]
2017 smlal v16.4s, v11.4h, v0.h[2]
2019 smlal v18.4s, v10.4h, v2.h[0]
2020 smlal v18.4s, v11.4h, v5.h[2]
2026 ld1 {v12.4h, v13.4h},[x1],#16
2027 ld1 {v14.4h, v15.4h},[x1],x10
2034 smlal v24.4s, v14.4h, v2.h[3]
2035 smlal v26.4s, v14.4h, v3.h[3]
2036 smlsl v28.4s, v14.4h, v5.h[3]
2037 smlsl v30.4s, v14.4h, v0.h[3]
2040 smlal v24.4s, v15.4h, v1.h[3]
2041 smlsl v26.4s, v15.4h, v6.h[3]
2042 smlsl v28.4s, v15.4h, v0.h[3]
2043 smlal v30.4s, v15.4h, v7.h[3]
2046 smlal v20.4s, v12.4h, v5.h[0]
2047 smlal v20.4s, v13.4h, v0.h[2]
2048 smlal v22.4s, v12.4h, v1.h[0]
2049 smlal v22.4s, v13.4h, v6.h[2]
2050 smlal v16.4s, v12.4h, v7.h[0]
2051 smlsl v16.4s, v13.4h, v2.h[2]
2052 smlsl v18.4s, v12.4h, v3.h[0]
2053 smlsl v18.4s, v13.4h, v4.h[2]
2059 ld1 {v10.4h, v11.4h},[x1],#16
2060 ld1 {v8.4h, v9.4h},[x1],x10
2064 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
2065 smlsl v26.4s, v8.4h, v1.h[1] //// y1 * cos3(part of b1)
2066 smlsl v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2)
2067 smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1(part of b3)
2069 smlsl v24.4s, v9.4h, v5.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2070 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2071 smlal v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2072 smlal v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2078 smlal v20.4s, v10.4h, v0.h[0]
2079 smlsl v20.4s, v11.4h, v7.h[2]
2082 smlsl v22.4s, v10.4h, v0.h[0]
2083 smlsl v22.4s, v11.4h, v1.h[2]
2085 smlsl v16.4s, v10.4h, v0.h[0]
2086 smlal v16.4s, v11.4h, v5.h[2]
2088 smlal v18.4s, v10.4h, v0.h[0]
2089 smlal v18.4s, v11.4h, v3.h[2]
2091 ld1 {v12.4h, v13.4h},[x1],#16
2092 ld1 {v14.4h, v15.4h},[x1],x10
2097 smlsl v24.4s, v14.4h, v0.h[1]
2098 smlal v26.4s, v14.4h, v6.h[1]
2099 smlal v28.4s, v14.4h, v4.h[1]
2100 smlsl v30.4s, v14.4h, v1.h[1]
2103 smlsl v24.4s, v15.4h, v3.h[3]
2104 smlal v26.4s, v15.4h, v0.h[1]
2105 smlsl v28.4s, v15.4h, v5.h[1]
2106 smlsl v30.4s, v15.4h, v6.h[1]
2109 smlsl v20.4s, v12.4h, v3.h[0]
2110 smlsl v20.4s, v13.4h, v1.h[2]
2111 smlsl v22.4s, v12.4h, v7.h[0]
2112 smlal v22.4s, v13.4h, v3.h[2]
2113 smlal v16.4s, v12.4h, v1.h[0]
2114 smlal v16.4s, v13.4h, v7.h[2]
2115 smlsl v18.4s, v12.4h, v5.h[0]
2116 smlsl v18.4s, v13.4h, v2.h[2]
2119 ld1 {v10.4h, v11.4h},[x1],#16
2120 ld1 {v8.4h, v9.4h},[x1],x10
2123 smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
2124 smlal v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1)
2125 smlsl v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
2126 smlal v30.4s, v8.4h, v2.h[1] //// y1 * sin1(part of b3)
2128 smlal v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2129 smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2130 smlsl v28.4s, v9.4h, v7.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2131 smlal v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2137 smlsl v20.4s, v10.4h, v6.h[0]
2138 smlal v20.4s, v11.4h, v5.h[2]
2141 smlal v22.4s, v10.4h, v2.h[0]
2142 smlal v22.4s, v11.4h, v7.h[2]
2144 smlsl v16.4s, v10.4h, v2.h[0]
2145 smlsl v16.4s, v11.4h, v4.h[2]
2147 smlal v18.4s, v10.4h, v6.h[0]
2148 smlal v18.4s, v11.4h, v1.h[2]
2151 ld1 {v12.4h, v13.4h},[x1],#16
2152 ld1 {v14.4h, v15.4h},[x1],x10
2156 smlal v24.4s, v14.4h, v1.h[1]
2157 smlsl v26.4s, v14.4h, v0.h[3]
2158 smlal v28.4s, v14.4h, v1.h[3]
2159 smlsl v30.4s, v14.4h, v3.h[1]
2162 smlal v24.4s, v15.4h, v5.h[3]
2163 smlsl v26.4s, v15.4h, v5.h[1]
2164 smlal v28.4s, v15.4h, v4.h[3]
2165 smlsl v30.4s, v15.4h, v4.h[1]
2168 smlal v20.4s, v12.4h, v1.h[0]
2169 smlal v20.4s, v13.4h, v3.h[2]
2170 smlsl v22.4s, v12.4h, v3.h[0]
2171 smlsl v22.4s, v13.4h, v2.h[2]
2172 smlal v16.4s, v12.4h, v5.h[0]
2173 smlal v16.4s, v13.4h, v1.h[2]
2174 smlsl v18.4s, v12.4h, v7.h[0]
2175 smlsl v18.4s, v13.4h, v0.h[2]
2192 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
2193 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
2194 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
2195 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
2196 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
2197 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
2198 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
2199 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
2206 trn1 v24.4h, v30.4h, v12.4h
2207 trn2 v25.4h, v30.4h, v12.4h
2208 trn1 v26.4h, v31.4h, v13.4h
2209 trn2 v27.4h, v31.4h, v13.4h
2216 trn1 v24.4h, v14.4h, v18.4h
2217 trn2 v25.4h, v14.4h, v18.4h
2218 trn1 v26.4h, v15.4h, v19.4h
2219 trn2 v27.4h, v15.4h, v19.4h
2231 st1 { v30.4h, v31.4h},[x0],#16
2232 st1 { v12.4h, v13.4h},[x0],#16
2233 st1 { v14.4h, v15.4h},[x0],#16
2234 st1 { v18.4h, v19.4h},[x0],#16
2242 ld1 {v10.4h, v11.4h},[x1],#16
2243 ld1 {v8.4h, v9.4h},[x1],x10
2245 smull v24.4s, v8.4h, v4.h[1] //// y1 * cos1(part of b0)
2246 smull v26.4s, v8.4h, v4.h[3] //// y1 * cos3(part of b1)
2247 smull v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
2248 smull v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3)
2250 smlsl v24.4s, v9.4h, v3.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2251 smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2252 smlsl v28.4s, v9.4h, v0.h[2] //// y1 * sin3 - y3 * cos1(part of b2)
2253 smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2259 smull v20.4s, v10.4h, v0.h[0]
2260 smlsl v20.4s, v11.4h, v7.h[2]
2263 smull v22.4s, v10.4h, v0.h[0]
2264 smlsl v22.4s, v11.4h, v6.h[2]
2266 smull v16.4s, v10.4h, v0.h[0]
2267 smlsl v16.4s, v11.4h, v5.h[2]
2269 smull v18.4s, v10.4h, v0.h[0]
2270 smlsl v18.4s, v11.4h, v4.h[2]
2275 ld1 {v12.4h, v13.4h},[x1],#16
2276 ld1 {v14.4h, v15.4h},[x1],x10
2278 smlsl v24.4s, v14.4h, v5.h[1]
2279 smlsl v26.4s, v14.4h, v7.h[3]
2280 smlal v28.4s, v14.4h, v5.h[3]
2281 smlal v30.4s, v14.4h, v3.h[1]
2284 smlal v24.4s, v15.4h, v2.h[1]
2285 smlal v26.4s, v15.4h, v1.h[1]
2286 smlal v28.4s, v15.4h, v4.h[3]
2287 smlsl v30.4s, v15.4h, v7.h[3]
2290 smlsl v20.4s, v12.4h, v1.h[0]
2291 smlal v20.4s, v13.4h, v6.h[2]
2292 smlsl v22.4s, v12.4h, v3.h[0]
2293 smlal v22.4s, v13.4h, v3.h[2]
2294 smlsl v16.4s, v12.4h, v5.h[0]
2295 smlal v16.4s, v13.4h, v0.h[2]
2296 smlsl v18.4s, v12.4h, v7.h[0]
2297 smlal v18.4s, v13.4h, v2.h[2]
2302 ld1 {v10.4h, v11.4h},[x1],#16
2303 ld1 {v8.4h, v9.4h},[x1],x10
2307 smlal v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
2308 smlsl v26.4s, v8.4h, v5.h[1] //// y1 * cos3(part of b1)
2309 smlsl v28.4s, v8.4h, v0.h[3] //// y1 * sin3(part of b2)
2310 smlsl v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
2312 smlsl v24.4s, v9.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2313 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2314 smlal v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2315 smlal v30.4s, v9.4h, v0.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2321 smlal v20.4s, v10.4h, v2.h[0]
2322 smlsl v20.4s, v11.4h, v5.h[2]
2325 smlal v22.4s, v10.4h, v6.h[0]
2326 smlsl v22.4s, v11.4h, v0.h[2]
2328 smlsl v16.4s, v10.4h, v6.h[0]
2329 smlsl v16.4s, v11.4h, v4.h[2]
2331 smlsl v18.4s, v10.4h, v2.h[0]
2332 smlal v18.4s, v11.4h, v6.h[2]
2337 ld1 {v12.4h, v13.4h},[x1],#16
2338 ld1 {v14.4h, v15.4h},[x1],x10
2344 smlsl v24.4s, v14.4h, v7.h[1]
2345 smlal v26.4s, v14.4h, v2.h[1]
2346 smlal v28.4s, v14.4h, v4.h[1]
2347 smlsl v30.4s, v14.4h, v5.h[1]
2350 smlal v24.4s, v15.4h, v0.h[3]
2351 smlal v26.4s, v15.4h, v7.h[1]
2352 smlsl v28.4s, v15.4h, v1.h[1]
2353 smlsl v30.4s, v15.4h, v6.h[1]
2356 smlsl v20.4s, v12.4h, v3.h[0]
2357 smlal v20.4s, v13.4h, v4.h[2]
2358 smlal v22.4s, v12.4h, v7.h[0]
2359 smlal v22.4s, v13.4h, v2.h[2]
2360 smlal v16.4s, v12.4h, v1.h[0]
2361 smlsl v16.4s, v13.4h, v6.h[2]
2362 smlal v18.4s, v12.4h, v5.h[0]
2363 smlsl v18.4s, v13.4h, v0.h[2]
2369 ld1 {v10.4h, v11.4h},[x1],#16
2370 ld1 {v8.4h, v9.4h},[x1],x10
2373 smlsl v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
2374 smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3(part of b1)
2375 smlal v28.4s, v8.4h, v6.h[3] //// y1 * sin3(part of b2)
2376 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
2378 smlsl v24.4s, v9.4h, v0.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2379 smlal v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2380 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2381 smlsl v30.4s, v9.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
2387 smlal v20.4s, v10.4h, v0.h[0]
2388 smlsl v20.4s, v11.4h, v3.h[2]
2391 smlsl v22.4s, v10.4h, v0.h[0]
2392 smlsl v22.4s, v11.4h, v5.h[2]
2394 smlsl v16.4s, v10.4h, v0.h[0]
2395 smlal v16.4s, v11.4h, v1.h[2]
2397 smlal v18.4s, v10.4h, v0.h[0]
2398 smlal v18.4s, v11.4h, v7.h[2]
2400 ld1 {v12.4h, v13.4h},[x1],#16
2401 ld1 {v14.4h, v15.4h},[x1],x10
2406 smlal v24.4s, v14.4h, v6.h[3]
2407 smlal v26.4s, v14.4h, v3.h[3]
2408 smlsl v28.4s, v14.4h, v1.h[3]
2409 smlal v30.4s, v14.4h, v7.h[1]
2412 smlal v24.4s, v15.4h, v1.h[3]
2413 smlsl v26.4s, v15.4h, v2.h[3]
2414 smlal v28.4s, v15.4h, v7.h[1]
2415 smlal v30.4s, v15.4h, v4.h[1]
2418 smlsl v20.4s, v12.4h, v5.h[0]
2419 smlal v20.4s, v13.4h, v2.h[2]
2420 smlal v22.4s, v12.4h, v1.h[0]
2421 smlsl v22.4s, v13.4h, v7.h[2]
2422 smlsl v16.4s, v12.4h, v7.h[0]
2423 smlsl v16.4s, v13.4h, v3.h[2]
2424 smlsl v18.4s, v12.4h, v3.h[0]
2425 smlal v18.4s, v13.4h, v1.h[2]
2428 ld1 {v10.4h, v11.4h},[x1],#16
2429 ld1 {v8.4h, v9.4h},[x1],x10
2432 smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0)
2433 smlsl v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
2434 smlal v28.4s, v8.4h, v3.h[1] //// y1 * sin3(part of b2)
2435 smlsl v30.4s, v8.4h, v0.h[1] //// y1 * sin1(part of b3)
2437 smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
2438 smlal v26.4s, v9.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2439 smlsl v28.4s, v9.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2440 smlal v30.4s, v9.4h, v4.h[3] //// y1 * sin1 - y3 * sin3(part of b3)
2446 smlal v20.4s, v10.4h, v6.h[0]
2447 smlsl v20.4s, v11.4h, v1.h[2]
2450 smlsl v22.4s, v10.4h, v2.h[0]
2451 smlal v22.4s, v11.4h, v4.h[2]
2453 smlal v16.4s, v10.4h, v2.h[0]
2454 smlsl v16.4s, v11.4h, v7.h[2]
2456 smlsl v18.4s, v10.4h, v6.h[0]
2457 smlsl v18.4s, v11.4h, v5.h[2]
2459 ld1 {v12.4h, v13.4h},[x1],#16
2460 ld1 {v14.4h, v15.4h},[x1],x10
2464 smlal v24.4s, v14.4h, v4.h[3]
2465 smlsl v26.4s, v14.4h, v6.h[1]
2466 smlal v28.4s, v14.4h, v7.h[3]
2467 smlal v30.4s, v14.4h, v6.h[3]
2470 smlal v24.4s, v15.4h, v3.h[3]
2471 smlsl v26.4s, v15.4h, v3.h[1]
2472 smlal v28.4s, v15.4h, v2.h[3]
2473 smlsl v30.4s, v15.4h, v2.h[1]
2476 smlsl v20.4s, v12.4h, v7.h[0]
2477 smlal v20.4s, v13.4h, v0.h[2]
2478 smlal v22.4s, v12.4h, v5.h[0]
2479 smlsl v22.4s, v13.4h, v1.h[2]
2480 smlsl v16.4s, v12.4h, v3.h[0]
2481 smlal v16.4s, v13.4h, v2.h[2]
2482 smlal v18.4s, v12.4h, v1.h[0]
2483 smlsl v18.4s, v13.4h, v3.h[2]
2500 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
2501 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x11 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
2502 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
2503 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
2504 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
2505 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
2506 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
2507 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
2514 trn1 v24.4h, v30.4h, v12.4h
2515 trn2 v25.4h, v30.4h, v12.4h
2516 trn1 v26.4h, v31.4h, v13.4h
2517 trn2 v27.4h, v31.4h, v13.4h
2524 trn1 v24.4h, v14.4h, v18.4h
2525 trn2 v25.4h, v14.4h, v18.4h
2526 trn1 v26.4h, v15.4h, v19.4h
2527 trn2 v27.4h, v15.4h, v19.4h
2539 st1 { v30.4h, v31.4h},[x0],#16
2540 st1 { v12.4h, v13.4h},[x0],#16
2541 st1 { v14.4h, v15.4h},[x0],#16
2542 st1 { v18.4h, v19.4h},[x0],#16
2551 ld1 {v10.4h, v11.4h},[x1],#16
2552 ld1 {v8.4h, v9.4h},[x1],x10
2555 smull v24.4s, v8.4h, v6.h[1] //// y1 * cos1(part of b0)
2556 smull v26.4s, v8.4h, v6.h[3] //// y1 * cos3(part of b1)
2557 smull v28.4s, v8.4h, v7.h[1] //// y1 * sin3(part of b2)
2558 smull v30.4s, v8.4h, v7.h[3] //// y1 * sin1(part of b3)
2560 smlsl v24.4s, v9.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
2561 smlsl v26.4s, v9.4h, v4.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2562 smlsl v28.4s, v9.4h, v5.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2563 smlsl v30.4s, v9.4h, v7.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2569 smull v20.4s, v10.4h, v0.h[0]
2570 smlsl v20.4s, v11.4h, v3.h[2]
2573 smull v22.4s, v10.4h, v0.h[0]
2574 smlsl v22.4s, v11.4h, v2.h[2]
2576 smull v16.4s, v10.4h, v0.h[0]
2577 smlsl v16.4s, v11.4h, v1.h[2]
2579 smull v18.4s, v10.4h, v0.h[0]
2580 smlsl v18.4s, v11.4h, v0.h[2]
2584 ld1 {v12.4h, v13.4h},[x1],#16
2585 ld1 {v14.4h, v15.4h},[x1],x10
2592 smlal v24.4s, v14.4h, v0.h[1]
2593 smlal v26.4s, v14.4h, v1.h[3]
2594 smlal v28.4s, v14.4h, v4.h[1]
2595 smlal v30.4s, v14.4h, v6.h[3]
2598 smlsl v24.4s, v15.4h, v4.h[1]
2599 smlsl v26.4s, v15.4h, v0.h[3]
2600 smlsl v28.4s, v15.4h, v2.h[3]
2601 smlsl v30.4s, v15.4h, v6.h[1]
2604 smlal v20.4s, v12.4h, v7.h[0]
2605 smlal v20.4s, v13.4h, v5.h[2]
2606 smlal v22.4s, v12.4h, v5.h[0]
2607 smlsl v22.4s, v13.4h, v7.h[2]
2608 smlal v16.4s, v12.4h, v3.h[0]
2609 smlsl v16.4s, v13.4h, v4.h[2]
2610 smlal v18.4s, v12.4h, v1.h[0]
2611 smlsl v18.4s, v13.4h, v1.h[2]
2616 ld1 {v10.4h, v11.4h},[x1],#16
2617 ld1 {v8.4h, v9.4h},[x1],x10
2621 smlal v24.4s, v8.4h, v7.h[3] //// y1 * cos1(part of b0)
2622 smlal v26.4s, v8.4h, v3.h[1] //// y1 * cos3(part of b1)
2623 smlal v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
2624 smlal v30.4s, v8.4h, v5.h[3] //// y1 * sin1(part of b3)
2626 smlal v24.4s, v9.4h, v4.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
2627 smlsl v26.4s, v9.4h, v5.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2628 smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2629 smlsl v30.4s, v9.4h, v5.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2635 smlsl v20.4s, v10.4h, v2.h[0]
2636 smlal v20.4s, v11.4h, v1.h[2]
2639 smlsl v22.4s, v10.4h, v6.h[0]
2640 smlal v22.4s, v11.4h, v3.h[2]
2642 smlal v16.4s, v10.4h, v6.h[0]
2643 smlsl v16.4s, v11.4h, v7.h[2]
2645 smlal v18.4s, v10.4h, v2.h[0]
2646 smlsl v18.4s, v11.4h, v2.h[2]
2652 ld1 {v12.4h, v13.4h},[x1],#16
2653 ld1 {v14.4h, v15.4h},[x1],x10
2660 smlsl v24.4s, v14.4h, v1.h[1]
2661 smlsl v26.4s, v14.4h, v7.h[3]
2662 smlal v28.4s, v14.4h, v1.h[3]
2663 smlal v30.4s, v14.4h, v4.h[3]
2666 smlal v24.4s, v15.4h, v2.h[1]
2667 smlal v26.4s, v15.4h, v5.h[1]
2668 smlsl v28.4s, v15.4h, v3.h[1]
2669 smlsl v30.4s, v15.4h, v4.h[1]
2672 smlsl v20.4s, v12.4h, v5.h[0]
2673 smlsl v20.4s, v13.4h, v7.h[2]
2674 smlsl v22.4s, v12.4h, v1.h[0]
2675 smlal v22.4s, v13.4h, v1.h[2]
2676 smlsl v16.4s, v12.4h, v7.h[0]
2677 smlal v16.4s, v13.4h, v5.h[2]
2678 smlal v18.4s, v12.4h, v3.h[0]
2679 smlsl v18.4s, v13.4h, v3.h[2]
2685 ld1 {v10.4h, v11.4h},[x1],#16
2686 ld1 {v8.4h, v9.4h},[x1],x10
2689 smlsl v24.4s, v8.4h, v5.h[3] //// y1 * cos1(part of b0)
2690 smlsl v26.4s, v8.4h, v2.h[3] //// y1 * cos3(part of b1)
2691 smlal v28.4s, v8.4h, v4.h[3] //// y1 * sin3(part of b2)
2692 smlal v30.4s, v8.4h, v3.h[3] //// y1 * sin1(part of b3)
2694 smlsl v24.4s, v9.4h, v6.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
2695 smlal v26.4s, v9.4h, v0.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2696 smlsl v28.4s, v9.4h, v6.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2697 smlsl v30.4s, v9.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2703 smlal v20.4s, v10.4h, v0.h[0]
2704 smlsl v20.4s, v11.4h, v0.h[2]
2707 smlsl v22.4s, v10.4h, v0.h[0]
2708 smlal v22.4s, v11.4h, v6.h[2]
2710 smlsl v16.4s, v10.4h, v0.h[0]
2711 smlal v16.4s, v11.4h, v2.h[2]
2713 smlal v18.4s, v10.4h, v0.h[0]
2714 smlsl v18.4s, v11.4h, v4.h[2]
2716 ld1 {v12.4h, v13.4h},[x1],#16
2717 ld1 {v14.4h, v15.4h},[x1],x10
2722 smlal v24.4s, v14.4h, v3.h[1]
2723 smlsl v26.4s, v14.4h, v2.h[1]
2724 smlal v28.4s, v14.4h, v7.h[3]
2725 smlal v30.4s, v14.4h, v2.h[3]
2728 smlsl v24.4s, v15.4h, v0.h[3]
2729 smlal v26.4s, v15.4h, v4.h[3]
2730 smlal v28.4s, v15.4h, v6.h[3]
2731 smlsl v30.4s, v15.4h, v2.h[1]
2734 smlal v20.4s, v12.4h, v3.h[0]
2735 smlsl v20.4s, v13.4h, v6.h[2]
2736 smlal v22.4s, v12.4h, v7.h[0]
2737 smlsl v22.4s, v13.4h, v4.h[2]
2738 smlsl v16.4s, v12.4h, v1.h[0]
2739 smlal v16.4s, v13.4h, v0.h[2]
2740 smlal v18.4s, v12.4h, v5.h[0]
2741 smlsl v18.4s, v13.4h, v5.h[2]
2744 ld1 {v10.4h, v11.4h},[x1],#16
2745 ld1 {v8.4h, v9.4h},[x1],x10
2750 smlal v24.4s, v8.4h, v3.h[3] //// y1 * cos1(part of b0)
2751 smlsl v26.4s, v8.4h, v7.h[1] //// y1 * cos3(part of b1)
2752 smlsl v28.4s, v8.4h, v5.h[1] //// y1 * sin3(part of b2)
2753 smlal v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
2755 smlsl v24.4s, v9.4h, v7.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2756 smlsl v26.4s, v9.4h, v6.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2757 smlal v28.4s, v9.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2758 smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2764 smlsl v20.4s, v10.4h, v6.h[0]
2765 smlal v20.4s, v11.4h, v2.h[2]
2768 smlal v22.4s, v10.4h, v2.h[0]
2769 smlsl v22.4s, v11.4h, v0.h[2]
2771 smlsl v16.4s, v10.4h, v2.h[0]
2772 smlal v16.4s, v11.4h, v3.h[2]
2774 smlal v18.4s, v10.4h, v6.h[0]
2775 smlsl v18.4s, v11.4h, v6.h[2]
2778 ld1 {v12.4h, v13.4h},[x1],#16
2779 ld1 {v14.4h, v15.4h},[x1],x10
2783 smlsl v24.4s, v14.4h, v5.h[1]
2784 smlal v26.4s, v14.4h, v3.h[3]
2785 smlsl v28.4s, v14.4h, v2.h[1]
2786 smlal v30.4s, v14.4h, v0.h[3]
2789 smlal v24.4s, v15.4h, v1.h[3]
2790 smlsl v26.4s, v15.4h, v1.h[1]
2791 smlal v28.4s, v15.4h, v0.h[3]
2792 smlsl v30.4s, v15.4h, v0.h[1]
2795 smlsl v20.4s, v12.4h, v1.h[0]
2796 smlal v20.4s, v13.4h, v4.h[2]
2797 smlal v22.4s, v12.4h, v3.h[0]
2798 smlsl v22.4s, v13.4h, v5.h[2]
2799 smlsl v16.4s, v12.4h, v5.h[0]
2800 smlal v16.4s, v13.4h, v6.h[2]
2801 smlal v18.4s, v12.4h, v7.h[0]
2802 smlsl v18.4s, v13.4h, v7.h[2]
2819 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
2820 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x11 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
2821 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
2822 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
2823 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
2824 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
2825 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
2826 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
2835 trn1 v24.4h, v30.4h, v12.4h
2836 trn2 v25.4h, v30.4h, v12.4h
2837 trn1 v26.4h, v31.4h, v13.4h
2838 trn2 v27.4h, v31.4h, v13.4h
2845 trn1 v24.4h, v14.4h, v18.4h
2846 trn2 v25.4h, v14.4h, v18.4h
2847 trn1 v26.4h, v15.4h, v19.4h
2848 trn2 v27.4h, v15.4h, v19.4h
2860 st1 { v30.4h, v31.4h},[x0],#16
2861 st1 { v12.4h, v13.4h},[x0],#16
2862 st1 { v14.4h, v15.4h},[x0],#16
2863 st1 { v18.4h, v19.4h},[x0],#16
2872 ld1 {v12.8h},[x0],#16
2873 ld1 {v14.8h},[x0],#16
2877 ld1 {v16.8h},[x0],#16
2878 ld1 {v18.8h},[x0],#16
2881 ld1 {v20.8h},[x0],#16
2882 ld1 {v22.8h},[x0],#16
2887 ld1 {v24.8h},[x0],#16
2888 ld1 {v26.8h},[x0],#16
2938 uaddw v12.8h, v12.8h , v8.8b
2939 uaddw v20.8h, v20.8h , v9.8b
2940 uaddw v14.8h, v14.8h , v10.8b
2941 uaddw v22.8h, v22.8h , v11.8b
2942 uaddw v16.8h, v16.8h , v28.8b
2943 uaddw v24.8h, v24.8h , v29.8b
2944 uaddw v18.8h, v18.8h , v30.8b
2945 uaddw v26.8h, v26.8h , v31.8b
2948 sqxtun v12.8b, v12.8h
2949 sqxtun v13.8b, v20.8h
2950 sqxtun v20.8b, v14.8h
2951 sqxtun v21.8b, v22.8h
2952 sqxtun v14.8b, v16.8h
2953 sqxtun v15.8b, v24.8h
2954 sqxtun v22.8b, v18.8h
2955 sqxtun v23.8b, v26.8h
2967 ld1 {v12.8h},[x0],#16
2968 ld1 {v14.8h},[x0],#16
2972 ld1 {v16.8h},[x0],#16
2973 ld1 {v18.8h},[x0],#16
2976 ld1 {v20.8h},[x0],#16
2977 ld1 {v22.8h},[x0],#16
2982 ld1 {v24.8h},[x0],#16
2983 ld1 {v26.8h},[x0],#16
3013 uaddw v12.8h, v12.8h , v8.8b
3014 uaddw v20.8h, v20.8h , v9.8b
3015 uaddw v14.8h, v14.8h , v10.8b
3016 uaddw v22.8h, v22.8h , v11.8b
3017 uaddw v16.8h, v16.8h , v28.8b
3018 uaddw v24.8h, v24.8h , v29.8b
3019 uaddw v18.8h, v18.8h , v30.8b
3020 uaddw v26.8h, v26.8h , v31.8b
3023 sqxtun v12.8b, v12.8h
3024 sqxtun v13.8b, v20.8h
3025 sqxtun v20.8b, v14.8h
3026 sqxtun v21.8b, v22.8h
3027 sqxtun v14.8b, v16.8h
3028 sqxtun v15.8b, v24.8h
3029 sqxtun v22.8b, v18.8h
3030 sqxtun v23.8b, v26.8h