.macro push_v_regs stp q8, q9, [sp, #-32]! stp q10, q11, [sp, #-32]! stp q12, q13, [sp, #-32]! stp q14, q15, [sp, #-32]! stp X8, X9, [sp, #-16]! stp X10, X11, [sp, #-16]! stp X12, X13, [sp, #-16]! stp X14, X15, [sp, #-16]! stp X16, X17, [sp, #-16]! stp X18, X19, [sp, #-16]! stp X20, X21, [sp, #-16]! stp X29, X30, [sp, #-16]! .endm .macro pop_v_regs ldp X29, X30, [sp], #16 ldp X20, X21, [sp], #16 ldp X18, X19, [sp], #16 ldp X16, X17, [sp], #16 ldp X14, X15, [sp], #16 ldp X12, X13, [sp], #16 ldp X10, X11, [sp], #16 ldp X8, X9, [sp], #16 ldp q14, q15, [sp], #32 ldp q12, q13, [sp], #32 ldp q10, q11, [sp], #32 ldp q8, q9, [sp], #32 .endm .text .p2align 2 .global ixheaacd_cos_sin_mod_loop1 ixheaacd_cos_sin_mod_loop1: // STMFD sp!, {x4-x12, x14} push_v_regs //stp x19, x20,[sp,#-16]! //VPUSH {D8-D11} //generating load addresses ADD x4, x0, x1, lsl #3 //psubband1 SUB x4, x4, #4 ADD x5, x3, x1, lsl #3 //psubband1_t SUB x5, x5, #8 ASR x6, x1, #2 MOV w19, #0 DUP V0.8h, w19 LOOP1: //first part ld1 {v0.h}[0] , [x2] ADD x2, x2, #2 ld1 {v0.h}[2] , [x2] ADD x2, x2, #2 rev64 v1.2s, v0.2s ld1 {v2.s}[0], [x0] ADD x0, x0, #4 ADD x7, x0, #252 ld1 {v2.s}[1], [x7] ld1 {v3.s}[0], [x4] ADD x7, x4, #256 ld1 {v3.s}[1], [x7] SUB x4, x4, #4 sMULL v4.2d, v0.2s, v2.2s //qsub 2nd sshr v4.2d, v4.2d, #16 sMULL v6.2d, v0.2s, v3.2s //add 2nd sshr v6.2d, v6.2d, #16 sMULL v8.2d, v1.2s, v2.2s //add 1st sshr v8.2d, v8.2d, #16 sMULL v10.2d, v1.2s, v3.2s //qsub 1st sshr v10.2d, v10.2d, #16 add v0.4s, v8.4s , v6.4s SQSUB v2.4s, v10.4s , v4.4s //shrn v0.2s, v0.2d,#32 //shrn v2.2s, v2.2d,#32 mov v3.16b, v0.16b mov v1.16b, v2.16b ST2 {v0.s, v1.s}[0], [x3] ADD x3, x3, #8 ADD x7, x3, #248 ST2 {v2.s, v3.s}[2], [x7] MOV w19, #0 DUP V0.8h, w19 //second part ld1 {v0.h}[0] , [x2] ADD x2, x2, #2 ld1 {v0.h}[2] , [x2] ADD x2, x2, #2 rev64 v1.2s, v0.2s ld1 {v2.s}[0], [x0] ADD x0, x0, #4 ADD x7, x0, #252 ld1 {v2.s}[1], [x7] ld1 {v3.s}[0], [x4] ADD x7, x4, #256 ld1 {v3.s}[1], [x7] SUB x4, x4, #4 sMULL v4.2d, v0.2s, v2.2s //qsub 2nd sshr v4.2d, v4.2d, #16 sMULL v6.2d, v0.2s, v3.2s //add 2nd sshr v6.2d, v6.2d, #16 sMULL v8.2d, v1.2s, v2.2s //add 1st sshr v8.2d, v8.2d, #16 sMULL v10.2d, v1.2s, v3.2s //qsub 1st sshr v10.2d, v10.2d, #16 ADD v0.4s, v10.4s , v4.4s SQSUB v2.4s, v8.4s , v6.4s //shrn v0.2s, v0.2d,#32 //shrn v2.2s, v2.2d,#32 mov v3.16b, v0.16b mov v1.16b, v2.16b ST2 {v0.s, v1.s}[0], [x5] ADD x7, x5, #256 ST2 {v2.s, v3.s}[2], [x7] SUB x5, x5, #8 MOV w19, #0 DUP V0.8h, w19 //Third part ld1 {v0.h}[0] , [x2] ADD x2, x2, #2 ld1 {v0.h}[2] , [x2] ADD x2, x2, #2 rev64 v1.2s, v0.2s ld1 {v2.s}[0], [x0], #4 ADD x7, x0, #252 ld1 {v2.s}[1], [x7] ld1 {v3.s}[0], [x4] ADD x7, x4, #256 ld1 {v3.s}[1], [x7] SUB x4, x4, #4 sMULL v4.2d, v0.2s, v2.2s //qsub 2nd sshr v4.2d, v4.2d, #16 sMULL v6.2d, v0.2s, v3.2s //add 2nd sshr v6.2d, v6.2d, #16 sMULL v8.2d, v1.2s, v2.2s //add 1st sshr v8.2d, v8.2d, #16 sMULL v10.2d, v1.2s, v3.2s //qsub 1st sshr v10.2d, v10.2d, #16 add v0.4s, v8.4s , v6.4s SQSUB v2.4s, v10.4s , v4.4s //shrn v0.2s, v0.2d,#32 //shrn v2.2s, v2.2d,#32 mov v3.16b, v0.16b mov v1.16b, v2.16b ST2 {v0.s, v1.s}[0], [x3] ADD x3, x3, #8 ADD x7, x3, #248 ST2 {v2.s, v3.s}[2], [x7] MOV w19, #0 DUP V0.8h, w19 //Fourth part ld1 {v0.h}[0] , [x2] ADD x2, x2, #2 ld1 {v0.h}[2] , [x2] ADD x2, x2, #2 rev64 v1.2s, v0.2s ld1 {v2.s}[0], [x0] ADD x0, x0, #4 ADD x7, x0, #252 ld1 {v2.s}[1], [x7] ld1 {v3.s}[0], [x4] ADD x7, x4, #256 ld1 {v3.s}[1], [x7] SUB x4, x4, #4 sMULL v4.2d, v0.2s, v2.2s //qsub 2nd sshr v4.2d, v4.2d, #16 sMULL v6.2d, v0.2s, v3.2s //add 2nd sshr v6.2d, v6.2d, #16 sMULL v8.2d, v1.2s, v2.2s //add 1st sshr v8.2d, v8.2d, #16 sMULL v10.2d, v1.2s, v3.2s //qsub 1st sshr v10.2d, v10.2d, #16 ADD v0.4s, v10.4s , v4.4s SQSUB v2.4s, v8.4s , v6.4s //shrn v0.2s, v0.2d,#32 //shrn v2.2s, v2.2d,#32 mov v3.16b, v0.16b mov v1.16b, v2.16b ST2 {v0.s, v1.s}[0], [x5] ADD x7, x5, #256 SUBS x6, x6, #1 ST2 {v2.s, v3.s}[2], [x7] SUB x5, x5, #8 MOV w19, #0 DUP V0.8h, w19 BGT LOOP1 //VPOP {D8-D11} // LDMFD sp!, {x4-x12, x15} //ldp x19, x20,[sp],#16 pop_v_regs ret