// Target: AArch64 (A64), GNU assembler syntax, AAPCS64 calling convention.

//.include "ihevc_neon_macros.s"   // left disabled, as in the original; the
                                   // macros this file uses are defined below.

//------------------------------------------------------------------------------
// push_v_regs / pop_v_regs
//
// Spill / reload x8-x15 and x29/x30 as five 16-byte pairs (80 bytes of stack,
// sp stays 16-byte aligned).  pop_v_regs must mirror push_v_regs exactly, in
// reverse order.
//------------------------------------------------------------------------------
.macro push_v_regs
    stp     x8,  x9,  [sp, #-16]!
    stp     x10, x11, [sp, #-16]!
    stp     x12, x13, [sp, #-16]!
    stp     x14, x15, [sp, #-16]!
    stp     x29, x30, [sp, #-16]!
.endm

.macro pop_v_regs
    ldp     x29, x30, [sp], #16
    ldp     x14, x15, [sp], #16
    ldp     x12, x13, [sp], #16
    ldp     x10, x11, [sp], #16
    ldp     x8,  x9,  [sp], #16
.endm

//------------------------------------------------------------------------------
// Layout constants (all in bytes).  The output strides are hard-coded for an
// FFT length of 16, exactly as in the original code.
//------------------------------------------------------------------------------
.equ PR4_PAIR_STEP, 4       // first word of a pair -> second word of the pair
.equ PR4_ROW_STEP,  28      // second word of row r -> first word of row r+1
                            //   (rows are 4 + 28 = 32 bytes apart; was "#14<<1")
.equ PR4_REWIND,    92      // 4*4 + 3*28 - 8: undo the 100 bytes walked by the
                            //   stores, leaving x0 one pair (8 bytes) further on
.equ PR4_X_SKIP,    32      // input bytes skipped after each pass (the block
                            //   that the other input pointer has just consumed)
.equ PR4_PASSES,    2       // loop trip count

//------------------------------------------------------------------------------
// pr4_butterfly src
//
// Loads eight 32-bit words a0..a7 from [\src] (post-incrementing \src by 32
// bytes), combines them and stores four word pairs through x0, one pair into
// each of the four output rows y0..y3:
//
//     y0 = { (a0+a4)+(a2+a6), (a1+a5)+(a3+a7) }      n00, n01
//     y1 = { (a0-a4)+(a3-a7), (a1-a5)-(a2-a6) }      n10, n11
//     y2 = { (a0+a4)-(a2+a6), (a1+a5)-(a3+a7) }      n20, n21
//     y3 = { (a0-a4)-(a3-a7), (a1-a5)+(a2-a6) }      n30, n31
//
// On exit x0 points at the next pair of row y0 (net +8 bytes).
// Scratch: w5-w12, w14.  All arithmetic is 32-bit and wraps on overflow.
//------------------------------------------------------------------------------
.macro pr4_butterfly src
    ldp     w5,  w6,  [\src], #8            // a0, a1
    ldp     w7,  w8,  [\src], #8            // a2, a3
    ldp     w9,  w10, [\src], #8            // a4, a5
    ldp     w11, w12, [\src], #8            // a6, a7

    add     w14, w5,  w9                    // xh0_0 = a0 + a4
    sub     w5,  w5,  w9                    // xl0_0 = a0 - a4
    add     w9,  w6,  w10                   // xh1_0 = a1 + a5
    sub     w6,  w6,  w10                   // xl1_0 = a1 - a5
    add     w10, w7,  w11                   // xh0_1 = a2 + a6
    sub     w7,  w7,  w11                   // xl0_1 = a2 - a6
    add     w11, w8,  w12                   // xh1_1 = a3 + a7
    sub     w8,  w8,  w12                   // xl1_1 = a3 - a7

    add     w12, w14, w10                   // n00 = xh0_0 + xh0_1
    sub     w14, w14, w10                   // n20 = xh0_0 - xh0_1
    add     w10, w9,  w11                   // n01 = xh1_0 + xh1_1
    sub     w9,  w9,  w11                   // n21 = xh1_0 - xh1_1
    add     w11, w5,  w8                    // n10 = xl0_0 + xl1_1
    sub     w5,  w5,  w8                    // n30 = xl0_0 - xl1_1
    add     w8,  w6,  w7                    // n31 = xl1_0 + xl0_1
    sub     w6,  w6,  w7                    // n11 = xl1_0 - xl0_1

    str     w12, [x0], #PR4_PAIR_STEP       // y0[0] = n00
    str     w10, [x0], #PR4_ROW_STEP        // y0[1] = n01, x0 -> y1
    str     w11, [x0], #PR4_PAIR_STEP       // y1[0] = n10
    str     w6,  [x0], #PR4_ROW_STEP        // y1[1] = n11, x0 -> y2
    str     w14, [x0], #PR4_PAIR_STEP       // y2[0] = n20
    str     w9,  [x0], #PR4_ROW_STEP        // y2[1] = n21, x0 -> y3
    str     w5,  [x0], #PR4_PAIR_STEP       // y3[0] = n30
    str     w8,  [x0]                       // y3[1] = n31
    sub     x0,  x0,  #PR4_REWIND           // x0 -> next pair of y0
.endm

    .text
    .p2align 2
    .global ixheaacd_postradixcompute4

//------------------------------------------------------------------------------
// ixheaacd_postradixcompute4
//
// Butterfly pass hard-coded for an FFT length of 16.  Consumes 32 consecutive
// 32-bit input words as four groups of eight and writes 32 output words: each
// group produces one word pair in each of four output rows that lie 32 bytes
// apart (see pr4_butterfly for the arithmetic).
//
// In:    x0 = output pointer (y)
//        x1 = input pointer (x)
//        x2 = not read by this routine
//        w3 = npoints, a 32-bit value; only 16 is supported, because the
//             output strides above are constants
//        Presumably called from C as
//            void f(WORD32 *y, WORD32 *x, <unused>, WORD32 npoints)
//        NOTE(review): prototype is not visible here - confirm against the
//        C declaration.
// Out:   none (results are written through x0)
// Clobb: x0, x1, x3-x7, NZCV.  x8-x12 and x14 are used as scratch but are
//        restored by pop_v_regs.
// Stack: 80 bytes.  AAPCS64 does not require a leaf function to preserve
//        x8-x15/x29/x30; the save/restore is retained so that the set of
//        registers callers observe as clobbered is unchanged.
//------------------------------------------------------------------------------
ixheaacd_postradixcompute4:
    push_v_regs

    // x4 = x1 + (npoints / 2) words = x1 + npoints * 2 bytes.
    // Only w3 is consumed (sign-extended): npoints is a 32-bit argument, and
    // AAPCS64 leaves the upper half of x3 unspecified in that case, so the
    // full 64-bit register must not take part in the address computation.
    add     x4, x1, w3, sxtw #1
    mov     w3, #PR4_PASSES                 // w3 now serves as the pass counter

.Lpostradix4_pass:
    // Invariant: x1 and x4 address two adjacent unread 8-word input groups
    // (when npoints == 16); x0 addresses the next free pair of output row y0.
    pr4_butterfly x1
    pr4_butterfly x4

    add     x1, x1, #PR4_X_SKIP             // step over the group x4 just read
    add     x4, x4, #PR4_X_SKIP             // keep x4 one group ahead of x1

    subs    w3, w3, #1
    b.gt    .Lpostradix4_pass

    pop_v_regs
    ret