//.include "ihevc_neon_macros.s"

// Syntax : GNU as, AArch64 (A64).  Comment character is `//`.

//-----------------------------------------------------------------------
// Stack helpers.
// push_v_regs stores x8-x15 and x29/x30 as five 16-byte pairs (80 bytes
// in total, sp moves in 16-byte steps); pop_v_regs restores them in the
// reverse order.
//-----------------------------------------------------------------------
.macro push_v_regs
    stp     x8,  x9,  [sp, #-16]!
    stp     x10, x11, [sp, #-16]!
    stp     x12, x13, [sp, #-16]!
    stp     x14, x15, [sp, #-16]!
    stp     x29, x30, [sp, #-16]!
.endm

.macro pop_v_regs
    ldp     x29, x30, [sp], #16
    ldp     x14, x15, [sp], #16
    ldp     x12, x13, [sp], #16
    ldp     x10, x11, [sp], #16
    ldp     x8,  x9,  [sp], #16
.endm

//-----------------------------------------------------------------------
// Layout constants.  The routine is hard coded for an FFT length of 16.
//-----------------------------------------------------------------------
.equ    PASS_IN_BYTES, 32               // 8 input words consumed per pass
.equ    Y_ROW_BYTES,   32               // byte distance y0 -> y1 -> y2 -> y3
.equ    Y_PAIR_BYTES,  8                // each pass fills 2 words per row
.equ    LOOP_COUNT,    2                // 2 iterations x 2 passes = 4 passes

//-----------------------------------------------------------------------
// radix4_pass src
//   Loads eight consecutive 32-bit words x_0..x_7 from [src] (src is
//   post-incremented by 32 bytes), runs the add/sub butterfly below and
//   stores one pair of words into each of the four output rows that
//   start at x0, x0+32, x0+64 and x0+96.  x0 is then advanced by 8 bytes
//   so the next pass fills the next pair of every row.
//   All eight loads complete before the first store, as in the
//   straight-line code this macro replaces.
//   Clobbers: w5-w12, w14, x0, src.  Condition flags are not touched.
//-----------------------------------------------------------------------
.macro radix4_pass src
    ldp     w5,  w6,  [\src], #8        // x_0, x_1
    ldp     w7,  w8,  [\src], #8        // x_2, x_3
    ldp     w9,  w10, [\src], #8        // x_4, x_5
    ldp     w11, w12, [\src], #8        // x_6, x_7

    add     w14, w5,  w9                // xh0_0 = x_0 + x_4
    sub     w5,  w5,  w9                // xl0_0 = x_0 - x_4
    add     w9,  w6,  w10               // xh1_0 = x_1 + x_5
    sub     w6,  w6,  w10               // xl1_0 = x_1 - x_5
    add     w10, w7,  w11               // xh0_1 = x_2 + x_6
    sub     w7,  w7,  w11               // xl0_1 = x_2 - x_6
    add     w11, w8,  w12               // xh1_1 = x_3 + x_7
    sub     w8,  w8,  w12               // xl1_1 = x_3 - x_7

    add     w12, w14, w10               // n00 = xh0_0 + xh0_1
    sub     w14, w14, w10               // n20 = xh0_0 - xh0_1
    add     w10, w9,  w11               // n01 = xh1_0 + xh1_1
    sub     w9,  w9,  w11               // n21 = xh1_0 - xh1_1
    add     w11, w5,  w8                // n10 = xl0_0 + xl1_1
    sub     w5,  w5,  w8                // n30 = xl0_0 - xl1_1
    add     w8,  w6,  w7                // n31 = xl1_0 + xl0_1
    sub     w6,  w6,  w7                // n11 = xl1_0 - xl0_1

    stp     w12, w10, [x0]                          // y0[h2], y0[h2+1] = n00, n01
    stp     w11, w6,  [x0, #Y_ROW_BYTES]            // y1[h2], y1[h2+1] = n10, n11
    stp     w14, w9,  [x0, #(2 * Y_ROW_BYTES)]      // y2[h2], y2[h2+1] = n20, n21
    stp     w5,  w8,  [x0, #(3 * Y_ROW_BYTES)]      // y3[h2], y3[h2+1] = n30, n31
    add     x0,  x0,  #Y_PAIR_BYTES                 // step to the next pair in each row
.endm

//-----------------------------------------------------------------------
// ixheaacd_postradixcompute4
//
// Reads 32 consecutive 32-bit words starting at x1, eight at a time, and
// writes 32 words starting at x0 as four rows of eight words (row r
// begins at byte offset r*32).  Pass p (p = 0..3) consumes input words
// 8p..8p+7 and produces words 2p and 2p+1 of every row.
//
// C view (presumed from register usage - TODO confirm against the C
// declaration):
//   void ixheaacd_postradixcompute4(int32 *y, int32 *x, <unused>, int32 npoints)
//
// ABI  : AAPCS64
// In   : x0 = output base
//        x1 = input base
//        x2 = not read
//        w3 = npoints; only the value 16 is supported (strides and loop
//             count are hard coded).  It is used solely to derive the
//             second input pointer, x1 + 2*npoints bytes.
// Out  : nothing is returned in registers
// Clobb: x0, x1, x3, x4, w5-w12, w14, NZCV
// Stack: 80 bytes, released before return
//
// NOTE(review): this is a leaf routine that only modifies registers that
// are caller-saved under AAPCS64, and x29/x30 are never written, so the
// push_v_regs/pop_v_regs pair looks unnecessary.  It is kept so callers
// that rely on x8-x15 being preserved keep working.
//-----------------------------------------------------------------------
.text
.p2align 2
    .global ixheaacd_postradixcompute4

ixheaacd_postradixcompute4:
    push_v_regs

    // x4 = x1 + npoints*2 bytes (= second 8-word group of the input).
    // Only w3 is read: the upper half of x3 is unspecified when a 32-bit
    // argument is passed, so it must not take part in the address.
    add     x4, x1, w3, sxtw #1
    mov     w3, #LOOP_COUNT             // w3 is reused as the loop counter

.Lpostradix4_loop:
    radix4_pass x1                      // input words 0..7   (then 16..23)
    radix4_pass x4                      // input words 8..15  (then 24..31)

    // Each pointer already moved 32 bytes inside its pass; skip the 32
    // bytes that the other pointer has just consumed.
    add     x1, x1, #PASS_IN_BYTES
    add     x4, x4, #PASS_IN_BYTES

    subs    w3, w3, #1
    b.gt    .Lpostradix4_loop

    pop_v_regs
    ret