// ---------------------------------------------------------------------------
// ixheaacd_cos_sin_mod_loop1
//
// Target : AArch64, AAPCS64 register conventions, GNU as syntax.
//
// Register interface (as consumed by the code below):
//   x0  in   pointer to 32-bit input words, walked upwards 4 bytes per step.
//            At every step the word at [x0] and the word CSM_PAIR_OFS bytes
//            above it are read.
//   x1  in   element count. Used as a full 64-bit quantity: scaled by 8 to
//            form the two "end" pointers and shifted right by 2 to form the
//            loop trip count.
//   x2  in   pointer to 16-bit coefficients, two consumed per step, upwards.
//   x3  in   pointer to 32-bit output words. Forward steps write two words
//            at [x3] and two words CSM_PAIR_OFS bytes above, then advance
//            x3 by 8.
//
// Derived pointers:
//   x4 = x0 + 8*x1 - 4   second input pointer, walked downwards 4 per step
//   x5 = x3 + 8*x1 - 8   second output pointer, walked downwards 8 per
//                        backward step
//   x6 = x1 >> 2         loop counter; one loop pass = 4 steps
//                        (forward, backward, forward, backward)
//
// Clobbers: x0, x2-x7, v0-v7, NZCV. All of these are caller-saved under
// AAPCS64, so this leaf routine needs no stack frame and never touches sp,
// x18, x19-x30 or v8-v15.
//
// NOTE(review): x1 is read as 64 bits. If the C prototype declares this
//   argument as a 32-bit integer, AAPCS64 leaves bits 63:32 unspecified -
//   confirm against the caller / prototype.
// NOTE(review): the loop is bottom-tested, so the body executes once even
//   when x1 < 4. Callers presumably always pass a positive multiple of 4 -
//   confirm.
// NOTE(review): the 16-bit coefficients are inserted into a zeroed register,
//   i.e. they reach the signed multiply zero-extended to 32 bits, not
//   sign-extended. This matches the behaviour of the code as it stands;
//   verify that the coefficient table is meant to be consumed this way.
// ---------------------------------------------------------------------------

        // Byte distance between each accessed position and its companion
        // element (applies to both the input and the output buffers).
        .equ    CSM_PAIR_OFS, 256

// Load one coefficient pair and four input words, then form the four
// 64-bit products, each arithmetically shifted right by 16.
//   v0 = {c0, c1}           v1 = {c1, c0}
//   v2 = {[x0], [x0+CSM_PAIR_OFS]}
//   v3 = {[x4], [x4+CSM_PAIR_OFS]}
//   v4 = (v0*v2)>>16   v6 = (v0*v3)>>16
//   v5 = (v1*v2)>>16   v7 = (v1*v3)>>16
// Post-conditions: x2 += 4, x0 += 4, x4 -= 4. x7 is scratch.
.macro csm_load_mul
        movi    v0.8h, #0                       // upper halves of the 32-bit lanes must be 0
        ld1     {v0.h}[0], [x2], #2             // c0 -> low half of lane 0
        ld1     {v0.h}[2], [x2], #2             // c1 -> low half of lane 1
        rev64   v1.2s, v0.2s                    // swapped coefficient pair
        ld1     {v2.s}[0], [x0], #4
        add     x7, x0, #(CSM_PAIR_OFS - 4)     // x0 has already advanced by 4
        ld1     {v2.s}[1], [x7]
        ld1     {v3.s}[0], [x4]
        add     x7, x4, #CSM_PAIR_OFS
        ld1     {v3.s}[1], [x7]
        sub     x4, x4, #4

        smull   v4.2d, v0.2s, v2.2s
        sshr    v4.2d, v4.2d, #16
        smull   v6.2d, v0.2s, v3.2s
        sshr    v6.2d, v6.2d, #16
        smull   v5.2d, v1.2s, v2.2s
        sshr    v5.2d, v5.2d, #16
        smull   v7.2d, v1.2s, v3.2s
        sshr    v7.2d, v7.2d, #16
.endm

// Forward step: combine the products and store through x3 (ascending).
// Only 32-bit lanes 0 and 2 (the low words of the two 64-bit products) are
// stored. The sum wraps, the difference saturates.
.macro csm_store_fwd
        add     v0.4s, v5.4s, v6.4s
        sqsub   v2.4s, v7.4s, v4.4s
        mov     v3.16b, v0.16b                  // ST2 needs consecutive registers:
        mov     v1.16b, v2.16b                  //   {v0,v1} = {sum,diff}, {v2,v3} = {diff,sum}
        st2     {v0.s, v1.s}[0], [x3], #8
        add     x7, x3, #(CSM_PAIR_OFS - 8)     // x3 has already advanced by 8
        st2     {v2.s, v3.s}[2], [x7]
.endm

// Backward step: same shape as the forward step, with the product roles
// exchanged, storing through x5 (descending).
.macro csm_store_bwd
        add     v0.4s, v7.4s, v4.4s
        sqsub   v2.4s, v5.4s, v6.4s
        mov     v3.16b, v0.16b
        mov     v1.16b, v2.16b
        st2     {v0.s, v1.s}[0], [x5]
        add     x7, x5, #CSM_PAIR_OFS
        st2     {v2.s, v3.s}[2], [x7]
        sub     x5, x5, #8
.endm

        .text
        .p2align 2

        .global ixheaacd_cos_sin_mod_loop1
ixheaacd_cos_sin_mod_loop1:
        // Derive the descending pointers and the trip count.
        add     x4, x0, x1, lsl #3
        sub     x4, x4, #4                      // psubband1   = x0 + 8*x1 - 4
        add     x5, x3, x1, lsl #3
        sub     x5, x5, #8                      // psubband1_t = x3 + 8*x1 - 8
        asr     x6, x1, #2                      // passes = x1 / 4

.Lcsm_loop:
        csm_load_mul                            // first part
        csm_store_fwd

        csm_load_mul                            // second part
        csm_store_bwd

        csm_load_mul                            // third part
        csm_store_fwd

        csm_load_mul                            // fourth part
        csm_store_bwd

        subs    x6, x6, #1
        b.gt    .Lcsm_loop

        ret