// Target: AArch64, AAPCS64, GNU assembler syntax ('//' comments).

// push_v_regs
//   Spills q8-q15 (four 32-byte pair stores) followed by x8-x21 and
//   x29/x30 (eight 16-byte pair stores), all with pre-decrement of sp.
//   Total stack growth: 4*32 + 8*16 = 256 bytes.
.macro push_v_regs
    stp     q8,  q9,  [sp, #-32]!
    stp     q10, q11, [sp, #-32]!
    stp     q12, q13, [sp, #-32]!
    stp     q14, q15, [sp, #-32]!
    stp     x8,  x9,  [sp, #-16]!
    stp     x10, x11, [sp, #-16]!
    stp     x12, x13, [sp, #-16]!
    stp     x14, x15, [sp, #-16]!
    stp     x16, x17, [sp, #-16]!
    stp     x18, x19, [sp, #-16]!
    stp     x20, x21, [sp, #-16]!
    stp     x29, x30, [sp, #-16]!
.endm

// pop_v_regs
//   Exact mirror of push_v_regs: reloads the same register pairs in the
//   reverse order with post-increment, releasing the 256 bytes again.
.macro pop_v_regs
    ldp     x29, x30, [sp], #16
    ldp     x20, x21, [sp], #16
    ldp     x18, x19, [sp], #16
    ldp     x16, x17, [sp], #16
    ldp     x14, x15, [sp], #16
    ldp     x12, x13, [sp], #16
    ldp     x10, x11, [sp], #16
    ldp     x8,  x9,  [sp], #16
    ldp     q14, q15, [sp], #32
    ldp     q12, q13, [sp], #32
    ldp     q10, q11, [sp], #32
    ldp     q8,  q9,  [sp], #32
.endm

.text
.p2align 2

//-----------------------------------------------------------------------
// ixheaacd_cos_sin_mod_loop2
//
// In:   x0 = base of a buffer of 32-bit signed words ("subband")
//       x1 = stream of signed 16-bit coefficient pairs (c0, c1); one pair
//            (4 bytes) is consumed before the loop and one per iteration
//       x2 = M
//            NOTE(review): x2 is used as a full 64-bit value without any
//            extension. If the C prototype declares M as a 32-bit int,
//            the upper half is not guaranteed by the caller - confirm.
// Out:  nothing in registers; the buffer is rewritten in place.
//
// Two regions are processed together, one per 64-bit vector lane:
//       lane 0 : words starting at x0
//       lane 1 : words starting at x0 + 256 bytes
// Each region is walked from both ends at once:
//       x0  / x10 : ascending pointers  (lane 0 / lane 1)
//       x3  / x11 : descending pointers (lane 0 / lane 1), first
//                   addressing word index 2*M - 1 of their region
//       x8        : -4, register post-index step for x3 / x11
//       x4        : (M >> 1) - 1, loop counter; the loop body always
//                   executes once and repeats while the decremented
//                   counter is still > 0
//       v0 / v1   : c0 / c1 sign-extended to 32 bits in both .2s lanes
//
// Every product is (coefficient * word) as a 64-bit value shifted right
// arithmetically by 16. Results stay in 64-bit lanes; only the low 32
// bits of each lane are stored (.s[0] for lane 0, .s[2] for lane 1).
//
// Saved/restored by the macros: x8-x21, x29, x30, q8-q15.
// Modified on return: x0, x1, x3-x7, v0-v4, v6, v16, NZCV.
// Stack: 256 bytes.
//-----------------------------------------------------------------------
    .global ixheaacd_cos_sin_mod_loop2
ixheaacd_cos_sin_mod_loop2:
    push_v_regs

    // ---- pointers, post-index step and loop counter -------------------
    add     x3, x0, x2, lsl #3          // x0 + 8*M bytes
    sub     x3, x3, #4                  // psubband1 = &subband[2*M - 1]
    add     x10, x0, #256               // second region base
    add     x11, x10, x2, lsl #3
    sub     x11, x11, #4                // word 2*M - 1 of second region
    mov     x8, #-4
    asr     x4, x2, #1                  // M_2 = M >> 1
    sub     x4, x4, #1

    mov     w19, #0
    dup     v0.4s, w19                  // clear before the lane loads below
    dup     v1.4s, w19

    // ---- lane 0 prologue: halve the first word, negate-halve the next --
    ldr     w6, [x0]
    sxtw    x6, w6
    asr     x6, x6, #1                  // *psubband >> 1
    ld1     {v2.s}[0], [x3]             // fetch [x3] before it is overwritten

    str     w6, [x0], #4                // *psubband = halved value; psubband++
    sxtw    x6, w6
    ldr     w7, [x0]
    sxtw    x7, w7
    asr     x7, x7, #1
    sub     x20, x7, #0
    neg     x6, x20                     // -(word >> 1)
    str     w6, [x3], #-4               // store at x3, then step x3 down
    sxtw    x6, w6
    ld1     {v3.s}[0], [x3]             // im = *psubband1

    // ---- first coefficient pair ---------------------------------------
    ld2     {v0.h, v1.h}[0], [x1], #4   // c0 -> v0.h[0], c1 -> v1.h[0]
    sxtl    v0.4s, v0.4h
    sxtl    v1.4s, v1.4h
    dup     v0.2s, v0.s[0]
    dup     v1.2s, v1.s[0]

    // ---- lane 1 prologue: same steps on the second region --------------
    ld1     {v2.s}[1], [x11]            // fetch [x11] before it is overwritten

    ld1     {v4.s}[0], [x10]
    sshr    v4.2s, v4.2s, #1
    mov     x9, #0
    dup     v6.2s, w9
    sqsub   v4.2s, v6.2s, v4.2s         // saturating 0 - (word >> 1)

    st1     {v4.s}[0], [x11]
    sub     x11, x11, #4

    ldr     w6, [x10, #4]               // word one above x10 ...
    sxtw    x6, w6
    asr     x6, x6, #1
    str     w6, [x10], #4               // ... halved, written at x10; x10 += 4
    sxtw    x6, w6

    ld1     {v3.s}[1], [x11]

    // ---- prologue butterfly: A = v2, B = v3 ----------------------------
    smull   v4.2d, v0.2s, v2.2s         // c0 * A
    sshr    v4.2d, v4.2d, #16
    smull   v6.2d, v0.2s, v3.2s         // c0 * B
    sshr    v6.2d, v6.2d, #16
    smull   v8.2d, v1.2s, v2.2s         // c1 * A
    sshr    v8.2d, v8.2d, #16
    smull   v10.2d, v1.2s, v3.2s        // c1 * B
    sshr    v10.2d, v10.2d, #16

    add     v12.2d, v8.2d, v6.2d        // c1*A + c0*B
    sqsub   v14.2d, v10.2d, v4.2d       // c1*B - c0*A
    sqsub   v16.2d, v4.2d, v10.2d       // c0*A - c1*B

    st1     {v12.s}[0], [x3], x8
    st1     {v14.s}[0], [x0], #4

    sqneg   v12.4s, v12.4s              // negated per 32-bit lane

    st1     {v12.s}[2], [x10], #4       // .s[2] = low word of 64-bit lane 1
    st1     {v16.s}[2], [x11], x8

.Lcos_sin_mod_loop2_body:
    // ---- first half: two consecutive words from each ascending pointer -
    ld1     {v2.2s}, [x0]
    ld1     {v3.2s}, [x10]
    ldr     w5, [x3]                    // kept for the second half (lane 0)
    sxtw    x5, w5
    ldr     w6, [x11]                   // kept for the second half (lane 1)
    sxtw    x6, w6

    // 2x2 transpose: v2 <- {[x0], [x10]}, v3 <- {[x0 + 4], [x10 + 4]}
    trn1    v4.2s, v2.2s, v3.2s
    trn2    v3.2s, v2.2s, v3.2s
    mov     v2.8b, v4.8b

    smull   v4.2d, v0.2s, v2.2s         // c0 * A
    sshr    v4.2d, v4.2d, #16
    smull   v6.2d, v0.2s, v3.2s         // c0 * B
    sshr    v6.2d, v6.2d, #16
    smull   v8.2d, v1.2s, v2.2s         // c1 * A
    sshr    v8.2d, v8.2d, #16
    smull   v10.2d, v1.2s, v3.2s        // c1 * B
    sshr    v10.2d, v10.2d, #16

    add     v12.2d, v8.2d, v6.2d        // c1*A + c0*B
    sqsub   v14.2d, v4.2d, v10.2d       // c0*A - c1*B
    sqsub   v16.2d, v10.2d, v4.2d       // c1*B - c0*A

    st1     {v12.s}[0], [x0], #4
    st1     {v14.s}[0], [x3], x8
    sqneg   v12.4s, v12.4s

    st1     {v12.s}[2], [x11], x8
    st1     {v16.s}[2], [x10], #4

    // ---- second half: next coefficient pair ----------------------------
    mov     w19, #0
    dup     v0.4s, w19
    dup     v1.4s, w19
    ld2     {v0.h, v1.h}[0], [x1], #4
    sxtl    v0.4s, v0.4h
    sxtl    v1.4s, v1.4h
    dup     v0.2s, v0.s[0]
    dup     v1.2s, v1.s[0]

    mov     v3.s[0], w5                 // B = words saved at the loop top
    mov     v3.s[1], w6
    ld1     {v2.s}[0], [x3]             // A = current descending-pointer words
    ld1     {v2.s}[1], [x11]

    smull   v4.2d, v0.2s, v2.2s         // c0 * A
    sshr    v4.2d, v4.2d, #16
    smull   v6.2d, v0.2s, v3.2s         // c0 * B
    sshr    v6.2d, v6.2d, #16
    smull   v8.2d, v1.2s, v2.2s         // c1 * A
    sshr    v8.2d, v8.2d, #16
    smull   v10.2d, v1.2s, v3.2s        // c1 * B
    sshr    v10.2d, v10.2d, #16

    add     v12.2d, v4.2d, v10.2d       // c0*A + c1*B
    sqsub   v14.2d, v8.2d, v6.2d        // c1*A - c0*B
    sqsub   v16.2d, v6.2d, v8.2d        // c0*B - c1*A

    st1     {v12.s}[0], [x3], x8
    st1     {v14.s}[0], [x0], #4

    sqneg   v12.4s, v12.4s

    subs    x4, x4, #1                  // flags consumed by b.gt below;
    st1     {v12.s}[2], [x10], #4       // the two stores leave NZCV untouched
    st1     {v16.s}[2], [x11], x8

    b.gt    .Lcos_sin_mod_loop2_body

    pop_v_regs
    ret