//-----------------------------------------------------------------------
// Target : AArch64, GNU assembler syntax ("//" starts a comment).
//
// ixheaacd_fft32x32_ld2_armv8
//   Two radix-4 DIT butterfly stages over a block of 32 signed 32-bit
//   words (by the qr/qi naming below these are presumably 16 interleaved
//   re/im pairs).
//
//   In   : x2 = x, 32 words; the first stage is computed in place, so
//               this buffer is overwritten.
//          x3 = y, 32 words; receives the second-stage results.
//          x0 and x1 are overwritten on entry without being read.
//          NOTE(review): the C prototype is not visible in this file -
//          confirm the meaning of the first two arguments at the caller.
//   Out  : results are written to memory only.
//   Clobb: x0-x7 (x8-x17, x20-x22, x24 are used but restored on exit).
//   Stack: 144 bytes, always moved in 16-byte steps.
//-----------------------------------------------------------------------

// Save the registers the routine uses as scratch, plus x29/x30.
// x20, x21, x22 and x24 are the callee-saved registers actually written
// below; the remaining pairs are saved as well and restored unchanged.
.macro push_v_regs
    stp     x8, x9, [sp, #-16]!
    stp     x10, x11, [sp, #-16]!
    stp     x12, x13, [sp, #-16]!
    stp     x14, x15, [sp, #-16]!
    stp     x16, x17, [sp, #-16]!
    stp     x18, x19, [sp, #-16]!
    stp     x20, x21, [sp, #-16]!
    stp     x22, x24, [sp, #-16]!
    stp     x29, x30, [sp, #-16]!
.endm

// Exact mirror of push_v_regs: pairs are popped in reverse order.
.macro pop_v_regs
    ldp     x29, x30, [sp], #16
    ldp     x22, x24, [sp], #16
    ldp     x20, x21, [sp], #16
    ldp     x18, x19, [sp], #16
    ldp     x16, x17, [sp], #16
    ldp     x14, x15, [sp], #16
    ldp     x12, x13, [sp], #16
    ldp     x10, x11, [sp], #16
    ldp     x8, x9, [sp], #16
.endm

// One first-stage radix-4 butterfly, computed in place on the buffer at x0.
//   off = byte offset of the first word of the butterfly. The four inputs
//         sit 32 bytes (8 words) apart; the odd word of each pair follows
//         its even word at +4.
// Clobbers x2-x12 and x14. No multiplies are needed in this stage.
.macro fft32_stage1_bfly off
    ldrsw   x2, [x0, #(\off)]           // x_0
    ldrsw   x3, [x0, #(\off + 32)]      // x_2
    ldrsw   x4, [x0, #(\off + 64)]      // x_4
    ldrsw   x5, [x0, #(\off + 96)]      // x_6
    add     w6, w2, w4                  // xh0_0 = x_0 + x_4
    sub     w7, w2, w4                  // xl0_0 = x_0 - x_4
    add     w8, w3, w5                  // xh0_1 = x_2 + x_6
    sub     w9, w3, w5                  // xl0_1 = x_2 - x_6

    ldrsw   x2, [x0, #(\off + 4)]       // x_1
    ldrsw   x3, [x0, #(\off + 36)]      // x_3
    ldrsw   x4, [x0, #(\off + 68)]      // x_5
    ldrsw   x5, [x0, #(\off + 100)]     // x_7
    add     w10, w2, w4                 // xh1_0 = x_1 + x_5
    sub     w11, w2, w4                 // xl1_0 = x_1 - x_5
    add     w12, w3, w5                 // xh1_1 = x_3 + x_7
    sub     w14, w3, w5                 // xl1_1 = x_3 - x_7

    add     w2, w6, w8                  // n00 = xh0_0 + xh0_1
    add     w3, w7, w14                 // n10 = xl0_0 + xl1_1
    sub     w4, w6, w8                  // n20 = xh0_0 - xh0_1
    sub     w5, w7, w14                 // n30 = xl0_0 - xl1_1
    str     w2, [x0, #(\off)]
    str     w3, [x0, #(\off + 32)]
    str     w4, [x0, #(\off + 64)]
    str     w5, [x0, #(\off + 96)]

    add     w2, w10, w12                // n01 = xh1_0 + xh1_1
    sub     w3, w11, w9                 // n11 = xl1_0 - xl0_1
    sub     w4, w10, w12                // n21 = xh1_0 - xh1_1
    add     w5, w11, w9                 // n31 = xl1_0 + xl0_1
    str     w2, [x0, #(\off + 4)]
    str     w3, [x0, #(\off + 36)]
    str     w4, [x0, #(\off + 68)]
    str     w5, [x0, #(\off + 100)]
.endm


.text
.p2align 2
.global ixheaacd_fft32x32_ld2_armv8

ixheaacd_fft32x32_ld2_armv8:

    // push_v_regs already covers x19/x20, so no extra pair is pushed here.
    push_v_regs

    mov     x0, x2                      // x0 = x (input, updated in place)
    mov     x1, x3                      // x1 = y (output)

    //-------------------------------------------------------------------
    // DIT Radix-4 FFT First Stage (in place on x)
    //-------------------------------------------------------------------
    fft32_stage1_bfly 0                 // x[0], x[8],  x[16], x[24] (+1)
    fft32_stage1_bfly 8                 // x[2], x[10], x[18], x[26] (+1)
    fft32_stage1_bfly 16                // x[4], x[12], x[20], x[28] (+1)
    fft32_stage1_bfly 24                // x[6], x[14], x[22], x[30] (+1)

    //-------------------------------------------------------------------
    // DIT Radix-4 FFT Second Stage (x -> y)
    //-------------------------------------------------------------------
    // First Butterfly: no twiddle multiplies.
    ldrsw   x2, [x0]                    // inp_0qr = x[0]
    ldrsw   x3, [x0, #8]                // inp_1qr = x[2]
    ldrsw   x4, [x0, #16]               // inp_2qr = x[4]
    ldrsw   x5, [x0, #24]               // inp_3qr = x[6]
    add     w6, w2, w4                  // sum_0qr = inp_0qr + inp_2qr
    sub     w7, w2, w4                  // sum_1qr = inp_0qr - inp_2qr
    add     w8, w3, w5                  // sum_2qr = inp_1qr + inp_3qr
    sub     w9, w3, w5                  // sum_3qr = inp_1qr - inp_3qr

    ldrsw   x2, [x0, #4]                // inp_0qi = x[1]
    ldrsw   x3, [x0, #12]               // inp_1qi = x[3]
    ldrsw   x4, [x0, #20]               // inp_2qi = x[5]
    ldrsw   x5, [x0, #28]               // inp_3qi = x[7]
    add     w10, w2, w4                 // sum_0qi = inp_0qi + inp_2qi
    sub     w11, w2, w4                 // sum_1qi = inp_0qi - inp_2qi
    add     w12, w3, w5                 // sum_2qi = inp_1qi + inp_3qi
    sub     w14, w3, w5                 // sum_3qi = inp_1qi - inp_3qi

    add     w2, w6, w8                  // sum_0qr + sum_2qr
    add     w3, w7, w14                 // sum_1qr + sum_3qi
    sub     w4, w6, w8                  // sum_0qr - sum_2qr
    sub     w5, w7, w14                 // sum_1qr - sum_3qi
    str     w2, [x1]                    // y[0 ]
    str     w3, [x1, #32]               // y[8 ]
    str     w4, [x1, #64]               // y[16]
    str     w5, [x1, #96]               // y[24]

    add     w2, w10, w12                // sum_0qi + sum_2qi
    sub     w3, w11, w9                 // sum_1qi - sum_3qr
    sub     w4, w10, w12                // sum_0qi - sum_2qi
    add     w5, w11, w9                 // sum_1qi + sum_3qr
    str     w2, [x1, #4]                // y[0 +1]
    str     w3, [x1, #36]               // y[8 +1]
    str     w4, [x1, #68]               // y[16+1]
    str     w5, [x1, #100]              // y[24+1]

    // Twiddle factors as sign-extended 16-bit constants. Each positive
    // value has its negation in a separate register (w11/w21, w12/w22,
    // w14/w24). These overwrite the w11/w12/w14 temporaries used above.
    mov     w11, #0x7642
    sxth    w11, w11                    // w11 =  0x7642
    mov     w21, #0x89BE
    sxth    w21, w21                    // w21 = -0x7642
    mov     w12, #0x30FC
    sxth    w12, w12                    // w12 =  0x30FC
    mov     w22, #0xCF04
    sxth    w22, w22                    // w22 = -0x30FC
    mov     w14, #0x5A83
    sxth    w14, w14                    // w14 =  0x5A83
    mov     w24, #0xA57D
    sxth    w24, w24                    // w24 = -0x5A83

    // In the butterflies below every "smull ; asr #16" pair computes
    // (32-bit sample * 16-bit twiddle) >> 16, i.e. mpy_16_32_ns();
    // x20 is the scratch register for the second product of each sum.

    // Second Butterfly
    ldrsw   x2, [x0, #32]               // mul_0qr = x[8]
    ldrsw   x3, [x0, #36]               // mul_0qi = x[9]
    ldrsw   x5, [x0, #40]               // inp_1qr = x[10]
    ldrsw   x6, [x0, #44]               // inp_1qi = x[11]

    smull   x4, w5, w11
    asr     x4, x4, #16                 // mul_1qr  = 0x7642 * inp_1qr
    smull   x20, w6, w12
    asr     x20, x20, #16
    add     w4, w4, w20                 // mul_1qr += 0x30FC * inp_1qi

    smull   x5, w5, w22
    asr     x5, x5, #16                 // mul_1qi  = -0x30FC * inp_1qr

    ldrsw   x7, [x0, #48]               // inp_2qr = x[12]
    ldrsw   x8, [x0, #52]               // inp_2qi = x[13]

    smull   x20, w6, w11
    asr     x20, x20, #16
    add     w5, w5, w20                 // mul_1qi += 0x7642 * inp_1qi

    add     w6, w7, w8                  // inp_2qr + inp_2qi
    smull   x6, w6, w14
    asr     x6, x6, #16                 // mul_2qr = 0x5A83 * (inp_2qr + inp_2qi)

    sub     w7, w8, w7                  // -inp_2qr + inp_2qi
    smull   x7, w7, w14
    asr     x7, x7, #16                 // mul_2qi = 0x5A83 * (-inp_2qr + inp_2qi)

    // 32-bit sign-extending load: x[14] is one word. A 64-bit load here
    // would also pull in x[15] and rely on little-endian byte order.
    ldrsw   x9, [x0, #56]               // inp_3qr = x[14]
    ldrsw   x10, [x0, #60]              // inp_3qi = x[15]

    smull   x8, w9, w12
    asr     x8, x8, #16                 // mul_3qr  = 0x30FC * inp_3qr
    smull   x20, w10, w11
    asr     x20, x20, #16
    add     w8, w8, w20                 // mul_3qr += 0x7642 * inp_3qi

    smull   x9, w9, w21
    asr     x9, x9, #16                 // mul_3qi  = -0x7642 * inp_3qr
    smull   x20, w10, w12
    asr     x20, x20, #16
    add     w9, w9, w20                 // mul_3qi += 0x30FC * inp_3qi

    add     w10, w2, w6, lsl #1         // sum_0qr = mul_0qr + (mul_2qr << 1)
    sub     w2, w2, w6, lsl #1          // sum_1qr = mul_0qr - (mul_2qr << 1)
    add     w6, w4, w8                  // sum_2qr = mul_1qr + mul_3qr
    sub     w4, w4, w8                  // sum_3qr = mul_1qr - mul_3qr

    add     w8, w3, w7, lsl #1          // sum_0qi = mul_0qi + (mul_2qi << 1)
    sub     w3, w3, w7, lsl #1          // sum_1qi = mul_0qi - (mul_2qi << 1)
    add     w7, w5, w9                  // sum_2qi = mul_1qi + mul_3qi
    sub     w5, w5, w9                  // sum_3qi = mul_1qi - mul_3qi

    add     w9, w10, w6, lsl #1         // sum_0qr + (sum_2qr << 1)
    sub     w10, w10, w6, lsl #1        // sum_0qr - (sum_2qr << 1)
    add     w6, w2, w5, lsl #1          // sum_1qr + (sum_3qi << 1)
    sub     w2, w2, w5, lsl #1          // sum_1qr - (sum_3qi << 1)
    str     w9, [x1, #8]                // y[2 ]
    str     w10, [x1, #72]              // y[18]
    str     w6, [x1, #40]               // y[10]
    str     w2, [x1, #104]              // y[26]

    add     w5, w8, w7, lsl #1          // sum_0qi + (sum_2qi << 1)
    sub     w8, w8, w7, lsl #1          // sum_0qi - (sum_2qi << 1)
    sub     w7, w3, w4, lsl #1          // sum_1qi - (sum_3qr << 1)
    add     w3, w3, w4, lsl #1          // sum_1qi + (sum_3qr << 1)
    str     w5, [x1, #12]               // y[2 +1]
    str     w8, [x1, #76]               // y[18+1]
    str     w7, [x1, #44]               // y[10+1]
    str     w3, [x1, #108]              // y[26+1]

    // Third Butterfly
    ldrsw   x2, [x0, #64]               // mul_0qr = x[16]
    ldrsw   x5, [x0, #72]               // inp_1qr = x[18]
    ldrsw   x6, [x0, #76]               // inp_1qi = x[19]
    ldrsw   x3, [x0, #68]               // mul_0qi = x[17]

    add     w4, w5, w6                  // inp_1qr + inp_1qi
    smull   x4, w4, w14
    asr     x4, x4, #16                 // mul_1qr = 0x5A83 * (inp_1qr + inp_1qi)

    sub     w5, w6, w5                  // -inp_1qr + inp_1qi
    smull   x5, w5, w14
    asr     x5, x5, #16                 // mul_1qi = 0x5A83 * (-inp_1qr + inp_1qi)

    // The third input needs no multiply: its two words are used swapped,
    // and the second is subtracted when sum_0qi is formed below.
    ldrsw   x6, [x0, #84]               // mul_2qr = inp_2qi = x[21]
    // 32-bit load for the same reason as x[14] above.
    ldrsw   x9, [x0, #88]               // inp_3qr = x[22]
    ldrsw   x10, [x0, #92]              // inp_3qi = x[23]
    ldrsw   x7, [x0, #80]               // mul_2qi = inp_2qr = x[20]

    sub     w8, w10, w9                 // -inp_3qr + inp_3qi
    smull   x8, w8, w14
    asr     x8, x8, #16                 // mul_3qr = 0x5A83 * (-inp_3qr + inp_3qi)

    add     w9, w9, w10                 // inp_3qr + inp_3qi
    smull   x9, w9, w24
    asr     x9, x9, #16                 // mul_3qi = -0x5A83 * (inp_3qr + inp_3qi)

    add     w10, w2, w6                 // sum_0qr = mul_0qr + mul_2qr
    sub     w2, w2, w6                  // sum_1qr = mul_0qr - mul_2qr
    add     w6, w4, w8                  // sum_2qr = mul_1qr + mul_3qr
    sub     w4, w4, w8                  // sum_3qr = mul_1qr - mul_3qr

    sub     w8, w3, w7                  // sum_0qi = mul_0qi - mul_2qi
    add     w3, w3, w7                  // sum_1qi = mul_0qi + mul_2qi
    add     w7, w5, w9                  // sum_2qi = mul_1qi + mul_3qi
    sub     w5, w5, w9                  // sum_3qi = mul_1qi - mul_3qi

    add     w9, w10, w6, lsl #1         // sum_0qr + (sum_2qr << 1)
    sub     w10, w10, w6, lsl #1        // sum_0qr - (sum_2qr << 1)
    add     w6, w2, w5, lsl #1          // sum_1qr + (sum_3qi << 1)
    sub     w2, w2, w5, lsl #1          // sum_1qr - (sum_3qi << 1)
    str     w9, [x1, #16]               // y[4 ]
    str     w10, [x1, #80]              // y[20]
    str     w6, [x1, #48]               // y[12]
    str     w2, [x1, #112]              // y[28]

    add     w5, w8, w7, lsl #1          // sum_0qi + (sum_2qi << 1)
    sub     w8, w8, w7, lsl #1          // sum_0qi - (sum_2qi << 1)
    sub     w7, w3, w4, lsl #1          // sum_1qi - (sum_3qr << 1)
    add     w3, w3, w4, lsl #1          // sum_1qi + (sum_3qr << 1)
    str     w5, [x1, #20]               // y[4 +1]
    str     w8, [x1, #84]               // y[20+1]
    str     w7, [x1, #52]               // y[12+1]
    str     w3, [x1, #116]              // y[28+1]

    // Fourth Butterfly
    ldrsw   x2, [x0, #96]               // mul_0qr = x[24]
    ldrsw   x3, [x0, #100]              // mul_0qi = x[25]
    ldrsw   x5, [x0, #104]              // inp_1qr = x[26]
    ldrsw   x6, [x0, #108]              // inp_1qi = x[27]

    smull   x4, w5, w12
    asr     x4, x4, #16                 // mul_1qr  = 0x30FC * inp_1qr
    smull   x20, w6, w11
    asr     x20, x20, #16
    add     w4, w4, w20                 // mul_1qr += 0x7642 * inp_1qi

    smull   x5, w5, w21
    asr     x5, x5, #16                 // mul_1qi  = -0x7642 * inp_1qr

    ldrsw   x7, [x0, #112]              // inp_2qr = x[28]
    ldrsw   x8, [x0, #116]              // inp_2qi = x[29]

    smull   x20, w6, w12
    asr     x20, x20, #16
    add     w5, w5, w20                 // mul_1qi += 0x30FC * inp_1qi

    sub     w6, w8, w7                  // -inp_2qr + inp_2qi
    smull   x6, w6, w14
    asr     x6, x6, #16                 // mul_2qr = 0x5A83 * (-inp_2qr + inp_2qi)

    add     w7, w8, w7                  // inp_2qr + inp_2qi
    smull   x7, w7, w24
    asr     x7, x7, #16                 // mul_2qi = -0x5A83 * (inp_2qr + inp_2qi)

    ldrsw   x9, [x0, #120]              // inp_3qr = x[30]
    ldrsw   x10, [x0, #124]             // inp_3qi = x[31]

    smull   x8, w9, w21
    asr     x8, x8, #16                 // mul_3qr  = -0x7642 * inp_3qr
    smull   x20, w10, w22
    asr     x20, x20, #16
    add     w8, w8, w20                 // mul_3qr += -0x30FC * inp_3qi

    smull   x9, w9, w12
    asr     x9, x9, #16                 // mul_3qi  = 0x30FC * inp_3qr
    smull   x20, w10, w21
    asr     x20, x20, #16
    add     w9, w9, w20                 // mul_3qi += -0x7642 * inp_3qi

    add     w10, w2, w6, lsl #1         // sum_0qr = mul_0qr + (mul_2qr << 1)
    sub     w2, w2, w6, lsl #1          // sum_1qr = mul_0qr - (mul_2qr << 1)
    add     w6, w4, w8                  // sum_2qr = mul_1qr + mul_3qr
    sub     w4, w4, w8                  // sum_3qr = mul_1qr - mul_3qr

    add     w8, w3, w7, lsl #1          // sum_0qi = mul_0qi + (mul_2qi << 1)
    sub     w3, w3, w7, lsl #1          // sum_1qi = mul_0qi - (mul_2qi << 1)
    add     w7, w5, w9                  // sum_2qi = mul_1qi + mul_3qi
    sub     w5, w5, w9                  // sum_3qi = mul_1qi - mul_3qi

    add     w9, w10, w6, lsl #1         // sum_0qr + (sum_2qr << 1)
    sub     w10, w10, w6, lsl #1        // sum_0qr - (sum_2qr << 1)
    add     w6, w2, w5, lsl #1          // sum_1qr + (sum_3qi << 1)
    sub     w2, w2, w5, lsl #1          // sum_1qr - (sum_3qi << 1)
    str     w9, [x1, #24]               // y[6 ]
    str     w10, [x1, #88]              // y[22]
    str     w6, [x1, #56]               // y[14]
    str     w2, [x1, #120]              // y[30]

    add     w5, w8, w7, lsl #1          // sum_0qi + (sum_2qi << 1)
    sub     w8, w8, w7, lsl #1          // sum_0qi - (sum_2qi << 1)
    sub     w7, w3, w4, lsl #1          // sum_1qi - (sum_3qr << 1)
    add     w3, w3, w4, lsl #1          // sum_1qi + (sum_3qr << 1)
    str     w5, [x1, #28]               // y[6 +1]
    str     w8, [x1, #92]               // y[22+1]
    str     w7, [x1, #60]               // y[14+1]
    str     w3, [x1, #124]              // y[30+1]

    pop_v_regs
    ret
