1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H 18 #define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H 19 20 namespace android { 21 22 // depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h 23 24 #if USE_NEON 25 // 26 // NEON specializations are enabled for Process() and ProcessL() 27 // 28 // TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary) 29 // and looping stride 16 (or vice versa). This has some polyphase coef data alignment 30 // issues with S16 coefs. Consider this later. 31 32 // Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out. 
// Mono epilogue: fold the four partial sums held in q0 into one value, apply
// the volume pair from [vLR] with a saturating rounding doubling multiply,
// and accumulate the duplicated L/R result into the stereo frame at %[out].
// Uses d0-d3 (q0, q1); callers must list q0 and q1 as clobbered.
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

// Stereo epilogue: fold the left partial sums (q0) and right partial sums
// (q4) into an L/R pair, apply per-channel volume from [vLR], and accumulate
// into the stereo frame at %[out] with saturation.
// Uses d0-d3, d8-d9 (q0, q1, q4); callers must list those as clobbered.
#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes*/\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output*/\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0*/\
        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4*/\
        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R*/\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume*/\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating)*/\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result*/

// FIR kernel: mono, stride 16, 16-bit coefficients, no phase interpolation.
// Each loop pass consumes 8 positive-side samples (sP, walked backwards and
// reversed in-register) and 8 negative-side samples (sN, walked forwards),
// multiply-accumulating against 8 coefficients from each coefficient bank.
template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed)samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed)samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q10"
    );
}

// FIR kernel: stereo, stride 16, 16-bit coefficients, no phase interpolation.
// Deinterleaves 8 stereo frames per pass (vld2) into left (q2/q5) and right
// (q3/q6) lanes, accumulating left into q0 and right into q4.
template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse right positive

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q10"
    );
}

// FIR kernel: mono, stride 16, 16-bit coefficients WITH phase interpolation.
// Coefficients are linearly interpolated between the (coefsP, coefsP1) and
// (coefsN0, coefsN1) banks by the Q15-style fraction lerpP (held in d2[0])
// before the multiply-accumulate.
template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11"
    );
}

// FIR kernel: stereo, stride 16, 16-bit coefficients WITH phase interpolation.
template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
        "vrev64.16      q3, q3                   \n"// (1) reverse 8 frames of the right positive

        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11"
    );
}

// FIR kernel: mono, stride 16, 32-bit coefficients, no phase interpolation.
// 16-bit samples are widened to 31-bit (vshll #15) and combined with the
// 32-bit coefficients via saturating rounding doubling multiplies.
template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

// FIR kernel: stereo, stride 16, 32-bit coefficients, no phase interpolation.
// Left and right channels are processed in two widen/multiply/accumulate
// passes per loop iteration (left into q0, right into q4).
template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 4 16-bits stereo samples
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 4 16-bits stereo samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 4 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d10, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q15, d11, #15                 \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result

        "vshll.s16      q12, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d7, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d12, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q15, d13, #15                 \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

// FIR kernel: mono, stride 16, 32-bit coefficients WITH phase interpolation.
// Both coefficient banks are interpolated by lerpP (d2[0]) before the
// widened samples are multiplied and accumulated.
template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        :
          "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

// FIR kernel: stereo, stride 16, 32-bit coefficients WITH phase interpolation.
// Interpolates both coefficient banks by lerpP (d2[0]), then runs the
// widen/multiply/accumulate sequence twice per pass: left into q0, right
// into q4.
template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 4 16-bits stereo samples
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 4 16-bits stereo samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d10, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q15, d11, #15                 \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result

        "vshll.s16      q12, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d7, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d12, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q15, d13, #15                 \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

// FIR kernel: mono, stride 8, 16-bit coefficients, no phase interpolation.
// Half-width variant of the stride-16 kernel: 4 samples and 4 coefficients
// per side per loop pass, using d-registers and 64-bit-aligned coef loads.
template <>
inline void ProcessL<1, 8>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples
        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples
        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs

        "vrev64.16      d4, d4                   \n"// (1) reverse s3, s2, s1, s0

        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed)samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs           %[count], %[count], #4   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #8         \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q10"
    );
}

// FIR kernel: stereo, stride 8, 16-bit coefficients, no phase interpolation.
// vld2 deinterleaves 4 stereo frames into left (d4/d6) and right (d5/d7);
// left accumulates into q0, right into q4.
template <>
inline void ProcessL<2, 8>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {d4, d5}, [%[sP]]        \n"// (2+0d) load 8 16-bits stereo samples
        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (2) load 8 16-bits stereo samples
        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive

        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs           %[count], %[count], #4   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q10"
    );
}

// FIR kernel: mono, stride 8, 16-bit coefficients WITH phase interpolation.
// Coefficient banks are interpolated by lerpP (d2[0]) before the
// multiply-accumulate; 4 samples per side per loop pass.
template <>
inline void Process<1, 8>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples
        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples
        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation

        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      d4, d4                   \n"// (1) reverse s3, s2, s1, s0

        "vadd.s16       d16, d16, d17            \n"// (1+2d) interpolate (step3) 1st set
        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set

        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d16              \n"// (1+0d) multiply (reversed)by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs           %[count], %[count], #4   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11"
    );
}

// FIR kernel: stereo, stride 8, 16-bit coefficients WITH phase interpolation.
template <>
inline void Process<2, 8>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {d4, d5}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive

        "vadd.s16       d16, d16, d17            \n"// (1+1d) interpolate (step3) 1st set
        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set

        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs           %[count], %[count], #4   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11"
    );
}

// FIR kernel: mono, stride 8, 32-bit coefficients, no phase interpolation.
// Samples are widened to 31 bits (vshll #15) and combined with the 32-bit
// coefficients via saturating rounding doubling multiplies.
template <>
inline void ProcessL<1, 8>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// result, initialize to 0

        "1:                                      \n"

        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs

        "vrev64.16      d4, d4                   \n"// reverse 2 frames of the positive side

        "vshll.s16      q12, d4, #15             \n"// (stall) extend samples to 31 bits
        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q0, q0, q14              \n"// (stall) accumulate result

        "subs           %[count], %[count], #4   \n"// update loop counter
        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q14"
    );
}

// FIR kernel: stereo, stride 8, 32-bit coefficients, no phase interpolation.
// vld2 deinterleaves 4 stereo frames; both channels share the same coef
// vectors (q8, q10); left accumulates into q0, right into q4.
template <>
inline void ProcessL<2, 8>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// result, initialize to 0
        "veor           q4, q4, q4               \n"// result, initialize to 0

        "1:                                      \n"

        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples
        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples
        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs

        "vrev64.16      q2, q2                   \n"// reverse 2 frames of the positive side

        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q4, q4, q13              \n"// accumulate result
        "vadd.s32       q0, q0, q14              \n"// accumulate result
        "vadd.s32       q4, q4, q15              \n"// accumulate result

        "subs           %[count], %[count], #4   \n"// update loop counter
        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3", "q4",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

// FIR kernel: mono, stride 8, 32-bit coefficients WITH phase interpolation.
template <>
inline void Process<1, 8>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const
volumeLR) 1007 { 1008 const int CHANNELS = 1; // template specialization does not preserve params 1009 const int STRIDE = 8; 1010 sP -= CHANNELS*((STRIDE>>1)-1); 1011 asm ( 1012 "vmov.32 d2[0], %[lerpP] \n"// load the positive phase 1013 "veor q0, q0, q0 \n"// result, initialize to 0 1014 1015 "1: \n" 1016 1017 "vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples 1018 "vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples 1019 "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs 1020 "vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation 1021 "vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs 1022 "vld1.32 {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation 1023 1024 "vrev64.16 d4, d4 \n"// reverse 2 frames of the positive side 1025 1026 "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs 1027 "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets 1028 "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits 1029 1030 "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs 1031 "vqrdmulh.s32 q11, q11, d2[0] \n"// interpolate (step2) 2nd set of coefs 1032 "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits 1033 1034 "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set 1035 "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set 1036 1037 "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef 1038 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef 1039 1040 "vadd.s32 q0, q0, q12 \n"// accumulate result 1041 "vadd.s32 q0, q0, q14 \n"// accumulate result 1042 1043 "subs %[count], %[count], #4 \n"// update loop counter 1044 "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples 1045 1046 "bne 1b \n"// loop 1047 1048 ASSEMBLY_ACCUMULATE_MONO 1049 1050 : [out] "=Uv" (out[0]), 1051 [count] "+r" (count), 1052 [coefsP0] "+r" (coefsP), 1053 [coefsP1] "+r" (coefsP1), 1054 [coefsN0] "+r" (coefsN), 1055 [coefsN1] "+r" 
(coefsN1), 1056 [sP] "+r" (sP), 1057 [sN] "+r" (sN) 1058 : [lerpP] "r" (lerpP), 1059 [vLR] "r" (volumeLR) 1060 : "cc", "memory", 1061 "q0", "q1", "q2", "q3", 1062 "q8", "q9", "q10", "q11", 1063 "q12", "q14" 1064 ); 1065 } 1066 1067 template <> 1068 inline 1069 void Process<2, 8>(int32_t* const out, 1070 int count, 1071 const int32_t* coefsP, 1072 const int32_t* coefsN, 1073 const int32_t* coefsP1, 1074 const int32_t* coefsN1, 1075 const int16_t* sP, 1076 const int16_t* sN, 1077 uint32_t lerpP, 1078 const int32_t* const volumeLR) 1079 { 1080 const int CHANNELS = 2; // template specialization does not preserve params 1081 const int STRIDE = 8; 1082 sP -= CHANNELS*((STRIDE>>1)-1); 1083 asm ( 1084 "vmov.32 d2[0], %[lerpP] \n"// load the positive phase 1085 "veor q0, q0, q0 \n"// result, initialize to 0 1086 "veor q4, q4, q4 \n"// result, initialize to 0 1087 1088 "1: \n" 1089 "vld2.16 {d4, d5}, [%[sP]] \n"// load 4 16-bits stereo samples 1090 "vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 16-bits stereo samples 1091 "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs 1092 "vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation 1093 "vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs 1094 "vld1.32 {q11}, [%[coefsN0]:128]! 
\n"// load 4 32-bits coefs for interpolation 1095 1096 "vrev64.16 q2, q2 \n"// (reversed) 2 frames of the positive side 1097 1098 "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs 1099 "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets 1100 "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits 1101 "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits 1102 1103 "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs 1104 "vqrdmulh.s32 q11, q11, d2[1] \n"// interpolate (step3) 2nd set of coefs 1105 "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits 1106 "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits 1107 1108 "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set 1109 "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set 1110 1111 "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef 1112 "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef 1113 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef 1114 "vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by interpolated coef 1115 1116 "vadd.s32 q0, q0, q12 \n"// accumulate result 1117 "vadd.s32 q4, q4, q13 \n"// accumulate result 1118 "vadd.s32 q0, q0, q14 \n"// accumulate result 1119 "vadd.s32 q4, q4, q15 \n"// accumulate result 1120 1121 "subs %[count], %[count], #4 \n"// update loop counter 1122 "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples 1123 1124 "bne 1b \n"// loop 1125 1126 ASSEMBLY_ACCUMULATE_STEREO 1127 1128 : [out] "=Uv" (out[0]), 1129 [count] "+r" (count), 1130 [coefsP0] "+r" (coefsP), 1131 [coefsP1] "+r" (coefsP1), 1132 [coefsN0] "+r" (coefsN), 1133 [coefsN1] "+r" (coefsN1), 1134 [sP] "+r" (sP), 1135 [sN] "+r" (sN) 1136 : [lerpP] "r" (lerpP), 1137 [vLR] "r" (volumeLR) 1138 : "cc", "memory", 1139 "q0", "q1", "q2", "q3", "q4", 1140 "q8", "q9", "q10", "q11", 1141 "q12", "q13", "q14", "q15" 1142 ); 1143 } 1144 1145 #endif //USE_NEON 1146 1147 }; // namespace android 1148 
1149 #endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/ 1150