/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <audio_utils/primitives.h>
#include <string.h>
#include "private/private.h"

void ditherAndClamp(int32_t *out, const int32_t *sums, size_t pairs)
{
    for (; pairs > 0; --pairs) {
        const int32_t l = clamp16(*sums++ >> 12);
        const int32_t r = clamp16(*sums++ >> 12);
        *out++ = (r << 16) | (l & 0xFFFF);
    }
}

void memcpy_to_i16_from_q4_27(int16_t *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clamp16(*src++ >> 12);
    }
}

void memcpy_to_i16_from_u8(int16_t *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        *--dst = (int16_t)(*--src - 0x80) << 8;
    }
}

void memcpy_to_u8_from_i16(uint8_t *dst, const int16_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = (*src++ >> 8) + 0x80;
    }
}

void memcpy_to_u8_from_p24(uint8_t *dst, const uint8_t *src, size_t count)
{
    for (; count > 0; --count) {
#if HAVE_BIG_ENDIAN
        *dst++ = src[0] + 0x80;
#else
        *dst++ = src[2] + 0x80;
#endif
        src += 3;
    }
}

void memcpy_to_u8_from_i32(uint8_t *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = (*src++ >> 24) + 0x80;
    }
}

void memcpy_to_u8_from_q8_23(uint8_t *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clamp8_from_q8_23(*src++);
    }
}

void memcpy_to_u8_from_float(uint8_t *dst, const float *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clamp8_from_float(*src++);
    }
}

void memcpy_to_i16_from_i32(int16_t *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = *src++ >> 16;
    }
}

void memcpy_to_i16_from_float(int16_t *dst, const float *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clamp16_from_float(*src++);
    }
}

void memcpy_to_float_from_q4_27(float *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = float_from_q4_27(*src++);
    }
}

void memcpy_to_float_from_i16(float *dst, const int16_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        *--dst = float_from_i16(*--src);
    }
}

void memcpy_to_float_from_u8(float *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        *--dst = float_from_u8(*--src);
    }
}

void memcpy_to_float_from_p24(float *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count * 3;
    for (; count > 0; --count) {
        src -= 3;
        *--dst = float_from_p24(src);
    }
}

void memcpy_to_i16_from_p24(int16_t *dst, const uint8_t *src, size_t count)
{
    for (; count > 0; --count) {
#if HAVE_BIG_ENDIAN
        *dst++ = src[1] | (src[0] << 8);
#else
        *dst++ = src[1] | (src[2] << 8);
#endif
        src += 3;
    }
}
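/*
 * Illustrative sketch (not part of the original file): how the packed 24-bit
 * converters above might be used.  The buffer contents and the helper name
 * example_p24_to_i16 are hypothetical; the conversion call itself is the
 * function defined directly above.
 */
#if 0
static void example_p24_to_i16(void)
{
    /* Two little-endian packed 24-bit samples, 3 bytes each. */
    const uint8_t p24_in[6] = { 0x00, 0x00, 0x40,
                                0x00, 0x00, 0xC0 };
    int16_t i16_out[2];

    /* count is in samples, not bytes: 2 samples here. */
    memcpy_to_i16_from_p24(i16_out, p24_in, 2);
    /* On a little-endian build, i16_out now holds { 0x4000, (int16_t)0xC000 }. */
}
#endif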
void memcpy_to_i32_from_p24(int32_t *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count * 3;
    for (; count > 0; --count) {
        src -= 3;
#if HAVE_BIG_ENDIAN
        *--dst = (src[2] << 8) | (src[1] << 16) | (src[0] << 24);
#else
        *--dst = (src[0] << 8) | (src[1] << 16) | (src[2] << 24);
#endif
    }
}

void memcpy_to_p24_from_i16(uint8_t *dst, const int16_t *src, size_t count)
{
    dst += count * 3;
    src += count;
    for (; count > 0; --count) {
        dst -= 3;
        const int16_t sample = *--src;
#if HAVE_BIG_ENDIAN
        dst[0] = sample >> 8;
        dst[1] = sample;
        dst[2] = 0;
#else
        dst[0] = 0;
        dst[1] = sample;
        dst[2] = sample >> 8;
#endif
    }
}

void memcpy_to_p24_from_float(uint8_t *dst, const float *src, size_t count)
{
    for (; count > 0; --count) {
        int32_t ival = clamp24_from_float(*src++);

#if HAVE_BIG_ENDIAN
        *dst++ = ival >> 16;
        *dst++ = ival >> 8;
        *dst++ = ival;
#else
        *dst++ = ival;
        *dst++ = ival >> 8;
        *dst++ = ival >> 16;
#endif
    }
}

void memcpy_to_p24_from_q8_23(uint8_t *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        int32_t ival = clamp24_from_q8_23(*src++);

#if HAVE_BIG_ENDIAN
        *dst++ = ival >> 16;
        *dst++ = ival >> 8;
        *dst++ = ival;
#else
        *dst++ = ival;
        *dst++ = ival >> 8;
        *dst++ = ival >> 16;
#endif
    }
}

void memcpy_to_p24_from_i32(uint8_t *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        int32_t ival = *src++ >> 8;

#if HAVE_BIG_ENDIAN
        *dst++ = ival >> 16;
        *dst++ = ival >> 8;
        *dst++ = ival;
#else
        *dst++ = ival;
        *dst++ = ival >> 8;
        *dst++ = ival >> 16;
#endif
    }
}

void memcpy_to_q8_23_from_i16(int32_t *dst, const int16_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        *--dst = (int32_t)*--src << 8;
    }
}

void memcpy_to_q8_23_from_float_with_clamp(int32_t *dst, const float *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clamp24_from_float(*src++);
    }
}

void memcpy_to_q8_23_from_p24(int32_t *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count * 3;
    for (; count > 0; --count) {
        src -= 3;
#if HAVE_BIG_ENDIAN
        *--dst = (int8_t)src[0] << 16 | src[1] << 8 | src[2];
#else
        *--dst = (int8_t)src[2] << 16 | src[1] << 8 | src[0];
#endif
    }
}

void memcpy_to_q4_27_from_float(int32_t *dst, const float *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clampq4_27_from_float(*src++);
    }
}

void memcpy_to_i16_from_q8_23(int16_t *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clamp16(*src++ >> 8);
    }
}

void memcpy_to_float_from_q8_23(float *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = float_from_q8_23(*src++);
    }
}

void memcpy_to_i32_from_u8(int32_t *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        *--dst = ((int32_t)(*--src) - 0x80) << 24;
    }
}

void memcpy_to_i32_from_i16(int32_t *dst, const int16_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        *--dst = (int32_t)*--src << 16;
    }
}
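/*
 * Illustrative sketch (not part of the original file): the widening converters
 * above walk from the end of the buffers toward the start, which appears
 * intended to allow in-place expansion when the destination and source begin
 * at the same address (see the documentation in audio_utils/primitives.h).
 * The helper name and buffer below are hypothetical.
 */
#if 0
static void example_in_place_widen(void)
{
    /* Buffer sized for the widened output; the first 4 int16_t slots hold the input. */
    int32_t buf[4] = { 0 };
    int16_t *in = (int16_t *)buf; /* sketch only; a real caller should avoid type punning */

    in[0] = 0x1000;
    in[1] = -0x1000;
    in[2] = 0x7FFF;
    in[3] = -0x8000;

    /* Expands 4 samples in place; each i16 value ends up shifted into the top 16 bits. */
    memcpy_to_i32_from_i16(buf, in, 4);
}
#endif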
void memcpy_to_i32_from_float(int32_t *dst, const float *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clamp32_from_float(*src++);
    }
}

void memcpy_to_float_from_i32(float *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = float_from_i32(*src++);
    }
}

void memcpy_to_float_from_float_with_clamping(float *dst, const float *src, size_t count,
        float absMax) {
    // Note: using NEON intrinsics (vminq_f32, vld1q_f32...) did NOT accelerate
    // the function when benchmarked. The compiler already vectorizes using FMINNM f32x4 & similar.
    // Note: clamping induces a ~20% overhead compared to memcpy for count in [64, 512].
    // See primitives_benchmark.
    for (; count > 0; --count) {
        const float sample = *src++;
        *dst++ = fmax(-absMax, fmin(absMax, sample));
    }
}

void downmix_to_mono_i16_from_stereo_i16(int16_t *dst, const int16_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = (int16_t)(((int32_t)src[0] + (int32_t)src[1]) >> 1);
        src += 2;
    }
}

void upmix_to_stereo_i16_from_mono_i16(int16_t *dst, const int16_t *src, size_t count)
{
    dst += count * 2;
    src += count;
    for (; count > 0; --count) {
        const int32_t temp = *--src;
        dst -= 2;
        dst[0] = temp;
        dst[1] = temp;
    }
}

void downmix_to_mono_float_from_stereo_float(float *dst, const float *src, size_t frames)
{
    for (; frames > 0; --frames) {
        *dst++ = (src[0] + src[1]) * 0.5;
        src += 2;
    }
}

void upmix_to_stereo_float_from_mono_float(float *dst, const float *src, size_t frames)
{
    dst += frames * 2;
    src += frames;
    for (; frames > 0; --frames) {
        const float temp = *--src;
        dst -= 2;
        dst[0] = temp;
        dst[1] = temp;
    }
}

size_t nonZeroMono32(const int32_t *samples, size_t count)
{
    size_t nonZero = 0;
    for (; count > 0; --count) {
        nonZero += *samples++ != 0;
    }
    return nonZero;
}

size_t nonZeroMono16(const int16_t *samples, size_t count)
{
    size_t nonZero = 0;
    for (; count > 0; --count) {
        nonZero += *samples++ != 0;
    }
    return nonZero;
}

size_t nonZeroStereo32(const int32_t *frames, size_t count)
{
    size_t nonZero = 0;
    for (; count > 0; --count) {
        nonZero += frames[0] != 0 || frames[1] != 0;
        frames += 2;
    }
    return nonZero;
}

size_t nonZeroStereo16(const int16_t *frames, size_t count)
{
    size_t nonZero = 0;
    for (; count > 0; --count) {
        nonZero += frames[0] != 0 || frames[1] != 0;
        frames += 2;
    }
    return nonZero;
}
/*
 * C macro to do channel mask copying independent of dst/src sample type.
 * Don't pass in any expressions for the macro arguments here.
 */
#define copy_frame_by_mask(dst, dmask, src, smask, count, zero) \
{ \
    uint32_t bit, ormask; \
    for (; (count) > 0; --(count)) { \
        ormask = (dmask) | (smask); \
        while (ormask) { \
            bit = ormask & -ormask; /* get lowest bit */ \
            ormask ^= bit; /* remove lowest bit */ \
            if ((dmask) & bit) { \
                *(dst)++ = (smask) & bit ? *(src)++ : (zero); \
            } else { /* source channel only */ \
                ++(src); \
            } \
        } \
    } \
}

void memcpy_by_channel_mask(void *dst, uint32_t dst_mask,
        const void *src, uint32_t src_mask, size_t sample_size, size_t count)
{
#if 0
    /* alternate way of handling memcpy_by_channel_mask by using the idxary */
    int8_t idxary[32];
    uint32_t src_channels = __builtin_popcount(src_mask);
    uint32_t dst_channels =
            memcpy_by_index_array_initialization(idxary, 32, dst_mask, src_mask);

    memcpy_by_idxary(dst, dst_channels, src, src_channels, idxary, sample_size, count);
#else
    if (dst_mask == src_mask) {
        memcpy(dst, src, sample_size * __builtin_popcount(dst_mask) * count);
        return;
    }
    switch (sample_size) {
    case 1: {
        uint8_t *udst = (uint8_t*)dst;
        const uint8_t *usrc = (const uint8_t*)src;

        copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, 0);
    } break;
    case 2: {
        uint16_t *udst = (uint16_t*)dst;
        const uint16_t *usrc = (const uint16_t*)src;

        copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, 0);
    } break;
    case 3: { /* could be slow. use a struct to represent 3 bytes of data. */
        uint8x3_t *udst = (uint8x3_t*)dst;
        const uint8x3_t *usrc = (const uint8x3_t*)src;
        static const uint8x3_t zero; /* tricky - we use this to zero out a sample */

        copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, zero);
    } break;
    case 4: {
        uint32_t *udst = (uint32_t*)dst;
        const uint32_t *usrc = (const uint32_t*)src;

        copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, 0);
    } break;
    default:
        abort(); /* illegal value */
        break;
    }
#endif
}
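/*
 * Illustrative sketch (not part of the original file): copying a 2-channel
 * source into a 4-channel destination with memcpy_by_channel_mask.  The mask
 * values, buffers, and helper name are hypothetical; destination channels that
 * are absent from the source are zero-filled by the macro above.
 */
#if 0
static void example_copy_by_channel_mask(void)
{
    const int16_t stereo_in[2 * 4] = { 1, 2, 3, 4, 5, 6, 7, 8 }; /* 4 frames, 2 channels */
    int16_t quad_out[4 * 4];                                     /* 4 frames, 4 channels */

    memcpy_by_channel_mask(quad_out, 0xF /* 4 lowest channel bits */,
            stereo_in, 0x3 /* 2 lowest channel bits */,
            sizeof(int16_t), 4 /* frames */);
    /* Each output frame is now { left, right, 0, 0 }. */
}
#endif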
/*
 * C macro to do copying by index array, to rearrange samples
 * within a frame. This is independent of src/dst sample type.
 * Don't pass in any expressions for the macro arguments here.
 */
#define copy_frame_by_idx(dst, dst_channels, src, src_channels, idxary, count, zero) \
{ \
    unsigned i; \
    int index; \
    for (; (count) > 0; --(count)) { \
        for (i = 0; i < (dst_channels); ++i) { \
            index = (idxary)[i]; \
            *(dst)++ = index < 0 ? (zero) : (src)[index]; \
        } \
        (src) += (src_channels); \
    } \
}

void memcpy_by_index_array(void *dst, uint32_t dst_channels,
        const void *src, uint32_t src_channels,
        const int8_t *idxary, size_t sample_size, size_t count)
{
    switch (sample_size) {
    case 1: {
        uint8_t *udst = (uint8_t*)dst;
        const uint8_t *usrc = (const uint8_t*)src;

        copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, 0);
    } break;
    case 2: {
        uint16_t *udst = (uint16_t*)dst;
        const uint16_t *usrc = (const uint16_t*)src;

        copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, 0);
    } break;
    case 3: { /* could be slow. use a struct to represent 3 bytes of data. */
        uint8x3_t *udst = (uint8x3_t*)dst;
        const uint8x3_t *usrc = (const uint8x3_t*)src;
        static const uint8x3_t zero;

        copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, zero);
    } break;
    case 4: {
        uint32_t *udst = (uint32_t*)dst;
        const uint32_t *usrc = (const uint32_t*)src;

        copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, 0);
    } break;
    default:
        abort(); /* illegal value */
        break;
    }
}

size_t memcpy_by_index_array_initialization(int8_t *idxary, size_t idxcount,
        uint32_t dst_mask, uint32_t src_mask)
{
    size_t n = 0;
    int srcidx = 0;
    uint32_t bit, ormask = src_mask | dst_mask;

    while (ormask && n < idxcount) {
        bit = ormask & -ormask; /* get lowest bit */
        ormask ^= bit; /* remove lowest bit */
        if (src_mask & dst_mask & bit) { /* matching channel */
            idxary[n++] = srcidx++;
        } else if (src_mask & bit) { /* source channel only */
            ++srcidx;
        } else { /* destination channel only */
            idxary[n++] = -1;
        }
    }
    return n + __builtin_popcount(ormask & dst_mask);
}

size_t memcpy_by_index_array_initialization_src_index(int8_t *idxary, size_t idxcount,
        uint32_t dst_mask, uint32_t src_mask) {
    size_t dst_count = __builtin_popcount(dst_mask);
    if (idxcount == 0) {
        return dst_count;
    }
    if (dst_count > idxcount) {
        dst_count = idxcount;
    }

    size_t src_idx, dst_idx;
    for (src_idx = 0, dst_idx = 0; dst_idx < dst_count; ++dst_idx) {
        if (src_mask & 1) {
            idxary[dst_idx] = src_idx++;
        } else {
            idxary[dst_idx] = -1;
        }
        src_mask >>= 1;
    }
    return dst_idx;
}

size_t memcpy_by_index_array_initialization_dst_index(int8_t *idxary, size_t idxcount,
        uint32_t dst_mask, uint32_t src_mask) {
    size_t src_idx, dst_idx;
    size_t dst_count = __builtin_popcount(dst_mask);
    size_t src_count = __builtin_popcount(src_mask);
    if (idxcount == 0) {
        return dst_count;
    }
    if (dst_count > idxcount) {
        dst_count = idxcount;
    }
    for (src_idx = 0, dst_idx = 0; dst_idx < dst_count; ++src_idx) {
        if (dst_mask & 1) {
            idxary[dst_idx++] = src_idx < src_count ? (signed)src_idx : -1;
        }
        dst_mask >>= 1;
    }
    return dst_idx;
}
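/*
 * Illustrative sketch (not part of the original file): pairing
 * memcpy_by_index_array_initialization with memcpy_by_index_array to drop the
 * middle channel of a 3-channel stream.  The masks, buffers, and helper name
 * are hypothetical.
 */
#if 0
static void example_copy_by_index_array(void)
{
    int8_t idxary[32];
    const uint32_t src_mask = 0x7; /* channels 0, 1, 2 present in the source */
    const uint32_t dst_mask = 0x5; /* keep only channels 0 and 2 */

    const size_t dst_channels =
            memcpy_by_index_array_initialization(idxary, 32, dst_mask, src_mask);
    const size_t src_channels = __builtin_popcount(src_mask);

    const int16_t in[3 * 2] = { 10, 11, 12, 20, 21, 22 }; /* 2 frames of 3 channels */
    int16_t out[2 * 2];                                   /* 2 frames of 2 channels */

    memcpy_by_index_array(out, dst_channels, in, src_channels,
            idxary, sizeof(int16_t), 2 /* frames */);
    /* out is now { 10, 12, 20, 22 }. */
}
#endif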
void accumulate_i16(int16_t *dst, const int16_t *src, size_t count) {
    while (count--) {
        *dst = clamp16((int32_t)*dst + *src++);
        ++dst;
    }
}

void accumulate_u8(uint8_t *dst, const uint8_t *src, size_t count) {
    int32_t sum;
    for (; count > 0; --count) {
        // 8-bit samples are centered around 0x80.
        sum = *dst + *src++ - 0x80;
        // Clamp to [0, 0xff].
        *dst++ = (sum & 0x100) ? (~sum >> 9) : sum;
    }
}

void accumulate_p24(uint8_t *dst, const uint8_t *src, size_t count) {
    for (; count > 0; --count) {
        // Unpack.
        int32_t dst_q8_23 = 0;
        int32_t src_q8_23 = 0;
        memcpy_to_q8_23_from_p24(&dst_q8_23, dst, 1);
        memcpy_to_q8_23_from_p24(&src_q8_23, src, 1);

        // Accumulate and overwrite.
        dst_q8_23 += src_q8_23;
        memcpy_to_p24_from_q8_23(dst, &dst_q8_23, 1);

        // Move on to next sample.
        dst += 3;
        src += 3;
    }
}

void accumulate_q8_23(int32_t *dst, const int32_t *src, size_t count) {
    for (; count > 0; --count) {
        *dst = clamp24_from_q8_23(*dst + *src++);
        ++dst;
    }
}

void accumulate_i32(int32_t *dst, const int32_t *src, size_t count) {
    for (; count > 0; --count) {
        *dst = clamp32((int64_t)*dst + *src++);
        ++dst;
    }
}

void accumulate_float(float *dst, const float *src, size_t count) {
    for (; count > 0; --count) {
        *dst++ += *src++;
    }
}
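/*
 * Illustrative sketch (not part of the original file): mixing one 16-bit track
 * into another with the saturating accumulator above.  The buffers and the
 * helper name are hypothetical.
 */
#if 0
static void example_accumulate_i16(void)
{
    int16_t mix_bus[4] = { 1000, -1000, 32000, -32000 };
    const int16_t track[4] = { 500, -500, 2000, -2000 };

    /* mix_bus becomes { 1500, -1500, 32767, -32768 }; the last two samples saturate. */
    accumulate_i16(mix_bus, track, 4);
}
#endif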