1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <audio_utils/primitives.h>
18 #include <string.h>
19 #include "private/private.h"
20 
/*
 * Pack interleaved stereo accumulator samples into 32-bit stereo words.
 *
 * Every input sample is shifted right by 12 bits and passed through
 * clamp16().  The left result occupies the low 16 bits of the output word,
 * the right result the high 16 bits.  Despite the name, no dither is added
 * in this implementation.
 *
 *  out    destination, one 32-bit word per stereo pair
 *  sums   source, two 32-bit samples (left, then right) per pair
 *  pairs  number of stereo pairs to convert
 */
void ditherAndClamp(int32_t *out, const int32_t *sums, size_t pairs)
{
    for (; pairs > 0; --pairs) {
        const int32_t l = clamp16(*sums++ >> 12);
        const int32_t r = clamp16(*sums++ >> 12);
        /*
         * Pack in unsigned arithmetic: left-shifting a negative signed value
         * is undefined behavior in C, and r is negative for half its range.
         */
        *out++ = (int32_t)(((uint32_t)r << 16) | ((uint32_t)l & 0xFFFF));
    }
}
29 
/*
 * Convert count 32-bit samples to 16-bit: each source sample is shifted
 * right by 12 bits and then passed through clamp16().
 * Samples are processed from first to last.
 */
void memcpy_to_i16_from_q4_27(int16_t *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp16(src[i] >> 12);
    }
}
36 
/*
 * Convert count unsigned 8-bit samples (centered on 0x80) to signed 16-bit.
 *
 * Each output is (sample - 0x80) scaled by 256, i.e. the 8-bit value placed
 * in the upper byte of the 16-bit result.
 *
 * Samples are processed from last to first, so the source is never
 * overwritten before it is read when dst and src start at the same address.
 *
 *  dst    destination buffer of count int16_t samples
 *  src    source buffer of count uint8_t samples
 *  count  number of samples
 */
void memcpy_to_i16_from_u8(int16_t *dst, const uint8_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        /*
         * Multiply rather than shift: (sample - 0x80) is negative for half
         * the input range and left-shifting a negative value is undefined.
         * The product lies in [-32768, 32512], so it always fits.
         */
        dst[i] = (int16_t)((src[i] - 0x80) * 256);
    }
}
45 
/*
 * Convert count signed 16-bit samples to unsigned 8-bit: the sample is
 * shifted right by 8 bits and re-centered by adding 0x80.
 * Samples are processed from first to last.
 */
void memcpy_to_u8_from_i16(uint8_t *dst, const int16_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        const int top = src[i] >> 8;
        dst[i] = top + 0x80;
    }
}
52 
/*
 * Convert count packed 3-byte samples to unsigned 8-bit by taking one byte
 * of each sample (byte 0 when HAVE_BIG_ENDIAN is set, byte 2 otherwise) and
 * adding 0x80.  Samples are processed from first to last.
 */
void memcpy_to_u8_from_p24(uint8_t *dst, const uint8_t *src, size_t count)
{
#if HAVE_BIG_ENDIAN
    const size_t top = 0;
#else
    const size_t top = 2;
#endif
    for (size_t i = 0; i < count; ++i) {
        dst[i] = src[3 * i + top] + 0x80;
    }
}
64 
/*
 * Convert count signed 32-bit samples to unsigned 8-bit: the sample is
 * shifted right by 24 bits and re-centered by adding 0x80.
 * Samples are processed from first to last.
 */
void memcpy_to_u8_from_i32(uint8_t *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        const int32_t top = src[i] >> 24;
        dst[i] = top + 0x80;
    }
}
71 
/*
 * Convert count 32-bit samples to unsigned 8-bit, one at a time, using
 * clamp8_from_q8_23().  Samples are processed from first to last.
 */
void memcpy_to_u8_from_q8_23(uint8_t *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp8_from_q8_23(src[i]);
    }
}
78 
/*
 * Convert count float samples to unsigned 8-bit, one at a time, using
 * clamp8_from_float().  Samples are processed from first to last.
 */
void memcpy_to_u8_from_float(uint8_t *dst, const float *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp8_from_float(src[i]);
    }
}
85 
/*
 * Convert count signed 32-bit samples to signed 16-bit by keeping the upper
 * 16 bits of each sample (right shift by 16).
 * Samples are processed from first to last.
 */
void memcpy_to_i16_from_i32(int16_t *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = src[i] >> 16;
    }
}
92 
/*
 * Convert count float samples to signed 16-bit, one at a time, using
 * clamp16_from_float().  Samples are processed from first to last.
 */
void memcpy_to_i16_from_float(int16_t *dst, const float *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp16_from_float(src[i]);
    }
}
99 
/*
 * Convert count 32-bit samples to float, one at a time, using
 * float_from_q4_27().  Samples are processed from first to last.
 */
void memcpy_to_float_from_q4_27(float *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = float_from_q4_27(src[i]);
    }
}
106 
/*
 * Convert count signed 16-bit samples to float, one at a time, using
 * float_from_i16().
 *
 * Samples are processed from last to first; the destination element is
 * wider than the source element, so this order reads each source sample
 * before a store can reach it when both buffers start at the same address.
 */
void memcpy_to_float_from_i16(float *dst, const int16_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        dst[i] = float_from_i16(src[i]);
    }
}
115 
/*
 * Convert count unsigned 8-bit samples to float, one at a time, using
 * float_from_u8().
 *
 * Samples are processed from last to first; the destination element is
 * wider than the source element, so this order reads each source sample
 * before a store can reach it when both buffers start at the same address.
 */
void memcpy_to_float_from_u8(float *dst, const uint8_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        dst[i] = float_from_u8(src[i]);
    }
}
124 
/*
 * Convert count packed 3-byte samples to float.  float_from_p24() is handed
 * a pointer to the first byte of each 3-byte sample.
 *
 * Samples are processed from last to first.
 */
void memcpy_to_float_from_p24(float *dst, const uint8_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        const uint8_t *packed = src + 3 * i;
        dst[i] = float_from_p24(packed);
    }
}
134 
/*
 * Convert count packed 3-byte samples to signed 16-bit by combining the
 * middle byte (low half) with byte 0 (HAVE_BIG_ENDIAN) or byte 2 (otherwise)
 * as the high half; the remaining byte of each sample is dropped.
 * Samples are processed from first to last.
 */
void memcpy_to_i16_from_p24(int16_t *dst, const uint8_t *src, size_t count)
{
#if HAVE_BIG_ENDIAN
    const size_t top = 0;
#else
    const size_t top = 2;
#endif
    for (size_t i = 0; i < count; ++i) {
        const uint8_t *packed = src + 3 * i;
        dst[i] = packed[1] | (packed[top] << 8);
    }
}
146 
/*
 * Expand count packed 3-byte samples to signed 32-bit.
 *
 * The three sample bytes become bits 8..31 of the result and the low byte
 * is zero.  With HAVE_BIG_ENDIAN byte 0 of the sample is the most
 * significant; otherwise byte 2 is.
 *
 * Samples are processed from last to first, and all three bytes of a sample
 * are read before its 32-bit result is stored, so the source is not
 * clobbered early when both buffers start at the same address.
 *
 *  dst    destination buffer of count int32_t samples
 *  src    source buffer of count * 3 bytes
 *  count  number of samples
 */
void memcpy_to_i32_from_p24(int32_t *dst, const uint8_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        const uint8_t *packed = src + 3 * i;
#if HAVE_BIG_ENDIAN
        const uint32_t lo = packed[2], mid = packed[1], hi = packed[0];
#else
        const uint32_t lo = packed[0], mid = packed[1], hi = packed[2];
#endif
        /*
         * Assemble in unsigned arithmetic.  A uint8_t operand promotes to
         * int, and shifting a byte >= 0x80 left by 24 overflows int, which
         * is undefined behavior.
         */
        dst[i] = (int32_t)((lo << 8) | (mid << 16) | (hi << 24));
    }
}
160 
/*
 * Expand count signed 16-bit samples to packed 3-byte samples.  The 16-bit
 * value fills the two most significant bytes and the least significant byte
 * is zero; byte order within the sample follows HAVE_BIG_ENDIAN.
 *
 * Samples are processed from last to first, and each source sample is read
 * before its three destination bytes are written.
 */
void memcpy_to_p24_from_i16(uint8_t *dst, const int16_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        const int16_t sample = src[i];
        uint8_t *packed = dst + 3 * i;
#if HAVE_BIG_ENDIAN
        packed[0] = sample >> 8;
        packed[1] = sample;
        packed[2] = 0;
#else
        packed[0] = 0;
        packed[1] = sample;
        packed[2] = sample >> 8;
#endif
    }
}
179 
/*
 * Convert count float samples to packed 3-byte samples.  Each sample goes
 * through clamp24_from_float() and the low 24 bits of the result are stored,
 * byte order following HAVE_BIG_ENDIAN.
 * Samples are processed from first to last.
 */
void memcpy_to_p24_from_float(uint8_t *dst, const float *src, size_t count)
{
    for (size_t i = 0; i < count; ++i, dst += 3) {
        const int32_t ival = clamp24_from_float(src[i]);
#if HAVE_BIG_ENDIAN
        dst[0] = ival >> 16;
        dst[1] = ival >> 8;
        dst[2] = ival;
#else
        dst[0] = ival;
        dst[1] = ival >> 8;
        dst[2] = ival >> 16;
#endif
    }
}
196 
/*
 * Convert count 32-bit samples to packed 3-byte samples.  Each sample goes
 * through clamp24_from_q8_23() and the low 24 bits of the result are stored,
 * byte order following HAVE_BIG_ENDIAN.
 * Samples are processed from first to last.
 */
void memcpy_to_p24_from_q8_23(uint8_t *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i, dst += 3) {
        const int32_t ival = clamp24_from_q8_23(src[i]);
#if HAVE_BIG_ENDIAN
        dst[0] = ival >> 16;
        dst[1] = ival >> 8;
        dst[2] = ival;
#else
        dst[0] = ival;
        dst[1] = ival >> 8;
        dst[2] = ival >> 16;
#endif
    }
}
213 
/*
 * Convert count signed 32-bit samples to packed 3-byte samples by dropping
 * the low 8 bits of each sample (right shift by 8) and storing the remaining
 * 24 bits, byte order following HAVE_BIG_ENDIAN.
 * Samples are processed from first to last.
 */
void memcpy_to_p24_from_i32(uint8_t *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i, dst += 3) {
        const int32_t ival = src[i] >> 8;
#if HAVE_BIG_ENDIAN
        dst[0] = ival >> 16;
        dst[1] = ival >> 8;
        dst[2] = ival;
#else
        dst[0] = ival;
        dst[1] = ival >> 8;
        dst[2] = ival >> 16;
#endif
    }
}
230 
/*
 * Widen count signed 16-bit samples to 32-bit, scaling each by 256
 * (the equivalent of an 8-bit left shift).
 *
 * Samples are processed from last to first; the destination element is
 * wider than the source element, so this order reads each source sample
 * before a store can reach it when both buffers start at the same address.
 *
 *  dst    destination buffer of count int32_t samples
 *  src    source buffer of count int16_t samples
 *  count  number of samples
 */
void memcpy_to_q8_23_from_i16(int32_t *dst, const int16_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        /*
         * Multiply rather than shift: left-shifting a negative value is
         * undefined behavior.  |sample * 256| <= 2^23, so no overflow.
         */
        dst[i] = (int32_t)src[i] * 256;
    }
}
239 
/*
 * Convert count float samples to 32-bit, one at a time, using
 * clamp24_from_float().  Samples are processed from first to last.
 */
void memcpy_to_q8_23_from_float_with_clamp(int32_t *dst, const float *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp24_from_float(src[i]);
    }
}
246 
/*
 * Expand count packed 3-byte samples to sign-extended 32-bit values.
 *
 * The three bytes form a 24-bit two's-complement number (byte 0 most
 * significant with HAVE_BIG_ENDIAN, byte 2 otherwise) which is sign-extended
 * into the 32-bit result.
 *
 * Samples are processed from last to first, and all three bytes of a sample
 * are read before its 32-bit result is stored.
 *
 *  dst    destination buffer of count int32_t samples
 *  src    source buffer of count * 3 bytes
 *  count  number of samples
 */
void memcpy_to_q8_23_from_p24(int32_t *dst, const uint8_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        const uint8_t *packed = src + 3 * i;
#if HAVE_BIG_ENDIAN
        const uint32_t raw = ((uint32_t)packed[0] << 16) | ((uint32_t)packed[1] << 8) | packed[2];
#else
        const uint32_t raw = ((uint32_t)packed[2] << 16) | ((uint32_t)packed[1] << 8) | packed[0];
#endif
        /*
         * Sign-extend from bit 23 arithmetically.  Casting the top byte to
         * int8_t and shifting it left would shift a negative value, which is
         * undefined behavior.
         */
        dst[i] = (int32_t)(raw ^ 0x800000u) - 0x800000;
    }
}
260 
/*
 * Convert count float samples to 32-bit, one at a time, using
 * clampq4_27_from_float().  Samples are processed from first to last.
 */
void memcpy_to_q4_27_from_float(int32_t *dst, const float *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clampq4_27_from_float(src[i]);
    }
}
267 
/*
 * Convert count 32-bit samples to signed 16-bit: each source sample is
 * shifted right by 8 bits and then passed through clamp16().
 * Samples are processed from first to last.
 */
void memcpy_to_i16_from_q8_23(int16_t *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp16(src[i] >> 8);
    }
}
274 
/*
 * Convert count 32-bit samples to float, one at a time, using
 * float_from_q8_23().  Samples are processed from first to last.
 */
void memcpy_to_float_from_q8_23(float *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = float_from_q8_23(src[i]);
    }
}
281 
/*
 * Convert count unsigned 8-bit samples (centered on 0x80) to signed 32-bit.
 *
 * Each output is (sample - 0x80) scaled by 2^24, i.e. the re-centered 8-bit
 * value placed in the top byte of the 32-bit result.
 *
 * Samples are processed from last to first; the destination element is
 * wider than the source element, so this order reads each source sample
 * before a store can reach it when both buffers start at the same address.
 *
 *  dst    destination buffer of count int32_t samples
 *  src    source buffer of count uint8_t samples
 *  count  number of samples
 */
void memcpy_to_i32_from_u8(int32_t *dst, const uint8_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        /*
         * Multiply rather than shift: (sample - 0x80) is negative for half
         * the input range and left-shifting a negative value is undefined.
         * The product lies in [-2^31, 127 * 2^24], so it fits in int32_t.
         */
        dst[i] = ((int32_t)src[i] - 0x80) * 0x1000000;
    }
}
290 
/*
 * Widen count signed 16-bit samples to signed 32-bit, scaling each by 65536
 * so the 16-bit value lands in the upper half of the result.
 *
 * Samples are processed from last to first; the destination element is
 * wider than the source element, so this order reads each source sample
 * before a store can reach it when both buffers start at the same address.
 *
 *  dst    destination buffer of count int32_t samples
 *  src    source buffer of count int16_t samples
 *  count  number of samples
 */
void memcpy_to_i32_from_i16(int32_t *dst, const int16_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        /*
         * Multiply rather than shift: left-shifting a negative value is
         * undefined behavior.  The product lies in [-2^31, 32767 * 2^16].
         */
        dst[i] = (int32_t)src[i] * 65536;
    }
}
299 
/*
 * Convert count float samples to signed 32-bit, one at a time, using
 * clamp32_from_float().  Samples are processed from first to last.
 */
void memcpy_to_i32_from_float(int32_t *dst, const float *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp32_from_float(src[i]);
    }
}
306 
/*
 * Convert count signed 32-bit samples to float, one at a time, using
 * float_from_i32().  Samples are processed from first to last.
 */
void memcpy_to_float_from_i32(float *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = float_from_i32(src[i]);
    }
}
313 
314 void memcpy_to_float_from_float_with_clamping(float *dst, const float *src, size_t count,
315                                               float absMax) {
316     // Note: using NEON intrinsics (vminq_f32, vld1q_f32...) did NOT accelerate
317     // the function when benchmarked. The compiler already vectorize using FMINNM f32x4 & similar.
318     // Note: clamping induce a ~20% overhead compared to memcpy for count in [64, 512]
319     //       See primitives_benchmark
320     for (; count > 0; --count) {
321         const float sample = *src++;
322         *dst++ = fmax(-absMax, fmin(absMax, sample));
323     }
324 }
325 
/*
 * Average each interleaved stereo pair of 16-bit samples into one mono
 * sample.  The sum is formed in 32 bits and halved with a right shift.
 * count is the number of frames; frames are processed from first to last.
 */
void downmix_to_mono_i16_from_stereo_i16(int16_t *dst, const int16_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        const int32_t left = src[2 * i];
        const int32_t right = src[2 * i + 1];
        dst[i] = (int16_t)((left + right) >> 1);
    }
}
333 
/*
 * Duplicate each mono 16-bit sample into both channels of an interleaved
 * stereo frame.  count is the number of frames.
 *
 * Frames are processed from last to first, and each source sample is read
 * before its two destination slots are written.
 */
void upmix_to_stereo_i16_from_mono_i16(int16_t *dst, const int16_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        const int16_t mono = src[i];
        dst[2 * i] = mono;
        dst[2 * i + 1] = mono;
    }
}
345 
/*
 * Average each interleaved stereo pair of float samples into one mono
 * sample.  frames is the number of frames, processed from first to last.
 */
void downmix_to_mono_float_from_stereo_float(float *dst, const float *src, size_t frames)
{
    for (size_t i = 0; i < frames; ++i) {
        const float *pair = src + 2 * i;
        dst[i] = (pair[0] + pair[1]) * 0.5;
    }
}
353 
/*
 * Duplicate each mono float sample into both channels of an interleaved
 * stereo frame.  frames is the number of frames.
 *
 * Frames are processed from last to first, and each source sample is read
 * before its two destination slots are written.
 */
void upmix_to_stereo_float_from_mono_float(float *dst, const float *src, size_t frames)
{
    for (size_t i = frames; i-- > 0; ) {
        const float mono = src[i];
        dst[2 * i] = mono;
        dst[2 * i + 1] = mono;
    }
}
365 
/* Return how many of the first count 32-bit samples are not zero. */
size_t nonZeroMono32(const int32_t *samples, size_t count)
{
    size_t hits = 0;
    for (size_t i = 0; i < count; ++i) {
        if (samples[i] != 0) {
            ++hits;
        }
    }
    return hits;
}
374 
/* Return how many of the first count 16-bit samples are not zero. */
size_t nonZeroMono16(const int16_t *samples, size_t count)
{
    size_t hits = 0;
    for (size_t i = 0; i < count; ++i) {
        if (samples[i] != 0) {
            ++hits;
        }
    }
    return hits;
}
383 
/*
 * Return how many of the first count interleaved stereo frames (two 32-bit
 * samples each) have at least one non-zero sample.
 */
size_t nonZeroStereo32(const int32_t *frames, size_t count)
{
    size_t hits = 0;
    for (size_t i = 0; i < count; ++i) {
        const int32_t *frame = frames + 2 * i;
        if (frame[0] != 0 || frame[1] != 0) {
            ++hits;
        }
    }
    return hits;
}
393 
/*
 * Return how many of the first count interleaved stereo frames (two 16-bit
 * samples each) have at least one non-zero sample.
 */
size_t nonZeroStereo16(const int16_t *frames, size_t count)
{
    size_t hits = 0;
    for (size_t i = 0; i < count; ++i) {
        const int16_t *frame = frames + 2 * i;
        if (frame[0] != 0 || frame[1] != 0) {
            ++hits;
        }
    }
    return hits;
}
403 
/*
 * Type-generic frame copy driven by channel masks; works for any dst/src
 * sample type that supports plain assignment.
 *
 * For every frame, the set bits of (dmask | smask) are visited from lowest
 * to highest:
 *   - bit in both masks:  one sample is copied from src to dst;
 *   - bit only in dmask:  `zero` is stored to dst;
 *   - bit only in smask:  one source sample is skipped.
 *
 * dst, src and count are modified in place (count ends at 0), and every
 * argument is evaluated many times: pass plain variables or constants only,
 * never expressions with side effects.
 */
#define copy_frame_by_mask(dst, dmask, src, smask, count, zero) \
do { \
    for (; (count) > 0; --(count)) { \
        uint32_t remaining_ = (dmask) | (smask); \
        while (remaining_ != 0) { \
            const uint32_t lowest_ = remaining_ & -remaining_; /* isolate lowest set bit */ \
            remaining_ &= ~lowest_; /* and consume it */ \
            if (((dmask) & lowest_) == 0) { \
                ++(src); /* source channel only: skip */ \
            } else if (((smask) & lowest_) != 0) { \
                *(dst)++ = *(src)++; /* channel on both sides */ \
            } else { \
                *(dst)++ = (zero); /* destination channel only */ \
            } \
        } \
    } \
} while (0)
424 
425 void memcpy_by_channel_mask(void *dst, uint32_t dst_mask,
426         const void *src, uint32_t src_mask, size_t sample_size, size_t count)
427 {
428 #if 0
429     /* alternate way of handling memcpy_by_channel_mask by using the idxary */
430     int8_t idxary[32];
431     uint32_t src_channels = __builtin_popcount(src_mask);
432     uint32_t dst_channels =
433             memcpy_by_index_array_initialization(idxary, 32, dst_mask, src_mask);
434 
435     memcpy_by_idxary(dst, dst_channels, src, src_channels, idxary, sample_size, count);
436 #else
437     if (dst_mask == src_mask) {
438         memcpy(dst, src, sample_size * __builtin_popcount(dst_mask) * count);
439         return;
440     }
441     switch (sample_size) {
442     case 1: {
443         uint8_t *udst = (uint8_t*)dst;
444         const uint8_t *usrc = (const uint8_t*)src;
445 
446         copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, 0);
447     } break;
448     case 2: {
449         uint16_t *udst = (uint16_t*)dst;
450         const uint16_t *usrc = (const uint16_t*)src;
451 
452         copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, 0);
453     } break;
454     case 3: { /* could be slow.  use a struct to represent 3 bytes of data. */
455         uint8x3_t *udst = (uint8x3_t*)dst;
456         const uint8x3_t *usrc = (const uint8x3_t*)src;
457         static const uint8x3_t zero; /* tricky - we use this to zero out a sample */
458 
459         copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, zero);
460     } break;
461     case 4: {
462         uint32_t *udst = (uint32_t*)dst;
463         const uint32_t *usrc = (const uint32_t*)src;
464 
465         copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, 0);
466     } break;
467     default:
468         abort(); /* illegal value */
469         break;
470     }
471 #endif
472 }
473 
/*
 * Type-generic frame copy driven by an index array, used to rearrange
 * samples within a frame; works for any dst/src sample type that supports
 * plain assignment.
 *
 * For every frame, destination channel i receives (src)[idxary[i]], or
 * `zero` when idxary[i] is negative.  After each frame src advances by
 * src_channels samples.
 *
 * dst, src and count are modified in place (count ends at 0), and every
 * argument is evaluated many times: pass plain variables or constants only,
 * never expressions with side effects.
 */
#define copy_frame_by_idx(dst, dst_channels, src, src_channels, idxary, count, zero) \
do { \
    while ((count) > 0) { \
        unsigned ch_; \
        for (ch_ = 0; ch_ < (dst_channels); ++ch_) { \
            const int from_ = (idxary)[ch_]; \
            if (from_ < 0) { \
                *(dst)++ = (zero); /* no source channel: fill */ \
            } else { \
                *(dst)++ = (src)[from_]; \
            } \
        } \
        (src) += (src_channels); \
        --(count); \
    } \
} while (0)
491 
492 void memcpy_by_index_array(void *dst, uint32_t dst_channels,
493         const void *src, uint32_t src_channels,
494         const int8_t *idxary, size_t sample_size, size_t count)
495 {
496     switch (sample_size) {
497     case 1: {
498         uint8_t *udst = (uint8_t*)dst;
499         const uint8_t *usrc = (const uint8_t*)src;
500 
501         copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, 0);
502     } break;
503     case 2: {
504         uint16_t *udst = (uint16_t*)dst;
505         const uint16_t *usrc = (const uint16_t*)src;
506 
507         copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, 0);
508     } break;
509     case 3: { /* could be slow.  use a struct to represent 3 bytes of data. */
510         uint8x3_t *udst = (uint8x3_t*)dst;
511         const uint8x3_t *usrc = (const uint8x3_t*)src;
512         static const uint8x3_t zero;
513 
514         copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, zero);
515     } break;
516     case 4: {
517         uint32_t *udst = (uint32_t*)dst;
518         const uint32_t *usrc = (const uint32_t*)src;
519 
520         copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, 0);
521     } break;
522     default:
523         abort(); /* illegal value */
524         break;
525     }
526 }
527 
/*
 * Build an index array mapping destination channels to source channels by
 * matching channel-mask bits.
 *
 * Set bits of (src_mask | dst_mask) are visited from lowest to highest.  A
 * bit in both masks yields the running source channel index, a bit only in
 * dst_mask yields -1, and a bit only in src_mask just advances the source
 * index.  At most idxcount entries are written.
 *
 * Returns the number of entries written plus the number of destination
 * channels that were not reached; this exceeds idxcount when the array was
 * too small.
 */
size_t memcpy_by_index_array_initialization(int8_t *idxary, size_t idxcount,
        uint32_t dst_mask, uint32_t src_mask)
{
    uint32_t pending = dst_mask | src_mask;
    size_t filled = 0;
    int next_src = 0;

    /* pending &= pending - 1 clears the lowest set bit after each pass. */
    for (; pending != 0 && filled < idxcount; pending &= pending - 1) {
        const uint32_t lowest = pending & -pending;
        const int in_src = (src_mask & lowest) != 0;
        const int in_dst = (dst_mask & lowest) != 0;

        if (in_dst) {
            idxary[filled++] = in_src ? next_src : -1;
        }
        if (in_src) {
            ++next_src;
        }
    }
    return filled + __builtin_popcount(pending & dst_mask);
}
548 
/*
 * Build an index array in which bit i of src_mask decides entry i: a set
 * bit receives the next source channel index (0, 1, 2, ...), a clear bit
 * receives -1.  Only the number of set bits in dst_mask is used, as the
 * number of entries to produce.
 *
 * Returns popcount(dst_mask) when idxcount is 0 (nothing is written);
 * otherwise returns the number of entries written, which is
 * min(popcount(dst_mask), idxcount).
 */
size_t memcpy_by_index_array_initialization_src_index(int8_t *idxary, size_t idxcount,
        uint32_t dst_mask, uint32_t src_mask) {
    const size_t wanted = __builtin_popcount(dst_mask);
    if (idxcount == 0) {
        return wanted;
    }
    const size_t limit = wanted < idxcount ? wanted : idxcount;

    size_t next_src = 0;
    size_t slot = 0;
    /* slot < limit <= 32, so the shift below stays within the mask width. */
    while (slot < limit) {
        if ((src_mask >> slot) & 1) {
            idxary[slot] = next_src++;
        } else {
            idxary[slot] = -1;
        }
        ++slot;
    }
    return slot;
}
570 
/*
 * Build an index array in which each set bit of dst_mask, visited from
 * lowest to highest, produces one entry: the bit's position when that
 * position is below popcount(src_mask), otherwise -1.
 *
 * Returns popcount(dst_mask) when idxcount is 0 (nothing is written);
 * otherwise returns the number of entries written, which is
 * min(popcount(dst_mask), idxcount).
 */
size_t memcpy_by_index_array_initialization_dst_index(int8_t *idxary, size_t idxcount,
        uint32_t dst_mask, uint32_t src_mask) {
    const size_t wanted = __builtin_popcount(dst_mask);
    const size_t src_count = __builtin_popcount(src_mask);
    if (idxcount == 0) {
        return wanted;
    }
    const size_t limit = wanted < idxcount ? wanted : idxcount;

    size_t written = 0;
    /*
     * limit never exceeds the number of set bits, so the loop ends before
     * pos can run past the top bit of the mask.
     */
    for (size_t pos = 0; written < limit; ++pos) {
        if ((dst_mask >> pos) & 1) {
            idxary[written++] = pos < src_count ? (signed)pos : -1;
        }
    }
    return written;
}
590 
/*
 * Add count 16-bit source samples into dst in place.  Each sum is formed
 * in 32 bits and passed through clamp16() before being stored.
 */
void accumulate_i16(int16_t *dst, const int16_t *src, size_t count) {
    for (size_t i = 0; i < count; ++i) {
        const int32_t sum = (int32_t)dst[i] + src[i];
        dst[i] = clamp16(sum);
    }
}
597 
/*
 * Add count unsigned 8-bit source samples into dst in place, saturating
 * the result to [0, 0xff].
 */
void accumulate_u8(uint8_t *dst, const uint8_t *src, size_t count) {
    for (size_t i = 0; i < count; ++i) {
        // 8-bit samples are centered around 0x80, so one bias is removed
        // from the sum of two biased samples.
        const int32_t sum = dst[i] + src[i] - 0x80;
        // sum lies in [-128, 382]; saturate to the unsigned 8-bit range.
        if (sum < 0) {
            dst[i] = 0;
        } else if (sum > 0xff) {
            dst[i] = 0xff;
        } else {
            dst[i] = sum;
        }
    }
}
607 
/*
 * Add count packed 3-byte source samples into dst in place.  Each pair of
 * samples is unpacked with memcpy_to_q8_23_from_p24(), summed in 32 bits,
 * and written back through memcpy_to_p24_from_q8_23().
 */
void accumulate_p24(uint8_t *dst, const uint8_t *src, size_t count) {
    for (size_t i = 0; i < count; ++i, dst += 3, src += 3) {
        // Unpack both operands.
        int32_t total = 0;
        int32_t addend = 0;
        memcpy_to_q8_23_from_p24(&total, dst, 1);
        memcpy_to_q8_23_from_p24(&addend, src, 1);

        // Sum, then repack over the destination sample.
        total += addend;
        memcpy_to_p24_from_q8_23(dst, &total, 1);
    }
}
625 
/*
 * Add count 32-bit source samples into dst in place.  Each 32-bit sum is
 * passed through clamp24_from_q8_23() before being stored.
 */
void accumulate_q8_23(int32_t *dst, const int32_t *src, size_t count) {
    for (size_t i = 0; i < count; ++i) {
        const int32_t sum = dst[i] + src[i];
        dst[i] = clamp24_from_q8_23(sum);
    }
}
632 
/*
 * Add count 32-bit source samples into dst in place.  Each sum is formed
 * in 64 bits and passed through clamp32() before being stored.
 */
void accumulate_i32(int32_t *dst, const int32_t *src, size_t count) {
    for (size_t i = 0; i < count; ++i) {
        const int64_t sum = (int64_t)dst[i] + src[i];
        dst[i] = clamp32(sum);
    }
}
639 
/* Add count float source samples into dst in place; no clamping is applied. */
void accumulate_float(float *dst, const float *src, size_t count) {
    for (size_t i = 0; i < count; ++i) {
        dst[i] += src[i];
    }
}
645