1 /* Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6 /* Copyright (C) 2011 Google Inc. All rights reserved.
7 * Use of this source code is governed by a BSD-style license that can be
8 * found in the LICENSE.WEBKIT file.
9 */
10
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14
15 #include "drc_math.h"
16 #include "drc_kernel.h"
17
18 #define MAX_PRE_DELAY_FRAMES 1024U
19 #define MAX_PRE_DELAY_FRAMES_MASK (MAX_PRE_DELAY_FRAMES - 1)
20 #define DEFAULT_PRE_DELAY_FRAMES 256U
21 #define DIVISION_FRAMES 32U
22 #define DIVISION_FRAMES_MASK (DIVISION_FRAMES - 1)
23
24 #define assert_on_compile(e) ((void)sizeof(char[1 - 2 * !(e)]))
25 #define assert_on_compile_is_power_of_2(n) \
26 assert_on_compile((n) != 0 && (((n) & ((n) - 1)) == 0))
27
28 const float uninitialized_value = -1;
29 static int drc_math_initialized;
30
dk_init(struct drc_kernel * dk,float sample_rate)31 void dk_init(struct drc_kernel *dk, float sample_rate)
32 {
33 unsigned int i;
34
35 if (!drc_math_initialized) {
36 drc_math_initialized = 1;
37 drc_math_init();
38 }
39
40 dk->sample_rate = sample_rate;
41 dk->detector_average = 0;
42 dk->compressor_gain = 1;
43 dk->enabled = 0;
44 dk->processed = 0;
45 dk->last_pre_delay_frames = DEFAULT_PRE_DELAY_FRAMES;
46 dk->pre_delay_read_index = 0;
47 dk->pre_delay_write_index = DEFAULT_PRE_DELAY_FRAMES;
48 dk->max_attack_compression_diff_db = -INFINITY;
49 dk->ratio = uninitialized_value;
50 dk->slope = uninitialized_value;
51 dk->linear_threshold = uninitialized_value;
52 dk->db_threshold = uninitialized_value;
53 dk->db_knee = uninitialized_value;
54 dk->knee_threshold = uninitialized_value;
55 dk->ratio_base = uninitialized_value;
56 dk->K = uninitialized_value;
57
58 assert_on_compile_is_power_of_2(DIVISION_FRAMES);
59 assert_on_compile(DIVISION_FRAMES % 4 == 0);
60 /* Allocate predelay buffers */
61 assert_on_compile_is_power_of_2(MAX_PRE_DELAY_FRAMES);
62 for (i = 0; i < DRC_NUM_CHANNELS; i++) {
63 size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES;
64 dk->pre_delay_buffers[i] = (float *)calloc(1, size);
65 }
66 }
67
dk_free(struct drc_kernel * dk)68 void dk_free(struct drc_kernel *dk)
69 {
70 unsigned int i;
71 for (i = 0; i < DRC_NUM_CHANNELS; ++i)
72 free(dk->pre_delay_buffers[i]);
73 }
74
75 /* Sets the pre-delay (lookahead) buffer size */
set_pre_delay_time(struct drc_kernel * dk,float pre_delay_time)76 static void set_pre_delay_time(struct drc_kernel *dk, float pre_delay_time)
77 {
78 unsigned int i;
79 /* Re-configure look-ahead section pre-delay if delay time has
80 * changed. */
81 unsigned pre_delay_frames = pre_delay_time * dk->sample_rate;
82 pre_delay_frames = min(pre_delay_frames, MAX_PRE_DELAY_FRAMES - 1);
83
84 /* Make pre_delay_frames multiplies of DIVISION_FRAMES. This way we
85 * won't split a division of samples into two blocks of memory, so it is
86 * easier to process. This may make the actual delay time slightly less
87 * than the specified value, but the difference is less than 1ms. */
88 pre_delay_frames &= ~DIVISION_FRAMES_MASK;
89
90 /* We need at least one division buffer, so the incoming data won't
91 * overwrite the output data */
92 pre_delay_frames = max(pre_delay_frames, DIVISION_FRAMES);
93
94 if (dk->last_pre_delay_frames != pre_delay_frames) {
95 dk->last_pre_delay_frames = pre_delay_frames;
96 for (i = 0; i < DRC_NUM_CHANNELS; ++i) {
97 size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES;
98 memset(dk->pre_delay_buffers[i], 0, size);
99 }
100
101 dk->pre_delay_read_index = 0;
102 dk->pre_delay_write_index = pre_delay_frames;
103 }
104 }
105
106 /* Exponential curve for the knee. It is 1st derivative matched at
107 * dk->linear_threshold and asymptotically approaches the value
108 * dk->linear_threshold + 1 / k.
109 *
110 * This is used only when calculating the static curve, not used when actually
111 * compress the input data (knee_curveK below is used instead).
112 */
knee_curve(struct drc_kernel * dk,float x,float k)113 static float knee_curve(struct drc_kernel *dk, float x, float k)
114 {
115 /* Linear up to threshold. */
116 if (x < dk->linear_threshold)
117 return x;
118
119 return dk->linear_threshold +
120 (1 - knee_expf(-k * (x - dk->linear_threshold))) / k;
121 }
122
123 /* Approximate 1st derivative with input and output expressed in dB. This slope
124 * is equal to the inverse of the compression "ratio". In other words, a
125 * compression ratio of 20 would be a slope of 1/20.
126 */
slope_at(struct drc_kernel * dk,float x,float k)127 static float slope_at(struct drc_kernel *dk, float x, float k)
128 {
129 if (x < dk->linear_threshold)
130 return 1;
131
132 float x2 = x * 1.001;
133
134 float x_db = linear_to_decibels(x);
135 float x2Db = linear_to_decibels(x2);
136
137 float y_db = linear_to_decibels(knee_curve(dk, x, k));
138 float y2Db = linear_to_decibels(knee_curve(dk, x2, k));
139
140 float m = (y2Db - y_db) / (x2Db - x_db);
141
142 return m;
143 }
144
k_at_slope(struct drc_kernel * dk,float desired_slope)145 static float k_at_slope(struct drc_kernel *dk, float desired_slope)
146 {
147 float x_db = dk->db_threshold + dk->db_knee;
148 float x = decibels_to_linear(x_db);
149
150 /* Approximate k given initial values. */
151 float minK = 0.1;
152 float maxK = 10000;
153 float k = 5;
154 unsigned int i;
155
156 for (i = 0; i < 15; ++i) {
157 /* A high value for k will more quickly asymptotically approach
158 * a slope of 0. */
159 float slope = slope_at(dk, x, k);
160
161 if (slope < desired_slope) {
162 /* k is too high. */
163 maxK = k;
164 } else {
165 /* k is too low. */
166 minK = k;
167 }
168
169 /* Re-calculate based on geometric mean. */
170 k = sqrtf(minK * maxK);
171 }
172
173 return k;
174 }
175
update_static_curve_parameters(struct drc_kernel * dk,float db_threshold,float db_knee,float ratio)176 static void update_static_curve_parameters(struct drc_kernel *dk,
177 float db_threshold,
178 float db_knee, float ratio)
179 {
180 if (db_threshold != dk->db_threshold || db_knee != dk->db_knee ||
181 ratio != dk->ratio) {
182 /* Threshold and knee. */
183 dk->db_threshold = db_threshold;
184 dk->linear_threshold = decibels_to_linear(db_threshold);
185 dk->db_knee = db_knee;
186
187 /* Compute knee parameters. */
188 dk->ratio = ratio;
189 dk->slope = 1 / dk->ratio;
190
191 float k = k_at_slope(dk, 1 / dk->ratio);
192 dk->K = k;
193 /* See knee_curveK() for details */
194 dk->knee_alpha = dk->linear_threshold + 1 / k;
195 dk->knee_beta = -expf(k * dk->linear_threshold) / k;
196
197 dk->knee_threshold = decibels_to_linear(db_threshold + db_knee);
198 /* See volume_gain() for details */
199 float y0 = knee_curve(dk, dk->knee_threshold, k);
200 dk->ratio_base = y0 * powf(dk->knee_threshold, -dk->slope);
201 }
202 }
203
204 /* This is the knee part of the compression curve. Returns the output level
205 * given the input level x. */
knee_curveK(struct drc_kernel * dk,float x)206 static float knee_curveK(struct drc_kernel *dk, float x)
207 {
208 /* The formula in knee_curveK is dk->linear_threshold +
209 * (1 - expf(-k * (x - dk->linear_threshold))) / k
210 * which simplifies to (alpha + beta * expf(gamma))
211 * where alpha = dk->linear_threshold + 1 / k
212 * beta = -expf(k * dk->linear_threshold) / k
213 * gamma = -k * x
214 */
215 return dk->knee_alpha + dk->knee_beta * knee_expf(-dk->K * x);
216 }
217
218 /* Full compression curve with constant ratio after knee. Returns the ratio of
219 * output and input signal. */
volume_gain(struct drc_kernel * dk,float x)220 static float volume_gain(struct drc_kernel *dk, float x)
221 {
222 float y;
223
224 if (x < dk->knee_threshold) {
225 if (x < dk->linear_threshold)
226 return 1;
227 y = knee_curveK(dk, x) / x;
228 } else {
229 /* Constant ratio after knee.
230 * log(y/y0) = s * log(x/x0)
231 * => y = y0 * (x/x0)^s
232 * => y = [y0 * (1/x0)^s] * x^s
233 * => y = dk->ratio_base * x^s
234 * => y/x = dk->ratio_base * x^(s - 1)
235 * => y/x = dk->ratio_base * e^(log(x) * (s - 1))
236 */
237 y = dk->ratio_base * knee_expf(logf(x) * (dk->slope - 1));
238 }
239
240 return y;
241 }
242
dk_set_parameters(struct drc_kernel * dk,float db_threshold,float db_knee,float ratio,float attack_time,float release_time,float pre_delay_time,float db_post_gain,float releaseZone1,float releaseZone2,float releaseZone3,float releaseZone4)243 void dk_set_parameters(struct drc_kernel *dk,
244 float db_threshold,
245 float db_knee,
246 float ratio,
247 float attack_time,
248 float release_time,
249 float pre_delay_time,
250 float db_post_gain,
251 float releaseZone1,
252 float releaseZone2,
253 float releaseZone3,
254 float releaseZone4)
255 {
256 float sample_rate = dk->sample_rate;
257
258 update_static_curve_parameters(dk, db_threshold, db_knee, ratio);
259
260 /* Makeup gain. */
261 float full_range_gain = volume_gain(dk, 1);
262 float full_range_makeup_gain = 1 / full_range_gain;
263
264 /* Empirical/perceptual tuning. */
265 full_range_makeup_gain = powf(full_range_makeup_gain, 0.6f);
266
267 dk->master_linear_gain = decibels_to_linear(db_post_gain) *
268 full_range_makeup_gain;
269
270 /* Attack parameters. */
271 attack_time = max(0.001f, attack_time);
272 dk->attack_frames = attack_time * sample_rate;
273
274 /* Release parameters. */
275 float release_frames = sample_rate * release_time;
276
277 /* Detector release time. */
278 float sat_release_time = 0.0025f;
279 float sat_release_frames = sat_release_time * sample_rate;
280 dk->sat_release_frames_inv_neg = -1 / sat_release_frames;
281 dk->sat_release_rate_at_neg_two_db =
282 decibels_to_linear(-2 * dk->sat_release_frames_inv_neg) - 1;
283
284 /* Create a smooth function which passes through four points.
285 * Polynomial of the form y = a + b*x + c*x^2 + d*x^3 + e*x^4
286 */
287 float y1 = release_frames * releaseZone1;
288 float y2 = release_frames * releaseZone2;
289 float y3 = release_frames * releaseZone3;
290 float y4 = release_frames * releaseZone4;
291
292 /* All of these coefficients were derived for 4th order polynomial curve
293 * fitting where the y values match the evenly spaced x values as
294 * follows: (y1 : x == 0, y2 : x == 1, y3 : x == 2, y4 : x == 3)
295 */
296 dk->kA = 0.9999999999999998f*y1 + 1.8432219684323923e-16f*y2
297 - 1.9373394351676423e-16f*y3 + 8.824516011816245e-18f*y4;
298 dk->kB = -1.5788320352845888f*y1 + 2.3305837032074286f*y2
299 - 0.9141194204840429f*y3 + 0.1623677525612032f*y4;
300 dk->kC = 0.5334142869106424f*y1 - 1.272736789213631f*y2
301 + 0.9258856042207512f*y3 - 0.18656310191776226f*y4;
302 dk->kD = 0.08783463138207234f*y1 - 0.1694162967925622f*y2
303 + 0.08588057951595272f*y3 - 0.00429891410546283f*y4;
304 dk->kE = -0.042416883008123074f*y1 + 0.1115693827987602f*y2
305 - 0.09764676325265872f*y3 + 0.028494263462021576f*y4;
306
307 /* x ranges from 0 -> 3 0 1 2 3
308 * -15 -10 -5 0db
309 *
310 * y calculates adaptive release frames depending on the amount of
311 * compression.
312 */
313 set_pre_delay_time(dk, pre_delay_time);
314 }
315
dk_set_enabled(struct drc_kernel * dk,int enabled)316 void dk_set_enabled(struct drc_kernel *dk, int enabled)
317 {
318 dk->enabled = enabled;
319 }
320
321 /* Updates the envelope_rate used for the next division */
dk_update_envelope(struct drc_kernel * dk)322 static void dk_update_envelope(struct drc_kernel *dk)
323 {
324 const float kA = dk->kA;
325 const float kB = dk->kB;
326 const float kC = dk->kC;
327 const float kD = dk->kD;
328 const float kE = dk->kE;
329 const float attack_frames = dk->attack_frames;
330
331 /* Calculate desired gain */
332 float desired_gain = dk->detector_average;
333
334 /* Pre-warp so we get desired_gain after sin() warp below. */
335 float scaled_desired_gain = warp_asinf(desired_gain);
336
337 /* Deal with envelopes */
338
339 /* envelope_rate is the rate we slew from current compressor level to
340 * the desired level. The exact rate depends on if we're attacking or
341 * releasing and by how much.
342 */
343 float envelope_rate;
344
345 int is_releasing = scaled_desired_gain > dk->compressor_gain;
346
347 /* compression_diff_db is the difference between current compression
348 * level and the desired level. */
349 float compression_diff_db = linear_to_decibels(
350 dk->compressor_gain / scaled_desired_gain);
351
352 if (is_releasing) {
353 /* Release mode - compression_diff_db should be negative dB */
354 dk->max_attack_compression_diff_db = -INFINITY;
355
356 /* Fix gremlins. */
357 if (isbadf(compression_diff_db))
358 compression_diff_db = -1;
359
360 /* Adaptive release - higher compression (lower
361 * compression_diff_db) releases faster. Contain within range:
362 * -12 -> 0 then scale to go from 0 -> 3
363 */
364 float x = compression_diff_db;
365 x = max(-12.0f, x);
366 x = min(0.0f, x);
367 x = 0.25f * (x + 12);
368
369 /* Compute adaptive release curve using 4th order polynomial.
370 * Normal values for the polynomial coefficients would create a
371 * monotonically increasing function.
372 */
373 float x2 = x * x;
374 float x3 = x2 * x;
375 float x4 = x2 * x2;
376 float release_frames = kA + kB * x + kC * x2 + kD * x3 +
377 kE * x4;
378
379 #define kSpacingDb 5
380 float db_per_frame = kSpacingDb / release_frames;
381 envelope_rate = decibels_to_linear(db_per_frame);
382 } else {
383 /* Attack mode - compression_diff_db should be positive dB */
384
385 /* Fix gremlins. */
386 if (isbadf(compression_diff_db))
387 compression_diff_db = 1;
388
389 /* As long as we're still in attack mode, use a rate based off
390 * the largest compression_diff_db we've encountered so far.
391 */
392 dk->max_attack_compression_diff_db = max(
393 dk->max_attack_compression_diff_db,
394 compression_diff_db);
395
396 float eff_atten_diff_db =
397 max(0.5f, dk->max_attack_compression_diff_db);
398
399 float x = 0.25f / eff_atten_diff_db;
400 envelope_rate = 1 - powf(x, 1 / attack_frames);
401 }
402
403 dk->envelope_rate = envelope_rate;
404 dk->scaled_desired_gain = scaled_desired_gain;
405 }
406
407 /* For a division of frames, take the absolute values of left channel and right
408 * channel, store the maximum of them in output. */
409 #ifdef __ARM_NEON__
410 #include <arm_neon.h>
max_abs_division(float * output,float * data0,float * data1)411 static inline void max_abs_division(float *output, float *data0, float *data1)
412 {
413 float32x4_t x, y;
414 int count = DIVISION_FRAMES / 4;
415
416 __asm__ __volatile__(
417 "1: \n"
418 "vld1.32 {%e[x],%f[x]}, [%[data0]]! \n"
419 "vld1.32 {%e[y],%f[y]}, [%[data1]]! \n"
420 "vabs.f32 %q[x], %q[x] \n"
421 "vabs.f32 %q[y], %q[y] \n"
422 "vmax.f32 %q[x], %q[y] \n"
423 "vst1.32 {%e[x],%f[x]}, [%[output]]! \n"
424 "subs %[count], #1 \n"
425 "bne 1b \n"
426 : /* output */
427 "=r"(data0),
428 "=r"(data1),
429 "=r"(output),
430 "=r"(count),
431 [x]"=&w"(x),
432 [y]"=&w"(y)
433 : /* input */
434 [data0]"0"(data0),
435 [data1]"1"(data1),
436 [output]"2"(output),
437 [count]"3"(count)
438 : /* clobber */
439 "memory", "cc"
440 );
441 }
442 #elif defined(__SSE3__)
443 #include <emmintrin.h>
max_abs_division(float * output,float * data0,float * data1)444 static inline void max_abs_division(float *output, float *data0, float *data1)
445 {
446 __m128 x, y;
447 int count = DIVISION_FRAMES / 4;
448
449 __asm__ __volatile__(
450 "1: \n"
451 "lddqu (%[data0]), %[x] \n"
452 "lddqu (%[data1]), %[y] \n"
453 "andps %[mask], %[x] \n"
454 "andps %[mask], %[y] \n"
455 "maxps %[y], %[x] \n"
456 "movdqu %[x], (%[output]) \n"
457 "add $16, %[data0] \n"
458 "add $16, %[data1] \n"
459 "add $16, %[output] \n"
460 "sub $1, %[count] \n"
461 "jnz 1b \n"
462 : /* output */
463 [data0]"+r"(data0),
464 [data1]"+r"(data1),
465 [output]"+r"(output),
466 [count]"+r"(count),
467 [x]"=&x"(x),
468 [y]"=&x"(y)
469 : /* input */
470 [mask]"x"(_mm_set1_epi32(0x7fffffff))
471 : /* clobber */
472 "memory", "cc"
473 );
474 }
475 #else
max_abs_division(float * output,float * data0,float * data1)476 static inline void max_abs_division(float *output, float *data0, float *data1)
477 {
478 unsigned int i;
479 for (i = 0; i < DIVISION_FRAMES; i++)
480 output[i] = fmaxf(fabsf(data0[i]), fabsf(data1[i]));
481 }
482 #endif
483
484 /* Update detector_average from the last input division. */
dk_update_detector_average(struct drc_kernel * dk)485 static void dk_update_detector_average(struct drc_kernel *dk)
486 {
487 float abs_input_array[DIVISION_FRAMES];
488 const float sat_release_frames_inv_neg = dk->sat_release_frames_inv_neg;
489 const float sat_release_rate_at_neg_two_db =
490 dk->sat_release_rate_at_neg_two_db;
491 float detector_average = dk->detector_average;
492 unsigned int div_start, i;
493
494 /* Calculate the start index of the last input division */
495 if (dk->pre_delay_write_index == 0) {
496 div_start = MAX_PRE_DELAY_FRAMES - DIVISION_FRAMES;
497 } else {
498 div_start = dk->pre_delay_write_index - DIVISION_FRAMES;
499 }
500
501 /* The max abs value across all channels for this frame */
502 max_abs_division(abs_input_array,
503 &dk->pre_delay_buffers[0][div_start],
504 &dk->pre_delay_buffers[1][div_start]);
505
506 for (i = 0; i < DIVISION_FRAMES; i++) {
507 /* Compute compression amount from un-delayed signal */
508 float abs_input = abs_input_array[i];
509
510 /* Calculate shaped power on undelayed input. Put through
511 * shaping curve. This is linear up to the threshold, then
512 * enters a "knee" portion followed by the "ratio" portion. The
513 * transition from the threshold to the knee is smooth (1st
514 * derivative matched). The transition from the knee to the
515 * ratio portion is smooth (1st derivative matched).
516 */
517 float gain = volume_gain(dk, abs_input);
518 int is_release = (gain > detector_average);
519 if (is_release) {
520 if (gain > NEG_TWO_DB) {
521 detector_average += (gain - detector_average) *
522 sat_release_rate_at_neg_two_db;
523 } else {
524 float gain_db = linear_to_decibels(gain);
525 float db_per_frame = gain_db *
526 sat_release_frames_inv_neg;
527 float sat_release_rate =
528 decibels_to_linear(db_per_frame) - 1;
529 detector_average += (gain - detector_average) *
530 sat_release_rate;
531 }
532 } else {
533 detector_average = gain;
534 }
535
536 /* Fix gremlins. */
537 if (isbadf(detector_average))
538 detector_average = 1.0f;
539 else
540 detector_average = min(detector_average, 1.0f);
541 }
542
543 dk->detector_average = detector_average;
544 }
545
546 /* Calculate compress_gain from the envelope and apply total_gain to compress
547 * the next output division. */
548 #if defined(__ARM_NEON__)
549 #include <arm_neon.h>
dk_compress_output(struct drc_kernel * dk)550 static void dk_compress_output(struct drc_kernel *dk)
551 {
552 const float master_linear_gain = dk->master_linear_gain;
553 const float envelope_rate = dk->envelope_rate;
554 const float scaled_desired_gain = dk->scaled_desired_gain;
555 const float compressor_gain = dk->compressor_gain;
556 unsigned const int div_start = dk->pre_delay_read_index;
557 float *ptr_left = &dk->pre_delay_buffers[0][div_start];
558 float *ptr_right = &dk->pre_delay_buffers[1][div_start];
559 int count = DIVISION_FRAMES / 4;
560
561 /* See warp_sinf() for the details for the constants. */
562 const float32x4_t A7 = vdupq_n_f32(-4.3330336920917034149169921875e-3f);
563 const float32x4_t A5 = vdupq_n_f32(7.9434238374233245849609375e-2f);
564 const float32x4_t A3 = vdupq_n_f32(-0.645892798900604248046875f);
565 const float32x4_t A1 = vdupq_n_f32(1.5707910060882568359375f);
566
567 /* Exponential approach to desired gain. */
568 if (envelope_rate < 1) {
569 float c = compressor_gain - scaled_desired_gain;
570 float r = 1 - envelope_rate;
571 float32x4_t x0 = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
572 float32x4_t x, x2, x4, left, right, tmp1, tmp2;
573
574 __asm__ __volatile(
575 "b 2f \n"
576 "1: \n"
577 "vmul.f32 %q[x0], %q[r4] \n"
578 "2: \n"
579 "vld1.32 {%e[left],%f[left]}, [%[ptr_left]] \n"
580 "vld1.32 {%e[right],%f[right]}, [%[ptr_right]] \n"
581 "vadd.f32 %q[x], %q[x0], %q[base] \n"
582 /* Calculate warp_sin() for four values in x. */
583 "vmul.f32 %q[x2], %q[x], %q[x] \n"
584 "vmov.f32 %q[tmp1], %q[A5] \n"
585 "vmov.f32 %q[tmp2], %q[A1] \n"
586 "vmul.f32 %q[x4], %q[x2], %q[x2] \n"
587 "vmla.f32 %q[tmp1], %q[A7], %q[x2] \n"
588 "vmla.f32 %q[tmp2], %q[A3], %q[x2] \n"
589 "vmla.f32 %q[tmp2], %q[tmp1], %q[x4] \n"
590 "vmul.f32 %q[tmp2], %q[tmp2], %q[x] \n"
591 /* Now tmp2 contains the result of warp_sin(). */
592 "vmul.f32 %q[tmp2], %q[tmp2], %q[g] \n"
593 "vmul.f32 %q[left], %q[tmp2] \n"
594 "vmul.f32 %q[right], %q[tmp2] \n"
595 "vst1.32 {%e[left],%f[left]}, [%[ptr_left]]! \n"
596 "vst1.32 {%e[right],%f[right]}, [%[ptr_right]]! \n"
597 "subs %[count], #1 \n"
598 "bne 1b \n"
599 : /* output */
600 "=r"(count),
601 "=r"(ptr_left),
602 "=r"(ptr_right),
603 "=w"(x0),
604 [x]"=&w"(x),
605 [x2]"=&w"(x2),
606 [x4]"=&w"(x4),
607 [left]"=&w"(left),
608 [right]"=&w"(right),
609 [tmp1]"=&w"(tmp1),
610 [tmp2]"=&w"(tmp2)
611 : /* input */
612 [count]"0"(count),
613 [ptr_left]"1"(ptr_left),
614 [ptr_right]"2"(ptr_right),
615 [x0]"3"(x0),
616 [A1]"w"(A1),
617 [A3]"w"(A3),
618 [A5]"w"(A5),
619 [A7]"w"(A7),
620 [base]"w"(vdupq_n_f32(scaled_desired_gain)),
621 [r4]"w"(vdupq_n_f32(r*r*r*r)),
622 [g]"w"(vdupq_n_f32(master_linear_gain))
623 : /* clobber */
624 "memory", "cc"
625 );
626 dk->compressor_gain = x[3];
627 } else {
628 float c = compressor_gain;
629 float r = envelope_rate;
630 float32x4_t x = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
631 float32x4_t x2, x4, left, right, tmp1, tmp2;
632
633 __asm__ __volatile(
634 "b 2f \n"
635 "1: \n"
636 "vmul.f32 %q[x], %q[r4] \n"
637 "2: \n"
638 "vld1.32 {%e[left],%f[left]}, [%[ptr_left]] \n"
639 "vld1.32 {%e[right],%f[right]}, [%[ptr_right]] \n"
640 "vmin.f32 %q[x], %q[one] \n"
641 /* Calculate warp_sin() for four values in x. */
642 "vmul.f32 %q[x2], %q[x], %q[x] \n"
643 "vmov.f32 %q[tmp1], %q[A5] \n"
644 "vmov.f32 %q[tmp2], %q[A1] \n"
645 "vmul.f32 %q[x4], %q[x2], %q[x2] \n"
646 "vmla.f32 %q[tmp1], %q[A7], %q[x2] \n"
647 "vmla.f32 %q[tmp2], %q[A3], %q[x2] \n"
648 "vmla.f32 %q[tmp2], %q[tmp1], %q[x4] \n"
649 "vmul.f32 %q[tmp2], %q[tmp2], %q[x] \n"
650 /* Now tmp2 contains the result of warp_sin(). */
651 "vmul.f32 %q[tmp2], %q[tmp2], %q[g] \n"
652 "vmul.f32 %q[left], %q[tmp2] \n"
653 "vmul.f32 %q[right], %q[tmp2] \n"
654 "vst1.32 {%e[left],%f[left]}, [%[ptr_left]]! \n"
655 "vst1.32 {%e[right],%f[right]}, [%[ptr_right]]! \n"
656 "subs %[count], #1 \n"
657 "bne 1b \n"
658 : /* output */
659 "=r"(count),
660 "=r"(ptr_left),
661 "=r"(ptr_right),
662 "=w"(x),
663 [x2]"=&w"(x2),
664 [x4]"=&w"(x4),
665 [left]"=&w"(left),
666 [right]"=&w"(right),
667 [tmp1]"=&w"(tmp1),
668 [tmp2]"=&w"(tmp2)
669 : /* input */
670 [count]"0"(count),
671 [ptr_left]"1"(ptr_left),
672 [ptr_right]"2"(ptr_right),
673 [x]"3"(x),
674 [A1]"w"(A1),
675 [A3]"w"(A3),
676 [A5]"w"(A5),
677 [A7]"w"(A7),
678 [one]"w"(vdupq_n_f32(1)),
679 [r4]"w"(vdupq_n_f32(r*r*r*r)),
680 [g]"w"(vdupq_n_f32(master_linear_gain))
681 : /* clobber */
682 "memory", "cc"
683 );
684 dk->compressor_gain = x[3];
685 }
686 }
687 #elif defined(__SSE3__) && defined(__x86_64__)
688 #include <emmintrin.h>
dk_compress_output(struct drc_kernel * dk)689 static void dk_compress_output(struct drc_kernel *dk)
690 {
691 const float master_linear_gain = dk->master_linear_gain;
692 const float envelope_rate = dk->envelope_rate;
693 const float scaled_desired_gain = dk->scaled_desired_gain;
694 const float compressor_gain = dk->compressor_gain;
695 const int div_start = dk->pre_delay_read_index;
696 float *ptr_left = &dk->pre_delay_buffers[0][div_start];
697 float *ptr_right = &dk->pre_delay_buffers[1][div_start];
698 int count = DIVISION_FRAMES / 4;
699
700 /* See warp_sinf() for the details for the constants. */
701 const __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f);
702 const __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f);
703 const __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f);
704 const __m128 A1 = _mm_set1_ps(1.5707910060882568359375f);
705
706 /* Exponential approach to desired gain. */
707 if (envelope_rate < 1) {
708 float c = compressor_gain - scaled_desired_gain;
709 float r = 1 - envelope_rate;
710 __m128 x0 = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
711 __m128 x, x2, x4, left, right, tmp1, tmp2;
712
713 __asm__ __volatile(
714 "jmp 2f \n"
715 "1: \n"
716 "mulps %[r4], %[x0] \n"
717 "2: \n"
718 "lddqu (%[ptr_left]), %[left] \n"
719 "lddqu (%[ptr_right]), %[right] \n"
720 "movaps %[x0], %[x] \n"
721 "addps %[base], %[x] \n"
722 /* Calculate warp_sin() for four values in x. */
723 "movaps %[x], %[x2] \n"
724 "mulps %[x], %[x2] \n"
725 "movaps %[x2], %[x4] \n"
726 "movaps %[x2], %[tmp1] \n"
727 "movaps %[x2], %[tmp2] \n"
728 "mulps %[x2], %[x4] \n"
729 "mulps %[A7], %[tmp1] \n"
730 "mulps %[A3], %[tmp2] \n"
731 "addps %[A5], %[tmp1] \n"
732 "addps %[A1], %[tmp2] \n"
733 "mulps %[x4], %[tmp1] \n"
734 "addps %[tmp1], %[tmp2] \n"
735 "mulps %[x], %[tmp2] \n"
736 /* Now tmp2 contains the result of warp_sin(). */
737 "mulps %[g], %[tmp2] \n"
738 "mulps %[tmp2], %[left] \n"
739 "mulps %[tmp2], %[right] \n"
740 "movdqu %[left], (%[ptr_left]) \n"
741 "movdqu %[right], (%[ptr_right]) \n"
742 "add $16, %[ptr_left] \n"
743 "add $16, %[ptr_right] \n"
744 "sub $1, %[count] \n"
745 "jne 1b \n"
746 : /* output */
747 "=r"(count),
748 "=r"(ptr_left),
749 "=r"(ptr_right),
750 "=x"(x0),
751 [x]"=&x"(x),
752 [x2]"=&x"(x2),
753 [x4]"=&x"(x4),
754 [left]"=&x"(left),
755 [right]"=&x"(right),
756 [tmp1]"=&x"(tmp1),
757 [tmp2]"=&x"(tmp2)
758 : /* input */
759 [count]"0"(count),
760 [ptr_left]"1"(ptr_left),
761 [ptr_right]"2"(ptr_right),
762 [x0]"3"(x0),
763 [A1]"x"(A1),
764 [A3]"x"(A3),
765 [A5]"x"(A5),
766 [A7]"x"(A7),
767 [base]"x"(_mm_set1_ps(scaled_desired_gain)),
768 [r4]"x"(_mm_set1_ps(r*r*r*r)),
769 [g]"x"(_mm_set1_ps(master_linear_gain))
770 : /* clobber */
771 "memory", "cc"
772 );
773 dk->compressor_gain = x[3];
774 } else {
775 /* See warp_sinf() for the details for the constants. */
776 __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f);
777 __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f);
778 __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f);
779 __m128 A1 = _mm_set1_ps(1.5707910060882568359375f);
780
781 float c = compressor_gain;
782 float r = envelope_rate;
783 __m128 x = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
784 __m128 x2, x4, left, right, tmp1, tmp2;
785
786 __asm__ __volatile(
787 "jmp 2f \n"
788 "1: \n"
789 "mulps %[r4], %[x] \n"
790 "2: \n"
791 "lddqu (%[ptr_left]), %[left] \n"
792 "lddqu (%[ptr_right]), %[right] \n"
793 "minps %[one], %[x] \n"
794 /* Calculate warp_sin() for four values in x. */
795 "movaps %[x], %[x2] \n"
796 "mulps %[x], %[x2] \n"
797 "movaps %[x2], %[x4] \n"
798 "movaps %[x2], %[tmp1] \n"
799 "movaps %[x2], %[tmp2] \n"
800 "mulps %[x2], %[x4] \n"
801 "mulps %[A7], %[tmp1] \n"
802 "mulps %[A3], %[tmp2] \n"
803 "addps %[A5], %[tmp1] \n"
804 "addps %[A1], %[tmp2] \n"
805 "mulps %[x4], %[tmp1] \n"
806 "addps %[tmp1], %[tmp2] \n"
807 "mulps %[x], %[tmp2] \n"
808 /* Now tmp2 contains the result of warp_sin(). */
809 "mulps %[g], %[tmp2] \n"
810 "mulps %[tmp2], %[left] \n"
811 "mulps %[tmp2], %[right] \n"
812 "movdqu %[left], (%[ptr_left]) \n"
813 "movdqu %[right], (%[ptr_right]) \n"
814 "add $16, %[ptr_left] \n"
815 "add $16, %[ptr_right] \n"
816 "sub $1, %[count] \n"
817 "jne 1b \n"
818 : /* output */
819 "=r"(count),
820 "=r"(ptr_left),
821 "=r"(ptr_right),
822 "=x"(x),
823 [x2]"=&x"(x2),
824 [x4]"=&x"(x4),
825 [left]"=&x"(left),
826 [right]"=&x"(right),
827 [tmp1]"=&x"(tmp1),
828 [tmp2]"=&x"(tmp2)
829 : /* input */
830 [count]"0"(count),
831 [ptr_left]"1"(ptr_left),
832 [ptr_right]"2"(ptr_right),
833 [x]"3"(x),
834 [A1]"x"(A1),
835 [A3]"x"(A3),
836 [A5]"x"(A5),
837 [A7]"x"(A7),
838 [one]"x"(_mm_set1_ps(1)),
839 [r4]"x"(_mm_set1_ps(r*r*r*r)),
840 [g]"x"(_mm_set1_ps(master_linear_gain))
841 : /* clobber */
842 "memory", "cc"
843 );
844 dk->compressor_gain = x[3];
845 }
846 }
847 #else
dk_compress_output(struct drc_kernel * dk)848 static void dk_compress_output(struct drc_kernel *dk)
849 {
850 const float master_linear_gain = dk->master_linear_gain;
851 const float envelope_rate = dk->envelope_rate;
852 const float scaled_desired_gain = dk->scaled_desired_gain;
853 const float compressor_gain = dk->compressor_gain;
854 const int div_start = dk->pre_delay_read_index;
855 float *ptr_left = &dk->pre_delay_buffers[0][div_start];
856 float *ptr_right = &dk->pre_delay_buffers[1][div_start];
857 unsigned int count = DIVISION_FRAMES / 4;
858
859 unsigned int i, j;
860
861 /* Exponential approach to desired gain. */
862 if (envelope_rate < 1) {
863 /* Attack - reduce gain to desired. */
864 float c = compressor_gain - scaled_desired_gain;
865 float base = scaled_desired_gain;
866 float r = 1 - envelope_rate;
867 float x[4] = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
868 float r4 = r*r*r*r;
869
870 i = 0;
871 while (1) {
872 for (j = 0; j < 4; j++) {
873 /* Warp pre-compression gain to smooth out sharp
874 * exponential transition points.
875 */
876 float post_warp_compressor_gain =
877 warp_sinf(x[j] + base);
878
879 /* Calculate total gain using master gain. */
880 float total_gain = master_linear_gain *
881 post_warp_compressor_gain;
882
883 /* Apply final gain. */
884 *ptr_left++ *= total_gain;
885 *ptr_right++ *= total_gain;
886 }
887
888 if (++i == count)
889 break;
890
891 for (j = 0; j < 4; j++)
892 x[j] = x[j] * r4;
893 }
894
895 dk->compressor_gain = x[3] + base;
896 } else {
897 /* Release - exponentially increase gain to 1.0 */
898 float c = compressor_gain;
899 float r = envelope_rate;
900 float x[4] = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
901 float r4 = r*r*r*r;
902
903 i = 0;
904 while (1) {
905 for (j = 0; j < 4; j++) {
906 /* Warp pre-compression gain to smooth out sharp
907 * exponential transition points.
908 */
909 float post_warp_compressor_gain =
910 warp_sinf(x[j]);
911
912 /* Calculate total gain using master gain. */
913 float total_gain = master_linear_gain *
914 post_warp_compressor_gain;
915
916 /* Apply final gain. */
917 *ptr_left++ *= total_gain;
918 *ptr_right++ *= total_gain;
919 }
920
921 if (++i == count)
922 break;
923
924 for (j = 0; j < 4; j++)
925 x[j] = min(1.0f, x[j] * r4);
926 }
927
928 dk->compressor_gain = x[3];
929 }
930 }
931 #endif
932
933 /* After one complete divison of samples have been received (and one divison of
934 * samples have been output), we calculate shaped power average
935 * (detector_average) from the input division, update envelope parameters from
936 * detector_average, then prepare the next output division by applying the
937 * envelope to compress the samples.
938 */
dk_process_one_division(struct drc_kernel * dk)939 static void dk_process_one_division(struct drc_kernel *dk)
940 {
941 dk_update_detector_average(dk);
942 dk_update_envelope(dk);
943 dk_compress_output(dk);
944 }
945
946 /* Copy the input data to the pre-delay buffer, and copy the output data back to
947 * the input buffer */
dk_copy_fragment(struct drc_kernel * dk,float * data_channels[],unsigned frame_index,int frames_to_process)948 static void dk_copy_fragment(struct drc_kernel *dk, float *data_channels[],
949 unsigned frame_index, int frames_to_process)
950 {
951 int write_index = dk->pre_delay_write_index;
952 int read_index = dk->pre_delay_read_index;
953 int j;
954
955 for (j = 0; j < DRC_NUM_CHANNELS; ++j) {
956 memcpy(&dk->pre_delay_buffers[j][write_index],
957 &data_channels[j][frame_index],
958 frames_to_process * sizeof(float));
959 memcpy(&data_channels[j][frame_index],
960 &dk->pre_delay_buffers[j][read_index],
961 frames_to_process * sizeof(float));
962 }
963
964 dk->pre_delay_write_index = (write_index + frames_to_process) &
965 MAX_PRE_DELAY_FRAMES_MASK;
966 dk->pre_delay_read_index = (read_index + frames_to_process) &
967 MAX_PRE_DELAY_FRAMES_MASK;
968 }
969
970 /* Delay the input sample only and don't do other processing. This is used when
971 * the kernel is disabled. We want to do this to match the processing delay in
972 * kernels of other bands.
973 */
dk_process_delay_only(struct drc_kernel * dk,float * data_channels[],unsigned count)974 static void dk_process_delay_only(struct drc_kernel *dk, float *data_channels[],
975 unsigned count)
976 {
977 int read_index = dk->pre_delay_read_index;
978 int write_index = dk->pre_delay_write_index;
979 unsigned int i = 0;
980
981 while (i < count) {
982 unsigned int j;
983 unsigned int small = min(read_index, write_index);
984 unsigned int large = max(read_index, write_index);
985 /* chunk is the minimum of readable samples in contiguous
986 * buffer, writable samples in contiguous buffer, and the
987 * available input samples. */
988 unsigned int chunk = min(large - small,
989 MAX_PRE_DELAY_FRAMES - large);
990 chunk = min(chunk, count - i);
991 for (j = 0; j < DRC_NUM_CHANNELS; ++j) {
992 memcpy(&dk->pre_delay_buffers[j][write_index],
993 &data_channels[j][i],
994 chunk * sizeof(float));
995 memcpy(&data_channels[j][i],
996 &dk->pre_delay_buffers[j][read_index],
997 chunk * sizeof(float));
998 }
999 read_index = (read_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK;
1000 write_index = (write_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK;
1001 i += chunk;
1002 }
1003
1004 dk->pre_delay_read_index = read_index;
1005 dk->pre_delay_write_index = write_index;
1006 }
1007
dk_process(struct drc_kernel * dk,float * data_channels[],unsigned count)1008 void dk_process(struct drc_kernel *dk, float *data_channels[], unsigned count)
1009 {
1010 unsigned int i = 0;
1011 int fragment;
1012
1013 if (!dk->enabled) {
1014 dk_process_delay_only(dk, data_channels, count);
1015 return;
1016 }
1017
1018 if (!dk->processed) {
1019 dk_update_envelope(dk);
1020 dk_compress_output(dk);
1021 dk->processed = 1;
1022 }
1023
1024 int offset = dk->pre_delay_write_index & DIVISION_FRAMES_MASK;
1025 while (i < count) {
1026 fragment = min(DIVISION_FRAMES - offset, count - i);
1027 dk_copy_fragment(dk, data_channels, i, fragment);
1028 i += fragment;
1029 offset = (offset + fragment) & DIVISION_FRAMES_MASK;
1030
1031 /* Process the input division (32 frames). */
1032 if (offset == 0)
1033 dk_process_one_division(dk);
1034 }
1035 }
1036