1 /* Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2  * Use of this source code is governed by a BSD-style license that can be
3  * found in the LICENSE file.
4  */
5 
6 /* Copyright (C) 2011 Google Inc. All rights reserved.
7  * Use of this source code is governed by a BSD-style license that can be
8  * found in the LICENSE.WEBKIT file.
9  */
10 
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 
15 #include "drc_math.h"
16 #include "drc_kernel.h"
17 
18 #define MAX_PRE_DELAY_FRAMES 1024
19 #define MAX_PRE_DELAY_FRAMES_MASK (MAX_PRE_DELAY_FRAMES - 1)
20 #define DEFAULT_PRE_DELAY_FRAMES 256
21 #define DIVISION_FRAMES 32
22 #define DIVISION_FRAMES_MASK (DIVISION_FRAMES - 1)
23 
24 #define assert_on_compile(e) ((void)sizeof(char[1 - 2 * !(e)]))
25 #define assert_on_compile_is_power_of_2(n)                                     \
26 	assert_on_compile((n) != 0 && (((n) & ((n)-1)) == 0))
27 
28 const float uninitialized_value = -1;
29 static int drc_math_initialized;
30 
dk_init(struct drc_kernel * dk,float sample_rate)31 void dk_init(struct drc_kernel *dk, float sample_rate)
32 {
33 	int i;
34 
35 	if (!drc_math_initialized) {
36 		drc_math_initialized = 1;
37 		drc_math_init();
38 	}
39 
40 	dk->sample_rate = sample_rate;
41 	dk->detector_average = 0;
42 	dk->compressor_gain = 1;
43 	dk->enabled = 0;
44 	dk->processed = 0;
45 	dk->last_pre_delay_frames = DEFAULT_PRE_DELAY_FRAMES;
46 	dk->pre_delay_read_index = 0;
47 	dk->pre_delay_write_index = DEFAULT_PRE_DELAY_FRAMES;
48 	dk->max_attack_compression_diff_db = -INFINITY;
49 	dk->ratio = uninitialized_value;
50 	dk->slope = uninitialized_value;
51 	dk->linear_threshold = uninitialized_value;
52 	dk->db_threshold = uninitialized_value;
53 	dk->db_knee = uninitialized_value;
54 	dk->knee_threshold = uninitialized_value;
55 	dk->ratio_base = uninitialized_value;
56 	dk->K = uninitialized_value;
57 
58 	assert_on_compile_is_power_of_2(DIVISION_FRAMES);
59 	assert_on_compile(DIVISION_FRAMES % 4 == 0);
60 	/* Allocate predelay buffers */
61 	assert_on_compile_is_power_of_2(MAX_PRE_DELAY_FRAMES);
62 	for (i = 0; i < DRC_NUM_CHANNELS; i++) {
63 		size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES;
64 		dk->pre_delay_buffers[i] = (float *)calloc(1, size);
65 	}
66 }
67 
dk_free(struct drc_kernel * dk)68 void dk_free(struct drc_kernel *dk)
69 {
70 	int i;
71 	for (i = 0; i < DRC_NUM_CHANNELS; ++i)
72 		free(dk->pre_delay_buffers[i]);
73 }
74 
75 /* Sets the pre-delay (lookahead) buffer size */
set_pre_delay_time(struct drc_kernel * dk,float pre_delay_time)76 static void set_pre_delay_time(struct drc_kernel *dk, float pre_delay_time)
77 {
78 	int i;
79 	/* Re-configure look-ahead section pre-delay if delay time has
80 	 * changed. */
81 	unsigned pre_delay_frames = pre_delay_time * dk->sample_rate;
82 	pre_delay_frames = min(pre_delay_frames, MAX_PRE_DELAY_FRAMES - 1);
83 
84 	/* Make pre_delay_frames multiplies of DIVISION_FRAMES. This way we
85 	 * won't split a division of samples into two blocks of memory, so it is
86 	 * easier to process. This may make the actual delay time slightly less
87 	 * than the specified value, but the difference is less than 1ms. */
88 	pre_delay_frames &= ~DIVISION_FRAMES_MASK;
89 
90 	/* We need at least one division buffer, so the incoming data won't
91 	 * overwrite the output data */
92 	pre_delay_frames = max(pre_delay_frames, DIVISION_FRAMES);
93 
94 	if (dk->last_pre_delay_frames != pre_delay_frames) {
95 		dk->last_pre_delay_frames = pre_delay_frames;
96 		for (i = 0; i < DRC_NUM_CHANNELS; ++i) {
97 			size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES;
98 			memset(dk->pre_delay_buffers[i], 0, size);
99 		}
100 
101 		dk->pre_delay_read_index = 0;
102 		dk->pre_delay_write_index = pre_delay_frames;
103 	}
104 }
105 
106 /* Exponential curve for the knee.  It is 1st derivative matched at
107  * dk->linear_threshold and asymptotically approaches the value
108  * dk->linear_threshold + 1 / k.
109  *
110  * This is used only when calculating the static curve, not used when actually
111  * compress the input data (knee_curveK below is used instead).
112  */
knee_curve(struct drc_kernel * dk,float x,float k)113 static float knee_curve(struct drc_kernel *dk, float x, float k)
114 {
115 	/* Linear up to threshold. */
116 	if (x < dk->linear_threshold)
117 		return x;
118 
119 	return dk->linear_threshold +
120 	       (1 - knee_expf(-k * (x - dk->linear_threshold))) / k;
121 }
122 
123 /* Approximate 1st derivative with input and output expressed in dB.  This slope
124  * is equal to the inverse of the compression "ratio".  In other words, a
125  * compression ratio of 20 would be a slope of 1/20.
126  */
slope_at(struct drc_kernel * dk,float x,float k)127 static float slope_at(struct drc_kernel *dk, float x, float k)
128 {
129 	if (x < dk->linear_threshold)
130 		return 1;
131 
132 	float x2 = x * 1.001;
133 
134 	float x_db = linear_to_decibels(x);
135 	float x2Db = linear_to_decibels(x2);
136 
137 	float y_db = linear_to_decibels(knee_curve(dk, x, k));
138 	float y2Db = linear_to_decibels(knee_curve(dk, x2, k));
139 
140 	float m = (y2Db - y_db) / (x2Db - x_db);
141 
142 	return m;
143 }
144 
k_at_slope(struct drc_kernel * dk,float desired_slope)145 static float k_at_slope(struct drc_kernel *dk, float desired_slope)
146 {
147 	float x_db = dk->db_threshold + dk->db_knee;
148 	float x = decibels_to_linear(x_db);
149 
150 	/* Approximate k given initial values. */
151 	float minK = 0.1;
152 	float maxK = 10000;
153 	float k = 5;
154 	int i;
155 
156 	for (i = 0; i < 15; ++i) {
157 		/* A high value for k will more quickly asymptotically approach
158 		 * a slope of 0. */
159 		float slope = slope_at(dk, x, k);
160 
161 		if (slope < desired_slope) {
162 			/* k is too high. */
163 			maxK = k;
164 		} else {
165 			/* k is too low. */
166 			minK = k;
167 		}
168 
169 		/* Re-calculate based on geometric mean. */
170 		k = sqrtf(minK * maxK);
171 	}
172 
173 	return k;
174 }
175 
update_static_curve_parameters(struct drc_kernel * dk,float db_threshold,float db_knee,float ratio)176 static void update_static_curve_parameters(struct drc_kernel *dk,
177 					   float db_threshold, float db_knee,
178 					   float ratio)
179 {
180 	if (db_threshold != dk->db_threshold || db_knee != dk->db_knee ||
181 	    ratio != dk->ratio) {
182 		/* Threshold and knee. */
183 		dk->db_threshold = db_threshold;
184 		dk->linear_threshold = decibels_to_linear(db_threshold);
185 		dk->db_knee = db_knee;
186 
187 		/* Compute knee parameters. */
188 		dk->ratio = ratio;
189 		dk->slope = 1 / dk->ratio;
190 
191 		float k = k_at_slope(dk, 1 / dk->ratio);
192 		dk->K = k;
193 		/* See knee_curveK() for details */
194 		dk->knee_alpha = dk->linear_threshold + 1 / k;
195 		dk->knee_beta = -expf(k * dk->linear_threshold) / k;
196 
197 		dk->knee_threshold = decibels_to_linear(db_threshold + db_knee);
198 		/* See volume_gain() for details */
199 		float y0 = knee_curve(dk, dk->knee_threshold, k);
200 		dk->ratio_base = y0 * powf(dk->knee_threshold, -dk->slope);
201 	}
202 }
203 
204 /* This is the knee part of the compression curve. Returns the output level
205  * given the input level x. */
knee_curveK(struct drc_kernel * dk,float x)206 static float knee_curveK(struct drc_kernel *dk, float x)
207 {
208 	/* The formula in knee_curveK is dk->linear_threshold +
209 	 * (1 - expf(-k * (x - dk->linear_threshold))) / k
210 	 * which simplifies to (alpha + beta * expf(gamma))
211 	 * where alpha = dk->linear_threshold + 1 / k
212 	 *	 beta = -expf(k * dk->linear_threshold) / k
213 	 *	 gamma = -k * x
214 	 */
215 	return dk->knee_alpha + dk->knee_beta * knee_expf(-dk->K * x);
216 }
217 
218 /* Full compression curve with constant ratio after knee. Returns the ratio of
219  * output and input signal. */
volume_gain(struct drc_kernel * dk,float x)220 static float volume_gain(struct drc_kernel *dk, float x)
221 {
222 	float y;
223 
224 	if (x < dk->knee_threshold) {
225 		if (x < dk->linear_threshold)
226 			return 1;
227 		y = knee_curveK(dk, x) / x;
228 	} else {
229 		/* Constant ratio after knee.
230 		 * log(y/y0) = s * log(x/x0)
231 		 * => y = y0 * (x/x0)^s
232 		 * => y = [y0 * (1/x0)^s] * x^s
233 		 * => y = dk->ratio_base * x^s
234 		 * => y/x = dk->ratio_base * x^(s - 1)
235 		 * => y/x = dk->ratio_base * e^(log(x) * (s - 1))
236 		 */
237 		y = dk->ratio_base * knee_expf(logf(x) * (dk->slope - 1));
238 	}
239 
240 	return y;
241 }
242 
dk_set_parameters(struct drc_kernel * dk,float db_threshold,float db_knee,float ratio,float attack_time,float release_time,float pre_delay_time,float db_post_gain,float releaseZone1,float releaseZone2,float releaseZone3,float releaseZone4)243 void dk_set_parameters(struct drc_kernel *dk, float db_threshold, float db_knee,
244 		       float ratio, float attack_time, float release_time,
245 		       float pre_delay_time, float db_post_gain,
246 		       float releaseZone1, float releaseZone2,
247 		       float releaseZone3, float releaseZone4)
248 {
249 	float sample_rate = dk->sample_rate;
250 
251 	update_static_curve_parameters(dk, db_threshold, db_knee, ratio);
252 
253 	/* Makeup gain. */
254 	float full_range_gain = volume_gain(dk, 1);
255 	float full_range_makeup_gain = 1 / full_range_gain;
256 
257 	/* Empirical/perceptual tuning. */
258 	full_range_makeup_gain = powf(full_range_makeup_gain, 0.6f);
259 
260 	dk->main_linear_gain =
261 		decibels_to_linear(db_post_gain) * full_range_makeup_gain;
262 
263 	/* Attack parameters. */
264 	attack_time = max(0.001f, attack_time);
265 	dk->attack_frames = attack_time * sample_rate;
266 
267 	/* Release parameters. */
268 	float release_frames = sample_rate * release_time;
269 
270 	/* Detector release time. */
271 	float sat_release_time = 0.0025f;
272 	float sat_release_frames = sat_release_time * sample_rate;
273 	dk->sat_release_frames_inv_neg = -1 / sat_release_frames;
274 	dk->sat_release_rate_at_neg_two_db =
275 		decibels_to_linear(-2 * dk->sat_release_frames_inv_neg) - 1;
276 
277 	/* Create a smooth function which passes through four points.
278 	 * Polynomial of the form y = a + b*x + c*x^2 + d*x^3 + e*x^4
279 	 */
280 	float y1 = release_frames * releaseZone1;
281 	float y2 = release_frames * releaseZone2;
282 	float y3 = release_frames * releaseZone3;
283 	float y4 = release_frames * releaseZone4;
284 
285 	/* All of these coefficients were derived for 4th order polynomial curve
286 	 * fitting where the y values match the evenly spaced x values as
287 	 * follows: (y1 : x == 0, y2 : x == 1, y3 : x == 2, y4 : x == 3)
288 	 */
289 	dk->kA = 0.9999999999999998f * y1 + 1.8432219684323923e-16f * y2 -
290 		 1.9373394351676423e-16f * y3 + 8.824516011816245e-18f * y4;
291 	dk->kB = -1.5788320352845888f * y1 + 2.3305837032074286f * y2 -
292 		 0.9141194204840429f * y3 + 0.1623677525612032f * y4;
293 	dk->kC = 0.5334142869106424f * y1 - 1.272736789213631f * y2 +
294 		 0.9258856042207512f * y3 - 0.18656310191776226f * y4;
295 	dk->kD = 0.08783463138207234f * y1 - 0.1694162967925622f * y2 +
296 		 0.08588057951595272f * y3 - 0.00429891410546283f * y4;
297 	dk->kE = -0.042416883008123074f * y1 + 0.1115693827987602f * y2 -
298 		 0.09764676325265872f * y3 + 0.028494263462021576f * y4;
299 
300 	/* x ranges from 0 -> 3	      0	   1	2   3
301 	 *			     -15  -10  -5   0db
302 	 *
303 	 * y calculates adaptive release frames depending on the amount of
304 	 * compression.
305 	 */
306 	set_pre_delay_time(dk, pre_delay_time);
307 }
308 
dk_set_enabled(struct drc_kernel * dk,int enabled)309 void dk_set_enabled(struct drc_kernel *dk, int enabled)
310 {
311 	dk->enabled = enabled;
312 }
313 
314 /* Updates the envelope_rate used for the next division */
dk_update_envelope(struct drc_kernel * dk)315 static void dk_update_envelope(struct drc_kernel *dk)
316 {
317 	const float kA = dk->kA;
318 	const float kB = dk->kB;
319 	const float kC = dk->kC;
320 	const float kD = dk->kD;
321 	const float kE = dk->kE;
322 	const float attack_frames = dk->attack_frames;
323 
324 	/* Calculate desired gain */
325 	float desired_gain = dk->detector_average;
326 
327 	/* Pre-warp so we get desired_gain after sin() warp below. */
328 	float scaled_desired_gain = warp_asinf(desired_gain);
329 
330 	/* Deal with envelopes */
331 
332 	/* envelope_rate is the rate we slew from current compressor level to
333 	 * the desired level.  The exact rate depends on if we're attacking or
334 	 * releasing and by how much.
335 	 */
336 	float envelope_rate;
337 
338 	int is_releasing = scaled_desired_gain > dk->compressor_gain;
339 
340 	/* compression_diff_db is the difference between current compression
341 	 * level and the desired level. */
342 	float compression_diff_db =
343 		linear_to_decibels(dk->compressor_gain / scaled_desired_gain);
344 
345 	if (is_releasing) {
346 		/* Release mode - compression_diff_db should be negative dB */
347 		dk->max_attack_compression_diff_db = -INFINITY;
348 
349 		/* Fix gremlins. */
350 		if (isbadf(compression_diff_db))
351 			compression_diff_db = -1;
352 
353 		/* Adaptive release - higher compression (lower
354 		 * compression_diff_db) releases faster. Contain within range:
355 		 * -12 -> 0 then scale to go from 0 -> 3
356 		 */
357 		float x = compression_diff_db;
358 		x = max(-12.0f, x);
359 		x = min(0.0f, x);
360 		x = 0.25f * (x + 12);
361 
362 		/* Compute adaptive release curve using 4th order polynomial.
363 		 * Normal values for the polynomial coefficients would create a
364 		 * monotonically increasing function.
365 		 */
366 		float x2 = x * x;
367 		float x3 = x2 * x;
368 		float x4 = x2 * x2;
369 		float release_frames =
370 			kA + kB * x + kC * x2 + kD * x3 + kE * x4;
371 
372 #define kSpacingDb 5
373 		float db_per_frame = kSpacingDb / release_frames;
374 		envelope_rate = decibels_to_linear(db_per_frame);
375 	} else {
376 		/* Attack mode - compression_diff_db should be positive dB */
377 
378 		/* Fix gremlins. */
379 		if (isbadf(compression_diff_db))
380 			compression_diff_db = 1;
381 
382 		/* As long as we're still in attack mode, use a rate based off
383 		 * the largest compression_diff_db we've encountered so far.
384 		 */
385 		dk->max_attack_compression_diff_db =
386 			max(dk->max_attack_compression_diff_db,
387 			    compression_diff_db);
388 
389 		float eff_atten_diff_db =
390 			max(0.5f, dk->max_attack_compression_diff_db);
391 
392 		float x = 0.25f / eff_atten_diff_db;
393 		envelope_rate = 1 - powf(x, 1 / attack_frames);
394 	}
395 
396 	dk->envelope_rate = envelope_rate;
397 	dk->scaled_desired_gain = scaled_desired_gain;
398 }
399 
400 /* For a division of frames, take the absolute values of left channel and right
401  * channel, store the maximum of them in output. */
402 #if defined(__aarch64__)
max_abs_division(float * output,const float * data0,const float * data1)403 static inline void max_abs_division(float *output, const float *data0,
404 				    const float *data1)
405 {
406 	int count = DIVISION_FRAMES / 4;
407 
408 	// clang-format off
409 	__asm__ __volatile__(
410 		"1:                                     \n"
411 		"ld1 {v0.4s}, [%[data0]], #16           \n"
412 		"ld1 {v1.4s}, [%[data1]], #16           \n"
413 		"fabs v0.4s, v0.4s                      \n"
414 		"fabs v1.4s, v1.4s                      \n"
415 		"fmax v0.4s, v0.4s, v1.4s               \n"
416 		"st1 {v0.4s}, [%[output]], #16          \n"
417 		"subs %w[count], %w[count], #1          \n"
418 		"b.ne 1b                                \n"
419 		: /* output */
420 		  [data0]"+r"(data0),
421 		  [data1]"+r"(data1),
422 		  [output]"+r"(output),
423 		  [count]"+r"(count)
424 		: /* input */
425 		: /* clobber */
426 		  "v0", "v1", "memory", "cc");
427 	// clang-format on
428 }
429 #elif defined(__ARM_NEON__)
max_abs_division(float * output,const float * data0,const float * data1)430 static inline void max_abs_division(float *output, const float *data0,
431 				    const float *data1)
432 {
433 	int count = DIVISION_FRAMES / 4;
434 
435 	// clang-format off
436 	__asm__ __volatile__(
437 		"1:                                     \n"
438 		"vld1.32 {q0}, [%[data0]]!              \n"
439 		"vld1.32 {q1}, [%[data1]]!              \n"
440 		"vabs.f32 q0, q0                        \n"
441 		"vabs.f32 q1, q1                        \n"
442 		"vmax.f32 q0, q1                        \n"
443 		"vst1.32 {q0}, [%[output]]!             \n"
444 		"subs %[count], #1                      \n"
445 		"bne 1b                                 \n"
446 		: /* output */
447 		  [data0]"+r"(data0),
448 		  [data1]"+r"(data1),
449 		  [output]"+r"(output),
450 		  [count]"+r"(count)
451 		: /* input */
452 		: /* clobber */
453 		  "q0", "q1", "memory", "cc");
454 	// clang-format on
455 }
456 #elif defined(__SSE3__)
457 #include <emmintrin.h>
max_abs_division(float * output,const float * data0,const float * data1)458 static inline void max_abs_division(float *output, const float *data0,
459 				    const float *data1)
460 {
461 	__m128 x, y;
462 	int count = DIVISION_FRAMES / 4;
463 	// clang-format off
464 	__asm__ __volatile__(
465 		"1:                                     \n"
466 		"lddqu (%[data0]), %[x]                 \n"
467 		"lddqu (%[data1]), %[y]                 \n"
468 		"andps %[mask], %[x]                    \n"
469 		"andps %[mask], %[y]                    \n"
470 		"maxps %[y], %[x]                       \n"
471 		"movdqu %[x], (%[output])               \n"
472 		"add $16, %[data0]                      \n"
473 		"add $16, %[data1]                      \n"
474 		"add $16, %[output]                     \n"
475 		"sub $1, %[count]                       \n"
476 		"jnz 1b                                 \n"
477 		: /* output */
478 		  [data0]"+r"(data0),
479 		  [data1]"+r"(data1),
480 		  [output]"+r"(output),
481 		  [count]"+r"(count),
482 		  [x]"=&x"(x),
483 		  [y]"=&x"(y)
484 		: /* input */
485 		  [mask]"x"(_mm_set1_epi32(0x7fffffff))
486 		: /* clobber */
487 		  "memory", "cc");
488 	// clang-format on
489 }
490 #else
max_abs_division(float * output,const float * data0,const float * data1)491 static inline void max_abs_division(float *output, const float *data0,
492 				    const float *data1)
493 {
494 	int i;
495 	for (i = 0; i < DIVISION_FRAMES; i++)
496 		output[i] = fmaxf(fabsf(data0[i]), fabsf(data1[i]));
497 }
498 #endif
499 
500 /* Update detector_average from the last input division. */
dk_update_detector_average(struct drc_kernel * dk)501 static void dk_update_detector_average(struct drc_kernel *dk)
502 {
503 	float abs_input_array[DIVISION_FRAMES];
504 	const float sat_release_frames_inv_neg = dk->sat_release_frames_inv_neg;
505 	const float sat_release_rate_at_neg_two_db =
506 		dk->sat_release_rate_at_neg_two_db;
507 	float detector_average = dk->detector_average;
508 	int div_start, i;
509 
510 	/* Calculate the start index of the last input division */
511 	if (dk->pre_delay_write_index == 0) {
512 		div_start = MAX_PRE_DELAY_FRAMES - DIVISION_FRAMES;
513 	} else {
514 		div_start = dk->pre_delay_write_index - DIVISION_FRAMES;
515 	}
516 
517 	/* The max abs value across all channels for this frame */
518 	max_abs_division(abs_input_array, &dk->pre_delay_buffers[0][div_start],
519 			 &dk->pre_delay_buffers[1][div_start]);
520 
521 	for (i = 0; i < DIVISION_FRAMES; i++) {
522 		/* Compute compression amount from un-delayed signal */
523 		float abs_input = abs_input_array[i];
524 
525 		/* Calculate shaped power on undelayed input.  Put through
526 		 * shaping curve. This is linear up to the threshold, then
527 		 * enters a "knee" portion followed by the "ratio" portion. The
528 		 * transition from the threshold to the knee is smooth (1st
529 		 * derivative matched). The transition from the knee to the
530 		 * ratio portion is smooth (1st derivative matched).
531 		 */
532 		float gain = volume_gain(dk, abs_input);
533 		int is_release = (gain > detector_average);
534 		if (is_release) {
535 			if (gain > NEG_TWO_DB) {
536 				detector_average +=
537 					(gain - detector_average) *
538 					sat_release_rate_at_neg_two_db;
539 			} else {
540 				float gain_db = linear_to_decibels(gain);
541 				float db_per_frame =
542 					gain_db * sat_release_frames_inv_neg;
543 				float sat_release_rate =
544 					decibels_to_linear(db_per_frame) - 1;
545 				detector_average += (gain - detector_average) *
546 						    sat_release_rate;
547 			}
548 		} else {
549 			detector_average = gain;
550 		}
551 
552 		/* Fix gremlins. */
553 		if (isbadf(detector_average))
554 			detector_average = 1.0f;
555 		else
556 			detector_average = min(detector_average, 1.0f);
557 	}
558 
559 	dk->detector_average = detector_average;
560 }
561 
562 /* Calculate compress_gain from the envelope and apply total_gain to compress
563  * the next output division. */
564 /* TODO(fbarchard): Port to aarch64 */
565 #if defined(__ARM_NEON__)
566 #include <arm_neon.h>
dk_compress_output(struct drc_kernel * dk)567 static void dk_compress_output(struct drc_kernel *dk)
568 {
569 	const float main_linear_gain = dk->main_linear_gain;
570 	const float envelope_rate = dk->envelope_rate;
571 	const float scaled_desired_gain = dk->scaled_desired_gain;
572 	const float compressor_gain = dk->compressor_gain;
573 	const int div_start = dk->pre_delay_read_index;
574 	float *ptr_left = &dk->pre_delay_buffers[0][div_start];
575 	float *ptr_right = &dk->pre_delay_buffers[1][div_start];
576 	int count = DIVISION_FRAMES / 4;
577 
578 	/* See warp_sinf() for the details for the constants. */
579 	const float32x4_t A7 = vdupq_n_f32(-4.3330336920917034149169921875e-3f);
580 	const float32x4_t A5 = vdupq_n_f32(7.9434238374233245849609375e-2f);
581 	const float32x4_t A3 = vdupq_n_f32(-0.645892798900604248046875f);
582 	const float32x4_t A1 = vdupq_n_f32(1.5707910060882568359375f);
583 
584 	/* Exponential approach to desired gain. */
585 	if (envelope_rate < 1) {
586 		float c = compressor_gain - scaled_desired_gain;
587 		float r = 1 - envelope_rate;
588 		float32x4_t x0 = { c * r, c * r * r, c * r * r * r,
589 				   c * r * r * r * r };
590 		float32x4_t x, x2, x4, left, right, tmp1, tmp2;
591 
592 		// clang-format off
593 		__asm__ __volatile(
594 			"b 2f                                               \n"
595 			"1:                                                 \n"
596 			"vmul.f32 %q[x0], %q[r4]                            \n"
597 			"2:                                                 \n"
598 			"vld1.32 {%e[left],%f[left]}, [%[ptr_left]]         \n"
599 			"vld1.32 {%e[right],%f[right]}, [%[ptr_right]]      \n"
600 			"vadd.f32 %q[x], %q[x0], %q[base]                   \n"
601 			/* Calculate warp_sin() for four values in x. */
602 			"vmul.f32 %q[x2], %q[x], %q[x]                      \n"
603 			"vmov.f32 %q[tmp1], %q[A5]                          \n"
604 			"vmov.f32 %q[tmp2], %q[A1]                          \n"
605 			"vmul.f32 %q[x4], %q[x2], %q[x2]                    \n"
606 			"vmla.f32 %q[tmp1], %q[A7], %q[x2]                  \n"
607 			"vmla.f32 %q[tmp2], %q[A3], %q[x2]                  \n"
608 			"vmla.f32 %q[tmp2], %q[tmp1], %q[x4]                \n"
609 			"vmul.f32 %q[tmp2], %q[tmp2], %q[x]                 \n"
610 			/* Now tmp2 contains the result of warp_sin(). */
611 			"vmul.f32 %q[tmp2], %q[tmp2], %q[g]                 \n"
612 			"vmul.f32 %q[left], %q[tmp2]                        \n"
613 			"vmul.f32 %q[right], %q[tmp2]                       \n"
614 			"vst1.32 {%e[left],%f[left]}, [%[ptr_left]]!        \n"
615 			"vst1.32 {%e[right],%f[right]}, [%[ptr_right]]!     \n"
616 			"subs %[count], #1                                  \n"
617 			"bne 1b                                             \n"
618 			: /* output */
619 			  "=r"(count),
620 			  "=r"(ptr_left),
621 			  "=r"(ptr_right),
622 			  "=w"(x0),
623 			  [x]"=&w"(x),
624 			  [x2]"=&w"(x2),
625 			  [x4]"=&w"(x4),
626 			  [left]"=&w"(left),
627 			  [right]"=&w"(right),
628 			  [tmp1]"=&w"(tmp1),
629 			  [tmp2]"=&w"(tmp2)
630 			: /* input */
631 			  [count]"0"(count),
632 			  [ptr_left]"1"(ptr_left),
633 			  [ptr_right]"2"(ptr_right),
634 			  [x0]"3"(x0),
635 			  [A1]"w"(A1),
636 			  [A3]"w"(A3),
637 			  [A5]"w"(A5),
638 			  [A7]"w"(A7),
639 			  [base]"w"(vdupq_n_f32(scaled_desired_gain)),
640 			  [r4]"w"(vdupq_n_f32(r*r*r*r)),
641 			  [g]"w"(vdupq_n_f32(main_linear_gain))
642 			: /* clobber */
643 			  "memory", "cc");
644 		// clang-format on
645 		dk->compressor_gain = x[3];
646 	} else {
647 		float c = compressor_gain;
648 		float r = envelope_rate;
649 		float32x4_t x = { c * r, c * r * r, c * r * r * r,
650 				  c * r * r * r * r };
651 		float32x4_t x2, x4, left, right, tmp1, tmp2;
652 
653 		// clang-format off
654 		__asm__ __volatile(
655 			"b 2f                                               \n"
656 			"1:                                                 \n"
657 			"vmul.f32 %q[x], %q[r4]                             \n"
658 			"2:                                                 \n"
659 			"vld1.32 {%e[left],%f[left]}, [%[ptr_left]]         \n"
660 			"vld1.32 {%e[right],%f[right]}, [%[ptr_right]]      \n"
661 			"vmin.f32 %q[x], %q[one]                            \n"
662 			/* Calculate warp_sin() for four values in x. */
663 			"vmul.f32 %q[x2], %q[x], %q[x]                      \n"
664 			"vmov.f32 %q[tmp1], %q[A5]                          \n"
665 			"vmov.f32 %q[tmp2], %q[A1]                          \n"
666 			"vmul.f32 %q[x4], %q[x2], %q[x2]                    \n"
667 			"vmla.f32 %q[tmp1], %q[A7], %q[x2]                  \n"
668 			"vmla.f32 %q[tmp2], %q[A3], %q[x2]                  \n"
669 			"vmla.f32 %q[tmp2], %q[tmp1], %q[x4]                \n"
670 			"vmul.f32 %q[tmp2], %q[tmp2], %q[x]                 \n"
671 			/* Now tmp2 contains the result of warp_sin(). */
672 			"vmul.f32 %q[tmp2], %q[tmp2], %q[g]                 \n"
673 			"vmul.f32 %q[left], %q[tmp2]                        \n"
674 			"vmul.f32 %q[right], %q[tmp2]                       \n"
675 			"vst1.32 {%e[left],%f[left]}, [%[ptr_left]]!        \n"
676 			"vst1.32 {%e[right],%f[right]}, [%[ptr_right]]!     \n"
677 			"subs %[count], #1                                  \n"
678 			"bne 1b                                             \n"
679 			: /* output */
680 			  "=r"(count),
681 			  "=r"(ptr_left),
682 			  "=r"(ptr_right),
683 			  "=w"(x),
684 			  [x2]"=&w"(x2),
685 			  [x4]"=&w"(x4),
686 			  [left]"=&w"(left),
687 			  [right]"=&w"(right),
688 			  [tmp1]"=&w"(tmp1),
689 			  [tmp2]"=&w"(tmp2)
690 			: /* input */
691 			  [count]"0"(count),
692 			  [ptr_left]"1"(ptr_left),
693 			  [ptr_right]"2"(ptr_right),
694 			  [x]"3"(x),
695 			  [A1]"w"(A1),
696 			  [A3]"w"(A3),
697 			  [A5]"w"(A5),
698 			  [A7]"w"(A7),
699 			  [one]"w"(vdupq_n_f32(1)),
700 			  [r4]"w"(vdupq_n_f32(r*r*r*r)),
701 			  [g]"w"(vdupq_n_f32(main_linear_gain))
702 			: /* clobber */
703 			  "memory", "cc");
704 		// clang-format on
705 		dk->compressor_gain = x[3];
706 	}
707 }
708 #elif defined(__SSE3__) && defined(__x86_64__)
709 #include <emmintrin.h>
dk_compress_output(struct drc_kernel * dk)710 static void dk_compress_output(struct drc_kernel *dk)
711 {
712 	const float main_linear_gain = dk->main_linear_gain;
713 	const float envelope_rate = dk->envelope_rate;
714 	const float scaled_desired_gain = dk->scaled_desired_gain;
715 	const float compressor_gain = dk->compressor_gain;
716 	const int div_start = dk->pre_delay_read_index;
717 	float *ptr_left = &dk->pre_delay_buffers[0][div_start];
718 	float *ptr_right = &dk->pre_delay_buffers[1][div_start];
719 	int count = DIVISION_FRAMES / 4;
720 
721 	/* See warp_sinf() for the details for the constants. */
722 	const __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f);
723 	const __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f);
724 	const __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f);
725 	const __m128 A1 = _mm_set1_ps(1.5707910060882568359375f);
726 
727 	/* Exponential approach to desired gain. */
728 	if (envelope_rate < 1) {
729 		float c = compressor_gain - scaled_desired_gain;
730 		float r = 1 - envelope_rate;
731 		__m128 x0 = { c * r, c * r * r, c * r * r * r,
732 			      c * r * r * r * r };
733 		__m128 x, x2, x4, left, right, tmp1, tmp2;
734 
735 		// clang-format off
736 		__asm__ __volatile(
737 			"jmp 2f                                     \n"
738 			"1:                                         \n"
739 			"mulps %[r4], %[x0]                         \n"
740 			"2:                                         \n"
741 			"lddqu (%[ptr_left]), %[left]               \n"
742 			"lddqu (%[ptr_right]), %[right]             \n"
743 			"movaps %[x0], %[x]                         \n"
744 			"addps %[base], %[x]                        \n"
745 			/* Calculate warp_sin() for four values in x. */
746 			"movaps %[x], %[x2]                         \n"
747 			"mulps %[x], %[x2]                          \n"
748 			"movaps %[x2], %[x4]                        \n"
749 			"movaps %[x2], %[tmp1]                      \n"
750 			"movaps %[x2], %[tmp2]                      \n"
751 			"mulps %[x2], %[x4]                         \n"
752 			"mulps %[A7], %[tmp1]                       \n"
753 			"mulps %[A3], %[tmp2]                       \n"
754 			"addps %[A5], %[tmp1]                       \n"
755 			"addps %[A1], %[tmp2]                       \n"
756 			"mulps %[x4], %[tmp1]                       \n"
757 			"addps %[tmp1], %[tmp2]                     \n"
758 			"mulps %[x], %[tmp2]                        \n"
759 			/* Now tmp2 contains the result of warp_sin(). */
760 			"mulps %[g], %[tmp2]                        \n"
761 			"mulps %[tmp2], %[left]                     \n"
762 			"mulps %[tmp2], %[right]                    \n"
763 			"movdqu %[left], (%[ptr_left])              \n"
764 			"movdqu %[right], (%[ptr_right])            \n"
765 			"add $16, %[ptr_left]                       \n"
766 			"add $16, %[ptr_right]                      \n"
767 			"sub $1, %[count]                           \n"
768 			"jne 1b                                     \n"
769 			: /* output */
770 			  "=r"(count),
771 			  "=r"(ptr_left),
772 			  "=r"(ptr_right),
773 			  "=x"(x0),
774 			  [x]"=&x"(x),
775 			  [x2]"=&x"(x2),
776 			  [x4]"=&x"(x4),
777 			  [left]"=&x"(left),
778 			  [right]"=&x"(right),
779 			  [tmp1]"=&x"(tmp1),
780 			  [tmp2]"=&x"(tmp2)
781 			: /* input */
782 			  [count]"0"(count),
783 			  [ptr_left]"1"(ptr_left),
784 			  [ptr_right]"2"(ptr_right),
785 			  [x0]"3"(x0),
786 			  [A1]"x"(A1),
787 			  [A3]"x"(A3),
788 			  [A5]"x"(A5),
789 			  [A7]"x"(A7),
790 			  [base]"x"(_mm_set1_ps(scaled_desired_gain)),
791 			  [r4]"x"(_mm_set1_ps(r*r*r*r)),
792 			  [g]"x"(_mm_set1_ps(main_linear_gain))
793 			: /* clobber */
794 			  "memory", "cc");
795 		// clang-format on
796 		dk->compressor_gain = x[3];
797 	} else {
798 		/* See warp_sinf() for the details for the constants. */
799 		__m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f);
800 		__m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f);
801 		__m128 A3 = _mm_set1_ps(-0.645892798900604248046875f);
802 		__m128 A1 = _mm_set1_ps(1.5707910060882568359375f);
803 
804 		float c = compressor_gain;
805 		float r = envelope_rate;
806 		__m128 x = { c * r, c * r * r, c * r * r * r,
807 			     c * r * r * r * r };
808 		__m128 x2, x4, left, right, tmp1, tmp2;
809 
810 		// clang-format off
811 		__asm__ __volatile(
812 			"jmp 2f                                     \n"
813 			"1:                                         \n"
814 			"mulps %[r4], %[x]                          \n"
815 			"2:                                         \n"
816 			"lddqu (%[ptr_left]), %[left]               \n"
817 			"lddqu (%[ptr_right]), %[right]             \n"
818 			"minps %[one], %[x]                         \n"
819 			/* Calculate warp_sin() for four values in x. */
820 			"movaps %[x], %[x2]                         \n"
821 			"mulps %[x], %[x2]                          \n"
822 			"movaps %[x2], %[x4]                        \n"
823 			"movaps %[x2], %[tmp1]                      \n"
824 			"movaps %[x2], %[tmp2]                      \n"
825 			"mulps %[x2], %[x4]                         \n"
826 			"mulps %[A7], %[tmp1]                       \n"
827 			"mulps %[A3], %[tmp2]                       \n"
828 			"addps %[A5], %[tmp1]                       \n"
829 			"addps %[A1], %[tmp2]                       \n"
830 			"mulps %[x4], %[tmp1]                       \n"
831 			"addps %[tmp1], %[tmp2]                     \n"
832 			"mulps %[x], %[tmp2]                        \n"
833 			/* Now tmp2 contains the result of warp_sin(). */
834 			"mulps %[g], %[tmp2]                        \n"
835 			"mulps %[tmp2], %[left]                     \n"
836 			"mulps %[tmp2], %[right]                    \n"
837 			"movdqu %[left], (%[ptr_left])              \n"
838 			"movdqu %[right], (%[ptr_right])            \n"
839 			"add $16, %[ptr_left]                       \n"
840 			"add $16, %[ptr_right]                      \n"
841 			"sub $1, %[count]                           \n"
842 			"jne 1b                                     \n"
843 			: /* output */
844 			  "=r"(count),
845 			  "=r"(ptr_left),
846 			  "=r"(ptr_right),
847 			  "=x"(x),
848 			  [x2]"=&x"(x2),
849 			  [x4]"=&x"(x4),
850 			  [left]"=&x"(left),
851 			  [right]"=&x"(right),
852 			  [tmp1]"=&x"(tmp1),
853 			  [tmp2]"=&x"(tmp2)
854 			: /* input */
855 			  [count]"0"(count),
856 			  [ptr_left]"1"(ptr_left),
857 			  [ptr_right]"2"(ptr_right),
858 			  [x]"3"(x),
859 			  [A1]"x"(A1),
860 			  [A3]"x"(A3),
861 			  [A5]"x"(A5),
862 			  [A7]"x"(A7),
863 			  [one]"x"(_mm_set1_ps(1)),
864 			  [r4]"x"(_mm_set1_ps(r*r*r*r)),
865 			  [g]"x"(_mm_set1_ps(main_linear_gain))
866 			: /* clobber */
867 			  "memory", "cc");
868 		// clang-format on
869 		dk->compressor_gain = x[3];
870 	}
871 }
872 #else
dk_compress_output(struct drc_kernel * dk)873 static void dk_compress_output(struct drc_kernel *dk)
874 {
875 	const float main_linear_gain = dk->main_linear_gain;
876 	const float envelope_rate = dk->envelope_rate;
877 	const float scaled_desired_gain = dk->scaled_desired_gain;
878 	const float compressor_gain = dk->compressor_gain;
879 	const int div_start = dk->pre_delay_read_index;
880 	float *ptr_left = &dk->pre_delay_buffers[0][div_start];
881 	float *ptr_right = &dk->pre_delay_buffers[1][div_start];
882 	int count = DIVISION_FRAMES / 4;
883 
884 	int i, j;
885 
886 	/* Exponential approach to desired gain. */
887 	if (envelope_rate < 1) {
888 		/* Attack - reduce gain to desired. */
889 		float c = compressor_gain - scaled_desired_gain;
890 		float base = scaled_desired_gain;
891 		float r = 1 - envelope_rate;
892 		float x[4] = { c * r, c * r * r, c * r * r * r,
893 			       c * r * r * r * r };
894 		float r4 = r * r * r * r;
895 
896 		i = 0;
897 		while (1) {
898 			for (j = 0; j < 4; j++) {
899 				/* Warp pre-compression gain to smooth out sharp
900 				 * exponential transition points.
901 				 */
902 				float post_warp_compressor_gain =
903 					warp_sinf(x[j] + base);
904 
905 				/* Calculate total gain using main gain. */
906 				float total_gain = main_linear_gain *
907 						   post_warp_compressor_gain;
908 
909 				/* Apply final gain. */
910 				*ptr_left++ *= total_gain;
911 				*ptr_right++ *= total_gain;
912 			}
913 
914 			if (++i == count)
915 				break;
916 
917 			for (j = 0; j < 4; j++)
918 				x[j] = x[j] * r4;
919 		}
920 
921 		dk->compressor_gain = x[3] + base;
922 	} else {
923 		/* Release - exponentially increase gain to 1.0 */
924 		float c = compressor_gain;
925 		float r = envelope_rate;
926 		float x[4] = { c * r, c * r * r, c * r * r * r,
927 			       c * r * r * r * r };
928 		float r4 = r * r * r * r;
929 
930 		i = 0;
931 		while (1) {
932 			for (j = 0; j < 4; j++) {
933 				/* Warp pre-compression gain to smooth out sharp
934 				 * exponential transition points.
935 				 */
936 				float post_warp_compressor_gain =
937 					warp_sinf(x[j]);
938 
939 				/* Calculate total gain using main gain. */
940 				float total_gain = main_linear_gain *
941 						   post_warp_compressor_gain;
942 
943 				/* Apply final gain. */
944 				*ptr_left++ *= total_gain;
945 				*ptr_right++ *= total_gain;
946 			}
947 
948 			if (++i == count)
949 				break;
950 
951 			for (j = 0; j < 4; j++)
952 				x[j] = min(1.0f, x[j] * r4);
953 		}
954 
955 		dk->compressor_gain = x[3];
956 	}
957 }
958 #endif
959 
960 /* After one complete divison of samples have been received (and one divison of
961  * samples have been output), we calculate shaped power average
962  * (detector_average) from the input division, update envelope parameters from
963  * detector_average, then prepare the next output division by applying the
964  * envelope to compress the samples.
965  */
dk_process_one_division(struct drc_kernel * dk)966 static void dk_process_one_division(struct drc_kernel *dk)
967 {
968 	dk_update_detector_average(dk);
969 	dk_update_envelope(dk);
970 	dk_compress_output(dk);
971 }
972 
973 /* Copy the input data to the pre-delay buffer, and copy the output data back to
974  * the input buffer */
dk_copy_fragment(struct drc_kernel * dk,float * data_channels[],unsigned frame_index,int frames_to_process)975 static void dk_copy_fragment(struct drc_kernel *dk, float *data_channels[],
976 			     unsigned frame_index, int frames_to_process)
977 {
978 	int write_index = dk->pre_delay_write_index;
979 	int read_index = dk->pre_delay_read_index;
980 	int j;
981 
982 	for (j = 0; j < DRC_NUM_CHANNELS; ++j) {
983 		memcpy(&dk->pre_delay_buffers[j][write_index],
984 		       &data_channels[j][frame_index],
985 		       frames_to_process * sizeof(float));
986 		memcpy(&data_channels[j][frame_index],
987 		       &dk->pre_delay_buffers[j][read_index],
988 		       frames_to_process * sizeof(float));
989 	}
990 
991 	dk->pre_delay_write_index =
992 		(write_index + frames_to_process) & MAX_PRE_DELAY_FRAMES_MASK;
993 	dk->pre_delay_read_index =
994 		(read_index + frames_to_process) & MAX_PRE_DELAY_FRAMES_MASK;
995 }
996 
997 /* Delay the input sample only and don't do other processing. This is used when
998  * the kernel is disabled. We want to do this to match the processing delay in
999  * kernels of other bands.
1000  */
dk_process_delay_only(struct drc_kernel * dk,float * data_channels[],unsigned count)1001 static void dk_process_delay_only(struct drc_kernel *dk, float *data_channels[],
1002 				  unsigned count)
1003 {
1004 	int read_index = dk->pre_delay_read_index;
1005 	int write_index = dk->pre_delay_write_index;
1006 	int i = 0;
1007 
1008 	while (i < count) {
1009 		int j;
1010 		int small = min(read_index, write_index);
1011 		int large = max(read_index, write_index);
1012 		/* chunk is the minimum of readable samples in contiguous
1013 		 * buffer, writable samples in contiguous buffer, and the
1014 		 * available input samples. */
1015 		int chunk = min(large - small, MAX_PRE_DELAY_FRAMES - large);
1016 		chunk = min(chunk, count - i);
1017 		for (j = 0; j < DRC_NUM_CHANNELS; ++j) {
1018 			memcpy(&dk->pre_delay_buffers[j][write_index],
1019 			       &data_channels[j][i], chunk * sizeof(float));
1020 			memcpy(&data_channels[j][i],
1021 			       &dk->pre_delay_buffers[j][read_index],
1022 			       chunk * sizeof(float));
1023 		}
1024 		read_index = (read_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK;
1025 		write_index = (write_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK;
1026 		i += chunk;
1027 	}
1028 
1029 	dk->pre_delay_read_index = read_index;
1030 	dk->pre_delay_write_index = write_index;
1031 }
1032 
dk_process(struct drc_kernel * dk,float * data_channels[],unsigned count)1033 void dk_process(struct drc_kernel *dk, float *data_channels[], unsigned count)
1034 {
1035 	int i = 0;
1036 	int fragment;
1037 
1038 	if (!dk->enabled) {
1039 		dk_process_delay_only(dk, data_channels, count);
1040 		return;
1041 	}
1042 
1043 	if (!dk->processed) {
1044 		dk_update_envelope(dk);
1045 		dk_compress_output(dk);
1046 		dk->processed = 1;
1047 	}
1048 
1049 	int offset = dk->pre_delay_write_index & DIVISION_FRAMES_MASK;
1050 	while (i < count) {
1051 		fragment = min(DIVISION_FRAMES - offset, count - i);
1052 		dk_copy_fragment(dk, data_channels, i, fragment);
1053 		i += fragment;
1054 		offset = (offset + fragment) & DIVISION_FRAMES_MASK;
1055 
1056 		/* Process the input division (32 frames). */
1057 		if (offset == 0)
1058 			dk_process_one_division(dk);
1059 	}
1060 }
1061