/*
* Copyright 2023 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "asrc_resampler.h"
#include
#include
#include
#include
#include
#include
#include "asrc_tables.h"
#include "common/repeating_timer.h"
#include "hal/link_clocker.h"
#include "hci/hci_layer.h"
#include "hci/hci_packets.h"
#include "main/shim/entry.h"
#include "stack/include/main_thread.h"
namespace bluetooth::audio::asrc {
// Recovers the relation between the stream time and the local time from
// clock events, and exposes it to the resampling path.
//
// `OnEvent()` keeps, over intervals of one second, the minimum deviation
// between the event timestamp and the time derived from the Bluetooth clock,
// low-pass filters its variation to estimate the drift, and publishes a
// reference point. `Convert()` maps a stream time onto the local time using
// that reference point.
//
// `reference_timing_` and `output_stats_` are only accessed with `mutex_`
// held; `state_` is only touched by the constructor and `OnEvent()`.
class SourceAudioHalAsrc::ClockRecovery
    : public bluetooth::hal::ReadClockHandler {
  std::mutex mutex_;
  bluetooth::common::RepeatingTimer read_clock_timer_;

  enum class StateId { RESET, WARMUP, RUNNING };

  // Working state of the recovery loop.
  struct {
    StateId id;
    uint32_t t0;             // Timestamp of the first event
    uint32_t local_time;     // Time accumulated from the Bluetooth clock
    uint32_t stream_time;    // Recovered stream time
    uint32_t last_bt_clock;  // Bluetooth clock at the last decimation
    uint32_t decim_t0;       // Start of the current decimation interval
    int decim_dt[2];         // Previous [0] and current [1] minimum deviation
    double butter_drift;     // Output of the Butterworth filter
    double butter_s[2];      // Internal state of the Butterworth filter
  } state_;

  // Last published reference point, relative to `state_.t0`.
  struct {
    uint32_t local_time;
    uint32_t stream_time;
    double drift;
  } reference_timing_;

  // Statistics reported by the resampling path, only used for logging.
  struct {
    double sample_rate;
    int drift_us;
  } output_stats_;

  // Timer callback: enqueues an HCI Read Clock command for the local clock.
  // The command complete callback bound on the main thread does nothing.
  static void RequestLocalClockRead(void*) {
    bluetooth::shim::GetHciLayer()->EnqueueCommand(
        bluetooth::hci::ReadClockBuilder::Create(
            0, bluetooth::hci::WhichClock::LOCAL),
        get_main_thread()->BindOnce(
            [](bluetooth::hci::CommandCompleteView) {}));
  }

  // Handles one clock event: `timestamp_us` is the event timestamp in
  // microseconds and `bt_clock` the associated Bluetooth clock value.
  // All the time arithmetic is intentionally done modulo 2^32.
  __attribute__((no_sanitize("integer"))) void OnEvent(
      uint32_t timestamp_us, uint32_t bt_clock) override {
    auto& state = state_;

    // Setup the start point of the streaming
    if (state.id == StateId::RESET) {
      state.t0 = timestamp_us;
      state.local_time = state.stream_time = state.t0;
      state.last_bt_clock = bt_clock;
      state.decim_t0 = state.t0;
      state.decim_dt[1] = INT_MAX;
      state.id = StateId::WARMUP;
    }

    // Update timing information, and compute the minimum deviation
    // in the interval of the decimation (1 second).
    // Convert the local clock interval from the last subsampling event
    // into microseconds (625 / 32 microseconds per clock unit).
    uint32_t elapsed_us = ((bt_clock - state.last_bt_clock) * 625) >> 5;
    uint32_t local_time = state.local_time + elapsed_us;
    int dt_current = int(timestamp_us - local_time);
    state.decim_dt[1] = std::min(state.decim_dt[1], dt_current);

    if (local_time - state.decim_t0 < 1000 * 1000) return;

    state.decim_t0 += 1000 * 1000;
    state.last_bt_clock = bt_clock;
    state.local_time += elapsed_us;
    state.stream_time += elapsed_us;

    // The first decimation interval is used to adjust the start point.
    // The deviation between local time and stream time in this interval can be
    // ignored.
    if (state.id == StateId::WARMUP) {
      state.decim_t0 += state.decim_dt[1];
      state.local_time += state.decim_dt[1];
      state.stream_time += state.decim_dt[1];
      state.decim_dt[0] = 0;
      state.decim_dt[1] = INT_MAX;
      state.id = StateId::RUNNING;
      return;
    }

    // Deduce the derivative of the deviation, from the difference between
    // the two consecutive decimated deviations.
    int drift = state.decim_dt[1] - state.decim_dt[0];
    state.decim_dt[0] = state.decim_dt[1];
    state.decim_dt[1] = INT_MAX;

    // Let's filter the derivative, with a low-pass Butterworth filter.
    // The cut-off frequency is set to 1/60th seconds.
    const double a1 = -1.9259839697e+00, a2 = 9.2862708612e-01;
    const double b0 = 6.6077909823e-04, b1 = 1.3215581965e-03, b2 = b0;
    state.butter_drift = drift * b0 + state.butter_s[0];
    state.butter_s[0] =
        state.butter_s[1] + drift * b1 - state.butter_drift * a1;
    state.butter_s[1] = drift * b2 - state.butter_drift * a2;

    // The stream time is adjusted with the filtered drift, and the error is
    // caught up with a gain of 2^-8 (~1/250us). The error is deduced from
    // the difference between the instant stream time, and the local time
    // corrected by the decimated deviation.
    // The scaling is done in double precision: `butter_drift` is a double and
    // must not be narrowed to float before the conversion to integer.
    int err = state.stream_time - (state.local_time + state.decim_dt[0]);
    state.stream_time +=
        (int(ldexp(state.butter_drift, 8)) - err + (1 << 7)) >> 8;

    // Update recovered timing information, and sample the output statistics.
    decltype(output_stats_) output_stats;
    {
      const std::lock_guard lock(mutex_);
      auto& ref = reference_timing_;
      ref.local_time = state.local_time - state.t0;
      ref.stream_time = state.stream_time - state.t0;
      ref.drift = state.butter_drift * 1e-6;
      output_stats = output_stats_;
    }

    // The deviation is a difference of unsigned values, it is converted to a
    // signed value so that a negative deviation is not printed as a value
    // close to 2^32.
    log::info(
        "Deviation: {:6} us ({:3.0f} ppm) | Output Fs: {:5.2f} Hz drift: {:2} "
        "us",
        int(state.stream_time - state.local_time), state.butter_drift,
        output_stats.sample_rate, output_stats.drift_us);
  }

 public:
  // Starts a 100 ms periodic request of the local clock, and registers this
  // object to the link clocker. The timer runs on `thread` when the
  // `run_clock_recovery_in_worker_thread` flag is set, on the main thread
  // otherwise. The output statistics start at 0, so that they can be logged
  // before the first call to `UpdateOutputStats()`.
  explicit ClockRecovery(bluetooth::common::MessageLoopThread* thread)
      : state_{.id = StateId::RESET},
        reference_timing_{0, 0, 0},
        output_stats_{0, 0} {
    if (com::android::bluetooth::flags::run_clock_recovery_in_worker_thread()) {
      read_clock_timer_.SchedulePeriodic(
          thread->GetWeakPtr(), FROM_HERE,
          base::BindRepeating(&ClockRecovery::RequestLocalClockRead, nullptr),
          std::chrono::milliseconds(100));
    } else {
      read_clock_timer_.SchedulePeriodic(
          get_main_thread()->GetWeakPtr(), FROM_HERE,
          base::BindRepeating(&ClockRecovery::RequestLocalClockRead, nullptr),
          std::chrono::milliseconds(100));
    }

    hal::LinkClocker::Register(this);
  }

  // Unregisters from the link clocker, then stops the periodic request.
  ~ClockRecovery() override {
    hal::LinkClocker::Unregister();
    read_clock_timer_.Cancel();
  }

  // Converts `stream_time` to the local time, both in microseconds
  // modulo 2^32.
  __attribute__((no_sanitize("integer"))) uint32_t Convert(
      uint32_t stream_time) {
    // Compute the difference between the stream time and the sampled time
    // of the clock recovery, and adjust according to the drift.
    // Then return the sampled local time, modified by this converted gap.
    const std::lock_guard lock(mutex_);
    const auto& ref = reference_timing_;

    int stream_dt = int(stream_time - ref.stream_time);
    int local_dt_us = int(round(stream_dt * (1 + ref.drift)));
    return ref.local_time + local_dt_us;
  }

  // Stores the output sample rate and drift reported by the resampling path.
  void UpdateOutputStats(double sample_rate, int drift_us) {
    // Atomically update the output statistics,
    // this should be used for logging.
    const std::lock_guard lock(mutex_);
    output_stats_ = {sample_rate, drift_us};
  }
};
// Single channel resampler. The input position is tracked in Q26 fixed point
// (`in_pos_`), and each output sample is produced by `Filter()` from a window
// of the most recent input samples.
class SourceAudioHalAsrc::Resampler {
static const int KERNEL_Q = asrc::ResamplerTables::KERNEL_Q;
static const int KERNEL_A = asrc::ResamplerTables::KERNEL_A;
// Coefficient tables, as rows of `2 * KERNEL_A` entries; the row is selected
// by the phase extracted from `in_pos_`. `h_` holds the transfer coefficients
// and `d_` the values weighting the interpolation between rows.
const int32_t (*h_)[2 * KERNEL_A];
const int16_t (*d_)[2 * KERNEL_A];
// Input window. Each input sample is stored twice: at `out_pos_` in
// `win_[1]`, and half a window further (modulo WSIZE) in `win_[0]`.
// The loops select the copy to read from according to the integer part of
// `in_pos_`.
static const unsigned WSIZE = 64;
int32_t win_[2][WSIZE];
// `out_pos_` is the index, modulo WSIZE, where the next input sample is
// stored in the window. `in_pos_` is the read position in Q26 format,
// it wraps modulo 2^32, i.e. modulo WSIZE samples.
unsigned out_pos_, in_pos_;
// Bounds applied to the filtered samples, set from the PCM bit depth.
const int32_t pcm_min_, pcm_max_;
// Apply the transfer coefficients `h`, corrected by linear interpolation,
// given fraction position `mu` weighted by `d` values.
inline int32_t Filter(const int32_t* in, const int32_t* h, int16_t mu,
const int16_t* d);
// Upsampling loop, the ratio is less than 1.0 in Q26 format,
// more output samples are produced compared to input.
// One output sample is produced at each iteration; an input sample is
// consumed only when the read position moved at least one sample past
// `out_pos_`. The position is split in the window index (from bit 26),
// the 9 bits phase `phy` selecting the table row, and the 15 bits
// fraction `mu`.
// NOTE(review): the counters are `int` here while `Downsample()` uses
// `size_t` - confirm that the lengths always fit in an `int`.
template
__attribute__((no_sanitize("integer"))) void Upsample(
unsigned ratio, const T* in, int in_stride, size_t in_len,
size_t* in_count, T* out, int out_stride, size_t out_len,
size_t* out_count) {
int nin = in_len, nout = out_len;
while (nin > 0 && nout > 0) {
unsigned idx = (in_pos_ >> 26);
unsigned phy = (in_pos_ >> 17) & 0x1ff;
int16_t mu = (in_pos_ >> 2) & 0x7fff;
unsigned wbuf = idx < WSIZE / 2 || idx >= WSIZE + WSIZE / 2;
auto w = win_[wbuf] + ((idx + wbuf * WSIZE / 2) % WSIZE) - WSIZE / 2;
*out = Filter(w, h_[phy], mu, d_[phy]);
out += out_stride;
nout--;
in_pos_ += ratio;
// The comparison is done modulo 2^32, as `in_pos_` wraps.
if (in_pos_ - (out_pos_ << 26) >= (1u << 26)) {
win_[0][(out_pos_ + WSIZE / 2) % WSIZE] = win_[1][(out_pos_)] = *in;
in += in_stride;
nin--;
out_pos_ = (out_pos_ + 1) % WSIZE;
}
}
*in_count = in_len - nin;
*out_count = out_len - nout;
}
// Downsample loop, the ratio is greater than 1.0 in Q26 format,
// less output samples are produced compared to input.
// One input sample is consumed at each iteration; an output sample is
// produced only while the read position is less than one sample past
// `out_pos_`.
template
__attribute__((no_sanitize("integer"))) void Downsample(
unsigned ratio, const T* in, int in_stride, size_t in_len,
size_t* in_count, T* out, int out_stride, size_t out_len,
size_t* out_count) {
size_t nin = in_len, nout = out_len;
while (nin > 0 && nout > 0) {
if (in_pos_ - (out_pos_ << 26) < (1u << 26)) {
unsigned idx = (in_pos_ >> 26);
unsigned phy = (in_pos_ >> 17) & 0x1ff;
int16_t mu = (in_pos_ >> 2) & 0x7fff;
unsigned wbuf = idx < WSIZE / 2 || idx >= WSIZE + WSIZE / 2;
auto w = win_[wbuf] + ((idx + wbuf * WSIZE / 2) % WSIZE) - WSIZE / 2;
*out = Filter(w, h_[phy], mu, d_[phy]);
out += out_stride;
nout--;
in_pos_ += ratio;
}
win_[0][(out_pos_ + WSIZE / 2) % WSIZE] = win_[1][(out_pos_)] = *in;
in += in_stride;
nin--;
out_pos_ = (out_pos_ + 1) % WSIZE;
}
*in_count = in_len - nin;
*out_count = out_len - nout;
}
public:
// Setup the resampler for PCM samples of `bit_depth` bits: the window and the
// positions are cleared, and the output is bounded to the signed range
// [-2^(bit_depth - 1), 2^(bit_depth - 1) - 1].
Resampler(int bit_depth)
: h_(asrc::resampler_tables.h),
d_(asrc::resampler_tables.d),
win_{{0}, {0}},
out_pos_(0),
in_pos_(0),
pcm_min_(-(int32_t(1) << (bit_depth - 1))),
pcm_max_((int32_t(1) << (bit_depth - 1)) - 1) {}
// Resample from `in` buffer to `out` buffer, until the end of any of
// the two buffers. `in_count` returns the number of consumed samples,
// and `out_count` the number produced. `in_sub_q26` returns the phase in
// the input stream, in Q26 format. The lengths and counts are expressed
// in samples, the strides are applied to the `in` and `out` pointers
// after each sample. `Upsample()` is used when `ratio_q26` is less
// than 1.0 in Q26 format, `Downsample()` otherwise.
template
void Resample(unsigned ratio_q26, const T* in, int in_stride, size_t in_len,
size_t* in_count, T* out, int out_stride, size_t out_len,
size_t* out_count, unsigned* in_sub_q26) {
auto fn = ratio_q26 < (1u << 26) ? &Resampler::Upsample
: &Resampler::Downsample;
(this->*fn)(ratio_q26, in, in_stride, in_len, in_count, out, out_stride,
out_len, out_count);
*in_sub_q26 = in_pos_ & ((1u << 26) - 1);
}
};
//
// ARM AArch 64 Neon Resampler Filtering
//
#if __ARM_NEON && __ARM_ARCH_ISA_A64
#include
// Widening multiply of the 4 low lanes of `a` and `b`,
// the counterpart of the `vmull_high_s16` intrinsic.
static inline int32x4_t vmull_low_s16(int16x8_t a, int16x8_t b) {
  const int16x4_t a_low = vget_low_s16(a);
  const int16x4_t b_low = vget_low_s16(b);
  return vmull_s16(a_low, b_low);
}
// Widening multiply of the 2 low lanes of `a` and `b`,
// the counterpart of the `vmull_high_s32` intrinsic.
static inline int64x2_t vmull_low_s32(int32x4_t a, int32x4_t b) {
  const int32x2_t a_low = vget_low_s32(a);
  const int32x2_t b_low = vget_low_s32(b);
  return vmull_s32(a_low, b_low);
}
// Widening multiply of the 2 low lanes of `a` and `b`, accumulated to `r`,
// the counterpart of the `vmlal_high_s32` intrinsic.
static inline int64x2_t vmlal_low_s32(int64x2_t r, int32x4_t a, int32x4_t b) {
  const int32x2_t a_low = vget_low_s32(a);
  const int32x2_t b_low = vget_low_s32(b);
  return vmlal_s32(r, a_low, b_low);
}
inline int32_t SourceAudioHalAsrc::Resampler::Filter(const int32_t* x,
const int32_t* h,
int16_t _mu,
const int16_t* d) {
int64x2_t sx;
int16x8_t mu = vdupq_n_s16(_mu);
int16x8_t d0 = vld1q_s16(d + 0);
int32x4_t h0 = vld1q_s32(h + 0), h4 = vld1q_s32(h + 4);
int32x4_t x0 = vld1q_s32(x + 0), x4 = vld1q_s32(x + 4);
h0 = vaddq_s32(h0, vrshrq_n_s32(vmull_low_s16(d0, mu), 7));
h4 = vaddq_s32(h4, vrshrq_n_s32(vmull_high_s16(d0, mu), 7));
sx = vmull_low_s32(x0, h0);
sx = vmlal_high_s32(sx, x0, h0);
sx = vmlal_low_s32(sx, x4, h4);
sx = vmlal_high_s32(sx, x4, h4);
for (int i = 8; i < 32; i += 8) {
int16x8_t d8 = vld1q_s16(d + i);
int32x4_t h8 = vld1q_s32(h + i), h12 = vld1q_s32(h + i + 4);
int32x4_t x8 = vld1q_s32(x + i), x12 = vld1q_s32(x + i + 4);
h8 = vaddq_s32(h8, vrshrq_n_s32(vmull_low_s16(d8, mu), 7));
h12 = vaddq_s32(h12, vrshrq_n_s32(vmull_high_s16(d8, mu), 7));
sx = vmlal_low_s32(sx, x8, h8);
sx = vmlal_high_s32(sx, x8, h8);
sx = vmlal_low_s32(sx, x12, h12);
sx = vmlal_high_s32(sx, x12, h12);
}
int64_t s = (vaddvq_s64(sx) + (1 << 30)) >> 31;
return std::clamp(s, int64_t(pcm_min_), int64_t(pcm_max_));
}
//
// Generic Resampler Filtering
//
#else
// Portable implementation of the filtering over `2 * KERNEL_A - 1` taps.
// For each tap, the coefficient `h` is corrected by the value `d` weighted by
// the fraction `mu` (product rounded and shifted right by 7 bits), then
// multiplied by the sample `in` and accumulated on 64 bits. The sum is
// rounded, shifted right by 31 bits, and bounded to the PCM range.
inline int32_t SourceAudioHalAsrc::Resampler::Filter(const int32_t* in,
                                                     const int32_t* h,
                                                     int16_t mu,
                                                     const int16_t* d) {
  const int num_taps = 2 * KERNEL_A - 1;

  int64_t acc = 0;
  for (int tap = 0; tap < num_taps; ++tap) {
    const auto adjust = (mu * d[tap] + (1 << 6)) >> 7;
    const auto coeff = h[tap] + adjust;
    acc += int64_t(in[tap]) * coeff;
  }

  const int64_t rounded = (acc + (1 << 30)) >> 31;
  return std::clamp(rounded, int64_t(pcm_min_), int64_t(pcm_max_));
}
#endif
// Setup the resampling of a PCM stream of `channels` interleaved channels,
// sampled at `sample_rate` Hz on `bit_depth` bits, and processed by intervals
// of `interval_us` microseconds. `thread` is forwarded to the clock recovery
// module. A burst of `num_burst_buffers` buffers is generated once
// `burst_delay_ms` milliseconds of stream have been processed.
//
// When a parameter is out of bounds, an error is logged and the object is
// left without clock recovery nor resamplers, with an expected input buffer
// size of 0.
SourceAudioHalAsrc::SourceAudioHalAsrc(
    bluetooth::common::MessageLoopThread* thread, int channels, int sample_rate,
    int bit_depth, int interval_us, int num_burst_buffers, int burst_delay_ms)
    : sample_rate_(sample_rate),
      bit_depth_(bit_depth),
      interval_us_(interval_us),
      stream_us_(0),
      drift_us_(0),
      out_counter_(0),
      resampler_pos_{0, 0} {
  buffers_size_ = 0;

  // Check parameters
  auto check_bounds = [](int v, int min, int max) {
    return v >= min && v <= max;
  };

  if (!check_bounds(channels, 1, 8) ||
      !check_bounds(sample_rate, 1 * 1000, 100 * 1000) ||
      !check_bounds(bit_depth, 8, 32) ||
      !check_bounds(interval_us, 1 * 1000, 100 * 1000) ||
      !check_bounds(num_burst_buffers, 0, 10) ||
      !check_bounds(burst_delay_ms, 0, 1000)) {
    log::error(
        "Bad parameters: channels: {} sample_rate: {} bit_depth: {} "
        "interval_us: {} num_burst_buffers: {} burst_delay_ms: {}",
        channels, sample_rate, bit_depth, interval_us, num_burst_buffers,
        burst_delay_ms);
    return;
  }

  // Compute filter constants
  const double drift_release_sec = 3;
  drift_z0_ = 1. - exp(-3. / (1e6 / interval_us_) / drift_release_sec);

  // Setup modules, one resampler is allocated for each channel.
  clock_recovery_ = std::make_unique<ClockRecovery>(thread);
  resamplers_ = std::make_unique<std::vector<Resampler>>(channels, bit_depth_);

  // Deduce from the PCM stream characteristics, the size of the pool buffers
  // It needs 3 buffers (one almost full, an entire one, and a last which can be
  // started).
  // The number of samples is computed on 64 bits: the intermediate product
  // exceeds the range of `int` within the accepted bounds (for example with
  // 8 channels, 10 ms intervals and 48 KHz), while the result always fits.
  const int64_t num_interval_samples =
      int64_t(channels) * interval_us_ * sample_rate_ / (1000 * 1000);
  const size_t sample_size =
      bit_depth_ <= 16 ? sizeof(int16_t) : sizeof(int32_t);
  buffers_size_ = size_t(num_interval_samples) * sample_size;

  for (auto& b : buffers_.pool) b.resize(buffers_size_);
  buffers_.index = 0;
  buffers_.offset = 0;

  // Setup the burst buffers to silence
  auto silence_buffer = &buffers_.pool[0];
  std::fill(silence_buffer->begin(), silence_buffer->end(), 0);

  burst_buffers_.resize(num_burst_buffers);
  for (auto& b : burst_buffers_) b = silence_buffer;

  burst_delay_us_ = burst_delay_ms * 1000;
}
// Nothing to release explicitly, the members clean up after themselves.
SourceAudioHalAsrc::~SourceAudioHalAsrc() = default;
// Resample the interleaved PCM samples of type `T` stored in `in`, with the
// given `ratio`, into the buffers of the pool. A pointer to each output
// buffer fulfilled is appended to `out`, and `output_us` returns the position
// of the resampler in the output stream, in microseconds modulo 2^32.
// NOTE(review): `resamplers_` is dereferenced without check, while it is left
// unset when the constructor rejects its parameters - confirm that this
// is never called in that case.
template
__attribute__((no_sanitize("integer"))) void SourceAudioHalAsrc::Resample(
double ratio, const std::vector& in,
std::vector*>* out, uint32_t* output_us) {
auto& resamplers = *resamplers_;
auto& buffers = buffers_;
auto channels = resamplers.size();
// Convert the resampling ratio in fixed Q26,
// then loop until the input buffer is consumed.
auto in_size = in.size() / sizeof(T);
auto in_length = in_size / channels;
unsigned ratio_q26 = round(ldexp(ratio, 26));
// NOTE(review): `sub_q26` is only set by the resamplers, it is read
// uninitialized after the loop when `in` holds no complete frame.
unsigned sub_q26;
while (in_length > 0) {
auto in_data = (const T*)in.data() + (in_size - in_length * channels);
// Load from the context the current output buffer, the offset
// and deduce the remaining size. Let's resample the interleaved
// PCM stream, a separate resampler is used for each channel.
auto buffer = &buffers.pool[buffers.index];
auto out_data = (T*)buffer->data() + buffers.offset;
auto out_size = buffer->size() / sizeof(T);
auto out_length = (out_size - buffers.offset) / channels;
// The counts of the last channel are kept; presumably all the resamplers
// return the same counts, as they are fed with the same ratio and
// lengths - TODO confirm.
size_t in_count, out_count;
for (auto& r : resamplers)
r.Resample(ratio_q26, in_data++, channels, in_length, &in_count,
out_data++, channels, out_length, &out_count, &sub_q26);
in_length -= in_count;
buffers.offset += out_count * channels;
// Update the resampler position, expressed in seconds
// and a number of samples in a second. The `sub_q26` variable
// returned by the resampler, adds the sub-sample information.
resampler_pos_.samples += out_count;
for (; resampler_pos_.samples >= sample_rate_;
resampler_pos_.samples -= sample_rate_)
resampler_pos_.seconds++;
// An output buffer has been fulfilled,
// select a new buffer in the pool, used as a ring.
if (out_count >= out_length) {
buffers.index = (buffers.index + 1) % buffers.pool.size();
buffers.offset = 0;
out->push_back(buffer);
}
}
// Let's convert the resampler position, in a micro-seconds timestamp.
// The samples count within a second, and sub-sample position, are
// converted, then add the number of seconds modulo 2^32.
// The sub-sample position is a phase in the input stream, it is divided
// by the ratio before being subtracted from the output samples count.
int64_t output_samples_q26 = (int64_t(resampler_pos_.samples) << 26) -
((int64_t(sub_q26) << 26) / ratio_q26);
*output_us = resampler_pos_.seconds * (1000 * 1000) +
uint32_t((output_samples_q26 * 1000 * 1000) /
(int64_t(sample_rate_) << 26));
}
// Process one interval of the PCM stream: `in` shall hold exactly
// `buffers_size_` bytes. The stream position is advanced by one interval and
// converted to a local time, the input is resampled with a ratio following
// the filtered drift, and the drift is updated from the output position.
// Returns pointers to the output buffers to send, which can be none or
// several; the list is empty when the size of `in` is not the expected one.
__attribute__((no_sanitize("integer"))) std::vector*>
SourceAudioHalAsrc::Run(const std::vector& in) {
std::vector*> out;
// NOTE(review): when the constructor rejects its parameters, `buffers_size_`
// is 0 and `clock_recovery_` is left unset; an empty `in` then passes this
// check and `clock_recovery_` is dereferenced below - confirm that the
// callers never run an object built with bad parameters.
if (in.size() != buffers_size_) {
log::error("Inconsistent input buffer size: {} ({} expected)", in.size(),
buffers_size_);
return out;
}
// The burst delay has expired, let's generate the burst.
// The burst buffers are cleared, so the burst is generated only once.
if (burst_buffers_.size() && stream_us_ >= burst_delay_us_) {
for (size_t i = 0; i < burst_buffers_.size(); i++)
out.push_back(burst_buffers_[(out_counter_ + i) % burst_buffers_.size()]);
burst_buffers_.clear();
}
// Convert the stream position to a local time,
// and catch up the drift within the next second.
stream_us_ += interval_us_;
uint32_t local_us = clock_recovery_->Convert(stream_us_);
double ratio = 1e6 / (1e6 - drift_us_);
// Let's run the resampler,
// and update the drift according the output position returned.
// The drift is smoothed with the coefficient `drift_z0_`.
uint32_t output_us;
if (bit_depth_ <= 16)
Resample(ratio, in, &out, &output_us);
else
Resample(ratio, in, &out, &output_us);
drift_us_ += drift_z0_ * (int(output_us - local_us) - drift_us_);
// Delay the stream, in order to generate a burst when
// the associated delay has expired.
// NOTE(review): the value returned by `std::exchange` is discarded, the
// output buffer is replaced by the burst buffer but is not stored in
// `burst_buffers_` - confirm that a swap was not intended.
if (burst_buffers_.size()) {
for (size_t i = 0; i < out.size(); i++)
std::exchange*>(
out[i], burst_buffers_[(out_counter_ + i) % burst_buffers_.size()]);
}
// Return the output statistics to the clock recovery module
out_counter_ += out.size();
clock_recovery_->UpdateOutputStats(ratio * sample_rate_,
int(output_us - local_us));
// Debug trace of the output position, disabled.
if (0)
log::info("[{:6}.{:06}] Fs: {:.2f} Hz drift: {} us",
output_us / (1000 * 1000), output_us % (1000 * 1000),
ratio * sample_rate_, int(output_us - local_us));
return out;
}
} // namespace bluetooth::audio::asrc