[audio, microphone] Add MicrophoneSource helper class (#8641)

Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
This commit is contained in:
Kevin Ahrendt 2025-04-28 19:05:07 -05:00 committed by GitHub
parent 43580739ac
commit 844569e96b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 365 additions and 39 deletions

View File

@ -278,7 +278,7 @@ esphome/components/mdns/* @esphome/core
esphome/components/media_player/* @jesserockz
esphome/components/micro_wake_word/* @jesserockz @kahrendt
esphome/components/micronova/* @jorre05
esphome/components/microphone/* @jesserockz
esphome/components/microphone/* @jesserockz @kahrendt
esphome/components/mics_4514/* @jesserockz
esphome/components/midea/* @dudanov
esphome/components/midea_ir/* @dudanov

View File

@ -48,6 +48,12 @@ def set_stream_limits(
min_sample_rate: int = _UNDEF,
max_sample_rate: int = _UNDEF,
):
"""Sets the limits for the audio stream that audio component can handle
When the component sinks audio (e.g., a speaker), these indicate the limits to the audio it can receive.
When the component sources audio (e.g., a microphone), these indicate the limits to the audio it can send.
"""
def set_limits_in_config(config):
if min_bits_per_sample is not _UNDEF:
config[CONF_MIN_BITS_PER_SAMPLE] = min_bits_per_sample
@ -69,43 +75,87 @@ def final_validate_audio_schema(
name: str,
*,
audio_device: str,
bits_per_sample: int,
channels: int,
sample_rate: int,
bits_per_sample: int = _UNDEF,
channels: int = _UNDEF,
sample_rate: int = _UNDEF,
enabled_channels: list[int] = _UNDEF,
audio_device_issue: bool = False,
):
"""Validates audio compatibility when passed between different components.
The component derived from ``AUDIO_COMPONENT_SCHEMA`` should call ``set_stream_limits`` in a validator to specify its compatible settings
- If audio_device_issue is True, then the error message indicates the user should adjust the AUDIO_COMPONENT_SCHEMA derived component's configuration to match the values passed to this function
- If audio_device_issue is False, then the error message indicates the user should adjust the configuration of the component calling this function, as it falls out of the valid stream limits
Args:
name (str): Friendly name of the component calling this function with an audio component to validate
audio_device (str): The configuration parameter name that contains the ID of an AUDIO_COMPONENT_SCHEMA derived component to validate against
bits_per_sample (int, optional): The desired bits per sample
channels (int, optional): The desired number of channels
sample_rate (int, optional): The desired sample rate
enabled_channels (list[int], optional): The desired enabled channels
audio_device_issue (bool, optional): Format the error message to indicate the problem is in the configuration for the ``audio_device`` component. Defaults to False.
"""
def validate_audio_compatiblity(audio_config):
audio_schema = {}
try:
cv.int_range(
min=audio_config.get(CONF_MIN_BITS_PER_SAMPLE),
max=audio_config.get(CONF_MAX_BITS_PER_SAMPLE),
)(bits_per_sample)
except cv.Invalid as exc:
raise cv.Invalid(
f"Invalid configuration for the {name} component. The {CONF_BITS_PER_SAMPLE} {str(exc)}"
) from exc
if bits_per_sample is not _UNDEF:
try:
cv.int_range(
min=audio_config.get(CONF_MIN_BITS_PER_SAMPLE),
max=audio_config.get(CONF_MAX_BITS_PER_SAMPLE),
)(bits_per_sample)
except cv.Invalid as exc:
if audio_device_issue:
error_string = f"Invalid configuration for the specified {audio_device}. The {name} component requires {bits_per_sample} bits per sample."
else:
error_string = f"Invalid configuration for the {name} component. The {CONF_BITS_PER_SAMPLE} {str(exc)}"
raise cv.Invalid(error_string) from exc
try:
cv.int_range(
min=audio_config.get(CONF_MIN_CHANNELS),
max=audio_config.get(CONF_MAX_CHANNELS),
)(channels)
except cv.Invalid as exc:
raise cv.Invalid(
f"Invalid configuration for the {name} component. The {CONF_NUM_CHANNELS} {str(exc)}"
) from exc
if channels is not _UNDEF:
try:
cv.int_range(
min=audio_config.get(CONF_MIN_CHANNELS),
max=audio_config.get(CONF_MAX_CHANNELS),
)(channels)
except cv.Invalid as exc:
if audio_device_issue:
error_string = f"Invalid configuration for the specified {audio_device}. The {name} component requires {channels} channels."
else:
error_string = f"Invalid configuration for the {name} component. The {CONF_NUM_CHANNELS} {str(exc)}"
raise cv.Invalid(error_string) from exc
try:
cv.int_range(
min=audio_config.get(CONF_MIN_SAMPLE_RATE),
max=audio_config.get(CONF_MAX_SAMPLE_RATE),
)(sample_rate)
return cv.Schema(audio_schema, extra=cv.ALLOW_EXTRA)(audio_config)
except cv.Invalid as exc:
raise cv.Invalid(
f"Invalid configuration for the {name} component. The {CONF_SAMPLE_RATE} {str(exc)}"
) from exc
if sample_rate is not _UNDEF:
try:
cv.int_range(
min=audio_config.get(CONF_MIN_SAMPLE_RATE),
max=audio_config.get(CONF_MAX_SAMPLE_RATE),
)(sample_rate)
except cv.Invalid as exc:
if audio_device_issue:
error_string = f"Invalid configuration for the specified {audio_device}. The {name} component requires a {sample_rate} sample rate."
else:
error_string = f"Invalid configuration for the {name} component. The {CONF_SAMPLE_RATE} {str(exc)}"
raise cv.Invalid(error_string) from exc
if enabled_channels is not _UNDEF:
for channel in enabled_channels:
try:
# Channels are 0-indexed
cv.int_range(
min=0,
max=audio_config.get(CONF_MAX_CHANNELS) - 1,
)(channel)
except cv.Invalid as exc:
if audio_device_issue:
error_string = f"Invalid configuration for the specified {audio_device}. The {name} component requires channel {channel}."
else:
error_string = f"Invalid configuration for the {name} component. Enabled channel {channel} {str(exc)}"
raise cv.Invalid(error_string) from exc
return cv.Schema(audio_schema, extra=cv.ALLOW_EXTRA)(audio_config)
return cv.Schema(
{

View File

@ -4,6 +4,8 @@
#include "esphome/core/hal.h"
#include <cstring>
namespace esphome {
namespace audio {

View File

@ -6,6 +6,7 @@
#include "audio_transfer_buffer.h"
#include "esphome/core/defines.h"
#include "esphome/core/helpers.h"
#include "esphome/core/ring_buffer.h"
#ifdef USE_SPEAKER

View File

@ -1,12 +1,21 @@
from esphome import automation
from esphome.automation import maybe_simple_id
import esphome.codegen as cg
from esphome.components import audio
import esphome.config_validation as cv
from esphome.const import CONF_ID, CONF_TRIGGER_ID
from esphome.const import (
CONF_BITS_PER_SAMPLE,
CONF_CHANNELS,
CONF_GAIN_FACTOR,
CONF_ID,
CONF_MICROPHONE,
CONF_TRIGGER_ID,
)
from esphome.core import CORE
from esphome.coroutine import coroutine_with_priority
CODEOWNERS = ["@jesserockz"]
AUTO_LOAD = ["audio"]
CODEOWNERS = ["@jesserockz", "@kahrendt"]
IS_PLATFORM_COMPONENT = True
@ -15,6 +24,7 @@ CONF_ON_DATA = "on_data"
microphone_ns = cg.esphome_ns.namespace("microphone")
Microphone = microphone_ns.class_("Microphone")
MicrophoneSource = microphone_ns.class_("MicrophoneSource")
CaptureAction = microphone_ns.class_(
"CaptureAction", automation.Action, cg.Parented.template(Microphone)
@ -37,6 +47,7 @@ IsCapturingCondition = microphone_ns.class_(
async def setup_microphone_core_(var, config):
for conf in config.get(CONF_ON_DATA, []):
trigger = cg.new_Pvariable(conf[CONF_TRIGGER_ID], var)
# Future PR will change the vector type to uint8
await automation.build_automation(
trigger,
[(cg.std_vector.template(cg.int16).operator("ref").operator("const"), "x")],
@ -50,7 +61,7 @@ async def register_microphone(var, config):
await setup_microphone_core_(var, config)
MICROPHONE_SCHEMA = cv.Schema(
MICROPHONE_SCHEMA = cv.Schema.extend(audio.AUDIO_COMPONENT_SCHEMA).extend(
{
cv.Optional(CONF_ON_DATA): automation.validate_automation(
{
@ -64,7 +75,104 @@ MICROPHONE_SCHEMA = cv.Schema(
MICROPHONE_ACTION_SCHEMA = maybe_simple_id({cv.GenerateID(): cv.use_id(Microphone)})
async def media_player_action(config, action_id, template_arg, args):
def microphone_source_schema(
    min_bits_per_sample: int = 16,
    max_bits_per_sample: int = 16,
    min_channels: int = 1,
    max_channels: int = 1,
):
    """Build the configuration schema for a ``MicrophoneSource``.

    Components requesting microphone data should use this schema instead of accessing a microphone directly.
    The schema is wrapped in ``cv.maybe_simple_value`` keyed on ``CONF_MICROPHONE``.

    Args:
        min_bits_per_sample (int, optional): Minimum number of bits per sample the requesting component supports. Defaults to 16.
        max_bits_per_sample (int, optional): Maximum number of bits per sample the requesting component supports. Defaults to 16.
        min_channels (int, optional): Minimum number of channels the requesting component supports. Defaults to 1.
        max_channels (int, optional): Maximum number of channels the requesting component supports. Defaults to 1.

    Returns:
        The validator to use for the microphone source configuration entry.
    """

    def _reject_duplicate_channels(channel_list):
        # A set drops repeated entries, so equal sizes mean every channel was listed once.
        if len(set(channel_list)) == len(channel_list):
            return channel_list
        raise cv.Invalid("Channels must be unique")

    bits_per_sample_validator = cv.int_range(min_bits_per_sample, max_bits_per_sample)

    # Each entry must be a channel index from 0 to 7; the list length is bounded by
    # what the requesting component supports.
    channels_validator = cv.All(
        cv.ensure_list(cv.int_range(0, 7)),
        cv.Length(min=min_channels, max=max_channels),
        _reject_duplicate_channels,
    )

    gain_factor_validator = cv.int_range(1, 64)

    source_schema = {
        cv.GenerateID(CONF_ID): cv.declare_id(MicrophoneSource),
        cv.Required(CONF_MICROPHONE): cv.use_id(Microphone),
        cv.Optional(CONF_BITS_PER_SAMPLE, default=16): bits_per_sample_validator,
        cv.Optional(CONF_CHANNELS, default="0"): channels_validator,
        cv.Optional(CONF_GAIN_FACTOR, default="1"): gain_factor_validator,
    }

    return cv.All(cv.maybe_simple_value(source_schema, key=CONF_MICROPHONE))
_UNDEF = object()
def final_validate_microphone_source_schema(
    component_name: str, sample_rate: int = _UNDEF
):
    """Create a final validator checking that a microphone source can deliver the required audio.

    The returned validator checks the sample rate (only when one is given) and the enabled channels.

    Note that:
    - The MicrophoneSource class automatically handles converting bits per sample, so that is not validated here
    - microphone_source_schema already validates that channels are unique and limits how many may be listed

    Args:
        component_name (str): The name of the component requesting mic audio
        sample_rate (int, optional): The sample rate the component requesting mic audio requires

    Returns:
        A function that takes the microphone source config, raises on incompatibility, and otherwise
        returns the config unchanged.
    """

    def _check_source_config(source_config):
        if sample_rate is not _UNDEF:
            # A sample rate mismatch has to be fixed in the microphone's own configuration,
            # so the error message is phrased to point at the microphone.
            check_sample_rate = audio.final_validate_audio_schema(
                component_name,
                audio_device=CONF_MICROPHONE,
                sample_rate=sample_rate,
                audio_device_issue=True,
            )
            check_sample_rate(source_config)

        # An unavailable channel has to be fixed in the MicrophoneSource configuration,
        # so the error message is phrased to point at the requesting component.
        check_enabled_channels = audio.final_validate_audio_schema(
            component_name,
            audio_device=CONF_MICROPHONE,
            enabled_channels=source_config[CONF_CHANNELS],
            audio_device_issue=False,
        )
        check_enabled_channels(source_config)

        return source_config

    return _check_source_config
async def microphone_source_to_code(config):
    """Generate the code that creates a ``MicrophoneSource`` from a validated source config.

    The source is constructed with the referenced microphone, the bits per sample and the gain
    factor; every configured channel is then enabled through ``add_channel``.

    Args:
        config: Validated microphone source configuration.

    Returns:
        The generated ``MicrophoneSource`` variable.
    """
    microphone = await cg.get_variable(config[CONF_MICROPHONE])

    constructor_args = (
        microphone,
        config[CONF_BITS_PER_SAMPLE],
        config[CONF_GAIN_FACTOR],
    )
    source = cg.new_Pvariable(config[CONF_ID], *constructor_args)

    for enabled_channel in config[CONF_CHANNELS]:
        cg.add(source.add_channel(enabled_channel))

    return source
async def microphone_action(config, action_id, template_arg, args):
    """Create the object for a microphone action or condition and parent it to the microphone in ``config[CONF_ID]``.

    ``args`` is accepted to match the registration callback signature but is not used.
    """
    action = cg.new_Pvariable(action_id, template_arg)
    await cg.register_parented(action, config[CONF_ID])
    return action
@ -72,15 +180,15 @@ async def media_player_action(config, action_id, template_arg, args):
automation.register_action(
"microphone.capture", CaptureAction, MICROPHONE_ACTION_SCHEMA
)(media_player_action)
)(microphone_action)
automation.register_action(
"microphone.stop_capture", StopCaptureAction, MICROPHONE_ACTION_SCHEMA
)(media_player_action)
)(microphone_action)
automation.register_condition(
"microphone.is_capturing", IsCapturingCondition, MICROPHONE_ACTION_SCHEMA
)(media_player_action)
)(microphone_action)
@coroutine_with_priority(100.0)

View File

@ -1,5 +1,7 @@
#pragma once
#include "esphome/components/audio/audio.h"
#include <cstddef>
#include <cstdint>
#include <functional>
@ -28,9 +30,13 @@ class Microphone {
bool is_running() const { return this->state_ == STATE_RUNNING; }
bool is_stopped() const { return this->state_ == STATE_STOPPED; }
audio::AudioStreamInfo get_audio_stream_info() { return this->audio_stream_info_; }
protected:
State state_{STATE_STOPPED};
audio::AudioStreamInfo audio_stream_info_;
CallbackManager<void(const std::vector<int16_t> &)> data_callbacks_{};
};

View File

@ -0,0 +1,96 @@
#include "microphone_source.h"
namespace esphome {
namespace microphone {
/// @brief Wraps ``data_callback`` so that it only receives processed audio while this source is enabled.
///
/// The wrapper is built here, but it is not yet registered with the microphone (see the note at the end of the body).
void MicrophoneSource::add_data_callback(std::function<void(const std::vector<uint8_t> &)> &&data_callback) {
  using DataCallback = std::function<void(const std::vector<uint8_t> &)>;

  DataCallback filtered_callback = [this, data_callback](const std::vector<uint8_t> &raw_audio) {
    // Raw microphone data is dropped while the requesting component has not started this source.
    if (!this->enabled_) {
      return;
    }
    data_callback(this->process_audio_(raw_audio));
  };
  // Future PR will uncomment this! It requires changing the callback vector to an uint8_t in every component using a
  // mic callback.
  // this->mic_->add_data_callback(std::move(filtered_callback));
}
/// @brief Enables this source and starts the underlying microphone.
void MicrophoneSource::start() {
// The enabled flag is what lets the filtered callback built in add_data_callback() pass audio on.
this->enabled_ = true;
this->mic_->start();
}
/// @brief Disables this source and stops the underlying microphone.
void MicrophoneSource::stop() {
// Clearing the flag makes the filtered callback built in add_data_callback() ignore any further audio.
this->enabled_ = false;
// NOTE(review): this stops the microphone itself, not only this source - confirm that is intended if several
// sources can share one microphone.
this->mic_->stop();
}
/// @brief Converts raw microphone audio into the format requested by this source.
///
/// Only the channels enabled in ``channels_`` are kept. Each kept sample is converted to ``bits_per_sample_``
/// (rounded up to whole bytes), multiplied by ``gain_factor_`` and saturated to the target range. Samples are read
/// and written least significant byte first.
///
/// @param data Raw interleaved audio bytes in the microphone's stream format
/// @return The converted audio; empty if the source or target sample width is not 1 to 4 bytes
std::vector<uint8_t> MicrophoneSource::process_audio_(const std::vector<uint8_t> &data) {
  // Bit depth conversions are obtained by truncating bits or padding with zeros - no dithering is applied.

  // The getter returns the stream info by value, so fetch it once instead of copying it for every query.
  auto stream_info = this->mic_->get_audio_stream_info();
  const size_t source_bytes_per_sample = stream_info.samples_to_bytes(1);
  const size_t source_channels = stream_info.get_channels();
  const size_t source_bytes_per_frame = stream_info.frames_to_bytes(1);
  const uint32_t total_frames = stream_info.bytes_to_frames(data.size());

  const size_t target_bytes_per_sample = (this->bits_per_sample_ + 7) / 8;
  const size_t target_bytes_per_frame = target_bytes_per_sample * this->channels_.count();

  std::vector<uint8_t> filtered_data;

  // Samples are staged in a 32-bit word. Any other width would make the unsigned shift amounts below wrap around.
  if (source_bytes_per_sample == 0 || source_bytes_per_sample > 4 || target_bytes_per_sample == 0 ||
      target_bytes_per_sample > 4) {
    return filtered_data;
  }

  filtered_data.reserve(target_bytes_per_frame * total_frames);

  // Computed in 64 bits so that a 4 byte target does not overflow while building its limits.
  const int64_t target_max_value = (static_cast<int64_t>(1) << (8 * target_bytes_per_sample - 1)) - 1;
  const int64_t target_min_value = -target_max_value - 1;

  // ``channels_`` can only describe as many channels as it has bits; never test beyond that.
  const size_t channel_limit =
      source_channels < this->channels_.size() ? source_channels : this->channels_.size();

  for (size_t frame_index = 0; frame_index < total_frames; ++frame_index) {
    for (size_t channel_index = 0; channel_index < channel_limit; ++channel_index) {
      if (!this->channels_.test(channel_index)) {
        continue;
      }

      // Channel's current sample is included in the target mask. Convert bits per sample, if necessary.
      const size_t sample_index = frame_index * source_bytes_per_frame + channel_index * source_bytes_per_sample;

      // Copy the data into the most significant bits of the word to ensure the sign bit is correct. The bytes are
      // assembled unsigned so that no shift runs into the sign bit of a signed value.
      uint32_t raw_sample = 0;
      uint8_t bit_offset = (4 - source_bytes_per_sample) * 8;
      for (size_t i = 0; i < source_bytes_per_sample; ++i) {
        raw_sample |= static_cast<uint32_t>(data[sample_index + i]) << bit_offset;
        bit_offset += 8;
      }
      int32_t sample = static_cast<int32_t>(raw_sample);

      // Shift data back to the least significant bits
      if (source_bytes_per_sample >= target_bytes_per_sample) {
        // Keep source bytes per sample of data so that the gain multiplication uses all significant bits instead of
        // shifting to the target bytes per sample immediately, potentially losing information.
        sample >>= (4 - source_bytes_per_sample) * 8;  // ``source_bytes_per_sample`` bytes of valid data
      } else {
        // Keep padded zeros to match the target bytes per sample
        sample >>= (4 - target_bytes_per_sample) * 8;  // ``target_bytes_per_sample`` bytes of valid data
      }

      // Apply gain in 64 bits: a 32-bit sample times the gain factor can exceed the int32_t range, and the clamp
      // below can only saturate correctly if the product itself has not already overflowed.
      int64_t scaled_sample = static_cast<int64_t>(sample) * this->gain_factor_;

      // Match target output bytes by shifting out the least significant bits
      if (source_bytes_per_sample > target_bytes_per_sample) {
        // ``target_bytes_per_sample`` bytes of valid data
        scaled_sample >>= 8 * (source_bytes_per_sample - target_bytes_per_sample);
      }

      // Saturate to the target bytes per sample range in case the gain pushed the sample out of range
      if (scaled_sample > target_max_value) {
        scaled_sample = target_max_value;
      } else if (scaled_sample < target_min_value) {
        scaled_sample = target_min_value;
      }

      // Copy ``target_bytes_per_sample`` bytes to the output buffer, least significant byte first.
      uint32_t output_bits = static_cast<uint32_t>(static_cast<int32_t>(scaled_sample));
      for (size_t i = 0; i < target_bytes_per_sample; ++i) {
        filtered_data.push_back(static_cast<uint8_t>(output_bits & 0xFF));
        output_bits >>= 8;
      }
    }
  }

  return filtered_data;
}
} // namespace microphone
} // namespace esphome

View File

@ -0,0 +1,63 @@
#pragma once
#include <bitset>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>
#include "microphone.h"
namespace esphome {
namespace microphone {
/**
 * @brief Helper class that handles converting raw microphone data to a requested format.
 *
 * Components requesting microphone audio should register a callback through this class instead of registering a
 * callback directly with the microphone if a particular format is required.
 *
 * Raw microphone data may have a different number of bits per sample and number of channels than the requesting
 * component needs. This class handles the conversion by:
 *   - Wrapping the requesting component's callback so it receives the raw microphone data through this class
 *   - The ``process_audio_`` method handles the raw data
 *     - Only the channels set in the ``channels_`` bitset are passed through
 *     - Passed through samples have the bits per sample converted
 *     - A gain factor is optionally applied to increase the volume - audio may clip!
 *   - The processed audio is passed to the callback of the component requesting microphone data
 *   - It tracks an internal enabled state, so it ignores raw microphone data when the component requesting
 *     microphone data is not actively requesting audio.
 *
 * Note that this class cannot convert sample rates!
 */
class MicrophoneSource {
 public:
  /// @param mic Microphone providing the raw audio; stored as a raw pointer that this class does not delete
  /// @param bits_per_sample Bits per sample of the audio handed to the requesting component
  /// @param gain_factor Multiplier applied to every sample
  MicrophoneSource(Microphone *mic, uint8_t bits_per_sample, int32_t gain_factor)
      : mic_(mic), bits_per_sample_(bits_per_sample), gain_factor_(gain_factor) {}

  /// @brief Enables a channel to be processed through the callback.
  ///
  /// If the microphone component only reads from one channel, it is always in channel number 0, regardless if it
  /// represents left or right. If the microphone reads from both left and right, channel number 0 and 1 represent the
  /// left and right channels respectively.
  ///
  /// Channel numbers that do not fit in the ``channels_`` bitset are ignored, because ``std::bitset::set`` treats an
  /// out of range position as an error.
  ///
  /// @param channel 0-indexed channel number to enable
  void add_channel(uint8_t channel) {
    if (channel < this->channels_.size()) {
      this->channels_.set(channel);
    }
  }

  /// @brief Supplies the callback that should receive the processed audio.
  void add_data_callback(std::function<void(const std::vector<uint8_t> &)> &&data_callback);

  void start();
  void stop();

  /// @brief True only if the microphone reports running and this source is enabled.
  bool is_running() const { return (this->mic_->is_running() && this->enabled_); }
  /// @brief True whenever this source is not enabled; the microphone's own state is not consulted.
  bool is_stopped() const { return !this->enabled_; }

 protected:
  /// @brief Filters the enabled channels out of ``data`` and converts them to the requested bits per sample.
  std::vector<uint8_t> process_audio_(const std::vector<uint8_t> &data);

  Microphone *mic_;
  uint8_t bits_per_sample_;
  std::bitset<8> channels_;  // Bit N set means channel N is passed through
  int32_t gain_factor_;
  bool enabled_{false};
};
} // namespace microphone
} // namespace esphome