[audio, microphone] Add MicrophoneSource helper class (#8641)

Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
2025-07-28 14:16:40 +00:00 · 2025-04-28 19:05:07 -05:00 · 2025-04-28 19:05:07 -05:00 · 844569e96b
commit 844569e96b
parent 43580739ac
8 changed files with 365 additions and 39 deletions
--- a/2
+++ b/2
@ -278,7 +278,7 @@ esphome/components/mdns/* @esphome/core
 esphome/components/media_player/* @jesserockz
 esphome/components/micro_wake_word/* @jesserockz @kahrendt
 esphome/components/micronova/* @jorre05
-esphome/components/microphone/* @jesserockz
+esphome/components/microphone/* @jesserockz @kahrendt
 esphome/components/mics_4514/* @jesserockz
 esphome/components/midea/* @dudanov
 esphome/components/midea_ir/* @dudanov
--- a/esphome/components/audio/init.py
+++ b/esphome/components/audio/init.py
@ -48,6 +48,12 @@ def set_stream_limits(
    min_sample_rate: int = _UNDEF,
    max_sample_rate: int = _UNDEF,
 ):
+    """Sets the limits for the audio stream that an audio component can handle
+
+    When the component sinks audio (e.g., a speaker), these indicate the limits to the audio it can receive.
+    When the component sources audio (e.g., a microphone), these indicate the limits to the audio it can send.
+    """
+
    def set_limits_in_config(config):
        if min_bits_per_sample is not _UNDEF:
            config[CONF_MIN_BITS_PER_SAMPLE] = min_bits_per_sample
@ -69,43 +75,87 @@ def final_validate_audio_schema(
    name: str,
    *,
    audio_device: str,
-    bits_per_sample: int,
-    channels: int,
-    sample_rate: int,
+    bits_per_sample: int = _UNDEF,
+    channels: int = _UNDEF,
+    sample_rate: int = _UNDEF,
+    enabled_channels: list[int] = _UNDEF,
+    audio_device_issue: bool = False,
 ):
+    """Validates audio compatibility when passed between different components.
+
+    The component derived from ``AUDIO_COMPONENT_SCHEMA`` should call ``set_stream_limits`` in a validator to specify its compatible settings
+
+      - If audio_device_issue is True, then the error message indicates the user should adjust the AUDIO_COMPONENT_SCHEMA derived component's configuration to match the values passed to this function
+      - If audio_device_issue is False, then the error message indicates the user should adjust the configuration of the component calling this function, as it falls out of the valid stream limits
+
+    Args:
+        name (str): Friendly name of the component calling this function with an audio component to validate
+        audio_device (str): The configuration parameter name that contains the ID of an AUDIO_COMPONENT_SCHEMA derived component to validate against
+        bits_per_sample (int, optional): The desired bits per sample
+        channels (int, optional): The desired number of channels
+        sample_rate (int, optional): The desired sample rate
+        enabled_channels (list[int], optional): The desired enabled channels
+        audio_device_issue (bool, optional): Format the error message to indicate the problem is in the configuration for the ``audio_device`` component. Defaults to False.
+    """
+
    def validate_audio_compatiblity(audio_config):
        audio_schema = {}

-        try:
-            cv.int_range(
-                min=audio_config.get(CONF_MIN_BITS_PER_SAMPLE),
-                max=audio_config.get(CONF_MAX_BITS_PER_SAMPLE),
-            )(bits_per_sample)
-        except cv.Invalid as exc:
-            raise cv.Invalid(
-                f"Invalid configuration for the {name} component. The {CONF_BITS_PER_SAMPLE} {str(exc)}"
-            ) from exc
+        if bits_per_sample is not _UNDEF:
+            try:
+                cv.int_range(
+                    min=audio_config.get(CONF_MIN_BITS_PER_SAMPLE),
+                    max=audio_config.get(CONF_MAX_BITS_PER_SAMPLE),
+                )(bits_per_sample)
+            except cv.Invalid as exc:
+                if audio_device_issue:
+                    error_string = f"Invalid configuration for the specified {audio_device}. The {name} component requires {bits_per_sample} bits per sample."
+                else:
+                    error_string = f"Invalid configuration for the {name} component. The {CONF_BITS_PER_SAMPLE} {str(exc)}"
+                raise cv.Invalid(error_string) from exc

-        try:
-            cv.int_range(
-                min=audio_config.get(CONF_MIN_CHANNELS),
-                max=audio_config.get(CONF_MAX_CHANNELS),
-            )(channels)
-        except cv.Invalid as exc:
-            raise cv.Invalid(
-                f"Invalid configuration for the {name} component. The {CONF_NUM_CHANNELS} {str(exc)}"
-            ) from exc
+        if channels is not _UNDEF:
+            try:
+                cv.int_range(
+                    min=audio_config.get(CONF_MIN_CHANNELS),
+                    max=audio_config.get(CONF_MAX_CHANNELS),
+                )(channels)
+            except cv.Invalid as exc:
+                if audio_device_issue:
+                    error_string = f"Invalid configuration for the specified {audio_device}. The {name} component requires {channels} channels."
+                else:
+                    error_string = f"Invalid configuration for the {name} component. The {CONF_NUM_CHANNELS} {str(exc)}"
+                raise cv.Invalid(error_string) from exc

-        try:
-            cv.int_range(
-                min=audio_config.get(CONF_MIN_SAMPLE_RATE),
-                max=audio_config.get(CONF_MAX_SAMPLE_RATE),
-            )(sample_rate)
-            return cv.Schema(audio_schema, extra=cv.ALLOW_EXTRA)(audio_config)
-        except cv.Invalid as exc:
-            raise cv.Invalid(
-                f"Invalid configuration for the {name} component. The {CONF_SAMPLE_RATE} {str(exc)}"
-            ) from exc
+        if sample_rate is not _UNDEF:
+            try:
+                cv.int_range(
+                    min=audio_config.get(CONF_MIN_SAMPLE_RATE),
+                    max=audio_config.get(CONF_MAX_SAMPLE_RATE),
+                )(sample_rate)
+            except cv.Invalid as exc:
+                if audio_device_issue:
+                    error_string = f"Invalid configuration for the specified {audio_device}. The {name} component requires a {sample_rate} sample rate."
+                else:
+                    error_string = f"Invalid configuration for the {name} component. The {CONF_SAMPLE_RATE} {str(exc)}"
+                raise cv.Invalid(error_string) from exc
+
+        if enabled_channels is not _UNDEF:
+            for channel in enabled_channels:
+                try:
+                    # Channels are 0-indexed
+                    cv.int_range(
+                        min=0,
+                        max=audio_config.get(CONF_MAX_CHANNELS) - 1,
+                    )(channel)
+                except cv.Invalid as exc:
+                    if audio_device_issue:
+                        error_string = f"Invalid configuration for the specified {audio_device}. The {name} component requires channel {channel}."
+                    else:
+                        error_string = f"Invalid configuration for the {name} component. Enabled channel {channel} {str(exc)}"
+                    raise cv.Invalid(error_string) from exc
+
+        return cv.Schema(audio_schema, extra=cv.ALLOW_EXTRA)(audio_config)

    return cv.Schema(
        {
--- a/esphome/components/audio/audio_resampler.cpp
+++ b/esphome/components/audio/audio_resampler.cpp
@ -4,6 +4,8 @@

 #include "esphome/core/hal.h"

+#include <cstring>
+
 namespace esphome {
 namespace audio {

--- a/esphome/components/audio/audio_resampler.h
+++ b/esphome/components/audio/audio_resampler.h
@ -6,6 +6,7 @@
 #include "audio_transfer_buffer.h"

 #include "esphome/core/defines.h"
+#include "esphome/core/helpers.h"
 #include "esphome/core/ring_buffer.h"

 #ifdef USE_SPEAKER
--- a/esphome/components/microphone/init.py
+++ b/esphome/components/microphone/init.py
@ -1,12 +1,21 @@
 from esphome import automation
 from esphome.automation import maybe_simple_id
 import esphome.codegen as cg
+from esphome.components import audio
 import esphome.config_validation as cv
-from esphome.const import CONF_ID, CONF_TRIGGER_ID
+from esphome.const import (
+    CONF_BITS_PER_SAMPLE,
+    CONF_CHANNELS,
+    CONF_GAIN_FACTOR,
+    CONF_ID,
+    CONF_MICROPHONE,
+    CONF_TRIGGER_ID,
+)
 from esphome.core import CORE
 from esphome.coroutine import coroutine_with_priority

-CODEOWNERS = ["@jesserockz"]
+AUTO_LOAD = ["audio"]
+CODEOWNERS = ["@jesserockz", "@kahrendt"]

 IS_PLATFORM_COMPONENT = True

@ -15,6 +24,7 @@ CONF_ON_DATA = "on_data"
 microphone_ns = cg.esphome_ns.namespace("microphone")

 Microphone = microphone_ns.class_("Microphone")
+MicrophoneSource = microphone_ns.class_("MicrophoneSource")

 CaptureAction = microphone_ns.class_(
    "CaptureAction", automation.Action, cg.Parented.template(Microphone)
@ -37,6 +47,7 @@ IsCapturingCondition = microphone_ns.class_(
 async def setup_microphone_core_(var, config):
    for conf in config.get(CONF_ON_DATA, []):
        trigger = cg.new_Pvariable(conf[CONF_TRIGGER_ID], var)
+        # Future PR will change the vector type to uint8
        await automation.build_automation(
            trigger,
            [(cg.std_vector.template(cg.int16).operator("ref").operator("const"), "x")],
@ -50,7 +61,7 @@ async def register_microphone(var, config):
    await setup_microphone_core_(var, config)


-MICROPHONE_SCHEMA = cv.Schema(
+MICROPHONE_SCHEMA = cv.Schema.extend(audio.AUDIO_COMPONENT_SCHEMA).extend(
    {
        cv.Optional(CONF_ON_DATA): automation.validate_automation(
            {
@ -64,7 +75,104 @@ MICROPHONE_SCHEMA = cv.Schema(
 MICROPHONE_ACTION_SCHEMA = maybe_simple_id({cv.GenerateID(): cv.use_id(Microphone)})


-async def media_player_action(config, action_id, template_arg, args):
+def microphone_source_schema(
+    min_bits_per_sample: int = 16,
+    max_bits_per_sample: int = 16,
+    min_channels: int = 1,
+    max_channels: int = 1,
+):
+    """Build the configuration schema for a microphone source.
+
+    A component that consumes microphone audio should embed this schema rather than referencing a microphone directly.
+
+    Args:
+      min_bits_per_sample (int, optional): Smallest bits per sample the requesting component accepts. Defaults to 16.
+      max_bits_per_sample (int, optional): Largest bits per sample the requesting component accepts. Defaults to 16.
+      min_channels (int, optional): Fewest channels the requesting component accepts. Defaults to 1.
+      max_channels (int, optional): Most channels the requesting component accepts. Defaults to 1.
+
+    Returns:
+        The validator to use for the microphone source configuration entry.
+    """
+
+    def _validate_unique_channels(channels):
+        # A set drops duplicates, so a size mismatch means some channel was listed more than once
+        if len(set(channels)) != len(channels):
+            raise cv.Invalid("Channels must be unique")
+        return channels
+
+    bits_per_sample_validator = cv.int_range(
+        min=min_bits_per_sample, max=max_bits_per_sample
+    )
+
+    # Channel numbers are limited to 0-7; the number of listed channels is bounded by the requesting component
+    channels_validator = cv.All(
+        cv.ensure_list(cv.int_range(0, 7)),
+        cv.Length(min=min_channels, max=max_channels),
+        _validate_unique_channels,
+    )
+
+    source_schema = {
+        cv.GenerateID(CONF_ID): cv.declare_id(MicrophoneSource),
+        cv.Required(CONF_MICROPHONE): cv.use_id(Microphone),
+        cv.Optional(CONF_BITS_PER_SAMPLE, default=16): bits_per_sample_validator,
+        cv.Optional(CONF_CHANNELS, default="0"): channels_validator,
+        cv.Optional(CONF_GAIN_FACTOR, default="1"): cv.int_range(1, 64),
+    }
+
+    return cv.All(cv.maybe_simple_value(source_schema, key=CONF_MICROPHONE))
+
+
+_UNDEF = object()
+
+
+def final_validate_microphone_source_schema(
+    component_name: str, sample_rate: int = _UNDEF
+):
+    """Create a final validator that checks a microphone can supply audio in the format a component needs.
+
+    The validator checks the sample rate (only when one is given) and the enabled channels.
+
+    Nothing else needs checking here:
+      - bits per sample are converted automatically by the MicrophoneSource class
+      - channel uniqueness and the supported channel count are already enforced by microphone_source_schema
+
+    Args:
+        component_name (str): The name of the component requesting mic audio
+        sample_rate (int, optional): The sample rate the component requesting mic audio requires
+
+    Returns:
+        A callable that takes the microphone source configuration, runs the checks and returns the configuration
+        unchanged.
+    """
+
+    def _check_microphone_compatibility(config):
+        if sample_rate is not _UNDEF:
+            # A sample rate mismatch has to be resolved in the microphone's own configuration
+            validate_sample_rate = audio.final_validate_audio_schema(
+                component_name,
+                audio_device=CONF_MICROPHONE,
+                sample_rate=sample_rate,
+                audio_device_issue=True,
+            )
+            validate_sample_rate(config)
+
+        # An unavailable channel has to be resolved in the MicrophoneSource configuration
+        validate_enabled_channels = audio.final_validate_audio_schema(
+            component_name,
+            audio_device=CONF_MICROPHONE,
+            enabled_channels=config[CONF_CHANNELS],
+            audio_device_issue=False,
+        )
+        validate_enabled_channels(config)
+
+        return config
+
+    return _check_microphone_compatibility
+
+
+async def microphone_source_to_code(config):
+    """Generate the ``MicrophoneSource`` variable for a microphone source configuration.
+
+    The source is constructed from the referenced microphone, the configured bits per sample and the gain factor;
+    every configured channel is then enabled on it.
+
+    Returns:
+        The generated ``MicrophoneSource`` variable.
+    """
+    microphone_var = await cg.get_variable(config[CONF_MICROPHONE])
+    constructor_args = (
+        microphone_var,
+        config[CONF_BITS_PER_SAMPLE],
+        config[CONF_GAIN_FACTOR],
+    )
+    source_var = cg.new_Pvariable(config[CONF_ID], *constructor_args)
+
+    for enabled_channel in config[CONF_CHANNELS]:
+        cg.add(source_var.add_channel(enabled_channel))
+
+    return source_var
+
+
+async def microphone_action(config, action_id, template_arg, args):
+    """Create the automation object and parent it to the microphone referenced by ``config[CONF_ID]``.
+
+    Shared by the ``microphone.capture`` and ``microphone.stop_capture`` actions and the
+    ``microphone.is_capturing`` condition. ``args`` is not used.
+    """
    var = cg.new_Pvariable(action_id, template_arg)
    await cg.register_parented(var, config[CONF_ID])
    return var
@ -72,15 +180,15 @@ async def media_player_action(config, action_id, template_arg, args):

 automation.register_action(
    "microphone.capture", CaptureAction, MICROPHONE_ACTION_SCHEMA
-)(media_player_action)
+)(microphone_action)

 automation.register_action(
    "microphone.stop_capture", StopCaptureAction, MICROPHONE_ACTION_SCHEMA
-)(media_player_action)
+)(microphone_action)

 automation.register_condition(
    "microphone.is_capturing", IsCapturingCondition, MICROPHONE_ACTION_SCHEMA
-)(media_player_action)
+)(microphone_action)


@coroutine_with_priority(100.0)
--- a/esphome/components/microphone/microphone.h
+++ b/esphome/components/microphone/microphone.h
@ -1,5 +1,7 @@
 #pragma once

+#include "esphome/components/audio/audio.h"
+
 #include <cstddef>
 #include <cstdint>
 #include <functional>
@ -28,9 +30,13 @@ class Microphone {
  bool is_running() const { return this->state_ == STATE_RUNNING; }
  bool is_stopped() const { return this->state_ == STATE_STOPPED; }

+  /// @brief Returns a copy of the stream info stored in ``audio_stream_info_``.
+  /// Const-qualified like the neighbouring state accessors, so it can be called through a const Microphone.
+  audio::AudioStreamInfo get_audio_stream_info() const { return this->audio_stream_info_; }
+
 protected:
  State state_{STATE_STOPPED};

+  audio::AudioStreamInfo audio_stream_info_;
+
  CallbackManager<void(const std::vector<int16_t> &)> data_callbacks_{};
 };

--- a/esphome/components/microphone/microphone_source.cpp
+++ b/esphome/components/microphone/microphone_source.cpp
@ -0,0 +1,96 @@
+#include "microphone_source.h"
+
+namespace esphome {
+namespace microphone {
+
+void MicrophoneSource::add_data_callback(std::function<void(const std::vector<uint8_t> &)> &&data_callback) {
+  // Wrap the requesting component's callback: raw audio is dropped while this source is disabled, otherwise it is
+  // converted by ``process_audio_`` before being handed on.
+  std::function<void(const std::vector<uint8_t> &)> gated_callback =
+      [this, data_callback](const std::vector<uint8_t> &raw_data) {
+        if (!this->enabled_) {
+          return;
+        }
+        data_callback(this->process_audio_(raw_data));
+      };
+  // The wrapper is not registered with the microphone yet. A future PR will uncomment the line below; it requires
+  // changing the callback vector to an uint8_t in every component using a mic callback.
+  // this->mic_->add_data_callback(std::move(gated_callback));
+}
+
+// Marks this source as enabled, then starts the underlying microphone.
+void MicrophoneSource::start() {
+  this->enabled_ = true;
+  this->mic_->start();
+}
+// Marks this source as disabled, then stops the underlying microphone.
+// NOTE(review): this stops the microphone itself, so presumably any other user of the same microphone is affected
+// as well - confirm whether that is intended.
+void MicrophoneSource::stop() {
+  this->enabled_ = false;
+  this->mic_->stop();
+}
+
+std::vector<uint8_t> MicrophoneSource::process_audio_(const std::vector<uint8_t> &data) {
+  // Converts raw interleaved microphone audio into the requested format:
+  //   - only channels enabled in ``channels_`` are kept
+  //   - each kept sample is converted to ``bits_per_sample_`` bits, multiplied by ``gain_factor_`` and clamped
+  //
+  // Bit depth conversions are obtained by truncating bits or padding with zeros - no dithering is applied.
+  // Samples are read and written least significant byte first. Both the source and the target sample widths are
+  // assumed to be between 1 and 4 bytes; wider samples do not fit the 32 bit assembly step below.
+
+  // Fetch the stream info once; the getter returns a copy on every call
+  auto stream_info = this->mic_->get_audio_stream_info();
+
+  const size_t source_bytes_per_sample = stream_info.samples_to_bytes(1);
+  const size_t source_channels = stream_info.get_channels();
+  const size_t source_bytes_per_frame = stream_info.frames_to_bytes(1);
+  const uint32_t total_frames = stream_info.bytes_to_frames(data.size());
+
+  const size_t target_bytes_per_sample = (this->bits_per_sample_ + 7) / 8;
+  const size_t target_bytes_per_frame = target_bytes_per_sample * this->channels_.count();
+
+  std::vector<uint8_t> filtered_data;
+  filtered_data.reserve(target_bytes_per_frame * total_frames);
+
+  // Computed in 64 bits: ``1 << 31`` overflows a 32 bit int when the target uses 4 bytes per sample
+  const int64_t target_max_value = (static_cast<int64_t>(1) << (8 * target_bytes_per_sample - 1)) - 1;
+  const int64_t target_min_value = -target_max_value - 1;
+
+  for (size_t frame_index = 0; frame_index < total_frames; ++frame_index) {
+    for (size_t channel_index = 0; channel_index < source_channels; ++channel_index) {
+      if (!this->channels_.test(channel_index)) {
+        // Channel is not part of the target mask
+        continue;
+      }
+
+      // Channel's current sample is included in the target mask. Convert bits per sample, if necessary.
+      const size_t sample_index = frame_index * source_bytes_per_frame + channel_index * source_bytes_per_sample;
+
+      // Copy the data into the most significant bits so the sign bit ends up in the right place. The bytes are
+      // assembled as unsigned to avoid shifting a promoted signed int into its sign bit.
+      uint32_t raw_sample = 0;
+      uint8_t bit_offset = (4 - source_bytes_per_sample) * 8;
+      for (size_t i = 0; i < source_bytes_per_sample; ++i) {
+        raw_sample |= static_cast<uint32_t>(data[sample_index + i]) << bit_offset;
+        bit_offset += 8;
+      }
+
+      // Widen to 64 bits so that the gain multiplication below cannot overflow before the clamp
+      int64_t sample = static_cast<int32_t>(raw_sample);
+
+      // Shift data back to the least significant bits
+      if (source_bytes_per_sample >= target_bytes_per_sample) {
+        // Keep source bytes per sample of data so that the gain multiplication uses all significant bits instead of
+        // shifting to the target bytes per sample immediately, potentially losing information.
+        sample >>= (4 - source_bytes_per_sample) * 8;  // ``source_bytes_per_sample`` bytes of valid data
+      } else {
+        // Keep padded zeros to match the target bytes per sample
+        sample >>= (4 - target_bytes_per_sample) * 8;  // ``target_bytes_per_sample`` bytes of valid data
+      }
+
+      // Apply gain using multiplication
+      sample *= this->gain_factor_;
+
+      // Match target output bytes by shifting out the least significant bits
+      if (source_bytes_per_sample > target_bytes_per_sample) {
+        sample >>= 8 * (source_bytes_per_sample - target_bytes_per_sample);  // ``target_bytes_per_sample`` bytes
+      }
+
+      // Clamp ``sample`` to the target bytes per sample range in case the gain pushed it out of range
+      sample = clamp<int64_t>(sample, target_min_value, target_max_value);
+
+      // Copy ``target_bytes_per_sample`` bytes to the output buffer, least significant byte first
+      for (size_t i = 0; i < target_bytes_per_sample; ++i) {
+        filtered_data.push_back(static_cast<uint8_t>(sample));
+        sample >>= 8;
+      }
+    }
+  }
+
+  return filtered_data;
+}
+
+}  // namespace microphone
+}  // namespace esphome
--- a/esphome/components/microphone/microphone_source.h
+++ b/esphome/components/microphone/microphone_source.h
@ -0,0 +1,63 @@
+#pragma once
+
+#include <bitset>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <vector>
+#include "microphone.h"
+
+namespace esphome {
+namespace microphone {
+
+class MicrophoneSource {
+  /*
+   * @brief Helper class that handles converting raw microphone data to a requested format.
+   * Components requesting microphone audio should register a callback through this class instead of registering a
+   * callback directly with the microphone if a particular format is required.
+   *
+   * Raw microphone data may have a different number of bits per sample and number of channels than the requesting
+   * component needs. This class handles the conversion by:
+   *   - Internally adds a callback to receive the raw microphone data
+   *   - The ``process_audio_`` handles the raw data
+   *     - Only the channels set in the ``channels_`` bitset are passed through
+   *     - Passed through samples have the bits per sample converted
+   *     - A gain factor is optionally applied to increase the volume - audio may clip!
+   *   - The processed audio is passed to the callback of the component requesting microphone data
+   *   - It tracks an internal enabled state, so it ignores raw microphone data when the component requesting
+   *     microphone data is not actively requesting audio.
+   *
+   * Note that this class cannot convert sample rates!
+   */
+ public:
+  /// @param mic Microphone that provides the raw audio
+  /// @param bits_per_sample Bits per sample of the audio passed to the requesting component
+  /// @param gain_factor Factor every passed through sample is multiplied by
+  MicrophoneSource(Microphone *mic, uint8_t bits_per_sample, int32_t gain_factor)
+      : mic_(mic), bits_per_sample_(bits_per_sample), gain_factor_(gain_factor) {}
+
+  /// @brief Enables a channel to be processed through the callback.
+  ///
+  /// If the microphone component only reads from one channel, it is always in channel number 0, regardless if it
+  /// represents left or right. If the microphone reads from both left and right, channel number 0 and 1 represent the
+  /// left and right channels respectively.
+  ///
+  /// Channel numbers that do not fit in the ``channels_`` bitset are ignored.
+  ///
+  /// @param channel 0-indexed channel number to enable
+  void add_channel(uint8_t channel) {
+    // std::bitset::set throws std::out_of_range for a position past the end, so guard the index first
+    if (channel < this->channels_.size()) {
+      this->channels_.set(channel);
+    }
+  }
+
+  /// @brief Wraps ``data_callback`` so it only receives processed audio while this source is enabled.
+  void add_data_callback(std::function<void(const std::vector<uint8_t> &)> &&data_callback);
+
+  void start();
+  void stop();
+  /// True when the microphone is running and this source is enabled.
+  bool is_running() const { return (this->mic_->is_running() && this->enabled_); }
+  /// True when this source is not enabled, regardless of the microphone's own state.
+  bool is_stopped() const { return !this->enabled_; }
+
+ protected:
+  /// @brief Filters the enabled channels out of ``data`` and converts them to the requested format.
+  std::vector<uint8_t> process_audio_(const std::vector<uint8_t> &data);
+
+  Microphone *mic_;
+  uint8_t bits_per_sample_;
+  // Bit ``n`` set means channel ``n`` is passed through
+  std::bitset<8> channels_;
+  int32_t gain_factor_;
+  bool enabled_{false};
+};
+
+}  // namespace microphone
+}  // namespace esphome