[audio, microphone] - Allow MicrophoneSource to passively capture/optimization (#8732)

2025-07-28 14:16:40 +00:00 · 2025-05-09 16:54:33 -05:00 · 2025-05-09 16:54:33 -05:00 · bec9d91419
commit bec9d91419
parent 8399d894c1
4 changed files with 50 additions and 33 deletions
--- a/esphome/components/audio/audio.h
+++ b/esphome/components/audio/audio.h
@ -135,7 +135,7 @@ const char *audio_file_type_to_string(AudioFileType file_type);
 void scale_audio_samples(const int16_t *audio_samples, int16_t *output_buffer, int16_t scale_factor,
                         size_t samples_to_scale);

-/// @brief Unpacks a quantized audio sample into a Q31 fixed point number.
+/// @brief Unpacks a quantized audio sample into a Q31 fixed-point number.
 /// @param data Pointer to uint8_t array containing the audio sample
 /// @param bytes_per_sample The number of bytes per sample
 /// @return Q31 sample
@ -160,5 +160,28 @@ inline int32_t unpack_audio_sample_to_q31(const uint8_t *data, size_t bytes_per_
  return sample;
 }

+/// @brief Writes a Q31 fixed-point number into a byte buffer as an audio sample of the requested width.
+/// The most significant bytes of ``sample`` are kept and stored least significant byte first; the discarded
+/// low-order bits are truncated, no dithering is applied. Widths other than 1 to 4 bytes leave ``data`` untouched.
+/// @param sample Q31 fixed-point number to pack
+/// @param data Pointer to the destination array; at least ``bytes_per_sample`` bytes are written for valid widths
+/// @param bytes_per_sample The audio data's bytes per sample (1 to 4)
+inline void pack_q31_as_audio_sample(int32_t sample, uint8_t *data, size_t bytes_per_sample) {
+  if (bytes_per_sample < 1 || bytes_per_sample > 4) {
+    return;
+  }
+
+  // Start above the low-order bytes that are dropped, then emit one byte per step towards the top byte.
+  size_t shift = 8 * (4 - bytes_per_sample);
+  for (size_t byte_index = 0; byte_index < bytes_per_sample; ++byte_index, shift += 8) {
+    data[byte_index] = static_cast<uint8_t>(sample >> shift);
+  }
+}
+
 }  // namespace audio
 }  // namespace esphome
--- a/esphome/components/microphone/init.py
+++ b/esphome/components/microphone/init.py
@ -162,13 +162,22 @@ def final_validate_microphone_source_schema(
    return _validate_audio_compatability


-async def microphone_source_to_code(config):
+async def microphone_source_to_code(config, passive=False):
+    """Creates a MicrophoneSource variable for codegen.
+
+    Setting passive to True makes the MicrophoneSource never start/stop the microphone; it only receives audio when another component has actively started the Microphone. If False, the microphone needs to be explicitly started/stopped.
+
+    Args:
+        config (Schema): Created with `microphone_source_schema` specifying bits per sample, channels, and gain factor
+        passive (bool): Enable passive mode for the MicrophoneSource
+    """
    mic = await cg.get_variable(config[CONF_MICROPHONE])
    mic_source = cg.new_Pvariable(
        config[CONF_ID],
        mic,
        config[CONF_BITS_PER_SAMPLE],
        config[CONF_GAIN_FACTOR],
+        passive,
    )
    for channel in config[CONF_CHANNELS]:
        cg.add(mic_source.add_channel(channel))
--- a/esphome/components/microphone/microphone_source.cpp
+++ b/esphome/components/microphone/microphone_source.cpp
@ -6,12 +6,10 @@ namespace microphone {
 static const int32_t Q25_MAX_VALUE = (1 << 25) - 1;
 static const int32_t Q25_MIN_VALUE = ~Q25_MAX_VALUE;

-static const uint32_t HISTORY_VALUES = 32;
-
 void MicrophoneSource::add_data_callback(std::function<void(const std::vector<uint8_t> &)> &&data_callback) {
  std::function<void(const std::vector<uint8_t> &)> filtered_callback =
      [this, data_callback](const std::vector<uint8_t> &data) {
-        if (this->enabled_) {
+        if (this->enabled_ || this->passive_) {
          if (this->processed_samples_.use_count() == 0) {
            // Create vector if its unused
            this->processed_samples_ = std::make_shared<std::vector<uint8_t>>();
@ -32,13 +30,14 @@ audio::AudioStreamInfo MicrophoneSource::get_audio_stream_info() {
 }

 void MicrophoneSource::start() {
-  if (!this->enabled_) {
+  if (!this->enabled_ && !this->passive_) {
    this->enabled_ = true;
    this->mic_->start();
  }
 }
+
 void MicrophoneSource::stop() {
-  if (this->enabled_) {
+  if (this->enabled_ && !this->passive_) {
    this->enabled_ = false;
    this->mic_->stop();
    this->processed_samples_.reset();
@ -63,8 +62,9 @@ void MicrophoneSource::process_audio_(const std::vector<uint8_t> &data, std::vec
  const size_t target_bytes_per_sample = (this->bits_per_sample_ + 7) / 8;
  const size_t target_bytes_per_frame = target_bytes_per_sample * this->channels_.count();

-  filtered_data.reserve(target_bytes_per_frame * total_frames);
-  filtered_data.resize(0);
+  filtered_data.resize(target_bytes_per_frame * total_frames);
+
+  uint8_t *current_data = filtered_data.data();

  for (uint32_t frame_index = 0; frame_index < total_frames; ++frame_index) {
    for (uint32_t channel_index = 0; channel_index < source_channels; ++channel_index) {
@ -82,26 +82,10 @@ void MicrophoneSource::process_audio_(const std::vector<uint8_t> &data, std::vec
        // Clamp ``sample`` in case gain multiplication overflows 25 bits
        sample = clamp<int32_t>(sample, Q25_MIN_VALUE, Q25_MAX_VALUE);  // Q25

-        // Copy ``target_bytes_per_sample`` bytes to the output buffer.
-        if (target_bytes_per_sample == 1) {
-          sample >>= 18;  // Q25 -> Q7
-          filtered_data.push_back(static_cast<uint8_t>(sample));
-        } else if (target_bytes_per_sample == 2) {
-          sample >>= 10;  // Q25 -> Q15
-          filtered_data.push_back(static_cast<uint8_t>(sample));
-          filtered_data.push_back(static_cast<uint8_t>(sample >> 8));
-        } else if (target_bytes_per_sample == 3) {
-          sample >>= 2;  // Q25 -> Q23
-          filtered_data.push_back(static_cast<uint8_t>(sample));
-          filtered_data.push_back(static_cast<uint8_t>(sample >> 8));
-          filtered_data.push_back(static_cast<uint8_t>(sample >> 16));
-        } else {
-          sample *= (1 << 6);  // Q25 -> Q31
-          filtered_data.push_back(static_cast<uint8_t>(sample));
-          filtered_data.push_back(static_cast<uint8_t>(sample >> 8));
-          filtered_data.push_back(static_cast<uint8_t>(sample >> 16));
-          filtered_data.push_back(static_cast<uint8_t>(sample >> 24));
-        }
+        sample *= (1 << 6);  // Q25 -> Q31
+
+        audio::pack_q31_as_audio_sample(sample, current_data, target_bytes_per_sample);
+        current_data = current_data + target_bytes_per_sample;
      }
    }
  }
--- a/esphome/components/microphone/microphone_source.h
+++ b/esphome/components/microphone/microphone_source.h
@ -35,8 +35,8 @@ class MicrophoneSource {
   * Note that this class cannot convert sample rates!
   */
 public:
-  MicrophoneSource(Microphone *mic, uint8_t bits_per_sample, int32_t gain_factor)
-      : mic_(mic), bits_per_sample_(bits_per_sample), gain_factor_(gain_factor) {}
+  MicrophoneSource(Microphone *mic, uint8_t bits_per_sample, int32_t gain_factor, bool passive)
+      : mic_(mic), bits_per_sample_(bits_per_sample), gain_factor_(gain_factor), passive_(passive) {}

  /// @brief Enables a channel to be processed through the callback.
  ///
@ -59,8 +59,8 @@ class MicrophoneSource {

  void start();
  void stop();
-  bool is_running() const { return (this->mic_->is_running() && this->enabled_); }
-  bool is_stopped() const { return !this->enabled_; }
+  bool is_running() const { return (this->mic_->is_running() && (this->enabled_ || this->passive_)); }
+  /// @brief Reports whether this source is not running; always the logical negation of ``is_running()``.
+  bool is_stopped() const { return !this->is_running(); }

 protected:
  void process_audio_(const std::vector<uint8_t> &data, std::vector<uint8_t> &filtered_data);
@ -72,6 +72,7 @@ class MicrophoneSource {
  std::bitset<8> channels_;
  int32_t gain_factor_;
  bool enabled_{false};
+  bool passive_{false};
 };

 }  // namespace microphone