From 88be14aaa37398acaf870f6040641b82a53cbe5e Mon Sep 17 00:00:00 2001
From: Kevin Ahrendt <kahrendt@gmail.com>
Date: Mon, 5 May 2025 16:23:50 -0500
Subject: [PATCH] [audio, microphone] Quantization Improvements (#8695)

---
 esphome/components/audio/audio.h              | 25 +++++
 .../microphone/microphone_source.cpp          | 97 +++++++++++--------
 .../components/microphone/microphone_source.h | 19 +++-
 3 files changed, 97 insertions(+), 44 deletions(-)

diff --git a/esphome/components/audio/audio.h b/esphome/components/audio/audio.h
index 6f0f1aaa46..2c556c68e2 100644
--- a/esphome/components/audio/audio.h
+++ b/esphome/components/audio/audio.h
@@ -135,5 +135,30 @@ const char *audio_file_type_to_string(AudioFileType file_type);
 void scale_audio_samples(const int16_t *audio_samples, int16_t *output_buffer, int16_t scale_factor,
                          size_t samples_to_scale);
 
+/// @brief Unpacks a quantized audio sample into a Q31 fixed point number.
+/// @param data Pointer to uint8_t array containing the audio sample
+/// @param bytes_per_sample The number of bytes per sample
+/// @return Q31 sample
+inline int32_t unpack_audio_sample_to_q31(const uint8_t *data, size_t bytes_per_sample) {
+  int32_t sample = 0;
+  if (bytes_per_sample == 1) {
+    sample |= data[0] << 24;
+  } else if (bytes_per_sample == 2) {
+    sample |= data[0] << 16;
+    sample |= data[1] << 24;
+  } else if (bytes_per_sample == 3) {
+    sample |= data[0] << 8;
+    sample |= data[1] << 16;
+    sample |= data[2] << 24;
+  } else if (bytes_per_sample == 4) {
+    sample |= data[0];
+    sample |= data[1] << 8;
+    sample |= data[2] << 16;
+    sample |= data[3] << 24;
+  }
+
+  return sample;
+}
+
 }  // namespace audio
 }  // namespace esphome
diff --git a/esphome/components/microphone/microphone_source.cpp b/esphome/components/microphone/microphone_source.cpp
index 35e8d5dd4d..1ea0deb22b 100644
--- a/esphome/components/microphone/microphone_source.cpp
+++ b/esphome/components/microphone/microphone_source.cpp
@@ -3,16 +3,34 @@
 namespace esphome {
 namespace microphone {
 
+static const int32_t Q25_MAX_VALUE = (1 << 25) - 1;
+static const int32_t Q25_MIN_VALUE = ~Q25_MAX_VALUE;
+
+static const uint32_t HISTORY_VALUES = 32;
+
 void MicrophoneSource::add_data_callback(std::function<void(const std::vector<uint8_t> &)> &&data_callback) {
   std::function<void(const std::vector<uint8_t> &)> filtered_callback =
       [this, data_callback](const std::vector<uint8_t> &data) {
         if (this->enabled_) {
-          data_callback(this->process_audio_(data));
+          if (this->processed_samples_.use_count() == 0) {
+            // Create vector if its unused
+            this->processed_samples_ = std::make_shared<std::vector<uint8_t>>();
+          }
+
+          // Take temporary ownership of samples vector to avoid deallaction before the callback finishes
+          std::shared_ptr<std::vector<uint8_t>> output_samples = this->processed_samples_;
+          this->process_audio_(data, *output_samples);
+          data_callback(*output_samples);
         }
       };
   this->mic_->add_data_callback(std::move(filtered_callback));
 }
 
+audio::AudioStreamInfo MicrophoneSource::get_audio_stream_info() {
+  return audio::AudioStreamInfo(this->bits_per_sample_, this->channels_.count(),
+                                this->mic_->get_audio_stream_info().get_sample_rate());
+}
+
 void MicrophoneSource::start() {
   if (!this->enabled_) {
     this->enabled_ = true;
@@ -23,14 +41,21 @@ void MicrophoneSource::stop() {
   if (this->enabled_) {
     this->enabled_ = false;
     this->mic_->stop();
+    this->processed_samples_.reset();
   }
 }
 
-std::vector<uint8_t> MicrophoneSource::process_audio_(const std::vector<uint8_t> &data) {
-  // Bit depth conversions are obtained by truncating bits or padding with zeros - no dithering is applied.
+void MicrophoneSource::process_audio_(const std::vector<uint8_t> &data, std::vector<uint8_t> &filtered_data) {
+  // - Bit depth conversions are obtained by truncating bits or padding with zeros - no dithering is applied.
+  // - In the comments, Qxx refers to a fixed point number with xx bits of precision for representing fractional values.
+  //   For example, audio with a bit depth of 16 can store a sample in a int16, which can be considered a Q15 number.
+  // - All samples are converted to Q25 before applying the gain factor - this results in a small precision loss for
+  //   data with 32 bits per sample. Since the maximum gain factor is 64 = (1<<6), this ensures that applying the gain
+  //   will never overflow a 32 bit signed integer. This still retains more bit depth than what is audibly noticeable.
+  // - Loops for reading/writing data buffers are unrolled, assuming little endian, for a small performance increase.
 
   const size_t source_bytes_per_sample = this->mic_->get_audio_stream_info().samples_to_bytes(1);
-  const size_t source_channels = this->mic_->get_audio_stream_info().get_channels();
+  const uint32_t source_channels = this->mic_->get_audio_stream_info().get_channels();
 
   const size_t source_bytes_per_frame = this->mic_->get_audio_stream_info().frames_to_bytes(1);
 
@@ -38,60 +63,48 @@ std::vector<uint8_t> MicrophoneSource::process_audio_(const std::vector<uint8_t>
   const size_t target_bytes_per_sample = (this->bits_per_sample_ + 7) / 8;
   const size_t target_bytes_per_frame = target_bytes_per_sample * this->channels_.count();
 
-  std::vector<uint8_t> filtered_data;
   filtered_data.reserve(target_bytes_per_frame * total_frames);
+  filtered_data.resize(0);
 
-  const int32_t target_min_value = -(1 << (8 * target_bytes_per_sample - 1));
-  const int32_t target_max_value = (1 << (8 * target_bytes_per_sample - 1)) - 1;
-
-  for (size_t frame_index = 0; frame_index < total_frames; ++frame_index) {
-    for (size_t channel_index = 0; channel_index < source_channels; ++channel_index) {
+  for (uint32_t frame_index = 0; frame_index < total_frames; ++frame_index) {
+    for (uint32_t channel_index = 0; channel_index < source_channels; ++channel_index) {
       if (this->channels_.test(channel_index)) {
         // Channel's current sample is included in the target mask. Convert bits per sample, if necessary.
 
-        size_t sample_index = frame_index * source_bytes_per_frame + channel_index * source_bytes_per_sample;
+        const uint32_t sample_index = frame_index * source_bytes_per_frame + channel_index * source_bytes_per_sample;
 
-        int32_t sample = 0;
-
-        // Copy the data into the most significant bits of the sample variable to ensure the sign bit is correct
-        uint8_t bit_offset = (4 - source_bytes_per_sample) * 8;
-        for (int i = 0; i < source_bytes_per_sample; ++i) {
-          sample |= data[sample_index + i] << bit_offset;
-          bit_offset += 8;
-        }
-
-        // Shift data back to the least significant bits
-        if (source_bytes_per_sample >= target_bytes_per_sample) {
-          // Keep source bytes per sample of data so that the gain multiplication uses all significant bits instead of
-          // shifting to the target bytes per sample immediately, potentially losing information.
-          sample >>= (4 - source_bytes_per_sample) * 8;  // ``source_bytes_per_sample`` bytes of valid data
-        } else {
-          // Keep padded zeros to match the target bytes per sample
-          sample >>= (4 - target_bytes_per_sample) * 8;  // ``target_bytes_per_sample`` bytes of valid data
-        }
+        int32_t sample = audio::unpack_audio_sample_to_q31(&data[sample_index], source_bytes_per_sample);  // Q31
+        sample >>= 6;                                                                                      // Q31 -> Q25
 
         // Apply gain using multiplication
-        sample *= this->gain_factor_;
+        sample *= this->gain_factor_;  // Q25
 
-        // Match target output bytes by shifting out the least significant bits
-        if (source_bytes_per_sample > target_bytes_per_sample) {
-          sample >>= 8 * (source_bytes_per_sample -
-                          target_bytes_per_sample);  //  ``target_bytes_per_sample`` bytes of valid data
-        }
-
-        // Clamp ``sample`` to the target bytes per sample range in case gain multiplication overflows
-        sample = clamp<int32_t>(sample, target_min_value, target_max_value);
+        // Clamp ``sample`` in case gain multiplication overflows 25 bits
+        sample = clamp<int32_t>(sample, Q25_MIN_VALUE, Q25_MAX_VALUE);  // Q25
 
         // Copy ``target_bytes_per_sample`` bytes to the output buffer.
-        for (int i = 0; i < target_bytes_per_sample; ++i) {
+        if (target_bytes_per_sample == 1) {
+          sample >>= 18;  // Q25 -> Q7
           filtered_data.push_back(static_cast<uint8_t>(sample));
-          sample >>= 8;
+        } else if (target_bytes_per_sample == 2) {
+          sample >>= 10;  // Q25 -> Q15
+          filtered_data.push_back(static_cast<uint8_t>(sample));
+          filtered_data.push_back(static_cast<uint8_t>(sample >> 8));
+        } else if (target_bytes_per_sample == 3) {
+          sample >>= 2;  // Q25 -> Q23
+          filtered_data.push_back(static_cast<uint8_t>(sample));
+          filtered_data.push_back(static_cast<uint8_t>(sample >> 8));
+          filtered_data.push_back(static_cast<uint8_t>(sample >> 16));
+        } else {
+          sample *= (1 << 6);  // Q25 -> Q31
+          filtered_data.push_back(static_cast<uint8_t>(sample));
+          filtered_data.push_back(static_cast<uint8_t>(sample >> 8));
+          filtered_data.push_back(static_cast<uint8_t>(sample >> 16));
+          filtered_data.push_back(static_cast<uint8_t>(sample >> 24));
         }
       }
     }
   }
-
-  return filtered_data;
 }
 
 }  // namespace microphone
diff --git a/esphome/components/microphone/microphone_source.h b/esphome/components/microphone/microphone_source.h
index 028920f101..7f8a37b360 100644
--- a/esphome/components/microphone/microphone_source.h
+++ b/esphome/components/microphone/microphone_source.h
@@ -1,15 +1,20 @@
 #pragma once
 
+#include "microphone.h"
+
+#include "esphome/components/audio/audio.h"
+
 #include <bitset>
 #include <cstddef>
 #include <cstdint>
 #include <functional>
 #include <vector>
-#include "microphone.h"
 
 namespace esphome {
 namespace microphone {
 
+static const int32_t MAX_GAIN_FACTOR = 64;
+
 class MicrophoneSource {
   /*
    * @brief Helper class that handles converting raw microphone data to a requested format.
@@ -44,13 +49,23 @@ class MicrophoneSource {
 
   void add_data_callback(std::function<void(const std::vector<uint8_t> &)> &&data_callback);
 
+  void set_gain_factor(int32_t gain_factor) { this->gain_factor_ = clamp<int32_t>(gain_factor, 1, MAX_GAIN_FACTOR); }
+  int32_t get_gain_factor() { return this->gain_factor_; }
+
+  /// @brief Gets the AudioStreamInfo of the data after processing
+  /// @return audio::AudioStreamInfo with the configured bits per sample, configured channel count, and source
+  ///         microphone's sample rate
+  audio::AudioStreamInfo get_audio_stream_info();
+
   void start();
   void stop();
   bool is_running() const { return (this->mic_->is_running() && this->enabled_); }
   bool is_stopped() const { return !this->enabled_; }
 
  protected:
-  std::vector<uint8_t> process_audio_(const std::vector<uint8_t> &data);
+  void process_audio_(const std::vector<uint8_t> &data, std::vector<uint8_t> &filtered_data);
+
+  std::shared_ptr<std::vector<uint8_t>> processed_samples_;
 
   Microphone *mic_;
   uint8_t bits_per_sample_;