From 9f629dcaa245053d313f9db26c778ca33c27541c Mon Sep 17 00:00:00 2001
From: Kevin Ahrendt <kevin.ahrendt@openhomefoundation.org>
Date: Tue, 29 Apr 2025 17:27:03 -0500
Subject: [PATCH] [i2s_audio, microphone, micro_wake_word, voice_assistant] Use
 microphone source to process incoming audio (#8645)

Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
---
 .../i2s_audio/microphone/__init__.py          | 45 ++++++++--
 .../microphone/i2s_audio_microphone.cpp       | 87 +++++++++----------
 .../microphone/i2s_audio_microphone.h         |  4 +-
 .../components/micro_wake_word/__init__.py    | 27 +++++-
 .../micro_wake_word/micro_wake_word.cpp       | 14 +--
 .../micro_wake_word/micro_wake_word.h         |  8 +-
 esphome/components/microphone/__init__.py     |  8 +-
 esphome/components/microphone/automation.h    |  4 +-
 esphome/components/microphone/microphone.h    |  5 +-
 .../microphone/microphone_source.cpp          |  4 +-
 .../components/voice_assistant/__init__.py    | 26 +++++-
 .../voice_assistant/voice_assistant.cpp       | 20 ++---
 .../voice_assistant/voice_assistant.h         |  6 +-
 tests/components/micro_wake_word/common.yaml  |  1 +
 tests/components/voice_assistant/common.yaml  |  5 +-
 15 files changed, 166 insertions(+), 98 deletions(-)

diff --git a/esphome/components/i2s_audio/microphone/__init__.py b/esphome/components/i2s_audio/microphone/__init__.py
index 4950a25751..06eb29986d 100644
--- a/esphome/components/i2s_audio/microphone/__init__.py
+++ b/esphome/components/i2s_audio/microphone/__init__.py
@@ -1,13 +1,20 @@
 from esphome import pins
 import esphome.codegen as cg
-from esphome.components import esp32, microphone
+from esphome.components import audio, esp32, microphone
 from esphome.components.adc import ESP32_VARIANT_ADC1_PIN_TO_CHANNEL, validate_adc_pin
 import esphome.config_validation as cv
-from esphome.const import CONF_ID, CONF_NUMBER
+from esphome.const import (
+    CONF_BITS_PER_SAMPLE,
+    CONF_CHANNEL,
+    CONF_ID,
+    CONF_NUM_CHANNELS,
+    CONF_NUMBER,
+    CONF_SAMPLE_RATE,
+)
 
 from .. import (
-    CONF_CHANNEL,
     CONF_I2S_DIN_PIN,
+    CONF_LEFT,
     CONF_MONO,
     CONF_RIGHT,
     I2SAudioIn,
@@ -32,7 +39,7 @@ INTERNAL_ADC_VARIANTS = [esp32.const.VARIANT_ESP32]
 PDM_VARIANTS = [esp32.const.VARIANT_ESP32, esp32.const.VARIANT_ESP32S3]
 
 
-def validate_esp32_variant(config):
+def _validate_esp32_variant(config):
     variant = esp32.get_esp32_variant()
     if config[CONF_ADC_TYPE] == "external":
         if config[CONF_PDM]:
@@ -46,12 +53,34 @@ def validate_esp32_variant(config):
     raise NotImplementedError
 
 
-def validate_channel(config):
+def _validate_channel(config):  # raises cv.Invalid when the channel is CONF_MONO, else returns config
     if config[CONF_CHANNEL] == CONF_MONO:
         raise cv.Invalid(f"I2S microphone does not support {CONF_MONO}.")
     return config
 
 
+def _set_num_channels_from_config(config):
+    """Store CONF_NUM_CHANNELS in ``config``: 1 for a left/right channel, else 2."""
+    # Selecting only the left or only the right channel gives a one-channel
+    # stream; every other channel setting is recorded as two channels.
+    single_channel = config[CONF_CHANNEL] in (CONF_LEFT, CONF_RIGHT)
+    config[CONF_NUM_CHANNELS] = 1 if single_channel else 2
+    return config
+
+
+def _set_stream_limits(config):
+    """Call audio.set_stream_limits with min == max per property and apply it."""
+    bits = config.get(CONF_BITS_PER_SAMPLE)
+    channels = config.get(CONF_NUM_CHANNELS)
+    rate = config.get(CONF_SAMPLE_RATE)
+    # Each min/max pair collapses to the single configured value.
+    limits = {"min_bits_per_sample": bits, "max_bits_per_sample": bits}
+    limits.update({"min_channels": channels, "max_channels": channels})
+    limits.update({"min_sample_rate": rate, "max_sample_rate": rate})
+    audio.set_stream_limits(**limits)(config)
+    return config
+
+
 BASE_SCHEMA = microphone.MICROPHONE_SCHEMA.extend(
     i2s_audio_component_schema(
         I2SAudioMicrophone,
@@ -79,8 +108,10 @@ CONFIG_SCHEMA = cv.All(
         },
         key=CONF_ADC_TYPE,
     ),
-    validate_esp32_variant,
-    validate_channel,
+    _validate_esp32_variant,
+    _validate_channel,
+    _set_num_channels_from_config,
+    _set_stream_limits,
 )
 
 
diff --git a/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp b/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp
index 3ab3c88142..78a7f92c2f 100644
--- a/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp
+++ b/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp
@@ -56,6 +56,35 @@ void I2SAudioMicrophone::start_() {
   }
   esp_err_t err;
 
+  uint8_t channel_count = 1;
+#ifdef USE_I2S_LEGACY
+  uint8_t bits_per_sample = this->bits_per_sample_;
+
+  if (this->channel_ == I2S_CHANNEL_FMT_RIGHT_LEFT) {
+    channel_count = 2;
+  }
+#else
+  if (this->slot_bit_width_ == I2S_SLOT_BIT_WIDTH_AUTO) {
+    this->slot_bit_width_ = I2S_SLOT_BIT_WIDTH_16BIT;
+  }
+  uint8_t bits_per_sample = this->slot_bit_width_;
+
+  if (this->slot_mode_ == I2S_SLOT_MODE_STEREO) {
+    channel_count = 2;
+  }
+#endif
+
+#ifdef USE_ESP32_VARIANT_ESP32
+  // ESP32 reads audio aligned to a multiple of 2 bytes. For example, if configured for 24 bits per sample, then it will
+  // produce 32 bits per sample, where the actual data is in the most significant bits. Other ESP32 variants produce 24
+  // bits per sample in this situation.
+  if (bits_per_sample < 16) {
+    bits_per_sample = 16;
+  } else if ((bits_per_sample > 16) && (bits_per_sample <= 32)) {
+    bits_per_sample = 32;
+  }
+#endif
+
 #ifdef USE_I2S_LEGACY
   i2s_driver_config_t config = {
       .mode = (i2s_mode_t) (this->i2s_mode_ | I2S_MODE_RX),
@@ -144,6 +173,8 @@ void I2SAudioMicrophone::start_() {
   i2s_std_gpio_config_t pin_config = this->parent_->get_pin_config();
 #if SOC_I2S_SUPPORTS_PDM_RX
   if (this->pdm_) {
+    bits_per_sample = 16;  // PDM mics are always 16 bits per sample with the IDF 5 driver
+
     i2s_pdm_rx_clk_config_t clk_cfg = {
         .sample_rate_hz = this->sample_rate_,
         .clk_src = clk_src,
@@ -187,13 +218,8 @@ void I2SAudioMicrophone::start_() {
         .clk_src = clk_src,
         .mclk_multiple = I2S_MCLK_MULTIPLE_256,
     };
-    i2s_data_bit_width_t data_bit_width;
-    if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_8BIT) {
-      data_bit_width = I2S_DATA_BIT_WIDTH_16BIT;
-    } else {
-      data_bit_width = I2S_DATA_BIT_WIDTH_8BIT;
-    }
-    i2s_std_slot_config_t std_slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(data_bit_width, this->slot_mode_);
+    i2s_std_slot_config_t std_slot_cfg =
+        I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG((i2s_data_bit_width_t) this->slot_bit_width_, this->slot_mode_);
     std_slot_cfg.slot_bit_width = this->slot_bit_width_;
     std_slot_cfg.slot_mask = this->std_slot_mask_;
 
@@ -222,6 +248,8 @@ void I2SAudioMicrophone::start_() {
   }
 #endif
 
+  this->audio_stream_info_ = audio::AudioStreamInfo(bits_per_sample, channel_count, this->sample_rate_);
+
   this->state_ = microphone::STATE_RUNNING;
   this->high_freq_.start();
   this->status_clear_error();
@@ -284,7 +312,7 @@ void I2SAudioMicrophone::stop_() {
   this->status_clear_error();
 }
 
-size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wait) {
+size_t I2SAudioMicrophone::read_(uint8_t *buf, size_t len, TickType_t ticks_to_wait) {
   size_t bytes_read = 0;
 #ifdef USE_I2S_LEGACY
   esp_err_t err = i2s_read(this->parent_->get_port(), buf, len, &bytes_read, ticks_to_wait);
@@ -303,38 +331,7 @@ size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wa
     return 0;
   }
   this->status_clear_warning();
-  // ESP-IDF I2S implementation right-extends 8-bit data to 16 bits,
-  // and 24-bit data to 32 bits.
-#ifdef USE_I2S_LEGACY
-  switch (this->bits_per_sample_) {
-    case I2S_BITS_PER_SAMPLE_8BIT:
-    case I2S_BITS_PER_SAMPLE_16BIT:
-      return bytes_read;
-    case I2S_BITS_PER_SAMPLE_24BIT:
-    case I2S_BITS_PER_SAMPLE_32BIT: {
-      size_t samples_read = bytes_read / sizeof(int32_t);
-      for (size_t i = 0; i < samples_read; i++) {
-        int32_t temp = reinterpret_cast<int32_t *>(buf)[i] >> 14;
-        buf[i] = clamp<int16_t>(temp, INT16_MIN, INT16_MAX);
-      }
-      return samples_read * sizeof(int16_t);
-    }
-    default:
-      ESP_LOGE(TAG, "Unsupported bits per sample: %d", this->bits_per_sample_);
-      return 0;
-  }
-#else
-#ifndef USE_ESP32_VARIANT_ESP32
-  // For newer ESP32 variants 8 bit data needs to be extended to 16 bit.
-  if (this->slot_bit_width_ == I2S_SLOT_BIT_WIDTH_8BIT) {
-    size_t samples_read = bytes_read / sizeof(int8_t);
-    for (size_t i = samples_read - 1; i >= 0; i--) {
-      int16_t temp = static_cast<int16_t>(reinterpret_cast<int8_t *>(buf)[i]) << 8;
-      buf[i] = temp;
-    }
-    return samples_read * sizeof(int16_t);
-  }
-#else
+#if defined(USE_ESP32_VARIANT_ESP32) and not defined(USE_I2S_LEGACY)
   // For ESP32 8/16 bit standard mono mode samples need to be switched.
   if (this->slot_mode_ == I2S_SLOT_MODE_MONO && this->slot_bit_width_ <= 16 && !this->pdm_) {
     size_t samples_read = bytes_read / sizeof(int16_t);
@@ -346,14 +343,14 @@ size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wa
   }
 #endif
   return bytes_read;
-#endif
 }
 
 void I2SAudioMicrophone::read_() {
-  std::vector<int16_t> samples;
-  samples.resize(BUFFER_SIZE);
-  size_t bytes_read = this->read(samples.data(), BUFFER_SIZE * sizeof(int16_t), 0);
-  samples.resize(bytes_read / sizeof(int16_t));
+  // Size the buffer via ms_to_bytes(32), read with a zero-tick wait, and keep only the bytes actually read.
+  const size_t chunk_bytes = this->audio_stream_info_.ms_to_bytes(32);
+  std::vector<uint8_t> samples(chunk_bytes);
+  const size_t received = this->read_(samples.data(), chunk_bytes, 0);
+  samples.resize(received);
   this->data_callbacks_.call(samples);
 }
 
diff --git a/esphome/components/i2s_audio/microphone/i2s_audio_microphone.h b/esphome/components/i2s_audio/microphone/i2s_audio_microphone.h
index 2dbacb447e..072d312e0f 100644
--- a/esphome/components/i2s_audio/microphone/i2s_audio_microphone.h
+++ b/esphome/components/i2s_audio/microphone/i2s_audio_microphone.h
@@ -25,9 +25,6 @@ class I2SAudioMicrophone : public I2SAudioIn, public microphone::Microphone, pub
 
   void set_pdm(bool pdm) { this->pdm_ = pdm; }
 
-  size_t read(int16_t *buf, size_t len, TickType_t ticks_to_wait);
-  size_t read(int16_t *buf, size_t len) override { return this->read(buf, len, pdMS_TO_TICKS(100)); }
-
 #ifdef USE_I2S_LEGACY
 #if SOC_I2S_SUPPORTS_ADC
   void set_adc_channel(adc1_channel_t channel) {
@@ -41,6 +38,7 @@ class I2SAudioMicrophone : public I2SAudioIn, public microphone::Microphone, pub
   void start_();
   void stop_();
   void read_();
+  size_t read_(uint8_t *buf, size_t len, TickType_t ticks_to_wait);
 
 #ifdef USE_I2S_LEGACY
   int8_t din_pin_{I2S_PIN_NO_CHANGE};
diff --git a/esphome/components/micro_wake_word/__init__.py b/esphome/components/micro_wake_word/__init__.py
index 0862406e46..9d5caca937 100644
--- a/esphome/components/micro_wake_word/__init__.py
+++ b/esphome/components/micro_wake_word/__init__.py
@@ -328,7 +328,14 @@ CONFIG_SCHEMA = cv.All(
     cv.Schema(
         {
             cv.GenerateID(): cv.declare_id(MicroWakeWord),
-            cv.GenerateID(CONF_MICROPHONE): cv.use_id(microphone.Microphone),
+            cv.Optional(
+                CONF_MICROPHONE, default={}
+            ): microphone.microphone_source_schema(
+                min_bits_per_sample=16,
+                max_bits_per_sample=16,
+                min_channels=1,
+                max_channels=1,
+            ),
             cv.Required(CONF_MODELS): cv.ensure_list(
                 cv.maybe_simple_value(MODEL_SCHEMA, key=CONF_MODEL)
             ),
@@ -404,15 +411,27 @@ def _feature_step_size_validate(config):
             raise cv.Invalid("Cannot load models with different features step sizes.")
 
 
-FINAL_VALIDATE_SCHEMA = _feature_step_size_validate
+FINAL_VALIDATE_SCHEMA = cv.All(  # microphone source check + feature step size check
+    cv.Schema(
+        {
+            cv.Required(
+                CONF_MICROPHONE
+            ): microphone.final_validate_microphone_source_schema(
+                "micro_wake_word", sample_rate=16000
+            ),
+        },
+        extra=cv.ALLOW_EXTRA,
+    ),
+    _feature_step_size_validate,  # rejects models with different feature step sizes
+)
 
 
 async def to_code(config):
     var = cg.new_Pvariable(config[CONF_ID])
     await cg.register_component(var, config)
 
-    mic = await cg.get_variable(config[CONF_MICROPHONE])
-    cg.add(var.set_microphone(mic))
+    mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE])
+    cg.add(var.set_microphone_source(mic_source))
 
     esp32.add_idf_component(
         name="esp-tflite-micro",
diff --git a/esphome/components/micro_wake_word/micro_wake_word.cpp b/esphome/components/micro_wake_word/micro_wake_word.cpp
index 533aa9fb75..dd1a8be378 100644
--- a/esphome/components/micro_wake_word/micro_wake_word.cpp
+++ b/esphome/components/micro_wake_word/micro_wake_word.cpp
@@ -61,7 +61,7 @@ void MicroWakeWord::dump_config() {
 void MicroWakeWord::setup() {
   ESP_LOGCONFIG(TAG, "Setting up microWakeWord...");
 
-  this->microphone_->add_data_callback([this](const std::vector<int16_t> &data) {
+  this->microphone_source_->add_data_callback([this](const std::vector<uint8_t> &data) {
     if (this->state_ != State::DETECTING_WAKE_WORD) {
       return;
     }
@@ -71,7 +71,7 @@ void MicroWakeWord::setup() {
 
       size_t bytes_free = temp_ring_buffer->free();
 
-      if (bytes_free < data.size() * sizeof(int16_t)) {
+      if (bytes_free < data.size()) {
         ESP_LOGW(
             TAG,
             "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). "
@@ -80,7 +80,7 @@ void MicroWakeWord::setup() {
 
         temp_ring_buffer->reset();
       }
-      temp_ring_buffer->write((void *) data.data(), data.size() * sizeof(int16_t));
+      temp_ring_buffer->write((void *) data.data(), data.size());
     }
   });
 
@@ -128,11 +128,11 @@ void MicroWakeWord::loop() {
       break;
     case State::START_MICROPHONE:
       ESP_LOGD(TAG, "Starting Microphone");
-      this->microphone_->start();
+      this->microphone_source_->start();
       this->set_state_(State::STARTING_MICROPHONE);
       break;
     case State::STARTING_MICROPHONE:
-      if (this->microphone_->is_running()) {
+      if (this->microphone_source_->is_running()) {
         this->set_state_(State::DETECTING_WAKE_WORD);
       }
       break;
@@ -148,13 +148,13 @@ void MicroWakeWord::loop() {
       break;
     case State::STOP_MICROPHONE:
       ESP_LOGD(TAG, "Stopping Microphone");
-      this->microphone_->stop();
+      this->microphone_source_->stop();
       this->set_state_(State::STOPPING_MICROPHONE);
       this->unload_models_();
       this->deallocate_buffers_();
       break;
     case State::STOPPING_MICROPHONE:
-      if (this->microphone_->is_stopped()) {
+      if (this->microphone_source_->is_stopped()) {
         this->set_state_(State::IDLE);
         if (this->detected_) {
           this->wake_word_detected_trigger_->trigger(this->detected_wake_word_);
diff --git a/esphome/components/micro_wake_word/micro_wake_word.h b/esphome/components/micro_wake_word/micro_wake_word.h
index 443911b1e4..b06d35ca1f 100644
--- a/esphome/components/micro_wake_word/micro_wake_word.h
+++ b/esphome/components/micro_wake_word/micro_wake_word.h
@@ -9,7 +9,7 @@
 #include "esphome/core/component.h"
 #include "esphome/core/ring_buffer.h"
 
-#include "esphome/components/microphone/microphone.h"
+#include "esphome/components/microphone/microphone_source.h"
 
 #include <frontend_util.h>
 
@@ -46,7 +46,9 @@ class MicroWakeWord : public Component {
 
   void set_features_step_size(uint8_t step_size) { this->features_step_size_ = step_size; }
 
-  void set_microphone(microphone::Microphone *microphone) { this->microphone_ = microphone; }
+  void set_microphone_source(microphone::MicrophoneSource *microphone_source) {
+    this->microphone_source_ = microphone_source;  // stored as a raw pointer; no ownership handling here
+  }
 
   Trigger<std::string> *get_wake_word_detected_trigger() const { return this->wake_word_detected_trigger_; }
 
@@ -59,7 +61,7 @@ class MicroWakeWord : public Component {
 #endif
 
  protected:
-  microphone::Microphone *microphone_{nullptr};
+  microphone::MicrophoneSource *microphone_source_{nullptr};
   Trigger<std::string> *wake_word_detected_trigger_ = new Trigger<std::string>();
   State state_{State::IDLE};
 
diff --git a/esphome/components/microphone/__init__.py b/esphome/components/microphone/__init__.py
index b9d24bc4a7..dcae513578 100644
--- a/esphome/components/microphone/__init__.py
+++ b/esphome/components/microphone/__init__.py
@@ -36,7 +36,7 @@ StopCaptureAction = microphone_ns.class_(
 
 DataTrigger = microphone_ns.class_(
     "DataTrigger",
-    automation.Trigger.template(cg.std_vector.template(cg.int16).operator("ref")),
+    automation.Trigger.template(cg.std_vector.template(cg.uint8).operator("ref")),
 )
 
 IsCapturingCondition = microphone_ns.class_(
@@ -98,10 +98,11 @@ def microphone_source_schema(
         return config
 
     return cv.All(
-        cv.maybe_simple_value(
+        automation.maybe_conf(
+            CONF_MICROPHONE,
             {
                 cv.GenerateID(CONF_ID): cv.declare_id(MicrophoneSource),
-                cv.Required(CONF_MICROPHONE): cv.use_id(Microphone),
+                cv.GenerateID(CONF_MICROPHONE): cv.use_id(Microphone),
                 cv.Optional(CONF_BITS_PER_SAMPLE, default=16): cv.int_range(
                     min_bits_per_sample, max_bits_per_sample
                 ),
@@ -112,7 +113,6 @@ def microphone_source_schema(
                 ),
                 cv.Optional(CONF_GAIN_FACTOR, default="1"): cv.int_range(1, 64),
             },
-            key=CONF_MICROPHONE,
         ),
     )
 
diff --git a/esphome/components/microphone/automation.h b/esphome/components/microphone/automation.h
index 29c0ec5df2..324699c0af 100644
--- a/esphome/components/microphone/automation.h
+++ b/esphome/components/microphone/automation.h
@@ -16,10 +16,10 @@ template<typename... Ts> class StopCaptureAction : public Action<Ts...>, public
   void play(Ts... x) override { this->parent_->stop(); }
 };
 
-class DataTrigger : public Trigger<const std::vector<int16_t> &> {
+class DataTrigger : public Trigger<const std::vector<uint8_t> &> {  // trigger payload is a byte vector
  public:
   explicit DataTrigger(Microphone *mic) {
-    mic->add_data_callback([this](const std::vector<int16_t> &data) { this->trigger(data); });
+    mic->add_data_callback([this](const std::vector<uint8_t> &data) { this->trigger(data); });  // forward as-is
   }
 };
 
diff --git a/esphome/components/microphone/microphone.h b/esphome/components/microphone/microphone.h
index 58552aa34a..cef8d0f4c3 100644
--- a/esphome/components/microphone/microphone.h
+++ b/esphome/components/microphone/microphone.h
@@ -22,10 +22,9 @@ class Microphone {
  public:
   virtual void start() = 0;
   virtual void stop() = 0;
-  void add_data_callback(std::function<void(const std::vector<int16_t> &)> &&data_callback) {
+  void add_data_callback(std::function<void(const std::vector<uint8_t> &)> &&data_callback) {
     this->data_callbacks_.add(std::move(data_callback));
   }
-  virtual size_t read(int16_t *buf, size_t len) = 0;
 
   bool is_running() const { return this->state_ == STATE_RUNNING; }
   bool is_stopped() const { return this->state_ == STATE_STOPPED; }
@@ -37,7 +36,7 @@ class Microphone {
 
   audio::AudioStreamInfo audio_stream_info_;
 
-  CallbackManager<void(const std::vector<int16_t> &)> data_callbacks_{};
+  CallbackManager<void(const std::vector<uint8_t> &)> data_callbacks_{};
 };
 
 }  // namespace microphone
diff --git a/esphome/components/microphone/microphone_source.cpp b/esphome/components/microphone/microphone_source.cpp
index 7e397348b9..dcd3b31622 100644
--- a/esphome/components/microphone/microphone_source.cpp
+++ b/esphome/components/microphone/microphone_source.cpp
@@ -10,9 +10,7 @@ void MicrophoneSource::add_data_callback(std::function<void(const std::vector<ui
           data_callback(this->process_audio_(data));
         }
       };
-  // Future PR will uncomment this! It requires changing the callback vector to an uint8_t in every component using a
-  // mic callback.
-  // this->mic_->add_data_callback(std::move(filtered_callback));
+  this->mic_->add_data_callback(std::move(filtered_callback));
 }
 
 void MicrophoneSource::start() {
diff --git a/esphome/components/voice_assistant/__init__.py b/esphome/components/voice_assistant/__init__.py
index e8cdca94b8..ca0b6da742 100644
--- a/esphome/components/voice_assistant/__init__.py
+++ b/esphome/components/voice_assistant/__init__.py
@@ -88,7 +88,14 @@ CONFIG_SCHEMA = cv.All(
     cv.Schema(
         {
             cv.GenerateID(): cv.declare_id(VoiceAssistant),
-            cv.GenerateID(CONF_MICROPHONE): cv.use_id(microphone.Microphone),
+            cv.Optional(
+                CONF_MICROPHONE, default={}
+            ): microphone.microphone_source_schema(
+                min_bits_per_sample=16,
+                max_bits_per_sample=16,
+                min_channels=1,
+                max_channels=1,
+            ),
             cv.Exclusive(CONF_SPEAKER, "output"): cv.use_id(speaker.Speaker),
             cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id(
                 media_player.MediaPlayer
@@ -163,13 +170,26 @@ CONFIG_SCHEMA = cv.All(
     tts_stream_validate,
 )
 
+FINAL_VALIDATE_SCHEMA = cv.All(  # final validation of the microphone source
+    cv.Schema(
+        {
+            cv.Optional(
+                CONF_MICROPHONE
+            ): microphone.final_validate_microphone_source_schema(
+                "voice_assistant", sample_rate=16000
+            ),
+        },
+        extra=cv.ALLOW_EXTRA,  # keys other than the microphone are not described in this schema
+    ),
+)
+
 
 async def to_code(config):
     var = cg.new_Pvariable(config[CONF_ID])
     await cg.register_component(var, config)
 
-    mic = await cg.get_variable(config[CONF_MICROPHONE])
-    cg.add(var.set_microphone(mic))
+    mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE])
+    cg.add(var.set_microphone_source(mic_source))
 
     if CONF_SPEAKER in config:
         spkr = await cg.get_variable(config[CONF_SPEAKER])
diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp
index c62767d7d5..37b97239c8 100644
--- a/esphome/components/voice_assistant/voice_assistant.cpp
+++ b/esphome/components/voice_assistant/voice_assistant.cpp
@@ -29,10 +29,10 @@ static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
 VoiceAssistant::VoiceAssistant() { global_voice_assistant = this; }
 
 void VoiceAssistant::setup() {
-  this->mic_->add_data_callback([this](const std::vector<int16_t> &data) {
+  this->mic_source_->add_data_callback([this](const std::vector<uint8_t> &data) {
     std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_;
     if (this->ring_buffer_.use_count() > 1) {
-      temp_ring_buffer->write((void *) data.data(), data.size() * sizeof(int16_t));
+      temp_ring_buffer->write((void *) data.data(), data.size());  // data holds bytes, so size() is the byte count
     }
   });
 }
@@ -162,7 +162,7 @@ void VoiceAssistant::reset_conversation_id() {
 void VoiceAssistant::loop() {
   if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
       this->state_ != State::STOPPING_MICROPHONE) {
-    if (this->mic_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
+    if (this->mic_source_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
       this->set_state_(State::STOP_MICROPHONE, State::IDLE);
     } else {
       this->set_state_(State::IDLE, State::IDLE);
@@ -193,12 +193,12 @@ void VoiceAssistant::loop() {
       }
       this->clear_buffers_();
 
-      this->mic_->start();
+      this->mic_source_->start();
       this->set_state_(State::STARTING_MICROPHONE);
       break;
     }
     case State::STARTING_MICROPHONE: {
-      if (this->mic_->is_running()) {
+      if (this->mic_source_->is_running()) {
         this->set_state_(this->desired_state_);
       }
       break;
@@ -262,8 +262,8 @@ void VoiceAssistant::loop() {
       break;
     }
     case State::STOP_MICROPHONE: {
-      if (this->mic_->is_running()) {
-        this->mic_->stop();
+      if (this->mic_source_->is_running()) {
+        this->mic_source_->stop();
         this->set_state_(State::STOPPING_MICROPHONE);
       } else {
         this->set_state_(this->desired_state_);
@@ -271,7 +271,7 @@ void VoiceAssistant::loop() {
       break;
     }
     case State::STOPPING_MICROPHONE: {
-      if (this->mic_->is_stopped()) {
+      if (this->mic_source_->is_stopped()) {
         this->set_state_(this->desired_state_);
       }
       break;
@@ -478,7 +478,7 @@ void VoiceAssistant::start_streaming() {
   ESP_LOGD(TAG, "Client started, streaming microphone");
   this->audio_mode_ = AUDIO_MODE_API;
 
-  if (this->mic_->is_running()) {
+  if (this->mic_source_->is_running()) {
     this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
   } else {
     this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);
@@ -508,7 +508,7 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por
     return;
   }
 
-  if (this->mic_->is_running()) {
+  if (this->mic_source_->is_running()) {
     this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
   } else {
     this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);
diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h
index cb57a6b05d..7122d69527 100644
--- a/esphome/components/voice_assistant/voice_assistant.h
+++ b/esphome/components/voice_assistant/voice_assistant.h
@@ -11,7 +11,7 @@
 
 #include "esphome/components/api/api_connection.h"
 #include "esphome/components/api/api_pb2.h"
-#include "esphome/components/microphone/microphone.h"
+#include "esphome/components/microphone/microphone_source.h"
 #ifdef USE_SPEAKER
 #include "esphome/components/speaker/speaker.h"
 #endif
@@ -98,7 +98,7 @@ class VoiceAssistant : public Component {
   void start_streaming(struct sockaddr_storage *addr, uint16_t port);
   void failed_to_start();
 
-  void set_microphone(microphone::Microphone *mic) { this->mic_ = mic; }
+  void set_microphone_source(microphone::MicrophoneSource *mic_source) { this->mic_source_ = mic_source; }
 #ifdef USE_SPEAKER
   void set_speaker(speaker::Speaker *speaker) {
     this->speaker_ = speaker;
@@ -249,7 +249,7 @@ class VoiceAssistant : public Component {
   bool has_timers_{false};
   bool timer_tick_running_{false};
 
-  microphone::Microphone *mic_{nullptr};
+  microphone::MicrophoneSource *mic_source_{nullptr};
 #ifdef USE_SPEAKER
   void write_speaker_();
   speaker::Speaker *speaker_{nullptr};
diff --git a/tests/components/micro_wake_word/common.yaml b/tests/components/micro_wake_word/common.yaml
index c5422baa67..b5507397f8 100644
--- a/tests/components/micro_wake_word/common.yaml
+++ b/tests/components/micro_wake_word/common.yaml
@@ -11,6 +11,7 @@ microphone:
     bits_per_sample: 16bit
 
 micro_wake_word:
+  microphone: echo_microphone
   on_wake_word_detected:
     - logger.log: "Wake word detected"
   models:
diff --git a/tests/components/voice_assistant/common.yaml b/tests/components/voice_assistant/common.yaml
index e7374941f7..f248154b7e 100644
--- a/tests/components/voice_assistant/common.yaml
+++ b/tests/components/voice_assistant/common.yaml
@@ -30,7 +30,10 @@ speaker:
     i2s_dout_pin: ${i2s_dout_pin}
 
 voice_assistant:
-  microphone: mic_id_external
+  microphone:
+    microphone: mic_id_external
+    gain_factor: 4
+    channels: 0
   speaker: speaker_id
   conversation_timeout: 60s
   on_listening: