diff --git a/esphome/components/audio/audio.h b/esphome/components/audio/audio.h
index 2c556c68e2..95c31872e3 100644
--- a/esphome/components/audio/audio.h
+++ b/esphome/components/audio/audio.h
@@ -135,7 +135,7 @@ const char *audio_file_type_to_string(AudioFileType file_type);
 void scale_audio_samples(const int16_t *audio_samples, int16_t *output_buffer, int16_t scale_factor,
                          size_t samples_to_scale);
 
-/// @brief Unpacks a quantized audio sample into a Q31 fixed point number.
+/// @brief Unpacks a quantized audio sample into a Q31 fixed-point number.
 /// @param data Pointer to uint8_t array containing the audio sample
 /// @param bytes_per_sample The number of bytes per sample
 /// @return Q31 sample
@@ -160,5 +160,28 @@ inline int32_t unpack_audio_sample_to_q31(const uint8_t *data, size_t bytes_per_
   return sample;
 }
 
+/// @brief Packs a Q31 fixed-point number as a little-endian audio sample of ``bytes_per_sample`` bytes.
+/// Only the most significant bytes of ``sample`` are kept; lower bits are truncated and no dithering is applied.
+/// @param sample Q31 fixed-point number to pack
+/// @param data Pointer to the output bytes; must have room for ``bytes_per_sample`` bytes
+/// @param bytes_per_sample The audio data's bytes per sample (1 to 4); any other value leaves ``data`` untouched
+inline void pack_q31_as_audio_sample(int32_t sample, uint8_t *data, size_t bytes_per_sample) {
+  if (bytes_per_sample < 1 || bytes_per_sample > 4) {
+    return;  // Unsupported width: nothing is written
+  }
+
+  // Work on the unsigned bit pattern so the extracted bytes do not depend on how the compiler implements
+  // right shifts of negative values (implementation-defined before C++20).
+  const uint32_t bits = static_cast<uint32_t>(sample);
+
+  // Low-order bits to discard so the most significant byte of ``sample`` lands in the last output byte.
+  const size_t discarded_bits = 8 * (sizeof(sample) - bytes_per_sample);
+
+  // Emit the retained bytes from least to most significant.
+  for (size_t byte_index = 0; byte_index < bytes_per_sample; ++byte_index) {
+    data[byte_index] = static_cast<uint8_t>(bits >> (discarded_bits + 8 * byte_index));
+  }
+}
+
 }  // namespace audio
 }  // namespace esphome
diff --git a/esphome/components/microphone/__init__.py b/esphome/components/microphone/__init__.py
index 2fda99af05..29bdcfa3f3 100644
--- a/esphome/components/microphone/__init__.py
+++ b/esphome/components/microphone/__init__.py
@@ -162,13 +162,22 @@ def final_validate_microphone_source_schema(
     return _validate_audio_compatability
 
 
-async def microphone_source_to_code(config):
+async def microphone_source_to_code(config, passive=False):
+    """Creates a MicrophoneSource variable for codegen.
+
+    Setting passive to True makes the MicrophoneSource never start/stop the microphone itself; it only receives audio while another component has actively started the Microphone. If False, the microphone needs to be explicitly started/stopped through the MicrophoneSource.
+
+    Args:
+        config (Schema): Created with `microphone_source_schema` specifying bits per sample, channels, and gain factor
+        passive (bool): Enable passive mode for the MicrophoneSource
+    """
     mic = await cg.get_variable(config[CONF_MICROPHONE])
     mic_source = cg.new_Pvariable(
         config[CONF_ID],
         mic,
         config[CONF_BITS_PER_SAMPLE],
         config[CONF_GAIN_FACTOR],
+        passive,
     )
     for channel in config[CONF_CHANNELS]:
         cg.add(mic_source.add_channel(channel))
diff --git a/esphome/components/microphone/microphone_source.cpp b/esphome/components/microphone/microphone_source.cpp
index 1ea0deb22b..00efcf22a1 100644
--- a/esphome/components/microphone/microphone_source.cpp
+++ b/esphome/components/microphone/microphone_source.cpp
@@ -6,12 +6,10 @@ namespace microphone {
 static const int32_t Q25_MAX_VALUE = (1 << 25) - 1;
 static const int32_t Q25_MIN_VALUE = ~Q25_MAX_VALUE;
 
-static const uint32_t HISTORY_VALUES = 32;
-
 void MicrophoneSource::add_data_callback(std::function<void(const std::vector<uint8_t> &)> &&data_callback) {
   std::function<void(const std::vector<uint8_t> &)> filtered_callback =
       [this, data_callback](const std::vector<uint8_t> &data) {
-        if (this->enabled_) {
+        if (this->enabled_ || this->passive_) {
           if (this->processed_samples_.use_count() == 0) {
             // Create vector if its unused
             this->processed_samples_ = std::make_shared<std::vector<uint8_t>>();
@@ -32,13 +30,14 @@ audio::AudioStreamInfo MicrophoneSource::get_audio_stream_info() {
 }
 
 void MicrophoneSource::start() {
-  if (!this->enabled_) {
+  if (!this->enabled_ && !this->passive_) {
     this->enabled_ = true;
     this->mic_->start();
   }
 }
+
 void MicrophoneSource::stop() {
-  if (this->enabled_) {
+  if (this->enabled_ && !this->passive_) {
     this->enabled_ = false;
     this->mic_->stop();
     this->processed_samples_.reset();
@@ -63,8 +62,9 @@ void MicrophoneSource::process_audio_(const std::vector<uint8_t> &data, std::vec
   const size_t target_bytes_per_sample = (this->bits_per_sample_ + 7) / 8;
   const size_t target_bytes_per_frame = target_bytes_per_sample * this->channels_.count();
 
-  filtered_data.reserve(target_bytes_per_frame * total_frames);
-  filtered_data.resize(0);
+  filtered_data.resize(target_bytes_per_frame * total_frames);
+
+  uint8_t *current_data = filtered_data.data();
 
   for (uint32_t frame_index = 0; frame_index < total_frames; ++frame_index) {
     for (uint32_t channel_index = 0; channel_index < source_channels; ++channel_index) {
@@ -82,26 +82,10 @@ void MicrophoneSource::process_audio_(const std::vector<uint8_t> &data, std::vec
         // Clamp ``sample`` in case gain multiplication overflows 25 bits
         sample = clamp<int32_t>(sample, Q25_MIN_VALUE, Q25_MAX_VALUE);  // Q25
 
-        // Copy ``target_bytes_per_sample`` bytes to the output buffer.
-        if (target_bytes_per_sample == 1) {
-          sample >>= 18;  // Q25 -> Q7
-          filtered_data.push_back(static_cast<uint8_t>(sample));
-        } else if (target_bytes_per_sample == 2) {
-          sample >>= 10;  // Q25 -> Q15
-          filtered_data.push_back(static_cast<uint8_t>(sample));
-          filtered_data.push_back(static_cast<uint8_t>(sample >> 8));
-        } else if (target_bytes_per_sample == 3) {
-          sample >>= 2;  // Q25 -> Q23
-          filtered_data.push_back(static_cast<uint8_t>(sample));
-          filtered_data.push_back(static_cast<uint8_t>(sample >> 8));
-          filtered_data.push_back(static_cast<uint8_t>(sample >> 16));
-        } else {
-          sample *= (1 << 6);  // Q25 -> Q31
-          filtered_data.push_back(static_cast<uint8_t>(sample));
-          filtered_data.push_back(static_cast<uint8_t>(sample >> 8));
-          filtered_data.push_back(static_cast<uint8_t>(sample >> 16));
-          filtered_data.push_back(static_cast<uint8_t>(sample >> 24));
-        }
+        sample *= (1 << 6);  // Q25 -> Q31
+
+        audio::pack_q31_as_audio_sample(sample, current_data, target_bytes_per_sample);
+        current_data = current_data + target_bytes_per_sample;
       }
     }
   }
diff --git a/esphome/components/microphone/microphone_source.h b/esphome/components/microphone/microphone_source.h
index 7f8a37b360..228f2d9dc3 100644
--- a/esphome/components/microphone/microphone_source.h
+++ b/esphome/components/microphone/microphone_source.h
@@ -35,8 +35,8 @@ class MicrophoneSource {
    * Note that this class cannot convert sample rates!
    */
  public:
-  MicrophoneSource(Microphone *mic, uint8_t bits_per_sample, int32_t gain_factor)
-      : mic_(mic), bits_per_sample_(bits_per_sample), gain_factor_(gain_factor) {}
+  MicrophoneSource(Microphone *mic, uint8_t bits_per_sample, int32_t gain_factor, bool passive)
+      : mic_(mic), bits_per_sample_(bits_per_sample), gain_factor_(gain_factor), passive_(passive) {}
 
   /// @brief Enables a channel to be processed through the callback.
   ///
@@ -59,8 +59,8 @@ class MicrophoneSource {
 
   void start();
   void stop();
-  bool is_running() const { return (this->mic_->is_running() && this->enabled_); }
-  bool is_stopped() const { return !this->enabled_; }
+  bool is_running() const { return (this->mic_->is_running() && (this->enabled_ || this->passive_)); }
+  bool is_stopped() const { return !this->is_running(); }  // Exact inverse of is_running()
 
  protected:
   void process_audio_(const std::vector<uint8_t> &data, std::vector<uint8_t> &filtered_data);
@@ -72,6 +72,7 @@ class MicrophoneSource {
   std::bitset<8> channels_;
   int32_t gain_factor_;
   bool enabled_{false};
+  bool passive_{false};
 };
 
 }  // namespace microphone