diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp
index 9cf7d10936..647bbc7653 100644
--- a/esphome/components/voice_assistant/voice_assistant.cpp
+++ b/esphome/components/voice_assistant/voice_assistant.cpp
@@ -35,6 +35,27 @@ void VoiceAssistant::setup() {
       temp_ring_buffer->write((void *) data.data(), data.size());
     }
   });
+
+#ifdef USE_MEDIA_PLAYER
+  if (this->media_player_ != nullptr) {
+    this->media_player_->add_on_state_callback([this]() {
+      switch (this->media_player_->state) {
+        case media_player::MediaPlayerState::MEDIA_PLAYER_STATE_ANNOUNCING:
+          if (this->media_player_response_state_ == MediaPlayerResponseState::URL_SENT) {
+            // State changed to announcing after receiving the url
+            this->media_player_response_state_ = MediaPlayerResponseState::PLAYING;
+          }
+          break;
+        default:
+          if (this->media_player_response_state_ == MediaPlayerResponseState::PLAYING) {
+            // No longer announcing the TTS response
+            this->media_player_response_state_ = MediaPlayerResponseState::FINISHED;
+          }
+          break;
+      }
+    });
+  }
+#endif
 }
 
 float VoiceAssistant::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; }
@@ -223,6 +244,13 @@ void VoiceAssistant::loop() {
       msg.wake_word_phrase = this->wake_word_;
       this->wake_word_ = "";
 
+      // Reset media player state tracking
+#ifdef USE_MEDIA_PLAYER
+      if (this->media_player_ != nullptr) {
+        this->media_player_response_state_ = MediaPlayerResponseState::IDLE;
+      }
+#endif
+
       if (this->api_client_ == nullptr || !this->api_client_->send_message(msg)) {
         ESP_LOGW(TAG, "Could not request start");
         this->error_trigger_->trigger("not-connected", "Could not request start");
@@ -314,17 +342,10 @@ void VoiceAssistant::loop() {
 #endif
 #ifdef USE_MEDIA_PLAYER
       if (this->media_player_ != nullptr) {
-        playing = (this->media_player_->state == media_player::MediaPlayerState::MEDIA_PLAYER_STATE_ANNOUNCING);
+        playing = (this->media_player_response_state_ == MediaPlayerResponseState::PLAYING);
 
-        if (playing && this->media_player_wait_for_announcement_start_) {
-          // Announcement has started playing, wait for it to finish
-          this->media_player_wait_for_announcement_start_ = false;
-          this->media_player_wait_for_announcement_end_ = true;
-        }
-
-        if (!playing && this->media_player_wait_for_announcement_end_) {
-          // Announcement has finished playing
-          this->media_player_wait_for_announcement_end_ = false;
+        if (this->media_player_response_state_ == MediaPlayerResponseState::FINISHED) {
+          this->media_player_response_state_ = MediaPlayerResponseState::IDLE;
           this->cancel_timeout("playing");
           ESP_LOGD(TAG, "Announcement finished playing");
           this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED);
@@ -555,7 +576,7 @@ void VoiceAssistant::request_stop() {
       break;
     case State::AWAITING_RESPONSE:
       this->signal_stop_();
-      // Fallthrough intended to stop a streaming TTS announcement that has potentially started
+      break;
     case State::STREAMING_RESPONSE:
 #ifdef USE_MEDIA_PLAYER
       // Stop any ongoing media player announcement
@@ -565,6 +586,10 @@ void VoiceAssistant::request_stop() {
             .set_announcement(true)
             .perform();
       }
+      if (this->started_streaming_tts_) {
+        // Haven't reached the TTS_END stage, so send the stop signal to HA.
+        this->signal_stop_();
+      }
 #endif
       break;
     case State::RESPONSE_FINISHED:
@@ -648,13 +673,16 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
       if (this->media_player_ != nullptr) {
         for (const auto &arg : msg.data) {
           if ((arg.name == "tts_start_streaming") && (arg.value == "1") && !this->tts_response_url_.empty()) {
+            this->media_player_response_state_ = MediaPlayerResponseState::URL_SENT;
+
             this->media_player_->make_call().set_media_url(this->tts_response_url_).set_announcement(true).perform();
 
-            this->media_player_wait_for_announcement_start_ = true;
-            this->media_player_wait_for_announcement_end_ = false;
             this->started_streaming_tts_ = true;
+            this->start_playback_timeout_();
+
             tts_url_for_trigger = this->tts_response_url_;
             this->tts_response_url_.clear();  // Reset streaming URL
+            this->set_state_(State::STREAMING_RESPONSE, State::STREAMING_RESPONSE);
           }
         }
       }
@@ -713,18 +741,22 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
       this->defer([this, url]() {
 #ifdef USE_MEDIA_PLAYER
         if ((this->media_player_ != nullptr) && (!this->started_streaming_tts_)) {
+          this->media_player_response_state_ = MediaPlayerResponseState::URL_SENT;
+
           this->media_player_->make_call().set_media_url(url).set_announcement(true).perform();
 
-          this->media_player_wait_for_announcement_start_ = true;
-          this->media_player_wait_for_announcement_end_ = false;
-          // Start the playback timeout, as the media player state isn't immediately updated
           this->start_playback_timeout_();
         }
+        this->started_streaming_tts_ = false;  // Helps indicate reaching the TTS_END stage
 #endif
         this->tts_end_trigger_->trigger(url);
       });
       State new_state = this->local_output_ ? State::STREAMING_RESPONSE : State::IDLE;
-      this->set_state_(new_state, new_state);
+      if (new_state != this->state_) {
+        // Don't needlessly change the state. The intent progress stage may have already changed the state to streaming
+        // response.
+        this->set_state_(new_state, new_state);
+      }
       break;
     }
     case api::enums::VOICE_ASSISTANT_RUN_END: {
@@ -875,6 +907,9 @@ void VoiceAssistant::on_announce(const api::VoiceAssistantAnnounceRequest &msg)
 #ifdef USE_MEDIA_PLAYER
   if (this->media_player_ != nullptr) {
     this->tts_start_trigger_->trigger(msg.text);
+
+    this->media_player_response_state_ = MediaPlayerResponseState::URL_SENT;
+
     if (!msg.preannounce_media_id.empty()) {
       this->media_player_->make_call().set_media_url(msg.preannounce_media_id).set_announcement(true).perform();
     }
@@ -886,9 +921,6 @@ void VoiceAssistant::on_announce(const api::VoiceAssistantAnnounceRequest &msg)
         .perform();
     this->continue_conversation_ = msg.start_conversation;
 
-    this->media_player_wait_for_announcement_start_ = true;
-    this->media_player_wait_for_announcement_end_ = false;
-    // Start the playback timeout, as the media player state isn't immediately updated
     this->start_playback_timeout_();
 
     if (this->continuous_) {
diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h
index 2424ea6052..95f77dbf09 100644
--- a/esphome/components/voice_assistant/voice_assistant.h
+++ b/esphome/components/voice_assistant/voice_assistant.h
@@ -90,6 +90,15 @@ struct Configuration {
   uint32_t max_active_wake_words;
 };
 
+#ifdef USE_MEDIA_PLAYER
+enum class MediaPlayerResponseState {
+  IDLE,
+  URL_SENT,
+  PLAYING,
+  FINISHED,
+};
+#endif
+
 class VoiceAssistant : public Component {
  public:
   VoiceAssistant();
@@ -272,8 +281,8 @@ class VoiceAssistant : public Component {
   media_player::MediaPlayer *media_player_{nullptr};
   std::string tts_response_url_{""};
   bool started_streaming_tts_{false};
-  bool media_player_wait_for_announcement_start_{false};
-  bool media_player_wait_for_announcement_end_{false};
+
+  MediaPlayerResponseState media_player_response_state_{MediaPlayerResponseState::IDLE};
 #endif
 
   bool local_output_{false};