Use different VAD thresholds for before and during voice command (#129848)

* Use two VAD thresholds
* Fix VoiceActivityTimeout class
* Update homeassistant/components/assist_pipeline/audio_enhancer.py

---------

Co-authored-by: Joost Lekkerkerker <joostlek@outlook.com>
2025-07-15 17:27:10 +00:00 · 2024-11-05 08:01:45 -06:00 · 2024-11-05 08:01:45 -06:00 · 69e3348cd7
commit 69e3348cd7
parent 6caa4baa00
4 changed files with 108 additions and 60 deletions
--- a/homeassistant/components/assist_pipeline/audio_enhancer.py
+++ b/homeassistant/components/assist_pipeline/audio_enhancer.py
@@ -22,8 +22,8 @@ class EnhancedAudioChunk:
    timestamp_ms: int
    """Timestamp relative to start of audio stream (milliseconds)"""

-    is_speech: bool | None
-    """True if audio chunk likely contains speech, False if not, None if unknown"""
+    speech_probability: float | None
+    """Probability that audio chunk contains speech (0-1), None if unknown"""


 class AudioEnhancer(ABC):
@@ -70,27 +70,27 @@ class MicroVadSpeexEnhancer(AudioEnhancer):
            )

        self.vad: MicroVad | None = None
-        self.threshold = 0.5

        if self.is_vad_enabled:
            self.vad = MicroVad()
-            _LOGGER.debug("Initialized microVAD with threshold=%s", self.threshold)
+            _LOGGER.debug("Initialized microVAD")

    def enhance_chunk(self, audio: bytes, timestamp_ms: int) -> EnhancedAudioChunk:
        """Enhance 10ms chunk of PCM audio @ 16Khz with 16-bit mono samples."""
-        is_speech: bool | None = None
+        speech_probability: float | None = None

        assert len(audio) == BYTES_PER_CHUNK

        if self.vad is not None:
            # Run VAD
-            speech_prob = self.vad.Process10ms(audio)
-            is_speech = speech_prob > self.threshold
+            speech_probability = self.vad.Process10ms(audio)

        if self.audio_processor is not None:
            # Run noise suppression and auto gain
            audio = self.audio_processor.Process10ms(audio).audio

        return EnhancedAudioChunk(
-            audio=audio, timestamp_ms=timestamp_ms, is_speech=is_speech
+            audio=audio,
+            timestamp_ms=timestamp_ms,
+            speech_probability=speech_probability,
        )
--- a/homeassistant/components/assist_pipeline/pipeline.py
+++ b/homeassistant/components/assist_pipeline/pipeline.py
@@ -780,7 +780,9 @@ class PipelineRun:
                # speaking the voice command.
                audio_chunks_for_stt.extend(
                    EnhancedAudioChunk(
-                        audio=chunk_ts[0], timestamp_ms=chunk_ts[1], is_speech=False
+                        audio=chunk_ts[0],
+                        timestamp_ms=chunk_ts[1],
+                        speech_probability=None,
                    )
                    for chunk_ts in result.queued_audio
                )
@@ -827,7 +829,7 @@ class PipelineRun:

            if wake_word_vad is not None:
                chunk_seconds = (len(chunk.audio) // sample_width) / sample_rate
-                if not wake_word_vad.process(chunk_seconds, chunk.is_speech):
+                if not wake_word_vad.process(chunk_seconds, chunk.speech_probability):
                    raise WakeWordTimeoutError(
                        code="wake-word-timeout", message="Wake word was not detected"
                    )
@@ -955,7 +957,7 @@ class PipelineRun:

            if stt_vad is not None:
                chunk_seconds = (len(chunk.audio) // sample_width) / sample_rate
-                if not stt_vad.process(chunk_seconds, chunk.is_speech):
+                if not stt_vad.process(chunk_seconds, chunk.speech_probability):
                    # Silence detected at the end of voice command
                    self.process_event(
                        PipelineEvent(
@@ -1221,7 +1223,7 @@ class PipelineRun:
                yield EnhancedAudioChunk(
                    audio=sub_chunk,
                    timestamp_ms=timestamp_ms,
-                    is_speech=None,  # no VAD
+                    speech_probability=None,  # no VAD
                )
                timestamp_ms += MS_PER_CHUNK

--- a/homeassistant/components/assist_pipeline/vad.py
+++ b/homeassistant/components/assist_pipeline/vad.py
@@ -75,7 +75,7 @@ class AudioBuffer:
 class VoiceCommandSegmenter:
    """Segments an audio stream into voice commands."""

-    speech_seconds: float = 0.3
+    speech_seconds: float = 0.1
    """Seconds of speech before voice command has started."""

    command_seconds: float = 1.0
@@ -96,6 +96,12 @@ class VoiceCommandSegmenter:
    timed_out: bool = False
    """True a timeout occurred during voice command."""

+    before_command_speech_threshold: float = 0.2
+    """Probability threshold for speech before voice command."""
+
+    in_command_speech_threshold: float = 0.5
+    """Probability threshold for speech during voice command."""
+
    _speech_seconds_left: float = 0.0
    """Seconds left before considering voice command as started."""

@@ -124,7 +130,7 @@ class VoiceCommandSegmenter:
        self._reset_seconds_left = self.reset_seconds
        self.in_command = False

-    def process(self, chunk_seconds: float, is_speech: bool | None) -> bool:
+    def process(self, chunk_seconds: float, speech_probability: float | None) -> bool:
        """Process samples using external VAD.

        Returns False when command is done.
@@ -142,7 +148,12 @@ class VoiceCommandSegmenter:
            self.timed_out = True
            return False

+        if speech_probability is None:
+            speech_probability = 0.0
+
        if not self.in_command:
+            # Before command
+            is_speech = speech_probability > self.before_command_speech_threshold
            if is_speech:
                self._reset_seconds_left = self.reset_seconds
                self._speech_seconds_left -= chunk_seconds
@@ -160,24 +171,29 @@ class VoiceCommandSegmenter:
                if self._reset_seconds_left <= 0:
                    self._speech_seconds_left = self.speech_seconds
                    self._reset_seconds_left = self.reset_seconds
-        elif not is_speech:
-            # Silence in command
-            self._reset_seconds_left = self.reset_seconds
-            self._silence_seconds_left -= chunk_seconds
-            self._command_seconds_left -= chunk_seconds
-            if (self._silence_seconds_left <= 0) and (self._command_seconds_left <= 0):
-                # Command finished successfully
-                self.reset()
-                _LOGGER.debug("Voice command finished")
-                return False
        else:
-            # Speech in command.
-            # Reset silence counter if enough speech.
-            self._reset_seconds_left -= chunk_seconds
-            self._command_seconds_left -= chunk_seconds
-            if self._reset_seconds_left <= 0:
-                self._silence_seconds_left = self.silence_seconds
+            # In command
+            is_speech = speech_probability > self.in_command_speech_threshold
+            if not is_speech:
+                # Silence in command
                self._reset_seconds_left = self.reset_seconds
+                self._silence_seconds_left -= chunk_seconds
+                self._command_seconds_left -= chunk_seconds
+                if (self._silence_seconds_left <= 0) and (
+                    self._command_seconds_left <= 0
+                ):
+                    # Command finished successfully
+                    self.reset()
+                    _LOGGER.debug("Voice command finished")
+                    return False
+            else:
+                # Speech in command.
+                # Reset silence counter if enough speech.
+                self._reset_seconds_left -= chunk_seconds
+                self._command_seconds_left -= chunk_seconds
+                if self._reset_seconds_left <= 0:
+                    self._silence_seconds_left = self.silence_seconds
+                    self._reset_seconds_left = self.reset_seconds

        return True

@@ -226,6 +242,9 @@ class VoiceActivityTimeout:
    reset_seconds: float = 0.5
    """Seconds of speech before resetting timeout."""

+    speech_threshold: float = 0.5
+    """Threshold for speech."""
+
    _silence_seconds_left: float = 0.0
    """Seconds left before considering voice command as stopped."""

@@ -241,12 +260,15 @@ class VoiceActivityTimeout:
        self._silence_seconds_left = self.silence_seconds
        self._reset_seconds_left = self.reset_seconds

-    def process(self, chunk_seconds: float, is_speech: bool | None) -> bool:
+    def process(self, chunk_seconds: float, speech_probability: float | None) -> bool:
        """Process samples using external VAD.

        Returns False when timeout is reached.
        """
-        if is_speech:
+        if speech_probability is None:
+            speech_probability = 0.0
+
+        if speech_probability > self.speech_threshold:
            # Speech
            self._reset_seconds_left -= chunk_seconds
            if self._reset_seconds_left <= 0:
--- a/tests/components/assist_pipeline/test_vad.py
+++ b/tests/components/assist_pipeline/test_vad.py
@@ -16,7 +16,7 @@ def test_silence() -> None:
    segmenter = VoiceCommandSegmenter()

    # True return value indicates voice command has not finished
-    assert segmenter.process(_ONE_SECOND * 3, False)
+    assert segmenter.process(_ONE_SECOND * 3, 0.0)
    assert not segmenter.in_command


@@ -26,15 +26,15 @@ def test_speech() -> None:
    segmenter = VoiceCommandSegmenter()

    # silence
-    assert segmenter.process(_ONE_SECOND, False)
+    assert segmenter.process(_ONE_SECOND, 0.0)

    # "speech"
-    assert segmenter.process(_ONE_SECOND, True)
+    assert segmenter.process(_ONE_SECOND, 1.0)
    assert segmenter.in_command

    # silence
    # False return value indicates voice command is finished
-    assert not segmenter.process(_ONE_SECOND, False)
+    assert not segmenter.process(_ONE_SECOND, 0.0)
    assert not segmenter.in_command


@@ -112,19 +112,19 @@ def test_silence_seconds() -> None:
    segmenter = VoiceCommandSegmenter(silence_seconds=1.0)

    # silence
-    assert segmenter.process(_ONE_SECOND, False)
+    assert segmenter.process(_ONE_SECOND, 0.0)
    assert not segmenter.in_command

    # "speech"
-    assert segmenter.process(_ONE_SECOND, True)
+    assert segmenter.process(_ONE_SECOND, 1.0)
    assert segmenter.in_command

    # not enough silence to end
-    assert segmenter.process(_ONE_SECOND * 0.5, False)
+    assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
    assert segmenter.in_command

    # exactly enough silence now
-    assert not segmenter.process(_ONE_SECOND * 0.5, False)
+    assert not segmenter.process(_ONE_SECOND * 0.5, 0.0)
    assert not segmenter.in_command


@@ -134,27 +134,27 @@ def test_silence_reset() -> None:
    segmenter = VoiceCommandSegmenter(silence_seconds=1.0, reset_seconds=0.5)

    # silence
-    assert segmenter.process(_ONE_SECOND, False)
+    assert segmenter.process(_ONE_SECOND, 0.0)
    assert not segmenter.in_command

    # "speech"
-    assert segmenter.process(_ONE_SECOND, True)
+    assert segmenter.process(_ONE_SECOND, 1.0)
    assert segmenter.in_command

    # not enough silence to end
-    assert segmenter.process(_ONE_SECOND * 0.5, False)
+    assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
    assert segmenter.in_command

    # speech should reset silence detection
-    assert segmenter.process(_ONE_SECOND * 0.5, True)
+    assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
    assert segmenter.in_command

    # not enough silence to end
-    assert segmenter.process(_ONE_SECOND * 0.5, False)
+    assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
    assert segmenter.in_command

    # exactly enough silence now
-    assert not segmenter.process(_ONE_SECOND * 0.5, False)
+    assert not segmenter.process(_ONE_SECOND * 0.5, 0.0)
    assert not segmenter.in_command


@@ -166,23 +166,23 @@ def test_speech_reset() -> None:
    )

    # silence
-    assert segmenter.process(_ONE_SECOND, False)
+    assert segmenter.process(_ONE_SECOND, 0.0)
    assert not segmenter.in_command

    # not enough speech to start voice command
-    assert segmenter.process(_ONE_SECOND * 0.5, True)
+    assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
    assert not segmenter.in_command

    # silence should reset speech detection
-    assert segmenter.process(_ONE_SECOND, False)
+    assert segmenter.process(_ONE_SECOND, 0.0)
    assert not segmenter.in_command

    # not enough speech to start voice command
-    assert segmenter.process(_ONE_SECOND * 0.5, True)
+    assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
    assert not segmenter.in_command

    # exactly enough speech now
-    assert segmenter.process(_ONE_SECOND * 0.5, True)
+    assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
    assert segmenter.in_command


@@ -193,18 +193,18 @@ def test_timeout() -> None:

    # not enough to time out
    assert not segmenter.timed_out
-    assert segmenter.process(_ONE_SECOND * 0.5, False)
+    assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
    assert not segmenter.timed_out

    # enough to time out
-    assert not segmenter.process(_ONE_SECOND * 0.5, True)
+    assert not segmenter.process(_ONE_SECOND * 0.5, 1.0)
    assert segmenter.timed_out

    # flag resets with more audio
-    assert segmenter.process(_ONE_SECOND * 0.5, True)
+    assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
    assert not segmenter.timed_out

-    assert not segmenter.process(_ONE_SECOND * 0.5, False)
+    assert not segmenter.process(_ONE_SECOND * 0.5, 0.0)
    assert segmenter.timed_out


@@ -215,14 +215,38 @@ def test_command_seconds() -> None:
        command_seconds=3, speech_seconds=1, silence_seconds=1, reset_seconds=1
    )

-    assert segmenter.process(_ONE_SECOND, True)
+    assert segmenter.process(_ONE_SECOND, 1.0)

    # Silence counts towards total command length
-    assert segmenter.process(_ONE_SECOND * 0.5, False)
+    assert segmenter.process(_ONE_SECOND * 0.5, 0.0)

    # Enough to finish command now
-    assert segmenter.process(_ONE_SECOND, True)
-    assert segmenter.process(_ONE_SECOND * 0.5, False)
+    assert segmenter.process(_ONE_SECOND, 1.0)
+    assert segmenter.process(_ONE_SECOND * 0.5, 0.0)

    # Silence to finish
-    assert not segmenter.process(_ONE_SECOND * 0.5, False)
+    assert not segmenter.process(_ONE_SECOND * 0.5, 0.0)
+
+
+def test_speech_thresholds() -> None:
+    """Test before/in command speech thresholds."""
+
+    segmenter = VoiceCommandSegmenter(
+        before_command_speech_threshold=0.2,
+        in_command_speech_threshold=0.5,
+        command_seconds=2,
+        speech_seconds=1,
+        silence_seconds=1,
+    )
+
+    # Not high enough probability to trigger command
+    assert segmenter.process(_ONE_SECOND, 0.1)
+    assert not segmenter.in_command
+
+    # Triggers command
+    assert segmenter.process(_ONE_SECOND, 0.3)
+    assert segmenter.in_command
+
+    # Now that same probability is considered silence.
+    # Finishes command.
+    assert not segmenter.process(_ONE_SECOND, 0.3)