Use different VAD thresholds for before and during voice command (#129848)

* Use two VAD thresholds

* Fix VoiceActivityTimeout class

* Update homeassistant/components/assist_pipeline/audio_enhancer.py

---------

Co-authored-by: Joost Lekkerkerker <joostlek@outlook.com>
This commit is contained in:
Michael Hansen 2024-11-05 08:01:45 -06:00 committed by GitHub
parent 6caa4baa00
commit 69e3348cd7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 108 additions and 60 deletions

View File

@ -22,8 +22,8 @@ class EnhancedAudioChunk:
timestamp_ms: int
"""Timestamp relative to start of audio stream (milliseconds)"""
is_speech: bool | None
"""True if audio chunk likely contains speech, False if not, None if unknown"""
speech_probability: float | None
"""Probability that audio chunk contains speech (0-1), None if unknown"""
class AudioEnhancer(ABC):
@ -70,27 +70,27 @@ class MicroVadSpeexEnhancer(AudioEnhancer):
)
self.vad: MicroVad | None = None
self.threshold = 0.5
if self.is_vad_enabled:
self.vad = MicroVad()
_LOGGER.debug("Initialized microVAD with threshold=%s", self.threshold)
_LOGGER.debug("Initialized microVAD")
def enhance_chunk(self, audio: bytes, timestamp_ms: int) -> EnhancedAudioChunk:
"""Enhance 10ms chunk of PCM audio @ 16Khz with 16-bit mono samples."""
is_speech: bool | None = None
speech_probability: float | None = None
assert len(audio) == BYTES_PER_CHUNK
if self.vad is not None:
# Run VAD
speech_prob = self.vad.Process10ms(audio)
is_speech = speech_prob > self.threshold
speech_probability = self.vad.Process10ms(audio)
if self.audio_processor is not None:
# Run noise suppression and auto gain
audio = self.audio_processor.Process10ms(audio).audio
return EnhancedAudioChunk(
audio=audio, timestamp_ms=timestamp_ms, is_speech=is_speech
audio=audio,
timestamp_ms=timestamp_ms,
speech_probability=speech_probability,
)

View File

@ -780,7 +780,9 @@ class PipelineRun:
# speaking the voice command.
audio_chunks_for_stt.extend(
EnhancedAudioChunk(
audio=chunk_ts[0], timestamp_ms=chunk_ts[1], is_speech=False
audio=chunk_ts[0],
timestamp_ms=chunk_ts[1],
speech_probability=None,
)
for chunk_ts in result.queued_audio
)
@ -827,7 +829,7 @@ class PipelineRun:
if wake_word_vad is not None:
chunk_seconds = (len(chunk.audio) // sample_width) / sample_rate
if not wake_word_vad.process(chunk_seconds, chunk.is_speech):
if not wake_word_vad.process(chunk_seconds, chunk.speech_probability):
raise WakeWordTimeoutError(
code="wake-word-timeout", message="Wake word was not detected"
)
@ -955,7 +957,7 @@ class PipelineRun:
if stt_vad is not None:
chunk_seconds = (len(chunk.audio) // sample_width) / sample_rate
if not stt_vad.process(chunk_seconds, chunk.is_speech):
if not stt_vad.process(chunk_seconds, chunk.speech_probability):
# Silence detected at the end of voice command
self.process_event(
PipelineEvent(
@ -1221,7 +1223,7 @@ class PipelineRun:
yield EnhancedAudioChunk(
audio=sub_chunk,
timestamp_ms=timestamp_ms,
is_speech=None, # no VAD
speech_probability=None, # no VAD
)
timestamp_ms += MS_PER_CHUNK

View File

@ -75,7 +75,7 @@ class AudioBuffer:
class VoiceCommandSegmenter:
"""Segments an audio stream into voice commands."""
speech_seconds: float = 0.3
speech_seconds: float = 0.1
"""Seconds of speech before voice command has started."""
command_seconds: float = 1.0
@ -96,6 +96,12 @@ class VoiceCommandSegmenter:
timed_out: bool = False
"""True if a timeout occurred during voice command."""
before_command_speech_threshold: float = 0.2
"""Probability threshold for speech before voice command."""
in_command_speech_threshold: float = 0.5
"""Probability threshold for speech during voice command."""
_speech_seconds_left: float = 0.0
"""Seconds left before considering voice command as started."""
@ -124,7 +130,7 @@ class VoiceCommandSegmenter:
self._reset_seconds_left = self.reset_seconds
self.in_command = False
def process(self, chunk_seconds: float, is_speech: bool | None) -> bool:
def process(self, chunk_seconds: float, speech_probability: float | None) -> bool:
"""Process samples using external VAD.
Returns False when command is done.
@ -142,7 +148,12 @@ class VoiceCommandSegmenter:
self.timed_out = True
return False
if speech_probability is None:
speech_probability = 0.0
if not self.in_command:
# Before command
is_speech = speech_probability > self.before_command_speech_threshold
if is_speech:
self._reset_seconds_left = self.reset_seconds
self._speech_seconds_left -= chunk_seconds
@ -160,24 +171,29 @@ class VoiceCommandSegmenter:
if self._reset_seconds_left <= 0:
self._speech_seconds_left = self.speech_seconds
self._reset_seconds_left = self.reset_seconds
elif not is_speech:
# Silence in command
self._reset_seconds_left = self.reset_seconds
self._silence_seconds_left -= chunk_seconds
self._command_seconds_left -= chunk_seconds
if (self._silence_seconds_left <= 0) and (self._command_seconds_left <= 0):
# Command finished successfully
self.reset()
_LOGGER.debug("Voice command finished")
return False
else:
# Speech in command.
# Reset silence counter if enough speech.
self._reset_seconds_left -= chunk_seconds
self._command_seconds_left -= chunk_seconds
if self._reset_seconds_left <= 0:
self._silence_seconds_left = self.silence_seconds
# In command
is_speech = speech_probability > self.in_command_speech_threshold
if not is_speech:
# Silence in command
self._reset_seconds_left = self.reset_seconds
self._silence_seconds_left -= chunk_seconds
self._command_seconds_left -= chunk_seconds
if (self._silence_seconds_left <= 0) and (
self._command_seconds_left <= 0
):
# Command finished successfully
self.reset()
_LOGGER.debug("Voice command finished")
return False
else:
# Speech in command.
# Reset silence counter if enough speech.
self._reset_seconds_left -= chunk_seconds
self._command_seconds_left -= chunk_seconds
if self._reset_seconds_left <= 0:
self._silence_seconds_left = self.silence_seconds
self._reset_seconds_left = self.reset_seconds
return True
@ -226,6 +242,9 @@ class VoiceActivityTimeout:
reset_seconds: float = 0.5
"""Seconds of speech before resetting timeout."""
speech_threshold: float = 0.5
"""Threshold for speech."""
_silence_seconds_left: float = 0.0
"""Seconds left before considering voice command as stopped."""
@ -241,12 +260,15 @@ class VoiceActivityTimeout:
self._silence_seconds_left = self.silence_seconds
self._reset_seconds_left = self.reset_seconds
def process(self, chunk_seconds: float, is_speech: bool | None) -> bool:
def process(self, chunk_seconds: float, speech_probability: float | None) -> bool:
"""Process samples using external VAD.
Returns False when timeout is reached.
"""
if is_speech:
if speech_probability is None:
speech_probability = 0.0
if speech_probability > self.speech_threshold:
# Speech
self._reset_seconds_left -= chunk_seconds
if self._reset_seconds_left <= 0:

View File

@ -16,7 +16,7 @@ def test_silence() -> None:
segmenter = VoiceCommandSegmenter()
# True return value indicates voice command has not finished
assert segmenter.process(_ONE_SECOND * 3, False)
assert segmenter.process(_ONE_SECOND * 3, 0.0)
assert not segmenter.in_command
@ -26,15 +26,15 @@ def test_speech() -> None:
segmenter = VoiceCommandSegmenter()
# silence
assert segmenter.process(_ONE_SECOND, False)
assert segmenter.process(_ONE_SECOND, 0.0)
# "speech"
assert segmenter.process(_ONE_SECOND, True)
assert segmenter.process(_ONE_SECOND, 1.0)
assert segmenter.in_command
# silence
# False return value indicates voice command is finished
assert not segmenter.process(_ONE_SECOND, False)
assert not segmenter.process(_ONE_SECOND, 0.0)
assert not segmenter.in_command
@ -112,19 +112,19 @@ def test_silence_seconds() -> None:
segmenter = VoiceCommandSegmenter(silence_seconds=1.0)
# silence
assert segmenter.process(_ONE_SECOND, False)
assert segmenter.process(_ONE_SECOND, 0.0)
assert not segmenter.in_command
# "speech"
assert segmenter.process(_ONE_SECOND, True)
assert segmenter.process(_ONE_SECOND, 1.0)
assert segmenter.in_command
# not enough silence to end
assert segmenter.process(_ONE_SECOND * 0.5, False)
assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
assert segmenter.in_command
# exactly enough silence now
assert not segmenter.process(_ONE_SECOND * 0.5, False)
assert not segmenter.process(_ONE_SECOND * 0.5, 0.0)
assert not segmenter.in_command
@ -134,27 +134,27 @@ def test_silence_reset() -> None:
segmenter = VoiceCommandSegmenter(silence_seconds=1.0, reset_seconds=0.5)
# silence
assert segmenter.process(_ONE_SECOND, False)
assert segmenter.process(_ONE_SECOND, 0.0)
assert not segmenter.in_command
# "speech"
assert segmenter.process(_ONE_SECOND, True)
assert segmenter.process(_ONE_SECOND, 1.0)
assert segmenter.in_command
# not enough silence to end
assert segmenter.process(_ONE_SECOND * 0.5, False)
assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
assert segmenter.in_command
# speech should reset silence detection
assert segmenter.process(_ONE_SECOND * 0.5, True)
assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
assert segmenter.in_command
# not enough silence to end
assert segmenter.process(_ONE_SECOND * 0.5, False)
assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
assert segmenter.in_command
# exactly enough silence now
assert not segmenter.process(_ONE_SECOND * 0.5, False)
assert not segmenter.process(_ONE_SECOND * 0.5, 0.0)
assert not segmenter.in_command
@ -166,23 +166,23 @@ def test_speech_reset() -> None:
)
# silence
assert segmenter.process(_ONE_SECOND, False)
assert segmenter.process(_ONE_SECOND, 0.0)
assert not segmenter.in_command
# not enough speech to start voice command
assert segmenter.process(_ONE_SECOND * 0.5, True)
assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
assert not segmenter.in_command
# silence should reset speech detection
assert segmenter.process(_ONE_SECOND, False)
assert segmenter.process(_ONE_SECOND, 0.0)
assert not segmenter.in_command
# not enough speech to start voice command
assert segmenter.process(_ONE_SECOND * 0.5, True)
assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
assert not segmenter.in_command
# exactly enough speech now
assert segmenter.process(_ONE_SECOND * 0.5, True)
assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
assert segmenter.in_command
@ -193,18 +193,18 @@ def test_timeout() -> None:
# not enough to time out
assert not segmenter.timed_out
assert segmenter.process(_ONE_SECOND * 0.5, False)
assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
assert not segmenter.timed_out
# enough to time out
assert not segmenter.process(_ONE_SECOND * 0.5, True)
assert not segmenter.process(_ONE_SECOND * 0.5, 1.0)
assert segmenter.timed_out
# flag resets with more audio
assert segmenter.process(_ONE_SECOND * 0.5, True)
assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
assert not segmenter.timed_out
assert not segmenter.process(_ONE_SECOND * 0.5, False)
assert not segmenter.process(_ONE_SECOND * 0.5, 0.0)
assert segmenter.timed_out
@ -215,14 +215,38 @@ def test_command_seconds() -> None:
command_seconds=3, speech_seconds=1, silence_seconds=1, reset_seconds=1
)
assert segmenter.process(_ONE_SECOND, True)
assert segmenter.process(_ONE_SECOND, 1.0)
# Silence counts towards total command length
assert segmenter.process(_ONE_SECOND * 0.5, False)
assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
# Enough to finish command now
assert segmenter.process(_ONE_SECOND, True)
assert segmenter.process(_ONE_SECOND * 0.5, False)
assert segmenter.process(_ONE_SECOND, 1.0)
assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
# Silence to finish
assert not segmenter.process(_ONE_SECOND * 0.5, False)
assert not segmenter.process(_ONE_SECOND * 0.5, 0.0)
def test_speech_thresholds() -> None:
    """Verify that separate probability thresholds apply before and during a command."""
    vad_segmenter = VoiceCommandSegmenter(
        command_seconds=2,
        speech_seconds=1,
        silence_seconds=1,
        before_command_speech_threshold=0.2,
        in_command_speech_threshold=0.5,
    )

    # A probability of 0.1 does not exceed the before-command threshold (0.2),
    # so processing continues and no command has started.
    keep_going = vad_segmenter.process(_ONE_SECOND, 0.1)
    assert keep_going
    assert not vad_segmenter.in_command

    # A probability of 0.3 exceeds the before-command threshold: command starts.
    keep_going = vad_segmenter.process(_ONE_SECOND, 0.3)
    assert keep_going
    assert vad_segmenter.in_command

    # Inside the command, 0.3 does not exceed the in-command threshold (0.5),
    # so the same probability is now treated as silence and process() reports
    # that the command is done.
    keep_going = vad_segmenter.process(_ONE_SECOND, 0.3)
    assert not keep_going