mirror of
https://github.com/home-assistant/core.git
synced 2025-07-15 17:27:10 +00:00
Use different VAD thresholds for before and during voice command (#129848)
* Use two VAD thresholds
* Fix VoiceActivityTimeout class
* Update homeassistant/components/assist_pipeline/audio_enhancer.py

---------

Co-authored-by: Joost Lekkerkerker <joostlek@outlook.com>
This commit is contained in:
parent
6caa4baa00
commit
69e3348cd7
@ -22,8 +22,8 @@ class EnhancedAudioChunk:
|
||||
timestamp_ms: int
|
||||
"""Timestamp relative to start of audio stream (milliseconds)"""
|
||||
|
||||
is_speech: bool | None
|
||||
"""True if audio chunk likely contains speech, False if not, None if unknown"""
|
||||
speech_probability: float | None
|
||||
"""Probability that audio chunk contains speech (0-1), None if unknown"""
|
||||
|
||||
|
||||
class AudioEnhancer(ABC):
|
||||
@ -70,27 +70,27 @@ class MicroVadSpeexEnhancer(AudioEnhancer):
|
||||
)
|
||||
|
||||
self.vad: MicroVad | None = None
|
||||
self.threshold = 0.5
|
||||
|
||||
if self.is_vad_enabled:
|
||||
self.vad = MicroVad()
|
||||
_LOGGER.debug("Initialized microVAD with threshold=%s", self.threshold)
|
||||
_LOGGER.debug("Initialized microVAD")
|
||||
|
||||
def enhance_chunk(self, audio: bytes, timestamp_ms: int) -> EnhancedAudioChunk:
|
||||
"""Enhance 10 ms chunk of PCM audio @ 16 kHz with 16-bit mono samples."""
|
||||
is_speech: bool | None = None
|
||||
speech_probability: float | None = None
|
||||
|
||||
assert len(audio) == BYTES_PER_CHUNK
|
||||
|
||||
if self.vad is not None:
|
||||
# Run VAD
|
||||
speech_prob = self.vad.Process10ms(audio)
|
||||
is_speech = speech_prob > self.threshold
|
||||
speech_probability = self.vad.Process10ms(audio)
|
||||
|
||||
if self.audio_processor is not None:
|
||||
# Run noise suppression and auto gain
|
||||
audio = self.audio_processor.Process10ms(audio).audio
|
||||
|
||||
return EnhancedAudioChunk(
|
||||
audio=audio, timestamp_ms=timestamp_ms, is_speech=is_speech
|
||||
audio=audio,
|
||||
timestamp_ms=timestamp_ms,
|
||||
speech_probability=speech_probability,
|
||||
)
|
||||
|
@ -780,7 +780,9 @@ class PipelineRun:
|
||||
# speaking the voice command.
|
||||
audio_chunks_for_stt.extend(
|
||||
EnhancedAudioChunk(
|
||||
audio=chunk_ts[0], timestamp_ms=chunk_ts[1], is_speech=False
|
||||
audio=chunk_ts[0],
|
||||
timestamp_ms=chunk_ts[1],
|
||||
speech_probability=None,
|
||||
)
|
||||
for chunk_ts in result.queued_audio
|
||||
)
|
||||
@ -827,7 +829,7 @@ class PipelineRun:
|
||||
|
||||
if wake_word_vad is not None:
|
||||
chunk_seconds = (len(chunk.audio) // sample_width) / sample_rate
|
||||
if not wake_word_vad.process(chunk_seconds, chunk.is_speech):
|
||||
if not wake_word_vad.process(chunk_seconds, chunk.speech_probability):
|
||||
raise WakeWordTimeoutError(
|
||||
code="wake-word-timeout", message="Wake word was not detected"
|
||||
)
|
||||
@ -955,7 +957,7 @@ class PipelineRun:
|
||||
|
||||
if stt_vad is not None:
|
||||
chunk_seconds = (len(chunk.audio) // sample_width) / sample_rate
|
||||
if not stt_vad.process(chunk_seconds, chunk.is_speech):
|
||||
if not stt_vad.process(chunk_seconds, chunk.speech_probability):
|
||||
# Silence detected at the end of voice command
|
||||
self.process_event(
|
||||
PipelineEvent(
|
||||
@ -1221,7 +1223,7 @@ class PipelineRun:
|
||||
yield EnhancedAudioChunk(
|
||||
audio=sub_chunk,
|
||||
timestamp_ms=timestamp_ms,
|
||||
is_speech=None, # no VAD
|
||||
speech_probability=None, # no VAD
|
||||
)
|
||||
timestamp_ms += MS_PER_CHUNK
|
||||
|
||||
|
@ -75,7 +75,7 @@ class AudioBuffer:
|
||||
class VoiceCommandSegmenter:
|
||||
"""Segments an audio stream into voice commands."""
|
||||
|
||||
speech_seconds: float = 0.3
|
||||
speech_seconds: float = 0.1
|
||||
"""Seconds of speech before voice command has started."""
|
||||
|
||||
command_seconds: float = 1.0
|
||||
@ -96,6 +96,12 @@ class VoiceCommandSegmenter:
|
||||
timed_out: bool = False
|
||||
"""True if a timeout occurred during voice command."""
|
||||
|
||||
before_command_speech_threshold: float = 0.2
|
||||
"""Probability threshold for speech before voice command."""
|
||||
|
||||
in_command_speech_threshold: float = 0.5
|
||||
"""Probability threshold for speech during voice command."""
|
||||
|
||||
_speech_seconds_left: float = 0.0
|
||||
"""Seconds left before considering voice command as started."""
|
||||
|
||||
@ -124,7 +130,7 @@ class VoiceCommandSegmenter:
|
||||
self._reset_seconds_left = self.reset_seconds
|
||||
self.in_command = False
|
||||
|
||||
def process(self, chunk_seconds: float, is_speech: bool | None) -> bool:
|
||||
def process(self, chunk_seconds: float, speech_probability: float | None) -> bool:
|
||||
"""Process samples using external VAD.
|
||||
|
||||
Returns False when command is done.
|
||||
@ -142,7 +148,12 @@ class VoiceCommandSegmenter:
|
||||
self.timed_out = True
|
||||
return False
|
||||
|
||||
if speech_probability is None:
|
||||
speech_probability = 0.0
|
||||
|
||||
if not self.in_command:
|
||||
# Before command
|
||||
is_speech = speech_probability > self.before_command_speech_threshold
|
||||
if is_speech:
|
||||
self._reset_seconds_left = self.reset_seconds
|
||||
self._speech_seconds_left -= chunk_seconds
|
||||
@ -160,24 +171,29 @@ class VoiceCommandSegmenter:
|
||||
if self._reset_seconds_left <= 0:
|
||||
self._speech_seconds_left = self.speech_seconds
|
||||
self._reset_seconds_left = self.reset_seconds
|
||||
elif not is_speech:
|
||||
# Silence in command
|
||||
self._reset_seconds_left = self.reset_seconds
|
||||
self._silence_seconds_left -= chunk_seconds
|
||||
self._command_seconds_left -= chunk_seconds
|
||||
if (self._silence_seconds_left <= 0) and (self._command_seconds_left <= 0):
|
||||
# Command finished successfully
|
||||
self.reset()
|
||||
_LOGGER.debug("Voice command finished")
|
||||
return False
|
||||
else:
|
||||
# Speech in command.
|
||||
# Reset silence counter if enough speech.
|
||||
self._reset_seconds_left -= chunk_seconds
|
||||
self._command_seconds_left -= chunk_seconds
|
||||
if self._reset_seconds_left <= 0:
|
||||
self._silence_seconds_left = self.silence_seconds
|
||||
# In command
|
||||
is_speech = speech_probability > self.in_command_speech_threshold
|
||||
if not is_speech:
|
||||
# Silence in command
|
||||
self._reset_seconds_left = self.reset_seconds
|
||||
self._silence_seconds_left -= chunk_seconds
|
||||
self._command_seconds_left -= chunk_seconds
|
||||
if (self._silence_seconds_left <= 0) and (
|
||||
self._command_seconds_left <= 0
|
||||
):
|
||||
# Command finished successfully
|
||||
self.reset()
|
||||
_LOGGER.debug("Voice command finished")
|
||||
return False
|
||||
else:
|
||||
# Speech in command.
|
||||
# Reset silence counter if enough speech.
|
||||
self._reset_seconds_left -= chunk_seconds
|
||||
self._command_seconds_left -= chunk_seconds
|
||||
if self._reset_seconds_left <= 0:
|
||||
self._silence_seconds_left = self.silence_seconds
|
||||
self._reset_seconds_left = self.reset_seconds
|
||||
|
||||
return True
|
||||
|
||||
@ -226,6 +242,9 @@ class VoiceActivityTimeout:
|
||||
reset_seconds: float = 0.5
|
||||
"""Seconds of speech before resetting timeout."""
|
||||
|
||||
speech_threshold: float = 0.5
|
||||
"""Threshold for speech."""
|
||||
|
||||
_silence_seconds_left: float = 0.0
|
||||
"""Seconds left before considering voice command as stopped."""
|
||||
|
||||
@ -241,12 +260,15 @@ class VoiceActivityTimeout:
|
||||
self._silence_seconds_left = self.silence_seconds
|
||||
self._reset_seconds_left = self.reset_seconds
|
||||
|
||||
def process(self, chunk_seconds: float, is_speech: bool | None) -> bool:
|
||||
def process(self, chunk_seconds: float, speech_probability: float | None) -> bool:
|
||||
"""Process samples using external VAD.
|
||||
|
||||
Returns False when timeout is reached.
|
||||
"""
|
||||
if is_speech:
|
||||
if speech_probability is None:
|
||||
speech_probability = 0.0
|
||||
|
||||
if speech_probability > self.speech_threshold:
|
||||
# Speech
|
||||
self._reset_seconds_left -= chunk_seconds
|
||||
if self._reset_seconds_left <= 0:
|
||||
|
@ -16,7 +16,7 @@ def test_silence() -> None:
|
||||
segmenter = VoiceCommandSegmenter()
|
||||
|
||||
# True return value indicates voice command has not finished
|
||||
assert segmenter.process(_ONE_SECOND * 3, False)
|
||||
assert segmenter.process(_ONE_SECOND * 3, 0.0)
|
||||
assert not segmenter.in_command
|
||||
|
||||
|
||||
@ -26,15 +26,15 @@ def test_speech() -> None:
|
||||
segmenter = VoiceCommandSegmenter()
|
||||
|
||||
# silence
|
||||
assert segmenter.process(_ONE_SECOND, False)
|
||||
assert segmenter.process(_ONE_SECOND, 0.0)
|
||||
|
||||
# "speech"
|
||||
assert segmenter.process(_ONE_SECOND, True)
|
||||
assert segmenter.process(_ONE_SECOND, 1.0)
|
||||
assert segmenter.in_command
|
||||
|
||||
# silence
|
||||
# False return value indicates voice command is finished
|
||||
assert not segmenter.process(_ONE_SECOND, False)
|
||||
assert not segmenter.process(_ONE_SECOND, 0.0)
|
||||
assert not segmenter.in_command
|
||||
|
||||
|
||||
@ -112,19 +112,19 @@ def test_silence_seconds() -> None:
|
||||
segmenter = VoiceCommandSegmenter(silence_seconds=1.0)
|
||||
|
||||
# silence
|
||||
assert segmenter.process(_ONE_SECOND, False)
|
||||
assert segmenter.process(_ONE_SECOND, 0.0)
|
||||
assert not segmenter.in_command
|
||||
|
||||
# "speech"
|
||||
assert segmenter.process(_ONE_SECOND, True)
|
||||
assert segmenter.process(_ONE_SECOND, 1.0)
|
||||
assert segmenter.in_command
|
||||
|
||||
# not enough silence to end
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, False)
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
|
||||
assert segmenter.in_command
|
||||
|
||||
# exactly enough silence now
|
||||
assert not segmenter.process(_ONE_SECOND * 0.5, False)
|
||||
assert not segmenter.process(_ONE_SECOND * 0.5, 0.0)
|
||||
assert not segmenter.in_command
|
||||
|
||||
|
||||
@ -134,27 +134,27 @@ def test_silence_reset() -> None:
|
||||
segmenter = VoiceCommandSegmenter(silence_seconds=1.0, reset_seconds=0.5)
|
||||
|
||||
# silence
|
||||
assert segmenter.process(_ONE_SECOND, False)
|
||||
assert segmenter.process(_ONE_SECOND, 0.0)
|
||||
assert not segmenter.in_command
|
||||
|
||||
# "speech"
|
||||
assert segmenter.process(_ONE_SECOND, True)
|
||||
assert segmenter.process(_ONE_SECOND, 1.0)
|
||||
assert segmenter.in_command
|
||||
|
||||
# not enough silence to end
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, False)
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
|
||||
assert segmenter.in_command
|
||||
|
||||
# speech should reset silence detection
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, True)
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
|
||||
assert segmenter.in_command
|
||||
|
||||
# not enough silence to end
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, False)
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
|
||||
assert segmenter.in_command
|
||||
|
||||
# exactly enough silence now
|
||||
assert not segmenter.process(_ONE_SECOND * 0.5, False)
|
||||
assert not segmenter.process(_ONE_SECOND * 0.5, 0.0)
|
||||
assert not segmenter.in_command
|
||||
|
||||
|
||||
@ -166,23 +166,23 @@ def test_speech_reset() -> None:
|
||||
)
|
||||
|
||||
# silence
|
||||
assert segmenter.process(_ONE_SECOND, False)
|
||||
assert segmenter.process(_ONE_SECOND, 0.0)
|
||||
assert not segmenter.in_command
|
||||
|
||||
# not enough speech to start voice command
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, True)
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
|
||||
assert not segmenter.in_command
|
||||
|
||||
# silence should reset speech detection
|
||||
assert segmenter.process(_ONE_SECOND, False)
|
||||
assert segmenter.process(_ONE_SECOND, 0.0)
|
||||
assert not segmenter.in_command
|
||||
|
||||
# not enough speech to start voice command
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, True)
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
|
||||
assert not segmenter.in_command
|
||||
|
||||
# exactly enough speech now
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, True)
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
|
||||
assert segmenter.in_command
|
||||
|
||||
|
||||
@ -193,18 +193,18 @@ def test_timeout() -> None:
|
||||
|
||||
# not enough to time out
|
||||
assert not segmenter.timed_out
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, False)
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
|
||||
assert not segmenter.timed_out
|
||||
|
||||
# enough to time out
|
||||
assert not segmenter.process(_ONE_SECOND * 0.5, True)
|
||||
assert not segmenter.process(_ONE_SECOND * 0.5, 1.0)
|
||||
assert segmenter.timed_out
|
||||
|
||||
# flag resets with more audio
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, True)
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, 1.0)
|
||||
assert not segmenter.timed_out
|
||||
|
||||
assert not segmenter.process(_ONE_SECOND * 0.5, False)
|
||||
assert not segmenter.process(_ONE_SECOND * 0.5, 0.0)
|
||||
assert segmenter.timed_out
|
||||
|
||||
|
||||
@ -215,14 +215,38 @@ def test_command_seconds() -> None:
|
||||
command_seconds=3, speech_seconds=1, silence_seconds=1, reset_seconds=1
|
||||
)
|
||||
|
||||
assert segmenter.process(_ONE_SECOND, True)
|
||||
assert segmenter.process(_ONE_SECOND, 1.0)
|
||||
|
||||
# Silence counts towards total command length
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, False)
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
|
||||
|
||||
# Enough to finish command now
|
||||
assert segmenter.process(_ONE_SECOND, True)
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, False)
|
||||
assert segmenter.process(_ONE_SECOND, 1.0)
|
||||
assert segmenter.process(_ONE_SECOND * 0.5, 0.0)
|
||||
|
||||
# Silence to finish
|
||||
assert not segmenter.process(_ONE_SECOND * 0.5, False)
|
||||
assert not segmenter.process(_ONE_SECOND * 0.5, 0.0)
|
||||
|
||||
|
||||
def test_speech_thresholds() -> None:
|
||||
"""Test before/in command speech thresholds."""
|
||||
|
||||
segmenter = VoiceCommandSegmenter(
|
||||
before_command_speech_threshold=0.2,
|
||||
in_command_speech_threshold=0.5,
|
||||
command_seconds=2,
|
||||
speech_seconds=1,
|
||||
silence_seconds=1,
|
||||
)
|
||||
|
||||
# Not high enough probability to trigger command
|
||||
assert segmenter.process(_ONE_SECOND, 0.1)
|
||||
assert not segmenter.in_command
|
||||
|
||||
# Triggers command
|
||||
assert segmenter.process(_ONE_SECOND, 0.3)
|
||||
assert segmenter.in_command
|
||||
|
||||
# Now that same probability is considered silence.
|
||||
# Finishes command.
|
||||
assert not segmenter.process(_ONE_SECOND, 0.3)
|
||||
|
Loading…
x
Reference in New Issue
Block a user