core/homeassistant/components/assist_pipeline/audio_enhancer.py

"""Audio enhancement for Assist."""

from abc import ABC, abstractmethod
from dataclasses import dataclass
import logging
import math

from pysilero_vad import SileroVoiceActivityDetector
from pyspeex_noise import AudioProcessor

from .const import BYTES_PER_CHUNK

_LOGGER = logging.getLogger(__name__)


@dataclass(frozen=True, slots=True)
class EnhancedAudioChunk:
    """Enhanced audio chunk and metadata."""

    audio: bytes
    """Raw PCM audio @ 16Khz with 16-bit mono samples"""

    timestamp_ms: int
    """Timestamp relative to start of audio stream (milliseconds)"""

    speech_probability: float | None
    """Probability that audio chunk contains speech (0-1), None if unknown"""


class AudioEnhancer(ABC):
    """Abstract interface for components that enhance audio chunks.

    The constructor only records the requested settings; how they are
    interpreted is left to the concrete subclass.
    """

    def __init__(
        self,
        auto_gain: int,
        noise_suppression: int,
        is_vad_enabled: bool,
    ) -> None:
        """Store the enhancement settings on the instance.

        Args:
            auto_gain: Requested auto gain setting.
            noise_suppression: Requested noise suppression setting.
            is_vad_enabled: Whether voice activity detection is requested.
        """
        self.is_vad_enabled = is_vad_enabled
        self.noise_suppression = noise_suppression
        self.auto_gain = auto_gain

    @abstractmethod
    def enhance_chunk(self, audio: bytes, timestamp_ms: int) -> EnhancedAudioChunk:
        """Enhance one chunk of 16Khz, 16-bit mono PCM audio.

        Subclasses must implement this and return the enhanced audio
        together with its metadata.
        """


class SileroVadSpeexEnhancer(AudioEnhancer):
    """Audio enhancer that runs Silero VAD and speex.

    Speex performs noise suppression and auto gain on each incoming chunk,
    while Silero VAD estimates the probability that the audio contains speech.
    Either stage is skipped when the constructor arguments disable it.
    """

    def __init__(
        self, auto_gain: int, noise_suppression: int, is_vad_enabled: bool
    ) -> None:
        """Initialize audio enhancer.

        Args:
            auto_gain: Auto gain level (0-31).
            noise_suppression: Noise suppression level (0-4).
            is_vad_enabled: True to run Silero VAD on the incoming audio.

        Speex is only instantiated when at least one of ``auto_gain`` and
        ``noise_suppression`` is non-zero.
        """
        super().__init__(auto_gain, noise_suppression, is_vad_enabled)

        self.audio_processor: AudioProcessor | None = None

        # Scale from 0-4 to the value handed to speex
        self.noise_suppression = noise_suppression * -15

        # Scale from 0-31 to the value handed to speex
        self.auto_gain = auto_gain * 300

        if (self.auto_gain != 0) or (self.noise_suppression != 0):
            self.audio_processor = AudioProcessor(
                self.auto_gain, self.noise_suppression
            )
            _LOGGER.debug(
                "Initialized speex with auto_gain=%s, noise_suppression=%s",
                self.auto_gain,
                self.noise_suppression,
            )

        self.vad: SileroVoiceActivityDetector | None = None

        # We get 10ms chunks but Silero works on 32ms chunks, so we have to
        # buffer audio. The previous speech probability is used until enough
        # audio has been buffered.
        self._vad_buffer: bytearray | None = None
        self._vad_chunk_bytes = 0
        self._last_speech_probability: float | None = None

        if self.is_vad_enabled:
            self.vad = SileroVoiceActivityDetector()

            # Size of one VAD window in bytes; looked up once instead of on
            # every chunk.
            self._vad_chunk_bytes = self.vad.chunk_bytes()

            # FIFO of audio that has not been given to the VAD yet. It grows
            # by one incoming chunk at a time and shrinks by one VAD window
            # whenever enough audio is available.
            self._vad_buffer = bytearray()
            _LOGGER.debug("Initialized Silero VAD")

    def enhance_chunk(self, audio: bytes, timestamp_ms: int) -> EnhancedAudioChunk:
        """Enhance 10ms chunk of PCM audio @ 16Khz with 16-bit mono samples.

        The VAD is fed the audio as received (before speex processing). The
        returned speech probability is the most recent VAD result, or None if
        the VAD is disabled or has not seen a full window of audio yet.
        """
        assert len(audio) == BYTES_PER_CHUNK

        if self.vad is not None:
            # Run VAD
            assert self._vad_buffer is not None
            self._vad_buffer.extend(audio)

            # Incoming chunks are not a whole multiple of the VAD window, so
            # consume exactly one window at a time and keep the remainder at
            # the front of the buffer. This way the VAD sees every sample
            # exactly once, in order, with no gaps between windows.
            while len(self._vad_buffer) >= self._vad_chunk_bytes:
                self._last_speech_probability = self.vad.process_chunk(
                    self._vad_buffer[: self._vad_chunk_bytes]
                )
                del self._vad_buffer[: self._vad_chunk_bytes]

        if self.audio_processor is not None:
            # Run noise suppression and auto gain
            audio = self.audio_processor.Process10ms(audio).audio

        return EnhancedAudioChunk(
            audio=audio,
            timestamp_ms=timestamp_ms,
            speech_probability=self._last_speech_probability,
        )