Browse Source

silero_deactivity_detection

KoljaB 9 months ago
parent
commit
60f11672de
2 changed files with 16 additions and 4 deletions
  1. 3 1
      README.md
  2. 13 3
      RealtimeSTT/audio_recorder.py

+ 3 - 1
README.md

@@ -22,7 +22,7 @@ https://github.com/KoljaB/RealtimeSTT/assets/7604638/207cb9a2-4482-48e7-9d2b-072
 
 ### Updates
 
-Latest Version: v0.2.1
+Latest Version: v0.2.2
 
 See [release history](https://github.com/KoljaB/RealtimeSTT/releases).
 
@@ -333,6 +333,8 @@ When you initialize the `AudioToTextRecorder` class, you have various options to
 
 - **silero_use_onnx** (bool, default=False): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. Default is False. Recommended for faster performance.
 
+- **silero_deactivity_detection** (bool, default=False): Uses the Silero model instead of the default WebRTC VAD for end-of-speech detection. The Silero model is more robust against background noise and improves accuracy in noisy environments, at the cost of additional GPU resources. When False, the default WebRTC VAD is used, which is more sensitive but may continue recording longer because of background sounds.
+
 - **webrtc_sensitivity** (int, default=3): Sensitivity for the WebRTC Voice Activity Detection engine ranging from 0 (least aggressive / most sensitive) to 3 (most aggressive, least sensitive). Default is 3.
 
 - **post_speech_silence_duration** (float, default=0.2): Duration in seconds of silence that must follow speech before the recording is considered to be completed. This ensures that any brief pauses during speech don't prematurely end the recording.

+ 13 - 3
RealtimeSTT/audio_recorder.py

@@ -115,6 +115,7 @@ class AudioToTextRecorder:
                  # Voice activation parameters
                  silero_sensitivity: float = INIT_SILERO_SENSITIVITY,
                  silero_use_onnx: bool = False,
+                 silero_deactivity_detection: bool = False,
                  webrtc_sensitivity: int = INIT_WEBRTC_SENSITIVITY,
                  post_speech_silence_duration: float = (
                      INIT_POST_SPEECH_SILENCE_DURATION
@@ -228,6 +229,12 @@ class AudioToTextRecorder:
             pre-trained model from Silero in the ONNX (Open Neural Network
             Exchange) format instead of the PyTorch format. This is
             recommended for faster performance.
+        - silero_deactivity_detection (bool, default=False): Enables the Silero
+            model for end-of-speech detection. More robust against background
+            noise. Utilizes additional GPU resources but improves accuracy in
+            noisy environments. When False, uses the default WebRTC VAD,
+            which is more sensitive but may continue recording longer due
+            to background sounds.
         - webrtc_sensitivity (int, default=WEBRTC_SENSITIVITY): Sensitivity
             for the WebRTC Voice Activity Detection engine ranging from 0
             (least aggressive / most sensitive) to 3 (most aggressive,
@@ -381,6 +388,7 @@ class AudioToTextRecorder:
         self.silero_working = False
         self.speech_end_silence_start = 0
         self.silero_sensitivity = silero_sensitivity
+        self.silero_deactivity_detection = silero_deactivity_detection
         self.listen_start = 0
         self.spinner = spinner
         self.halo = None
@@ -1350,14 +1358,16 @@ class AudioToTextRecorder:
 
                     # Stop the recording if silence is detected after speech
                     if self.stop_recording_on_voice_deactivity:
+                        is_speech = (
+                            self._is_silero_speech(data) if self.silero_deactivity_detection
+                            else self._is_webrtc_speech(data, True)
+                        )
 
-                        if not self._is_webrtc_speech(data, True):
-
+                        if not is_speech:
                             # Voice deactivity was detected, so we start
                             # measuring silence time before stopping recording
                             if self.speech_end_silence_start == 0:
                                 self.speech_end_silence_start = time.time()
-
                         else:
                             self.speech_end_silence_start = 0