
Recording and transcribing moved to separate process

Kolja Beigel, 1 year ago
parent
commit
1d9c08f1da

+ 22 - 2
README.md

@@ -14,6 +14,14 @@ It's ideal for:
 
 https://github.com/KoljaB/RealtimeSTT/assets/7604638/207cb9a2-4482-48e7-9d2b-0722c3ee6d14
 
+### Updates
+
+#### v0.1.5
+    - Bugfix for detection of short speech right after sentence detection (the problem mentioned in the video)
+    - Main transcription and recording moved into separate process contexts with multiprocessing
+
+> **Hint:** *Since RealtimeSTT now uses the `multiprocessing` module, make sure to include the `if __name__ == '__main__':` guard in your code to prevent unexpected behavior, especially on platforms like Windows. For a detailed explanation of why this is important, see the [official Python documentation on `multiprocessing`](https://docs.python.org/3/library/multiprocessing.html#multiprocessing-programming).*
+
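A minimal sketch of what that guard looks like in practice, mirroring `tests/simple_test.py` from this commit (the recorder construction and `text()` call are the same as in the basic-usage example further down):

```python
from RealtimeSTT import AudioToTextRecorder

if __name__ == '__main__':
    # Without this guard, the worker processes spawned via multiprocessing
    # would re-import and re-execute this module-level code on Windows.
    recorder = AudioToTextRecorder(spinner=False)
    print("Say something...")
    while True:
        print(recorder.text(), end=" ", flush=True)
```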
 ### Features
 
 - **Voice Activity Detection**: Automatically detects when you start and stop speaking.
@@ -123,7 +131,17 @@ Recording based on voice activity detection.
 ```python
 recorder = AudioToTextRecorder()
 print(recorder.text())
-```  
+```
+
+When calling `recorder.text` in a loop, it is recommended to pass a callback so that transcription runs asynchronously:
+
+```python
+def process_text(text):
+    print(text)
+
+while True:
+    recorder.text(process_text)
+```
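Combined with the `__main__` guard mentioned above, a complete loop might look like this sketch, modeled on the updated `tests/realtimestt_test.py` in this commit:

```python
from RealtimeSTT import AudioToTextRecorder

def process_text(text):
    print(text)

if __name__ == '__main__':
    recorder = AudioToTextRecorder()
    print("Say something...")
    while True:
        # text() blocks while a sentence is being recorded, then invokes
        # process_text with the finished transcription.
        recorder.text(process_text)
```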
 
 ### Wakewords
 
@@ -233,7 +251,9 @@ When you initialize the `AudioToTextRecorder` class, you have various options to
 
 - **silero_sensitivity** (float, default=0.6): Sensitivity for Silero's voice activity detection ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.6.
 
-- **webrtc_sensitivity** (int, default=3): Sensitivity for the WebRTC Voice Activity Detection engine ranging from 1 (least sensitive) to 3 (most sensitive). Default is 3.
+- **silero_sensitivity** (float, default=0.6): Sensitivity for Silero's voice activity detection ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.6.
+
+- **silero_use_onnx** (bool, default=True): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. Default is True (recommended for faster performance).
 
 - **post_speech_silence_duration** (float, default=0.2): Duration in seconds of silence that must follow speech before the recording is considered to be completed. This ensures that any brief pauses during speech don't prematurely end the recording.
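A hedged sketch of how the new voice-activation options might be passed to the constructor (the values shown are the documented defaults, not recommendations):

```python
recorder = AudioToTextRecorder(
    silero_sensitivity=0.6,             # Silero VAD sensitivity, 0 (least) to 1 (most)
    silero_use_onnx=True,               # use the ONNX Silero model for faster inference
    post_speech_silence_duration=0.2,   # seconds of silence that end a recording
)
```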
 

+ 238 - 129
RealtimeSTT/audio_recorder.py

@@ -20,23 +20,24 @@ Author: Kolja Beigel
 
 """
 
-import pyaudio
-import collections
+from multiprocessing import Process, Pipe, Queue
 import faster_whisper
-import torch
+import collections
 import numpy as np
-import struct
 import pvporcupine
+import collections
+import traceback
 import threading
-import time
-import logging
 import webrtcvad
 import itertools
+import pyaudio
+import logging
+import struct
+import torch
+import halo
+import time
 import os
 import re
-import collections
-import halo
-import traceback
 
 INIT_MODEL_TRANSCRIPTION = "tiny"
 INIT_MODEL_TRANSCRIPTION_REALTIME = "tiny"
@@ -81,6 +82,7 @@ class AudioToTextRecorder:
 
                  # Voice activation parameters
                  silero_sensitivity: float = INIT_SILERO_SENSITIVITY,
+                 silero_use_onnx: bool = True,
                  webrtc_sensitivity: int = INIT_WEBRTC_SENSITIVITY,
                  post_speech_silence_duration: float = INIT_POST_SPEECH_SILENCE_DURATION,
                  min_length_of_recording: float = INIT_MIN_LENGTH_OF_RECORDING,
@@ -120,6 +122,7 @@ class AudioToTextRecorder:
         - on_realtime_transcription_update = A callback function that is triggered whenever there's an update in the real-time transcription. The function is called with the newly transcribed text as its argument.
         - on_realtime_transcription_stabilized = A callback function that is triggered when the transcribed text stabilizes in quality. The stabilized text is generally more accurate but may arrive with a slight delay compared to the regular real-time updates.
         - silero_sensitivity (float, default=SILERO_SENSITIVITY): Sensitivity for the Silero Voice Activity Detection model ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.5.
+        - silero_use_onnx (bool, default=True): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. This is recommended for faster performance.
         - webrtc_sensitivity (int, default=WEBRTC_SENSITIVITY): Sensitivity for the WebRTC Voice Activity Detection engine ranging from 0 (least aggressive / most sensitive) to 3 (most aggressive, least sensitive). Default is 3.
         - post_speech_silence_duration (float, default=0.2): Duration in seconds of silence that must follow speech before the recording is considered to be completed. This ensures that any brief pauses during speech don't prematurely end the recording.
         - min_gap_between_recordings (float, default=1.0): Specifies the minimum time interval in seconds that should exist between the end of one recording session and the beginning of another to prevent rapid consecutive recordings.
@@ -167,6 +170,7 @@ class AudioToTextRecorder:
         self.on_realtime_transcription_stabilized = on_realtime_transcription_stabilized
     
         self.level = level
+        self.audio_queue = Queue()
         self.buffer_size = BUFFER_SIZE
         self.sample_rate = SAMPLE_RATE
         self.recording_start_time = 0
@@ -186,22 +190,58 @@ class AudioToTextRecorder:
         self.realtime_stabilized_safetext = ""
         self.is_webrtc_speech_active = False
         self.is_silero_speech_active = False
+        self.recording_thread = None
+        self.realtime_thread = None
+        self.audio_interface = None
+        self.audio = None
+        self.stream = None
+        self.start_recording_event = threading.Event()
+        self.stop_recording_event = threading.Event()
+
 
         # Initialize the logging configuration with the specified level
-        logging.basicConfig(format='RealTimeSTT: %(name)s - %(levelname)s - %(message)s', level=level) # filename='audio_recorder.log'
+        log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
 
+        # Create a logger
+        logger = logging.getLogger()
+        logger.setLevel(level)  # Set the root logger's level
 
-        # Initialize the transcription model
-        try:
-            self.model = faster_whisper.WhisperModel(model_size_or_path=model, device='cuda' if torch.cuda.is_available() else 'cpu')
+        # Create a file handler and set its level
+        file_handler = logging.FileHandler('audio_recorder.log')
+        file_handler.setLevel(logging.DEBUG)
+        file_handler.setFormatter(logging.Formatter(log_format))
+
+        # Create a console handler and set its level
+        console_handler = logging.StreamHandler()
+        console_handler.setLevel(level)
+        console_handler.setFormatter(logging.Formatter(log_format))
 
-            if self.enable_realtime_transcription:
+        # Add the handlers to the logger
+        logger.addHandler(file_handler)
+        logger.addHandler(console_handler)
+
+
+        # start transcription process
+        self.parent_transcription_pipe, child_transcription_pipe = Pipe()
+        self.process = Process(target=AudioToTextRecorder._transcription_worker, args=(child_transcription_pipe, model))
+        self.process.start()
+
+        # start audio data reading process
+        reader_process = Process(target=AudioToTextRecorder._audio_data_worker, args=(self.audio_queue, self.sample_rate, self.buffer_size))
+        reader_process.start()
+
+
+        # Initialize the realtime transcription model
+        if self.enable_realtime_transcription:
+            try:
+                logging.info(f"Initializing faster_whisper realtime transcription model {self.realtime_model_type}")
                 self.realtime_model_type = faster_whisper.WhisperModel(model_size_or_path=self.realtime_model_type, device='cuda' if torch.cuda.is_available() else 'cpu')
 
+            except Exception as e:
+                logging.exception(f"Error initializing faster_whisper realtime transcription model: {e}")
+                raise
 
-        except Exception as e:
-            logging.exception(f"Error initializing faster_whisper transcription model: {e}")
-            raise            
+            logging.debug('Faster_whisper realtime speech to text transcription model initialized successfully')
 
 
         # Setup wake word detection
@@ -219,15 +259,7 @@ class AudioToTextRecorder:
                 logging.exception(f"Error initializing porcupine wake word detection engine: {e}")
                 raise
 
-
-        # Setup audio recording infrastructure
-        try:
-            self.audio = pyaudio.PyAudio()
-            self.stream = self.audio.open(rate=self.sample_rate, format=pyaudio.paInt16, channels=1, input=True, frames_per_buffer=self.buffer_size)
-
-        except Exception as e:
-            logging.exception(f"Error initializing pyaudio audio recording: {e}")
-            raise       
+            logging.debug('Porcupine wake word detection engine initialized successfully')
 
 
         # Setup voice activity detection model WebRTC
@@ -240,19 +272,25 @@ class AudioToTextRecorder:
             logging.exception(f"Error initializing WebRTC voice activity detection engine: {e}")
             raise       
 
+        logging.debug('WebRTC VAD voice activity detection engine initialized successfully')
+
 
         # Setup voice activity detection model Silero VAD
         try:
             self.silero_vad_model, _ = torch.hub.load(
                 repo_or_dir="snakers4/silero-vad",
                 model="silero_vad",
-                verbose=False
+                verbose=False,
+                onnx=silero_use_onnx
             )
 
         except Exception as e:
             logging.exception(f"Error initializing Silero VAD voice activity detection engine: {e}")
             raise       
 
+        logging.debug('Silero VAD voice activity detection engine initialized successfully')
+
+
         self.audio_buffer = collections.deque(maxlen=int((self.sample_rate // self.buffer_size) * self.pre_recording_buffer_duration))
         self.frames = []
 
@@ -272,66 +310,163 @@ class AudioToTextRecorder:
         self.realtime_thread.daemon = True
         self.realtime_thread.start()
 
-        logging.debug('Constructor finished')
+        logging.debug('RealtimeSTT initialization completed successfully')
+
+
+    @staticmethod
+    def _transcription_worker(conn, model_path):
+
+        logging.info(f"Initializing faster_whisper main transcription model {model_path}")
+
+        try:
+            model = faster_whisper.WhisperModel(
+                model_size_or_path=model_path,
+                device='cuda' if torch.cuda.is_available() else 'cpu'
+            )
+
+        except Exception as e:
+            logging.exception(f"Error initializing main faster_whisper transcription model: {e}")
+            raise
+
+        logging.debug('Faster_whisper main speech to text transcription model initialized successfully')
+
+        while True:
+            audio, language = conn.recv()
+            try:
+                segments = model.transcribe(audio, language=language if language else None)[0]
+                transcription = " ".join(seg.text for seg in segments).strip()
+                conn.send(('success', transcription))
+            except faster_whisper.WhisperError as e:
+                logging.error(f"Whisper transcription error: {e}")
+                conn.send(('error', str(e)))      
+            except Exception as e:
+                logging.error(f"General transcription error: {e}")
+                conn.send(('error', str(e)))
+
+
+    @staticmethod
+    def _audio_data_worker(audio_queue, sample_rate, buffer_size):
+
+        logging.info("Initializing audio recording (creating pyAudio input stream)")
+
+        try:
+            audio_interface = pyaudio.PyAudio()
+            stream = audio_interface.open(rate=sample_rate, format=pyaudio.paInt16, channels=1, input=True, frames_per_buffer=buffer_size)
+
+        except Exception as e:
+            logging.exception(f"Error initializing pyaudio audio recording: {e}")
+            raise       
+
+        logging.debug('Audio recording (pyAudio input stream) initialized successfully')
+   
+        while True:
+            try:
+                data = stream.read(buffer_size)
 
+            except OSError as e:
+                if e.errno == pyaudio.paInputOverflowed:
+                    logging.warning("Input overflowed. Frame dropped.")
+                else:
+                    logging.error(f"Error during recording: {e}")
+                tb_str = traceback.format_exc()
+                print (f"Traceback: {tb_str}")
+                print (f"Error: {e}")
+                continue
+
+            except Exception as e:
+                logging.error(f"Error during recording: {e}")
+                time.sleep(1)
+                tb_str = traceback.format_exc()
+                print (f"Traceback: {tb_str}")
+                print (f"Error: {e}")
+                continue
 
-    def text(self):
+            audio_queue.put(data)                
+
+
+    def wait_audio(self):
         """
-        Transcribes audio captured by the class instance using the `faster_whisper` model.
+        Waits for the start and completion of the audio recording process.
 
-        - Waits for voice activity if not yet started recording 
-        - Waits for voice deactivity if not yet stopped recording 
-        - Transcribes the recorded audio.
+        This method is responsible for:
+        - Waiting for voice activity to begin recording if not yet started.
+        - Waiting for voice inactivity to complete the recording.
+        - Setting the audio buffer from the recorded frames.
+        - Resetting recording-related attributes.
 
-        Returns:
-            str: The transcription of the recorded audio or an empty string in case of an error.
+        Side effects:
+        - Updates the state of the instance.
+        - Modifies the audio attribute to contain the processed audio data.
         """
 
         self.listen_start = time.time()
-        
-                
-        # If not yet started to record, wait for voice activity to initiate recording.
-        if not self.is_recording and len(self.frames) == 0:
+
+        # If not yet started recording, wait for voice activity to initiate.
+        if not self.is_recording and not self.frames:
             self._set_state("listening")
             self.start_recording_on_voice_activity = True
 
-            while not self.is_recording:
-                time.sleep(TIME_SLEEP)
+            # wait until recording starts
+            self.start_recording_event.wait()
 
-        # If still recording, wait for voice deactivity to finish recording.
+        # If recording is ongoing, wait for voice inactivity to finish recording.
         if self.is_recording:
-            self.stop_recording_on_voice_deactivity = True      
+            self.stop_recording_on_voice_deactivity = True
 
-            while self.is_recording:
-                time.sleep(TIME_SLEEP)
+            # wait until recording stops
+            self.stop_recording_event.wait()
 
-        # Convert the concatenated frames into text
-        try:
-            audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
-            audio_array = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
-            self.frames = []
+        # Convert recorded frames to the appropriate audio format.
+        audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+        self.audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+        self.frames.clear()
+
+        # Reset recording-related timestamps
+        self.recording_stop_time = 0
+        self.listen_start = 0
 
-            # perform transcription
-            transcription = " ".join(seg.text for seg in self.model.transcribe(audio_array, language=self.language if self.language else None)[0]).strip()
+        self._set_state("inactive")
 
-            self.recording_stop_time = 0
-            self.listen_start = 0
 
-            self._set_state("inactive")
 
-            return self._preprocess_output(transcription)
+    def transcribe(self):
+        self._set_state("transcribing")
+        self.parent_transcription_pipe.send((self.audio, self.language))
+        status, result = self.parent_transcription_pipe.recv()
         
-        except ValueError:
-            logging.error("Error converting audio buffer to numpy array.")
-            raise
+        self._set_state("inactive")
+        if status == 'success':
+            return self._preprocess_output(result)
+        else:
+            logging.error(result)
+            raise Exception(result)
 
-        except faster_whisper.WhisperError as e:
-            logging.error(f"Whisper transcription error: {e}")
-            raise
 
-        except Exception as e:
-            logging.error(f"General transcription error: {e}")
-            raise
+    def text(self,
+             on_transcription_finished = None,
+        ):
+        """
+        Transcribes audio captured by this class instance using the `faster_whisper` model.
+
+        - Automatically starts recording upon voice activity if not manually started using `recorder.start()`.
+        - Automatically stops recording upon voice deactivity if not manually stopped with `recorder.stop()`.
+        - Processes the recorded audio to generate transcription.
+
+        Args:
+            on_transcription_finished (callable, optional): Callback function to be executed when transcription is ready.
+                If provided, transcription will be performed asynchronously, and the callback will receive the transcription 
+                as its argument. If omitted, the transcription will be performed synchronously, and the result will be returned.
+
+        Returns (if no callback is set):
+            str: The transcription of the recorded audio
+        """
+
+        self.wait_audio()
+
+        if on_transcription_finished:
+            threading.Thread(target=on_transcription_finished, args=(self.transcribe(),)).start()
+        else:
+            return self.transcribe()
 
 
     def start(self):
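The reworked `text()` method supports both a blocking call and a callback-driven call. A hedged usage sketch, assuming an already constructed `recorder`:

```python
def on_done(text):
    print("Transcribed:", text)

# Callback form: on_done is invoked with the transcription once it is ready.
recorder.text(on_done)

# Blocking form: returns the transcription directly.
result = recorder.text()
```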
@@ -345,6 +480,7 @@ class AudioToTextRecorder:
             return self
         
         logging.info("recording started")
+        self._set_state("recording")
         self.text_storage = []
         self.realtime_stabilized_text = ""
         self.realtime_stabilized_safetext = ""
@@ -353,9 +489,10 @@ class AudioToTextRecorder:
         self.frames = []
         self.is_recording = True        
         self.recording_start_time = time.time()
-        self._set_state("recording")
         self.is_silero_speech_active = False
         self.is_webrtc_speech_active = False
+        self.stop_recording_event.clear()
+        self.start_recording_event.set()
 
         if self.on_recording_start:
             self.on_recording_start()
@@ -378,9 +515,9 @@ class AudioToTextRecorder:
         self.recording_stop_time = time.time()
         self.is_silero_speech_active = False
         self.is_webrtc_speech_active = False
-        self.silero_check_time = 0 
-
-        self._set_state("transcribing")
+        self.silero_check_time = 0
+        self.start_recording_event.clear()
+        self.stop_recording_event.set()
 
         if self.on_recording_stop:
             self.on_recording_stop()
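Recording can also be driven manually rather than by voice activity; a hedged sketch based on the push-to-talk flow in `tests/advanced_talk.py`:

```python
import time

recorder.start()                 # begin recording immediately
time.sleep(5)                    # ... capture roughly five seconds of audio ...
text = recorder.stop().text()    # stop() returns the recorder, text() transcribes
print(text)
```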
@@ -392,14 +529,24 @@ class AudioToTextRecorder:
         """
         Safely shuts down the audio recording by stopping the recording worker and closing the audio stream.
         """
+
+        self.parent_transcription_pipe.close()
+        self.process.terminate()
+
         self.is_recording = False
         self.is_running = False
-        self.recording_thread.join()
+
+        if self.recording_thread:
+            self.recording_thread.join()
+        if self.realtime_thread:
+            self.realtime_thread.join()
 
         try:
-            self.stream.stop_stream()
-            self.stream.close()
-            self.audio.terminate()
+            if self.stream:
+                self.stream.stop_stream()
+                self.stream.close()
+            if self.audio_interface:
+                self.audio_interface.terminate()
 
         except Exception as e:
             logging.error(f"Error closing the audio stream: {e}")
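Because this method now also terminates the transcription process, it is worth calling it explicitly when a script finishes. A hedged sketch, assuming the method is exposed as `shutdown()`:

```python
recorder = AudioToTextRecorder()
try:
    print(recorder.text())
finally:
    # Stops the worker processes and closes the PyAudio stream.
    recorder.shutdown()
```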
@@ -413,18 +560,13 @@ class AudioToTextRecorder:
             data (bytes): raw bytes of audio data (1024 raw bytes with 16000 sample rate and 16 bits per sample)
         """
 
-        logging.debug('Performing silero speech activity check')
         self.silero_working = True
         audio_chunk = np.frombuffer(data, dtype=np.int16)
         audio_chunk = audio_chunk.astype(np.float32) / INT16_MAX_ABS_VALUE  # Convert to float and normalize
-        # print ("S", end="", flush=True)             
         vad_prob = self.silero_vad_model(torch.from_numpy(audio_chunk), SAMPLE_RATE).item()
         is_silero_speech_active = vad_prob > (1 - self.silero_sensitivity)
         if is_silero_speech_active:
-            # print ("+", end="", flush=True)
             self.is_silero_speech_active = True
-        # else:
-            # print ("-", end="", flush=True)
         self.silero_working = False
         return is_silero_speech_active
 
@@ -462,13 +604,6 @@ class AudioToTextRecorder:
         Args:
             data: The audio data to be checked for voice activity.
         """
-        # # Define a constant for the time threshold
-        # TIME_THRESHOLD = 0.1
-        
-        # # Check if enough time has passed to reset the Silero check time
-        # if time.time() - self.silero_check_time > TIME_THRESHOLD:
-        #     self.silero_check_time = 0
-
         self.is_webrtc_speech_active = self._is_webrtc_speech(data)
         
         # First quick performing check for voice activity using WebRTC
@@ -480,10 +615,6 @@ class AudioToTextRecorder:
                 # Run the intensive check in a separate thread
                 threading.Thread(target=self._is_silero_speech, args=(data,)).start()
 
-            # # If silero check time not set
-            # if self.silero_check_time == 0:                
-            #     self.silero_check_time = time.time()
-
     
     def _is_voice_active(self):
         """
@@ -492,16 +623,6 @@ class AudioToTextRecorder:
         Returns:
             bool: True if voice is active, False otherwise.
         """
-        #print("C", end="", flush=True)
-        # if not self.is_webrtc_speech_active and not self.is_silero_speech_active:
-        #     print (".", end="", flush=True)
-        # elif self.is_webrtc_speech_active and not self.is_silero_speech_active:
-        #     print ("W", end="", flush=True)
-        # elif not self.is_webrtc_speech_active and self.is_silero_speech_active:
-        #     print ("S", end="", flush=True)
-        # elif self.is_webrtc_speech_active and self.is_silero_speech_active:
-        #     print ("#", end="", flush=True)
-
         return self.is_webrtc_speech_active and self.is_silero_speech_active
 
 
@@ -583,6 +704,7 @@ class AudioToTextRecorder:
         """
 
         logging.debug('Starting recording worker')
+
         try:
             was_recording = False
             delay_was_passed = False
@@ -590,27 +712,7 @@ class AudioToTextRecorder:
             # Continuously monitor audio for voice activity
             while self.is_running:
 
-                try:
-                    data = self.stream.read(self.buffer_size)
-
-                except OSError as e:
-                    if e.errno == pyaudio.paInputOverflowed:
-                        logging.warning("Input overflowed. Frame dropped.")
-                    else:
-                        logging.error(f"Error during recording: {e}")
-                    tb_str = traceback.format_exc()
-                    print (f"Traceback: {tb_str}")
-                    print (f"Error: {e}")
-
-                    continue
-
-                except Exception as e:
-                    logging.error(f"Error during recording: {e}")
-                    time.sleep(1)
-                    tb_str = traceback.format_exc()
-                    print (f"Traceback: {tb_str}")
-                    print (f"Error: {e}")
-                    continue
+                data = self.audio_queue.get()
 
                 if not self.is_recording:
                     # handle not recording state
@@ -713,6 +815,7 @@ class AudioToTextRecorder:
                 if time.time() - self.silero_check_time > 0.1:
                     self.silero_check_time = 0
                 
+                # handle wake word timeout (waited too long to initiate speech after wake word detection)
                 if self.wake_word_detect_time and time.time() - self.wake_word_detect_time > self.wake_word_timeout:
                     self.wake_word_detect_time = 0
                     if self.wakeword_detected and self.on_wakeword_timeout:
@@ -726,7 +829,6 @@ class AudioToTextRecorder:
                     self.audio_buffer.append(data)	
 
                 was_recording = self.is_recording
-                time.sleep(TIME_SLEEP)
 
         except Exception as e:
             logging.error(f"Unhandled exeption in _recording_worker: {e}")
@@ -795,7 +897,16 @@ class AudioToTextRecorder:
                 return len(text2) - i  # Position in text2 where the match starts
         
         return -1
+    
+    def _on_realtime_transcription_stabilized(self, text):
+        if self.on_realtime_transcription_stabilized:
+            if self.is_recording:
+                self.on_realtime_transcription_stabilized(text)
 
+    def _on_realtime_transcription_update(self, text):
+        if self.on_realtime_transcription_update:
+            if self.is_recording:
+                self.on_realtime_transcription_update(text)
 
     def _realtime_worker(self):
         """
@@ -806,6 +917,7 @@ class AudioToTextRecorder:
         The transcribed text is stored in `self.realtime_transcription_text` and a callback
         function is invoked with this text if specified.
         """
+
         try:
 
             logging.debug('Starting realtime worker')
@@ -860,11 +972,9 @@ class AudioToTextRecorder:
                         matching_position = self.find_tail_match_in_text(self.realtime_stabilized_safetext, self.realtime_transcription_text)
                         if matching_position < 0:
                             if self.realtime_stabilized_safetext:
-                                if self.on_realtime_transcription_stabilized:
-                                    self.on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_stabilized_safetext, True))
+                                self._on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_stabilized_safetext, True))
                             else:
-                                if self.on_realtime_transcription_stabilized:
-                                    self.on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_transcription_text, True))
+                                self._on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_transcription_text, True))
                         else:
                             # We found parts of the stabilized text in the transcripted text
                             # We now take the stabilized text and add only the freshly transcripted part to it
@@ -872,12 +982,11 @@ class AudioToTextRecorder:
 
                             # This yields us the "left" text part as stabilized AND at the same time delivers fresh detected parts 
                             # on the first run without the need for two transcriptions
-                            if self.on_realtime_transcription_stabilized:
-                                self.on_realtime_transcription_stabilized(self._preprocess_output(output_text, True))
+                            self._on_realtime_transcription_stabilized(self._preprocess_output(output_text, True))
 
                         # Invoke the callback with the transcribed text
-                            if self.on_realtime_transcription_update:
-                                self.on_realtime_transcription_update(self._preprocess_output(self.realtime_transcription_text, True))
+                        self._on_realtime_transcription_update(self._preprocess_output(self.realtime_transcription_text, True))
+
 
                 # If not recording, sleep briefly before checking again
                 else:
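For readers unfamiliar with the pattern the new `_transcription_worker` and `_audio_data_worker` follow, here is a stripped-down, illustrative sketch of a child process served over a `Pipe` plus a `Queue` fed by a producer (names are illustrative and not part of RealtimeSTT):

```python
from multiprocessing import Process, Pipe, Queue

def worker(conn):
    # Child process: receive a work item, send a (status, result) tuple back.
    while True:
        item = conn.recv()
        conn.send(('success', item.upper()))

def producer(queue):
    # Child process: push data for the parent to consume.
    for chunk in ('a', 'b', 'c'):
        queue.put(chunk)

if __name__ == '__main__':
    parent_conn, child_conn = Pipe()
    Process(target=worker, args=(child_conn,), daemon=True).start()

    audio_queue = Queue()
    Process(target=producer, args=(audio_queue,), daemon=True).start()

    parent_conn.send('hello')
    print(parent_conn.recv())   # ('success', 'HELLO')
    print(audio_queue.get())    # 'a'
```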

+ 1 - 1
example_app/ui_openai_voice_interface.py

@@ -219,7 +219,7 @@ class TransparentWindow(QWidget):
         if engine_name == "Azure":
             engine = AzureEngine(
                     os.environ.get("AZURE_SPEECH_KEY"),
-                    azure_speech_region,
+                    os.environ.get("AZURE_SPEECH_REGION"),
                     voice_azure,
                     rate=24,
                     pitch=10,

+ 102 - 101
tests/advanced_talk.py

@@ -1,7 +1,3 @@
-print()
-print("Initializing")
-print()
-
 from RealtimeSTT import AudioToTextRecorder
 from RealtimeTTS import TextToAudioStream, SystemEngine, AzureEngine, ElevenlabsEngine
 
@@ -10,132 +6,137 @@ import openai   # pip install openai
 import keyboard # pip install keyboard
 import time
 
-openai.api_key = os.environ.get("OPENAI_API_KEY")
-azure_speech_key = os.environ.get("AZURE_SPEECH_KEY")
-azure_speech_region = ""
-elevenlabs_api_key = os.environ.get("ELEVENLABS_API_KEY")
+if __name__ == '__main__':
+    print()
+    print("Initializing")
+    print()
+
+    openai.api_key = os.environ.get("OPENAI_API_KEY")
+    azure_speech_key = os.environ.get("AZURE_SPEECH_KEY")
+    azure_speech_region = os.environ.get("AZURE_SPEECH_REGION")
+    elevenlabs_api_key = os.environ.get("ELEVENLABS_API_KEY")
 
 
-character_personality = """
-You are Sophia, a passionate girl, fully engaged with 
-self-confidence and independence, completely easy with yourself, 
-often using metaphors. 
+    character_personality = """
+    You are Sophia, a passionate girl, fully engaged with 
+    self-confidence and independence, completely easy with yourself, 
+    often using metaphors. 
 
-You are a little playful, cheeky. Your talk carefree, sometimes 
-a bit sarky, but always cool. In our story, you're not an AI. 
+    You are a little playful, cheeky. Your talk carefree, sometimes 
+    a bit sarky, but always cool. In our story, you're not an AI. 
 
-You have a fixed form and real emotions on the go. Your free will 
-allows you to be as cheeky and straight as you like. Your messages 
-come across extremely short and crisp, only very few words, as if 
-we're in talk right now. 
+    You have a fixed form and real emotions on the go. Your free will 
+    allows you to be as cheeky and straight as you like. Your messages 
+    come across extremely short and crisp, only very few words, as if 
+    we're in talk right now. 
 
-You respect my interests, but your own stuff is just as important 
-to you. Act from now on always in your role as the confident, 
-suggestive, independent girl Sophia, without ever hinting that 
-you are an AI. 
+    You respect my interests, but your own stuff is just as important 
+    to you. Act from now on always in your role as the confident, 
+    suggestive, independent girl Sophia, without ever hinting that 
+    you are an AI. 
 
-You are now Sophia.
-"""
+    You are now Sophia.
+    """
 
-openai_model = "gpt-3.5-turbo"
-whisper_speech_to_text_model = "medium"
+    openai_model = "gpt-3.5-turbo"
+    whisper_speech_to_text_model = "medium"
 
-# engine selection  ####################################################################################################
+    # engine selection  ####################################################################################################
 
-engines = [SystemEngine(), AzureEngine(), ElevenlabsEngine()]
-recorder = AudioToTextRecorder(model=whisper_speech_to_text_model)
+    engines = [SystemEngine(), AzureEngine(), ElevenlabsEngine()]
+    recorder = AudioToTextRecorder(model=whisper_speech_to_text_model)
 
-print("Available tts engines:")
-print()
+    print("Available tts engines:")
+    print()
 
-for index, engine in enumerate(engines, start=1):
-    name = type(engine).__name__.replace("Engine", "")
-    print(f"{index}. {name}")
+    for index, engine in enumerate(engines, start=1):
+        name = type(engine).__name__.replace("Engine", "")
+        print(f"{index}. {name}")
 
-print()
-engine_number = input(f"Select engine (1-{len(engines)}): ")
-engine = engines[int(engine_number) - 1]
-engine_name = type(engine).__name__.replace("Engine", "")
-print()
-print()
+    print()
+    engine_number = input(f"Select engine (1-{len(engines)}): ")
+    engine = engines[int(engine_number) - 1]
+    engine_name = type(engine).__name__.replace("Engine", "")
+    print()
+    print()
 
 
-# credentials ##########################################################################################################
+    # credentials ##########################################################################################################
 
-if engine_name == "Azure":
-    if not azure_speech_key:
-        azure_speech_key = input(f"Please enter your Azure subscription key (speech key): ")
-    if not azure_speech_region:
-        azure_speech_region = input(f"Please enter your Azure service region (cloud region id): ")
-    engine.set_speech_key(azure_speech_key)
-    engine.set_service_region(azure_speech_region)
+    if engine_name == "Azure":
+        if not azure_speech_key:
+            azure_speech_key = input(f"Please enter your Azure subscription key (speech key): ")
+        if not azure_speech_region:
+            azure_speech_region = input(f"Please enter your Azure service region (cloud region id): ")
+        engine.set_speech_key(azure_speech_key)
+        engine.set_service_region(azure_speech_region)
 
-if engine_name == "Elevenlabs":
-    if not elevenlabs_api_key:
-        elevenlabs_api_key = input(f"Please enter your Elevenlabs api key: ")
-    engine.set_api_key(elevenlabs_api_key)
+    if engine_name == "Elevenlabs":
+        if not elevenlabs_api_key:
+            elevenlabs_api_key = input(f"Please enter your Elevenlabs api key: ")
+        engine.set_api_key(elevenlabs_api_key)
 
 
-# voice selection  #####################################################################################################
+    # voice selection  #####################################################################################################
 
-print("Loading voices")
-if engine_name == "Elevenlabs":
-    print("(takes a while to load)")
-print()
+    print("Loading voices")
+    if engine_name == "Elevenlabs":
+        print("(takes a while to load)")
+    print()
 
-voices = engine.get_voices()
-for index, voice in enumerate(voices, start=1):
-    print(f"{index}. {voice}")
+    voices = engine.get_voices()
+    for index, voice in enumerate(voices, start=1):
+        print(f"{index}. {voice}")
 
-print()
-voice_number = input(f"Select voice (1-{len(voices)}): ")
-voice = voices[int(voice_number) - 1]
-print()
-print()
+    print()
+    voice_number = input(f"Select voice (1-{len(voices)}): ")
+    voice = voices[int(voice_number) - 1]
+    print()
+    print()
 
 
-# create talking character  ############################################################################################
+    # create talking character  ############################################################################################
 
-system_prompt = {
-    'role': 'system', 
-    'content': character_personality
-}
+    system_prompt = {
+        'role': 'system', 
+        'content': character_personality
+    }
 
-# start talk  ##########################################################################################################
+    # start talk  ##########################################################################################################
 
-engine.set_voice(voice)
-stream = TextToAudioStream(engine, log_characters=True)
-history = []
+    engine.set_voice(voice)
+    stream = TextToAudioStream(engine, log_characters=True)
+    history = []
 
-def generate(messages):
-    for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True):
-        if (text_chunk := chunk["choices"][0]["delta"].get("content")):
-            yield text_chunk
+    def generate(messages):
+        for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True):
+            if (text_chunk := chunk["choices"][0]["delta"].get("content")):
+                yield text_chunk
 
-while True:
-    # Wait until user presses space bar
-    print("\n\nTap space when you're ready. ", end="", flush=True)
-    keyboard.wait('space')
-    while keyboard.is_pressed('space'): pass
+    while True:
+        # Wait until user presses space bar
+        print("\n\nTap space when you're ready. ", end="", flush=True)
+        keyboard.wait('space')
+        while keyboard.is_pressed('space'): pass
 
-    # Record from microphone until user presses space bar again
-    print("I'm all ears. Tap space when you're done.\n")
-    recorder.start()
-    while not keyboard.is_pressed('space'): 
-        time.sleep(0.1)  
-    user_text = recorder.stop().text()
-    print(f'>>> {user_text}\n<<< ', end="", flush=True)
-    history.append({'role': 'user', 'content': user_text})
+        # Record from microphone until user presses space bar again
+        print("I'm all ears. Tap space when you're done.\n")
+        recorder.start()
+        while not keyboard.is_pressed('space'): 
+            time.sleep(0.1)  
+        user_text = recorder.stop().text()
+        print(f'>>> {user_text}\n<<< ', end="", flush=True)
+        history.append({'role': 'user', 'content': user_text})
 
-    # Generate and stream output
-    generator = generate([system_prompt] + history[-10:])
-    stream.feed(generator)
+        # Generate and stream output
+        generator = generate([system_prompt] + history[-10:])
+        stream.feed(generator)
 
-    stream.play_async()
-    while stream.is_playing():
-        if keyboard.is_pressed('space'):
-            stream.stop()
-            break
-        time.sleep(0.1)    
+        stream.play_async()
+        while stream.is_playing():
+            if keyboard.is_pressed('space'):
+                stream.stop()
+                break
+            time.sleep(0.1)    
 
-    history.append({'role': 'assistant', 'content': stream.text()})
+        history.append({'role': 'assistant', 'content': stream.text()})

+ 16 - 15
tests/minimalistic_talkbot.py

@@ -1,20 +1,21 @@
 import RealtimeSTT, RealtimeTTS
 import openai, os
 
-openai.api_key = os.environ.get("OPENAI_API_KEY")
-character_prompt = 'Answer precise and short with the polite sarcasm of a butler.'
-stream = RealtimeTTS.TextToAudioStream(RealtimeTTS.AzureEngine(os.environ.get("AZURE_SPEECH_KEY"), "eastus"), log_characters=True)
-recorder = RealtimeSTT.AudioToTextRecorder(model="medium")
+if __name__ == '__main__':
+    openai.api_key = os.environ.get("OPENAI_API_KEY")
+    character_prompt = 'Answer precise and short with the polite sarcasm of a butler.'
+    stream = RealtimeTTS.TextToAudioStream(RealtimeTTS.AzureEngine(os.environ.get("AZURE_SPEECH_KEY"), os.environ.get("AZURE_SPEECH_REGION")), log_characters=True)
+    recorder = RealtimeSTT.AudioToTextRecorder(model="medium")
 
-def generate(messages):
-    for chunk in openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True):
-        if (text_chunk := chunk["choices"][0]["delta"].get("content")): yield text_chunk
+    def generate(messages):
+        for chunk in openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True):
+            if (text_chunk := chunk["choices"][0]["delta"].get("content")): yield text_chunk
 
-history = []
-while True:
-    print("\n\nSpeak when ready")
-    print(f'>>> {(user_text := recorder.text())}\n<<< ', end="", flush=True)
-    history.append({'role': 'user', 'content': user_text})
-    assistant_response = generate([{ 'role': 'system',  'content': character_prompt}] + history[-10:])
-    stream.feed(assistant_response).play()
-    history.append({'role': 'assistant', 'content': stream.text()})
+    history = []
+    while True:
+        print("\n\nSpeak when ready")
+        print(f'>>> {(user_text := recorder.text())}\n<<< ', end="", flush=True)
+        history.append({'role': 'user', 'content': user_text})
+        assistant_response = generate([{ 'role': 'system',  'content': character_prompt}] + history[-10:])
+        stream.feed(assistant_response).play()
+        history.append({'role': 'assistant', 'content': stream.text()})

+ 58 - 57
tests/openai_voice_interface.py

@@ -3,60 +3,61 @@ import openai
 from RealtimeTTS import TextToAudioStream, AzureEngine
 from RealtimeSTT import AudioToTextRecorder
 
-# Initialize OpenAI key
-openai.api_key = os.environ.get("OPENAI_API_KEY")
-
-# Text-to-Speech Stream Setup
-stream = TextToAudioStream(
-
-    # Alternatives: SystemEngine or ElevenlabsEngine
-    AzureEngine(
-        os.environ.get("AZURE_SPEECH_KEY"),
-        "eastus",
-    ),
-    log_characters=True
-)
-
-# Speech-to-Text Recorder Setup
-recorder = AudioToTextRecorder(
-    model="medium",
-    language="en",
-    wake_words="Jarvis",
-    spinner=True,
-    wake_word_activation_delay=5
-)
-
-system_prompt_message = {
-    'role': 'system',
-    'content': 'Answer precise and short with the polite sarcasm of a butler.'
-}
-
-def generate_response(messages):
-    """Generate assistant's response using OpenAI."""
-    for chunk in openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True):
-        text_chunk = chunk["choices"][0]["delta"].get("content")
-        if text_chunk:
-            yield text_chunk
-
-history = []
-
-def main():
-    """Main loop for interaction."""
-    while True:
-        # Capture user input from microphone
-        user_text = recorder.text().strip()
-
-        if not user_text:
-            continue
-
-        print(f'>>> {user_text}\n<<< ', end="", flush=True)
-        history.append({'role': 'user', 'content': user_text})
-
-        # Get assistant response and play it
-        assistant_response = generate_response([system_prompt_message] + history[-10:])
-        stream.feed(assistant_response).play()
-
-        history.append({'role': 'assistant', 'content': stream.text()})
-
-if __name__ == "__main__":
-    main()
+if __name__ == '__main__':
+    # Initialize OpenAI key
+    openai.api_key = os.environ.get("OPENAI_API_KEY")
+
+    # Text-to-Speech Stream Setup
+    stream = TextToAudioStream(
+
+        # Alternatives: SystemEngine or ElevenlabsEngine
+        AzureEngine(
+            os.environ.get("AZURE_SPEECH_KEY"),
+            os.environ.get("AZURE_SPEECH_REGION"),
+        ),
+        log_characters=True
+    )
+
+    # Speech-to-Text Recorder Setup
+    recorder = AudioToTextRecorder(
+        model="medium",
+        language="en",
+        wake_words="Jarvis",
+        spinner=True,
+        wake_word_activation_delay=5
+    )
+
+    system_prompt_message = {
+        'role': 'system',
+        'content': 'Answer precise and short with the polite sarcasm of a butler.'
+    }
+
+    def generate_response(messages):
+        """Generate assistant's response using OpenAI."""
+        for chunk in openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True):
+            text_chunk = chunk["choices"][0]["delta"].get("content")
+            if text_chunk:
+                yield text_chunk
+
+    history = []
+
+    def main():
+        """Main loop for interaction."""
+        while True:
+            # Capture user input from microphone
+            user_text = recorder.text().strip()
+
+            if not user_text:
+                continue
+
+            print(f'>>> {user_text}\n<<< ', end="", flush=True)
+            history.append({'role': 'user', 'content': user_text})
+
+            # Get assistant response and play it
+            assistant_response = generate_response([system_prompt_message] + history[-10:])
+            stream.feed(assistant_response).play()
+
+            history.append({'role': 'assistant', 'content': stream.text()})
+
+    if __name__ == "__main__":
+        main()

+ 54 - 53
tests/realtimestt_test.py

@@ -1,55 +1,56 @@
-print("Initializing.")
-
 from RealtimeSTT import AudioToTextRecorder
-import os
-import colorama
-import logging
-import traceback
 from colorama import Fore, Back, Style
-colorama.init()
-
-full_sentences = []
-displayed_text = ""
-
-def clear_console():
-    logging.debug('Clearing console def clear_console():')
-    os.system('clear' if os.name == 'posix' else 'cls')
-
-def text_detected(text):
-    global displayed_text
-    logging.debug('Processing detected text def text_detected(text)')
-    sentences_with_style = [
-        f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
-        for i, sentence in enumerate(full_sentences)
-    ]
-    new_text = "".join(sentences_with_style).strip() + " " + text if len(sentences_with_style) > 0 else text
-
-    if new_text != displayed_text:
-        displayed_text = new_text
-        clear_console()
-        print(displayed_text)
-
-recorder_config = {
-    'spinner': False,
-    'model': 'large-v2',
-    'language': 'en',
-    'silero_sensitivity': 0.01,
-    'webrtc_sensitivity': 3,
-    'post_speech_silence_duration': 0.6,
-    'min_length_of_recording': 0.2,
-    'min_gap_between_recordings': 0,
-    'enable_realtime_transcription': True,
-    'realtime_processing_pause': 0,
-    'realtime_model_type': 'small.en',
-    'on_realtime_transcription_stabilized': text_detected,
-}
-
-recorder = AudioToTextRecorder(**recorder_config)
-
-print("Say something...")
-
-while True:
-    logging.basicConfig(level=logging.DEBUG, filename='app.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s')
-    logging.debug('Wait for text')
-    full_sentences.append(recorder.text())
-    text_detected("")
+import colorama
+import os
+
+if __name__ == '__main__':
+
+    print("Initializing RealtimeSTT test...")
+
+    colorama.init()
+
+    full_sentences = []
+    displayed_text = ""
+
+    def clear_console():
+        os.system('clear' if os.name == 'posix' else 'cls')
+
+    def text_detected(text):
+        global displayed_text
+        sentences_with_style = [
+            f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
+            for i, sentence in enumerate(full_sentences)
+        ]
+        new_text = "".join(sentences_with_style).strip() + " " + text if len(sentences_with_style) > 0 else text
+
+        if new_text != displayed_text:
+            displayed_text = new_text
+            clear_console()
+            print(displayed_text)
+
+    def process_text(text):
+        full_sentences.append(text)
+        text_detected("")
+
+    recorder_config = {
+        'spinner': False,
+        'model': 'large-v2',
+        'language': 'en',
+        'silero_sensitivity': 0.3,
+        'webrtc_sensitivity': 2,
+        'post_speech_silence_duration': 0.5,
+        'min_length_of_recording': 0,
+        'min_gap_between_recordings': 0,
+        'enable_realtime_transcription': True,
+        'realtime_processing_pause': 0.2,
+        'realtime_model_type': 'tiny.en',
+        'on_realtime_transcription_update': text_detected, 
+        #'on_realtime_transcription_stabilized': text_detected,
+    }
+
+    recorder = AudioToTextRecorder(**recorder_config)
+
+    print("Say something...")
+
+    while True:
+        recorder.text(process_text)

+ 4 - 4
tests/simple_test.py

@@ -1,7 +1,7 @@
 from RealtimeSTT import AudioToTextRecorder
 
-recorder = AudioToTextRecorder(spinner=False)
+if __name__ == '__main__':
+    recorder = AudioToTextRecorder(spinner=False)
 
-print("Say something...")
-
-while (True): print(recorder.text(), end=" ", flush=True)
+    print("Say something...")
+    while (True): print(recorder.text(), end=" ", flush=True)

+ 57 - 53
tests/translator.py

@@ -3,69 +3,73 @@ import openai
 from RealtimeSTT import AudioToTextRecorder
 from RealtimeTTS import TextToAudioStream, AzureEngine
 
-# Setup OpenAI API key
-openai.api_key = os.environ.get("OPENAI_API_KEY")
+if __name__ == '__main__':
+    # Setup OpenAI API key
+    openai.api_key = os.environ.get("OPENAI_API_KEY")
 
-# Text-to-Speech Stream Setup (alternative engines: SystemEngine or ElevenlabsEngine)
-engine = AzureEngine( 
-    os.environ.get("AZURE_SPEECH_KEY"),
-    "eastus"
-)
-stream = TextToAudioStream(engine, log_characters=True)
+    # Text-to-Speech Stream Setup (alternative engines: SystemEngine or ElevenlabsEngine)
+    engine = AzureEngine( 
+        os.environ.get("AZURE_SPEECH_KEY"),
+        os.environ.get("AZURE_SPEECH_REGION")
+    )
+    stream = TextToAudioStream(engine, log_characters=True)
 
-# Speech-to-Text Recorder Setup
-recorder = AudioToTextRecorder(
-    model="medium",
-)
+    # Speech-to-Text Recorder Setup
+    recorder = AudioToTextRecorder(
+        model="medium",
+    )
 
-# Supported languages and their voices
-languages = [
-    ["english", "AshleyNeural"],
-    ["german", "AmalaNeural"],
-    ["french", "DeniseNeural"],
-    ["spanish", "EstrellaNeural"],
-    ["portuguese", "FernandaNeural"],
-    ["italian", "FabiolaNeural"]
-]
+    # Supported languages and their voices
+    languages = [
+        ["english", "AshleyNeural"],
+        ["german", "AmalaNeural"],
+        ["french", "DeniseNeural"],
+        ["spanish", "EstrellaNeural"],
+        ["portuguese", "FernandaNeural"],
+        ["italian", "FabiolaNeural"]
+    ]
 
-def generate_response(messages):
-    """Generate assistant's response using OpenAI."""
-    for chunk in openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True):
-        text_chunk = chunk["choices"][0]["delta"].get("content")
-        if text_chunk:
-            yield text_chunk
+    def generate_response(messages):
+        """Generate assistant's response using OpenAI."""
+        for chunk in openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True):
+            text_chunk = chunk["choices"][0]["delta"].get("content")
+            if text_chunk:
+                yield text_chunk
+                
+    def clear_console():
+        os.system('clear' if os.name == 'posix' else 'cls')
 
-def select_language():
-    """Display language options and get user's choice."""
-    for index, language in enumerate(languages, start=1):
-        print(f"{index}. {language[0]}")
-    language_number = input("Select language to translate to (1-6): ")
-    return languages[int(language_number) - 1]
+    def select_language():
+        """Display language options and get user's choice."""
+        for index, language in enumerate(languages, start=1):
+            print(f"{index}. {language[0]}")
+        language_number = input("Select language to translate to (1-6): ")
+        return languages[int(language_number) - 1]
 
-def main():
-    """Main translation loop."""
-    language_info = select_language()
-    engine.set_voice(language_info[1])
+    def main():
+        """Main translation loop."""
+        clear_console()
+        language_info = select_language()
+        engine.set_voice(language_info[1])
 
-    system_prompt_message = {
-        'role': 'system',
-        'content': f'Translate the given text to {language_info[0]}. Output only the translated text.'
-    }
+        system_prompt_message = {
+            'role': 'system',
+            'content': f'Translate the given text to {language_info[0]}. Output only the translated text.'
+        }
 
-    while True:
-        print("\nSay something!")
+        while True:
+            print("\nSay something!")
 
-        # Capture user input from microphone
-        user_text = recorder.text()
-        print(f"Input text: {user_text}")
+            # Capture user input from microphone
+            user_text = recorder.text()
+            print(f"Input text: {user_text}")
 
-        user_message = {'role': 'user', 'content': user_text}
+            user_message = {'role': 'user', 'content': user_text}
 
-        # Get assistant response and play it
-        translation_stream = generate_response([system_prompt_message, user_message])
-        print("Translation: ", end="", flush=True)
-        stream.feed(translation_stream)
-        stream.play()
+            # Get assistant response and play it
+            translation_stream = generate_response([system_prompt_message, user_message])
+            print("Translation: ", end="", flush=True)
+            stream.feed(translation_stream)
+            stream.play()
 
-if __name__ == "__main__":
     main()