Browse source

context manager implemented

Kolja Beigel 1 year ago
parent
commit
21b49bff31

+ 22 - 4
README.md

@@ -16,6 +16,10 @@ https://github.com/KoljaB/RealtimeSTT/assets/7604638/207cb9a2-4482-48e7-9d2b-072
 
 ### Updates
 
+#### v0.1.6
+- Implements the context manager protocol (the recorder can be used in a `with` statement)
+- Bugfix in the shutdown method
+
 #### v0.1.5
 
 - Bugfix for detection of short speech right after sentence detection (the problem mentioned in the video)
@@ -130,8 +134,8 @@ print(recorder.text())
 Recording based on voice activity detection.
 
 ```python
-recorder = AudioToTextRecorder()
-print(recorder.text())
+with AudioToTextRecorder() as recorder:
+    print(recorder.text())
 ```
 
 When running `recorder.text()` in a loop, it is recommended to use a callback, allowing the transcription to run asynchronously:
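A minimal sketch of such a loop, assuming a user-defined `process_text` callback (the name is illustrative, not part of the library):

```python
from RealtimeSTT import AudioToTextRecorder

def process_text(text):
    print(text)

if __name__ == '__main__':
    with AudioToTextRecorder() as recorder:
        while True:
            # With a callback, text() returns after the utterance has been
            # recorded and hands the transcription off to a background thread.
            recorder.text(process_text)
```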
@@ -170,6 +174,20 @@ recorder = AudioToTextRecorder(on_recording_start=my_start_callback,
                                on_recording_stop=my_stop_callback)
 ```
 
+### Shutdown
+
+You can shut down the recorder safely by using the context manager protocol:
+
+```python
+with AudioToTextRecorder() as recorder:
+    [...]
+```
+
+Or you can call the `shutdown` method manually (if using `with` is not feasible):
+
+```python
+recorder.shutdown()
+```
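For instance, a minimal sketch pairing the manual call with `try`/`finally`, so the shutdown runs even if an exception is raised:

```python
recorder = AudioToTextRecorder()
try:
    print(recorder.text())
finally:
    # Stops the worker processes and closes the audio stream
    # regardless of how the block above exits.
    recorder.shutdown()
```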
 
 ## Testing the Library
 
@@ -254,7 +272,7 @@ When you initialize the `AudioToTextRecorder` class, you have various options to
 
 - **silero_sensitivity** (float, default=0.6): Sensitivity for Silero's voice activity detection ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.6.
 
-- **silero_use_onnx** (bool, default=True): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. Default is True (recommended for faster performance).
+- **silero_use_onnx** (bool, default=False): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. Recommended for faster performance.
 
 - **post_speech_silence_duration** (float, default=0.2): Duration in seconds of silence that must follow speech before the recording is considered to be completed. This ensures that any brief pauses during speech don't prematurely end the recording.
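For illustration, a sketch of how these voice-activity options are passed to the constructor (the values shown are examples, not recommendations):

```python
recorder = AudioToTextRecorder(
    silero_sensitivity=0.6,             # 0 = least sensitive, 1 = most sensitive
    silero_use_onnx=True,               # opt in to the faster ONNX Silero model
    post_speech_silence_duration=0.2,   # seconds of silence that end a recording
)
```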
 
@@ -299,4 +317,4 @@ MIT
 
 Kolja Beigel  
 Email: kolja.beigel@web.de  
-[GitHub](https://github.com/KoljaB/RealtimeSTT)
+[GitHub](https://github.com/KoljaB/RealtimeSTT)

+ 426 - 277
RealtimeSTT/audio_recorder.py

@@ -51,6 +51,7 @@ INIT_WAKE_WORDS_SENSITIVITY = 0.6
 INIT_PRE_RECORDING_BUFFER_DURATION = 1.0
 INIT_WAKE_WORD_ACTIVATION_DELAY = 0.0
 INIT_WAKE_WORD_TIMEOUT = 5.0
+ALLOWED_LATENCY_LIMIT = 10
 
 TIME_SLEEP = 0.02
 SAMPLE_RATE = 16000
@@ -82,7 +83,7 @@ class AudioToTextRecorder:
 
                  # Voice activation parameters
                  silero_sensitivity: float = INIT_SILERO_SENSITIVITY,
-                 silero_use_onnx: bool = True,
+                 silero_use_onnx: bool = False,
                  webrtc_sensitivity: int = INIT_WEBRTC_SENSITIVITY,
                  post_speech_silence_duration: float = INIT_POST_SPEECH_SILENCE_DURATION,
                  min_length_of_recording: float = INIT_MIN_LENGTH_OF_RECORDING,
@@ -122,7 +123,7 @@ class AudioToTextRecorder:
         - on_realtime_transcription_update = A callback function that is triggered whenever there's an update in the real-time transcription. The function is called with the newly transcribed text as its argument.
         - on_realtime_transcription_stabilized = A callback function that is triggered when the transcribed text stabilizes in quality. The stabilized text is generally more accurate but may arrive with a slight delay compared to the regular real-time updates.
         - silero_sensitivity (float, default=SILERO_SENSITIVITY): Sensitivity for the Silero Voice Activity Detection model ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.5.
-        - silero_use_onnx (bool, default=True): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. This is recommended for faster performance.
+        - silero_use_onnx (bool, default=False): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. This is recommended for faster performance.
         - webrtc_sensitivity (int, default=WEBRTC_SENSITIVITY): Sensitivity for the WebRTC Voice Activity Detection engine ranging from 0 (least aggressive / most sensitive) to 3 (most aggressive, least sensitive). Default is 3.
         - post_speech_silence_duration (float, default=0.2): Duration in seconds of silence that must follow speech before the recording is considered to be completed. This ensures that any brief pauses during speech don't prematurely end the recording.
         - min_gap_between_recordings (float, default=1.0): Specifies the minimum time interval in seconds that should exist between the end of one recording session and the beginning of another to prevent rapid consecutive recordings.
@@ -168,6 +169,7 @@ class AudioToTextRecorder:
         self.realtime_processing_pause = realtime_processing_pause
         self.on_realtime_transcription_update = on_realtime_transcription_update
         self.on_realtime_transcription_stabilized = on_realtime_transcription_stabilized
+        self.allowed_latency_limit = ALLOWED_LATENCY_LIMIT
     
         self.level = level
         self.audio_queue = Queue()
@@ -207,7 +209,7 @@ class AudioToTextRecorder:
         logger.setLevel(level)  # Set the root logger's level
 
         # Create a file handler and set its level
-        file_handler = logging.FileHandler('audio_recorder.log')
+        file_handler = logging.FileHandler('realtimesst.log')
         file_handler.setLevel(logging.DEBUG)
         file_handler.setFormatter(logging.Formatter(log_format))
 
@@ -220,16 +222,20 @@ class AudioToTextRecorder:
         logger.addHandler(file_handler)
         logger.addHandler(console_handler)
 
+        self.is_shut_down = False
+        self.shutdown_event = Event()
 
-        # start transcription process
+        logging.info("Starting RealtimeSTT")
+
+        # Start transcription process
         self.main_transcription_ready_event = Event()
         self.parent_transcription_pipe, child_transcription_pipe = Pipe()
-        self.process = Process(target=AudioToTextRecorder._transcription_worker, args=(child_transcription_pipe, model, self.main_transcription_ready_event))
-        self.process.start()
+        self.transcript_process = Process(target=AudioToTextRecorder._transcription_worker, args=(child_transcription_pipe, model, self.main_transcription_ready_event, self.shutdown_event))
+        self.transcript_process.start()
 
-        # start audio data reading process
-        reader_process = Process(target=AudioToTextRecorder._audio_data_worker, args=(self.audio_queue, self.sample_rate, self.buffer_size))
-        reader_process.start()
+        # Start audio data reading process
+        self.reader_process = Process(target=AudioToTextRecorder._audio_data_worker, args=(self.audio_queue, self.sample_rate, self.buffer_size, self.shutdown_event))
+        self.reader_process.start()
 
         # Initialize the realtime transcription model
         if self.enable_realtime_transcription:
@@ -310,7 +316,7 @@ class AudioToTextRecorder:
         self.realtime_thread.daemon = True
         self.realtime_thread.start()
 
-        # wait for transcription models to start
+        # Wait for transcription models to start
         logging.debug('Waiting for main transcription model to start')
         self.main_transcription_ready_event.wait()
         logging.debug('Main transcription model ready')
@@ -319,7 +325,25 @@ class AudioToTextRecorder:
 
 
     @staticmethod
-    def _transcription_worker(conn, model_path, ready_event):
+    def _transcription_worker(conn, model_path, ready_event, shutdown_event):
+        """
+        Worker method that handles the continuous process of transcribing audio data.
+
+        This method runs in a separate process and is responsible for:
+        - Initializing the `faster_whisper` model used for transcription.
+        - Receiving audio data sent through a pipe and using the model to transcribe it.
+        - Sending transcription results back through the pipe.
+        - Continuously checking for a shutdown event to gracefully terminate the transcription process.
+
+        Args:
+            conn (multiprocessing.Connection): The connection endpoint used for receiving audio data and sending transcription results.
+            model_path (str): The path to the pre-trained faster_whisper model for transcription.
+            ready_event (multiprocessing.Event): An event that is set when the transcription model is successfully initialized and ready.
+            shutdown_event (multiprocessing.Event): An event that, when set, signals this worker method to terminate.
+
+        Raises:
+            Exception: If there is an error while initializing the transcription model.
+        """        
 
         logging.info(f"Initializing faster_whisper main transcription model {model_path}")
 
@@ -337,23 +361,44 @@ class AudioToTextRecorder:
 
         logging.debug('Faster_whisper main speech to text transcription model initialized successfully')
 
-        while True:
-            audio, language = conn.recv()
-            try:
-                segments = model.transcribe(audio, language=language if language else None)[0]
-                transcription = " ".join(seg.text for seg in segments).strip()
-                conn.send(('success', transcription))
-            except faster_whisper.WhisperError as e:
-                logging.error(f"Whisper transcription error: {e}")
-                conn.send(('error', str(e)))      
-            except Exception as e:
-                logging.error(f"General transcription error: {e}")
-                conn.send(('error', str(e)))
+        while not shutdown_event.is_set():
+            if conn.poll(0.5):
+                audio, language = conn.recv()
+                try:
+                    segments = model.transcribe(audio, language=language if language else None)[0]
+                    transcription = " ".join(seg.text for seg in segments).strip()
+                    conn.send(('success', transcription))
+                except faster_whisper.WhisperError as e:
+                    logging.error(f"Whisper transcription error: {e}")
+                    conn.send(('error', str(e)))      
+                except Exception as e:
+                    logging.error(f"General transcription error: {e}")
+                    conn.send(('error', str(e)))
+            else:
+                # If there's no data, sleep for a short while to prevent busy waiting
+                time.sleep(0.02)
 
 
     @staticmethod
-    def _audio_data_worker(audio_queue, sample_rate, buffer_size):
+    def _audio_data_worker(audio_queue, sample_rate, buffer_size, shutdown_event):
+        """
+        Worker method that handles the audio recording process.
+
+        This method runs in a separate process and is responsible for:
+        - Setting up the audio input stream for recording.
+        - Continuously reading audio data from the input stream and placing it in a queue.
+        - Handling errors during the recording process, including input overflow.
+        - Gracefully terminating the recording process when a shutdown event is set.
 
+        Args:
+            audio_queue (multiprocessing.Queue): A queue where recorded audio data is placed.
+            sample_rate (int): The sample rate of the audio input stream.
+            buffer_size (int): The size of the buffer used in the audio input stream.
+            shutdown_event (multiprocessing.Event): An event that, when set, signals this worker method to terminate.
+
+        Raises:
+            Exception: If there is an error while initializing the audio recording.
+        """
         logging.info("Initializing audio recording (creating pyAudio input stream)")
 
         try:
@@ -366,29 +411,33 @@ class AudioToTextRecorder:
 
         logging.debug('Audio recording (pyAudio input stream) initialized successfully')
    
-        while True:
-            try:
-                data = stream.read(buffer_size)
-
-            except OSError as e:
-                if e.errno == pyaudio.paInputOverflowed:
-                    logging.warning("Input overflowed. Frame dropped.")
-                else:
+        try:
+            while not shutdown_event.is_set():
+                try:
+                    data = stream.read(buffer_size)
+
+                except OSError as e:
+                    if e.errno == pyaudio.paInputOverflowed:
+                        logging.warning("Input overflowed. Frame dropped.")
+                    else:
+                        logging.error(f"Error during recording: {e}")
+                    tb_str = traceback.format_exc()
+                    print (f"Traceback: {tb_str}")
+                    print (f"Error: {e}")
+                    continue
+
+                except Exception as e:
                     logging.error(f"Error during recording: {e}")
-                tb_str = traceback.format_exc()
-                print (f"Traceback: {tb_str}")
-                print (f"Error: {e}")
-                continue
+                    tb_str = traceback.format_exc()
+                    print (f"Traceback: {tb_str}")
+                    print (f"Error: {e}")
+                    continue
 
-            except Exception as e:
-                logging.error(f"Error during recording: {e}")
-                time.sleep(1)
-                tb_str = traceback.format_exc()
-                print (f"Traceback: {tb_str}")
-                print (f"Error: {e}")
-                continue
-
-            audio_queue.put(data)                
+                audio_queue.put(data)                
+        finally:
+            stream.stop_stream()
+            stream.close()
+            audio_interface.terminate()
 
 
     def wait_audio(self):
@@ -413,14 +462,14 @@ class AudioToTextRecorder:
             self._set_state("listening")
             self.start_recording_on_voice_activity = True
 
-            # wait until recording starts
+            # Wait until recording starts
             self.start_recording_event.wait()
 
         # If recording is ongoing, wait for voice inactivity to finish recording.
         if self.is_recording:
             self.stop_recording_on_voice_deactivity = True
 
-            # wait until recording stops
+            # Wait until recording stops
             self.stop_recording_event.wait()
 
         # Convert recorded frames to the appropriate audio format.
@@ -435,8 +484,25 @@ class AudioToTextRecorder:
         self._set_state("inactive")
 
 
-
     def transcribe(self):
+        """
+        Transcribes audio captured by this class instance using the `faster_whisper` model.
+
+        Automatically starts recording upon voice activity if not manually started using `recorder.start()`.
+        Automatically stops recording upon voice deactivity if not manually stopped with `recorder.stop()`.
+        Processes the recorded audio to generate transcription.
+
+        Returns:
+            str: The transcription of the recorded audio.
+
+        Raises:
+            Exception: If there is an error during the transcription process.
+        """        
         self._set_state("transcribing")
         self.parent_transcription_pipe.send((self.audio, self.language))
         status, result = self.parent_transcription_pipe.recv()
@@ -470,6 +536,9 @@ class AudioToTextRecorder:
 
         self.wait_audio()
 
+        if self.is_shut_down:
+            return ""
+
         if on_transcription_finished:
             threading.Thread(target=on_transcription_finished, args=(self.transcribe(),)).start()
         else:
@@ -537,26 +606,281 @@ class AudioToTextRecorder:
         Safely shuts down the audio recording by stopping the recording worker and closing the audio stream.
         """
 
-        self.parent_transcription_pipe.close()
-        self.process.terminate()
 
+        # Force wait_audio() and text() to exit
+        self.is_shut_down = True
+        self.start_recording_event.set()
+        self.stop_recording_event.set()
+
+        self.shutdown_event.set()
         self.is_recording = False
         self.is_running = False
 
+        logging.debug('Finishing recording thread')
         if self.recording_thread:
             self.recording_thread.join()
+
+        logging.debug('Terminating reader process')
+        # Give it some time to finish the loop and cleanup.
+        self.reader_process.join(timeout=10) 
+
+        if self.reader_process.is_alive():
+            logging.warning("Reader process did not terminate in time. Terminating forcefully.")
+            self.reader_process.terminate()
+        
+        logging.debug('Terminating transcription process')
+        self.transcript_process.join(timeout=10) 
+
+        if self.transcript_process.is_alive():
+            logging.warning("Transcript process did not terminate in time. Terminating forcefully.")
+            self.transcript_process.terminate()
+
+        self.parent_transcription_pipe.close()
+
+        logging.debug('Finishing realtime thread')
         if self.realtime_thread:
             self.realtime_thread.join()
 
+
+
+
+
+
+    def _recording_worker(self):
+        """
+        The main worker method which constantly monitors the audio input for voice activity and accordingly starts/stops the recording.
+        """
+
+        logging.debug('Starting recording worker')
+
         try:
-            if self.stream:
-                self.stream.stop_stream()
-                self.stream.close()
-            if self.audio_interface:
-                self.audio_interface.terminate()
+            was_recording = False
+            delay_was_passed = False
+
+            # Continuously monitor audio for voice activity
+            while self.is_running:
+
+                data = self.audio_queue.get()
+
+                # Handle queue overflow
+                queue_overflow_logged = False
+                while self.audio_queue.qsize() > self.allowed_latency_limit:
+                    if not queue_overflow_logged:
+                        logging.warning(f"Audio queue size exceeds latency limit. Current size: {self.audio_queue.qsize()}. Discarding old audio chunks.")
+                        queue_overflow_logged = True
+                    data = self.audio_queue.get()
+
+                if not self.is_recording:
+                    # Handle not recording state
+
+                    time_since_listen_start = time.time() - self.listen_start if self.listen_start else 0
+                    wake_word_activation_delay_passed = (time_since_listen_start > self.wake_word_activation_delay)
+
+                    # Handle wake-word timeout callback
+                    if wake_word_activation_delay_passed and not delay_was_passed:
+                        if self.wake_words and self.wake_word_activation_delay:
+                            if self.on_wakeword_timeout:
+                                self.on_wakeword_timeout()
+                    delay_was_passed = wake_word_activation_delay_passed
+
+                    # Set state and spinner text 
+                    if not self.recording_stop_time:
+                        if self.wake_words and wake_word_activation_delay_passed and not self.wakeword_detected:
+                            self._set_state("wakeword")
+                        else:
+                            if self.listen_start:
+                                self._set_state("listening")
+                            else:
+                                self._set_state("inactive")
+
+                    # Detect wake words if applicable
+                    if self.wake_words and wake_word_activation_delay_passed:
+                        try:
+                            pcm = struct.unpack_from("h" * self.buffer_size, data)
+                            wakeword_index = self.porcupine.process(pcm)
+
+                        except struct.error:
+                            logging.error("Error unpacking audio data for wake word processing.")
+                            continue
+                        
+                        except Exception as e:
+                            logging.error(f"Wake word processing error: {e}")
+                            continue
+                        
+                        # If a wake word is detected
+                        if wakeword_index >= 0:
+
+                            # Removing the wake word from the recording
+                            samples_for_0_1_sec = int(self.sample_rate * 0.1)
+                            start_index = max(0, len(self.audio_buffer) - samples_for_0_1_sec)
+                            temp_samples = collections.deque(itertools.islice(self.audio_buffer, start_index, None))
+                            self.audio_buffer.clear()
+                            self.audio_buffer.extend(temp_samples)
+
+                            self.wake_word_detect_time = time.time()
+                            self.wakeword_detected = True
+                            if self.on_wakeword_detected:
+                                self.on_wakeword_detected()
+
+                    # Check for voice activity to trigger the start of recording
+                    if ((not self.wake_words or not wake_word_activation_delay_passed) and self.start_recording_on_voice_activity) or self.wakeword_detected:
+
+                        if self._is_voice_active():
+                            logging.info("voice activity detected")
+
+                            self.start()
+
+                            if self.is_recording:
+                                self.start_recording_on_voice_activity = False
+
+                                # Add the buffered audio to the recording frames
+                                self.frames.extend(list(self.audio_buffer))
+                                self.audio_buffer.clear()
+
+                            self.silero_vad_model.reset_states()
+                        else:
+                            data_copy = data[:]
+                            self._check_voice_activity(data_copy)
+
+                    self.speech_end_silence_start = 0
+
+                else:
+                    # If we are currently recording
+
+                    # Stop the recording if silence is detected after speech
+                    if self.stop_recording_on_voice_deactivity:
+
+                        if not self._is_webrtc_speech(data, True):
+
+                            # Voice deactivity was detected, so we start measuring silence time before stopping recording
+                            if self.speech_end_silence_start == 0:
+                                self.speech_end_silence_start = time.time()
+                            
+                        else:
+                            self.speech_end_silence_start = 0
+
+                        # Wait for silence to stop recording after speech
+                        if self.speech_end_silence_start and time.time() - self.speech_end_silence_start > self.post_speech_silence_duration:
+                            logging.info("voice deactivity detected")
+                            self.stop()
+
+
+                if not self.is_recording and was_recording:
+                    # Reset after stopping recording to ensure clean state
+                    self.stop_recording_on_voice_deactivity = False
+
+                if time.time() - self.silero_check_time > 0.1:
+                    self.silero_check_time = 0
+                
+                # Handle wake word timeout (waited too long to initiate speech after wake word detection)
+                if self.wake_word_detect_time and time.time() - self.wake_word_detect_time > self.wake_word_timeout:
+                    self.wake_word_detect_time = 0
+                    if self.wakeword_detected and self.on_wakeword_timeout:
+                        self.on_wakeword_timeout()
+                    self.wakeword_detected = False
+
+                was_recording = self.is_recording
+
+
+                if self.is_recording:
+                    self.frames.append(data)
+
+                if not self.is_recording or self.speech_end_silence_start:
+                    self.audio_buffer.append(data)	
+
 
         except Exception as e:
-            logging.error(f"Error closing the audio stream: {e}")
+            logging.error(f"Unhandled exception in _recording_worker: {e}")
+            raise
+
+
+    def _realtime_worker(self):
+        """
+        Performs real-time transcription if the feature is enabled.
+
+        The method is responsible for transcribing recorded audio frames in real-time
+        based on the specified resolution interval.
+        The transcribed text is stored in `self.realtime_transcription_text` and a callback
+        function is invoked with this text if specified.
+        """
+
+        try:
+
+            logging.debug('Starting realtime worker')
+
+            # Return immediately if real-time transcription is not enabled
+            if not self.enable_realtime_transcription:
+                return
+                
+            # Continue running as long as the main process is active
+            while self.is_running:
+
+                # Check if the recording is active
+                if self.is_recording:
+                    
+                    # Sleep for the duration of the transcription resolution
+                    time.sleep(self.realtime_processing_pause)
+                    
+                    # Convert the buffer frames to a NumPy array
+                    audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+                    
+                    # Normalize the array to a [-1, 1] range
+                    audio_array = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+
+                    # Perform transcription and assemble the text
+                    segments = self.realtime_model_type.transcribe(
+                        audio_array,
+                        language=self.language if self.language else None
+                    )
+
+                    # Double-check the recording state because it could have changed mid-transcription
+                    if self.is_recording and time.time() - self.recording_start_time > 0.5:
+
+                        logging.debug('Starting realtime transcription')
+                        self.realtime_transcription_text = " ".join(seg.text for seg in segments[0]).strip()
+
+                        self.text_storage.append(self.realtime_transcription_text)
+
+                        # Take the last two texts in storage, if they exist
+                        if len(self.text_storage) >= 2:
+                            last_two_texts = self.text_storage[-2:]
+                            
+                            # Find the longest common prefix between the two texts
+                            prefix = os.path.commonprefix([last_two_texts[0], last_two_texts[1]])
+
+                            # This prefix is the text that was transcribed the same way twice in a row
+                            # Store it as "safely detected text"
+                            if len(prefix) >= len(self.realtime_stabilized_safetext):
+                                # Only store it when it is at least as long as the previous one, as an additional safeguard
+                                self.realtime_stabilized_safetext = prefix
+
+                        # Find parts of the stabilized text in the freshly transcribed text
+                        matching_position = self._find_tail_match_in_text(self.realtime_stabilized_safetext, self.realtime_transcription_text)
+                        if matching_position < 0:
+                            if self.realtime_stabilized_safetext:
+                                self._on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_stabilized_safetext, True))
+                            else:
+                                self._on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_transcription_text, True))
+                        else:
+                            # We found parts of the stabilized text in the transcribed text
+                            # We now take the stabilized text and add only the freshly transcribed part to it
+                            output_text = self.realtime_stabilized_safetext + self.realtime_transcription_text[matching_position:]
+
+                            # This yields the "left" part of the text as stabilized while also delivering freshly
+                            # detected parts on the first run, without the need for two transcriptions
+                            self._on_realtime_transcription_stabilized(self._preprocess_output(output_text, True))
+
+                        # Invoke the callback with the transcribed text
+                        self._on_realtime_transcription_update(self._preprocess_output(self.realtime_transcription_text, True))
+
+
+                # If not recording, sleep briefly before checking again
+                else:
+                    time.sleep(TIME_SLEEP)
+
+        except Exception as e:
+            logging.error(f"Unhandled exception in _realtime_worker: {e}")
+            raise
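To make the prefix-based stabilization above concrete, a small standalone sketch (the sample strings are invented):

```python
import os

# Two consecutive realtime transcriptions of the same (still growing) audio
previous = "The quick brown fox jumps"
current = "The quick brown fox jumped over"

# The longest common prefix is the part that was transcribed the same way
# twice in a row, so it is treated as "safely detected text".
stable = os.path.commonprefix([previous, current])
print(stable)  # "The quick brown fox jump"
```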
 
 
     def _is_silero_speech(self, data):
@@ -705,143 +1029,6 @@ class AudioToTextRecorder:
                 self.halo.text = text
 
 
-    def _recording_worker(self):
-        """
-        The main worker method which constantly monitors the audio input for voice activity and accordingly starts/stops the recording.
-        """
-
-        logging.debug('Starting recording worker')
-
-        try:
-            was_recording = False
-            delay_was_passed = False
-
-            # Continuously monitor audio for voice activity
-            while self.is_running:
-
-                data = self.audio_queue.get()
-
-                if not self.is_recording:
-                    # handle not recording state
-
-                    time_since_listen_start = time.time() - self.listen_start if self.listen_start else 0
-                    wake_word_activation_delay_passed = (time_since_listen_start > self.wake_word_activation_delay)
-
-                    # handle wake-word timeout callback
-                    if wake_word_activation_delay_passed and not delay_was_passed:
-                        if self.wake_words and self.wake_word_activation_delay:
-                            if self.on_wakeword_timeout:
-                                self.on_wakeword_timeout()
-                    delay_was_passed = wake_word_activation_delay_passed
-
-                    # Set state and spinner text 
-                    if not self.recording_stop_time:
-                        if self.wake_words and wake_word_activation_delay_passed and not self.wakeword_detected:
-                            self._set_state("wakeword")
-                        else:
-                            if self.listen_start:
-                                self._set_state("listening")
-                            else:
-                                self._set_state("inactive")
-
-                    # Detect wake words if applicable
-                    if self.wake_words and wake_word_activation_delay_passed:
-                        try:
-                            pcm = struct.unpack_from("h" * self.buffer_size, data)
-                            wakeword_index = self.porcupine.process(pcm)
-
-                        except struct.error:
-                            logging.error("Error unpacking audio data for wake word processing.")
-                            continue
-                        
-                        except Exception as e:
-                            logging.error(f"Wake word processing error: {e}")
-                            continue
-                        
-                        # If a wake word is detected
-                        if wakeword_index >= 0:
-
-                            # Removing the wake word from the recording
-                            samples_for_0_1_sec = int(self.sample_rate * 0.1)
-                            start_index = max(0, len(self.audio_buffer) - samples_for_0_1_sec)
-                            temp_samples = collections.deque(itertools.islice(self.audio_buffer, start_index, None))
-                            self.audio_buffer.clear()
-                            self.audio_buffer.extend(temp_samples)
-
-                            self.wake_word_detect_time = time.time()
-                            self.wakeword_detected = True
-                            if self.on_wakeword_detected:
-                                self.on_wakeword_detected()
-
-                    # Check for voice activity to trigger the start of recording
-                    if ((not self.wake_words or not wake_word_activation_delay_passed) and self.start_recording_on_voice_activity) or self.wakeword_detected:
-
-                        if self._is_voice_active():
-                            logging.info("voice activity detected")
-
-                            self.start()
-
-                            if self.is_recording:
-                                self.start_recording_on_voice_activity = False
-
-                                # Add the buffered audio to the recording frames
-                                self.frames.extend(list(self.audio_buffer))
-                                self.audio_buffer.clear()
-
-                            self.silero_vad_model.reset_states()
-                        else:
-                            data_copy = data[:]
-                            self._check_voice_activity(data_copy)
-
-                    self.speech_end_silence_start = 0
-
-                else:
-                    # If we are currently recording
-
-                    # Stop the recording if silence is detected after speech
-                    if self.stop_recording_on_voice_deactivity:
-
-                        if not self._is_webrtc_speech(data, True):
-
-                            # Voice deactivity was detected, so we start measuring silence time before stopping recording
-                            if self.speech_end_silence_start == 0:
-                                self.speech_end_silence_start = time.time()
-                            
-                        else:
-                            self.speech_end_silence_start = 0
-
-                        # Wait for silence to stop recording after speech
-                        if self.speech_end_silence_start and time.time() - self.speech_end_silence_start > self.post_speech_silence_duration:
-                            logging.info("voice deactivity detected")
-                            self.stop()
-
-                if not self.is_recording and was_recording:
-                    # Reset after stopping recording to ensure clean state
-                    self.stop_recording_on_voice_deactivity = False
-
-                if time.time() - self.silero_check_time > 0.1:
-                    self.silero_check_time = 0
-                
-                # handle wake word timeout (waited to long initiating speech after wake word detection)
-                if self.wake_word_detect_time and time.time() - self.wake_word_detect_time > self.wake_word_timeout:
-                    self.wake_word_detect_time = 0
-                    if self.wakeword_detected and self.on_wakeword_timeout:
-                        self.on_wakeword_timeout()
-                    self.wakeword_detected = False
-
-                if self.is_recording:
-                    self.frames.append(data)
-
-                if not self.is_recording or self.speech_end_silence_start:
-                    self.audio_buffer.append(data)	
-
-                was_recording = self.is_recording
-
-        except Exception as e:
-            logging.error(f"Unhandled exeption in _recording_worker: {e}")
-            raise
-
-
     def _preprocess_output(self, text, preview=False):
         """
         Preprocesses the output text by removing any leading or trailing whitespace,
@@ -869,7 +1056,7 @@ class AudioToTextRecorder:
         return text
 
 
-    def find_tail_match_in_text(self, text1, text2, length_of_match=10):
+    def _find_tail_match_in_text(self, text1, text2, length_of_match=10):
         """
         Find the position where the last 'n' characters of text1 match with a substring in text2.
         
@@ -905,106 +1092,68 @@ class AudioToTextRecorder:
         
         return -1
     
+
     def _on_realtime_transcription_stabilized(self, text):
+        """
+        Callback method invoked when the real-time transcription stabilizes.
+
+        This method is called internally when the transcription text is considered "stable",
+        meaning it's less likely to change significantly with additional audio input. It 
+        notifies any registered external listener about the stabilized text if recording is 
+        still ongoing. This is particularly useful for applications that need to display 
+        live transcription results to users and want to highlight parts of the transcription 
+        that are less likely to change.
+
+        Args:
+            text (str): The stabilized transcription text.
+        """        
         if self.on_realtime_transcription_stabilized:
             if self.is_recording:
                 self.on_realtime_transcription_stabilized(text)
 
+
     def _on_realtime_transcription_update(self, text):
+        """
+        Callback method invoked when there's an update in the real-time transcription.
+
+        This method is called internally whenever there's a change in the transcription text,
+        notifying any registered external listener about the update if recording is still 
+        ongoing. This provides a mechanism for applications to receive and possibly display 
+        live transcription updates, which could be partial and still subject to change.
+
+        Args:
+            text (str): The updated transcription text.
+        """        
         if self.on_realtime_transcription_update:
             if self.is_recording:
                 self.on_realtime_transcription_update(text)
 
-    def _realtime_worker(self):
-        """
-        Performs real-time transcription if the feature is enabled.
 
-        The method is responsible transcribing recorded audio frames in real-time
-         based on the specified resolution interval.
-        The transcribed text is stored in `self.realtime_transcription_text` and a callback
-        function is invoked with this text if specified.
+    def __enter__(self):
         """
+        Method to set up the context manager protocol.
 
-        try:
+        This enables the instance to be used in a `with` statement, ensuring proper 
+        resource management. When the `with` block is entered, this method is 
+        automatically called.
 
-            logging.debug('Starting realtime worker')
-
-            # Return immediately if real-time transcription is not enabled
-            if not self.enable_realtime_transcription:
-                return
-                
-            # Continue running as long as the main process is active
-            while self.is_running:
-
-                # Check if the recording is active
-                if self.is_recording:
-                    
-                    # Sleep for the duration of the transcription resolution
-                    time.sleep(self.realtime_processing_pause)
-                    
-                    # Convert the buffer frames to a NumPy array
-                    audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
-                    
-                    # Normalize the array to a [-1, 1] range
-                    audio_array = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
-
-                    # Perform transcription and assemble the text
-                    segments = self.realtime_model_type.transcribe(
-                        audio_array,
-                        language=self.language if self.language else None
-                    )
-
-                    # double check recording state because it could have changed mid-transcription
-                    if self.is_recording and time.time() - self.recording_start_time > 0.5:
-
-                        logging.debug('Starting realtime transcription')
-                        self.realtime_transcription_text = " ".join(seg.text for seg in segments[0]).strip()
-
-                        self.text_storage.append(self.realtime_transcription_text)
-
-                        # Take the last two texts in storage, if they exist
-                        if len(self.text_storage) >= 2:
-                            last_two_texts = self.text_storage[-2:]
-                            
-                            # Find the longest common prefix between the two texts
-                            prefix = os.path.commonprefix([last_two_texts[0], last_two_texts[1]])
-
-                            # This prefix is the text that was transcripted two times in the same way
-                            # Store as "safely detected text" 
-                            if len(prefix) >= len(self.realtime_stabilized_safetext):
-                                # Only store when longer than the previous as additional security 
-                                self.realtime_stabilized_safetext = prefix
-
-                        # Find parts of the stabilized text in the freshly transscripted text
-                        matching_position = self.find_tail_match_in_text(self.realtime_stabilized_safetext, self.realtime_transcription_text)
-                        if matching_position < 0:
-                            if self.realtime_stabilized_safetext:
-                                self._on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_stabilized_safetext, True))
-                            else:
-                                self._on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_transcription_text, True))
-                        else:
-                            # We found parts of the stabilized text in the transcripted text
-                            # We now take the stabilized text and add only the freshly transcripted part to it
-                            output_text = self.realtime_stabilized_safetext + self.realtime_transcription_text[matching_position:]
-
-                            # This yields us the "left" text part as stabilized AND at the same time delivers fresh detected parts 
-                            # on the first run without the need for two transcriptions
-                            self._on_realtime_transcription_stabilized(self._preprocess_output(output_text, True))
-
-                        # Invoke the callback with the transcribed text
-                        self._on_realtime_transcription_update(self._preprocess_output(self.realtime_transcription_text, True))
+        Returns:
+            self: The current instance of the class.
+        """
+        return self
 
 
-                # If not recording, sleep briefly before checking again
-                else:
-                    time.sleep(TIME_SLEEP)
+    def __exit__(self, exc_type, exc_value, traceback):
+        """
+        Method to define behavior when the context manager protocol exits.
 
-        except Exception as e:
-            logging.error(f"Unhandled exeption in _realtime_worker: {e}")
-            raise
+        This is called when exiting the `with` block and ensures that any necessary 
+        cleanup or resource release processes are executed, such as shutting down 
+        the system properly.
 
-    def __del__(self):
-        """
-        Destructor method ensures safe shutdown of the recorder when the instance is destroyed.
+        Args:
+            exc_type (Exception or None): The type of the exception that caused the context to be exited, if any.
+            exc_value (Exception or None): The exception instance that caused the context to be exited, if any.
+            traceback (Traceback or None): The traceback corresponding to the exception, if any.
         """
         self.shutdown()
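A small usage sketch of the behavior described above: `__exit__` calls `shutdown()` even when the `with` block raises (the exception here is only for illustration):

```python
from RealtimeSTT import AudioToTextRecorder

if __name__ == '__main__':
    try:
        with AudioToTextRecorder() as recorder:
            raise RuntimeError("simulated failure")
    except RuntimeError:
        # __exit__ has already called recorder.shutdown(), so the worker
        # processes and the audio stream are cleaned up at this point.
        pass
```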

+ 475 - 467
example_app/ui_openai_voice_interface.py

@@ -1,514 +1,522 @@
-from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
-from RealtimeSTT import AudioToTextRecorder
-
-from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
-from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont, QMouseEvent, QContextMenuEvent
-from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget, QMenu, QAction
-
-import os
-import openai
-import sys
-import time
-import sounddevice as sd
-import numpy as np
-import wavio
-import keyboard
-
-max_history_messages = 6
-return_to_wakewords_after_silence = 12
-start_with_wakeword = False
-start_engine = "Azure" # Azure, Elevenlabs
-recorder_model = "large-v2"
-language = "en"
-azure_speech_region = "eastus"
-openai_model = "gpt-3.5-turbo" # gpt-3.5-turbo, gpt-4, gpt-3.5-turbo-0613 / gpt-3.5-turbo-16k-0613 / gpt-4-0613 / gpt-4-32k-0613
-
-openai.api_key = os.environ.get("OPENAI_API_KEY")
-
-user_font_size = 22
-user_color = QColor(0, 188, 242) # turquoise
-
-assistant_font_size = 24
-assistant_color = QColor(239, 98, 166) # pink
-
-voice_azure = "en-GB-SoniaNeural"
-voice_system = "Zira"
-#voice_system = "Hazel"
-prompt = "Be concise, polite, and casual with a touch of sass. Aim for short, direct responses, as if we're talking."
-elevenlabs_model = "eleven_monolingual_v1"
-
-if language == "de":
-    elevenlabs_model = "eleven_multilingual_v1"
-    voice_system = "Katja"
-    voice_azure = "de-DE-MajaNeural"
-    prompt = 'Sei präzise, höflich und locker, mit einer Prise Schlagfertigkeit. Antworte kurz und direkt, als ob wir gerade sprechen.'
-    
-print ("Click the top right corner to change the engine")
-print ("Press ESC to stop the current playback")
-
-system_prompt_message = {
-    'role': 'system',
-    'content': prompt
-}
-
-def generate_response(messages):
-    """Generate assistant's response using OpenAI."""
-    for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True, logit_bias={35309:-100, 36661:-100}):
-        text_chunk = chunk["choices"][0]["delta"].get("content")
-        if text_chunk:
-            yield text_chunk
-
-history = []
-MAX_WINDOW_WIDTH = 1600
-MAX_WIDTH_ASSISTANT = 1200
-MAX_WIDTH_USER = 1500
-
-class AudioPlayer(QThread):
-    def __init__(self, file_path):
-        super(AudioPlayer, self).__init__()
-        self.file_path = file_path
-
-    def run(self):
-        wav = wavio.read(self.file_path)
-        sound = wav.data.astype(np.float32) / np.iinfo(np.int16).max  
-        sd.play(sound, wav.rate)
-        sd.wait()
-
-class TextRetrievalThread(QThread):
-    textRetrieved = pyqtSignal(str)
-
-    def __init__(self, recorder):
-        super().__init__()
-        self.recorder = recorder
-        self.active = False  
-
-    def run(self):
-        while True:
-            if self.active:  
-                text = self.recorder.text()
-                self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
-                self.textRetrieved.emit(text)
-                self.active = False
-            time.sleep(0.1) 
-
-    def activate(self):
-        self.active = True 
-
-class TransparentWindow(QWidget):
-    updateUI = pyqtSignal()
-    clearAssistantTextSignal = pyqtSignal()
-    clearUserTextSignal = pyqtSignal()
-
-    def __init__(self):
-        super().__init__()
-
-        self.setGeometry(1, 1, 1, 1) 
-
-        self.setWindowTitle("Transparent Window")
-        self.setAttribute(Qt.WA_TranslucentBackground)
-        self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)
-
-        self.big_symbol_font = QFont('Arial', 32)
-        self.small_symbol_font = QFont('Arial', 17)
-        self.user_font = QFont('Arial', user_font_size)
-        self.assistant_font = QFont('Arial', assistant_font_size)      
-        self.assistant_font.setItalic(True) 
-
-        self.big_symbol_text = ""
-        self.small_symbol_text = ""
-        self.user_text = ""
-        self.assistant_text = ""
-        self.displayed_user_text = ""
-        self.displayed_assistant_text = ""
-        self.stream = None
-        self.text_retrieval_thread = None
-
-        self.user_text_timer = QTimer(self)
-        self.assistant_text_timer = QTimer(self)
-        self.user_text_timer.timeout.connect(self.clear_user_text)
-        self.assistant_text_timer.timeout.connect(self.clear_assistant_text)
-
-        self.clearUserTextSignal.connect(self.init_clear_user_text)
-        self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)
-        self.user_text_opacity = 255 
-        self.assistant_text_opacity = 255 
-        self.updateUI.connect(self.update_self)
-        self.audio_player = None
-
-        self.run_fade_user = False
-        self.run_fade_assistant = False
-
-        self.menu = QMenu()
-        self.menu.setStyleSheet("""
-            QMenu {
-                background-color: black;
-                color: white;
-                border-radius: 10px;
-            }
-            QMenu::item:selected {
-                background-color: #555555;
-            }
-            """)
-
-        self.elevenlabs_action = QAction("Elevenlabs", self)
-        self.azure_action = QAction("Azure", self)
-        self.system_action = QAction("System", self)
-        self.quit_action = QAction("Quit", self)
-
-        self.menu.addAction(self.elevenlabs_action)
-        self.menu.addAction(self.azure_action)
-        self.menu.addAction(self.system_action)
-        self.menu.addSeparator() 
-        self.menu.addAction(self.quit_action)
-
-        self.elevenlabs_action.triggered.connect(lambda: self.select_engine("Elevenlabs"))
-        self.azure_action.triggered.connect(lambda: self.select_engine("Azure"))
-        self.system_action.triggered.connect(lambda: self.select_engine("System"))
-        self.quit_action.triggered.connect(self.close_application)
-
-    def mousePressEvent(self, event: QMouseEvent):
-        if event.button() == Qt.LeftButton:
-            if event.pos().x() >= self.width() - 100 and event.pos().y() <= 100:
-                self.menu.exec_(self.mapToGlobal(event.pos()))        
-
-    def close_application(self):
-        QApplication.quit()                
-
-    def init(self):
-
-        self.select_engine(start_engine)
-
-        # recorder = AudioToTextRecorder(spinner=False, model="large-v2", language="de", on_recording_start=recording_start, silero_sensitivity=0.4, post_speech_silence_duration=0.4, min_length_of_recording=0.3, min_gap_between_recordings=0.01, realtime_preview_resolution = 0.01, realtime_preview = True, realtime_preview_model = "small", on_realtime_preview=text_detected)
-
-        self.recorder = AudioToTextRecorder(
-            model=recorder_model,
-            language=language,
-            wake_words="Jarvis",
-            spinner=True,
-            silero_sensitivity=0.2,
-            webrtc_sensitivity=3,
-            on_recording_start=self.on_recording_start,
-            on_vad_detect_start=self.on_vad_detect_start,
-            on_wakeword_detection_start=self.on_wakeword_detection_start,
-            on_transcription_start=self.on_transcription_start,
-            post_speech_silence_duration=0.4, 
-            min_length_of_recording=0.3, 
-            min_gap_between_recordings=0.01, 
-            enable_realtime_transcription = True,
-            realtime_processing_pause = 0.01, 
-            realtime_model_type = "tiny",
-            on_realtime_transcription_stabilized=self.text_detected
-        )
-        if not start_with_wakeword:
-            self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
-            
-        self.text_retrieval_thread = TextRetrievalThread(self.recorder)
-        self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
-        self.text_retrieval_thread.start()
-        self.text_retrieval_thread.activate()
-
-        keyboard.on_press_key('esc', self.on_escape)
+if __name__ == '__main__':
 
-    def select_engine(self, engine_name):
-        if self.stream:
-            self.stream.stop()
+    from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
+    from RealtimeSTT import AudioToTextRecorder
+
+    from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
+    from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont, QMouseEvent, QContextMenuEvent
+    from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget, QMenu, QAction
+
+    import os
+    import openai
+    import sys
+    import time
+    import sounddevice as sd
+    import numpy as np
+    import wavio
+    import keyboard
+
+    max_history_messages = 6
+    return_to_wakewords_after_silence = 12
+    start_with_wakeword = False
+    start_engine = "Azure" # Azure, Elevenlabs
+    recorder_model = "large-v2"
+    language = "en"
+    azure_speech_region = "eastus"
+    openai_model = "gpt-3.5-turbo" # gpt-3.5-turbo, gpt-4, gpt-3.5-turbo-0613 / gpt-3.5-turbo-16k-0613 / gpt-4-0613 / gpt-4-32k-0613
+
+    openai.api_key = os.environ.get("OPENAI_API_KEY")
+
+    user_font_size = 22
+    user_color = QColor(0, 188, 242) # turquoise
+
+    assistant_font_size = 24
+    assistant_color = QColor(239, 98, 166) # pink
+
+    voice_azure = "en-GB-SoniaNeural"
+    voice_system = "Zira"
+    #voice_system = "Hazel"
+    prompt = "Be concise, polite, and casual with a touch of sass. Aim for short, direct responses, as if we're talking."
+    elevenlabs_model = "eleven_monolingual_v1"
+
+    if language == "de":
+        elevenlabs_model = "eleven_multilingual_v1"
+        voice_system = "Katja"
+        voice_azure = "de-DE-MajaNeural"
+        prompt = 'Sei präzise, höflich und locker, mit einer Prise Schlagfertigkeit. Antworte kurz und direkt, als ob wir gerade sprechen.'
+        
+    print ("Click the top right corner to change the engine")
+    print ("Press ESC to stop the current playback")
+
+    system_prompt_message = {
+        'role': 'system',
+        'content': prompt
+    }
+
+    def generate_response(messages):
+        """Generate assistant's response using OpenAI."""
+        for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True, logit_bias={35309:-100, 36661:-100}):
+            text_chunk = chunk["choices"][0]["delta"].get("content")
+            if text_chunk:
+                yield text_chunk
+
+    history = []
+    MAX_WINDOW_WIDTH = 1600
+    MAX_WIDTH_ASSISTANT = 1200
+    MAX_WIDTH_USER = 1500
+
+    class AudioPlayer(QThread):
+        def __init__(self, file_path):
+            super(AudioPlayer, self).__init__()
+            self.file_path = file_path
+
+        def run(self):
+            wav = wavio.read(self.file_path)
+            sound = wav.data.astype(np.float32) / np.iinfo(np.int16).max  
+            sd.play(sound, wav.rate)
+            sd.wait()
+
+    class TextRetrievalThread(QThread):
+        textRetrieved = pyqtSignal(str)
+
+        def __init__(self, recorder):
+            super().__init__()
+            self.recorder = recorder
+            self.active = False  
+
+        def run(self):
+            while True:
+                if self.active:  
+                    text = self.recorder.text()
+                    self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
+                    self.textRetrieved.emit(text)
+                    self.active = False
+                time.sleep(0.1) 
+
+        def activate(self):
+            self.active = True 
+
+    class TransparentWindow(QWidget):
+        updateUI = pyqtSignal()
+        clearAssistantTextSignal = pyqtSignal()
+        clearUserTextSignal = pyqtSignal()
+
+        def __init__(self):
+            super().__init__()
+
+            self.setGeometry(1, 1, 1, 1) 
+
+            self.setWindowTitle("Transparent Window")
+            self.setAttribute(Qt.WA_TranslucentBackground)
+            self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)
+
+            self.big_symbol_font = QFont('Arial', 32)
+            self.small_symbol_font = QFont('Arial', 17)
+            self.user_font = QFont('Arial', user_font_size)
+            self.assistant_font = QFont('Arial', assistant_font_size)      
+            self.assistant_font.setItalic(True) 
+
+            self.big_symbol_text = ""
+            self.small_symbol_text = ""
+            self.user_text = ""
+            self.assistant_text = ""
+            self.displayed_user_text = ""
+            self.displayed_assistant_text = ""
             self.stream = None
+            self.text_retrieval_thread = None
+            self.recorder = None
 
-        engine = None
+            self.user_text_timer = QTimer(self)
+            self.assistant_text_timer = QTimer(self)
+            self.user_text_timer.timeout.connect(self.clear_user_text)
+            self.assistant_text_timer.timeout.connect(self.clear_assistant_text)
 
-        if engine_name == "Azure":
-            engine = AzureEngine(
-                    os.environ.get("AZURE_SPEECH_KEY"),
-                    os.environ.get("AZURE_SPEECH_REGION"),
-                    voice_azure,
-                    rate=24,
-                    pitch=10,
-                )
+            self.clearUserTextSignal.connect(self.init_clear_user_text)
+            self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)
+            self.user_text_opacity = 255 
+            self.assistant_text_opacity = 255 
+            self.updateUI.connect(self.update_self)
+            self.audio_player = None
+
+            self.run_fade_user = False
+            self.run_fade_assistant = False
+
+            self.menu = QMenu()
+            self.menu.setStyleSheet("""
+                QMenu {
+                    background-color: black;
+                    color: white;
+                    border-radius: 10px;
+                }
+                QMenu::item:selected {
+                    background-color: #555555;
+                }
+                """)
+
+            self.elevenlabs_action = QAction("Elevenlabs", self)
+            self.azure_action = QAction("Azure", self)
+            self.system_action = QAction("System", self)
+            self.quit_action = QAction("Quit", self)
+
+            self.menu.addAction(self.elevenlabs_action)
+            self.menu.addAction(self.azure_action)
+            self.menu.addAction(self.system_action)
+            self.menu.addSeparator() 
+            self.menu.addAction(self.quit_action)
+
+            self.elevenlabs_action.triggered.connect(lambda: self.select_engine("Elevenlabs"))
+            self.azure_action.triggered.connect(lambda: self.select_engine("Azure"))
+            self.system_action.triggered.connect(lambda: self.select_engine("System"))
+            self.quit_action.triggered.connect(self.close_application)
+
+        def mousePressEvent(self, event: QMouseEvent):
+            if event.button() == Qt.LeftButton:
+                if event.pos().x() >= self.width() - 100 and event.pos().y() <= 100:
+                    self.menu.exec_(self.mapToGlobal(event.pos()))        
+
+        def close_application(self):
+            if self.recorder:
+                self.recorder.shutdown()                    
+            QApplication.quit()                
+
+        def init(self):
+
+            self.select_engine(start_engine)
+
+            self.recorder = AudioToTextRecorder(
+                model=recorder_model,
+                language=language,
+                wake_words="Jarvis",
+                silero_use_onnx=False,
+                spinner=True,
+                silero_sensitivity=0.2,
+                webrtc_sensitivity=3,
+                on_recording_start=self.on_recording_start,
+                on_vad_detect_start=self.on_vad_detect_start,
+                on_wakeword_detection_start=self.on_wakeword_detection_start,
+                on_transcription_start=self.on_transcription_start,
+                post_speech_silence_duration=0.4, 
+                min_length_of_recording=0.3, 
+                min_gap_between_recordings=0.01, 
+                enable_realtime_transcription = True,
+                realtime_processing_pause = 0.01, 
+                realtime_model_type = "tiny",
+                on_realtime_transcription_stabilized=self.text_detected
+            )
+            if not start_with_wakeword:
+                self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
+                
+            self.text_retrieval_thread = TextRetrievalThread(self.recorder)
+            self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
+            self.text_retrieval_thread.start()
+            self.text_retrieval_thread.activate()
 
-        elif engine_name == "Elevenlabs":
-            engine = ElevenlabsEngine(
-                    os.environ.get("ELEVENLABS_API_KEY"),
-                    model=elevenlabs_model
+            keyboard.on_press_key('esc', self.on_escape)
+
+        def closeEvent(self, event):
+            if self.recorder:
+                self.recorder.shutdown()            
+
+        def select_engine(self, engine_name):
+            if self.stream:
+                self.stream.stop()
+                self.stream = None
+
+            engine = None
+
+            if engine_name == "Azure":
+                engine = AzureEngine(
+                        os.environ.get("AZURE_SPEECH_KEY"),
+                        os.environ.get("AZURE_SPEECH_REGION"),
+                        voice_azure,
+                        rate=24,
+                        pitch=10,
+                    )
+
+            elif engine_name == "Elevenlabs":
+                engine = ElevenlabsEngine(
+                        os.environ.get("ELEVENLABS_API_KEY"),
+                        model=elevenlabs_model
+                    )
+            else:
+                engine = SystemEngine(
+                    voice=voice_system,
+                    #print_installed_voices=True
                 )
-        else:
-            engine = SystemEngine(
-                voice=voice_system,
-                #print_installed_voices=True
+
+            self.stream = TextToAudioStream(
+                engine,
+                on_character=self.on_character,
+                on_text_stream_stop=self.on_text_stream_stop,
+                on_text_stream_start=self.on_text_stream_start,
+                on_audio_stream_stop=self.on_audio_stream_stop,
+                log_characters=True
             )
+            sys.stdout.write('\033[K')  # Clear to the end of line
+            sys.stdout.write('\r')  # Move the cursor to the beginning of the line
+            print (f"Using {engine_name} engine")
 
-        self.stream = TextToAudioStream(
-            engine,
-            on_character=self.on_character,
-            on_text_stream_stop=self.on_text_stream_stop,
-            on_text_stream_start=self.on_text_stream_start,
-            on_audio_stream_stop=self.on_audio_stream_stop,
-            log_characters=True
-        )
-        sys.stdout.write('\033[K')  # Clear to the end of line
-        sys.stdout.write('\r')  # Move the cursor to the beginning of the line
-        print (f"Using {engine_name} engine")
-
-
-    def text_detected(self, text):
-        self.run_fade_user = False
-        if self.user_text_timer.isActive():
-            self.user_text_timer.stop()
-        self.user_text_opacity = 255 
-        self.user_text = text
-        self.updateUI.emit()
-
-    def on_escape(self, e):
-        if self.stream.is_playing():
-            self.stream.stop()
-
-    def showEvent(self, event: QEvent):
-        super().showEvent(event)
-        if event.type() == QEvent.Show:
-            self.set_symbols("⌛", "🚀")
-            QTimer.singleShot(1000, self.init) 
-
-    def on_character(self, char):
-        if self.stream:
-            self.assistant_text += char
+
+        def text_detected(self, text):
+            self.run_fade_user = False
+            if self.user_text_timer.isActive():
+                self.user_text_timer.stop()
+            self.user_text_opacity = 255 
+            self.user_text = text
             self.updateUI.emit()
 
-    def on_text_stream_stop(self):
-        print("\"", end="", flush=True)
-        if self.stream:
-            assistant_response = self.stream.text()            
-            self.assistant_text = assistant_response
-            history.append({'role': 'assistant', 'content': assistant_response})
+        def on_escape(self, e):
+            if self.stream.is_playing():
+                self.stream.stop()
 
-    def on_audio_stream_stop(self):
-        self.set_symbols("🎙️", "⚪")
+        def showEvent(self, event: QEvent):
+            super().showEvent(event)
+            if event.type() == QEvent.Show:
+                self.set_symbols("⌛", "🚀")
+                QTimer.singleShot(1000, self.init) 
 
-        if self.stream:
-            self.clearAssistantTextSignal.emit()
-            self.text_retrieval_thread.activate()
+        def on_character(self, char):
+            if self.stream:
+                self.assistant_text += char
+                self.updateUI.emit()
 
-    def generate_answer(self):
-        self.run_fade_assistant = False
-        if self.assistant_text_timer.isActive():
-            self.assistant_text_timer.stop()
+        def on_text_stream_stop(self):
+            print("\"", end="", flush=True)
+            if self.stream:
+                assistant_response = self.stream.text()            
+                self.assistant_text = assistant_response
+                history.append({'role': 'assistant', 'content': assistant_response})
 
-        history.append({'role': 'user', 'content': self.user_text})
-        self.remove_assistant_text()
-        assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
-        self.stream.feed(assistant_response)
-        self.stream.play_async(minimum_sentence_length=6,
-                               buffer_threshold_seconds=2)
+        def on_audio_stream_stop(self):
+            self.set_symbols("🎙️", "⚪")
 
-    def set_symbols(self, big_symbol, small_symbol):
-        self.big_symbol_text = big_symbol
-        self.small_symbol_text = small_symbol
-        self.updateUI.emit()
+            if self.stream:
+                self.clearAssistantTextSignal.emit()
+                self.text_retrieval_thread.activate()
 
-    def on_text_stream_start(self):
-        self.set_symbols("⌛", "👄")
+        def generate_answer(self):
+            self.run_fade_assistant = False
+            if self.assistant_text_timer.isActive():
+                self.assistant_text_timer.stop()
+
+            history.append({'role': 'user', 'content': self.user_text})
+            self.remove_assistant_text()
+            assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
+            self.stream.feed(assistant_response)
+            self.stream.play_async(minimum_sentence_length=6,
+                                buffer_threshold_seconds=2)
+
+        def set_symbols(self, big_symbol, small_symbol):
+            self.big_symbol_text = big_symbol
+            self.small_symbol_text = small_symbol
+            self.updateUI.emit()
 
-    def process_user_text(self, user_text):
-        user_text = user_text.strip()
-        if user_text:
-            self.run_fade_user = False
+        def on_text_stream_start(self):
+            self.set_symbols("⌛", "👄")
+
+        def process_user_text(self, user_text):
+            user_text = user_text.strip()
+            if user_text:
+                self.run_fade_user = False
+                if self.user_text_timer.isActive():
+                    self.user_text_timer.stop()
+
+                self.user_text_opacity = 255 
+                self.user_text = user_text
+                self.clearUserTextSignal.emit()
+                print (f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
+                self.set_symbols("⌛", "🧠")
+                QTimer.singleShot(100, self.generate_answer)
+
+        def on_transcription_start(self):
+            self.set_symbols("⌛", "📝")
+
+        def on_recording_start(self):
+            self.text_storage = []
+            self.ongoing_sentence = ""
+            self.set_symbols("🎙️", "🔴")
+
+        def on_vad_detect_start(self):
+            if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
+                self.audio_player = AudioPlayer("active.wav")
+                self.audio_player.start() 
+
+            self.set_symbols("🎙️", "⚪")
+
+        def on_wakeword_detection_start(self):
+            self.audio_player = AudioPlayer("inactive.wav")
+            self.audio_player.start()         
+
+            self.set_symbols("", "💤")
+
+        def init_clear_user_text(self):
             if self.user_text_timer.isActive():
-                self.user_text_timer.stop()
+                self.user_text_timer.stop()        
+            self.user_text_timer.start(10000)
 
+        def remove_user_text(self):
+            self.user_text = ""
             self.user_text_opacity = 255 
-            self.user_text = user_text
-            self.clearUserTextSignal.emit()
-            print (f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
-            self.set_symbols("⌛", "🧠")
-            QTimer.singleShot(100, self.generate_answer)
+            self.updateUI.emit()
 
-    def on_transcription_start(self):
-        self.set_symbols("⌛", "📝")
+        def fade_out_user_text(self):
+            if not self.run_fade_user:
+                return
 
-    def on_recording_start(self):
-        self.text_storage = []
-        self.ongoing_sentence = ""
-        self.set_symbols("🎙️", "🔴")
+            if self.user_text_opacity > 0:
+                self.user_text_opacity -= 5 
+                self.updateUI.emit()
+                QTimer.singleShot(50, self.fade_out_user_text)
+            else:
+                self.run_fade_user = False
+                self.remove_user_text()        
 
-    def on_vad_detect_start(self):
-        if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
-            self.audio_player = AudioPlayer("active.wav")
-            self.audio_player.start() 
+        def clear_user_text(self):
+            self.user_text_timer.stop()
 
-        self.set_symbols("🎙️", "⚪")
+            if not self.user_text:
+                return
 
-    def on_wakeword_detection_start(self):
-        self.audio_player = AudioPlayer("inactive.wav")
-        self.audio_player.start()         
+            self.user_text_opacity = 255
+            self.run_fade_user = True
+            self.fade_out_user_text()
 
-        self.set_symbols("", "💤")
+        def init_clear_assistant_text(self):
+            if self.assistant_text_timer.isActive():
+                self.assistant_text_timer.stop()        
+            self.assistant_text_timer.start(10000)
 
-    def init_clear_user_text(self):
-        if self.user_text_timer.isActive():
-            self.user_text_timer.stop()        
-        self.user_text_timer.start(10000)
+        def remove_assistant_text(self):
+            self.assistant_text = ""
+            self.assistant_text_opacity = 255 
+            self.updateUI.emit()
 
-    def remove_user_text(self):
-        self.user_text = ""
-        self.user_text_opacity = 255 
-        self.updateUI.emit()
+        def fade_out_assistant_text(self):
+            if not self.run_fade_assistant:
+                return
+            
+            if self.assistant_text_opacity > 0:
+                self.assistant_text_opacity -= 5 
+                self.updateUI.emit()
+                QTimer.singleShot(50, self.fade_out_assistant_text)
+            else:
+                self.run_fade_assistant = False
+                self.remove_assistant_text()        
 
-    def fade_out_user_text(self):
-        if not self.run_fade_user:
-            return
+        def clear_assistant_text(self):
+            self.assistant_text_timer.stop()
 
-        if self.user_text_opacity > 0:
-            self.user_text_opacity -= 5 
-            self.updateUI.emit()
-            QTimer.singleShot(50, self.fade_out_user_text)
-        else:
-            self.run_fade_user = False
-            self.remove_user_text()        
+            if not self.assistant_text:
+                return
 
-    def clear_user_text(self):
-        self.user_text_timer.stop()
+            self.assistant_text_opacity = 255
+            self.run_fade_assistant = True
+            self.fade_out_assistant_text()
 
-        if not self.user_text:
-            return
+        def update_self(self):
 
-        self.user_text_opacity = 255
-        self.run_fade_user = True
-        self.fade_out_user_text()
+            self.blockSignals(True)
+                    
+            self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
+            self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)       
 
-    def init_clear_assistant_text(self):
-        if self.assistant_text_timer.isActive():
-            self.assistant_text_timer.stop()        
-        self.assistant_text_timer.start(10000)
+            fm_symbol = QFontMetrics(self.big_symbol_font)
+            self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
+            self.symbol_height = fm_symbol.height() + 8
 
-    def remove_assistant_text(self):
-        self.assistant_text = ""
-        self.assistant_text_opacity = 255 
-        self.updateUI.emit()
+            self.total_width = MAX_WINDOW_WIDTH
 
-    def fade_out_assistant_text(self):
-        if not self.run_fade_assistant:
-            return
-        
-        if self.assistant_text_opacity > 0:
-            self.assistant_text_opacity -= 5 
-            self.updateUI.emit()
-            QTimer.singleShot(50, self.fade_out_assistant_text)
-        else:
-            self.run_fade_assistant = False
-            self.remove_assistant_text()        
+            fm_user = QFontMetrics(self.user_font)
+            user_text_lines = (self.displayed_user_text.count("\n") + 1)
+            self.user_height = fm_user.height() * user_text_lines + 7
 
-    def clear_assistant_text(self):
-        self.assistant_text_timer.stop()
+            fm_assistant = QFontMetrics(self.assistant_font)
+            assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
+            self.assistant_height = fm_assistant.height() * assistant_text_lines + 18
 
-        if not self.assistant_text:
-            return
+            self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])
 
-        self.assistant_text_opacity = 255
-        self.run_fade_assistant = True
-        self.fade_out_assistant_text()
+            desktop = QDesktopWidget()
+            screen_rect = desktop.availableGeometry(desktop.primaryScreen())
+            self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)
 
-    def update_self(self):
+            self.blockSignals(False)
 
-        self.blockSignals(True)
-                
-        self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
-        self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)       
-
-        fm_symbol = QFontMetrics(self.big_symbol_font)
-        self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
-        self.symbol_height = fm_symbol.height() + 8
-
-        self.total_width = MAX_WINDOW_WIDTH
-
-        fm_user = QFontMetrics(self.user_font)
-        user_text_lines = (self.displayed_user_text.count("\n") + 1)
-        self.user_height = fm_user.height() * user_text_lines + 7
-
-        fm_assistant = QFontMetrics(self.assistant_font)
-        assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
-        self.assistant_height = fm_assistant.height() * assistant_text_lines + 18
-
-        self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])
-
-        desktop = QDesktopWidget()
-        screen_rect = desktop.availableGeometry(desktop.primaryScreen())
-        self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)
-
-        self.blockSignals(False)
-
-        self.update()
-
-    def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
-        painter.setPen(outlineColor)
-        for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
-                    (-outline_size, -outline_size), (outline_size, -outline_size),
-                    (-outline_size, outline_size), (outline_size, outline_size)]:
-            painter.drawText(x + dx, y + dy, width, height, alignment, text)
-
-        painter.setPen(textColor)
-        painter.drawText(x, y, width, height, alignment, text)
-
-    def paintEvent(self, event):
-        painter = QPainter(self)
-
-        offsetX = 4
-        offsetY = 5
-    
-        painter.setPen(QColor(255, 255, 255))
-
-        # Draw symbol
-        painter.setFont(self.big_symbol_font)
-        if self.big_symbol_text:
-            painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
-            painter.setFont(self.small_symbol_font)
-            painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
-        else:
-            painter.setFont(self.small_symbol_font)
-            painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
-
-        # Draw User Text
-        painter.setFont(self.user_font)
-        user_x = self.total_width - self.user_width - 45 + offsetX
-        user_y = offsetY + 15
-        user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
-        outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
-        self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)
-
-        # Draw Assistant Text
-        painter.setFont(self.assistant_font)
-        assistant_x = self.total_width - self.assistant_width - 5  + offsetX
-        assistant_y = self.user_height + offsetY + 15
-        assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
-        outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
-        self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)
-
-    def return_text_adjusted_to_width(self, text, font, max_width_allowed):
-        """
-        Line feeds are inserted so that the text width does never exceed max_width.
-        Text is only broken up on whole words.
-        """
-        fm = QFontMetrics(font)
-        words = text.split(' ')
-        adjusted_text = ''
-        current_line = ''
-        max_width_used = 0
+            self.update()
+
+        def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
+            painter.setPen(outlineColor)
+            for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
+                        (-outline_size, -outline_size), (outline_size, -outline_size),
+                        (-outline_size, outline_size), (outline_size, outline_size)]:
+                painter.drawText(x + dx, y + dy, width, height, alignment, text)
+
+            painter.setPen(textColor)
+            painter.drawText(x, y, width, height, alignment, text)
+
+        def paintEvent(self, event):
+            painter = QPainter(self)
+
+            offsetX = 4
+            offsetY = 5
         
-        for word in words:
-            current_width = fm.width(current_line + word)
-            if current_width <= max_width_allowed:
-                current_line += word + ' '
+            painter.setPen(QColor(255, 255, 255))
+
+            # Draw symbol
+            painter.setFont(self.big_symbol_font)
+            if self.big_symbol_text:
+                painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
+                painter.setFont(self.small_symbol_font)
+                painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
             else:
-                line_width = fm.width(current_line)
-                if line_width > max_width_used:
-                    max_width_used = line_width
-                adjusted_text += current_line + '\n'
-                current_line = word + ' '
-        
-        line_width = fm.width(current_line)
-        if line_width > max_width_used:
-            max_width_used = line_width
-        adjusted_text += current_line 
-        return adjusted_text.rstrip(), max_width_used         
+                painter.setFont(self.small_symbol_font)
+                painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
+
+            # Draw User Text
+            painter.setFont(self.user_font)
+            user_x = self.total_width - self.user_width - 45 + offsetX
+            user_y = offsetY + 15
+            user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
+            outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
+            self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)
+
+            # Draw Assistant Text
+            painter.setFont(self.assistant_font)
+            assistant_x = self.total_width - self.assistant_width - 5  + offsetX
+            assistant_y = self.user_height + offsetY + 15
+            assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
+            outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
+            self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)
+
+        def return_text_adjusted_to_width(self, text, font, max_width_allowed):
+            """
+            Line feeds are inserted so that the text width does never exceed max_width.
+            Text is only broken up on whole words.
+            """
+            fm = QFontMetrics(font)
+            words = text.split(' ')
+            adjusted_text = ''
+            current_line = ''
+            max_width_used = 0
+            
+            for word in words:
+                current_width = fm.width(current_line + word)
+                if current_width <= max_width_allowed:
+                    current_line += word + ' '
+                else:
+                    line_width = fm.width(current_line)
+                    if line_width > max_width_used:
+                        max_width_used = line_width
+                    adjusted_text += current_line + '\n'
+                    current_line = word + ' '
+            
+            line_width = fm.width(current_line)
+            if line_width > max_width_used:
+                max_width_used = line_width
+            adjusted_text += current_line 
+            return adjusted_text.rstrip(), max_width_used         
 
-if __name__ == '__main__':
     app = QApplication(sys.argv)
 
     window = TransparentWindow()
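
The assistant keeps its OpenAI prompt bounded: each request to generate_response() sends the system prompt plus only the newest max_history_messages entries of history. A minimal sketch of that trimming in isolation (names taken from the example above; the turn contents are made up):

```python
max_history_messages = 6
system_prompt_message = {'role': 'system', 'content': 'Be concise.'}

# Ten past turns; only the newest six are sent along with the system prompt.
history = [{'role': 'user', 'content': f'turn {i}'} for i in range(10)]

messages = [system_prompt_message] + history[-max_history_messages:]
assert len(messages) == 1 + max_history_messages
assert messages[1]['content'] == 'turn 4'  # the oldest turns are dropped
```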

+ 95 - 0
example_webserver/client.py

@@ -0,0 +1,95 @@
+from colorama import Fore, Back, Style
+import websockets
+import colorama
+import keyboard
+import asyncio
+import json
+import os
+
+colorama.init()
+
+SEND_START_COMMAND = False
+HOST = 'localhost:5025'
+URI = f'ws://{HOST}'
+RECONNECT_DELAY = 5  
+
+full_sentences = []
+
+def clear_console():
+    os.system('clear' if os.name == 'posix' else 'cls')
+
+def update_displayed_text(text = ""):
+    sentences_with_style = [
+        f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
+        for i, sentence in enumerate(full_sentences)
+    ]
+    text = "".join(sentences_with_style).strip() + " " + text if len(sentences_with_style) > 0 else text
+    clear_console()
+    print("CLIENT retrieved text:")
+    print()
+    print(text)
+
+async def send_start_recording(websocket):
+    command = {
+        "type": "command",
+        "content": "start-recording"
+    }
+    await websocket.send(json.dumps(command))
+
+async def test_client():
+    while True:
+        try:
+            async with websockets.connect(URI, ping_interval=None) as websocket:
+
+                if SEND_START_COMMAND:
+                    # New: Check for space bar press and send start-recording message
+                    async def check_space_keypress():
+                        while True:
+                            if keyboard.is_pressed('space'):
+                                print ("Space bar pressed. Sending start-recording message to server.")
+                                await send_start_recording(websocket)
+                                await asyncio.sleep(1) 
+                            await asyncio.sleep(0.02)
+                    
+                    # Start a task to monitor the space keypress
+                    print ("Press space bar to start recording.")
+                    asyncio.create_task(check_space_keypress())
+                
+                while True:
+                    message = await websocket.recv()
+                    message_obj = json.loads(message)
+                    
+                    if message_obj["type"] == "realtime":
+                        clear_console()
+                        print (message_obj["content"])
+                    elif message_obj["type"] == "full":
+                        clear_console()
+                        colored_message = Fore.YELLOW + message_obj["content"] + Style.RESET_ALL
+                        print (colored_message)
+                        print ()
+                        if SEND_START_COMMAND:
+                            print ("Press space bar to start recording.")
+                        full_sentences.append(message_obj["content"])
+                    elif message_obj["type"] == "record_start":
+                        print ("recording started.")
+                    elif message_obj["type"] == "vad_start":
+                        print ("vad started.")
+                    elif message_obj["type"] == "wakeword_start":
+                        print ("wakeword started.")
+                    elif message_obj["type"] == "transcript_start":
+                        print ("transcript started.")
+
+                    else:
+                        print (f"Unknown message: {message_obj}")
+                    
+        except websockets.ConnectionClosed:
+            print("Connection with server closed. Reconnecting in", RECONNECT_DELAY, "seconds...")
+            await asyncio.sleep(RECONNECT_DELAY)
+        except KeyboardInterrupt:
+            print("Gracefully shutting down the client.")
+            break
+        except Exception as e:
+            print(f"An error occurred: {e}. Reconnecting in", RECONNECT_DELAY, "seconds...")
+            await asyncio.sleep(RECONNECT_DELAY)    
+
+asyncio.run(test_client())
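
The client and the example server (added below) exchange small JSON objects over a single websocket: the client may send a start-recording command, and the server pushes "realtime" and "full" transcription updates plus empty-content status events ("record_start", "vad_start", "wakeword_start", "transcript_start"). A minimal sketch of those message shapes (field names taken from the code; the payload text is made up):

```python
import json

# Client -> server: optional request to begin recording (only sent when SEND_START_COMMAND is True).
start_command = {"type": "command", "content": "start-recording"}

# Server -> client: a partial update followed by the finished sentence.
realtime_update = {"type": "realtime", "content": "hello wor"}
full_update = {"type": "full", "content": "Hello world."}

wire = json.dumps(full_update)    # what the server's broadcast() sends
message_obj = json.loads(wire)    # what test_client() parses
assert message_obj["type"] == "full"
```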

+ 181 - 0
example_webserver/server.py

@@ -0,0 +1,181 @@
+WAIT_FOR_START_COMMAND = False
+
+if __name__ == '__main__':
+    server = "localhost"
+    port = 5025
+
+    print (f"STT speech to text server")
+    print (f"runs on http://{server}:{port}")
+    print ()
+    print ("starting")
+    print ("└─ ... ", end='', flush=True)
+
+    from RealtimeSTT import AudioToTextRecorder
+    from colorama import Fore, Back, Style
+    import websockets
+    import threading
+    import colorama
+    import asyncio
+    import shutil
+    import queue
+    import json
+    import time
+    import os
+
+    colorama.init()
+
+    first_chunk = True
+    full_sentences = []
+    displayed_text = ""
+    message_queue = queue.Queue() 
+    start_recording_event = threading.Event()
+    start_transcription_event = threading.Event()
+    connected_clients = set()
+
+    def clear_console():
+        os.system('clear' if os.name == 'posix' else 'cls')
+
+    async def handler(websocket, path):
+
+        print ("\r└─ OK")
+        if WAIT_FOR_START_COMMAND:
+            print("waiting for start command")
+            print ("└─ ... ", end='', flush=True)
+
+        connected_clients.add(websocket)
+
+        try:
+            while True:
+                async for message in websocket:
+                    data = json.loads(message)
+                    if data.get("type") == "command" and data.get("content") == "start-recording":
+                        print ("\r└─ OK")
+                        start_recording_event.set() 
+
+        except json.JSONDecodeError:
+            print (Fore.RED + "STT Received an invalid JSON message." + Style.RESET_ALL)
+        except websockets.ConnectionClosedError:
+            print (Fore.RED + "connection closed unexpectedly by the client" + Style.RESET_ALL)
+        except websockets.exceptions.ConnectionClosedOK:
+            print("connection closed.")
+        finally:
+
+            print("client disconnected")
+            connected_clients.remove(websocket)
+            print ("waiting for clients")
+            print ("└─ ... ", end='', flush=True)
+
+
+    def add_message_to_queue(type: str, content):
+        message = {
+            "type": type,
+            "content": content
+        }
+        message_queue.put(message)    
+
+    def fill_cli_line(text):
+        columns, _ = shutil.get_terminal_size()
+        return text.ljust(columns)[-columns:]
+
+    def text_detected(text):
+        global displayed_text, first_chunk
+
+        if text != displayed_text:
+            first_chunk = False
+            displayed_text = text
+            add_message_to_queue("realtime", text)
+
+            message = fill_cli_line(text)
+
+            message ="└─ " + Fore.CYAN + message[:-3] + Style.RESET_ALL
+            print(f"\r{message}", end='', flush=True)
+
+
+    async def broadcast(message_obj):
+        if connected_clients:
+            for client in connected_clients:
+                await client.send(json.dumps(message_obj))
+
+    async def send_handler():
+        while True:
+            while not message_queue.empty():
+                message = message_queue.get()
+                await broadcast(message)
+            await asyncio.sleep(0.02)
+
+    def recording_started():
+        add_message_to_queue("record_start", "")
+
+    def vad_detect_started():
+        add_message_to_queue("vad_start", "")
+
+    def wakeword_detect_started():
+        add_message_to_queue("wakeword_start", "")
+
+    def transcription_started():
+        add_message_to_queue("transcript_start", "")
+
+    recorder_config = {
+        'spinner': False,
+        'model': 'small.en',
+        'language': 'en',
+        'silero_sensitivity': 0.01,
+        'webrtc_sensitivity': 3,
+        'silero_use_onnx': False,
+        'post_speech_silence_duration': 1.2,
+        'min_length_of_recording': 0.2,
+        'min_gap_between_recordings': 0,
+        'enable_realtime_transcription': True,
+        'realtime_processing_pause': 0,
+        'realtime_model_type': 'tiny.en',
+        'on_realtime_transcription_stabilized': text_detected,
+        'on_recording_start' : recording_started,
+        'on_vad_detect_start' : vad_detect_started,
+        'on_wakeword_detection_start' : wakeword_detect_started,
+        'on_transcription_start' : transcription_started,
+    }
+
+    recorder = AudioToTextRecorder(**recorder_config)
+
+    def transcriber_thread():
+        while True:
+            start_transcription_event.wait()
+            text = "└─ transcribing ... "
+            text = fill_cli_line(text)
+            print (f"\r{text}", end='', flush=True)
+            sentence = recorder.transcribe()
+            print (Style.RESET_ALL + "\r└─ " + Fore.YELLOW + sentence + Style.RESET_ALL)
+            add_message_to_queue("full", sentence)
+            start_transcription_event.clear()
+            if WAIT_FOR_START_COMMAND:
+                print("waiting for start command")
+                print ("└─ ... ", end='', flush=True)
+
+    def recorder_thread():
+        global first_chunk
+        while True:
+            if not connected_clients:
+                time.sleep(0.1)
+                continue
+            first_chunk = True
+            if WAIT_FOR_START_COMMAND:
+                start_recording_event.wait() 
+            print("waiting for sentence")
+            print ("└─ ... ", end='', flush=True)
+            recorder.wait_audio()
+            start_transcription_event.set()
+            start_recording_event.clear()
+
+    threading.Thread(target=recorder_thread, daemon=True).start()
+    threading.Thread(target=transcriber_thread, daemon=True).start()
+
+    start_server = websockets.serve(handler, server, port)
+    loop = asyncio.get_event_loop()
+
+    print ("\r└─ OK")
+    print ("waiting for clients")
+    print ("└─ ... ", end='', flush=True)
+
+    loop.run_until_complete(start_server)
+    loop.create_task(send_handler())
+    loop.run_forever()
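
Instead of a single recorder.text() call, the server drives the recorder in two lower-level steps: recorder.wait_audio() blocks until a complete utterance has been captured, and recorder.transcribe() turns the captured audio into text, which lets waiting and transcribing run on separate threads. A minimal single-threaded sketch of the same two-step flow, assuming a default-configured recorder and a working microphone:

```python
from RealtimeSTT import AudioToTextRecorder

if __name__ == '__main__':
    with AudioToTextRecorder(spinner=False) as recorder:
        recorder.wait_audio()           # blocks until speech followed by silence was recorded
        print(recorder.transcribe())    # transcribe the captured audio and print it
```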

+ 104 - 100
tests/realtime_loop_test.py

@@ -8,110 +8,114 @@ import os
 from RealtimeTTS import TextToAudioStream, AzureEngine
 from RealtimeSTT import AudioToTextRecorder
 
+if __name__ == '__main__':
 
-class SimpleApp(QWidget):
-
-    update_stt_text_signal = pyqtSignal(str)
-    update_tts_text_signal = pyqtSignal(str)
-
-    def __init__(self):
-        super().__init__()
-
-        layout = QVBoxLayout()
-
-        font = QFont()
-        font.setPointSize(18)
-
-        self.input_text = QTextEdit(self)
-        self.input_text.setFont(font)
-        self.input_text.setPlaceholderText("Input")
-        self.input_text.setMinimumHeight(100) 
-        layout.addWidget(self.input_text)
-
-        self.button_speak_input = QPushButton("Speak and detect input text", self)
-        self.button_speak_input.setFont(font)        
-        self.button_speak_input.clicked.connect(self.speak_input)
-        layout.addWidget(self.button_speak_input)
-
-        self.tts_text = QTextEdit(self)
-        self.tts_text.setFont(font)
-        self.tts_text.setPlaceholderText("STT (final)")
-        self.tts_text.setMinimumHeight(100) 
-        self.tts_text.setReadOnly(True)
-        layout.addWidget(self.tts_text)
-
-        self.stt_text = QTextEdit(self)
-        self.stt_text.setFont(font)
-        self.stt_text.setPlaceholderText("STT (realtime)")
-        self.stt_text.setMinimumHeight(100) 
-        layout.addWidget(self.stt_text)
-
-        self.button_speak_stt = QPushButton("Speak detected text again", self)
-        self.button_speak_stt.setFont(font)        
-        self.button_speak_stt.clicked.connect(self.speak_stt)
-        layout.addWidget(self.button_speak_stt)
-
-        self.setLayout(layout)
-        self.setWindowTitle("Realtime TTS/STT Loop Test")
-        self.resize(800, 600)
-
-        self.update_stt_text_signal.connect(self.actual_update_stt_text)
-        self.update_tts_text_signal.connect(self.actual_update_tts_text)
-
-        self.stream = TextToAudioStream(AzureEngine(os.environ.get("AZURE_SPEECH_KEY"), "germanywestcentral"), on_audio_stream_stop=self.audio_stream_stop)
-
-        recorder_config = {
-            'spinner': False,
-            'model': 'large-v2',
-            'language': 'en',
-            'silero_sensitivity': 0.01,
-            'webrtc_sensitivity': 3,
-            'post_speech_silence_duration': 0.01,
-            'min_length_of_recording': 0.2,
-            'min_gap_between_recordings': 0,
-            'enable_realtime_transcription': True,
-            'realtime_processing_pause': 0,
-            'realtime_model_type': 'small.en',
-            'on_realtime_transcription_stabilized': self.text_detected,
-        }
-
-        self.recorder = AudioToTextRecorder(**recorder_config)
-
-    def speak_stt(self):
-        text = self.stt_text.toPlainText()
-        self.speak(text)
-
-    def speak_input(self):
-        text = self.input_text.toPlainText()
-        self.speak(text)
-
-    def text_detected(self, text):
-        self.update_stt_text_signal.emit(text)
-
-    def audio_stream_stop(self):
-        self.stream.stop()
-        self.recorder.stop()
-        detected_text = self.recorder.text()
-        self.update_stt_text_signal.emit(detected_text)
-        self.update_tts_text_signal.emit(detected_text)
-
-    def speak(self, text):
-        self.stt_text.clear()        
-        self.stream.feed(text)
-
-        self.recorder.start()
-        self.stream.play_async()
-
-    def actual_update_stt_text(self, text):
-        self.stt_text.setText(text)
-
-    def actual_update_tts_text(self, text):
-        self.tts_text.setText(text)        
+    class SimpleApp(QWidget):
+
+        update_stt_text_signal = pyqtSignal(str)
+        update_tts_text_signal = pyqtSignal(str)
+
+        def __init__(self):
+            super().__init__()
+
+            layout = QVBoxLayout()
+
+            font = QFont()
+            font.setPointSize(18)
+
+            self.input_text = QTextEdit(self)
+            self.input_text.setFont(font)
+            self.input_text.setPlaceholderText("Input")
+            self.input_text.setMinimumHeight(100) 
+            layout.addWidget(self.input_text)
+
+            self.button_speak_input = QPushButton("Speak and detect input text", self)
+            self.button_speak_input.setFont(font)        
+            self.button_speak_input.clicked.connect(self.speak_input)
+            layout.addWidget(self.button_speak_input)
+
+            self.tts_text = QTextEdit(self)
+            self.tts_text.setFont(font)
+            self.tts_text.setPlaceholderText("STT (final)")
+            self.tts_text.setMinimumHeight(100) 
+            self.tts_text.setReadOnly(True)
+            layout.addWidget(self.tts_text)
+
+            self.stt_text = QTextEdit(self)
+            self.stt_text.setFont(font)
+            self.stt_text.setPlaceholderText("STT (realtime)")
+            self.stt_text.setMinimumHeight(100) 
+            layout.addWidget(self.stt_text)
+
+            self.button_speak_stt = QPushButton("Speak detected text again", self)
+            self.button_speak_stt.setFont(font)        
+            self.button_speak_stt.clicked.connect(self.speak_stt)
+            layout.addWidget(self.button_speak_stt)
+
+            self.setLayout(layout)
+            self.setWindowTitle("Realtime TTS/STT Loop Test")
+            self.resize(800, 600)
+
+            self.update_stt_text_signal.connect(self.actual_update_stt_text)
+            self.update_tts_text_signal.connect(self.actual_update_tts_text)
+
+            self.stream = TextToAudioStream(AzureEngine(os.environ.get("AZURE_SPEECH_KEY"), "germanywestcentral"), on_audio_stream_stop=self.audio_stream_stop)
+
+            recorder_config = {
+                'spinner': False,
+                'model': 'large-v2',
+                'language': 'en',
+                'silero_sensitivity': 0.01,
+                'webrtc_sensitivity': 3,
+                'post_speech_silence_duration': 0.01,
+                'min_length_of_recording': 0.2,
+                'min_gap_between_recordings': 0,
+                'enable_realtime_transcription': True,
+                'realtime_processing_pause': 0,
+                'realtime_model_type': 'small.en',
+                'on_realtime_transcription_stabilized': self.text_detected,
+            }
+
+            self.recorder = AudioToTextRecorder(**recorder_config)
+
+        def speak_stt(self):
+            text = self.stt_text.toPlainText()
+            self.speak(text)
+
+        def speak_input(self):
+            text = self.input_text.toPlainText()
+            self.speak(text)
+
+        def text_detected(self, text):
+            self.update_stt_text_signal.emit(text)
+
+        def audio_stream_stop(self):
+            self.stream.stop()
+            self.recorder.stop()
+            detected_text = self.recorder.text()
+            self.update_stt_text_signal.emit(detected_text)
+            self.update_tts_text_signal.emit(detected_text)
+
+        def speak(self, text):
+            self.stt_text.clear()        
+            self.stream.feed(text)
+
+            self.recorder.start()
+            self.stream.play_async()
+
+        def actual_update_stt_text(self, text):
+            self.stt_text.setText(text)
+
+        def actual_update_tts_text(self, text):
+            self.tts_text.setText(text)
+
+        def closeEvent(self, event):
+            if self.recorder:
+                self.recorder.shutdown()
 
-if __name__ == '__main__':
     app = QApplication(sys.argv)
 
     window = SimpleApp()
     window.show()
 
-    sys.exit(app.exec_())
+    sys.exit(app.exec_())

+ 0 - 1
tests/simple_test.py

@@ -1,5 +1,4 @@
 from RealtimeSTT import AudioToTextRecorder
-
 if __name__ == '__main__':
     recorder = AudioToTextRecorder(spinner=False)
 

+ 11 - 7
tests/wakeword_test.py

@@ -1,12 +1,16 @@
 from RealtimeSTT import AudioToTextRecorder
+import logging
 
-def recording_started():
-    print("Speak now...")
+if __name__ == '__main__':
 
-def recording_finished():
-    print("Speech end detected... transcribing...")
+    def recording_started():
+        print("Speak now...")
 
-recorder = AudioToTextRecorder(spinner=False, model="small.en", language="en", wake_words="jarvis", on_wakeword_detected=recording_started, on_recording_stop=recording_finished)
+    def recording_finished():
+        print("Speech end detected... transcribing...")
 
-print('Say "Jarvis" then speak.')
-print(recorder.text())
+    with AudioToTextRecorder(spinner=False, level=logging.DEBUG, model="small.en", language="en", wake_words="jarvis",
+                             on_wakeword_detected=recording_started, on_recording_stop=recording_finished) as recorder:
+        print('Say "Jarvis" then speak.')
+        print(recorder.text())
+        print("Done. Now we should exit.")