Browse source

context manager implemented

Kolja Beigel 1 year ago
parent
commit
21b49bff31

+ 22 - 4
README.md

@@ -16,6 +16,10 @@ https://github.com/KoljaB/RealtimeSTT/assets/7604638/207cb9a2-4482-48e7-9d2b-072
 
 ### Updates
 
+#### v0.1.6
+- Implements the context manager protocol (the recorder can be used in a `with` statement)
+- Bugfix in the shutdown method
+
 #### v0.1.5
 
 - Bugfix for detection of short speech right after sentence detection (the problem mentioned in the video)
@@ -130,8 +134,8 @@ print(recorder.text())
 Recording based on voice activity detection.
 
 ```python
-recorder = AudioToTextRecorder()
-print(recorder.text())
+with AudioToTextRecorder() as recorder:
+    print(recorder.text())
 ```
 
 When running `recorder.text()` in a loop, it is recommended to use a callback, allowing the transcription to run asynchronously:
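A minimal sketch of such a loop, assuming a user-defined `process_text` callback (the name is illustrative, not part of the library):

```python
from RealtimeSTT import AudioToTextRecorder

def process_text(text):
    print(text)

if __name__ == '__main__':
    with AudioToTextRecorder() as recorder:
        while True:
            # With a callback, text() returns after the utterance has been
            # recorded and hands the transcription off to a background thread.
            recorder.text(process_text)
```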
@@ -170,6 +174,20 @@ recorder = AudioToTextRecorder(on_recording_start=my_start_callback,
                                on_recording_stop=my_stop_callback)
 ```
 
+### Shutdown
+
+You can shut down the recorder safely by using the context manager protocol:
+
+```python
+with AudioToTextRecorder() as recorder:
+    [...]
+```
+
+Or you can call the `shutdown` method manually (if using `with` is not feasible):
+
+```python
+recorder.shutdown()
+```
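For instance, a minimal sketch pairing the manual call with `try`/`finally`, so the shutdown runs even if an exception is raised:

```python
recorder = AudioToTextRecorder()
try:
    print(recorder.text())
finally:
    # Stops the worker processes and closes the audio stream
    # regardless of how the block above exits.
    recorder.shutdown()
```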
 
 ## Testing the Library
 
@@ -254,7 +272,7 @@ When you initialize the `AudioToTextRecorder` class, you have various options to
 
 - **silero_sensitivity** (float, default=0.6): Sensitivity for Silero's voice activity detection ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.6.
 
-- **silero_use_onnx** (bool, default=True): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. Default is True (recommended for faster performance).
+- **silero_use_onnx** (bool, default=False): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. Recommended for faster performance.
 
 - **post_speech_silence_duration** (float, default=0.2): Duration in seconds of silence that must follow speech before the recording is considered to be completed. This ensures that any brief pauses during speech don't prematurely end the recording.
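For illustration, a sketch of how these voice-activity options are passed to the constructor (the values shown are examples, not recommendations):

```python
recorder = AudioToTextRecorder(
    silero_sensitivity=0.6,             # 0 = least sensitive, 1 = most sensitive
    silero_use_onnx=True,               # opt in to the faster ONNX Silero model
    post_speech_silence_duration=0.2,   # seconds of silence that end a recording
)
```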
 
@@ -299,4 +317,4 @@ MIT
 
 Kolja Beigel  
 Email: kolja.beigel@web.de  
-[GitHub](https://github.com/KoljaB/RealtimeSTT)
+[GitHub](https://github.com/KoljaB/RealtimeSTT)

+ 426 - 277
RealtimeSTT/audio_recorder.py

@@ -51,6 +51,7 @@ INIT_WAKE_WORDS_SENSITIVITY = 0.6
 INIT_PRE_RECORDING_BUFFER_DURATION = 1.0
 INIT_WAKE_WORD_ACTIVATION_DELAY = 0.0
 INIT_WAKE_WORD_TIMEOUT = 5.0
+ALLOWED_LATENCY_LIMIT = 10
 
 TIME_SLEEP = 0.02
 SAMPLE_RATE = 16000
@@ -82,7 +83,7 @@ class AudioToTextRecorder:
 
                  # Voice activation parameters
                  silero_sensitivity: float = INIT_SILERO_SENSITIVITY,
-                 silero_use_onnx: bool = True,
+                 silero_use_onnx: bool = False,
                  webrtc_sensitivity: int = INIT_WEBRTC_SENSITIVITY,
                  post_speech_silence_duration: float = INIT_POST_SPEECH_SILENCE_DURATION,
                  min_length_of_recording: float = INIT_MIN_LENGTH_OF_RECORDING,
@@ -122,7 +123,7 @@ class AudioToTextRecorder:
         - on_realtime_transcription_update = A callback function that is triggered whenever there's an update in the real-time transcription. The function is called with the newly transcribed text as its argument.
         - on_realtime_transcription_stabilized = A callback function that is triggered when the transcribed text stabilizes in quality. The stabilized text is generally more accurate but may arrive with a slight delay compared to the regular real-time updates.
         - silero_sensitivity (float, default=SILERO_SENSITIVITY): Sensitivity for the Silero Voice Activity Detection model ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.5.
-        - silero_use_onnx (bool, default=True): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. This is recommended for faster performance.
+        - silero_use_onnx (bool, default=False): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. This is recommended for faster performance.
         - webrtc_sensitivity (int, default=WEBRTC_SENSITIVITY): Sensitivity for the WebRTC Voice Activity Detection engine ranging from 0 (least aggressive / most sensitive) to 3 (most aggressive, least sensitive). Default is 3.
         - post_speech_silence_duration (float, default=0.2): Duration in seconds of silence that must follow speech before the recording is considered to be completed. This ensures that any brief pauses during speech don't prematurely end the recording.
         - min_gap_between_recordings (float, default=1.0): Specifies the minimum time interval in seconds that should exist between the end of one recording session and the beginning of another to prevent rapid consecutive recordings.
@@ -168,6 +169,7 @@ class AudioToTextRecorder:
         self.realtime_processing_pause = realtime_processing_pause
         self.on_realtime_transcription_update = on_realtime_transcription_update
         self.on_realtime_transcription_stabilized = on_realtime_transcription_stabilized
+        self.allowed_latency_limit = ALLOWED_LATENCY_LIMIT
     
         self.level = level
         self.audio_queue = Queue()
@@ -207,7 +209,7 @@ class AudioToTextRecorder:
         logger.setLevel(level)  # Set the root logger's level
 
         # Create a file handler and set its level
-        file_handler = logging.FileHandler('audio_recorder.log')
+        file_handler = logging.FileHandler('realtimesst.log')
         file_handler.setLevel(logging.DEBUG)
         file_handler.setFormatter(logging.Formatter(log_format))
 
@@ -220,16 +222,20 @@ class AudioToTextRecorder:
         logger.addHandler(file_handler)
         logger.addHandler(console_handler)
 
+        self.is_shut_down = False
+        self.shutdown_event = Event()
 
-        # start transcription process
+        logging.info("Starting RealtimeSTT")
+
+        # Start transcription process
         self.main_transcription_ready_event = Event()
         self.parent_transcription_pipe, child_transcription_pipe = Pipe()
-        self.process = Process(target=AudioToTextRecorder._transcription_worker, args=(child_transcription_pipe, model, self.main_transcription_ready_event))
-        self.process.start()
+        self.transcript_process = Process(target=AudioToTextRecorder._transcription_worker, args=(child_transcription_pipe, model, self.main_transcription_ready_event, self.shutdown_event))
+        self.transcript_process.start()
 
-        # start audio data reading process
-        reader_process = Process(target=AudioToTextRecorder._audio_data_worker, args=(self.audio_queue, self.sample_rate, self.buffer_size))
-        reader_process.start()
+        # Start audio data reading process
+        self.reader_process = Process(target=AudioToTextRecorder._audio_data_worker, args=(self.audio_queue, self.sample_rate, self.buffer_size, self.shutdown_event))
+        self.reader_process.start()
 
         # Initialize the realtime transcription model
         if self.enable_realtime_transcription:
@@ -310,7 +316,7 @@ class AudioToTextRecorder:
         self.realtime_thread.daemon = True
         self.realtime_thread.start()
 
-        # wait for transcription models to start
+        # Wait for transcription models to start
         logging.debug('Waiting for main transcription model to start')
         self.main_transcription_ready_event.wait()
         logging.debug('Main transcription model ready')
@@ -319,7 +325,25 @@ class AudioToTextRecorder:
 
 
     @staticmethod
-    def _transcription_worker(conn, model_path, ready_event):
+    def _transcription_worker(conn, model_path, ready_event, shutdown_event):
+        """
+        Worker method that handles the continuous process of transcribing audio data.
+
+        This method runs in a separate process and is responsible for:
+        - Initializing the `faster_whisper` model used for transcription.
+        - Receiving audio data sent through a pipe and using the model to transcribe it.
+        - Sending transcription results back through the pipe.
+        - Continuously checking for a shutdown event to gracefully terminate the transcription process.
+
+        Args:
+            conn (multiprocessing.Connection): The connection endpoint used for receiving audio data and sending transcription results.
+            model_path (str): The path to the pre-trained faster_whisper model for transcription.
+            ready_event (multiprocessing.Event): An event that is set when the transcription model is successfully initialized and ready.
+            shutdown_event (multiprocessing.Event): An event that, when set, signals this worker method to terminate.
+
+        Raises:
+            Exception: If there is an error while initializing the transcription model.
+        """        
 
         logging.info(f"Initializing faster_whisper main transcription model {model_path}")
 
@@ -337,23 +361,44 @@ class AudioToTextRecorder:
 
         logging.debug('Faster_whisper main speech to text transcription model initialized successfully')
 
-        while True:
-            audio, language = conn.recv()
-            try:
-                segments = model.transcribe(audio, language=language if language else None)[0]
-                transcription = " ".join(seg.text for seg in segments).strip()
-                conn.send(('success', transcription))
-            except faster_whisper.WhisperError as e:
-                logging.error(f"Whisper transcription error: {e}")
-                conn.send(('error', str(e)))      
-            except Exception as e:
-                logging.error(f"General transcription error: {e}")
-                conn.send(('error', str(e)))
+        while not shutdown_event.is_set():
+            if conn.poll(0.5):
+                audio, language = conn.recv()
+                try:
+                    segments = model.transcribe(audio, language=language if language else None)[0]
+                    transcription = " ".join(seg.text for seg in segments).strip()
+                    conn.send(('success', transcription))
+                except faster_whisper.WhisperError as e:
+                    logging.error(f"Whisper transcription error: {e}")
+                    conn.send(('error', str(e)))      
+                except Exception as e:
+                    logging.error(f"General transcription error: {e}")
+                    conn.send(('error', str(e)))
+            else:
+                # If there's no data, sleep for a short while to prevent busy waiting
+                time.sleep(0.02)
 
 
     @staticmethod
-    def _audio_data_worker(audio_queue, sample_rate, buffer_size):
+    def _audio_data_worker(audio_queue, sample_rate, buffer_size, shutdown_event):
+        """
+        Worker method that handles the audio recording process.
+
+        This method runs in a separate process and is responsible for:
+        - Setting up the audio input stream for recording.
+        - Continuously reading audio data from the input stream and placing it in a queue.
+        - Handling errors during the recording process, including input overflow.
+        - Gracefully terminating the recording process when a shutdown event is set.
 
+        Args:
+            audio_queue (multiprocessing.Queue): A queue where recorded audio data is placed.
+            sample_rate (int): The sample rate of the audio input stream.
+            buffer_size (int): The size of the buffer used in the audio input stream.
+            shutdown_event (multiprocessing.Event): An event that, when set, signals this worker method to terminate.
+
+        Raises:
+            Exception: If there is an error while initializing the audio recording.
+        """
         logging.info("Initializing audio recording (creating pyAudio input stream)")
 
         try:
@@ -366,29 +411,33 @@ class AudioToTextRecorder:
 
         logging.debug('Audio recording (pyAudio input stream) initialized successfully')
    
-        while True:
-            try:
-                data = stream.read(buffer_size)
-
-            except OSError as e:
-                if e.errno == pyaudio.paInputOverflowed:
-                    logging.warning("Input overflowed. Frame dropped.")
-                else:
+        try:
+            while not shutdown_event.is_set():
+                try:
+                    data = stream.read(buffer_size)
+
+                except OSError as e:
+                    if e.errno == pyaudio.paInputOverflowed:
+                        logging.warning("Input overflowed. Frame dropped.")
+                    else:
+                        logging.error(f"Error during recording: {e}")
+                    tb_str = traceback.format_exc()
+                    print (f"Traceback: {tb_str}")
+                    print (f"Error: {e}")
+                    continue
+
+                except Exception as e:
                     logging.error(f"Error during recording: {e}")
-                tb_str = traceback.format_exc()
-                print (f"Traceback: {tb_str}")
-                print (f"Error: {e}")
-                continue
+                    tb_str = traceback.format_exc()
+                    print (f"Traceback: {tb_str}")
+                    print (f"Error: {e}")
+                    continue
 
-            except Exception as e:
-                logging.error(f"Error during recording: {e}")
-                time.sleep(1)
-                tb_str = traceback.format_exc()
-                print (f"Traceback: {tb_str}")
-                print (f"Error: {e}")
-                continue
-
-            audio_queue.put(data)                
+                audio_queue.put(data)                
+        finally:
+            stream.stop_stream()
+            stream.close()
+            audio_interface.terminate()
 
 
     def wait_audio(self):
@@ -413,14 +462,14 @@ class AudioToTextRecorder:
             self._set_state("listening")
             self.start_recording_on_voice_activity = True
 
-            # wait until recording starts
+            # Wait until recording starts
             self.start_recording_event.wait()
 
         # If recording is ongoing, wait for voice inactivity to finish recording.
         if self.is_recording:
             self.stop_recording_on_voice_deactivity = True
 
-            # wait until recording stops
+            # Wait until recording stops
             self.stop_recording_event.wait()
 
         # Convert recorded frames to the appropriate audio format.
@@ -435,8 +484,25 @@ class AudioToTextRecorder:
         self._set_state("inactive")
 
 
-
     def transcribe(self):
+        """
+        Transcribes audio captured by this class instance using the `faster_whisper` model.
+
+        Automatically starts recording upon voice activity if not manually started using `recorder.start()`.
+        Automatically stops recording upon voice deactivity if not manually stopped with `recorder.stop()`.
+        Processes the recorded audio to generate transcription.
+
+        Returns:
+            str: The transcription of the recorded audio.
+
+        Raises:
+            Exception: If there is an error during the transcription process.
+        """        
         self._set_state("transcribing")
         self.parent_transcription_pipe.send((self.audio, self.language))
         status, result = self.parent_transcription_pipe.recv()
@@ -470,6 +536,9 @@ class AudioToTextRecorder:
 
         self.wait_audio()
 
+        if self.is_shut_down:
+            return ""
+
         if on_transcription_finished:
             threading.Thread(target=on_transcription_finished, args=(self.transcribe(),)).start()
         else:
@@ -537,26 +606,281 @@ class AudioToTextRecorder:
         Safely shuts down the audio recording by stopping the recording worker and closing the audio stream.
         """
 
-        self.parent_transcription_pipe.close()
-        self.process.terminate()
 
+        # Force wait_audio() and text() to exit
+        self.is_shut_down = True
+        self.start_recording_event.set()
+        self.stop_recording_event.set()
+
+        self.shutdown_event.set()
         self.is_recording = False
         self.is_running = False
 
+        logging.debug('Finishing recording thread')
         if self.recording_thread:
             self.recording_thread.join()
+
+        logging.debug('Terminating reader process')
+        # Give it some time to finish the loop and cleanup.
+        self.reader_process.join(timeout=10) 
+
+        if self.reader_process.is_alive():
+            logging.warning("Reader process did not terminate in time. Terminating forcefully.")
+            self.reader_process.terminate()
+        
+        logging.debug('Terminating transcription process')
+        self.transcript_process.join(timeout=10) 
+
+        if self.transcript_process.is_alive():
+            logging.warning("Transcript process did not terminate in time. Terminating forcefully.")
+            self.transcript_process.terminate()
+
+        self.parent_transcription_pipe.close()
+
+        logging.debug('Finishing realtime thread')
         if self.realtime_thread:
             self.realtime_thread.join()
 
+
+
+
+
+
+    def _recording_worker(self):
+        """
+        The main worker method which constantly monitors the audio input for voice activity and accordingly starts/stops the recording.
+        """
+
+        logging.debug('Starting recording worker')
+
         try:
-            if self.stream:
-                self.stream.stop_stream()
-                self.stream.close()
-            if self.audio_interface:
-                self.audio_interface.terminate()
+            was_recording = False
+            delay_was_passed = False
+
+            # Continuously monitor audio for voice activity
+            while self.is_running:
+
+                data = self.audio_queue.get()
+
+                # Handle queue overflow
+                queue_overflow_logged = False
+                while self.audio_queue.qsize() > self.allowed_latency_limit:
+                    if not queue_overflow_logged:
+                        logging.warning(f"Audio queue size exceeds latency limit. Current size: {self.audio_queue.qsize()}. Discarding old audio chunks.")
+                        queue_overflow_logged = True
+                    data = self.audio_queue.get()
+
+                if not self.is_recording:
+                    # Handle not recording state
+
+                    time_since_listen_start = time.time() - self.listen_start if self.listen_start else 0
+                    wake_word_activation_delay_passed = (time_since_listen_start > self.wake_word_activation_delay)
+
+                    # Handle wake-word timeout callback
+                    if wake_word_activation_delay_passed and not delay_was_passed:
+                        if self.wake_words and self.wake_word_activation_delay:
+                            if self.on_wakeword_timeout:
+                                self.on_wakeword_timeout()
+                    delay_was_passed = wake_word_activation_delay_passed
+
+                    # Set state and spinner text 
+                    if not self.recording_stop_time:
+                        if self.wake_words and wake_word_activation_delay_passed and not self.wakeword_detected:
+                            self._set_state("wakeword")
+                        else:
+                            if self.listen_start:
+                                self._set_state("listening")
+                            else:
+                                self._set_state("inactive")
+
+                    # Detect wake words if applicable
+                    if self.wake_words and wake_word_activation_delay_passed:
+                        try:
+                            pcm = struct.unpack_from("h" * self.buffer_size, data)
+                            wakeword_index = self.porcupine.process(pcm)
+
+                        except struct.error:
+                            logging.error("Error unpacking audio data for wake word processing.")
+                            continue
+                        
+                        except Exception as e:
+                            logging.error(f"Wake word processing error: {e}")
+                            continue
+                        
+                        # If a wake word is detected
+                        if wakeword_index >= 0:
+
+                            # Removing the wake word from the recording
+                            samples_for_0_1_sec = int(self.sample_rate * 0.1)
+                            start_index = max(0, len(self.audio_buffer) - samples_for_0_1_sec)
+                            temp_samples = collections.deque(itertools.islice(self.audio_buffer, start_index, None))
+                            self.audio_buffer.clear()
+                            self.audio_buffer.extend(temp_samples)
+
+                            self.wake_word_detect_time = time.time()
+                            self.wakeword_detected = True
+                            if self.on_wakeword_detected:
+                                self.on_wakeword_detected()
+
+                    # Check for voice activity to trigger the start of recording
+                    if ((not self.wake_words or not wake_word_activation_delay_passed) and self.start_recording_on_voice_activity) or self.wakeword_detected:
+
+                        if self._is_voice_active():
+                            logging.info("voice activity detected")
+
+                            self.start()
+
+                            if self.is_recording:
+                                self.start_recording_on_voice_activity = False
+
+                                # Add the buffered audio to the recording frames
+                                self.frames.extend(list(self.audio_buffer))
+                                self.audio_buffer.clear()
+
+                            self.silero_vad_model.reset_states()
+                        else:
+                            data_copy = data[:]
+                            self._check_voice_activity(data_copy)
+
+                    self.speech_end_silence_start = 0
+
+                else:
+                    # If we are currently recording
+
+                    # Stop the recording if silence is detected after speech
+                    if self.stop_recording_on_voice_deactivity:
+
+                        if not self._is_webrtc_speech(data, True):
+
+                            # Voice deactivity was detected, so we start measuring silence time before stopping recording
+                            if self.speech_end_silence_start == 0:
+                                self.speech_end_silence_start = time.time()
+                            
+                        else:
+                            self.speech_end_silence_start = 0
+
+                        # Wait for silence to stop recording after speech
+                        if self.speech_end_silence_start and time.time() - self.speech_end_silence_start > self.post_speech_silence_duration:
+                            logging.info("voice deactivity detected")
+                            self.stop()
+
+
+                if not self.is_recording and was_recording:
+                    # Reset after stopping recording to ensure clean state
+                    self.stop_recording_on_voice_deactivity = False
+
+                if time.time() - self.silero_check_time > 0.1:
+                    self.silero_check_time = 0
+                
+                # Handle wake word timeout (waited too long to initiate speech after wake word detection)
+                if self.wake_word_detect_time and time.time() - self.wake_word_detect_time > self.wake_word_timeout:
+                    self.wake_word_detect_time = 0
+                    if self.wakeword_detected and self.on_wakeword_timeout:
+                        self.on_wakeword_timeout()
+                    self.wakeword_detected = False
+
+                was_recording = self.is_recording
+
+
+                if self.is_recording:
+                    self.frames.append(data)
+
+                if not self.is_recording or self.speech_end_silence_start:
+                    self.audio_buffer.append(data)	
+
 
         except Exception as e:
-            logging.error(f"Error closing the audio stream: {e}")
+            logging.error(f"Unhandled exception in _recording_worker: {e}")
+            raise
+
+
+    def _realtime_worker(self):
+        """
+        Performs real-time transcription if the feature is enabled.
+
+        The method is responsible for transcribing recorded audio frames in real-time
+        based on the specified resolution interval.
+        The transcribed text is stored in `self.realtime_transcription_text` and a callback
+        function is invoked with this text if specified.
+        """
+
+        try:
+
+            logging.debug('Starting realtime worker')
+
+            # Return immediately if real-time transcription is not enabled
+            if not self.enable_realtime_transcription:
+                return
+                
+            # Continue running as long as the main process is active
+            while self.is_running:
+
+                # Check if the recording is active
+                if self.is_recording:
+                    
+                    # Sleep for the duration of the transcription resolution
+                    time.sleep(self.realtime_processing_pause)
+                    
+                    # Convert the buffer frames to a NumPy array
+                    audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+                    
+                    # Normalize the array to a [-1, 1] range
+                    audio_array = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+
+                    # Perform transcription and assemble the text
+                    segments = self.realtime_model_type.transcribe(
+                        audio_array,
+                        language=self.language if self.language else None
+                    )
+
+                    # Double-check the recording state because it could have changed mid-transcription
+                    if self.is_recording and time.time() - self.recording_start_time > 0.5:
+
+                        logging.debug('Starting realtime transcription')
+                        self.realtime_transcription_text = " ".join(seg.text for seg in segments[0]).strip()
+
+                        self.text_storage.append(self.realtime_transcription_text)
+
+                        # Take the last two texts in storage, if they exist
+                        if len(self.text_storage) >= 2:
+                            last_two_texts = self.text_storage[-2:]
+                            
+                            # Find the longest common prefix between the two texts
+                            prefix = os.path.commonprefix([last_two_texts[0], last_two_texts[1]])
+
+                            # This prefix is the text that was transcribed the same way twice in a row
+                            # Store it as "safely detected text"
+                            if len(prefix) >= len(self.realtime_stabilized_safetext):
+                                # Only store it when it is at least as long as the previous one, as an additional safeguard
+                                self.realtime_stabilized_safetext = prefix
+
+                        # Find parts of the stabilized text in the freshly transcribed text
+                        matching_position = self._find_tail_match_in_text(self.realtime_stabilized_safetext, self.realtime_transcription_text)
+                        if matching_position < 0:
+                            if self.realtime_stabilized_safetext:
+                                self._on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_stabilized_safetext, True))
+                            else:
+                                self._on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_transcription_text, True))
+                        else:
+                            # We found parts of the stabilized text in the transcribed text
+                            # We now take the stabilized text and add only the freshly transcribed part to it
+                            output_text = self.realtime_stabilized_safetext + self.realtime_transcription_text[matching_position:]
+
+                            # This yields the "left" part of the text as stabilized while also delivering freshly
+                            # detected parts on the first run, without the need for two transcriptions
+                            self._on_realtime_transcription_stabilized(self._preprocess_output(output_text, True))
+
+                        # Invoke the callback with the transcribed text
+                        self._on_realtime_transcription_update(self._preprocess_output(self.realtime_transcription_text, True))
+
+
+                # If not recording, sleep briefly before checking again
+                else:
+                    time.sleep(TIME_SLEEP)
+
+        except Exception as e:
+            logging.error(f"Unhandled exception in _realtime_worker: {e}")
+            raise
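To make the prefix-based stabilization above concrete, a small standalone sketch (the sample strings are invented):

```python
import os

# Two consecutive realtime transcriptions of the same (still growing) audio
previous = "The quick brown fox jumps"
current = "The quick brown fox jumped over"

# The longest common prefix is the part that was transcribed the same way
# twice in a row, so it is treated as "safely detected text".
stable = os.path.commonprefix([previous, current])
print(stable)  # "The quick brown fox jump"
```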
 
 
     def _is_silero_speech(self, data):
@@ -705,143 +1029,6 @@ class AudioToTextRecorder:
                 self.halo.text = text
 
 
-    def _recording_worker(self):
-        """
-        The main worker method which constantly monitors the audio input for voice activity and accordingly starts/stops the recording.
-        """
-
-        logging.debug('Starting recording worker')
-
-        try:
-            was_recording = False
-            delay_was_passed = False
-
-            # Continuously monitor audio for voice activity
-            while self.is_running:
-
-                data = self.audio_queue.get()
-
-                if not self.is_recording:
-                    # handle not recording state
-
-                    time_since_listen_start = time.time() - self.listen_start if self.listen_start else 0
-                    wake_word_activation_delay_passed = (time_since_listen_start > self.wake_word_activation_delay)
-
-                    # handle wake-word timeout callback
-                    if wake_word_activation_delay_passed and not delay_was_passed:
-                        if self.wake_words and self.wake_word_activation_delay:
-                            if self.on_wakeword_timeout:
-                                self.on_wakeword_timeout()
-                    delay_was_passed = wake_word_activation_delay_passed
-
-                    # Set state and spinner text 
-                    if not self.recording_stop_time:
-                        if self.wake_words and wake_word_activation_delay_passed and not self.wakeword_detected:
-                            self._set_state("wakeword")
-                        else:
-                            if self.listen_start:
-                                self._set_state("listening")
-                            else:
-                                self._set_state("inactive")
-
-                    # Detect wake words if applicable
-                    if self.wake_words and wake_word_activation_delay_passed:
-                        try:
-                            pcm = struct.unpack_from("h" * self.buffer_size, data)
-                            wakeword_index = self.porcupine.process(pcm)
-
-                        except struct.error:
-                            logging.error("Error unpacking audio data for wake word processing.")
-                            continue
-                        
-                        except Exception as e:
-                            logging.error(f"Wake word processing error: {e}")
-                            continue
-                        
-                        # If a wake word is detected
-                        if wakeword_index >= 0:
-
-                            # Removing the wake word from the recording
-                            samples_for_0_1_sec = int(self.sample_rate * 0.1)
-                            start_index = max(0, len(self.audio_buffer) - samples_for_0_1_sec)
-                            temp_samples = collections.deque(itertools.islice(self.audio_buffer, start_index, None))
-                            self.audio_buffer.clear()
-                            self.audio_buffer.extend(temp_samples)
-
-                            self.wake_word_detect_time = time.time()
-                            self.wakeword_detected = True
-                            if self.on_wakeword_detected:
-                                self.on_wakeword_detected()
-
-                    # Check for voice activity to trigger the start of recording
-                    if ((not self.wake_words or not wake_word_activation_delay_passed) and self.start_recording_on_voice_activity) or self.wakeword_detected:
-
-                        if self._is_voice_active():
-                            logging.info("voice activity detected")
-
-                            self.start()
-
-                            if self.is_recording:
-                                self.start_recording_on_voice_activity = False
-
-                                # Add the buffered audio to the recording frames
-                                self.frames.extend(list(self.audio_buffer))
-                                self.audio_buffer.clear()
-
-                            self.silero_vad_model.reset_states()
-                        else:
-                            data_copy = data[:]
-                            self._check_voice_activity(data_copy)
-
-                    self.speech_end_silence_start = 0
-
-                else:
-                    # If we are currently recording
-
-                    # Stop the recording if silence is detected after speech
-                    if self.stop_recording_on_voice_deactivity:
-
-                        if not self._is_webrtc_speech(data, True):
-
-                            # Voice deactivity was detected, so we start measuring silence time before stopping recording
-                            if self.speech_end_silence_start == 0:
-                                self.speech_end_silence_start = time.time()
-                            
-                        else:
-                            self.speech_end_silence_start = 0
-
-                        # Wait for silence to stop recording after speech
-                        if self.speech_end_silence_start and time.time() - self.speech_end_silence_start > self.post_speech_silence_duration:
-                            logging.info("voice deactivity detected")
-                            self.stop()
-
-                if not self.is_recording and was_recording:
-                    # Reset after stopping recording to ensure clean state
-                    self.stop_recording_on_voice_deactivity = False
-
-                if time.time() - self.silero_check_time > 0.1:
-                    self.silero_check_time = 0
-                
-                # handle wake word timeout (waited to long initiating speech after wake word detection)
-                if self.wake_word_detect_time and time.time() - self.wake_word_detect_time > self.wake_word_timeout:
-                    self.wake_word_detect_time = 0
-                    if self.wakeword_detected and self.on_wakeword_timeout:
-                        self.on_wakeword_timeout()
-                    self.wakeword_detected = False
-
-                if self.is_recording:
-                    self.frames.append(data)
-
-                if not self.is_recording or self.speech_end_silence_start:
-                    self.audio_buffer.append(data)	
-
-                was_recording = self.is_recording
-
-        except Exception as e:
-            logging.error(f"Unhandled exeption in _recording_worker: {e}")
-            raise
-
-
     def _preprocess_output(self, text, preview=False):
         """
         Preprocesses the output text by removing any leading or trailing whitespace,
@@ -869,7 +1056,7 @@ class AudioToTextRecorder:
         return text
 
 
-    def find_tail_match_in_text(self, text1, text2, length_of_match=10):
+    def _find_tail_match_in_text(self, text1, text2, length_of_match=10):
         """
         Find the position where the last 'n' characters of text1 match with a substring in text2.
         
@@ -905,106 +1092,68 @@ class AudioToTextRecorder:
         
         return -1
     
+
     def _on_realtime_transcription_stabilized(self, text):
+        """
+        Callback method invoked when the real-time transcription stabilizes.
+
+        This method is called internally when the transcription text is considered "stable",
+        meaning it's less likely to change significantly with additional audio input. It 
+        notifies any registered external listener about the stabilized text if recording is 
+        still ongoing. This is particularly useful for applications that need to display 
+        live transcription results to users and want to highlight parts of the transcription 
+        that are less likely to change.
+
+        Args:
+            text (str): The stabilized transcription text.
+        """        
         if self.on_realtime_transcription_stabilized:
             if self.is_recording:
                 self.on_realtime_transcription_stabilized(text)
 
+
     def _on_realtime_transcription_update(self, text):
+        """
+        Callback method invoked when there's an update in the real-time transcription.
+
+        This method is called internally whenever there's a change in the transcription text,
+        notifying any registered external listener about the update if recording is still 
+        ongoing. This provides a mechanism for applications to receive and possibly display 
+        live transcription updates, which could be partial and still subject to change.
+
+        Args:
+            text (str): The updated transcription text.
+        """        
         if self.on_realtime_transcription_update:
             if self.is_recording:
                 self.on_realtime_transcription_update(text)
 
-    def _realtime_worker(self):
-        """
-        Performs real-time transcription if the feature is enabled.
 
-        The method is responsible transcribing recorded audio frames in real-time
-         based on the specified resolution interval.
-        The transcribed text is stored in `self.realtime_transcription_text` and a callback
-        function is invoked with this text if specified.
+    def __enter__(self):
         """
+        Method to set up the context manager protocol.
 
-        try:
+        This enables the instance to be used in a `with` statement, ensuring proper 
+        resource management. When the `with` block is entered, this method is 
+        automatically called.
 
-            logging.debug('Starting realtime worker')
-
-            # Return immediately if real-time transcription is not enabled
-            if not self.enable_realtime_transcription:
-                return
-                
-            # Continue running as long as the main process is active
-            while self.is_running:
-
-                # Check if the recording is active
-                if self.is_recording:
-                    
-                    # Sleep for the duration of the transcription resolution
-                    time.sleep(self.realtime_processing_pause)
-                    
-                    # Convert the buffer frames to a NumPy array
-                    audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
-                    
-                    # Normalize the array to a [-1, 1] range
-                    audio_array = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
-
-                    # Perform transcription and assemble the text
-                    segments = self.realtime_model_type.transcribe(
-                        audio_array,
-                        language=self.language if self.language else None
-                    )
-
-                    # double check recording state because it could have changed mid-transcription
-                    if self.is_recording and time.time() - self.recording_start_time > 0.5:
-
-                        logging.debug('Starting realtime transcription')
-                        self.realtime_transcription_text = " ".join(seg.text for seg in segments[0]).strip()
-
-                        self.text_storage.append(self.realtime_transcription_text)
-
-                        # Take the last two texts in storage, if they exist
-                        if len(self.text_storage) >= 2:
-                            last_two_texts = self.text_storage[-2:]
-                            
-                            # Find the longest common prefix between the two texts
-                            prefix = os.path.commonprefix([last_two_texts[0], last_two_texts[1]])
-
-                            # This prefix is the text that was transcripted two times in the same way
-                            # Store as "safely detected text" 
-                            if len(prefix) >= len(self.realtime_stabilized_safetext):
-                                # Only store when longer than the previous as additional security 
-                                self.realtime_stabilized_safetext = prefix
-
-                        # Find parts of the stabilized text in the freshly transscripted text
-                        matching_position = self.find_tail_match_in_text(self.realtime_stabilized_safetext, self.realtime_transcription_text)
-                        if matching_position < 0:
-                            if self.realtime_stabilized_safetext:
-                                self._on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_stabilized_safetext, True))
-                            else:
-                                self._on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_transcription_text, True))
-                        else:
-                            # We found parts of the stabilized text in the transcripted text
-                            # We now take the stabilized text and add only the freshly transcripted part to it
-                            output_text = self.realtime_stabilized_safetext + self.realtime_transcription_text[matching_position:]
-
-                            # This yields us the "left" text part as stabilized AND at the same time delivers fresh detected parts 
-                            # on the first run without the need for two transcriptions
-                            self._on_realtime_transcription_stabilized(self._preprocess_output(output_text, True))
-
-                        # Invoke the callback with the transcribed text
-                        self._on_realtime_transcription_update(self._preprocess_output(self.realtime_transcription_text, True))
+        Returns:
+            self: The current instance of the class.
+        """
+        return self
 
 
-                # If not recording, sleep briefly before checking again
-                else:
-                    time.sleep(TIME_SLEEP)
+    def __exit__(self, exc_type, exc_value, traceback):
+        """
+        Method to define behavior when the context manager protocol exits.
 
-        except Exception as e:
-            logging.error(f"Unhandled exeption in _realtime_worker: {e}")
-            raise
+        This is called when exiting the `with` block and ensures that any necessary 
+        cleanup or resource release processes are executed, such as shutting down 
+        the system properly.
 
-    def __del__(self):
-        """
-        Destructor method ensures safe shutdown of the recorder when the instance is destroyed.
+        Args:
+            exc_type (Exception or None): The type of the exception that caused the context to be exited, if any.
+            exc_value (Exception or None): The exception instance that caused the context to be exited, if any.
+            traceback (Traceback or None): The traceback corresponding to the exception, if any.
         """
         self.shutdown()
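A small usage sketch of the behavior described above: `__exit__` calls `shutdown()` even when the `with` block raises (the exception here is only for illustration):

```python
from RealtimeSTT import AudioToTextRecorder

if __name__ == '__main__':
    try:
        with AudioToTextRecorder() as recorder:
            raise RuntimeError("simulated failure")
    except RuntimeError:
        # __exit__ has already called recorder.shutdown(), so the worker
        # processes and the audio stream are cleaned up at this point.
        pass
```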

+ 475 - 467
example_app/ui_openai_voice_interface.py

@@ -1,514 +1,522 @@
-from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
-from RealtimeSTT import AudioToTextRecorder
-
-from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
-from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont, QMouseEvent, QContextMenuEvent
-from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget, QMenu, QAction
-
-import os
-import openai
-import sys
-import time
-import sounddevice as sd
-import numpy as np
-import wavio
-import keyboard
-
-max_history_messages = 6
-return_to_wakewords_after_silence = 12
-start_with_wakeword = False
-start_engine = "Azure" # Azure, Elevenlabs
-recorder_model = "large-v2"
-language = "en"
-azure_speech_region = "eastus"
-openai_model = "gpt-3.5-turbo" # gpt-3.5-turbo, gpt-4, gpt-3.5-turbo-0613 / gpt-3.5-turbo-16k-0613 / gpt-4-0613 / gpt-4-32k-0613
-
-openai.api_key = os.environ.get("OPENAI_API_KEY")
-
-user_font_size = 22
-user_color = QColor(0, 188, 242) # turquoise
-
-assistant_font_size = 24
-assistant_color = QColor(239, 98, 166) # pink
-
-voice_azure = "en-GB-SoniaNeural"
-voice_system = "Zira"
-#voice_system = "Hazel"
-prompt = "Be concise, polite, and casual with a touch of sass. Aim for short, direct responses, as if we're talking."
-elevenlabs_model = "eleven_monolingual_v1"
-
-if language == "de":
-    elevenlabs_model = "eleven_multilingual_v1"
-    voice_system = "Katja"
-    voice_azure = "de-DE-MajaNeural"
-    prompt = 'Sei präzise, höflich und locker, mit einer Prise Schlagfertigkeit. Antworte kurz und direkt, als ob wir gerade sprechen.'
-    
-print ("Click the top right corner to change the engine")
-print ("Press ESC to stop the current playback")
-
-system_prompt_message = {
-    'role': 'system',
-    'content': prompt
-}
-
-def generate_response(messages):
-    """Generate assistant's response using OpenAI."""
-    for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True, logit_bias={35309:-100, 36661:-100}):
-        text_chunk = chunk["choices"][0]["delta"].get("content")
-        if text_chunk:
-            yield text_chunk
-
-history = []
-MAX_WINDOW_WIDTH = 1600
-MAX_WIDTH_ASSISTANT = 1200
-MAX_WIDTH_USER = 1500
-
-class AudioPlayer(QThread):
-    def __init__(self, file_path):
-        super(AudioPlayer, self).__init__()
-        self.file_path = file_path
-
-    def run(self):
-        wav = wavio.read(self.file_path)
-        sound = wav.data.astype(np.float32) / np.iinfo(np.int16).max  
-        sd.play(sound, wav.rate)
-        sd.wait()
-
-class TextRetrievalThread(QThread):
-    textRetrieved = pyqtSignal(str)
-
-    def __init__(self, recorder):
-        super().__init__()
-        self.recorder = recorder
-        self.active = False  
-
-    def run(self):
-        while True:
-            if self.active:  
-                text = self.recorder.text()
-                self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
-                self.textRetrieved.emit(text)
-                self.active = False
-            time.sleep(0.1) 
-
-    def activate(self):
-        self.active = True 
-
-class TransparentWindow(QWidget):
-    updateUI = pyqtSignal()
-    clearAssistantTextSignal = pyqtSignal()
-    clearUserTextSignal = pyqtSignal()
-
-    def __init__(self):
-        super().__init__()
-
-        self.setGeometry(1, 1, 1, 1) 
-
-        self.setWindowTitle("Transparent Window")
-        self.setAttribute(Qt.WA_TranslucentBackground)
-        self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)
-
-        self.big_symbol_font = QFont('Arial', 32)
-        self.small_symbol_font = QFont('Arial', 17)
-        self.user_font = QFont('Arial', user_font_size)
-        self.assistant_font = QFont('Arial', assistant_font_size)      
-        self.assistant_font.setItalic(True) 
-
-        self.big_symbol_text = ""
-        self.small_symbol_text = ""
-        self.user_text = ""
-        self.assistant_text = ""
-        self.displayed_user_text = ""
-        self.displayed_assistant_text = ""
-        self.stream = None
-        self.text_retrieval_thread = None
-
-        self.user_text_timer = QTimer(self)
-        self.assistant_text_timer = QTimer(self)
-        self.user_text_timer.timeout.connect(self.clear_user_text)
-        self.assistant_text_timer.timeout.connect(self.clear_assistant_text)
-
-        self.clearUserTextSignal.connect(self.init_clear_user_text)
-        self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)
-        self.user_text_opacity = 255 
-        self.assistant_text_opacity = 255 
-        self.updateUI.connect(self.update_self)
-        self.audio_player = None
-
-        self.run_fade_user = False
-        self.run_fade_assistant = False
-
-        self.menu = QMenu()
-        self.menu.setStyleSheet("""
-            QMenu {
-                background-color: black;
-                color: white;
-                border-radius: 10px;
-            }
-            QMenu::item:selected {
-                background-color: #555555;
-            }
-            """)
-
-        self.elevenlabs_action = QAction("Elevenlabs", self)
-        self.azure_action = QAction("Azure", self)
-        self.system_action = QAction("System", self)
-        self.quit_action = QAction("Quit", self)
-
-        self.menu.addAction(self.elevenlabs_action)
-        self.menu.addAction(self.azure_action)
-        self.menu.addAction(self.system_action)
-        self.menu.addSeparator() 
-        self.menu.addAction(self.quit_action)
-
-        self.elevenlabs_action.triggered.connect(lambda: self.select_engine("Elevenlabs"))
-        self.azure_action.triggered.connect(lambda: self.select_engine("Azure"))
-        self.system_action.triggered.connect(lambda: self.select_engine("System"))
-        self.quit_action.triggered.connect(self.close_application)
-
-    def mousePressEvent(self, event: QMouseEvent):
-        if event.button() == Qt.LeftButton:
-            if event.pos().x() >= self.width() - 100 and event.pos().y() <= 100:
-                self.menu.exec_(self.mapToGlobal(event.pos()))        
-
-    def close_application(self):
-        QApplication.quit()                
-
-    def init(self):
-
-        self.select_engine(start_engine)
-
-        # recorder = AudioToTextRecorder(spinner=False, model="large-v2", language="de", on_recording_start=recording_start, silero_sensitivity=0.4, post_speech_silence_duration=0.4, min_length_of_recording=0.3, min_gap_between_recordings=0.01, realtime_preview_resolution = 0.01, realtime_preview = True, realtime_preview_model = "small", on_realtime_preview=text_detected)
-
-        self.recorder = AudioToTextRecorder(
-            model=recorder_model,
-            language=language,
-            wake_words="Jarvis",
-            spinner=True,
-            silero_sensitivity=0.2,
-            webrtc_sensitivity=3,
-            on_recording_start=self.on_recording_start,
-            on_vad_detect_start=self.on_vad_detect_start,
-            on_wakeword_detection_start=self.on_wakeword_detection_start,
-            on_transcription_start=self.on_transcription_start,
-            post_speech_silence_duration=0.4, 
-            min_length_of_recording=0.3, 
-            min_gap_between_recordings=0.01, 
-            enable_realtime_transcription = True,
-            realtime_processing_pause = 0.01, 
-            realtime_model_type = "tiny",
-            on_realtime_transcription_stabilized=self.text_detected
-        )
-        if not start_with_wakeword:
-            self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
-            
-        self.text_retrieval_thread = TextRetrievalThread(self.recorder)
-        self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
-        self.text_retrieval_thread.start()
-        self.text_retrieval_thread.activate()
-
-        keyboard.on_press_key('esc', self.on_escape)
+if __name__ == '__main__':
 
-    def select_engine(self, engine_name):
-        if self.stream:
-            self.stream.stop()
+    from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
+    from RealtimeSTT import AudioToTextRecorder
+
+    from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
+    from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont, QMouseEvent, QContextMenuEvent
+    from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget, QMenu, QAction
+
+    import os
+    import openai
+    import sys
+    import time
+    import sounddevice as sd
+    import numpy as np
+    import wavio
+    import keyboard
+
+    max_history_messages = 6
+    return_to_wakewords_after_silence = 12
+    start_with_wakeword = False
+    start_engine = "Azure" # Azure, Elevenlabs
+    recorder_model = "large-v2"
+    language = "en"
+    azure_speech_region = "eastus"
+    openai_model = "gpt-3.5-turbo" # gpt-3.5-turbo, gpt-4, gpt-3.5-turbo-0613 / gpt-3.5-turbo-16k-0613 / gpt-4-0613 / gpt-4-32k-0613
+
+    openai.api_key = os.environ.get("OPENAI_API_KEY")
+
+    user_font_size = 22
+    user_color = QColor(0, 188, 242) # turquoise
+
+    assistant_font_size = 24
+    assistant_color = QColor(239, 98, 166) # pink
+
+    voice_azure = "en-GB-SoniaNeural"
+    voice_system = "Zira"
+    #voice_system = "Hazel"
+    prompt = "Be concise, polite, and casual with a touch of sass. Aim for short, direct responses, as if we're talking."
+    elevenlabs_model = "eleven_monolingual_v1"
+
+    if language == "de":
+        elevenlabs_model = "eleven_multilingual_v1"
+        voice_system = "Katja"
+        voice_azure = "de-DE-MajaNeural"
+        prompt = 'Sei präzise, höflich und locker, mit einer Prise Schlagfertigkeit. Antworte kurz und direkt, als ob wir gerade sprechen.'
+        
+    print ("Click the top right corner to change the engine")
+    print ("Press ESC to stop the current playback")
+
+    system_prompt_message = {
+        'role': 'system',
+        'content': prompt
+    }
+
+    def generate_response(messages):
+        """Generate assistant's response using OpenAI."""
+        for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True, logit_bias={35309:-100, 36661:-100}):
+            text_chunk = chunk["choices"][0]["delta"].get("content")
+            if text_chunk:
+                yield text_chunk
+
+    history = []
+    MAX_WINDOW_WIDTH = 1600
+    MAX_WIDTH_ASSISTANT = 1200
+    MAX_WIDTH_USER = 1500
+
+    class AudioPlayer(QThread):
+        def __init__(self, file_path):
+            super(AudioPlayer, self).__init__()
+            self.file_path = file_path
+
+        def run(self):
+            wav = wavio.read(self.file_path)
+            sound = wav.data.astype(np.float32) / np.iinfo(np.int16).max  
+            sd.play(sound, wav.rate)
+            sd.wait()
+
+    class TextRetrievalThread(QThread):
+        textRetrieved = pyqtSignal(str)
+
+        def __init__(self, recorder):
+            super().__init__()
+            self.recorder = recorder
+            self.active = False  
+
+        def run(self):
+            while True:
+                if self.active:  
+                    text = self.recorder.text()
+                    self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
+                    self.textRetrieved.emit(text)
+                    self.active = False
+                time.sleep(0.1) 
+
+        def activate(self):
+            self.active = True 
+
+    class TransparentWindow(QWidget):
+        updateUI = pyqtSignal()
+        clearAssistantTextSignal = pyqtSignal()
+        clearUserTextSignal = pyqtSignal()
+
+        def __init__(self):
+            super().__init__()
+
+            self.setGeometry(1, 1, 1, 1) 
+
+            self.setWindowTitle("Transparent Window")
+            self.setAttribute(Qt.WA_TranslucentBackground)
+            self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)
+
+            self.big_symbol_font = QFont('Arial', 32)
+            self.small_symbol_font = QFont('Arial', 17)
+            self.user_font = QFont('Arial', user_font_size)
+            self.assistant_font = QFont('Arial', assistant_font_size)      
+            self.assistant_font.setItalic(True) 
+
+            self.big_symbol_text = ""
+            self.small_symbol_text = ""
+            self.user_text = ""
+            self.assistant_text = ""
+            self.displayed_user_text = ""
+            self.displayed_assistant_text = ""
             self.stream = None
+            self.text_retrieval_thread = None
+            self.recorder = None
 
-        engine = None
+            self.user_text_timer = QTimer(self)
+            self.assistant_text_timer = QTimer(self)
+            self.user_text_timer.timeout.connect(self.clear_user_text)
+            self.assistant_text_timer.timeout.connect(self.clear_assistant_text)
 
-        if engine_name == "Azure":
-            engine = AzureEngine(
-                    os.environ.get("AZURE_SPEECH_KEY"),
-                    os.environ.get("AZURE_SPEECH_REGION"),
-                    voice_azure,
-                    rate=24,
-                    pitch=10,
-                )
+            self.clearUserTextSignal.connect(self.init_clear_user_text)
+            self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)
+            self.user_text_opacity = 255 
+            self.assistant_text_opacity = 255 
+            self.updateUI.connect(self.update_self)
+            self.audio_player = None
+
+            self.run_fade_user = False
+            self.run_fade_assistant = False
+
+            self.menu = QMenu()
+            self.menu.setStyleSheet("""
+                QMenu {
+                    background-color: black;
+                    color: white;
+                    border-radius: 10px;
+                }
+                QMenu::item:selected {
+                    background-color: #555555;
+                }
+                """)
+
+            self.elevenlabs_action = QAction("Elevenlabs", self)
+            self.azure_action = QAction("Azure", self)
+            self.system_action = QAction("System", self)
+            self.quit_action = QAction("Quit", self)
+
+            self.menu.addAction(self.elevenlabs_action)
+            self.menu.addAction(self.azure_action)
+            self.menu.addAction(self.system_action)
+            self.menu.addSeparator() 
+            self.menu.addAction(self.quit_action)
+
+            self.elevenlabs_action.triggered.connect(lambda: self.select_engine("Elevenlabs"))
+            self.azure_action.triggered.connect(lambda: self.select_engine("Azure"))
+            self.system_action.triggered.connect(lambda: self.select_engine("System"))
+            self.quit_action.triggered.connect(self.close_application)
+
+        def mousePressEvent(self, event: QMouseEvent):
+            if event.button() == Qt.LeftButton:
+                if event.pos().x() >= self.width() - 100 and event.pos().y() <= 100:
+                    self.menu.exec_(self.mapToGlobal(event.pos()))        
+
+        def close_application(self):
+            if self.recorder:
+                self.recorder.shutdown()                    
+            QApplication.quit()                
+
+        def init(self):
+
+            self.select_engine(start_engine)
+
+            self.recorder = AudioToTextRecorder(
+                model=recorder_model,
+                language=language,
+                wake_words="Jarvis",
+                silero_use_onnx=False,
+                spinner=True,
+                silero_sensitivity=0.2,
+                webrtc_sensitivity=3,
+                on_recording_start=self.on_recording_start,
+                on_vad_detect_start=self.on_vad_detect_start,
+                on_wakeword_detection_start=self.on_wakeword_detection_start,
+                on_transcription_start=self.on_transcription_start,
+                post_speech_silence_duration=0.4, 
+                min_length_of_recording=0.3, 
+                min_gap_between_recordings=0.01, 
+                enable_realtime_transcription = True,
+                realtime_processing_pause = 0.01, 
+                realtime_model_type = "tiny",
+                on_realtime_transcription_stabilized=self.text_detected
+            )
+            if not start_with_wakeword:
+                self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
+                
+            self.text_retrieval_thread = TextRetrievalThread(self.recorder)
+            self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
+            self.text_retrieval_thread.start()
+            self.text_retrieval_thread.activate()
 
-        elif engine_name == "Elevenlabs":
-            engine = ElevenlabsEngine(
-                    os.environ.get("ELEVENLABS_API_KEY"),
-                    model=elevenlabs_model
+            keyboard.on_press_key('esc', self.on_escape)
+
+        def closeEvent(self, event):
+            if self.recorder:
+                self.recorder.shutdown()            
+
+        def select_engine(self, engine_name):
+            if self.stream:
+                self.stream.stop()
+                self.stream = None
+
+            engine = None
+
+            if engine_name == "Azure":
+                engine = AzureEngine(
+                        os.environ.get("AZURE_SPEECH_KEY"),
+                        os.environ.get("AZURE_SPEECH_REGION"),
+                        voice_azure,
+                        rate=24,
+                        pitch=10,
+                    )
+
+            elif engine_name == "Elevenlabs":
+                engine = ElevenlabsEngine(
+                        os.environ.get("ELEVENLABS_API_KEY"),
+                        model=elevenlabs_model
+                    )
+            else:
+                engine = SystemEngine(
+                    voice=voice_system,
+                    #print_installed_voices=True
                 )
-        else:
-            engine = SystemEngine(
-                voice=voice_system,
-                #print_installed_voices=True
+
+            self.stream = TextToAudioStream(
+                engine,
+                on_character=self.on_character,
+                on_text_stream_stop=self.on_text_stream_stop,
+                on_text_stream_start=self.on_text_stream_start,
+                on_audio_stream_stop=self.on_audio_stream_stop,
+                log_characters=True
             )
+            sys.stdout.write('\033[K')  # Clear to the end of line
+            sys.stdout.write('\r')  # Move the cursor to the beginning of the line
+            print (f"Using {engine_name} engine")
 
-        self.stream = TextToAudioStream(
-            engine,
-            on_character=self.on_character,
-            on_text_stream_stop=self.on_text_stream_stop,
-            on_text_stream_start=self.on_text_stream_start,
-            on_audio_stream_stop=self.on_audio_stream_stop,
-            log_characters=True
-        )
-        sys.stdout.write('\033[K')  # Clear to the end of line
-        sys.stdout.write('\r')  # Move the cursor to the beginning of the line
-        print (f"Using {engine_name} engine")
-
-
-    def text_detected(self, text):
-        self.run_fade_user = False
-        if self.user_text_timer.isActive():
-            self.user_text_timer.stop()
-        self.user_text_opacity = 255 
-        self.user_text = text
-        self.updateUI.emit()
-
-    def on_escape(self, e):
-        if self.stream.is_playing():
-            self.stream.stop()
-
-    def showEvent(self, event: QEvent):
-        super().showEvent(event)
-        if event.type() == QEvent.Show:
-            self.set_symbols("⌛", "🚀")
-            QTimer.singleShot(1000, self.init) 
-
-    def on_character(self, char):
-        if self.stream:
-            self.assistant_text += char
+
+        def text_detected(self, text):
+            self.run_fade_user = False
+            if self.user_text_timer.isActive():
+                self.user_text_timer.stop()
+            self.user_text_opacity = 255 
+            self.user_text = text
             self.updateUI.emit()
 
-    def on_text_stream_stop(self):
-        print("\"", end="", flush=True)
-        if self.stream:
-            assistant_response = self.stream.text()            
-            self.assistant_text = assistant_response
-            history.append({'role': 'assistant', 'content': assistant_response})
+        def on_escape(self, e):
+            if self.stream.is_playing():
+                self.stream.stop()
 
-    def on_audio_stream_stop(self):
-        self.set_symbols("🎙️", "⚪")
+        def showEvent(self, event: QEvent):
+            super().showEvent(event)
+            if event.type() == QEvent.Show:
+                self.set_symbols("⌛", "🚀")
+                QTimer.singleShot(1000, self.init) 
 
-        if self.stream:
-            self.clearAssistantTextSignal.emit()
-            self.text_retrieval_thread.activate()
+        def on_character(self, char):
+            if self.stream:
+                self.assistant_text += char
+                self.updateUI.emit()
 
-    def generate_answer(self):
-        self.run_fade_assistant = False
-        if self.assistant_text_timer.isActive():
-            self.assistant_text_timer.stop()
+        def on_text_stream_stop(self):
+            print("\"", end="", flush=True)
+            if self.stream:
+                assistant_response = self.stream.text()            
+                self.assistant_text = assistant_response
+                history.append({'role': 'assistant', 'content': assistant_response})
 
-        history.append({'role': 'user', 'content': self.user_text})
-        self.remove_assistant_text()
-        assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
-        self.stream.feed(assistant_response)
-        self.stream.play_async(minimum_sentence_length=6,
-                               buffer_threshold_seconds=2)
+        def on_audio_stream_stop(self):
+            self.set_symbols("🎙️", "⚪")
 
-    def set_symbols(self, big_symbol, small_symbol):
-        self.big_symbol_text = big_symbol
-        self.small_symbol_text = small_symbol
-        self.updateUI.emit()
+            if self.stream:
+                self.clearAssistantTextSignal.emit()
+                self.text_retrieval_thread.activate()
 
-    def on_text_stream_start(self):
-        self.set_symbols("⌛", "👄")
+        def generate_answer(self):
+            self.run_fade_assistant = False
+            if self.assistant_text_timer.isActive():
+                self.assistant_text_timer.stop()
+
+            history.append({'role': 'user', 'content': self.user_text})
+            self.remove_assistant_text()
+            assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
+            self.stream.feed(assistant_response)
+            self.stream.play_async(minimum_sentence_length=6,
+                                buffer_threshold_seconds=2)
+
+        def set_symbols(self, big_symbol, small_symbol):
+            self.big_symbol_text = big_symbol
+            self.small_symbol_text = small_symbol
+            self.updateUI.emit()
 
-    def process_user_text(self, user_text):
-        user_text = user_text.strip()
-        if user_text:
-            self.run_fade_user = False
+        def on_text_stream_start(self):
+            self.set_symbols("⌛", "👄")
+
+        def process_user_text(self, user_text):
+            user_text = user_text.strip()
+            if user_text:
+                self.run_fade_user = False
+                if self.user_text_timer.isActive():
+                    self.user_text_timer.stop()
+
+                self.user_text_opacity = 255 
+                self.user_text = user_text
+                self.clearUserTextSignal.emit()
+                print (f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
+                self.set_symbols("⌛", "🧠")
+                QTimer.singleShot(100, self.generate_answer)
+
+        def on_transcription_start(self):
+            self.set_symbols("⌛", "📝")
+
+        def on_recording_start(self):
+            self.text_storage = []
+            self.ongoing_sentence = ""
+            self.set_symbols("🎙️", "🔴")
+
+        def on_vad_detect_start(self):
+            if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
+                self.audio_player = AudioPlayer("active.wav")
+                self.audio_player.start() 
+
+            self.set_symbols("🎙️", "⚪")
+
+        def on_wakeword_detection_start(self):
+            self.audio_player = AudioPlayer("inactive.wav")
+            self.audio_player.start()         
+
+            self.set_symbols("", "💤")
+
+        def init_clear_user_text(self):
             if self.user_text_timer.isActive():
-                self.user_text_timer.stop()
+                self.user_text_timer.stop()        
+            self.user_text_timer.start(10000)
 
+        def remove_user_text(self):
+            self.user_text = ""
             self.user_text_opacity = 255 
-            self.user_text = user_text
-            self.clearUserTextSignal.emit()
-            print (f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
-            self.set_symbols("⌛", "🧠")
-            QTimer.singleShot(100, self.generate_answer)
+            self.updateUI.emit()
 
-    def on_transcription_start(self):
-        self.set_symbols("⌛", "📝")
+        def fade_out_user_text(self):
+            if not self.run_fade_user:
+                return
 
-    def on_recording_start(self):
-        self.text_storage = []
-        self.ongoing_sentence = ""
-        self.set_symbols("🎙️", "🔴")
+            if self.user_text_opacity > 0:
+                self.user_text_opacity -= 5 
+                self.updateUI.emit()
+                QTimer.singleShot(50, self.fade_out_user_text)
+            else:
+                self.run_fade_user = False
+                self.remove_user_text()        
 
-    def on_vad_detect_start(self):
-        if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
-            self.audio_player = AudioPlayer("active.wav")
-            self.audio_player.start() 
+        def clear_user_text(self):
+            self.user_text_timer.stop()
 
-        self.set_symbols("🎙️", "⚪")
+            if not self.user_text:
+                return
 
-    def on_wakeword_detection_start(self):
-        self.audio_player = AudioPlayer("inactive.wav")
-        self.audio_player.start()         
+            self.user_text_opacity = 255
+            self.run_fade_user = True
+            self.fade_out_user_text()
 
-        self.set_symbols("", "💤")
+        def init_clear_assistant_text(self):
+            if self.assistant_text_timer.isActive():
+                self.assistant_text_timer.stop()        
+            self.assistant_text_timer.start(10000)
 
-    def init_clear_user_text(self):
-        if self.user_text_timer.isActive():
-            self.user_text_timer.stop()        
-        self.user_text_timer.start(10000)
+        def remove_assistant_text(self):
+            self.assistant_text = ""
+            self.assistant_text_opacity = 255 
+            self.updateUI.emit()
 
-    def remove_user_text(self):
-        self.user_text = ""
-        self.user_text_opacity = 255 
-        self.updateUI.emit()
+        def fade_out_assistant_text(self):
+            if not self.run_fade_assistant:
+                return
+            
+            if self.assistant_text_opacity > 0:
+                self.assistant_text_opacity -= 5 
+                self.updateUI.emit()
+                QTimer.singleShot(50, self.fade_out_assistant_text)
+            else:
+                self.run_fade_assistant = False
+                self.remove_assistant_text()        
 
-    def fade_out_user_text(self):
-        if not self.run_fade_user:
-            return
+        def clear_assistant_text(self):
+            self.assistant_text_timer.stop()
 
-        if self.user_text_opacity > 0:
-            self.user_text_opacity -= 5 
-            self.updateUI.emit()
-            QTimer.singleShot(50, self.fade_out_user_text)
-        else:
-            self.run_fade_user = False
-            self.remove_user_text()        
+            if not self.assistant_text:
+                return
 
-    def clear_user_text(self):
-        self.user_text_timer.stop()
+            self.assistant_text_opacity = 255
+            self.run_fade_assistant = True
+            self.fade_out_assistant_text()
 
-        if not self.user_text:
-            return
+        def update_self(self):
 
-        self.user_text_opacity = 255
-        self.run_fade_user = True
-        self.fade_out_user_text()
+            self.blockSignals(True)
+                    
+            self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
+            self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)       
 
-    def init_clear_assistant_text(self):
-        if self.assistant_text_timer.isActive():
-            self.assistant_text_timer.stop()        
-        self.assistant_text_timer.start(10000)
+            fm_symbol = QFontMetrics(self.big_symbol_font)
+            self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
+            self.symbol_height = fm_symbol.height() + 8
 
-    def remove_assistant_text(self):
-        self.assistant_text = ""
-        self.assistant_text_opacity = 255 
-        self.updateUI.emit()
+            self.total_width = MAX_WINDOW_WIDTH
 
-    def fade_out_assistant_text(self):
-        if not self.run_fade_assistant:
-            return
-        
-        if self.assistant_text_opacity > 0:
-            self.assistant_text_opacity -= 5 
-            self.updateUI.emit()
-            QTimer.singleShot(50, self.fade_out_assistant_text)
-        else:
-            self.run_fade_assistant = False
-            self.remove_assistant_text()        
+            fm_user = QFontMetrics(self.user_font)
+            user_text_lines = (self.displayed_user_text.count("\n") + 1)
+            self.user_height = fm_user.height() * user_text_lines + 7
 
-    def clear_assistant_text(self):
-        self.assistant_text_timer.stop()
+            fm_assistant = QFontMetrics(self.assistant_font)
+            assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
+            self.assistant_height = fm_assistant.height() * assistant_text_lines + 18
 
-        if not self.assistant_text:
-            return
+            self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])
 
-        self.assistant_text_opacity = 255
-        self.run_fade_assistant = True
-        self.fade_out_assistant_text()
+            desktop = QDesktopWidget()
+            screen_rect = desktop.availableGeometry(desktop.primaryScreen())
+            self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)
 
-    def update_self(self):
+            self.blockSignals(False)
 
-        self.blockSignals(True)
-                
-        self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
-        self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)       
-
-        fm_symbol = QFontMetrics(self.big_symbol_font)
-        self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
-        self.symbol_height = fm_symbol.height() + 8
-
-        self.total_width = MAX_WINDOW_WIDTH
-
-        fm_user = QFontMetrics(self.user_font)
-        user_text_lines = (self.displayed_user_text.count("\n") + 1)
-        self.user_height = fm_user.height() * user_text_lines + 7
-
-        fm_assistant = QFontMetrics(self.assistant_font)
-        assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
-        self.assistant_height = fm_assistant.height() * assistant_text_lines + 18
-
-        self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])
-
-        desktop = QDesktopWidget()
-        screen_rect = desktop.availableGeometry(desktop.primaryScreen())
-        self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)
-
-        self.blockSignals(False)
-
-        self.update()
-
-    def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
-        painter.setPen(outlineColor)
-        for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
-                    (-outline_size, -outline_size), (outline_size, -outline_size),
-                    (-outline_size, outline_size), (outline_size, outline_size)]:
-            painter.drawText(x + dx, y + dy, width, height, alignment, text)
-
-        painter.setPen(textColor)
-        painter.drawText(x, y, width, height, alignment, text)
-
-    def paintEvent(self, event):
-        painter = QPainter(self)
-
-        offsetX = 4
-        offsetY = 5
-    
-        painter.setPen(QColor(255, 255, 255))
-
-        # Draw symbol
-        painter.setFont(self.big_symbol_font)
-        if self.big_symbol_text:
-            painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
-            painter.setFont(self.small_symbol_font)
-            painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
-        else:
-            painter.setFont(self.small_symbol_font)
-            painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
-
-        # Draw User Text
-        painter.setFont(self.user_font)
-        user_x = self.total_width - self.user_width - 45 + offsetX
-        user_y = offsetY + 15
-        user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
-        outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
-        self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)
-
-        # Draw Assistant Text
-        painter.setFont(self.assistant_font)
-        assistant_x = self.total_width - self.assistant_width - 5  + offsetX
-        assistant_y = self.user_height + offsetY + 15
-        assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
-        outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
-        self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)
-
-    def return_text_adjusted_to_width(self, text, font, max_width_allowed):
-        """
-        Line feeds are inserted so that the text width does never exceed max_width.
-        Text is only broken up on whole words.
-        """
-        fm = QFontMetrics(font)
-        words = text.split(' ')
-        adjusted_text = ''
-        current_line = ''
-        max_width_used = 0
+            self.update()
+
+        def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
+            painter.setPen(outlineColor)
+            for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
+                        (-outline_size, -outline_size), (outline_size, -outline_size),
+                        (-outline_size, outline_size), (outline_size, outline_size)]:
+                painter.drawText(x + dx, y + dy, width, height, alignment, text)
+
+            painter.setPen(textColor)
+            painter.drawText(x, y, width, height, alignment, text)
+
+        def paintEvent(self, event):
+            painter = QPainter(self)
+
+            offsetX = 4
+            offsetY = 5
         
-        for word in words:
-            current_width = fm.width(current_line + word)
-            if current_width <= max_width_allowed:
-                current_line += word + ' '
+            painter.setPen(QColor(255, 255, 255))
+
+            # Draw symbol
+            painter.setFont(self.big_symbol_font)
+            if self.big_symbol_text:
+                painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
+                painter.setFont(self.small_symbol_font)
+                painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
             else:
-                line_width = fm.width(current_line)
-                if line_width > max_width_used:
-                    max_width_used = line_width
-                adjusted_text += current_line + '\n'
-                current_line = word + ' '
-        
-        line_width = fm.width(current_line)
-        if line_width > max_width_used:
-            max_width_used = line_width
-        adjusted_text += current_line 
-        return adjusted_text.rstrip(), max_width_used         
+                painter.setFont(self.small_symbol_font)
+                painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
+
+            # Draw User Text
+            painter.setFont(self.user_font)
+            user_x = self.total_width - self.user_width - 45 + offsetX
+            user_y = offsetY + 15
+            user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
+            outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
+            self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)
+
+            # Draw Assistant Text
+            painter.setFont(self.assistant_font)
+            assistant_x = self.total_width - self.assistant_width - 5  + offsetX
+            assistant_y = self.user_height + offsetY + 15
+            assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
+            outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
+            self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)
+
+        def return_text_adjusted_to_width(self, text, font, max_width_allowed):
+            """
+            Line feeds are inserted so that the text width does never exceed max_width.
+            Text is only broken up on whole words.
+            """
+            fm = QFontMetrics(font)
+            words = text.split(' ')
+            adjusted_text = ''
+            current_line = ''
+            max_width_used = 0
+            
+            for word in words:
+                current_width = fm.width(current_line + word)
+                if current_width <= max_width_allowed:
+                    current_line += word + ' '
+                else:
+                    line_width = fm.width(current_line)
+                    if line_width > max_width_used:
+                        max_width_used = line_width
+                    adjusted_text += current_line + '\n'
+                    current_line = word + ' '
+            
+            line_width = fm.width(current_line)
+            if line_width > max_width_used:
+                max_width_used = line_width
+            adjusted_text += current_line 
+            return adjusted_text.rstrip(), max_width_used         
 
-if __name__ == '__main__':
     app = QApplication(sys.argv)
 
     window = TransparentWindow()
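
The assistant keeps its OpenAI prompt bounded: each request to generate_response() sends the system prompt plus only the newest max_history_messages entries of history. A minimal sketch of that trimming in isolation (names taken from the example above; the turn contents are made up):

```python
max_history_messages = 6
system_prompt_message = {'role': 'system', 'content': 'Be concise.'}

# Ten past turns; only the newest six are sent along with the system prompt.
history = [{'role': 'user', 'content': f'turn {i}'} for i in range(10)]

messages = [system_prompt_message] + history[-max_history_messages:]
assert len(messages) == 1 + max_history_messages
assert messages[1]['content'] == 'turn 4'  # the oldest turns are dropped
```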

+ 95 - 0
example_webserver/client.py

@@ -0,0 +1,95 @@
+from colorama import Fore, Back, Style
+import websockets
+import colorama
+import keyboard
+import asyncio
+import json
+import os
+
+colorama.init()
+
+SEND_START_COMMAND = False
+HOST = 'localhost:5025'
+URI = f'ws://{HOST}'
+RECONNECT_DELAY = 5  
+
+full_sentences = []
+
+def clear_console():
+    os.system('clear' if os.name == 'posix' else 'cls')
+
+def update_displayed_text(text = ""):
+    sentences_with_style = [
+        f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
+        for i, sentence in enumerate(full_sentences)
+    ]
+    text = "".join(sentences_with_style).strip() + " " + text if len(sentences_with_style) > 0 else text
+    clear_console()
+    print("CLIENT retrieved text:")
+    print()
+    print(text)
+
+async def send_start_recording(websocket):
+    command = {
+        "type": "command",
+        "content": "start-recording"
+    }
+    await websocket.send(json.dumps(command))
+
+async def test_client():
+    while True:
+        try:
+            async with websockets.connect(URI, ping_interval=None) as websocket:
+
+                if SEND_START_COMMAND:
+                    # New: Check for space bar press and send start-recording message
+                    async def check_space_keypress():
+                        while True:
+                            if keyboard.is_pressed('space'):
+                                print ("Space bar pressed. Sending start-recording message to server.")
+                                await send_start_recording(websocket)
+                                await asyncio.sleep(1) 
+                            await asyncio.sleep(0.02)
+                    
+                    # Start a task to monitor the space keypress
+                    print ("Press space bar to start recording.")
+                    asyncio.create_task(check_space_keypress())
+                
+                while True:
+                    message = await websocket.recv()
+                    message_obj = json.loads(message)
+                    
+                    if message_obj["type"] == "realtime":
+                        clear_console()
+                        print (message_obj["content"])
+                    elif message_obj["type"] == "full":
+                        clear_console()
+                        colored_message = Fore.YELLOW + message_obj["content"] + Style.RESET_ALL
+                        print (colored_message)
+                        print ()
+                        if SEND_START_COMMAND:
+                            print ("Press space bar to start recording.")
+                        full_sentences.append(message_obj["content"])
+                    elif message_obj["type"] == "record_start":
+                        print ("recording started.")
+                    elif message_obj["type"] == "vad_start":
+                        print ("vad started.")
+                    elif message_obj["type"] == "wakeword_start":
+                        print ("wakeword started.")
+                    elif message_obj["type"] == "transcript_start":
+                        print ("transcript started.")
+
+                    else:
+                        print (f"Unknown message: {message_obj}")
+                    
+        except websockets.ConnectionClosed:
+            print("Connection with server closed. Reconnecting in", RECONNECT_DELAY, "seconds...")
+            await asyncio.sleep(RECONNECT_DELAY)
+        except KeyboardInterrupt:
+            print("Gracefully shutting down the client.")
+            break
+        except Exception as e:
+            print(f"An error occurred: {e}. Reconnecting in", RECONNECT_DELAY, "seconds...")
+            await asyncio.sleep(RECONNECT_DELAY)    
+
+asyncio.run(test_client())
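
The client and the example server (added below) exchange small JSON objects over a single websocket: the client may send a start-recording command, and the server pushes "realtime" and "full" transcription updates plus empty-content status events ("record_start", "vad_start", "wakeword_start", "transcript_start"). A minimal sketch of those message shapes (field names taken from the code; the payload text is made up):

```python
import json

# Client -> server: optional request to begin recording (only sent when SEND_START_COMMAND is True).
start_command = {"type": "command", "content": "start-recording"}

# Server -> client: a partial update followed by the finished sentence.
realtime_update = {"type": "realtime", "content": "hello wor"}
full_update = {"type": "full", "content": "Hello world."}

wire = json.dumps(full_update)    # what the server's broadcast() sends
message_obj = json.loads(wire)    # what test_client() parses
assert message_obj["type"] == "full"
```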

+ 181 - 0
example_webserver/server.py

@@ -0,0 +1,181 @@
+WAIT_FOR_START_COMMAND = False
+
+if __name__ == '__main__':
+    server = "localhost"
+    port = 5025
+
+    print (f"STT speech to text server")
+    print (f"runs on http://{server}:{port}")
+    print ()
+    print ("starting")
+    print ("└─ ... ", end='', flush=True)
+
+    from RealtimeSTT import AudioToTextRecorder
+    from colorama import Fore, Back, Style
+    import websockets
+    import threading
+    import colorama
+    import asyncio
+    import shutil
+    import queue
+    import json
+    import time
+    import os
+
+    colorama.init()
+
+    first_chunk = True
+    full_sentences = []
+    displayed_text = ""
+    message_queue = queue.Queue() 
+    start_recording_event = threading.Event()
+    start_transcription_event = threading.Event()
+    connected_clients = set()
+
+    def clear_console():
+        os.system('clear' if os.name == 'posix' else 'cls')
+
+    async def handler(websocket, path):
+
+        print ("\r└─ OK")
+        if WAIT_FOR_START_COMMAND:
+            print("waiting for start command")
+            print ("└─ ... ", end='', flush=True)
+
+        connected_clients.add(websocket)
+
+        try:
+            while True:
+                async for message in websocket:
+                    data = json.loads(message)
+                    if data.get("type") == "command" and data.get("content") == "start-recording":
+                        print ("\r└─ OK")
+                        start_recording_event.set() 
+
+        except json.JSONDecodeError:
+            print (Fore.RED + "STT Received an invalid JSON message." + Style.RESET_ALL)
+        except websockets.ConnectionClosedError:
+            print (Fore.RED + "connection closed unexpectedly by the client" + Style.RESET_ALL)
+        except websockets.exceptions.ConnectionClosedOK:
+            print("connection closed.")
+        finally:
+
+            print("client disconnected")
+            connected_clients.remove(websocket)
+            print ("waiting for clients")
+            print ("└─ ... ", end='', flush=True)
+
+
+    def add_message_to_queue(type: str, content):
+        message = {
+            "type": type,
+            "content": content
+        }
+        message_queue.put(message)    
+
+    def fill_cli_line(text):
+        columns, _ = shutil.get_terminal_size()
+        return text.ljust(columns)[-columns:]
+
+    def text_detected(text):
+        global displayed_text, first_chunk
+
+        if text != displayed_text:
+            first_chunk = False
+            displayed_text = text
+            add_message_to_queue("realtime", text)
+
+            message = fill_cli_line(text)
+
+            message ="└─ " + Fore.CYAN + message[:-3] + Style.RESET_ALL
+            print(f"\r{message}", end='', flush=True)
+
+
+    async def broadcast(message_obj):
+        if connected_clients:
+            for client in connected_clients:
+                await client.send(json.dumps(message_obj))
+
+    async def send_handler():
+        while True:
+            while not message_queue.empty():
+                message = message_queue.get()
+                await broadcast(message)
+            await asyncio.sleep(0.02)
+
+    def recording_started():
+        add_message_to_queue("record_start", "")
+
+    def vad_detect_started():
+        add_message_to_queue("vad_start", "")
+
+    def wakeword_detect_started():
+        add_message_to_queue("wakeword_start", "")
+
+    def transcription_started():
+        add_message_to_queue("transcript_start", "")
+
+    recorder_config = {
+        'spinner': False,
+        'model': 'small.en',
+        'language': 'en',
+        'silero_sensitivity': 0.01,
+        'webrtc_sensitivity': 3,
+        'silero_use_onnx': False,
+        'post_speech_silence_duration': 1.2,
+        'min_length_of_recording': 0.2,
+        'min_gap_between_recordings': 0,
+        'enable_realtime_transcription': True,
+        'realtime_processing_pause': 0,
+        'realtime_model_type': 'tiny.en',
+        'on_realtime_transcription_stabilized': text_detected,
+        'on_recording_start' : recording_started,
+        'on_vad_detect_start' : vad_detect_started,
+        'on_wakeword_detection_start' : wakeword_detect_started,
+        'on_transcription_start' : transcription_started,
+    }
+
+    recorder = AudioToTextRecorder(**recorder_config)
+
+    def transcriber_thread():
+        while True:
+            start_transcription_event.wait()
+            text = "└─ transcribing ... "
+            text = fill_cli_line(text)
+            print (f"\r{text}", end='', flush=True)
+            sentence = recorder.transcribe()
+            print (Style.RESET_ALL + "\r└─ " + Fore.YELLOW + sentence + Style.RESET_ALL)
+            add_message_to_queue("full", sentence)
+            start_transcription_event.clear()
+            if WAIT_FOR_START_COMMAND:
+                print("waiting for start command")
+                print ("└─ ... ", end='', flush=True)
+
+    def recorder_thread():
+        global first_chunk
+        while True:
+            if not connected_clients:
+                time.sleep(0.1)
+                continue
+            first_chunk = True
+            if WAIT_FOR_START_COMMAND:
+                start_recording_event.wait() 
+            print("waiting for sentence")
+            print ("└─ ... ", end='', flush=True)
+            recorder.wait_audio()
+            start_transcription_event.set()
+            start_recording_event.clear()
+
+    threading.Thread(target=recorder_thread, daemon=True).start()
+    threading.Thread(target=transcriber_thread, daemon=True).start()
+
+    start_server = websockets.serve(handler, server, port)
+    loop = asyncio.get_event_loop()
+
+    print ("\r└─ OK")
+    print ("waiting for clients")
+    print ("└─ ... ", end='', flush=True)
+
+    loop.run_until_complete(start_server)
+    loop.create_task(send_handler())
+    loop.run_forever()
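
Instead of a single recorder.text() call, the server drives the recorder in two lower-level steps: recorder.wait_audio() blocks until a complete utterance has been captured, and recorder.transcribe() turns the captured audio into text, which lets waiting and transcribing run on separate threads. A minimal single-threaded sketch of the same two-step flow, assuming a default-configured recorder and a working microphone:

```python
from RealtimeSTT import AudioToTextRecorder

if __name__ == '__main__':
    with AudioToTextRecorder(spinner=False) as recorder:
        recorder.wait_audio()           # blocks until speech followed by silence was recorded
        print(recorder.transcribe())    # transcribe the captured audio and print it
```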

+ 104 - 100
tests/realtime_loop_test.py

@@ -8,110 +8,114 @@ import os
 from RealtimeTTS import TextToAudioStream, AzureEngine
 from RealtimeSTT import AudioToTextRecorder
 
+if __name__ == '__main__':
 
-class SimpleApp(QWidget):
-
-    update_stt_text_signal = pyqtSignal(str)
-    update_tts_text_signal = pyqtSignal(str)
-
-    def __init__(self):
-        super().__init__()
-
-        layout = QVBoxLayout()
-
-        font = QFont()
-        font.setPointSize(18)
-
-        self.input_text = QTextEdit(self)
-        self.input_text.setFont(font)
-        self.input_text.setPlaceholderText("Input")
-        self.input_text.setMinimumHeight(100) 
-        layout.addWidget(self.input_text)
-
-        self.button_speak_input = QPushButton("Speak and detect input text", self)
-        self.button_speak_input.setFont(font)        
-        self.button_speak_input.clicked.connect(self.speak_input)
-        layout.addWidget(self.button_speak_input)
-
-        self.tts_text = QTextEdit(self)
-        self.tts_text.setFont(font)
-        self.tts_text.setPlaceholderText("STT (final)")
-        self.tts_text.setMinimumHeight(100) 
-        self.tts_text.setReadOnly(True)
-        layout.addWidget(self.tts_text)
-
-        self.stt_text = QTextEdit(self)
-        self.stt_text.setFont(font)
-        self.stt_text.setPlaceholderText("STT (realtime)")
-        self.stt_text.setMinimumHeight(100) 
-        layout.addWidget(self.stt_text)
-
-        self.button_speak_stt = QPushButton("Speak detected text again", self)
-        self.button_speak_stt.setFont(font)        
-        self.button_speak_stt.clicked.connect(self.speak_stt)
-        layout.addWidget(self.button_speak_stt)
-
-        self.setLayout(layout)
-        self.setWindowTitle("Realtime TTS/STT Loop Test")
-        self.resize(800, 600)
-
-        self.update_stt_text_signal.connect(self.actual_update_stt_text)
-        self.update_tts_text_signal.connect(self.actual_update_tts_text)
-
-        self.stream = TextToAudioStream(AzureEngine(os.environ.get("AZURE_SPEECH_KEY"), "germanywestcentral"), on_audio_stream_stop=self.audio_stream_stop)
-
-        recorder_config = {
-            'spinner': False,
-            'model': 'large-v2',
-            'language': 'en',
-            'silero_sensitivity': 0.01,
-            'webrtc_sensitivity': 3,
-            'post_speech_silence_duration': 0.01,
-            'min_length_of_recording': 0.2,
-            'min_gap_between_recordings': 0,
-            'enable_realtime_transcription': True,
-            'realtime_processing_pause': 0,
-            'realtime_model_type': 'small.en',
-            'on_realtime_transcription_stabilized': self.text_detected,
-        }
-
-        self.recorder = AudioToTextRecorder(**recorder_config)
-
-    def speak_stt(self):
-        text = self.stt_text.toPlainText()
-        self.speak(text)
-
-    def speak_input(self):
-        text = self.input_text.toPlainText()
-        self.speak(text)
-
-    def text_detected(self, text):
-        self.update_stt_text_signal.emit(text)
-
-    def audio_stream_stop(self):
-        self.stream.stop()
-        self.recorder.stop()
-        detected_text = self.recorder.text()
-        self.update_stt_text_signal.emit(detected_text)
-        self.update_tts_text_signal.emit(detected_text)
-
-    def speak(self, text):
-        self.stt_text.clear()        
-        self.stream.feed(text)
-
-        self.recorder.start()
-        self.stream.play_async()
-
-    def actual_update_stt_text(self, text):
-        self.stt_text.setText(text)
-
-    def actual_update_tts_text(self, text):
-        self.tts_text.setText(text)        
+    class SimpleApp(QWidget):
+
+        update_stt_text_signal = pyqtSignal(str)
+        update_tts_text_signal = pyqtSignal(str)
+
+        def __init__(self):
+            super().__init__()
+
+            layout = QVBoxLayout()
+
+            font = QFont()
+            font.setPointSize(18)
+
+            self.input_text = QTextEdit(self)
+            self.input_text.setFont(font)
+            self.input_text.setPlaceholderText("Input")
+            self.input_text.setMinimumHeight(100) 
+            layout.addWidget(self.input_text)
+
+            self.button_speak_input = QPushButton("Speak and detect input text", self)
+            self.button_speak_input.setFont(font)        
+            self.button_speak_input.clicked.connect(self.speak_input)
+            layout.addWidget(self.button_speak_input)
+
+            self.tts_text = QTextEdit(self)
+            self.tts_text.setFont(font)
+            self.tts_text.setPlaceholderText("STT (final)")
+            self.tts_text.setMinimumHeight(100) 
+            self.tts_text.setReadOnly(True)
+            layout.addWidget(self.tts_text)
+
+            self.stt_text = QTextEdit(self)
+            self.stt_text.setFont(font)
+            self.stt_text.setPlaceholderText("STT (realtime)")
+            self.stt_text.setMinimumHeight(100) 
+            layout.addWidget(self.stt_text)
+
+            self.button_speak_stt = QPushButton("Speak detected text again", self)
+            self.button_speak_stt.setFont(font)        
+            self.button_speak_stt.clicked.connect(self.speak_stt)
+            layout.addWidget(self.button_speak_stt)
+
+            self.setLayout(layout)
+            self.setWindowTitle("Realtime TTS/STT Loop Test")
+            self.resize(800, 600)
+
+            self.update_stt_text_signal.connect(self.actual_update_stt_text)
+            self.update_tts_text_signal.connect(self.actual_update_tts_text)
+
+            self.stream = TextToAudioStream(AzureEngine(os.environ.get("AZURE_SPEECH_KEY"), "germanywestcentral"), on_audio_stream_stop=self.audio_stream_stop)
+
+            recorder_config = {
+                'spinner': False,
+                'model': 'large-v2',
+                'language': 'en',
+                'silero_sensitivity': 0.01,
+                'webrtc_sensitivity': 3,
+                'post_speech_silence_duration': 0.01,
+                'min_length_of_recording': 0.2,
+                'min_gap_between_recordings': 0,
+                'enable_realtime_transcription': True,
+                'realtime_processing_pause': 0,
+                'realtime_model_type': 'small.en',
+                'on_realtime_transcription_stabilized': self.text_detected,
+            }
+
+            self.recorder = AudioToTextRecorder(**recorder_config)
+
+        def speak_stt(self):
+            text = self.stt_text.toPlainText()
+            self.speak(text)
+
+        def speak_input(self):
+            text = self.input_text.toPlainText()
+            self.speak(text)
+
+        def text_detected(self, text):
+            self.update_stt_text_signal.emit(text)
+
+        def audio_stream_stop(self):
+            self.stream.stop()
+            self.recorder.stop()
+            detected_text = self.recorder.text()
+            self.update_stt_text_signal.emit(detected_text)
+            self.update_tts_text_signal.emit(detected_text)
+
+        def speak(self, text):
+            self.stt_text.clear()        
+            self.stream.feed(text)
+
+            self.recorder.start()
+            self.stream.play_async()
+
+        def actual_update_stt_text(self, text):
+            self.stt_text.setText(text)
+
+        def actual_update_tts_text(self, text):
+            self.tts_text.setText(text)
+
+        def closeEvent(self, event):
+            if self.recorder:
+                self.recorder.shutdown()
 
-if __name__ == '__main__':
     app = QApplication(sys.argv)
 
     window = SimpleApp()
     window.show()
 
-    sys.exit(app.exec_())
+    sys.exit(app.exec_())

+ 0 - 1
tests/simple_test.py

@@ -1,5 +1,4 @@
 from RealtimeSTT import AudioToTextRecorder
-
 if __name__ == '__main__':
     recorder = AudioToTextRecorder(spinner=False)
 

+ 11 - 7
tests/wakeword_test.py

@@ -1,12 +1,16 @@
 from RealtimeSTT import AudioToTextRecorder
+import logging
 
-def recording_started():
-    print("Speak now...")
+if __name__ == '__main__':
 
-def recording_finished():
-    print("Speech end detected... transcribing...")
+    def recording_started():
+        print("Speak now...")
 
-recorder = AudioToTextRecorder(spinner=False, model="small.en", language="en", wake_words="jarvis", on_wakeword_detected=recording_started, on_recording_stop=recording_finished)
+    def recording_finished():
+        print("Speech end detected... transcribing...")
 
-print('Say "Jarvis" then speak.')
-print(recorder.text())
+    with AudioToTextRecorder(spinner=False, level=logging.DEBUG, model="small.en", language="en", wake_words="jarvis",
+                             on_wakeword_detected=recording_started, on_recording_stop=recording_finished) as recorder:
+        print('Say "Jarvis" then speak.')
+        print(recorder.text())
+        print("Done. Now we should exit.")