Browse source code

context manager implemented

Kolja Beigel 1 year ago
parent
commit
21b49bff31

+ 22 - 4
README.md

@@ -16,6 +16,10 @@ https://github.com/KoljaB/RealtimeSTT/assets/7604638/207cb9a2-4482-48e7-9d2b-072
 
 
 ### Updates
 
+#### v0.1.6
+- implements context manager protocol (recorder can be used in a `with` statement)
+- bugfix in shutdown method
+
 #### v0.1.5
 
 - Bugfix for detection of short speech right after sentence detection (the problem mentioned in the video)
@@ -130,8 +134,8 @@ print(recorder.text())
 Recording based on voice activity detection.
 
 ```python
-recorder = AudioToTextRecorder()
-print(recorder.text())
+with AudioToTextRecorder() as recorder:
+    print(recorder.text())
 ```
 
 When running recorder.text in a loop it is recommended to use a callback, allowing the transcription to be run asynchronously:
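A minimal sketch of that pattern (an editor's illustration based on the `text(on_transcription_finished)` signature visible in the `audio_recorder.py` hunks further down; `process_text` is a hypothetical handler name):

```python
def process_text(text):
    # Handle each finished transcription (print it, log it, forward it, ...)
    print(text)

while True:
    # Pass the callback instead of using the return value;
    # it receives the transcription once it is ready.
    recorder.text(process_text)
```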
@@ -170,6 +174,20 @@ recorder = AudioToTextRecorder(on_recording_start=my_start_callback,
                                on_recording_stop=my_stop_callback)
 ```
 
+### Shutdown
+
+You can shutdown the recorder safely by using the context manager protocol:
+
+```python
+with AudioToTextRecorder() as recorder:
+    [...]
+```
+
+Or you can call the shutdown method manually (if using "with" is not feasible):
+
+```python
+recorder.shutdown()
+```
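
As an illustration (not part of this commit), a `try`/`finally` block gives the same cleanup guarantee when a `with` statement cannot be used:

```python
recorder = AudioToTextRecorder()
try:
    print(recorder.text())
finally:
    # Ensure the worker processes and threads are stopped
    recorder.shutdown()
```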
 
 
 ## Testing the Library
 
@@ -254,7 +272,7 @@ When you initialize the `AudioToTextRecorder` class, you have various options to
 
 
 - **silero_sensitivity** (float, default=0.6): Sensitivity for Silero's voice activity detection ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.6.
 
-- **silero_use_onnx** (bool, default=True): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. Default is True (recommended for faster performance).
+- **silero_use_onnx** (bool, default=False): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. Default is False. Recommended for faster performance.
 
 
 - **post_speech_silence_duration** (float, default=0.2): Duration in seconds of silence that must follow speech before the recording is considered to be completed. This ensures that any brief pauses during speech don't prematurely end the recording.
 
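For context, a sketch of how these options are passed to the constructor (parameter names as documented above; the values are purely illustrative):

```python
recorder = AudioToTextRecorder(
    silero_sensitivity=0.6,             # 0 = least sensitive, 1 = most sensitive
    silero_use_onnx=False,              # True switches Silero to the faster ONNX model
    post_speech_silence_duration=0.2,   # seconds of silence that end a recording
)
```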
@@ -299,4 +317,4 @@ MIT
 
 
 Kolja Beigel  
 Email: kolja.beigel@web.de  
-[GitHub](https://github.com/KoljaB/RealtimeSTT)
+[GitHub](https://github.com/KoljaB/RealtimeSTT)

+ 426 - 277
RealtimeSTT/audio_recorder.py

@@ -51,6 +51,7 @@ INIT_WAKE_WORDS_SENSITIVITY = 0.6
 INIT_PRE_RECORDING_BUFFER_DURATION = 1.0
 INIT_WAKE_WORD_ACTIVATION_DELAY = 0.0
 INIT_WAKE_WORD_TIMEOUT = 5.0
+ALLOWED_LATENCY_LIMIT = 10
 
 
 TIME_SLEEP = 0.02
 SAMPLE_RATE = 16000
@@ -82,7 +83,7 @@ class AudioToTextRecorder:
 
 
                  # Voice activation parameters
                  silero_sensitivity: float = INIT_SILERO_SENSITIVITY,
-                 silero_use_onnx: bool = True,
+                 silero_use_onnx: bool = False,
                  webrtc_sensitivity: int = INIT_WEBRTC_SENSITIVITY,
                  post_speech_silence_duration: float = INIT_POST_SPEECH_SILENCE_DURATION,
                  min_length_of_recording: float = INIT_MIN_LENGTH_OF_RECORDING,
@@ -122,7 +123,7 @@ class AudioToTextRecorder:
         - on_realtime_transcription_update = A callback function that is triggered whenever there's an update in the real-time transcription. The function is called with the newly transcribed text as its argument.
         - on_realtime_transcription_stabilized = A callback function that is triggered when the transcribed text stabilizes in quality. The stabilized text is generally more accurate but may arrive with a slight delay compared to the regular real-time updates.
         - silero_sensitivity (float, default=SILERO_SENSITIVITY): Sensitivity for the Silero Voice Activity Detection model ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.5.
-        - silero_use_onnx (bool, default=True): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. This is recommended for faster performance.
+        - silero_use_onnx (bool, default=False): Enables usage of the pre-trained model from Silero in the ONNX (Open Neural Network Exchange) format instead of the PyTorch format. This is recommended for faster performance.
         - webrtc_sensitivity (int, default=WEBRTC_SENSITIVITY): Sensitivity for the WebRTC Voice Activity Detection engine ranging from 0 (least aggressive / most sensitive) to 3 (most aggressive, least sensitive). Default is 3.
         - post_speech_silence_duration (float, default=0.2): Duration in seconds of silence that must follow speech before the recording is considered to be completed. This ensures that any brief pauses during speech don't prematurely end the recording.
         - min_gap_between_recordings (float, default=1.0): Specifies the minimum time interval in seconds that should exist between the end of one recording session and the beginning of another to prevent rapid consecutive recordings.
@@ -168,6 +169,7 @@ class AudioToTextRecorder:
         self.realtime_processing_pause = realtime_processing_pause
         self.on_realtime_transcription_update = on_realtime_transcription_update
         self.on_realtime_transcription_stabilized = on_realtime_transcription_stabilized
+        self.allowed_latency_limit = ALLOWED_LATENCY_LIMIT
     
     
         self.level = level
         self.audio_queue = Queue()
@@ -207,7 +209,7 @@ class AudioToTextRecorder:
         logger.setLevel(level)  # Set the root logger's level
 
         # Create a file handler and set its level
-        file_handler = logging.FileHandler('audio_recorder.log')
+        file_handler = logging.FileHandler('realtimesst.log')
         file_handler.setLevel(logging.DEBUG)
         file_handler.setFormatter(logging.Formatter(log_format))
 
@@ -220,16 +222,20 @@ class AudioToTextRecorder:
         logger.addHandler(file_handler)
         logger.addHandler(console_handler)
 
+        self.is_shut_down = False
+        self.shutdown_event = Event()
 
 
-        # start transcription process
+        logging.info(f"Starting RealTimeSTT")
+
+        # Start transcription process
         self.main_transcription_ready_event = Event()
         self.parent_transcription_pipe, child_transcription_pipe = Pipe()
-        self.process = Process(target=AudioToTextRecorder._transcription_worker, args=(child_transcription_pipe, model, self.main_transcription_ready_event))
-        self.process.start()
+        self.transcript_process = Process(target=AudioToTextRecorder._transcription_worker, args=(child_transcription_pipe, model, self.main_transcription_ready_event, self.shutdown_event))
+        self.transcript_process.start()
 
 
-        # start audio data reading process
-        reader_process = Process(target=AudioToTextRecorder._audio_data_worker, args=(self.audio_queue, self.sample_rate, self.buffer_size))
-        reader_process.start()
+        # Start audio data reading process
+        self.reader_process = Process(target=AudioToTextRecorder._audio_data_worker, args=(self.audio_queue, self.sample_rate, self.buffer_size, self.shutdown_event))
+        self.reader_process.start()
 
 
         # Initialize the realtime transcription model
         if self.enable_realtime_transcription:
@@ -310,7 +316,7 @@ class AudioToTextRecorder:
         self.realtime_thread.daemon = True
         self.realtime_thread.start()
 
-        # wait for transcription models to start
+        # Wait for transcription models to start
         logging.debug('Waiting for main transcription model to start')
         self.main_transcription_ready_event.wait()
         logging.debug('Main transcription model ready')
@@ -319,7 +325,25 @@ class AudioToTextRecorder:
 
 
 
 
     @staticmethod
-    def _transcription_worker(conn, model_path, ready_event):
+    def _transcription_worker(conn, model_path, ready_event, shutdown_event):
+        """
+        Worker method that handles the continuous process of transcribing audio data.
+
+        This method runs in a separate process and is responsible for:
+        - Initializing the `faster_whisper` model used for transcription.
+        - Receiving audio data sent through a pipe and using the model to transcribe it.
+        - Sending transcription results back through the pipe.
+        - Continuously checking for a shutdown event to gracefully terminate the transcription process.
+
+        Args:
+            conn (multiprocessing.Connection): The connection endpoint used for receiving audio data and sending transcription results.
+            model_path (str): The path to the pre-trained faster_whisper model for transcription.
+            ready_event (threading.Event): An event that is set when the transcription model is successfully initialized and ready.
+            shutdown_event (threading.Event): An event that, when set, signals this worker method to terminate.
+
+        Raises:
+            Exception: If there is an error while initializing the transcription model.
+        """        
 
 
         logging.info(f"Initializing faster_whisper main transcription model {model_path}")
         logging.info(f"Initializing faster_whisper main transcription model {model_path}")
 
 
@@ -337,23 +361,44 @@ class AudioToTextRecorder:
 
 
         logging.debug('Faster_whisper main speech to text transcription model initialized successfully')
 
-        while True:
-            audio, language = conn.recv()
-            try:
-                segments = model.transcribe(audio, language=language if language else None)[0]
-                transcription = " ".join(seg.text for seg in segments).strip()
-                conn.send(('success', transcription))
-            except faster_whisper.WhisperError as e:
-                logging.error(f"Whisper transcription error: {e}")
-                conn.send(('error', str(e)))      
-            except Exception as e:
-                logging.error(f"General transcription error: {e}")
-                conn.send(('error', str(e)))
+        while not shutdown_event.is_set():
+            if conn.poll(0.5):
+                audio, language = conn.recv()
+                try:
+                    segments = model.transcribe(audio, language=language if language else None)[0]
+                    transcription = " ".join(seg.text for seg in segments).strip()
+                    conn.send(('success', transcription))
+                except faster_whisper.WhisperError as e:
+                    logging.error(f"Whisper transcription error: {e}")
+                    conn.send(('error', str(e)))      
+                except Exception as e:
+                    logging.error(f"General transcription error: {e}")
+                    conn.send(('error', str(e)))
+            else:
+                # If there's no data, sleep for a short while to prevent busy waiting
+                time.sleep(0.02)
 
 
 
 
     @staticmethod
-    def _audio_data_worker(audio_queue, sample_rate, buffer_size):
+    def _audio_data_worker(audio_queue, sample_rate, buffer_size, shutdown_event):
+        """
+        Worker method that handles the audio recording process.
+
+        This method runs in a separate process and is responsible for:
+        - Setting up the audio input stream for recording.
+        - Continuously reading audio data from the input stream and placing it in a queue.
+        - Handling errors during the recording process, including input overflow.
+        - Gracefully terminating the recording process when a shutdown event is set.
 
 
+        Args:
+            audio_queue (queue.Queue): A queue where recorded audio data is placed.
+            sample_rate (int): The sample rate of the audio input stream.
+            buffer_size (int): The size of the buffer used in the audio input stream.
+            shutdown_event (threading.Event): An event that, when set, signals this worker method to terminate.
+
+        Raises:
+            Exception: If there is an error while initializing the audio recording.
+        """
         logging.info("Initializing audio recording (creating pyAudio input stream)")
         logging.info("Initializing audio recording (creating pyAudio input stream)")
 
 
         try:
         try:
@@ -366,29 +411,33 @@ class AudioToTextRecorder:
 
 
         logging.debug('Audio recording (pyAudio input stream) initialized successfully')
 
-        while True:
-            try:
-                data = stream.read(buffer_size)
-
-            except OSError as e:
-                if e.errno == pyaudio.paInputOverflowed:
-                    logging.warning("Input overflowed. Frame dropped.")
-                else:
+        try:
+            while not shutdown_event.is_set():
+                try:
+                    data = stream.read(buffer_size)
+
+                except OSError as e:
+                    if e.errno == pyaudio.paInputOverflowed:
+                        logging.warning("Input overflowed. Frame dropped.")
+                    else:
+                        logging.error(f"Error during recording: {e}")
+                    tb_str = traceback.format_exc()
+                    print (f"Traceback: {tb_str}")
+                    print (f"Error: {e}")
+                    continue
+
+                except Exception as e:
                     logging.error(f"Error during recording: {e}")
                     logging.error(f"Error during recording: {e}")
-                tb_str = traceback.format_exc()
-                print (f"Traceback: {tb_str}")
-                print (f"Error: {e}")
-                continue
+                    tb_str = traceback.format_exc()
+                    print (f"Traceback: {tb_str}")
+                    print (f"Error: {e}")
+                    continue
 
 
-            except Exception as e:
-                logging.error(f"Error during recording: {e}")
-                time.sleep(1)
-                tb_str = traceback.format_exc()
-                print (f"Traceback: {tb_str}")
-                print (f"Error: {e}")
-                continue
-
-            audio_queue.put(data)                
+                audio_queue.put(data)                
+        finally:
+            stream.stop_stream()
+            stream.close()
+            audio_interface.terminate()
 
 
 
 
     def wait_audio(self):
@@ -413,14 +462,14 @@ class AudioToTextRecorder:
             self._set_state("listening")
             self._set_state("listening")
             self.start_recording_on_voice_activity = True
             self.start_recording_on_voice_activity = True
 
 
-            # wait until recording starts
+            # Wait until recording starts
             self.start_recording_event.wait()
 
         # If recording is ongoing, wait for voice inactivity to finish recording.
         if self.is_recording:
             self.stop_recording_on_voice_deactivity = True
 
-            # wait until recording stops
+            # Wait until recording stops
             self.stop_recording_event.wait()
 
         # Convert recorded frames to the appropriate audio format.
@@ -435,8 +484,25 @@ class AudioToTextRecorder:
         self._set_state("inactive")
         self._set_state("inactive")
 
 
 
 
-
     def transcribe(self):
+        """
+        Transcribes audio captured by this class instance using the `faster_whisper` model.
+
+        Automatically starts recording upon voice activity if not manually started using `recorder.start()`.
+        Automatically stops recording upon voice deactivity if not manually stopped with `recorder.stop()`.
+        Processes the recorded audio to generate transcription.
+
+        Args:
+            on_transcription_finished (callable, optional): Callback function to be executed when transcription is ready.
+                If provided, transcription will be performed asynchronously, and the callback will receive the transcription 
+                as its argument. If omitted, the transcription will be performed synchronously, and the result will be returned.
+
+        Returns (if no callback is set):
+            str: The transcription of the recorded audio.
+
+        Raises:
+            Exception: If there is an error during the transcription process.
+        """        
         self._set_state("transcribing")
         self._set_state("transcribing")
         self.parent_transcription_pipe.send((self.audio, self.language))
         self.parent_transcription_pipe.send((self.audio, self.language))
         status, result = self.parent_transcription_pipe.recv()
         status, result = self.parent_transcription_pipe.recv()
@@ -470,6 +536,9 @@ class AudioToTextRecorder:
 
 
         self.wait_audio()
 
+        if self.is_shut_down:
+            return ""
+
         if on_transcription_finished:
             threading.Thread(target=on_transcription_finished, args=(self.transcribe(),)).start()
         else:
@@ -537,26 +606,281 @@ class AudioToTextRecorder:
         Safely shuts down the audio recording by stopping the recording worker and closing the audio stream.
         """
 
-        self.parent_transcription_pipe.close()
-        self.process.terminate()
 
 
+        # Force wait_audio() and text() to exit
+        self.is_shut_down = True
+        self.start_recording_event.set()
+        self.stop_recording_event.set()
+
+        self.shutdown_event.set()
         self.is_recording = False
         self.is_running = False
 
+        logging.debug('Finishing recording thread')
         if self.recording_thread:
             self.recording_thread.join()
+
+        logging.debug('Terminating reader process')
+        # Give it some time to finish the loop and cleanup.
+        self.reader_process.join(timeout=10) 
+
+        if self.reader_process.is_alive():
+            logging.warning("Reader process did not terminate in time. Terminating forcefully.")
+            self.reader_process.terminate()
+        
+        logging.debug('Terminating transcription process')
+        self.transcript_process.join(timeout=10) 
+
+        if self.transcript_process.is_alive():
+            logging.warning("Transcript process did not terminate in time. Terminating forcefully.")
+            self.transcript_process.terminate()
+
+        self.parent_transcription_pipe.close()
+
+        logging.debug('Finishing realtime thread')
         if self.realtime_thread:
             self.realtime_thread.join()
 
+
+
+
+
+
+    def _recording_worker(self):
+        """
+        The main worker method which constantly monitors the audio input for voice activity and accordingly starts/stops the recording.
+        """
+
+        logging.debug('Starting recording worker')
+
         try:
-            if self.stream:
-                self.stream.stop_stream()
-                self.stream.close()
-            if self.audio_interface:
-                self.audio_interface.terminate()
+            was_recording = False
+            delay_was_passed = False
+
+            # Continuously monitor audio for voice activity
+            while self.is_running:
+
+                data = self.audio_queue.get()
+
+                # Handle queue overflow
+                queue_overflow_logged = False
+                while self.audio_queue.qsize() > self.allowed_latency_limit:
+                    if not queue_overflow_logged:
+                        logging.warning(f"Audio queue size exceeds latency limit. Current size: {self.audio_queue.qsize()}. Discarding old audio chunks.")
+                        queue_overflow_logged = True
+                    data = self.audio_queue.get()
+
+                if not self.is_recording:
+                    # Handle not recording state
+
+                    time_since_listen_start = time.time() - self.listen_start if self.listen_start else 0
+                    wake_word_activation_delay_passed = (time_since_listen_start > self.wake_word_activation_delay)
+
+                    # Handle wake-word timeout callback
+                    if wake_word_activation_delay_passed and not delay_was_passed:
+                        if self.wake_words and self.wake_word_activation_delay:
+                            if self.on_wakeword_timeout:
+                                self.on_wakeword_timeout()
+                    delay_was_passed = wake_word_activation_delay_passed
+
+                    # Set state and spinner text 
+                    if not self.recording_stop_time:
+                        if self.wake_words and wake_word_activation_delay_passed and not self.wakeword_detected:
+                            self._set_state("wakeword")
+                        else:
+                            if self.listen_start:
+                                self._set_state("listening")
+                            else:
+                                self._set_state("inactive")
+
+                    # Detect wake words if applicable
+                    if self.wake_words and wake_word_activation_delay_passed:
+                        try:
+                            pcm = struct.unpack_from("h" * self.buffer_size, data)
+                            wakeword_index = self.porcupine.process(pcm)
+
+                        except struct.error:
+                            logging.error("Error unpacking audio data for wake word processing.")
+                            continue
+                        
+                        except Exception as e:
+                            logging.error(f"Wake word processing error: {e}")
+                            continue
+                        
+                        # If a wake word is detected
+                        if wakeword_index >= 0:
+
+                            # Removing the wake word from the recording
+                            samples_for_0_1_sec = int(self.sample_rate * 0.1)
+                            start_index = max(0, len(self.audio_buffer) - samples_for_0_1_sec)
+                            temp_samples = collections.deque(itertools.islice(self.audio_buffer, start_index, None))
+                            self.audio_buffer.clear()
+                            self.audio_buffer.extend(temp_samples)
+
+                            self.wake_word_detect_time = time.time()
+                            self.wakeword_detected = True
+                            if self.on_wakeword_detected:
+                                self.on_wakeword_detected()
+
+                    # Check for voice activity to trigger the start of recording
+                    if ((not self.wake_words or not wake_word_activation_delay_passed) and self.start_recording_on_voice_activity) or self.wakeword_detected:
+
+                        if self._is_voice_active():
+                            logging.info("voice activity detected")
+
+                            self.start()
+
+                            if self.is_recording:
+                                self.start_recording_on_voice_activity = False
+
+                                # Add the buffered audio to the recording frames
+                                self.frames.extend(list(self.audio_buffer))
+                                self.audio_buffer.clear()
+
+                            self.silero_vad_model.reset_states()
+                        else:
+                            data_copy = data[:]
+                            self._check_voice_activity(data_copy)
+
+                    self.speech_end_silence_start = 0
+
+                else:
+                    # If we are currently recording
+
+                    # Stop the recording if silence is detected after speech
+                    if self.stop_recording_on_voice_deactivity:
+
+                        if not self._is_webrtc_speech(data, True):
+
+                            # Voice deactivity was detected, so we start measuring silence time before stopping recording
+                            if self.speech_end_silence_start == 0:
+                                self.speech_end_silence_start = time.time()
+                            
+                        else:
+                            self.speech_end_silence_start = 0
+
+                        # Wait for silence to stop recording after speech
+                        if self.speech_end_silence_start and time.time() - self.speech_end_silence_start > self.post_speech_silence_duration:
+                            logging.info("voice deactivity detected")
+                            self.stop()
+
+
+                if not self.is_recording and was_recording:
+                    # Reset after stopping recording to ensure clean state
+                    self.stop_recording_on_voice_deactivity = False
+
+                if time.time() - self.silero_check_time > 0.1:
+                    self.silero_check_time = 0
+                
+                # Handle wake word timeout (waited to long initiating speech after wake word detection)
+                if self.wake_word_detect_time and time.time() - self.wake_word_detect_time > self.wake_word_timeout:
+                    self.wake_word_detect_time = 0
+                    if self.wakeword_detected and self.on_wakeword_timeout:
+                        self.on_wakeword_timeout()
+                    self.wakeword_detected = False
+
+                was_recording = self.is_recording
+
+
+                if self.is_recording:
+                    self.frames.append(data)
+
+                if not self.is_recording or self.speech_end_silence_start:
+                    self.audio_buffer.append(data)	
+
 
 
         except Exception as e:
-            logging.error(f"Error closing the audio stream: {e}")
+            logging.error(f"Unhandled exeption in _recording_worker: {e}")
+            raise
+
+
+    def _realtime_worker(self):
+        """
+        Performs real-time transcription if the feature is enabled.
+
+        The method is responsible transcribing recorded audio frames in real-time
+         based on the specified resolution interval.
+        The transcribed text is stored in `self.realtime_transcription_text` and a callback
+        function is invoked with this text if specified.
+        """
+
+        try:
+
+            logging.debug('Starting realtime worker')
+
+            # Return immediately if real-time transcription is not enabled
+            if not self.enable_realtime_transcription:
+                return
+                
+            # Continue running as long as the main process is active
+            while self.is_running:
+
+                # Check if the recording is active
+                if self.is_recording:
+                    
+                    # Sleep for the duration of the transcription resolution
+                    time.sleep(self.realtime_processing_pause)
+                    
+                    # Convert the buffer frames to a NumPy array
+                    audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+                    
+                    # Normalize the array to a [-1, 1] range
+                    audio_array = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+
+                    # Perform transcription and assemble the text
+                    segments = self.realtime_model_type.transcribe(
+                        audio_array,
+                        language=self.language if self.language else None
+                    )
+
+                    # double check recording state because it could have changed mid-transcription
+                    if self.is_recording and time.time() - self.recording_start_time > 0.5:
+
+                        logging.debug('Starting realtime transcription')
+                        self.realtime_transcription_text = " ".join(seg.text for seg in segments[0]).strip()
+
+                        self.text_storage.append(self.realtime_transcription_text)
+
+                        # Take the last two texts in storage, if they exist
+                        if len(self.text_storage) >= 2:
+                            last_two_texts = self.text_storage[-2:]
+                            
+                            # Find the longest common prefix between the two texts
+                            prefix = os.path.commonprefix([last_two_texts[0], last_two_texts[1]])
+
+                            # This prefix is the text that was transcripted two times in the same way
+                            # Store as "safely detected text" 
+                            if len(prefix) >= len(self.realtime_stabilized_safetext):
+                                # Only store when longer than the previous as additional security 
+                                self.realtime_stabilized_safetext = prefix
+
+                        # Find parts of the stabilized text in the freshly transscripted text
+                        matching_position = self._find_tail_match_in_text(self.realtime_stabilized_safetext, self.realtime_transcription_text)
+                        if matching_position < 0:
+                            if self.realtime_stabilized_safetext:
+                                self._on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_stabilized_safetext, True))
+                            else:
+                                self._on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_transcription_text, True))
+                        else:
+                            # We found parts of the stabilized text in the transcripted text
+                            # We now take the stabilized text and add only the freshly transcripted part to it
+                            output_text = self.realtime_stabilized_safetext + self.realtime_transcription_text[matching_position:]
+
+                            # This yields us the "left" text part as stabilized AND at the same time delivers fresh detected parts 
+                            # on the first run without the need for two transcriptions
+                            self._on_realtime_transcription_stabilized(self._preprocess_output(output_text, True))
+
+                        # Invoke the callback with the transcribed text
+                        self._on_realtime_transcription_update(self._preprocess_output(self.realtime_transcription_text, True))
+
+
+                # If not recording, sleep briefly before checking again
+                else:
+                    time.sleep(TIME_SLEEP)
+
+        except Exception as e:
+            logging.error(f"Unhandled exeption in _realtime_worker: {e}")
+            raise
 
 
 
 
     def _is_silero_speech(self, data):
@@ -705,143 +1029,6 @@ class AudioToTextRecorder:
                 self.halo.text = text
 
 
-    def _recording_worker(self):
-        """
-        The main worker method which constantly monitors the audio input for voice activity and accordingly starts/stops the recording.
-        """
-
-        logging.debug('Starting recording worker')
-
-        try:
-            was_recording = False
-            delay_was_passed = False
-
-            # Continuously monitor audio for voice activity
-            while self.is_running:
-
-                data = self.audio_queue.get()
-
-                if not self.is_recording:
-                    # handle not recording state
-
-                    time_since_listen_start = time.time() - self.listen_start if self.listen_start else 0
-                    wake_word_activation_delay_passed = (time_since_listen_start > self.wake_word_activation_delay)
-
-                    # handle wake-word timeout callback
-                    if wake_word_activation_delay_passed and not delay_was_passed:
-                        if self.wake_words and self.wake_word_activation_delay:
-                            if self.on_wakeword_timeout:
-                                self.on_wakeword_timeout()
-                    delay_was_passed = wake_word_activation_delay_passed
-
-                    # Set state and spinner text 
-                    if not self.recording_stop_time:
-                        if self.wake_words and wake_word_activation_delay_passed and not self.wakeword_detected:
-                            self._set_state("wakeword")
-                        else:
-                            if self.listen_start:
-                                self._set_state("listening")
-                            else:
-                                self._set_state("inactive")
-
-                    # Detect wake words if applicable
-                    if self.wake_words and wake_word_activation_delay_passed:
-                        try:
-                            pcm = struct.unpack_from("h" * self.buffer_size, data)
-                            wakeword_index = self.porcupine.process(pcm)
-
-                        except struct.error:
-                            logging.error("Error unpacking audio data for wake word processing.")
-                            continue
-                        
-                        except Exception as e:
-                            logging.error(f"Wake word processing error: {e}")
-                            continue
-                        
-                        # If a wake word is detected
-                        if wakeword_index >= 0:
-
-                            # Removing the wake word from the recording
-                            samples_for_0_1_sec = int(self.sample_rate * 0.1)
-                            start_index = max(0, len(self.audio_buffer) - samples_for_0_1_sec)
-                            temp_samples = collections.deque(itertools.islice(self.audio_buffer, start_index, None))
-                            self.audio_buffer.clear()
-                            self.audio_buffer.extend(temp_samples)
-
-                            self.wake_word_detect_time = time.time()
-                            self.wakeword_detected = True
-                            if self.on_wakeword_detected:
-                                self.on_wakeword_detected()
-
-                    # Check for voice activity to trigger the start of recording
-                    if ((not self.wake_words or not wake_word_activation_delay_passed) and self.start_recording_on_voice_activity) or self.wakeword_detected:
-
-                        if self._is_voice_active():
-                            logging.info("voice activity detected")
-
-                            self.start()
-
-                            if self.is_recording:
-                                self.start_recording_on_voice_activity = False
-
-                                # Add the buffered audio to the recording frames
-                                self.frames.extend(list(self.audio_buffer))
-                                self.audio_buffer.clear()
-
-                            self.silero_vad_model.reset_states()
-                        else:
-                            data_copy = data[:]
-                            self._check_voice_activity(data_copy)
-
-                    self.speech_end_silence_start = 0
-
-                else:
-                    # If we are currently recording
-
-                    # Stop the recording if silence is detected after speech
-                    if self.stop_recording_on_voice_deactivity:
-
-                        if not self._is_webrtc_speech(data, True):
-
-                            # Voice deactivity was detected, so we start measuring silence time before stopping recording
-                            if self.speech_end_silence_start == 0:
-                                self.speech_end_silence_start = time.time()
-                            
-                        else:
-                            self.speech_end_silence_start = 0
-
-                        # Wait for silence to stop recording after speech
-                        if self.speech_end_silence_start and time.time() - self.speech_end_silence_start > self.post_speech_silence_duration:
-                            logging.info("voice deactivity detected")
-                            self.stop()
-
-                if not self.is_recording and was_recording:
-                    # Reset after stopping recording to ensure clean state
-                    self.stop_recording_on_voice_deactivity = False
-
-                if time.time() - self.silero_check_time > 0.1:
-                    self.silero_check_time = 0
-                
-                # handle wake word timeout (waited to long initiating speech after wake word detection)
-                if self.wake_word_detect_time and time.time() - self.wake_word_detect_time > self.wake_word_timeout:
-                    self.wake_word_detect_time = 0
-                    if self.wakeword_detected and self.on_wakeword_timeout:
-                        self.on_wakeword_timeout()
-                    self.wakeword_detected = False
-
-                if self.is_recording:
-                    self.frames.append(data)
-
-                if not self.is_recording or self.speech_end_silence_start:
-                    self.audio_buffer.append(data)	
-
-                was_recording = self.is_recording
-
-        except Exception as e:
-            logging.error(f"Unhandled exeption in _recording_worker: {e}")
-            raise
-
-
     def _preprocess_output(self, text, preview=False):
         """
         Preprocesses the output text by removing any leading or trailing whitespace,
@@ -869,7 +1056,7 @@ class AudioToTextRecorder:
         return text
 
 
-    def find_tail_match_in_text(self, text1, text2, length_of_match=10):
+    def _find_tail_match_in_text(self, text1, text2, length_of_match=10):
         """
         """
         Find the position where the last 'n' characters of text1 match with a substring in text2.
         Find the position where the last 'n' characters of text1 match with a substring in text2.
         
         
@@ -905,106 +1092,68 @@ class AudioToTextRecorder:
         
         
         return -1
     
+
     def _on_realtime_transcription_stabilized(self, text):
+        """
+        Callback method invoked when the real-time transcription stabilizes.
+
+        This method is called internally when the transcription text is considered "stable" 
+        meaning it's less likely to change significantly with additional audio input. It 
+        notifies any registered external listener about the stabilized text if recording is 
+        still ongoing. This is particularly useful for applications that need to display 
+        live transcription results to users and want to highlight parts of the transcription 
+        that are less likely to change.
+
+        Args:
+            text (str): The stabilized transcription text.
+        """        
         if self.on_realtime_transcription_stabilized:
             if self.is_recording:
                 self.on_realtime_transcription_stabilized(text)
 
 
+
     def _on_realtime_transcription_update(self, text):
+        """
+        Callback method invoked when there's an update in the real-time transcription.
+
+        This method is called internally whenever there's a change in the transcription text,
+        notifying any registered external listener about the update if recording is still 
+        ongoing. This provides a mechanism for applications to receive and possibly display 
+        live transcription updates, which could be partial and still subject to change.
+
+        Args:
+            text (str): The updated transcription text.
+        """        
         if self.on_realtime_transcription_update:
             if self.is_recording:
                 self.on_realtime_transcription_update(text)
 
 
-    def _realtime_worker(self):
-        """
-        Performs real-time transcription if the feature is enabled.
 
 
-        The method is responsible transcribing recorded audio frames in real-time
-         based on the specified resolution interval.
-        The transcribed text is stored in `self.realtime_transcription_text` and a callback
-        function is invoked with this text if specified.
+    def __enter__(self):
         """
         """
+        Method to setup the context manager protocol.
 
 
-        try:
+        This enables the instance to be used in a `with` statement, ensuring proper 
+        resource management. When the `with` block is entered, this method is 
+        automatically called.
 
 
-            logging.debug('Starting realtime worker')
-
-            # Return immediately if real-time transcription is not enabled
-            if not self.enable_realtime_transcription:
-                return
-                
-            # Continue running as long as the main process is active
-            while self.is_running:
-
-                # Check if the recording is active
-                if self.is_recording:
-                    
-                    # Sleep for the duration of the transcription resolution
-                    time.sleep(self.realtime_processing_pause)
-                    
-                    # Convert the buffer frames to a NumPy array
-                    audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
-                    
-                    # Normalize the array to a [-1, 1] range
-                    audio_array = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
-
-                    # Perform transcription and assemble the text
-                    segments = self.realtime_model_type.transcribe(
-                        audio_array,
-                        language=self.language if self.language else None
-                    )
-
-                    # double check recording state because it could have changed mid-transcription
-                    if self.is_recording and time.time() - self.recording_start_time > 0.5:
-
-                        logging.debug('Starting realtime transcription')
-                        self.realtime_transcription_text = " ".join(seg.text for seg in segments[0]).strip()
-
-                        self.text_storage.append(self.realtime_transcription_text)
-
-                        # Take the last two texts in storage, if they exist
-                        if len(self.text_storage) >= 2:
-                            last_two_texts = self.text_storage[-2:]
-                            
-                            # Find the longest common prefix between the two texts
-                            prefix = os.path.commonprefix([last_two_texts[0], last_two_texts[1]])
-
-                            # This prefix is the text that was transcripted two times in the same way
-                            # Store as "safely detected text" 
-                            if len(prefix) >= len(self.realtime_stabilized_safetext):
-                                # Only store when longer than the previous as additional security 
-                                self.realtime_stabilized_safetext = prefix
-
-                        # Find parts of the stabilized text in the freshly transscripted text
-                        matching_position = self.find_tail_match_in_text(self.realtime_stabilized_safetext, self.realtime_transcription_text)
-                        if matching_position < 0:
-                            if self.realtime_stabilized_safetext:
-                                self._on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_stabilized_safetext, True))
-                            else:
-                                self._on_realtime_transcription_stabilized(self._preprocess_output(self.realtime_transcription_text, True))
-                        else:
-                            # We found parts of the stabilized text in the transcripted text
-                            # We now take the stabilized text and add only the freshly transcripted part to it
-                            output_text = self.realtime_stabilized_safetext + self.realtime_transcription_text[matching_position:]
-
-                            # This yields us the "left" text part as stabilized AND at the same time delivers fresh detected parts 
-                            # on the first run without the need for two transcriptions
-                            self._on_realtime_transcription_stabilized(self._preprocess_output(output_text, True))
-
-                        # Invoke the callback with the transcribed text
-                        self._on_realtime_transcription_update(self._preprocess_output(self.realtime_transcription_text, True))
+        Returns:
+            self: The current instance of the class.
+        """
+        return self
 
 
 
 
-                # If not recording, sleep briefly before checking again
-                else:
-                    time.sleep(TIME_SLEEP)
+    def __exit__(self, exc_type, exc_value, traceback):
+        """
+        Method to define behavior when the context manager protocol exits.
 
 
-        except Exception as e:
-            logging.error(f"Unhandled exeption in _realtime_worker: {e}")
-            raise
+        This is called when exiting the `with` block and ensures that any necessary 
+        cleanup or resource release processes are executed, such as shutting down 
+        the system properly.
 
 
-    def __del__(self):
-        """
-        Destructor method ensures safe shutdown of the recorder when the instance is destroyed.
+        Args:
+            exc_type (Exception or None): The type of the exception that caused the context to be exited, if any.
+            exc_value (Exception or None): The exception instance that caused the context to be exited, if any.
+            traceback (Traceback or None): The traceback corresponding to the exception, if any.
         """
         """
         self.shutdown()
         self.shutdown()

+ 475 - 467
example_app/ui_openai_voice_interface.py

@@ -1,514 +1,522 @@
-from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
-from RealtimeSTT import AudioToTextRecorder
-
-from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
-from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont, QMouseEvent, QContextMenuEvent
-from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget, QMenu, QAction
-
-import os
-import openai
-import sys
-import time
-import sounddevice as sd
-import numpy as np
-import wavio
-import keyboard
-
-max_history_messages = 6
-return_to_wakewords_after_silence = 12
-start_with_wakeword = False
-start_engine = "Azure" # Azure, Elevenlabs
-recorder_model = "large-v2"
-language = "en"
-azure_speech_region = "eastus"
-openai_model = "gpt-3.5-turbo" # gpt-3.5-turbo, gpt-4, gpt-3.5-turbo-0613 / gpt-3.5-turbo-16k-0613 / gpt-4-0613 / gpt-4-32k-0613
-
-openai.api_key = os.environ.get("OPENAI_API_KEY")
-
-user_font_size = 22
-user_color = QColor(0, 188, 242) # turquoise
-
-assistant_font_size = 24
-assistant_color = QColor(239, 98, 166) # pink
-
-voice_azure = "en-GB-SoniaNeural"
-voice_system = "Zira"
-#voice_system = "Hazel"
-prompt = "Be concise, polite, and casual with a touch of sass. Aim for short, direct responses, as if we're talking."
-elevenlabs_model = "eleven_monolingual_v1"
-
-if language == "de":
-    elevenlabs_model = "eleven_multilingual_v1"
-    voice_system = "Katja"
-    voice_azure = "de-DE-MajaNeural"
-    prompt = 'Sei präzise, höflich und locker, mit einer Prise Schlagfertigkeit. Antworte kurz und direkt, als ob wir gerade sprechen.'
-    
-print ("Click the top right corner to change the engine")
-print ("Press ESC to stop the current playback")
-
-system_prompt_message = {
-    'role': 'system',
-    'content': prompt
-}
-
-def generate_response(messages):
-    """Generate assistant's response using OpenAI."""
-    for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True, logit_bias={35309:-100, 36661:-100}):
-        text_chunk = chunk["choices"][0]["delta"].get("content")
-        if text_chunk:
-            yield text_chunk
-
-history = []
-MAX_WINDOW_WIDTH = 1600
-MAX_WIDTH_ASSISTANT = 1200
-MAX_WIDTH_USER = 1500
-
-class AudioPlayer(QThread):
-    def __init__(self, file_path):
-        super(AudioPlayer, self).__init__()
-        self.file_path = file_path
-
-    def run(self):
-        wav = wavio.read(self.file_path)
-        sound = wav.data.astype(np.float32) / np.iinfo(np.int16).max  
-        sd.play(sound, wav.rate)
-        sd.wait()
-
-class TextRetrievalThread(QThread):
-    textRetrieved = pyqtSignal(str)
-
-    def __init__(self, recorder):
-        super().__init__()
-        self.recorder = recorder
-        self.active = False  
-
-    def run(self):
-        while True:
-            if self.active:  
-                text = self.recorder.text()
-                self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
-                self.textRetrieved.emit(text)
-                self.active = False
-            time.sleep(0.1) 
-
-    def activate(self):
-        self.active = True 
-
-class TransparentWindow(QWidget):
-    updateUI = pyqtSignal()
-    clearAssistantTextSignal = pyqtSignal()
-    clearUserTextSignal = pyqtSignal()
-
-    def __init__(self):
-        super().__init__()
-
-        self.setGeometry(1, 1, 1, 1) 
-
-        self.setWindowTitle("Transparent Window")
-        self.setAttribute(Qt.WA_TranslucentBackground)
-        self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)
-
-        self.big_symbol_font = QFont('Arial', 32)
-        self.small_symbol_font = QFont('Arial', 17)
-        self.user_font = QFont('Arial', user_font_size)
-        self.assistant_font = QFont('Arial', assistant_font_size)      
-        self.assistant_font.setItalic(True) 
-
-        self.big_symbol_text = ""
-        self.small_symbol_text = ""
-        self.user_text = ""
-        self.assistant_text = ""
-        self.displayed_user_text = ""
-        self.displayed_assistant_text = ""
-        self.stream = None
-        self.text_retrieval_thread = None
-
-        self.user_text_timer = QTimer(self)
-        self.assistant_text_timer = QTimer(self)
-        self.user_text_timer.timeout.connect(self.clear_user_text)
-        self.assistant_text_timer.timeout.connect(self.clear_assistant_text)
-
-        self.clearUserTextSignal.connect(self.init_clear_user_text)
-        self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)
-        self.user_text_opacity = 255 
-        self.assistant_text_opacity = 255 
-        self.updateUI.connect(self.update_self)
-        self.audio_player = None
-
-        self.run_fade_user = False
-        self.run_fade_assistant = False
-
-        self.menu = QMenu()
-        self.menu.setStyleSheet("""
-            QMenu {
-                background-color: black;
-                color: white;
-                border-radius: 10px;
-            }
-            QMenu::item:selected {
-                background-color: #555555;
-            }
-            """)
-
-        self.elevenlabs_action = QAction("Elevenlabs", self)
-        self.azure_action = QAction("Azure", self)
-        self.system_action = QAction("System", self)
-        self.quit_action = QAction("Quit", self)
-
-        self.menu.addAction(self.elevenlabs_action)
-        self.menu.addAction(self.azure_action)
-        self.menu.addAction(self.system_action)
-        self.menu.addSeparator() 
-        self.menu.addAction(self.quit_action)
-
-        self.elevenlabs_action.triggered.connect(lambda: self.select_engine("Elevenlabs"))
-        self.azure_action.triggered.connect(lambda: self.select_engine("Azure"))
-        self.system_action.triggered.connect(lambda: self.select_engine("System"))
-        self.quit_action.triggered.connect(self.close_application)
-
-    def mousePressEvent(self, event: QMouseEvent):
-        if event.button() == Qt.LeftButton:
-            if event.pos().x() >= self.width() - 100 and event.pos().y() <= 100:
-                self.menu.exec_(self.mapToGlobal(event.pos()))        
-
-    def close_application(self):
-        QApplication.quit()                
-
-    def init(self):
-
-        self.select_engine(start_engine)
-
-        # recorder = AudioToTextRecorder(spinner=False, model="large-v2", language="de", on_recording_start=recording_start, silero_sensitivity=0.4, post_speech_silence_duration=0.4, min_length_of_recording=0.3, min_gap_between_recordings=0.01, realtime_preview_resolution = 0.01, realtime_preview = True, realtime_preview_model = "small", on_realtime_preview=text_detected)
-
-        self.recorder = AudioToTextRecorder(
-            model=recorder_model,
-            language=language,
-            wake_words="Jarvis",
-            spinner=True,
-            silero_sensitivity=0.2,
-            webrtc_sensitivity=3,
-            on_recording_start=self.on_recording_start,
-            on_vad_detect_start=self.on_vad_detect_start,
-            on_wakeword_detection_start=self.on_wakeword_detection_start,
-            on_transcription_start=self.on_transcription_start,
-            post_speech_silence_duration=0.4, 
-            min_length_of_recording=0.3, 
-            min_gap_between_recordings=0.01, 
-            enable_realtime_transcription = True,
-            realtime_processing_pause = 0.01, 
-            realtime_model_type = "tiny",
-            on_realtime_transcription_stabilized=self.text_detected
-        )
-        if not start_with_wakeword:
-            self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
-            
-        self.text_retrieval_thread = TextRetrievalThread(self.recorder)
-        self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
-        self.text_retrieval_thread.start()
-        self.text_retrieval_thread.activate()
-
-        keyboard.on_press_key('esc', self.on_escape)
+if __name__ == '__main__':
 
 
-    def select_engine(self, engine_name):
-        if self.stream:
-            self.stream.stop()
+    from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
+    from RealtimeSTT import AudioToTextRecorder
+
+    from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
+    from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont, QMouseEvent, QContextMenuEvent
+    from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget, QMenu, QAction
+
+    import os
+    import openai
+    import sys
+    import time
+    import sounddevice as sd
+    import numpy as np
+    import wavio
+    import keyboard
+
+    max_history_messages = 6
+    return_to_wakewords_after_silence = 12
+    start_with_wakeword = False
+    start_engine = "Azure" # Azure, Elevenlabs
+    recorder_model = "large-v2"
+    language = "en"
+    azure_speech_region = "eastus"
+    openai_model = "gpt-3.5-turbo" # gpt-3.5-turbo, gpt-4, gpt-3.5-turbo-0613 / gpt-3.5-turbo-16k-0613 / gpt-4-0613 / gpt-4-32k-0613
+
+    openai.api_key = os.environ.get("OPENAI_API_KEY")
+
+    user_font_size = 22
+    user_color = QColor(0, 188, 242) # turquoise
+
+    assistant_font_size = 24
+    assistant_color = QColor(239, 98, 166) # pink
+
+    voice_azure = "en-GB-SoniaNeural"
+    voice_system = "Zira"
+    #voice_system = "Hazel"
+    prompt = "Be concise, polite, and casual with a touch of sass. Aim for short, direct responses, as if we're talking."
+    elevenlabs_model = "eleven_monolingual_v1"
+
+    if language == "de":
+        elevenlabs_model = "eleven_multilingual_v1"
+        voice_system = "Katja"
+        voice_azure = "de-DE-MajaNeural"
+        prompt = 'Sei präzise, höflich und locker, mit einer Prise Schlagfertigkeit. Antworte kurz und direkt, als ob wir gerade sprechen.'
+        
+    print ("Click the top right corner to change the engine")
+    print ("Press ESC to stop the current playback")
+
+    system_prompt_message = {
+        'role': 'system',
+        'content': prompt
+    }
+
+    def generate_response(messages):
+        """Generate assistant's response using OpenAI."""
+        for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True, logit_bias={35309:-100, 36661:-100}):
+            text_chunk = chunk["choices"][0]["delta"].get("content")
+            if text_chunk:
+                yield text_chunk
+
+    history = []
+    MAX_WINDOW_WIDTH = 1600
+    MAX_WIDTH_ASSISTANT = 1200
+    MAX_WIDTH_USER = 1500
+
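+    # Plays a wav file on a background Qt thread: wavio reads the file, the int16
+    # samples are normalized to float32 and played back through sounddevice.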
+    class AudioPlayer(QThread):
+        def __init__(self, file_path):
+            super(AudioPlayer, self).__init__()
+            self.file_path = file_path
+
+        def run(self):
+            wav = wavio.read(self.file_path)
+            sound = wav.data.astype(np.float32) / np.iinfo(np.int16).max  
+            sd.play(sound, wav.rate)
+            sd.wait()
+
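+    # Worker thread: once activated it blocks on recorder.text(), restores the wake
+    # word activation delay and emits the transcribed sentence via textRetrieved.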
+    class TextRetrievalThread(QThread):
+        textRetrieved = pyqtSignal(str)
+
+        def __init__(self, recorder):
+            super().__init__()
+            self.recorder = recorder
+            self.active = False  
+
+        def run(self):
+            while True:
+                if self.active:  
+                    text = self.recorder.text()
+                    self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
+                    self.textRetrieved.emit(text)
+                    self.active = False
+                time.sleep(0.1) 
+
+        def activate(self):
+            self.active = True 
+
+    class TransparentWindow(QWidget):
+        updateUI = pyqtSignal()
+        clearAssistantTextSignal = pyqtSignal()
+        clearUserTextSignal = pyqtSignal()
+
+        def __init__(self):
+            super().__init__()
+
+            self.setGeometry(1, 1, 1, 1) 
+
+            self.setWindowTitle("Transparent Window")
+            self.setAttribute(Qt.WA_TranslucentBackground)
+            self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)
+
+            self.big_symbol_font = QFont('Arial', 32)
+            self.small_symbol_font = QFont('Arial', 17)
+            self.user_font = QFont('Arial', user_font_size)
+            self.assistant_font = QFont('Arial', assistant_font_size)      
+            self.assistant_font.setItalic(True) 
+
+            self.big_symbol_text = ""
+            self.small_symbol_text = ""
+            self.user_text = ""
+            self.assistant_text = ""
+            self.displayed_user_text = ""
+            self.displayed_assistant_text = ""
             self.stream = None
+            self.text_retrieval_thread = None
 
 
-        engine = None
+            self.user_text_timer = QTimer(self)
+            self.assistant_text_timer = QTimer(self)
+            self.user_text_timer.timeout.connect(self.clear_user_text)
+            self.assistant_text_timer.timeout.connect(self.clear_assistant_text)
 
 
-        if engine_name == "Azure":
-            engine = AzureEngine(
-                    os.environ.get("AZURE_SPEECH_KEY"),
-                    os.environ.get("AZURE_SPEECH_REGION"),
-                    voice_azure,
-                    rate=24,
-                    pitch=10,
-                )
+            self.clearUserTextSignal.connect(self.init_clear_user_text)
+            self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)
+            self.user_text_opacity = 255 
+            self.assistant_text_opacity = 255 
+            self.updateUI.connect(self.update_self)
+            self.audio_player = None
+
+            self.run_fade_user = False
+            self.run_fade_assistant = False
+
+            self.menu = QMenu()
+            self.menu.setStyleSheet("""
+                QMenu {
+                    background-color: black;
+                    color: white;
+                    border-radius: 10px;
+                }
+                QMenu::item:selected {
+                    background-color: #555555;
+                }
+                """)
+
+            self.elevenlabs_action = QAction("Elevenlabs", self)
+            self.azure_action = QAction("Azure", self)
+            self.system_action = QAction("System", self)
+            self.quit_action = QAction("Quit", self)
+
+            self.menu.addAction(self.elevenlabs_action)
+            self.menu.addAction(self.azure_action)
+            self.menu.addAction(self.system_action)
+            self.menu.addSeparator() 
+            self.menu.addAction(self.quit_action)
+
+            self.elevenlabs_action.triggered.connect(lambda: self.select_engine("Elevenlabs"))
+            self.azure_action.triggered.connect(lambda: self.select_engine("Azure"))
+            self.system_action.triggered.connect(lambda: self.select_engine("System"))
+            self.quit_action.triggered.connect(self.close_application)
+
+        def mousePressEvent(self, event: QMouseEvent):
+            if event.button() == Qt.LeftButton:
+                if event.pos().x() >= self.width() - 100 and event.pos().y() <= 100:
+                    self.menu.exec_(self.mapToGlobal(event.pos()))        
+
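+        # Shut the recorder down cleanly before quitting the application.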
+        def close_application(self):
+            if self.recorder:
+                self.recorder.shutdown()                    
+            QApplication.quit()                
+
+        def init(self):
+
+            self.select_engine(start_engine)
+
+            # recorder = AudioToTextRecorder(spinner=False, model="large-v2", language="de", on_recording_start=recording_start, silero_sensitivity=0.4, post_speech_silence_duration=0.4, min_length_of_recording=0.3, min_gap_between_recordings=0.01, realtime_preview_resolution = 0.01, realtime_preview = True, realtime_preview_model = "small", on_realtime_preview=text_detected)
+
+            self.recorder = AudioToTextRecorder(
+                model=recorder_model,
+                language=language,
+                wake_words="Jarvis",
+                silero_use_onnx=False,
+                spinner=True,
+                silero_sensitivity=0.2,
+                webrtc_sensitivity=3,
+                on_recording_start=self.on_recording_start,
+                on_vad_detect_start=self.on_vad_detect_start,
+                on_wakeword_detection_start=self.on_wakeword_detection_start,
+                on_transcription_start=self.on_transcription_start,
+                post_speech_silence_duration=0.4, 
+                min_length_of_recording=0.3, 
+                min_gap_between_recordings=0.01, 
+                enable_realtime_transcription = True,
+                realtime_processing_pause = 0.01, 
+                realtime_model_type = "tiny",
+                on_realtime_transcription_stabilized=self.text_detected
+            )
+            if not start_with_wakeword:
+                self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
+                
+            self.text_retrieval_thread = TextRetrievalThread(self.recorder)
+            self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
+            self.text_retrieval_thread.start()
+            self.text_retrieval_thread.activate()
 
 
-        elif engine_name == "Elevenlabs":
-            engine = ElevenlabsEngine(
-                    os.environ.get("ELEVENLABS_API_KEY"),
-                    model=elevenlabs_model
+            keyboard.on_press_key('esc', self.on_escape)
+
+        def closeEvent(self, event):
+            if self.recorder:
+                self.recorder.shutdown()            
+
+        def select_engine(self, engine_name):
+            if self.stream:
+                self.stream.stop()
+                self.stream = None
+
+            engine = None
+
+            if engine_name == "Azure":
+                engine = AzureEngine(
+                        os.environ.get("AZURE_SPEECH_KEY"),
+                        os.environ.get("AZURE_SPEECH_REGION"),
+                        voice_azure,
+                        rate=24,
+                        pitch=10,
+                    )
+
+            elif engine_name == "Elevenlabs":
+                engine = ElevenlabsEngine(
+                        os.environ.get("ELEVENLABS_API_KEY"),
+                        model=elevenlabs_model
+                    )
+            else:
+                engine = SystemEngine(
+                    voice=voice_system,
+                    #print_installed_voices=True
                 )
-        else:
-            engine = SystemEngine(
-                voice=voice_system,
-                #print_installed_voices=True
+
+            self.stream = TextToAudioStream(
+                engine,
+                on_character=self.on_character,
+                on_text_stream_stop=self.on_text_stream_stop,
+                on_text_stream_start=self.on_text_stream_start,
+                on_audio_stream_stop=self.on_audio_stream_stop,
+                log_characters=True
             )
+            sys.stdout.write('\033[K')  # Clear to the end of line
+            sys.stdout.write('\r')  # Move the cursor to the beginning of the line
+            print (f"Using {engine_name} engine")
 
 
-        self.stream = TextToAudioStream(
-            engine,
-            on_character=self.on_character,
-            on_text_stream_stop=self.on_text_stream_stop,
-            on_text_stream_start=self.on_text_stream_start,
-            on_audio_stream_stop=self.on_audio_stream_stop,
-            log_characters=True
-        )
-        sys.stdout.write('\033[K')  # Clear to the end of line
-        sys.stdout.write('\r')  # Move the cursor to the beginning of the line
-        print (f"Using {engine_name} engine")
-
-
-    def text_detected(self, text):
-        self.run_fade_user = False
-        if self.user_text_timer.isActive():
-            self.user_text_timer.stop()
-        self.user_text_opacity = 255 
-        self.user_text = text
-        self.updateUI.emit()
-
-    def on_escape(self, e):
-        if self.stream.is_playing():
-            self.stream.stop()
-
-    def showEvent(self, event: QEvent):
-        super().showEvent(event)
-        if event.type() == QEvent.Show:
-            self.set_symbols("⌛", "🚀")
-            QTimer.singleShot(1000, self.init) 
-
-    def on_character(self, char):
-        if self.stream:
-            self.assistant_text += char
+
+        def text_detected(self, text):
+            self.run_fade_user = False
+            if self.user_text_timer.isActive():
+                self.user_text_timer.stop()
+            self.user_text_opacity = 255 
+            self.user_text = text
             self.updateUI.emit()
 
 
-    def on_text_stream_stop(self):
-        print("\"", end="", flush=True)
-        if self.stream:
-            assistant_response = self.stream.text()            
-            self.assistant_text = assistant_response
-            history.append({'role': 'assistant', 'content': assistant_response})
+        def on_escape(self, e):
+            if self.stream.is_playing():
+                self.stream.stop()
 
 
-    def on_audio_stream_stop(self):
-        self.set_symbols("🎙️", "⚪")
+        def showEvent(self, event: QEvent):
+            super().showEvent(event)
+            if event.type() == QEvent.Show:
+                self.set_symbols("⌛", "🚀")
+                QTimer.singleShot(1000, self.init) 
 
 
-        if self.stream:
-            self.clearAssistantTextSignal.emit()
-            self.text_retrieval_thread.activate()
+        def on_character(self, char):
+            if self.stream:
+                self.assistant_text += char
+                self.updateUI.emit()
 
 
-    def generate_answer(self):
-        self.run_fade_assistant = False
-        if self.assistant_text_timer.isActive():
-            self.assistant_text_timer.stop()
+        def on_text_stream_stop(self):
+            print("\"", end="", flush=True)
+            if self.stream:
+                assistant_response = self.stream.text()            
+                self.assistant_text = assistant_response
+                history.append({'role': 'assistant', 'content': assistant_response})
 
 
-        history.append({'role': 'user', 'content': self.user_text})
-        self.remove_assistant_text()
-        assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
-        self.stream.feed(assistant_response)
-        self.stream.play_async(minimum_sentence_length=6,
-                               buffer_threshold_seconds=2)
+        def on_audio_stream_stop(self):
+            self.set_symbols("🎙️", "⚪")
 
 
-    def set_symbols(self, big_symbol, small_symbol):
-        self.big_symbol_text = big_symbol
-        self.small_symbol_text = small_symbol
-        self.updateUI.emit()
+            if self.stream:
+                self.clearAssistantTextSignal.emit()
+                self.text_retrieval_thread.activate()
 
 
-    def on_text_stream_start(self):
-        self.set_symbols("⌛", "👄")
+        def generate_answer(self):
+            self.run_fade_assistant = False
+            if self.assistant_text_timer.isActive():
+                self.assistant_text_timer.stop()
+
+            history.append({'role': 'user', 'content': self.user_text})
+            self.remove_assistant_text()
+            assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
+            self.stream.feed(assistant_response)
+            self.stream.play_async(minimum_sentence_length=6,
+                                buffer_threshold_seconds=2)
+
+        def set_symbols(self, big_symbol, small_symbol):
+            self.big_symbol_text = big_symbol
+            self.small_symbol_text = small_symbol
+            self.updateUI.emit()
 
 
-    def process_user_text(self, user_text):
-        user_text = user_text.strip()
-        if user_text:
-            self.run_fade_user = False
+        def on_text_stream_start(self):
+            self.set_symbols("⌛", "👄")
+
+        def process_user_text(self, user_text):
+            user_text = user_text.strip()
+            if user_text:
+                self.run_fade_user = False
+                if self.user_text_timer.isActive():
+                    self.user_text_timer.stop()
+
+                self.user_text_opacity = 255 
+                self.user_text = user_text
+                self.clearUserTextSignal.emit()
+                print (f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
+                self.set_symbols("⌛", "🧠")
+                QTimer.singleShot(100, self.generate_answer)
+
+        def on_transcription_start(self):
+            self.set_symbols("⌛", "📝")
+
+        def on_recording_start(self):
+            self.text_storage = []
+            self.ongoing_sentence = ""
+            self.set_symbols("🎙️", "🔴")
+
+        def on_vad_detect_start(self):
+            if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
+                self.audio_player = AudioPlayer("active.wav")
+                self.audio_player.start() 
+
+            self.set_symbols("🎙️", "⚪")
+
+        def on_wakeword_detection_start(self):
+            self.audio_player = AudioPlayer("inactive.wav")
+            self.audio_player.start()         
+
+            self.set_symbols("", "💤")
+
+        def init_clear_user_text(self):
             if self.user_text_timer.isActive():
-                self.user_text_timer.stop()
+                self.user_text_timer.stop()        
+            self.user_text_timer.start(10000)
 
 
+        def remove_user_text(self):
+            self.user_text = ""
             self.user_text_opacity = 255 
-            self.user_text = user_text
-            self.clearUserTextSignal.emit()
-            print (f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
-            self.set_symbols("⌛", "🧠")
-            QTimer.singleShot(100, self.generate_answer)
+            self.updateUI.emit()
 
 
-    def on_transcription_start(self):
-        self.set_symbols("⌛", "📝")
+        def fade_out_user_text(self):
+            if not self.run_fade_user:
+                return
 
 
-    def on_recording_start(self):
-        self.text_storage = []
-        self.ongoing_sentence = ""
-        self.set_symbols("🎙️", "🔴")
+            if self.user_text_opacity > 0:
+                self.user_text_opacity -= 5 
+                self.updateUI.emit()
+                QTimer.singleShot(50, self.fade_out_user_text)
+            else:
+                self.run_fade_user = False
+                self.remove_user_text()        
 
 
-    def on_vad_detect_start(self):
-        if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
-            self.audio_player = AudioPlayer("active.wav")
-            self.audio_player.start() 
+        def clear_user_text(self):
+            self.user_text_timer.stop()
 
 
-        self.set_symbols("🎙️", "⚪")
+            if not self.user_text:
+                return
 
 
-    def on_wakeword_detection_start(self):
-        self.audio_player = AudioPlayer("inactive.wav")
-        self.audio_player.start()         
+            self.user_text_opacity = 255
+            self.run_fade_user = True
+            self.fade_out_user_text()
 
 
-        self.set_symbols("", "💤")
+        def init_clear_assistant_text(self):
+            if self.assistant_text_timer.isActive():
+                self.assistant_text_timer.stop()        
+            self.assistant_text_timer.start(10000)
 
 
-    def init_clear_user_text(self):
-        if self.user_text_timer.isActive():
-            self.user_text_timer.stop()        
-        self.user_text_timer.start(10000)
+        def remove_assistant_text(self):
+            self.assistant_text = ""
+            self.assistant_text_opacity = 255 
+            self.updateUI.emit()
 
 
-    def remove_user_text(self):
-        self.user_text = ""
-        self.user_text_opacity = 255 
-        self.updateUI.emit()
+        def fade_out_assistant_text(self):
+            if not self.run_fade_assistant:
+                return
+            
+            if self.assistant_text_opacity > 0:
+                self.assistant_text_opacity -= 5 
+                self.updateUI.emit()
+                QTimer.singleShot(50, self.fade_out_assistant_text)
+            else:
+                self.run_fade_assistant = False
+                self.remove_assistant_text()        
 
 
-    def fade_out_user_text(self):
-        if not self.run_fade_user:
-            return
+        def clear_assistant_text(self):
+            self.assistant_text_timer.stop()
 
 
-        if self.user_text_opacity > 0:
-            self.user_text_opacity -= 5 
-            self.updateUI.emit()
-            QTimer.singleShot(50, self.fade_out_user_text)
-        else:
-            self.run_fade_user = False
-            self.remove_user_text()        
+            if not self.assistant_text:
+                return
 
 
-    def clear_user_text(self):
-        self.user_text_timer.stop()
+            self.assistant_text_opacity = 255
+            self.run_fade_assistant = True
+            self.fade_out_assistant_text()
 
 
-        if not self.user_text:
-            return
+        def update_self(self):
 
 
-        self.user_text_opacity = 255
-        self.run_fade_user = True
-        self.fade_out_user_text()
+            self.blockSignals(True)
+                    
+            self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
+            self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)       
 
 
-    def init_clear_assistant_text(self):
-        if self.assistant_text_timer.isActive():
-            self.assistant_text_timer.stop()        
-        self.assistant_text_timer.start(10000)
+            fm_symbol = QFontMetrics(self.big_symbol_font)
+            self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
+            self.symbol_height = fm_symbol.height() + 8
 
 
-    def remove_assistant_text(self):
-        self.assistant_text = ""
-        self.assistant_text_opacity = 255 
-        self.updateUI.emit()
+            self.total_width = MAX_WINDOW_WIDTH
 
 
-    def fade_out_assistant_text(self):
-        if not self.run_fade_assistant:
-            return
-        
-        if self.assistant_text_opacity > 0:
-            self.assistant_text_opacity -= 5 
-            self.updateUI.emit()
-            QTimer.singleShot(50, self.fade_out_assistant_text)
-        else:
-            self.run_fade_assistant = False
-            self.remove_assistant_text()        
+            fm_user = QFontMetrics(self.user_font)
+            user_text_lines = (self.displayed_user_text.count("\n") + 1)
+            self.user_height = fm_user.height() * user_text_lines + 7
 
 
-    def clear_assistant_text(self):
-        self.assistant_text_timer.stop()
+            fm_assistant = QFontMetrics(self.assistant_font)
+            assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
+            self.assistant_height = fm_assistant.height() * assistant_text_lines + 18
 
 
-        if not self.assistant_text:
-            return
+            self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])
 
 
-        self.assistant_text_opacity = 255
-        self.run_fade_assistant = True
-        self.fade_out_assistant_text()
+            desktop = QDesktopWidget()
+            screen_rect = desktop.availableGeometry(desktop.primaryScreen())
+            self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)
 
 
-    def update_self(self):
+            self.blockSignals(False)
 
 
-        self.blockSignals(True)
-                
-        self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
-        self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)       
-
-        fm_symbol = QFontMetrics(self.big_symbol_font)
-        self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
-        self.symbol_height = fm_symbol.height() + 8
-
-        self.total_width = MAX_WINDOW_WIDTH
-
-        fm_user = QFontMetrics(self.user_font)
-        user_text_lines = (self.displayed_user_text.count("\n") + 1)
-        self.user_height = fm_user.height() * user_text_lines + 7
-
-        fm_assistant = QFontMetrics(self.assistant_font)
-        assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
-        self.assistant_height = fm_assistant.height() * assistant_text_lines + 18
-
-        self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])
-
-        desktop = QDesktopWidget()
-        screen_rect = desktop.availableGeometry(desktop.primaryScreen())
-        self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)
-
-        self.blockSignals(False)
-
-        self.update()
-
-    def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
-        painter.setPen(outlineColor)
-        for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
-                    (-outline_size, -outline_size), (outline_size, -outline_size),
-                    (-outline_size, outline_size), (outline_size, outline_size)]:
-            painter.drawText(x + dx, y + dy, width, height, alignment, text)
-
-        painter.setPen(textColor)
-        painter.drawText(x, y, width, height, alignment, text)
-
-    def paintEvent(self, event):
-        painter = QPainter(self)
-
-        offsetX = 4
-        offsetY = 5
-    
-        painter.setPen(QColor(255, 255, 255))
-
-        # Draw symbol
-        painter.setFont(self.big_symbol_font)
-        if self.big_symbol_text:
-            painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
-            painter.setFont(self.small_symbol_font)
-            painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
-        else:
-            painter.setFont(self.small_symbol_font)
-            painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
-
-        # Draw User Text
-        painter.setFont(self.user_font)
-        user_x = self.total_width - self.user_width - 45 + offsetX
-        user_y = offsetY + 15
-        user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
-        outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
-        self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)
-
-        # Draw Assistant Text
-        painter.setFont(self.assistant_font)
-        assistant_x = self.total_width - self.assistant_width - 5  + offsetX
-        assistant_y = self.user_height + offsetY + 15
-        assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
-        outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
-        self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)
-
-    def return_text_adjusted_to_width(self, text, font, max_width_allowed):
-        """
-        Line feeds are inserted so that the text width does never exceed max_width.
-        Text is only broken up on whole words.
-        """
-        fm = QFontMetrics(font)
-        words = text.split(' ')
-        adjusted_text = ''
-        current_line = ''
-        max_width_used = 0
+            self.update()
+
+        def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
+            painter.setPen(outlineColor)
+            for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
+                        (-outline_size, -outline_size), (outline_size, -outline_size),
+                        (-outline_size, outline_size), (outline_size, outline_size)]:
+                painter.drawText(x + dx, y + dy, width, height, alignment, text)
+
+            painter.setPen(textColor)
+            painter.drawText(x, y, width, height, alignment, text)
+
+        def paintEvent(self, event):
+            painter = QPainter(self)
+
+            offsetX = 4
+            offsetY = 5
         
         
-        for word in words:
-            current_width = fm.width(current_line + word)
-            if current_width <= max_width_allowed:
-                current_line += word + ' '
+            painter.setPen(QColor(255, 255, 255))
+
+            # Draw symbol
+            painter.setFont(self.big_symbol_font)
+            if self.big_symbol_text:
+                painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
+                painter.setFont(self.small_symbol_font)
+                painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
             else:
             else:
-                line_width = fm.width(current_line)
-                if line_width > max_width_used:
-                    max_width_used = line_width
-                adjusted_text += current_line + '\n'
-                current_line = word + ' '
-        
-        line_width = fm.width(current_line)
-        if line_width > max_width_used:
-            max_width_used = line_width
-        adjusted_text += current_line 
-        return adjusted_text.rstrip(), max_width_used         
+                painter.setFont(self.small_symbol_font)
+                painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
+
+            # Draw User Text
+            painter.setFont(self.user_font)
+            user_x = self.total_width - self.user_width - 45 + offsetX
+            user_y = offsetY + 15
+            user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
+            outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
+            self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)
+
+            # Draw Assistant Text
+            painter.setFont(self.assistant_font)
+            assistant_x = self.total_width - self.assistant_width - 5  + offsetX
+            assistant_y = self.user_height + offsetY + 15
+            assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
+            outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
+            self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)
+
+        def return_text_adjusted_to_width(self, text, font, max_width_allowed):
+            """
+            Line feeds are inserted so that the text width never exceeds max_width_allowed.
+            Text is only broken up on whole words.
+            """
+            fm = QFontMetrics(font)
+            words = text.split(' ')
+            adjusted_text = ''
+            current_line = ''
+            max_width_used = 0
+            
+            for word in words:
+                current_width = fm.width(current_line + word)
+                if current_width <= max_width_allowed:
+                    current_line += word + ' '
+                else:
+                    line_width = fm.width(current_line)
+                    if line_width > max_width_used:
+                        max_width_used = line_width
+                    adjusted_text += current_line + '\n'
+                    current_line = word + ' '
+            
+            line_width = fm.width(current_line)
+            if line_width > max_width_used:
+                max_width_used = line_width
+            adjusted_text += current_line 
+            return adjusted_text.rstrip(), max_width_used         
 
 
-if __name__ == '__main__':
     app = QApplication(sys.argv)
 
 
     window = TransparentWindow()

+ 95 - 0
example_webserver/client.py

@@ -0,0 +1,95 @@
+from colorama import Fore, Back, Style
+import websockets
+import colorama
+import keyboard
+import asyncio
+import json
+import os
+
+colorama.init()
+
+SEND_START_COMMAND = False
+HOST = 'localhost:5025'
+URI = f'ws://{HOST}'
+RECONNECT_DELAY = 5  
+
+full_sentences = []
+
+def clear_console():
+    os.system('clear' if os.name == 'posix' else 'cls')
+
+def update_displayed_text(text = ""):
+    sentences_with_style = [
+        f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
+        for i, sentence in enumerate(full_sentences)
+    ]
+    text = "".join(sentences_with_style).strip() + " " + text if len(sentences_with_style) > 0 else text
+    clear_console()
+    print("CLIENT retrieved text:")
+    print()
+    print(text)
+
+async def send_start_recording(websocket):
+    command = {
+        "type": "command",
+        "content": "start-recording"
+    }
+    await websocket.send(json.dumps(command))
+
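+# Connects to the STT websocket server, optionally sends a start-recording command on
+# space bar press, prints realtime and full transcriptions and reconnects on errors.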
+async def test_client():
+    while True:
+        try:
+            async with websockets.connect(URI, ping_interval=None) as websocket:
+
+                if SEND_START_COMMAND:
+                    # New: Check for space bar press and send start-recording message
+                    async def check_space_keypress():
+                        while True:
+                            if keyboard.is_pressed('space'):
+                                print ("Space bar pressed. Sending start-recording message to server.")
+                                await send_start_recording(websocket)
+                                await asyncio.sleep(1) 
+                            await asyncio.sleep(0.02)
+                    
+                    # Start a task to monitor the space keypress
+                    print ("Press space bar to start recording.")
+                    asyncio.create_task(check_space_keypress())
+                
+                while True:
+                    message = await websocket.recv()
+                    message_obj = json.loads(message)
+                    
+                    if message_obj["type"] == "realtime":
+                        clear_console()
+                        print (message_obj["content"])
+                    elif message_obj["type"] == "full":
+                        clear_console()
+                        colored_message = Fore.YELLOW + message_obj["content"] + Style.RESET_ALL
+                        print (colored_message)
+                        print ()
+                        if SEND_START_COMMAND:
+                            print ("Press space bar to start recording.")
+                        full_sentences.append(message_obj["content"])
+                    elif message_obj["type"] == "record_start":
+                        print ("recording started.")
+                    elif message_obj["type"] == "vad_start":
+                        print ("vad started.")
+                    elif message_obj["type"] == "wakeword_start":
+                        print ("wakeword started.")
+                    elif message_obj["type"] == "transcript_start":
+                        print ("transcript started.")
+
+                    else:
+                        print (f"Unknown message: {message_obj}")
+                    
+        except websockets.ConnectionClosed:
+            print("Connection with server closed. Reconnecting in", RECONNECT_DELAY, "seconds...")
+            await asyncio.sleep(RECONNECT_DELAY)
+        except KeyboardInterrupt:
+            print("Gracefully shutting down the client.")
+            break
+        except Exception as e:
+            print(f"An error occurred: {e}. Reconnecting in", RECONNECT_DELAY, "seconds...")
+            await asyncio.sleep(RECONNECT_DELAY)    
+
+asyncio.run(test_client())

+ 181 - 0
example_webserver/server.py

@@ -0,0 +1,181 @@
+WAIT_FOR_START_COMMAND = False
+
+if __name__ == '__main__':
+    server = "localhost"
+    port = 5025
+
+    print (f"STT speech to text server")
+    print (f"runs on http://{server}:{port}")
+    print ()
+    print ("starting")
+    print ("└─ ... ", end='', flush=True)
+
+    from RealtimeSTT import AudioToTextRecorder
+    from colorama import Fore, Back, Style
+    import websockets
+    import threading
+    import colorama
+    import asyncio
+    import shutil
+    import queue
+    import json
+    import time
+    import os
+
+    colorama.init()
+
+    first_chunk = True
+    full_sentences = []
+    displayed_text = ""
+    message_queue = queue.Queue() 
+    start_recording_event = threading.Event()
+    start_transcription_event = threading.Event()
+    connected_clients = set()
+
+    def clear_console():
+        os.system('clear' if os.name == 'posix' else 'cls')
+
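+    # Per-client websocket handler: registers the client for broadcasts and listens
+    # for a "start-recording" command.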
+    async def handler(websocket, path):
+
+        print ("\r└─ OK")
+        if WAIT_FOR_START_COMMAND:
+            print("waiting for start command")
+            print ("└─ ... ", end='', flush=True)
+
+        connected_clients.add(websocket)
+
+        try:
+            while True:
+                async for message in websocket:
+                    data = json.loads(message)
+                    if data.get("type") == "command" and data.get("content") == "start-recording":
+                        print ("\r└─ OK")
+                        start_recording_event.set() 
+
+        except json.JSONDecodeError:
+            print (Fore.RED + "STT Received an invalid JSON message." + Style.RESET_ALL)
+        except websockets.ConnectionClosedError:
+            print (Fore.RED + "connection closed unexpectedly by the client" + Style.RESET_ALL)
+        except websockets.exceptions.ConnectionClosedOK:
+            print("connection closed.")
+        finally:
+
+            print("client disconnected")
+            connected_clients.remove(websocket)
+            print ("waiting for clients")
+            print ("└─ ... ", end='', flush=True)
+
+
+    def add_message_to_queue(type: str, content):
+        message = {
+            "type": type,
+            "content": content
+        }
+        message_queue.put(message)    
+
+    def fill_cli_line(text):
+        columns, _ = shutil.get_terminal_size()
+        return text.ljust(columns)[-columns:]
+
+    def text_detected(text):
+        global displayed_text, first_chunk
+
+        if text != displayed_text:
+            first_chunk = False
+            displayed_text = text
+            add_message_to_queue("realtime", text)
+
+            message = fill_cli_line(text)
+
+            message ="└─ " + Fore.CYAN + message[:-3] + Style.RESET_ALL
+            print(f"\r{message}", end='', flush=True)
+
+
+    async def broadcast(message_obj):
+        if connected_clients:
+            for client in connected_clients:
+                await client.send(json.dumps(message_obj))
+
+    async def send_handler():
+        while True:
+            while not message_queue.empty():
+                message = message_queue.get()
+                await broadcast(message)
+            await asyncio.sleep(0.02)
+
+    def recording_started():
+        add_message_to_queue("record_start", "")
+
+    def vad_detect_started():
+        add_message_to_queue("vad_start", "")
+
+    def wakeword_detect_started():
+        add_message_to_queue("wakeword_start", "")
+
+    def transcription_started():
+        add_message_to_queue("transcript_start", "")
+
+    recorder_config = {
+        'spinner': False,
+        'model': 'small.en',
+        'language': 'en',
+        'silero_sensitivity': 0.01,
+        'webrtc_sensitivity': 3,
+        'silero_use_onnx': False,
+        'post_speech_silence_duration': 1.2,
+        'min_length_of_recording': 0.2,
+        'min_gap_between_recordings': 0,
+        'enable_realtime_transcription': True,
+        'realtime_processing_pause': 0,
+        'realtime_model_type': 'tiny.en',
+        'on_realtime_transcription_stabilized': text_detected,
+        'on_recording_start' : recording_started,
+        'on_vad_detect_start' : vad_detect_started,
+        'on_wakeword_detection_start' : wakeword_detect_started,
+        'on_transcription_start' : transcription_started,
+    }
+
+    recorder = AudioToTextRecorder(**recorder_config)
+
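+    # recorder_thread waits for a finished utterance (recorder.wait_audio()), then
+    # transcriber_thread runs the transcription and queues the sentence for broadcast.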
+    def transcriber_thread():
+        while True:
+            start_transcription_event.wait()
+            text = "└─ transcribing ... "
+            text = fill_cli_line(text)
+            print (f"\r{text}", end='', flush=True)
+            sentence = recorder.transcribe()
+            print (Style.RESET_ALL + "\r└─ " + Fore.YELLOW + sentence + Style.RESET_ALL)
+            add_message_to_queue("full", sentence)
+            start_transcription_event.clear()
+            if WAIT_FOR_START_COMMAND:
+                print("waiting for start command")
+                print ("└─ ... ", end='', flush=True)
+
+    def recorder_thread():
+        global first_chunk
+        while True:
+            if not len(connected_clients) > 0:
+                time.sleep(0.1)
+                continue
+            first_chunk = True
+            if WAIT_FOR_START_COMMAND:
+                start_recording_event.wait() 
+            print("waiting for sentence")
+            print ("└─ ... ", end='', flush=True)
+            recorder.wait_audio()
+            start_transcription_event.set()
+            start_recording_event.clear()
+
+    threading.Thread(target=recorder_thread, daemon=True).start()
+    threading.Thread(target=transcriber_thread, daemon=True).start()
+
+    start_server = websockets.serve(handler, server, port)
+    loop = asyncio.get_event_loop()
+
+    print ("\r└─ OK")
+    print ("waiting for clients")
+    print ("└─ ... ", end='', flush=True)
+
+    loop.run_until_complete(start_server)
+    loop.create_task(send_handler())
+    loop.run_forever()

+ 104 - 100
tests/realtime_loop_test.py

@@ -8,110 +8,114 @@ import os
 from RealtimeTTS import TextToAudioStream, AzureEngine
 from RealtimeSTT import AudioToTextRecorder
 
 
+if __name__ == '__main__':
 
 
-class SimpleApp(QWidget):
-
-    update_stt_text_signal = pyqtSignal(str)
-    update_tts_text_signal = pyqtSignal(str)
-
-    def __init__(self):
-        super().__init__()
-
-        layout = QVBoxLayout()
-
-        font = QFont()
-        font.setPointSize(18)
-
-        self.input_text = QTextEdit(self)
-        self.input_text.setFont(font)
-        self.input_text.setPlaceholderText("Input")
-        self.input_text.setMinimumHeight(100) 
-        layout.addWidget(self.input_text)
-
-        self.button_speak_input = QPushButton("Speak and detect input text", self)
-        self.button_speak_input.setFont(font)        
-        self.button_speak_input.clicked.connect(self.speak_input)
-        layout.addWidget(self.button_speak_input)
-
-        self.tts_text = QTextEdit(self)
-        self.tts_text.setFont(font)
-        self.tts_text.setPlaceholderText("STT (final)")
-        self.tts_text.setMinimumHeight(100) 
-        self.tts_text.setReadOnly(True)
-        layout.addWidget(self.tts_text)
-
-        self.stt_text = QTextEdit(self)
-        self.stt_text.setFont(font)
-        self.stt_text.setPlaceholderText("STT (realtime)")
-        self.stt_text.setMinimumHeight(100) 
-        layout.addWidget(self.stt_text)
-
-        self.button_speak_stt = QPushButton("Speak detected text again", self)
-        self.button_speak_stt.setFont(font)        
-        self.button_speak_stt.clicked.connect(self.speak_stt)
-        layout.addWidget(self.button_speak_stt)
-
-        self.setLayout(layout)
-        self.setWindowTitle("Realtime TTS/STT Loop Test")
-        self.resize(800, 600)
-
-        self.update_stt_text_signal.connect(self.actual_update_stt_text)
-        self.update_tts_text_signal.connect(self.actual_update_tts_text)
-
-        self.stream = TextToAudioStream(AzureEngine(os.environ.get("AZURE_SPEECH_KEY"), "germanywestcentral"), on_audio_stream_stop=self.audio_stream_stop)
-
-        recorder_config = {
-            'spinner': False,
-            'model': 'large-v2',
-            'language': 'en',
-            'silero_sensitivity': 0.01,
-            'webrtc_sensitivity': 3,
-            'post_speech_silence_duration': 0.01,
-            'min_length_of_recording': 0.2,
-            'min_gap_between_recordings': 0,
-            'enable_realtime_transcription': True,
-            'realtime_processing_pause': 0,
-            'realtime_model_type': 'small.en',
-            'on_realtime_transcription_stabilized': self.text_detected,
-        }
-
-        self.recorder = AudioToTextRecorder(**recorder_config)
-
-    def speak_stt(self):
-        text = self.stt_text.toPlainText()
-        self.speak(text)
-
-    def speak_input(self):
-        text = self.input_text.toPlainText()
-        self.speak(text)
-
-    def text_detected(self, text):
-        self.update_stt_text_signal.emit(text)
-
-    def audio_stream_stop(self):
-        self.stream.stop()
-        self.recorder.stop()
-        detected_text = self.recorder.text()
-        self.update_stt_text_signal.emit(detected_text)
-        self.update_tts_text_signal.emit(detected_text)
-
-    def speak(self, text):
-        self.stt_text.clear()        
-        self.stream.feed(text)
-
-        self.recorder.start()
-        self.stream.play_async()
-
-    def actual_update_stt_text(self, text):
-        self.stt_text.setText(text)
-
-    def actual_update_tts_text(self, text):
-        self.tts_text.setText(text)        
+    class SimpleApp(QWidget):
+
+        update_stt_text_signal = pyqtSignal(str)
+        update_tts_text_signal = pyqtSignal(str)
+
+        def __init__(self):
+            super().__init__()
+
+            layout = QVBoxLayout()
+
+            font = QFont()
+            font.setPointSize(18)
+
+            self.input_text = QTextEdit(self)
+            self.input_text.setFont(font)
+            self.input_text.setPlaceholderText("Input")
+            self.input_text.setMinimumHeight(100) 
+            layout.addWidget(self.input_text)
+
+            self.button_speak_input = QPushButton("Speak and detect input text", self)
+            self.button_speak_input.setFont(font)        
+            self.button_speak_input.clicked.connect(self.speak_input)
+            layout.addWidget(self.button_speak_input)
+
+            self.tts_text = QTextEdit(self)
+            self.tts_text.setFont(font)
+            self.tts_text.setPlaceholderText("STT (final)")
+            self.tts_text.setMinimumHeight(100) 
+            self.tts_text.setReadOnly(True)
+            layout.addWidget(self.tts_text)
+
+            self.stt_text = QTextEdit(self)
+            self.stt_text.setFont(font)
+            self.stt_text.setPlaceholderText("STT (realtime)")
+            self.stt_text.setMinimumHeight(100) 
+            layout.addWidget(self.stt_text)
+
+            self.button_speak_stt = QPushButton("Speak detected text again", self)
+            self.button_speak_stt.setFont(font)        
+            self.button_speak_stt.clicked.connect(self.speak_stt)
+            layout.addWidget(self.button_speak_stt)
+
+            self.setLayout(layout)
+            self.setWindowTitle("Realtime TTS/STT Loop Test")
+            self.resize(800, 600)
+
+            self.update_stt_text_signal.connect(self.actual_update_stt_text)
+            self.update_tts_text_signal.connect(self.actual_update_tts_text)
+
+            self.stream = TextToAudioStream(AzureEngine(os.environ.get("AZURE_SPEECH_KEY"), "germanywestcentral"), on_audio_stream_stop=self.audio_stream_stop)
+
+            recorder_config = {
+                'spinner': False,
+                'model': 'large-v2',
+                'language': 'en',
+                'silero_sensitivity': 0.01,
+                'webrtc_sensitivity': 3,
+                'post_speech_silence_duration': 0.01,
+                'min_length_of_recording': 0.2,
+                'min_gap_between_recordings': 0,
+                'enable_realtime_transcription': True,
+                'realtime_processing_pause': 0,
+                'realtime_model_type': 'small.en',
+                'on_realtime_transcription_stabilized': self.text_detected,
+            }
+
+            self.recorder = AudioToTextRecorder(**recorder_config)
+
+        def speak_stt(self):
+            text = self.stt_text.toPlainText()
+            self.speak(text)
+
+        def speak_input(self):
+            text = self.input_text.toPlainText()
+            self.speak(text)
+
+        def text_detected(self, text):
+            self.update_stt_text_signal.emit(text)
+
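+        # Called when TTS playback ends: stop recording and display the final transcription.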
+        def audio_stream_stop(self):
+            self.stream.stop()
+            self.recorder.stop()
+            detected_text = self.recorder.text()
+            self.update_stt_text_signal.emit(detected_text)
+            self.update_tts_text_signal.emit(detected_text)
+
+        def speak(self, text):
+            self.stt_text.clear()        
+            self.stream.feed(text)
+
+            self.recorder.start()
+            self.stream.play_async()
+
+        def actual_update_stt_text(self, text):
+            self.stt_text.setText(text)
+
+        def actual_update_tts_text(self, text):
+            self.tts_text.setText(text)
+
+        def closeEvent(self, event):
+            if self.recorder:
+                self.recorder.shutdown()
 
 
-if __name__ == '__main__':
     app = QApplication(sys.argv)
 
 
     window = SimpleApp()
     window.show()
 
 
-    sys.exit(app.exec_())
+    sys.exit(app.exec_())

+ 0 - 1
tests/simple_test.py

@@ -1,5 +1,4 @@
 from RealtimeSTT import AudioToTextRecorder
-
 if __name__ == '__main__':
     recorder = AudioToTextRecorder(spinner=False)
 
 

+ 11 - 7
tests/wakeword_test.py

@@ -1,12 +1,16 @@
 from RealtimeSTT import AudioToTextRecorder
+import logging
 
 
-def recording_started():
-    print("Speak now...")
+if __name__ == '__main__':
 
 
-def recording_finished():
-    print("Speech end detected... transcribing...")
+    def recording_started():
+        print("Speak now...")
 
 
-recorder = AudioToTextRecorder(spinner=False, model="small.en", language="en", wake_words="jarvis", on_wakeword_detected=recording_started, on_recording_stop=recording_finished)
+    def recording_finished():
+        print("Speech end detected... transcribing...")
 
 
-print('Say "Jarvis" then speak.')
-print(recorder.text())
+    with AudioToTextRecorder(spinner=False,
+                             level=logging.DEBUG,
+                             model="small.en",
+                             language="en",
+                             wake_words="jarvis",
+                             on_wakeword_detected=recording_started,
+                             on_recording_stop=recording_finished) as recorder:
+        print('Say "Jarvis" then speak.')
+        print(recorder.text())
+        print("Done. Now we should exit.")