Selaa lähdekoodia

Merge pull request #125 from KoljaB/dev

Dev
Kolja Beigel 7 kuukautta sitten
vanhempi
commit
a92e433e34
4 muutettua tiedostoa jossa 351 lisäystä ja 250 poistoa
  1. 1 1
      README.md
  2. 300 221
      RealtimeSTT/audio_recorder.py
  3. 1 1
      setup.py
  4. 49 27
      tests/realtimestt_test.py

+ 1 - 1
README.md

@@ -22,7 +22,7 @@ https://github.com/user-attachments/assets/797e6552-27cd-41b1-a7f3-e5cbc72094f5
 
 ### Updates
 
-Latest Version: v0.2.5
+Latest Version: v0.3.0
 
 See [release history](https://github.com/KoljaB/RealtimeSTT/releases).
 

+ 300 - 221
RealtimeSTT/audio_recorder.py

@@ -85,6 +85,99 @@ if platform.system() != 'Darwin':
     INIT_HANDLE_BUFFER_OVERFLOW = True
 
 
class TranscriptionWorker:
    """Runs the faster_whisper main transcription model in a child process.

    Receives ``(audio, language)`` jobs over a multiprocessing connection,
    transcribes them, and replies on the same connection with either
    ``('success', (text, info))`` or ``('error', message)``.  While running,
    the built-in ``print`` is redirected through ``stdout_pipe`` so the
    parent process can capture worker output.
    """

    def __init__(self, conn, stdout_pipe, model_path, compute_type, gpu_device_index, device,
                 ready_event, shutdown_event, interrupt_stop_event, beam_size, initial_prompt, suppress_tokens):
        # conn: duplex Connection — receives audio jobs, sends back results.
        self.conn = conn
        # stdout_pipe: Connection that receives everything print() emits.
        self.stdout_pipe = stdout_pipe
        self.model_path = model_path
        self.compute_type = compute_type
        self.gpu_device_index = gpu_device_index
        self.device = device
        self.ready_event = ready_event              # set once the model loaded
        self.shutdown_event = shutdown_event        # set to stop run()/polling
        self.interrupt_stop_event = interrupt_stop_event  # set on Ctrl-C
        self.beam_size = beam_size
        self.initial_prompt = initial_prompt
        self.suppress_tokens = suppress_tokens
        # Local queue decouples pipe polling from (slow) transcription.
        self.queue = queue.Queue()

    def custom_print(self, *args, **kwargs):
        """print() replacement that forwards text through stdout_pipe.

        Keyword arguments (sep/end/file/flush) are accepted for signature
        compatibility but ignored; positional args are joined with spaces.
        Pipe errors are swallowed because the parent may already have closed
        its end during shutdown.
        """
        message = ' '.join(map(str, args))
        try:
            self.stdout_pipe.send(message)
        except (BrokenPipeError, EOFError, OSError):
            pass

    def poll_connection(self):
        """Background thread body: drain conn into the queue until shutdown."""
        while not self.shutdown_event.is_set():
            # Short poll timeout keeps shutdown latency low.
            if self.conn.poll(0.01):
                try:
                    data = self.conn.recv()
                    self.queue.put(data)
                except Exception as e:
                    logging.error(f"Error receiving data from connection: {e}")
            else:
                time.sleep(TIME_SLEEP)

    def run(self):
        """Process entry point: load the model, then serve transcription jobs
        until ``shutdown_event`` is set.

        Raises:
            Exception: re-raised if the faster_whisper model fails to load.
        """
        # SIGINT is handled by the parent process; ignore it here so Ctrl-C
        # cannot kill the worker before the parent shuts it down cleanly.
        system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)

        # Redirect print() through stdout_pipe.  Keep a reference to the
        # original so it can actually be restored in the finally block —
        # looking up the name `print` after the override would just return
        # custom_print again, making the "restore" a no-op.
        # NOTE(review): this indexes __builtins__ as a dict, which holds for
        # imported modules; in __main__ it is the builtins module — confirm
        # this module is never executed as __main__.
        original_print = __builtins__['print']
        __builtins__['print'] = self.custom_print

        logging.info(f"Initializing faster_whisper main transcription model {self.model_path}")

        try:
            model = faster_whisper.WhisperModel(
                model_size_or_path=self.model_path,
                device=self.device,
                compute_type=self.compute_type,
                device_index=self.gpu_device_index,
            )
        except Exception as e:
            logging.exception(f"Error initializing main faster_whisper transcription model: {e}")
            raise

        self.ready_event.set()
        logging.debug("Faster_whisper main speech to text transcription model initialized successfully")

        # Drain incoming jobs on a separate thread so a long transcription
        # never blocks the pipe.
        polling_thread = threading.Thread(target=self.poll_connection)
        polling_thread.start()

        try:
            while not self.shutdown_event.is_set():
                try:
                    audio, language = self.queue.get(timeout=0.1)
                    try:
                        segments, info = model.transcribe(
                            audio,
                            language=language if language else None,
                            beam_size=self.beam_size,
                            initial_prompt=self.initial_prompt,
                            suppress_tokens=self.suppress_tokens
                        )
                        transcription = " ".join(seg.text for seg in segments).strip()
                        logging.debug(f"Final text detected with main model: {transcription}")
                        self.conn.send(('success', (transcription, info)))
                    except Exception as e:
                        logging.error(f"General error in transcription: {e}")
                        self.conn.send(('error', str(e)))
                except queue.Empty:
                    continue
                except KeyboardInterrupt:
                    self.interrupt_stop_event.set()
                    logging.debug("Transcription worker process finished due to KeyboardInterrupt")
                    break
                except Exception as e:
                    logging.error(f"General error in processing queue item: {e}")
        finally:
            __builtins__['print'] = original_print  # restore the real print
            self.conn.close()
            self.stdout_pipe.close()
            self.shutdown_event.set()  # ensure the polling thread will stop
            polling_thread.join()      # wait for it before the process exits
+
 class AudioToTextRecorder:
     """
     A class responsible for capturing audio from the microphone, detecting
@@ -163,7 +256,8 @@ class AudioToTextRecorder:
                  print_transcription_time: bool = False,
                  early_transcription_on_silence: int = 0,
                  allowed_latency_limit: int = ALLOWED_LATENCY_LIMIT,
-                 no_log_file: bool = False
+                 no_log_file: bool = False,
+                 use_extended_logging: bool = False,
                  ):
         """
         Initializes an audio recorder and  transcription
@@ -360,6 +454,9 @@ class AudioToTextRecorder:
         - allowed_latency_limit (int, default=100): Maximal amount of chunks
             that can be unprocessed in queue before discarding chunks.
         - no_log_file (bool, default=False): Skips writing of debug log file.
+        - use_extended_logging (bool, default=False): Writes extensive
+            log messages for the recording worker, that processes the audio
+            chunks.
 
         Raises:
             Exception: Errors related to initializing transcription
@@ -450,9 +547,11 @@ class AudioToTextRecorder:
         self.detected_realtime_language = None
         self.detected_realtime_language_probability = 0
         self.transcription_lock = threading.Lock()
+        self.shutdown_lock = threading.Lock()
         self.transcribe_count = 0
         self.print_transcription_time = print_transcription_time
         self.early_transcription_on_silence = early_transcription_on_silence
+        self.use_extended_logging = use_extended_logging
 
         # Initialize the logging configuration with the specified level
         log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
@@ -758,136 +857,9 @@ class AudioToTextRecorder:
                 break 
             time.sleep(0.1)
 
-    @staticmethod
-    def _transcription_worker(conn,
-                              stdout_pipe,
-                              model_path,
-                              compute_type,
-                              gpu_device_index,
-                              device,
-                              ready_event,
-                              shutdown_event,
-                              interrupt_stop_event,
-                              beam_size,
-                              initial_prompt,
-                              suppress_tokens
-                              ):
-        """
-        Worker method that handles the continuous
-        process of transcribing audio data.
-
-        This method runs in a separate process and is responsible for:
-        - Initializing the `faster_whisper` model used for transcription.
-        - Receiving audio data sent through a pipe and using the model
-          to transcribe it.
-        - Sending transcription results back through the pipe.
-        - Continuously checking for a shutdown event to gracefully
-          terminate the transcription process.
-
-        Args:
-            conn (multiprocessing.Connection): The connection endpoint used
-              for receiving audio data and sending transcription results.
-            model_path (str): The path to the pre-trained faster_whisper model
-              for transcription.
-            compute_type (str): Specifies the type of computation to be used
-                for transcription.
-            gpu_device_index (int): Device ID to use.
-            device (str): Device for model to use.
-            ready_event (threading.Event): An event that is set when the
-              transcription model is successfully initialized and ready.
-            shutdown_event (threading.Event): An event that, when set,
-              signals this worker method to terminate.
-            interrupt_stop_event (threading.Event): An event that, when set,
-                signals this worker method to stop processing audio data.
-            beam_size (int): The beam size to use for beam search decoding.
-            initial_prompt (str or iterable of int): Initial prompt to be fed
-                to the transcription model.
-            suppress_tokens (list of int): Tokens to be suppressed from the
-                transcription output.
-        Raises:
-            Exception: If there is an error while initializing the
-            transcription model.
-        """
-
-        system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)
-
-        def custom_print(*args, **kwargs):
-            message = ' '.join(map(str, args))
-            try:
-                stdout_pipe.send(message)
-            except (BrokenPipeError, EOFError, OSError):
-                # The pipe probably has been closed, so we ignore the error
-                pass
-
-        # Replace the built-in print function with our custom one
-        __builtins__['print'] = custom_print
-
-        logging.info("Initializing faster_whisper "
-                     f"main transcription model {model_path}"
-                     )
-
-        try:
-            model = faster_whisper.WhisperModel(
-                model_size_or_path=model_path,
-                device=device,
-                compute_type=compute_type,
-                device_index=gpu_device_index,
-            )
-
-        except Exception as e:
-            logging.exception("Error initializing main "
-                              f"faster_whisper transcription model: {e}"
-                              )
-            raise
-
-        ready_event.set()
-
-        logging.debug("Faster_whisper main speech to text "
-                      "transcription model initialized successfully"
-                      )
-
-        try:
-            while not shutdown_event.is_set():
-                try:
-                    if conn.poll(0.01):
-                        logging.debug("Receive from _transcription_worker  pipe")
-                        audio, language = conn.recv()
-                        try:
-                            segments, info = model.transcribe(
-                                audio,
-                                language=language if language else None,
-                                beam_size=beam_size,
-                                initial_prompt=initial_prompt,
-                                suppress_tokens=suppress_tokens
-                            )
-                            transcription = " ".join(seg.text for seg in segments)
-                            transcription = transcription.strip()
-                            logging.debug(f"Final text detected with main model: {transcription}")
-                            conn.send(('success', (transcription, info)))
-                        except Exception as e:
-                            logging.error(f"General error in _transcription_worker in transcription: {e}")
-                            conn.send(('error', str(e)))
-                    else:
-                        time.sleep(TIME_SLEEP)
-
-
-
-                except KeyboardInterrupt:
-                    interrupt_stop_event.set()
-                    
-                    logging.debug("Transcription worker process "
-                                    "finished due to KeyboardInterrupt"
-                                    )
-                    stdout_pipe.close()
-                    break
-
-                except Exception as e:
-                    logging.error(f"General error in _transcription_worker in accessing pipe: {e}")
-
-        finally:
-            __builtins__['print'] = print  # Restore the original print function            
-            conn.close()
-            stdout_pipe.close()
+    def _transcription_worker(*args, **kwargs):
+        worker = TranscriptionWorker(*args, **kwargs)
+        worker.run()
 
     @staticmethod
     def _audio_data_worker(audio_queue,
@@ -1342,46 +1314,6 @@ class AudioToTextRecorder:
         else:
             return self.transcribe()
 
-    # def text(self,
-    #          on_transcription_finished=None,
-    #          ):
-    #     """
-    #     Transcribes audio captured by this class instance
-    #     using the `faster_whisper` model.
-
-    #     - Automatically starts recording upon voice activity if not manually
-    #       started using `recorder.start()`.
-    #     - Automatically stops recording upon voice deactivity if not manually
-    #       stopped with `recorder.stop()`.
-    #     - Processes the recorded audio to generate transcription.
-
-    #     Args:
-    #         on_transcription_finished (callable, optional): Callback function
-    #           to be executed when transcription is ready.
-    #         If provided, transcription will be performed asynchronously, and
-    #           the callback will receive the transcription as its argument.
-    #           If omitted, the transcription will be performed synchronously,
-    #           and the result will be returned.
-
-    #     Returns (if not callback is set):
-    #         str: The transcription of the recorded audio
-    #     """
-
-    #     self.interrupt_stop_event.clear()
-    #     self.was_interrupted.clear()
-
-    #     self.wait_audio()
-
-    #     if self.is_shut_down or self.interrupt_stop_event.is_set():
-    #         if self.interrupt_stop_event.is_set():
-    #             self.was_interrupted.set()
-    #         return ""
-
-    #     if on_transcription_finished:
-    #         threading.Thread(target=on_transcription_finished,
-    #                          args=(self.transcribe(),)).start()
-    #     else:
-    #         return self.transcribe()
 
     def start(self):
         """
@@ -1498,54 +1430,58 @@ class AudioToTextRecorder:
         recording worker and closing the audio stream.
         """
 
-        print("RealtimeSTT shutting down")
-        logging.debug("RealtimeSTT shutting down")
+        with self.shutdown_lock:
+            if self.is_shut_down:
+                return
 
-        # Force wait_audio() and text() to exit
-        self.is_shut_down = True
-        self.start_recording_event.set()
-        self.stop_recording_event.set()
+            print("\033[91mRealtimeSTT shutting down\033[0m")
+            # logging.debug("RealtimeSTT shutting down")
 
-        self.shutdown_event.set()
-        self.is_recording = False
-        self.is_running = False
+            # Force wait_audio() and text() to exit
+            self.is_shut_down = True
+            self.start_recording_event.set()
+            self.stop_recording_event.set()
 
-        logging.debug('Finishing recording thread')
-        if self.recording_thread:
-            self.recording_thread.join()
+            self.shutdown_event.set()
+            self.is_recording = False
+            self.is_running = False
 
-        logging.debug('Terminating reader process')
+            logging.debug('Finishing recording thread')
+            if self.recording_thread:
+                self.recording_thread.join()
 
-        # Give it some time to finish the loop and cleanup.
-        if self.use_microphone:
-            self.reader_process.join(timeout=10)
+            logging.debug('Terminating reader process')
 
-        if self.reader_process.is_alive():
-            logging.warning("Reader process did not terminate "
-                            "in time. Terminating forcefully."
-                            )
-            self.reader_process.terminate()
+            # Give it some time to finish the loop and cleanup.
+            if self.use_microphone:
+                self.reader_process.join(timeout=10)
+
+            if self.reader_process.is_alive():
+                logging.warning("Reader process did not terminate "
+                                "in time. Terminating forcefully."
+                                )
+                self.reader_process.terminate()
 
-        logging.debug('Terminating transcription process')
-        self.transcript_process.join(timeout=10)
+            logging.debug('Terminating transcription process')
+            self.transcript_process.join(timeout=10)
 
-        if self.transcript_process.is_alive():
-            logging.warning("Transcript process did not terminate "
-                            "in time. Terminating forcefully."
-                            )
-            self.transcript_process.terminate()
+            if self.transcript_process.is_alive():
+                logging.warning("Transcript process did not terminate "
+                                "in time. Terminating forcefully."
+                                )
+                self.transcript_process.terminate()
 
-        self.parent_transcription_pipe.close()
+            self.parent_transcription_pipe.close()
 
-        logging.debug('Finishing realtime thread')
-        if self.realtime_thread:
-            self.realtime_thread.join()
+            logging.debug('Finishing realtime thread')
+            if self.realtime_thread:
+                self.realtime_thread.join()
 
-        if self.enable_realtime_transcription:
-            if self.realtime_model_type:
-                del self.realtime_model_type
-                self.realtime_model_type = None
-        gc.collect()
+            if self.enable_realtime_transcription:
+                if self.realtime_model_type:
+                    del self.realtime_model_type
+                    self.realtime_model_type = None
+            gc.collect()
 
     def _recording_worker(self):
         """
@@ -1553,9 +1489,13 @@ class AudioToTextRecorder:
         input for voice activity and accordingly starts/stops the recording.
         """
 
-        logging.debug('Starting recording worker')
+        if self.use_extended_logging:
+            logging.debug('Debug: Entering try block')
 
+        last_inner_try_time = 0
         try:
+            if self.use_extended_logging:
+                logging.debug('Debug: Initializing variables')
             time_since_last_buffer_message = 0
             was_recording = False
             delay_was_passed = False
@@ -1563,32 +1503,60 @@ class AudioToTextRecorder:
             wakeword_samples_to_remove = None
             self.allowed_to_early_transcribe = True
 
+            if self.use_extended_logging:
+                logging.debug('Debug: Starting main loop')
             # Continuously monitor audio for voice activity
             while self.is_running:
 
+                if self.use_extended_logging:
+                    logging.debug('Debug: Entering inner try block')
+                if last_inner_try_time:
+                    last_processing_time = time.time() - last_inner_try_time
+                    if last_processing_time > 0.1:
+                        if self.use_extended_logging:
+                            logging.warning('### WARNING: PROCESSING TOOK TOO LONG')
+                last_inner_try_time = time.time()
                 try:
+                    if self.use_extended_logging:
+                        logging.debug('Debug: Trying to get data from audio queue')
                     try:
-                        data = self.audio_queue.get(timeout=0.1)
+                        data = self.audio_queue.get(timeout=0.01)
                     except queue.Empty:
+                        if self.use_extended_logging:
+                            logging.debug('Debug: Queue is empty, checking if still running')
                         if not self.is_running:
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Not running, breaking loop')
                             break
+                        if self.use_extended_logging:
+                            logging.debug('Debug: Continuing to next iteration')
                         continue
 
+                    if self.use_extended_logging:
+                        logging.debug('Debug: Checking for on_recorded_chunk callback')
                     if self.on_recorded_chunk:
+                        if self.use_extended_logging:
+                            logging.debug('Debug: Calling on_recorded_chunk')
                         self.on_recorded_chunk(data)
 
+                    if self.use_extended_logging:
+                        logging.debug('Debug: Checking if handle_buffer_overflow is True')
                     if self.handle_buffer_overflow:
+                        if self.use_extended_logging:
+                            logging.debug('Debug: Handling buffer overflow')
                         # Handle queue overflow
                         if (self.audio_queue.qsize() >
                                 self.allowed_latency_limit):
-                            logging.warning("!!! ### !!! ### !!!")
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Queue size exceeds limit, logging warnings')
                             logging.warning("Audio queue size exceeds "
                                             "latency limit. Current size: "
                                             f"{self.audio_queue.qsize()}. "
                                             "Discarding old audio chunks."
                                             )
-                            logging.warning("!!! ### !!! ### !!!")
 
+                        if self.use_extended_logging:
+                            logging.debug('Debug: Discarding old chunks if necessary')
                         while (self.audio_queue.qsize() >
                                 self.allowed_latency_limit):
 
@@ -1599,99 +1567,150 @@ class AudioToTextRecorder:
                     self.is_running = False
                     break
 
+                if self.use_extended_logging:
+                    logging.debug('Debug: Updating time_since_last_buffer_message')
                 # Feed the extracted data to the audio_queue
                 if time_since_last_buffer_message:
                     time_passed = time.time() - time_since_last_buffer_message
                     if time_passed > 1:
-                        logging.debug("_recording_worker processing audio data")
+                        if self.use_extended_logging:
+                            logging.debug("_recording_worker processing audio data")
                         time_since_last_buffer_message = time.time()
                 else:
                     time_since_last_buffer_message = time.time()
 
+                if self.use_extended_logging:
+                    logging.debug('Debug: Initializing failed_stop_attempt')
                 failed_stop_attempt = False
 
+                if self.use_extended_logging:
+                    logging.debug('Debug: Checking if not recording')
                 if not self.is_recording:
+                    if self.use_extended_logging:
+                        logging.debug('Debug: Handling not recording state')
                     # Handle not recording state
                     time_since_listen_start = (time.time() - self.listen_start
-                                               if self.listen_start else 0)
+                                            if self.listen_start else 0)
 
                     wake_word_activation_delay_passed = (
                         time_since_listen_start >
                         self.wake_word_activation_delay
                     )
 
+                    if self.use_extended_logging:
+                        logging.debug('Debug: Handling wake-word timeout callback')
                     # Handle wake-word timeout callback
                     if wake_word_activation_delay_passed \
                             and not delay_was_passed:
 
                         if self.use_wake_words and self.wake_word_activation_delay:
                             if self.on_wakeword_timeout:
+                                if self.use_extended_logging:
+                                    logging.debug('Debug: Calling on_wakeword_timeout')
                                 self.on_wakeword_timeout()
                     delay_was_passed = wake_word_activation_delay_passed
 
+                    if self.use_extended_logging:
+                        logging.debug('Debug: Setting state and spinner text')
                     # Set state and spinner text
                     if not self.recording_stop_time:
                         if self.use_wake_words \
                                 and wake_word_activation_delay_passed \
                                 and not self.wakeword_detected:
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Setting state to "wakeword"')
                             self._set_state("wakeword")
                         else:
                             if self.listen_start:
+                                if self.use_extended_logging:
+                                    logging.debug('Debug: Setting state to "listening"')
                                 self._set_state("listening")
                             else:
+                                if self.use_extended_logging:
+                                    logging.debug('Debug: Setting state to "inactive"')
                                 self._set_state("inactive")
 
+                    if self.use_extended_logging:
+                        logging.debug('Debug: Checking wake word conditions')
                     if self.use_wake_words and wake_word_activation_delay_passed:
                         try:
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Processing wakeword')
                             wakeword_index = self._process_wakeword(data)
 
                         except struct.error:
                             logging.error("Error unpacking audio data "
-                                          "for wake word processing.")
+                                        "for wake word processing.")
                             continue
 
                         except Exception as e:
                             logging.error(f"Wake word processing error: {e}")
                             continue
 
+                        if self.use_extended_logging:
+                            logging.debug('Debug: Checking if wake word detected')
                         # If a wake word is detected                        
                         if wakeword_index >= 0:
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Wake word detected, updating variables')
                             self.wake_word_detect_time = time.time()
                             wakeword_detected_time = time.time()
                             wakeword_samples_to_remove = int(self.sample_rate * self.wake_word_buffer_duration)
                             self.wakeword_detected = True
                             if self.on_wakeword_detected:
+                                if self.use_extended_logging:
+                                    logging.debug('Debug: Calling on_wakeword_detected')
                                 self.on_wakeword_detected()
 
+                    if self.use_extended_logging:
+                        logging.debug('Debug: Checking voice activity conditions')
                     # Check for voice activity to
                     # trigger the start of recording
                     if ((not self.use_wake_words
-                         or not wake_word_activation_delay_passed)
+                        or not wake_word_activation_delay_passed)
                             and self.start_recording_on_voice_activity) \
                             or self.wakeword_detected:
 
+                        if self.use_extended_logging:
+                            logging.debug('Debug: Checking if voice is active')
                         if self._is_voice_active():
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Voice activity detected')
                             logging.info("voice activity detected")
 
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Starting recording')
                             self.start()
 
                             self.start_recording_on_voice_activity = False
 
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Adding buffered audio to frames')
                             # Add the buffered audio
                             # to the recording frames
                             self.frames.extend(list(self.audio_buffer))
                             self.audio_buffer.clear()
 
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Resetting Silero VAD model states')
                             self.silero_vad_model.reset_states()
                         else:
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Checking voice activity')
                             data_copy = data[:]
                             self._check_voice_activity(data_copy)
 
+                    if self.use_extended_logging:
+                        logging.debug('Debug: Resetting speech_end_silence_start')
                     self.speech_end_silence_start = 0
 
                 else:
+                    if self.use_extended_logging:
+                        logging.debug('Debug: Handling recording state')
                     # If we are currently recording
                     if wakeword_samples_to_remove and wakeword_samples_to_remove > 0:
+                        if self.use_extended_logging:
+                            logging.debug('Debug: Removing wakeword samples')
                         # Remove samples from the beginning of self.frames
                         samples_removed = 0
                         while wakeword_samples_to_remove > 0 and self.frames:
@@ -1708,20 +1727,31 @@ class AudioToTextRecorder:
                         
                         wakeword_samples_to_remove = 0
 
+                    if self.use_extended_logging:
+                        logging.debug('Debug: Checking if stop_recording_on_voice_deactivity is True')
                     # Stop the recording if silence is detected after speech
                     if self.stop_recording_on_voice_deactivity:
+                        if self.use_extended_logging:
+                            logging.debug('Debug: Determining if speech is detected')
                         is_speech = (
                             self._is_silero_speech(data) if self.silero_deactivity_detection
                             else self._is_webrtc_speech(data, True)
                         )
 
+                        if self.use_extended_logging:
+                            logging.debug('Debug: Formatting speech_end_silence_start')
                         if not self.speech_end_silence_start:
                             str_speech_end_silence_start = "0"
                         else:
                             str_speech_end_silence_start = datetime.datetime.fromtimestamp(self.speech_end_silence_start).strftime('%H:%M:%S.%f')[:-3]
-                        logging.debug(f"is_speech: {is_speech}, str_speech_end_silence_start: {str_speech_end_silence_start}")
+                        if self.use_extended_logging:
+                            logging.debug(f"is_speech: {is_speech}, str_speech_end_silence_start: {str_speech_end_silence_start}")
 
+                        if self.use_extended_logging:
+                            logging.debug('Debug: Checking if speech is not detected')
                         if not is_speech:
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Handling voice deactivity')
                             # Voice deactivity was detected, so we start
                             # measuring silence time before stopping recording
                             if self.speech_end_silence_start == 0 and \
@@ -1729,56 +1759,89 @@ class AudioToTextRecorder:
 
                                 self.speech_end_silence_start = time.time()
 
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Checking early transcription conditions')
                             if self.speech_end_silence_start and self.early_transcription_on_silence and len(self.frames) > 0 and \
                                 (time.time() - self.speech_end_silence_start > self.early_transcription_on_silence) and \
                                 self.allowed_to_early_transcribe:
-                                    logging.debug("Adding early transcription request")
+                                    if self.use_extended_logging:
+                                        logging.debug("Debug: Adding early transcription request")
                                     self.transcribe_count += 1
                                     audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
                                     audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+
+                                    if self.use_extended_logging:
+                                        logging.debug("Debug: early transcription request pipe send")
                                     self.parent_transcription_pipe.send((audio, self.language))
+                                    if self.use_extended_logging:
+                                        logging.debug("Debug: early transcription request pipe send return")
                                     self.allowed_to_early_transcribe = False
 
                         else:
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Handling speech detection')
                             if self.speech_end_silence_start:
-                                logging.info("Resetting self.speech_end_silence_start")
+                                if self.use_extended_logging:
+                                    logging.info("Resetting self.speech_end_silence_start")
                                 self.speech_end_silence_start = 0
                                 self.allowed_to_early_transcribe = True
 
-
+                        if self.use_extended_logging:
+                            logging.debug('Debug: Checking if silence duration exceeds threshold')
                         # Wait for silence to stop recording after speech
                         if self.speech_end_silence_start and time.time() - \
                                 self.speech_end_silence_start >= \
                                 self.post_speech_silence_duration:
 
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Formatting silence start time')
                             # Get time in desired format (HH:MM:SS.nnn)
                             silence_start_time = datetime.datetime.fromtimestamp(self.speech_end_silence_start).strftime('%H:%M:%S.%f')[:-3]
 
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Calculating time difference')
                             # Calculate time difference
                             time_diff = time.time() - self.speech_end_silence_start
 
-                            logging.info(f"voice deactivity detected at {silence_start_time}, "
+                            if self.use_extended_logging:
+                                logging.debug('Debug: Logging voice deactivity detection')
+                                logging.info(f"voice deactivity detected at {silence_start_time}, "
                                         f"time since silence start: {time_diff:.3f} seconds")
 
+                                logging.debug('Debug: Appending data to frames and stopping recording')
                             self.frames.append(data)
                             self.stop()
                             if not self.is_recording:
+                                if self.use_extended_logging:
+                                    logging.debug('Debug: Resetting speech_end_silence_start')
                                 self.speech_end_silence_start = 0
 
+                                if self.use_extended_logging:
+                                    logging.debug('Debug: Handling non-wake word scenario')
                                 if not self.use_wake_words:
                                     self.listen_start = time.time()
                                     self._set_state("listening")
                                     self.start_recording_on_voice_activity = True
                             else:
+                                if self.use_extended_logging:
+                                    logging.debug('Debug: Setting failed_stop_attempt to True')
                                 failed_stop_attempt = True
 
+                if self.use_extended_logging:
+                    logging.debug('Debug: Checking if recording stopped')
                 if not self.is_recording and was_recording:
+                    if self.use_extended_logging:
+                        logging.debug('Debug: Resetting after stopping recording')
                     # Reset after stopping recording to ensure clean state
                     self.stop_recording_on_voice_deactivity = False
 
+                if self.use_extended_logging:
+                    logging.debug('Debug: Checking Silero time')
                 if time.time() - self.silero_check_time > 0.1:
                     self.silero_check_time = 0
 
+                if self.use_extended_logging:
+                    logging.debug('Debug: Handling wake word timeout')
                 # Handle wake word timeout (waited to long initiating
                 # speech after wake word detection)
                 if self.wake_word_detect_time and time.time() - \
@@ -1786,22 +1849,38 @@ class AudioToTextRecorder:
 
                     self.wake_word_detect_time = 0
                     if self.wakeword_detected and self.on_wakeword_timeout:
+                        if self.use_extended_logging:
+                            logging.debug('Debug: Calling on_wakeword_timeout')
                         self.on_wakeword_timeout()
                     self.wakeword_detected = False
 
+                if self.use_extended_logging:
+                    logging.debug('Debug: Updating was_recording')
                 was_recording = self.is_recording
 
+                if self.use_extended_logging:
+                    logging.debug('Debug: Checking if recording and not failed stop attempt')
                 if self.is_recording and not failed_stop_attempt:
+                    if self.use_extended_logging:
+                        logging.debug('Debug: Appending data to frames')
                     self.frames.append(data)
 
+                if self.use_extended_logging:
+                    logging.debug('Debug: Checking if not recording or speech end silence start')
                 if not self.is_recording or self.speech_end_silence_start:
+                    if self.use_extended_logging:
+                        logging.debug('Debug: Appending data to audio buffer')
                     self.audio_buffer.append(data)
 
         except Exception as e:
+            logging.debug('Debug: Caught exception in main try block')
             if not self.interrupt_stop_event.is_set():
                 logging.error(f"Unhandled exception in _recording_worker: {e}")
                 raise
 
+        if self.use_extended_logging:
+            logging.debug('Debug: Exiting _recording_worker method')
+
 
     def _realtime_worker(self):
         """

+ 1 - 1
setup.py

@@ -9,7 +9,7 @@ with open('requirements.txt') as f:
 
 setuptools.setup(
     name="RealtimeSTT",
-    version="0.2.4",
+    version="0.3.0",
     author="Kolja Beigel",
     author_email="kolja.beigel@web.de",
     description="A fast Voice Activity Detection and Transcription System",

+ 49 - 27
tests/realtimestt_test.py

@@ -9,8 +9,11 @@ if __name__ == '__main__':
     import os
     import sys
     from RealtimeSTT import AudioToTextRecorder
-    from colorama import Fore, Back, Style
+    from colorama import Fore, Style
     import colorama
+    from rich.live import Live
+    from rich.console import Console
+    from rich.text import Text
 
     if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
         from torchaudio._extension.utils import _init_dll_path
@@ -20,40 +23,58 @@ if __name__ == '__main__':
 
     colorama.init()
 
+    # Initialize Rich Console and Live
+    console = Console()
+    live = Live(console=console, refresh_per_second=10, screen=False)
+    live.start()
+
     full_sentences = []
     displayed_text = ""
     prev_text = ""
+    rich_text_stored = ""
     recorder = None
 
     end_of_sentence_detection_pause = 0.4
-    mid_sentence_detection_pause = 0.7
+    unknown_sentence_detection_pause = 0.7
+    mid_sentence_detection_pause = 2.0
 
     def clear_console():
         os.system('clear' if os.name == 'posix' else 'cls')
 
     def text_detected(text):
-        global displayed_text, prev_text
+        global displayed_text, prev_text, full_sentences, recorder, rich_text_stored
         sentence_end_marks = ['.', '!', '?', '。'] 
-        if text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
+        if text.endswith("..."):
+            recorder.post_speech_silence_duration = mid_sentence_detection_pause
+        elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
             recorder.post_speech_silence_duration = end_of_sentence_detection_pause
         else:
-            recorder.post_speech_silence_duration = mid_sentence_detection_pause
+            recorder.post_speech_silence_duration = unknown_sentence_detection_pause
 
         prev_text = text
 
-        sentences_with_style = [
-            f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
-            for i, sentence in enumerate(full_sentences)
-        ]
-        new_text = "".join(sentences_with_style).strip() + " " + text if len(sentences_with_style) > 0 else text
-
-        if new_text != displayed_text:
-            displayed_text = new_text
-            clear_console()
-            print(displayed_text, end="", flush=True)
+        # Build Rich Text with alternating colors
+        rich_text = Text()
+        for i, sentence in enumerate(full_sentences):
+            if i % 2 == 0:
+                rich_text += Text(sentence, style="yellow") + Text(" ")
+            else:
+                rich_text += Text(sentence, style="cyan") + Text(" ")
+        
+        # If the current text is not a sentence-ending, display it in real-time
+        if text:
+            rich_text += Text(text, style="white")
+
+        new_displayed_text = rich_text.plain
+
+        if new_displayed_text != displayed_text:
+            displayed_text = new_displayed_text
+            live.update(rich_text)
+            rich_text_stored = rich_text
 
     def process_text(text):
-        recorder.post_speech_silence_duration = end_of_sentence_detection_pause
+        global recorder, full_sentences, prev_text
+        recorder.post_speech_silence_duration = unknown_sentence_detection_pause
         full_sentences.append(text)
         prev_text = ""
         text_detected("")
@@ -62,23 +83,23 @@ if __name__ == '__main__':
     recorder_config = {
         'spinner': False,
         'model': 'large-v2',
+        # 'input_device_index': 1,
         'realtime_model_type': 'tiny.en',
         'language': 'en',
-        'input_device_index': 1,
         'silero_sensitivity': 0.05,
         'webrtc_sensitivity': 3,
-        'post_speech_silence_duration': end_of_sentence_detection_pause,
-        'min_length_of_recording': 0,
+        'post_speech_silence_duration': unknown_sentence_detection_pause,
+        'min_length_of_recording': 0.7,        
         'min_gap_between_recordings': 0,                
         'enable_realtime_transcription': True,
         'realtime_processing_pause': 0.1,
-        'on_realtime_transcription_update': text_detected,
+        #'on_realtime_transcription_update': text_detected,
+        'on_realtime_transcription_stabilized': text_detected,
         'silero_deactivity_detection': True,
-        'min_length_of_recording': 0.7,        
         'early_transcription_on_silence': 0.2,
         'beam_size': 5,
         'beam_size_realtime': 1,
-        'no_log_file': False,
+        'no_log_file': True,
     }
 
     if EXTENDED_LOGGING:
@@ -86,12 +107,13 @@ if __name__ == '__main__':
 
     recorder = AudioToTextRecorder(**recorder_config)
 
-    clear_console()
-    print("Say something...", end="", flush=True)
-
+    # Initial display message
+    initial_text = Text("Say something...", style="green")
+    live.update(initial_text)
 
     try:
-        while (True):
+        while True:
             recorder.text(process_text)
     except KeyboardInterrupt:
-        print("Exiting application due to keyboard interrupt")
+        live.stop()
+        print("Exit due to keyboard interrupt.")