@@ -34,6 +34,7 @@ from ctypes import c_bool
 from openwakeword.model import Model
 from scipy.signal import resample
 from scipy import signal
+import signal as system_signal
 import faster_whisper
 import openwakeword
 import collections
@@ -72,7 +73,7 @@ INIT_PRE_RECORDING_BUFFER_DURATION = 1.0
 INIT_WAKE_WORD_ACTIVATION_DELAY = 0.0
 INIT_WAKE_WORD_TIMEOUT = 5.0
 INIT_WAKE_WORD_BUFFER_DURATION = 0.1
-ALLOWED_LATENCY_LIMIT = 10
+ALLOWED_LATENCY_LIMIT = 100

 TIME_SLEEP = 0.02
 SAMPLE_RATE = 16000
@@ -159,8 +160,10 @@ class AudioToTextRecorder:
                  sample_rate: int = SAMPLE_RATE,
                  initial_prompt: Optional[Union[str, Iterable[int]]] = None,
                  suppress_tokens: Optional[List[int]] = [-1],
-                 log_transcription_time: bool = False,
-                 early_transcription_on_silence: bool = True
+                 print_transcription_time: bool = False,
+                 early_transcription_on_silence: int = 0,
+                 allowed_latency_limit: int = ALLOWED_LATENCY_LIMIT,
+                 no_log_file: bool = False
                  ):
         """
         Initializes an audio recorder and transcription
@@ -341,15 +344,22 @@ class AudioToTextRecorder:
           prompt to be fed to the transcription models.
         - suppress_tokens (list of int, default=[-1]): Tokens to be suppressed
           from the transcription output.
-        - log_transcription_time (bool, default=False): Logs processing time
+        - print_transcription_time (bool, default=False): Logs processing time
           of main model transcription
-        - early_transcription_on_silence (bool, default=True): If True, the
-          system will immediately transcribe audio when silence is detected.
+        - early_transcription_on_silence (int, default=0): If set, the
+          system will transcribe audio faster when silence is detected.
+          Transcription starts after the specified seconds of silence, so
+          keep this value lower than post_speech_silence_duration.
+          Ideally around post_speech_silence_duration minus the estimated
+          transcription time with the main model.
           If silence lasts longer than post_speech_silence_duration, the
           recording is stopped, and the transcription is submitted. If
           voice activity resumes within this period, the transcription
           is discarded. Results in faster final transcriptions to the cost
-          of some unnecessary final transcriptions
+          of additional GPU load due to some unnecessary final transcriptions.
+        - allowed_latency_limit (int, default=100): Maximum number of chunks
+          that may sit unprocessed in the queue before old chunks are discarded.
+        - no_log_file (bool, default=False): Skips writing of the debug log file.

     Raises:
         Exception: Errors related to initializing transcription
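A minimal construction sketch for the new parameters (values are illustrative, not recommendations; the import path is the package's documented entry point; the silence comparison in `_recording_worker` treats `early_transcription_on_silence` on the same seconds scale as `post_speech_silence_duration`):

```python
from RealtimeSTT import AudioToTextRecorder

if __name__ == '__main__':
    # Illustrative: with 0.6 s of required end-of-speech silence and a main
    # model that needs roughly 0.4 s per utterance, starting the early
    # transcription at 0.6 - 0.4 = 0.2 s of silence hides most of the latency.
    recorder = AudioToTextRecorder(
        post_speech_silence_duration=0.6,
        early_transcription_on_silence=0.2,   # 0 disables early transcription
        allowed_latency_limit=100,            # max unprocessed chunks in queue
        print_transcription_time=True,        # print timing instead of logging
        no_log_file=True,                     # do not write realtimesst.log
    )
    print(recorder.text())
```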
@@ -400,7 +410,7 @@ class AudioToTextRecorder:
         self.handle_buffer_overflow = handle_buffer_overflow
         self.beam_size = beam_size
         self.beam_size_realtime = beam_size_realtime
-        self.allowed_latency_limit = ALLOWED_LATENCY_LIMIT
+        self.allowed_latency_limit = allowed_latency_limit

         self.level = level
         self.audio_queue = mp.Queue()
@@ -441,35 +451,40 @@ class AudioToTextRecorder:
         self.detected_realtime_language_probability = 0
         self.transcription_lock = threading.Lock()
         self.transcribe_count = 0
-        self.log_transcription_time = log_transcription_time
+        self.print_transcription_time = print_transcription_time
         self.early_transcription_on_silence = early_transcription_on_silence

         # Initialize the logging configuration with the specified level
         log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
-        file_log_format = '%(asctime)s - ' + log_format
+
+        # Adjust file_log_format to include milliseconds
+        file_log_format = '%(asctime)s.%(msecs)03d - ' + log_format

         # Get the root logger
         logger = logging.getLogger()
-        logger.setLevel(level)  # Set the logger's level
-        logger.propagate = False  # Prevent propagation to higher-level loggers
+        logger.setLevel(logging.DEBUG)  # Set the root logger's level to DEBUG

         # Remove any existing handlers
         logger.handlers = []

-        # Create a file handler and set its level
-        file_handler = logging.FileHandler('realtimesst.log')
-        file_handler.setLevel(logging.DEBUG)
-        file_handler.setFormatter(logging.Formatter(file_log_format, datefmt='%Y-%m-%d %H:%M:%S'))
-
         # Create a console handler and set its level
         console_handler = logging.StreamHandler()
-        console_handler.setLevel(level)
+        console_handler.setLevel(level)
         console_handler.setFormatter(logging.Formatter(log_format))

         # Add the handlers to the logger
-        logger.addHandler(file_handler)
+        if not no_log_file:
+            # Create a file handler and set its level
+            file_handler = logging.FileHandler('realtimesst.log')
+            file_handler.setLevel(logging.DEBUG)
+            file_handler.setFormatter(logging.Formatter(
+                file_log_format,
+                datefmt='%Y-%m-%d %H:%M:%S'
+            ))
+
+            logger.addHandler(file_handler)
         logger.addHandler(console_handler)
-
+
         self.is_shut_down = False
         self.shutdown_event = mp.Event()

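The change above is the standard stdlib pattern of filtering per handler instead of at the logger: the root logger is opened wide (DEBUG) and each handler applies its own level, so the optional file handler keeps full detail while the console only shows records at the requested level. A self-contained sketch (names mirror the constructor arguments; `example.log` is a placeholder path):

```python
import logging

level = logging.WARNING   # stands in for the recorder's `level` argument
no_log_file = False       # stands in for the new constructor flag

log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
file_log_format = '%(asctime)s.%(msecs)03d - ' + log_format  # with milliseconds

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)   # root lets everything through
logger.handlers = []

console_handler = logging.StreamHandler()
console_handler.setLevel(level)  # console filters to the requested level
console_handler.setFormatter(logging.Formatter(log_format))
logger.addHandler(console_handler)

if not no_log_file:
    file_handler = logging.FileHandler('example.log')
    file_handler.setLevel(logging.DEBUG)  # file keeps full detail
    file_handler.setFormatter(logging.Formatter(
        file_log_format, datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(file_handler)

logging.debug("lands in example.log only")
logging.warning("lands in both")
```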
@@ -793,6 +808,9 @@ class AudioToTextRecorder:
             Exception: If there is an error while initializing the
             transcription model.
         """
+
+        system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)
+
         def custom_print(*args, **kwargs):
             message = ' '.join(map(str, args))
             try:
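The `SIG_IGN` line above keeps Ctrl+C from killing the worker process directly: the terminal delivers SIGINT to the whole foreground process group, so without it the child would die with its own KeyboardInterrupt before the parent can run shutdown logic. A minimal standalone sketch of the pattern (not the recorder's actual worker):

```python
import multiprocessing as mp
import signal as system_signal
import time

def worker(shutdown_event):
    # Child ignores SIGINT; only the parent reacts to Ctrl+C.
    system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)
    while not shutdown_event.is_set():
        time.sleep(0.1)  # stand-in for transcription / audio work

if __name__ == '__main__':
    shutdown_event = mp.Event()
    p = mp.Process(target=worker, args=(shutdown_event,))
    p.start()
    try:
        time.sleep(5)
    except KeyboardInterrupt:
        shutdown_event.set()  # the parent decides how children wind down
    p.join()
```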
@@ -904,6 +922,7 @@ class AudioToTextRecorder:
         import pyaudio
         import numpy as np
         from scipy import signal
+        system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)

         def get_highest_sample_rate(audio_interface, device_index):
             """Get the highest supported sample rate for the specified device."""
@@ -1013,6 +1032,7 @@ class AudioToTextRecorder:
         buffer = bytearray()
         silero_buffer_size = 2 * buffer_size  # silero complains if too short

+        time_since_last_buffer_message = 0
        try:
            while not shutdown_event.is_set():
                try:
@@ -1029,7 +1049,16 @@ class AudioToTextRecorder:
                     buffer = buffer[silero_buffer_size:]

                     # Feed the extracted data to the audio_queue
+                    if time_since_last_buffer_message:
+                        time_passed = time.time() - time_since_last_buffer_message
+                        if time_passed > 1:
+                            logging.debug("_audio_data_worker writing audio data into queue.")
+                            time_since_last_buffer_message = time.time()
+                    else:
+                        time_since_last_buffer_message = time.time()
+
                     audio_queue.put(to_process)
+

                 except OSError as e:
                     if e.errno == pyaudio.paInputOverflowed:
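This once-per-second throttle keeps a per-chunk hot loop from flooding the debug log. Restated as a standalone sketch (same idiom, simplified into a helper; the loop and message are stand-ins):

```python
import logging
import time

logging.basicConfig(level=logging.DEBUG)
time_since_last_buffer_message = 0

def log_throttled(message):
    """Emit a debug line at most once per second."""
    global time_since_last_buffer_message
    now = time.time()
    if not time_since_last_buffer_message or now - time_since_last_buffer_message > 1:
        logging.debug(message)
        time_since_last_buffer_message = now

for _ in range(1000):              # stands in for the audio chunk loop
    log_throttled("worker writing audio data into queue.")
    time.sleep(0.005)
```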
@@ -1120,42 +1149,48 @@ class AudioToTextRecorder:
         - Modifies the audio attribute to contain the processed audio data.
         """

-        logging.info("Setting listen time")
-        if self.listen_start == 0:
-            self.listen_start = time.time()
-
-        # If not yet started recording, wait for voice activity to initiate.
-        if not self.is_recording and not self.frames:
-            self._set_state("listening")
-            self.start_recording_on_voice_activity = True
+        try:
+            logging.info("Setting listen time")
+            if self.listen_start == 0:
+                self.listen_start = time.time()
+
+            # If not yet started recording, wait for voice activity to initiate.
+            if not self.is_recording and not self.frames:
+                self._set_state("listening")
+                self.start_recording_on_voice_activity = True
+
+            # Wait until recording starts
+            logging.debug('Waiting for recording start')
+            while not self.interrupt_stop_event.is_set():
+                if self.start_recording_event.wait(timeout=0.02):
+                    break

-        # Wait until recording starts
-        logging.debug('Waiting for recording start')
-        while not self.interrupt_stop_event.is_set():
-            if self.start_recording_event.wait(timeout=0.02):
-                break
+            # If recording is ongoing, wait for voice inactivity
+            # to finish recording.
+            if self.is_recording:
+                self.stop_recording_on_voice_deactivity = True

-        # If recording is ongoing, wait for voice inactivity
-        # to finish recording.
-        if self.is_recording:
-            self.stop_recording_on_voice_deactivity = True
+            # Wait until recording stops
+            logging.debug('Waiting for recording stop')
+            while not self.interrupt_stop_event.is_set():
+                if (self.stop_recording_event.wait(timeout=0.02)):
+                    break

-        # Wait until recording stops
-        logging.debug('Waiting for recording stop')
-        while not self.interrupt_stop_event.is_set():
-            if (self.stop_recording_event.wait(timeout=0.02)):
-                break
+            # Convert recorded frames to the appropriate audio format.
+            audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+            self.audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+            self.frames.clear()

-        # Convert recorded frames to the appropriate audio format.
-        audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
-        self.audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
-        self.frames.clear()
+            # Reset recording-related timestamps
+            self.recording_stop_time = 0
+            self.listen_start = 0

-        # Reset recording-related timestamps
-        self.recording_stop_time = 0
-        self.listen_start = 0
+            self._set_state("inactive")

-        self._set_state("inactive")
+        except KeyboardInterrupt:
+            logging.info("KeyboardInterrupt in wait_audio, shutting down")
+            self.shutdown()
+            raise  # Re-raise the exception after cleanup

     def transcribe(self):
         """
@@ -1184,18 +1219,22 @@ class AudioToTextRecorder:
         """
         self._set_state("transcribing")
         audio_copy = copy.deepcopy(self.audio)
-        start_time = time.time()  # Start timing
+        start_time = 0
         with self.transcription_lock:
+
             try:
                 if self.transcribe_count == 0:
+                    logging.debug("Adding transcription request, no early transcription started")
+                    start_time = time.time()  # Start timing
                     self.parent_transcription_pipe.send((self.audio, self.language))
                     self.transcribe_count += 1

                 while self.transcribe_count > 0:
-                    logging.debug("Receive from parent_transcription_pipe pipe after sendiung transcription request")
+                    logging.debug(f"Receive from parent_transcription_pipe after sending transcription request, transcribe_count: {self.transcribe_count}")
                     status, result = self.parent_transcription_pipe.recv()
                     self.transcribe_count -= 1

+                self.allowed_to_early_transcribe = True
                 self._set_state("inactive")
                 if status == 'success':
                     segments, info = result
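The `transcribe_count` bookkeeping pairs requests and replies over the pipe: every send (normal or early) bumps the counter, and the receive loop drains exactly that many results, keeping only the newest. A sketch of that protocol (an inline echo stands in for the real transcription worker process):

```python
import multiprocessing as mp

parent_pipe, child_pipe = mp.Pipe()
transcribe_count = 0

for request in ("early audio", "final audio"):   # early + final request
    parent_pipe.send(request)
    transcribe_count += 1

while child_pipe.poll():                         # stand-in worker: echo back
    child_pipe.send(("success", child_pipe.recv()))

while transcribe_count > 0:                      # drain; keep the newest reply
    status, result = parent_pipe.recv()
    transcribe_count -= 1

print(status, result)
```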
@@ -1206,8 +1245,11 @@ class AudioToTextRecorder:
                     end_time = time.time()  # End timing
                     transcription_time = end_time - start_time

-                    if self.log_transcription_time:
-                        logging.info(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
+                    if start_time:
+                        if self.print_transcription_time:
+                            print(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
+                        else:
+                            logging.debug(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
                     return transcription
                 else:
                     logging.error(f"Transcription error: {result}")
@@ -1280,11 +1322,14 @@ class AudioToTextRecorder:
         Returns (if not callback is set):
             str: The transcription of the recorded audio
         """
-
         self.interrupt_stop_event.clear()
         self.was_interrupted.clear()
-
-        self.wait_audio()
+        try:
+            self.wait_audio()
+        except KeyboardInterrupt:
+            logging.info("KeyboardInterrupt in text() method")
+            self.shutdown()
+            raise  # Re-raise the exception after cleanup

         if self.is_shut_down or self.interrupt_stop_event.is_set():
             if self.interrupt_stop_event.is_set():
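Usage sketch: with the try/except above, Ctrl+C during a blocking `text()` call triggers recorder shutdown before the KeyboardInterrupt is re-raised, so a caller only needs to catch it for a clean exit (illustrative, not the project's example script):

```python
from RealtimeSTT import AudioToTextRecorder

if __name__ == '__main__':
    recorder = AudioToTextRecorder()
    try:
        while True:
            print(recorder.text())
    except KeyboardInterrupt:
        # shutdown() already ran inside text() before the re-raise
        print("exiting cleanly")
```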
@@ -1293,10 +1338,51 @@ class AudioToTextRecorder:

         if on_transcription_finished:
             threading.Thread(target=on_transcription_finished,
-                            args=(self.transcribe(),)).start()
+                             args=(self.transcribe(),)).start()
         else:
             return self.transcribe()

+    # def text(self,
+    #          on_transcription_finished=None,
+    #          ):
+    #     """
+    #     Transcribes audio captured by this class instance
+    #     using the `faster_whisper` model.
+
+    #     - Automatically starts recording upon voice activity if not manually
+    #       started using `recorder.start()`.
+    #     - Automatically stops recording upon voice deactivity if not manually
+    #       stopped with `recorder.stop()`.
+    #     - Processes the recorded audio to generate transcription.
+
+    #     Args:
+    #         on_transcription_finished (callable, optional): Callback function
+    #           to be executed when transcription is ready.
+    #         If provided, transcription will be performed asynchronously, and
+    #           the callback will receive the transcription as its argument.
+    #           If omitted, the transcription will be performed synchronously,
+    #           and the result will be returned.
+
+    #     Returns (if not callback is set):
+    #         str: The transcription of the recorded audio
+    #     """
+
+    #     self.interrupt_stop_event.clear()
+    #     self.was_interrupted.clear()
+
+    #     self.wait_audio()
+
+    #     if self.is_shut_down or self.interrupt_stop_event.is_set():
+    #         if self.interrupt_stop_event.is_set():
+    #             self.was_interrupted.set()
+    #         return ""
+
+    #     if on_transcription_finished:
+    #         threading.Thread(target=on_transcription_finished,
+    #                          args=(self.transcribe(),)).start()
+    #     else:
+    #         return self.transcribe()
+
     def start(self):
         """
         Starts recording audio directly without waiting for voice activity.
@@ -1412,6 +1498,9 @@ class AudioToTextRecorder:
             recording worker and closing the audio stream.
         """

+        print("RealtimeSTT shutting down")
+        logging.debug("RealtimeSTT shutting down")
+
         # Force wait_audio() and text() to exit
         self.is_shut_down = True
         self.start_recording_event.set()
@@ -1467,10 +1556,12 @@ class AudioToTextRecorder:
         logging.debug('Starting recording worker')

         try:
+            time_since_last_buffer_message = 0
             was_recording = False
             delay_was_passed = False
             wakeword_detected_time = None
             wakeword_samples_to_remove = None
+            self.allowed_to_early_transcribe = True

             # Continuously monitor audio for voice activity
             while self.is_running:
@@ -1490,11 +1581,13 @@ class AudioToTextRecorder:
                 # Handle queue overflow
                 if (self.audio_queue.qsize() >
                         self.allowed_latency_limit):
+                    logging.warning("!!! ### !!! ### !!!")
                     logging.warning("Audio queue size exceeds "
                                     "latency limit. Current size: "
                                     f"{self.audio_queue.qsize()}. "
                                     "Discarding old audio chunks."
                                     )
+                    logging.warning("!!! ### !!! ### !!!")

                 while (self.audio_queue.qsize() >
                         self.allowed_latency_limit):
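A sketch of this overflow policy with a plain `queue.Queue` standing in for the recorder's `mp.Queue`: once more than `allowed_latency_limit` chunks are waiting, the oldest ones are dropped so transcription latency stays bounded instead of growing without limit.

```python
import queue

allowed_latency_limit = 100  # mirrors the constructor argument

def drain_to_limit(audio_queue, limit):
    """Discard the oldest queued chunks until the backlog fits the limit."""
    discarded = 0
    while audio_queue.qsize() > limit:
        try:
            audio_queue.get_nowait()   # drop the oldest chunk
            discarded += 1
        except queue.Empty:
            break
    return discarded
```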
@@ -1506,8 +1599,18 @@ class AudioToTextRecorder:
                     self.is_running = False
                     break

+                # Log worker activity, at most once per second
+                if time_since_last_buffer_message:
+                    time_passed = time.time() - time_since_last_buffer_message
+                    if time_passed > 1:
+                        logging.debug("_recording_worker processing audio data")
+                        time_since_last_buffer_message = time.time()
+                else:
+                    time_since_last_buffer_message = time.time()
+
+                failed_stop_attempt = False
+
                 if not self.is_recording:
-                    logging.info(f"not recording, state: {self.state}, self.recording_stop_time: {self.recording_stop_time}, self.listen_start: {self.listen_start}")
                     # Handle not recording state
                     time_since_listen_start = (time.time() - self.listen_start
                                                if self.listen_start else 0)
@@ -1538,7 +1641,6 @@ class AudioToTextRecorder:
                     else:
                         self._set_state("inactive")

-                    #self.wake_word_detect_time = time.time()
                     if self.use_wake_words and wake_word_activation_delay_passed:
                         try:
                             wakeword_index = self._process_wakeword(data)
@@ -1554,6 +1656,7 @@ class AudioToTextRecorder:

                         # If a wake word is detected
                         if wakeword_index >= 0:
+                            self.wake_word_detect_time = time.time()
                             wakeword_detected_time = time.time()
                             wakeword_samples_to_remove = int(self.sample_rate * self.wake_word_buffer_duration)
                             self.wakeword_detected = True
@@ -1612,31 +1715,62 @@ class AudioToTextRecorder:
                             else self._is_webrtc_speech(data, True)
                         )

+                        if not self.speech_end_silence_start:
+                            str_speech_end_silence_start = "0"
+                        else:
+                            str_speech_end_silence_start = datetime.datetime.fromtimestamp(self.speech_end_silence_start).strftime('%H:%M:%S.%f')[:-3]
+                        logging.debug(f"is_speech: {is_speech}, str_speech_end_silence_start: {str_speech_end_silence_start}")
+
                         if not is_speech:
                             # Voice deactivity was detected, so we start
                             # measuring silence time before stopping recording
-                            if self.speech_end_silence_start == 0:
+                            if self.speech_end_silence_start == 0 and \
+                               (time.time() - self.recording_start_time > self.min_length_of_recording):
+
                                 self.speech_end_silence_start = time.time()
-                                if self.early_transcription_on_silence and len(self.frames) > 0:
-                                    audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
-                                    audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
-                                    self.parent_transcription_pipe.send((audio, self.language))
-                                    self.transcribe_count += 1
+
+                            if self.speech_end_silence_start and self.early_transcription_on_silence and len(self.frames) > 0 and \
+                               (time.time() - self.speech_end_silence_start > self.early_transcription_on_silence) and \
+                               self.allowed_to_early_transcribe:
+                                logging.debug("Adding early transcription request")
+                                self.transcribe_count += 1
+                                audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+                                audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+                                self.parent_transcription_pipe.send((audio, self.language))
+                                self.allowed_to_early_transcribe = False
+
                         else:
-                            self.speech_end_silence_start = 0
+                            if self.speech_end_silence_start:
+                                logging.info("Resetting self.speech_end_silence_start")
+                                self.speech_end_silence_start = 0
+                                self.allowed_to_early_transcribe = True
+

                         # Wait for silence to stop recording after speech
                         if self.speech_end_silence_start and time.time() - \
                                 self.speech_end_silence_start >= \
                                 self.post_speech_silence_duration:
-                            logging.info("voice deactivity detected")
+
+                            # Get time in desired format (HH:MM:SS.nnn)
+                            silence_start_time = datetime.datetime.fromtimestamp(self.speech_end_silence_start).strftime('%H:%M:%S.%f')[:-3]
+
+                            # Calculate time difference
+                            time_diff = time.time() - self.speech_end_silence_start
+
+                            logging.info(f"voice deactivity detected at {silence_start_time}, "
+                                         f"time since silence start: {time_diff:.3f} seconds")
+
                             self.frames.append(data)
                             self.stop()
+                            if not self.is_recording:
+                                self.speech_end_silence_start = 0

-                            if not self.use_wake_words:
-                                self.listen_start = time.time()
-                                self._set_state("listening")
-                                self.start_recording_on_voice_activity = True
+                                if not self.use_wake_words:
+                                    self.listen_start = time.time()
+                                    self._set_state("listening")
+                                    self.start_recording_on_voice_activity = True
+                            else:
+                                failed_stop_attempt = True

                 if not self.is_recording and was_recording:
                     # Reset after stopping recording to ensure clean state
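A worked example of the timing relationship the early-transcription check enforces (values are illustrative, assuming the seconds interpretation used by the silence comparison above):

```python
# With 0.7 s of required end-of-speech silence and a main model that needs
# roughly 0.5 s per utterance, starting the early transcription at
# 0.7 - 0.5 = 0.2 s of silence means the result is usually ready the moment
# the silence threshold is reached.
post_speech_silence_duration = 0.7
estimated_transcription_time = 0.5
early_transcription_on_silence = post_speech_silence_duration - estimated_transcription_time
assert early_transcription_on_silence < post_speech_silence_duration
print(f"start early transcription after {early_transcription_on_silence:.1f} s of silence")
```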
@@ -1657,7 +1791,7 @@ class AudioToTextRecorder:

                 was_recording = self.is_recording

-                if self.is_recording:
+                if self.is_recording and not failed_stop_attempt:
                     self.frames.append(data)

                 if not self.is_recording or self.speech_end_silence_start:
@@ -1753,7 +1887,7 @@ class AudioToTextRecorder:
                 if self.is_recording and time.time() - \
                         self.recording_start_time > 0.5:

-                    logging.debug('Starting realtime transcription')
+                    # logging.debug('Starting realtime transcription')
                     self.realtime_transcription_text = realtime_text
                     self.realtime_transcription_text = \
                         self.realtime_transcription_text.strip()