
dev update

KoljaB · 7 months ago · commit 06328c15e8
3 changed files with 273 additions and 92 deletions
  1. README.md (+10 −2)
  2. RealtimeSTT/audio_recorder.py (+208 −74)
  3. tests/realtimestt_test.py (+55 −16)

README.md (+10 −2)

@@ -18,11 +18,11 @@ It's ideal for:
 - **Voice Assistants**
 - Applications requiring **fast and precise** speech-to-text conversion

-https://github.com/KoljaB/RealtimeSTT/assets/7604638/207cb9a2-4482-48e7-9d2b-0722c3ee6d14
+https://github.com/user-attachments/assets/797e6552-27cd-41b1-a7f3-e5cbc72094f5

 ### Updates

-Latest Version: v0.2.42
+Latest Version: v0.2.5

 See [release history](https://github.com/KoljaB/RealtimeSTT/releases).

@@ -310,6 +310,14 @@ When you initialize the `AudioToTextRecorder` class, you have various options to

 - **debug_mode** (bool, default=False): If set, the system prints additional debug information to the console.

+- **print_transcription_time** (bool, default=False): Logs the processing time of the main model transcription. This can be useful for performance monitoring and debugging.
+
+- **early_transcription_on_silence** (int, default=0): If set, the system will transcribe audio faster when silence is detected. Transcription will start after the specified milliseconds. Keep this value lower than `post_speech_silence_duration`, ideally around `post_speech_silence_duration` minus the estimated transcription time with the main model. If silence lasts longer than `post_speech_silence_duration`, the recording is stopped, and the transcription is submitted. If voice activity resumes within this period, the transcription is discarded. This results in faster final transcriptions at the cost of additional GPU load due to some unnecessary final transcriptions.
+
+- **allowed_latency_limit** (int, default=100): Specifies the maximum number of unprocessed chunks in the queue before discarding chunks. This helps prevent the system from being overwhelmed and losing responsiveness in real-time applications.
+
+- **no_log_file** (bool, default=False): If set, the system will skip writing the debug log file, reducing disk I/O. Useful if logging to a file is not needed and performance is a priority.
+
 #### Real-time Transcription Parameters

 > **Note**: *When enabling realtime transcription, a GPU installation is strongly advised. Using realtime transcription may create high GPU loads.*
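Taken together, the four constructor options documented in this README hunk can be passed straight to `AudioToTextRecorder`. A minimal sketch (parameter values are illustrative, not recommendations):

```python
from RealtimeSTT import AudioToTextRecorder

if __name__ == '__main__':
    recorder = AudioToTextRecorder(
        model='tiny',                        # any supported model size works here
        print_transcription_time=True,       # log main-model transcription timing
        early_transcription_on_silence=300,  # per the description above: milliseconds
        allowed_latency_limit=100,           # drop chunks once the backlog exceeds 100
        no_log_file=True,                    # skip writing realtimesst.log
    )
    print(recorder.text())
```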

RealtimeSTT/audio_recorder.py (+208 −74)

@@ -34,6 +34,7 @@ from ctypes import c_bool
 from openwakeword.model import Model
 from scipy.signal import resample
 from scipy import signal
+import signal as system_signal
 import faster_whisper
 import openwakeword
 import collections
@@ -72,7 +73,7 @@ INIT_PRE_RECORDING_BUFFER_DURATION = 1.0
 INIT_WAKE_WORD_ACTIVATION_DELAY = 0.0
 INIT_WAKE_WORD_TIMEOUT = 5.0
 INIT_WAKE_WORD_BUFFER_DURATION = 0.1
-ALLOWED_LATENCY_LIMIT = 10
+ALLOWED_LATENCY_LIMIT = 100

 TIME_SLEEP = 0.02
 SAMPLE_RATE = 16000
@@ -159,8 +160,10 @@ class AudioToTextRecorder:
                  sample_rate: int = SAMPLE_RATE,
                  initial_prompt: Optional[Union[str, Iterable[int]]] = None,
                  suppress_tokens: Optional[List[int]] = [-1],
-                 log_transcription_time: bool = False,
-                 early_transcription_on_silence: bool = True
+                 print_transcription_time: bool = False,
+                 early_transcription_on_silence: int = 0,
+                 allowed_latency_limit: int = ALLOWED_LATENCY_LIMIT,
+                 no_log_file: bool = False
                  ):
         """
         Initializes an audio recorder and transcription
@@ -341,15 +344,22 @@ class AudioToTextRecorder:
             prompt to be fed to the transcription models.
         - suppress_tokens (list of int, default=[-1]): Tokens to be suppressed
             from the transcription output.
-        - log_transcription_time (bool, default=False): Logs processing time
+        - print_transcription_time (bool, default=False): Logs processing time
             of main model transcription
-        - early_transcription_on_silence (bool, default=True):  If True, the
-            system will immediately transcribe audio when silence is detected.
+        - early_transcription_on_silence (int, default=0): If set, the
+            system will transcribe audio faster when silence is detected.
+            Transcription will start after the specified milliseconds, so 
+            keep this value lower than post_speech_silence_duration. 
+            Ideally around post_speech_silence_duration minus the estimated
+            transcription time with the main model.
             If silence lasts longer than post_speech_silence_duration, the 
             recording is stopped, and the transcription is submitted. If 
             voice activity resumes within this period, the transcription 
             is discarded. Results in faster final transcriptions at the cost
-            of some unnecessary final transcriptions
+            of additional GPU load due to some unnecessary final transcriptions.
+        - allowed_latency_limit (int, default=100): Maximum number of chunks
+            that can sit unprocessed in the queue before old chunks are discarded.
+        - no_log_file (bool, default=False): Skips writing of debug log file.

         Raises:
             Exception: Errors related to initializing transcription
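A rough way to choose the new value, following the guidance in the docstring above (a sketch with hypothetical numbers, using milliseconds as the docstring specifies):

```python
# Hypothetical figures for tuning early_transcription_on_silence.
post_speech_silence_duration_ms = 700   # silence that ends a recording
estimated_transcription_time_ms = 300   # measured main-model latency

# Start the early transcription so it finishes roughly when the
# silence window closes; 0 keeps the feature disabled.
early_transcription_on_silence = max(
    0, post_speech_silence_duration_ms - estimated_transcription_time_ms
)  # -> 400
```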
@@ -400,7 +410,7 @@ class AudioToTextRecorder:
         self.handle_buffer_overflow = handle_buffer_overflow
         self.beam_size = beam_size
         self.beam_size_realtime = beam_size_realtime
-        self.allowed_latency_limit = ALLOWED_LATENCY_LIMIT
+        self.allowed_latency_limit = allowed_latency_limit

         self.level = level
         self.audio_queue = mp.Queue()
@@ -441,35 +451,40 @@ class AudioToTextRecorder:
         self.detected_realtime_language_probability = 0
         self.transcription_lock = threading.Lock()
         self.transcribe_count = 0
-        self.log_transcription_time = log_transcription_time
+        self.print_transcription_time = print_transcription_time
         self.early_transcription_on_silence = early_transcription_on_silence

         # Initialize the logging configuration with the specified level
         log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
-        file_log_format = '%(asctime)s - ' + log_format
+
+        # Adjust file_log_format to include milliseconds
+        file_log_format = '%(asctime)s.%(msecs)03d - ' + log_format

         # Get the root logger
         logger = logging.getLogger()
-        logger.setLevel(level)  # Set the logger's level
-        logger.propagate = False  # Prevent propagation to higher-level loggers
+        logger.setLevel(logging.DEBUG)  # Set the root logger's level to DEBUG

         # Remove any existing handlers
         logger.handlers = []

-        # Create a file handler and set its level
-        file_handler = logging.FileHandler('realtimesst.log')
-        file_handler.setLevel(logging.DEBUG)
-        file_handler.setFormatter(logging.Formatter(file_log_format, datefmt='%Y-%m-%d %H:%M:%S'))
-
         # Create a console handler and set its level
         console_handler = logging.StreamHandler()
-        console_handler.setLevel(level)
+        console_handler.setLevel(level) 
         console_handler.setFormatter(logging.Formatter(log_format))

         # Add the handlers to the logger
-        logger.addHandler(file_handler)
+        if not no_log_file:
+            # Create a file handler and set its level
+            file_handler = logging.FileHandler('realtimesst.log')
+            file_handler.setLevel(logging.DEBUG)
+            file_handler.setFormatter(logging.Formatter(
+                file_log_format,
+                datefmt='%Y-%m-%d %H:%M:%S'
+            ))
+
+            logger.addHandler(file_handler)
         logger.addHandler(console_handler)
-        
+
         self.is_shut_down = False
         self.shutdown_event = mp.Event()
         
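The net effect of the logging changes: the root logger now stays at DEBUG, the console handler honours the user-selected level, and the millisecond-stamped file handler is only attached when `no_log_file` is false. The same pattern as a standalone sketch (the function name is illustrative):

```python
import logging

def setup_logging(console_level=logging.WARNING, no_log_file=False):
    log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)   # root logger captures everything
    logger.handlers = []             # drop any inherited handlers

    console = logging.StreamHandler()
    console.setLevel(console_level)  # console output honours the user level
    console.setFormatter(logging.Formatter(log_format))
    logger.addHandler(console)

    if not no_log_file:              # the file handler is now optional
        file_handler = logging.FileHandler('realtimesst.log')
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(logging.Formatter(
            '%(asctime)s.%(msecs)03d - ' + log_format,  # millisecond timestamps
            datefmt='%Y-%m-%d %H:%M:%S'))
        logger.addHandler(file_handler)
```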
@@ -793,6 +808,9 @@ class AudioToTextRecorder:
             Exception: If there is an error while initializing the
             transcription model.
         """
+
+        system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)
+
         def custom_print(*args, **kwargs):
             message = ' '.join(map(str, args))
             try:
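The `SIG_IGN` line added above makes the worker process immune to Ctrl+C, so SIGINT reaches only the parent, which can then shut everything down in order. The pattern in isolation (a sketch; the worker body is hypothetical):

```python
import multiprocessing as mp
import signal as system_signal
import time

def worker(shutdown_event):
    # Children ignore SIGINT; they exit via the shared event instead.
    system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)
    while not shutdown_event.is_set():
        time.sleep(0.02)  # placeholder for real work

if __name__ == '__main__':
    shutdown = mp.Event()
    p = mp.Process(target=worker, args=(shutdown,))
    p.start()
    try:
        while p.is_alive():
            time.sleep(0.1)
    except KeyboardInterrupt:  # only the parent receives SIGINT
        shutdown.set()
        p.join()
```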
@@ -904,6 +922,7 @@ class AudioToTextRecorder:
         import pyaudio
         import numpy as np
         from scipy import signal
+        system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)

         def get_highest_sample_rate(audio_interface, device_index):
             """Get the highest supported sample rate for the specified device."""
@@ -1013,6 +1032,7 @@ class AudioToTextRecorder:
         buffer = bytearray()
         silero_buffer_size = 2 * buffer_size  # silero complains if too short

+        time_since_last_buffer_message = 0
         try:
             while not shutdown_event.is_set():
                 try:
@@ -1029,7 +1049,16 @@ class AudioToTextRecorder:
                             buffer = buffer[silero_buffer_size:]

                             # Feed the extracted data to the audio_queue
+                            if time_since_last_buffer_message:
+                                time_passed = time.time() - time_since_last_buffer_message
+                                if time_passed > 1:
+                                    logging.debug("_audio_data_worker writing audio data into queue.")
+                                    time_since_last_buffer_message = time.time()
+                            else:
+                                time_since_last_buffer_message = time.time()
+
                             audio_queue.put(to_process)
+

                 except OSError as e:
                     if e.errno == pyaudio.paInputOverflowed:
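The `time_since_last_buffer_message` bookkeeping added above is a log throttle: the worker enqueues chunks many times per second but emits at most one debug line per second. The same idea in isolation (a sketch):

```python
import logging
import time

last_log_time = 0.0

def log_throttled(message, interval=1.0):
    """Emit a debug line at most once per `interval` seconds."""
    global last_log_time
    now = time.time()
    if now - last_log_time > interval:
        logging.debug(message)
        last_log_time = now
```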
@@ -1120,42 +1149,48 @@ class AudioToTextRecorder:
         - Modifies the audio attribute to contain the processed audio data.
         """

-        logging.info("Setting listen time")
-        if self.listen_start == 0:
-            self.listen_start = time.time()
-
-        # If not yet started recording, wait for voice activity to initiate.
-        if not self.is_recording and not self.frames:
-            self._set_state("listening")
-            self.start_recording_on_voice_activity = True
+        try:
+            logging.info("Setting listen time")
+            if self.listen_start == 0:
+                self.listen_start = time.time()
+
+            # If not yet started recording, wait for voice activity to initiate.
+            if not self.is_recording and not self.frames:
+                self._set_state("listening")
+                self.start_recording_on_voice_activity = True
+
+                # Wait until recording starts
+                logging.debug('Waiting for recording start')
+                while not self.interrupt_stop_event.is_set():
+                    if self.start_recording_event.wait(timeout=0.02):
+                        break

-            # Wait until recording starts
-            logging.debug('Waiting for recording start')
-            while not self.interrupt_stop_event.is_set():
-                if self.start_recording_event.wait(timeout=0.02):
-                    break
+            # If recording is ongoing, wait for voice inactivity
+            # to finish recording.
+            if self.is_recording:
+                self.stop_recording_on_voice_deactivity = True

-        # If recording is ongoing, wait for voice inactivity
-        # to finish recording.
-        if self.is_recording:
-            self.stop_recording_on_voice_deactivity = True
+                # Wait until recording stops
+                logging.debug('Waiting for recording stop')
+                while not self.interrupt_stop_event.is_set():
+                    if (self.stop_recording_event.wait(timeout=0.02)):
+                        break

-            # Wait until recording stops
-            logging.debug('Waiting for recording stop')
-            while not self.interrupt_stop_event.is_set():
-                if (self.stop_recording_event.wait(timeout=0.02)):
-                    break
+            # Convert recorded frames to the appropriate audio format.
+            audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+            self.audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+            self.frames.clear()

-        # Convert recorded frames to the appropriate audio format.
-        audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
-        self.audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
-        self.frames.clear()
+            # Reset recording-related timestamps
+            self.recording_stop_time = 0
+            self.listen_start = 0

-        # Reset recording-related timestamps
-        self.recording_stop_time = 0
-        self.listen_start = 0
+            self._set_state("inactive")

-        self._set_state("inactive")
+        except KeyboardInterrupt:
+            logging.info("KeyboardInterrupt in wait_audio, shutting down")
+            self.shutdown()
+            raise  # Re-raise the exception after cleanup

     def transcribe(self):
         """
@@ -1184,18 +1219,22 @@
         """
         self._set_state("transcribing")
         audio_copy = copy.deepcopy(self.audio)
-        start_time = time.time()  # Start timing
+        start_time = 0
         with self.transcription_lock:
+            
             try:
                 if self.transcribe_count == 0:
+                    logging.debug("Adding transcription request, no early transcription started")
+                    start_time = time.time()  # Start timing
                     self.parent_transcription_pipe.send((self.audio, self.language))
                     self.transcribe_count += 1

                 while self.transcribe_count > 0:
-                    logging.debug("Receive from parent_transcription_pipe pipe after sendiung transcription request")
+                    logging.debug(F"Receive from parent_transcription_pipe after sending transcription request, transcribe_count: {self.transcribe_count}")
                     status, result = self.parent_transcription_pipe.recv()
                     self.transcribe_count -= 1

+                self.allowed_to_early_transcribe = True
                 self._set_state("inactive")
                 if status == 'success':
                     segments, info = result
@@ -1206,8 +1245,11 @@ class AudioToTextRecorder:
                     end_time = time.time()  # End timing
                     transcription_time = end_time - start_time

-                    if self.log_transcription_time:
-                        logging.info(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
+                    if start_time:
+                        if self.print_transcription_time:
+                            print(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
+                        else:
+                            logging.debug(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
                     return transcription
                 else:
                     logging.error(f"Transcription error: {result}")
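The counter exists because an early transcription may already be in flight when `transcribe()` runs: it only submits (and times) a request of its own when `transcribe_count` is zero, then drains every outstanding result, keeping the last. A functional condensation of that flow (function and argument names are illustrative; the real code keeps the counter on the instance):

```python
import time

def collect_final_transcription(pipe, audio, language, pending_count):
    """Drain `pending_count` in-flight results from `pipe`, submitting our
    own request first if nothing is pending; returns (result, elapsed)."""
    start_time = 0.0
    if pending_count == 0:            # nothing in flight yet
        start_time = time.time()      # only time requests submitted here
        pipe.send((audio, language))
        pending_count += 1

    status = result = None
    while pending_count > 0:          # drain early + own requests
        status, result = pipe.recv()  # the last received result wins
        pending_count -= 1

    elapsed = time.time() - start_time if start_time else 0.0
    return result, elapsed
```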
@@ -1280,11 +1322,14 @@ class AudioToTextRecorder:
         Returns (if not callback is set):
             str: The transcription of the recorded audio
         """
-
         self.interrupt_stop_event.clear()
         self.was_interrupted.clear()
-
-        self.wait_audio()
+        try:
+            self.wait_audio()
+        except KeyboardInterrupt:
+            logging.info("KeyboardInterrupt in text() method")
+            self.shutdown()
+            raise  # Re-raise the exception after cleanup

         if self.is_shut_down or self.interrupt_stop_event.is_set():
             if self.interrupt_stop_event.is_set():
@@ -1293,10 +1338,51 @@

         if on_transcription_finished:
             threading.Thread(target=on_transcription_finished,
-                             args=(self.transcribe(),)).start()
+                            args=(self.transcribe(),)).start()
         else:
             return self.transcribe()

+    # def text(self,
+    #          on_transcription_finished=None,
+    #          ):
+    #     """
+    #     Transcribes audio captured by this class instance
+    #     using the `faster_whisper` model.
+
+    #     - Automatically starts recording upon voice activity if not manually
+    #       started using `recorder.start()`.
+    #     - Automatically stops recording upon voice deactivity if not manually
+    #       stopped with `recorder.stop()`.
+    #     - Processes the recorded audio to generate transcription.
+
+    #     Args:
+    #         on_transcription_finished (callable, optional): Callback function
+    #           to be executed when transcription is ready.
+    #         If provided, transcription will be performed asynchronously, and
+    #           the callback will receive the transcription as its argument.
+    #           If omitted, the transcription will be performed synchronously,
+    #           and the result will be returned.
+
+    #     Returns (if not callback is set):
+    #         str: The transcription of the recorded audio
+    #     """
+
+    #     self.interrupt_stop_event.clear()
+    #     self.was_interrupted.clear()
+
+    #     self.wait_audio()
+
+    #     if self.is_shut_down or self.interrupt_stop_event.is_set():
+    #         if self.interrupt_stop_event.is_set():
+    #             self.was_interrupted.set()
+    #         return ""
+
+    #     if on_transcription_finished:
+    #         threading.Thread(target=on_transcription_finished,
+    #                          args=(self.transcribe(),)).start()
+    #     else:
+    #         return self.transcribe()
+
     def start(self):
         """
         Starts recording audio directly without waiting for voice activity.
@@ -1412,6 +1498,9 @@ class AudioToTextRecorder:
         recording worker and closing the audio stream.
         """

+        print("RealtimeSTT shutting down")
+        logging.debug("RealtimeSTT shutting down")
+
         # Force wait_audio() and text() to exit
         self.is_shut_down = True
         self.start_recording_event.set()
@@ -1467,10 +1556,12 @@ class AudioToTextRecorder:
         logging.debug('Starting recording worker')

         try:
+            time_since_last_buffer_message = 0
             was_recording = False
             delay_was_passed = False
             wakeword_detected_time = None
             wakeword_samples_to_remove = None
+            self.allowed_to_early_transcribe = True

             # Continuously monitor audio for voice activity
             while self.is_running:
@@ -1490,11 +1581,13 @@ class AudioToTextRecorder:
                         # Handle queue overflow
                         if (self.audio_queue.qsize() >
                                 self.allowed_latency_limit):
+                            logging.warning("!!! ### !!! ### !!!")
                             logging.warning("Audio queue size exceeds "
                                             "latency limit. Current size: "
                                             f"{self.audio_queue.qsize()}. "
                                             "Discarding old audio chunks."
                                             )
+                            logging.warning("!!! ### !!! ### !!!")

                         while (self.audio_queue.qsize() >
                                 self.allowed_latency_limit):
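Raising `ALLOWED_LATENCY_LIMIT` from 10 to 100 gives the recorder far more slack before it drops audio; the discard loop itself is unchanged. The bounded-backlog pattern as a standalone sketch (names are illustrative):

```python
import logging
import queue

def trim_backlog(audio_queue, limit):
    """Discard the oldest chunks once the backlog exceeds `limit`."""
    if audio_queue.qsize() > limit:
        logging.warning(f"Backlog {audio_queue.qsize()} > {limit}; discarding old chunks.")
    while audio_queue.qsize() > limit:
        try:
            audio_queue.get_nowait()  # drop the oldest queued chunk
        except queue.Empty:
            break
```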
@@ -1506,8 +1599,18 @@ class AudioToTextRecorder:
                     self.is_running = False
                     break

+                # Feed the extracted data to the audio_queue
+                if time_since_last_buffer_message:
+                    time_passed = time.time() - time_since_last_buffer_message
+                    if time_passed > 1:
+                        logging.debug("_recording_worker processing audio data")
+                        time_since_last_buffer_message = time.time()
+                else:
+                    time_since_last_buffer_message = time.time()
+
+                failed_stop_attempt = False
+
                 if not self.is_recording:
-                    logging.info(f"not recording, state: {self.state}, self.recording_stop_time: {self.recording_stop_time}, self.listen_start: {self.listen_start}")
                     # Handle not recording state
                     time_since_listen_start = (time.time() - self.listen_start
                                                if self.listen_start else 0)
@@ -1538,7 +1641,6 @@ class AudioToTextRecorder:
                             else:
                                 self._set_state("inactive")

-                    #self.wake_word_detect_time = time.time()
                     if self.use_wake_words and wake_word_activation_delay_passed:
                         try:
                             wakeword_index = self._process_wakeword(data)
@@ -1554,6 +1656,7 @@

                         # If a wake word is detected
                         if wakeword_index >= 0:
+                            self.wake_word_detect_time = time.time()
                             wakeword_detected_time = time.time()
                             wakeword_samples_to_remove = int(self.sample_rate * self.wake_word_buffer_duration)
                             self.wakeword_detected = True
@@ -1612,31 +1715,62 @@ class AudioToTextRecorder:
                             else self._is_webrtc_speech(data, True)
                         )

+                        if not self.speech_end_silence_start:
+                            str_speech_end_silence_start = "0"
+                        else:
+                            str_speech_end_silence_start = datetime.datetime.fromtimestamp(self.speech_end_silence_start).strftime('%H:%M:%S.%f')[:-3]
+                        logging.debug(f"is_speech: {is_speech}, str_speech_end_silence_start: {str_speech_end_silence_start}")
+
                         if not is_speech:
                             # Voice deactivity was detected, so we start
                             # measuring silence time before stopping recording
-                            if self.speech_end_silence_start == 0:
+                            if self.speech_end_silence_start == 0 and \
+                                (time.time() - self.recording_start_time > self.min_length_of_recording):
+
                                 self.speech_end_silence_start = time.time()
-                                if self.early_transcription_on_silence and len(self.frames) > 0:
-                                     audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
-                                     audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
-                                     self.parent_transcription_pipe.send((audio, self.language))
-                                     self.transcribe_count += 1                                
+
+                            if self.speech_end_silence_start and self.early_transcription_on_silence and len(self.frames) > 0 and \
+                                (time.time() - self.speech_end_silence_start > self.early_transcription_on_silence) and \
+                                self.allowed_to_early_transcribe:
+                                    logging.debug("Adding early transcription request")
+                                    self.transcribe_count += 1
+                                    audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+                                    audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+                                    self.parent_transcription_pipe.send((audio, self.language))
+                                    self.allowed_to_early_transcribe = False
+
                         else:
-                            self.speech_end_silence_start = 0
+                            if self.speech_end_silence_start:
+                                logging.info("Resetting self.speech_end_silence_start")
+                                self.speech_end_silence_start = 0
+                                self.allowed_to_early_transcribe = True
+

                         # Wait for silence to stop recording after speech
                         if self.speech_end_silence_start and time.time() - \
                                 self.speech_end_silence_start >= \
                                 self.post_speech_silence_duration:
-                            logging.info("voice deactivity detected")
+
+                            # Get time in desired format (HH:MM:SS.nnn)
+                            silence_start_time = datetime.datetime.fromtimestamp(self.speech_end_silence_start).strftime('%H:%M:%S.%f')[:-3]
+
+                            # Calculate time difference
+                            time_diff = time.time() - self.speech_end_silence_start
+
+                            logging.info(f"voice deactivity detected at {silence_start_time}, "
+                                        f"time since silence start: {time_diff:.3f} seconds")
+
                             self.frames.append(data)
                             self.stop()
+                            if not self.is_recording:
+                                self.speech_end_silence_start = 0

-                            if not self.use_wake_words:
-                                self.listen_start = time.time()
-                                self._set_state("listening")
-                                self.start_recording_on_voice_activity = True    
+                                if not self.use_wake_words:
+                                    self.listen_start = time.time()
+                                    self._set_state("listening")
+                                    self.start_recording_on_voice_activity = True
+                            else:
+                                failed_stop_attempt = True

                 if not self.is_recording and was_recording:
                     # Reset after stopping recording to ensure clean state
 
 
                 was_recording = self.is_recording
                 was_recording = self.is_recording
 
 
-                if self.is_recording:
+                if self.is_recording and not failed_stop_attempt:
                     self.frames.append(data)

                 if not self.is_recording or self.speech_end_silence_start:
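Condensed, the reworked silence handling above walks through four guarded steps: start the silence clock only after a minimum recording length, fire at most one early transcription per silence window, reset and re-arm when speech resumes, and detect a failed stop so the current chunk is not appended twice. A schematic re-implementation under assumed callback hooks (`on_early` and `on_stop` are illustrative):

```python
import time

class SilenceGate:
    """Schematic of the reworked silence handling in _recording_worker."""
    def __init__(self, min_length, early_delay, post_pause, on_early, on_stop):
        self.min_length = min_length    # minimum recording length (seconds)
        self.early_delay = early_delay  # delay before early transcription
        self.post_pause = post_pause    # full post-speech silence duration
        self.on_early, self.on_stop = on_early, on_stop
        self.silence_start = 0.0
        self.early_armed = True

    def update(self, is_speech, recording_started_at):
        now = time.time()
        if not is_speech:
            # 1. Start the silence clock only after a minimum recording length.
            if not self.silence_start and now - recording_started_at > self.min_length:
                self.silence_start = now
            # 2. Fire at most one early transcription per silence window.
            if self.silence_start and self.early_armed and \
                    now - self.silence_start > self.early_delay:
                self.on_early()
                self.early_armed = False
        else:
            # 3. Speech resumed: reset the clock and re-arm early transcription.
            self.silence_start = 0.0
            self.early_armed = True
        # 4. Stop after the full post-speech silence has elapsed; report
        #    whether the stop attempt failed (recording still running).
        if self.silence_start and now - self.silence_start >= self.post_pause:
            return self.on_stop()  # True -> failed stop attempt
        return False
```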
@@ -1753,7 +1887,7 @@ class AudioToTextRecorder:
                     if self.is_recording and time.time() - \
                             self.recording_start_time > 0.5:

-                        logging.debug('Starting realtime transcription')
+                        # logging.debug('Starting realtime transcription')
                         self.realtime_transcription_text = realtime_text
                         self.realtime_transcription_text = \
                             self.realtime_transcription_text.strip()

tests/realtimestt_test.py (+55 −16)

@@ -1,22 +1,46 @@
-from RealtimeSTT import AudioToTextRecorder
-from colorama import Fore, Back, Style
-import colorama
-import os
-
 if __name__ == '__main__':

+    EXTENDED_LOGGING = False
+
+    if EXTENDED_LOGGING:
+        import logging
+        logging.basicConfig(level=logging.DEBUG)
+
+    import os
+    import sys
+    from RealtimeSTT import AudioToTextRecorder
+    from colorama import Fore, Back, Style
+    import colorama
+
+    if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
+        from torchaudio._extension.utils import _init_dll_path
+        _init_dll_path()    
+
     print("Initializing RealtimeSTT test...")

     colorama.init()

     full_sentences = []
     displayed_text = ""
+    prev_text = ""
+    recorder = None
+
+    end_of_sentence_detection_pause = 0.4
+    mid_sentence_detection_pause = 0.7

     def clear_console():
         os.system('clear' if os.name == 'posix' else 'cls')

     def text_detected(text):
-        global displayed_text
+        global displayed_text, prev_text
+        sentence_end_marks = ['.', '!', '?', '。'] 
+        if text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
+            recorder.post_speech_silence_duration = end_of_sentence_detection_pause
+        else:
+            recorder.post_speech_silence_duration = mid_sentence_detection_pause
+
+        prev_text = text
+
         sentences_with_style = [
             f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
             for i, sentence in enumerate(full_sentences)
@@ -26,33 +50,48 @@ if __name__ == '__main__':
         if new_text != displayed_text:
             displayed_text = new_text
             clear_console()
-            print(f"Language: {recorder.detected_language} (realtime: {recorder.detected_realtime_language})")
             print(displayed_text, end="", flush=True)

     def process_text(text):
+        recorder.post_speech_silence_duration = end_of_sentence_detection_pause
         full_sentences.append(text)
+        prev_text = ""
         text_detected("")

+    # Recorder configuration
     recorder_config = {
         'spinner': False,
         'model': 'large-v2',
-        'silero_sensitivity': 0.4,
-        'webrtc_sensitivity': 2,
-        'post_speech_silence_duration': 0.4,
+        'realtime_model_type': 'tiny.en',
+        'language': 'en',
+        'input_device_index': 1,
+        'silero_sensitivity': 0.05,
+        'webrtc_sensitivity': 3,
+        'post_speech_silence_duration': end_of_sentence_detection_pause,
         'min_length_of_recording': 0,
-        'min_gap_between_recordings': 0,
+        'min_gap_between_recordings': 0,                
         'enable_realtime_transcription': True,
-        'realtime_processing_pause': 0.2,
-        'realtime_model_type': 'tiny',
-        'on_realtime_transcription_update': text_detected, 
+        'realtime_processing_pause': 0.1,
+        'on_realtime_transcription_update': text_detected,
         'silero_deactivity_detection': True,
+        'min_length_of_recording': 0.7,        
+        'early_transcription_on_silence': 0.2,
+        'beam_size': 5,
+        'beam_size_realtime': 1,
+        'no_log_file': False,
     }

+    if EXTENDED_LOGGING:
+        recorder_config['level'] = logging.DEBUG
+
     recorder = AudioToTextRecorder(**recorder_config)

     clear_console()
     print("Say something...", end="", flush=True)

-    while True:
-        recorder.text(process_text)

+    try:
+        while (True):
+            recorder.text(process_text)
+    except KeyboardInterrupt:
+        print("Exiting application due to keyboard interrupt")
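The updated test also shortens `post_speech_silence_duration` once two consecutive realtime updates end in sentence-final punctuation, so clear sentence endings commit faster while mid-sentence pauses get more slack. The heuristic in isolation (a sketch; values taken from the test above):

```python
SENTENCE_END_MARKS = ['.', '!', '?', '。']
END_PAUSE, MID_PAUSE = 0.4, 0.7  # end_of_sentence / mid_sentence pauses

def pick_pause(text, prev_text):
    """Shorter pause when two successive updates both end a sentence."""
    if text and text[-1] in SENTENCE_END_MARKS and \
            prev_text and prev_text[-1] in SENTENCE_END_MARKS:
        return END_PAUSE
    return MID_PAUSE
```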