
dev update

KoljaB · 7 months ago · commit 06328c15e8
3 changed files with 273 additions and 92 deletions
  1. README.md (+10 −2)
  2. RealtimeSTT/audio_recorder.py (+208 −74)
  3. tests/realtimestt_test.py (+55 −16)

README.md (+10 −2)

@@ -18,11 +18,11 @@ It's ideal for:
 - **Voice Assistants**
 - Applications requiring **fast and precise** speech-to-text conversion

-https://github.com/KoljaB/RealtimeSTT/assets/7604638/207cb9a2-4482-48e7-9d2b-0722c3ee6d14
+https://github.com/user-attachments/assets/797e6552-27cd-41b1-a7f3-e5cbc72094f5

 ### Updates

-Latest Version: v0.2.42
+Latest Version: v0.2.5

 See [release history](https://github.com/KoljaB/RealtimeSTT/releases).

@@ -310,6 +310,14 @@ When you initialize the `AudioToTextRecorder` class, you have various options to

 - **debug_mode** (bool, default=False): If set, the system prints additional debug information to the console.

+- **print_transcription_time** (bool, default=False): Logs the processing time of the main model transcription. This can be useful for performance monitoring and debugging.
+
+- **early_transcription_on_silence** (int, default=0): If set, the system will transcribe audio faster when silence is detected. Transcription will start after the specified milliseconds. Keep this value lower than `post_speech_silence_duration`, ideally around `post_speech_silence_duration` minus the estimated transcription time with the main model. If silence lasts longer than `post_speech_silence_duration`, the recording is stopped, and the transcription is submitted. If voice activity resumes within this period, the transcription is discarded. This results in faster final transcriptions at the cost of additional GPU load due to some unnecessary final transcriptions.
+
+- **allowed_latency_limit** (int, default=100): Specifies the maximum number of unprocessed chunks in the queue before discarding chunks. This helps prevent the system from being overwhelmed and losing responsiveness in real-time applications.
+
+- **no_log_file** (bool, default=False): If set, the system will skip writing the debug log file, reducing disk I/O. Useful if logging to a file is not needed and performance is a priority.
+
 #### Real-time Transcription Parameters

 > **Note**: *When enabling realtime transcription, a GPU installation is strongly advised. Using realtime transcription may create high GPU loads.*
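Taken together, the four constructor options documented in this README hunk can be passed straight to `AudioToTextRecorder`. A minimal sketch (parameter values are illustrative, not recommendations):

```python
from RealtimeSTT import AudioToTextRecorder

if __name__ == '__main__':
    recorder = AudioToTextRecorder(
        model='tiny',                        # any supported model size works here
        print_transcription_time=True,       # log main-model transcription timing
        early_transcription_on_silence=300,  # per the description above: milliseconds
        allowed_latency_limit=100,           # drop chunks once the backlog exceeds 100
        no_log_file=True,                    # skip writing realtimesst.log
    )
    print(recorder.text())
```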

RealtimeSTT/audio_recorder.py (+208 −74)

@@ -34,6 +34,7 @@ from ctypes import c_bool
 from openwakeword.model import Model
 from scipy.signal import resample
 from scipy import signal
+import signal as system_signal
 import faster_whisper
 import openwakeword
 import collections
@@ -72,7 +73,7 @@ INIT_PRE_RECORDING_BUFFER_DURATION = 1.0
 INIT_WAKE_WORD_ACTIVATION_DELAY = 0.0
 INIT_WAKE_WORD_TIMEOUT = 5.0
 INIT_WAKE_WORD_BUFFER_DURATION = 0.1
-ALLOWED_LATENCY_LIMIT = 10
+ALLOWED_LATENCY_LIMIT = 100

 TIME_SLEEP = 0.02
 SAMPLE_RATE = 16000
@@ -159,8 +160,10 @@ class AudioToTextRecorder:
                  sample_rate: int = SAMPLE_RATE,
                  initial_prompt: Optional[Union[str, Iterable[int]]] = None,
                  suppress_tokens: Optional[List[int]] = [-1],
-                 log_transcription_time: bool = False,
-                 early_transcription_on_silence: bool = True
+                 print_transcription_time: bool = False,
+                 early_transcription_on_silence: int = 0,
+                 allowed_latency_limit: int = ALLOWED_LATENCY_LIMIT,
+                 no_log_file: bool = False
                  ):
         """
         Initializes an audio recorder and transcription
@@ -341,15 +344,22 @@ class AudioToTextRecorder:
             prompt to be fed to the transcription models.
         - suppress_tokens (list of int, default=[-1]): Tokens to be suppressed
             from the transcription output.
-        - log_transcription_time (bool, default=False): Logs processing time
+        - print_transcription_time (bool, default=False): Logs processing time
             of main model transcription
-        - early_transcription_on_silence (bool, default=True):  If True, the
-            system will immediately transcribe audio when silence is detected.
+        - early_transcription_on_silence (int, default=0): If set, the
+            system will transcribe audio faster when silence is detected.
+            Transcription will start after the specified milliseconds, so 
+            keep this value lower than post_speech_silence_duration. 
+            Ideally around post_speech_silence_duration minus the estimated
+            transcription time with the main model.
             If silence lasts longer than post_speech_silence_duration, the 
             recording is stopped, and the transcription is submitted. If 
             voice activity resumes within this period, the transcription 
             is discarded. Results in faster final transcriptions at the cost
-            of some unnecessary final transcriptions
+            of additional GPU load due to some unnecessary final transcriptions.
+        - allowed_latency_limit (int, default=100): Maximum number of chunks
+            that can sit unprocessed in the queue before old chunks are discarded.
+        - no_log_file (bool, default=False): Skips writing of debug log file.

         Raises:
             Exception: Errors related to initializing transcription
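A rough way to choose the new value, following the guidance in the docstring above (a sketch with hypothetical numbers, using milliseconds as the docstring specifies):

```python
# Hypothetical figures for tuning early_transcription_on_silence.
post_speech_silence_duration_ms = 700   # silence that ends a recording
estimated_transcription_time_ms = 300   # measured main-model latency

# Start the early transcription so it finishes roughly when the
# silence window closes; 0 keeps the feature disabled.
early_transcription_on_silence = max(
    0, post_speech_silence_duration_ms - estimated_transcription_time_ms
)  # -> 400
```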
@@ -400,7 +410,7 @@ class AudioToTextRecorder:
         self.handle_buffer_overflow = handle_buffer_overflow
         self.beam_size = beam_size
         self.beam_size_realtime = beam_size_realtime
-        self.allowed_latency_limit = ALLOWED_LATENCY_LIMIT
+        self.allowed_latency_limit = allowed_latency_limit

         self.level = level
         self.audio_queue = mp.Queue()
@@ -441,35 +451,40 @@ class AudioToTextRecorder:
         self.detected_realtime_language_probability = 0
         self.transcription_lock = threading.Lock()
         self.transcribe_count = 0
-        self.log_transcription_time = log_transcription_time
+        self.print_transcription_time = print_transcription_time
         self.early_transcription_on_silence = early_transcription_on_silence

         # Initialize the logging configuration with the specified level
         log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
-        file_log_format = '%(asctime)s - ' + log_format
+
+        # Adjust file_log_format to include milliseconds
+        file_log_format = '%(asctime)s.%(msecs)03d - ' + log_format

         # Get the root logger
         logger = logging.getLogger()
-        logger.setLevel(level)  # Set the logger's level
-        logger.propagate = False  # Prevent propagation to higher-level loggers
+        logger.setLevel(logging.DEBUG)  # Set the root logger's level to DEBUG

         # Remove any existing handlers
         logger.handlers = []

-        # Create a file handler and set its level
-        file_handler = logging.FileHandler('realtimesst.log')
-        file_handler.setLevel(logging.DEBUG)
-        file_handler.setFormatter(logging.Formatter(file_log_format, datefmt='%Y-%m-%d %H:%M:%S'))
-
         # Create a console handler and set its level
         console_handler = logging.StreamHandler()
-        console_handler.setLevel(level)
+        console_handler.setLevel(level) 
         console_handler.setFormatter(logging.Formatter(log_format))

         # Add the handlers to the logger
-        logger.addHandler(file_handler)
+        if not no_log_file:
+            # Create a file handler and set its level
+            file_handler = logging.FileHandler('realtimesst.log')
+            file_handler.setLevel(logging.DEBUG)
+            file_handler.setFormatter(logging.Formatter(
+                file_log_format,
+                datefmt='%Y-%m-%d %H:%M:%S'
+            ))
+
+            logger.addHandler(file_handler)
         logger.addHandler(console_handler)
-        
+
         self.is_shut_down = False
         self.shutdown_event = mp.Event()
         
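The net effect of the logging changes: the root logger now stays at DEBUG, the console handler honours the user-selected level, and the millisecond-stamped file handler is only attached when `no_log_file` is false. The same pattern as a standalone sketch (the function name is illustrative):

```python
import logging

def setup_logging(console_level=logging.WARNING, no_log_file=False):
    log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)   # root logger captures everything
    logger.handlers = []             # drop any inherited handlers

    console = logging.StreamHandler()
    console.setLevel(console_level)  # console output honours the user level
    console.setFormatter(logging.Formatter(log_format))
    logger.addHandler(console)

    if not no_log_file:              # the file handler is now optional
        file_handler = logging.FileHandler('realtimesst.log')
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(logging.Formatter(
            '%(asctime)s.%(msecs)03d - ' + log_format,  # millisecond timestamps
            datefmt='%Y-%m-%d %H:%M:%S'))
        logger.addHandler(file_handler)
```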
@@ -793,6 +808,9 @@ class AudioToTextRecorder:
             Exception: If there is an error while initializing the
             transcription model.
         """
+
+        system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)
+
         def custom_print(*args, **kwargs):
             message = ' '.join(map(str, args))
             try:
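The `SIG_IGN` line added above makes the worker process immune to Ctrl+C, so SIGINT reaches only the parent, which can then shut everything down in order. The pattern in isolation (a sketch; the worker body is hypothetical):

```python
import multiprocessing as mp
import signal as system_signal
import time

def worker(shutdown_event):
    # Children ignore SIGINT; they exit via the shared event instead.
    system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)
    while not shutdown_event.is_set():
        time.sleep(0.02)  # placeholder for real work

if __name__ == '__main__':
    shutdown = mp.Event()
    p = mp.Process(target=worker, args=(shutdown,))
    p.start()
    try:
        while p.is_alive():
            time.sleep(0.1)
    except KeyboardInterrupt:  # only the parent receives SIGINT
        shutdown.set()
        p.join()
```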
@@ -904,6 +922,7 @@ class AudioToTextRecorder:
         import pyaudio
         import numpy as np
         from scipy import signal
+        system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)

         def get_highest_sample_rate(audio_interface, device_index):
             """Get the highest supported sample rate for the specified device."""
@@ -1013,6 +1032,7 @@ class AudioToTextRecorder:
         buffer = bytearray()
         silero_buffer_size = 2 * buffer_size  # silero complains if too short

+        time_since_last_buffer_message = 0
         try:
             while not shutdown_event.is_set():
                 try:
@@ -1029,7 +1049,16 @@ class AudioToTextRecorder:
                             buffer = buffer[silero_buffer_size:]

                             # Feed the extracted data to the audio_queue
+                            if time_since_last_buffer_message:
+                                time_passed = time.time() - time_since_last_buffer_message
+                                if time_passed > 1:
+                                    logging.debug("_audio_data_worker writing audio data into queue.")
+                                    time_since_last_buffer_message = time.time()
+                            else:
+                                time_since_last_buffer_message = time.time()
+
                             audio_queue.put(to_process)
+

                 except OSError as e:
                     if e.errno == pyaudio.paInputOverflowed:
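The `time_since_last_buffer_message` bookkeeping added above is a log throttle: the worker enqueues chunks many times per second but emits at most one debug line per second. The same idea in isolation (a sketch):

```python
import logging
import time

last_log_time = 0.0

def log_throttled(message, interval=1.0):
    """Emit a debug line at most once per `interval` seconds."""
    global last_log_time
    now = time.time()
    if now - last_log_time > interval:
        logging.debug(message)
        last_log_time = now
```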
@@ -1120,42 +1149,48 @@ class AudioToTextRecorder:
         - Modifies the audio attribute to contain the processed audio data.
         """

-        logging.info("Setting listen time")
-        if self.listen_start == 0:
-            self.listen_start = time.time()
-
-        # If not yet started recording, wait for voice activity to initiate.
-        if not self.is_recording and not self.frames:
-            self._set_state("listening")
-            self.start_recording_on_voice_activity = True
+        try:
+            logging.info("Setting listen time")
+            if self.listen_start == 0:
+                self.listen_start = time.time()
+
+            # If not yet started recording, wait for voice activity to initiate.
+            if not self.is_recording and not self.frames:
+                self._set_state("listening")
+                self.start_recording_on_voice_activity = True
+
+                # Wait until recording starts
+                logging.debug('Waiting for recording start')
+                while not self.interrupt_stop_event.is_set():
+                    if self.start_recording_event.wait(timeout=0.02):
+                        break

-            # Wait until recording starts
-            logging.debug('Waiting for recording start')
-            while not self.interrupt_stop_event.is_set():
-                if self.start_recording_event.wait(timeout=0.02):
-                    break
+            # If recording is ongoing, wait for voice inactivity
+            # to finish recording.
+            if self.is_recording:
+                self.stop_recording_on_voice_deactivity = True

-        # If recording is ongoing, wait for voice inactivity
-        # to finish recording.
-        if self.is_recording:
-            self.stop_recording_on_voice_deactivity = True
+                # Wait until recording stops
+                logging.debug('Waiting for recording stop')
+                while not self.interrupt_stop_event.is_set():
+                    if (self.stop_recording_event.wait(timeout=0.02)):
+                        break

-            # Wait until recording stops
-            logging.debug('Waiting for recording stop')
-            while not self.interrupt_stop_event.is_set():
-                if (self.stop_recording_event.wait(timeout=0.02)):
-                    break
+            # Convert recorded frames to the appropriate audio format.
+            audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+            self.audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+            self.frames.clear()

-        # Convert recorded frames to the appropriate audio format.
-        audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
-        self.audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
-        self.frames.clear()
+            # Reset recording-related timestamps
+            self.recording_stop_time = 0
+            self.listen_start = 0

-        # Reset recording-related timestamps
-        self.recording_stop_time = 0
-        self.listen_start = 0
+            self._set_state("inactive")

-        self._set_state("inactive")
+        except KeyboardInterrupt:
+            logging.info("KeyboardInterrupt in wait_audio, shutting down")
+            self.shutdown()
+            raise  # Re-raise the exception after cleanup

     def transcribe(self):
         """
@@ -1184,18 +1219,22 @@
         """
         self._set_state("transcribing")
         audio_copy = copy.deepcopy(self.audio)
-        start_time = time.time()  # Start timing
+        start_time = 0
         with self.transcription_lock:
+            
             try:
                 if self.transcribe_count == 0:
+                    logging.debug("Adding transcription request, no early transcription started")
+                    start_time = time.time()  # Start timing
                     self.parent_transcription_pipe.send((self.audio, self.language))
                     self.transcribe_count += 1

                 while self.transcribe_count > 0:
-                    logging.debug("Receive from parent_transcription_pipe pipe after sendiung transcription request")
+                    logging.debug(F"Receive from parent_transcription_pipe after sending transcription request, transcribe_count: {self.transcribe_count}")
                     status, result = self.parent_transcription_pipe.recv()
                     self.transcribe_count -= 1

+                self.allowed_to_early_transcribe = True
                 self._set_state("inactive")
                 if status == 'success':
                     segments, info = result
@@ -1206,8 +1245,11 @@ class AudioToTextRecorder:
                     end_time = time.time()  # End timing
                     transcription_time = end_time - start_time

-                    if self.log_transcription_time:
-                        logging.info(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
+                    if start_time:
+                        if self.print_transcription_time:
+                            print(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
+                        else:
+                            logging.debug(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
                     return transcription
                 else:
                     logging.error(f"Transcription error: {result}")
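The counter exists because an early transcription may already be in flight when `transcribe()` runs: it only submits (and times) a request of its own when `transcribe_count` is zero, then drains every outstanding result, keeping the last. A functional condensation of that flow (function and argument names are illustrative; the real code keeps the counter on the instance):

```python
import time

def collect_final_transcription(pipe, audio, language, pending_count):
    """Drain `pending_count` in-flight results from `pipe`, submitting our
    own request first if nothing is pending; returns (result, elapsed)."""
    start_time = 0.0
    if pending_count == 0:            # nothing in flight yet
        start_time = time.time()      # only time requests submitted here
        pipe.send((audio, language))
        pending_count += 1

    status = result = None
    while pending_count > 0:          # drain early + own requests
        status, result = pipe.recv()  # the last received result wins
        pending_count -= 1

    elapsed = time.time() - start_time if start_time else 0.0
    return result, elapsed
```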
@@ -1280,11 +1322,14 @@ class AudioToTextRecorder:
         Returns (if not callback is set):
             str: The transcription of the recorded audio
         """
-
         self.interrupt_stop_event.clear()
         self.was_interrupted.clear()
-
-        self.wait_audio()
+        try:
+            self.wait_audio()
+        except KeyboardInterrupt:
+            logging.info("KeyboardInterrupt in text() method")
+            self.shutdown()
+            raise  # Re-raise the exception after cleanup

         if self.is_shut_down or self.interrupt_stop_event.is_set():
             if self.interrupt_stop_event.is_set():
@@ -1293,10 +1338,51 @@

         if on_transcription_finished:
             threading.Thread(target=on_transcription_finished,
-                             args=(self.transcribe(),)).start()
+                            args=(self.transcribe(),)).start()
         else:
             return self.transcribe()

+    # def text(self,
+    #          on_transcription_finished=None,
+    #          ):
+    #     """
+    #     Transcribes audio captured by this class instance
+    #     using the `faster_whisper` model.
+
+    #     - Automatically starts recording upon voice activity if not manually
+    #       started using `recorder.start()`.
+    #     - Automatically stops recording upon voice deactivity if not manually
+    #       stopped with `recorder.stop()`.
+    #     - Processes the recorded audio to generate transcription.
+
+    #     Args:
+    #         on_transcription_finished (callable, optional): Callback function
+    #           to be executed when transcription is ready.
+    #         If provided, transcription will be performed asynchronously, and
+    #           the callback will receive the transcription as its argument.
+    #           If omitted, the transcription will be performed synchronously,
+    #           and the result will be returned.
+
+    #     Returns (if not callback is set):
+    #         str: The transcription of the recorded audio
+    #     """
+
+    #     self.interrupt_stop_event.clear()
+    #     self.was_interrupted.clear()
+
+    #     self.wait_audio()
+
+    #     if self.is_shut_down or self.interrupt_stop_event.is_set():
+    #         if self.interrupt_stop_event.is_set():
+    #             self.was_interrupted.set()
+    #         return ""
+
+    #     if on_transcription_finished:
+    #         threading.Thread(target=on_transcription_finished,
+    #                          args=(self.transcribe(),)).start()
+    #     else:
+    #         return self.transcribe()
+
     def start(self):
         """
         Starts recording audio directly without waiting for voice activity.
@@ -1412,6 +1498,9 @@ class AudioToTextRecorder:
         recording worker and closing the audio stream.
         """

+        print("RealtimeSTT shutting down")
+        logging.debug("RealtimeSTT shutting down")
+
         # Force wait_audio() and text() to exit
         self.is_shut_down = True
         self.start_recording_event.set()
@@ -1467,10 +1556,12 @@ class AudioToTextRecorder:
         logging.debug('Starting recording worker')

         try:
+            time_since_last_buffer_message = 0
             was_recording = False
             delay_was_passed = False
             wakeword_detected_time = None
             wakeword_samples_to_remove = None
+            self.allowed_to_early_transcribe = True

             # Continuously monitor audio for voice activity
             while self.is_running:
@@ -1490,11 +1581,13 @@ class AudioToTextRecorder:
                         # Handle queue overflow
                         if (self.audio_queue.qsize() >
                                 self.allowed_latency_limit):
+                            logging.warning("!!! ### !!! ### !!!")
                             logging.warning("Audio queue size exceeds "
                                             "latency limit. Current size: "
                                             f"{self.audio_queue.qsize()}. "
                                             "Discarding old audio chunks."
                                             )
+                            logging.warning("!!! ### !!! ### !!!")

                         while (self.audio_queue.qsize() >
                                 self.allowed_latency_limit):
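Raising `ALLOWED_LATENCY_LIMIT` from 10 to 100 gives the recorder far more slack before it drops audio; the discard loop itself is unchanged. The bounded-backlog pattern as a standalone sketch (names are illustrative):

```python
import logging
import queue

def trim_backlog(audio_queue, limit):
    """Discard the oldest chunks once the backlog exceeds `limit`."""
    if audio_queue.qsize() > limit:
        logging.warning(f"Backlog {audio_queue.qsize()} > {limit}; discarding old chunks.")
    while audio_queue.qsize() > limit:
        try:
            audio_queue.get_nowait()  # drop the oldest queued chunk
        except queue.Empty:
            break
```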
@@ -1506,8 +1599,18 @@ class AudioToTextRecorder:
                     self.is_running = False
                     break

+                # Feed the extracted data to the audio_queue
+                if time_since_last_buffer_message:
+                    time_passed = time.time() - time_since_last_buffer_message
+                    if time_passed > 1:
+                        logging.debug("_recording_worker processing audio data")
+                        time_since_last_buffer_message = time.time()
+                else:
+                    time_since_last_buffer_message = time.time()
+
+                failed_stop_attempt = False
+
                 if not self.is_recording:
-                    logging.info(f"not recording, state: {self.state}, self.recording_stop_time: {self.recording_stop_time}, self.listen_start: {self.listen_start}")
                     # Handle not recording state
                     time_since_listen_start = (time.time() - self.listen_start
                                                if self.listen_start else 0)
@@ -1538,7 +1641,6 @@ class AudioToTextRecorder:
                             else:
                                 self._set_state("inactive")

-                    #self.wake_word_detect_time = time.time()
                     if self.use_wake_words and wake_word_activation_delay_passed:
                         try:
                             wakeword_index = self._process_wakeword(data)
@@ -1554,6 +1656,7 @@

                         # If a wake word is detected
                         if wakeword_index >= 0:
+                            self.wake_word_detect_time = time.time()
                             wakeword_detected_time = time.time()
                             wakeword_samples_to_remove = int(self.sample_rate * self.wake_word_buffer_duration)
                             self.wakeword_detected = True
@@ -1612,31 +1715,62 @@ class AudioToTextRecorder:
                             else self._is_webrtc_speech(data, True)
                         )

+                        if not self.speech_end_silence_start:
+                            str_speech_end_silence_start = "0"
+                        else:
+                            str_speech_end_silence_start = datetime.datetime.fromtimestamp(self.speech_end_silence_start).strftime('%H:%M:%S.%f')[:-3]
+                        logging.debug(f"is_speech: {is_speech}, str_speech_end_silence_start: {str_speech_end_silence_start}")
+
                         if not is_speech:
                             # Voice deactivity was detected, so we start
                             # measuring silence time before stopping recording
-                            if self.speech_end_silence_start == 0:
+                            if self.speech_end_silence_start == 0 and \
+                                (time.time() - self.recording_start_time > self.min_length_of_recording):
+
                                 self.speech_end_silence_start = time.time()
-                                if self.early_transcription_on_silence and len(self.frames) > 0:
-                                     audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
-                                     audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
-                                     self.parent_transcription_pipe.send((audio, self.language))
-                                     self.transcribe_count += 1                                
+
+                            if self.speech_end_silence_start and self.early_transcription_on_silence and len(self.frames) > 0 and \
+                                (time.time() - self.speech_end_silence_start > self.early_transcription_on_silence) and \
+                                self.allowed_to_early_transcribe:
+                                    logging.debug("Adding early transcription request")
+                                    self.transcribe_count += 1
+                                    audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+                                    audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+                                    self.parent_transcription_pipe.send((audio, self.language))
+                                    self.allowed_to_early_transcribe = False
+
                         else:
-                            self.speech_end_silence_start = 0
+                            if self.speech_end_silence_start:
+                                logging.info("Resetting self.speech_end_silence_start")
+                                self.speech_end_silence_start = 0
+                                self.allowed_to_early_transcribe = True
+

                         # Wait for silence to stop recording after speech
                         if self.speech_end_silence_start and time.time() - \
                                 self.speech_end_silence_start >= \
                                 self.post_speech_silence_duration:
-                            logging.info("voice deactivity detected")
+
+                            # Get time in desired format (HH:MM:SS.nnn)
+                            silence_start_time = datetime.datetime.fromtimestamp(self.speech_end_silence_start).strftime('%H:%M:%S.%f')[:-3]
+
+                            # Calculate time difference
+                            time_diff = time.time() - self.speech_end_silence_start
+
+                            logging.info(f"voice deactivity detected at {silence_start_time}, "
+                                        f"time since silence start: {time_diff:.3f} seconds")
+
                             self.frames.append(data)
                             self.stop()
+                            if not self.is_recording:
+                                self.speech_end_silence_start = 0

-                            if not self.use_wake_words:
-                                self.listen_start = time.time()
-                                self._set_state("listening")
-                                self.start_recording_on_voice_activity = True    
+                                if not self.use_wake_words:
+                                    self.listen_start = time.time()
+                                    self._set_state("listening")
+                                    self.start_recording_on_voice_activity = True
+                            else:
+                                failed_stop_attempt = True

                 if not self.is_recording and was_recording:
                     # Reset after stopping recording to ensure clean state
 
 
                 was_recording = self.is_recording
                 was_recording = self.is_recording
 
 
-                if self.is_recording:
+                if self.is_recording and not failed_stop_attempt:
                     self.frames.append(data)

                 if not self.is_recording or self.speech_end_silence_start:
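Condensed, the reworked silence handling above walks through four guarded steps: start the silence clock only after a minimum recording length, fire at most one early transcription per silence window, reset and re-arm when speech resumes, and detect a failed stop so the current chunk is not appended twice. A schematic re-implementation under assumed callback hooks (`on_early` and `on_stop` are illustrative):

```python
import time

class SilenceGate:
    """Schematic of the reworked silence handling in _recording_worker."""
    def __init__(self, min_length, early_delay, post_pause, on_early, on_stop):
        self.min_length = min_length    # minimum recording length (seconds)
        self.early_delay = early_delay  # delay before early transcription
        self.post_pause = post_pause    # full post-speech silence duration
        self.on_early, self.on_stop = on_early, on_stop
        self.silence_start = 0.0
        self.early_armed = True

    def update(self, is_speech, recording_started_at):
        now = time.time()
        if not is_speech:
            # 1. Start the silence clock only after a minimum recording length.
            if not self.silence_start and now - recording_started_at > self.min_length:
                self.silence_start = now
            # 2. Fire at most one early transcription per silence window.
            if self.silence_start and self.early_armed and \
                    now - self.silence_start > self.early_delay:
                self.on_early()
                self.early_armed = False
        else:
            # 3. Speech resumed: reset the clock and re-arm early transcription.
            self.silence_start = 0.0
            self.early_armed = True
        # 4. Stop after the full post-speech silence has elapsed; report
        #    whether the stop attempt failed (recording still running).
        if self.silence_start and now - self.silence_start >= self.post_pause:
            return self.on_stop()  # True -> failed stop attempt
        return False
```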
@@ -1753,7 +1887,7 @@ class AudioToTextRecorder:
                     if self.is_recording and time.time() - \
                             self.recording_start_time > 0.5:

-                        logging.debug('Starting realtime transcription')
+                        # logging.debug('Starting realtime transcription')
                         self.realtime_transcription_text = realtime_text
                         self.realtime_transcription_text = \
                             self.realtime_transcription_text.strip()

tests/realtimestt_test.py (+55 −16)

@@ -1,22 +1,46 @@
-from RealtimeSTT import AudioToTextRecorder
-from colorama import Fore, Back, Style
-import colorama
-import os
-
 if __name__ == '__main__':

+    EXTENDED_LOGGING = False
+
+    if EXTENDED_LOGGING:
+        import logging
+        logging.basicConfig(level=logging.DEBUG)
+
+    import os
+    import sys
+    from RealtimeSTT import AudioToTextRecorder
+    from colorama import Fore, Back, Style
+    import colorama
+
+    if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
+        from torchaudio._extension.utils import _init_dll_path
+        _init_dll_path()    
+
     print("Initializing RealtimeSTT test...")

     colorama.init()

     full_sentences = []
     displayed_text = ""
+    prev_text = ""
+    recorder = None
+
+    end_of_sentence_detection_pause = 0.4
+    mid_sentence_detection_pause = 0.7

     def clear_console():
         os.system('clear' if os.name == 'posix' else 'cls')

     def text_detected(text):
-        global displayed_text
+        global displayed_text, prev_text
+        sentence_end_marks = ['.', '!', '?', '。'] 
+        if text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
+            recorder.post_speech_silence_duration = end_of_sentence_detection_pause
+        else:
+            recorder.post_speech_silence_duration = mid_sentence_detection_pause
+
+        prev_text = text
+
         sentences_with_style = [
             f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
             for i, sentence in enumerate(full_sentences)
@@ -26,33 +50,48 @@ if __name__ == '__main__':
         if new_text != displayed_text:
             displayed_text = new_text
             clear_console()
-            print(f"Language: {recorder.detected_language} (realtime: {recorder.detected_realtime_language})")
             print(displayed_text, end="", flush=True)

     def process_text(text):
+        recorder.post_speech_silence_duration = end_of_sentence_detection_pause
         full_sentences.append(text)
+        prev_text = ""
         text_detected("")

+    # Recorder configuration
     recorder_config = {
         'spinner': False,
         'model': 'large-v2',
-        'silero_sensitivity': 0.4,
-        'webrtc_sensitivity': 2,
-        'post_speech_silence_duration': 0.4,
+        'realtime_model_type': 'tiny.en',
+        'language': 'en',
+        'input_device_index': 1,
+        'silero_sensitivity': 0.05,
+        'webrtc_sensitivity': 3,
+        'post_speech_silence_duration': end_of_sentence_detection_pause,
         'min_length_of_recording': 0,
-        'min_gap_between_recordings': 0,
+        'min_gap_between_recordings': 0,                
         'enable_realtime_transcription': True,
-        'realtime_processing_pause': 0.2,
-        'realtime_model_type': 'tiny',
-        'on_realtime_transcription_update': text_detected, 
+        'realtime_processing_pause': 0.1,
+        'on_realtime_transcription_update': text_detected,
         'silero_deactivity_detection': True,
+        'min_length_of_recording': 0.7,        
+        'early_transcription_on_silence': 0.2,
+        'beam_size': 5,
+        'beam_size_realtime': 1,
+        'no_log_file': False,
     }

+    if EXTENDED_LOGGING:
+        recorder_config['level'] = logging.DEBUG
+
     recorder = AudioToTextRecorder(**recorder_config)

     clear_console()
     print("Say something...", end="", flush=True)

-    while True:
-        recorder.text(process_text)

+    try:
+        while (True):
+            recorder.text(process_text)
+    except KeyboardInterrupt:
+        print("Exiting application due to keyboard interrupt")
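The updated test also shortens `post_speech_silence_duration` once two consecutive realtime updates end in sentence-final punctuation, so clear sentence endings commit faster while mid-sentence pauses get more slack. The heuristic in isolation (a sketch; values taken from the test above):

```python
SENTENCE_END_MARKS = ['.', '!', '?', '。']
END_PAUSE, MID_PAUSE = 0.4, 0.7  # end_of_sentence / mid_sentence pauses

def pick_pause(text, prev_text):
    """Shorter pause when two successive updates both end a sentence."""
    if text and text[-1] in SENTENCE_END_MARKS and \
            prev_text and prev_text[-1] in SENTENCE_END_MARKS:
        return END_PAUSE
    return MID_PAUSE
```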