9 месяцев назад · 7185200e63
--- a/RealtimeSTT/audio_recorder.py
+++ b/RealtimeSTT/audio_recorder.py
@@ -159,6 +159,7 @@ class AudioToTextRecorder:
 
				                  sample_rate: int = SAMPLE_RATE,
			
 
				                  initial_prompt: Optional[Union[str, Iterable[int]]] = None,
			
 
				                  suppress_tokens: Optional[List[int]] = [-1],
			
 
				+                 log_transcription_time: bool = False
			
 
				                  ):
			
 
				         """
			
 
				         Initializes an audio recorder and  transcription
			
@@ -429,18 +430,25 @@ class AudioToTextRecorder:
 
				         self.detected_realtime_language = None
			
 
				         self.detected_realtime_language_probability = 0
			
 
				         self.transcription_lock = threading.Lock()
			
 
				+        self.transcribe_count = 0
			
 
				+        self.log_transcription_time = log_transcription_time
			
 
				 
			
 
				         # Initialize the logging configuration with the specified level
			
 
				         log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
			
 
				+        file_log_format = '%(asctime)s - ' + log_format
			
 
				 
			
 
				-        # Create a logger
			
 
				+        # Get the root logger
			
 
				         logger = logging.getLogger()
			
 
				-        logger.setLevel(level)  # Set the root logger's level
			
 
				+        logger.setLevel(level)  # Set the logger's level
			
 
				+        logger.propagate = False  # Prevent propagation to higher-level loggers
			
 
				+
			
 
				+        # Remove any existing handlers
			
 
				+        logger.handlers = []
			
 
				 
			
 
				         # Create a file handler and set its level
			
 
				         file_handler = logging.FileHandler('realtimesst.log')
			
 
				         file_handler.setLevel(logging.DEBUG)
			
 
				-        file_handler.setFormatter(logging.Formatter(log_format))
			
 
				+        file_handler.setFormatter(logging.Formatter(file_log_format, datefmt='%Y-%m-%d %H:%M:%S'))
			
 
				 
			
 
				         # Create a console handler and set its level
			
 
				         console_handler = logging.StreamHandler()
			
@@ -450,15 +458,16 @@ class AudioToTextRecorder:
 
				         # Add the handlers to the logger
			
 
				         logger.addHandler(file_handler)
			
 
				         logger.addHandler(console_handler)
			
 
				-
			
 
				+        
			
 
				         self.is_shut_down = False
			
 
				         self.shutdown_event = mp.Event()
			
 
				-
			
 
				+        
			
 
				         try:
			
 
				-            logging.debug("Explicitly setting the multiprocessing start method to 'spawn'")
			
 
				-            mp.set_start_method('spawn')
			
 
				+            # Only set the start method if it hasn't been set already
			
 
				+            if mp.get_start_method(allow_none=True) is None:
			
 
				+                mp.set_start_method("spawn")
			
 
				         except RuntimeError as e:
			
 
				-            logging.debug(f"Start method has already been set. Details: {e}")
			
 
				+            logging.info(f"Start method has already been set. Details: {e}")
			
 
				 
			
 
				         logging.info("Starting RealTimeSTT")
			
 
				 
			
@@ -706,9 +715,28 @@ class AudioToTextRecorder:
 
				 
			
 
				     def _read_stdout(self):
			
 
				         while not self.shutdown_event.is_set():
			
 
				-            if self.parent_stdout_pipe.poll(0.1):
			
 
				-                message = self.parent_stdout_pipe.recv()
			
 
				-                print(message, flush=True)
			
 
				+            try:
			
 
				+                if self.parent_stdout_pipe.poll(0.1):
			
 
				+                    logging.debug("Receive from stdout pipe")
			
 
				+                    message = self.parent_stdout_pipe.recv()
			
 
				+                    logging.info(message)
			
 
				+            except (BrokenPipeError, EOFError, OSError):
			
 
				+                # The pipe probably has been closed, so we ignore the error
			
 
				+                pass
			
 
				+            # except BrokenPipeError as e:  # handle broken pipe error
			
 
				+            #     pass                
			
 
				+            # except EOFError as e:
			
 
				+            #     logging.error(f"EOFError in read from stdout: {e}")
			
 
				+            #     logging.error(traceback.format_exc())
			
 
				+            #     break            
			
 
				+            except KeyboardInterrupt:  # handle manual interruption (Ctrl+C)
			
 
				+                logging.info("KeyboardInterrupt in read from stdout detected, exiting...")
			
 
				+                break
			
 
				+            except Exception as e:
			
 
				+                logging.error(f"Unexpected error in read from stdout: {e}")
			
 
				+                logging.error(traceback.format_exc())  # Log the full traceback here
			
 
				+                break 
			
 
				+            time.sleep(0.1)
			
 
				 
			
 
				     @staticmethod
			
 
				     def _transcription_worker(conn,
			
@@ -762,7 +790,11 @@ class AudioToTextRecorder:
 
				         """
			
 
				         def custom_print(*args, **kwargs):
			
 
				             message = ' '.join(map(str, args))
			
 
				-            stdout_pipe.send(message)
			
 
				+            try:
			
 
				+                stdout_pipe.send(message)
			
 
				+            except (BrokenPipeError, EOFError, OSError):
			
 
				+                # The pipe probably has been closed, so we ignore the error
			
 
				+                pass
			
 
				 
			
 
				         # Replace the built-in print function with our custom one
			
 
				         __builtins__['print'] = custom_print
			
@@ -791,33 +823,48 @@ class AudioToTextRecorder:
 
				                       "transcription model initialized successfully"
			
 
				                       )
			
 
				 
			
 
				-        while not shutdown_event.is_set():
			
 
				-            try:
			
 
				-                if conn.poll(0.01):
			
 
				-                    audio, language = conn.recv()
			
 
				-                    try:
			
 
				-                        segments, info = model.transcribe(
			
 
				-                            audio,
			
 
				-                            language=language if language else None,
			
 
				-                            beam_size=beam_size,
			
 
				-                            initial_prompt=initial_prompt,
			
 
				-                            suppress_tokens=suppress_tokens
			
 
				-                        )
			
 
				-                        transcription = " ".join(seg.text for seg in segments)
			
 
				-                        transcription = transcription.strip()
			
 
				-                        conn.send(('success', (transcription, info)))
			
 
				-                    except Exception as e:
			
 
				-                        logging.error(f"General transcription error: {e}")
			
 
				-                        conn.send(('error', str(e)))
			
 
				-                else:
			
 
				-                    time.sleep(TIME_SLEEP)
			
 
				+        try:
			
 
				+            while not shutdown_event.is_set():
			
 
				+                try:
			
 
				+                    if conn.poll(0.01):
			
 
				+                        logging.debug("Receive from _transcription_worker  pipe")
			
 
				+                        audio, language = conn.recv()
			
 
				+                        try:
			
 
				+                            segments, info = model.transcribe(
			
 
				+                                audio,
			
 
				+                                language=language if language else None,
			
 
				+                                beam_size=beam_size,
			
 
				+                                initial_prompt=initial_prompt,
			
 
				+                                suppress_tokens=suppress_tokens
			
 
				+                            )
			
 
				+                            transcription = " ".join(seg.text for seg in segments)
			
 
				+                            transcription = transcription.strip()
			
 
				+                            logging.debug(f"Final text detected with main model: {transcription}")
			
 
				+                            conn.send(('success', (transcription, info)))
			
 
				+                        except Exception as e:
			
 
				+                            logging.error(f"General error in _transcription_worker in transcription: {e}")
			
 
				+                            conn.send(('error', str(e)))
			
 
				+                    else:
			
 
				+                        time.sleep(TIME_SLEEP)
			
 
				 
			
 
				-            except KeyboardInterrupt:
			
 
				-                interrupt_stop_event.set()
			
 
				-                logging.debug("Transcription worker process "
			
 
				-                              "finished due to KeyboardInterrupt"
			
 
				-                              )
			
 
				-                break
			
 
				+
			
 
				+
			
 
				+                except KeyboardInterrupt:
			
 
				+                    interrupt_stop_event.set()
			
 
				+                    
			
 
				+                    logging.debug("Transcription worker process "
			
 
				+                                    "finished due to KeyboardInterrupt"
			
 
				+                                    )
			
 
				+                    stdout_pipe.close()
			
 
				+                    break
			
 
				+
			
 
				+                except Exception as e:
			
 
				+                    logging.error(f"General error in _transcription_worker in accessing pipe: {e}")
			
 
				+
			
 
				+        finally:
			
 
				+            __builtins__['print'] = print  # Restore the original print function            
			
 
				+            conn.close()
			
 
				+            stdout_pipe.close()
			
 
				 
			
 
				     @staticmethod
			
 
				     def _audio_data_worker(audio_queue,
			
@@ -916,33 +963,47 @@ class AudioToTextRecorder:
 
				         device_sample_rate = None
			
 
				         chunk_size = 1024  # Increased chunk size for better performance
			
 
				 
			
 
				-        try:
			
 
				-            audio_interface = pyaudio.PyAudio()
			
 
				-            if input_device_index is None:
			
 
				-                try:
			
 
				-                    default_device = audio_interface.get_default_input_device_info()
			
 
				-                    input_device_index = default_device['index']
			
 
				-                except OSError as e:
			
 
				-                    input_device_index = None
			
 
				-
			
 
				-
			
 
				-            if input_device_index is not None:
			
 
				-                device_sample_rate = get_highest_sample_rate(audio_interface, input_device_index)
			
 
				-            else:
			
 
				-                device_sample_rate = 16000  # better: try 16000, 48000, ... until it works
			
 
				+        def setup_audio():  
			
 
				+            nonlocal audio_interface, stream, device_sample_rate, input_device_index
			
 
				+            try:
			
 
				+                audio_interface = pyaudio.PyAudio()
			
 
				+                if input_device_index is None:
			
 
				+                    try:
			
 
				+                        default_device = audio_interface.get_default_input_device_info()
			
 
				+                        input_device_index = default_device['index']
			
 
				+                    except OSError as e:
			
 
				+                        input_device_index = None
			
 
				+
			
 
				+                sample_rates_to_try = [16000]  # Try 16000 Hz first
			
 
				+                if input_device_index is not None:
			
 
				+                    highest_rate = get_highest_sample_rate(audio_interface, input_device_index)
			
 
				+                    if highest_rate != 16000:
			
 
				+                        sample_rates_to_try.append(highest_rate)
			
 
				+                else:
			
 
				+                    sample_rates_to_try.append(48000)  # Fallback sample rate
			
 
				 
			
 
				-            stream = initialize_audio_stream(audio_interface, input_device_index, device_sample_rate, chunk_size)
			
 
				+                for rate in sample_rates_to_try:
			
 
				+                    try:
			
 
				+                        device_sample_rate = rate
			
 
				+                        stream = initialize_audio_stream(audio_interface, input_device_index, device_sample_rate, chunk_size)
			
 
				+                        if stream is not None:
			
 
				+                            logging.debug(f"Audio recording initialized successfully at {device_sample_rate} Hz, reading {chunk_size} frames at a time")
			
 
				+                            return True
			
 
				+                    except Exception as e:
			
 
				+                        logging.warning(f"Failed to initialize audio stream at {device_sample_rate} Hz: {e}")
			
 
				+                        continue
			
 
				 
			
 
				-            if stream is None:
			
 
				-                raise Exception("Failed to initialize audio stream.")
			
 
				+                # If we reach here, none of the sample rates worked
			
 
				+                raise Exception("Failed to initialize audio stream with all sample rates.")
			
 
				 
			
 
				-        except Exception as e:
			
 
				-            logging.exception(f"Error initializing pyaudio audio recording: {e}")
			
 
				-            if audio_interface:
			
 
				-                audio_interface.terminate()
			
 
				-            raise
			
 
				+            except Exception as e:
			
 
				+                logging.exception(f"Error initializing pyaudio audio recording: {e}")
			
 
				+                if audio_interface:
			
 
				+                    audio_interface.terminate()
			
 
				+                return False
			
 
				 
			
 
				-        logging.debug(f"Audio recording initialized successfully at {device_sample_rate} Hz, reading {chunk_size} frames at a time")
			
 
				+        if not setup_audio():
			
 
				+            raise Exception("Failed to set up audio recording.")
			
 
				 
			
 
				         buffer = bytearray()
			
 
				         silero_buffer_size = 2 * buffer_size  # silero complains if too short
			
@@ -950,7 +1011,7 @@ class AudioToTextRecorder:
 
				         try:
			
 
				             while not shutdown_event.is_set():
			
 
				                 try:
			
 
				-                    data = stream.read(chunk_size)
			
 
				+                    data = stream.read(chunk_size, exception_on_overflow=False)
			
 
				                     
			
 
				                     if use_microphone.value:
			
 
				                         processed_data = preprocess_audio(data, device_sample_rate, target_sample_rate)
			
@@ -970,13 +1031,45 @@ class AudioToTextRecorder:
 
				                         logging.warning("Input overflowed. Frame dropped.")
			
 
				                     else:
			
 
				                         logging.error(f"Error during recording: {e}")
			
 
				+                        # Attempt to reinitialize the stream
			
 
				+                        logging.info("Attempting to reinitialize the audio stream...")
			
 
				+                        if stream:
			
 
				+                            stream.stop_stream()
			
 
				+                            stream.close()
			
 
				+                        if audio_interface:
			
 
				+                            audio_interface.terminate()
			
 
				+                        
			
 
				+                        # Wait a bit before trying to reinitialize
			
 
				+                        time.sleep(1)
			
 
				+                        
			
 
				+                        if not setup_audio():
			
 
				+                            logging.error("Failed to reinitialize audio stream. Exiting.")
			
 
				+                            break
			
 
				+                        else:
			
 
				+                            logging.info("Audio stream reinitialized successfully.")
			
 
				                     continue
			
 
				 
			
 
				                 except Exception as e:
			
 
				                     logging.error(f"Error during recording: {e}")
			
 
				                     tb_str = traceback.format_exc()
			
 
				-                    print(f"Traceback: {tb_str}")
			
 
				-                    print(f"Error: {e}")
			
 
				+                    logging.error(f"Traceback: {tb_str}")
			
 
				+                    logging.error(f"Error: {e}")
			
 
				+                    # Attempt to reinitialize the stream
			
 
				+                    logging.info("Attempting to reinitialize the audio stream...")
			
 
				+                    if stream:
			
 
				+                        stream.stop_stream()
			
 
				+                        stream.close()
			
 
				+                    if audio_interface:
			
 
				+                        audio_interface.terminate()
			
 
				+                    
			
 
				+                    # Wait a bit before trying to reinitialize
			
 
				+                    time.sleep(0.5)
			
 
				+                    
			
 
				+                    if not setup_audio():
			
 
				+                        logging.error("Failed to reinitialize audio stream. Exiting.")
			
 
				+                        break
			
 
				+                    else:
			
 
				+                        logging.info("Audio stream reinitialized successfully.")
			
 
				                     continue
			
 
				 
			
 
				         except KeyboardInterrupt:
			
@@ -993,6 +1086,83 @@ class AudioToTextRecorder:
 
				             if audio_interface:
			
 
				                 audio_interface.terminate()
			
 
				 
			
 
				+        # try:
			
 
				+        #     audio_interface = pyaudio.PyAudio()
			
 
				+        #     if input_device_index is None:
			
 
				+        #         try:
			
 
				+        #             default_device = audio_interface.get_default_input_device_info()
			
 
				+        #             input_device_index = default_device['index']
			
 
				+        #         except OSError as e:
			
 
				+        #             input_device_index = None
			
 
				+
			
 
				+
			
 
				+        #     if input_device_index is not None:
			
 
				+        #         device_sample_rate = get_highest_sample_rate(audio_interface, input_device_index)
			
 
				+        #     else:
			
 
				+        #         device_sample_rate = 16000  # better: try 16000, 48000, ... until it works
			
 
				+
			
 
				+        #     stream = initialize_audio_stream(audio_interface, input_device_index, device_sample_rate, chunk_size)
			
 
				+
			
 
				+        #     if stream is None:
			
 
				+        #         raise Exception("Failed to initialize audio stream.")
			
 
				+
			
 
				+        # except Exception as e:
			
 
				+        #     logging.exception(f"Error initializing pyaudio audio recording: {e}")
			
 
				+        #     if audio_interface:
			
 
				+        #         audio_interface.terminate()
			
 
				+        #     raise
			
 
				+
			
 
				+        # logging.debug(f"Audio recording initialized successfully at {device_sample_rate} Hz, reading {chunk_size} frames at a time")
			
 
				+
			
 
				+        # buffer = bytearray()
			
 
				+        # silero_buffer_size = 2 * buffer_size  # silero complains if too short
			
 
				+
			
 
				+        # try:
			
 
				+        #     while not shutdown_event.is_set():
			
 
				+        #         try:
			
 
				+        #             data = stream.read(chunk_size)
			
 
				+                    
			
 
				+        #             if use_microphone.value:
			
 
				+        #                 processed_data = preprocess_audio(data, device_sample_rate, target_sample_rate)
			
 
				+        #                 buffer += processed_data
			
 
				+
			
 
				+        #                 # Check if the buffer has reached or exceeded the silero_buffer_size
			
 
				+        #                 while len(buffer) >= silero_buffer_size:
			
 
				+        #                     # Extract silero_buffer_size amount of data from the buffer
			
 
				+        #                     to_process = buffer[:silero_buffer_size]
			
 
				+        #                     buffer = buffer[silero_buffer_size:]
			
 
				+
			
 
				+        #                     # Feed the extracted data to the audio_queue
			
 
				+        #                     audio_queue.put(to_process)
			
 
				+
			
 
				+        #         except OSError as e:
			
 
				+        #             if e.errno == pyaudio.paInputOverflowed:
			
 
				+        #                 logging.warning("Input overflowed. Frame dropped.")
			
 
				+        #             else:
			
 
				+        #                 logging.error(f"Error during recording: {e}")
			
 
				+        #             continue
			
 
				+
			
 
				+        #         except Exception as e:
			
 
				+        #             logging.error(f"Error during recording: {e}")
			
 
				+        #             tb_str = traceback.format_exc()
			
 
				+        #             print(f"Traceback: {tb_str}")
			
 
				+        #             print(f"Error: {e}")
			
 
				+        #             continue
			
 
				+
			
 
				+        # except KeyboardInterrupt:
			
 
				+        #     interrupt_stop_event.set()
			
 
				+        #     logging.debug("Audio data worker process finished due to KeyboardInterrupt")
			
 
				+        # finally:
			
 
				+        #     # After recording stops, feed any remaining audio data
			
 
				+        #     if buffer:
			
 
				+        #         audio_queue.put(bytes(buffer))
			
 
				+            
			
 
				+        #     if stream:
			
 
				+        #         stream.stop_stream()
			
 
				+        #         stream.close()
			
 
				+        #     if audio_interface:
			
 
				+        #         audio_interface.terminate()
			
 
				+
			
 
				     def wakeup(self):
			
 
				         """
			
 
				         If in wake work modus, wake up as if a wake word was spoken.
			
@@ -1022,7 +1192,9 @@ class AudioToTextRecorder:
 
				         - Modifies the audio attribute to contain the processed audio data.
			
 
				         """
			
 
				 
			
 
				-        self.listen_start = time.time()
			
 
				+        logging.info("Setting listen time")
			
 
				+        if self.listen_start == 0:
			
 
				+            self.listen_start = time.time()
			
 
				 
			
 
				         # If not yet started recording, wait for voice activity to initiate.
			
 
				         if not self.is_recording and not self.frames:
			
@@ -1087,8 +1259,15 @@ class AudioToTextRecorder:
 
				         start_time = time.time()  # Start timing
			
 
				         with self.transcription_lock:
			
 
				             try:
			
 
				-                self.parent_transcription_pipe.send((self.audio, self.language))
			
 
				-                status, result = self.parent_transcription_pipe.recv()
			
 
				+                if self.transcribe_count == 0:
			
 
				+                    self.parent_transcription_pipe.send((self.audio, self.language))
			
 
				+                    self.transcribe_count += 1
			
 
				+
			
 
				+                while self.transcribe_count > 0:
			
 
				+                    logging.debug("Receive from parent_transcription_pipe pipe after sendiung transcription request")
			
 
				+                    status, result = self.parent_transcription_pipe.recv()
			
 
				+                    self.transcribe_count -= 1
			
 
				+
			
 
				                 self._set_state("inactive")
			
 
				                 if status == 'success':
			
 
				                     segments, info = result
			
@@ -1098,7 +1277,9 @@ class AudioToTextRecorder:
 
				                     transcription = self._preprocess_output(segments)
			
 
				                     end_time = time.time()  # End timing
			
 
				                     transcription_time = end_time - start_time
			
 
				-                    # print(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
			
 
				+
			
 
				+                    if self.log_transcription_time:
			
 
				+                        logging.info(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
			
 
				                     return transcription
			
 
				                 else:
			
 
				                     logging.error(f"Transcription error: {result}")
			
@@ -1118,7 +1299,7 @@ class AudioToTextRecorder:
 
				             )
			
 
				             porcupine_index = self.porcupine.process(pcm)
			
 
				             if self.debug_mode:
			
 
				-                print (f"wake words porcupine_index: {porcupine_index}")
			
 
				+                logging.info(f"wake words porcupine_index: {porcupine_index}")
			
 
				             return self.porcupine.process(pcm)
			
 
				 
			
 
				         elif self.wakeword_backend in {'oww', 'openwakeword', 'openwakewords'}:
			
@@ -1135,15 +1316,16 @@ class AudioToTextRecorder:
 
				                         max_score = scores[-1]
			
 
				                         max_index = idx
			
 
				                 if self.debug_mode:
			
 
				-                    print (f"wake words oww max_index, max_score: {max_index} {max_score}")
			
 
				+                    logging.info(f"wake words oww max_index, max_score: {max_index} {max_score}")
			
 
				                 return max_index  
			
 
				             else:
			
 
				                 if self.debug_mode:
			
 
				-                    print (f"wake words oww_index: -1")
			
 
				+                    logging.info(f"wake words oww_index: -1")
			
 
				                 return -1
			
 
				 
			
 
				         if self.debug_mode:        
			
 
				-            print("wake words no match")
			
 
				+            logging.info("wake words no match")
			
 
				+
			
 
				         return -1
			
 
				 
			
 
				     def text(self,
			
@@ -1392,11 +1574,12 @@ class AudioToTextRecorder:
 
				                             data = self.audio_queue.get()
			
 
				 
			
 
				                 except BrokenPipeError:
			
 
				-                    print("BrokenPipeError _recording_worker")
			
 
				+                    logging.error("BrokenPipeError _recording_worker")
			
 
				                     self.is_running = False
			
 
				                     break
			
 
				 
			
 
				                 if not self.is_recording:
			
 
				+                    logging.info(f"not recording, state: {self.state}, self.recording_stop_time: {self.recording_stop_time}, self.listen_start: {self.listen_start}")
			
 
				                     # Handle not recording state
			
 
				                     time_since_listen_start = (time.time() - self.listen_start
			
 
				                                                if self.listen_start else 0)
			
@@ -1506,6 +1689,11 @@ class AudioToTextRecorder:
 
				                             # measuring silence time before stopping recording
			
 
				                             if self.speech_end_silence_start == 0:
			
 
				                                 self.speech_end_silence_start = time.time()
			
 
				+                                # if(len(self.frames) > 0):
			
 
				+                                #     audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
			
 
				+                                #     audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
			
 
				+                                #     self.parent_transcription_pipe.send((audio, self.language))
			
 
				+                                #     self.transcribe_count += 1                                
			
 
				                         else:
			
 
				                             self.speech_end_silence_start = 0
			
 
				 
			
@@ -1515,7 +1703,16 @@ class AudioToTextRecorder:
 
				                                 self.post_speech_silence_duration:
			
 
				                             logging.info("voice deactivity detected")
			
 
				                             self.frames.append(data)
			
 
				+                            logging.info("stopping recording")
			
 
				                             self.stop()
			
 
				+                            logging.info("stopped recording")
			
 
				+
			
 
				+
			
 
				+                            ####
			
 
				+                            if not self.use_wake_words:
			
 
				+                                self.listen_start = time.time()
			
 
				+                                self._set_state("listening")
			
 
				+                                self.start_recording_on_voice_activity = True    
			
 
				 
			
 
				                 if not self.is_recording and was_recording:
			
 
				                     # Reset after stopping recording to ensure clean state
			
@@ -1583,6 +1780,8 @@ class AudioToTextRecorder:
 
				                         dtype=np.int16
			
 
				                         )
			
 
				 
			
 
				+                    logging.debug(f"Current realtime buffer size: {len(audio_array)}")
			
 
				+
			
 
				                     # Normalize the array to a [-1, 1] range
			
 
				                     audio_array = audio_array.astype(np.float32) / \
			
 
				                         INT16_MAX_ABS_VALUE
			
@@ -1592,12 +1791,14 @@ class AudioToTextRecorder:
 
				                             try:
			
 
				                                 self.parent_transcription_pipe.send((audio_array, self.language))
			
 
				                                 if self.parent_transcription_pipe.poll(timeout=5):  # Wait for 5 seconds
			
 
				+                                    logging.debug("Receive from realtime worker after transcription request to main model")
			
 
				                                     status, result = self.parent_transcription_pipe.recv()
			
 
				                                     if status == 'success':
			
 
				                                         segments, info = result
			
 
				                                         self.detected_realtime_language = info.language if info.language_probability > 0 else None
			
 
				                                         self.detected_realtime_language_probability = info.language_probability
			
 
				                                         realtime_text = segments
			
 
				+                                        logging.debug(f"Realtime text detected with main model: {realtime_text}")
			
 
				                                     else:
			
 
				                                         logging.error(f"Realtime transcription error: {result}")
			
 
				                                         continue
			
@@ -1622,6 +1823,7 @@ class AudioToTextRecorder:
 
				                         realtime_text = " ".join(
			
 
				                             seg.text for seg in segments
			
 
				                         )
			
 
				+                        logging.debug(f"Realtime text detected: {realtime_text}")
			
 
				 
			
 
				                     # double check recording state
			
 
				                     # because it could have changed mid-transcription
			
@@ -1764,19 +1966,19 @@ class AudioToTextRecorder:
 
				                 speech_frames += 1
			
 
				                 if not all_frames_must_be_true:
			
 
				                     if self.debug_mode:
			
 
				-                        print(f"Speech detected in frame {i + 1}"
			
 
				+                        logging.info(f"Speech detected in frame {i + 1}"
			
 
				                               f" of {num_frames}")
			
 
				                     return True
			
 
				         if all_frames_must_be_true:
			
 
				             if self.debug_mode and speech_frames == num_frames:
			
 
				-                print(f"Speech detected in {speech_frames} of "
			
 
				+                logging.info(f"Speech detected in {speech_frames} of "
			
 
				                       f"{num_frames} frames")
			
 
				             elif self.debug_mode:
			
 
				-                print(f"Speech not detected in all {num_frames} frames")
			
 
				+                logging.info(f"Speech not detected in all {num_frames} frames")
			
 
				             return speech_frames == num_frames
			
 
				         else:
			
 
				             if self.debug_mode:
			
 
				-                print(f"Speech not detected in any of {num_frames} frames")
			
 
				+                logging.info(f"Speech not detected in any of {num_frames} frames")
			
 
				             return False
			
 
				 
			
 
				     def _check_voice_activity(self, data):
			
@@ -1841,6 +2043,9 @@ class AudioToTextRecorder:
 
				         # Update to the new state
			
 
				         self.state = new_state
			
 
				 
			
 
				+        # Log the state change
			
 
				+        logging.info(f"State changed from '{old_state}' to '{new_state}'")
			
 
				+
			
 
				         # Execute callbacks based on transitioning FROM a particular state
			
 
				         if old_state == "listening":
			
 
				             if self.on_vad_detect_stop: