KoljaB 7 месяцев назад
Родитель
Commit
7185200e63
1 измененных файлов с 282 добавлено и 77 удалено
  1. 282 77
      RealtimeSTT/audio_recorder.py

+ 282 - 77
RealtimeSTT/audio_recorder.py

@@ -159,6 +159,7 @@ class AudioToTextRecorder:
                  sample_rate: int = SAMPLE_RATE,
                  initial_prompt: Optional[Union[str, Iterable[int]]] = None,
                  suppress_tokens: Optional[List[int]] = [-1],
+                 log_transcription_time: bool = False
                  ):
         """
         Initializes an audio recorder and  transcription
@@ -429,18 +430,25 @@ class AudioToTextRecorder:
         self.detected_realtime_language = None
         self.detected_realtime_language_probability = 0
         self.transcription_lock = threading.Lock()
+        self.transcribe_count = 0
+        self.log_transcription_time = log_transcription_time
 
         # Initialize the logging configuration with the specified level
         log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
+        file_log_format = '%(asctime)s - ' + log_format
 
-        # Create a logger
+        # Get the root logger
         logger = logging.getLogger()
-        logger.setLevel(level)  # Set the root logger's level
+        logger.setLevel(level)  # Set the logger's level
+        logger.propagate = False  # Prevent propagation to higher-level loggers
+
+        # Remove any existing handlers
+        logger.handlers = []
 
         # Create a file handler and set its level
         file_handler = logging.FileHandler('realtimesst.log')
         file_handler.setLevel(logging.DEBUG)
-        file_handler.setFormatter(logging.Formatter(log_format))
+        file_handler.setFormatter(logging.Formatter(file_log_format, datefmt='%Y-%m-%d %H:%M:%S'))
 
         # Create a console handler and set its level
         console_handler = logging.StreamHandler()
@@ -450,15 +458,16 @@ class AudioToTextRecorder:
         # Add the handlers to the logger
         logger.addHandler(file_handler)
         logger.addHandler(console_handler)
-
+        
         self.is_shut_down = False
         self.shutdown_event = mp.Event()
-
+        
         try:
-            logging.debug("Explicitly setting the multiprocessing start method to 'spawn'")
-            mp.set_start_method('spawn')
+            # Only set the start method if it hasn't been set already
+            if mp.get_start_method(allow_none=True) is None:
+                mp.set_start_method("spawn")
         except RuntimeError as e:
-            logging.debug(f"Start method has already been set. Details: {e}")
+            logging.info(f"Start method has already been set. Details: {e}")
 
         logging.info("Starting RealTimeSTT")
 
@@ -706,9 +715,28 @@ class AudioToTextRecorder:
 
     def _read_stdout(self):
         while not self.shutdown_event.is_set():
-            if self.parent_stdout_pipe.poll(0.1):
-                message = self.parent_stdout_pipe.recv()
-                print(message, flush=True)
+            try:
+                if self.parent_stdout_pipe.poll(0.1):
+                    logging.debug("Receive from stdout pipe")
+                    message = self.parent_stdout_pipe.recv()
+                    logging.info(message)
+            except (BrokenPipeError, EOFError, OSError):
+                # The pipe probably has been closed, so we ignore the error
+                pass
+            # except BrokenPipeError as e:  # handle broken pipe error
+            #     pass                
+            # except EOFError as e:
+            #     logging.error(f"EOFError in read from stdout: {e}")
+            #     logging.error(traceback.format_exc())
+            #     break            
+            except KeyboardInterrupt:  # handle manual interruption (Ctrl+C)
+                logging.info("KeyboardInterrupt in read from stdout detected, exiting...")
+                break
+            except Exception as e:
+                logging.error(f"Unexpected error in read from stdout: {e}")
+                logging.error(traceback.format_exc())  # Log the full traceback here
+                break 
+            time.sleep(0.1)
 
     @staticmethod
     def _transcription_worker(conn,
@@ -762,7 +790,11 @@ class AudioToTextRecorder:
         """
         def custom_print(*args, **kwargs):
             message = ' '.join(map(str, args))
-            stdout_pipe.send(message)
+            try:
+                stdout_pipe.send(message)
+            except (BrokenPipeError, EOFError, OSError):
+                # The pipe probably has been closed, so we ignore the error
+                pass
 
         # Replace the built-in print function with our custom one
         __builtins__['print'] = custom_print
@@ -791,33 +823,48 @@ class AudioToTextRecorder:
                       "transcription model initialized successfully"
                       )
 
-        while not shutdown_event.is_set():
-            try:
-                if conn.poll(0.01):
-                    audio, language = conn.recv()
-                    try:
-                        segments, info = model.transcribe(
-                            audio,
-                            language=language if language else None,
-                            beam_size=beam_size,
-                            initial_prompt=initial_prompt,
-                            suppress_tokens=suppress_tokens
-                        )
-                        transcription = " ".join(seg.text for seg in segments)
-                        transcription = transcription.strip()
-                        conn.send(('success', (transcription, info)))
-                    except Exception as e:
-                        logging.error(f"General transcription error: {e}")
-                        conn.send(('error', str(e)))
-                else:
-                    time.sleep(TIME_SLEEP)
+        try:
+            while not shutdown_event.is_set():
+                try:
+                    if conn.poll(0.01):
+                        logging.debug("Receive from _transcription_worker  pipe")
+                        audio, language = conn.recv()
+                        try:
+                            segments, info = model.transcribe(
+                                audio,
+                                language=language if language else None,
+                                beam_size=beam_size,
+                                initial_prompt=initial_prompt,
+                                suppress_tokens=suppress_tokens
+                            )
+                            transcription = " ".join(seg.text for seg in segments)
+                            transcription = transcription.strip()
+                            logging.debug(f"Final text detected with main model: {transcription}")
+                            conn.send(('success', (transcription, info)))
+                        except Exception as e:
+                            logging.error(f"General error in _transcription_worker in transcription: {e}")
+                            conn.send(('error', str(e)))
+                    else:
+                        time.sleep(TIME_SLEEP)
 
-            except KeyboardInterrupt:
-                interrupt_stop_event.set()
-                logging.debug("Transcription worker process "
-                              "finished due to KeyboardInterrupt"
-                              )
-                break
+
+
+                except KeyboardInterrupt:
+                    interrupt_stop_event.set()
+                    
+                    logging.debug("Transcription worker process "
+                                    "finished due to KeyboardInterrupt"
+                                    )
+                    stdout_pipe.close()
+                    break
+
+                except Exception as e:
+                    logging.error(f"General error in _transcription_worker in accessing pipe: {e}")
+
+        finally:
+            __builtins__['print'] = print  # Restore the original print function            
+            conn.close()
+            stdout_pipe.close()
 
     @staticmethod
     def _audio_data_worker(audio_queue,
@@ -916,33 +963,47 @@ class AudioToTextRecorder:
         device_sample_rate = None
         chunk_size = 1024  # Increased chunk size for better performance
 
-        try:
-            audio_interface = pyaudio.PyAudio()
-            if input_device_index is None:
-                try:
-                    default_device = audio_interface.get_default_input_device_info()
-                    input_device_index = default_device['index']
-                except OSError as e:
-                    input_device_index = None
-
-
-            if input_device_index is not None:
-                device_sample_rate = get_highest_sample_rate(audio_interface, input_device_index)
-            else:
-                device_sample_rate = 16000  # better: try 16000, 48000, ... until it works
+        def setup_audio():  
+            nonlocal audio_interface, stream, device_sample_rate, input_device_index
+            try:
+                audio_interface = pyaudio.PyAudio()
+                if input_device_index is None:
+                    try:
+                        default_device = audio_interface.get_default_input_device_info()
+                        input_device_index = default_device['index']
+                    except OSError as e:
+                        input_device_index = None
+
+                sample_rates_to_try = [16000]  # Try 16000 Hz first
+                if input_device_index is not None:
+                    highest_rate = get_highest_sample_rate(audio_interface, input_device_index)
+                    if highest_rate != 16000:
+                        sample_rates_to_try.append(highest_rate)
+                else:
+                    sample_rates_to_try.append(48000)  # Fallback sample rate
 
-            stream = initialize_audio_stream(audio_interface, input_device_index, device_sample_rate, chunk_size)
+                for rate in sample_rates_to_try:
+                    try:
+                        device_sample_rate = rate
+                        stream = initialize_audio_stream(audio_interface, input_device_index, device_sample_rate, chunk_size)
+                        if stream is not None:
+                            logging.debug(f"Audio recording initialized successfully at {device_sample_rate} Hz, reading {chunk_size} frames at a time")
+                            return True
+                    except Exception as e:
+                        logging.warning(f"Failed to initialize audio stream at {device_sample_rate} Hz: {e}")
+                        continue
 
-            if stream is None:
-                raise Exception("Failed to initialize audio stream.")
+                # If we reach here, none of the sample rates worked
+                raise Exception("Failed to initialize audio stream with all sample rates.")
 
-        except Exception as e:
-            logging.exception(f"Error initializing pyaudio audio recording: {e}")
-            if audio_interface:
-                audio_interface.terminate()
-            raise
+            except Exception as e:
+                logging.exception(f"Error initializing pyaudio audio recording: {e}")
+                if audio_interface:
+                    audio_interface.terminate()
+                return False
 
-        logging.debug(f"Audio recording initialized successfully at {device_sample_rate} Hz, reading {chunk_size} frames at a time")
+        if not setup_audio():
+            raise Exception("Failed to set up audio recording.")
 
         buffer = bytearray()
         silero_buffer_size = 2 * buffer_size  # silero complains if too short
@@ -950,7 +1011,7 @@ class AudioToTextRecorder:
         try:
             while not shutdown_event.is_set():
                 try:
-                    data = stream.read(chunk_size)
+                    data = stream.read(chunk_size, exception_on_overflow=False)
                     
                     if use_microphone.value:
                         processed_data = preprocess_audio(data, device_sample_rate, target_sample_rate)
@@ -970,13 +1031,45 @@ class AudioToTextRecorder:
                         logging.warning("Input overflowed. Frame dropped.")
                     else:
                         logging.error(f"Error during recording: {e}")
+                        # Attempt to reinitialize the stream
+                        logging.info("Attempting to reinitialize the audio stream...")
+                        if stream:
+                            stream.stop_stream()
+                            stream.close()
+                        if audio_interface:
+                            audio_interface.terminate()
+                        
+                        # Wait a bit before trying to reinitialize
+                        time.sleep(1)
+                        
+                        if not setup_audio():
+                            logging.error("Failed to reinitialize audio stream. Exiting.")
+                            break
+                        else:
+                            logging.info("Audio stream reinitialized successfully.")
                     continue
 
                 except Exception as e:
                     logging.error(f"Error during recording: {e}")
                     tb_str = traceback.format_exc()
-                    print(f"Traceback: {tb_str}")
-                    print(f"Error: {e}")
+                    logging.error(f"Traceback: {tb_str}")
+                    logging.error(f"Error: {e}")
+                    # Attempt to reinitialize the stream
+                    logging.info("Attempting to reinitialize the audio stream...")
+                    if stream:
+                        stream.stop_stream()
+                        stream.close()
+                    if audio_interface:
+                        audio_interface.terminate()
+                    
+                    # Wait a bit before trying to reinitialize
+                    time.sleep(0.5)
+                    
+                    if not setup_audio():
+                        logging.error("Failed to reinitialize audio stream. Exiting.")
+                        break
+                    else:
+                        logging.info("Audio stream reinitialized successfully.")
                     continue
 
         except KeyboardInterrupt:
@@ -993,6 +1086,83 @@ class AudioToTextRecorder:
             if audio_interface:
                 audio_interface.terminate()
 
+        # try:
+        #     audio_interface = pyaudio.PyAudio()
+        #     if input_device_index is None:
+        #         try:
+        #             default_device = audio_interface.get_default_input_device_info()
+        #             input_device_index = default_device['index']
+        #         except OSError as e:
+        #             input_device_index = None
+
+
+        #     if input_device_index is not None:
+        #         device_sample_rate = get_highest_sample_rate(audio_interface, input_device_index)
+        #     else:
+        #         device_sample_rate = 16000  # better: try 16000, 48000, ... until it works
+
+        #     stream = initialize_audio_stream(audio_interface, input_device_index, device_sample_rate, chunk_size)
+
+        #     if stream is None:
+        #         raise Exception("Failed to initialize audio stream.")
+
+        # except Exception as e:
+        #     logging.exception(f"Error initializing pyaudio audio recording: {e}")
+        #     if audio_interface:
+        #         audio_interface.terminate()
+        #     raise
+
+        # logging.debug(f"Audio recording initialized successfully at {device_sample_rate} Hz, reading {chunk_size} frames at a time")
+
+        # buffer = bytearray()
+        # silero_buffer_size = 2 * buffer_size  # silero complains if too short
+
+        # try:
+        #     while not shutdown_event.is_set():
+        #         try:
+        #             data = stream.read(chunk_size)
+                    
+        #             if use_microphone.value:
+        #                 processed_data = preprocess_audio(data, device_sample_rate, target_sample_rate)
+        #                 buffer += processed_data
+
+        #                 # Check if the buffer has reached or exceeded the silero_buffer_size
+        #                 while len(buffer) >= silero_buffer_size:
+        #                     # Extract silero_buffer_size amount of data from the buffer
+        #                     to_process = buffer[:silero_buffer_size]
+        #                     buffer = buffer[silero_buffer_size:]
+
+        #                     # Feed the extracted data to the audio_queue
+        #                     audio_queue.put(to_process)
+
+        #         except OSError as e:
+        #             if e.errno == pyaudio.paInputOverflowed:
+        #                 logging.warning("Input overflowed. Frame dropped.")
+        #             else:
+        #                 logging.error(f"Error during recording: {e}")
+        #             continue
+
+        #         except Exception as e:
+        #             logging.error(f"Error during recording: {e}")
+        #             tb_str = traceback.format_exc()
+        #             print(f"Traceback: {tb_str}")
+        #             print(f"Error: {e}")
+        #             continue
+
+        # except KeyboardInterrupt:
+        #     interrupt_stop_event.set()
+        #     logging.debug("Audio data worker process finished due to KeyboardInterrupt")
+        # finally:
+        #     # After recording stops, feed any remaining audio data
+        #     if buffer:
+        #         audio_queue.put(bytes(buffer))
+            
+        #     if stream:
+        #         stream.stop_stream()
+        #         stream.close()
+        #     if audio_interface:
+        #         audio_interface.terminate()
+
     def wakeup(self):
         """
         If in wake work modus, wake up as if a wake word was spoken.
@@ -1022,7 +1192,9 @@ class AudioToTextRecorder:
         - Modifies the audio attribute to contain the processed audio data.
         """
 
-        self.listen_start = time.time()
+        logging.info("Setting listen time")
+        if self.listen_start == 0:
+            self.listen_start = time.time()
 
         # If not yet started recording, wait for voice activity to initiate.
         if not self.is_recording and not self.frames:
@@ -1087,8 +1259,15 @@ class AudioToTextRecorder:
         start_time = time.time()  # Start timing
         with self.transcription_lock:
             try:
-                self.parent_transcription_pipe.send((self.audio, self.language))
-                status, result = self.parent_transcription_pipe.recv()
+                if self.transcribe_count == 0:
+                    self.parent_transcription_pipe.send((self.audio, self.language))
+                    self.transcribe_count += 1
+
+                while self.transcribe_count > 0:
+                    logging.debug("Receive from parent_transcription_pipe pipe after sendiung transcription request")
+                    status, result = self.parent_transcription_pipe.recv()
+                    self.transcribe_count -= 1
+
                 self._set_state("inactive")
                 if status == 'success':
                     segments, info = result
@@ -1098,7 +1277,9 @@ class AudioToTextRecorder:
                     transcription = self._preprocess_output(segments)
                     end_time = time.time()  # End timing
                     transcription_time = end_time - start_time
-                    # print(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
+
+                    if self.log_transcription_time:
+                        logging.info(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
                     return transcription
                 else:
                     logging.error(f"Transcription error: {result}")
@@ -1118,7 +1299,7 @@ class AudioToTextRecorder:
             )
             porcupine_index = self.porcupine.process(pcm)
             if self.debug_mode:
-                print (f"wake words porcupine_index: {porcupine_index}")
+                logging.info(f"wake words porcupine_index: {porcupine_index}")
             return self.porcupine.process(pcm)
 
         elif self.wakeword_backend in {'oww', 'openwakeword', 'openwakewords'}:
@@ -1135,15 +1316,16 @@ class AudioToTextRecorder:
                         max_score = scores[-1]
                         max_index = idx
                 if self.debug_mode:
-                    print (f"wake words oww max_index, max_score: {max_index} {max_score}")
+                    logging.info(f"wake words oww max_index, max_score: {max_index} {max_score}")
                 return max_index  
             else:
                 if self.debug_mode:
-                    print (f"wake words oww_index: -1")
+                    logging.info(f"wake words oww_index: -1")
                 return -1
 
         if self.debug_mode:        
-            print("wake words no match")
+            logging.info("wake words no match")
+
         return -1
 
     def text(self,
@@ -1392,11 +1574,12 @@ class AudioToTextRecorder:
                             data = self.audio_queue.get()
 
                 except BrokenPipeError:
-                    print("BrokenPipeError _recording_worker")
+                    logging.error("BrokenPipeError _recording_worker")
                     self.is_running = False
                     break
 
                 if not self.is_recording:
+                    logging.info(f"not recording, state: {self.state}, self.recording_stop_time: {self.recording_stop_time}, self.listen_start: {self.listen_start}")
                     # Handle not recording state
                     time_since_listen_start = (time.time() - self.listen_start
                                                if self.listen_start else 0)
@@ -1506,6 +1689,11 @@ class AudioToTextRecorder:
                             # measuring silence time before stopping recording
                             if self.speech_end_silence_start == 0:
                                 self.speech_end_silence_start = time.time()
+                                # if(len(self.frames) > 0):
+                                #     audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+                                #     audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+                                #     self.parent_transcription_pipe.send((audio, self.language))
+                                #     self.transcribe_count += 1                                
                         else:
                             self.speech_end_silence_start = 0
 
@@ -1515,7 +1703,16 @@ class AudioToTextRecorder:
                                 self.post_speech_silence_duration:
                             logging.info("voice deactivity detected")
                             self.frames.append(data)
+                            logging.info("stopping recording")
                             self.stop()
+                            logging.info("stopped recording")
+
+
+                            ####
+                            if not self.use_wake_words:
+                                self.listen_start = time.time()
+                                self._set_state("listening")
+                                self.start_recording_on_voice_activity = True    
 
                 if not self.is_recording and was_recording:
                     # Reset after stopping recording to ensure clean state
@@ -1583,6 +1780,8 @@ class AudioToTextRecorder:
                         dtype=np.int16
                         )
 
+                    logging.debug(f"Current realtime buffer size: {len(audio_array)}")
+
                     # Normalize the array to a [-1, 1] range
                     audio_array = audio_array.astype(np.float32) / \
                         INT16_MAX_ABS_VALUE
@@ -1592,12 +1791,14 @@ class AudioToTextRecorder:
                             try:
                                 self.parent_transcription_pipe.send((audio_array, self.language))
                                 if self.parent_transcription_pipe.poll(timeout=5):  # Wait for 5 seconds
+                                    logging.debug("Receive from realtime worker after transcription request to main model")
                                     status, result = self.parent_transcription_pipe.recv()
                                     if status == 'success':
                                         segments, info = result
                                         self.detected_realtime_language = info.language if info.language_probability > 0 else None
                                         self.detected_realtime_language_probability = info.language_probability
                                         realtime_text = segments
+                                        logging.debug(f"Realtime text detected with main model: {realtime_text}")
                                     else:
                                         logging.error(f"Realtime transcription error: {result}")
                                         continue
@@ -1622,6 +1823,7 @@ class AudioToTextRecorder:
                         realtime_text = " ".join(
                             seg.text for seg in segments
                         )
+                        logging.debug(f"Realtime text detected: {realtime_text}")
 
                     # double check recording state
                     # because it could have changed mid-transcription
@@ -1764,19 +1966,19 @@ class AudioToTextRecorder:
                 speech_frames += 1
                 if not all_frames_must_be_true:
                     if self.debug_mode:
-                        print(f"Speech detected in frame {i + 1}"
+                        logging.info(f"Speech detected in frame {i + 1}"
                               f" of {num_frames}")
                     return True
         if all_frames_must_be_true:
             if self.debug_mode and speech_frames == num_frames:
-                print(f"Speech detected in {speech_frames} of "
+                logging.info(f"Speech detected in {speech_frames} of "
                       f"{num_frames} frames")
             elif self.debug_mode:
-                print(f"Speech not detected in all {num_frames} frames")
+                logging.info(f"Speech not detected in all {num_frames} frames")
             return speech_frames == num_frames
         else:
             if self.debug_mode:
-                print(f"Speech not detected in any of {num_frames} frames")
+                logging.info(f"Speech not detected in any of {num_frames} frames")
             return False
 
     def _check_voice_activity(self, data):
@@ -1841,6 +2043,9 @@ class AudioToTextRecorder:
         # Update to the new state
         self.state = new_state
 
+        # Log the state change
+        logging.info(f"State changed from '{old_state}' to '{new_state}'")
+
         # Execute callbacks based on transitioning FROM a particular state
         if old_state == "listening":
             if self.on_vad_detect_stop: