Procházet zdrojové kódy

faster transcription

KoljaB před 7 měsíci
rodič
revize
c69e765fc6
1 změněný soubor s 8 přidáními a 94 odebráními
  1. 8 94
      RealtimeSTT/audio_recorder.py

+ 8 - 94
RealtimeSTT/audio_recorder.py

@@ -159,7 +159,8 @@ class AudioToTextRecorder:
                  sample_rate: int = SAMPLE_RATE,
                  initial_prompt: Optional[Union[str, Iterable[int]]] = None,
                  suppress_tokens: Optional[List[int]] = [-1],
-                 log_transcription_time: bool = False
+                 log_transcription_time: bool = False,
+                 early_transcription_on_silence: bool = True
                  ):
         """
         Initializes an audio recorder and transcription
@@ -432,6 +433,7 @@ class AudioToTextRecorder:
         self.transcription_lock = threading.Lock()
         self.transcribe_count = 0
         self.log_transcription_time = log_transcription_time
+        self.early_transcription_on_silence = early_transcription_on_silence
 
         # Initialize the logging configuration with the specified level
         log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
@@ -723,12 +725,6 @@ class AudioToTextRecorder:
             except (BrokenPipeError, EOFError, OSError):
                 # The pipe probably has been closed, so we ignore the error
                 pass
-            # except BrokenPipeError as e:  # handle broken pipe error
-            #     pass                
-            # except EOFError as e:
-            #     logging.error(f"EOFError in read from stdout: {e}")
-            #     logging.error(traceback.format_exc())
-            #     break            
             except KeyboardInterrupt:  # handle manual interruption (Ctrl+C)
                 logging.info("KeyboardInterrupt in read from stdout detected, exiting...")
                 break
@@ -1086,83 +1082,6 @@ class AudioToTextRecorder:
             if audio_interface:
                 audio_interface.terminate()
 
-        # try:
-        #     audio_interface = pyaudio.PyAudio()
-        #     if input_device_index is None:
-        #         try:
-        #             default_device = audio_interface.get_default_input_device_info()
-        #             input_device_index = default_device['index']
-        #         except OSError as e:
-        #             input_device_index = None
-
-
-        #     if input_device_index is not None:
-        #         device_sample_rate = get_highest_sample_rate(audio_interface, input_device_index)
-        #     else:
-        #         device_sample_rate = 16000  # better: try 16000, 48000, ... until it works
-
-        #     stream = initialize_audio_stream(audio_interface, input_device_index, device_sample_rate, chunk_size)
-
-        #     if stream is None:
-        #         raise Exception("Failed to initialize audio stream.")
-
-        # except Exception as e:
-        #     logging.exception(f"Error initializing pyaudio audio recording: {e}")
-        #     if audio_interface:
-        #         audio_interface.terminate()
-        #     raise
-
-        # logging.debug(f"Audio recording initialized successfully at {device_sample_rate} Hz, reading {chunk_size} frames at a time")
-
-        # buffer = bytearray()
-        # silero_buffer_size = 2 * buffer_size  # silero complains if too short
-
-        # try:
-        #     while not shutdown_event.is_set():
-        #         try:
-        #             data = stream.read(chunk_size)
-                    
-        #             if use_microphone.value:
-        #                 processed_data = preprocess_audio(data, device_sample_rate, target_sample_rate)
-        #                 buffer += processed_data
-
-        #                 # Check if the buffer has reached or exceeded the silero_buffer_size
-        #                 while len(buffer) >= silero_buffer_size:
-        #                     # Extract silero_buffer_size amount of data from the buffer
-        #                     to_process = buffer[:silero_buffer_size]
-        #                     buffer = buffer[silero_buffer_size:]
-
-        #                     # Feed the extracted data to the audio_queue
-        #                     audio_queue.put(to_process)
-
-        #         except OSError as e:
-        #             if e.errno == pyaudio.paInputOverflowed:
-        #                 logging.warning("Input overflowed. Frame dropped.")
-        #             else:
-        #                 logging.error(f"Error during recording: {e}")
-        #             continue
-
-        #         except Exception as e:
-        #             logging.error(f"Error during recording: {e}")
-        #             tb_str = traceback.format_exc()
-        #             print(f"Traceback: {tb_str}")
-        #             print(f"Error: {e}")
-        #             continue
-
-        # except KeyboardInterrupt:
-        #     interrupt_stop_event.set()
-        #     logging.debug("Audio data worker process finished due to KeyboardInterrupt")
-        # finally:
-        #     # After recording stops, feed any remaining audio data
-        #     if buffer:
-        #         audio_queue.put(bytes(buffer))
-            
-        #     if stream:
-        #         stream.stop_stream()
-        #         stream.close()
-        #     if audio_interface:
-        #         audio_interface.terminate()
-
     def wakeup(self):
         """
         If in wake word mode, wake up as if a wake word was spoken.
@@ -1689,11 +1608,11 @@ class AudioToTextRecorder:
                             # measuring silence time before stopping recording
                             if self.speech_end_silence_start == 0:
                                 self.speech_end_silence_start = time.time()
-                                # if(len(self.frames) > 0):
-                                #     audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
-                                #     audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
-                                #     self.parent_transcription_pipe.send((audio, self.language))
-                                #     self.transcribe_count += 1                                
+                                if self.early_transcription_on_silence and len(self.frames) > 0:
+                                     audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+                                     audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+                                     self.parent_transcription_pipe.send((audio, self.language))
+                                     self.transcribe_count += 1                                
                         else:
                             self.speech_end_silence_start = 0
 
@@ -1703,12 +1622,8 @@ class AudioToTextRecorder:
                                 self.post_speech_silence_duration:
                             logging.info("voice deactivity detected")
                             self.frames.append(data)
-                            logging.info("stopping recording")
                             self.stop()
-                            logging.info("stopped recording")
-
 
-                            ####
                             if not self.use_wake_words:
                                 self.listen_start = time.time()
                                 self._set_state("listening")
@@ -1745,7 +1660,6 @@ class AudioToTextRecorder:
                 raise
 
 
-
     def _realtime_worker(self):
         """
         Performs real-time transcription if the feature is enabled.