|
@@ -159,6 +159,7 @@ class AudioToTextRecorder:
|
|
sample_rate: int = SAMPLE_RATE,
|
|
sample_rate: int = SAMPLE_RATE,
|
|
initial_prompt: Optional[Union[str, Iterable[int]]] = None,
|
|
initial_prompt: Optional[Union[str, Iterable[int]]] = None,
|
|
suppress_tokens: Optional[List[int]] = [-1],
|
|
suppress_tokens: Optional[List[int]] = [-1],
|
|
|
|
+ log_transcription_time: bool = False
|
|
):
|
|
):
|
|
"""
|
|
"""
|
|
Initializes an audio recorder and transcription
|
|
Initializes an audio recorder and transcription
|
|
@@ -429,18 +430,25 @@ class AudioToTextRecorder:
|
|
self.detected_realtime_language = None
|
|
self.detected_realtime_language = None
|
|
self.detected_realtime_language_probability = 0
|
|
self.detected_realtime_language_probability = 0
|
|
self.transcription_lock = threading.Lock()
|
|
self.transcription_lock = threading.Lock()
|
|
|
|
+ self.transcribe_count = 0
|
|
|
|
+ self.log_transcription_time = log_transcription_time
|
|
|
|
|
|
# Initialize the logging configuration with the specified level
|
|
# Initialize the logging configuration with the specified level
|
|
log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
|
|
log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
|
|
|
|
+ file_log_format = '%(asctime)s - ' + log_format
|
|
|
|
|
|
- # Create a logger
|
|
|
|
|
|
+ # Get the root logger
|
|
logger = logging.getLogger()
|
|
logger = logging.getLogger()
|
|
- logger.setLevel(level) # Set the root logger's level
|
|
|
|
|
|
+ logger.setLevel(level) # Set the logger's level
|
|
|
|
+ logger.propagate = False # Prevent propagation to higher-level loggers
|
|
|
|
+
|
|
|
|
+ # Remove any existing handlers
|
|
|
|
+ logger.handlers = []
|
|
|
|
|
|
# Create a file handler and set its level
|
|
# Create a file handler and set its level
|
|
file_handler = logging.FileHandler('realtimesst.log')
|
|
file_handler = logging.FileHandler('realtimesst.log')
|
|
file_handler.setLevel(logging.DEBUG)
|
|
file_handler.setLevel(logging.DEBUG)
|
|
- file_handler.setFormatter(logging.Formatter(log_format))
|
|
|
|
|
|
+ file_handler.setFormatter(logging.Formatter(file_log_format, datefmt='%Y-%m-%d %H:%M:%S'))
|
|
|
|
|
|
# Create a console handler and set its level
|
|
# Create a console handler and set its level
|
|
console_handler = logging.StreamHandler()
|
|
console_handler = logging.StreamHandler()
|
|
@@ -450,15 +458,16 @@ class AudioToTextRecorder:
|
|
# Add the handlers to the logger
|
|
# Add the handlers to the logger
|
|
logger.addHandler(file_handler)
|
|
logger.addHandler(file_handler)
|
|
logger.addHandler(console_handler)
|
|
logger.addHandler(console_handler)
|
|
-
|
|
|
|
|
|
+
|
|
self.is_shut_down = False
|
|
self.is_shut_down = False
|
|
self.shutdown_event = mp.Event()
|
|
self.shutdown_event = mp.Event()
|
|
-
|
|
|
|
|
|
+
|
|
try:
|
|
try:
|
|
- logging.debug("Explicitly setting the multiprocessing start method to 'spawn'")
|
|
|
|
- mp.set_start_method('spawn')
|
|
|
|
|
|
+ # Only set the start method if it hasn't been set already
|
|
|
|
+ if mp.get_start_method(allow_none=True) is None:
|
|
|
|
+ mp.set_start_method("spawn")
|
|
except RuntimeError as e:
|
|
except RuntimeError as e:
|
|
- logging.debug(f"Start method has already been set. Details: {e}")
|
|
|
|
|
|
+ logging.info(f"Start method has already been set. Details: {e}")
|
|
|
|
|
|
logging.info("Starting RealTimeSTT")
|
|
logging.info("Starting RealTimeSTT")
|
|
|
|
|
|
@@ -706,9 +715,28 @@ class AudioToTextRecorder:
|
|
|
|
|
|
def _read_stdout(self):
    """
    Forward messages received on `self.parent_stdout_pipe` to the logger.

    Runs until `self.shutdown_event` is set. Each message received from
    the pipe is logged at INFO level.

    `poll(0.1)` already blocks for up to 100 ms while the pipe is idle,
    so no additional sleep is performed on the normal path; queued
    messages are drained as fast as they arrive. A short sleep is used
    only as a back-off when the pipe raises because it has (probably)
    been closed, so that the loop does not spin while waiting for the
    shutdown event.

    The loop exits early on KeyboardInterrupt or on any unexpected
    exception (which is logged together with its traceback).
    """
    while not self.shutdown_event.is_set():
        try:
            if self.parent_stdout_pipe.poll(0.1):
                logging.debug("Receive from stdout pipe")
                message = self.parent_stdout_pipe.recv()
                logging.info(message)
        except (BrokenPipeError, EOFError, OSError):
            # The pipe probably has been closed, so we ignore the error.
            # Back off briefly: a closed pipe fails immediately instead
            # of blocking in poll(), which would otherwise busy-loop.
            time.sleep(0.1)
        except KeyboardInterrupt:  # handle manual interruption (Ctrl+C)
            logging.info("KeyboardInterrupt in read from stdout detected, exiting...")
            break
        except Exception as e:
            logging.error(f"Unexpected error in read from stdout: {e}")
            logging.error(traceback.format_exc())  # Log the full traceback here
            break
|
|
|
|
|
|
@staticmethod
|
|
@staticmethod
|
|
def _transcription_worker(conn,
|
|
def _transcription_worker(conn,
|
|
@@ -762,7 +790,11 @@ class AudioToTextRecorder:
|
|
"""
|
|
"""
|
|
def custom_print(*args, **kwargs):
    """
    Stand-in for the built-in `print` that sends the text over
    `stdout_pipe` instead of writing to stdout.

    Positional arguments are converted with `str()` and joined with
    `sep` (a single space when `sep` is omitted or None, as with the
    built-in `print`). Other keyword arguments such as `end`, `file`
    and `flush` are accepted for call compatibility but ignored; each
    call sends exactly one message.

    Errors raised because the pipe has been closed are swallowed so a
    stray print during shutdown cannot crash the worker.
    """
    sep = kwargs.get('sep')
    if sep is None:
        sep = ' '
    message = sep.join(map(str, args))
    try:
        stdout_pipe.send(message)
    except (BrokenPipeError, EOFError, OSError):
        # The pipe probably has been closed, so we ignore the error
        pass
|
|
|
|
|
|
# Replace the built-in print function with our custom one
|
|
# Replace the built-in print function with our custom one
|
|
__builtins__['print'] = custom_print
|
|
__builtins__['print'] = custom_print
|
|
@@ -791,33 +823,48 @@ class AudioToTextRecorder:
|
|
"transcription model initialized successfully"
|
|
"transcription model initialized successfully"
|
|
)
|
|
)
|
|
|
|
|
|
- while not shutdown_event.is_set():
|
|
|
|
- try:
|
|
|
|
- if conn.poll(0.01):
|
|
|
|
- audio, language = conn.recv()
|
|
|
|
- try:
|
|
|
|
- segments, info = model.transcribe(
|
|
|
|
- audio,
|
|
|
|
- language=language if language else None,
|
|
|
|
- beam_size=beam_size,
|
|
|
|
- initial_prompt=initial_prompt,
|
|
|
|
- suppress_tokens=suppress_tokens
|
|
|
|
- )
|
|
|
|
- transcription = " ".join(seg.text for seg in segments)
|
|
|
|
- transcription = transcription.strip()
|
|
|
|
- conn.send(('success', (transcription, info)))
|
|
|
|
- except Exception as e:
|
|
|
|
- logging.error(f"General transcription error: {e}")
|
|
|
|
- conn.send(('error', str(e)))
|
|
|
|
- else:
|
|
|
|
- time.sleep(TIME_SLEEP)
|
|
|
|
|
|
+ try:
|
|
|
|
+ while not shutdown_event.is_set():
|
|
|
|
+ try:
|
|
|
|
+ if conn.poll(0.01):
|
|
|
|
+ logging.debug("Receive from _transcription_worker pipe")
|
|
|
|
+ audio, language = conn.recv()
|
|
|
|
+ try:
|
|
|
|
+ segments, info = model.transcribe(
|
|
|
|
+ audio,
|
|
|
|
+ language=language if language else None,
|
|
|
|
+ beam_size=beam_size,
|
|
|
|
+ initial_prompt=initial_prompt,
|
|
|
|
+ suppress_tokens=suppress_tokens
|
|
|
|
+ )
|
|
|
|
+ transcription = " ".join(seg.text for seg in segments)
|
|
|
|
+ transcription = transcription.strip()
|
|
|
|
+ logging.debug(f"Final text detected with main model: {transcription}")
|
|
|
|
+ conn.send(('success', (transcription, info)))
|
|
|
|
+ except Exception as e:
|
|
|
|
+ logging.error(f"General error in _transcription_worker in transcription: {e}")
|
|
|
|
+ conn.send(('error', str(e)))
|
|
|
|
+ else:
|
|
|
|
+ time.sleep(TIME_SLEEP)
|
|
|
|
|
|
- except KeyboardInterrupt:
|
|
|
|
- interrupt_stop_event.set()
|
|
|
|
- logging.debug("Transcription worker process "
|
|
|
|
- "finished due to KeyboardInterrupt"
|
|
|
|
- )
|
|
|
|
- break
|
|
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ except KeyboardInterrupt:
|
|
|
|
+ interrupt_stop_event.set()
|
|
|
|
+
|
|
|
|
+ logging.debug("Transcription worker process "
|
|
|
|
+ "finished due to KeyboardInterrupt"
|
|
|
|
+ )
|
|
|
|
+ stdout_pipe.close()
|
|
|
|
+ break
|
|
|
|
+
|
|
|
|
+ except Exception as e:
|
|
|
|
+ logging.error(f"General error in _transcription_worker in accessing pipe: {e}")
|
|
|
|
+
|
|
|
|
+ finally:
|
|
|
|
+ __builtins__['print'] = print # Restore the original print function
|
|
|
|
+ conn.close()
|
|
|
|
+ stdout_pipe.close()
|
|
|
|
|
|
@staticmethod
|
|
@staticmethod
|
|
def _audio_data_worker(audio_queue,
|
|
def _audio_data_worker(audio_queue,
|
|
@@ -916,33 +963,47 @@ class AudioToTextRecorder:
|
|
device_sample_rate = None
|
|
device_sample_rate = None
|
|
chunk_size = 1024 # Increased chunk size for better performance
|
|
chunk_size = 1024 # Increased chunk size for better performance
|
|
|
|
|
|
- try:
|
|
|
|
- audio_interface = pyaudio.PyAudio()
|
|
|
|
- if input_device_index is None:
|
|
|
|
- try:
|
|
|
|
- default_device = audio_interface.get_default_input_device_info()
|
|
|
|
- input_device_index = default_device['index']
|
|
|
|
- except OSError as e:
|
|
|
|
- input_device_index = None
|
|
|
|
-
|
|
|
|
-
|
|
|
|
- if input_device_index is not None:
|
|
|
|
- device_sample_rate = get_highest_sample_rate(audio_interface, input_device_index)
|
|
|
|
- else:
|
|
|
|
- device_sample_rate = 16000 # better: try 16000, 48000, ... until it works
|
|
|
|
|
|
+ def setup_audio():
|
|
|
|
+ nonlocal audio_interface, stream, device_sample_rate, input_device_index
|
|
|
|
+ try:
|
|
|
|
+ audio_interface = pyaudio.PyAudio()
|
|
|
|
+ if input_device_index is None:
|
|
|
|
+ try:
|
|
|
|
+ default_device = audio_interface.get_default_input_device_info()
|
|
|
|
+ input_device_index = default_device['index']
|
|
|
|
+ except OSError as e:
|
|
|
|
+ input_device_index = None
|
|
|
|
+
|
|
|
|
+ sample_rates_to_try = [16000] # Try 16000 Hz first
|
|
|
|
+ if input_device_index is not None:
|
|
|
|
+ highest_rate = get_highest_sample_rate(audio_interface, input_device_index)
|
|
|
|
+ if highest_rate != 16000:
|
|
|
|
+ sample_rates_to_try.append(highest_rate)
|
|
|
|
+ else:
|
|
|
|
+ sample_rates_to_try.append(48000) # Fallback sample rate
|
|
|
|
|
|
- stream = initialize_audio_stream(audio_interface, input_device_index, device_sample_rate, chunk_size)
|
|
|
|
|
|
+ for rate in sample_rates_to_try:
|
|
|
|
+ try:
|
|
|
|
+ device_sample_rate = rate
|
|
|
|
+ stream = initialize_audio_stream(audio_interface, input_device_index, device_sample_rate, chunk_size)
|
|
|
|
+ if stream is not None:
|
|
|
|
+ logging.debug(f"Audio recording initialized successfully at {device_sample_rate} Hz, reading {chunk_size} frames at a time")
|
|
|
|
+ return True
|
|
|
|
+ except Exception as e:
|
|
|
|
+ logging.warning(f"Failed to initialize audio stream at {device_sample_rate} Hz: {e}")
|
|
|
|
+ continue
|
|
|
|
|
|
- if stream is None:
|
|
|
|
- raise Exception("Failed to initialize audio stream.")
|
|
|
|
|
|
+ # If we reach here, none of the sample rates worked
|
|
|
|
+ raise Exception("Failed to initialize audio stream with all sample rates.")
|
|
|
|
|
|
- except Exception as e:
|
|
|
|
- logging.exception(f"Error initializing pyaudio audio recording: {e}")
|
|
|
|
- if audio_interface:
|
|
|
|
- audio_interface.terminate()
|
|
|
|
- raise
|
|
|
|
|
|
+ except Exception as e:
|
|
|
|
+ logging.exception(f"Error initializing pyaudio audio recording: {e}")
|
|
|
|
+ if audio_interface:
|
|
|
|
+ audio_interface.terminate()
|
|
|
|
+ return False
|
|
|
|
|
|
- logging.debug(f"Audio recording initialized successfully at {device_sample_rate} Hz, reading {chunk_size} frames at a time")
|
|
|
|
|
|
+ if not setup_audio():
|
|
|
|
+ raise Exception("Failed to set up audio recording.")
|
|
|
|
|
|
buffer = bytearray()
|
|
buffer = bytearray()
|
|
silero_buffer_size = 2 * buffer_size # silero complains if too short
|
|
silero_buffer_size = 2 * buffer_size # silero complains if too short
|
|
@@ -950,7 +1011,7 @@ class AudioToTextRecorder:
|
|
try:
|
|
try:
|
|
while not shutdown_event.is_set():
|
|
while not shutdown_event.is_set():
|
|
try:
|
|
try:
|
|
- data = stream.read(chunk_size)
|
|
|
|
|
|
+ data = stream.read(chunk_size, exception_on_overflow=False)
|
|
|
|
|
|
if use_microphone.value:
|
|
if use_microphone.value:
|
|
processed_data = preprocess_audio(data, device_sample_rate, target_sample_rate)
|
|
processed_data = preprocess_audio(data, device_sample_rate, target_sample_rate)
|
|
@@ -970,13 +1031,45 @@ class AudioToTextRecorder:
|
|
logging.warning("Input overflowed. Frame dropped.")
|
|
logging.warning("Input overflowed. Frame dropped.")
|
|
else:
|
|
else:
|
|
logging.error(f"Error during recording: {e}")
|
|
logging.error(f"Error during recording: {e}")
|
|
|
|
+ # Attempt to reinitialize the stream
|
|
|
|
+ logging.info("Attempting to reinitialize the audio stream...")
|
|
|
|
+ if stream:
|
|
|
|
+ stream.stop_stream()
|
|
|
|
+ stream.close()
|
|
|
|
+ if audio_interface:
|
|
|
|
+ audio_interface.terminate()
|
|
|
|
+
|
|
|
|
+ # Wait a bit before trying to reinitialize
|
|
|
|
+ time.sleep(1)
|
|
|
|
+
|
|
|
|
+ if not setup_audio():
|
|
|
|
+ logging.error("Failed to reinitialize audio stream. Exiting.")
|
|
|
|
+ break
|
|
|
|
+ else:
|
|
|
|
+ logging.info("Audio stream reinitialized successfully.")
|
|
continue
|
|
continue
|
|
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
logging.error(f"Error during recording: {e}")
|
|
logging.error(f"Error during recording: {e}")
|
|
tb_str = traceback.format_exc()
|
|
tb_str = traceback.format_exc()
|
|
- print(f"Traceback: {tb_str}")
|
|
|
|
- print(f"Error: {e}")
|
|
|
|
|
|
+ logging.error(f"Traceback: {tb_str}")
|
|
|
|
+ logging.error(f"Error: {e}")
|
|
|
|
+ # Attempt to reinitialize the stream
|
|
|
|
+ logging.info("Attempting to reinitialize the audio stream...")
|
|
|
|
+ if stream:
|
|
|
|
+ stream.stop_stream()
|
|
|
|
+ stream.close()
|
|
|
|
+ if audio_interface:
|
|
|
|
+ audio_interface.terminate()
|
|
|
|
+
|
|
|
|
+ # Wait a bit before trying to reinitialize
|
|
|
|
+ time.sleep(0.5)
|
|
|
|
+
|
|
|
|
+ if not setup_audio():
|
|
|
|
+ logging.error("Failed to reinitialize audio stream. Exiting.")
|
|
|
|
+ break
|
|
|
|
+ else:
|
|
|
|
+ logging.info("Audio stream reinitialized successfully.")
|
|
continue
|
|
continue
|
|
|
|
|
|
except KeyboardInterrupt:
|
|
except KeyboardInterrupt:
|
|
@@ -993,6 +1086,83 @@ class AudioToTextRecorder:
|
|
if audio_interface:
|
|
if audio_interface:
|
|
audio_interface.terminate()
|
|
audio_interface.terminate()
|
|
|
|
|
|
|
|
+ # try:
|
|
|
|
+ # audio_interface = pyaudio.PyAudio()
|
|
|
|
+ # if input_device_index is None:
|
|
|
|
+ # try:
|
|
|
|
+ # default_device = audio_interface.get_default_input_device_info()
|
|
|
|
+ # input_device_index = default_device['index']
|
|
|
|
+ # except OSError as e:
|
|
|
|
+ # input_device_index = None
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ # if input_device_index is not None:
|
|
|
|
+ # device_sample_rate = get_highest_sample_rate(audio_interface, input_device_index)
|
|
|
|
+ # else:
|
|
|
|
+ # device_sample_rate = 16000 # better: try 16000, 48000, ... until it works
|
|
|
|
+
|
|
|
|
+ # stream = initialize_audio_stream(audio_interface, input_device_index, device_sample_rate, chunk_size)
|
|
|
|
+
|
|
|
|
+ # if stream is None:
|
|
|
|
+ # raise Exception("Failed to initialize audio stream.")
|
|
|
|
+
|
|
|
|
+ # except Exception as e:
|
|
|
|
+ # logging.exception(f"Error initializing pyaudio audio recording: {e}")
|
|
|
|
+ # if audio_interface:
|
|
|
|
+ # audio_interface.terminate()
|
|
|
|
+ # raise
|
|
|
|
+
|
|
|
|
+ # logging.debug(f"Audio recording initialized successfully at {device_sample_rate} Hz, reading {chunk_size} frames at a time")
|
|
|
|
+
|
|
|
|
+ # buffer = bytearray()
|
|
|
|
+ # silero_buffer_size = 2 * buffer_size # silero complains if too short
|
|
|
|
+
|
|
|
|
+ # try:
|
|
|
|
+ # while not shutdown_event.is_set():
|
|
|
|
+ # try:
|
|
|
|
+ # data = stream.read(chunk_size)
|
|
|
|
+
|
|
|
|
+ # if use_microphone.value:
|
|
|
|
+ # processed_data = preprocess_audio(data, device_sample_rate, target_sample_rate)
|
|
|
|
+ # buffer += processed_data
|
|
|
|
+
|
|
|
|
+ # # Check if the buffer has reached or exceeded the silero_buffer_size
|
|
|
|
+ # while len(buffer) >= silero_buffer_size:
|
|
|
|
+ # # Extract silero_buffer_size amount of data from the buffer
|
|
|
|
+ # to_process = buffer[:silero_buffer_size]
|
|
|
|
+ # buffer = buffer[silero_buffer_size:]
|
|
|
|
+
|
|
|
|
+ # # Feed the extracted data to the audio_queue
|
|
|
|
+ # audio_queue.put(to_process)
|
|
|
|
+
|
|
|
|
+ # except OSError as e:
|
|
|
|
+ # if e.errno == pyaudio.paInputOverflowed:
|
|
|
|
+ # logging.warning("Input overflowed. Frame dropped.")
|
|
|
|
+ # else:
|
|
|
|
+ # logging.error(f"Error during recording: {e}")
|
|
|
|
+ # continue
|
|
|
|
+
|
|
|
|
+ # except Exception as e:
|
|
|
|
+ # logging.error(f"Error during recording: {e}")
|
|
|
|
+ # tb_str = traceback.format_exc()
|
|
|
|
+ # print(f"Traceback: {tb_str}")
|
|
|
|
+ # print(f"Error: {e}")
|
|
|
|
+ # continue
|
|
|
|
+
|
|
|
|
+ # except KeyboardInterrupt:
|
|
|
|
+ # interrupt_stop_event.set()
|
|
|
|
+ # logging.debug("Audio data worker process finished due to KeyboardInterrupt")
|
|
|
|
+ # finally:
|
|
|
|
+ # # After recording stops, feed any remaining audio data
|
|
|
|
+ # if buffer:
|
|
|
|
+ # audio_queue.put(bytes(buffer))
|
|
|
|
+
|
|
|
|
+ # if stream:
|
|
|
|
+ # stream.stop_stream()
|
|
|
|
+ # stream.close()
|
|
|
|
+ # if audio_interface:
|
|
|
|
+ # audio_interface.terminate()
|
|
|
|
+
|
|
def wakeup(self):
|
|
def wakeup(self):
|
|
"""
|
|
"""
|
|
If in wake work modus, wake up as if a wake word was spoken.
|
|
If in wake work modus, wake up as if a wake word was spoken.
|
|
@@ -1022,7 +1192,9 @@ class AudioToTextRecorder:
|
|
- Modifies the audio attribute to contain the processed audio data.
|
|
- Modifies the audio attribute to contain the processed audio data.
|
|
"""
|
|
"""
|
|
|
|
|
|
- self.listen_start = time.time()
|
|
|
|
|
|
+ logging.info("Setting listen time")
|
|
|
|
+ if self.listen_start == 0:
|
|
|
|
+ self.listen_start = time.time()
|
|
|
|
|
|
# If not yet started recording, wait for voice activity to initiate.
|
|
# If not yet started recording, wait for voice activity to initiate.
|
|
if not self.is_recording and not self.frames:
|
|
if not self.is_recording and not self.frames:
|
|
@@ -1087,8 +1259,15 @@ class AudioToTextRecorder:
|
|
start_time = time.time() # Start timing
|
|
start_time = time.time() # Start timing
|
|
with self.transcription_lock:
|
|
with self.transcription_lock:
|
|
try:
|
|
try:
|
|
- self.parent_transcription_pipe.send((self.audio, self.language))
|
|
|
|
- status, result = self.parent_transcription_pipe.recv()
|
|
|
|
|
|
+ if self.transcribe_count == 0:
|
|
|
|
+ self.parent_transcription_pipe.send((self.audio, self.language))
|
|
|
|
+ self.transcribe_count += 1
|
|
|
|
+
|
|
|
|
+ while self.transcribe_count > 0:
|
|
|
|
+ logging.debug("Receive from parent_transcription_pipe pipe after sendiung transcription request")
|
|
|
|
+ status, result = self.parent_transcription_pipe.recv()
|
|
|
|
+ self.transcribe_count -= 1
|
|
|
|
+
|
|
self._set_state("inactive")
|
|
self._set_state("inactive")
|
|
if status == 'success':
|
|
if status == 'success':
|
|
segments, info = result
|
|
segments, info = result
|
|
@@ -1098,7 +1277,9 @@ class AudioToTextRecorder:
|
|
transcription = self._preprocess_output(segments)
|
|
transcription = self._preprocess_output(segments)
|
|
end_time = time.time() # End timing
|
|
end_time = time.time() # End timing
|
|
transcription_time = end_time - start_time
|
|
transcription_time = end_time - start_time
|
|
- # print(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
|
|
|
|
|
|
+
|
|
|
|
+ if self.log_transcription_time:
|
|
|
|
+ logging.info(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
|
|
return transcription
|
|
return transcription
|
|
else:
|
|
else:
|
|
logging.error(f"Transcription error: {result}")
|
|
logging.error(f"Transcription error: {result}")
|
|
@@ -1118,7 +1299,7 @@ class AudioToTextRecorder:
|
|
)
|
|
)
|
|
porcupine_index = self.porcupine.process(pcm)
|
|
porcupine_index = self.porcupine.process(pcm)
|
|
if self.debug_mode:
|
|
if self.debug_mode:
|
|
- print (f"wake words porcupine_index: {porcupine_index}")
|
|
|
|
|
|
+ logging.info(f"wake words porcupine_index: {porcupine_index}")
|
|
return self.porcupine.process(pcm)
|
|
return self.porcupine.process(pcm)
|
|
|
|
|
|
elif self.wakeword_backend in {'oww', 'openwakeword', 'openwakewords'}:
|
|
elif self.wakeword_backend in {'oww', 'openwakeword', 'openwakewords'}:
|
|
@@ -1135,15 +1316,16 @@ class AudioToTextRecorder:
|
|
max_score = scores[-1]
|
|
max_score = scores[-1]
|
|
max_index = idx
|
|
max_index = idx
|
|
if self.debug_mode:
|
|
if self.debug_mode:
|
|
- print (f"wake words oww max_index, max_score: {max_index} {max_score}")
|
|
|
|
|
|
+ logging.info(f"wake words oww max_index, max_score: {max_index} {max_score}")
|
|
return max_index
|
|
return max_index
|
|
else:
|
|
else:
|
|
if self.debug_mode:
|
|
if self.debug_mode:
|
|
- print (f"wake words oww_index: -1")
|
|
|
|
|
|
+ logging.info(f"wake words oww_index: -1")
|
|
return -1
|
|
return -1
|
|
|
|
|
|
if self.debug_mode:
|
|
if self.debug_mode:
|
|
- print("wake words no match")
|
|
|
|
|
|
+ logging.info("wake words no match")
|
|
|
|
+
|
|
return -1
|
|
return -1
|
|
|
|
|
|
def text(self,
|
|
def text(self,
|
|
@@ -1392,11 +1574,12 @@ class AudioToTextRecorder:
|
|
data = self.audio_queue.get()
|
|
data = self.audio_queue.get()
|
|
|
|
|
|
except BrokenPipeError:
|
|
except BrokenPipeError:
|
|
- print("BrokenPipeError _recording_worker")
|
|
|
|
|
|
+ logging.error("BrokenPipeError _recording_worker")
|
|
self.is_running = False
|
|
self.is_running = False
|
|
break
|
|
break
|
|
|
|
|
|
if not self.is_recording:
|
|
if not self.is_recording:
|
|
|
|
+ logging.info(f"not recording, state: {self.state}, self.recording_stop_time: {self.recording_stop_time}, self.listen_start: {self.listen_start}")
|
|
# Handle not recording state
|
|
# Handle not recording state
|
|
time_since_listen_start = (time.time() - self.listen_start
|
|
time_since_listen_start = (time.time() - self.listen_start
|
|
if self.listen_start else 0)
|
|
if self.listen_start else 0)
|
|
@@ -1506,6 +1689,11 @@ class AudioToTextRecorder:
|
|
# measuring silence time before stopping recording
|
|
# measuring silence time before stopping recording
|
|
if self.speech_end_silence_start == 0:
|
|
if self.speech_end_silence_start == 0:
|
|
self.speech_end_silence_start = time.time()
|
|
self.speech_end_silence_start = time.time()
|
|
|
|
+ # if(len(self.frames) > 0):
|
|
|
|
+ # audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
|
|
|
|
+ # audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
|
|
|
|
+ # self.parent_transcription_pipe.send((audio, self.language))
|
|
|
|
+ # self.transcribe_count += 1
|
|
else:
|
|
else:
|
|
self.speech_end_silence_start = 0
|
|
self.speech_end_silence_start = 0
|
|
|
|
|
|
@@ -1515,7 +1703,16 @@ class AudioToTextRecorder:
|
|
self.post_speech_silence_duration:
|
|
self.post_speech_silence_duration:
|
|
logging.info("voice deactivity detected")
|
|
logging.info("voice deactivity detected")
|
|
self.frames.append(data)
|
|
self.frames.append(data)
|
|
|
|
+ logging.info("stopping recording")
|
|
self.stop()
|
|
self.stop()
|
|
|
|
+ logging.info("stopped recording")
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ ####
|
|
|
|
+ if not self.use_wake_words:
|
|
|
|
+ self.listen_start = time.time()
|
|
|
|
+ self._set_state("listening")
|
|
|
|
+ self.start_recording_on_voice_activity = True
|
|
|
|
|
|
if not self.is_recording and was_recording:
|
|
if not self.is_recording and was_recording:
|
|
# Reset after stopping recording to ensure clean state
|
|
# Reset after stopping recording to ensure clean state
|
|
@@ -1583,6 +1780,8 @@ class AudioToTextRecorder:
|
|
dtype=np.int16
|
|
dtype=np.int16
|
|
)
|
|
)
|
|
|
|
|
|
|
|
+ logging.debug(f"Current realtime buffer size: {len(audio_array)}")
|
|
|
|
+
|
|
# Normalize the array to a [-1, 1] range
|
|
# Normalize the array to a [-1, 1] range
|
|
audio_array = audio_array.astype(np.float32) / \
|
|
audio_array = audio_array.astype(np.float32) / \
|
|
INT16_MAX_ABS_VALUE
|
|
INT16_MAX_ABS_VALUE
|
|
@@ -1592,12 +1791,14 @@ class AudioToTextRecorder:
|
|
try:
|
|
try:
|
|
self.parent_transcription_pipe.send((audio_array, self.language))
|
|
self.parent_transcription_pipe.send((audio_array, self.language))
|
|
if self.parent_transcription_pipe.poll(timeout=5): # Wait for 5 seconds
|
|
if self.parent_transcription_pipe.poll(timeout=5): # Wait for 5 seconds
|
|
|
|
+ logging.debug("Receive from realtime worker after transcription request to main model")
|
|
status, result = self.parent_transcription_pipe.recv()
|
|
status, result = self.parent_transcription_pipe.recv()
|
|
if status == 'success':
|
|
if status == 'success':
|
|
segments, info = result
|
|
segments, info = result
|
|
self.detected_realtime_language = info.language if info.language_probability > 0 else None
|
|
self.detected_realtime_language = info.language if info.language_probability > 0 else None
|
|
self.detected_realtime_language_probability = info.language_probability
|
|
self.detected_realtime_language_probability = info.language_probability
|
|
realtime_text = segments
|
|
realtime_text = segments
|
|
|
|
+ logging.debug(f"Realtime text detected with main model: {realtime_text}")
|
|
else:
|
|
else:
|
|
logging.error(f"Realtime transcription error: {result}")
|
|
logging.error(f"Realtime transcription error: {result}")
|
|
continue
|
|
continue
|
|
@@ -1622,6 +1823,7 @@ class AudioToTextRecorder:
|
|
realtime_text = " ".join(
|
|
realtime_text = " ".join(
|
|
seg.text for seg in segments
|
|
seg.text for seg in segments
|
|
)
|
|
)
|
|
|
|
+ logging.debug(f"Realtime text detected: {realtime_text}")
|
|
|
|
|
|
# double check recording state
|
|
# double check recording state
|
|
# because it could have changed mid-transcription
|
|
# because it could have changed mid-transcription
|
|
@@ -1764,19 +1966,19 @@ class AudioToTextRecorder:
|
|
speech_frames += 1
|
|
speech_frames += 1
|
|
if not all_frames_must_be_true:
|
|
if not all_frames_must_be_true:
|
|
if self.debug_mode:
|
|
if self.debug_mode:
|
|
- print(f"Speech detected in frame {i + 1}"
|
|
|
|
|
|
+ logging.info(f"Speech detected in frame {i + 1}"
|
|
f" of {num_frames}")
|
|
f" of {num_frames}")
|
|
return True
|
|
return True
|
|
if all_frames_must_be_true:
|
|
if all_frames_must_be_true:
|
|
if self.debug_mode and speech_frames == num_frames:
|
|
if self.debug_mode and speech_frames == num_frames:
|
|
- print(f"Speech detected in {speech_frames} of "
|
|
|
|
|
|
+ logging.info(f"Speech detected in {speech_frames} of "
|
|
f"{num_frames} frames")
|
|
f"{num_frames} frames")
|
|
elif self.debug_mode:
|
|
elif self.debug_mode:
|
|
- print(f"Speech not detected in all {num_frames} frames")
|
|
|
|
|
|
+ logging.info(f"Speech not detected in all {num_frames} frames")
|
|
return speech_frames == num_frames
|
|
return speech_frames == num_frames
|
|
else:
|
|
else:
|
|
if self.debug_mode:
|
|
if self.debug_mode:
|
|
- print(f"Speech not detected in any of {num_frames} frames")
|
|
|
|
|
|
+ logging.info(f"Speech not detected in any of {num_frames} frames")
|
|
return False
|
|
return False
|
|
|
|
|
|
def _check_voice_activity(self, data):
|
|
def _check_voice_activity(self, data):
|
|
@@ -1841,6 +2043,9 @@ class AudioToTextRecorder:
|
|
# Update to the new state
|
|
# Update to the new state
|
|
self.state = new_state
|
|
self.state = new_state
|
|
|
|
|
|
|
|
+ # Log the state change
|
|
|
|
+ logging.info(f"State changed from '{old_state}' to '{new_state}'")
|
|
|
|
+
|
|
# Execute callbacks based on transitioning FROM a particular state
|
|
# Execute callbacks based on transitioning FROM a particular state
|
|
if old_state == "listening":
|
|
if old_state == "listening":
|
|
if self.on_vad_detect_stop:
|
|
if self.on_vad_detect_stop:
|