@@ -34,6 +34,7 @@ from ctypes import c_bool
 from openwakeword.model import Model
 from scipy.signal import resample
 from scipy import signal
+import signal as system_signal
 import faster_whisper
 import openwakeword
 import collections
@@ -72,7 +73,7 @@ INIT_PRE_RECORDING_BUFFER_DURATION = 1.0
 INIT_WAKE_WORD_ACTIVATION_DELAY = 0.0
 INIT_WAKE_WORD_TIMEOUT = 5.0
 INIT_WAKE_WORD_BUFFER_DURATION = 0.1
-ALLOWED_LATENCY_LIMIT = 10
+ALLOWED_LATENCY_LIMIT = 100

 TIME_SLEEP = 0.02
 SAMPLE_RATE = 16000
@@ -159,8 +160,10 @@ class AudioToTextRecorder:
                  sample_rate: int = SAMPLE_RATE,
                  initial_prompt: Optional[Union[str, Iterable[int]]] = None,
                  suppress_tokens: Optional[List[int]] = [-1],
-                 log_transcription_time: bool = False,
-                 early_transcription_on_silence: bool = True
+                 print_transcription_time: bool = False,
+                 early_transcription_on_silence: int = 0,
+                 allowed_latency_limit: int = ALLOWED_LATENCY_LIMIT,
+                 no_log_file: bool = False
                  ):
         """
         Initializes an audio recorder and transcription
@@ -341,15 +344,22 @@ class AudioToTextRecorder:
           prompt to be fed to the transcription models.
         - suppress_tokens (list of int, default=[-1]): Tokens to be suppressed
           from the transcription output.
-        - log_transcription_time (bool, default=False): Logs processing time
+        - print_transcription_time (bool, default=False): Logs processing time
           of main model transcription
-        - early_transcription_on_silence (bool, default=True): If True, the
-          system will immediately transcribe audio when silence is detected.
+        - early_transcription_on_silence (int, default=0): If set, the
+          system will transcribe audio faster when silence is detected.
+          Transcription starts after the specified seconds of silence, so
+          keep this value lower than post_speech_silence_duration.
+          Ideally around post_speech_silence_duration minus the estimated
+          transcription time with the main model.
           If silence lasts longer than post_speech_silence_duration, the
           recording is stopped, and the transcription is submitted. If
           voice activity resumes within this period, the transcription
           is discarded. Results in faster final transcriptions to the cost
-          of some unnecessary final transcriptions
+          of additional GPU load due to some unnecessary final transcriptions.
+        - allowed_latency_limit (int, default=100): Maximum number of chunks
+          that may sit unprocessed in the queue before old chunks are discarded.
+        - no_log_file (bool, default=False): Skips writing of the debug log file.

     Raises:
         Exception: Errors related to initializing transcription
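A minimal construction sketch for the new parameters (values are illustrative, not recommendations; the import path is the package's documented entry point; the silence comparison in `_recording_worker` treats `early_transcription_on_silence` on the same seconds scale as `post_speech_silence_duration`):

```python
from RealtimeSTT import AudioToTextRecorder

if __name__ == '__main__':
    # Illustrative: with 0.6 s of required end-of-speech silence and a main
    # model that needs roughly 0.4 s per utterance, starting the early
    # transcription at 0.6 - 0.4 = 0.2 s of silence hides most of the latency.
    recorder = AudioToTextRecorder(
        post_speech_silence_duration=0.6,
        early_transcription_on_silence=0.2,   # 0 disables early transcription
        allowed_latency_limit=100,            # max unprocessed chunks in queue
        print_transcription_time=True,        # print timing instead of logging
        no_log_file=True,                     # do not write realtimesst.log
    )
    print(recorder.text())
```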
@@ -400,7 +410,7 @@ class AudioToTextRecorder:
         self.handle_buffer_overflow = handle_buffer_overflow
         self.beam_size = beam_size
         self.beam_size_realtime = beam_size_realtime
-        self.allowed_latency_limit = ALLOWED_LATENCY_LIMIT
+        self.allowed_latency_limit = allowed_latency_limit

         self.level = level
         self.audio_queue = mp.Queue()
@@ -441,35 +451,40 @@ class AudioToTextRecorder:
         self.detected_realtime_language_probability = 0
         self.transcription_lock = threading.Lock()
         self.transcribe_count = 0
-        self.log_transcription_time = log_transcription_time
+        self.print_transcription_time = print_transcription_time
         self.early_transcription_on_silence = early_transcription_on_silence

         # Initialize the logging configuration with the specified level
         log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
-        file_log_format = '%(asctime)s - ' + log_format
+
+        # Adjust file_log_format to include milliseconds
+        file_log_format = '%(asctime)s.%(msecs)03d - ' + log_format

         # Get the root logger
         logger = logging.getLogger()
-        logger.setLevel(level)  # Set the logger's level
-        logger.propagate = False  # Prevent propagation to higher-level loggers
+        logger.setLevel(logging.DEBUG)  # Set the root logger's level to DEBUG

         # Remove any existing handlers
         logger.handlers = []

-        # Create a file handler and set its level
-        file_handler = logging.FileHandler('realtimesst.log')
-        file_handler.setLevel(logging.DEBUG)
-        file_handler.setFormatter(logging.Formatter(file_log_format, datefmt='%Y-%m-%d %H:%M:%S'))
-
         # Create a console handler and set its level
         console_handler = logging.StreamHandler()
-        console_handler.setLevel(level)
+        console_handler.setLevel(level)
         console_handler.setFormatter(logging.Formatter(log_format))

         # Add the handlers to the logger
-        logger.addHandler(file_handler)
+        if not no_log_file:
+            # Create a file handler and set its level
+            file_handler = logging.FileHandler('realtimesst.log')
+            file_handler.setLevel(logging.DEBUG)
+            file_handler.setFormatter(logging.Formatter(
+                file_log_format,
+                datefmt='%Y-%m-%d %H:%M:%S'
+            ))
+
+            logger.addHandler(file_handler)
         logger.addHandler(console_handler)
-
+
         self.is_shut_down = False
         self.shutdown_event = mp.Event()

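The change above is the standard stdlib pattern of filtering per handler instead of at the logger: the root logger is opened wide (DEBUG) and each handler applies its own level, so the optional file handler keeps full detail while the console only shows records at the requested level. A self-contained sketch (names mirror the constructor arguments; `example.log` is a placeholder path):

```python
import logging

level = logging.WARNING   # stands in for the recorder's `level` argument
no_log_file = False       # stands in for the new constructor flag

log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
file_log_format = '%(asctime)s.%(msecs)03d - ' + log_format  # with milliseconds

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)   # root lets everything through
logger.handlers = []

console_handler = logging.StreamHandler()
console_handler.setLevel(level)  # console filters to the requested level
console_handler.setFormatter(logging.Formatter(log_format))
logger.addHandler(console_handler)

if not no_log_file:
    file_handler = logging.FileHandler('example.log')
    file_handler.setLevel(logging.DEBUG)  # file keeps full detail
    file_handler.setFormatter(logging.Formatter(
        file_log_format, datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(file_handler)

logging.debug("lands in example.log only")
logging.warning("lands in both")
```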
@@ -793,6 +808,9 @@ class AudioToTextRecorder:
             Exception: If there is an error while initializing the
             transcription model.
         """
+
+        system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)
+
         def custom_print(*args, **kwargs):
             message = ' '.join(map(str, args))
             try:
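The `SIG_IGN` line above keeps Ctrl+C from killing the worker process directly: the terminal delivers SIGINT to the whole foreground process group, so without it the child would die with its own KeyboardInterrupt before the parent can run shutdown logic. A minimal standalone sketch of the pattern (not the recorder's actual worker):

```python
import multiprocessing as mp
import signal as system_signal
import time

def worker(shutdown_event):
    # Child ignores SIGINT; only the parent reacts to Ctrl+C.
    system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)
    while not shutdown_event.is_set():
        time.sleep(0.1)  # stand-in for transcription / audio work

if __name__ == '__main__':
    shutdown_event = mp.Event()
    p = mp.Process(target=worker, args=(shutdown_event,))
    p.start()
    try:
        time.sleep(5)
    except KeyboardInterrupt:
        shutdown_event.set()  # the parent decides how children wind down
    p.join()
```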
@@ -904,6 +922,7 @@ class AudioToTextRecorder:
         import pyaudio
         import numpy as np
         from scipy import signal
+        system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)

         def get_highest_sample_rate(audio_interface, device_index):
             """Get the highest supported sample rate for the specified device."""
@@ -1013,6 +1032,7 @@ class AudioToTextRecorder:
         buffer = bytearray()
         silero_buffer_size = 2 * buffer_size  # silero complains if too short

+        time_since_last_buffer_message = 0
        try:
            while not shutdown_event.is_set():
                try:
@@ -1029,7 +1049,16 @@ class AudioToTextRecorder:
                     buffer = buffer[silero_buffer_size:]

                     # Feed the extracted data to the audio_queue
+                    if time_since_last_buffer_message:
+                        time_passed = time.time() - time_since_last_buffer_message
+                        if time_passed > 1:
+                            logging.debug("_audio_data_worker writing audio data into queue.")
+                            time_since_last_buffer_message = time.time()
+                    else:
+                        time_since_last_buffer_message = time.time()
+
                     audio_queue.put(to_process)
+

                 except OSError as e:
                     if e.errno == pyaudio.paInputOverflowed:
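This once-per-second throttle keeps a per-chunk hot loop from flooding the debug log. Restated as a standalone sketch (same idiom, simplified into a helper; the loop and message are stand-ins):

```python
import logging
import time

logging.basicConfig(level=logging.DEBUG)
time_since_last_buffer_message = 0

def log_throttled(message):
    """Emit a debug line at most once per second."""
    global time_since_last_buffer_message
    now = time.time()
    if not time_since_last_buffer_message or now - time_since_last_buffer_message > 1:
        logging.debug(message)
        time_since_last_buffer_message = now

for _ in range(1000):              # stands in for the audio chunk loop
    log_throttled("worker writing audio data into queue.")
    time.sleep(0.005)
```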
@@ -1120,42 +1149,48 @@ class AudioToTextRecorder:
         - Modifies the audio attribute to contain the processed audio data.
         """

-        logging.info("Setting listen time")
-        if self.listen_start == 0:
-            self.listen_start = time.time()
-
-        # If not yet started recording, wait for voice activity to initiate.
-        if not self.is_recording and not self.frames:
-            self._set_state("listening")
-            self.start_recording_on_voice_activity = True
+        try:
+            logging.info("Setting listen time")
+            if self.listen_start == 0:
+                self.listen_start = time.time()
+
+            # If not yet started recording, wait for voice activity to initiate.
+            if not self.is_recording and not self.frames:
+                self._set_state("listening")
+                self.start_recording_on_voice_activity = True
+
+            # Wait until recording starts
+            logging.debug('Waiting for recording start')
+            while not self.interrupt_stop_event.is_set():
+                if self.start_recording_event.wait(timeout=0.02):
+                    break

-        # Wait until recording starts
-        logging.debug('Waiting for recording start')
-        while not self.interrupt_stop_event.is_set():
-            if self.start_recording_event.wait(timeout=0.02):
-                break
+            # If recording is ongoing, wait for voice inactivity
+            # to finish recording.
+            if self.is_recording:
+                self.stop_recording_on_voice_deactivity = True

-        # If recording is ongoing, wait for voice inactivity
-        # to finish recording.
-        if self.is_recording:
-            self.stop_recording_on_voice_deactivity = True
+            # Wait until recording stops
+            logging.debug('Waiting for recording stop')
+            while not self.interrupt_stop_event.is_set():
+                if (self.stop_recording_event.wait(timeout=0.02)):
+                    break

-        # Wait until recording stops
-        logging.debug('Waiting for recording stop')
-        while not self.interrupt_stop_event.is_set():
-            if (self.stop_recording_event.wait(timeout=0.02)):
-                break
+            # Convert recorded frames to the appropriate audio format.
+            audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+            self.audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+            self.frames.clear()

-        # Convert recorded frames to the appropriate audio format.
-        audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
-        self.audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
-        self.frames.clear()
+            # Reset recording-related timestamps
+            self.recording_stop_time = 0
+            self.listen_start = 0

-        # Reset recording-related timestamps
-        self.recording_stop_time = 0
-        self.listen_start = 0
+            self._set_state("inactive")

-        self._set_state("inactive")
+        except KeyboardInterrupt:
+            logging.info("KeyboardInterrupt in wait_audio, shutting down")
+            self.shutdown()
+            raise  # Re-raise the exception after cleanup

     def transcribe(self):
         """
@@ -1184,18 +1219,22 @@ class AudioToTextRecorder:
         """
         self._set_state("transcribing")
         audio_copy = copy.deepcopy(self.audio)
-        start_time = time.time()  # Start timing
+        start_time = 0
         with self.transcription_lock:
+
             try:
                 if self.transcribe_count == 0:
+                    logging.debug("Adding transcription request, no early transcription started")
+                    start_time = time.time()  # Start timing
                     self.parent_transcription_pipe.send((self.audio, self.language))
                     self.transcribe_count += 1

                 while self.transcribe_count > 0:
-                    logging.debug("Receive from parent_transcription_pipe pipe after sendiung transcription request")
+                    logging.debug(f"Receive from parent_transcription_pipe after sending transcription request, transcribe_count: {self.transcribe_count}")
                     status, result = self.parent_transcription_pipe.recv()
                     self.transcribe_count -= 1

+                self.allowed_to_early_transcribe = True
                 self._set_state("inactive")
                 if status == 'success':
                     segments, info = result
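The `transcribe_count` bookkeeping pairs requests and replies over the pipe: every send (normal or early) bumps the counter, and the receive loop drains exactly that many results, keeping only the newest. A sketch of that protocol (an inline echo stands in for the real transcription worker process):

```python
import multiprocessing as mp

parent_pipe, child_pipe = mp.Pipe()
transcribe_count = 0

for request in ("early audio", "final audio"):   # early + final request
    parent_pipe.send(request)
    transcribe_count += 1

while child_pipe.poll():                         # stand-in worker: echo back
    child_pipe.send(("success", child_pipe.recv()))

while transcribe_count > 0:                      # drain; keep the newest reply
    status, result = parent_pipe.recv()
    transcribe_count -= 1

print(status, result)
```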
@@ -1206,8 +1245,11 @@ class AudioToTextRecorder:
                     end_time = time.time()  # End timing
                     transcription_time = end_time - start_time

-                    if self.log_transcription_time:
-                        logging.info(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
+                    if start_time:
+                        if self.print_transcription_time:
+                            print(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
+                        else:
+                            logging.debug(f"Model {self.main_model_type} completed transcription in {transcription_time:.2f} seconds")
                     return transcription
                 else:
                     logging.error(f"Transcription error: {result}")
@@ -1280,11 +1322,14 @@ class AudioToTextRecorder:
         Returns (if not callback is set):
             str: The transcription of the recorded audio
         """
-
         self.interrupt_stop_event.clear()
         self.was_interrupted.clear()
-
-        self.wait_audio()
+        try:
+            self.wait_audio()
+        except KeyboardInterrupt:
+            logging.info("KeyboardInterrupt in text() method")
+            self.shutdown()
+            raise  # Re-raise the exception after cleanup

         if self.is_shut_down or self.interrupt_stop_event.is_set():
             if self.interrupt_stop_event.is_set():
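Usage sketch: with the try/except above, Ctrl+C during a blocking `text()` call triggers recorder shutdown before the KeyboardInterrupt is re-raised, so a caller only needs to catch it for a clean exit (illustrative, not the project's example script):

```python
from RealtimeSTT import AudioToTextRecorder

if __name__ == '__main__':
    recorder = AudioToTextRecorder()
    try:
        while True:
            print(recorder.text())
    except KeyboardInterrupt:
        # shutdown() already ran inside text() before the re-raise
        print("exiting cleanly")
```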
@@ -1293,10 +1338,51 @@ class AudioToTextRecorder:

         if on_transcription_finished:
             threading.Thread(target=on_transcription_finished,
-                            args=(self.transcribe(),)).start()
+                             args=(self.transcribe(),)).start()
         else:
             return self.transcribe()

+    # def text(self,
+    #          on_transcription_finished=None,
+    #          ):
+    #     """
+    #     Transcribes audio captured by this class instance
+    #     using the `faster_whisper` model.
+
+    #     - Automatically starts recording upon voice activity if not manually
+    #       started using `recorder.start()`.
+    #     - Automatically stops recording upon voice deactivity if not manually
+    #       stopped with `recorder.stop()`.
+    #     - Processes the recorded audio to generate transcription.
+
+    #     Args:
+    #         on_transcription_finished (callable, optional): Callback function
+    #           to be executed when transcription is ready.
+    #         If provided, transcription will be performed asynchronously, and
+    #           the callback will receive the transcription as its argument.
+    #           If omitted, the transcription will be performed synchronously,
+    #           and the result will be returned.
+
+    #     Returns (if not callback is set):
+    #         str: The transcription of the recorded audio
+    #     """
+
+    #     self.interrupt_stop_event.clear()
+    #     self.was_interrupted.clear()
+
+    #     self.wait_audio()
+
+    #     if self.is_shut_down or self.interrupt_stop_event.is_set():
+    #         if self.interrupt_stop_event.is_set():
+    #             self.was_interrupted.set()
+    #         return ""
+
+    #     if on_transcription_finished:
+    #         threading.Thread(target=on_transcription_finished,
+    #                          args=(self.transcribe(),)).start()
+    #     else:
+    #         return self.transcribe()
+
     def start(self):
         """
         Starts recording audio directly without waiting for voice activity.
@@ -1412,6 +1498,9 @@ class AudioToTextRecorder:
             recording worker and closing the audio stream.
         """

+        print("RealtimeSTT shutting down")
+        logging.debug("RealtimeSTT shutting down")
+
         # Force wait_audio() and text() to exit
         self.is_shut_down = True
         self.start_recording_event.set()
@@ -1467,10 +1556,12 @@ class AudioToTextRecorder:
         logging.debug('Starting recording worker')

         try:
+            time_since_last_buffer_message = 0
             was_recording = False
             delay_was_passed = False
             wakeword_detected_time = None
             wakeword_samples_to_remove = None
+            self.allowed_to_early_transcribe = True

             # Continuously monitor audio for voice activity
             while self.is_running:
@@ -1490,11 +1581,13 @@ class AudioToTextRecorder:
                 # Handle queue overflow
                 if (self.audio_queue.qsize() >
                         self.allowed_latency_limit):
+                    logging.warning("!!! ### !!! ### !!!")
                     logging.warning("Audio queue size exceeds "
                                     "latency limit. Current size: "
                                     f"{self.audio_queue.qsize()}. "
                                     "Discarding old audio chunks."
                                     )
+                    logging.warning("!!! ### !!! ### !!!")

                 while (self.audio_queue.qsize() >
                         self.allowed_latency_limit):
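A sketch of this overflow policy with a plain `queue.Queue` standing in for the recorder's `mp.Queue`: once more than `allowed_latency_limit` chunks are waiting, the oldest ones are dropped so transcription latency stays bounded instead of growing without limit.

```python
import queue

allowed_latency_limit = 100  # mirrors the constructor argument

def drain_to_limit(audio_queue, limit):
    """Discard the oldest queued chunks until the backlog fits the limit."""
    discarded = 0
    while audio_queue.qsize() > limit:
        try:
            audio_queue.get_nowait()   # drop the oldest chunk
            discarded += 1
        except queue.Empty:
            break
    return discarded
```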
@@ -1506,8 +1599,18 @@ class AudioToTextRecorder:
                     self.is_running = False
                     break

+                # Log worker activity, at most once per second
+                if time_since_last_buffer_message:
+                    time_passed = time.time() - time_since_last_buffer_message
+                    if time_passed > 1:
+                        logging.debug("_recording_worker processing audio data")
+                        time_since_last_buffer_message = time.time()
+                else:
+                    time_since_last_buffer_message = time.time()
+
+                failed_stop_attempt = False
+
                 if not self.is_recording:
-                    logging.info(f"not recording, state: {self.state}, self.recording_stop_time: {self.recording_stop_time}, self.listen_start: {self.listen_start}")
                     # Handle not recording state
                     time_since_listen_start = (time.time() - self.listen_start
                                                if self.listen_start else 0)
@@ -1538,7 +1641,6 @@ class AudioToTextRecorder:
                     else:
                         self._set_state("inactive")

-                    #self.wake_word_detect_time = time.time()
                     if self.use_wake_words and wake_word_activation_delay_passed:
                         try:
                             wakeword_index = self._process_wakeword(data)
@@ -1554,6 +1656,7 @@ class AudioToTextRecorder:

                         # If a wake word is detected
                         if wakeword_index >= 0:
+                            self.wake_word_detect_time = time.time()
                             wakeword_detected_time = time.time()
                             wakeword_samples_to_remove = int(self.sample_rate * self.wake_word_buffer_duration)
                             self.wakeword_detected = True
@@ -1612,31 +1715,62 @@ class AudioToTextRecorder:
                             else self._is_webrtc_speech(data, True)
                         )

+                        if not self.speech_end_silence_start:
+                            str_speech_end_silence_start = "0"
+                        else:
+                            str_speech_end_silence_start = datetime.datetime.fromtimestamp(self.speech_end_silence_start).strftime('%H:%M:%S.%f')[:-3]
+                        logging.debug(f"is_speech: {is_speech}, str_speech_end_silence_start: {str_speech_end_silence_start}")
+
                         if not is_speech:
                             # Voice deactivity was detected, so we start
                             # measuring silence time before stopping recording
-                            if self.speech_end_silence_start == 0:
+                            if self.speech_end_silence_start == 0 and \
+                               (time.time() - self.recording_start_time > self.min_length_of_recording):
+
                                 self.speech_end_silence_start = time.time()
-                                if self.early_transcription_on_silence and len(self.frames) > 0:
-                                    audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
-                                    audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
-                                    self.parent_transcription_pipe.send((audio, self.language))
-                                    self.transcribe_count += 1
+
+                            if self.speech_end_silence_start and self.early_transcription_on_silence and len(self.frames) > 0 and \
+                               (time.time() - self.speech_end_silence_start > self.early_transcription_on_silence) and \
+                               self.allowed_to_early_transcribe:
+                                logging.debug("Adding early transcription request")
+                                self.transcribe_count += 1
+                                audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+                                audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
+                                self.parent_transcription_pipe.send((audio, self.language))
+                                self.allowed_to_early_transcribe = False
+
                         else:
-                            self.speech_end_silence_start = 0
+                            if self.speech_end_silence_start:
+                                logging.info("Resetting self.speech_end_silence_start")
+                                self.speech_end_silence_start = 0
+                                self.allowed_to_early_transcribe = True
+

                         # Wait for silence to stop recording after speech
                         if self.speech_end_silence_start and time.time() - \
                                 self.speech_end_silence_start >= \
                                 self.post_speech_silence_duration:
-                            logging.info("voice deactivity detected")
+
+                            # Get time in desired format (HH:MM:SS.nnn)
+                            silence_start_time = datetime.datetime.fromtimestamp(self.speech_end_silence_start).strftime('%H:%M:%S.%f')[:-3]
+
+                            # Calculate time difference
+                            time_diff = time.time() - self.speech_end_silence_start
+
+                            logging.info(f"voice deactivity detected at {silence_start_time}, "
+                                         f"time since silence start: {time_diff:.3f} seconds")
+
                             self.frames.append(data)
                             self.stop()
+                            if not self.is_recording:
+                                self.speech_end_silence_start = 0

-                            if not self.use_wake_words:
-                                self.listen_start = time.time()
-                                self._set_state("listening")
-                                self.start_recording_on_voice_activity = True
+                                if not self.use_wake_words:
+                                    self.listen_start = time.time()
+                                    self._set_state("listening")
+                                    self.start_recording_on_voice_activity = True
+                            else:
+                                failed_stop_attempt = True

                 if not self.is_recording and was_recording:
                     # Reset after stopping recording to ensure clean state
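A worked example of the timing relationship the early-transcription check enforces (values are illustrative, assuming the seconds interpretation used by the silence comparison above):

```python
# With 0.7 s of required end-of-speech silence and a main model that needs
# roughly 0.5 s per utterance, starting the early transcription at
# 0.7 - 0.5 = 0.2 s of silence means the result is usually ready the moment
# the silence threshold is reached.
post_speech_silence_duration = 0.7
estimated_transcription_time = 0.5
early_transcription_on_silence = post_speech_silence_duration - estimated_transcription_time
assert early_transcription_on_silence < post_speech_silence_duration
print(f"start early transcription after {early_transcription_on_silence:.1f} s of silence")
```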
@@ -1657,7 +1791,7 @@ class AudioToTextRecorder:

                 was_recording = self.is_recording

-                if self.is_recording:
+                if self.is_recording and not failed_stop_attempt:
                     self.frames.append(data)

                 if not self.is_recording or self.speech_end_silence_start:
@@ -1753,7 +1887,7 @@ class AudioToTextRecorder:
                 if self.is_recording and time.time() - \
                         self.recording_start_time > 0.5:

-                    logging.debug('Starting realtime transcription')
+                    # logging.debug('Starting realtime transcription')
                     self.realtime_transcription_text = realtime_text
                     self.realtime_transcription_text = \
                         self.realtime_transcription_text.strip()