audio_recorder.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462
  1. """
  2. The AudioToTextRecorder class in the provided code facilitates fast speech-to-text transcription.
  3. The class employs the faster_whisper library to transcribe the recorded audio
  4. into text using machine learning models, which can be run either on a GPU or CPU.
  5. Voice activity detection (VAD) is built in, meaning the software can automatically
  6. start or stop recording based on the presence or absence of speech.
  7. Additionally, it uses both short-term and long-term noise analysis to determine
  8. when actual voice activity occurs, as opposed to ambient noise.
  9. It integrates wake word detection through the pvporcupine library, allowing the
  10. software to initiate recording when a specific word or phrase is spoken.
  11. The system provides real-time feedback and can be further customized with multiple
  12. parameters like wake word sensitivity, recording intervals, and buffer durations.
  13. Features:
  14. - Voice Activity Detection: Automatically starts/stops recording when speech is detected or when speech ends.
  15. - Wake Word Detection: Starts recording when a specified wake word (or words) is detected.
  16. - Buffer Management: Handles short and long term audio buffers for efficient processing.
  17. - Event Callbacks: Customizable callbacks for when recording starts or finishes.
  18. - Noise Level Calculation: Adjusts based on the background noise for more accurate voice activity detection.
  19. Author: Kolja Beigel
  20. """
  21. import pyaudio
  22. import collections
  23. import faster_whisper
  24. import torch
  25. import numpy as np
  26. import struct
  27. import pvporcupine
  28. import threading
  29. import time
  30. import logging
  31. from collections import deque
# Default capture parameters; both are overridden by Porcupine's required
# frame length / sample rate when wake word detection is enabled.
SAMPLE_RATE = 16000
BUFFER_SIZE = 512
# Window lengths (in seconds) for the noise-level history deques.
LONG_TERM_HISTORY_BUFFERSIZE = 2.0 # seconds
SHORT_TERM_HISTORY_BUFFERSIZE = 2.0 # seconds
# Ignore voice-activity checks right after recording starts (wake word fragments).
WAIT_AFTER_START_BEFORE_ACTIVITY_DETECTION = 0.3 # seconds
# After recording starts, voice activity is confirmed at this fraction of the
# normal activity threshold.
ACTIVITY_DETECTION_AFTER_START_PERCENT = 0.6
  38. class AudioToTextRecorder:
  39. """
  40. A class responsible for capturing audio from the microphone, detecting voice activity, and then transcribing the captured audio using the `faster_whisper` model.
  41. """
    def __init__(self,
                 model: str = "tiny",
                 language: str = "",
                 wake_words: str = "",
                 wake_words_sensitivity: float = 0.5,
                 on_recording_started=None,
                 on_recording_finished=None,
                 min_recording_interval: float = 1.0,
                 interval_between_records: float = 1.0,
                 buffer_duration: float = 1.0,
                 voice_activity_threshold: float = 250,
                 voice_deactivity_sensitivity: float = 0.3,
                 voice_deactivity_silence_after_speech_end: float = 0.1,
                 long_term_smoothing_factor: float = 0.995,
                 short_term_smoothing_factor: float = 0.900,
                 level=logging.WARNING,
                 ):
        """
        Initializes an audio recorder and transcription and wake word detection.

        Args:
            model (str): Specifies the size of the transcription model to use or the path to a converted model directory.
                Valid options are 'tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large-v1', 'large-v2'.
                If a specific size is provided, the model is downloaded from the Hugging Face Hub.
            language (str): Language code for speech-to-text engine. If not specified, the model will attempt to detect the language automatically.
            wake_words (str): Comma-separated string of wake words to initiate recording. Supported wake words include:
                'alexa', 'americano', 'blueberry', 'bumblebee', 'computer', 'grapefruit', 'grasshopper', 'hey google', 'hey siri', 'jarvis', 'ok google', 'picovoice', 'porcupine', 'terminator'.
            wake_words_sensitivity (float): Sensitivity for wake word detection, ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.5.
            on_recording_started (callable, optional): Callback invoked when recording begins.
            on_recording_finished (callable, optional): Callback invoked when recording ends.
            min_recording_interval (float): Minimum interval (in seconds) for recording durations.
            interval_between_records (float): Interval (in seconds) between consecutive recordings.
            buffer_duration (float): Duration (in seconds) to maintain pre-roll audio in the buffer.
            voice_activity_threshold (float): Threshold level above long-term noise to determine the start of voice activity.
            voice_deactivity_sensitivity (float): Sensitivity for voice deactivation detection, ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.3.
            voice_deactivity_silence_after_speech_end (float): Duration (in seconds) of silence after speech ends to trigger voice deactivation. Default is 0.1.
            long_term_smoothing_factor (float): Exponential smoothing factor used in calculating long-term noise level.
            short_term_smoothing_factor (float): Exponential smoothing factor used in calculating short-term noise level.
            level (logging level): Desired log level for internal logging. Default is `logging.WARNING`.

        Raises:
            Exception: Errors related to initializing transcription model, wake word detection, or audio recording.
        """
        # User-supplied configuration, kept verbatim for the worker thread.
        self.language = language
        self.wake_words = wake_words
        self.min_recording_interval = min_recording_interval
        self.interval_between_records = interval_between_records
        self.buffer_duration = buffer_duration
        self.voice_activity_threshold = voice_activity_threshold
        self.voice_deactivity_sensitivity = voice_deactivity_sensitivity
        self.voice_deactivity_silence_after_speech_end = voice_deactivity_silence_after_speech_end
        self.long_term_smoothing_factor = long_term_smoothing_factor
        self.short_term_smoothing_factor = short_term_smoothing_factor
        self.on_recording_started = on_recording_started
        self.on_recording_finished = on_recording_finished
        self.level = level

        # Capture parameters; overridden below when Porcupine is active.
        self.buffer_size = BUFFER_SIZE
        self.sample_rate = SAMPLE_RATE

        # Timing / level-tracking state shared with the recording worker thread.
        self.last_start_time = 0  # time when the recording last started
        self.last_stop_time = 0  # time when the recording last stopped
        self.speech_end_silence_start = 0  # timestamp when post-speech silence began (0 = none)
        self.level_long_term = 0  # exponentially smoothed long-term noise level
        self.level_short_term = 0  # exponentially smoothed short-term level
        self.level_peak = 0  # mean of the top percentile of recent short-term levels
        self.level_floor = 0  # mean of the bottom percentile of recent short-term levels
        self.voice_deactivity_probability = 0
        self.long_term_noise_calculation = True
        self.state = "initializing"

        # Initialize the logging configuration with the specified level
        logging.basicConfig(format='RealTimeSTT: %(message)s', level=level)

        # Initialize the transcription model (GPU when available, else CPU).
        try:
            self.model = faster_whisper.WhisperModel(model_size_or_path=model, device='cuda' if torch.cuda.is_available() else 'cpu')
        except Exception as e:
            logging.exception(f"Error initializing faster_whisper transcription model: {e}")
            raise

        # Setup wake word detection
        if wake_words:
            self.wake_words_list = [word.strip() for word in wake_words.split(',')]
            sensitivity_list = [float(wake_words_sensitivity) for _ in range(len(self.wake_words_list))]
            try:
                self.porcupine = pvporcupine.create(keywords=self.wake_words_list, sensitivities=sensitivity_list)
                # Porcupine dictates the frame length and sample rate it processes.
                self.buffer_size = self.porcupine.frame_length
                self.sample_rate = self.porcupine.sample_rate
            except Exception as e:
                logging.exception(f"Error initializing porcupine wake word detection engine: {e}")
                raise

        # Setup audio recording infrastructure (mono 16-bit input stream).
        try:
            self.audio = pyaudio.PyAudio()
            self.stream = self.audio.open(rate=self.sample_rate, format=pyaudio.paInt16, channels=1, input=True, frames_per_buffer=self.buffer_size)
        except Exception as e:
            logging.exception(f"Error initializing pyaudio audio recording: {e}")
            raise

        # This will store the noise levels for the last x seconds.
        # Assuming data is captured at the buffer size rate, determine how many entries fit.
        buffersize_long_term_history = int((self.sample_rate // self.buffer_size) * LONG_TERM_HISTORY_BUFFERSIZE)
        self.long_term_noise_history = deque(maxlen=buffersize_long_term_history)
        buffersize_short_term_history = int((self.sample_rate // self.buffer_size) * SHORT_TERM_HISTORY_BUFFERSIZE)
        self.short_term_noise_history = deque(maxlen=buffersize_short_term_history)

        # Pre-roll buffer so audio captured just before the trigger is kept.
        self.audio_buffer = collections.deque(maxlen=int((self.sample_rate // self.buffer_size) * self.buffer_duration))
        self.frames = []  # raw audio chunks of the current recording

        # Recording control flags
        self.is_recording = False
        self.is_running = True
        self.start_recording_on_voice_activity = False
        self.stop_recording_on_voice_deactivity = False

        # Start the recording worker thread (daemon so it never blocks exit).
        self.recording_thread = threading.Thread(target=self._recording_worker)
        self.recording_thread.daemon = True
        self.recording_thread.start()
  151. def text(self):
  152. """
  153. Transcribes audio captured by the class instance using the `faster_whisper` model.
  154. - Waits for voice activity if not yet started recording
  155. - Waits for voice deactivity if not yet stopped recording
  156. - Transcribes the recorded audio.
  157. Returns:
  158. str: The transcription of the recorded audio or an empty string in case of an error.
  159. """
  160. try:
  161. # If not yet started to record, wait for voice activity to initiate recording.
  162. if not self.is_recording and len(self.frames) == 0:
  163. self.state = "listening"
  164. self.start_recording_on_voice_activity = True
  165. while not self.is_recording:
  166. time.sleep(0.1) # Use a small sleep to prevent busy-waiting.
  167. # If still recording, wait for voice deactivity to finish recording.
  168. if self.is_recording:
  169. self.state = "recording"
  170. self.stop_recording_on_voice_deactivity = True
  171. while self.is_recording:
  172. time.sleep(0.1) # Use a small sleep to prevent busy-waiting.
  173. # Convert the concatenated frames into text
  174. self.state = "transcribing"
  175. try:
  176. audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
  177. audio_array = audio_array.astype(np.float32) / 32768.0
  178. self.frames = []
  179. return " ".join(seg.text for seg in self.model.transcribe(audio_array, language=self.language if self.language else None)[0]).strip()
  180. except ValueError:
  181. logging.error("Error converting audio buffer to numpy array.")
  182. raise
  183. except faster_whisper.WhisperError as e:
  184. logging.error(f"Whisper transcription error: {e}")
  185. raise
  186. except Exception as e:
  187. logging.error(f"General transcription error: {e}")
  188. raise
  189. except Exception as e:
  190. print(f"Error during transcription: {e}")
  191. return ""
  192. def start(self):
  193. """
  194. Starts recording audio directly without waiting for voice activity.
  195. """
  196. current_time = time.time()
  197. # Ensure there's a minimum interval between stopping and starting recording
  198. if current_time - self.last_stop_time < self.interval_between_records:
  199. logging.info("Attempted to start recording too soon after stopping.")
  200. return self
  201. logging.info("recording started")
  202. self.state = "recording"
  203. self.frames = []
  204. self.is_recording = True
  205. self.last_start_time = current_time
  206. if self.on_recording_started:
  207. self.on_recording_started()
  208. return self
  209. def stop(self):
  210. logging.info("recording stopped")
  211. """
  212. Stops recording audio.
  213. """
  214. current_time = time.time()
  215. # Ensure there's a minimum interval between starting and stopping recording
  216. if current_time - self.last_start_time < self.interval_between_records:
  217. logging.info("Attempted to stop recording too soon after starting.")
  218. return self
  219. logging.info("recording stopped")
  220. self.state = "listening"
  221. self.is_recording = False
  222. self.last_stop_time = current_time
  223. if self.on_recording_finished:
  224. self.on_recording_finished()
  225. return self
  226. def shutdown(self):
  227. """
  228. Safely shuts down the audio recording by stopping the recording worker and closing the audio stream.
  229. """
  230. self.is_recording = False
  231. self.is_running = False
  232. self.recording_thread.join()
  233. try:
  234. self.stream.stop_stream()
  235. self.stream.close()
  236. self.audio.terminate()
  237. except Exception as e:
  238. logging.error(f"Error closing the audio stream: {e}")
  239. def _calculate_percentile_mean(self, buffer, percentile, upper=True):
  240. """
  241. Calculates the mean of the specified percentile from the provided buffer of
  242. long_term noise levels. If upper is True, it calculates from the upper side,
  243. otherwise from the lower side.
  244. Args:
  245. - buffer (list): The buffer containing the history of long_term noise levels.
  246. - percentile (float): The desired percentile (0.0 <= percentile <= 1.0). E.g., 0.125 for 1/8.
  247. - upper (bool): Determines if the function considers the upper or lower portion of data.
  248. Returns:
  249. - float: The mean value of the desired portion.
  250. """
  251. sorted_buffer = sorted(buffer)
  252. index = int(len(sorted_buffer) * percentile)
  253. if upper:
  254. values = sorted_buffer[-index:] # Get values from the top
  255. else:
  256. values = sorted_buffer[:index] # Get values from the bottom
  257. if len(values) == 0:
  258. return 0.0
  259. return sum(values) / len(values)
    def _recording_worker(self):
        """
        The main worker method which constantly monitors the audio input for voice
        activity and accordingly starts/stops the recording.

        Uses long_term noise level measurements to determine voice activity. Runs
        on the daemon thread started in __init__ until `is_running` is cleared.
        """
        was_recording = False  # recording state of the previous loop iteration
        voice_after_recording = False  # True once voice was heard after recording began

        # Continuously monitor audio for voice activity
        while self.is_running:
            try:
                data = self.stream.read(self.buffer_size)
            # NOTE(review): pyaudio.paInputOverflowed is an integer error code, not
            # an exception class — this clause likely never matches (PyAudio raises
            # OSError/IOError on overflow); confirm against the pyaudio docs.
            except pyaudio.paInputOverflowed:
                logging.warning("Input overflowed. Frame dropped.")
                continue
            except Exception as e:
                logging.error(f"Error during recording: {e}")
                time.sleep(1)
                continue

            # Mean absolute amplitude of this chunk (int16 PCM).
            audio_level = np.abs(np.frombuffer(data, dtype=np.int16)).mean()

            # Update the long-term noise floor only while idle, so speech during a
            # recording does not inflate the background estimate.
            if not self.is_recording and self.long_term_noise_calculation:
                self.level_long_term = self.level_long_term * self.long_term_smoothing_factor + audio_level * (1.0 - self.long_term_smoothing_factor)

            # Short-term level tracks the signal more quickly; both histories feed
            # the percentile-based peak/floor estimates.
            self.level_short_term = self.level_short_term * self.short_term_smoothing_factor + audio_level * (1.0 - self.short_term_smoothing_factor)
            self.long_term_noise_history.append(self.level_long_term)
            self.short_term_noise_history.append(self.level_short_term)
            self.level_peak = self._calculate_percentile_mean(self.short_term_noise_history, 0.05, upper=True)
            self.level_floor = self._calculate_percentile_mean(self.short_term_noise_history, 0.1, upper=False)

            # NOTE(review): can raise ZeroDivisionError when level_peak equals
            # level_floor (e.g. perfectly silent input right after startup) — confirm
            # whether a guard is wanted here.
            short_term_to_peak_percentage = (self.level_short_term - self.level_floor) / (self.level_peak - self.level_floor)
            if not self.is_recording:
                logging.debug(f'Level: {int(audio_level)}, long_term: {int(self.level_long_term)}, short_term: {int(self.level_short_term)}, Peak: {int(self.level_peak)}, long_term low: {int(self.level_floor)}, Percentage: {int(short_term_to_peak_percentage*100)}%')
            else:
                # While recording, measure relative to the frozen long-term floor
                # rather than the rolling short-term floor.
                short_term_to_peak_percentage = (self.level_short_term - self.level_long_term) / (self.level_peak - self.level_long_term)
                logging.debug(f'Level: {int(audio_level)}, long_term: {int(self.level_long_term)}, short_term: {int(self.level_short_term)}, Peak: {int(self.level_peak)}, long_term low: {int(self.level_floor)}, Percentage: {int(short_term_to_peak_percentage*100)}%')

            # Check if we're not currently recording
            if not self.is_recording:
                voice_after_recording = False

                # Check if wake word detection is active
                if self.wake_words:
                    try:
                        # Porcupine expects the chunk as a sequence of int16 samples.
                        pcm = struct.unpack_from("h" * self.buffer_size, data)
                        wakeword_index = self.porcupine.process(pcm)
                    except struct.error:
                        logging.error("Error unpacking audio data for wake word processing.")
                        continue
                    except Exception as e:
                        logging.error(f"Wake word processing error: {e}")
                        continue

                    # porcupine.process returns the index of the detected keyword,
                    # or a negative value when none was detected.
                    wakeword_detected = wakeword_index >= 0
                    if wakeword_detected:
                        logging.info(f'wake word "{self.wake_words_list[wakeword_index]}" detected')
                        self.start()
                        if self.is_recording:
                            # Re-baseline the noise floor from the quiet portion of history.
                            self.level_long_term = self._calculate_percentile_mean(self.long_term_noise_history, 0.125, upper=False)
                            self.start_recording_on_voice_activity = False

                # Check for voice activity to trigger the start of recording
                elif self.start_recording_on_voice_activity and self.level_short_term > self.level_long_term + self.voice_activity_threshold:
                    logging.info("voice activity detected")
                    self.start()
                    if self.is_recording:
                        self.level_long_term = self._calculate_percentile_mean(self.long_term_noise_history, 0.125, upper=False)
                        self.start_recording_on_voice_activity = False

                        # Add the buffered audio to the recording frames
                        self.frames.extend(list(self.audio_buffer))

                self.speech_end_silence_start = 0

            # If we're currently recording and voice deactivity is detected, stop the recording
            else:
                current_time = time.time()
                self.state = "recording - waiting for voice end" if voice_after_recording else "recording - waiting for voice"

                # we don't detect voice in the first x seconds cause it could be fragments from the wake word
                if current_time - self.last_start_time > WAIT_AFTER_START_BEFORE_ACTIVITY_DETECTION:
                    if not voice_after_recording and self.level_short_term > self.level_long_term + (self.voice_activity_threshold * ACTIVITY_DETECTION_AFTER_START_PERCENT):
                        logging.info("voice activity after recording detected")
                        voice_after_recording = True

                # we are recording
                short_term_to_peak_percentage = (self.level_short_term - self.level_long_term) / (self.level_peak - self.level_long_term)
                logging.debug(f'short_term_to_peak_percentage: {int(short_term_to_peak_percentage*100)}%, peak: {int(self.level_peak)}, long_term: {int(self.level_long_term)}')

                if voice_after_recording and self.stop_recording_on_voice_deactivity:
                    if short_term_to_peak_percentage < self.voice_deactivity_sensitivity:
                        # silence detected (after voice detected while recording)
                        if self.speech_end_silence_start == 0:
                            self.speech_end_silence_start = time.time()
                            self.state = "recording - voice end, silence wait"
                    else:
                        # Voice came back; reset the silence timer.
                        self.speech_end_silence_start = 0

                    # Stop only after the silence has lasted long enough.
                    if self.speech_end_silence_start and time.time() - self.speech_end_silence_start > self.voice_deactivity_silence_after_speech_end:
                        logging.info("voice deactivity detected")
                        self.stop()
                        if not self.is_recording:
                            voice_after_recording = False

            if not self.is_recording and was_recording:
                # Reset after stopping recording to ensure clean state
                self.stop_recording_on_voice_deactivity = False

            # Clamp to [0, 1] before deriving the deactivity probability.
            short_term_to_peak_percentage = min(max(short_term_to_peak_percentage, 0.0), 1.0)
            self.voice_deactivity_probability = 1 - short_term_to_peak_percentage

            if self.is_recording:
                self.frames.append(data)

            # Always keep the chunk in the pre-roll buffer.
            self.audio_buffer.append(data)
            was_recording = self.is_recording
            time.sleep(0.01)
    def __del__(self):
        """
        Destructor method ensures safe shutdown of the recorder when the instance is destroyed.
        """
        # Delegates to shutdown(): stops the worker thread and releases PyAudio resources.
        self.shutdown()