audio_recorder_client.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588
  1. from typing import Iterable, List, Optional, Union
  2. from urllib.parse import urlparse
  3. import subprocess
  4. import websocket
  5. import threading
  6. import platform
  7. import logging
  8. import pyaudio
  9. import socket
  10. import struct
  11. import signal
  12. import json
  13. import time
  14. import sys
  15. import os
  16. DEFAULT_CONTROL_URL = "ws://127.0.0.1:8011"
  17. DEFAULT_DATA_URL = "ws://127.0.0.1:8012"
  18. INIT_MODEL_TRANSCRIPTION = "tiny"
  19. INIT_MODEL_TRANSCRIPTION_REALTIME = "tiny"
  20. INIT_REALTIME_PROCESSING_PAUSE = 0.2
  21. INIT_SILERO_SENSITIVITY = 0.4
  22. INIT_WEBRTC_SENSITIVITY = 3
  23. INIT_POST_SPEECH_SILENCE_DURATION = 0.6
  24. INIT_MIN_LENGTH_OF_RECORDING = 0.5
  25. INIT_MIN_GAP_BETWEEN_RECORDINGS = 0
  26. INIT_WAKE_WORDS_SENSITIVITY = 0.6
  27. INIT_PRE_RECORDING_BUFFER_DURATION = 1.0
  28. INIT_WAKE_WORD_ACTIVATION_DELAY = 0.0
  29. INIT_WAKE_WORD_TIMEOUT = 5.0
  30. INIT_WAKE_WORD_BUFFER_DURATION = 0.1
  31. ALLOWED_LATENCY_LIMIT = 100
  32. CHUNK = 1024
  33. FORMAT = pyaudio.paInt16
  34. CHANNELS = 1
  35. SAMPLE_RATE = 16000
  36. BUFFER_SIZE = 512
  37. INIT_HANDLE_BUFFER_OVERFLOW = False
  38. if platform.system() != 'Darwin':
  39. INIT_HANDLE_BUFFER_OVERFLOW = True
  40. class AudioToTextRecorderClient:
  41. """
  42. A class responsible for capturing audio from the microphone, detecting
  43. voice activity, and then transcribing the captured audio using the
  44. `faster_whisper` model.
  45. """
  46. def __init__(self,
  47. model: str = INIT_MODEL_TRANSCRIPTION,
  48. language: str = "",
  49. compute_type: str = "default",
  50. input_device_index: int = None,
  51. gpu_device_index: Union[int, List[int]] = 0,
  52. device: str = "cuda",
  53. on_recording_start=None,
  54. on_recording_stop=None,
  55. on_transcription_start=None,
  56. ensure_sentence_starting_uppercase=True,
  57. ensure_sentence_ends_with_period=True,
  58. use_microphone=True,
  59. spinner=True,
  60. level=logging.WARNING,
  61. # Realtime transcription parameters
  62. enable_realtime_transcription=False,
  63. use_main_model_for_realtime=False,
  64. realtime_model_type=INIT_MODEL_TRANSCRIPTION_REALTIME,
  65. realtime_processing_pause=INIT_REALTIME_PROCESSING_PAUSE,
  66. on_realtime_transcription_update=None,
  67. on_realtime_transcription_stabilized=None,
  68. # Voice activation parameters
  69. silero_sensitivity: float = INIT_SILERO_SENSITIVITY,
  70. silero_use_onnx: bool = False,
  71. silero_deactivity_detection: bool = False,
  72. webrtc_sensitivity: int = INIT_WEBRTC_SENSITIVITY,
  73. post_speech_silence_duration: float = (
  74. INIT_POST_SPEECH_SILENCE_DURATION
  75. ),
  76. min_length_of_recording: float = (
  77. INIT_MIN_LENGTH_OF_RECORDING
  78. ),
  79. min_gap_between_recordings: float = (
  80. INIT_MIN_GAP_BETWEEN_RECORDINGS
  81. ),
  82. pre_recording_buffer_duration: float = (
  83. INIT_PRE_RECORDING_BUFFER_DURATION
  84. ),
  85. on_vad_detect_start=None,
  86. on_vad_detect_stop=None,
  87. # Wake word parameters
  88. wakeword_backend: str = "pvporcupine",
  89. openwakeword_model_paths: str = None,
  90. openwakeword_inference_framework: str = "onnx",
  91. wake_words: str = "",
  92. wake_words_sensitivity: float = INIT_WAKE_WORDS_SENSITIVITY,
  93. wake_word_activation_delay: float = (
  94. INIT_WAKE_WORD_ACTIVATION_DELAY
  95. ),
  96. wake_word_timeout: float = INIT_WAKE_WORD_TIMEOUT,
  97. wake_word_buffer_duration: float = INIT_WAKE_WORD_BUFFER_DURATION,
  98. on_wakeword_detected=None,
  99. on_wakeword_timeout=None,
  100. on_wakeword_detection_start=None,
  101. on_wakeword_detection_end=None,
  102. on_recorded_chunk=None,
  103. debug_mode=False,
  104. handle_buffer_overflow: bool = INIT_HANDLE_BUFFER_OVERFLOW,
  105. beam_size: int = 5,
  106. beam_size_realtime: int = 3,
  107. buffer_size: int = BUFFER_SIZE,
  108. sample_rate: int = SAMPLE_RATE,
  109. initial_prompt: Optional[Union[str, Iterable[int]]] = None,
  110. suppress_tokens: Optional[List[int]] = [-1],
  111. print_transcription_time: bool = False,
  112. early_transcription_on_silence: int = 0,
  113. allowed_latency_limit: int = ALLOWED_LATENCY_LIMIT,
  114. no_log_file: bool = False,
  115. use_extended_logging: bool = False,
  116. # Server urls
  117. control_url: str = DEFAULT_CONTROL_URL,
  118. data_url: str = DEFAULT_DATA_URL,
  119. autostart_server: bool = True,
  120. ):
  121. # Set instance variables from constructor parameters
  122. self.model = model
  123. self.language = language
  124. self.compute_type = compute_type
  125. self.input_device_index = input_device_index
  126. self.gpu_device_index = gpu_device_index
  127. self.device = device
  128. self.on_recording_start = on_recording_start
  129. self.on_recording_stop = on_recording_stop
  130. self.on_transcription_start = on_transcription_start
  131. self.ensure_sentence_starting_uppercase = ensure_sentence_starting_uppercase
  132. self.ensure_sentence_ends_with_period = ensure_sentence_ends_with_period
  133. self.use_microphone = use_microphone
  134. self.spinner = spinner
  135. self.level = level
  136. # Real-time transcription parameters
  137. self.enable_realtime_transcription = enable_realtime_transcription
  138. self.use_main_model_for_realtime = use_main_model_for_realtime
  139. self.realtime_model_type = realtime_model_type
  140. self.realtime_processing_pause = realtime_processing_pause
  141. self.on_realtime_transcription_update = on_realtime_transcription_update
  142. self.on_realtime_transcription_stabilized = on_realtime_transcription_stabilized
  143. # Voice activation parameters
  144. self.silero_sensitivity = silero_sensitivity
  145. self.silero_use_onnx = silero_use_onnx
  146. self.silero_deactivity_detection = silero_deactivity_detection
  147. self.webrtc_sensitivity = webrtc_sensitivity
  148. self.post_speech_silence_duration = post_speech_silence_duration
  149. self.min_length_of_recording = min_length_of_recording
  150. self.min_gap_between_recordings = min_gap_between_recordings
  151. self.pre_recording_buffer_duration = pre_recording_buffer_duration
  152. self.on_vad_detect_start = on_vad_detect_start
  153. self.on_vad_detect_stop = on_vad_detect_stop
  154. # Wake word parameters
  155. self.wakeword_backend = wakeword_backend
  156. self.openwakeword_model_paths = openwakeword_model_paths
  157. self.openwakeword_inference_framework = openwakeword_inference_framework
  158. self.wake_words = wake_words
  159. self.wake_words_sensitivity = wake_words_sensitivity
  160. self.wake_word_activation_delay = wake_word_activation_delay
  161. self.wake_word_timeout = wake_word_timeout
  162. self.wake_word_buffer_duration = wake_word_buffer_duration
  163. self.on_wakeword_detected = on_wakeword_detected
  164. self.on_wakeword_timeout = on_wakeword_timeout
  165. self.on_wakeword_detection_start = on_wakeword_detection_start
  166. self.on_wakeword_detection_end = on_wakeword_detection_end
  167. self.on_recorded_chunk = on_recorded_chunk
  168. self.debug_mode = debug_mode
  169. self.handle_buffer_overflow = handle_buffer_overflow
  170. self.beam_size = beam_size
  171. self.beam_size_realtime = beam_size_realtime
  172. self.buffer_size = buffer_size
  173. self.sample_rate = sample_rate
  174. self.initial_prompt = initial_prompt
  175. self.suppress_tokens = suppress_tokens
  176. self.print_transcription_time = print_transcription_time
  177. self.early_transcription_on_silence = early_transcription_on_silence
  178. self.allowed_latency_limit = allowed_latency_limit
  179. self.no_log_file = no_log_file
  180. self.use_extended_logging = use_extended_logging
  181. # Server URLs
  182. self.control_url = control_url
  183. self.data_url = data_url
  184. self.autostart_server = autostart_server
  185. # Instance variables
  186. self.is_running = True
  187. self.connection_established = threading.Event()
  188. self.recording_start = threading.Event()
  189. self.final_text_ready = threading.Event()
  190. self.realtime_text = ""
  191. self.final_text = ""
  192. if self.debug_mode:
  193. print("Checking STT server")
  194. if not self.connect():
  195. print("Failed to connect to the server.", file=sys.stderr)
  196. else:
  197. if self.debug_mode:
  198. print("STT server is running and connected.")
  199. self.start_recording()
  200. def text(self, on_transcription_finished=None):
  201. self.realtime_text = ""
  202. self.submitted_realtime_text = ""
  203. self.final_text = ""
  204. self.final_text_ready.clear()
  205. self.recording_start.set()
  206. try:
  207. total_wait_time = 0
  208. wait_interval = 0.02 # Wait in small intervals, e.g., 100ms
  209. max_wait_time = 60 # Timeout after 60 seconds
  210. while total_wait_time < max_wait_time:
  211. if self.final_text_ready.wait(timeout=wait_interval):
  212. break # Break if transcription is ready
  213. if not self.realtime_text == self.submitted_realtime_text:
  214. if self.on_realtime_transcription_update:
  215. self.on_realtime_transcription_update(self.realtime_text)
  216. self.submitted_realtime_text = self.realtime_text
  217. total_wait_time += wait_interval
  218. # Check if a manual interrupt has occurred
  219. if total_wait_time >= max_wait_time:
  220. if self.debug_mode:
  221. print("Timeout while waiting for text from the server.")
  222. self.recording_start.clear()
  223. if on_transcription_finished:
  224. threading.Thread(target=on_transcription_finished, args=("",)).start()
  225. return ""
  226. self.recording_start.clear()
  227. if on_transcription_finished:
  228. threading.Thread(target=on_transcription_finished, args=(self.final_text,)).start()
  229. return self.final_text
  230. except KeyboardInterrupt:
  231. if self.debug_mode:
  232. print("KeyboardInterrupt in record_and_send_audio, exiting...")
  233. raise KeyboardInterrupt
  234. def connect(self):
  235. if not self.ensure_server_running():
  236. print("Cannot start STT server. Exiting.")
  237. return False
  238. try:
  239. # Connect to control WebSocket
  240. self.control_ws = websocket.WebSocketApp(self.control_url,
  241. on_message=self.on_control_message,
  242. on_error=self.on_error,
  243. on_close=self.on_close,
  244. on_open=self.on_control_open)
  245. self.control_ws_thread = threading.Thread(target=self.control_ws.run_forever)
  246. self.control_ws_thread.daemon = False
  247. self.control_ws_thread.start()
  248. # Connect to data WebSocket
  249. self.data_ws = websocket.WebSocketApp(self.data_url,
  250. on_message=self.on_data_message,
  251. on_error=self.on_error,
  252. on_close=self.on_close,
  253. on_open=self.on_data_open)
  254. self.data_ws_thread = threading.Thread(target=self.data_ws.run_forever)
  255. self.data_ws_thread.daemon = False
  256. self.data_ws_thread.start()
  257. # Wait for the connections to be established
  258. if not self.connection_established.wait(timeout=10):
  259. print("Timeout while connecting to the server.")
  260. return False
  261. if self.debug_mode:
  262. print("WebSocket connections established successfully.")
  263. return True
  264. except Exception as e:
  265. print(f"Error while connecting to the server: {e}")
  266. return False
  267. def start_server(self):
  268. args = ['stt-server']
  269. # Map constructor parameters to server arguments
  270. if self.model:
  271. args += ['--model', self.model]
  272. if self.realtime_model_type:
  273. args += ['--realtime_model_type', self.realtime_model_type]
  274. if self.language:
  275. args += ['--language', self.language]
  276. if self.silero_sensitivity is not None:
  277. args += ['--silero_sensitivity', str(self.silero_sensitivity)]
  278. if self.webrtc_sensitivity is not None:
  279. args += ['--webrtc_sensitivity', str(self.webrtc_sensitivity)]
  280. if self.min_length_of_recording is not None:
  281. args += ['--min_length_of_recording', str(self.min_length_of_recording)]
  282. if self.min_gap_between_recordings is not None:
  283. args += ['--min_gap_between_recordings', str(self.min_gap_between_recordings)]
  284. if self.realtime_processing_pause is not None:
  285. args += ['--realtime_processing_pause', str(self.realtime_processing_pause)]
  286. if self.early_transcription_on_silence is not None:
  287. args += ['--early_transcription_on_silence', str(self.early_transcription_on_silence)]
  288. if self.beam_size is not None:
  289. args += ['--beam_size', str(self.beam_size)]
  290. if self.beam_size_realtime is not None:
  291. args += ['--beam_size_realtime', str(self.beam_size_realtime)]
  292. if self.initial_prompt:
  293. args += ['--initial_prompt', self.initial_prompt]
  294. if self.control_url:
  295. parsed_control_url = urlparse(self.control_url)
  296. if parsed_control_url.port:
  297. args += ['--control_port', str(parsed_control_url.port)]
  298. if self.data_url:
  299. parsed_data_url = urlparse(self.data_url)
  300. if parsed_data_url.port:
  301. args += ['--data_port', str(parsed_data_url.port)]
  302. # Start the subprocess with the mapped arguments
  303. if os.name == 'nt': # Windows
  304. cmd = 'start /min cmd /c ' + subprocess.list2cmdline(args)
  305. subprocess.Popen(cmd, shell=True)
  306. else: # Unix-like systems
  307. subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, start_new_session=True)
  308. print("STT server start command issued. Please wait a moment for it to initialize.", file=sys.stderr)
  309. def is_server_running(self):
  310. parsed_url = urlparse(self.control_url)
  311. host = parsed_url.hostname
  312. port = parsed_url.port or 80
  313. with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
  314. return s.connect_ex((host, port)) == 0
  315. def ensure_server_running(self):
  316. if not self.is_server_running():
  317. if self.debug_mode:
  318. print("STT server is not running.", file=sys.stderr)
  319. if self.autostart_server or self.ask_to_start_server():
  320. self.start_server()
  321. if self.debug_mode:
  322. print("Waiting for STT server to start...", file=sys.stderr)
  323. for _ in range(20): # Wait up to 20 seconds
  324. if self.is_server_running():
  325. if self.debug_mode:
  326. print("STT server started successfully.", file=sys.stderr)
  327. time.sleep(2) # Give the server a moment to fully initialize
  328. return True
  329. time.sleep(1)
  330. print("Failed to start STT server.", file=sys.stderr)
  331. return False
  332. else:
  333. print("STT server is required. Please start it manually.", file=sys.stderr)
  334. return False
  335. return True
  336. def start_recording(self):
  337. self.recording_thread = threading.Thread(target=self.record_and_send_audio)
  338. self.recording_thread.daemon = False
  339. self.recording_thread.start()
  340. def setup_audio(self):
  341. try:
  342. self.audio_interface = pyaudio.PyAudio()
  343. self.input_device_index = None
  344. try:
  345. default_device = self.audio_interface.get_default_input_device_info()
  346. self.input_device_index = default_device['index']
  347. except OSError as e:
  348. print(f"No default input device found: {e}")
  349. return False
  350. self.device_sample_rate = 16000 # Try 16000 Hz first
  351. try:
  352. self.stream = self.audio_interface.open(
  353. format=FORMAT,
  354. channels=CHANNELS,
  355. rate=self.device_sample_rate,
  356. input=True,
  357. frames_per_buffer=CHUNK,
  358. input_device_index=self.input_device_index,
  359. )
  360. if self.debug_mode:
  361. print(f"Audio recording initialized successfully at {self.device_sample_rate} Hz")
  362. return True
  363. except Exception as e:
  364. print(f"Failed to initialize audio stream at {self.device_sample_rate} Hz: {e}")
  365. return False
  366. except Exception as e:
  367. print(f"Error initializing audio recording: {e}")
  368. if self.audio_interface:
  369. self.audio_interface.terminate()
  370. return False
  371. def record_and_send_audio(self):
  372. try:
  373. if not self.setup_audio():
  374. raise Exception("Failed to set up audio recording.")
  375. if self.debug_mode:
  376. print("Recording and sending audio...")
  377. while self.is_running:
  378. try:
  379. audio_data = self.stream.read(CHUNK)
  380. if self.recording_start.is_set():
  381. metadata = {"sampleRate": self.device_sample_rate}
  382. metadata_json = json.dumps(metadata)
  383. metadata_length = len(metadata_json)
  384. message = struct.pack('<I', metadata_length) + metadata_json.encode('utf-8') + audio_data
  385. if self.is_running:
  386. self.data_ws.send(message, opcode=websocket.ABNF.OPCODE_BINARY)
  387. except KeyboardInterrupt: # handle manual interruption (Ctrl+C)
  388. if self.debug_mode:
  389. print("KeyboardInterrupt in record_and_send_audio, exiting...")
  390. break
  391. except Exception as e:
  392. print(f"Error sending audio data: {e}")
  393. break # Exit the recording loop
  394. except Exception as e:
  395. print(f"Error in record_and_send_audio: {e}")
  396. finally:
  397. self.cleanup_audio()
  398. def cleanup_audio(self):
  399. try:
  400. if self.stream:
  401. self.stream.stop_stream()
  402. self.stream.close()
  403. self.stream = None
  404. if self.audio_interface:
  405. self.audio_interface.terminate()
  406. self.audio_interface = None
  407. except Exception as e:
  408. print(f"Error cleaning up audio resources: {e}")
  409. def on_control_message(self, ws, message):
  410. try:
  411. data = json.loads(message)
  412. # Handle server response with status
  413. if 'status' in data:
  414. if data['status'] == 'success':
  415. if 'parameter' in data and 'value' in data:
  416. if self.debug_mode:
  417. print(f"Parameter {data['parameter']} = {data['value']}")
  418. elif data['status'] == 'error':
  419. print(f"Server Error: {data.get('message', '')}")
  420. else:
  421. print(f"Unknown control message format: {data}")
  422. except json.JSONDecodeError:
  423. print(f"Received non-JSON control message: {message}")
  424. except Exception as e:
  425. print(f"Error processing control message: {e}")
  426. # Handle real-time transcription and full sentence updates
  427. def on_data_message(self, ws, message):
  428. try:
  429. data = json.loads(message)
  430. # Handle real-time transcription updates
  431. if data.get('type') == 'realtime':
  432. if data['text'] != self.realtime_text:
  433. self.realtime_text = data['text']
  434. # Handle full sentences
  435. elif data.get('type') == 'fullSentence':
  436. self.final_text = data['text']
  437. self.final_text_ready.set()
  438. elif data.get('type') == 'recording_start':
  439. if self.on_recording_start:
  440. self.on_recording_start()
  441. elif data.get('type') == 'recording_stop':
  442. if self.on_recording_stop:
  443. self.on_recording_stop()
  444. elif data.get('type') == 'transcription_start':
  445. if self.on_transcription_start:
  446. self.on_transcription_start()
  447. elif data.get('type') == 'vad_detect_start':
  448. if self.on_vad_detect_start:
  449. self.on_vad_detect_start()
  450. else:
  451. print(f"Unknown data message format: {data}")
  452. except json.JSONDecodeError:
  453. print(f"Received non-JSON data message: {message}")
  454. except Exception as e:
  455. print(f"Error processing data message: {e}")
  456. def on_error(self, ws, error):
  457. print(f"WebSocket error: {error}")
  458. def on_close(self, ws, close_status_code, close_msg):
  459. if self.debug_mode:
  460. if ws == self.data_ws:
  461. print(f"Data WebSocket connection closed: {close_status_code} - {close_msg}")
  462. elif ws == self.control_ws:
  463. print(f"Control WebSocket connection closed: {close_status_code} - {close_msg}")
  464. self.is_running = False
  465. def on_control_open(self, ws):
  466. if self.debug_mode:
  467. print("Control WebSocket connection opened.")
  468. self.connection_established.set()
  469. def on_data_open(self, ws):
  470. if self.debug_mode:
  471. print("Data WebSocket connection opened.")
  472. def shutdown(self):
  473. self.is_running = False
  474. #self.stop_event.set()
  475. if self.control_ws:
  476. self.control_ws.close()
  477. if self.data_ws:
  478. self.data_ws.close()
  479. # Join threads to ensure they finish before exiting
  480. if self.control_ws_thread:
  481. self.control_ws_thread.join()
  482. if self.data_ws_thread:
  483. self.data_ws_thread.join()
  484. if self.recording_thread:
  485. self.recording_thread.join()
  486. # Clean up audio resources
  487. if self.stream:
  488. self.stream.stop_stream()
  489. self.stream.close()
  490. if self.audio_interface:
  491. self.audio_interface.terminate()
  492. def __enter__(self):
  493. """
  494. Method to setup the context manager protocol.
  495. This enables the instance to be used in a `with` statement, ensuring
  496. proper resource management. When the `with` block is entered, this
  497. method is automatically called.
  498. Returns:
  499. self: The current instance of the class.
  500. """
  501. return self
  502. def __exit__(self, exc_type, exc_value, traceback):
  503. """
  504. Method to define behavior when the context manager protocol exits.
  505. This is called when exiting the `with` block and ensures that any
  506. necessary cleanup or resource release processes are executed, such as
  507. shutting down the system properly.
  508. Args:
  509. exc_type (Exception or None): The type of the exception that
  510. caused the context to be exited, if any.
  511. exc_value (Exception or None): The exception instance that caused
  512. the context to be exited, if any.
  513. traceback (Traceback or None): The traceback corresponding to the
  514. exception, if any.
  515. """
  516. self.shutdown()