# audio_recorder_client.py
# When True, record_and_send_audio() prints a "." for each audio chunk it sends.
log_outgoing_chunks = False
# Module-level debug flag (client instances also carry their own `debug_mode` attribute).
debug_mode = False
from datetime import datetime
from typing import Iterable, List, Optional, Union
from urllib.parse import urlparse
import json
import logging
import os
import platform
import signal
import socket
import struct
import subprocess
import sys
import threading
import time

import pyaudio
import websocket
  19. DEFAULT_CONTROL_URL = "ws://127.0.0.1:8011"
  20. DEFAULT_DATA_URL = "ws://127.0.0.1:8012"
  21. INIT_MODEL_TRANSCRIPTION = "tiny"
  22. INIT_MODEL_TRANSCRIPTION_REALTIME = "tiny"
  23. INIT_REALTIME_PROCESSING_PAUSE = 0.2
  24. INIT_SILERO_SENSITIVITY = 0.4
  25. INIT_WEBRTC_SENSITIVITY = 3
  26. INIT_POST_SPEECH_SILENCE_DURATION = 0.6
  27. INIT_MIN_LENGTH_OF_RECORDING = 0.5
  28. INIT_MIN_GAP_BETWEEN_RECORDINGS = 0
  29. INIT_WAKE_WORDS_SENSITIVITY = 0.6
  30. INIT_PRE_RECORDING_BUFFER_DURATION = 1.0
  31. INIT_WAKE_WORD_ACTIVATION_DELAY = 0.0
  32. INIT_WAKE_WORD_TIMEOUT = 5.0
  33. INIT_WAKE_WORD_BUFFER_DURATION = 0.1
  34. ALLOWED_LATENCY_LIMIT = 100
  35. CHUNK = 1024
  36. FORMAT = pyaudio.paInt16
  37. CHANNELS = 1
  38. SAMPLE_RATE = 16000
  39. BUFFER_SIZE = 512
  40. INIT_HANDLE_BUFFER_OVERFLOW = False
  41. if platform.system() != 'Darwin':
  42. INIT_HANDLE_BUFFER_OVERFLOW = True
# Define ANSI color codes for terminal output
class bcolors:
    """ANSI escape sequences used to colorize or style terminal output."""
    HEADER = '\033[95m' # Magenta
    OKBLUE = '\033[94m' # Blue
    OKCYAN = '\033[96m' # Cyan
    OKGREEN = '\033[92m' # Green
    WARNING = '\033[93m' # Yellow
    FAIL = '\033[91m' # Red
    ENDC = '\033[0m' # Reset to default
    BOLD = '\033[1m' # Bold text
    UNDERLINE = '\033[4m' # Underlined text
class AudioToTextRecorderClient:
    """
    Client for a speech-to-text server reached over two WebSockets.

    The client connects to a control endpoint and a data endpoint, can
    launch the `stt-server` command when no server is reachable, streams
    microphone audio (or audio passed to `feed_audio`) to the data
    endpoint, and receives realtime and final transcription text back.
    """
  60. def __init__(self,
  61. model: str = INIT_MODEL_TRANSCRIPTION,
  62. language: str = "",
  63. compute_type: str = "default",
  64. input_device_index: int = None,
  65. gpu_device_index: Union[int, List[int]] = 0,
  66. device: str = "cuda",
  67. on_recording_start=None,
  68. on_recording_stop=None,
  69. on_transcription_start=None,
  70. ensure_sentence_starting_uppercase=True,
  71. ensure_sentence_ends_with_period=True,
  72. use_microphone=True,
  73. spinner=True,
  74. level=logging.WARNING,
  75. # Realtime transcription parameters
  76. enable_realtime_transcription=False,
  77. use_main_model_for_realtime=False,
  78. realtime_model_type=INIT_MODEL_TRANSCRIPTION_REALTIME,
  79. realtime_processing_pause=INIT_REALTIME_PROCESSING_PAUSE,
  80. on_realtime_transcription_update=None,
  81. on_realtime_transcription_stabilized=None,
  82. # Voice activation parameters
  83. silero_sensitivity: float = INIT_SILERO_SENSITIVITY,
  84. silero_use_onnx: bool = False,
  85. silero_deactivity_detection: bool = False,
  86. webrtc_sensitivity: int = INIT_WEBRTC_SENSITIVITY,
  87. post_speech_silence_duration: float = (
  88. INIT_POST_SPEECH_SILENCE_DURATION
  89. ),
  90. min_length_of_recording: float = (
  91. INIT_MIN_LENGTH_OF_RECORDING
  92. ),
  93. min_gap_between_recordings: float = (
  94. INIT_MIN_GAP_BETWEEN_RECORDINGS
  95. ),
  96. pre_recording_buffer_duration: float = (
  97. INIT_PRE_RECORDING_BUFFER_DURATION
  98. ),
  99. on_vad_detect_start=None,
  100. on_vad_detect_stop=None,
  101. # Wake word parameters
  102. wakeword_backend: str = "pvporcupine",
  103. openwakeword_model_paths: str = None,
  104. openwakeword_inference_framework: str = "onnx",
  105. wake_words: str = "",
  106. wake_words_sensitivity: float = INIT_WAKE_WORDS_SENSITIVITY,
  107. wake_word_activation_delay: float = (
  108. INIT_WAKE_WORD_ACTIVATION_DELAY
  109. ),
  110. wake_word_timeout: float = INIT_WAKE_WORD_TIMEOUT,
  111. wake_word_buffer_duration: float = INIT_WAKE_WORD_BUFFER_DURATION,
  112. on_wakeword_detected=None,
  113. on_wakeword_timeout=None,
  114. on_wakeword_detection_start=None,
  115. on_wakeword_detection_end=None,
  116. on_recorded_chunk=None,
  117. debug_mode=False,
  118. handle_buffer_overflow: bool = INIT_HANDLE_BUFFER_OVERFLOW,
  119. beam_size: int = 5,
  120. beam_size_realtime: int = 3,
  121. buffer_size: int = BUFFER_SIZE,
  122. sample_rate: int = SAMPLE_RATE,
  123. initial_prompt: Optional[Union[str, Iterable[int]]] = None,
  124. suppress_tokens: Optional[List[int]] = [-1],
  125. print_transcription_time: bool = False,
  126. early_transcription_on_silence: int = 0,
  127. allowed_latency_limit: int = ALLOWED_LATENCY_LIMIT,
  128. no_log_file: bool = False,
  129. use_extended_logging: bool = False,
  130. # Server urls
  131. control_url: str = DEFAULT_CONTROL_URL,
  132. data_url: str = DEFAULT_DATA_URL,
  133. autostart_server: bool = True,
  134. ):
  135. # Set instance variables from constructor parameters
  136. self.model = model
  137. self.language = language
  138. self.compute_type = compute_type
  139. self.input_device_index = input_device_index
  140. self.gpu_device_index = gpu_device_index
  141. self.device = device
  142. self.on_recording_start = on_recording_start
  143. self.on_recording_stop = on_recording_stop
  144. self.on_transcription_start = on_transcription_start
  145. self.ensure_sentence_starting_uppercase = ensure_sentence_starting_uppercase
  146. self.ensure_sentence_ends_with_period = ensure_sentence_ends_with_period
  147. self.use_microphone = use_microphone
  148. self.spinner = spinner
  149. self.level = level
  150. # Real-time transcription parameters
  151. self.enable_realtime_transcription = enable_realtime_transcription
  152. self.use_main_model_for_realtime = use_main_model_for_realtime
  153. self.realtime_model_type = realtime_model_type
  154. self.realtime_processing_pause = realtime_processing_pause
  155. self.on_realtime_transcription_update = on_realtime_transcription_update
  156. self.on_realtime_transcription_stabilized = on_realtime_transcription_stabilized
  157. # Voice activation parameters
  158. self.silero_sensitivity = silero_sensitivity
  159. self.silero_use_onnx = silero_use_onnx
  160. self.silero_deactivity_detection = silero_deactivity_detection
  161. self.webrtc_sensitivity = webrtc_sensitivity
  162. self.post_speech_silence_duration = post_speech_silence_duration
  163. self.min_length_of_recording = min_length_of_recording
  164. self.min_gap_between_recordings = min_gap_between_recordings
  165. self.pre_recording_buffer_duration = pre_recording_buffer_duration
  166. self.on_vad_detect_start = on_vad_detect_start
  167. self.on_vad_detect_stop = on_vad_detect_stop
  168. # Wake word parameters
  169. self.wakeword_backend = wakeword_backend
  170. self.openwakeword_model_paths = openwakeword_model_paths
  171. self.openwakeword_inference_framework = openwakeword_inference_framework
  172. self.wake_words = wake_words
  173. self.wake_words_sensitivity = wake_words_sensitivity
  174. self.wake_word_activation_delay = wake_word_activation_delay
  175. self.wake_word_timeout = wake_word_timeout
  176. self.wake_word_buffer_duration = wake_word_buffer_duration
  177. self.on_wakeword_detected = on_wakeword_detected
  178. self.on_wakeword_timeout = on_wakeword_timeout
  179. self.on_wakeword_detection_start = on_wakeword_detection_start
  180. self.on_wakeword_detection_end = on_wakeword_detection_end
  181. self.on_recorded_chunk = on_recorded_chunk
  182. self.debug_mode = debug_mode
  183. self.handle_buffer_overflow = handle_buffer_overflow
  184. self.beam_size = beam_size
  185. self.beam_size_realtime = beam_size_realtime
  186. self.buffer_size = buffer_size
  187. self.sample_rate = sample_rate
  188. self.initial_prompt = initial_prompt
  189. self.suppress_tokens = suppress_tokens
  190. self.print_transcription_time = print_transcription_time
  191. self.early_transcription_on_silence = early_transcription_on_silence
  192. self.allowed_latency_limit = allowed_latency_limit
  193. self.no_log_file = no_log_file
  194. self.use_extended_logging = use_extended_logging
  195. # Server URLs
  196. self.control_url = control_url
  197. self.data_url = data_url
  198. self.autostart_server = autostart_server
  199. # Instance variables
  200. self.muted = False
  201. self.recording_thread = None
  202. self.is_running = True
  203. self.connection_established = threading.Event()
  204. self.recording_start = threading.Event()
  205. self.final_text_ready = threading.Event()
  206. self.realtime_text = ""
  207. self.final_text = ""
  208. self.request_counter = 0
  209. self.pending_requests = {} # Map from request_id to threading.Event and value
  210. if self.debug_mode:
  211. print("Checking STT server")
  212. if not self.connect():
  213. print("Failed to connect to the server.", file=sys.stderr)
  214. else:
  215. if self.debug_mode:
  216. print("STT server is running and connected.")
  217. if self.use_microphone:
  218. self.start_recording()
  219. def text(self, on_transcription_finished=None):
  220. self.realtime_text = ""
  221. self.submitted_realtime_text = ""
  222. self.final_text = ""
  223. self.final_text_ready.clear()
  224. self.recording_start.set()
  225. try:
  226. total_wait_time = 0
  227. wait_interval = 0.02 # Wait in small intervals, e.g., 100ms
  228. max_wait_time = 60 # Timeout after 60 seconds
  229. while total_wait_time < max_wait_time:
  230. if self.final_text_ready.wait(timeout=wait_interval):
  231. break # Break if transcription is ready
  232. # if not self.realtime_text == self.submitted_realtime_text:
  233. # if self.on_realtime_transcription_update:
  234. # self.on_realtime_transcription_update(self.realtime_text)
  235. # self.submitted_realtime_text = self.realtime_text
  236. total_wait_time += wait_interval
  237. # Check if a manual interrupt has occurred
  238. if total_wait_time >= max_wait_time:
  239. if self.debug_mode:
  240. print("Timeout while waiting for text from the server.")
  241. self.recording_start.clear()
  242. if on_transcription_finished:
  243. threading.Thread(target=on_transcription_finished, args=("",)).start()
  244. return ""
  245. self.recording_start.clear()
  246. if on_transcription_finished:
  247. threading.Thread(target=on_transcription_finished, args=(self.final_text,)).start()
  248. return self.final_text
  249. except KeyboardInterrupt:
  250. if self.debug_mode:
  251. print("KeyboardInterrupt in record_and_send_audio, exiting...")
  252. raise KeyboardInterrupt
  253. except Exception as e:
  254. print(f"Error in AudioToTextRecorderClient.text(): {e}")
  255. return ""
  256. def feed_audio(self, chunk, original_sample_rate=16000):
  257. metadata = {"sampleRate": original_sample_rate}
  258. metadata_json = json.dumps(metadata)
  259. metadata_length = len(metadata_json)
  260. message = struct.pack('<I', metadata_length) + metadata_json.encode('utf-8') + chunk
  261. if self.is_running:
  262. self.data_ws.send(message, opcode=websocket.ABNF.OPCODE_BINARY)
  263. def set_microphone(self, microphone_on=True):
  264. """
  265. Set the microphone on or off.
  266. """
  267. self.muted = not microphone_on
  268. def abort(self):
  269. self.call_method("abort")
  270. def wakeup(self):
  271. self.call_method("wakeup")
  272. def clear_audio_queue(self):
  273. self.call_method("clear_audio_queue")
  274. def stop(self):
  275. self.call_method("stop")
  276. def connect(self):
  277. if not self.ensure_server_running():
  278. print("Cannot start STT server. Exiting.")
  279. return False
  280. try:
  281. # Connect to control WebSocket
  282. self.control_ws = websocket.WebSocketApp(self.control_url,
  283. on_message=self.on_control_message,
  284. on_error=self.on_error,
  285. on_close=self.on_close,
  286. on_open=self.on_control_open)
  287. self.control_ws_thread = threading.Thread(target=self.control_ws.run_forever)
  288. self.control_ws_thread.daemon = False
  289. self.control_ws_thread.start()
  290. # Connect to data WebSocket
  291. self.data_ws = websocket.WebSocketApp(self.data_url,
  292. on_message=self.on_data_message,
  293. on_error=self.on_error,
  294. on_close=self.on_close,
  295. on_open=self.on_data_open)
  296. self.data_ws_thread = threading.Thread(target=self.data_ws.run_forever)
  297. self.data_ws_thread.daemon = False
  298. self.data_ws_thread.start()
  299. # Wait for the connections to be established
  300. if not self.connection_established.wait(timeout=10):
  301. print("Timeout while connecting to the server.")
  302. return False
  303. if self.debug_mode:
  304. print("WebSocket connections established successfully.")
  305. return True
  306. except Exception as e:
  307. print(f"Error while connecting to the server: {e}")
  308. return False
  309. def start_server(self):
  310. args = ['stt-server']
  311. # Map constructor parameters to server arguments
  312. if self.model:
  313. args += ['--model', self.model]
  314. if self.realtime_model_type:
  315. args += ['--realtime_model_type', self.realtime_model_type]
  316. if self.language:
  317. args += ['--language', self.language]
  318. if self.silero_sensitivity is not None:
  319. args += ['--silero_sensitivity', str(self.silero_sensitivity)]
  320. if self.silero_use_onnx:
  321. args.append('--silero_use_onnx') # flag, no need for True/False
  322. if self.webrtc_sensitivity is not None:
  323. args += ['--webrtc_sensitivity', str(self.webrtc_sensitivity)]
  324. if self.min_length_of_recording is not None:
  325. args += ['--min_length_of_recording', str(self.min_length_of_recording)]
  326. if self.min_gap_between_recordings is not None:
  327. args += ['--min_gap_between_recordings', str(self.min_gap_between_recordings)]
  328. if self.realtime_processing_pause is not None:
  329. args += ['--realtime_processing_pause', str(self.realtime_processing_pause)]
  330. if self.early_transcription_on_silence is not None:
  331. args += ['--early_transcription_on_silence', str(self.early_transcription_on_silence)]
  332. if self.silero_deactivity_detection:
  333. args.append('--silero_deactivity_detection') # flag, no need for True/False
  334. if self.beam_size is not None:
  335. args += ['--beam_size', str(self.beam_size)]
  336. if self.beam_size_realtime is not None:
  337. args += ['--beam_size_realtime', str(self.beam_size_realtime)]
  338. if self.wake_words is not None:
  339. args += ['--wake_words', str(self.wake_words)]
  340. if self.wake_words_sensitivity is not None:
  341. args += ['--wake_words_sensitivity', str(self.wake_words_sensitivity)]
  342. if self.wake_word_timeout is not None:
  343. args += ['--wake_word_timeout', str(self.wake_word_timeout)]
  344. if self.wake_word_activation_delay is not None:
  345. args += ['--wake_word_activation_delay', str(self.wake_word_activation_delay)]
  346. if self.wakeword_backend is not None:
  347. args += ['--wakeword_backend', str(self.wakeword_backend)]
  348. if self.openwakeword_model_paths:
  349. args += ['--openwakeword_model_paths', str(self.openwakeword_model_paths)]
  350. if self.openwakeword_inference_framework is not None:
  351. args += ['--openwakeword_inference_framework', str(self.openwakeword_inference_framework)]
  352. if self.wake_word_buffer_duration is not None:
  353. args += ['--wake_word_buffer_duration', str(self.wake_word_buffer_duration)]
  354. if self.use_main_model_for_realtime:
  355. args.append('--use_main_model_for_realtime') # flag, no need for True/False
  356. if self.use_extended_logging:
  357. args.append('--use_extended_logging') # flag, no need for True/False
  358. if self.control_url:
  359. parsed_control_url = urlparse(self.control_url)
  360. if parsed_control_url.port:
  361. args += ['--control_port', str(parsed_control_url.port)]
  362. if self.data_url:
  363. parsed_data_url = urlparse(self.data_url)
  364. if parsed_data_url.port:
  365. args += ['--data_port', str(parsed_data_url.port)]
  366. if self.initial_prompt:
  367. sanitized_prompt = self.initial_prompt.replace("\n", "\\n")
  368. args += ['--initial_prompt', sanitized_prompt]
  369. # Start the subprocess with the mapped arguments
  370. if os.name == 'nt': # Windows
  371. cmd = 'start /min cmd /c ' + subprocess.list2cmdline(args)
  372. if debug_mode:
  373. print(f"Opening server with cli command: {cmd}")
  374. subprocess.Popen(cmd, shell=True)
  375. else: # Unix-like systems
  376. subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, start_new_session=True)
  377. print("STT server start command issued. Please wait a moment for it to initialize.", file=sys.stderr)
  378. def is_server_running(self):
  379. parsed_url = urlparse(self.control_url)
  380. host = parsed_url.hostname
  381. port = parsed_url.port or 80
  382. with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
  383. return s.connect_ex((host, port)) == 0
  384. def ensure_server_running(self):
  385. if not self.is_server_running():
  386. if self.debug_mode:
  387. print("STT server is not running.", file=sys.stderr)
  388. if self.autostart_server or self.ask_to_start_server():
  389. self.start_server()
  390. if self.debug_mode:
  391. print("Waiting for STT server to start...", file=sys.stderr)
  392. for _ in range(20): # Wait up to 20 seconds
  393. if self.is_server_running():
  394. if self.debug_mode:
  395. print("STT server started successfully.", file=sys.stderr)
  396. time.sleep(2) # Give the server a moment to fully initialize
  397. return True
  398. time.sleep(1)
  399. print("Failed to start STT server.", file=sys.stderr)
  400. return False
  401. else:
  402. print("STT server is required. Please start it manually.", file=sys.stderr)
  403. return False
  404. return True
  405. def start_recording(self):
  406. self.recording_thread = threading.Thread(target=self.record_and_send_audio)
  407. self.recording_thread.daemon = False
  408. self.recording_thread.start()
  409. def setup_audio(self):
  410. try:
  411. self.audio_interface = pyaudio.PyAudio()
  412. self.input_device_index = None
  413. try:
  414. default_device = self.audio_interface.get_default_input_device_info()
  415. self.input_device_index = default_device['index']
  416. except OSError as e:
  417. print(f"No default input device found: {e}")
  418. return False
  419. self.device_sample_rate = 16000 # Try 16000 Hz first
  420. try:
  421. self.stream = self.audio_interface.open(
  422. format=FORMAT,
  423. channels=CHANNELS,
  424. rate=self.device_sample_rate,
  425. input=True,
  426. frames_per_buffer=CHUNK,
  427. input_device_index=self.input_device_index,
  428. )
  429. if self.debug_mode:
  430. print(f"Audio recording initialized successfully at {self.device_sample_rate} Hz")
  431. return True
  432. except Exception as e:
  433. print(f"Failed to initialize audio stream at {self.device_sample_rate} Hz: {e}")
  434. return False
  435. except Exception as e:
  436. print(f"Error initializing audio recording: {e}")
  437. if self.audio_interface:
  438. self.audio_interface.terminate()
  439. return False
  440. def record_and_send_audio(self):
  441. try:
  442. if not self.setup_audio():
  443. raise Exception("Failed to set up audio recording.")
  444. if self.debug_mode:
  445. print("Recording and sending audio...")
  446. while self.is_running:
  447. if self.muted:
  448. time.sleep(0.01)
  449. continue
  450. try:
  451. audio_data = self.stream.read(CHUNK)
  452. if self.on_recorded_chunk:
  453. self.on_recorded_chunk(audio_data)
  454. if self.muted:
  455. continue
  456. if self.recording_start.is_set():
  457. metadata = {"sampleRate": self.device_sample_rate}
  458. metadata_json = json.dumps(metadata)
  459. metadata_length = len(metadata_json)
  460. message = struct.pack('<I', metadata_length) + metadata_json.encode('utf-8') + audio_data
  461. if self.is_running:
  462. if log_outgoing_chunks:
  463. print(".", flush=True, end='')
  464. self.data_ws.send(message, opcode=websocket.ABNF.OPCODE_BINARY)
  465. except KeyboardInterrupt: # handle manual interruption (Ctrl+C)
  466. if self.debug_mode:
  467. print("KeyboardInterrupt in record_and_send_audio, exiting...")
  468. break
  469. except Exception as e:
  470. print(f"Error sending audio data: {e}")
  471. break # Exit the recording loop
  472. except Exception as e:
  473. print(f"Error in record_and_send_audio: {e}")
  474. finally:
  475. self.cleanup_audio()
  476. def cleanup_audio(self):
  477. try:
  478. if self.stream:
  479. self.stream.stop_stream()
  480. self.stream.close()
  481. self.stream = None
  482. if self.audio_interface:
  483. self.audio_interface.terminate()
  484. self.audio_interface = None
  485. except Exception as e:
  486. print(f"Error cleaning up audio resources: {e}")
  487. def on_control_message(self, ws, message):
  488. try:
  489. data = json.loads(message)
  490. # Handle server response with status
  491. if 'status' in data:
  492. if data['status'] == 'success':
  493. if 'parameter' in data and 'value' in data:
  494. request_id = data.get('request_id')
  495. if request_id is not None and request_id in self.pending_requests:
  496. if self.debug_mode:
  497. print(f"Parameter {data['parameter']} = {data['value']}")
  498. self.pending_requests[request_id]['value'] = data['value']
  499. self.pending_requests[request_id]['event'].set()
  500. elif data['status'] == 'error':
  501. print(f"Server Error: {data.get('message', '')}")
  502. else:
  503. print(f"Unknown control message format: {data}")
  504. except json.JSONDecodeError:
  505. print(f"Received non-JSON control message: {message}")
  506. except Exception as e:
  507. print(f"Error processing control message: {e}")
  508. # Handle real-time transcription and full sentence updates
  509. def on_data_message(self, ws, message):
  510. try:
  511. data = json.loads(message)
  512. # Handle real-time transcription updates
  513. if data.get('type') == 'realtime':
  514. if data['text'] != self.realtime_text:
  515. self.realtime_text = data['text']
  516. timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
  517. print(f"Realtime text [{timestamp}]: {bcolors.OKCYAN}{self.realtime_text}{bcolors.ENDC}")
  518. if self.on_realtime_transcription_update:
  519. # Call the callback in a new thread to avoid blocking
  520. threading.Thread(
  521. target=self.on_realtime_transcription_update,
  522. args=(self.realtime_text,)
  523. ).start()
  524. # Handle full sentences
  525. elif data.get('type') == 'fullSentence':
  526. self.final_text = data['text']
  527. self.final_text_ready.set()
  528. elif data.get('type') == 'recording_start':
  529. if self.on_recording_start:
  530. self.on_recording_start()
  531. elif data.get('type') == 'recording_stop':
  532. if self.on_recording_stop:
  533. self.on_recording_stop()
  534. elif data.get('type') == 'transcription_start':
  535. if self.on_transcription_start:
  536. self.on_transcription_start()
  537. elif data.get('type') == 'vad_detect_start':
  538. if self.on_vad_detect_start:
  539. self.on_vad_detect_start()
  540. elif data.get('type') == 'vad_detect_stop':
  541. if self.on_vad_detect_stop:
  542. self.on_vad_detect_stop()
  543. elif data.get('type') == 'wakeword_detected':
  544. if self.on_wakeword_detected:
  545. self.on_wakeword_detected()
  546. elif data.get('type') == 'wakeword_detection_start':
  547. if self.on_wakeword_detection_start:
  548. self.on_wakeword_detection_start()
  549. elif data.get('type') == 'wakeword_detection_end':
  550. if self.on_wakeword_detection_end:
  551. self.on_wakeword_detection_end()
  552. elif data.get('type') == 'recorded_chunk':
  553. pass
  554. else:
  555. print(f"Unknown data message format: {data}")
  556. except json.JSONDecodeError:
  557. print(f"Received non-JSON data message: {message}")
  558. except Exception as e:
  559. print(f"Error processing data message: {e}")
  560. def on_error(self, ws, error):
  561. print(f"WebSocket error: {error}")
  562. def on_close(self, ws, close_status_code, close_msg):
  563. if self.debug_mode:
  564. if ws == self.data_ws:
  565. print(f"Data WebSocket connection closed: {close_status_code} - {close_msg}")
  566. elif ws == self.control_ws:
  567. print(f"Control WebSocket connection closed: {close_status_code} - {close_msg}")
  568. self.is_running = False
  569. def on_control_open(self, ws):
  570. if self.debug_mode:
  571. print("Control WebSocket connection opened.")
  572. self.connection_established.set()
  573. def on_data_open(self, ws):
  574. if self.debug_mode:
  575. print("Data WebSocket connection opened.")
  576. def set_parameter(self, parameter, value):
  577. command = {
  578. "command": "set_parameter",
  579. "parameter": parameter,
  580. "value": value
  581. }
  582. self.control_ws.send(json.dumps(command))
  583. def get_parameter(self, parameter):
  584. # Generate a unique request_id
  585. request_id = self.request_counter
  586. self.request_counter += 1
  587. # Prepare the command with the request_id
  588. command = {
  589. "command": "get_parameter",
  590. "parameter": parameter,
  591. "request_id": request_id
  592. }
  593. # Create an event to wait for the response
  594. event = threading.Event()
  595. self.pending_requests[request_id] = {'event': event, 'value': None}
  596. # Send the command to the server
  597. self.control_ws.send(json.dumps(command))
  598. # Wait for the response or timeout after 5 seconds
  599. if event.wait(timeout=5):
  600. value = self.pending_requests[request_id]['value']
  601. # Clean up the pending request
  602. del self.pending_requests[request_id]
  603. return value
  604. else:
  605. print(f"Timeout waiting for get_parameter {parameter}")
  606. # Clean up the pending request
  607. del self.pending_requests[request_id]
  608. return None
  609. def call_method(self, method, args=None, kwargs=None):
  610. command = {
  611. "command": "call_method",
  612. "method": method,
  613. "args": args or [],
  614. "kwargs": kwargs or {}
  615. }
  616. self.control_ws.send(json.dumps(command))
  617. def shutdown(self):
  618. self.is_running = False
  619. #self.stop_event.set()
  620. if self.control_ws:
  621. self.control_ws.close()
  622. if self.data_ws:
  623. self.data_ws.close()
  624. # Join threads to ensure they finish before exiting
  625. if self.control_ws_thread:
  626. self.control_ws_thread.join()
  627. if self.data_ws_thread:
  628. self.data_ws_thread.join()
  629. if self.recording_thread:
  630. self.recording_thread.join()
  631. # Clean up audio resources
  632. if self.stream:
  633. self.stream.stop_stream()
  634. self.stream.close()
  635. if self.audio_interface:
  636. self.audio_interface.terminate()
  637. def __enter__(self):
  638. """
  639. Method to setup the context manager protocol.
  640. This enables the instance to be used in a `with` statement, ensuring
  641. proper resource management. When the `with` block is entered, this
  642. method is automatically called.
  643. Returns:
  644. self: The current instance of the class.
  645. """
  646. return self
  647. def __exit__(self, exc_type, exc_value, traceback):
  648. """
  649. Method to define behavior when the context manager protocol exits.
  650. This is called when exiting the `with` block and ensures that any
  651. necessary cleanup or resource release processes are executed, such as
  652. shutting down the system properly.
  653. Args:
  654. exc_type (Exception or None): The type of the exception that
  655. caused the context to be exited, if any.
  656. exc_value (Exception or None): The exception instance that caused
  657. the context to be exited, if any.
  658. traceback (Traceback or None): The traceback corresponding to the
  659. exception, if any.
  660. """
  661. self.shutdown()