stt_cli_client.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553
  1. """
  2. Speech-to-Text (STT) Client CLI for WebSocket Server Interaction
  3. This command-line interface (CLI) allows interaction with the Speech-to-Text (STT) WebSocket server. It connects to the server via control and data WebSocket URLs to facilitate real-time speech transcription, control the server, and manage various parameters related to the STT process.
  4. The client can be used to start recording audio, set or retrieve STT parameters, and interact with the server using commands. Additionally, the CLI can disable real-time updates or run in debug mode for detailed output.
  5. ### Features:
  6. - Connects to STT WebSocket server for real-time transcription and control.
  7. - Supports setting and retrieving parameters via the command line.
  8. - Allows calling server methods (e.g., start/stop recording).
  9. - Option to disable real-time updates during transcription.
  10. - Debug mode available for verbose logging.
  11. ### Starting the Client:
  12. You can start the client using the command `stt` and optionally pass configuration options or commands for interacting with the server.
  13. ```bash
  14. stt [OPTIONS]
  15. ```
  16. ### Available Parameters:
  17. - `--control-url` (default: "ws://localhost:8011"): The WebSocket URL for server control commands.
  18. - `--data-url` (default: "ws://localhost:8012"): The WebSocket URL for sending audio data and receiving transcription updates.
  19. - `--debug`: Enable debug mode, which prints detailed logs to stderr.
- `-nort` or `--norealtime`: Disable real-time output of transcription results.
  21. - `--set-param PARAM VALUE`: Set a recorder parameter (e.g., silero_sensitivity, beam_size, etc.). This option can be used multiple times to set different parameters.
  22. - `--get-param PARAM`: Retrieve the value of a specific recorder parameter. This option can be used multiple times to retrieve different parameters.
  23. - `--call-method METHOD [ARGS]`: Call a method on the recorder with optional arguments. This option can be used multiple times for different methods.
  24. ### Example Usage:
  25. 1. **Start the client with default settings:**
  26. ```bash
  27. stt
  28. ```
  29. 2. **Set a recorder parameter (e.g., set Silero sensitivity to 0.1):**
  30. ```bash
  31. stt --set-param silero_sensitivity 0.1
  32. ```
  33. 3. **Retrieve the value of a recorder parameter (e.g., get the current Silero sensitivity):**
  34. ```bash
  35. stt --get-param silero_sensitivity
  36. ```
  37. 4. **Call a method on the recorder (e.g., start the microphone input):**
  38. ```bash
  39. stt --call-method set_microphone
  40. ```
  41. 5. **Run in debug mode:**
  42. ```bash
  43. stt --debug
  44. ```
  45. ### WebSocket Interface:
  46. - **Control WebSocket**: Used for sending control commands like setting parameters or invoking methods.
  47. - **Data WebSocket**: Used for sending audio data for real-time transcription and receiving transcription results.
  48. The client can be used to send audio data to the server for transcription and to control the behavior of the server remotely.
  49. """
  50. import os
  51. import sys
  52. import pyaudio
  53. import numpy as np
  54. from scipy import signal
  55. import logging
  56. import websocket
  57. import argparse
  58. import json
  59. import threading
  60. import time
  61. import struct
  62. import socket
  63. import shutil
  64. from urllib.parse import urlparse
  65. import queue
  66. from queue import Queue
# NOTE(review): presumably meant to silence ALSA diagnostics printed while
# opening audio devices — TODO confirm the variable is honoured, and that
# setting it after `import pyaudio` is early enough.
os.environ['ALSA_LOG_LEVEL'] = 'none'
# Constants
# Frames read per stream.read() call; also passed as frames_per_buffer.
CHUNK = 1024
# Sample format and channel count handed to PyAudio when opening the stream.
FORMAT = pyaudio.paInt16
CHANNELS = 1
# NOTE(review): RATE is not referenced anywhere in the visible code; the
# input stream is opened at 16000 Hz instead.
RATE = 44100
# Defaults for the --control-url and --data-url command-line options.
DEFAULT_CONTROL_URL = "ws://localhost:8011"
DEFAULT_DATA_URL = "ws://localhost:8012"
# Initialize colorama
from colorama import init, Fore, Style
init()
# Stop websocket from spamming the log
websocket.enableTrace(False)
  80. class STTWebSocketClient:
  81. def __init__(self, control_url, data_url, debug=False, file_output=None, norealtime=False):
  82. self.control_url = control_url
  83. self.data_url = data_url
  84. self.control_ws = None
  85. self.data_ws = None
  86. self.is_running = False
  87. self.debug = debug
  88. self.file_output = file_output
  89. self.last_text = ""
  90. self.console_width = shutil.get_terminal_size().columns
  91. self.recording_indicator = "🔴"
  92. self.norealtime = norealtime
  93. self.connection_established = threading.Event()
  94. self.message_queue = Queue()
  95. self.commands = Queue()
  96. self.stop_event = threading.Event()
  97. # Audio attributes
  98. self.audio_interface = None
  99. self.stream = None
  100. self.device_sample_rate = None
  101. self.input_device_index = None
  102. # Threads
  103. self.control_ws_thread = None
  104. self.data_ws_thread = None
  105. self.recording_thread = None
  106. def debug_print(self, message):
  107. if self.debug:
  108. print(message, file=sys.stderr)
  109. def connect(self):
  110. if not self.ensure_server_running():
  111. self.debug_print("Cannot start STT server. Exiting.")
  112. return False
  113. try:
  114. # Connect to control WebSocket
  115. self.control_ws = websocket.WebSocketApp(self.control_url,
  116. on_message=self.on_control_message,
  117. on_error=self.on_error,
  118. on_close=self.on_close,
  119. on_open=self.on_control_open)
  120. self.control_ws_thread = threading.Thread(target=self.control_ws.run_forever)
  121. self.control_ws_thread.daemon = False # Set to False to ensure proper shutdown
  122. self.control_ws_thread.start()
  123. # Connect to data WebSocket
  124. self.data_ws = websocket.WebSocketApp(self.data_url,
  125. on_message=self.on_data_message,
  126. on_error=self.on_error,
  127. on_close=self.on_close,
  128. on_open=self.on_data_open)
  129. self.data_ws_thread = threading.Thread(target=self.data_ws.run_forever)
  130. self.data_ws_thread.daemon = False # Set to False to ensure proper shutdown
  131. self.data_ws_thread.start()
  132. # Wait for the connections to be established
  133. if not self.connection_established.wait(timeout=10):
  134. self.debug_print("Timeout while connecting to the server.")
  135. return False
  136. self.debug_print("WebSocket connections established successfully.")
  137. return True
  138. except Exception as e:
  139. self.debug_print(f"Error while connecting to the server: {e}")
  140. return False
  141. def on_control_open(self, ws):
  142. self.debug_print("Control WebSocket connection opened.")
  143. self.connection_established.set()
  144. self.start_command_processor()
  145. def on_data_open(self, ws):
  146. self.debug_print("Data WebSocket connection opened.")
  147. self.is_running = True
  148. self.start_recording()
  149. def on_error(self, ws, error):
  150. self.debug_print(f"WebSocket error: {error}")
  151. def on_close(self, ws, close_status_code, close_msg):
  152. self.debug_print(f"WebSocket connection closed: {close_status_code} - {close_msg}")
  153. self.is_running = False
  154. self.stop_event.set()
  155. def is_server_running(self):
  156. parsed_url = urlparse(self.control_url)
  157. host = parsed_url.hostname
  158. port = parsed_url.port or 80
  159. with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
  160. return s.connect_ex((host, port)) == 0
  161. def ask_to_start_server(self):
  162. response = input("Would you like to start the STT server now? (y/n): ").strip().lower()
  163. return response == 'y' or response == 'yes'
  164. def start_server(self):
  165. if os.name == 'nt': # Windows
  166. subprocess.Popen('start /min cmd /c stt-server', shell=True)
  167. else: # Unix-like systems
  168. subprocess.Popen(['stt-server'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, start_new_session=True)
  169. print("STT server start command issued. Please wait a moment for it to initialize.", file=sys.stderr)
  170. def ensure_server_running(self):
  171. if not self.is_server_running():
  172. print("STT server is not running.", file=sys.stderr)
  173. if self.ask_to_start_server():
  174. self.start_server()
  175. print("Waiting for STT server to start...", file=sys.stderr)
  176. for _ in range(20): # Wait up to 20 seconds
  177. if self.is_server_running():
  178. print("STT server started successfully.", file=sys.stderr)
  179. time.sleep(2) # Give the server a moment to fully initialize
  180. return True
  181. time.sleep(1)
  182. print("Failed to start STT server.", file=sys.stderr)
  183. return False
  184. else:
  185. print("STT server is required. Please start it manually.", file=sys.stderr)
  186. return False
  187. return True
  188. # Handle control messages like set_parameter, get_parameter, etc.
  189. def on_control_message(self, ws, message):
  190. try:
  191. data = json.loads(message)
  192. # Handle server response with status
  193. if 'status' in data:
  194. if data['status'] == 'success':
  195. # print(f"Server Response: {data.get('message', '')}")
  196. if 'parameter' in data and 'value' in data:
  197. print(f"Parameter {data['parameter']} = {data['value']}")
  198. elif data['status'] == 'error':
  199. print(f"Server Error: {data.get('message', '')}")
  200. else:
  201. self.debug_print(f"Unknown control message format: {data}")
  202. except json.JSONDecodeError:
  203. self.debug_print(f"Received non-JSON control message: {message}")
  204. except Exception as e:
  205. self.debug_print(f"Error processing control message: {e}")
  206. # Handle real-time transcription and full sentence updates
  207. def on_data_message(self, ws, message):
  208. try:
  209. data = json.loads(message)
  210. # Handle real-time transcription updates
  211. if data.get('type') == 'realtime':
  212. if data['text'] != self.last_text:
  213. self.last_text = data['text']
  214. if not self.norealtime:
  215. self.update_progress_bar(self.last_text)
  216. # Handle full sentences
  217. elif data.get('type') == 'fullSentence':
  218. if self.file_output:
  219. sys.stderr.write('\r\033[K')
  220. sys.stderr.write(data['text'])
  221. sys.stderr.write('\n')
  222. sys.stderr.flush()
  223. print(data['text'], file=self.file_output)
  224. self.file_output.flush() # Ensure it's written immediately
  225. else:
  226. self.finish_progress_bar()
  227. print(f"{data['text']}")
  228. self.stop()
  229. else:
  230. self.debug_print(f"Unknown data message format: {data}")
  231. except json.JSONDecodeError:
  232. self.debug_print(f"Received non-JSON data message: {message}")
  233. except Exception as e:
  234. self.debug_print(f"Error processing data message: {e}")
  235. def show_initial_indicator(self):
  236. if self.norealtime:
  237. return
  238. initial_text = f"{self.recording_indicator}\b\b"
  239. sys.stderr.write(initial_text)
  240. sys.stderr.flush()
  241. def update_progress_bar(self, text):
  242. try:
  243. available_width = self.console_width - 5 # Adjust for progress bar decorations
  244. sys.stderr.write('\r\033[K') # Clear the current line
  245. words = text.split()
  246. last_chars = ""
  247. for word in reversed(words):
  248. if len(last_chars) + len(word) + 1 > available_width:
  249. break
  250. last_chars = word + " " + last_chars
  251. last_chars = last_chars.strip()
  252. colored_text = f"{Fore.YELLOW}{last_chars}{Style.RESET_ALL}{self.recording_indicator}\b\b"
  253. sys.stderr.write(colored_text)
  254. sys.stderr.flush()
  255. except Exception as e:
  256. self.debug_print(f"Error updating progress bar: {e}")
  257. def finish_progress_bar(self):
  258. try:
  259. sys.stderr.write('\r\033[K')
  260. sys.stderr.flush()
  261. except Exception as e:
  262. self.debug_print(f"Error finishing progress bar: {e}")
  263. def stop(self):
  264. self.finish_progress_bar()
  265. self.is_running = False
  266. self.stop_event.set()
  267. if self.control_ws:
  268. self.control_ws.close()
  269. if self.data_ws:
  270. self.data_ws.close()
  271. # Join threads to ensure they finish before exiting
  272. if self.control_ws_thread:
  273. self.control_ws_thread.join()
  274. if self.data_ws_thread:
  275. self.data_ws_thread.join()
  276. if self.recording_thread:
  277. self.recording_thread.join()
  278. # Clean up audio resources
  279. if self.stream:
  280. self.stream.stop_stream()
  281. self.stream.close()
  282. if self.audio_interface:
  283. self.audio_interface.terminate()
  284. def start_recording(self):
  285. self.recording_thread = threading.Thread(target=self.record_and_send_audio)
  286. self.recording_thread.daemon = False # Set to False to ensure proper shutdown
  287. self.recording_thread.start()
  288. def record_and_send_audio(self):
  289. try:
  290. if not self.setup_audio():
  291. raise Exception("Failed to set up audio recording.")
  292. self.debug_print("Recording and sending audio...")
  293. self.show_initial_indicator()
  294. while self.is_running:
  295. try:
  296. audio_data = self.stream.read(CHUNK)
  297. metadata = {"sampleRate": self.device_sample_rate}
  298. metadata_json = json.dumps(metadata)
  299. metadata_length = len(metadata_json)
  300. message = struct.pack('<I', metadata_length) + metadata_json.encode('utf-8') + audio_data
  301. self.data_ws.send(message, opcode=websocket.ABNF.OPCODE_BINARY)
  302. except Exception as e:
  303. self.debug_print(f"Error sending audio data: {e}")
  304. break # Exit the recording loop
  305. except Exception as e:
  306. self.debug_print(f"Error in record_and_send_audio: {e}")
  307. finally:
  308. self.cleanup_audio()
  309. def setup_audio(self):
  310. try:
  311. self.audio_interface = pyaudio.PyAudio()
  312. self.input_device_index = None
  313. try:
  314. default_device = self.audio_interface.get_default_input_device_info()
  315. self.input_device_index = default_device['index']
  316. except OSError as e:
  317. self.debug_print(f"No default input device found: {e}")
  318. return False
  319. self.device_sample_rate = 16000 # Try 16000 Hz first
  320. try:
  321. self.stream = self.audio_interface.open(
  322. format=FORMAT,
  323. channels=CHANNELS,
  324. rate=self.device_sample_rate,
  325. input=True,
  326. frames_per_buffer=CHUNK,
  327. input_device_index=self.input_device_index,
  328. )
  329. self.debug_print(f"Audio recording initialized successfully at {self.device_sample_rate} Hz")
  330. return True
  331. except Exception as e:
  332. self.debug_print(f"Failed to initialize audio stream at {self.device_sample_rate} Hz: {e}")
  333. return False
  334. except Exception as e:
  335. self.debug_print(f"Error initializing audio recording: {e}")
  336. if self.audio_interface:
  337. self.audio_interface.terminate()
  338. return False
  339. def cleanup_audio(self):
  340. try:
  341. if self.stream:
  342. self.stream.stop_stream()
  343. self.stream.close()
  344. self.stream = None
  345. if self.audio_interface:
  346. self.audio_interface.terminate()
  347. self.audio_interface = None
  348. except Exception as e:
  349. self.debug_print(f"Error cleaning up audio resources: {e}")
  350. def set_parameter(self, parameter, value):
  351. command = {
  352. "command": "set_parameter",
  353. "parameter": parameter,
  354. "value": value
  355. }
  356. self.control_ws.send(json.dumps(command))
  357. def get_parameter(self, parameter):
  358. command = {
  359. "command": "get_parameter",
  360. "parameter": parameter
  361. }
  362. self.control_ws.send(json.dumps(command))
  363. def call_method(self, method, args=None, kwargs=None):
  364. command = {
  365. "command": "call_method",
  366. "method": method,
  367. "args": args or [],
  368. "kwargs": kwargs or {}
  369. }
  370. self.control_ws.send(json.dumps(command))
  371. def start_command_processor(self):
  372. self.command_thread = threading.Thread(target=self.command_processor)
  373. self.command_thread.daemon = False # Ensure it is not a daemon thread
  374. self.command_thread.start()
  375. def command_processor(self):
  376. # print(f"Starting command processor")
  377. self.debug_print(f"Starting command processor")
  378. #while self.is_running and not self.stop_event.is_set():
  379. while not self.stop_event.is_set():
  380. try:
  381. command = self.commands.get(timeout=0.1)
  382. if command['type'] == 'set_parameter':
  383. self.set_parameter(command['parameter'], command['value'])
  384. elif command['type'] == 'get_parameter':
  385. self.get_parameter(command['parameter'])
  386. elif command['type'] == 'call_method':
  387. self.call_method(command['method'], command.get('args'), command.get('kwargs'))
  388. except queue.Empty: # Use queue.Empty instead of Queue.Empty
  389. continue # Queue was empty, just loop again
  390. except Exception as e:
  391. self.debug_print(f"Error in command processor: {e}")
  392. # finally:
  393. #print(f"Leaving command processor")
  394. self.debug_print(f"Leaving command processor")
  395. def add_command(self, command):
  396. self.commands.put(command)
  397. def main():
  398. parser = argparse.ArgumentParser(description="STT Client")
  399. parser.add_argument("--control-url", default=DEFAULT_CONTROL_URL, help="STT Control WebSocket URL")
  400. parser.add_argument("--data-url", default=DEFAULT_DATA_URL, help="STT Data WebSocket URL")
  401. parser.add_argument("--debug", action="store_true", help="Enable debug mode")
  402. parser.add_argument("-nort", "--norealtime", action="store_true", help="Disable real-time output")
  403. parser.add_argument("--set-param", nargs=2, metavar=('PARAM', 'VALUE'), action='append',
  404. help="Set a recorder parameter. Can be used multiple times.")
  405. parser.add_argument("--call-method", nargs='+', metavar='METHOD', action='append',
  406. help="Call a recorder method with optional arguments.")
  407. parser.add_argument("--get-param", nargs=1, metavar='PARAM', action='append',
  408. help="Get the value of a recorder parameter. Can be used multiple times.")
  409. args = parser.parse_args()
  410. # Check if output is being redirected
  411. if not os.isatty(sys.stdout.fileno()):
  412. file_output = sys.stdout
  413. else:
  414. file_output = None
  415. client = STTWebSocketClient(args.control_url, args.data_url, args.debug, file_output, args.norealtime)
  416. def signal_handler(sig, frame):
  417. client.stop()
  418. sys.exit(0)
  419. import signal
  420. signal.signal(signal.SIGINT, signal_handler)
  421. try:
  422. if client.connect():
  423. # Process command-line parameters
  424. if args.set_param:
  425. for param, value in args.set_param:
  426. try:
  427. # Attempt to parse the value to the appropriate type
  428. if '.' in value:
  429. value = float(value)
  430. else:
  431. value = int(value)
  432. except ValueError:
  433. pass # Keep as string if not a number
  434. client.add_command({
  435. 'type': 'set_parameter',
  436. 'parameter': param,
  437. 'value': value
  438. })
  439. if args.get_param:
  440. for param_list in args.get_param:
  441. param = param_list[0]
  442. client.add_command({
  443. 'type': 'get_parameter',
  444. 'parameter': param
  445. })
  446. if args.call_method:
  447. for method_call in args.call_method:
  448. method = method_call[0]
  449. args_list = method_call[1:] if len(method_call) > 1 else []
  450. client.add_command({
  451. 'type': 'call_method',
  452. 'method': method,
  453. 'args': args_list
  454. })
  455. # If command-line parameters were used (like --get-param), wait for them to be processed
  456. if args.set_param or args.get_param or args.call_method:
  457. while not client.commands.empty():
  458. time.sleep(0.1)
  459. # Start recording directly if no command-line params were provided
  460. while client.is_running:
  461. time.sleep(0.1)
  462. else:
  463. print("Failed to connect to the server.", file=sys.stderr)
  464. except Exception as e:
  465. print(f"An error occurred: {e}")
  466. finally:
  467. client.stop()
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()