stt_cli_client.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554
  1. """
  2. Speech-to-Text (STT) Client CLI for WebSocket Server Interaction
  3. This command-line interface (CLI) allows interaction with the Speech-to-Text (STT) WebSocket server. It connects to the server via control and data WebSocket URLs to facilitate real-time speech transcription, control the server, and manage various parameters related to the STT process.
  4. The client can be used to start recording audio, set or retrieve STT parameters, and interact with the server using commands. Additionally, the CLI can disable real-time updates or run in debug mode for detailed output.
  5. ### Features:
  6. - Connects to STT WebSocket server for real-time transcription and control.
  7. - Supports setting and retrieving parameters via the command line.
  8. - Allows calling server methods (e.g., start/stop recording).
  9. - Option to disable real-time updates during transcription.
  10. - Debug mode available for verbose logging.
  11. ### Starting the Client:
  12. You can start the client using the command `stt` and optionally pass configuration options or commands for interacting with the server.
  13. ```bash
  14. stt [OPTIONS]
  15. ```
  16. ### Available Parameters:
  17. - `--control-url` (default: "ws://127.0.0.1:8011"): The WebSocket URL for server control commands.
  18. - `--data-url` (default: "ws://127.0.0.1:8012"): The WebSocket URL for sending audio data and receiving transcription updates.
  19. - `--debug`: Enable debug mode, which prints detailed logs to stderr.
  20. - `-nort` or `--norealtime`: Disable real-time output of transcription results.
  21. - `--set-param PARAM VALUE`: Set a recorder parameter (e.g., silero_sensitivity, beam_size, etc.). This option can be used multiple times to set different parameters.
  22. - `--get-param PARAM`: Retrieve the value of a specific recorder parameter. This option can be used multiple times to retrieve different parameters.
  23. - `--call-method METHOD [ARGS]`: Call a method on the recorder with optional arguments. This option can be used multiple times for different methods.
  24. ### Example Usage:
  25. 1. **Start the client with default settings:**
  26. ```bash
  27. stt
  28. ```
  29. 2. **Set a recorder parameter (e.g., set Silero sensitivity to 0.1):**
  30. ```bash
  31. stt --set-param silero_sensitivity 0.1
  32. ```
  33. 3. **Retrieve the value of a recorder parameter (e.g., get the current Silero sensitivity):**
  34. ```bash
  35. stt --get-param silero_sensitivity
  36. ```
  37. 4. **Call a method on the recorder (e.g., start the microphone input):**
  38. ```bash
  39. stt --call-method set_microphone
  40. ```
  41. 5. **Run in debug mode:**
  42. ```bash
  43. stt --debug
  44. ```
  45. ### WebSocket Interface:
  46. - **Control WebSocket**: Used for sending control commands like setting parameters or invoking methods.
  47. - **Data WebSocket**: Used for sending audio data for real-time transcription and receiving transcription results.
  48. The client can be used to send audio data to the server for transcription and to control the behavior of the server remotely.
  49. """
  50. import os
  51. import sys
  52. import pyaudio
  53. import numpy as np
  54. from scipy import signal
  55. import logging
  56. import websocket
  57. import argparse
  58. import json
  59. import threading
  60. import time
  61. import struct
  62. import socket
  63. import subprocess
  64. import shutil
  65. from urllib.parse import urlparse
  66. import queue
  67. from queue import Queue
# NOTE(review): presumably meant to quiet ALSA diagnostics on Linux; it is set
# after `pyaudio` has already been imported above - confirm it takes effect.
os.environ['ALSA_LOG_LEVEL'] = 'none'
# Constants
# Frames read from the input stream per iteration; also passed to PyAudio as
# frames_per_buffer when the stream is opened.
CHUNK = 1024
# Sample format and channel count handed to PyAudio when opening the stream.
FORMAT = pyaudio.paInt16
CHANNELS = 1
# NOTE(review): not referenced anywhere else in the visible file; the stream
# is opened at 16000 Hz in STTWebSocketClient.setup_audio.
RATE = 44100
# Defaults for the --control-url / --data-url command-line options.
DEFAULT_CONTROL_URL = "ws://127.0.0.1:8011"
DEFAULT_DATA_URL = "ws://127.0.0.1:8012"
# Initialize colorama
from colorama import init, Fore, Style
init()
# Stop websocket from spamming the log
websocket.enableTrace(False)
  81. class STTWebSocketClient:
  82. def __init__(self, control_url, data_url, debug=False, file_output=None, norealtime=False):
  83. self.control_url = control_url
  84. self.data_url = data_url
  85. self.control_ws = None
  86. self.data_ws = None
  87. self.is_running = False
  88. self.debug = debug
  89. self.file_output = file_output
  90. self.last_text = ""
  91. self.console_width = shutil.get_terminal_size().columns
  92. self.recording_indicator = "🔴"
  93. self.norealtime = norealtime
  94. self.connection_established = threading.Event()
  95. self.message_queue = Queue()
  96. self.commands = Queue()
  97. self.stop_event = threading.Event()
  98. # Audio attributes
  99. self.audio_interface = None
  100. self.stream = None
  101. self.device_sample_rate = None
  102. self.input_device_index = None
  103. # Threads
  104. self.control_ws_thread = None
  105. self.data_ws_thread = None
  106. self.recording_thread = None
  107. def debug_print(self, message):
  108. if self.debug:
  109. print(message, file=sys.stderr)
  110. def connect(self):
  111. if not self.ensure_server_running():
  112. self.debug_print("Cannot start STT server. Exiting.")
  113. return False
  114. try:
  115. # Connect to control WebSocket
  116. self.control_ws = websocket.WebSocketApp(self.control_url,
  117. on_message=self.on_control_message,
  118. on_error=self.on_error,
  119. on_close=self.on_close,
  120. on_open=self.on_control_open)
  121. self.control_ws_thread = threading.Thread(target=self.control_ws.run_forever)
  122. self.control_ws_thread.daemon = False # Set to False to ensure proper shutdown
  123. self.control_ws_thread.start()
  124. # Connect to data WebSocket
  125. self.data_ws = websocket.WebSocketApp(self.data_url,
  126. on_message=self.on_data_message,
  127. on_error=self.on_error,
  128. on_close=self.on_close,
  129. on_open=self.on_data_open)
  130. self.data_ws_thread = threading.Thread(target=self.data_ws.run_forever)
  131. self.data_ws_thread.daemon = False # Set to False to ensure proper shutdown
  132. self.data_ws_thread.start()
  133. # Wait for the connections to be established
  134. if not self.connection_established.wait(timeout=10):
  135. self.debug_print("Timeout while connecting to the server.")
  136. return False
  137. self.debug_print("WebSocket connections established successfully.")
  138. return True
  139. except Exception as e:
  140. self.debug_print(f"Error while connecting to the server: {e}")
  141. return False
  142. def on_control_open(self, ws):
  143. self.debug_print("Control WebSocket connection opened.")
  144. self.connection_established.set()
  145. self.start_command_processor()
  146. def on_data_open(self, ws):
  147. self.debug_print("Data WebSocket connection opened.")
  148. self.is_running = True
  149. self.start_recording()
  150. def on_error(self, ws, error):
  151. self.debug_print(f"WebSocket error: {error}")
  152. def on_close(self, ws, close_status_code, close_msg):
  153. self.debug_print(f"WebSocket connection closed: {close_status_code} - {close_msg}")
  154. self.is_running = False
  155. self.stop_event.set()
  156. def is_server_running(self):
  157. parsed_url = urlparse(self.control_url)
  158. host = parsed_url.hostname
  159. port = parsed_url.port or 80
  160. with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
  161. return s.connect_ex((host, port)) == 0
  162. def ask_to_start_server(self):
  163. response = input("Would you like to start the STT server now? (y/n): ").strip().lower()
  164. return response == 'y' or response == 'yes'
  165. def start_server(self):
  166. if os.name == 'nt': # Windows
  167. subprocess.Popen('start /min cmd /c stt-server', shell=True)
  168. else: # Unix-like systems
  169. subprocess.Popen(['stt-server'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, start_new_session=True)
  170. print("STT server start command issued. Please wait a moment for it to initialize.", file=sys.stderr)
  171. def ensure_server_running(self):
  172. if not self.is_server_running():
  173. print("STT server is not running.", file=sys.stderr)
  174. if self.ask_to_start_server():
  175. self.start_server()
  176. print("Waiting for STT server to start...", file=sys.stderr)
  177. for _ in range(20): # Wait up to 20 seconds
  178. if self.is_server_running():
  179. print("STT server started successfully.", file=sys.stderr)
  180. time.sleep(2) # Give the server a moment to fully initialize
  181. return True
  182. time.sleep(1)
  183. print("Failed to start STT server.", file=sys.stderr)
  184. return False
  185. else:
  186. print("STT server is required. Please start it manually.", file=sys.stderr)
  187. return False
  188. return True
  189. # Handle control messages like set_parameter, get_parameter, etc.
  190. def on_control_message(self, ws, message):
  191. try:
  192. data = json.loads(message)
  193. # Handle server response with status
  194. if 'status' in data:
  195. if data['status'] == 'success':
  196. # print(f"Server Response: {data.get('message', '')}")
  197. if 'parameter' in data and 'value' in data:
  198. print(f"Parameter {data['parameter']} = {data['value']}")
  199. elif data['status'] == 'error':
  200. print(f"Server Error: {data.get('message', '')}")
  201. else:
  202. self.debug_print(f"Unknown control message format: {data}")
  203. except json.JSONDecodeError:
  204. self.debug_print(f"Received non-JSON control message: {message}")
  205. except Exception as e:
  206. self.debug_print(f"Error processing control message: {e}")
  207. # Handle real-time transcription and full sentence updates
  208. def on_data_message(self, ws, message):
  209. try:
  210. data = json.loads(message)
  211. # Handle real-time transcription updates
  212. if data.get('type') == 'realtime':
  213. if data['text'] != self.last_text:
  214. self.last_text = data['text']
  215. if not self.norealtime:
  216. self.update_progress_bar(self.last_text)
  217. # Handle full sentences
  218. elif data.get('type') == 'fullSentence':
  219. if self.file_output:
  220. sys.stderr.write('\r\033[K')
  221. sys.stderr.write(data['text'])
  222. sys.stderr.write('\n')
  223. sys.stderr.flush()
  224. print(data['text'], file=self.file_output)
  225. self.file_output.flush() # Ensure it's written immediately
  226. else:
  227. self.finish_progress_bar()
  228. print(f"{data['text']}")
  229. self.stop()
  230. else:
  231. self.debug_print(f"Unknown data message format: {data}")
  232. except json.JSONDecodeError:
  233. self.debug_print(f"Received non-JSON data message: {message}")
  234. except Exception as e:
  235. self.debug_print(f"Error processing data message: {e}")
  236. def show_initial_indicator(self):
  237. if self.norealtime:
  238. return
  239. initial_text = f"{self.recording_indicator}\b\b"
  240. sys.stderr.write(initial_text)
  241. sys.stderr.flush()
  242. def update_progress_bar(self, text):
  243. try:
  244. available_width = self.console_width - 5 # Adjust for progress bar decorations
  245. sys.stderr.write('\r\033[K') # Clear the current line
  246. words = text.split()
  247. last_chars = ""
  248. for word in reversed(words):
  249. if len(last_chars) + len(word) + 1 > available_width:
  250. break
  251. last_chars = word + " " + last_chars
  252. last_chars = last_chars.strip()
  253. colored_text = f"{Fore.YELLOW}{last_chars}{Style.RESET_ALL}{self.recording_indicator}\b\b"
  254. sys.stderr.write(colored_text)
  255. sys.stderr.flush()
  256. except Exception as e:
  257. self.debug_print(f"Error updating progress bar: {e}")
  258. def finish_progress_bar(self):
  259. try:
  260. sys.stderr.write('\r\033[K')
  261. sys.stderr.flush()
  262. except Exception as e:
  263. self.debug_print(f"Error finishing progress bar: {e}")
  264. def stop(self):
  265. self.finish_progress_bar()
  266. self.is_running = False
  267. self.stop_event.set()
  268. if self.control_ws:
  269. self.control_ws.close()
  270. if self.data_ws:
  271. self.data_ws.close()
  272. # Join threads to ensure they finish before exiting
  273. if self.control_ws_thread:
  274. self.control_ws_thread.join()
  275. if self.data_ws_thread:
  276. self.data_ws_thread.join()
  277. if self.recording_thread:
  278. self.recording_thread.join()
  279. # Clean up audio resources
  280. if self.stream:
  281. self.stream.stop_stream()
  282. self.stream.close()
  283. if self.audio_interface:
  284. self.audio_interface.terminate()
  285. def start_recording(self):
  286. self.recording_thread = threading.Thread(target=self.record_and_send_audio)
  287. self.recording_thread.daemon = False # Set to False to ensure proper shutdown
  288. self.recording_thread.start()
  289. def record_and_send_audio(self):
  290. try:
  291. if not self.setup_audio():
  292. raise Exception("Failed to set up audio recording.")
  293. self.debug_print("Recording and sending audio...")
  294. self.show_initial_indicator()
  295. while self.is_running:
  296. try:
  297. audio_data = self.stream.read(CHUNK)
  298. metadata = {"sampleRate": self.device_sample_rate}
  299. metadata_json = json.dumps(metadata)
  300. metadata_length = len(metadata_json)
  301. message = struct.pack('<I', metadata_length) + metadata_json.encode('utf-8') + audio_data
  302. self.data_ws.send(message, opcode=websocket.ABNF.OPCODE_BINARY)
  303. except Exception as e:
  304. self.debug_print(f"Error sending audio data: {e}")
  305. break # Exit the recording loop
  306. except Exception as e:
  307. self.debug_print(f"Error in record_and_send_audio: {e}")
  308. finally:
  309. self.cleanup_audio()
  310. def setup_audio(self):
  311. try:
  312. self.audio_interface = pyaudio.PyAudio()
  313. self.input_device_index = None
  314. try:
  315. default_device = self.audio_interface.get_default_input_device_info()
  316. self.input_device_index = default_device['index']
  317. except OSError as e:
  318. self.debug_print(f"No default input device found: {e}")
  319. return False
  320. self.device_sample_rate = 16000 # Try 16000 Hz first
  321. try:
  322. self.stream = self.audio_interface.open(
  323. format=FORMAT,
  324. channels=CHANNELS,
  325. rate=self.device_sample_rate,
  326. input=True,
  327. frames_per_buffer=CHUNK,
  328. input_device_index=self.input_device_index,
  329. )
  330. self.debug_print(f"Audio recording initialized successfully at {self.device_sample_rate} Hz")
  331. return True
  332. except Exception as e:
  333. self.debug_print(f"Failed to initialize audio stream at {self.device_sample_rate} Hz: {e}")
  334. return False
  335. except Exception as e:
  336. self.debug_print(f"Error initializing audio recording: {e}")
  337. if self.audio_interface:
  338. self.audio_interface.terminate()
  339. return False
  340. def cleanup_audio(self):
  341. try:
  342. if self.stream:
  343. self.stream.stop_stream()
  344. self.stream.close()
  345. self.stream = None
  346. if self.audio_interface:
  347. self.audio_interface.terminate()
  348. self.audio_interface = None
  349. except Exception as e:
  350. self.debug_print(f"Error cleaning up audio resources: {e}")
  351. def set_parameter(self, parameter, value):
  352. command = {
  353. "command": "set_parameter",
  354. "parameter": parameter,
  355. "value": value
  356. }
  357. self.control_ws.send(json.dumps(command))
  358. def get_parameter(self, parameter):
  359. command = {
  360. "command": "get_parameter",
  361. "parameter": parameter
  362. }
  363. self.control_ws.send(json.dumps(command))
  364. def call_method(self, method, args=None, kwargs=None):
  365. command = {
  366. "command": "call_method",
  367. "method": method,
  368. "args": args or [],
  369. "kwargs": kwargs or {}
  370. }
  371. self.control_ws.send(json.dumps(command))
  372. def start_command_processor(self):
  373. self.command_thread = threading.Thread(target=self.command_processor)
  374. self.command_thread.daemon = False # Ensure it is not a daemon thread
  375. self.command_thread.start()
  376. def command_processor(self):
  377. # print(f"Starting command processor")
  378. self.debug_print(f"Starting command processor")
  379. #while self.is_running and not self.stop_event.is_set():
  380. while not self.stop_event.is_set():
  381. try:
  382. command = self.commands.get(timeout=0.1)
  383. if command['type'] == 'set_parameter':
  384. self.set_parameter(command['parameter'], command['value'])
  385. elif command['type'] == 'get_parameter':
  386. self.get_parameter(command['parameter'])
  387. elif command['type'] == 'call_method':
  388. self.call_method(command['method'], command.get('args'), command.get('kwargs'))
  389. except queue.Empty: # Use queue.Empty instead of Queue.Empty
  390. continue # Queue was empty, just loop again
  391. except Exception as e:
  392. self.debug_print(f"Error in command processor: {e}")
  393. # finally:
  394. #print(f"Leaving command processor")
  395. self.debug_print(f"Leaving command processor")
  396. def add_command(self, command):
  397. self.commands.put(command)
  398. def main():
  399. parser = argparse.ArgumentParser(description="STT Client")
  400. parser.add_argument("--control-url", default=DEFAULT_CONTROL_URL, help="STT Control WebSocket URL")
  401. parser.add_argument("--data-url", default=DEFAULT_DATA_URL, help="STT Data WebSocket URL")
  402. parser.add_argument("--debug", action="store_true", help="Enable debug mode")
  403. parser.add_argument("-nort", "--norealtime", action="store_true", help="Disable real-time output")
  404. parser.add_argument("--set-param", nargs=2, metavar=('PARAM', 'VALUE'), action='append',
  405. help="Set a recorder parameter. Can be used multiple times.")
  406. parser.add_argument("--call-method", nargs='+', metavar='METHOD', action='append',
  407. help="Call a recorder method with optional arguments.")
  408. parser.add_argument("--get-param", nargs=1, metavar='PARAM', action='append',
  409. help="Get the value of a recorder parameter. Can be used multiple times.")
  410. args = parser.parse_args()
  411. # Check if output is being redirected
  412. if not os.isatty(sys.stdout.fileno()):
  413. file_output = sys.stdout
  414. else:
  415. file_output = None
  416. client = STTWebSocketClient(args.control_url, args.data_url, args.debug, file_output, args.norealtime)
  417. def signal_handler(sig, frame):
  418. client.stop()
  419. sys.exit(0)
  420. import signal
  421. signal.signal(signal.SIGINT, signal_handler)
  422. try:
  423. if client.connect():
  424. # Process command-line parameters
  425. if args.set_param:
  426. for param, value in args.set_param:
  427. try:
  428. # Attempt to parse the value to the appropriate type
  429. if '.' in value:
  430. value = float(value)
  431. else:
  432. value = int(value)
  433. except ValueError:
  434. pass # Keep as string if not a number
  435. client.add_command({
  436. 'type': 'set_parameter',
  437. 'parameter': param,
  438. 'value': value
  439. })
  440. if args.get_param:
  441. for param_list in args.get_param:
  442. param = param_list[0]
  443. client.add_command({
  444. 'type': 'get_parameter',
  445. 'parameter': param
  446. })
  447. if args.call_method:
  448. for method_call in args.call_method:
  449. method = method_call[0]
  450. args_list = method_call[1:] if len(method_call) > 1 else []
  451. client.add_command({
  452. 'type': 'call_method',
  453. 'method': method,
  454. 'args': args_list
  455. })
  456. # If command-line parameters were used (like --get-param), wait for them to be processed
  457. if args.set_param or args.get_param or args.call_method:
  458. while not client.commands.empty():
  459. time.sleep(0.1)
  460. # Start recording directly if no command-line params were provided
  461. while client.is_running:
  462. time.sleep(0.1)
  463. else:
  464. print("Failed to connect to the server.", file=sys.stderr)
  465. except Exception as e:
  466. print(f"An error occurred: {e}")
  467. finally:
  468. client.stop()
  469. if __name__ == "__main__":
  470. main()