realtimestt_test_hotkeys.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440
  # When True, DEBUG-level logging is configured further down in this script.
  EXTENDED_LOGGING = False

  if __name__ == '__main__':
      import subprocess
      import sys
      import threading
      import time

      def install_rich():
          """Install the 'rich' package with pip, using the running interpreter."""
          pip_command = [sys.executable, "-m", "pip", "install", "rich"]
          subprocess.check_call(pip_command)

      # The UI depends on 'rich'; when it is missing, offer to install it.
      try:
          import rich
      except ImportError:
          user_input = input("This demo needs the 'rich' library, which is not installed.\nDo you want to install it now? (y/n): ")

          # Anything other than an explicit 'y' counts as a refusal.
          if user_input.lower() != 'y':
              print("The program requires the 'rich' library to run. Exiting...")
              sys.exit(1)

          try:
              install_rich()
              import rich
              print("Successfully installed 'rich'.")
          except Exception as e:
              print(f"An error occurred while installing 'rich': {e}")
              sys.exit(1)
  import keyboard
  import pyperclip

  # Verbose logging is opt-in through the EXTENDED_LOGGING flag.
  if EXTENDED_LOGGING:
      import logging
      logging.basicConfig(level=logging.DEBUG)

  from rich.console import Console
  from rich.live import Live
  from rich.text import Text
  from rich.panel import Panel

  # Print a notice before the remaining imports run.
  console = Console()
  console.print("System initializing, please wait")

  import os
  from RealtimeSTT import AudioToTextRecorder
  import colorama
  colorama.init()

  # Keyboard automation, raw audio capture and numeric conversion
  import pyautogui
  import pyaudio
  import numpy as np

  # Live-updating rich display used for the transcription panel
  live = Live(console=console, refresh_per_second=10, screen=False)
  live.start()

  # Shared transcription state
  recorder = None
  full_sentences = []
  prev_text = ""
  displayed_text = ""  # Plain text currently shown, used to skip redundant redraws
  rich_text_stored = ""

  # Values assigned to recorder.post_speech_silence_duration by the callbacks
  mid_sentence_detection_pause = 2.0
  unknown_sentence_detection_pause = 0.7
  end_of_sentence_detection_pause = 0.45

  # Events to signal threads to exit or reset
  exit_event = threading.Event()
  reset_event = threading.Event()
  def preprocess_text(text):
      """
      Normalize a transcription fragment for display.

      Leading whitespace is stripped, one leading "..." is removed (together
      with any whitespace that follows it), and the first character is
      upper-cased. An empty result is returned unchanged.
      """
      cleaned = text.lstrip()

      # Drop a single leading ellipsis, if there is one.
      if cleaned.startswith("..."):
          cleaned = cleaned[3:]
      cleaned = cleaned.lstrip()

      # Slicing keeps this safe for the empty string.
      return cleaned[:1].upper() + cleaned[1:]
  def text_detected(text):
      """
      Handle a (partial) transcription update and refresh the live panel.

      Registered in recorder_config as 'on_realtime_transcription_update' and
      also called by process_text with an empty string to redraw the panel.
      Adjusts recorder.post_speech_silence_duration based on how the text ends,
      then renders the finished sentences in alternating colours followed by
      the current text.
      """
      global prev_text, displayed_text, rich_text_stored

      text = preprocess_text(text)
      sentence_end_marks = ['.', '!', '?', '。']

      def _ends_sentence(candidate):
          # True when the candidate is non-empty and ends with a sentence mark.
          return bool(candidate) and candidate[-1] in sentence_end_marks

      # Pick the silence duration: a trailing ellipsis gets the mid-sentence
      # pause; two consecutive updates that both end a sentence get the
      # end-of-sentence pause; everything else gets the "unknown" pause.
      if text.endswith("..."):
          pause = mid_sentence_detection_pause
      elif _ends_sentence(text) and _ends_sentence(prev_text):
          pause = end_of_sentence_detection_pause
      else:
          pause = unknown_sentence_detection_pause
      recorder.post_speech_silence_duration = pause

      prev_text = text

      # Finished sentences alternate between yellow (even index) and cyan (odd).
      rich_text = Text()
      for index, sentence in enumerate(full_sentences):
          colour = "cyan" if index % 2 else "yellow"
          rich_text += Text(sentence, style=colour) + Text(" ")

      # Append the current text, if any, in bold.
      if text:
          rich_text += Text(text, style="bold yellow")

      # Redraw only when the visible plain text actually changed.
      new_displayed_text = rich_text.plain
      if new_displayed_text != displayed_text:
          displayed_text = new_displayed_text
          panel = Panel(
              rich_text,
              title="[bold green]Live Transcription[/bold green]",
              border_style="bold green",
          )
          live.update(panel)
          rich_text_stored = rich_text
  def process_text(text):
      """
      Handle a finalized transcription: store it, redraw, and paste it.

      Passed to recorder.text() by transcription_loop. The text is normalized
      with preprocess_text, appended to full_sentences, and the live panel is
      refreshed. If reset_event is set, the sentence buffer is cleared instead
      of pasting. Otherwise the sentence (plus a trailing space) is copied to
      the clipboard and pasted into the active window with Ctrl+V.

      An empty transcription is neither stored nor pasted, so it cannot shift
      the colour alternation or insert a stray space into the active window.
      """
      global prev_text, displayed_text

      recorder.post_speech_silence_duration = unknown_sentence_detection_pause

      text = preprocess_text(text).rstrip()
      if text.endswith("..."):
          # Dropping two of the three dots leaves a single trailing period.
          text = text[:-2]

      if text:
          full_sentences.append(text)
      prev_text = ""
      text_detected("")

      # A pending reset discards everything collected so far, including the
      # sentence that was just appended, and skips pasting.
      if reset_event.is_set():
          full_sentences.clear()
          displayed_text = ""
          reset_event.clear()
          console.print("[bold magenta]Transcription buffer reset.[/bold magenta]")
          return

      # Nothing to type for an empty transcription.
      if not text:
          return

      # Paste the finalized sentence into the active window.
      # NOTE: this overwrites the current clipboard content.
      try:
          # Release modifier keys to prevent stuck keys
          for key in ['ctrl', 'shift', 'alt', 'win']:
              keyboard.release(key)
              pyautogui.keyUp(key)

          # Use clipboard to paste text
          pyperclip.copy(text + ' ')
          pyautogui.hotkey('ctrl', 'v')
      except Exception as e:
          console.print(f"[bold red]Failed to type the text: {e}[/bold red]")
  # Settings passed as keyword arguments to AudioToTextRecorder
  recorder_config = {
      'spinner': False,
      'model': 'tiny.en',
      'input_device_index': 1,
      'realtime_model_type': 'tiny.en',
      'language': 'en',
      'silero_sensitivity': 0.05,
      'webrtc_sensitivity': 3,
      'post_speech_silence_duration': unknown_sentence_detection_pause,
      'min_length_of_recording': 1.1,
      'min_gap_between_recordings': 0,
      'enable_realtime_transcription': False,
      'realtime_processing_pause': 0.02,
      'on_realtime_transcription_update': text_detected,
      'silero_deactivity_detection': True,
      'early_transcription_on_silence': 0,
      'beam_size': 5,
      'beam_size_realtime': 3,
      'no_log_file': True,
      'initial_prompt': "Use ellipses for incomplete sentences like: I went to the...",
  }

  # Pass the DEBUG level on to the recorder when extended logging is on.
  if EXTENDED_LOGGING:
      recorder_config['level'] = logging.DEBUG

  recorder = AudioToTextRecorder(**recorder_config)

  # Placeholder panel shown until the first transcription arrives
  initial_text = Panel(
      Text("Say something...", style="cyan bold"),
      title="[bold yellow]Waiting for Input[/bold yellow]",
      border_style="bold yellow",
  )
  live.update(initial_text)

  # Print available hotkeys
  for hotkey_help_line in (
      "[bold green]Available Hotkeys:[/bold green]",
      "[bold cyan]F1[/bold cyan]: Mute Microphone",
      "[bold cyan]F2[/bold cyan]: Unmute Microphone",
      "[bold cyan]F3[/bold cyan]: Start Static Recording",
      "[bold cyan]F4[/bold cyan]: Stop Static Recording",
      "[bold cyan]F5[/bold cyan]: Reset Transcription",
  ):
      console.print(hotkey_help_line)

  # State shared by the static-recording functions
  static_recording_active = False
  static_recording_thread = None
  static_audio_frames = []
  live_recording_enabled = True  # Whether live recording was enabled before static recording started

  # Note: The maximum recommended length of static recording is about 5 minutes.
  def static_recording_worker():
      """
      Worker function to record audio statically.

      Opens a 16 kHz, mono, 16-bit PyAudio input stream and appends raw chunks
      to static_audio_frames until static_recording_active becomes False or
      exit_event is set. If the stream cannot be opened, the failure is
      reported and static_recording_active is reset to False.

      The stream and the PyAudio instance are released in `finally` blocks so
      they are freed on every exit path. Input overflows are tolerated instead
      of aborting the capture.
      """
      global static_audio_frames, static_recording_active

      # Capture format used for the static recording
      FORMAT = pyaudio.paInt16
      CHANNELS = 1
      RATE = 16000  # Sample rate
      CHUNK = 1024  # Buffer size

      p = pyaudio.PyAudio()
      try:
          # Open the audio stream
          try:
              stream = p.open(format=FORMAT,
                              channels=CHANNELS,
                              rate=RATE,
                              input=True,
                              frames_per_buffer=CHUNK)
          except Exception as e:
              console.print(f"[bold red]Failed to open audio stream for static recording: {e}[/bold red]")
              static_recording_active = False
              return

          try:
              while static_recording_active and not exit_event.is_set():
                  try:
                      # Do not raise on input overflow; a dropped chunk is
                      # preferable to ending the recording.
                      data = stream.read(CHUNK, exception_on_overflow=False)
                  except Exception as e:
                      console.print(f"[bold red]Error during static recording: {e}[/bold red]")
                      break
                  static_audio_frames.append(data)
          finally:
              # Stop and close the stream even if the loop ended abnormally.
              try:
                  stream.stop_stream()
              finally:
                  stream.close()
      finally:
          p.terminate()
  def start_static_recording():
      """
      Begin a static recording on a background thread.

      Does nothing except print a notice when a static recording is already
      running. Otherwise it remembers whether the live microphone was on,
      mutes it if so, clears the frame buffer and starts
      static_recording_worker as a daemon thread.
      """
      global static_recording_active, static_recording_thread, static_audio_frames, live_recording_enabled

      if static_recording_active:
          console.print("[bold yellow]Static recording is already in progress.[/bold yellow]")
          return

      # Remember the live microphone state so it can be restored later.
      live_recording_enabled = recorder.use_microphone.value
      if live_recording_enabled:
          recorder.set_microphone(False)
          console.print("[bold yellow]Live microphone muted during static recording.[/bold yellow]")

      console.print("[bold green]Starting static recording... Press F4 or F5 to stop/reset.[/bold green]")

      # Fresh buffer and active flag must be in place before the worker runs.
      static_audio_frames = []
      static_recording_active = True

      worker = threading.Thread(target=static_recording_worker, daemon=True)
      static_recording_thread = worker
      worker.start()
  def stop_static_recording():
      """
      End the running static recording and transcribe it in the background.

      Prints a notice and returns when no static recording is active.
      Otherwise it clears the active flag, waits for the worker thread to
      finish and starts process_static_transcription on a daemon thread.
      """
      global static_recording_active, static_recording_thread

      if not static_recording_active:
          console.print("[bold yellow]No static recording is in progress.[/bold yellow]")
          return

      console.print("[bold green]Stopping static recording...[/bold green]")

      # Clearing the flag makes the worker loop exit; then wait for it.
      static_recording_active = False
      if static_recording_thread is not None:
          static_recording_thread.join()
      static_recording_thread = None

      # Transcribe on a separate thread.
      threading.Thread(target=process_static_transcription, daemon=True).start()
  def process_static_transcription():
      """
      Transcribe the captured static recording and paste the result.

      Joins static_audio_frames into one buffer, converts it to float32
      samples in [-1, 1), transcribes it with a faster_whisper WhisperModel
      configured from recorder_config, prints the text and pastes it into the
      active window via the clipboard.

      Returns immediately when exit_event is set. On every other path,
      including failures and an empty capture, the live microphone is restored
      if it was enabled before the static recording started, so a failed
      transcription no longer leaves it muted.
      """
      if exit_event.is_set():
          return

      try:
          console.print("[bold green]Processing static recording...[/bold green]")

          # Nothing was captured; skip loading the model.
          if not static_audio_frames:
              console.print("[bold yellow]No audio was captured; nothing to transcribe.[/bold yellow]")
              return

          # Convert 16-bit PCM bytes to normalized float32 samples.
          audio_data = b''.join(static_audio_frames)
          audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

          try:
              from faster_whisper import WhisperModel
          except ImportError:
              console.print("[bold red]faster_whisper is not installed. Please install it to use static transcription.[/bold red]")
              return

          # Take the model settings from the recorder configuration.
          model_size = recorder_config.get('model', 'tiny.en')
          device = recorder_config.get('device', 'cpu')
          compute_type = recorder_config.get('compute_type', 'default')
          beam_size = recorder_config.get('beam_size', 5)

          console.print("Loading transcription model... This may take a moment.")
          try:
              model = WhisperModel(model_size, device=device, compute_type=compute_type)
          except Exception as e:
              console.print(f"[bold red]Failed to load transcription model: {e}[/bold red]")
              return

          # Transcribe the audio
          try:
              segments, info = model.transcribe(audio_array, beam_size=beam_size)
              transcription = ' '.join(segment.text for segment in segments).strip()
          except Exception as e:
              console.print(f"[bold red]Error during transcription: {e}[/bold red]")
              return

          # Display the transcription
          console.print("Static Recording Transcription:")
          console.print(f"[bold cyan]{transcription}[/bold cyan]")

          # Nothing to type for an empty transcription.
          if not transcription:
              return

          # Paste the transcription into the active window.
          # NOTE: this overwrites the current clipboard content.
          try:
              # Release modifier keys to prevent stuck keys
              for key in ['ctrl', 'shift', 'alt', 'win']:
                  keyboard.release(key)
                  pyautogui.keyUp(key)

              # Use clipboard to paste text
              pyperclip.copy(transcription + ' ')
              pyautogui.hotkey('ctrl', 'v')
          except Exception as e:
              console.print(f"[bold red]Failed to type the static transcription: {e}[/bold red]")
      finally:
          # Unmute the live recording microphone if it was enabled before
          if live_recording_enabled and not exit_event.is_set():
              recorder.set_microphone(True)
              console.print("[bold yellow]Live microphone unmuted.[/bold yellow]")
  def reset_transcription():
      """
      Discard whatever is currently being recorded or buffered.

      Three cases are handled:
      - a static recording is running: stop it, drop its frames and unmute the
        live microphone if it was on beforehand;
      - the live microphone is on: set reset_event;
      - the microphone is muted: only print a notice.
      """
      global static_recording_active, static_recording_thread, static_audio_frames

      console.print("[bold magenta]Resetting transcription...[/bold magenta]")

      if static_recording_active:
          console.print("[bold magenta]Flushing static recording...[/bold magenta]")

          # Stop the worker and wait until it has finished.
          static_recording_active = False
          if static_recording_thread is not None:
              static_recording_thread.join()
              static_recording_thread = None

          # Throw away the captured audio.
          static_audio_frames = []

          # Restore the live microphone if it was on before the static recording.
          if live_recording_enabled:
              recorder.set_microphone(True)
              console.print("[bold yellow]Live microphone unmuted after reset.[/bold yellow]")
          return

      if recorder.use_microphone.value:
          # Live transcription is active and the microphone is not muted.
          console.print("[bold magenta]Resetting live transcription buffer...[/bold magenta]")
          reset_event.set()
          return

      # Microphone is muted; nothing to reset.
      console.print("[bold yellow]Microphone is muted. Nothing to reset.[/bold yellow]")
  # Hotkey callbacks for the live microphone
  def _apply_microphone_state(enabled, notice):
      """Switch the recorder microphone on or off, then print the notice."""
      recorder.set_microphone(enabled)
      console.print(notice)

  def mute_microphone():
      """Mute the recorder microphone and report it."""
      _apply_microphone_state(False, "[bold red]Microphone muted.[/bold red]")

  def unmute_microphone():
      """Unmute the recorder microphone and report it."""
      _apply_microphone_state(True, "[bold green]Microphone unmuted.[/bold green]")
  # Body of the background transcription thread
  def transcription_loop():
      """
      Repeatedly ask the recorder for text until exit_event is set.

      Each result is passed to process_text. Any exception ends the loop and is
      printed; the function never calls sys.exit() itself.
      """
      try:
          while True:
              if exit_event.is_set():
                  break
              recorder.text(process_text)
      except Exception as e:
          console.print(f"[bold red]Error in transcription loop: {e}[/bold red]")
  # Start the transcription loop thread
  transcription_thread = threading.Thread(target=transcription_loop, daemon=True)
  transcription_thread.start()

  # Register the hotkeys; suppress=True keeps the key press from reaching
  # other applications.
  for hotkey_name, hotkey_callback in (
      ('F1', mute_microphone),
      ('F2', unmute_microphone),
      ('F3', start_static_recording),
      ('F4', stop_static_recording),
      ('F5', reset_transcription),
  ):
      keyboard.add_hotkey(hotkey_name, hotkey_callback, suppress=True)

  # Keep the main thread running and handle graceful exit
  try:
      keyboard.wait()  # Blocks indefinitely; Ctrl+C is handled below
  except KeyboardInterrupt:
      console.print("[bold yellow]KeyboardInterrupt received. Exiting...[/bold yellow]")
  finally:
      # Signal threads to exit
      exit_event.set()

      # Flush any ongoing recording. A failure here must not prevent the
      # remaining cleanup from running.
      try:
          reset_transcription()
      except Exception as e:
          console.print(f"[bold red]Error while resetting transcription: {e}[/bold red]")

      # Stop the recorder. Prefer shutdown() when the recorder provides it,
      # then fall back to stop() and close() as before.
      try:
          for method_name in ('shutdown', 'stop', 'close'):
              stop_method = getattr(recorder, method_name, None)
              if callable(stop_method):
                  stop_method()
                  break
      except Exception as e:
          console.print(f"[bold red]Error stopping recorder: {e}[/bold red]")

      # Allow some time for threads to finish
      time.sleep(1)

      # Wait for transcription_thread to finish
      if transcription_thread.is_alive():
          transcription_thread.join(timeout=5)

      # Stop the Live console
      live.stop()

      console.print("[bold red]Exiting gracefully...[/bold red]")
      sys.exit(0)