realtimestt_test_hotkeys_v2.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
  1. EXTENDED_LOGGING = False
  2. if __name__ == '__main__':
  3. import subprocess
  4. import sys
  5. import threading
  6. import time
  7. def install_rich():
  8. subprocess.check_call([sys.executable, "-m", "pip", "install", "rich"])
  9. try:
  10. import rich
  11. except ImportError:
  12. user_input = input("This demo needs the 'rich' library, which is not installed.\nDo you want to install it now? (y/n): ")
  13. if user_input.lower() == 'y':
  14. try:
  15. install_rich()
  16. import rich
  17. print("Successfully installed 'rich'.")
  18. except Exception as e:
  19. print(f"An error occurred while installing 'rich': {e}")
  20. sys.exit(1)
  21. else:
  22. print("The program requires the 'rich' library to run. Exiting...")
  23. sys.exit(1)
  24. import keyboard
  25. import pyperclip
  26. if EXTENDED_LOGGING:
  27. import logging
  28. logging.basicConfig(level=logging.DEBUG)
  29. from rich.console import Console
  30. from rich.live import Live
  31. from rich.text import Text
  32. from rich.panel import Panel
  33. console = Console()
  34. console.print("System initializing, please wait")
  35. import os
  36. from RealtimeSTT import AudioToTextRecorder # Ensure this module has stop() or close() methods
  37. import colorama
  38. colorama.init()
  39. # Import pyautogui
  40. import pyautogui
  41. import pyaudio
  42. import numpy as np
  43. # Initialize Rich Console and Live
  44. live = Live(console=console, refresh_per_second=10, screen=False)
  45. live.start()
  46. # Global variables
  47. full_sentences = []
  48. rich_text_stored = ""
  49. recorder = None
  50. displayed_text = "" # Used for tracking text that was already displayed
  51. end_of_sentence_detection_pause = 0.45
  52. unknown_sentence_detection_pause = 0.7
  53. mid_sentence_detection_pause = 2.0
  54. prev_text = ""
  55. # Events to signal threads to exit or reset
  56. exit_event = threading.Event()
  57. reset_event = threading.Event()
  58. def preprocess_text(text):
  59. # Remove leading whitespaces
  60. text = text.lstrip()
  61. # Remove starting ellipses if present
  62. if text.startswith("..."):
  63. text = text[3:]
  64. # Remove any leading whitespaces again after ellipses removal
  65. text = text.lstrip()
  66. # Uppercase the first letter
  67. if text:
  68. text = text[0].upper() + text[1:]
  69. return text
  70. def text_detected(text):
  71. global prev_text, displayed_text, rich_text_stored
  72. text = preprocess_text(text)
  73. sentence_end_marks = ['.', '!', '?', '。']
  74. if text.endswith("..."):
  75. recorder.post_speech_silence_duration = mid_sentence_detection_pause
  76. elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
  77. recorder.post_speech_silence_duration = end_of_sentence_detection_pause
  78. else:
  79. recorder.post_speech_silence_duration = unknown_sentence_detection_pause
  80. prev_text = text
  81. # Build Rich Text with alternating colors
  82. rich_text = Text()
  83. for i, sentence in enumerate(full_sentences):
  84. if i % 2 == 0:
  85. rich_text += Text(sentence, style="yellow") + Text(" ")
  86. else:
  87. rich_text += Text(sentence, style="cyan") + Text(" ")
  88. # If the current text is not a sentence-ending, display it in real-time
  89. if text:
  90. rich_text += Text(text, style="bold yellow")
  91. new_displayed_text = rich_text.plain
  92. if new_displayed_text != displayed_text:
  93. displayed_text = new_displayed_text
  94. panel = Panel(rich_text, title="[bold green]Live Transcription[/bold green]", border_style="bold green")
  95. live.update(panel)
  96. rich_text_stored = rich_text
  97. def process_text(text):
  98. global recorder, full_sentences, prev_text, displayed_text
  99. recorder.post_speech_silence_duration = unknown_sentence_detection_pause
  100. text = preprocess_text(text)
  101. text = text.rstrip()
  102. if text.endswith("..."):
  103. text = text[:-2]
  104. full_sentences.append(text)
  105. prev_text = ""
  106. text_detected("")
  107. # Check if reset_event is set
  108. if reset_event.is_set():
  109. # Clear buffers
  110. full_sentences.clear()
  111. displayed_text = ""
  112. reset_event.clear()
  113. console.print("[bold magenta]Transcription buffer reset.[/bold magenta]")
  114. return
  115. # Type the finalized sentence to the active window quickly if typing is enabled
  116. try:
  117. # Release modifier keys to prevent stuck keys
  118. for key in ['ctrl', 'shift', 'alt', 'win']:
  119. keyboard.release(key)
  120. pyautogui.keyUp(key)
  121. # Use clipboard to paste text
  122. pyperclip.copy(text + ' ')
  123. pyautogui.hotkey('ctrl', 'v')
  124. except Exception as e:
  125. console.print(f"[bold red]Failed to type the text: {e}[/bold red]")
  126. # Recorder configuration
  127. recorder_config = {
  128. 'spinner': False,
  129. 'model': 'Systran/faster-distil-whisper-large-v3', # distil-medium.en or large-v2 or deepdml/faster-whisper-large-v3-turbo-ct2 or ...
  130. 'input_device_index': 1,
  131. 'realtime_model_type': 'Systran/faster-distil-whisper-large-v3', # Using the same model for realtime
  132. 'language': 'en',
  133. 'silero_sensitivity': 0.05,
  134. 'webrtc_sensitivity': 3,
  135. 'post_speech_silence_duration': unknown_sentence_detection_pause,
  136. 'min_length_of_recording': 1.1,
  137. 'min_gap_between_recordings': 0,
  138. 'enable_realtime_transcription': True,
  139. 'realtime_processing_pause': 0.02,
  140. 'on_realtime_transcription_update': text_detected,
  141. # 'on_realtime_transcription_stabilized': text_detected,
  142. 'silero_deactivity_detection': True,
  143. 'early_transcription_on_silence': 0,
  144. 'beam_size': 5,
  145. 'beam_size_realtime': 5, # Matching beam_size for consistency
  146. 'no_log_file': True,
  147. 'initial_prompt': "Use ellipses for incomplete sentences like: I went to the...",
  148. 'device': 'cuda', # Added device configuration
  149. 'compute_type': 'float16' # Added compute_type configuration
  150. }
  151. if EXTENDED_LOGGING:
  152. recorder_config['level'] = logging.DEBUG
  153. recorder = AudioToTextRecorder(**recorder_config)
  154. initial_text = Panel(Text("Say something...", style="cyan bold"), title="[bold yellow]Waiting for Input[/bold yellow]", border_style="bold yellow")
  155. live.update(initial_text)
  156. # Print available hotkeys
  157. console.print("[bold green]Available Hotkeys:[/bold green]")
  158. console.print("[bold cyan]F1[/bold cyan]: Mute Microphone")
  159. console.print("[bold cyan]F2[/bold cyan]: Unmute Microphone")
  160. console.print("[bold cyan]F3[/bold cyan]: Start Static Recording")
  161. console.print("[bold cyan]F4[/bold cyan]: Stop Static Recording")
  162. console.print("[bold cyan]F5[/bold cyan]: Reset Transcription")
  163. # Global variables for static recording
  164. static_recording_active = False
  165. static_recording_thread = None
  166. static_audio_frames = []
  167. live_recording_enabled = True # Track whether live recording was enabled before static recording
  168. # Audio settings for static recording
  169. audio_settings = {
  170. 'FORMAT': pyaudio.paInt16, # PyAudio format
  171. 'CHANNELS': 1, # Mono audio
  172. 'RATE': 16000, # Sample rate
  173. 'CHUNK': 1024 # Buffer size
  174. }
  175. # Note: The maximum recommended length of static recording is about 5 minutes.
  176. def static_recording_worker():
  177. """
  178. Worker function to record audio statically.
  179. """
  180. global static_audio_frames, static_recording_active
  181. # Set up pyaudio
  182. p = pyaudio.PyAudio()
  183. # Use the same audio format as defined in audio_settings
  184. FORMAT = audio_settings['FORMAT']
  185. CHANNELS = audio_settings['CHANNELS']
  186. RATE = audio_settings['RATE'] # Sample rate
  187. CHUNK = audio_settings['CHUNK'] # Buffer size
  188. # Open the audio stream
  189. try:
  190. stream = p.open(format=FORMAT,
  191. channels=CHANNELS,
  192. rate=RATE,
  193. input=True,
  194. frames_per_buffer=CHUNK)
  195. except Exception as e:
  196. console.print(f"[bold red]Failed to open audio stream for static recording: {e}[/bold red]")
  197. static_recording_active = False
  198. p.terminate()
  199. return
  200. while static_recording_active and not exit_event.is_set():
  201. try:
  202. data = stream.read(CHUNK)
  203. static_audio_frames.append(data)
  204. except Exception as e:
  205. console.print(f"[bold red]Error during static recording: {e}[/bold red]")
  206. break
  207. # Stop and close the stream
  208. stream.stop_stream()
  209. stream.close()
  210. p.terminate()
  211. def start_static_recording():
  212. """
  213. Starts the static audio recording.
  214. """
  215. global static_recording_active, static_recording_thread, static_audio_frames, live_recording_enabled
  216. if static_recording_active:
  217. console.print("[bold yellow]Static recording is already in progress.[/bold yellow]")
  218. return
  219. # Mute the live recording microphone
  220. live_recording_enabled = recorder.use_microphone.value
  221. if live_recording_enabled:
  222. recorder.set_microphone(False)
  223. console.print("[bold yellow]Live microphone muted during static recording.[/bold yellow]")
  224. console.print("[bold green]Starting static recording... Press F4 or F5 to stop/reset.[/bold green]")
  225. static_audio_frames = []
  226. static_recording_active = True
  227. static_recording_thread = threading.Thread(target=static_recording_worker, daemon=True)
  228. static_recording_thread.start()
  229. def stop_static_recording():
  230. """
  231. Stops the static audio recording and processes the transcription.
  232. """
  233. global static_recording_active, static_recording_thread
  234. if not static_recording_active:
  235. console.print("[bold yellow]No static recording is in progress.[/bold yellow]")
  236. return
  237. console.print("[bold green]Stopping static recording...[/bold green]")
  238. static_recording_active = False
  239. if static_recording_thread is not None:
  240. static_recording_thread.join()
  241. static_recording_thread = None
  242. # Start a new thread to process the transcription
  243. processing_thread = threading.Thread(target=process_static_transcription, daemon=True)
  244. processing_thread.start()
  245. def process_static_transcription():
  246. global static_audio_frames, live_recording_enabled
  247. if exit_event.is_set():
  248. return
  249. # Process the recorded audio
  250. console.print("[bold green]Processing static recording...[/bold green]")
  251. # Convert audio data to numpy array
  252. audio_data = b''.join(static_audio_frames)
  253. audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
  254. # Transcribe the audio data
  255. try:
  256. from faster_whisper import WhisperModel
  257. except ImportError:
  258. console.print("[bold red]faster_whisper is not installed. Please install it to use static transcription.[/bold red]")
  259. return
  260. # Load the model using recorder_config
  261. model_size = recorder_config['model']
  262. device = recorder_config['device']
  263. compute_type = recorder_config['compute_type']
  264. console.print("Loading transcription model... This may take a moment.")
  265. try:
  266. model = WhisperModel(model_size, device=device, compute_type=compute_type)
  267. except Exception as e:
  268. console.print(f"[bold red]Failed to load transcription model: {e}[/bold red]")
  269. return
  270. # Transcribe the audio
  271. try:
  272. segments, info = model.transcribe(audio_array, beam_size=recorder_config['beam_size'])
  273. transcription = ' '.join([segment.text for segment in segments]).strip()
  274. except Exception as e:
  275. console.print(f"[bold red]Error during transcription: {e}[/bold red]")
  276. return
  277. # Display the transcription
  278. console.print("Static Recording Transcription:")
  279. console.print(f"[bold cyan]{transcription}[/bold cyan]")
  280. # Type the transcription into the active window
  281. try:
  282. # Release modifier keys to prevent stuck keys
  283. for key in ['ctrl', 'shift', 'alt', 'win']:
  284. keyboard.release(key)
  285. pyautogui.keyUp(key)
  286. # Use clipboard to paste text
  287. pyperclip.copy(transcription + ' ')
  288. pyautogui.hotkey('ctrl', 'v')
  289. except Exception as e:
  290. console.print(f"[bold red]Failed to type the static transcription: {e}[/bold red]")
  291. # Unmute the live recording microphone if it was enabled before
  292. if live_recording_enabled and not exit_event.is_set():
  293. recorder.set_microphone(True)
  294. console.print("[bold yellow]Live microphone unmuted.[/bold yellow]")
  295. def reset_transcription():
  296. """
  297. Resets the transcription by flushing ongoing recordings or buffers.
  298. """
  299. global static_recording_active, static_recording_thread, static_audio_frames
  300. console.print("[bold magenta]Resetting transcription...[/bold magenta]")
  301. if static_recording_active:
  302. console.print("[bold magenta]Flushing static recording...[/bold magenta]")
  303. # Stop static recording
  304. static_recording_active = False
  305. if static_recording_thread is not None:
  306. static_recording_thread.join()
  307. static_recording_thread = None
  308. # Clear static audio frames
  309. static_audio_frames = []
  310. # Unmute microphone if it was muted during static recording
  311. if live_recording_enabled:
  312. recorder.set_microphone(True)
  313. console.print("[bold yellow]Live microphone unmuted after reset.[/bold yellow]")
  314. elif recorder.use_microphone.value:
  315. # Live transcription is active and microphone is not muted
  316. console.print("[bold magenta]Resetting live transcription buffer...[/bold magenta]")
  317. reset_event.set()
  318. else:
  319. # Microphone is muted; nothing to reset
  320. console.print("[bold yellow]Microphone is muted. Nothing to reset.[/bold yellow]")
  321. # Hotkey Callback Functions
  322. def mute_microphone():
  323. recorder.set_microphone(False)
  324. console.print("[bold red]Microphone muted.[/bold red]")
  325. def unmute_microphone():
  326. recorder.set_microphone(True)
  327. console.print("[bold green]Microphone unmuted.[/bold green]")
  328. # Start the transcription loop in a separate thread
  329. def transcription_loop():
  330. try:
  331. while not exit_event.is_set():
  332. recorder.text(process_text)
  333. except Exception as e:
  334. console.print(f"[bold red]Error in transcription loop: {e}[/bold red]")
  335. finally:
  336. # Do not call sys.exit() here
  337. pass
  338. # Start the transcription loop thread
  339. transcription_thread = threading.Thread(target=transcription_loop, daemon=True)
  340. transcription_thread.start()
  341. # Define the hotkey combinations and their corresponding functions
  342. keyboard.add_hotkey('F1', mute_microphone, suppress=True)
  343. keyboard.add_hotkey('F2', unmute_microphone, suppress=True)
  344. keyboard.add_hotkey('F3', start_static_recording, suppress=True)
  345. keyboard.add_hotkey('F4', stop_static_recording, suppress=True)
  346. keyboard.add_hotkey('F5', reset_transcription, suppress=True)
  347. # Keep the main thread running and handle graceful exit
  348. try:
  349. keyboard.wait() # Waits indefinitely, until a hotkey triggers an exit or Ctrl+C
  350. except KeyboardInterrupt:
  351. console.print("[bold yellow]KeyboardInterrupt received. Exiting...[/bold yellow]")
  352. finally:
  353. # Signal threads to exit
  354. exit_event.set()
  355. # Reset transcription if needed
  356. reset_transcription()
  357. # Stop the recorder
  358. try:
  359. if hasattr(recorder, 'stop'):
  360. recorder.stop()
  361. elif hasattr(recorder, 'close'):
  362. recorder.close()
  363. except Exception as e:
  364. console.print(f"[bold red]Error stopping recorder: {e}[/bold red]")
  365. # Allow some time for threads to finish
  366. time.sleep(1)
  367. # Wait for transcription_thread to finish
  368. if transcription_thread.is_alive():
  369. transcription_thread.join(timeout=5)
  370. # Stop the Live console
  371. live.stop()
  372. console.print("[bold red]Exiting gracefully...[/bold red]")
  373. sys.exit(0)