# realtimestt_test_stereomix.py
  1. EXTENDED_LOGGING = False
  2. def main():
  3. import subprocess
  4. import sys
  5. def install_rich():
  6. subprocess.check_call([sys.executable, "-m", "pip", "install", "rich"])
  7. try:
  8. import rich
  9. except ImportError:
  10. user_input = input("This demo needs the 'rich' library, which is not installed.\nDo you want to install it now? (y/n): ")
  11. if user_input.lower() == 'y':
  12. try:
  13. install_rich()
  14. import rich
  15. print("Successfully installed 'rich'.")
  16. except Exception as e:
  17. print(f"An error occurred while installing 'rich': {e}")
  18. sys.exit(1)
  19. else:
  20. print("The program requires the 'rich' library to run. Exiting...")
  21. sys.exit(1)
  22. if EXTENDED_LOGGING:
  23. import logging
  24. logging.basicConfig(level=logging.DEBUG)
  25. import os
  26. import sys
  27. import threading
  28. import time
  29. import pyaudio
  30. from rich.console import Console
  31. from rich.live import Live
  32. from rich.text import Text
  33. from rich.panel import Panel
  34. from rich.spinner import Spinner
  35. from rich.progress import Progress, SpinnerColumn, TextColumn
  36. from colorama import Fore, Style, init as colorama_init
  37. from RealtimeSTT import AudioToTextRecorder
  38. # Configuration Constants
  39. LOOPBACK_DEVICE_NAME = "stereomix"
  40. LOOPBACK_DEVICE_HOST_API = 0
  41. BUFFER_SIZE = 512
  42. AUDIO_FORMAT = pyaudio.paInt16
  43. CHANNELS = 1
  44. RATE = 16000
  45. console = Console()
  46. console.print("System initializing, please wait")
  47. colorama_init()
  48. # Initialize Rich Console and Live
  49. live = Live(console=console, refresh_per_second=10, screen=False)
  50. live.start()
  51. full_sentences = []
  52. rich_text_stored = ""
  53. recorder = None
  54. displayed_text = "" # Used for tracking text that was already displayed
  55. end_of_sentence_detection_pause = 0.2
  56. unknown_sentence_detection_pause = 0.5
  57. mid_sentence_detection_pause = 1
  58. prev_text = ""
  59. def clear_console():
  60. os.system('clear' if os.name == 'posix' else 'cls')
  61. def preprocess_text(text):
  62. # Remove leading whitespaces
  63. text = text.lstrip()
  64. # Remove starting ellipses if present
  65. if text.startswith("..."):
  66. text = text[3:]
  67. # Remove any leading whitespaces again after ellipses removal
  68. text = text.lstrip()
  69. # Uppercase the first letter
  70. if text:
  71. text = text[0].upper() + text[1:]
  72. return text
  73. def text_detected(text):
  74. nonlocal prev_text, displayed_text, rich_text_stored
  75. text = preprocess_text(text)
  76. sentence_end_marks = ['.', '!', '?', '。']
  77. midsentence_marks = ['…', '-', '(']
  78. if text.endswith("...") or text and text[-1] in midsentence_marks:
  79. recorder.post_speech_silence_duration = mid_sentence_detection_pause
  80. elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
  81. recorder.post_speech_silence_duration = end_of_sentence_detection_pause
  82. else:
  83. recorder.post_speech_silence_duration = unknown_sentence_detection_pause
  84. prev_text = text
  85. # Build Rich Text with alternating colors
  86. rich_text = Text()
  87. for i, sentence in enumerate(full_sentences):
  88. if i % 2 == 0:
  89. rich_text += Text(sentence, style="yellow") + Text(" ")
  90. else:
  91. rich_text += Text(sentence, style="cyan") + Text(" ")
  92. # If the current text is not a sentence-ending, display it in real-time
  93. if text:
  94. rich_text += Text(text, style="bold yellow")
  95. new_displayed_text = rich_text.plain
  96. if new_displayed_text != displayed_text:
  97. displayed_text = new_displayed_text
  98. panel = Panel(rich_text, title="[bold green]Live Transcription[/bold green]", border_style="bold green")
  99. live.update(panel)
  100. rich_text_stored = rich_text
  101. def process_text(text):
  102. nonlocal recorder, full_sentences, prev_text
  103. recorder.post_speech_silence_duration = unknown_sentence_detection_pause
  104. text = preprocess_text(text)
  105. text = text.rstrip()
  106. if text.endswith("..."):
  107. text = text[:-2] # Remove ellipsis
  108. full_sentences.append(text)
  109. prev_text = ""
  110. text_detected("")
  111. # Recorder configuration
  112. recorder_config = {
  113. 'spinner': False,
  114. 'use_microphone': False,
  115. 'model': 'large-v2',
  116. 'input_device_index': None, # To be set after finding the device
  117. 'realtime_model_type': 'tiny.en',
  118. 'language': 'en',
  119. 'silero_sensitivity': 0.05,
  120. 'webrtc_sensitivity': 3,
  121. 'post_speech_silence_duration': unknown_sentence_detection_pause,
  122. 'min_length_of_recording': 2.0,
  123. 'min_gap_between_recordings': 0,
  124. 'enable_realtime_transcription': True,
  125. 'realtime_processing_pause': 0.01,
  126. 'on_realtime_transcription_update': text_detected,
  127. 'silero_deactivity_detection': False,
  128. 'early_transcription_on_silence': 0,
  129. 'beam_size': 5,
  130. 'beam_size_realtime': 1,
  131. 'no_log_file': True,
  132. 'initial_prompt': "Use ellipses for incomplete sentences like: I went to the..."
  133. }
  134. if EXTENDED_LOGGING:
  135. recorder_config['level'] = logging.DEBUG
  136. # Initialize PyAudio
  137. audio = pyaudio.PyAudio()
  138. def find_stereo_mix_index():
  139. nonlocal audio
  140. devices_info = ""
  141. for i in range(audio.get_device_count()):
  142. dev = audio.get_device_info_by_index(i)
  143. devices_info += f"{dev['index']}: {dev['name']} (hostApi: {dev['hostApi']})\n"
  144. if (LOOPBACK_DEVICE_NAME.lower() in dev['name'].lower()
  145. and dev['hostApi'] == LOOPBACK_DEVICE_HOST_API):
  146. return dev['index'], devices_info
  147. return None, devices_info
  148. device_index, devices_info = find_stereo_mix_index()
  149. if device_index is None:
  150. live.stop()
  151. console.print("[bold red]Stereo Mix device not found. Available audio devices are:\n[/bold red]")
  152. console.print(devices_info, style="red")
  153. audio.terminate()
  154. sys.exit(1)
  155. else:
  156. recorder_config['input_device_index'] = device_index
  157. console.print(f"Using audio device index {device_index} for Stereo Mix.", style="green")
  158. # Initialize the recorder
  159. recorder = AudioToTextRecorder(**recorder_config)
  160. # Initialize Live Display with waiting message
  161. initial_text = Panel(Text("Say something...", style="cyan bold"), title="[bold yellow]Waiting for Input[/bold yellow]", border_style="bold yellow")
  162. live.update(initial_text)
  163. # Define the recording thread
  164. def recording_thread():
  165. nonlocal recorder
  166. stream = audio.open(format=AUDIO_FORMAT,
  167. channels=CHANNELS,
  168. rate=RATE,
  169. input=True,
  170. frames_per_buffer=BUFFER_SIZE,
  171. input_device_index=recorder_config['input_device_index'])
  172. try:
  173. while not stop_event.is_set():
  174. data = stream.read(BUFFER_SIZE, exception_on_overflow=False)
  175. recorder.feed_audio(data)
  176. except Exception as e:
  177. console.print(f"[bold red]Error in recording thread: {e}[/bold red]")
  178. finally:
  179. console.print(f"[bold red]Stopping stream[/bold red]")
  180. stream.stop_stream()
  181. stream.close()
  182. # Define the stop event
  183. stop_event = threading.Event()
  184. # Start the recording thread
  185. thread = threading.Thread(target=recording_thread, daemon=True)
  186. thread.start()
  187. try:
  188. while True:
  189. recorder.text(process_text)
  190. except KeyboardInterrupt:
  191. console.print("[bold red]\nTranscription stopped by user. Exiting...[/bold red]")
  192. finally:
  193. print("live stop")
  194. live.stop()
  195. print("setting stop event")
  196. stop_event.set()
  197. print("thread join")
  198. thread.join()
  199. print("recorder stop")
  200. recorder.stop()
  201. print("audio terminate")
  202. audio.terminate()
  203. print("sys exit ")
  204. sys.exit(0)
  205. if __name__ == '__main__':
  206. main()