realtimestt_test_stereomix.py

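"""Test script: feed looped-back system audio from a "Stereo Mix" device into
RealtimeSTT and render the live transcription in a Rich panel.
"""
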
EXTENDED_LOGGING = False


def main():

    from install_packages import check_and_install_packages
    check_and_install_packages([
        {
            'import_name': 'rich',
        }
    ])
    if EXTENDED_LOGGING:
        import logging
        logging.basicConfig(level=logging.DEBUG)

    import os
    import sys
    import threading
    import time
    import pyaudio
    from rich.console import Console
    from rich.live import Live
    from rich.text import Text
    from rich.panel import Panel
    from rich.spinner import Spinner
    from rich.progress import Progress, SpinnerColumn, TextColumn
    from colorama import Fore, Style, init as colorama_init
    from RealtimeSTT import AudioToTextRecorder
    # Configuration constants
    LOOPBACK_DEVICE_NAME = "stereomix"  # substring used to locate the loopback device
    LOOPBACK_DEVICE_HOST_API = 0        # host API index the device is expected to use
    BUFFER_SIZE = 512                   # frames read from the audio stream per chunk
    AUDIO_FORMAT = pyaudio.paInt16      # 16-bit PCM samples
    CHANNELS = 1                        # mono capture
    RATE = 16000                        # sample rate in Hz
    console = Console()
    console.print("System initializing, please wait")

    colorama_init()

    # Initialize Rich Console and Live
    live = Live(console=console, refresh_per_second=10, screen=False)
    live.start()

    full_sentences = []
    rich_text_stored = ""
    recorder = None
    displayed_text = ""  # Used for tracking text that was already displayed
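
    # Post-speech silence thresholds (seconds) used to adapt end-of-utterance detection:
    # short once the text already looks like a finished sentence, long when it looks
    # mid-sentence, and a middle value when the ending is ambiguous.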
    end_of_sentence_detection_pause = 0.2
    unknown_sentence_detection_pause = 0.5
    mid_sentence_detection_pause = 1

    prev_text = ""
    def clear_console():
        os.system('clear' if os.name == 'posix' else 'cls')

    def preprocess_text(text):
        # Remove leading whitespace
        text = text.lstrip()

        # Remove a starting ellipsis if present
        if text.startswith("..."):
            text = text[3:]

        # Remove any leading whitespace again after ellipsis removal
        text = text.lstrip()

        # Uppercase the first letter
        if text:
            text = text[0].upper() + text[1:]

        return text
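
    # Realtime callback: tune the recorder's end-of-speech pause based on how the
    # partial transcript ends, then re-render the live panel with the updated text.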
    def text_detected(text):
        nonlocal prev_text, displayed_text, rich_text_stored

        text = preprocess_text(text)

        sentence_end_marks = ['.', '!', '?', '。']
        midsentence_marks = ['…', '-', '(']
        if text.endswith("...") or (text and text[-1] in midsentence_marks):
            recorder.post_speech_silence_duration = mid_sentence_detection_pause
        elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
            recorder.post_speech_silence_duration = end_of_sentence_detection_pause
        else:
            recorder.post_speech_silence_duration = unknown_sentence_detection_pause

        prev_text = text

        # Build Rich Text with alternating colors for completed sentences
        rich_text = Text()
        for i, sentence in enumerate(full_sentences):
            if i % 2 == 0:
                rich_text += Text(sentence, style="yellow") + Text(" ")
            else:
                rich_text += Text(sentence, style="cyan") + Text(" ")

        # Append the current in-progress text so it is shown in real time
        if text:
            rich_text += Text(text, style="bold yellow")

        new_displayed_text = rich_text.plain

        if new_displayed_text != displayed_text:
            displayed_text = new_displayed_text
            panel = Panel(rich_text, title="[bold green]Live Transcription[/bold green]", border_style="bold green")
            live.update(panel)
            rich_text_stored = rich_text
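
    # Final-transcript callback: store the finished sentence and reset the display state.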
    def process_text(text):
        nonlocal recorder, full_sentences, prev_text
        recorder.post_speech_silence_duration = unknown_sentence_detection_pause
        text = preprocess_text(text)
        text = text.rstrip()
        if text.endswith("..."):
            text = text[:-2]  # Trim the trailing ellipsis down to a single period

        full_sentences.append(text)
        prev_text = ""
        text_detected("")
    # Recorder configuration
    recorder_config = {
        'spinner': False,
        'use_microphone': False,  # audio is pushed in via feed_audio() from the loopback capture thread below
        'model': 'large-v2',
        'input_device_index': None,  # To be set after finding the device
        'realtime_model_type': 'tiny.en',
        'language': 'en',
        'silero_sensitivity': 0.05,
        'webrtc_sensitivity': 3,
        'post_speech_silence_duration': unknown_sentence_detection_pause,
        'min_length_of_recording': 2.0,
        'min_gap_between_recordings': 0,
        'enable_realtime_transcription': True,
        'realtime_processing_pause': 0.01,
        'on_realtime_transcription_update': text_detected,
        'silero_deactivity_detection': False,
        'early_transcription_on_silence': 0,
        'beam_size': 5,
        'beam_size_realtime': 1,
        'no_log_file': True,
        'initial_prompt': "Use ellipses for incomplete sentences like: I went to the..."
    }
    if EXTENDED_LOGGING:
        recorder_config['level'] = logging.DEBUG

    # Initialize PyAudio
    audio = pyaudio.PyAudio()
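
    # Scan the PyAudio devices for one whose name matches LOOPBACK_DEVICE_NAME on the
    # expected host API; also collect a listing of all devices for error reporting.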
    def find_stereo_mix_index():
        nonlocal audio
        devices_info = ""
        for i in range(audio.get_device_count()):
            dev = audio.get_device_info_by_index(i)
            devices_info += f"{dev['index']}: {dev['name']} (hostApi: {dev['hostApi']})\n"

            if (LOOPBACK_DEVICE_NAME.lower() in dev['name'].lower()
                    and dev['hostApi'] == LOOPBACK_DEVICE_HOST_API):
                return dev['index'], devices_info

        return None, devices_info
    device_index, devices_info = find_stereo_mix_index()
    if device_index is None:
        live.stop()
        console.print("[bold red]Stereo Mix device not found. Available audio devices are:\n[/bold red]")
        console.print(devices_info, style="red")
        audio.terminate()
        sys.exit(1)
    else:
        recorder_config['input_device_index'] = device_index
        console.print(f"Using audio device index {device_index} for Stereo Mix.", style="green")
    # Initialize the recorder
    recorder = AudioToTextRecorder(**recorder_config)

    # Initialize the live display with a waiting message
    initial_text = Panel(Text("Say something...", style="cyan bold"), title="[bold yellow]Waiting for Input[/bold yellow]", border_style="bold yellow")
    live.update(initial_text)
    # Recording thread: read raw frames from the loopback device and feed them to the recorder
    def recording_thread():
        nonlocal recorder
        stream = audio.open(format=AUDIO_FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=BUFFER_SIZE,
                            input_device_index=recorder_config['input_device_index'])
        try:
            while not stop_event.is_set():
                data = stream.read(BUFFER_SIZE, exception_on_overflow=False)
                recorder.feed_audio(data)
        except Exception as e:
            console.print(f"[bold red]Error in recording thread: {e}[/bold red]")
        finally:
            console.print("[bold red]Stopping stream[/bold red]")
            stream.stop_stream()
            stream.close()
    # Define the stop event
    stop_event = threading.Event()

    # Start the recording thread
    thread = threading.Thread(target=recording_thread, daemon=True)
    thread.start()
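
    # Main loop: wait for each finalized utterance and hand its transcript to process_text.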
    try:
        while True:
            recorder.text(process_text)
    except KeyboardInterrupt:
        console.print("[bold red]\nTranscription stopped by user. Exiting...[/bold red]")
    finally:
        print("live stop")
        live.stop()

        print("setting stop event")
        stop_event.set()

        print("thread join")
        thread.join()

        print("recorder stop")
        recorder.stop()

        print("audio terminate")
        audio.terminate()

        print("sys exit")
        sys.exit(0)


if __name__ == '__main__':
    main()