# realtimestt_test.py
  1. EXTENDED_LOGGING = False
  2. # set to 0 to deactivate writing to keyboard
  3. # try lower values like 0.002 (fast) first, take higher values like 0.05 in case it fails
  4. WRITE_TO_KEYBOARD_INTERVAL = 0.002
  5. if __name__ == '__main__':
  6. from install_packages import check_and_install_packages
  7. check_and_install_packages([
  8. {
  9. 'import_name': 'rich',
  10. },
  11. {
  12. 'import_name': 'pyautogui',
  13. }
  14. ])
  15. if EXTENDED_LOGGING:
  16. import logging
  17. logging.basicConfig(level=logging.DEBUG)
  18. from rich.console import Console
  19. from rich.live import Live
  20. from rich.text import Text
  21. from rich.panel import Panel
  22. from rich.spinner import Spinner
  23. from rich.progress import Progress, SpinnerColumn, TextColumn
  24. console = Console()
  25. console.print("System initializing, please wait")
  26. import os
  27. import sys
  28. from RealtimeSTT import AudioToTextRecorder
  29. from colorama import Fore, Style
  30. import colorama
  31. import pyautogui
  32. if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
  33. from torchaudio._extension.utils import _init_dll_path
  34. _init_dll_path()
  35. colorama.init()
  36. # Initialize Rich Console and Live
  37. live = Live(console=console, refresh_per_second=10, screen=False)
  38. live.start()
  39. full_sentences = []
  40. rich_text_stored = ""
  41. recorder = None
  42. displayed_text = "" # Used for tracking text that was already displayed
  43. end_of_sentence_detection_pause = 0.45
  44. unknown_sentence_detection_pause = 0.7
  45. mid_sentence_detection_pause = 2.0
  46. def clear_console():
  47. os.system('clear' if os.name == 'posix' else 'cls')
  48. prev_text = ""
  49. def preprocess_text(text):
  50. # Remove leading whitespaces
  51. text = text.lstrip()
  52. # Remove starting ellipses if present
  53. if text.startswith("..."):
  54. text = text[3:]
  55. # Remove any leading whitespaces again after ellipses removal
  56. text = text.lstrip()
  57. # Uppercase the first letter
  58. if text:
  59. text = text[0].upper() + text[1:]
  60. return text
  61. def text_detected(text):
  62. global prev_text, displayed_text, rich_text_stored
  63. text = preprocess_text(text)
  64. sentence_end_marks = ['.', '!', '?', '。']
  65. if text.endswith("..."):
  66. recorder.post_speech_silence_duration = mid_sentence_detection_pause
  67. elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
  68. recorder.post_speech_silence_duration = end_of_sentence_detection_pause
  69. else:
  70. recorder.post_speech_silence_duration = unknown_sentence_detection_pause
  71. prev_text = text
  72. # Build Rich Text with alternating colors
  73. rich_text = Text()
  74. for i, sentence in enumerate(full_sentences):
  75. if i % 2 == 0:
  76. #rich_text += Text(sentence, style="bold yellow") + Text(" ")
  77. rich_text += Text(sentence, style="yellow") + Text(" ")
  78. else:
  79. rich_text += Text(sentence, style="cyan") + Text(" ")
  80. # If the current text is not a sentence-ending, display it in real-time
  81. if text:
  82. rich_text += Text(text, style="bold yellow")
  83. new_displayed_text = rich_text.plain
  84. if new_displayed_text != displayed_text:
  85. displayed_text = new_displayed_text
  86. panel = Panel(rich_text, title="[bold green]Live Transcription[/bold green]", border_style="bold green")
  87. live.update(panel)
  88. rich_text_stored = rich_text
  89. def process_text(text):
  90. global recorder, full_sentences, prev_text
  91. recorder.post_speech_silence_duration = unknown_sentence_detection_pause
  92. text = preprocess_text(text)
  93. text = text.rstrip()
  94. if text.endswith("..."):
  95. text = text[:-2]
  96. if not text:
  97. return
  98. full_sentences.append(text)
  99. prev_text = ""
  100. text_detected("")
  101. if WRITE_TO_KEYBOARD_INTERVAL:
  102. pyautogui.write(f"{text} ", interval=WRITE_TO_KEYBOARD_INTERVAL) # Adjust interval as needed
  103. # Recorder configuration
  104. recorder_config = {
  105. 'spinner': False,
  106. 'model': 'large-v2', # or large-v2 or deepdml/faster-whisper-large-v3-turbo-ct2 or ...
  107. # 'input_device_index': 1,
  108. 'realtime_model_type': 'tiny.en', # or small.en or distil-small.en or ...
  109. 'language': 'en',
  110. 'silero_sensitivity': 0.05,
  111. 'webrtc_sensitivity': 3,
  112. 'post_speech_silence_duration': unknown_sentence_detection_pause,
  113. 'min_length_of_recording': 1.1,
  114. 'min_gap_between_recordings': 0,
  115. 'enable_realtime_transcription': True,
  116. 'realtime_processing_pause': 0.02,
  117. 'on_realtime_transcription_update': text_detected,
  118. #'on_realtime_transcription_stabilized': text_detected,
  119. 'silero_deactivity_detection': True,
  120. 'early_transcription_on_silence': 0,
  121. 'beam_size': 5,
  122. 'beam_size_realtime': 3,
  123. 'no_log_file': True,
  124. 'initial_prompt': (
  125. "End incomplete sentences with ellipses.\n"
  126. "Examples:\n"
  127. "Complete: The sky is blue.\n"
  128. "Incomplete: When the sky...\n"
  129. "Complete: She walked home.\n"
  130. "Incomplete: Because he...\n"
  131. )
  132. }
  133. if EXTENDED_LOGGING:
  134. recorder_config['level'] = logging.DEBUG
  135. recorder = AudioToTextRecorder(**recorder_config)
  136. initial_text = Panel(Text("Say something...", style="cyan bold"), title="[bold yellow]Waiting for Input[/bold yellow]", border_style="bold yellow")
  137. live.update(initial_text)
  138. try:
  139. while True:
  140. recorder.text(process_text)
  141. except KeyboardInterrupt:
  142. live.stop()
  143. console.print("[bold red]Transcription stopped by user. Exiting...[/bold red]")
  144. exit(0)