# realtimestt_test.py (5.9 KB)
  # Set to True for DEBUG-level logging: the script then configures the root
  # logger with logging.DEBUG and adds level=logging.DEBUG to the recorder
  # configuration.
  EXTENDED_LOGGING = False

  if __name__ == '__main__':
      # Bootstrap: make sure the 'rich' package can be imported, offering to
      # install it with pip when it is missing.
      import importlib
      import subprocess
      import sys

      def install_rich():
          """Install 'rich' with pip for the interpreter running this script.

          Raises:
              subprocess.CalledProcessError: if pip exits with a non-zero status.
          """
          subprocess.check_call([sys.executable, "-m", "pip", "install", "rich"])

      try:
          import rich
      except ImportError:
          user_input = input("This demo needs the 'rich' library, which is not installed.\nDo you want to install it now? (y/n): ")
          # Ignore surrounding whitespace so an answer such as " y" still counts.
          if user_input.strip().lower() == 'y':
              try:
                  install_rich()
                  # The package was created after the interpreter started, so
                  # clear the import system's caches before importing it.
                  importlib.invalidate_caches()
                  import rich
                  print("Successfully installed 'rich'.")
              except Exception as e:
                  # Script-level boundary: report the failure and stop.
                  print(f"An error occurred while installing 'rich': {e}")
                  sys.exit(1)
          else:
              print("The program requires the 'rich' library to run. Exiting...")
              sys.exit(1)
  # Optional verbose diagnostics: configure the root logger at DEBUG level.
  if EXTENDED_LOGGING:
      import logging
      logging.basicConfig(level=logging.DEBUG)

  from rich.console import Console
  from rich.live import Live
  from rich.text import Text
  from rich.panel import Panel
  from rich.spinner import Spinner
  from rich.progress import Progress, SpinnerColumn, TextColumn

  # The start-up notice is printed before the remaining imports below run.
  # NOTE(review): presumably because importing RealtimeSTT is slow - confirm.
  console = Console()
  # console.print("[bold yellow]System initializing, please wait...[/bold yellow]")
  console.print("System initializing, please wait")

  import os
  import sys
  from RealtimeSTT import AudioToTextRecorder
  from colorama import Fore, Style
  import colorama

  # On Windows with Python >= 3.8, call torchaudio's _init_dll_path() helper.
  # NOTE(review): this is a private torchaudio function (leading underscore);
  # presumably it registers DLL search paths - confirm it exists in the
  # installed torchaudio version.
  if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
      from torchaudio._extension.utils import _init_dll_path
      _init_dll_path()

  colorama.init()

  # Initialize Rich Console and Live: the Live display renders on `console`,
  # refreshes 10 times per second and does not use the alternate screen.
  live = Live(console=console, refresh_per_second=10, screen=False)
  live.start()
  # Shared state used by the transcription callbacks.
  full_sentences = []    # Finalized sentences, appended by process_text()
  rich_text_stored = ""  # Last rich Text that text_detected() sent to the Live display
  recorder = None        # Replaced by the AudioToTextRecorder instance once it is created
  displayed_text = ""    # Used for tracking text that was already displayed

  # Values assigned to recorder.post_speech_silence_duration, selected by how
  # the latest realtime text ends (presumably seconds - confirm against the
  # recorder's documentation).
  end_of_sentence_detection_pause = 0.45  # Current and previous text both end with a sentence mark
  unknown_sentence_detection_pause = 0.7  # Fallback when neither other rule matches
  mid_sentence_detection_pause = 2.0      # Current text ends with "..."
  def clear_console():
      """Clear the terminal by running 'clear' on POSIX systems and 'cls' otherwise."""
      os.system('clear' if os.name == 'posix' else 'cls')

  # Previous realtime text seen by text_detected(); process_text() resets it to "".
  prev_text = ""
  def preprocess_text(text):
      """Normalize the start of a transcription string.

      Leading whitespace is stripped, one leading "..." is removed (followed by
      any whitespace after it), and the first character is upper-cased.

      Args:
          text: The transcription string to clean up.

      Returns:
          The cleaned string; an empty string if nothing remains.
      """
      # Remove leading whitespaces
      text = text.lstrip()

      # Remove starting ellipses if present
      if text.startswith("..."):
          text = text[3:]

      # Remove any leading whitespaces again after ellipses removal
      text = text.lstrip()

      # Uppercase the first letter
      if text:
          text = text[0].upper() + text[1:]

      return text
  def text_detected(text):
      """Handle a realtime transcription update.

      Adjusts recorder.post_speech_silence_duration according to how the
      cleaned text ends, remembers the text in `prev_text`, and redraws the
      live panel when the plain text to display has changed.

      Args:
          text: The current realtime transcription; may be an empty string.
      """
      global prev_text, displayed_text, rich_text_stored

      text = preprocess_text(text)

      # Choose the silence duration from the way the text ends. A trailing
      # "..." is tested first because it would also match the '.' mark.
      sentence_end_marks = ('.', '!', '?', '。')
      if text.endswith("..."):
          recorder.post_speech_silence_duration = mid_sentence_detection_pause
      elif text.endswith(sentence_end_marks) and prev_text.endswith(sentence_end_marks):
          # Two consecutive updates both end with a sentence mark.
          recorder.post_speech_silence_duration = end_of_sentence_detection_pause
      else:
          recorder.post_speech_silence_duration = unknown_sentence_detection_pause

      prev_text = text

      # Build Rich Text with alternating colors
      rich_text = Text()
      for i, sentence in enumerate(full_sentences):
          style = "yellow" if i % 2 == 0 else "cyan"
          rich_text += Text(sentence, style=style) + Text(" ")

      # Append the in-progress text, if any, after the finalized sentences
      if text:
          rich_text += Text(text, style="bold yellow")

      new_displayed_text = rich_text.plain

      # Redraw only when the visible plain text actually changed.
      if new_displayed_text != displayed_text:
          displayed_text = new_displayed_text
          panel = Panel(rich_text, title="[bold green]Live Transcription[/bold green]", border_style="bold green")
          live.update(panel)
          rich_text_stored = rich_text
  def process_text(text):
      """Store a finalized transcription and refresh the live display.

      Resets recorder.post_speech_silence_duration to the fallback value,
      cleans the text, appends it to `full_sentences` when it is not empty,
      clears `prev_text` and calls text_detected("") to redraw the panel.

      Args:
          text: The final transcription passed in by recorder.text().
      """
      global prev_text

      recorder.post_speech_silence_duration = unknown_sentence_detection_pause

      text = preprocess_text(text).rstrip()
      if text.endswith("..."):
          # Drop two of the three dots so the stored sentence ends with a
          # single period.
          text = text[:-2]

      # An empty entry would only add a stray space to the display and flip
      # the alternating colors of the sentences that follow, so skip it.
      if text:
          full_sentences.append(text)

      prev_text = ""
      text_detected("")
  # Recorder configuration: keyword arguments passed to AudioToTextRecorder.
  recorder_config = {
      'spinner': False,
      'model': 'distil-medium.en',  # or large-v2 or deepdml/faster-whisper-large-v3-turbo-ct2 or ...
      # NOTE(review): hard-coded audio input device index; presumably needs
      # adjusting on machines whose microphone is not device 1 - confirm.
      'input_device_index': 1,
      'realtime_model_type': 'tiny.en',  # or small.en or distil-small.en or ...
      'language': 'en',
      'silero_sensitivity': 0.05,
      'webrtc_sensitivity': 3,
      # Initial value; text_detected() and process_text() change it at runtime.
      'post_speech_silence_duration': unknown_sentence_detection_pause,
      'min_length_of_recording': 1.1,
      'min_gap_between_recordings': 0,
      'enable_realtime_transcription': True,
      'realtime_processing_pause': 0.02,
      'on_realtime_transcription_update': text_detected,
      #'on_realtime_transcription_stabilized': text_detected,
      'silero_deactivity_detection': True,
      'early_transcription_on_silence': 0,
      'beam_size': 5,
      'beam_size_realtime': 3,
      'no_log_file': True,
      'initial_prompt': "Use ellipses for incomplete sentences like: I went to the..."
  }

  # With extended logging enabled, request DEBUG-level output from the recorder too.
  if EXTENDED_LOGGING:
      recorder_config['level'] = logging.DEBUG

  recorder = AudioToTextRecorder(**recorder_config)
  # Show a placeholder panel until the first transcription arrives.
  initial_text = Panel(Text("Say something...", style="cyan bold"), title="[bold yellow]Waiting for Input[/bold yellow]", border_style="bold yellow")
  live.update(initial_text)

  # Main loop: hand every finalized transcription to process_text() until the
  # user presses Ctrl+C.
  try:
      while True:
          recorder.text(process_text)
  except KeyboardInterrupt:
      # Stop the Live display first so the farewell message is printed below it.
      live.stop()
      console.print("[bold red]Transcription stopped by user. Exiting...[/bold red]")
      # sys.exit() instead of the site-provided exit(), which is intended for
      # the interactive shell and is not available in every environment.
      sys.exit(0)
  finally:
      # Also stop the Live display when the loop ends with any other exception;
      # calling stop() again after the handler above has no further effect.
      live.stop()