# realtimestt_test.py
# Set to True for DEBUG-level logging: configures the root logger below and is
# read again later in the file to set the recorder's 'level' option.
EXTENDED_LOGGING = False

if __name__ == '__main__':

    # Helper from the local install_packages module (not shown here);
    # presumably ensures `rich` is installed before it is imported below.
    from install_packages import check_and_install_packages
    check_and_install_packages([
        {
            'import_name': 'rich',
        }
    ])

    if EXTENDED_LOGGING:
        import logging
        logging.basicConfig(level=logging.DEBUG)

    from rich.console import Console
    from rich.live import Live
    from rich.text import Text
    from rich.panel import Panel
    from rich.spinner import Spinner
    from rich.progress import Progress, SpinnerColumn, TextColumn

    console = Console()
    # console.print("[bold yellow]System initializing, please wait...[/bold yellow]")
    console.print("System initializing, please wait")

    # These imports run only after the status message above has been printed.
    import os
    import sys
    from RealtimeSTT import AudioToTextRecorder
    from colorama import Fore, Style
    import colorama

    # Windows only (Python >= 3.8): call torchaudio's private DLL-path helper.
    # NOTE(review): presumably needed so torchaudio's DLLs can be located — confirm.
    if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
        from torchaudio._extension.utils import _init_dll_path
        _init_dll_path()

    colorama.init()

    # Initialize Rich Console and Live
    live = Live(console=console, refresh_per_second=10, screen=False)
    live.start()

    full_sentences = []  # finalized sentences, appended by process_text()
    rich_text_stored = ""  # overwritten by text_detected() with the last rendered Text
    recorder = None  # replaced by the AudioToTextRecorder instance later in the file
    displayed_text = ""  # Used for tracking text that was already displayed

    # Values text_detected() assigns to recorder.post_speech_silence_duration:
    #   end_of_sentence  - current and previous realtime text both end in a
    #                      sentence mark
    #   mid_sentence     - current realtime text ends with "..."
    #   unknown          - every other case (also the recorder's initial value)
    # NOTE(review): units are seconds per the RealtimeSTT docs — confirm.
    end_of_sentence_detection_pause = 0.45
    unknown_sentence_detection_pause = 0.7
    mid_sentence_detection_pause = 2.0
  40. def clear_console():
  41. os.system('clear' if os.name == 'posix' else 'cls')
# Previous realtime text seen by text_detected(); process_text() resets it to "".
prev_text = ""
  43. def preprocess_text(text):
  44. # Remove leading whitespaces
  45. text = text.lstrip()
  46. # Remove starting ellipses if present
  47. if text.startswith("..."):
  48. text = text[3:]
  49. # Remove any leading whitespaces again after ellipses removal
  50. text = text.lstrip()
  51. # Uppercase the first letter
  52. if text:
  53. text = text[0].upper() + text[1:]
  54. return text
  55. def text_detected(text):
  56. global prev_text, displayed_text, rich_text_stored
  57. text = preprocess_text(text)
  58. sentence_end_marks = ['.', '!', '?', '。']
  59. if text.endswith("..."):
  60. recorder.post_speech_silence_duration = mid_sentence_detection_pause
  61. elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
  62. recorder.post_speech_silence_duration = end_of_sentence_detection_pause
  63. else:
  64. recorder.post_speech_silence_duration = unknown_sentence_detection_pause
  65. prev_text = text
  66. # Build Rich Text with alternating colors
  67. rich_text = Text()
  68. for i, sentence in enumerate(full_sentences):
  69. if i % 2 == 0:
  70. #rich_text += Text(sentence, style="bold yellow") + Text(" ")
  71. rich_text += Text(sentence, style="yellow") + Text(" ")
  72. else:
  73. rich_text += Text(sentence, style="cyan") + Text(" ")
  74. # If the current text is not a sentence-ending, display it in real-time
  75. if text:
  76. rich_text += Text(text, style="bold yellow")
  77. new_displayed_text = rich_text.plain
  78. if new_displayed_text != displayed_text:
  79. displayed_text = new_displayed_text
  80. panel = Panel(rich_text, title="[bold green]Live Transcription[/bold green]", border_style="bold green")
  81. live.update(panel)
  82. rich_text_stored = rich_text
  83. def process_text(text):
  84. global recorder, full_sentences, prev_text
  85. recorder.post_speech_silence_duration = unknown_sentence_detection_pause
  86. text = preprocess_text(text)
  87. text = text.rstrip()
  88. if text.endswith("..."):
  89. text = text[:-2]
  90. full_sentences.append(text)
  91. prev_text = ""
  92. text_detected("")
  93. # Recorder configuration
  94. recorder_config = {
  95. 'spinner': False,
  96. 'model': 'distil-medium.en', # or large-v2 or deepdml/faster-whisper-large-v3-turbo-ct2 or ...
  97. 'input_device_index': 1,
  98. 'realtime_model_type': 'tiny.en', # or small.en or distil-small.en or ...
  99. 'language': 'en',
  100. 'silero_sensitivity': 0.05,
  101. 'webrtc_sensitivity': 3,
  102. 'post_speech_silence_duration': unknown_sentence_detection_pause,
  103. 'min_length_of_recording': 1.1,
  104. 'min_gap_between_recordings': 0,
  105. 'enable_realtime_transcription': True,
  106. 'realtime_processing_pause': 0.02,
  107. 'on_realtime_transcription_update': text_detected,
  108. #'on_realtime_transcription_stabilized': text_detected,
  109. 'silero_deactivity_detection': True,
  110. 'early_transcription_on_silence': 0,
  111. 'beam_size': 5,
  112. 'beam_size_realtime': 3,
  113. 'no_log_file': True,
  114. 'initial_prompt': "Use ellipses for incomplete sentences like: I went to the..."
  115. }
  116. if EXTENDED_LOGGING:
  117. recorder_config['level'] = logging.DEBUG
  118. recorder = AudioToTextRecorder(**recorder_config)
  119. initial_text = Panel(Text("Say something...", style="cyan bold"), title="[bold yellow]Waiting for Input[/bold yellow]", border_style="bold yellow")
  120. live.update(initial_text)
  121. try:
  122. while True:
  123. recorder.text(process_text)
  124. except KeyboardInterrupt:
  125. live.stop()
  126. console.print("[bold red]Transcription stopped by user. Exiting...[/bold red]")
  127. exit(0)