# realtimestt_test.py
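# Live transcription demo for RealtimeSTT: captures microphone audio and
# renders partial and finalized sentences in a Rich live panel until the
# user presses Ctrl+C.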

if __name__ == '__main__':

    EXTENDED_LOGGING = False

    if EXTENDED_LOGGING:
        import logging
        logging.basicConfig(level=logging.DEBUG)

    from rich.console import Console
    from rich.live import Live
    from rich.text import Text
    from rich.panel import Panel
    from rich.spinner import Spinner
    from rich.progress import Progress, SpinnerColumn, TextColumn

    console = Console()
    # console.print("[bold yellow]System initializing, please wait...[/bold yellow]")
    console.print("System initializing, please wait")

    import os
    import sys
    from RealtimeSTT import AudioToTextRecorder
    from colorama import Fore, Style
    import colorama

    # On Windows, initialize torchaudio's DLL search path so its native extensions load
    if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
        from torchaudio._extension.utils import _init_dll_path
        _init_dll_path()

    colorama.init()

    # Initialize Rich Console and Live
    live = Live(console=console, refresh_per_second=10, screen=False)
    live.start()

    full_sentences = []
    rich_text_stored = ""
    recorder = None
    displayed_text = ""  # Used for tracking text that was already displayed

    end_of_sentence_detection_pause = 0.45
    unknown_sentence_detection_pause = 0.7
    mid_sentence_detection_pause = 2.0
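    # text_detected() switches recorder.post_speech_silence_duration between
    # these values: the short pause once the text looks like a finished
    # sentence, the long pause while it still ends in "...", and the
    # "unknown" value otherwise.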
    def clear_console():
        os.system('clear' if os.name == 'posix' else 'cls')

    prev_text = ""

    def preprocess_text(text):
        # Remove leading whitespaces
        text = text.lstrip()

        # Remove starting ellipses if present
        if text.startswith("..."):
            text = text[3:]

        # Remove any leading whitespaces again after ellipses removal
        text = text.lstrip()

        # Uppercase the first letter
        if text:
            text = text[0].upper() + text[1:]

        return text
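    # Realtime-update callback (wired to 'on_realtime_transcription_update'
    # below): adapts the recorder's silence timeout to how complete the
    # partial text looks and refreshes the live panel.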
    def text_detected(text):
        global prev_text, displayed_text, rich_text_stored

        text = preprocess_text(text)

        sentence_end_marks = ['.', '!', '?', '。']
        if text.endswith("..."):
            recorder.post_speech_silence_duration = mid_sentence_detection_pause
        elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
            recorder.post_speech_silence_duration = end_of_sentence_detection_pause
        else:
            recorder.post_speech_silence_duration = unknown_sentence_detection_pause

        prev_text = text

        # Build Rich Text with alternating colors
        rich_text = Text()
        for i, sentence in enumerate(full_sentences):
            if i % 2 == 0:
                # rich_text += Text(sentence, style="bold yellow") + Text(" ")
                rich_text += Text(sentence, style="yellow") + Text(" ")
            else:
                rich_text += Text(sentence, style="cyan") + Text(" ")

        # If the current text is not a sentence-ending, display it in real-time
        if text:
            rich_text += Text(text, style="bold yellow")

        new_displayed_text = rich_text.plain

        if new_displayed_text != displayed_text:
            displayed_text = new_displayed_text
            panel = Panel(rich_text, title="[bold green]Live Transcription[/bold green]", border_style="bold green")
            live.update(panel)
            rich_text_stored = rich_text
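    # Final-sentence callback passed to recorder.text() below: receives the
    # full transcription of one utterance and appends it to the history.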
    def process_text(text):
        global recorder, full_sentences, prev_text
        recorder.post_speech_silence_duration = unknown_sentence_detection_pause
        text = preprocess_text(text)
        text = text.rstrip()
        if text.endswith("..."):
            text = text[:-2]  # Trim a trailing "..." down to a single "."

        full_sentences.append(text)
        prev_text = ""
        text_detected("")
    # Recorder configuration
    recorder_config = {
        'spinner': False,
        'model': 'distil-medium.en',  # or large-v2 or deepdml/faster-whisper-large-v3-turbo-ct2 or ...
        'input_device_index': 1,
        'realtime_model_type': 'tiny.en',  # or small.en or distil-small.en or ...
        'language': 'en',
        'silero_sensitivity': 0.05,
        'webrtc_sensitivity': 3,
        'post_speech_silence_duration': unknown_sentence_detection_pause,
        'min_length_of_recording': 1.1,
        'min_gap_between_recordings': 0,
        'enable_realtime_transcription': True,
        'realtime_processing_pause': 0.02,
        'on_realtime_transcription_update': text_detected,
        # 'on_realtime_transcription_stabilized': text_detected,
        'silero_deactivity_detection': True,
        'early_transcription_on_silence': 0,
        'beam_size': 5,
        'beam_size_realtime': 3,
        'no_log_file': True,
        'initial_prompt': "Use ellipses for incomplete sentences like: I went to the..."
    }
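    # Note: 'input_device_index': 1 selects the second audio input device;
    # this index may need adjusting for the local microphone setup.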
    if EXTENDED_LOGGING:
        recorder_config['level'] = logging.DEBUG

    recorder = AudioToTextRecorder(**recorder_config)

    initial_text = Panel(Text("Say something...", style="cyan bold"), title="[bold yellow]Waiting for Input[/bold yellow]", border_style="bold yellow")
    live.update(initial_text)

    try:
        while True:
            recorder.text(process_text)
    except KeyboardInterrupt:
        live.stop()
        console.print("[bold red]Transcription stopped by user. Exiting...[/bold red]")
        exit(0)