realtimestt_test.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  if __name__ == '__main__':
      # Flip to True to get DEBUG-level logging from this script; the same
      # flag later adds a 'level' entry to the recorder configuration.
      EXTENDED_LOGGING = False

      # Logging is configured here, ahead of the remaining imports.
      if EXTENDED_LOGGING:
          import logging
          logging.basicConfig(level=logging.DEBUG)

      import os
      import sys

      from RealtimeSTT import AudioToTextRecorder
      from colorama import Fore, Style
      import colorama
      from rich.live import Live
      from rich.console import Console
      from rich.text import Text

      # On Windows (Python 3.8 up to, but excluding, 3.99) run torchaudio's
      # private DLL search-path initialiser.
      running_on_windows = os.name == "nt"
      python_in_range = (3, 8) <= sys.version_info < (3, 99)
      if running_on_windows and python_in_range:
          from torchaudio._extension.utils import _init_dll_path
          _init_dll_path()

      print("Initializing RealtimeSTT test...")

      colorama.init()

      # Rich console plus the Live region that text_detected repaints.
      console = Console()
      live = Live(console=console, refresh_per_second=10, screen=False)
      live.start()

      # State shared between text_detected and process_text.
      full_sentences = []
      displayed_text = prev_text = rich_text_stored = ""
      recorder = None

      # Candidate values for recorder.post_speech_silence_duration; the
      # callbacks pick one of them depending on how the latest text ends.
      end_of_sentence_detection_pause = 0.4
      unknown_sentence_detection_pause = 0.7
      mid_sentence_detection_pause = 2.0
  def clear_console():
      """Clear the terminal by running the platform's shell command.

      Runs 'clear' when os.name is 'posix' and 'cls' otherwise.
      """
      if os.name == 'posix':
          command = 'clear'
      else:
          command = 'cls'
      os.system(command)
  def text_detected(text):
      """Adapt the recorder's silence timeout to `text` and repaint the display.

      Registered as the recorder's 'on_realtime_transcription_stabilized'
      callback; process_text also calls it with "" to redraw once a sentence
      has been appended to full_sentences.

      Args:
          text: Latest transcription string; may be empty.
      """
      global displayed_text, prev_text, rich_text_stored

      end_marks = ('.', '!', '?', '。')

      # A trailing ellipsis selects the long pause; two consecutive updates
      # that both end in punctuation select the short one; anything else
      # falls back to the default.
      if text.endswith("..."):
          pause = mid_sentence_detection_pause
      elif text.endswith(end_marks) and prev_text.endswith(end_marks):
          pause = end_of_sentence_detection_pause
      else:
          pause = unknown_sentence_detection_pause
      recorder.post_speech_silence_duration = pause
      prev_text = text

      # Stored sentences alternate yellow / cyan, each followed by a space.
      palette = ("yellow", "cyan")
      rendered = Text()
      for index, sentence in enumerate(full_sentences):
          rendered += Text(sentence, style=palette[index % 2]) + Text(" ")

      # The current text, when there is any, is appended last in white.
      if text:
          rendered += Text(text, style="white")

      # Skip the repaint when the visible characters have not changed.
      plain = rendered.plain
      if plain == displayed_text:
          return
      displayed_text = plain
      live.update(rendered)
      rich_text_stored = rendered
  def process_text(text):
      """Store a transcribed sentence and redraw the display.

      Passed to recorder.text() in the main loop. Resets the recorder's
      post-speech silence duration to the default, appends `text` to
      full_sentences, clears prev_text, and calls text_detected("") so the
      display is rebuilt from the stored sentences only.

      Args:
          text: The sentence handed over by recorder.text().
      """
      global prev_text

      recorder.post_speech_silence_duration = unknown_sentence_detection_pause
      prev_text = ""
      full_sentences.append(text)
      text_detected("")
  if __name__ == '__main__':
      # Recorder configuration: keyword arguments for AudioToTextRecorder.
      recorder_config = {
          'spinner': False,
          'model': 'large-v2',
          # 'input_device_index': 1,
          'realtime_model_type': 'tiny.en',
          'language': 'en',
          'silero_sensitivity': 0.05,
          'webrtc_sensitivity': 3,
          'post_speech_silence_duration': unknown_sentence_detection_pause,
          'min_length_of_recording': 0.7,
          'min_gap_between_recordings': 0,
          'enable_realtime_transcription': True,
          'realtime_processing_pause': 0.1,
          #'on_realtime_transcription_update': text_detected,
          'on_realtime_transcription_stabilized': text_detected,
          'silero_deactivity_detection': True,
          'early_transcription_on_silence': 0.2,
          'beam_size': 5,
          'beam_size_realtime': 1,
          'no_log_file': True,
      }

      if EXTENDED_LOGGING:
          recorder_config['level'] = logging.DEBUG

      try:
          # Constructed inside the try so that a failure or Ctrl-C during
          # start-up still reaches the cleanup in `finally`.
          recorder = AudioToTextRecorder(**recorder_config)

          # Initial display message
          initial_text = Text("Say something...", style="green")
          live.update(initial_text)

          # recorder.text() hands each finished sentence to process_text;
          # the loop only ends through an exception.
          while True:
              recorder.text(process_text)
      except KeyboardInterrupt:
          # Stop the live region first so the message is printed normally.
          live.stop()
          print("Exit due to keyboard interrupt.")
      finally:
          # Runs on every exit path, not just Ctrl-C: release the terminal
          # and shut the recorder down. Live.stop() returns immediately when
          # the display has already been stopped.
          live.stop()
          # `recorder` is still None when the constructor itself failed.
          if recorder is not None:
              recorder.shutdown()