# realtimestt_test.py (3.0 KB)

if __name__ == '__main__':

    # Switch to True to enable DEBUG logging; `logging` is only imported
    # (and configured) when this flag is set.
    EXTENDED_LOGGING = False

    if EXTENDED_LOGGING:
        import logging
        logging.basicConfig(level=logging.DEBUG)

    import os
    import sys
    from RealtimeSTT import AudioToTextRecorder
    from colorama import Fore, Back, Style
    import colorama

    # Windows only (os.name == "nt"), Python 3.8 and newer: call torchaudio's
    # private `_init_dll_path` helper.
    # NOTE(review): presumably this registers torchaudio's DLL directories so
    # its native libraries can be loaded -- confirm against torchaudio.
    if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
        from torchaudio._extension.utils import _init_dll_path
        _init_dll_path()

    print("Initializing RealtimeSTT test...")

    colorama.init()

    # Shared state used by the callbacks defined below.
    full_sentences = []    # texts appended by process_text()
    displayed_text = ""    # last string printed by text_detected()
    prev_text = ""         # previous text passed to text_detected()
    recorder = None        # replaced by the AudioToTextRecorder instance later

    # Values assigned to recorder.post_speech_silence_duration by the
    # callbacks (presumably seconds -- confirm against RealtimeSTT).
    end_of_sentence_detection_pause = 0.4
    mid_sentence_detection_pause = 0.7
  22. def clear_console():
  23. os.system('clear' if os.name == 'posix' else 'cls')
  24. def text_detected(text):
  25. global displayed_text, prev_text
  26. sentence_end_marks = ['.', '!', '?', '。']
  27. if text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
  28. recorder.post_speech_silence_duration = end_of_sentence_detection_pause
  29. else:
  30. recorder.post_speech_silence_duration = mid_sentence_detection_pause
  31. prev_text = text
  32. sentences_with_style = [
  33. f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
  34. for i, sentence in enumerate(full_sentences)
  35. ]
  36. new_text = "".join(sentences_with_style).strip() + " " + text if len(sentences_with_style) > 0 else text
  37. if new_text != displayed_text:
  38. displayed_text = new_text
  39. clear_console()
  40. print(displayed_text, end="", flush=True)
  41. def process_text(text):
  42. recorder.post_speech_silence_duration = end_of_sentence_detection_pause
  43. full_sentences.append(text)
  44. prev_text = ""
  45. text_detected("")
  46. # Recorder configuration
  47. recorder_config = {
  48. 'spinner': False,
  49. 'model': 'large-v2',
  50. 'realtime_model_type': 'tiny.en',
  51. 'language': 'en',
  52. 'input_device_index': 1,
  53. 'silero_sensitivity': 0.05,
  54. 'webrtc_sensitivity': 3,
  55. 'post_speech_silence_duration': end_of_sentence_detection_pause,
  56. 'min_length_of_recording': 0,
  57. 'min_gap_between_recordings': 0,
  58. 'enable_realtime_transcription': True,
  59. 'realtime_processing_pause': 0.1,
  60. 'on_realtime_transcription_update': text_detected,
  61. 'silero_deactivity_detection': True,
  62. 'min_length_of_recording': 0.7,
  63. 'early_transcription_on_silence': 0.2,
  64. 'beam_size': 5,
  65. 'beam_size_realtime': 1,
  66. 'no_log_file': False,
  67. }
  68. if EXTENDED_LOGGING:
  69. recorder_config['level'] = logging.DEBUG
  70. recorder = AudioToTextRecorder(**recorder_config)
  71. clear_console()
  72. print("Say something...", end="", flush=True)
  73. try:
  74. while (True):
  75. recorder.text(process_text)
  76. except KeyboardInterrupt:
  77. print("Exiting application due to keyboard interrupt")