# realtimestt_speechendpoint.py
# Debug flag: when True, extra diagnostic prints are emitted throughout the script.
IS_DEBUG = False

import os
import sys
import threading
import queue
import time

# Project-local helper that installs missing third-party packages at startup.
from install_packages import check_and_install_packages

# Check and install required packages
check_and_install_packages([
    {'import_name': 'rich'},
    {'import_name': 'openai'},
    {'import_name': 'colorama'},
    {'import_name': 'RealtimeSTT'},
    # Add any other required packages here
])

# Set to True to enable DEBUG-level logging for the recorder and this script.
EXTENDED_LOGGING = False
if __name__ == '__main__':

    if EXTENDED_LOGGING:
        import logging
        logging.basicConfig(level=logging.DEBUG)

    from rich.console import Console
    from rich.live import Live
    from rich.text import Text
    from rich.panel import Panel
    from rich.spinner import Spinner
    from rich.progress import Progress, SpinnerColumn, TextColumn
    console = Console()
    console.print("System initializing, please wait")

    from RealtimeSTT import AudioToTextRecorder
    from colorama import Fore, Style
    import colorama
    from openai import OpenAI
    # import ollama

    # Initialize OpenAI client for Ollama
    # (OpenAI-compatible endpoint; the active base_url points at LM Studio's
    # default local port — switch the commented line to target Ollama instead.)
    client = OpenAI(
        # base_url='http://127.0.0.1:11434/v1/', # ollama
        base_url='http://127.0.0.1:1234/v1/', # lm_studio
        api_key='ollama', # required but ignored
    )

    # On Windows, make torchaudio's bundled DLL directories resolvable before
    # the recorder loads its native backends.
    if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
        from torchaudio._extension.utils import _init_dll_path
        _init_dll_path()

    colorama.init()

    # Initialize Rich Console and Live
    live = Live(console=console, refresh_per_second=10, screen=False)
    live.start()
# Initialize a thread-safe queue shared between the realtime-update callback
# (producer) and the worker thread (consumer).
text_queue = queue.Queue()

# Variables for managing displayed text
full_sentences = []      # finalized sentences, alternately colored in the panel
rich_text_stored = ""    # last rendered rich.Text (kept for reference)
recorder = None          # assigned once AudioToTextRecorder is constructed
displayed_text = ""      # plain-text mirror of the panel, used to skip redraws

# Silence windows (seconds) applied to recorder.post_speech_silence_duration
# depending on how "finished" the current snippet looks:
rapid_sentence_end_detection = 0.4    # LLM judged the sentence complete
end_of_sentence_detection_pause = 1.2 # ends with sentence punctuation
unknown_sentence_detection_pause = 2.5 # ambiguous / default
mid_sentence_detection_pause = 3.8    # trailing "..." — speaker mid-thought

def clear_console():
    # Clear the terminal on both POSIX ('clear') and Windows ('cls').
    os.system('clear' if os.name == 'posix' else 'cls')

# Previous realtime snippet; used to detect back-to-back sentence endings.
prev_text = ""
  61. def is_speech_finished(text):
  62. user_prompt = (
  63. "Please reply with only 'c' if the following text is a complete thought (a sentence that stands on its own), "
  64. "or 'i' if it is not finished. Do not include any additional text in your reply. "
  65. "Consider a full sentence to have a clear subject, verb, and predicate or express a complete idea. "
  66. "Examples:\n"
  67. "- 'The sky is blue.' is complete (reply 'c').\n"
  68. "- 'When the sky' is incomplete (reply 'i').\n"
  69. "- 'She walked home.' is complete (reply 'c').\n"
  70. "- 'Because he' is incomplete (reply 'i').\n"
  71. f"\nText: {text}"
  72. )
  73. response = client.chat.completions.create(
  74. model="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf",
  75. messages=[{"role": "user", "content": user_prompt}],
  76. max_tokens=1,
  77. temperature=0.0, # Set temperature to 0 for deterministic output
  78. )
  79. if IS_DEBUG:
  80. print(f"t:'{response.choices[0].message.content.strip().lower()}'", end="", flush=True)
  81. reply = response.choices[0].message.content.strip().lower()
  82. return reply == 'c'
  83. def preprocess_text(text):
  84. # Remove leading whitespaces
  85. text = text.lstrip()
  86. # Remove starting ellipses if present
  87. if text.startswith("..."):
  88. text = text[3:]
  89. # Remove any leading whitespaces again after ellipses removal
  90. text = text.lstrip()
  91. # Uppercase the first letter
  92. if text:
  93. text = text[0].upper() + text[1:]
  94. return text
  95. def text_detected(text):
  96. """
  97. Enqueue the detected text for processing.
  98. """
  99. text_queue.put(text)
  100. def process_queue():
  101. global recorder, full_sentences, prev_text, displayed_text, rich_text_stored
  102. while True:
  103. try:
  104. text = text_queue.get(timeout=1) # Wait for text or timeout after 1 second
  105. except queue.Empty:
  106. continue # No text to process, continue looping
  107. if text is None:
  108. # Sentinel value to indicate thread should exit
  109. break
  110. text = preprocess_text(text)
  111. sentence_end_marks = ['.', '!', '?', '。']
  112. if text.endswith("..."):
  113. if not recorder.post_speech_silence_duration == mid_sentence_detection_pause:
  114. recorder.post_speech_silence_duration = mid_sentence_detection_pause
  115. if IS_DEBUG: print(f"RT: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  116. elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
  117. if not recorder.post_speech_silence_duration == end_of_sentence_detection_pause:
  118. recorder.post_speech_silence_duration = end_of_sentence_detection_pause
  119. if IS_DEBUG: print(f"RT: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  120. else:
  121. if not recorder.post_speech_silence_duration == unknown_sentence_detection_pause:
  122. recorder.post_speech_silence_duration = unknown_sentence_detection_pause
  123. if IS_DEBUG: print(f"RT: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  124. prev_text = text
  125. import string
  126. transtext = text.translate(str.maketrans('', '', string.punctuation))
  127. if is_speech_finished(transtext):
  128. if not recorder.post_speech_silence_duration == rapid_sentence_end_detection:
  129. recorder.post_speech_silence_duration = rapid_sentence_end_detection
  130. if IS_DEBUG: print(f"RT: {transtext} post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  131. rich_text = Text()
  132. for i, sentence in enumerate(full_sentences):
  133. if i % 2 == 0:
  134. rich_text += Text(sentence, style="yellow") + Text(" ")
  135. else:
  136. rich_text += Text(sentence, style="cyan") + Text(" ")
  137. if text:
  138. rich_text += Text(text, style="bold yellow")
  139. new_displayed_text = rich_text.plain
  140. if new_displayed_text != displayed_text:
  141. displayed_text = new_displayed_text
  142. panel = Panel(rich_text, title="[bold green]Live Transcription[/bold green]", border_style="bold green")
  143. live.update(panel)
  144. rich_text_stored = rich_text
  145. # Mark the task as done
  146. text_queue.task_done()
  147. def process_text(text):
  148. global recorder, full_sentences, prev_text
  149. if IS_DEBUG: print(f"SENTENCE: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  150. recorder.post_speech_silence_duration = unknown_sentence_detection_pause
  151. text = preprocess_text(text)
  152. text = text.rstrip()
  153. if text.endswith("..."):
  154. text = text[:-2]
  155. full_sentences.append(text)
  156. prev_text = ""
  157. text_detected("")
# Recorder configuration
recorder_config = {
    'spinner': False,
    'model': 'medium.en',              # model for the final transcription pass
    'input_device_index': 1,           # NOTE(review): hard-coded mic index — verify on target machine
    'realtime_model_type': 'tiny.en',  # fast model for realtime partial results
    'language': 'en',
    'silero_sensitivity': 0.05,
    'webrtc_sensitivity': 3,
    'post_speech_silence_duration': unknown_sentence_detection_pause,
    'min_length_of_recording': 1.1,
    'min_gap_between_recordings': 0,
    'enable_realtime_transcription': True,
    'realtime_processing_pause': 0.05,
    'on_realtime_transcription_update': text_detected,  # called with each partial snippet
    'silero_deactivity_detection': False,
    'early_transcription_on_silence': 0,
    'beam_size': 5,
    'beam_size_realtime': 1,
    'no_log_file': True,
    #'initial_prompt': "Use ellipses for incomplete sentences like: I went to the..."
}

if EXTENDED_LOGGING:
    recorder_config['level'] = logging.DEBUG

recorder = AudioToTextRecorder(**recorder_config)

initial_text = Panel(Text("Say something...", style="cyan bold"), title="[bold yellow]Waiting for Input[/bold yellow]", border_style="bold yellow")
live.update(initial_text)

# Start the worker thread
worker_thread = threading.Thread(target=process_queue, daemon=True)
worker_thread.start()

try:
    while True:
        # Blocks until a full utterance is finalized, then invokes process_text.
        recorder.text(process_text)
except KeyboardInterrupt:
    # Send sentinel value to worker thread to exit
    text_queue.put(None)
    worker_thread.join()
    live.stop()
    console.print("[bold red]Transcription stopped by user. Exiting...[/bold red]")
    exit(0)