# realtimestt_speechendpoint.py — realtime speech-to-text with LLM-assisted
# sentence-endpoint detection. (Repaired extraction header.)
  1. IS_DEBUG = False
  2. import os
  3. import sys
  4. import threading
  5. import queue
  6. import time
  7. from install_packages import check_and_install_packages
  8. # Check and install required packages
  9. check_and_install_packages([
  10. {'import_name': 'rich'},
  11. {'import_name': 'openai'},
  12. {'import_name': 'colorama'},
  13. {'import_name': 'RealtimeSTT'},
  14. # Add any other required packages here
  15. ])
  16. EXTENDED_LOGGING = False
  17. if __name__ == '__main__':
  18. if EXTENDED_LOGGING:
  19. import logging
  20. logging.basicConfig(level=logging.DEBUG)
  21. from rich.console import Console
  22. from rich.live import Live
  23. from rich.text import Text
  24. from rich.panel import Panel
  25. from rich.spinner import Spinner
  26. from rich.progress import Progress, SpinnerColumn, TextColumn
  27. console = Console()
  28. console.print("System initializing, please wait")
  29. from RealtimeSTT import AudioToTextRecorder
  30. from colorama import Fore, Style
  31. import colorama
  32. from openai import OpenAI
  33. # import ollama
  34. # Initialize OpenAI client for Ollama
  35. client = OpenAI(
  36. # base_url='http://127.0.0.1:11434/v1/', # ollama
  37. base_url='http://127.0.0.1:1234/v1/', # lm_studio
  38. api_key='ollama', # required but ignored
  39. )
  40. if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
  41. from torchaudio._extension.utils import _init_dll_path
  42. _init_dll_path()
  43. colorama.init()
  44. # Initialize Rich Console and Live
  45. live = Live(console=console, refresh_per_second=10, screen=False)
  46. live.start()
  47. # Initialize a thread-safe queue
  48. text_queue = queue.Queue()
  49. # Variables for managing displayed text
  50. full_sentences = []
  51. rich_text_stored = ""
  52. recorder = None
  53. displayed_text = ""
  54. rapid_sentence_end_detection = 0.4
  55. end_of_sentence_detection_pause = 1.2
  56. unknown_sentence_detection_pause = 2.5
  57. mid_sentence_detection_pause = 3.8
  58. def clear_console():
  59. os.system('clear' if os.name == 'posix' else 'cls')
  60. prev_text = ""
  61. speech_finished_cache = {}
  62. def is_speech_finished(text):
  63. # Check if the result is already in the cache
  64. if text in speech_finished_cache:
  65. if IS_DEBUG:
  66. print(f"Cache hit for: '{text}'")
  67. return speech_finished_cache[text]
  68. user_prompt = (
  69. "Please reply with only 'c' if the following text is a complete thought (a sentence that stands on its own), "
  70. "or 'i' if it is not finished. Do not include any additional text in your reply. "
  71. "Consider a full sentence to have a clear subject, verb, and predicate or express a complete idea. "
  72. "Examples:\n"
  73. "- 'The sky is blue.' is complete (reply 'c').\n"
  74. "- 'When the sky' is incomplete (reply 'i').\n"
  75. "- 'She walked home.' is complete (reply 'c').\n"
  76. "- 'Because he' is incomplete (reply 'i').\n"
  77. f"\nText: {text}"
  78. )
  79. response = client.chat.completions.create(
  80. model="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf",
  81. messages=[{"role": "user", "content": user_prompt}],
  82. max_tokens=1,
  83. temperature=0.0, # Set temperature to 0 for deterministic output
  84. )
  85. if IS_DEBUG:
  86. print(f"t:'{response.choices[0].message.content.strip().lower()}'", end="", flush=True)
  87. reply = response.choices[0].message.content.strip().lower()
  88. result = reply == 'c'
  89. # Cache the result
  90. speech_finished_cache[text] = result
  91. return result
  92. def preprocess_text(text):
  93. # Remove leading whitespaces
  94. text = text.lstrip()
  95. # Remove starting ellipses if present
  96. if text.startswith("..."):
  97. text = text[3:]
  98. # Remove any leading whitespaces again after ellipses removal
  99. text = text.lstrip()
  100. # Uppercase the first letter
  101. if text:
  102. text = text[0].upper() + text[1:]
  103. return text
  104. def text_detected(text):
  105. """
  106. Enqueue the detected text for processing.
  107. """
  108. text_queue.put(text)
  109. def process_queue():
  110. global recorder, full_sentences, prev_text, displayed_text, rich_text_stored
  111. while True:
  112. try:
  113. text = text_queue.get(timeout=1) # Wait for text or timeout after 1 second
  114. except queue.Empty:
  115. continue # No text to process, continue looping
  116. if text is None:
  117. # Sentinel value to indicate thread should exit
  118. break
  119. text = preprocess_text(text)
  120. sentence_end_marks = ['.', '!', '?', '。']
  121. if text.endswith("..."):
  122. if not recorder.post_speech_silence_duration == mid_sentence_detection_pause:
  123. recorder.post_speech_silence_duration = mid_sentence_detection_pause
  124. if IS_DEBUG: print(f"RT: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  125. elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
  126. if not recorder.post_speech_silence_duration == end_of_sentence_detection_pause:
  127. recorder.post_speech_silence_duration = end_of_sentence_detection_pause
  128. if IS_DEBUG: print(f"RT: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  129. else:
  130. if not recorder.post_speech_silence_duration == unknown_sentence_detection_pause:
  131. recorder.post_speech_silence_duration = unknown_sentence_detection_pause
  132. if IS_DEBUG: print(f"RT: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  133. prev_text = text
  134. import string
  135. transtext = text.translate(str.maketrans('', '', string.punctuation))
  136. if is_speech_finished(transtext):
  137. if not recorder.post_speech_silence_duration == rapid_sentence_end_detection:
  138. recorder.post_speech_silence_duration = rapid_sentence_end_detection
  139. if IS_DEBUG: print(f"RT: {transtext} post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  140. rich_text = Text()
  141. for i, sentence in enumerate(full_sentences):
  142. if i % 2 == 0:
  143. rich_text += Text(sentence, style="yellow") + Text(" ")
  144. else:
  145. rich_text += Text(sentence, style="cyan") + Text(" ")
  146. if text:
  147. rich_text += Text(text, style="bold yellow")
  148. new_displayed_text = rich_text.plain
  149. if new_displayed_text != displayed_text:
  150. displayed_text = new_displayed_text
  151. panel = Panel(rich_text, title="[bold green]Live Transcription[/bold green]", border_style="bold green")
  152. live.update(panel)
  153. rich_text_stored = rich_text
  154. # Mark the task as done
  155. text_queue.task_done()
  156. def process_text(text):
  157. global recorder, full_sentences, prev_text
  158. if IS_DEBUG: print(f"SENTENCE: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  159. recorder.post_speech_silence_duration = unknown_sentence_detection_pause
  160. text = preprocess_text(text)
  161. text = text.rstrip()
  162. if text.endswith("..."):
  163. text = text[:-2]
  164. full_sentences.append(text)
  165. prev_text = ""
  166. text_detected("")
  167. # Recorder configuration
  168. recorder_config = {
  169. 'spinner': False,
  170. 'model': 'medium.en',
  171. # 'input_device_index': 2,
  172. 'realtime_model_type': 'tiny.en',
  173. 'language': 'en',
  174. #'silero_sensitivity': 0.05,
  175. 'silero_sensitivity': 0.4,
  176. 'webrtc_sensitivity': 3,
  177. 'post_speech_silence_duration': unknown_sentence_detection_pause,
  178. 'min_length_of_recording': 1.1,
  179. 'min_gap_between_recordings': 0,
  180. 'enable_realtime_transcription': True,
  181. 'realtime_processing_pause': 0.05,
  182. 'on_realtime_transcription_update': text_detected,
  183. 'silero_deactivity_detection': False,
  184. 'early_transcription_on_silence': 0,
  185. 'beam_size': 5,
  186. 'beam_size_realtime': 1,
  187. 'no_log_file': True,
  188. 'initial_prompt': (
  189. "End incomplete sentences with ellipses.\n"
  190. "Examples:\n"
  191. "Complete: The sky is blue.\n"
  192. "Incomplete: When the sky...\n"
  193. "Complete: She walked home.\n"
  194. "Incomplete: Because he...\n"
  195. )
  196. #'initial_prompt': "Use ellipses for incomplete sentences like: I went to the..."
  197. }
  198. if EXTENDED_LOGGING:
  199. recorder_config['level'] = logging.DEBUG
  200. recorder = AudioToTextRecorder(**recorder_config)
  201. initial_text = Panel(Text("Say something...", style="cyan bold"), title="[bold yellow]Waiting for Input[/bold yellow]", border_style="bold yellow")
  202. live.update(initial_text)
  203. # Start the worker thread
  204. worker_thread = threading.Thread(target=process_queue, daemon=True)
  205. worker_thread.start()
  206. try:
  207. while True:
  208. recorder.text(process_text)
  209. except KeyboardInterrupt:
  210. # Send sentinel value to worker thread to exit
  211. text_queue.put(None)
  212. worker_thread.join()
  213. live.stop()
  214. console.print("[bold red]Transcription stopped by user. Exiting...[/bold red]")
  215. exit(0)