# realtimestt_speechendpoint.py
  1. IS_DEBUG = False
  2. import os
  3. import sys
  4. import threading
  5. import queue
  6. import time
  7. from collections import deque
  8. from difflib import SequenceMatcher
  9. from install_packages import check_and_install_packages
  10. # Check and install required packages
  11. check_and_install_packages([
  12. {'import_name': 'rich'},
  13. {'import_name': 'openai'},
  14. {'import_name': 'colorama'},
  15. {'import_name': 'RealtimeSTT'},
  16. # Add any other required packages here
  17. ])
  18. EXTENDED_LOGGING = False
  19. if __name__ == '__main__':
  20. if EXTENDED_LOGGING:
  21. import logging
  22. logging.basicConfig(level=logging.DEBUG)
  23. from rich.console import Console
  24. from rich.live import Live
  25. from rich.text import Text
  26. from rich.panel import Panel
  27. from rich.spinner import Spinner
  28. from rich.progress import Progress, SpinnerColumn, TextColumn
  29. console = Console()
  30. console.print("System initializing, please wait")
  31. from RealtimeSTT import AudioToTextRecorder
  32. from colorama import Fore, Style
  33. import colorama
  34. from openai import OpenAI
  35. # import ollama
  36. # Initialize OpenAI client for Ollama
  37. client = OpenAI(
  38. # base_url='http://127.0.0.1:11434/v1/', # ollama
  39. base_url='http://127.0.0.1:1234/v1/', # lm_studio
  40. api_key='ollama', # required but ignored
  41. )
  42. if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
  43. from torchaudio._extension.utils import _init_dll_path
  44. _init_dll_path()
  45. colorama.init()
  46. # Initialize Rich Console and Live
  47. live = Live(console=console, refresh_per_second=10, screen=False)
  48. live.start()
  49. # Initialize a thread-safe queue
  50. text_queue = queue.Queue()
  51. # Variables for managing displayed text
  52. full_sentences = []
  53. rich_text_stored = ""
  54. recorder = None
  55. displayed_text = ""
  56. text_time_deque = deque()
  57. rapid_sentence_end_detection = 0.4
  58. end_of_sentence_detection_pause = 1.2
  59. unknown_sentence_detection_pause = 1.8
  60. mid_sentence_detection_pause = 2.4
  61. hard_break_even_on_background_noise = 3.0
  62. hard_break_even_on_background_noise_min_texts = 3
  63. hard_break_even_on_background_noise_min_chars = 15
  64. hard_break_even_on_background_noise_min_similarity = 0.99
  65. relisten_on_abrupt_stop = True
  66. abrupt_stop = False
  67. def clear_console():
  68. os.system('clear' if os.name == 'posix' else 'cls')
  69. prev_text = ""
  70. speech_finished_cache = {}
  71. def is_speech_finished(text):
  72. # Check if the result is already in the cache
  73. if text in speech_finished_cache:
  74. if IS_DEBUG:
  75. print(f"Cache hit for: '{text}'")
  76. return speech_finished_cache[text]
  77. user_prompt = (
  78. "Please reply with only 'c' if the following text is a complete thought (a sentence that stands on its own), "
  79. "or 'i' if it is not finished. Do not include any additional text in your reply. "
  80. "Consider a full sentence to have a clear subject, verb, and predicate or express a complete idea. "
  81. "Examples:\n"
  82. "- 'The sky is blue.' is complete (reply 'c').\n"
  83. "- 'When the sky' is incomplete (reply 'i').\n"
  84. "- 'She walked home.' is complete (reply 'c').\n"
  85. "- 'Because he' is incomplete (reply 'i').\n"
  86. f"\nText: {text}"
  87. )
  88. response = client.chat.completions.create(
  89. model="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf",
  90. messages=[{"role": "user", "content": user_prompt}],
  91. max_tokens=1,
  92. temperature=0.0, # Set temperature to 0 for deterministic output
  93. )
  94. if IS_DEBUG:
  95. print(f"t:'{response.choices[0].message.content.strip().lower()}'", end="", flush=True)
  96. reply = response.choices[0].message.content.strip().lower()
  97. result = reply == 'c'
  98. # Cache the result
  99. speech_finished_cache[text] = result
  100. return result
  101. def preprocess_text(text):
  102. # Remove leading whitespaces
  103. text = text.lstrip()
  104. # Remove starting ellipses if present
  105. if text.startswith("..."):
  106. text = text[3:]
  107. # Remove any leading whitespaces again after ellipses removal
  108. text = text.lstrip()
  109. # Uppercase the first letter
  110. if text:
  111. text = text[0].upper() + text[1:]
  112. return text
  113. def text_detected(text):
  114. """
  115. Enqueue the detected text for processing.
  116. """
  117. text_queue.put(text)
  118. def process_queue():
  119. global recorder, full_sentences, prev_text, displayed_text, rich_text_stored, text_time_deque, abrupt_stop
  120. # Initialize a deque to store texts with their timestamps
  121. while True:
  122. try:
  123. text = text_queue.get(timeout=1) # Wait for text or timeout after 1 second
  124. except queue.Empty:
  125. continue # No text to process, continue looping
  126. if text is None:
  127. # Sentinel value to indicate thread should exit
  128. break
  129. text = preprocess_text(text)
  130. current_time = time.time()
  131. sentence_end_marks = ['.', '!', '?', '。']
  132. if text.endswith("..."):
  133. if not recorder.post_speech_silence_duration == mid_sentence_detection_pause:
  134. recorder.post_speech_silence_duration = mid_sentence_detection_pause
  135. if IS_DEBUG: print(f"RT: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  136. elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
  137. if not recorder.post_speech_silence_duration == end_of_sentence_detection_pause:
  138. recorder.post_speech_silence_duration = end_of_sentence_detection_pause
  139. if IS_DEBUG: print(f"RT: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  140. else:
  141. if not recorder.post_speech_silence_duration == unknown_sentence_detection_pause:
  142. recorder.post_speech_silence_duration = unknown_sentence_detection_pause
  143. if IS_DEBUG: print(f"RT: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  144. prev_text = text
  145. import string
  146. transtext = text.translate(str.maketrans('', '', string.punctuation))
  147. if is_speech_finished(transtext):
  148. if not recorder.post_speech_silence_duration == rapid_sentence_end_detection:
  149. recorder.post_speech_silence_duration = rapid_sentence_end_detection
  150. if IS_DEBUG: print(f"RT: {transtext} post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  151. # Append the new text with its timestamp
  152. text_time_deque.append((current_time, text))
  153. # Remove texts older than 1 second
  154. while text_time_deque and text_time_deque[0][0] < current_time - hard_break_even_on_background_noise:
  155. text_time_deque.popleft()
  156. # Check if at least 3 texts have arrived within the last full second
  157. if len(text_time_deque) >= hard_break_even_on_background_noise_min_texts:
  158. texts = [t[1] for t in text_time_deque]
  159. first_text = texts[0]
  160. last_text = texts[-1]
  161. # Check if at least 3 texts have arrived within the last full second
  162. if len(text_time_deque) >= 3:
  163. texts = [t[1] for t in text_time_deque]
  164. first_text = texts[0]
  165. last_text = texts[-1]
  166. # Compute the similarity ratio between the first and last texts
  167. similarity = SequenceMatcher(None, first_text, last_text).ratio()
  168. #print(f"Similarity: {similarity:.2f}")
  169. if similarity > hard_break_even_on_background_noise_min_similarity and len(first_text) > hard_break_even_on_background_noise_min_chars:
  170. abrupt_stop = True
  171. recorder.stop()
  172. rich_text = Text()
  173. for i, sentence in enumerate(full_sentences):
  174. if i % 2 == 0:
  175. rich_text += Text(sentence, style="yellow") + Text(" ")
  176. else:
  177. rich_text += Text(sentence, style="cyan") + Text(" ")
  178. if text:
  179. rich_text += Text(text, style="bold yellow")
  180. new_displayed_text = rich_text.plain
  181. if new_displayed_text != displayed_text:
  182. displayed_text = new_displayed_text
  183. panel = Panel(rich_text, title="[bold green]Live Transcription[/bold green]", border_style="bold green")
  184. live.update(panel)
  185. rich_text_stored = rich_text
  186. # Mark the task as done
  187. text_queue.task_done()
  188. def process_text(text):
  189. global recorder, full_sentences, prev_text, abrupt_stop
  190. if IS_DEBUG: print(f"SENTENCE: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
  191. recorder.post_speech_silence_duration = unknown_sentence_detection_pause
  192. text = preprocess_text(text)
  193. text = text.rstrip()
  194. text_time_deque.clear()
  195. if text.endswith("..."):
  196. text = text[:-2]
  197. full_sentences.append(text)
  198. prev_text = ""
  199. text_detected("")
  200. if abrupt_stop:
  201. abrupt_stop = False
  202. if relisten_on_abrupt_stop:
  203. recorder.listen()
  204. recorder.start()
  205. if hasattr(recorder, "last_words_buffer"):
  206. recorder.frames.extend(list(recorder.last_words_buffer))
  207. # Recorder configuration
  208. recorder_config = {
  209. 'spinner': False,
  210. 'model': 'medium.en',
  211. #'input_device_index': 1, # mic
  212. #'input_device_index': 2, # stereomix
  213. 'realtime_model_type': 'tiny.en',
  214. 'language': 'en',
  215. #'silero_sensitivity': 0.05,
  216. 'silero_sensitivity': 0.4,
  217. 'webrtc_sensitivity': 3,
  218. 'post_speech_silence_duration': unknown_sentence_detection_pause,
  219. 'min_length_of_recording': 1.1,
  220. 'min_gap_between_recordings': 0,
  221. 'enable_realtime_transcription': True,
  222. 'realtime_processing_pause': 0.05,
  223. 'on_realtime_transcription_update': text_detected,
  224. 'silero_deactivity_detection': False,
  225. 'early_transcription_on_silence': 0,
  226. 'beam_size': 5,
  227. 'beam_size_realtime': 1,
  228. 'no_log_file': True,
  229. 'initial_prompt': (
  230. "End incomplete sentences with ellipses.\n"
  231. "Examples:\n"
  232. "Complete: The sky is blue.\n"
  233. "Incomplete: When the sky...\n"
  234. "Complete: She walked home.\n"
  235. "Incomplete: Because he...\n"
  236. )
  237. #'initial_prompt': "Use ellipses for incomplete sentences like: I went to the..."
  238. }
  239. if EXTENDED_LOGGING:
  240. recorder_config['level'] = logging.DEBUG
  241. recorder = AudioToTextRecorder(**recorder_config)
  242. initial_text = Panel(Text("Say something...", style="cyan bold"), title="[bold yellow]Waiting for Input[/bold yellow]", border_style="bold yellow")
  243. live.update(initial_text)
  244. # Start the worker thread
  245. worker_thread = threading.Thread(target=process_queue, daemon=True)
  246. worker_thread.start()
  247. try:
  248. while True:
  249. recorder.text(process_text)
  250. except KeyboardInterrupt:
  251. # Send sentinel value to worker thread to exit
  252. text_queue.put(None)
  253. worker_thread.join()
  254. live.stop()
  255. console.print("[bold red]Transcription stopped by user. Exiting...[/bold red]")
  256. exit(0)