7 months ago · 6a057ce511
--- a/tests/realtimestt_speechendpoint.py
+++ b/tests/realtimestt_speechendpoint.py
@@ -5,6 +5,8 @@ import sys
 
															 import threading
														
 
															 import queue
														
 
															 import time
														
 
															+from collections import deque
														
 
															+from difflib import SequenceMatcher
														
 
															 from install_packages import check_and_install_packages
														
 
															 # Check and install required packages
														
@@ -64,11 +66,19 @@ if __name__ == '__main__':
 
															     rich_text_stored = ""
														
 
															     recorder = None
														
 
															     displayed_text = ""
														
 
															+    text_time_deque = deque()
														
 
															     rapid_sentence_end_detection = 0.4
														
 
															     end_of_sentence_detection_pause = 1.2
														
 
															-    unknown_sentence_detection_pause = 2.5
														
 
															-    mid_sentence_detection_pause = 3.8
														
 
															+    unknown_sentence_detection_pause = 1.8
														
 
															+    mid_sentence_detection_pause = 2.4
														
 
															+    hard_break_even_on_background_noise = 3.0
														
 
															+    hard_break_even_on_background_noise_min_texts = 3
														
 
															+    hard_break_even_on_background_noise_min_chars = 15
														
 
															+    hard_break_even_on_background_noise_min_similarity = 0.99
														
 
															+    relisten_on_abrupt_stop = True
														
 
															+
														
 
															+    abrupt_stop = False
														
 
															     def clear_console():
														
 
															         os.system('clear' if os.name == 'posix' else 'cls')
														
@@ -137,9 +147,11 @@ if __name__ == '__main__':
 
															         """
														
 
															         text_queue.put(text)
														
 
															+
														
 
															     def process_queue():
														
 
															-        global recorder, full_sentences, prev_text, displayed_text, rich_text_stored
														
 
															+        global recorder, full_sentences, prev_text, displayed_text, rich_text_stored, text_time_deque, abrupt_stop
														
 
															+        # text_time_deque (created at script level) stores (timestamp, text) pairs; nothing is initialized here
														
 
															         while True:
														
 
															             try:
														
 
															                 text = text_queue.get(timeout=1)  # Wait for text or timeout after 1 second
														
@@ -151,6 +163,7 @@ if __name__ == '__main__':
 
															                 break
														
 
															             text = preprocess_text(text)
														
 
															+            current_time = time.time()
														
 
															             sentence_end_marks = ['.', '!', '?', '。'] 
														
 
															             if text.endswith("..."):
														
@@ -176,6 +189,34 @@ if __name__ == '__main__':
 
															                     recorder.post_speech_silence_duration = rapid_sentence_end_detection
														
 
															                     if IS_DEBUG: print(f"RT: {transtext} post_speech_silence_duration: {recorder.post_speech_silence_duration}")
														
 
															+            # Append the new text with its timestamp
														
 
															+            text_time_deque.append((current_time, text))
														
 
															+
														
 
															+            # Remove texts older than hard_break_even_on_background_noise seconds
														
 
															+            while text_time_deque and text_time_deque[0][0] < current_time - hard_break_even_on_background_noise:
														
 
															+                text_time_deque.popleft()
														
 
															+
														
 
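															+            # Check if at least hard_break_even_on_background_noise_min_texts texts remain in the window (NOTE(review): the locals set here are reassigned by the next check before they are read)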
														
 
															+            if len(text_time_deque) >= hard_break_even_on_background_noise_min_texts:
														
 
															+                texts = [t[1] for t in text_time_deque]
														
 
															+                first_text = texts[0]
														
 
															+                last_text = texts[-1]
														
 
															+
														
 
															+
														
 
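															+            # Check if at least 3 texts remain in the hard_break_even_on_background_noise window (NOTE(review): hard-coded 3 bypasses hard_break_even_on_background_noise_min_texts)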
														
 
															+            if len(text_time_deque) >= 3:
														
 
															+                texts = [t[1] for t in text_time_deque]
														
 
															+                first_text = texts[0]
														
 
															+                last_text = texts[-1]
														
 
															+
														
 
															+                # Compute the similarity ratio between the first and last texts
														
 
															+                similarity = SequenceMatcher(None, first_text, last_text).ratio()
														
 
															+                #print(f"Similarity: {similarity:.2f}")
														
 
															+
														
 
															+                if similarity > hard_break_even_on_background_noise_min_similarity and len(first_text) > hard_break_even_on_background_noise_min_chars:
														
 
															+                    abrupt_stop = True
														
 
															+                    recorder.stop()
														
 
															+
														
 
															             rich_text = Text()
														
 
															             for i, sentence in enumerate(full_sentences):
														
 
															                 if i % 2 == 0:
														
@@ -198,11 +239,12 @@ if __name__ == '__main__':
 
															             text_queue.task_done()
														
 
															     def process_text(text):
														
 
															-        global recorder, full_sentences, prev_text
														
 
															+        global recorder, full_sentences, prev_text, abrupt_stop
														
 
															         if IS_DEBUG: print(f"SENTENCE: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
														
 
															         recorder.post_speech_silence_duration = unknown_sentence_detection_pause
														
 
															         text = preprocess_text(text)
														
 
															         text = text.rstrip()
														
 
															+        text_time_deque.clear()
														
 
															         if text.endswith("..."):
														
 
															             text = text[:-2]
														
@@ -210,11 +252,20 @@ if __name__ == '__main__':
 
															         prev_text = ""
														
 
															         text_detected("")
														
 
															+        if abrupt_stop:
														
 
															+            abrupt_stop = False
														
 
															+            if relisten_on_abrupt_stop:
														
 
															+                recorder.listen()
														
 
															+                recorder.start()
														
 
															+                if hasattr(recorder, "last_words_buffer"):
														
 
															+                    recorder.frames.extend(list(recorder.last_words_buffer))
														
 
															+
														
 
															     # Recorder configuration
														
 
															     recorder_config = {
														
 
															         'spinner': False,
														
 
															         'model': 'medium.en',
														
 
															-        # 'input_device_index': 2,
														
 
															+        #'input_device_index': 1, # mic
														
 
															+        #'input_device_index': 2, # stereomix
														
 
															         'realtime_model_type': 'tiny.en',
														
 
															         'language': 'en',
														
 
															         #'silero_sensitivity': 0.05,