Kaynağa Gözat

added silence_timing parameter

KoljaB 6 ay önce
ebeveyn
işleme
bc17255953
1 değiştirilmiş dosya ile 72 ekleme ve 8 silme
  1. 72 8
      RealtimeSTT_server/stt_server.py

+ 72 - 8
RealtimeSTT_server/stt_server.py

@@ -26,6 +26,7 @@ stt-server [OPTIONS]
     - `-w, --wake_words`: Wake word(s) to trigger listening; default "".
     - `-D, --debug`: Enable debug logging.
     - `-W, --write`: Save audio to WAV file.
+    - `-s, --silence_timing`: Enable dynamic silence duration for sentence detection; default True. 
     - `--silero_sensitivity`: Silero VAD sensitivity (0-1); default 0.05.
     - `--silero_use_onnx`: Use Silero ONNX model; default False.
     - `--webrtc_sensitivity`: WebRTC VAD sensitivity (0-3); default 3.
@@ -61,6 +62,8 @@ The server will broadcast real-time transcription updates to all connected clien
 """
 
 from .install_packages import check_and_install_packages
+from difflib import SequenceMatcher
+from collections import deque
 from datetime import datetime
 import logging
 import asyncio
@@ -72,9 +75,17 @@ debug_logging = False
 extended_logging = False
 send_recorded_chunk = False
 log_incoming_chunks = False
-stt_optimizations = False
+silence_timing = False
 writechunks = False#
 wav_file = None
+
+hard_break_even_on_background_noise = 3.0
+hard_break_even_on_background_noise_min_texts = 3
+hard_break_even_on_background_noise_min_similarity = 0.99
+hard_break_even_on_background_noise_min_chars = 15
+
+
+text_time_deque = deque()
 loglevel = logging.WARNING
 
 FORMAT = pyaudio.paInt16
@@ -176,6 +187,12 @@ def preprocess_text(text):
     if text.startswith("..."):
         text = text[3:]
 
+    if text.endswith("...'."):
+        text = text[:-1]
+
+    if text.endswith("...'"):
+        text = text[:-1]
+
     # Remove any leading whitespace again after the ellipsis removal
     text = text.lstrip()
 
@@ -196,15 +213,51 @@ def text_detected(text, loop):
 
     text = preprocess_text(text)
 
-    if stt_optimizations:
-        sentence_end_marks = ['.', '!', '?', '。'] 
-        if text.endswith("..."):
+    if silence_timing:
+        def ends_with_ellipsis(text: str):
+            if text.endswith("..."):
+                return True
+            if len(text) > 1 and text[:-1].endswith("..."):
+                return True
+            return False
+
+        def sentence_end(text: str):
+            sentence_end_marks = ['.', '!', '?', '。']
+            if text and text[-1] in sentence_end_marks:
+                return True
+            return False
+
+
+        if ends_with_ellipsis(text):
             recorder.post_speech_silence_duration = global_args.mid_sentence_detection_pause
-        elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
+        elif sentence_end(text) and sentence_end(prev_text) and not ends_with_ellipsis(prev_text):
             recorder.post_speech_silence_duration = global_args.end_of_sentence_detection_pause
         else:
             recorder.post_speech_silence_duration = global_args.unknown_sentence_detection_pause
 
+
+        # Append the new text with its timestamp
+        current_time = time.time()
+        text_time_deque.append((current_time, text))
+
+        # Remove texts older than hard_break_even_on_background_noise seconds
+        while text_time_deque and text_time_deque[0][0] < current_time - hard_break_even_on_background_noise:
+            text_time_deque.popleft()
+
+        # Check if at least hard_break_even_on_background_noise_min_texts texts have arrived within the last hard_break_even_on_background_noise seconds
+        if len(text_time_deque) >= hard_break_even_on_background_noise_min_texts:
+            texts = [t[1] for t in text_time_deque]
+            first_text = texts[0]
+            last_text = texts[-1]
+
+            # Compute the similarity ratio between the first and last texts
+            similarity = SequenceMatcher(None, first_text, last_text).ratio()
+
+            if similarity > hard_break_even_on_background_noise_min_similarity and len(first_text) > hard_break_even_on_background_noise_min_chars:
+                recorder.stop()
+                recorder.clear_audio_queue()
+                prev_text = ""
+
     prev_text = text
 
     # Put the message in the audio queue to be sent to clients
@@ -296,7 +349,7 @@ def on_transcription_start(loop):
 
 # Define the server's arguments
 def parse_arguments():
-    global debug_logging, extended_logging, loglevel, writechunks, log_incoming_chunks
+    global debug_logging, extended_logging, loglevel, writechunks, log_incoming_chunks, dynamic_silence_timing
 
     import argparse
     parser = argparse.ArgumentParser(description='Start the Speech-to-Text (STT) server with various configuration options.')
@@ -327,6 +380,9 @@ def parse_arguments():
     parser.add_argument("-W", "--write", metavar="FILE",
                         help="Save received audio to a WAV file")
 
+    parser.add_argument('-s', '--silence_timing', action='store_true', default=True,
+                    help='Enable dynamic adjustment of silence duration for sentence detection. Adjusts post-speech silence duration based on detected sentence structure and punctuation. Default is True.')
+
     parser.add_argument('--silero_sensitivity', type=float, default=0.05,
                         help='Sensitivity level for Silero Voice Activity Detection (VAD), with a range from 0 to 1. Lower values make the model less sensitive, useful for noisy environments. Default is 0.05.')
 
@@ -360,9 +416,14 @@ def parse_arguments():
     parser.add_argument('--beam_size_realtime', type=int, default=3,
                         help='Beam size for the real-time transcription model. A smaller beam size allows for faster real-time processing but may reduce accuracy. Default is 3.')
 
+    # parser.add_argument('--initial_prompt', type=str,
+    #                     default='End incomplete sentences with ellipses.\nExamples:\nComplete: The sky is blue.\nIncomplete: When the sky...\nComplete: She walked home.\nIncomplete: Because he...',
+    #                     help='Initial prompt that guides the transcription model to produce transcriptions in a particular style or format. The default provides instructions for handling sentence completions and ellipsis usage.')
+    
     parser.add_argument('--initial_prompt', type=str,
-                        default='End incomplete sentences with ellipses. Examples: Complete: The sky is blue. Incomplete: When the sky... Complete: She walked home. Incomplete: Because he...',
+                        default="Incomplete thoughts should end with '...'. Examples of complete thoughts: 'The sky is blue.' 'She walked home.' Examples of incomplete thoughts: 'When the sky...' 'Because he...'",
                         help='Initial prompt that guides the transcription model to produce transcriptions in a particular style or format. The default provides instructions for handling sentence completions and ellipsis usage.')
+    
 
     parser.add_argument('--end_of_sentence_detection_pause', type=float, default=0.45,
                         help='The duration of silence (in seconds) that the model should interpret as the end of a sentence. This helps the system detect when to finalize the transcription of a sentence. Default is 0.45 seconds.')
@@ -409,6 +470,7 @@ def parse_arguments():
     extended_logging = args.use_extended_logging
     writechunks = args.write
     log_incoming_chunks = args.logchunks
+    dynamic_silence_timing = args.silence_timing
 
     if debug_logging:
         loglevel = logging.DEBUG
@@ -424,7 +486,7 @@ def parse_arguments():
     return args
 
 def _recorder_thread(loop):
-    global recorder, prev_text, stop_recorder
+    global recorder, stop_recorder
     print(f"{bcolors.OKGREEN}Initializing RealtimeSTT server with parameters:{bcolors.ENDC}")
     for key, value in recorder_config.items():
         print(f"    {bcolors.OKBLUE}{key}{bcolors.ENDC}: {value}")
@@ -433,6 +495,8 @@ def _recorder_thread(loop):
     recorder_ready.set()
     
     def process_text(full_sentence):
+        global prev_text
+        prev_text = ""
         full_sentence = preprocess_text(full_sentence)
         message = json.dumps({
             'type': 'fullSentence',