@@ -26,6 +26,7 @@ stt-server [OPTIONS]
 - `-w, --wake_words`: Wake word(s) to trigger listening; default "".
 - `-D, --debug`: Enable debug logging.
 - `-W, --write`: Save audio to WAV file.
+- `-s, --silence_timing`: Enable dynamic silence duration for sentence detection; default True.
 - `--silero_sensitivity`: Silero VAD sensitivity (0-1); default 0.05.
 - `--silero_use_onnx`: Use Silero ONNX model; default False.
 - `--webrtc_sensitivity`: WebRTC VAD sensitivity (0-3); default 3.
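
For context, a hypothetical invocation combining the new flag with the options documented in this docstring (the WAV filename is made up; since `--silence_timing` defaults to True, passing `-s` only makes the setting explicit):

    stt-server -s -D -W session.wav
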
@@ -61,6 +62,8 @@ The server will broadcast real-time transcription updates to all connected clients.
 """
 
 from .install_packages import check_and_install_packages
+from difflib import SequenceMatcher
+from collections import deque
 from datetime import datetime
 import logging
 import asyncio
@@ -72,9 +75,17 @@ debug_logging = False
 extended_logging = False
 send_recorded_chunk = False
 log_incoming_chunks = False
-stt_optimizations = False
+silence_timing = False
 writechunks = False#
 wav_file = None
+
+hard_break_even_on_background_noise = 3.0
+hard_break_even_on_background_noise_min_texts = 3
+hard_break_even_on_background_noise_min_similarity = 0.99
+hard_break_even_on_background_noise_min_chars = 15
+
+
+text_time_deque = deque()
 loglevel = logging.WARNING
 
 FORMAT = pyaudio.paInt16
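
To make the 0.99 value of `hard_break_even_on_background_noise_min_similarity` concrete, here is an illustrative sketch (not part of the patch; the sample partial transcripts are invented) of how `difflib.SequenceMatcher` scores near-identical strings:

from difflib import SequenceMatcher

# A realtime partial that keeps arriving unchanged (e.g. over background noise)
stalled_a = "so that's why I decided to"
stalled_b = "so that's why I decided to"
# A partial that is actually progressing
growing_b = "so that's why I decided to move"

print(SequenceMatcher(None, stalled_a, stalled_b).ratio())  # 1.0   -> above the 0.99 threshold
print(SequenceMatcher(None, stalled_a, growing_b).ratio())  # ~0.91 -> below it, no hard break
# The patch additionally requires at least 3 texts within the 3.0 s window and
# more than 15 characters before it stops the recorder.
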
@@ -176,6 +187,12 @@ def preprocess_text(text):
     if text.startswith("..."):
         text = text[3:]
 
+    if text.endswith("...'."):
+        text = text[:-1]
+
+    if text.endswith("...'"):
+        text = text[:-1]
+
     # Remove any leading whitespaces again after ellipses removal
     text = text.lstrip()
 
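
The intent of the two added `endswith` checks is easier to see on a concrete string; the sketch below is illustrative only (the sample transcript is invented) and mirrors the added lines:

text = "He said 'maybe later...'."

# First check: drop the trailing period after a quoted ellipsis
if text.endswith("...'."):
    text = text[:-1]     # -> "He said 'maybe later...'"

# Second check: drop the trailing quote as well
if text.endswith("...'"):
    text = text[:-1]     # -> "He said 'maybe later..."

print(text)  # now ends in a plain "...", which the ellipsis handling downstream can recognize
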
@@ -196,15 +213,51 @@ def text_detected(text, loop):
 
     text = preprocess_text(text)
 
-    if stt_optimizations:
-        sentence_end_marks = ['.', '!', '?', '。']
-        if text.endswith("..."):
+    if silence_timing:
+        def ends_with_ellipsis(text: str):
+            if text.endswith("..."):
+                return True
+            if len(text) > 1 and text[:-1].endswith("..."):
+                return True
+            return False
+
+        def sentence_end(text: str):
+            sentence_end_marks = ['.', '!', '?', '。']
+            if text and text[-1] in sentence_end_marks:
+                return True
+            return False
+
+
+        if ends_with_ellipsis(text):
             recorder.post_speech_silence_duration = global_args.mid_sentence_detection_pause
-        elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
+        elif sentence_end(text) and sentence_end(prev_text) and not ends_with_ellipsis(prev_text):
             recorder.post_speech_silence_duration = global_args.end_of_sentence_detection_pause
         else:
             recorder.post_speech_silence_duration = global_args.unknown_sentence_detection_pause
 
+
+        # Append the new text with its timestamp
+        current_time = time.time()
+        text_time_deque.append((current_time, text))
+
+        # Remove texts older than hard_break_even_on_background_noise seconds
+        while text_time_deque and text_time_deque[0][0] < current_time - hard_break_even_on_background_noise:
+            text_time_deque.popleft()
+
+        # Check if at least hard_break_even_on_background_noise_min_texts texts have arrived within the last hard_break_even_on_background_noise seconds
+        if len(text_time_deque) >= hard_break_even_on_background_noise_min_texts:
+            texts = [t[1] for t in text_time_deque]
+            first_text = texts[0]
+            last_text = texts[-1]
+
+            # Compute the similarity ratio between the first and last texts
+            similarity = SequenceMatcher(None, first_text, last_text).ratio()
+
+            if similarity > hard_break_even_on_background_noise_min_similarity and len(first_text) > hard_break_even_on_background_noise_min_chars:
+                recorder.stop()
+                recorder.clear_audio_queue()
+                prev_text = ""
+
     prev_text = text
 
     # Put the message in the audio queue to be sent to clients
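
One detail worth noting in `ends_with_ellipsis` above is its second branch, which also accepts an ellipsis followed by a single trailing character such as a quote or question mark. A quick illustrative check, with invented strings:

samples = ["When the sky...", "When the sky...'", "When the sky...?", "When the sky."]
for s in samples:
    # same predicate as ends_with_ellipsis() in the hunk above
    hit = s.endswith("...") or (len(s) > 1 and s[:-1].endswith("..."))
    print(f"{s!r}: {hit}")
# The first three print True (the longer mid-sentence pause applies);
# "When the sky." prints False and falls through to the sentence_end() check.
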
@@ -296,7 +349,7 @@ def on_transcription_start(loop):
 
 # Define the server's arguments
 def parse_arguments():
-    global debug_logging, extended_logging, loglevel, writechunks, log_incoming_chunks
+    global debug_logging, extended_logging, loglevel, writechunks, log_incoming_chunks, silence_timing
     import argparse
     parser = argparse.ArgumentParser(description='Start the Speech-to-Text (STT) server with various configuration options.')
@@ -327,6 +380,9 @@ def parse_arguments():
     parser.add_argument("-W", "--write", metavar="FILE",
                         help="Save received audio to a WAV file")
 
+    parser.add_argument('-s', '--silence_timing', action='store_true', default=True,
+                        help='Enable dynamic adjustment of silence duration for sentence detection. Adjusts post-speech silence duration based on detected sentence structure and punctuation. Default is True.')
+
     parser.add_argument('--silero_sensitivity', type=float, default=0.05,
                         help='Sensitivity level for Silero Voice Activity Detection (VAD), with a range from 0 to 1. Lower values make the model less sensitive, useful for noisy environments. Default is 0.05.')
 
@@ -360,9 +416,14 @@ def parse_arguments():
     parser.add_argument('--beam_size_realtime', type=int, default=3,
                         help='Beam size for the real-time transcription model. A smaller beam size allows for faster real-time processing but may reduce accuracy. Default is 3.')
 
+    # parser.add_argument('--initial_prompt', type=str,
+    #                     default='End incomplete sentences with ellipses.\nExamples:\nComplete: The sky is blue.\nIncomplete: When the sky...\nComplete: She walked home.\nIncomplete: Because he...',
+    #                     help='Initial prompt that guides the transcription model to produce transcriptions in a particular style or format. The default provides instructions for handling sentence completions and ellipsis usage.')
+
     parser.add_argument('--initial_prompt', type=str,
-                        default='End incomplete sentences with ellipses. Examples: Complete: The sky is blue. Incomplete: When the sky... Complete: She walked home. Incomplete: Because he...',
+                        default="Incomplete thoughts should end with '...'. Examples of complete thoughts: 'The sky is blue.' 'She walked home.' Examples of incomplete thoughts: 'When the sky...' 'Because he...'",
                         help='Initial prompt that guides the transcription model to produce transcriptions in a particular style or format. The default provides instructions for handling sentence completions and ellipsis usage.')
 
+
     parser.add_argument('--end_of_sentence_detection_pause', type=float, default=0.45,
                         help='The duration of silence (in seconds) that the model should interpret as the end of a sentence. This helps the system detect when to finalize the transcription of a sentence. Default is 0.45 seconds.')
@@ -409,6 +470,7 @@ def parse_arguments():
     extended_logging = args.use_extended_logging
     writechunks = args.write
     log_incoming_chunks = args.logchunks
+    silence_timing = args.silence_timing
 
     if debug_logging:
         loglevel = logging.DEBUG
@@ -424,7 +486,7 @@ def parse_arguments():
     return args
 
 def _recorder_thread(loop):
-    global recorder, prev_text, stop_recorder
+    global recorder, stop_recorder
     print(f"{bcolors.OKGREEN}Initializing RealtimeSTT server with parameters:{bcolors.ENDC}")
     for key, value in recorder_config.items():
         print(f" {bcolors.OKBLUE}{key}{bcolors.ENDC}: {value}")
@@ -433,6 +495,8 @@ def _recorder_thread(loop):
     recorder_ready.set()
 
     def process_text(full_sentence):
+        global prev_text
+        prev_text = ""
         full_sentence = preprocess_text(full_sentence)
         message = json.dumps({
             'type': 'fullSentence',