KoljaB 6 months ago
parent
commit
1f959e0bfc
2 files changed, 71 insertions and 27 deletions
  1. +33 −9
      RealtimeSTT/audio_recorder_client.py
  2. +38 −18
      server/stt_server.py

+ 33 - 9
RealtimeSTT/audio_recorder_client.py

@@ -1,7 +1,9 @@
 log_outgoing_chunks = False
+debug_mode = False
 
 from typing import Iterable, List, Optional, Union
 from urllib.parse import urlparse
+from datetime import datetime
 import subprocess
 import websocket
 import threading
@@ -44,6 +46,18 @@ INIT_HANDLE_BUFFER_OVERFLOW = False
 if platform.system() != 'Darwin':
     INIT_HANDLE_BUFFER_OVERFLOW = True
 
+# Define ANSI color codes for terminal output
+class bcolors:
+    HEADER = '\033[95m'   # Magenta
+    OKBLUE = '\033[94m'   # Blue
+    OKCYAN = '\033[96m'   # Cyan
+    OKGREEN = '\033[92m'  # Green
+    WARNING = '\033[93m'  # Yellow
+    FAIL = '\033[91m'     # Red
+    ENDC = '\033[0m'      # Reset to default
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
 class AudioToTextRecorderClient:
     """
     A class responsible for capturing audio from the microphone, detecting
@@ -241,10 +255,10 @@ class AudioToTextRecorderClient:
                 if self.final_text_ready.wait(timeout=wait_interval):
                     break  # Break if transcription is ready
                 
-                if not self.realtime_text == self.submitted_realtime_text:
-                    if self.on_realtime_transcription_update:
-                        self.on_realtime_transcription_update(self.realtime_text)
-                    self.submitted_realtime_text = self.realtime_text
+                # if not self.realtime_text == self.submitted_realtime_text:
+                #     if self.on_realtime_transcription_update:
+                #         self.on_realtime_transcription_update(self.realtime_text)
+                #     self.submitted_realtime_text = self.realtime_text
 
                 total_wait_time += wait_interval
                 
@@ -287,8 +301,6 @@ class AudioToTextRecorderClient:
         Set the microphone on or off.
         """
         self.muted = not microphone_on
-        #self.call_method("set_microphone", [microphone_on])
-        # self.use_microphone.value = microphone_on
 
     def abort(self):
         self.call_method("abort")
@@ -372,8 +384,6 @@ class AudioToTextRecorderClient:
             args += ['--beam_size', str(self.beam_size)]
         if self.beam_size_realtime is not None:
             args += ['--beam_size_realtime', str(self.beam_size_realtime)]
-        if self.initial_prompt:
-            args += ['--initial_prompt', self.initial_prompt]
         if self.wake_words is not None:
             args += ['--wake_words', str(self.wake_words)]
         if self.wake_words_sensitivity is not None:
@@ -403,11 +413,15 @@ class AudioToTextRecorderClient:
             parsed_data_url = urlparse(self.data_url)
             if parsed_data_url.port:
                 args += ['--data_port', str(parsed_data_url.port)]
+        if self.initial_prompt:
+            sanitized_prompt = self.initial_prompt.replace("\n", "\\n")
+            args += ['--initial_prompt', sanitized_prompt]
 
         # Start the subprocess with the mapped arguments
         if os.name == 'nt':  # Windows
             cmd = 'start /min cmd /c ' + subprocess.list2cmdline(args)
-            # print(f"Opening server with cli command: {cmd}")
+            if debug_mode:
+                print(f"Opening server with cli command: {cmd}")
             subprocess.Popen(cmd, shell=True)
         else:  # Unix-like systems
             subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, start_new_session=True)
@@ -570,6 +584,16 @@ class AudioToTextRecorderClient:
                 if data['text'] != self.realtime_text:
                     self.realtime_text = data['text']
 
+                    timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+                    print(f"Realtime text [{timestamp}]: {bcolors.OKCYAN}{self.realtime_text}{bcolors.ENDC}")
+
+                    if self.on_realtime_transcription_update:
+                        # Call the callback in a new thread to avoid blocking
+                        threading.Thread(
+                            target=self.on_realtime_transcription_update,
+                            args=(self.realtime_text,)
+                        ).start()
+
             # Handle full sentences
             elif data.get('type') == 'fullSentence':
                 self.final_text = data['text']
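
The client now escapes newlines in initial_prompt before handing it to the server process on the command line, and the server (second file below) reverses the substitution after argument parsing. A minimal sketch of that round-trip, with illustrative helper names that are not part of RealtimeSTT:

    # Sketch only: escape_prompt/unescape_prompt are hypothetical helpers.
    def escape_prompt(prompt: str) -> str:
        # Client side: encode real newlines so the prompt survives as a single CLI argument.
        return prompt.replace("\n", "\\n")

    def unescape_prompt(prompt: str) -> str:
        # Server side: restore the newlines after argparse has parsed the value.
        return prompt.replace("\\n", "\n")

    original = "First line.\nSecond line."
    assert unescape_prompt(escape_prompt(original)) == original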

+ 38 - 18
server/stt_server.py

@@ -69,8 +69,11 @@ The server will broadcast real-time transcription updates to all connected clien
 extended_logging = True
 send_recorded_chunk = False
 log_incoming_chunks = False
+stt_optimizations = False
 
 
+from .install_packages import check_and_install_packages
+from datetime import datetime
 import asyncio
 import base64
 import sys
@@ -78,7 +81,6 @@ import sys
 if sys.platform == 'win32':
     asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
 
-from .install_packages import check_and_install_packages
 
 check_and_install_packages([
     {
@@ -177,13 +179,14 @@ def text_detected(text, loop):
 
     text = preprocess_text(text)
 
-    sentence_end_marks = ['.', '!', '?', '。'] 
-    if text.endswith("..."):
-        recorder.post_speech_silence_duration = global_args.mid_sentence_detection_pause
-    elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
-        recorder.post_speech_silence_duration = global_args.end_of_sentence_detection_pause
-    else:
-        recorder.post_speech_silence_duration = global_args.unknown_sentence_detection_pause
+    if stt_optimizations:
+        sentence_end_marks = ['.', '!', '?', '。'] 
+        if text.endswith("..."):
+            recorder.post_speech_silence_duration = global_args.mid_sentence_detection_pause
+        elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
+            recorder.post_speech_silence_duration = global_args.end_of_sentence_detection_pause
+        else:
+            recorder.post_speech_silence_duration = global_args.unknown_sentence_detection_pause
 
     prev_text = text
 
@@ -193,10 +196,14 @@ def text_detected(text, loop):
         'text': text
     })
     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
+
+    # Get current timestamp in HH:MM:SS.nnn format
+    timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+
     if extended_logging:
-        print(f"Realtime text: {bcolors.OKCYAN}{text}{bcolors.ENDC}\n", flush=True, end="")
+        print(f"  [{timestamp}] Realtime text: {bcolors.OKCYAN}{text}{bcolors.ENDC}\n", flush=True, end="")
     else:
-        print(f"\r{bcolors.OKCYAN}{text}{bcolors.ENDC}", flush=True, end='')
+        print(f"\r[{timestamp}] {bcolors.OKCYAN}{text}{bcolors.ENDC}", flush=True, end='')
 
 def on_recording_start(loop):
     # Send a message to the client indicating recording has started
@@ -348,7 +355,7 @@ def parse_arguments():
     parser.add_argument('--wake_word_timeout', type=float, default=5.0,
                         help='Maximum time in seconds that the system will wait for a wake word before timing out. After this timeout, the system stops listening for wake words until reactivated. Default is 5.0 seconds.')
 
-    parser.add_argument('--wake_word_activation_delay', type=float, default=0.5,
+    parser.add_argument('--wake_word_activation_delay', type=float, default=20,
                         help='The delay in seconds before the wake word detection is activated after the system starts listening. This prevents false positives during the start of a session. Default is 0.5 seconds.')
 
     parser.add_argument('--wakeword_backend', type=str, default='pvporcupine',
@@ -369,8 +376,14 @@ def parse_arguments():
     parser.add_argument('--use_extended_logging', action='store_true',
                         help='Writes extensive log messages for the recording worker, that processes the audio chunks.')
 
+    # Parse arguments
+    args = parser.parse_args()
+
+    # Replace escaped newlines with actual newlines in initial_prompt
+    if args.initial_prompt:
+        args.initial_prompt = args.initial_prompt.replace("\\n", "\n")
 
-    return parser.parse_args()
+    return args
 
 def _recorder_thread(loop):
     global recorder, prev_text, stop_recorder
@@ -390,10 +403,12 @@ def _recorder_thread(loop):
         # Use the passed event loop here
         asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
 
+        timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+
         if extended_logging:
-            print(f"Full text: {bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}")
+            print(f"  [{timestamp}] Full text: {bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}\n", flush=True, end="")
         else:
-            print(f"\r{bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}\n")
+            print(f"\r[{timestamp}] {bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}\n")
     try:
         while not stop_recorder:
             recorder.text(process_text)
@@ -445,8 +460,9 @@ async def control_handler(websocket, path):
                                 value_formatted = f"{value:.2f}"
                             else:
                                 value_formatted = value
+                            timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
                             if extended_logging:
-                                print(f"{bcolors.OKGREEN}Set recorder.{parameter} to: {bcolors.OKBLUE}{value_formatted}{bcolors.ENDC}")
+                                print(f"  [{timestamp}] {bcolors.OKGREEN}Set recorder.{parameter} to: {bcolors.OKBLUE}{value_formatted}{bcolors.ENDC}")
                             # Optionally send a response back to the client
                             await websocket.send(json.dumps({"status": "success", "message": f"Parameter {parameter} set to {value}"}))
                         else:
@@ -469,8 +485,9 @@ async def control_handler(websocket, path):
 
                             value_truncated = value_formatted[:39] + "…" if len(value_formatted) > 40 else value_formatted
 
+                            timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
                             if extended_logging:
-                                print(f"{bcolors.OKGREEN}Get recorder.{parameter}: {bcolors.OKBLUE}{value_truncated}{bcolors.ENDC}")
+                                print(f"  [{timestamp}] {bcolors.OKGREEN}Get recorder.{parameter}: {bcolors.OKBLUE}{value_truncated}{bcolors.ENDC}")
                             response = {"status": "success", "parameter": parameter, "value": value}
                             if request_id is not None:
                                 response["request_id"] = request_id
@@ -490,7 +507,8 @@ async def control_handler(websocket, path):
                                 args = command_data.get("args", [])
                                 kwargs = command_data.get("kwargs", {})
                                 method(*args, **kwargs)
-                                print(f"{bcolors.OKGREEN}Called method recorder.{bcolors.OKBLUE}{method_name}{bcolors.ENDC}")
+                                timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+                                print(f"  [{timestamp}] {bcolors.OKGREEN}Called method recorder.{bcolors.OKBLUE}{method_name}{bcolors.ENDC}")
                                 await websocket.send(json.dumps({"status": "success", "message": f"Method {method_name} called"}))
                             else:
                                 print(f"{bcolors.WARNING}Recorder does not have method {method_name}{bcolors.ENDC}")
@@ -541,8 +559,10 @@ async def broadcast_audio_messages():
         message = await audio_queue.get()
         for conn in list(data_connections):
             try:
+                timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+
                 if extended_logging:
-                    print(f"    {bcolors.OKBLUE}Sending message: {message}{bcolors.ENDC}\n", flush=True, end="")
+                    print(f"  [{timestamp}] Sending message: {bcolors.OKBLUE}{message}{bcolors.ENDC}\n", flush=True, end="")
                 await conn.send(message)
             except websockets.exceptions.ConnectionClosed:
                 data_connections.remove(conn)
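
The same timestamp expression recurs throughout the stt_server.py changes: it formats the current time as HH:MM:SS.mmm by trimming the six-digit microsecond field down to milliseconds. A small standalone sketch of that pattern (the helper name is illustrative, not part of the codebase):

    from datetime import datetime

    def timestamp_ms() -> str:
        # strftime('%f') yields six microsecond digits; dropping the last three
        # leaves a millisecond-precision HH:MM:SS.mmm string.
        return datetime.now().strftime('%H:%M:%S.%f')[:-3]

    print(f"[{timestamp_ms()}] example log line")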