8 months ago · 1f959e0bfc
--- a/RealtimeSTT/audio_recorder_client.py
+++ b/RealtimeSTT/audio_recorder_client.py
@@ -1,7 +1,9 @@
 
															 log_outgoing_chunks = False
														
 
															+debug_mode = False
														
 
															 from typing import Iterable, List, Optional, Union
														
 
															 from urllib.parse import urlparse
														
 
															+from datetime import datetime
														
 
															 import subprocess
														
 
															 import websocket
														
 
															 import threading
														
@@ -44,6 +46,18 @@ INIT_HANDLE_BUFFER_OVERFLOW = False
 
															 if platform.system() != 'Darwin':
														
 
															     INIT_HANDLE_BUFFER_OVERFLOW = True
														
 
															+# ANSI escape sequences (SGR codes) used to colorize terminal output
														
 
															+class bcolors:
														
 
															+    HEADER = '\033[95m'   # Magenta
														
 
															+    OKBLUE = '\033[94m'   # Blue
														
 
															+    OKCYAN = '\033[96m'   # Cyan
														
 
															+    OKGREEN = '\033[92m'  # Green
														
 
															+    WARNING = '\033[93m'  # Yellow
														
 
															+    FAIL = '\033[91m'     # Red
														
 
															+    ENDC = '\033[0m'      # Reset to default
														
 
															+    BOLD = '\033[1m'      # Bold
														
 
															+    UNDERLINE = '\033[4m' # Underline
														
 
															+
														
 
															 class AudioToTextRecorderClient:
														
 
															     """
														
 
															     A class responsible for capturing audio from the microphone, detecting
														
@@ -241,10 +255,10 @@ class AudioToTextRecorderClient:
 
															                 if self.final_text_ready.wait(timeout=wait_interval):
														
 
															                     break  # Break if transcription is ready
														
 
															-                if not self.realtime_text == self.submitted_realtime_text:
														
 
															-                    if self.on_realtime_transcription_update:
														
 
															-                        self.on_realtime_transcription_update(self.realtime_text)
														
 
															-                    self.submitted_realtime_text = self.realtime_text
														
 
															+                # if not self.realtime_text == self.submitted_realtime_text:
														
 
															+                #     if self.on_realtime_transcription_update:
														
 
															+                #         self.on_realtime_transcription_update(self.realtime_text)
														
 
															+                #     self.submitted_realtime_text = self.realtime_text
														
 
															                 total_wait_time += wait_interval
														
@@ -287,8 +301,6 @@ class AudioToTextRecorderClient:
 
															         Set the microphone on or off.
														
 
															         """
														
 
															         self.muted = not microphone_on
														
 
															-        #self.call_method("set_microphone", [microphone_on])
														
 
															-        # self.use_microphone.value = microphone_on
														
 
															     def abort(self):
														
 
															         self.call_method("abort")
														
@@ -372,8 +384,6 @@ class AudioToTextRecorderClient:
 
															             args += ['--beam_size', str(self.beam_size)]
														
 
															         if self.beam_size_realtime is not None:
														
 
															             args += ['--beam_size_realtime', str(self.beam_size_realtime)]
														
 
															-        if self.initial_prompt:
														
 
															-            args += ['--initial_prompt', self.initial_prompt]
														
 
															         if self.wake_words is not None:
														
 
															             args += ['--wake_words', str(self.wake_words)]
														
 
															         if self.wake_words_sensitivity is not None:
														
@@ -403,11 +413,15 @@ class AudioToTextRecorderClient:
 
															             parsed_data_url = urlparse(self.data_url)
														
 
															             if parsed_data_url.port:
														
 
															                 args += ['--data_port', str(parsed_data_url.port)]
														
 
															+        if self.initial_prompt:
														
 
															+            sanitized_prompt = self.initial_prompt.replace("\n", "\\n")
														
 
															+            args += ['--initial_prompt', sanitized_prompt]
														
 
															         # Start the subprocess with the mapped arguments
														
 
															         if os.name == 'nt':  # Windows
														
 
															             cmd = 'start /min cmd /c ' + subprocess.list2cmdline(args)
														
 
															-            # print(f"Opening server with cli command: {cmd}")
														
 
															+            if debug_mode:
														
 
															+                print(f"Opening server with cli command: {cmd}")
														
 
															             subprocess.Popen(cmd, shell=True)
														
 
															         else:  # Unix-like systems
														
 
															             subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, start_new_session=True)
														
@@ -570,6 +584,16 @@ class AudioToTextRecorderClient:
 
															                 if data['text'] != self.realtime_text:
														
 
															                     self.realtime_text = data['text']
														
 
															+                    timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
														
 
															+                    print(f"Realtime text [{timestamp}]: {bcolors.OKCYAN}{self.realtime_text}{bcolors.ENDC}")
														
 
															+
														
 
															+                    if self.on_realtime_transcription_update:
														
 
															+                        # Call the callback in a new thread to avoid blocking
														
 
															+                        threading.Thread(
														
 
															+                            target=self.on_realtime_transcription_update,
														
 
															+                            args=(self.realtime_text,)
														
 
															+                        ).start()
														
 
															+
														
 
															             # Handle full sentences
														
 
															             elif data.get('type') == 'fullSentence':
														
 
															                 self.final_text = data['text']
														
--- a/server/stt_server.py
+++ b/server/stt_server.py
@@ -69,8 +69,11 @@ The server will broadcast real-time transcription updates to all connected clien
 
															 extended_logging = True
														
 
															 send_recorded_chunk = False
														
 
															 log_incoming_chunks = False
														
 
															+stt_optimizations = False
														
 
															+from .install_packages import check_and_install_packages
														
 
															+from datetime import datetime
														
 
															 import asyncio
														
 
															 import base64
														
 
															 import sys
														
@@ -78,7 +81,6 @@ import sys
 
															 if sys.platform == 'win32':
														
 
															     asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
														
 
															-from .install_packages import check_and_install_packages
														
 
															 check_and_install_packages([
														
 
															     {
														
@@ -177,13 +179,14 @@ def text_detected(text, loop):
 
															     text = preprocess_text(text)
														
 
															-    sentence_end_marks = ['.', '!', '?', '。'] 
														
 
															-    if text.endswith("..."):
														
 
															-        recorder.post_speech_silence_duration = global_args.mid_sentence_detection_pause
														
 
															-    elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
														
 
															-        recorder.post_speech_silence_duration = global_args.end_of_sentence_detection_pause
														
 
															-    else:
														
 
															-        recorder.post_speech_silence_duration = global_args.unknown_sentence_detection_pause
														
 
															+    if stt_optimizations:
														
 
															+        sentence_end_marks = ['.', '!', '?', '。'] 
														
 
															+        if text.endswith("..."):
														
 
															+            recorder.post_speech_silence_duration = global_args.mid_sentence_detection_pause
														
 
															+        elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
														
 
															+            recorder.post_speech_silence_duration = global_args.end_of_sentence_detection_pause
														
 
															+        else:
														
 
															+            recorder.post_speech_silence_duration = global_args.unknown_sentence_detection_pause
														
 
															     prev_text = text
														
@@ -193,10 +196,14 @@ def text_detected(text, loop):
 
															         'text': text
														
 
															     })
														
 
															     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
														
 
															+
														
 
															+    # Get current timestamp in HH:MM:SS.nnn format
														
 
															+    timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
														
 
															+
														
 
															     if extended_logging:
														
 
															-        print(f"Realtime text: {bcolors.OKCYAN}{text}{bcolors.ENDC}\n", flush=True, end="")
														
 
															+        print(f"  [{timestamp}] Realtime text: {bcolors.OKCYAN}{text}{bcolors.ENDC}\n", flush=True, end="")
														
 
															     else:
														
 
															-        print(f"\r{bcolors.OKCYAN}{text}{bcolors.ENDC}", flush=True, end='')
														
 
															+        print(f"\r[{timestamp}] {bcolors.OKCYAN}{text}{bcolors.ENDC}", flush=True, end='')
														
 
															 def on_recording_start(loop):
														
 
															     # Send a message to the client indicating recording has started
														
@@ -348,7 +355,7 @@ def parse_arguments():
 
															     parser.add_argument('--wake_word_timeout', type=float, default=5.0,
														
 
															                         help='Maximum time in seconds that the system will wait for a wake word before timing out. After this timeout, the system stops listening for wake words until reactivated. Default is 5.0 seconds.')
														
 
															-    parser.add_argument('--wake_word_activation_delay', type=float, default=0.5,
														
 
															+    parser.add_argument('--wake_word_activation_delay', type=float, default=20,
														
 
															                         help='The delay in seconds before the wake word detection is activated after the system starts listening. This prevents false positives during the start of a session. Default is 0.5 seconds.')
														
 
															     parser.add_argument('--wakeword_backend', type=str, default='pvporcupine',
														
@@ -369,8 +376,14 @@ def parse_arguments():
 
															     parser.add_argument('--use_extended_logging', action='store_true',
														
 
															                         help='Writes extensive log messages for the recording worker, that processes the audio chunks.')
														
 
															+    # Parse arguments
														
 
															+    args = parser.parse_args()
														
 
															+
														
 
															+    # Replace escaped newlines with actual newlines in initial_prompt
														
 
															+    if args.initial_prompt:
														
 
															+        args.initial_prompt = args.initial_prompt.replace("\\n", "\n")
														
 
															-    return parser.parse_args()
														
 
															+    return args
														
 
															 def _recorder_thread(loop):
														
 
															     global recorder, prev_text, stop_recorder
														
@@ -390,10 +403,12 @@ def _recorder_thread(loop):
 
															         # Use the passed event loop here
														
 
															         asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
														
 
															+        timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
														
 
															+
														
 
															         if extended_logging:
														
 
															-            print(f"Full text: {bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}")
														
 
															+            print(f"  [{timestamp}] Full text: {bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}\n", flush=True, end="")
														
 
															         else:
														
 
															-            print(f"\r{bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}\n")
														
 
															+            print(f"\r[{timestamp}] {bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}\n")
														
 
															     try:
														
 
															         while not stop_recorder:
														
 
															             recorder.text(process_text)
														
@@ -445,8 +460,9 @@ async def control_handler(websocket, path):
 
															                                 value_formatted = f"{value:.2f}"
														
 
															                             else:
														
 
															                                 value_formatted = value
														
 
															+                            timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
														
 
															                             if extended_logging:
														
 
															-                                print(f"{bcolors.OKGREEN}Set recorder.{parameter} to: {bcolors.OKBLUE}{value_formatted}{bcolors.ENDC}")
														
 
															+                                print(f"  [{timestamp}] {bcolors.OKGREEN}Set recorder.{parameter} to: {bcolors.OKBLUE}{value_formatted}{bcolors.ENDC}")
														
 
															                             # Optionally send a response back to the client
														
 
															                             await websocket.send(json.dumps({"status": "success", "message": f"Parameter {parameter} set to {value}"}))
														
 
															                         else:
														
@@ -469,8 +485,9 @@ async def control_handler(websocket, path):
 
															                             value_truncated = value_formatted[:39] + "…" if len(value_formatted) > 40 else value_formatted
														
 
															+                            timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
														
 
															                             if extended_logging:
														
 
															-                                print(f"{bcolors.OKGREEN}Get recorder.{parameter}: {bcolors.OKBLUE}{value_truncated}{bcolors.ENDC}")
														
 
															+                                print(f"  [{timestamp}] {bcolors.OKGREEN}Get recorder.{parameter}: {bcolors.OKBLUE}{value_truncated}{bcolors.ENDC}")
														
 
															                             response = {"status": "success", "parameter": parameter, "value": value}
														
 
															                             if request_id is not None:
														
 
															                                 response["request_id"] = request_id
														
@@ -490,7 +507,8 @@ async def control_handler(websocket, path):
 
															                                 args = command_data.get("args", [])
														
 
															                                 kwargs = command_data.get("kwargs", {})
														
 
															                                 method(*args, **kwargs)
														
 
															-                                print(f"{bcolors.OKGREEN}Called method recorder.{bcolors.OKBLUE}{method_name}{bcolors.ENDC}")
														
 
															+                                timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
														
 
															+                                print(f"  [{timestamp}] {bcolors.OKGREEN}Called method recorder.{bcolors.OKBLUE}{method_name}{bcolors.ENDC}")
														
 
															                                 await websocket.send(json.dumps({"status": "success", "message": f"Method {method_name} called"}))
														
 
															                             else:
														
 
															                                 print(f"{bcolors.WARNING}Recorder does not have method {method_name}{bcolors.ENDC}")
														
@@ -541,8 +559,10 @@ async def broadcast_audio_messages():
 
															         message = await audio_queue.get()
														
 
															         for conn in list(data_connections):
														
 
															             try:
														
 
															+                timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
														
 
															+
														
 
															                 if extended_logging:
														
 
															-                    print(f"    {bcolors.OKBLUE}Sending message: {message}{bcolors.ENDC}\n", flush=True, end="")
														
 
															+                    print(f"  [{timestamp}] Sending message: {bcolors.OKBLUE}{message}{bcolors.ENDC}\n", flush=True, end="")
														
 
															                 await conn.send(message)
														
 
															             except websockets.exceptions.ConnectionClosed:
														
 
															                 data_connections.remove(conn)