
upgrade server

KoljaB · 6 months ago · commit cacb7a1f27

5 changed files with 376 additions and 108 deletions
  1. README.md (+12 -1)
  2. RealtimeSTT/audio_recorder.py (+96 -21)
  3. RealtimeSTT/audio_recorder_client.py (+72 -3)
  4. server/stt_server.py (+195 -82)
  5. setup.py (+1 -1)

+ 12 - 1
README.md

@@ -28,7 +28,7 @@ https://github.com/user-attachments/assets/797e6552-27cd-41b1-a7f3-e5cbc72094f5
 
 ### Updates
 
-Latest Version: v0.3.1
+Latest Version: v0.3.2
 
 See [release history](https://github.com/KoljaB/RealtimeSTT/releases).
 
@@ -549,6 +549,17 @@ Suggested starting parameters for OpenWakeWord usage:
         ) as recorder:
 ```
 
+## FAQ
+
+### Q: I encountered the following error: "Unable to load any of {libcudnn_ops.so.9.1.0, libcudnn_ops.so.9.1, libcudnn_ops.so.9, libcudnn_ops.so} Invalid handle. Cannot load symbol cudnnCreateTensorDescriptor." How do I fix this?
+
+**A:** This issue arises from a version mismatch between `ctranslate2` and cuDNN. The `ctranslate2` library was updated to version 4.5.0, which uses cuDNN 9.2. There are two ways to resolve it:
+1. **Downgrade `ctranslate2` to version 4.4.0**:
+   ```bash
+   pip install ctranslate2==4.4.0
+   ```
+2. **Upgrade cuDNN** on your system to version 9.2 or above.
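+
+To check which version of `ctranslate2` is currently installed before choosing a fix (a quick sanity check, assuming the package imports cleanly):
+```python
+# Print the installed ctranslate2 version; 4.5.0 and later expect cuDNN 9.x
+import ctranslate2
+print(ctranslate2.__version__)
+```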
+
 ## Contribution
 
 Contributions are always welcome! 

+ 96 - 21
RealtimeSTT/audio_recorder.py

@@ -463,6 +463,7 @@ class AudioToTextRecorder:
             Exception: Errors related to initializing transcription
             model, wake word detection, or audio recording.
         """
+
         self.language = language
         self.compute_type = compute_type
         self.input_device_index = input_device_index
@@ -598,6 +599,11 @@ class AudioToTextRecorder:
 
         logging.info("Starting RealTimeSTT")
 
+        if use_extended_logging:
+            logging.info("RealtimeSTT was called with these parameters:")
+            for param, value in locals().items():
+                logging.info(f"{param}: {value}")
+
         self.interrupt_stop_event = mp.Event()
         self.was_interrupted = mp.Event()
         self.main_transcription_ready_event = mp.Event()
@@ -922,14 +928,78 @@ class AudioToTextRecorder:
 
         def initialize_audio_stream(audio_interface, sample_rate, chunk_size):
             nonlocal input_device_index
             """Initialize the audio stream with error handling."""
+
+            def validate_device(device_index):
+                """Validate that the device exists and is actually available for input."""
+                try:
+                    device_info = audio_interface.get_device_info_by_index(device_index)
+                    if not device_info.get('maxInputChannels', 0) > 0:
+                        return False
+
+                    # Try to actually read from the device
+                    test_stream = audio_interface.open(
+                        format=pyaudio.paInt16,
+                        channels=1,
+                        rate=target_sample_rate,
+                        input=True,
+                        frames_per_buffer=chunk_size,
+                        input_device_index=device_index,
+                        start=False  # Don't start the stream yet
+                    )
+
+                    # Start the stream and try to read from it
+                    test_stream.start_stream()
+                    test_data = test_stream.read(chunk_size, exception_on_overflow=False)
+                    test_stream.stop_stream()
+                    test_stream.close()
+
+                    # Check if we got valid data
+                    if len(test_data) == 0:
+                        return False
+
+                    return True
+
+                except Exception as e:
+                    logging.debug(f"Device validation failed: {e}")
+                    return False
+
             while not shutdown_event.is_set():
                 try:
-                    # Check and assign the input device index if it is not set
-                    if input_device_index is None:
-                        default_device = audio_interface.get_default_input_device_info()
-                        input_device_index = default_device['index']
+                    # First, get a list of all available input devices
+                    input_devices = []
+                    for i in range(audio_interface.get_device_count()):
+                        try:
+                            device_info = audio_interface.get_device_info_by_index(i)
+                            if device_info.get('maxInputChannels', 0) > 0:
+                                input_devices.append(i)
+                        except Exception:
+                            continue
 
+                    if not input_devices:
+                        raise Exception("No input devices found")
+
+                    # If input_device_index is None or invalid, try to find a working device
+                    if input_device_index is None or input_device_index not in input_devices:
+                        # First try the default device
+                        try:
+                            default_device = audio_interface.get_default_input_device_info()
+                            if validate_device(default_device['index']):
+                                input_device_index = default_device['index']
+                        except Exception:
+                            pass
+                        # Fall back to any other available input device if the default is missing or fails validation
+                        if input_device_index is None or input_device_index not in input_devices:
+                            for device_index in input_devices:
+                                if validate_device(device_index):
+                                    input_device_index = device_index
+                                    break
+                            else:
+                                raise Exception("No working input devices found")
+
+                    # Validate the selected device one final time
+                    if not validate_device(input_device_index):
+                        raise Exception("Selected device validation failed")
+
+                    # If we get here, we have a validated device
                     stream = audio_interface.open(
                         format=pyaudio.paInt16,
                         channels=1,
@@ -938,13 +1008,15 @@ class AudioToTextRecorder:
                         frames_per_buffer=chunk_size,
                         input_device_index=input_device_index,
                     )
-                    logging.info("Microphone connected successfully.")
+
+                    logging.info(f"Microphone connected and validated (input_device_index: {input_device_index})")
                     return stream
 
                 except Exception as e:
                     logging.error(f"Microphone connection failed: {e}. Retrying...")
                     input_device_index = None
-                    time.sleep(3)  # Wait for 3 seconds before retrying
+                    time.sleep(3)  # Wait before retrying
+                    continue
 
         def preprocess_audio(chunk, original_sample_rate, target_sample_rate):
             """Preprocess audio chunk similar to feed_audio method."""
@@ -980,7 +1052,8 @@ class AudioToTextRecorder:
         def setup_audio():  
             nonlocal audio_interface, stream, device_sample_rate, input_device_index
             try:
-                audio_interface = pyaudio.PyAudio()
+                if audio_interface is None:
+                    audio_interface = pyaudio.PyAudio()
                 if input_device_index is None:
                     try:
                         default_device = audio_interface.get_default_input_device_info()
@@ -1024,6 +1097,7 @@ class AudioToTextRecorder:
         silero_buffer_size = 2 * buffer_size  # silero complains if too short
 
         time_since_last_buffer_message = 0
+
         try:
             while not shutdown_event.is_set():
                 try:
@@ -1057,7 +1131,7 @@ class AudioToTextRecorder:
                     else:
                         logging.error(f"OSError during recording: {e}")
                         # Attempt to reinitialize the stream
-                        logging.info("Attempting to reinitialize the audio stream...")
+                        logging.error("Attempting to reinitialize the audio stream...")
 
                         try:
                             if stream:
@@ -1065,9 +1139,6 @@ class AudioToTextRecorder:
                                 stream.close()
                         except Exception as e:
                             pass
-
-                        if audio_interface:
-                            audio_interface.terminate()
                         
                         # Wait a bit before trying to reinitialize
                         time.sleep(1)
@@ -1076,7 +1147,7 @@ class AudioToTextRecorder:
                             logging.error("Failed to reinitialize audio stream. Exiting.")
                             break
                         else:
-                            logging.info("Audio stream reinitialized successfully.")
+                            logging.error("Audio stream reinitialized successfully.")
                     continue
 
                 except Exception as e:
@@ -1086,14 +1157,15 @@ class AudioToTextRecorder:
                     logging.error(f"Error: {e}")
                     # Attempt to reinitialize the stream
                     logging.info("Attempting to reinitialize the audio stream...")
-                    if stream:
-                        stream.stop_stream()
-                        stream.close()
-                    if audio_interface:
-                        audio_interface.terminate()
+                    try:
+                        if stream:
+                            stream.stop_stream()
+                            stream.close()
+                    except Exception:
+                        pass
                     
                     # Wait a bit before trying to reinitialize
-                    time.sleep(0.5)
+                    time.sleep(1)
                     
                     if not setup_audio():
                         logging.error("Failed to reinitialize audio stream. Exiting.")
@@ -1110,9 +1182,12 @@ class AudioToTextRecorder:
             if buffer:
                 audio_queue.put(bytes(buffer))
             
-            if stream:
-                stream.stop_stream()
-                stream.close()
+            try:
+                if stream:
+                    stream.stop_stream()
+                    stream.close()
+            except Exception:
+                pass
             if audio_interface:
                 audio_interface.terminate()
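
The validation logic added above probes each candidate device by opening a short-lived test stream and reading a single chunk. A standalone sketch of that probing pattern (a minimal illustration with hypothetical names, assuming PyAudio is installed and at least one capture device is present):

```python
import pyaudio

def find_working_input_device(rate=16000, chunk_size=1024):
    """Return the index of the first input device that yields audio data, else None."""
    pa = pyaudio.PyAudio()
    try:
        for index in range(pa.get_device_count()):
            info = pa.get_device_info_by_index(index)
            if info.get('maxInputChannels', 0) <= 0:
                continue  # skip output-only devices
            try:
                stream = pa.open(format=pyaudio.paInt16, channels=1, rate=rate,
                                 input=True, frames_per_buffer=chunk_size,
                                 input_device_index=index)
                data = stream.read(chunk_size, exception_on_overflow=False)
                stream.stop_stream()
                stream.close()
                if data:
                    return index  # device opened and produced samples
            except OSError:
                continue  # device is listed but cannot be opened
    finally:
        pa.terminate()
    return None

if __name__ == '__main__':
    print(f"First working input device: {find_working_input_device()}")
```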
 

+ 72 - 3
RealtimeSTT/audio_recorder_client.py

@@ -1,3 +1,5 @@
+log_outgoing_chunks = False
+
 from typing import Iterable, List, Optional, Union
 from urllib.parse import urlparse
 import subprocess
@@ -208,6 +210,9 @@ class AudioToTextRecorderClient:
         self.realtime_text = ""
         self.final_text = ""
 
+        self.request_counter = 0
+        self.pending_requests = {}  # Map from request_id to threading.Event and value
+
         if self.debug_mode:
             print("Checking STT server")
         if not self.connect():
@@ -349,6 +354,8 @@ class AudioToTextRecorderClient:
             args += ['--language', self.language]
         if self.silero_sensitivity is not None:
             args += ['--silero_sensitivity', str(self.silero_sensitivity)]
+        if self.silero_use_onnx:
+            args.append('--silero_use_onnx')  # flag, no need for True/False
         if self.webrtc_sensitivity is not None:
             args += ['--webrtc_sensitivity', str(self.webrtc_sensitivity)]
         if self.min_length_of_recording is not None:
@@ -359,12 +366,35 @@ class AudioToTextRecorderClient:
             args += ['--realtime_processing_pause', str(self.realtime_processing_pause)]
         if self.early_transcription_on_silence is not None:
             args += ['--early_transcription_on_silence', str(self.early_transcription_on_silence)]
+        if self.silero_deactivity_detection:
+            args.append('--silero_deactivity_detection')  # flag, no need for True/False
         if self.beam_size is not None:
             args += ['--beam_size', str(self.beam_size)]
         if self.beam_size_realtime is not None:
             args += ['--beam_size_realtime', str(self.beam_size_realtime)]
         if self.initial_prompt:
             args += ['--initial_prompt', self.initial_prompt]
+        if self.wake_words is not None:
+            args += ['--wake_words', str(self.wake_words)]
+        if self.wake_words_sensitivity is not None:
+            args += ['--wake_words_sensitivity', str(self.wake_words_sensitivity)]
+        if self.wake_word_timeout is not None:
+            args += ['--wake_word_timeout', str(self.wake_word_timeout)]
+        if self.wake_word_activation_delay is not None:
+            args += ['--wake_word_activation_delay', str(self.wake_word_activation_delay)]
+        if self.wakeword_backend is not None:
+            args += ['--wakeword_backend', str(self.wakeword_backend)]
+        if self.openwakeword_model_paths:
+            args += ['--openwakeword_model_paths', str(self.openwakeword_model_paths)]
+        if self.openwakeword_inference_framework is not None:
+            args += ['--openwakeword_inference_framework', str(self.openwakeword_inference_framework)]
+        if self.wake_word_buffer_duration is not None:
+            args += ['--wake_word_buffer_duration', str(self.wake_word_buffer_duration)]
+        if self.use_main_model_for_realtime:
+            args.append('--use_main_model_for_realtime')  # flag, no need for True/False
+        if self.use_extended_logging:
+            args.append('--use_extended_logging')  # flag, no need for True/False
+
         if self.control_url:
             parsed_control_url = urlparse(self.control_url)
             if parsed_control_url.port:
@@ -377,6 +407,7 @@ class AudioToTextRecorderClient:
         # Start the subprocess with the mapped arguments
         if os.name == 'nt':  # Windows
             cmd = 'start /min cmd /c ' + subprocess.list2cmdline(args)
+            # print(f"Opening server with cli command: {cmd}")
             subprocess.Popen(cmd, shell=True)
         else:  # Unix-like systems
             subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, start_new_session=True)
@@ -480,6 +511,8 @@ class AudioToTextRecorderClient:
                         message = struct.pack('<I', metadata_length) + metadata_json.encode('utf-8') + audio_data
 
                         if self.is_running:
+                            if log_outgoing_chunks:
+                                print(".", flush=True, end='')
                             self.data_ws.send(message, opcode=websocket.ABNF.OPCODE_BINARY)
                 except KeyboardInterrupt:  # handle manual interruption (Ctrl+C)
                     if self.debug_mode:
@@ -513,8 +546,12 @@ class AudioToTextRecorderClient:
             if 'status' in data:
                 if data['status'] == 'success':
                     if 'parameter' in data and 'value' in data:
-                        if self.debug_mode:
-                            print(f"Parameter {data['parameter']} = {data['value']}")
+                        request_id = data.get('request_id')
+                        if request_id is not None and request_id in self.pending_requests:
+                            if self.debug_mode:
+                                print(f"Parameter {data['parameter']} = {data['value']}")
+                            self.pending_requests[request_id]['value'] = data['value']
+                            self.pending_requests[request_id]['event'].set()
                 elif data['status'] == 'error':
                     print(f"Server Error: {data.get('message', '')}")
             else:
@@ -550,12 +587,20 @@ class AudioToTextRecorderClient:
             elif data.get('type') == 'vad_detect_start':
                 if self.on_vad_detect_start:
                     self.on_vad_detect_start()
+            elif data.get('type') == 'vad_detect_stop':
+                if self.on_vad_detect_stop:
+                    self.on_vad_detect_stop()
+            elif data.get('type') == 'wakeword_detected':
+                if self.on_wakeword_detected:
+                    self.on_wakeword_detected()
             elif data.get('type') == 'wakeword_detection_start':
                 if self.on_wakeword_detection_start:
                     self.on_wakeword_detection_start()
             elif data.get('type') == 'wakeword_detection_end':
                 if self.on_wakeword_detection_end:
                     self.on_wakeword_detection_end()
+            elif data.get('type') == 'recorded_chunk':
+                pass
 
             else:
                 print(f"Unknown data message format: {data}")
@@ -595,12 +640,36 @@ class AudioToTextRecorderClient:
         self.control_ws.send(json.dumps(command))
 
     def get_parameter(self, parameter):
+        # Generate a unique request_id
+        request_id = self.request_counter
+        self.request_counter += 1
+
+        # Prepare the command with the request_id
         command = {
             "command": "get_parameter",
-            "parameter": parameter
+            "parameter": parameter,
+            "request_id": request_id
         }
+
+        # Create an event to wait for the response
+        event = threading.Event()
+        self.pending_requests[request_id] = {'event': event, 'value': None}
+
+        # Send the command to the server
         self.control_ws.send(json.dumps(command))
 
+        # Wait for the response or timeout after 5 seconds
+        if event.wait(timeout=5):
+            value = self.pending_requests[request_id]['value']
+            # Clean up the pending request
+            del self.pending_requests[request_id]
+            return value
+        else:
+            print(f"Timeout waiting for get_parameter {parameter}")
+            # Clean up the pending request
+            del self.pending_requests[request_id]
+            return None
+
     def call_method(self, method, args=None, kwargs=None):
         command = {
             "command": "call_method",

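The `get_parameter` change above turns a fire-and-forget control message into a blocking call: each request carries a unique `request_id`, and the message handler sets the matching event when the response arrives. A condensed, self-contained sketch of that request/response correlation pattern (hypothetical names; a lock is added around the counter and map, which the client code itself omits):

```python
import json
import threading

class RequestCorrelator:
    """Match asynchronous responses to blocking callers via per-request events."""

    def __init__(self, send):
        self._send = send          # callable that transmits a JSON string
        self._lock = threading.Lock()
        self._counter = 0
        self._pending = {}         # request_id -> {'event': Event, 'value': ...}

    def request(self, payload: dict, timeout: float = 5.0):
        with self._lock:
            request_id = self._counter
            self._counter += 1
            entry = {'event': threading.Event(), 'value': None}
            self._pending[request_id] = entry
        self._send(json.dumps({**payload, 'request_id': request_id}))
        try:
            if entry['event'].wait(timeout):
                return entry['value']
            return None            # timed out waiting for the server
        finally:
            with self._lock:
                self._pending.pop(request_id, None)

    def on_message(self, raw: str):
        # Called from the receiver thread for every incoming control message.
        data = json.loads(raw)
        with self._lock:
            entry = self._pending.get(data.get('request_id'))
        if entry is not None:
            entry['value'] = data.get('value')
            entry['event'].set()
```
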
+ 195 - 82
server/stt_server.py

@@ -66,7 +66,13 @@ The server will broadcast real-time transcription updates to all connected clien
 """
 
 
+extended_logging = True
+send_recorded_chunk = False
+log_incoming_chunks = False
+
+
 import asyncio
+import base64
 import sys
 
 if sys.platform == 'win32':
@@ -95,7 +101,19 @@ check_and_install_packages([
     }
 ])
 
-print("Starting server, please wait...")
+# Define ANSI color codes for terminal output
+class bcolors:
+    HEADER = '\033[95m'   # Magenta
+    OKBLUE = '\033[94m'   # Blue
+    OKCYAN = '\033[96m'   # Cyan
+    OKGREEN = '\033[92m'  # Green
+    WARNING = '\033[93m'  # Yellow
+    FAIL = '\033[91m'     # Red
+    ENDC = '\033[0m'      # Reset to default
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+print(f"{bcolors.BOLD}{bcolors.OKCYAN}Starting server, please wait...{bcolors.ENDC}")
 
 import threading
 import json
@@ -175,7 +193,10 @@ def text_detected(text, loop):
         'text': text
     })
     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
-    print(f"\r{text}", flush=True, end='')
+    if extended_logging:
+        print(f"Realtime text: {bcolors.OKCYAN}{text}{bcolors.ENDC}\n", flush=True, end="")
+    else:
+        print(f"\r{bcolors.OKCYAN}{text}{bcolors.ENDC}", flush=True, end='')
 
 def on_recording_start(loop):
     # Send a message to the client indicating recording has started
@@ -197,6 +218,19 @@ def on_vad_detect_start(loop):
     })
     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
 
+def on_vad_detect_stop(loop):
+    message = json.dumps({
+        'type': 'vad_detect_stop'
+    })
+    asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
+
+def on_wakeword_detected(loop):
+    # Send a message to the client when the wake word is detected
+    message = json.dumps({
+        'type': 'wakeword_detected'
+    })
+    asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
+
 def on_wakeword_detection_start(loop):
     # Send a message to the client when wake word detection starts
     message = json.dumps({
@@ -227,83 +261,124 @@ def on_realtime_transcription_update(text, loop):
     })
     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
 
-def on_recorded_chunk(chunk):
-    # Process each recorded audio chunk (optional implementation)
-    pass
+def on_recorded_chunk(chunk, loop):
+    if send_recorded_chunk:
+        bytes_b64 = base64.b64encode(chunk.tobytes()).decode('utf-8')
+        message = json.dumps({
+            'type': 'recorded_chunk',
+            'bytes': bytes_b64
+        })
+        asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
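
Each of these callbacks runs on the recorder's worker thread, so they all use the same bridge: `asyncio.run_coroutine_threadsafe` schedules the `audio_queue.put` coroutine on the server's event loop. A self-contained illustration of that thread-to-loop handoff (hypothetical names):

```python
import asyncio
import threading

def producer(loop: asyncio.AbstractEventLoop, queue: asyncio.Queue):
    # Runs on a plain thread: schedule each queue put on the loop's thread.
    for i in range(3):
        asyncio.run_coroutine_threadsafe(queue.put(f"message {i}"), loop)

async def main():
    queue = asyncio.Queue()
    loop = asyncio.get_running_loop()
    threading.Thread(target=producer, args=(loop, queue)).start()
    for _ in range(3):
        print("received:", await queue.get())  # consumed on the loop thread

asyncio.run(main())
```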
 
 # Define the server's arguments
 def parse_arguments():
     import argparse
     parser = argparse.ArgumentParser(description='Start the Speech-to-Text (STT) server with various configuration options.')
-    
+
     parser.add_argument('--model', type=str, default='large-v2',
-                        help='Path to the STT model or model size. Options: tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2 or any hugginface CTranslate2 stt model like deepdml/faster-whisper-large-v3-turbo-ct2. Default: medium.en')
-    
+                        help='Path to the STT model or model size. Options include: tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, or any huggingface CTranslate2 STT model such as deepdml/faster-whisper-large-v3-turbo-ct2. Default is large-v2.')
+
     parser.add_argument('--realtime_model_type', type=str, default='tiny.en',
-                        help='Model size for real-time transcription. Same options as --model. Used only if real-time transcription is enabled. Default: tiny.en')
-    
+                        help='Model size for real-time transcription. The options are the same as --model. This is used only if real-time transcription is enabled. Default is tiny.en.')
+
     parser.add_argument('--language', type=str, default='en',
-                        help='Language code for the STT model. Leave empty for auto-detection. Default: en')
-    
+                        help='Language code for the STT model to transcribe in a specific language. Leave this empty for auto-detection based on input audio. Default is en.')
+
     parser.add_argument('--input_device_index', type=int, default=1,
-                        help='Index of the audio input device to use. Default: 1')
-    
+                        help='Index of the audio input device to use. Use this option to specify a particular microphone or audio input device based on your system. Default is 1.')
+
     parser.add_argument('--silero_sensitivity', type=float, default=0.05,
-                        help='Sensitivity for Silero Voice Activity Detection (0 to 1). Lower values are less sensitive. Default: 0.05')
-    
+                        help='Sensitivity level for Silero Voice Activity Detection (VAD), with a range from 0 to 1. Lower values make the model less sensitive, useful for noisy environments. Default is 0.05.')
+
+    parser.add_argument('--silero_use_onnx', action='store_true', default=False,
+                        help='Enable ONNX version of Silero model for faster performance with lower resource usage. Default is False.')
+
     parser.add_argument('--webrtc_sensitivity', type=int, default=3,
-                        help='Sensitivity for WebRTC Voice Activity Detection (0 to 3). Higher values are less sensitive. Default: 3')
-    
+                        help='Sensitivity level for WebRTC Voice Activity Detection (VAD), with a range from 0 to 3. Higher values make the model less sensitive, useful for cleaner environments. Default is 3.')
+
     parser.add_argument('--min_length_of_recording', type=float, default=1.1,
-                        help='Minimum duration (in seconds) for a valid recording. Prevents excessively short recordings. Default: 1.1')
-    
+                        help='Minimum duration of valid recordings in seconds. This prevents very short recordings from being processed, which could be caused by noise or accidental sounds. Default is 1.1 seconds.')
+
     parser.add_argument('--min_gap_between_recordings', type=float, default=0,
-                        help='Minimum time (in seconds) between consecutive recordings. Prevents rapid successive recordings. Default: 0')
-    
+                        help='Minimum time (in seconds) between consecutive recordings. Setting this helps avoid overlapping recordings when there’s a brief silence between them. Default is 0 seconds.')
+
     parser.add_argument('--enable_realtime_transcription', action='store_true', default=True,
-                        help='Enable continuous real-time transcription of audio. Default: True')
-    
+                        help='Enable continuous real-time transcription of audio as it is received. When enabled, transcriptions are sent in near real-time. Default is True.')
+
     parser.add_argument('--realtime_processing_pause', type=float, default=0.02,
-                        help='Time interval (in seconds) between processing audio chunks for real-time transcription. Lower values increase responsiveness but may increase CPU load. Default: 0.02')
-    
+                        help='Time interval (in seconds) between processing audio chunks for real-time transcription. Lower values increase responsiveness but may put more load on the CPU. Default is 0.02 seconds.')
+
     parser.add_argument('--silero_deactivity_detection', action='store_true', default=True,
-                        help='Use Silero model for end-of-speech detection. More robust against background noise but uses more GPU resources. Default: True')
-    
+                        help='Use the Silero model for end-of-speech detection. This option can provide more robust silence detection in noisy environments, though it consumes more GPU resources. Default is True.')
+
     parser.add_argument('--early_transcription_on_silence', type=float, default=0.2,
-                        help='Start transcription after specified seconds of silence. Should be lower than post_speech_silence_duration. Set to 0 to disable. Default: 0.2')
-    
+                        help='Start transcription after the specified seconds of silence. This is useful when you want to trigger transcription mid-speech when there is a brief pause. Should be lower than post_speech_silence_duration. Set to 0 to disable. Default is 0.2 seconds.')
+
     parser.add_argument('--beam_size', type=int, default=5,
-                        help='Beam size for the main transcription model. Larger values may improve accuracy but increase processing time. Default: 5')
-    
+                        help='Beam size for the main transcription model. Larger values may improve transcription accuracy but increase the processing time. Default is 5.')
+
     parser.add_argument('--beam_size_realtime', type=int, default=3,
-                        help='Beam size for the real-time transcription model. Smaller than main beam_size for faster processing. Default: 3')
-    
-    parser.add_argument('--initial_prompt', type=str, 
-                    default='End incomplete sentences with ellipses.\nExamples:\nComplete: The sky is blue.\nIncomplete: When the sky...\nComplete: She walked home.\nIncomplete: Because he...',
-                    help='Initial prompt for the transcription model to guide its output format and style. Default provides instructions for sentence completion and ellipsis usage.')
-    
+                        help='Beam size for the real-time transcription model. A smaller beam size allows for faster real-time processing but may reduce accuracy. Default is 3.')
+
+    parser.add_argument('--initial_prompt', type=str,
+                        default='End incomplete sentences with ellipses. Examples: Complete: The sky is blue. Incomplete: When the sky... Complete: She walked home. Incomplete: Because he...',
+                        help='Initial prompt that guides the transcription model to produce transcriptions in a particular style or format. The default provides instructions for handling sentence completions and ellipsis usage.')
+
     parser.add_argument('--end_of_sentence_detection_pause', type=float, default=0.45,
-                        help='Duration of pause (in seconds) to consider as end of a sentence. Default: 0.45')
-    
+                        help='The duration of silence (in seconds) that the model should interpret as the end of a sentence. This helps the system detect when to finalize the transcription of a sentence. Default is 0.45 seconds.')
+
     parser.add_argument('--unknown_sentence_detection_pause', type=float, default=0.7,
-                        help='Duration of pause (in seconds) to consider as an unknown or incomplete sentence. Default: 0.7')
-    
+                        help='The duration of pause (in seconds) that the model should interpret as an incomplete or unknown sentence. This is useful for identifying when a sentence is trailing off or unfinished. Default is 0.7 seconds.')
+
     parser.add_argument('--mid_sentence_detection_pause', type=float, default=2.0,
-                        help='Duration of pause (in seconds) to consider as a mid-sentence break. Default: 2.0')
-    
+                        help='The duration of pause (in seconds) that the model should interpret as a mid-sentence break. Longer pauses can indicate a pause in speech but not necessarily the end of a sentence. Default is 2.0 seconds.')
+
     parser.add_argument('--control_port', type=int, default=8011,
-                        help='Port for the control WebSocket connection. Default: 8011')
-    
+                        help='The port number used for the control WebSocket connection. Control connections are used to send and receive commands to the server. Default is port 8011.')
+
     parser.add_argument('--data_port', type=int, default=8012,
-                        help='Port for the data WebSocket connection. Default: 8012')
-    
+                        help='The port number used for the data WebSocket connection. Data connections are used to send audio data and receive transcription updates in real time. Default is port 8012.')
+
+    parser.add_argument('--wake_words', type=str, default="Jarvis",
+                        help='Specify the wake word(s) that will trigger the server to start listening. For example, setting this to "Jarvis" will make the system start transcribing when it detects the wake word "Jarvis". Default is "Jarvis".')
+
+    parser.add_argument('--wake_words_sensitivity', type=float, default=0.5,
+                        help='Sensitivity level for wake word detection, with a range from 0 (most sensitive) to 1 (least sensitive). Adjust this value based on your environment to ensure reliable wake word detection. Default is 0.5.')
+
+    parser.add_argument('--wake_word_timeout', type=float, default=5.0,
+                        help='Maximum time in seconds that the system will wait for a wake word before timing out. After this timeout, the system stops listening for wake words until reactivated. Default is 5.0 seconds.')
+
+    parser.add_argument('--wake_word_activation_delay', type=float, default=0.5,
+                        help='The delay in seconds before the wake word detection is activated after the system starts listening. This prevents false positives during the start of a session. Default is 0.5 seconds.')
+
+    parser.add_argument('--wakeword_backend', type=str, default='pvporcupine',
+                        help='The backend used for wake word detection. Common choices are "pvporcupine" and "openwakeword". Default is "pvporcupine".')
+
+    parser.add_argument('--openwakeword_model_paths', type=str, nargs='*',
+                        help='A list of file paths to OpenWakeWord models. This is useful if you are using OpenWakeWord for wake word detection and need to specify custom models.')
+
+    parser.add_argument('--openwakeword_inference_framework', type=str, default='tensorflow',
+                        help='The inference framework to use for OpenWakeWord models. Supported frameworks could include "tensorflow", "pytorch", etc. Default is "tensorflow".')
+
+    parser.add_argument('--wake_word_buffer_duration', type=float, default=1.0,
+                        help='Duration of the buffer in seconds for wake word detection. This sets how long the system will store the audio before and after detecting the wake word. Default is 1.0 seconds.')
+
+    parser.add_argument('--use_main_model_for_realtime', action='store_true',
+                        help='Enable this option if you want to use the main model for real-time transcription, instead of the smaller, faster real-time model. Using the main model may provide better accuracy but at the cost of higher processing time.')
+
+    parser.add_argument('--use_extended_logging', action='store_true',
+                        help='Write extensive log messages for the recording worker that processes the audio chunks.')
+
+
     return parser.parse_args()
 
 def _recorder_thread(loop):
     global recorder, prev_text, stop_recorder
-    print(f"Initializing RealtimeSTT server with parameters {recorder_config}")
+    print(f"{bcolors.OKGREEN}Initializing RealtimeSTT server with parameters:{bcolors.ENDC}")
+    for key, value in recorder_config.items():
+        print(f"    {bcolors.OKBLUE}{key}{bcolors.ENDC}: {value}")
     recorder = AudioToTextRecorder(**recorder_config)
-    print("RealtimeSTT initialized")
+    print(f"{bcolors.OKGREEN}{bcolors.BOLD}RealtimeSTT initialized{bcolors.ENDC}")
     recorder_ready.set()
     
     def process_text(full_sentence):
@@ -314,13 +389,16 @@ def _recorder_thread(loop):
         })
         # Use the passed event loop here
         asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
-        print(f"\rSentence: {full_sentence}")
 
+        if extended_logging:
+            print(f"Full text: {bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}")
+        else:
+            print(f"\r{bcolors.BOLD}Sentence:{bcolors.ENDC} {bcolors.OKGREEN}{full_sentence}{bcolors.ENDC}\n")
     try:
         while not stop_recorder:
             recorder.text(process_text)
     except KeyboardInterrupt:
-        print("Exiting application due to keyboard interrupt")
+        print(f"{bcolors.WARNING}Exiting application due to keyboard interrupt{bcolors.ENDC}")
 
 def decode_and_resample(
         audio_data,
@@ -344,13 +422,13 @@ def decode_and_resample(
     return resampled_audio.astype(np.int16).tobytes()
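
Only the tail of `decode_and_resample` is visible in this hunk; it converts incoming PCM to the recorder's 16 kHz rate. A rough standalone equivalent (a sketch using `scipy.signal.resample`, consistent with the int16 round-trip in the visible return line but not necessarily the server's exact implementation):

```python
import numpy as np
from scipy.signal import resample

def decode_and_resample_sketch(audio_data: bytes, original_rate: int, target_rate: int) -> bytes:
    """Resample int16 PCM bytes from original_rate to target_rate."""
    samples = np.frombuffer(audio_data, dtype=np.int16)
    target_length = int(len(samples) * target_rate / original_rate)
    resampled_audio = resample(samples, target_length)
    return resampled_audio.astype(np.int16).tobytes()
```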
 
 async def control_handler(websocket, path):
-    print("Control client connected")
+    print(f"{bcolors.OKGREEN}Control client connected{bcolors.ENDC}")
     global recorder
     control_connections.add(websocket)
     try:
         async for message in websocket:
             if not recorder_ready.is_set():
-                print("Recorder not ready")
+                print(f"{bcolors.WARNING}Recorder not ready{bcolors.ENDC}")
                 continue
             if isinstance(message, str):
                 # Handle text message (command)
@@ -362,27 +440,47 @@ async def control_handler(websocket, path):
                         value = command_data.get("value")
                         if parameter in allowed_parameters and hasattr(recorder, parameter):
                             setattr(recorder, parameter, value)
-                            print(f"Set recorder.{parameter} to {value}")
+                            # Format the value for output
+                            if isinstance(value, float):
+                                value_formatted = f"{value:.2f}"
+                            else:
+                                value_formatted = value
+                            if extended_logging:
+                                print(f"{bcolors.OKGREEN}Set recorder.{parameter} to: {bcolors.OKBLUE}{value_formatted}{bcolors.ENDC}")
                             # Optionally send a response back to the client
                             await websocket.send(json.dumps({"status": "success", "message": f"Parameter {parameter} set to {value}"}))
                         else:
                             if not parameter in allowed_parameters:
-                                print(f"Parameter {parameter} is not allowed (set_parameter)")
+                                print(f"{bcolors.WARNING}Parameter {parameter} is not allowed (set_parameter){bcolors.ENDC}")
                                 await websocket.send(json.dumps({"status": "error", "message": f"Parameter {parameter} is not allowed (set_parameter)"}))
                             else:
-                                print(f"Parameter {parameter} does not exist (set_parameter)")
+                                print(f"{bcolors.WARNING}Parameter {parameter} does not exist (set_parameter){bcolors.ENDC}")
                                 await websocket.send(json.dumps({"status": "error", "message": f"Parameter {parameter} does not exist (set_parameter)"}))
+
                     elif command == "get_parameter":
                         parameter = command_data.get("parameter")
+                        request_id = command_data.get("request_id")  # Get the request_id from the command data
                         if parameter in allowed_parameters and hasattr(recorder, parameter):
                             value = getattr(recorder, parameter)
-                            await websocket.send(json.dumps({"status": "success", "parameter": parameter, "value": value}))
+                            if isinstance(value, float):
+                                value_formatted = f"{value:.2f}"
+                            else:
+                                value_formatted = f"{value}"
+
+                            value_truncated = value_formatted[:39] + "…" if len(value_formatted) > 40 else value_formatted
+
+                            if extended_logging:
+                                print(f"{bcolors.OKGREEN}Get recorder.{parameter}: {bcolors.OKBLUE}{value_truncated}{bcolors.ENDC}")
+                            response = {"status": "success", "parameter": parameter, "value": value}
+                            if request_id is not None:
+                                response["request_id"] = request_id
+                            await websocket.send(json.dumps(response))
                         else:
                             if not parameter in allowed_parameters:
-                                print(f"Parameter {parameter} is not allowed (get_parameter)")
+                                print(f"{bcolors.WARNING}Parameter {parameter} is not allowed (get_parameter){bcolors.ENDC}")
                                 await websocket.send(json.dumps({"status": "error", "message": f"Parameter {parameter} is not allowed (get_parameter)"}))
                             else:
-                                print(f"Parameter {parameter} does not exist (get_parameter)")
+                                print(f"{bcolors.WARNING}Parameter {parameter} does not exist (get_parameter){bcolors.ENDC}")
                                 await websocket.send(json.dumps({"status": "error", "message": f"Parameter {parameter} does not exist (get_parameter)"}))
                     elif command == "call_method":
                         method_name = command_data.get("method")
@@ -392,34 +490,36 @@ async def control_handler(websocket, path):
                                 args = command_data.get("args", [])
                                 kwargs = command_data.get("kwargs", {})
                                 method(*args, **kwargs)
-                                print(f"Called method recorder.{method_name}")
+                                print(f"{bcolors.OKGREEN}Called method recorder.{bcolors.OKBLUE}{method_name}{bcolors.ENDC}")
                                 await websocket.send(json.dumps({"status": "success", "message": f"Method {method_name} called"}))
                             else:
-                                print(f"Recorder does not have method {method_name}")
+                                print(f"{bcolors.WARNING}Recorder does not have method {method_name}{bcolors.ENDC}")
                                 await websocket.send(json.dumps({"status": "error", "message": f"Recorder does not have method {method_name}"}))
                         else:
-                            print(f"Method {method_name} is not allowed")
+                            print(f"{bcolors.WARNING}Method {method_name} is not allowed{bcolors.ENDC}")
                             await websocket.send(json.dumps({"status": "error", "message": f"Method {method_name} is not allowed"}))
                     else:
-                        print(f"Unknown command: {command}")
+                        print(f"{bcolors.WARNING}Unknown command: {command}{bcolors.ENDC}")
                         await websocket.send(json.dumps({"status": "error", "message": f"Unknown command {command}"}))
                 except json.JSONDecodeError:
-                    print("Received invalid JSON command")
+                    print(f"{bcolors.WARNING}Received invalid JSON command{bcolors.ENDC}")
                     await websocket.send(json.dumps({"status": "error", "message": "Invalid JSON command"}))
             else:
-                print("Received unknown message type on control connection")
+                print(f"{bcolors.WARNING}Received unknown message type on control connection{bcolors.ENDC}")
     except websockets.exceptions.ConnectionClosed as e:
-        print(f"Control client disconnected: {e}")
+        print(f"{bcolors.WARNING}Control client disconnected: {e}{bcolors.ENDC}")
     finally:
         control_connections.remove(websocket)
 
 async def data_handler(websocket, path):
-    print("Data client connected")
+    print(f"{bcolors.OKGREEN}Data client connected{bcolors.ENDC}")
     data_connections.add(websocket)
     try:
         while True:
             message = await websocket.recv()
             if isinstance(message, bytes):
+                if log_incoming_chunks:
+                    print(".", end='', flush=True)
                 # Handle binary message (audio data)
                 metadata_length = int.from_bytes(message[:4], byteorder='little')
                 metadata_json = message[4:4+metadata_length].decode('utf-8')
@@ -429,9 +529,9 @@ async def data_handler(websocket, path):
                 resampled_chunk = decode_and_resample(chunk, sample_rate, 16000)
                 recorder.feed_audio(resampled_chunk)
             else:
-                print("Received non-binary message on data connection")
+                print(f"{bcolors.WARNING}Received non-binary message on data connection{bcolors.ENDC}")
     except websockets.exceptions.ConnectionClosed as e:
-        print(f"Data client disconnected: {e}")
+        print(f"{bcolors.WARNING}Data client disconnected: {e}{bcolors.ENDC}")
     finally:
         data_connections.remove(websocket)
         recorder.clear_audio_queue()  # Ensure audio queue is cleared if client disconnects
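
`data_handler` and the client's send path share a simple wire format: a 4-byte little-endian metadata length, a JSON metadata blob, then raw PCM bytes. A minimal encode/decode pair for that framing (a sketch; the `sampleRate` metadata key is assumed from the surrounding handler code rather than shown verbatim in this hunk):

```python
import json
import struct

def pack_audio_message(pcm: bytes, sample_rate: int) -> bytes:
    """Frame a chunk as <u32 little-endian metadata length><metadata JSON><PCM bytes>."""
    metadata = json.dumps({'sampleRate': sample_rate}).encode('utf-8')
    return struct.pack('<I', len(metadata)) + metadata + pcm

def unpack_audio_message(message: bytes):
    """Inverse of pack_audio_message: returns (sample_rate, pcm_bytes)."""
    metadata_length = int.from_bytes(message[:4], byteorder='little')
    metadata = json.loads(message[4:4 + metadata_length].decode('utf-8'))
    return metadata['sampleRate'], message[4 + metadata_length:]

frame = pack_audio_message(b'\x00\x01' * 160, 48000)
assert unpack_audio_message(frame) == (48000, b'\x00\x01' * 160)
```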
@@ -441,7 +541,8 @@ async def broadcast_audio_messages():
         message = await audio_queue.get()
         for conn in list(data_connections):
             try:
-                # print(f"Sending message: {message}")
+                if extended_logging:
+                    print(f"    {bcolors.OKBLUE}Sending message: {message}{bcolors.ENDC}\n", flush=True, end="")
                 await conn.send(message)
             except websockets.exceptions.ConnectionClosed:
                 data_connections.remove(conn)
@@ -466,6 +567,7 @@ async def main_async():
         'language': args.language,
         'input_device_index': args.input_device_index,
         'silero_sensitivity': args.silero_sensitivity,
+        'silero_use_onnx': args.silero_use_onnx,
         'webrtc_sensitivity': args.webrtc_sensitivity,
         'post_speech_silence_duration': args.unknown_sentence_detection_pause,
         'min_length_of_recording': args.min_length_of_recording,
@@ -477,24 +579,35 @@ async def main_async():
         'beam_size': args.beam_size,
         'beam_size_realtime': args.beam_size_realtime,
         'initial_prompt': args.initial_prompt,
-
+        'wake_words': args.wake_words,
+        'wake_words_sensitivity': args.wake_words_sensitivity,
+        'wake_word_timeout': args.wake_word_timeout,
+        'wake_word_activation_delay': args.wake_word_activation_delay,
+        'wakeword_backend': args.wakeword_backend,
+        'openwakeword_model_paths': args.openwakeword_model_paths,
+        'openwakeword_inference_framework': args.openwakeword_inference_framework,
+        'wake_word_buffer_duration': args.wake_word_buffer_duration,
+        'use_main_model_for_realtime': args.use_main_model_for_realtime,
         'spinner': False,
         'use_microphone': False,
-
         'on_realtime_transcription_update': make_callback(loop, text_detected),
         'on_recording_start': make_callback(loop, on_recording_start),
         'on_recording_stop': make_callback(loop, on_recording_stop),
         'on_vad_detect_start': make_callback(loop, on_vad_detect_start),
+        'on_vad_detect_stop': make_callback(loop, on_vad_detect_stop),
+        'on_wakeword_detected': make_callback(loop, on_wakeword_detected),
         'on_wakeword_detection_start': make_callback(loop, on_wakeword_detection_start),
         'on_wakeword_detection_end': make_callback(loop, on_wakeword_detection_end),
         'on_transcription_start': make_callback(loop, on_transcription_start),
-        'no_log_file': True,
+        'on_recorded_chunk': make_callback(loop, on_recorded_chunk),
+        'no_log_file': True,  # Disable logging to file
+        'use_extended_logging': args.use_extended_logging,
     }
 
     control_server = await websockets.serve(control_handler, "localhost", args.control_port)
     data_server = await websockets.serve(data_handler, "localhost", args.data_port)
-    print(f"Control server started on ws://localhost:{args.control_port}")
-    print(f"Data server started on ws://localhost:{args.data_port}")
+    print(f"{bcolors.OKGREEN}Control server started on {bcolors.OKBLUE}ws://localhost:{args.control_port}{bcolors.ENDC}")
+    print(f"{bcolors.OKGREEN}Data server started on {bcolors.OKBLUE}ws://localhost:{args.data_port}{bcolors.ENDC}")
 
     # Task to broadcast audio messages
     broadcast_task = asyncio.create_task(broadcast_audio_messages())
@@ -503,12 +616,12 @@ async def main_async():
     recorder_thread.start()
     recorder_ready.wait()
 
-    print("Server started. Press Ctrl+C to stop the server.")
+    print(f"{bcolors.OKGREEN}Server started. Press Ctrl+C to stop the server.{bcolors.ENDC}")
 
     try:
         await asyncio.gather(control_server.wait_closed(), data_server.wait_closed(), broadcast_task)
     except KeyboardInterrupt:
-        print("Shutting down gracefully...")
+        print(f"{bcolors.WARNING}{bcolors.BOLD}Shutting down gracefully...{bcolors.ENDC}")
     finally:
         # Shut down the recorder
         if recorder:
@@ -516,10 +629,10 @@ async def main_async():
             recorder.abort()
             recorder.stop()
             recorder.shutdown()
-            print("Recorder shut down")
+            print(f"{bcolors.OKGREEN}Recorder shut down{bcolors.ENDC}")
 
             recorder_thread.join()
-            print("Recorder thread finished")
+            print(f"{bcolors.OKGREEN}Recorder thread finished{bcolors.ENDC}")
         
         # Cancel all active tasks in the event loop
         tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
@@ -529,14 +642,14 @@ async def main_async():
         # Run pending tasks and handle cancellation
         await asyncio.gather(*tasks, return_exceptions=True)
 
-        print("All tasks cancelled, closing event loop now.")
+        print(f"{bcolors.OKGREEN}All tasks cancelled, closing event loop now.{bcolors.ENDC}")
 
 def main():
     try:
         asyncio.run(main_async())
     except KeyboardInterrupt:
         # Capture any final KeyboardInterrupt to prevent it from showing up in logs
-        print("Server interrupted by user.")
+        print(f"{bcolors.WARNING}Server interrupted by user.{bcolors.ENDC}")
         exit(0)
 
 if __name__ == '__main__':

+ 1 - 1
setup.py

@@ -9,7 +9,7 @@ with open('requirements.txt') as f:
 
 setuptools.setup(
     name="RealtimeSTT",
-    version="0.3.1",
+    version="0.3.2",
     author="Kolja Beigel",
     author_email="kolja.beigel@web.de",
     description="A fast Voice Activity Detection and Transcription System",