KoljaB 7 months ago
parent
commit
4fcda3ffba

+ 18 - 9
RealtimeSTT/audio_recorder.py

@@ -1377,6 +1377,17 @@ class AudioToTextRecorder:
 
         return self
 
+    def listen(self):
+        """
+        Puts the recorder into an immediate "listening" state.
+        This is the state entered after a wake word detection, for example.
+        The recorder now listens for voice activity.
+        Once voice is detected, it enters the "recording" state.
+        """
+        self.listen_start = time.time()
+        self._set_state("listening")
+        self.start_recording_on_voice_activity = True
+
     def feed_audio(self, chunk, original_sample_rate=16000):
         """
         Feed an audio chunk into the processing pipeline. Chunks are
@@ -1453,14 +1464,14 @@ class AudioToTextRecorder:
             logging.debug('Terminating reader process')
 
             # Give it some time to finish the loop and cleanup.
-            if self.use_microphone:
+            if self.use_microphone.value:
                 self.reader_process.join(timeout=10)
 
-            if self.reader_process.is_alive():
-                logging.warning("Reader process did not terminate "
-                                "in time. Terminating forcefully."
-                                )
-                self.reader_process.terminate()
+                if self.reader_process.is_alive():
+                    logging.warning("Reader process did not terminate "
+                                    "in time. Terminating forcefully."
+                                    )
+                    self.reader_process.terminate()
 
             logging.debug('Terminating transcription process')
             self.transcript_process.join(timeout=10)
@@ -1819,9 +1830,7 @@ class AudioToTextRecorder:
                                 if self.use_extended_logging:
                                     logging.debug('Debug: Handling non-wake word scenario')
                                 if not self.use_wake_words:
-                                    self.listen_start = time.time()
-                                    self._set_state("listening")
-                                    self.start_recording_on_voice_activity = True
+                                    self.listen()
                             else:
                                 if self.use_extended_logging:
                                     logging.debug('Debug: Setting failed_stop_attempt to True')
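For application code, the new public listen() method bundles the inline three-line sequence it replaces above. A minimal usage sketch, assuming a default recorder; the wake-word trigger shown here is hypothetical and not part of this commit:

from RealtimeSTT import AudioToTextRecorder

recorder = AudioToTextRecorder(spinner=False, model="tiny.en")

def on_wakeword_detected():
    # Equivalent to the former inline sequence:
    #   recorder.listen_start = time.time()
    #   recorder._set_state("listening")
    #   recorder.start_recording_on_voice_activity = True
    recorder.listen()

on_wakeword_detected()   # simulate an external wake-word trigger
print(recorder.text())   # blocks until a full utterance is transcribed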

+ 0 - 0
server/__init__.py


+ 105 - 0
server/install_packages.py

@@ -0,0 +1,105 @@
+import subprocess
+import sys
+import importlib
+
+def check_and_install_packages(packages):
+    """
+    Checks if the specified packages are installed, and if not, prompts the user
+    to install them.
+
+    Parameters:
+    - packages: A list of dictionaries, each containing:
+        - 'module_name': The module or package name to import.
+        - 'attribute': (Optional) The attribute or class to check within the module.
+        - 'install_name': The name used in the pip install command.
+        - 'version': (Optional) Version constraint for the package.
+    """
+    for package in packages:
+        module_name = package['module_name']
+        attribute = package.get('attribute')
+        install_name = package.get('install_name', module_name)
+        version = package.get('version', '')
+
+        try:
+            # Attempt to import the module
+            module = importlib.import_module(module_name)
+            # If an attribute is specified, check if it exists
+            if attribute:
+                getattr(module, attribute)
+        except (ImportError, AttributeError):
+            user_input = input(
+                f"This program requires '{module_name}'"
+                f"{'' if not attribute else ' with attribute ' + attribute}, which is not installed or missing.\n"
+                f"Do you want to install '{install_name}' now? (y/n): "
+            )
+            if user_input.strip().lower() == 'y':
+                try:
+                    # Build the pip install command
+                    install_command = [sys.executable, "-m", "pip", "install"]
+                    if version:
+                        install_command.append(f"{install_name}{version}")
+                    else:
+                        install_command.append(install_name)
+
+                    subprocess.check_call(install_command)
+                    # Try to import again after installation
+                    module = importlib.import_module(module_name)
+                    if attribute:
+                        getattr(module, attribute)
+                    print(f"Successfully installed '{install_name}'.")
+                except Exception as e:
+                    print(f"An error occurred while installing '{install_name}': {e}")
+                    sys.exit(1)
+            else:
+                print(f"The program requires '{install_name}' to run. Exiting...")
+                sys.exit(1)
+
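For illustration, a call matching the parameter format described in the docstring above (the version constraint is an arbitrary example, not a requirement of this module):

check_and_install_packages([
    {'module_name': 'websocket', 'install_name': 'websocket-client'},
    {'module_name': 'colorama', 'attribute': 'init', 'version': '>=0.4.6'},
])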

+ 307 - 0
server/stt_cli_client.py

@@ -0,0 +1,307 @@
+from .install_packages import check_and_install_packages
+
+check_and_install_packages([
+    {
+        'module_name': 'websocket',                    # Import module
+        'install_name': 'websocket-client',            # Package name for pip install (websocket-client is the correct package for websocket)
+    },
+    {
+        'module_name': 'pyaudio',                      # Import module
+        'install_name': 'pyaudio',                     # Package name for pip install
+    },
+    {
+        'module_name': 'colorama',                     # Import module
+        'attribute': 'init',                           # Attribute to check (init method from colorama)
+        'install_name': 'colorama',                    # Package name for pip install
+        'version': '',                                 # Optional version constraint
+    },
+])
+
+import websocket
+import pyaudio
+from colorama import init, Fore, Style
+
+import argparse
+import json
+import threading
+import time
+import struct
+import os
+import sys
+import socket
+import subprocess
+import shutil
+from urllib.parse import urlparse
+from queue import Queue
+
+# Constants
+CHUNK = 1024
+FORMAT = pyaudio.paInt16
+CHANNELS = 1
+RATE = 16000
+DEFAULT_SERVER_URL = "ws://localhost:8011"
+
+class STTWebSocketClient:
+    def __init__(self, server_url, debug=False, file_output=None, norealtime=False):
+        self.server_url = server_url
+        self.ws = None
+        self.is_running = False
+        self.debug = debug
+        self.file_output = file_output
+        self.last_text = ""
+        self.pbar = None
+        self.console_width = shutil.get_terminal_size().columns
+        self.recording_indicator = "🔴"
+        self.norealtime = norealtime
+        self.connection_established = threading.Event()
+        self.message_queue = Queue()
+
+    def debug_print(self, message):
+        if self.debug:
+            print(message, file=sys.stderr)
+
+    def connect(self):
+        if not self.ensure_server_running():
+            self.debug_print("Cannot start STT server. Exiting.")
+            return False
+
+        websocket.enableTrace(self.debug)
+        try:
+            
+            self.ws = websocket.WebSocketApp(self.server_url,
+                                             on_message=self.on_message,
+                                             on_error=self.on_error,
+                                             on_close=self.on_close,
+                                             on_open=self.on_open)
+            
+            self.ws_thread = threading.Thread(target=self.ws.run_forever)
+            self.ws_thread.daemon = True
+            self.ws_thread.start()
+
+            # Wait for the connection to be established
+            if not self.connection_established.wait(timeout=10):
+                self.debug_print("Timeout while connecting to the server.")
+                return False
+            
+            self.debug_print("WebSocket connection established successfully.")
+            return True
+        except Exception as e:
+            self.debug_print(f"Error while connecting to the server: {e}")
+            return False
+
+
+    def on_open(self, ws):
+        self.debug_print("WebSocket connection opened.")
+        self.is_running = True
+        self.connection_established.set()
+        self.start_recording()
+
+    def on_error(self, ws, error):
+        self.debug_print(f"WebSocket error: {error}")
+
+    def on_close(self, ws, close_status_code, close_msg):
+        self.debug_print(f"WebSocket connection closed: {close_status_code} - {close_msg}")
+        self.is_running = False
+
+    def is_server_running(self):
+        parsed_url = urlparse(self.server_url)
+        host = parsed_url.hostname
+        port = parsed_url.port or 80
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            return s.connect_ex((host, port)) == 0
+
+    def ask_to_start_server(self):
+        response = input("Would you like to start the STT server now? (y/n): ").strip().lower()
+        return response == 'y' or response == 'yes'
+
+    def start_server(self):
+        if os.name == 'nt':  # Windows
+            subprocess.Popen('start /min cmd /c stt-server', shell=True)
+        else:  # Unix-like systems
+            subprocess.Popen(['stt-server'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, start_new_session=True)
+        print("STT server start command issued. Please wait a moment for it to initialize.", file=sys.stderr)
+
+
+    def ensure_server_running(self):
+        if not self.is_server_running():
+            print("STT server is not running.", file=sys.stderr)
+            if self.ask_to_start_server():
+                self.start_server()
+                print("Waiting for STT server to start...", file=sys.stderr)
+                for _ in range(20):  # Wait up to 20 seconds
+                    if self.is_server_running():
+                        print("STT server started successfully.", file=sys.stderr)
+                        time.sleep(2)  # Give the server a moment to fully initialize
+                        return True
+                    time.sleep(1)
+                print("Failed to start STT server.", file=sys.stderr)
+                return False
+            else:
+                print("STT server is required. Please start it manually.", file=sys.stderr)
+                return False
+        return True
+
+    def on_message(self, ws, message):
+        try:
+            data = json.loads(message)
+            if data['type'] == 'realtime':
+                if data['text'] != self.last_text:
+                    self.last_text = data['text']
+                    if not self.norealtime:
+                        self.update_progress_bar(self.last_text) 
+            elif data['type'] == 'fullSentence':
+                if self.file_output:
+                    sys.stderr.write('\r\033[K')
+                    sys.stderr.write(data['text'])
+                    sys.stderr.write('\n')
+                    sys.stderr.flush()
+                    print(data['text'], file=self.file_output)
+                    self.file_output.flush()  # Ensure it's written immediately
+                else:
+                    self.finish_progress_bar()
+                    print(f"{data['text']}")                    
+                self.stop()
+                
+        except json.JSONDecodeError:
+            self.debug_print(f"\nReceived non-JSON message: {message}")
+
+    def show_initial_indicator(self):
+        if self.norealtime:
+            return
+
+        initial_text = f"{self.recording_indicator}\b\b"
+        sys.stderr.write(initial_text)
+        sys.stderr.flush()
+
+    def update_progress_bar(self, text):
+        # Reserve some space for the progress bar decorations
+        available_width = self.console_width - 5
+        
+        # Clear the current line
+        sys.stderr.write('\r\033[K')  # Move to the beginning of the line and clear it
+
+        # Get the last 'available_width' characters, but don't cut words
+        words = text.split()
+        last_chars = ""
+        for word in reversed(words):
+            if len(last_chars) + len(word) + 1 > available_width:
+                break
+            last_chars = word + " " + last_chars
+
+        last_chars = last_chars.strip()
+
+        # Color the text yellow and add recording indicator
+        colored_text = f"{Fore.YELLOW}{last_chars}{Style.RESET_ALL}{self.recording_indicator}\b\b"
+
+        sys.stderr.write(colored_text)
+        sys.stderr.flush()
+
+    def finish_progress_bar(self):
+        # Clear the current line
+        sys.stderr.write('\r\033[K')
+        sys.stderr.flush()
+
+    def stop(self):
+        self.finish_progress_bar()
+        self.is_running = False
+        if self.ws:
+            self.ws.close()
+        if hasattr(self, 'ws_thread'):
+            self.ws_thread.join(timeout=2)
+
+    def start_recording(self):
+        self.show_initial_indicator()
+        threading.Thread(target=self.record_and_send_audio).start()
+
+    def record_and_send_audio(self):
+        p = pyaudio.PyAudio()
+        stream = p.open(format=FORMAT,
+                        input_device_index=1,
+                        channels=CHANNELS,
+                        rate=RATE,
+                        input=True,
+                        frames_per_buffer=CHUNK)
+
+        self.debug_print("Recording and sending audio...")
+
+        while self.is_running:
+            try:
+                audio_data = stream.read(CHUNK)
+                
+                # Prepare metadata
+                metadata = {
+                    "sampleRate": RATE
+                }
+                metadata_json = json.dumps(metadata)
+                metadata_length = len(metadata_json)
+                
+                # Construct the message
+                message = struct.pack('<I', metadata_length) + metadata_json.encode('utf-8') + audio_data
+                
+                self.ws.send(message, opcode=websocket.ABNF.OPCODE_BINARY)
+            except Exception as e:
+                self.debug_print(f"\nError sending audio data: {e}")
+                break
+
+        self.debug_print("Stopped recording.")
+        stream.stop_stream()
+        stream.close()
+        p.terminate()
+
+
+def main():
+    parser = argparse.ArgumentParser(description="STT Client")
+    parser.add_argument("--server", default=DEFAULT_SERVER_URL, help="STT WebSocket server URL")
+    parser.add_argument("--debug", action="store_true", help="Enable debug mode")
+    parser.add_argument("-nort", "--norealtime", action="store_true", help="Disable real-time output")    
+    args = parser.parse_args()
+
+    # Check if output is being redirected
+    if not os.isatty(sys.stdout.fileno()):
+        file_output = sys.stdout
+    else:
+        file_output = None
+    
+    client = STTWebSocketClient(args.server, args.debug, file_output, args.norealtime)
+  
+    def signal_handler(sig, frame):
+        # print("\nInterrupted by user, shutting down...")
+        client.stop()
+        sys.exit(0)
+
+    import signal
+    signal.signal(signal.SIGINT, signal_handler)
+    
+    try:
+        if client.connect():
+            # print("Connection established. Recording... (Press Ctrl+C to stop)", file=sys.stderr)
+            while client.is_running:
+                time.sleep(0.1)
+        else:
+            print("Failed to connect to the server.", file=sys.stderr)
+    except Exception as e:
+        print(f"An error occurred: {e}")
+    finally:
+        client.stop()
+
+if __name__ == "__main__":
+    main()
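The binary frame sent per chunk in record_and_send_audio is a 4-byte little-endian metadata length, the UTF-8 JSON metadata (currently just sampleRate), then the raw 16-bit PCM samples; the server's echo handler parses it the same way. A small self-contained sketch of that framing, for reference only:

import json
import struct

def pack_frame(audio_bytes: bytes, sample_rate: int = 16000) -> bytes:
    metadata = json.dumps({"sampleRate": sample_rate}).encode("utf-8")
    return struct.pack("<I", len(metadata)) + metadata + audio_bytes

def unpack_frame(frame: bytes):
    meta_len = int.from_bytes(frame[:4], byteorder="little")
    metadata = json.loads(frame[4:4 + meta_len].decode("utf-8"))
    return metadata, frame[4 + meta_len:]

# Round-trip check with a dummy chunk of silence (1024 samples, 16-bit mono).
meta, pcm = unpack_frame(pack_frame(b"\x00\x00" * 1024))
assert meta["sampleRate"] == 16000 and len(pcm) == 2048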

+ 309 - 0
server/stt_server.py

@@ -0,0 +1,309 @@
+from .install_packages import check_and_install_packages
+
+check_and_install_packages([
+    {
+        'module_name': 'RealtimeSTT',                 # Import module
+        'attribute': 'AudioToTextRecorder',           # Specific class to check
+        'install_name': 'RealtimeSTT',                # Package name for pip install
+    },
+    {
+        'module_name': 'websockets',                  # Import module
+        'install_name': 'websockets',                 # Package name for pip install
+    },
+    {
+        'module_name': 'numpy',                       # Import module
+        'install_name': 'numpy',                      # Package name for pip install
+    },
+    {
+        'module_name': 'scipy.signal',                # Submodule of scipy
+        'attribute': 'resample',                      # Specific function to check
+        'install_name': 'scipy',                      # Package name for pip install
+    }
+])
+
+print("Starting server, please wait...")
+
+import asyncio
+import threading
+import json
+import websockets
+from RealtimeSTT import AudioToTextRecorder
+import numpy as np
+from scipy.signal import resample
+
+global_args = None
+recorder = None
+recorder_config = {}
+recorder_ready = threading.Event()
+client_websocket = None
+stop_recorder = False
+prev_text = ""
+
+
+async def send_to_client(message):
+    global client_websocket
+    if client_websocket and client_websocket.open:
+        try:
+            await client_websocket.send(message)
+        except websockets.exceptions.ConnectionClosed:
+            print("Client websocket is closed, resetting client_websocket")
+            client_websocket = None
+    else:
+        print("No client connected or connection is closed.")
+        client_websocket = None  # Ensure it resets
+
+def preprocess_text(text):
+    # Remove leading whitespaces
+    text = text.lstrip()
+
+    #  Remove starting ellipses if present
+    if text.startswith("..."):
+        text = text[3:]
+
+    # Remove any leading whitespaces again after ellipses removal
+    text = text.lstrip()
+
+    # Uppercase the first letter
+    if text:
+        text = text[0].upper() + text[1:]
+    
+    return text
+
+def text_detected(text):
+    global prev_text
+
+    text = preprocess_text(text)
+
+    sentence_end_marks = ['.', '!', '?', '。'] 
+    if text.endswith("..."):
+        recorder.post_speech_silence_duration = global_args.mid_sentence_detection_pause
+    elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
+        recorder.post_speech_silence_duration = global_args.end_of_sentence_detection_pause
+    else:
+        recorder.post_speech_silence_duration = global_args.unknown_sentence_detection_pause
+
+    prev_text = text
+
+    try:
+        asyncio.new_event_loop().run_until_complete(
+            send_to_client(
+                json.dumps({
+                    'type': 'realtime',
+                    'text': text
+                })
+            )
+        )
+    except Exception as e:
+        print(f"Error in text_detected while sending to client: {e}")
+    print(f"\r{text}", flush=True, end='')
+
+# Define the server's arguments
+def parse_arguments():
+    import argparse
+    parser = argparse.ArgumentParser(description='Start the Speech-to-Text (STT) server with various configuration options.')
+    
+    parser.add_argument('--model', type=str, default='medium.en',
+                        help='Path to the STT model or model size. Options: tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2 or any huggingface CTranslate2 STT model such as deepdml/faster-whisper-large-v3-turbo-ct2. Default: medium.en')
+    
+    parser.add_argument('--realtime_model_type', type=str, default='tiny.en',
+                        help='Model size for real-time transcription. Same options as --model. Used only if real-time transcription is enabled. Default: tiny.en')
+    
+    parser.add_argument('--language', type=str, default='en',
+                        help='Language code for the STT model. Leave empty for auto-detection. Default: en')
+    
+    parser.add_argument('--input_device_index', type=int, default=1,
+                        help='Index of the audio input device to use. Default: 1')
+    
+    parser.add_argument('--silero_sensitivity', type=float, default=0.05,
+                        help='Sensitivity for Silero Voice Activity Detection (0 to 1). Lower values are less sensitive. Default: 0.05')
+    
+    parser.add_argument('--webrtc_sensitivity', type=float, default=3,
+                        help='Sensitivity for WebRTC Voice Activity Detection (0 to 3). Higher values are less sensitive. Default: 3')
+    
+    parser.add_argument('--min_length_of_recording', type=float, default=1.1,
+                        help='Minimum duration (in seconds) for a valid recording. Prevents excessively short recordings. Default: 1.1')
+    
+    parser.add_argument('--min_gap_between_recordings', type=float, default=0,
+                        help='Minimum time (in seconds) between consecutive recordings. Prevents rapid successive recordings. Default: 0')
+    
+    parser.add_argument('--enable_realtime_transcription', action='store_true', default=True,
+                        help='Enable continuous real-time transcription of audio. Default: True')
+    
+    parser.add_argument('--realtime_processing_pause', type=float, default=0.02,
+                        help='Time interval (in seconds) between processing audio chunks for real-time transcription. Lower values increase responsiveness but may increase CPU load. Default: 0.02')
+    
+    parser.add_argument('--silero_deactivity_detection', action='store_true', default=True,
+                        help='Use Silero model for end-of-speech detection. More robust against background noise but uses more GPU resources. Default: True')
+    
+    parser.add_argument('--early_transcription_on_silence', type=float, default=0.2,
+                        help='Start transcription after specified seconds of silence. Should be lower than post_speech_silence_duration. Set to 0 to disable. Default: 0.2')
+    
+    parser.add_argument('--beam_size', type=int, default=5,
+                        help='Beam size for the main transcription model. Larger values may improve accuracy but increase processing time. Default: 5')
+    
+    parser.add_argument('--beam_size_realtime', type=int, default=3,
+                        help='Beam size for the real-time transcription model. Smaller than main beam_size for faster processing. Default: 3')
+    
+    parser.add_argument('--initial_prompt', type=str, 
+                        default='Add periods only for complete sentences. Use ellipsis (...) for unfinished thoughts or unclear endings. Examples: \n- Complete: "I went to the store."\n- Incomplete: "I think it was..."',
+                        help='Initial prompt for the transcription model to guide its output format and style. Default provides instructions for sentence completion and ellipsis usage.')
+    
+    parser.add_argument('--end_of_sentence_detection_pause', type=float, default=0.45,
+                        help='Duration of pause (in seconds) to consider as end of a sentence. Default: 0.45')
+    
+    parser.add_argument('--unknown_sentence_detection_pause', type=float, default=0.7,
+                        help='Duration of pause (in seconds) to consider as an unknown or incomplete sentence. Default: 0.7')
+    
+    parser.add_argument('--mid_sentence_detection_pause', type=float, default=2.0,
+                        help='Duration of pause (in seconds) to consider as a mid-sentence break. Default: 2.0')
+    
+    return parser.parse_args()
+
+def _recorder_thread():
+    global recorder, prev_text, stop_recorder
+    # print("Initializing RealtimeSTT...")
+    print(f"Initializing RealtimeSTT server with parameters {recorder_config}")
+    recorder = AudioToTextRecorder(**recorder_config)
+    print("RealtimeSTT initialized")
+    recorder_ready.set()
+    
+    def process_text(full_sentence):
+        global prev_text
+        full_sentence = preprocess_text(full_sentence)
+        prev_text = ""
+        try:
+            asyncio.new_event_loop().run_until_complete(
+                send_to_client(
+                    json.dumps({
+                        'type': 'fullSentence',
+                        'text': full_sentence
+                    })
+                )
+            )
+        except Exception as e:
+            print(f"Error in _recorder_thread while sending to client: {e}")
+        print(f"\rSentence: {full_sentence}")
+
+    try:
+        while not stop_recorder:
+            recorder.text(process_text)
+    except KeyboardInterrupt:
+        print("Exiting application due to keyboard interrupt")
+
+def decode_and_resample(
+        audio_data,
+        original_sample_rate,
+        target_sample_rate):
+
+    # Decode 16-bit PCM data to numpy array
+    if original_sample_rate == target_sample_rate:
+        return audio_data
+
+    audio_np = np.frombuffer(audio_data, dtype=np.int16)
+
+    # Calculate the number of samples after resampling
+    num_original_samples = len(audio_np)
+    num_target_samples = int(num_original_samples * target_sample_rate /
+                                original_sample_rate)
+
+    # Resample the audio
+    resampled_audio = resample(audio_np, num_target_samples)
+
+    return resampled_audio.astype(np.int16).tobytes()
+
+async def echo(websocket, path):
+    print("Client connected")
+    global client_websocket
+    client_websocket = websocket
+    recorder.post_speech_silence_duration = global_args.unknown_sentence_detection_pause
+    try:
+        async for message in websocket:
+            if not recorder_ready.is_set():
+                print("Recorder not ready")
+                continue
+
+            metadata_length = int.from_bytes(message[:4], byteorder='little')
+            metadata_json = message[4:4+metadata_length].decode('utf-8')
+            metadata = json.loads(metadata_json)
+            sample_rate = metadata['sampleRate']
+            chunk = message[4+metadata_length:]
+            resampled_chunk = decode_and_resample(chunk, sample_rate, 16000)
+            recorder.feed_audio(resampled_chunk)
+    except websockets.exceptions.ConnectionClosed as e:
+        print(f"Client disconnected: {e}")
+    finally:
+        print("Resetting client_websocket after disconnect")
+        client_websocket = None  # Reset websocket reference
+
+async def main_async():            
+    global stop_recorder, recorder_config, global_args
+    args = parse_arguments()
+    global_args = args
+
+    recorder_config = {
+        'model': args.model,
+        'realtime_model_type': args.realtime_model_type,
+        'language': args.language,
+        'input_device_index': args.input_device_index,
+        'silero_sensitivity': args.silero_sensitivity,
+        'webrtc_sensitivity': args.webrtc_sensitivity,
+        'post_speech_silence_duration': args.unknown_sentence_detection_pause,
+        'min_length_of_recording': args.min_length_of_recording,
+        'min_gap_between_recordings': args.min_gap_between_recordings,
+        'enable_realtime_transcription': args.enable_realtime_transcription,
+        'realtime_processing_pause': args.realtime_processing_pause,
+        'silero_deactivity_detection': args.silero_deactivity_detection,
+        'early_transcription_on_silence': args.early_transcription_on_silence,
+        'beam_size': args.beam_size,
+        'beam_size_realtime': args.beam_size_realtime,
+        'initial_prompt': args.initial_prompt,
+
+        'spinner': False,
+        'use_microphone': False,
+        'on_realtime_transcription_update': text_detected,
+        'no_log_file': True,
+    }
+
+    start_server = await websockets.serve(echo, "localhost", 8011)
+
+    recorder_thread = threading.Thread(target=_recorder_thread)
+    recorder_thread.start()
+    recorder_ready.wait()
+
+    print("Server started. Press Ctrl+C to stop the server.")
+    
+    try:
+        await start_server.wait_closed()  # This will keep the server running
+    except KeyboardInterrupt:
+        print("Shutting down gracefully...")
+    finally:
+        # Shut down the recorder
+        if recorder:
+            stop_recorder = True
+            recorder.abort()
+            recorder.stop()
+            recorder.shutdown()
+            print("Recorder shut down")
+
+            recorder_thread.join()
+            print("Recorder thread finished")
+        
+        # Cancel all active tasks in the event loop
+        tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
+        for task in tasks:
+            task.cancel()
+        
+        # Run pending tasks and handle cancellation
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+        print("All tasks cancelled, closing event loop now.")
+
+def main():
+    try:
+        asyncio.run(main_async())
+    except KeyboardInterrupt:
+        # Capture any final KeyboardInterrupt to prevent it from showing up in logs
+        print("Server interrupted by user.")
+        exit(0)
+
+if __name__ == '__main__':
+    main()
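The text_detected handler above adjusts recorder.post_speech_silence_duration on the fly depending on how the partial transcript ends. A standalone sketch of that decision rule with the server's default pause values (0.45 s, 0.7 s, 2.0 s), shown only to illustrate the behaviour:

def choose_pause(text, prev_text,
                 end_pause=0.45, unknown_pause=0.7, mid_pause=2.0):
    # Trailing "..." suggests the speaker is mid-sentence; two consecutive
    # updates ending in a sentence mark suggest the sentence is finished;
    # anything else is treated as unknown.
    sentence_end_marks = ['.', '!', '?', '。']
    if text.endswith("..."):
        return mid_pause
    if text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
        return end_pause
    return unknown_pause

assert choose_pause("I went to the...", "I went to") == 2.0
assert choose_pause("The sky is blue.", "The sky is blue.") == 0.45
assert choose_pause("The sky is", "The sky") == 0.7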

+ 7 - 1
setup.py

@@ -24,5 +24,11 @@ setuptools.setup(
     ],
     python_requires='>=3.6',
     install_requires=requirements,
-    keywords="real-time, audio, transcription, speech-to-text, voice-activity-detection, VAD, real-time-transcription, ambient-noise-detection, microphone-input, faster_whisper, speech-recognition, voice-assistants, audio-processing, buffered-transcription, pyaudio, ambient-noise-level, voice-deactivity"
+    keywords="real-time, audio, transcription, speech-to-text, voice-activity-detection, VAD, real-time-transcription, ambient-noise-detection, microphone-input, faster_whisper, speech-recognition, voice-assistants, audio-processing, buffered-transcription, pyaudio, ambient-noise-level, voice-deactivity",
+    entry_points={
+        'console_scripts': [
+            'stt-server=server.stt_server:main',
+            'stt=server.stt_cli_client:main',
+        ],
+    },
 )

+ 45 - 0
tests/install_packages.py

@@ -0,0 +1,45 @@
+import subprocess
+import sys
+
+def check_and_install_packages(packages):
+    """
+    Checks if the specified packages are installed, and if not, prompts the user
+    to install them.
+
+    Parameters:
+    - packages: A list of dictionaries, each containing:
+        - 'import_name': The name used in the import statement.
+        - 'install_name': (Optional) The name used in the pip install command.
+                          Defaults to 'import_name' if not provided.
+        - 'version': (Optional) Version constraint for the package.
+    """
+    for package in packages:
+        import_name = package['import_name']
+        install_name = package.get('install_name', import_name)
+        version = package.get('version', '')
+
+        try:
+            __import__(import_name)
+        except ImportError:
+            user_input = input(
+                f"This program requires the '{import_name}' library, which is not installed.\n"
+                f"Do you want to install it now? (y/n): "
+            )
+            if user_input.strip().lower() == 'y':
+                try:
+                    # Build the pip install command
+                    install_command = [sys.executable, "-m", "pip", "install"]
+                    if version:
+                        install_command.append(f"{install_name}{version}")
+                    else:
+                        install_command.append(install_name)
+
+                    subprocess.check_call(install_command)
+                    __import__(import_name)
+                    print(f"Successfully installed '{install_name}'.")
+                except Exception as e:
+                    print(f"An error occurred while installing '{install_name}': {e}")
+                    sys.exit(1)
+            else:
+                print(f"The program requires the '{import_name}' library to run. Exiting...")
+                sys.exit(1)

+ 246 - 0
tests/realtimestt_speechendpoint.py

@@ -0,0 +1,246 @@
+IS_DEBUG = False
+
+import os
+import sys
+import threading
+import queue
+import time
+from install_packages import check_and_install_packages
+
+# Check and install required packages
+check_and_install_packages([
+    {'import_name': 'rich'},
+    {'import_name': 'openai'},
+    {'import_name': 'colorama'},
+    {'import_name': 'RealtimeSTT'},
+    # Add any other required packages here
+])
+
+EXTENDED_LOGGING = False
+
+if __name__ == '__main__':
+
+    if EXTENDED_LOGGING:
+        import logging
+        logging.basicConfig(level=logging.DEBUG)
+
+    from rich.console import Console
+    from rich.live import Live
+    from rich.text import Text
+    from rich.panel import Panel
+    from rich.spinner import Spinner
+    from rich.progress import Progress, SpinnerColumn, TextColumn
+    console = Console()
+    console.print("System initializing, please wait")
+
+    from RealtimeSTT import AudioToTextRecorder
+    from colorama import Fore, Style
+    import colorama
+    from openai import OpenAI
+    # import ollama
+
+    # Initialize OpenAI client for Ollama    
+    client = OpenAI(
+        # base_url='http://127.0.0.1:11434/v1/', # ollama
+        base_url='http://127.0.0.1:1234/v1/', # lm_studio
+        api_key='ollama',  # required but ignored
+    )
+
+    if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
+        from torchaudio._extension.utils import _init_dll_path
+        _init_dll_path()    
+
+    colorama.init()
+
+    # Initialize Rich Console and Live
+    live = Live(console=console, refresh_per_second=10, screen=False)
+    live.start()
+
+    # Initialize a thread-safe queue
+    text_queue = queue.Queue()
+
+    # Variables for managing displayed text
+    full_sentences = []
+    rich_text_stored = ""
+    recorder = None
+    displayed_text = ""
+
+    rapid_sentence_end_detection = 0.4
+    end_of_sentence_detection_pause = 1.2
+    unknown_sentence_detection_pause = 2.5
+    mid_sentence_detection_pause = 3.8
+
+    def clear_console():
+        os.system('clear' if os.name == 'posix' else 'cls')
+
+    prev_text = ""
+
+    def is_speech_finished(text):
+        user_prompt = (
+            "Please reply with only 'c' if the following text is a complete thought (a sentence that stands on its own), "
+            "or 'i' if it is not finished. Do not include any additional text in your reply. "
+            "Consider a full sentence to have a clear subject, verb, and predicate or express a complete idea. "
+            "Examples:\n"
+            "- 'The sky is blue.' is complete (reply 'c').\n"
+            "- 'When the sky' is incomplete (reply 'i').\n"
+            "- 'She walked home.' is complete (reply 'c').\n"
+            "- 'Because he' is incomplete (reply 'i').\n"
+            f"\nText: {text}"
+        )
+
+        response = client.chat.completions.create(
+            model="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf",
+            messages=[{"role": "user", "content": user_prompt}],
+            max_tokens=1,
+            temperature=0.0,  # Set temperature to 0 for deterministic output
+        )
+
+        if IS_DEBUG:
+            print(f"t:'{response.choices[0].message.content.strip().lower()}'", end="", flush=True)
+
+        reply = response.choices[0].message.content.strip().lower()
+        return reply == 'c'
+
+    def preprocess_text(text):
+        # Remove leading whitespaces
+        text = text.lstrip()
+
+        #  Remove starting ellipses if present
+        if text.startswith("..."):
+            text = text[3:]
+
+        # Remove any leading whitespaces again after ellipses removal
+        text = text.lstrip()
+
+        # Uppercase the first letter
+        if text:
+            text = text[0].upper() + text[1:]
+        
+        return text
+
+    def text_detected(text):
+        """
+        Enqueue the detected text for processing.
+        """
+        text_queue.put(text)
+
+    def process_queue():
+        global recorder, full_sentences, prev_text, displayed_text, rich_text_stored
+
+        while True:
+            try:
+                text = text_queue.get(timeout=1)  # Wait for text or timeout after 1 second
+            except queue.Empty:
+                continue  # No text to process, continue looping
+
+            if text is None:
+                # Sentinel value to indicate thread should exit
+                break
+
+            text = preprocess_text(text)
+
+            sentence_end_marks = ['.', '!', '?', '。'] 
+            if text.endswith("..."):
+                if not recorder.post_speech_silence_duration == mid_sentence_detection_pause:
+                    recorder.post_speech_silence_duration = mid_sentence_detection_pause
+                    if IS_DEBUG: print(f"RT: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
+            elif text and text[-1] in sentence_end_marks and prev_text and prev_text[-1] in sentence_end_marks:
+                if not recorder.post_speech_silence_duration == end_of_sentence_detection_pause:
+                    recorder.post_speech_silence_duration = end_of_sentence_detection_pause
+                    if IS_DEBUG: print(f"RT: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
+            else:
+                if not recorder.post_speech_silence_duration == unknown_sentence_detection_pause:
+                    recorder.post_speech_silence_duration = unknown_sentence_detection_pause
+                    if IS_DEBUG: print(f"RT: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
+
+            prev_text = text
+            
+            import string
+            transtext = text.translate(str.maketrans('', '', string.punctuation))
+            
+            if is_speech_finished(transtext):
+                if not recorder.post_speech_silence_duration == rapid_sentence_end_detection:
+                    recorder.post_speech_silence_duration = rapid_sentence_end_detection
+                    if IS_DEBUG: print(f"RT: {transtext} post_speech_silence_duration: {recorder.post_speech_silence_duration}")
+
+            rich_text = Text()
+            for i, sentence in enumerate(full_sentences):
+                if i % 2 == 0:
+                    rich_text += Text(sentence, style="yellow") + Text(" ")
+                else:
+                    rich_text += Text(sentence, style="cyan") + Text(" ")
+            
+            if text:
+                rich_text += Text(text, style="bold yellow")
+
+            new_displayed_text = rich_text.plain
+
+            if new_displayed_text != displayed_text:
+                displayed_text = new_displayed_text
+                panel = Panel(rich_text, title="[bold green]Live Transcription[/bold green]", border_style="bold green")
+                live.update(panel)
+                rich_text_stored = rich_text
+
+            # Mark the task as done
+            text_queue.task_done()
+
+    def process_text(text):
+        global recorder, full_sentences, prev_text
+        if IS_DEBUG: print(f"SENTENCE: post_speech_silence_duration: {recorder.post_speech_silence_duration}")
+        recorder.post_speech_silence_duration = unknown_sentence_detection_pause
+        text = preprocess_text(text)
+        text = text.rstrip()
+        if text.endswith("..."):
+            text = text[:-2]
+                
+        full_sentences.append(text)
+        prev_text = ""
+        text_detected("")
+
+    # Recorder configuration
+    recorder_config = {
+        'spinner': False,
+        'model': 'medium.en',
+        'input_device_index': 1,
+        'realtime_model_type': 'tiny.en',
+        'language': 'en',
+        'silero_sensitivity': 0.05,
+        'webrtc_sensitivity': 3,
+        'post_speech_silence_duration': unknown_sentence_detection_pause,
+        'min_length_of_recording': 1.1,        
+        'min_gap_between_recordings': 0,                
+        'enable_realtime_transcription': True,
+        'realtime_processing_pause': 0.05,
+        'on_realtime_transcription_update': text_detected,
+        'silero_deactivity_detection': False,
+        'early_transcription_on_silence': 0,
+        'beam_size': 5,
+        'beam_size_realtime': 1,
+        'no_log_file': True,
+        #'initial_prompt': "Use ellipses for incomplete sentences like: I went to the..."        
+    }
+
+    if EXTENDED_LOGGING:
+        recorder_config['level'] = logging.DEBUG
+
+    recorder = AudioToTextRecorder(**recorder_config)
+    
+    initial_text = Panel(Text("Say something...", style="cyan bold"), title="[bold yellow]Waiting for Input[/bold yellow]", border_style="bold yellow")
+    live.update(initial_text)
+
+    # Start the worker thread
+    worker_thread = threading.Thread(target=process_queue, daemon=True)
+    worker_thread.start()
+
+    try:
+        while True:
+            recorder.text(process_text)
+    except KeyboardInterrupt:
+        # Send sentinel value to worker thread to exit
+        text_queue.put(None)
+        worker_thread.join()
+        live.stop()
+        console.print("[bold red]Transcription stopped by user. Exiting...[/bold red]")
+        exit(0)
+
+