Преглед на файлове

Remove wakeword functionality

Dayowe преди 5 месеца
родител
ревизия
459f053230
променени са 5 файла, в които са добавени 22 реда и са изтрити 475 реда
  1. 14 2
      RealtimeSTT/__init__.py
  2. 7 341
      RealtimeSTT/audio_recorder.py
  3. 0 62
      RealtimeSTT/audio_recorder_client.py
  4. 0 3
      RealtimeSTT_server/stt_cli_client.py
  5. 1 67
      RealtimeSTT_server/stt_server.py

+ 14 - 2
RealtimeSTT/__init__.py

@@ -1,2 +1,14 @@
-from .audio_recorder import AudioToTextRecorder
-from .audio_recorder_client import AudioToTextRecorderClient
+print("Loading RealtimeSTT __init__.py")
+try:
+    from .audio_recorder import AudioToTextRecorder
+    print("Successfully imported AudioToTextRecorder")
+except Exception as e:
+    print(f"Error importing AudioToTextRecorder: {e}")
+
+try:
+    from .audio_recorder_client import AudioToTextRecorderClient
+    print("Successfully imported AudioToTextRecorderClient")
+except Exception as e:
+    print(f"Error importing AudioToTextRecorderClient: {e}")
+
+__all__ = ['AudioToTextRecorder', 'AudioToTextRecorderClient']

+ 7 - 341
RealtimeSTT/audio_recorder.py

@@ -7,16 +7,12 @@ The class employs the faster_whisper library to transcribe the recorded audio
 into text using machine learning models, which can be run either on a GPU or
 CPU. Voice activity detection (VAD) is built in, meaning the software can
 automatically start or stop recording based on the presence or absence of
-speech. It integrates wake word detection through the pvporcupine library,
-allowing the software to initiate recording when a specific word or phrase
-is spoken. The system provides real-time feedback and can be further
+speech. The system provides real-time feedback and can be further
 customized.
 
 Features:
 - Voice Activity Detection: Automatically starts/stops recording when speech
   is detected or when speech ends.
-- Wake Word Detection: Starts recording when a specified wake word (or words)
-  is detected.
 - Event Callbacks: Customizable callbacks for when recording starts
   or finishes.
 - Fast Transcription: Returns the transcribed text from the audio as fast
@@ -30,15 +26,12 @@ from typing import Iterable, List, Optional, Union
 import torch.multiprocessing as mp
 import torch
 from ctypes import c_bool
-from openwakeword.model import Model
 from scipy.signal import resample
 from scipy import signal
 import signal as system_signal
 import faster_whisper
-import openwakeword
 import collections
 import numpy as np
-import pvporcupine
 import traceback
 import threading
 import webrtcvad
@@ -69,11 +62,7 @@ INIT_WEBRTC_SENSITIVITY = 3
 INIT_POST_SPEECH_SILENCE_DURATION = 0.6
 INIT_MIN_LENGTH_OF_RECORDING = 0.5
 INIT_MIN_GAP_BETWEEN_RECORDINGS = 0
-INIT_WAKE_WORDS_SENSITIVITY = 0.6
 INIT_PRE_RECORDING_BUFFER_DURATION = 1.0
-INIT_WAKE_WORD_ACTIVATION_DELAY = 0.0
-INIT_WAKE_WORD_TIMEOUT = 5.0
-INIT_WAKE_WORD_BUFFER_DURATION = 0.1
 ALLOWED_LATENCY_LIMIT = 100
 
 TIME_SLEEP = 0.02
@@ -238,22 +227,6 @@ class AudioToTextRecorder:
                  ),
                  on_vad_detect_start=None,
                  on_vad_detect_stop=None,
-
-                 # Wake word parameters
-                 wakeword_backend: str = "pvporcupine",
-                 openwakeword_model_paths: str = None,
-                 openwakeword_inference_framework: str = "onnx",
-                 wake_words: str = "",
-                 wake_words_sensitivity: float = INIT_WAKE_WORDS_SENSITIVITY,
-                 wake_word_activation_delay: float = (
-                    INIT_WAKE_WORD_ACTIVATION_DELAY
-                 ),
-                 wake_word_timeout: float = INIT_WAKE_WORD_TIMEOUT,
-                 wake_word_buffer_duration: float = INIT_WAKE_WORD_BUFFER_DURATION,
-                 on_wakeword_detected=None,
-                 on_wakeword_timeout=None,
-                 on_wakeword_detection_start=None,
-                 on_wakeword_detection_end=None,
                  on_recorded_chunk=None,
                  debug_mode=False,
                  handle_buffer_overflow: bool = INIT_HANDLE_BUFFER_OVERFLOW,
@@ -270,8 +243,7 @@ class AudioToTextRecorder:
                  use_extended_logging: bool = False,
                  ):
         """
-        Initializes an audio recorder and  transcription
-        and wake word detection.
+        Initializes an audio recorder and transcription.
 
         Args:
         - model (str, default="tiny"): Specifies the size of the transcription
@@ -384,54 +356,6 @@ class AudioToTextRecorder:
             be called when the system listens for voice activity.
         - on_vad_detect_stop (callable, default=None): Callback function to be
             called when the system stops listening for voice activity.
-        - wakeword_backend (str, default="pvporcupine"): Specifies the backend
-            library to use for wake word detection. Supported options include
-            'pvporcupine' for using the Porcupine wake word engine or 'oww' for
-            using the OpenWakeWord engine.
-        - openwakeword_model_paths (str, default=None): Comma-separated paths
-            to model files for the openwakeword library. These paths point to
-            custom models that can be used for wake word detection when the
-            openwakeword library is selected as the wakeword_backend.
-        - openwakeword_inference_framework (str, default="onnx"): Specifies
-            the inference framework to use with the openwakeword library.
-            Can be either 'onnx' for Open Neural Network Exchange format 
-            or 'tflite' for TensorFlow Lite.
-        - wake_words (str, default=""): Comma-separated string of wake words to
-            initiate recording when using the 'pvporcupine' wakeword backend.
-            Supported wake words include: 'alexa', 'americano', 'blueberry',
-            'bumblebee', 'computer', 'grapefruits', 'grasshopper', 'hey google',
-            'hey siri', 'jarvis', 'ok google', 'picovoice', 'porcupine',
-            'terminator'. For the 'openwakeword' backend, wake words are
-            automatically extracted from the provided model files, so specifying
-            them here is not necessary.
-        - wake_words_sensitivity (float, default=0.5): Sensitivity for wake
-            word detection, ranging from 0 (least sensitive) to 1 (most
-            sensitive). Default is 0.5.
-        - wake_word_activation_delay (float, default=0): Duration in seconds
-            after the start of monitoring before the system switches to wake
-            word activation if no voice is initially detected. If set to
-            zero, the system uses wake word activation immediately.
-        - wake_word_timeout (float, default=5): Duration in seconds after a
-            wake word is recognized. If no subsequent voice activity is
-            detected within this window, the system transitions back to an
-            inactive state, awaiting the next wake word or voice activation.
-        - wake_word_buffer_duration (float, default=0.1): Duration in seconds
-            to buffer audio data during wake word detection. This helps in
-            cutting out the wake word from the recording buffer so it does not
-            falsely get detected along with the following spoken text, ensuring
-            cleaner and more accurate transcription start triggers.
-            Increase this if parts of the wake word get detected as text.
-        - on_wakeword_detected (callable, default=None): Callback function to
-            be called when a wake word is detected.
-        - on_wakeword_timeout (callable, default=None): Callback function to
-            be called when the system goes back to an inactive state after when
-            no speech was detected after wake word activation
-        - on_wakeword_detection_start (callable, default=None): Callback
-             function to be called when the system starts to listen for wake
-             words
-        - on_wakeword_detection_end (callable, default=None): Callback
-            function to be called when the system stops to listen for
-            wake words (e.g. because of timeout or wake word detected)
         - on_recorded_chunk (callable, default=None): Callback function to be
             called when a chunk of audio is recorded. The function is called
             with the recorded audio chunk as its argument.
@@ -475,7 +399,7 @@ class AudioToTextRecorder:
 
         Raises:
             Exception: Errors related to initializing transcription
-            model, wake word detection, or audio recording.
+            model or audio recording.
         """
 
         self.language = language
@@ -483,10 +407,6 @@ class AudioToTextRecorder:
         self.input_device_index = input_device_index
         self.gpu_device_index = gpu_device_index
         self.device = device
-        self.wake_words = wake_words
-        self.wake_word_activation_delay = wake_word_activation_delay
-        self.wake_word_timeout = wake_word_timeout
-        self.wake_word_buffer_duration = wake_word_buffer_duration
         self.ensure_sentence_starting_uppercase = (
             ensure_sentence_starting_uppercase
         )
@@ -500,12 +420,8 @@ class AudioToTextRecorder:
         self.post_speech_silence_duration = post_speech_silence_duration
         self.on_recording_start = on_recording_start
         self.on_recording_stop = on_recording_stop
-        self.on_wakeword_detected = on_wakeword_detected
-        self.on_wakeword_timeout = on_wakeword_timeout
         self.on_vad_detect_start = on_vad_detect_start
         self.on_vad_detect_stop = on_vad_detect_stop
-        self.on_wakeword_detection_start = on_wakeword_detection_start
-        self.on_wakeword_detection_end = on_wakeword_detection_end
         self.on_recorded_chunk = on_recorded_chunk
         self.on_transcription_start = on_transcription_start
         self.enable_realtime_transcription = enable_realtime_transcription
@@ -532,7 +448,6 @@ class AudioToTextRecorder:
         self.sample_rate = sample_rate
         self.recording_start_time = 0
         self.recording_stop_time = 0
-        self.wake_word_detect_time = 0
         self.silero_check_time = 0
         self.silero_working = False
         self.speech_end_silence_start = 0
@@ -542,7 +457,6 @@ class AudioToTextRecorder:
         self.spinner = spinner
         self.halo = None
         self.state = "inactive"
-        self.wakeword_detected = False
         self.text_storage = []
         self.realtime_stabilized_text = ""
         self.realtime_stabilized_safetext = ""
@@ -559,7 +473,6 @@ class AudioToTextRecorder:
         self.last_transcription_bytes_b64 = None
         self.initial_prompt = initial_prompt
         self.suppress_tokens = suppress_tokens
-        self.use_wake_words = wake_words or wakeword_backend in {'oww', 'openwakeword', 'openwakewords'}
         self.detected_language = None
         self.detected_language_probability = 0
         self.detected_realtime_language = None
@@ -689,86 +602,6 @@ class AudioToTextRecorder:
             logging.debug("Faster_whisper realtime speech to text "
                           "transcription model initialized successfully")
 
-        # Setup wake word detection
-        if wake_words or wakeword_backend in {'oww', 'openwakeword', 'openwakewords'}:
-            self.wakeword_backend = wakeword_backend
-
-            self.wake_words_list = [
-                word.strip() for word in wake_words.lower().split(',')
-            ]
-            self.wake_words_sensitivity = wake_words_sensitivity
-            self.wake_words_sensitivities = [
-                float(wake_words_sensitivity)
-                for _ in range(len(self.wake_words_list))
-            ]
-
-            if self.wakeword_backend in {'pvp', 'pvporcupine'}:
-
-                try:
-                    self.porcupine = pvporcupine.create(
-                        keywords=self.wake_words_list,
-                        sensitivities=self.wake_words_sensitivities
-                    )
-                    self.buffer_size = self.porcupine.frame_length
-                    self.sample_rate = self.porcupine.sample_rate
-
-                except Exception as e:
-                    logging.exception(
-                        "Error initializing porcupine "
-                        f"wake word detection engine: {e}"
-                    )
-                    raise
-
-                logging.debug(
-                    "Porcupine wake word detection engine initialized successfully"
-                )
-
-            elif self.wakeword_backend in {'oww', 'openwakeword', 'openwakewords'}:
-                    
-                openwakeword.utils.download_models()
-
-                try:
-                    if openwakeword_model_paths:
-                        model_paths = openwakeword_model_paths.split(',')
-                        self.owwModel = Model(
-                            wakeword_models=model_paths,
-                            inference_framework=openwakeword_inference_framework
-                        )
-                        logging.info(
-                            "Successfully loaded wakeword model(s): "
-                            f"{openwakeword_model_paths}"
-                        )
-                    else:
-                        self.owwModel = Model(
-                            inference_framework=openwakeword_inference_framework)
-                    
-                    self.oww_n_models = len(self.owwModel.models.keys())
-                    if not self.oww_n_models:
-                        logging.error(
-                            "No wake word models loaded."
-                        )
-
-                    for model_key in self.owwModel.models.keys():
-                        logging.info(
-                            "Successfully loaded openwakeword model: "
-                            f"{model_key}"
-                        )
-
-                except Exception as e:
-                    logging.exception(
-                        "Error initializing openwakeword "
-                        f"wake word detection engine: {e}"
-                    )
-                    raise
-
-                logging.debug(
-                    "Open wake word detection engine initialized successfully"
-                )
-            
-            else:
-                logging.exception(f"Wakeword engine {self.wakeword_backend} unknown/unsupported. Please specify one of: pvporcupine, openwakeword.")
-
-
         # Setup voice activity detection model WebRTC
         try:
             logging.info("Initializing WebRTC voice with "
@@ -1208,12 +1041,6 @@ class AudioToTextRecorder:
             if audio_interface:
                 audio_interface.terminate()
 
-    def wakeup(self):
-        """
-        If in wake work modus, wake up as if a wake word was spoken.
-        """
-        self.listen_start = time.time()
-
     def abort(self):
         self.start_recording_on_voice_activity = False
         self.stop_recording_on_voice_deactivity = False
@@ -1347,46 +1174,6 @@ class AudioToTextRecorder:
                 logging.error(f"Error during transcription: {str(e)}")
                 raise e
 
-    def _process_wakeword(self, data):
-        """
-        Processes audio data to detect wake words.
-        """
-        if self.wakeword_backend in {'pvp', 'pvporcupine'}:
-            pcm = struct.unpack_from(
-                "h" * self.buffer_size,
-                data
-            )
-            porcupine_index = self.porcupine.process(pcm)
-            if self.debug_mode:
-                logging.info(f"wake words porcupine_index: {porcupine_index}")
-            return self.porcupine.process(pcm)
-
-        elif self.wakeword_backend in {'oww', 'openwakeword', 'openwakewords'}:
-            pcm = np.frombuffer(data, dtype=np.int16)
-            prediction = self.owwModel.predict(pcm)
-            max_score = -1
-            max_index = -1
-            wake_words_in_prediction = len(self.owwModel.prediction_buffer.keys())
-            self.wake_words_sensitivities
-            if wake_words_in_prediction:
-                for idx, mdl in enumerate(self.owwModel.prediction_buffer.keys()):
-                    scores = list(self.owwModel.prediction_buffer[mdl])
-                    if scores[-1] >= self.wake_words_sensitivity and scores[-1] > max_score:
-                        max_score = scores[-1]
-                        max_index = idx
-                if self.debug_mode:
-                    logging.info(f"wake words oww max_index, max_score: {max_index} {max_score}")
-                return max_index  
-            else:
-                if self.debug_mode:
-                    logging.info(f"wake words oww_index: -1")
-                return -1
-
-        if self.debug_mode:        
-            logging.info("wake words no match")
-
-        return -1
-
     def text(self,
              on_transcription_finished=None,
              ):
@@ -1451,8 +1238,6 @@ class AudioToTextRecorder:
         self.text_storage = []
         self.realtime_stabilized_text = ""
         self.realtime_stabilized_safetext = ""
-        self.wakeword_detected = False
-        self.wake_word_detect_time = 0
         self.frames = []
         self.new_frames.set()
         self.is_recording = True
@@ -1497,12 +1282,10 @@ class AudioToTextRecorder:
 
     def listen(self):
         """
-        Puts recorder in immediate "listen" state.
-        This is the state after a wake word detection, for example.
+        Puts recorder in "listen" state.
         The recorder now "listens" for voice activation.
         Once voice is detected we enter "recording" state.
         """
-        self.listen_start = time.time()
         self._set_state("listening")
         self.start_recording_on_voice_activity = True
 
@@ -1626,9 +1409,6 @@ class AudioToTextRecorder:
                 logging.debug('Debug: Initializing variables')
             time_since_last_buffer_message = 0
             was_recording = False
-            delay_was_passed = False
-            wakeword_detected_time = None
-            wakeword_samples_to_remove = None
             self.allowed_to_early_transcribe = True
 
             if self.use_extended_logging:
@@ -1717,88 +1497,17 @@ class AudioToTextRecorder:
                 if not self.is_recording:
                     if self.use_extended_logging:
                         logging.debug('Debug: Handling not recording state')
-                    # Handle not recording state
-                    time_since_listen_start = (time.time() - self.listen_start
-                                            if self.listen_start else 0)
-
-                    wake_word_activation_delay_passed = (
-                        time_since_listen_start >
-                        self.wake_word_activation_delay
-                    )
-
-                    if self.use_extended_logging:
-                        logging.debug('Debug: Handling wake-word timeout callback')
-                    # Handle wake-word timeout callback
-                    if wake_word_activation_delay_passed \
-                            and not delay_was_passed:
-
-                        if self.use_wake_words and self.wake_word_activation_delay:
-                            if self.on_wakeword_timeout:
-                                if self.use_extended_logging:
-                                    logging.debug('Debug: Calling on_wakeword_timeout')
-                                self.on_wakeword_timeout()
-                    delay_was_passed = wake_word_activation_delay_passed
+                   
 
                     if self.use_extended_logging:
                         logging.debug('Debug: Setting state and spinner text')
                     # Set state and spinner text
-                    if not self.recording_stop_time:
-                        if self.use_wake_words \
-                                and wake_word_activation_delay_passed \
-                                and not self.wakeword_detected:
-                            if self.use_extended_logging:
-                                logging.debug('Debug: Setting state to "wakeword"')
-                            self._set_state("wakeword")
-                        else:
-                            if self.listen_start:
-                                if self.use_extended_logging:
-                                    logging.debug('Debug: Setting state to "listening"')
-                                self._set_state("listening")
-                            else:
-                                if self.use_extended_logging:
-                                    logging.debug('Debug: Setting state to "inactive"')
-                                self._set_state("inactive")
-
-                    if self.use_extended_logging:
-                        logging.debug('Debug: Checking wake word conditions')
-                    if self.use_wake_words and wake_word_activation_delay_passed:
-                        try:
-                            if self.use_extended_logging:
-                                logging.debug('Debug: Processing wakeword')
-                            wakeword_index = self._process_wakeword(data)
-
-                        except struct.error:
-                            logging.error("Error unpacking audio data "
-                                        "for wake word processing.")
-                            continue
-
-                        except Exception as e:
-                            logging.error(f"Wake word processing error: {e}")
-                            continue
-
-                        if self.use_extended_logging:
-                            logging.debug('Debug: Checking if wake word detected')
-                        # If a wake word is detected                        
-                        if wakeword_index >= 0:
-                            if self.use_extended_logging:
-                                logging.debug('Debug: Wake word detected, updating variables')
-                            self.wake_word_detect_time = time.time()
-                            wakeword_detected_time = time.time()
-                            wakeword_samples_to_remove = int(self.sample_rate * self.wake_word_buffer_duration)
-                            self.wakeword_detected = True
-                            if self.on_wakeword_detected:
-                                if self.use_extended_logging:
-                                    logging.debug('Debug: Calling on_wakeword_detected')
-                                self.on_wakeword_detected()
 
                     if self.use_extended_logging:
                         logging.debug('Debug: Checking voice activity conditions')
                     # Check for voice activity to
                     # trigger the start of recording
-                    if ((not self.use_wake_words
-                        or not wake_word_activation_delay_passed)
-                            and self.start_recording_on_voice_activity) \
-                            or self.wakeword_detected:
+                    if self.start_recording_on_voice_activity:
 
                         if self.use_extended_logging:
                             logging.debug('Debug: Checking if voice is active')
@@ -1837,25 +1546,6 @@ class AudioToTextRecorder:
                 else:
                     if self.use_extended_logging:
                         logging.debug('Debug: Handling recording state')
-                    # If we are currently recording
-                    if wakeword_samples_to_remove and wakeword_samples_to_remove > 0:
-                        if self.use_extended_logging:
-                            logging.debug('Debug: Removing wakeword samples')
-                        # Remove samples from the beginning of self.frames
-                        samples_removed = 0
-                        while wakeword_samples_to_remove > 0 and self.frames:
-                            frame = self.frames[0]
-                            frame_samples = len(frame) // 2  # Assuming 16-bit audio
-                            if wakeword_samples_to_remove >= frame_samples:
-                                self.frames.pop(0)
-                                samples_removed += frame_samples
-                                wakeword_samples_to_remove -= frame_samples
-                            else:
-                                self.frames[0] = frame[wakeword_samples_to_remove * 2:]
-                                samples_removed += wakeword_samples_to_remove
-                                samples_to_remove = 0
-                        
-                        wakeword_samples_to_remove = 0
 
                     if self.use_extended_logging:
                         logging.debug('Debug: Checking if stop_recording_on_voice_deactivity is True')
@@ -1946,8 +1636,6 @@ class AudioToTextRecorder:
                                     logging.debug('Debug: Resetting speech_end_silence_start')
                                 self.speech_end_silence_start = 0
 
-                                if self.use_extended_logging:
-                                    logging.debug('Debug: Handling non-wake word scenario')
                             else:
                                 if self.use_extended_logging:
                                     logging.debug('Debug: Setting failed_stop_attempt to True')
@@ -1966,20 +1654,6 @@ class AudioToTextRecorder:
                 if time.time() - self.silero_check_time > 0.1:
                     self.silero_check_time = 0
 
-                if self.use_extended_logging:
-                    logging.debug('Debug: Handling wake word timeout')
-                # Handle wake word timeout (waited to long initiating
-                # speech after wake word detection)
-                if self.wake_word_detect_time and time.time() - \
-                        self.wake_word_detect_time > self.wake_word_timeout:
-
-                    self.wake_word_detect_time = 0
-                    if self.wakeword_detected and self.on_wakeword_timeout:
-                        if self.use_extended_logging:
-                            logging.debug('Debug: Calling on_wakeword_timeout')
-                        self.on_wakeword_timeout()
-                    self.wakeword_detected = False
-
                 if self.use_extended_logging:
                     logging.debug('Debug: Updating was_recording')
                 was_recording = self.is_recording
@@ -2337,9 +2011,6 @@ class AudioToTextRecorder:
         if old_state == "listening":
             if self.on_vad_detect_stop:
                 self.on_vad_detect_stop()
-        elif old_state == "wakeword":
-            if self.on_wakeword_detection_end:
-                self.on_wakeword_detection_end()
 
         # Execute callbacks based on transitioning TO a particular state
         if new_state == "listening":
@@ -2348,12 +2019,6 @@ class AudioToTextRecorder:
             self._set_spinner("speak now")
             if self.spinner and self.halo:
                 self.halo._interval = 250
-        elif new_state == "wakeword":
-            if self.on_wakeword_detection_start:
-                self.on_wakeword_detection_start()
-            self._set_spinner(f"say {self.wake_words}")
-            if self.spinner and self.halo:
-                self.halo._interval = 500
         elif new_state == "transcribing":
             if self.on_transcription_start:
                 self.on_transcription_start()
@@ -2369,6 +2034,7 @@ class AudioToTextRecorder:
                 self.halo.stop()
                 self.halo = None
 
+
     def _set_spinner(self, text):
         """
         Update the spinner's text or create a new

+ 0 - 62
RealtimeSTT/audio_recorder_client.py

@@ -29,11 +29,7 @@ INIT_WEBRTC_SENSITIVITY = 3
 INIT_POST_SPEECH_SILENCE_DURATION = 0.6
 INIT_MIN_LENGTH_OF_RECORDING = 0.5
 INIT_MIN_GAP_BETWEEN_RECORDINGS = 0
-INIT_WAKE_WORDS_SENSITIVITY = 0.6
 INIT_PRE_RECORDING_BUFFER_DURATION = 1.0
-INIT_WAKE_WORD_ACTIVATION_DELAY = 0.0
-INIT_WAKE_WORD_TIMEOUT = 5.0
-INIT_WAKE_WORD_BUFFER_DURATION = 0.1
 ALLOWED_LATENCY_LIMIT = 100
 
 CHUNK = 1024
@@ -109,21 +105,6 @@ class AudioToTextRecorderClient:
                  on_vad_detect_start=None,
                  on_vad_detect_stop=None,
 
-                 # Wake word parameters
-                 wakeword_backend: str = "pvporcupine",
-                 openwakeword_model_paths: str = None,
-                 openwakeword_inference_framework: str = "onnx",
-                 wake_words: str = "",
-                 wake_words_sensitivity: float = INIT_WAKE_WORDS_SENSITIVITY,
-                 wake_word_activation_delay: float = (
-                    INIT_WAKE_WORD_ACTIVATION_DELAY
-                 ),
-                 wake_word_timeout: float = INIT_WAKE_WORD_TIMEOUT,
-                 wake_word_buffer_duration: float = INIT_WAKE_WORD_BUFFER_DURATION,
-                 on_wakeword_detected=None,
-                 on_wakeword_timeout=None,
-                 on_wakeword_detection_start=None,
-                 on_wakeword_detection_end=None,
                  on_recorded_chunk=None,
                  debug_mode=False,
                  handle_buffer_overflow: bool = INIT_HANDLE_BUFFER_OVERFLOW,
@@ -181,19 +162,6 @@ class AudioToTextRecorderClient:
         self.on_vad_detect_start = on_vad_detect_start
         self.on_vad_detect_stop = on_vad_detect_stop
 
-        # Wake word parameters
-        self.wakeword_backend = wakeword_backend
-        self.openwakeword_model_paths = openwakeword_model_paths
-        self.openwakeword_inference_framework = openwakeword_inference_framework
-        self.wake_words = wake_words
-        self.wake_words_sensitivity = wake_words_sensitivity
-        self.wake_word_activation_delay = wake_word_activation_delay
-        self.wake_word_timeout = wake_word_timeout
-        self.wake_word_buffer_duration = wake_word_buffer_duration
-        self.on_wakeword_detected = on_wakeword_detected
-        self.on_wakeword_timeout = on_wakeword_timeout
-        self.on_wakeword_detection_start = on_wakeword_detection_start
-        self.on_wakeword_detection_end = on_wakeword_detection_end
         self.on_recorded_chunk = on_recorded_chunk
         self.debug_mode = debug_mode
         self.handle_buffer_overflow = handle_buffer_overflow
@@ -305,9 +273,6 @@ class AudioToTextRecorderClient:
     def abort(self):
         self.call_method("abort")
 
-    def wakeup(self):
-        self.call_method("wakeup")
-
     def clear_audio_queue(self):
         self.call_method("clear_audio_queue")
 
@@ -384,22 +349,6 @@ class AudioToTextRecorderClient:
             args += ['--beam_size', str(self.beam_size)]
         if self.beam_size_realtime is not None:
             args += ['--beam_size_realtime', str(self.beam_size_realtime)]
-        if self.wake_words is not None:
-            args += ['--wake_words', str(self.wake_words)]
-        if self.wake_words_sensitivity is not None:
-            args += ['--wake_words_sensitivity', str(self.wake_words_sensitivity)]
-        if self.wake_word_timeout is not None:
-            args += ['--wake_word_timeout', str(self.wake_word_timeout)]
-        if self.wake_word_activation_delay is not None:
-            args += ['--wake_word_activation_delay', str(self.wake_word_activation_delay)]
-        if self.wakeword_backend is not None:
-            args += ['--wakeword_backend', str(self.wakeword_backend)]
-        if self.openwakeword_model_paths:
-            args += ['--openwakeword_model_paths', str(self.openwakeword_model_paths)]
-        if self.openwakeword_inference_framework is not None:
-            args += ['--openwakeword_inference_framework', str(self.openwakeword_inference_framework)]
-        if self.wake_word_buffer_duration is not None:
-            args += ['--wake_word_buffer_duration', str(self.wake_word_buffer_duration)]
         if self.use_main_model_for_realtime:
             args.append('--use_main_model_for_realtime')  # flag, no need for True/False
         if self.use_extended_logging:
@@ -614,17 +563,6 @@ class AudioToTextRecorderClient:
             elif data.get('type') == 'vad_detect_stop':
                 if self.on_vad_detect_stop:
                     self.on_vad_detect_stop()
-            elif data.get('type') == 'wakeword_detected':
-                if self.on_wakeword_detected:
-                    self.on_wakeword_detected()
-            elif data.get('type') == 'wakeword_detection_start':
-                if self.on_wakeword_detection_start:
-                    self.on_wakeword_detection_start()
-            elif data.get('type') == 'wakeword_detection_end':
-                if self.on_wakeword_detection_end:
-                    self.on_wakeword_detection_end()
-            elif data.get('type') == 'recorded_chunk':
-                pass
 
             else:
                 print(f"Unknown data message format: {data}")

+ 0 - 3
RealtimeSTT_server/stt_cli_client.py

@@ -317,9 +317,6 @@ class STTWebSocketClient:
                 'vad_detect_stop',
                 'recording_start',
                 'recording_stop',
-                'wakeword_detected',
-                'wakeword_detection_start',
-                'wakeword_detection_end',
                 'transcription_start'}:
                 pass  # Known message types, no action needed
             else:

+ 1 - 67
RealtimeSTT_server/stt_server.py

@@ -1,7 +1,7 @@
 """
 Speech-to-Text (STT) Server with Real-Time Transcription and WebSocket Interface
 
-This server provides real-time speech-to-text (STT) transcription using the RealtimeSTT library. It allows clients to connect via WebSocket to send audio data and receive real-time transcription updates. The server supports configurable audio recording parameters, voice activity detection (VAD), and wake word detection. It is designed to handle continuous transcription as well as post-recording processing, enabling real-time feedback with the option to improve final transcription quality after the complete sentence is recognized.
+This server provides real-time speech-to-text (STT) transcription using the RealtimeSTT library. It allows clients to connect via WebSocket to send audio data and receive real-time transcription updates. The server supports configurable audio recording parameters and voice activity detection (VAD). It is designed to handle continuous transcription as well as post-recording processing, enabling real-time feedback with the option to improve final transcription quality after the complete sentence is recognized.
 
 ### Features:
 - Real-time transcription using pre-configured or user-defined STT models.
@@ -23,7 +23,6 @@ stt-server [OPTIONS]
     - `-i, --input-device, --input_device_index`: Audio input device index; default 1.
     - `-c, --control, --control_port`: WebSocket control port; default 8011.
     - `-d, --data, --data_port`: WebSocket data port; default 8012.
-    - `-w, --wake_words`: Wake word(s) to trigger listening; default "".
     - `-D, --debug`: Enable debug logging.
     - `-W, --write`: Save audio to WAV file.
     - `-s, --silence_timing`: Enable dynamic silence duration for sentence detection; default True. 
@@ -42,13 +41,6 @@ stt-server [OPTIONS]
     - `--end_of_sentence_detection_pause`: Silence duration for sentence end detection; default 0.45.
     - `--unknown_sentence_detection_pause`: Pause duration for incomplete sentence detection; default 0.7.
     - `--mid_sentence_detection_pause`: Pause for mid-sentence break; default 2.0.
-    - `--wake_words_sensitivity`: Wake word detection sensitivity (0-1); default 0.5.
-    - `--wake_word_timeout`: Wake word timeout in seconds; default 5.0.
-    - `--wake_word_activation_delay`: Delay before wake word activation; default 20.
-    - `--wakeword_backend`: Backend for wake word detection; default 'none'.
-    - `--openwakeword_model_paths`: Paths to OpenWakeWord models.
-    - `--openwakeword_inference_framework`: OpenWakeWord inference framework; default 'tensorflow'.
-    - `--wake_word_buffer_duration`: Wake word buffer duration in seconds; default 1.0.
     - `--use_main_model_for_realtime`: Use main model for real-time transcription.
     - `--use_extended_logging`: Enable extensive log messages.
     - `--logchunks`: Log incoming audio chunks.
@@ -159,13 +151,11 @@ allowed_methods = [
     'abort',
     'stop',
     'clear_audio_queue',
-    'wakeup',
     'shutdown',
     'text',
 ]
 allowed_parameters = [
     'silero_sensitivity',
-    'wake_word_activation_delay',
     'post_speech_silence_duration',
     'listen_start',
     'recording_stop_time',
@@ -301,27 +291,6 @@ def on_vad_detect_stop(loop):
     })
     asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
 
-def on_wakeword_detected(loop):
-    # Send a message to the client when wake word detection starts
-    message = json.dumps({
-        'type': 'wakeword_detected'
-    })
-    asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
-
-def on_wakeword_detection_start(loop):
-    # Send a message to the client when wake word detection starts
-    message = json.dumps({
-        'type': 'wakeword_detection_start'
-    })
-    asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
-
-def on_wakeword_detection_end(loop):
-    # Send a message to the client when wake word detection ends
-    message = json.dumps({
-        'type': 'wakeword_detection_end'
-    })
-    asyncio.run_coroutine_threadsafe(audio_queue.put(message), loop)
-
 def on_transcription_start(loop):
     # Send a message to the client when transcription starts
     message = json.dumps({
@@ -372,9 +341,6 @@ def parse_arguments():
     parser.add_argument('-d', '--data', '--data_port', type=int, default=8012,
                         help='The port number used for the data WebSocket connection. Data connections are used to send audio data and receive transcription updates in real time. Default is port 8012.')
 
-    parser.add_argument('-w', '--wake_words', type=str, default="",
-                        help='Specify the wake word(s) that will trigger the server to start listening. For example, setting this to "Jarvis" will make the system start transcribing when it detects the wake word "Jarvis". Default is "Jarvis".')
-
     parser.add_argument('-D', '--debug', action='store_true', help='Enable debug logging for detailed server operations')
 
     parser.add_argument("-W", "--write", metavar="FILE",
@@ -434,27 +400,6 @@ def parse_arguments():
     parser.add_argument('--mid_sentence_detection_pause', type=float, default=2.0,
                         help='The duration of pause (in seconds) that the model should interpret as a mid-sentence break. Longer pauses can indicate a pause in speech but not necessarily the end of a sentence. Default is 2.0 seconds.')
 
-    parser.add_argument('--wake_words_sensitivity', type=float, default=0.5,
-                        help='Sensitivity level for wake word detection, with a range from 0 (most sensitive) to 1 (least sensitive). Adjust this value based on your environment to ensure reliable wake word detection. Default is 0.5.')
-
-    parser.add_argument('--wake_word_timeout', type=float, default=5.0,
-                        help='Maximum time in seconds that the system will wait for a wake word before timing out. After this timeout, the system stops listening for wake words until reactivated. Default is 5.0 seconds.')
-
-    parser.add_argument('--wake_word_activation_delay', type=float, default=20,
-                        help='The delay in seconds before the wake word detection is activated after the system starts listening. This prevents false positives during the start of a session. Default is 0.5 seconds.')
-
-    parser.add_argument('--wakeword_backend', type=str, default='none',
-                        help='The backend used for wake word detection. You can specify different backends such as "default" or any custom implementations depending on your setup. Default is "pvporcupine".')
-
-    parser.add_argument('--openwakeword_model_paths', type=str, nargs='*',
-                        help='A list of file paths to OpenWakeWord models. This is useful if you are using OpenWakeWord for wake word detection and need to specify custom models.')
-
-    parser.add_argument('--openwakeword_inference_framework', type=str, default='tensorflow',
-                        help='The inference framework to use for OpenWakeWord models. Supported frameworks could include "tensorflow", "pytorch", etc. Default is "tensorflow".')
-
-    parser.add_argument('--wake_word_buffer_duration', type=float, default=1.0,
-                        help='Duration of the buffer in seconds for wake word detection. This sets how long the system will store the audio before and after detecting the wake word. Default is 1.0 seconds.')
-
     parser.add_argument('--use_main_model_for_realtime', action='store_true',
                         help='Enable this option if you want to use the main model for real-time transcription, instead of the smaller, faster real-time model. Using the main model may provide better accuracy but at the cost of higher processing time.')
 
@@ -720,14 +665,6 @@ async def main_async():
         'beam_size': args.beam_size,
         'beam_size_realtime': args.beam_size_realtime,
         'initial_prompt': args.initial_prompt,
-        'wake_words': args.wake_words,
-        'wake_words_sensitivity': args.wake_words_sensitivity,
-        'wake_word_timeout': args.wake_word_timeout,
-        'wake_word_activation_delay': args.wake_word_activation_delay,
-        'wakeword_backend': args.wakeword_backend,
-        'openwakeword_model_paths': args.openwakeword_model_paths,
-        'openwakeword_inference_framework': args.openwakeword_inference_framework,
-        'wake_word_buffer_duration': args.wake_word_buffer_duration,
         'use_main_model_for_realtime': args.use_main_model_for_realtime,
         'spinner': False,
         'use_microphone': False,
@@ -736,9 +673,6 @@ async def main_async():
         'on_recording_stop': make_callback(loop, on_recording_stop),
         'on_vad_detect_start': make_callback(loop, on_vad_detect_start),
         'on_vad_detect_stop': make_callback(loop, on_vad_detect_stop),
-        'on_wakeword_detected': make_callback(loop, on_wakeword_detected),
-        'on_wakeword_detection_start': make_callback(loop, on_wakeword_detection_start),
-        'on_wakeword_detection_end': make_callback(loop, on_wakeword_detection_end),
         'on_transcription_start': make_callback(loop, on_transcription_start),
         # 'on_recorded_chunk': make_callback(loop, on_recorded_chunk),
         'no_log_file': True,  # Disable logging to file