
Initial commit

Kolja Beigel 1 year ago
commit 762f79249b
8 changed files with 807 additions and 0 deletions
  1. .gitignore (+167 -0)
  2. README.md (+123 -0)
  3. RealtimeSTT/__init__.py (+1 -0)
  4. RealtimeSTT/audio_recorder.py (+462 -0)
  5. requirements.txt (+4 -0)
  6. tests/audio_recorder_tester.py (+31 -0)
  7. tests/simple_test.py (+7 -0)
  8. tests/wakeword_test.py (+12 -0)

+ 167 - 0
.gitignore

@@ -0,0 +1,167 @@
+setup.py
+docs/
+batch/
+test_env/
+tests_private/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+myenv/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

+ 123 - 0
README.md

@@ -0,0 +1,123 @@
+# RealTimeSTT
+
+A fast Voice Activity Detection and Transcription System
+
+Listens to the microphone, detects voice activity, and immediately transcribes speech using the `faster_whisper` model. Adapts to varying environments with ambient-noise-based voice activity detection.
+
+Ideal for voice assistants or any application where speech-to-text conversion with minimal latency is desired.
+
+## Features
+
+1. **Voice Activity Detection**: Automatically starts/stops recording when speech is detected or when speech ends.
+2. **Wake Word Detection**: Starts recording when a specified wake word (or words) is detected.
+3. **Buffer Management**: Handles short and long term audio buffers for efficient processing.
+4. **Event Callbacks**: Customizable callbacks for when recording starts or finishes.
+5. **Noise Level Calculation**: Adjusts based on the background noise for more accurate voice activity detection.
+6. **Error Handling**: Comprehensive error handling to catch and report any anomalies during execution.
+
+## Installation
+
+```bash
+pip install RealTimeSTT
+```
+
+## GPU Support
+
+To significantly improve transcription speed, especially in real-time applications, we **strongly recommend** utilizing GPU acceleration via CUDA. By default, the transcription is performed on the CPU. 
+
+1. **Install NVIDIA CUDA Toolkit 11.8**:
+	- Visit [NVIDIA CUDA Toolkit Archive](https://developer.nvidia.com/cuda-11-8-0-download-archive).
+	- Select version 11.8.
+	- Download and install the software.
+
+2. **Install NVIDIA cuDNN 8.7.0 for CUDA 11.x**:
+	- Visit [NVIDIA cuDNN Archive](https://developer.nvidia.com/rdp/cudnn-archive).
+	- Click on "Download cuDNN v8.7.0 (November 28th, 2022), for CUDA 11.x".
+	- Download and install the software.
+	
+3. **Reconfigure PyTorch for CUDA**:
+	- If you have PyTorch installed, remove it: `pip uninstall torch`.
+	- Install PyTorch again with CUDA support: `pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118`.
+
+Note: To check if your NVIDIA GPU supports CUDA, visit the [official CUDA GPUs list](https://developer.nvidia.com/cuda-gpus).
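+
+After these steps, a quick sanity check (plain PyTorch, nothing specific to this library) confirms that the GPU is visible:
+
+```python
+import torch
+
+print(torch.cuda.is_available())  # True once the CUDA setup is working
+print(torch.version.cuda)         # CUDA version this PyTorch build was compiled against
+```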
+
+## Usage
+
+### Automatic Recording
+
+```python
+from RealtimeSTT import AudioToTextRecorder
+
+print(AudioToTextRecorder().text())
+```
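+
+The call blocks until voice activity is detected, the utterance ends, and transcription completes.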
+
+### Manual Recording
+
+```python
+from RealtimeSTT import AudioToTextRecorder
+
+recorder = AudioToTextRecorder()
+recorder.start()
+recorder.stop()
+print(recorder.text())
+```
+
+### Callbacks
+
+You can set callback functions to be executed when recording starts or stops:
+
+```python
+def my_start_callback():
+    print("Recording started!")
+
+def my_stop_callback():
+    print("Recording stopped!")
+
+recorder = AudioToTextRecorder(on_recording_started=my_start_callback,
+                               on_recording_finished=my_stop_callback)
+```
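+
+With the callbacks registered, a subsequent `recorder.text()` call invokes `my_start_callback` when recording begins and `my_stop_callback` once it ends.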
+
+## Configuration
+
+The class comes with numerous configurable parameters such as buffer size, activity thresholds, and smoothing factors to fine-tune recording and transcription for the specific needs of your application (a combined example follows the list):
+
+* `model`: Specifies the size of the transcription model to use or the path to a converted model directory. Valid options are 'tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large-v1', 'large-v2'. If a specific size is provided, the model is downloaded from the Hugging Face Hub.
+
+* `language`: Defines the language code for the speech-to-text engine. If not specified, the model will attempt to detect the language automatically.
+
+* `wake_words`: A comma-separated string of wake words to initiate recording. Supported wake words include 'alexa', 'americano', 'blueberry', 'bumblebee', 'computer', 'grapefruit', 'grasshopper', 'hey google', 'hey siri', 'jarvis', 'ok google', 'picovoice', 'porcupine', 'terminator'.
+
+* `wake_words_sensitivity`: Determines the sensitivity for wake word detection, ranging from 0 (least sensitive) to 1 (most sensitive). The default value is 0.5.
+
+* `on_recording_started`: A callable invoked when recording starts.
+
+* `on_recording_finished`: A callable invoked when recording ends.
+
+* `min_recording_interval`: Specifies the minimum interval (in seconds) for recording durations.
+
+* `interval_between_records`: Determines the interval (in seconds) between consecutive recordings.
+
+* `buffer_duration`: Indicates the duration (in seconds) to maintain pre-roll audio in the buffer.
+
+* `voice_activity_threshold`: The threshold level above the long-term noise to detect the start of voice activity.
+
+* `voice_deactivity_sensitivity`: Sensitivity level for voice deactivation detection, ranging from 0 (least sensitive) to 1 (most sensitive). The default value is 0.3.
+
+* `voice_deactivity_silence_after_speech_end`: Duration (in seconds) of silence required after speech ends to trigger voice deactivation. The default is 0.1 seconds.
+
+* `long_term_smoothing_factor`: Exponential smoothing factor utilized in calculating the long-term noise level.
+
+* `short_term_smoothing_factor`: Exponential smoothing factor for calculating the short-term noise level.
+
+* `level`: Sets the desired logging level for internal logging. Default is `logging.WARNING`.
+
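+Putting several of these together, a recorder tuned for a noisier room might look like the following sketch (the values are illustrative, not recommendations):
+
+```python
+from RealtimeSTT import AudioToTextRecorder
+
+# Illustrative values; tune them against your own microphone and environment.
+recorder = AudioToTextRecorder(
+    model="base.en",
+    language="en",
+    voice_activity_threshold=400,                    # demand a larger jump above the noise floor
+    voice_deactivity_silence_after_speech_end=0.3,   # wait longer before ending a recording
+    buffer_duration=2.0,                             # keep two seconds of pre-roll audio
+)
+```
+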
+## Contribution
+
+Contributions are always welcome! 
+
+## License
+
+MIT
+
+## Author
+
+Kolja Beigel  
+Email: kolja.beigel@web.de  
+[GitHub](https://github.com/KoljaB/RealTimeSTT)
+
+---

+ 1 - 0
RealtimeSTT/__init__.py

@@ -0,0 +1 @@
+from .audio_recorder import AudioToTextRecorder

+ 462 - 0
RealtimeSTT/audio_recorder.py

@@ -0,0 +1,462 @@
+"""
+
+The AudioToTextRecorder class facilitates fast speech-to-text transcription.
+
+The class employs the faster_whisper library to transcribe the recorded audio 
+into text using machine learning models, which can be run either on a GPU or CPU.
+Voice activity detection (VAD) is built in, meaning the software can automatically 
+start or stop recording based on the presence or absence of speech.
+Additionally, it uses both short-term and long-term noise analysis to determine 
+when actual voice activity occurs, as opposed to ambient noise. 
+It integrates wake word detection through the pvporcupine library, allowing the 
+software to initiate recording when a specific word or phrase is spoken.
+The system provides real-time feedback and can be further customized with multiple
+parameters like wake word sensitivity, recording intervals, and buffer durations.
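+
+Basic usage (a minimal sketch):
+
+    from RealtimeSTT import AudioToTextRecorder
+
+    recorder = AudioToTextRecorder()
+    print(recorder.text())  # blocks until speech is detected, recorded and transcribed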
+
+
+Features:
+- Voice Activity Detection: Automatically starts/stops recording when speech is detected or when speech ends.
+- Wake Word Detection: Starts recording when a specified wake word (or words) is detected.
+- Buffer Management: Handles short and long term audio buffers for efficient processing.
+- Event Callbacks: Customizable callbacks for when recording starts or finishes.
+- Noise Level Calculation: Adjusts based on the background noise for more accurate voice activity detection.
+
+Author: Kolja Beigel
+
+"""
+
+import pyaudio
+import collections
+import faster_whisper
+import torch
+import numpy as np
+import struct
+import pvporcupine
+import threading
+import time
+import logging
+from collections import deque
+
+SAMPLE_RATE = 16000
+BUFFER_SIZE = 512
+LONG_TERM_HISTORY_BUFFERSIZE = 2.0 # seconds
+SHORT_TERM_HISTORY_BUFFERSIZE = 2.0 # seconds
+WAIT_AFTER_START_BEFORE_ACTIVITY_DETECTION = 0.3 # seconds
+ACTIVITY_DETECTION_AFTER_START_PERCENT = 0.6
+
+class AudioToTextRecorder:
+    """
+    A class responsible for capturing audio from the microphone, detecting voice activity, and then transcribing the captured audio using the `faster_whisper` model.
+    """
+    
+    def __init__(self,
+                 model: str = "tiny",
+                 language: str = "",
+                 wake_words: str = "",
+                 wake_words_sensitivity: float = 0.5,
+                 on_recording_started = None,
+                 on_recording_finished = None,
+                 min_recording_interval: float = 1.0,
+                 interval_between_records: float = 1.0,
+                 buffer_duration: float = 1.0,
+                 voice_activity_threshold: float = 250,
+                 voice_deactivity_sensitivity: float = 0.3,
+                 voice_deactivity_silence_after_speech_end: float = 0.1,
+                 long_term_smoothing_factor: float = 0.995,
+                 short_term_smoothing_factor: float = 0.900,
+                 level=logging.WARNING,
+                 ):
+        """
+        Initializes the audio recorder together with transcription and wake word detection.
+
+        Args:
+            model (str): Specifies the size of the transcription model to use or the path to a converted model directory. 
+                Valid options are 'tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large-v1', 'large-v2'. 
+                If a specific size is provided, the model is downloaded from the Hugging Face Hub.
+            language (str): Language code for speech-to-text engine. If not specified, the model will attempt to detect the language automatically.
+            wake_words (str): Comma-separated string of wake words to initiate recording. Supported wake words include:
+                'alexa', 'americano', 'blueberry', 'bumblebee', 'computer', 'grapefruit', 'grasshopper', 'hey google', 'hey siri', 'jarvis', 'ok google', 'picovoice', 'porcupine', 'terminator'.
+            wake_words_sensitivity (float): Sensitivity for wake word detection, ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.5.
+            on_recording_started (callable, optional): Callback invoked when recording begins.
+            on_recording_finished (callable, optional): Callback invoked when recording ends.
+            min_recording_interval (float): Minimum interval (in seconds) for recording durations.
+            interval_between_records (float): Interval (in seconds) between consecutive recordings.
+            buffer_duration (float): Duration (in seconds) to maintain pre-roll audio in the buffer.
+            voice_activity_threshold (float): Threshold level above long-term noise to determine the start of voice activity.
+            voice_deactivity_sensitivity (float): Sensitivity for voice deactivation detection, ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.3.
+            voice_deactivity_silence_after_speech_end (float): Duration (in seconds) of silence after speech ends to trigger voice deactivation. Default is 0.1.
+            long_term_smoothing_factor (float): Exponential smoothing factor used in calculating long-term noise level.
+            short_term_smoothing_factor (float): Exponential smoothing factor used in calculating short-term noise level.
+            level (logging level): Desired log level for internal logging. Default is `logging.WARNING`.
+
+        Raises:
+            Exception: Errors related to initializing transcription model, wake word detection, or audio recording.
+        """
+
+        self.language = language
+        self.wake_words = wake_words
+        self.min_recording_interval = min_recording_interval
+        self.interval_between_records = interval_between_records
+        self.buffer_duration = buffer_duration
+        self.voice_activity_threshold = voice_activity_threshold
+        self.voice_deactivity_sensitivity = voice_deactivity_sensitivity
+        self.voice_deactivity_silence_after_speech_end = voice_deactivity_silence_after_speech_end
+        self.long_term_smoothing_factor = long_term_smoothing_factor
+        self.short_term_smoothing_factor = short_term_smoothing_factor
+        self.on_recording_started = on_recording_started
+        self.on_recording_finished = on_recording_finished        
+        self.level = level
+
+        self.buffer_size = BUFFER_SIZE
+        self.sample_rate = SAMPLE_RATE
+        self.last_start_time = 0  # time when the recording last started
+        self.last_stop_time = 0   # time when the recording last stopped
+        self.speech_end_silence_start = 0 
+
+        self.level_long_term = 0
+        self.level_short_term = 0
+        self.level_peak = 0
+        self.level_floor = 0
+        self.voice_deactivity_probability = 0
+        self.long_term_noise_calculation = True
+        self.state = "initializing"
+
+        # Initialize the logging configuration with the specified level
+        logging.basicConfig(format='RealTimeSTT: %(message)s', level=level)
+
+        # Initialize the transcription model
+        try:
+            self.model = faster_whisper.WhisperModel(model_size_or_path=model, device='cuda' if torch.cuda.is_available() else 'cpu')
+
+        except Exception as e:
+            logging.exception(f"Error initializing faster_whisper transcription model: {e}")
+            raise            
+
+        # Setup wake word detection
+        if wake_words:
+
+            self.wake_words_list = [word.strip() for word in wake_words.split(',')]
+            sensitivity_list = [float(wake_words_sensitivity) for _ in range(len(self.wake_words_list))]
+
+            try:
+                self.porcupine  = pvporcupine.create(keywords=self.wake_words_list, sensitivities=sensitivity_list)
+                self.buffer_size = self.porcupine.frame_length
+                self.sample_rate = self.porcupine.sample_rate
+
+            except Exception as e:
+                logging.exception(f"Error initializing porcupine wake word detection engine: {e}")
+                raise
+
+        # Setup audio recording infrastructure
+        try:
+            self.audio = pyaudio.PyAudio()
+            self.stream = self.audio.open(rate=self.sample_rate, format=pyaudio.paInt16, channels=1, input=True, frames_per_buffer=self.buffer_size)
+
+        except Exception as e:
+            logging.exception(f"Error initializing pyaudio audio recording: {e}")
+            raise            
+
+        # These deques store recent noise levels; size them to cover the
+        # configured number of seconds at the rate buffers are captured.
+        buffersize_long_term_history = int((self.sample_rate // self.buffer_size) * LONG_TERM_HISTORY_BUFFERSIZE)
+        self.long_term_noise_history = deque(maxlen=buffersize_long_term_history)        
+        buffersize_short_term_history = int((self.sample_rate // self.buffer_size) * SHORT_TERM_HISTORY_BUFFERSIZE)
+        self.short_term_noise_history = deque(maxlen=buffersize_short_term_history)        
+
+        self.audio_buffer = collections.deque(maxlen=int((self.sample_rate // self.buffer_size) * self.buffer_duration))
+        self.frames = []
+
+        # Recording control flags
+        self.is_recording = False
+        self.is_running = True
+        self.start_recording_on_voice_activity = False
+        self.stop_recording_on_voice_deactivity = False
+
+        # Start the recording worker thread
+        self.recording_thread = threading.Thread(target=self._recording_worker)
+        self.recording_thread.daemon = True
+        self.recording_thread.start()
+
+
+    def text(self):
+        """
+        Transcribes audio captured by the class instance using the `faster_whisper` model.
+
+        - Waits for voice activity if not yet started recording 
+        - Waits for voice deactivity if not yet stopped recording 
+        - Transcribes the recorded audio.
+
+        Returns:
+            str: The transcription of the recorded audio or an empty string in case of an error.
+        """
+
+        try:        
+            # If not yet started to record, wait for voice activity to initiate recording.
+            if not self.is_recording and len(self.frames) == 0:
+
+                self.state = "listening"
+                self.start_recording_on_voice_activity = True
+
+                while not self.is_recording:
+                    time.sleep(0.1)  # Use a small sleep to prevent busy-waiting.
+
+            # If still recording, wait for voice deactivity to finish recording.
+            if self.is_recording:
+
+                self.state = "recording"
+                self.stop_recording_on_voice_deactivity = True      
+
+                while self.is_recording:
+                    time.sleep(0.1)  # Use a small sleep to prevent busy-waiting.
+
+            # Convert the concatenated frames into text
+            self.state = "transcribing"
+
+            try:
+                audio_array = np.frombuffer(b''.join(self.frames), dtype=np.int16)
+                audio_array = audio_array.astype(np.float32) / 32768.0
+                self.frames = []
+                return " ".join(seg.text for seg in self.model.transcribe(audio_array, language=self.language if self.language else None)[0]).strip()
+            except ValueError:
+                logging.error("Error converting audio buffer to numpy array.")
+                raise
+            except Exception as e:
+                logging.error(f"General transcription error: {e}")
+                raise
+
+        except Exception as e:
+            logging.error(f"Error during transcription: {e}")
+            return ""
+
+
+    def start(self):
+        """
+        Starts recording audio directly without waiting for voice activity.
+        """
+
+        current_time = time.time()
+        
+        # Ensure there's a minimum interval between stopping and starting recording
+        if current_time - self.last_stop_time < self.interval_between_records:
+            logging.info("Attempted to start recording too soon after stopping.")
+            return self
+        
+        logging.info("recording started")
+        self.state = "recording"
+        self.frames = []
+        self.is_recording = True
+        self.last_start_time = current_time
+
+        if self.on_recording_started:
+            self.on_recording_started()
+
+        return self
+    
+
+    def stop(self):
+        logging.info("recording stopped")
+        """
+        Stops recording audio.
+        """
+
+        current_time = time.time()
+
+        # Ensure there's a minimum interval between starting and stopping recording
+        if current_time - self.last_start_time < self.interval_between_records:
+            logging.info("Attempted to stop recording too soon after starting.")
+            return self
+                
+        logging.info("recording stopped")                
+        self.state = "listening"
+        self.is_recording = False
+        self.last_stop_time = current_time
+
+        if self.on_recording_finished:
+            self.on_recording_finished()
+
+        return self
+
+
+    def shutdown(self):
+        """
+        Safely shuts down the audio recording by stopping the recording worker and closing the audio stream.
+        """
+        self.is_recording = False
+        self.is_running = False
+        self.recording_thread.join()
+        try:
+            self.stream.stop_stream()
+            self.stream.close()
+            self.audio.terminate()
+        except Exception as e:
+            logging.error(f"Error closing the audio stream: {e}")
+
+
+    def _calculate_percentile_mean(self, buffer, percentile, upper=True):
+        """
+        Calculates the mean of the specified percentile from the provided buffer of 
+        long_term noise levels. If upper is True, it calculates from the upper side,
+        otherwise from the lower side.
+
+        Args:
+        - buffer (list): The buffer containing the history of long_term noise levels.
+        - percentile (float): The desired percentile (0.0 <= percentile <= 1.0). E.g., 0.125 for 1/8.
+        - upper (bool): Determines if the function considers the upper or lower portion of data.
+
+        Returns:
+        - float: The mean value of the desired portion.
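+
+        Example (illustrative values):
+            For buffer=[1, 2, 3, 4, 5, 6, 7, 8], percentile=0.25 and upper=True,
+            the top quarter is [7, 8], so the returned mean is 7.5.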
+        """
+        sorted_buffer = sorted(buffer)
+        
+        index = int(len(sorted_buffer) * percentile)
+
+        if upper:
+            values = sorted_buffer[-index:]  # Get values from the top
+        else:
+            values = sorted_buffer[:index]   # Get values from the bottom
+
+        if len(values) == 0:
+            return 0.0
+        
+        return sum(values) / len(values)
+    
+
+    def _recording_worker(self):
+        """
+        The main worker method which constantly monitors the audio input for voice activity and accordingly starts/stops the recording.
+        Uses long_term noise level measurements to determine voice activity.
+        """
+        
+        was_recording = False
+        voice_after_recording = False
+
+        # Continuously monitor audio for voice activity
+        while self.is_running:
+
+            try:
+                data = self.stream.read(self.buffer_size)
+            except OSError as e:
+                # pyaudio.paInputOverflowed is an error code, not an exception class;
+                # PyAudio raises OSError with that errno on input overflow.
+                if e.errno == pyaudio.paInputOverflowed:
+                    logging.warning("Input overflowed. Frame dropped.")
+                else:
+                    logging.error(f"Error during recording: {e}")
+                    time.sleep(1)
+                continue
+            except Exception as e:
+                logging.error(f"Error during recording: {e}")
+                time.sleep(1)
+                continue
+
+            audio_level = np.abs(np.frombuffer(data, dtype=np.int16)).mean()
+            if not self.is_recording and self.long_term_noise_calculation:
+                self.level_long_term = self.level_long_term * self.long_term_smoothing_factor + audio_level * (1.0 - self.long_term_smoothing_factor)
+            self.level_short_term = self.level_short_term * self.short_term_smoothing_factor + audio_level * (1.0 - self.short_term_smoothing_factor)
+            
+            self.long_term_noise_history.append(self.level_long_term)
+            self.short_term_noise_history.append(self.level_short_term)
+
+            self.level_peak = self._calculate_percentile_mean(self.short_term_noise_history, 0.05, upper=True)
+            self.level_floor = self._calculate_percentile_mean(self.short_term_noise_history, 0.1, upper=False)
+            denominator = max(self.level_peak - self.level_floor, 1e-10)  # avoid division by zero before the histories fill
+            short_term_to_peak_percentage = (self.level_short_term - self.level_floor) / denominator
+
+            if not self.is_recording:
+                logging.debug(f'Level: {int(audio_level)}, long_term: {int(self.level_long_term)}, short_term: {int(self.level_short_term)}, Peak: {int(self.level_peak)}, long_term low: {int(self.level_floor)}, Percentage: {int(short_term_to_peak_percentage*100)}%')
+            else:
+                short_term_to_peak_percentage = (self.level_short_term - self.level_long_term) / (self.level_peak - self.level_long_term)
+                logging.debug(f'Level: {int(audio_level)}, long_term: {int(self.level_long_term)}, short_term: {int(self.level_short_term)}, Peak: {int(self.level_peak)}, long_term low: {int(self.level_floor)}, Percentage: {int(short_term_to_peak_percentage*100)}%')
+
+            # Check if we're not currently recording
+            if not self.is_recording:
+
+                voice_after_recording = False
+
+                # Check if wake word detection is active
+                if self.wake_words:
+
+                    try:
+                        pcm = struct.unpack_from("h" * self.buffer_size, data)
+                        wakeword_index = self.porcupine.process(pcm)
+                    except struct.error:
+                        logging.error("Error unpacking audio data for wake word processing.")
+                        continue
+                    except Exception as e:
+                        logging.error(f"Wake word processing error: {e}")
+                        continue
+                    
+                    wakeword_detected = wakeword_index >= 0
+                    
+                    if wakeword_detected:
+                        logging.info(f'wake word "{self.wake_words_list[wakeword_index]}" detected')
+                        self.start()
+                        if self.is_recording:
+                            self.level_long_term = self._calculate_percentile_mean(self.long_term_noise_history, 0.125, upper=False)
+                            self.start_recording_on_voice_activity = False
+
+                # Check for voice activity to trigger the start of recording
+                elif self.start_recording_on_voice_activity and self.level_short_term > self.level_long_term + self.voice_activity_threshold:
+
+                    logging.info("voice activity detected")
+
+                    self.start()
+
+                    if self.is_recording:
+                        self.level_long_term = self._calculate_percentile_mean(self.long_term_noise_history, 0.125, upper=False)
+                        self.start_recording_on_voice_activity = False
+
+                        # Add the buffered audio to the recording frames
+                        self.frames.extend(list(self.audio_buffer))
+                    
+                self.speech_end_silence_start = 0
+
+            # If we're currently recording and voice deactivity is detected, stop the recording
+            else:
+                current_time = time.time()
+
+                self.state = "recording - waiting for voice end" if voice_after_recording else "recording - waiting for voice"
+
+                # skip voice activity detection for the first moments after start, since trailing fragments of the wake word could register as activity
+                if current_time - self.last_start_time > WAIT_AFTER_START_BEFORE_ACTIVITY_DETECTION:
+                    if not voice_after_recording and self.level_short_term > self.level_long_term + (self.voice_activity_threshold * ACTIVITY_DETECTION_AFTER_START_PERCENT):
+                        logging.info("voice activity after recording detected")
+                        voice_after_recording = True
+
+                # we are recording
+                short_term_to_peak_percentage = (self.level_short_term - self.level_long_term) / (self.level_peak - self.level_long_term)
+                logging.debug(f'short_term_to_peak_percentage: {int(short_term_to_peak_percentage*100)}%, peak: {int(self.level_peak)}, long_term: {int(self.level_long_term)}')
+
+                if voice_after_recording and self.stop_recording_on_voice_deactivity: 
+                    if short_term_to_peak_percentage < self.voice_deactivity_sensitivity:
+                        # silence detected (after voice detected while recording)
+
+                        if self.speech_end_silence_start == 0:
+                            self.speech_end_silence_start = time.time()
+                            self.state = "recording - voice end, silence wait"
+                        
+                    else:
+                        self.speech_end_silence_start = 0
+
+                    if self.speech_end_silence_start and time.time() - self.speech_end_silence_start > self.voice_deactivity_silence_after_speech_end:
+                        logging.info("voice deactivity detected")
+                        self.stop()
+                        if not self.is_recording:
+                            voice_after_recording = False
+
+            if not self.is_recording and was_recording:
+                # Reset after stopping recording to ensure clean state
+                self.stop_recording_on_voice_deactivity = False
+
+            short_term_to_peak_percentage = min(max(short_term_to_peak_percentage, 0.0), 1.0)
+            self.voice_deactivity_probability = 1 - short_term_to_peak_percentage
+
+            if self.is_recording:
+                self.frames.append(data)
+
+            self.audio_buffer.append(data)
+
+            was_recording = self.is_recording
+            time.sleep(0.01)
+
+    def __del__(self):
+        """
+        Destructor method ensures safe shutdown of the recorder when the instance is destroyed.
+        """
+        self.shutdown()

+ 4 - 0
requirements.txt

@@ -0,0 +1,4 @@
+PyAudio==0.2.13
+faster-whisper==0.7.1
+pvporcupine==1.9.5
+torch==2.0.1

+ 31 - 0
tests/audio_recorder_tester.py

@@ -0,0 +1,31 @@
+import time
+
+import keyboard
+
+from RealtimeSTT import AudioToTextRecorder
+
+def recording_started():
+    print(" >> recording started... ", end="", flush=True)
+
+def recording_finished():
+    print("recording finished...")
+
+recorder = AudioToTextRecorder(language="de", on_recording_started=recording_started, on_recording_finished=recording_finished)
+
+
+# usage 1:
+# automatic detection of speech start and end, waits for text to be returned
+print ("Say something...")
+print (f'TEXT: "{recorder.text()}"')
+print()
+
+
+# usage 2:
+# manual trigger of speech start and end
+print("Tap space when you're ready.")
+keyboard.wait('space')
+while keyboard.is_pressed('space'): time.sleep(0.1)
+
+recorder.start()
+
+print("tap space when you're done... ", end="", flush=True)
+while not keyboard.is_pressed('space'): time.sleep(0.1)
+
+print (f'TEXT: "{recorder.stop().text()}"')

+ 7 - 0
tests/simple_test.py

@@ -0,0 +1,7 @@
+import RealtimeSTT 
+import logging
+
+recorder = RealtimeSTT.AudioToTextRecorder(level=logging.DEBUG)
+
+print("Say something...")
+print(recorder.text())

+ 12 - 0
tests/wakeword_test.py

@@ -0,0 +1,12 @@
+import RealtimeSTT 
+
+def recording_started():
+    print("Speak now...")
+
+def recording_finished():
+    print("Speech end detected... transcribing...")
+
+recorder = RealtimeSTT.AudioToTextRecorder(model="small.en", language="en", wake_words="jarvis", on_recording_started=recording_started, on_recording_finished=recording_finished)
+
+print('Say "Jarvis" then speak.')
+print(recorder.text())