1 year ago · f870d77874
--- a/README.md
+++ b/README.md
@@ -16,6 +16,11 @@ https://github.com/KoljaB/RealtimeSTT/assets/7604638/207cb9a2-4482-48e7-9d2b-072
 
															 ### Updates
														
 
															+#### v0.1.8
														
 
															+- large-v3 whisper model now supported (upgrade to faster_whisper 0.10.0)
														
 
															+- added feed_audio() and use_microphone parameter to feed chunks
														
 
															+- added browser client example for recording and transcription
														
 
															+
														
 
															 #### v0.1.7
														
 
															 - Bugfix for Mac OS Installation (multiprocessing / queue.size())
														
 
															 - KeyboardInterrupt handling (now abortable with CTRL+C)
														
@@ -75,7 +80,7 @@ To use RealtimeSTT with GPU support via CUDA please follow these steps:
 
															 1. **Install NVIDIA CUDA Toolkit 11.8**:
														
 
															     - Visit [NVIDIA CUDA Toolkit Archive](https://developer.nvidia.com/cuda-11-8-0-download-archive).
														
 
															-    - Select version 11.
														
 
															+    - Select operating system and version.
														
 
															     - Download and install the software.
														
 
															 2. **Install NVIDIA cuDNN 8.7.0 for CUDA 11.x**:
														
@@ -179,6 +184,14 @@ recorder = AudioToTextRecorder(on_recording_start=my_start_callback,
 
															                                on_recording_stop=my_stop_callback)
														
 
															 ```
														
 
															+### Feed chunks
														
 
															+
														
 
															+If you don't want to use the local microphone, set the `use_microphone` parameter to `False` and provide raw PCM audio chunks (16-bit, mono) with this method:
														
 
															+
														
 
															+```python
														
 
															+recorder.feed_audio(audio_chunk)
														
 
															+```
														
 
															+
														
 
															 ### Shutdown
														
 
															 You can shutdown the recorder safely by using the context manager protocol:
														
@@ -252,6 +265,8 @@ When you initialize the `AudioToTextRecorder` class, you have various options to
 
															 - **ensure_sentence_ends_with_period** (bool, default=True): Ensures that every sentence that doesn't end with punctuation such as "?", "!" ends with a period
														
 
															+- **use_microphone** (bool, default=True): Whether to record from the local microphone. Set to False if you want to provide audio chunks with the `feed_audio` method instead.
														
 
															+
														
 
															 - **spinner** (bool, default=True): Provides a spinner animation text with information about the current recorder state.
														
 
															 - **level** (int, default=logging.WARNING): Logging level.
														
--- a/RealtimeSTT/audio_recorder.py
+++ b/RealtimeSTT/audio_recorder.py
--- a/example_browserclient/client.js
+++ b/example_browserclient/client.js
@@ -0,0 +1,49 @@
 
															+let socket = new WebSocket("ws://localhost:9001");
														
 
															+
														
 
															+socket.onmessage = function(event) {
														
 
															+    let data = JSON.parse(event.data);
														
 
															+    if (data.type === 'realtime') {
														
 
															+        document.getElementById('realtimeText').value = data.text;
														
 
															+    } else if (data.type === 'fullSentence') {
														
 
															+        document.getElementById('fullSentenceText').value = data.text;
														
 
															+    }
														
 
															+};
														
 
															+
														
 
															+// Request access to the microphone
														
 
															+navigator.mediaDevices.getUserMedia({ audio: true })
														
 
															+.then(stream => {
														
 
															+    let audioContext = new AudioContext();
														
 
															+    let source = audioContext.createMediaStreamSource(stream);
														
 
															+    let processor = audioContext.createScriptProcessor(256, 1, 1);
														
 
															+
														
 
															+    source.connect(processor);
														
 
															+    processor.connect(audioContext.destination);
														
 
															+
														
 
															+    processor.onaudioprocess = function(e) {
														
 
															+        let inputData = e.inputBuffer.getChannelData(0);
														
 
															+        let outputData = new Int16Array(inputData.length);
														
 
															+
														
 
															+        // Convert to 16-bit PCM
														
 
															+        for (let i = 0; i < inputData.length; i++) {
														
 
															+            outputData[i] = Math.max(-32768, Math.min(32767, inputData[i] * 32768));
														
 
															+        }
														
 
															+
														
 
															+        // Send the 16-bit PCM data to the server
														
 
															+
														
 
															+        if (socket.readyState === WebSocket.OPEN) {
														
 
															+            // Create a JSON string with metadata
														
 
															+            let metadata = JSON.stringify({ sampleRate: audioContext.sampleRate });
														
 
															+            // Convert metadata to a byte array
														
 
															+            let metadataBytes = new TextEncoder().encode(metadata);
														
 
															+            // Create a buffer for metadata length (4 bytes for 32-bit integer)
														
 
															+            let metadataLength = new ArrayBuffer(4);
														
 
															+            let metadataLengthView = new DataView(metadataLength);
														
 
															+            // Set the length of the metadata in the first 4 bytes
														
 
															+            metadataLengthView.setInt32(0, metadataBytes.byteLength, true); // true for little-endian
														
 
															+            // Combine metadata length, metadata, and audio data into a single message
														
 
															+            let combinedData = new Blob([metadataLength, metadataBytes, outputData.buffer]);
														
 
															+            socket.send(combinedData);
														
 
															+        }
														
 
															+    };
														
 
															+})
														
 
															+.catch(e => console.error(e));
														
--- a/example_browserclient/index.html
+++ b/example_browserclient/index.html
@@ -0,0 +1,18 @@
 
															+<!DOCTYPE html>
														
 
															+<html>
														
 
															+<head>
														
 
															+    <title>Audio Streamer</title>
														
 
															+    <script src="https://cdn.socket.io/4.0.0/socket.io.min.js"></script>
														
 
															+</head>
														
 
															+<body>
														
 
															+    <div>
														
 
															+        <p>Realtime Text:</p>
														
 
															+        <textarea id="realtimeText" rows="5" cols="50"></textarea>
														
 
															+    </div>
														
 
															+    <div>
														
 
															+        <p>Full Sentence Text (higher transcription quality):</p>
														
 
															+        <textarea id="fullSentenceText" rows="5" cols="50"></textarea>
														
 
															+    </div>
														
 
															+    <script src="client.js"></script>
														
 
															+</body>
														
 
															+</html>
														
--- a/example_browserclient/server.py
+++ b/example_browserclient/server.py
@@ -0,0 +1,109 @@
 
															+if __name__ == '__main__':
														
 
															+    print("Starting server, please wait...")
														
 
															+
														
 
															+    from RealtimeSTT import AudioToTextRecorder
														
 
															+    import asyncio
														
 
															+    import websockets
														
 
															+    import threading
														
 
															+    import numpy as np
														
 
															+    from scipy.signal import resample
														
 
															+    import json
														
 
															+
														
 
															+    recorder = None
														
 
															+    recorder_ready = threading.Event()
														
 
															+    client_websocket = None
														
 
															+
														
 
															+    async def send_to_client(message):
														
 
															+        if client_websocket:
														
 
															+            await client_websocket.send(message)
														
 
															+
														
 
															+    def text_detected(text):
														
 
															+        asyncio.new_event_loop().run_until_complete(
														
 
															+            send_to_client(
														
 
															+                json.dumps({
														
 
															+                    'type': 'realtime',
														
 
															+                    'text': text
														
 
															+                })
														
 
															+            )
														
 
															+        )
														
 
															+        print(f"{text}", flush=True, end='')
														
 
															+
														
 
															+    recorder_config = {
														
 
															+        'spinner': False,
														
 
															+        'use_microphone': False,
														
 
															+        'model': 'large-v3',
														
 
															+        'language': 'en',
														
 
															+        'silero_sensitivity': 0.4,
														
 
															+        'webrtc_sensitivity': 2,
														
 
															+        'post_speech_silence_duration': 1.0,
														
 
															+        'min_length_of_recording': 0,
														
 
															+        'min_gap_between_recordings': 0,
														
 
															+        'enable_realtime_transcription': True,
														
 
															+        'realtime_processing_pause': 0,
														
 
															+        'realtime_model_type': 'tiny.en',
														
 
															+        'on_realtime_transcription_stabilized': text_detected,
														
 
															+    }
														
 
															+
														
 
															+    def recorder_thread():
														
 
															+        global recorder
														
 
															+        print("Initializing RealtimeSTT...")
														
 
															+        recorder = AudioToTextRecorder(**recorder_config)
														
 
															+        print("RealtimeSTT initialized")
														
 
															+        recorder_ready.set()
														
 
															+        while True:
														
 
															+            full_sentence = recorder.text()
														
 
															+            asyncio.new_event_loop().run_until_complete(
														
 
															+                send_to_client(
														
 
															+                    json.dumps({
														
 
															+                        'type': 'fullSentence',
														
 
															+                        'text': full_sentence
														
 
															+                    })
														
 
															+                )
														
 
															+            )
														
 
															+            print(f"{full_sentence}")
														
 
															+
														
 
															+    def decode_and_resample(
														
 
															+            audio_data,
														
 
															+            original_sample_rate,
														
 
															+            target_sample_rate):
														
 
															+
														
 
															+        # Decode 16-bit PCM data to numpy array
														
 
															+        audio_np = np.frombuffer(audio_data, dtype=np.int16)
														
 
															+
														
 
															+        # Calculate the number of samples after resampling
														
 
															+        num_original_samples = len(audio_np)
														
 
															+        num_target_samples = int(num_original_samples * target_sample_rate /
														
 
															+                                 original_sample_rate)
														
 
															+
														
 
															+        # Resample the audio
														
 
															+        resampled_audio = resample(audio_np, num_target_samples)
														
 
															+
														
 
															+        return resampled_audio.astype(np.int16).tobytes()
														
 
															+
														
 
															+    async def echo(websocket, path):
														
 
															+        print("Client connected")
														
 
															+        global client_websocket
														
 
															+        client_websocket = websocket
														
 
															+        async for message in websocket:
														
 
															+
														
 
															+            if not recorder_ready.is_set():
														
 
															+                print("Recorder not ready")
														
 
															+                continue
														
 
															+
														
 
															+            metadata_length = int.from_bytes(message[:4], byteorder='little')
														
 
															+            metadata_json = message[4:4+metadata_length].decode('utf-8')
														
 
															+            metadata = json.loads(metadata_json)
														
 
															+            sample_rate = metadata['sampleRate']
														
 
															+            chunk = message[4+metadata_length:]
														
 
															+            resampled_chunk = decode_and_resample(chunk, sample_rate, 16000)
														
 
															+            recorder.feed_audio(resampled_chunk)
														
 
															+
														
 
															+    start_server = websockets.serve(echo, "localhost", 9001)
														
 
															+
														
 
															+    recorder_thread = threading.Thread(target=recorder_thread)
														
 
															+    recorder_thread.start()
														
 
															+    recorder_ready.wait()
														
 
															+
														
 
															+    print("Server started. Press Ctrl+C to stop the server.")
														
 
															+    asyncio.get_event_loop().run_until_complete(start_server)
														
 
															+    asyncio.get_event_loop().run_forever()
														
--- a/example_browserclient/start_server.bat
+++ b/example_browserclient/start_server.bat
@@ -0,0 +1,4 @@
 
															+@echo off
														
 
															+cd /d %~dp0
														
 
															+python server.py
														
 
															+cmd
														
--- a/install_with_gpu_support.bat
+++ b/install_with_gpu_support.bat
@@ -1,3 +1,2 @@
 
															-pip uninstall torch
														
 
															-pip install torch==2.0.1+cu118 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
														
 
															+pip install torch==2.1.1+cu118 torchaudio==2.1.1+cu118 --index-url https://download.pytorch.org/whl/cu118
														
 
															 pip install -r requirements-gpu.txt
														
--- a/requirements-gpu.txt
+++ b/requirements-gpu.txt
@@ -1,5 +1,5 @@
 
															-PyAudio>=0.2.13
														
 
															-faster-whisper>=0.7.1
														
 
															+PyAudio==0.2.14
														
 
															+faster-whisper==0.10.0
														
 
															 pvporcupine==1.9.5
														
 
															-webrtcvad>=2.0.10
														
 
															-halo>=0.0.31
														
 
															+webrtcvad==2.0.10
														
 
															+halo==0.0.31
														
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 
															-PyAudio>=0.2.13
														
 
															-faster-whisper>=0.7.1
														
 
															+PyAudio==0.2.14
														
 
															+faster-whisper==0.10.0
														
 
															 pvporcupine==1.9.5
														
 
															-webrtcvad>=2.0.10
														
 
															-halo>=0.0.31
														
 
															-torch>=2.0.1
														
 
															-torchaudio>=2.0.2
														
 
															+webrtcvad==2.0.10
														
 
															+halo==0.0.31
														
 
															+torch==2.1.1
														
 
															+torchaudio==2.1.1
														
--- a/tests/realtimestt_chinese.py
+++ b/tests/realtimestt_chinese.py
@@ -0,0 +1,63 @@
 
															+from RealtimeSTT import AudioToTextRecorder
														
 
															+from colorama import Fore, Back, Style
														
 
															+import colorama
														
 
															+import os
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+
														
 
															+    print("Initializing RealtimeSTT test...")
														
 
															+
														
 
															+    colorama.init()
														
 
															+
														
 
															+    full_sentences = []
														
 
															+    displayed_text = ""
														
 
															+
														
 
															+    def clear_console():
														
 
															+        os.system('clear' if os.name == 'posix' else 'cls')
														
 
															+
														
 
															+    def text_detected(text):
														
 
															+        try:
														
 
															+
														
 
															+            global displayed_text
														
 
															+            sentences_with_style = [
														
 
															+                f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
														
 
															+                for i, sentence in enumerate(full_sentences)
														
 
															+            ]
														
 
															+            new_text = "".join(sentences_with_style).strip() + " " + text if len(sentences_with_style) > 0 else text
														
 
															+
														
 
															+            if new_text != displayed_text:
														
 
															+                displayed_text = new_text
														
 
															+                clear_console()
														
 
															+                print(displayed_text, end="", flush=True)
														
 
															+                
														
 
															+        except Exception as e:
														
 
															+            print(e)
														
 
															+
														
 
															+    def process_text(text):
														
 
															+        full_sentences.append(text)
														
 
															+        text_detected("")
														
 
															+
														
 
															+    recorder_config = {
														
 
															+        'spinner': False,
														
 
															+        'model': 'large-v2',
														
 
															+        'language': 'zh',
														
 
															+        'silero_sensitivity': 0.4,
														
 
															+        'webrtc_sensitivity': 2,
														
 
															+        'post_speech_silence_duration': 0.2,
														
 
															+        'min_length_of_recording': 0,
														
 
															+        'min_gap_between_recordings': 0,        
														
 
															+        # 'enable_realtime_transcription': True,
														
 
															+        # 'realtime_processing_pause': 0.2,
														
 
															+        # 'realtime_model_type': 'tiny',
														
 
															+        # 'on_realtime_transcription_update': text_detected, 
														
 
															+        #'on_realtime_transcription_stabilized': text_detected,
														
 
															+    }
														
 
															+
														
 
															+    recorder = AudioToTextRecorder(**recorder_config)
														
 
															+
														
 
															+    clear_console()
														
 
															+    print("Say something...", end="", flush=True)
														
 
															+
														
 
															+    while True:
														
 
															+        text = recorder.text(process_text)
														
 
															+        text_detected(text)
														
--- a/tests/realtimestt_test.py
+++ b/tests/realtimestt_test.py
@@ -28,6 +28,21 @@ if __name__ == '__main__':
 
															             clear_console()
														
 
															             print(displayed_text, end="", flush=True)
														
 
															+    # def text_detected(text):
														
 
															+    #     global displayed_text
														
 
															+    #     clear_console()
														
 
															+    #     print(text)
														
 
															+        # sentences_with_style = [
														
 
															+        #     f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
														
 
															+        #     for i, sentence in enumerate(full_sentences)
														
 
															+        # ]
														
 
															+        # new_text = "".join(sentences_with_style).strip() + " " + text if len(sentences_with_style) > 0 else text
														
 
															+
														
 
															+        # if new_text != displayed_text:
														
 
															+        #     displayed_text = new_text
														
 
															+        #     clear_console()
														
 
															+        #     print(displayed_text, end="", flush=True)
														
 
															+
														
 
															     def process_text(text):
														
 
															         full_sentences.append(text)
														
 
															         text_detected("")
														
@@ -36,9 +51,9 @@ if __name__ == '__main__':
 
															         'spinner': False,
														
 
															         'model': 'large-v2',
														
 
															         'language': 'en',
														
 
															-        'silero_sensitivity': 0.3,
														
 
															+        'silero_sensitivity': 0.4,
														
 
															         'webrtc_sensitivity': 2,
														
 
															-        'post_speech_silence_duration': 0.5,
														
 
															+        'post_speech_silence_duration': 0.4,
														
 
															         'min_length_of_recording': 0,
														
 
															         'min_gap_between_recordings': 0,
														
 
															         'enable_realtime_transcription': True,