Browse source code

whisper v3, browser support

Kolja Beigel 1 year ago
parent
commit
f870d77874

+ 16 - 1
README.md

@@ -16,6 +16,11 @@ https://github.com/KoljaB/RealtimeSTT/assets/7604638/207cb9a2-4482-48e7-9d2b-072
 
 ### Updates
 
+#### v0.1.8
+- large-v3 whisper model now supported (upgraded to faster_whisper 0.10.0)
+- added feed_audio() method and use_microphone parameter for feeding external audio chunks
+- added an example for recording and transcription from the browser
+
 #### v0.1.7
 - Bugfix for Mac OS Installation (multiprocessing / queue.size())
 - KeyboardInterrupt handling (now abortable with CTRL+C)
@@ -75,7 +80,7 @@ To use RealtimeSTT with GPU support via CUDA please follow these steps:
 
 1. **Install NVIDIA CUDA Toolkit 11.8**:
     - Visit [NVIDIA CUDA Toolkit Archive](https://developer.nvidia.com/cuda-11-8-0-download-archive).
-    - Select version 11.
+    - Select operating system and version.
     - Download and install the software.
 
 2. **Install NVIDIA cuDNN 8.7.0 for CUDA 11.x**:
@@ -179,6 +184,14 @@ recorder = AudioToTextRecorder(on_recording_start=my_start_callback,
                                on_recording_stop=my_stop_callback)
 ```
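The callbacks referenced above are plain no-argument callables; a minimal sketch (the bodies are illustrative):

```python
# Sketch of the callbacks passed to AudioToTextRecorder above;
# what they do is up to the caller.
def my_start_callback():
    print("Recording started")

def my_stop_callback():
    print("Recording stopped")
```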
 
+### Feed chunks
+
+If you don't want to use the local microphone, set the `use_microphone` parameter to `False` and provide raw PCM audio chunks in 16-bit mono with this method:
+
+```python
+recorder.feed_audio(audio_chunk)
+```
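For instance, a minimal sketch of feeding chunks from a file instead of the microphone (assuming a 16-bit mono WAV file; the file name and chunk size are illustrative):

```python
# Editorial sketch, not part of this commit: feed 16-bit mono PCM
# read from a WAV file instead of capturing from the microphone.
import wave
from RealtimeSTT import AudioToTextRecorder

with AudioToTextRecorder(use_microphone=False, spinner=False) as recorder:
    with wave.open("speech_16khz_mono.wav", "rb") as wav_file:
        while True:
            audio_chunk = wav_file.readframes(1024)  # raw PCM bytes
            if not audio_chunk:
                break
            recorder.feed_audio(audio_chunk)
    print(recorder.text())  # blocks until a full sentence is transcribed
```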
+
 ### Shutdown
 
 You can shutdown the recorder safely by using the context manager protocol:
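A minimal sketch of what that looks like:

```python
from RealtimeSTT import AudioToTextRecorder

# The context manager protocol shuts the recorder down cleanly on exit.
with AudioToTextRecorder() as recorder:
    print(recorder.text())
```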
@@ -252,6 +265,8 @@ When you initialize the `AudioToTextRecorder` class, you have various options to
 
 - **ensure_sentence_ends_with_period** (bool, default=True): Ensures that every sentence that doesn't end with punctuation such as "?" or "!" ends with a period.
 
+- **use_microphone** (bool, default=True): Whether to use the local microphone for transcription. Set to False if you want to provide audio chunks with the feed_audio method.
+
 - **spinner** (bool, default=True): Provides a spinner animation text with information about the current recorder state.
 
 - **level** (int, default=logging.WARNING): Logging level.
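Taken together, a hedged sketch of initializing the recorder with the options documented above (the values are illustrative):

```python
# Editorial sketch: combining the initialization options documented above.
import logging
from RealtimeSTT import AudioToTextRecorder

recorder = AudioToTextRecorder(
    ensure_sentence_ends_with_period=True,
    use_microphone=False,   # feed chunks via feed_audio() instead
    spinner=False,          # no spinner animation
    level=logging.DEBUG,    # more verbose than the WARNING default
)
```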

File diff suppressed because it is too large
+ 456 - 203
RealtimeSTT/audio_recorder.py


+ 49 - 0
example_browserclient/client.js

@@ -0,0 +1,49 @@
+let socket = new WebSocket("ws://localhost:9001");
+
+socket.onmessage = function(event) {
+    let data = JSON.parse(event.data);
+    if (data.type === 'realtime') {
+        document.getElementById('realtimeText').value = data.text;
+    } else if (data.type === 'fullSentence') {
+        document.getElementById('fullSentenceText').value = data.text;
+    }
+};
+
+// Request access to the microphone
+navigator.mediaDevices.getUserMedia({ audio: true })
+.then(stream => {
+    let audioContext = new AudioContext();
+    let source = audioContext.createMediaStreamSource(stream);
+    let processor = audioContext.createScriptProcessor(256, 1, 1);
+
+    source.connect(processor);
+    processor.connect(audioContext.destination);
+
+    processor.onaudioprocess = function(e) {
+        let inputData = e.inputBuffer.getChannelData(0);
+        let outputData = new Int16Array(inputData.length);
+
+        // Convert to 16-bit PCM
+        for (let i = 0; i < inputData.length; i++) {
+            outputData[i] = Math.max(-32768, Math.min(32767, inputData[i] * 32768));
+        }
+
+        // Send the 16-bit PCM data to the server
+
+        if (socket.readyState === WebSocket.OPEN) {
+            // Create a JSON string with metadata
+            let metadata = JSON.stringify({ sampleRate: audioContext.sampleRate });
+            // Convert metadata to a byte array
+            let metadataBytes = new TextEncoder().encode(metadata);
+            // Create a buffer for metadata length (4 bytes for 32-bit integer)
+            let metadataLength = new ArrayBuffer(4);
+            let metadataLengthView = new DataView(metadataLength);
+            // Set the length of the metadata in the first 4 bytes
+            metadataLengthView.setInt32(0, metadataBytes.byteLength, true); // true for little-endian
+            // Combine metadata length, metadata, and audio data into a single message
+            let combinedData = new Blob([metadataLength, metadataBytes, outputData.buffer]);
+            socket.send(combinedData);
+        }
+    };
+})
+.catch(e => console.error(e));
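Each WebSocket message the client sends is therefore framed as: a 4-byte little-endian length prefix, the UTF-8 JSON metadata (currently just the sample rate), and finally the raw 16-bit PCM samples. server.py below parses exactly this layout.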

+ 18 - 0
example_browserclient/index.html

@@ -0,0 +1,18 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Audio Streamer</title>
+    <script src="https://cdn.socket.io/4.0.0/socket.io.min.js"></script>
+</head>
+<body>
+    <div>
+        <p>Realtime Text:</p>
+        <textarea id="realtimeText" rows="5" cols="50"></textarea>
+    </div>
+    <div>
+        <p>Full Sentence Text (higher transcription quality):</p>
+        <textarea id="fullSentenceText" rows="5" cols="50"></textarea>
+    </div>
+    <script src="client.js"></script>
+</body>
+</html>

+ 109 - 0
example_browserclient/server.py

@@ -0,0 +1,109 @@
+if __name__ == '__main__':
+    print("Starting server, please wait...")
+
+    from RealtimeSTT import AudioToTextRecorder
+    import asyncio
+    import websockets
+    import threading
+    import numpy as np
+    from scipy.signal import resample
+    import json
+
+    recorder = None
+    recorder_ready = threading.Event()
+    client_websocket = None
+
+    async def send_to_client(message):
+        if client_websocket:
+            await client_websocket.send(message)
+
+    def text_detected(text):
+        asyncio.new_event_loop().run_until_complete(
+            send_to_client(
+                json.dumps({
+                    'type': 'realtime',
+                    'text': text
+                })
+            )
+        )
+        print(f"{text}", flush=True, end='')
+
+    recorder_config = {
+        'spinner': False,
+        'use_microphone': False,
+        'model': 'large-v3',
+        'language': 'en',
+        'silero_sensitivity': 0.4,
+        'webrtc_sensitivity': 2,
+        'post_speech_silence_duration': 1.0,
+        'min_length_of_recording': 0,
+        'min_gap_between_recordings': 0,
+        'enable_realtime_transcription': True,
+        'realtime_processing_pause': 0,
+        'realtime_model_type': 'tiny.en',
+        'on_realtime_transcription_stabilized': text_detected,
+    }
+
+    def recorder_thread():
+        global recorder
+        print("Initializing RealtimeSTT...")
+        recorder = AudioToTextRecorder(**recorder_config)
+        print("RealtimeSTT initialized")
+        recorder_ready.set()
+        while True:
+            full_sentence = recorder.text()
+            asyncio.new_event_loop().run_until_complete(
+                send_to_client(
+                    json.dumps({
+                        'type': 'fullSentence',
+                        'text': full_sentence
+                    })
+                )
+            )
+            print(f"{full_sentence}")
+
+    def decode_and_resample(
+            audio_data,
+            original_sample_rate,
+            target_sample_rate):
+
+        # Decode 16-bit PCM data to numpy array
+        audio_np = np.frombuffer(audio_data, dtype=np.int16)
+
+        # Calculate the number of samples after resampling
+        num_original_samples = len(audio_np)
+        num_target_samples = int(num_original_samples * target_sample_rate /
+                                 original_sample_rate)
+
+        # Resample the audio
+        resampled_audio = resample(audio_np, num_target_samples)
+
+        return resampled_audio.astype(np.int16).tobytes()
+
+    async def echo(websocket, path):
+        print("Client connected")
+        global client_websocket
+        client_websocket = websocket
+        async for message in websocket:
+
+            if not recorder_ready.is_set():
+                print("Recorder not ready")
+                continue
+
+            metadata_length = int.from_bytes(message[:4], byteorder='little')
+            metadata_json = message[4:4+metadata_length].decode('utf-8')
+            metadata = json.loads(metadata_json)
+            sample_rate = metadata['sampleRate']
+            chunk = message[4+metadata_length:]
+            resampled_chunk = decode_and_resample(chunk, sample_rate, 16000)
+            recorder.feed_audio(resampled_chunk)
+
+    start_server = websockets.serve(echo, "localhost", 9001)
+
+    recorder_thread = threading.Thread(target=recorder_thread)
+    recorder_thread.start()
+    recorder_ready.wait()
+
+    print("Server started. Press Ctrl+C to stop the server.")
+    asyncio.get_event_loop().run_until_complete(start_server)
+    asyncio.get_event_loop().run_forever()
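To exercise the server without opening the browser page, one could send a framed chunk from Python — a hedged sketch using the same websockets package and the client's wire format (the silence payload and the 48 kHz rate are illustrative):

```python
# Editorial sketch: send one message in the browser client's wire format
# (4-byte little-endian metadata length + JSON metadata + 16-bit PCM).
import asyncio
import json
import struct
import websockets

async def send_test_chunk():
    async with websockets.connect("ws://localhost:9001") as ws:
        metadata = json.dumps({"sampleRate": 48000}).encode("utf-8")
        silence = b"\x00\x00" * 1024  # 1024 16-bit samples of silence
        await ws.send(struct.pack("<i", len(metadata)) + metadata + silence)

asyncio.run(send_test_chunk())
```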

+ 4 - 0
example_browserclient/start_server.bat

@@ -0,0 +1,4 @@
+@echo off
+cd /d %~dp0
+python server.py
+cmd

+ 1 - 2
install_with_gpu_support.bat

@@ -1,3 +1,2 @@
-pip uninstall torch
-pip install torch==2.0.1+cu118 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
+pip install torch==2.1.1+cu118 torchaudio==2.1.1+cu118 --index-url https://download.pytorch.org/whl/cu118
 pip install -r requirements-gpu.txt
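After running the script, a quick sanity check (a sketch, not part of this commit) confirms the CUDA build of torch is active:

```python
import torch
print(torch.__version__)          # expect a +cu118 build, e.g. 2.1.1+cu118
print(torch.cuda.is_available())  # True if the GPU setup succeeded
```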

+ 4 - 4
requirements-gpu.txt

@@ -1,5 +1,5 @@
-PyAudio>=0.2.13
-faster-whisper>=0.7.1
+PyAudio==0.2.14
+faster-whisper==0.10.0
 pvporcupine==1.9.5
-webrtcvad>=2.0.10
-halo>=0.0.31
+webrtcvad==2.0.10
+halo==0.0.31

+ 6 - 6
requirements.txt

@@ -1,7 +1,7 @@
-PyAudio>=0.2.13
-faster-whisper>=0.7.1
+PyAudio==0.2.14
+faster-whisper==0.10.0
 pvporcupine==1.9.5
-webrtcvad>=2.0.10
-halo>=0.0.31
-torch>=2.0.1
-torchaudio>=2.0.2
+webrtcvad==2.0.10
+halo==0.0.31
+torch==2.1.1
+torchaudio==2.1.1

+ 63 - 0
tests/realtimestt_chinese.py

@@ -0,0 +1,63 @@
+from RealtimeSTT import AudioToTextRecorder
+from colorama import Fore, Back, Style
+import colorama
+import os
+
+if __name__ == '__main__':
+
+    print("Initializing RealtimeSTT test...")
+
+    colorama.init()
+
+    full_sentences = []
+    displayed_text = ""
+
+    def clear_console():
+        os.system('clear' if os.name == 'posix' else 'cls')
+
+    def text_detected(text):
+        try:
+
+            global displayed_text
+            sentences_with_style = [
+                f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
+                for i, sentence in enumerate(full_sentences)
+            ]
+            new_text = "".join(sentences_with_style).strip() + " " + text if len(sentences_with_style) > 0 else text
+
+            if new_text != displayed_text:
+                displayed_text = new_text
+                clear_console()
+                print(displayed_text, end="", flush=True)
+                
+        except Exception as e:
+            print(e)
+
+    def process_text(text):
+        full_sentences.append(text)
+        text_detected("")
+
+    recorder_config = {
+        'spinner': False,
+        'model': 'large-v2',
+        'language': 'zh',
+        'silero_sensitivity': 0.4,
+        'webrtc_sensitivity': 2,
+        'post_speech_silence_duration': 0.2,
+        'min_length_of_recording': 0,
+        'min_gap_between_recordings': 0,        
+        # 'enable_realtime_transcription': True,
+        # 'realtime_processing_pause': 0.2,
+        # 'realtime_model_type': 'tiny',
+        # 'on_realtime_transcription_update': text_detected, 
+        #'on_realtime_transcription_stabilized': text_detected,
+    }
+
+    recorder = AudioToTextRecorder(**recorder_config)
+
+    clear_console()
+    print("Say something...", end="", flush=True)
+
+    while True:
+        text = recorder.text(process_text)
+        text_detected(text)

+ 17 - 2
tests/realtimestt_test.py

@@ -28,6 +28,21 @@ if __name__ == '__main__':
             clear_console()
             print(displayed_text, end="", flush=True)
 
+    # def text_detected(text):
+    #     global displayed_text
+    #     clear_console()
+    #     print(text)
+        # sentences_with_style = [
+        #     f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
+        #     for i, sentence in enumerate(full_sentences)
+        # ]
+        # new_text = "".join(sentences_with_style).strip() + " " + text if len(sentences_with_style) > 0 else text
+
+        # if new_text != displayed_text:
+        #     displayed_text = new_text
+        #     clear_console()
+        #     print(displayed_text, end="", flush=True)
+
     def process_text(text):
         full_sentences.append(text)
         text_detected("")
@@ -36,9 +51,9 @@ if __name__ == '__main__':
         'spinner': False,
         'model': 'large-v2',
         'language': 'en',
-        'silero_sensitivity': 0.3,
+        'silero_sensitivity': 0.4,
         'webrtc_sensitivity': 2,
-        'post_speech_silence_duration': 0.5,
+        'post_speech_silence_duration': 0.4,
         'min_length_of_recording': 0,
         'min_gap_between_recordings': 0,
         'enable_realtime_transcription': True,

Some files were not shown because too many files changed in this diff