Browse source code

whisper v3, browser support

Kolja Beigel 1 year ago
parent
commit
f870d77874

+ 16 - 1
README.md

@@ -16,6 +16,11 @@ https://github.com/KoljaB/RealtimeSTT/assets/7604638/207cb9a2-4482-48e7-9d2b-072
 
 ### Updates
 
+#### v0.1.8
+- large-v3 whisper model now supported (upgraded to faster_whisper 0.10.0)
+- added feed_audio() method and use_microphone parameter for feeding external audio chunks
+- added an example for recording and transcription from the browser
+
 #### v0.1.7
 - Bugfix for Mac OS Installation (multiprocessing / queue.size())
 - KeyboardInterrupt handling (now abortable with CTRL+C)
@@ -75,7 +80,7 @@ To use RealtimeSTT with GPU support via CUDA please follow these steps:
 
 1. **Install NVIDIA CUDA Toolkit 11.8**:
     - Visit [NVIDIA CUDA Toolkit Archive](https://developer.nvidia.com/cuda-11-8-0-download-archive).
-    - Select version 11.
+    - Select operating system and version.
     - Download and install the software.
 
 2. **Install NVIDIA cuDNN 8.7.0 for CUDA 11.x**:
@@ -179,6 +184,14 @@ recorder = AudioToTextRecorder(on_recording_start=my_start_callback,
                                on_recording_stop=my_stop_callback)
 ```
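The callbacks referenced above are plain no-argument callables; a minimal sketch (the bodies are illustrative):

```python
# Sketch of the callbacks passed to AudioToTextRecorder above;
# what they do is up to the caller.
def my_start_callback():
    print("Recording started")

def my_stop_callback():
    print("Recording stopped")
```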
 
+### Feed chunks
+
+If you don't want to use the local microphone, set the `use_microphone` parameter to `False` and provide raw PCM audio chunks in 16-bit mono with this method:
+
+```python
+recorder.feed_audio(audio_chunk)
+```
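For instance, a minimal sketch of feeding chunks from a file instead of the microphone (assuming a 16-bit mono WAV file; the file name and chunk size are illustrative):

```python
# Editorial sketch, not part of this commit: feed 16-bit mono PCM
# read from a WAV file instead of capturing from the microphone.
import wave
from RealtimeSTT import AudioToTextRecorder

with AudioToTextRecorder(use_microphone=False, spinner=False) as recorder:
    with wave.open("speech_16khz_mono.wav", "rb") as wav_file:
        while True:
            audio_chunk = wav_file.readframes(1024)  # raw PCM bytes
            if not audio_chunk:
                break
            recorder.feed_audio(audio_chunk)
    print(recorder.text())  # blocks until a full sentence is transcribed
```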
+
 ### Shutdown
 
 You can shutdown the recorder safely by using the context manager protocol:
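A minimal sketch of what that looks like:

```python
from RealtimeSTT import AudioToTextRecorder

# The context manager protocol shuts the recorder down cleanly on exit.
with AudioToTextRecorder() as recorder:
    print(recorder.text())
```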
@@ -252,6 +265,8 @@ When you initialize the `AudioToTextRecorder` class, you have various options to
 
 - **ensure_sentence_ends_with_period** (bool, default=True): Ensures that every sentence that doesn't end with punctuation such as "?" or "!" ends with a period.
 
+- **use_microphone** (bool, default=True): Whether to use the local microphone for transcription. Set to False if you want to provide audio chunks with the feed_audio method.
+
 - **spinner** (bool, default=True): Provides a spinner animation text with information about the current recorder state.
 
 - **level** (int, default=logging.WARNING): Logging level.
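Taken together, a hedged sketch of initializing the recorder with the options documented above (the values are illustrative):

```python
# Editorial sketch: combining the initialization options documented above.
import logging
from RealtimeSTT import AudioToTextRecorder

recorder = AudioToTextRecorder(
    ensure_sentence_ends_with_period=True,
    use_microphone=False,   # feed chunks via feed_audio() instead
    spinner=False,          # no spinner animation
    level=logging.DEBUG,    # more verbose than the WARNING default
)
```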

File diff suppressed because it is too large
+ 456 - 203
RealtimeSTT/audio_recorder.py


+ 49 - 0
example_browserclient/client.js

@@ -0,0 +1,49 @@
+let socket = new WebSocket("ws://localhost:9001");
+
+socket.onmessage = function(event) {
+    let data = JSON.parse(event.data);
+    if (data.type === 'realtime') {
+        document.getElementById('realtimeText').value = data.text;
+    } else if (data.type === 'fullSentence') {
+        document.getElementById('fullSentenceText').value = data.text;
+    }
+};
+
+// Request access to the microphone
+navigator.mediaDevices.getUserMedia({ audio: true })
+.then(stream => {
+    let audioContext = new AudioContext();
+    let source = audioContext.createMediaStreamSource(stream);
+    let processor = audioContext.createScriptProcessor(256, 1, 1);
+
+    source.connect(processor);
+    processor.connect(audioContext.destination);
+
+    processor.onaudioprocess = function(e) {
+        let inputData = e.inputBuffer.getChannelData(0);
+        let outputData = new Int16Array(inputData.length);
+
+        // Convert to 16-bit PCM
+        for (let i = 0; i < inputData.length; i++) {
+            outputData[i] = Math.max(-32768, Math.min(32767, inputData[i] * 32768));
+        }
+
+        // Send the 16-bit PCM data to the server
+
+        if (socket.readyState === WebSocket.OPEN) {
+            // Create a JSON string with metadata
+            let metadata = JSON.stringify({ sampleRate: audioContext.sampleRate });
+            // Convert metadata to a byte array
+            let metadataBytes = new TextEncoder().encode(metadata);
+            // Create a buffer for metadata length (4 bytes for 32-bit integer)
+            let metadataLength = new ArrayBuffer(4);
+            let metadataLengthView = new DataView(metadataLength);
+            // Set the length of the metadata in the first 4 bytes
+            metadataLengthView.setInt32(0, metadataBytes.byteLength, true); // true for little-endian
+            // Combine metadata length, metadata, and audio data into a single message
+            let combinedData = new Blob([metadataLength, metadataBytes, outputData.buffer]);
+            socket.send(combinedData);
+        }
+    };
+})
+.catch(e => console.error(e));
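Each WebSocket message the client sends is therefore framed as: a 4-byte little-endian length prefix, the UTF-8 JSON metadata (currently just the sample rate), and finally the raw 16-bit PCM samples. server.py below parses exactly this layout.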

+ 18 - 0
example_browserclient/index.html

@@ -0,0 +1,18 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Audio Streamer</title>
+    <script src="https://cdn.socket.io/4.0.0/socket.io.min.js"></script>
+</head>
+<body>
+    <div>
+        <p>Realtime Text:</p>
+        <textarea id="realtimeText" rows="5" cols="50"></textarea>
+    </div>
+    <div>
+        <p>Full Sentence Text (higher transcription quality):</p>
+        <textarea id="fullSentenceText" rows="5" cols="50"></textarea>
+    </div>
+    <script src="client.js"></script>
+</body>
+</html>

+ 109 - 0
example_browserclient/server.py

@@ -0,0 +1,109 @@
+if __name__ == '__main__':
+    print("Starting server, please wait...")
+
+    from RealtimeSTT import AudioToTextRecorder
+    import asyncio
+    import websockets
+    import threading
+    import numpy as np
+    from scipy.signal import resample
+    import json
+
+    recorder = None
+    recorder_ready = threading.Event()
+    client_websocket = None
+
+    async def send_to_client(message):
+        if client_websocket:
+            await client_websocket.send(message)
+
+    def text_detected(text):
+        asyncio.new_event_loop().run_until_complete(
+            send_to_client(
+                json.dumps({
+                    'type': 'realtime',
+                    'text': text
+                })
+            )
+        )
+        print(f"{text}", flush=True, end='')
+
+    recorder_config = {
+        'spinner': False,
+        'use_microphone': False,
+        'model': 'large-v3',
+        'language': 'en',
+        'silero_sensitivity': 0.4,
+        'webrtc_sensitivity': 2,
+        'post_speech_silence_duration': 1.0,
+        'min_length_of_recording': 0,
+        'min_gap_between_recordings': 0,
+        'enable_realtime_transcription': True,
+        'realtime_processing_pause': 0,
+        'realtime_model_type': 'tiny.en',
+        'on_realtime_transcription_stabilized': text_detected,
+    }
+
+    def recorder_thread():
+        global recorder
+        print("Initializing RealtimeSTT...")
+        recorder = AudioToTextRecorder(**recorder_config)
+        print("RealtimeSTT initialized")
+        recorder_ready.set()
+        while True:
+            full_sentence = recorder.text()
+            asyncio.new_event_loop().run_until_complete(
+                send_to_client(
+                    json.dumps({
+                        'type': 'fullSentence',
+                        'text': full_sentence
+                    })
+                )
+            )
+            print(f"{full_sentence}")
+
+    def decode_and_resample(
+            audio_data,
+            original_sample_rate,
+            target_sample_rate):
+
+        # Decode 16-bit PCM data to numpy array
+        audio_np = np.frombuffer(audio_data, dtype=np.int16)
+
+        # Calculate the number of samples after resampling
+        num_original_samples = len(audio_np)
+        num_target_samples = int(num_original_samples * target_sample_rate /
+                                 original_sample_rate)
+
+        # Resample the audio
+        resampled_audio = resample(audio_np, num_target_samples)
+
+        return resampled_audio.astype(np.int16).tobytes()
+
+    async def echo(websocket, path):
+        print("Client connected")
+        global client_websocket
+        client_websocket = websocket
+        async for message in websocket:
+
+            if not recorder_ready.is_set():
+                print("Recorder not ready")
+                continue
+
+            metadata_length = int.from_bytes(message[:4], byteorder='little')
+            metadata_json = message[4:4+metadata_length].decode('utf-8')
+            metadata = json.loads(metadata_json)
+            sample_rate = metadata['sampleRate']
+            chunk = message[4+metadata_length:]
+            resampled_chunk = decode_and_resample(chunk, sample_rate, 16000)
+            recorder.feed_audio(resampled_chunk)
+
+    start_server = websockets.serve(echo, "localhost", 9001)
+
+    recorder_thread = threading.Thread(target=recorder_thread)
+    recorder_thread.start()
+    recorder_ready.wait()
+
+    print("Server started. Press Ctrl+C to stop the server.")
+    asyncio.get_event_loop().run_until_complete(start_server)
+    asyncio.get_event_loop().run_forever()
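To exercise the server without opening the browser page, one could send a framed chunk from Python — a hedged sketch using the same websockets package and the client's wire format (the silence payload and the 48 kHz rate are illustrative):

```python
# Editorial sketch: send one message in the browser client's wire format
# (4-byte little-endian metadata length + JSON metadata + 16-bit PCM).
import asyncio
import json
import struct
import websockets

async def send_test_chunk():
    async with websockets.connect("ws://localhost:9001") as ws:
        metadata = json.dumps({"sampleRate": 48000}).encode("utf-8")
        silence = b"\x00\x00" * 1024  # 1024 16-bit samples of silence
        await ws.send(struct.pack("<i", len(metadata)) + metadata + silence)

asyncio.run(send_test_chunk())
```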

+ 4 - 0
example_browserclient/start_server.bat

@@ -0,0 +1,4 @@
+@echo off
+cd /d %~dp0
+python server.py
+cmd

+ 1 - 2
install_with_gpu_support.bat

@@ -1,3 +1,2 @@
-pip uninstall torch
-pip install torch==2.0.1+cu118 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
+pip install torch==2.1.1+cu118 torchaudio==2.1.1+cu118 --index-url https://download.pytorch.org/whl/cu118
 pip install -r requirements-gpu.txt
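After running the script, a quick sanity check (a sketch, not part of this commit) confirms the CUDA build of torch is active:

```python
import torch
print(torch.__version__)          # expect a +cu118 build, e.g. 2.1.1+cu118
print(torch.cuda.is_available())  # True if the GPU setup succeeded
```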

+ 4 - 4
requirements-gpu.txt

@@ -1,5 +1,5 @@
-PyAudio>=0.2.13
-faster-whisper>=0.7.1
+PyAudio==0.2.14
+faster-whisper==0.10.0
 pvporcupine==1.9.5
-webrtcvad>=2.0.10
-halo>=0.0.31
+webrtcvad==2.0.10
+halo==0.0.31

+ 6 - 6
requirements.txt

@@ -1,7 +1,7 @@
-PyAudio>=0.2.13
-faster-whisper>=0.7.1
+PyAudio==0.2.14
+faster-whisper==0.10.0
 pvporcupine==1.9.5
-webrtcvad>=2.0.10
-halo>=0.0.31
-torch>=2.0.1
-torchaudio>=2.0.2
+webrtcvad==2.0.10
+halo==0.0.31
+torch==2.1.1
+torchaudio==2.1.1

+ 63 - 0
tests/realtimestt_chinese.py

@@ -0,0 +1,63 @@
+from RealtimeSTT import AudioToTextRecorder
+from colorama import Fore, Back, Style
+import colorama
+import os
+
+if __name__ == '__main__':
+
+    print("Initializing RealtimeSTT test...")
+
+    colorama.init()
+
+    full_sentences = []
+    displayed_text = ""
+
+    def clear_console():
+        os.system('clear' if os.name == 'posix' else 'cls')
+
+    def text_detected(text):
+        try:
+
+            global displayed_text
+            sentences_with_style = [
+                f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
+                for i, sentence in enumerate(full_sentences)
+            ]
+            new_text = "".join(sentences_with_style).strip() + " " + text if len(sentences_with_style) > 0 else text
+
+            if new_text != displayed_text:
+                displayed_text = new_text
+                clear_console()
+                print(displayed_text, end="", flush=True)
+                
+        except Exception as e:
+            print(e)
+
+    def process_text(text):
+        full_sentences.append(text)
+        text_detected("")
+
+    recorder_config = {
+        'spinner': False,
+        'model': 'large-v2',
+        'language': 'zh',
+        'silero_sensitivity': 0.4,
+        'webrtc_sensitivity': 2,
+        'post_speech_silence_duration': 0.2,
+        'min_length_of_recording': 0,
+        'min_gap_between_recordings': 0,        
+        # 'enable_realtime_transcription': True,
+        # 'realtime_processing_pause': 0.2,
+        # 'realtime_model_type': 'tiny',
+        # 'on_realtime_transcription_update': text_detected, 
+        #'on_realtime_transcription_stabilized': text_detected,
+    }
+
+    recorder = AudioToTextRecorder(**recorder_config)
+
+    clear_console()
+    print("Say something...", end="", flush=True)
+
+    while True:
+        text = recorder.text(process_text)
+        text_detected(text)

+ 17 - 2
tests/realtimestt_test.py

@@ -28,6 +28,21 @@ if __name__ == '__main__':
             clear_console()
             print(displayed_text, end="", flush=True)
 
+    # def text_detected(text):
+    #     global displayed_text
+    #     clear_console()
+    #     print(text)
+        # sentences_with_style = [
+        #     f"{Fore.YELLOW + sentence + Style.RESET_ALL if i % 2 == 0 else Fore.CYAN + sentence + Style.RESET_ALL} "
+        #     for i, sentence in enumerate(full_sentences)
+        # ]
+        # new_text = "".join(sentences_with_style).strip() + " " + text if len(sentences_with_style) > 0 else text
+
+        # if new_text != displayed_text:
+        #     displayed_text = new_text
+        #     clear_console()
+        #     print(displayed_text, end="", flush=True)
+
     def process_text(text):
         full_sentences.append(text)
         text_detected("")
@@ -36,9 +51,9 @@ if __name__ == '__main__':
         'spinner': False,
         'model': 'large-v2',
         'language': 'en',
-        'silero_sensitivity': 0.3,
+        'silero_sensitivity': 0.4,
         'webrtc_sensitivity': 2,
-        'post_speech_silence_duration': 0.5,
+        'post_speech_silence_duration': 0.4,
         'min_length_of_recording': 0,
         'min_gap_between_recordings': 0,
         'enable_realtime_transcription': True,

Some files were not shown because too many files changed in this diff