KoljaB пре 7 месеци
родитељ
комит
fa580d89d9
2 измењених фајлова са 145 додато и 46 уклоњено
  1. 23 6
      RealtimeSTT/audio_recorder.py
  2. 122 40
      server/stt_cli_client.py

+ 23 - 6
RealtimeSTT/audio_recorder.py

@@ -57,6 +57,8 @@ import os
 import re
 import gc
 
+print(f"### whaaaat #######")
+
 # Set OpenMP runtime duplicate library handling to OK (Use only for development!)
 os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
 
@@ -121,8 +123,10 @@ class TranscriptionWorker:
                 time.sleep(TIME_SLEEP)
 
     def run(self):
-        system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)
-        __builtins__['print'] = self.custom_print
+        if __name__ == "__main__":
+             system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)
+
+        # __builtins__['print'] = self.custom_print
 
         logging.info(f"Initializing faster_whisper main transcription model {self.model_path}")
 
@@ -171,7 +175,7 @@ class TranscriptionWorker:
                 except Exception as e:
                     logging.error(f"General error in processing queue item: {e}")
         finally:
-            __builtins__['print'] = print  # Restore the original print function
+            # __builtins__['print'] = print  # Restore the original print function
             self.conn.close()
             self.stdout_pipe.close()
             self.shutdown_event.set()  # Ensure the polling thread will stop
@@ -891,10 +895,13 @@ class AudioToTextRecorder:
         Raises:
             Exception: If there is an error while initializing the audio recording.
         """
+        # logging.error("### TEST")
         import pyaudio
         import numpy as np
         from scipy import signal
-        system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)
+        
+        if __name__ == '__main__':
+            system_signal.signal(system_signal.SIGINT, system_signal.SIG_IGN)
 
         def get_highest_sample_rate(audio_interface, device_index):
             """Get the highest supported sample rate for the specified device."""
@@ -962,6 +969,8 @@ class AudioToTextRecorder:
         def setup_audio():  
             nonlocal audio_interface, stream, device_sample_rate, input_device_index
             try:
+                print(f"### start #######")
+
                 audio_interface = pyaudio.PyAudio()
                 if input_device_index is None:
                     try:
@@ -978,19 +987,27 @@ class AudioToTextRecorder:
                 else:
                     sample_rates_to_try.append(48000)  # Fallback sample rate
 
+                for rate in sample_rates_to_try:
+                    logging.error(f"rates to try: {rate}")
+                    # print(f"rates to try: {rate}")
+
                 for rate in sample_rates_to_try:
                     try:
+                        logging.error(f"trying rate: {rate}")
                         device_sample_rate = rate
                         stream = initialize_audio_stream(audio_interface, input_device_index, device_sample_rate, chunk_size)
                         if stream is not None:
                             logging.debug(f"Audio recording initialized successfully at {device_sample_rate} Hz, reading {chunk_size} frames at a time")
+                            logging.error(f"Audio recording initialized successfully at {device_sample_rate} Hz, reading {chunk_size} frames at a time")
                             return True
                     except Exception as e:
-                        logging.warning(f"Failed to initialize audio stream at {device_sample_rate} Hz: {e}")
+                        logging.warning(f"Failed to initialize audio23 stream at {device_sample_rate} Hz: {e}")
                         continue
+                    
+                    
 
                 # If we reach here, none of the sample rates worked
-                raise Exception("Failed to initialize audio stream with all sample rates.")
+                raise Exception("Failed to initialize audio stream12 with all sample rates.")
 
             except Exception as e:
                 logging.exception(f"Error initializing pyaudio audio recording: {e}")

+ 122 - 40
server/stt_cli_client.py

@@ -1,3 +1,112 @@
+import os
+import sys
+import pyaudio
+import numpy as np
+from scipy import signal
+import logging
+os.environ['ALSA_LOG_LEVEL'] = 'none'  # presumably quiets ALSA diagnostics — TODO confirm
+
+CHUNK = 1024  # frames per buffer; also the initial value of chunk_size
+FORMAT = pyaudio.paInt16  # 16-bit integer sample format
+CHANNELS = 1  # channel count passed to audio_interface.open()
+RATE = 44100  # Default fallback rate
+input_device_index = None  # None makes setup_audio() look up the default input device
+audio_interface = None  # pyaudio.PyAudio instance, created by setup_audio()
+stream = None  # input stream opened by setup_audio()
+device_sample_rate = None  # rate in Hz the stream was opened with
+chunk_size = CHUNK  # frames_per_buffer used when opening the stream
+
+def get_highest_sample_rate(audio_interface, device_index):
+    """Return the highest sample rate (int) reported for the device.
+
+    Prefers the maximum of 'supportedSampleRates' when that entry exists and
+    is non-empty, else 'defaultSampleRate'; any error is logged and gives 48000.
+    """
+    try:
+        info = audio_interface.get_device_info_by_index(device_index)
+        default_rate = int(info['defaultSampleRate'])
+        listed = info['supportedSampleRates'] if 'supportedSampleRates' in info else ()
+        rates = [int(r) for r in listed]
+        return max(rates) if rates else default_rate
+    except Exception as e:
+        logging.warning(f"Failed to get highest sample rate: {e}")
+        return 48000  # Fallback to a common high sample rate
+
+def initialize_audio_stream(audio_interface, device_index, sample_rate, chunk_size):
+    """Open a 16-bit input stream on the given device.
+
+    Failures are logged at error level and then re-raised unchanged.
+    """
+    try:
+        open_args = dict(
+            format=pyaudio.paInt16, channels=CHANNELS, rate=sample_rate,
+            input=True, frames_per_buffer=chunk_size,
+            input_device_index=device_index,
+        )
+        return audio_interface.open(**open_args)
+    except Exception as e:
+        logging.error(f"Error initializing audio stream: {e}")
+        raise
+
+def preprocess_audio(chunk, original_sample_rate, target_sample_rate):
+    """Convert an audio chunk to 16-bit PCM bytes at the target sample rate.
+
+    chunk is a NumPy array (2-D input is averaged over axis 1) or a buffer
+    read as int16 samples. Values are clipped to the int16 range before the
+    cast so resampling overshoot saturates instead of wrapping around.
+    """
+    from_array = isinstance(chunk, np.ndarray)
+    if not from_array:
+        chunk = np.frombuffer(chunk, dtype=np.int16)
+    elif chunk.ndim == 2:  # Stereo to mono conversion
+        chunk = np.mean(chunk, axis=1)
+
+    resampled = original_sample_rate != target_sample_rate
+    if resampled:
+        num_samples = int(len(chunk) * target_sample_rate / original_sample_rate)
+        chunk = signal.resample(chunk, num_samples)
+    if from_array or resampled:  # untouched buffer input is already int16
+        as_float = np.asarray(chunk, dtype=np.float64)
+        chunk = np.clip(as_float, -32768, 32767).astype(np.int16)
+    return chunk.tobytes()
+
+def setup_audio():
+    """Open the module-level input stream, trying 16000 Hz first.
+
+    Fills the globals audio_interface, stream, device_sample_rate and
+    input_device_index. Returns True once a stream opens; on any failure
+    the error is logged, PyAudio is terminated and False is returned.
+    """
+    global audio_interface, stream, device_sample_rate, input_device_index
+    try:
+        audio_interface = pyaudio.PyAudio()
+        if input_device_index is None:
+            try:
+                input_device_index = audio_interface.get_default_input_device_info()['index']
+            except OSError:
+                input_device_index = None  # lookup failed; leave the device unset
+
+        # 16000 Hz is always tried first; the second rate is only a fallback.
+        fallback_rate = 48000 if input_device_index is None else get_highest_sample_rate(audio_interface, input_device_index)
+        sample_rates_to_try = [16000] if fallback_rate == 16000 else [16000, fallback_rate]
+
+        for device_sample_rate in sample_rates_to_try:
+            try:
+                stream = initialize_audio_stream(audio_interface, input_device_index, device_sample_rate, chunk_size)
+                if stream is not None:
+                    logging.debug(f"Audio recording initialized successfully at {device_sample_rate} Hz, reading {chunk_size} frames at a time")
+                    return True
+            except Exception as e:
+                logging.warning(f"Failed to initialize audio stream at {device_sample_rate} Hz: {e}")
+                continue
+
+        raise Exception("Failed to initialize audio stream with all sample rates.")
+    except Exception as e:
+        logging.exception(f"Error initializing audio recording: {e}")
+        if audio_interface:
+            audio_interface.terminate()
+        return False
+
 from .install_packages import check_and_install_packages
 
 check_and_install_packages([
@@ -26,8 +135,6 @@ import json
 import threading
 import time
 import struct
-import os
-import sys
 import socket
 import subprocess
 import shutil
@@ -38,7 +145,7 @@ from queue import Queue
 CHUNK = 1024
 FORMAT = pyaudio.paInt16
 CHANNELS = 1
-RATE = 16000
+RATE = 44100
 DEFAULT_SERVER_URL = "ws://localhost:8011"
 
 class STTWebSocketClient:
@@ -89,7 +196,6 @@ class STTWebSocketClient:
             self.debug_print(f"Error while connecting to the server: {e}")
             return False
 
-
     def on_open(self, ws):
         self.debug_print("WebSocket connection opened.")
         self.is_running = True
@@ -159,26 +265,7 @@ class STTWebSocketClient:
                     self.file_output.flush()  # Ensure it's written immediately
                 else:
                     self.finish_progress_bar()
-                    print(f"{data['text']}")                    
-                    # self.update_progress_bar("") 
-                    # print(f"\r\033[K{data['text']}")
-                    # #print(f"\r\033[KHello")
-                    # self.stop() 
-                    # print("what the fuck")
-                    # print("what the fuck")
-                    # print(f"what the fuck self.file_output {self.file_output}")
-                    # self.update_progress_bar("FGINAAL") 
-                    # self.stop()
-                    # sys.stderr.write(f"\n{data['text']}")
-                    # sys.stderr.write(f"\n{data['text']}")
-                    # sys.stderr.write(f"\nTEEEST")
-                    # sys.stderr.write(f"\nTEEEST")
-                    # sys.stderr.flush()
-                    # print("what the fuck")
-                    # print("what the fuck")
-                    # print("what the fuck")
-                    # print("what the fuck")
-                    # print("what the fuck")
+                    print(f"{data['text']}")        
                 self.stop()
                 
         except json.JSONDecodeError:
@@ -225,47 +312,42 @@ class STTWebSocketClient:
         self.is_running = False
         if self.ws:
             self.ws.close()
-        if hasattr(self, 'ws_thread'):
-            self.ws_thread.join(timeout=2)
+        #if hasattr(self, 'ws_thread'):
+        #    self.ws_thread.join(timeout=2)
 
     def start_recording(self):
-        self.show_initial_indicator()
         threading.Thread(target=self.record_and_send_audio).start()
 
     def record_and_send_audio(self):
-        p = pyaudio.PyAudio()
-        stream = p.open(format=FORMAT,
-                        input_device_index=1,
-                        channels=CHANNELS,
-                        rate=RATE,
-                        input=True,
-                        frames_per_buffer=CHUNK)
+        if not setup_audio():
+            raise Exception("Failed to set up audio recording.")
 
         self.debug_print("Recording and sending audio...")
+        self.show_initial_indicator()
 
         while self.is_running:
             try:
                 audio_data = stream.read(CHUNK)
-                
+
                 # Prepare metadata
                 metadata = {
-                    "sampleRate": RATE
+                    "sampleRate": device_sample_rate
                 }
                 metadata_json = json.dumps(metadata)
                 metadata_length = len(metadata_json)
-                
+
                 # Construct the message
                 message = struct.pack('<I', metadata_length) + metadata_json.encode('utf-8') + audio_data
-                
+
                 self.ws.send(message, opcode=websocket.ABNF.OPCODE_BINARY)
             except Exception as e:
-                self.debug_print(f"\nError sending audio data: {e}")
+                self.debug_print(f"Error sending audio data: {e}")
                 break
 
         self.debug_print("Stopped recording.")
         stream.stop_stream()
         stream.close()
-        p.terminate()
+        audio_interface.terminate()
 
 
 def main():