Procházet zdrojové kódy

realtime transcript from stereomix

KoljaB před 7 měsíci
rodič
revize
af1547656c
Změněny 2 soubory: 66 přidání a 24 odebrání
  1. 65 23
      tests/realtimestt_test.py
  2. 1 1
      tests/realtimestt_test_stereomix.py

+ 65 - 23
tests/realtimestt_test.py

@@ -6,43 +6,76 @@ if __name__ == '__main__':
         import logging
         logging.basicConfig(level=logging.DEBUG)
 
+    from rich.console import Console
+    from rich.live import Live
+    from rich.text import Text
+    from rich.panel import Panel
+    from rich.spinner import Spinner
+    from rich.progress import Progress, SpinnerColumn, TextColumn
+    console = Console()
+    # console.print("[bold yellow]System initializing, please wait...[/bold yellow]")
+    console.print("System initializing, please wait")
+
+    
+    # # Initial display message
+    # with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), transient=True) as progress:
+    #     task = progress.add_task("[cyan]Setting up transcription...", total=None)
+    #     console.print("[bold yellow]System initializing, please wait...[/bold yellow]")
+    #     progress.update(task, description="[green]Initialization complete!")
+
     import os
     import sys
     from RealtimeSTT import AudioToTextRecorder
     from colorama import Fore, Style
     import colorama
-    from rich.live import Live
-    from rich.console import Console
-    from rich.text import Text
 
     if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
         from torchaudio._extension.utils import _init_dll_path
         _init_dll_path()    
 
-    print("Initializing RealtimeSTT test...")
-
     colorama.init()
 
     # Initialize Rich Console and Live
-    console = Console()
     live = Live(console=console, refresh_per_second=10, screen=False)
     live.start()
 
     full_sentences = []
-    displayed_text = ""
-    prev_text = ""
     rich_text_stored = ""
     recorder = None
+    displayed_text = ""  # Used for tracking text that was already displayed
 
-    end_of_sentence_detection_pause = 0.4
+    end_of_sentence_detection_pause = 0.45
     unknown_sentence_detection_pause = 0.7
     mid_sentence_detection_pause = 2.0
 
     def clear_console():
         os.system('clear' if os.name == 'posix' else 'cls')
 
+    prev_text = ""
+
+    def preprocess_text(text):
+        # Remove leading whitespaces
+        text = text.lstrip()
+
+        #  Remove starting ellipses if present
+        if text.startswith("..."):
+            text = text[3:]
+
+        # Remove any leading whitespaces again after ellipses removal
+        text = text.lstrip()
+
+        # Uppercase the first letter
+        if text:
+            text = text[0].upper() + text[1:]
+        
+        return text
+
+
     def text_detected(text):
-        global displayed_text, prev_text, full_sentences, recorder, rich_text_stored
+        global prev_text, displayed_text, rich_text_stored
+
+        text = preprocess_text(text)
+
         sentence_end_marks = ['.', '!', '?', '。'] 
         if text.endswith("..."):
             recorder.post_speech_silence_duration = mid_sentence_detection_pause
@@ -57,24 +90,31 @@ if __name__ == '__main__':
         rich_text = Text()
         for i, sentence in enumerate(full_sentences):
             if i % 2 == 0:
+                #rich_text += Text(sentence, style="bold yellow") + Text(" ")
                 rich_text += Text(sentence, style="yellow") + Text(" ")
             else:
                 rich_text += Text(sentence, style="cyan") + Text(" ")
         
         # If the current text is not a sentence-ending, display it in real-time
         if text:
-            rich_text += Text(text, style="white")
+            rich_text += Text(text, style="bold yellow")
 
         new_displayed_text = rich_text.plain
 
         if new_displayed_text != displayed_text:
             displayed_text = new_displayed_text
-            live.update(rich_text)
+            panel = Panel(rich_text, title="[bold green]Live Transcription[/bold green]", border_style="bold green")
+            live.update(panel)
             rich_text_stored = rich_text
 
     def process_text(text):
         global recorder, full_sentences, prev_text
         recorder.post_speech_silence_duration = unknown_sentence_detection_pause
+        text = preprocess_text(text)
+        text = text.rstrip()
+        if text.endswith("..."):
+            text = text[:-2]
+                
         full_sentences.append(text)
         prev_text = ""
         text_detected("")
@@ -83,32 +123,33 @@ if __name__ == '__main__':
     recorder_config = {
         'spinner': False,
         'model': 'large-v2',
-        # 'input_device_index': 1,
-        'realtime_model_type': 'tiny.en',
+        'input_device_index': 1,
+        'realtime_model_type': 'small.en',
+        #'realtime_model_type': 'small.en',
         'language': 'en',
         'silero_sensitivity': 0.05,
         'webrtc_sensitivity': 3,
         'post_speech_silence_duration': unknown_sentence_detection_pause,
-        'min_length_of_recording': 0.7,        
+        'min_length_of_recording': 1.1,        
         'min_gap_between_recordings': 0,                
         'enable_realtime_transcription': True,
-        'realtime_processing_pause': 0.1,
-        #'on_realtime_transcription_update': text_detected,
-        'on_realtime_transcription_stabilized': text_detected,
+        'realtime_processing_pause': 0.02,
+        'on_realtime_transcription_update': text_detected,
+        #'on_realtime_transcription_stabilized': text_detected,
         'silero_deactivity_detection': True,
         'early_transcription_on_silence': 0.2,
         'beam_size': 5,
-        'beam_size_realtime': 1,
+        'beam_size_realtime': 5,
         'no_log_file': True,
+        'initial_prompt': "Only add a period at the end of a sentence if you are 100 percent certain that the speaker has finished their statement. If you're unsure or the sentence seems incomplete, leave the sentence open or use ellipses to reflect continuation. For example: 'I went to the...' or 'I think it was...'"
     }
 
     if EXTENDED_LOGGING:
         recorder_config['level'] = logging.DEBUG
 
     recorder = AudioToTextRecorder(**recorder_config)
-
-    # Initial display message
-    initial_text = Text("Say something...", style="green")
+    
+    initial_text = Panel(Text("Say something...", style="cyan bold"), title="[bold yellow]Waiting for Input[/bold yellow]", border_style="bold yellow")
     live.update(initial_text)
 
     try:
@@ -116,4 +157,5 @@ if __name__ == '__main__':
             recorder.text(process_text)
     except KeyboardInterrupt:
         live.stop()
-        print("Exit due to keyboard interrupt.")
+        console.print("[bold red]Transcription stopped by user. Exiting...[/bold red]")
+        exit(0)

+ 1 - 1
tests/realtimestt_test_stereomix.py

@@ -110,7 +110,7 @@ def main():
         text = preprocess_text(text)
         text = text.rstrip()
         if text.endswith("..."):
-            text = text[:-3]  # Remove ellipsis
+            text = text[:-2]  # Drop two dots so a trailing "..." collapses to a single "."
 
         full_sentences.append(text)
         prev_text = ""