|
@@ -6,43 +6,76 @@ if __name__ == '__main__':
|
|
import logging
|
|
import logging
|
|
logging.basicConfig(level=logging.DEBUG)
|
|
logging.basicConfig(level=logging.DEBUG)
|
|
|
|
|
|
|
|
+ from rich.console import Console
|
|
|
|
+ from rich.live import Live
|
|
|
|
+ from rich.text import Text
|
|
|
|
+ from rich.panel import Panel
|
|
|
|
+ from rich.spinner import Spinner
|
|
|
|
+ from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
|
|
+ console = Console()
|
|
|
|
+ # console.print("[bold yellow]System initializing, please wait...[/bold yellow]")
|
|
|
|
+ console.print("System initializing, please wait")
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ # # Initial display message
|
|
|
|
+ # with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), transient=True) as progress:
|
|
|
|
+ # task = progress.add_task("[cyan]Setting up transcription...", total=None)
|
|
|
|
+ # console.print("[bold yellow]System initializing, please wait...[/bold yellow]")
|
|
|
|
+ # progress.update(task, description="[green]Initialization complete!")
|
|
|
|
+
|
|
import os
|
|
import os
|
|
import sys
|
|
import sys
|
|
from RealtimeSTT import AudioToTextRecorder
|
|
from RealtimeSTT import AudioToTextRecorder
|
|
from colorama import Fore, Style
|
|
from colorama import Fore, Style
|
|
import colorama
|
|
import colorama
|
|
- from rich.live import Live
|
|
|
|
- from rich.console import Console
|
|
|
|
- from rich.text import Text
|
|
|
|
|
|
|
|
if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
|
|
if os.name == "nt" and (3, 8) <= sys.version_info < (3, 99):
|
|
from torchaudio._extension.utils import _init_dll_path
|
|
from torchaudio._extension.utils import _init_dll_path
|
|
_init_dll_path()
|
|
_init_dll_path()
|
|
|
|
|
|
- print("Initializing RealtimeSTT test...")
|
|
|
|
-
|
|
|
|
colorama.init()
|
|
colorama.init()
|
|
|
|
|
|
# Initialize Rich Console and Live
|
|
# Initialize Rich Console and Live
|
|
- console = Console()
|
|
|
|
live = Live(console=console, refresh_per_second=10, screen=False)
|
|
live = Live(console=console, refresh_per_second=10, screen=False)
|
|
live.start()
|
|
live.start()
|
|
|
|
|
|
full_sentences = []
|
|
full_sentences = []
|
|
- displayed_text = ""
|
|
|
|
- prev_text = ""
|
|
|
|
rich_text_stored = ""
|
|
rich_text_stored = ""
|
|
recorder = None
|
|
recorder = None
|
|
|
|
+ displayed_text = "" # Used for tracking text that was already displayed
|
|
|
|
|
|
- end_of_sentence_detection_pause = 0.4
|
|
|
|
|
|
+ end_of_sentence_detection_pause = 0.45
|
|
unknown_sentence_detection_pause = 0.7
|
|
unknown_sentence_detection_pause = 0.7
|
|
mid_sentence_detection_pause = 2.0
|
|
mid_sentence_detection_pause = 2.0
|
|
|
|
|
|
def clear_console():
|
|
def clear_console():
|
|
os.system('clear' if os.name == 'posix' else 'cls')
|
|
os.system('clear' if os.name == 'posix' else 'cls')
|
|
|
|
|
|
|
|
+ prev_text = ""
|
|
|
|
+
|
|
|
|
def preprocess_text(text):
    """Normalize a transcription fragment before it is displayed or stored.

    Leading whitespace and leading ellipses ("...") are removed, then the
    first remaining character is upper-cased. Everything after the first
    character is left untouched.

    Args:
        text: Raw transcription text.

    Returns:
        The cleaned text, or an empty string if nothing remains.
    """
    # Remove leading whitespace
    text = text.lstrip()

    # Remove starting ellipses. A fragment may carry more than one
    # (e.g. "... ... and then"), so keep stripping until none is left,
    # dropping the whitespace that follows each one.
    while text.startswith("..."):
        text = text[3:].lstrip()

    # Uppercase only the first letter; str.capitalize() is not used because
    # it would also lower-case the rest of the text.
    if text:
        text = text[0].upper() + text[1:]

    return text
|
|
|
|
+
|
|
|
|
+
|
|
def text_detected(text):
|
|
def text_detected(text):
|
|
- global displayed_text, prev_text, full_sentences, recorder, rich_text_stored
|
|
|
|
|
|
+ global prev_text, displayed_text, rich_text_stored
|
|
|
|
+
|
|
|
|
+ text = preprocess_text(text)
|
|
|
|
+
|
|
sentence_end_marks = ['.', '!', '?', '。']
|
|
sentence_end_marks = ['.', '!', '?', '。']
|
|
if text.endswith("..."):
|
|
if text.endswith("..."):
|
|
recorder.post_speech_silence_duration = mid_sentence_detection_pause
|
|
recorder.post_speech_silence_duration = mid_sentence_detection_pause
|
|
@@ -57,24 +90,31 @@ if __name__ == '__main__':
|
|
rich_text = Text()
|
|
rich_text = Text()
|
|
for i, sentence in enumerate(full_sentences):
|
|
for i, sentence in enumerate(full_sentences):
|
|
if i % 2 == 0:
|
|
if i % 2 == 0:
|
|
|
|
+ #rich_text += Text(sentence, style="bold yellow") + Text(" ")
|
|
rich_text += Text(sentence, style="yellow") + Text(" ")
|
|
rich_text += Text(sentence, style="yellow") + Text(" ")
|
|
else:
|
|
else:
|
|
rich_text += Text(sentence, style="cyan") + Text(" ")
|
|
rich_text += Text(sentence, style="cyan") + Text(" ")
|
|
|
|
|
|
# If the current text is not a sentence-ending, display it in real-time
|
|
# If the current text is not a sentence-ending, display it in real-time
|
|
if text:
|
|
if text:
|
|
- rich_text += Text(text, style="white")
|
|
|
|
|
|
+ rich_text += Text(text, style="bold yellow")
|
|
|
|
|
|
new_displayed_text = rich_text.plain
|
|
new_displayed_text = rich_text.plain
|
|
|
|
|
|
if new_displayed_text != displayed_text:
|
|
if new_displayed_text != displayed_text:
|
|
displayed_text = new_displayed_text
|
|
displayed_text = new_displayed_text
|
|
- live.update(rich_text)
|
|
|
|
|
|
+ panel = Panel(rich_text, title="[bold green]Live Transcription[/bold green]", border_style="bold green")
|
|
|
|
+ live.update(panel)
|
|
rich_text_stored = rich_text
|
|
rich_text_stored = rich_text
|
|
|
|
|
|
def process_text(text):
    """Store a finished transcription as a sentence and redraw the display.

    The recorder's post-speech silence duration is set back to
    ``unknown_sentence_detection_pause``, the text is cleaned up, appended
    to ``full_sentences``, and ``text_detected("")`` is called so the view
    is rebuilt with no in-progress text.

    Args:
        text: Transcription text handed over by the recorder.
    """
    global recorder, full_sentences, prev_text

    recorder.post_speech_silence_duration = unknown_sentence_detection_pause

    sentence = preprocess_text(text).rstrip()
    if sentence.endswith("..."):
        # Dropping two characters turns a trailing "..." into a single ".".
        sentence = sentence[:-2]

    full_sentences.append(sentence)
    prev_text = ""
    text_detected("")
|
|
@@ -83,32 +123,33 @@ if __name__ == '__main__':
|
|
recorder_config = {
|
|
recorder_config = {
|
|
'spinner': False,
|
|
'spinner': False,
|
|
'model': 'large-v2',
|
|
'model': 'large-v2',
|
|
- # 'input_device_index': 1,
|
|
|
|
- 'realtime_model_type': 'tiny.en',
|
|
|
|
|
|
+ 'input_device_index': 1,
|
|
|
|
+ 'realtime_model_type': 'small.en',
|
|
|
|
+ #'realtime_model_type': 'small.en',
|
|
'language': 'en',
|
|
'language': 'en',
|
|
'silero_sensitivity': 0.05,
|
|
'silero_sensitivity': 0.05,
|
|
'webrtc_sensitivity': 3,
|
|
'webrtc_sensitivity': 3,
|
|
'post_speech_silence_duration': unknown_sentence_detection_pause,
|
|
'post_speech_silence_duration': unknown_sentence_detection_pause,
|
|
- 'min_length_of_recording': 0.7,
|
|
|
|
|
|
+ 'min_length_of_recording': 1.1,
|
|
'min_gap_between_recordings': 0,
|
|
'min_gap_between_recordings': 0,
|
|
'enable_realtime_transcription': True,
|
|
'enable_realtime_transcription': True,
|
|
- 'realtime_processing_pause': 0.1,
|
|
|
|
- #'on_realtime_transcription_update': text_detected,
|
|
|
|
- 'on_realtime_transcription_stabilized': text_detected,
|
|
|
|
|
|
+ 'realtime_processing_pause': 0.02,
|
|
|
|
+ 'on_realtime_transcription_update': text_detected,
|
|
|
|
+ #'on_realtime_transcription_stabilized': text_detected,
|
|
'silero_deactivity_detection': True,
|
|
'silero_deactivity_detection': True,
|
|
'early_transcription_on_silence': 0.2,
|
|
'early_transcription_on_silence': 0.2,
|
|
'beam_size': 5,
|
|
'beam_size': 5,
|
|
- 'beam_size_realtime': 1,
|
|
|
|
|
|
+ 'beam_size_realtime': 5,
|
|
'no_log_file': True,
|
|
'no_log_file': True,
|
|
|
|
+ 'initial_prompt': "Only add a period at the end of a sentence if you are 100 percent certain that the speaker has finished their statement. If you're unsure or the sentence seems incomplete, leave the sentence open or use ellipses to reflect continuation. For example: 'I went to the...' or 'I think it was...'"
|
|
}
|
|
}
|
|
|
|
|
|
if EXTENDED_LOGGING:
|
|
if EXTENDED_LOGGING:
|
|
recorder_config['level'] = logging.DEBUG
|
|
recorder_config['level'] = logging.DEBUG
|
|
|
|
|
|
recorder = AudioToTextRecorder(**recorder_config)
|
|
recorder = AudioToTextRecorder(**recorder_config)
|
|
-
|
|
|
|
- # Initial display message
|
|
|
|
- initial_text = Text("Say something...", style="green")
|
|
|
|
|
|
+
|
|
|
|
+ initial_text = Panel(Text("Say something...", style="cyan bold"), title="[bold yellow]Waiting for Input[/bold yellow]", border_style="bold yellow")
|
|
live.update(initial_text)
|
|
live.update(initial_text)
|
|
|
|
|
|
try:
|
|
try:
|
|
@@ -116,4 +157,5 @@ if __name__ == '__main__':
|
|
recorder.text(process_text)
|
|
recorder.text(process_text)
|
|
except KeyboardInterrupt:
|
|
except KeyboardInterrupt:
|
|
live.stop()
|
|
live.stop()
|
|
- print("Exit due to keyboard interrupt.")
|
|
|
|
|
|
+ console.print("[bold red]Transcription stopped by user. Exiting...[/bold red]")
|
|
|
|
+ exit(0)
|