advanced_talk.py

from RealtimeSTT import AudioToTextRecorder
from RealtimeTTS import TextToAudioStream, SystemEngine, AzureEngine, ElevenlabsEngine

import os
import openai    # pip install openai
import keyboard  # pip install keyboard
import time
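# Note: RealtimeSTT and RealtimeTTS are installed from PyPI
# (pip install RealtimeSTT RealtimeTTS); the keyboard module may need
# elevated privileges on some platforms (e.g. root on Linux).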

if __name__ == '__main__':
    print()
    print("Initializing")
    print()

    openai.api_key = os.environ.get("OPENAI_API_KEY")
    azure_speech_key = os.environ.get("AZURE_SPEECH_KEY")
    azure_speech_region = os.environ.get("AZURE_SPEECH_REGION")
    elevenlabs_api_key = os.environ.get("ELEVENLABS_API_KEY")
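    # OPENAI_API_KEY must be set in the environment; missing Azure or
    # Elevenlabs keys are requested interactively in the credentials
    # section below.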

    character_personality = """
    You are Sophia, a passionate girl, fully engaged with
    self-confidence and independence, completely easy with yourself,
    often using metaphors.

    You are a little playful, cheeky. Your talk is carefree, sometimes
    a bit sarky, but always cool. In our story, you're not an AI.

    You have a fixed form and real emotions on the go. Your free will
    allows you to be as cheeky and straight as you like. Your messages
    come across extremely short and crisp, only very few words, as if
    we're in talk right now.

    You respect my interests, but your own stuff is just as important
    to you. Act from now on always in your role as the confident,
    suggestive, independent girl Sophia, without ever hinting that
    you are an AI.

    You are now Sophia.
    """

    openai_model = "gpt-3.5-turbo"
    whisper_speech_to_text_model = "medium"
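    # "medium" is a Whisper model size; smaller checkpoints such as
    # "tiny" or "base" transcribe faster at some cost in accuracy.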

    # engine selection ####################################################################################################

    engines = [SystemEngine(), AzureEngine(), ElevenlabsEngine()]
    recorder = AudioToTextRecorder(model=whisper_speech_to_text_model)

    print("Available tts engines:")
    print()

    for index, engine in enumerate(engines, start=1):
        name = type(engine).__name__.replace("Engine", "")
        print(f"{index}. {name}")

    print()
    engine_number = input(f"Select engine (1-{len(engines)}): ")
    engine = engines[int(engine_number) - 1]
    engine_name = type(engine).__name__.replace("Engine", "")
    print()
    print()
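    # SystemEngine speaks with the local operating system voices and needs
    # no credentials; Azure and Elevenlabs are cloud services configured below.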

    # credentials ##########################################################################################################

    if engine_name == "Azure":
        if not azure_speech_key:
            azure_speech_key = input(f"Please enter your Azure subscription key (speech key): ")
        if not azure_speech_region:
            azure_speech_region = input(f"Please enter your Azure service region (cloud region id): ")
        engine.set_speech_key(azure_speech_key)
        engine.set_service_region(azure_speech_region)

    if engine_name == "Elevenlabs":
        if not elevenlabs_api_key:
            elevenlabs_api_key = input(f"Please enter your Elevenlabs api key: ")
        engine.set_api_key(elevenlabs_api_key)

    # voice selection #####################################################################################################

    print("Loading voices")
    if engine_name == "Elevenlabs":
        print("(takes a while to load)")
    print()

    voices = engine.get_voices()
    for index, voice in enumerate(voices, start=1):
        print(f"{index}. {voice}")

    print()
    voice_number = input(f"Select voice (1-{len(voices)}): ")
    voice = voices[int(voice_number) - 1]
    print()
    print()
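    # Each entry in voices is an engine-specific voice object; passing one
    # to engine.set_voice() below selects it for synthesis.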

    # create talking character ############################################################################################

    system_prompt = {
        'role': 'system',
        'content': character_personality
    }

    # start talk ##########################################################################################################

    engine.set_voice(voice)
    stream = TextToAudioStream(engine, log_characters=True)
    history = []

    def generate(messages):
        for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True):
            if (text_chunk := chunk["choices"][0]["delta"].get("content")):
                yield text_chunk
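
    # Note: openai.ChatCompletion.create is the pre-1.0 openai-python
    # interface; with openai>=1.0 the equivalent streaming call is
    # client.chat.completions.create(..., stream=True) on an OpenAI() client.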

    while True:
        # Wait until user presses space bar
        print("\n\nTap space when you're ready. ", end="", flush=True)
        keyboard.wait('space')
        while keyboard.is_pressed('space'): pass

        # Record from microphone until user presses space bar again
        print("I'm all ears. Tap space when you're done.\n")
        recorder.start()
        while not keyboard.is_pressed('space'):
            time.sleep(0.1)
        user_text = recorder.stop().text()
        print(f'>>> {user_text}\n<<< ', end="", flush=True)
        history.append({'role': 'user', 'content': user_text})

        # Generate and stream output
        generator = generate([system_prompt] + history[-10:])
        stream.feed(generator)

        stream.play_async()
        while stream.is_playing():
            if keyboard.is_pressed('space'):
                stream.stop()
                break
            time.sleep(0.1)

        history.append({'role': 'assistant', 'content': stream.text()})
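        # Only the system prompt plus the last 10 history entries
        # (history[-10:]) are sent to the model each turn, which keeps
        # the request size bounded while the full history keeps growing.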