openai_voice_interface.py 1.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. import os
  2. import openai
  3. from RealtimeTTS import TextToAudioStream, AzureEngine
  4. from RealtimeSTT import AudioToTextRecorder
# Initialize OpenAI key
# NOTE(review): module-level `openai.api_key` assignment is the pre-1.0 openai
# SDK style — consistent with the openai.ChatCompletion call below; confirm
# the installed openai version is < 1.0.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Text-to-Speech Stream Setup
stream = TextToAudioStream(
    # Alternatives: SystemEngine or ElevenlabsEngine
    AzureEngine(
        os.environ.get("AZURE_SPEECH_KEY"),
        "eastus",  # Azure Speech service region
    ),
    log_characters=True  # echo each synthesized character to stdout
)

# Speech-to-Text Recorder Setup
recorder = AudioToTextRecorder(
    model="medium",      # Whisper model size used for transcription
    language="en",
    wake_words="Jarvis",  # phrase that triggers recording
    spinner=True,
    wake_word_activation_delay=5  # seconds — presumably delay before wake-word mode engages; verify against RealtimeSTT docs
)

# Persona message prepended to every chat request in main()
system_prompt_message = {
    'role': 'system',
    'content': 'Answer precise and short with the polite sarcasm of a butler.'
}
  28. def generate_response(messages):
  29. """Generate assistant's response using OpenAI."""
  30. for chunk in openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True):
  31. text_chunk = chunk["choices"][0]["delta"].get("content")
  32. if text_chunk:
  33. yield text_chunk
  34. history = []
  35. def main():
  36. """Main loop for interaction."""
  37. while True:
  38. # Capture user input from microphone
  39. user_text = recorder.text().strip()
  40. if not user_text:
  41. continue
  42. print(f'>>> {user_text}\n<<< ', end="", flush=True)
  43. history.append({'role': 'user', 'content': user_text})
  44. # Get assistant response and play it
  45. assistant_response = generate_response([system_prompt_message] + history[-10:])
  46. stream.feed(assistant_response).play()
  47. history.append({'role': 'assistant', 'content': stream.text()})
  48. if __name__ == "__main__":
  49. main()