# openai_voice_interface.py — voice assistant: RealtimeSTT mic input -> OpenAI chat -> RealtimeTTS speech output
import os
import openai
from RealtimeTTS import TextToAudioStream, AzureEngine
from RealtimeSTT import AudioToTextRecorder

# OpenAI API key from the environment (None if unset — requests would then fail
# at call time rather than here).
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Text-to-Speech stream backed by Azure's engine.
# Alternatives: SystemEngine or ElevenlabsEngine.
stream = TextToAudioStream(
    AzureEngine(
        os.environ.get("AZURE_SPEECH_KEY"),  # Azure Speech subscription key
        "eastus",  # Azure service region
    )
)

# Speech-to-Text recorder gated by the "Jarvis" wake word.
recorder = AudioToTextRecorder(
    model="medium",  # presumably the Whisper model size — TODO confirm against RealtimeSTT docs
    language="en",
    wake_words="Jarvis",
    spinner=True,
    wake_word_activation_delay=5  # NOTE(review): looks like a delay in seconds — verify unit in RealtimeSTT docs
)

# System prompt prepended to every chat request in main().
system_prompt_message = {
    'role': 'system',
    'content': 'Answer precise and short with the polite sarcasm of a butler.'
}
  27. def generate_response(messages):
  28. """Generate assistant's response using OpenAI."""
  29. for chunk in openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True):
  30. text_chunk = chunk["choices"][0]["delta"].get("content")
  31. if text_chunk:
  32. yield text_chunk
  33. history = []
  34. def main():
  35. """Main loop for interaction."""
  36. while True:
  37. # Capture user input from microphone
  38. user_text = recorder.text().strip()
  39. if not user_text:
  40. continue
  41. print(f'>>> {user_text}\n<<< ', end="", flush=True)
  42. history.append({'role': 'user', 'content': user_text})
  43. # Get assistant response and play it
  44. assistant_response = generate_response([system_prompt_message] + history[-10:])
  45. stream.feed(assistant_response).play(log_characters=True)
  46. history.append({'role': 'assistant', 'content': stream.text()})
  47. if __name__ == "__main__":
  48. main()