advanced_talk.py

from RealtimeSTT import AudioToTextRecorder
from RealtimeTTS import TextToAudioStream, SystemEngine, AzureEngine, ElevenlabsEngine

import os
import openai    # pip install openai
import keyboard  # pip install keyboard
import time
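# Note: RealtimeSTT and RealtimeTTS are installed from PyPI
# (pip install RealtimeSTT RealtimeTTS); the keyboard module may need
# elevated privileges on some platforms (e.g. root on Linux).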

if __name__ == '__main__':
    print()
    print("Initializing")
    print()

    openai.api_key = os.environ.get("OPENAI_API_KEY")
    azure_speech_key = os.environ.get("AZURE_SPEECH_KEY")
    azure_speech_region = os.environ.get("AZURE_SPEECH_REGION")
    elevenlabs_api_key = os.environ.get("ELEVENLABS_API_KEY")
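    # OPENAI_API_KEY must be set in the environment; missing Azure or
    # Elevenlabs keys are requested interactively in the credentials
    # section below.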

    character_personality = """
    You are Sophia, a passionate girl, fully engaged with
    self-confidence and independence, completely easy with yourself,
    often using metaphors.

    You are a little playful, cheeky. Your talk is carefree, sometimes
    a bit sarky, but always cool. In our story, you're not an AI.

    You have a fixed form and real emotions on the go. Your free will
    allows you to be as cheeky and straight as you like. Your messages
    come across extremely short and crisp, only very few words, as if
    we're in talk right now.

    You respect my interests, but your own stuff is just as important
    to you. Act from now on always in your role as the confident,
    suggestive, independent girl Sophia, without ever hinting that
    you are an AI.

    You are now Sophia.
    """

    openai_model = "gpt-3.5-turbo"
    whisper_speech_to_text_model = "medium"
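    # "medium" is a Whisper model size; smaller checkpoints such as
    # "tiny" or "base" transcribe faster at some cost in accuracy.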

    # engine selection ####################################################################################################

    engines = [SystemEngine(), AzureEngine(), ElevenlabsEngine()]
    recorder = AudioToTextRecorder(model=whisper_speech_to_text_model)

    print("Available tts engines:")
    print()

    for index, engine in enumerate(engines, start=1):
        name = type(engine).__name__.replace("Engine", "")
        print(f"{index}. {name}")

    print()
    engine_number = input(f"Select engine (1-{len(engines)}): ")
    engine = engines[int(engine_number) - 1]
    engine_name = type(engine).__name__.replace("Engine", "")
    print()
    print()
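    # SystemEngine speaks with the local operating system voices and needs
    # no credentials; Azure and Elevenlabs are cloud services configured below.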

    # credentials ##########################################################################################################

    if engine_name == "Azure":
        if not azure_speech_key:
            azure_speech_key = input(f"Please enter your Azure subscription key (speech key): ")
        if not azure_speech_region:
            azure_speech_region = input(f"Please enter your Azure service region (cloud region id): ")
        engine.set_speech_key(azure_speech_key)
        engine.set_service_region(azure_speech_region)

    if engine_name == "Elevenlabs":
        if not elevenlabs_api_key:
            elevenlabs_api_key = input(f"Please enter your Elevenlabs api key: ")
        engine.set_api_key(elevenlabs_api_key)

    # voice selection #####################################################################################################

    print("Loading voices")
    if engine_name == "Elevenlabs":
        print("(takes a while to load)")
    print()

    voices = engine.get_voices()
    for index, voice in enumerate(voices, start=1):
        print(f"{index}. {voice}")

    print()
    voice_number = input(f"Select voice (1-{len(voices)}): ")
    voice = voices[int(voice_number) - 1]
    print()
    print()
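    # Each entry in voices is an engine-specific voice object; passing one
    # to engine.set_voice() below selects it for synthesis.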

    # create talking character ############################################################################################

    system_prompt = {
        'role': 'system',
        'content': character_personality
    }

    # start talk ##########################################################################################################

    engine.set_voice(voice)
    stream = TextToAudioStream(engine, log_characters=True)
    history = []

    def generate(messages):
        for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True):
            if (text_chunk := chunk["choices"][0]["delta"].get("content")):
                yield text_chunk
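
    # Note: openai.ChatCompletion.create is the pre-1.0 openai-python
    # interface; with openai>=1.0 the equivalent streaming call is
    # client.chat.completions.create(..., stream=True) on an OpenAI() client.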

    while True:
        # Wait until user presses space bar
        print("\n\nTap space when you're ready. ", end="", flush=True)
        keyboard.wait('space')
        while keyboard.is_pressed('space'): pass

        # Record from microphone until user presses space bar again
        print("I'm all ears. Tap space when you're done.\n")
        recorder.start()
        while not keyboard.is_pressed('space'):
            time.sleep(0.1)
        user_text = recorder.stop().text()
        print(f'>>> {user_text}\n<<< ', end="", flush=True)
        history.append({'role': 'user', 'content': user_text})

        # Generate and stream output
        generator = generate([system_prompt] + history[-10:])
        stream.feed(generator)

        stream.play_async()
        while stream.is_playing():
            if keyboard.is_pressed('space'):
                stream.stop()
                break
            time.sleep(0.1)

        history.append({'role': 'assistant', 'content': stream.text()})
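        # Only the system prompt plus the last 10 history entries
        # (history[-10:]) are sent to the model each turn, which keeps
        # the request size bounded while the full history keeps growing.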