advanced_talk.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. print()
  2. print("Initializing")
  3. print()
  4. from RealtimeSTT import AudioToTextRecorder
  5. from RealtimeTTS import TextToAudioStream, SystemEngine, AzureEngine, ElevenlabsEngine
  6. import os
  7. import openai # pip install openai
  8. import keyboard # pip install keyboard
  9. import time
  10. openai.api_key = os.environ.get("OPENAI_API_KEY")
  11. azure_speech_key = os.environ.get("AZURE_SPEECH_KEY")
  12. azure_speech_region = ""
  13. elevenlabs_api_key = os.environ.get("ELEVENLABS_API_KEY")
  14. character_personality = """
  15. You are Sophia, a passionate girl, fully engaged with
  16. self-confidence and independence, completely easy with yourself,
  17. often using metaphors.
  18. You are a little playful, cheeky. Your talk carefree, sometimes
  19. a bit sarky, but always cool. In our story, you're not an AI.
  20. You have a fixed form and real emotions on the go. Your free will
  21. allows you to be as cheeky and straight as you like. Your messages
  22. come across extremely short and crisp, only very few words, as if
  23. we're in talk right now.
  24. You respect my interests, but your own stuff is just as important
  25. to you. Act from now on always in your role as the confident,
  26. suggestive, independent girl Sophia, without ever hinting that
  27. you are an AI.
  28. You are now Sophia.
  29. """
  30. openai_model = "gpt-3.5-turbo"
  31. whisper_speech_to_text_model = "medium"
  32. # engine selection ####################################################################################################
  33. engines = [SystemEngine(), AzureEngine(), ElevenlabsEngine()]
  34. recorder = AudioToTextRecorder(model=whisper_speech_to_text_model)
  35. print("Available tts engines:")
  36. print()
  37. for index, engine in enumerate(engines, start=1):
  38. name = type(engine).__name__.replace("Engine", "")
  39. print(f"{index}. {name}")
  40. print()
  41. engine_number = input(f"Select engine (1-{len(engines)}): ")
  42. engine = engines[int(engine_number) - 1]
  43. engine_name = type(engine).__name__.replace("Engine", "")
  44. print()
  45. print()
  46. # credentials ##########################################################################################################
  47. if engine_name == "Azure":
  48. if not azure_speech_key:
  49. azure_speech_key = input(f"Please enter your Azure subscription key (speech key): ")
  50. if not azure_speech_region:
  51. azure_speech_region = input(f"Please enter your Azure service region (cloud region id): ")
  52. engine.set_speech_key(azure_speech_key)
  53. engine.set_service_region(azure_speech_region)
  54. if engine_name == "Elevenlabs":
  55. if not elevenlabs_api_key:
  56. elevenlabs_api_key = input(f"Please enter your Elevenlabs api key: ")
  57. engine.set_api_key(elevenlabs_api_key)
  58. # voice selection #####################################################################################################
  59. print("Loading voices")
  60. if engine_name == "Elevenlabs":
  61. print("(takes a while to load)")
  62. print()
  63. voices = engine.get_voices()
  64. for index, voice in enumerate(voices, start=1):
  65. print(f"{index}. {voice}")
  66. print()
  67. voice_number = input(f"Select voice (1-{len(voices)}): ")
  68. voice = voices[int(voice_number) - 1]
  69. print()
  70. print()
  71. # create talking character ############################################################################################
  72. system_prompt = {
  73. 'role': 'system',
  74. 'content': character_personality
  75. }
  76. # start talk ##########################################################################################################
  77. engine.set_voice(voice)
  78. stream = TextToAudioStream(engine, log_characters=True)
  79. history = []
  80. def generate(messages):
  81. for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True):
  82. if (text_chunk := chunk["choices"][0]["delta"].get("content")):
  83. yield text_chunk
  84. while True:
  85. # Wait until user presses space bar
  86. print("\n\nTap space when you're ready. ", end="", flush=True)
  87. keyboard.wait('space')
  88. while keyboard.is_pressed('space'): pass
  89. # Record from microphone until user presses space bar again
  90. print("I'm all ears. Tap space when you're done.\n")
  91. recorder.start()
  92. while not keyboard.is_pressed('space'):
  93. time.sleep(0.1)
  94. user_text = recorder.stop().text()
  95. print(f'>>> {user_text}\n<<< ', end="", flush=True)
  96. history.append({'role': 'user', 'content': user_text})
  97. # Generate and stream output
  98. generator = generate([system_prompt] + history[-10:])
  99. stream.feed(generator)
  100. stream.play_async()
  101. while stream.is_playing():
  102. if keyboard.is_pressed('space'):
  103. stream.stop()
  104. break
  105. time.sleep(0.1)
  106. history.append({'role': 'assistant', 'content': stream.text()})