# openai_voice_interface.py
from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
from RealtimeSTT import AudioToTextRecorder
from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont, QMouseEvent, QContextMenuEvent
from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget, QMenu, QAction
import os
import openai
import sys
import time
import sounddevice as sd
import numpy as np
import wavio
import keyboard

# --- Tunable settings -------------------------------------------------------
max_history_messages = 6                  # number of recent chat messages sent to the model
return_to_wakewords_after_silence = 12    # value assigned to recorder.wake_word_activation_delay after each utterance
start_with_wakeword = False               # if False, recording is armed immediately on startup
start_engine = "System" # Azure, Elevenlabs
recorder_model = "large-v2"               # model name passed to AudioToTextRecorder
language = "en"
azure_speech_region = "eastus"
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Overlay text appearance.
user_font_size = 22
user_color = QColor(208, 208, 208) # gray
assistant_font_size = 24
assistant_color = QColor(240, 240, 240) # white

# Voice selection per TTS engine.
voice_azure = "en-GB-SoniaNeural"
voice_system = "Zira"
#voice_system = "Hazel"
prompt = "Respond helpfully, concisely, and when appropriate, with the subtle, polite irony of a butler."

# German overrides: swap voices and system prompt when language is "de".
if language == "de":
    voice_system = "Katja"
    voice_azure = "de-DE-MajaNeural"
    prompt = 'Antworte hilfreich, knapp und bei Gelegenheit mit der feinen, höflichen Ironie eines Butlers.'

print ("Click the top right corner to change the engine")
print ("Press ESC to stop the current playback")

# System message prepended to every request sent to the chat model.
system_prompt_message = {
    'role': 'system',
    'content': prompt
}
  40. def generate_response(messages):
  41. """Generate assistant's response using OpenAI."""
  42. for chunk in openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True, logit_bias={35309:-100, 36661:-100}):
  43. text_chunk = chunk["choices"][0]["delta"].get("content")
  44. if text_chunk:
  45. yield text_chunk
history = []  # running transcript of {'role': ..., 'content': ...} message dicts
MAX_WINDOW_WIDTH = 1600     # total width of the overlay window in pixels
MAX_WIDTH_ASSISTANT = 1200  # word-wrap width for the assistant's text
MAX_WIDTH_USER = 1500       # word-wrap width for the user's text
  50. class AudioPlayer(QThread):
  51. def __init__(self, file_path):
  52. super(AudioPlayer, self).__init__()
  53. self.file_path = file_path
  54. def run(self):
  55. wav = wavio.read(self.file_path)
  56. sound = wav.data.astype(np.float32) / np.iinfo(np.int16).max
  57. sd.play(sound, wav.rate)
  58. sd.wait()
  59. class TextRetrievalThread(QThread):
  60. textRetrieved = pyqtSignal(str)
  61. def __init__(self, recorder):
  62. super().__init__()
  63. self.recorder = recorder
  64. self.active = False
  65. def run(self):
  66. while True:
  67. if self.active:
  68. text = self.recorder.text()
  69. self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
  70. self.textRetrieved.emit(text)
  71. self.active = False
  72. time.sleep(0.1)
  73. def activate(self):
  74. self.active = True
class TransparentWindow(QWidget):
    """Frameless, always-on-top translucent overlay that shows status symbols and
    the latest user/assistant utterances, and orchestrates the STT -> LLM -> TTS loop.

    Worker callbacks (recorder/stream events) communicate with the GUI via the
    class signals below rather than touching widgets directly — presumably because
    they fire off the GUI thread (NOTE(review): confirm against RealtimeSTT/RealtimeTTS
    callback threading).
    """

    updateUI = pyqtSignal()                 # schedule a layout/size recompute + repaint
    clearAssistantTextSignal = pyqtSignal() # (re)start the assistant-text fade timer
    clearUserTextSignal = pyqtSignal()      # (re)start the user-text fade timer

    def __init__(self):
        super().__init__()
        self.setGeometry(1, 1, 1, 1)
        self.setWindowTitle("Transparent Window")
        self.setAttribute(Qt.WA_TranslucentBackground)
        self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)

        # Fonts: large/small status symbols, user text, and italic assistant text.
        self.big_symbol_font = QFont('Arial', 32)
        self.small_symbol_font = QFont('Arial', 17)
        self.user_font = QFont('Arial', user_font_size)
        self.assistant_font = QFont('Arial', assistant_font_size)
        self.assistant_font.setItalic(True)

        # Raw text state plus the width-wrapped versions actually painted.
        self.big_symbol_text = ""
        self.small_symbol_text = ""
        self.user_text = ""
        self.assistant_text = ""
        self.displayed_user_text = ""
        self.displayed_assistant_text = ""

        self.stream = None                  # TextToAudioStream, created in select_engine()
        self.text_retrieval_thread = None   # TextRetrievalThread, created in init()

        # Timers that trigger the fade-out of displayed text after 10 s (see init_clear_*).
        self.user_text_timer = QTimer(self)
        self.assistant_text_timer = QTimer(self)
        self.user_text_timer.timeout.connect(self.clear_user_text)
        self.assistant_text_timer.timeout.connect(self.clear_assistant_text)
        self.clearUserTextSignal.connect(self.init_clear_user_text)
        self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)

        # Per-text alpha (0-255) used by the fade animations.
        self.user_text_opacity = 255
        self.assistant_text_opacity = 255
        self.updateUI.connect(self.update_self)

        self.audio_player = None            # AudioPlayer thread for activation chimes
        self.run_fade_user = False          # True while the user-text fade loop is running
        self.run_fade_assistant = False     # True while the assistant-text fade loop is running

        # Context menu (top-right corner click) for switching the TTS engine.
        self.menu = QMenu()
        self.menu.setStyleSheet("""
            QMenu {
                background-color: black;
                color: white;
                border-radius: 10px;
            }
            QMenu::item:selected {
                background-color: #555555;
            }
        """)
        self.elevenlabs_action = QAction("Elevenlabs", self)
        self.azure_action = QAction("Azure", self)
        self.system_action = QAction("System", self)
        self.menu.addAction(self.elevenlabs_action)
        self.menu.addAction(self.azure_action)
        self.menu.addAction(self.system_action)
        self.elevenlabs_action.triggered.connect(lambda: self.select_engine("Elevenlabs"))
        self.azure_action.triggered.connect(lambda: self.select_engine("Azure"))
        self.system_action.triggered.connect(lambda: self.select_engine("System"))

    def mousePressEvent(self, event: QMouseEvent):
        """Open the engine menu on a left click in the window's top-right 100x100 px corner."""
        if event.button() == Qt.LeftButton:
            if event.pos().x() >= self.width() - 100 and event.pos().y() <= 100:
                self.menu.exec_(self.mapToGlobal(event.pos()))

    def init(self):
        """Create the TTS stream and the recorder, start text retrieval, and hook ESC.

        Invoked once, 1 s after the window is first shown (see showEvent).
        """
        self.select_engine(start_engine)
        self.recorder = AudioToTextRecorder(
            model=recorder_model,
            language=language,
            wake_words="Jarvis",
            spinner=True,
            silero_sensitivity=0.2,
            webrtc_sensitivity=3,
            on_recording_start=self.on_recording_start,
            on_vad_detect_start=self.on_vad_detect_start,
            on_wakeword_detection_start=self.on_wakeword_detection_start,
            on_transcription_start=self.on_transcription_start,
        )
        if not start_with_wakeword:
            # Skip the wake-word phase initially; fall back to it after this many
            # seconds of silence (per the setting's name — confirm with RealtimeSTT docs).
            self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
        self.text_retrieval_thread = TextRetrievalThread(self.recorder)
        self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
        self.text_retrieval_thread.start()
        self.text_retrieval_thread.activate()
        # Global hotkey: ESC interrupts the current TTS playback (see on_escape).
        keyboard.on_press_key('esc', self.on_escape)

    def select_engine(self, engine_name):
        """Stop any current playback and rebuild self.stream with the chosen TTS engine.

        engine_name is "Azure", "Elevenlabs", or anything else for the system voice.
        """
        if self.stream:
            if self.stream.is_playing():
                self.stream.stop()
            self.stream = None
        engine = None
        if engine_name == "Azure":
            engine = AzureEngine(
                os.environ.get("AZURE_SPEECH_KEY"),
                azure_speech_region,
                voice_azure,
                rate=24,
                pitch=10,
            )
        elif engine_name == "Elevenlabs":
            engine = ElevenlabsEngine(
                os.environ.get("ELEVENLABS_API_KEY")
            )
        else:
            engine = SystemEngine(
                voice=voice_system,
                #print_installed_voices=True
            )
        self.stream = TextToAudioStream(
            engine,
            on_character=self.on_character,
            on_text_stream_stop=self.on_text_stream_stop,
            on_text_stream_start=self.on_text_stream_start,
            on_audio_stream_stop=self.on_audio_stream_stop,
            log_characters=True
        )
        print (f"Using {engine_name} engine")

    def on_escape(self, e):
        """ESC key handler: stop TTS playback if it is running."""
        if self.stream.is_playing():
            self.stream.stop()

    def showEvent(self, event: QEvent):
        """On first show, display the startup symbols and defer init() by 1 s."""
        super().showEvent(event)
        if event.type() == QEvent.Show:
            self.set_symbols("⌛", "🚀")
            QTimer.singleShot(1000, self.init)

    def on_character(self, char):
        """Stream callback: append each synthesized character to the assistant text."""
        if self.stream:
            self.assistant_text += char
            self.updateUI.emit()

    def on_text_stream_stop(self):
        """Stream callback: the full reply is known — record it in the history."""
        # Closes the quote opened by the 'AI: "' print in process_user_text.
        print("\"", end="", flush=True)
        if self.stream:
            assistant_response = self.stream.text()
            self.assistant_text = assistant_response
            history.append({'role': 'assistant', 'content': assistant_response})

    def on_audio_stream_stop(self):
        """Stream callback: playback finished — fade the reply and listen again."""
        self.set_symbols("🎙️", "⚪")
        if self.stream:
            self.clearAssistantTextSignal.emit()
            self.text_retrieval_thread.activate()

    def generate_answer(self):
        """Send the trimmed history to the model and play the streamed reply."""
        self.run_fade_assistant = False
        if self.assistant_text_timer.isActive():
            self.assistant_text_timer.stop()
        history.append({'role': 'user', 'content': self.user_text})
        self.remove_assistant_text()
        # Only the last max_history_messages turns are sent, after the system prompt.
        assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
        self.stream.feed(assistant_response)
        self.stream.play_async(minimum_sentence_length=7,
                               buffer_threshold_seconds=3)

    def set_symbols(self, big_symbol, small_symbol):
        """Update both status symbols and request a repaint."""
        self.big_symbol_text = big_symbol
        self.small_symbol_text = small_symbol
        self.updateUI.emit()

    def on_text_stream_start(self):
        self.set_symbols("⌛", "👄")

    def process_user_text(self, user_text):
        """Slot for TextRetrievalThread: display the transcription and queue the LLM call."""
        user_text = user_text.strip()
        if user_text:
            self.run_fade_user = False
            if self.user_text_timer.isActive():
                self.user_text_timer.stop()
            self.user_text_opacity = 255
            self.user_text = user_text
            self.clearUserTextSignal.emit()
            print (f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
            self.set_symbols("⌛", "🧠")
            # Defer slightly so the UI update above is shown before the request starts.
            QTimer.singleShot(100, self.generate_answer)

    def on_transcription_start(self):
        self.set_symbols("⌛", "📝")

    def on_recording_start(self):
        self.set_symbols("🎙️", "🔴")

    def on_vad_detect_start(self):
        """Recorder callback: voice-activity detection armed; chime if leaving idle mode."""
        if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
            self.audio_player = AudioPlayer("active.wav")
            self.audio_player.start()
        self.set_symbols("🎙️", "⚪")

    def on_wakeword_detection_start(self):
        """Recorder callback: back to wake-word listening; play the idle chime."""
        self.audio_player = AudioPlayer("inactive.wav")
        self.audio_player.start()
        self.set_symbols("", "💤")

    def init_clear_user_text(self):
        # Restart the 10 s countdown before the user text fades out.
        if self.user_text_timer.isActive():
            self.user_text_timer.stop()
        self.user_text_timer.start(10000)

    def remove_user_text(self):
        self.user_text = ""
        self.user_text_opacity = 255
        self.updateUI.emit()

    def fade_out_user_text(self):
        """Self-rescheduling fade: drop opacity by 5 every 50 ms until invisible."""
        if not self.run_fade_user:
            return
        if self.user_text_opacity > 0:
            self.user_text_opacity -= 5
            self.updateUI.emit()
            QTimer.singleShot(50, self.fade_out_user_text)
        else:
            self.run_fade_user = False
            self.remove_user_text()

    def clear_user_text(self):
        """Timer slot: begin fading the user text (no-op when empty)."""
        self.user_text_timer.stop()
        if not self.user_text:
            return
        self.user_text_opacity = 255
        self.run_fade_user = True
        self.fade_out_user_text()

    def init_clear_assistant_text(self):
        # Restart the 10 s countdown before the assistant text fades out.
        if self.assistant_text_timer.isActive():
            self.assistant_text_timer.stop()
        self.assistant_text_timer.start(10000)

    def remove_assistant_text(self):
        self.assistant_text = ""
        self.assistant_text_opacity = 255
        self.updateUI.emit()

    def fade_out_assistant_text(self):
        """Self-rescheduling fade: drop opacity by 5 every 50 ms until invisible."""
        if not self.run_fade_assistant:
            return
        if self.assistant_text_opacity > 0:
            self.assistant_text_opacity -= 5
            self.updateUI.emit()
            QTimer.singleShot(50, self.fade_out_assistant_text)
        else:
            self.run_fade_assistant = False
            self.remove_assistant_text()

    def clear_assistant_text(self):
        """Timer slot: begin fading the assistant text (no-op when empty)."""
        self.assistant_text_timer.stop()
        if not self.assistant_text:
            return
        self.assistant_text_opacity = 255
        self.run_fade_assistant = True
        self.fade_out_assistant_text()

    def update_self(self):
        """Recompute wrapped texts and element sizes, then pin the window top-right."""
        self.blockSignals(True)
        self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
        self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)
        fm_symbol = QFontMetrics(self.big_symbol_font)
        self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
        self.symbol_height = fm_symbol.height() + 8
        self.total_width = MAX_WINDOW_WIDTH
        fm_user = QFontMetrics(self.user_font)
        user_text_lines = (self.displayed_user_text.count("\n") + 1)
        self.user_height = fm_user.height() * user_text_lines + 7
        fm_assistant = QFontMetrics(self.assistant_font)
        assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
        self.assistant_height = fm_assistant.height() * assistant_text_lines + 18
        self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])
        desktop = QDesktopWidget()
        screen_rect = desktop.availableGeometry(desktop.primaryScreen())
        # Anchor to the top-right of the primary screen with a 50 px margin.
        self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)
        self.blockSignals(False)
        self.update()

    def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
        """Paint text with an outline by drawing it 8 times offset in outlineColor first."""
        painter.setPen(outlineColor)
        for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
                       (-outline_size, -outline_size), (outline_size, -outline_size),
                       (-outline_size, outline_size), (outline_size, outline_size)]:
            painter.drawText(x + dx, y + dy, width, height, alignment, text)
        painter.setPen(textColor)
        painter.drawText(x, y, width, height, alignment, text)

    def paintEvent(self, event):
        """Paint the status symbols (top-right) and the outlined user/assistant texts."""
        painter = QPainter(self)
        offsetX = 4
        offsetY = 5
        painter.setPen(QColor(255, 255, 255))

        # Draw symbol
        painter.setFont(self.big_symbol_font)
        if self.big_symbol_text:
            painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
            painter.setFont(self.small_symbol_font)
            painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
        else:
            painter.setFont(self.small_symbol_font)
            painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)

        # Draw User Text
        painter.setFont(self.user_font)
        user_x = self.total_width - self.user_width - 45 + offsetX
        user_y = offsetY + 15
        user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
        outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
        self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)

        # Draw Assistant Text
        painter.setFont(self.assistant_font)
        assistant_x = self.total_width - self.assistant_width - 5 + offsetX
        assistant_y = self.user_height + offsetY + 15
        assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
        outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
        self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)

    def return_text_adjusted_to_width(self, text, font, max_width_allowed):
        """
        Line feeds are inserted so that the text width does never exceed max_width.
        Text is only broken up on whole words.

        Returns a (wrapped_text, max_line_width_px) tuple.
        """
        fm = QFontMetrics(font)
        words = text.split(' ')
        adjusted_text = ''
        current_line = ''
        max_width_used = 0
        for word in words:
            current_width = fm.width(current_line + word)
            if current_width <= max_width_allowed:
                current_line += word + ' '
            else:
                # Line is full: flush it and start a new one with this word.
                line_width = fm.width(current_line)
                if line_width > max_width_used:
                    max_width_used = line_width
                adjusted_text += current_line + '\n'
                current_line = word + ' '
        line_width = fm.width(current_line)
        if line_width > max_width_used:
            max_width_used = line_width
        adjusted_text += current_line
        return adjusted_text.rstrip(), max_width_used
  382. if __name__ == '__main__':
  383. app = QApplication(sys.argv)
  384. window = TransparentWindow()
  385. window.show()
  386. sys.exit(app.exec_())