# openai_voice_interface.py
  1. from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
  2. from RealtimeSTT import AudioToTextRecorder
  3. from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
  4. from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont, QMouseEvent, QContextMenuEvent
  5. from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget, QMenu, QAction
  6. import os
  7. import openai
  8. import sys
  9. import time
  10. import sounddevice as sd
  11. import numpy as np
  12. import wavio
  13. import keyboard
  14. max_history_messages = 6
  15. return_to_wakewords_after_silence = 12
  16. start_with_wakeword = False
  17. start_engine = "System" # Azure, Elevenlabs
  18. recorder_model = "large-v2"
  19. language = "en"
  20. azure_speech_region = "eastus"
  21. openai.api_key = os.environ.get("OPENAI_API_KEY")
  22. user_font_size = 22
  23. user_color = QColor(208, 208, 208) # gray
  24. assistant_font_size = 24
  25. assistant_color = QColor(240, 240, 240) # white
  26. voice_azure = "en-GB-SoniaNeural"
  27. voice_system = "Zira"
  28. #voice_system = "Hazel"
  29. prompt = "Respond helpfully, concisely, and when appropriate, with the subtle, polite irony of a butler."
  30. if language == "de":
  31. voice_system = "Katja"
  32. voice_azure = "de-DE-MajaNeural"
  33. prompt = 'Antworte hilfreich, knapp und bei Gelegenheit mit der feinen, höflichen Ironie eines Butlers.'
  34. print ("Click the top right corner to change the engine")
  35. print ("Press ESC to stop the current playback")
  36. system_prompt_message = {
  37. 'role': 'system',
  38. 'content': prompt
  39. }
  40. def generate_response(messages):
  41. """Generate assistant's response using OpenAI."""
  42. for chunk in openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True, logit_bias={35309:-100, 36661:-100}):
  43. text_chunk = chunk["choices"][0]["delta"].get("content")
  44. if text_chunk:
  45. yield text_chunk
  46. history = []
  47. MAX_WINDOW_WIDTH = 1600
  48. MAX_WIDTH_ASSISTANT = 1200
  49. MAX_WIDTH_USER = 1500
  50. class AudioPlayer(QThread):
  51. def __init__(self, file_path):
  52. super(AudioPlayer, self).__init__()
  53. self.file_path = file_path
  54. def run(self):
  55. wav = wavio.read(self.file_path)
  56. sound = wav.data.astype(np.float32) / np.iinfo(np.int16).max
  57. sd.play(sound, wav.rate)
  58. sd.wait()
  59. class TextRetrievalThread(QThread):
  60. textRetrieved = pyqtSignal(str)
  61. def __init__(self, recorder):
  62. super().__init__()
  63. self.recorder = recorder
  64. self.active = False
  65. def run(self):
  66. while True:
  67. if self.active:
  68. text = self.recorder.text()
  69. self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
  70. self.textRetrieved.emit(text)
  71. self.active = False
  72. time.sleep(0.1)
  73. def activate(self):
  74. self.active = True
  75. class TransparentWindow(QWidget):
  76. updateUI = pyqtSignal()
  77. clearAssistantTextSignal = pyqtSignal()
  78. clearUserTextSignal = pyqtSignal()
  79. def __init__(self):
  80. super().__init__()
  81. self.setGeometry(1, 1, 1, 1)
  82. self.setWindowTitle("Transparent Window")
  83. self.setAttribute(Qt.WA_TranslucentBackground)
  84. self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)
  85. self.big_symbol_font = QFont('Arial', 32)
  86. self.small_symbol_font = QFont('Arial', 17)
  87. self.user_font = QFont('Arial', user_font_size)
  88. self.assistant_font = QFont('Arial', assistant_font_size)
  89. self.assistant_font.setItalic(True)
  90. self.big_symbol_text = ""
  91. self.small_symbol_text = ""
  92. self.user_text = ""
  93. self.assistant_text = ""
  94. self.displayed_user_text = ""
  95. self.displayed_assistant_text = ""
  96. self.stream = None
  97. self.text_retrieval_thread = None
  98. self.user_text_timer = QTimer(self)
  99. self.assistant_text_timer = QTimer(self)
  100. self.user_text_timer.timeout.connect(self.clear_user_text)
  101. self.assistant_text_timer.timeout.connect(self.clear_assistant_text)
  102. self.clearUserTextSignal.connect(self.init_clear_user_text)
  103. self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)
  104. self.user_text_opacity = 255
  105. self.assistant_text_opacity = 255
  106. self.updateUI.connect(self.update_self)
  107. self.audio_player = None
  108. self.run_fade_user = False
  109. self.run_fade_assistant = False
  110. self.menu = QMenu()
  111. self.menu.setStyleSheet("""
  112. QMenu {
  113. background-color: black;
  114. color: white;
  115. border-radius: 10px;
  116. }
  117. QMenu::item:selected {
  118. background-color: #555555;
  119. }
  120. """)
  121. self.elevenlabs_action = QAction("Elevenlabs", self)
  122. self.azure_action = QAction("Azure", self)
  123. self.system_action = QAction("System", self)
  124. self.quit_action = QAction("Quit", self)
  125. self.menu.addAction(self.elevenlabs_action)
  126. self.menu.addAction(self.azure_action)
  127. self.menu.addAction(self.system_action)
  128. self.menu.addSeparator()
  129. self.menu.addAction(self.quit_action)
  130. self.elevenlabs_action.triggered.connect(lambda: self.select_engine("Elevenlabs"))
  131. self.azure_action.triggered.connect(lambda: self.select_engine("Azure"))
  132. self.system_action.triggered.connect(lambda: self.select_engine("System"))
  133. self.quit_action.triggered.connect(self.close_application)
  134. def mousePressEvent(self, event: QMouseEvent):
  135. if event.button() == Qt.LeftButton:
  136. if event.pos().x() >= self.width() - 100 and event.pos().y() <= 100:
  137. self.menu.exec_(self.mapToGlobal(event.pos()))
  138. def close_application(self):
  139. QApplication.quit()
  140. def init(self):
  141. self.select_engine(start_engine)
  142. self.recorder = AudioToTextRecorder(
  143. model=recorder_model,
  144. language=language,
  145. wake_words="Jarvis",
  146. spinner=True,
  147. silero_sensitivity=0.2,
  148. webrtc_sensitivity=3,
  149. on_recording_start=self.on_recording_start,
  150. on_vad_detect_start=self.on_vad_detect_start,
  151. on_wakeword_detection_start=self.on_wakeword_detection_start,
  152. on_transcription_start=self.on_transcription_start,
  153. )
  154. if not start_with_wakeword:
  155. self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
  156. self.text_retrieval_thread = TextRetrievalThread(self.recorder)
  157. self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
  158. self.text_retrieval_thread.start()
  159. self.text_retrieval_thread.activate()
  160. keyboard.on_press_key('esc', self.on_escape)
  161. def select_engine(self, engine_name):
  162. if self.stream:
  163. if self.stream.is_playing():
  164. self.stream.stop()
  165. self.stream = None
  166. engine = None
  167. if engine_name == "Azure":
  168. engine = AzureEngine(
  169. os.environ.get("AZURE_SPEECH_KEY"),
  170. azure_speech_region,
  171. voice_azure,
  172. rate=24,
  173. pitch=10,
  174. )
  175. elif engine_name == "Elevenlabs":
  176. engine = ElevenlabsEngine(
  177. os.environ.get("ELEVENLABS_API_KEY")
  178. )
  179. else:
  180. engine = SystemEngine(
  181. voice=voice_system,
  182. #print_installed_voices=True
  183. )
  184. self.stream = TextToAudioStream(
  185. engine,
  186. on_character=self.on_character,
  187. on_text_stream_stop=self.on_text_stream_stop,
  188. on_text_stream_start=self.on_text_stream_start,
  189. on_audio_stream_stop=self.on_audio_stream_stop,
  190. log_characters=True
  191. )
  192. print (f"Using {engine_name} engine")
  193. def on_escape(self, e):
  194. if self.stream.is_playing():
  195. self.stream.stop()
  196. def showEvent(self, event: QEvent):
  197. super().showEvent(event)
  198. if event.type() == QEvent.Show:
  199. self.set_symbols("⌛", "🚀")
  200. QTimer.singleShot(1000, self.init)
  201. def on_character(self, char):
  202. if self.stream:
  203. self.assistant_text += char
  204. self.updateUI.emit()
  205. def on_text_stream_stop(self):
  206. print("\"", end="", flush=True)
  207. if self.stream:
  208. assistant_response = self.stream.text()
  209. self.assistant_text = assistant_response
  210. history.append({'role': 'assistant', 'content': assistant_response})
  211. def on_audio_stream_stop(self):
  212. self.set_symbols("🎙️", "⚪")
  213. if self.stream:
  214. self.clearAssistantTextSignal.emit()
  215. self.text_retrieval_thread.activate()
  216. def generate_answer(self):
  217. self.run_fade_assistant = False
  218. if self.assistant_text_timer.isActive():
  219. self.assistant_text_timer.stop()
  220. history.append({'role': 'user', 'content': self.user_text})
  221. self.remove_assistant_text()
  222. assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
  223. self.stream.feed(assistant_response)
  224. self.stream.play_async(minimum_sentence_length=7,
  225. buffer_threshold_seconds=3)
  226. def set_symbols(self, big_symbol, small_symbol):
  227. self.big_symbol_text = big_symbol
  228. self.small_symbol_text = small_symbol
  229. self.updateUI.emit()
  230. def on_text_stream_start(self):
  231. self.set_symbols("⌛", "👄")
  232. def process_user_text(self, user_text):
  233. user_text = user_text.strip()
  234. if user_text:
  235. self.run_fade_user = False
  236. if self.user_text_timer.isActive():
  237. self.user_text_timer.stop()
  238. self.user_text_opacity = 255
  239. self.user_text = user_text
  240. self.clearUserTextSignal.emit()
  241. print (f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
  242. self.set_symbols("⌛", "🧠")
  243. QTimer.singleShot(100, self.generate_answer)
  244. def on_transcription_start(self):
  245. self.set_symbols("⌛", "📝")
  246. def on_recording_start(self):
  247. self.set_symbols("🎙️", "🔴")
  248. def on_vad_detect_start(self):
  249. if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
  250. self.audio_player = AudioPlayer("active.wav")
  251. self.audio_player.start()
  252. self.set_symbols("🎙️", "⚪")
  253. def on_wakeword_detection_start(self):
  254. self.audio_player = AudioPlayer("inactive.wav")
  255. self.audio_player.start()
  256. self.set_symbols("", "💤")
  257. def init_clear_user_text(self):
  258. if self.user_text_timer.isActive():
  259. self.user_text_timer.stop()
  260. self.user_text_timer.start(10000)
  261. def remove_user_text(self):
  262. self.user_text = ""
  263. self.user_text_opacity = 255
  264. self.updateUI.emit()
  265. def fade_out_user_text(self):
  266. if not self.run_fade_user:
  267. return
  268. if self.user_text_opacity > 0:
  269. self.user_text_opacity -= 5
  270. self.updateUI.emit()
  271. QTimer.singleShot(50, self.fade_out_user_text)
  272. else:
  273. self.run_fade_user = False
  274. self.remove_user_text()
  275. def clear_user_text(self):
  276. self.user_text_timer.stop()
  277. if not self.user_text:
  278. return
  279. self.user_text_opacity = 255
  280. self.run_fade_user = True
  281. self.fade_out_user_text()
  282. def init_clear_assistant_text(self):
  283. if self.assistant_text_timer.isActive():
  284. self.assistant_text_timer.stop()
  285. self.assistant_text_timer.start(10000)
  286. def remove_assistant_text(self):
  287. self.assistant_text = ""
  288. self.assistant_text_opacity = 255
  289. self.updateUI.emit()
  290. def fade_out_assistant_text(self):
  291. if not self.run_fade_assistant:
  292. return
  293. if self.assistant_text_opacity > 0:
  294. self.assistant_text_opacity -= 5
  295. self.updateUI.emit()
  296. QTimer.singleShot(50, self.fade_out_assistant_text)
  297. else:
  298. self.run_fade_assistant = False
  299. self.remove_assistant_text()
  300. def clear_assistant_text(self):
  301. self.assistant_text_timer.stop()
  302. if not self.assistant_text:
  303. return
  304. self.assistant_text_opacity = 255
  305. self.run_fade_assistant = True
  306. self.fade_out_assistant_text()
  307. def update_self(self):
  308. self.blockSignals(True)
  309. self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
  310. self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)
  311. fm_symbol = QFontMetrics(self.big_symbol_font)
  312. self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
  313. self.symbol_height = fm_symbol.height() + 8
  314. self.total_width = MAX_WINDOW_WIDTH
  315. fm_user = QFontMetrics(self.user_font)
  316. user_text_lines = (self.displayed_user_text.count("\n") + 1)
  317. self.user_height = fm_user.height() * user_text_lines + 7
  318. fm_assistant = QFontMetrics(self.assistant_font)
  319. assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
  320. self.assistant_height = fm_assistant.height() * assistant_text_lines + 18
  321. self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])
  322. desktop = QDesktopWidget()
  323. screen_rect = desktop.availableGeometry(desktop.primaryScreen())
  324. self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)
  325. self.blockSignals(False)
  326. self.update()
  327. def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
  328. painter.setPen(outlineColor)
  329. for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
  330. (-outline_size, -outline_size), (outline_size, -outline_size),
  331. (-outline_size, outline_size), (outline_size, outline_size)]:
  332. painter.drawText(x + dx, y + dy, width, height, alignment, text)
  333. painter.setPen(textColor)
  334. painter.drawText(x, y, width, height, alignment, text)
  335. def paintEvent(self, event):
  336. painter = QPainter(self)
  337. offsetX = 4
  338. offsetY = 5
  339. painter.setPen(QColor(255, 255, 255))
  340. # Draw symbol
  341. painter.setFont(self.big_symbol_font)
  342. if self.big_symbol_text:
  343. painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
  344. painter.setFont(self.small_symbol_font)
  345. painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
  346. else:
  347. painter.setFont(self.small_symbol_font)
  348. painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
  349. # Draw User Text
  350. painter.setFont(self.user_font)
  351. user_x = self.total_width - self.user_width - 45 + offsetX
  352. user_y = offsetY + 15
  353. user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
  354. outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
  355. self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)
  356. # Draw Assistant Text
  357. painter.setFont(self.assistant_font)
  358. assistant_x = self.total_width - self.assistant_width - 5 + offsetX
  359. assistant_y = self.user_height + offsetY + 15
  360. assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
  361. outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
  362. self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)
  363. def return_text_adjusted_to_width(self, text, font, max_width_allowed):
  364. """
  365. Line feeds are inserted so that the text width does never exceed max_width.
  366. Text is only broken up on whole words.
  367. """
  368. fm = QFontMetrics(font)
  369. words = text.split(' ')
  370. adjusted_text = ''
  371. current_line = ''
  372. max_width_used = 0
  373. for word in words:
  374. current_width = fm.width(current_line + word)
  375. if current_width <= max_width_allowed:
  376. current_line += word + ' '
  377. else:
  378. line_width = fm.width(current_line)
  379. if line_width > max_width_used:
  380. max_width_used = line_width
  381. adjusted_text += current_line + '\n'
  382. current_line = word + ' '
  383. line_width = fm.width(current_line)
  384. if line_width > max_width_used:
  385. max_width_used = line_width
  386. adjusted_text += current_line
  387. return adjusted_text.rstrip(), max_width_used
  388. if __name__ == '__main__':
  389. app = QApplication(sys.argv)
  390. window = TransparentWindow()
  391. window.show()
  392. sys.exit(app.exec_())