# openai_voice_interface.py
  1. from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
  2. from RealtimeSTT import AudioToTextRecorder
  3. from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
  4. from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont, QMouseEvent, QContextMenuEvent
  5. from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget, QMenu, QAction
  6. import os
  7. import openai
  8. import sys
  9. import time
  10. import sounddevice as sd
  11. import numpy as np
  12. import wavio
  13. import keyboard
# ---------------------------------------------------------------------------
# User-tunable configuration
# ---------------------------------------------------------------------------
max_history_messages = 6  # number of recent chat turns sent with each OpenAI request
return_to_wakewords_after_silence = 12  # seconds of silence before wake-word mode resumes
start_with_wakeword = False  # False: listen immediately on startup, no "Jarvis" needed
start_engine = "System" # Azure, Elevenlabs
recorder_model = "medium"  # RealtimeSTT (Whisper) model size
language = "en"  # "de" switches voices, TTS model and prompt below
azure_speech_region = "eastus"
openai_model = "gpt-4" # gpt-3.5-turbo, gpt-4, gpt-3.5-turbo-0613 / gpt-3.5-turbo-16k-0613 / gpt-4-0613 / gpt-4-32k-0613
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Overlay text appearance.
user_font_size = 22
user_color = QColor(208, 208, 208) # gray
assistant_font_size = 24
assistant_color = QColor(240, 240, 240) # white

voice_azure = "en-GB-SoniaNeural"
voice_system = "Zira"
#voice_system = "Hazel"
prompt = "Be concise, polite, and casual with a touch of sass. Aim for short, direct responses, as if we're talking."
elevenlabs_model = "eleven_monolingual_v1"

# German localization: multilingual TTS model, German voices and prompt.
if language == "de":
    elevenlabs_model = "eleven_multilingual_v1"
    voice_system = "Katja"
    voice_azure = "de-DE-MajaNeural"
    prompt = 'Sei präzise, höflich und locker, mit einer Prise Schlagfertigkeit. Antworte kurz und direkt, als ob wir gerade sprechen.'

print ("Click the top right corner to change the engine")
print ("Press ESC to stop the current playback")

# System message prepended to every request sent to OpenAI.
system_prompt_message = {
    'role': 'system',
    'content': prompt
}
  43. def generate_response(messages):
  44. """Generate assistant's response using OpenAI."""
  45. for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True, logit_bias={35309:-100, 36661:-100}):
  46. text_chunk = chunk["choices"][0]["delta"].get("content")
  47. if text_chunk:
  48. yield text_chunk
# Rolling conversation history (user/assistant turns); only the last
# max_history_messages entries are sent with each request.
history = []

# Overlay layout constants (pixels): overall window width and the maximum
# line width before text wraps, per speaker.
MAX_WINDOW_WIDTH = 1600
MAX_WIDTH_ASSISTANT = 1200
MAX_WIDTH_USER = 1500
  53. class AudioPlayer(QThread):
  54. def __init__(self, file_path):
  55. super(AudioPlayer, self).__init__()
  56. self.file_path = file_path
  57. def run(self):
  58. wav = wavio.read(self.file_path)
  59. sound = wav.data.astype(np.float32) / np.iinfo(np.int16).max
  60. sd.play(sound, wav.rate)
  61. sd.wait()
  62. class TextRetrievalThread(QThread):
  63. textRetrieved = pyqtSignal(str)
  64. def __init__(self, recorder):
  65. super().__init__()
  66. self.recorder = recorder
  67. self.active = False
  68. def run(self):
  69. while True:
  70. if self.active:
  71. text = self.recorder.text()
  72. self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
  73. self.textRetrieved.emit(text)
  74. self.active = False
  75. time.sleep(0.1)
  76. def activate(self):
  77. self.active = True
  78. class TransparentWindow(QWidget):
  79. updateUI = pyqtSignal()
  80. clearAssistantTextSignal = pyqtSignal()
  81. clearUserTextSignal = pyqtSignal()
  82. def __init__(self):
  83. super().__init__()
  84. self.setGeometry(1, 1, 1, 1)
  85. self.setWindowTitle("Transparent Window")
  86. self.setAttribute(Qt.WA_TranslucentBackground)
  87. self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)
  88. self.big_symbol_font = QFont('Arial', 32)
  89. self.small_symbol_font = QFont('Arial', 17)
  90. self.user_font = QFont('Arial', user_font_size)
  91. self.assistant_font = QFont('Arial', assistant_font_size)
  92. self.assistant_font.setItalic(True)
  93. self.big_symbol_text = ""
  94. self.small_symbol_text = ""
  95. self.user_text = ""
  96. self.assistant_text = ""
  97. self.displayed_user_text = ""
  98. self.displayed_assistant_text = ""
  99. self.stream = None
  100. self.text_retrieval_thread = None
  101. self.user_text_timer = QTimer(self)
  102. self.assistant_text_timer = QTimer(self)
  103. self.user_text_timer.timeout.connect(self.clear_user_text)
  104. self.assistant_text_timer.timeout.connect(self.clear_assistant_text)
  105. self.clearUserTextSignal.connect(self.init_clear_user_text)
  106. self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)
  107. self.user_text_opacity = 255
  108. self.assistant_text_opacity = 255
  109. self.updateUI.connect(self.update_self)
  110. self.audio_player = None
  111. self.run_fade_user = False
  112. self.run_fade_assistant = False
  113. self.menu = QMenu()
  114. self.menu.setStyleSheet("""
  115. QMenu {
  116. background-color: black;
  117. color: white;
  118. border-radius: 10px;
  119. }
  120. QMenu::item:selected {
  121. background-color: #555555;
  122. }
  123. """)
  124. self.elevenlabs_action = QAction("Elevenlabs", self)
  125. self.azure_action = QAction("Azure", self)
  126. self.system_action = QAction("System", self)
  127. self.quit_action = QAction("Quit", self)
  128. self.menu.addAction(self.elevenlabs_action)
  129. self.menu.addAction(self.azure_action)
  130. self.menu.addAction(self.system_action)
  131. self.menu.addSeparator()
  132. self.menu.addAction(self.quit_action)
  133. self.elevenlabs_action.triggered.connect(lambda: self.select_engine("Elevenlabs"))
  134. self.azure_action.triggered.connect(lambda: self.select_engine("Azure"))
  135. self.system_action.triggered.connect(lambda: self.select_engine("System"))
  136. self.quit_action.triggered.connect(self.close_application)
  137. def mousePressEvent(self, event: QMouseEvent):
  138. if event.button() == Qt.LeftButton:
  139. if event.pos().x() >= self.width() - 100 and event.pos().y() <= 100:
  140. self.menu.exec_(self.mapToGlobal(event.pos()))
  141. def close_application(self):
  142. QApplication.quit()
  143. def init(self):
  144. self.select_engine(start_engine)
  145. self.recorder = AudioToTextRecorder(
  146. model=recorder_model,
  147. language=language,
  148. wake_words="Jarvis",
  149. spinner=True,
  150. silero_sensitivity=0.2,
  151. webrtc_sensitivity=3,
  152. on_recording_start=self.on_recording_start,
  153. on_vad_detect_start=self.on_vad_detect_start,
  154. on_wakeword_detection_start=self.on_wakeword_detection_start,
  155. on_transcription_start=self.on_transcription_start,
  156. )
  157. if not start_with_wakeword:
  158. self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
  159. self.text_retrieval_thread = TextRetrievalThread(self.recorder)
  160. self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
  161. self.text_retrieval_thread.start()
  162. self.text_retrieval_thread.activate()
  163. keyboard.on_press_key('esc', self.on_escape)
  164. def select_engine(self, engine_name):
  165. if self.stream:
  166. self.stream.stop()
  167. self.stream = None
  168. engine = None
  169. if engine_name == "Azure":
  170. engine = AzureEngine(
  171. os.environ.get("AZURE_SPEECH_KEY"),
  172. azure_speech_region,
  173. voice_azure,
  174. rate=24,
  175. pitch=10,
  176. )
  177. elif engine_name == "Elevenlabs":
  178. engine = ElevenlabsEngine(
  179. os.environ.get("ELEVENLABS_API_KEY"),
  180. model=elevenlabs_model
  181. )
  182. else:
  183. engine = SystemEngine(
  184. voice=voice_system,
  185. #print_installed_voices=True
  186. )
  187. self.stream = TextToAudioStream(
  188. engine,
  189. on_character=self.on_character,
  190. on_text_stream_stop=self.on_text_stream_stop,
  191. on_text_stream_start=self.on_text_stream_start,
  192. on_audio_stream_stop=self.on_audio_stream_stop,
  193. log_characters=True
  194. )
  195. sys.stdout.write('\033[K') # Clear to the end of line
  196. sys.stdout.write('\r') # Move the cursor to the beginning of the line
  197. print (f"Using {engine_name} engine")
  198. def on_escape(self, e):
  199. if self.stream.is_playing():
  200. self.stream.stop()
  201. def showEvent(self, event: QEvent):
  202. super().showEvent(event)
  203. if event.type() == QEvent.Show:
  204. self.set_symbols("⌛", "🚀")
  205. QTimer.singleShot(1000, self.init)
  206. def on_character(self, char):
  207. if self.stream:
  208. self.assistant_text += char
  209. self.updateUI.emit()
  210. def on_text_stream_stop(self):
  211. print("\"", end="", flush=True)
  212. if self.stream:
  213. assistant_response = self.stream.text()
  214. self.assistant_text = assistant_response
  215. history.append({'role': 'assistant', 'content': assistant_response})
  216. def on_audio_stream_stop(self):
  217. self.set_symbols("🎙️", "⚪")
  218. if self.stream:
  219. self.clearAssistantTextSignal.emit()
  220. self.text_retrieval_thread.activate()
  221. def generate_answer(self):
  222. self.run_fade_assistant = False
  223. if self.assistant_text_timer.isActive():
  224. self.assistant_text_timer.stop()
  225. history.append({'role': 'user', 'content': self.user_text})
  226. self.remove_assistant_text()
  227. assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
  228. self.stream.feed(assistant_response)
  229. self.stream.play_async(minimum_sentence_length=6,
  230. buffer_threshold_seconds=2)
  231. def set_symbols(self, big_symbol, small_symbol):
  232. self.big_symbol_text = big_symbol
  233. self.small_symbol_text = small_symbol
  234. self.updateUI.emit()
  235. def on_text_stream_start(self):
  236. self.set_symbols("⌛", "👄")
  237. def process_user_text(self, user_text):
  238. user_text = user_text.strip()
  239. if user_text:
  240. self.run_fade_user = False
  241. if self.user_text_timer.isActive():
  242. self.user_text_timer.stop()
  243. self.user_text_opacity = 255
  244. self.user_text = user_text
  245. self.clearUserTextSignal.emit()
  246. print (f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
  247. self.set_symbols("⌛", "🧠")
  248. QTimer.singleShot(100, self.generate_answer)
  249. def on_transcription_start(self):
  250. self.set_symbols("⌛", "📝")
  251. def on_recording_start(self):
  252. self.set_symbols("🎙️", "🔴")
  253. def on_vad_detect_start(self):
  254. if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
  255. self.audio_player = AudioPlayer("active.wav")
  256. self.audio_player.start()
  257. self.set_symbols("🎙️", "⚪")
  258. def on_wakeword_detection_start(self):
  259. self.audio_player = AudioPlayer("inactive.wav")
  260. self.audio_player.start()
  261. self.set_symbols("", "💤")
  262. def init_clear_user_text(self):
  263. if self.user_text_timer.isActive():
  264. self.user_text_timer.stop()
  265. self.user_text_timer.start(10000)
  266. def remove_user_text(self):
  267. self.user_text = ""
  268. self.user_text_opacity = 255
  269. self.updateUI.emit()
  270. def fade_out_user_text(self):
  271. if not self.run_fade_user:
  272. return
  273. if self.user_text_opacity > 0:
  274. self.user_text_opacity -= 5
  275. self.updateUI.emit()
  276. QTimer.singleShot(50, self.fade_out_user_text)
  277. else:
  278. self.run_fade_user = False
  279. self.remove_user_text()
  280. def clear_user_text(self):
  281. self.user_text_timer.stop()
  282. if not self.user_text:
  283. return
  284. self.user_text_opacity = 255
  285. self.run_fade_user = True
  286. self.fade_out_user_text()
  287. def init_clear_assistant_text(self):
  288. if self.assistant_text_timer.isActive():
  289. self.assistant_text_timer.stop()
  290. self.assistant_text_timer.start(10000)
  291. def remove_assistant_text(self):
  292. self.assistant_text = ""
  293. self.assistant_text_opacity = 255
  294. self.updateUI.emit()
  295. def fade_out_assistant_text(self):
  296. if not self.run_fade_assistant:
  297. return
  298. if self.assistant_text_opacity > 0:
  299. self.assistant_text_opacity -= 5
  300. self.updateUI.emit()
  301. QTimer.singleShot(50, self.fade_out_assistant_text)
  302. else:
  303. self.run_fade_assistant = False
  304. self.remove_assistant_text()
  305. def clear_assistant_text(self):
  306. self.assistant_text_timer.stop()
  307. if not self.assistant_text:
  308. return
  309. self.assistant_text_opacity = 255
  310. self.run_fade_assistant = True
  311. self.fade_out_assistant_text()
  312. def update_self(self):
  313. self.blockSignals(True)
  314. self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
  315. self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)
  316. fm_symbol = QFontMetrics(self.big_symbol_font)
  317. self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
  318. self.symbol_height = fm_symbol.height() + 8
  319. self.total_width = MAX_WINDOW_WIDTH
  320. fm_user = QFontMetrics(self.user_font)
  321. user_text_lines = (self.displayed_user_text.count("\n") + 1)
  322. self.user_height = fm_user.height() * user_text_lines + 7
  323. fm_assistant = QFontMetrics(self.assistant_font)
  324. assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
  325. self.assistant_height = fm_assistant.height() * assistant_text_lines + 18
  326. self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])
  327. desktop = QDesktopWidget()
  328. screen_rect = desktop.availableGeometry(desktop.primaryScreen())
  329. self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)
  330. self.blockSignals(False)
  331. self.update()
  332. def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
  333. painter.setPen(outlineColor)
  334. for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
  335. (-outline_size, -outline_size), (outline_size, -outline_size),
  336. (-outline_size, outline_size), (outline_size, outline_size)]:
  337. painter.drawText(x + dx, y + dy, width, height, alignment, text)
  338. painter.setPen(textColor)
  339. painter.drawText(x, y, width, height, alignment, text)
  340. def paintEvent(self, event):
  341. painter = QPainter(self)
  342. offsetX = 4
  343. offsetY = 5
  344. painter.setPen(QColor(255, 255, 255))
  345. # Draw symbol
  346. painter.setFont(self.big_symbol_font)
  347. if self.big_symbol_text:
  348. painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
  349. painter.setFont(self.small_symbol_font)
  350. painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
  351. else:
  352. painter.setFont(self.small_symbol_font)
  353. painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
  354. # Draw User Text
  355. painter.setFont(self.user_font)
  356. user_x = self.total_width - self.user_width - 45 + offsetX
  357. user_y = offsetY + 15
  358. user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
  359. outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
  360. self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)
  361. # Draw Assistant Text
  362. painter.setFont(self.assistant_font)
  363. assistant_x = self.total_width - self.assistant_width - 5 + offsetX
  364. assistant_y = self.user_height + offsetY + 15
  365. assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
  366. outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
  367. self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)
  368. def return_text_adjusted_to_width(self, text, font, max_width_allowed):
  369. """
  370. Line feeds are inserted so that the text width does never exceed max_width.
  371. Text is only broken up on whole words.
  372. """
  373. fm = QFontMetrics(font)
  374. words = text.split(' ')
  375. adjusted_text = ''
  376. current_line = ''
  377. max_width_used = 0
  378. for word in words:
  379. current_width = fm.width(current_line + word)
  380. if current_width <= max_width_allowed:
  381. current_line += word + ' '
  382. else:
  383. line_width = fm.width(current_line)
  384. if line_width > max_width_used:
  385. max_width_used = line_width
  386. adjusted_text += current_line + '\n'
  387. current_line = word + ' '
  388. line_width = fm.width(current_line)
  389. if line_width > max_width_used:
  390. max_width_used = line_width
  391. adjusted_text += current_line
  392. return adjusted_text.rstrip(), max_width_used
# Application entry point: create the Qt app and show the overlay window.
# Showing the window triggers showEvent, which schedules the deferred init.
if __name__ == '__main__':
    app = QApplication(sys.argv)
    window = TransparentWindow()
    window.show()
    sys.exit(app.exec_())