ui_openai_voice_interface.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517
  1. from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
  2. from RealtimeSTT import AudioToTextRecorder
  3. from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
  4. from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont, QMouseEvent, QContextMenuEvent
  5. from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget, QMenu, QAction
  6. import os
  7. import openai
  8. import sys
  9. import time
  10. import sounddevice as sd
  11. import numpy as np
  12. import wavio
  13. import keyboard
# ---------------------------------------------------------------------------
# User-tunable configuration
# ---------------------------------------------------------------------------
max_history_messages = 6                 # turns of chat history sent to OpenAI
return_to_wakewords_after_silence = 12   # seconds of silence before wakeword mode
start_with_wakeword = False              # False: listen immediately on startup
start_engine = "Azure" # Azure, Elevenlabs
recorder_model = "large-v2"              # Whisper model used by RealtimeSTT
language = "en"
# NOTE(review): apparently unused — select_engine reads the
# AZURE_SPEECH_REGION environment variable instead; confirm before removing.
azure_speech_region = "eastus"
openai_model = "gpt-3.5-turbo" # gpt-3.5-turbo, gpt-4, gpt-3.5-turbo-0613 / gpt-3.5-turbo-16k-0613 / gpt-4-0613 / gpt-4-32k-0613
openai.api_key = os.environ.get("OPENAI_API_KEY")
user_font_size = 22
user_color = QColor(0, 188, 242) # turquoise
assistant_font_size = 24
assistant_color = QColor(239, 98, 166) # pink
voice_azure = "en-GB-SoniaNeural"
voice_system = "Zira"
#voice_system = "Hazel"
prompt = "Be concise, polite, and casual with a touch of sass. Aim for short, direct responses, as if we're talking."
elevenlabs_model = "eleven_monolingual_v1"

# German overrides: swap TTS model, voices and system prompt.
if language == "de":
    elevenlabs_model = "eleven_multilingual_v1"
    voice_system = "Katja"
    voice_azure = "de-DE-MajaNeural"
    prompt = 'Sei präzise, höflich und locker, mit einer Prise Schlagfertigkeit. Antworte kurz und direkt, als ob wir gerade sprechen.'

print ("Click the top right corner to change the engine")
print ("Press ESC to stop the current playback")

# System message prepended to every OpenAI request (see generate_answer).
system_prompt_message = {
    'role': 'system',
    'content': prompt
}
  43. def generate_response(messages):
  44. """Generate assistant's response using OpenAI."""
  45. for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True, logit_bias={35309:-100, 36661:-100}):
  46. text_chunk = chunk["choices"][0]["delta"].get("content")
  47. if text_chunk:
  48. yield text_chunk
# Rolling conversation history (user/assistant messages only; the system
# prompt is prepended per request in generate_answer).
history = []

# Layout limits in pixels for the overlay window and its two text areas.
MAX_WINDOW_WIDTH = 1600
MAX_WIDTH_ASSISTANT = 1200
MAX_WIDTH_USER = 1500
  53. class AudioPlayer(QThread):
  54. def __init__(self, file_path):
  55. super(AudioPlayer, self).__init__()
  56. self.file_path = file_path
  57. def run(self):
  58. wav = wavio.read(self.file_path)
  59. sound = wav.data.astype(np.float32) / np.iinfo(np.int16).max
  60. sd.play(sound, wav.rate)
  61. sd.wait()
  62. class TextRetrievalThread(QThread):
  63. textRetrieved = pyqtSignal(str)
  64. def __init__(self, recorder):
  65. super().__init__()
  66. self.recorder = recorder
  67. self.active = False
  68. def run(self):
  69. while True:
  70. if self.active:
  71. text = self.recorder.text()
  72. self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
  73. self.textRetrieved.emit(text)
  74. self.active = False
  75. time.sleep(0.1)
  76. def activate(self):
  77. self.active = True
class TransparentWindow(QWidget):
    """Frameless, always-on-top translucent overlay that shows mic status
    symbols, the user's transcribed text, and the streamed assistant reply.
    Owns the STT recorder, the TTS stream and the text-retrieval thread."""

    # Signals so worker threads / library callbacks touch the GUI safely
    # from the Qt event loop instead of a foreign thread.
    updateUI = pyqtSignal()
    clearAssistantTextSignal = pyqtSignal()
    clearUserTextSignal = pyqtSignal()

    def __init__(self):
        super().__init__()
        self.setGeometry(1, 1, 1, 1)
        self.setWindowTitle("Transparent Window")
        self.setAttribute(Qt.WA_TranslucentBackground)
        self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)

        self.big_symbol_font = QFont('Arial', 32)
        self.small_symbol_font = QFont('Arial', 17)
        self.user_font = QFont('Arial', user_font_size)
        self.assistant_font = QFont('Arial', assistant_font_size)
        self.assistant_font.setItalic(True)

        # Raw and width-wrapped display texts.
        self.big_symbol_text = ""
        self.small_symbol_text = ""
        self.user_text = ""
        self.assistant_text = ""
        self.displayed_user_text = ""
        self.displayed_assistant_text = ""
        self.stream = None                 # TextToAudioStream, set in select_engine
        self.text_retrieval_thread = None  # created in init()

        # Timers that start the fade-out of the on-screen texts.
        self.user_text_timer = QTimer(self)
        self.assistant_text_timer = QTimer(self)
        self.user_text_timer.timeout.connect(self.clear_user_text)
        self.assistant_text_timer.timeout.connect(self.clear_assistant_text)

        self.clearUserTextSignal.connect(self.init_clear_user_text)
        self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)
        self.user_text_opacity = 255
        self.assistant_text_opacity = 255
        self.updateUI.connect(self.update_self)
        self.audio_player = None

        # Flags that keep the recursive fade callbacks running.
        self.run_fade_user = False
        self.run_fade_assistant = False

        # Engine-selection context menu (opened via the top-right corner).
        self.menu = QMenu()
        self.menu.setStyleSheet("""
            QMenu {
                background-color: black;
                color: white;
                border-radius: 10px;
            }
            QMenu::item:selected {
                background-color: #555555;
            }
        """)
        self.elevenlabs_action = QAction("Elevenlabs", self)
        self.azure_action = QAction("Azure", self)
        self.system_action = QAction("System", self)
        self.quit_action = QAction("Quit", self)
        self.menu.addAction(self.elevenlabs_action)
        self.menu.addAction(self.azure_action)
        self.menu.addAction(self.system_action)
        self.menu.addSeparator()
        self.menu.addAction(self.quit_action)
        self.elevenlabs_action.triggered.connect(lambda: self.select_engine("Elevenlabs"))
        self.azure_action.triggered.connect(lambda: self.select_engine("Azure"))
        self.system_action.triggered.connect(lambda: self.select_engine("System"))
        self.quit_action.triggered.connect(self.close_application)

    def mousePressEvent(self, event: QMouseEvent):
        """Open the engine menu when the top-right 100x100 px area is clicked."""
        if event.button() == Qt.LeftButton:
            if event.pos().x() >= self.width() - 100 and event.pos().y() <= 100:
                self.menu.exec_(self.mapToGlobal(event.pos()))

    def close_application(self):
        """Menu slot: quit the whole application."""
        QApplication.quit()

    def init(self):
        """Deferred startup (scheduled ~1 s after show by showEvent): select
        the TTS engine, build the STT recorder and the retrieval thread, and
        hook the ESC key for stopping playback."""
        self.select_engine(start_engine)
        self.recorder = AudioToTextRecorder(
            model=recorder_model,
            language=language,
            wake_words="Jarvis",
            spinner=True,
            silero_sensitivity=0.2,
            webrtc_sensitivity=3,
            on_recording_start=self.on_recording_start,
            on_vad_detect_start=self.on_vad_detect_start,
            on_wakeword_detection_start=self.on_wakeword_detection_start,
            on_transcription_start=self.on_transcription_start,
            post_speech_silence_duration=0.4,
            min_length_of_recording=0.3,
            min_gap_between_recordings=0.01,
            enable_realtime_transcription = True,
            realtime_processing_pause = 0.01,
            realtime_model_type = "tiny",
            on_realtime_transcription_stabilized=self.text_detected
        )
        if not start_with_wakeword:
            # Skip wakeword mode initially; start listening right away.
            self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
        self.text_retrieval_thread = TextRetrievalThread(self.recorder)
        self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
        self.text_retrieval_thread.start()
        self.text_retrieval_thread.activate()
        keyboard.on_press_key('esc', self.on_escape)

    def select_engine(self, engine_name):
        """Tear down the current TTS stream (if any) and rebuild it for
        engine_name: "Azure", "Elevenlabs", or anything else -> system voice."""
        if self.stream:
            self.stream.stop()
            self.stream = None
        engine = None
        if engine_name == "Azure":
            engine = AzureEngine(
                os.environ.get("AZURE_SPEECH_KEY"),
                os.environ.get("AZURE_SPEECH_REGION"),
                voice_azure,
                rate=24,
                pitch=10,
            )
        elif engine_name == "Elevenlabs":
            engine = ElevenlabsEngine(
                os.environ.get("ELEVENLABS_API_KEY"),
                model=elevenlabs_model
            )
        else:
            engine = SystemEngine(
                voice=voice_system,
                #print_installed_voices=True
            )
        self.stream = TextToAudioStream(
            engine,
            on_character=self.on_character,
            on_text_stream_stop=self.on_text_stream_stop,
            on_text_stream_start=self.on_text_stream_start,
            on_audio_stream_stop=self.on_audio_stream_stop,
            log_characters=True
        )
        sys.stdout.write('\033[K') # Clear to the end of line
        sys.stdout.write('\r') # Move the cursor to the beginning of the line
        print (f"Using {engine_name} engine")

    def text_detected(self, text):
        """Realtime transcription callback: show the stabilized partial text."""
        self.run_fade_user = False
        if self.user_text_timer.isActive():
            self.user_text_timer.stop()
        self.user_text_opacity = 255
        self.user_text = text
        self.updateUI.emit()

    def on_escape(self, e):
        # ESC key: abort any TTS playback currently in progress.
        if self.stream.is_playing():
            self.stream.stop()

    def showEvent(self, event: QEvent):
        """Show the startup symbols, then kick off init() after 1 second."""
        super().showEvent(event)
        if event.type() == QEvent.Show:
            self.set_symbols("⌛", "🚀")
            QTimer.singleShot(1000, self.init)

    def on_character(self, char):
        # TTS reports each synthesized character; mirror it on screen.
        if self.stream:
            self.assistant_text += char
            self.updateUI.emit()

    def on_text_stream_stop(self):
        """Full assistant reply is known: display it and append to history."""
        print("\"", end="", flush=True)
        if self.stream:
            assistant_response = self.stream.text()
            self.assistant_text = assistant_response
            history.append({'role': 'assistant', 'content': assistant_response})

    def on_audio_stream_stop(self):
        """Playback finished: fade the reply and listen for the next turn."""
        self.set_symbols("🎙️", "⚪")
        if self.stream:
            self.clearAssistantTextSignal.emit()
            self.text_retrieval_thread.activate()

    def generate_answer(self):
        """Send trimmed history to OpenAI and stream the reply into TTS."""
        self.run_fade_assistant = False
        if self.assistant_text_timer.isActive():
            self.assistant_text_timer.stop()
        history.append({'role': 'user', 'content': self.user_text})
        self.remove_assistant_text()
        # Only the newest max_history_messages entries are sent.
        assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
        self.stream.feed(assistant_response)
        self.stream.play_async(minimum_sentence_length=6,
                               buffer_threshold_seconds=2)

    def set_symbols(self, big_symbol, small_symbol):
        """Update the two status glyphs and request a repaint."""
        self.big_symbol_text = big_symbol
        self.small_symbol_text = small_symbol
        self.updateUI.emit()

    def on_text_stream_start(self):
        self.set_symbols("⌛", "👄")

    def process_user_text(self, user_text):
        """Final transcription arrived: display it and schedule the LLM call."""
        user_text = user_text.strip()
        if user_text:
            self.run_fade_user = False
            if self.user_text_timer.isActive():
                self.user_text_timer.stop()
            self.user_text_opacity = 255
            self.user_text = user_text
            self.clearUserTextSignal.emit()
            print (f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
            self.set_symbols("⌛", "🧠")
            # Defer so the UI can update before the blocking API call starts.
            QTimer.singleShot(100, self.generate_answer)

    def on_transcription_start(self):
        self.set_symbols("⌛", "📝")

    def on_recording_start(self):
        self.text_storage = []
        self.ongoing_sentence = ""
        self.set_symbols("🎙️", "🔴")

    def on_vad_detect_start(self):
        # Leaving wakeword (💤) or startup (🚀) mode: play the "active" chime.
        if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
            self.audio_player = AudioPlayer("active.wav")
            self.audio_player.start()
        self.set_symbols("🎙️", "⚪")

    def on_wakeword_detection_start(self):
        # Entering wakeword mode: play the "inactive" chime and show 💤.
        self.audio_player = AudioPlayer("inactive.wav")
        self.audio_player.start()
        self.set_symbols("", "💤")

    def init_clear_user_text(self):
        # (Re)start the 10 s countdown before the user text fades out.
        if self.user_text_timer.isActive():
            self.user_text_timer.stop()
        self.user_text_timer.start(10000)

    def remove_user_text(self):
        self.user_text = ""
        self.user_text_opacity = 255
        self.updateUI.emit()

    def fade_out_user_text(self):
        """Step the user-text opacity down by 5 every 50 ms until it is gone."""
        if not self.run_fade_user:
            return
        if self.user_text_opacity > 0:
            self.user_text_opacity -= 5
            self.updateUI.emit()
            QTimer.singleShot(50, self.fade_out_user_text)
        else:
            self.run_fade_user = False
            self.remove_user_text()

    def clear_user_text(self):
        """Timer slot: begin fading out the displayed user text."""
        self.user_text_timer.stop()
        if not self.user_text:
            return
        self.user_text_opacity = 255
        self.run_fade_user = True
        self.fade_out_user_text()

    def init_clear_assistant_text(self):
        # (Re)start the 10 s countdown before the assistant text fades out.
        if self.assistant_text_timer.isActive():
            self.assistant_text_timer.stop()
        self.assistant_text_timer.start(10000)

    def remove_assistant_text(self):
        self.assistant_text = ""
        self.assistant_text_opacity = 255
        self.updateUI.emit()

    def fade_out_assistant_text(self):
        """Step the assistant-text opacity down by 5 every 50 ms until gone."""
        if not self.run_fade_assistant:
            return
        if self.assistant_text_opacity > 0:
            self.assistant_text_opacity -= 5
            self.updateUI.emit()
            QTimer.singleShot(50, self.fade_out_assistant_text)
        else:
            self.run_fade_assistant = False
            self.remove_assistant_text()

    def clear_assistant_text(self):
        """Timer slot: begin fading out the displayed assistant text."""
        self.assistant_text_timer.stop()
        if not self.assistant_text:
            return
        self.assistant_text_opacity = 255
        self.run_fade_assistant = True
        self.fade_out_assistant_text()

    def update_self(self):
        """Recompute wrapped texts and window geometry, then repaint."""
        self.blockSignals(True)
        self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
        self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)
        fm_symbol = QFontMetrics(self.big_symbol_font)
        self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
        self.symbol_height = fm_symbol.height() + 8
        self.total_width = MAX_WINDOW_WIDTH
        fm_user = QFontMetrics(self.user_font)
        user_text_lines = (self.displayed_user_text.count("\n") + 1)
        self.user_height = fm_user.height() * user_text_lines + 7
        fm_assistant = QFontMetrics(self.assistant_font)
        assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
        self.assistant_height = fm_assistant.height() * assistant_text_lines + 18
        self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])
        # Pin the window to the top-right of the primary screen.
        desktop = QDesktopWidget()
        screen_rect = desktop.availableGeometry(desktop.primaryScreen())
        self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)
        self.blockSignals(False)
        self.update()

    def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
        """Draw text with an outline by painting 8 offset copies first."""
        painter.setPen(outlineColor)
        for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
                       (-outline_size, -outline_size), (outline_size, -outline_size),
                       (-outline_size, outline_size), (outline_size, outline_size)]:
            painter.drawText(x + dx, y + dy, width, height, alignment, text)
        painter.setPen(textColor)
        painter.drawText(x, y, width, height, alignment, text)

    def paintEvent(self, event):
        """Render the status symbols, user text and assistant text, all
        right-aligned against the window's right edge."""
        painter = QPainter(self)
        offsetX = 4
        offsetY = 5
        painter.setPen(QColor(255, 255, 255))
        # Draw symbol
        painter.setFont(self.big_symbol_font)
        if self.big_symbol_text:
            painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
            painter.setFont(self.small_symbol_font)
            painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
        else:
            painter.setFont(self.small_symbol_font)
            painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
        # Draw User Text
        painter.setFont(self.user_font)
        user_x = self.total_width - self.user_width - 45 + offsetX
        user_y = offsetY + 15
        user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
        outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
        self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)
        # Draw Assistant Text
        painter.setFont(self.assistant_font)
        assistant_x = self.total_width - self.assistant_width - 5 + offsetX
        assistant_y = self.user_height + offsetY + 15
        assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
        outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
        self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)

    def return_text_adjusted_to_width(self, text, font, max_width_allowed):
        """
        Line feeds are inserted so that the text width does never exceed max_width.
        Text is only broken up on whole words.
        """
        fm = QFontMetrics(font)
        words = text.split(' ')
        adjusted_text = ''
        current_line = ''
        max_width_used = 0
        for word in words:
            current_width = fm.width(current_line + word)
            if current_width <= max_width_allowed:
                current_line += word + ' '
            else:
                # Current line is full: flush it and start a new one.
                line_width = fm.width(current_line)
                if line_width > max_width_used:
                    max_width_used = line_width
                adjusted_text += current_line + '\n'
                current_line = word + ' '
        line_width = fm.width(current_line)
        if line_width > max_width_used:
            max_width_used = line_width
        adjusted_text += current_line
        # Returns (wrapped text, widest line width in pixels).
        return adjusted_text.rstrip(), max_width_used
if __name__ == '__main__':
    app = QApplication(sys.argv)
    window = TransparentWindow()
    # show() triggers showEvent, which schedules TransparentWindow.init().
    window.show()
    sys.exit(app.exec_())