# ui_openai_voice_interface.py

if __name__ == '__main__':
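    # Everything lives under the __main__ guard so that any worker processes
    # the speech recorder spawns don't re-run this module's side effects on import.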
    from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
    from RealtimeSTT import AudioToTextRecorder

    from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
    from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont, QMouseEvent, QContextMenuEvent
    from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget, QMenu, QAction

    import os
    import openai
    import sys
    import time
    import sounddevice as sd
    import numpy as np
    import wavio
    import keyboard
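
    # --- User-tunable settings -------------------------------------------------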
    max_history_messages = 6
    return_to_wakewords_after_silence = 12
    start_with_wakeword = False
    start_engine = "Azure"  # Azure, Elevenlabs, System
    recorder_model = "large-v2"
    language = "en"
    azure_speech_region = "eastus"  # note: AzureEngine below reads AZURE_SPEECH_REGION from the environment
    openai_model = "gpt-3.5-turbo"  # gpt-3.5-turbo, gpt-4, gpt-3.5-turbo-0613, gpt-3.5-turbo-16k-0613, gpt-4-0613, gpt-4-32k-0613
    openai.api_key = os.environ.get("OPENAI_API_KEY")
    user_font_size = 22
    user_color = QColor(0, 188, 242)  # turquoise
    assistant_font_size = 24
    assistant_color = QColor(239, 98, 166)  # pink

    voice_azure = "en-GB-SoniaNeural"
    voice_system = "Zira"
    #voice_system = "Hazel"
    prompt = "Be concise, polite, and casual with a touch of sass. Aim for short, direct responses, as if we're talking."
    elevenlabs_model = "eleven_monolingual_v1"

    if language == "de":
        elevenlabs_model = "eleven_multilingual_v1"
        voice_system = "Katja"
        voice_azure = "de-DE-MajaNeural"
        # German version of the English prompt above
        prompt = 'Sei präzise, höflich und locker, mit einer Prise Schlagfertigkeit. Antworte kurz und direkt, als ob wir gerade sprechen.'

    print("Click the top right corner to change the engine")
    print("Press ESC to stop the current playback")

    system_prompt_message = {
        'role': 'system',
        'content': prompt
    }
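
    # Streams the assistant reply chunk by chunk so the TTS stream can start
    # speaking before the full response has arrived.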
    def generate_response(messages):
        """Generate the assistant's response as a stream of text chunks."""
        # Uses the legacy (pre-1.0) openai ChatCompletion API; the logit_bias
        # entries down-weight two specific token ids.
        for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True, logit_bias={35309: -100, 36661: -100}):
            text_chunk = chunk["choices"][0]["delta"].get("content")
            if text_chunk:
                yield text_chunk
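
    # Rolling chat transcript; only the last max_history_messages entries are
    # sent along with the system prompt (see generate_answer below).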
    history = []
    MAX_WINDOW_WIDTH = 1600
    MAX_WIDTH_ASSISTANT = 1200
    MAX_WIDTH_USER = 1500
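
    # Plays a short notification wav on a worker thread so the UI never blocks.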
    class AudioPlayer(QThread):
        def __init__(self, file_path):
            super(AudioPlayer, self).__init__()
            self.file_path = file_path

        def run(self):
            wav = wavio.read(self.file_path)
            # Normalize 16-bit PCM to float32 in [-1.0, 1.0] for sounddevice.
            sound = wav.data.astype(np.float32) / np.iinfo(np.int16).max
            sd.play(sound, wav.rate)
            sd.wait()
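
    # Polls the recorder on a background thread; recorder.text() blocks until a
    # full utterance has been transcribed, so it must stay off the UI thread.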
    class TextRetrievalThread(QThread):
        textRetrieved = pyqtSignal(str)

        def __init__(self, recorder):
            super().__init__()
            self.recorder = recorder
            self.active = False

        def run(self):
            while True:
                if self.active:
                    text = self.recorder.text()
                    self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
                    self.textRetrieved.emit(text)
                    self.active = False
                time.sleep(0.1)

        def activate(self):
            self.active = True
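
    # Frameless, always-on-top overlay that draws the status symbols, the user's
    # transcription and the assistant's reply into the top-right screen corner.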
    class TransparentWindow(QWidget):
        updateUI = pyqtSignal()
        clearAssistantTextSignal = pyqtSignal()
        clearUserTextSignal = pyqtSignal()

        def __init__(self):
            super().__init__()
            self.setGeometry(1, 1, 1, 1)

            self.setWindowTitle("Transparent Window")
            self.setAttribute(Qt.WA_TranslucentBackground)
            self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)

            self.big_symbol_font = QFont('Arial', 32)
            self.small_symbol_font = QFont('Arial', 17)
            self.user_font = QFont('Arial', user_font_size)
            self.assistant_font = QFont('Arial', assistant_font_size)
            self.assistant_font.setItalic(True)

            self.big_symbol_text = ""
            self.small_symbol_text = ""
            self.user_text = ""
            self.assistant_text = ""
            self.displayed_user_text = ""
            self.displayed_assistant_text = ""
            self.recorder = None  # created in init(); predeclared so closeEvent()/close_application() are safe before then
            self.stream = None
            self.text_retrieval_thread = None

            self.user_text_timer = QTimer(self)
            self.assistant_text_timer = QTimer(self)
            self.user_text_timer.timeout.connect(self.clear_user_text)
            self.assistant_text_timer.timeout.connect(self.clear_assistant_text)

            self.clearUserTextSignal.connect(self.init_clear_user_text)
            self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)
            self.user_text_opacity = 255
            self.assistant_text_opacity = 255
            self.updateUI.connect(self.update_self)
            self.audio_player = None
            self.run_fade_user = False
            self.run_fade_assistant = False

            self.menu = QMenu()
            self.menu.setStyleSheet("""
                QMenu {
                    background-color: black;
                    color: white;
                    border-radius: 10px;
                }
                QMenu::item:selected {
                    background-color: #555555;
                }
            """)

            self.elevenlabs_action = QAction("Elevenlabs", self)
            self.azure_action = QAction("Azure", self)
            self.system_action = QAction("System", self)
            self.quit_action = QAction("Quit", self)

            self.menu.addAction(self.elevenlabs_action)
            self.menu.addAction(self.azure_action)
            self.menu.addAction(self.system_action)
            self.menu.addSeparator()
            self.menu.addAction(self.quit_action)

            self.elevenlabs_action.triggered.connect(lambda: self.select_engine("Elevenlabs"))
            self.azure_action.triggered.connect(lambda: self.select_engine("Azure"))
            self.system_action.triggered.connect(lambda: self.select_engine("System"))
            self.quit_action.triggered.connect(self.close_application)

        def mousePressEvent(self, event: QMouseEvent):
            if event.button() == Qt.LeftButton:
                # The top-right 100x100 px corner opens the engine menu.
                if event.pos().x() >= self.width() - 100 and event.pos().y() <= 100:
                    self.menu.exec_(self.mapToGlobal(event.pos()))

        def close_application(self):
            if self.recorder:
                self.recorder.shutdown()
            QApplication.quit()

        def init(self):
            self.select_engine(start_engine)

            # recorder = AudioToTextRecorder(spinner=False, model="large-v2", language="de", on_recording_start=recording_start, silero_sensitivity=0.4, post_speech_silence_duration=0.4, min_length_of_recording=0.3, min_gap_between_recordings=0.01, realtime_preview_resolution = 0.01, realtime_preview = True, realtime_preview_model = "small", on_realtime_preview=text_detected)

            self.recorder = AudioToTextRecorder(
                model=recorder_model,
                language=language,
                wake_words="Jarvis",
                silero_use_onnx=False,
                spinner=True,
                silero_sensitivity=0.2,
                webrtc_sensitivity=3,
                on_recording_start=self.on_recording_start,
                on_vad_detect_start=self.on_vad_detect_start,
                on_wakeword_detection_start=self.on_wakeword_detection_start,
                on_transcription_start=self.on_transcription_start,
                post_speech_silence_duration=0.4,
                min_length_of_recording=0.3,
                min_gap_between_recordings=0.01,
                enable_realtime_transcription=True,
                realtime_processing_pause=0.01,
                realtime_model_type="tiny",
                on_realtime_transcription_stabilized=self.text_detected,
            )

            if not start_with_wakeword:
                self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence

            self.text_retrieval_thread = TextRetrievalThread(self.recorder)
            self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
            self.text_retrieval_thread.start()
            self.text_retrieval_thread.activate()

            keyboard.on_press_key('esc', self.on_escape)

        def closeEvent(self, event):
            if self.recorder:
                self.recorder.shutdown()

        def select_engine(self, engine_name):
            if self.stream:
                self.stream.stop()
                self.stream = None

            engine = None
            if engine_name == "Azure":
                engine = AzureEngine(
                    os.environ.get("AZURE_SPEECH_KEY"),
                    os.environ.get("AZURE_SPEECH_REGION"),
                    voice_azure,
                    rate=24,
                    pitch=10,
                )
            elif engine_name == "Elevenlabs":
                engine = ElevenlabsEngine(
                    os.environ.get("ELEVENLABS_API_KEY"),
                    model=elevenlabs_model
                )
            else:
                engine = SystemEngine(
                    voice=voice_system,
                    #print_installed_voices=True
                )

            self.stream = TextToAudioStream(
                engine,
                on_character=self.on_character,
                on_text_stream_stop=self.on_text_stream_stop,
                on_text_stream_start=self.on_text_stream_start,
                on_audio_stream_stop=self.on_audio_stream_stop,
                log_characters=True
            )
            sys.stdout.write('\033[K')  # Clear to the end of line
            sys.stdout.write('\r')  # Move the cursor to the beginning of the line
            print(f"Using {engine_name} engine")

        def text_detected(self, text):
            self.run_fade_user = False
            if self.user_text_timer.isActive():
                self.user_text_timer.stop()
            self.user_text_opacity = 255
            self.user_text = text
            self.updateUI.emit()

        def on_escape(self, e):
            # Guard against ESC being pressed before the TTS stream exists.
            if self.stream and self.stream.is_playing():
                self.stream.stop()

        def showEvent(self, event: QEvent):
            super().showEvent(event)
            if event.type() == QEvent.Show:
                self.set_symbols("⌛", "🚀")
                QTimer.singleShot(1000, self.init)

        def on_character(self, char):
            if self.stream:
                self.assistant_text += char
                self.updateUI.emit()

        def on_text_stream_stop(self):
            print("\"", end="", flush=True)  # closes the quote opened in process_user_text()
            if self.stream:
                assistant_response = self.stream.text()
                self.assistant_text = assistant_response
                history.append({'role': 'assistant', 'content': assistant_response})

        def on_audio_stream_stop(self):
            self.set_symbols("🎙️", "⚪")
            if self.stream:
                self.clearAssistantTextSignal.emit()
                self.text_retrieval_thread.activate()
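
        # Pipeline: user text -> history -> streamed OpenAI completion -> TTS.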
        def generate_answer(self):
            self.run_fade_assistant = False
            if self.assistant_text_timer.isActive():
                self.assistant_text_timer.stop()

            history.append({'role': 'user', 'content': self.user_text})
            self.remove_assistant_text()
            assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
            self.stream.feed(assistant_response)
            self.stream.play_async(minimum_sentence_length=6,
                                   buffer_threshold_seconds=2)

        def set_symbols(self, big_symbol, small_symbol):
            self.big_symbol_text = big_symbol
            self.small_symbol_text = small_symbol
            self.updateUI.emit()

        def on_text_stream_start(self):
            self.set_symbols("⌛", "👄")

        def process_user_text(self, user_text):
            user_text = user_text.strip()
            if user_text:
                self.run_fade_user = False
                if self.user_text_timer.isActive():
                    self.user_text_timer.stop()

                self.user_text_opacity = 255
                self.user_text = user_text
                self.clearUserTextSignal.emit()
                print(f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
                self.set_symbols("⌛", "🧠")
                QTimer.singleShot(100, self.generate_answer)

        def on_transcription_start(self):
            self.set_symbols("⌛", "📝")

        def on_recording_start(self):
            self.text_storage = []
            self.ongoing_sentence = ""
            self.set_symbols("🎙️", "🔴")

        def on_vad_detect_start(self):
            if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
                # 'active.wav' must exist in the working directory.
                self.audio_player = AudioPlayer("active.wav")
                self.audio_player.start()
            self.set_symbols("🎙️", "⚪")

        def on_wakeword_detection_start(self):
            # 'inactive.wav' must exist in the working directory.
            self.audio_player = AudioPlayer("inactive.wav")
            self.audio_player.start()
            self.set_symbols("", "💤")
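
        # Fade-out machinery: a 10 s timer arms the fade, then opacity steps
        # down by 5/255 every 50 ms (about 2.5 s to fully transparent).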
        def init_clear_user_text(self):
            if self.user_text_timer.isActive():
                self.user_text_timer.stop()
            self.user_text_timer.start(10000)

        def remove_user_text(self):
            self.user_text = ""
            self.user_text_opacity = 255
            self.updateUI.emit()

        def fade_out_user_text(self):
            if not self.run_fade_user:
                return

            if self.user_text_opacity > 0:
                self.user_text_opacity -= 5
                self.updateUI.emit()
                QTimer.singleShot(50, self.fade_out_user_text)
            else:
                self.run_fade_user = False
                self.remove_user_text()

        def clear_user_text(self):
            self.user_text_timer.stop()

            if not self.user_text:
                return

            self.user_text_opacity = 255
            self.run_fade_user = True
            self.fade_out_user_text()

        def init_clear_assistant_text(self):
            if self.assistant_text_timer.isActive():
                self.assistant_text_timer.stop()
            self.assistant_text_timer.start(10000)

        def remove_assistant_text(self):
            self.assistant_text = ""
            self.assistant_text_opacity = 255
            self.updateUI.emit()

        def fade_out_assistant_text(self):
            if not self.run_fade_assistant:
                return

            if self.assistant_text_opacity > 0:
                self.assistant_text_opacity -= 5
                self.updateUI.emit()
                QTimer.singleShot(50, self.fade_out_assistant_text)
            else:
                self.run_fade_assistant = False
                self.remove_assistant_text()

        def clear_assistant_text(self):
            self.assistant_text_timer.stop()

            if not self.assistant_text:
                return

            self.assistant_text_opacity = 255
            self.run_fade_assistant = True
            self.fade_out_assistant_text()
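
        # Recompute wrapped text and window geometry, then snap the overlay to
        # the top-right corner of the primary screen.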
        def update_self(self):
            self.blockSignals(True)

            self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
            self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)

            fm_symbol = QFontMetrics(self.big_symbol_font)
            self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
            self.symbol_height = fm_symbol.height() + 8

            self.total_width = MAX_WINDOW_WIDTH

            fm_user = QFontMetrics(self.user_font)
            user_text_lines = (self.displayed_user_text.count("\n") + 1)
            self.user_height = fm_user.height() * user_text_lines + 7

            fm_assistant = QFontMetrics(self.assistant_font)
            assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
            self.assistant_height = fm_assistant.height() * assistant_text_lines + 18

            self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])

            desktop = QDesktopWidget()
            screen_rect = desktop.availableGeometry(desktop.primaryScreen())
            self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)

            self.blockSignals(False)
            self.update()
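
        # Simple text outline: draw the text eight times, offset in each
        # direction in the outline color, then once on top in the fill color.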
        def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
            painter.setPen(outlineColor)
            for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
                           (-outline_size, -outline_size), (outline_size, -outline_size),
                           (-outline_size, outline_size), (outline_size, outline_size)]:
                painter.drawText(x + dx, y + dy, width, height, alignment, text)

            painter.setPen(textColor)
            painter.drawText(x, y, width, height, alignment, text)

        def paintEvent(self, event):
            painter = QPainter(self)

            offsetX = 4
            offsetY = 5

            painter.setPen(QColor(255, 255, 255))

            # Draw the big symbol with the small symbol tucked into its corner
            painter.setFont(self.big_symbol_font)
            if self.big_symbol_text:
                painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
                painter.setFont(self.small_symbol_font)
                painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
            else:
                painter.setFont(self.small_symbol_font)
                painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)

            # Draw user text
            painter.setFont(self.user_font)
            user_x = self.total_width - self.user_width - 45 + offsetX
            user_y = offsetY + 15
            user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
            outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
            self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)

            # Draw assistant text below the user text
            painter.setFont(self.assistant_font)
            assistant_x = self.total_width - self.assistant_width - 5 + offsetX
            assistant_y = self.user_height + offsetY + 15
            assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
            outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
            self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)

        def return_text_adjusted_to_width(self, text, font, max_width_allowed):
            """
            Inserts line feeds so that the rendered text width never exceeds
            max_width_allowed. Text is only broken at whole words.
            """
            fm = QFontMetrics(font)
            words = text.split(' ')
            adjusted_text = ''
            current_line = ''
            max_width_used = 0

            for word in words:
                current_width = fm.width(current_line + word)
                if current_width <= max_width_allowed:
                    current_line += word + ' '
                else:
                    line_width = fm.width(current_line)
                    if line_width > max_width_used:
                        max_width_used = line_width
                    adjusted_text += current_line + '\n'
                    current_line = word + ' '

            line_width = fm.width(current_line)
            if line_width > max_width_used:
                max_width_used = line_width
            adjusted_text += current_line

            return adjusted_text.rstrip(), max_width_used
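
    # Start the Qt event loop; the recorder and TTS engine are initialized one
    # second after the window first shows (see showEvent -> init).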
    app = QApplication(sys.argv)

    window = TransparentWindow()
    window.show()

    sys.exit(app.exec_())