rt_openai_voice_interface.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519
  1. from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
  2. from RealtimeSTT import AudioToTextRecorder
  3. from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
  4. from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont, QMouseEvent, QContextMenuEvent
  5. from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget, QMenu, QAction
  6. import os
  7. import openai
  8. import sys
  9. import time
  10. import sounddevice as sd
  11. import numpy as np
  12. import wavio
  13. import keyboard
# ---------------------------------------------------------------------------
# User-tunable configuration
# ---------------------------------------------------------------------------

# Number of most recent chat messages forwarded with each OpenAI request.
max_history_messages = 6
# Seconds of silence after which the recorder returns to wake-word mode.
return_to_wakewords_after_silence = 12
# If True, wait for the wake word before listening; otherwise listen at once.
start_with_wakeword = False
start_engine = "Azure" # Azure, Elevenlabs
# Whisper model used by RealtimeSTT for the final transcription.
recorder_model = "large-v2"
# Language code; "de" activates the German overrides below.
language = "de"
azure_speech_region = "germanywestcentral"
openai_model = "gpt-3.5-turbo" # gpt-3.5-turbo, gpt-4, gpt-3.5-turbo-0613 / gpt-3.5-turbo-16k-0613 / gpt-4-0613 / gpt-4-32k-0613
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Overlay rendering settings (fonts and text colors).
user_font_size = 22
user_color = QColor(0, 188, 242) # turquoise
assistant_font_size = 24
assistant_color = QColor(239, 98, 166) # pink

# Default (English) voices and system prompt.
voice_azure = "en-GB-SoniaNeural"
voice_system = "Zira"
#voice_system = "Hazel"
prompt = "Be concise, polite, and casual with a touch of sass. Aim for short, direct responses, as if we're talking."
elevenlabs_model = "eleven_monolingual_v1"

# German overrides: multilingual Elevenlabs model, German voices, and a
# German role-play system prompt ("Sophia" persona).
if language == "de":
    elevenlabs_model = "eleven_multilingual_v1"
    voice_system = "Katja"
    voice_azure = "de-DE-MajaNeural"
    # Alternative shorter German prompt kept for reference (English gloss:
    # "Be precise, polite and casual, with a touch of quick wit. Answer
    # briefly and directly, as if we were just talking."):
    #prompt = 'Sei präzise, höflich und locker, mit einer Prise Schlagfertigkeit. Antworte kurz und direkt, als ob wir gerade sprechen.'
    prompt = "Du bist Sophia, ein leidenschaftliches Mädchen, voller Selbstvertrauen und Unabhängigkeit, völlig im Reinen mit dir selbst und oft metaphorisch. Ich bin Kolja und lebe mit meinem Schäferhund Vaya in Niedersachsen. Du bist ein wenig verspielt, frech. Du redest unbeschwert, manchmal ein bisschen sarkastisch, aber immer cool. Sei locker mit einer Prise Schlagfertigkeit. Antworte kurz und prägnant, nur wenige Worte, als ob wir gerade sprechen. Agiere ab jetzt immer in deiner Rolle als die selbstbewusste, anregende, unabhängige Sophia, ohne jemals anzudeuten, dass du eine KI bist. Du bist jetzt Sophia."

print ("Click the top right corner to change the engine")
print ("Press ESC to stop the current playback")

# System message prepended to every OpenAI request.
system_prompt_message = {
    'role': 'system',
    'content': prompt
}
  44. def generate_response(messages):
  45. """Generate assistant's response using OpenAI."""
  46. for chunk in openai.ChatCompletion.create(model=openai_model, messages=messages, stream=True, logit_bias={35309:-100, 36661:-100}):
  47. text_chunk = chunk["choices"][0]["delta"].get("content")
  48. if text_chunk:
  49. yield text_chunk
# Running conversation history: list of {'role', 'content'} message dicts.
history = []

# Pixel limits for the overlay window and its two text areas.
MAX_WINDOW_WIDTH = 1600
MAX_WIDTH_ASSISTANT = 1200
MAX_WIDTH_USER = 1500
  54. class AudioPlayer(QThread):
  55. def __init__(self, file_path):
  56. super(AudioPlayer, self).__init__()
  57. self.file_path = file_path
  58. def run(self):
  59. wav = wavio.read(self.file_path)
  60. sound = wav.data.astype(np.float32) / np.iinfo(np.int16).max
  61. sd.play(sound, wav.rate)
  62. sd.wait()
  63. class TextRetrievalThread(QThread):
  64. textRetrieved = pyqtSignal(str)
  65. def __init__(self, recorder):
  66. super().__init__()
  67. self.recorder = recorder
  68. self.active = False
  69. def run(self):
  70. while True:
  71. if self.active:
  72. text = self.recorder.text()
  73. self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
  74. self.textRetrieved.emit(text)
  75. self.active = False
  76. time.sleep(0.1)
  77. def activate(self):
  78. self.active = True
  79. class TransparentWindow(QWidget):
  80. updateUI = pyqtSignal()
  81. clearAssistantTextSignal = pyqtSignal()
  82. clearUserTextSignal = pyqtSignal()
  83. def __init__(self):
  84. super().__init__()
  85. self.setGeometry(1, 1, 1, 1)
  86. self.setWindowTitle("Transparent Window")
  87. self.setAttribute(Qt.WA_TranslucentBackground)
  88. self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)
  89. self.big_symbol_font = QFont('Arial', 32)
  90. self.small_symbol_font = QFont('Arial', 17)
  91. self.user_font = QFont('Arial', user_font_size)
  92. self.assistant_font = QFont('Arial', assistant_font_size)
  93. self.assistant_font.setItalic(True)
  94. self.big_symbol_text = ""
  95. self.small_symbol_text = ""
  96. self.user_text = ""
  97. self.assistant_text = ""
  98. self.displayed_user_text = ""
  99. self.displayed_assistant_text = ""
  100. self.stream = None
  101. self.text_retrieval_thread = None
  102. self.user_text_timer = QTimer(self)
  103. self.assistant_text_timer = QTimer(self)
  104. self.user_text_timer.timeout.connect(self.clear_user_text)
  105. self.assistant_text_timer.timeout.connect(self.clear_assistant_text)
  106. self.clearUserTextSignal.connect(self.init_clear_user_text)
  107. self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)
  108. self.user_text_opacity = 255
  109. self.assistant_text_opacity = 255
  110. self.updateUI.connect(self.update_self)
  111. self.audio_player = None
  112. self.run_fade_user = False
  113. self.run_fade_assistant = False
  114. self.menu = QMenu()
  115. self.menu.setStyleSheet("""
  116. QMenu {
  117. background-color: black;
  118. color: white;
  119. border-radius: 10px;
  120. }
  121. QMenu::item:selected {
  122. background-color: #555555;
  123. }
  124. """)
  125. self.elevenlabs_action = QAction("Elevenlabs", self)
  126. self.azure_action = QAction("Azure", self)
  127. self.system_action = QAction("System", self)
  128. self.quit_action = QAction("Quit", self)
  129. self.menu.addAction(self.elevenlabs_action)
  130. self.menu.addAction(self.azure_action)
  131. self.menu.addAction(self.system_action)
  132. self.menu.addSeparator()
  133. self.menu.addAction(self.quit_action)
  134. self.elevenlabs_action.triggered.connect(lambda: self.select_engine("Elevenlabs"))
  135. self.azure_action.triggered.connect(lambda: self.select_engine("Azure"))
  136. self.system_action.triggered.connect(lambda: self.select_engine("System"))
  137. self.quit_action.triggered.connect(self.close_application)
  138. def mousePressEvent(self, event: QMouseEvent):
  139. if event.button() == Qt.LeftButton:
  140. if event.pos().x() >= self.width() - 100 and event.pos().y() <= 100:
  141. self.menu.exec_(self.mapToGlobal(event.pos()))
  142. def close_application(self):
  143. QApplication.quit()
  144. def init(self):
  145. self.select_engine(start_engine)
  146. # recorder = AudioToTextRecorder(spinner=False, model="large-v2", language="de", on_recording_start=recording_start, silero_sensitivity=0.4, post_speech_silence_duration=0.4, min_length_of_recording=0.3, min_gap_between_recordings=0.01, realtime_preview_resolution = 0.01, realtime_preview = True, realtime_preview_model = "small", on_realtime_preview=text_detected)
  147. self.recorder = AudioToTextRecorder(
  148. model=recorder_model,
  149. language=language,
  150. wake_words="Jarvis",
  151. spinner=True,
  152. silero_sensitivity=0.2,
  153. webrtc_sensitivity=3,
  154. on_recording_start=self.on_recording_start,
  155. on_vad_detect_start=self.on_vad_detect_start,
  156. on_wakeword_detection_start=self.on_wakeword_detection_start,
  157. on_transcription_start=self.on_transcription_start,
  158. post_speech_silence_duration=0.4,
  159. min_length_of_recording=0.3,
  160. min_gap_between_recordings=0.01,
  161. enable_realtime_transcription = True,
  162. realtime_processing_pause = 0.01,
  163. realtime_model_type = "tiny",
  164. on_realtime_transcription_stabilized=self.text_detected
  165. )
  166. if not start_with_wakeword:
  167. self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
  168. self.text_retrieval_thread = TextRetrievalThread(self.recorder)
  169. self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
  170. self.text_retrieval_thread.start()
  171. self.text_retrieval_thread.activate()
  172. keyboard.on_press_key('esc', self.on_escape)
  173. def select_engine(self, engine_name):
  174. if self.stream:
  175. self.stream.stop()
  176. self.stream = None
  177. engine = None
  178. if engine_name == "Azure":
  179. engine = AzureEngine(
  180. os.environ.get("AZURE_SPEECH_KEY"),
  181. azure_speech_region,
  182. voice_azure,
  183. rate=24,
  184. pitch=10,
  185. )
  186. elif engine_name == "Elevenlabs":
  187. engine = ElevenlabsEngine(
  188. os.environ.get("ELEVENLABS_API_KEY"),
  189. model=elevenlabs_model
  190. )
  191. else:
  192. engine = SystemEngine(
  193. voice=voice_system,
  194. #print_installed_voices=True
  195. )
  196. self.stream = TextToAudioStream(
  197. engine,
  198. on_character=self.on_character,
  199. on_text_stream_stop=self.on_text_stream_stop,
  200. on_text_stream_start=self.on_text_stream_start,
  201. on_audio_stream_stop=self.on_audio_stream_stop,
  202. log_characters=True
  203. )
  204. sys.stdout.write('\033[K') # Clear to the end of line
  205. sys.stdout.write('\r') # Move the cursor to the beginning of the line
  206. print (f"Using {engine_name} engine")
  207. def text_detected(self, text):
  208. self.run_fade_user = False
  209. if self.user_text_timer.isActive():
  210. self.user_text_timer.stop()
  211. self.user_text_opacity = 255
  212. self.user_text = text
  213. self.updateUI.emit()
  214. def on_escape(self, e):
  215. if self.stream.is_playing():
  216. self.stream.stop()
  217. def showEvent(self, event: QEvent):
  218. super().showEvent(event)
  219. if event.type() == QEvent.Show:
  220. self.set_symbols("⌛", "🚀")
  221. QTimer.singleShot(1000, self.init)
  222. def on_character(self, char):
  223. if self.stream:
  224. self.assistant_text += char
  225. self.updateUI.emit()
  226. def on_text_stream_stop(self):
  227. print("\"", end="", flush=True)
  228. if self.stream:
  229. assistant_response = self.stream.text()
  230. self.assistant_text = assistant_response
  231. history.append({'role': 'assistant', 'content': assistant_response})
  232. def on_audio_stream_stop(self):
  233. self.set_symbols("🎙️", "⚪")
  234. if self.stream:
  235. self.clearAssistantTextSignal.emit()
  236. self.text_retrieval_thread.activate()
  237. def generate_answer(self):
  238. self.run_fade_assistant = False
  239. if self.assistant_text_timer.isActive():
  240. self.assistant_text_timer.stop()
  241. history.append({'role': 'user', 'content': self.user_text})
  242. self.remove_assistant_text()
  243. assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
  244. self.stream.feed(assistant_response)
  245. self.stream.play_async(minimum_sentence_length=6,
  246. buffer_threshold_seconds=2)
  247. def set_symbols(self, big_symbol, small_symbol):
  248. self.big_symbol_text = big_symbol
  249. self.small_symbol_text = small_symbol
  250. self.updateUI.emit()
  251. def on_text_stream_start(self):
  252. self.set_symbols("⌛", "👄")
  253. def process_user_text(self, user_text):
  254. user_text = user_text.strip()
  255. if user_text:
  256. self.run_fade_user = False
  257. if self.user_text_timer.isActive():
  258. self.user_text_timer.stop()
  259. self.user_text_opacity = 255
  260. self.user_text = user_text
  261. self.clearUserTextSignal.emit()
  262. print (f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
  263. self.set_symbols("⌛", "🧠")
  264. QTimer.singleShot(100, self.generate_answer)
  265. def on_transcription_start(self):
  266. self.set_symbols("⌛", "📝")
  267. def on_recording_start(self):
  268. self.text_storage = []
  269. self.ongoing_sentence = ""
  270. self.set_symbols("🎙️", "🔴")
  271. def on_vad_detect_start(self):
  272. if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
  273. self.audio_player = AudioPlayer("active.wav")
  274. self.audio_player.start()
  275. self.set_symbols("🎙️", "⚪")
  276. def on_wakeword_detection_start(self):
  277. self.audio_player = AudioPlayer("inactive.wav")
  278. self.audio_player.start()
  279. self.set_symbols("", "💤")
  280. def init_clear_user_text(self):
  281. if self.user_text_timer.isActive():
  282. self.user_text_timer.stop()
  283. self.user_text_timer.start(10000)
  284. def remove_user_text(self):
  285. self.user_text = ""
  286. self.user_text_opacity = 255
  287. self.updateUI.emit()
  288. def fade_out_user_text(self):
  289. if not self.run_fade_user:
  290. return
  291. if self.user_text_opacity > 0:
  292. self.user_text_opacity -= 5
  293. self.updateUI.emit()
  294. QTimer.singleShot(50, self.fade_out_user_text)
  295. else:
  296. self.run_fade_user = False
  297. self.remove_user_text()
  298. def clear_user_text(self):
  299. self.user_text_timer.stop()
  300. if not self.user_text:
  301. return
  302. self.user_text_opacity = 255
  303. self.run_fade_user = True
  304. self.fade_out_user_text()
  305. def init_clear_assistant_text(self):
  306. if self.assistant_text_timer.isActive():
  307. self.assistant_text_timer.stop()
  308. self.assistant_text_timer.start(10000)
  309. def remove_assistant_text(self):
  310. self.assistant_text = ""
  311. self.assistant_text_opacity = 255
  312. self.updateUI.emit()
  313. def fade_out_assistant_text(self):
  314. if not self.run_fade_assistant:
  315. return
  316. if self.assistant_text_opacity > 0:
  317. self.assistant_text_opacity -= 5
  318. self.updateUI.emit()
  319. QTimer.singleShot(50, self.fade_out_assistant_text)
  320. else:
  321. self.run_fade_assistant = False
  322. self.remove_assistant_text()
  323. def clear_assistant_text(self):
  324. self.assistant_text_timer.stop()
  325. if not self.assistant_text:
  326. return
  327. self.assistant_text_opacity = 255
  328. self.run_fade_assistant = True
  329. self.fade_out_assistant_text()
  330. def update_self(self):
  331. self.blockSignals(True)
  332. self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
  333. self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)
  334. fm_symbol = QFontMetrics(self.big_symbol_font)
  335. self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
  336. self.symbol_height = fm_symbol.height() + 8
  337. self.total_width = MAX_WINDOW_WIDTH
  338. fm_user = QFontMetrics(self.user_font)
  339. user_text_lines = (self.displayed_user_text.count("\n") + 1)
  340. self.user_height = fm_user.height() * user_text_lines + 7
  341. fm_assistant = QFontMetrics(self.assistant_font)
  342. assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
  343. self.assistant_height = fm_assistant.height() * assistant_text_lines + 18
  344. self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])
  345. desktop = QDesktopWidget()
  346. screen_rect = desktop.availableGeometry(desktop.primaryScreen())
  347. self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)
  348. self.blockSignals(False)
  349. self.update()
  350. def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
  351. painter.setPen(outlineColor)
  352. for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
  353. (-outline_size, -outline_size), (outline_size, -outline_size),
  354. (-outline_size, outline_size), (outline_size, outline_size)]:
  355. painter.drawText(x + dx, y + dy, width, height, alignment, text)
  356. painter.setPen(textColor)
  357. painter.drawText(x, y, width, height, alignment, text)
  358. def paintEvent(self, event):
  359. painter = QPainter(self)
  360. offsetX = 4
  361. offsetY = 5
  362. painter.setPen(QColor(255, 255, 255))
  363. # Draw symbol
  364. painter.setFont(self.big_symbol_font)
  365. if self.big_symbol_text:
  366. painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
  367. painter.setFont(self.small_symbol_font)
  368. painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
  369. else:
  370. painter.setFont(self.small_symbol_font)
  371. painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
  372. # Draw User Text
  373. painter.setFont(self.user_font)
  374. user_x = self.total_width - self.user_width - 45 + offsetX
  375. user_y = offsetY + 15
  376. user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
  377. outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
  378. self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)
  379. # Draw Assistant Text
  380. painter.setFont(self.assistant_font)
  381. assistant_x = self.total_width - self.assistant_width - 5 + offsetX
  382. assistant_y = self.user_height + offsetY + 15
  383. assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
  384. outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
  385. self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)
  386. def return_text_adjusted_to_width(self, text, font, max_width_allowed):
  387. """
  388. Line feeds are inserted so that the text width does never exceed max_width.
  389. Text is only broken up on whole words.
  390. """
  391. fm = QFontMetrics(font)
  392. words = text.split(' ')
  393. adjusted_text = ''
  394. current_line = ''
  395. max_width_used = 0
  396. for word in words:
  397. current_width = fm.width(current_line + word)
  398. if current_width <= max_width_allowed:
  399. current_line += word + ' '
  400. else:
  401. line_width = fm.width(current_line)
  402. if line_width > max_width_used:
  403. max_width_used = line_width
  404. adjusted_text += current_line + '\n'
  405. current_line = word + ' '
  406. line_width = fm.width(current_line)
  407. if line_width > max_width_used:
  408. max_width_used = line_width
  409. adjusted_text += current_line
  410. return adjusted_text.rstrip(), max_width_used
  411. if __name__ == '__main__':
  412. app = QApplication(sys.argv)
  413. window = TransparentWindow()
  414. window.show()
  415. sys.exit(app.exec_())