# VoiceApp.py — always-on-top voice assistant overlay (RealtimeSTT + RealtimeTTS + OpenAI)
  1. from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
  2. from RealtimeSTT import AudioToTextRecorder
  3. from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
  4. from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont, QMouseEvent, QContextMenuEvent
  5. from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget, QMenu, QAction
  6. import os
  7. import openai
  8. import sys
  9. import time
  10. import sounddevice as sd
  11. import numpy as np
  12. import wavio
  13. import keyboard
# --- Conversation / recorder configuration -----------------------------------
max_history_messages = 6                 # how many recent turns are sent to the model
return_to_wakewords_after_silence = 12   # seconds of silence before wake-word mode resumes
start_with_wakeword = False              # False: listen immediately on startup
recorder_model = "large-v2"              # whisper model name for RealtimeSTT
language = "de"                          # also selects voice/prompt below
engine = "azure" # elevenlabs, system
azure_speech_region = "germanywestcentral"
openai.api_key = os.environ.get("OPENAI_API_KEY")

# --- Overlay text appearance -------------------------------------------------
user_font_size = 22
user_color = QColor(208, 208, 208) # gray
assistant_font_size = 24
assistant_color = QColor(240, 240, 240) # white

# English defaults; overridden below when the configured language is German.
voice = "en-GB-SoniaNeural"
prompt = "Respond helpfully, concisely, and when appropriate, with the subtle, polite irony of a butler."
if language == "de":
    voice = "de-DE-MajaNeural"
    prompt = 'Antworte hilfreich, knapp und bei Gelegenheit mit der feinen, höflichen Ironie eines Butlers.'

# System message prepended to every request sent to the chat model.
system_prompt_message = {
    'role': 'system',
    'content': prompt
}
  35. def generate_response(messages):
  36. """Generate assistant's response using OpenAI."""
  37. for chunk in openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True, logit_bias={35309:-100, 36661:-100}):
  38. text_chunk = chunk["choices"][0]["delta"].get("content")
  39. if text_chunk:
  40. yield text_chunk
# Rolling transcript of the conversation (user/assistant message dicts).
history = []

# Pixel limits for the overlay window and its two text areas.
MAX_WINDOW_WIDTH = 1600
MAX_WIDTH_ASSISTANT = 1200
MAX_WIDTH_USER = 1500
  45. class AudioPlayer(QThread):
  46. def __init__(self, file_path):
  47. super(AudioPlayer, self).__init__()
  48. self.file_path = file_path
  49. def run(self):
  50. wav = wavio.read(self.file_path)
  51. sound = wav.data.astype(np.float32) / np.iinfo(np.int16).max
  52. sd.play(sound, wav.rate)
  53. sd.wait()
  54. class TextRetrievalThread(QThread):
  55. textRetrieved = pyqtSignal(str)
  56. def __init__(self, recorder):
  57. super().__init__()
  58. self.recorder = recorder
  59. self.active = False
  60. def run(self):
  61. while True:
  62. if self.active:
  63. text = self.recorder.text()
  64. self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
  65. self.textRetrieved.emit(text)
  66. self.active = False
  67. time.sleep(0.1)
  68. def activate(self):
  69. self.active = True
class TransparentWindow(QWidget):
    """Frameless, always-on-top translucent overlay showing assistant state.

    Displays a large/small status symbol pair, the user's transcribed text,
    and the assistant's streamed reply. Worker threads communicate with the
    UI exclusively through the signals below, so all widget updates happen
    on the Qt main thread.
    """

    # Cross-thread notifications: emitted from TTS/STT callbacks, handled here.
    updateUI = pyqtSignal()
    clearAssistantTextSignal = pyqtSignal()
    clearUserTextSignal = pyqtSignal()

    def __init__(self):
        super().__init__()
        # Start tiny; update_self() resizes/repositions once content exists.
        self.setGeometry(1, 1, 1, 1)
        self.setWindowTitle("Transparent Window")
        self.setAttribute(Qt.WA_TranslucentBackground)
        self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)
        # Fonts: big/small status symbols, user text, italic assistant text.
        self.big_symbol_font = QFont('Arial', 32)
        self.small_symbol_font = QFont('Arial', 17)
        self.user_font = QFont('Arial', user_font_size)
        self.assistant_font = QFont('Arial', assistant_font_size)
        self.assistant_font.setItalic(True)
        # Raw text state; *_displayed_* variants are the word-wrapped versions.
        self.big_symbol_text = ""
        self.small_symbol_text = ""
        self.user_text = ""
        self.assistant_text = ""
        self.displayed_user_text = ""
        self.displayed_assistant_text = ""
        self.stream = None                  # TextToAudioStream, set in select_engine()
        self.text_retrieval_thread = None   # TextRetrievalThread, set in init()
        # Timers start a fade-out 10 s after text is shown (see init_clear_*).
        self.user_text_timer = QTimer(self)
        self.assistant_text_timer = QTimer(self)
        self.user_text_timer.timeout.connect(self.clear_user_text)
        self.assistant_text_timer.timeout.connect(self.clear_assistant_text)
        self.clearUserTextSignal.connect(self.init_clear_user_text)
        self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)
        # Alpha values (0-255) animated by the fade_out_* chains.
        self.user_text_opacity = 255
        self.assistant_text_opacity = 255
        self.updateUI.connect(self.update_self)
        self.audio_player = None
        # Flags that let clear_*_text cancel an in-flight fade.
        self.run_fade_user = False
        self.run_fade_assistant = False
        # Right-click-style engine selection menu (opened from the top-right
        # hot corner, see mousePressEvent).
        self.menu = QMenu()
        self.menu.setStyleSheet("""
QMenu {
background-color: black;
color: white;
border-radius: 10px;
}
QMenu::item:selected {
background-color: #555555;
}
""")
        self.elevenlabs_action = QAction("Elevenlabs", self)
        self.azure_action = QAction("Azure", self)
        self.system_action = QAction("System", self)
        self.menu.addAction(self.elevenlabs_action)
        self.menu.addAction(self.azure_action)
        self.menu.addAction(self.system_action)
        self.elevenlabs_action.triggered.connect(lambda: self.select_engine("elevenlabs"))
        self.azure_action.triggered.connect(lambda: self.select_engine("azure"))
        self.system_action.triggered.connect(lambda: self.select_engine("system"))

    def mousePressEvent(self, event: QMouseEvent):
        """Open the engine menu when the top-right 100x100 px corner is left-clicked."""
        if event.button() == Qt.LeftButton:
            if event.pos().x() >= self.width() - 100 and event.pos().y() <= 100:
                self.menu.exec_(self.mapToGlobal(event.pos()))

    def init(self):
        """Deferred startup: create the TTS stream, recorder, and text worker.

        Called 1 s after the window is first shown (see showEvent) so the UI
        appears before the heavy model loading begins.
        """
        self.select_engine("azure")
        self.recorder = AudioToTextRecorder(
            model=recorder_model,
            language=language,
            wake_words="Jarvis",
            spinner=True,
            silero_sensitivity=0.2,
            webrtc_sensitivity=3,
            on_recording_start=self.on_recording_start,
            on_vad_detect_start=self.on_vad_detect_start,
            on_wakeword_detection_start=self.on_wakeword_detection_start,
            on_transcription_start=self.on_transcription_start,
        )
        if not start_with_wakeword:
            # Skip wake-word mode initially; fall back to it after silence.
            self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
        self.text_retrieval_thread = TextRetrievalThread(self.recorder)
        self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
        self.text_retrieval_thread.start()
        self.text_retrieval_thread.activate()
        # Global hotkey: Esc interrupts assistant speech at any time.
        keyboard.on_press_key('esc', self.on_escape)

    def select_engine(self, engine_name):
        """Tear down any current TTS stream and rebuild it with the named engine.

        ``engine_name`` is "azure", "elevenlabs", or anything else for the
        system (OS-native) voice.
        """
        if self.stream:
            if self.stream.is_playing():
                self.stream.stop()
            self.stream = None
        engine = None
        if engine_name == "azure":
            engine = AzureEngine(
                os.environ.get("AZURE_SPEECH_KEY"),
                azure_speech_region,
                voice,
                rate=34,
                pitch=10,
            )
        elif engine_name == "elevenlabs":
            engine = ElevenlabsEngine(
                os.environ.get("ELEVENLABS_API_KEY")
            )
        else:
            engine = SystemEngine(
                voice="Stefan",
                print_installed_voices=True
            )
        self.stream = TextToAudioStream(
            engine,
            on_character=self.on_character,
            on_text_stream_stop=self.on_text_stream_stop,
            on_text_stream_start=self.on_text_stream_start,
            on_audio_stream_stop=self.on_audio_stream_stop,
            log_characters=True
        )

    def on_escape(self, e):
        """Global Esc handler: stop assistant speech if it is playing."""
        # NOTE(review): assumes self.stream is already set; the keyboard hook
        # is installed in init() after select_engine(), but a None stream here
        # would raise — confirm ordering if init() changes.
        if self.stream.is_playing():
            self.stream.stop()

    def showEvent(self, event: QEvent):
        """On first show, display the startup symbols and schedule init()."""
        super().showEvent(event)
        if event.type() == QEvent.Show:
            self.set_symbols("⌛", "🚀")
            QTimer.singleShot(1000, self.init)

    def on_character(self, char):
        """TTS callback: append each spoken character to the assistant text."""
        if self.stream:
            self.assistant_text += char
            self.updateUI.emit()

    def on_text_stream_stop(self):
        """TTS callback: full reply is known; record it in the history."""
        print("\"", end="", flush=True)
        if self.stream:
            assistant_response = self.stream.text()
            self.assistant_text = assistant_response
            history.append({'role': 'assistant', 'content': assistant_response})

    def on_audio_stream_stop(self):
        """TTS callback: speech finished; fade the reply and listen again."""
        if self.stream:
            self.clearAssistantTextSignal.emit()
            self.text_retrieval_thread.activate()

    def generate_answer(self):
        """Send the current history to OpenAI and speak the streamed reply."""
        # Cancel any fade of the previous answer before starting a new one.
        self.run_fade_assistant = False
        if self.assistant_text_timer.isActive():
            self.assistant_text_timer.stop()
        history.append({'role': 'user', 'content': self.user_text})
        self.remove_assistant_text()
        # Only the most recent turns are sent, plus the fixed system prompt.
        assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
        self.stream.feed(assistant_response)
        self.stream.play_async(minimum_sentence_length=7,
                               buffer_threshold_seconds=3)

    def set_symbols(self, big_symbol, small_symbol):
        """Update the status symbol pair and trigger a repaint."""
        self.big_symbol_text = big_symbol
        self.small_symbol_text = small_symbol
        self.updateUI.emit()

    def on_text_stream_start(self):
        """TTS callback: reply text is starting to arrive."""
        self.set_symbols("⌛", "👄")

    def process_user_text(self, user_text):
        """Slot for transcribed user speech: show it and schedule an answer."""
        user_text = user_text.strip()
        if user_text:
            # Cancel any in-flight fade of earlier user text.
            self.run_fade_user = False
            if self.user_text_timer.isActive():
                self.user_text_timer.stop()
            self.user_text_opacity = 255
            self.user_text = user_text
            self.clearUserTextSignal.emit()
            print (f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
            self.set_symbols("⌛", "🧠")
            # Small delay so the symbol update paints before the API call.
            QTimer.singleShot(100, self.generate_answer)

    def on_transcription_start(self):
        """STT callback: speech-to-text conversion has begun."""
        self.set_symbols("⌛", "📝")

    def on_recording_start(self):
        """STT callback: microphone is actively recording."""
        self.set_symbols("🎙️", "🔴")

    def on_vad_detect_start(self):
        """STT callback: voice-activity detection is listening."""
        # Play the activation chime only when waking from sleep or startup.
        if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
            self.audio_player = AudioPlayer("active.wav")
            self.audio_player.start()
        self.set_symbols("🎙️", "⚪")

    def on_wakeword_detection_start(self):
        """STT callback: fell back to wake-word listening ("sleep" mode)."""
        self.audio_player = AudioPlayer("inactive.wav")
        self.audio_player.start()
        self.set_symbols("", "💤")

    def init_clear_user_text(self):
        """(Re)start the 10 s countdown after which user text fades out."""
        if self.user_text_timer.isActive():
            self.user_text_timer.stop()
        self.user_text_timer.start(10000)

    def remove_user_text(self):
        """Immediately erase the user text and reset its opacity."""
        self.user_text = ""
        self.user_text_opacity = 255
        self.updateUI.emit()

    def fade_out_user_text(self):
        """One fade step; reschedules itself every 50 ms until transparent."""
        if not self.run_fade_user:
            return
        if self.user_text_opacity > 0:
            self.user_text_opacity -= 5
            self.updateUI.emit()
            QTimer.singleShot(50, self.fade_out_user_text)
        else:
            self.run_fade_user = False
            self.remove_user_text()

    def clear_user_text(self):
        """Timer slot: begin fading out the user text, if any."""
        self.user_text_timer.stop()
        if not self.user_text:
            return
        self.user_text_opacity = 255
        self.run_fade_user = True
        self.fade_out_user_text()

    def init_clear_assistant_text(self):
        """(Re)start the 10 s countdown after which assistant text fades out."""
        if self.assistant_text_timer.isActive():
            self.assistant_text_timer.stop()
        self.assistant_text_timer.start(10000)

    def remove_assistant_text(self):
        """Immediately erase the assistant text and reset its opacity."""
        self.assistant_text = ""
        self.assistant_text_opacity = 255
        self.updateUI.emit()

    def fade_out_assistant_text(self):
        """One fade step; reschedules itself every 50 ms until transparent."""
        if not self.run_fade_assistant:
            return
        if self.assistant_text_opacity > 0:
            self.assistant_text_opacity -= 5
            self.updateUI.emit()
            QTimer.singleShot(50, self.fade_out_assistant_text)
        else:
            self.run_fade_assistant = False
            self.remove_assistant_text()

    def clear_assistant_text(self):
        """Timer slot: begin fading out the assistant text, if any."""
        self.assistant_text_timer.stop()
        if not self.assistant_text:
            return
        self.assistant_text_opacity = 255
        self.run_fade_assistant = True
        self.fade_out_assistant_text()

    # def keyPressEvent(self, event):
    #     if event.key() == Qt.Key_Escape:
    #         self.stream.stop()
    #     super().keyPressEvent(event)

    def update_self(self):
        """Recompute wrapped text, element sizes, and window geometry."""
        self.blockSignals(True)
        self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
        self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)
        fm_symbol = QFontMetrics(self.big_symbol_font)
        self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
        self.symbol_height = fm_symbol.height() + 8
        self.total_width = MAX_WINDOW_WIDTH
        fm_user = QFontMetrics(self.user_font)
        user_text_lines = (self.displayed_user_text.count("\n") + 1)
        self.user_height = fm_user.height() * user_text_lines + 7
        fm_assistant = QFontMetrics(self.assistant_font)
        assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
        self.assistant_height = fm_assistant.height() * assistant_text_lines + 18
        self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])
        # Anchor the window to the top-right of the primary screen.
        desktop = QDesktopWidget()
        screen_rect = desktop.availableGeometry(desktop.primaryScreen())
        self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)
        self.blockSignals(False)
        self.update()

    def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
        """Draw text with an 8-direction outline for contrast on any background."""
        painter.setPen(outlineColor)
        for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
                       (-outline_size, -outline_size), (outline_size, -outline_size),
                       (-outline_size, outline_size), (outline_size, outline_size)]:
            painter.drawText(x + dx, y + dy, width, height, alignment, text)
        painter.setPen(textColor)
        painter.drawText(x, y, width, height, alignment, text)

    def paintEvent(self, event):
        """Paint the symbol pair plus the outlined user and assistant texts."""
        painter = QPainter(self)
        offsetX = 4
        offsetY = 5
        painter.setPen(QColor(255, 255, 255))
        # Draw symbol: big + small overlaid when a big symbol exists,
        # otherwise only the small one in the top-right corner.
        painter.setFont(self.big_symbol_font)
        if self.big_symbol_text:
            painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
            painter.setFont(self.small_symbol_font)
            painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
        else:
            painter.setFont(self.small_symbol_font)
            painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
        # Draw User Text (right-aligned, with current fade opacity).
        painter.setFont(self.user_font)
        user_x = self.total_width - self.user_width - 45 + offsetX
        user_y = offsetY + 15
        user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
        outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
        self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)
        # Draw Assistant Text below the user text.
        painter.setFont(self.assistant_font)
        assistant_x = self.total_width - self.assistant_width - 5 + offsetX
        assistant_y = self.user_height + offsetY + 15
        assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
        outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
        self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)

    def return_text_adjusted_to_width(self, text, font, max_width_allowed):
        """
        Line feeds are inserted so that the text width does never exceed max_width.
        Text is only broken up on whole words.

        Returns a tuple of (wrapped text, widest line width in pixels).
        """
        fm = QFontMetrics(font)
        words = text.split(' ')
        adjusted_text = ''
        current_line = ''
        max_width_used = 0
        for word in words:
            current_width = fm.width(current_line + word)
            if current_width <= max_width_allowed:
                current_line += word + ' '
            else:
                # Current line is full: commit it and start a new one.
                line_width = fm.width(current_line)
                if line_width > max_width_used:
                    max_width_used = line_width
                adjusted_text += current_line + '\n'
                current_line = word + ' '
        # Commit the final (possibly partial) line.
        line_width = fm.width(current_line)
        if line_width > max_width_used:
            max_width_used = line_width
        adjusted_text += current_line
        return adjusted_text.rstrip(), max_width_used
  379. if __name__ == '__main__':
  380. app = QApplication(sys.argv)
  381. window = TransparentWindow()
  382. window.show()
  383. sys.exit(app.exec_())