# VoiceApp.py
  1. from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
  2. from RealtimeSTT import AudioToTextRecorder
  3. from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
  4. from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont
  5. from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget
  6. import os
  7. import openai
  8. import sys
  9. import time
  10. import sounddevice as sd
  11. import numpy as np
  12. import wavio
# --- Configuration -----------------------------------------------------------
max_history_messages = 6                 # max chat turns sent to the model per request
return_to_wakewords_after_silence = 12   # seconds of silence before wake-word mode resumes
start_with_wakeword = False              # if True, require the wake word before the first utterance
recorder_model = "large-v2"              # STT model name passed to AudioToTextRecorder
language = "de"                          # "de" switches voice and prompt below

openai.api_key = os.environ.get("OPENAI_API_KEY")
azure_speech_region = "germanywestcentral"

# Overlay appearance.
user_font_size = 22
user_color = QColor(208, 208, 208) # gray
assistant_font_size = 24
assistant_color = QColor(240, 240, 240) # white

# English defaults; replaced below when language == "de".
voice = "en-GB-SoniaNeural"
prompt = "Respond helpfully, concisely, and when appropriate, with the subtle, polite irony of a butler."
if language == "de":
    voice = "de-DE-MajaNeural"
    prompt = 'Antworte hilfreich, knapp und bei Gelegenheit mit der feinen, höflichen Ironie eines Butlers.'

# System message prepended to every OpenAI request.
system_prompt_message = {
    'role': 'system',
    'content': prompt
}
  33. def generate_response(messages):
  34. """Generate assistant's response using OpenAI."""
  35. for chunk in openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages, stream=True, logit_bias={35309:-100, 36661:-100}):
  36. text_chunk = chunk["choices"][0]["delta"].get("content")
  37. if text_chunk:
  38. yield text_chunk
# Rolling conversation history of user/assistant turns; only the last
# max_history_messages entries are sent to the model (see generate_answer).
history = []

# Pixel limits for the overlay window and its two word-wrapped text areas.
MAX_WINDOW_WIDTH = 1600
MAX_WIDTH_ASSISTANT = 1200
MAX_WIDTH_USER = 1500
  43. class AudioPlayer(QThread):
  44. def __init__(self, file_path):
  45. super(AudioPlayer, self).__init__()
  46. self.file_path = file_path
  47. def run(self):
  48. wav = wavio.read(self.file_path)
  49. sound = wav.data.astype(np.float32) / np.iinfo(np.int16).max
  50. sd.play(sound, wav.rate)
  51. sd.wait()
  52. class TextRetrievalThread(QThread):
  53. textRetrieved = pyqtSignal(str)
  54. def __init__(self, recorder):
  55. super().__init__()
  56. self.recorder = recorder
  57. self.active = False
  58. def run(self):
  59. while True:
  60. if self.active:
  61. text = self.recorder.text()
  62. self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
  63. self.textRetrieved.emit(text)
  64. self.active = False
  65. time.sleep(0.1)
  66. def activate(self):
  67. self.active = True
class TransparentWindow(QWidget):
    """Frameless, translucent, always-on-top overlay window.

    Shows a large status symbol, a small status symbol, the user's
    transcribed text and the assistant's streamed reply, and wires together
    the AudioToTextRecorder (STT) and TextToAudioStream (TTS) callbacks.
    """

    # Signals that marshal work from TTS/recorder callback threads onto the
    # Qt GUI thread.
    updateUI = pyqtSignal()
    clearAssistantTextSignal = pyqtSignal()
    clearUserTextSignal = pyqtSignal()

    def __init__(self):
        super().__init__()
        self.setGeometry(1, 1, 1, 1)  # placeholder; real geometry computed in update_self()
        self.setWindowTitle("Transparent Window")
        self.setAttribute(Qt.WA_TranslucentBackground)
        self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)

        # Fonts for the two status symbols and the two text areas.
        self.big_symbol_font = QFont('Arial', 32)
        self.small_symbol_font = QFont('Arial', 17)
        self.user_font = QFont('Arial', user_font_size)
        self.assistant_font = QFont('Arial', assistant_font_size)
        self.assistant_font.setItalic(True)

        # Current contents of the four display areas.
        self.big_symbol_text = ""
        self.small_symbol_text = ""
        self.user_text = ""
        self.assistant_text = ""
        # Word-wrapped versions actually painted (computed in update_self()).
        self.displayed_user_text = ""
        self.displayed_assistant_text = ""

        self.stream = None                 # TextToAudioStream, created in init()
        self.text_retrieval_thread = None  # TextRetrievalThread, created in init()

        # Timers that trigger the fade-out of the on-screen texts.
        self.user_text_timer = QTimer(self)
        self.assistant_text_timer = QTimer(self)
        self.user_text_timer.timeout.connect(self.clear_user_text)
        self.assistant_text_timer.timeout.connect(self.clear_assistant_text)
        self.clearUserTextSignal.connect(self.init_clear_user_text)
        self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)

        # Alpha values (0-255) used while fading the texts out.
        self.user_text_opacity = 255
        self.assistant_text_opacity = 255

        self.updateUI.connect(self.update_self)

        self.audio_player = None         # AudioPlayer for the cue sounds
        self.run_fade_user = False       # True while the user-text fade is running
        self.run_fade_assistant = False  # True while the assistant-text fade is running

    def init(self):
        """Create the TTS stream and the recorder, then start listening.

        Deferred via showEvent/QTimer so the window is visible before the
        heavy engine/model setup runs.
        """
        self.stream = TextToAudioStream(
            # SystemEngine(),
            AzureEngine(
                os.environ.get("AZURE_SPEECH_KEY"),
                azure_speech_region,
                voice,
                rate=34,
                pitch=10,
            ),
            # ElevenlabsEngine(
            #     os.environ.get("ELEVENLABS_API_KEY")
            # ),
            on_character=self.on_character,
            on_text_stream_stop=self.on_text_stream_stop,
            on_text_stream_start=self.on_text_stream_start,
            on_audio_stream_stop=self.on_audio_stream_stop,
            log_characters=True,
        )
        self.recorder = AudioToTextRecorder(
            model=recorder_model,
            language=language,
            wake_words="Jarvis",
            spinner=True,
            silero_sensitivity=0.2,
            webrtc_sensitivity=3,
            on_recording_start=self.on_recording_start,
            on_vad_detect_start=self.on_vad_detect_start,
            on_wakeword_detection_start=self.on_wakeword_detection_start,
            on_transcription_start=self.on_transcription_start,
        )
        if not start_with_wakeword:
            # Listen immediately; only fall back to wake words after the
            # configured silence interval.
            self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
        self.text_retrieval_thread = TextRetrievalThread(self.recorder)
        self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
        self.text_retrieval_thread.start()
        self.text_retrieval_thread.activate()

    def showEvent(self, event: QEvent):
        """Show startup symbols and schedule the deferred init()."""
        super().showEvent(event)
        if event.type() == QEvent.Show:
            self.set_symbols("⌛", "🚀")
            QTimer.singleShot(1000, self.init)

    def on_character(self, char):
        """TTS callback: append each streamed character to the assistant text."""
        if self.stream:
            self.assistant_text += char
            self.updateUI.emit()

    def on_text_stream_stop(self):
        """TTS callback: full reply text is known; store it in the history."""
        print("\"", end="", flush=True)  # closes the quote opened in process_user_text()
        if self.stream:
            assistant_response = self.stream.text()
            self.assistant_text = assistant_response
            history.append({'role': 'assistant', 'content': assistant_response})

    def on_audio_stream_stop(self):
        """TTS callback: playback finished; fade the reply and listen again."""
        if self.stream:
            self.clearAssistantTextSignal.emit()
            self.text_retrieval_thread.activate()

    def generate_answer(self):
        """Send the trimmed conversation to OpenAI and stream the reply into TTS."""
        self.run_fade_assistant = False
        if self.assistant_text_timer.isActive():
            self.assistant_text_timer.stop()
        history.append({'role': 'user', 'content': self.user_text})
        self.remove_assistant_text()
        # Only the system prompt plus the last max_history_messages turns go out.
        assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
        self.stream.feed(assistant_response)
        self.stream.play_async(minimum_sentence_length=7,
                               buffer_threshold_seconds=3)

    def set_symbols(self, big_symbol, small_symbol):
        """Set both status symbols and request a repaint (safe across threads
        because the repaint goes through the updateUI signal)."""
        self.big_symbol_text = big_symbol
        self.small_symbol_text = small_symbol
        self.updateUI.emit()

    def on_text_stream_start(self):
        """TTS callback: reply text started streaming."""
        self.set_symbols("⌛", "👄")

    def process_user_text(self, user_text):
        """Slot for TextRetrievalThread: display the transcription and answer it."""
        user_text = user_text.strip()
        if user_text:
            self.run_fade_user = False
            if self.user_text_timer.isActive():
                self.user_text_timer.stop()
            self.user_text_opacity = 255
            self.user_text = user_text
            self.clearUserTextSignal.emit()
            print (f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
            self.set_symbols("⌛", "🧠")
            # Short delay so the UI can repaint before the request starts.
            QTimer.singleShot(100, self.generate_answer)

    def on_transcription_start(self):
        """Recorder callback: speech-to-text transcription has started."""
        self.set_symbols("⌛", "📝")

    def on_recording_start(self):
        """Recorder callback: voice detected, recording in progress."""
        self.set_symbols("🎙️", "🔴")

    def on_vad_detect_start(self):
        """Recorder callback: now listening for voice activity."""
        if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
            # Coming out of wake-word sleep (or app startup): play the cue.
            self.audio_player = AudioPlayer("active.wav")
            self.audio_player.start()
        self.set_symbols("🎙️", "⚪")

    def on_wakeword_detection_start(self):
        """Recorder callback: back to waiting for the wake word."""
        self.audio_player = AudioPlayer("inactive.wav")
        self.audio_player.start()
        self.set_symbols("", "💤")

    def init_clear_user_text(self):
        """(Re)start the 10 s countdown after which the user text fades out."""
        if self.user_text_timer.isActive():
            self.user_text_timer.stop()
        self.user_text_timer.start(10000)

    def remove_user_text(self):
        """Clear the user text immediately and reset its opacity."""
        self.user_text = ""
        self.user_text_opacity = 255
        self.updateUI.emit()

    def fade_out_user_text(self):
        """One fade step: -5 alpha every 50 ms until fully transparent."""
        if not self.run_fade_user:
            return
        if self.user_text_opacity > 0:
            self.user_text_opacity -= 5
            self.updateUI.emit()
            QTimer.singleShot(50, self.fade_out_user_text)
        else:
            self.run_fade_user = False
            self.remove_user_text()

    def clear_user_text(self):
        """Timer slot: begin fading out the user text, if any."""
        self.user_text_timer.stop()
        if not self.user_text:
            return
        self.user_text_opacity = 255
        self.run_fade_user = True
        self.fade_out_user_text()

    def init_clear_assistant_text(self):
        """(Re)start the 10 s countdown after which the assistant text fades out."""
        if self.assistant_text_timer.isActive():
            self.assistant_text_timer.stop()
        self.assistant_text_timer.start(10000)

    def remove_assistant_text(self):
        """Clear the assistant text immediately and reset its opacity."""
        self.assistant_text = ""
        self.assistant_text_opacity = 255
        self.updateUI.emit()

    def fade_out_assistant_text(self):
        """One fade step: -5 alpha every 50 ms until fully transparent."""
        if not self.run_fade_assistant:
            return
        if self.assistant_text_opacity > 0:
            self.assistant_text_opacity -= 5
            self.updateUI.emit()
            QTimer.singleShot(50, self.fade_out_assistant_text)
        else:
            self.run_fade_assistant = False
            self.remove_assistant_text()

    def clear_assistant_text(self):
        """Timer slot: begin fading out the assistant text, if any."""
        self.assistant_text_timer.stop()
        if not self.assistant_text:
            return
        self.assistant_text_opacity = 255
        self.run_fade_assistant = True
        self.fade_out_assistant_text()

    def update_self(self):
        """Recompute wrapped texts and window geometry, then repaint."""
        self.blockSignals(True)
        self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
        self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)

        fm_symbol = QFontMetrics(self.big_symbol_font)
        self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
        self.symbol_height = fm_symbol.height() + 8
        self.total_width = MAX_WINDOW_WIDTH

        # Heights grow with the number of wrapped lines.
        fm_user = QFontMetrics(self.user_font)
        user_text_lines = (self.displayed_user_text.count("\n") + 1)
        self.user_height = fm_user.height() * user_text_lines + 7
        fm_assistant = QFontMetrics(self.assistant_font)
        assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
        self.assistant_height = fm_assistant.height() * assistant_text_lines + 18
        self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])

        # Anchor the window to the top-right corner of the primary screen.
        desktop = QDesktopWidget()
        screen_rect = desktop.availableGeometry(desktop.primaryScreen())
        self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)
        self.blockSignals(False)
        self.update()

    def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
        """Draw text with an 8-direction outline for contrast on any background."""
        painter.setPen(outlineColor)
        for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
                       (-outline_size, -outline_size), (outline_size, -outline_size),
                       (-outline_size, outline_size), (outline_size, outline_size)]:
            painter.drawText(x + dx, y + dy, width, height, alignment, text)
        painter.setPen(textColor)
        painter.drawText(x, y, width, height, alignment, text)

    def paintEvent(self, event):
        """Paint the symbols and both outlined text blocks, right-aligned.

        NOTE(review): relies on the layout attributes (total_width,
        symbol_width, user_width, ...) computed by update_self(); assumes an
        updateUI emit (e.g. from set_symbols during showEvent) has run before
        the first paint — confirm.
        """
        painter = QPainter(self)
        offsetX = 4
        offsetY = 5

        painter.setPen(QColor(255, 255, 255))

        # Draw symbol
        painter.setFont(self.big_symbol_font)
        if self.big_symbol_text:
            painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
            painter.setFont(self.small_symbol_font)
            painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
        else:
            # No big symbol: draw only the small one in the corner.
            painter.setFont(self.small_symbol_font)
            painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)

        # Draw User Text
        painter.setFont(self.user_font)
        user_x = self.total_width - self.user_width - 45 + offsetX
        user_y = offsetY + 15
        user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
        outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
        self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)

        # Draw Assistant Text
        painter.setFont(self.assistant_font)
        assistant_x = self.total_width - self.assistant_width - 5 + offsetX
        assistant_y = self.user_height + offsetY + 15
        assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
        outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
        self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)

    def return_text_adjusted_to_width(self, text, font, max_width_allowed):
        """
        Line feeds are inserted so that the text width does never exceed max_width.
        Text is only broken up on whole words.

        Returns a (wrapped_text, max_line_width_px) tuple.
        """
        fm = QFontMetrics(font)
        words = text.split(' ')
        adjusted_text = ''
        current_line = ''
        max_width_used = 0
        for word in words:
            # Break before the word that would push the line past the limit.
            current_width = fm.width(current_line + word)
            if current_width <= max_width_allowed:
                current_line += word + ' '
            else:
                line_width = fm.width(current_line)
                if line_width > max_width_used:
                    max_width_used = line_width
                adjusted_text += current_line + '\n'
                current_line = word + ' '
        # Account for the final (unterminated) line.
        line_width = fm.width(current_line)
        if line_width > max_width_used:
            max_width_used = line_width
        adjusted_text += current_line
        return adjusted_text.rstrip(), max_width_used
if __name__ == '__main__':
    # Create the Qt application, show the overlay (which lazily initializes
    # the TTS/STT engines via showEvent) and enter the event loop.
    app = QApplication(sys.argv)
    window = TransparentWindow()
    window.show()
    sys.exit(app.exec_())