123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424 |
- from RealtimeTTS import TextToAudioStream, AzureEngine, ElevenlabsEngine, SystemEngine
- from RealtimeSTT import AudioToTextRecorder
- from PyQt5.QtCore import Qt, QTimer, QRect, QEvent, pyqtSignal, QThread, QPoint, QPropertyAnimation, QVariantAnimation
- from PyQt5.QtGui import QPalette, QColor, QPainter, QFontMetrics, QFont
- from PyQt5.QtWidgets import QApplication, QLabel, QWidget, QDesktopWidget
- import os
- import openai
- import sys
- import time
- import sounddevice as sd
- import numpy as np
- import wavio
# --- Configuration -----------------------------------------------------------

max_history_messages = 6                # how many recent chat messages are sent to the model
return_to_wakewords_after_silence = 12  # wake_word_activation_delay value (presumably seconds — TODO confirm against RealtimeSTT docs)
start_with_wakeword = False             # False: skip wakeword mode on startup and listen immediately
recorder_model = "large-v2"             # model name passed to AudioToTextRecorder
language = "de"                         # language code used for both STT and voice selection below

openai.api_key = os.environ.get("OPENAI_API_KEY")
azure_speech_region = "germanywestcentral"

# Overlay text appearance.
user_font_size = 22
user_color = QColor(208, 208, 208)  # gray
assistant_font_size = 24
assistant_color = QColor(240, 240, 240)  # white

# Default (English) TTS voice and system prompt; overridden for German below.
voice = "en-GB-SoniaNeural"
prompt = "Respond helpfully, concisely, and when appropriate, with the subtle, polite irony of a butler."
if language == "de":
    voice = "de-DE-MajaNeural"
    prompt = 'Antworte hilfreich, knapp und bei Gelegenheit mit der feinen, höflichen Ironie eines Butlers.'

# System message prepended to every chat-completion request.
system_prompt_message = {
    'role': 'system',
    'content': prompt
}
def generate_response(messages):
    """Stream the assistant's reply from OpenAI, yielding text chunks.

    Args:
        messages: chat-completion message dicts (system + history).

    Yields:
        Non-empty content fragments as they arrive from the streamed
        ``gpt-3.5-turbo`` response.
    """
    completion_stream = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        stream=True,
        # Bias two specific tokens strongly away from being generated.
        logit_bias={35309: -100, 36661: -100},
    )
    for chunk in completion_stream:
        delta = chunk["choices"][0]["delta"]
        fragment = delta.get("content")
        if fragment:
            yield fragment
# Running conversation history (user/assistant message dicts); only the last
# max_history_messages entries are sent with each request.
history = []

# Overlay layout limits, in pixels.
MAX_WINDOW_WIDTH = 1600
MAX_WIDTH_ASSISTANT = 1200
MAX_WIDTH_USER = 1500
class AudioPlayer(QThread):
    """Plays a WAV file once on a background Qt thread."""

    def __init__(self, file_path):
        super().__init__()
        self.file_path = file_path

    def run(self):
        """Load the file, normalize int16 samples to float32, play, block until done."""
        recording = wavio.read(self.file_path)
        peak = np.iinfo(np.int16).max
        samples = recording.data.astype(np.float32) / peak
        sd.play(samples, recording.rate)
        sd.wait()
class TextRetrievalThread(QThread):
    """Polls the STT recorder for transcriptions on a background thread.

    The thread idles until :meth:`activate` is called, then performs one
    blocking ``recorder.text()`` call, emits the result via
    ``textRetrieved``, and goes back to idling.
    """

    textRetrieved = pyqtSignal(str)

    def __init__(self, recorder):
        super().__init__()
        self.recorder = recorder
        self.active = False

    def run(self):
        while True:
            if not self.active:
                time.sleep(0.1)
                continue
            transcription = self.recorder.text()
            # Re-arm the wakeword fallback delay after each utterance.
            self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence
            self.textRetrieved.emit(transcription)
            self.active = False
            time.sleep(0.1)

    def activate(self):
        """Request one transcription cycle from the polling loop."""
        self.active = True
class TransparentWindow(QWidget):
    """Frameless, always-on-top translucent overlay for the voice assistant.

    Shows a large/small status-symbol pair in the top-right corner plus the
    most recent user and assistant text, and wires together the STT recorder
    (RealtimeSTT), the TTS stream (RealtimeTTS), and the OpenAI response
    generator.  All widget updates are funneled through Qt signals because
    the STT/TTS callbacks fire on worker threads.
    """

    # Cross-thread signals: emitted from worker callbacks, handled on the
    # Qt main thread.
    updateUI = pyqtSignal()
    clearAssistantTextSignal = pyqtSignal()
    clearUserTextSignal = pyqtSignal()

    def __init__(self):
        super().__init__()
        # Start with a 1x1 window; real geometry is computed in update_self().
        self.setGeometry(1, 1, 1, 1)
        self.setWindowTitle("Transparent Window")
        self.setAttribute(Qt.WA_TranslucentBackground)
        self.setWindowFlags(Qt.FramelessWindowHint | Qt.WindowStaysOnTopHint)

        self.big_symbol_font = QFont('Arial', 32)
        self.small_symbol_font = QFont('Arial', 17)
        self.user_font = QFont('Arial', user_font_size)
        self.assistant_font = QFont('Arial', assistant_font_size)
        self.assistant_font.setItalic(True)

        # Raw text state (set from worker threads) and the word-wrapped
        # versions actually painted.
        self.big_symbol_text = ""
        self.small_symbol_text = ""
        self.user_text = ""
        self.assistant_text = ""
        self.displayed_user_text = ""
        self.displayed_assistant_text = ""
        self.stream = None
        self.text_retrieval_thread = None

        # Timers that trigger the fade-out of stale transcript text.
        self.user_text_timer = QTimer(self)
        self.assistant_text_timer = QTimer(self)
        self.user_text_timer.timeout.connect(self.clear_user_text)
        self.assistant_text_timer.timeout.connect(self.clear_assistant_text)

        self.clearUserTextSignal.connect(self.init_clear_user_text)
        self.clearAssistantTextSignal.connect(self.init_clear_assistant_text)
        self.user_text_opacity = 255
        self.assistant_text_opacity = 255
        self.updateUI.connect(self.update_self)
        self.audio_player = None
        self.run_fade_user = False
        self.run_fade_assistant = False

    def init(self):
        """Create the TTS stream and STT recorder (deferred heavy setup).

        Called once, one second after the window is first shown (see
        showEvent), so the overlay appears before engine initialization.
        """
        self.stream = TextToAudioStream(
            # SystemEngine(),
            AzureEngine(
                os.environ.get("AZURE_SPEECH_KEY"),
                azure_speech_region,
                voice,
                rate=34,
                pitch=10,
            ),
            # ElevenlabsEngine(
            #     os.environ.get("ELEVENLABS_API_KEY")
            # ),
            on_character=self.on_character,
            on_text_stream_stop=self.on_text_stream_stop,
            on_text_stream_start=self.on_text_stream_start,
            on_audio_stream_stop=self.on_audio_stream_stop,
            log_characters=True,
        )
        self.recorder = AudioToTextRecorder(
            model=recorder_model,
            language=language,
            wake_words="Jarvis",
            spinner=True,
            silero_sensitivity=0.2,
            webrtc_sensitivity=3,
            on_recording_start=self.on_recording_start,
            on_vad_detect_start=self.on_vad_detect_start,
            on_wakeword_detection_start=self.on_wakeword_detection_start,
            on_transcription_start=self.on_transcription_start,
        )
        if not start_with_wakeword:
            # Delay the fallback to wakeword mode so the app listens right away.
            self.recorder.wake_word_activation_delay = return_to_wakewords_after_silence

        self.text_retrieval_thread = TextRetrievalThread(self.recorder)
        self.text_retrieval_thread.textRetrieved.connect(self.process_user_text)
        self.text_retrieval_thread.start()
        self.text_retrieval_thread.activate()

    def showEvent(self, event: QEvent):
        """On first show: display startup symbols and schedule init()."""
        super().showEvent(event)
        if event.type() == QEvent.Show:
            self.set_symbols("⌛", "🚀")
            QTimer.singleShot(1000, self.init)

    def on_character(self, char):
        """TTS callback: append each synthesized character to the display."""
        if self.stream:
            self.assistant_text += char
            self.updateUI.emit()

    def on_text_stream_stop(self):
        """TTS callback: response text is complete — record it in history."""
        print("\"", end="", flush=True)
        if self.stream:
            assistant_response = self.stream.text()
            self.assistant_text = assistant_response
            history.append({'role': 'assistant', 'content': assistant_response})

    def on_audio_stream_stop(self):
        """TTS callback: audio finished — fade the text, resume listening."""
        if self.stream:
            self.clearAssistantTextSignal.emit()
            self.text_retrieval_thread.activate()

    def generate_answer(self):
        """Send the current user text to OpenAI and speak the streamed reply."""
        self.run_fade_assistant = False
        if self.assistant_text_timer.isActive():
            self.assistant_text_timer.stop()
        history.append({'role': 'user', 'content': self.user_text})
        self.remove_assistant_text()
        # Only the trailing max_history_messages entries are sent.
        assistant_response = generate_response([system_prompt_message] + history[-max_history_messages:])
        self.stream.feed(assistant_response)
        self.stream.play_async(minimum_sentence_length=7,
                               buffer_threshold_seconds=3)

    def set_symbols(self, big_symbol, small_symbol):
        """Update the status-symbol pair and request a repaint."""
        self.big_symbol_text = big_symbol
        self.small_symbol_text = small_symbol
        self.updateUI.emit()

    def on_text_stream_start(self):
        self.set_symbols("⌛", "👄")

    def process_user_text(self, user_text):
        """Handle a finished transcription: show it and trigger a response."""
        user_text = user_text.strip()
        if user_text:
            self.run_fade_user = False
            if self.user_text_timer.isActive():
                self.user_text_timer.stop()
            self.user_text_opacity = 255
            self.user_text = user_text
            self.clearUserTextSignal.emit()
            print (f"Me: \"{user_text}\"\nAI: \"", end="", flush=True)
            self.set_symbols("⌛", "🧠")
            # Defer so the symbol update paints before the blocking work starts.
            QTimer.singleShot(100, self.generate_answer)

    def on_transcription_start(self):
        self.set_symbols("⌛", "📝")

    def on_recording_start(self):
        self.set_symbols("🎙️", "🔴")

    def on_vad_detect_start(self):
        # Play the "active" chime only when transitioning out of sleep/startup.
        if self.small_symbol_text == "💤" or self.small_symbol_text == "🚀":
            self.audio_player = AudioPlayer("active.wav")
            self.audio_player.start()
        self.set_symbols("🎙️", "⚪")

    def on_wakeword_detection_start(self):
        self.audio_player = AudioPlayer("inactive.wav")
        self.audio_player.start()
        self.set_symbols("", "💤")

    def init_clear_user_text(self):
        """(Re)start the 10 s countdown before the user text fades out."""
        if self.user_text_timer.isActive():
            self.user_text_timer.stop()
        self.user_text_timer.start(10000)

    def remove_user_text(self):
        self.user_text = ""
        self.user_text_opacity = 255
        self.updateUI.emit()

    def fade_out_user_text(self):
        """Step the user-text opacity down by 5 every 50 ms until invisible."""
        if not self.run_fade_user:
            return
        if self.user_text_opacity > 0:
            self.user_text_opacity -= 5
            self.updateUI.emit()
            QTimer.singleShot(50, self.fade_out_user_text)
        else:
            self.run_fade_user = False
            self.remove_user_text()

    def clear_user_text(self):
        """Timer callback: begin fading the user text, if any is shown."""
        self.user_text_timer.stop()
        if not self.user_text:
            return
        self.user_text_opacity = 255
        self.run_fade_user = True
        self.fade_out_user_text()

    def init_clear_assistant_text(self):
        """(Re)start the 10 s countdown before the assistant text fades out."""
        if self.assistant_text_timer.isActive():
            self.assistant_text_timer.stop()
        self.assistant_text_timer.start(10000)

    def remove_assistant_text(self):
        self.assistant_text = ""
        self.assistant_text_opacity = 255
        self.updateUI.emit()

    def fade_out_assistant_text(self):
        """Step the assistant-text opacity down by 5 every 50 ms until invisible."""
        if not self.run_fade_assistant:
            return

        if self.assistant_text_opacity > 0:
            self.assistant_text_opacity -= 5
            self.updateUI.emit()
            QTimer.singleShot(50, self.fade_out_assistant_text)
        else:
            self.run_fade_assistant = False
            self.remove_assistant_text()

    def clear_assistant_text(self):
        """Timer callback: begin fading the assistant text, if any is shown."""
        self.assistant_text_timer.stop()
        if not self.assistant_text:
            return
        self.assistant_text_opacity = 255
        self.run_fade_assistant = True
        self.fade_out_assistant_text()

    def update_self(self):
        """Recompute wrapped text and window geometry, then repaint.

        NOTE(review): paintEvent reads attributes first set here (user_width,
        symbol_width, ...). If a paint ever fires before the first updateUI
        signal this would raise AttributeError — confirm event ordering.
        """
        self.blockSignals(True)

        self.displayed_user_text, self.user_width = self.return_text_adjusted_to_width(self.user_text, self.user_font, MAX_WIDTH_USER)
        self.displayed_assistant_text, self.assistant_width = self.return_text_adjusted_to_width(self.assistant_text, self.assistant_font, MAX_WIDTH_ASSISTANT)

        fm_symbol = QFontMetrics(self.big_symbol_font)
        self.symbol_width = fm_symbol.width(self.big_symbol_text) + 3
        self.symbol_height = fm_symbol.height() + 8

        self.total_width = MAX_WINDOW_WIDTH

        fm_user = QFontMetrics(self.user_font)
        user_text_lines = (self.displayed_user_text.count("\n") + 1)
        self.user_height = fm_user.height() * user_text_lines + 7

        fm_assistant = QFontMetrics(self.assistant_font)
        assistant_text_lines = (self.displayed_assistant_text.count("\n") + 1)
        self.assistant_height = fm_assistant.height() * assistant_text_lines + 18

        self.total_height = sum([self.symbol_height, self.user_height, self.assistant_height])

        # Anchor the window to the top-right of the primary screen.
        desktop = QDesktopWidget()
        screen_rect = desktop.availableGeometry(desktop.primaryScreen())
        self.setGeometry(screen_rect.right() - self.total_width - 50, 0, self.total_width + 50, self.total_height + 50)

        self.blockSignals(False)
        self.update()

    def drawTextWithOutline(self, painter, x, y, width, height, alignment, text, textColor, outlineColor, outline_size):
        """Draw text with an outline by stamping it 8 times in outlineColor
        (offset in every direction) before drawing it once in textColor."""
        painter.setPen(outlineColor)
        for dx, dy in [(-outline_size, 0), (outline_size, 0), (0, -outline_size), (0, outline_size),
                       (-outline_size, -outline_size), (outline_size, -outline_size),
                       (-outline_size, outline_size), (outline_size, outline_size)]:
            painter.drawText(x + dx, y + dy, width, height, alignment, text)

        painter.setPen(textColor)
        painter.drawText(x, y, width, height, alignment, text)

    def paintEvent(self, event):
        """Paint status symbols and the right-aligned user/assistant text."""
        painter = QPainter(self)

        offsetX = 4
        offsetY = 5

        painter.setPen(QColor(255, 255, 255))

        # Draw symbol
        painter.setFont(self.big_symbol_font)
        if self.big_symbol_text:
            painter.drawText(self.total_width - self.symbol_width + 5 + offsetX, offsetY, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignTop, self.big_symbol_text)
            painter.setFont(self.small_symbol_font)
            painter.drawText(self.total_width - self.symbol_width + 17 + offsetX, offsetY + 10, self.symbol_width, self.symbol_height, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)
        else:
            painter.setFont(self.small_symbol_font)
            painter.drawText(self.total_width - 43 + offsetX, offsetY + 2, 50, 50, Qt.AlignRight | Qt.AlignBottom, self.small_symbol_text)

        # Draw User Text (opacity reflects the current fade state)
        painter.setFont(self.user_font)
        user_x = self.total_width - self.user_width - 45 + offsetX
        user_y = offsetY + 15
        user_color_with_opacity = QColor(user_color.red(), user_color.green(), user_color.blue(), self.user_text_opacity)
        outline_color_with_opacity = QColor(0, 0, 0, self.user_text_opacity)
        self.drawTextWithOutline(painter, user_x, user_y, self.user_width, self.user_height, Qt.AlignRight | Qt.AlignTop, self.displayed_user_text, user_color_with_opacity, outline_color_with_opacity, 2)

        # Draw Assistant Text
        painter.setFont(self.assistant_font)
        assistant_x = self.total_width - self.assistant_width - 5 + offsetX
        assistant_y = self.user_height + offsetY + 15
        assistant_color_with_opacity = QColor(assistant_color.red(), assistant_color.green(), assistant_color.blue(), self.assistant_text_opacity)
        outline_color_with_opacity = QColor(0, 0, 0, self.assistant_text_opacity)
        self.drawTextWithOutline(painter, assistant_x, assistant_y, self.assistant_width, self.assistant_height, Qt.AlignRight | Qt.AlignTop, self.displayed_assistant_text, assistant_color_with_opacity, outline_color_with_opacity, 2)

    def return_text_adjusted_to_width(self, text, font, max_width_allowed):
        """Word-wrap *text* so no rendered line exceeds max_width_allowed.

        Breaks only on whole words (a single word wider than the limit still
        gets its own over-wide line).  Returns the wrapped text and the pixel
        width of its widest line.
        """
        fm = QFontMetrics(font)
        words = text.split(' ')
        adjusted_text = ''
        current_line = ''
        max_width_used = 0

        for word in words:
            current_width = fm.width(current_line + word)
            if current_width <= max_width_allowed:
                current_line += word + ' '
            else:
                line_width = fm.width(current_line)
                if line_width > max_width_used:
                    max_width_used = line_width
                adjusted_text += current_line + '\n'
                current_line = word + ' '

        line_width = fm.width(current_line)
        if line_width > max_width_used:
            max_width_used = line_width
        adjusted_text += current_line
        return adjusted_text.rstrip(), max_width_used
if __name__ == '__main__':
    # Start the Qt event loop; heavy engine setup is deferred by the window
    # itself until after it is shown.
    qt_app = QApplication(sys.argv)
    overlay = TransparentWindow()
    overlay.show()
    sys.exit(qt_app.exec_())
|