# ╔╗╔┌─┐┬─┐┌─┐┬ ┬
# ║║║├┤ ├┬┘│ └┬┘
# ╝╚╝└─┘┴└─└─┘ ┴
# Code is licensed under CC-BY-NC-ND 4.0 unless otherwise specified.
# https://creativecommons.org/licenses/by-nc-nd/4.0/
# You CANNOT edit this file without direct permission from the author.
# You can redistribute this file without any changes.

# meta developer: @nercymods
# scope: hikka_min 1.6.2
# requires: pydub openai==1.3.8 ffmpeg

import base64
import contextlib
import logging
import os
import tempfile

import openai
import requests
from hikkatl.tl.types import Message
from pydub import AudioSegment

from .. import loader, utils

logger = logging.getLogger(__name__)

# Extensions as returned by os.path.splitext(), i.e. WITH the leading dot.
OGG_EXTENSIONS = (".oga", ".ogg")
OTHER_EXTENSIONS = (".mp3", ".m4a", ".wav", ".mpeg", ".mp4")
SUPPORTED_EXTENSIONS = OGG_EXTENSIONS + OTHER_EXTENSIONS

HF_URL = "https://router.huggingface.co/hf-inference/models/openai/whisper-large-v3-turbo"
# The request sets "x-wait-for-model", so allow a generous (but finite) wait.
HF_TIMEOUT = 300


def _remove_quietly(*paths):
    """Delete every given path, ignoring empty values and missing files."""
    for path in paths:
        if path:
            with contextlib.suppress(OSError):
                os.remove(path)


def _convert_to_mp3(source: str, source_format: str) -> str:
    """Convert *source* to mp3 and return the path of the new file.

    A unique temporary file is used so that transcriptions running at the
    same time never overwrite each other's output. The caller owns the
    returned file and must delete it. This function blocks (pydub runs
    ffmpeg), so call it through ``utils.run_sync`` from async code.
    """
    fd, target = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    try:
        AudioSegment.from_file(source, format=source_format).export(
            target, format="mp3"
        )
    except Exception:
        _remove_quietly(target)
        raise
    return target


@loader.tds
class WhisperMod(loader.Module):
    """Module for speech recognition"""

    strings = {
        "name": "WhisperMod",
        "audio_not_found": (
            "👮‍♀️Not found to"
            " recognize."
        ),
        "recognized": (
            "🗣Recognized:\n{transcription}"
        ),
        "error": (
            "Error occurred during"
            " transcription."
        ),
        "recognition": (
            "🫥Recognition..."
        ),
        "downloading": "🐍Downloading, wait",
        "autowhisper_enabled": (
            "🫥Auto-whisper enabled"
            " in this chat."
        ),
        "autowhisper_disabled": (
            "🫥Auto-whisper disabled"
            " in this chat."
        ),
        "no_api": " Insert openai api-key in config (.cfg whispermod)",
        "invalid_key": " Invalid openai api-key",
        "hf_instructions": (
            "👩‍🎓 How to get hugging face api token:\n"
            "> Open Hugging Face and sign in. 👤 \n"
            "> Go to Settings → Access Tokens: https://huggingface.co/settings/tokens. ⚙️ \n"
            "> Click New Token. \n"
            "> Select permission: \"make calls to the serverless Inference API\". ⚙️ \n"
            "> Click Create Token. \n"
            "> Copy the token and paste it into the config. "
        ),
        "hf_token_missing": (
            "Missing hugging face api token"
            " (.cfg whispermod)"
        ),
    }

    strings_ru = {
        "audio_not_found": (
            "👮‍♀️Не найдено, что"
            " распознавать."
        ),
        "recognized": (
            "🗣Распознано:\n{transcription}"
        ),
        "error": (
            "Ошибка при"
            " транскрипции."
        ),
        "recognition": (
            "🫥Распознавание..."
        ),
        "downloading": (
            "🐍Скачивание,"
            " подождите..."
        ),
        "autowhisper_enabled": (
            "🫥Автораспознавание"
            " включено в этом чате."
        ),
        "autowhisper_disabled": (
            "🫥Автораспознавание"
            " отключено в этом чате."
        ),
        "no_api": (
            " Укажите api-ключ в конфиге"
            " (.cfg whispermod)"
        ),
        "invalid_key": (
            " Неверный api-ключ"
        ),
        "hf_instructions": (
            "👩‍🎓 Как получить api-токен hugging face:\n"
            "> Откройте Hugging Face и войдите в аккаунт. 👤\n"
            "> Перейдите в Settings → Access Tokens: https://huggingface.co/settings/tokens. ⚙️\n"
            "> Нажмите New Token. \n"
            "> Выберите разрешение: \"make calls to the serverless Inference API\". ⚙️\n"
            "> Нажмите Create Token. \n"
            "> Скопируйте токен и вставьте его в конфиг. "
        ),
        "hf_token_missing": (
            "Отсутствует api-токен hugging face"
            " (.cfg whispermod)"
        ),
    }

    def __init__(self):
        self.config = loader.ModuleConfig(
            loader.ConfigValue(
                "api_key",
                None,
                lambda: "Api key for Whisper (https://platform.openai.com/account/api-keys)",
                validator=loader.validators.Hidden(),
            ),
            loader.ConfigValue(
                "temperature",
                "0.2",
                lambda: (
                    "The sampling temperature, between 0 and 1. Higher values like 0.8"
                    " will make the output more random, while lower values like 0.2"
                    " will make it more focused and deterministic. If set to 0, the"
                    " model will use log probability to automatically increase the"
                    " temperature until certain thresholds are hit."
                ),
                validator=loader.validators.String(),
            ),
            loader.ConfigValue(
                "prompt",
                None,
                lambda: (
                    "An optional text to guide the model's style or continue a previous"
                    " audio segment. The prompt should match the audio language."
                ),
                validator=loader.validators.String(),
            ),
            loader.ConfigValue(
                "hf_api_key",
                None,
                lambda: "Api key for hugging face (look .hfguide)",
                validator=loader.validators.Hidden(),
            ),
            loader.ConfigValue(
                "auto_voice",
                True,
                lambda: "Enable auto-recognition for voice messages",
                validator=loader.validators.Boolean(),
            ),
            loader.ConfigValue(
                "auto_video",
                True,
                lambda: "Enable auto-recognition for video messages",
                validator=loader.validators.Boolean(),
            ),
        )

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    async def _toggle_chat(self, message: Message, storage_key: str):
        """Flip the per-chat flag stored under *storage_key* and report it.

        Enabled chats are stored as ``{str(chat_id): True}``; disabling a
        chat removes its entry.
        """
        chat_id = str(message.chat_id)
        state = self.get(storage_key, {})
        if state.get(chat_id, False):
            state.pop(chat_id, None)
            status_message = self.strings["autowhisper_disabled"]
        else:
            state[chat_id] = True
            status_message = self.strings["autowhisper_enabled"]
        self.set(storage_key, state)
        await utils.answer(message, status_message)

    def _wants_transcription(self, message: Message, storage_key: str) -> bool:
        """Tell whether a watcher should transcribe *message*.

        True when auto-recognition is enabled for the chat under
        *storage_key*, the message is a voice/video message whose kind is
        enabled in the config, and it is not a gif, sticker or photo.
        """
        if not self.get(storage_key, {}).get(str(message.chat_id), False):
            return False
        wanted = (message.voice and self.config["auto_voice"]) or (
            message.video and self.config["auto_video"]
        )
        return bool(wanted) and not (message.gif or message.sticker or message.photo)

    async def _openai_transcribe(self, file: str, extension: str) -> str:
        """Return the OpenAI Whisper transcription of *file*.

        ogg/oga input is converted to mp3 first; other formats are uploaded
        as they are. Any temporary mp3 is removed before returning.

        Raises:
            openai.AuthenticationError: the configured api key was rejected.
            Exception: any other conversion or API failure.
        """
        converted = None
        try:
            if extension in OGG_EXTENSIONS:
                # pydub shells out to ffmpeg; keep that off the event loop.
                converted = await utils.run_sync(_convert_to_mp3, file, "ogg")
            client = openai.AsyncOpenAI(api_key=self.config["api_key"])
            with open(converted or file, "rb") as audio_file:
                response = await client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    prompt=self.config["prompt"],
                    temperature=self.config["temperature"],
                )
            return response.text
        finally:
            _remove_quietly(converted)

    async def _transcribe_with_openai(
        self, chat_id, status_id: int, file: str, extension: str
    ):
        """Run the OpenAI transcription and write the outcome to the status message.

        The status message (*status_id* in *chat_id*) is first switched to
        "recognition" and then edited to the recognized text, the
        invalid-key notice or the error text.
        """
        await self.client.edit_message(chat_id, status_id, self.strings["recognition"])
        try:
            transcription = await self._openai_transcribe(file, extension)
        except openai.AuthenticationError:
            outcome = self.strings["invalid_key"]
        except Exception as e:
            outcome = f"Error: {e}"
        else:
            outcome = self.strings["recognized"].format(transcription=transcription)
        await self.client.edit_message(chat_id, status_id, outcome)

    async def _hf_transcribe(self, file: str, extension: str) -> str:
        """Return the Hugging Face Whisper transcription of *file*.

        ogg/oga input is sent unchanged; anything else is converted to mp3
        first. The audio is sent base64-encoded as the ``inputs`` field.

        Raises:
            RuntimeError: the API answered with a non-200 status or without
                a ``text`` field.
            Exception: any conversion or network failure.
        """
        converted = None
        try:
            if extension not in OGG_EXTENSIONS:
                converted = await utils.run_sync(
                    _convert_to_mp3, file, extension.replace(".", "")
                )
            with open(converted or file, "rb") as f:
                audio_b64 = base64.b64encode(f.read()).decode("utf-8")
        finally:
            _remove_quietly(converted)

        response = await utils.run_sync(
            requests.post,
            url=HF_URL,
            headers={
                "Authorization": f"Bearer {self.config['hf_api_key']}",
                "x-use-cache": "false",
                "x-wait-for-model": "true",
                "Content-Type": "application/json",
            },
            json={"inputs": audio_b64},
            timeout=HF_TIMEOUT,
        )
        if response.status_code != 200:
            raise RuntimeError(f"API Error: {response.text}")
        try:
            return response.json()["text"]
        except (KeyError, TypeError) as e:
            raise RuntimeError(f"API Error: {response.text}") from e

    # ------------------------------------------------------------------
    # OpenAI commands / watcher
    # ------------------------------------------------------------------

    @loader.command(ru_doc="распознать речь из голосового/видео сообщения в реплае, используя openai api")
    async def whisper(self, message: Message):
        """Transcribe speech from a voice/video message in reply using openai api"""
        if self.config["api_key"] is None:
            await utils.answer(message, self.strings["no_api"])
            return

        rep = await message.get_reply_message()
        if rep is None:
            await utils.answer(message, self.strings["audio_not_found"])
            return

        down = await utils.answer(message, self.strings["downloading"])
        file = await rep.download_media()
        if not file:
            # The replied-to message carries no downloadable media.
            await utils.answer(message, self.strings["audio_not_found"])
            return

        try:
            file_extension = os.path.splitext(file)[1].lower()
            if file_extension not in SUPPORTED_EXTENSIONS:
                await utils.answer(message, self.strings["audio_not_found"])
                return
            await self._transcribe_with_openai(
                message.chat_id, down.id, file, file_extension
            )
        finally:
            _remove_quietly(file)

    @loader.command(
        ru_doc=(
            "включить/выключить автораспознавание голосовых и видео сообщений в чате"
            " где введена команда"
        )
    )
    async def autowhspr(self, message: Message):
        """Enable/disable auto-speech recognition for voice and video messages"""
        await self._toggle_chat(message, "autowhspr")

    @loader.watcher(only_media=True)
    async def autowhisper_watcher(self, message: Message):
        """Transcribe incoming voice/video messages with OpenAI where enabled."""
        if self._wants_transcription(message, "autowhspr"):
            await self.whisperwatch(message)

    async def whisperwatch(self, message: Message):
        """Transcribe the voice/video *message* itself using the OpenAI API.

        A status message is sent as a reply to *message* and edited with
        the result. Nothing happens when no OpenAI key is configured.
        """
        if self.config["api_key"] is None:
            return

        down = await self.client.send_message(
            message.chat_id, message=self.strings["downloading"], reply_to=message.id
        )
        file = await message.download_media()
        try:
            file_extension = os.path.splitext(file)[1].lower() if file else None
            if file_extension not in SUPPORTED_EXTENSIONS:
                # Stay silent for media we cannot handle; drop the status message.
                await down.delete()
                return
            await self._transcribe_with_openai(
                message.chat_id, down.id, file, file_extension
            )
        finally:
            _remove_quietly(file)

    # ------------------------------------------------------------------
    # Hugging Face commands / watcher
    # ------------------------------------------------------------------

    @loader.command(ru_doc="распознать речь из голосового/видео сообщения в реплае, используя hugging face api")
    async def hfwhisper(self, m: Message):
        """Transcribe speech from a voice/video message in reply using hugging face api"""
        if self.config["hf_api_key"] is None:
            await utils.answer(m, self.strings["hf_token_missing"])
            return

        rep = await m.get_reply_message()
        if rep is None:
            return await utils.answer(m, self.strings["audio_not_found"])

        await utils.answer(m, self.strings["downloading"])
        file = await rep.download_media()
        if not file:
            return await utils.answer(m, self.strings["audio_not_found"])

        try:
            file_extension = os.path.splitext(file)[1].lower()
            if file_extension not in SUPPORTED_EXTENSIONS:
                return await utils.answer(m, self.strings["audio_not_found"])
            await utils.answer(m, self.strings["recognition"])
            transcription = await self._hf_transcribe(file, file_extension)
        except Exception:
            logger.exception("Hugging Face transcription failed")
            return await utils.answer(m, self.strings["error"])
        finally:
            _remove_quietly(file)

        return await utils.answer(
            m, self.strings["recognized"].format(transcription=transcription)
        )

    @loader.command(
        ru_doc=(
            "включить/выключить автораспознавание через Hugging Face API в текущем чате"
        )
    )
    async def hfautowhspr(self, message: Message):
        """Enable/disable auto-speech recognition using Hugging Face API"""
        await self._toggle_chat(message, "hfautowhspr")

    @loader.watcher(only_media=True)
    async def hfautowhisper_watcher(self, message: Message):
        """Transcribe incoming voice/video messages with Hugging Face where enabled."""
        if self._wants_transcription(message, "hfautowhspr"):
            await self.hfwhisperwatch(message)

    async def hfwhisperwatch(self, message: Message):
        """Transcribe the voice/video *message* itself using the Hugging Face API.

        A status message is sent as a reply to *message* and edited with
        the result or the error text. Nothing happens when no Hugging Face
        token is configured.
        """
        if self.config["hf_api_key"] is None:
            return

        down = await self.client.send_message(
            message.chat_id, self.strings["downloading"], reply_to=message.id
        )
        file = await message.download_media()
        if not file:
            # Nothing was downloaded; drop the status message and stay silent.
            await down.delete()
            return

        try:
            file_extension = os.path.splitext(file)[1].lower()
            await self.client.edit_message(
                message.chat_id, down.id, self.strings["recognition"]
            )
            text = await self._hf_transcribe(file, file_extension)
            await self.client.edit_message(
                message.chat_id,
                down.id,
                self.strings["recognized"].format(transcription=text),
            )
        except Exception as e:
            await self.client.edit_message(
                message.chat_id, down.id, f"❌ Error: {str(e)}"
            )
        finally:
            _remove_quietly(file)

    @loader.command(ru_doc="гайд как получить hugging face токен", en_doc="guide how to get hugging face token")
    async def hfguide(self, m: Message):
        # Help text comes from ru_doc/en_doc above; this only prints the guide.
        await utils.answer(m, self.strings["hf_instructions"])