# ╔╗╔┌─┐┬─┐┌─┐┬ ┬
# ║║║├┤ ├┬┘│ └┬┘
# ╝╚╝└─┘┴└─└─┘ ┴
# Code is licensed under CC-BY-NC-ND 4.0 unless otherwise specified.
# https://creativecommons.org/licenses/by-nc-nd/4.0/
# You CANNOT edit this file without direct permission from the author.
# You can redistribute this file without any changes.
# meta developer: @nercymods
# scope: hikka_min 1.6.2
# requires: pydub openai==1.3.8 ffmpeg
import base64
import contextlib
import logging
import os
import tempfile

import openai
import requests
from hikkatl.tl.types import Message
from pydub import AudioSegment

from .. import loader, utils
@loader.tds
class WhisperMod(loader.Module):
    """Module for speech recognition"""

    # Two back-ends are offered: the OpenAI Whisper API (`whisper`,
    # `autowhspr`) and the Hugging Face inference router (`hfwhisper`,
    # `hfautowhspr`). Each has a manual command that transcribes the replied-to
    # message and a per-chat toggle that makes a watcher transcribe incoming
    # voice/video messages.

    strings = {
        "name": "WhisperMod",
        "audio_not_found": (
            "👮♀️Not found to"
            " recognize."
        ),
        "recognized": (
            "🗣Recognized:\n{transcription}"
        ),
        "error": (
            "❌Error occurred during"
            " transcription."
        ),
        "recognition": (
            "🫥Recognition..."
        ),
        "downloading": "🐍Downloading, wait",
        "autowhisper_enabled": (
            "🫥Auto-whisper enabled"
            " in this chat."
        ),
        "autowhisper_disabled": (
            "🫥Auto-whisper disabled"
            " in this chat."
        ),
        "no_api": "❌ Insert openai api-key in config (.cfg whispermod)",
        "invalid_key": "❌ Invalid openai api-key",
        "hf_instructions": (
            "👩🎓 How to get hugging face api token:\n"
            "> Open Hugging Face and sign in. 👤 \n"
            "> Go to Settings → Access Tokens: https://huggingface.co/settings/tokens. ⚙️ \n"
            "> Click New Token. ➕ \n"
            "> Select permission: \"make calls to the serverless Inference API\". ⚙️ \n"
            "> Click Create Token. ➕ \n"
            "> Copy the token and paste it into the config. ✅"
        ),
        "hf_token_missing": (
            "❌Missing hugging face api token"
            " (.cfg whispermod)"
        ),
    }

    strings_ru = {
        "audio_not_found": (
            "👮♀️Не найдено, что"
            " распознавать."
        ),
        "recognized": (
            "🗣Распознано:\n{transcription}"
        ),
        "error": (
            "❌Ошибка при"
            " транскрипции."
        ),
        "recognition": (
            "🫥Распознавание..."
        ),
        "downloading": (
            "🐍Скачивание,"
            " подождите..."
        ),
        "autowhisper_enabled": (
            "🫥Автораспознавание"
            " включено в этом чате."
        ),
        "autowhisper_disabled": (
            "🫥Автораспознавание"
            " отключено в этом чате."
        ),
        "no_api": (
            "❌ Укажите api-ключ в конфиге"
            " (.cfg whispermod)"
        ),
        "invalid_key": (
            "❌ Неверный api-ключ"
        ),
        "hf_instructions": (
            "👩🎓 Как получить api-токен hugging face:\n"
            "> Откройте Hugging Face и войдите в аккаунт. 👤\n"
            "> Перейдите в Settings → Access Tokens: https://huggingface.co/settings/tokens. ⚙️\n"
            "> Нажмите New Token. ➕\n"
            "> Выберите разрешение: \"make calls to the serverless Inference API\". ⚙️\n"
            "> Нажмите Create Token. ➕\n"
            "> Скопируйте токен и вставьте его в конфиг. ✅"
        ),
        "hf_token_missing": (
            "❌Отсутствует api-токен hugging face"
            " (.cfg whispermod)"
        ),
    }

    # Extensions (lower-cased, with the leading dot) of downloaded media that
    # must be re-encoded to mp3 before being sent to OpenAI.
    _OGG_EXTENSIONS = (".oga", ".ogg")
    # Extensions that are uploaded to OpenAI as-is.
    _DIRECT_EXTENSIONS = (".mp3", ".m4a", ".wav", ".mpeg", ".mp4")
    _HF_ENDPOINT = (
        "https://router.huggingface.co/hf-inference/models/openai/whisper-large-v3-turbo"
    )
    # Seconds before a Hugging Face request is abandoned. Generous because the
    # request asks the service to wait for the model to load.
    _HF_TIMEOUT = 600

    def __init__(self):
        self.config = loader.ModuleConfig(
            loader.ConfigValue(
                "api_key",
                None,
                lambda: "Api key for Whisper (https://platform.openai.com/account/api-keys)",
                validator=loader.validators.Hidden(),
            ),
            loader.ConfigValue(
                "temperature",
                "0.2",
                lambda: (
                    "The sampling temperature, between 0 and 1. Higher values like 0.8"
                    " will make the output more random, while lower values like 0.2"
                    " will make it more focused and deterministic. If set to 0, the"
                    " model will use log probability to automatically increase the"
                    " temperature until certain thresholds are hit."
                ),
                validator=loader.validators.String(),
            ),
            loader.ConfigValue(
                "prompt",
                None,
                lambda: (
                    "An optional text to guide the model's style or continue a previous"
                    " audio segment. The prompt should match the audio language."
                ),
                validator=loader.validators.String(),
            ),
            loader.ConfigValue(
                "hf_api_key",
                None,
                lambda: "Api key for hugging face (look .hfguide)",
                validator=loader.validators.Hidden(),
            ),
            loader.ConfigValue(
                "auto_voice",
                True,
                lambda: "Enable auto-recognition for voice messages",
                validator=loader.validators.Boolean(),
            ),
            loader.ConfigValue(
                "auto_video",
                True,
                lambda: "Enable auto-recognition for video messages",
                validator=loader.validators.Boolean(),
            ),
        )

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    def _cleanup_files(self, *paths):
        """Delete every given path, ignoring missing files and `None` entries."""
        for path in paths:
            if path:
                with contextlib.suppress(OSError):
                    os.remove(path)

    def _convert_to_mp3(self, source_path, source_format):
        """Re-encode `source_path` to a fresh temporary mp3 and return its path.

        Blocking (pydub runs ffmpeg); call it through `utils.run_sync` from
        coroutines. The caller owns the returned file and must delete it.
        """
        # A unique name per call keeps concurrent transcriptions from
        # overwriting each other's converted audio.
        handle, target_path = tempfile.mkstemp(suffix=".mp3")
        os.close(handle)
        try:
            segment = AudioSegment.from_file(source_path, format=source_format)
            # export() hands back an open file object; close it right away.
            segment.export(target_path, format="mp3").close()
        except Exception:
            self._cleanup_files(target_path)
            raise
        return target_path

    async def _edit_status(self, chat_id, status_id, text):
        """Replace the text of the progress message `status_id` in `chat_id`."""
        await self.client.edit_message(chat_id, status_id, text)

    async def _toggle_autorecognition(self, message, state_key):
        """Flip the per-chat flag stored under `state_key` and report the new state."""
        chat_id = str(message.chat_id)
        state = self.get(state_key, {})
        if state.get(chat_id, False):
            state.pop(chat_id, None)
            status_message = self.strings["autowhisper_disabled"]
        else:
            state[chat_id] = True
            status_message = self.strings["autowhisper_enabled"]
        self.set(state_key, state)
        await utils.answer(message, status_message)

    def _is_auto_target(self, message, state_key):
        """Tell whether a watcher should transcribe `message`.

        True when the chat is enabled under `state_key`, the message is a voice
        or video message whose kind is enabled in the config, and it is not a
        gif, sticker or photo.
        """
        if not self.get(state_key, {}).get(str(message.chat_id), False):
            return False
        wanted = (message.voice and self.config["auto_voice"]) or (
            message.video and self.config["auto_video"]
        )
        if not wanted:
            return False
        return not (message.gif or message.sticker or message.photo)

    # ------------------------------------------------------------------
    # OpenAI back-end
    # ------------------------------------------------------------------

    async def _transcribe_openai(self, path):
        """Send the audio file at `path` to the `whisper-1` model and return the text."""
        client = openai.AsyncOpenAI(api_key=self.config["api_key"])
        request = {
            "model": "whisper-1",
            "temperature": self.config["temperature"],
        }
        # The prompt is optional; leave it out entirely when unset.
        if self.config["prompt"]:
            request["prompt"] = self.config["prompt"]
        with open(path, "rb") as audio_file:
            response = await client.audio.transcriptions.create(
                file=audio_file, **request
            )
        return response.text

    async def _process_openai(self, source, chat_id, status_id):
        """Download media from `source`, transcribe it with OpenAI and report.

        Progress, the result and any error are written into the message
        `status_id` of `chat_id`. Downloaded and converted files are always
        removed, whatever the outcome.
        """
        file = await source.download_media()
        if not file:
            await self._edit_status(
                chat_id, status_id, self.strings["audio_not_found"]
            )
            return

        converted = None
        try:
            extension = os.path.splitext(file)[1].lower()
            if (
                extension not in self._OGG_EXTENSIONS
                and extension not in self._DIRECT_EXTENSIONS
            ):
                await self._edit_status(
                    chat_id, status_id, self.strings["audio_not_found"]
                )
                return

            await self._edit_status(chat_id, status_id, self.strings["recognition"])
            try:
                upload_path = file
                if extension in self._OGG_EXTENSIONS:
                    # Run the ffmpeg conversion off the event loop.
                    converted = await utils.run_sync(
                        self._convert_to_mp3, file, "ogg"
                    )
                    upload_path = converted
                transcription = await self._transcribe_openai(upload_path)
            except openai.AuthenticationError:
                await self._edit_status(
                    chat_id, status_id, self.strings["invalid_key"]
                )
                return
            except Exception as e:
                await self._edit_status(chat_id, status_id, f"❌Error: {e}")
                return

            await self._edit_status(
                chat_id,
                status_id,
                self.strings["recognized"].format(transcription=transcription),
            )
        finally:
            self._cleanup_files(file, converted)

    @loader.command(ru_doc="распознать речь из голосового/видео сообщения в реплае, используя openai api")
    async def whisper(self, message: Message):
        """Transcribe speech from a voice/video message in reply using openai api"""
        if self.config["api_key"] is None:
            await utils.answer(message, self.strings["no_api"])
            return
        rep = await message.get_reply_message()
        if rep is None:
            # The command was not sent as a reply, so there is nothing to transcribe.
            await utils.answer(message, self.strings["audio_not_found"])
            return
        down = await utils.answer(message, self.strings["downloading"])
        await self._process_openai(rep, message.chat_id, down.id)

    @loader.command(
        ru_doc=(
            "включить/выключить автораспознавание голосовых и видео сообщений в чате"
            " где введена команда"
        )
    )
    async def autowhspr(self, message: Message):
        """Enable/disable auto-speech recognition for voice and video messages"""
        await self._toggle_autorecognition(message, "autowhspr")

    @loader.watcher(only_media=True)
    async def autowhisper_watcher(self, message: Message):
        """Watcher to automatically transcribe voice and video messages when auto-speech recognition is enabled"""
        if self._is_auto_target(message, "autowhspr"):
            await self.whisperwatch(message)

    async def whisperwatch(self, message: Message):
        """Transcribe speech from a voice/video message in reply"""
        if self.config["api_key"] is None:
            # Without a key nothing can be transcribed; stay silent rather than
            # answer every voice message with an error.
            return
        down = await self.client.send_message(
            message.chat_id, message=self.strings["downloading"], reply_to=message.id
        )
        await self._process_openai(message, message.chat_id, down.id)

    # ------------------------------------------------------------------
    # Hugging Face back-end
    # ------------------------------------------------------------------

    def _request_hf(self, audio_bytes):
        """POST `audio_bytes` to the Hugging Face endpoint and return the text.

        Blocking; call it through `utils.run_sync` from coroutines.

        Raises:
            RuntimeError: the service answered with a non-200 status or with a
                body that has no ``text`` field.
        """
        response = requests.post(
            url=self._HF_ENDPOINT,
            headers={
                "Authorization": f"Bearer {self.config['hf_api_key']}",
                "x-use-cache": "false",
                "x-wait-for-model": "true",
                "Content-Type": "application/json",
            },
            json={"inputs": base64.b64encode(audio_bytes).decode("utf-8")},
            timeout=self._HF_TIMEOUT,
        )
        if response.status_code != 200:
            raise RuntimeError(f"API Error: {response.text}")
        output = response.json()
        if not isinstance(output, dict) or "text" not in output:
            raise RuntimeError(f"API Error: {response.text}")
        return output["text"]

    async def _transcribe_hf(self, file, extension):
        """Transcribe the downloaded `file` through Hugging Face and return the text.

        OGG files are sent unchanged; anything else is first re-encoded to mp3,
        using `extension` (without its dot) as the input format.
        """
        converted = None
        try:
            payload_path = file
            if extension not in self._OGG_EXTENSIONS:
                converted = await utils.run_sync(
                    self._convert_to_mp3, file, extension.lstrip(".")
                )
                payload_path = converted
            with open(payload_path, "rb") as f:
                audio_bytes = f.read()
        finally:
            self._cleanup_files(converted)
        return await utils.run_sync(self._request_hf, audio_bytes)

    @loader.command(ru_doc="распознать речь из голосового/видео сообщения в реплае, используя hugging face api")
    async def hfwhisper(self, m: Message):
        """Transcribe speech from a voice/video message in reply using hugging face api"""
        if self.config["hf_api_key"] is None:
            await utils.answer(m, self.strings["hf_token_missing"])
            return
        rep = await m.get_reply_message()
        if rep is None:
            await utils.answer(m, self.strings["audio_not_found"])
            return
        await utils.answer(m, self.strings["downloading"])
        file = await rep.download_media()
        if not file:
            await utils.answer(m, self.strings["audio_not_found"])
            return

        try:
            extension = os.path.splitext(file)[1].lower()
            if (
                extension not in self._OGG_EXTENSIONS
                and extension not in self._DIRECT_EXTENSIONS
            ):
                return await utils.answer(m, self.strings["audio_not_found"])

            await utils.answer(m, self.strings["recognition"])
            try:
                text = await self._transcribe_hf(file, extension)
            except Exception:
                # Details go to the log; the user gets the generic message.
                logging.getLogger(__name__).exception(
                    "Hugging Face transcription failed"
                )
                return await utils.answer(m, self.strings["error"])
            return await utils.answer(
                m, self.strings["recognized"].format(transcription=text)
            )
        finally:
            self._cleanup_files(file)

    @loader.command(
        ru_doc=(
            "включить/выключить автораспознавание через Hugging Face API в текущем чате"
        )
    )
    async def hfautowhspr(self, message: Message):
        """Enable/disable auto-speech recognition using Hugging Face API"""
        await self._toggle_autorecognition(message, "hfautowhspr")

    @loader.watcher(only_media=True)
    async def hfautowhisper_watcher(self, message: Message):
        """Watcher for Hugging Face auto-transcription"""
        if self._is_auto_target(message, "hfautowhspr"):
            await self.hfwhisperwatch(message)

    async def hfwhisperwatch(self, message: Message):
        """Auto-transcribe using Hugging Face API"""
        if self.config["hf_api_key"] is None:
            return
        down = await self.client.send_message(
            message.chat_id, self.strings["downloading"], reply_to=message.id
        )
        file = await message.download_media()
        if not file:
            await self._edit_status(
                message.chat_id, down.id, self.strings["audio_not_found"]
            )
            return

        try:
            await self._edit_status(
                message.chat_id, down.id, self.strings["recognition"]
            )
            extension = os.path.splitext(file)[1].lower()
            text = await self._transcribe_hf(file, extension)
            await self._edit_status(
                message.chat_id,
                down.id,
                self.strings["recognized"].format(transcription=text),
            )
        except Exception as e:
            await self._edit_status(message.chat_id, down.id, f"❌ Error: {e}")
        finally:
            self._cleanup_files(file)

    # Sends the localized step-by-step instructions for obtaining a Hugging
    # Face access token.
    @loader.command(ru_doc="гайд как получить hugging face токен", en_doc="guide how to get hugging face token")
    async def hfguide(self, m: Message):
        await utils.answer(m, self.strings['hf_instructions'])