Files
B.Tech-Project-III/thirdeye/backend/agents/voice_transcriber.py
2026-04-05 00:43:23 +05:30

194 lines
6.6 KiB
Python

"""
Voice Transcriber — Groq Whisper integration.
Uses Groq's whisper-large-v3 model (free, already in provider stack) to transcribe
audio bytes from Telegram voice messages and video notes into plain text.
Groq Whisper endpoint: https://api.groq.com/openai/v1/audio/transcriptions
Supported formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, opus, wav, webm
Telegram voice messages: OGG/Opus
Telegram video notes: MP4
Free tier limits: 7,200 seconds of audio / hour on Groq free plan.
At avg 30s per voice note: ~240 voice notes / hour — more than any team sends.
"""
import io
import logging
from typing import Optional
import httpx
from backend.config import (
GROQ_API_KEY,
VOICE_LANGUAGE,
VOICE_MAX_DURATION_SECONDS,
VOICE_MIN_DURATION_SECONDS,
)
# Module-scoped logger; the name is spelled out explicitly rather than derived from __name__.
logger = logging.getLogger("thirdeye.agents.voice_transcriber")

# Model requested from Groq, and the transcription endpoint it is served from.
WHISPER_MODEL = "whisper-large-v3"
GROQ_WHISPER_URL = "https://api.groq.com/openai/v1/audio/transcriptions"

# Largest upload Groq accepts for Whisper: 25 MB, expressed in bytes.
GROQ_MAX_FILE_BYTES = 25 * 1024 ** 2
# --- Main transcription function ---------------------------------------------
async def transcribe_audio(
    audio_bytes: bytes,
    filename: str = "audio.ogg",
    duration_seconds: Optional[int] = None,
) -> dict:
    """
    Transcribe audio bytes using Groq Whisper.

    Cheap local checks (API key, payload size, duration bounds) run first so
    that unusable input never costs an API call. The audio is then uploaded as
    multipart form data and the JSON response is normalised into a result dict.

    Args:
        audio_bytes: Raw audio data (OGG, MP4, WAV, etc.)
        filename: Filename hint for the API. Its extension also selects the
            MIME type sent with the upload; unknown or missing extensions
            fall back to "audio/ogg".
        duration_seconds: Voice message duration from Telegram metadata (for
            pre-filtering). When None, the min/max duration checks are skipped.

    Returns:
        On success:
        {
            "ok": True,
            "transcript": "The full transcribed text...",
            "language": "en",
            "duration": 45,
            "word_count": 120,
        }
        OR on failure:
        {
            "ok": False,
            "reason": "too_long" | "too_short" | "empty" | "file_too_large" | "api_error" | "no_speech",
            "error": "optional error string",
        }
        HTTP errors, timeouts and any other exception raised while performing
        the request are caught and reported as reason "api_error".
    """
    # --- Pre-flight checks (no network traffic yet) ---
    if not GROQ_API_KEY or len(GROQ_API_KEY) < 5:
        return {"ok": False, "reason": "api_error", "error": "GROQ_API_KEY not set"}
    if not audio_bytes:
        return {"ok": False, "reason": "empty", "error": "No audio bytes received"}
    if len(audio_bytes) > GROQ_MAX_FILE_BYTES:
        return {
            "ok": False,
            "reason": "file_too_large",
            "error": f"Audio is {len(audio_bytes) / 1024 / 1024:.1f}MB — Groq limit is 25MB",
        }
    if duration_seconds is not None:
        if duration_seconds < VOICE_MIN_DURATION_SECONDS:
            return {
                "ok": False,
                "reason": "too_short",
                "error": f"Voice note is {duration_seconds}s — minimum is {VOICE_MIN_DURATION_SECONDS}s",
            }
        if duration_seconds > VOICE_MAX_DURATION_SECONDS:
            return {
                "ok": False,
                "reason": "too_long",
                "error": f"Voice note is {duration_seconds}s — maximum is {VOICE_MAX_DURATION_SECONDS}s",
            }

    # --- Determine MIME type from filename extension ---
    ext_to_mime = {
        ".ogg": "audio/ogg",
        ".opus": "audio/ogg",
        ".mp3": "audio/mpeg",
        ".mpga": "audio/mpeg",
        ".mpeg": "video/mpeg",
        ".mp4": "video/mp4",
        ".m4a": "audio/mp4",
        ".wav": "audio/wav",
        ".flac": "audio/flac",
        ".webm": "audio/webm",
    }
    if "." in filename:
        ext = "." + filename.rsplit(".", 1)[-1].lower()
    else:
        ext = ".ogg"
    mime_type = ext_to_mime.get(ext, "audio/ogg")

    form_data = {
        "model": WHISPER_MODEL,
        "response_format": "verbose_json",  # returns language detection
        "temperature": "0",  # deterministic transcription
    }
    # Only pin the language when one is configured; otherwise the key is omitted.
    if VOICE_LANGUAGE:
        form_data["language"] = VOICE_LANGUAGE

    # --- Call the API ---
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(
                GROQ_WHISPER_URL,
                headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
                files={"file": (filename, io.BytesIO(audio_bytes), mime_type)},
                data=form_data,
            )
            resp.raise_for_status()
            data = resp.json()
    except httpx.HTTPStatusError as e:
        # Best effort: prefer the structured error message, fall back to the
        # first 200 characters of the raw body if the payload is not as expected.
        try:
            error_text = e.response.json().get("error", {}).get("message", e.response.text[:200])
        except Exception:
            error_text = e.response.text[:200]
        if e.response.status_code == 429:
            logger.warning("Groq Whisper rate limited")
            return {"ok": False, "reason": "api_error", "error": "Rate limited — try again shortly"}
        logger.error("Groq Whisper HTTP error %s: %s", e.response.status_code, error_text)
        return {"ok": False, "reason": "api_error", "error": f"HTTP {e.response.status_code}: {error_text}"}
    except httpx.TimeoutException:
        logger.warning("Groq Whisper request timed out")
        return {"ok": False, "reason": "api_error", "error": "Request timed out after 60s"}
    except Exception as e:
        # Catch-all boundary: callers always get a result dict from the request phase.
        logger.error("Groq Whisper unexpected error: %s", e)
        return {"ok": False, "reason": "api_error", "error": str(e)}

    # --- Parse response ---
    # A 2xx body that is valid JSON but not an object would otherwise crash on
    # .get() below, outside the try block; report it like other API failures.
    if not isinstance(data, dict):
        logger.error("Groq Whisper returned unexpected payload type: %s", type(data).__name__)
        return {"ok": False, "reason": "api_error", "error": "Unexpected response format from Whisper"}

    transcript = (data.get("text") or "").strip()
    if not transcript:
        return {"ok": False, "reason": "no_speech", "error": "Whisper returned empty transcript"}

    # Detect if Whisper only returned a noise marker (whole transcript must match).
    noise_patterns = {"[music]", "[noise]", "[silence]", "[inaudible]", "(music)", "(noise)"}
    if transcript.lower() in noise_patterns:
        return {"ok": False, "reason": "no_speech", "error": f"Only noise detected: {transcript}"}

    # `or` chain so that a missing, null or empty "language" field still falls back.
    detected_language = data.get("language") or VOICE_LANGUAGE or "unknown"
    word_count = len(transcript.split())
    logger.info(
        "Whisper transcribed %ss audio -> %s words [%s]: %s...",
        "?" if duration_seconds is None else duration_seconds,
        word_count,
        detected_language,
        transcript[:60],
    )
    return {
        "ok": True,
        "transcript": transcript,
        "language": detected_language,
        "duration": duration_seconds,
        "word_count": word_count,
    }
# --- Telegram-specific download helper ---------------------------------------
async def download_telegram_audio(bot, file_id: str) -> bytes:
    """
    Fetch a Telegram file (voice or video_note) and hand back its raw bytes.

    The file handle is looked up with ``bot.get_file(file_id)``, its content is
    pulled with ``download_as_bytearray()``, and the resulting bytearray is
    converted to an immutable ``bytes`` object before returning.
    """
    remote_file = await bot.get_file(file_id)
    return bytes(await remote_file.download_as_bytearray())
def format_duration(seconds: Optional[int]) -> str:
    """Format seconds into human-readable string: '1m 34s' or '45s'.

    Args:
        seconds: Duration in seconds, or None when the duration is unknown.
            Fractional values are truncated to whole seconds so the output
            never contains decimals such as '1.0m 30.5s'.

    Returns:
        "?" for None, "<n>s" for durations under a minute, otherwise
        "<m>m <s>s". Minutes are not rolled over into hours.
    """
    if seconds is None:
        return "?"
    whole_seconds = int(seconds)
    if whole_seconds < 60:
        return f"{whole_seconds}s"
    minutes, remainder = divmod(whole_seconds, 60)
    return f"{minutes}m {remainder}s"