mirror of
https://github.com/arkorty/B.Tech-Project-III.git
synced 2026-04-19 12:41:48 +00:00
194 lines
6.6 KiB
Python
194 lines
6.6 KiB
Python
"""
|
|
Voice Transcriber — Groq Whisper integration.
|
|
|
|
Uses Groq's whisper-large-v3 model (free, already in provider stack) to transcribe
|
|
audio bytes from Telegram voice messages and video notes into plain text.
|
|
|
|
Groq Whisper endpoint: https://api.groq.com/openai/v1/audio/transcriptions
|
|
Supported formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, opus, wav, webm
|
|
Telegram voice messages: OGG/Opus
|
|
Telegram video notes: MP4
|
|
|
|
Free tier limits: 7,200 seconds of audio / hour on Groq free plan.
|
|
At avg 30s per voice note: ~240 voice notes / hour — more than any team sends.
|
|
"""
|
|
import io
|
|
import logging
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
|
|
from backend.config import (
|
|
GROQ_API_KEY,
|
|
VOICE_LANGUAGE,
|
|
VOICE_MAX_DURATION_SECONDS,
|
|
VOICE_MIN_DURATION_SECONDS,
|
|
)
|
|
|
|
# Module-level logger for the voice transcriber agent.
logger = logging.getLogger("thirdeye.agents.voice_transcriber")

# Groq's OpenAI-compatible transcription endpoint and the Whisper model to request.
GROQ_WHISPER_URL = "https://api.groq.com/openai/v1/audio/transcriptions"
WHISPER_MODEL = "whisper-large-v3"

# Upload size cap enforced before calling Groq Whisper: 25 MB, expressed in bytes.
GROQ_MAX_FILE_BYTES = 25 * 1024 ** 2
|
|
|
|
|
|
# --- Main transcription function ---------------------------------------------
|
|
|
|
async def transcribe_audio(
    audio_bytes: bytes,
    filename: str = "audio.ogg",
    duration_seconds: Optional[int] = None,
) -> dict:
    """
    Transcribe audio bytes using Groq Whisper.

    The function never raises for request failures: every failure path returns
    a dict with ``"ok": False`` so callers can branch on the ``reason`` field.

    Args:
        audio_bytes: Raw audio data (OGG, MP4, WAV, etc.)
        filename: Filename hint for the API; its extension selects the MIME
            type sent with the upload (falls back to ``audio/ogg``).
        duration_seconds: Voice message duration from Telegram metadata. When
            given, it is checked against VOICE_MIN_DURATION_SECONDS and
            VOICE_MAX_DURATION_SECONDS before any network call is made.

    Returns:
        On success:
        {
            "ok": True,
            "transcript": "The full transcribed text...",
            "language": "en",
            "duration": 45,          # echoes ``duration_seconds`` (may be None)
            "word_count": 120,
        }
        On failure:
        {
            "ok": False,
            "reason": "too_long" | "too_short" | "empty" | "file_too_large" | "api_error" | "no_speech",
            "error": "optional error string",
        }
    """
    # --- Pre-flight checks: reject cheaply before spending an API call -------
    if not GROQ_API_KEY or len(GROQ_API_KEY) < 5:
        return {"ok": False, "reason": "api_error", "error": "GROQ_API_KEY not set"}

    if not audio_bytes:
        return {"ok": False, "reason": "empty", "error": "No audio bytes received"}

    if len(audio_bytes) > GROQ_MAX_FILE_BYTES:
        return {
            "ok": False,
            "reason": "file_too_large",
            "error": f"Audio is {len(audio_bytes) / 1024 / 1024:.1f}MB — Groq limit is 25MB",
        }

    if duration_seconds is not None:
        if duration_seconds < VOICE_MIN_DURATION_SECONDS:
            return {
                "ok": False,
                "reason": "too_short",
                "error": f"Voice note is {duration_seconds}s — minimum is {VOICE_MIN_DURATION_SECONDS}s",
            }
        if duration_seconds > VOICE_MAX_DURATION_SECONDS:
            return {
                "ok": False,
                "reason": "too_long",
                "error": f"Voice note is {duration_seconds}s — maximum is {VOICE_MAX_DURATION_SECONDS}s",
            }

    # --- Determine MIME type from the filename extension ---------------------
    ext_to_mime = {
        ".ogg": "audio/ogg",
        ".oga": "audio/ogg",
        ".opus": "audio/ogg",
        ".mp3": "audio/mpeg",
        ".mpeg": "audio/mpeg",
        ".mpga": "audio/mpeg",
        ".mp4": "video/mp4",
        ".m4a": "audio/mp4",
        ".wav": "audio/wav",
        ".flac": "audio/flac",
        ".webm": "audio/webm",
    }
    # No extension at all -> treat as OGG; unknown extensions also fall back to OGG.
    ext = "." + filename.rsplit(".", 1)[-1].lower() if "." in filename else ".ogg"
    mime_type = ext_to_mime.get(ext, "audio/ogg")

    form_data = {
        "model": WHISPER_MODEL,
        "response_format": "verbose_json",  # returns language detection
        "temperature": "0",  # deterministic transcription
    }
    if VOICE_LANGUAGE:
        form_data["language"] = VOICE_LANGUAGE

    # Single source of truth for the timeout so the error message cannot drift.
    timeout_seconds = 60.0

    try:
        async with httpx.AsyncClient(timeout=timeout_seconds) as client:
            resp = await client.post(
                GROQ_WHISPER_URL,
                headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
                files={"file": (filename, io.BytesIO(audio_bytes), mime_type)},
                data=form_data,
            )
            resp.raise_for_status()
            data = resp.json()

    except httpx.HTTPStatusError as e:
        # Prefer the structured error message; fall back to the raw body if the
        # response is not JSON or does not have the expected shape.
        try:
            error_text = e.response.json().get("error", {}).get("message", e.response.text[:200])
        except Exception:
            error_text = e.response.text[:200]

        if e.response.status_code == 429:
            logger.warning("Groq Whisper rate limited")
            return {"ok": False, "reason": "api_error", "error": "Rate limited — try again shortly"}

        logger.error("Groq Whisper HTTP error %s: %s", e.response.status_code, error_text)
        return {"ok": False, "reason": "api_error", "error": f"HTTP {e.response.status_code}: {error_text}"}

    except httpx.TimeoutException:
        logger.warning("Groq Whisper request timed out")
        return {"ok": False, "reason": "api_error", "error": f"Request timed out after {timeout_seconds:.0f}s"}

    except Exception as e:
        # Boundary catch-all: this function reports failures via its return value.
        logger.error("Groq Whisper unexpected error: %s", e)
        return {"ok": False, "reason": "api_error", "error": str(e)}

    # --- Parse response ------------------------------------------------------
    # A JSON body that is not an object (list, string, null) would make the
    # ``.get`` calls below raise outside the try block; report it instead.
    if not isinstance(data, dict):
        logger.error("Groq Whisper returned unexpected payload type: %s", type(data).__name__)
        return {"ok": False, "reason": "api_error", "error": "Unexpected response format from Groq Whisper"}

    transcript = (data.get("text") or "").strip()

    if not transcript:
        return {"ok": False, "reason": "no_speech", "error": "Whisper returned empty transcript"}

    # Detect if Whisper only returned noise markers
    noise_patterns = {"[music]", "[noise]", "[silence]", "[inaudible]", "(music)", "(noise)"}
    if transcript.lower() in noise_patterns:
        return {"ok": False, "reason": "no_speech", "error": f"Only noise detected: {transcript}"}

    # ``or`` chain also covers a present-but-null "language" field.
    detected_language = data.get("language") or VOICE_LANGUAGE or "unknown"
    word_count = len(transcript.split())

    # Explicit None check so a reported duration of 0 is not logged as "?".
    duration_label = "?" if duration_seconds is None else duration_seconds
    # Only mark the preview as truncated when it actually is.
    preview = transcript[:60] + ("..." if len(transcript) > 60 else "")
    logger.info(
        "Whisper transcribed %ss audio -> %d words [%s]: %s",
        duration_label,
        word_count,
        detected_language,
        preview,
    )

    return {
        "ok": True,
        "transcript": transcript,
        "language": detected_language,
        "duration": duration_seconds,
        "word_count": word_count,
    }
|
|
|
|
|
|
# --- Telegram-specific download helper ---------------------------------------
|
|
|
|
async def download_telegram_audio(bot, file_id: str) -> bytes:
    """
    Fetch a Telegram file (voice or video_note) by its file ID.

    Resolves the file through ``bot.get_file`` and downloads its content,
    converting the resulting bytearray to immutable ``bytes`` before returning.
    """
    telegram_file = await bot.get_file(file_id)
    payload = await telegram_file.download_as_bytearray()
    return bytes(payload)
|
|
|
|
|
|
def format_duration(seconds: int) -> str:
    """Render a duration in seconds as '1m 34s', or '45s' when under a minute.

    Returns '?' when ``seconds`` is None.
    """
    if seconds is None:
        return "?"
    if seconds >= 60:
        # divmod yields the same (quotient, remainder) pair as // and %.
        minutes, leftover = divmod(seconds, 60)
        return f"{minutes}m {leftover}s"
    return f"{seconds}s"