mirror of
https://github.com/arkorty/B.Tech-Project-III.git
synced 2026-04-19 20:51:49 +00:00
init
This commit is contained in:
194
thirdeye/backend/agents/voice_transcriber.py
Normal file
194
thirdeye/backend/agents/voice_transcriber.py
Normal file
@@ -0,0 +1,194 @@
|
||||
"""
|
||||
Voice Transcriber — Groq Whisper integration.
|
||||
|
||||
Uses Groq's whisper-large-v3 model (free, already in provider stack) to transcribe
|
||||
audio bytes from Telegram voice messages and video notes into plain text.
|
||||
|
||||
Groq Whisper endpoint: https://api.groq.com/openai/v1/audio/transcriptions
|
||||
Supported formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, opus, wav, webm
|
||||
Telegram voice messages: OGG/Opus
|
||||
Telegram video notes: MP4
|
||||
|
||||
Free tier limits: 7,200 seconds of audio / hour on Groq free plan.
|
||||
At avg 30s per voice note: ~240 voice notes / hour — more than any team sends.
|
||||
"""
|
||||
import io
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from backend.config import (
|
||||
GROQ_API_KEY,
|
||||
VOICE_LANGUAGE,
|
||||
VOICE_MAX_DURATION_SECONDS,
|
||||
VOICE_MIN_DURATION_SECONDS,
|
||||
)
|
||||
|
||||
# Module-scoped logger for the voice transcription agent.
logger = logging.getLogger("thirdeye.agents.voice_transcriber")

# Groq transcription endpoint and the Whisper model requested from it.
GROQ_WHISPER_URL = "https://api.groq.com/openai/v1/audio/transcriptions"
WHISPER_MODEL = "whisper-large-v3"

# Upload ceiling for Groq Whisper: 25 MB, expressed in bytes.
GROQ_MAX_FILE_BYTES = 25 * 1024 ** 2
|
||||
|
||||
|
||||
# --- Main transcription function ---------------------------------------------
|
||||
|
||||
# Filename extension -> MIME type sent with the multipart upload.
_EXT_TO_MIME = {
    ".ogg": "audio/ogg",
    ".opus": "audio/ogg",
    ".mp3": "audio/mpeg",
    ".mp4": "video/mp4",
    ".m4a": "audio/mp4",
    ".wav": "audio/wav",
    ".flac": "audio/flac",
    ".webm": "audio/webm",
}

# Markers treated as "no speech" when they are all the transcript contains.
_NOISE_MARKERS = frozenset(
    {"[music]", "[noise]", "[silence]", "[inaudible]", "(music)", "(noise)"}
)


def _guess_mime_type(filename: str) -> str:
    """Map a filename's extension to a MIME type, defaulting to OGG audio."""
    _, dot, suffix = filename.rpartition(".")
    ext = f".{suffix.lower()}" if dot else ".ogg"
    return _EXT_TO_MIME.get(ext, "audio/ogg")


def _is_noise_only(transcript: str) -> bool:
    """Return True when every token of the transcript is a known noise marker."""
    # Trailing punctuation is stripped so "[Music]." still counts as a marker.
    tokens = [tok.strip(".,;:!?") for tok in transcript.lower().split()]
    tokens = [tok for tok in tokens if tok]
    return bool(tokens) and all(tok in _NOISE_MARKERS for tok in tokens)


async def transcribe_audio(
    audio_bytes: bytes,
    filename: str = "audio.ogg",
    duration_seconds: Optional[int] = None,
) -> dict:
    """
    Transcribe audio bytes using Groq Whisper.

    This function never raises for expected failures; every outcome is
    reported through the returned dict.

    Args:
        audio_bytes: Raw audio data (OGG, MP4, WAV, etc.)
        filename: Filename hint for the API (determines format detection).
            A missing/empty value falls back to "audio.ogg".
        duration_seconds: Voice message duration from Telegram metadata (for
            pre-filtering). When None, the duration checks are skipped.

    Returns:
        On success:
        {
            "ok": True,
            "transcript": "The full transcribed text...",
            "language": "en",
            "duration": 45,      # echoes duration_seconds; None if not supplied
            "word_count": 120,
        }
        On failure:
        {
            "ok": False,
            "reason": "too_long" | "too_short" | "empty" | "file_too_large" | "api_error" | "no_speech",
            "error": "human-readable error string",
        }
    """
    # --- Pre-flight checks: reject cheaply before spending an API call -------
    if not GROQ_API_KEY or len(GROQ_API_KEY) < 5:
        return {"ok": False, "reason": "api_error", "error": "GROQ_API_KEY not set"}

    if not audio_bytes:
        return {"ok": False, "reason": "empty", "error": "No audio bytes received"}

    if len(audio_bytes) > GROQ_MAX_FILE_BYTES:
        mib = 1024 * 1024
        # The limit shown is derived from the constant so the two cannot drift.
        return {
            "ok": False,
            "reason": "file_too_large",
            "error": (
                f"Audio is {len(audio_bytes) / mib:.1f}MB — "
                f"Groq limit is {GROQ_MAX_FILE_BYTES / mib:g}MB"
            ),
        }

    if duration_seconds is not None:
        if duration_seconds < VOICE_MIN_DURATION_SECONDS:
            return {
                "ok": False,
                "reason": "too_short",
                "error": f"Voice note is {duration_seconds}s — minimum is {VOICE_MIN_DURATION_SECONDS}s",
            }
        if duration_seconds > VOICE_MAX_DURATION_SECONDS:
            return {
                "ok": False,
                "reason": "too_long",
                "error": f"Voice note is {duration_seconds}s — maximum is {VOICE_MAX_DURATION_SECONDS}s",
            }

    # --- Build the multipart request -----------------------------------------
    # Guard against None / "" so the extension lookup cannot raise and the API
    # always receives a usable filename hint.
    filename = filename or "audio.ogg"
    mime_type = _guess_mime_type(filename)

    form_data = {
        "model": WHISPER_MODEL,
        "response_format": "verbose_json",  # returns language detection
        "temperature": "0",  # deterministic transcription
    }
    if VOICE_LANGUAGE:
        form_data["language"] = VOICE_LANGUAGE

    timeout_seconds = 60.0

    # --- Call Groq -----------------------------------------------------------
    try:
        async with httpx.AsyncClient(timeout=timeout_seconds) as client:
            resp = await client.post(
                GROQ_WHISPER_URL,
                headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
                files={"file": (filename, io.BytesIO(audio_bytes), mime_type)},
                data=form_data,
            )
            resp.raise_for_status()
            payload = resp.json()

    except httpx.HTTPStatusError as e:
        status = e.response.status_code
        if status == 429:
            logger.warning("Groq Whisper rate limited")
            return {"ok": False, "reason": "api_error", "error": "Rate limited — try again shortly"}

        # Prefer the structured API error message; fall back to the raw body
        # when the error response is not the expected JSON shape.
        try:
            error_text = e.response.json().get("error", {}).get("message", e.response.text[:200])
        except Exception:
            error_text = e.response.text[:200]

        logger.error("Groq Whisper HTTP error %s: %s", status, error_text)
        return {"ok": False, "reason": "api_error", "error": f"HTTP {status}: {error_text}"}

    except httpx.TimeoutException:
        logger.warning("Groq Whisper request timed out")
        return {
            "ok": False,
            "reason": "api_error",
            "error": f"Request timed out after {timeout_seconds:g}s",
        }

    except Exception as e:
        # Boundary catch-all: callers rely on a dict result, never an exception.
        logger.exception("Groq Whisper unexpected error: %s", e)
        return {"ok": False, "reason": "api_error", "error": str(e)}

    # --- Parse response ------------------------------------------------------
    if not isinstance(payload, dict):
        # A non-object JSON body would otherwise crash on .get() below.
        logger.error(
            "Groq Whisper returned unexpected payload type: %s", type(payload).__name__
        )
        return {
            "ok": False,
            "reason": "api_error",
            "error": "Unexpected response format from Whisper API",
        }

    transcript = (payload.get("text") or "").strip()
    if not transcript:
        return {"ok": False, "reason": "no_speech", "error": "Whisper returned empty transcript"}

    # Detect if Whisper only returned noise markers (possibly repeated).
    if _is_noise_only(transcript):
        return {"ok": False, "reason": "no_speech", "error": f"Only noise detected: {transcript}"}

    # `or` chain also covers an explicit null "language" field in the payload.
    detected_language = payload.get("language") or VOICE_LANGUAGE or "unknown"
    word_count = len(transcript.split())

    logger.info(
        "Whisper transcribed %ss audio -> %d words [%s]: %s...",
        duration_seconds if duration_seconds is not None else "?",
        word_count,
        detected_language,
        transcript[:60],
    )

    return {
        "ok": True,
        "transcript": transcript,
        "language": detected_language,
        "duration": duration_seconds,
        "word_count": word_count,
    }
|
||||
|
||||
|
||||
# --- Telegram-specific download helper ---------------------------------------
|
||||
|
||||
async def download_telegram_audio(bot, file_id: str) -> bytes:
    """
    Fetch a Telegram file (voice or video_note) by its file id.

    Resolves the file through ``bot.get_file`` and downloads its contents
    with ``download_as_bytearray``, returning them as an immutable ``bytes``
    object.
    """
    remote_file = await bot.get_file(file_id)
    return bytes(await remote_file.download_as_bytearray())
|
||||
|
||||
|
||||
def format_duration(seconds: int) -> str:
    """Render a duration in seconds as '1m 34s' or '45s'; None renders as '?'."""
    if seconds is not None and seconds >= 60:
        minutes, remainder = divmod(seconds, 60)
        return f"{minutes}m {remainder}s"
    return "?" if seconds is None else f"{seconds}s"
|
||||
Reference in New Issue
Block a user