""" Voice Transcriber — Groq Whisper integration. Uses Groq's whisper-large-v3 model (free, already in provider stack) to transcribe audio bytes from Telegram voice messages and video notes into plain text. Groq Whisper endpoint: https://api.groq.com/openai/v1/audio/transcriptions Supported formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, opus, wav, webm Telegram voice messages: OGG/Opus Telegram video notes: MP4 Free tier limits: 7,200 seconds of audio / hour on Groq free plan. At avg 30s per voice note: ~240 voice notes / hour — more than any team sends. """ import io import logging from typing import Optional import httpx from backend.config import ( GROQ_API_KEY, VOICE_LANGUAGE, VOICE_MAX_DURATION_SECONDS, VOICE_MIN_DURATION_SECONDS, ) logger = logging.getLogger("thirdeye.agents.voice_transcriber") GROQ_WHISPER_URL = "https://api.groq.com/openai/v1/audio/transcriptions" WHISPER_MODEL = "whisper-large-v3" # Groq file size limit for Whisper: 25 MB GROQ_MAX_FILE_BYTES = 25 * 1024 * 1024 # --- Main transcription function --------------------------------------------- async def transcribe_audio( audio_bytes: bytes, filename: str = "audio.ogg", duration_seconds: int = None, ) -> dict: """ Transcribe audio bytes using Groq Whisper. Args: audio_bytes: Raw audio data (OGG, MP4, WAV, etc.) filename: Filename hint for the API (determines format detection) duration_seconds: Voice message duration from Telegram metadata (for pre-filtering) Returns: { "ok": True, "transcript": "The full transcribed text...", "language": "en", "duration": 45, "word_count": 120, } OR on failure: { "ok": False, "reason": "too_long" | "too_short" | "empty" | "file_too_large" | "api_error" | "no_speech", "error": "optional error string", } """ # Pre-flight checks if not GROQ_API_KEY or len(GROQ_API_KEY) < 5: return {"ok": False, "reason": "api_error", "error": "GROQ_API_KEY not set"} if not audio_bytes: return {"ok": False, "reason": "empty", "error": "No audio bytes received"} if len(audio_bytes) > GROQ_MAX_FILE_BYTES: return { "ok": False, "reason": "file_too_large", "error": f"Audio is {len(audio_bytes) / 1024 / 1024:.1f}MB — Groq limit is 25MB", } if duration_seconds is not None: if duration_seconds < VOICE_MIN_DURATION_SECONDS: return { "ok": False, "reason": "too_short", "error": f"Voice note is {duration_seconds}s — minimum is {VOICE_MIN_DURATION_SECONDS}s", } if duration_seconds > VOICE_MAX_DURATION_SECONDS: return { "ok": False, "reason": "too_long", "error": f"Voice note is {duration_seconds}s — maximum is {VOICE_MAX_DURATION_SECONDS}s", } # Determine MIME type from filename extension ext_to_mime = { ".ogg": "audio/ogg", ".opus": "audio/ogg", ".mp3": "audio/mpeg", ".mp4": "video/mp4", ".m4a": "audio/mp4", ".wav": "audio/wav", ".flac": "audio/flac", ".webm": "audio/webm", } ext = "." + filename.rsplit(".", 1)[-1].lower() if "." in filename else ".ogg" mime_type = ext_to_mime.get(ext, "audio/ogg") form_data = { "model": WHISPER_MODEL, "response_format": "verbose_json", # returns language detection "temperature": "0", # deterministic transcription } if VOICE_LANGUAGE: form_data["language"] = VOICE_LANGUAGE try: async with httpx.AsyncClient(timeout=60.0) as client: resp = await client.post( GROQ_WHISPER_URL, headers={"Authorization": f"Bearer {GROQ_API_KEY}"}, files={"file": (filename, io.BytesIO(audio_bytes), mime_type)}, data=form_data, ) resp.raise_for_status() data = resp.json() except httpx.HTTPStatusError as e: error_text = "" try: error_text = e.response.json().get("error", {}).get("message", e.response.text[:200]) except Exception: error_text = e.response.text[:200] if e.response.status_code == 429: logger.warning("Groq Whisper rate limited") return {"ok": False, "reason": "api_error", "error": "Rate limited — try again shortly"} logger.error(f"Groq Whisper HTTP error {e.response.status_code}: {error_text}") return {"ok": False, "reason": "api_error", "error": f"HTTP {e.response.status_code}: {error_text}"} except httpx.TimeoutException: logger.warning("Groq Whisper request timed out") return {"ok": False, "reason": "api_error", "error": "Request timed out after 60s"} except Exception as e: logger.error(f"Groq Whisper unexpected error: {e}") return {"ok": False, "reason": "api_error", "error": str(e)} # Parse response transcript = (data.get("text") or "").strip() if not transcript: return {"ok": False, "reason": "no_speech", "error": "Whisper returned empty transcript"} # Detect if Whisper only returned noise markers noise_patterns = {"[music]", "[noise]", "[silence]", "[inaudible]", "(music)", "(noise)"} if transcript.lower() in noise_patterns: return {"ok": False, "reason": "no_speech", "error": f"Only noise detected: {transcript}"} detected_language = data.get("language", VOICE_LANGUAGE or "unknown") word_count = len(transcript.split()) logger.info( f"Whisper transcribed {duration_seconds or '?'}s audio -> " f"{word_count} words [{detected_language}]: {transcript[:60]}..." ) return { "ok": True, "transcript": transcript, "language": detected_language, "duration": duration_seconds, "word_count": word_count, } # --- Telegram-specific download helper --------------------------------------- async def download_telegram_audio(bot, file_id: str) -> bytes: """ Download a Telegram file (voice or video_note) and return raw bytes. """ tg_file = await bot.get_file(file_id) audio_bytes = await tg_file.download_as_bytearray() return bytes(audio_bytes) def format_duration(seconds: int) -> str: """Format seconds into human-readable string: '1m 34s' or '45s'.""" if seconds is None: return "?" if seconds >= 60: return f"{seconds // 60}m {seconds % 60}s" return f"{seconds}s"