init

2026-04-19 20:51:49 +00:00 · 2026-04-05 00:43:23 +05:30
commit 8be37d3e92
425 changed files with 101853 additions and 0 deletions
--- a/thirdeye/backend/agents/voice_transcriber.py
+++ b/thirdeye/backend/agents/voice_transcriber.py
@@ -0,0 +1,194 @@
+"""
+Voice Transcriber — Groq Whisper integration.
+
+Uses Groq's whisper-large-v3 model (free, already in provider stack) to transcribe
+audio bytes from Telegram voice messages and video notes into plain text.
+
+Groq Whisper endpoint: https://api.groq.com/openai/v1/audio/transcriptions
+Supported formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, opus, wav, webm
+Telegram voice messages: OGG/Opus
+Telegram video notes:    MP4
+
+Free tier limits: 7,200 seconds of audio / hour on Groq free plan.
+At avg 30s per voice note: ~240 voice notes / hour — more than any team sends.
+"""
+import io
+import logging
+from typing import Optional
+
+import httpx
+
+from backend.config import (
+    GROQ_API_KEY,
+    VOICE_LANGUAGE,
+    VOICE_MAX_DURATION_SECONDS,
+    VOICE_MIN_DURATION_SECONDS,
+)
+
+logger = logging.getLogger("thirdeye.agents.voice_transcriber")  # module-level logger used by transcribe_audio
+
+GROQ_WHISPER_URL = "https://api.groq.com/openai/v1/audio/transcriptions"  # endpoint that receives the multipart upload
+WHISPER_MODEL = "whisper-large-v3"  # sent as the "model" form field of every request
+
+# Groq file size limit for Whisper: 25 MB
+GROQ_MAX_FILE_BYTES = 25 * 1024 * 1024  # larger payloads are rejected locally, before any request is made
+
+
+# --- Main transcription function ---------------------------------------------
+
+async def transcribe_audio(
+    audio_bytes: bytes,
+    filename: str = "audio.ogg",
+    duration_seconds: int = None,
+) -> dict:
+    """
+    Transcribe audio bytes using Groq Whisper.
+
+    Args:
+        audio_bytes:       Raw audio data (OGG, MP4, WAV, etc.)
+        filename:          Filename hint for the API (determines format detection)
+        duration_seconds:  Voice message duration from Telegram metadata (for pre-filtering)
+
+    Returns:
+        {
+            "ok": True,
+            "transcript": "The full transcribed text...",
+            "language": "en",
+            "duration": 45,
+            "word_count": 120,
+        }
+        OR on failure:
+        {
+            "ok": False,
+            "reason": "too_long" | "too_short" | "empty" | "file_too_large" | "api_error" | "no_speech",
+            "error": "optional error string",
+        }
+    """
+    # Pre-flight checks
+    if not GROQ_API_KEY or len(GROQ_API_KEY) < 5:
+        return {"ok": False, "reason": "api_error", "error": "GROQ_API_KEY not set"}
+
+    if not audio_bytes:
+        return {"ok": False, "reason": "empty", "error": "No audio bytes received"}
+
+    if len(audio_bytes) > GROQ_MAX_FILE_BYTES:
+        return {
+            "ok": False,
+            "reason": "file_too_large",
+            "error": f"Audio is {len(audio_bytes) / 1024 / 1024:.1f}MB — Groq limit is 25MB",
+        }
+
+    if duration_seconds is not None:
+        if duration_seconds < VOICE_MIN_DURATION_SECONDS:
+            return {
+                "ok": False,
+                "reason": "too_short",
+                "error": f"Voice note is {duration_seconds}s — minimum is {VOICE_MIN_DURATION_SECONDS}s",
+            }
+        if duration_seconds > VOICE_MAX_DURATION_SECONDS:
+            return {
+                "ok": False,
+                "reason": "too_long",
+                "error": f"Voice note is {duration_seconds}s — maximum is {VOICE_MAX_DURATION_SECONDS}s",
+            }
+
+    # Determine MIME type from filename extension
+    ext_to_mime = {
+        ".ogg": "audio/ogg",
+        ".opus": "audio/ogg",
+        ".mp3": "audio/mpeg",
+        ".mp4": "video/mp4",
+        ".m4a": "audio/mp4",
+        ".wav": "audio/wav",
+        ".flac": "audio/flac",
+        ".webm": "audio/webm",
+    }
+    ext = "." + filename.rsplit(".", 1)[-1].lower() if "." in filename else ".ogg"
+    mime_type = ext_to_mime.get(ext, "audio/ogg")
+
+    form_data = {
+        "model": WHISPER_MODEL,
+        "response_format": "verbose_json",   # returns language detection
+        "temperature": "0",                  # deterministic transcription
+    }
+    if VOICE_LANGUAGE:
+        form_data["language"] = VOICE_LANGUAGE
+
+    try:
+        async with httpx.AsyncClient(timeout=60.0) as client:
+            resp = await client.post(
+                GROQ_WHISPER_URL,
+                headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
+                files={"file": (filename, io.BytesIO(audio_bytes), mime_type)},
+                data=form_data,
+            )
+            resp.raise_for_status()
+            data = resp.json()
+
+    except httpx.HTTPStatusError as e:
+        error_text = ""
+        try:
+            error_text = e.response.json().get("error", {}).get("message", e.response.text[:200])
+        except Exception:
+            error_text = e.response.text[:200]
+
+        if e.response.status_code == 429:
+            logger.warning("Groq Whisper rate limited")
+            return {"ok": False, "reason": "api_error", "error": "Rate limited — try again shortly"}
+        logger.error(f"Groq Whisper HTTP error {e.response.status_code}: {error_text}")
+        return {"ok": False, "reason": "api_error", "error": f"HTTP {e.response.status_code}: {error_text}"}
+
+    except httpx.TimeoutException:
+        logger.warning("Groq Whisper request timed out")
+        return {"ok": False, "reason": "api_error", "error": "Request timed out after 60s"}
+
+    except Exception as e:
+        logger.error(f"Groq Whisper unexpected error: {e}")
+        return {"ok": False, "reason": "api_error", "error": str(e)}
+
+    # Parse response
+    transcript = (data.get("text") or "").strip()
+
+    if not transcript:
+        return {"ok": False, "reason": "no_speech", "error": "Whisper returned empty transcript"}
+
+    # Detect if Whisper only returned noise markers
+    noise_patterns = {"[music]", "[noise]", "[silence]", "[inaudible]", "(music)", "(noise)"}
+    if transcript.lower() in noise_patterns:
+        return {"ok": False, "reason": "no_speech", "error": f"Only noise detected: {transcript}"}
+
+    detected_language = data.get("language", VOICE_LANGUAGE or "unknown")
+    word_count = len(transcript.split())
+
+    logger.info(
+        f"Whisper transcribed {duration_seconds or '?'}s audio -> "
+        f"{word_count} words [{detected_language}]: {transcript[:60]}..."
+    )
+
+    return {
+        "ok": True,
+        "transcript": transcript,
+        "language": detected_language,
+        "duration": duration_seconds,
+        "word_count": word_count,
+    }
+
+
+# --- Telegram-specific download helper ---------------------------------------
+
+async def download_telegram_audio(bot, file_id: str) -> bytes:
+    """
+    Download a Telegram file (voice or video_note) and return raw bytes.
+    """
+    tg_file = await bot.get_file(file_id)
+    audio_bytes = await tg_file.download_as_bytearray()
+    return bytes(audio_bytes)
+
+
+def format_duration(seconds: int) -> str:
+    """Format seconds into human-readable string: '1m 34s' or '45s'."""
+    if seconds is None:
+        return "?"
+    if seconds >= 60:
+        return f"{seconds // 60}m {seconds % 60}s"
+    return f"{seconds}s"