Files
B.Tech-Project-III/thirdeye/backend/agents/voice_handler.py
2026-04-05 00:43:23 +05:30

281 lines
10 KiB
Python

"""
Voice Handler
Orchestrates the full pipeline for Telegram voice messages and video notes:
Telegram voice/video_note message
-> download audio bytes
-> transcribe via Groq Whisper (voice_transcriber.py)
-> build a voice_transcript signal (stored raw for full-text search)
-> run transcript through process_message_batch (signal extraction)
-> all extracted signals carry voice attribution metadata
Voice metadata attached to every extracted signal:
source: "voice"
voice_file_id: Telegram file ID
voice_duration: seconds
speaker: sender display name
"""
import logging
import uuid
from datetime import datetime, timezone
from backend.agents.voice_transcriber import (
transcribe_audio, download_telegram_audio, format_duration
)
from backend.config import ENABLE_VOICE_TRANSCRIPTION, VOICE_STORE_TRANSCRIPT
from backend.db.chroma import store_signals
from backend.pipeline import process_message_batch
logger = logging.getLogger("thirdeye.agents.voice_handler")
# --- Voice transcript signal builder -----------------------------------------
def build_voice_transcript_signal(
    transcript: str,
    sender: str,
    group_id: str,
    voice_file_id: str,
    duration_seconds: int,
    language: str,
    timestamp: str,
) -> dict:
    """
    Create the raw-transcript signal for a voice message.

    The full transcription is persisted as its own low-severity signal so
    the text stays full-text searchable in ChromaDB even when structured
    extraction produces nothing.
    """
    speaker_tag = f"@{sender}"
    preview = transcript[:200]
    signal = {
        "id": str(uuid.uuid4()),
        "type": "voice_transcript",
        "summary": f"[Voice {format_duration(duration_seconds)}] {speaker_tag}: {preview}",
        "raw_quote": transcript,
        "severity": "low",
        "status": "transcribed",
        "sentiment": "neutral",
        "urgency": "none",
        "entities": [speaker_tag],
        "keywords": _extract_voice_keywords(transcript),
        "timestamp": timestamp,
        "group_id": group_id,
        "lens": "voice",
        "source": "voice",
        "voice_file_id": voice_file_id,
        "voice_duration": duration_seconds,
        "voice_language": language,
        "speaker": sender,
    }
    return signal
def _extract_voice_keywords(text: str) -> list[str]:
"""Simple keyword extraction from transcript text."""
stopwords = {
"the", "a", "an", "is", "are", "was", "were", "will", "to", "of",
"in", "on", "at", "for", "by", "with", "this", "that", "and", "or",
"but", "we", "i", "it", "be", "do", "have", "has", "had", "not",
"so", "just", "like", "yeah", "okay", "um", "uh", "you", "me",
}
words = text.lower().split()
keywords = [w.strip(".,!?;:\"'") for w in words if len(w) > 3 and w not in stopwords]
return list(dict.fromkeys(keywords))[:12]
def _inject_voice_metadata(signals: list, voice_meta: dict) -> list[dict]:
"""
Inject voice attribution into every signal extracted from a voice transcript.
Accepts both Signal Pydantic model objects and plain dicts.
This ensures /ask can cite the voice source in its answers.
"""
result = []
for signal in signals:
sig = signal.model_dump() if hasattr(signal, "model_dump") else dict(signal)
sig["source"] = "voice"
sig["voice_file_id"] = voice_meta.get("voice_file_id", "")
sig["voice_duration"] = voice_meta.get("duration_seconds", 0)
sig["voice_language"] = voice_meta.get("language", "")
sig["speaker"] = voice_meta.get("sender", "Unknown")
if "[Voice]" not in sig.get("summary", ""):
sig["summary"] = f"[Voice @{voice_meta.get('sender', '?')}] {sig['summary']}"
result.append(sig)
return result
# --- Fallback signal builder -------------------------------------------------
# Keywords that hint at a signal type when the LLM extraction returns nothing
# Maps each candidate signal type to a set of hint words. The fallback
# builder scores a transcript against every set (by word overlap) and
# picks the type with the highest hit count.
_FALLBACK_TYPE_HINTS = {
    # Requests for new/changed functionality or design work.
    "feature_request": {
        "need", "needs", "required", "require", "want", "should", "missing",
        "add", "feature", "ui", "ux", "design", "change", "changes", "update",
        "improve", "improvement", "responsiveness", "responsive",
    },
    # Something is preventing progress right now.
    "blocker": {
        "blocked", "blocking", "blocker", "stuck", "waiting", "can't", "cannot",
        "issue", "problem", "broken", "fails", "failing",
    },
    # A concrete task someone intends to do.
    "action_item": {
        "will", "going", "plan", "todo", "do", "fix", "implement", "setup",
        "create", "build", "deploy", "check",
    },
    # Potential future problems, deadline pressure, worry.
    "risk": {
        "risk", "risky", "concern", "worried", "urgent", "urgently", "critical",
        "deadline", "delay", "late",
    },
}
def _build_fallback_signal(
    transcript: str,
    sender: str,
    group_id: str,
    timestamp: str,
    voice_meta: dict,
) -> dict:
    """
    Construct a best-effort structured signal for a transcript the LLM
    yielded nothing for.

    The signal type is chosen by counting hint-word overlaps per type in
    _FALLBACK_TYPE_HINTS (defaulting to 'feature_request' when nothing
    matches), and severity escalates to 'high' when urgency vocabulary
    appears in the transcript.
    """
    tokens = set(transcript.lower().split())

    # Score every candidate type by how many of its hint words appear.
    hit_counts = {
        candidate: len(tokens & hints)
        for candidate, hints in _FALLBACK_TYPE_HINTS.items()
    }
    if any(hit_counts.values()):
        chosen_type = max(hit_counts, key=hit_counts.get)
    else:
        chosen_type = "feature_request"

    urgency_words = {"urgent", "urgently", "asap", "immediately", "critical", "now"}
    if tokens & urgency_words:
        severity = "high"
    else:
        severity = "medium"

    summary = transcript[:200].strip()
    if len(transcript) > 200:
        summary = summary + "..."

    return {
        "id": str(uuid.uuid4()),
        "type": chosen_type,
        "summary": f"[Voice @{sender}] {summary}",
        "raw_quote": transcript[:500],
        "severity": severity,
        "status": "unresolved",
        "sentiment": "neutral",
        "urgency": "high" if severity == "high" else "medium",
        "entities": [f"@{sender}"],
        "keywords": _extract_voice_keywords(transcript),
        "timestamp": timestamp,
        "group_id": group_id,
        "lens": "voice",
        "source": "voice",
        "speaker": sender,
        "voice_file_id": voice_meta.get("voice_file_id", ""),
        "voice_duration": voice_meta.get("duration_seconds", 0),
        "voice_language": voice_meta.get("language", ""),
    }
# --- Main handler ------------------------------------------------------------
async def handle_voice_message(
    bot,
    group_id: str,
    sender: str,
    file_id: str,
    duration_seconds: int,
    message_date,
    is_video_note: bool = False,
) -> dict:
    """
    Run the full pipeline for one voice message or video note.

    Steps: download audio -> transcribe via Groq Whisper -> (optionally)
    store the raw transcript signal -> run the transcript through the
    normal signal-extraction pipeline, stamping voice attribution on
    every extracted signal. If the LLM extracts nothing from a
    non-trivial transcript, a keyword-based fallback signal is stored so
    the content remains queryable as structured data.

    Args:
        bot: Telegram bot instance used to download the file.
        group_id: Chat/group identifier the message belongs to.
        sender: Display name of the sender.
        file_id: Telegram file ID of the audio payload.
        duration_seconds: Reported audio duration in seconds.
        message_date: datetime of the Telegram message (assumed to be in
            UTC — TODO confirm against the Telegram handler), or None to
            fall back to the current time.
        is_video_note: True for round video notes (mp4 container).

    Returns:
        {"ok": True, "transcript": ..., "signals_extracted": ..., ...}
        on success, or {"ok": False, "reason": ..., "error": ...}.
    """
    if not ENABLE_VOICE_TRANSCRIPTION:
        return {"ok": False, "reason": "disabled", "error": "Voice transcription is disabled"}
    msg_type = "video note" if is_video_note else "voice message"
    logger.info(f"Processing {msg_type} from {sender} in {group_id} ({duration_seconds}s)")
    # 1. Download audio
    try:
        audio_bytes = await download_telegram_audio(bot, file_id)
    except Exception as e:
        logger.error(f"Failed to download audio from {sender}: {e}")
        return {"ok": False, "reason": "download_failed", "error": str(e)}
    # 2. Transcribe — video notes arrive as mp4, voice messages as ogg/opus
    filename = "audio.mp4" if is_video_note else "audio.ogg"
    transcription = await transcribe_audio(
        audio_bytes,
        filename=filename,
        duration_seconds=duration_seconds,
    )
    if not transcription["ok"]:
        logger.info(f"Transcription skipped for {sender}: {transcription['reason']}")
        return {"ok": False, "reason": transcription["reason"], "error": transcription.get("error", "")}
    transcript = transcription["transcript"]
    language = transcription.get("language", "unknown")
    # Bug fixed: the fallback branch used deprecated, NAIVE datetime.utcnow(),
    # producing ISO strings without a UTC offset while the message_date branch
    # produced aware ones — inconsistent timestamp formats in storage. Both
    # branches now emit timezone-aware UTC timestamps.
    timestamp = (
        message_date.replace(tzinfo=timezone.utc).isoformat()
        if message_date else datetime.now(timezone.utc).isoformat()
    )
    # 3. Store raw voice transcript signal (searchable even with 0 extractions)
    if VOICE_STORE_TRANSCRIPT:
        transcript_signal = build_voice_transcript_signal(
            transcript=transcript,
            sender=sender,
            group_id=group_id,
            voice_file_id=file_id,
            duration_seconds=duration_seconds,
            language=language,
            timestamp=timestamp,
        )
        store_signals(group_id, [transcript_signal])
        logger.info(f"Voice transcript stored for {sender} ({len(transcript)} chars)")
    # 4. Run through signal extraction pipeline — treat as a regular text message
    voice_meta = {
        "sender": sender,
        "voice_file_id": file_id,
        "duration_seconds": duration_seconds,
        "language": language,
    }
    messages = [{
        "sender": sender,
        "text": transcript,
        "timestamp": timestamp,
        "source": "voice",
        "voice_file_id": file_id,
        "voice_duration": duration_seconds,
    }]
    try:
        extracted_signals = await process_message_batch(group_id, messages)
        extracted_signals = _inject_voice_metadata(extracted_signals, voice_meta)
        signals_count = len(extracted_signals)
        # Fallback: if the LLM extracted nothing from a meaningful voice message
        # (>= 5 words), create a generic signal so the content is still
        # searchable as structured data.
        if signals_count == 0 and len(transcript.split()) >= 5:
            fallback = _build_fallback_signal(transcript, sender, group_id, timestamp, voice_meta)
            store_signals(group_id, [fallback])
            signals_count = 1
            logger.info(f"Voice fallback signal created for {sender} (0 from LLM)")
    except Exception as e:
        # Best-effort: extraction failure must not fail the whole pipeline —
        # the raw transcript signal (step 3) may already be stored.
        logger.error(f"Signal extraction failed for voice from {sender}: {e}")
        signals_count = 0
    logger.info(
        f"Voice pipeline complete: {sender}, {duration_seconds}s, "
        f"{signals_count} signals, transcript={len(transcript)} chars"
    )
    return {
        "ok": True,
        "transcript": transcript,
        "signals_extracted": signals_count,
        "duration": duration_seconds,
        "sender": f"@{sender}",
        "language": language,
    }