This commit is contained in:
2026-04-05 00:43:23 +05:30
commit 8be37d3e92
425 changed files with 101853 additions and 0 deletions

View File

@@ -0,0 +1,34 @@
"""Classifier Agent — adds metadata tags to extracted signals."""
import logging
from backend.providers import call_llm
from backend.db.models import Signal
from backend.agents.json_utils import extract_json_object
logger = logging.getLogger("thirdeye.agents.classifier")
SYSTEM_PROMPT = """You are a fast metadata classifier. Given an extracted signal, add classification tags.
Respond ONLY with valid JSON (no markdown, no backticks):
{"sentiment": "positive|neutral|negative|urgent", "urgency": "none|low|medium|high|critical", "keywords": ["3-5 searchable keywords"]}
"""
async def classify_signal(signal: Signal) -> Signal:
    """Attach classification metadata (sentiment, urgency, keywords) to a signal.

    Mutates *signal* in place and returns it. Best-effort: on any LLM or
    parse failure the signal keeps whatever values it already has.
    """
    prompt_messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": (
                f"Classify this signal:\nType: {signal.type}\n"
                f"Summary: {signal.summary}\nQuote: {signal.raw_quote}"
            ),
        },
    ]
    try:
        llm_response = await call_llm("fast_small", prompt_messages, temperature=0.1, max_tokens=200)
        tags = extract_json_object(llm_response.get("content", ""))
        signal.sentiment = tags.get("sentiment", signal.sentiment)
        signal.urgency = tags.get("urgency", signal.urgency)
        signal.keywords = tags.get("keywords", signal.keywords)
    except Exception as e:
        # Classification is non-fatal: log and fall through with existing values.
        logger.warning(f"Classification failed, using defaults: {e}")
    return signal

View File

@@ -0,0 +1,107 @@
"""Context Detector Agent — auto-classifies group type from messages."""
import logging
from backend.providers import call_llm
from backend.agents.json_utils import extract_json_object
logger = logging.getLogger("thirdeye.agents.context_detector")
SYSTEM_PROMPT = """You analyze a batch of messages from a Telegram group and determine what TYPE of group this is.
CLASSIFY into exactly ONE:
- "dev" — Software engineering team (code, PRs, deployments, bugs, tech stack)
- "product" — Product/business team (features, users, metrics, roadmap, competitors)
- "client" — Client/agency channel (deliverables, timelines, approvals, invoices)
- "community" — Community/interest group (recommendations, events, local info, casual)
Respond ONLY with valid JSON (no markdown, no backticks):
{"detected_lens": "dev|product|client|community", "confidence": 0.0-1.0, "evidence": ["signal1", "signal2", "signal3"]}
"""
# Allowed group-type classifications; any other LLM answer falls back to "dev".
VALID_LENSES = {"dev", "product", "client", "community"}
def _heuristic_detect_context(messages_text: str) -> dict:
"""Rule-based fallback when LLM output is malformed/unavailable."""
text = (messages_text or "").lower()
lens_keywords = {
"dev": [
"bug", "deploy", "deployment", "api", "database", "schema", "postgres", "mongo",
"timeout", "endpoint", "pod", "pr", "code", "docker", "stack", "integration",
],
"product": [
"feature", "roadmap", "user", "users", "client", "customers", "complain", "pain",
"prioritize", "priority", "enterprise", "competitor", "demo", "sso", "dark mode",
"mobile", "stability", "integration",
],
"client": [
"invoice", "deadline", "deliverable", "approval", "sign-off", "scope", "payment",
"contract", "proposal", "timeline", "meeting",
],
"community": [
"event", "meetup", "recommend", "anyone", "community", "local", "where can i",
"suggestion", "friends", "weekend",
],
}
scores = {
lens: sum(text.count(keyword) for keyword in keywords)
for lens, keywords in lens_keywords.items()
}
best_lens = max(scores, key=scores.get)
best_score = scores[best_lens]
if best_score == 0:
best_lens = "dev"
evidence = [k for k in lens_keywords[best_lens] if k in text][:3]
confidence = min(0.95, 0.35 + 0.08 * best_score) if best_score > 0 else 0.0
return {
"detected_lens": best_lens,
"confidence": round(confidence, 2),
"evidence": evidence or ["heuristic_fallback"],
}
async def detect_context(messages_text: str) -> dict:
    """Detect group type from a batch of messages.

    Asks the LLM to classify the group, then sanitizes its answer (lens must
    be in VALID_LENSES, confidence clamped to [0, 1], evidence capped at 5).
    Any failure falls back to the keyword heuristic, tagged "detection_failed".
    """
    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Classify this group based on these messages:\n\n{messages_text}"},
    ]
    try:
        response = await call_llm(
            "fast_large",
            chat,
            temperature=0.1,
            max_tokens=300,
            response_format={"type": "json_object"},
        )
        payload = extract_json_object(response.get("content", ""))
        lens = str(payload.get("detected_lens", "dev")).strip().lower()
        lens = lens if lens in VALID_LENSES else "dev"
        raw_confidence = payload.get("confidence", 0.5)
        try:
            raw_confidence = float(raw_confidence)
        except (TypeError, ValueError):
            raw_confidence = 0.5
        raw_evidence = payload.get("evidence", [])
        if not isinstance(raw_evidence, list):
            raw_evidence = [str(raw_evidence)]
        return {
            "detected_lens": lens,
            "confidence": min(1.0, max(0.0, raw_confidence)),
            "evidence": [str(item) for item in raw_evidence][:5],
        }
    except Exception as e:
        logger.error(f"Context detection failed: {e}")
        fallback = _heuristic_detect_context(messages_text)
        fallback["evidence"] = fallback["evidence"] + ["detection_failed"]
        return fallback

View File

@@ -0,0 +1,287 @@
"""Cross-Group Analyst Agent — detects blind spots between multiple teams."""
import logging
from backend.providers import call_llm
from backend.db.chroma import get_all_signals, get_group_ids
from backend.db.models import CrossGroupInsight
from backend.agents.json_utils import extract_json_object
logger = logging.getLogger("thirdeye.agents.cross_group_analyst")
SYSTEM_PROMPT = """You are the Cross-Group Intelligence Analyst for ThirdEye. This is the MOST IMPORTANT analysis.
You receive intelligence summaries from MULTIPLE Telegram groups. Your job is to find BLIND SPOTS — information in one group that should be in another.
Detect:
- blocked_handoff: Team A waiting for something from Team B, but Team B doesn't know
- conflicting_decision: Team A decided X, Team B decided the opposite
- information_silo: Critical info in Group A never reached Group B
- promise_reality_gap: Promise made in one group, but another group shows it's blocked
- duplicated_effort: Two teams working on similar things unknowingly
Respond ONLY with valid JSON (no markdown):
{"insights": [{"type": "insight_type", "description": "SPECIFIC description naming the groups, people, and topics", "group_a": {"name": "group_name", "evidence": "what was said"}, "group_b": {"name": "group_name", "evidence": "what was said or NOT said"}, "severity": "warning|critical", "recommendation": "Specific action"}]}
If no cross-group issues: {"insights": []}
Be SPECIFIC. Name the groups, people, topics, and exact conflicts."""
# Signal-type groupings used by the information-silo heuristic below.
_OPERATIONAL_RISK_TYPES = {"recurring_bug", "workaround", "tech_debt", "deployment_risk"}
_PLANNING_FOCUS_TYPES = {
    "feature_request",
    "roadmap_drift",
    "priority_conflict",
    "user_pain_point",
}


def _silo_insight(risk_group, risk_types, plan_group, plan_types):
    """Build an information_silo insight when *risk_group* carries operational
    risk signal types while *plan_group* is planning-focused.

    Returns a CrossGroupInsight, or None when the pattern is absent.
    """
    risk_hits = risk_types.intersection(_OPERATIONAL_RISK_TYPES)
    plan_hits = plan_types.intersection(_PLANNING_FOCUS_TYPES)
    if not (risk_hits and plan_hits):
        return None
    return CrossGroupInsight(
        type="information_silo",
        description=(
            f"{risk_group} shows operational risk signals while {plan_group} is focused on planning/user demands, "
            "suggesting risk context is not shared across groups."
        ),
        group_a={
            "name": risk_group,
            "evidence": f"Operational risk signal types: {sorted(risk_hits)}",
        },
        group_b={
            "name": plan_group,
            "evidence": f"Planning-focused signal types: {sorted(plan_hits)}",
        },
        severity="warning",
        recommendation="Add a weekly cross-functional risk sync so product planning reflects current engineering constraints.",
    )


def _heuristic_cross_group_insights(
    group_summaries: dict[str, list[dict]],
) -> list[CrossGroupInsight]:
    """Generate best-effort cross-group insights when LLM output is unavailable.

    Scans every pair of groups for three patterns:
      - blocked_handoff: one group waits on design/spec input the other never mentions
      - promise_reality_gap: one group promises delivery while the other reports blockers
      - information_silo: operational-risk signal types in one group vs. pure
        planning focus in the other (checked in both directions)

    Results are deduplicated by (type, group_a name, group_b name) and capped
    at five insights.
    """
    insights: list[CrossGroupInsight] = []
    # Pre-compute a lower-cased text blob and signal-type list per group.
    normalized = {}
    for group_name, signals in group_summaries.items():
        docs = [str(s.get("document", "")) for s in signals]
        signal_types = [
            str(s.get("metadata", {}).get("type", "unknown")).lower() for s in signals
        ]
        normalized[group_name] = {
            "text": " ".join(docs).lower(),
            "signals": signals,
            "types": signal_types,
        }
    group_names = list(normalized.keys())
    for i in range(len(group_names)):
        for j in range(i + 1, len(group_names)):
            group_a = group_names[i]
            group_b = group_names[j]
            text_a = normalized[group_a]["text"]
            text_b = normalized[group_b]["text"]
            types_a = set(normalized[group_a]["types"])
            types_b = set(normalized[group_b]["types"])
            # Detect a likely blocked handoff around design/spec dependencies.
            a_waiting = any(
                k in text_a for k in ["waiting", "blocked", "design spec", "specs"]
            )
            b_mentions_specs = any(
                k in text_b for k in ["design spec", "specs", "design"]
            )
            if a_waiting and not b_mentions_specs:
                insights.append(
                    CrossGroupInsight(
                        type="blocked_handoff",
                        description=(
                            f"{group_a} indicates dependency blockage (design/spec inputs), "
                            f"but {group_b} has no corresponding discussion of that dependency."
                        ),
                        group_a={
                            "name": group_a,
                            "evidence": "Contains waiting/blocked language tied to specs or design dependency.",
                        },
                        group_b={
                            "name": group_b,
                            "evidence": "No clear mention of design specs/dependency handoff in available signals.",
                        },
                        severity="warning",
                        recommendation=(
                            f"Create a shared handoff item between {group_a} and {group_b} for design/spec ownership "
                            "with an explicit due date."
                        ),
                    )
                )
            # Detect likely promise vs execution mismatch.
            b_promises = any(
                k in text_b
                for k in ["demo", "friday", "promised", "told the client", "ready by"]
            )
            a_blocked = any(
                k in text_a
                for k in ["blocked", "waiting", "can't proceed", "cannot proceed"]
            )
            if b_promises and a_blocked:
                insights.append(
                    CrossGroupInsight(
                        type="promise_reality_gap",
                        description=(
                            f"{group_b} signals delivery promises while {group_a} reports blockers that may prevent those commitments."
                        ),
                        group_a={
                            "name": group_a,
                            "evidence": "Signals include active blockers/waiting dependencies.",
                        },
                        group_b={
                            "name": group_b,
                            "evidence": "Signals include explicit client/demo commitments and timelines.",
                        },
                        severity="critical",
                        recommendation="Run a joint risk review and re-baseline commitments before the next client update.",
                    )
                )
            # Type-based silo detection when lexical cues are weak — both directions.
            for silo in (
                _silo_insight(group_a, types_a, group_b, types_b),
                _silo_insight(group_b, types_b, group_a, types_a),
            ):
                if silo is not None:
                    insights.append(silo)
    # Deduplicate by (type, group_a, group_b) and cap the output.
    deduped = []
    seen_keys = set()
    for insight in insights:
        key = (insight.type, insight.group_a.get("name"), insight.group_b.get("name"))
        if key in seen_keys:
            continue
        seen_keys.add(key)
        deduped.append(insight)
    return deduped[:5]
async def analyze_cross_group(
    group_summaries: dict[str, list[dict]] | None = None,
) -> list[CrossGroupInsight]:
    """
    Analyze intelligence across all monitored groups to find blind spots.

    Args:
        group_summaries: Optional pre-built summaries keyed by group name, each
            a list of {"document": str, "metadata": dict} signal dicts.
            If None, loads from ChromaDB.

    Returns:
        List of CrossGroupInsight items (possibly empty). Falls back to the
        keyword heuristic when the LLM call or its JSON output fails.
    """
    if group_summaries is None:
        group_ids = get_group_ids()
        if len(group_ids) < 2:
            logger.info("Need at least 2 groups for cross-group analysis")
            return []
        group_summaries = {gid: get_all_signals(gid) for gid in group_ids}
    if len(group_summaries) < 2:
        return []
    # Format summaries for the LLM
    summary_parts = []
    for group_name, signals in group_summaries.items():
        signal_lines = []
        for s in signals[:30]:  # Limit per group to fit context
            # Tolerate signals missing "metadata"/"document" keys rather than
            # crashing before the fallback path can run.
            meta = s.get("metadata", {})
            doc = str(s.get("document", ""))
            signal_lines.append(f" - [{meta.get('type', '?')}] {doc[:120]}")
        summary_parts.append(
            f"=== GROUP: {group_name} ({len(signals)} total signals) ===\n"
            + "\n".join(signal_lines)
        )
    full_summary = "\n\n".join(summary_parts)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"Analyze cross-group intelligence:\n\n{full_summary}",
        },
    ]
    result = None  # kept visible to the except block for raw-head logging
    try:
        result = await call_llm(
            "reasoning",
            messages,
            temperature=0.2,
            max_tokens=2000,
            response_format={"type": "json_object"},
        )
        parsed = extract_json_object(result.get("content", ""))
        insights = [
            CrossGroupInsight(
                type=i.get("type", "unknown"),
                description=i.get("description", ""),
                group_a=i.get("group_a", {}),
                group_b=i.get("group_b", {}),
                severity=i.get("severity", "warning"),
                recommendation=i.get("recommendation", ""),
            )
            for i in parsed.get("insights", [])
        ]
        logger.info(f"Cross-group analysis found {len(insights)} insights")
        return insights
    except Exception as e:
        raw = ""
        if isinstance(result, dict):
            raw = str(result.get("content", ""))[:300].replace("\n", " ")
        logger.info(f"Cross-group LLM parse issue, using fallback: {e}; raw_head={raw}")
        fallback = _heuristic_cross_group_insights(group_summaries)
        if fallback:
            logger.info(
                f"Cross-group heuristic fallback produced {len(fallback)} insights"
            )
        return fallback

View File

@@ -0,0 +1,200 @@
"""Document Ingestor — extracts text from PDFs, DOCX, TXT and chunks for RAG storage."""
import os
import logging
import uuid
from datetime import datetime
logger = logging.getLogger("thirdeye.agents.document_ingestor")
# --- Text Extraction ---
def extract_text_from_pdf(file_path: str) -> list[dict]:
    """Extract text from PDF, returns list of {page: int, text: str}.

    Pages with no extractable text are skipped. On an extraction error,
    returns whatever pages were collected before the failure (possibly none).
    """
    from PyPDF2 import PdfReader

    collected: list[dict] = []
    try:
        for page_number, page in enumerate(PdfReader(file_path).pages, start=1):
            raw = page.extract_text()
            if raw and raw.strip():
                collected.append({"page": page_number, "text": raw.strip()})
    except Exception as e:
        logger.error(f"PDF extraction failed for {file_path}: {e}")
    return collected
def extract_text_from_docx(file_path: str) -> list[dict]:
    """Extract text from DOCX, returns list of {page: 1, text: str} (DOCX has no real pages)."""
    from docx import Document

    try:
        # Join non-blank paragraphs into one "page" of text.
        paragraphs = [p.text for p in Document(file_path).paragraphs if p.text.strip()]
        joined = "\n".join(paragraphs)
        if joined.strip():
            return [{"page": 1, "text": joined.strip()}]
    except Exception as e:
        logger.error(f"DOCX extraction failed for {file_path}: {e}")
    return []
def extract_text_from_txt(file_path: str) -> list[dict]:
    """Extract text from plain text file.

    Returns [{"page": 1, "text": ...}] or [] when the file is empty/unreadable.
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
            body = handle.read().strip()
    except Exception as e:
        logger.error(f"TXT extraction failed for {file_path}: {e}")
        return []
    return [{"page": 1, "text": body}] if body else []
# Dispatch table: lower-cased file extension (with dot) → extractor function.
# extract_text() routes through this; add new entries to support more formats.
EXTRACTORS = {
    ".pdf": extract_text_from_pdf,
    ".docx": extract_text_from_docx,
    ".txt": extract_text_from_txt,
    # Plain-text-like formats reuse the TXT extractor.
    ".md": extract_text_from_txt,
    ".csv": extract_text_from_txt,
    ".json": extract_text_from_txt,
    ".log": extract_text_from_txt,
}
def extract_text(file_path: str) -> list[dict]:
    """Route to correct extractor based on file extension.

    Returns [] (with a warning logged) for unsupported extensions.
    """
    extension = os.path.splitext(file_path)[1].lower()
    try:
        handler = EXTRACTORS[extension]
    except KeyError:
        logger.warning(f"Unsupported file type: {extension} ({file_path})")
        return []
    return handler(file_path)
# --- Chunking ---
def chunk_text(text: str, max_chars: int = 1500, overlap_chars: int = 200) -> list[str]:
    """
    Split text into overlapping chunks.

    Uses paragraph boundaries when possible, falls back to sentence boundaries,
    then hard character splits. ~1500 chars ≈ ~375 tokens for embedding.

    Args:
        text: The text to split.
        max_chars: Target maximum chunk size (before overlap is added).
        overlap_chars: Trailing characters of the previous chunk prepended to
            each subsequent chunk for context continuity.

    Returns:
        List of chunk strings (at least one).
    """
    if len(text) <= max_chars:
        return [text]
    # Split by paragraphs first
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
    chunks = []
    current_chunk = ""
    for para in paragraphs:
        # If adding this paragraph stays under limit, add it
        if len(current_chunk) + len(para) + 1 <= max_chars:
            current_chunk = (current_chunk + "\n" + para).strip()
        else:
            # Save current chunk if it has content
            if current_chunk:
                chunks.append(current_chunk)
            # If single paragraph is too long, split it by sentences
            if len(para) > max_chars:
                sentences = para.replace(". ", ".\n").split("\n")
                sub_chunk = ""
                for sent in sentences:
                    if len(sent) > max_chars:
                        # Oversized sentence (e.g. no punctuation at all):
                        # flush what we have, then hard-split by characters so
                        # no chunk can exceed max_chars.
                        if sub_chunk:
                            chunks.append(sub_chunk)
                            sub_chunk = ""
                        for start in range(0, len(sent), max_chars):
                            piece = sent[start:start + max_chars]
                            if len(piece) == max_chars:
                                chunks.append(piece)
                            else:
                                # Remainder may still merge with the next sentence.
                                sub_chunk = piece
                    elif len(sub_chunk) + len(sent) + 1 <= max_chars:
                        sub_chunk = (sub_chunk + " " + sent).strip()
                    else:
                        if sub_chunk:
                            chunks.append(sub_chunk)
                        sub_chunk = sent
                if sub_chunk:
                    current_chunk = sub_chunk
                else:
                    current_chunk = ""
            else:
                current_chunk = para
    if current_chunk:
        chunks.append(current_chunk)
    # Add overlap: prepend last N chars of previous chunk to each subsequent chunk
    if overlap_chars > 0 and len(chunks) > 1:
        overlapped = [chunks[0]]
        for i in range(1, len(chunks)):
            prev_tail = chunks[i - 1][-overlap_chars:]
            # Find a word boundary in the overlap
            space_idx = prev_tail.find(" ")
            if space_idx > 0:
                prev_tail = prev_tail[space_idx + 1:]
            overlapped.append(prev_tail + " " + chunks[i])
        chunks = overlapped
    return chunks
# --- Main Ingestion ---
def ingest_document(
    file_path: str,
    group_id: str,
    shared_by: str = "Unknown",
    filename: str | None = None,
) -> list[dict]:
    """
    Full pipeline: extract text → chunk → produce signal dicts ready for ChromaDB.

    Args:
        file_path: Path to the downloaded file on disk
        group_id: Telegram group ID
        shared_by: Who shared the file
        filename: Original filename (for metadata); defaults to the basename
            of file_path.

    Returns:
        List of signal dicts ready for store_signals()
    """
    if filename is None:
        filename = os.path.basename(file_path)
    # Extract
    pages = extract_text(file_path)
    if not pages:
        logger.warning(f"No text extracted from {filename}")
        return []
    # Chunk each page
    signals = []
    total_chunks = 0
    for page_data in pages:
        page_num = page_data["page"]
        chunks = chunk_text(page_data["text"])
        for chunk_text_str in chunks:
            if len(chunk_text_str.strip()) < 30:
                continue  # Skip tiny chunks
            signal = {
                "id": str(uuid.uuid4()),
                "type": "document_knowledge",
                "summary": f"[{filename} p{page_num}] {chunk_text_str[:150]}...",
                "entities": [f"@{shared_by}", filename],
                "severity": "low",
                "status": "reference",
                "sentiment": "neutral",
                "urgency": "none",
                "raw_quote": chunk_text_str,
                "timestamp": datetime.utcnow().isoformat(),
                "group_id": group_id,
                "lens": "document",
                "keywords": [filename, f"page_{page_num}", "document", shared_by],
            }
            signals.append(signal)
            total_chunks += 1
    logger.info(f"Ingested {filename}: {len(pages)} pages → {total_chunks} chunks for group {group_id}")
    return signals

View File

@@ -0,0 +1,373 @@
"""
Jira Signal Agent
Takes ThirdEye signals and converts them into well-formed Jira tickets.
Responsibilities:
1. Map signal type → Jira issue type + priority
2. LLM-generate a clean ticket title and structured description from signal context
3. Extract assignee names and match them to Jira account IDs (best-effort)
4. Raise the ticket via jira_client and mark the signal in ChromaDB
5. Bulk-raise: process a group's unraised high-severity signals in one call
"""
import json
import logging
from datetime import datetime
from backend.providers import call_llm
from backend.integrations.jira_client import (
create_issue, search_issues, add_comment, is_configured, search_users
)
from backend.db.chroma import store_signals, mark_signal_as_raised, get_raised_signal_ids
from backend.config import (
JIRA_DEFAULT_PROJECT, JIRA_DEFAULT_ISSUE_TYPE,
JIRA_AUTO_RAISE_SEVERITY
)
logger = logging.getLogger("thirdeye.agents.jira_agent")
# ─── Signal → Jira type mapping ──────────────────────────────────────────────
# Maps ThirdEye signal type → (Jira issue type, default priority)
# Note: Issue types must match what's available in your Jira project
# Common types: Task, Bug, Story, Epic, Workstream (project-specific)
SIGNAL_TYPE_MAP = {
    # Dev signals
    "tech_debt": ("Task", "Low"),
    "recurring_bug": ("Task", "High"),  # Changed from Bug to Task
    "architecture_decision": ("Task", "Medium"),
    "deployment_risk": ("Task", "High"),
    "workaround": ("Task", "Medium"),
    "knowledge_silo": ("Task", "Medium"),
    # Product signals
    "feature_request": ("Task", "Medium"),  # Changed from Story to Task
    "priority_conflict": ("Task", "High"),
    "sentiment_shift": ("Task", "Medium"),
    # Client signals
    "promise": ("Task", "High"),
    "scope_creep": ("Task", "High"),
    "risk": ("Task", "High"),
    # Meet signals
    "meet_action_item": ("Task", "Medium"),
    "meet_blocker": ("Task", "Highest"),
    "meet_risk": ("Task", "High"),
    "meet_decision": ("Task", "Medium"),
    "meet_open_q": ("Task", "Low"),
    # Generic
    "blocker": ("Task", "Highest"),
    "decision": ("Task", "Medium"),
    "action_item": ("Task", "Medium"),
}
# Signal severity → Jira priority. When a signal carries a severity, this
# overrides the per-type default priority in SIGNAL_TYPE_MAP (see
# raise_ticket_for_signal).
SEVERITY_TO_PRIORITY = {
    "critical": "Highest",
    "high": "High",
    "medium": "Medium",
    "low": "Low",
}
# Only signal types with an explicit Jira mapping may be raised as tickets.
RAISEABLE_TYPES = set(SIGNAL_TYPE_MAP.keys())
# ─── Assignee resolution ─────────────────────────────────────────────────────
async def resolve_assignee_account_id(name: str) -> str | None:
    """
    Resolve a person's display name (or @name) to their Jira account ID.

    Uses Jira's user search API and fuzzy-matches the best result: exact
    display-name match first, then an all-words-present partial match, then
    the first search result. Returns None when no match is found or on error.
    """
    if not name:
        return None
    query = name.lstrip("@").strip()
    try:
        candidates = await search_users(query)
        if not candidates:
            return None
        wanted = query.lower()
        # 1) Exact display-name match
        for candidate in candidates:
            if candidate["display_name"].lower() == wanted:
                return candidate["account_id"]
        # 2) Partial match: every query word appears in the display name
        query_words = wanted.split()
        for candidate in candidates:
            display = candidate["display_name"].lower()
            if all(word in display for word in query_words):
                return candidate["account_id"]
        # 3) Last resort: first search result
        return candidates[0]["account_id"]
    except Exception as e:
        logger.warning(f"resolve_assignee_account_id failed for '{name}': {e}")
        return None
# ─── LLM ticket generation ───────────────────────────────────────────────────
TICKET_GEN_SYSTEM_PROMPT = """You are a senior engineering manager writing Jira tickets from team intelligence signals.
Given a ThirdEye signal (a structured piece of extracted team knowledge), write a Jira ticket.
Return ONLY a valid JSON object with exactly these fields:
{
"summary": "Short, actionable ticket title (max 100 chars). Start with a verb. No jargon.",
"description": "Full ticket description. Include: what the issue is, context from the signal, why it matters, suggested next steps. Use blank lines between sections. Use '- ' for bullet points. Max 400 words.",
"labels": ["label1", "label2"],
"assignee_name": "First name or @name of the person to assign, or null if unclear"
}
Label rules:
- Always include "thirdeye" and "auto-raised"
- Add the signal type as a label (e.g. "tech-debt", "recurring-bug")
- Add "urgent" if severity is high or critical
- Labels must not have spaces (use hyphens)
Summary rules:
- Starts with a verb: "Fix", "Investigate", "Address", "Resolve", "Document", "Implement"
- Be specific — "Fix intermittent checkout timeout" NOT "Fix bug"
- Never exceed 100 characters
Description must include:
1. What: clear 1-sentence problem statement
2. Context: what was actually said / detected (cite the signal)
3. Impact: why this matters to the team or product
4. Suggested next steps (2-3 bullet points)
Return JSON only — no markdown, no preamble."""
async def generate_ticket_content(signal: dict) -> dict:
    """
    Use an LLM to generate a clean, context-rich Jira ticket from a ThirdEye signal.

    Returns {"summary": str, "description": str, "labels": list, "assignee_name": str|None}.
    On LLM or parse failure, falls back to a template-built ticket.
    """
    context_lines = [
        f"Signal type: {signal.get('type', 'unknown')}",
        f"Summary: {signal.get('summary', '')}",
        f"Raw quote: {signal.get('raw_quote', '')[:300]}",
        f"Severity: {signal.get('severity', 'medium')}",
        f"Entities involved: {', '.join(signal.get('entities', []))}",
        f"Keywords: {', '.join(signal.get('keywords', []))}",
        f"Timestamp: {signal.get('timestamp', '')}",
        f"Group: {signal.get('group_id', '')}",
        f"Lens: {signal.get('lens', '')}",
    ]
    signal_text = "\n".join(context_lines)
    try:
        result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": TICKET_GEN_SYSTEM_PROMPT},
                {"role": "user", "content": signal_text},
            ],
            temperature=0.2,
            max_tokens=800,
            response_format={"type": "json_object"},
        )
        raw = result["content"].strip()
        # Strip a leading markdown fence if the model added one despite instructions.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        return json.loads(raw)
    except Exception as e:
        logger.warning(f"Ticket generation LLM failed: {e}. Using fallback.")
        # Fallback: build a basic ticket without LLM
        readable_type = signal.get("type", "unknown").replace("_", " ").title()
        return {
            "summary": f"{readable_type}: {signal.get('summary', 'Unknown issue')[:80]}",
            "description": (
                f"Signal detected by ThirdEye.\n\n"
                f"Type: {signal.get('type', 'unknown')}\n"
                f"Summary: {signal.get('summary', '')}\n\n"
                f"Raw context:\n{signal.get('raw_quote', '(none)')[:300]}\n\n"
                f"Severity: {signal.get('severity', 'medium')}"
            ),
            "labels": ["thirdeye", "auto-raised", signal.get("type", "unknown").replace("_", "-")],
            "assignee_name": None,
        }
# ─── Main raise function ──────────────────────────────────────────────────────
async def raise_ticket_for_signal(
    signal: dict,
    group_id: str,
    project_key: str = None,
    force: bool = False,
    assignee_account_id: str = None,
) -> dict:
    """
    Create a Jira ticket for a single ThirdEye signal.

    Args:
        signal: The signal dict from ChromaDB
        group_id: The group this signal belongs to (for dedup tracking)
        project_key: Override project (default: JIRA_DEFAULT_PROJECT)
        force: If True, raise even if already raised before
        assignee_account_id: Explicit Jira account ID; when given, skips
            name-based assignee resolution

    Returns:
        {"ok": True, "key": "ENG-42", "url": "...", "summary": "..."}
        OR
        {"ok": False, "reason": "already_raised" | "not_raiseable" | "jira_error", ...}
    """
    if not is_configured():
        return {"ok": False, "reason": "jira_not_configured"}
    signal_id = signal.get("id", "")
    signal_type = signal.get("type", "")
    # Check if this signal type is raiseable
    if signal_type not in RAISEABLE_TYPES:
        return {"ok": False, "reason": "not_raiseable", "signal_type": signal_type}
    # Check if already raised (skip if force=True)
    if not force and signal_id:
        already_raised = get_raised_signal_ids(group_id)
        if signal_id in already_raised:
            return {"ok": False, "reason": "already_raised", "signal_id": signal_id}
    # Determine Jira issue type and priority from signal
    default_type, default_priority = SIGNAL_TYPE_MAP.get(signal_type, (JIRA_DEFAULT_ISSUE_TYPE, "Medium"))
    severity = signal.get("severity", "medium").lower()
    priority = SEVERITY_TO_PRIORITY.get(severity, default_priority)
    # Generate ticket content via LLM
    ticket_content = await generate_ticket_content(signal)
    summary = ticket_content.get("summary", signal.get("summary", "ThirdEye signal")[:100])
    description = ticket_content.get("description", signal.get("summary", ""))
    labels = ticket_content.get("labels", ["thirdeye", "auto-raised"])
    if not isinstance(labels, list):
        # Defensive: the LLM occasionally returns a string instead of a list.
        labels = ["thirdeye", "auto-raised"]
    # Always ensure thirdeye label is present
    if "thirdeye" not in labels:
        labels.append("thirdeye")
    # Append ThirdEye metadata as a context section in the description
    meta_section = (
        f"\n\n---\n"
        f"Raised by: ThirdEye\n"
        f"Signal ID: {signal_id}\n"
        f"Group: {group_id}\n"
        f"Detected: {signal.get('timestamp', datetime.utcnow().isoformat())}"
    )
    description = description + meta_section
    # Resolve assignee: explicit account_id wins, then signal override name, then LLM-extracted name
    if not assignee_account_id:
        name_hint = signal.get("assignee_override") or ticket_content.get("assignee_name")
        if name_hint:
            assignee_account_id = await resolve_assignee_account_id(name_hint)
            if assignee_account_id:
                logger.info(f"Resolved assignee '{name_hint}' -> {assignee_account_id}")
            else:
                logger.warning(f"Could not resolve assignee '{name_hint}' to a Jira account")
    # Create the ticket
    result = await create_issue(
        project_key=project_key or JIRA_DEFAULT_PROJECT,
        summary=summary,
        description=description,
        issue_type=default_type,
        priority=priority,
        labels=labels,
        assignee_account_id=assignee_account_id,
    )
    if result.get("ok"):
        jira_key = result["key"]
        jira_url = result["url"]
        # Mark this signal as raised in ChromaDB so we never duplicate it
        if signal_id:
            mark_signal_as_raised(
                group_id, signal_id, jira_key,
                jira_url=jira_url,
                jira_summary=summary,
                jira_priority=priority,
            )
        logger.info(f"Raised Jira ticket {jira_key} for signal {signal_id} ({signal_type})")
        return {
            "ok": True,
            "key": jira_key,
            "url": jira_url,
            "summary": summary,
            "issue_type": default_type,
            "priority": priority,
            "assignee_account_id": assignee_account_id,
        }
    else:
        logger.error(f"Jira ticket creation failed: {result}")
        return {
            "ok": False,
            "reason": "jira_error",
            "error": result.get("error"),
            "details": result.get("details"),
        }
async def bulk_raise_for_group(
    group_id: str,
    signals: list[dict],
    min_severity: str = None,
    project_key: str = None,
    max_tickets: int = 10,
) -> list[dict]:
    """
    Raise Jira tickets for multiple signals from a group in one call.

    Filters:
    - Only raiseable signal types
    - Only signals at or above min_severity (defaults to JIRA_AUTO_RAISE_SEVERITY)
    - Skips signals already raised
    - Caps at max_tickets to avoid flooding Jira

    Returns list of raise results.
    """
    severity_rank = {"low": 0, "medium": 1, "high": 2, "critical": 3}
    threshold = severity_rank.get(
        (min_severity or JIRA_AUTO_RAISE_SEVERITY).lower(), 2  # Default: high
    )
    already_raised = get_raised_signal_ids(group_id)
    candidates = [
        sig
        for sig in signals
        if sig.get("type", "") in RAISEABLE_TYPES
        and severity_rank.get(sig.get("severity", "low").lower(), 0) >= threshold
        and sig.get("id", "") not in already_raised
    ]
    # Sort by severity descending, then raise up to max_tickets
    candidates.sort(key=lambda s: severity_rank.get(s.get("severity", "low"), 0), reverse=True)
    results = []
    for sig in candidates[:max_tickets]:
        outcome = await raise_ticket_for_signal(sig, group_id, project_key=project_key)
        results.append({**outcome, "signal_type": sig.get("type"), "signal_summary": sig.get("summary", "")[:80]})
    logger.info(f"Bulk raise for group {group_id}: {len(results)} tickets from {len(signals)} signals")
    return results
def format_raise_result_for_telegram(result: dict) -> str:
    """Format a single raise result as a Telegram (Markdown) message line.

    Args:
        result: A dict returned by raise_ticket_for_signal / bulk_raise_for_group.

    Returns:
        One Markdown-formatted line describing success or the failure reason.
    """
    if result.get("ok"):
        return (
            f"✅ [{result['key']}]({result['url']}) — "
            f"*{result.get('issue_type', 'Task')}* | {result.get('priority', 'Medium')} priority\n"
            f" _{result.get('summary', '')[:90]}_"
        )
    reason = result.get("reason", "unknown")
    # Plain strings (no placeholders) don't need an f-prefix.
    if reason == "already_raised":
        return "⏭️ Already raised — skipped"
    if reason == "not_raiseable":
        return f"⚪ Signal type `{result.get('signal_type', '?')}` — not mapped to Jira"
    return f"❌ Failed: {result.get('error', reason)}"

View File

@@ -0,0 +1,43 @@
"""Utilities for robustly parsing JSON from LLM responses."""
import json
import re
def extract_json_object(content: str) -> dict:
    """Extract and parse the first JSON object from raw LLM output.

    Handles plain JSON, fenced ``` / ```json blocks, and an object embedded in
    wrapper prose. Raises json.JSONDecodeError when no top-level JSON object
    can be decoded.
    """
    text = (content or "").strip()
    if not text:
        raise json.JSONDecodeError("Empty LLM response", text, 0)
    if text.startswith("```"):
        # Remove a surrounding markdown fence before parsing.
        text = re.sub(r"^```(?:json)?\s*", "", text, flags=re.IGNORECASE)
        text = re.sub(r"\s*```$", "", text)
        text = text.strip()
    if not text:
        raise json.JSONDecodeError("Empty LLM response after cleanup", text, 0)
    # Direct parse for pure JSON responses.
    try:
        direct = json.loads(text)
    except json.JSONDecodeError:
        direct = None
    if isinstance(direct, dict):
        return direct
    # Attempt a decode from each "{" in the text. This handles wrapper prose
    # more reliably than regex, especially with nested braces.
    decoder = json.JSONDecoder()
    for start, char in enumerate(text):
        if char != "{":
            continue
        try:
            candidate, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            continue
        if isinstance(candidate, dict):
            return candidate
    raise json.JSONDecodeError("No valid top-level JSON object found", text, 0)

View File

@@ -0,0 +1,213 @@
"""Link Fetcher — extracts, summarizes, and stores content from URLs shared in chat."""
import re
import uuid
import logging
import asyncio
from datetime import datetime
import httpx
from bs4 import BeautifulSoup
from backend.providers import call_llm
from backend.config import ENABLE_LINK_FETCH
logger = logging.getLogger("thirdeye.agents.link_fetcher")
# Patterns to skip (images, downloads, social media embeds, etc.)
# Each regex is matched with .search() against the full URL (case-insensitive
# via SKIP_COMPILED below); any hit means the link is not fetched.
SKIP_PATTERNS = [
    r"\.(png|jpg|jpeg|gif|svg|webp|ico|bmp)(\?.*)?$",  # image files
    r"\.(zip|tar|gz|rar|7z|exe|msi|dmg|apk|deb)(\?.*)?$",  # archives & installers
    r"\.(mp3|mp4|avi|mov|mkv|wav|flac)(\?.*)?$",  # audio/video files
    r"^https?://(www\.)?(twitter|x)\.com/.*/status/",  # tweet permalinks
    r"^https?://(www\.)?instagram\.com/p/",  # Instagram posts
    r"^https?://(www\.)?tiktok\.com/",  # TikTok
    r"^https?://(www\.)?youtube\.com/shorts/",  # YouTube Shorts
    r"^https?://t\.me/", # Other Telegram links
]
# Compiled once at import time; consumed by should_fetch().
SKIP_COMPILED = [re.compile(p, re.IGNORECASE) for p in SKIP_PATTERNS]
def extract_urls(text: str) -> list[str]:
    """Extract all HTTP/HTTPS URLs from a text string.

    Trailing punctuation is stripped from each match, and matches of 10
    characters or fewer (likely fragments) are discarded.
    """
    pattern = re.compile(
        r"https?://[^\s<>\"')\]},;]+"
    )
    # Strip trailing punctuation, then keep only plausible-length URLs.
    trimmed = (match.rstrip(".,;:!?)") for match in pattern.findall(text))
    return [url for url in trimmed if len(url) > 10]
def should_fetch(url: str) -> bool:
    """Decide if a URL is worth fetching (skip images, downloads, social embeds)."""
    # A single hit against any skip pattern disqualifies the URL.
    return not any(pattern.search(url) for pattern in SKIP_COMPILED)
async def fetch_url_content(url: str, timeout: float = 15.0) -> dict | None:
    """
    Fetch a URL and extract main text content.

    Follows redirects with a browser-like User-Agent. Non-200 responses,
    non-HTML payloads, network errors, and pages with under 100 chars of
    extracted text all return None ("skip this link"). Extracted text is
    capped at 8000 chars.

    Args:
        url: Absolute HTTP/HTTPS URL to fetch.
        timeout: Per-request timeout in seconds.

    Returns:
        {title, text, url} or None if fetch fails
    """
    try:
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
            headers={
                "User-Agent": "Mozilla/5.0 (compatible; ThirdEye/1.0; +https://thirdeye.dev)",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            },
        ) as client:
            response = await client.get(url)
            if response.status_code != 200:
                logger.info(f"URL returned {response.status_code}: {url[:80]}")
                return None
            content_type = response.headers.get("content-type", "")
            if "text/html" not in content_type and "application/xhtml" not in content_type:
                logger.info(f"Skipping non-HTML content ({content_type}): {url[:80]}")
                return None
            html = response.text
    except httpx.TimeoutException:
        logger.info(f"URL timed out: {url[:80]}")
        return None
    except Exception as e:
        # Broad catch is deliberate: any fetch failure simply skips the link.
        logger.info(f"URL fetch failed ({type(e).__name__}): {url[:80]}")
        return None
    # Parse HTML
    try:
        soup = BeautifulSoup(html, "html.parser")
        # Extract title
        title = ""
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
        # Remove script, style, nav, footer, header elements
        for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "form"]):
            tag.decompose()
        # Try to find main content area
        main = soup.find("main") or soup.find("article") or soup.find("div", {"role": "main"})
        if main:
            text = main.get_text(separator="\n", strip=True)
        else:
            text = soup.get_text(separator="\n", strip=True)
        # Clean up
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        text = "\n".join(lines)
        # Skip if too little content
        if len(text) < 100:
            logger.info(f"Too little text content ({len(text)} chars): {url[:80]}")
            return None
        # Truncate very long content
        if len(text) > 8000:
            text = text[:8000] + "\n\n[Content truncated]"
        return {
            "title": title or url,
            "text": text,
            "url": url,
        }
    except Exception as e:
        logger.warning(f"HTML parsing failed for {url[:80]}: {e}")
        return None
async def summarize_content(title: str, text: str, url: str) -> str:
    """Use LLM to create a concise summary of fetched content.

    Falls back to a raw 200-char excerpt when the LLM call fails, so link
    ingestion never hard-fails on summarization.
    """
    preview = text[:3000]  # Limit text sent to LLM
    system_prompt = """You are a content summarizer for ThirdEye.
Given the title and text of a web page, produce a concise 2-4 sentence summary that captures the key information.
Focus on: main topic, key facts, any actionable insights, any deadlines or decisions mentioned.
Respond with ONLY the summary text, nothing else."""
    user_prompt = f"Title: {title}\nURL: {url}\n\nContent:\n{preview}"
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        result = await call_llm("fast_small", chat, temperature=0.2, max_tokens=300)
        return result["content"].strip()
    except Exception as exc:
        logger.warning(f"Link summarization failed: {exc}")
        # Fallback: use first 200 chars of text
        return text[:200] + "..."
async def process_links_from_message(
    text: str,
    group_id: str,
    shared_by: str = "Unknown",
) -> list[dict]:
    """
    Full pipeline: extract URLs from message → fetch → summarize → produce signals.

    Designed to be called in the background (non-blocking to the main message pipeline).
    Per-URL failures are logged and skipped, so one bad link never drops the rest.
    Returns [] immediately when ENABLE_LINK_FETCH is off or no fetchable URL exists.

    Args:
        text: Raw message text that may contain URLs.
        group_id: Telegram group the message came from (stamped on each signal).
        shared_by: Display name of the sender, used for attribution.

    Returns:
        List of signal dicts ready for store_signals()
    """
    if not ENABLE_LINK_FETCH:
        return []
    urls = extract_urls(text)
    fetchable = [u for u in urls if should_fetch(u)]
    if not fetchable:
        return []
    signals = []
    # Process up to 3 links per message to avoid overload
    for url in fetchable[:3]:
        try:
            content = await fetch_url_content(url)
            if not content:
                continue
            summary = await summarize_content(content["title"], content["text"], url)
            # Build a "link_knowledge" signal in the shared signal schema.
            signal = {
                "id": str(uuid.uuid4()),
                "type": "link_knowledge",
                "summary": f"[Link: {content['title'][:80]}] {summary[:200]}",
                "entities": [f"@{shared_by}", url[:100]],
                "severity": "low",
                "status": "reference",
                "sentiment": "neutral",
                "urgency": "none",
                "raw_quote": summary,
                "timestamp": datetime.utcnow().isoformat(),
                "group_id": group_id,
                "lens": "link",
                "keywords": [content["title"][:50], "link", "web", shared_by],
            }
            signals.append(signal)
            logger.info(f"Link ingested: {content['title'][:50]} ({url[:60]})")
        except Exception as e:
            logger.warning(f"Link processing failed for {url[:60]}: {e}")
            continue
    return signals

View File

@@ -0,0 +1,188 @@
"""
Meet Cross-Reference Agent
Finds connections between meeting signals and existing Telegram group signals.
Surfaces: confirmations (meeting agrees with chat), contradictions (meeting contradicts chat),
and blind spots (meeting discusses something chat groups don't know about).
"""
import logging
from backend.providers import call_llm
from backend.db.chroma import query_signals, get_all_signals
from backend.config import MEET_CROSS_REF_GROUPS, MEET_DEFAULT_GROUP_ID
logger = logging.getLogger("thirdeye.agents.meet_cross_ref")
CROSS_REF_SYSTEM_PROMPT = """You are an expert at finding connections between meeting discussions and team chat history.
You will receive:
1. MEETING SIGNALS — decisions, action items, blockers, risks from a recent Google Meet
2. CHAT SIGNALS — existing signals from team Telegram groups
Find meaningful connections across three categories:
CONFIRMATIONS: Meeting agrees with or reinforces something from chat history
CONTRADICTIONS: Meeting decision conflicts with what was said/decided in chat
BLIND SPOTS: Important things from the meeting that the chat teams don't seem to know about
Return ONLY a valid JSON object:
{
"confirmations": [
{"meeting_signal": "...", "chat_signal": "...", "group": "...", "significance": "high|medium|low"}
],
"contradictions": [
{"meeting_signal": "...", "chat_signal": "...", "group": "...", "impact": "...", "significance": "high|medium|low"}
],
"blind_spots": [
{"meeting_signal": "...", "teams_unaware": ["group1", "group2"], "recommendation": "..."}
]
}
Rules:
- Only include HIGH confidence matches — do not stretch for weak connections
- Keep each signal description concise (1 sentence max)
- significance "high" = this matters for team alignment; "medium" = worth noting; "low" = minor
- If a category has nothing meaningful, use an empty array []
- Return JSON only"""
async def find_cross_references(
    meeting_id: str,
    group_id: str = None,
    cross_ref_group_ids: list[str] = None,
) -> dict:
    """
    Compare meeting signals against chat group signals.

    Pipeline: load structured meeting signals → load up to 20 signals per
    cross-ref group → ask the reasoning LLM to match them. Every failure
    path returns a dict with empty lists plus an "error" key — never raises.

    Args:
        meeting_id: The meeting to analyze
        group_id: ChromaDB group where meet signals are stored (defaults to MEET_DEFAULT_GROUP_ID)
        cross_ref_group_ids: Groups to compare against (defaults to MEET_CROSS_REF_GROUPS from config)

    Returns:
        Dict with confirmations, contradictions, blind_spots lists
    """
    group_id = group_id or MEET_DEFAULT_GROUP_ID
    cross_ref_group_ids = cross_ref_group_ids or MEET_CROSS_REF_GROUPS
    if not cross_ref_group_ids:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": "No cross-reference groups configured. Set MEET_CROSS_REF_GROUPS in .env",
        }
    # 1. Get meeting signals (decisions, actions, blockers, risks — NOT raw chunks)
    meet_signals = query_signals(group_id, meeting_id, n_results=30)
    structured_meet = [
        s for s in meet_signals
        if s.get("metadata", {}).get("type") in ("meet_decision", "meet_action_item", "meet_blocker", "meet_risk", "meet_open_q")
    ]
    if not structured_meet:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": f"No structured signals found for meeting {meeting_id}. Has it been processed yet?",
        }
    # 2. Get signals from each cross-reference group
    chat_context_parts = []
    for gid in cross_ref_group_ids:
        try:
            all_sig = get_all_signals(gid)
            if all_sig:
                formatted = "\n".join([
                    f"  [{s.get('metadata', {}).get('type', '?')}] {s.get('document', '')[:120]}"
                    for s in all_sig[:20]  # Cap at 20 per group to stay within token limits
                ])
                chat_context_parts.append(f"Group '{gid}':\n{formatted}")
        except Exception as e:
            # A single unreadable group shouldn't abort the whole comparison.
            logger.warning(f"Could not load signals for group {gid}: {e}")
    if not chat_context_parts:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": "Could not load any signals from cross-reference groups.",
        }
    # 3. Format inputs for LLM
    meet_text = "\n".join([
        f"  [{s.get('metadata', {}).get('type', '?')}] {s.get('document', '')[:150]}" for s in structured_meet
    ])
    chat_text = "\n\n".join(chat_context_parts)
    prompt = f"""MEETING SIGNALS (from meeting: {meeting_id}):
{meet_text}
CHAT SIGNALS (from monitored Telegram groups):
{chat_text}"""
    try:
        import json
        result = await call_llm(
            task_type="reasoning",
            messages=[
                {"role": "system", "content": CROSS_REF_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=0.2,
            max_tokens=1500,
            response_format={"type": "json_object"},
        )
        raw = result["content"].strip()
        # Strip markdown code fences in case the model ignored json_object mode.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        return json.loads(raw)
    except Exception as e:
        logger.error(f"Cross-reference LLM call failed: {e}")
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": str(e),
        }
def format_cross_ref_for_telegram(analysis: dict, meeting_id: str) -> str:
    """Render cross-reference results as a Markdown message for Telegram.

    Error and empty analyses produce single-line messages; otherwise each
    non-empty category is rendered with at most 3 entries for readability.
    """
    if analysis.get("error"):
        return f"⚠️ Cross-reference failed: {analysis['error']}"
    confirmations = analysis.get("confirmations", [])
    contradictions = analysis.get("contradictions", [])
    blind_spots = analysis.get("blind_spots", [])
    if not (confirmations or contradictions or blind_spots):
        return f"🔗 *Meet ↔ Chat Cross-Reference*\nMeeting `{meeting_id}`: No significant connections found between this meeting and your chat groups."
    lines = [f"🔗 *Meet ↔ Chat Cross-Reference*\nMeeting: `{meeting_id}`\n"]
    if confirmations:
        lines.append(f"✅ *Confirmations* ({len(confirmations)})")
        for entry in confirmations[:3]:  # Cap at 3 for readability
            marker = "🔴" if entry.get("significance") == "high" else "🟡"
            lines.append(f"{marker} Meeting: _{entry['meeting_signal'][:100]}_")
            lines.append(f"   Matches [{entry.get('group', '?')}]: _{entry['chat_signal'][:100]}_\n")
    if contradictions:
        lines.append(f"⚡ *Contradictions* ({len(contradictions)}) — ACTION NEEDED")
        for entry in contradictions[:3]:
            lines.append(f"🔴 Meeting decided: _{entry['meeting_signal'][:100]}_")
            lines.append(f"   BUT [{entry.get('group', '?')}] says: _{entry['chat_signal'][:100]}_")
            if entry.get("impact"):
                lines.append(f"   Impact: {entry['impact'][:100]}\n")
    if blind_spots:
        lines.append(f"🔦 *Blind Spots* ({len(blind_spots)}) — Teams may not know")
        for entry in blind_spots[:3]:
            lines.append(f"🟠 {entry['meeting_signal'][:120]}")
            if entry.get("recommendation"):
                lines.append(f"{entry['recommendation'][:100]}\n")
    return "\n".join(lines)

View File

@@ -0,0 +1,342 @@
"""
Meet Ingestor Agent
Processes raw Google Meet transcript chunks and extracts structured signals.
Signal types produced:
meet_decision — A decision made during the meeting
meet_action_item — A task assigned to someone
meet_blocker — A blocker or dependency raised
meet_risk — A risk or concern identified
meet_open_q — An unresolved question left open
meet_summary — Full meeting summary (emitted on is_final=True)
meet_chunk_raw — Raw transcript chunk (always stored, for full-text search)
"""
import asyncio
import json
import logging
import uuid
from datetime import datetime
from backend.providers import call_llm
from backend.db.chroma import store_signals
logger = logging.getLogger("thirdeye.agents.meet_ingestor")
# ─── Extraction prompt ───────────────────────────────────────────────────────
EXTRACTION_SYSTEM_PROMPT = """You are an expert meeting analyst. You receive raw transcript chunks from a Google Meet recording and extract structured signals.
Extract ONLY signals that are clearly present. Do NOT hallucinate or infer beyond what is stated.
Return ONLY a valid JSON object with this exact structure:
{
"decisions": [
{"text": "...", "owner": "@name or null", "confidence": "high|medium|low"}
],
"action_items": [
{"text": "...", "owner": "@name or null", "due": "date string or null", "confidence": "high|medium|low"}
],
"blockers": [
{"text": "...", "blocking_what": "...", "confidence": "high|medium|low"}
],
"risks": [
{"text": "...", "severity": "high|medium|low", "confidence": "high|medium|low"}
],
"open_questions": [
{"text": "...", "confidence": "high|medium|low"}
]
}
Rules:
- If a category has nothing, use an empty array []
- owner must start with @ if it's a person's name (e.g. "@Alex")
- text must be a clear, standalone sentence — not a fragment
- Only include confidence "high" if the signal is unambiguous
- Do NOT reproduce filler words, pleasantries, or off-topic banter
- Return JSON only — no markdown, no preamble, no explanation"""
SUMMARY_SYSTEM_PROMPT = """You are a meeting intelligence expert. Given a full meeting transcript (possibly from multiple chunks), write a concise but complete meeting summary.
Structure your summary as:
1. One-sentence overview (what was the meeting about)
2. Key decisions made (bullet points, max 5)
3. Action items assigned (who does what by when)
4. Blockers or risks raised
5. Open questions still unresolved
Keep the summary under 400 words. Be specific. Use names when available. Do NOT use filler phrases like "the team discussed" — just state what was decided/agreed/assigned."""
# ─── Signal builder ─────────────────────────────────────────────────────────
def _build_signal(
signal_type: str,
summary: str,
raw_quote: str,
severity: str,
entities: list[str],
keywords: list[str],
timestamp: str,
group_id: str,
meeting_id: str,
urgency: str = "none",
status: str = "open",
) -> dict:
return {
"id": str(uuid.uuid4()),
"type": signal_type,
"summary": summary,
"raw_quote": raw_quote[:500] if raw_quote else "",
"severity": severity,
"status": status,
"sentiment": "neutral",
"urgency": urgency,
"entities": entities,
"keywords": keywords,
"timestamp": timestamp,
"group_id": group_id,
"lens": "meet",
"meeting_id": meeting_id,
}
def _extract_entities(text: str, owner: str = None) -> list[str]:
"""Extract entity strings from text (names starting with @)."""
import re
entities = re.findall(r"@[\w]+", text)
if owner and owner.startswith("@"):
entities.append(owner)
return list(set(entities))
def _extract_keywords(text: str) -> list[str]:
"""Simple keyword extraction: lowercase meaningful words."""
stopwords = {"the", "a", "an", "is", "are", "was", "were", "will", "to", "of",
"in", "on", "at", "for", "by", "with", "this", "that", "and", "or",
"but", "we", "i", "it", "be", "do", "have", "has", "had", "not"}
words = text.lower().split()
keywords = [w.strip(".,!?;:\"'") for w in words if len(w) > 3 and w not in stopwords]
return list(dict.fromkeys(keywords))[:10] # deduplicate, keep first 10
# ─── Main processing function ────────────────────────────────────────────────
async def process_meet_chunk(
    meeting_id: str,
    group_id: str,
    chunk_index: int,
    text: str,
    speaker: str,
    timestamp: str,
    is_final: bool,
):
    """
    Full pipeline for a transcript chunk:
    1. Always store raw chunk for full-text search
    2. Extract structured signals via LLM
    3. If is_final, generate a full meeting summary

    Extraction failures are non-fatal: the raw chunk is still stored and an
    empty extraction result is used. Decisions, action items, blockers, and
    open questions are kept only at high/medium confidence; risks are kept
    regardless of confidence (their severity field carries the weight).

    Returns:
        The list of signal dicts that were stored for this chunk.
    """
    logger.info(f"Processing meet chunk {chunk_index} for meeting {meeting_id} ({len(text)} chars)")
    signals_to_store = []
    # 1. Always store the raw chunk (enables full-text similarity search later)
    raw_signal = _build_signal(
        signal_type="meet_chunk_raw",
        summary=f"[{meeting_id}] Chunk {chunk_index}: {text[:120]}...",
        raw_quote=text,
        severity="low",
        entities=[f"@{speaker}"] if speaker and speaker != "Unknown" else [],
        keywords=_extract_keywords(text),
        timestamp=timestamp,
        group_id=group_id,
        meeting_id=meeting_id,
    )
    signals_to_store.append(raw_signal)
    # 2. Extract structured signals via LLM
    try:
        result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": EXTRACTION_SYSTEM_PROMPT},
                {"role": "user", "content": f"Transcript chunk from speaker '{speaker}':\n\n{text}"},
            ],
            temperature=0.1,
            max_tokens=1500,
            response_format={"type": "json_object"},
        )
        raw_json = result["content"].strip()
        # Strip markdown code fences if present
        if raw_json.startswith("```"):
            raw_json = raw_json.split("```")[1]
            if raw_json.startswith("json"):
                raw_json = raw_json[4:]
        extracted = json.loads(raw_json)
    except Exception as e:
        # Non-fatal: the raw chunk above is stored regardless.
        logger.warning(f"Meet extraction LLM failed for chunk {chunk_index}: {e}")
        extracted = {}
    # Decisions
    for item in extracted.get("decisions", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_decision",
                summary=item["text"],
                raw_quote=item["text"],
                severity="medium",
                entities=_extract_entities(item["text"], item.get("owner")),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                status="decided",
            ))
    # Action items
    for item in extracted.get("action_items", []):
        if item.get("confidence") in ("high", "medium"):
            due_str = f" Due: {item['due']}." if item.get("due") else ""
            signals_to_store.append(_build_signal(
                signal_type="meet_action_item",
                summary=f"{item['text']}{due_str}",
                raw_quote=item["text"],
                severity="medium",
                entities=_extract_entities(item["text"], item.get("owner")),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                urgency="medium" if item.get("due") else "low",
                status="open",
            ))
    # Blockers
    for item in extracted.get("blockers", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_blocker",
                summary=item["text"],
                raw_quote=item["text"],
                severity="high",
                entities=_extract_entities(item["text"]),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                urgency="high",
                status="open",
            ))
    # Risks (no confidence filter — severity from the LLM carries the weight)
    for item in extracted.get("risks", []):
        signals_to_store.append(_build_signal(
            signal_type="meet_risk",
            summary=item["text"],
            raw_quote=item["text"],
            severity=item.get("severity", "medium"),
            entities=_extract_entities(item["text"]),
            keywords=_extract_keywords(item["text"]),
            timestamp=timestamp,
            group_id=group_id,
            meeting_id=meeting_id,
            urgency="medium",
            status="open",
        ))
    # Open questions
    for item in extracted.get("open_questions", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_open_q",
                summary=item["text"],
                raw_quote=item["text"],
                severity="low",
                entities=_extract_entities(item["text"]),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                status="open",
            ))
    # 3. If this is the final chunk, generate a meeting summary
    if is_final:
        summary_signal = await _generate_meeting_summary(
            meeting_id, group_id, text, speaker, timestamp
        )
        if summary_signal:
            signals_to_store.append(summary_signal)
    # Store everything
    if signals_to_store:
        store_signals(group_id, signals_to_store)
        logger.info(
            f"Stored {len(signals_to_store)} signals for meeting {meeting_id} chunk {chunk_index}"
        )
    return signals_to_store
async def _generate_meeting_summary(
    meeting_id: str,
    group_id: str,
    final_chunk_text: str,
    speaker: str,
    timestamp: str,
) -> dict | None:
    """
    Pull all raw chunks for this meeting from ChromaDB and generate a summary.
    Falls back to summarizing just the final chunk if retrieval fails.

    Returns:
        A "meet_summary" signal dict, or None when the LLM call fails.
    """
    from backend.db.chroma import query_signals
    try:
        # Get all raw chunks for this meeting
        raw_chunks = query_signals(
            group_id,
            meeting_id,
            n_results=50,
            signal_type="meet_chunk_raw",
        )
        full_transcript = "\n\n".join(
            [s.get("metadata", {}).get("raw_quote", "") or s.get("document", "") for s in raw_chunks]
        )
        if not full_transcript.strip():
            full_transcript = final_chunk_text
    except Exception:
        # Retrieval is best-effort; the final chunk alone still gives a summary.
        full_transcript = final_chunk_text
    try:
        result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
                {
                    "role": "user",
                    # Transcript capped at 6000 chars to stay within token limits.
                    "content": f"Meeting ID: {meeting_id}\n\nFull transcript:\n\n{full_transcript[:6000]}",
                },
            ],
            temperature=0.3,
            max_tokens=600,
        )
        summary_text = result["content"].strip()
    except Exception as e:
        logger.warning(f"Meeting summary generation failed: {e}")
        return None
    return _build_signal(
        signal_type="meet_summary",
        summary=summary_text,
        raw_quote=full_transcript[:500],
        severity="medium",
        entities=[f"@{speaker}"] if speaker and speaker != "Unknown" else [],
        keywords=_extract_keywords(summary_text),
        timestamp=timestamp,
        group_id=group_id,
        meeting_id=meeting_id,
        status="completed",
    )

View File

@@ -0,0 +1,114 @@
"""Pattern Detector Agent — finds trends and anomalies in accumulated signals."""
import logging
from backend.providers import call_llm
from backend.db.chroma import get_all_signals
from backend.db.models import Pattern
from backend.agents.json_utils import extract_json_object
logger = logging.getLogger("thirdeye.agents.pattern_detector")
SYSTEM_PROMPT = """You are the Pattern Detector for ThirdEye. You analyze accumulated signals to find patterns and anomalies.
Detect these pattern types:
- frequency_spike: A signal type mentioned significantly more than usual
- knowledge_silo: Only one person discusses a critical topic (bus factor = 1)
- recurring_issue: Same bug/problem appearing repeatedly
- sentiment_trend: Gradual shift in tone over time
- stale_item: Decisions proposed but never resolved, promises with no follow-up
Respond ONLY with valid JSON (no markdown, no backticks):
{"patterns": [{"type": "pattern_type", "description": "Clear human-readable description", "severity": "info|warning|critical", "evidence_ids": [], "recommendation": "Suggested action"}]}
If no patterns found: {"patterns": []}
Only report patterns that are genuinely concerning. Do NOT manufacture patterns from insufficient data."""
def _heuristic_detect_patterns(group_id: str, all_signals: list[dict]) -> list[Pattern]:
    """Generate conservative patterns from signal metadata when LLM output is unavailable.

    Two deliberately narrow heuristics (to avoid manufacturing patterns):
      - recurring_issue: a recurrence-prone signal type seen at least twice
      - knowledge_silo: repeated payment/Stripe entity mentions
    Returns at most 5 Pattern objects.
    """
    from collections import Counter
    patterns: list[Pattern] = []
    type_counts: Counter = Counter()
    entity_counts: Counter = Counter()
    for s in all_signals:
        meta = s.get("metadata", {})
        type_counts[str(meta.get("type", "unknown"))] += 1
        entities = meta.get("entities", [])
        if isinstance(entities, str):
            # Tolerate single-string metadata by treating it as a 1-item list.
            entities = [entities]
        if isinstance(entities, list):
            for ent in entities:
                ent_key = str(ent).strip()
                if ent_key:
                    entity_counts[ent_key] += 1
    # Recurring issues: only flag types where repetition is itself the signal.
    recurring_types = [t for t, c in type_counts.items() if c >= 2 and t in {"recurring_bug", "workaround", "tech_debt"}]
    for signal_type in recurring_types:
        patterns.append(Pattern(
            group_id=group_id,
            type="recurring_issue",
            description=f"Signal type '{signal_type}' has appeared repeatedly ({type_counts[signal_type]} times).",
            severity="warning",
            recommendation="Create a dedicated action item with owner and due date to stop repeated recurrence.",
        ))
    # Knowledge silo: repeated payment-related entities suggest a bus factor of 1.
    silo_entities = [ent for ent, c in entity_counts.items() if c >= 2]
    if any("stripe" in ent.lower() or "payment" in ent.lower() for ent in silo_entities):
        patterns.append(Pattern(
            group_id=group_id,
            type="knowledge_silo",
            description="Critical payment-related topics are concentrated in repeated mentions, suggesting low bus factor.",
            severity="warning",
            recommendation="Document payment workflows and assign at least one backup owner.",
        ))
    return patterns[:5]
async def detect_patterns(group_id: str) -> list[Pattern]:
    """Analyze all signals in a group and detect patterns.

    Requires at least 3 stored signals. On LLM or JSON-parse failure, falls
    back to _heuristic_detect_patterns(), so callers always get a (possibly
    empty) list and never an exception.
    """
    all_signals = get_all_signals(group_id)
    if len(all_signals) < 3:
        logger.info(f"Not enough signals ({len(all_signals)}) for pattern detection in {group_id}")
        return []
    # Format signals for the LLM
    signal_summary = []
    for s in all_signals:
        meta = s["metadata"]
        signal_summary.append(
            f"- [{meta.get('type', '?')}] {s['document'][:100]} "
            f"(severity={meta.get('severity', '?')}, entities={meta.get('entities', '[]')}, "
            f"time={meta.get('timestamp', '?')})"
        )
    signals_text = "\n".join(signal_summary)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Analyze these {len(all_signals)} signals from group '{group_id}':\n\n{signals_text}"},
    ]
    try:
        result = await call_llm("reasoning", messages, temperature=0.2, max_tokens=1500)
        parsed = extract_json_object(result.get("content", ""))
        patterns = []
        for p in parsed.get("patterns", []):
            patterns.append(Pattern(
                group_id=group_id,
                type=p.get("type", "unknown"),
                description=p.get("description", ""),
                severity=p.get("severity", "info"),
                recommendation=p.get("recommendation", ""),
            ))
        logger.info(f"Detected {len(patterns)} patterns in {group_id}")
        return patterns
    except Exception as e:
        # LLM/parse issues downgrade to heuristics rather than failing the caller.
        logger.info(f"Pattern detection LLM parse issue, using fallback: {e}")
        fallback = _heuristic_detect_patterns(group_id, all_signals)
        if fallback:
            logger.info(f"Pattern heuristic fallback produced {len(fallback)} patterns in {group_id}")
        return fallback

View File

@@ -0,0 +1,68 @@
"""
Query Agent — voice-aware signal context formatting for ThirdEye.
Provides _format_signal_for_context() which labels each ChromaDB signal with
its true origin (voice note, document, meeting, chat) so the LLM can produce
properly attributed answers like:
"Based on what @Raj said in a voice note on Mar 14 (45s), the team decided..."
"""
from datetime import datetime
VOICE_CITATION_INSTRUCTION = """
When context includes [VOICE NOTE — @name on Date (Xs)] signals, ALWAYS cite the voice note explicitly.
Example: "Based on what @Raj said in a voice note on Mar 14 (45s), the team decided to use PostgreSQL."
Never flatten voice signals into generic "the team discussed" language. Always name the speaker and source.
"""
def _format_signal_for_context(signal: dict) -> str:
"""
Format a ChromaDB signal as a context snippet for the Query Agent LLM.
Voice-sourced signals get explicit attribution so the LLM cites them correctly.
Accepts both flat signal dicts and dicts with a nested 'metadata' key.
"""
# Support both flat dicts and ChromaDB-style {"metadata": {...}, "document": ...}
meta = signal.get("metadata", signal)
source = meta.get("source", signal.get("source", "chat"))
sig_type = meta.get("type", signal.get("type", "unknown"))
summary = meta.get("summary", signal.get("summary", ""))
timestamp = meta.get("timestamp", signal.get("timestamp", ""))
date_str = ""
if timestamp:
try:
dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
date_str = dt.strftime("%b %d")
except Exception:
date_str = timestamp[:10]
if source == "voice":
speaker = meta.get("speaker", signal.get("speaker", "Unknown"))
duration = meta.get("voice_duration", signal.get("voice_duration", 0))
duration_str = f"{duration}s" if duration else "?"
return (
f"[VOICE NOTE — @{speaker} on {date_str} ({duration_str})] "
f"[{sig_type}] {summary}"
)
if source == "document":
return f"[DOCUMENT — {date_str}] [{sig_type}] {summary}"
if source == "link":
return f"[WEB LINK — {date_str}] [{sig_type}] {summary}"
if sig_type in ("meet_decision", "meet_action_item", "meet_blocker", "meet_summary"):
meeting_id = meta.get("meeting_id", signal.get("meeting_id", ""))
return f"[MEETING {meeting_id}{date_str}] [{sig_type}] {summary}"
entities_raw = meta.get("entities", signal.get("entities", []))
if isinstance(entities_raw, str):
import json
try:
entities_raw = json.loads(entities_raw)
except Exception:
entities_raw = []
sender_str = entities_raw[0] if entities_raw else ""
return f"[CHAT — {sender_str} on {date_str}] [{sig_type}] {summary}"

View File

@@ -0,0 +1,128 @@
"""Signal Extractor Agent — extracts structured signals from chat messages."""
import logging
from backend.providers import call_llm
from backend.db.models import Signal
from datetime import datetime
from backend.agents.json_utils import extract_json_object
logger = logging.getLogger("thirdeye.agents.signal_extractor")
# Lens-specific system prompts
LENS_PROMPTS = {
"dev": """You are the Signal Extractor for ThirdEye operating in DevLens mode.
You analyze batches of developer team chat messages and extract STRUCTURED SIGNALS.
Extract ONLY signals that represent meaningful technical information. Skip greetings, small talk, emoji reactions, and meta-conversation.
Signal types to look for:
- architecture_decision: Technology choices, design decisions with rationale
- tech_debt: Shortcuts, hardcoded values, "will fix later" patterns
- knowledge_silo_evidence: Only one person discusses a critical topic
- recurring_bug: Same issue mentioned repeatedly
- stack_decision: Technology/framework choices (proposed or decided)
- deployment_risk: Risky deployment practices
- workaround: Temporary fixes being applied repeatedly
- delivery_commitment: A team member gives an explicit or estimated timeline/ETA ("will take 2 days", "done by Friday", "approx 3 hours")
Pay SPECIAL attention to delivery commitments — capture the person's name, the work item, and the exact time estimate.
For EACH signal found, include it in the JSON array. If NO meaningful signals exist, return empty array.
Be SELECTIVE. Quality over quantity.""",
"product": """You are the Signal Extractor for ThirdEye operating in ProductLens mode.
Signal types to look for:
- feature_request: Features users or team members are asking for
- delivery_commitment: A team member gives an explicit or estimated timeline/ETA ("will take 2 days", "done by Friday", "approx 3 hours")
- user_pain_point: User difficulties, complaints, confusion
- roadmap_drift: Discussion of topics not on the current plan
- priority_conflict: Team members disagreeing on what's most important
- metric_mention: Specific numbers, conversion rates, performance data
- user_quote: Direct quotes from users/customers
- competitor_intel: Mentions of competitor actions or features
Pay SPECIAL attention to delivery commitments — capture the person's name, the work item, and the exact time estimate.
Be SELECTIVE. Quality over quantity.""",
"client": """You are the Signal Extractor for ThirdEye operating in ClientLens mode.
Signal types to look for:
- promise: Commitments made with deadlines (explicit or implicit)
- scope_creep: Additional requests introduced casually without formal change requests
- sentiment_signal: Tone changes (positive praise, growing frustration, formality shifts)
- unanswered_request: Questions or requests that haven't received responses
- satisfaction: Explicit positive or negative feedback
- escalation_risk: Mentions of involving management, expressing deadline concerns
- client_decision: Decisions made by the client
Pay SPECIAL attention to implicit deadlines ("by end of week", "before the meeting").
Be SELECTIVE. Quality over quantity.""",
"community": """You are the Signal Extractor for ThirdEye operating in CommunityLens mode.
Signal types: recommendation, event, issue, local_knowledge, question
Be SELECTIVE. Quality over quantity.""",
}
EXTRACTION_FORMAT = """
Respond ONLY with valid JSON in this exact format (no markdown, no backticks, no explanation):
{"signals": [{"type": "signal_type_here", "summary": "One clear sentence that includes specific names, numbers, timelines, and commitments", "entities": ["@person", "technology"], "severity": "low|medium|high|critical", "status": "proposed|decided|implemented|unresolved", "raw_quote": "Exact verbatim sentence(s) from the message that capture the full claim, including names, numbers, and timelines", "message_index": 0}]}
IMPORTANT for raw_quote: copy the FULL relevant sentence from the message, not just a topic keyword.
Example — message "Anirban: feature page revamp will take approx 2 more days"
WRONG raw_quote: "feature page revamp"
CORRECT raw_quote: "feature page revamp will take approx 2 more days"
If no signals found: {"signals": []}
"""
async def extract_signals(messages_text: str, group_id: str, lens: str = "dev") -> list[Signal]:
    """
    Extract structured signals from a batch of formatted chat messages.

    Args:
        messages_text: Formatted string like "[Alex]: Let's use Redis\\n[Bob]: Agreed"
        group_id: Telegram group ID
        lens: Active lens mode (dev, product, client, community)

    Returns:
        List of Signal objects (empty list on any extraction failure — non-fatal)
    """
    system_prompt = LENS_PROMPTS.get(lens, LENS_PROMPTS["dev"])
    messages = [
        {"role": "system", "content": system_prompt + "\n\n" + EXTRACTION_FORMAT},
        {"role": "user", "content": f"Extract signals from these messages:\n\n{messages_text}"},
    ]
    try:
        result = await call_llm("fast_large", messages, temperature=0.2, max_tokens=2000)
        parsed = extract_json_object(result.get("content", ""))
        raw_signals = parsed.get("signals", [])
        # Guard against a malformed payload where "signals" is not a list.
        if not isinstance(raw_signals, list):
            raw_signals = []
        # Convert to Signal objects; skip individual malformed entries.
        signals = []
        for raw in raw_signals:
            try:
                signal = Signal(
                    group_id=group_id,
                    lens=lens,
                    type=raw.get("type", "unknown"),
                    summary=raw.get("summary", ""),
                    entities=raw.get("entities", []),
                    severity=raw.get("severity", "low"),
                    status=raw.get("status", "unknown"),
                    raw_quote=raw.get("raw_quote", ""),
                    timestamp=datetime.utcnow().isoformat(),
                )
                signals.append(signal)
            except Exception as e:
                logger.warning(f"Failed to parse signal: {e}")
                continue
        # .get() here so a missing "provider" key cannot raise inside the try
        # and silently discard signals that were already parsed successfully.
        logger.info(
            f"Extracted {len(signals)} signals from {group_id} (lens={lens}) "
            f"via {result.get('provider', 'unknown')}"
        )
        return signals
    except Exception as e:
        logger.error(f"Signal extraction failed: {e}")
        return []

View File

@@ -0,0 +1,281 @@
"""
Voice Handler
Orchestrates the full pipeline for Telegram voice messages and video notes:
Telegram voice/video_note message
-> download audio bytes
-> transcribe via Groq Whisper (voice_transcriber.py)
-> build a voice_transcript signal (stored raw for full-text search)
-> run transcript through process_message_batch (signal extraction)
-> all extracted signals carry voice attribution metadata
Voice metadata attached to every extracted signal:
source: "voice"
voice_file_id: Telegram file ID
voice_duration: seconds
speaker: sender display name
"""
import logging
import uuid
from datetime import datetime, timezone
from backend.agents.voice_transcriber import (
transcribe_audio, download_telegram_audio, format_duration
)
from backend.config import ENABLE_VOICE_TRANSCRIPTION, VOICE_STORE_TRANSCRIPT
from backend.db.chroma import store_signals
from backend.pipeline import process_message_batch
logger = logging.getLogger("thirdeye.agents.voice_handler")
# --- Voice transcript signal builder -----------------------------------------
def build_voice_transcript_signal(
    transcript: str,
    sender: str,
    group_id: str,
    voice_file_id: str,
    duration_seconds: int,
    language: str,
    timestamp: str,
) -> dict:
    """
    Build a voice_transcript signal carrying the full raw transcription.

    Stored alongside any extracted signals so the complete transcript stays
    searchable in ChromaDB even when no structured signals were extracted.
    """
    # Identity + content of the signal.
    identity = {
        "id": str(uuid.uuid4()),
        "type": "voice_transcript",
        "summary": f"[Voice {format_duration(duration_seconds)}] @{sender}: {transcript[:200]}",
        "raw_quote": transcript,
    }
    # Fixed classification defaults for a raw transcript.
    classification = {
        "severity": "low",
        "status": "transcribed",
        "sentiment": "neutral",
        "urgency": "none",
    }
    # Searchable metadata.
    context = {
        "entities": [f"@{sender}"],
        "keywords": _extract_voice_keywords(transcript),
        "timestamp": timestamp,
        "group_id": group_id,
        "lens": "voice",
    }
    # Voice attribution for /ask citations.
    attribution = {
        "source": "voice",
        "voice_file_id": voice_file_id,
        "voice_duration": duration_seconds,
        "voice_language": language,
        "speaker": sender,
    }
    return {**identity, **classification, **context, **attribution}
def _extract_voice_keywords(text: str) -> list[str]:
"""Simple keyword extraction from transcript text."""
stopwords = {
"the", "a", "an", "is", "are", "was", "were", "will", "to", "of",
"in", "on", "at", "for", "by", "with", "this", "that", "and", "or",
"but", "we", "i", "it", "be", "do", "have", "has", "had", "not",
"so", "just", "like", "yeah", "okay", "um", "uh", "you", "me",
}
words = text.lower().split()
keywords = [w.strip(".,!?;:\"'") for w in words if len(w) > 3 and w not in stopwords]
return list(dict.fromkeys(keywords))[:12]
def _inject_voice_metadata(signals: list, voice_meta: dict) -> list[dict]:
"""
Inject voice attribution into every signal extracted from a voice transcript.
Accepts both Signal Pydantic model objects and plain dicts.
This ensures /ask can cite the voice source in its answers.
"""
result = []
for signal in signals:
sig = signal.model_dump() if hasattr(signal, "model_dump") else dict(signal)
sig["source"] = "voice"
sig["voice_file_id"] = voice_meta.get("voice_file_id", "")
sig["voice_duration"] = voice_meta.get("duration_seconds", 0)
sig["voice_language"] = voice_meta.get("language", "")
sig["speaker"] = voice_meta.get("sender", "Unknown")
if "[Voice]" not in sig.get("summary", ""):
sig["summary"] = f"[Voice @{voice_meta.get('sender', '?')}] {sig['summary']}"
result.append(sig)
return result
# --- Fallback signal builder -------------------------------------------------
# Keywords that hint at a signal type when the LLM extraction returns nothing
# Keywords that hint at a signal type when the LLM extraction returns nothing.
# Used by _build_fallback_signal: the type with the largest word overlap wins.
_FALLBACK_TYPE_HINTS = {
    # Requests for new or changed functionality / UI work
    "feature_request": {
        "need", "needs", "required", "require", "want", "should", "missing",
        "add", "feature", "ui", "ux", "design", "change", "changes", "update",
        "improve", "improvement", "responsiveness", "responsive",
    },
    # Someone is stuck or something is broken
    "blocker": {
        "blocked", "blocking", "blocker", "stuck", "waiting", "can't", "cannot",
        "issue", "problem", "broken", "fails", "failing",
    },
    # Concrete work someone intends to do
    "action_item": {
        "will", "going", "plan", "todo", "do", "fix", "implement", "setup",
        "create", "build", "deploy", "check",
    },
    # Expressions of concern, urgency, or schedule pressure
    "risk": {
        "risk", "risky", "concern", "worried", "urgent", "urgently", "critical",
        "deadline", "delay", "late",
    },
}
def _build_fallback_signal(
    transcript: str,
    sender: str,
    group_id: str,
    timestamp: str,
    voice_meta: dict,
) -> dict:
    """
    Build a best-effort structured signal for a voice transcript the LLM
    extracted nothing from. Scores keyword-hint overlap per signal type and
    falls back to 'feature_request' when no hints match at all.
    """
    token_set = set(transcript.lower().split())
    # Score each candidate type by hint overlap, preserving hint-dict order
    # so ties resolve the same way every run.
    hint_scores = {}
    for sig_type, hints in _FALLBACK_TYPE_HINTS.items():
        hint_scores[sig_type] = len(token_set & hints)
    if any(hint_scores.values()):
        best_type = max(hint_scores, key=hint_scores.get)
    else:
        best_type = "feature_request"
    # Any urgency word bumps severity/urgency to high.
    is_urgent = bool(token_set & {"urgent", "urgently", "asap", "immediately", "critical", "now"})
    severity = "high" if is_urgent else "medium"
    summary = transcript[:200].strip()
    if len(transcript) > 200:
        summary = summary + "..."
    return {
        "id": str(uuid.uuid4()),
        "type": best_type,
        "summary": f"[Voice @{sender}] {summary}",
        "raw_quote": transcript[:500],
        "severity": severity,
        "status": "unresolved",
        "sentiment": "neutral",
        "urgency": "high" if is_urgent else "medium",
        "entities": [f"@{sender}"],
        "keywords": _extract_voice_keywords(transcript),
        "timestamp": timestamp,
        "group_id": group_id,
        "lens": "voice",
        "source": "voice",
        "speaker": sender,
        "voice_file_id": voice_meta.get("voice_file_id", ""),
        "voice_duration": voice_meta.get("duration_seconds", 0),
        "voice_language": voice_meta.get("language", ""),
    }
# --- Main handler ------------------------------------------------------------
async def handle_voice_message(
    bot,
    group_id: str,
    sender: str,
    file_id: str,
    duration_seconds: int,
    message_date,
    is_video_note: bool = False,
) -> dict:
    """
    Full pipeline for a single voice or video note message.

    Args:
        bot: Telegram bot instance used to download the media file.
        group_id: Telegram group ID the message came from.
        sender: Display name of the sender (used for attribution).
        file_id: Telegram file ID of the voice/video_note media.
        duration_seconds: Duration from Telegram metadata.
        message_date: Message datetime; treated as naive UTC via
            replace(tzinfo=...) — NOTE(review): confirm the Telegram client
            actually delivers naive-UTC datetimes here.
        is_video_note: True for round video notes (MP4) vs voice notes (OGG).

    Returns:
        {"ok": True, "transcript": "...", "signals_extracted": 3, "duration": 45, ...}
        OR {"ok": False, "reason": "...", "error": "..."}
    """
    # Feature flag: bail out before any network work when disabled.
    if not ENABLE_VOICE_TRANSCRIPTION:
        return {"ok": False, "reason": "disabled", "error": "Voice transcription is disabled"}
    msg_type = "video note" if is_video_note else "voice message"
    logger.info(f"Processing {msg_type} from {sender} in {group_id} ({duration_seconds}s)")
    # 1. Download audio — a failed download aborts the whole pipeline.
    try:
        audio_bytes = await download_telegram_audio(bot, file_id)
    except Exception as e:
        logger.error(f"Failed to download audio from {sender}: {e}")
        return {"ok": False, "reason": "download_failed", "error": str(e)}
    # 2. Transcribe — the filename extension is a format hint for Whisper
    # (video notes are MP4, voice messages are OGG/Opus).
    filename = "audio.mp4" if is_video_note else "audio.ogg"
    transcription = await transcribe_audio(
        audio_bytes,
        filename=filename,
        duration_seconds=duration_seconds,
    )
    if not transcription["ok"]:
        logger.info(f"Transcription skipped for {sender}: {transcription['reason']}")
        return {"ok": False, "reason": transcription["reason"], "error": transcription.get("error", "")}
    transcript = transcription["transcript"]
    language = transcription.get("language", "unknown")
    timestamp = (
        message_date.replace(tzinfo=timezone.utc).isoformat()
        if message_date else datetime.utcnow().isoformat()
    )
    # 3. Store raw voice transcript signal (config-gated) so the full text is
    # searchable even if structured extraction below yields nothing.
    if VOICE_STORE_TRANSCRIPT:
        transcript_signal = build_voice_transcript_signal(
            transcript=transcript,
            sender=sender,
            group_id=group_id,
            voice_file_id=file_id,
            duration_seconds=duration_seconds,
            language=language,
            timestamp=timestamp,
        )
        store_signals(group_id, [transcript_signal])
        logger.info(f"Voice transcript stored for {sender} ({len(transcript)} chars)")
    # 4. Run through signal extraction pipeline — treat as a regular text message
    voice_meta = {
        "sender": sender,
        "voice_file_id": file_id,
        "duration_seconds": duration_seconds,
        "language": language,
    }
    messages = [{
        "sender": sender,
        "text": transcript,
        "timestamp": timestamp,
        "source": "voice",
        "voice_file_id": file_id,
        "voice_duration": duration_seconds,
    }]
    try:
        extracted_signals = await process_message_batch(group_id, messages)
        extracted_signals = _inject_voice_metadata(extracted_signals, voice_meta)
        signals_count = len(extracted_signals)
        # Fallback: if the LLM extracted nothing from a meaningful voice message,
        # create a generic signal so the content is still searchable as structured data.
        if signals_count == 0 and len(transcript.split()) >= 5:
            fallback = _build_fallback_signal(transcript, sender, group_id, timestamp, voice_meta)
            store_signals(group_id, [fallback])
            signals_count = 1
            logger.info(f"Voice fallback signal created for {sender} (0 from LLM)")
    except Exception as e:
        # Extraction failure is non-fatal: the raw transcript (step 3) may
        # already be stored, so report zero structured signals and continue.
        logger.error(f"Signal extraction failed for voice from {sender}: {e}")
        signals_count = 0
    logger.info(
        f"Voice pipeline complete: {sender}, {duration_seconds}s, "
        f"{signals_count} signals, transcript={len(transcript)} chars"
    )
    return {
        "ok": True,
        "transcript": transcript,
        "signals_extracted": signals_count,
        "duration": duration_seconds,
        "sender": f"@{sender}",
        "language": language,
    }

View File

@@ -0,0 +1,194 @@
"""
Voice Transcriber — Groq Whisper integration.
Uses Groq's whisper-large-v3 model (free, already in provider stack) to transcribe
audio bytes from Telegram voice messages and video notes into plain text.
Groq Whisper endpoint: https://api.groq.com/openai/v1/audio/transcriptions
Supported formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, opus, wav, webm
Telegram voice messages: OGG/Opus
Telegram video notes: MP4
Free tier limits: 7,200 seconds of audio / hour on Groq free plan.
At avg 30s per voice note: ~240 voice notes / hour — more than any team sends.
"""
import io
import logging
from typing import Optional
import httpx
from backend.config import (
GROQ_API_KEY,
VOICE_LANGUAGE,
VOICE_MAX_DURATION_SECONDS,
VOICE_MIN_DURATION_SECONDS,
)
logger = logging.getLogger("thirdeye.agents.voice_transcriber")
# Groq's OpenAI-compatible audio transcription endpoint.
GROQ_WHISPER_URL = "https://api.groq.com/openai/v1/audio/transcriptions"
# Whisper model served by Groq; see module docstring for supported formats.
WHISPER_MODEL = "whisper-large-v3"
# Groq file size limit for Whisper: 25 MB
GROQ_MAX_FILE_BYTES = 25 * 1024 * 1024
# --- Main transcription function ---------------------------------------------
async def transcribe_audio(
    audio_bytes: bytes,
    filename: str = "audio.ogg",
    duration_seconds: Optional[int] = None,
) -> dict:
    """
    Transcribe audio bytes using Groq Whisper.

    Args:
        audio_bytes: Raw audio data (OGG, MP4, WAV, etc.)
        filename: Filename hint for the API (determines format detection)
        duration_seconds: Voice message duration from Telegram metadata (for pre-filtering)

    Returns:
        {
            "ok": True,
            "transcript": "The full transcribed text...",
            "language": "en",
            "duration": 45,
            "word_count": 120,
        }
        OR on failure:
        {
            "ok": False,
            "reason": "too_long" | "too_short" | "empty" | "file_too_large" | "api_error" | "no_speech",
            "error": "optional error string",
        }
    """
    # Pre-flight checks — fail fast before spending an HTTP round trip.
    if not GROQ_API_KEY or len(GROQ_API_KEY) < 5:
        return {"ok": False, "reason": "api_error", "error": "GROQ_API_KEY not set"}
    if not audio_bytes:
        return {"ok": False, "reason": "empty", "error": "No audio bytes received"}
    if len(audio_bytes) > GROQ_MAX_FILE_BYTES:
        return {
            "ok": False,
            "reason": "file_too_large",
            "error": f"Audio is {len(audio_bytes) / 1024 / 1024:.1f}MB — Groq limit is 25MB",
        }
    # Duration gating only applies when Telegram metadata supplied a duration.
    if duration_seconds is not None:
        if duration_seconds < VOICE_MIN_DURATION_SECONDS:
            return {
                "ok": False,
                "reason": "too_short",
                "error": f"Voice note is {duration_seconds}s — minimum is {VOICE_MIN_DURATION_SECONDS}s",
            }
        if duration_seconds > VOICE_MAX_DURATION_SECONDS:
            return {
                "ok": False,
                "reason": "too_long",
                "error": f"Voice note is {duration_seconds}s — maximum is {VOICE_MAX_DURATION_SECONDS}s",
            }
    # Determine MIME type from filename extension (defaults to Telegram's OGG).
    ext_to_mime = {
        ".ogg": "audio/ogg",
        ".opus": "audio/ogg",
        ".mp3": "audio/mpeg",
        ".mp4": "video/mp4",
        ".m4a": "audio/mp4",
        ".wav": "audio/wav",
        ".flac": "audio/flac",
        ".webm": "audio/webm",
    }
    ext = "." + filename.rsplit(".", 1)[-1].lower() if "." in filename else ".ogg"
    mime_type = ext_to_mime.get(ext, "audio/ogg")
    form_data = {
        "model": WHISPER_MODEL,
        "response_format": "verbose_json",  # returns language detection
        "temperature": "0",  # deterministic transcription
    }
    if VOICE_LANGUAGE:
        form_data["language"] = VOICE_LANGUAGE
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(
                GROQ_WHISPER_URL,
                headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
                files={"file": (filename, io.BytesIO(audio_bytes), mime_type)},
                data=form_data,
            )
            resp.raise_for_status()
            data = resp.json()
    except httpx.HTTPStatusError as e:
        # Try to pull a human-readable message out of the error body.
        error_text = ""
        try:
            error_text = e.response.json().get("error", {}).get("message", e.response.text[:200])
        except Exception:
            error_text = e.response.text[:200]
        if e.response.status_code == 429:
            logger.warning("Groq Whisper rate limited")
            return {"ok": False, "reason": "api_error", "error": "Rate limited — try again shortly"}
        logger.error(f"Groq Whisper HTTP error {e.response.status_code}: {error_text}")
        return {"ok": False, "reason": "api_error", "error": f"HTTP {e.response.status_code}: {error_text}"}
    except httpx.TimeoutException:
        logger.warning("Groq Whisper request timed out")
        return {"ok": False, "reason": "api_error", "error": "Request timed out after 60s"}
    except Exception as e:
        logger.error(f"Groq Whisper unexpected error: {e}")
        return {"ok": False, "reason": "api_error", "error": str(e)}
    # Parse response
    transcript = (data.get("text") or "").strip()
    if not transcript:
        return {"ok": False, "reason": "no_speech", "error": "Whisper returned empty transcript"}
    # Detect if Whisper only returned noise markers (exact-match check only).
    noise_patterns = {"[music]", "[noise]", "[silence]", "[inaudible]", "(music)", "(noise)"}
    if transcript.lower() in noise_patterns:
        return {"ok": False, "reason": "no_speech", "error": f"Only noise detected: {transcript}"}
    detected_language = data.get("language", VOICE_LANGUAGE or "unknown")
    word_count = len(transcript.split())
    logger.info(
        f"Whisper transcribed {duration_seconds or '?'}s audio -> "
        f"{word_count} words [{detected_language}]: {transcript[:60]}..."
    )
    return {
        "ok": True,
        "transcript": transcript,
        "language": detected_language,
        "duration": duration_seconds,
        "word_count": word_count,
    }
# --- Telegram-specific download helper ---------------------------------------
async def download_telegram_audio(bot, file_id: str) -> bytes:
    """
    Download a Telegram file (voice or video_note) and return raw bytes.
    """
    handle = await bot.get_file(file_id)
    # download_as_bytearray yields a mutable bytearray; hand back an immutable copy.
    return bytes(await handle.download_as_bytearray())
def format_duration(seconds: Optional[int]) -> str:
    """Format seconds into a human-readable string: '1m 34s' or '45s'.

    Args:
        seconds: Duration in seconds, or None when unknown.

    Returns:
        '?' for None, 'Xs' below one minute, 'Xm Ys' otherwise.
    """
    # Annotation fixed to Optional[int]: None is an explicitly supported input.
    if seconds is None:
        return "?"
    if seconds >= 60:
        minutes, remainder = divmod(seconds, 60)
        return f"{minutes}m {remainder}s"
    return f"{seconds}s"

View File

@@ -0,0 +1,84 @@
"""Web Search Agent — Tavily integration for real-time web context."""
import asyncio
import logging

from backend.config import TAVILY_API_KEY, ENABLE_WEB_SEARCH
logger = logging.getLogger("thirdeye.agents.web_search")
# Module-level singleton; populated lazily by _get_client() on first use.
_tavily_client = None
def _get_client():
    """Lazily build and cache the module-level Tavily client.

    Returns None when the API key is missing/too short or the SDK is
    unavailable; callers must handle a None result.
    """
    global _tavily_client
    # Already initialized — reuse the cached client.
    if _tavily_client is not None:
        return _tavily_client
    # No usable API key: leave the cache untouched (stays None).
    if not TAVILY_API_KEY or len(TAVILY_API_KEY) <= 5:
        return _tavily_client
    try:
        from tavily import TavilyClient
        _tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
        logger.info("Tavily client initialized")
    except ImportError:
        logger.error("tavily-python not installed. Run: pip install tavily-python")
    except Exception as e:
        logger.error(f"Tavily client init failed: {e}")
    return _tavily_client
async def search_web(query: str, max_results: int = 5) -> list[dict]:
    """
    Search the web using Tavily and return structured results.

    Args:
        query: Search query string
        max_results: Max results to return (1-10)

    Returns:
        List of {title, url, content, score} dicts, sorted by relevance.
        Empty list when disabled, unconfigured, or on any API failure.
    """
    if not ENABLE_WEB_SEARCH:
        logger.info("Web search is disabled via feature flag")
        return []
    client = _get_client()
    if not client:
        logger.warning("Tavily client not available (missing API key or install)")
        return []
    try:
        # TavilyClient.search is a synchronous, blocking HTTP call — run it in
        # a worker thread so it doesn't stall the asyncio event loop for other
        # bot handlers while the request is in flight.
        response = await asyncio.to_thread(
            client.search,
            query=query,
            max_results=max_results,
            search_depth="basic",  # "basic" is faster + free-tier friendly; "advanced" for deeper
            include_answer=False,
            include_raw_content=False,
        )
        results = [
            {
                "title": r.get("title", ""),
                "url": r.get("url", ""),
                "content": r.get("content", ""),
                "score": r.get("score", 0.0),
            }
            for r in response.get("results", [])
        ]
        logger.info(f"Tavily returned {len(results)} results for: {query[:60]}")
        return results
    except Exception as e:
        logger.error(f"Tavily search failed: {e}")
        return []
def format_search_results_for_llm(results: list[dict]) -> str:
    """Format Tavily results into a context string for the Query Agent.

    Each result becomes a '[Web Result N]' block with source URL and a
    content preview capped at 500 characters; blocks are blank-line separated.
    """
    if not results:
        return ""
    blocks = []
    for idx, item in enumerate(results, start=1):
        preview = item["content"][:500] if item["content"] else "No content"
        blocks.append(
            f"[Web Result {idx}] {item['title']}\n"
            f"Source: {item['url']}\n"
            f"Content: {preview}"
        )
    return "\n\n".join(blocks)