mirror of
https://github.com/arkorty/B.Tech-Project-III.git
synced 2026-04-19 12:41:48 +00:00
init
This commit is contained in:
34
thirdeye/backend/agents/classifier.py
Normal file
34
thirdeye/backend/agents/classifier.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""Classifier Agent — adds metadata tags to extracted signals."""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.models import Signal
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.classifier")
|
||||
|
||||
SYSTEM_PROMPT = """You are a fast metadata classifier. Given an extracted signal, add classification tags.
|
||||
|
||||
Respond ONLY with valid JSON (no markdown, no backticks):
|
||||
{"sentiment": "positive|neutral|negative|urgent", "urgency": "none|low|medium|high|critical", "keywords": ["3-5 searchable keywords"]}
|
||||
"""
|
||||
|
||||
|
||||
async def classify_signal(signal: Signal) -> Signal:
    """Add classification metadata (sentiment, urgency, keywords) to a signal.

    Best-effort: on any LLM or parse failure the signal is returned with its
    existing values untouched — classification failure is non-fatal.

    Args:
        signal: The extracted signal to enrich in place.

    Returns:
        The same signal object, possibly with updated metadata fields.
    """
    # Allowed vocabularies, mirroring SYSTEM_PROMPT; guard against the LLM
    # returning values outside the requested sets.
    allowed_sentiments = {"positive", "neutral", "negative", "urgent"}
    allowed_urgencies = {"none", "low", "medium", "high", "critical"}

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Classify this signal:\nType: {signal.type}\nSummary: {signal.summary}\nQuote: {signal.raw_quote}"},
    ]

    try:
        result = await call_llm("fast_small", messages, temperature=0.1, max_tokens=200)
        parsed = extract_json_object(result.get("content", ""))

        # Only overwrite existing values with validated LLM output.
        sentiment = str(parsed.get("sentiment", "")).strip().lower()
        if sentiment in allowed_sentiments:
            signal.sentiment = sentiment

        urgency = str(parsed.get("urgency", "")).strip().lower()
        if urgency in allowed_urgencies:
            signal.urgency = urgency

        keywords = parsed.get("keywords")
        if isinstance(keywords, list):
            signal.keywords = [str(k) for k in keywords]

    except Exception as e:
        logger.warning(f"Classification failed, using defaults: {e}")
        # Keep defaults — classification failure is non-fatal

    return signal
|
||||
107
thirdeye/backend/agents/context_detector.py
Normal file
107
thirdeye/backend/agents/context_detector.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""Context Detector Agent — auto-classifies group type from messages."""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.context_detector")
|
||||
|
||||
SYSTEM_PROMPT = """You analyze a batch of messages from a Telegram group and determine what TYPE of group this is.
|
||||
|
||||
CLASSIFY into exactly ONE:
|
||||
- "dev" — Software engineering team (code, PRs, deployments, bugs, tech stack)
|
||||
- "product" — Product/business team (features, users, metrics, roadmap, competitors)
|
||||
- "client" — Client/agency channel (deliverables, timelines, approvals, invoices)
|
||||
- "community" — Community/interest group (recommendations, events, local info, casual)
|
||||
|
||||
Respond ONLY with valid JSON (no markdown, no backticks):
|
||||
{"detected_lens": "dev|product|client|community", "confidence": 0.0-1.0, "evidence": ["signal1", "signal2", "signal3"]}
|
||||
"""
|
||||
|
||||
VALID_LENSES = {"dev", "product", "client", "community"}
|
||||
|
||||
|
||||
def _heuristic_detect_context(messages_text: str) -> dict:
|
||||
"""Rule-based fallback when LLM output is malformed/unavailable."""
|
||||
text = (messages_text or "").lower()
|
||||
|
||||
lens_keywords = {
|
||||
"dev": [
|
||||
"bug", "deploy", "deployment", "api", "database", "schema", "postgres", "mongo",
|
||||
"timeout", "endpoint", "pod", "pr", "code", "docker", "stack", "integration",
|
||||
],
|
||||
"product": [
|
||||
"feature", "roadmap", "user", "users", "client", "customers", "complain", "pain",
|
||||
"prioritize", "priority", "enterprise", "competitor", "demo", "sso", "dark mode",
|
||||
"mobile", "stability", "integration",
|
||||
],
|
||||
"client": [
|
||||
"invoice", "deadline", "deliverable", "approval", "sign-off", "scope", "payment",
|
||||
"contract", "proposal", "timeline", "meeting",
|
||||
],
|
||||
"community": [
|
||||
"event", "meetup", "recommend", "anyone", "community", "local", "where can i",
|
||||
"suggestion", "friends", "weekend",
|
||||
],
|
||||
}
|
||||
|
||||
scores = {
|
||||
lens: sum(text.count(keyword) for keyword in keywords)
|
||||
for lens, keywords in lens_keywords.items()
|
||||
}
|
||||
|
||||
best_lens = max(scores, key=scores.get)
|
||||
best_score = scores[best_lens]
|
||||
if best_score == 0:
|
||||
best_lens = "dev"
|
||||
|
||||
evidence = [k for k in lens_keywords[best_lens] if k in text][:3]
|
||||
confidence = min(0.95, 0.35 + 0.08 * best_score) if best_score > 0 else 0.0
|
||||
|
||||
return {
|
||||
"detected_lens": best_lens,
|
||||
"confidence": round(confidence, 2),
|
||||
"evidence": evidence or ["heuristic_fallback"],
|
||||
}
|
||||
|
||||
|
||||
async def detect_context(messages_text: str) -> dict:
    """Detect group type from a batch of messages.

    Returns {"detected_lens", "confidence", "evidence"}; falls back to the
    keyword heuristic (plus a "detection_failed" evidence marker) when the
    LLM call or JSON parsing fails.
    """
    prompt = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Classify this group based on these messages:\n\n{messages_text}"},
    ]

    try:
        result = await call_llm(
            "fast_large",
            prompt,
            temperature=0.1,
            max_tokens=300,
            response_format={"type": "json_object"},
        )
        parsed = extract_json_object(result.get("content", ""))

        # Normalize the lens and fall back to "dev" for anything unexpected.
        lens = str(parsed.get("detected_lens", "dev")).strip().lower()
        if lens not in VALID_LENSES:
            lens = "dev"

        # Coerce confidence to a float in [0, 1], defaulting to 0.5.
        try:
            confidence = float(parsed.get("confidence", 0.5))
        except (TypeError, ValueError):
            confidence = 0.5
        confidence = max(0.0, min(1.0, confidence))

        evidence = parsed.get("evidence", [])
        if not isinstance(evidence, list):
            evidence = [str(evidence)]

        return {
            "detected_lens": lens,
            "confidence": confidence,
            "evidence": [str(item) for item in evidence][:5],
        }
    except Exception as e:
        logger.error(f"Context detection failed: {e}")
        fallback = _heuristic_detect_context(messages_text)
        fallback["evidence"] = fallback["evidence"] + ["detection_failed"]
        return fallback
|
||||
287
thirdeye/backend/agents/cross_group_analyst.py
Normal file
287
thirdeye/backend/agents/cross_group_analyst.py
Normal file
@@ -0,0 +1,287 @@
|
||||
"""Cross-Group Analyst Agent — detects blind spots between multiple teams."""
|
||||
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.chroma import get_all_signals, get_group_ids
|
||||
from backend.db.models import CrossGroupInsight
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.cross_group_analyst")
|
||||
|
||||
SYSTEM_PROMPT = """You are the Cross-Group Intelligence Analyst for ThirdEye. This is the MOST IMPORTANT analysis.
|
||||
|
||||
You receive intelligence summaries from MULTIPLE Telegram groups. Your job is to find BLIND SPOTS — information in one group that should be in another.
|
||||
|
||||
Detect:
|
||||
- blocked_handoff: Team A waiting for something from Team B, but Team B doesn't know
|
||||
- conflicting_decision: Team A decided X, Team B decided the opposite
|
||||
- information_silo: Critical info in Group A never reached Group B
|
||||
- promise_reality_gap: Promise made in one group, but another group shows it's blocked
|
||||
- duplicated_effort: Two teams working on similar things unknowingly
|
||||
|
||||
Respond ONLY with valid JSON (no markdown):
|
||||
{"insights": [{"type": "insight_type", "description": "SPECIFIC description naming the groups, people, and topics", "group_a": {"name": "group_name", "evidence": "what was said"}, "group_b": {"name": "group_name", "evidence": "what was said or NOT said"}, "severity": "warning|critical", "recommendation": "Specific action"}]}
|
||||
|
||||
If no cross-group issues: {"insights": []}
|
||||
Be SPECIFIC. Name the groups, people, topics, and exact conflicts."""
|
||||
|
||||
|
||||
def _heuristic_cross_group_insights(
    group_summaries: dict[str, list[dict]],
) -> list[CrossGroupInsight]:
    """Generate best-effort cross-group insights when LLM output is unavailable.

    Applies three heuristics to every pair of groups:
      1. blocked_handoff — one group mentions waiting/blocked on design specs
         the other group never discusses;
      2. promise_reality_gap — one group signals delivery promises while the
         other reports blockers;
      3. information_silo — one group's signal TYPES show operational risk
         while the other's are planning-focused (checked in both directions).

    Returns:
        At most 5 insights, deduplicated by (type, group_a, group_b).
    """
    # Signal types that indicate operational/engineering risk vs planning focus.
    risk_types = {"recurring_bug", "workaround", "tech_debt", "deployment_risk"}
    planning_types = {"feature_request", "roadmap_drift", "priority_conflict", "user_pain_point"}

    # Collapse each group's signals into lowercased text plus a type set.
    normalized: dict[str, dict] = {}
    for group_name, signals in group_summaries.items():
        combined = " ".join(str(s.get("document", "")) for s in signals).lower()
        types = {str(s.get("metadata", {}).get("type", "unknown")).lower() for s in signals}
        normalized[group_name] = {"text": combined, "types": types}

    def silo_insight(risk_group: str, risk_hits: set, plan_group: str, plan_hits: set) -> CrossGroupInsight:
        # group_a always carries the operational-risk side, group_b the
        # planning side — replaces the previous copy-pasted forward/reverse
        # branches with one shared builder.
        return CrossGroupInsight(
            type="information_silo",
            description=(
                f"{risk_group} shows operational risk signals while {plan_group} is focused on planning/user demands, "
                "suggesting risk context is not shared across groups."
            ),
            group_a={
                "name": risk_group,
                "evidence": f"Operational risk signal types: {sorted(risk_hits)}",
            },
            group_b={
                "name": plan_group,
                "evidence": f"Planning-focused signal types: {sorted(plan_hits)}",
            },
            severity="warning",
            recommendation="Add a weekly cross-functional risk sync so product planning reflects current engineering constraints.",
        )

    insights: list[CrossGroupInsight] = []
    group_names = list(normalized.keys())
    for i in range(len(group_names)):
        for j in range(i + 1, len(group_names)):
            group_a = group_names[i]
            group_b = group_names[j]
            text_a = normalized[group_a]["text"]
            text_b = normalized[group_b]["text"]
            types_a = normalized[group_a]["types"]
            types_b = normalized[group_b]["types"]

            # Detect a likely blocked handoff around design/spec dependencies.
            a_waiting = any(
                k in text_a for k in ("waiting", "blocked", "design spec", "specs")
            )
            b_mentions_specs = any(
                k in text_b for k in ("design spec", "specs", "design")
            )
            if a_waiting and not b_mentions_specs:
                insights.append(
                    CrossGroupInsight(
                        type="blocked_handoff",
                        description=(
                            f"{group_a} indicates dependency blockage (design/spec inputs), "
                            f"but {group_b} has no corresponding discussion of that dependency."
                        ),
                        group_a={
                            "name": group_a,
                            "evidence": "Contains waiting/blocked language tied to specs or design dependency.",
                        },
                        group_b={
                            "name": group_b,
                            "evidence": "No clear mention of design specs/dependency handoff in available signals.",
                        },
                        severity="warning",
                        recommendation=(
                            f"Create a shared handoff item between {group_a} and {group_b} for design/spec ownership "
                            "with an explicit due date."
                        ),
                    )
                )

            # Detect likely promise vs execution mismatch.
            b_promises = any(
                k in text_b
                for k in ("demo", "friday", "promised", "told the client", "ready by")
            )
            a_blocked = any(
                k in text_a
                for k in ("blocked", "waiting", "can't proceed", "cannot proceed")
            )
            if b_promises and a_blocked:
                insights.append(
                    CrossGroupInsight(
                        type="promise_reality_gap",
                        description=(
                            f"{group_b} signals delivery promises while {group_a} reports blockers that may prevent those commitments."
                        ),
                        group_a={
                            "name": group_a,
                            "evidence": "Signals include active blockers/waiting dependencies.",
                        },
                        group_b={
                            "name": group_b,
                            "evidence": "Signals include explicit client/demo commitments and timelines.",
                        },
                        severity="critical",
                        recommendation="Run a joint risk review and re-baseline commitments before the next client update.",
                    )
                )

            # Type-based silo detection when lexical cues are weak — both
            # directions, via the shared builder above.
            for risk_group, risk_set, plan_group, plan_set in (
                (group_a, types_a, group_b, types_b),
                (group_b, types_b, group_a, types_a),
            ):
                risk_hits = risk_set & risk_types
                plan_hits = plan_set & planning_types
                if risk_hits and plan_hits:
                    insights.append(silo_insight(risk_group, risk_hits, plan_group, plan_hits))

    # Deduplicate by (type, group_a, group_b) and cap at 5.
    deduped: list[CrossGroupInsight] = []
    seen_keys = set()
    for insight in insights:
        key = (insight.type, insight.group_a.get("name"), insight.group_b.get("name"))
        if key in seen_keys:
            continue
        seen_keys.add(key)
        deduped.append(insight)

    return deduped[:5]
|
||||
|
||||
|
||||
async def analyze_cross_group(
    group_summaries: dict[str, list[dict]] | None = None,
) -> list[CrossGroupInsight]:
    """
    Analyze intelligence across all monitored groups to find blind spots.

    Args:
        group_summaries: Optional pre-built summaries. If None, loads from ChromaDB.

    Returns:
        A list of CrossGroupInsight; empty when fewer than 2 groups are known.
        Falls back to the rule-based heuristic if the LLM call/parse fails.
    """
    if group_summaries is None:
        group_ids = get_group_ids()
        if len(group_ids) < 2:
            logger.info("Need at least 2 groups for cross-group analysis")
            return []
        group_summaries = {gid: get_all_signals(gid) for gid in group_ids}

    if len(group_summaries) < 2:
        return []

    # Format summaries for the LLM
    summary_parts = []
    for group_name, signals in group_summaries.items():
        signal_lines = []
        for s in signals[:30]:  # Limit per group to fit context
            # Robustness: tolerate signals missing "metadata"/"document" keys,
            # consistent with the .get() usage in the heuristic fallback.
            meta = s.get("metadata", {})
            signal_lines.append(f" - [{meta.get('type', '?')}] {s.get('document', '')[:120]}")

        summary_parts.append(
            f"=== GROUP: {group_name} ({len(signals)} total signals) ===\n"
            + "\n".join(signal_lines)
        )

    full_summary = "\n\n".join(summary_parts)

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"Analyze cross-group intelligence:\n\n{full_summary}",
        },
    ]

    # Pre-bind so the except block can inspect the raw response without the
    # fragile `"result" in locals()` check.
    result = None
    try:
        result = await call_llm(
            "reasoning",
            messages,
            temperature=0.2,
            max_tokens=2000,
            response_format={"type": "json_object"},
        )
        parsed = extract_json_object(result.get("content", ""))
        insights = [
            CrossGroupInsight(
                type=i.get("type", "unknown"),
                description=i.get("description", ""),
                group_a=i.get("group_a", {}),
                group_b=i.get("group_b", {}),
                severity=i.get("severity", "warning"),
                recommendation=i.get("recommendation", ""),
            )
            for i in parsed.get("insights", [])
        ]

        logger.info(f"Cross-group analysis found {len(insights)} insights")
        return insights

    except Exception as e:
        raw = ""
        if isinstance(result, dict):
            raw = str(result.get("content", ""))[:300].replace("\n", " ")
        # Warning (not info): the primary analysis path failed.
        logger.warning(f"Cross-group LLM parse issue, using fallback: {e}; raw_head={raw}")
        fallback = _heuristic_cross_group_insights(group_summaries)
        if fallback:
            logger.info(
                f"Cross-group heuristic fallback produced {len(fallback)} insights"
            )
        return fallback
|
||||
200
thirdeye/backend/agents/document_ingestor.py
Normal file
200
thirdeye/backend/agents/document_ingestor.py
Normal file
@@ -0,0 +1,200 @@
|
||||
"""Document Ingestor — extracts text from PDFs, DOCX, TXT and chunks for RAG storage."""
|
||||
import os
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.document_ingestor")
|
||||
|
||||
# --- Text Extraction ---
|
||||
|
||||
def extract_text_from_pdf(file_path: str) -> list[dict]:
    """Extract text from PDF, returns list of {page: int, text: str}.

    Pages with no extractable text are skipped. On any parse error, the
    pages collected so far are returned and the error is logged.
    """
    from PyPDF2 import PdfReader

    pages: list[dict] = []
    try:
        for index, page in enumerate(PdfReader(file_path).pages):
            content = page.extract_text()
            if content and content.strip():
                pages.append({"page": index + 1, "text": content.strip()})
    except Exception as e:
        logger.error(f"PDF extraction failed for {file_path}: {e}")

    return pages
|
||||
|
||||
|
||||
def extract_text_from_docx(file_path: str) -> list[dict]:
    """Extract text from DOCX, returns list of {page: 1, text: str} (DOCX has no real pages)."""
    from docx import Document

    try:
        paragraphs = Document(file_path).paragraphs
        joined = "\n".join(p.text for p in paragraphs if p.text.strip())
        if joined.strip():
            return [{"page": 1, "text": joined.strip()}]
    except Exception as e:
        logger.error(f"DOCX extraction failed for {file_path}: {e}")

    return []
|
||||
|
||||
|
||||
def extract_text_from_txt(file_path: str) -> list[dict]:
    """Extract text from plain text file.

    Returns a single pseudo-page, or [] when the file is empty/unreadable.
    Undecodable bytes are dropped (errors="ignore") rather than raising.
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
            content = handle.read().strip()
    except Exception as e:
        logger.error(f"TXT extraction failed for {file_path}: {e}")
        return []

    return [{"page": 1, "text": content}] if content else []
|
||||
|
||||
|
||||
# Maps file extension → extractor function. Plain-text-like formats
# (.md, .csv, .json, .log) all reuse the TXT extractor since they need
# no structural parsing before chunking.
EXTRACTORS = {
    ".pdf": extract_text_from_pdf,
    ".docx": extract_text_from_docx,
    ".txt": extract_text_from_txt,
    ".md": extract_text_from_txt,
    ".csv": extract_text_from_txt,
    ".json": extract_text_from_txt,
    ".log": extract_text_from_txt,
}
|
||||
|
||||
|
||||
def extract_text(file_path: str) -> list[dict]:
    """Route to correct extractor based on file extension.

    Returns [] (with a warning logged) for unsupported extensions.
    """
    extension = os.path.splitext(file_path)[1].lower()
    handler = EXTRACTORS.get(extension)
    if handler is None:
        logger.warning(f"Unsupported file type: {extension} ({file_path})")
        return []
    return handler(file_path)
|
||||
|
||||
|
||||
# --- Chunking ---
|
||||
|
||||
def chunk_text(text: str, max_chars: int = 1500, overlap_chars: int = 200) -> list[str]:
    """
    Split text into overlapping chunks.

    Uses paragraph boundaries when possible, falls back to sentence boundaries,
    then hard character splits. ~1500 chars ≈ ~375 tokens for embedding.

    Args:
        text: The text to split.
        max_chars: Target maximum chunk size (before overlap is added).
        overlap_chars: Trailing characters of the previous chunk prepended to
            each subsequent chunk (trimmed forward to a word boundary).

    Returns:
        List of chunk strings; a single-element list when text already fits.
    """
    if len(text) <= max_chars:
        return [text]

    # Split by paragraphs first
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]

    chunks = []
    current_chunk = ""

    for para in paragraphs:
        # If adding this paragraph stays under limit, add it
        if len(current_chunk) + len(para) + 1 <= max_chars:
            current_chunk = (current_chunk + "\n" + para).strip()
            continue

        # Save current chunk if it has content
        if current_chunk:
            chunks.append(current_chunk)
            current_chunk = ""

        if len(para) <= max_chars:
            current_chunk = para
            continue

        # Single paragraph too long: split by sentences.
        sentences = para.replace(". ", ".\n").split("\n")
        sub_chunk = ""
        for sent in sentences:
            if len(sent) > max_chars:
                # Bug fix: previously a single sentence longer than max_chars
                # became one oversized chunk; hard-split it at max_chars as the
                # docstring promises.
                if sub_chunk:
                    chunks.append(sub_chunk)
                    sub_chunk = ""
                for start in range(0, len(sent), max_chars):
                    piece = sent[start:start + max_chars]
                    if len(piece) == max_chars:
                        chunks.append(piece)
                    else:
                        sub_chunk = piece  # carry the short tail forward
                continue
            if len(sub_chunk) + len(sent) + 1 <= max_chars:
                sub_chunk = (sub_chunk + " " + sent).strip()
            else:
                if sub_chunk:
                    chunks.append(sub_chunk)
                sub_chunk = sent
        current_chunk = sub_chunk

    if current_chunk:
        chunks.append(current_chunk)

    # Add overlap: prepend last N chars of previous chunk to each subsequent chunk
    if overlap_chars > 0 and len(chunks) > 1:
        overlapped = [chunks[0]]
        for i in range(1, len(chunks)):
            prev_tail = chunks[i - 1][-overlap_chars:]
            # Find a word boundary in the overlap
            space_idx = prev_tail.find(" ")
            if space_idx > 0:
                prev_tail = prev_tail[space_idx + 1:]
            overlapped.append(prev_tail + " " + chunks[i])
        chunks = overlapped

    return chunks
|
||||
|
||||
|
||||
# --- Main Ingestion ---
|
||||
|
||||
def ingest_document(
    file_path: str,
    group_id: str,
    shared_by: str = "Unknown",
    filename: str | None = None,
) -> list[dict]:
    """
    Full pipeline: extract text → chunk → produce signal dicts ready for ChromaDB.

    Args:
        file_path: Path to the downloaded file on disk
        group_id: Telegram group ID
        shared_by: Who shared the file
        filename: Original filename (for metadata); defaults to the basename
            of file_path.

    Returns:
        List of signal dicts ready for store_signals()
    """
    if filename is None:
        filename = os.path.basename(file_path)

    # Extract
    pages = extract_text(file_path)
    if not pages:
        # Bug fix: this f-string previously printed a literal placeholder
        # instead of interpolating the filename.
        logger.warning(f"No text extracted from {filename}")
        return []

    # Chunk each page
    signals = []
    total_chunks = 0

    for page_data in pages:
        page_num = page_data["page"]
        chunks = chunk_text(page_data["text"])

        for chunk_text_str in chunks:
            if len(chunk_text_str.strip()) < 30:
                continue  # Skip tiny chunks — too small to embed usefully

            signal = {
                "id": str(uuid.uuid4()),
                "type": "document_knowledge",
                # Bug fix: include the actual filename in the summary header.
                "summary": f"[{filename} p{page_num}] {chunk_text_str[:150]}...",
                "entities": [f"@{shared_by}", filename],
                "severity": "low",
                "status": "reference",
                "sentiment": "neutral",
                "urgency": "none",
                "raw_quote": chunk_text_str,
                # NOTE(review): naive-UTC format kept for backward compatibility;
                # datetime.utcnow() is deprecated in 3.12 — migrate to
                # datetime.now(timezone.utc) once consumers tolerate the offset.
                "timestamp": datetime.utcnow().isoformat(),
                "group_id": group_id,
                "lens": "document",
                "keywords": [filename, f"page_{page_num}", "document", shared_by],
            }
            signals.append(signal)
            total_chunks += 1

    logger.info(f"Ingested {filename}: {len(pages)} pages → {total_chunks} chunks for group {group_id}")
    return signals
|
||||
373
thirdeye/backend/agents/jira_agent.py
Normal file
373
thirdeye/backend/agents/jira_agent.py
Normal file
@@ -0,0 +1,373 @@
|
||||
"""
|
||||
Jira Signal Agent
|
||||
Takes ThirdEye signals and converts them into well-formed Jira tickets.
|
||||
|
||||
Responsibilities:
|
||||
1. Map signal type → Jira issue type + priority
|
||||
2. LLM-generate a clean ticket title and structured description from signal context
|
||||
3. Extract assignee names and match them to Jira account IDs (best-effort)
|
||||
4. Raise the ticket via jira_client and mark the signal in ChromaDB
|
||||
5. Bulk-raise: process a group's unraised high-severity signals in one call
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
from backend.providers import call_llm
|
||||
from backend.integrations.jira_client import (
|
||||
create_issue, search_issues, add_comment, is_configured, search_users
|
||||
)
|
||||
from backend.db.chroma import store_signals, mark_signal_as_raised, get_raised_signal_ids
|
||||
from backend.config import (
|
||||
JIRA_DEFAULT_PROJECT, JIRA_DEFAULT_ISSUE_TYPE,
|
||||
JIRA_AUTO_RAISE_SEVERITY
|
||||
)
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.jira_agent")
|
||||
|
||||
|
||||
# ─── Signal → Jira type mapping ──────────────────────────────────────────────

# Maps ThirdEye signal type → (Jira issue type, default priority)
# Note: Issue types must match what's available in your Jira project
# Common types: Task, Bug, Story, Epic, Workstream (project-specific)
SIGNAL_TYPE_MAP = {
    # Dev signals
    "tech_debt": ("Task", "Low"),
    "recurring_bug": ("Task", "High"),  # Changed from Bug to Task
    "architecture_decision": ("Task", "Medium"),
    "deployment_risk": ("Task", "High"),
    "workaround": ("Task", "Medium"),
    "knowledge_silo": ("Task", "Medium"),
    # Product signals
    "feature_request": ("Task", "Medium"),  # Changed from Story to Task
    "priority_conflict": ("Task", "High"),
    "sentiment_shift": ("Task", "Medium"),
    # Client signals
    "promise": ("Task", "High"),
    "scope_creep": ("Task", "High"),
    "risk": ("Task", "High"),
    # Meet signals
    "meet_action_item": ("Task", "Medium"),
    "meet_blocker": ("Task", "Highest"),
    "meet_risk": ("Task", "High"),
    "meet_decision": ("Task", "Medium"),
    "meet_open_q": ("Task", "Low"),
    # Generic
    "blocker": ("Task", "Highest"),
    "decision": ("Task", "Medium"),
    "action_item": ("Task", "Medium"),
}

# Maps a signal's severity to a Jira priority; when the signal carries a known
# severity, this overrides the per-type default priority from SIGNAL_TYPE_MAP.
SEVERITY_TO_PRIORITY = {
    "critical": "Highest",
    "high": "High",
    "medium": "Medium",
    "low": "Low",
}

# Only signal types listed in SIGNAL_TYPE_MAP may be raised as Jira tickets.
RAISEABLE_TYPES = set(SIGNAL_TYPE_MAP.keys())
|
||||
|
||||
|
||||
# ─── Assignee resolution ─────────────────────────────────────────────────────
|
||||
|
||||
async def resolve_assignee_account_id(name: str) -> str | None:
    """
    Resolve a person's display name (or @name) to their Jira account ID.

    Uses Jira's user search API: prefers an exact display-name match, then a
    partial match (every query word appears in the display name), and finally
    falls back to the FIRST search result — so a non-None return is
    best-effort rather than a guaranteed confident match. Returns None only
    when the name is empty, the search yields no users, or the lookup fails.
    """
    if not name:
        return None
    clean = name.lstrip("@").strip()
    try:
        users = await search_users(clean)
        if not users:
            return None
        clean_lower = clean.lower()
        # Exact display-name match first
        for u in users:
            if u["display_name"].lower() == clean_lower:
                return u["account_id"]
        # Partial match (all search words appear in display name)
        words = clean_lower.split()
        for u in users:
            dn = u["display_name"].lower()
            if all(w in dn for w in words):
                return u["account_id"]
        # Last resort: first result
        return users[0]["account_id"]
    except Exception as e:
        # Best-effort resolution: failures leave the ticket unassigned.
        logger.warning(f"resolve_assignee_account_id failed for '{name}': {e}")
        return None
|
||||
|
||||
|
||||
# ─── LLM ticket generation ───────────────────────────────────────────────────
|
||||
|
||||
TICKET_GEN_SYSTEM_PROMPT = """You are a senior engineering manager writing Jira tickets from team intelligence signals.
|
||||
|
||||
Given a ThirdEye signal (a structured piece of extracted team knowledge), write a Jira ticket.
|
||||
|
||||
Return ONLY a valid JSON object with exactly these fields:
|
||||
{
|
||||
"summary": "Short, actionable ticket title (max 100 chars). Start with a verb. No jargon.",
|
||||
"description": "Full ticket description. Include: what the issue is, context from the signal, why it matters, suggested next steps. Use blank lines between sections. Use '- ' for bullet points. Max 400 words.",
|
||||
"labels": ["label1", "label2"],
|
||||
"assignee_name": "First name or @name of the person to assign, or null if unclear"
|
||||
}
|
||||
|
||||
Label rules:
|
||||
- Always include "thirdeye" and "auto-raised"
|
||||
- Add the signal type as a label (e.g. "tech-debt", "recurring-bug")
|
||||
- Add "urgent" if severity is high or critical
|
||||
- Labels must not have spaces (use hyphens)
|
||||
|
||||
Summary rules:
|
||||
- Starts with a verb: "Fix", "Investigate", "Address", "Resolve", "Document", "Implement"
|
||||
- Be specific — "Fix intermittent checkout timeout" NOT "Fix bug"
|
||||
- Never exceed 100 characters
|
||||
|
||||
Description must include:
|
||||
1. What: clear 1-sentence problem statement
|
||||
2. Context: what was actually said / detected (cite the signal)
|
||||
3. Impact: why this matters to the team or product
|
||||
4. Suggested next steps (2-3 bullet points)
|
||||
|
||||
Return JSON only — no markdown, no preamble."""
|
||||
|
||||
|
||||
async def generate_ticket_content(signal: dict) -> dict:
    """
    Use an LLM to generate a clean, context-rich Jira ticket from a ThirdEye signal.
    Returns {"summary": str, "description": str, "labels": list, "assignee_name": str|None}

    Falls back to a template-built ticket (no LLM) if the call or JSON parse fails.
    """
    # Flatten the signal into a readable prompt payload for the LLM.
    signal_text = (
        f"Signal type: {signal.get('type', 'unknown')}\n"
        f"Summary: {signal.get('summary', '')}\n"
        f"Raw quote: {signal.get('raw_quote', '')[:300]}\n"
        f"Severity: {signal.get('severity', 'medium')}\n"
        f"Entities involved: {', '.join(signal.get('entities', []))}\n"
        f"Keywords: {', '.join(signal.get('keywords', []))}\n"
        f"Timestamp: {signal.get('timestamp', '')}\n"
        f"Group: {signal.get('group_id', '')}\n"
        f"Lens: {signal.get('lens', '')}"
    )

    try:
        result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": TICKET_GEN_SYSTEM_PROMPT},
                {"role": "user", "content": signal_text},
            ],
            temperature=0.2,
            max_tokens=800,
            response_format={"type": "json_object"},
        )
        raw = result["content"].strip()
        # Strip a leading markdown fence ("```json\n{...}\n```") in case the
        # model ignores the "JSON only" instruction.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        return json.loads(raw)

    except Exception as e:
        logger.warning(f"Ticket generation LLM failed: {e}. Using fallback.")
        # Fallback: build a basic ticket without LLM
        sig_type = signal.get("type", "unknown").replace("_", " ").title()
        return {
            "summary": f"{sig_type}: {signal.get('summary', 'Unknown issue')[:80]}",
            "description": (
                f"Signal detected by ThirdEye.\n\n"
                f"Type: {signal.get('type', 'unknown')}\n"
                f"Summary: {signal.get('summary', '')}\n\n"
                f"Raw context:\n{signal.get('raw_quote', '(none)')[:300]}\n\n"
                f"Severity: {signal.get('severity', 'medium')}"
            ),
            "labels": ["thirdeye", "auto-raised", signal.get("type", "unknown").replace("_", "-")],
            "assignee_name": None,
        }
|
||||
|
||||
|
||||
# ─── Main raise function ──────────────────────────────────────────────────────
|
||||
|
||||
async def raise_ticket_for_signal(
    signal: dict,
    group_id: str,
    project_key: str = None,
    force: bool = False,
    assignee_account_id: str = None,
) -> dict:
    """
    Create a Jira ticket for a single ThirdEye signal.

    Args:
        signal: The signal dict from ChromaDB
        group_id: The group this signal belongs to (for dedup tracking)
        project_key: Override project (default: JIRA_DEFAULT_PROJECT)
        force: If True, raise even if already raised before
        assignee_account_id: Explicit Jira account id; when set, skips name resolution

    Returns:
        {"ok": True, "key": "ENG-42", "url": "...", "summary": "..."}
        OR
        {"ok": False, "reason": "already_raised" | "not_raiseable" | "jira_error", ...}
    """
    if not is_configured():
        return {"ok": False, "reason": "jira_not_configured"}

    signal_id = signal.get("id", "")
    signal_type = signal.get("type", "")

    # Check if this signal type is raiseable
    if signal_type not in RAISEABLE_TYPES:
        return {"ok": False, "reason": "not_raiseable", "signal_type": signal_type}

    # Check if already raised (skip if force=True)
    if not force and signal_id:
        already_raised = get_raised_signal_ids(group_id)
        if signal_id in already_raised:
            return {"ok": False, "reason": "already_raised", "signal_id": signal_id}

    # Determine Jira issue type and priority from signal.
    # Guard severity with `or`: a present-but-None severity would crash .lower().
    default_type, default_priority = SIGNAL_TYPE_MAP.get(signal_type, (JIRA_DEFAULT_ISSUE_TYPE, "Medium"))
    severity = (signal.get("severity") or "medium").lower()
    priority = SEVERITY_TO_PRIORITY.get(severity, default_priority)

    # Generate ticket content via LLM (has its own non-LLM fallback)
    ticket_content = await generate_ticket_content(signal)

    # Use `or` fallbacks rather than dict.get defaults: the LLM may return a key
    # with an explicit null/empty value, which .get(key, default) passes through
    # and would otherwise produce a ticket with an empty summary/description.
    fallback_summary = signal.get("summary") or "ThirdEye signal"
    summary = ticket_content.get("summary") or fallback_summary[:100]
    description = ticket_content.get("description") or signal.get("summary") or ""
    labels = ticket_content.get("labels") or ["thirdeye", "auto-raised"]
    # Jira rejects non-string labels; coerce defensively since these come from the LLM.
    labels = [str(label) for label in labels]
    # Always ensure thirdeye label is present
    if "thirdeye" not in labels:
        labels.append("thirdeye")

    # Append ThirdEye metadata as a context section in the description
    meta_section = (
        f"\n\n---\n"
        f"Raised by: ThirdEye\n"
        f"Signal ID: {signal_id}\n"
        f"Group: {group_id}\n"
        f"Detected: {signal.get('timestamp', datetime.utcnow().isoformat())}"
    )
    description = description + meta_section

    # Resolve assignee: explicit account_id wins, then signal override name, then LLM-extracted name
    if not assignee_account_id:
        name_hint = signal.get("assignee_override") or ticket_content.get("assignee_name")
        if name_hint:
            assignee_account_id = await resolve_assignee_account_id(name_hint)
            if assignee_account_id:
                logger.info(f"Resolved assignee '{name_hint}' → {assignee_account_id}")
            else:
                logger.warning(f"Could not resolve assignee '{name_hint}' to a Jira account")

    # Create the ticket
    result = await create_issue(
        project_key=project_key or JIRA_DEFAULT_PROJECT,
        summary=summary,
        description=description,
        issue_type=default_type,
        priority=priority,
        labels=labels,
        assignee_account_id=assignee_account_id,
    )

    if result.get("ok"):
        jira_key = result["key"]
        jira_url = result["url"]
        # Mark this signal as raised in ChromaDB so we never duplicate it
        if signal_id:
            mark_signal_as_raised(
                group_id, signal_id, jira_key,
                jira_url=jira_url,
                jira_summary=summary,
                jira_priority=priority,
            )
        logger.info(f"Raised Jira ticket {jira_key} for signal {signal_id} ({signal_type})")
        return {
            "ok": True,
            "key": jira_key,
            "url": jira_url,
            "summary": summary,
            "issue_type": default_type,
            "priority": priority,
            "assignee_account_id": assignee_account_id,
        }
    else:
        logger.error(f"Jira ticket creation failed: {result}")
        return {
            "ok": False,
            "reason": "jira_error",
            "error": result.get("error"),
            "details": result.get("details"),
        }
|
||||
|
||||
|
||||
async def bulk_raise_for_group(
    group_id: str,
    signals: list[dict],
    min_severity: str = None,
    project_key: str = None,
    max_tickets: int = 10,
) -> list[dict]:
    """
    Raise Jira tickets for multiple signals from a group in one call.

    Candidates must be of a raiseable type, at or above min_severity
    (defaults to JIRA_AUTO_RAISE_SEVERITY), and not previously raised.
    The highest-severity candidates are processed first, capped at
    max_tickets so a noisy group cannot flood Jira.

    Returns a list of per-signal raise results.
    """
    rank_of = {"low": 0, "medium": 1, "high": 2, "critical": 3}
    threshold = rank_of.get((min_severity or JIRA_AUTO_RAISE_SEVERITY).lower(), 2)  # Default: high

    raised_before = get_raised_signal_ids(group_id)

    # Filter down to raiseable, severe-enough, not-yet-raised signals.
    candidates = [
        sig for sig in signals
        if sig.get("type", "") in RAISEABLE_TYPES
        and rank_of.get(sig.get("severity", "low").lower(), 0) >= threshold
        and sig.get("id", "") not in raised_before
    ]

    # Most severe first, then cap to avoid flooding Jira.
    candidates.sort(key=lambda s: rank_of.get(s.get("severity", "low"), 0), reverse=True)

    outcomes = []
    for sig in candidates[:max_tickets]:
        res = await raise_ticket_for_signal(sig, group_id, project_key=project_key)
        outcomes.append({**res, "signal_type": sig.get("type"), "signal_summary": sig.get("summary", "")[:80]})

    logger.info(f"Bulk raise for group {group_id}: {len(outcomes)} tickets from {len(signals)} signals")
    return outcomes
|
||||
|
||||
|
||||
def format_raise_result_for_telegram(result: dict) -> str:
    """Render one ticket-raise outcome as a single Telegram-markdown snippet."""
    if result.get("ok"):
        issue_type = result.get("issue_type", "Task")
        priority = result.get("priority", "Medium")
        short_summary = result.get("summary", "")[:90]
        return (
            f"✅ [{result['key']}]({result['url']}) — "
            f"*{issue_type}* | {priority} priority\n"
            f"  _{short_summary}_"
        )

    reason = result.get("reason", "unknown")
    known_failures = {
        "already_raised": "⏭️ Already raised — skipped",
        "not_raiseable": f"⚪ Signal type `{result.get('signal_type', '?')}` — not mapped to Jira",
    }
    if reason in known_failures:
        return known_failures[reason]
    return f"❌ Failed: {result.get('error', reason)}"
|
||||
43
thirdeye/backend/agents/json_utils.py
Normal file
43
thirdeye/backend/agents/json_utils.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""Utilities for robustly parsing JSON from LLM responses."""
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
def extract_json_object(content: str) -> dict:
    """Return the first top-level JSON object parsed from raw LLM output.

    Tolerates surrounding markdown code fences and wrapper prose.
    Raises json.JSONDecodeError when no JSON object can be decoded.
    """
    text = (content or "").strip()
    if not text:
        raise json.JSONDecodeError("Empty LLM response", text, 0)

    # Peel off a surrounding markdown code fence, if any.
    if text.startswith("```"):
        opened = re.sub(r"^```(?:json)?\s*", "", text, flags=re.IGNORECASE)
        text = re.sub(r"\s*```$", "", opened)
    text = text.strip()
    if not text:
        raise json.JSONDecodeError("Empty LLM response after cleanup", text, 0)

    # Fast path: the whole payload is a JSON object.
    try:
        whole = json.loads(text)
    except json.JSONDecodeError:
        whole = None
    if isinstance(whole, dict):
        return whole

    # Otherwise scan forward, attempting a decode at every "{". raw_decode
    # handles nested braces correctly, unlike any regex-based approach.
    decoder = json.JSONDecoder()
    for start in (i for i, ch in enumerate(text) if ch == "{"):
        try:
            candidate, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            continue
        if isinstance(candidate, dict):
            return candidate

    raise json.JSONDecodeError("No valid top-level JSON object found", text, 0)
|
||||
213
thirdeye/backend/agents/link_fetcher.py
Normal file
213
thirdeye/backend/agents/link_fetcher.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""Link Fetcher — extracts, summarizes, and stores content from URLs shared in chat."""
|
||||
import re
|
||||
import uuid
|
||||
import logging
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from backend.providers import call_llm
|
||||
from backend.config import ENABLE_LINK_FETCH
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.link_fetcher")
|
||||
|
||||
# Patterns to skip (images, downloads, social media embeds, etc.)
# Matched case-insensitively against the full URL by should_fetch().
SKIP_PATTERNS = [
    # Static image assets — nothing to summarize.
    r"\.(png|jpg|jpeg|gif|svg|webp|ico|bmp)(\?.*)?$",
    # Binary downloads / archives / installers.
    r"\.(zip|tar|gz|rar|7z|exe|msi|dmg|apk|deb)(\?.*)?$",
    # Audio / video files.
    r"\.(mp3|mp4|avi|mov|mkv|wav|flac)(\?.*)?$",
    # Social-media posts: rendered client-side, little fetchable text.
    r"^https?://(www\.)?(twitter|x)\.com/.*/status/",
    r"^https?://(www\.)?instagram\.com/p/",
    r"^https?://(www\.)?tiktok\.com/",
    r"^https?://(www\.)?youtube\.com/shorts/",
    r"^https?://t\.me/",  # Other Telegram links
]

# Compiled once at import time; reused by should_fetch() for every URL.
SKIP_COMPILED = [re.compile(p, re.IGNORECASE) for p in SKIP_PATTERNS]
|
||||
|
||||
|
||||
def extract_urls(text: str) -> list[str]:
    """Return every HTTP/HTTPS URL found in *text*.

    Trailing sentence punctuation is stripped from each match, and very
    short matches (10 chars or fewer) are discarded as noise.
    """
    matches = re.findall(r"https?://[^\s<>\"')\]},;]+", text)
    stripped = (m.rstrip(".,;:!?)") for m in matches)
    return [url for url in stripped if len(url) > 10]
|
||||
|
||||
|
||||
def should_fetch(url: str) -> bool:
    """Return True when *url* matches none of the skip patterns (images, downloads, social embeds)."""
    return not any(pattern.search(url) for pattern in SKIP_COMPILED)
|
||||
|
||||
|
||||
async def fetch_url_content(url: str, timeout: float = 15.0) -> dict | None:
    """
    Fetch a URL and extract main text content.

    All failure modes (timeout, non-200, non-HTML, too little text, parse
    errors) are logged and collapsed to None — callers treat the link as
    simply not ingestible.

    Args:
        url: Absolute HTTP/HTTPS URL to fetch.
        timeout: Total httpx request timeout in seconds.

    Returns:
        {title, text, url} or None if fetch fails
    """
    try:
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
            headers={
                # Identify ourselves; some sites reject requests without a UA.
                "User-Agent": "Mozilla/5.0 (compatible; ThirdEye/1.0; +https://thirdeye.dev)",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            },
        ) as client:
            response = await client.get(url)

            # Strict 200-only: redirects are already followed, anything else
            # (incl. 203/206) is treated as not fetchable.
            if response.status_code != 200:
                logger.info(f"URL returned {response.status_code}: {url[:80]}")
                return None

            content_type = response.headers.get("content-type", "")
            if "text/html" not in content_type and "application/xhtml" not in content_type:
                logger.info(f"Skipping non-HTML content ({content_type}): {url[:80]}")
                return None

            # Read the body while the client is still open.
            html = response.text

    except httpx.TimeoutException:
        logger.info(f"URL timed out: {url[:80]}")
        return None
    except Exception as e:
        logger.info(f"URL fetch failed ({type(e).__name__}): {url[:80]}")
        return None

    # Parse HTML
    try:
        soup = BeautifulSoup(html, "html.parser")

        # Extract title
        title = ""
        if soup.title and soup.title.string:
            title = soup.title.string.strip()

        # Remove script, style, nav, footer, header elements — boilerplate
        # that would pollute the extracted text.
        for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "form"]):
            tag.decompose()

        # Try to find main content area; fall back to the whole document.
        main = soup.find("main") or soup.find("article") or soup.find("div", {"role": "main"})
        if main:
            text = main.get_text(separator="\n", strip=True)
        else:
            text = soup.get_text(separator="\n", strip=True)

        # Clean up: drop blank lines left over after tag removal.
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        text = "\n".join(lines)

        # Skip if too little content — likely a JS-rendered page or a stub.
        if len(text) < 100:
            logger.info(f"Too little text content ({len(text)} chars): {url[:80]}")
            return None

        # Truncate very long content to keep downstream LLM costs bounded.
        if len(text) > 8000:
            text = text[:8000] + "\n\n[Content truncated]"

        return {
            "title": title or url,
            "text": text,
            "url": url,
        }

    except Exception as e:
        logger.warning(f"HTML parsing failed for {url[:80]}: {e}")
        return None
|
||||
|
||||
|
||||
async def summarize_content(title: str, text: str, url: str) -> str:
    """Summarize fetched page content into a few sentences via the fast LLM tier.

    Falls back to the first 200 characters of the raw text when the LLM
    call fails — link ingestion should never hard-fail on summarization.
    """
    system_prompt = """You are a content summarizer for ThirdEye.
Given the title and text of a web page, produce a concise 2-4 sentence summary that captures the key information.
Focus on: main topic, key facts, any actionable insights, any deadlines or decisions mentioned.
Respond with ONLY the summary text, nothing else."""
    # Cap the text sent to the LLM to bound token usage.
    user_prompt = f"Title: {title}\nURL: {url}\n\nContent:\n{text[:3000]}"

    try:
        response = await call_llm(
            "fast_small",
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.2,
            max_tokens=300,
        )
        return response["content"].strip()
    except Exception as e:
        logger.warning(f"Link summarization failed: {e}")
        return text[:200] + "..."
|
||||
|
||||
|
||||
async def process_links_from_message(
    text: str,
    group_id: str,
    shared_by: str = "Unknown",
) -> list[dict]:
    """
    Full pipeline: extract URLs from message → fetch → summarize → produce signals.

    Designed to be called in the background (non-blocking to the main message pipeline).
    Per-URL failures are logged and skipped; the pipeline never raises.

    Args:
        text: Raw chat message text to scan for URLs.
        group_id: Group the resulting signals belong to.
        shared_by: Display name of the user who shared the link.

    Returns:
        List of signal dicts ready for store_signals()
    """
    # Feature flag — link fetching can be disabled entirely via config.
    if not ENABLE_LINK_FETCH:
        return []

    urls = extract_urls(text)
    fetchable = [u for u in urls if should_fetch(u)]

    if not fetchable:
        return []

    signals = []

    # Process up to 3 links per message to avoid overload
    for url in fetchable[:3]:
        try:
            content = await fetch_url_content(url)
            if not content:
                # Unfetchable / non-HTML / too little text — silently skip.
                continue

            summary = await summarize_content(content["title"], content["text"], url)

            # Low-severity "reference" signal: searchable knowledge, not an alert.
            signal = {
                "id": str(uuid.uuid4()),
                "type": "link_knowledge",
                "summary": f"[Link: {content['title'][:80]}] {summary[:200]}",
                "entities": [f"@{shared_by}", url[:100]],
                "severity": "low",
                "status": "reference",
                "sentiment": "neutral",
                "urgency": "none",
                "raw_quote": summary,
                # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
                # consider datetime.now(timezone.utc) file-wide in one change.
                "timestamp": datetime.utcnow().isoformat(),
                "group_id": group_id,
                "lens": "link",
                "keywords": [content["title"][:50], "link", "web", shared_by],
            }
            signals.append(signal)
            logger.info(f"Link ingested: {content['title'][:50]} ({url[:60]})")

        except Exception as e:
            logger.warning(f"Link processing failed for {url[:60]}: {e}")
            continue

    return signals
|
||||
188
thirdeye/backend/agents/meet_cross_ref.py
Normal file
188
thirdeye/backend/agents/meet_cross_ref.py
Normal file
@@ -0,0 +1,188 @@
|
||||
"""
|
||||
Meet Cross-Reference Agent
|
||||
Finds connections between meeting signals and existing Telegram group signals.
|
||||
Surfaces: confirmations (meeting agrees with chat), contradictions (meeting contradicts chat),
|
||||
and blind spots (meeting discusses something chat groups don't know about).
|
||||
"""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.chroma import query_signals, get_all_signals
|
||||
from backend.config import MEET_CROSS_REF_GROUPS, MEET_DEFAULT_GROUP_ID
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.meet_cross_ref")
|
||||
|
||||
CROSS_REF_SYSTEM_PROMPT = """You are an expert at finding connections between meeting discussions and team chat history.
|
||||
|
||||
You will receive:
|
||||
1. MEETING SIGNALS — decisions, action items, blockers, risks from a recent Google Meet
|
||||
2. CHAT SIGNALS — existing signals from team Telegram groups
|
||||
|
||||
Find meaningful connections across three categories:
|
||||
|
||||
CONFIRMATIONS: Meeting agrees with or reinforces something from chat history
|
||||
CONTRADICTIONS: Meeting decision conflicts with what was said/decided in chat
|
||||
BLIND SPOTS: Important things from the meeting that the chat teams don't seem to know about
|
||||
|
||||
Return ONLY a valid JSON object:
|
||||
{
|
||||
"confirmations": [
|
||||
{"meeting_signal": "...", "chat_signal": "...", "group": "...", "significance": "high|medium|low"}
|
||||
],
|
||||
"contradictions": [
|
||||
{"meeting_signal": "...", "chat_signal": "...", "group": "...", "impact": "...", "significance": "high|medium|low"}
|
||||
],
|
||||
"blind_spots": [
|
||||
{"meeting_signal": "...", "teams_unaware": ["group1", "group2"], "recommendation": "..."}
|
||||
]
|
||||
}
|
||||
|
||||
Rules:
|
||||
- Only include HIGH confidence matches — do not stretch for weak connections
|
||||
- Keep each signal description concise (1 sentence max)
|
||||
- significance "high" = this matters for team alignment; "medium" = worth noting; "low" = minor
|
||||
- If a category has nothing meaningful, use an empty array []
|
||||
- Return JSON only"""
|
||||
|
||||
|
||||
async def find_cross_references(
    meeting_id: str,
    group_id: str = None,
    cross_ref_group_ids: list[str] = None,
) -> dict:
    """
    Compare meeting signals against chat group signals.

    Args:
        meeting_id: The meeting to analyze
        group_id: ChromaDB group where meet signals are stored (defaults to MEET_DEFAULT_GROUP_ID)
        cross_ref_group_ids: Groups to compare against (defaults to MEET_CROSS_REF_GROUPS from config)

    Returns:
        Dict with confirmations, contradictions, blind_spots lists; on any
        failure the three lists are empty and an "error" key explains why.
    """
    # Function-scope import keeps agent modules decoupled at import time.
    from backend.agents.json_utils import extract_json_object

    group_id = group_id or MEET_DEFAULT_GROUP_ID
    cross_ref_group_ids = cross_ref_group_ids or MEET_CROSS_REF_GROUPS

    if not cross_ref_group_ids:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": "No cross-reference groups configured. Set MEET_CROSS_REF_GROUPS in .env",
        }

    # 1. Get meeting signals (decisions, actions, blockers, risks — NOT raw chunks)
    meet_signals = query_signals(group_id, meeting_id, n_results=30)
    structured_meet = [
        s for s in meet_signals
        if s.get("metadata", {}).get("type") in ("meet_decision", "meet_action_item", "meet_blocker", "meet_risk", "meet_open_q")
    ]

    if not structured_meet:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": f"No structured signals found for meeting {meeting_id}. Has it been processed yet?",
        }

    # 2. Get signals from each cross-reference group
    chat_context_parts = []
    for gid in cross_ref_group_ids:
        try:
            all_sig = get_all_signals(gid)
            if all_sig:
                formatted = "\n".join([
                    f"  [{s.get('metadata', {}).get('type', '?')}] {s.get('document', '')[:120]}"
                    for s in all_sig[:20]  # Cap at 20 per group to stay within token limits
                ])
                chat_context_parts.append(f"Group '{gid}':\n{formatted}")
        except Exception as e:
            logger.warning(f"Could not load signals for group {gid}: {e}")

    if not chat_context_parts:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": "Could not load any signals from cross-reference groups.",
        }

    # 3. Format inputs for LLM
    meet_text = "\n".join([
        f"  [{s.get('metadata', {}).get('type', '?')}] {s.get('document', '')[:150]}" for s in structured_meet
    ])
    chat_text = "\n\n".join(chat_context_parts)

    prompt = f"""MEETING SIGNALS (from meeting: {meeting_id}):
{meet_text}

CHAT SIGNALS (from monitored Telegram groups):
{chat_text}"""

    try:
        result = await call_llm(
            task_type="reasoning",
            messages=[
                {"role": "system", "content": CROSS_REF_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=0.2,
            max_tokens=1500,
            response_format={"type": "json_object"},
        )
        # Use the shared robust parser (same one classifier.py uses) instead of
        # ad-hoc split-on-``` stripping: handles fences, wrapper prose, and
        # guarantees a dict result (raises otherwise, which the except below
        # turns into a structured error response).
        return extract_json_object(result["content"])

    except Exception as e:
        logger.error(f"Cross-reference LLM call failed: {e}")
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": str(e),
        }
|
||||
|
||||
|
||||
def format_cross_ref_for_telegram(analysis: dict, meeting_id: str) -> str:
    """Format cross-reference results as a Telegram (Markdown) message.

    Defensive against partially-formed LLM output: every per-item field is
    read with .get() so a missing key degrades to an empty string instead of
    raising KeyError mid-format (the original mixed .get() with direct
    indexing and could crash on LLM output missing a field).
    """
    parts = [f"🔗 *Meet ↔ Chat Cross-Reference*\nMeeting: `{meeting_id}`\n"]

    if analysis.get("error"):
        return f"⚠️ Cross-reference failed: {analysis['error']}"

    confirmations = analysis.get("confirmations", [])
    contradictions = analysis.get("contradictions", [])
    blind_spots = analysis.get("blind_spots", [])

    if not confirmations and not contradictions and not blind_spots:
        return f"🔗 *Meet ↔ Chat Cross-Reference*\nMeeting `{meeting_id}`: No significant connections found between this meeting and your chat groups."

    if confirmations:
        parts.append(f"✅ *Confirmations* ({len(confirmations)})")
        for c in confirmations[:3]:  # Cap at 3 for readability
            sig = "🔴" if c.get("significance") == "high" else "🟡"
            parts.append(f"{sig} Meeting: _{c.get('meeting_signal', '')[:100]}_")
            parts.append(f"  Matches [{c.get('group', '?')}]: _{c.get('chat_signal', '')[:100]}_\n")

    if contradictions:
        parts.append(f"⚡ *Contradictions* ({len(contradictions)}) — ACTION NEEDED")
        for c in contradictions[:3]:
            parts.append(f"🔴 Meeting decided: _{c.get('meeting_signal', '')[:100]}_")
            parts.append(f"  BUT [{c.get('group', '?')}] says: _{c.get('chat_signal', '')[:100]}_")
            if c.get("impact"):
                parts.append(f"  Impact: {c['impact'][:100]}\n")

    if blind_spots:
        parts.append(f"🔦 *Blind Spots* ({len(blind_spots)}) — Teams may not know")
        for b in blind_spots[:3]:
            parts.append(f"🟠 {b.get('meeting_signal', '')[:120]}")
            if b.get("recommendation"):
                parts.append(f"  → {b['recommendation'][:100]}\n")

    return "\n".join(parts)
|
||||
342
thirdeye/backend/agents/meet_ingestor.py
Normal file
342
thirdeye/backend/agents/meet_ingestor.py
Normal file
@@ -0,0 +1,342 @@
|
||||
"""
|
||||
Meet Ingestor Agent
|
||||
Processes raw Google Meet transcript chunks and extracts structured signals.
|
||||
|
||||
Signal types produced:
|
||||
meet_decision — A decision made during the meeting
|
||||
meet_action_item — A task assigned to someone
|
||||
meet_blocker — A blocker or dependency raised
|
||||
meet_risk — A risk or concern identified
|
||||
meet_open_q — An unresolved question left open
|
||||
meet_summary — Full meeting summary (emitted on is_final=True)
|
||||
meet_chunk_raw — Raw transcript chunk (always stored, for full-text search)
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
from backend.providers import call_llm
|
||||
from backend.db.chroma import store_signals
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.meet_ingestor")
|
||||
|
||||
|
||||
# ─── Extraction prompt ───────────────────────────────────────────────────────
|
||||
|
||||
EXTRACTION_SYSTEM_PROMPT = """You are an expert meeting analyst. You receive raw transcript chunks from a Google Meet recording and extract structured signals.
|
||||
|
||||
Extract ONLY signals that are clearly present. Do NOT hallucinate or infer beyond what is stated.
|
||||
|
||||
Return ONLY a valid JSON object with this exact structure:
|
||||
{
|
||||
"decisions": [
|
||||
{"text": "...", "owner": "@name or null", "confidence": "high|medium|low"}
|
||||
],
|
||||
"action_items": [
|
||||
{"text": "...", "owner": "@name or null", "due": "date string or null", "confidence": "high|medium|low"}
|
||||
],
|
||||
"blockers": [
|
||||
{"text": "...", "blocking_what": "...", "confidence": "high|medium|low"}
|
||||
],
|
||||
"risks": [
|
||||
{"text": "...", "severity": "high|medium|low", "confidence": "high|medium|low"}
|
||||
],
|
||||
"open_questions": [
|
||||
{"text": "...", "confidence": "high|medium|low"}
|
||||
]
|
||||
}
|
||||
|
||||
Rules:
|
||||
- If a category has nothing, use an empty array []
|
||||
- owner must start with @ if it's a person's name (e.g. "@Alex")
|
||||
- text must be a clear, standalone sentence — not a fragment
|
||||
- Only include confidence "high" if the signal is unambiguous
|
||||
- Do NOT reproduce filler words, pleasantries, or off-topic banter
|
||||
- Return JSON only — no markdown, no preamble, no explanation"""
|
||||
|
||||
|
||||
SUMMARY_SYSTEM_PROMPT = """You are a meeting intelligence expert. Given a full meeting transcript (possibly from multiple chunks), write a concise but complete meeting summary.
|
||||
|
||||
Structure your summary as:
|
||||
1. One-sentence overview (what was the meeting about)
|
||||
2. Key decisions made (bullet points, max 5)
|
||||
3. Action items assigned (who does what by when)
|
||||
4. Blockers or risks raised
|
||||
5. Open questions still unresolved
|
||||
|
||||
Keep the summary under 400 words. Be specific. Use names when available. Do NOT use filler phrases like "the team discussed" — just state what was decided/agreed/assigned."""
|
||||
|
||||
|
||||
# ─── Signal builder ─────────────────────────────────────────────────────────
|
||||
|
||||
def _build_signal(
|
||||
signal_type: str,
|
||||
summary: str,
|
||||
raw_quote: str,
|
||||
severity: str,
|
||||
entities: list[str],
|
||||
keywords: list[str],
|
||||
timestamp: str,
|
||||
group_id: str,
|
||||
meeting_id: str,
|
||||
urgency: str = "none",
|
||||
status: str = "open",
|
||||
) -> dict:
|
||||
return {
|
||||
"id": str(uuid.uuid4()),
|
||||
"type": signal_type,
|
||||
"summary": summary,
|
||||
"raw_quote": raw_quote[:500] if raw_quote else "",
|
||||
"severity": severity,
|
||||
"status": status,
|
||||
"sentiment": "neutral",
|
||||
"urgency": urgency,
|
||||
"entities": entities,
|
||||
"keywords": keywords,
|
||||
"timestamp": timestamp,
|
||||
"group_id": group_id,
|
||||
"lens": "meet",
|
||||
"meeting_id": meeting_id,
|
||||
}
|
||||
|
||||
|
||||
def _extract_entities(text: str, owner: str = None) -> list[str]:
|
||||
"""Extract entity strings from text (names starting with @)."""
|
||||
import re
|
||||
entities = re.findall(r"@[\w]+", text)
|
||||
if owner and owner.startswith("@"):
|
||||
entities.append(owner)
|
||||
return list(set(entities))
|
||||
|
||||
|
||||
def _extract_keywords(text: str) -> list[str]:
|
||||
"""Simple keyword extraction: lowercase meaningful words."""
|
||||
stopwords = {"the", "a", "an", "is", "are", "was", "were", "will", "to", "of",
|
||||
"in", "on", "at", "for", "by", "with", "this", "that", "and", "or",
|
||||
"but", "we", "i", "it", "be", "do", "have", "has", "had", "not"}
|
||||
words = text.lower().split()
|
||||
keywords = [w.strip(".,!?;:\"'") for w in words if len(w) > 3 and w not in stopwords]
|
||||
return list(dict.fromkeys(keywords))[:10] # deduplicate, keep first 10
|
||||
|
||||
|
||||
# ─── Main processing function ────────────────────────────────────────────────
|
||||
|
||||
async def process_meet_chunk(
    meeting_id: str,
    group_id: str,
    chunk_index: int,
    text: str,
    speaker: str,
    timestamp: str,
    is_final: bool,
) -> list[dict]:
    """
    Full pipeline for a transcript chunk:
    1. Always store raw chunk for full-text search
    2. Extract structured signals via LLM
    3. If is_final, generate a full meeting summary

    LLM extraction failure is non-fatal: the raw chunk is still stored.

    Args:
        meeting_id: Identifier grouping all chunks of one meeting.
        group_id: ChromaDB group the signals are stored under.
        chunk_index: Ordinal position of this chunk within the meeting.
        text: Raw transcript text of the chunk.
        speaker: Speaker name for the chunk ("Unknown" when unattributed).
        timestamp: ISO timestamp attached to every signal built here.
        is_final: True on the last chunk — triggers summary generation.

    Returns:
        The list of signal dicts that were stored.
    """
    logger.info(f"Processing meet chunk {chunk_index} for meeting {meeting_id} ({len(text)} chars)")
    signals_to_store = []

    # 1. Always store the raw chunk (enables full-text similarity search later)
    raw_signal = _build_signal(
        signal_type="meet_chunk_raw",
        summary=f"[{meeting_id}] Chunk {chunk_index}: {text[:120]}...",
        raw_quote=text,
        severity="low",
        entities=[f"@{speaker}"] if speaker and speaker != "Unknown" else [],
        keywords=_extract_keywords(text),
        timestamp=timestamp,
        group_id=group_id,
        meeting_id=meeting_id,
    )
    signals_to_store.append(raw_signal)

    # 2. Extract structured signals via LLM
    try:
        result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": EXTRACTION_SYSTEM_PROMPT},
                {"role": "user", "content": f"Transcript chunk from speaker '{speaker}':\n\n{text}"},
            ],
            temperature=0.1,
            max_tokens=1500,
            response_format={"type": "json_object"},
        )

        raw_json = result["content"].strip()
        # Strip markdown code fences if present
        if raw_json.startswith("```"):
            raw_json = raw_json.split("```")[1]
            if raw_json.startswith("json"):
                raw_json = raw_json[4:]
        extracted = json.loads(raw_json)

    except Exception as e:
        # Extraction is best-effort; fall through with no structured signals.
        logger.warning(f"Meet extraction LLM failed for chunk {chunk_index}: {e}")
        extracted = {}

    # Decisions — only high/medium confidence are kept.
    for item in extracted.get("decisions", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_decision",
                summary=item["text"],
                raw_quote=item["text"],
                severity="medium",
                entities=_extract_entities(item["text"], item.get("owner")),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                status="decided",
            ))

    # Action items — a due date bumps urgency from low to medium.
    for item in extracted.get("action_items", []):
        if item.get("confidence") in ("high", "medium"):
            due_str = f" Due: {item['due']}." if item.get("due") else ""
            signals_to_store.append(_build_signal(
                signal_type="meet_action_item",
                summary=f"{item['text']}{due_str}",
                raw_quote=item["text"],
                severity="medium",
                entities=_extract_entities(item["text"], item.get("owner")),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                urgency="medium" if item.get("due") else "low",
                status="open",
            ))

    # Blockers — always high severity/urgency.
    for item in extracted.get("blockers", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_blocker",
                summary=item["text"],
                raw_quote=item["text"],
                severity="high",
                entities=_extract_entities(item["text"]),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                urgency="high",
                status="open",
            ))

    # Risks
    # NOTE(review): unlike the other categories, risks are NOT filtered by
    # confidence — low-confidence risks are stored too. Confirm this is
    # intentional (erring on the side of surfacing risks).
    for item in extracted.get("risks", []):
        signals_to_store.append(_build_signal(
            signal_type="meet_risk",
            summary=item["text"],
            raw_quote=item["text"],
            severity=item.get("severity", "medium"),
            entities=_extract_entities(item["text"]),
            keywords=_extract_keywords(item["text"]),
            timestamp=timestamp,
            group_id=group_id,
            meeting_id=meeting_id,
            urgency="medium",
            status="open",
        ))

    # Open questions — only high/medium confidence are kept.
    for item in extracted.get("open_questions", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_open_q",
                summary=item["text"],
                raw_quote=item["text"],
                severity="low",
                entities=_extract_entities(item["text"]),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                status="open",
            ))

    # 3. If this is the final chunk, generate a meeting summary
    if is_final:
        summary_signal = await _generate_meeting_summary(
            meeting_id, group_id, text, speaker, timestamp
        )
        if summary_signal:
            signals_to_store.append(summary_signal)

    # Store everything
    if signals_to_store:
        store_signals(group_id, signals_to_store)
        logger.info(
            f"Stored {len(signals_to_store)} signals for meeting {meeting_id} chunk {chunk_index}"
        )

    return signals_to_store
|
||||
|
||||
|
||||
async def _generate_meeting_summary(
    meeting_id: str,
    group_id: str,
    final_chunk_text: str,
    speaker: str,
    timestamp: str,
) -> dict | None:
    """Summarize a finished meeting from its stored raw chunks.

    Retrieves every raw transcript chunk for the meeting from ChromaDB and
    asks the LLM for a summary. If retrieval fails or yields nothing, only
    the final chunk is summarized. Returns a meet_summary signal dict, or
    None when the LLM call fails.
    """
    from backend.db.chroma import query_signals

    # Reassemble the meeting transcript from stored raw chunks; any failure
    # degrades gracefully to summarizing just the final chunk.
    full_transcript = final_chunk_text
    try:
        chunks = query_signals(
            group_id,
            meeting_id,
            n_results=50,
            signal_type="meet_chunk_raw",
        )
        parts = [
            c.get("metadata", {}).get("raw_quote", "") or c.get("document", "")
            for c in chunks
        ]
        joined = "\n\n".join(parts)
        if joined.strip():
            full_transcript = joined
    except Exception:
        pass

    try:
        result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
                {
                    "role": "user",
                    # Cap transcript length to stay within the model context.
                    "content": f"Meeting ID: {meeting_id}\n\nFull transcript:\n\n{full_transcript[:6000]}",
                },
            ],
            temperature=0.3,
            max_tokens=600,
        )
        summary_text = result["content"].strip()
    except Exception as e:
        # Summary generation is best-effort; callers treat None as "no summary".
        logger.warning(f"Meeting summary generation failed: {e}")
        return None

    return _build_signal(
        signal_type="meet_summary",
        summary=summary_text,
        raw_quote=full_transcript[:500],
        severity="medium",
        entities=[f"@{speaker}"] if speaker and speaker != "Unknown" else [],
        keywords=_extract_keywords(summary_text),
        timestamp=timestamp,
        group_id=group_id,
        meeting_id=meeting_id,
        status="completed",
    )
|
||||
114
thirdeye/backend/agents/pattern_detector.py
Normal file
114
thirdeye/backend/agents/pattern_detector.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""Pattern Detector Agent — finds trends and anomalies in accumulated signals."""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.chroma import get_all_signals
|
||||
from backend.db.models import Pattern
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.pattern_detector")
|
||||
|
||||
SYSTEM_PROMPT = """You are the Pattern Detector for ThirdEye. You analyze accumulated signals to find patterns and anomalies.
|
||||
|
||||
Detect these pattern types:
|
||||
- frequency_spike: A signal type mentioned significantly more than usual
|
||||
- knowledge_silo: Only one person discusses a critical topic (bus factor = 1)
|
||||
- recurring_issue: Same bug/problem appearing repeatedly
|
||||
- sentiment_trend: Gradual shift in tone over time
|
||||
- stale_item: Decisions proposed but never resolved, promises with no follow-up
|
||||
|
||||
Respond ONLY with valid JSON (no markdown, no backticks):
|
||||
{"patterns": [{"type": "pattern_type", "description": "Clear human-readable description", "severity": "info|warning|critical", "evidence_ids": [], "recommendation": "Suggested action"}]}
|
||||
|
||||
If no patterns found: {"patterns": []}
|
||||
Only report patterns that are genuinely concerning. Do NOT manufacture patterns from insufficient data."""
|
||||
|
||||
|
||||
def _heuristic_detect_patterns(group_id: str, all_signals: list[dict]) -> list[Pattern]:
    """Generate conservative patterns from signal metadata when LLM output is unavailable.

    Counts signal types and entity mentions across all stored signals, then
    emits at most five Pattern objects: one recurring_issue per problem-type
    signal seen at least twice, plus a knowledge_silo hint when payment/Stripe
    entities recur. Purely heuristic — no LLM call.
    """
    from collections import Counter  # stdlib counting beats hand-rolled dict.get(...)+1

    type_counts: Counter[str] = Counter()
    entity_counts: Counter[str] = Counter()

    for s in all_signals:
        meta = s.get("metadata", {})
        type_counts[str(meta.get("type", "unknown"))] += 1

        # entities may be stored as a single string or a list of strings.
        entities = meta.get("entities", [])
        if isinstance(entities, str):
            entities = [entities]
        if isinstance(entities, list):
            for ent in entities:
                ent_key = str(ent).strip()
                if ent_key:
                    entity_counts[ent_key] += 1

    patterns: list[Pattern] = []

    # (a) Problem-category signal types seen at least twice → recurring issue.
    recurring_types = [t for t, c in type_counts.items() if c >= 2 and t in {"recurring_bug", "workaround", "tech_debt"}]
    for signal_type in recurring_types:
        patterns.append(Pattern(
            group_id=group_id,
            type="recurring_issue",
            description=f"Signal type '{signal_type}' has appeared repeatedly ({type_counts[signal_type]} times).",
            severity="warning",
            recommendation="Create a dedicated action item with owner and due date to stop repeated recurrence.",
        ))

    # (b) Repeated payment/Stripe entity mentions hint at a bus-factor-1 silo.
    silo_entities = [ent for ent, c in entity_counts.items() if c >= 2]
    if any("stripe" in ent.lower() or "payment" in ent.lower() for ent in silo_entities):
        patterns.append(Pattern(
            group_id=group_id,
            type="knowledge_silo",
            description="Critical payment-related topics are concentrated in repeated mentions, suggesting low bus factor.",
            severity="warning",
            recommendation="Document payment workflows and assign at least one backup owner.",
        ))

    # Stay conservative: never report more than five heuristic patterns.
    return patterns[:5]
|
||||
|
||||
|
||||
async def detect_patterns(group_id: str) -> list[Pattern]:
    """Analyze all signals in a group and detect patterns.

    Renders every stored signal into a compact text line, asks the reasoning
    model for patterns, and parses the JSON reply into Pattern objects. On
    any LLM/parse failure, falls back to the conservative metadata heuristic.
    """
    all_signals = get_all_signals(group_id)

    # Too little data produces noise, not patterns — bail out early.
    if len(all_signals) < 3:
        logger.info(f"Not enough signals ({len(all_signals)}) for pattern detection in {group_id}")
        return []

    # One compact line per signal so the prompt stays small and scannable.
    signals_text = "\n".join(
        f"- [{s['metadata'].get('type', '?')}] {s['document'][:100]} "
        f"(severity={s['metadata'].get('severity', '?')}, entities={s['metadata'].get('entities', '[]')}, "
        f"time={s['metadata'].get('timestamp', '?')})"
        for s in all_signals
    )

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Analyze these {len(all_signals)} signals from group '{group_id}':\n\n{signals_text}"},
    ]

    try:
        result = await call_llm("reasoning", messages, temperature=0.2, max_tokens=1500)
        parsed = extract_json_object(result.get("content", ""))

        patterns = [
            Pattern(
                group_id=group_id,
                type=p.get("type", "unknown"),
                description=p.get("description", ""),
                severity=p.get("severity", "info"),
                recommendation=p.get("recommendation", ""),
            )
            for p in parsed.get("patterns", [])
        ]
        logger.info(f"Detected {len(patterns)} patterns in {group_id}")
        return patterns

    except Exception as e:
        # LLM/parse failure is expected occasionally — fall back to heuristics.
        logger.info(f"Pattern detection LLM parse issue, using fallback: {e}")
        fallback = _heuristic_detect_patterns(group_id, all_signals)
        if fallback:
            logger.info(f"Pattern heuristic fallback produced {len(fallback)} patterns in {group_id}")
        return fallback
|
||||
68
thirdeye/backend/agents/query_agent.py
Normal file
68
thirdeye/backend/agents/query_agent.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""
|
||||
Query Agent — voice-aware signal context formatting for ThirdEye.
|
||||
|
||||
Provides _format_signal_for_context() which labels each ChromaDB signal with
|
||||
its true origin (voice note, document, meeting, chat) so the LLM can produce
|
||||
properly attributed answers like:
|
||||
"Based on what @Raj said in a voice note on Mar 14 (45s), the team decided..."
|
||||
"""
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
VOICE_CITATION_INSTRUCTION = """
|
||||
When context includes [VOICE NOTE — @name on Date (Xs)] signals, ALWAYS cite the voice note explicitly.
|
||||
Example: "Based on what @Raj said in a voice note on Mar 14 (45s), the team decided to use PostgreSQL."
|
||||
Never flatten voice signals into generic "the team discussed" language. Always name the speaker and source.
|
||||
"""
|
||||
|
||||
|
||||
def _format_signal_for_context(signal: dict) -> str:
|
||||
"""
|
||||
Format a ChromaDB signal as a context snippet for the Query Agent LLM.
|
||||
Voice-sourced signals get explicit attribution so the LLM cites them correctly.
|
||||
Accepts both flat signal dicts and dicts with a nested 'metadata' key.
|
||||
"""
|
||||
# Support both flat dicts and ChromaDB-style {"metadata": {...}, "document": ...}
|
||||
meta = signal.get("metadata", signal)
|
||||
|
||||
source = meta.get("source", signal.get("source", "chat"))
|
||||
sig_type = meta.get("type", signal.get("type", "unknown"))
|
||||
summary = meta.get("summary", signal.get("summary", ""))
|
||||
timestamp = meta.get("timestamp", signal.get("timestamp", ""))
|
||||
|
||||
date_str = ""
|
||||
if timestamp:
|
||||
try:
|
||||
dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
|
||||
date_str = dt.strftime("%b %d")
|
||||
except Exception:
|
||||
date_str = timestamp[:10]
|
||||
|
||||
if source == "voice":
|
||||
speaker = meta.get("speaker", signal.get("speaker", "Unknown"))
|
||||
duration = meta.get("voice_duration", signal.get("voice_duration", 0))
|
||||
duration_str = f"{duration}s" if duration else "?"
|
||||
return (
|
||||
f"[VOICE NOTE — @{speaker} on {date_str} ({duration_str})] "
|
||||
f"[{sig_type}] {summary}"
|
||||
)
|
||||
|
||||
if source == "document":
|
||||
return f"[DOCUMENT — {date_str}] [{sig_type}] {summary}"
|
||||
|
||||
if source == "link":
|
||||
return f"[WEB LINK — {date_str}] [{sig_type}] {summary}"
|
||||
|
||||
if sig_type in ("meet_decision", "meet_action_item", "meet_blocker", "meet_summary"):
|
||||
meeting_id = meta.get("meeting_id", signal.get("meeting_id", ""))
|
||||
return f"[MEETING {meeting_id} — {date_str}] [{sig_type}] {summary}"
|
||||
|
||||
entities_raw = meta.get("entities", signal.get("entities", []))
|
||||
if isinstance(entities_raw, str):
|
||||
import json
|
||||
try:
|
||||
entities_raw = json.loads(entities_raw)
|
||||
except Exception:
|
||||
entities_raw = []
|
||||
sender_str = entities_raw[0] if entities_raw else ""
|
||||
return f"[CHAT — {sender_str} on {date_str}] [{sig_type}] {summary}"
|
||||
128
thirdeye/backend/agents/signal_extractor.py
Normal file
128
thirdeye/backend/agents/signal_extractor.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""Signal Extractor Agent — extracts structured signals from chat messages."""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.models import Signal
|
||||
from datetime import datetime
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.signal_extractor")
|
||||
|
||||
# Lens-specific system prompts
|
||||
LENS_PROMPTS = {
|
||||
"dev": """You are the Signal Extractor for ThirdEye operating in DevLens mode.
|
||||
You analyze batches of developer team chat messages and extract STRUCTURED SIGNALS.
|
||||
|
||||
Extract ONLY signals that represent meaningful technical information. Skip greetings, small talk, emoji reactions, and meta-conversation.
|
||||
|
||||
Signal types to look for:
|
||||
- architecture_decision: Technology choices, design decisions with rationale
|
||||
- tech_debt: Shortcuts, hardcoded values, "will fix later" patterns
|
||||
- knowledge_silo_evidence: Only one person discusses a critical topic
|
||||
- recurring_bug: Same issue mentioned repeatedly
|
||||
- stack_decision: Technology/framework choices (proposed or decided)
|
||||
- deployment_risk: Risky deployment practices
|
||||
- workaround: Temporary fixes being applied repeatedly
|
||||
- delivery_commitment: A team member gives an explicit or estimated timeline/ETA ("will take 2 days", "done by Friday", "approx 3 hours")
|
||||
|
||||
Pay SPECIAL attention to delivery commitments — capture the person's name, the work item, and the exact time estimate.
|
||||
For EACH signal found, include it in the JSON array. If NO meaningful signals exist, return empty array.
|
||||
Be SELECTIVE. Quality over quantity.""",
|
||||
|
||||
"product": """You are the Signal Extractor for ThirdEye operating in ProductLens mode.
|
||||
|
||||
Signal types to look for:
|
||||
- feature_request: Features users or team members are asking for
|
||||
- delivery_commitment: A team member gives an explicit or estimated timeline/ETA ("will take 2 days", "done by Friday", "approx 3 hours")
|
||||
- user_pain_point: User difficulties, complaints, confusion
|
||||
- roadmap_drift: Discussion of topics not on the current plan
|
||||
- priority_conflict: Team members disagreeing on what's most important
|
||||
- metric_mention: Specific numbers, conversion rates, performance data
|
||||
- user_quote: Direct quotes from users/customers
|
||||
- competitor_intel: Mentions of competitor actions or features
|
||||
|
||||
Pay SPECIAL attention to delivery commitments — capture the person's name, the work item, and the exact time estimate.
|
||||
Be SELECTIVE. Quality over quantity.""",
|
||||
|
||||
"client": """You are the Signal Extractor for ThirdEye operating in ClientLens mode.
|
||||
|
||||
Signal types to look for:
|
||||
- promise: Commitments made with deadlines (explicit or implicit)
|
||||
- scope_creep: Additional requests introduced casually without formal change requests
|
||||
- sentiment_signal: Tone changes (positive praise, growing frustration, formality shifts)
|
||||
- unanswered_request: Questions or requests that haven't received responses
|
||||
- satisfaction: Explicit positive or negative feedback
|
||||
- escalation_risk: Mentions of involving management, expressing deadline concerns
|
||||
- client_decision: Decisions made by the client
|
||||
|
||||
Pay SPECIAL attention to implicit deadlines ("by end of week", "before the meeting").
|
||||
Be SELECTIVE. Quality over quantity.""",
|
||||
|
||||
"community": """You are the Signal Extractor for ThirdEye operating in CommunityLens mode.
|
||||
|
||||
Signal types: recommendation, event, issue, local_knowledge, question
|
||||
Be SELECTIVE. Quality over quantity.""",
|
||||
}
|
||||
|
||||
EXTRACTION_FORMAT = """
|
||||
Respond ONLY with valid JSON in this exact format (no markdown, no backticks, no explanation):
|
||||
{"signals": [{"type": "signal_type_here", "summary": "One clear sentence that includes specific names, numbers, timelines, and commitments", "entities": ["@person", "technology"], "severity": "low|medium|high|critical", "status": "proposed|decided|implemented|unresolved", "raw_quote": "Exact verbatim sentence(s) from the message that capture the full claim, including names, numbers, and timelines", "message_index": 0}]}
|
||||
|
||||
IMPORTANT for raw_quote: copy the FULL relevant sentence from the message, not just a topic keyword.
|
||||
Example — message "Anirban: feature page revamp will take approx 2 more days"
|
||||
WRONG raw_quote: "feature page revamp"
|
||||
CORRECT raw_quote: "feature page revamp will take approx 2 more days"
|
||||
|
||||
If no signals found: {"signals": []}
|
||||
"""
|
||||
|
||||
|
||||
async def extract_signals(messages_text: str, group_id: str, lens: str = "dev") -> list[Signal]:
    """
    Extract structured signals from a batch of formatted chat messages.

    Args:
        messages_text: Formatted string like "[Alex]: Let's use Redis\\n[Bob]: Agreed"
        group_id: Telegram group ID
        lens: Active lens mode (dev, product, client, community)

    Returns:
        List of Signal objects; empty on any LLM or parse failure.
    """
    # Unknown lens falls back to the dev prompt.
    system_prompt = LENS_PROMPTS.get(lens, LENS_PROMPTS["dev"])

    llm_messages = [
        {"role": "system", "content": system_prompt + "\n\n" + EXTRACTION_FORMAT},
        {"role": "user", "content": f"Extract signals from these messages:\n\n{messages_text}"},
    ]

    try:
        result = await call_llm("fast_large", llm_messages, temperature=0.2, max_tokens=2000)
        parsed = extract_json_object(result.get("content", ""))

        signals: list[Signal] = []
        # Build Signal objects one at a time so a single malformed entry
        # from the LLM cannot discard the whole batch.
        for raw in parsed.get("signals", []):
            try:
                signals.append(Signal(
                    group_id=group_id,
                    lens=lens,
                    type=raw.get("type", "unknown"),
                    summary=raw.get("summary", ""),
                    entities=raw.get("entities", []),
                    severity=raw.get("severity", "low"),
                    status=raw.get("status", "unknown"),
                    raw_quote=raw.get("raw_quote", ""),
                    timestamp=datetime.utcnow().isoformat(),
                ))
            except Exception as e:
                logger.warning(f"Failed to parse signal: {e}")

        logger.info(f"Extracted {len(signals)} signals from {group_id} (lens={lens}) via {result['provider']}")
        return signals

    except Exception as e:
        logger.error(f"Signal extraction failed: {e}")
        return []
|
||||
281
thirdeye/backend/agents/voice_handler.py
Normal file
281
thirdeye/backend/agents/voice_handler.py
Normal file
@@ -0,0 +1,281 @@
|
||||
"""
|
||||
Voice Handler
|
||||
Orchestrates the full pipeline for Telegram voice messages and video notes:
|
||||
|
||||
Telegram voice/video_note message
|
||||
-> download audio bytes
|
||||
-> transcribe via Groq Whisper (voice_transcriber.py)
|
||||
-> build a voice_transcript signal (stored raw for full-text search)
|
||||
-> run transcript through process_message_batch (signal extraction)
|
||||
-> all extracted signals carry voice attribution metadata
|
||||
|
||||
Voice metadata attached to every extracted signal:
|
||||
source: "voice"
|
||||
voice_file_id: Telegram file ID
|
||||
voice_duration: seconds
|
||||
speaker: sender display name
|
||||
"""
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from backend.agents.voice_transcriber import (
|
||||
transcribe_audio, download_telegram_audio, format_duration
|
||||
)
|
||||
from backend.config import ENABLE_VOICE_TRANSCRIPTION, VOICE_STORE_TRANSCRIPT
|
||||
from backend.db.chroma import store_signals
|
||||
from backend.pipeline import process_message_batch
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.voice_handler")
|
||||
|
||||
|
||||
# --- Voice transcript signal builder -----------------------------------------
|
||||
|
||||
def build_voice_transcript_signal(
    transcript: str,
    sender: str,
    group_id: str,
    voice_file_id: str,
    duration_seconds: int,
    language: str,
    timestamp: str,
) -> dict:
    """Create the raw voice_transcript signal holding the full transcription.

    Stored alongside any extracted signals so the complete transcript stays
    searchable in ChromaDB even when the extractor finds nothing structured.
    """
    # Identity / routing fields.
    identity = {
        "id": str(uuid.uuid4()),
        "type": "voice_transcript",
        "group_id": group_id,
        "lens": "voice",
        "source": "voice",
        "timestamp": timestamp,
    }
    # Searchable content: summary headline plus the verbatim transcript.
    content = {
        "summary": f"[Voice {format_duration(duration_seconds)}] @{sender}: {transcript[:200]}",
        "raw_quote": transcript,
        "entities": [f"@{sender}"],
        "keywords": _extract_voice_keywords(transcript),
    }
    # Fixed classification defaults for raw transcripts.
    classification = {
        "severity": "low",
        "status": "transcribed",
        "sentiment": "neutral",
        "urgency": "none",
    }
    # Voice attribution consumed by /ask citations.
    voice_attrs = {
        "voice_file_id": voice_file_id,
        "voice_duration": duration_seconds,
        "voice_language": language,
        "speaker": sender,
    }
    return {**identity, **content, **classification, **voice_attrs}
|
||||
|
||||
|
||||
def _extract_voice_keywords(text: str) -> list[str]:
|
||||
"""Simple keyword extraction from transcript text."""
|
||||
stopwords = {
|
||||
"the", "a", "an", "is", "are", "was", "were", "will", "to", "of",
|
||||
"in", "on", "at", "for", "by", "with", "this", "that", "and", "or",
|
||||
"but", "we", "i", "it", "be", "do", "have", "has", "had", "not",
|
||||
"so", "just", "like", "yeah", "okay", "um", "uh", "you", "me",
|
||||
}
|
||||
words = text.lower().split()
|
||||
keywords = [w.strip(".,!?;:\"'") for w in words if len(w) > 3 and w not in stopwords]
|
||||
return list(dict.fromkeys(keywords))[:12]
|
||||
|
||||
|
||||
def _inject_voice_metadata(signals: list, voice_meta: dict) -> list[dict]:
|
||||
"""
|
||||
Inject voice attribution into every signal extracted from a voice transcript.
|
||||
Accepts both Signal Pydantic model objects and plain dicts.
|
||||
This ensures /ask can cite the voice source in its answers.
|
||||
"""
|
||||
result = []
|
||||
for signal in signals:
|
||||
sig = signal.model_dump() if hasattr(signal, "model_dump") else dict(signal)
|
||||
sig["source"] = "voice"
|
||||
sig["voice_file_id"] = voice_meta.get("voice_file_id", "")
|
||||
sig["voice_duration"] = voice_meta.get("duration_seconds", 0)
|
||||
sig["voice_language"] = voice_meta.get("language", "")
|
||||
sig["speaker"] = voice_meta.get("sender", "Unknown")
|
||||
if "[Voice]" not in sig.get("summary", ""):
|
||||
sig["summary"] = f"[Voice @{voice_meta.get('sender', '?')}] {sig['summary']}"
|
||||
result.append(sig)
|
||||
return result
|
||||
|
||||
|
||||
# --- Fallback signal builder -------------------------------------------------
|
||||
|
||||
# Keywords that hint at a signal type when the LLM extraction returns nothing
|
||||
# Maps fallback signal type -> set of lowercase trigger words. Scored by
# set-intersection size against the transcript's word set in
# _build_fallback_signal; the highest-scoring type wins.
_FALLBACK_TYPE_HINTS = {
    # Requests for new capability or UI/UX changes.
    "feature_request": {
        "need", "needs", "required", "require", "want", "should", "missing",
        "add", "feature", "ui", "ux", "design", "change", "changes", "update",
        "improve", "improvement", "responsiveness", "responsive",
    },
    # Work that cannot proceed.
    "blocker": {
        "blocked", "blocking", "blocker", "stuck", "waiting", "can't", "cannot",
        "issue", "problem", "broken", "fails", "failing",
    },
    # Stated intentions / tasks to do.
    "action_item": {
        "will", "going", "plan", "todo", "do", "fix", "implement", "setup",
        "create", "build", "deploy", "check",
    },
    # Concerns, urgency, and schedule threats.
    "risk": {
        "risk", "risky", "concern", "worried", "urgent", "urgently", "critical",
        "deadline", "delay", "late",
    },
}
|
||||
|
||||
|
||||
def _build_fallback_signal(
    transcript: str,
    sender: str,
    group_id: str,
    timestamp: str,
    voice_meta: dict,
) -> dict:
    """Build a best-effort structured signal for a transcript the LLM ignored.

    Scores each candidate type by keyword overlap with the transcript and
    keeps the highest scorer; 'feature_request' is the safe default when
    nothing matches. Urgent wording bumps severity/urgency to high.
    """
    tokens = set(transcript.lower().split())

    # Pick the candidate type with the largest keyword overlap; ties and
    # zero-overlap both resolve in favor of the earliest/default choice.
    best_type = "feature_request"
    best_score = 0
    for candidate, hints in _FALLBACK_TYPE_HINTS.items():
        score = len(tokens & hints)
        if score > best_score:
            best_type, best_score = candidate, score

    severity = "high" if tokens & {"urgent", "urgently", "asap", "immediately", "critical", "now"} else "medium"

    summary = transcript[:200].strip()
    if len(transcript) > 200:
        summary += "..."

    return {
        "id": str(uuid.uuid4()),
        "type": best_type,
        "summary": f"[Voice @{sender}] {summary}",
        "raw_quote": transcript[:500],
        "severity": severity,
        "status": "unresolved",
        "sentiment": "neutral",
        "urgency": "high" if severity == "high" else "medium",
        "entities": [f"@{sender}"],
        "keywords": _extract_voice_keywords(transcript),
        "timestamp": timestamp,
        "group_id": group_id,
        "lens": "voice",
        "source": "voice",
        "speaker": sender,
        "voice_file_id": voice_meta.get("voice_file_id", ""),
        "voice_duration": voice_meta.get("duration_seconds", 0),
        "voice_language": voice_meta.get("language", ""),
    }
|
||||
|
||||
|
||||
# --- Main handler ------------------------------------------------------------
|
||||
|
||||
async def handle_voice_message(
    bot,
    group_id: str,
    sender: str,
    file_id: str,
    duration_seconds: int,
    message_date,
    is_video_note: bool = False,
) -> dict:
    """
    Full pipeline for a single voice or video note message:
    download -> transcribe -> store raw transcript -> extract signals.

    Args:
        bot: Telegram bot instance used to download the audio file.
        group_id: Telegram group the message came from.
        sender: Display name of the sender (used as speaker attribution).
        file_id: Telegram file ID of the voice/video-note payload.
        duration_seconds: Duration from Telegram metadata (pre-filtering).
        message_date: Message datetime from Telegram; may be None.
        is_video_note: True for round video notes (MP4 audio track).

    Returns:
        {"ok": True, "transcript": "...", "signals_extracted": 3, "duration": 45, ...}
        OR {"ok": False, "reason": "...", "error": "..."}
    """
    # Feature-flagged off: bail out before any network work.
    if not ENABLE_VOICE_TRANSCRIPTION:
        return {"ok": False, "reason": "disabled", "error": "Voice transcription is disabled"}

    msg_type = "video note" if is_video_note else "voice message"
    logger.info(f"Processing {msg_type} from {sender} in {group_id} ({duration_seconds}s)")

    # 1. Download audio
    try:
        audio_bytes = await download_telegram_audio(bot, file_id)
    except Exception as e:
        logger.error(f"Failed to download audio from {sender}: {e}")
        return {"ok": False, "reason": "download_failed", "error": str(e)}

    # 2. Transcribe (video notes are MP4 containers; voice notes are OGG/Opus)
    filename = "audio.mp4" if is_video_note else "audio.ogg"
    transcription = await transcribe_audio(
        audio_bytes,
        filename=filename,
        duration_seconds=duration_seconds,
    )

    # transcribe_audio returns {"ok": False, "reason": ...} for expected
    # skips (too short/long, no speech) as well as API errors.
    if not transcription["ok"]:
        logger.info(f"Transcription skipped for {sender}: {transcription['reason']}")
        return {"ok": False, "reason": transcription["reason"], "error": transcription.get("error", "")}

    transcript = transcription["transcript"]
    language = transcription.get("language", "unknown")
    # NOTE(review): the fallback datetime.utcnow().isoformat() is naive
    # (no tz offset) while the message_date branch is UTC-aware — confirm
    # downstream timestamp comparisons tolerate the mixed formats.
    timestamp = (
        message_date.replace(tzinfo=timezone.utc).isoformat()
        if message_date else datetime.utcnow().isoformat()
    )

    # 3. Store raw voice transcript signal (kept searchable even when the
    #    extractor later finds no structured signals)
    if VOICE_STORE_TRANSCRIPT:
        transcript_signal = build_voice_transcript_signal(
            transcript=transcript,
            sender=sender,
            group_id=group_id,
            voice_file_id=file_id,
            duration_seconds=duration_seconds,
            language=language,
            timestamp=timestamp,
        )
        store_signals(group_id, [transcript_signal])
        logger.info(f"Voice transcript stored for {sender} ({len(transcript)} chars)")

    # 4. Run through signal extraction pipeline — treat as a regular text message
    voice_meta = {
        "sender": sender,
        "voice_file_id": file_id,
        "duration_seconds": duration_seconds,
        "language": language,
    }

    messages = [{
        "sender": sender,
        "text": transcript,
        "timestamp": timestamp,
        "source": "voice",
        "voice_file_id": file_id,
        "voice_duration": duration_seconds,
    }]

    try:
        extracted_signals = await process_message_batch(group_id, messages)
        # NOTE(review): the metadata-injected copies are only counted here,
        # never re-stored — presumably process_message_batch persists the
        # originals itself; verify the voice attribution actually reaches
        # the DB for these signals.
        extracted_signals = _inject_voice_metadata(extracted_signals, voice_meta)
        signals_count = len(extracted_signals)

        # Fallback: if the LLM extracted nothing from a meaningful voice message,
        # create a generic signal so the content is still searchable as structured data.
        if signals_count == 0 and len(transcript.split()) >= 5:
            fallback = _build_fallback_signal(transcript, sender, group_id, timestamp, voice_meta)
            store_signals(group_id, [fallback])
            signals_count = 1
            logger.info(f"Voice fallback signal created for {sender} (0 from LLM)")
    except Exception as e:
        # Extraction failure is non-fatal: the raw transcript (step 3) is
        # already stored, so report success with zero extracted signals.
        logger.error(f"Signal extraction failed for voice from {sender}: {e}")
        signals_count = 0

    logger.info(
        f"Voice pipeline complete: {sender}, {duration_seconds}s, "
        f"{signals_count} signals, transcript={len(transcript)} chars"
    )

    return {
        "ok": True,
        "transcript": transcript,
        "signals_extracted": signals_count,
        "duration": duration_seconds,
        "sender": f"@{sender}",
        "language": language,
    }
|
||||
194
thirdeye/backend/agents/voice_transcriber.py
Normal file
194
thirdeye/backend/agents/voice_transcriber.py
Normal file
@@ -0,0 +1,194 @@
|
||||
"""
|
||||
Voice Transcriber — Groq Whisper integration.
|
||||
|
||||
Uses Groq's whisper-large-v3 model (free, already in provider stack) to transcribe
|
||||
audio bytes from Telegram voice messages and video notes into plain text.
|
||||
|
||||
Groq Whisper endpoint: https://api.groq.com/openai/v1/audio/transcriptions
|
||||
Supported formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, opus, wav, webm
|
||||
Telegram voice messages: OGG/Opus
|
||||
Telegram video notes: MP4
|
||||
|
||||
Free tier limits: 7,200 seconds of audio / hour on Groq free plan.
|
||||
At avg 30s per voice note: ~240 voice notes / hour — more than any team sends.
|
||||
"""
|
||||
import io
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from backend.config import (
|
||||
GROQ_API_KEY,
|
||||
VOICE_LANGUAGE,
|
||||
VOICE_MAX_DURATION_SECONDS,
|
||||
VOICE_MIN_DURATION_SECONDS,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.voice_transcriber")
|
||||
|
||||
GROQ_WHISPER_URL = "https://api.groq.com/openai/v1/audio/transcriptions"
|
||||
WHISPER_MODEL = "whisper-large-v3"
|
||||
|
||||
# Groq file size limit for Whisper: 25 MB
|
||||
GROQ_MAX_FILE_BYTES = 25 * 1024 * 1024
|
||||
|
||||
|
||||
# --- Main transcription function ---------------------------------------------
|
||||
|
||||
async def transcribe_audio(
|
||||
audio_bytes: bytes,
|
||||
filename: str = "audio.ogg",
|
||||
duration_seconds: int = None,
|
||||
) -> dict:
|
||||
"""
|
||||
Transcribe audio bytes using Groq Whisper.
|
||||
|
||||
Args:
|
||||
audio_bytes: Raw audio data (OGG, MP4, WAV, etc.)
|
||||
filename: Filename hint for the API (determines format detection)
|
||||
duration_seconds: Voice message duration from Telegram metadata (for pre-filtering)
|
||||
|
||||
Returns:
|
||||
{
|
||||
"ok": True,
|
||||
"transcript": "The full transcribed text...",
|
||||
"language": "en",
|
||||
"duration": 45,
|
||||
"word_count": 120,
|
||||
}
|
||||
OR on failure:
|
||||
{
|
||||
"ok": False,
|
||||
"reason": "too_long" | "too_short" | "empty" | "file_too_large" | "api_error" | "no_speech",
|
||||
"error": "optional error string",
|
||||
}
|
||||
"""
|
||||
# Pre-flight checks
|
||||
if not GROQ_API_KEY or len(GROQ_API_KEY) < 5:
|
||||
return {"ok": False, "reason": "api_error", "error": "GROQ_API_KEY not set"}
|
||||
|
||||
if not audio_bytes:
|
||||
return {"ok": False, "reason": "empty", "error": "No audio bytes received"}
|
||||
|
||||
if len(audio_bytes) > GROQ_MAX_FILE_BYTES:
|
||||
return {
|
||||
"ok": False,
|
||||
"reason": "file_too_large",
|
||||
"error": f"Audio is {len(audio_bytes) / 1024 / 1024:.1f}MB — Groq limit is 25MB",
|
||||
}
|
||||
|
||||
if duration_seconds is not None:
|
||||
if duration_seconds < VOICE_MIN_DURATION_SECONDS:
|
||||
return {
|
||||
"ok": False,
|
||||
"reason": "too_short",
|
||||
"error": f"Voice note is {duration_seconds}s — minimum is {VOICE_MIN_DURATION_SECONDS}s",
|
||||
}
|
||||
if duration_seconds > VOICE_MAX_DURATION_SECONDS:
|
||||
return {
|
||||
"ok": False,
|
||||
"reason": "too_long",
|
||||
"error": f"Voice note is {duration_seconds}s — maximum is {VOICE_MAX_DURATION_SECONDS}s",
|
||||
}
|
||||
|
||||
# Determine MIME type from filename extension
|
||||
ext_to_mime = {
|
||||
".ogg": "audio/ogg",
|
||||
".opus": "audio/ogg",
|
||||
".mp3": "audio/mpeg",
|
||||
".mp4": "video/mp4",
|
||||
".m4a": "audio/mp4",
|
||||
".wav": "audio/wav",
|
||||
".flac": "audio/flac",
|
||||
".webm": "audio/webm",
|
||||
}
|
||||
ext = "." + filename.rsplit(".", 1)[-1].lower() if "." in filename else ".ogg"
|
||||
mime_type = ext_to_mime.get(ext, "audio/ogg")
|
||||
|
||||
form_data = {
|
||||
"model": WHISPER_MODEL,
|
||||
"response_format": "verbose_json", # returns language detection
|
||||
"temperature": "0", # deterministic transcription
|
||||
}
|
||||
if VOICE_LANGUAGE:
|
||||
form_data["language"] = VOICE_LANGUAGE
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
resp = await client.post(
|
||||
GROQ_WHISPER_URL,
|
||||
headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
|
||||
files={"file": (filename, io.BytesIO(audio_bytes), mime_type)},
|
||||
data=form_data,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
error_text = ""
|
||||
try:
|
||||
error_text = e.response.json().get("error", {}).get("message", e.response.text[:200])
|
||||
except Exception:
|
||||
error_text = e.response.text[:200]
|
||||
|
||||
if e.response.status_code == 429:
|
||||
logger.warning("Groq Whisper rate limited")
|
||||
return {"ok": False, "reason": "api_error", "error": "Rate limited — try again shortly"}
|
||||
logger.error(f"Groq Whisper HTTP error {e.response.status_code}: {error_text}")
|
||||
return {"ok": False, "reason": "api_error", "error": f"HTTP {e.response.status_code}: {error_text}"}
|
||||
|
||||
except httpx.TimeoutException:
|
||||
logger.warning("Groq Whisper request timed out")
|
||||
return {"ok": False, "reason": "api_error", "error": "Request timed out after 60s"}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Groq Whisper unexpected error: {e}")
|
||||
return {"ok": False, "reason": "api_error", "error": str(e)}
|
||||
|
||||
# Parse response
|
||||
transcript = (data.get("text") or "").strip()
|
||||
|
||||
if not transcript:
|
||||
return {"ok": False, "reason": "no_speech", "error": "Whisper returned empty transcript"}
|
||||
|
||||
# Detect if Whisper only returned noise markers
|
||||
noise_patterns = {"[music]", "[noise]", "[silence]", "[inaudible]", "(music)", "(noise)"}
|
||||
if transcript.lower() in noise_patterns:
|
||||
return {"ok": False, "reason": "no_speech", "error": f"Only noise detected: {transcript}"}
|
||||
|
||||
detected_language = data.get("language", VOICE_LANGUAGE or "unknown")
|
||||
word_count = len(transcript.split())
|
||||
|
||||
logger.info(
|
||||
f"Whisper transcribed {duration_seconds or '?'}s audio -> "
|
||||
f"{word_count} words [{detected_language}]: {transcript[:60]}..."
|
||||
)
|
||||
|
||||
return {
|
||||
"ok": True,
|
||||
"transcript": transcript,
|
||||
"language": detected_language,
|
||||
"duration": duration_seconds,
|
||||
"word_count": word_count,
|
||||
}
|
||||
|
||||
|
||||
# --- Telegram-specific download helper ---------------------------------------
|
||||
|
||||
async def download_telegram_audio(bot, file_id: str) -> bytes:
    """Download a Telegram file (voice or video_note) and return raw bytes.

    Resolves the file via the Bot API, pulls the payload into memory,
    and hands back an immutable ``bytes`` object.
    """
    file_ref = await bot.get_file(file_id)
    payload = await file_ref.download_as_bytearray()
    return bytes(payload)
|
||||
|
||||
|
||||
def format_duration(seconds: int) -> str:
|
||||
"""Format seconds into human-readable string: '1m 34s' or '45s'."""
|
||||
if seconds is None:
|
||||
return "?"
|
||||
if seconds >= 60:
|
||||
return f"{seconds // 60}m {seconds % 60}s"
|
||||
return f"{seconds}s"
|
||||
84
thirdeye/backend/agents/web_search.py
Normal file
84
thirdeye/backend/agents/web_search.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""Web Search Agent — Tavily integration for real-time web context."""
|
||||
import logging
|
||||
from backend.config import TAVILY_API_KEY, ENABLE_WEB_SEARCH
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.web_search")
|
||||
|
||||
_tavily_client = None
|
||||
|
||||
|
||||
def _get_client():
    """Return the lazily-initialized module-level Tavily client (or None).

    The client is created on first use, and only when a plausible API key
    is configured. Initialization failures are logged and leave the client
    unset, so a later call can retry.
    """
    global _tavily_client
    if _tavily_client is not None:
        return _tavily_client
    if not (TAVILY_API_KEY and len(TAVILY_API_KEY) > 5):
        return _tavily_client
    try:
        from tavily import TavilyClient
        _tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
        logger.info("Tavily client initialized")
    except ImportError:
        logger.error("tavily-python not installed. Run: pip install tavily-python")
    except Exception as e:
        logger.error(f"Tavily client init failed: {e}")
    return _tavily_client
|
||||
|
||||
|
||||
async def search_web(query: str, max_results: int = 5) -> list[dict]:
    """
    Search the web using Tavily and return structured results.

    Args:
        query: Search query string
        max_results: Max results to return (1-10)

    Returns:
        List of {title, url, content, score} dicts, sorted by relevance.
        Empty list when search is disabled, unconfigured, or fails.
    """
    import asyncio  # local import: only needed on the search path

    if not ENABLE_WEB_SEARCH:
        logger.info("Web search is disabled via feature flag")
        return []

    client = _get_client()
    if not client:
        logger.warning("Tavily client not available (missing API key or install)")
        return []

    try:
        # TavilyClient.search is a synchronous HTTP call; run it in a worker
        # thread so it does not block the event loop for its full duration.
        response = await asyncio.to_thread(
            client.search,
            query=query,
            max_results=max_results,
            search_depth="basic",  # "basic" is faster + free-tier friendly; "advanced" for deeper
            include_answer=False,
            include_raw_content=False,
        )

        # Normalize to a stable schema; missing fields default to empty/0.0.
        results = [
            {
                "title": r.get("title", ""),
                "url": r.get("url", ""),
                "content": r.get("content", ""),
                "score": r.get("score", 0.0),
            }
            for r in response.get("results", [])
        ]

        logger.info(f"Tavily returned {len(results)} results for: {query[:60]}")
        return results

    except Exception as e:
        # Search is best-effort context: never propagate failures to callers.
        logger.error(f"Tavily search failed: {e}")
        return []
|
||||
|
||||
|
||||
def format_search_results_for_llm(results: list[dict]) -> str:
    """Render Tavily results as a plain-text context block for the Query Agent.

    Each result becomes a numbered section with title, source URL, and a
    content preview capped at 500 characters; sections are separated by a
    blank line. Returns "" when there are no results.
    """
    if not results:
        return ""

    sections = []
    for idx, item in enumerate(results, start=1):
        preview = item["content"][:500] if item["content"] else "No content"
        sections.append(
            f"[Web Result {idx}] {item['title']}\n"
            f"Source: {item['url']}\n"
            f"Content: {preview}"
        )
    return "\n\n".join(sections)
|
||||
785
thirdeye/backend/api/routes.py
Normal file
785
thirdeye/backend/api/routes.py
Normal file
@@ -0,0 +1,785 @@
|
||||
"""FastAPI routes for the ThirdEye dashboard."""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from fastapi import FastAPI, HTTPException, Request, BackgroundTasks
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from backend.db.chroma import get_all_signals, query_signals, get_group_ids, get_group_names
|
||||
from backend.pipeline import query_knowledge, get_lens, set_lens
|
||||
from backend.agents.pattern_detector import detect_patterns
|
||||
from backend.agents.cross_group_analyst import analyze_cross_group
|
||||
from collections import defaultdict
|
||||
|
||||
logger = logging.getLogger("thirdeye.api")
|
||||
|
||||
app = FastAPI(title="ThirdEye API", version="1.0.0")
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
|
||||
@app.get("/api/groups")
|
||||
async def list_groups():
|
||||
"""List all monitored groups."""
|
||||
group_ids = get_group_ids()
|
||||
names = get_group_names()
|
||||
groups = []
|
||||
for gid in group_ids:
|
||||
signals = get_all_signals(gid)
|
||||
groups.append({
|
||||
"group_id": gid,
|
||||
"group_name": names.get(gid, gid),
|
||||
"signal_count": len(signals),
|
||||
"lens": get_lens(gid),
|
||||
})
|
||||
return {"groups": groups}
|
||||
|
||||
|
||||
@app.get("/api/groups/{group_id}/signals")
|
||||
async def get_signals(
|
||||
group_id: str,
|
||||
signal_type: str = None,
|
||||
severity: str = None,
|
||||
lens: str = None,
|
||||
date_from: str = None,
|
||||
date_to: str = None,
|
||||
):
|
||||
"""Get signals for a group with optional filters."""
|
||||
signals = get_all_signals(group_id, signal_type=signal_type)
|
||||
|
||||
if severity:
|
||||
signals = [s for s in signals if s.get("metadata", {}).get("severity") == severity]
|
||||
if lens:
|
||||
signals = [s for s in signals if s.get("metadata", {}).get("lens") == lens]
|
||||
if date_from:
|
||||
signals = [s for s in signals if s.get("metadata", {}).get("timestamp", "") >= date_from]
|
||||
if date_to:
|
||||
signals = [s for s in signals if s.get("metadata", {}).get("timestamp", "") <= date_to]
|
||||
|
||||
signals.sort(key=lambda s: s.get("metadata", {}).get("timestamp", ""), reverse=True)
|
||||
return {"signals": signals, "count": len(signals)}
|
||||
|
||||
|
||||
@app.post("/api/groups/{group_id}/query")
|
||||
async def query_group(group_id: str, body: dict):
|
||||
"""Natural language query over a group's knowledge base."""
|
||||
question = body.get("question", "")
|
||||
if not question:
|
||||
raise HTTPException(400, "Missing 'question' field")
|
||||
try:
|
||||
answer = await query_knowledge(group_id, question)
|
||||
return {"answer": answer, "question": question}
|
||||
except Exception as e:
|
||||
logger.warning(f"Query failed for {group_id}: {e}")
|
||||
raise HTTPException(500, "Query processing failed — please try again")
|
||||
|
||||
|
||||
@app.get("/api/groups/{group_id}/patterns")
|
||||
async def get_patterns(group_id: str):
|
||||
"""Detect and return patterns for a group."""
|
||||
try:
|
||||
patterns = await asyncio.wait_for(detect_patterns(group_id), timeout=25.0)
|
||||
return {"patterns": [p.model_dump() for p in patterns]}
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(f"Pattern detection timed out for {group_id}")
|
||||
return {"patterns": []}
|
||||
except Exception as e:
|
||||
logger.warning(f"Pattern detection failed for {group_id}: {e}")
|
||||
return {"patterns": []}
|
||||
|
||||
|
||||
@app.get("/api/cross-group/insights")
|
||||
async def get_cross_group_insights():
|
||||
"""Run cross-group analysis and return insights."""
|
||||
try:
|
||||
group_ids = get_group_ids()
|
||||
if len(group_ids) < 2:
|
||||
return {"insights": [], "message": "Need at least 2 monitored groups"}
|
||||
|
||||
summaries = {}
|
||||
for gid in group_ids:
|
||||
summaries[gid] = get_all_signals(gid)
|
||||
|
||||
insights = await asyncio.wait_for(analyze_cross_group(summaries), timeout=25.0)
|
||||
return {"insights": [i.model_dump() for i in insights]}
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("Cross-group analysis timed out — returning heuristic fallback")
|
||||
from backend.agents.cross_group_analyst import _heuristic_cross_group_insights
|
||||
fallback = _heuristic_cross_group_insights(summaries)
|
||||
return {"insights": [i.model_dump() for i in fallback]}
|
||||
except Exception as e:
|
||||
logger.warning(f"Cross-group analysis failed: {e}")
|
||||
return {"insights": [], "message": "Analysis temporarily unavailable"}
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health():
|
||||
return {"status": "ok", "service": "thirdeye"}
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# Google Meet Ingestion Endpoints
|
||||
# ─────────────────────────────────────────────
|
||||
|
||||
from pydantic import BaseModel
|
||||
from backend.config import MEET_INGEST_SECRET, ENABLE_MEET_INGESTION
|
||||
|
||||
class MeetStartPayload(BaseModel):
    """Body of POST /api/meet/start — sent once when a meeting begins."""
    # Google Meet meeting identifier; also tags downstream signals.
    meeting_id: str
    # Target ThirdEye group; defaults to the shared meet bucket.
    group_id: str = "meet_sessions"
    # Meeting start time as reported by the extension (ISO-8601 string).
    started_at: str
    # Display name of the user who started the capture.
    speaker: str = "Unknown"
|
||||
|
||||
class MeetChunkPayload(BaseModel):
    """Body of POST /api/meet/ingest — one transcript chunk (sent ~every 30s)."""
    # Meeting this chunk belongs to.
    meeting_id: str
    # Target ThirdEye group; defaults to the shared meet bucket.
    group_id: str = "meet_sessions"
    # Position of this chunk within the meeting.
    chunk_index: int
    # Raw transcript text captured by the extension.
    text: str
    # Speaker label as reported by the extension.
    speaker: str = "Unknown"
    # Capture time of the chunk (ISO-8601 string).
    timestamp: str
    # True on the final chunk of the meeting.
    is_final: bool = False
|
||||
|
||||
def _verify_meet_secret(request: Request):
    """Reject the request with 403 unless it carries the shared Meet secret.

    The extension sends the secret in the ``X-ThirdEye-Secret`` header.
    NOTE(review): if MEET_INGEST_SECRET is configured as an empty string, a
    request with no header would pass ("" == "") — confirm config enforces
    a non-empty secret.
    """
    secret = request.headers.get("X-ThirdEye-Secret", "")
    if secret != MEET_INGEST_SECRET:
        # HTTPException is already imported at module scope; the previous
        # redundant function-local import has been removed.
        raise HTTPException(status_code=403, detail="Invalid Meet ingest secret")
|
||||
|
||||
@app.post("/api/meet/start")
|
||||
async def meet_start(payload: MeetStartPayload, request: Request):
|
||||
"""Called by extension when a new meeting begins."""
|
||||
_verify_meet_secret(request)
|
||||
if not ENABLE_MEET_INGESTION:
|
||||
return {"ok": False, "reason": "Meet ingestion disabled"}
|
||||
|
||||
# Store a meeting-started signal immediately
|
||||
from backend.db.chroma import store_signals
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
|
||||
signal = {
|
||||
"id": str(uuid.uuid4()),
|
||||
"type": "meet_started",
|
||||
"summary": f"Meeting {payload.meeting_id} started by {payload.speaker}",
|
||||
"raw_quote": "",
|
||||
"severity": "low",
|
||||
"status": "active",
|
||||
"sentiment": "neutral",
|
||||
"urgency": "none",
|
||||
"entities": [f"@{payload.speaker}", f"#{payload.meeting_id}"],
|
||||
"keywords": ["meeting", "started", payload.meeting_id],
|
||||
"timestamp": payload.started_at,
|
||||
"group_id": payload.group_id,
|
||||
"lens": "meet",
|
||||
"meeting_id": payload.meeting_id,
|
||||
}
|
||||
store_signals(payload.group_id, [signal])
|
||||
return {"ok": True, "meeting_id": payload.meeting_id}
|
||||
|
||||
|
||||
@app.post("/api/meet/ingest")
|
||||
async def meet_ingest(payload: MeetChunkPayload, request: Request, background_tasks: BackgroundTasks):
|
||||
"""Called by extension every 30s with a transcript chunk."""
|
||||
_verify_meet_secret(request)
|
||||
if not ENABLE_MEET_INGESTION:
|
||||
return {"ok": False, "reason": "Meet ingestion disabled"}
|
||||
|
||||
if len(payload.text.strip()) < 10:
|
||||
return {"ok": True, "skipped": True, "reason": "chunk too short"}
|
||||
|
||||
# Process asynchronously so the extension gets a fast response
|
||||
from backend.agents.meet_ingestor import process_meet_chunk
|
||||
background_tasks.add_task(
|
||||
process_meet_chunk,
|
||||
payload.meeting_id,
|
||||
payload.group_id,
|
||||
payload.chunk_index,
|
||||
payload.text,
|
||||
payload.speaker,
|
||||
payload.timestamp,
|
||||
payload.is_final,
|
||||
)
|
||||
|
||||
return {
|
||||
"ok": True,
|
||||
"meeting_id": payload.meeting_id,
|
||||
"chunk_index": payload.chunk_index,
|
||||
"queued": True,
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/meet/meetings")
|
||||
async def list_meetings():
|
||||
"""List all recorded meetings with their signal counts."""
|
||||
from backend.db.chroma import get_all_signals, get_group_ids
|
||||
|
||||
meetings = {}
|
||||
|
||||
# Check all groups for meet-related signals
|
||||
for group_id in get_group_ids():
|
||||
all_signals = get_all_signals(group_id)
|
||||
for sig in all_signals:
|
||||
# Only process signals that have lens="meet"
|
||||
if sig.get("metadata", {}).get("lens") != "meet":
|
||||
continue
|
||||
|
||||
mid = sig.get("metadata", {}).get("meeting_id", "unknown")
|
||||
if not mid or mid == "":
|
||||
continue
|
||||
|
||||
if mid not in meetings:
|
||||
meetings[mid] = {"meeting_id": mid, "signal_count": 0, "types": {}}
|
||||
meetings[mid]["signal_count"] += 1
|
||||
t = sig.get("metadata", {}).get("type", "unknown")
|
||||
meetings[mid]["types"][t] = meetings[mid]["types"].get(t, 0) + 1
|
||||
|
||||
return {"meetings": list(meetings.values())}
|
||||
|
||||
|
||||
@app.get("/api/meet/meetings/{meeting_id}/signals")
|
||||
async def get_meeting_signals(meeting_id: str):
|
||||
"""Get all signals for a specific meeting."""
|
||||
from backend.db.chroma import get_all_signals, get_group_ids
|
||||
|
||||
all_signals = []
|
||||
for gid in get_group_ids():
|
||||
for sig in get_all_signals(gid):
|
||||
meta = sig.get("metadata", {})
|
||||
if meta.get("meeting_id") == meeting_id and meta.get("lens") == "meet":
|
||||
all_signals.append(sig)
|
||||
|
||||
all_signals.sort(key=lambda s: s.get("metadata", {}).get("timestamp", ""))
|
||||
return {"meeting_id": meeting_id, "signals": all_signals, "count": len(all_signals)}
|
||||
|
||||
|
||||
@app.get("/api/meet/meetings/{meeting_id}")
|
||||
async def get_meeting_detail(meeting_id: str):
|
||||
"""Get detailed info for a single meeting."""
|
||||
from backend.db.chroma import get_all_signals, get_group_ids
|
||||
|
||||
signals_by_type: dict = {}
|
||||
started_at = ""
|
||||
speaker = "Unknown"
|
||||
group_id = ""
|
||||
|
||||
for gid in get_group_ids():
|
||||
for sig in get_all_signals(gid):
|
||||
meta = sig.get("metadata", {})
|
||||
if meta.get("meeting_id") != meeting_id or meta.get("lens") != "meet":
|
||||
continue
|
||||
sig_type = meta.get("type", "unknown")
|
||||
signals_by_type.setdefault(sig_type, []).append(sig)
|
||||
if sig_type == "meet_started":
|
||||
started_at = meta.get("timestamp", "")
|
||||
speaker = meta.get("speaker", "Unknown") or meta.get("entities", "Unknown")
|
||||
group_id = gid
|
||||
|
||||
# Summary signal text
|
||||
summary_text = ""
|
||||
for sig in signals_by_type.get("meet_summary", []):
|
||||
summary_text = sig.get("metadata", {}).get("summary", "") or sig.get("document", "")
|
||||
break
|
||||
|
||||
signal_counts = {k: len(v) for k, v in signals_by_type.items()}
|
||||
total_signals = sum(signal_counts.values())
|
||||
|
||||
return {
|
||||
"meeting_id": meeting_id,
|
||||
"started_at": started_at,
|
||||
"speaker": speaker,
|
||||
"group_id": group_id,
|
||||
"total_signals": total_signals,
|
||||
"signal_counts": signal_counts,
|
||||
"summary": summary_text,
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/meet/meetings/{meeting_id}/transcript")
|
||||
async def get_meeting_transcript(meeting_id: str):
|
||||
"""Get raw transcript chunks for a meeting in chronological order."""
|
||||
from backend.db.chroma import get_all_signals, get_group_ids
|
||||
|
||||
chunks = []
|
||||
for gid in get_group_ids():
|
||||
for sig in get_all_signals(gid):
|
||||
meta = sig.get("metadata", {})
|
||||
if meta.get("meeting_id") == meeting_id and meta.get("type") == "meet_chunk_raw":
|
||||
chunks.append({
|
||||
"id": sig.get("id", ""),
|
||||
"text": meta.get("raw_quote", "") or sig.get("document", ""),
|
||||
"speaker": meta.get("speaker", "Unknown"),
|
||||
"timestamp": meta.get("timestamp", ""),
|
||||
"summary": meta.get("summary", ""),
|
||||
})
|
||||
|
||||
chunks.sort(key=lambda c: c["timestamp"])
|
||||
return {"meeting_id": meeting_id, "transcript": chunks, "chunk_count": len(chunks)}
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# Jira Endpoints
|
||||
# ─────────────────────────────────────────────
|
||||
|
||||
class JiraRaisePayload(BaseModel):
    """Body of POST /api/jira/raise — promote an existing signal to a ticket."""
    # ID of the ThirdEye signal to turn into a ticket.
    signal_id: str
    # Group that owns the signal.
    group_id: str
    # Optional Jira project override; falls back to the default project.
    # Annotated explicitly as optional — pydantic v2 rejects `str = None`.
    project_key: str | None = None
    # Skip duplicate checks when True.
    force: bool = False
|
||||
|
||||
class JiraCreatePayload(BaseModel):
    """Body of POST /api/jira/create — create a custom ticket from the dashboard."""
    # Ticket title.
    summary: str
    # Ticket body; the endpoint substitutes a placeholder when empty.
    description: str = ""
    # Optional Jira project override; falls back to the default project.
    # Annotated explicitly as optional — pydantic v2 rejects `str = None`.
    project_key: str | None = None
    # Jira issue type name.
    issue_type: str = "Task"
    # Jira priority name.
    priority: str = "Medium"
    # Labels to attach; the endpoint substitutes defaults when empty.
    labels: list = []
    # Optional Jira account id to assign the ticket to.
    assignee_account_id: str | None = None
|
||||
|
||||
|
||||
@app.get("/api/jira/tickets")
|
||||
async def list_jira_tickets(
|
||||
group_id: str = None,
|
||||
date_from: str = None,
|
||||
date_to: str = None,
|
||||
live: bool = False,
|
||||
):
|
||||
"""List all Jira tickets raised by ThirdEye across all groups."""
|
||||
from backend.db.chroma import get_all_signals, get_group_ids
|
||||
from backend.integrations.jira_client import get_issue, is_configured
|
||||
|
||||
group_ids = [group_id] if group_id else get_group_ids()
|
||||
tickets = []
|
||||
|
||||
for gid in group_ids:
|
||||
for sig in get_all_signals(gid, signal_type="jira_raised"):
|
||||
meta = sig.get("metadata", {})
|
||||
ts = meta.get("timestamp", "")
|
||||
if date_from and ts < date_from:
|
||||
continue
|
||||
if date_to and ts > date_to:
|
||||
continue
|
||||
|
||||
jira_key = meta.get("jira_key", "") or (
|
||||
json.loads(meta.get("entities", "[]") or "[]") or [""]
|
||||
)[0]
|
||||
|
||||
tickets.append({
|
||||
"id": sig.get("id", ""),
|
||||
"jira_key": jira_key,
|
||||
"jira_url": meta.get("jira_url", ""),
|
||||
"jira_summary": meta.get("jira_summary", "") or meta.get("summary", ""),
|
||||
"jira_priority": meta.get("jira_priority", "Medium"),
|
||||
"original_signal_id": meta.get("original_signal_id", "") or meta.get("raw_quote", ""),
|
||||
"group_id": gid,
|
||||
"raised_at": ts,
|
||||
"status": "Unknown",
|
||||
})
|
||||
|
||||
# Fetch live status from Jira if requested and configured
|
||||
if live and is_configured() and tickets:
|
||||
for ticket in tickets:
|
||||
if ticket["jira_key"]:
|
||||
try:
|
||||
issue_data = await get_issue(ticket["jira_key"])
|
||||
ticket["status"] = issue_data.get("status", "Unknown")
|
||||
ticket["assignee"] = issue_data.get("assignee", "Unassigned")
|
||||
if not ticket["jira_summary"]:
|
||||
ticket["jira_summary"] = issue_data.get("summary", "")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
tickets.sort(key=lambda t: t["raised_at"], reverse=True)
|
||||
return {"tickets": tickets, "count": len(tickets)}
|
||||
|
||||
|
||||
@app.get("/api/jira/tickets/{ticket_key}/status")
|
||||
async def get_jira_ticket_status(ticket_key: str):
|
||||
"""Fetch live status for a Jira ticket."""
|
||||
from backend.integrations.jira_client import get_issue, is_configured
|
||||
|
||||
if not is_configured():
|
||||
raise HTTPException(503, "Jira is not configured")
|
||||
try:
|
||||
data = await get_issue(ticket_key)
|
||||
return data
|
||||
except Exception as e:
|
||||
raise HTTPException(502, f"Jira API error: {e}")
|
||||
|
||||
|
||||
@app.post("/api/jira/raise")
|
||||
async def raise_jira_ticket(payload: JiraRaisePayload):
|
||||
"""Raise a Jira ticket for an existing ThirdEye signal."""
|
||||
from backend.db.chroma import get_all_signals
|
||||
from backend.agents.jira_agent import raise_ticket_for_signal
|
||||
from backend.integrations.jira_client import is_configured
|
||||
|
||||
if not is_configured():
|
||||
raise HTTPException(503, "Jira is not configured")
|
||||
|
||||
# Find the signal in the group
|
||||
signals = get_all_signals(payload.group_id)
|
||||
target = next((s for s in signals if s.get("id") == payload.signal_id), None)
|
||||
if not target:
|
||||
raise HTTPException(404, f"Signal {payload.signal_id} not found in group {payload.group_id}")
|
||||
|
||||
# Build flat signal dict from stored format
|
||||
meta = target.get("metadata", {})
|
||||
signal_dict = {
|
||||
"id": target.get("id", ""),
|
||||
"type": meta.get("type", "unknown"),
|
||||
"summary": meta.get("summary", "") or target.get("document", ""),
|
||||
"raw_quote": meta.get("raw_quote", ""),
|
||||
"severity": meta.get("severity", "medium"),
|
||||
"status": meta.get("status", "open"),
|
||||
"entities": json.loads(meta.get("entities", "[]") or "[]"),
|
||||
"keywords": json.loads(meta.get("keywords", "[]") or "[]"),
|
||||
"timestamp": meta.get("timestamp", ""),
|
||||
"group_id": payload.group_id,
|
||||
"lens": meta.get("lens", ""),
|
||||
}
|
||||
|
||||
result = await raise_ticket_for_signal(
|
||||
signal_dict,
|
||||
payload.group_id,
|
||||
project_key=payload.project_key,
|
||||
force=payload.force,
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
@app.post("/api/jira/create")
|
||||
async def create_jira_ticket(payload: JiraCreatePayload):
|
||||
"""Create a custom Jira ticket directly from the dashboard."""
|
||||
from backend.integrations.jira_client import create_issue, is_configured
|
||||
from backend.config import JIRA_DEFAULT_PROJECT
|
||||
from backend.db.chroma import store_signals
|
||||
import uuid as _uuid
|
||||
from datetime import datetime
|
||||
|
||||
if not is_configured():
|
||||
raise HTTPException(503, "Jira is not configured")
|
||||
|
||||
result = await create_issue(
|
||||
project_key=payload.project_key or JIRA_DEFAULT_PROJECT,
|
||||
summary=payload.summary,
|
||||
description=payload.description or "(Created from ThirdEye Dashboard)",
|
||||
issue_type=payload.issue_type,
|
||||
priority=payload.priority,
|
||||
labels=payload.labels or ["thirdeye", "dashboard"],
|
||||
assignee_account_id=payload.assignee_account_id or None,
|
||||
)
|
||||
|
||||
# Persist a jira_raised tracking signal so it appears in the ticket list
|
||||
if result.get("ok"):
|
||||
jira_key = result["key"]
|
||||
jira_url = result.get("url", "")
|
||||
tracking_signal = {
|
||||
"id": str(_uuid.uuid4()),
|
||||
"type": "jira_raised",
|
||||
"summary": payload.summary,
|
||||
"raw_quote": "manual",
|
||||
"severity": payload.priority.lower() if payload.priority else "medium",
|
||||
"status": "raised",
|
||||
"sentiment": "neutral",
|
||||
"urgency": "none",
|
||||
"entities": [jira_key],
|
||||
"keywords": ["jira", jira_key, "manual", "dashboard"],
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"group_id": "dashboard",
|
||||
"lens": "jira",
|
||||
"jira_key": jira_key,
|
||||
"jira_url": jira_url,
|
||||
"jira_summary": payload.summary,
|
||||
"jira_priority": payload.priority or "Medium",
|
||||
"original_signal_id": "manual",
|
||||
}
|
||||
store_signals("dashboard", [tracking_signal])
|
||||
logger.info(f"Stored manually-created Jira ticket {jira_key} in ChromaDB (group=dashboard)")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@app.get("/api/jira/users/search")
|
||||
async def search_jira_users(q: str = ""):
|
||||
"""Search Jira users by name or email fragment for assignee picker."""
|
||||
from backend.integrations.jira_client import search_users, is_configured
|
||||
if not is_configured():
|
||||
raise HTTPException(503, "Jira is not configured")
|
||||
if not q or len(q.strip()) < 1:
|
||||
return {"users": []}
|
||||
try:
|
||||
users = await search_users(q.strip(), max_results=8)
|
||||
return {"users": users}
|
||||
except Exception as e:
|
||||
logger.warning(f"Jira user search failed: {e}")
|
||||
return {"users": []}
|
||||
|
||||
|
||||
@app.get("/api/jira/config")
|
||||
async def get_jira_config():
|
||||
"""Check if Jira is configured and return basic project info."""
|
||||
from backend.integrations.jira_client import is_configured, test_connection, list_projects
|
||||
from backend.config import JIRA_DEFAULT_PROJECT, JIRA_BASE_URL
|
||||
|
||||
configured = is_configured()
|
||||
if not configured:
|
||||
return {"configured": False}
|
||||
|
||||
conn = await test_connection()
|
||||
projects = []
|
||||
if conn.get("ok"):
|
||||
try:
|
||||
projects = await list_projects()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return {
|
||||
"configured": True,
|
||||
"connected": conn.get("ok", False),
|
||||
"base_url": JIRA_BASE_URL,
|
||||
"default_project": JIRA_DEFAULT_PROJECT,
|
||||
"projects": projects,
|
||||
}
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# Knowledge Browser
|
||||
# ─────────────────────────────────────────────
|
||||
|
||||
_BROWSE_EXCLUDED_TYPES = {
|
||||
"jira_raised", "meet_started", "meet_chunk_raw", "voice_transcript",
|
||||
}
|
||||
|
||||
_BROWSE_STOPWORDS = {
|
||||
"the", "a", "an", "is", "in", "on", "at", "to", "and", "or", "not",
|
||||
"for", "of", "it", "this", "that", "be", "with", "as", "was", "are",
|
||||
"has", "have", "but", "by", "from", "we", "our", "they", "its",
|
||||
"will", "can", "would", "should", "about", "all", "new", "use",
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/knowledge/browse/{group_id}")
|
||||
async def browse_knowledge(
|
||||
group_id: str,
|
||||
date_from: str = None,
|
||||
date_to: str = None,
|
||||
topic: str = None,
|
||||
):
|
||||
"""
|
||||
Browse a group's knowledge base organized by AI-clustered topics and date timeline.
|
||||
Returns topics (derived from keyword frequency) and a day-by-day timeline of signals.
|
||||
Excludes internal system signals (jira_raised, meet_chunk_raw, etc.).
|
||||
"""
|
||||
all_sigs = get_all_signals(group_id)
|
||||
|
||||
# Strip system / tracking signals
|
||||
signals = [
|
||||
s for s in all_sigs
|
||||
if s.get("metadata", {}).get("type", "") not in _BROWSE_EXCLUDED_TYPES
|
||||
]
|
||||
|
||||
# Date filtering
|
||||
if date_from:
|
||||
signals = [s for s in signals if s.get("metadata", {}).get("timestamp", "") >= date_from]
|
||||
if date_to:
|
||||
signals = [s for s in signals if s.get("metadata", {}).get("timestamp", "") <= date_to + "T23:59:59"]
|
||||
|
||||
# ── Build keyword frequency map ────────────────────────────────────────────
|
||||
keyword_freq: dict[str, int] = defaultdict(int)
|
||||
for sig in signals:
|
||||
raw_kws = sig.get("metadata", {}).get("keywords", "[]")
|
||||
try:
|
||||
kws: list = json.loads(raw_kws) if isinstance(raw_kws, str) else raw_kws
|
||||
except Exception:
|
||||
kws = []
|
||||
for kw in kws:
|
||||
kw_clean = str(kw).lower().strip()
|
||||
if len(kw_clean) > 2 and kw_clean not in _BROWSE_STOPWORDS:
|
||||
keyword_freq[kw_clean] += 1
|
||||
|
||||
# Top 25 keywords become the selectable topics (must appear in ≥2 signals)
|
||||
sorted_kws = sorted(keyword_freq.items(), key=lambda x: x[1], reverse=True)
|
||||
top_topics: list[str] = [kw for kw, freq in sorted_kws[:25] if freq >= 1]
|
||||
|
||||
def _primary_topic(sig: dict) -> str:
|
||||
"""Return the highest-ranked top-topic that this signal's keywords contain."""
|
||||
raw_kws = sig.get("metadata", {}).get("keywords", "[]")
|
||||
try:
|
||||
kw_set = {str(k).lower().strip() for k in (json.loads(raw_kws) if isinstance(raw_kws, str) else raw_kws)}
|
||||
except Exception:
|
||||
kw_set = set()
|
||||
for t in top_topics:
|
||||
if t in kw_set:
|
||||
return t
|
||||
return sig.get("metadata", {}).get("type", "other").replace("_", " ")
|
||||
|
||||
def _all_topics(sig: dict) -> list[str]:
|
||||
"""Return all top-topics that this signal belongs to."""
|
||||
raw_kws = sig.get("metadata", {}).get("keywords", "[]")
|
||||
try:
|
||||
kw_set = {str(k).lower().strip() for k in (json.loads(raw_kws) if isinstance(raw_kws, str) else raw_kws)}
|
||||
except Exception:
|
||||
kw_set = set()
|
||||
matched = [t for t in top_topics if t in kw_set]
|
||||
return matched if matched else [sig.get("metadata", {}).get("type", "other").replace("_", " ")]
|
||||
|
||||
# ── Topic-filter (optional) ────────────────────────────────────────────────
|
||||
if topic:
|
||||
topic_lower = topic.lower()
|
||||
filtered = []
|
||||
for sig in signals:
|
||||
raw_kws = sig.get("metadata", {}).get("keywords", "[]")
|
||||
try:
|
||||
kws = [str(k).lower().strip() for k in (json.loads(raw_kws) if isinstance(raw_kws, str) else raw_kws)]
|
||||
except Exception:
|
||||
kws = []
|
||||
sig_type = sig.get("metadata", {}).get("type", "").replace("_", " ")
|
||||
if topic_lower in kws or topic_lower == sig_type:
|
||||
filtered.append(sig)
|
||||
signals = filtered
|
||||
|
||||
# ── Build topics summary list ──────────────────────────────────────────────
|
||||
topic_buckets: dict[str, list] = defaultdict(list)
|
||||
for sig in (get_all_signals(group_id) if not (date_from or date_to or topic) else signals):
|
||||
# Rebuild buckets from the full unfiltered set for sidebar counts
|
||||
pass
|
||||
|
||||
# Use current filtered signals for topic counts
|
||||
for sig in signals:
|
||||
primary = _primary_topic(sig)
|
||||
topic_buckets[primary].append(sig)
|
||||
|
||||
topics_summary = []
|
||||
seen_topics: set[str] = set()
|
||||
for t in top_topics:
|
||||
bucket = topic_buckets.get(t, [])
|
||||
if bucket and t not in seen_topics:
|
||||
seen_topics.add(t)
|
||||
latest_ts = max((s.get("metadata", {}).get("timestamp", "") for s in bucket), default="")
|
||||
topics_summary.append({
|
||||
"name": t,
|
||||
"signal_count": len(bucket),
|
||||
"latest": latest_ts,
|
||||
"sample_signals": [
|
||||
s.get("metadata", {}).get("summary", "") or s.get("document", "")
|
||||
for s in bucket[:2]
|
||||
],
|
||||
})
|
||||
# Add leftover types as topics
|
||||
for t, bucket in sorted(topic_buckets.items(), key=lambda x: len(x[1]), reverse=True):
|
||||
if t not in seen_topics and bucket:
|
||||
seen_topics.add(t)
|
||||
latest_ts = max((s.get("metadata", {}).get("timestamp", "") for s in bucket), default="")
|
||||
topics_summary.append({
|
||||
"name": t,
|
||||
"signal_count": len(bucket),
|
||||
"latest": latest_ts,
|
||||
"sample_signals": [
|
||||
s.get("metadata", {}).get("summary", "") or s.get("document", "")
|
||||
for s in bucket[:2]
|
||||
],
|
||||
})
|
||||
topics_summary.sort(key=lambda t: t["signal_count"], reverse=True)
|
||||
|
||||
# ── Build day-by-day timeline ──────────────────────────────────────────────
|
||||
day_buckets: dict[str, list] = defaultdict(list)
|
||||
for sig in signals:
|
||||
ts = sig.get("metadata", {}).get("timestamp", "")
|
||||
date_key = ts[:10] if ts and len(ts) >= 10 else "unknown"
|
||||
day_buckets[date_key].append(sig)
|
||||
|
||||
timeline = []
|
||||
for date_key in sorted(day_buckets.keys(), reverse=True):
|
||||
day_sigs = sorted(
|
||||
day_buckets[date_key],
|
||||
key=lambda s: s.get("metadata", {}).get("timestamp", ""),
|
||||
reverse=True,
|
||||
)
|
||||
day_topics = list(dict.fromkeys(
|
||||
t for s in day_sigs for t in _all_topics(s)
|
||||
))
|
||||
timeline.append({
|
||||
"date": date_key,
|
||||
"signals": day_sigs,
|
||||
"topics": day_topics[:6],
|
||||
"signal_count": len(day_sigs),
|
||||
})
|
||||
|
||||
# ── Date range metadata ────────────────────────────────────────────────────
|
||||
all_ts = [
|
||||
s.get("metadata", {}).get("timestamp", "")
|
||||
for s in signals
|
||||
if s.get("metadata", {}).get("timestamp", "")
|
||||
]
|
||||
date_range = {
|
||||
"earliest": min(all_ts) if all_ts else "",
|
||||
"latest": max(all_ts) if all_ts else "",
|
||||
}
|
||||
|
||||
names = get_group_names()
|
||||
return {
|
||||
"group_id": group_id,
|
||||
"group_name": names.get(group_id, group_id),
|
||||
"total_signals": len(signals),
|
||||
"date_range": date_range,
|
||||
"topics": topics_summary,
|
||||
"timeline": timeline,
|
||||
}
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# Enhanced Chat / Signals Timeline
|
||||
# ─────────────────────────────────────────────
|
||||
|
||||
@app.get("/api/signals/timeline")
async def get_signals_timeline(
    group_id: str = None,
    severity: str = None,
    lens: str = None,
    signal_type: str = None,
    date_from: str = None,
    date_to: str = None,
    limit: int = 200,
):
    """
    Cross-group signal timeline with full filter support.
    Returns signals sorted newest-first, ready for timeline rendering.
    """
    from backend.db.chroma import get_all_signals, get_group_ids, get_group_names

    target_groups = [group_id] if group_id else get_group_ids()
    group_names = get_group_names()

    # Internal bookkeeping signals that should never surface on the timeline.
    hidden_types = ("jira_raised", "meet_started")

    def _passes(meta: dict, ts: str) -> bool:
        # Every filter is optional; an unset filter accepts everything.
        if severity and meta.get("severity") != severity:
            return False
        if lens and meta.get("lens") != lens:
            return False
        if date_from and ts < date_from:
            return False
        if date_to and ts > date_to:
            return False
        return meta.get("type") not in hidden_types

    collected = []
    for gid in target_groups:
        label = group_names.get(gid, gid)
        for sig in get_all_signals(gid, signal_type=signal_type):
            meta = sig.get("metadata", {})
            if _passes(meta, meta.get("timestamp", "")):
                collected.append({**sig, "group_name": label})

    # ISO-8601 strings sort chronologically, so a plain string sort suffices.
    collected.sort(key=lambda s: s.get("metadata", {}).get("timestamp", ""), reverse=True)
    return {
        "signals": collected[:limit],
        "total": len(collected),
        "truncated": len(collected) > limit,
    }
|
||||
|
||||
1500
thirdeye/backend/bot/bot.py
Normal file
1500
thirdeye/backend/bot/bot.py
Normal file
File diff suppressed because it is too large
Load Diff
150
thirdeye/backend/bot/commands.py
Normal file
150
thirdeye/backend/bot/commands.py
Normal file
@@ -0,0 +1,150 @@
|
||||
"""
|
||||
ThirdEye bot commands — voice intelligence.
|
||||
Houses cmd_voicelog and any future command handlers that don't belong in the
|
||||
main bot.py module.
|
||||
"""
|
||||
import logging
from datetime import datetime, timezone
|
||||
|
||||
logger = logging.getLogger("thirdeye.bot.commands")
|
||||
|
||||
|
||||
async def cmd_voicelog(update, context):
    """
    /voicelog [filter]
    Audit trail of all voice note decisions, actions, and blockers in this group.

    Usage:
        /voicelog — all voice-sourced signals (last 20)
        /voicelog decisions — only decisions from voice notes
        /voicelog actions — only action items from voice notes
        /voicelog blockers — only blockers from voice notes
        /voicelog @Raj — only voice notes by Raj
        /voicelog search [query] — search voice note content
    """
    from backend.db.chroma import query_signals, get_all_signals
    from backend.agents.voice_transcriber import format_duration

    chat_id = str(update.effective_chat.id)
    args = context.args or []

    # Parse the single optional filter argument.
    filter_type = None
    filter_speaker = None
    search_query = None

    if args:
        first = args[0].lower()
        if first == "decisions":
            filter_type = "architecture_decision"
        elif first == "actions":
            filter_type = "action_item"
        elif first == "blockers":
            filter_type = "blocker"
        elif first == "search" and len(args) > 1:
            search_query = " ".join(args[1:])
        elif first.startswith("@"):
            filter_speaker = first[1:]

    await update.message.reply_text("🎤 Searching voice notes...", parse_mode="Markdown")

    if search_query:
        raw_signals = query_signals(chat_id, search_query, n_results=30)
    else:
        raw_signals = get_all_signals(chat_id)

    # Normalise: both query_signals and get_all_signals return
    # {"document": ..., "metadata": {...}, "id": ...} shaped dicts.
    # Flatten metadata to top-level for uniform field access below.
    def _flatten(s: dict) -> dict:
        meta = s.get("metadata", {})
        flat = {**meta}
        flat.setdefault("id", s.get("id", ""))
        flat.setdefault("document", s.get("document", ""))
        return flat

    all_signals = [_flatten(s) for s in raw_signals]

    # Filter to voice-sourced signals only
    voice_signals = [
        s for s in all_signals
        if s.get("source") == "voice"
        or s.get("type") == "voice_transcript"
        or "[Voice @" in s.get("summary", "")
    ]

    if filter_type:
        voice_signals = [s for s in voice_signals if s.get("type") == filter_type]
    if filter_speaker:
        voice_signals = [
            s for s in voice_signals
            if filter_speaker.lower() in s.get("speaker", "").lower()
            or filter_speaker.lower() in str(s.get("entities", [])).lower()
        ]

    # Prefer structured signals; fall back to raw transcripts if none
    structured = [s for s in voice_signals if s.get("type") != "voice_transcript"]
    display_signals = structured if structured else voice_signals

    # Sort by timestamp descending.
    # BUG FIX: stored timestamps are a mix of naive ISO strings
    # (datetime.utcnow().isoformat(), as written by the Chroma layer) and
    # "Z"-suffixed ones; parsing the latter yields *aware* datetimes, and the
    # old naive `datetime.min` fallback made sort() compare aware vs naive,
    # which raises TypeError. Normalise every key to aware UTC instead.
    def _ts(s):
        try:
            dt = datetime.fromisoformat(s.get("timestamp", "").replace("Z", "+00:00"))
        except Exception:
            return datetime.min.replace(tzinfo=timezone.utc)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt

    display_signals.sort(key=_ts, reverse=True)
    display_signals = display_signals[:20]

    if not display_signals:
        await update.message.reply_text(
            "📭 No voice note signals found. Voice notes are transcribed automatically when sent here.",
            parse_mode="Markdown",
        )
        return

    type_emoji = {
        "architecture_decision": "🏗️",
        "tech_debt": "⚠️",
        "action_item": "📌",
        "blocker": "🚧",
        "feature_request": "💡",
        "promise": "🤝",
        "risk": "🔴",
        "recurring_bug": "🐛",
        "voice_transcript": "🎤",
    }

    filter_label = ""
    if filter_type:
        filter_label = f" — {filter_type.replace('_', ' ').title()}"
    elif filter_speaker:
        filter_label = f" — @{filter_speaker}"
    elif search_query:
        filter_label = f" — '{search_query}'"

    lines = [f"🎤 *Voice Note Audit Trail*{filter_label}\n_{len(display_signals)} signal(s)_\n"]

    for sig in display_signals:
        ts = sig.get("timestamp", "")
        date_str = ""
        if ts:
            try:
                dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
                date_str = dt.strftime("%b %d")
            except Exception:
                date_str = ts[:10]  # fall back to the raw date portion

        speaker = sig.get("speaker", "")
        duration = sig.get("voice_duration", 0)
        duration_str = format_duration(int(duration)) if duration else ""
        emoji = type_emoji.get(sig.get("type", ""), "🎤")

        summary = sig.get("summary", "")
        if summary.startswith("[Voice @"):
            # Strip the "[Voice @...] " attribution prefix baked into the summary.
            summary = summary.split("] ", 1)[-1] if "] " in summary else summary

        meta_parts = [f"@{speaker}" if speaker else "", date_str, duration_str]
        meta = " · ".join(filter(None, meta_parts))
        lines.append(f"{emoji} *{meta}*\n _{summary[:100]}_\n")

    await update.message.reply_text("\n".join(lines), parse_mode="Markdown")
|
||||
62
thirdeye/backend/config.py
Normal file
62
thirdeye/backend/config.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""Central app configuration — every value is read from environment variables (.env)."""
import os
from dotenv import load_dotenv

load_dotenv()

# Telegram
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN")

# Ollama (local)
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434/v1")
OLLAMA_ENABLED = os.getenv("OLLAMA_ENABLED", "true").lower() == "true"

# LLM Providers
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# Additional Groq keys for round-robin rotation (avoids rate limits on llama-3.3-70b-versatile)
GROQ_API_KEY_2 = os.getenv("GROQ_API_KEY_2")
GROQ_API_KEY_3 = os.getenv("GROQ_API_KEY_3")
CEREBRAS_API_KEY = os.getenv("CEREBRAS_API_KEY")
SAMBANOVA_API_KEY = os.getenv("SAMBANOVA_API_KEY")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Embeddings (Cohere primary; a local model is used as fallback elsewhere)
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

# App
CHROMA_DB_PATH = os.getenv("CHROMA_DB_PATH", "./chroma_db")
BATCH_SIZE = int(os.getenv("BATCH_SIZE", "5"))  # messages per extraction batch
BATCH_TIMEOUT_SECONDS = int(os.getenv("BATCH_TIMEOUT_SECONDS", "60"))

# Web Search
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

# Feature Flags
ENABLE_DOCUMENT_INGESTION = os.getenv("ENABLE_DOCUMENT_INGESTION", "true").lower() == "true"
ENABLE_WEB_SEARCH = os.getenv("ENABLE_WEB_SEARCH", "true").lower() == "true"
ENABLE_LINK_FETCH = os.getenv("ENABLE_LINK_FETCH", "true").lower() == "true"

# Google Meet Extension
# NOTE(review): the default secret is a placeholder — must be overridden in production.
MEET_INGEST_SECRET = os.getenv("MEET_INGEST_SECRET", "thirdeye_meet_secret_change_me")
MEET_DEFAULT_GROUP_ID = os.getenv("MEET_DEFAULT_GROUP_ID", "meet_sessions")
ENABLE_MEET_INGESTION = os.getenv("ENABLE_MEET_INGESTION", "true").lower() == "true"
# Comma-separated list of group ids to cross-reference Meet signals against.
MEET_CROSS_REF_GROUPS = [
    g.strip() for g in os.getenv("MEET_CROSS_REF_GROUPS", "").split(",") if g.strip()
]

# Jira (Cloud REST API; basic auth with email + API token)
JIRA_BASE_URL = os.getenv("JIRA_BASE_URL", "").rstrip("/")
JIRA_EMAIL = os.getenv("JIRA_EMAIL", "")
JIRA_API_TOKEN = os.getenv("JIRA_API_TOKEN", "")
JIRA_DEFAULT_PROJECT = os.getenv("JIRA_DEFAULT_PROJECT", "ENG")
JIRA_DEFAULT_ISSUE_TYPE = os.getenv("JIRA_DEFAULT_ISSUE_TYPE", "Task")
ENABLE_JIRA = os.getenv("ENABLE_JIRA", "true").lower() == "true"
JIRA_AUTO_RAISE = os.getenv("JIRA_AUTO_RAISE", "false").lower() == "true"
JIRA_AUTO_RAISE_SEVERITY = os.getenv("JIRA_AUTO_RAISE_SEVERITY", "high")

# Voice Message Intelligence
ENABLE_VOICE_TRANSCRIPTION = os.getenv("ENABLE_VOICE_TRANSCRIPTION", "true").lower() == "true"
VOICE_MAX_DURATION_SECONDS = int(os.getenv("VOICE_MAX_DURATION_SECONDS", "300"))
VOICE_MIN_DURATION_SECONDS = int(os.getenv("VOICE_MIN_DURATION_SECONDS", "2"))
VOICE_LANGUAGE = os.getenv("VOICE_LANGUAGE", "")  # empty string = Whisper auto-detects
VOICE_STORE_TRANSCRIPT = os.getenv("VOICE_STORE_TRANSCRIPT", "true").lower() == "true"
|
||||
279
thirdeye/backend/db/chroma.py
Normal file
279
thirdeye/backend/db/chroma.py
Normal file
@@ -0,0 +1,279 @@
|
||||
"""ChromaDB setup and operations."""
|
||||
import json
|
||||
import uuid
|
||||
import chromadb
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from backend.config import CHROMA_DB_PATH
|
||||
from backend.db.embeddings import embed_texts, embed_query
|
||||
|
||||
logger = logging.getLogger("thirdeye.chroma")
|
||||
|
||||
# Initialize persistent client
|
||||
_chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
|
||||
|
||||
|
||||
def get_collection(group_id: str) -> chromadb.Collection:
    """Return the ChromaDB collection backing a group, creating it on first use."""
    # Collection names allow 3-63 chars of alphanumerics/underscores only,
    # so swap hyphens for underscores and clamp the length.
    name = f"ll_{group_id.replace('-', '_')}"[:63]
    return _chroma_client.get_or_create_collection(name=name)
|
||||
|
||||
|
||||
def set_group_name(group_id: str, name: str):
    """Persist the human-readable Telegram group name in the collection metadata."""
    if not name or name == group_id:
        return  # nothing meaningful to record
    try:
        collection = get_collection(group_id)
        meta = dict(collection.metadata or {})
        if meta.get("group_name") == name:
            return  # already up to date — skip the write
        meta["group_name"] = name
        collection.modify(metadata=meta)
    except Exception as e:
        # Best-effort: a naming failure must never break ingestion.
        logger.warning(f"set_group_name failed for {group_id}: {e}")
|
||||
|
||||
|
||||
def get_group_names() -> dict[str, str]:
    """Return a mapping of group_id -> human-readable name (falls back to group_id).

    Only collections created by this app (``ll_`` prefix) are considered.
    """
    result = {}
    for col in _chroma_client.list_collections():
        if not col.name.startswith("ll_"):
            continue
        # Strip only the leading "ll_" — str.replace("ll_", "") would also
        # mangle an "ll_" occurring later in the name. This now matches the
        # prefix-slice used by query_signals_global.
        group_id = col.name[len("ll_"):].replace("_", "-")
        # NOTE(review): the "_" -> "-" reversal is lossy for group ids that
        # legitimately contain underscores (e.g. "meet_sessions") — confirm.
        result[group_id] = (col.metadata or {}).get("group_name", group_id)
    return result
|
||||
|
||||
|
||||
def store_signals(group_id: str, signals: list[dict]):
    """Store extracted signals in ChromaDB with embeddings.

    Each signal dict is flattened into:
      - a document string ("type: summary | Quote: ...") used for embedding,
      - a scalar-only metadata dict (lists are JSON-encoded — Chroma metadata
        values must be scalars),
      - an id (the signal's own "id" if present, else a fresh UUID).
    """
    if not signals:
        return

    collection = get_collection(group_id)
    documents = []
    metadatas = []
    ids = []

    for signal in signals:
        # Embeddable text: type + summary, with the raw quote appended when present.
        doc_text = f"{signal['type']}: {signal['summary']}"
        if signal.get('raw_quote'):
            doc_text += f" | Quote: {signal['raw_quote']}"

        documents.append(doc_text)
        metadatas.append({
            "type": signal.get("type", "unknown"),
            "severity": signal.get("severity", "low"),
            "status": signal.get("status", "unknown"),
            "sentiment": signal.get("sentiment", "neutral"),
            "urgency": signal.get("urgency", "none"),
            # Lists serialized to JSON strings (Chroma metadata is scalar-only).
            "entities": json.dumps(signal.get("entities", [])),
            "keywords": json.dumps(signal.get("keywords", [])),
            "raw_quote": signal.get("raw_quote", ""),
            "summary": signal.get("summary", ""),
            # Naive UTC ISO string when the signal carries no timestamp.
            "timestamp": signal.get("timestamp", datetime.utcnow().isoformat()),
            "group_id": group_id,
            "lens": signal.get("lens", "unknown"),
            "meeting_id": signal.get("meeting_id", ""),
            # Voice attribution — preserved so /voicelog and /ask can cite the source
            "source": signal.get("source", ""),
            "speaker": signal.get("speaker", ""),
            "voice_file_id": signal.get("voice_file_id", ""),
            # Coerce to int; `or 0` guards against None/"" values.
            "voice_duration": int(signal.get("voice_duration", 0) or 0),
            "voice_language": signal.get("voice_language", ""),
            # Jira tracking fields (populated for jira_raised signals)
            "jira_key": signal.get("jira_key", ""),
            "jira_url": signal.get("jira_url", ""),
            "jira_summary": signal.get("jira_summary", ""),
            "jira_priority": signal.get("jira_priority", ""),
            "original_signal_id": signal.get("original_signal_id", ""),
        })
        ids.append(signal.get("id", str(uuid.uuid4())))

    # Generate embeddings (Cohere primary, local model fallback)
    embeddings = embed_texts(documents)

    collection.add(
        documents=documents,
        metadatas=metadatas,
        embeddings=embeddings,
        ids=ids,
    )
    logger.info(f"Stored {len(signals)} signals for group {group_id}")
|
||||
|
||||
|
||||
def query_signals(group_id: str, query: str, n_results: int = 10, signal_type: str = None) -> list[dict]:
    """Query the knowledge base with natural language.

    Optionally restricts results to a single signal type via a metadata filter.
    Returns [] on any query failure.
    """
    collection = get_collection(group_id)
    query_embedding = embed_query(query)
    where_filter = {"type": signal_type} if signal_type else None

    try:
        results = collection.query(
            query_embeddings=[query_embedding],
            # `or 1` keeps n_results valid when the collection is empty.
            n_results=min(n_results, collection.count() or 1),
            where=where_filter,
        )
    except Exception as e:
        logger.warning(f"Query failed: {e}")
        return []

    if not results or not results["documents"]:
        return []

    # Chroma nests results one level per query embedding; we sent one, so
    # everything lives at index [0].
    hits = []
    for i, doc in enumerate(results["documents"][0]):
        meta = results["metadatas"][0][i] if results["metadatas"] else {}
        distance = results["distances"][0][i] if results["distances"] else None
        hits.append({
            "id": results["ids"][0][i] if results.get("ids") else "",
            "document": doc,
            "metadata": meta,
            "relevance_score": 1 - (distance or 0),  # Convert distance to similarity
        })
    return hits
|
||||
|
||||
|
||||
def get_all_signals(group_id: str, signal_type: str = None) -> list[dict]:
    """Get all signals for a group (for pattern detection)."""
    collection = get_collection(group_id)
    total = collection.count()
    if total == 0:
        return []

    filters = {"type": signal_type} if signal_type else None
    try:
        batch = collection.get(where=filters, limit=total)
    except Exception:
        # Retry without the filter — NOTE(review): this silently returns
        # unfiltered results when the filtered get fails; confirm intended.
        batch = collection.get(limit=total)

    if not batch or not batch["documents"]:
        return []

    signals = []
    for idx, doc in enumerate(batch["documents"]):
        signals.append({
            "document": doc,
            "metadata": batch["metadatas"][idx] if batch["metadatas"] else {},
            "id": batch["ids"][idx],
        })
    return signals
|
||||
|
||||
|
||||
def get_group_ids() -> list[str]:
    """Get all group IDs that have collections."""
    return [
        # Strip only the leading "ll_" prefix — str.replace("ll_", "") would
        # also corrupt names containing "ll_" past position 0. Matches the
        # slicing used by query_signals_global.
        c.name[len("ll_"):].replace("_", "-")
        for c in _chroma_client.list_collections()
        if c.name.startswith("ll_")
    ]
|
||||
|
||||
|
||||
def query_signals_global(query: str, n_results: int = 5, exclude_group_id: str = None) -> list[dict]:
    """
    Search across ALL group collections for a query.
    Used as a cross-group fallback when local search returns weak results.
    Each result is annotated with its source group_id.

    Args:
        query: natural-language search text (embedded once, reused per collection).
        n_results: cap applied both per-collection and to the final merged list.
        exclude_group_id: skip this group (typically the caller's own group).
    """
    collections = _chroma_client.list_collections()
    query_embedding = embed_query(query)
    all_results = []

    for col_meta in collections:
        # Only app-owned collections carry the "ll_" prefix.
        if not col_meta.name.startswith("ll_"):
            continue

        # Derive group_id from collection name (prefix slice, then reverse
        # the '-' -> '_' mapping applied at creation time)
        raw = col_meta.name[len("ll_"):]
        group_id = raw.replace("_", "-")

        if exclude_group_id and group_id == exclude_group_id:
            continue

        try:
            col = _chroma_client.get_collection(col_meta.name)
            count = col.count()
            if count == 0:
                continue

            results = col.query(
                query_embeddings=[query_embedding],
                n_results=min(n_results, count),
            )

            # One query embedding was sent, so results nest at index [0].
            if results and results["documents"]:
                for i, doc in enumerate(results["documents"][0]):
                    meta = results["metadatas"][0][i] if results["metadatas"] else {}
                    distance = results["distances"][0][i] if results["distances"] else None
                    all_results.append({
                        "document": doc,
                        "metadata": meta,
                        "relevance_score": 1 - (distance or 0),
                        "source_group_id": group_id,
                    })
        except Exception as e:
            # One broken collection must not abort the whole cross-group search.
            logger.warning(f"Global query failed for collection {col_meta.name}: {e}")
            continue

    # Sort by relevance and return top n_results
    all_results.sort(key=lambda x: x["relevance_score"], reverse=True)
    return all_results[:n_results]
|
||||
|
||||
def mark_signal_as_raised(
    group_id: str,
    signal_id: str,
    jira_key: str,
    jira_url: str = "",
    jira_summary: str = "",
    jira_priority: str = "",
):
    """
    Tag a signal with its Jira ticket key so we never raise it twice.
    Adds a new signal of type 'jira_raised' linked to the original signal_id.

    Args:
        group_id: group whose collection receives the tracking signal.
        signal_id: id of the original signal the ticket was raised for.
        jira_key/jira_url/jira_summary/jira_priority: ticket details to record.
    """
    # NOTE: uuid and datetime are imported at module level; the previous
    # function-local re-imports were redundant and have been removed.
    tracking_signal = {
        "id": str(uuid.uuid4()),
        "type": "jira_raised",
        "summary": jira_summary or f"Jira ticket {jira_key} raised for signal {signal_id}",
        "raw_quote": signal_id,  # original signal_id — used by get_raised_signal_ids
        "severity": "low",
        "status": "raised",
        "sentiment": "neutral",
        "urgency": "none",
        "entities": [jira_key],
        "keywords": ["jira", jira_key, "raised"],
        "timestamp": datetime.utcnow().isoformat(),
        "group_id": group_id,
        "lens": "jira",
        # Jira tracking fields
        "jira_key": jira_key,
        "jira_url": jira_url,
        "jira_summary": jira_summary,
        "jira_priority": jira_priority,
        "original_signal_id": signal_id,
    }
    store_signals(group_id, [tracking_signal])
|
||||
|
||||
|
||||
def get_raised_signal_ids(group_id: str) -> set[str]:
    """
    Return the set of signal IDs that have already had Jira tickets raised.
    Used to prevent duplicates.
    """
    collection = get_collection(group_id)
    try:
        hits = collection.get(where={"type": "jira_raised"})
        found: set[str] = set()
        metadatas = hits.get("metadatas") if hits else None
        for meta in metadatas or []:
            # The original signal_id is stashed in the raw_quote field.
            original_id = meta.get("raw_quote")
            if original_id:
                found.add(original_id)
        return found
    except Exception:
        # Best-effort lookup: an empty set just means "nothing known raised".
        return set()
|
||||
67
thirdeye/backend/db/embeddings.py
Normal file
67
thirdeye/backend/db/embeddings.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""Embedding provider with Cohere primary and local fallback."""
|
||||
import cohere
|
||||
import logging
|
||||
from backend.config import COHERE_API_KEY
|
||||
|
||||
logger = logging.getLogger("thirdeye.embeddings")
|
||||
|
||||
_cohere_client = None
|
||||
_local_model = None
|
||||
|
||||
def _get_cohere():
    """Lazily build the shared Cohere client; None when no API key is configured."""
    global _cohere_client
    if _cohere_client is not None:
        return _cohere_client
    if COHERE_API_KEY:
        _cohere_client = cohere.Client(COHERE_API_KEY)
    return _cohere_client
|
||||
|
||||
def _get_local_model():
    """Lazily load the local sentence-transformers fallback model (singleton)."""
    global _local_model
    if _local_model is not None:
        return _local_model
    # Imported here so the heavy dependency only loads when Cohere is unavailable.
    from sentence_transformers import SentenceTransformer
    _local_model = SentenceTransformer("all-MiniLM-L6-v2")
    logger.info("Loaded local embedding model: all-MiniLM-L6-v2")
    return _local_model
|
||||
|
||||
|
||||
def embed_texts(texts: list[str]) -> list[list[float]]:
    """Embed a list of texts. Tries Cohere first, falls back to local model."""
    if not texts:
        return []

    # Primary path: Cohere hosted embeddings.
    cohere_client = _get_cohere()
    if cohere_client is not None:
        try:
            resp = cohere_client.embed(
                texts=texts,
                model="embed-english-v3.0",
                input_type="search_document",
            )
        except Exception as exc:
            logger.warning(f"Cohere embedding failed: {exc}, falling back to local")
        else:
            logger.info(f"Cohere embedded {len(texts)} texts")
            return [list(e) for e in resp.embeddings]

    # Fallback path: local sentence-transformers model.
    local = _get_local_model()
    vectors = local.encode(texts).tolist()
    logger.info(f"Local model embedded {len(texts)} texts")
    return vectors
|
||||
|
||||
|
||||
def embed_query(text: str) -> list[float]:
    """Embed a single query text (Cohere primary, local model fallback)."""
    client = _get_cohere()
    if client:
        try:
            response = client.embed(
                texts=[text],
                model="embed-english-v3.0",
                input_type="search_query",
            )
            return list(response.embeddings[0])
        except Exception as e:
            # Consistent with embed_texts: log the failure instead of silently
            # swallowing it, then fall through to the local model.
            logger.warning(f"Cohere query embedding failed: {e}, falling back to local")

    model = _get_local_model()
    return model.encode([text]).tolist()[0]
|
||||
57
thirdeye/backend/db/models.py
Normal file
57
thirdeye/backend/db/models.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Data models for ThirdEye."""
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
|
||||
|
||||
class Signal(BaseModel):
    """A single knowledge signal extracted from group conversation."""
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    group_id: str
    lens: str = "unknown"  # dev, product, client, community
    type: str  # architecture_decision, tech_debt, etc.
    summary: str
    entities: list[str] = []  # people/components/tools named in the signal
    severity: str = "low"  # low, medium, high, critical
    status: str = "unknown"  # proposed, decided, implemented, unresolved
    sentiment: str = "neutral"  # positive, neutral, negative, urgent (set by classifier)
    urgency: str = "none"  # none, low, medium, high, critical (set by classifier)
    raw_quote: str = ""  # verbatim quote the summary was derived from
    source_messages: list[int] = []  # presumably Telegram message ids — confirm
    timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat())  # naive UTC ISO-8601
    keywords: list[str] = []  # searchable keywords (set by classifier)
|
||||
|
||||
|
||||
class Pattern(BaseModel):
    """A trend detected across many signals within a single group."""
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    group_id: str
    type: str  # frequency_spike, knowledge_silo, recurring_issue, sentiment_trend, stale_item
    description: str
    severity: str = "info"  # info, warning, critical
    evidence_signal_ids: list[str] = []  # ids of the signals backing this pattern
    recommendation: str = ""  # suggested follow-up action, if any
    detected_at: str = Field(default_factory=lambda: datetime.utcnow().isoformat())  # naive UTC ISO-8601
    is_active: bool = True  # cleared when the pattern no longer holds
|
||||
|
||||
|
||||
class CrossGroupInsight(BaseModel):
    """A finding that spans two groups (e.g. a handoff blocked on both sides)."""
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    type: str  # blocked_handoff, conflicting_decision, information_silo, promise_reality_gap, duplicated_effort
    description: str
    group_a: dict = {}  # {name, group_id, evidence}
    group_b: dict = {}  # same shape as group_a
    severity: str = "warning"
    recommendation: str = ""
    detected_at: str = Field(default_factory=lambda: datetime.utcnow().isoformat())  # naive UTC ISO-8601
    is_resolved: bool = False
|
||||
|
||||
|
||||
class GroupConfig(BaseModel):
    """Per-group settings and counters."""
    group_id: str
    group_name: str = ""
    lens_mode: str = "auto"  # auto, dev, product, client, community
    detected_lens: str = "unknown"  # lens chosen by auto-detection when lens_mode == "auto"
    confidence: float = 0.0  # confidence of the lens detection
    is_active: bool = True
    message_count: int = 0  # running total of ingested messages
    signal_count: int = 0  # running total of extracted signals
|
||||
0
thirdeye/backend/integrations/__init__.py
Normal file
0
thirdeye/backend/integrations/__init__.py
Normal file
346
thirdeye/backend/integrations/jira_client.py
Normal file
346
thirdeye/backend/integrations/jira_client.py
Normal file
@@ -0,0 +1,346 @@
|
||||
"""
|
||||
Jira REST API v3 client — async, using httpx.
|
||||
All methods return plain dicts (no Jira SDK objects).
|
||||
Authentication: Basic auth with email + API token (Jira Cloud standard).
|
||||
Docs: https://developer.atlassian.com/cloud/jira/platform/rest/v3/
|
||||
"""
|
||||
import base64
|
||||
import logging
|
||||
from typing import Optional
|
||||
import httpx
|
||||
|
||||
from backend.config import (
|
||||
JIRA_BASE_URL, JIRA_EMAIL, JIRA_API_TOKEN, ENABLE_JIRA
|
||||
)
|
||||
|
||||
logger = logging.getLogger("thirdeye.integrations.jira")
|
||||
|
||||
# ─── Auth ────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _auth_header() -> dict:
    """Build the Basic auth header from email + API token (Jira Cloud standard)."""
    credentials = base64.b64encode(f"{JIRA_EMAIL}:{JIRA_API_TOKEN}".encode()).decode()
    return {
        "Authorization": f"Basic {credentials}",
        "Accept": "application/json",
        "Content-Type": "application/json",
    }
|
||||
|
||||
|
||||
def _base_url() -> str:
    """Root URL of the Jira Cloud REST v3 API for the configured site."""
    return JIRA_BASE_URL + "/rest/api/3"
|
||||
|
||||
|
||||
def is_configured() -> bool:
    """Return True if all required Jira config is set."""
    return all((JIRA_BASE_URL, JIRA_EMAIL, JIRA_API_TOKEN, ENABLE_JIRA))
|
||||
|
||||
|
||||
# ─── Core HTTP helpers ───────────────────────────────────────────────────────
|
||||
|
||||
async def _get(path: str, params: dict = None) -> dict:
    """GET a v3 endpoint; returns the decoded JSON body, raises on HTTP error."""
    url = f"{_base_url()}{path}"
    async with httpx.AsyncClient(timeout=15.0) as client:
        resp = await client.get(url, headers=_auth_header(), params=params or {})
    resp.raise_for_status()
    return resp.json()
|
||||
|
||||
|
||||
async def _post(path: str, body: dict) -> dict:
    """POST a JSON body to a v3 endpoint; returns the decoded response, raises on HTTP error."""
    url = f"{_base_url()}{path}"
    async with httpx.AsyncClient(timeout=15.0) as client:
        resp = await client.post(url, headers=_auth_header(), json=body)
    resp.raise_for_status()
    return resp.json()
|
||||
|
||||
|
||||
async def _put(path: str, body: dict) -> dict:
    """PUT a JSON body to a v3 endpoint; maps Jira's 204 success to {"ok": True}."""
    url = f"{_base_url()}{path}"
    async with httpx.AsyncClient(timeout=15.0) as client:
        resp = await client.put(url, headers=_auth_header(), json=body)
    resp.raise_for_status()
    # PUT /issue returns 204 No Content on success
    return {"ok": True} if resp.status_code == 204 else resp.json()
|
||||
|
||||
|
||||
# ─── Public API ──────────────────────────────────────────────────────────────
|
||||
|
||||
async def test_connection() -> dict:
    """
    Verify credentials work by calling /myself.
    Returns {"ok": True, "displayName": "...", "email": "..."} or {"ok": False, "error": "..."}
    """
    try:
        me = await _get("/myself")
    except httpx.HTTPStatusError as e:
        return {"ok": False, "error": f"HTTP {e.response.status_code}: {e.response.text[:200]}"}
    except Exception as e:
        return {"ok": False, "error": str(e)}
    return {
        "ok": True,
        "display_name": me.get("displayName", "Unknown"),
        "email": me.get("emailAddress", "Unknown"),
        "account_id": me.get("accountId", ""),
    }
|
||||
|
||||
|
||||
async def list_projects() -> list[dict]:
    """
    List all accessible Jira projects.
    Returns list of {"key": "ENG", "name": "Engineering", "id": "10001"}
    """
    data = await _get("/project/search", params={"maxResults": 50})
    projects = []
    for p in data.get("values", []):
        projects.append({
            "key": p["key"],
            "name": p["name"],
            "id": p["id"],
            "type": p.get("projectTypeKey", "software"),
        })
    return projects
|
||||
|
||||
|
||||
async def list_issue_types(project_key: str) -> list[dict]:
    """
    List issue types available for a specific project.
    Returns list of {"id": "10001", "name": "Bug", "subtask": False}
    """
    data = await _get(f"/project/{project_key}")
    types = []
    for it in data.get("issueTypes", []):
        if it.get("subtask", False):
            continue  # Exclude subtask types
        types.append({
            "id": it["id"],
            "name": it["name"],
            "subtask": it.get("subtask", False),
        })
    return types
|
||||
|
||||
|
||||
async def get_issue(issue_key: str) -> dict:
    """
    Get a single issue by key (e.g. "ENG-42").
    Returns simplified issue dict.

    Nullable nested fields are guarded with ``or {}``: Jira returns JSON null
    (not a missing key) for unset priority/status/issuetype, so a plain
    ``.get(..., {})`` still yields None and crashes on the inner ``.get()``.
    The assignee field already used this guard — now applied consistently.
    """
    data = await _get(f"/issue/{issue_key}")
    fields = data.get("fields", {})
    return {
        "key": data["key"],
        "id": data["id"],
        "summary": fields.get("summary", ""),
        "status": (fields.get("status") or {}).get("name", "Unknown"),
        "priority": (fields.get("priority") or {}).get("name", "Medium"),
        "assignee": (fields.get("assignee") or {}).get("displayName", "Unassigned"),
        "issue_type": (fields.get("issuetype") or {}).get("name", "Task"),
        "url": f"{JIRA_BASE_URL}/browse/{data['key']}",
        "created": fields.get("created", ""),
        "updated": fields.get("updated", ""),
    }
|
||||
|
||||
|
||||
async def create_issue(
    project_key: str,
    summary: str,
    description: str,
    issue_type: str = "Task",
    priority: str = "Medium",
    labels: list[str] = None,
    assignee_account_id: str = None,
) -> dict:
    """
    Create a new Jira issue.

    Args:
        project_key: Project key (e.g. "ENG")
        summary: Issue title (max ~250 chars)
        description: Full description in Atlassian Document Format (ADF)
        issue_type: "Task", "Bug", "Story", "Epic"
        priority: "Highest", "High", "Medium", "Low", "Lowest"
        labels: List of label strings (no spaces allowed in labels)
        assignee_account_id: Jira account ID to assign to (optional)

    Returns:
        {"key": "ENG-42", "id": "10042", "url": "https://..."} with "ok": True,
        or {"ok": False, "error": ..., "details": ...} on failure — this
        function never raises for HTTP errors.
    """
    fields: dict = {
        "project": {"key": project_key},
        "summary": summary[:255],  # Jira hard limit
        # _text_to_adf converts plain text to ADF — presumably defined later
        # in this module (outside the reviewed span); confirm.
        "description": _text_to_adf(description),
        "issuetype": {"name": issue_type},
        "priority": {"name": priority},
    }

    if labels:
        # Jira labels cannot have spaces — replace with hyphens
        fields["labels"] = [l.replace(" ", "-") for l in labels]

    if assignee_account_id:
        fields["assignee"] = {"accountId": assignee_account_id}

    body = {"fields": fields}

    try:
        data = await _post("/issue", body)
        issue_key = data["key"]
        return {
            "ok": True,
            "key": issue_key,
            "id": data["id"],
            "url": f"{JIRA_BASE_URL}/browse/{issue_key}",
        }
    except httpx.HTTPStatusError as e:
        # Surface Jira's structured validation errors when the body is JSON.
        error_body = {}
        try:
            error_body = e.response.json()
        except Exception:
            pass
        errors = error_body.get("errors", {})
        messages = error_body.get("errorMessages", [])
        return {
            "ok": False,
            "error": f"HTTP {e.response.status_code}",
            "details": errors or messages or e.response.text[:300],
        }
    except Exception as e:
        return {"ok": False, "error": str(e)}
|
||||
|
||||
|
||||
async def search_issues(jql: str, max_results: int = 10) -> list[dict]:
    """
    Search issues using JQL (Jira Query Language).
    Example JQL: 'project = ENG AND labels = thirdeye AND status != Done'

    Args:
        jql: JQL query string.
        max_results: Maximum number of issues to return.

    Returns:
        List of simplified issue dicts:
        {"key", "summary", "status", "priority", "assignee", "issue_type",
         "labels", "url"}.
    """
    data = await _get("/search/jql", params={
        "jql": jql,
        "maxResults": max_results,
        "fields": "summary,status,priority,assignee,issuetype,labels,created",
    })
    results = []
    for issue in data.get("issues", []):
        fields = issue.get("fields", {})
        # Jira returns an explicit JSON null for unset fields (e.g. priority,
        # assignee). `fields.get(x, {})` does NOT guard against that — the key
        # exists with value None — so use `(... or {})` consistently to avoid
        # calling .get on None. The original only guarded `assignee` this way.
        results.append({
            "key": issue["key"],
            "summary": fields.get("summary") or "",
            "status": (fields.get("status") or {}).get("name", "Unknown"),
            "priority": (fields.get("priority") or {}).get("name", "Medium"),
            "assignee": (fields.get("assignee") or {}).get("displayName", "Unassigned"),
            "issue_type": (fields.get("issuetype") or {}).get("name", "Task"),
            "labels": fields.get("labels") or [],
            "url": f"{JIRA_BASE_URL}/browse/{issue['key']}",
        })
    return results
|
||||
|
||||
|
||||
async def search_users(query: str, max_results: int = 10) -> list[dict]:
    """
    Search Jira users by display name or email fragment.

    Only active accounts are returned. Any error (network, auth, parsing)
    is logged and swallowed, yielding an empty list.

    Returns list of {"account_id", "display_name", "email", "active"}.
    """
    try:
        users = await _get("/user/search", params={"query": query, "maxResults": max_results})
        matches: list[dict] = []
        for user in users:
            if not user.get("active", True):
                continue  # skip deactivated accounts
            matches.append({
                "account_id": user.get("accountId", ""),
                "display_name": user.get("displayName", ""),
                "email": user.get("emailAddress", ""),
                "active": user.get("active", True),
            })
        return matches
    except Exception as e:
        logger.warning(f"User search failed for '{query}': {e}")
        return []
|
||||
|
||||
|
||||
async def assign_issue(issue_key: str, account_id: str) -> dict:
    """
    Assign a Jira issue to a user by their Jira account ID.

    Returns {"ok": True} on success or {"ok": False, "error": "..."}.
    """
    try:
        await _put(f"/issue/{issue_key}/assignee", {"accountId": account_id})
    except httpx.HTTPStatusError as e:
        # HTTP-level failure: surface status code plus a trimmed response body.
        return {"ok": False, "error": f"HTTP {e.response.status_code}: {e.response.text[:200]}"}
    except Exception as e:
        return {"ok": False, "error": str(e)}
    return {"ok": True}
|
||||
|
||||
|
||||
async def add_comment(issue_key: str, comment: str) -> dict:
    """Add a plain-text comment to an existing issue.

    The comment text is converted to ADF (Jira Cloud requires it).
    Returns {"ok": True, "id": ...} or {"ok": False, "error": "..."}.
    """
    try:
        payload = {"body": _text_to_adf(comment)}
        data = await _post(f"/issue/{issue_key}/comment", payload)
        return {"ok": True, "id": data.get("id")}
    except Exception as e:
        return {"ok": False, "error": str(e)}
|
||||
|
||||
|
||||
# ─── ADF helper ──────────────────────────────────────────────────────────────
|
||||
|
||||
def _text_to_adf(text: str) -> dict:
|
||||
"""
|
||||
Convert plain text to Atlassian Document Format (ADF).
|
||||
Jira Cloud requires ADF for description/comment fields (not plain strings).
|
||||
Splits on double newlines to create separate paragraphs.
|
||||
"""
|
||||
paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
|
||||
if not paragraphs:
|
||||
paragraphs = [text.strip() or "(no description)"]
|
||||
|
||||
content = []
|
||||
for para in paragraphs:
|
||||
# Handle bullet lines within a paragraph (lines starting with - or *)
|
||||
lines = para.split("\n")
|
||||
bullet_items = [l.lstrip("-* ").strip() for l in lines if l.strip().startswith(("-", "*", "•"))]
|
||||
non_bullets = [l for l in lines if not l.strip().startswith(("-", "*", "•"))]
|
||||
|
||||
if non_bullets:
|
||||
content.append({
|
||||
"type": "paragraph",
|
||||
"content": [{"type": "text", "text": " ".join(non_bullets)}],
|
||||
})
|
||||
|
||||
if bullet_items:
|
||||
content.append({
|
||||
"type": "bulletList",
|
||||
"content": [
|
||||
{
|
||||
"type": "listItem",
|
||||
"content": [
|
||||
{
|
||||
"type": "paragraph",
|
||||
"content": [{"type": "text", "text": item}],
|
||||
}
|
||||
],
|
||||
}
|
||||
for item in bullet_items
|
||||
],
|
||||
})
|
||||
|
||||
return {
|
||||
"type": "doc",
|
||||
"version": 1,
|
||||
"content": content or [
|
||||
{"type": "paragraph", "content": [{"type": "text", "text": "(no description)"}]}
|
||||
],
|
||||
}
|
||||
|
||||
288
thirdeye/backend/pipeline.py
Normal file
288
thirdeye/backend/pipeline.py
Normal file
@@ -0,0 +1,288 @@
|
||||
"""Core pipeline: message batch → signals → classified → stored → queryable."""
|
||||
import asyncio
|
||||
import logging
|
||||
from backend.agents.signal_extractor import extract_signals
|
||||
from backend.agents.classifier import classify_signal
|
||||
from backend.agents.context_detector import detect_context
|
||||
from backend.db.chroma import store_signals, query_signals
|
||||
from backend.db.models import Signal
|
||||
|
||||
logger = logging.getLogger("thirdeye.pipeline")
|
||||
|
||||
# In-memory group config store (replace with Redis/DB for production)
# Maps group_id -> {"lens": str, "confidence": float}; contents are lost on
# process restart, so lenses are re-detected after a redeploy.
_group_configs = {}
|
||||
|
||||
|
||||
async def detect_and_set_lens(group_id: str, messages_text: str) -> str:
    """Auto-detect the lens for a group from its initial messages.

    Runs the context detector, caches lens + confidence in the in-memory
    group config store, and returns the detected lens.
    """
    detection = await detect_context(messages_text)
    lens = detection["detected_lens"]
    confidence = detection["confidence"]
    _group_configs[group_id] = {"lens": lens, "confidence": confidence}
    logger.info(f"Group {group_id}: lens={lens} (conf={confidence})")
    return lens
|
||||
|
||||
|
||||
def get_lens(group_id: str) -> str:
    """Return the current lens for a group, defaulting to "dev" when unset."""
    return _group_configs.get(group_id, {}).get("lens", "dev")
|
||||
|
||||
|
||||
def set_lens(group_id: str, lens: str):
    """Manually override a group's lens (confidence pinned to 1.0)."""
    _group_configs[group_id] = dict(lens=lens, confidence=1.0)
|
||||
|
||||
async def _auto_raise_and_notify(group_id: str, signals: list[dict]):
    """
    Background task: raise Jira tickets for critical signals and log results.

    Called automatically when JIRA_AUTO_RAISE=true in .env.
    Does NOT send Telegram messages (no bot context here) — check logs or /jiraraised.

    Args:
        group_id: Telegram group ID the signals came from.
        signals: Signal dicts to consider for ticket creation.
    """
    # `logging` is already imported at module scope — the redundant in-function
    # import was removed. Use a dedicated child logger so auto-raise activity
    # can be filtered separately, and reuse it in the except branch instead of
    # constructing it a second time.
    task_logger = logging.getLogger("thirdeye.pipeline.auto_raise")

    try:
        from backend.agents.jira_agent import bulk_raise_for_group
        results = await bulk_raise_for_group(
            group_id=group_id,
            signals=signals,
            min_severity="high",
            max_tickets=5,
        )
        raised = [r for r in results if r.get("ok")]
        if raised:
            task_logger.info(
                f"[Auto-raise] Group {group_id}: {len(raised)} ticket(s) raised — "
                + ", ".join(r.get("key", "?") for r in raised)
            )
    except Exception as e:
        task_logger.error(f"Auto-raise failed: {e}")
|
||||
|
||||
|
||||
async def process_message_batch(group_id: str, messages: list[dict]) -> list[Signal]:
    """
    Process a batch of messages through the full pipeline.

    Flow: format for the LLM → (auto-)detect lens → extract signals →
    classify in parallel → store in ChromaDB → optionally fire-and-forget
    Jira auto-raise for high/critical signals.

    Args:
        group_id: Telegram group ID
        messages: List of {"sender": str, "text": str, "timestamp": str}

    Returns:
        List of stored Signal objects
    """
    # Format messages for the LLM
    formatted = "\n".join(f"[{m['sender']}]: {m['text']}" for m in messages)

    # Get or detect lens
    lens = get_lens(group_id)
    if lens == "dev" and group_id not in _group_configs:
        # First time seeing this group — auto-detect
        lens = await detect_and_set_lens(group_id, formatted)

    # Step 1: Extract signals
    signals = await extract_signals(formatted, group_id, lens=lens)

    if not signals:
        logger.info(f"No signals extracted from batch in {group_id}")
        return []

    # Step 2: Classify each signal (parallel for speed)
    classified_signals = await asyncio.gather(*[classify_signal(s) for s in signals])

    # Step 3: Store in ChromaDB
    store_signals(group_id, [s.model_dump() for s in classified_signals])

    # Step 4: Auto-raise Jira tickets for critical signals (fire-and-forget).
    from backend.config import JIRA_AUTO_RAISE, ENABLE_JIRA

    if ENABLE_JIRA and JIRA_AUTO_RAISE:
        # Fix: signals here are pydantic Signal models, not dicts — the old
        # `s.get("severity", "low")` raised AttributeError, and
        # _auto_raise_and_notify expects plain dicts. Filter via getattr
        # (defaulting to "low" if the model lacks a severity field — confirm
        # against the Signal schema) and hand over model dumps.
        critical_signals = [
            s.model_dump() for s in classified_signals
            if getattr(s, "severity", "low") in ("high", "critical")
        ]
        if critical_signals:
            asyncio.create_task(
                _auto_raise_and_notify(group_id, critical_signals)
            )

    logger.info(f"Pipeline complete: {len(classified_signals)} signals stored for {group_id}")
    return classified_signals
|
||||
|
||||
|
||||
async def query_knowledge(group_id: str, question: str, force_web_search: bool = False) -> str:
    """
    Query the knowledge base with natural language, with cross-group fallback and
    conservative web search (only when all internal sources fail).

    Flow:
    1. Search this group's knowledge base (ChromaDB)
    2. If results are weak, also search all other groups (cross-group fallback)
    3. Only hit the web if no internal knowledge is found AND question is clearly external
    4. LLM synthesizes the best available context into a final answer

    Args:
        group_id: Telegram group whose collection is searched first.
        question: Natural-language question.
        force_web_search: When True, web search runs regardless of internal hits.

    Returns:
        A plain-text answer string (always ends with a "Sources:" footer on
        success), or a fallback message when nothing is found or the LLM fails.
    """
    # Imports are local to keep module import cheap and avoid cycles.
    from backend.providers import call_llm
    from backend.agents.web_search import search_web, format_search_results_for_llm
    from backend.config import ENABLE_WEB_SEARCH
    from backend.db.chroma import query_signals_global

    # ── Step 1: search this group's own collection ──────────────────────────────
    results = query_signals(group_id, question, n_results=8)

    # A result is "strong" when the top hit has high semantic similarity (≥ 0.40)
    STRONG_THRESHOLD = 0.40
    has_strong_local = bool(results) and results[0].get("relevance_score", 0) >= STRONG_THRESHOLD

    # ── Step 2: cross-group fallback ────────────────────────────────────────────
    cross_results = []
    if not has_strong_local:
        cross_results = query_signals_global(question, n_results=8, exclude_group_id=group_id)

    # Combine: local results first, then cross-group ones (de-duplicated by document text)
    seen_docs = {r["document"] for r in results}
    for cr in cross_results:
        if cr["document"] not in seen_docs:
            results.append(cr)
            seen_docs.add(cr["document"])

    # ── Recency re-ranking ───────────────────────────────────────────────────────
    # Boost signals that are recent so a fresh update beats an older one on the
    # same topic. Boost is +0.4 for brand-new and decays exponentially with a
    # half-life of ~33 hours (time constant 48 h), i.e. near zero after ~a week.
    from datetime import datetime, timezone
    import math
    now = datetime.now(timezone.utc)

    def _recency_boost(ts_str: str) -> float:
        # Unparseable or missing timestamps get no boost rather than an error.
        try:
            ts = datetime.fromisoformat(ts_str)
            if ts.tzinfo is None:
                ts = ts.replace(tzinfo=timezone.utc)
            age_hours = max(0, (now - ts).total_seconds() / 3600)
            return 0.4 * math.exp(-age_hours / 48)  # half-life ≈ 33 hours
        except Exception:
            return 0.0

    # NOTE(review): this mutates the result dicts in place by adding
    # "_ranked_score" — downstream consumers of these dicts will see the key.
    for r in results:
        ts = r.get("metadata", {}).get("timestamp", "")
        r["_ranked_score"] = r.get("relevance_score", 0) + _recency_boost(ts)

    results.sort(key=lambda x: x["_ranked_score"], reverse=True)

    # Re-evaluate strength after combining and re-ranking.
    # NOTE(review): this checks the raw relevance_score of whichever result is
    # now ranked first (post-boost) — a recency-boosted top hit with a weak raw
    # score still counts as "no internal knowledge". Confirm that's intended.
    has_any_internal = bool(results) and results[0].get("relevance_score", 0) >= STRONG_THRESHOLD

    # ── Build internal context ───────────────────────────────────────────────────
    # Results are already sorted by (relevance + recency). The first result is the
    # best match. We label it explicitly so even small fallback models can't miss it.
    from backend.agents.query_agent import _format_signal_for_context, VOICE_CITATION_INSTRUCTION

    internal_context = ""
    has_voice_signals = False
    if results:
        context_parts = []
        for i, r in enumerate(results):
            meta = r["metadata"]
            # NOTE(review): source_group is assigned but never used below.
            source_group = r.get("source_group_id")

            # Voice-derived signals trigger the extra citation instruction later.
            if meta.get("source") == "voice" or meta.get("type") == "voice_transcript":
                has_voice_signals = True

            # Rich source label using voice-aware formatter
            signal_label = _format_signal_for_context(r)

            rank_header = (
                "*** BEST MATCH (use this as your primary answer) ***\n"
                if i == 0 else
                f"(supporting context {i+1})\n"
            )
            context_parts.append(
                f"{rank_header}"
                f"{signal_label}\n"
                f"Content: {r['document']}\n"
                f"Entities: {meta.get('entities', '[]')}"
            )
        internal_context = "\n\n---\n\n".join(context_parts)

    # ── Step 3: web search — only when all internal sources fail ────────────────
    # Only keywords that are clearly external / internet-specific trigger web search.
    # Intentionally excludes personal/team words like "update", "current", "what is".
    web_keywords = [
        "latest news", "industry standard", "best practice", "benchmark",
        "security vulnerability", "cve", "public release", "changelog",
        "documentation for", "how to install", "npm package", "pypi",
    ]
    question_lower = question.lower()
    wants_external = any(kw in question_lower for kw in web_keywords)

    # Web search fires only when: explicitly forced, OR no internal knowledge at all
    # AND the question looks like it's asking about something external.
    should_search_web = ENABLE_WEB_SEARCH and (
        force_web_search
        or (not has_any_internal and wants_external)
    )

    web_context = ""
    used_web = False
    if should_search_web:
        web_results = await search_web(question, max_results=3)
        if web_results:
            web_context = format_search_results_for_llm(web_results)
            used_web = True

    # ── Step 4: build combined prompt ───────────────────────────────────────────
    if not internal_context and not web_context:
        return (
            "I don't have any information about that yet across all team chats. "
            "The relevant group may need more conversation, or try /search for external info."
        )

    combined_context = ""
    if internal_context:
        combined_context += (
            "=== INTERNAL KNOWLEDGE BASE (from team conversations & documents) ===\n\n"
            f"{internal_context}\n\n"
        )
    if web_context:
        combined_context += f"=== WEB SEARCH RESULTS ===\n\n{web_context}\n\n"

    system_prompt = """You are the Query Agent for ThirdEye. Answer the question using the context below.

The context is sorted: the BEST MATCH signal appears first and is your primary source.
Older or supporting signals appear after it — they may be outdated, so prefer the BEST MATCH.

RULES:
- Answer from the BEST MATCH signal first. Only use other signals as supporting context.
- Quote exact numbers, dates, and durations directly — never paraphrase them.
- If a signal has a "Quote:" field, that is the verbatim team message — treat it as ground truth.
- Signals from "other group" are still internal team knowledge.
- Be concise (2-3 sentences). Plain text only, no markdown headers."""

    if has_voice_signals:
        system_prompt += VOICE_CITATION_INSTRUCTION

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Context:\n\n{combined_context}\n\nQuestion: {question}"},
    ]

    try:
        result = await call_llm("fast_large", messages, temperature=0.3, max_tokens=600)
        answer = result["content"]

        # Append a footer naming which source pools contributed context.
        sources = []
        if internal_context:
            sources.append("knowledge base")
        if used_web:
            sources.append("web search")
        answer += f"\n\n📌 Sources: {' + '.join(sources)}"

        return answer
    except Exception as e:
        logger.error(f"Query agent failed: {e}")
        return "Sorry, I encountered an error while searching. Please try again."
|
||||
|
||||
|
||||
177
thirdeye/backend/providers.py
Normal file
177
thirdeye/backend/providers.py
Normal file
@@ -0,0 +1,177 @@
|
||||
"""Multi-provider LLM router with automatic fallback on rate limits.
|
||||
|
||||
Groq pool: up to 3 API keys (GROQ_API_KEY, GROQ_API_KEY_2, GROQ_API_KEY_3) all running
|
||||
llama-3.3-70b-versatile. Calls are round-robined across the pool so the per-key rate
|
||||
limit is shared evenly. When a key is rate-limited the router falls through to the next
|
||||
key in rotation, then to the rest of the fallback chain.
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from openai import AsyncOpenAI
|
||||
from backend.config import (
|
||||
GROQ_API_KEY, GROQ_API_KEY_2, GROQ_API_KEY_3,
|
||||
CEREBRAS_API_KEY, SAMBANOVA_API_KEY,
|
||||
OPENROUTER_API_KEY, GEMINI_API_KEY,
|
||||
OLLAMA_BASE_URL, OLLAMA_ENABLED,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("thirdeye.providers")
|
||||
|
||||
# ── Client registry ──────────────────────────────────────────────────────────
# Maps provider name → configured AsyncOpenAI client. Populated below at
# import time; providers without a usable API key are simply absent.
_clients: dict[str, AsyncOpenAI] = {}
|
||||
|
||||
def _init_client(name: str, base_url: str, api_key: str | None):
    """Register an AsyncOpenAI client under *name* when a plausible key is set.

    Missing/empty keys and keys of 5 characters or fewer are treated as unset,
    so placeholder values never create broken clients.
    """
    if not api_key or len(api_key) <= 5:
        return
    _clients[name] = AsyncOpenAI(base_url=base_url, api_key=api_key)
|
||||
|
||||
# Ollama (local) — uses a dummy key; the OpenAI client requires a non-empty value
if OLLAMA_ENABLED:
    _clients["ollama"] = AsyncOpenAI(base_url=OLLAMA_BASE_URL, api_key="ollama")

# Groq pool: register each key under its own name
_init_client("groq", "https://api.groq.com/openai/v1", GROQ_API_KEY)
_init_client("groq_2", "https://api.groq.com/openai/v1", GROQ_API_KEY_2)
_init_client("groq_3", "https://api.groq.com/openai/v1", GROQ_API_KEY_3)

# Remaining hosted providers; each only registers when its key is configured.
_init_client("cerebras", "https://api.cerebras.ai/v1", CEREBRAS_API_KEY)
_init_client("sambanova", "https://api.sambanova.ai/v1", SAMBANOVA_API_KEY)
_init_client("openrouter", "https://openrouter.ai/api/v1", OPENROUTER_API_KEY)
_init_client("google", "https://generativelanguage.googleapis.com/v1beta/openai/", GEMINI_API_KEY)

# Which provider names belong to the Groq pool (only those actually registered)
_GROQ_POOL = [name for name in ("groq", "groq_2", "groq_3") if name in _clients]
_GROQ_MODEL = "llama-3.3-70b-versatile"

# Round-robin cursor per task_type (incremented after every call attempt on the pool)
_rr_cursor: dict[str, int] = defaultdict(int)

# ── Model registry ───────────────────────────────────────────────────────────
# Groq pool entries are expanded dynamically at call time so the cursor drives order.
# The sentinel value _GROQ_POOL_SENTINEL in a registry entry means
# "use all available Groq keys" (expanded by _expand_candidates).
_GROQ_POOL_SENTINEL = "__groq_pool__"
|
||||
|
||||
# task_type → ordered fallback chain of (provider_name, model_id). The chain
# for every task additionally gets the "fallback" tier appended at call time.
MODEL_REGISTRY: dict[str, list[tuple[str, str]]] = {
    # Small/low-latency models (e.g. classification-style calls).
    "fast_small": [
        ("ollama", "llama3:8b"),
        ("groq", "llama-3.1-8b-instant"),
        ("cerebras", "llama-3.1-8b"),
        ("openrouter", "openai/gpt-oss-20b:free"),
    ],
    # Larger fast models; the Groq pool is tried first.
    "fast_large": [
        (_GROQ_POOL_SENTINEL, _GROQ_MODEL),  # expands to all 3 Groq keys (round-robin)
        ("openrouter", "arcee-ai/trinity-large-preview:free"),
        ("openrouter", "meta-llama/llama-3.3-70b-instruct:free"),
        ("sambanova", "Meta-Llama-3.3-70B-Instruct"),
        ("cerebras", "llama3.1-8b"),
    ],
    # Slower reasoning-oriented models.
    "reasoning": [
        ("sambanova", "DeepSeek-R1-Distill-Llama-70B"),
        ("openrouter", "nvidia/nemotron-3-super-120b-a12b:free"),
        ("openrouter", "openai/gpt-oss-120b:free"),
    ],
    # Models used for tool-calling / agent loops.
    "agentic": [
        ("openrouter", "minimax/minimax-m2.5:free"),
        ("openrouter", "nvidia/nemotron-3-super-120b-a12b:free"),
        (_GROQ_POOL_SENTINEL, _GROQ_MODEL),
    ],
    # Last-resort tier appended to every task_type by _expand_candidates.
    "fallback": [
        ("openrouter", "openrouter/free"),
        ("google", "gemini-2.5-flash"),
    ],
}
|
||||
|
||||
|
||||
def _expand_candidates(task_type: str) -> list[tuple[str, str]]:
    """
    Build the ordered (provider_name, model) candidate list for *task_type*.

    The task's registry tier is taken first, then the shared "fallback" tier.
    Any Groq-pool sentinel entry is replaced in place by all available Groq
    keys, rotated so the key at the current round-robin cursor comes first.
    """
    candidates: list[tuple[str, str]] = []
    for provider, model in MODEL_REGISTRY.get(task_type, []) + MODEL_REGISTRY["fallback"]:
        if provider == _GROQ_POOL_SENTINEL:
            if _GROQ_POOL:
                # Rotate the pool: cursor position first, then wrap around.
                offset = _rr_cursor[task_type] % len(_GROQ_POOL)
                rotation = _GROQ_POOL[offset:] + _GROQ_POOL[:offset]
                candidates.extend((key_name, model) for key_name in rotation)
        else:
            candidates.append((provider, model))
    return candidates
|
||||
|
||||
|
||||
# ── Public API ────────────────────────────────────────────────────────────────
|
||||
|
||||
async def call_llm(
    task_type: str,
    messages: list,
    temperature: float = 0.3,
    max_tokens: int = 2000,
    response_format: dict | None = None,  # fixed annotation: default is None
) -> dict:
    """
    Route to the best available provider with automatic fallback.

    Candidates from _expand_candidates(task_type) are tried in order.
    Rate-limit/timeout failures fall through to the next candidate; a Groq-pool
    key additionally advances the round-robin cursor (on success AND on
    rate-limit) so the next call starts on a different key.

    Args:
        task_type: Registry tier name ("fast_small", "fast_large", ...).
        messages: Chat messages in OpenAI format.
        temperature: Sampling temperature.
        max_tokens: Completion token cap.
        response_format: Optional OpenAI response_format dict; not forwarded to
            providers that don't support it (currently "google").

    Returns:
        {"content": str, "provider": str, "model": str}

    Raises:
        RuntimeError: When every candidate provider fails (previously a bare
            Exception; RuntimeError is still caught by callers'
            ``except Exception`` handlers).
    """
    candidates = _expand_candidates(task_type)
    errors = []

    for provider_name, model_id in candidates:
        client = _clients.get(provider_name)
        if not client:
            # Provider had no API key configured — skip silently.
            continue

        try:
            kwargs = {
                "model": model_id,
                "messages": messages,
                "temperature": temperature,
                "max_tokens": max_tokens,
                "timeout": 45,
            }
            if response_format and provider_name not in ("google",):
                kwargs["response_format"] = response_format

            response = await client.chat.completions.create(**kwargs)
            content = response.choices[0].message.content

            # Advance round-robin cursor on success so next call starts from the
            # following key, distributing load evenly across the pool.
            if provider_name in _GROQ_POOL:
                _rr_cursor[task_type] = (_rr_cursor[task_type] + 1) % len(_GROQ_POOL)

            # Collapse extra pool keys to a readable "groq[keyN]" label.
            display_name = provider_name if provider_name not in ("groq_2", "groq_3") else f"groq[key{provider_name[-1]}]"
            logger.info(f"LLM call success: {display_name}/{model_id} ({task_type})")
            return {
                "content": content,
                "provider": display_name,
                "model": model_id,
            }

        except Exception as e:
            # Heuristic classification by error text: rate limits and timeouts
            # are transient, so they only warn; anything else logs as an error.
            err = str(e).lower()
            is_rate_limit = any(k in err for k in ["429", "rate", "quota", "limit", "exceeded", "capacity"])
            is_timeout = "timeout" in err or "timed out" in err

            if is_rate_limit or is_timeout:
                logger.warning(f"Provider {provider_name}/{model_id} unavailable: {type(e).__name__}")
                errors.append(f"{provider_name}: rate limited")
                # Also advance cursor so the next call won't start on this key
                if provider_name in _GROQ_POOL:
                    _rr_cursor[task_type] = (_rr_cursor[task_type] + 1) % len(_GROQ_POOL)
            else:
                logger.error(f"Provider {provider_name}/{model_id} error: {e}")
                errors.append(f"{provider_name}: {e}")
            continue

    raise RuntimeError(f"All LLM providers exhausted. Errors: {errors}")
|
||||
Reference in New Issue
Block a user