This commit is contained in:
2026-04-05 00:43:23 +05:30
commit 8be37d3e92
425 changed files with 101853 additions and 0 deletions

View File

@@ -0,0 +1,34 @@
"""Classifier Agent — adds metadata tags to extracted signals."""
import logging
from backend.providers import call_llm
from backend.db.models import Signal
from backend.agents.json_utils import extract_json_object
logger = logging.getLogger("thirdeye.agents.classifier")
SYSTEM_PROMPT = """You are a fast metadata classifier. Given an extracted signal, add classification tags.
Respond ONLY with valid JSON (no markdown, no backticks):
{"sentiment": "positive|neutral|negative|urgent", "urgency": "none|low|medium|high|critical", "keywords": ["3-5 searchable keywords"]}
"""
async def classify_signal(signal: Signal) -> Signal:
    """Attach classification metadata (sentiment, urgency, keywords) to a signal.

    Mutates *signal* in place and returns it. Best-effort: on any LLM or
    parse failure the signal keeps whatever values it already has.
    """
    prompt_messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": (
                f"Classify this signal:\nType: {signal.type}\n"
                f"Summary: {signal.summary}\nQuote: {signal.raw_quote}"
            ),
        },
    ]
    try:
        llm_response = await call_llm("fast_small", prompt_messages, temperature=0.1, max_tokens=200)
        tags = extract_json_object(llm_response.get("content", ""))
        signal.sentiment = tags.get("sentiment", signal.sentiment)
        signal.urgency = tags.get("urgency", signal.urgency)
        signal.keywords = tags.get("keywords", signal.keywords)
    except Exception as e:
        # Classification is non-fatal: log and fall through with existing values.
        logger.warning(f"Classification failed, using defaults: {e}")
    return signal

View File

@@ -0,0 +1,107 @@
"""Context Detector Agent — auto-classifies group type from messages."""
import logging
from backend.providers import call_llm
from backend.agents.json_utils import extract_json_object
logger = logging.getLogger("thirdeye.agents.context_detector")
SYSTEM_PROMPT = """You analyze a batch of messages from a Telegram group and determine what TYPE of group this is.
CLASSIFY into exactly ONE:
- "dev" — Software engineering team (code, PRs, deployments, bugs, tech stack)
- "product" — Product/business team (features, users, metrics, roadmap, competitors)
- "client" — Client/agency channel (deliverables, timelines, approvals, invoices)
- "community" — Community/interest group (recommendations, events, local info, casual)
Respond ONLY with valid JSON (no markdown, no backticks):
{"detected_lens": "dev|product|client|community", "confidence": 0.0-1.0, "evidence": ["signal1", "signal2", "signal3"]}
"""
# Allowed group-type classifications; any other LLM answer falls back to "dev".
VALID_LENSES = {"dev", "product", "client", "community"}
def _heuristic_detect_context(messages_text: str) -> dict:
"""Rule-based fallback when LLM output is malformed/unavailable."""
text = (messages_text or "").lower()
lens_keywords = {
"dev": [
"bug", "deploy", "deployment", "api", "database", "schema", "postgres", "mongo",
"timeout", "endpoint", "pod", "pr", "code", "docker", "stack", "integration",
],
"product": [
"feature", "roadmap", "user", "users", "client", "customers", "complain", "pain",
"prioritize", "priority", "enterprise", "competitor", "demo", "sso", "dark mode",
"mobile", "stability", "integration",
],
"client": [
"invoice", "deadline", "deliverable", "approval", "sign-off", "scope", "payment",
"contract", "proposal", "timeline", "meeting",
],
"community": [
"event", "meetup", "recommend", "anyone", "community", "local", "where can i",
"suggestion", "friends", "weekend",
],
}
scores = {
lens: sum(text.count(keyword) for keyword in keywords)
for lens, keywords in lens_keywords.items()
}
best_lens = max(scores, key=scores.get)
best_score = scores[best_lens]
if best_score == 0:
best_lens = "dev"
evidence = [k for k in lens_keywords[best_lens] if k in text][:3]
confidence = min(0.95, 0.35 + 0.08 * best_score) if best_score > 0 else 0.0
return {
"detected_lens": best_lens,
"confidence": round(confidence, 2),
"evidence": evidence or ["heuristic_fallback"],
}
async def detect_context(messages_text: str) -> dict:
    """Detect group type from a batch of messages.

    Asks the LLM to classify the group, then sanitizes its answer (lens must
    be in VALID_LENSES, confidence clamped to [0, 1], evidence capped at 5).
    Any failure falls back to the keyword heuristic, tagged "detection_failed".
    """
    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Classify this group based on these messages:\n\n{messages_text}"},
    ]
    try:
        response = await call_llm(
            "fast_large",
            chat,
            temperature=0.1,
            max_tokens=300,
            response_format={"type": "json_object"},
        )
        payload = extract_json_object(response.get("content", ""))
        lens = str(payload.get("detected_lens", "dev")).strip().lower()
        lens = lens if lens in VALID_LENSES else "dev"
        raw_confidence = payload.get("confidence", 0.5)
        try:
            raw_confidence = float(raw_confidence)
        except (TypeError, ValueError):
            raw_confidence = 0.5
        raw_evidence = payload.get("evidence", [])
        if not isinstance(raw_evidence, list):
            raw_evidence = [str(raw_evidence)]
        return {
            "detected_lens": lens,
            "confidence": min(1.0, max(0.0, raw_confidence)),
            "evidence": [str(item) for item in raw_evidence][:5],
        }
    except Exception as e:
        logger.error(f"Context detection failed: {e}")
        fallback = _heuristic_detect_context(messages_text)
        fallback["evidence"] = fallback["evidence"] + ["detection_failed"]
        return fallback

View File

@@ -0,0 +1,287 @@
"""Cross-Group Analyst Agent — detects blind spots between multiple teams."""
import logging
from backend.providers import call_llm
from backend.db.chroma import get_all_signals, get_group_ids
from backend.db.models import CrossGroupInsight
from backend.agents.json_utils import extract_json_object
logger = logging.getLogger("thirdeye.agents.cross_group_analyst")
SYSTEM_PROMPT = """You are the Cross-Group Intelligence Analyst for ThirdEye. This is the MOST IMPORTANT analysis.
You receive intelligence summaries from MULTIPLE Telegram groups. Your job is to find BLIND SPOTS — information in one group that should be in another.
Detect:
- blocked_handoff: Team A waiting for something from Team B, but Team B doesn't know
- conflicting_decision: Team A decided X, Team B decided the opposite
- information_silo: Critical info in Group A never reached Group B
- promise_reality_gap: Promise made in one group, but another group shows it's blocked
- duplicated_effort: Two teams working on similar things unknowingly
Respond ONLY with valid JSON (no markdown):
{"insights": [{"type": "insight_type", "description": "SPECIFIC description naming the groups, people, and topics", "group_a": {"name": "group_name", "evidence": "what was said"}, "group_b": {"name": "group_name", "evidence": "what was said or NOT said"}, "severity": "warning|critical", "recommendation": "Specific action"}]}
If no cross-group issues: {"insights": []}
Be SPECIFIC. Name the groups, people, topics, and exact conflicts."""
# Signal-type groupings used by the information-silo heuristic below.
_OPERATIONAL_RISK_TYPES = {"recurring_bug", "workaround", "tech_debt", "deployment_risk"}
_PLANNING_FOCUS_TYPES = {
    "feature_request",
    "roadmap_drift",
    "priority_conflict",
    "user_pain_point",
}


def _silo_insight(risk_group, risk_types, plan_group, plan_types):
    """Build an information_silo insight when *risk_group* carries operational
    risk signal types while *plan_group* is planning-focused.

    Returns a CrossGroupInsight, or None when the pattern is absent.
    """
    risk_hits = risk_types.intersection(_OPERATIONAL_RISK_TYPES)
    plan_hits = plan_types.intersection(_PLANNING_FOCUS_TYPES)
    if not (risk_hits and plan_hits):
        return None
    return CrossGroupInsight(
        type="information_silo",
        description=(
            f"{risk_group} shows operational risk signals while {plan_group} is focused on planning/user demands, "
            "suggesting risk context is not shared across groups."
        ),
        group_a={
            "name": risk_group,
            "evidence": f"Operational risk signal types: {sorted(risk_hits)}",
        },
        group_b={
            "name": plan_group,
            "evidence": f"Planning-focused signal types: {sorted(plan_hits)}",
        },
        severity="warning",
        recommendation="Add a weekly cross-functional risk sync so product planning reflects current engineering constraints.",
    )


def _heuristic_cross_group_insights(
    group_summaries: dict[str, list[dict]],
) -> list[CrossGroupInsight]:
    """Generate best-effort cross-group insights when LLM output is unavailable.

    Scans every pair of groups for three patterns:
      - blocked_handoff: one group waits on design/spec input the other never mentions
      - promise_reality_gap: one group promises delivery while the other reports blockers
      - information_silo: operational-risk signal types in one group vs. pure
        planning focus in the other (checked in both directions)

    Results are deduplicated by (type, group_a name, group_b name) and capped
    at five insights.
    """
    insights: list[CrossGroupInsight] = []
    # Pre-compute a lower-cased text blob and signal-type list per group.
    normalized = {}
    for group_name, signals in group_summaries.items():
        docs = [str(s.get("document", "")) for s in signals]
        signal_types = [
            str(s.get("metadata", {}).get("type", "unknown")).lower() for s in signals
        ]
        normalized[group_name] = {
            "text": " ".join(docs).lower(),
            "signals": signals,
            "types": signal_types,
        }
    group_names = list(normalized.keys())
    for i in range(len(group_names)):
        for j in range(i + 1, len(group_names)):
            group_a = group_names[i]
            group_b = group_names[j]
            text_a = normalized[group_a]["text"]
            text_b = normalized[group_b]["text"]
            types_a = set(normalized[group_a]["types"])
            types_b = set(normalized[group_b]["types"])
            # Detect a likely blocked handoff around design/spec dependencies.
            a_waiting = any(
                k in text_a for k in ["waiting", "blocked", "design spec", "specs"]
            )
            b_mentions_specs = any(
                k in text_b for k in ["design spec", "specs", "design"]
            )
            if a_waiting and not b_mentions_specs:
                insights.append(
                    CrossGroupInsight(
                        type="blocked_handoff",
                        description=(
                            f"{group_a} indicates dependency blockage (design/spec inputs), "
                            f"but {group_b} has no corresponding discussion of that dependency."
                        ),
                        group_a={
                            "name": group_a,
                            "evidence": "Contains waiting/blocked language tied to specs or design dependency.",
                        },
                        group_b={
                            "name": group_b,
                            "evidence": "No clear mention of design specs/dependency handoff in available signals.",
                        },
                        severity="warning",
                        recommendation=(
                            f"Create a shared handoff item between {group_a} and {group_b} for design/spec ownership "
                            "with an explicit due date."
                        ),
                    )
                )
            # Detect likely promise vs execution mismatch.
            b_promises = any(
                k in text_b
                for k in ["demo", "friday", "promised", "told the client", "ready by"]
            )
            a_blocked = any(
                k in text_a
                for k in ["blocked", "waiting", "can't proceed", "cannot proceed"]
            )
            if b_promises and a_blocked:
                insights.append(
                    CrossGroupInsight(
                        type="promise_reality_gap",
                        description=(
                            f"{group_b} signals delivery promises while {group_a} reports blockers that may prevent those commitments."
                        ),
                        group_a={
                            "name": group_a,
                            "evidence": "Signals include active blockers/waiting dependencies.",
                        },
                        group_b={
                            "name": group_b,
                            "evidence": "Signals include explicit client/demo commitments and timelines.",
                        },
                        severity="critical",
                        recommendation="Run a joint risk review and re-baseline commitments before the next client update.",
                    )
                )
            # Type-based silo detection when lexical cues are weak — both directions.
            for silo in (
                _silo_insight(group_a, types_a, group_b, types_b),
                _silo_insight(group_b, types_b, group_a, types_a),
            ):
                if silo is not None:
                    insights.append(silo)
    # Deduplicate by (type, group_a, group_b) and cap the output.
    deduped = []
    seen_keys = set()
    for insight in insights:
        key = (insight.type, insight.group_a.get("name"), insight.group_b.get("name"))
        if key in seen_keys:
            continue
        seen_keys.add(key)
        deduped.append(insight)
    return deduped[:5]
async def analyze_cross_group(
    group_summaries: dict[str, list[dict]] | None = None,
) -> list[CrossGroupInsight]:
    """
    Analyze intelligence across all monitored groups to find blind spots.

    Args:
        group_summaries: Optional pre-built summaries keyed by group name, each
            a list of {"document": str, "metadata": dict} signal dicts.
            If None, loads from ChromaDB.

    Returns:
        List of CrossGroupInsight items (possibly empty). Falls back to the
        keyword heuristic when the LLM call or its JSON output fails.
    """
    if group_summaries is None:
        group_ids = get_group_ids()
        if len(group_ids) < 2:
            logger.info("Need at least 2 groups for cross-group analysis")
            return []
        group_summaries = {gid: get_all_signals(gid) for gid in group_ids}
    if len(group_summaries) < 2:
        return []
    # Format summaries for the LLM
    summary_parts = []
    for group_name, signals in group_summaries.items():
        signal_lines = []
        for s in signals[:30]:  # Limit per group to fit context
            # Tolerate signals missing "metadata"/"document" keys rather than
            # crashing before the fallback path can run.
            meta = s.get("metadata", {})
            doc = str(s.get("document", ""))
            signal_lines.append(f" - [{meta.get('type', '?')}] {doc[:120]}")
        summary_parts.append(
            f"=== GROUP: {group_name} ({len(signals)} total signals) ===\n"
            + "\n".join(signal_lines)
        )
    full_summary = "\n\n".join(summary_parts)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"Analyze cross-group intelligence:\n\n{full_summary}",
        },
    ]
    result = None  # kept visible to the except block for raw-head logging
    try:
        result = await call_llm(
            "reasoning",
            messages,
            temperature=0.2,
            max_tokens=2000,
            response_format={"type": "json_object"},
        )
        parsed = extract_json_object(result.get("content", ""))
        insights = [
            CrossGroupInsight(
                type=i.get("type", "unknown"),
                description=i.get("description", ""),
                group_a=i.get("group_a", {}),
                group_b=i.get("group_b", {}),
                severity=i.get("severity", "warning"),
                recommendation=i.get("recommendation", ""),
            )
            for i in parsed.get("insights", [])
        ]
        logger.info(f"Cross-group analysis found {len(insights)} insights")
        return insights
    except Exception as e:
        raw = ""
        if isinstance(result, dict):
            raw = str(result.get("content", ""))[:300].replace("\n", " ")
        logger.info(f"Cross-group LLM parse issue, using fallback: {e}; raw_head={raw}")
        fallback = _heuristic_cross_group_insights(group_summaries)
        if fallback:
            logger.info(
                f"Cross-group heuristic fallback produced {len(fallback)} insights"
            )
        return fallback

View File

@@ -0,0 +1,200 @@
"""Document Ingestor — extracts text from PDFs, DOCX, TXT and chunks for RAG storage."""
import os
import logging
import uuid
from datetime import datetime
logger = logging.getLogger("thirdeye.agents.document_ingestor")
# --- Text Extraction ---
def extract_text_from_pdf(file_path: str) -> list[dict]:
    """Extract text from PDF, returns list of {page: int, text: str}.

    Pages with no extractable text are skipped. On an extraction error,
    returns whatever pages were collected before the failure (possibly none).
    """
    from PyPDF2 import PdfReader

    collected: list[dict] = []
    try:
        for page_number, page in enumerate(PdfReader(file_path).pages, start=1):
            raw = page.extract_text()
            if raw and raw.strip():
                collected.append({"page": page_number, "text": raw.strip()})
    except Exception as e:
        logger.error(f"PDF extraction failed for {file_path}: {e}")
    return collected
def extract_text_from_docx(file_path: str) -> list[dict]:
    """Extract text from DOCX, returns list of {page: 1, text: str} (DOCX has no real pages)."""
    from docx import Document

    try:
        # Join non-blank paragraphs into one "page" of text.
        paragraphs = [p.text for p in Document(file_path).paragraphs if p.text.strip()]
        joined = "\n".join(paragraphs)
        if joined.strip():
            return [{"page": 1, "text": joined.strip()}]
    except Exception as e:
        logger.error(f"DOCX extraction failed for {file_path}: {e}")
    return []
def extract_text_from_txt(file_path: str) -> list[dict]:
    """Extract text from plain text file.

    Returns [{"page": 1, "text": ...}] or [] when the file is empty/unreadable.
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
            body = handle.read().strip()
    except Exception as e:
        logger.error(f"TXT extraction failed for {file_path}: {e}")
        return []
    return [{"page": 1, "text": body}] if body else []
# Dispatch table: lower-cased file extension (with dot) → extractor function.
# extract_text() routes through this; add new entries to support more formats.
EXTRACTORS = {
    ".pdf": extract_text_from_pdf,
    ".docx": extract_text_from_docx,
    ".txt": extract_text_from_txt,
    # Plain-text-like formats reuse the TXT extractor.
    ".md": extract_text_from_txt,
    ".csv": extract_text_from_txt,
    ".json": extract_text_from_txt,
    ".log": extract_text_from_txt,
}
def extract_text(file_path: str) -> list[dict]:
    """Route to correct extractor based on file extension.

    Returns [] (with a warning logged) for unsupported extensions.
    """
    extension = os.path.splitext(file_path)[1].lower()
    try:
        handler = EXTRACTORS[extension]
    except KeyError:
        logger.warning(f"Unsupported file type: {extension} ({file_path})")
        return []
    return handler(file_path)
# --- Chunking ---
def chunk_text(text: str, max_chars: int = 1500, overlap_chars: int = 200) -> list[str]:
    """
    Split text into overlapping chunks.

    Uses paragraph boundaries when possible, falls back to sentence boundaries,
    then hard character splits. ~1500 chars ≈ ~375 tokens for embedding.

    Args:
        text: The text to split.
        max_chars: Target maximum chunk size (before overlap is added).
        overlap_chars: Trailing characters of the previous chunk prepended to
            each subsequent chunk for context continuity.

    Returns:
        List of chunk strings (at least one).
    """
    if len(text) <= max_chars:
        return [text]
    # Split by paragraphs first
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
    chunks = []
    current_chunk = ""
    for para in paragraphs:
        # If adding this paragraph stays under limit, add it
        if len(current_chunk) + len(para) + 1 <= max_chars:
            current_chunk = (current_chunk + "\n" + para).strip()
        else:
            # Save current chunk if it has content
            if current_chunk:
                chunks.append(current_chunk)
            # If single paragraph is too long, split it by sentences
            if len(para) > max_chars:
                sentences = para.replace(". ", ".\n").split("\n")
                sub_chunk = ""
                for sent in sentences:
                    if len(sent) > max_chars:
                        # Oversized sentence (e.g. no punctuation at all):
                        # flush what we have, then hard-split by characters so
                        # no chunk can exceed max_chars.
                        if sub_chunk:
                            chunks.append(sub_chunk)
                            sub_chunk = ""
                        for start in range(0, len(sent), max_chars):
                            piece = sent[start:start + max_chars]
                            if len(piece) == max_chars:
                                chunks.append(piece)
                            else:
                                # Remainder may still merge with the next sentence.
                                sub_chunk = piece
                    elif len(sub_chunk) + len(sent) + 1 <= max_chars:
                        sub_chunk = (sub_chunk + " " + sent).strip()
                    else:
                        if sub_chunk:
                            chunks.append(sub_chunk)
                        sub_chunk = sent
                if sub_chunk:
                    current_chunk = sub_chunk
                else:
                    current_chunk = ""
            else:
                current_chunk = para
    if current_chunk:
        chunks.append(current_chunk)
    # Add overlap: prepend last N chars of previous chunk to each subsequent chunk
    if overlap_chars > 0 and len(chunks) > 1:
        overlapped = [chunks[0]]
        for i in range(1, len(chunks)):
            prev_tail = chunks[i - 1][-overlap_chars:]
            # Find a word boundary in the overlap
            space_idx = prev_tail.find(" ")
            if space_idx > 0:
                prev_tail = prev_tail[space_idx + 1:]
            overlapped.append(prev_tail + " " + chunks[i])
        chunks = overlapped
    return chunks
# --- Main Ingestion ---
def ingest_document(
    file_path: str,
    group_id: str,
    shared_by: str = "Unknown",
    filename: str | None = None,
) -> list[dict]:
    """
    Full pipeline: extract text → chunk → produce signal dicts ready for ChromaDB.

    Args:
        file_path: Path to the downloaded file on disk
        group_id: Telegram group ID
        shared_by: Who shared the file
        filename: Original filename (for metadata); defaults to the basename
            of file_path.

    Returns:
        List of signal dicts ready for store_signals()
    """
    if filename is None:
        filename = os.path.basename(file_path)
    # Extract
    pages = extract_text(file_path)
    if not pages:
        logger.warning(f"No text extracted from {filename}")
        return []
    # Chunk each page
    signals = []
    total_chunks = 0
    for page_data in pages:
        page_num = page_data["page"]
        chunks = chunk_text(page_data["text"])
        for chunk_text_str in chunks:
            if len(chunk_text_str.strip()) < 30:
                continue  # Skip tiny chunks
            signal = {
                "id": str(uuid.uuid4()),
                "type": "document_knowledge",
                "summary": f"[{filename} p{page_num}] {chunk_text_str[:150]}...",
                "entities": [f"@{shared_by}", filename],
                "severity": "low",
                "status": "reference",
                "sentiment": "neutral",
                "urgency": "none",
                "raw_quote": chunk_text_str,
                "timestamp": datetime.utcnow().isoformat(),
                "group_id": group_id,
                "lens": "document",
                "keywords": [filename, f"page_{page_num}", "document", shared_by],
            }
            signals.append(signal)
            total_chunks += 1
    logger.info(f"Ingested {filename}: {len(pages)} pages → {total_chunks} chunks for group {group_id}")
    return signals

View File

@@ -0,0 +1,373 @@
"""
Jira Signal Agent
Takes ThirdEye signals and converts them into well-formed Jira tickets.
Responsibilities:
1. Map signal type → Jira issue type + priority
2. LLM-generate a clean ticket title and structured description from signal context
3. Extract assignee names and match them to Jira account IDs (best-effort)
4. Raise the ticket via jira_client and mark the signal in ChromaDB
5. Bulk-raise: process a group's unraised high-severity signals in one call
"""
import json
import logging
from datetime import datetime
from backend.providers import call_llm
from backend.integrations.jira_client import (
create_issue, search_issues, add_comment, is_configured, search_users
)
from backend.db.chroma import store_signals, mark_signal_as_raised, get_raised_signal_ids
from backend.config import (
JIRA_DEFAULT_PROJECT, JIRA_DEFAULT_ISSUE_TYPE,
JIRA_AUTO_RAISE_SEVERITY
)
logger = logging.getLogger("thirdeye.agents.jira_agent")
# ─── Signal → Jira type mapping ──────────────────────────────────────────────
# Maps ThirdEye signal type → (Jira issue type, default priority)
# Note: Issue types must match what's available in your Jira project
# Common types: Task, Bug, Story, Epic, Workstream (project-specific)
SIGNAL_TYPE_MAP = {
    # Dev signals
    "tech_debt": ("Task", "Low"),
    "recurring_bug": ("Task", "High"),  # Changed from Bug to Task
    "architecture_decision": ("Task", "Medium"),
    "deployment_risk": ("Task", "High"),
    "workaround": ("Task", "Medium"),
    "knowledge_silo": ("Task", "Medium"),
    # Product signals
    "feature_request": ("Task", "Medium"),  # Changed from Story to Task
    "priority_conflict": ("Task", "High"),
    "sentiment_shift": ("Task", "Medium"),
    # Client signals
    "promise": ("Task", "High"),
    "scope_creep": ("Task", "High"),
    "risk": ("Task", "High"),
    # Meet signals
    "meet_action_item": ("Task", "Medium"),
    "meet_blocker": ("Task", "Highest"),
    "meet_risk": ("Task", "High"),
    "meet_decision": ("Task", "Medium"),
    "meet_open_q": ("Task", "Low"),
    # Generic
    "blocker": ("Task", "Highest"),
    "decision": ("Task", "Medium"),
    "action_item": ("Task", "Medium"),
}
# Signal severity → Jira priority. When a signal carries a severity, this
# overrides the per-type default priority in SIGNAL_TYPE_MAP (see
# raise_ticket_for_signal).
SEVERITY_TO_PRIORITY = {
    "critical": "Highest",
    "high": "High",
    "medium": "Medium",
    "low": "Low",
}
# Only signal types with an explicit Jira mapping may be raised as tickets.
RAISEABLE_TYPES = set(SIGNAL_TYPE_MAP.keys())
# ─── Assignee resolution ─────────────────────────────────────────────────────
async def resolve_assignee_account_id(name: str) -> str | None:
    """
    Resolve a person's display name (or @name) to their Jira account ID.

    Uses Jira's user search API and fuzzy-matches the best result: exact
    display-name match first, then an all-words-present partial match, then
    the first search result. Returns None when no match is found or on error.
    """
    if not name:
        return None
    query = name.lstrip("@").strip()
    try:
        candidates = await search_users(query)
        if not candidates:
            return None
        wanted = query.lower()
        # 1) Exact display-name match
        for candidate in candidates:
            if candidate["display_name"].lower() == wanted:
                return candidate["account_id"]
        # 2) Partial match: every query word appears in the display name
        query_words = wanted.split()
        for candidate in candidates:
            display = candidate["display_name"].lower()
            if all(word in display for word in query_words):
                return candidate["account_id"]
        # 3) Last resort: first search result
        return candidates[0]["account_id"]
    except Exception as e:
        logger.warning(f"resolve_assignee_account_id failed for '{name}': {e}")
        return None
# ─── LLM ticket generation ───────────────────────────────────────────────────
TICKET_GEN_SYSTEM_PROMPT = """You are a senior engineering manager writing Jira tickets from team intelligence signals.
Given a ThirdEye signal (a structured piece of extracted team knowledge), write a Jira ticket.
Return ONLY a valid JSON object with exactly these fields:
{
"summary": "Short, actionable ticket title (max 100 chars). Start with a verb. No jargon.",
"description": "Full ticket description. Include: what the issue is, context from the signal, why it matters, suggested next steps. Use blank lines between sections. Use '- ' for bullet points. Max 400 words.",
"labels": ["label1", "label2"],
"assignee_name": "First name or @name of the person to assign, or null if unclear"
}
Label rules:
- Always include "thirdeye" and "auto-raised"
- Add the signal type as a label (e.g. "tech-debt", "recurring-bug")
- Add "urgent" if severity is high or critical
- Labels must not have spaces (use hyphens)
Summary rules:
- Starts with a verb: "Fix", "Investigate", "Address", "Resolve", "Document", "Implement"
- Be specific — "Fix intermittent checkout timeout" NOT "Fix bug"
- Never exceed 100 characters
Description must include:
1. What: clear 1-sentence problem statement
2. Context: what was actually said / detected (cite the signal)
3. Impact: why this matters to the team or product
4. Suggested next steps (2-3 bullet points)
Return JSON only — no markdown, no preamble."""
async def generate_ticket_content(signal: dict) -> dict:
    """
    Use an LLM to generate a clean, context-rich Jira ticket from a ThirdEye signal.

    Returns {"summary": str, "description": str, "labels": list, "assignee_name": str|None}.
    On LLM or parse failure, falls back to a template-built ticket.
    """
    context_lines = [
        f"Signal type: {signal.get('type', 'unknown')}",
        f"Summary: {signal.get('summary', '')}",
        f"Raw quote: {signal.get('raw_quote', '')[:300]}",
        f"Severity: {signal.get('severity', 'medium')}",
        f"Entities involved: {', '.join(signal.get('entities', []))}",
        f"Keywords: {', '.join(signal.get('keywords', []))}",
        f"Timestamp: {signal.get('timestamp', '')}",
        f"Group: {signal.get('group_id', '')}",
        f"Lens: {signal.get('lens', '')}",
    ]
    signal_text = "\n".join(context_lines)
    try:
        result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": TICKET_GEN_SYSTEM_PROMPT},
                {"role": "user", "content": signal_text},
            ],
            temperature=0.2,
            max_tokens=800,
            response_format={"type": "json_object"},
        )
        raw = result["content"].strip()
        # Strip a leading markdown fence if the model added one despite instructions.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        return json.loads(raw)
    except Exception as e:
        logger.warning(f"Ticket generation LLM failed: {e}. Using fallback.")
        # Fallback: build a basic ticket without LLM
        readable_type = signal.get("type", "unknown").replace("_", " ").title()
        return {
            "summary": f"{readable_type}: {signal.get('summary', 'Unknown issue')[:80]}",
            "description": (
                f"Signal detected by ThirdEye.\n\n"
                f"Type: {signal.get('type', 'unknown')}\n"
                f"Summary: {signal.get('summary', '')}\n\n"
                f"Raw context:\n{signal.get('raw_quote', '(none)')[:300]}\n\n"
                f"Severity: {signal.get('severity', 'medium')}"
            ),
            "labels": ["thirdeye", "auto-raised", signal.get("type", "unknown").replace("_", "-")],
            "assignee_name": None,
        }
# ─── Main raise function ──────────────────────────────────────────────────────
async def raise_ticket_for_signal(
    signal: dict,
    group_id: str,
    project_key: str = None,
    force: bool = False,
    assignee_account_id: str = None,
) -> dict:
    """
    Create a Jira ticket for a single ThirdEye signal.

    Args:
        signal: The signal dict from ChromaDB
        group_id: The group this signal belongs to (for dedup tracking)
        project_key: Override project (default: JIRA_DEFAULT_PROJECT)
        force: If True, raise even if already raised before
        assignee_account_id: Explicit Jira account ID; when given, skips
            name-based assignee resolution

    Returns:
        {"ok": True, "key": "ENG-42", "url": "...", "summary": "..."}
        OR
        {"ok": False, "reason": "already_raised" | "not_raiseable" | "jira_error", ...}
    """
    if not is_configured():
        return {"ok": False, "reason": "jira_not_configured"}
    signal_id = signal.get("id", "")
    signal_type = signal.get("type", "")
    # Check if this signal type is raiseable
    if signal_type not in RAISEABLE_TYPES:
        return {"ok": False, "reason": "not_raiseable", "signal_type": signal_type}
    # Check if already raised (skip if force=True)
    if not force and signal_id:
        already_raised = get_raised_signal_ids(group_id)
        if signal_id in already_raised:
            return {"ok": False, "reason": "already_raised", "signal_id": signal_id}
    # Determine Jira issue type and priority from signal
    default_type, default_priority = SIGNAL_TYPE_MAP.get(signal_type, (JIRA_DEFAULT_ISSUE_TYPE, "Medium"))
    severity = signal.get("severity", "medium").lower()
    priority = SEVERITY_TO_PRIORITY.get(severity, default_priority)
    # Generate ticket content via LLM
    ticket_content = await generate_ticket_content(signal)
    summary = ticket_content.get("summary", signal.get("summary", "ThirdEye signal")[:100])
    description = ticket_content.get("description", signal.get("summary", ""))
    labels = ticket_content.get("labels", ["thirdeye", "auto-raised"])
    if not isinstance(labels, list):
        # Defensive: the LLM occasionally returns a string instead of a list.
        labels = ["thirdeye", "auto-raised"]
    # Always ensure thirdeye label is present
    if "thirdeye" not in labels:
        labels.append("thirdeye")
    # Append ThirdEye metadata as a context section in the description
    meta_section = (
        f"\n\n---\n"
        f"Raised by: ThirdEye\n"
        f"Signal ID: {signal_id}\n"
        f"Group: {group_id}\n"
        f"Detected: {signal.get('timestamp', datetime.utcnow().isoformat())}"
    )
    description = description + meta_section
    # Resolve assignee: explicit account_id wins, then signal override name, then LLM-extracted name
    if not assignee_account_id:
        name_hint = signal.get("assignee_override") or ticket_content.get("assignee_name")
        if name_hint:
            assignee_account_id = await resolve_assignee_account_id(name_hint)
            if assignee_account_id:
                logger.info(f"Resolved assignee '{name_hint}' -> {assignee_account_id}")
            else:
                logger.warning(f"Could not resolve assignee '{name_hint}' to a Jira account")
    # Create the ticket
    result = await create_issue(
        project_key=project_key or JIRA_DEFAULT_PROJECT,
        summary=summary,
        description=description,
        issue_type=default_type,
        priority=priority,
        labels=labels,
        assignee_account_id=assignee_account_id,
    )
    if result.get("ok"):
        jira_key = result["key"]
        jira_url = result["url"]
        # Mark this signal as raised in ChromaDB so we never duplicate it
        if signal_id:
            mark_signal_as_raised(
                group_id, signal_id, jira_key,
                jira_url=jira_url,
                jira_summary=summary,
                jira_priority=priority,
            )
        logger.info(f"Raised Jira ticket {jira_key} for signal {signal_id} ({signal_type})")
        return {
            "ok": True,
            "key": jira_key,
            "url": jira_url,
            "summary": summary,
            "issue_type": default_type,
            "priority": priority,
            "assignee_account_id": assignee_account_id,
        }
    else:
        logger.error(f"Jira ticket creation failed: {result}")
        return {
            "ok": False,
            "reason": "jira_error",
            "error": result.get("error"),
            "details": result.get("details"),
        }
async def bulk_raise_for_group(
    group_id: str,
    signals: list[dict],
    min_severity: str = None,
    project_key: str = None,
    max_tickets: int = 10,
) -> list[dict]:
    """
    Raise Jira tickets for multiple signals from a group in one call.

    Filters:
    - Only raiseable signal types
    - Only signals at or above min_severity (defaults to JIRA_AUTO_RAISE_SEVERITY)
    - Skips signals already raised
    - Caps at max_tickets to avoid flooding Jira

    Returns list of raise results.
    """
    severity_rank = {"low": 0, "medium": 1, "high": 2, "critical": 3}
    threshold = severity_rank.get(
        (min_severity or JIRA_AUTO_RAISE_SEVERITY).lower(), 2  # Default: high
    )
    already_raised = get_raised_signal_ids(group_id)
    candidates = [
        sig
        for sig in signals
        if sig.get("type", "") in RAISEABLE_TYPES
        and severity_rank.get(sig.get("severity", "low").lower(), 0) >= threshold
        and sig.get("id", "") not in already_raised
    ]
    # Sort by severity descending, then raise up to max_tickets
    candidates.sort(key=lambda s: severity_rank.get(s.get("severity", "low"), 0), reverse=True)
    results = []
    for sig in candidates[:max_tickets]:
        outcome = await raise_ticket_for_signal(sig, group_id, project_key=project_key)
        results.append({**outcome, "signal_type": sig.get("type"), "signal_summary": sig.get("summary", "")[:80]})
    logger.info(f"Bulk raise for group {group_id}: {len(results)} tickets from {len(signals)} signals")
    return results
def format_raise_result_for_telegram(result: dict) -> str:
    """Format a single raise result as a Telegram (Markdown) message line.

    Args:
        result: A dict returned by raise_ticket_for_signal / bulk_raise_for_group.

    Returns:
        One Markdown-formatted line describing success or the failure reason.
    """
    if result.get("ok"):
        return (
            f"✅ [{result['key']}]({result['url']}) — "
            f"*{result.get('issue_type', 'Task')}* | {result.get('priority', 'Medium')} priority\n"
            f" _{result.get('summary', '')[:90]}_"
        )
    reason = result.get("reason", "unknown")
    # Plain strings (no placeholders) don't need an f-prefix.
    if reason == "already_raised":
        return "⏭️ Already raised — skipped"
    if reason == "not_raiseable":
        return f"⚪ Signal type `{result.get('signal_type', '?')}` — not mapped to Jira"
    return f"❌ Failed: {result.get('error', reason)}"

View File

@@ -0,0 +1,43 @@
"""Utilities for robustly parsing JSON from LLM responses."""
import json
import re
def extract_json_object(content: str) -> dict:
    """Extract and parse the first JSON object from raw LLM output.

    Handles plain JSON, fenced ``` / ```json blocks, and an object embedded in
    wrapper prose. Raises json.JSONDecodeError when no top-level JSON object
    can be decoded.
    """
    text = (content or "").strip()
    if not text:
        raise json.JSONDecodeError("Empty LLM response", text, 0)
    if text.startswith("```"):
        # Remove a surrounding markdown fence before parsing.
        text = re.sub(r"^```(?:json)?\s*", "", text, flags=re.IGNORECASE)
        text = re.sub(r"\s*```$", "", text)
        text = text.strip()
    if not text:
        raise json.JSONDecodeError("Empty LLM response after cleanup", text, 0)
    # Direct parse for pure JSON responses.
    try:
        direct = json.loads(text)
    except json.JSONDecodeError:
        direct = None
    if isinstance(direct, dict):
        return direct
    # Attempt a decode from each "{" in the text. This handles wrapper prose
    # more reliably than regex, especially with nested braces.
    decoder = json.JSONDecoder()
    for start, char in enumerate(text):
        if char != "{":
            continue
        try:
            candidate, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            continue
        if isinstance(candidate, dict):
            return candidate
    raise json.JSONDecodeError("No valid top-level JSON object found", text, 0)

View File

@@ -0,0 +1,213 @@
"""Link Fetcher — extracts, summarizes, and stores content from URLs shared in chat."""
import re
import uuid
import logging
import asyncio
from datetime import datetime
import httpx
from bs4 import BeautifulSoup
from backend.providers import call_llm
from backend.config import ENABLE_LINK_FETCH
logger = logging.getLogger("thirdeye.agents.link_fetcher")
# Patterns to skip (images, downloads, social media embeds, etc.)
# Each regex is matched with .search() against the full URL (case-insensitive
# via SKIP_COMPILED below); any hit means the link is not fetched.
SKIP_PATTERNS = [
    r"\.(png|jpg|jpeg|gif|svg|webp|ico|bmp)(\?.*)?$",  # image files
    r"\.(zip|tar|gz|rar|7z|exe|msi|dmg|apk|deb)(\?.*)?$",  # archives & installers
    r"\.(mp3|mp4|avi|mov|mkv|wav|flac)(\?.*)?$",  # audio/video files
    r"^https?://(www\.)?(twitter|x)\.com/.*/status/",  # tweet permalinks
    r"^https?://(www\.)?instagram\.com/p/",  # Instagram posts
    r"^https?://(www\.)?tiktok\.com/",  # TikTok
    r"^https?://(www\.)?youtube\.com/shorts/",  # YouTube Shorts
    r"^https?://t\.me/", # Other Telegram links
]
# Compiled once at import time; consumed by should_fetch().
SKIP_COMPILED = [re.compile(p, re.IGNORECASE) for p in SKIP_PATTERNS]
def extract_urls(text: str) -> list[str]:
    """Extract all HTTP/HTTPS URLs from a text string.

    Trailing punctuation is stripped from each match, and matches of 10
    characters or fewer (likely fragments) are discarded.
    """
    pattern = re.compile(
        r"https?://[^\s<>\"')\]},;]+"
    )
    # Strip trailing punctuation, then keep only plausible-length URLs.
    trimmed = (match.rstrip(".,;:!?)") for match in pattern.findall(text))
    return [url for url in trimmed if len(url) > 10]
def should_fetch(url: str) -> bool:
    """Decide if a URL is worth fetching (skip images, downloads, social embeds)."""
    # A single hit against any skip pattern disqualifies the URL.
    return not any(pattern.search(url) for pattern in SKIP_COMPILED)
async def fetch_url_content(url: str, timeout: float = 15.0) -> dict | None:
    """
    Fetch a URL and extract main text content.

    Follows redirects with a browser-like User-Agent. Non-200 responses,
    non-HTML payloads, network errors, and pages with under 100 chars of
    extracted text all return None ("skip this link"). Extracted text is
    capped at 8000 chars.

    Args:
        url: Absolute HTTP/HTTPS URL to fetch.
        timeout: Per-request timeout in seconds.

    Returns:
        {title, text, url} or None if fetch fails
    """
    try:
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
            headers={
                "User-Agent": "Mozilla/5.0 (compatible; ThirdEye/1.0; +https://thirdeye.dev)",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            },
        ) as client:
            response = await client.get(url)
            if response.status_code != 200:
                logger.info(f"URL returned {response.status_code}: {url[:80]}")
                return None
            content_type = response.headers.get("content-type", "")
            if "text/html" not in content_type and "application/xhtml" not in content_type:
                logger.info(f"Skipping non-HTML content ({content_type}): {url[:80]}")
                return None
            html = response.text
    except httpx.TimeoutException:
        logger.info(f"URL timed out: {url[:80]}")
        return None
    except Exception as e:
        # Broad catch is deliberate: any fetch failure simply skips the link.
        logger.info(f"URL fetch failed ({type(e).__name__}): {url[:80]}")
        return None
    # Parse HTML
    try:
        soup = BeautifulSoup(html, "html.parser")
        # Extract title
        title = ""
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
        # Remove script, style, nav, footer, header elements
        for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "form"]):
            tag.decompose()
        # Try to find main content area
        main = soup.find("main") or soup.find("article") or soup.find("div", {"role": "main"})
        if main:
            text = main.get_text(separator="\n", strip=True)
        else:
            text = soup.get_text(separator="\n", strip=True)
        # Clean up
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        text = "\n".join(lines)
        # Skip if too little content
        if len(text) < 100:
            logger.info(f"Too little text content ({len(text)} chars): {url[:80]}")
            return None
        # Truncate very long content
        if len(text) > 8000:
            text = text[:8000] + "\n\n[Content truncated]"
        return {
            "title": title or url,
            "text": text,
            "url": url,
        }
    except Exception as e:
        logger.warning(f"HTML parsing failed for {url[:80]}: {e}")
        return None
async def summarize_content(title: str, text: str, url: str) -> str:
    """Use LLM to create a concise summary of fetched content.

    Falls back to a raw 200-char excerpt when the LLM call fails, so link
    ingestion never hard-fails on summarization.
    """
    preview = text[:3000]  # Limit text sent to LLM
    system_prompt = """You are a content summarizer for ThirdEye.
Given the title and text of a web page, produce a concise 2-4 sentence summary that captures the key information.
Focus on: main topic, key facts, any actionable insights, any deadlines or decisions mentioned.
Respond with ONLY the summary text, nothing else."""
    user_prompt = f"Title: {title}\nURL: {url}\n\nContent:\n{preview}"
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        result = await call_llm("fast_small", chat, temperature=0.2, max_tokens=300)
        return result["content"].strip()
    except Exception as exc:
        logger.warning(f"Link summarization failed: {exc}")
        # Fallback: use first 200 chars of text
        return text[:200] + "..."
async def process_links_from_message(
    text: str,
    group_id: str,
    shared_by: str = "Unknown",
) -> list[dict]:
    """
    Full pipeline: extract URLs from message → fetch → summarize → produce signals.

    Designed to be called in the background (non-blocking to the main message pipeline).
    Per-URL failures are logged and skipped, so one bad link never drops the rest.
    Returns [] immediately when ENABLE_LINK_FETCH is off or no fetchable URL exists.

    Args:
        text: Raw message text that may contain URLs.
        group_id: Telegram group the message came from (stamped on each signal).
        shared_by: Display name of the sender, used for attribution.

    Returns:
        List of signal dicts ready for store_signals()
    """
    if not ENABLE_LINK_FETCH:
        return []
    urls = extract_urls(text)
    fetchable = [u for u in urls if should_fetch(u)]
    if not fetchable:
        return []
    signals = []
    # Process up to 3 links per message to avoid overload
    for url in fetchable[:3]:
        try:
            content = await fetch_url_content(url)
            if not content:
                continue
            summary = await summarize_content(content["title"], content["text"], url)
            # Build a "link_knowledge" signal in the shared signal schema.
            signal = {
                "id": str(uuid.uuid4()),
                "type": "link_knowledge",
                "summary": f"[Link: {content['title'][:80]}] {summary[:200]}",
                "entities": [f"@{shared_by}", url[:100]],
                "severity": "low",
                "status": "reference",
                "sentiment": "neutral",
                "urgency": "none",
                "raw_quote": summary,
                "timestamp": datetime.utcnow().isoformat(),
                "group_id": group_id,
                "lens": "link",
                "keywords": [content["title"][:50], "link", "web", shared_by],
            }
            signals.append(signal)
            logger.info(f"Link ingested: {content['title'][:50]} ({url[:60]})")
        except Exception as e:
            logger.warning(f"Link processing failed for {url[:60]}: {e}")
            continue
    return signals

View File

@@ -0,0 +1,188 @@
"""
Meet Cross-Reference Agent
Finds connections between meeting signals and existing Telegram group signals.
Surfaces: confirmations (meeting agrees with chat), contradictions (meeting contradicts chat),
and blind spots (meeting discusses something chat groups don't know about).
"""
import logging
from backend.providers import call_llm
from backend.db.chroma import query_signals, get_all_signals
from backend.config import MEET_CROSS_REF_GROUPS, MEET_DEFAULT_GROUP_ID
logger = logging.getLogger("thirdeye.agents.meet_cross_ref")
CROSS_REF_SYSTEM_PROMPT = """You are an expert at finding connections between meeting discussions and team chat history.
You will receive:
1. MEETING SIGNALS — decisions, action items, blockers, risks from a recent Google Meet
2. CHAT SIGNALS — existing signals from team Telegram groups
Find meaningful connections across three categories:
CONFIRMATIONS: Meeting agrees with or reinforces something from chat history
CONTRADICTIONS: Meeting decision conflicts with what was said/decided in chat
BLIND SPOTS: Important things from the meeting that the chat teams don't seem to know about
Return ONLY a valid JSON object:
{
"confirmations": [
{"meeting_signal": "...", "chat_signal": "...", "group": "...", "significance": "high|medium|low"}
],
"contradictions": [
{"meeting_signal": "...", "chat_signal": "...", "group": "...", "impact": "...", "significance": "high|medium|low"}
],
"blind_spots": [
{"meeting_signal": "...", "teams_unaware": ["group1", "group2"], "recommendation": "..."}
]
}
Rules:
- Only include HIGH confidence matches — do not stretch for weak connections
- Keep each signal description concise (1 sentence max)
- significance "high" = this matters for team alignment; "medium" = worth noting; "low" = minor
- If a category has nothing meaningful, use an empty array []
- Return JSON only"""
async def find_cross_references(
    meeting_id: str,
    group_id: str = None,
    cross_ref_group_ids: list[str] = None,
) -> dict:
    """
    Compare meeting signals against chat group signals.

    Pipeline: load structured meeting signals → load up to 20 signals per
    cross-ref group → ask the reasoning LLM to match them. Every failure
    path returns a dict with empty lists plus an "error" key — never raises.

    Args:
        meeting_id: The meeting to analyze
        group_id: ChromaDB group where meet signals are stored (defaults to MEET_DEFAULT_GROUP_ID)
        cross_ref_group_ids: Groups to compare against (defaults to MEET_CROSS_REF_GROUPS from config)

    Returns:
        Dict with confirmations, contradictions, blind_spots lists
    """
    group_id = group_id or MEET_DEFAULT_GROUP_ID
    cross_ref_group_ids = cross_ref_group_ids or MEET_CROSS_REF_GROUPS
    if not cross_ref_group_ids:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": "No cross-reference groups configured. Set MEET_CROSS_REF_GROUPS in .env",
        }
    # 1. Get meeting signals (decisions, actions, blockers, risks — NOT raw chunks)
    meet_signals = query_signals(group_id, meeting_id, n_results=30)
    structured_meet = [
        s for s in meet_signals
        if s.get("metadata", {}).get("type") in ("meet_decision", "meet_action_item", "meet_blocker", "meet_risk", "meet_open_q")
    ]
    if not structured_meet:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": f"No structured signals found for meeting {meeting_id}. Has it been processed yet?",
        }
    # 2. Get signals from each cross-reference group
    chat_context_parts = []
    for gid in cross_ref_group_ids:
        try:
            all_sig = get_all_signals(gid)
            if all_sig:
                formatted = "\n".join([
                    f"  [{s.get('metadata', {}).get('type', '?')}] {s.get('document', '')[:120]}"
                    for s in all_sig[:20]  # Cap at 20 per group to stay within token limits
                ])
                chat_context_parts.append(f"Group '{gid}':\n{formatted}")
        except Exception as e:
            # A single unreadable group shouldn't abort the whole comparison.
            logger.warning(f"Could not load signals for group {gid}: {e}")
    if not chat_context_parts:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": "Could not load any signals from cross-reference groups.",
        }
    # 3. Format inputs for LLM
    meet_text = "\n".join([
        f"  [{s.get('metadata', {}).get('type', '?')}] {s.get('document', '')[:150]}" for s in structured_meet
    ])
    chat_text = "\n\n".join(chat_context_parts)
    prompt = f"""MEETING SIGNALS (from meeting: {meeting_id}):
{meet_text}
CHAT SIGNALS (from monitored Telegram groups):
{chat_text}"""
    try:
        import json
        result = await call_llm(
            task_type="reasoning",
            messages=[
                {"role": "system", "content": CROSS_REF_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=0.2,
            max_tokens=1500,
            response_format={"type": "json_object"},
        )
        raw = result["content"].strip()
        # Strip markdown code fences in case the model ignored json_object mode.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        return json.loads(raw)
    except Exception as e:
        logger.error(f"Cross-reference LLM call failed: {e}")
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": str(e),
        }
def format_cross_ref_for_telegram(analysis: dict, meeting_id: str) -> str:
    """Render cross-reference results as a Markdown message for Telegram.

    Error and empty analyses produce single-line messages; otherwise each
    non-empty category is rendered with at most 3 entries for readability.
    """
    if analysis.get("error"):
        return f"⚠️ Cross-reference failed: {analysis['error']}"
    confirmations = analysis.get("confirmations", [])
    contradictions = analysis.get("contradictions", [])
    blind_spots = analysis.get("blind_spots", [])
    if not (confirmations or contradictions or blind_spots):
        return f"🔗 *Meet ↔ Chat Cross-Reference*\nMeeting `{meeting_id}`: No significant connections found between this meeting and your chat groups."
    lines = [f"🔗 *Meet ↔ Chat Cross-Reference*\nMeeting: `{meeting_id}`\n"]
    if confirmations:
        lines.append(f"✅ *Confirmations* ({len(confirmations)})")
        for entry in confirmations[:3]:  # Cap at 3 for readability
            marker = "🔴" if entry.get("significance") == "high" else "🟡"
            lines.append(f"{marker} Meeting: _{entry['meeting_signal'][:100]}_")
            lines.append(f"   Matches [{entry.get('group', '?')}]: _{entry['chat_signal'][:100]}_\n")
    if contradictions:
        lines.append(f"⚡ *Contradictions* ({len(contradictions)}) — ACTION NEEDED")
        for entry in contradictions[:3]:
            lines.append(f"🔴 Meeting decided: _{entry['meeting_signal'][:100]}_")
            lines.append(f"   BUT [{entry.get('group', '?')}] says: _{entry['chat_signal'][:100]}_")
            if entry.get("impact"):
                lines.append(f"   Impact: {entry['impact'][:100]}\n")
    if blind_spots:
        lines.append(f"🔦 *Blind Spots* ({len(blind_spots)}) — Teams may not know")
        for entry in blind_spots[:3]:
            lines.append(f"🟠 {entry['meeting_signal'][:120]}")
            if entry.get("recommendation"):
                lines.append(f"{entry['recommendation'][:100]}\n")
    return "\n".join(lines)

View File

@@ -0,0 +1,342 @@
"""
Meet Ingestor Agent
Processes raw Google Meet transcript chunks and extracts structured signals.
Signal types produced:
meet_decision — A decision made during the meeting
meet_action_item — A task assigned to someone
meet_blocker — A blocker or dependency raised
meet_risk — A risk or concern identified
meet_open_q — An unresolved question left open
meet_summary — Full meeting summary (emitted on is_final=True)
meet_chunk_raw — Raw transcript chunk (always stored, for full-text search)
"""
import asyncio
import json
import logging
import uuid
from datetime import datetime
from backend.providers import call_llm
from backend.db.chroma import store_signals
logger = logging.getLogger("thirdeye.agents.meet_ingestor")
# ─── Extraction prompt ───────────────────────────────────────────────────────
EXTRACTION_SYSTEM_PROMPT = """You are an expert meeting analyst. You receive raw transcript chunks from a Google Meet recording and extract structured signals.
Extract ONLY signals that are clearly present. Do NOT hallucinate or infer beyond what is stated.
Return ONLY a valid JSON object with this exact structure:
{
"decisions": [
{"text": "...", "owner": "@name or null", "confidence": "high|medium|low"}
],
"action_items": [
{"text": "...", "owner": "@name or null", "due": "date string or null", "confidence": "high|medium|low"}
],
"blockers": [
{"text": "...", "blocking_what": "...", "confidence": "high|medium|low"}
],
"risks": [
{"text": "...", "severity": "high|medium|low", "confidence": "high|medium|low"}
],
"open_questions": [
{"text": "...", "confidence": "high|medium|low"}
]
}
Rules:
- If a category has nothing, use an empty array []
- owner must start with @ if it's a person's name (e.g. "@Alex")
- text must be a clear, standalone sentence — not a fragment
- Only include confidence "high" if the signal is unambiguous
- Do NOT reproduce filler words, pleasantries, or off-topic banter
- Return JSON only — no markdown, no preamble, no explanation"""
SUMMARY_SYSTEM_PROMPT = """You are a meeting intelligence expert. Given a full meeting transcript (possibly from multiple chunks), write a concise but complete meeting summary.
Structure your summary as:
1. One-sentence overview (what was the meeting about)
2. Key decisions made (bullet points, max 5)
3. Action items assigned (who does what by when)
4. Blockers or risks raised
5. Open questions still unresolved
Keep the summary under 400 words. Be specific. Use names when available. Do NOT use filler phrases like "the team discussed" — just state what was decided/agreed/assigned."""
# ─── Signal builder ─────────────────────────────────────────────────────────
def _build_signal(
signal_type: str,
summary: str,
raw_quote: str,
severity: str,
entities: list[str],
keywords: list[str],
timestamp: str,
group_id: str,
meeting_id: str,
urgency: str = "none",
status: str = "open",
) -> dict:
return {
"id": str(uuid.uuid4()),
"type": signal_type,
"summary": summary,
"raw_quote": raw_quote[:500] if raw_quote else "",
"severity": severity,
"status": status,
"sentiment": "neutral",
"urgency": urgency,
"entities": entities,
"keywords": keywords,
"timestamp": timestamp,
"group_id": group_id,
"lens": "meet",
"meeting_id": meeting_id,
}
def _extract_entities(text: str, owner: str = None) -> list[str]:
"""Extract entity strings from text (names starting with @)."""
import re
entities = re.findall(r"@[\w]+", text)
if owner and owner.startswith("@"):
entities.append(owner)
return list(set(entities))
def _extract_keywords(text: str) -> list[str]:
"""Simple keyword extraction: lowercase meaningful words."""
stopwords = {"the", "a", "an", "is", "are", "was", "were", "will", "to", "of",
"in", "on", "at", "for", "by", "with", "this", "that", "and", "or",
"but", "we", "i", "it", "be", "do", "have", "has", "had", "not"}
words = text.lower().split()
keywords = [w.strip(".,!?;:\"'") for w in words if len(w) > 3 and w not in stopwords]
return list(dict.fromkeys(keywords))[:10] # deduplicate, keep first 10
# ─── Main processing function ────────────────────────────────────────────────
async def process_meet_chunk(
    meeting_id: str,
    group_id: str,
    chunk_index: int,
    text: str,
    speaker: str,
    timestamp: str,
    is_final: bool,
):
    """
    Full pipeline for a transcript chunk:
    1. Always store raw chunk for full-text search
    2. Extract structured signals via LLM
    3. If is_final, generate a full meeting summary

    Extraction failures are non-fatal: the raw chunk is still stored and an
    empty extraction result is used. Decisions, action items, blockers, and
    open questions are kept only at high/medium confidence; risks are kept
    regardless of confidence (their severity field carries the weight).

    Returns:
        The list of signal dicts that were stored for this chunk.
    """
    logger.info(f"Processing meet chunk {chunk_index} for meeting {meeting_id} ({len(text)} chars)")
    signals_to_store = []
    # 1. Always store the raw chunk (enables full-text similarity search later)
    raw_signal = _build_signal(
        signal_type="meet_chunk_raw",
        summary=f"[{meeting_id}] Chunk {chunk_index}: {text[:120]}...",
        raw_quote=text,
        severity="low",
        entities=[f"@{speaker}"] if speaker and speaker != "Unknown" else [],
        keywords=_extract_keywords(text),
        timestamp=timestamp,
        group_id=group_id,
        meeting_id=meeting_id,
    )
    signals_to_store.append(raw_signal)
    # 2. Extract structured signals via LLM
    try:
        result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": EXTRACTION_SYSTEM_PROMPT},
                {"role": "user", "content": f"Transcript chunk from speaker '{speaker}':\n\n{text}"},
            ],
            temperature=0.1,
            max_tokens=1500,
            response_format={"type": "json_object"},
        )
        raw_json = result["content"].strip()
        # Strip markdown code fences if present
        if raw_json.startswith("```"):
            raw_json = raw_json.split("```")[1]
            if raw_json.startswith("json"):
                raw_json = raw_json[4:]
        extracted = json.loads(raw_json)
    except Exception as e:
        # Non-fatal: the raw chunk above is stored regardless.
        logger.warning(f"Meet extraction LLM failed for chunk {chunk_index}: {e}")
        extracted = {}
    # Decisions
    for item in extracted.get("decisions", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_decision",
                summary=item["text"],
                raw_quote=item["text"],
                severity="medium",
                entities=_extract_entities(item["text"], item.get("owner")),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                status="decided",
            ))
    # Action items
    for item in extracted.get("action_items", []):
        if item.get("confidence") in ("high", "medium"):
            due_str = f" Due: {item['due']}." if item.get("due") else ""
            signals_to_store.append(_build_signal(
                signal_type="meet_action_item",
                summary=f"{item['text']}{due_str}",
                raw_quote=item["text"],
                severity="medium",
                entities=_extract_entities(item["text"], item.get("owner")),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                urgency="medium" if item.get("due") else "low",
                status="open",
            ))
    # Blockers
    for item in extracted.get("blockers", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_blocker",
                summary=item["text"],
                raw_quote=item["text"],
                severity="high",
                entities=_extract_entities(item["text"]),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                urgency="high",
                status="open",
            ))
    # Risks (no confidence filter — severity from the LLM carries the weight)
    for item in extracted.get("risks", []):
        signals_to_store.append(_build_signal(
            signal_type="meet_risk",
            summary=item["text"],
            raw_quote=item["text"],
            severity=item.get("severity", "medium"),
            entities=_extract_entities(item["text"]),
            keywords=_extract_keywords(item["text"]),
            timestamp=timestamp,
            group_id=group_id,
            meeting_id=meeting_id,
            urgency="medium",
            status="open",
        ))
    # Open questions
    for item in extracted.get("open_questions", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_open_q",
                summary=item["text"],
                raw_quote=item["text"],
                severity="low",
                entities=_extract_entities(item["text"]),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                status="open",
            ))
    # 3. If this is the final chunk, generate a meeting summary
    if is_final:
        summary_signal = await _generate_meeting_summary(
            meeting_id, group_id, text, speaker, timestamp
        )
        if summary_signal:
            signals_to_store.append(summary_signal)
    # Store everything
    if signals_to_store:
        store_signals(group_id, signals_to_store)
        logger.info(
            f"Stored {len(signals_to_store)} signals for meeting {meeting_id} chunk {chunk_index}"
        )
    return signals_to_store
async def _generate_meeting_summary(
    meeting_id: str,
    group_id: str,
    final_chunk_text: str,
    speaker: str,
    timestamp: str,
) -> dict | None:
    """
    Pull all raw chunks for this meeting from ChromaDB and generate a summary.
    Falls back to summarizing just the final chunk if retrieval fails.

    Returns:
        A "meet_summary" signal dict, or None when the LLM call fails.
    """
    from backend.db.chroma import query_signals
    try:
        # Get all raw chunks for this meeting
        raw_chunks = query_signals(
            group_id,
            meeting_id,
            n_results=50,
            signal_type="meet_chunk_raw",
        )
        full_transcript = "\n\n".join(
            [s.get("metadata", {}).get("raw_quote", "") or s.get("document", "") for s in raw_chunks]
        )
        if not full_transcript.strip():
            full_transcript = final_chunk_text
    except Exception:
        # Retrieval is best-effort; the final chunk alone still gives a summary.
        full_transcript = final_chunk_text
    try:
        result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
                {
                    "role": "user",
                    # Transcript capped at 6000 chars to stay within token limits.
                    "content": f"Meeting ID: {meeting_id}\n\nFull transcript:\n\n{full_transcript[:6000]}",
                },
            ],
            temperature=0.3,
            max_tokens=600,
        )
        summary_text = result["content"].strip()
    except Exception as e:
        logger.warning(f"Meeting summary generation failed: {e}")
        return None
    return _build_signal(
        signal_type="meet_summary",
        summary=summary_text,
        raw_quote=full_transcript[:500],
        severity="medium",
        entities=[f"@{speaker}"] if speaker and speaker != "Unknown" else [],
        keywords=_extract_keywords(summary_text),
        timestamp=timestamp,
        group_id=group_id,
        meeting_id=meeting_id,
        status="completed",
    )

View File

@@ -0,0 +1,114 @@
"""Pattern Detector Agent — finds trends and anomalies in accumulated signals."""
import logging
from backend.providers import call_llm
from backend.db.chroma import get_all_signals
from backend.db.models import Pattern
from backend.agents.json_utils import extract_json_object
logger = logging.getLogger("thirdeye.agents.pattern_detector")
SYSTEM_PROMPT = """You are the Pattern Detector for ThirdEye. You analyze accumulated signals to find patterns and anomalies.
Detect these pattern types:
- frequency_spike: A signal type mentioned significantly more than usual
- knowledge_silo: Only one person discusses a critical topic (bus factor = 1)
- recurring_issue: Same bug/problem appearing repeatedly
- sentiment_trend: Gradual shift in tone over time
- stale_item: Decisions proposed but never resolved, promises with no follow-up
Respond ONLY with valid JSON (no markdown, no backticks):
{"patterns": [{"type": "pattern_type", "description": "Clear human-readable description", "severity": "info|warning|critical", "evidence_ids": [], "recommendation": "Suggested action"}]}
If no patterns found: {"patterns": []}
Only report patterns that are genuinely concerning. Do NOT manufacture patterns from insufficient data."""
def _heuristic_detect_patterns(group_id: str, all_signals: list[dict]) -> list[Pattern]:
    """Generate conservative patterns from signal metadata when LLM output is unavailable.

    Two deliberately narrow heuristics (to avoid manufacturing patterns):
      - recurring_issue: a recurrence-prone signal type seen at least twice
      - knowledge_silo: repeated payment/Stripe entity mentions
    Returns at most 5 Pattern objects.
    """
    from collections import Counter
    patterns: list[Pattern] = []
    type_counts: Counter = Counter()
    entity_counts: Counter = Counter()
    for s in all_signals:
        meta = s.get("metadata", {})
        type_counts[str(meta.get("type", "unknown"))] += 1
        entities = meta.get("entities", [])
        if isinstance(entities, str):
            # Tolerate single-string metadata by treating it as a 1-item list.
            entities = [entities]
        if isinstance(entities, list):
            for ent in entities:
                ent_key = str(ent).strip()
                if ent_key:
                    entity_counts[ent_key] += 1
    # Recurring issues: only flag types where repetition is itself the signal.
    recurring_types = [t for t, c in type_counts.items() if c >= 2 and t in {"recurring_bug", "workaround", "tech_debt"}]
    for signal_type in recurring_types:
        patterns.append(Pattern(
            group_id=group_id,
            type="recurring_issue",
            description=f"Signal type '{signal_type}' has appeared repeatedly ({type_counts[signal_type]} times).",
            severity="warning",
            recommendation="Create a dedicated action item with owner and due date to stop repeated recurrence.",
        ))
    # Knowledge silo: repeated payment-related entities suggest a bus factor of 1.
    silo_entities = [ent for ent, c in entity_counts.items() if c >= 2]
    if any("stripe" in ent.lower() or "payment" in ent.lower() for ent in silo_entities):
        patterns.append(Pattern(
            group_id=group_id,
            type="knowledge_silo",
            description="Critical payment-related topics are concentrated in repeated mentions, suggesting low bus factor.",
            severity="warning",
            recommendation="Document payment workflows and assign at least one backup owner.",
        ))
    return patterns[:5]
async def detect_patterns(group_id: str) -> list[Pattern]:
    """Analyze all signals in a group and detect patterns.

    Requires at least 3 stored signals. On LLM or JSON-parse failure, falls
    back to _heuristic_detect_patterns(), so callers always get a (possibly
    empty) list and never an exception.
    """
    all_signals = get_all_signals(group_id)
    if len(all_signals) < 3:
        logger.info(f"Not enough signals ({len(all_signals)}) for pattern detection in {group_id}")
        return []
    # Format signals for the LLM
    signal_summary = []
    for s in all_signals:
        meta = s["metadata"]
        signal_summary.append(
            f"- [{meta.get('type', '?')}] {s['document'][:100]} "
            f"(severity={meta.get('severity', '?')}, entities={meta.get('entities', '[]')}, "
            f"time={meta.get('timestamp', '?')})"
        )
    signals_text = "\n".join(signal_summary)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Analyze these {len(all_signals)} signals from group '{group_id}':\n\n{signals_text}"},
    ]
    try:
        result = await call_llm("reasoning", messages, temperature=0.2, max_tokens=1500)
        parsed = extract_json_object(result.get("content", ""))
        patterns = []
        for p in parsed.get("patterns", []):
            patterns.append(Pattern(
                group_id=group_id,
                type=p.get("type", "unknown"),
                description=p.get("description", ""),
                severity=p.get("severity", "info"),
                recommendation=p.get("recommendation", ""),
            ))
        logger.info(f"Detected {len(patterns)} patterns in {group_id}")
        return patterns
    except Exception as e:
        # LLM/parse issues downgrade to heuristics rather than failing the caller.
        logger.info(f"Pattern detection LLM parse issue, using fallback: {e}")
        fallback = _heuristic_detect_patterns(group_id, all_signals)
        if fallback:
            logger.info(f"Pattern heuristic fallback produced {len(fallback)} patterns in {group_id}")
        return fallback

View File

@@ -0,0 +1,68 @@
"""
Query Agent — voice-aware signal context formatting for ThirdEye.
Provides _format_signal_for_context() which labels each ChromaDB signal with
its true origin (voice note, document, meeting, chat) so the LLM can produce
properly attributed answers like:
"Based on what @Raj said in a voice note on Mar 14 (45s), the team decided..."
"""
from datetime import datetime
VOICE_CITATION_INSTRUCTION = """
When context includes [VOICE NOTE — @name on Date (Xs)] signals, ALWAYS cite the voice note explicitly.
Example: "Based on what @Raj said in a voice note on Mar 14 (45s), the team decided to use PostgreSQL."
Never flatten voice signals into generic "the team discussed" language. Always name the speaker and source.
"""
def _format_signal_for_context(signal: dict) -> str:
"""
Format a ChromaDB signal as a context snippet for the Query Agent LLM.
Voice-sourced signals get explicit attribution so the LLM cites them correctly.
Accepts both flat signal dicts and dicts with a nested 'metadata' key.
"""
# Support both flat dicts and ChromaDB-style {"metadata": {...}, "document": ...}
meta = signal.get("metadata", signal)
source = meta.get("source", signal.get("source", "chat"))
sig_type = meta.get("type", signal.get("type", "unknown"))
summary = meta.get("summary", signal.get("summary", ""))
timestamp = meta.get("timestamp", signal.get("timestamp", ""))
date_str = ""
if timestamp:
try:
dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
date_str = dt.strftime("%b %d")
except Exception:
date_str = timestamp[:10]
if source == "voice":
speaker = meta.get("speaker", signal.get("speaker", "Unknown"))
duration = meta.get("voice_duration", signal.get("voice_duration", 0))
duration_str = f"{duration}s" if duration else "?"
return (
f"[VOICE NOTE — @{speaker} on {date_str} ({duration_str})] "
f"[{sig_type}] {summary}"
)
if source == "document":
return f"[DOCUMENT — {date_str}] [{sig_type}] {summary}"
if source == "link":
return f"[WEB LINK — {date_str}] [{sig_type}] {summary}"
if sig_type in ("meet_decision", "meet_action_item", "meet_blocker", "meet_summary"):
meeting_id = meta.get("meeting_id", signal.get("meeting_id", ""))
return f"[MEETING {meeting_id}{date_str}] [{sig_type}] {summary}"
entities_raw = meta.get("entities", signal.get("entities", []))
if isinstance(entities_raw, str):
import json
try:
entities_raw = json.loads(entities_raw)
except Exception:
entities_raw = []
sender_str = entities_raw[0] if entities_raw else ""
return f"[CHAT — {sender_str} on {date_str}] [{sig_type}] {summary}"

View File

@@ -0,0 +1,128 @@
"""Signal Extractor Agent — extracts structured signals from chat messages."""
import logging
from backend.providers import call_llm
from backend.db.models import Signal
from datetime import datetime
from backend.agents.json_utils import extract_json_object
logger = logging.getLogger("thirdeye.agents.signal_extractor")
# Lens-specific system prompts
LENS_PROMPTS = {
"dev": """You are the Signal Extractor for ThirdEye operating in DevLens mode.
You analyze batches of developer team chat messages and extract STRUCTURED SIGNALS.
Extract ONLY signals that represent meaningful technical information. Skip greetings, small talk, emoji reactions, and meta-conversation.
Signal types to look for:
- architecture_decision: Technology choices, design decisions with rationale
- tech_debt: Shortcuts, hardcoded values, "will fix later" patterns
- knowledge_silo_evidence: Only one person discusses a critical topic
- recurring_bug: Same issue mentioned repeatedly
- stack_decision: Technology/framework choices (proposed or decided)
- deployment_risk: Risky deployment practices
- workaround: Temporary fixes being applied repeatedly
- delivery_commitment: A team member gives an explicit or estimated timeline/ETA ("will take 2 days", "done by Friday", "approx 3 hours")
Pay SPECIAL attention to delivery commitments — capture the person's name, the work item, and the exact time estimate.
For EACH signal found, include it in the JSON array. If NO meaningful signals exist, return empty array.
Be SELECTIVE. Quality over quantity.""",
"product": """You are the Signal Extractor for ThirdEye operating in ProductLens mode.
Signal types to look for:
- feature_request: Features users or team members are asking for
- delivery_commitment: A team member gives an explicit or estimated timeline/ETA ("will take 2 days", "done by Friday", "approx 3 hours")
- user_pain_point: User difficulties, complaints, confusion
- roadmap_drift: Discussion of topics not on the current plan
- priority_conflict: Team members disagreeing on what's most important
- metric_mention: Specific numbers, conversion rates, performance data
- user_quote: Direct quotes from users/customers
- competitor_intel: Mentions of competitor actions or features
Pay SPECIAL attention to delivery commitments — capture the person's name, the work item, and the exact time estimate.
Be SELECTIVE. Quality over quantity.""",
"client": """You are the Signal Extractor for ThirdEye operating in ClientLens mode.
Signal types to look for:
- promise: Commitments made with deadlines (explicit or implicit)
- scope_creep: Additional requests introduced casually without formal change requests
- sentiment_signal: Tone changes (positive praise, growing frustration, formality shifts)
- unanswered_request: Questions or requests that haven't received responses
- satisfaction: Explicit positive or negative feedback
- escalation_risk: Mentions of involving management, expressing deadline concerns
- client_decision: Decisions made by the client
Pay SPECIAL attention to implicit deadlines ("by end of week", "before the meeting").
Be SELECTIVE. Quality over quantity.""",
"community": """You are the Signal Extractor for ThirdEye operating in CommunityLens mode.
Signal types: recommendation, event, issue, local_knowledge, question
Be SELECTIVE. Quality over quantity.""",
}
EXTRACTION_FORMAT = """
Respond ONLY with valid JSON in this exact format (no markdown, no backticks, no explanation):
{"signals": [{"type": "signal_type_here", "summary": "One clear sentence that includes specific names, numbers, timelines, and commitments", "entities": ["@person", "technology"], "severity": "low|medium|high|critical", "status": "proposed|decided|implemented|unresolved", "raw_quote": "Exact verbatim sentence(s) from the message that capture the full claim, including names, numbers, and timelines", "message_index": 0}]}
IMPORTANT for raw_quote: copy the FULL relevant sentence from the message, not just a topic keyword.
Example — message "Anirban: feature page revamp will take approx 2 more days"
WRONG raw_quote: "feature page revamp"
CORRECT raw_quote: "feature page revamp will take approx 2 more days"
If no signals found: {"signals": []}
"""
async def extract_signals(messages_text: str, group_id: str, lens: str = "dev") -> list[Signal]:
    """
    Extract structured signals from a batch of formatted chat messages.

    Args:
        messages_text: Formatted string like "[Alex]: Let's use Redis\\n[Bob]: Agreed"
        group_id: Telegram group ID
        lens: Active lens mode (dev, product, client, community)

    Returns:
        List of Signal objects (empty list on any extraction failure — non-fatal)
    """
    system_prompt = LENS_PROMPTS.get(lens, LENS_PROMPTS["dev"])
    messages = [
        {"role": "system", "content": system_prompt + "\n\n" + EXTRACTION_FORMAT},
        {"role": "user", "content": f"Extract signals from these messages:\n\n{messages_text}"},
    ]
    try:
        result = await call_llm("fast_large", messages, temperature=0.2, max_tokens=2000)
        parsed = extract_json_object(result.get("content", ""))
        raw_signals = parsed.get("signals", [])
        # Guard against a malformed payload where "signals" is not a list.
        if not isinstance(raw_signals, list):
            raw_signals = []
        # Convert to Signal objects; skip individual malformed entries.
        signals = []
        for raw in raw_signals:
            try:
                signal = Signal(
                    group_id=group_id,
                    lens=lens,
                    type=raw.get("type", "unknown"),
                    summary=raw.get("summary", ""),
                    entities=raw.get("entities", []),
                    severity=raw.get("severity", "low"),
                    status=raw.get("status", "unknown"),
                    raw_quote=raw.get("raw_quote", ""),
                    timestamp=datetime.utcnow().isoformat(),
                )
                signals.append(signal)
            except Exception as e:
                logger.warning(f"Failed to parse signal: {e}")
                continue
        # .get() here so a missing "provider" key cannot raise inside the try
        # and silently discard signals that were already parsed successfully.
        logger.info(
            f"Extracted {len(signals)} signals from {group_id} (lens={lens}) "
            f"via {result.get('provider', 'unknown')}"
        )
        return signals
    except Exception as e:
        logger.error(f"Signal extraction failed: {e}")
        return []

View File

@@ -0,0 +1,281 @@
"""
Voice Handler
Orchestrates the full pipeline for Telegram voice messages and video notes:
Telegram voice/video_note message
-> download audio bytes
-> transcribe via Groq Whisper (voice_transcriber.py)
-> build a voice_transcript signal (stored raw for full-text search)
-> run transcript through process_message_batch (signal extraction)
-> all extracted signals carry voice attribution metadata
Voice metadata attached to every extracted signal:
source: "voice"
voice_file_id: Telegram file ID
voice_duration: seconds
speaker: sender display name
"""
import logging
import uuid
from datetime import datetime, timezone
from backend.agents.voice_transcriber import (
transcribe_audio, download_telegram_audio, format_duration
)
from backend.config import ENABLE_VOICE_TRANSCRIPTION, VOICE_STORE_TRANSCRIPT
from backend.db.chroma import store_signals
from backend.pipeline import process_message_batch
logger = logging.getLogger("thirdeye.agents.voice_handler")
# --- Voice transcript signal builder -----------------------------------------
def build_voice_transcript_signal(
    transcript: str,
    sender: str,
    group_id: str,
    voice_file_id: str,
    duration_seconds: int,
    language: str,
    timestamp: str,
) -> dict:
    """
    Build a voice_transcript signal carrying the full raw transcription.

    Stored alongside any extracted signals so the complete transcript stays
    searchable in ChromaDB even when no structured signals were extracted.
    """
    # Identity + content of the signal.
    identity = {
        "id": str(uuid.uuid4()),
        "type": "voice_transcript",
        "summary": f"[Voice {format_duration(duration_seconds)}] @{sender}: {transcript[:200]}",
        "raw_quote": transcript,
    }
    # Fixed classification defaults for a raw transcript.
    classification = {
        "severity": "low",
        "status": "transcribed",
        "sentiment": "neutral",
        "urgency": "none",
    }
    # Searchable metadata.
    context = {
        "entities": [f"@{sender}"],
        "keywords": _extract_voice_keywords(transcript),
        "timestamp": timestamp,
        "group_id": group_id,
        "lens": "voice",
    }
    # Voice attribution for /ask citations.
    attribution = {
        "source": "voice",
        "voice_file_id": voice_file_id,
        "voice_duration": duration_seconds,
        "voice_language": language,
        "speaker": sender,
    }
    return {**identity, **classification, **context, **attribution}
def _extract_voice_keywords(text: str) -> list[str]:
"""Simple keyword extraction from transcript text."""
stopwords = {
"the", "a", "an", "is", "are", "was", "were", "will", "to", "of",
"in", "on", "at", "for", "by", "with", "this", "that", "and", "or",
"but", "we", "i", "it", "be", "do", "have", "has", "had", "not",
"so", "just", "like", "yeah", "okay", "um", "uh", "you", "me",
}
words = text.lower().split()
keywords = [w.strip(".,!?;:\"'") for w in words if len(w) > 3 and w not in stopwords]
return list(dict.fromkeys(keywords))[:12]
def _inject_voice_metadata(signals: list, voice_meta: dict) -> list[dict]:
"""
Inject voice attribution into every signal extracted from a voice transcript.
Accepts both Signal Pydantic model objects and plain dicts.
This ensures /ask can cite the voice source in its answers.
"""
result = []
for signal in signals:
sig = signal.model_dump() if hasattr(signal, "model_dump") else dict(signal)
sig["source"] = "voice"
sig["voice_file_id"] = voice_meta.get("voice_file_id", "")
sig["voice_duration"] = voice_meta.get("duration_seconds", 0)
sig["voice_language"] = voice_meta.get("language", "")
sig["speaker"] = voice_meta.get("sender", "Unknown")
if "[Voice]" not in sig.get("summary", ""):
sig["summary"] = f"[Voice @{voice_meta.get('sender', '?')}] {sig['summary']}"
result.append(sig)
return result
# --- Fallback signal builder -------------------------------------------------
# Keywords that hint at a signal type when the LLM extraction returns nothing
# Keywords that hint at a signal type when the LLM extraction returns nothing.
# Used by _build_fallback_signal: the type with the largest word overlap wins.
_FALLBACK_TYPE_HINTS = {
    # Requests for new or changed functionality / UI work
    "feature_request": {
        "need", "needs", "required", "require", "want", "should", "missing",
        "add", "feature", "ui", "ux", "design", "change", "changes", "update",
        "improve", "improvement", "responsiveness", "responsive",
    },
    # Someone is stuck or something is broken
    "blocker": {
        "blocked", "blocking", "blocker", "stuck", "waiting", "can't", "cannot",
        "issue", "problem", "broken", "fails", "failing",
    },
    # Concrete work someone intends to do
    "action_item": {
        "will", "going", "plan", "todo", "do", "fix", "implement", "setup",
        "create", "build", "deploy", "check",
    },
    # Expressions of concern, urgency, or schedule pressure
    "risk": {
        "risk", "risky", "concern", "worried", "urgent", "urgently", "critical",
        "deadline", "delay", "late",
    },
}
def _build_fallback_signal(
    transcript: str,
    sender: str,
    group_id: str,
    timestamp: str,
    voice_meta: dict,
) -> dict:
    """
    Build a best-effort structured signal for a voice transcript the LLM
    extracted nothing from. Scores keyword-hint overlap per signal type and
    falls back to 'feature_request' when no hints match at all.
    """
    token_set = set(transcript.lower().split())
    # Score each candidate type by hint overlap, preserving hint-dict order
    # so ties resolve the same way every run.
    hint_scores = {}
    for sig_type, hints in _FALLBACK_TYPE_HINTS.items():
        hint_scores[sig_type] = len(token_set & hints)
    if any(hint_scores.values()):
        best_type = max(hint_scores, key=hint_scores.get)
    else:
        best_type = "feature_request"
    # Any urgency word bumps severity/urgency to high.
    is_urgent = bool(token_set & {"urgent", "urgently", "asap", "immediately", "critical", "now"})
    severity = "high" if is_urgent else "medium"
    summary = transcript[:200].strip()
    if len(transcript) > 200:
        summary = summary + "..."
    return {
        "id": str(uuid.uuid4()),
        "type": best_type,
        "summary": f"[Voice @{sender}] {summary}",
        "raw_quote": transcript[:500],
        "severity": severity,
        "status": "unresolved",
        "sentiment": "neutral",
        "urgency": "high" if is_urgent else "medium",
        "entities": [f"@{sender}"],
        "keywords": _extract_voice_keywords(transcript),
        "timestamp": timestamp,
        "group_id": group_id,
        "lens": "voice",
        "source": "voice",
        "speaker": sender,
        "voice_file_id": voice_meta.get("voice_file_id", ""),
        "voice_duration": voice_meta.get("duration_seconds", 0),
        "voice_language": voice_meta.get("language", ""),
    }
# --- Main handler ------------------------------------------------------------
async def handle_voice_message(
    bot,
    group_id: str,
    sender: str,
    file_id: str,
    duration_seconds: int,
    message_date,
    is_video_note: bool = False,
) -> dict:
    """
    Full pipeline for a single voice or video note message.

    Args:
        bot: Telegram bot instance used to download the media file.
        group_id: Telegram group ID the message came from.
        sender: Display name of the sender (used for attribution).
        file_id: Telegram file ID of the voice/video_note media.
        duration_seconds: Duration from Telegram metadata.
        message_date: Message datetime; treated as naive UTC via
            replace(tzinfo=...) — NOTE(review): confirm the Telegram client
            actually delivers naive-UTC datetimes here.
        is_video_note: True for round video notes (MP4) vs voice notes (OGG).

    Returns:
        {"ok": True, "transcript": "...", "signals_extracted": 3, "duration": 45, ...}
        OR {"ok": False, "reason": "...", "error": "..."}
    """
    # Feature flag: bail out before any network work when disabled.
    if not ENABLE_VOICE_TRANSCRIPTION:
        return {"ok": False, "reason": "disabled", "error": "Voice transcription is disabled"}
    msg_type = "video note" if is_video_note else "voice message"
    logger.info(f"Processing {msg_type} from {sender} in {group_id} ({duration_seconds}s)")
    # 1. Download audio — a failed download aborts the whole pipeline.
    try:
        audio_bytes = await download_telegram_audio(bot, file_id)
    except Exception as e:
        logger.error(f"Failed to download audio from {sender}: {e}")
        return {"ok": False, "reason": "download_failed", "error": str(e)}
    # 2. Transcribe — the filename extension is a format hint for Whisper
    # (video notes are MP4, voice messages are OGG/Opus).
    filename = "audio.mp4" if is_video_note else "audio.ogg"
    transcription = await transcribe_audio(
        audio_bytes,
        filename=filename,
        duration_seconds=duration_seconds,
    )
    if not transcription["ok"]:
        logger.info(f"Transcription skipped for {sender}: {transcription['reason']}")
        return {"ok": False, "reason": transcription["reason"], "error": transcription.get("error", "")}
    transcript = transcription["transcript"]
    language = transcription.get("language", "unknown")
    timestamp = (
        message_date.replace(tzinfo=timezone.utc).isoformat()
        if message_date else datetime.utcnow().isoformat()
    )
    # 3. Store raw voice transcript signal (config-gated) so the full text is
    # searchable even if structured extraction below yields nothing.
    if VOICE_STORE_TRANSCRIPT:
        transcript_signal = build_voice_transcript_signal(
            transcript=transcript,
            sender=sender,
            group_id=group_id,
            voice_file_id=file_id,
            duration_seconds=duration_seconds,
            language=language,
            timestamp=timestamp,
        )
        store_signals(group_id, [transcript_signal])
        logger.info(f"Voice transcript stored for {sender} ({len(transcript)} chars)")
    # 4. Run through signal extraction pipeline — treat as a regular text message
    voice_meta = {
        "sender": sender,
        "voice_file_id": file_id,
        "duration_seconds": duration_seconds,
        "language": language,
    }
    messages = [{
        "sender": sender,
        "text": transcript,
        "timestamp": timestamp,
        "source": "voice",
        "voice_file_id": file_id,
        "voice_duration": duration_seconds,
    }]
    try:
        extracted_signals = await process_message_batch(group_id, messages)
        extracted_signals = _inject_voice_metadata(extracted_signals, voice_meta)
        signals_count = len(extracted_signals)
        # Fallback: if the LLM extracted nothing from a meaningful voice message,
        # create a generic signal so the content is still searchable as structured data.
        if signals_count == 0 and len(transcript.split()) >= 5:
            fallback = _build_fallback_signal(transcript, sender, group_id, timestamp, voice_meta)
            store_signals(group_id, [fallback])
            signals_count = 1
            logger.info(f"Voice fallback signal created for {sender} (0 from LLM)")
    except Exception as e:
        # Extraction failure is non-fatal: the raw transcript (step 3) may
        # already be stored, so report zero structured signals and continue.
        logger.error(f"Signal extraction failed for voice from {sender}: {e}")
        signals_count = 0
    logger.info(
        f"Voice pipeline complete: {sender}, {duration_seconds}s, "
        f"{signals_count} signals, transcript={len(transcript)} chars"
    )
    return {
        "ok": True,
        "transcript": transcript,
        "signals_extracted": signals_count,
        "duration": duration_seconds,
        "sender": f"@{sender}",
        "language": language,
    }

View File

@@ -0,0 +1,194 @@
"""
Voice Transcriber — Groq Whisper integration.
Uses Groq's whisper-large-v3 model (free, already in provider stack) to transcribe
audio bytes from Telegram voice messages and video notes into plain text.
Groq Whisper endpoint: https://api.groq.com/openai/v1/audio/transcriptions
Supported formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, opus, wav, webm
Telegram voice messages: OGG/Opus
Telegram video notes: MP4
Free tier limits: 7,200 seconds of audio / hour on Groq free plan.
At avg 30s per voice note: ~240 voice notes / hour — more than any team sends.
"""
import io
import logging
from typing import Optional
import httpx
from backend.config import (
GROQ_API_KEY,
VOICE_LANGUAGE,
VOICE_MAX_DURATION_SECONDS,
VOICE_MIN_DURATION_SECONDS,
)
logger = logging.getLogger("thirdeye.agents.voice_transcriber")
# Groq's OpenAI-compatible audio transcription endpoint.
GROQ_WHISPER_URL = "https://api.groq.com/openai/v1/audio/transcriptions"
# Whisper model served by Groq; see module docstring for supported formats.
WHISPER_MODEL = "whisper-large-v3"
# Groq file size limit for Whisper: 25 MB
GROQ_MAX_FILE_BYTES = 25 * 1024 * 1024
# --- Main transcription function ---------------------------------------------
async def transcribe_audio(
    audio_bytes: bytes,
    filename: str = "audio.ogg",
    duration_seconds: Optional[int] = None,
) -> dict:
    """
    Transcribe audio bytes using Groq Whisper.

    Args:
        audio_bytes: Raw audio data (OGG, MP4, WAV, etc.)
        filename: Filename hint for the API (determines format detection)
        duration_seconds: Voice message duration from Telegram metadata (for pre-filtering)

    Returns:
        {
            "ok": True,
            "transcript": "The full transcribed text...",
            "language": "en",
            "duration": 45,
            "word_count": 120,
        }
        OR on failure:
        {
            "ok": False,
            "reason": "too_long" | "too_short" | "empty" | "file_too_large" | "api_error" | "no_speech",
            "error": "optional error string",
        }
    """
    # Pre-flight checks — fail fast before spending an HTTP round trip.
    if not GROQ_API_KEY or len(GROQ_API_KEY) < 5:
        return {"ok": False, "reason": "api_error", "error": "GROQ_API_KEY not set"}
    if not audio_bytes:
        return {"ok": False, "reason": "empty", "error": "No audio bytes received"}
    if len(audio_bytes) > GROQ_MAX_FILE_BYTES:
        return {
            "ok": False,
            "reason": "file_too_large",
            "error": f"Audio is {len(audio_bytes) / 1024 / 1024:.1f}MB — Groq limit is 25MB",
        }
    # Duration gating only applies when Telegram metadata supplied a duration.
    if duration_seconds is not None:
        if duration_seconds < VOICE_MIN_DURATION_SECONDS:
            return {
                "ok": False,
                "reason": "too_short",
                "error": f"Voice note is {duration_seconds}s — minimum is {VOICE_MIN_DURATION_SECONDS}s",
            }
        if duration_seconds > VOICE_MAX_DURATION_SECONDS:
            return {
                "ok": False,
                "reason": "too_long",
                "error": f"Voice note is {duration_seconds}s — maximum is {VOICE_MAX_DURATION_SECONDS}s",
            }
    # Determine MIME type from filename extension (defaults to Telegram's OGG).
    ext_to_mime = {
        ".ogg": "audio/ogg",
        ".opus": "audio/ogg",
        ".mp3": "audio/mpeg",
        ".mp4": "video/mp4",
        ".m4a": "audio/mp4",
        ".wav": "audio/wav",
        ".flac": "audio/flac",
        ".webm": "audio/webm",
    }
    ext = "." + filename.rsplit(".", 1)[-1].lower() if "." in filename else ".ogg"
    mime_type = ext_to_mime.get(ext, "audio/ogg")
    form_data = {
        "model": WHISPER_MODEL,
        "response_format": "verbose_json",  # returns language detection
        "temperature": "0",  # deterministic transcription
    }
    if VOICE_LANGUAGE:
        form_data["language"] = VOICE_LANGUAGE
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(
                GROQ_WHISPER_URL,
                headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
                files={"file": (filename, io.BytesIO(audio_bytes), mime_type)},
                data=form_data,
            )
            resp.raise_for_status()
            data = resp.json()
    except httpx.HTTPStatusError as e:
        # Try to pull a human-readable message out of the error body.
        error_text = ""
        try:
            error_text = e.response.json().get("error", {}).get("message", e.response.text[:200])
        except Exception:
            error_text = e.response.text[:200]
        if e.response.status_code == 429:
            logger.warning("Groq Whisper rate limited")
            return {"ok": False, "reason": "api_error", "error": "Rate limited — try again shortly"}
        logger.error(f"Groq Whisper HTTP error {e.response.status_code}: {error_text}")
        return {"ok": False, "reason": "api_error", "error": f"HTTP {e.response.status_code}: {error_text}"}
    except httpx.TimeoutException:
        logger.warning("Groq Whisper request timed out")
        return {"ok": False, "reason": "api_error", "error": "Request timed out after 60s"}
    except Exception as e:
        logger.error(f"Groq Whisper unexpected error: {e}")
        return {"ok": False, "reason": "api_error", "error": str(e)}
    # Parse response
    transcript = (data.get("text") or "").strip()
    if not transcript:
        return {"ok": False, "reason": "no_speech", "error": "Whisper returned empty transcript"}
    # Detect if Whisper only returned noise markers (exact-match check only).
    noise_patterns = {"[music]", "[noise]", "[silence]", "[inaudible]", "(music)", "(noise)"}
    if transcript.lower() in noise_patterns:
        return {"ok": False, "reason": "no_speech", "error": f"Only noise detected: {transcript}"}
    detected_language = data.get("language", VOICE_LANGUAGE or "unknown")
    word_count = len(transcript.split())
    logger.info(
        f"Whisper transcribed {duration_seconds or '?'}s audio -> "
        f"{word_count} words [{detected_language}]: {transcript[:60]}..."
    )
    return {
        "ok": True,
        "transcript": transcript,
        "language": detected_language,
        "duration": duration_seconds,
        "word_count": word_count,
    }
# --- Telegram-specific download helper ---------------------------------------
async def download_telegram_audio(bot, file_id: str) -> bytes:
    """
    Download a Telegram file (voice or video_note) and return raw bytes.
    """
    handle = await bot.get_file(file_id)
    # download_as_bytearray yields a mutable bytearray; hand back an immutable copy.
    return bytes(await handle.download_as_bytearray())
def format_duration(seconds: Optional[int]) -> str:
    """Format seconds into a human-readable string: '1m 34s' or '45s'.

    Args:
        seconds: Duration in seconds, or None when unknown.

    Returns:
        '?' for None, 'Xs' below one minute, 'Xm Ys' otherwise.
    """
    # Annotation fixed to Optional[int]: None is an explicitly supported input.
    if seconds is None:
        return "?"
    if seconds >= 60:
        minutes, remainder = divmod(seconds, 60)
        return f"{minutes}m {remainder}s"
    return f"{seconds}s"

View File

@@ -0,0 +1,84 @@
"""Web Search Agent — Tavily integration for real-time web context."""
import asyncio
import logging

from backend.config import TAVILY_API_KEY, ENABLE_WEB_SEARCH
logger = logging.getLogger("thirdeye.agents.web_search")
# Module-level singleton; populated lazily by _get_client() on first use.
_tavily_client = None
def _get_client():
    """Lazily build and cache the module-level Tavily client.

    Returns None when the API key is missing/too short or the SDK is
    unavailable; callers must handle a None result.
    """
    global _tavily_client
    # Already initialized — reuse the cached client.
    if _tavily_client is not None:
        return _tavily_client
    # No usable API key: leave the cache untouched (stays None).
    if not TAVILY_API_KEY or len(TAVILY_API_KEY) <= 5:
        return _tavily_client
    try:
        from tavily import TavilyClient
        _tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
        logger.info("Tavily client initialized")
    except ImportError:
        logger.error("tavily-python not installed. Run: pip install tavily-python")
    except Exception as e:
        logger.error(f"Tavily client init failed: {e}")
    return _tavily_client
async def search_web(query: str, max_results: int = 5) -> list[dict]:
    """
    Search the web using Tavily and return structured results.

    Args:
        query: Search query string
        max_results: Max results to return (1-10)

    Returns:
        List of {title, url, content, score} dicts, sorted by relevance.
        Empty list when disabled, unconfigured, or on any API failure.
    """
    if not ENABLE_WEB_SEARCH:
        logger.info("Web search is disabled via feature flag")
        return []
    client = _get_client()
    if not client:
        logger.warning("Tavily client not available (missing API key or install)")
        return []
    try:
        # TavilyClient.search is a synchronous, blocking HTTP call — run it in
        # a worker thread so it doesn't stall the asyncio event loop for other
        # bot handlers while the request is in flight.
        response = await asyncio.to_thread(
            client.search,
            query=query,
            max_results=max_results,
            search_depth="basic",  # "basic" is faster + free-tier friendly; "advanced" for deeper
            include_answer=False,
            include_raw_content=False,
        )
        results = [
            {
                "title": r.get("title", ""),
                "url": r.get("url", ""),
                "content": r.get("content", ""),
                "score": r.get("score", 0.0),
            }
            for r in response.get("results", [])
        ]
        logger.info(f"Tavily returned {len(results)} results for: {query[:60]}")
        return results
    except Exception as e:
        logger.error(f"Tavily search failed: {e}")
        return []
def format_search_results_for_llm(results: list[dict]) -> str:
    """Format Tavily results into a context string for the Query Agent.

    Each result becomes a '[Web Result N]' block with source URL and a
    content preview capped at 500 characters; blocks are blank-line separated.
    """
    if not results:
        return ""
    blocks = []
    for idx, item in enumerate(results, start=1):
        preview = item["content"][:500] if item["content"] else "No content"
        blocks.append(
            f"[Web Result {idx}] {item['title']}\n"
            f"Source: {item['url']}\n"
            f"Content: {preview}"
        )
    return "\n\n".join(blocks)