mirror of
https://github.com/arkorty/B.Tech-Project-III.git
synced 2026-04-19 12:41:48 +00:00
init
This commit is contained in:
34
thirdeye/backend/agents/classifier.py
Normal file
34
thirdeye/backend/agents/classifier.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""Classifier Agent — adds metadata tags to extracted signals."""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.models import Signal
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.classifier")
|
||||
|
||||
SYSTEM_PROMPT = """You are a fast metadata classifier. Given an extracted signal, add classification tags.
|
||||
|
||||
Respond ONLY with valid JSON (no markdown, no backticks):
|
||||
{"sentiment": "positive|neutral|negative|urgent", "urgency": "none|low|medium|high|critical", "keywords": ["3-5 searchable keywords"]}
|
||||
"""
|
||||
|
||||
|
||||
async def classify_signal(signal: Signal) -> Signal:
    """Add classification metadata to a signal.

    Asks the "fast_small" model for sentiment/urgency/keywords tags and
    writes them onto the signal. Any failure (LLM call, JSON parsing) is
    non-fatal: the signal's existing values are kept and a warning is logged.
    """
    prompt = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Classify this signal:\nType: {signal.type}\nSummary: {signal.summary}\nQuote: {signal.raw_quote}"},
    ]

    try:
        response = await call_llm("fast_small", prompt, temperature=0.1, max_tokens=200)
        tags = extract_json_object(response.get("content", ""))
        # Fall back to the signal's current value for any missing field.
        signal.sentiment = tags.get("sentiment", signal.sentiment)
        signal.urgency = tags.get("urgency", signal.urgency)
        signal.keywords = tags.get("keywords", signal.keywords)
    except Exception as e:
        # Keep defaults — classification failure is non-fatal
        logger.warning(f"Classification failed, using defaults: {e}")

    return signal
|
||||
107
thirdeye/backend/agents/context_detector.py
Normal file
107
thirdeye/backend/agents/context_detector.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""Context Detector Agent — auto-classifies group type from messages."""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.context_detector")
|
||||
|
||||
SYSTEM_PROMPT = """You analyze a batch of messages from a Telegram group and determine what TYPE of group this is.
|
||||
|
||||
CLASSIFY into exactly ONE:
|
||||
- "dev" — Software engineering team (code, PRs, deployments, bugs, tech stack)
|
||||
- "product" — Product/business team (features, users, metrics, roadmap, competitors)
|
||||
- "client" — Client/agency channel (deliverables, timelines, approvals, invoices)
|
||||
- "community" — Community/interest group (recommendations, events, local info, casual)
|
||||
|
||||
Respond ONLY with valid JSON (no markdown, no backticks):
|
||||
{"detected_lens": "dev|product|client|community", "confidence": 0.0-1.0, "evidence": ["signal1", "signal2", "signal3"]}
|
||||
"""
|
||||
|
||||
VALID_LENSES = {"dev", "product", "client", "community"}
|
||||
|
||||
|
||||
def _heuristic_detect_context(messages_text: str) -> dict:
|
||||
"""Rule-based fallback when LLM output is malformed/unavailable."""
|
||||
text = (messages_text or "").lower()
|
||||
|
||||
lens_keywords = {
|
||||
"dev": [
|
||||
"bug", "deploy", "deployment", "api", "database", "schema", "postgres", "mongo",
|
||||
"timeout", "endpoint", "pod", "pr", "code", "docker", "stack", "integration",
|
||||
],
|
||||
"product": [
|
||||
"feature", "roadmap", "user", "users", "client", "customers", "complain", "pain",
|
||||
"prioritize", "priority", "enterprise", "competitor", "demo", "sso", "dark mode",
|
||||
"mobile", "stability", "integration",
|
||||
],
|
||||
"client": [
|
||||
"invoice", "deadline", "deliverable", "approval", "sign-off", "scope", "payment",
|
||||
"contract", "proposal", "timeline", "meeting",
|
||||
],
|
||||
"community": [
|
||||
"event", "meetup", "recommend", "anyone", "community", "local", "where can i",
|
||||
"suggestion", "friends", "weekend",
|
||||
],
|
||||
}
|
||||
|
||||
scores = {
|
||||
lens: sum(text.count(keyword) for keyword in keywords)
|
||||
for lens, keywords in lens_keywords.items()
|
||||
}
|
||||
|
||||
best_lens = max(scores, key=scores.get)
|
||||
best_score = scores[best_lens]
|
||||
if best_score == 0:
|
||||
best_lens = "dev"
|
||||
|
||||
evidence = [k for k in lens_keywords[best_lens] if k in text][:3]
|
||||
confidence = min(0.95, 0.35 + 0.08 * best_score) if best_score > 0 else 0.0
|
||||
|
||||
return {
|
||||
"detected_lens": best_lens,
|
||||
"confidence": round(confidence, 2),
|
||||
"evidence": evidence or ["heuristic_fallback"],
|
||||
}
|
||||
|
||||
|
||||
async def detect_context(messages_text: str) -> dict:
    """Detect group type from a batch of messages.

    Returns {"detected_lens", "confidence", "evidence"} with the lens
    validated against VALID_LENSES, confidence clamped to [0, 1], and
    evidence capped at five strings. On any LLM/parsing failure, falls back
    to the keyword heuristic and tags its evidence with "detection_failed".
    """
    llm_messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Classify this group based on these messages:\n\n{messages_text}"},
    ]

    try:
        response = await call_llm(
            "fast_large",
            llm_messages,
            temperature=0.1,
            max_tokens=300,
            response_format={"type": "json_object"},
        )
        payload = extract_json_object(response.get("content", ""))

        # Normalize the lens and reject anything outside the known set.
        lens = str(payload.get("detected_lens", "dev")).strip().lower()
        if lens not in VALID_LENSES:
            lens = "dev"

        # Coerce confidence to float; non-numeric values become 0.5.
        conf = payload.get("confidence", 0.5)
        try:
            conf = float(conf)
        except (TypeError, ValueError):
            conf = 0.5

        # Evidence must be a list of strings.
        evidence_raw = payload.get("evidence", [])
        if not isinstance(evidence_raw, list):
            evidence_raw = [str(evidence_raw)]

        return {
            "detected_lens": lens,
            "confidence": max(0.0, min(1.0, conf)),
            "evidence": [str(item) for item in evidence_raw][:5],
        }
    except Exception as e:
        logger.error(f"Context detection failed: {e}")
        fallback = _heuristic_detect_context(messages_text)
        fallback["evidence"] = fallback["evidence"] + ["detection_failed"]
        return fallback
|
||||
287
thirdeye/backend/agents/cross_group_analyst.py
Normal file
287
thirdeye/backend/agents/cross_group_analyst.py
Normal file
@@ -0,0 +1,287 @@
|
||||
"""Cross-Group Analyst Agent — detects blind spots between multiple teams."""
|
||||
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.chroma import get_all_signals, get_group_ids
|
||||
from backend.db.models import CrossGroupInsight
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.cross_group_analyst")
|
||||
|
||||
SYSTEM_PROMPT = """You are the Cross-Group Intelligence Analyst for ThirdEye. This is the MOST IMPORTANT analysis.
|
||||
|
||||
You receive intelligence summaries from MULTIPLE Telegram groups. Your job is to find BLIND SPOTS — information in one group that should be in another.
|
||||
|
||||
Detect:
|
||||
- blocked_handoff: Team A waiting for something from Team B, but Team B doesn't know
|
||||
- conflicting_decision: Team A decided X, Team B decided the opposite
|
||||
- information_silo: Critical info in Group A never reached Group B
|
||||
- promise_reality_gap: Promise made in one group, but another group shows it's blocked
|
||||
- duplicated_effort: Two teams working on similar things unknowingly
|
||||
|
||||
Respond ONLY with valid JSON (no markdown):
|
||||
{"insights": [{"type": "insight_type", "description": "SPECIFIC description naming the groups, people, and topics", "group_a": {"name": "group_name", "evidence": "what was said"}, "group_b": {"name": "group_name", "evidence": "what was said or NOT said"}, "severity": "warning|critical", "recommendation": "Specific action"}]}
|
||||
|
||||
If no cross-group issues: {"insights": []}
|
||||
Be SPECIFIC. Name the groups, people, topics, and exact conflicts."""
|
||||
|
||||
|
||||
# Signal-type buckets used by the heuristic silo check.
_OPS_RISK_TYPES = {"recurring_bug", "workaround", "tech_debt", "deployment_risk"}
_PLANNING_TYPES = {"feature_request", "roadmap_drift", "priority_conflict", "user_pain_point"}


def _silo_insight(
    ops_group: str,
    ops_types: set,
    planning_group: str,
    planning_types: set,
) -> CrossGroupInsight:
    """Build an information_silo insight: ops_group carries operational risk
    signals while planning_group is focused on planning/user demands."""
    return CrossGroupInsight(
        type="information_silo",
        description=(
            f"{ops_group} shows operational risk signals while {planning_group} is focused on planning/user demands, "
            "suggesting risk context is not shared across groups."
        ),
        group_a={
            "name": ops_group,
            "evidence": f"Operational risk signal types: {sorted(ops_types.intersection(_OPS_RISK_TYPES))}",
        },
        group_b={
            "name": planning_group,
            "evidence": f"Planning-focused signal types: {sorted(planning_types.intersection(_PLANNING_TYPES))}",
        },
        severity="warning",
        recommendation="Add a weekly cross-functional risk sync so product planning reflects current engineering constraints.",
    )


def _heuristic_cross_group_insights(
    group_summaries: dict[str, list[dict]],
) -> list[CrossGroupInsight]:
    """Generate best-effort cross-group insights when LLM output is unavailable.

    Runs three keyword/type heuristics over every unordered pair of groups:
    blocked handoffs (waiting/blocked language in one group, no spec talk in
    the other), promise-vs-reality gaps, and information silos (checked in
    both directions via _silo_insight). Results are deduped by
    (type, group_a name, group_b name) and capped at 5.
    """
    insights: list[CrossGroupInsight] = []

    # Pre-compute lowercase combined text and signal-type list per group.
    normalized = {}
    for group_name, signals in group_summaries.items():
        docs = [str(s.get("document", "")) for s in signals]
        normalized[group_name] = {
            "text": " ".join(docs).lower(),
            "signals": signals,
            "types": [
                str(s.get("metadata", {}).get("type", "unknown")).lower()
                for s in signals
            ],
        }

    group_names = list(normalized.keys())
    for i in range(len(group_names)):
        for j in range(i + 1, len(group_names)):
            group_a = group_names[i]
            group_b = group_names[j]
            text_a = normalized[group_a]["text"]
            text_b = normalized[group_b]["text"]
            types_a = set(normalized[group_a]["types"])
            types_b = set(normalized[group_b]["types"])

            # Detect a likely blocked handoff around design/spec dependencies.
            a_waiting = any(
                k in text_a for k in ["waiting", "blocked", "design spec", "specs"]
            )
            b_mentions_specs = any(
                k in text_b for k in ["design spec", "specs", "design"]
            )
            if a_waiting and not b_mentions_specs:
                insights.append(
                    CrossGroupInsight(
                        type="blocked_handoff",
                        description=(
                            f"{group_a} indicates dependency blockage (design/spec inputs), "
                            f"but {group_b} has no corresponding discussion of that dependency."
                        ),
                        group_a={
                            "name": group_a,
                            "evidence": "Contains waiting/blocked language tied to specs or design dependency.",
                        },
                        group_b={
                            "name": group_b,
                            "evidence": "No clear mention of design specs/dependency handoff in available signals.",
                        },
                        severity="warning",
                        recommendation=(
                            f"Create a shared handoff item between {group_a} and {group_b} for design/spec ownership "
                            "with an explicit due date."
                        ),
                    )
                )

            # Detect likely promise vs execution mismatch.
            b_promises = any(
                k in text_b
                for k in ["demo", "friday", "promised", "told the client", "ready by"]
            )
            a_blocked = any(
                k in text_a
                for k in ["blocked", "waiting", "can't proceed", "cannot proceed"]
            )
            if b_promises and a_blocked:
                insights.append(
                    CrossGroupInsight(
                        type="promise_reality_gap",
                        description=(
                            f"{group_b} signals delivery promises while {group_a} reports blockers that may prevent those commitments."
                        ),
                        group_a={
                            "name": group_a,
                            "evidence": "Signals include active blockers/waiting dependencies.",
                        },
                        group_b={
                            "name": group_b,
                            "evidence": "Signals include explicit client/demo commitments and timelines.",
                        },
                        severity="critical",
                        recommendation="Run a joint risk review and re-baseline commitments before the next client update.",
                    )
                )

            # Type-based silo detection when lexical cues are weak — both directions.
            if types_a & _OPS_RISK_TYPES and types_b & _PLANNING_TYPES:
                insights.append(_silo_insight(group_a, types_a, group_b, types_b))
            if types_b & _OPS_RISK_TYPES and types_a & _PLANNING_TYPES:
                insights.append(_silo_insight(group_b, types_b, group_a, types_a))

    # Dedupe by (type, ops-side name, other-side name) and cap the list.
    deduped = []
    seen_keys = set()
    for insight in insights:
        key = (insight.type, insight.group_a.get("name"), insight.group_b.get("name"))
        if key in seen_keys:
            continue
        seen_keys.add(key)
        deduped.append(insight)

    return deduped[:5]
|
||||
|
||||
|
||||
async def analyze_cross_group(
    group_summaries: dict[str, list[dict]] | None = None,
) -> list[CrossGroupInsight]:
    """
    Analyze intelligence across all monitored groups to find blind spots.

    Args:
        group_summaries: Optional pre-built summaries. If None, loads from ChromaDB.

    Returns:
        List of CrossGroupInsight objects; empty when fewer than two groups
        are available. Falls back to the keyword heuristic when the LLM call
        or its JSON output fails.
    """
    if group_summaries is None:
        group_ids = get_group_ids()
        if len(group_ids) < 2:
            logger.info("Need at least 2 groups for cross-group analysis")
            return []

        group_summaries = {gid: get_all_signals(gid) for gid in group_ids}

    if len(group_summaries) < 2:
        return []

    # Format summaries for the LLM. Use .get() throughout so a malformed
    # stored signal cannot crash the whole analysis.
    summary_parts = []
    for group_name, signals in group_summaries.items():
        signal_lines = []
        for s in signals[:30]:  # Limit per group to fit context
            meta = s.get("metadata", {})
            signal_lines.append(f"  - [{meta.get('type', '?')}] {s.get('document', '')[:120]}")

        summary_parts.append(
            f"=== GROUP: {group_name} ({len(signals)} total signals) ===\n"
            + "\n".join(signal_lines)
        )

    full_summary = "\n\n".join(summary_parts)

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"Analyze cross-group intelligence:\n\n{full_summary}",
        },
    ]

    try:
        result = await call_llm(
            "reasoning",
            messages,
            temperature=0.2,
            max_tokens=2000,
            response_format={"type": "json_object"},
        )
        parsed = extract_json_object(result.get("content", ""))
        insights = []
        for i in parsed.get("insights", []):
            insights.append(
                CrossGroupInsight(
                    type=i.get("type", "unknown"),
                    description=i.get("description", ""),
                    group_a=i.get("group_a", {}),
                    group_b=i.get("group_b", {}),
                    severity=i.get("severity", "warning"),
                    recommendation=i.get("recommendation", ""),
                )
            )

        logger.info(f"Cross-group analysis found {len(insights)} insights")
        return insights

    except Exception as e:
        # Include a short head of the raw LLM output (if the call got that far)
        # to make malformed-JSON failures diagnosable from logs.
        raw = ""
        if "result" in locals() and isinstance(result, dict):
            raw = str(result.get("content", ""))[:300].replace("\n", " ")
        logger.info(f"Cross-group LLM parse issue, using fallback: {e}; raw_head={raw}")
        fallback = _heuristic_cross_group_insights(group_summaries)
        if fallback:
            logger.info(
                f"Cross-group heuristic fallback produced {len(fallback)} insights"
            )
        return fallback
|
||||
200
thirdeye/backend/agents/document_ingestor.py
Normal file
200
thirdeye/backend/agents/document_ingestor.py
Normal file
@@ -0,0 +1,200 @@
|
||||
"""Document Ingestor — extracts text from PDFs, DOCX, TXT and chunks for RAG storage."""
|
||||
import os
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.document_ingestor")
|
||||
|
||||
# --- Text Extraction ---
|
||||
|
||||
def extract_text_from_pdf(file_path: str) -> list[dict]:
    """Extract text from PDF, returns list of {page: int, text: str}.

    Pages with no extractable text are skipped. Errors are logged and
    whatever pages were extracted before the failure are returned.
    """
    from PyPDF2 import PdfReader

    extracted: list[dict] = []
    try:
        for page_number, pdf_page in enumerate(PdfReader(file_path).pages, start=1):
            content = pdf_page.extract_text()
            if content and content.strip():
                extracted.append({"page": page_number, "text": content.strip()})
    except Exception as e:
        logger.error(f"PDF extraction failed for {file_path}: {e}")

    return extracted
|
||||
|
||||
|
||||
def extract_text_from_docx(file_path: str) -> list[dict]:
    """Extract text from DOCX, returns list of {page: 1, text: str} (DOCX has no real pages)."""
    from docx import Document

    try:
        paragraphs = Document(file_path).paragraphs
        # Join non-empty paragraphs into a single pseudo-page.
        body = "\n".join(p.text for p in paragraphs if p.text.strip())
        if body.strip():
            return [{"page": 1, "text": body.strip()}]
    except Exception as e:
        logger.error(f"DOCX extraction failed for {file_path}: {e}")

    return []
|
||||
|
||||
|
||||
def extract_text_from_txt(file_path: str) -> list[dict]:
    """Extract text from plain text file.

    Returns [{"page": 1, "text": ...}] or [] when the file is empty or
    unreadable (errors are logged, never raised).
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
            content = handle.read().strip()
    except Exception as e:
        logger.error(f"TXT extraction failed for {file_path}: {e}")
        return []

    return [{"page": 1, "text": content}] if content else []
|
||||
|
||||
|
||||
# Dispatch table: file extension → extractor function. Text-like formats
# (md/csv/json/log) reuse the plain-text extractor; extract_text() consults
# this mapping and rejects anything not listed here.
EXTRACTORS = {
    ".pdf": extract_text_from_pdf,
    ".docx": extract_text_from_docx,
    ".txt": extract_text_from_txt,
    ".md": extract_text_from_txt,
    ".csv": extract_text_from_txt,
    ".json": extract_text_from_txt,
    ".log": extract_text_from_txt,
}
|
||||
|
||||
|
||||
def extract_text(file_path: str) -> list[dict]:
    """Route to correct extractor based on file extension.

    Unsupported extensions are logged and yield an empty list.
    """
    extension = os.path.splitext(file_path)[1].lower()
    handler = EXTRACTORS.get(extension)
    if handler is None:
        logger.warning(f"Unsupported file type: {extension} ({file_path})")
        return []
    return handler(file_path)
|
||||
|
||||
|
||||
# --- Chunking ---
|
||||
|
||||
def chunk_text(text: str, max_chars: int = 1500, overlap_chars: int = 200) -> list[str]:
    """
    Split text into overlapping chunks.

    Uses paragraph boundaries when possible, falls back to sentence boundaries,
    then hard character splits. ~1500 chars ≈ ~375 tokens for embedding.
    """
    # Short text fits in one chunk — return as-is (no overlap needed).
    if len(text) <= max_chars:
        return [text]

    # Split by paragraphs first (a "paragraph" here is any non-empty line).
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]

    chunks = []
    current_chunk = ""  # accumulator for the chunk being built

    for para in paragraphs:
        # If adding this paragraph stays under limit, add it
        # (+1 accounts for the joining newline).
        if len(current_chunk) + len(para) + 1 <= max_chars:
            current_chunk = (current_chunk + "\n" + para).strip()
        else:
            # Save current chunk if it has content
            if current_chunk:
                chunks.append(current_chunk)

            # If single paragraph is too long, split it by sentences
            # (". " → newline is a crude sentence boundary; abbreviations
            # like "e.g. " will also split here).
            if len(para) > max_chars:
                sentences = para.replace(". ", ".\n").split("\n")
                sub_chunk = ""
                for sent in sentences:
                    if len(sub_chunk) + len(sent) + 1 <= max_chars:
                        sub_chunk = (sub_chunk + " " + sent).strip()
                    else:
                        if sub_chunk:
                            chunks.append(sub_chunk)
                        # NOTE(review): a single sentence longer than max_chars
                        # is carried as-is — despite the docstring, there is no
                        # hard character split, so oversized chunks can escape.
                        sub_chunk = sent
                # The last sub-chunk becomes the open accumulator so the next
                # paragraph can still be merged onto it.
                if sub_chunk:
                    current_chunk = sub_chunk
                else:
                    current_chunk = ""
            else:
                current_chunk = para

    # Flush the final open chunk.
    if current_chunk:
        chunks.append(current_chunk)

    # Add overlap: prepend last N chars of previous chunk to each subsequent chunk
    if overlap_chars > 0 and len(chunks) > 1:
        overlapped = [chunks[0]]
        for i in range(1, len(chunks)):
            prev_tail = chunks[i - 1][-overlap_chars:]
            # Find a word boundary in the overlap
            # (drop the leading partial word so the overlap starts cleanly).
            space_idx = prev_tail.find(" ")
            if space_idx > 0:
                prev_tail = prev_tail[space_idx + 1:]
            overlapped.append(prev_tail + " " + chunks[i])
        chunks = overlapped

    return chunks
|
||||
|
||||
|
||||
# --- Main Ingestion ---
|
||||
|
||||
def ingest_document(
    file_path: str,
    group_id: str,
    shared_by: str = "Unknown",
    filename: str | None = None,
) -> list[dict]:
    """
    Full pipeline: extract text → chunk → produce signal dicts ready for ChromaDB.

    Args:
        file_path: Path to the downloaded file on disk
        group_id: Telegram group ID
        shared_by: Who shared the file
        filename: Original filename (for metadata); defaults to the basename
            of file_path

    Returns:
        List of signal dicts ready for store_signals()
    """
    if filename is None:
        filename = os.path.basename(file_path)

    # Extract
    pages = extract_text(file_path)
    if not pages:
        # Fixed: the log message previously lacked the filename placeholder.
        logger.warning(f"No text extracted from {filename}")
        return []

    # Chunk each page
    signals = []
    total_chunks = 0

    for page_data in pages:
        page_num = page_data["page"]
        chunks = chunk_text(page_data["text"])

        for chunk_text_str in chunks:
            if len(chunk_text_str.strip()) < 30:
                continue  # Skip tiny chunks

            signal = {
                "id": str(uuid.uuid4()),
                "type": "document_knowledge",
                # Fixed: summary previously contained a literal "(unknown)"
                # instead of the source filename.
                "summary": f"[{filename} p{page_num}] {chunk_text_str[:150]}...",
                "entities": [f"@{shared_by}", filename],
                "severity": "low",
                "status": "reference",
                "sentiment": "neutral",
                "urgency": "none",
                "raw_quote": chunk_text_str,
                # NOTE: naive UTC timestamp (utcnow is deprecated in 3.12);
                # kept for consistency with existing stored signals.
                "timestamp": datetime.utcnow().isoformat(),
                "group_id": group_id,
                "lens": "document",
                "keywords": [filename, f"page_{page_num}", "document", shared_by],
            }
            signals.append(signal)
            total_chunks += 1

    logger.info(f"Ingested {filename}: {len(pages)} pages → {total_chunks} chunks for group {group_id}")
    return signals
|
||||
373
thirdeye/backend/agents/jira_agent.py
Normal file
373
thirdeye/backend/agents/jira_agent.py
Normal file
@@ -0,0 +1,373 @@
|
||||
"""
|
||||
Jira Signal Agent
|
||||
Takes ThirdEye signals and converts them into well-formed Jira tickets.
|
||||
|
||||
Responsibilities:
|
||||
1. Map signal type → Jira issue type + priority
|
||||
2. LLM-generate a clean ticket title and structured description from signal context
|
||||
3. Extract assignee names and match them to Jira account IDs (best-effort)
|
||||
4. Raise the ticket via jira_client and mark the signal in ChromaDB
|
||||
5. Bulk-raise: process a group's unraised high-severity signals in one call
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
from backend.providers import call_llm
|
||||
from backend.integrations.jira_client import (
|
||||
create_issue, search_issues, add_comment, is_configured, search_users
|
||||
)
|
||||
from backend.db.chroma import store_signals, mark_signal_as_raised, get_raised_signal_ids
|
||||
from backend.config import (
|
||||
JIRA_DEFAULT_PROJECT, JIRA_DEFAULT_ISSUE_TYPE,
|
||||
JIRA_AUTO_RAISE_SEVERITY
|
||||
)
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.jira_agent")
|
||||
|
||||
|
||||
# ─── Signal → Jira type mapping ──────────────────────────────────────────────
|
||||
|
||||
# Maps ThirdEye signal type → (Jira issue type, default priority)
|
||||
# Note: Issue types must match what's available in your Jira project
|
||||
# Common types: Task, Bug, Story, Epic, Workstream (project-specific)
|
||||
SIGNAL_TYPE_MAP = {
    # Dev signals
    "tech_debt": ("Task", "Low"),
    "recurring_bug": ("Task", "High"),  # Changed from Bug to Task
    "architecture_decision": ("Task", "Medium"),
    "deployment_risk": ("Task", "High"),
    "workaround": ("Task", "Medium"),
    "knowledge_silo": ("Task", "Medium"),
    # Product signals
    "feature_request": ("Task", "Medium"),  # Changed from Story to Task
    "priority_conflict": ("Task", "High"),
    "sentiment_shift": ("Task", "Medium"),
    # Client signals
    "promise": ("Task", "High"),
    "scope_creep": ("Task", "High"),
    "risk": ("Task", "High"),
    # Meet signals
    "meet_action_item": ("Task", "Medium"),
    "meet_blocker": ("Task", "Highest"),
    "meet_risk": ("Task", "High"),
    "meet_decision": ("Task", "Medium"),
    "meet_open_q": ("Task", "Low"),
    # Generic
    "blocker": ("Task", "Highest"),
    "decision": ("Task", "Medium"),
    "action_item": ("Task", "Medium"),
}

# Signal severity → Jira priority. When a signal carries a known severity it
# overrides the per-type default priority from SIGNAL_TYPE_MAP.
SEVERITY_TO_PRIORITY = {
    "critical": "Highest",
    "high": "High",
    "medium": "Medium",
    "low": "Low",
}

# Only signals whose type appears in SIGNAL_TYPE_MAP may be raised as tickets.
RAISEABLE_TYPES = set(SIGNAL_TYPE_MAP.keys())
|
||||
|
||||
|
||||
# ─── Assignee resolution ─────────────────────────────────────────────────────
|
||||
|
||||
async def resolve_assignee_account_id(name: str) -> str | None:
    """
    Resolve a person's display name (or @name) to their Jira account ID.

    Matching order: exact display-name match, then "every search word appears
    somewhere in the display name", then Jira's first search result. Returns
    None for an empty name, an empty search result, or any search error
    (logged as a warning).
    """
    if not name:
        return None

    query = name.lstrip("@").strip()
    try:
        candidates = await search_users(query)
        if not candidates:
            return None

        query_lower = query.lower()

        # 1) Exact display-name match first.
        for candidate in candidates:
            if candidate["display_name"].lower() == query_lower:
                return candidate["account_id"]

        # 2) Partial match: all query words contained in the display name.
        tokens = query_lower.split()
        for candidate in candidates:
            display = candidate["display_name"].lower()
            if all(tok in display for tok in tokens):
                return candidate["account_id"]

        # 3) Last resort: Jira's top-ranked result.
        return candidates[0]["account_id"]
    except Exception as e:
        logger.warning(f"resolve_assignee_account_id failed for '{name}': {e}")
        return None
|
||||
|
||||
|
||||
# ─── LLM ticket generation ───────────────────────────────────────────────────
|
||||
|
||||
TICKET_GEN_SYSTEM_PROMPT = """You are a senior engineering manager writing Jira tickets from team intelligence signals.
|
||||
|
||||
Given a ThirdEye signal (a structured piece of extracted team knowledge), write a Jira ticket.
|
||||
|
||||
Return ONLY a valid JSON object with exactly these fields:
|
||||
{
|
||||
"summary": "Short, actionable ticket title (max 100 chars). Start with a verb. No jargon.",
|
||||
"description": "Full ticket description. Include: what the issue is, context from the signal, why it matters, suggested next steps. Use blank lines between sections. Use '- ' for bullet points. Max 400 words.",
|
||||
"labels": ["label1", "label2"],
|
||||
"assignee_name": "First name or @name of the person to assign, or null if unclear"
|
||||
}
|
||||
|
||||
Label rules:
|
||||
- Always include "thirdeye" and "auto-raised"
|
||||
- Add the signal type as a label (e.g. "tech-debt", "recurring-bug")
|
||||
- Add "urgent" if severity is high or critical
|
||||
- Labels must not have spaces (use hyphens)
|
||||
|
||||
Summary rules:
|
||||
- Starts with a verb: "Fix", "Investigate", "Address", "Resolve", "Document", "Implement"
|
||||
- Be specific — "Fix intermittent checkout timeout" NOT "Fix bug"
|
||||
- Never exceed 100 characters
|
||||
|
||||
Description must include:
|
||||
1. What: clear 1-sentence problem statement
|
||||
2. Context: what was actually said / detected (cite the signal)
|
||||
3. Impact: why this matters to the team or product
|
||||
4. Suggested next steps (2-3 bullet points)
|
||||
|
||||
Return JSON only — no markdown, no preamble."""
|
||||
|
||||
|
||||
async def generate_ticket_content(signal: dict) -> dict:
    """
    Use an LLM to generate a clean, context-rich Jira ticket from a ThirdEye signal.
    Returns {"summary": str, "description": str, "labels": list, "assignee_name": str|None}

    On any LLM or parsing failure a basic template ticket is returned instead.
    """
    # Flatten the signal into a readable prompt for the ticket-writer model.
    context_lines = [
        f"Signal type: {signal.get('type', 'unknown')}",
        f"Summary: {signal.get('summary', '')}",
        f"Raw quote: {signal.get('raw_quote', '')[:300]}",
        f"Severity: {signal.get('severity', 'medium')}",
        f"Entities involved: {', '.join(signal.get('entities', []))}",
        f"Keywords: {', '.join(signal.get('keywords', []))}",
        f"Timestamp: {signal.get('timestamp', '')}",
        f"Group: {signal.get('group_id', '')}",
        f"Lens: {signal.get('lens', '')}",
    ]
    signal_text = "\n".join(context_lines)

    try:
        llm_result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": TICKET_GEN_SYSTEM_PROMPT},
                {"role": "user", "content": signal_text},
            ],
            temperature=0.2,
            max_tokens=800,
            response_format={"type": "json_object"},
        )
        payload = llm_result["content"].strip()
        # Strip a markdown code fence if the model added one despite instructions.
        if payload.startswith("```"):
            payload = payload.split("```")[1]
            if payload.startswith("json"):
                payload = payload[4:]
        return json.loads(payload)

    except Exception as e:
        logger.warning(f"Ticket generation LLM failed: {e}. Using fallback.")
        # Fallback: build a basic ticket without LLM
        sig_type = signal.get("type", "unknown").replace("_", " ").title()
        return {
            "summary": f"{sig_type}: {signal.get('summary', 'Unknown issue')[:80]}",
            "description": (
                f"Signal detected by ThirdEye.\n\n"
                f"Type: {signal.get('type', 'unknown')}\n"
                f"Summary: {signal.get('summary', '')}\n\n"
                f"Raw context:\n{signal.get('raw_quote', '(none)')[:300]}\n\n"
                f"Severity: {signal.get('severity', 'medium')}"
            ),
            "labels": ["thirdeye", "auto-raised", signal.get("type", "unknown").replace("_", "-")],
            "assignee_name": None,
        }
|
||||
|
||||
|
||||
# ─── Main raise function ──────────────────────────────────────────────────────
|
||||
|
||||
async def raise_ticket_for_signal(
    signal: dict,
    group_id: str,
    project_key: str | None = None,
    force: bool = False,
    assignee_account_id: str | None = None,
) -> dict:
    """
    Create a Jira ticket for a single ThirdEye signal.

    Flow: config/dedup guards → map signal type to Jira issue type and
    priority → LLM-generated ticket content (templated fallback inside
    generate_ticket_content) → optional assignee resolution → issue
    creation → dedup bookkeeping in ChromaDB.

    Args:
        signal: The signal dict from ChromaDB
        group_id: The group this signal belongs to (for dedup tracking)
        project_key: Override project (default: JIRA_DEFAULT_PROJECT)
        force: If True, raise even if already raised before
        assignee_account_id: Explicit Jira account id — when set, the
            name-based resolution step is skipped entirely

    Returns:
        {"ok": True, "key": "ENG-42", "url": "...", "summary": "..."}
        OR
        {"ok": False, "reason": "already_raised" | "not_raiseable" | "jira_error", ...}
    """
    if not is_configured():
        return {"ok": False, "reason": "jira_not_configured"}

    signal_id = signal.get("id", "")
    signal_type = signal.get("type", "")

    # Check if this signal type is raiseable
    if signal_type not in RAISEABLE_TYPES:
        return {"ok": False, "reason": "not_raiseable", "signal_type": signal_type}

    # Check if already raised (skip if force=True)
    if not force and signal_id:
        already_raised = get_raised_signal_ids(group_id)
        if signal_id in already_raised:
            return {"ok": False, "reason": "already_raised", "signal_id": signal_id}

    # Determine Jira issue type and priority from signal.
    # Severity mapping takes precedence over the per-type default priority.
    default_type, default_priority = SIGNAL_TYPE_MAP.get(signal_type, (JIRA_DEFAULT_ISSUE_TYPE, "Medium"))
    severity = signal.get("severity", "medium").lower()
    priority = SEVERITY_TO_PRIORITY.get(severity, default_priority)

    # Generate ticket content via LLM
    ticket_content = await generate_ticket_content(signal)

    summary = ticket_content.get("summary", signal.get("summary", "ThirdEye signal")[:100])
    description = ticket_content.get("description", signal.get("summary", ""))
    labels = ticket_content.get("labels", ["thirdeye", "auto-raised"])
    # Always ensure thirdeye label is present
    if "thirdeye" not in labels:
        labels.append("thirdeye")

    # Append ThirdEye metadata as a context section in the description
    meta_section = (
        f"\n\n---\n"
        f"Raised by: ThirdEye\n"
        f"Signal ID: {signal_id}\n"
        f"Group: {group_id}\n"
        f"Detected: {signal.get('timestamp', datetime.utcnow().isoformat())}"
    )
    description = description + meta_section

    # Resolve assignee: explicit account_id wins, then signal override name, then LLM-extracted name
    if not assignee_account_id:
        name_hint = signal.get("assignee_override") or ticket_content.get("assignee_name")
        if name_hint:
            assignee_account_id = await resolve_assignee_account_id(name_hint)
            if assignee_account_id:
                logger.info(f"Resolved assignee '{name_hint}' → {assignee_account_id}")
            else:
                # Unresolvable names are non-fatal: the ticket is simply unassigned.
                logger.warning(f"Could not resolve assignee '{name_hint}' to a Jira account")

    # Create the ticket
    result = await create_issue(
        project_key=project_key or JIRA_DEFAULT_PROJECT,
        summary=summary,
        description=description,
        issue_type=default_type,
        priority=priority,
        labels=labels,
        assignee_account_id=assignee_account_id,
    )

    if result.get("ok"):
        jira_key = result["key"]
        jira_url = result["url"]
        # Mark this signal as raised in ChromaDB so we never duplicate it
        if signal_id:
            mark_signal_as_raised(
                group_id, signal_id, jira_key,
                jira_url=jira_url,
                jira_summary=summary,
                jira_priority=priority,
            )
        logger.info(f"Raised Jira ticket {jira_key} for signal {signal_id} ({signal_type})")
        return {
            "ok": True,
            "key": jira_key,
            "url": jira_url,
            "summary": summary,
            "issue_type": default_type,
            "priority": priority,
            "assignee_account_id": assignee_account_id,
        }
    else:
        logger.error(f"Jira ticket creation failed: {result}")
        return {
            "ok": False,
            "reason": "jira_error",
            "error": result.get("error"),
            "details": result.get("details"),
        }
|
||||
|
||||
|
||||
async def bulk_raise_for_group(
    group_id: str,
    signals: list[dict],
    min_severity: str = None,
    project_key: str = None,
    max_tickets: int = 10,
) -> list[dict]:
    """
    Raise Jira tickets for many signals from one group in a single call.

    A signal is eligible when its type is raiseable, its severity meets the
    threshold (default: JIRA_AUTO_RAISE_SEVERITY), and it has not already
    been raised. Eligible signals are processed most-severe first, capped
    at max_tickets so a noisy group cannot flood Jira.

    Returns:
        One result dict per attempted ticket (raise result plus the source
        signal's type and a truncated summary).
    """
    rank_of = {"low": 0, "medium": 1, "high": 2, "critical": 3}
    threshold = rank_of.get((min_severity or JIRA_AUTO_RAISE_SEVERITY).lower(), 2)  # default: high
    raised_before = get_raised_signal_ids(group_id)

    def _eligible(sig: dict) -> bool:
        # Type must map to Jira, severity must clear the bar, and dedup applies.
        if sig.get("type", "") not in RAISEABLE_TYPES:
            return False
        if rank_of.get(sig.get("severity", "low").lower(), 0) < threshold:
            return False
        return sig.get("id", "") not in raised_before

    # Most severe first; sorted() is stable so original order breaks ties.
    candidates = sorted(
        (s for s in signals if _eligible(s)),
        key=lambda s: rank_of.get(s.get("severity", "low"), 0),
        reverse=True,
    )[:max_tickets]

    results = []
    for sig in candidates:
        outcome = await raise_ticket_for_signal(sig, group_id, project_key=project_key)
        results.append({**outcome, "signal_type": sig.get("type"), "signal_summary": sig.get("summary", "")[:80]})

    logger.info(f"Bulk raise for group {group_id}: {len(results)} tickets from {len(signals)} signals")
    return results
|
||||
|
||||
|
||||
def format_raise_result_for_telegram(result: dict) -> str:
    """Render one ticket-raise outcome as a Markdown snippet for Telegram."""
    if result.get("ok"):
        issue_type = result.get("issue_type", "Task")
        priority = result.get("priority", "Medium")
        summary = result.get("summary", "")[:90]
        return (
            f"✅ [{result['key']}]({result['url']}) — "
            f"*{issue_type}* | {priority} priority\n"
            f"   _{summary}_"
        )

    reason = result.get("reason", "unknown")
    if reason == "already_raised":
        return "⏭️ Already raised — skipped"
    if reason == "not_raiseable":
        return f"⚪ Signal type `{result.get('signal_type', '?')}` — not mapped to Jira"
    return f"❌ Failed: {result.get('error', reason)}"
|
||||
43
thirdeye/backend/agents/json_utils.py
Normal file
43
thirdeye/backend/agents/json_utils.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""Utilities for robustly parsing JSON from LLM responses."""
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
def extract_json_object(content: str) -> dict:
    """Extract and parse the first JSON object from raw LLM output.

    Tolerates markdown code fences and wrapper prose around the object.
    Raises json.JSONDecodeError when no top-level object can be decoded.
    """
    cleaned = (content or "").strip()
    if not cleaned:
        raise json.JSONDecodeError("Empty LLM response", cleaned, 0)

    # Peel off a surrounding markdown code fence, if any.
    if cleaned.startswith("```"):
        cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned, flags=re.IGNORECASE)
        cleaned = re.sub(r"\s*```$", "", cleaned)

    cleaned = cleaned.strip()
    if not cleaned:
        raise json.JSONDecodeError("Empty LLM response after cleanup", cleaned, 0)

    # Fast path: the whole payload is already a JSON object.
    try:
        whole = json.loads(cleaned)
    except json.JSONDecodeError:
        whole = None
    if isinstance(whole, dict):
        return whole

    # Slow path: attempt a raw_decode at every "{" until one yields a dict.
    # This handles wrapper text more reliably than regex, especially with
    # nested braces.
    decoder = json.JSONDecoder()
    for start in (i for i, ch in enumerate(cleaned) if ch == "{"):
        try:
            candidate, _ = decoder.raw_decode(cleaned[start:])
        except json.JSONDecodeError:
            continue
        if isinstance(candidate, dict):
            return candidate

    raise json.JSONDecodeError("No valid top-level JSON object found", cleaned, 0)
|
||||
213
thirdeye/backend/agents/link_fetcher.py
Normal file
213
thirdeye/backend/agents/link_fetcher.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""Link Fetcher — extracts, summarizes, and stores content from URLs shared in chat."""
|
||||
import re
|
||||
import uuid
|
||||
import logging
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from backend.providers import call_llm
|
||||
from backend.config import ENABLE_LINK_FETCH
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.link_fetcher")
|
||||
|
||||
# Patterns to skip (images, downloads, social media embeds, etc.)
|
||||
SKIP_PATTERNS = [
    # Static image assets — nothing to summarize.
    r"\.(png|jpg|jpeg|gif|svg|webp|ico|bmp)(\?.*)?$",
    # Archives, installers and executables.
    r"\.(zip|tar|gz|rar|7z|exe|msi|dmg|apk|deb)(\?.*)?$",
    # Audio/video files.
    r"\.(mp3|mp4|avi|mov|mkv|wav|flac)(\?.*)?$",
    # Social embeds that need JS rendering and yield little plain text.
    r"^https?://(www\.)?(twitter|x)\.com/.*/status/",
    r"^https?://(www\.)?instagram\.com/p/",
    r"^https?://(www\.)?tiktok\.com/",
    r"^https?://(www\.)?youtube\.com/shorts/",
    r"^https?://t\.me/",  # Other Telegram links
]

# Compiled once at import time; should_fetch() evaluates these per URL.
SKIP_COMPILED = [re.compile(p, re.IGNORECASE) for p in SKIP_PATTERNS]
|
||||
|
||||
|
||||
def extract_urls(text: str) -> list[str]:
    """Return every HTTP/HTTPS URL found in *text*.

    Trailing punctuation is trimmed, and matches that are implausibly
    short (10 characters or fewer after cleanup) are dropped.
    """
    raw_matches = re.findall(r"https?://[^\s<>\"')\]},;]+", text)
    trimmed = (match.rstrip(".,;:!?)") for match in raw_matches)
    return [url for url in trimmed if len(url) > 10]
|
||||
|
||||
|
||||
def should_fetch(url: str) -> bool:
    """Return True unless *url* matches a skip pattern (media, downloads, social embeds)."""
    return not any(rx.search(url) for rx in SKIP_COMPILED)
|
||||
|
||||
|
||||
async def fetch_url_content(url: str, timeout: float = 15.0) -> dict | None:
    """
    Fetch a URL and extract main text content.

    Two phases: (1) HTTP GET with redirects and a browser-like UA, bailing
    out on non-200 or non-HTML responses; (2) BeautifulSoup extraction of
    readable text with boilerplate elements removed. All failures are
    logged and reported as None — this function never raises.

    Args:
        url: Absolute HTTP/HTTPS URL to fetch.
        timeout: Per-request timeout in seconds.

    Returns:
        {title, text, url} or None if fetch fails
    """
    try:
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
            headers={
                "User-Agent": "Mozilla/5.0 (compatible; ThirdEye/1.0; +https://thirdeye.dev)",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            },
        ) as client:
            response = await client.get(url)

            if response.status_code != 200:
                logger.info(f"URL returned {response.status_code}: {url[:80]}")
                return None

            # Only HTML-family payloads are worth parsing for text.
            content_type = response.headers.get("content-type", "")
            if "text/html" not in content_type and "application/xhtml" not in content_type:
                logger.info(f"Skipping non-HTML content ({content_type}): {url[:80]}")
                return None

            html = response.text

    except httpx.TimeoutException:
        logger.info(f"URL timed out: {url[:80]}")
        return None
    except Exception as e:
        logger.info(f"URL fetch failed ({type(e).__name__}): {url[:80]}")
        return None

    # Parse HTML
    try:
        soup = BeautifulSoup(html, "html.parser")

        # Extract title
        title = ""
        if soup.title and soup.title.string:
            title = soup.title.string.strip()

        # Remove script, style, nav, footer, header elements
        for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "form"]):
            tag.decompose()

        # Try to find main content area; fall back to the whole document.
        main = soup.find("main") or soup.find("article") or soup.find("div", {"role": "main"})
        if main:
            text = main.get_text(separator="\n", strip=True)
        else:
            text = soup.get_text(separator="\n", strip=True)

        # Clean up: drop blank lines and per-line surrounding whitespace.
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        text = "\n".join(lines)

        # Skip if too little content
        if len(text) < 100:
            logger.info(f"Too little text content ({len(text)} chars): {url[:80]}")
            return None

        # Truncate very long content
        if len(text) > 8000:
            text = text[:8000] + "\n\n[Content truncated]"

        return {
            "title": title or url,
            "text": text,
            "url": url,
        }

    except Exception as e:
        logger.warning(f"HTML parsing failed for {url[:80]}: {e}")
        return None
|
||||
|
||||
|
||||
async def summarize_content(title: str, text: str, url: str) -> str:
    """Produce a short LLM summary of a fetched page.

    Only the first 3000 characters of *text* are sent to the model. On any
    LLM failure, falls back to a raw 200-character snippet of the text.
    """
    prompt_messages = [
        {"role": "system", "content": """You are a content summarizer for ThirdEye.
Given the title and text of a web page, produce a concise 2-4 sentence summary that captures the key information.
Focus on: main topic, key facts, any actionable insights, any deadlines or decisions mentioned.
Respond with ONLY the summary text, nothing else."""},
        {"role": "user", "content": f"Title: {title}\nURL: {url}\n\nContent:\n{text[:3000]}"},
    ]

    try:
        response = await call_llm("fast_small", prompt_messages, temperature=0.2, max_tokens=300)
        return response["content"].strip()
    except Exception as e:
        logger.warning(f"Link summarization failed: {e}")
        # Fallback: use first 200 chars of text
        return text[:200] + "..."
|
||||
|
||||
|
||||
async def process_links_from_message(
    text: str,
    group_id: str,
    shared_by: str = "Unknown",
) -> list[dict]:
    """
    Full pipeline: extract URLs from message → fetch → summarize → produce signals.

    Designed to be called in the background (non-blocking to the main message pipeline).
    A per-URL failure is logged and skipped, never raised.

    Args:
        text: Raw message text that may contain URLs.
        group_id: Group the resulting signals are attributed to.
        shared_by: Display name of the user who shared the link.

    Returns:
        List of signal dicts ready for store_signals()
    """
    # Feature flag: link fetching can be disabled globally via config.
    if not ENABLE_LINK_FETCH:
        return []

    urls = extract_urls(text)
    fetchable = [u for u in urls if should_fetch(u)]

    if not fetchable:
        return []

    signals = []

    # Process up to 3 links per message to avoid overload
    for url in fetchable[:3]:
        try:
            content = await fetch_url_content(url)
            if not content:
                continue

            summary = await summarize_content(content["title"], content["text"], url)

            # One low-severity "link_knowledge" reference signal per link.
            signal = {
                "id": str(uuid.uuid4()),
                "type": "link_knowledge",
                "summary": f"[Link: {content['title'][:80]}] {summary[:200]}",
                "entities": [f"@{shared_by}", url[:100]],
                "severity": "low",
                "status": "reference",
                "sentiment": "neutral",
                "urgency": "none",
                "raw_quote": summary,
                "timestamp": datetime.utcnow().isoformat(),
                "group_id": group_id,
                "lens": "link",
                "keywords": [content["title"][:50], "link", "web", shared_by],
            }
            signals.append(signal)
            logger.info(f"Link ingested: {content['title'][:50]} ({url[:60]})")

        except Exception as e:
            logger.warning(f"Link processing failed for {url[:60]}: {e}")
            continue

    return signals
|
||||
188
thirdeye/backend/agents/meet_cross_ref.py
Normal file
188
thirdeye/backend/agents/meet_cross_ref.py
Normal file
@@ -0,0 +1,188 @@
|
||||
"""
|
||||
Meet Cross-Reference Agent
|
||||
Finds connections between meeting signals and existing Telegram group signals.
|
||||
Surfaces: confirmations (meeting agrees with chat), contradictions (meeting contradicts chat),
|
||||
and blind spots (meeting discusses something chat groups don't know about).
|
||||
"""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.chroma import query_signals, get_all_signals
|
||||
from backend.config import MEET_CROSS_REF_GROUPS, MEET_DEFAULT_GROUP_ID
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.meet_cross_ref")
|
||||
|
||||
CROSS_REF_SYSTEM_PROMPT = """You are an expert at finding connections between meeting discussions and team chat history.
|
||||
|
||||
You will receive:
|
||||
1. MEETING SIGNALS — decisions, action items, blockers, risks from a recent Google Meet
|
||||
2. CHAT SIGNALS — existing signals from team Telegram groups
|
||||
|
||||
Find meaningful connections across three categories:
|
||||
|
||||
CONFIRMATIONS: Meeting agrees with or reinforces something from chat history
|
||||
CONTRADICTIONS: Meeting decision conflicts with what was said/decided in chat
|
||||
BLIND SPOTS: Important things from the meeting that the chat teams don't seem to know about
|
||||
|
||||
Return ONLY a valid JSON object:
|
||||
{
|
||||
"confirmations": [
|
||||
{"meeting_signal": "...", "chat_signal": "...", "group": "...", "significance": "high|medium|low"}
|
||||
],
|
||||
"contradictions": [
|
||||
{"meeting_signal": "...", "chat_signal": "...", "group": "...", "impact": "...", "significance": "high|medium|low"}
|
||||
],
|
||||
"blind_spots": [
|
||||
{"meeting_signal": "...", "teams_unaware": ["group1", "group2"], "recommendation": "..."}
|
||||
]
|
||||
}
|
||||
|
||||
Rules:
|
||||
- Only include HIGH confidence matches — do not stretch for weak connections
|
||||
- Keep each signal description concise (1 sentence max)
|
||||
- significance "high" = this matters for team alignment; "medium" = worth noting; "low" = minor
|
||||
- If a category has nothing meaningful, use an empty array []
|
||||
- Return JSON only"""
|
||||
|
||||
|
||||
async def find_cross_references(
    meeting_id: str,
    group_id: str = None,
    cross_ref_group_ids: list[str] = None,
) -> dict:
    """
    Compare meeting signals against chat group signals.

    Pipeline: load the meeting's structured signals → gather up to 20
    signals from each configured chat group → ask the reasoning model for
    confirmations, contradictions and blind spots. Every failure path
    returns a result dict with an "error" key — this function never raises.

    Args:
        meeting_id: The meeting to analyze
        group_id: ChromaDB group where meet signals are stored (defaults to MEET_DEFAULT_GROUP_ID)
        cross_ref_group_ids: Groups to compare against (defaults to MEET_CROSS_REF_GROUPS from config)

    Returns:
        Dict with confirmations, contradictions, blind_spots lists
    """
    group_id = group_id or MEET_DEFAULT_GROUP_ID
    cross_ref_group_ids = cross_ref_group_ids or MEET_CROSS_REF_GROUPS

    if not cross_ref_group_ids:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": "No cross-reference groups configured. Set MEET_CROSS_REF_GROUPS in .env",
        }

    # 1. Get meeting signals (decisions, actions, blockers, risks — NOT raw chunks)
    meet_signals = query_signals(group_id, meeting_id, n_results=30)
    structured_meet = [
        s for s in meet_signals
        if s.get("metadata", {}).get("type") in ("meet_decision", "meet_action_item", "meet_blocker", "meet_risk", "meet_open_q")
    ]

    if not structured_meet:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": f"No structured signals found for meeting {meeting_id}. Has it been processed yet?",
        }

    # 2. Get signals from each cross-reference group
    chat_context_parts = []
    for gid in cross_ref_group_ids:
        try:
            all_sig = get_all_signals(gid)
            if all_sig:
                formatted = "\n".join([
                    f"  [{s.get('metadata', {}).get('type', '?')}] {s.get('document', '')[:120]}"
                    for s in all_sig[:20]  # Cap at 20 per group to stay within token limits
                ])
                chat_context_parts.append(f"Group '{gid}':\n{formatted}")
        except Exception as e:
            # One unreadable group must not abort the whole cross-reference.
            logger.warning(f"Could not load signals for group {gid}: {e}")

    if not chat_context_parts:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": "Could not load any signals from cross-reference groups.",
        }

    # 3. Format inputs for LLM
    meet_text = "\n".join([
        f"  [{s.get('metadata', {}).get('type', '?')}] {s.get('document', '')[:150]}" for s in structured_meet
    ])
    chat_text = "\n\n".join(chat_context_parts)

    prompt = f"""MEETING SIGNALS (from meeting: {meeting_id}):
{meet_text}

CHAT SIGNALS (from monitored Telegram groups):
{chat_text}"""

    try:
        import json
        result = await call_llm(
            task_type="reasoning",
            messages=[
                {"role": "system", "content": CROSS_REF_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=0.2,
            max_tokens=1500,
            response_format={"type": "json_object"},
        )
        raw = result["content"].strip()
        # Some providers wrap JSON in markdown fences despite response_format.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
        if raw.startswith("json"):
            raw = raw[4:]
        return json.loads(raw)

    except Exception as e:
        logger.error(f"Cross-reference LLM call failed: {e}")
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": str(e),
        }
|
||||
|
||||
|
||||
def format_cross_ref_for_telegram(analysis: dict, meeting_id: str) -> str:
    """Format cross-reference results as a Telegram Markdown message.

    Fix: entry fields are read with .get(..., "") instead of direct
    indexing — the entries come from LLM-generated JSON, so a missing
    "meeting_signal"/"chat_signal" key previously raised KeyError and
    crashed the whole formatter. Missing keys now degrade to empty text.

    Args:
        analysis: Result dict from find_cross_references (may carry "error").
        meeting_id: Meeting identifier shown in the header.

    Returns:
        A Markdown-formatted message string.
    """
    parts = [f"🔗 *Meet ↔ Chat Cross-Reference*\nMeeting: `{meeting_id}`\n"]

    if analysis.get("error"):
        return f"⚠️ Cross-reference failed: {analysis['error']}"

    confirmations = analysis.get("confirmations", [])
    contradictions = analysis.get("contradictions", [])
    blind_spots = analysis.get("blind_spots", [])

    if not confirmations and not contradictions and not blind_spots:
        return f"🔗 *Meet ↔ Chat Cross-Reference*\nMeeting `{meeting_id}`: No significant connections found between this meeting and your chat groups."

    if confirmations:
        parts.append(f"✅ *Confirmations* ({len(confirmations)})")
        for c in confirmations[:3]:  # Cap at 3 for readability
            sig = "🔴" if c.get("significance") == "high" else "🟡"
            parts.append(f"{sig} Meeting: _{c.get('meeting_signal', '')[:100]}_")
            parts.append(f"   Matches [{c.get('group', '?')}]: _{c.get('chat_signal', '')[:100]}_\n")

    if contradictions:
        parts.append(f"⚡ *Contradictions* ({len(contradictions)}) — ACTION NEEDED")
        for c in contradictions[:3]:
            parts.append(f"🔴 Meeting decided: _{c.get('meeting_signal', '')[:100]}_")
            parts.append(f"   BUT [{c.get('group', '?')}] says: _{c.get('chat_signal', '')[:100]}_")
            if c.get("impact"):
                parts.append(f"   Impact: {c['impact'][:100]}\n")

    if blind_spots:
        parts.append(f"🔦 *Blind Spots* ({len(blind_spots)}) — Teams may not know")
        for b in blind_spots[:3]:
            parts.append(f"🟠 {b.get('meeting_signal', '')[:120]}")
            if b.get("recommendation"):
                parts.append(f"   → {b['recommendation'][:100]}\n")

    return "\n".join(parts)
|
||||
342
thirdeye/backend/agents/meet_ingestor.py
Normal file
342
thirdeye/backend/agents/meet_ingestor.py
Normal file
@@ -0,0 +1,342 @@
|
||||
"""
|
||||
Meet Ingestor Agent
|
||||
Processes raw Google Meet transcript chunks and extracts structured signals.
|
||||
|
||||
Signal types produced:
|
||||
meet_decision — A decision made during the meeting
|
||||
meet_action_item — A task assigned to someone
|
||||
meet_blocker — A blocker or dependency raised
|
||||
meet_risk — A risk or concern identified
|
||||
meet_open_q — An unresolved question left open
|
||||
meet_summary — Full meeting summary (emitted on is_final=True)
|
||||
meet_chunk_raw — Raw transcript chunk (always stored, for full-text search)
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
from backend.providers import call_llm
|
||||
from backend.db.chroma import store_signals
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.meet_ingestor")
|
||||
|
||||
|
||||
# ─── Extraction prompt ───────────────────────────────────────────────────────
|
||||
|
||||
EXTRACTION_SYSTEM_PROMPT = """You are an expert meeting analyst. You receive raw transcript chunks from a Google Meet recording and extract structured signals.
|
||||
|
||||
Extract ONLY signals that are clearly present. Do NOT hallucinate or infer beyond what is stated.
|
||||
|
||||
Return ONLY a valid JSON object with this exact structure:
|
||||
{
|
||||
"decisions": [
|
||||
{"text": "...", "owner": "@name or null", "confidence": "high|medium|low"}
|
||||
],
|
||||
"action_items": [
|
||||
{"text": "...", "owner": "@name or null", "due": "date string or null", "confidence": "high|medium|low"}
|
||||
],
|
||||
"blockers": [
|
||||
{"text": "...", "blocking_what": "...", "confidence": "high|medium|low"}
|
||||
],
|
||||
"risks": [
|
||||
{"text": "...", "severity": "high|medium|low", "confidence": "high|medium|low"}
|
||||
],
|
||||
"open_questions": [
|
||||
{"text": "...", "confidence": "high|medium|low"}
|
||||
]
|
||||
}
|
||||
|
||||
Rules:
|
||||
- If a category has nothing, use an empty array []
|
||||
- owner must start with @ if it's a person's name (e.g. "@Alex")
|
||||
- text must be a clear, standalone sentence — not a fragment
|
||||
- Only include confidence "high" if the signal is unambiguous
|
||||
- Do NOT reproduce filler words, pleasantries, or off-topic banter
|
||||
- Return JSON only — no markdown, no preamble, no explanation"""
|
||||
|
||||
|
||||
SUMMARY_SYSTEM_PROMPT = """You are a meeting intelligence expert. Given a full meeting transcript (possibly from multiple chunks), write a concise but complete meeting summary.
|
||||
|
||||
Structure your summary as:
|
||||
1. One-sentence overview (what was the meeting about)
|
||||
2. Key decisions made (bullet points, max 5)
|
||||
3. Action items assigned (who does what by when)
|
||||
4. Blockers or risks raised
|
||||
5. Open questions still unresolved
|
||||
|
||||
Keep the summary under 400 words. Be specific. Use names when available. Do NOT use filler phrases like "the team discussed" — just state what was decided/agreed/assigned."""
|
||||
|
||||
|
||||
# ─── Signal builder ─────────────────────────────────────────────────────────
|
||||
|
||||
def _build_signal(
|
||||
signal_type: str,
|
||||
summary: str,
|
||||
raw_quote: str,
|
||||
severity: str,
|
||||
entities: list[str],
|
||||
keywords: list[str],
|
||||
timestamp: str,
|
||||
group_id: str,
|
||||
meeting_id: str,
|
||||
urgency: str = "none",
|
||||
status: str = "open",
|
||||
) -> dict:
|
||||
return {
|
||||
"id": str(uuid.uuid4()),
|
||||
"type": signal_type,
|
||||
"summary": summary,
|
||||
"raw_quote": raw_quote[:500] if raw_quote else "",
|
||||
"severity": severity,
|
||||
"status": status,
|
||||
"sentiment": "neutral",
|
||||
"urgency": urgency,
|
||||
"entities": entities,
|
||||
"keywords": keywords,
|
||||
"timestamp": timestamp,
|
||||
"group_id": group_id,
|
||||
"lens": "meet",
|
||||
"meeting_id": meeting_id,
|
||||
}
|
||||
|
||||
|
||||
def _extract_entities(text: str, owner: str = None) -> list[str]:
|
||||
"""Extract entity strings from text (names starting with @)."""
|
||||
import re
|
||||
entities = re.findall(r"@[\w]+", text)
|
||||
if owner and owner.startswith("@"):
|
||||
entities.append(owner)
|
||||
return list(set(entities))
|
||||
|
||||
|
||||
def _extract_keywords(text: str) -> list[str]:
|
||||
"""Simple keyword extraction: lowercase meaningful words."""
|
||||
stopwords = {"the", "a", "an", "is", "are", "was", "were", "will", "to", "of",
|
||||
"in", "on", "at", "for", "by", "with", "this", "that", "and", "or",
|
||||
"but", "we", "i", "it", "be", "do", "have", "has", "had", "not"}
|
||||
words = text.lower().split()
|
||||
keywords = [w.strip(".,!?;:\"'") for w in words if len(w) > 3 and w not in stopwords]
|
||||
return list(dict.fromkeys(keywords))[:10] # deduplicate, keep first 10
|
||||
|
||||
|
||||
# ─── Main processing function ────────────────────────────────────────────────
|
||||
|
||||
async def process_meet_chunk(
    meeting_id: str,
    group_id: str,
    chunk_index: int,
    text: str,
    speaker: str,
    timestamp: str,
    is_final: bool,
):
    """
    Full pipeline for a transcript chunk:
    1. Always store raw chunk for full-text search
    2. Extract structured signals via LLM
    3. If is_final, generate a full meeting summary

    Args:
        meeting_id: Identifier of the source meeting (used for later retrieval).
        group_id: ChromaDB group the signals are stored under.
        chunk_index: Ordinal position of this chunk within the meeting.
        text: Raw transcript text of the chunk.
        speaker: Speaker name ("Unknown" when unattributed).
        timestamp: ISO timestamp string attached to every produced signal.
        is_final: True when this is the meeting's last chunk.

    Returns:
        The list of signal dicts that were stored (at minimum the raw chunk).
    """
    logger.info(f"Processing meet chunk {chunk_index} for meeting {meeting_id} ({len(text)} chars)")
    signals_to_store = []

    # 1. Always store the raw chunk (enables full-text similarity search later)
    raw_signal = _build_signal(
        signal_type="meet_chunk_raw",
        summary=f"[{meeting_id}] Chunk {chunk_index}: {text[:120]}...",
        raw_quote=text,
        severity="low",
        entities=[f"@{speaker}"] if speaker and speaker != "Unknown" else [],
        keywords=_extract_keywords(text),
        timestamp=timestamp,
        group_id=group_id,
        meeting_id=meeting_id,
    )
    signals_to_store.append(raw_signal)

    # 2. Extract structured signals via LLM
    try:
        result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": EXTRACTION_SYSTEM_PROMPT},
                {"role": "user", "content": f"Transcript chunk from speaker '{speaker}':\n\n{text}"},
            ],
            temperature=0.1,
            max_tokens=1500,
            response_format={"type": "json_object"},
        )

        raw_json = result["content"].strip()
        # Strip markdown code fences if present
        if raw_json.startswith("```"):
            raw_json = raw_json.split("```")[1]
        if raw_json.startswith("json"):
            raw_json = raw_json[4:]
        extracted = json.loads(raw_json)

    except Exception as e:
        # Extraction failure is non-fatal: the raw chunk is still stored.
        logger.warning(f"Meet extraction LLM failed for chunk {chunk_index}: {e}")
        extracted = {}

    # Decisions — only medium/high confidence, marked "decided".
    for item in extracted.get("decisions", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_decision",
                summary=item["text"],
                raw_quote=item["text"],
                severity="medium",
                entities=_extract_entities(item["text"], item.get("owner")),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                status="decided",
            ))

    # Action items — a due date bumps urgency from low to medium.
    for item in extracted.get("action_items", []):
        if item.get("confidence") in ("high", "medium"):
            due_str = f" Due: {item['due']}." if item.get("due") else ""
            signals_to_store.append(_build_signal(
                signal_type="meet_action_item",
                summary=f"{item['text']}{due_str}",
                raw_quote=item["text"],
                severity="medium",
                entities=_extract_entities(item["text"], item.get("owner")),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                urgency="medium" if item.get("due") else "low",
                status="open",
            ))

    # Blockers — always high severity and urgency.
    for item in extracted.get("blockers", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_blocker",
                summary=item["text"],
                raw_quote=item["text"],
                severity="high",
                entities=_extract_entities(item["text"]),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                urgency="high",
                status="open",
            ))

    # Risks
    # NOTE(review): unlike every other category, risks are stored at ANY
    # confidence level (no filter) — confirm this asymmetry is intentional.
    for item in extracted.get("risks", []):
        signals_to_store.append(_build_signal(
            signal_type="meet_risk",
            summary=item["text"],
            raw_quote=item["text"],
            severity=item.get("severity", "medium"),
            entities=_extract_entities(item["text"]),
            keywords=_extract_keywords(item["text"]),
            timestamp=timestamp,
            group_id=group_id,
            meeting_id=meeting_id,
            urgency="medium",
            status="open",
        ))

    # Open questions
    for item in extracted.get("open_questions", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_open_q",
                summary=item["text"],
                raw_quote=item["text"],
                severity="low",
                entities=_extract_entities(item["text"]),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                status="open",
            ))

    # 3. If this is the final chunk, generate a meeting summary
    if is_final:
        summary_signal = await _generate_meeting_summary(
            meeting_id, group_id, text, speaker, timestamp
        )
        if summary_signal:
            signals_to_store.append(summary_signal)

    # Store everything
    if signals_to_store:
        store_signals(group_id, signals_to_store)
        logger.info(
            f"Stored {len(signals_to_store)} signals for meeting {meeting_id} chunk {chunk_index}"
        )

    return signals_to_store
|
||||
|
||||
|
||||
async def _generate_meeting_summary(
    meeting_id: str,
    group_id: str,
    final_chunk_text: str,
    speaker: str,
    timestamp: str,
) -> dict | None:
    """Summarize a whole meeting once its final transcript chunk arrives.

    Reassembles the transcript from the raw chunks stored in ChromaDB and
    asks the LLM for a summary. If chunk retrieval fails or yields nothing,
    only the final chunk is summarized. Returns a 'meet_summary' signal
    dict, or None when the LLM call fails.
    """
    from backend.db.chroma import query_signals

    # Rebuild the transcript from stored raw chunks. Any failure here is
    # non-fatal: we fall back to summarizing just the final chunk.
    full_transcript = final_chunk_text
    try:
        chunks = query_signals(
            group_id,
            meeting_id,
            n_results=50,
            signal_type="meet_chunk_raw",
        )
        joined = "\n\n".join(
            c.get("metadata", {}).get("raw_quote", "") or c.get("document", "")
            for c in chunks
        )
        if joined.strip():
            full_transcript = joined
    except Exception:
        pass

    try:
        llm_result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": f"Meeting ID: {meeting_id}\n\nFull transcript:\n\n{full_transcript[:6000]}",
                },
            ],
            temperature=0.3,
            max_tokens=600,
        )
        summary_text = llm_result["content"].strip()
    except Exception as e:
        # Summary generation is best-effort; callers treat None as "no summary".
        logger.warning(f"Meeting summary generation failed: {e}")
        return None

    return _build_signal(
        signal_type="meet_summary",
        summary=summary_text,
        raw_quote=full_transcript[:500],
        severity="medium",
        entities=[f"@{speaker}"] if speaker and speaker != "Unknown" else [],
        keywords=_extract_keywords(summary_text),
        timestamp=timestamp,
        group_id=group_id,
        meeting_id=meeting_id,
        status="completed",
    )
|
||||
114
thirdeye/backend/agents/pattern_detector.py
Normal file
114
thirdeye/backend/agents/pattern_detector.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""Pattern Detector Agent — finds trends and anomalies in accumulated signals."""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.chroma import get_all_signals
|
||||
from backend.db.models import Pattern
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.pattern_detector")
|
||||
|
||||
SYSTEM_PROMPT = """You are the Pattern Detector for ThirdEye. You analyze accumulated signals to find patterns and anomalies.
|
||||
|
||||
Detect these pattern types:
|
||||
- frequency_spike: A signal type mentioned significantly more than usual
|
||||
- knowledge_silo: Only one person discusses a critical topic (bus factor = 1)
|
||||
- recurring_issue: Same bug/problem appearing repeatedly
|
||||
- sentiment_trend: Gradual shift in tone over time
|
||||
- stale_item: Decisions proposed but never resolved, promises with no follow-up
|
||||
|
||||
Respond ONLY with valid JSON (no markdown, no backticks):
|
||||
{"patterns": [{"type": "pattern_type", "description": "Clear human-readable description", "severity": "info|warning|critical", "evidence_ids": [], "recommendation": "Suggested action"}]}
|
||||
|
||||
If no patterns found: {"patterns": []}
|
||||
Only report patterns that are genuinely concerning. Do NOT manufacture patterns from insufficient data."""
|
||||
|
||||
|
||||
def _heuristic_detect_patterns(group_id: str, all_signals: list[dict]) -> list[Pattern]:
    """Generate conservative patterns from signal metadata when LLM output is unavailable.

    Tallies signal types and entity mentions, then emits at most five
    patterns: one 'recurring_issue' per repeated tech-debt-ish type, plus a
    single 'knowledge_silo' when payment-related entities recur.
    """
    type_tally: dict[str, int] = {}
    entity_tally: dict[str, int] = {}

    for item in all_signals:
        meta = item.get("metadata", {})
        sig_type = str(meta.get("type", "unknown"))
        type_tally[sig_type] = type_tally.get(sig_type, 0) + 1

        raw_entities = meta.get("entities", [])
        # Entities may be stored as a single string or a list of strings.
        if isinstance(raw_entities, str):
            raw_entities = [raw_entities]
        if isinstance(raw_entities, list):
            for entity in raw_entities:
                name = str(entity).strip()
                if name:
                    entity_tally[name] = entity_tally.get(name, 0) + 1

    detected: list[Pattern] = []

    # A tech-debt-ish type seen at least twice becomes a recurring_issue.
    for sig_type, count in type_tally.items():
        if count < 2 or sig_type not in {"recurring_bug", "workaround", "tech_debt"}:
            continue
        detected.append(Pattern(
            group_id=group_id,
            type="recurring_issue",
            description=f"Signal type '{sig_type}' has appeared repeatedly ({count} times).",
            severity="warning",
            recommendation="Create a dedicated action item with owner and due date to stop repeated recurrence.",
        ))

    # Repeated payment-related entity mentions hint at a bus-factor-1 silo.
    repeated_entities = [name for name, count in entity_tally.items() if count >= 2]
    if any("stripe" in name.lower() or "payment" in name.lower() for name in repeated_entities):
        detected.append(Pattern(
            group_id=group_id,
            type="knowledge_silo",
            description="Critical payment-related topics are concentrated in repeated mentions, suggesting low bus factor.",
            severity="warning",
            recommendation="Document payment workflows and assign at least one backup owner.",
        ))

    # Cap output — heuristic fallback stays deliberately conservative.
    return detected[:5]
|
||||
|
||||
|
||||
async def detect_patterns(group_id: str) -> list[Pattern]:
    """Analyze all signals in a group and detect patterns.

    Fetches every stored signal for the group, formats them as one-line
    summaries, and asks the reasoning LLM (per SYSTEM_PROMPT) for patterns.
    If the LLM call or JSON parsing fails, falls back to the conservative
    metadata heuristic `_heuristic_detect_patterns`.

    Args:
        group_id: Telegram group ID whose signals are analyzed.

    Returns:
        List of Pattern objects; empty when fewer than 3 signals exist or
        when both the LLM and the heuristic produce nothing.
    """
    all_signals = get_all_signals(group_id)

    # Too little data — any "pattern" would be noise, so bail out early.
    if len(all_signals) < 3:
        logger.info(f"Not enough signals ({len(all_signals)}) for pattern detection in {group_id}")
        return []

    # Format signals for the LLM: one compact line per signal with type,
    # truncated document text, and key metadata for temporal reasoning.
    signal_summary = []
    for s in all_signals:
        meta = s["metadata"]
        signal_summary.append(
            f"- [{meta.get('type', '?')}] {s['document'][:100]} "
            f"(severity={meta.get('severity', '?')}, entities={meta.get('entities', '[]')}, "
            f"time={meta.get('timestamp', '?')})"
        )
    signals_text = "\n".join(signal_summary)

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Analyze these {len(all_signals)} signals from group '{group_id}':\n\n{signals_text}"},
    ]

    try:
        result = await call_llm("reasoning", messages, temperature=0.2, max_tokens=1500)
        parsed = extract_json_object(result.get("content", ""))
        patterns = []
        # Defensive .get() defaults: the LLM may omit fields despite the prompt.
        for p in parsed.get("patterns", []):
            patterns.append(Pattern(
                group_id=group_id,
                type=p.get("type", "unknown"),
                description=p.get("description", ""),
                severity=p.get("severity", "info"),
                recommendation=p.get("recommendation", ""),
            ))

        logger.info(f"Detected {len(patterns)} patterns in {group_id}")
        return patterns

    except Exception as e:
        # LLM/parse failure is expected occasionally — degrade to the
        # metadata heuristic rather than surfacing an error.
        logger.info(f"Pattern detection LLM parse issue, using fallback: {e}")
        fallback = _heuristic_detect_patterns(group_id, all_signals)
        if fallback:
            logger.info(f"Pattern heuristic fallback produced {len(fallback)} patterns in {group_id}")
        return fallback
|
||||
68
thirdeye/backend/agents/query_agent.py
Normal file
68
thirdeye/backend/agents/query_agent.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""
|
||||
Query Agent — voice-aware signal context formatting for ThirdEye.
|
||||
|
||||
Provides _format_signal_for_context() which labels each ChromaDB signal with
|
||||
its true origin (voice note, document, meeting, chat) so the LLM can produce
|
||||
properly attributed answers like:
|
||||
"Based on what @Raj said in a voice note on Mar 14 (45s), the team decided..."
|
||||
"""
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
VOICE_CITATION_INSTRUCTION = """
|
||||
When context includes [VOICE NOTE — @name on Date (Xs)] signals, ALWAYS cite the voice note explicitly.
|
||||
Example: "Based on what @Raj said in a voice note on Mar 14 (45s), the team decided to use PostgreSQL."
|
||||
Never flatten voice signals into generic "the team discussed" language. Always name the speaker and source.
|
||||
"""
|
||||
|
||||
|
||||
def _format_signal_for_context(signal: dict) -> str:
|
||||
"""
|
||||
Format a ChromaDB signal as a context snippet for the Query Agent LLM.
|
||||
Voice-sourced signals get explicit attribution so the LLM cites them correctly.
|
||||
Accepts both flat signal dicts and dicts with a nested 'metadata' key.
|
||||
"""
|
||||
# Support both flat dicts and ChromaDB-style {"metadata": {...}, "document": ...}
|
||||
meta = signal.get("metadata", signal)
|
||||
|
||||
source = meta.get("source", signal.get("source", "chat"))
|
||||
sig_type = meta.get("type", signal.get("type", "unknown"))
|
||||
summary = meta.get("summary", signal.get("summary", ""))
|
||||
timestamp = meta.get("timestamp", signal.get("timestamp", ""))
|
||||
|
||||
date_str = ""
|
||||
if timestamp:
|
||||
try:
|
||||
dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
|
||||
date_str = dt.strftime("%b %d")
|
||||
except Exception:
|
||||
date_str = timestamp[:10]
|
||||
|
||||
if source == "voice":
|
||||
speaker = meta.get("speaker", signal.get("speaker", "Unknown"))
|
||||
duration = meta.get("voice_duration", signal.get("voice_duration", 0))
|
||||
duration_str = f"{duration}s" if duration else "?"
|
||||
return (
|
||||
f"[VOICE NOTE — @{speaker} on {date_str} ({duration_str})] "
|
||||
f"[{sig_type}] {summary}"
|
||||
)
|
||||
|
||||
if source == "document":
|
||||
return f"[DOCUMENT — {date_str}] [{sig_type}] {summary}"
|
||||
|
||||
if source == "link":
|
||||
return f"[WEB LINK — {date_str}] [{sig_type}] {summary}"
|
||||
|
||||
if sig_type in ("meet_decision", "meet_action_item", "meet_blocker", "meet_summary"):
|
||||
meeting_id = meta.get("meeting_id", signal.get("meeting_id", ""))
|
||||
return f"[MEETING {meeting_id} — {date_str}] [{sig_type}] {summary}"
|
||||
|
||||
entities_raw = meta.get("entities", signal.get("entities", []))
|
||||
if isinstance(entities_raw, str):
|
||||
import json
|
||||
try:
|
||||
entities_raw = json.loads(entities_raw)
|
||||
except Exception:
|
||||
entities_raw = []
|
||||
sender_str = entities_raw[0] if entities_raw else ""
|
||||
return f"[CHAT — {sender_str} on {date_str}] [{sig_type}] {summary}"
|
||||
128
thirdeye/backend/agents/signal_extractor.py
Normal file
128
thirdeye/backend/agents/signal_extractor.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""Signal Extractor Agent — extracts structured signals from chat messages."""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.models import Signal
|
||||
from datetime import datetime
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.signal_extractor")
|
||||
|
||||
# Lens-specific system prompts
|
||||
LENS_PROMPTS = {
|
||||
"dev": """You are the Signal Extractor for ThirdEye operating in DevLens mode.
|
||||
You analyze batches of developer team chat messages and extract STRUCTURED SIGNALS.
|
||||
|
||||
Extract ONLY signals that represent meaningful technical information. Skip greetings, small talk, emoji reactions, and meta-conversation.
|
||||
|
||||
Signal types to look for:
|
||||
- architecture_decision: Technology choices, design decisions with rationale
|
||||
- tech_debt: Shortcuts, hardcoded values, "will fix later" patterns
|
||||
- knowledge_silo_evidence: Only one person discusses a critical topic
|
||||
- recurring_bug: Same issue mentioned repeatedly
|
||||
- stack_decision: Technology/framework choices (proposed or decided)
|
||||
- deployment_risk: Risky deployment practices
|
||||
- workaround: Temporary fixes being applied repeatedly
|
||||
- delivery_commitment: A team member gives an explicit or estimated timeline/ETA ("will take 2 days", "done by Friday", "approx 3 hours")
|
||||
|
||||
Pay SPECIAL attention to delivery commitments — capture the person's name, the work item, and the exact time estimate.
|
||||
For EACH signal found, include it in the JSON array. If NO meaningful signals exist, return empty array.
|
||||
Be SELECTIVE. Quality over quantity.""",
|
||||
|
||||
"product": """You are the Signal Extractor for ThirdEye operating in ProductLens mode.
|
||||
|
||||
Signal types to look for:
|
||||
- feature_request: Features users or team members are asking for
|
||||
- delivery_commitment: A team member gives an explicit or estimated timeline/ETA ("will take 2 days", "done by Friday", "approx 3 hours")
|
||||
- user_pain_point: User difficulties, complaints, confusion
|
||||
- roadmap_drift: Discussion of topics not on the current plan
|
||||
- priority_conflict: Team members disagreeing on what's most important
|
||||
- metric_mention: Specific numbers, conversion rates, performance data
|
||||
- user_quote: Direct quotes from users/customers
|
||||
- competitor_intel: Mentions of competitor actions or features
|
||||
|
||||
Pay SPECIAL attention to delivery commitments — capture the person's name, the work item, and the exact time estimate.
|
||||
Be SELECTIVE. Quality over quantity.""",
|
||||
|
||||
"client": """You are the Signal Extractor for ThirdEye operating in ClientLens mode.
|
||||
|
||||
Signal types to look for:
|
||||
- promise: Commitments made with deadlines (explicit or implicit)
|
||||
- scope_creep: Additional requests introduced casually without formal change requests
|
||||
- sentiment_signal: Tone changes (positive praise, growing frustration, formality shifts)
|
||||
- unanswered_request: Questions or requests that haven't received responses
|
||||
- satisfaction: Explicit positive or negative feedback
|
||||
- escalation_risk: Mentions of involving management, expressing deadline concerns
|
||||
- client_decision: Decisions made by the client
|
||||
|
||||
Pay SPECIAL attention to implicit deadlines ("by end of week", "before the meeting").
|
||||
Be SELECTIVE. Quality over quantity.""",
|
||||
|
||||
"community": """You are the Signal Extractor for ThirdEye operating in CommunityLens mode.
|
||||
|
||||
Signal types: recommendation, event, issue, local_knowledge, question
|
||||
Be SELECTIVE. Quality over quantity.""",
|
||||
}
|
||||
|
||||
EXTRACTION_FORMAT = """
|
||||
Respond ONLY with valid JSON in this exact format (no markdown, no backticks, no explanation):
|
||||
{"signals": [{"type": "signal_type_here", "summary": "One clear sentence that includes specific names, numbers, timelines, and commitments", "entities": ["@person", "technology"], "severity": "low|medium|high|critical", "status": "proposed|decided|implemented|unresolved", "raw_quote": "Exact verbatim sentence(s) from the message that capture the full claim, including names, numbers, and timelines", "message_index": 0}]}
|
||||
|
||||
IMPORTANT for raw_quote: copy the FULL relevant sentence from the message, not just a topic keyword.
|
||||
Example — message "Anirban: feature page revamp will take approx 2 more days"
|
||||
WRONG raw_quote: "feature page revamp"
|
||||
CORRECT raw_quote: "feature page revamp will take approx 2 more days"
|
||||
|
||||
If no signals found: {"signals": []}
|
||||
"""
|
||||
|
||||
|
||||
async def extract_signals(messages_text: str, group_id: str, lens: str = "dev") -> list[Signal]:
    """
    Extract structured signals from a batch of formatted chat messages.

    Args:
        messages_text: Formatted string like "[Alex]: Let's use Redis\\n[Bob]: Agreed"
        group_id: Telegram group ID
        lens: Active lens mode (dev, product, client, community); unknown
            values fall back to the "dev" prompt.

    Returns:
        List of Signal objects. Extraction is best-effort: any LLM or parse
        failure returns an empty list rather than raising.
    """
    # Local import: fix for deprecated datetime.utcnow() below without
    # touching the module's import block.
    from datetime import timezone

    system_prompt = LENS_PROMPTS.get(lens, LENS_PROMPTS["dev"])

    messages = [
        {"role": "system", "content": system_prompt + "\n\n" + EXTRACTION_FORMAT},
        {"role": "user", "content": f"Extract signals from these messages:\n\n{messages_text}"},
    ]

    try:
        result = await call_llm("fast_large", messages, temperature=0.2, max_tokens=2000)
        parsed = extract_json_object(result.get("content", ""))
        raw_signals = parsed.get("signals", [])

        # Convert to Signal objects; one malformed entry must not drop the batch.
        signals = []
        for raw in raw_signals:
            try:
                signal = Signal(
                    group_id=group_id,
                    lens=lens,
                    type=raw.get("type", "unknown"),
                    summary=raw.get("summary", ""),
                    entities=raw.get("entities", []),
                    severity=raw.get("severity", "low"),
                    status=raw.get("status", "unknown"),
                    raw_quote=raw.get("raw_quote", ""),
                    # Timezone-aware UTC timestamp. datetime.utcnow() is
                    # deprecated (3.12+) and produced naive ISO strings,
                    # inconsistent with the tz-aware voice pipeline.
                    timestamp=datetime.now(timezone.utc).isoformat(),
                )
                signals.append(signal)
            except Exception as e:
                logger.warning(f"Failed to parse signal: {e}")
                continue

        logger.info(f"Extracted {len(signals)} signals from {group_id} (lens={lens}) via {result['provider']}")
        return signals

    except Exception as e:
        logger.error(f"Signal extraction failed: {e}")
        return []
|
||||
281
thirdeye/backend/agents/voice_handler.py
Normal file
281
thirdeye/backend/agents/voice_handler.py
Normal file
@@ -0,0 +1,281 @@
|
||||
"""
|
||||
Voice Handler
|
||||
Orchestrates the full pipeline for Telegram voice messages and video notes:
|
||||
|
||||
Telegram voice/video_note message
|
||||
-> download audio bytes
|
||||
-> transcribe via Groq Whisper (voice_transcriber.py)
|
||||
-> build a voice_transcript signal (stored raw for full-text search)
|
||||
-> run transcript through process_message_batch (signal extraction)
|
||||
-> all extracted signals carry voice attribution metadata
|
||||
|
||||
Voice metadata attached to every extracted signal:
|
||||
source: "voice"
|
||||
voice_file_id: Telegram file ID
|
||||
voice_duration: seconds
|
||||
speaker: sender display name
|
||||
"""
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from backend.agents.voice_transcriber import (
|
||||
transcribe_audio, download_telegram_audio, format_duration
|
||||
)
|
||||
from backend.config import ENABLE_VOICE_TRANSCRIPTION, VOICE_STORE_TRANSCRIPT
|
||||
from backend.db.chroma import store_signals
|
||||
from backend.pipeline import process_message_batch
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.voice_handler")
|
||||
|
||||
|
||||
# --- Voice transcript signal builder -----------------------------------------
|
||||
|
||||
def build_voice_transcript_signal(
    transcript: str,
    sender: str,
    group_id: str,
    voice_file_id: str,
    duration_seconds: int,
    language: str,
    timestamp: str,
) -> dict:
    """
    Create the raw 'voice_transcript' signal for a transcribed voice note.

    Stored unconditionally alongside any extracted signals so the full
    transcript stays searchable in ChromaDB even when structured extraction
    yields nothing.
    """
    headline = f"[Voice {format_duration(duration_seconds)}] @{sender}: {transcript[:200]}"

    # Base signal fields shared with every other signal type.
    signal = {
        "id": str(uuid.uuid4()),
        "type": "voice_transcript",
        "summary": headline,
        "raw_quote": transcript,
        "severity": "low",
        "status": "transcribed",
        "sentiment": "neutral",
        "urgency": "none",
        "entities": [f"@{sender}"],
        "keywords": _extract_voice_keywords(transcript),
        "timestamp": timestamp,
        "group_id": group_id,
        "lens": "voice",
    }
    # Voice attribution fields used by /ask for source citation.
    signal.update(
        source="voice",
        voice_file_id=voice_file_id,
        voice_duration=duration_seconds,
        voice_language=language,
        speaker=sender,
    )
    return signal
|
||||
|
||||
|
||||
def _extract_voice_keywords(text: str) -> list[str]:
|
||||
"""Simple keyword extraction from transcript text."""
|
||||
stopwords = {
|
||||
"the", "a", "an", "is", "are", "was", "were", "will", "to", "of",
|
||||
"in", "on", "at", "for", "by", "with", "this", "that", "and", "or",
|
||||
"but", "we", "i", "it", "be", "do", "have", "has", "had", "not",
|
||||
"so", "just", "like", "yeah", "okay", "um", "uh", "you", "me",
|
||||
}
|
||||
words = text.lower().split()
|
||||
keywords = [w.strip(".,!?;:\"'") for w in words if len(w) > 3 and w not in stopwords]
|
||||
return list(dict.fromkeys(keywords))[:12]
|
||||
|
||||
|
||||
def _inject_voice_metadata(signals: list, voice_meta: dict) -> list[dict]:
|
||||
"""
|
||||
Inject voice attribution into every signal extracted from a voice transcript.
|
||||
Accepts both Signal Pydantic model objects and plain dicts.
|
||||
This ensures /ask can cite the voice source in its answers.
|
||||
"""
|
||||
result = []
|
||||
for signal in signals:
|
||||
sig = signal.model_dump() if hasattr(signal, "model_dump") else dict(signal)
|
||||
sig["source"] = "voice"
|
||||
sig["voice_file_id"] = voice_meta.get("voice_file_id", "")
|
||||
sig["voice_duration"] = voice_meta.get("duration_seconds", 0)
|
||||
sig["voice_language"] = voice_meta.get("language", "")
|
||||
sig["speaker"] = voice_meta.get("sender", "Unknown")
|
||||
if "[Voice]" not in sig.get("summary", ""):
|
||||
sig["summary"] = f"[Voice @{voice_meta.get('sender', '?')}] {sig['summary']}"
|
||||
result.append(sig)
|
||||
return result
|
||||
|
||||
|
||||
# --- Fallback signal builder -------------------------------------------------
|
||||
|
||||
# Keywords that hint at a signal type when the LLM extraction returns nothing.
# Used by _build_fallback_signal: the transcript's lowercased word set is
# intersected with each hint set and the type with the largest overlap wins
# ('feature_request' when nothing matches).
_FALLBACK_TYPE_HINTS = {
    # Requests for new or changed functionality / design work.
    "feature_request": {
        "need", "needs", "required", "require", "want", "should", "missing",
        "add", "feature", "ui", "ux", "design", "change", "changes", "update",
        "improve", "improvement", "responsiveness", "responsive",
    },
    # Work that is stuck, failing, or waiting on something.
    "blocker": {
        "blocked", "blocking", "blocker", "stuck", "waiting", "can't", "cannot",
        "issue", "problem", "broken", "fails", "failing",
    },
    # Concrete to-dos and stated intentions.
    "action_item": {
        "will", "going", "plan", "todo", "do", "fix", "implement", "setup",
        "create", "build", "deploy", "check",
    },
    # Deadline pressure and expressed concern.
    "risk": {
        "risk", "risky", "concern", "worried", "urgent", "urgently", "critical",
        "deadline", "delay", "late",
    },
}
|
||||
|
||||
|
||||
def _build_fallback_signal(
    transcript: str,
    sender: str,
    group_id: str,
    timestamp: str,
    voice_meta: dict,
) -> dict:
    """
    Best-effort structured signal for a voice transcript that yielded zero
    LLM-extracted signals. The signal type with the largest keyword-hint
    overlap wins; 'feature_request' is the safe default when nothing matches.
    """
    transcript_words = set(transcript.lower().split())

    # Score each candidate type by hint-word overlap with the transcript.
    hit_counts = {
        candidate: len(transcript_words & hints)
        for candidate, hints in _FALLBACK_TYPE_HINTS.items()
    }
    if any(hit_counts.values()):
        chosen_type = max(hit_counts, key=hit_counts.get)
    else:
        chosen_type = "feature_request"

    # Urgent wording bumps both severity and urgency.
    urgency_words = {"urgent", "urgently", "asap", "immediately", "critical", "now"}
    severity = "high" if transcript_words & urgency_words else "medium"

    summary = transcript[:200].strip()
    if len(transcript) > 200:
        summary += "..."

    return {
        "id": str(uuid.uuid4()),
        "type": chosen_type,
        "summary": f"[Voice @{sender}] {summary}",
        "raw_quote": transcript[:500],
        "severity": severity,
        "status": "unresolved",
        "sentiment": "neutral",
        "urgency": "high" if severity == "high" else "medium",
        "entities": [f"@{sender}"],
        "keywords": _extract_voice_keywords(transcript),
        "timestamp": timestamp,
        "group_id": group_id,
        "lens": "voice",
        "source": "voice",
        "speaker": sender,
        "voice_file_id": voice_meta.get("voice_file_id", ""),
        "voice_duration": voice_meta.get("duration_seconds", 0),
        "voice_language": voice_meta.get("language", ""),
    }
|
||||
|
||||
|
||||
# --- Main handler ------------------------------------------------------------
|
||||
|
||||
async def handle_voice_message(
    bot,
    group_id: str,
    sender: str,
    file_id: str,
    duration_seconds: int,
    message_date,
    is_video_note: bool = False,
) -> dict:
    """
    Full pipeline for a single voice or video note message:
    download -> transcribe -> store raw transcript -> extract signals
    (with voice attribution) -> fallback signal if extraction found nothing.

    Args:
        bot: Telegram bot instance, passed to download_telegram_audio.
        group_id: Telegram group ID the message came from.
        sender: Sender display name, used for attribution.
        file_id: Telegram file ID of the voice/video-note audio.
        duration_seconds: Duration from Telegram metadata (pre-filters transcription).
        message_date: Message datetime; converted to a UTC ISO timestamp.
            NOTE(review): assumed to be a naive datetime already in UTC
            (tzinfo is overwritten, not converted) — confirm against caller.
        is_video_note: True for round video notes (MP4 audio) vs voice (OGG).

    Returns:
        {"ok": True, "transcript": "...", "signals_extracted": 3, "duration": 45, ...}
        OR {"ok": False, "reason": "...", "error": "..."}
    """
    # Feature flag: bail out before any network work when disabled.
    if not ENABLE_VOICE_TRANSCRIPTION:
        return {"ok": False, "reason": "disabled", "error": "Voice transcription is disabled"}

    msg_type = "video note" if is_video_note else "voice message"
    logger.info(f"Processing {msg_type} from {sender} in {group_id} ({duration_seconds}s)")

    # 1. Download audio
    try:
        audio_bytes = await download_telegram_audio(bot, file_id)
    except Exception as e:
        logger.error(f"Failed to download audio from {sender}: {e}")
        return {"ok": False, "reason": "download_failed", "error": str(e)}

    # 2. Transcribe — filename extension drives format detection downstream
    # (video notes are MP4, Telegram voice messages are OGG/Opus).
    filename = "audio.mp4" if is_video_note else "audio.ogg"
    transcription = await transcribe_audio(
        audio_bytes,
        filename=filename,
        duration_seconds=duration_seconds,
    )

    if not transcription["ok"]:
        # "reason" distinguishes skips (too_short/too_long/no_speech) from errors.
        logger.info(f"Transcription skipped for {sender}: {transcription['reason']}")
        return {"ok": False, "reason": transcription["reason"], "error": transcription.get("error", "")}

    transcript = transcription["transcript"]
    language = transcription.get("language", "unknown")
    # UTC ISO timestamp; fall back to "now" when Telegram gave no date.
    timestamp = (
        message_date.replace(tzinfo=timezone.utc).isoformat()
        if message_date else datetime.utcnow().isoformat()
    )

    # 3. Store raw voice transcript signal — keeps the full text searchable
    # even if structured extraction below produces nothing.
    if VOICE_STORE_TRANSCRIPT:
        transcript_signal = build_voice_transcript_signal(
            transcript=transcript,
            sender=sender,
            group_id=group_id,
            voice_file_id=file_id,
            duration_seconds=duration_seconds,
            language=language,
            timestamp=timestamp,
        )
        store_signals(group_id, [transcript_signal])
        logger.info(f"Voice transcript stored for {sender} ({len(transcript)} chars)")

    # 4. Run through signal extraction pipeline — treat as a regular text message
    voice_meta = {
        "sender": sender,
        "voice_file_id": file_id,
        "duration_seconds": duration_seconds,
        "language": language,
    }

    messages = [{
        "sender": sender,
        "text": transcript,
        "timestamp": timestamp,
        "source": "voice",
        "voice_file_id": file_id,
        "voice_duration": duration_seconds,
    }]

    try:
        extracted_signals = await process_message_batch(group_id, messages)
        # Every extracted signal gets voice attribution so /ask can cite it.
        extracted_signals = _inject_voice_metadata(extracted_signals, voice_meta)
        signals_count = len(extracted_signals)

        # Fallback: if the LLM extracted nothing from a meaningful voice message,
        # create a generic signal so the content is still searchable as structured data.
        if signals_count == 0 and len(transcript.split()) >= 5:
            fallback = _build_fallback_signal(transcript, sender, group_id, timestamp, voice_meta)
            store_signals(group_id, [fallback])
            signals_count = 1
            logger.info(f"Voice fallback signal created for {sender} (0 from LLM)")
    except Exception as e:
        # Extraction failure is non-fatal — the raw transcript (step 3) survives.
        logger.error(f"Signal extraction failed for voice from {sender}: {e}")
        signals_count = 0

    logger.info(
        f"Voice pipeline complete: {sender}, {duration_seconds}s, "
        f"{signals_count} signals, transcript={len(transcript)} chars"
    )

    return {
        "ok": True,
        "transcript": transcript,
        "signals_extracted": signals_count,
        "duration": duration_seconds,
        "sender": f"@{sender}",
        "language": language,
    }
|
||||
194
thirdeye/backend/agents/voice_transcriber.py
Normal file
194
thirdeye/backend/agents/voice_transcriber.py
Normal file
@@ -0,0 +1,194 @@
|
||||
"""
|
||||
Voice Transcriber — Groq Whisper integration.
|
||||
|
||||
Uses Groq's whisper-large-v3 model (free, already in provider stack) to transcribe
|
||||
audio bytes from Telegram voice messages and video notes into plain text.
|
||||
|
||||
Groq Whisper endpoint: https://api.groq.com/openai/v1/audio/transcriptions
|
||||
Supported formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, opus, wav, webm
|
||||
Telegram voice messages: OGG/Opus
|
||||
Telegram video notes: MP4
|
||||
|
||||
Free tier limits: 7,200 seconds of audio / hour on Groq free plan.
|
||||
At avg 30s per voice note: ~240 voice notes / hour — more than any team sends.
|
||||
"""
|
||||
import io
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from backend.config import (
|
||||
GROQ_API_KEY,
|
||||
VOICE_LANGUAGE,
|
||||
VOICE_MAX_DURATION_SECONDS,
|
||||
VOICE_MIN_DURATION_SECONDS,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.voice_transcriber")
|
||||
|
||||
GROQ_WHISPER_URL = "https://api.groq.com/openai/v1/audio/transcriptions"
|
||||
WHISPER_MODEL = "whisper-large-v3"
|
||||
|
||||
# Groq file size limit for Whisper: 25 MB
|
||||
GROQ_MAX_FILE_BYTES = 25 * 1024 * 1024
|
||||
|
||||
|
||||
# --- Main transcription function ---------------------------------------------
|
||||
|
||||
async def transcribe_audio(
|
||||
audio_bytes: bytes,
|
||||
filename: str = "audio.ogg",
|
||||
duration_seconds: int = None,
|
||||
) -> dict:
|
||||
"""
|
||||
Transcribe audio bytes using Groq Whisper.
|
||||
|
||||
Args:
|
||||
audio_bytes: Raw audio data (OGG, MP4, WAV, etc.)
|
||||
filename: Filename hint for the API (determines format detection)
|
||||
duration_seconds: Voice message duration from Telegram metadata (for pre-filtering)
|
||||
|
||||
Returns:
|
||||
{
|
||||
"ok": True,
|
||||
"transcript": "The full transcribed text...",
|
||||
"language": "en",
|
||||
"duration": 45,
|
||||
"word_count": 120,
|
||||
}
|
||||
OR on failure:
|
||||
{
|
||||
"ok": False,
|
||||
"reason": "too_long" | "too_short" | "empty" | "file_too_large" | "api_error" | "no_speech",
|
||||
"error": "optional error string",
|
||||
}
|
||||
"""
|
||||
# Pre-flight checks
|
||||
if not GROQ_API_KEY or len(GROQ_API_KEY) < 5:
|
||||
return {"ok": False, "reason": "api_error", "error": "GROQ_API_KEY not set"}
|
||||
|
||||
if not audio_bytes:
|
||||
return {"ok": False, "reason": "empty", "error": "No audio bytes received"}
|
||||
|
||||
if len(audio_bytes) > GROQ_MAX_FILE_BYTES:
|
||||
return {
|
||||
"ok": False,
|
||||
"reason": "file_too_large",
|
||||
"error": f"Audio is {len(audio_bytes) / 1024 / 1024:.1f}MB — Groq limit is 25MB",
|
||||
}
|
||||
|
||||
if duration_seconds is not None:
|
||||
if duration_seconds < VOICE_MIN_DURATION_SECONDS:
|
||||
return {
|
||||
"ok": False,
|
||||
"reason": "too_short",
|
||||
"error": f"Voice note is {duration_seconds}s — minimum is {VOICE_MIN_DURATION_SECONDS}s",
|
||||
}
|
||||
if duration_seconds > VOICE_MAX_DURATION_SECONDS:
|
||||
return {
|
||||
"ok": False,
|
||||
"reason": "too_long",
|
||||
"error": f"Voice note is {duration_seconds}s — maximum is {VOICE_MAX_DURATION_SECONDS}s",
|
||||
}
|
||||
|
||||
# Determine MIME type from filename extension
|
||||
ext_to_mime = {
|
||||
".ogg": "audio/ogg",
|
||||
".opus": "audio/ogg",
|
||||
".mp3": "audio/mpeg",
|
||||
".mp4": "video/mp4",
|
||||
".m4a": "audio/mp4",
|
||||
".wav": "audio/wav",
|
||||
".flac": "audio/flac",
|
||||
".webm": "audio/webm",
|
||||
}
|
||||
ext = "." + filename.rsplit(".", 1)[-1].lower() if "." in filename else ".ogg"
|
||||
mime_type = ext_to_mime.get(ext, "audio/ogg")
|
||||
|
||||
form_data = {
|
||||
"model": WHISPER_MODEL,
|
||||
"response_format": "verbose_json", # returns language detection
|
||||
"temperature": "0", # deterministic transcription
|
||||
}
|
||||
if VOICE_LANGUAGE:
|
||||
form_data["language"] = VOICE_LANGUAGE
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
resp = await client.post(
|
||||
GROQ_WHISPER_URL,
|
||||
headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
|
||||
files={"file": (filename, io.BytesIO(audio_bytes), mime_type)},
|
||||
data=form_data,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
error_text = ""
|
||||
try:
|
||||
error_text = e.response.json().get("error", {}).get("message", e.response.text[:200])
|
||||
except Exception:
|
||||
error_text = e.response.text[:200]
|
||||
|
||||
if e.response.status_code == 429:
|
||||
logger.warning("Groq Whisper rate limited")
|
||||
return {"ok": False, "reason": "api_error", "error": "Rate limited — try again shortly"}
|
||||
logger.error(f"Groq Whisper HTTP error {e.response.status_code}: {error_text}")
|
||||
return {"ok": False, "reason": "api_error", "error": f"HTTP {e.response.status_code}: {error_text}"}
|
||||
|
||||
except httpx.TimeoutException:
|
||||
logger.warning("Groq Whisper request timed out")
|
||||
return {"ok": False, "reason": "api_error", "error": "Request timed out after 60s"}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Groq Whisper unexpected error: {e}")
|
||||
return {"ok": False, "reason": "api_error", "error": str(e)}
|
||||
|
||||
# Parse response
|
||||
transcript = (data.get("text") or "").strip()
|
||||
|
||||
if not transcript:
|
||||
return {"ok": False, "reason": "no_speech", "error": "Whisper returned empty transcript"}
|
||||
|
||||
# Detect if Whisper only returned noise markers
|
||||
noise_patterns = {"[music]", "[noise]", "[silence]", "[inaudible]", "(music)", "(noise)"}
|
||||
if transcript.lower() in noise_patterns:
|
||||
return {"ok": False, "reason": "no_speech", "error": f"Only noise detected: {transcript}"}
|
||||
|
||||
detected_language = data.get("language", VOICE_LANGUAGE or "unknown")
|
||||
word_count = len(transcript.split())
|
||||
|
||||
logger.info(
|
||||
f"Whisper transcribed {duration_seconds or '?'}s audio -> "
|
||||
f"{word_count} words [{detected_language}]: {transcript[:60]}..."
|
||||
)
|
||||
|
||||
return {
|
||||
"ok": True,
|
||||
"transcript": transcript,
|
||||
"language": detected_language,
|
||||
"duration": duration_seconds,
|
||||
"word_count": word_count,
|
||||
}
|
||||
|
||||
|
||||
# --- Telegram-specific download helper ---------------------------------------
|
||||
|
||||
async def download_telegram_audio(bot, file_id: str) -> bytes:
    """Fetch a Telegram file (voice or video_note) by id and return its raw bytes.

    Resolves the file handle via the Bot API, then downloads the payload
    and converts the bytearray to an immutable ``bytes`` object.
    """
    handle = await bot.get_file(file_id)
    payload = await handle.download_as_bytearray()
    return bytes(payload)
|
||||
|
||||
|
||||
def format_duration(seconds: int) -> str:
|
||||
"""Format seconds into human-readable string: '1m 34s' or '45s'."""
|
||||
if seconds is None:
|
||||
return "?"
|
||||
if seconds >= 60:
|
||||
return f"{seconds // 60}m {seconds % 60}s"
|
||||
return f"{seconds}s"
|
||||
84
thirdeye/backend/agents/web_search.py
Normal file
84
thirdeye/backend/agents/web_search.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""Web Search Agent — Tavily integration for real-time web context."""
|
||||
import asyncio
import logging

from backend.config import TAVILY_API_KEY, ENABLE_WEB_SEARCH
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.web_search")
|
||||
|
||||
# Module-level cache for the lazily-initialized Tavily client; populated
# on first successful call to _get_client() and reused afterwards.
_tavily_client = None
|
||||
|
||||
|
||||
def _get_client():
    """Return the shared Tavily client, constructing it lazily on first use.

    Returns None when the API key is absent/too short, the ``tavily``
    package is not installed, or construction fails — all failures are
    logged and swallowed so callers can simply check for None.
    """
    global _tavily_client
    # Already built on an earlier call — reuse the singleton.
    if _tavily_client is not None:
        return _tavily_client
    # A real key is required; very short values are treated as unset.
    if not TAVILY_API_KEY or len(TAVILY_API_KEY) <= 5:
        return _tavily_client
    try:
        from tavily import TavilyClient
        _tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
        logger.info("Tavily client initialized")
    except ImportError:
        logger.error("tavily-python not installed. Run: pip install tavily-python")
    except Exception as e:
        logger.error(f"Tavily client init failed: {e}")
    return _tavily_client
|
||||
|
||||
|
||||
async def search_web(query: str, max_results: int = 5) -> list[dict]:
    """
    Search the web using Tavily and return structured results.

    Args:
        query: Search query string
        max_results: Max results to return (1-10)

    Returns:
        List of {title, url, content, score} dicts, sorted by relevance.
        Empty list when search is disabled, the client is unavailable,
        or the request fails — callers never see an exception.
    """
    if not ENABLE_WEB_SEARCH:
        logger.info("Web search is disabled via feature flag")
        return []

    client = _get_client()
    if not client:
        logger.warning("Tavily client not available (missing API key or install)")
        return []

    try:
        # TavilyClient.search is a synchronous HTTP call; run it in a worker
        # thread so a slow request does not block the event loop.
        response = await asyncio.to_thread(
            client.search,
            query=query,
            max_results=max_results,
            search_depth="basic",  # "basic" is faster + free-tier friendly; "advanced" for deeper
            include_answer=False,
            include_raw_content=False,
        )

        # Normalize to a plain list of dicts with defaults for missing fields.
        results = [
            {
                "title": r.get("title", ""),
                "url": r.get("url", ""),
                "content": r.get("content", ""),
                "score": r.get("score", 0.0),
            }
            for r in response.get("results", [])
        ]

        logger.info(f"Tavily returned {len(results)} results for: {query[:60]}")
        return results

    except Exception as e:
        # Web search is an optional enrichment — degrade to "no results".
        logger.error(f"Tavily search failed: {e}")
        return []
|
||||
|
||||
|
||||
def format_search_results_for_llm(results: list[dict]) -> str:
    """Render Tavily results as a plain-text context block for the Query Agent.

    Returns "" for an empty result list; otherwise one numbered entry per
    result (title, source URL, content truncated to 500 chars), with
    entries separated by blank lines.
    """
    if not results:
        return ""

    def _entry(index: int, item: dict) -> str:
        # Truncate long pages; substitute a placeholder when content is empty.
        body = item["content"][:500] if item["content"] else "No content"
        return (
            f"[Web Result {index}] {item['title']}\n"
            f"Source: {item['url']}\n"
            f"Content: {body}"
        )

    return "\n\n".join(_entry(i, r) for i, r in enumerate(results, start=1))
|
||||
Reference in New Issue
Block a user