mirror of
https://github.com/arkorty/B.Tech-Project-III.git
synced 2026-04-19 12:41:48 +00:00
init
This commit is contained in:
34
thirdeye/backend/agents/classifier.py
Normal file
34
thirdeye/backend/agents/classifier.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""Classifier Agent — adds metadata tags to extracted signals."""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.models import Signal
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.classifier")
|
||||
|
||||
SYSTEM_PROMPT = """You are a fast metadata classifier. Given an extracted signal, add classification tags.
|
||||
|
||||
Respond ONLY with valid JSON (no markdown, no backticks):
|
||||
{"sentiment": "positive|neutral|negative|urgent", "urgency": "none|low|medium|high|critical", "keywords": ["3-5 searchable keywords"]}
|
||||
"""
|
||||
|
||||
|
||||
async def classify_signal(signal: Signal) -> Signal:
    """Add classification metadata (sentiment, urgency, keywords) to a signal.

    Best-effort: on any LLM or parse failure the signal is returned with its
    existing values untouched — classification failure is non-fatal.

    Args:
        signal: The extracted signal to enrich in place.

    Returns:
        The same signal object, possibly with updated metadata fields.
    """
    # Allowed vocabularies, mirroring SYSTEM_PROMPT; guard against the LLM
    # returning values outside the requested sets.
    allowed_sentiments = {"positive", "neutral", "negative", "urgent"}
    allowed_urgencies = {"none", "low", "medium", "high", "critical"}

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Classify this signal:\nType: {signal.type}\nSummary: {signal.summary}\nQuote: {signal.raw_quote}"},
    ]

    try:
        result = await call_llm("fast_small", messages, temperature=0.1, max_tokens=200)
        parsed = extract_json_object(result.get("content", ""))

        # Only overwrite existing values with validated LLM output.
        sentiment = str(parsed.get("sentiment", "")).strip().lower()
        if sentiment in allowed_sentiments:
            signal.sentiment = sentiment

        urgency = str(parsed.get("urgency", "")).strip().lower()
        if urgency in allowed_urgencies:
            signal.urgency = urgency

        keywords = parsed.get("keywords")
        if isinstance(keywords, list):
            signal.keywords = [str(k) for k in keywords]

    except Exception as e:
        logger.warning(f"Classification failed, using defaults: {e}")
        # Keep defaults — classification failure is non-fatal

    return signal
|
||||
107
thirdeye/backend/agents/context_detector.py
Normal file
107
thirdeye/backend/agents/context_detector.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""Context Detector Agent — auto-classifies group type from messages."""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.context_detector")
|
||||
|
||||
SYSTEM_PROMPT = """You analyze a batch of messages from a Telegram group and determine what TYPE of group this is.
|
||||
|
||||
CLASSIFY into exactly ONE:
|
||||
- "dev" — Software engineering team (code, PRs, deployments, bugs, tech stack)
|
||||
- "product" — Product/business team (features, users, metrics, roadmap, competitors)
|
||||
- "client" — Client/agency channel (deliverables, timelines, approvals, invoices)
|
||||
- "community" — Community/interest group (recommendations, events, local info, casual)
|
||||
|
||||
Respond ONLY with valid JSON (no markdown, no backticks):
|
||||
{"detected_lens": "dev|product|client|community", "confidence": 0.0-1.0, "evidence": ["signal1", "signal2", "signal3"]}
|
||||
"""
|
||||
|
||||
VALID_LENSES = {"dev", "product", "client", "community"}
|
||||
|
||||
|
||||
def _heuristic_detect_context(messages_text: str) -> dict:
|
||||
"""Rule-based fallback when LLM output is malformed/unavailable."""
|
||||
text = (messages_text or "").lower()
|
||||
|
||||
lens_keywords = {
|
||||
"dev": [
|
||||
"bug", "deploy", "deployment", "api", "database", "schema", "postgres", "mongo",
|
||||
"timeout", "endpoint", "pod", "pr", "code", "docker", "stack", "integration",
|
||||
],
|
||||
"product": [
|
||||
"feature", "roadmap", "user", "users", "client", "customers", "complain", "pain",
|
||||
"prioritize", "priority", "enterprise", "competitor", "demo", "sso", "dark mode",
|
||||
"mobile", "stability", "integration",
|
||||
],
|
||||
"client": [
|
||||
"invoice", "deadline", "deliverable", "approval", "sign-off", "scope", "payment",
|
||||
"contract", "proposal", "timeline", "meeting",
|
||||
],
|
||||
"community": [
|
||||
"event", "meetup", "recommend", "anyone", "community", "local", "where can i",
|
||||
"suggestion", "friends", "weekend",
|
||||
],
|
||||
}
|
||||
|
||||
scores = {
|
||||
lens: sum(text.count(keyword) for keyword in keywords)
|
||||
for lens, keywords in lens_keywords.items()
|
||||
}
|
||||
|
||||
best_lens = max(scores, key=scores.get)
|
||||
best_score = scores[best_lens]
|
||||
if best_score == 0:
|
||||
best_lens = "dev"
|
||||
|
||||
evidence = [k for k in lens_keywords[best_lens] if k in text][:3]
|
||||
confidence = min(0.95, 0.35 + 0.08 * best_score) if best_score > 0 else 0.0
|
||||
|
||||
return {
|
||||
"detected_lens": best_lens,
|
||||
"confidence": round(confidence, 2),
|
||||
"evidence": evidence or ["heuristic_fallback"],
|
||||
}
|
||||
|
||||
|
||||
async def detect_context(messages_text: str) -> dict:
    """Detect group type from a batch of messages.

    Returns {"detected_lens", "confidence", "evidence"}; falls back to the
    keyword heuristic (plus a "detection_failed" evidence marker) when the
    LLM call or JSON parsing fails.
    """
    prompt = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Classify this group based on these messages:\n\n{messages_text}"},
    ]

    try:
        result = await call_llm(
            "fast_large",
            prompt,
            temperature=0.1,
            max_tokens=300,
            response_format={"type": "json_object"},
        )
        parsed = extract_json_object(result.get("content", ""))

        # Normalize the lens and fall back to "dev" for anything unexpected.
        lens = str(parsed.get("detected_lens", "dev")).strip().lower()
        if lens not in VALID_LENSES:
            lens = "dev"

        # Coerce confidence to a float in [0, 1], defaulting to 0.5.
        try:
            confidence = float(parsed.get("confidence", 0.5))
        except (TypeError, ValueError):
            confidence = 0.5
        confidence = max(0.0, min(1.0, confidence))

        evidence = parsed.get("evidence", [])
        if not isinstance(evidence, list):
            evidence = [str(evidence)]

        return {
            "detected_lens": lens,
            "confidence": confidence,
            "evidence": [str(item) for item in evidence][:5],
        }
    except Exception as e:
        logger.error(f"Context detection failed: {e}")
        fallback = _heuristic_detect_context(messages_text)
        fallback["evidence"] = fallback["evidence"] + ["detection_failed"]
        return fallback
|
||||
287
thirdeye/backend/agents/cross_group_analyst.py
Normal file
287
thirdeye/backend/agents/cross_group_analyst.py
Normal file
@@ -0,0 +1,287 @@
|
||||
"""Cross-Group Analyst Agent — detects blind spots between multiple teams."""
|
||||
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.chroma import get_all_signals, get_group_ids
|
||||
from backend.db.models import CrossGroupInsight
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.cross_group_analyst")
|
||||
|
||||
SYSTEM_PROMPT = """You are the Cross-Group Intelligence Analyst for ThirdEye. This is the MOST IMPORTANT analysis.
|
||||
|
||||
You receive intelligence summaries from MULTIPLE Telegram groups. Your job is to find BLIND SPOTS — information in one group that should be in another.
|
||||
|
||||
Detect:
|
||||
- blocked_handoff: Team A waiting for something from Team B, but Team B doesn't know
|
||||
- conflicting_decision: Team A decided X, Team B decided the opposite
|
||||
- information_silo: Critical info in Group A never reached Group B
|
||||
- promise_reality_gap: Promise made in one group, but another group shows it's blocked
|
||||
- duplicated_effort: Two teams working on similar things unknowingly
|
||||
|
||||
Respond ONLY with valid JSON (no markdown):
|
||||
{"insights": [{"type": "insight_type", "description": "SPECIFIC description naming the groups, people, and topics", "group_a": {"name": "group_name", "evidence": "what was said"}, "group_b": {"name": "group_name", "evidence": "what was said or NOT said"}, "severity": "warning|critical", "recommendation": "Specific action"}]}
|
||||
|
||||
If no cross-group issues: {"insights": []}
|
||||
Be SPECIFIC. Name the groups, people, topics, and exact conflicts."""
|
||||
|
||||
|
||||
def _heuristic_cross_group_insights(
    group_summaries: dict[str, list[dict]],
) -> list[CrossGroupInsight]:
    """Generate best-effort cross-group insights when LLM output is unavailable.

    Applies three heuristics to every pair of groups:
      1. blocked_handoff — one group mentions waiting/blocked on design specs
         the other group never discusses;
      2. promise_reality_gap — one group signals delivery promises while the
         other reports blockers;
      3. information_silo — one group's signal TYPES show operational risk
         while the other's are planning-focused (checked in both directions).

    Returns:
        At most 5 insights, deduplicated by (type, group_a, group_b).
    """
    # Signal types that indicate operational/engineering risk vs planning focus.
    risk_types = {"recurring_bug", "workaround", "tech_debt", "deployment_risk"}
    planning_types = {"feature_request", "roadmap_drift", "priority_conflict", "user_pain_point"}

    # Collapse each group's signals into lowercased text plus a type set.
    normalized: dict[str, dict] = {}
    for group_name, signals in group_summaries.items():
        combined = " ".join(str(s.get("document", "")) for s in signals).lower()
        types = {str(s.get("metadata", {}).get("type", "unknown")).lower() for s in signals}
        normalized[group_name] = {"text": combined, "types": types}

    def silo_insight(risk_group: str, risk_hits: set, plan_group: str, plan_hits: set) -> CrossGroupInsight:
        # group_a always carries the operational-risk side, group_b the
        # planning side — replaces the previous copy-pasted forward/reverse
        # branches with one shared builder.
        return CrossGroupInsight(
            type="information_silo",
            description=(
                f"{risk_group} shows operational risk signals while {plan_group} is focused on planning/user demands, "
                "suggesting risk context is not shared across groups."
            ),
            group_a={
                "name": risk_group,
                "evidence": f"Operational risk signal types: {sorted(risk_hits)}",
            },
            group_b={
                "name": plan_group,
                "evidence": f"Planning-focused signal types: {sorted(plan_hits)}",
            },
            severity="warning",
            recommendation="Add a weekly cross-functional risk sync so product planning reflects current engineering constraints.",
        )

    insights: list[CrossGroupInsight] = []
    group_names = list(normalized.keys())
    for i in range(len(group_names)):
        for j in range(i + 1, len(group_names)):
            group_a = group_names[i]
            group_b = group_names[j]
            text_a = normalized[group_a]["text"]
            text_b = normalized[group_b]["text"]
            types_a = normalized[group_a]["types"]
            types_b = normalized[group_b]["types"]

            # Detect a likely blocked handoff around design/spec dependencies.
            a_waiting = any(
                k in text_a for k in ("waiting", "blocked", "design spec", "specs")
            )
            b_mentions_specs = any(
                k in text_b for k in ("design spec", "specs", "design")
            )
            if a_waiting and not b_mentions_specs:
                insights.append(
                    CrossGroupInsight(
                        type="blocked_handoff",
                        description=(
                            f"{group_a} indicates dependency blockage (design/spec inputs), "
                            f"but {group_b} has no corresponding discussion of that dependency."
                        ),
                        group_a={
                            "name": group_a,
                            "evidence": "Contains waiting/blocked language tied to specs or design dependency.",
                        },
                        group_b={
                            "name": group_b,
                            "evidence": "No clear mention of design specs/dependency handoff in available signals.",
                        },
                        severity="warning",
                        recommendation=(
                            f"Create a shared handoff item between {group_a} and {group_b} for design/spec ownership "
                            "with an explicit due date."
                        ),
                    )
                )

            # Detect likely promise vs execution mismatch.
            b_promises = any(
                k in text_b
                for k in ("demo", "friday", "promised", "told the client", "ready by")
            )
            a_blocked = any(
                k in text_a
                for k in ("blocked", "waiting", "can't proceed", "cannot proceed")
            )
            if b_promises and a_blocked:
                insights.append(
                    CrossGroupInsight(
                        type="promise_reality_gap",
                        description=(
                            f"{group_b} signals delivery promises while {group_a} reports blockers that may prevent those commitments."
                        ),
                        group_a={
                            "name": group_a,
                            "evidence": "Signals include active blockers/waiting dependencies.",
                        },
                        group_b={
                            "name": group_b,
                            "evidence": "Signals include explicit client/demo commitments and timelines.",
                        },
                        severity="critical",
                        recommendation="Run a joint risk review and re-baseline commitments before the next client update.",
                    )
                )

            # Type-based silo detection when lexical cues are weak — both
            # directions, via the shared builder above.
            for risk_group, risk_set, plan_group, plan_set in (
                (group_a, types_a, group_b, types_b),
                (group_b, types_b, group_a, types_a),
            ):
                risk_hits = risk_set & risk_types
                plan_hits = plan_set & planning_types
                if risk_hits and plan_hits:
                    insights.append(silo_insight(risk_group, risk_hits, plan_group, plan_hits))

    # Deduplicate by (type, group_a, group_b) and cap at 5.
    deduped: list[CrossGroupInsight] = []
    seen_keys = set()
    for insight in insights:
        key = (insight.type, insight.group_a.get("name"), insight.group_b.get("name"))
        if key in seen_keys:
            continue
        seen_keys.add(key)
        deduped.append(insight)

    return deduped[:5]
|
||||
|
||||
|
||||
async def analyze_cross_group(
    group_summaries: dict[str, list[dict]] | None = None,
) -> list[CrossGroupInsight]:
    """
    Analyze intelligence across all monitored groups to find blind spots.

    Args:
        group_summaries: Optional pre-built summaries. If None, loads from ChromaDB.

    Returns:
        A list of CrossGroupInsight; empty when fewer than 2 groups are known.
        Falls back to the rule-based heuristic if the LLM call/parse fails.
    """
    if group_summaries is None:
        group_ids = get_group_ids()
        if len(group_ids) < 2:
            logger.info("Need at least 2 groups for cross-group analysis")
            return []
        group_summaries = {gid: get_all_signals(gid) for gid in group_ids}

    if len(group_summaries) < 2:
        return []

    # Format summaries for the LLM
    summary_parts = []
    for group_name, signals in group_summaries.items():
        signal_lines = []
        for s in signals[:30]:  # Limit per group to fit context
            # Robustness: tolerate signals missing "metadata"/"document" keys,
            # consistent with the .get() usage in the heuristic fallback.
            meta = s.get("metadata", {})
            signal_lines.append(f" - [{meta.get('type', '?')}] {s.get('document', '')[:120]}")

        summary_parts.append(
            f"=== GROUP: {group_name} ({len(signals)} total signals) ===\n"
            + "\n".join(signal_lines)
        )

    full_summary = "\n\n".join(summary_parts)

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"Analyze cross-group intelligence:\n\n{full_summary}",
        },
    ]

    # Pre-bind so the except block can inspect the raw response without the
    # fragile `"result" in locals()` check.
    result = None
    try:
        result = await call_llm(
            "reasoning",
            messages,
            temperature=0.2,
            max_tokens=2000,
            response_format={"type": "json_object"},
        )
        parsed = extract_json_object(result.get("content", ""))
        insights = [
            CrossGroupInsight(
                type=i.get("type", "unknown"),
                description=i.get("description", ""),
                group_a=i.get("group_a", {}),
                group_b=i.get("group_b", {}),
                severity=i.get("severity", "warning"),
                recommendation=i.get("recommendation", ""),
            )
            for i in parsed.get("insights", [])
        ]

        logger.info(f"Cross-group analysis found {len(insights)} insights")
        return insights

    except Exception as e:
        raw = ""
        if isinstance(result, dict):
            raw = str(result.get("content", ""))[:300].replace("\n", " ")
        # Warning (not info): the primary analysis path failed.
        logger.warning(f"Cross-group LLM parse issue, using fallback: {e}; raw_head={raw}")
        fallback = _heuristic_cross_group_insights(group_summaries)
        if fallback:
            logger.info(
                f"Cross-group heuristic fallback produced {len(fallback)} insights"
            )
        return fallback
|
||||
200
thirdeye/backend/agents/document_ingestor.py
Normal file
200
thirdeye/backend/agents/document_ingestor.py
Normal file
@@ -0,0 +1,200 @@
|
||||
"""Document Ingestor — extracts text from PDFs, DOCX, TXT and chunks for RAG storage."""
|
||||
import os
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.document_ingestor")
|
||||
|
||||
# --- Text Extraction ---
|
||||
|
||||
def extract_text_from_pdf(file_path: str) -> list[dict]:
    """Extract text from PDF, returns list of {page: int, text: str}.

    Pages with no extractable text are skipped. On any parse error, the
    pages collected so far are returned and the error is logged.
    """
    from PyPDF2 import PdfReader

    pages: list[dict] = []
    try:
        for index, page in enumerate(PdfReader(file_path).pages):
            content = page.extract_text()
            if content and content.strip():
                pages.append({"page": index + 1, "text": content.strip()})
    except Exception as e:
        logger.error(f"PDF extraction failed for {file_path}: {e}")

    return pages
|
||||
|
||||
|
||||
def extract_text_from_docx(file_path: str) -> list[dict]:
    """Extract text from DOCX, returns list of {page: 1, text: str} (DOCX has no real pages)."""
    from docx import Document

    try:
        paragraphs = Document(file_path).paragraphs
        joined = "\n".join(p.text for p in paragraphs if p.text.strip())
        if joined.strip():
            return [{"page": 1, "text": joined.strip()}]
    except Exception as e:
        logger.error(f"DOCX extraction failed for {file_path}: {e}")

    return []
|
||||
|
||||
|
||||
def extract_text_from_txt(file_path: str) -> list[dict]:
    """Extract text from plain text file.

    Returns a single pseudo-page, or [] when the file is empty/unreadable.
    Undecodable bytes are dropped (errors="ignore") rather than raising.
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
            content = handle.read().strip()
    except Exception as e:
        logger.error(f"TXT extraction failed for {file_path}: {e}")
        return []

    return [{"page": 1, "text": content}] if content else []
|
||||
|
||||
|
||||
# Maps file extension → extractor function. Plain-text-like formats
# (.md, .csv, .json, .log) all reuse the TXT extractor since they need
# no structural parsing before chunking.
EXTRACTORS = {
    ".pdf": extract_text_from_pdf,
    ".docx": extract_text_from_docx,
    ".txt": extract_text_from_txt,
    ".md": extract_text_from_txt,
    ".csv": extract_text_from_txt,
    ".json": extract_text_from_txt,
    ".log": extract_text_from_txt,
}
|
||||
|
||||
|
||||
def extract_text(file_path: str) -> list[dict]:
    """Route to correct extractor based on file extension.

    Returns [] (with a warning logged) for unsupported extensions.
    """
    extension = os.path.splitext(file_path)[1].lower()
    handler = EXTRACTORS.get(extension)
    if handler is None:
        logger.warning(f"Unsupported file type: {extension} ({file_path})")
        return []
    return handler(file_path)
|
||||
|
||||
|
||||
# --- Chunking ---
|
||||
|
||||
def chunk_text(text: str, max_chars: int = 1500, overlap_chars: int = 200) -> list[str]:
    """
    Split text into overlapping chunks.

    Uses paragraph boundaries when possible, falls back to sentence boundaries,
    then hard character splits. ~1500 chars ≈ ~375 tokens for embedding.

    Args:
        text: The text to split.
        max_chars: Target maximum chunk size (before overlap is added).
        overlap_chars: Trailing characters of the previous chunk prepended to
            each subsequent chunk (trimmed forward to a word boundary).

    Returns:
        List of chunk strings; a single-element list when text already fits.
    """
    if len(text) <= max_chars:
        return [text]

    # Split by paragraphs first
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]

    chunks = []
    current_chunk = ""

    for para in paragraphs:
        # If adding this paragraph stays under limit, add it
        if len(current_chunk) + len(para) + 1 <= max_chars:
            current_chunk = (current_chunk + "\n" + para).strip()
            continue

        # Save current chunk if it has content
        if current_chunk:
            chunks.append(current_chunk)
            current_chunk = ""

        if len(para) <= max_chars:
            current_chunk = para
            continue

        # Single paragraph too long: split by sentences.
        sentences = para.replace(". ", ".\n").split("\n")
        sub_chunk = ""
        for sent in sentences:
            if len(sent) > max_chars:
                # Bug fix: previously a single sentence longer than max_chars
                # became one oversized chunk; hard-split it at max_chars as the
                # docstring promises.
                if sub_chunk:
                    chunks.append(sub_chunk)
                    sub_chunk = ""
                for start in range(0, len(sent), max_chars):
                    piece = sent[start:start + max_chars]
                    if len(piece) == max_chars:
                        chunks.append(piece)
                    else:
                        sub_chunk = piece  # carry the short tail forward
                continue
            if len(sub_chunk) + len(sent) + 1 <= max_chars:
                sub_chunk = (sub_chunk + " " + sent).strip()
            else:
                if sub_chunk:
                    chunks.append(sub_chunk)
                sub_chunk = sent
        current_chunk = sub_chunk

    if current_chunk:
        chunks.append(current_chunk)

    # Add overlap: prepend last N chars of previous chunk to each subsequent chunk
    if overlap_chars > 0 and len(chunks) > 1:
        overlapped = [chunks[0]]
        for i in range(1, len(chunks)):
            prev_tail = chunks[i - 1][-overlap_chars:]
            # Find a word boundary in the overlap
            space_idx = prev_tail.find(" ")
            if space_idx > 0:
                prev_tail = prev_tail[space_idx + 1:]
            overlapped.append(prev_tail + " " + chunks[i])
        chunks = overlapped

    return chunks
|
||||
|
||||
|
||||
# --- Main Ingestion ---
|
||||
|
||||
def ingest_document(
    file_path: str,
    group_id: str,
    shared_by: str = "Unknown",
    filename: str | None = None,
) -> list[dict]:
    """
    Full pipeline: extract text → chunk → produce signal dicts ready for ChromaDB.

    Args:
        file_path: Path to the downloaded file on disk
        group_id: Telegram group ID
        shared_by: Who shared the file
        filename: Original filename (for metadata); defaults to the basename
            of file_path.

    Returns:
        List of signal dicts ready for store_signals()
    """
    if filename is None:
        filename = os.path.basename(file_path)

    # Extract
    pages = extract_text(file_path)
    if not pages:
        # Bug fix: this f-string previously printed a literal placeholder
        # instead of interpolating the filename.
        logger.warning(f"No text extracted from {filename}")
        return []

    # Chunk each page
    signals = []
    total_chunks = 0

    for page_data in pages:
        page_num = page_data["page"]
        chunks = chunk_text(page_data["text"])

        for chunk_text_str in chunks:
            if len(chunk_text_str.strip()) < 30:
                continue  # Skip tiny chunks — too small to embed usefully

            signal = {
                "id": str(uuid.uuid4()),
                "type": "document_knowledge",
                # Bug fix: include the actual filename in the summary header.
                "summary": f"[{filename} p{page_num}] {chunk_text_str[:150]}...",
                "entities": [f"@{shared_by}", filename],
                "severity": "low",
                "status": "reference",
                "sentiment": "neutral",
                "urgency": "none",
                "raw_quote": chunk_text_str,
                # NOTE(review): naive-UTC format kept for backward compatibility;
                # datetime.utcnow() is deprecated in 3.12 — migrate to
                # datetime.now(timezone.utc) once consumers tolerate the offset.
                "timestamp": datetime.utcnow().isoformat(),
                "group_id": group_id,
                "lens": "document",
                "keywords": [filename, f"page_{page_num}", "document", shared_by],
            }
            signals.append(signal)
            total_chunks += 1

    logger.info(f"Ingested {filename}: {len(pages)} pages → {total_chunks} chunks for group {group_id}")
    return signals
|
||||
373
thirdeye/backend/agents/jira_agent.py
Normal file
373
thirdeye/backend/agents/jira_agent.py
Normal file
@@ -0,0 +1,373 @@
|
||||
"""
|
||||
Jira Signal Agent
|
||||
Takes ThirdEye signals and converts them into well-formed Jira tickets.
|
||||
|
||||
Responsibilities:
|
||||
1. Map signal type → Jira issue type + priority
|
||||
2. LLM-generate a clean ticket title and structured description from signal context
|
||||
3. Extract assignee names and match them to Jira account IDs (best-effort)
|
||||
4. Raise the ticket via jira_client and mark the signal in ChromaDB
|
||||
5. Bulk-raise: process a group's unraised high-severity signals in one call
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
from backend.providers import call_llm
|
||||
from backend.integrations.jira_client import (
|
||||
create_issue, search_issues, add_comment, is_configured, search_users
|
||||
)
|
||||
from backend.db.chroma import store_signals, mark_signal_as_raised, get_raised_signal_ids
|
||||
from backend.config import (
|
||||
JIRA_DEFAULT_PROJECT, JIRA_DEFAULT_ISSUE_TYPE,
|
||||
JIRA_AUTO_RAISE_SEVERITY
|
||||
)
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.jira_agent")
|
||||
|
||||
|
||||
# ─── Signal → Jira type mapping ──────────────────────────────────────────────

# Maps ThirdEye signal type → (Jira issue type, default priority)
# Note: Issue types must match what's available in your Jira project
# Common types: Task, Bug, Story, Epic, Workstream (project-specific)
SIGNAL_TYPE_MAP = {
    # Dev signals
    "tech_debt": ("Task", "Low"),
    "recurring_bug": ("Task", "High"),  # Changed from Bug to Task
    "architecture_decision": ("Task", "Medium"),
    "deployment_risk": ("Task", "High"),
    "workaround": ("Task", "Medium"),
    "knowledge_silo": ("Task", "Medium"),
    # Product signals
    "feature_request": ("Task", "Medium"),  # Changed from Story to Task
    "priority_conflict": ("Task", "High"),
    "sentiment_shift": ("Task", "Medium"),
    # Client signals
    "promise": ("Task", "High"),
    "scope_creep": ("Task", "High"),
    "risk": ("Task", "High"),
    # Meet signals
    "meet_action_item": ("Task", "Medium"),
    "meet_blocker": ("Task", "Highest"),
    "meet_risk": ("Task", "High"),
    "meet_decision": ("Task", "Medium"),
    "meet_open_q": ("Task", "Low"),
    # Generic
    "blocker": ("Task", "Highest"),
    "decision": ("Task", "Medium"),
    "action_item": ("Task", "Medium"),
}

# Maps a signal's severity to a Jira priority; when the signal carries a known
# severity, this overrides the per-type default priority from SIGNAL_TYPE_MAP.
SEVERITY_TO_PRIORITY = {
    "critical": "Highest",
    "high": "High",
    "medium": "Medium",
    "low": "Low",
}

# Only signal types listed in SIGNAL_TYPE_MAP may be raised as Jira tickets.
RAISEABLE_TYPES = set(SIGNAL_TYPE_MAP.keys())
|
||||
|
||||
|
||||
# ─── Assignee resolution ─────────────────────────────────────────────────────
|
||||
|
||||
async def resolve_assignee_account_id(name: str) -> str | None:
    """
    Resolve a person's display name (or @name) to their Jira account ID.

    Uses Jira's user search API: prefers an exact display-name match, then a
    partial match (every query word appears in the display name), and finally
    falls back to the FIRST search result — so a non-None return is
    best-effort rather than a guaranteed confident match. Returns None only
    when the name is empty, the search yields no users, or the lookup fails.
    """
    if not name:
        return None
    clean = name.lstrip("@").strip()
    try:
        users = await search_users(clean)
        if not users:
            return None
        clean_lower = clean.lower()
        # Exact display-name match first
        for u in users:
            if u["display_name"].lower() == clean_lower:
                return u["account_id"]
        # Partial match (all search words appear in display name)
        words = clean_lower.split()
        for u in users:
            dn = u["display_name"].lower()
            if all(w in dn for w in words):
                return u["account_id"]
        # Last resort: first result
        return users[0]["account_id"]
    except Exception as e:
        # Best-effort resolution: failures leave the ticket unassigned.
        logger.warning(f"resolve_assignee_account_id failed for '{name}': {e}")
        return None
|
||||
|
||||
|
||||
# ─── LLM ticket generation ───────────────────────────────────────────────────
|
||||
|
||||
TICKET_GEN_SYSTEM_PROMPT = """You are a senior engineering manager writing Jira tickets from team intelligence signals.
|
||||
|
||||
Given a ThirdEye signal (a structured piece of extracted team knowledge), write a Jira ticket.
|
||||
|
||||
Return ONLY a valid JSON object with exactly these fields:
|
||||
{
|
||||
"summary": "Short, actionable ticket title (max 100 chars). Start with a verb. No jargon.",
|
||||
"description": "Full ticket description. Include: what the issue is, context from the signal, why it matters, suggested next steps. Use blank lines between sections. Use '- ' for bullet points. Max 400 words.",
|
||||
"labels": ["label1", "label2"],
|
||||
"assignee_name": "First name or @name of the person to assign, or null if unclear"
|
||||
}
|
||||
|
||||
Label rules:
|
||||
- Always include "thirdeye" and "auto-raised"
|
||||
- Add the signal type as a label (e.g. "tech-debt", "recurring-bug")
|
||||
- Add "urgent" if severity is high or critical
|
||||
- Labels must not have spaces (use hyphens)
|
||||
|
||||
Summary rules:
|
||||
- Starts with a verb: "Fix", "Investigate", "Address", "Resolve", "Document", "Implement"
|
||||
- Be specific — "Fix intermittent checkout timeout" NOT "Fix bug"
|
||||
- Never exceed 100 characters
|
||||
|
||||
Description must include:
|
||||
1. What: clear 1-sentence problem statement
|
||||
2. Context: what was actually said / detected (cite the signal)
|
||||
3. Impact: why this matters to the team or product
|
||||
4. Suggested next steps (2-3 bullet points)
|
||||
|
||||
Return JSON only — no markdown, no preamble."""
|
||||
|
||||
|
||||
async def generate_ticket_content(signal: dict) -> dict:
    """
    Use an LLM to generate a clean, context-rich Jira ticket from a ThirdEye signal.
    Returns {"summary": str, "description": str, "labels": list, "assignee_name": str|None}

    Falls back to a template-built ticket (no LLM) if the call or JSON parse fails.
    """
    # Flatten the signal into a readable prompt payload for the LLM.
    signal_text = (
        f"Signal type: {signal.get('type', 'unknown')}\n"
        f"Summary: {signal.get('summary', '')}\n"
        f"Raw quote: {signal.get('raw_quote', '')[:300]}\n"
        f"Severity: {signal.get('severity', 'medium')}\n"
        f"Entities involved: {', '.join(signal.get('entities', []))}\n"
        f"Keywords: {', '.join(signal.get('keywords', []))}\n"
        f"Timestamp: {signal.get('timestamp', '')}\n"
        f"Group: {signal.get('group_id', '')}\n"
        f"Lens: {signal.get('lens', '')}"
    )

    try:
        result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": TICKET_GEN_SYSTEM_PROMPT},
                {"role": "user", "content": signal_text},
            ],
            temperature=0.2,
            max_tokens=800,
            response_format={"type": "json_object"},
        )
        raw = result["content"].strip()
        # Strip a leading markdown fence ("```json\n{...}\n```") in case the
        # model ignores the "JSON only" instruction.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        return json.loads(raw)

    except Exception as e:
        logger.warning(f"Ticket generation LLM failed: {e}. Using fallback.")
        # Fallback: build a basic ticket without LLM
        sig_type = signal.get("type", "unknown").replace("_", " ").title()
        return {
            "summary": f"{sig_type}: {signal.get('summary', 'Unknown issue')[:80]}",
            "description": (
                f"Signal detected by ThirdEye.\n\n"
                f"Type: {signal.get('type', 'unknown')}\n"
                f"Summary: {signal.get('summary', '')}\n\n"
                f"Raw context:\n{signal.get('raw_quote', '(none)')[:300]}\n\n"
                f"Severity: {signal.get('severity', 'medium')}"
            ),
            "labels": ["thirdeye", "auto-raised", signal.get("type", "unknown").replace("_", "-")],
            "assignee_name": None,
        }
|
||||
|
||||
|
||||
# ─── Main raise function ──────────────────────────────────────────────────────
|
||||
|
||||
async def raise_ticket_for_signal(
    signal: dict,
    group_id: str,
    project_key: str = None,
    force: bool = False,
    assignee_account_id: str = None,
) -> dict:
    """
    Create a Jira ticket for a single ThirdEye signal.

    Args:
        signal: The signal dict from ChromaDB
        group_id: The group this signal belongs to (for dedup tracking)
        project_key: Override project (default: JIRA_DEFAULT_PROJECT)
        force: If True, raise even if already raised before
        assignee_account_id: Explicit Jira account id; when set, skips name resolution

    Returns:
        {"ok": True, "key": "ENG-42", "url": "...", "summary": "..."}
        OR
        {"ok": False, "reason": "already_raised" | "not_raiseable" | "jira_error", ...}
    """
    if not is_configured():
        return {"ok": False, "reason": "jira_not_configured"}

    signal_id = signal.get("id", "")
    signal_type = signal.get("type", "")

    # Check if this signal type is raiseable
    if signal_type not in RAISEABLE_TYPES:
        return {"ok": False, "reason": "not_raiseable", "signal_type": signal_type}

    # Check if already raised (skip if force=True)
    if not force and signal_id:
        already_raised = get_raised_signal_ids(group_id)
        if signal_id in already_raised:
            return {"ok": False, "reason": "already_raised", "signal_id": signal_id}

    # Determine Jira issue type and priority from signal.
    # Guard severity with `or`: a present-but-None severity would crash .lower().
    default_type, default_priority = SIGNAL_TYPE_MAP.get(signal_type, (JIRA_DEFAULT_ISSUE_TYPE, "Medium"))
    severity = (signal.get("severity") or "medium").lower()
    priority = SEVERITY_TO_PRIORITY.get(severity, default_priority)

    # Generate ticket content via LLM (has its own non-LLM fallback)
    ticket_content = await generate_ticket_content(signal)

    # Use `or` fallbacks rather than dict.get defaults: the LLM may return a key
    # with an explicit null/empty value, which .get(key, default) passes through
    # and would otherwise produce a ticket with an empty summary/description.
    fallback_summary = signal.get("summary") or "ThirdEye signal"
    summary = ticket_content.get("summary") or fallback_summary[:100]
    description = ticket_content.get("description") or signal.get("summary") or ""
    labels = ticket_content.get("labels") or ["thirdeye", "auto-raised"]
    # Jira rejects non-string labels; coerce defensively since these come from the LLM.
    labels = [str(label) for label in labels]
    # Always ensure thirdeye label is present
    if "thirdeye" not in labels:
        labels.append("thirdeye")

    # Append ThirdEye metadata as a context section in the description
    meta_section = (
        f"\n\n---\n"
        f"Raised by: ThirdEye\n"
        f"Signal ID: {signal_id}\n"
        f"Group: {group_id}\n"
        f"Detected: {signal.get('timestamp', datetime.utcnow().isoformat())}"
    )
    description = description + meta_section

    # Resolve assignee: explicit account_id wins, then signal override name, then LLM-extracted name
    if not assignee_account_id:
        name_hint = signal.get("assignee_override") or ticket_content.get("assignee_name")
        if name_hint:
            assignee_account_id = await resolve_assignee_account_id(name_hint)
            if assignee_account_id:
                logger.info(f"Resolved assignee '{name_hint}' → {assignee_account_id}")
            else:
                logger.warning(f"Could not resolve assignee '{name_hint}' to a Jira account")

    # Create the ticket
    result = await create_issue(
        project_key=project_key or JIRA_DEFAULT_PROJECT,
        summary=summary,
        description=description,
        issue_type=default_type,
        priority=priority,
        labels=labels,
        assignee_account_id=assignee_account_id,
    )

    if result.get("ok"):
        jira_key = result["key"]
        jira_url = result["url"]
        # Mark this signal as raised in ChromaDB so we never duplicate it
        if signal_id:
            mark_signal_as_raised(
                group_id, signal_id, jira_key,
                jira_url=jira_url,
                jira_summary=summary,
                jira_priority=priority,
            )
        logger.info(f"Raised Jira ticket {jira_key} for signal {signal_id} ({signal_type})")
        return {
            "ok": True,
            "key": jira_key,
            "url": jira_url,
            "summary": summary,
            "issue_type": default_type,
            "priority": priority,
            "assignee_account_id": assignee_account_id,
        }
    else:
        logger.error(f"Jira ticket creation failed: {result}")
        return {
            "ok": False,
            "reason": "jira_error",
            "error": result.get("error"),
            "details": result.get("details"),
        }
|
||||
|
||||
|
||||
async def bulk_raise_for_group(
    group_id: str,
    signals: list[dict],
    min_severity: str = None,
    project_key: str = None,
    max_tickets: int = 10,
) -> list[dict]:
    """
    Raise Jira tickets for multiple signals from a group in one call.

    Candidates must be of a raiseable type, at or above min_severity
    (defaults to JIRA_AUTO_RAISE_SEVERITY), and not previously raised.
    The highest-severity candidates are processed first, capped at
    max_tickets so a noisy group cannot flood Jira.

    Returns a list of per-signal raise results.
    """
    rank_of = {"low": 0, "medium": 1, "high": 2, "critical": 3}
    threshold = rank_of.get((min_severity or JIRA_AUTO_RAISE_SEVERITY).lower(), 2)  # Default: high

    raised_before = get_raised_signal_ids(group_id)

    # Filter down to raiseable, severe-enough, not-yet-raised signals.
    candidates = [
        sig for sig in signals
        if sig.get("type", "") in RAISEABLE_TYPES
        and rank_of.get(sig.get("severity", "low").lower(), 0) >= threshold
        and sig.get("id", "") not in raised_before
    ]

    # Most severe first, then cap to avoid flooding Jira.
    candidates.sort(key=lambda s: rank_of.get(s.get("severity", "low"), 0), reverse=True)

    outcomes = []
    for sig in candidates[:max_tickets]:
        res = await raise_ticket_for_signal(sig, group_id, project_key=project_key)
        outcomes.append({**res, "signal_type": sig.get("type"), "signal_summary": sig.get("summary", "")[:80]})

    logger.info(f"Bulk raise for group {group_id}: {len(outcomes)} tickets from {len(signals)} signals")
    return outcomes
|
||||
|
||||
|
||||
def format_raise_result_for_telegram(result: dict) -> str:
    """Render one ticket-raise outcome as a single Telegram-markdown snippet."""
    if result.get("ok"):
        issue_type = result.get("issue_type", "Task")
        priority = result.get("priority", "Medium")
        short_summary = result.get("summary", "")[:90]
        return (
            f"✅ [{result['key']}]({result['url']}) — "
            f"*{issue_type}* | {priority} priority\n"
            f"  _{short_summary}_"
        )

    reason = result.get("reason", "unknown")
    known_failures = {
        "already_raised": "⏭️ Already raised — skipped",
        "not_raiseable": f"⚪ Signal type `{result.get('signal_type', '?')}` — not mapped to Jira",
    }
    if reason in known_failures:
        return known_failures[reason]
    return f"❌ Failed: {result.get('error', reason)}"
|
||||
43
thirdeye/backend/agents/json_utils.py
Normal file
43
thirdeye/backend/agents/json_utils.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""Utilities for robustly parsing JSON from LLM responses."""
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
def extract_json_object(content: str) -> dict:
    """Return the first top-level JSON object parsed from raw LLM output.

    Tolerates surrounding markdown code fences and wrapper prose.
    Raises json.JSONDecodeError when no JSON object can be decoded.
    """
    text = (content or "").strip()
    if not text:
        raise json.JSONDecodeError("Empty LLM response", text, 0)

    # Peel off a surrounding markdown code fence, if any.
    if text.startswith("```"):
        opened = re.sub(r"^```(?:json)?\s*", "", text, flags=re.IGNORECASE)
        text = re.sub(r"\s*```$", "", opened)
    text = text.strip()
    if not text:
        raise json.JSONDecodeError("Empty LLM response after cleanup", text, 0)

    # Fast path: the whole payload is a JSON object.
    try:
        whole = json.loads(text)
    except json.JSONDecodeError:
        whole = None
    if isinstance(whole, dict):
        return whole

    # Otherwise scan forward, attempting a decode at every "{". raw_decode
    # handles nested braces correctly, unlike any regex-based approach.
    decoder = json.JSONDecoder()
    for start in (i for i, ch in enumerate(text) if ch == "{"):
        try:
            candidate, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            continue
        if isinstance(candidate, dict):
            return candidate

    raise json.JSONDecodeError("No valid top-level JSON object found", text, 0)
|
||||
213
thirdeye/backend/agents/link_fetcher.py
Normal file
213
thirdeye/backend/agents/link_fetcher.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""Link Fetcher — extracts, summarizes, and stores content from URLs shared in chat."""
|
||||
import re
|
||||
import uuid
|
||||
import logging
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from backend.providers import call_llm
|
||||
from backend.config import ENABLE_LINK_FETCH
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.link_fetcher")
|
||||
|
||||
# Patterns to skip (images, downloads, social media embeds, etc.)
# Matched case-insensitively against the full URL by should_fetch().
SKIP_PATTERNS = [
    # Static image assets — nothing to summarize.
    r"\.(png|jpg|jpeg|gif|svg|webp|ico|bmp)(\?.*)?$",
    # Binary downloads / archives / installers.
    r"\.(zip|tar|gz|rar|7z|exe|msi|dmg|apk|deb)(\?.*)?$",
    # Audio / video files.
    r"\.(mp3|mp4|avi|mov|mkv|wav|flac)(\?.*)?$",
    # Social-media posts: rendered client-side, little fetchable text.
    r"^https?://(www\.)?(twitter|x)\.com/.*/status/",
    r"^https?://(www\.)?instagram\.com/p/",
    r"^https?://(www\.)?tiktok\.com/",
    r"^https?://(www\.)?youtube\.com/shorts/",
    r"^https?://t\.me/",  # Other Telegram links
]

# Compiled once at import time; reused by should_fetch() for every URL.
SKIP_COMPILED = [re.compile(p, re.IGNORECASE) for p in SKIP_PATTERNS]
|
||||
|
||||
|
||||
def extract_urls(text: str) -> list[str]:
    """Return every HTTP/HTTPS URL found in *text*.

    Trailing sentence punctuation is stripped from each match, and very
    short matches (10 chars or fewer) are discarded as noise.
    """
    matches = re.findall(r"https?://[^\s<>\"')\]},;]+", text)
    stripped = (m.rstrip(".,;:!?)") for m in matches)
    return [url for url in stripped if len(url) > 10]
|
||||
|
||||
|
||||
def should_fetch(url: str) -> bool:
    """Return True when *url* matches none of the skip patterns (images, downloads, social embeds)."""
    return not any(pattern.search(url) for pattern in SKIP_COMPILED)
|
||||
|
||||
|
||||
async def fetch_url_content(url: str, timeout: float = 15.0) -> dict | None:
    """
    Fetch a URL and extract main text content.

    All failure modes (timeout, non-200, non-HTML, too little text, parse
    errors) are logged and collapsed to None — callers treat the link as
    simply not ingestible.

    Args:
        url: Absolute HTTP/HTTPS URL to fetch.
        timeout: Total httpx request timeout in seconds.

    Returns:
        {title, text, url} or None if fetch fails
    """
    try:
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
            headers={
                # Identify ourselves; some sites reject requests without a UA.
                "User-Agent": "Mozilla/5.0 (compatible; ThirdEye/1.0; +https://thirdeye.dev)",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            },
        ) as client:
            response = await client.get(url)

            # Strict 200-only: redirects are already followed, anything else
            # (incl. 203/206) is treated as not fetchable.
            if response.status_code != 200:
                logger.info(f"URL returned {response.status_code}: {url[:80]}")
                return None

            content_type = response.headers.get("content-type", "")
            if "text/html" not in content_type and "application/xhtml" not in content_type:
                logger.info(f"Skipping non-HTML content ({content_type}): {url[:80]}")
                return None

            # Read the body while the client is still open.
            html = response.text

    except httpx.TimeoutException:
        logger.info(f"URL timed out: {url[:80]}")
        return None
    except Exception as e:
        logger.info(f"URL fetch failed ({type(e).__name__}): {url[:80]}")
        return None

    # Parse HTML
    try:
        soup = BeautifulSoup(html, "html.parser")

        # Extract title
        title = ""
        if soup.title and soup.title.string:
            title = soup.title.string.strip()

        # Remove script, style, nav, footer, header elements — boilerplate
        # that would pollute the extracted text.
        for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "form"]):
            tag.decompose()

        # Try to find main content area; fall back to the whole document.
        main = soup.find("main") or soup.find("article") or soup.find("div", {"role": "main"})
        if main:
            text = main.get_text(separator="\n", strip=True)
        else:
            text = soup.get_text(separator="\n", strip=True)

        # Clean up: drop blank lines left over after tag removal.
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        text = "\n".join(lines)

        # Skip if too little content — likely a JS-rendered page or a stub.
        if len(text) < 100:
            logger.info(f"Too little text content ({len(text)} chars): {url[:80]}")
            return None

        # Truncate very long content to keep downstream LLM costs bounded.
        if len(text) > 8000:
            text = text[:8000] + "\n\n[Content truncated]"

        return {
            "title": title or url,
            "text": text,
            "url": url,
        }

    except Exception as e:
        logger.warning(f"HTML parsing failed for {url[:80]}: {e}")
        return None
|
||||
|
||||
|
||||
async def summarize_content(title: str, text: str, url: str) -> str:
    """Summarize fetched page content into a few sentences via the fast LLM tier.

    Falls back to the first 200 characters of the raw text when the LLM
    call fails — link ingestion should never hard-fail on summarization.
    """
    system_prompt = """You are a content summarizer for ThirdEye.
Given the title and text of a web page, produce a concise 2-4 sentence summary that captures the key information.
Focus on: main topic, key facts, any actionable insights, any deadlines or decisions mentioned.
Respond with ONLY the summary text, nothing else."""
    # Cap the text sent to the LLM to bound token usage.
    user_prompt = f"Title: {title}\nURL: {url}\n\nContent:\n{text[:3000]}"

    try:
        response = await call_llm(
            "fast_small",
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.2,
            max_tokens=300,
        )
        return response["content"].strip()
    except Exception as e:
        logger.warning(f"Link summarization failed: {e}")
        return text[:200] + "..."
|
||||
|
||||
|
||||
async def process_links_from_message(
    text: str,
    group_id: str,
    shared_by: str = "Unknown",
) -> list[dict]:
    """
    Full pipeline: extract URLs from message → fetch → summarize → produce signals.

    Designed to be called in the background (non-blocking to the main message pipeline).
    Per-URL failures are logged and skipped; the pipeline never raises.

    Args:
        text: Raw chat message text to scan for URLs.
        group_id: Group the resulting signals belong to.
        shared_by: Display name of the user who shared the link.

    Returns:
        List of signal dicts ready for store_signals()
    """
    # Feature flag — link fetching can be disabled entirely via config.
    if not ENABLE_LINK_FETCH:
        return []

    urls = extract_urls(text)
    fetchable = [u for u in urls if should_fetch(u)]

    if not fetchable:
        return []

    signals = []

    # Process up to 3 links per message to avoid overload
    for url in fetchable[:3]:
        try:
            content = await fetch_url_content(url)
            if not content:
                # Unfetchable / non-HTML / too little text — silently skip.
                continue

            summary = await summarize_content(content["title"], content["text"], url)

            # Low-severity "reference" signal: searchable knowledge, not an alert.
            signal = {
                "id": str(uuid.uuid4()),
                "type": "link_knowledge",
                "summary": f"[Link: {content['title'][:80]}] {summary[:200]}",
                "entities": [f"@{shared_by}", url[:100]],
                "severity": "low",
                "status": "reference",
                "sentiment": "neutral",
                "urgency": "none",
                "raw_quote": summary,
                # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
                # consider datetime.now(timezone.utc) file-wide in one change.
                "timestamp": datetime.utcnow().isoformat(),
                "group_id": group_id,
                "lens": "link",
                "keywords": [content["title"][:50], "link", "web", shared_by],
            }
            signals.append(signal)
            logger.info(f"Link ingested: {content['title'][:50]} ({url[:60]})")

        except Exception as e:
            logger.warning(f"Link processing failed for {url[:60]}: {e}")
            continue

    return signals
|
||||
188
thirdeye/backend/agents/meet_cross_ref.py
Normal file
188
thirdeye/backend/agents/meet_cross_ref.py
Normal file
@@ -0,0 +1,188 @@
|
||||
"""
|
||||
Meet Cross-Reference Agent
|
||||
Finds connections between meeting signals and existing Telegram group signals.
|
||||
Surfaces: confirmations (meeting agrees with chat), contradictions (meeting contradicts chat),
|
||||
and blind spots (meeting discusses something chat groups don't know about).
|
||||
"""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.chroma import query_signals, get_all_signals
|
||||
from backend.config import MEET_CROSS_REF_GROUPS, MEET_DEFAULT_GROUP_ID
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.meet_cross_ref")
|
||||
|
||||
CROSS_REF_SYSTEM_PROMPT = """You are an expert at finding connections between meeting discussions and team chat history.
|
||||
|
||||
You will receive:
|
||||
1. MEETING SIGNALS — decisions, action items, blockers, risks from a recent Google Meet
|
||||
2. CHAT SIGNALS — existing signals from team Telegram groups
|
||||
|
||||
Find meaningful connections across three categories:
|
||||
|
||||
CONFIRMATIONS: Meeting agrees with or reinforces something from chat history
|
||||
CONTRADICTIONS: Meeting decision conflicts with what was said/decided in chat
|
||||
BLIND SPOTS: Important things from the meeting that the chat teams don't seem to know about
|
||||
|
||||
Return ONLY a valid JSON object:
|
||||
{
|
||||
"confirmations": [
|
||||
{"meeting_signal": "...", "chat_signal": "...", "group": "...", "significance": "high|medium|low"}
|
||||
],
|
||||
"contradictions": [
|
||||
{"meeting_signal": "...", "chat_signal": "...", "group": "...", "impact": "...", "significance": "high|medium|low"}
|
||||
],
|
||||
"blind_spots": [
|
||||
{"meeting_signal": "...", "teams_unaware": ["group1", "group2"], "recommendation": "..."}
|
||||
]
|
||||
}
|
||||
|
||||
Rules:
|
||||
- Only include HIGH confidence matches — do not stretch for weak connections
|
||||
- Keep each signal description concise (1 sentence max)
|
||||
- significance "high" = this matters for team alignment; "medium" = worth noting; "low" = minor
|
||||
- If a category has nothing meaningful, use an empty array []
|
||||
- Return JSON only"""
|
||||
|
||||
|
||||
async def find_cross_references(
    meeting_id: str,
    group_id: str = None,
    cross_ref_group_ids: list[str] = None,
) -> dict:
    """
    Compare meeting signals against chat group signals.

    Args:
        meeting_id: The meeting to analyze
        group_id: ChromaDB group where meet signals are stored (defaults to MEET_DEFAULT_GROUP_ID)
        cross_ref_group_ids: Groups to compare against (defaults to MEET_CROSS_REF_GROUPS from config)

    Returns:
        Dict with confirmations, contradictions, blind_spots lists; on any
        failure the three lists are empty and an "error" key explains why.
    """
    # Function-scope import keeps agent modules decoupled at import time.
    from backend.agents.json_utils import extract_json_object

    group_id = group_id or MEET_DEFAULT_GROUP_ID
    cross_ref_group_ids = cross_ref_group_ids or MEET_CROSS_REF_GROUPS

    if not cross_ref_group_ids:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": "No cross-reference groups configured. Set MEET_CROSS_REF_GROUPS in .env",
        }

    # 1. Get meeting signals (decisions, actions, blockers, risks — NOT raw chunks)
    meet_signals = query_signals(group_id, meeting_id, n_results=30)
    structured_meet = [
        s for s in meet_signals
        if s.get("metadata", {}).get("type") in ("meet_decision", "meet_action_item", "meet_blocker", "meet_risk", "meet_open_q")
    ]

    if not structured_meet:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": f"No structured signals found for meeting {meeting_id}. Has it been processed yet?",
        }

    # 2. Get signals from each cross-reference group
    chat_context_parts = []
    for gid in cross_ref_group_ids:
        try:
            all_sig = get_all_signals(gid)
            if all_sig:
                formatted = "\n".join([
                    f"  [{s.get('metadata', {}).get('type', '?')}] {s.get('document', '')[:120]}"
                    for s in all_sig[:20]  # Cap at 20 per group to stay within token limits
                ])
                chat_context_parts.append(f"Group '{gid}':\n{formatted}")
        except Exception as e:
            logger.warning(f"Could not load signals for group {gid}: {e}")

    if not chat_context_parts:
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": "Could not load any signals from cross-reference groups.",
        }

    # 3. Format inputs for LLM
    meet_text = "\n".join([
        f"  [{s.get('metadata', {}).get('type', '?')}] {s.get('document', '')[:150]}" for s in structured_meet
    ])
    chat_text = "\n\n".join(chat_context_parts)

    prompt = f"""MEETING SIGNALS (from meeting: {meeting_id}):
{meet_text}

CHAT SIGNALS (from monitored Telegram groups):
{chat_text}"""

    try:
        result = await call_llm(
            task_type="reasoning",
            messages=[
                {"role": "system", "content": CROSS_REF_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=0.2,
            max_tokens=1500,
            response_format={"type": "json_object"},
        )
        # Use the shared robust parser (same one classifier.py uses) instead of
        # ad-hoc split-on-``` stripping: handles fences, wrapper prose, and
        # guarantees a dict result (raises otherwise, which the except below
        # turns into a structured error response).
        return extract_json_object(result["content"])

    except Exception as e:
        logger.error(f"Cross-reference LLM call failed: {e}")
        return {
            "confirmations": [],
            "contradictions": [],
            "blind_spots": [],
            "error": str(e),
        }
|
||||
|
||||
|
||||
def format_cross_ref_for_telegram(analysis: dict, meeting_id: str) -> str:
    """Format cross-reference results as a Telegram (Markdown) message.

    Defensive against partially-formed LLM output: every per-item field is
    read with .get() so a missing key degrades to an empty string instead of
    raising KeyError mid-format (the original mixed .get() with direct
    indexing and could crash on LLM output missing a field).
    """
    parts = [f"🔗 *Meet ↔ Chat Cross-Reference*\nMeeting: `{meeting_id}`\n"]

    if analysis.get("error"):
        return f"⚠️ Cross-reference failed: {analysis['error']}"

    confirmations = analysis.get("confirmations", [])
    contradictions = analysis.get("contradictions", [])
    blind_spots = analysis.get("blind_spots", [])

    if not confirmations and not contradictions and not blind_spots:
        return f"🔗 *Meet ↔ Chat Cross-Reference*\nMeeting `{meeting_id}`: No significant connections found between this meeting and your chat groups."

    if confirmations:
        parts.append(f"✅ *Confirmations* ({len(confirmations)})")
        for c in confirmations[:3]:  # Cap at 3 for readability
            sig = "🔴" if c.get("significance") == "high" else "🟡"
            parts.append(f"{sig} Meeting: _{c.get('meeting_signal', '')[:100]}_")
            parts.append(f"  Matches [{c.get('group', '?')}]: _{c.get('chat_signal', '')[:100]}_\n")

    if contradictions:
        parts.append(f"⚡ *Contradictions* ({len(contradictions)}) — ACTION NEEDED")
        for c in contradictions[:3]:
            parts.append(f"🔴 Meeting decided: _{c.get('meeting_signal', '')[:100]}_")
            parts.append(f"  BUT [{c.get('group', '?')}] says: _{c.get('chat_signal', '')[:100]}_")
            if c.get("impact"):
                parts.append(f"  Impact: {c['impact'][:100]}\n")

    if blind_spots:
        parts.append(f"🔦 *Blind Spots* ({len(blind_spots)}) — Teams may not know")
        for b in blind_spots[:3]:
            parts.append(f"🟠 {b.get('meeting_signal', '')[:120]}")
            if b.get("recommendation"):
                parts.append(f"  → {b['recommendation'][:100]}\n")

    return "\n".join(parts)
|
||||
342
thirdeye/backend/agents/meet_ingestor.py
Normal file
342
thirdeye/backend/agents/meet_ingestor.py
Normal file
@@ -0,0 +1,342 @@
|
||||
"""
|
||||
Meet Ingestor Agent
|
||||
Processes raw Google Meet transcript chunks and extracts structured signals.
|
||||
|
||||
Signal types produced:
|
||||
meet_decision — A decision made during the meeting
|
||||
meet_action_item — A task assigned to someone
|
||||
meet_blocker — A blocker or dependency raised
|
||||
meet_risk — A risk or concern identified
|
||||
meet_open_q — An unresolved question left open
|
||||
meet_summary — Full meeting summary (emitted on is_final=True)
|
||||
meet_chunk_raw — Raw transcript chunk (always stored, for full-text search)
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
from backend.providers import call_llm
|
||||
from backend.db.chroma import store_signals
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.meet_ingestor")
|
||||
|
||||
|
||||
# ─── Extraction prompt ───────────────────────────────────────────────────────
|
||||
|
||||
EXTRACTION_SYSTEM_PROMPT = """You are an expert meeting analyst. You receive raw transcript chunks from a Google Meet recording and extract structured signals.
|
||||
|
||||
Extract ONLY signals that are clearly present. Do NOT hallucinate or infer beyond what is stated.
|
||||
|
||||
Return ONLY a valid JSON object with this exact structure:
|
||||
{
|
||||
"decisions": [
|
||||
{"text": "...", "owner": "@name or null", "confidence": "high|medium|low"}
|
||||
],
|
||||
"action_items": [
|
||||
{"text": "...", "owner": "@name or null", "due": "date string or null", "confidence": "high|medium|low"}
|
||||
],
|
||||
"blockers": [
|
||||
{"text": "...", "blocking_what": "...", "confidence": "high|medium|low"}
|
||||
],
|
||||
"risks": [
|
||||
{"text": "...", "severity": "high|medium|low", "confidence": "high|medium|low"}
|
||||
],
|
||||
"open_questions": [
|
||||
{"text": "...", "confidence": "high|medium|low"}
|
||||
]
|
||||
}
|
||||
|
||||
Rules:
|
||||
- If a category has nothing, use an empty array []
|
||||
- owner must start with @ if it's a person's name (e.g. "@Alex")
|
||||
- text must be a clear, standalone sentence — not a fragment
|
||||
- Only include confidence "high" if the signal is unambiguous
|
||||
- Do NOT reproduce filler words, pleasantries, or off-topic banter
|
||||
- Return JSON only — no markdown, no preamble, no explanation"""
|
||||
|
||||
|
||||
SUMMARY_SYSTEM_PROMPT = """You are a meeting intelligence expert. Given a full meeting transcript (possibly from multiple chunks), write a concise but complete meeting summary.
|
||||
|
||||
Structure your summary as:
|
||||
1. One-sentence overview (what was the meeting about)
|
||||
2. Key decisions made (bullet points, max 5)
|
||||
3. Action items assigned (who does what by when)
|
||||
4. Blockers or risks raised
|
||||
5. Open questions still unresolved
|
||||
|
||||
Keep the summary under 400 words. Be specific. Use names when available. Do NOT use filler phrases like "the team discussed" — just state what was decided/agreed/assigned."""
|
||||
|
||||
|
||||
# ─── Signal builder ─────────────────────────────────────────────────────────
|
||||
|
||||
def _build_signal(
|
||||
signal_type: str,
|
||||
summary: str,
|
||||
raw_quote: str,
|
||||
severity: str,
|
||||
entities: list[str],
|
||||
keywords: list[str],
|
||||
timestamp: str,
|
||||
group_id: str,
|
||||
meeting_id: str,
|
||||
urgency: str = "none",
|
||||
status: str = "open",
|
||||
) -> dict:
|
||||
return {
|
||||
"id": str(uuid.uuid4()),
|
||||
"type": signal_type,
|
||||
"summary": summary,
|
||||
"raw_quote": raw_quote[:500] if raw_quote else "",
|
||||
"severity": severity,
|
||||
"status": status,
|
||||
"sentiment": "neutral",
|
||||
"urgency": urgency,
|
||||
"entities": entities,
|
||||
"keywords": keywords,
|
||||
"timestamp": timestamp,
|
||||
"group_id": group_id,
|
||||
"lens": "meet",
|
||||
"meeting_id": meeting_id,
|
||||
}
|
||||
|
||||
|
||||
def _extract_entities(text: str, owner: str = None) -> list[str]:
|
||||
"""Extract entity strings from text (names starting with @)."""
|
||||
import re
|
||||
entities = re.findall(r"@[\w]+", text)
|
||||
if owner and owner.startswith("@"):
|
||||
entities.append(owner)
|
||||
return list(set(entities))
|
||||
|
||||
|
||||
def _extract_keywords(text: str) -> list[str]:
|
||||
"""Simple keyword extraction: lowercase meaningful words."""
|
||||
stopwords = {"the", "a", "an", "is", "are", "was", "were", "will", "to", "of",
|
||||
"in", "on", "at", "for", "by", "with", "this", "that", "and", "or",
|
||||
"but", "we", "i", "it", "be", "do", "have", "has", "had", "not"}
|
||||
words = text.lower().split()
|
||||
keywords = [w.strip(".,!?;:\"'") for w in words if len(w) > 3 and w not in stopwords]
|
||||
return list(dict.fromkeys(keywords))[:10] # deduplicate, keep first 10
|
||||
|
||||
|
||||
# ─── Main processing function ────────────────────────────────────────────────
|
||||
|
||||
async def process_meet_chunk(
    meeting_id: str,
    group_id: str,
    chunk_index: int,
    text: str,
    speaker: str,
    timestamp: str,
    is_final: bool,
) -> list[dict]:
    """
    Full pipeline for a transcript chunk:
    1. Always store raw chunk for full-text search
    2. Extract structured signals via LLM
    3. If is_final, generate a full meeting summary

    LLM extraction failure is non-fatal: the raw chunk is still stored.

    Args:
        meeting_id: Identifier grouping all chunks of one meeting.
        group_id: ChromaDB group the signals are stored under.
        chunk_index: Ordinal position of this chunk within the meeting.
        text: Raw transcript text of the chunk.
        speaker: Speaker name for the chunk ("Unknown" when unattributed).
        timestamp: ISO timestamp attached to every signal built here.
        is_final: True on the last chunk — triggers summary generation.

    Returns:
        The list of signal dicts that were stored.
    """
    logger.info(f"Processing meet chunk {chunk_index} for meeting {meeting_id} ({len(text)} chars)")
    signals_to_store = []

    # 1. Always store the raw chunk (enables full-text similarity search later)
    raw_signal = _build_signal(
        signal_type="meet_chunk_raw",
        summary=f"[{meeting_id}] Chunk {chunk_index}: {text[:120]}...",
        raw_quote=text,
        severity="low",
        entities=[f"@{speaker}"] if speaker and speaker != "Unknown" else [],
        keywords=_extract_keywords(text),
        timestamp=timestamp,
        group_id=group_id,
        meeting_id=meeting_id,
    )
    signals_to_store.append(raw_signal)

    # 2. Extract structured signals via LLM
    try:
        result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": EXTRACTION_SYSTEM_PROMPT},
                {"role": "user", "content": f"Transcript chunk from speaker '{speaker}':\n\n{text}"},
            ],
            temperature=0.1,
            max_tokens=1500,
            response_format={"type": "json_object"},
        )

        raw_json = result["content"].strip()
        # Strip markdown code fences if present
        if raw_json.startswith("```"):
            raw_json = raw_json.split("```")[1]
            if raw_json.startswith("json"):
                raw_json = raw_json[4:]
        extracted = json.loads(raw_json)

    except Exception as e:
        # Extraction is best-effort; fall through with no structured signals.
        logger.warning(f"Meet extraction LLM failed for chunk {chunk_index}: {e}")
        extracted = {}

    # Decisions — only high/medium confidence are kept.
    for item in extracted.get("decisions", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_decision",
                summary=item["text"],
                raw_quote=item["text"],
                severity="medium",
                entities=_extract_entities(item["text"], item.get("owner")),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                status="decided",
            ))

    # Action items — a due date bumps urgency from low to medium.
    for item in extracted.get("action_items", []):
        if item.get("confidence") in ("high", "medium"):
            due_str = f" Due: {item['due']}." if item.get("due") else ""
            signals_to_store.append(_build_signal(
                signal_type="meet_action_item",
                summary=f"{item['text']}{due_str}",
                raw_quote=item["text"],
                severity="medium",
                entities=_extract_entities(item["text"], item.get("owner")),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                urgency="medium" if item.get("due") else "low",
                status="open",
            ))

    # Blockers — always high severity/urgency.
    for item in extracted.get("blockers", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_blocker",
                summary=item["text"],
                raw_quote=item["text"],
                severity="high",
                entities=_extract_entities(item["text"]),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                urgency="high",
                status="open",
            ))

    # Risks
    # NOTE(review): unlike the other categories, risks are NOT filtered by
    # confidence — low-confidence risks are stored too. Confirm this is
    # intentional (erring on the side of surfacing risks).
    for item in extracted.get("risks", []):
        signals_to_store.append(_build_signal(
            signal_type="meet_risk",
            summary=item["text"],
            raw_quote=item["text"],
            severity=item.get("severity", "medium"),
            entities=_extract_entities(item["text"]),
            keywords=_extract_keywords(item["text"]),
            timestamp=timestamp,
            group_id=group_id,
            meeting_id=meeting_id,
            urgency="medium",
            status="open",
        ))

    # Open questions — only high/medium confidence are kept.
    for item in extracted.get("open_questions", []):
        if item.get("confidence") in ("high", "medium"):
            signals_to_store.append(_build_signal(
                signal_type="meet_open_q",
                summary=item["text"],
                raw_quote=item["text"],
                severity="low",
                entities=_extract_entities(item["text"]),
                keywords=_extract_keywords(item["text"]),
                timestamp=timestamp,
                group_id=group_id,
                meeting_id=meeting_id,
                status="open",
            ))

    # 3. If this is the final chunk, generate a meeting summary
    if is_final:
        summary_signal = await _generate_meeting_summary(
            meeting_id, group_id, text, speaker, timestamp
        )
        if summary_signal:
            signals_to_store.append(summary_signal)

    # Store everything
    if signals_to_store:
        store_signals(group_id, signals_to_store)
        logger.info(
            f"Stored {len(signals_to_store)} signals for meeting {meeting_id} chunk {chunk_index}"
        )

    return signals_to_store
|
||||
|
||||
|
||||
async def _generate_meeting_summary(
    meeting_id: str,
    group_id: str,
    final_chunk_text: str,
    speaker: str,
    timestamp: str,
) -> dict | None:
    """Summarize a finished meeting from its stored raw chunks.

    Retrieves every raw transcript chunk for the meeting from ChromaDB and
    asks the LLM for a summary. If retrieval fails or yields nothing, only
    the final chunk is summarized. Returns a meet_summary signal dict, or
    None when the LLM call fails.
    """
    from backend.db.chroma import query_signals

    # Reassemble the meeting transcript from stored raw chunks; any failure
    # degrades gracefully to summarizing just the final chunk.
    full_transcript = final_chunk_text
    try:
        chunks = query_signals(
            group_id,
            meeting_id,
            n_results=50,
            signal_type="meet_chunk_raw",
        )
        parts = [
            c.get("metadata", {}).get("raw_quote", "") or c.get("document", "")
            for c in chunks
        ]
        joined = "\n\n".join(parts)
        if joined.strip():
            full_transcript = joined
    except Exception:
        pass

    try:
        result = await call_llm(
            task_type="fast_large",
            messages=[
                {"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
                {
                    "role": "user",
                    # Cap transcript length to stay within the model context.
                    "content": f"Meeting ID: {meeting_id}\n\nFull transcript:\n\n{full_transcript[:6000]}",
                },
            ],
            temperature=0.3,
            max_tokens=600,
        )
        summary_text = result["content"].strip()
    except Exception as e:
        # Summary generation is best-effort; callers treat None as "no summary".
        logger.warning(f"Meeting summary generation failed: {e}")
        return None

    return _build_signal(
        signal_type="meet_summary",
        summary=summary_text,
        raw_quote=full_transcript[:500],
        severity="medium",
        entities=[f"@{speaker}"] if speaker and speaker != "Unknown" else [],
        keywords=_extract_keywords(summary_text),
        timestamp=timestamp,
        group_id=group_id,
        meeting_id=meeting_id,
        status="completed",
    )
|
||||
114
thirdeye/backend/agents/pattern_detector.py
Normal file
114
thirdeye/backend/agents/pattern_detector.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""Pattern Detector Agent — finds trends and anomalies in accumulated signals."""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.chroma import get_all_signals
|
||||
from backend.db.models import Pattern
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.pattern_detector")
|
||||
|
||||
SYSTEM_PROMPT = """You are the Pattern Detector for ThirdEye. You analyze accumulated signals to find patterns and anomalies.
|
||||
|
||||
Detect these pattern types:
|
||||
- frequency_spike: A signal type mentioned significantly more than usual
|
||||
- knowledge_silo: Only one person discusses a critical topic (bus factor = 1)
|
||||
- recurring_issue: Same bug/problem appearing repeatedly
|
||||
- sentiment_trend: Gradual shift in tone over time
|
||||
- stale_item: Decisions proposed but never resolved, promises with no follow-up
|
||||
|
||||
Respond ONLY with valid JSON (no markdown, no backticks):
|
||||
{"patterns": [{"type": "pattern_type", "description": "Clear human-readable description", "severity": "info|warning|critical", "evidence_ids": [], "recommendation": "Suggested action"}]}
|
||||
|
||||
If no patterns found: {"patterns": []}
|
||||
Only report patterns that are genuinely concerning. Do NOT manufacture patterns from insufficient data."""
|
||||
|
||||
|
||||
def _heuristic_detect_patterns(group_id: str, all_signals: list[dict]) -> list[Pattern]:
    """Generate conservative patterns from signal metadata when LLM output is unavailable.

    Counts signal types and entity mentions across all stored signals, then
    emits at most five Pattern objects: one recurring_issue per problem-type
    signal seen at least twice, plus a knowledge_silo hint when payment/Stripe
    entities recur. Purely heuristic — no LLM call.
    """
    from collections import Counter  # stdlib counting beats hand-rolled dict.get(...)+1

    type_counts: Counter[str] = Counter()
    entity_counts: Counter[str] = Counter()

    for s in all_signals:
        meta = s.get("metadata", {})
        type_counts[str(meta.get("type", "unknown"))] += 1

        # entities may be stored as a single string or a list of strings.
        entities = meta.get("entities", [])
        if isinstance(entities, str):
            entities = [entities]
        if isinstance(entities, list):
            for ent in entities:
                ent_key = str(ent).strip()
                if ent_key:
                    entity_counts[ent_key] += 1

    patterns: list[Pattern] = []

    # (a) Problem-category signal types seen at least twice → recurring issue.
    recurring_types = [t for t, c in type_counts.items() if c >= 2 and t in {"recurring_bug", "workaround", "tech_debt"}]
    for signal_type in recurring_types:
        patterns.append(Pattern(
            group_id=group_id,
            type="recurring_issue",
            description=f"Signal type '{signal_type}' has appeared repeatedly ({type_counts[signal_type]} times).",
            severity="warning",
            recommendation="Create a dedicated action item with owner and due date to stop repeated recurrence.",
        ))

    # (b) Repeated payment/Stripe entity mentions hint at a bus-factor-1 silo.
    silo_entities = [ent for ent, c in entity_counts.items() if c >= 2]
    if any("stripe" in ent.lower() or "payment" in ent.lower() for ent in silo_entities):
        patterns.append(Pattern(
            group_id=group_id,
            type="knowledge_silo",
            description="Critical payment-related topics are concentrated in repeated mentions, suggesting low bus factor.",
            severity="warning",
            recommendation="Document payment workflows and assign at least one backup owner.",
        ))

    # Stay conservative: never report more than five heuristic patterns.
    return patterns[:5]
|
||||
|
||||
|
||||
async def detect_patterns(group_id: str) -> list[Pattern]:
    """Analyze all signals in a group and detect patterns.

    Renders every stored signal into a compact text line, asks the reasoning
    model for patterns, and parses the JSON reply into Pattern objects. On
    any LLM/parse failure, falls back to the conservative metadata heuristic.
    """
    all_signals = get_all_signals(group_id)

    # Too little data produces noise, not patterns — bail out early.
    if len(all_signals) < 3:
        logger.info(f"Not enough signals ({len(all_signals)}) for pattern detection in {group_id}")
        return []

    # One compact line per signal so the prompt stays small and scannable.
    signals_text = "\n".join(
        f"- [{s['metadata'].get('type', '?')}] {s['document'][:100]} "
        f"(severity={s['metadata'].get('severity', '?')}, entities={s['metadata'].get('entities', '[]')}, "
        f"time={s['metadata'].get('timestamp', '?')})"
        for s in all_signals
    )

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Analyze these {len(all_signals)} signals from group '{group_id}':\n\n{signals_text}"},
    ]

    try:
        result = await call_llm("reasoning", messages, temperature=0.2, max_tokens=1500)
        parsed = extract_json_object(result.get("content", ""))

        patterns = [
            Pattern(
                group_id=group_id,
                type=p.get("type", "unknown"),
                description=p.get("description", ""),
                severity=p.get("severity", "info"),
                recommendation=p.get("recommendation", ""),
            )
            for p in parsed.get("patterns", [])
        ]
        logger.info(f"Detected {len(patterns)} patterns in {group_id}")
        return patterns

    except Exception as e:
        # LLM/parse failure is expected occasionally — fall back to heuristics.
        logger.info(f"Pattern detection LLM parse issue, using fallback: {e}")
        fallback = _heuristic_detect_patterns(group_id, all_signals)
        if fallback:
            logger.info(f"Pattern heuristic fallback produced {len(fallback)} patterns in {group_id}")
        return fallback
|
||||
68
thirdeye/backend/agents/query_agent.py
Normal file
68
thirdeye/backend/agents/query_agent.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""
|
||||
Query Agent — voice-aware signal context formatting for ThirdEye.
|
||||
|
||||
Provides _format_signal_for_context() which labels each ChromaDB signal with
|
||||
its true origin (voice note, document, meeting, chat) so the LLM can produce
|
||||
properly attributed answers like:
|
||||
"Based on what @Raj said in a voice note on Mar 14 (45s), the team decided..."
|
||||
"""
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
VOICE_CITATION_INSTRUCTION = """
|
||||
When context includes [VOICE NOTE — @name on Date (Xs)] signals, ALWAYS cite the voice note explicitly.
|
||||
Example: "Based on what @Raj said in a voice note on Mar 14 (45s), the team decided to use PostgreSQL."
|
||||
Never flatten voice signals into generic "the team discussed" language. Always name the speaker and source.
|
||||
"""
|
||||
|
||||
|
||||
def _format_signal_for_context(signal: dict) -> str:
|
||||
"""
|
||||
Format a ChromaDB signal as a context snippet for the Query Agent LLM.
|
||||
Voice-sourced signals get explicit attribution so the LLM cites them correctly.
|
||||
Accepts both flat signal dicts and dicts with a nested 'metadata' key.
|
||||
"""
|
||||
# Support both flat dicts and ChromaDB-style {"metadata": {...}, "document": ...}
|
||||
meta = signal.get("metadata", signal)
|
||||
|
||||
source = meta.get("source", signal.get("source", "chat"))
|
||||
sig_type = meta.get("type", signal.get("type", "unknown"))
|
||||
summary = meta.get("summary", signal.get("summary", ""))
|
||||
timestamp = meta.get("timestamp", signal.get("timestamp", ""))
|
||||
|
||||
date_str = ""
|
||||
if timestamp:
|
||||
try:
|
||||
dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
|
||||
date_str = dt.strftime("%b %d")
|
||||
except Exception:
|
||||
date_str = timestamp[:10]
|
||||
|
||||
if source == "voice":
|
||||
speaker = meta.get("speaker", signal.get("speaker", "Unknown"))
|
||||
duration = meta.get("voice_duration", signal.get("voice_duration", 0))
|
||||
duration_str = f"{duration}s" if duration else "?"
|
||||
return (
|
||||
f"[VOICE NOTE — @{speaker} on {date_str} ({duration_str})] "
|
||||
f"[{sig_type}] {summary}"
|
||||
)
|
||||
|
||||
if source == "document":
|
||||
return f"[DOCUMENT — {date_str}] [{sig_type}] {summary}"
|
||||
|
||||
if source == "link":
|
||||
return f"[WEB LINK — {date_str}] [{sig_type}] {summary}"
|
||||
|
||||
if sig_type in ("meet_decision", "meet_action_item", "meet_blocker", "meet_summary"):
|
||||
meeting_id = meta.get("meeting_id", signal.get("meeting_id", ""))
|
||||
return f"[MEETING {meeting_id} — {date_str}] [{sig_type}] {summary}"
|
||||
|
||||
entities_raw = meta.get("entities", signal.get("entities", []))
|
||||
if isinstance(entities_raw, str):
|
||||
import json
|
||||
try:
|
||||
entities_raw = json.loads(entities_raw)
|
||||
except Exception:
|
||||
entities_raw = []
|
||||
sender_str = entities_raw[0] if entities_raw else ""
|
||||
return f"[CHAT — {sender_str} on {date_str}] [{sig_type}] {summary}"
|
||||
128
thirdeye/backend/agents/signal_extractor.py
Normal file
128
thirdeye/backend/agents/signal_extractor.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""Signal Extractor Agent — extracts structured signals from chat messages."""
|
||||
import logging
|
||||
from backend.providers import call_llm
|
||||
from backend.db.models import Signal
|
||||
from datetime import datetime
|
||||
from backend.agents.json_utils import extract_json_object
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.signal_extractor")
|
||||
|
||||
# Lens-specific system prompts
|
||||
LENS_PROMPTS = {
|
||||
"dev": """You are the Signal Extractor for ThirdEye operating in DevLens mode.
|
||||
You analyze batches of developer team chat messages and extract STRUCTURED SIGNALS.
|
||||
|
||||
Extract ONLY signals that represent meaningful technical information. Skip greetings, small talk, emoji reactions, and meta-conversation.
|
||||
|
||||
Signal types to look for:
|
||||
- architecture_decision: Technology choices, design decisions with rationale
|
||||
- tech_debt: Shortcuts, hardcoded values, "will fix later" patterns
|
||||
- knowledge_silo_evidence: Only one person discusses a critical topic
|
||||
- recurring_bug: Same issue mentioned repeatedly
|
||||
- stack_decision: Technology/framework choices (proposed or decided)
|
||||
- deployment_risk: Risky deployment practices
|
||||
- workaround: Temporary fixes being applied repeatedly
|
||||
- delivery_commitment: A team member gives an explicit or estimated timeline/ETA ("will take 2 days", "done by Friday", "approx 3 hours")
|
||||
|
||||
Pay SPECIAL attention to delivery commitments — capture the person's name, the work item, and the exact time estimate.
|
||||
For EACH signal found, include it in the JSON array. If NO meaningful signals exist, return empty array.
|
||||
Be SELECTIVE. Quality over quantity.""",
|
||||
|
||||
"product": """You are the Signal Extractor for ThirdEye operating in ProductLens mode.
|
||||
|
||||
Signal types to look for:
|
||||
- feature_request: Features users or team members are asking for
|
||||
- delivery_commitment: A team member gives an explicit or estimated timeline/ETA ("will take 2 days", "done by Friday", "approx 3 hours")
|
||||
- user_pain_point: User difficulties, complaints, confusion
|
||||
- roadmap_drift: Discussion of topics not on the current plan
|
||||
- priority_conflict: Team members disagreeing on what's most important
|
||||
- metric_mention: Specific numbers, conversion rates, performance data
|
||||
- user_quote: Direct quotes from users/customers
|
||||
- competitor_intel: Mentions of competitor actions or features
|
||||
|
||||
Pay SPECIAL attention to delivery commitments — capture the person's name, the work item, and the exact time estimate.
|
||||
Be SELECTIVE. Quality over quantity.""",
|
||||
|
||||
"client": """You are the Signal Extractor for ThirdEye operating in ClientLens mode.
|
||||
|
||||
Signal types to look for:
|
||||
- promise: Commitments made with deadlines (explicit or implicit)
|
||||
- scope_creep: Additional requests introduced casually without formal change requests
|
||||
- sentiment_signal: Tone changes (positive praise, growing frustration, formality shifts)
|
||||
- unanswered_request: Questions or requests that haven't received responses
|
||||
- satisfaction: Explicit positive or negative feedback
|
||||
- escalation_risk: Mentions of involving management, expressing deadline concerns
|
||||
- client_decision: Decisions made by the client
|
||||
|
||||
Pay SPECIAL attention to implicit deadlines ("by end of week", "before the meeting").
|
||||
Be SELECTIVE. Quality over quantity.""",
|
||||
|
||||
"community": """You are the Signal Extractor for ThirdEye operating in CommunityLens mode.
|
||||
|
||||
Signal types: recommendation, event, issue, local_knowledge, question
|
||||
Be SELECTIVE. Quality over quantity.""",
|
||||
}
|
||||
|
||||
EXTRACTION_FORMAT = """
|
||||
Respond ONLY with valid JSON in this exact format (no markdown, no backticks, no explanation):
|
||||
{"signals": [{"type": "signal_type_here", "summary": "One clear sentence that includes specific names, numbers, timelines, and commitments", "entities": ["@person", "technology"], "severity": "low|medium|high|critical", "status": "proposed|decided|implemented|unresolved", "raw_quote": "Exact verbatim sentence(s) from the message that capture the full claim, including names, numbers, and timelines", "message_index": 0}]}
|
||||
|
||||
IMPORTANT for raw_quote: copy the FULL relevant sentence from the message, not just a topic keyword.
|
||||
Example — message "Anirban: feature page revamp will take approx 2 more days"
|
||||
WRONG raw_quote: "feature page revamp"
|
||||
CORRECT raw_quote: "feature page revamp will take approx 2 more days"
|
||||
|
||||
If no signals found: {"signals": []}
|
||||
"""
|
||||
|
||||
|
||||
async def extract_signals(messages_text: str, group_id: str, lens: str = "dev") -> list[Signal]:
    """
    Extract structured signals from a batch of formatted chat messages.

    Args:
        messages_text: Formatted string like "[Alex]: Let's use Redis\\n[Bob]: Agreed"
        group_id: Telegram group ID
        lens: Active lens mode (dev, product, client, community)

    Returns:
        List of Signal objects; empty on any LLM or parse failure.
    """
    # Unknown lens falls back to the dev prompt.
    system_prompt = LENS_PROMPTS.get(lens, LENS_PROMPTS["dev"])

    llm_messages = [
        {"role": "system", "content": system_prompt + "\n\n" + EXTRACTION_FORMAT},
        {"role": "user", "content": f"Extract signals from these messages:\n\n{messages_text}"},
    ]

    try:
        result = await call_llm("fast_large", llm_messages, temperature=0.2, max_tokens=2000)
        parsed = extract_json_object(result.get("content", ""))

        signals: list[Signal] = []
        # Build Signal objects one at a time so a single malformed entry
        # from the LLM cannot discard the whole batch.
        for raw in parsed.get("signals", []):
            try:
                signals.append(Signal(
                    group_id=group_id,
                    lens=lens,
                    type=raw.get("type", "unknown"),
                    summary=raw.get("summary", ""),
                    entities=raw.get("entities", []),
                    severity=raw.get("severity", "low"),
                    status=raw.get("status", "unknown"),
                    raw_quote=raw.get("raw_quote", ""),
                    timestamp=datetime.utcnow().isoformat(),
                ))
            except Exception as e:
                logger.warning(f"Failed to parse signal: {e}")

        logger.info(f"Extracted {len(signals)} signals from {group_id} (lens={lens}) via {result['provider']}")
        return signals

    except Exception as e:
        logger.error(f"Signal extraction failed: {e}")
        return []
|
||||
281
thirdeye/backend/agents/voice_handler.py
Normal file
281
thirdeye/backend/agents/voice_handler.py
Normal file
@@ -0,0 +1,281 @@
|
||||
"""
|
||||
Voice Handler
|
||||
Orchestrates the full pipeline for Telegram voice messages and video notes:
|
||||
|
||||
Telegram voice/video_note message
|
||||
-> download audio bytes
|
||||
-> transcribe via Groq Whisper (voice_transcriber.py)
|
||||
-> build a voice_transcript signal (stored raw for full-text search)
|
||||
-> run transcript through process_message_batch (signal extraction)
|
||||
-> all extracted signals carry voice attribution metadata
|
||||
|
||||
Voice metadata attached to every extracted signal:
|
||||
source: "voice"
|
||||
voice_file_id: Telegram file ID
|
||||
voice_duration: seconds
|
||||
speaker: sender display name
|
||||
"""
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from backend.agents.voice_transcriber import (
|
||||
transcribe_audio, download_telegram_audio, format_duration
|
||||
)
|
||||
from backend.config import ENABLE_VOICE_TRANSCRIPTION, VOICE_STORE_TRANSCRIPT
|
||||
from backend.db.chroma import store_signals
|
||||
from backend.pipeline import process_message_batch
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.voice_handler")
|
||||
|
||||
|
||||
# --- Voice transcript signal builder -----------------------------------------
|
||||
|
||||
def build_voice_transcript_signal(
    transcript: str,
    sender: str,
    group_id: str,
    voice_file_id: str,
    duration_seconds: int,
    language: str,
    timestamp: str,
) -> dict:
    """Create the raw voice_transcript signal holding the full transcription.

    Stored alongside any extracted signals so the complete transcript stays
    searchable in ChromaDB even when the extractor finds nothing structured.
    """
    # Identity / routing fields.
    identity = {
        "id": str(uuid.uuid4()),
        "type": "voice_transcript",
        "group_id": group_id,
        "lens": "voice",
        "source": "voice",
        "timestamp": timestamp,
    }
    # Searchable content: summary headline plus the verbatim transcript.
    content = {
        "summary": f"[Voice {format_duration(duration_seconds)}] @{sender}: {transcript[:200]}",
        "raw_quote": transcript,
        "entities": [f"@{sender}"],
        "keywords": _extract_voice_keywords(transcript),
    }
    # Fixed classification defaults for raw transcripts.
    classification = {
        "severity": "low",
        "status": "transcribed",
        "sentiment": "neutral",
        "urgency": "none",
    }
    # Voice attribution consumed by /ask citations.
    voice_attrs = {
        "voice_file_id": voice_file_id,
        "voice_duration": duration_seconds,
        "voice_language": language,
        "speaker": sender,
    }
    return {**identity, **content, **classification, **voice_attrs}
|
||||
|
||||
|
||||
def _extract_voice_keywords(text: str) -> list[str]:
|
||||
"""Simple keyword extraction from transcript text."""
|
||||
stopwords = {
|
||||
"the", "a", "an", "is", "are", "was", "were", "will", "to", "of",
|
||||
"in", "on", "at", "for", "by", "with", "this", "that", "and", "or",
|
||||
"but", "we", "i", "it", "be", "do", "have", "has", "had", "not",
|
||||
"so", "just", "like", "yeah", "okay", "um", "uh", "you", "me",
|
||||
}
|
||||
words = text.lower().split()
|
||||
keywords = [w.strip(".,!?;:\"'") for w in words if len(w) > 3 and w not in stopwords]
|
||||
return list(dict.fromkeys(keywords))[:12]
|
||||
|
||||
|
||||
def _inject_voice_metadata(signals: list, voice_meta: dict) -> list[dict]:
|
||||
"""
|
||||
Inject voice attribution into every signal extracted from a voice transcript.
|
||||
Accepts both Signal Pydantic model objects and plain dicts.
|
||||
This ensures /ask can cite the voice source in its answers.
|
||||
"""
|
||||
result = []
|
||||
for signal in signals:
|
||||
sig = signal.model_dump() if hasattr(signal, "model_dump") else dict(signal)
|
||||
sig["source"] = "voice"
|
||||
sig["voice_file_id"] = voice_meta.get("voice_file_id", "")
|
||||
sig["voice_duration"] = voice_meta.get("duration_seconds", 0)
|
||||
sig["voice_language"] = voice_meta.get("language", "")
|
||||
sig["speaker"] = voice_meta.get("sender", "Unknown")
|
||||
if "[Voice]" not in sig.get("summary", ""):
|
||||
sig["summary"] = f"[Voice @{voice_meta.get('sender', '?')}] {sig['summary']}"
|
||||
result.append(sig)
|
||||
return result
|
||||
|
||||
|
||||
# --- Fallback signal builder -------------------------------------------------
|
||||
|
||||
# Keywords that hint at a signal type when the LLM extraction returns nothing
|
||||
# Maps fallback signal type -> set of lowercase trigger words. Scored by
# set-intersection size against the transcript's word set in
# _build_fallback_signal; the highest-scoring type wins.
_FALLBACK_TYPE_HINTS = {
    # Requests for new capability or UI/UX changes.
    "feature_request": {
        "need", "needs", "required", "require", "want", "should", "missing",
        "add", "feature", "ui", "ux", "design", "change", "changes", "update",
        "improve", "improvement", "responsiveness", "responsive",
    },
    # Work that cannot proceed.
    "blocker": {
        "blocked", "blocking", "blocker", "stuck", "waiting", "can't", "cannot",
        "issue", "problem", "broken", "fails", "failing",
    },
    # Stated intentions / tasks to do.
    "action_item": {
        "will", "going", "plan", "todo", "do", "fix", "implement", "setup",
        "create", "build", "deploy", "check",
    },
    # Concerns, urgency, and schedule threats.
    "risk": {
        "risk", "risky", "concern", "worried", "urgent", "urgently", "critical",
        "deadline", "delay", "late",
    },
}
|
||||
|
||||
|
||||
def _build_fallback_signal(
    transcript: str,
    sender: str,
    group_id: str,
    timestamp: str,
    voice_meta: dict,
) -> dict:
    """Build a best-effort structured signal for a transcript the LLM ignored.

    Scores each candidate type by keyword overlap with the transcript and
    keeps the highest scorer; 'feature_request' is the safe default when
    nothing matches. Urgent wording bumps severity/urgency to high.
    """
    tokens = set(transcript.lower().split())

    # Pick the candidate type with the largest keyword overlap; ties and
    # zero-overlap both resolve in favor of the earliest/default choice.
    best_type = "feature_request"
    best_score = 0
    for candidate, hints in _FALLBACK_TYPE_HINTS.items():
        score = len(tokens & hints)
        if score > best_score:
            best_type, best_score = candidate, score

    severity = "high" if tokens & {"urgent", "urgently", "asap", "immediately", "critical", "now"} else "medium"

    summary = transcript[:200].strip()
    if len(transcript) > 200:
        summary += "..."

    return {
        "id": str(uuid.uuid4()),
        "type": best_type,
        "summary": f"[Voice @{sender}] {summary}",
        "raw_quote": transcript[:500],
        "severity": severity,
        "status": "unresolved",
        "sentiment": "neutral",
        "urgency": "high" if severity == "high" else "medium",
        "entities": [f"@{sender}"],
        "keywords": _extract_voice_keywords(transcript),
        "timestamp": timestamp,
        "group_id": group_id,
        "lens": "voice",
        "source": "voice",
        "speaker": sender,
        "voice_file_id": voice_meta.get("voice_file_id", ""),
        "voice_duration": voice_meta.get("duration_seconds", 0),
        "voice_language": voice_meta.get("language", ""),
    }
|
||||
|
||||
|
||||
# --- Main handler ------------------------------------------------------------
|
||||
|
||||
async def handle_voice_message(
    bot,
    group_id: str,
    sender: str,
    file_id: str,
    duration_seconds: int,
    message_date,
    is_video_note: bool = False,
) -> dict:
    """
    Full pipeline for a single voice or video note message:
    download -> transcribe -> store raw transcript -> extract signals.

    Args:
        bot: Telegram bot instance used to download the audio file.
        group_id: Telegram group the message came from.
        sender: Display name of the sender (used as speaker attribution).
        file_id: Telegram file ID of the voice/video-note payload.
        duration_seconds: Duration from Telegram metadata (pre-filtering).
        message_date: Message datetime from Telegram; may be None.
        is_video_note: True for round video notes (MP4 audio track).

    Returns:
        {"ok": True, "transcript": "...", "signals_extracted": 3, "duration": 45, ...}
        OR {"ok": False, "reason": "...", "error": "..."}
    """
    # Feature-flagged off: bail out before any network work.
    if not ENABLE_VOICE_TRANSCRIPTION:
        return {"ok": False, "reason": "disabled", "error": "Voice transcription is disabled"}

    msg_type = "video note" if is_video_note else "voice message"
    logger.info(f"Processing {msg_type} from {sender} in {group_id} ({duration_seconds}s)")

    # 1. Download audio
    try:
        audio_bytes = await download_telegram_audio(bot, file_id)
    except Exception as e:
        logger.error(f"Failed to download audio from {sender}: {e}")
        return {"ok": False, "reason": "download_failed", "error": str(e)}

    # 2. Transcribe (video notes are MP4 containers; voice notes are OGG/Opus)
    filename = "audio.mp4" if is_video_note else "audio.ogg"
    transcription = await transcribe_audio(
        audio_bytes,
        filename=filename,
        duration_seconds=duration_seconds,
    )

    # transcribe_audio returns {"ok": False, "reason": ...} for expected
    # skips (too short/long, no speech) as well as API errors.
    if not transcription["ok"]:
        logger.info(f"Transcription skipped for {sender}: {transcription['reason']}")
        return {"ok": False, "reason": transcription["reason"], "error": transcription.get("error", "")}

    transcript = transcription["transcript"]
    language = transcription.get("language", "unknown")
    # NOTE(review): the fallback datetime.utcnow().isoformat() is naive
    # (no tz offset) while the message_date branch is UTC-aware — confirm
    # downstream timestamp comparisons tolerate the mixed formats.
    timestamp = (
        message_date.replace(tzinfo=timezone.utc).isoformat()
        if message_date else datetime.utcnow().isoformat()
    )

    # 3. Store raw voice transcript signal (kept searchable even when the
    #    extractor later finds no structured signals)
    if VOICE_STORE_TRANSCRIPT:
        transcript_signal = build_voice_transcript_signal(
            transcript=transcript,
            sender=sender,
            group_id=group_id,
            voice_file_id=file_id,
            duration_seconds=duration_seconds,
            language=language,
            timestamp=timestamp,
        )
        store_signals(group_id, [transcript_signal])
        logger.info(f"Voice transcript stored for {sender} ({len(transcript)} chars)")

    # 4. Run through signal extraction pipeline — treat as a regular text message
    voice_meta = {
        "sender": sender,
        "voice_file_id": file_id,
        "duration_seconds": duration_seconds,
        "language": language,
    }

    messages = [{
        "sender": sender,
        "text": transcript,
        "timestamp": timestamp,
        "source": "voice",
        "voice_file_id": file_id,
        "voice_duration": duration_seconds,
    }]

    try:
        extracted_signals = await process_message_batch(group_id, messages)
        # NOTE(review): the metadata-injected copies are only counted here,
        # never re-stored — presumably process_message_batch persists the
        # originals itself; verify the voice attribution actually reaches
        # the DB for these signals.
        extracted_signals = _inject_voice_metadata(extracted_signals, voice_meta)
        signals_count = len(extracted_signals)

        # Fallback: if the LLM extracted nothing from a meaningful voice message,
        # create a generic signal so the content is still searchable as structured data.
        if signals_count == 0 and len(transcript.split()) >= 5:
            fallback = _build_fallback_signal(transcript, sender, group_id, timestamp, voice_meta)
            store_signals(group_id, [fallback])
            signals_count = 1
            logger.info(f"Voice fallback signal created for {sender} (0 from LLM)")
    except Exception as e:
        # Extraction failure is non-fatal: the raw transcript (step 3) is
        # already stored, so report success with zero extracted signals.
        logger.error(f"Signal extraction failed for voice from {sender}: {e}")
        signals_count = 0

    logger.info(
        f"Voice pipeline complete: {sender}, {duration_seconds}s, "
        f"{signals_count} signals, transcript={len(transcript)} chars"
    )

    return {
        "ok": True,
        "transcript": transcript,
        "signals_extracted": signals_count,
        "duration": duration_seconds,
        "sender": f"@{sender}",
        "language": language,
    }
|
||||
194
thirdeye/backend/agents/voice_transcriber.py
Normal file
194
thirdeye/backend/agents/voice_transcriber.py
Normal file
@@ -0,0 +1,194 @@
|
||||
"""
|
||||
Voice Transcriber — Groq Whisper integration.
|
||||
|
||||
Uses Groq's whisper-large-v3 model (free, already in provider stack) to transcribe
|
||||
audio bytes from Telegram voice messages and video notes into plain text.
|
||||
|
||||
Groq Whisper endpoint: https://api.groq.com/openai/v1/audio/transcriptions
|
||||
Supported formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, opus, wav, webm
|
||||
Telegram voice messages: OGG/Opus
|
||||
Telegram video notes: MP4
|
||||
|
||||
Free tier limits: 7,200 seconds of audio / hour on Groq free plan.
|
||||
At avg 30s per voice note: ~240 voice notes / hour — more than any team sends.
|
||||
"""
|
||||
import io
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from backend.config import (
|
||||
GROQ_API_KEY,
|
||||
VOICE_LANGUAGE,
|
||||
VOICE_MAX_DURATION_SECONDS,
|
||||
VOICE_MIN_DURATION_SECONDS,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.voice_transcriber")
|
||||
|
||||
GROQ_WHISPER_URL = "https://api.groq.com/openai/v1/audio/transcriptions"
|
||||
WHISPER_MODEL = "whisper-large-v3"
|
||||
|
||||
# Groq file size limit for Whisper: 25 MB
|
||||
GROQ_MAX_FILE_BYTES = 25 * 1024 * 1024
|
||||
|
||||
|
||||
# --- Main transcription function ---------------------------------------------
|
||||
|
||||
async def transcribe_audio(
|
||||
audio_bytes: bytes,
|
||||
filename: str = "audio.ogg",
|
||||
duration_seconds: int = None,
|
||||
) -> dict:
|
||||
"""
|
||||
Transcribe audio bytes using Groq Whisper.
|
||||
|
||||
Args:
|
||||
audio_bytes: Raw audio data (OGG, MP4, WAV, etc.)
|
||||
filename: Filename hint for the API (determines format detection)
|
||||
duration_seconds: Voice message duration from Telegram metadata (for pre-filtering)
|
||||
|
||||
Returns:
|
||||
{
|
||||
"ok": True,
|
||||
"transcript": "The full transcribed text...",
|
||||
"language": "en",
|
||||
"duration": 45,
|
||||
"word_count": 120,
|
||||
}
|
||||
OR on failure:
|
||||
{
|
||||
"ok": False,
|
||||
"reason": "too_long" | "too_short" | "empty" | "file_too_large" | "api_error" | "no_speech",
|
||||
"error": "optional error string",
|
||||
}
|
||||
"""
|
||||
# Pre-flight checks
|
||||
if not GROQ_API_KEY or len(GROQ_API_KEY) < 5:
|
||||
return {"ok": False, "reason": "api_error", "error": "GROQ_API_KEY not set"}
|
||||
|
||||
if not audio_bytes:
|
||||
return {"ok": False, "reason": "empty", "error": "No audio bytes received"}
|
||||
|
||||
if len(audio_bytes) > GROQ_MAX_FILE_BYTES:
|
||||
return {
|
||||
"ok": False,
|
||||
"reason": "file_too_large",
|
||||
"error": f"Audio is {len(audio_bytes) / 1024 / 1024:.1f}MB — Groq limit is 25MB",
|
||||
}
|
||||
|
||||
if duration_seconds is not None:
|
||||
if duration_seconds < VOICE_MIN_DURATION_SECONDS:
|
||||
return {
|
||||
"ok": False,
|
||||
"reason": "too_short",
|
||||
"error": f"Voice note is {duration_seconds}s — minimum is {VOICE_MIN_DURATION_SECONDS}s",
|
||||
}
|
||||
if duration_seconds > VOICE_MAX_DURATION_SECONDS:
|
||||
return {
|
||||
"ok": False,
|
||||
"reason": "too_long",
|
||||
"error": f"Voice note is {duration_seconds}s — maximum is {VOICE_MAX_DURATION_SECONDS}s",
|
||||
}
|
||||
|
||||
# Determine MIME type from filename extension
|
||||
ext_to_mime = {
|
||||
".ogg": "audio/ogg",
|
||||
".opus": "audio/ogg",
|
||||
".mp3": "audio/mpeg",
|
||||
".mp4": "video/mp4",
|
||||
".m4a": "audio/mp4",
|
||||
".wav": "audio/wav",
|
||||
".flac": "audio/flac",
|
||||
".webm": "audio/webm",
|
||||
}
|
||||
ext = "." + filename.rsplit(".", 1)[-1].lower() if "." in filename else ".ogg"
|
||||
mime_type = ext_to_mime.get(ext, "audio/ogg")
|
||||
|
||||
form_data = {
|
||||
"model": WHISPER_MODEL,
|
||||
"response_format": "verbose_json", # returns language detection
|
||||
"temperature": "0", # deterministic transcription
|
||||
}
|
||||
if VOICE_LANGUAGE:
|
||||
form_data["language"] = VOICE_LANGUAGE
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
resp = await client.post(
|
||||
GROQ_WHISPER_URL,
|
||||
headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
|
||||
files={"file": (filename, io.BytesIO(audio_bytes), mime_type)},
|
||||
data=form_data,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
error_text = ""
|
||||
try:
|
||||
error_text = e.response.json().get("error", {}).get("message", e.response.text[:200])
|
||||
except Exception:
|
||||
error_text = e.response.text[:200]
|
||||
|
||||
if e.response.status_code == 429:
|
||||
logger.warning("Groq Whisper rate limited")
|
||||
return {"ok": False, "reason": "api_error", "error": "Rate limited — try again shortly"}
|
||||
logger.error(f"Groq Whisper HTTP error {e.response.status_code}: {error_text}")
|
||||
return {"ok": False, "reason": "api_error", "error": f"HTTP {e.response.status_code}: {error_text}"}
|
||||
|
||||
except httpx.TimeoutException:
|
||||
logger.warning("Groq Whisper request timed out")
|
||||
return {"ok": False, "reason": "api_error", "error": "Request timed out after 60s"}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Groq Whisper unexpected error: {e}")
|
||||
return {"ok": False, "reason": "api_error", "error": str(e)}
|
||||
|
||||
# Parse response
|
||||
transcript = (data.get("text") or "").strip()
|
||||
|
||||
if not transcript:
|
||||
return {"ok": False, "reason": "no_speech", "error": "Whisper returned empty transcript"}
|
||||
|
||||
# Detect if Whisper only returned noise markers
|
||||
noise_patterns = {"[music]", "[noise]", "[silence]", "[inaudible]", "(music)", "(noise)"}
|
||||
if transcript.lower() in noise_patterns:
|
||||
return {"ok": False, "reason": "no_speech", "error": f"Only noise detected: {transcript}"}
|
||||
|
||||
detected_language = data.get("language", VOICE_LANGUAGE or "unknown")
|
||||
word_count = len(transcript.split())
|
||||
|
||||
logger.info(
|
||||
f"Whisper transcribed {duration_seconds or '?'}s audio -> "
|
||||
f"{word_count} words [{detected_language}]: {transcript[:60]}..."
|
||||
)
|
||||
|
||||
return {
|
||||
"ok": True,
|
||||
"transcript": transcript,
|
||||
"language": detected_language,
|
||||
"duration": duration_seconds,
|
||||
"word_count": word_count,
|
||||
}
|
||||
|
||||
|
||||
# --- Telegram-specific download helper ---------------------------------------
|
||||
|
||||
async def download_telegram_audio(bot, file_id: str) -> bytes:
    """Download a Telegram file (voice or video_note) and return raw bytes.

    Resolves the file via the Bot API, pulls the payload into memory,
    and hands back an immutable ``bytes`` object.
    """
    file_ref = await bot.get_file(file_id)
    payload = await file_ref.download_as_bytearray()
    return bytes(payload)
|
||||
|
||||
|
||||
def format_duration(seconds: int) -> str:
|
||||
"""Format seconds into human-readable string: '1m 34s' or '45s'."""
|
||||
if seconds is None:
|
||||
return "?"
|
||||
if seconds >= 60:
|
||||
return f"{seconds // 60}m {seconds % 60}s"
|
||||
return f"{seconds}s"
|
||||
84
thirdeye/backend/agents/web_search.py
Normal file
84
thirdeye/backend/agents/web_search.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""Web Search Agent — Tavily integration for real-time web context."""
|
||||
import logging
|
||||
from backend.config import TAVILY_API_KEY, ENABLE_WEB_SEARCH
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.web_search")
|
||||
|
||||
_tavily_client = None
|
||||
|
||||
|
||||
def _get_client():
    """Return the lazily-initialized module-level Tavily client (or None).

    The client is created on first use, and only when a plausible API key
    is configured. Initialization failures are logged and leave the client
    unset, so a later call can retry.
    """
    global _tavily_client
    if _tavily_client is not None:
        return _tavily_client
    if not (TAVILY_API_KEY and len(TAVILY_API_KEY) > 5):
        return _tavily_client
    try:
        from tavily import TavilyClient
        _tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
        logger.info("Tavily client initialized")
    except ImportError:
        logger.error("tavily-python not installed. Run: pip install tavily-python")
    except Exception as e:
        logger.error(f"Tavily client init failed: {e}")
    return _tavily_client
|
||||
|
||||
|
||||
async def search_web(query: str, max_results: int = 5) -> list[dict]:
    """
    Search the web using Tavily and return structured results.

    Args:
        query: Search query string
        max_results: Max results to return (1-10)

    Returns:
        List of {title, url, content, score} dicts, sorted by relevance.
        Empty list when search is disabled, unconfigured, or fails.
    """
    import asyncio  # local import: only needed on the search path

    if not ENABLE_WEB_SEARCH:
        logger.info("Web search is disabled via feature flag")
        return []

    client = _get_client()
    if not client:
        logger.warning("Tavily client not available (missing API key or install)")
        return []

    try:
        # TavilyClient.search is a synchronous HTTP call; run it in a worker
        # thread so it does not block the event loop for its full duration.
        response = await asyncio.to_thread(
            client.search,
            query=query,
            max_results=max_results,
            search_depth="basic",  # "basic" is faster + free-tier friendly; "advanced" for deeper
            include_answer=False,
            include_raw_content=False,
        )

        # Normalize to a stable schema; missing fields default to empty/0.0.
        results = [
            {
                "title": r.get("title", ""),
                "url": r.get("url", ""),
                "content": r.get("content", ""),
                "score": r.get("score", 0.0),
            }
            for r in response.get("results", [])
        ]

        logger.info(f"Tavily returned {len(results)} results for: {query[:60]}")
        return results

    except Exception as e:
        # Search is best-effort context: never propagate failures to callers.
        logger.error(f"Tavily search failed: {e}")
        return []
|
||||
|
||||
|
||||
def format_search_results_for_llm(results: list[dict]) -> str:
    """Render Tavily results as a plain-text context block for the Query Agent.

    Each result becomes a numbered section with title, source URL, and a
    content preview capped at 500 characters; sections are separated by a
    blank line. Returns "" when there are no results.
    """
    if not results:
        return ""

    sections = []
    for idx, item in enumerate(results, start=1):
        preview = item["content"][:500] if item["content"] else "No content"
        sections.append(
            f"[Web Result {idx}] {item['title']}\n"
            f"Source: {item['url']}\n"
            f"Content: {preview}"
        )
    return "\n\n".join(sections)
|
||||
785
thirdeye/backend/api/routes.py
Normal file
785
thirdeye/backend/api/routes.py
Normal file
@@ -0,0 +1,785 @@
|
||||
"""FastAPI routes for the ThirdEye dashboard."""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from fastapi import FastAPI, HTTPException, Request, BackgroundTasks
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from backend.db.chroma import get_all_signals, query_signals, get_group_ids, get_group_names
|
||||
from backend.pipeline import query_knowledge, get_lens, set_lens
|
||||
from backend.agents.pattern_detector import detect_patterns
|
||||
from backend.agents.cross_group_analyst import analyze_cross_group
|
||||
from collections import defaultdict
|
||||
|
||||
logger = logging.getLogger("thirdeye.api")
|
||||
|
||||
app = FastAPI(title="ThirdEye API", version="1.0.0")
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
|
||||
@app.get("/api/groups")
|
||||
async def list_groups():
|
||||
"""List all monitored groups."""
|
||||
group_ids = get_group_ids()
|
||||
names = get_group_names()
|
||||
groups = []
|
||||
for gid in group_ids:
|
||||
signals = get_all_signals(gid)
|
||||
groups.append({
|
||||
"group_id": gid,
|
||||
"group_name": names.get(gid, gid),
|
||||
"signal_count": len(signals),
|
||||
"lens": get_lens(gid),
|
||||
})
|
||||
return {"groups": groups}
|
||||
|
||||
|
||||
@app.get("/api/groups/{group_id}/signals")
|
||||
async def get_signals(
|
||||
group_id: str,
|
||||
signal_type: str = None,
|
||||
severity: str = None,
|
||||
lens: str = None,
|
||||
date_from: str = None,
|
||||
date_to: str = None,
|
||||
):
|
||||
"""Get signals for a group with optional filters."""
|
||||
signals = get_all_signals(group_id, signal_type=signal_type)
|
||||
|
||||
if severity:
|
||||
signals = [s for s in signals if s.get("metadata", {}).get("severity") == severity]
|
||||
if lens:
|
||||
signals = [s for s in signals if s.get("metadata", {}).get("lens") == lens]
|
||||
if date_from:
|
||||
signals = [s for s in signals if s.get("metadata", {}).get("timestamp", "") >= date_from]
|
||||
if date_to:
|
||||
signals = [s for s in signals if s.get("metadata", {}).get("timestamp", "") <= date_to]
|
||||
|
||||
signals.sort(key=lambda s: s.get("metadata", {}).get("timestamp", ""), reverse=True)
|
||||
return {"signals": signals, "count": len(signals)}
|
||||
|
||||
|
||||
@app.post("/api/groups/{group_id}/query")
|
||||
async def query_group(group_id: str, body: dict):
|
||||
"""Natural language query over a group's knowledge base."""
|
||||
question = body.get("question", "")
|
||||
if not question:
|
||||
raise HTTPException(400, "Missing 'question' field")
|
||||
try:
|
||||
answer = await query_knowledge(group_id, question)
|
||||
return {"answer": answer, "question": question}
|
||||
except Exception as e:
|
||||
logger.warning(f"Query failed for {group_id}: {e}")
|
||||
raise HTTPException(500, "Query processing failed — please try again")
|
||||
|
||||
|
||||
@app.get("/api/groups/{group_id}/patterns")
|
||||
async def get_patterns(group_id: str):
|
||||
"""Detect and return patterns for a group."""
|
||||
try:
|
||||
patterns = await asyncio.wait_for(detect_patterns(group_id), timeout=25.0)
|
||||
return {"patterns": [p.model_dump() for p in patterns]}
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(f"Pattern detection timed out for {group_id}")
|
||||
return {"patterns": []}
|
||||
except Exception as e:
|
||||
logger.warning(f"Pattern detection failed for {group_id}: {e}")
|
||||
return {"patterns": []}
|
||||
|
||||
|
||||
@app.get("/api/cross-group/insights")
|
||||
async def get_cross_group_insights():
|
||||
"""Run cross-group analysis and return insights."""
|
||||
try:
|
||||
group_ids = get_group_ids()
|
||||
if len(group_ids) < 2:
|
||||
return {"insights": [], "message": "Need at least 2 monitored groups"}
|
||||
|
||||
summaries = {}
|
||||
for gid in group_ids:
|
||||
summaries[gid] = get_all_signals(gid)
|
||||
|
||||
insights = await asyncio.wait_for(analyze_cross_group(summaries), timeout=25.0)
|
||||
return {"insights": [i.model_dump() for i in insights]}
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("Cross-group analysis timed out — returning heuristic fallback")
|
||||
from backend.agents.cross_group_analyst import _heuristic_cross_group_insights
|
||||
fallback = _heuristic_cross_group_insights(summaries)
|
||||
return {"insights": [i.model_dump() for i in fallback]}
|
||||
except Exception as e:
|
||||
logger.warning(f"Cross-group analysis failed: {e}")
|
||||
return {"insights": [], "message": "Analysis temporarily unavailable"}
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health():
|
||||
return {"status": "ok", "service": "thirdeye"}
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# Google Meet Ingestion Endpoints
|
||||
# ─────────────────────────────────────────────
|
||||
|
||||
from pydantic import BaseModel
|
||||
from backend.config import MEET_INGEST_SECRET, ENABLE_MEET_INGESTION
|
||||
|
||||
class MeetStartPayload(BaseModel):
    """Body of POST /api/meet/start — sent once when a meeting begins."""
    # Google Meet meeting identifier; also tags downstream signals.
    meeting_id: str
    # Target ThirdEye group; defaults to the shared meet bucket.
    group_id: str = "meet_sessions"
    # Meeting start time as reported by the extension (ISO-8601 string).
    started_at: str
    # Display name of the user who started the capture.
    speaker: str = "Unknown"
|
||||
|
||||
class MeetChunkPayload(BaseModel):
    """Body of POST /api/meet/ingest — one transcript chunk (sent ~every 30s)."""
    # Meeting this chunk belongs to.
    meeting_id: str
    # Target ThirdEye group; defaults to the shared meet bucket.
    group_id: str = "meet_sessions"
    # Position of this chunk within the meeting.
    chunk_index: int
    # Raw transcript text captured by the extension.
    text: str
    # Speaker label as reported by the extension.
    speaker: str = "Unknown"
    # Capture time of the chunk (ISO-8601 string).
    timestamp: str
    # True on the final chunk of the meeting.
    is_final: bool = False
|
||||
|
||||
def _verify_meet_secret(request: Request):
    """Reject the request with 403 unless it carries the shared Meet secret.

    The extension sends the secret in the ``X-ThirdEye-Secret`` header.
    NOTE(review): if MEET_INGEST_SECRET is configured as an empty string, a
    request with no header would pass ("" == "") — confirm config enforces
    a non-empty secret.
    """
    secret = request.headers.get("X-ThirdEye-Secret", "")
    if secret != MEET_INGEST_SECRET:
        # HTTPException is already imported at module scope; the previous
        # redundant function-local import has been removed.
        raise HTTPException(status_code=403, detail="Invalid Meet ingest secret")
|
||||
|
||||
@app.post("/api/meet/start")
|
||||
async def meet_start(payload: MeetStartPayload, request: Request):
|
||||
"""Called by extension when a new meeting begins."""
|
||||
_verify_meet_secret(request)
|
||||
if not ENABLE_MEET_INGESTION:
|
||||
return {"ok": False, "reason": "Meet ingestion disabled"}
|
||||
|
||||
# Store a meeting-started signal immediately
|
||||
from backend.db.chroma import store_signals
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
|
||||
signal = {
|
||||
"id": str(uuid.uuid4()),
|
||||
"type": "meet_started",
|
||||
"summary": f"Meeting {payload.meeting_id} started by {payload.speaker}",
|
||||
"raw_quote": "",
|
||||
"severity": "low",
|
||||
"status": "active",
|
||||
"sentiment": "neutral",
|
||||
"urgency": "none",
|
||||
"entities": [f"@{payload.speaker}", f"#{payload.meeting_id}"],
|
||||
"keywords": ["meeting", "started", payload.meeting_id],
|
||||
"timestamp": payload.started_at,
|
||||
"group_id": payload.group_id,
|
||||
"lens": "meet",
|
||||
"meeting_id": payload.meeting_id,
|
||||
}
|
||||
store_signals(payload.group_id, [signal])
|
||||
return {"ok": True, "meeting_id": payload.meeting_id}
|
||||
|
||||
|
||||
@app.post("/api/meet/ingest")
|
||||
async def meet_ingest(payload: MeetChunkPayload, request: Request, background_tasks: BackgroundTasks):
|
||||
"""Called by extension every 30s with a transcript chunk."""
|
||||
_verify_meet_secret(request)
|
||||
if not ENABLE_MEET_INGESTION:
|
||||
return {"ok": False, "reason": "Meet ingestion disabled"}
|
||||
|
||||
if len(payload.text.strip()) < 10:
|
||||
return {"ok": True, "skipped": True, "reason": "chunk too short"}
|
||||
|
||||
# Process asynchronously so the extension gets a fast response
|
||||
from backend.agents.meet_ingestor import process_meet_chunk
|
||||
background_tasks.add_task(
|
||||
process_meet_chunk,
|
||||
payload.meeting_id,
|
||||
payload.group_id,
|
||||
payload.chunk_index,
|
||||
payload.text,
|
||||
payload.speaker,
|
||||
payload.timestamp,
|
||||
payload.is_final,
|
||||
)
|
||||
|
||||
return {
|
||||
"ok": True,
|
||||
"meeting_id": payload.meeting_id,
|
||||
"chunk_index": payload.chunk_index,
|
||||
"queued": True,
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/meet/meetings")
|
||||
async def list_meetings():
|
||||
"""List all recorded meetings with their signal counts."""
|
||||
from backend.db.chroma import get_all_signals, get_group_ids
|
||||
|
||||
meetings = {}
|
||||
|
||||
# Check all groups for meet-related signals
|
||||
for group_id in get_group_ids():
|
||||
all_signals = get_all_signals(group_id)
|
||||
for sig in all_signals:
|
||||
# Only process signals that have lens="meet"
|
||||
if sig.get("metadata", {}).get("lens") != "meet":
|
||||
continue
|
||||
|
||||
mid = sig.get("metadata", {}).get("meeting_id", "unknown")
|
||||
if not mid or mid == "":
|
||||
continue
|
||||
|
||||
if mid not in meetings:
|
||||
meetings[mid] = {"meeting_id": mid, "signal_count": 0, "types": {}}
|
||||
meetings[mid]["signal_count"] += 1
|
||||
t = sig.get("metadata", {}).get("type", "unknown")
|
||||
meetings[mid]["types"][t] = meetings[mid]["types"].get(t, 0) + 1
|
||||
|
||||
return {"meetings": list(meetings.values())}
|
||||
|
||||
|
||||
@app.get("/api/meet/meetings/{meeting_id}/signals")
|
||||
async def get_meeting_signals(meeting_id: str):
|
||||
"""Get all signals for a specific meeting."""
|
||||
from backend.db.chroma import get_all_signals, get_group_ids
|
||||
|
||||
all_signals = []
|
||||
for gid in get_group_ids():
|
||||
for sig in get_all_signals(gid):
|
||||
meta = sig.get("metadata", {})
|
||||
if meta.get("meeting_id") == meeting_id and meta.get("lens") == "meet":
|
||||
all_signals.append(sig)
|
||||
|
||||
all_signals.sort(key=lambda s: s.get("metadata", {}).get("timestamp", ""))
|
||||
return {"meeting_id": meeting_id, "signals": all_signals, "count": len(all_signals)}
|
||||
|
||||
|
||||
@app.get("/api/meet/meetings/{meeting_id}")
|
||||
async def get_meeting_detail(meeting_id: str):
|
||||
"""Get detailed info for a single meeting."""
|
||||
from backend.db.chroma import get_all_signals, get_group_ids
|
||||
|
||||
signals_by_type: dict = {}
|
||||
started_at = ""
|
||||
speaker = "Unknown"
|
||||
group_id = ""
|
||||
|
||||
for gid in get_group_ids():
|
||||
for sig in get_all_signals(gid):
|
||||
meta = sig.get("metadata", {})
|
||||
if meta.get("meeting_id") != meeting_id or meta.get("lens") != "meet":
|
||||
continue
|
||||
sig_type = meta.get("type", "unknown")
|
||||
signals_by_type.setdefault(sig_type, []).append(sig)
|
||||
if sig_type == "meet_started":
|
||||
started_at = meta.get("timestamp", "")
|
||||
speaker = meta.get("speaker", "Unknown") or meta.get("entities", "Unknown")
|
||||
group_id = gid
|
||||
|
||||
# Summary signal text
|
||||
summary_text = ""
|
||||
for sig in signals_by_type.get("meet_summary", []):
|
||||
summary_text = sig.get("metadata", {}).get("summary", "") or sig.get("document", "")
|
||||
break
|
||||
|
||||
signal_counts = {k: len(v) for k, v in signals_by_type.items()}
|
||||
total_signals = sum(signal_counts.values())
|
||||
|
||||
return {
|
||||
"meeting_id": meeting_id,
|
||||
"started_at": started_at,
|
||||
"speaker": speaker,
|
||||
"group_id": group_id,
|
||||
"total_signals": total_signals,
|
||||
"signal_counts": signal_counts,
|
||||
"summary": summary_text,
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/meet/meetings/{meeting_id}/transcript")
|
||||
async def get_meeting_transcript(meeting_id: str):
|
||||
"""Get raw transcript chunks for a meeting in chronological order."""
|
||||
from backend.db.chroma import get_all_signals, get_group_ids
|
||||
|
||||
chunks = []
|
||||
for gid in get_group_ids():
|
||||
for sig in get_all_signals(gid):
|
||||
meta = sig.get("metadata", {})
|
||||
if meta.get("meeting_id") == meeting_id and meta.get("type") == "meet_chunk_raw":
|
||||
chunks.append({
|
||||
"id": sig.get("id", ""),
|
||||
"text": meta.get("raw_quote", "") or sig.get("document", ""),
|
||||
"speaker": meta.get("speaker", "Unknown"),
|
||||
"timestamp": meta.get("timestamp", ""),
|
||||
"summary": meta.get("summary", ""),
|
||||
})
|
||||
|
||||
chunks.sort(key=lambda c: c["timestamp"])
|
||||
return {"meeting_id": meeting_id, "transcript": chunks, "chunk_count": len(chunks)}
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# Jira Endpoints
|
||||
# ─────────────────────────────────────────────
|
||||
|
||||
class JiraRaisePayload(BaseModel):
    """Body of POST /api/jira/raise — promote an existing signal to a ticket."""
    # ID of the ThirdEye signal to turn into a ticket.
    signal_id: str
    # Group that owns the signal.
    group_id: str
    # Optional Jira project override; falls back to the default project.
    # Annotated explicitly as optional — pydantic v2 rejects `str = None`.
    project_key: str | None = None
    # Skip duplicate checks when True.
    force: bool = False
|
||||
|
||||
class JiraCreatePayload(BaseModel):
    """Body of POST /api/jira/create — create a custom ticket from the dashboard."""
    # Ticket title.
    summary: str
    # Ticket body; the endpoint substitutes a placeholder when empty.
    description: str = ""
    # Optional Jira project override; falls back to the default project.
    # Annotated explicitly as optional — pydantic v2 rejects `str = None`.
    project_key: str | None = None
    # Jira issue type name.
    issue_type: str = "Task"
    # Jira priority name.
    priority: str = "Medium"
    # Labels to attach; the endpoint substitutes defaults when empty.
    labels: list = []
    # Optional Jira account id to assign the ticket to.
    assignee_account_id: str | None = None
|
||||
|
||||
|
||||
@app.get("/api/jira/tickets")
|
||||
async def list_jira_tickets(
|
||||
group_id: str = None,
|
||||
date_from: str = None,
|
||||
date_to: str = None,
|
||||
live: bool = False,
|
||||
):
|
||||
"""List all Jira tickets raised by ThirdEye across all groups."""
|
||||
from backend.db.chroma import get_all_signals, get_group_ids
|
||||
from backend.integrations.jira_client import get_issue, is_configured
|
||||
|
||||
group_ids = [group_id] if group_id else get_group_ids()
|
||||
tickets = []
|
||||
|
||||
for gid in group_ids:
|
||||
for sig in get_all_signals(gid, signal_type="jira_raised"):
|
||||
meta = sig.get("metadata", {})
|
||||
ts = meta.get("timestamp", "")
|
||||
if date_from and ts < date_from:
|
||||
continue
|
||||
if date_to and ts > date_to:
|
||||
continue
|
||||
|
||||
jira_key = meta.get("jira_key", "") or (
|
||||
json.loads(meta.get("entities", "[]") or "[]") or [""]
|
||||
)[0]
|
||||
|
||||
tickets.append({
|
||||
"id": sig.get("id", ""),
|
||||
"jira_key": jira_key,
|
||||
"jira_url": meta.get("jira_url", ""),
|
||||
"jira_summary": meta.get("jira_summary", "") or meta.get("summary", ""),
|
||||
"jira_priority": meta.get("jira_priority", "Medium"),
|
||||
"original_signal_id": meta.get("original_signal_id", "") or meta.get("raw_quote", ""),
|
||||
"group_id": gid,
|
||||
"raised_at": ts,
|
||||
"status": "Unknown",
|
||||
})
|
||||
|
||||
# Fetch live status from Jira if requested and configured
|
||||
if live and is_configured() and tickets:
|
||||
for ticket in tickets:
|
||||
if ticket["jira_key"]:
|
||||
try:
|
||||
issue_data = await get_issue(ticket["jira_key"])
|
||||
ticket["status"] = issue_data.get("status", "Unknown")
|
||||
ticket["assignee"] = issue_data.get("assignee", "Unassigned")
|
||||
if not ticket["jira_summary"]:
|
||||
ticket["jira_summary"] = issue_data.get("summary", "")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
tickets.sort(key=lambda t: t["raised_at"], reverse=True)
|
||||
return {"tickets": tickets, "count": len(tickets)}
|
||||
|
||||
|
||||
@app.get("/api/jira/tickets/{ticket_key}/status")
|
||||
async def get_jira_ticket_status(ticket_key: str):
|
||||
"""Fetch live status for a Jira ticket."""
|
||||
from backend.integrations.jira_client import get_issue, is_configured
|
||||
|
||||
if not is_configured():
|
||||
raise HTTPException(503, "Jira is not configured")
|
||||
try:
|
||||
data = await get_issue(ticket_key)
|
||||
return data
|
||||
except Exception as e:
|
||||
raise HTTPException(502, f"Jira API error: {e}")
|
||||
|
||||
|
||||
@app.post("/api/jira/raise")
|
||||
async def raise_jira_ticket(payload: JiraRaisePayload):
|
||||
"""Raise a Jira ticket for an existing ThirdEye signal."""
|
||||
from backend.db.chroma import get_all_signals
|
||||
from backend.agents.jira_agent import raise_ticket_for_signal
|
||||
from backend.integrations.jira_client import is_configured
|
||||
|
||||
if not is_configured():
|
||||
raise HTTPException(503, "Jira is not configured")
|
||||
|
||||
# Find the signal in the group
|
||||
signals = get_all_signals(payload.group_id)
|
||||
target = next((s for s in signals if s.get("id") == payload.signal_id), None)
|
||||
if not target:
|
||||
raise HTTPException(404, f"Signal {payload.signal_id} not found in group {payload.group_id}")
|
||||
|
||||
# Build flat signal dict from stored format
|
||||
meta = target.get("metadata", {})
|
||||
signal_dict = {
|
||||
"id": target.get("id", ""),
|
||||
"type": meta.get("type", "unknown"),
|
||||
"summary": meta.get("summary", "") or target.get("document", ""),
|
||||
"raw_quote": meta.get("raw_quote", ""),
|
||||
"severity": meta.get("severity", "medium"),
|
||||
"status": meta.get("status", "open"),
|
||||
"entities": json.loads(meta.get("entities", "[]") or "[]"),
|
||||
"keywords": json.loads(meta.get("keywords", "[]") or "[]"),
|
||||
"timestamp": meta.get("timestamp", ""),
|
||||
"group_id": payload.group_id,
|
||||
"lens": meta.get("lens", ""),
|
||||
}
|
||||
|
||||
result = await raise_ticket_for_signal(
|
||||
signal_dict,
|
||||
payload.group_id,
|
||||
project_key=payload.project_key,
|
||||
force=payload.force,
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
@app.post("/api/jira/create")
|
||||
async def create_jira_ticket(payload: JiraCreatePayload):
|
||||
"""Create a custom Jira ticket directly from the dashboard."""
|
||||
from backend.integrations.jira_client import create_issue, is_configured
|
||||
from backend.config import JIRA_DEFAULT_PROJECT
|
||||
from backend.db.chroma import store_signals
|
||||
import uuid as _uuid
|
||||
from datetime import datetime
|
||||
|
||||
if not is_configured():
|
||||
raise HTTPException(503, "Jira is not configured")
|
||||
|
||||
result = await create_issue(
|
||||
project_key=payload.project_key or JIRA_DEFAULT_PROJECT,
|
||||
summary=payload.summary,
|
||||
description=payload.description or "(Created from ThirdEye Dashboard)",
|
||||
issue_type=payload.issue_type,
|
||||
priority=payload.priority,
|
||||
labels=payload.labels or ["thirdeye", "dashboard"],
|
||||
assignee_account_id=payload.assignee_account_id or None,
|
||||
)
|
||||
|
||||
# Persist a jira_raised tracking signal so it appears in the ticket list
|
||||
if result.get("ok"):
|
||||
jira_key = result["key"]
|
||||
jira_url = result.get("url", "")
|
||||
tracking_signal = {
|
||||
"id": str(_uuid.uuid4()),
|
||||
"type": "jira_raised",
|
||||
"summary": payload.summary,
|
||||
"raw_quote": "manual",
|
||||
"severity": payload.priority.lower() if payload.priority else "medium",
|
||||
"status": "raised",
|
||||
"sentiment": "neutral",
|
||||
"urgency": "none",
|
||||
"entities": [jira_key],
|
||||
"keywords": ["jira", jira_key, "manual", "dashboard"],
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"group_id": "dashboard",
|
||||
"lens": "jira",
|
||||
"jira_key": jira_key,
|
||||
"jira_url": jira_url,
|
||||
"jira_summary": payload.summary,
|
||||
"jira_priority": payload.priority or "Medium",
|
||||
"original_signal_id": "manual",
|
||||
}
|
||||
store_signals("dashboard", [tracking_signal])
|
||||
logger.info(f"Stored manually-created Jira ticket {jira_key} in ChromaDB (group=dashboard)")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@app.get("/api/jira/users/search")
|
||||
async def search_jira_users(q: str = ""):
|
||||
"""Search Jira users by name or email fragment for assignee picker."""
|
||||
from backend.integrations.jira_client import search_users, is_configured
|
||||
if not is_configured():
|
||||
raise HTTPException(503, "Jira is not configured")
|
||||
if not q or len(q.strip()) < 1:
|
||||
return {"users": []}
|
||||
try:
|
||||
users = await search_users(q.strip(), max_results=8)
|
||||
return {"users": users}
|
||||
except Exception as e:
|
||||
logger.warning(f"Jira user search failed: {e}")
|
||||
return {"users": []}
|
||||
|
||||
|
||||
@app.get("/api/jira/config")
|
||||
async def get_jira_config():
|
||||
"""Check if Jira is configured and return basic project info."""
|
||||
from backend.integrations.jira_client import is_configured, test_connection, list_projects
|
||||
from backend.config import JIRA_DEFAULT_PROJECT, JIRA_BASE_URL
|
||||
|
||||
configured = is_configured()
|
||||
if not configured:
|
||||
return {"configured": False}
|
||||
|
||||
conn = await test_connection()
|
||||
projects = []
|
||||
if conn.get("ok"):
|
||||
try:
|
||||
projects = await list_projects()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return {
|
||||
"configured": True,
|
||||
"connected": conn.get("ok", False),
|
||||
"base_url": JIRA_BASE_URL,
|
||||
"default_project": JIRA_DEFAULT_PROJECT,
|
||||
"projects": projects,
|
||||
}
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# Knowledge Browser
|
||||
# ─────────────────────────────────────────────
|
||||
|
||||
_BROWSE_EXCLUDED_TYPES = {
|
||||
"jira_raised", "meet_started", "meet_chunk_raw", "voice_transcript",
|
||||
}
|
||||
|
||||
_BROWSE_STOPWORDS = {
|
||||
"the", "a", "an", "is", "in", "on", "at", "to", "and", "or", "not",
|
||||
"for", "of", "it", "this", "that", "be", "with", "as", "was", "are",
|
||||
"has", "have", "but", "by", "from", "we", "our", "they", "its",
|
||||
"will", "can", "would", "should", "about", "all", "new", "use",
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/knowledge/browse/{group_id}")
|
||||
async def browse_knowledge(
|
||||
group_id: str,
|
||||
date_from: str = None,
|
||||
date_to: str = None,
|
||||
topic: str = None,
|
||||
):
|
||||
"""
|
||||
Browse a group's knowledge base organized by AI-clustered topics and date timeline.
|
||||
Returns topics (derived from keyword frequency) and a day-by-day timeline of signals.
|
||||
Excludes internal system signals (jira_raised, meet_chunk_raw, etc.).
|
||||
"""
|
||||
all_sigs = get_all_signals(group_id)
|
||||
|
||||
# Strip system / tracking signals
|
||||
signals = [
|
||||
s for s in all_sigs
|
||||
if s.get("metadata", {}).get("type", "") not in _BROWSE_EXCLUDED_TYPES
|
||||
]
|
||||
|
||||
# Date filtering
|
||||
if date_from:
|
||||
signals = [s for s in signals if s.get("metadata", {}).get("timestamp", "") >= date_from]
|
||||
if date_to:
|
||||
signals = [s for s in signals if s.get("metadata", {}).get("timestamp", "") <= date_to + "T23:59:59"]
|
||||
|
||||
# ── Build keyword frequency map ────────────────────────────────────────────
|
||||
keyword_freq: dict[str, int] = defaultdict(int)
|
||||
for sig in signals:
|
||||
raw_kws = sig.get("metadata", {}).get("keywords", "[]")
|
||||
try:
|
||||
kws: list = json.loads(raw_kws) if isinstance(raw_kws, str) else raw_kws
|
||||
except Exception:
|
||||
kws = []
|
||||
for kw in kws:
|
||||
kw_clean = str(kw).lower().strip()
|
||||
if len(kw_clean) > 2 and kw_clean not in _BROWSE_STOPWORDS:
|
||||
keyword_freq[kw_clean] += 1
|
||||
|
||||
# Top 25 keywords become the selectable topics (must appear in ≥2 signals)
|
||||
sorted_kws = sorted(keyword_freq.items(), key=lambda x: x[1], reverse=True)
|
||||
top_topics: list[str] = [kw for kw, freq in sorted_kws[:25] if freq >= 1]
|
||||
|
||||
def _primary_topic(sig: dict) -> str:
|
||||
"""Return the highest-ranked top-topic that this signal's keywords contain."""
|
||||
raw_kws = sig.get("metadata", {}).get("keywords", "[]")
|
||||
try:
|
||||
kw_set = {str(k).lower().strip() for k in (json.loads(raw_kws) if isinstance(raw_kws, str) else raw_kws)}
|
||||
except Exception:
|
||||
kw_set = set()
|
||||
for t in top_topics:
|
||||
if t in kw_set:
|
||||
return t
|
||||
return sig.get("metadata", {}).get("type", "other").replace("_", " ")
|
||||
|
||||
def _all_topics(sig: dict) -> list[str]:
|
||||
"""Return all top-topics that this signal belongs to."""
|
||||
raw_kws = sig.get("metadata", {}).get("keywords", "[]")
|
||||
try:
|
||||
kw_set = {str(k).lower().strip() for k in (json.loads(raw_kws) if isinstance(raw_kws, str) else raw_kws)}
|
||||
except Exception:
|
||||
kw_set = set()
|
||||
matched = [t for t in top_topics if t in kw_set]
|
||||
return matched if matched else [sig.get("metadata", {}).get("type", "other").replace("_", " ")]
|
||||
|
||||
# ── Topic-filter (optional) ────────────────────────────────────────────────
|
||||
if topic:
|
||||
topic_lower = topic.lower()
|
||||
filtered = []
|
||||
for sig in signals:
|
||||
raw_kws = sig.get("metadata", {}).get("keywords", "[]")
|
||||
try:
|
||||
kws = [str(k).lower().strip() for k in (json.loads(raw_kws) if isinstance(raw_kws, str) else raw_kws)]
|
||||
except Exception:
|
||||
kws = []
|
||||
sig_type = sig.get("metadata", {}).get("type", "").replace("_", " ")
|
||||
if topic_lower in kws or topic_lower == sig_type:
|
||||
filtered.append(sig)
|
||||
signals = filtered
|
||||
|
||||
# ── Build topics summary list ──────────────────────────────────────────────
|
||||
topic_buckets: dict[str, list] = defaultdict(list)
|
||||
for sig in (get_all_signals(group_id) if not (date_from or date_to or topic) else signals):
|
||||
# Rebuild buckets from the full unfiltered set for sidebar counts
|
||||
pass
|
||||
|
||||
# Use current filtered signals for topic counts
|
||||
for sig in signals:
|
||||
primary = _primary_topic(sig)
|
||||
topic_buckets[primary].append(sig)
|
||||
|
||||
topics_summary = []
|
||||
seen_topics: set[str] = set()
|
||||
for t in top_topics:
|
||||
bucket = topic_buckets.get(t, [])
|
||||
if bucket and t not in seen_topics:
|
||||
seen_topics.add(t)
|
||||
latest_ts = max((s.get("metadata", {}).get("timestamp", "") for s in bucket), default="")
|
||||
topics_summary.append({
|
||||
"name": t,
|
||||
"signal_count": len(bucket),
|
||||
"latest": latest_ts,
|
||||
"sample_signals": [
|
||||
s.get("metadata", {}).get("summary", "") or s.get("document", "")
|
||||
for s in bucket[:2]
|
||||
],
|
||||
})
|
||||
# Add leftover types as topics
|
||||
for t, bucket in sorted(topic_buckets.items(), key=lambda x: len(x[1]), reverse=True):
|
||||
if t not in seen_topics and bucket:
|
||||
seen_topics.add(t)
|
||||
latest_ts = max((s.get("metadata", {}).get("timestamp", "") for s in bucket), default="")
|
||||
topics_summary.append({
|
||||
"name": t,
|
||||
"signal_count": len(bucket),
|
||||
"latest": latest_ts,
|
||||
"sample_signals": [
|
||||
s.get("metadata", {}).get("summary", "") or s.get("document", "")
|
||||
for s in bucket[:2]
|
||||
],
|
||||
})
|
||||
topics_summary.sort(key=lambda t: t["signal_count"], reverse=True)
|
||||
|
||||
# ── Build day-by-day timeline ──────────────────────────────────────────────
|
||||
day_buckets: dict[str, list] = defaultdict(list)
|
||||
for sig in signals:
|
||||
ts = sig.get("metadata", {}).get("timestamp", "")
|
||||
date_key = ts[:10] if ts and len(ts) >= 10 else "unknown"
|
||||
day_buckets[date_key].append(sig)
|
||||
|
||||
timeline = []
|
||||
for date_key in sorted(day_buckets.keys(), reverse=True):
|
||||
day_sigs = sorted(
|
||||
day_buckets[date_key],
|
||||
key=lambda s: s.get("metadata", {}).get("timestamp", ""),
|
||||
reverse=True,
|
||||
)
|
||||
day_topics = list(dict.fromkeys(
|
||||
t for s in day_sigs for t in _all_topics(s)
|
||||
))
|
||||
timeline.append({
|
||||
"date": date_key,
|
||||
"signals": day_sigs,
|
||||
"topics": day_topics[:6],
|
||||
"signal_count": len(day_sigs),
|
||||
})
|
||||
|
||||
# ── Date range metadata ────────────────────────────────────────────────────
|
||||
all_ts = [
|
||||
s.get("metadata", {}).get("timestamp", "")
|
||||
for s in signals
|
||||
if s.get("metadata", {}).get("timestamp", "")
|
||||
]
|
||||
date_range = {
|
||||
"earliest": min(all_ts) if all_ts else "",
|
||||
"latest": max(all_ts) if all_ts else "",
|
||||
}
|
||||
|
||||
names = get_group_names()
|
||||
return {
|
||||
"group_id": group_id,
|
||||
"group_name": names.get(group_id, group_id),
|
||||
"total_signals": len(signals),
|
||||
"date_range": date_range,
|
||||
"topics": topics_summary,
|
||||
"timeline": timeline,
|
||||
}
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# Enhanced Chat / Signals Timeline
|
||||
# ─────────────────────────────────────────────
|
||||
|
||||
@app.get("/api/signals/timeline")
async def get_signals_timeline(
    group_id: str = None,
    severity: str = None,
    lens: str = None,
    signal_type: str = None,
    date_from: str = None,
    date_to: str = None,
    limit: int = 200,
):
    """
    Cross-group signal timeline with full filter support.
    Returns signals sorted newest-first, ready for timeline rendering.
    """
    from backend.db.chroma import get_all_signals, get_group_ids, get_group_names

    target_groups = [group_id] if group_id else get_group_ids()
    group_names = get_group_names()

    # Internal bookkeeping signals that should never surface on the timeline.
    hidden_types = ("jira_raised", "meet_started")

    def _passes(meta: dict, ts: str) -> bool:
        # Every filter is optional; an unset filter accepts everything.
        if severity and meta.get("severity") != severity:
            return False
        if lens and meta.get("lens") != lens:
            return False
        if date_from and ts < date_from:
            return False
        if date_to and ts > date_to:
            return False
        return meta.get("type") not in hidden_types

    collected = []
    for gid in target_groups:
        label = group_names.get(gid, gid)
        for sig in get_all_signals(gid, signal_type=signal_type):
            meta = sig.get("metadata", {})
            if _passes(meta, meta.get("timestamp", "")):
                collected.append({**sig, "group_name": label})

    # ISO-8601 strings sort chronologically, so a plain string sort suffices.
    collected.sort(key=lambda s: s.get("metadata", {}).get("timestamp", ""), reverse=True)
    return {
        "signals": collected[:limit],
        "total": len(collected),
        "truncated": len(collected) > limit,
    }
|
||||
|
||||
1500
thirdeye/backend/bot/bot.py
Normal file
1500
thirdeye/backend/bot/bot.py
Normal file
File diff suppressed because it is too large
Load Diff
150
thirdeye/backend/bot/commands.py
Normal file
150
thirdeye/backend/bot/commands.py
Normal file
@@ -0,0 +1,150 @@
|
||||
"""
|
||||
ThirdEye bot commands — voice intelligence.
|
||||
Houses cmd_voicelog and any future command handlers that don't belong in the
|
||||
main bot.py module.
|
||||
"""
|
||||
import logging
from datetime import datetime, timezone
|
||||
|
||||
logger = logging.getLogger("thirdeye.bot.commands")
|
||||
|
||||
|
||||
async def cmd_voicelog(update, context):
    """
    /voicelog [filter]
    Audit trail of all voice note decisions, actions, and blockers in this group.

    Usage:
        /voicelog — all voice-sourced signals (last 20)
        /voicelog decisions — only decisions from voice notes
        /voicelog actions — only action items from voice notes
        /voicelog blockers — only blockers from voice notes
        /voicelog @Raj — only voice notes by Raj
        /voicelog search [query] — search voice note content
    """
    from backend.db.chroma import query_signals, get_all_signals
    from backend.agents.voice_transcriber import format_duration

    chat_id = str(update.effective_chat.id)
    args = context.args or []

    # Parse the single optional filter argument.
    filter_type = None
    filter_speaker = None
    search_query = None

    if args:
        first = args[0].lower()
        if first == "decisions":
            filter_type = "architecture_decision"
        elif first == "actions":
            filter_type = "action_item"
        elif first == "blockers":
            filter_type = "blocker"
        elif first == "search" and len(args) > 1:
            search_query = " ".join(args[1:])
        elif first.startswith("@"):
            filter_speaker = first[1:]

    await update.message.reply_text("🎤 Searching voice notes...", parse_mode="Markdown")

    if search_query:
        raw_signals = query_signals(chat_id, search_query, n_results=30)
    else:
        raw_signals = get_all_signals(chat_id)

    # Normalise: both query_signals and get_all_signals return
    # {"document": ..., "metadata": {...}, "id": ...} shaped dicts.
    # Flatten metadata to top-level for uniform field access below.
    def _flatten(s: dict) -> dict:
        meta = s.get("metadata", {})
        flat = {**meta}
        flat.setdefault("id", s.get("id", ""))
        flat.setdefault("document", s.get("document", ""))
        return flat

    all_signals = [_flatten(s) for s in raw_signals]

    # Filter to voice-sourced signals only
    voice_signals = [
        s for s in all_signals
        if s.get("source") == "voice"
        or s.get("type") == "voice_transcript"
        or "[Voice @" in s.get("summary", "")
    ]

    if filter_type:
        voice_signals = [s for s in voice_signals if s.get("type") == filter_type]
    if filter_speaker:
        voice_signals = [
            s for s in voice_signals
            if filter_speaker.lower() in s.get("speaker", "").lower()
            or filter_speaker.lower() in str(s.get("entities", [])).lower()
        ]

    # Prefer structured signals; fall back to raw transcripts if none
    structured = [s for s in voice_signals if s.get("type") != "voice_transcript"]
    display_signals = structured if structured else voice_signals

    # Sort by timestamp descending.
    # BUG FIX: stored timestamps are a mix of naive ISO strings
    # (datetime.utcnow().isoformat(), as written by the Chroma layer) and
    # "Z"-suffixed ones; parsing the latter yields *aware* datetimes, and the
    # old naive `datetime.min` fallback made sort() compare aware vs naive,
    # which raises TypeError. Normalise every key to aware UTC instead.
    def _ts(s):
        try:
            dt = datetime.fromisoformat(s.get("timestamp", "").replace("Z", "+00:00"))
        except Exception:
            return datetime.min.replace(tzinfo=timezone.utc)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt

    display_signals.sort(key=_ts, reverse=True)
    display_signals = display_signals[:20]

    if not display_signals:
        await update.message.reply_text(
            "📭 No voice note signals found. Voice notes are transcribed automatically when sent here.",
            parse_mode="Markdown",
        )
        return

    type_emoji = {
        "architecture_decision": "🏗️",
        "tech_debt": "⚠️",
        "action_item": "📌",
        "blocker": "🚧",
        "feature_request": "💡",
        "promise": "🤝",
        "risk": "🔴",
        "recurring_bug": "🐛",
        "voice_transcript": "🎤",
    }

    filter_label = ""
    if filter_type:
        filter_label = f" — {filter_type.replace('_', ' ').title()}"
    elif filter_speaker:
        filter_label = f" — @{filter_speaker}"
    elif search_query:
        filter_label = f" — '{search_query}'"

    lines = [f"🎤 *Voice Note Audit Trail*{filter_label}\n_{len(display_signals)} signal(s)_\n"]

    for sig in display_signals:
        ts = sig.get("timestamp", "")
        date_str = ""
        if ts:
            try:
                dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
                date_str = dt.strftime("%b %d")
            except Exception:
                date_str = ts[:10]  # fall back to the raw date portion

        speaker = sig.get("speaker", "")
        duration = sig.get("voice_duration", 0)
        duration_str = format_duration(int(duration)) if duration else ""
        emoji = type_emoji.get(sig.get("type", ""), "🎤")

        summary = sig.get("summary", "")
        if summary.startswith("[Voice @"):
            # Strip the "[Voice @...] " attribution prefix baked into the summary.
            summary = summary.split("] ", 1)[-1] if "] " in summary else summary

        meta_parts = [f"@{speaker}" if speaker else "", date_str, duration_str]
        meta = " · ".join(filter(None, meta_parts))
        lines.append(f"{emoji} *{meta}*\n _{summary[:100]}_\n")

    await update.message.reply_text("\n".join(lines), parse_mode="Markdown")
|
||||
62
thirdeye/backend/config.py
Normal file
62
thirdeye/backend/config.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""Central app configuration — every value is read from environment variables (.env)."""
import os
from dotenv import load_dotenv

load_dotenv()

# Telegram
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN")

# Ollama (local)
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434/v1")
OLLAMA_ENABLED = os.getenv("OLLAMA_ENABLED", "true").lower() == "true"

# LLM Providers
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# Additional Groq keys for round-robin rotation (avoids rate limits on llama-3.3-70b-versatile)
GROQ_API_KEY_2 = os.getenv("GROQ_API_KEY_2")
GROQ_API_KEY_3 = os.getenv("GROQ_API_KEY_3")
CEREBRAS_API_KEY = os.getenv("CEREBRAS_API_KEY")
SAMBANOVA_API_KEY = os.getenv("SAMBANOVA_API_KEY")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Embeddings (Cohere primary; a local model is used as fallback elsewhere)
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

# App
CHROMA_DB_PATH = os.getenv("CHROMA_DB_PATH", "./chroma_db")
BATCH_SIZE = int(os.getenv("BATCH_SIZE", "5"))  # messages per extraction batch
BATCH_TIMEOUT_SECONDS = int(os.getenv("BATCH_TIMEOUT_SECONDS", "60"))

# Web Search
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

# Feature Flags
ENABLE_DOCUMENT_INGESTION = os.getenv("ENABLE_DOCUMENT_INGESTION", "true").lower() == "true"
ENABLE_WEB_SEARCH = os.getenv("ENABLE_WEB_SEARCH", "true").lower() == "true"
ENABLE_LINK_FETCH = os.getenv("ENABLE_LINK_FETCH", "true").lower() == "true"

# Google Meet Extension
# NOTE(review): the default secret is a placeholder — must be overridden in production.
MEET_INGEST_SECRET = os.getenv("MEET_INGEST_SECRET", "thirdeye_meet_secret_change_me")
MEET_DEFAULT_GROUP_ID = os.getenv("MEET_DEFAULT_GROUP_ID", "meet_sessions")
ENABLE_MEET_INGESTION = os.getenv("ENABLE_MEET_INGESTION", "true").lower() == "true"
# Comma-separated list of group ids to cross-reference Meet signals against.
MEET_CROSS_REF_GROUPS = [
    g.strip() for g in os.getenv("MEET_CROSS_REF_GROUPS", "").split(",") if g.strip()
]

# Jira (Cloud REST API; basic auth with email + API token)
JIRA_BASE_URL = os.getenv("JIRA_BASE_URL", "").rstrip("/")
JIRA_EMAIL = os.getenv("JIRA_EMAIL", "")
JIRA_API_TOKEN = os.getenv("JIRA_API_TOKEN", "")
JIRA_DEFAULT_PROJECT = os.getenv("JIRA_DEFAULT_PROJECT", "ENG")
JIRA_DEFAULT_ISSUE_TYPE = os.getenv("JIRA_DEFAULT_ISSUE_TYPE", "Task")
ENABLE_JIRA = os.getenv("ENABLE_JIRA", "true").lower() == "true"
JIRA_AUTO_RAISE = os.getenv("JIRA_AUTO_RAISE", "false").lower() == "true"
JIRA_AUTO_RAISE_SEVERITY = os.getenv("JIRA_AUTO_RAISE_SEVERITY", "high")

# Voice Message Intelligence
ENABLE_VOICE_TRANSCRIPTION = os.getenv("ENABLE_VOICE_TRANSCRIPTION", "true").lower() == "true"
VOICE_MAX_DURATION_SECONDS = int(os.getenv("VOICE_MAX_DURATION_SECONDS", "300"))
VOICE_MIN_DURATION_SECONDS = int(os.getenv("VOICE_MIN_DURATION_SECONDS", "2"))
VOICE_LANGUAGE = os.getenv("VOICE_LANGUAGE", "")  # empty string = Whisper auto-detects
VOICE_STORE_TRANSCRIPT = os.getenv("VOICE_STORE_TRANSCRIPT", "true").lower() == "true"
|
||||
279
thirdeye/backend/db/chroma.py
Normal file
279
thirdeye/backend/db/chroma.py
Normal file
@@ -0,0 +1,279 @@
|
||||
"""ChromaDB setup and operations."""
|
||||
import json
|
||||
import uuid
|
||||
import chromadb
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from backend.config import CHROMA_DB_PATH
|
||||
from backend.db.embeddings import embed_texts, embed_query
|
||||
|
||||
logger = logging.getLogger("thirdeye.chroma")
|
||||
|
||||
# Initialize persistent client
|
||||
_chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
|
||||
|
||||
|
||||
def get_collection(group_id: str) -> chromadb.Collection:
    """Return the ChromaDB collection backing a group, creating it on first use."""
    # Collection names allow 3-63 chars of alphanumerics/underscores only,
    # so swap hyphens for underscores and clamp the length.
    name = f"ll_{group_id.replace('-', '_')}"[:63]
    return _chroma_client.get_or_create_collection(name=name)
|
||||
|
||||
|
||||
def set_group_name(group_id: str, name: str):
    """Persist the human-readable Telegram group name in the collection metadata."""
    if not name or name == group_id:
        return  # nothing meaningful to record
    try:
        collection = get_collection(group_id)
        meta = dict(collection.metadata or {})
        if meta.get("group_name") == name:
            return  # already up to date — skip the write
        meta["group_name"] = name
        collection.modify(metadata=meta)
    except Exception as e:
        # Best-effort: a naming failure must never break ingestion.
        logger.warning(f"set_group_name failed for {group_id}: {e}")
|
||||
|
||||
|
||||
def get_group_names() -> dict[str, str]:
    """Return a mapping of group_id -> human-readable name (falls back to group_id).

    Only collections created by this app (``ll_`` prefix) are considered.
    """
    result = {}
    for col in _chroma_client.list_collections():
        if not col.name.startswith("ll_"):
            continue
        # Strip only the leading "ll_" — str.replace("ll_", "") would also
        # mangle an "ll_" occurring later in the name. This now matches the
        # prefix-slice used by query_signals_global.
        group_id = col.name[len("ll_"):].replace("_", "-")
        # NOTE(review): the "_" -> "-" reversal is lossy for group ids that
        # legitimately contain underscores (e.g. "meet_sessions") — confirm.
        result[group_id] = (col.metadata or {}).get("group_name", group_id)
    return result
|
||||
|
||||
|
||||
def store_signals(group_id: str, signals: list[dict]):
    """Store extracted signals in ChromaDB with embeddings.

    Each signal dict is flattened into:
      - a document string ("type: summary | Quote: ...") used for embedding,
      - a scalar-only metadata dict (lists are JSON-encoded — Chroma metadata
        values must be scalars),
      - an id (the signal's own "id" if present, else a fresh UUID).
    """
    if not signals:
        return

    collection = get_collection(group_id)
    documents = []
    metadatas = []
    ids = []

    for signal in signals:
        # Embeddable text: type + summary, with the raw quote appended when present.
        doc_text = f"{signal['type']}: {signal['summary']}"
        if signal.get('raw_quote'):
            doc_text += f" | Quote: {signal['raw_quote']}"

        documents.append(doc_text)
        metadatas.append({
            "type": signal.get("type", "unknown"),
            "severity": signal.get("severity", "low"),
            "status": signal.get("status", "unknown"),
            "sentiment": signal.get("sentiment", "neutral"),
            "urgency": signal.get("urgency", "none"),
            # Lists serialized to JSON strings (Chroma metadata is scalar-only).
            "entities": json.dumps(signal.get("entities", [])),
            "keywords": json.dumps(signal.get("keywords", [])),
            "raw_quote": signal.get("raw_quote", ""),
            "summary": signal.get("summary", ""),
            # Naive UTC ISO string when the signal carries no timestamp.
            "timestamp": signal.get("timestamp", datetime.utcnow().isoformat()),
            "group_id": group_id,
            "lens": signal.get("lens", "unknown"),
            "meeting_id": signal.get("meeting_id", ""),
            # Voice attribution — preserved so /voicelog and /ask can cite the source
            "source": signal.get("source", ""),
            "speaker": signal.get("speaker", ""),
            "voice_file_id": signal.get("voice_file_id", ""),
            # Coerce to int; `or 0` guards against None/"" values.
            "voice_duration": int(signal.get("voice_duration", 0) or 0),
            "voice_language": signal.get("voice_language", ""),
            # Jira tracking fields (populated for jira_raised signals)
            "jira_key": signal.get("jira_key", ""),
            "jira_url": signal.get("jira_url", ""),
            "jira_summary": signal.get("jira_summary", ""),
            "jira_priority": signal.get("jira_priority", ""),
            "original_signal_id": signal.get("original_signal_id", ""),
        })
        ids.append(signal.get("id", str(uuid.uuid4())))

    # Generate embeddings (Cohere primary, local model fallback)
    embeddings = embed_texts(documents)

    collection.add(
        documents=documents,
        metadatas=metadatas,
        embeddings=embeddings,
        ids=ids,
    )
    logger.info(f"Stored {len(signals)} signals for group {group_id}")
|
||||
|
||||
|
||||
def query_signals(group_id: str, query: str, n_results: int = 10, signal_type: str = None) -> list[dict]:
    """Query the knowledge base with natural language.

    Optionally restricts results to a single signal type via a metadata filter.
    Returns [] on any query failure.
    """
    collection = get_collection(group_id)
    query_embedding = embed_query(query)
    where_filter = {"type": signal_type} if signal_type else None

    try:
        results = collection.query(
            query_embeddings=[query_embedding],
            # `or 1` keeps n_results valid when the collection is empty.
            n_results=min(n_results, collection.count() or 1),
            where=where_filter,
        )
    except Exception as e:
        logger.warning(f"Query failed: {e}")
        return []

    if not results or not results["documents"]:
        return []

    # Chroma nests results one level per query embedding; we sent one, so
    # everything lives at index [0].
    hits = []
    for i, doc in enumerate(results["documents"][0]):
        meta = results["metadatas"][0][i] if results["metadatas"] else {}
        distance = results["distances"][0][i] if results["distances"] else None
        hits.append({
            "id": results["ids"][0][i] if results.get("ids") else "",
            "document": doc,
            "metadata": meta,
            "relevance_score": 1 - (distance or 0),  # Convert distance to similarity
        })
    return hits
|
||||
|
||||
|
||||
def get_all_signals(group_id: str, signal_type: str = None) -> list[dict]:
    """Get all signals for a group (for pattern detection)."""
    collection = get_collection(group_id)
    total = collection.count()
    if total == 0:
        return []

    filters = {"type": signal_type} if signal_type else None
    try:
        batch = collection.get(where=filters, limit=total)
    except Exception:
        # Retry without the filter — NOTE(review): this silently returns
        # unfiltered results when the filtered get fails; confirm intended.
        batch = collection.get(limit=total)

    if not batch or not batch["documents"]:
        return []

    signals = []
    for idx, doc in enumerate(batch["documents"]):
        signals.append({
            "document": doc,
            "metadata": batch["metadatas"][idx] if batch["metadatas"] else {},
            "id": batch["ids"][idx],
        })
    return signals
|
||||
|
||||
|
||||
def get_group_ids() -> list[str]:
    """Get all group IDs that have collections."""
    return [
        # Strip only the leading "ll_" prefix — str.replace("ll_", "") would
        # also corrupt names containing "ll_" past position 0. Matches the
        # slicing used by query_signals_global.
        c.name[len("ll_"):].replace("_", "-")
        for c in _chroma_client.list_collections()
        if c.name.startswith("ll_")
    ]
|
||||
|
||||
|
||||
def query_signals_global(query: str, n_results: int = 5, exclude_group_id: str = None) -> list[dict]:
    """
    Search across ALL group collections for a query.
    Used as a cross-group fallback when local search returns weak results.
    Each result is annotated with its source group_id.

    Args:
        query: natural-language search text (embedded once, reused per collection).
        n_results: cap applied both per-collection and to the final merged list.
        exclude_group_id: skip this group (typically the caller's own group).
    """
    collections = _chroma_client.list_collections()
    query_embedding = embed_query(query)
    all_results = []

    for col_meta in collections:
        # Only app-owned collections carry the "ll_" prefix.
        if not col_meta.name.startswith("ll_"):
            continue

        # Derive group_id from collection name (prefix slice, then reverse
        # the '-' -> '_' mapping applied at creation time)
        raw = col_meta.name[len("ll_"):]
        group_id = raw.replace("_", "-")

        if exclude_group_id and group_id == exclude_group_id:
            continue

        try:
            col = _chroma_client.get_collection(col_meta.name)
            count = col.count()
            if count == 0:
                continue

            results = col.query(
                query_embeddings=[query_embedding],
                n_results=min(n_results, count),
            )

            # One query embedding was sent, so results nest at index [0].
            if results and results["documents"]:
                for i, doc in enumerate(results["documents"][0]):
                    meta = results["metadatas"][0][i] if results["metadatas"] else {}
                    distance = results["distances"][0][i] if results["distances"] else None
                    all_results.append({
                        "document": doc,
                        "metadata": meta,
                        "relevance_score": 1 - (distance or 0),
                        "source_group_id": group_id,
                    })
        except Exception as e:
            # One broken collection must not abort the whole cross-group search.
            logger.warning(f"Global query failed for collection {col_meta.name}: {e}")
            continue

    # Sort by relevance and return top n_results
    all_results.sort(key=lambda x: x["relevance_score"], reverse=True)
    return all_results[:n_results]
|
||||
|
||||
def mark_signal_as_raised(
    group_id: str,
    signal_id: str,
    jira_key: str,
    jira_url: str = "",
    jira_summary: str = "",
    jira_priority: str = "",
):
    """
    Tag a signal with its Jira ticket key so we never raise it twice.
    Adds a new signal of type 'jira_raised' linked to the original signal_id.

    Args:
        group_id: group whose collection receives the tracking signal.
        signal_id: id of the original signal the ticket was raised for.
        jira_key/jira_url/jira_summary/jira_priority: ticket details to record.
    """
    # NOTE: uuid and datetime are imported at module level; the previous
    # function-local re-imports were redundant and have been removed.
    tracking_signal = {
        "id": str(uuid.uuid4()),
        "type": "jira_raised",
        "summary": jira_summary or f"Jira ticket {jira_key} raised for signal {signal_id}",
        "raw_quote": signal_id,  # original signal_id — used by get_raised_signal_ids
        "severity": "low",
        "status": "raised",
        "sentiment": "neutral",
        "urgency": "none",
        "entities": [jira_key],
        "keywords": ["jira", jira_key, "raised"],
        "timestamp": datetime.utcnow().isoformat(),
        "group_id": group_id,
        "lens": "jira",
        # Jira tracking fields
        "jira_key": jira_key,
        "jira_url": jira_url,
        "jira_summary": jira_summary,
        "jira_priority": jira_priority,
        "original_signal_id": signal_id,
    }
    store_signals(group_id, [tracking_signal])
|
||||
|
||||
|
||||
def get_raised_signal_ids(group_id: str) -> set[str]:
    """
    Return the set of signal IDs that have already had Jira tickets raised.
    Used to prevent duplicates.
    """
    collection = get_collection(group_id)
    try:
        hits = collection.get(where={"type": "jira_raised"})
        found: set[str] = set()
        metadatas = hits.get("metadatas") if hits else None
        for meta in metadatas or []:
            # The original signal_id is stashed in the raw_quote field.
            original_id = meta.get("raw_quote")
            if original_id:
                found.add(original_id)
        return found
    except Exception:
        # Best-effort lookup: an empty set just means "nothing known raised".
        return set()
|
||||
67
thirdeye/backend/db/embeddings.py
Normal file
67
thirdeye/backend/db/embeddings.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""Embedding provider with Cohere primary and local fallback."""
|
||||
import cohere
|
||||
import logging
|
||||
from backend.config import COHERE_API_KEY
|
||||
|
||||
logger = logging.getLogger("thirdeye.embeddings")
|
||||
|
||||
_cohere_client = None
|
||||
_local_model = None
|
||||
|
||||
def _get_cohere():
    """Lazily build the shared Cohere client; None when no API key is configured."""
    global _cohere_client
    if _cohere_client is not None:
        return _cohere_client
    if COHERE_API_KEY:
        _cohere_client = cohere.Client(COHERE_API_KEY)
    return _cohere_client
|
||||
|
||||
def _get_local_model():
    """Lazily load the local sentence-transformers fallback model (singleton)."""
    global _local_model
    if _local_model is not None:
        return _local_model
    # Imported here so the heavy dependency only loads when Cohere is unavailable.
    from sentence_transformers import SentenceTransformer
    _local_model = SentenceTransformer("all-MiniLM-L6-v2")
    logger.info("Loaded local embedding model: all-MiniLM-L6-v2")
    return _local_model
|
||||
|
||||
|
||||
def embed_texts(texts: list[str]) -> list[list[float]]:
    """Embed a list of texts. Tries Cohere first, falls back to local model."""
    if not texts:
        return []

    # Primary path: Cohere hosted embeddings.
    cohere_client = _get_cohere()
    if cohere_client is not None:
        try:
            resp = cohere_client.embed(
                texts=texts,
                model="embed-english-v3.0",
                input_type="search_document",
            )
        except Exception as exc:
            logger.warning(f"Cohere embedding failed: {exc}, falling back to local")
        else:
            logger.info(f"Cohere embedded {len(texts)} texts")
            return [list(e) for e in resp.embeddings]

    # Fallback path: local sentence-transformers model.
    local = _get_local_model()
    vectors = local.encode(texts).tolist()
    logger.info(f"Local model embedded {len(texts)} texts")
    return vectors
|
||||
|
||||
|
||||
def embed_query(text: str) -> list[float]:
    """Embed a single query text (Cohere primary, local model fallback)."""
    client = _get_cohere()
    if client:
        try:
            response = client.embed(
                texts=[text],
                model="embed-english-v3.0",
                input_type="search_query",
            )
            return list(response.embeddings[0])
        except Exception as e:
            # Consistent with embed_texts: log the failure instead of silently
            # swallowing it, then fall through to the local model.
            logger.warning(f"Cohere query embedding failed: {e}, falling back to local")

    model = _get_local_model()
    return model.encode([text]).tolist()[0]
|
||||
57
thirdeye/backend/db/models.py
Normal file
57
thirdeye/backend/db/models.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Data models for ThirdEye."""
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
|
||||
|
||||
class Signal(BaseModel):
    """A single knowledge signal extracted from group conversation."""
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    group_id: str
    lens: str = "unknown"  # dev, product, client, community
    type: str  # architecture_decision, tech_debt, etc.
    summary: str
    entities: list[str] = []  # people/components/tools named in the signal
    severity: str = "low"  # low, medium, high, critical
    status: str = "unknown"  # proposed, decided, implemented, unresolved
    sentiment: str = "neutral"  # positive, neutral, negative, urgent (set by classifier)
    urgency: str = "none"  # none, low, medium, high, critical (set by classifier)
    raw_quote: str = ""  # verbatim quote the summary was derived from
    source_messages: list[int] = []  # presumably Telegram message ids — confirm
    timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat())  # naive UTC ISO-8601
    keywords: list[str] = []  # searchable keywords (set by classifier)
|
||||
|
||||
|
||||
class Pattern(BaseModel):
    """A trend detected across many signals within a single group."""
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    group_id: str
    type: str  # frequency_spike, knowledge_silo, recurring_issue, sentiment_trend, stale_item
    description: str
    severity: str = "info"  # info, warning, critical
    evidence_signal_ids: list[str] = []  # ids of the signals backing this pattern
    recommendation: str = ""  # suggested follow-up action, if any
    detected_at: str = Field(default_factory=lambda: datetime.utcnow().isoformat())  # naive UTC ISO-8601
    is_active: bool = True  # cleared when the pattern no longer holds
|
||||
|
||||
|
||||
class CrossGroupInsight(BaseModel):
    """A finding that spans two groups (e.g. a handoff blocked on both sides)."""
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    type: str  # blocked_handoff, conflicting_decision, information_silo, promise_reality_gap, duplicated_effort
    description: str
    group_a: dict = {}  # {name, group_id, evidence}
    group_b: dict = {}  # same shape as group_a
    severity: str = "warning"
    recommendation: str = ""
    detected_at: str = Field(default_factory=lambda: datetime.utcnow().isoformat())  # naive UTC ISO-8601
    is_resolved: bool = False
|
||||
|
||||
|
||||
class GroupConfig(BaseModel):
    """Per-group settings and counters."""
    group_id: str
    group_name: str = ""
    lens_mode: str = "auto"  # auto, dev, product, client, community
    detected_lens: str = "unknown"  # lens chosen by auto-detection when lens_mode == "auto"
    confidence: float = 0.0  # confidence of the lens detection
    is_active: bool = True
    message_count: int = 0  # running total of ingested messages
    signal_count: int = 0  # running total of extracted signals
|
||||
0
thirdeye/backend/integrations/__init__.py
Normal file
0
thirdeye/backend/integrations/__init__.py
Normal file
346
thirdeye/backend/integrations/jira_client.py
Normal file
346
thirdeye/backend/integrations/jira_client.py
Normal file
@@ -0,0 +1,346 @@
|
||||
"""
|
||||
Jira REST API v3 client — async, using httpx.
|
||||
All methods return plain dicts (no Jira SDK objects).
|
||||
Authentication: Basic auth with email + API token (Jira Cloud standard).
|
||||
Docs: https://developer.atlassian.com/cloud/jira/platform/rest/v3/
|
||||
"""
|
||||
import base64
|
||||
import logging
|
||||
from typing import Optional
|
||||
import httpx
|
||||
|
||||
from backend.config import (
|
||||
JIRA_BASE_URL, JIRA_EMAIL, JIRA_API_TOKEN, ENABLE_JIRA
|
||||
)
|
||||
|
||||
logger = logging.getLogger("thirdeye.integrations.jira")
|
||||
|
||||
# ─── Auth ────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _auth_header() -> dict:
    """Build the Basic auth header from email + API token (Jira Cloud standard)."""
    credentials = base64.b64encode(f"{JIRA_EMAIL}:{JIRA_API_TOKEN}".encode()).decode()
    return {
        "Authorization": f"Basic {credentials}",
        "Accept": "application/json",
        "Content-Type": "application/json",
    }
|
||||
|
||||
|
||||
def _base_url() -> str:
    """Root URL of the Jira Cloud REST v3 API for the configured site."""
    return JIRA_BASE_URL + "/rest/api/3"
|
||||
|
||||
|
||||
def is_configured() -> bool:
    """Return True if all required Jira config is set."""
    return all((JIRA_BASE_URL, JIRA_EMAIL, JIRA_API_TOKEN, ENABLE_JIRA))
|
||||
|
||||
|
||||
# ─── Core HTTP helpers ───────────────────────────────────────────────────────
|
||||
|
||||
async def _get(path: str, params: dict = None) -> dict:
    """GET a v3 endpoint; returns the decoded JSON body, raises on HTTP error."""
    url = f"{_base_url()}{path}"
    async with httpx.AsyncClient(timeout=15.0) as client:
        resp = await client.get(url, headers=_auth_header(), params=params or {})
    resp.raise_for_status()
    return resp.json()
|
||||
|
||||
|
||||
async def _post(path: str, body: dict) -> dict:
    """POST a JSON body to a v3 endpoint; returns the decoded response, raises on HTTP error."""
    url = f"{_base_url()}{path}"
    async with httpx.AsyncClient(timeout=15.0) as client:
        resp = await client.post(url, headers=_auth_header(), json=body)
    resp.raise_for_status()
    return resp.json()
|
||||
|
||||
|
||||
async def _put(path: str, body: dict) -> dict:
    """PUT a JSON body to a v3 endpoint; maps Jira's 204 success to {"ok": True}."""
    url = f"{_base_url()}{path}"
    async with httpx.AsyncClient(timeout=15.0) as client:
        resp = await client.put(url, headers=_auth_header(), json=body)
    resp.raise_for_status()
    # PUT /issue returns 204 No Content on success
    return {"ok": True} if resp.status_code == 204 else resp.json()
|
||||
|
||||
|
||||
# ─── Public API ──────────────────────────────────────────────────────────────
|
||||
|
||||
async def test_connection() -> dict:
    """
    Verify credentials work by calling /myself.
    Returns {"ok": True, "displayName": "...", "email": "..."} or {"ok": False, "error": "..."}
    """
    try:
        me = await _get("/myself")
    except httpx.HTTPStatusError as e:
        return {"ok": False, "error": f"HTTP {e.response.status_code}: {e.response.text[:200]}"}
    except Exception as e:
        return {"ok": False, "error": str(e)}
    return {
        "ok": True,
        "display_name": me.get("displayName", "Unknown"),
        "email": me.get("emailAddress", "Unknown"),
        "account_id": me.get("accountId", ""),
    }
|
||||
|
||||
|
||||
async def list_projects() -> list[dict]:
    """
    List all accessible Jira projects.
    Returns list of {"key": "ENG", "name": "Engineering", "id": "10001"}
    """
    data = await _get("/project/search", params={"maxResults": 50})
    projects = []
    for p in data.get("values", []):
        projects.append({
            "key": p["key"],
            "name": p["name"],
            "id": p["id"],
            "type": p.get("projectTypeKey", "software"),
        })
    return projects
|
||||
|
||||
|
||||
async def list_issue_types(project_key: str) -> list[dict]:
    """
    List issue types available for a specific project.
    Returns list of {"id": "10001", "name": "Bug", "subtask": False}
    """
    data = await _get(f"/project/{project_key}")
    types = []
    for it in data.get("issueTypes", []):
        if it.get("subtask", False):
            continue  # Exclude subtask types
        types.append({
            "id": it["id"],
            "name": it["name"],
            "subtask": it.get("subtask", False),
        })
    return types
|
||||
|
||||
|
||||
async def get_issue(issue_key: str) -> dict:
    """
    Get a single issue by key (e.g. "ENG-42").
    Returns simplified issue dict.

    Nullable nested fields are guarded with ``or {}``: Jira returns JSON null
    (not a missing key) for unset priority/status/issuetype, so a plain
    ``.get(..., {})`` still yields None and crashes on the inner ``.get()``.
    The assignee field already used this guard — now applied consistently.
    """
    data = await _get(f"/issue/{issue_key}")
    fields = data.get("fields", {})
    return {
        "key": data["key"],
        "id": data["id"],
        "summary": fields.get("summary", ""),
        "status": (fields.get("status") or {}).get("name", "Unknown"),
        "priority": (fields.get("priority") or {}).get("name", "Medium"),
        "assignee": (fields.get("assignee") or {}).get("displayName", "Unassigned"),
        "issue_type": (fields.get("issuetype") or {}).get("name", "Task"),
        "url": f"{JIRA_BASE_URL}/browse/{data['key']}",
        "created": fields.get("created", ""),
        "updated": fields.get("updated", ""),
    }
|
||||
|
||||
|
||||
async def create_issue(
    project_key: str,
    summary: str,
    description: str,
    issue_type: str = "Task",
    priority: str = "Medium",
    labels: list[str] = None,
    assignee_account_id: str = None,
) -> dict:
    """
    Create a new Jira issue.

    Args:
        project_key: Project key (e.g. "ENG")
        summary: Issue title (max ~250 chars)
        description: Full description in Atlassian Document Format (ADF)
        issue_type: "Task", "Bug", "Story", "Epic"
        priority: "Highest", "High", "Medium", "Low", "Lowest"
        labels: List of label strings (no spaces allowed in labels)
        assignee_account_id: Jira account ID to assign to (optional)

    Returns:
        {"key": "ENG-42", "id": "10042", "url": "https://..."} with "ok": True,
        or {"ok": False, "error": ..., "details": ...} on failure — this
        function never raises for HTTP errors.
    """
    fields: dict = {
        "project": {"key": project_key},
        "summary": summary[:255],  # Jira hard limit
        # _text_to_adf converts plain text to ADF — presumably defined later
        # in this module (outside the reviewed span); confirm.
        "description": _text_to_adf(description),
        "issuetype": {"name": issue_type},
        "priority": {"name": priority},
    }

    if labels:
        # Jira labels cannot have spaces — replace with hyphens
        fields["labels"] = [l.replace(" ", "-") for l in labels]

    if assignee_account_id:
        fields["assignee"] = {"accountId": assignee_account_id}

    body = {"fields": fields}

    try:
        data = await _post("/issue", body)
        issue_key = data["key"]
        return {
            "ok": True,
            "key": issue_key,
            "id": data["id"],
            "url": f"{JIRA_BASE_URL}/browse/{issue_key}",
        }
    except httpx.HTTPStatusError as e:
        # Surface Jira's structured validation errors when the body is JSON.
        error_body = {}
        try:
            error_body = e.response.json()
        except Exception:
            pass
        errors = error_body.get("errors", {})
        messages = error_body.get("errorMessages", [])
        return {
            "ok": False,
            "error": f"HTTP {e.response.status_code}",
            "details": errors or messages or e.response.text[:300],
        }
    except Exception as e:
        return {"ok": False, "error": str(e)}
|
||||
|
||||
|
||||
async def search_issues(jql: str, max_results: int = 10) -> list[dict]:
    """
    Search issues using JQL (Jira Query Language).
    Example JQL: 'project = ENG AND labels = thirdeye AND status != Done'

    Args:
        jql: JQL query string.
        max_results: Maximum number of issues to return.

    Returns:
        List of simplified issue dicts:
        {"key", "summary", "status", "priority", "assignee", "issue_type",
         "labels", "url"}.
    """
    data = await _get("/search/jql", params={
        "jql": jql,
        "maxResults": max_results,
        "fields": "summary,status,priority,assignee,issuetype,labels,created",
    })
    results = []
    for issue in data.get("issues", []):
        fields = issue.get("fields", {})
        # Jira returns an explicit JSON null for unset fields (e.g. priority,
        # assignee). `fields.get(x, {})` does NOT guard against that — the key
        # exists with value None — so use `(... or {})` consistently to avoid
        # calling .get on None. The original only guarded `assignee` this way.
        results.append({
            "key": issue["key"],
            "summary": fields.get("summary") or "",
            "status": (fields.get("status") or {}).get("name", "Unknown"),
            "priority": (fields.get("priority") or {}).get("name", "Medium"),
            "assignee": (fields.get("assignee") or {}).get("displayName", "Unassigned"),
            "issue_type": (fields.get("issuetype") or {}).get("name", "Task"),
            "labels": fields.get("labels") or [],
            "url": f"{JIRA_BASE_URL}/browse/{issue['key']}",
        })
    return results
|
||||
|
||||
|
||||
async def search_users(query: str, max_results: int = 10) -> list[dict]:
    """
    Search Jira users by display name or email fragment.

    Only active accounts are returned. Any error (network, auth, parsing)
    is logged and swallowed, yielding an empty list.

    Returns list of {"account_id", "display_name", "email", "active"}.
    """
    try:
        users = await _get("/user/search", params={"query": query, "maxResults": max_results})
        matches: list[dict] = []
        for user in users:
            if not user.get("active", True):
                continue  # skip deactivated accounts
            matches.append({
                "account_id": user.get("accountId", ""),
                "display_name": user.get("displayName", ""),
                "email": user.get("emailAddress", ""),
                "active": user.get("active", True),
            })
        return matches
    except Exception as e:
        logger.warning(f"User search failed for '{query}': {e}")
        return []
|
||||
|
||||
|
||||
async def assign_issue(issue_key: str, account_id: str) -> dict:
    """
    Assign a Jira issue to a user by their Jira account ID.

    Returns {"ok": True} on success or {"ok": False, "error": "..."}.
    """
    try:
        await _put(f"/issue/{issue_key}/assignee", {"accountId": account_id})
    except httpx.HTTPStatusError as e:
        # HTTP-level failure: surface status code plus a trimmed response body.
        return {"ok": False, "error": f"HTTP {e.response.status_code}: {e.response.text[:200]}"}
    except Exception as e:
        return {"ok": False, "error": str(e)}
    return {"ok": True}
|
||||
|
||||
|
||||
async def add_comment(issue_key: str, comment: str) -> dict:
    """Add a plain-text comment to an existing issue.

    The comment text is converted to ADF (Jira Cloud requires it).
    Returns {"ok": True, "id": ...} or {"ok": False, "error": "..."}.
    """
    try:
        payload = {"body": _text_to_adf(comment)}
        data = await _post(f"/issue/{issue_key}/comment", payload)
        return {"ok": True, "id": data.get("id")}
    except Exception as e:
        return {"ok": False, "error": str(e)}
|
||||
|
||||
|
||||
# ─── ADF helper ──────────────────────────────────────────────────────────────
|
||||
|
||||
def _text_to_adf(text: str) -> dict:
|
||||
"""
|
||||
Convert plain text to Atlassian Document Format (ADF).
|
||||
Jira Cloud requires ADF for description/comment fields (not plain strings).
|
||||
Splits on double newlines to create separate paragraphs.
|
||||
"""
|
||||
paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
|
||||
if not paragraphs:
|
||||
paragraphs = [text.strip() or "(no description)"]
|
||||
|
||||
content = []
|
||||
for para in paragraphs:
|
||||
# Handle bullet lines within a paragraph (lines starting with - or *)
|
||||
lines = para.split("\n")
|
||||
bullet_items = [l.lstrip("-* ").strip() for l in lines if l.strip().startswith(("-", "*", "•"))]
|
||||
non_bullets = [l for l in lines if not l.strip().startswith(("-", "*", "•"))]
|
||||
|
||||
if non_bullets:
|
||||
content.append({
|
||||
"type": "paragraph",
|
||||
"content": [{"type": "text", "text": " ".join(non_bullets)}],
|
||||
})
|
||||
|
||||
if bullet_items:
|
||||
content.append({
|
||||
"type": "bulletList",
|
||||
"content": [
|
||||
{
|
||||
"type": "listItem",
|
||||
"content": [
|
||||
{
|
||||
"type": "paragraph",
|
||||
"content": [{"type": "text", "text": item}],
|
||||
}
|
||||
],
|
||||
}
|
||||
for item in bullet_items
|
||||
],
|
||||
})
|
||||
|
||||
return {
|
||||
"type": "doc",
|
||||
"version": 1,
|
||||
"content": content or [
|
||||
{"type": "paragraph", "content": [{"type": "text", "text": "(no description)"}]}
|
||||
],
|
||||
}
|
||||
|
||||
288
thirdeye/backend/pipeline.py
Normal file
288
thirdeye/backend/pipeline.py
Normal file
@@ -0,0 +1,288 @@
|
||||
"""Core pipeline: message batch → signals → classified → stored → queryable."""
|
||||
import asyncio
|
||||
import logging
|
||||
from backend.agents.signal_extractor import extract_signals
|
||||
from backend.agents.classifier import classify_signal
|
||||
from backend.agents.context_detector import detect_context
|
||||
from backend.db.chroma import store_signals, query_signals
|
||||
from backend.db.models import Signal
|
||||
|
||||
logger = logging.getLogger("thirdeye.pipeline")
|
||||
|
||||
# In-memory group config store (replace with Redis/DB for production)
# Maps group_id -> {"lens": str, "confidence": float}; contents are lost on
# process restart, so lenses are re-detected after a redeploy.
_group_configs = {}
|
||||
|
||||
|
||||
async def detect_and_set_lens(group_id: str, messages_text: str) -> str:
    """Auto-detect the lens for a group from its initial messages.

    Runs the context detector, caches lens + confidence in the in-memory
    group config store, and returns the detected lens.
    """
    detection = await detect_context(messages_text)
    lens = detection["detected_lens"]
    confidence = detection["confidence"]
    _group_configs[group_id] = {"lens": lens, "confidence": confidence}
    logger.info(f"Group {group_id}: lens={lens} (conf={confidence})")
    return lens
|
||||
|
||||
|
||||
def get_lens(group_id: str) -> str:
    """Return the current lens for a group, defaulting to "dev" when unset."""
    return _group_configs.get(group_id, {}).get("lens", "dev")
|
||||
|
||||
|
||||
def set_lens(group_id: str, lens: str):
    """Manually override a group's lens (confidence pinned to 1.0)."""
    _group_configs[group_id] = dict(lens=lens, confidence=1.0)
|
||||
|
||||
async def _auto_raise_and_notify(group_id: str, signals: list[dict]):
    """
    Background task: raise Jira tickets for critical signals and log results.

    Called automatically when JIRA_AUTO_RAISE=true in .env.
    Does NOT send Telegram messages (no bot context here) — check logs or /jiraraised.

    Args:
        group_id: Telegram group ID the signals came from.
        signals: Signal dicts to consider for ticket creation.
    """
    # `logging` is already imported at module scope — the redundant in-function
    # import was removed. Use a dedicated child logger so auto-raise activity
    # can be filtered separately, and reuse it in the except branch instead of
    # constructing it a second time.
    task_logger = logging.getLogger("thirdeye.pipeline.auto_raise")

    try:
        from backend.agents.jira_agent import bulk_raise_for_group
        results = await bulk_raise_for_group(
            group_id=group_id,
            signals=signals,
            min_severity="high",
            max_tickets=5,
        )
        raised = [r for r in results if r.get("ok")]
        if raised:
            task_logger.info(
                f"[Auto-raise] Group {group_id}: {len(raised)} ticket(s) raised — "
                + ", ".join(r.get("key", "?") for r in raised)
            )
    except Exception as e:
        task_logger.error(f"Auto-raise failed: {e}")
|
||||
|
||||
|
||||
async def process_message_batch(group_id: str, messages: list[dict]) -> list[Signal]:
    """
    Process a batch of messages through the full pipeline.

    Flow: format for the LLM → (auto-)detect lens → extract signals →
    classify in parallel → store in ChromaDB → optionally fire-and-forget
    Jira auto-raise for high/critical signals.

    Args:
        group_id: Telegram group ID
        messages: List of {"sender": str, "text": str, "timestamp": str}

    Returns:
        List of stored Signal objects
    """
    # Format messages for the LLM
    formatted = "\n".join(f"[{m['sender']}]: {m['text']}" for m in messages)

    # Get or detect lens
    lens = get_lens(group_id)
    if lens == "dev" and group_id not in _group_configs:
        # First time seeing this group — auto-detect
        lens = await detect_and_set_lens(group_id, formatted)

    # Step 1: Extract signals
    signals = await extract_signals(formatted, group_id, lens=lens)

    if not signals:
        logger.info(f"No signals extracted from batch in {group_id}")
        return []

    # Step 2: Classify each signal (parallel for speed)
    classified_signals = await asyncio.gather(*[classify_signal(s) for s in signals])

    # Step 3: Store in ChromaDB
    store_signals(group_id, [s.model_dump() for s in classified_signals])

    # Step 4: Auto-raise Jira tickets for critical signals (fire-and-forget).
    from backend.config import JIRA_AUTO_RAISE, ENABLE_JIRA

    if ENABLE_JIRA and JIRA_AUTO_RAISE:
        # Fix: signals here are pydantic Signal models, not dicts — the old
        # `s.get("severity", "low")` raised AttributeError, and
        # _auto_raise_and_notify expects plain dicts. Filter via getattr
        # (defaulting to "low" if the model lacks a severity field — confirm
        # against the Signal schema) and hand over model dumps.
        critical_signals = [
            s.model_dump() for s in classified_signals
            if getattr(s, "severity", "low") in ("high", "critical")
        ]
        if critical_signals:
            asyncio.create_task(
                _auto_raise_and_notify(group_id, critical_signals)
            )

    logger.info(f"Pipeline complete: {len(classified_signals)} signals stored for {group_id}")
    return classified_signals
|
||||
|
||||
|
||||
async def query_knowledge(group_id: str, question: str, force_web_search: bool = False) -> str:
    """
    Query the knowledge base with natural language, with cross-group fallback and
    conservative web search (only when all internal sources fail).

    Flow:
    1. Search this group's knowledge base (ChromaDB)
    2. If results are weak, also search all other groups (cross-group fallback)
    3. Only hit the web if no internal knowledge is found AND question is clearly external
    4. LLM synthesizes the best available context into a final answer

    Args:
        group_id: Telegram group whose collection is searched first.
        question: Natural-language question.
        force_web_search: When True, web search runs regardless of internal hits.

    Returns:
        A plain-text answer string (always ends with a "Sources:" footer on
        success), or a fallback message when nothing is found or the LLM fails.
    """
    # Imports are local to keep module import cheap and avoid cycles.
    from backend.providers import call_llm
    from backend.agents.web_search import search_web, format_search_results_for_llm
    from backend.config import ENABLE_WEB_SEARCH
    from backend.db.chroma import query_signals_global

    # ── Step 1: search this group's own collection ──────────────────────────────
    results = query_signals(group_id, question, n_results=8)

    # A result is "strong" when the top hit has high semantic similarity (≥ 0.40)
    STRONG_THRESHOLD = 0.40
    has_strong_local = bool(results) and results[0].get("relevance_score", 0) >= STRONG_THRESHOLD

    # ── Step 2: cross-group fallback ────────────────────────────────────────────
    cross_results = []
    if not has_strong_local:
        cross_results = query_signals_global(question, n_results=8, exclude_group_id=group_id)

    # Combine: local results first, then cross-group ones (de-duplicated by document text)
    seen_docs = {r["document"] for r in results}
    for cr in cross_results:
        if cr["document"] not in seen_docs:
            results.append(cr)
            seen_docs.add(cr["document"])

    # ── Recency re-ranking ───────────────────────────────────────────────────────
    # Boost signals that are recent so a fresh update beats an older one on the
    # same topic. Boost is +0.4 for brand-new and decays exponentially with a
    # half-life of ~33 hours (time constant 48 h), i.e. near zero after ~a week.
    from datetime import datetime, timezone
    import math
    now = datetime.now(timezone.utc)

    def _recency_boost(ts_str: str) -> float:
        # Unparseable or missing timestamps get no boost rather than an error.
        try:
            ts = datetime.fromisoformat(ts_str)
            if ts.tzinfo is None:
                ts = ts.replace(tzinfo=timezone.utc)
            age_hours = max(0, (now - ts).total_seconds() / 3600)
            return 0.4 * math.exp(-age_hours / 48)  # half-life ≈ 33 hours
        except Exception:
            return 0.0

    # NOTE(review): this mutates the result dicts in place by adding
    # "_ranked_score" — downstream consumers of these dicts will see the key.
    for r in results:
        ts = r.get("metadata", {}).get("timestamp", "")
        r["_ranked_score"] = r.get("relevance_score", 0) + _recency_boost(ts)

    results.sort(key=lambda x: x["_ranked_score"], reverse=True)

    # Re-evaluate strength after combining and re-ranking.
    # NOTE(review): this checks the raw relevance_score of whichever result is
    # now ranked first (post-boost) — a recency-boosted top hit with a weak raw
    # score still counts as "no internal knowledge". Confirm that's intended.
    has_any_internal = bool(results) and results[0].get("relevance_score", 0) >= STRONG_THRESHOLD

    # ── Build internal context ───────────────────────────────────────────────────
    # Results are already sorted by (relevance + recency). The first result is the
    # best match. We label it explicitly so even small fallback models can't miss it.
    from backend.agents.query_agent import _format_signal_for_context, VOICE_CITATION_INSTRUCTION

    internal_context = ""
    has_voice_signals = False
    if results:
        context_parts = []
        for i, r in enumerate(results):
            meta = r["metadata"]
            # NOTE(review): source_group is assigned but never used below.
            source_group = r.get("source_group_id")

            # Voice-derived signals trigger the extra citation instruction later.
            if meta.get("source") == "voice" or meta.get("type") == "voice_transcript":
                has_voice_signals = True

            # Rich source label using voice-aware formatter
            signal_label = _format_signal_for_context(r)

            rank_header = (
                "*** BEST MATCH (use this as your primary answer) ***\n"
                if i == 0 else
                f"(supporting context {i+1})\n"
            )
            context_parts.append(
                f"{rank_header}"
                f"{signal_label}\n"
                f"Content: {r['document']}\n"
                f"Entities: {meta.get('entities', '[]')}"
            )
        internal_context = "\n\n---\n\n".join(context_parts)

    # ── Step 3: web search — only when all internal sources fail ────────────────
    # Only keywords that are clearly external / internet-specific trigger web search.
    # Intentionally excludes personal/team words like "update", "current", "what is".
    web_keywords = [
        "latest news", "industry standard", "best practice", "benchmark",
        "security vulnerability", "cve", "public release", "changelog",
        "documentation for", "how to install", "npm package", "pypi",
    ]
    question_lower = question.lower()
    wants_external = any(kw in question_lower for kw in web_keywords)

    # Web search fires only when: explicitly forced, OR no internal knowledge at all
    # AND the question looks like it's asking about something external.
    should_search_web = ENABLE_WEB_SEARCH and (
        force_web_search
        or (not has_any_internal and wants_external)
    )

    web_context = ""
    used_web = False
    if should_search_web:
        web_results = await search_web(question, max_results=3)
        if web_results:
            web_context = format_search_results_for_llm(web_results)
            used_web = True

    # ── Step 4: build combined prompt ───────────────────────────────────────────
    if not internal_context and not web_context:
        return (
            "I don't have any information about that yet across all team chats. "
            "The relevant group may need more conversation, or try /search for external info."
        )

    combined_context = ""
    if internal_context:
        combined_context += (
            "=== INTERNAL KNOWLEDGE BASE (from team conversations & documents) ===\n\n"
            f"{internal_context}\n\n"
        )
    if web_context:
        combined_context += f"=== WEB SEARCH RESULTS ===\n\n{web_context}\n\n"

    system_prompt = """You are the Query Agent for ThirdEye. Answer the question using the context below.

The context is sorted: the BEST MATCH signal appears first and is your primary source.
Older or supporting signals appear after it — they may be outdated, so prefer the BEST MATCH.

RULES:
- Answer from the BEST MATCH signal first. Only use other signals as supporting context.
- Quote exact numbers, dates, and durations directly — never paraphrase them.
- If a signal has a "Quote:" field, that is the verbatim team message — treat it as ground truth.
- Signals from "other group" are still internal team knowledge.
- Be concise (2-3 sentences). Plain text only, no markdown headers."""

    if has_voice_signals:
        system_prompt += VOICE_CITATION_INSTRUCTION

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Context:\n\n{combined_context}\n\nQuestion: {question}"},
    ]

    try:
        result = await call_llm("fast_large", messages, temperature=0.3, max_tokens=600)
        answer = result["content"]

        # Append a footer naming which source pools contributed context.
        sources = []
        if internal_context:
            sources.append("knowledge base")
        if used_web:
            sources.append("web search")
        answer += f"\n\n📌 Sources: {' + '.join(sources)}"

        return answer
    except Exception as e:
        logger.error(f"Query agent failed: {e}")
        return "Sorry, I encountered an error while searching. Please try again."
|
||||
|
||||
|
||||
177
thirdeye/backend/providers.py
Normal file
177
thirdeye/backend/providers.py
Normal file
@@ -0,0 +1,177 @@
|
||||
"""Multi-provider LLM router with automatic fallback on rate limits.
|
||||
|
||||
Groq pool: up to 3 API keys (GROQ_API_KEY, GROQ_API_KEY_2, GROQ_API_KEY_3) all running
|
||||
llama-3.3-70b-versatile. Calls are round-robined across the pool so the per-key rate
|
||||
limit is shared evenly. When a key is rate-limited the router falls through to the next
|
||||
key in rotation, then to the rest of the fallback chain.
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from openai import AsyncOpenAI
|
||||
from backend.config import (
|
||||
GROQ_API_KEY, GROQ_API_KEY_2, GROQ_API_KEY_3,
|
||||
CEREBRAS_API_KEY, SAMBANOVA_API_KEY,
|
||||
OPENROUTER_API_KEY, GEMINI_API_KEY,
|
||||
OLLAMA_BASE_URL, OLLAMA_ENABLED,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("thirdeye.providers")
|
||||
|
||||
# ── Client registry ──────────────────────────────────────────────────────────
# Maps provider name → configured AsyncOpenAI client. Populated below at
# import time; providers without a usable API key are simply absent.
_clients: dict[str, AsyncOpenAI] = {}
|
||||
|
||||
def _init_client(name: str, base_url: str, api_key: str | None):
    """Register an AsyncOpenAI client under *name* when a plausible key is set.

    Missing/empty keys and keys of 5 characters or fewer are treated as unset,
    so placeholder values never create broken clients.
    """
    if not api_key or len(api_key) <= 5:
        return
    _clients[name] = AsyncOpenAI(base_url=base_url, api_key=api_key)
|
||||
|
||||
# Ollama (local) — uses a dummy key; the OpenAI client requires a non-empty value
if OLLAMA_ENABLED:
    _clients["ollama"] = AsyncOpenAI(base_url=OLLAMA_BASE_URL, api_key="ollama")

# Groq pool: register each key under its own name
_init_client("groq", "https://api.groq.com/openai/v1", GROQ_API_KEY)
_init_client("groq_2", "https://api.groq.com/openai/v1", GROQ_API_KEY_2)
_init_client("groq_3", "https://api.groq.com/openai/v1", GROQ_API_KEY_3)

# Remaining hosted providers; each only registers when its key is configured.
_init_client("cerebras", "https://api.cerebras.ai/v1", CEREBRAS_API_KEY)
_init_client("sambanova", "https://api.sambanova.ai/v1", SAMBANOVA_API_KEY)
_init_client("openrouter", "https://openrouter.ai/api/v1", OPENROUTER_API_KEY)
_init_client("google", "https://generativelanguage.googleapis.com/v1beta/openai/", GEMINI_API_KEY)

# Which provider names belong to the Groq pool (only those actually registered)
_GROQ_POOL = [name for name in ("groq", "groq_2", "groq_3") if name in _clients]
_GROQ_MODEL = "llama-3.3-70b-versatile"

# Round-robin cursor per task_type (incremented after every call attempt on the pool)
_rr_cursor: dict[str, int] = defaultdict(int)

# ── Model registry ───────────────────────────────────────────────────────────
# Groq pool entries are expanded dynamically at call time so the cursor drives order.
# The sentinel value _GROQ_POOL_SENTINEL in a registry entry means
# "use all available Groq keys" (expanded by _expand_candidates).
_GROQ_POOL_SENTINEL = "__groq_pool__"
|
||||
|
||||
# task_type → ordered fallback chain of (provider_name, model_id). The chain
# for every task additionally gets the "fallback" tier appended at call time.
MODEL_REGISTRY: dict[str, list[tuple[str, str]]] = {
    # Small/low-latency models (e.g. classification-style calls).
    "fast_small": [
        ("ollama", "llama3:8b"),
        ("groq", "llama-3.1-8b-instant"),
        ("cerebras", "llama-3.1-8b"),
        ("openrouter", "openai/gpt-oss-20b:free"),
    ],
    # Larger fast models; the Groq pool is tried first.
    "fast_large": [
        (_GROQ_POOL_SENTINEL, _GROQ_MODEL),  # expands to all 3 Groq keys (round-robin)
        ("openrouter", "arcee-ai/trinity-large-preview:free"),
        ("openrouter", "meta-llama/llama-3.3-70b-instruct:free"),
        ("sambanova", "Meta-Llama-3.3-70B-Instruct"),
        ("cerebras", "llama3.1-8b"),
    ],
    # Slower reasoning-oriented models.
    "reasoning": [
        ("sambanova", "DeepSeek-R1-Distill-Llama-70B"),
        ("openrouter", "nvidia/nemotron-3-super-120b-a12b:free"),
        ("openrouter", "openai/gpt-oss-120b:free"),
    ],
    # Models used for tool-calling / agent loops.
    "agentic": [
        ("openrouter", "minimax/minimax-m2.5:free"),
        ("openrouter", "nvidia/nemotron-3-super-120b-a12b:free"),
        (_GROQ_POOL_SENTINEL, _GROQ_MODEL),
    ],
    # Last-resort tier appended to every task_type by _expand_candidates.
    "fallback": [
        ("openrouter", "openrouter/free"),
        ("google", "gemini-2.5-flash"),
    ],
}
|
||||
|
||||
|
||||
def _expand_candidates(task_type: str) -> list[tuple[str, str]]:
    """
    Build the ordered (provider_name, model) candidate list for *task_type*.

    The task's registry tier is taken first, then the shared "fallback" tier.
    Any Groq-pool sentinel entry is replaced in place by all available Groq
    keys, rotated so the key at the current round-robin cursor comes first.
    """
    candidates: list[tuple[str, str]] = []
    for provider, model in MODEL_REGISTRY.get(task_type, []) + MODEL_REGISTRY["fallback"]:
        if provider == _GROQ_POOL_SENTINEL:
            if _GROQ_POOL:
                # Rotate the pool: cursor position first, then wrap around.
                offset = _rr_cursor[task_type] % len(_GROQ_POOL)
                rotation = _GROQ_POOL[offset:] + _GROQ_POOL[:offset]
                candidates.extend((key_name, model) for key_name in rotation)
        else:
            candidates.append((provider, model))
    return candidates
|
||||
|
||||
|
||||
# ── Public API ────────────────────────────────────────────────────────────────
|
||||
|
||||
async def call_llm(
    task_type: str,
    messages: list,
    temperature: float = 0.3,
    max_tokens: int = 2000,
    response_format: dict | None = None,  # fixed annotation: default is None
) -> dict:
    """
    Route to the best available provider with automatic fallback.

    Candidates from _expand_candidates(task_type) are tried in order.
    Rate-limit/timeout failures fall through to the next candidate; a Groq-pool
    key additionally advances the round-robin cursor (on success AND on
    rate-limit) so the next call starts on a different key.

    Args:
        task_type: Registry tier name ("fast_small", "fast_large", ...).
        messages: Chat messages in OpenAI format.
        temperature: Sampling temperature.
        max_tokens: Completion token cap.
        response_format: Optional OpenAI response_format dict; not forwarded to
            providers that don't support it (currently "google").

    Returns:
        {"content": str, "provider": str, "model": str}

    Raises:
        RuntimeError: When every candidate provider fails (previously a bare
            Exception; RuntimeError is still caught by callers'
            ``except Exception`` handlers).
    """
    candidates = _expand_candidates(task_type)
    errors = []

    for provider_name, model_id in candidates:
        client = _clients.get(provider_name)
        if not client:
            # Provider had no API key configured — skip silently.
            continue

        try:
            kwargs = {
                "model": model_id,
                "messages": messages,
                "temperature": temperature,
                "max_tokens": max_tokens,
                "timeout": 45,
            }
            if response_format and provider_name not in ("google",):
                kwargs["response_format"] = response_format

            response = await client.chat.completions.create(**kwargs)
            content = response.choices[0].message.content

            # Advance round-robin cursor on success so next call starts from the
            # following key, distributing load evenly across the pool.
            if provider_name in _GROQ_POOL:
                _rr_cursor[task_type] = (_rr_cursor[task_type] + 1) % len(_GROQ_POOL)

            # Collapse extra pool keys to a readable "groq[keyN]" label.
            display_name = provider_name if provider_name not in ("groq_2", "groq_3") else f"groq[key{provider_name[-1]}]"
            logger.info(f"LLM call success: {display_name}/{model_id} ({task_type})")
            return {
                "content": content,
                "provider": display_name,
                "model": model_id,
            }

        except Exception as e:
            # Heuristic classification by error text: rate limits and timeouts
            # are transient, so they only warn; anything else logs as an error.
            err = str(e).lower()
            is_rate_limit = any(k in err for k in ["429", "rate", "quota", "limit", "exceeded", "capacity"])
            is_timeout = "timeout" in err or "timed out" in err

            if is_rate_limit or is_timeout:
                logger.warning(f"Provider {provider_name}/{model_id} unavailable: {type(e).__name__}")
                errors.append(f"{provider_name}: rate limited")
                # Also advance cursor so the next call won't start on this key
                if provider_name in _GROQ_POOL:
                    _rr_cursor[task_type] = (_rr_cursor[task_type] + 1) % len(_GROQ_POOL)
            else:
                logger.error(f"Provider {provider_name}/{model_id} error: {e}")
                errors.append(f"{provider_name}: {e}")
            continue

    raise RuntimeError(f"All LLM providers exhausted. Errors: {errors}")
|
||||
Reference in New Issue
Block a user