mirror of
https://github.com/arkorty/B.Tech-Project-III.git
synced 2026-04-19 12:41:48 +00:00
init
This commit is contained in:
279
thirdeye/backend/db/chroma.py
Normal file
279
thirdeye/backend/db/chroma.py
Normal file
@@ -0,0 +1,279 @@
|
||||
"""ChromaDB setup and operations."""
|
||||
import json
|
||||
import uuid
|
||||
import chromadb
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from backend.config import CHROMA_DB_PATH
|
||||
from backend.db.embeddings import embed_texts, embed_query
|
||||
|
||||
logger = logging.getLogger("thirdeye.chroma")
|
||||
|
||||
# Initialize persistent client
|
||||
_chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
|
||||
|
||||
|
||||
def get_collection(group_id: str) -> chromadb.Collection:
    """Fetch (or lazily create) the ChromaDB collection backing a group.

    The name is derived as ``ll_<group_id>`` with dashes mapped to
    underscores, truncated to ChromaDB's limit.
    """
    # ChromaDB collection names: 3-63 chars, alphanumeric + underscores.
    safe_name = ("ll_" + group_id.replace("-", "_"))[:63]
    return _chroma_client.get_or_create_collection(name=safe_name)
|
||||
|
||||
|
||||
def set_group_name(group_id: str, name: str):
    """Persist the human-readable Telegram group name in the collection metadata.

    Best-effort: any failure is logged as a warning and never propagated.
    """
    # Nothing useful to persist when the name is empty or merely echoes the id.
    if not name or name == group_id:
        return
    try:
        collection = get_collection(group_id)
        meta = dict(collection.metadata or {})
        if meta.get("group_name") == name:
            return  # already current — skip the needless modify() round-trip
        meta["group_name"] = name
        collection.modify(metadata=meta)
    except Exception as exc:
        logger.warning(f"set_group_name failed for {group_id}: {exc}")
|
||||
|
||||
|
||||
def get_group_names() -> dict[str, str]:
    """Return a mapping of group_id -> human-readable name (falls back to group_id)."""
    names: dict[str, str] = {}
    for collection in _chroma_client.list_collections():
        if not collection.name.startswith("ll_"):
            continue
        # NOTE(review): the inverse mapping turns every underscore back into a
        # dash, so a group_id that originally contained "_" is not recovered
        # exactly — confirm ids never contain underscores.
        gid = collection.name.replace("ll_", "").replace("_", "-")
        names[gid] = (collection.metadata or {}).get("group_name", gid)
    return names
|
||||
|
||||
|
||||
def store_signals(group_id: str, signals: list[dict]):
    """Store extracted signals in ChromaDB with embeddings.

    Each signal becomes one document of the form
    ``"<type>: <summary>[ | Quote: <raw_quote>]"`` plus a flat metadata
    record.  List-valued fields (entities, keywords) are JSON-encoded
    because ChromaDB metadata only accepts scalar values.

    Args:
        group_id: Telegram group the signals belong to.
        signals: Signal dicts produced by the extraction pipeline; may be
            empty, in which case this is a no-op.
    """
    if not signals:
        return

    collection = get_collection(group_id)
    documents = []
    metadatas = []
    ids = []

    for signal in signals:
        doc_text = f"{signal['type']}: {signal['summary']}"
        if signal.get('raw_quote'):
            doc_text += f" | Quote: {signal['raw_quote']}"

        documents.append(doc_text)
        metadatas.append({
            "type": signal.get("type", "unknown"),
            "severity": signal.get("severity", "low"),
            "status": signal.get("status", "unknown"),
            "sentiment": signal.get("sentiment", "neutral"),
            "urgency": signal.get("urgency", "none"),
            "entities": json.dumps(signal.get("entities", [])),
            "keywords": json.dumps(signal.get("keywords", [])),
            "raw_quote": signal.get("raw_quote", ""),
            "summary": signal.get("summary", ""),
            "timestamp": signal.get("timestamp", datetime.utcnow().isoformat()),
            "group_id": group_id,
            "lens": signal.get("lens", "unknown"),
            "meeting_id": signal.get("meeting_id", ""),
            # Voice attribution — preserved so /voicelog and /ask can cite the source
            "source": signal.get("source", ""),
            "speaker": signal.get("speaker", ""),
            "voice_file_id": signal.get("voice_file_id", ""),
            "voice_duration": int(signal.get("voice_duration", 0) or 0),
            "voice_language": signal.get("voice_language", ""),
            # Jira tracking fields (populated for jira_raised signals)
            "jira_key": signal.get("jira_key", ""),
            "jira_url": signal.get("jira_url", ""),
            "jira_summary": signal.get("jira_summary", ""),
            "jira_priority": signal.get("jira_priority", ""),
            "original_signal_id": signal.get("original_signal_id", ""),
        })
        # Fix: also fall back to a fresh UUID for *falsy* ids (None / "").
        # The old `signal.get("id", uuid4())` let empty ids through, and two
        # signals with the same empty id break collection.add().
        ids.append(signal.get("id") or str(uuid.uuid4()))

    # Generate embeddings for all documents in one batch call.
    embeddings = embed_texts(documents)

    collection.add(
        documents=documents,
        metadatas=metadatas,
        embeddings=embeddings,
        ids=ids,
    )
    logger.info(f"Stored {len(signals)} signals for group {group_id}")
|
||||
|
||||
|
||||
def query_signals(group_id: str, query: str, n_results: int = 10, signal_type: str = None) -> list[dict]:
    """Query the knowledge base with natural language.

    Args:
        group_id: Group whose collection to search.
        query: Natural-language query text.
        n_results: Maximum number of hits to return.
        signal_type: Optional exact-match filter on the "type" metadata field.

    Returns:
        A list of {id, document, metadata, relevance_score} dicts, best
        match first; empty on failure or when the collection is empty.
    """
    collection = get_collection(group_id)

    # Fix: return early on an empty collection instead of querying it with
    # n_results=1 (the old `collection.count() or 1` fallback) — this also
    # skips a pointless embedding call.
    count = collection.count()
    if count == 0:
        return []

    query_embedding = embed_query(query)

    where_filter = {"type": signal_type} if signal_type else None

    try:
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=min(n_results, count),
            where=where_filter,
        )
    except Exception as e:
        logger.warning(f"Query failed: {e}")
        return []

    # Format results.  Chroma returns one inner list per query embedding;
    # we send exactly one, hence the [0] indexing throughout.
    output = []
    # Fix: use .get() consistently for all result keys (the old code mixed
    # [] and .get(), so a missing "distances" key raised KeyError).
    if results and results.get("documents"):
        for i, doc in enumerate(results["documents"][0]):
            meta = results["metadatas"][0][i] if results.get("metadatas") else {}
            distance = results["distances"][0][i] if results.get("distances") else None
            sig_id = results["ids"][0][i] if results.get("ids") else ""
            output.append({
                "id": sig_id,
                "document": doc,
                "metadata": meta,
                "relevance_score": 1 - (distance or 0),  # Convert distance to similarity
            })

    return output
|
||||
|
||||
|
||||
def get_all_signals(group_id: str, signal_type: str = None) -> list[dict]:
    """Get all signals for a group (for pattern detection).

    Args:
        group_id: Group whose collection to read.
        signal_type: Optional exact-match filter on the "type" metadata field.

    Returns:
        A list of {document, metadata, id} dicts (empty for an empty group).
    """
    collection = get_collection(group_id)
    count = collection.count()
    if count == 0:
        return []

    where_filter = {"type": signal_type} if signal_type else None

    # Fix: the old fallback silently returned UNfiltered results when the
    # filtered get() raised, so callers asking for one type got everything.
    # Now the failure is logged and the type filter is re-applied in Python.
    filtered_in_db = True
    try:
        results = collection.get(where=where_filter, limit=count)
    except Exception as e:
        logger.warning(f"Filtered get failed for {group_id}: {e}; refetching unfiltered")
        results = collection.get(limit=count)
        filtered_in_db = False

    output = []
    if results and results["documents"]:
        for i, doc in enumerate(results["documents"]):
            meta = results["metadatas"][i] if results["metadatas"] else {}
            # Honour the requested signal_type even when the DB-side filter failed.
            if signal_type and not filtered_in_db and meta.get("type") != signal_type:
                continue
            output.append({"document": doc, "metadata": meta, "id": results["ids"][i]})

    return output
|
||||
|
||||
|
||||
def get_group_ids() -> list[str]:
    """Get all group IDs that have collections."""
    group_ids = []
    for collection in _chroma_client.list_collections():
        if collection.name.startswith("ll_"):
            group_ids.append(collection.name.replace("ll_", "").replace("_", "-"))
    return group_ids
|
||||
|
||||
|
||||
def query_signals_global(query: str, n_results: int = 5, exclude_group_id: str = None) -> list[dict]:
    """
    Search across ALL group collections for a query.

    Used as a cross-group fallback when local search returns weak results.
    Each result is annotated with its source group_id.
    """
    query_embedding = embed_query(query)
    hits = []

    for entry in _chroma_client.list_collections():
        name = entry.name
        if not name.startswith("ll_"):
            continue

        # Derive group_id back from the collection name.
        group_id = name[len("ll_"):].replace("_", "-")
        if exclude_group_id and group_id == exclude_group_id:
            continue

        # Per-collection failures are logged and skipped so one bad
        # collection cannot sink the whole cross-group search.
        try:
            collection = _chroma_client.get_collection(name)
            total = collection.count()
            if total == 0:
                continue

            response = collection.query(
                query_embeddings=[query_embedding],
                n_results=min(n_results, total),
            )

            if response and response["documents"]:
                for idx, doc in enumerate(response["documents"][0]):
                    meta = response["metadatas"][0][idx] if response["metadatas"] else {}
                    dist = response["distances"][0][idx] if response["distances"] else None
                    hits.append({
                        "document": doc,
                        "metadata": meta,
                        "relevance_score": 1 - (dist or 0),
                        "source_group_id": group_id,
                    })
        except Exception as e:
            logger.warning(f"Global query failed for collection {name}: {e}")
            continue

    # Best matches first, capped at n_results across all groups.
    hits.sort(key=lambda h: h["relevance_score"], reverse=True)
    return hits[:n_results]
|
||||
|
||||
def mark_signal_as_raised(
    group_id: str,
    signal_id: str,
    jira_key: str,
    jira_url: str = "",
    jira_summary: str = "",
    jira_priority: str = "",
):
    """
    Tag a signal with its Jira ticket key so we never raise it twice.

    Adds a new signal of type 'jira_raised' linked to the original signal_id.

    Args:
        group_id: Group the original signal belongs to.
        signal_id: ID of the signal the ticket was raised for.
        jira_key: Jira issue key (e.g. "PROJ-123").
        jira_url / jira_summary / jira_priority: optional ticket details
            stored alongside the tracking signal.
    """
    # Fix: dropped the redundant function-local `import uuid` and
    # `from datetime import datetime` — both are already imported at
    # module level, and the locals shadowed them for no benefit.
    tracking_signal = {
        "id": str(uuid.uuid4()),
        "type": "jira_raised",
        "summary": jira_summary or f"Jira ticket {jira_key} raised for signal {signal_id}",
        "raw_quote": signal_id,  # original signal_id — used by get_raised_signal_ids
        "severity": "low",
        "status": "raised",
        "sentiment": "neutral",
        "urgency": "none",
        "entities": [jira_key],
        "keywords": ["jira", jira_key, "raised"],
        "timestamp": datetime.utcnow().isoformat(),
        "group_id": group_id,
        "lens": "jira",
        # Jira tracking fields
        "jira_key": jira_key,
        "jira_url": jira_url,
        "jira_summary": jira_summary,
        "jira_priority": jira_priority,
        "original_signal_id": signal_id,
    }
    store_signals(group_id, [tracking_signal])
|
||||
|
||||
|
||||
def get_raised_signal_ids(group_id: str) -> set[str]:
    """
    Return the set of signal IDs that have already had Jira tickets raised.

    Used to prevent duplicates.  The original signal_id is stored in the
    'raw_quote' metadata field of each 'jira_raised' tracking signal
    (see mark_signal_as_raised).  Returns an empty set on any failure.
    """
    collection = get_collection(group_id)
    try:
        results = collection.get(where={"type": "jira_raised"})
        # raw_quote stores the original signal_id
        raised_ids = set()
        if results and results.get("metadatas"):
            for meta in results["metadatas"]:
                original_id = meta.get("raw_quote")  # signal_id stored in raw_quote field
                if original_id:
                    raised_ids.add(original_id)
        return raised_ids
    except Exception as e:
        # Fix: log before degrading to "nothing raised yet" — the old silent
        # swallow made dedup failures (and thus duplicate tickets) invisible.
        logger.warning(f"get_raised_signal_ids failed for {group_id}: {e}")
        return set()
|
||||
67
thirdeye/backend/db/embeddings.py
Normal file
67
thirdeye/backend/db/embeddings.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""Embedding provider with Cohere primary and local fallback."""
|
||||
import cohere
|
||||
import logging
|
||||
from backend.config import COHERE_API_KEY
|
||||
|
||||
logger = logging.getLogger("thirdeye.embeddings")
|
||||
|
||||
_cohere_client = None
|
||||
_local_model = None
|
||||
|
||||
def _get_cohere():
    """Return the shared Cohere client, creating it on first use.

    Returns None when COHERE_API_KEY is unset (callers then fall back to
    the local model).
    """
    global _cohere_client
    if _cohere_client is not None:
        return _cohere_client
    if COHERE_API_KEY:
        _cohere_client = cohere.Client(COHERE_API_KEY)
    return _cohere_client
|
||||
|
||||
def _get_local_model():
    """Load (once) and return the local sentence-transformers fallback model."""
    global _local_model
    if _local_model is not None:
        return _local_model
    # Imported lazily so the heavy dependency is only paid when the
    # fallback is actually needed.
    from sentence_transformers import SentenceTransformer
    _local_model = SentenceTransformer("all-MiniLM-L6-v2")
    logger.info("Loaded local embedding model: all-MiniLM-L6-v2")
    return _local_model
|
||||
|
||||
|
||||
def embed_texts(texts: list[str]) -> list[list[float]]:
    """Embed a list of texts. Tries Cohere first, falls back to local model."""
    if not texts:
        return []

    # Preferred path: Cohere's hosted embedding endpoint.
    cohere_client = _get_cohere()
    if cohere_client:
        try:
            resp = cohere_client.embed(
                texts=texts,
                model="embed-english-v3.0",
                input_type="search_document",
            )
            logger.info(f"Cohere embedded {len(texts)} texts")
            return [list(vec) for vec in resp.embeddings]
        except Exception as e:
            logger.warning(f"Cohere embedding failed: {e}, falling back to local")

    # Fallback path: local sentence-transformers model.
    vectors = _get_local_model().encode(texts).tolist()
    logger.info(f"Local model embedded {len(texts)} texts")
    return vectors
|
||||
|
||||
|
||||
def embed_query(text: str) -> list[float]:
    """Embed a single query text.

    Uses Cohere with input_type="search_query" (documents are embedded with
    "search_document" — the asymmetric pair this model expects), falling
    back to the local model on any failure.
    """
    client = _get_cohere()
    if client:
        try:
            response = client.embed(
                texts=[text],
                model="embed-english-v3.0",
                input_type="search_query",
            )
            return list(response.embeddings[0])
        except Exception as e:
            # Fix: log the failure instead of silently swallowing it —
            # matches the fallback observability of embed_texts().
            logger.warning(f"Cohere query embedding failed: {e}, falling back to local")

    model = _get_local_model()
    return model.encode([text]).tolist()[0]
|
||||
57
thirdeye/backend/db/models.py
Normal file
57
thirdeye/backend/db/models.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Data models for ThirdEye."""
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
|
||||
|
||||
class Signal(BaseModel):
    # A single signal extracted from group chatter (decision, debt, risk, ...).
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    group_id: str
    lens: str = "unknown"  # dev, product, client, community
    type: str  # architecture_decision, tech_debt, etc.
    summary: str
    entities: list[str] = Field(default_factory=list)
    severity: str = "low"  # low, medium, high, critical
    status: str = "unknown"  # proposed, decided, implemented, unresolved
    sentiment: str = "neutral"
    urgency: str = "none"
    raw_quote: str = ""
    source_messages: list[int] = Field(default_factory=list)
    timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat())
    keywords: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class Pattern(BaseModel):
    # A trend detected across many signals within one group.
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    group_id: str
    type: str  # frequency_spike, knowledge_silo, recurring_issue, sentiment_trend, stale_item
    description: str
    severity: str = "info"  # info, warning, critical
    evidence_signal_ids: list[str] = Field(default_factory=list)
    recommendation: str = ""
    detected_at: str = Field(default_factory=lambda: datetime.utcnow().isoformat())
    is_active: bool = True
|
||||
|
||||
|
||||
class CrossGroupInsight(BaseModel):
    # A finding that spans two groups (e.g. conflicting decisions).
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    type: str  # blocked_handoff, conflicting_decision, information_silo, promise_reality_gap, duplicated_effort
    description: str
    group_a: dict = Field(default_factory=dict)  # {name, group_id, evidence}
    group_b: dict = Field(default_factory=dict)
    severity: str = "warning"
    recommendation: str = ""
    detected_at: str = Field(default_factory=lambda: datetime.utcnow().isoformat())
    is_resolved: bool = False
|
||||
|
||||
|
||||
class GroupConfig(BaseModel):
    # Per-group configuration and running counters.
    group_id: str
    # Human-readable Telegram group name (may lag behind renames).
    group_name: str = ""
    lens_mode: str = "auto"  # auto, dev, product, client, community
    # Lens inferred automatically when lens_mode == "auto".
    detected_lens: str = "unknown"
    # Confidence of the lens detection, presumably 0.0-1.0 — TODO confirm.
    confidence: float = 0.0
    is_active: bool = True
    # Running counters of processed messages / extracted signals.
    message_count: int = 0
    signal_count: int = 0
|
||||
Reference in New Issue
Block a user