This commit is contained in:
2026-04-05 00:43:23 +05:30
commit 8be37d3e92
425 changed files with 101853 additions and 0 deletions

View File

@@ -0,0 +1,59 @@
"""
Clear all ChromaDB collections (signals + embeddings).
Collections will be automatically recreated when new signals are stored.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
import chromadb
from backend.config import CHROMA_DB_PATH
def clear_all_collections():
    """Delete every collection from ChromaDB after interactive confirmation.

    Collections are recreated automatically the next time signals are
    stored, so this is a safe way to reset local state.
    """
    print("=" * 60)
    print("ChromaDB Clear Script")
    print("=" * 60)

    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

    # Get all collections
    collections = client.list_collections()
    if not collections:
        print("\n✅ Database is already empty (no collections found)")
        return

    print(f"\nFound {len(collections)} collection(s):")
    for coll in collections:
        count = coll.count()
        print(f" - {coll.name}: {count} documents")

    # Require an explicit 'yes' so the script is never destructive by accident.
    print(f"\n⚠️ This will DELETE all {len(collections)} collections and their data.")
    print(" (Collections will be recreated automatically when new signals are added)")
    response = input("\nType 'yes' to confirm deletion: ")
    if response.lower() != 'yes':
        print("\n❌ Deletion cancelled.")
        return

    # Delete all collections, continuing past individual failures.
    print("\n🗑️ Deleting collections...")
    deleted = 0
    for coll in collections:
        try:
            client.delete_collection(coll.name)
            print(f" ✅ Deleted: {coll.name}")
            deleted += 1
        except Exception as e:
            # One bad collection should not block the rest.
            print(f" ❌ Failed to delete {coll.name}: {e}")

    print(f"\n✅ Successfully deleted {deleted}/{len(collections)} collection(s)")
    print(" Database is now empty. Collections will be recreated on next signal ingestion.")
    print("=" * 60)


if __name__ == "__main__":
    clear_all_collections()

View File

@@ -0,0 +1,194 @@
"""
Advanced ChromaDB clear script with selective deletion options.
Usage:
python scripts/clear_db_advanced.py # Interactive - clear all
python scripts/clear_db_advanced.py --force # Clear all without confirmation
python scripts/clear_db_advanced.py --group meet-sessions # Clear specific group only
python scripts/clear_db_advanced.py --meet-only # Clear only meet-related signals
python scripts/clear_db_advanced.py --test-only # Clear only test collections
"""
import sys
import os
import argparse
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
import chromadb
from backend.config import CHROMA_DB_PATH
from backend.db.chroma import get_group_ids
def clear_all(force=False):
    """Delete every ChromaDB collection.

    Args:
        force: When True, skip the interactive confirmation prompt.
    """
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    collections = client.list_collections()
    if not collections:
        print("✅ Database is already empty")
        return

    # Summarize what is about to be deleted before asking for confirmation.
    print(f"Found {len(collections)} collection(s) to delete:")
    total_docs = 0
    for coll in collections:
        count = coll.count()
        total_docs += count
        print(f" - {coll.name}: {count} documents")
    print(f"\nTotal: {total_docs} documents across {len(collections)} collections")

    if not force:
        response = input("\nType 'yes' to confirm deletion: ")
        if response.lower() != 'yes':
            print("❌ Deletion cancelled")
            return

    print("\n🗑️ Deleting...")
    for coll in collections:
        client.delete_collection(coll.name)
        print(f" ✅ Deleted: {coll.name}")
    print(f"\n✅ Deleted {len(collections)} collections, {total_docs} documents")
def clear_group(group_id: str, force=False):
    """Delete the collection belonging to a single group.

    The collection name mirrors the backend's naming scheme: 'll_' prefix,
    '-' replaced by '_', truncated to 63 chars (ChromaDB's name limit).

    Args:
        group_id: The group whose collection should be removed.
        force: When True, skip the interactive confirmation prompt.
    """
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    safe_name = f"ll_{group_id.replace('-', '_')}"[:63]
    try:
        coll = client.get_collection(safe_name)
        count = coll.count()
        print(f"Found collection '{safe_name}' with {count} documents")
        if not force:
            response = input(f"\nDelete collection '{safe_name}'? Type 'yes' to confirm: ")
            if response.lower() != 'yes':
                print("❌ Deletion cancelled")
                return
        client.delete_collection(safe_name)
        print(f"✅ Deleted collection '{safe_name}' ({count} documents)")
    except Exception as e:
        # get_collection raises when the collection does not exist.
        print(f"❌ Collection '{safe_name}' not found or error: {e}")
def clear_meet_only(force=False):
    """Delete only collections that contain meet-lens signals.

    A collection is classified as meet-related if any of its first few
    documents carries metadata with lens == 'meet'.

    Args:
        force: When True, skip the interactive confirmation prompt.
    """
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    collections = client.list_collections()

    # Identify meet collections by sampling a handful of docs each;
    # a full scan would be wasteful for a quick classification.
    meet_collections = []
    for coll in collections:
        try:
            results = coll.get(limit=5)
            if results and results.get("metadatas"):
                for meta in results["metadatas"]:
                    if meta.get("lens") == "meet":
                        meet_collections.append(coll)
                        break
        except Exception:
            # Unreadable collection — treat as not meet-related.
            pass

    if not meet_collections:
        print("✅ No meet-related collections found")
        return

    print(f"Found {len(meet_collections)} meet-related collection(s):")
    total_docs = 0
    for coll in meet_collections:
        count = coll.count()
        total_docs += count
        print(f" - {coll.name}: {count} documents")

    if not force:
        response = input("\nType 'yes' to confirm deletion: ")
        if response.lower() != 'yes':
            print("❌ Deletion cancelled")
            return

    print("\n🗑️ Deleting meet collections...")
    for coll in meet_collections:
        client.delete_collection(coll.name)
        print(f" ✅ Deleted: {coll.name}")
    print(f"\n✅ Deleted {len(meet_collections)} collections, {total_docs} documents")
def clear_test_only(force=False):
    """Delete only test collections (any name containing 'test').

    Args:
        force: When True, skip the interactive confirmation prompt.
    """
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    collections = client.list_collections()

    # Case-insensitive substring match catches 'test_*' and 'll_test*' alike.
    test_collections = [c for c in collections if 'test' in c.name.lower()]
    if not test_collections:
        print("✅ No test collections found")
        return

    print(f"Found {len(test_collections)} test collection(s):")
    total_docs = 0
    for coll in test_collections:
        count = coll.count()
        total_docs += count
        print(f" - {coll.name}: {count} documents")

    if not force:
        response = input("\nType 'yes' to confirm deletion: ")
        if response.lower() != 'yes':
            print("❌ Deletion cancelled")
            return

    print("\n🗑️ Deleting test collections...")
    for coll in test_collections:
        client.delete_collection(coll.name)
        print(f" ✅ Deleted: {coll.name}")
    print(f"\n✅ Deleted {len(test_collections)} collections, {total_docs} documents")
def main():
    """Parse CLI flags and dispatch to the appropriate clear operation."""
    parser = argparse.ArgumentParser(
        description="Clear ChromaDB collections (signals + embeddings)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python scripts/clear_db_advanced.py                    # Clear all (interactive)
  python scripts/clear_db_advanced.py --force            # Clear all (no confirmation)
  python scripts/clear_db_advanced.py --group acme-dev   # Clear specific group
  python scripts/clear_db_advanced.py --meet-only        # Clear only meet signals
  python scripts/clear_db_advanced.py --test-only --force  # Clear test data automatically
"""
    )
    parser.add_argument("--force", action="store_true", help="Skip confirmation prompt")
    parser.add_argument("--group", type=str, help="Delete only this specific group")
    parser.add_argument("--meet-only", action="store_true", help="Delete only meet-related collections")
    parser.add_argument("--test-only", action="store_true", help="Delete only test collections")
    args = parser.parse_args()

    print("=" * 60)
    print("ChromaDB Advanced Clear Script")
    print("=" * 60)
    print(f"Database path: {CHROMA_DB_PATH}")
    print()

    # Determine which clear operation to run; --group wins over the
    # boolean filters, which win over the clear-everything default.
    if args.group:
        clear_group(args.group, force=args.force)
    elif args.meet_only:
        clear_meet_only(force=args.force)
    elif args.test_only:
        clear_test_only(force=args.force)
    else:
        clear_all(force=args.force)
    print("\n" + "=" * 60)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,84 @@
"""
Debug script to check if Meet signals are being recorded and stored.
Run this while or after recording a Google Meet.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from backend.db.chroma import get_all_signals, get_group_ids
from backend.config import MEET_DEFAULT_GROUP_ID
import json
print("=" * 60)
print("ThirdEye Meet Recording Diagnostic")
print("=" * 60)
# Check all groups
print("\n1. All ChromaDB groups:")
all_groups = get_group_ids()
print(f" Found {len(all_groups)} groups: {all_groups}")
# Check default meet group
print(f"\n2. Default meet group: {MEET_DEFAULT_GROUP_ID}")
try:
meet_signals = get_all_signals(MEET_DEFAULT_GROUP_ID)
print(f" Found {len(meet_signals)} signals")
if meet_signals:
# Count by type
types = {}
meetings = {}
for sig in meet_signals:
meta = sig.get("metadata", {})
sig_type = meta.get("type", "unknown")
meet_id = meta.get("meeting_id", "unknown")
types[sig_type] = types.get(sig_type, 0) + 1
if meet_id not in meetings:
meetings[meet_id] = 0
meetings[meet_id] += 1
print("\n Signal types:")
for t, count in types.items():
print(f" - {t}: {count}")
print("\n Meetings recorded:")
for mid, count in meetings.items():
print(f" - {mid}: {count} signals")
# Show first few signals
print("\n First 3 signals (metadata only):")
for i, sig in enumerate(meet_signals[:3]):
print(f"\n Signal {i+1}:")
meta = sig.get("metadata", {})
for key in ["type", "meeting_id", "timestamp", "lens"]:
print(f" {key}: {meta.get(key, 'N/A')}")
doc = sig.get("document", "")
print(f" document: {doc[:100]}...")
else:
print(" ⚠️ No signals found in meet_sessions group")
print(" This means either:")
print(" - Extension hasn't recorded anything yet")
print(" - Backend isn't running")
print(" - Signals are being stored in a different group")
except Exception as e:
print(f" ❌ Error: {e}")
# Check all groups for meet signals
print("\n3. Checking ALL groups for meet-related signals:")
for group in all_groups:
try:
signals = get_all_signals(group)
meet_lens = [s for s in signals if s.get("metadata", {}).get("lens") == "meet"]
if meet_lens:
print(f" - {group}: {len(meet_lens)} meet signals")
meetings_in_group = set(s.get("metadata", {}).get("meeting_id") for s in meet_lens)
print(f" Meetings: {list(meetings_in_group)}")
except Exception as e:
print(f" - {group}: Error - {e}")
print("\n" + "=" * 60)
print("Diagnostic complete!")
print("=" * 60)

View File

@@ -0,0 +1,62 @@
"""
Debug script to see what signals were extracted for a specific group.
Shows the actual text stored in ChromaDB to help debug search issues.
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from backend.db.chroma import get_all_signals
import json
# Your Telegram group ID from the logs
GROUP_ID = "-5180781849"  # Binary_hacks_dev

print("=" * 80)
print(f"Signals for group: {GROUP_ID} (Binary_hacks_dev)")
print("=" * 80)

signals = get_all_signals(GROUP_ID)
if not signals:
    print("\n⚠️ No signals found for this group")
    print(" Either no messages processed yet, or wrong group ID")
else:
    print(f"\nFound {len(signals)} total signals\n")
    # Show last 10 signals (most recent)
    for i, sig in enumerate(signals[-10:], 1):
        meta = sig.get("metadata", {})
        doc = sig.get("document", "")
        # NOTE(review): the original separator glyph was lost in transit
        # ('' * 80 prints nothing); '-' restores a visible divider.
        print(f"\n{'-' * 80}")
        print(f"Signal {i}/{len(signals)}")
        print(f"{'-' * 80}")
        print(f"Type: {meta.get('type', 'unknown')}")
        print(f"Severity: {meta.get('severity', 'unknown')}")
        print(f"Timestamp: {meta.get('timestamp', 'unknown')}")
        print(f"Entities: {meta.get('entities', '[]')}")
        print(f"Keywords: {meta.get('keywords', '[]')}")
        # Entities/keywords are stored as JSON strings; decode for display.
        try:
            entities = json.loads(meta.get('entities', '[]'))
            keywords = json.loads(meta.get('keywords', '[]'))
            print(f"\nEntities list: {entities}")
            print(f"Keywords list: {keywords}")
        except Exception:
            # Malformed JSON is tolerable in a debug dump; skip the decode.
            pass
        print(f"\nDocument text:")
        print(f"{doc[:300]}...")
        raw_quote = meta.get('raw_quote', '')
        if raw_quote:
            print(f"\nRaw quote:")
            print(f"{raw_quote[:300]}...")

print("\n" + "=" * 80)
print("\nTo search for specific terms:")
print(" from backend.db.chroma import query_signals")
print(f" results = query_signals('{GROUP_ID}', 'your search query', n_results=5)")
print("=" * 80)

View File

@@ -0,0 +1,109 @@
"""Seed demo data directly into ChromaDB (bypasses Telegram for speed)."""
import asyncio, os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from backend.pipeline import process_message_batch, set_lens
# ====== ACME DEV TEAM ======
# Engineering-channel fixture: contains decisions (Postgres), tech debt
# (hardcoded JWT secret, push-to-main), a recurring bug (checkout timeout,
# 3 occurrences), a knowledge silo (Raj + payment retry logic), and a
# cross-team blocker (missing design specs) for the pattern detector.
DEV_MESSAGES = [
{"sender": "Alex", "text": "Team, I'm proposing we use PostgreSQL for the main database. Our data is highly relational.", "timestamp": "2026-03-10T10:00:00Z"},
{"sender": "Priya", "text": "Agreed on Postgres. I'll set up the initial schema and migrations.", "timestamp": "2026-03-10T10:15:00Z"},
{"sender": "Raj", "text": "I'll take ownership of the payment module. I have experience with Stripe webhooks.", "timestamp": "2026-03-10T14:00:00Z"},
{"sender": "Alex", "text": "For the auth service, I'm hardcoding the JWT secret for now. We'll move to vault later.", "timestamp": "2026-03-11T09:00:00Z"},
{"sender": "Sam", "text": "Getting a timeout error on the checkout endpoint. Seems intermittent.", "timestamp": "2026-03-12T10:00:00Z"},
{"sender": "Raj", "text": "Payment webhook is fully integrated now. Only I know how the retry logic works though.", "timestamp": "2026-03-13T11:00:00Z"},
{"sender": "Sam", "text": "Timeout error on checkout is back again. Second time this week.", "timestamp": "2026-03-14T09:00:00Z"},
{"sender": "Alex", "text": "Just restart the pod when it happens. I'll investigate after the sprint.", "timestamp": "2026-03-14T09:30:00Z"},
{"sender": "Sam", "text": "Has anyone heard back from design about the dashboard specs? We need them to start.", "timestamp": "2026-03-15T10:00:00Z"},
{"sender": "Alex", "text": "Still no dashboard specs from design. This is blocking my entire sprint work.", "timestamp": "2026-03-17T10:00:00Z"},
{"sender": "Sam", "text": "Timeout error AGAIN. That's the third time. We have a systemic issue here.", "timestamp": "2026-03-18T09:00:00Z"},
{"sender": "Priya", "text": "I'm pushing this config change directly to main. It's a small fix, should be fine.", "timestamp": "2026-03-19T14:00:00Z"},
{"sender": "Sam", "text": "Dashboard is completely blocked without those design specs. Week 2 of waiting.", "timestamp": "2026-03-20T10:00:00Z"},
]
# ====== ACME PRODUCT TEAM ======
# Product-channel fixture: feature-request patterns (dark mode x3), an
# unresolved prioritization conflict (mobile vs API stability), and a
# client promise (dashboard demo by March 21) that echoes the client channel.
PRODUCT_MESSAGES = [
{"sender": "Lisa", "text": "Users keep asking about dark mode. It comes up in literally every demo.", "timestamp": "2026-03-10T10:00:00Z"},
{"sender": "Mike", "text": "I think we should prioritize the mobile app this sprint. Mobile traffic is 60%.", "timestamp": "2026-03-11T10:00:00Z"},
{"sender": "Sarah", "text": "No, API stability is way more important. Two enterprise clients complained last week about downtime.", "timestamp": "2026-03-11T10:30:00Z"},
{"sender": "Lisa", "text": "Sarah from ClientCo literally said 'I would pay double if you had SSO integration.'", "timestamp": "2026-03-12T14:00:00Z"},
{"sender": "Mike", "text": "Competitor X just launched a mobile-first version. We're falling behind on mobile.", "timestamp": "2026-03-13T10:00:00Z"},
{"sender": "Lisa", "text": "I told ClientCo we'd have the dashboard demo ready by Friday March 21st.", "timestamp": "2026-03-14T10:00:00Z"},
{"sender": "Sarah", "text": "Our conversion rate dropped to 2.3% after the last release. That's concerning.", "timestamp": "2026-03-15T11:00:00Z"},
{"sender": "Mike", "text": "Let's commit to API-first approach for the rest of Q1.", "timestamp": "2026-03-17T10:00:00Z"},
{"sender": "Lisa", "text": "Dark mode was mentioned again by three different users at the conference.", "timestamp": "2026-03-19T10:00:00Z"},
{"sender": "Sarah", "text": "We really need to decide: mobile or API stability? This conflict is slowing us down.", "timestamp": "2026-03-20T10:00:00Z"},
]
# ====== ACME ↔ CLIENT CHANNEL ======
# Client-channel fixture: explicit commitments with dates, escalating
# follow-ups, and scope-creep requests (PDF export, dark mode) for the
# cross-group analyst to correlate against the internal channels.
CLIENT_MESSAGES = [
{"sender": "Lisa", "text": "Hi ClientCo team! Just confirming we'll have the dashboard mockups ready by Friday March 21st.", "timestamp": "2026-03-10T10:00:00Z"},
{"sender": "Client_CEO", "text": "Great, looking forward to seeing them. This is a key deliverable for our board meeting.", "timestamp": "2026-03-10T10:30:00Z"},
{"sender": "Lisa", "text": "We'll also share the API documentation by Wednesday March 19th.", "timestamp": "2026-03-11T10:00:00Z"},
{"sender": "Client_CEO", "text": "Perfect. Oh, could you also add an export-to-PDF feature for the reports? That would be really helpful.", "timestamp": "2026-03-12T10:00:00Z"},
{"sender": "Lisa", "text": "Sure, we'll look into the PDF export!", "timestamp": "2026-03-12T10:15:00Z"},
{"sender": "Client_CEO", "text": "Any update on the dashboard mockups?", "timestamp": "2026-03-17T10:00:00Z"},
{"sender": "Client_CEO", "text": "Also, would it be possible to add a dark mode option?", "timestamp": "2026-03-18T10:00:00Z"},
{"sender": "Client_CEO", "text": "We really need those mockups before the board meeting on Monday.", "timestamp": "2026-03-19T10:00:00Z"},
{"sender": "Client_CEO", "text": "I might need to loop in our VP if we can't get the timeline confirmed.", "timestamp": "2026-03-20T10:00:00Z"},
]
async def seed():
    """Seed demo conversations into ChromaDB and run the analysis agents.

    Processes the three fixture channels through the normal message
    pipeline (so signals are extracted by the LLM, not faked), then runs
    pattern detection and cross-group analysis over the results.
    """
    print("🌱 Seeding demo data...\n")

    # Set lenses explicitly so extraction uses the right prompt per channel.
    set_lens("acme_dev", "dev")
    set_lens("acme_product", "product")
    set_lens("acme_client", "client")

    # Process dev team (2 batches to mimic incremental ingestion)
    print("Processing Acme Dev Team...")
    s1 = await process_message_batch("acme_dev", DEV_MESSAGES[:7])
    s2 = await process_message_batch("acme_dev", DEV_MESSAGES[7:])
    print(f" ✅ Dev team: {len(s1) + len(s2)} signals stored\n")

    # Process product team
    print("Processing Acme Product Team...")
    s3 = await process_message_batch("acme_product", PRODUCT_MESSAGES[:5])
    s4 = await process_message_batch("acme_product", PRODUCT_MESSAGES[5:])
    print(f" ✅ Product team: {len(s3) + len(s4)} signals stored\n")

    # Process client channel
    print("Processing Acme ↔ ClientCo Channel...")
    s5 = await process_message_batch("acme_client", CLIENT_MESSAGES[:5])
    s6 = await process_message_batch("acme_client", CLIENT_MESSAGES[5:])
    print(f" ✅ Client channel: {len(s5) + len(s6)} signals stored\n")

    # Run pattern detection over the dev channel only (richest fixture).
    print("Running pattern detection on dev team...")
    from backend.agents.pattern_detector import detect_patterns
    patterns = await detect_patterns("acme_dev")
    print(f" Found {len(patterns)} patterns")
    for p in patterns:
        print(f" [{p.severity}] {p.type}: {p.description[:80]}")

    # Run cross-group analysis across all three channels.
    print("\n🔥 Running CROSS-GROUP ANALYSIS...")
    from backend.agents.cross_group_analyst import analyze_cross_group
    from backend.db.chroma import get_all_signals
    summaries = {
        "Acme Dev Team": get_all_signals("acme_dev"),
        "Acme Product": get_all_signals("acme_product"),
        "Acme ↔ ClientCo": get_all_signals("acme_client"),
    }
    insights = await analyze_cross_group(summaries)
    print(f"\n Found {len(insights)} CROSS-GROUP INSIGHTS:")
    for i in insights:
        print(f" 🚨 [{i.severity}] {i.type}")
        print(f" {i.description[:120]}")
        print(f" Recommendation: {i.recommendation[:100]}")
        print()

    print("🎉 Demo data seeded successfully!")
    print(" Start the API with: python run_api.py")
    print(" Then visit: http://localhost:8000/api/groups")
    print(" And: http://localhost:8000/api/cross-group/insights")


asyncio.run(seed())

View File

@@ -0,0 +1,36 @@
"""Test Milestone 0: Project structure and env vars."""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from backend.config import (
TELEGRAM_BOT_TOKEN, GROQ_API_KEY, CEREBRAS_API_KEY,
SAMBANOVA_API_KEY, OPENROUTER_API_KEY, COHERE_API_KEY
)
# Map of required env-var names to their loaded values.
checks = {
    "TELEGRAM_BOT_TOKEN": TELEGRAM_BOT_TOKEN,
    "GROQ_API_KEY": GROQ_API_KEY,
    "CEREBRAS_API_KEY": CEREBRAS_API_KEY,
    "SAMBANOVA_API_KEY": SAMBANOVA_API_KEY,
    "OPENROUTER_API_KEY": OPENROUTER_API_KEY,
    "COHERE_API_KEY": COHERE_API_KEY,
}

all_pass = True
for name, val in checks.items():
    # A value shorter than ~6 chars is almost certainly a placeholder.
    # BUGFIX: the pass-marker glyph was lost, leaving status = "" and the
    # always-true check `"" in status`, which failed every key. Restored
    # the ✅/❌ markers so only genuinely missing keys fail.
    status = "✅" if val and len(val) > 5 else "❌ MISSING"
    if "❌" in status:
        all_pass = False
    print(f" {status} {name}")

# Check directories exist
for d in ["backend/agents", "backend/bot", "backend/db", "backend/api", "dashboard", "scripts"]:
    path = os.path.join(os.path.dirname(__file__), '..', d)
    exists = os.path.isdir(path)
    status = "✅" if exists else "❌ MISSING"
    if not exists:
        all_pass = False
    print(f" {status} Directory: {d}")

print(f"\n{'🎉 MILESTONE 0 PASSED' if all_pass else '💥 MILESTONE 0 FAILED — fix issues above'}")

View File

@@ -0,0 +1,56 @@
"""Test Milestone 1: Provider router works with at least one provider."""
import asyncio
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
async def test_providers():
    """Smoke-test each LLM task tier plus JSON-style prompting.

    Each tier is tested independently so one failing provider doesn't
    mask the others; failures are reported but do not abort the run.
    """
    from backend.providers import call_llm
    test_messages = [
        {"role": "user", "content": "Reply with exactly: THIRDEYE_OK"}
    ]

    # Test 1: fast_small (should use Groq 8B)
    print("Testing fast_small (Groq 8B / Cerebras 8B)...")
    try:
        result = await call_llm("fast_small", test_messages, max_tokens=50)
        print(f" ✅ fast_small → {result['provider']}/{result['model']}")
        print(f" Response: {result['content'][:80]}")
    except Exception as e:
        print(f" ❌ fast_small failed: {e}")

    # Test 2: fast_large (should use Groq 70B)
    print("Testing fast_large (Groq/Cerebras 70B)...")
    try:
        result = await call_llm("fast_large", test_messages, max_tokens=50)
        print(f" ✅ fast_large → {result['provider']}/{result['model']}")
        print(f" Response: {result['content'][:80]}")
    except Exception as e:
        print(f" ❌ fast_large failed: {e}")

    # Test 3: reasoning (should use SambaNova 405B)
    print("Testing reasoning (SambaNova 405B / OpenRouter Nemotron)...")
    try:
        result = await call_llm("reasoning", test_messages, max_tokens=50)
        print(f" ✅ reasoning → {result['provider']}/{result['model']}")
        print(f" Response: {result['content'][:80]}")
    except Exception as e:
        print(f" ❌ reasoning failed: {e}")

    # Test 4: JSON mode — exercised via a system prompt; no structured
    # response_format parameter is passed here, only prompt steering.
    print("Testing JSON mode...")
    try:
        json_messages = [
            {"role": "system", "content": "You respond only in valid JSON."},
            {"role": "user", "content": 'Return: {"status": "ok", "test": true}'},
        ]
        result = await call_llm("fast_small", json_messages, max_tokens=100)
        print(f" ✅ JSON mode → {result['provider']}/{result['model']}")
        print(f" Response: {result['content'][:120]}")
    except Exception as e:
        print(f" ❌ JSON mode failed: {e}")

    print("\n🎉 MILESTONE 1 PASSED — At least one provider works per task type")


asyncio.run(test_providers())

View File

@@ -0,0 +1,237 @@
"""Test Milestone 11: Document & PDF ingestion into RAG."""
import os, sys, tempfile
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
def test_text_extraction():
    """Test extraction from each supported file type (TXT, DOCX, PDF).

    DOCX and PDF checks are skipped gracefully when their optional
    dependencies are not installed; unsupported extensions must return
    an empty page list rather than raising.
    """
    from backend.agents.document_ingestor import extract_text

    # Test 1: Plain text file
    print("Testing TXT extraction...")
    tmp = tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False, encoding="utf-8")
    tmp.write("This is a test document.\nIt has multiple lines.\nThird line about PostgreSQL decisions.")
    tmp.close()
    pages = extract_text(tmp.name)
    assert len(pages) == 1, f"Expected 1 page, got {len(pages)}"
    assert "PostgreSQL" in pages[0]["text"]
    print(f" ✅ TXT extraction works ({len(pages[0]['text'])} chars)")
    os.unlink(tmp.name)

    # Test 2: DOCX file (optional dependency: python-docx)
    print("Testing DOCX extraction...")
    try:
        from docx import Document
        doc = Document()
        doc.add_paragraph("Architecture Decision: We chose Redis for caching.")
        doc.add_paragraph("Tech Debt: The API keys are hardcoded in config.py.")
        doc.add_paragraph("Promise: Dashboard mockups will be ready by Friday March 21st.")
        tmp_docx = tempfile.NamedTemporaryFile(suffix=".docx", delete=False)
        doc.save(tmp_docx.name)
        tmp_docx.close()
        pages = extract_text(tmp_docx.name)
        assert len(pages) == 1, f"Expected 1 page, got {len(pages)}"
        assert "Redis" in pages[0]["text"]
        print(f" ✅ DOCX extraction works ({len(pages[0]['text'])} chars)")
        os.unlink(tmp_docx.name)
    except ImportError:
        print(" ⚠️ python-docx not installed, skipping DOCX test")

    # Test 3: PDF file (optional dependency: PyPDF2)
    print("Testing PDF extraction...")
    try:
        from PyPDF2 import PdfWriter
        # PyPDF2 can't easily create PDFs with text from scratch,
        # so we verify the extractor handles a blank page gracefully.
        tmp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
        writer = PdfWriter()
        writer.add_blank_page(width=612, height=792)
        writer.write(tmp_pdf)
        tmp_pdf.close()
        pages = extract_text(tmp_pdf.name)
        # Blank page = no text, should return empty gracefully
        print(f" ✅ PDF extraction handles blank PDF gracefully ({len(pages)} pages with text)")
        os.unlink(tmp_pdf.name)
    except ImportError:
        print(" ⚠️ PyPDF2 not installed, skipping PDF test")

    # Test 4: Unsupported file type must not raise.
    print("Testing unsupported file type...")
    tmp_bin = tempfile.NamedTemporaryFile(suffix=".exe", delete=False)
    tmp_bin.write(b"binary data")
    tmp_bin.close()
    pages = extract_text(tmp_bin.name)
    assert len(pages) == 0, "Should return empty for unsupported types"
    print(f" ✅ Unsupported file type handled gracefully")
    os.unlink(tmp_bin.name)
def test_chunking():
    """Test text chunking: no split for short text, bounded splits for long."""
    from backend.agents.document_ingestor import chunk_text

    print("\nTesting chunking...")

    # Test 1: Short text — should NOT be split
    short = "This is a short text that fits in one chunk."
    chunks = chunk_text(short, max_chars=1500)
    assert len(chunks) == 1, f"Short text should be 1 chunk, got {len(chunks)}"
    print(f" ✅ Short text → 1 chunk")

    # Test 2: Long text — should be split into several chunks
    long_text = "\n".join([f"This is paragraph {i} with enough content to fill the chunk. " * 5 for i in range(20)])
    chunks = chunk_text(long_text, max_chars=500, overlap_chars=100)
    assert len(chunks) > 1, f"Long text should produce multiple chunks, got {len(chunks)}"
    print(f" ✅ Long text ({len(long_text)} chars) → {len(chunks)} chunks")

    # Test 3: Every chunk stays within bounds; overlap may push a chunk
    # slightly past max_chars, so allow generous headroom.
    for i, c in enumerate(chunks):
        assert len(c) < 800, f"Chunk {i} too large: {len(c)} chars"
    print(f" ✅ All chunks are within size bounds")

    # Test 4: Empty text round-trips as a single empty chunk
    chunks = chunk_text("")
    assert len(chunks) == 1 and chunks[0] == "", "Empty text should return ['']"
    print(f" ✅ Empty text handled")
def test_full_ingestion():
    """Test the full pipeline: file → extract → chunk → signals → store → query.

    Uses a throwaway group id and deletes its collection on the way out so
    repeated runs don't accumulate state.
    """
    from backend.agents.document_ingestor import ingest_document
    from backend.db.chroma import store_signals, query_signals

    print("\nTesting full ingestion pipeline...")

    # Create a realistic test document
    tmp = tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False, encoding="utf-8")
    tmp.write("""API Specification v2.0 — Acme Project
Authentication:
All endpoints require OAuth 2.0 Bearer tokens. The recommended flow for SPAs is Authorization Code with PKCE.
Tokens expire after 3600 seconds. Refresh tokens are valid for 30 days.
Endpoints:
POST /api/v2/orders — Create a new order. Requires 'orders:write' scope.
GET /api/v2/orders/{id} — Retrieve order details. Requires 'orders:read' scope.
DELETE /api/v2/orders/{id} — Cancel an order. Only allowed within 24 hours of creation.
Rate Limits:
Standard tier: 100 requests per minute.
Enterprise tier: 1000 requests per minute.
Rate limit headers (X-RateLimit-Remaining) are included in every response.
Compliance:
All data must be encrypted at rest using AES-256.
PII fields are redacted in logs automatically.
GDPR deletion requests must be processed within 72 hours.
The compliance deadline for the new data residency requirements is April 1st 2026.
""")
    tmp.close()

    group_id = "test_doc_m11"

    # Ingest
    signals = ingest_document(tmp.name, group_id, shared_by="Priya", filename="api_spec_v2.txt")
    assert len(signals) > 0, f"Expected signals, got {len(signals)}"
    print(f" ✅ Ingestion produced {len(signals)} signals")

    # Verify signal structure: type, group, and provenance entities.
    for s in signals:
        assert s["type"] == "document_knowledge"
        assert s["group_id"] == group_id
        assert "@Priya" in s["entities"]
        assert "api_spec_v2.txt" in s["entities"]
    print(f" ✅ All signals have correct type and metadata")

    # Store in ChromaDB
    store_signals(group_id, signals)
    print(f" ✅ Stored {len(signals)} document signals in ChromaDB")

    # Semantic queries: each should surface the relevant document section.
    results = query_signals(group_id, "What authentication method is recommended?")
    assert len(results) > 0, "No results for auth query"
    found_auth = any("oauth" in r["document"].lower() or "auth" in r["document"].lower() for r in results)
    assert found_auth, "Expected to find OAuth/auth info in results"
    print(f" ✅ Query 'authentication method' returns relevant results")

    results2 = query_signals(group_id, "What is the compliance deadline?")
    assert len(results2) > 0, "No results for compliance query"
    found_compliance = any("april" in r["document"].lower() or "compliance" in r["document"].lower() for r in results2)
    assert found_compliance, "Expected to find compliance deadline in results"
    print(f" ✅ Query 'compliance deadline' returns relevant results")

    results3 = query_signals(group_id, "rate limits")
    assert len(results3) > 0, "No results for rate limits query"
    print(f" ✅ Query 'rate limits' returns {len(results3)} results")

    # Cleanup: remove the temp file and the test collection.
    os.unlink(tmp.name)
    import chromadb
    from backend.config import CHROMA_DB_PATH
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    try:
        client.delete_collection(f"ll_{group_id}")
        print(f" ✅ Cleaned up test collection")
    except Exception:
        # Collection may not exist if an assertion fired before storage.
        pass
def test_mixed_query():
    """Test that document signals AND chat signals coexist and are both queryable.

    Ingests a document about one topic (Redis caching) and chat messages
    about another (a timeout bug), then verifies query_knowledge can
    answer questions from each source within the same group.
    """
    from backend.agents.document_ingestor import ingest_document
    from backend.pipeline import process_message_batch, query_knowledge
    from backend.db.chroma import store_signals
    import asyncio

    print("\nTesting mixed query (documents + chat signals)...")
    group_id = "test_mixed_m11"

    # 1. Ingest a document
    tmp = tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False, encoding="utf-8")
    tmp.write("Architecture Decision Record: The team has selected Redis for session caching due to sub-millisecond latency.")
    tmp.close()
    doc_signals = ingest_document(tmp.name, group_id, shared_by="Priya", filename="adr_001.txt")
    store_signals(group_id, doc_signals)
    os.unlink(tmp.name)

    # 2. Process chat messages that mention a DIFFERENT topic (pipeline
    # stores the extracted signals itself; no need to keep the return value).
    chat_messages = [
        {"sender": "Alex", "text": "The timeout bug on checkout is back. Third time this sprint.", "timestamp": "2026-03-20T10:00:00Z"},
        {"sender": "Sam", "text": "I think it's a database connection pool issue.", "timestamp": "2026-03-20T10:05:00Z"},
    ]
    asyncio.run(process_message_batch(group_id, chat_messages))

    # 3. Query for document knowledge
    answer1 = asyncio.run(query_knowledge(group_id, "What caching solution was selected?"))
    assert "redis" in answer1.lower() or "caching" in answer1.lower(), f"Expected Redis/caching mention, got: {answer1[:100]}"
    print(f" ✅ Document query works: {answer1[:80]}...")

    # 4. Query for chat knowledge
    answer2 = asyncio.run(query_knowledge(group_id, "What bugs have been reported?"))
    assert "timeout" in answer2.lower() or "bug" in answer2.lower(), f"Expected timeout/bug mention, got: {answer2[:100]}"
    print(f" ✅ Chat query works alongside documents: {answer2[:80]}...")

    # Cleanup the test collection.
    import chromadb
    from backend.config import CHROMA_DB_PATH
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    try:
        client.delete_collection(f"ll_{group_id}")
    except Exception:
        # Best-effort cleanup; missing collection is fine.
        pass
    print(f" ✅ Mixed query (document + chat) both return correct results")
# Run the milestone-11 suite in order; each test raises AssertionError on
# failure. Guarded so importing this module doesn't trigger the suite.
if __name__ == "__main__":
    test_text_extraction()
    test_chunking()
    test_full_ingestion()
    test_mixed_query()
    print("\n🎉 MILESTONE 11 PASSED — Document & PDF ingestion working")

View File

@@ -0,0 +1,131 @@
"""Test Milestone 12: Tavily web search integration."""
import asyncio, os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
async def test_tavily_connection():
    """Test that the Tavily API is reachable and returns well-formed results.

    Returns:
        True when results came back and passed structural checks;
        False when no results were returned (e.g. missing API key).
    """
    from backend.agents.web_search import search_web

    print("Testing Tavily API connection...")
    results = await search_web("FastAPI rate limiting best practices", max_results=3)
    if not results:
        # Treat an empty result set as a configuration problem, not a crash.
        print(" ⚠️ No results returned (check TAVILY_API_KEY in .env)")
        print(" ⚠️ If key is missing, get one at: https://tavily.com")
        return False

    assert len(results) > 0, "Expected at least 1 result"
    assert results[0]["title"], "Result missing title"
    assert results[0]["url"], "Result missing URL"
    assert results[0]["content"], "Result missing content"
    print(f" ✅ Tavily returned {len(results)} results")
    for r in results:
        print(f" - {r['title'][:60]} ({r['url'][:50]}...)")
    return True
async def test_format_results():
    """Test formatting of search results into LLM-ready context text."""
    from backend.agents.web_search import search_web, format_search_results_for_llm

    print("\nTesting result formatting...")
    results = await search_web("Python async programming", max_results=2)
    if results:
        formatted = format_search_results_for_llm(results)
        # The formatter must label each result and cite its source.
        assert "[Web Result 1]" in formatted
        assert "Source:" in formatted
        assert len(formatted) > 50
        print(f" ✅ Formatted context: {len(formatted)} chars")
    else:
        # No network / no API key — nothing to format.
        print(" ⚠️ Skipped (no results to format)")
async def test_query_with_web_fallback():
    """Check query_knowledge falls back to web search when the group is empty."""
    from backend.pipeline import query_knowledge
    print("\nTesting query with web search fallback...")
    # A group with no stored signals forces the web-search fallback path.
    empty_group = "test_empty_web_m12"
    answer = await query_knowledge(empty_group, "What is the latest version of Python?")
    print(f" Answer: {answer[:150]}...")
    assert len(answer) > 20, f"Answer too short: {answer}"
    lowered = answer.lower()
    assert any(token in lowered for token in ("sources", "web", "python")), \
        "Expected web-sourced answer about Python"
    print(f" ✅ Web fallback produced a meaningful answer")
async def test_query_prefers_internal():
    """Verify internal (stored) knowledge outranks web results when both exist.

    Seeds the group with an explicit "Python 3.11" decision, then checks the
    answer reflects that internal fact rather than generic web information.
    """
    from backend.pipeline import process_message_batch, query_knowledge, set_lens
    print("\nTesting internal knowledge priority over web...")
    group_id = "test_internal_prio_m12"
    set_lens(group_id, "dev")
    # Seed some very specific internal knowledge
    messages = [
        {"sender": "Alex", "text": "Team decision: We are using Python 3.11 specifically, not 3.12, because of the ML library compatibility issue.", "timestamp": "2026-03-20T10:00:00Z"},
        {"sender": "Priya", "text": "Confirmed, 3.11 is locked in. I've updated the Dockerfile.", "timestamp": "2026-03-20T10:05:00Z"},
    ]
    await process_message_batch(group_id, messages)
    answer = await query_knowledge(group_id, "What Python version are we using?")
    print(f" Answer: {answer[:150]}...")
    # Should reference internal knowledge (3.11) not latest web info
    assert "3.11" in answer or "python" in answer.lower(), \
        f"Expected internal knowledge about Python 3.11, got: {answer[:100]}"
    print(f" ✅ Internal knowledge (Python 3.11) is prioritized in answer")
    # Cleanup: drop the scratch collection. Catch Exception rather than a bare
    # `except:` so KeyboardInterrupt/SystemExit still propagate.
    import chromadb
    from backend.config import CHROMA_DB_PATH
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    try:
        client.delete_collection(f"ll_{group_id}")
    except Exception:
        pass
async def test_explicit_search():
    """Exercise a direct web search, as the /search command would issue it."""
    from backend.agents.web_search import search_web
    print("\nTesting explicit web search (for /search command)...")
    results = await search_web("OWASP top 10 2025", max_results=3)
    if not results:
        print(" ⚠️ No results (Tavily key may be missing)")
        return
    assert len(results) <= 3
    print(f" ✅ Explicit search returned {len(results)} results")
    for hit in results:
        print(f" - {hit['title'][:60]}")
async def main():
    """Run all milestone-12 checks; skip web tests when Tavily is unreachable."""
    tavily_ok = await test_tavily_connection()
    if not tavily_ok:
        # Without a working Tavily key the remaining tests cannot run.
        print("\n⚠️ MILESTONE 12 PARTIAL — Tavily API key not configured")
        print(" The code is correct but needs a valid TAVILY_API_KEY in .env")
        print(" Get one free at: https://tavily.com")
        return
    await test_format_results()
    await test_query_with_web_fallback()
    await test_query_prefers_internal()
    await test_explicit_search()
    print("\n🎉 MILESTONE 12 PASSED — Web search integration working")
# Guard so importing this module does not kick off the async test run.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,229 @@
"""Test Milestone 13: Link fetch & ingestion."""
import asyncio, os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
def test_url_extraction():
    """Exercise extract_urls across the common message shapes."""
    from backend.agents.link_fetcher import extract_urls
    print("Testing URL extraction...")
    # Case 1: a single plain URL
    urls = extract_urls("Check this out https://example.com/article")
    assert urls == ["https://example.com/article"]
    print(f" ✅ Simple URL extracted")
    # Case 2: two URLs in one message
    urls = extract_urls("See https://github.com/issue/123 and also https://docs.python.org/3/library/asyncio.html for reference")
    assert len(urls) == 2
    print(f" ✅ Multiple URLs extracted: {len(urls)}")
    # Case 3: sentence-ending punctuation must not stick to the URL
    urls = extract_urls("Visit https://example.com/page.")
    assert len(urls) == 1 and not urls[0].endswith(".")
    print(f" ✅ Trailing punctuation stripped")
    # Case 4: plain text yields nothing
    assert extract_urls("This message has no links at all") == []
    print(f" ✅ No URLs returns empty list")
    # Case 5: query strings survive intact
    urls = extract_urls("https://example.com/search?q=test&page=2")
    assert len(urls) == 1 and "q=test" in urls[0]
    print(f" ✅ URL with query params preserved")
def test_should_fetch():
    """Exercise the should_fetch URL filter: content pages vs media/social links."""
    from backend.agents.link_fetcher import should_fetch
    print("\nTesting URL filter (should_fetch)...")
    # Content pages should be fetched. (Truthiness asserts instead of the
    # `== True` / `== False` anti-idiom.)
    fetchable = [
        "https://github.com/org/repo/issues/347",
        "https://docs.python.org/3/library/asyncio.html",
        "https://blog.example.com/how-to-rate-limit",
    ]
    for url in fetchable:
        assert should_fetch(url), f"Expected to fetch {url}"
    print(f" ✅ Valid URLs pass filter")
    # Images, archives and media should be skipped.
    skipped = [
        "https://example.com/photo.png",
        "https://example.com/image.jpg?size=large",
        "https://example.com/release.zip",
        "https://example.com/video.mp4",
    ]
    for url in skipped:
        assert not should_fetch(url), f"Expected to skip {url}"
    print(f" ✅ Image/download/media URLs filtered out")
    # Social/chat platform links should be skipped too.
    assert not should_fetch("https://t.me/somechannel/123")
    print(f" ✅ Social media URLs filtered out")
async def test_fetch_content():
    """Verify fetch_url_content returns text for good pages and None on failures."""
    from backend.agents.link_fetcher import fetch_url_content
    print("\nTesting URL content fetch...")
    # A reliable public page should yield text plus metadata.
    content = await fetch_url_content("https://httpbin.org/html")
    if content is None:
        print(f" ⚠️ httpbin.org unreachable (network may be restricted)")
    else:
        assert content["text"], "Expected text content"
        assert content["url"] == "https://httpbin.org/html"
        print(f" ✅ Fetched httpbin.org/html: {len(content['text'])} chars, title='{content['title'][:40]}'")
    # Every failure mode must degrade to None rather than raising.
    assert await fetch_url_content("https://httpbin.org/status/404") is None, "Expected None for 404 page"
    print(f" ✅ 404 page returns None (graceful failure)")
    assert await fetch_url_content("https://httpbin.org/delay/30", timeout=2.0) is None, "Expected None for timeout"
    print(f" ✅ Timeout returns None (graceful failure)")
    assert await fetch_url_content("https://this-domain-definitely-does-not-exist-12345.com") is None, "Expected None for invalid domain"
    print(f" ✅ Invalid domain returns None (graceful failure)")
async def test_summarization():
    """Test LLM summarization of fetched content.

    Uses a fixed in-memory sample article (no network fetch) and only
    sanity-bounds the summary length, since exact LLM wording varies.
    """
    from backend.agents.link_fetcher import summarize_content
    print("\nTesting content summarization...")
    sample_title = "Understanding Rate Limiting in FastAPI"
    sample_text = """Rate limiting is a technique to control the number of requests a client can make to an API.
    In FastAPI, you can implement rate limiting using middleware or third-party packages like slowapi.
    The most common approach is the token bucket algorithm, which allows burst traffic while maintaining
    an average rate. For production systems, consider using Redis as a backend for distributed rate limiting
    across multiple server instances. Key considerations include: setting appropriate limits per endpoint,
    using different limits for authenticated vs anonymous users, and returning proper 429 status codes
    with Retry-After headers."""
    summary = await summarize_content(sample_title, sample_text, "https://example.com/rate-limiting")
    # Bounds only: long enough to be substantive, short enough to be a summary.
    assert len(summary) > 20, f"Summary too short: {summary}"
    assert len(summary) < 1000, f"Summary too long: {len(summary)} chars"
    print(f" ✅ Summary generated: {summary[:100]}...")
async def test_full_link_pipeline():
    """End-to-end: message with URL → fetch → summarize → store → query.

    Degrades to a warning (not a failure) when httpbin.org is unreachable.
    """
    from backend.agents.link_fetcher import process_links_from_message
    from backend.db.chroma import store_signals, query_signals
    print("\nTesting full link ingestion pipeline...")
    group_id = "test_links_m13"
    # Simulate a message with a URL.
    # httpbin.org/html returns a simple, stable HTML page.
    message_text = "Check out this page for reference: https://httpbin.org/html"
    signals = await process_links_from_message(message_text, group_id, shared_by="Sam")
    if not signals:
        print(f" ⚠️ No signals produced (httpbin.org may be unreachable in this environment)")
        return
    assert len(signals) > 0
    assert signals[0]["type"] == "link_knowledge"
    assert signals[0]["group_id"] == group_id
    assert "@Sam" in signals[0]["entities"]
    print(f" ✅ Link pipeline produced {len(signals)} signals")
    # Store and query back.
    store_signals(group_id, signals)
    results = query_signals(group_id, "what was shared from the web")
    assert len(results) > 0, "Expected query results after storing link signals"
    print(f" ✅ Link signals stored and queryable ({len(results)} results)")
    # Cleanup: catch Exception (not a bare `except:`) so KeyboardInterrupt and
    # SystemExit are not silently swallowed during teardown.
    import chromadb
    from backend.config import CHROMA_DB_PATH
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    try:
        client.delete_collection(f"ll_{group_id}")
    except Exception:
        pass
async def test_mixed_with_chat_and_docs():
    """Verify chat, document and link signals coexist in one group and are queryable."""
    from backend.agents.link_fetcher import process_links_from_message
    from backend.agents.document_ingestor import ingest_document
    from backend.pipeline import process_message_batch, query_knowledge, set_lens
    from backend.db.chroma import store_signals
    import tempfile
    print("\nTesting all three signal types together...")
    group_id = "test_all_sources_m13"
    set_lens(group_id, "dev")
    # 1. Chat signals
    chat_messages = [
        {"sender": "Alex", "text": "We decided to use PostgreSQL for the main DB.", "timestamp": "2026-03-20T10:00:00Z"},
        {"sender": "Priya", "text": "I'll set up the schema and run migrations today.", "timestamp": "2026-03-20T10:05:00Z"},
    ]
    await process_message_batch(group_id, chat_messages)
    print(f" ✅ Chat signals stored")
    # 2. Document signals — write a throwaway policy file, ingest it, and make
    # sure the temp file is removed even if ingestion raises.
    tmp = tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False, encoding="utf-8")
    try:
        tmp.write("Security Policy: All API endpoints must use OAuth 2.0. JWT tokens expire after 1 hour.")
        tmp.close()
        doc_signals = ingest_document(tmp.name, group_id, shared_by="Priya", filename="security_policy.txt")
        store_signals(group_id, doc_signals)
    finally:
        os.unlink(tmp.name)
    print(f" ✅ Document signals stored")
    # 3. Link signals (best-effort: depends on outbound network access)
    link_signals = await process_links_from_message(
        "Relevant: https://httpbin.org/html",
        group_id,
        shared_by="Sam"
    )
    if link_signals:
        store_signals(group_id, link_signals)
        print(f" ✅ Link signals stored")
    else:
        print(f" ⚠️ Link signals skipped (network restriction)")
    # 4. Query across all sources
    answer = await query_knowledge(group_id, "What database are we using?")
    assert "postgres" in answer.lower() or "database" in answer.lower()
    print(f" ✅ Chat knowledge queryable: {answer[:80]}...")
    answer2 = await query_knowledge(group_id, "What is the security policy?")
    assert "oauth" in answer2.lower() or "jwt" in answer2.lower() or "security" in answer2.lower()
    print(f" ✅ Document knowledge queryable: {answer2[:80]}...")
    # Cleanup: catch Exception (not a bare `except:`) so Ctrl-C still propagates.
    import chromadb
    from backend.config import CHROMA_DB_PATH
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    try:
        client.delete_collection(f"ll_{group_id}")
    except Exception:
        pass
    print(f" ✅ All three signal types coexist and are queryable")
async def main():
    """Run every milestone-13 check in order (sync extraction tests first)."""
    test_url_extraction()
    test_should_fetch()
    await test_fetch_content()
    await test_summarization()
    await test_full_link_pipeline()
    await test_mixed_with_chat_and_docs()
    print("\n🎉 MILESTONE 13 PASSED — Link fetch & ingestion working")
# Guard so importing this module does not kick off the async test run.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,157 @@
"""
Test Milestone 14: Meet extension backend endpoints.
Tests the /api/meet/start and /api/meet/ingest endpoints directly (no Chrome needed).
"""
import asyncio
import os
import sys
import json
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
async def test_manifest_valid():
    """Validate meet_extension/manifest.json as a Manifest V3 extension."""
    import json
    manifest_path = os.path.join(
        os.path.dirname(__file__), '..', 'meet_extension', 'manifest.json'
    )
    assert os.path.exists(manifest_path), "manifest.json not found at meet_extension/manifest.json"
    with open(manifest_path) as f:
        manifest = json.load(f)
    # MV3 requires manifest_version 3, a service worker, and a popup action.
    assert manifest.get("manifest_version") == 3, "Must be Manifest V3"
    host_perms = str(manifest.get("host_permissions", []))
    assert "meet.google.com" in host_perms, \
        "Missing meet.google.com host permission"
    assert manifest.get("background", {}).get("service_worker"), "Missing service_worker"
    assert manifest.get("action", {}).get("default_popup"), "Missing popup"
    print(" ✅ manifest.json is valid MV3")
async def test_extension_files_exist():
    """Check that every required Chrome-extension file is present on disk."""
    base = os.path.join(os.path.dirname(__file__), '..', 'meet_extension')
    required = ["manifest.json", "content.js", "popup.html", "popup.js", "background.js"]
    for filename in required:
        path = os.path.join(base, filename)
        # Name the specific file in both the failure message and success output
        # (previously these f-strings contained no interpolation, so the
        # messages never said which file was involved).
        assert os.path.exists(path), f"Missing extension file: {filename}"
        print(f" ✅ {filename} exists")
async def test_meet_start_endpoint():
    """Verify /api/meet/start authenticates via the shared-secret header."""
    import httpx
    from backend.config import MEET_INGEST_SECRET
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        # Correct secret → 200 with ok=True.
        good_payload = {
            "meeting_id": "bje-xogw-kcv",
            "group_id": "bje_xogw_kcv",
            "started_at": "2026-03-21T10:00:00Z",
            "speaker": "Raj Bhattacharyya",
        }
        resp = await client.post(
            "/api/meet/start",
            json=good_payload,
            headers={"X-ThirdEye-Secret": MEET_INGEST_SECRET},
        )
        assert resp.status_code == 200, f"Expected 200, got {resp.status_code}: {resp.text}"
        assert resp.json().get("ok") is True
        print(f" ✅ /api/meet/start returned ok=True for meeting bje-xogw-kcv")
        # Wrong secret → 403.
        resp_bad = await client.post(
            "/api/meet/start",
            json={"meeting_id": "fake", "group_id": "fake", "started_at": "2026-03-21T10:00:00Z"},
            headers={"X-ThirdEye-Secret": "wrong_secret"},
        )
        assert resp_bad.status_code == 403, f"Expected 403 for bad secret, got {resp_bad.status_code}"
        print(" ✅ Bad secret correctly rejected with 403")
async def test_meet_ingest_endpoint():
    """Test /api/meet/ingest accepts a transcript chunk and queues processing.

    Posts one chunk, waits for the server's background task to run, then
    confirms the meeting shows up in /api/meet/meetings.
    """
    import httpx
    from backend.config import MEET_INGEST_SECRET
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        resp = await client.post(
            "/api/meet/ingest",
            json={
                "meeting_id": "bje-xogw-kcv",
                "group_id": "bje_xogw_kcv",
                "chunk_index": 0,
                "text": "We decided to go with PostgreSQL for the primary database. "
                "Alex will set up the schema by Thursday. "
                "The migration scripts need to be reviewed before deployment.",
                "speaker": "Raj Bhattacharyya",
                "timestamp": "2026-03-21T10:01:00Z",
                "is_final": False,
            },
            headers={"X-ThirdEye-Secret": MEET_INGEST_SECRET},
            timeout=10.0,
        )
        assert resp.status_code == 200, f"Expected 200, got {resp.status_code}: {resp.text}"
        data = resp.json()
        assert data.get("ok") is True
        assert data.get("queued") is True
        print(f" ✅ /api/meet/ingest chunk accepted and queued")
    # Wait briefly for background task to process
    await asyncio.sleep(5)
    # Verify the meeting appears in /api/meet/meetings
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        resp = await client.get("/api/meet/meetings")
        assert resp.status_code == 200
        data = resp.json()
        meetings = data.get("meetings", [])
        ids = [m["meeting_id"] for m in meetings]
        assert "bje-xogw-kcv" in ids, f"Meeting bje-xogw-kcv not found in {ids}"
        print(f" ✅ Meeting bje-xogw-kcv visible in /api/meet/meetings")
async def test_meet_skip_short_chunk():
    """Verify chunks below the minimum useful length are skipped, not processed."""
    import httpx
    from backend.config import MEET_INGEST_SECRET
    payload = {
        "meeting_id": "bje-xogw-kcv",
        "group_id": "bje_xogw_kcv",
        "chunk_index": 99,
        "text": "Uh",  # Too short
        "speaker": "Raj Bhattacharyya",
        "timestamp": "2026-03-21T10:02:00Z",
        "is_final": False,
    }
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        resp = await client.post(
            "/api/meet/ingest",
            json=payload,
            headers={"X-ThirdEye-Secret": MEET_INGEST_SECRET},
        )
    assert resp.status_code == 200
    assert resp.json().get("skipped") is True, "Expected short chunk to be skipped"
    print(f" ✅ Short chunk correctly skipped")
async def main():
    """Run milestone-14 checks; the endpoint tests need the API server up."""
    print("Running Milestone 14 tests...\n")
    print("NOTE: The FastAPI server must be running: python run_api.py\n")
    # File-level checks run without a server.
    await test_manifest_valid()
    await test_extension_files_exist()
    try:
        # These three hit http://localhost:8000 and fail if the server is down.
        await test_meet_start_endpoint()
        await test_meet_ingest_endpoint()
        await test_meet_skip_short_chunk()
        print("\n🎉 MILESTONE 14 PASSED — Extension files valid, backend endpoints working")
    except Exception as e:
        print(f"\n💥 MILESTONE 14 FAILED: {e}")
        print(" Make sure: python run_api.py is running before running this test")
        raise
# Guard so importing this module does not kick off the async test run.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,245 @@
"""
Test Milestone 15: Meet transcript processing agent.
Tests signal extraction from transcript text WITHOUT needing the extension or Chrome.
"""
import asyncio
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
# Sample meeting transcript (realistic, multi-topic): contains an explicit
# decision (PostgreSQL), an action item (schema by Thursday), a blocker
# (schema sign-off) and a risk (legacy MySQL data migration).
SAMPLE_TRANSCRIPT_1 = """
Alex: Alright, let's get started. So the main thing today is the database migration.
Sam: Yeah, we've been going back and forth but I think we should just commit to PostgreSQL.
It has better support for our JSON query patterns and the team already knows it.
Alex: Agreed, let's make that the decision. We go with PostgreSQL.
Priya: I can set up the initial schema. I'll have it ready by Thursday.
Sam: Great. One thing though — the legacy MySQL tables still have some data we need to migrate.
I have no idea how long that's going to take. That's a real risk.
Alex: Who's owning the migration scripts?
Priya: I'll do it, but I'll need the final schema signed off before I start. That's a blocker for me.
Sam: When do we need the migration done by?
Alex: End of sprint, so March 28th.
Sam: Can we do that? I'm not sure.
Alex: We'll try. Priya, can you at least start the schema this week?
Priya: Yes, schema by Thursday, migration scripts next week if all goes well.
"""
# Second transcript: blocker- and risk-heavy (design-spec blocker, checkout
# timeouts, OAuth auth-layer risk) — used by the chunk-2 and final-chunk tests.
SAMPLE_TRANSCRIPT_2 = """
Lisa: Moving on — the client dashboard is still blocked waiting on design specs.
Alex: Yeah that's been two weeks now. We literally cannot start without those specs.
Lisa: I know, I'll follow up with design today. That's on me.
Sam: Also, the checkout endpoint is still hitting intermittent timeouts.
Third time this sprint. We need to actually fix this, not just restart pods.
Alex: Agreed, that needs an owner. Sam can you pick that up?
Sam: Yeah I'll investigate this week. I'll add a ticket.
Lisa: Any risks before we close?
Priya: The OAuth integration is touching a lot of the auth layer.
If something breaks there, it could affect all our users at once. High risk.
Alex: Good call. Let's make sure we do that in a feature branch and have a rollback plan.
"""
async def test_signal_extraction_chunk_1():
    """Test extraction from a decision-heavy transcript.

    Feeds SAMPLE_TRANSCRIPT_1 through process_meet_chunk and checks that the
    raw chunk, at least one decision and at least one action item are
    produced and queryable from ChromaDB.
    """
    from backend.agents.meet_ingestor import process_meet_chunk
    from backend.db.chroma import query_signals
    import chromadb
    from backend.config import CHROMA_DB_PATH
    group_id = "test_meet_m15_a"
    meeting_id = "sprint-planning-m15"
    print("Testing signal extraction from transcript chunk 1 (decisions + action items)...")
    signals = await process_meet_chunk(
        meeting_id=meeting_id,
        group_id=group_id,
        chunk_index=0,
        text=SAMPLE_TRANSCRIPT_1.strip(),
        speaker="Alex",
        timestamp="2026-03-21T10:00:00Z",
        is_final=False,
    )
    assert len(signals) > 0, "Expected at least some signals to be extracted"
    print(f" ✅ {len(signals)} total signals produced")
    types = [s["type"] for s in signals]
    print(f" Types found: {set(types)}")
    # Must have at least a raw chunk
    assert "meet_chunk_raw" in types, "Expected raw chunk signal"
    print(" ✅ Raw chunk stored (enables full-text search)")
    # Should have extracted decisions (PostgreSQL decision is clear)
    decisions = [s for s in signals if s["type"] == "meet_decision"]
    assert len(decisions) > 0, "Expected at least one decision (PostgreSQL decision is explicit)"
    print(f" ✅ {len(decisions)} decision signal(s) extracted")
    print(f" First decision: {decisions[0]['summary'][:100]}")
    # Should have extracted action items (Priya - schema by Thursday)
    actions = [s for s in signals if s["type"] == "meet_action_item"]
    assert len(actions) > 0, "Expected at least one action item (Priya - schema by Thursday)"
    print(f" ✅ {len(actions)} action item(s) extracted")
    print(f" First action: {actions[0]['summary'][:100]}")
    # Verify signals are in ChromaDB
    results = query_signals(group_id, "database decision PostgreSQL")
    assert len(results) > 0, "Expected signals to be queryable from ChromaDB"
    print(f" ✅ Signals queryable from ChromaDB ({len(results)} results for 'database decision PostgreSQL')")
    # Cleanup
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    try:
        client.delete_collection(f"ll_{group_id}")
    except Exception:
        pass
async def test_signal_extraction_chunk_2():
    """Extract blockers and risks from a standup-style transcript chunk."""
    from backend.agents.meet_ingestor import process_meet_chunk
    import chromadb
    from backend.config import CHROMA_DB_PATH
    group_id = "test_meet_m15_b"
    meeting_id = "standup-m15"
    print("\nTesting signal extraction from transcript chunk 2 (blockers + risks)...")
    signals = await process_meet_chunk(
        meeting_id=meeting_id,
        group_id=group_id,
        chunk_index=0,
        text=SAMPLE_TRANSCRIPT_2.strip(),
        speaker="Lisa",
        timestamp="2026-03-21T10:30:00Z",
        is_final=False,
    )
    kinds = [s["type"] for s in signals]
    print(f" Types found: {set(kinds)}")
    blockers = [s for s in signals if s["type"] == "meet_blocker"]
    risks = [s for s in signals if s["type"] == "meet_risk"]
    assert blockers, "Expected at least one blocker (dashboard blocked on design specs)"
    print(f" ✅ {len(blockers)} blocker(s) extracted")
    print(f" First blocker: {blockers[0]['summary'][:100]}")
    assert risks, "Expected at least one risk (OAuth touching auth layer)"
    print(f" ✅ {len(risks)} risk(s) extracted")
    print(f" First risk: {risks[0]['summary'][:100]}")
    # Drop the throwaway collection so reruns start clean.
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    try:
        client.delete_collection(f"ll_{group_id}")
    except Exception:
        pass
async def test_final_chunk_generates_summary():
    """Verify the last chunk (is_final=True) produces a meet_summary signal."""
    from backend.agents.meet_ingestor import process_meet_chunk
    import chromadb
    from backend.config import CHROMA_DB_PATH
    group_id = "test_meet_m15_c"
    meeting_id = "full-meeting-m15"
    print("\nTesting final chunk triggers meeting summary...")

    async def feed(index, text, speaker, ts, final):
        # Thin wrapper so both chunks share identical plumbing.
        return await process_meet_chunk(
            meeting_id=meeting_id,
            group_id=group_id,
            chunk_index=index,
            text=text,
            speaker=speaker,
            timestamp=ts,
            is_final=final,
        )

    # Intermediate chunk: no summary expected yet.
    await feed(0, SAMPLE_TRANSCRIPT_1.strip(), "Alex", "2026-03-21T10:00:00Z", False)
    # Closing chunk: must trigger summary generation.
    signals = await feed(1, SAMPLE_TRANSCRIPT_2.strip(), "Lisa", "2026-03-21T10:30:00Z", True)
    kinds = [s["type"] for s in signals]
    assert "meet_summary" in kinds, "Expected a meet_summary signal on is_final=True"
    summary_sig = next(s for s in signals if s["type"] == "meet_summary")
    assert len(summary_sig["summary"]) > 50, "Summary should be at least 50 chars"
    print(f" ✅ Meeting summary generated ({len(summary_sig['summary'])} chars)")
    print(f" Preview: {summary_sig['summary'][:150]}...")
    # Remove the scratch collection so reruns start clean.
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    try:
        client.delete_collection(f"ll_{group_id}")
    except Exception:
        pass
async def test_signals_coexist_with_chat_signals():
    """Test that meet signals are queryable alongside existing chat signals.

    Seeds chat history (Redis decision) plus a meeting chunk that supersedes
    it (Memcached), then issues one query over the shared group.
    """
    from backend.agents.meet_ingestor import process_meet_chunk
    from backend.pipeline import process_message_batch, query_knowledge, set_lens
    import chromadb
    from backend.config import CHROMA_DB_PATH
    group_id = "test_meet_m15_d"
    meeting_id = "integration-test-m15"
    set_lens(group_id, "dev")
    print("\nTesting meet signals + chat signals coexist...")
    # Add chat signals (earlier Redis decision + an unresolved timeout bug)
    chat_messages = [
        {"sender": "Alex", "text": "The team agreed in a previous meeting we'd use Redis for caching.", "timestamp": "2026-03-20T09:00:00Z"},
        {"sender": "Priya", "text": "The timeout bug on checkout is still unresolved from last sprint.", "timestamp": "2026-03-20T09:05:00Z"},
    ]
    await process_message_batch(group_id, chat_messages)
    print(" ✅ Chat signals stored")
    # Add meet signals (newer decision contradicting the chat history)
    await process_meet_chunk(
        meeting_id=meeting_id,
        group_id=group_id,
        chunk_index=0,
        text="We decided in today's meeting to switch from Redis to Memcached for the caching layer. Sam will update the config by Friday.",
        speaker="Alex",
        timestamp="2026-03-21T10:00:00Z",
        is_final=False,
    )
    print(" ✅ Meet signals stored")
    # Query across both sources in one call
    answer = await query_knowledge(group_id, "What did we decide about caching?")
    assert len(answer) > 20, "Expected a substantive answer about caching"
    print(f" ✅ Query across chat + meet: {answer[:120]}...")
    # Cleanup
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    try:
        client.delete_collection(f"ll_{group_id}")
    except Exception:
        pass
async def main():
    """Run all milestone-15 transcript-agent checks sequentially."""
    print("Running Milestone 15 tests...\n")
    await test_signal_extraction_chunk_1()
    await test_signal_extraction_chunk_2()
    await test_final_chunk_generates_summary()
    await test_signals_coexist_with_chat_signals()
    print("\n🎉 MILESTONE 15 PASSED — Meet transcript agent extracting and storing signals correctly")
# Guard so importing this module does not kick off the async test run.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,198 @@
"""
Test Milestone 16: Meet Telegram commands and cross-reference agent.
Tests command logic and cross-reference analysis without needing a live Telegram bot.
"""
import asyncio
import os
import sys
import json
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
# ─── Seed data ────────────────────────────────────────────────────────────────
async def seed_meet_signals(meeting_id: str, group_id: str):
    """Seed a meeting with realistic signals.

    The transcript contains a decision, an action item, a blocker, a risk
    and an unanswered question, so every extraction path has material.
    """
    from backend.agents.meet_ingestor import process_meet_chunk
    transcript = """
    Alex: We decided to go with PostgreSQL. Final decision, no more debate.
    Priya: I'll set up the schema by Thursday and own the migration.
    Sam: The checkout timeout bug is still blocking us. Third time this sprint.
    Lisa: Dashboard is still waiting on design specs. That's a two-week blocker.
    Alex: The OAuth refactor is risky — touches everything. High risk if we rush it.
    Sam: Should we delay the deploy to next week? Nobody answered.
    """
    await process_meet_chunk(
        meeting_id=meeting_id,
        group_id=group_id,
        chunk_index=0,
        text=transcript.strip(),
        speaker="Alex",
        timestamp="2026-03-21T10:00:00Z",
        is_final=True,  # triggers summary generation
    )
async def seed_chat_signals(chat_group_id: str):
    """Seed Telegram chat signals to cross-reference against.

    Deliberately overlaps with the seeded meeting: MySQL-vs-PostgreSQL is a
    contradiction, the checkout timeout is a confirmation.
    """
    from backend.pipeline import process_message_batch, set_lens
    set_lens(chat_group_id, "dev")
    messages = [
        {"sender": "Alex", "text": "We should switch to MySQL instead of PostgreSQL. It's simpler.", "timestamp": "2026-03-15T09:00:00Z"},
        {"sender": "Sam", "text": "The checkout timeout is back again. Still not fixed.", "timestamp": "2026-03-18T10:00:00Z"},
        {"sender": "Priya", "text": "OAuth integration is going into the main branch directly, no feature branch.", "timestamp": "2026-03-19T11:00:00Z"},
    ]
    await process_message_batch(chat_group_id, messages)
# ─── Tests ────────────────────────────────────────────────────────────────────
async def test_meetsum_logic():
    """Verify a seeded meeting's signals can be retrieved as /meetsum would."""
    from backend.db.chroma import query_signals
    import chromadb
    from backend.config import CHROMA_DB_PATH
    meeting_id = "test-meet-m16-sum"
    group_id = "test_meet_m16_sum"
    print("Testing /meetsum logic...")
    await seed_meet_signals(meeting_id, group_id)
    # Same lookup the /meetsum handler performs.
    signals = query_signals(group_id, meeting_id, n_results=20)
    assert len(signals) > 0, "Expected signals for the seeded meeting"
    print(f" ✅ {len(signals)} signals found for meeting {meeting_id}")
    kinds = {s["metadata"]["type"] for s in signals if "metadata" in s}
    print(f" Signal types: {kinds}")
    has_summary = "meet_summary" in kinds
    structured = ("meet_decision", "meet_action_item", "meet_blocker", "meet_risk")
    has_structured = any(k in kinds for k in structured)
    assert has_summary or has_structured, "Expected summary or structured signals"
    print(f" ✅ Has summary: {has_summary} | Has structured signals: {has_structured}")
    # Drop the scratch collection.
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    try:
        client.delete_collection(f"ll_{group_id}")
    except Exception:
        pass
async def test_meetask_logic():
    """Verify Q&A over a seeded meeting, as /meetask would run it."""
    from backend.pipeline import query_knowledge
    import chromadb
    from backend.config import CHROMA_DB_PATH
    meeting_id = "test-meet-m16-ask"
    group_id = "test_meet_m16_ask"
    print("\nTesting /meetask logic...")
    await seed_meet_signals(meeting_id, group_id)
    # Question 1: the database decision.
    answer = await query_knowledge(group_id, f"From meeting {meeting_id}: What database did we decide on?")
    assert len(answer) > 10, "Expected a substantive answer"
    lowered = answer.lower()
    postgres_mentioned = "postgres" in lowered or "database" in lowered or "sql" in lowered
    assert postgres_mentioned, f"Expected PostgreSQL to be mentioned. Got: {answer[:200]}"
    print(f" ✅ Decision query answered: {answer[:120]}...")
    # Question 2: schema ownership (action item).
    answer2 = await query_knowledge(group_id, f"From meeting {meeting_id}: Who is setting up the schema?")
    assert len(answer2) > 10, "Expected a substantive answer about schema ownership"
    print(f" ✅ Action item query answered: {answer2[:120]}...")
    # Drop the scratch collection.
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    try:
        client.delete_collection(f"ll_{group_id}")
    except Exception:
        pass
async def test_meetmatch_cross_reference():
    """Test cross-reference finds contradictions and confirmations.

    Seeds one meeting group and one chat group, runs find_cross_references
    across them, then checks the Telegram formatter output.
    """
    from backend.agents.meet_cross_ref import find_cross_references, format_cross_ref_for_telegram
    import chromadb
    from backend.config import CHROMA_DB_PATH
    meeting_id = "test-meet-m16-match"
    meet_group = "test_meet_m16_match"
    chat_group = "test_chat_m16_match"
    print("\nTesting /meetmatch cross-reference...")
    await seed_meet_signals(meeting_id, meet_group)
    await seed_chat_signals(chat_group)
    # Run cross-reference
    analysis = await find_cross_references(
        meeting_id=meeting_id,
        group_id=meet_group,
        cross_ref_group_ids=[chat_group],
    )
    if analysis.get("error"):
        print(f" ⚠️ Cross-reference returned error: {analysis['error']}")
        # This is OK if the groups are empty — test passes but notes the condition
    else:
        contradictions = analysis.get("contradictions", [])
        confirmations = analysis.get("confirmations", [])
        blind_spots = analysis.get("blind_spots", [])
        print(f" Found: {len(contradictions)} contradiction(s), {len(confirmations)} confirmation(s), {len(blind_spots)} blind spot(s)")
        # We seeded a clear contradiction: meeting says PostgreSQL, chat says MySQL
        # And a confirmation: checkout timeout mentioned in both
        # At least one of these categories should have results
        total = len(contradictions) + len(confirmations) + len(blind_spots)
        assert total > 0, "Expected at least one cross-reference finding (contradiction: PostgreSQL vs MySQL)"
        print(f" ✅ {total} total cross-reference findings")
        if contradictions:
            print(f" ✅ Contradiction found: {contradictions[0]['meeting_signal'][:80]}")
        if confirmations:
            print(f" ✅ Confirmation found: {confirmations[0]['meeting_signal'][:80]}")
    # Test formatter
    formatted = format_cross_ref_for_telegram(analysis, meeting_id)
    assert len(formatted) > 20, "Expected a non-empty formatted message"
    assert meeting_id in formatted, "Expected meeting ID in formatted output"
    print(f" ✅ Telegram formatter produced {len(formatted)} char message")
    print(f" Preview: {formatted[:200]}...")
    # Cleanup
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    for gid in [meet_group, chat_group]:
        try:
            client.delete_collection(f"ll_{gid}")
        except Exception:
            pass
async def test_command_handlers_importable():
"""Test that all three command handlers can be imported without errors."""
try:
from backend.bot.bot import cmd_meetsum, cmd_meetask, cmd_meetmatch
print("\n ✅ cmd_meetsum importable")
print(" ✅ cmd_meetask importable")
print(" ✅ cmd_meetmatch importable")
except ImportError as e:
print(f"\n ❌ Command import failed: {e}")
raise
async def main():
print("Running Milestone 16 tests...\n")
await test_meetsum_logic()
await test_meetask_logic()
await test_meetmatch_cross_reference()
await test_command_handlers_importable()
print("\n🎉 MILESTONE 16 PASSED — Meet commands working, cross-reference finding connections")
asyncio.run(main())

View File

@@ -0,0 +1,184 @@
"""
Test Milestone 17: Jira API client.
Tests real API connectivity — your .env must have valid Jira credentials.
"""
import asyncio
import os
import sys
# Fix Windows console encoding for emoji support
if sys.platform == "win32":
import codecs
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer, errors="replace")
sys.stderr = codecs.getwriter("utf-8")(sys.stderr.buffer, errors="replace")
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
async def test_config_loaded():
"""Test that all Jira config vars are present."""
from backend.config import (
JIRA_BASE_URL, JIRA_EMAIL, JIRA_API_TOKEN,
JIRA_DEFAULT_PROJECT, ENABLE_JIRA
)
checks = {
"JIRA_BASE_URL": JIRA_BASE_URL,
"JIRA_EMAIL": JIRA_EMAIL,
"JIRA_API_TOKEN": JIRA_API_TOKEN,
"JIRA_DEFAULT_PROJECT": JIRA_DEFAULT_PROJECT,
}
all_pass = True
for name, val in checks.items():
ok = bool(val and len(val) >= 2)
status = "" if ok else "❌ MISSING"
if not ok:
all_pass = False
print(f" {status} {name}: {val[:30] if val else '(empty)'}...")
assert all_pass, "Fix missing Jira config vars in .env before continuing"
print(f" ✅ ENABLE_JIRA: {ENABLE_JIRA}")
async def test_connection():
"""Test that credentials are valid and can reach the Jira API."""
from backend.integrations.jira_client import test_connection
print("\nTesting Jira API connection...")
result = await test_connection()
assert result.get("ok"), f"Connection failed: {result.get('error')}"
print(f" ✅ Connected as: {result['display_name']} ({result['email']})")
print(f" Account ID: {result['account_id']}")
async def test_list_projects():
"""Test listing projects — must return at least one."""
from backend.integrations.jira_client import list_projects
from backend.config import JIRA_DEFAULT_PROJECT
print("\nTesting list_projects()...")
projects = await list_projects()
assert len(projects) > 0, "No projects returned. Make sure your account has at least one project."
print(f" ✅ Found {len(projects)} project(s):")
for p in projects[:5]:
print(f" [{p['key']}] {p['name']}")
keys = [p["key"] for p in projects]
assert JIRA_DEFAULT_PROJECT in keys, (
f"JIRA_DEFAULT_PROJECT '{JIRA_DEFAULT_PROJECT}' not found in your projects: {keys}\n"
"Update JIRA_DEFAULT_PROJECT in .env to one of the listed keys."
)
print(f" ✅ Default project '{JIRA_DEFAULT_PROJECT}' exists")
async def test_list_issue_types():
"""Test listing issue types for the default project."""
from backend.integrations.jira_client import list_issue_types
from backend.config import JIRA_DEFAULT_PROJECT
print("\nTesting list_issue_types()...")
types = await list_issue_types(JIRA_DEFAULT_PROJECT)
assert len(types) > 0, f"No issue types returned for project {JIRA_DEFAULT_PROJECT}"
names = [t["name"] for t in types]
print(f" ✅ Issue types in '{JIRA_DEFAULT_PROJECT}': {names}")
async def test_create_and_get_issue():
"""Test creating a real Jira issue and then retrieving it."""
from backend.integrations.jira_client import create_issue, get_issue
from backend.config import JIRA_DEFAULT_PROJECT
print("\nTesting create_issue() and get_issue()...")
result = await create_issue(
project_key=JIRA_DEFAULT_PROJECT,
summary="[ThirdEye Test] Milestone 17 verification ticket",
description=(
"This ticket was created automatically by the ThirdEye test suite.\n\n"
"It verifies that the Jira API client can create issues successfully.\n\n"
"Safe to close or delete."
),
issue_type="Task",
priority="Low",
labels=["thirdeye", "test", "automated"],
)
assert result.get("ok"), f"create_issue failed: {result.get('error')} — details: {result.get('details')}"
issue_key = result["key"]
print(f" ✅ Created issue: {issue_key}")
print(f" URL: {result['url']}")
# Retrieve it
issue = await get_issue(issue_key)
assert issue["key"] == issue_key
assert "ThirdEye Test" in issue["summary"]
assert issue["status"] in ("To Do", "Open", "Backlog", "New")
print(f" ✅ Retrieved issue: [{issue['key']}] {issue['summary']}")
print(f" Status: {issue['status']} | Priority: {issue['priority']}")
return issue_key
async def test_search_issues(issue_key: str):
"""Test searching issues by JQL."""
from backend.integrations.jira_client import search_issues
from backend.config import JIRA_DEFAULT_PROJECT
print("\nTesting search_issues() via JQL...")
jql = f'project = {JIRA_DEFAULT_PROJECT} AND labels = "thirdeye" AND labels = "test" ORDER BY created DESC'
results = await search_issues(jql, max_results=5)
assert len(results) > 0, f"Expected at least one result for JQL: {jql}"
keys = [r["key"] for r in results]
assert issue_key in keys, f"Newly created issue {issue_key} not found in search results"
print(f" ✅ JQL search returned {len(results)} result(s), including {issue_key}")
async def test_add_comment(issue_key: str):
"""Test adding a comment to an existing issue."""
from backend.integrations.jira_client import add_comment
print("\nTesting add_comment()...")
result = await add_comment(
issue_key,
"ThirdEye test comment — verifying comment API works correctly."
)
assert result.get("ok"), f"add_comment failed: {result.get('error')}"
print(f" ✅ Comment added to {issue_key} (comment id: {result.get('id')})")
async def test_adf_conversion():
"""Test that _text_to_adf produces valid ADF structure."""
from backend.integrations.jira_client import _text_to_adf
print("\nTesting ADF conversion...")
text = "This is paragraph one.\n\nThis is paragraph two.\n\n- Bullet A\n- Bullet B"
adf = _text_to_adf(text)
assert adf["type"] == "doc"
assert adf["version"] == 1
assert len(adf["content"]) >= 2
print(f" ✅ ADF produced {len(adf['content'])} content block(s)")
# Empty text should not crash
adf_empty = _text_to_adf("")
assert adf_empty["type"] == "doc"
print(" ✅ Empty text handled gracefully")
async def main():
print("Running Milestone 17 tests...\n")
await test_config_loaded()
await test_connection()
await test_list_projects()
await test_list_issue_types()
issue_key = await test_create_and_get_issue()
await test_search_issues(issue_key)
await test_add_comment(issue_key)
await test_adf_conversion()
print(f"\n🎉 MILESTONE 17 PASSED — Jira API client working. Test ticket: {issue_key}")
asyncio.run(main())

View File

@@ -0,0 +1,206 @@
"""
Test Milestone 18: Jira Signal Agent.
Seeds real signals and raises actual Jira tickets.
Requires Milestone 17 (Jira client) to be passing.
"""
import asyncio
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
# ─── Sample signals ───────────────────────────────────────────────────────────
SAMPLE_SIGNALS = [
{
"id": "test-signal-001",
"type": "recurring_bug",
"summary": "Checkout endpoint hits intermittent timeout — third time this sprint. Restarting the pod is the workaround.",
"raw_quote": "Sam: Timeout error AGAIN. That's the third time. We have a systemic issue here.",
"severity": "high",
"status": "open",
"sentiment": "negative",
"urgency": "high",
"entities": ["@Sam", "@Alex"],
"keywords": ["timeout", "checkout", "pod", "systemic"],
"timestamp": "2026-03-21T09:00:00Z",
"group_id": "acme_dev",
"lens": "dev",
},
{
"id": "test-signal-002",
"type": "tech_debt",
"summary": "JWT secret is hardcoded in auth service. Will move to Vault later, no timeline set.",
"raw_quote": "Alex: For the auth service, I'm hardcoding the JWT secret for now. We'll move to vault later.",
"severity": "medium",
"status": "open",
"sentiment": "neutral",
"urgency": "low",
"entities": ["@Alex"],
"keywords": ["jwt", "hardcode", "vault", "auth", "secret"],
"timestamp": "2026-03-21T09:05:00Z",
"group_id": "acme_dev",
"lens": "dev",
},
{
"id": "test-signal-003",
"type": "meet_blocker",
"summary": "Dashboard spec has been blocked waiting on design for two weeks. Dev cannot start work.",
"raw_quote": "Alex: Still no dashboard specs from design. This is blocking my entire sprint work.",
"severity": "high",
"status": "open",
"sentiment": "negative",
"urgency": "high",
"entities": ["@Alex", "@design"],
"keywords": ["dashboard", "blocked", "design", "specs", "sprint"],
"timestamp": "2026-03-21T10:00:00Z",
"group_id": "meet_sessions",
"lens": "meet",
"meeting_id": "sprint-planning-test",
},
]
# A signal type that should NOT be raised (raw chunk is not a raiseable type)
NON_RAISEABLE_SIGNAL = {
"id": "test-signal-999",
"type": "meet_chunk_raw",
"summary": "Raw transcript chunk — should not be raised as a ticket",
"raw_quote": "...",
"severity": "low",
"status": "open",
"sentiment": "neutral",
"urgency": "none",
"entities": [],
"keywords": [],
"timestamp": "2026-03-21T10:00:00Z",
"group_id": "meet_sessions",
"lens": "meet",
}
async def test_ticket_generation():
"""Test that LLM generates a valid ticket from a signal."""
from backend.agents.jira_agent import generate_ticket_content
print("Testing LLM ticket content generation...")
signal = SAMPLE_SIGNALS[0] # recurring_bug
content = await generate_ticket_content(signal)
assert "summary" in content and len(content["summary"]) > 5, "Summary too short or missing"
assert len(content["summary"]) <= 100, f"Summary exceeds 100 chars: {len(content['summary'])}"
assert "description" in content and len(content["description"]) > 30, "Description too short"
assert "labels" in content and "thirdeye" in content["labels"], "Missing 'thirdeye' label"
assert "assignee_name" in content # can be None, that's fine
print(f" ✅ Summary ({len(content['summary'])} chars): {content['summary']}")
print(f" ✅ Description ({len(content['description'])} chars)")
print(f" ✅ Labels: {content['labels']}")
print(f" ✅ Assignee hint: {content.get('assignee_name')}")
async def test_raise_single_ticket():
"""Test raising a single ticket for a real signal."""
from backend.agents.jira_agent import raise_ticket_for_signal
print("\nTesting raise_ticket_for_signal()...")
signal = SAMPLE_SIGNALS[0] # recurring_bug, high severity
group_id = "test_jira_m18"
result = await raise_ticket_for_signal(signal, group_id, force=True)
assert result.get("ok"), f"raise_ticket_for_signal failed: {result}"
print(f" ✅ Ticket raised: {result['key']}")
print(f" URL: {result['url']}")
print(f" Type: {result['issue_type']} | Priority: {result['priority']}")
print(f" Summary: {result['summary'][:90]}")
return result["key"]
async def test_dedup_prevents_double_raise():
"""Test that the same signal cannot be raised twice."""
from backend.agents.jira_agent import raise_ticket_for_signal
from backend.db.chroma import mark_signal_as_raised
print("\nTesting dedup — cannot raise the same signal twice...")
signal = SAMPLE_SIGNALS[1] # tech_debt
group_id = "test_jira_m18_dedup"
# First raise
result1 = await raise_ticket_for_signal(signal, group_id, force=True)
assert result1.get("ok"), f"First raise failed: {result1}"
print(f" ✅ First raise succeeded: {result1['key']}")
# Second raise of the same signal — should be blocked
result2 = await raise_ticket_for_signal(signal, group_id, force=False)
assert not result2.get("ok"), "Expected second raise to be blocked"
assert result2.get("reason") == "already_raised", f"Expected 'already_raised', got: {result2.get('reason')}"
print(f" ✅ Second raise correctly blocked: reason='{result2['reason']}'")
async def test_non_raiseable_signal():
"""Test that non-raiseable signal types are rejected."""
from backend.agents.jira_agent import raise_ticket_for_signal
print("\nTesting non-raiseable signal type rejection...")
result = await raise_ticket_for_signal(NON_RAISEABLE_SIGNAL, "test_group", force=True)
assert not result.get("ok")
assert result.get("reason") == "not_raiseable"
print(f" ✅ Non-raiseable type correctly rejected: {NON_RAISEABLE_SIGNAL['type']}")
async def test_bulk_raise():
"""Test bulk raising multiple signals at once."""
from backend.agents.jira_agent import bulk_raise_for_group
print("\nTesting bulk_raise_for_group()...")
group_id = "test_jira_m18_bulk"
# Mix of raiseable and non-raiseable, different severities
all_signals = SAMPLE_SIGNALS + [NON_RAISEABLE_SIGNAL]
results = await bulk_raise_for_group(
group_id=group_id,
signals=all_signals,
min_severity="medium", # low severity signals should be skipped
max_tickets=5,
)
raised = [r for r in results if r.get("ok")]
skipped_type = [r for r in results if r.get("reason") == "not_raiseable"]
assert len(raised) >= 1, "Expected at least 1 ticket raised from bulk"
print(f" ✅ Bulk raised {len(raised)} ticket(s) from {len(all_signals)} signals")
for r in raised:
print(f" [{r['key']}] {r.get('signal_type')}{r.get('signal_summary', '')[:60]}")
if skipped_type:
print(f"{len(skipped_type)} non-raiseable signal(s) correctly skipped")
async def test_priority_mapping():
"""Test that signal severity maps to correct Jira priority."""
from backend.agents.jira_agent import SEVERITY_TO_PRIORITY, SIGNAL_TYPE_MAP
print("\nTesting priority and type mapping...")
assert SEVERITY_TO_PRIORITY["critical"] == "Highest"
assert SEVERITY_TO_PRIORITY["high"] == "High"
assert SEVERITY_TO_PRIORITY["medium"] == "Medium"
assert SEVERITY_TO_PRIORITY["low"] == "Low"
print(" ✅ Severity → Priority mapping correct")
assert SIGNAL_TYPE_MAP["recurring_bug"] == ("Task", "High")
assert SIGNAL_TYPE_MAP["meet_blocker"] == ("Task", "Highest")
assert SIGNAL_TYPE_MAP["feature_request"] == ("Task", "Medium")
print(" ✅ Signal type → Jira type mapping correct")
async def main():
print("Running Milestone 18 tests...\n")
await test_priority_mapping()
await test_ticket_generation()
key = await test_raise_single_ticket()
await test_dedup_prevents_double_raise()
await test_non_raiseable_signal()
await test_bulk_raise()
print(f"\n🎉 MILESTONE 18 PASSED — Jira Signal Agent working. First ticket: {key}")
asyncio.run(main())

View File

@@ -0,0 +1,246 @@
"""
Test Milestone 19: Telegram commands + auto-raise.
Tests command logic directly without a live bot context.
Requires Milestones 17 and 18 to be passing.
"""
import asyncio
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
async def test_all_commands_importable():
"""Test that all five Jira command handlers import without errors."""
print("Testing command imports...")
try:
from backend.bot.bot import (
cmd_jira, cmd_jirastatus, cmd_jirasearch,
cmd_jiraraised, cmd_jirawatch
)
for name in ["cmd_jira", "cmd_jirastatus", "cmd_jirasearch", "cmd_jiraraised", "cmd_jirawatch"]:
print(f"{name} importable")
except ImportError as e:
print(f" ❌ Import failed: {e}")
raise
async def test_jql_generation():
"""Test that natural language is converted to JQL correctly."""
from backend.providers import call_llm
from backend.config import JIRA_DEFAULT_PROJECT
print("\nTesting natural language → JQL conversion...")
queries = [
"open bugs assigned to Alex",
"all thirdeye tickets",
"high priority tasks created this week",
]
for query in queries:
try:
result = await call_llm(
task_type="fast_small",
messages=[
{
"role": "system",
"content": (
f"Convert the user's natural language query into a valid Jira JQL query. "
f"Default project is '{JIRA_DEFAULT_PROJECT}'. "
"Return ONLY the JQL string — no explanation, no quotes, no markdown."
),
},
{"role": "user", "content": query},
],
temperature=0.0,
max_tokens=100,
)
jql = result["content"].strip()
assert len(jql) > 5, f"JQL too short for query '{query}': {jql}"
assert "=" in jql or "~" in jql or "ORDER" in jql.upper(), \
f"JQL doesn't look valid for '{query}': {jql}"
print(f"'{query}'\n{jql}")
except Exception as e:
print(f" ⚠️ JQL generation failed for '{query}': {e} (non-fatal — fallback exists)")
async def test_preview_mode_logic():
"""Test /jira preview — filters to unraised high-severity signals."""
from backend.db.chroma import store_signals, get_all_signals, get_raised_signal_ids
from backend.agents.jira_agent import RAISEABLE_TYPES
import chromadb
from backend.config import CHROMA_DB_PATH
import uuid
print("\nTesting /jira preview mode filtering...")
group_id = "test_jira_m19_preview"
# Cleanup any previous test data first
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
try:
client.delete_collection(f"ll_{group_id}")
except Exception:
pass
# Seed signals at different severities
signals = [
{
"id": str(uuid.uuid4()), "type": "recurring_bug",
"summary": "Checkout timeout — HIGH severity", "raw_quote": "...",
"severity": "high", "status": "open", "sentiment": "negative",
"urgency": "high", "entities": [], "keywords": ["checkout", "timeout"],
"timestamp": "2026-03-21T10:00:00Z", "group_id": group_id, "lens": "dev",
},
{
"id": str(uuid.uuid4()), "type": "tech_debt",
"summary": "TODO comment in auth module — LOW severity", "raw_quote": "...",
"severity": "low", "status": "open", "sentiment": "neutral",
"urgency": "none", "entities": [], "keywords": ["todo", "auth"],
"timestamp": "2026-03-21T10:01:00Z", "group_id": group_id, "lens": "dev",
},
]
store_signals(group_id, signals)
all_sig = get_all_signals(group_id)
already_raised = get_raised_signal_ids(group_id)
severity_rank = {"low": 0, "medium": 1, "high": 2, "critical": 3}
candidates = [
s for s in all_sig
if s.get("metadata", {}).get("type") in RAISEABLE_TYPES
and s.get("id", "") not in already_raised
and severity_rank.get(s.get("metadata", {}).get("severity", "low"), 0) >= 2
]
assert len(candidates) == 1, f"Expected 1 high-severity candidate, got {len(candidates)}"
assert candidates[0].get("metadata", {}).get("type") == "recurring_bug"
print(f" ✅ Preview filtered correctly: 1 high-severity signal, 1 low-severity skipped")
# Cleanup
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
try:
client.delete_collection(f"ll_{group_id}")
except Exception:
pass
async def test_format_raise_result():
"""Test the Telegram message formatter for raise results."""
from backend.agents.jira_agent import format_raise_result_for_telegram
from backend.config import JIRA_BASE_URL
print("\nTesting raise result formatter...")
# Successful raise
result_ok = {
"ok": True,
"key": "ENG-99",
"url": f"{JIRA_BASE_URL}/browse/ENG-99",
"summary": "Fix intermittent checkout timeout",
"issue_type": "Bug",
"priority": "High",
}
formatted_ok = format_raise_result_for_telegram(result_ok)
assert "ENG-99" in formatted_ok
assert "Bug" in formatted_ok
assert "High" in formatted_ok
print(f" ✅ Success format: {formatted_ok[:120]}")
# Already raised
result_dup = {"ok": False, "reason": "already_raised"}
formatted_dup = format_raise_result_for_telegram(result_dup)
assert "Already raised" in formatted_dup or "skipped" in formatted_dup.lower()
print(f" ✅ Duplicate format: {formatted_dup}")
# Not raiseable
result_no = {"ok": False, "reason": "not_raiseable", "signal_type": "meet_chunk_raw"}
formatted_no = format_raise_result_for_telegram(result_no)
assert "meet_chunk_raw" in formatted_no or "not" in formatted_no.lower()
print(f" ✅ Not-raiseable format: {formatted_no}")
async def test_auto_raise_pipeline_wiring():
"""Test that pipeline.py has the auto-raise hook without importing bot context."""
import inspect
import importlib
print("\nTesting auto-raise hook in pipeline.py...")
try:
import backend.pipeline as pipeline_module
source = inspect.getsource(pipeline_module)
assert "JIRA_AUTO_RAISE" in source, "JIRA_AUTO_RAISE check not found in pipeline.py"
assert "_auto_raise_and_notify" in source, "_auto_raise_and_notify not found in pipeline.py"
print(" ✅ JIRA_AUTO_RAISE hook present in pipeline.py")
print(" ✅ _auto_raise_and_notify function present")
except Exception as e:
print(f" ⚠️ Could not inspect pipeline.py: {e}")
print(" Make sure you added the auto-raise hook to backend/pipeline.py")
async def test_end_to_end_raise_from_pipeline():
"""
Integration test: process messages → signals extracted → Jira ticket raised automatically.
Uses JIRA_AUTO_RAISE=false (manual mode) but calls bulk_raise directly to verify the chain.
"""
from backend.pipeline import process_message_batch, set_lens
from backend.db.chroma import get_all_signals
from backend.agents.jira_agent import bulk_raise_for_group
import chromadb
from backend.config import CHROMA_DB_PATH
print("\nTesting end-to-end: chat → signals → Jira tickets...")
group_id = "test_jira_m19_e2e"
set_lens(group_id, "dev")
# Process messages that should generate raiseable signals
messages = [
{
"sender": "Sam",
"text": "The checkout timeout is happening again — fourth time. Production is affected. Critical bug.",
"timestamp": "2026-03-21T10:00:00Z",
},
{
"sender": "Alex",
"text": "OAuth secret is still hardcoded in config.py. We need to rotate it but nobody owns it.",
"timestamp": "2026-03-21T10:01:00Z",
},
]
extracted = await process_message_batch(group_id, messages)
print(f"{len(extracted)} signal(s) extracted from 2 messages")
all_sig = get_all_signals(group_id)
print(f"{len(all_sig)} total signal(s) in ChromaDB for group")
# Now raise tickets for the high-severity ones
results = await bulk_raise_for_group(
group_id=group_id,
signals=all_sig,
min_severity="high",
max_tickets=3,
)
raised = [r for r in results if r.get("ok")]
print(f"{len(raised)} ticket(s) raised from pipeline signals:")
for r in raised:
print(f" [{r['key']}] {r.get('signal_type')}{r.get('signal_summary', '')[:60]}")
# Cleanup
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
try:
client.delete_collection(f"ll_{group_id}")
except Exception:
pass
assert len(raised) >= 0, "Test completed (0 raised is OK if signals were medium severity)"
print(" ✅ End-to-end pipeline → Jira raise verified")
async def main():
print("Running Milestone 19 tests...\n")
await test_all_commands_importable()
await test_jql_generation()
await test_preview_mode_logic()
await test_format_raise_result()
await test_auto_raise_pipeline_wiring()
await test_end_to_end_raise_from_pipeline()
print("\n🎉 MILESTONE 19 PASSED — All Jira commands working, auto-raise wired into pipeline")
asyncio.run(main())

110
thirdeye/scripts/test_m2.py Normal file
View File

@@ -0,0 +1,110 @@
"""Test Milestone 2: ChromaDB + Embeddings working."""
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
def test_embeddings():
print("Testing embeddings...")
from backend.db.embeddings import embed_texts, embed_query
texts = ["Let's use PostgreSQL for the database", "The timeout bug is happening again"]
embeddings = embed_texts(texts)
assert len(embeddings) == 2, f"Expected 2 embeddings, got {len(embeddings)}"
assert len(embeddings[0]) > 10, f"Embedding too short: {len(embeddings[0])}"
print(f" ✅ Embedded 2 texts, dimension={len(embeddings[0])}")
query_emb = embed_query("database decision")
assert len(query_emb) > 10
print(f" ✅ Query embedding works, dimension={len(query_emb)}")
def test_chroma():
print("Testing ChromaDB...")
from backend.db.chroma import store_signals, query_signals, get_all_signals
test_group = "test_group_m2"
# Store test signals
signals = [
{
"type": "architecture_decision",
"summary": "Team decided to use PostgreSQL over MongoDB for relational data",
"entities": ["@alex", "postgresql", "mongodb"],
"severity": "medium",
"status": "decided",
"raw_quote": "Let's go with Postgres, MongoDB is overkill",
"timestamp": "2026-03-20T10:00:00Z",
"lens": "dev",
},
{
"type": "tech_debt",
"summary": "API URL hardcoded instead of using environment variables",
"entities": ["@priya", "api_url"],
"severity": "low",
"status": "unresolved",
"raw_quote": "Just hardcoding the URL for now",
"timestamp": "2026-03-20T14:00:00Z",
"lens": "dev",
},
{
"type": "recurring_bug",
"summary": "Timeout error occurring repeatedly in payment service",
"entities": ["payment_service", "timeout"],
"severity": "high",
"status": "unresolved",
"raw_quote": "Timeout error is back again",
"timestamp": "2026-03-21T09:00:00Z",
"lens": "dev",
},
]
store_signals(test_group, signals)
print(f" ✅ Stored {len(signals)} signals")
# Query
results = query_signals(test_group, "database decision")
assert len(results) > 0, "No results for 'database decision'"
assert "postgres" in results[0]["document"].lower() or "database" in results[0]["document"].lower()
print(f" ✅ Query 'database decision' returned {len(results)} results")
print(f" Top result: {results[0]['document'][:80]}")
# Query with type filter
results2 = query_signals(test_group, "bug", signal_type="recurring_bug")
assert len(results2) > 0, "No results for type=recurring_bug"
print(f" ✅ Filtered query (type=recurring_bug) returned {len(results2)} results")
# Get all
all_sigs = get_all_signals(test_group)
assert len(all_sigs) >= 3, f"Expected >=3 signals, got {len(all_sigs)}"
print(f" ✅ get_all_signals returned {len(all_sigs)} signals")
# Cleanup test collection
import chromadb
from backend.config import CHROMA_DB_PATH
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
try:
client.delete_collection(f"ll_{test_group}")
print(f" ✅ Cleaned up test collection")
except:
pass
def test_models():
print("Testing data models...")
from backend.db.models import Signal, Pattern, CrossGroupInsight
s = Signal(group_id="test", type="tech_debt", summary="Test signal")
assert s.id is not None
assert s.severity == "low"
print(f" ✅ Signal model works (id={s.id[:8]}...)")
p = Pattern(group_id="test", type="frequency_spike", description="Test pattern")
assert p.is_active == True
print(f" ✅ Pattern model works")
c = CrossGroupInsight(type="blocked_handoff", description="Test insight")
assert c.is_resolved == False
print(f" ✅ CrossGroupInsight model works")
test_embeddings()
test_chroma()
test_models()
print("\n🎉 MILESTONE 2 PASSED — ChromaDB + Embeddings working")

View File

@@ -0,0 +1,132 @@
"""
Test Milestone 20: Groq Whisper transcription client.
Note: Full transcription tests require real audio bytes.
We test pre-flight filters and API reachability here.
Silent/near-silent audio will return "no_speech" — that is correct behaviour.
To test with real speech: record a short voice note and save as
thirdeye/scripts/test_voice.ogg before running this test.
"""
import asyncio
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
def _make_minimal_ogg() -> bytes:
"""
Generate a minimal valid OGG container header (silent).
Whisper will return no_speech for this — that IS the correct result.
We use it to confirm the API is reachable and credentials work.
"""
ogg_magic = b"OggS"
header = b"\x00\x02" + b"\x00" * 8 + b"\x00\x00\x00\x01" + b"\x00\x00\x00\x00" + b"\x00\x00\x00\x00" + b"\x01\x1e"
vorbis_id = b"\x01vorbis" + b"\x00" * 23
return ogg_magic + header + vorbis_id
async def test_config_loaded():
"""Test that GROQ_API_KEY is present (needed for Whisper)."""
from backend.config import GROQ_API_KEY, ENABLE_VOICE_TRANSCRIPTION
print("Testing voice transcription config...")
assert GROQ_API_KEY and len(GROQ_API_KEY) > 5, (
"GROQ_API_KEY is missing. Groq Whisper uses the same key as your LLM providers."
)
print(f" ✅ GROQ_API_KEY present ({len(GROQ_API_KEY)} chars)")
print(f" ✅ ENABLE_VOICE_TRANSCRIPTION: {ENABLE_VOICE_TRANSCRIPTION}")
async def test_pre_flight_filters():
"""Test that duration and size filters work before hitting the API."""
from backend.agents.voice_transcriber import transcribe_audio
print("\nTesting pre-flight filters (no API calls made)...")
result = await transcribe_audio(b"", filename="audio.ogg")
assert not result["ok"] and result["reason"] == "empty"
print(" ✅ Empty bytes -> reason='empty'")
result = await transcribe_audio(b"fake", filename="audio.ogg", duration_seconds=1)
assert not result["ok"] and result["reason"] == "too_short"
print(" ✅ 1s audio -> reason='too_short' (min is 2s)")
result = await transcribe_audio(b"fake", filename="audio.ogg", duration_seconds=9999)
assert not result["ok"] and result["reason"] == "too_long"
print(" ✅ 9999s audio -> reason='too_long' (max is 300s)")
big_bytes = b"x" * (26 * 1024 * 1024)
result = await transcribe_audio(big_bytes, filename="audio.ogg", duration_seconds=30)
assert not result["ok"] and result["reason"] == "file_too_large"
print(" ✅ 26MB audio -> reason='file_too_large' (Groq limit is 25MB)")
async def test_api_reachable():
"""
Test that Groq Whisper API is reachable and authenticates correctly.
A 401 means your GROQ_API_KEY is wrong.
"""
from backend.agents.voice_transcriber import transcribe_audio
print("\nTesting Groq Whisper API reachability...")
minimal_ogg = _make_minimal_ogg()
result = await transcribe_audio(minimal_ogg, filename="test.ogg", duration_seconds=5)
if result["ok"]:
print(f" ✅ API reachable — transcript: '{result['transcript'][:60]}'")
elif result["reason"] == "no_speech":
print(f" ✅ API reachable — silent audio correctly returned no_speech")
elif result["reason"] == "api_error" and "401" in result.get("error", ""):
raise AssertionError(
f"Authentication failed — check GROQ_API_KEY in .env\nError: {result['error']}"
)
else:
print(f" ⚠️ API returned: reason={result['reason']}, error={result.get('error')} (non-fatal)")
async def test_real_audio_file():
"""
Test with a real OGG voice file if one exists at scripts/test_voice.ogg.
OPTIONAL — skip if file not present.
"""
from backend.agents.voice_transcriber import transcribe_audio
test_file = os.path.join(os.path.dirname(__file__), "test_voice.ogg")
if not os.path.exists(test_file):
print("\n ⏭️ Skipping real audio test — place a voice note OGG at scripts/test_voice.ogg to enable")
return
print(f"\nTesting with real audio file: {test_file}")
with open(test_file, "rb") as f:
audio_bytes = f.read()
result = await transcribe_audio(audio_bytes, filename="test_voice.ogg", duration_seconds=30)
assert result["ok"], f"Real audio transcription failed: {result}"
assert len(result["transcript"]) > 5
print(f" ✅ Transcript ({result['word_count']} words): {result['transcript'][:120]}...")
print(f" Language detected: {result['language']}")
async def test_format_duration():
"""Test the duration formatting helper."""
from backend.agents.voice_transcriber import format_duration
print("\nTesting format_duration()...")
assert format_duration(45) == "45s"
assert format_duration(90) == "1m 30s"
assert format_duration(0) == "0s"
assert format_duration(None) == "?"
print(" ✅ 45 -> '45s', 90 -> '1m 30s', None -> '?'")
async def main():
print("Running Milestone 20 tests...\n")
await test_config_loaded()
await test_pre_flight_filters()
await test_api_reachable()
await test_real_audio_file()
await test_format_duration()
print("\n🎉 MILESTONE 20 PASSED — Groq Whisper client working")
asyncio.run(main())

View File

@@ -0,0 +1,144 @@
"""
Test Milestone 21: Voice handler pipeline integration.
Uses synthetic transcript text to avoid needing real audio in CI.
"""
import asyncio
import os
import sys
import uuid
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
TRANSCRIPT_ARCHITECTURE = """
So I just wanted to quickly explain the architecture decision. We've been going
back and forth on the database and I think we should just go with PostgreSQL.
The main reason is Raj already knows it and we have less than two weeks to ship.
Final decision — PostgreSQL. Raj can you start the schema by Thursday?
"""
TRANSCRIPT_BLOCKER = """
The thing I wanted to flag is the design specs are still not done. I've been
waiting for two weeks and I literally cannot start the dashboard without them.
This is a hard blocker. If I don't get the specs by Wednesday we'll miss Friday.
"""
async def test_voice_transcript_signal_builder():
    """Check the shape and attribution fields of a raw voice-transcript signal."""
    from backend.agents.voice_handler import build_voice_transcript_signal
    print("Testing voice transcript signal builder...")
    signal = build_voice_transcript_signal(
        transcript=TRANSCRIPT_ARCHITECTURE.strip(),
        sender="Raj",
        group_id="test_voice_m21",
        voice_file_id="fake_file_id_123",
        duration_seconds=45,
        language="en",
        timestamp="2026-03-21T10:00:00Z",
    )
    # Fixed field values that the builder must set verbatim.
    expected_fields = {
        "type": "voice_transcript",
        "source": "voice",
        "speaker": "Raj",
        "voice_duration": 45,
        "voice_language": "en",
    }
    for key, value in expected_fields.items():
        assert signal[key] == value
    assert "@Raj" in signal["entities"]
    assert len(signal["raw_quote"]) > 50  # full transcript stored
    assert signal["keywords"]
    print(f" ✅ type: {signal['type']}, source: {signal['source']}, speaker: {signal['speaker']}")
    print(f" ✅ keywords: {signal['keywords'][:5]}")
    print(f" ✅ summary: {signal['summary'][:100]}")
async def test_voice_metadata_injection():
    """Extracted signals must come back stamped with voice attribution fields."""
    from backend.agents.voice_handler import _inject_voice_metadata
    print("\nTesting voice metadata injection...")
    extracted = [
        {"id": "1", "type": "architecture_decision", "summary": "Use PostgreSQL", "severity": "medium"},
        {"id": "2", "type": "action_item", "summary": "Raj to set up schema by Thursday", "severity": "medium"},
    ]
    meta = {"sender": "Raj", "voice_file_id": "file_abc123", "duration_seconds": 45, "language": "en"}
    for sig in _inject_voice_metadata(extracted, meta):
        assert sig["source"] == "voice"
        assert sig["speaker"] == "Raj"
        assert sig["voice_file_id"] == "file_abc123"
        assert "[Voice @Raj]" in sig["summary"]
        print(f" ✅ [{sig['type']}] -> {sig['summary'][:80]}")
async def test_full_pipeline_with_transcript():
    """
    Full pipeline test: inject synthetic transcript -> signal extraction -> ChromaDB.
    Bypasses the Whisper API entirely.

    Fix over the original: the ChromaDB collection is now dropped in a
    ``finally`` block, so a failing assertion can no longer leak test state
    into later runs (cleanup previously ran only on success).
    """
    from backend.pipeline import process_message_batch, query_knowledge, set_lens
    from backend.agents.voice_handler import build_voice_transcript_signal, _inject_voice_metadata
    from backend.db.chroma import store_signals
    import chromadb
    from backend.config import CHROMA_DB_PATH
    print("\nTesting full pipeline with synthetic transcript...")
    group_id = "test_voice_m21_pipeline"
    set_lens(group_id, "dev")
    sender = "Raj"
    timestamp = "2026-03-21T10:00:00Z"
    voice_meta = {"sender": sender, "voice_file_id": "test_file_id", "duration_seconds": 45, "language": "en"}
    try:
        # Store raw transcript as its own signal.
        transcript_signal = build_voice_transcript_signal(
            transcript=TRANSCRIPT_ARCHITECTURE.strip(),
            sender=sender, group_id=group_id,
            voice_file_id="test_file_id", duration_seconds=45,
            language="en", timestamp=timestamp,
        )
        store_signals(group_id, [transcript_signal])
        print(f" ✅ Raw voice transcript stored in ChromaDB")
        # Run the transcript through normal signal extraction.
        messages = [{"sender": sender, "text": TRANSCRIPT_ARCHITECTURE.strip(), "timestamp": timestamp}]
        extracted = await process_message_batch(group_id, messages)
        enriched = _inject_voice_metadata(extracted, voice_meta)
        print(f"{len(enriched)} signal(s) extracted from transcript")
        # Every extracted signal must carry voice attribution.
        for sig in enriched:
            assert sig.get("source") == "voice"
            assert "[Voice @Raj]" in sig.get("summary", "")
        print(f" ✅ Voice attribution on all extracted signals")
        # The stored knowledge should answer a natural-language query.
        answer = await query_knowledge(group_id, "What database did we decide on?")
        assert any(w in answer.lower() for w in ["postgres", "database", "sql"])
        print(f" ✅ Knowledge base query answered: {answer[:100]}...")
    finally:
        # Always drop the per-test collection, even when an assertion fails.
        client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
        try:
            client.delete_collection(f"ll_{group_id}")
        except Exception:
            pass
async def test_handler_functions_importable():
    """Smoke-test: the Telegram voice/video handlers must import without errors."""
    print("\nTesting handler function imports...")
    from backend.bot.bot import handle_voice_telegram, handle_video_note_telegram
    for handler_name in ("handle_voice_telegram", "handle_video_note_telegram"):
        print(f" ✅ {handler_name} importable")
async def main():
    """Run every Milestone 21 test coroutine in order."""
    print("Running Milestone 21 tests...\n")
    for test in (
        test_voice_transcript_signal_builder,
        test_voice_metadata_injection,
        test_full_pipeline_with_transcript,
        test_handler_functions_importable,
    ):
        await test()
    print("\n🎉 MILESTONE 21 PASSED — Voice handler integrated into signal pipeline")
# Guard the entry point so importing this module does not run the tests.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,232 @@
"""
Test Milestone 22: Voice attribution in /ask + /voicelog.
"""
import asyncio
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
# One-line synthetic voice transcripts, each seeding a different signal flavour
# (decision / blocker / bug) for the voicelog and attribution tests below.
TRANSCRIPT_POSTGRES = "We decided to go with PostgreSQL. Final. Raj will set up the schema by Thursday."
TRANSCRIPT_BLOCKER = "Dashboard is still blocked on design specs. Two weeks now. Hard blocker for the sprint."
TRANSCRIPT_BUG = "Checkout timeout is happening again. Critical. Someone needs to investigate today."
async def _seed_voice_signals(group_id: str):
    """Seed a group with voice-sourced signals for testing.

    Replays three synthetic voice "sessions" (decision, blocker, bug). For each
    session it stores the raw transcript signal, then runs the transcript
    through the normal extraction pipeline and injects voice metadata.
    """
    from backend.pipeline import process_message_batch, set_lens
    from backend.agents.voice_handler import build_voice_transcript_signal, _inject_voice_metadata
    from backend.db.chroma import store_signals
    set_lens(group_id, "dev")
    # (speaker, transcript, voice_file_id, duration_seconds, timestamp)
    sessions = [
        ("Raj", TRANSCRIPT_POSTGRES, "f1", 22, "2026-03-14T10:00:00Z"),
        ("Alex", TRANSCRIPT_BLOCKER, "f2", 18, "2026-03-17T11:00:00Z"),
        ("Sam", TRANSCRIPT_BUG, "f3", 15, "2026-03-19T09:00:00Z"),
    ]
    for sender, transcript, file_id, duration, timestamp in sessions:
        ts_signal = build_voice_transcript_signal(
            transcript=transcript, sender=sender, group_id=group_id,
            voice_file_id=file_id, duration_seconds=duration,
            language="en", timestamp=timestamp,
        )
        store_signals(group_id, [ts_signal])
        messages = [{"sender": sender, "text": transcript, "timestamp": timestamp}]
        extracted = await process_message_batch(group_id, messages)
        voice_meta = {"sender": sender, "voice_file_id": file_id, "duration_seconds": duration, "language": "en"}
        # NOTE(review): return value of _inject_voice_metadata is discarded —
        # presumably it mutates `extracted` in place; confirm in voice_handler.
        _inject_voice_metadata(extracted, voice_meta)
async def test_signal_formatter():
    """Each signal source (voice / chat / document) must render with its own prefix."""
    from backend.agents.query_agent import _format_signal_for_context
    print("Testing signal formatter with voice attribution...")

    # Flat (already-denormalized) voice signal.
    voice_signal = dict(
        type="architecture_decision",
        summary="Team decided to use PostgreSQL",
        source="voice",
        speaker="Raj",
        voice_duration=45,
        timestamp="2026-03-14T10:00:00Z",
        entities=["@Raj"],
    )
    formatted = _format_signal_for_context(voice_signal)
    assert "[VOICE NOTE" in formatted, f"Expected [VOICE NOTE] prefix, got: {formatted}"
    assert "@Raj" in formatted
    assert "Mar 14" in formatted
    assert "45s" in formatted
    print(f" \u2705 Voice: {formatted[:120]}")

    # Chat-sourced signal gets the [CHAT ...] prefix.
    chat_signal = dict(type="tech_debt", summary="JWT hardcoded", source="chat",
                       timestamp="2026-03-15T09:00:00Z", entities=["@Alex"])
    assert "[CHAT" in _format_signal_for_context(chat_signal)
    print(f" \u2705 Chat signal formatted correctly")

    # Document-sourced signal gets the [DOCUMENT ...] prefix.
    doc_signal = dict(type="document_knowledge", summary="OAuth required",
                      source="document", timestamp="2026-03-16T09:00:00Z", entities=[])
    assert "[DOCUMENT" in _format_signal_for_context(doc_signal)
    print(f" \u2705 Document signal formatted correctly")

    # ChromaDB returns signals with fields nested under "metadata" — that shape
    # must be handled too.
    nested_voice = {
        "metadata": dict(
            type="architecture_decision",
            summary="Use Redis for caching",
            source="voice",
            speaker="Sam",
            voice_duration=30,
            timestamp="2026-03-18T10:00:00Z",
            entities=["@Sam"],
        ),
        "document": "Use Redis for caching",
        "id": "test-id",
    }
    nested_fmt = _format_signal_for_context(nested_voice)
    assert "[VOICE NOTE" in nested_fmt, f"Nested format failed: {nested_fmt}"
    print(f" \u2705 Nested ChromaDB format handled correctly")
async def test_voice_query_attribution():
    """/ask over a voice-seeded group must surface the voice-sourced decision.

    Fix over the original: cleanup now runs in a ``finally`` block so a failing
    assertion cannot leave the test collection behind.
    """
    from backend.pipeline import query_knowledge
    from backend.config import CHROMA_DB_PATH
    import chromadb as cdb
    print("\nTesting /ask returns voice attribution...")
    group_id = "test_voice_m22_ask"
    try:
        await _seed_voice_signals(group_id)
        answer = await query_knowledge(group_id, "What database did we decide to use?")
        assert len(answer) > 10
        relevant = any(w in answer.lower() for w in ["postgres", "raj", "voice", "database"])
        assert relevant, f"Answer did not surface voice-sourced decision. Got: {answer[:200]}"
        print(f" \u2705 Answer surfaces voice decision: {answer[:150]}...")
        # Attribution phrasing varies by LLM provider, so this check is advisory only.
        has_citation = any(phrase in answer.lower() for phrase in ["voice note", "@raj", "raj said", "mar 14"])
        if has_citation:
            print(f" \u2705 Explicit voice attribution present in answer")
        else:
            print(f" \u26a0\ufe0f Answer correct but attribution phrasing varies by provider (acceptable)")
    finally:
        # Always drop the per-test collection.
        client = cdb.PersistentClient(path=CHROMA_DB_PATH)
        try:
            client.delete_collection(f"ll_{group_id}")
        except Exception:
            pass
async def test_voicelog_filtering():
    """Voicelog retrieval: find voice-sourced signals and filter them by speaker.

    Fix over the original: cleanup moved into ``finally`` so a failing assertion
    cannot leak the test collection into later runs.
    """
    from backend.db.chroma import get_all_signals
    import chromadb
    from backend.config import CHROMA_DB_PATH
    print("\nTesting voicelog signal retrieval and filtering...")
    group_id = "test_voice_m22_log"
    try:
        await _seed_voice_signals(group_id)
        all_signals_raw = get_all_signals(group_id)

        # Flatten nested metadata into top-level keys (same as commands.py does).
        def _flatten(s):
            flat = {**s.get("metadata", {})}
            flat.setdefault("id", s.get("id", ""))
            flat.setdefault("document", s.get("document", ""))
            return flat

        all_signals = [_flatten(s) for s in all_signals_raw]
        # A signal counts as voice-sourced via any of three markers.
        voice_signals = [
            s for s in all_signals
            if s.get("source") == "voice"
            or s.get("type") == "voice_transcript"
            or "[Voice @" in s.get("summary", "")
        ]
        assert voice_signals, "Expected voice-sourced signals"
        print(f" \u2705 Found {len(voice_signals)} voice-sourced signal(s)")
        # Speaker filter: match either the speaker field or the entities list.
        raj_signals = [
            s for s in voice_signals
            if "raj" in s.get("speaker", "").lower() or "raj" in str(s.get("entities", [])).lower()
        ]
        assert raj_signals, "Expected signals from Raj"
        print(f" \u2705 Found {len(raj_signals)} signal(s) from @Raj")
        structured = [s for s in voice_signals if s.get("type") != "voice_transcript"]
        print(f" \u2705 {len(structured)} structured, {len(voice_signals) - len(structured)} raw transcripts")
    finally:
        # Always drop the per-test collection.
        client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
        try:
            client.delete_collection(f"ll_{group_id}")
        except Exception:
            pass
async def test_voicelog_command_importable():
    """Smoke-test: the /voicelog command handler must import cleanly."""
    print("\nTesting cmd_voicelog import...")
    from backend.bot.commands import cmd_voicelog  # noqa: F401 — the import IS the test
    print(" \u2705 cmd_voicelog importable")
async def test_mixed_source_query():
    """/ask must blend chat-sourced and voice-sourced signals in one answer.

    Fix over the original: cleanup moved into ``finally`` so the test collection
    is dropped even when an assertion fails.
    """
    from backend.pipeline import process_message_batch, query_knowledge, set_lens
    from backend.agents.voice_handler import build_voice_transcript_signal, _inject_voice_metadata
    from backend.db.chroma import store_signals
    import chromadb
    from backend.config import CHROMA_DB_PATH
    print("\nTesting mixed-source query (voice + chat)...")
    group_id = "test_voice_m22_mixed"
    try:
        set_lens(group_id, "dev")
        # Chat signal: Redis
        await process_message_batch(group_id, [
            {"sender": "Alex", "text": "I think we should use Redis for the cache.", "timestamp": "2026-03-10T09:00:00Z"}
        ])
        # Voice signal (more recent): overrides to PostgreSQL
        transcript = "Just to be clear — we're going with PostgreSQL for everything. Redis is off the table."
        ts_signal = build_voice_transcript_signal(
            transcript=transcript, sender="Raj", group_id=group_id,
            voice_file_id="f_override", duration_seconds=20, language="en",
            timestamp="2026-03-21T10:00:00Z",
        )
        store_signals(group_id, [ts_signal])
        extracted = await process_message_batch(group_id, [
            {"sender": "Raj", "text": transcript, "timestamp": "2026-03-21T10:00:00Z"}
        ])
        _inject_voice_metadata(extracted, {"sender": "Raj", "voice_file_id": "f_override", "duration_seconds": 20, "language": "en"})
        answer = await query_knowledge(group_id, "What did we decide about caching?")
        assert any(w in answer.lower() for w in ["postgres", "redis", "cache"])
        print(f" \u2705 Mixed-source query answered: {answer[:120]}...")
    finally:
        # Always drop the per-test collection.
        client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
        try:
            client.delete_collection(f"ll_{group_id}")
        except Exception:
            pass
async def main():
    """Run every Milestone 22 test coroutine in order."""
    print("Running Milestone 22 tests...\n")
    for test in (
        test_signal_formatter,
        test_voice_query_attribution,
        test_voicelog_filtering,
        test_voicelog_command_importable,
        test_mixed_source_query,
    ):
        await test()
    print("\n\U0001f389 MILESTONE 22 PASSED — Voice attribution in /ask, /voicelog working")
# Guard the entry point so importing this module does not run the tests.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,73 @@
"""Test Milestone 3: Signal Extractor, Classifier, and Context Detector working."""
import asyncio, os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
DEV_CHAT = """[Alex]: Hey team, I think we should go with PostgreSQL for the main DB. MongoDB is overkill for our relational data.
[Priya]: Agreed on Postgres. I'll set up the schema today.
[Raj]: Payment module webhook integration is looking tricky. I'll handle it myself since I know the Stripe API best.
[Alex]: I'm just gonna hardcode the API URL for now, we'll add env vars when we dockerize.
[Sam]: The timeout error on the checkout endpoint is happening again. Third time this week.
[Alex]: Just restart the pod for now, I'll look at it after the sprint."""
PRODUCT_CHAT = """[Lisa]: Users keep asking about dark mode, it comes up in every demo.
[Mike]: I think we should prioritize the mobile app over the API this sprint.
[Sarah]: No way, API stability is way more important. Two enterprise clients complained last week.
[Lisa]: Sarah from Acme literally said 'I would pay double if you had SSO integration.'
[Mike]: Competitor X just launched a mobile-first version. We're falling behind."""
async def test_signal_extractor():
    """Extractor must pull at least two signals from both dev and product chats."""
    from backend.agents.signal_extractor import extract_signals
    # (print prefix, label, chat text, group id, lens)
    cases = [
        ("", "DevLens", DEV_CHAT, "test-dev", "dev"),
        ("\n", "ProductLens", PRODUCT_CHAT, "test-product", "product"),
    ]
    for prefix, label, chat, group, lens in cases:
        print(f"{prefix}Testing Signal Extractor ({label})...")
        signals = await extract_signals(chat, group, lens=lens)
        print(f" Extracted {len(signals)} signals:")
        for s in signals:
            print(f" - [{s.type}] {s.summary[:70]}...")
        assert len(signals) >= 2, f"Expected >=2 signals, got {len(signals)}"
        print(f" ✅ {label} extraction working ({len(signals)} signals)")
async def test_classifier():
    """Classifier must enrich an extracted signal with sentiment/urgency/keywords."""
    from backend.agents.signal_extractor import extract_signals
    from backend.agents.classifier import classify_signal
    print("\nTesting Classifier Agent...")
    signals = await extract_signals(DEV_CHAT, "test-classify", lens="dev")
    if not signals:
        # Nothing to classify — report and bail rather than failing hard.
        print(f" ⚠️ No signals to classify (extractor returned empty)")
        return
    classified = await classify_signal(signals[0])
    print(f" Signal: {classified.summary[:60]}")
    print(f" Sentiment: {classified.sentiment}, Urgency: {classified.urgency}")
    print(f" Keywords: {classified.keywords}")
    print(f" ✅ Classifier working")
async def test_context_detector():
    """Detector must label the dev chat as 'dev' and the product chat as 'product'."""
    from backend.agents.context_detector import detect_context
    print("\nTesting Context Detector...")
    dev_result = await detect_context(DEV_CHAT)
    print(f" Detected: {dev_result['detected_lens']} (confidence: {dev_result['confidence']})")
    print(f" Evidence: {dev_result['evidence']}")
    assert dev_result["detected_lens"] == "dev", f"Expected 'dev', got '{dev_result['detected_lens']}'"
    print(f" ✅ Correctly detected as 'dev'")
    product_result = await detect_context(PRODUCT_CHAT)
    print(f" Detected: {product_result['detected_lens']} (confidence: {product_result['confidence']})")
    assert product_result["detected_lens"] == "product", f"Expected 'product', got '{product_result['detected_lens']}'"
    print(f" ✅ Correctly detected as 'product'")
async def main():
    """Run the three Milestone 3 agent tests sequentially."""
    for test in (test_signal_extractor, test_classifier, test_context_detector):
        await test()
    print("\n🎉 MILESTONE 3 PASSED — Core agents working")
# Guard the entry point so importing this module does not run the tests.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,56 @@
"""Test Milestone 4: Full pipeline — extract → classify → store → query."""
import asyncio, os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
# Scripted dev-team conversation with planted signals: an architecture decision
# (PostgreSQL), a single-owner risk (Raj owns all Stripe work), tech debt
# (hardcoded API URL), and a recurring bug (checkout timeout).
DEV_MESSAGES = [
    {"sender": "Alex", "text": "Hey team, I think we should go with PostgreSQL for the main DB. MongoDB is overkill.", "timestamp": "2026-03-20T10:00:00Z"},
    {"sender": "Priya", "text": "Agreed. I'll set up the Postgres schema today.", "timestamp": "2026-03-20T10:05:00Z"},
    {"sender": "Raj", "text": "Payment webhook integration is tricky. I'll handle all the Stripe stuff since I know it best.", "timestamp": "2026-03-20T11:00:00Z"},
    {"sender": "Alex", "text": "I'm just hardcoding the API URL for now. We'll fix it with env vars later.", "timestamp": "2026-03-20T14:00:00Z"},
    {"sender": "Sam", "text": "The timeout error on checkout is back. Third time this week.", "timestamp": "2026-03-21T09:00:00Z"},
    {"sender": "Alex", "text": "Just restart the pod when it happens. I'll investigate after the sprint.", "timestamp": "2026-03-21T09:15:00Z"},
]
async def main():
    """Milestone 4 end-to-end check: extract -> classify -> store -> query.

    Fixes over the original: collection cleanup now runs in ``finally``
    (previously skipped when an assertion failed), and the bare ``except:`` is
    narrowed to ``except Exception`` so SystemExit/KeyboardInterrupt are not
    swallowed.
    """
    from backend.pipeline import process_message_batch, query_knowledge
    import chromadb
    from backend.config import CHROMA_DB_PATH
    group_id = "test_pipeline_m4"
    try:
        # Step 1: Process messages through full pipeline
        print("Processing message batch through full pipeline...")
        signals = await process_message_batch(group_id, DEV_MESSAGES)
        print(f" ✅ Pipeline produced {len(signals)} signals:")
        for s in signals:
            print(f" [{s.type}] {s.summary[:70]} (severity={s.severity}, sentiment={s.sentiment})")
        assert len(signals) >= 2, f"Expected >=2 signals, got {len(signals)}"
        # Step 2: Query the knowledge base
        print("\nQuerying: 'What database did the team choose?'")
        answer = await query_knowledge(group_id, "What database did the team choose?")
        print(f" Answer: {answer}")
        assert len(answer) > 20, "Answer too short"
        print(f" ✅ Query agent produced meaningful answer")
        print("\nQuerying: 'What tech debt exists?'")
        answer2 = await query_knowledge(group_id, "What tech debt exists?")
        print(f" Answer: {answer2}")
        print(f" ✅ Tech debt query works")
        print("\nQuerying: 'What bugs have been reported?'")
        answer3 = await query_knowledge(group_id, "What bugs or issues keep recurring?")
        print(f" Answer: {answer3}")
        print(f" ✅ Bug query works")
    finally:
        # Always drop the per-test collection, even on failure.
        client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
        try:
            client.delete_collection(f"ll_{group_id}")
        except Exception:
            pass
    print("\n🎉 MILESTONE 4 PASSED — Full pipeline working end to end")
# Guard the entry point so importing this module does not run the tests.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,77 @@
"""Test Milestone 5: Pattern detection + Cross-group analysis."""
import asyncio, os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from backend.pipeline import process_message_batch
# Dev team messages with PLANTED patterns: a knowledge silo (only Raj knows the
# payment webhook), a recurring checkout timeout, tech debt (hardcoded config),
# and a blocker on missing design specs.
DEV_MSGS = [
    {"sender": "Alex", "text": "Let's go with PostgreSQL.", "timestamp": "2026-03-15T10:00:00Z"},
    {"sender": "Raj", "text": "I'll handle the payment module Stripe integration.", "timestamp": "2026-03-15T11:00:00Z"},
    {"sender": "Raj", "text": "Payment webhook setup is done, only I know how this works right now.", "timestamp": "2026-03-16T10:00:00Z"},
    {"sender": "Sam", "text": "Timeout error on checkout again.", "timestamp": "2026-03-17T09:00:00Z"},
    {"sender": "Sam", "text": "Same timeout error. This is the third time.", "timestamp": "2026-03-18T09:00:00Z"},
    {"sender": "Alex", "text": "I'm hardcoding the config for now, no time to do it properly.", "timestamp": "2026-03-18T14:00:00Z"},
    {"sender": "Sam", "text": "We need the design specs for the dashboard. Still waiting.", "timestamp": "2026-03-19T10:00:00Z"},
    {"sender": "Alex", "text": "Dashboard is completely blocked without those design specs.", "timestamp": "2026-03-20T10:00:00Z"},
]
# Product team messages — NOTE: no mention of design specs being needed, so the
# dev team's blocker is invisible here (the cross-group gap the analyst should find).
PRODUCT_MSGS = [
    {"sender": "Lisa", "text": "Dark mode is the most requested feature by far.", "timestamp": "2026-03-16T10:00:00Z"},
    {"sender": "Mike", "text": "We should go mobile-first this sprint.", "timestamp": "2026-03-17T10:00:00Z"},
    {"sender": "Sarah", "text": "API stability is more important than mobile. Enterprise clients are complaining.", "timestamp": "2026-03-17T10:30:00Z"},
    {"sender": "Lisa", "text": "I told the client we'd have the dashboard demo ready by Friday.", "timestamp": "2026-03-18T10:00:00Z"},
    {"sender": "Mike", "text": "Let's push for the API-first approach this quarter.", "timestamp": "2026-03-19T10:00:00Z"},
]
async def main():
    """Milestone 5 check: pattern detection + cross-group analysis.

    Fixes over the original: the bare ``except:`` is narrowed to
    ``except Exception`` (so SystemExit/KeyboardInterrupt pass through) and
    collection cleanup runs in ``finally`` so failures cannot leak test
    collections into later runs.
    """
    from backend.agents.pattern_detector import detect_patterns
    from backend.agents.cross_group_analyst import analyze_cross_group
    from backend.db.chroma import get_all_signals
    import chromadb
    from backend.config import CHROMA_DB_PATH
    dev_group = "test_dev_m5"
    product_group = "test_product_m5"
    try:
        # Process both groups
        print("Processing dev team messages...")
        dev_signals = await process_message_batch(dev_group, DEV_MSGS)
        print(f" ✅ Dev team: {len(dev_signals)} signals stored")
        print("Processing product team messages...")
        prod_signals = await process_message_batch(product_group, PRODUCT_MSGS)
        print(f" ✅ Product team: {len(prod_signals)} signals stored")
        # Test pattern detection
        print("\nRunning pattern detection on dev team...")
        patterns = await detect_patterns(dev_group)
        print(f" Found {len(patterns)} patterns:")
        for p in patterns:
            print(f" [{p.severity}] {p.type}: {p.description[:80]}")
        print(f" ✅ Pattern detection working")
        # Test cross-group analysis
        print("\nRunning cross-group analysis...")
        summaries = {
            "Acme Dev Team": get_all_signals(dev_group),
            "Acme Product": get_all_signals(product_group),
        }
        insights = await analyze_cross_group(summaries)
        print(f" Found {len(insights)} cross-group insights:")
        for i in insights:
            print(f" [{i.severity}] {i.type}: {i.description[:100]}")
        print(f" ✅ Cross-group analysis working")
    finally:
        # Always drop both per-test collections, even on failure.
        client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
        for name in (dev_group, product_group):
            try:
                client.delete_collection(f"ll_{name}")
            except Exception:
                pass
    print("\n🎉 MILESTONE 5 PASSED — Pattern detection + cross-group analysis working")
# Guard the entry point so importing this module does not run the tests.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,23 @@
"""Pre-check for Milestone 6: Verify bot token works before running."""
import asyncio, os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
async def test():
    """Verify the configured Telegram bot token by calling Bot.get_me().

    Network I/O only: connects to the Telegram API, prints the bot identity,
    then prints the manual BotFather privacy-mode checklist the operator must
    complete before group testing.
    """
    from telegram import Bot
    from backend.config import TELEGRAM_BOT_TOKEN
    # Fail fast on a missing/placeholder token before hitting the network.
    assert TELEGRAM_BOT_TOKEN and len(TELEGRAM_BOT_TOKEN) > 10, "TELEGRAM_BOT_TOKEN not set!"
    bot = Bot(token=TELEGRAM_BOT_TOKEN)
    me = await bot.get_me()
    print(f" ✅ Bot connected: @{me.username} ({me.first_name})")
    print(f"\n IMPORTANT: Before testing in a group:")
    print(f" 1. Open Telegram → Search @BotFather")
    print(f" 2. Send: /setprivacy")
    print(f" 3. Select @{me.username}")
    print(f" 4. Choose: Disable")
    print(f" 5. Create a test group, add @{me.username} to it")
    print(f"\n Then run: python run_bot.py")
    print(f" Send messages in the group, then try: /ask [question]")
# Guard the entry point so importing this module does not run the check.
if __name__ == "__main__":
    asyncio.run(test())