"""
|
|
Test Milestone 15: Meet transcript processing agent.
|
|
Tests signal extraction from transcript text WITHOUT needing the extension or Chrome.
|
|
"""
|
|
import asyncio
|
|
import os
|
|
import sys
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))


# Sample meeting transcript (realistic, multi-topic): explicit decision,
# dated action items, a blocker, and a schedule risk.
SAMPLE_TRANSCRIPT_1 = """
Alex: Alright, let's get started. So the main thing today is the database migration.
Sam: Yeah, we've been going back and forth but I think we should just commit to PostgreSQL.
It has better support for our JSON query patterns and the team already knows it.
Alex: Agreed, let's make that the decision. We go with PostgreSQL.
Priya: I can set up the initial schema. I'll have it ready by Thursday.
Sam: Great. One thing though — the legacy MySQL tables still have some data we need to migrate.
I have no idea how long that's going to take. That's a real risk.
Alex: Who's owning the migration scripts?
Priya: I'll do it, but I'll need the final schema signed off before I start. That's a blocker for me.
Sam: When do we need the migration done by?
Alex: End of sprint, so March 28th.
Sam: Can we do that? I'm not sure.
Alex: We'll try. Priya, can you at least start the schema this week?
Priya: Yes, schema by Thursday, migration scripts next week if all goes well.
"""
SAMPLE_TRANSCRIPT_2 = """
|
|
Lisa: Moving on — the client dashboard is still blocked waiting on design specs.
|
|
Alex: Yeah that's been two weeks now. We literally cannot start without those specs.
|
|
Lisa: I know, I'll follow up with design today. That's on me.
|
|
Sam: Also, the checkout endpoint is still hitting intermittent timeouts.
|
|
Third time this sprint. We need to actually fix this, not just restart pods.
|
|
Alex: Agreed, that needs an owner. Sam can you pick that up?
|
|
Sam: Yeah I'll investigate this week. I'll add a ticket.
|
|
Lisa: Any risks before we close?
|
|
Priya: The OAuth integration is touching a lot of the auth layer.
|
|
If something breaks there, it could affect all our users at once. High risk.
|
|
Alex: Good call. Let's make sure we do that in a feature branch and have a rollback plan.
|
|
"""


async def test_signal_extraction_chunk_1():
    """Test extraction from a decision-heavy transcript.

    Feeds SAMPLE_TRANSCRIPT_1 through process_meet_chunk and asserts that a
    raw chunk, at least one decision, and at least one action item are
    produced, and that the stored signals are queryable from ChromaDB.
    """
    from backend.agents.meet_ingestor import process_meet_chunk
    from backend.db.chroma import query_signals
    import chromadb
    from backend.config import CHROMA_DB_PATH

    group_id = "test_meet_m15_a"
    meeting_id = "sprint-planning-m15"

    print("Testing signal extraction from transcript chunk 1 (decisions + action items)...")
    try:
        signals = await process_meet_chunk(
            meeting_id=meeting_id,
            group_id=group_id,
            chunk_index=0,
            text=SAMPLE_TRANSCRIPT_1.strip(),
            speaker="Alex",
            timestamp="2026-03-21T10:00:00Z",
            is_final=False,
        )

        assert len(signals) > 0, "Expected at least some signals to be extracted"
        print(f" ✅ {len(signals)} total signals produced")

        types = [s["type"] for s in signals]
        print(f" Types found: {set(types)}")

        # Must have at least a raw chunk
        assert "meet_chunk_raw" in types, "Expected raw chunk signal"
        print(" ✅ Raw chunk stored (enables full-text search)")

        # Should have extracted decisions (PostgreSQL decision is clear)
        decisions = [s for s in signals if s["type"] == "meet_decision"]
        assert len(decisions) > 0, "Expected at least one decision (PostgreSQL decision is explicit)"
        print(f" ✅ {len(decisions)} decision signal(s) extracted")
        print(f" First decision: {decisions[0]['summary'][:100]}")

        # Should have extracted action items (Priya - schema by Thursday)
        actions = [s for s in signals if s["type"] == "meet_action_item"]
        assert len(actions) > 0, "Expected at least one action item (Priya - schema by Thursday)"
        print(f" ✅ {len(actions)} action item(s) extracted")
        print(f" First action: {actions[0]['summary'][:100]}")

        # Verify signals are in ChromaDB
        results = query_signals(group_id, "database decision PostgreSQL")
        assert len(results) > 0, "Expected signals to be queryable from ChromaDB"
        print(f" ✅ Signals queryable from ChromaDB ({len(results)} results for 'database decision PostgreSQL')")
    finally:
        # Cleanup in finally so a failed assertion does not leave a stale
        # test collection behind for the next run.
        client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
        try:
            client.delete_collection(f"ll_{group_id}")
        except Exception:
            pass


async def test_signal_extraction_chunk_2():
    """Test extraction from a blocker + risk heavy transcript.

    Feeds SAMPLE_TRANSCRIPT_2 through process_meet_chunk and asserts that
    at least one blocker and one risk signal are produced.
    """
    from backend.agents.meet_ingestor import process_meet_chunk
    import chromadb
    from backend.config import CHROMA_DB_PATH

    group_id = "test_meet_m15_b"
    meeting_id = "standup-m15"

    print("\nTesting signal extraction from transcript chunk 2 (blockers + risks)...")
    try:
        signals = await process_meet_chunk(
            meeting_id=meeting_id,
            group_id=group_id,
            chunk_index=0,
            text=SAMPLE_TRANSCRIPT_2.strip(),
            speaker="Lisa",
            timestamp="2026-03-21T10:30:00Z",
            is_final=False,
        )

        types = [s["type"] for s in signals]
        print(f" Types found: {set(types)}")

        blockers = [s for s in signals if s["type"] == "meet_blocker"]
        risks = [s for s in signals if s["type"] == "meet_risk"]

        assert len(blockers) > 0, "Expected at least one blocker (dashboard blocked on design specs)"
        print(f" ✅ {len(blockers)} blocker(s) extracted")
        print(f" First blocker: {blockers[0]['summary'][:100]}")

        assert len(risks) > 0, "Expected at least one risk (OAuth touching auth layer)"
        print(f" ✅ {len(risks)} risk(s) extracted")
        print(f" First risk: {risks[0]['summary'][:100]}")
    finally:
        # Cleanup in finally so a failed assertion does not leave a stale
        # test collection behind.
        client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
        try:
            client.delete_collection(f"ll_{group_id}")
        except Exception:
            pass


async def test_final_chunk_generates_summary():
    """Test that is_final=True triggers a summary signal generation.

    Processes two chunks for one meeting; the second has is_final=True and
    must yield a `meet_summary` signal of substantive length.
    """
    from backend.agents.meet_ingestor import process_meet_chunk
    import chromadb
    from backend.config import CHROMA_DB_PATH

    group_id = "test_meet_m15_c"
    meeting_id = "full-meeting-m15"

    print("\nTesting final chunk triggers meeting summary...")
    try:
        # First chunk
        await process_meet_chunk(
            meeting_id=meeting_id,
            group_id=group_id,
            chunk_index=0,
            text=SAMPLE_TRANSCRIPT_1.strip(),
            speaker="Alex",
            timestamp="2026-03-21T10:00:00Z",
            is_final=False,
        )

        # Final chunk
        signals = await process_meet_chunk(
            meeting_id=meeting_id,
            group_id=group_id,
            chunk_index=1,
            text=SAMPLE_TRANSCRIPT_2.strip(),
            speaker="Lisa",
            timestamp="2026-03-21T10:30:00Z",
            is_final=True,
        )

        types = [s["type"] for s in signals]
        assert "meet_summary" in types, "Expected a meet_summary signal on is_final=True"
        summary_sig = next(s for s in signals if s["type"] == "meet_summary")
        assert len(summary_sig["summary"]) > 50, "Summary should be at least 50 chars"
        print(f" ✅ Meeting summary generated ({len(summary_sig['summary'])} chars)")
        print(f" Preview: {summary_sig['summary'][:150]}...")
    finally:
        # Cleanup in finally so a failed assertion does not leave a stale
        # test collection behind.
        client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
        try:
            client.delete_collection(f"ll_{group_id}")
        except Exception:
            pass


async def test_signals_coexist_with_chat_signals():
    """Test that meet signals are queryable alongside existing chat signals.

    Stores chat-derived signals and meet-derived signals in the same group,
    then checks a single knowledge query can draw on both.
    """
    from backend.agents.meet_ingestor import process_meet_chunk
    from backend.pipeline import process_message_batch, query_knowledge, set_lens
    import chromadb
    from backend.config import CHROMA_DB_PATH

    group_id = "test_meet_m15_d"
    meeting_id = "integration-test-m15"
    set_lens(group_id, "dev")

    print("\nTesting meet signals + chat signals coexist...")
    try:
        # Add chat signals
        chat_messages = [
            {"sender": "Alex", "text": "The team agreed in a previous meeting we'd use Redis for caching.", "timestamp": "2026-03-20T09:00:00Z"},
            {"sender": "Priya", "text": "The timeout bug on checkout is still unresolved from last sprint.", "timestamp": "2026-03-20T09:05:00Z"},
        ]
        await process_message_batch(group_id, chat_messages)
        print(" ✅ Chat signals stored")

        # Add meet signals
        await process_meet_chunk(
            meeting_id=meeting_id,
            group_id=group_id,
            chunk_index=0,
            text="We decided in today's meeting to switch from Redis to Memcached for the caching layer. Sam will update the config by Friday.",
            speaker="Alex",
            timestamp="2026-03-21T10:00:00Z",
            is_final=False,
        )
        print(" ✅ Meet signals stored")

        # Query across both
        answer = await query_knowledge(group_id, "What did we decide about caching?")
        assert len(answer) > 20, "Expected a substantive answer about caching"
        print(f" ✅ Query across chat + meet: {answer[:120]}...")
    finally:
        # Cleanup in finally so a failed assertion does not leave a stale
        # test collection behind.
        client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
        try:
            client.delete_collection(f"ll_{group_id}")
        except Exception:
            pass


async def main():
    """Run all Milestone 15 tests sequentially; any failure aborts the run."""
    print("Running Milestone 15 tests...\n")
    await test_signal_extraction_chunk_1()
    await test_signal_extraction_chunk_2()
    await test_final_chunk_generates_summary()
    await test_signals_coexist_with_chat_signals()
    print("\n🎉 MILESTONE 15 PASSED — Meet transcript agent extracting and storing signals correctly")


# Guard the entry point so importing this module does not run the suite.
if __name__ == "__main__":
    asyncio.run(main())