"""
Test Milestone 22: Voice attribution in /ask + /voicelog.
"""
import asyncio
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

# Canned voice-note transcripts used as fixtures by _seed_voice_signals.
TRANSCRIPT_POSTGRES = (
    "We decided to go with PostgreSQL. Final. "
    "Raj will set up the schema by Thursday."
)
TRANSCRIPT_BLOCKER = (
    "Dashboard is still blocked on design specs. Two weeks now. "
    "Hard blocker for the sprint."
)
TRANSCRIPT_BUG = (
    "Checkout timeout is happening again. Critical. "
    "Someone needs to investigate today."
)


async def _seed_voice_signals(group_id: str):
    """Populate ``group_id`` with three canned voice-note sessions.

    Sets the group's lens to "dev", then for every session: builds and stores
    a raw transcript signal, pushes the transcript through
    ``process_message_batch``, and hands the batch result plus the session's
    voice metadata to ``_inject_voice_metadata``.
    """
    from backend.pipeline import process_message_batch, set_lens
    from backend.agents.voice_handler import build_voice_transcript_signal, _inject_voice_metadata
    from backend.db.chroma import store_signals

    set_lens(group_id, "dev")

    # Each row: (speaker, transcript, voice file id, duration in seconds, timestamp)
    recordings = (
        ("Raj", TRANSCRIPT_POSTGRES, "f1", 22, "2026-03-14T10:00:00Z"),
        ("Alex", TRANSCRIPT_BLOCKER, "f2", 18, "2026-03-17T11:00:00Z"),
        ("Sam", TRANSCRIPT_BUG, "f3", 15, "2026-03-19T09:00:00Z"),
    )

    for speaker, spoken_text, voice_id, seconds, recorded_at in recordings:
        # 1) Raw transcript signal goes straight into the store.
        raw_signal = build_voice_transcript_signal(
            transcript=spoken_text,
            sender=speaker,
            group_id=group_id,
            voice_file_id=voice_id,
            duration_seconds=seconds,
            language="en",
            timestamp=recorded_at,
        )
        store_signals(group_id, [raw_signal])

        # 2) The same transcript is run through the regular message pipeline.
        batch_result = await process_message_batch(
            group_id,
            [{"sender": speaker, "text": spoken_text, "timestamp": recorded_at}],
        )

        # 3) The pipeline output is tagged with this session's voice metadata.
        _inject_voice_metadata(
            batch_result,
            {
                "sender": speaker,
                "voice_file_id": voice_id,
                "duration_seconds": seconds,
                "language": "en",
            },
        )


async def test_signal_formatter():
    """Check the source tag the context formatter puts on each kind of signal.

    Covers a flat voice signal, a chat signal, a document signal, and a voice
    signal wrapped in the nested ``metadata``/``document``/``id`` layout.
    """
    from backend.agents.query_agent import _format_signal_for_context

    print("Testing signal formatter with voice attribution...")

    # --- flat voice signal: tag plus speaker, date and duration ---
    rendered_voice = _format_signal_for_context({
        "type": "architecture_decision",
        "summary": "Team decided to use PostgreSQL",
        "source": "voice",
        "speaker": "Raj",
        "voice_duration": 45,
        "timestamp": "2026-03-14T10:00:00Z",
        "entities": ["@Raj"],
    })
    assert "[VOICE NOTE" in rendered_voice, f"Expected [VOICE NOTE] prefix, got: {rendered_voice}"
    for expected_fragment in ("@Raj", "Mar 14", "45s"):
        assert expected_fragment in rendered_voice
    print(f"  \u2705 Voice: {rendered_voice[:120]}")

    # --- chat signal ---
    chat_payload = {
        "type": "tech_debt",
        "summary": "JWT hardcoded",
        "source": "chat",
        "timestamp": "2026-03-15T09:00:00Z",
        "entities": ["@Alex"],
    }
    assert "[CHAT" in _format_signal_for_context(chat_payload)
    print("  \u2705 Chat signal formatted correctly")

    # --- document signal ---
    document_payload = {
        "type": "document_knowledge",
        "summary": "OAuth required",
        "source": "document",
        "timestamp": "2026-03-16T09:00:00Z",
        "entities": [],
    }
    assert "[DOCUMENT" in _format_signal_for_context(document_payload)
    print("  \u2705 Document signal formatted correctly")

    # --- voice signal in the nested ChromaDB-style record layout ---
    inner_metadata = {
        "type": "architecture_decision",
        "summary": "Use Redis for caching",
        "source": "voice",
        "speaker": "Sam",
        "voice_duration": 30,
        "timestamp": "2026-03-18T10:00:00Z",
        "entities": ["@Sam"],
    }
    rendered_nested = _format_signal_for_context({
        "metadata": inner_metadata,
        "document": "Use Redis for caching",
        "id": "test-id",
    })
    assert "[VOICE NOTE" in rendered_nested, f"Nested format failed: {rendered_nested}"
    print("  \u2705 Nested ChromaDB format handled correctly")


async def test_voice_query_attribution():
    """Test that /ask returns voice attribution in its answer.

    Seeds the group with the canned voice sessions, asks which database was
    chosen, and requires the answer to mention the voice-sourced decision.
    Explicit attribution phrasing ("voice note", "@raj", ...) is reported but
    not required.

    The test collection ``ll_<group_id>`` is deleted in a ``finally`` block so
    a failed assertion or a query error does not leave data behind in the
    persistent store.
    """
    from backend.pipeline import query_knowledge
    from backend.config import CHROMA_DB_PATH
    import chromadb

    print("\nTesting /ask returns voice attribution...")
    group_id = "test_voice_m22_ask"

    try:
        await _seed_voice_signals(group_id)

        answer = await query_knowledge(group_id, "What database did we decide to use?")
        assert len(answer) > 10, f"Answer too short to be meaningful. Got: {answer!r}"

        # Lower-case once; both keyword checks below are case-insensitive.
        answer_lc = answer.lower()
        relevant = any(w in answer_lc for w in ["postgres", "raj", "voice", "database"])
        assert relevant, f"Answer did not surface voice-sourced decision. Got: {answer[:200]}"
        print(f"  \u2705 Answer surfaces voice decision: {answer[:150]}...")

        has_citation = any(phrase in answer_lc for phrase in ["voice note", "@raj", "raj said", "mar 14"])
        if has_citation:
            print("  \u2705 Explicit voice attribution present in answer")
        else:
            print("  \u26a0\ufe0f  Answer correct but attribution phrasing varies by provider (acceptable)")
    finally:
        # Cleanup runs on success and on failure alike.
        client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
        try:
            client.delete_collection(f"ll_{group_id}")
        except Exception:
            # Deliberately best-effort: a cleanup problem (for instance the
            # collection never having been created) must not mask the real
            # test outcome.
            pass


async def test_voicelog_filtering():
    """Test voicelog retrieval and speaker filtering.

    Seeds the group with the canned voice sessions, flattens every stored
    record, and checks that (a) at least one record is voice-sourced and
    (b) at least one of those is attributable to Raj via ``speaker`` or
    ``entities``.

    The test collection ``ll_<group_id>`` is deleted in a ``finally`` block so
    a failed assertion does not leave data behind in the persistent store.
    """
    from backend.db.chroma import get_all_signals
    import chromadb
    from backend.config import CHROMA_DB_PATH

    print("\nTesting voicelog signal retrieval and filtering...")
    group_id = "test_voice_m22_log"

    # Flatten metadata (same as commands.py does)
    def _flatten(s):
        # `or {}` tolerates records whose metadata is missing or None.
        meta = s.get("metadata") or {}
        flat = {**meta}
        flat.setdefault("id", s.get("id", ""))
        flat.setdefault("document", s.get("document", ""))
        return flat

    try:
        await _seed_voice_signals(group_id)

        all_signals = [_flatten(s) for s in get_all_signals(group_id)]

        # A record counts as voice-sourced if any of the three markers match.
        voice_signals = [
            s for s in all_signals
            if s.get("source") == "voice"
            or s.get("type") == "voice_transcript"
            or "[Voice @" in s.get("summary", "")
        ]
        assert len(voice_signals) > 0, "Expected voice-sourced signals"
        print(f"  \u2705 Found {len(voice_signals)} voice-sourced signal(s)")

        # `entities` is stringified so the check works whatever container
        # type it was stored as.
        raj_signals = [
            s for s in voice_signals
            if "raj" in s.get("speaker", "").lower() or "raj" in str(s.get("entities", [])).lower()
        ]
        assert len(raj_signals) > 0, "Expected signals from Raj"
        print(f"  \u2705 Found {len(raj_signals)} signal(s) from @Raj")

        structured = [s for s in voice_signals if s.get("type") != "voice_transcript"]
        print(f"  \u2705 {len(structured)} structured, {len(voice_signals) - len(structured)} raw transcripts")
    finally:
        # Cleanup runs on success and on failure alike.
        client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
        try:
            client.delete_collection(f"ll_{group_id}")
        except Exception:
            # Deliberately best-effort: a cleanup problem must not mask the
            # real test outcome.
            pass


async def test_voicelog_command_importable():
    """Test that cmd_voicelog imports without errors and is callable."""
    print("\nTesting cmd_voicelog import...")
    from backend.bot.commands import cmd_voicelog

    # Use the imported name so the test verifies something beyond the import
    # statement itself succeeding.
    assert callable(cmd_voicelog), f"cmd_voicelog is not callable: {cmd_voicelog!r}"
    print("  \u2705 cmd_voicelog importable")


async def test_mixed_source_query():
    """Test that /ask uses voice + chat signals together.

    Feeds one chat message proposing Redis and one later voice note choosing
    PostgreSQL into the same group, then asks about caching and requires the
    answer to mention at least one of the relevant terms.

    The test collection ``ll_<group_id>`` is deleted in a ``finally`` block so
    a failed assertion or pipeline error does not leave data behind in the
    persistent store.
    """
    from backend.pipeline import process_message_batch, query_knowledge, set_lens
    from backend.agents.voice_handler import build_voice_transcript_signal, _inject_voice_metadata
    from backend.db.chroma import store_signals
    import chromadb
    from backend.config import CHROMA_DB_PATH

    print("\nTesting mixed-source query (voice + chat)...")
    group_id = "test_voice_m22_mixed"

    try:
        set_lens(group_id, "dev")

        # Chat signal: Redis
        await process_message_batch(group_id, [
            {"sender": "Alex", "text": "I think we should use Redis for the cache.", "timestamp": "2026-03-10T09:00:00Z"}
        ])

        # Voice signal (more recent): overrides to PostgreSQL
        transcript = "Just to be clear — we're going with PostgreSQL for everything. Redis is off the table."
        voice_timestamp = "2026-03-21T10:00:00Z"
        voice_meta = {"sender": "Raj", "voice_file_id": "f_override", "duration_seconds": 20, "language": "en"}

        ts_signal = build_voice_transcript_signal(
            transcript=transcript, sender=voice_meta["sender"], group_id=group_id,
            voice_file_id=voice_meta["voice_file_id"],
            duration_seconds=voice_meta["duration_seconds"],
            language=voice_meta["language"],
            timestamp=voice_timestamp,
        )
        store_signals(group_id, [ts_signal])
        extracted = await process_message_batch(group_id, [
            {"sender": voice_meta["sender"], "text": transcript, "timestamp": voice_timestamp}
        ])
        _inject_voice_metadata(extracted, voice_meta)

        answer = await query_knowledge(group_id, "What did we decide about caching?")
        assert any(w in answer.lower() for w in ["postgres", "redis", "cache"]), (
            f"Answer did not mention the caching decision. Got: {answer[:200]}"
        )
        print(f"  \u2705 Mixed-source query answered: {answer[:120]}...")
    finally:
        # Cleanup runs on success and on failure alike.
        client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
        try:
            client.delete_collection(f"ll_{group_id}")
        except Exception:
            # Deliberately best-effort: a cleanup problem must not mask the
            # real test outcome.
            pass


async def main():
    """Run every Milestone 22 test in order, stopping at the first failure."""
    print("Running Milestone 22 tests...\n")

    # Order matters only for the printed log; each test uses its own group id.
    ordered_tests = (
        test_signal_formatter,
        test_voice_query_attribution,
        test_voicelog_filtering,
        test_voicelog_command_importable,
        test_mixed_source_query,
    )
    for run_test in ordered_tests:
        await run_test()

    print("\n\U0001f389 MILESTONE 22 PASSED — Voice attribution in /ask, /voicelog working")


# Run the suite only when this file is executed as a script, so that merely
# importing the module (e.g. by a test collector) does not start the run.
if __name__ == "__main__":
    asyncio.run(main())