# Mirror of https://github.com/arkorty/B.Tech-Project-III.git
# Synced 2026-04-19 12:41:48 +00:00
# 238 lines · 9.8 KiB · Python
"""Test Milestone 11: Document & PDF ingestion into RAG."""
|
|
import os, sys, tempfile
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
|
|
|
|
def test_text_extraction():
    """Test extraction from each supported file type (TXT, DOCX, PDF, other).

    Each sub-test writes a temporary file, runs ``extract_text`` on it,
    checks the returned list of page dicts, then removes the file.  The
    DOCX and PDF sub-tests are skipped gracefully when their optional
    dependencies (python-docx, PyPDF2) are not installed.
    """
    from backend.agents.document_ingestor import extract_text

    # Test 1: Plain text file — the whole file should come back as one page.
    print("Testing TXT extraction...")
    tmp = tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False, encoding="utf-8")
    tmp.write("This is a test document.\nIt has multiple lines.\nThird line about PostgreSQL decisions.")
    tmp.close()

    pages = extract_text(tmp.name)
    assert len(pages) == 1, f"Expected 1 page, got {len(pages)}"
    assert "PostgreSQL" in pages[0]["text"]
    print(f" ✅ TXT extraction works ({len(pages[0]['text'])} chars)")
    os.unlink(tmp.name)

    # Test 2: DOCX file — needs the optional python-docx package.
    print("Testing DOCX extraction...")
    try:
        from docx import Document
        doc = Document()
        doc.add_paragraph("Architecture Decision: We chose Redis for caching.")
        doc.add_paragraph("Tech Debt: The API keys are hardcoded in config.py.")
        doc.add_paragraph("Promise: Dashboard mockups will be ready by Friday March 21st.")
        tmp_docx = tempfile.NamedTemporaryFile(suffix=".docx", delete=False)
        doc.save(tmp_docx.name)
        tmp_docx.close()

        pages = extract_text(tmp_docx.name)
        assert len(pages) == 1, f"Expected 1 page, got {len(pages)}"
        assert "Redis" in pages[0]["text"]
        print(f" ✅ DOCX extraction works ({len(pages[0]['text'])} chars)")
        os.unlink(tmp_docx.name)
    except ImportError:
        print(" ⚠️ python-docx not installed, skipping DOCX test")

    # Test 3: PDF file — needs the optional PyPDF2 package.
    print("Testing PDF extraction...")
    try:
        from PyPDF2 import PdfWriter
        # PyPDF2 can't easily create PDFs with text from scratch,
        # so we test the extractor handles an empty/corrupt file gracefully
        tmp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
        writer = PdfWriter()
        writer.add_blank_page(width=612, height=792)
        writer.write(tmp_pdf)
        tmp_pdf.close()

        pages = extract_text(tmp_pdf.name)
        # Blank page = no text, should return empty gracefully
        print(f" ✅ PDF extraction handles blank PDF gracefully ({len(pages)} pages with text)")
        os.unlink(tmp_pdf.name)
    except ImportError:
        print(" ⚠️ PyPDF2 not installed, skipping PDF test")

    # Test 4: Unsupported file type should yield no pages rather than raising.
    print("Testing unsupported file type...")
    tmp_bin = tempfile.NamedTemporaryFile(suffix=".exe", delete=False)
    tmp_bin.write(b"binary data")
    tmp_bin.close()
    pages = extract_text(tmp_bin.name)
    assert len(pages) == 0, "Should return empty for unsupported types"
    print(" ✅ Unsupported file type handled gracefully")
    os.unlink(tmp_bin.name)
|
|
|
|
|
|
def test_chunking():
    """Exercise the text chunking logic on short, long, and empty inputs."""
    from backend.agents.document_ingestor import chunk_text

    print("\nTesting chunking...")

    # Test 1: Short text — should NOT be split
    tiny = "This is a short text that fits in one chunk."
    pieces = chunk_text(tiny, max_chars=1500)
    assert len(pieces) == 1, f"Short text should be 1 chunk, got {len(pieces)}"
    print(f" ✅ Short text → 1 chunk")

    # Test 2: Long text — should be split
    paragraphs = []
    for idx in range(20):
        paragraphs.append(f"This is paragraph {idx} with enough content to fill the chunk. " * 5)
    big = "\n".join(paragraphs)
    pieces = chunk_text(big, max_chars=500, overlap_chars=100)
    assert len(pieces) > 1, f"Long text should produce multiple chunks, got {len(pieces)}"
    print(f" ✅ Long text ({len(big)} chars) → {len(pieces)} chunks")

    # Test 3: Every chunk stays within the size limit (with tolerance for overlap)
    for pos, piece in enumerate(pieces):
        # Overlap can push slightly over max_chars, that's fine
        assert len(piece) < 800, f"Chunk {pos} too large: {len(piece)} chars"
    print(f" ✅ All chunks are within size bounds")

    # Test 4: Empty text
    pieces = chunk_text("")
    assert len(pieces) == 1 and pieces[0] == "", "Empty text should return ['']"
    print(f" ✅ Empty text handled")
|
|
|
|
|
|
def test_full_ingestion():
    """Test full ingestion pipeline: file → extract → chunk → signals → store → query.

    Writes a realistic API-spec text file, ingests it under a throwaway
    group id, verifies the produced signal dicts, stores them in ChromaDB,
    runs three semantic queries against the stored content, and finally
    deletes both the temp file and the test collection.
    """
    from backend.agents.document_ingestor import ingest_document
    from backend.db.chroma import store_signals, query_signals

    print("\nTesting full ingestion pipeline...")

    # Create a realistic test document
    tmp = tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False, encoding="utf-8")
    tmp.write("""API Specification v2.0 — Acme Project

Authentication:
All endpoints require OAuth 2.0 Bearer tokens. The recommended flow for SPAs is Authorization Code with PKCE.
Tokens expire after 3600 seconds. Refresh tokens are valid for 30 days.

Endpoints:
POST /api/v2/orders — Create a new order. Requires 'orders:write' scope.
GET /api/v2/orders/{id} — Retrieve order details. Requires 'orders:read' scope.
DELETE /api/v2/orders/{id} — Cancel an order. Only allowed within 24 hours of creation.

Rate Limits:
Standard tier: 100 requests per minute.
Enterprise tier: 1000 requests per minute.
Rate limit headers (X-RateLimit-Remaining) are included in every response.

Compliance:
All data must be encrypted at rest using AES-256.
PII fields are redacted in logs automatically.
GDPR deletion requests must be processed within 72 hours.
The compliance deadline for the new data residency requirements is April 1st 2026.
""")
    tmp.close()

    group_id = "test_doc_m11"

    # Ingest
    signals = ingest_document(tmp.name, group_id, shared_by="Priya", filename="api_spec_v2.txt")
    assert len(signals) > 0, f"Expected signals, got {len(signals)}"
    print(f" ✅ Ingestion produced {len(signals)} signals")

    # Verify signal structure: every signal carries type, group, and provenance.
    for s in signals:
        assert s["type"] == "document_knowledge"
        assert s["group_id"] == group_id
        assert "@Priya" in s["entities"]
        assert "api_spec_v2.txt" in s["entities"]
    print(f" ✅ All signals have correct type and metadata")

    # Store in ChromaDB
    store_signals(group_id, signals)
    print(f" ✅ Stored {len(signals)} document signals in ChromaDB")

    # Query: can we find document content?
    results = query_signals(group_id, "What authentication method is recommended?")
    assert len(results) > 0, "No results for auth query"
    found_auth = any("oauth" in r["document"].lower() or "auth" in r["document"].lower() for r in results)
    assert found_auth, "Expected to find OAuth/auth info in results"
    print(f" ✅ Query 'authentication method' returns relevant results")

    results2 = query_signals(group_id, "What is the compliance deadline?")
    assert len(results2) > 0, "No results for compliance query"
    found_compliance = any("april" in r["document"].lower() or "compliance" in r["document"].lower() for r in results2)
    assert found_compliance, "Expected to find compliance deadline in results"
    print(f" ✅ Query 'compliance deadline' returns relevant results")

    results3 = query_signals(group_id, "rate limits")
    assert len(results3) > 0, "No results for rate limits query"
    print(f" ✅ Query 'rate limits' returns {len(results3)} results")

    # Cleanup: remove the temp file and drop the test collection.
    os.unlink(tmp.name)
    import chromadb
    from backend.config import CHROMA_DB_PATH
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    try:
        client.delete_collection(f"ll_{group_id}")
        print(f" ✅ Cleaned up test collection")
    except Exception:
        # Collection may not exist if an earlier assertion failed before storage;
        # cleanup is best-effort, but we no longer swallow KeyboardInterrupt/SystemExit.
        pass
|
|
|
|
|
|
def test_mixed_query():
    """Test that document signals AND chat signals coexist and are both queryable.

    Ingests a one-line ADR document, then processes two chat messages about a
    different topic into the same group, and verifies that ``query_knowledge``
    can answer questions grounded in either source.
    """
    from backend.agents.document_ingestor import ingest_document
    from backend.pipeline import process_message_batch, query_knowledge
    from backend.db.chroma import store_signals
    import asyncio

    print("\nTesting mixed query (documents + chat signals)...")

    group_id = "test_mixed_m11"

    # 1. Ingest a document
    tmp = tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False, encoding="utf-8")
    tmp.write("Architecture Decision Record: The team has selected Redis for session caching due to sub-millisecond latency.")
    tmp.close()

    doc_signals = ingest_document(tmp.name, group_id, shared_by="Priya", filename="adr_001.txt")
    store_signals(group_id, doc_signals)
    os.unlink(tmp.name)

    # 2. Process some chat messages (that mention a DIFFERENT topic).
    # Run for its side effect of storing chat signals; the return value is unused.
    chat_messages = [
        {"sender": "Alex", "text": "The timeout bug on checkout is back. Third time this sprint.", "timestamp": "2026-03-20T10:00:00Z"},
        {"sender": "Sam", "text": "I think it's a database connection pool issue.", "timestamp": "2026-03-20T10:05:00Z"},
    ]
    asyncio.run(process_message_batch(group_id, chat_messages))

    # 3. Query for document knowledge
    answer1 = asyncio.run(query_knowledge(group_id, "What caching solution was selected?"))
    assert "redis" in answer1.lower() or "caching" in answer1.lower(), f"Expected Redis/caching mention, got: {answer1[:100]}"
    print(f" ✅ Document query works: {answer1[:80]}...")

    # 4. Query for chat knowledge
    answer2 = asyncio.run(query_knowledge(group_id, "What bugs have been reported?"))
    assert "timeout" in answer2.lower() or "bug" in answer2.lower(), f"Expected timeout/bug mention, got: {answer2[:100]}"
    print(f" ✅ Chat query works alongside documents: {answer2[:80]}...")

    # Cleanup: best-effort drop of the test collection.
    import chromadb
    from backend.config import CHROMA_DB_PATH
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    try:
        client.delete_collection(f"ll_{group_id}")
    except Exception:
        # Narrowed from a bare except so interrupts still propagate.
        pass

    print(f" ✅ Mixed query (document + chat) both return correct results")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the milestone suite in order when executed as a script; the guard
    # keeps importing this module (e.g. by pytest collection) side-effect free.
    test_text_extraction()
    test_chunking()
    test_full_ingestion()
    test_mixed_query()
    print("\n🎉 MILESTONE 11 PASSED — Document & PDF ingestion working")
|