mirror of
https://github.com/arkorty/B.Tech-Project-III.git
synced 2026-04-19 20:51:49 +00:00
init
This commit is contained in:
200
thirdeye/backend/agents/document_ingestor.py
Normal file
200
thirdeye/backend/agents/document_ingestor.py
Normal file
@@ -0,0 +1,200 @@
|
||||
"""Document Ingestor — extracts text from PDFs, DOCX, TXT and chunks for RAG storage."""
|
||||
import logging
import os
import uuid
from datetime import datetime
from typing import Optional
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.document_ingestor")
|
||||
|
||||
# --- Text Extraction ---
|
||||
|
||||
def extract_text_from_pdf(file_path: str) -> list[dict]:
    """Pull text out of a PDF, one entry per non-empty page.

    Returns a list of ``{"page": <1-based page number>, "text": <stripped text>}``;
    pages with no extractable text are skipped. On any read/parse failure the
    error is logged and whatever pages were collected so far are returned.
    """
    from PyPDF2 import PdfReader

    results: list[dict] = []
    try:
        for page_no, pdf_page in enumerate(PdfReader(file_path).pages, start=1):
            content = pdf_page.extract_text()
            if content and content.strip():
                results.append({"page": page_no, "text": content.strip()})
    except Exception as e:
        logger.error(f"PDF extraction failed for {file_path}: {e}")

    return results
|
||||
|
||||
|
||||
def extract_text_from_docx(file_path: str) -> list[dict]:
    """Extract DOCX body text as a single pseudo-page.

    DOCX carries no reliable page information, so all non-blank paragraphs are
    joined into one ``{"page": 1, "text": ...}`` entry. Returns ``[]`` when the
    document cannot be opened or contains no text.
    """
    from docx import Document

    try:
        paragraph_texts = (p.text for p in Document(file_path).paragraphs if p.text.strip())
        joined = "\n".join(paragraph_texts)
        if joined.strip():
            return [{"page": 1, "text": joined.strip()}]
    except Exception as e:
        logger.error(f"DOCX extraction failed for {file_path}: {e}")

    return []
|
||||
|
||||
|
||||
def extract_text_from_txt(file_path: str) -> list[dict]:
    """Read a plain-text file and wrap it as a single-page result.

    Undecodable bytes are silently dropped (``errors="ignore"``). Returns
    ``[]`` for empty or unreadable files; read failures are logged.
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
            content = handle.read().strip()
    except Exception as e:
        logger.error(f"TXT extraction failed for {file_path}: {e}")
        return []

    return [{"page": 1, "text": content}] if content else []
|
||||
|
||||
|
||||
# File-extension → extractor dispatch table. Keys are lower-case and include
# the leading dot; all plain-text-like formats share the TXT reader.
_PLAIN_TEXT_EXTS = (".txt", ".md", ".csv", ".json", ".log")

EXTRACTORS = {
    ".pdf": extract_text_from_pdf,
    ".docx": extract_text_from_docx,
    **{ext: extract_text_from_txt for ext in _PLAIN_TEXT_EXTS},
}
|
||||
|
||||
|
||||
def extract_text(file_path: str) -> list[dict]:
    """Dispatch to the extractor matching the file's (lower-cased) extension.

    Returns the extractor's page list, or ``[]`` (with a warning logged) when
    the extension has no registered extractor.
    """
    ext = os.path.splitext(file_path)[1].lower()
    handler = EXTRACTORS.get(ext)
    if handler is None:
        logger.warning(f"Unsupported file type: {ext} ({file_path})")
        return []
    return handler(file_path)
|
||||
|
||||
|
||||
# --- Chunking ---
|
||||
|
||||
def chunk_text(text: str, max_chars: int = 1500, overlap_chars: int = 200) -> list[str]:
    """
    Split text into overlapping chunks.

    Uses paragraph boundaries when possible, falls back to sentence boundaries,
    then hard character splits. ~1500 chars ≈ ~375 tokens for embedding.

    Args:
        text: The text to split.
        max_chars: Target maximum chunk size before overlap is added.
        overlap_chars: Trailing characters of the previous chunk prepended to
            each subsequent chunk (trimmed forward to a word boundary).

    Returns:
        List of chunk strings. With overlap enabled a chunk may exceed
        ``max_chars`` by at most ``overlap_chars + 1`` characters.
    """
    if len(text) <= max_chars:
        return [text]

    # Split by paragraphs first
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]

    chunks: list[str] = []
    current_chunk = ""

    for para in paragraphs:
        # If adding this paragraph stays under limit, accumulate it.
        if len(current_chunk) + len(para) + 1 <= max_chars:
            current_chunk = (current_chunk + "\n" + para).strip()
            continue

        # Flush the chunk built so far.
        if current_chunk:
            chunks.append(current_chunk)

        if len(para) <= max_chars:
            current_chunk = para
            continue

        # Oversized paragraph: fall back to sentence boundaries.
        sentences = para.replace(". ", ".\n").split("\n")
        sub_chunk = ""
        for sent in sentences:
            # FIX: a single sentence longer than max_chars previously leaked
            # through as an oversized chunk; hard-split it by characters, as
            # the docstring promises.
            while len(sent) > max_chars:
                if sub_chunk:
                    chunks.append(sub_chunk)
                    sub_chunk = ""
                chunks.append(sent[:max_chars])
                sent = sent[max_chars:]
            if len(sub_chunk) + len(sent) + 1 <= max_chars:
                sub_chunk = (sub_chunk + " " + sent).strip()
            else:
                if sub_chunk:
                    chunks.append(sub_chunk)
                sub_chunk = sent
        # Carry the sentence remainder into the running chunk (may be empty).
        current_chunk = sub_chunk

    if current_chunk:
        chunks.append(current_chunk)

    # Add overlap: prepend last N chars of previous chunk to each subsequent chunk
    if overlap_chars > 0 and len(chunks) > 1:
        overlapped = [chunks[0]]
        for i in range(1, len(chunks)):
            prev_tail = chunks[i - 1][-overlap_chars:]
            # Trim forward to a word boundary so the overlap doesn't start mid-word.
            space_idx = prev_tail.find(" ")
            if space_idx > 0:
                prev_tail = prev_tail[space_idx + 1:]
            overlapped.append(prev_tail + " " + chunks[i])
        chunks = overlapped

    return chunks
|
||||
|
||||
|
||||
# --- Main Ingestion ---
|
||||
|
||||
def ingest_document(
    file_path: str,
    group_id: str,
    shared_by: str = "Unknown",
    filename: Optional[str] = None,
) -> list[dict]:
    """
    Full pipeline: extract text → chunk → produce signal dicts ready for ChromaDB.

    Args:
        file_path: Path to the downloaded file on disk
        group_id: Telegram group ID
        shared_by: Who shared the file
        filename: Original filename (for metadata); defaults to the basename
            of file_path

    Returns:
        List of signal dicts ready for store_signals()
    """
    if filename is None:
        filename = os.path.basename(file_path)

    # Extract
    pages = extract_text(file_path)
    if not pages:
        # FIX: log the real filename (was a hard-coded "(unknown)" placeholder).
        logger.warning(f"No text extracted from {filename}")
        return []

    # Chunk each page
    signals = []
    total_chunks = 0

    for page_data in pages:
        page_num = page_data["page"]
        chunks = chunk_text(page_data["text"])

        for chunk_text_str in chunks:
            if len(chunk_text_str.strip()) < 30:
                continue  # Skip tiny chunks — too small to be useful context

            signal = {
                "id": str(uuid.uuid4()),
                "type": "document_knowledge",
                # FIX: summary now carries the source filename (was "(unknown)").
                "summary": f"[{filename} p{page_num}] {chunk_text_str[:150]}...",
                "entities": [f"@{shared_by}", filename],
                "severity": "low",
                "status": "reference",
                "sentiment": "neutral",
                "urgency": "none",
                "raw_quote": chunk_text_str,
                # NOTE(review): utcnow() is naive and deprecated since 3.12;
                # kept so stored timestamps retain their existing format
                # (no "+00:00" suffix) — confirm before migrating.
                "timestamp": datetime.utcnow().isoformat(),
                "group_id": group_id,
                "lens": "document",
                "keywords": [filename, f"page_{page_num}", "document", shared_by],
            }
            signals.append(signal)
            total_chunks += 1

    # FIX: log the real filename (was "(unknown)").
    logger.info(f"Ingested {filename}: {len(pages)} pages → {total_chunks} chunks for group {group_id}")
    return signals
|
||||
Reference in New Issue
Block a user