mirror of
https://github.com/arkorty/B.Tech-Project-III.git
synced 2026-04-19 20:51:49 +00:00
init
This commit is contained in:
213
thirdeye/backend/agents/link_fetcher.py
Normal file
213
thirdeye/backend/agents/link_fetcher.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""Link Fetcher — extracts, summarizes, and stores content from URLs shared in chat."""
|
||||
import re
import uuid
import logging
import asyncio
from datetime import datetime, timezone

import httpx
from bs4 import BeautifulSoup

from backend.providers import call_llm
from backend.config import ENABLE_LINK_FETCH
|
||||
|
||||
# Module-level logger for this agent.
logger = logging.getLogger("thirdeye.agents.link_fetcher")

# Patterns to skip (images, downloads, social media embeds, etc.)
# Binary/media assets yield no useful text, and the listed social platforms
# typically serve JS-only pages to plain HTTP clients.
SKIP_PATTERNS = [
    r"\.(png|jpg|jpeg|gif|svg|webp|ico|bmp)(\?.*)?$",
    r"\.(zip|tar|gz|rar|7z|exe|msi|dmg|apk|deb)(\?.*)?$",
    r"\.(mp3|mp4|avi|mov|mkv|wav|flac)(\?.*)?$",
    r"^https?://(www\.)?(twitter|x)\.com/.*/status/",
    r"^https?://(www\.)?instagram\.com/p/",
    r"^https?://(www\.)?tiktok\.com/",
    r"^https?://(www\.)?youtube\.com/shorts/",
    r"^https?://t\.me/",  # Other Telegram links
]

# Compiled once at import time so should_fetch() is cheap per URL.
SKIP_COMPILED = [re.compile(p, re.IGNORECASE) for p in SKIP_PATTERNS]
|
||||
|
||||
|
||||
def extract_urls(text: str) -> list[str]:
    """Extract all HTTP/HTTPS URLs from a text string."""
    # Match http(s) runs that stop at whitespace or common delimiter chars.
    matches = re.findall(r"https?://[^\s<>\"')\]},;]+", text)

    # Trim punctuation that commonly trails a pasted link, then discard
    # fragments too short to be a usable URL.
    trimmed = (m.rstrip(".,;:!?)") for m in matches)
    return [u for u in trimmed if len(u) > 10]
|
||||
|
||||
|
||||
def should_fetch(url: str) -> bool:
    """Decide if a URL is worth fetching (skip images, downloads, social embeds)."""
    # Fetchable iff the URL matches none of the precompiled skip patterns.
    return not any(pattern.search(url) for pattern in SKIP_COMPILED)
|
||||
|
||||
|
||||
async def fetch_url_content(url: str, timeout: float = 15.0) -> dict | None:
    """
    Fetch a URL and extract main text content.

    Args:
        url: Absolute HTTP/HTTPS URL to fetch.
        timeout: Per-request timeout in seconds.

    Returns:
        {title, text, url} or None if fetch fails
    """
    try:
        # Browser-like headers: some sites reject UA-less requests, and the
        # Accept header steers servers toward an HTML response.
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
            headers={
                "User-Agent": "Mozilla/5.0 (compatible; ThirdEye/1.0; +https://thirdeye.dev)",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            },
        ) as client:
            response = await client.get(url)

            # Only a plain 200 is accepted; redirects were already followed above.
            if response.status_code != 200:
                logger.info(f"URL returned {response.status_code}: {url[:80]}")
                return None

            # Skip non-HTML payloads (PDFs, JSON APIs, ...) — the parse stage
            # below only makes sense for (X)HTML documents.
            content_type = response.headers.get("content-type", "")
            if "text/html" not in content_type and "application/xhtml" not in content_type:
                logger.info(f"Skipping non-HTML content ({content_type}): {url[:80]}")
                return None

            html = response.text

    except httpx.TimeoutException:
        logger.info(f"URL timed out: {url[:80]}")
        return None
    except Exception as e:
        # Best-effort fetch: any network/TLS/protocol error just skips the URL.
        logger.info(f"URL fetch failed ({type(e).__name__}): {url[:80]}")
        return None

    # Parse HTML
    try:
        soup = BeautifulSoup(html, "html.parser")

        # Extract title
        title = ""
        if soup.title and soup.title.string:
            title = soup.title.string.strip()

        # Remove script, style, nav, footer, header elements — boilerplate
        # that would pollute the extracted text.
        for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "form"]):
            tag.decompose()

        # Try to find main content area; fall back to the whole document.
        main = soup.find("main") or soup.find("article") or soup.find("div", {"role": "main"})
        if main:
            text = main.get_text(separator="\n", strip=True)
        else:
            text = soup.get_text(separator="\n", strip=True)

        # Clean up: collapse to non-empty, stripped lines.
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        text = "\n".join(lines)

        # Skip if too little content (likely a JS-rendered or empty page).
        if len(text) < 100:
            logger.info(f"Too little text content ({len(text)} chars): {url[:80]}")
            return None

        # Truncate very long content to bound downstream LLM/storage cost.
        if len(text) > 8000:
            text = text[:8000] + "\n\n[Content truncated]"

        return {
            "title": title or url,
            "text": text,
            "url": url,
        }

    except Exception as e:
        logger.warning(f"HTML parsing failed for {url[:80]}: {e}")
        return None
|
||||
|
||||
|
||||
async def summarize_content(title: str, text: str, url: str) -> str:
    """Use LLM to create a concise summary of fetched content."""
    # Cap the content sent to the model to keep prompt size bounded.
    snippet = text[:3000]

    system_prompt = """You are a content summarizer for ThirdEye.
Given the title and text of a web page, produce a concise 2-4 sentence summary that captures the key information.
Focus on: main topic, key facts, any actionable insights, any deadlines or decisions mentioned.
Respond with ONLY the summary text, nothing else."""
    user_prompt = f"Title: {title}\nURL: {url}\n\nContent:\n{snippet}"

    try:
        response = await call_llm(
            "fast_small",
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.2,
            max_tokens=300,
        )
        return response["content"].strip()
    except Exception as e:
        logger.warning(f"Link summarization failed: {e}")
        # Fallback: use first 200 chars of text
        return text[:200] + "..."
|
||||
|
||||
|
||||
async def process_links_from_message(
    text: str,
    group_id: str,
    shared_by: str = "Unknown",
) -> list[dict]:
    """
    Full pipeline: extract URLs from message → fetch → summarize → produce signals.

    Designed to be called in the background (non-blocking to the main message pipeline).

    Args:
        text: Raw chat message text that may contain URLs.
        group_id: Identifier of the chat group the message came from.
        shared_by: Display name of the user who shared the link(s).

    Returns:
        List of signal dicts ready for store_signals()
    """
    if not ENABLE_LINK_FETCH:
        return []

    urls = extract_urls(text)
    fetchable = [u for u in urls if should_fetch(u)]

    if not fetchable:
        return []

    signals = []

    # Process up to 3 links per message to avoid overload
    for url in fetchable[:3]:
        try:
            content = await fetch_url_content(url)
            if not content:
                continue

            summary = await summarize_content(content["title"], content["text"], url)

            signal = {
                "id": str(uuid.uuid4()),
                "type": "link_knowledge",
                "summary": f"[Link: {content['title'][:80]}] {summary[:200]}",
                "entities": [f"@{shared_by}", url[:100]],
                "severity": "low",
                "status": "reference",
                "sentiment": "neutral",
                "urgency": "none",
                "raw_quote": summary,
                # datetime.utcnow() is deprecated (Python 3.12+); this form emits
                # the identical naive-UTC ISO-8601 string (no "+00:00" suffix).
                "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
                "group_id": group_id,
                "lens": "link",
                "keywords": [content["title"][:50], "link", "web", shared_by],
            }
            signals.append(signal)
            logger.info(f"Link ingested: {content['title'][:50]} ({url[:60]})")

        except Exception as e:
            # Best-effort per-link: one bad URL must not kill the batch.
            logger.warning(f"Link processing failed for {url[:60]}: {e}")
            continue

    return signals
|
||||
Reference in New Issue
Block a user