# B.Tech-Project-III/thirdeye/backend/agents/json_utils.py

"""Utilities for robustly parsing JSON from LLM responses."""

import json
import re


def extract_json_object(content: str) -> dict:
    """Extract and parse the first JSON object from raw LLM output.

    The response is processed in three steps:

    1. Surrounding whitespace and, if the text starts with a Markdown code
       fence (```` ``` ```` or ```` ```json ````), the opening fence and a
       trailing closing fence are removed.
    2. The remaining text is parsed as a whole; if it is a JSON object it
       is returned as-is.
    3. Otherwise every ``{`` in the text is tried, left to right, as the
       start of a JSON object and the first one that decodes is returned.
       This tolerates prose before/after the JSON payload.

    Note that step 3 is best-effort: when the whole text is not a JSON
    object (e.g. it is a JSON array, or the outer object is malformed),
    the first decodable object found may be a nested or element object.

    Args:
        content: Raw text returned by the LLM. ``None`` or an empty string
            is treated as an empty response.

    Returns:
        The first JSON object that could be decoded, as a ``dict``.

    Raises:
        json.JSONDecodeError: If the response is empty (before or after
            fence removal) or contains no decodable JSON object.
    """
    text = (content or "").strip()
    if not text:
        raise json.JSONDecodeError("Empty LLM response", text, 0)

    if text.startswith("```"):
        # Drop the opening fence (with optional "json" tag) and a closing
        # fence at the very end of the text, if present.
        text = re.sub(r"^```(?:json)?\s*", "", text, flags=re.IGNORECASE)
        text = re.sub(r"\s*```$", "", text)
        text = text.strip()
        if not text:
            raise json.JSONDecodeError(
                "Empty LLM response after cleanup", text, 0
            )

    # Direct parse for pure JSON responses.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(parsed, dict):
            return parsed

    # Try to decode from each object start. This handles wrapper text more
    # reliably than regex, especially with nested braces. Passing the start
    # index to raw_decode avoids copying the tail of the string for every
    # candidate brace, and str.find skips non-brace characters at C speed.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(parsed, dict):
                return parsed
        start = text.find("{", start + 1)

    raise json.JSONDecodeError("No valid top-level JSON object found", text, 0)