mirror of
https://github.com/arkorty/B.Tech-Project-III.git
synced 2026-04-19 20:51:49 +00:00
init
This commit is contained in:
281
thirdeye/backend/agents/voice_handler.py
Normal file
281
thirdeye/backend/agents/voice_handler.py
Normal file
@@ -0,0 +1,281 @@
|
||||
"""
|
||||
Voice Handler
|
||||
Orchestrates the full pipeline for Telegram voice messages and video notes:
|
||||
|
||||
Telegram voice/video_note message
|
||||
-> download audio bytes
|
||||
-> transcribe via Groq Whisper (voice_transcriber.py)
|
||||
-> build a voice_transcript signal (stored raw for full-text search)
|
||||
-> run transcript through process_message_batch (signal extraction)
|
||||
-> all extracted signals carry voice attribution metadata
|
||||
|
||||
Voice metadata attached to every extracted signal:
|
||||
source: "voice"
|
||||
voice_file_id: Telegram file ID
|
||||
voice_duration: seconds
|
||||
speaker: sender display name
|
||||
"""
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from backend.agents.voice_transcriber import (
|
||||
transcribe_audio, download_telegram_audio, format_duration
|
||||
)
|
||||
from backend.config import ENABLE_VOICE_TRANSCRIPTION, VOICE_STORE_TRANSCRIPT
|
||||
from backend.db.chroma import store_signals
|
||||
from backend.pipeline import process_message_batch
|
||||
|
||||
logger = logging.getLogger("thirdeye.agents.voice_handler")
|
||||
|
||||
|
||||
# --- Voice transcript signal builder -----------------------------------------
|
||||
|
||||
def build_voice_transcript_signal(
    transcript: str,
    sender: str,
    group_id: str,
    voice_file_id: str,
    duration_seconds: int,
    language: str,
    timestamp: str,
) -> dict:
    """
    Build a voice_transcript signal holding the full raw transcription.

    This signal is always stored alongside any extracted signals so the
    complete transcript stays full-text searchable in ChromaDB, even when
    structured extraction produced nothing.

    Args:
        transcript: Full Whisper transcription text.
        sender: Display name of the voice message sender.
        group_id: Chat/group identifier the message came from.
        voice_file_id: Telegram file ID of the audio payload.
        duration_seconds: Audio duration reported by Telegram.
        language: Language code reported by the transcriber.
        timestamp: ISO-8601 timestamp of the original message.

    Returns:
        A plain-dict signal of type "voice_transcript".
    """
    preview = transcript[:200]
    header = f"[Voice {format_duration(duration_seconds)}] @{sender}"

    signal = {
        "id": str(uuid.uuid4()),
        "type": "voice_transcript",
        "summary": f"{header}: {preview}",
        "raw_quote": transcript,
    }
    # Fixed classification fields: a raw transcript carries no judgement.
    signal.update(
        severity="low",
        status="transcribed",
        sentiment="neutral",
        urgency="none",
    )
    # Attribution and search metadata.
    signal["entities"] = [f"@{sender}"]
    signal["keywords"] = _extract_voice_keywords(transcript)
    signal["timestamp"] = timestamp
    signal["group_id"] = group_id
    signal["lens"] = "voice"
    signal["source"] = "voice"
    signal["voice_file_id"] = voice_file_id
    signal["voice_duration"] = duration_seconds
    signal["voice_language"] = language
    signal["speaker"] = sender
    return signal
|
||||
|
||||
|
||||
def _extract_voice_keywords(text: str) -> list[str]:
|
||||
"""Simple keyword extraction from transcript text."""
|
||||
stopwords = {
|
||||
"the", "a", "an", "is", "are", "was", "were", "will", "to", "of",
|
||||
"in", "on", "at", "for", "by", "with", "this", "that", "and", "or",
|
||||
"but", "we", "i", "it", "be", "do", "have", "has", "had", "not",
|
||||
"so", "just", "like", "yeah", "okay", "um", "uh", "you", "me",
|
||||
}
|
||||
words = text.lower().split()
|
||||
keywords = [w.strip(".,!?;:\"'") for w in words if len(w) > 3 and w not in stopwords]
|
||||
return list(dict.fromkeys(keywords))[:12]
|
||||
|
||||
|
||||
def _inject_voice_metadata(signals: list, voice_meta: dict) -> list[dict]:
|
||||
"""
|
||||
Inject voice attribution into every signal extracted from a voice transcript.
|
||||
Accepts both Signal Pydantic model objects and plain dicts.
|
||||
This ensures /ask can cite the voice source in its answers.
|
||||
"""
|
||||
result = []
|
||||
for signal in signals:
|
||||
sig = signal.model_dump() if hasattr(signal, "model_dump") else dict(signal)
|
||||
sig["source"] = "voice"
|
||||
sig["voice_file_id"] = voice_meta.get("voice_file_id", "")
|
||||
sig["voice_duration"] = voice_meta.get("duration_seconds", 0)
|
||||
sig["voice_language"] = voice_meta.get("language", "")
|
||||
sig["speaker"] = voice_meta.get("sender", "Unknown")
|
||||
if "[Voice]" not in sig.get("summary", ""):
|
||||
sig["summary"] = f"[Voice @{voice_meta.get('sender', '?')}] {sig['summary']}"
|
||||
result.append(sig)
|
||||
return result
|
||||
|
||||
|
||||
# --- Fallback signal builder -------------------------------------------------
|
||||
|
||||
# Keywords that hint at a signal type when the LLM extraction returns nothing.
# Each signal type maps to a set of lowercase trigger words; the fallback
# builder scores a transcript by word-set overlap with each hint set and
# picks the highest-scoring type (see _build_fallback_signal).
_FALLBACK_TYPE_HINTS: dict[str, set[str]] = {
    "feature_request": {
        "need", "needs", "required", "require", "want", "should", "missing",
        "add", "feature", "ui", "ux", "design", "change", "changes", "update",
        "improve", "improvement", "responsiveness", "responsive",
    },
    "blocker": {
        "blocked", "blocking", "blocker", "stuck", "waiting", "can't", "cannot",
        "issue", "problem", "broken", "fails", "failing",
    },
    "action_item": {
        "will", "going", "plan", "todo", "do", "fix", "implement", "setup",
        "create", "build", "deploy", "check",
    },
    "risk": {
        "risk", "risky", "concern", "worried", "urgent", "urgently", "critical",
        "deadline", "delay", "late",
    },
}
|
||||
|
||||
|
||||
def _build_fallback_signal(
    transcript: str,
    sender: str,
    group_id: str,
    timestamp: str,
    voice_meta: dict,
) -> dict:
    """
    Build a best-effort structured signal when the LLM extracted nothing.

    The signal type whose hint set overlaps the transcript's words the most
    wins; with no overlap at all, 'feature_request' is the safe default.
    Urgent-sounding words raise severity and urgency to high.

    Args:
        transcript: Full transcription text.
        sender: Display name of the sender.
        group_id: Chat/group identifier.
        timestamp: ISO-8601 timestamp of the original message.
        voice_meta: Dict with voice_file_id, duration_seconds, language.

    Returns:
        A plain-dict signal of the best-guess type.
    """
    spoken = set(transcript.lower().split())

    # Score each candidate type by hint-word overlap.
    overlap = {kind: len(spoken & hints) for kind, hints in _FALLBACK_TYPE_HINTS.items()}
    if any(overlap.values()):
        best_type = max(overlap, key=overlap.get)
    else:
        best_type = "feature_request"

    urgency_words = {"urgent", "urgently", "asap", "immediately", "critical", "now"}
    is_urgent = bool(spoken & urgency_words)
    severity = "high" if is_urgent else "medium"

    # Summary is a trimmed 200-char preview, with an ellipsis when truncated.
    summary = transcript[:200].strip()
    if len(transcript) > 200:
        summary = summary + "..."

    return {
        "id": str(uuid.uuid4()),
        "type": best_type,
        "summary": f"[Voice @{sender}] {summary}",
        "raw_quote": transcript[:500],
        "severity": severity,
        "status": "unresolved",
        "sentiment": "neutral",
        "urgency": "high" if is_urgent else "medium",
        "entities": [f"@{sender}"],
        "keywords": _extract_voice_keywords(transcript),
        "timestamp": timestamp,
        "group_id": group_id,
        "lens": "voice",
        "source": "voice",
        "speaker": sender,
        "voice_file_id": voice_meta.get("voice_file_id", ""),
        "voice_duration": voice_meta.get("duration_seconds", 0),
        "voice_language": voice_meta.get("language", ""),
    }
|
||||
|
||||
|
||||
# --- Main handler ------------------------------------------------------------
|
||||
|
||||
async def handle_voice_message(
    bot,
    group_id: str,
    sender: str,
    file_id: str,
    duration_seconds: int,
    message_date,
    is_video_note: bool = False,
) -> dict:
    """
    Full pipeline for a single voice or video note message.

    Steps:
        1. Download audio bytes from Telegram.
        2. Transcribe via Groq Whisper (voice_transcriber.py).
        3. Optionally store the raw transcript as a voice_transcript signal.
        4. Run the transcript through process_message_batch and tag every
           extracted signal with voice attribution metadata; if extraction
           yields nothing for a meaningful transcript, store a fallback
           signal so the content is still searchable as structured data.

    Args:
        bot: Telegram bot instance used to fetch the file.
        group_id: Chat/group identifier the message came from.
        sender: Display name of the sender.
        file_id: Telegram file ID of the voice/video-note payload.
        duration_seconds: Duration reported by Telegram, in seconds.
        message_date: Message datetime, or None to fall back to "now".
        is_video_note: True for video notes (mp4) vs. voice messages (ogg).

    Returns:
        {"ok": True, "transcript": "...", "signals_extracted": 3, "duration": 45, ...}
        OR {"ok": False, "reason": "...", "error": "..."}
    """
    if not ENABLE_VOICE_TRANSCRIPTION:
        return {"ok": False, "reason": "disabled", "error": "Voice transcription is disabled"}

    msg_type = "video note" if is_video_note else "voice message"
    logger.info(f"Processing {msg_type} from {sender} in {group_id} ({duration_seconds}s)")

    # 1. Download audio
    try:
        audio_bytes = await download_telegram_audio(bot, file_id)
    except Exception as e:
        logger.error(f"Failed to download audio from {sender}: {e}")
        return {"ok": False, "reason": "download_failed", "error": str(e)}

    # 2. Transcribe — video notes arrive as mp4 containers, voice as ogg/opus.
    filename = "audio.mp4" if is_video_note else "audio.ogg"
    transcription = await transcribe_audio(
        audio_bytes,
        filename=filename,
        duration_seconds=duration_seconds,
    )

    if not transcription["ok"]:
        logger.info(f"Transcription skipped for {sender}: {transcription['reason']}")
        return {"ok": False, "reason": transcription["reason"], "error": transcription.get("error", "")}

    transcript = transcription["transcript"]
    language = transcription.get("language", "unknown")
    # NOTE(review): message_date is assumed to be naive UTC — .replace() only
    # labels it, it does not convert; confirm against the Telegram library.
    # Fix: the fallback previously used the deprecated, naive
    # datetime.utcnow(), producing a naive ISO string while the message_date
    # branch produced an aware one. Both branches are now timezone-aware.
    timestamp = (
        message_date.replace(tzinfo=timezone.utc).isoformat()
        if message_date else datetime.now(timezone.utc).isoformat()
    )

    # 3. Store raw voice transcript signal so the full text is searchable
    #    even when structured extraction returns nothing.
    if VOICE_STORE_TRANSCRIPT:
        transcript_signal = build_voice_transcript_signal(
            transcript=transcript,
            sender=sender,
            group_id=group_id,
            voice_file_id=file_id,
            duration_seconds=duration_seconds,
            language=language,
            timestamp=timestamp,
        )
        store_signals(group_id, [transcript_signal])
        logger.info(f"Voice transcript stored for {sender} ({len(transcript)} chars)")

    # 4. Run through signal extraction pipeline — treat as a regular text message
    voice_meta = {
        "sender": sender,
        "voice_file_id": file_id,
        "duration_seconds": duration_seconds,
        "language": language,
    }

    messages = [{
        "sender": sender,
        "text": transcript,
        "timestamp": timestamp,
        "source": "voice",
        "voice_file_id": file_id,
        "voice_duration": duration_seconds,
    }]

    try:
        extracted_signals = await process_message_batch(group_id, messages)
        extracted_signals = _inject_voice_metadata(extracted_signals, voice_meta)
        signals_count = len(extracted_signals)

        # Fallback: if the LLM extracted nothing from a meaningful voice message
        # (>= 5 words), create a generic signal so the content is still
        # searchable as structured data.
        if signals_count == 0 and len(transcript.split()) >= 5:
            fallback = _build_fallback_signal(transcript, sender, group_id, timestamp, voice_meta)
            store_signals(group_id, [fallback])
            signals_count = 1
            logger.info(f"Voice fallback signal created for {sender} (0 from LLM)")
    except Exception as e:
        # Best-effort: extraction failure must not lose the transcript result.
        logger.error(f"Signal extraction failed for voice from {sender}: {e}")
        signals_count = 0

    logger.info(
        f"Voice pipeline complete: {sender}, {duration_seconds}s, "
        f"{signals_count} signals, transcript={len(transcript)} chars"
    )

    return {
        "ok": True,
        "transcript": transcript,
        "signals_extracted": signals_count,
        "duration": duration_seconds,
        "sender": f"@{sender}",
        "language": language,
    }
|
||||
Reference in New Issue
Block a user