"""Multimodal LangGraph agent — accepts image + document (PDF) attachments. Wave 2b design: a *dedicated* vision-capable graph scoped to the `/demos/multimodal` cell. Other demos continue to use their own (cheaper, text-only) models — this keeps vision cost isolated to the one demo that exercises it. Wire format the agent sees ========================== Attachments arrive here after travelling through: CopilotChat → AG-UI message content parts → @ag-ui/langgraph runtime (ag-ui → LangChain converter) → this agent (LangChain HumanMessage content parts) The ag-ui-langgraph converter only understands the legacy ``{ type: "binary", mimeType, data | url }`` AG-UI part shape — the page at ``src/app/demos/multimodal/page.tsx`` installs an ``onRunInitialized`` shim that rewrites the modern ``{ type: "image" | "document", source: {...} }`` shape CopilotChat emits to the legacy shape before it hits the runtime. Once the converter has run, every attachment shows up in this agent as a LangChain ``image_url`` content part:: {"type": "image_url", "image_url": {"url": "data:;base64,"}} regardless of whether the upstream modality was ``image`` or ``document``. We therefore route on ``mimeType``, not the part ``type``: ``image/*`` parts are forwarded to GPT-4o unchanged (vision-native); ``application/pdf`` parts are flattened to inline text via ``pypdf`` so the model can read them without needing file-part support. References: - src/agents/main.py, src/agents/agentic_chat.py (baseline pattern) - packages/runtime/src/agent/converters/tanstack.ts (the modern content- part shape — useful context when the runtime gets upgraded and this agent can drop the pypdf flatten) """ from __future__ import annotations import base64 import io from typing import Any from copilotkit import CopilotKitMiddleware from langchain.agents import create_agent from langchain.agents.middleware import AgentMiddleware from langchain_core.messages import HumanMessage from langchain_openai import ChatOpenAI from pypdf import PdfReader SYSTEM_PROMPT = ( "You are a helpful assistant. The user may attach images or documents " "(PDFs). When they do, analyze the attachment carefully and answer the " "user's question. If no attachment is present, answer the text question " "normally. Keep responses concise (1-3 sentences) unless asked to go deep." ) def _extract_data_url_parts(url: str) -> tuple[str, str]: """Split a ``data:;base64,`` URL into (mime, base64-payload). Returns ("", url) if the input is not a base64 data URL — callers can fall back to treating the url as a fetchable reference. """ if not url.startswith("data:"): return "", url header, _, payload = url.partition(",") # Header looks like "data:application/pdf;base64" — take the piece # between the colon and the first semicolon. if ":" not in header: return "", payload meta = header.split(":", 1)[1] mime = meta.split(";", 1)[0] if ";" in meta else meta return mime, payload def _extract_pdf_text(b64: str) -> str: """Decode an inline-base64 PDF and extract its text. Returns "" on any failure so one malformed attachment doesn't tank the user turn — callers must treat the extracted text as best-effort.""" try: raw = base64.b64decode(b64, validate=False) reader = PdfReader(io.BytesIO(raw)) pages = [page.extract_text() or "" for page in reader.pages] return "\n\n".join(pages).strip() except Exception as exc: # pragma: no cover - defensive # One log line so a malformed attachment stays triageable in # Railway logs without restoring the per-stage noise the # cleanup removed. print(f"[multimodal_agent] PDF extract failed: {exc!r}") return "" def _classify_attachment_part(part: Any) -> tuple[str, str, str] | None: """Inspect a content part and return (kind, mime, base64_payload). ``kind`` is one of ``"image"``, ``"pdf"``, ``"other"``. Returns ``None`` if the part is not an attachment we recognise (plain text, unrelated dict, string, etc.). Handles the shapes we actually see in practice: - ``{"type": "image_url", "image_url": {"url": "data:..."}}`` (what the ag-ui-langgraph converter emits for every attachment after the page rewrites to legacy ``binary``). - ``{"type": "image_url", "image_url": "data:..."}`` (older LangChain/OpenAI shape where ``image_url`` is a raw string). - ``{"type": "document", "source": {"type": "data", "value": "", "mimeType": "application/pdf"}}`` (modern AG-UI shape — preserved for forward-compat if the runtime ever starts forwarding modern parts directly). """ if not isinstance(part, dict): return None part_type = part.get("type") if part_type == "image_url": image_url = part.get("image_url") url: str | None = None if isinstance(image_url, str): url = image_url elif isinstance(image_url, dict): raw_url = image_url.get("url") if isinstance(raw_url, str): url = raw_url if not url: return None mime, payload = _extract_data_url_parts(url) if not payload or not mime: return None if mime.startswith("image/"): return ("image", mime, payload) if "pdf" in mime.lower(): return ("pdf", mime, payload) return ("other", mime, payload) if part_type == "document": source = part.get("source") if not isinstance(source, dict) or source.get("type") != "data": return None value = source.get("value") mime = source.get("mimeType", "") if not isinstance(value, str) or not isinstance(mime, str): return None if "pdf" in mime.lower(): return ("pdf", mime, value) return ("other", mime, value) return None def _preprocess_part(part: Any) -> Any: """Flatten PDF attachments to text; pass everything else through. Images stay as-is so GPT-4o consumes them natively via its vision adapter. PDFs (which gpt-4o cannot read directly) become a text part prefixed with ``[Attached document]`` and the extracted body. If extraction fails we emit a structured placeholder so the model can tell the user the document was unreadable instead of pretending no attachment was sent. """ classified = _classify_attachment_part(part) if classified is None: return part kind, _mime, payload = classified if kind != "pdf": return part text = _extract_pdf_text(payload) if not text: return { "type": "text", "text": "[Attached document: PDF could not be read.]", } return {"type": "text", "text": f"[Attached document]\n{text}"} def _rewrite_messages(messages: list[Any]) -> list[Any]: """Rewrite user messages so non-image attachments become text parts. Operates on the messages list stored in agent state. Returns a *new* list; the input list is not mutated. """ rewritten: list[Any] = [] for message in messages: # Only touch HumanMessage — assistant/tool messages stay as-is. if not isinstance(message, HumanMessage): rewritten.append(message) continue content = message.content if not isinstance(content, list): rewritten.append(message) continue new_parts = [_preprocess_part(part) for part in content] rewritten.append(HumanMessage(content=new_parts, id=message.id)) return rewritten class _PdfFlattenMiddleware(AgentMiddleware): """Flatten PDF content parts to text for the model call only. Uses ``wrap_model_call`` instead of ``before_model`` so the PDF→text rewrite is scoped to the outgoing model request and never persists back into agent state. This matters because the agent state is streamed verbatim to the chat UI: if we mutated state with a ``{"type": "text", "text": "[Attached document]\\n"}`` part, the chat would render that flattened text inline in the user message bubble (in addition to the PDF chip preview the modern ``document`` part already drives), turning a clean attachment chip into a wall of raw PDF text. With ``wrap_model_call`` we copy the request, rewrite messages on the copy, hand the copy to the model, and return the model's response unchanged. The handler closure keeps state untouched. """ def wrap_model_call(self, request, handler): # type: ignore[override] messages = list(request.messages) if request.messages else [] rewritten = _rewrite_messages(messages) if rewritten == messages: return handler(request) return handler(request.override(messages=rewritten)) async def awrap_model_call(self, request, handler): # type: ignore[override] messages = list(request.messages) if request.messages else [] rewritten = _rewrite_messages(messages) if rewritten == messages: return await handler(request) return await handler(request.override(messages=rewritten)) # Vision-capable model. gpt-4o consumes `image_url` content parts natively. _MODEL = ChatOpenAI(model="gpt-5.4", temperature=0.2) graph = create_agent( model=_MODEL, tools=[], middleware=[_PdfFlattenMiddleware(), CopilotKitMiddleware()], system_prompt=SYSTEM_PROMPT, ) # Re-export under both names — `graph` matches the langgraph.json convention # used by the rest of the package; `multimodal_agent` is a friendlier alias # for any future non-langgraph.json import paths. multimodal_agent = graph __all__ = ["graph", "multimodal_agent"]