diff --git a/agent/component/docs_generator.py b/agent/component/docs_generator.py index f73ff441aa..297fac9b1e 100644 --- a/agent/component/docs_generator.py +++ b/agent/component/docs_generator.py @@ -13,6 +13,7 @@ from xml.sax.saxutils import escape from agent.component.base import ComponentParamBase from api.utils.api_utils import timeout +from api.utils.file_response import agent_attachment_preview_path from common import settings from common.misc_utils import get_uuid from .message import Message @@ -136,6 +137,7 @@ class DocGenerator(Message, ABC): "mime_type": mime_type, "size": file_size, "base64": file_base64, + "preview_url": agent_attachment_preview_path(doc_id, ext=output_format, mime_type=mime_type), "include_download_info_in_content": self._param.include_download_info_in_content, } self.set_output("doc_id", doc_id) diff --git a/api/apps/restful_apis/agent_api.py b/api/apps/restful_apis/agent_api.py index bf8840c26f..06e8041a46 100644 --- a/api/apps/restful_apis/agent_api.py +++ b/api/apps/restful_apis/agent_api.py @@ -27,7 +27,11 @@ import time from functools import partial, wraps from typing import Set -from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers +from api.utils.file_response import ( + apply_download_file_response_headers, + apply_preview_file_response_headers, + resolve_attachment_content_type, +) import jwt from quart import Response, jsonify, request, make_response @@ -2463,37 +2467,51 @@ async def webhook_trace(agent_id: str): } ) +def _attachment_request_metadata(): + ext = request.args.get("ext") + mime_type = request.args.get("mime_type") + filename = request.args.get("filename") + content_type, resolved_ext = resolve_attachment_content_type(ext, mime_type) + if not content_type: + content_type = "application/octet-stream" + if not resolved_ext: + resolved_ext = ext.lower().strip(".") if isinstance(ext, str) and ext else "bin" + return content_type, resolved_ext, filename + + +async def _stream_agent_attachment(tenant_id, attachment_id, *, inline: bool): + attachment_id = attachment_id or request.view_args.get("attachment_id") + content_type, ext, filename = _attachment_request_metadata() + data = await thread_pool_exec(settings.STORAGE_IMPL.get, tenant_id, attachment_id) + if not data: + return get_data_error_result(message="Document not found!") + response = await make_response(data) + if inline: + apply_preview_file_response_headers(response, content_type, ext, filename) + else: + apply_download_file_response_headers(response, content_type, ext, filename) + return response + + +@manager.route("/agents/attachments//preview", methods=["GET"]) # noqa: F821 +@login_required +@add_tenant_id_to_kwargs +async def preview_attachment(tenant_id=None, attachment_id=None): + """Stream an agent-generated attachment for inline preview in MCP clients.""" + try: + return await _stream_agent_attachment(tenant_id, attachment_id, inline=True) + except Exception as e: + return server_error_response(e) + @manager.route("/agents/attachments//download", methods=["GET"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def download_attachment(tenant_id=None, attachment_id=None): - """Stream a document's underlying file to the requesting user. - - Mirrors the authorization model of the preview endpoint: the user must belong - to the tenant that owns the document's knowledge base. A denial returns the - same "Attachment not found!" response so the endpoint cannot be used to - enumerate doc ids across tenants. - """ + """Stream an agent-generated attachment as a download.""" try: - # Keep backward compatibility with older callers and unit tests that still - # pass `attachment_id` instead of the route parameter name. - ext = request.args.get("ext", "markdown") - data = await thread_pool_exec(settings.STORAGE_IMPL.get, tenant_id, attachment_id) - if not data: - # Storage object missing or empty (orphaned DB metadata, tenant - # mismatch). Without this guard `make_response(None)` raises - # `TypeError: response value cannot be None` and the caller - # sees HTTP 500 — same bug class as #15365 on document - # preview. Return the same "Attachment not found!" shape used - # by the preview route's missing-record path so byte-streaming - # endpoints respond consistently on a not-found. - return get_data_error_result(message="Attachment not found!") - response = await make_response(data) - content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}") - apply_safe_file_response_headers(response, content_type, ext) - - return response - + if request.args.get("disposition", "").lower() == "inline": + return await _stream_agent_attachment(tenant_id, attachment_id, inline=True) + return await _stream_agent_attachment(tenant_id, attachment_id, inline=False) except Exception as e: return server_error_response(e) diff --git a/api/apps/restful_apis/document_api.py b/api/apps/restful_apis/document_api.py index 541ce2850a..37eeb905b9 100644 --- a/api/apps/restful_apis/document_api.py +++ b/api/apps/restful_apis/document_api.py @@ -72,6 +72,7 @@ from common.constants import ParserType, RetCode, TaskStatus, SANDBOX_ARTIFACT_B from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema from common.misc_utils import get_uuid, thread_pool_exec from api.utils.file_utils import filename_type, thumbnail +from api.utils.file_response import apply_preview_file_response_headers from api.utils.web_utils import CONTENT_TYPE_MAP, html2pdf, is_valid_url, apply_safe_file_response_headers from common.ssrf_guard import assert_url_is_safe from rag.nlp import search @@ -2088,7 +2089,7 @@ async def get(doc_id): if ext: fallback_prefix = "image" if doc.type == FileType.VISUAL.value else "application" content_type = CONTENT_TYPE_MAP.get(ext, f"{fallback_prefix}/{ext}") - apply_safe_file_response_headers(response, content_type, ext) + apply_preview_file_response_headers(response, content_type, ext, doc.name) return response except Exception as e: return server_error_response(e) diff --git a/api/utils/file_response.py b/api/utils/file_response.py new file mode 100644 index 0000000000..f9f4751b55 --- /dev/null +++ b/api/utils/file_response.py @@ -0,0 +1,147 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# + +import re +from urllib.parse import urlencode + +CONTENT_TYPE_MAP = { + "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "doc": "application/msword", + "pdf": "application/pdf", + "csv": "text/csv", + "xls": "application/vnd.ms-excel", + "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "txt": "text/plain", + "py": "text/plain", + "js": "text/plain", + "java": "text/plain", + "c": "text/plain", + "cpp": "text/plain", + "h": "text/plain", + "php": "text/plain", + "go": "text/plain", + "ts": "text/plain", + "sh": "text/plain", + "cs": "text/plain", + "kt": "text/plain", + "sql": "text/plain", + "md": "text/markdown", + "markdown": "text/markdown", + "mdx": "text/markdown", + "htm": "text/html", + "html": "text/html", + "json": "application/json", + "png": "image/png", + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "gif": "image/gif", + "bmp": "image/bmp", + "tiff": "image/tiff", + "tif": "image/tiff", + "webp": "image/webp", + "svg": "image/svg+xml", + "ico": "image/x-icon", + "avif": "image/avif", + "heic": "image/heic", + "ppt": "application/vnd.ms-powerpoint", + "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", +} + +FORCE_ATTACHMENT_EXTENSIONS = { + "htm", + "html", + "shtml", + "xht", + "xhtml", + "xml", + "mhtml", + "svg", +} + +FORCE_ATTACHMENT_CONTENT_TYPES = { + "text/html", + "image/svg+xml", + "application/xhtml+xml", + "text/xml", + "application/xml", + "multipart/related", +} + + +def should_force_attachment(ext: str | None, content_type: str | None = None) -> bool: + normalized_ext = (ext or "").lower().strip(".") + if normalized_ext in FORCE_ATTACHMENT_EXTENSIONS: + return True + normalized_type = (content_type or "").lower().split(";")[0].strip() + return normalized_type in FORCE_ATTACHMENT_CONTENT_TYPES + + +def sanitize_content_disposition_filename(filename: str | None) -> str | None: + if not filename: + return None + base = re.sub(r"[^\w.\-]", "_", str(filename).split("/")[-1].split("\\")[-1]) + return base or None + + +def resolve_attachment_content_type(ext: str | None = None, mime_type: str | None = None) -> tuple[str | None, str | None]: + if mime_type: + normalized_type = mime_type.lower().split(";")[0].strip() + for known_ext, known_type in CONTENT_TYPE_MAP.items(): + if known_type == normalized_type: + return normalized_type, known_ext + return normalized_type, (ext or "").lower().strip(".") or None + if ext: + normalized_ext = ext.lower().strip(".") + return CONTENT_TYPE_MAP.get(normalized_ext, f"application/{normalized_ext}"), normalized_ext + return None, None + + +def apply_preview_file_response_headers( + response, + content_type: str | None, + ext: str | None = None, + filename: str | None = None, +): + if content_type: + response.headers.set("Content-Type", content_type) + if should_force_attachment(ext, content_type): + response.headers.set("X-Content-Type-Options", "nosniff") + response.headers.set("Content-Disposition", "attachment") + return response + safe_filename = sanitize_content_disposition_filename(filename) + if safe_filename: + response.headers.set("Content-Disposition", f'inline; filename="{safe_filename}"') + else: + response.headers.set("Content-Disposition", "inline") + return response + + +def apply_download_file_response_headers( + response, + content_type: str | None, + ext: str | None = None, + filename: str | None = None, +): + if content_type: + response.headers.set("Content-Type", content_type) + if should_force_attachment(ext, content_type): + response.headers.set("X-Content-Type-Options", "nosniff") + response.headers.set("Content-Disposition", "attachment") + return response + safe_filename = sanitize_content_disposition_filename(filename) + if safe_filename: + response.headers.set("Content-Disposition", f'attachment; filename="{safe_filename}"') + else: + response.headers.set("Content-Disposition", "attachment") + return response + + +def agent_attachment_preview_path(attachment_id: str, *, ext: str | None = None, mime_type: str | None = None) -> str: + query: dict[str, str] = {} + if ext: + query["ext"] = ext + if mime_type: + query["mime_type"] = mime_type + suffix = f"?{urlencode(query)}" if query else "" + return f"/api/v1/agents/attachments/{attachment_id}/preview{suffix}" diff --git a/api/utils/web_utils.py b/api/utils/web_utils.py index e7c1b48f51..d561e944d1 100644 --- a/api/utils/web_utils.py +++ b/api/utils/web_utils.py @@ -40,83 +40,17 @@ ATTEMPT_LOCK_SECONDS = 30 * 60 # lock for 30 minutes RESEND_COOLDOWN_SECONDS = 60 # cooldown for 1 minute -CONTENT_TYPE_MAP = { - # Office - "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "doc": "application/msword", - "pdf": "application/pdf", - "csv": "text/csv", - "xls": "application/vnd.ms-excel", - "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - # Text/code - "txt": "text/plain", - "py": "text/plain", - "js": "text/plain", - "java": "text/plain", - "c": "text/plain", - "cpp": "text/plain", - "h": "text/plain", - "php": "text/plain", - "go": "text/plain", - "ts": "text/plain", - "sh": "text/plain", - "cs": "text/plain", - "kt": "text/plain", - "sql": "text/plain", - # Web - "md": "text/markdown", - "markdown": "text/markdown", - "mdx": "text/markdown", - "htm": "text/html", - "html": "text/html", - "json": "application/json", - # Image formats - "png": "image/png", - "jpg": "image/jpeg", - "jpeg": "image/jpeg", - "gif": "image/gif", - "bmp": "image/bmp", - "tiff": "image/tiff", - "tif": "image/tiff", - "webp": "image/webp", - "svg": "image/svg+xml", - "ico": "image/x-icon", - "avif": "image/avif", - "heic": "image/heic", - # PPTX - "ppt": "application/vnd.ms-powerpoint", - "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", -} - - -FORCE_ATTACHMENT_EXTENSIONS = { - "htm", - "html", - "shtml", - "xht", - "xhtml", - "xml", - "mhtml", - "svg", -} - - -FORCE_ATTACHMENT_CONTENT_TYPES = { - "text/html", - "image/svg+xml", - "application/xhtml+xml", - "text/xml", - "application/xml", - "multipart/related", -} - - -def should_force_attachment(ext: str | None, content_type: str | None = None) -> bool: - normalized_ext = (ext or "").lower().strip(".") - if normalized_ext in FORCE_ATTACHMENT_EXTENSIONS: - return True - normalized_type = (content_type or "").lower() - return normalized_type in FORCE_ATTACHMENT_CONTENT_TYPES +from api.utils.file_response import ( # noqa: F401 + CONTENT_TYPE_MAP, + FORCE_ATTACHMENT_CONTENT_TYPES, + FORCE_ATTACHMENT_EXTENSIONS, + agent_attachment_preview_path, + apply_download_file_response_headers, + apply_preview_file_response_headers, + resolve_attachment_content_type, + sanitize_content_disposition_filename, + should_force_attachment, +) def apply_safe_file_response_headers(response, content_type: str | None, ext: str | None = None): diff --git a/test/unit_test/api/apps/restful_apis/test_attachment_download_missing_blob.py b/test/unit_test/api/apps/restful_apis/test_attachment_download_missing_blob.py index 2bcdd8840c..38d941553f 100644 --- a/test/unit_test/api/apps/restful_apis/test_attachment_download_missing_blob.py +++ b/test/unit_test/api/apps/restful_apis/test_attachment_download_missing_blob.py @@ -47,6 +47,11 @@ class _LenientModule(ModuleType): return lambda *_a, **_k: None +class _Headers(dict): + def set(self, key, value): + self[key] = value + + def _stub(monkeypatch, name, **attrs): mod = _LenientModule(name) for key, value in attrs.items(): @@ -61,7 +66,7 @@ def _load_agent_api(monkeypatch, *, storage_get): async def _make_response(payload): if payload is None: raise TypeError("response value cannot be None") - return SimpleNamespace(payload=payload, headers={}) + return SimpleNamespace(payload=payload, headers=_Headers()) _stub( monkeypatch, diff --git a/test/unit_test/api/utils/test_file_response_headers.py b/test/unit_test/api/utils/test_file_response_headers.py new file mode 100644 index 0000000000..ab0a821a5c --- /dev/null +++ b/test/unit_test/api/utils/test_file_response_headers.py @@ -0,0 +1,64 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# + +from urllib.parse import urlencode + +import pytest + +from api.utils import file_response as module + + +class _DummyHeaders(dict): + def set(self, key, value): + self[key] = value + + +class _DummyResponse: + def __init__(self): + self.headers = _DummyHeaders() + + +@pytest.mark.p2 +def test_apply_preview_sets_inline_for_pdf(): + response = _DummyResponse() + module.apply_preview_file_response_headers(response, "application/pdf", "pdf", "report.pdf") + assert response.headers["Content-Type"] == "application/pdf" + assert response.headers["Content-Disposition"] == 'inline; filename="report.pdf"' + + +@pytest.mark.p2 +def test_apply_preview_forces_attachment_for_html(): + response = _DummyResponse() + module.apply_preview_file_response_headers(response, "text/html", "html", "page.html") + assert response.headers["Content-Disposition"] == "attachment" + assert response.headers["X-Content-Type-Options"] == "nosniff" + + +@pytest.mark.p2 +def test_apply_download_sets_attachment_for_pdf(): + response = _DummyResponse() + module.apply_download_file_response_headers(response, "application/pdf", "pdf", "report.pdf") + assert response.headers["Content-Disposition"] == 'attachment; filename="report.pdf"' + + +@pytest.mark.p2 +def test_resolve_attachment_content_type_prefers_mime_type(): + content_type, ext = module.resolve_attachment_content_type(ext="md", mime_type="application/pdf") + assert content_type == "application/pdf" + assert ext == "pdf" + + +@pytest.mark.p2 +def test_agent_attachment_preview_path_includes_query(): + path = module.agent_attachment_preview_path("doc-1", ext="pdf", mime_type="application/pdf") + query = urlencode({"ext": "pdf", "mime_type": "application/pdf"}) + assert path == f"/api/v1/agents/attachments/doc-1/preview?{query}" + + +@pytest.mark.p2 +def test_agent_attachment_preview_path_encodes_svg_mime_type(): + path = module.agent_attachment_preview_path("doc-1", ext="svg", mime_type="image/svg+xml") + query = urlencode({"ext": "svg", "mime_type": "image/svg+xml"}) + assert path == f"/api/v1/agents/attachments/doc-1/preview?{query}" + assert "%2B" in path or "svg%2Bxml" in path diff --git a/web/src/utils/api.ts b/web/src/utils/api.ts index 806090d277..8344d19bfd 100644 --- a/web/src/utils/api.ts +++ b/web/src/utils/api.ts @@ -280,6 +280,24 @@ export default { cancelDataflow: (id: string) => `${restAPIv1}/tasks/${id}/cancel`, getAttachmentFileDownload: (docId: string) => `${restAPIv1}/agents/attachments/${docId}/download`, + getAttachmentFilePreview: ({ + docId, + ext, + mimeType, + filename, + }: { + docId: string; + ext?: string; + mimeType?: string; + filename?: string; + }) => { + const params = new URLSearchParams(); + if (ext) params.set('ext', ext); + if (mimeType) params.set('mime_type', mimeType); + if (filename) params.set('filename', filename); + const query = params.toString(); + return `${restAPIv1}/agents/attachments/${docId}/preview${query ? `?${query}` : ''}`; + }, downloadFile: `${restAPIv1}/agents/download`, testWebhook: (id: string) => `${restAPIv1}/agents/${id}/webhook/test`, fetchWebhookTrace: (id: string) => `${restAPIv1}/agents/${id}/webhook/logs`,