diff --git a/api/apps/restful_apis/document_api.py b/api/apps/restful_apis/document_api.py index d1aca22ae0..cc4786cad6 100644 --- a/api/apps/restful_apis/document_api.py +++ b/api/apps/restful_apis/document_api.py @@ -1641,6 +1641,25 @@ async def stop_parse_documents(tenant_id, dataset_id): return get_error_data_result(message="Internal server error") +def _parse_document_image_id(image_id: str) -> tuple[str, str] | None: + """Split a composite document image ID into storage bucket and object key. + + Thumbnail URLs use ``{dataset_id}-{thumbnail}``. Only the first hyphen + separates the dataset/kb id (bucket) from the object key, which may + contain additional hyphens (e.g. ``page-1.png``); the bucket itself cannot. + + Args: + image_id: Path segment from ``GET /documents/images/``. + + Returns: + ``(bucket, object_key)``, or ``None`` if the hyphen or either part is missing. + """ + parts = image_id.split("-", 1) + if len(parts) != 2 or not parts[0] or not parts[1]: + return None + return parts[0], parts[1] + + def _detect_image_content_type_from_bytes(data): if data.startswith(b"\x89PNG\r\n\x1a\n"): return "image/png" @@ -1680,7 +1699,7 @@ async def get_document_image(image_id): required: true schema: type: string - description: The image ID (format: bucket-name-image-name) + description: Composite ID ``{dataset_id}-{thumbnail_object_key}`` (split on first hyphen only) responses: 200: description: Image file @@ -1691,10 +1710,10 @@ async def get_document_image(image_id): format: binary """ try: - arr = image_id.split("-") - if len(arr) != 2: + parsed = _parse_document_image_id(image_id) + if not parsed: return get_data_error_result(message="Image not found.") - bkt, nm = image_id.split("-") + bkt, nm = parsed data = await thread_pool_exec(settings.STORAGE_IMPL.get, bkt, nm) if not data: return get_data_error_result(message="Image not found.") diff --git a/test/testcases/test_web_api/test_document_app/test_document_metadata.py b/test/testcases/test_web_api/test_document_app/test_document_metadata.py index 
e7e196ff00..be852e514b 100644 --- a/test/testcases/test_web_api/test_document_app/test_document_metadata.py +++ b/test/testcases/test_web_api/test_document_app/test_document_metadata.py @@ -563,6 +563,50 @@ class TestDocumentMetadataUnit: assert res["code"] == 500 assert "image boom" in res["message"] + def test_get_document_image_hyphenated_object_key(self, document_app_module, monkeypatch): + """Split on the first hyphen only, keep the MIME type, and reject an ID that has no hyphen.""" + module = document_app_module + + class _Headers(dict): + def set(self, key, value): + self[key] = value + + class _ImageResponse: + def __init__(self, data): + self.data = data + self.headers = _Headers() + + storage_calls = [] + + def _storage_get(bkt, nm): + storage_calls.append((bkt, nm)) + return b"png-bytes" + + async def fake_thread_pool_exec(fn, *args, **kwargs): + return fn(*args, **kwargs) + + async def fake_make_response(data): + return _ImageResponse(data) + + monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec) + monkeypatch.setattr(module, "make_response", fake_make_response) + monkeypatch.setattr( + module.settings, + "STORAGE_IMPL", + SimpleNamespace(get=_storage_get), + ) + + image_id = "kb12345678901234567890123456789012-page-1.png" + res = _run(module.get_document_image(image_id)) + assert isinstance(res, _ImageResponse) + assert storage_calls == [("kb12345678901234567890123456789012", "page-1.png")] + assert res.headers["Content-Type"] == "image/png" + + res = _run(module.get_document_image("nohyphen")) + assert res["code"] == RetCode.DATA_ERROR + assert "Image not found" in res["message"] + + class TestDocumentBatchChangeStatus: @pytest.mark.p2 def test_change_status_partial_failure_matrix(self, WebApiAuth, add_dataset, ragflow_tmp_dir):