diff --git a/api/apps/restful_apis/document_api.py b/api/apps/restful_apis/document_api.py index da4643faf1..d1aca22ae0 100644 --- a/api/apps/restful_apis/document_api.py +++ b/api/apps/restful_apis/document_api.py @@ -1641,6 +1641,32 @@ async def stop_parse_documents(tenant_id, dataset_id): return get_error_data_result(message="Internal server error") +def _detect_image_content_type_from_bytes(data): + if data.startswith(b"\x89PNG\r\n\x1a\n"): + return "image/png" + if data[:3] == b"\xff\xd8\xff": + return "image/jpeg" + if data[:6] in (b"GIF87a", b"GIF89a"): + return "image/gif" + if len(data) >= 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP": + return "image/webp" + if data[:2] == b"BM": + return "image/bmp" + return None + + +def _content_type_for_document_image(object_name, data): + ext_match = re.search(r"\.([^.]+)$", object_name.lower()) + if ext_match: + content_type = CONTENT_TYPE_MAP.get(ext_match.group(1)) + if content_type and content_type.startswith("image/"): + return content_type + detected = _detect_image_content_type_from_bytes(data) + if detected: + return detected + return "application/octet-stream" + + @manager.route("/documents/images/", methods=["GET"]) # noqa: F821 async def get_document_image(image_id): """ @@ -1670,8 +1696,11 @@ async def get_document_image(image_id): return get_data_error_result(message="Image not found.") bkt, nm = image_id.split("-") data = await thread_pool_exec(settings.STORAGE_IMPL.get, bkt, nm) + if not data: + return get_data_error_result(message="Image not found.") + content_type = _content_type_for_document_image(nm, data) response = await make_response(data) - response.headers.set("Content-Type", "image/JPEG") + response.headers.set("Content-Type", content_type) return response except Exception as e: return server_error_response(e) diff --git a/test/testcases/test_web_api/test_document_app/test_document_metadata.py b/test/testcases/test_web_api/test_document_app/test_document_metadata.py index 
@pytest.mark.p2
def test_get_document_image_content_type_from_object_extension_unit(self, document_app_module, monkeypatch):
    """A ``.png`` object name yields an ``image/png`` Content-Type header."""
    # NOTE(review): the payload is itself a valid PNG, so this assertion holds
    # via either the extension lookup or the byte sniffing; confirm that
    # CONTENT_TYPE_MAP maps "png" before tightening the payload.
    module = document_app_module

    class _Headers(dict):
        # Minimal stand-in exposing the ``headers.set`` call the endpoint makes.
        def set(self, key, value):
            self[key] = value

    class _ImageResponse:
        def __init__(self, data):
            self.data = data
            self.headers = _Headers()

    png_bytes = (
        b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01"
        b"\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89"
        b"\x00\x00\x00\nIDATx\x9cc\x00\x01\x00\x00\x05\x00\x01"
        b"\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82"
    )

    async def fake_thread_pool_exec(*_args, **_kwargs):
        return png_bytes

    async def fake_make_response(data):
        return _ImageResponse(data)

    monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec)
    monkeypatch.setattr(module, "make_response", fake_make_response)
    res = _run(module.get_document_image("kb1-object.png"))
    assert isinstance(res, _ImageResponse)
    assert res.headers["Content-Type"] == "image/png"


@pytest.mark.p2
def test_get_document_image_content_type_from_magic_bytes_unit(self, document_app_module, monkeypatch):
    """An extension-less object name falls back to sniffing the PNG signature."""
    module = document_app_module

    class _Headers(dict):
        # Minimal stand-in exposing the ``headers.set`` call the endpoint makes.
        def set(self, key, value):
            self[key] = value

    class _ImageResponse:
        def __init__(self, data):
            self.data = data
            self.headers = _Headers()

    png_bytes = (
        b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01"
        b"\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89"
        b"\x00\x00\x00\nIDATx\x9cc\x00\x01\x00\x00\x05\x00\x01"
        b"\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82"
    )

    async def fake_thread_pool_exec(*_args, **_kwargs):
        return png_bytes

    async def fake_make_response(data):
        return _ImageResponse(data)

    monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec)
    monkeypatch.setattr(module, "make_response", fake_make_response)
    res = _run(module.get_document_image("kb1-a1b2c3d4e5f6"))
    assert isinstance(res, _ImageResponse)
    assert res.headers["Content-Type"] == "image/png"


@pytest.mark.p2
def test_get_document_image_missing_blob_unit(self, document_app_module, monkeypatch):
    """A storage lookup that returns nothing is reported as "Image not found."."""
    module = document_app_module
    lookups = []

    async def fake_thread_pool_exec(*args, **_kwargs):
        # args is (storage getter, bucket, object name); keep the last two.
        lookups.append(args[1:])
        return None

    monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec)
    # The id must contain exactly one "-": the endpoint unpacks
    # image_id.split("-") into two names, so an id with more separators can
    # never reach the storage lookup and would not exercise the empty-blob
    # branch at all.
    res = _run(module.get_document_image("kb1-objectkey"))
    assert lookups == [("kb1", "objectkey")]
    assert res["code"] == RetCode.DATA_ERROR
    assert res["message"] == "Image not found."
