From 252cc19f93a24dc212cb6c57db68cf4901790ad5 Mon Sep 17 00:00:00 2001
From: kpdev <156195510+kiannidev@users.noreply.github.com>
Date: Mon, 1 Jun 2026 04:08:32 -0700
Subject: [PATCH] Infer Content-Type for document image endpoint (#15368)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

Fixes [#15367](https://github.com/infiniflow/ragflow/issues/15367) — `GET /api/v1/documents/images/<image_id>` always returned `Content-Type: image/JPEG`, even for PNG/WebP chunk images and for thumbnails whose object keys carry a file extension.

## Related Issue

Fixes #15367

## Change Type

- [x] Bug fix
- [x] Regression tests
- [ ] New feature
- [ ] Refactor

## What Changed

- Added `_detect_image_content_type_from_bytes()` — PNG/JPEG/GIF/WebP/BMP magic-byte detection
- Added `_content_type_for_document_image()` — object-key extension via `CONTENT_TYPE_MAP`, then magic bytes, else `application/octet-stream`
- **`get_document_image()`** — sets the inferred `Content-Type` instead of the hardcoded `image/JPEG`
- Also guards against a missing storage blob (`Image not found.`) to avoid `make_response(None)` (same handler; complements #15365)

## Files Changed

| File | Change |
|------|--------|
| `api/apps/restful_apis/document_api.py` | MIME inference helpers + handler update |
| `test/testcases/test_web_api/test_document_app/test_document_metadata.py` | 3 unit tests |

## Validation

```bash
cd /root/gittensor/ragflow
pytest test/testcases/test_web_api/test_document_app/test_document_metadata.py::TestDocumentMetadataUnit::test_get_document_image_content_type_from_object_extension_unit -v
pytest test/testcases/test_web_api/test_document_app/test_document_metadata.py::TestDocumentMetadataUnit::test_get_document_image_content_type_from_magic_bytes_unit -v
pytest test/testcases/test_web_api/test_document_app/test_document_metadata.py::TestDocumentMetadataUnit::test_get_document_image_missing_blob_unit -v
```

## Test Plan

- [x] `.png` object key → `image/png`
- [x] Extensionless chunk key + PNG bytes →
`image/png` (magic bytes) - [x] Missing blob → 4xx `"Image not found."` - [ ] CI green --- api/apps/restful_apis/document_api.py | 31 +++++++- .../test_document_metadata.py | 76 +++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/api/apps/restful_apis/document_api.py b/api/apps/restful_apis/document_api.py index da4643faf1..d1aca22ae0 100644 --- a/api/apps/restful_apis/document_api.py +++ b/api/apps/restful_apis/document_api.py @@ -1641,6 +1641,32 @@ async def stop_parse_documents(tenant_id, dataset_id): return get_error_data_result(message="Internal server error") +def _detect_image_content_type_from_bytes(data): + if data.startswith(b"\x89PNG\r\n\x1a\n"): + return "image/png" + if data[:3] == b"\xff\xd8\xff": + return "image/jpeg" + if data[:6] in (b"GIF87a", b"GIF89a"): + return "image/gif" + if len(data) >= 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP": + return "image/webp" + if data[:2] == b"BM": + return "image/bmp" + return None + + +def _content_type_for_document_image(object_name, data): + ext_match = re.search(r"\.([^.]+)$", object_name.lower()) + if ext_match: + content_type = CONTENT_TYPE_MAP.get(ext_match.group(1)) + if content_type and content_type.startswith("image/"): + return content_type + detected = _detect_image_content_type_from_bytes(data) + if detected: + return detected + return "application/octet-stream" + + @manager.route("/documents/images/", methods=["GET"]) # noqa: F821 async def get_document_image(image_id): """ @@ -1670,8 +1696,11 @@ async def get_document_image(image_id): return get_data_error_result(message="Image not found.") bkt, nm = image_id.split("-") data = await thread_pool_exec(settings.STORAGE_IMPL.get, bkt, nm) + if not data: + return get_data_error_result(message="Image not found.") + content_type = _content_type_for_document_image(nm, data) response = await make_response(data) - response.headers.set("Content-Type", "image/JPEG") + response.headers.set("Content-Type", content_type) 
return response except Exception as e: return server_error_response(e) diff --git a/test/testcases/test_web_api/test_document_app/test_document_metadata.py b/test/testcases/test_web_api/test_document_app/test_document_metadata.py index 71bf32d565..e7e196ff00 100644 --- a/test/testcases/test_web_api/test_document_app/test_document_metadata.py +++ b/test/testcases/test_web_api/test_document_app/test_document_metadata.py @@ -450,6 +450,82 @@ class TestDocumentMetadataUnit: assert res["code"] == 500 assert "download boom" in res["message"] + @pytest.mark.p2 + def test_get_document_image_content_type_from_object_extension_unit(self, document_app_module, monkeypatch): + module = document_app_module + + class _Headers(dict): + def set(self, key, value): + self[key] = value + + class _ImageResponse: + def __init__(self, data): + self.data = data + self.headers = _Headers() + + png_bytes = ( + b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01" + b"\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89" + b"\x00\x00\x00\nIDATx\x9cc\x00\x01\x00\x00\x05\x00\x01" + b"\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82" + ) + + async def fake_thread_pool_exec(*_args, **_kwargs): + return png_bytes + + async def fake_make_response(data): + return _ImageResponse(data) + + monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec) + monkeypatch.setattr(module, "make_response", fake_make_response) + res = _run(module.get_document_image("kb1-object.png")) + assert isinstance(res, _ImageResponse) + assert res.headers["Content-Type"] == "image/png" + + @pytest.mark.p2 + def test_get_document_image_content_type_from_magic_bytes_unit(self, document_app_module, monkeypatch): + module = document_app_module + + class _Headers(dict): + def set(self, key, value): + self[key] = value + + class _ImageResponse: + def __init__(self, data): + self.data = data + self.headers = _Headers() + + png_bytes = ( + b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01" + 
b"\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89" + b"\x00\x00\x00\nIDATx\x9cc\x00\x01\x00\x00\x05\x00\x01" + b"\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82" + ) + + async def fake_thread_pool_exec(*_args, **_kwargs): + return png_bytes + + async def fake_make_response(data): + return _ImageResponse(data) + + monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec) + monkeypatch.setattr(module, "make_response", fake_make_response) + res = _run(module.get_document_image("kb1-a1b2c3d4e5f6")) + assert isinstance(res, _ImageResponse) + assert res.headers["Content-Type"] == "image/png" + + @pytest.mark.p2 + def test_get_document_image_missing_blob_unit(self, document_app_module, monkeypatch): + module = document_app_module + + async def fake_thread_pool_exec(*_args, **_kwargs): + return None + + monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec) + res = _run(module.get_document_image("kb1-object-key")) + assert res["code"] == RetCode.DATA_ERROR + assert res["message"] == "Image not found." + @pytest.mark.skip(reason="Moved to /api/v1/documents/images/") def test_get_image_success_and_exception_unit(self, document_app_module, monkeypatch):