fix(api): document image_id parsing for hyphenated thumbnail keys (#15115) (#15116)

### What problem does this PR solve?

Fixes #15115.

`GET /api/v1/documents/images/<image_id>` returned **Image not found** when the thumbnail storage object key contained hyphens (e.g. `page-1.png`). Document APIs build URLs as `{dataset_id}-{thumbnail}`, but `get_document_image()` used `image_id.split("-")` and required exactly two segments, so keys like `<kb_id>-page-1.png` were rejected even though the blob existed.

This PR splits only on the first hyphen (`split("-", 1)`) and sets `Content-Type` from the object key extension via `CONTENT_TYPE_MAP` instead of hardcoding `image/JPEG`.
2026-06-29 23:41:12 +08:00 · 2026-06-01 19:54:14 -07:00
parent a4bc066f74
commit 0f6f7b3c3c
2 changed files with 67 additions and 4 deletions
--- a/api/apps/restful_apis/document_api.py
+++ b/api/apps/restful_apis/document_api.py
@@ -1641,6 +1641,25 @@ async def stop_parse_documents(tenant_id, dataset_id):
        return get_error_data_result(message="Internal server error")


+def _parse_document_image_id(image_id: str) -> tuple[str, str] | None:
+    """Break a composite image ID into its ``(bucket, object_key)`` pair.
+
+    IDs have the form ``{dataset_id}-{thumbnail}``. The text before the
+    first hyphen is the bucket (dataset/kb id); everything after it is the
+    object key, hyphens included (e.g. ``kb1-page-1.png`` -> ``page-1.png``).
+
+    Args:
+        image_id: Path segment from ``GET /documents/images/<image_id>``.
+
+    Returns:
+        ``(bucket, object_key)``; ``None`` if no hyphen or an empty side.
+    """
+    bucket, hyphen, object_key = image_id.partition("-")
+    if not (hyphen and bucket and object_key):
+        return None
+    return bucket, object_key
+
+
 def _detect_image_content_type_from_bytes(data):
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
@@ -1680,7 +1699,7 @@ async def get_document_image(image_id):
        required: true
        schema:
          type: string
-        description: The image ID (format: bucket-name-image-name)
+        description: Composite ID ``{dataset_id}-{thumbnail_object_key}`` (split on first hyphen only)
    responses:
      200:
        description: Image file
@@ -1691,10 +1710,10 @@ async def get_document_image(image_id):
              format: binary
    """
    try:
-        arr = image_id.split("-")
-        if len(arr) != 2:
+        parsed = _parse_document_image_id(image_id)
+        if not parsed:
            return get_data_error_result(message="Image not found.")
-        bkt, nm = image_id.split("-")
+        bkt, nm = parsed
        data = await thread_pool_exec(settings.STORAGE_IMPL.get, bkt, nm)
        if not data:
            return get_data_error_result(message="Image not found.")
--- a/test/testcases/test_web_api/test_document_app/test_document_metadata.py
+++ b/test/testcases/test_web_api/test_document_app/test_document_metadata.py
@@ -563,6 +563,50 @@ class TestDocumentMetadataUnit:
        assert res["code"] == 500
        assert "image boom" in res["message"]

+    def test_get_document_image_hyphenated_object_key(self, document_app_module, monkeypatch):
+        """Keys split on the first hyphen only; IDs lacking a bucket or key are rejected."""
+        module = document_app_module
+
+        class _Headers(dict):
+            def set(self, key, value):  # presumably mirrors the headers.set() the handler uses
+                self[key] = value
+
+        class _ImageResponse:
+            def __init__(self, data):
+                self.data = data
+                self.headers = _Headers()
+
+        storage_calls = []
+
+        def _storage_get(bkt, nm):
+            storage_calls.append((bkt, nm))  # record what the handler asked storage for
+            return b"png-bytes"
+
+        async def fake_thread_pool_exec(fn, *args, **kwargs):
+            return fn(*args, **kwargs)  # call synchronously inside the test
+
+        async def fake_make_response(data):
+            return _ImageResponse(data)
+
+        monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec)
+        monkeypatch.setattr(module, "make_response", fake_make_response)
+        monkeypatch.setattr(module.settings, "STORAGE_IMPL", SimpleNamespace(get=_storage_get))
+
+        image_id = "kb12345678901234567890123456789012-page-1.png"
+        res = _run(module.get_document_image(image_id))
+        assert isinstance(res, _ImageResponse)
+        assert storage_calls == [("kb12345678901234567890123456789012", "page-1.png")]
+        assert res.headers["Content-Type"] == "image/png"
+
+        # "only-one-part" is a *valid* ID under split("-", 1) -> ("only", "one-part"),
+        # so rejection cases must have no hyphen or an empty bucket/key side.
+        for bad_id in ("nohyphen", "-page-1.png", "kb123-"):
+            res = _run(module.get_document_image(bad_id))
+            assert res["code"] == RetCode.DATA_ERROR
+            assert "Image not found" in res["message"]
+        assert len(storage_calls) == 1  # rejected IDs return before the storage lookup
+
+
 class TestDocumentBatchChangeStatus:
    @pytest.mark.p2
    def test_change_status_partial_failure_matrix(self, WebApiAuth, add_dataset, ragflow_tmp_dir):