fix(api): document image_id parsing for hyphenated thumbnail keys (#15115) (#15116)

### What problem does this PR solve?

Fixes #15115.

`GET /api/v1/documents/images/<image_id>` returned **Image not found**
when the thumbnail storage object key contained hyphens (e.g.
`page-1.png`). Document APIs build URLs as `{dataset_id}-{thumbnail}`,
but `get_document_image()` used `image_id.split("-")` and required
exactly two segments, so keys like `<kb_id>-page-1.png` were rejected
even though the blob existed.

This PR splits only on the first hyphen (`split("-", 1)`), which relies on
the dataset ID itself containing no hyphens, and sets `Content-Type` from
the object key extension via `CONTENT_TYPE_MAP` instead of hardcoding
`image/JPEG`.
This commit is contained in:
kpdev
2026-06-01 19:54:14 -07:00
committed by GitHub
parent a4bc066f74
commit 0f6f7b3c3c
2 changed files with 67 additions and 4 deletions

View File

@@ -1641,6 +1641,25 @@ async def stop_parse_documents(tenant_id, dataset_id):
return get_error_data_result(message="Internal server error")
def _parse_document_image_id(image_id: str) -> tuple[str, str] | None:
"""Split a composite document image ID into storage bucket and object key.
Thumbnail URLs use ``{dataset_id}-{thumbnail}``. Only the first hyphen
separates the dataset/kb id (bucket) from the object key, which may
contain additional hyphens (e.g. ``page-1.png``).
Args:
image_id: Path segment from ``GET /documents/images/<image_id>``.
Returns:
``(bucket, object_key)`` when valid, otherwise ``None``.
"""
parts = image_id.split("-", 1)
if len(parts) != 2 or not parts[0] or not parts[1]:
return None
return parts[0], parts[1]
def _detect_image_content_type_from_bytes(data):
if data.startswith(b"\x89PNG\r\n\x1a\n"):
return "image/png"
@@ -1680,7 +1699,7 @@ async def get_document_image(image_id):
required: true
schema:
type: string
description: The image ID (format: bucket-name-image-name)
description: Composite ID ``{dataset_id}-{thumbnail_object_key}`` (split on first hyphen only)
responses:
200:
description: Image file
@@ -1691,10 +1710,10 @@ async def get_document_image(image_id):
format: binary
"""
try:
arr = image_id.split("-")
if len(arr) != 2:
parsed = _parse_document_image_id(image_id)
if not parsed:
return get_data_error_result(message="Image not found.")
bkt, nm = image_id.split("-")
bkt, nm = parsed
data = await thread_pool_exec(settings.STORAGE_IMPL.get, bkt, nm)
if not data:
return get_data_error_result(message="Image not found.")

View File

@@ -563,6 +563,50 @@ class TestDocumentMetadataUnit:
assert res["code"] == 500
assert "image boom" in res["message"]
def test_get_document_image_hyphenated_object_key(self, document_app_module, monkeypatch):
    """Hyphenated thumbnail keys are parsed with split('-', 1) and return correct MIME type.

    Covers two things:
      * a composite ID whose object key contains hyphens is split on the
        first hyphen only, the storage backend is queried with the resulting
        ``(bucket, key)`` pair, and the response carries ``image/png``;
      * IDs that cannot be split into a non-empty bucket and a non-empty key
        are answered with a "not found" data error without touching storage.
    """
    module = document_app_module

    class _Headers(dict):
        # Minimal stand-in for response headers: a dict that also offers set().
        def set(self, key, value):
            self[key] = value

    class _ImageResponse:
        # Minimal stand-in for the object returned by make_response().
        def __init__(self, data):
            self.data = data
            self.headers = _Headers()

    storage_calls = []

    def _storage_get(bkt, nm):
        # Record every lookup so the parsed bucket/key pair can be asserted.
        storage_calls.append((bkt, nm))
        return b"png-bytes"

    async def fake_thread_pool_exec(fn, *args, **kwargs):
        # Run the storage call inline instead of dispatching it elsewhere.
        return fn(*args, **kwargs)

    async def fake_make_response(data):
        return _ImageResponse(data)

    monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec)
    monkeypatch.setattr(module, "make_response", fake_make_response)
    monkeypatch.setattr(
        module.settings,
        "STORAGE_IMPL",
        SimpleNamespace(get=_storage_get),
    )

    image_id = "kb12345678901234567890123456789012-page-1.png"
    res = _run(module.get_document_image(image_id))
    assert isinstance(res, _ImageResponse)
    assert storage_calls == [("kb12345678901234567890123456789012", "page-1.png")]
    assert res.headers["Content-Type"] == "image/png"

    # The negative IDs must be ones the first-hyphen parser really rejects.
    # An ID such as "only-one-part" is a valid composite ID ("only" +
    # "one-part") under split('-', 1), and the fake storage always returns
    # bytes, so it would not produce a data error.
    for bad_id in ("onlyonepart", "-page.png", "kb123-"):
        res = _run(module.get_document_image(bad_id))
        assert res["code"] == RetCode.DATA_ERROR
        assert "Image not found" in res["message"]

    # Rejected IDs return before the storage lookup, so only the first,
    # valid request may have reached the backend.
    assert len(storage_calls) == 1
class TestDocumentBatchChangeStatus:
@pytest.mark.p2
def test_change_status_partial_failure_matrix(self, WebApiAuth, add_dataset, ragflow_tmp_dir):