fix(api): document image_id parsing for hyphenated thumbnail keys (#15115) (#15116)

### What problem does this PR solve?

Fixes #15115. `GET /api/v1/documents/images/<image_id>` returned **Image not found** when the thumbnail storage object key contained hyphens (e.g. `page-1.png`). Document APIs build URLs as `{dataset_id}-{thumbnail}`, but `get_document_image()` used `image_id.split("-")` and required exactly two segments, so keys like `<kb_id>-page-1.png` were rejected even though the blob existed. This PR splits only on the first hyphen (`split("-", 1)`) and sets `Content-Type` from the object key extension via `CONTENT_TYPE_MAP` instead of hardcoding `image/JPEG`.
2026-06-29 23:41:12 +08:00 · 2026-06-01 19:54:14 -07:00
parent a4bc066f74
commit 0f6f7b3c3c
2 changed files with 67 additions and 4 deletions
--- a/api/apps/restful_apis/document_api.py
+++ b/api/apps/restful_apis/document_api.py
@@ -1641,6 +1641,25 @@ async def stop_parse_documents(tenant_id, dataset_id):
        return get_error_data_result(message="Internal server error")


+def _parse_document_image_id(image_id: str) -> tuple[str, str] | None:
+    """Split a composite document image ID into storage bucket and object key.
+
+    Thumbnail URLs use ``{dataset_id}-{thumbnail}``. The split happens at the
+    first hyphen only, so the bucket part never contains a hyphen while the
+    object key may contain more of them (e.g. ``page-1.png``).
+
+    Args:
+        image_id: Path segment from ``GET /documents/images/<image_id>``.
+
+    Returns:
+        ``(bucket, object_key)``; ``None`` if no hyphen or either part is empty.
+    """
+    parts = image_id.split("-", 1)  # maxsplit=1 keeps later hyphens in the key
+    if len(parts) != 2 or not parts[0] or not parts[1]:
+        return None
+    return parts[0], parts[1]
+
+
 def _detect_image_content_type_from_bytes(data):
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
@@ -1680,7 +1699,7 @@ async def get_document_image(image_id):
        required: true
        schema:
          type: string
-        description: The image ID (format: bucket-name-image-name)
+        description: Composite ID ``{dataset_id}-{thumbnail_object_key}`` (split on first hyphen only)
    responses:
      200:
        description: Image file
@@ -1691,10 +1710,10 @@ async def get_document_image(image_id):
              format: binary
    """
    try:
-        arr = image_id.split("-")
-        if len(arr) != 2:
+        parsed = _parse_document_image_id(image_id)
+        if not parsed:
            return get_data_error_result(message="Image not found.")
-        bkt, nm = image_id.split("-")
+        bkt, nm = parsed
        data = await thread_pool_exec(settings.STORAGE_IMPL.get, bkt, nm)
        if not data:
            return get_data_error_result(message="Image not found.")