Validate chunk image_base64 before doc-store write (#15364)

## Summary

Fixes [#15363](https://github.com/infiniflow/ragflow/issues/15363) —
`add_chunk` / `update_chunk` indexed chunks with `image_id` before
validating or storing `image_base64`, leaving orphan chunks on invalid
input.

## Related Issue

Fixes #15363

## Change Type

- [x] Bug fix
- [x] Regression tests

## What Changed

- Added `_decode_chunk_image_base64()` — strict base64 decode with
structured 4xx errors
- Added `_store_chunk_image_or_error()` — catches `store_chunk_image`
failures
- **`add_chunk` / `update_chunk`**: decode + store image **before**
`docStoreConn.insert` / `update`; only set `img_id` after successful
storage

## Files Changed

| File | Change |
|------|--------|
| `api/apps/restful_apis/chunk_api.py` | Helpers + reorder image
handling |
| `test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py`
| 3 regression tests |

## Validation

```bash
cd /root/gittensor/ragflow
pytest test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py::test_restful_add_chunk_invalid_image_base64_does_not_index_chunk -v
pytest test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py::test_restful_update_chunk_invalid_image_base64_does_not_update_chunk -v
pytest test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py::test_restful_add_chunk_valid_image_base64_stores_before_insert -v
pytest test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py -v
```

## Test Plan

- [x] Invalid `image_base64` on add → 4xx, no doc-store insert
- [x] Invalid `image_base64` on update → 4xx, no doc-store update
- [x] Valid PNG base64 on add → image stored, chunk indexed with
`img_id`
- [ ] CI green
This commit is contained in:
kpdev
2026-05-29 04:36:46 -07:00
committed by GitHub
parent 04aa8d04e8
commit cb1ea5a47f
2 changed files with 116 additions and 9 deletions

View File

@@ -14,6 +14,7 @@
# limitations under the License.
#
import base64
import binascii
import datetime
import logging
import re
@@ -66,6 +67,31 @@ DOC_STOP_PARSING_INVALID_STATE_MESSAGE = "Can't stop parsing document that has n
DOC_STOP_PARSING_INVALID_STATE_ERROR_CODE = "DOC_STOP_PARSING_INVALID_STATE"
def _decode_chunk_image_base64(image_base64):
if not isinstance(image_base64, str) or not image_base64.strip():
return None, "`image_base64` must be a non-empty string"
try:
image_binary = base64.b64decode(image_base64, validate=True)
except (binascii.Error, ValueError):
return None, "Invalid `image_base64`"
if not image_binary:
return None, "`image_base64` is empty"
return image_binary, None
def _store_chunk_image_or_error(dataset_id, chunk_id, image_binary):
try:
store_chunk_image(dataset_id, chunk_id, image_binary)
except Exception:
logging.exception(
"Failed to store chunk image. dataset_id=%s chunk_id=%s",
dataset_id,
chunk_id,
)
return "Failed to store chunk image"
return None
class Chunk(BaseModel):
id: str = ""
content: str = ""
@@ -507,8 +533,13 @@ async def add_chunk(tenant_id, dataset_id, document_id):
except ValueError as exc:
return get_error_data_result(f"`tag_feas` {exc}")
image_base64 = req.get("image_base64")
if image_base64:
if "image_base64" in req:
image_binary, image_err = _decode_chunk_image_base64(req.get("image_base64"))
if image_err:
return get_error_data_result(message=image_err)
store_err = _store_chunk_image_or_error(dataset_id, chunk_id, image_binary)
if store_err:
return get_error_data_result(message=store_err)
d["img_id"] = f"{dataset_id}-{chunk_id}"
d["doc_type_kwd"] = "image"
@@ -520,9 +551,6 @@ async def add_chunk(tenant_id, dataset_id, document_id):
d[f"q_{len(v)}_vec"] = v.tolist()
settings.docStoreConn.insert([d], search.index_name(dataset_tenant_id), dataset_id)
if image_base64:
store_chunk_image(dataset_id, chunk_id, base64.b64decode(image_base64))
DocumentService.increment_chunk_num(doc.id, doc.kb_id, c, 1, 0)
key_mapping = {
"id": "id",
@@ -647,8 +675,13 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
d["tag_feas"] = validate_tag_features(req["tag_feas"])
except ValueError as exc:
return get_error_data_result(f"`tag_feas` {exc}")
image_base64 = req.get("image_base64")
if image_base64:
if "image_base64" in req:
image_binary, image_err = _decode_chunk_image_base64(req.get("image_base64"))
if image_err:
return get_error_data_result(message=image_err)
store_err = _store_chunk_image_or_error(dataset_id, chunk_id, image_binary)
if store_err:
return get_error_data_result(message=store_err)
d["img_id"] = f"{dataset_id}-{chunk_id}"
d["doc_type_kwd"] = "image"
@@ -671,8 +704,6 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
d[f"q_{len(v)}_vec"] = v.tolist()
settings.docStoreConn.update({"id": chunk_id}, d, search.index_name(dataset_tenant_id), dataset_id)
if image_base64:
store_chunk_image(dataset_id, chunk_id, base64.b64decode(image_base64))
return get_result()

View File

@@ -653,3 +653,79 @@ def test_restful_chunk_guard_branches_unit(monkeypatch):
res = _run(_route_core(module.switch_chunks)("tenant-1", "kb-1", "doc-1"))
assert res["message"] == "`available_int` or `available` is required.", res
@pytest.mark.p2
def test_restful_add_chunk_invalid_image_base64_does_not_index_chunk(monkeypatch):
module = _load_chunk_api_module(monkeypatch)
module.request = SimpleNamespace(args={}, headers={})
module.settings.docStoreConn.inserted.clear()
monkeypatch.setattr(
module,
"get_request_json",
lambda: _AwaitableValue({"content": "chunk with bad image", "image_base64": "not-valid-base64!!!"}),
)
res = _run(_route_core(module.add_chunk)("tenant-1", "kb-1", "doc-1"))
assert res["code"] == module.RetCode.DATA_ERROR, res
assert res["message"] == "Invalid `image_base64`", res
assert module.settings.docStoreConn.inserted == [], res
assert module.DocumentService.increment_calls == [], res
@pytest.mark.p2
def test_restful_add_chunk_empty_image_base64_does_not_index_chunk(monkeypatch):
module = _load_chunk_api_module(monkeypatch)
module.request = SimpleNamespace(args={}, headers={})
module.settings.docStoreConn.inserted.clear()
monkeypatch.setattr(
module,
"get_request_json",
lambda: _AwaitableValue({"content": "chunk with empty image", "image_base64": ""}),
)
res = _run(_route_core(module.add_chunk)("tenant-1", "kb-1", "doc-1"))
assert res["code"] == module.RetCode.DATA_ERROR, res
assert res["message"] == "`image_base64` must be a non-empty string", res
assert module.settings.docStoreConn.inserted == [], res
assert module.DocumentService.increment_calls == [], res
@pytest.mark.p2
def test_restful_update_chunk_invalid_image_base64_does_not_update_chunk(monkeypatch):
module = _load_chunk_api_module(monkeypatch)
module.request = SimpleNamespace(args={}, headers={})
module.settings.docStoreConn.updated.clear()
monkeypatch.setattr(
module,
"get_request_json",
lambda: _AwaitableValue({"content": "updated chunk", "image_base64": "not-valid-base64!!!"}),
)
res = _run(_route_core(module.update_chunk)("tenant-1", "kb-1", "doc-1", "chunk-1"))
assert res["code"] == module.RetCode.DATA_ERROR, res
assert res["message"] == "Invalid `image_base64`", res
assert module.settings.docStoreConn.updated == [], res
@pytest.mark.p2
def test_restful_add_chunk_valid_image_base64_stores_before_insert(monkeypatch):
module = _load_chunk_api_module(monkeypatch)
module.request = SimpleNamespace(args={}, headers={})
module.settings.docStoreConn.inserted.clear()
store_calls = []
monkeypatch.setattr(module, "store_chunk_image", lambda bucket, name, binary: store_calls.append((bucket, name, binary)))
valid_b64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8BQDwAEhQGAhKmMIQAAAABJRU5ErkJggg=="
monkeypatch.setattr(
module,
"get_request_json",
lambda: _AwaitableValue({"content": "chunk with image", "image_base64": valid_b64}),
)
res = _run(_route_core(module.add_chunk)("tenant-1", "kb-1", "doc-1"))
assert res["code"] == 0, res
assert store_calls, "store_chunk_image should run before doc-store insert"
assert module.settings.docStoreConn.inserted, "chunk should be indexed after image stored"
inserted = module.settings.docStoreConn.inserted[-1]
assert inserted.get("img_id"), inserted
assert inserted.get("doc_type_kwd") == "image", inserted