fix(api): POST /documents/stop removes partial chunks and resets counters (#15789)

### What problem does this PR solve?

`POST /api/v1/datasets/{dataset_id}/documents/stop`
(`stop_parse_documents`) cancels parsing tasks and sets `run` to
`CANCEL`, but it does **not** remove chunks already indexed in the doc
store or reset `progress` / `chunk_num`. REST callers can end up with a
“cancelled” document that still returns partial chunks in `GET
.../chunks` and in retrieval.

Legacy `DELETE /api/v1/datasets/{dataset_id}/chunks` (`stop_parsing`)
already performs full cleanup: it resets counters and calls
`docStoreConn.delete`. This PR aligns the newer stop endpoint with that
behavior so both paths leave the dataset consistent.

Fixes [#15788](https://github.com/infiniflow/ragflow/issues/15788).

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

### Changes

- Update `stop_parse_documents` in `document_api.py` to reset `progress`
and `chunk_num` to `0` and, when the dataset's index exists in the doc
store, delete partial chunks via `docStoreConn.delete` after
`cancel_all_task_of`.
- Add unit test `test_stop_parse_documents_cleans_partial_chunks` to
assert that the counters are reset, that doc store delete is invoked when
the index exists, and that it is skipped when the index does not exist.

### Test plan

- [x] Unit test: `pytest
test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py::TestDocRoutesUnit::test_stop_parse_documents_cleans_partial_chunks
-v`
- [ ] Manual: upload a slow document, start parse, call `POST
.../documents/stop` while `RUNNING`, verify `GET .../chunks` returns
zero chunks and UI `chunk_count` is 0
- [ ] Control: legacy `DELETE .../chunks` behavior unchanged

---------

Co-authored-by: Wang Qi <wangq8@outlook.com>
This commit is contained in:
kpdev
2026-06-11 00:51:32 -07:00
committed by GitHub
parent c15b2b3f66
commit de18313f97
2 changed files with 61 additions and 1 deletions

View File

@@ -1625,7 +1625,17 @@ async def stop_parse_documents(tenant_id, dataset_id):
continue
cancel_all_task_of(doc_id)
DocumentService.update_by_id(doc_id, {"run": str(TaskStatus.CANCEL.value)})
DocumentService.update_by_id(
doc_id,
{
"run": str(TaskStatus.CANCEL.value),
"progress": 0,
"chunk_num": 0,
},
)
index_name = search.index_name(tenant_id)
if settings.docStoreConn.index_exist(index_name, doc.kb_id):
settings.docStoreConn.delete({"doc_id": doc.id}, index_name, doc.kb_id)
success_count += 1
result = {"success_count": success_count}

View File

@@ -731,6 +731,56 @@ class TestDocRoutesUnit:
res = _run(module.stop_parsing.__wrapped__("tenant-1", "ds-1"))
assert res["code"] == 0
def test_stop_parse_documents_cleans_partial_chunks(self, monkeypatch):
    """Check that stopping a running parse resets counters and deletes chunks.

    ``stop_parse_documents`` is invoked twice against stubbed services:

    1. with the doc-store index reported as existing, where the document
       update must carry ``progress``/``chunk_num`` reset to ``0`` and a
       doc-store delete must be issued for ``doc-1``;
    2. with the index reported as missing, where no delete may be issued.
    """
    module = _load_doc_module(monkeypatch, module_basename="document_api")
    update_calls = []
    delete_calls = []

    def _record_update(doc_id, info):
        # Capture every DocumentService.update_by_id call; report success.
        update_calls.append((doc_id, info))
        return True

    def _record_delete(condition, index, kb_id):
        # Capture every doc-store delete call for later inspection.
        delete_calls.append((condition, index, kb_id))

    def _running_doc(_id):
        # A fresh dummy document in RUNNING state on each lookup.
        return True, _DummyDoc(doc_id="doc-1", run=module.TaskStatus.RUNNING.value)

    # (target, attribute, replacement) applied in the same order as before.
    stubs = (
        (module.KnowledgebaseService, "accessible", lambda **_kwargs: True),
        (module, "get_request_json", lambda: _AwaitableValue({"document_ids": ["doc-1"]})),
        (module, "check_duplicate_ids", lambda ids, _kind: (ids, [])),
        (module.DocumentService, "query", lambda **_kwargs: [object()]),
        (module.DocumentService, "get_by_id", _running_doc),
        (module.TaskService, "query", lambda **_kwargs: [SimpleNamespace(progress=0.5)]),
        (module, "cancel_all_task_of", lambda *_args, **_kwargs: None),
        (module.DocumentService, "update_by_id", _record_update),
    )
    for target, attribute, replacement in stubs:
        monkeypatch.setattr(target, attribute, replacement)

    # Scenario 1: the index exists, so partial chunks must be deleted.
    _patch_docstore(
        monkeypatch,
        module,
        index_exist=lambda *_args, **_kwargs: True,
        delete=_record_delete,
    )
    first_response = _run(module.stop_parse_documents.__wrapped__("tenant-1", "ds-1"))
    assert first_response["code"] == 0
    assert first_response["data"]["success_count"] == 1

    expected_update = {
        "run": module.TaskStatus.CANCEL.value,
        "progress": 0,
        "chunk_num": 0,
    }
    assert update_calls == [("doc-1", expected_update)]

    expected_delete = ({"doc_id": "doc-1"}, module.search.index_name("tenant-1"), "kb-1")
    assert delete_calls == [expected_delete]

    # Scenario 2: the index is missing, so the delete must be skipped.
    delete_calls.clear()
    _patch_docstore(monkeypatch, module, index_exist=lambda *_args, **_kwargs: False)
    second_response = _run(module.stop_parse_documents.__wrapped__("tenant-1", "ds-1"))
    assert second_response["code"] == 0
    assert delete_calls == []
def test_list_chunks_branches(self, monkeypatch):
module = _load_restful_chunk_module(monkeypatch)
monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False)