fix(api): guard dataset delete when File2Document row is missing (#15533)

## Summary
Fixes #15532 — `delete_datasets()` crashes with `IndexError` when a
document has no `File2Document` row.
`delete_datasets()` in `dataset_api_service.py` called
`File2DocumentService.get_by_document_id()` and immediately accessed
`f2d[0].file_id` without checking whether the lookup returned any rows.
Documents created via API ingestion or connector sync may exist without
a linked file record, causing dataset deletion to abort with HTTP 500.
This PR mirrors the existing guard already used in `file_service.py` and
`document_api_service.py`.
This commit is contained in:
bohdansolovie
2026-06-11 15:18:08 +08:00
committed by GitHub
parent 9871a7e0b6
commit 47fb462e46
2 changed files with 197 additions and 6 deletions

View File

@@ -143,12 +143,23 @@ async def delete_datasets(tenant_id: str, ids: list = None, delete_all: bool = F
errors.append(f"Remove document '{doc.id}' error for dataset '{kb_id}'")
continue
f2d = File2DocumentService.get_by_document_id(doc.id)
FileService.filter_delete(
[
File.source_type == FileSource.KNOWLEDGEBASE,
File.id == f2d[0].file_id,
]
)
if f2d:
FileService.filter_delete(
[
File.source_type == FileSource.KNOWLEDGEBASE,
File.id == f2d[0].file_id,
]
)
else:
# Normal uploads create a File2Document row via FileService.add_file_from_kb.
# A missing row usually means stale/partial data (e.g. link removed earlier,
# failed post-insert file linkage, or legacy rows). Deletion still proceeds.
logging.warning(
"delete_datasets: document %s in dataset %s has no File2Document row; "
"skipping linked file delete",
doc.id,
kb_id,
)
File2DocumentService.delete_by_document_id(doc.id)
FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kb.name])

View File

@@ -0,0 +1,180 @@
#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Regression tests for delete_datasets() in dataset_api_service."""
import importlib.util
import sys
from pathlib import Path
from types import ModuleType, SimpleNamespace
from unittest.mock import MagicMock
import pytest
pytestmark = pytest.mark.p2
def _stub(monkeypatch, name, **attrs):
mod = ModuleType(name)
for key, value in attrs.items():
setattr(mod, key, value)
monkeypatch.setitem(sys.modules, name, mod)
if "." in name:
parent_name, _, child_name = name.rpartition(".")
parent_mod = sys.modules.get(parent_name)
if parent_mod is not None:
monkeypatch.setattr(parent_mod, child_name, mod, raising=False)
return mod
def _load_delete_datasets_module(monkeypatch, *, f2d_rows, file_filter_delete):
f2d_delete = MagicMock()
kb = SimpleNamespace(id="kb-1", tenant_id="tenant-1", name="test-kb")
doc = SimpleNamespace(id="doc-1")
_stub(
monkeypatch,
"api.db.services.document_service",
DocumentService=SimpleNamespace(
query=lambda kb_id: [doc],
remove_document=lambda doc, tenant_id: True,
),
queue_raptor_o_graphrag_tasks=MagicMock(),
)
_stub(
monkeypatch,
"api.db.services.file2document_service",
File2DocumentService=SimpleNamespace(
get_by_document_id=lambda doc_id: f2d_rows,
delete_by_document_id=f2d_delete,
),
)
_stub(
monkeypatch,
"api.db.services.file_service",
FileService=SimpleNamespace(filter_delete=file_filter_delete),
)
_stub(
monkeypatch,
"api.db.services.knowledgebase_service",
KnowledgebaseService=SimpleNamespace(
get_or_none=lambda id, tenant_id: kb,
delete_by_id=lambda kb_id: True,
query=lambda **kwargs: [],
),
)
_stub(
monkeypatch,
"api.db.services.connector_service",
Connector2KbService=SimpleNamespace(),
)
_stub(
monkeypatch,
"api.db.services.task_service",
TaskService=SimpleNamespace(),
GRAPH_RAPTOR_FAKE_DOC_ID="fake-doc",
)
_stub(
monkeypatch,
"api.db.services.user_service",
TenantService=SimpleNamespace(),
UserService=SimpleNamespace(),
UserTenantService=SimpleNamespace(),
)
_stub(
monkeypatch,
"api.db.services.tenant_llm_service",
TenantLLMService=SimpleNamespace(),
)
_stub(
monkeypatch,
"api.db.joint_services.tenant_model_service",
get_model_config_from_provider_instance=MagicMock(),
)
_stub(
monkeypatch,
"api.utils.api_utils",
deep_merge=MagicMock(),
get_parser_config=MagicMock(),
remap_dictionary_keys=MagicMock(),
verify_embedding_availability=MagicMock(),
)
_stub(
monkeypatch,
"common.settings",
docStoreConn=SimpleNamespace(delete_idx=lambda *_args, **_kwargs: None),
)
_stub(
monkeypatch,
"api.db.db_models",
File=SimpleNamespace(source_type="source_type", id="id", type="type", name="name"),
)
_stub(
monkeypatch,
"common.constants",
PAGERANK_FLD="pagerank",
TAG_FLD="tag",
FileSource=SimpleNamespace(KNOWLEDGEBASE="knowledgebase"),
StatusEnum=SimpleNamespace(),
LLMType=SimpleNamespace(),
)
_stub(
monkeypatch,
"rag.nlp.search",
index_name=lambda tenant_id: f"idx-{tenant_id}",
)
repo_root = Path(__file__).resolve().parents[5]
module_path = repo_root / "api" / "apps" / "services" / "dataset_api_service.py"
spec = importlib.util.spec_from_file_location("test_delete_datasets_module", module_path)
module = importlib.util.module_from_spec(spec)
monkeypatch.setitem(sys.modules, "test_delete_datasets_module", module)
spec.loader.exec_module(module)
return module, f2d_delete
@pytest.mark.asyncio
async def test_delete_datasets_skips_file_delete_when_no_file2document(monkeypatch):
"""Documents without a File2Document row must not crash dataset deletion."""
file_filter_delete = MagicMock(return_value=0)
module, f2d_delete = _load_delete_datasets_module(
monkeypatch,
f2d_rows=[],
file_filter_delete=file_filter_delete,
)
ok, result = await module.delete_datasets("tenant-1", ids=["kb-1"])
assert ok is True
assert result == {"success_count": 1}
file_filter_delete.assert_called_once()
f2d_delete.assert_called_once_with("doc-1")
@pytest.mark.asyncio
async def test_delete_datasets_deletes_linked_file_when_file2document_exists(monkeypatch):
f2d_row = SimpleNamespace(file_id="file-1")
file_filter_delete = MagicMock(side_effect=[1, 0])
module, _f2d_delete = _load_delete_datasets_module(
monkeypatch,
f2d_rows=[f2d_row],
file_filter_delete=file_filter_delete,
)
ok, result = await module.delete_datasets("tenant-1", ids=["kb-1"])
assert ok is True
assert result == {"success_count": 1}
assert file_filter_delete.call_count == 2