diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 429de7be45..d48885ec90 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -23,16 +23,11 @@ from api.db.services.document_service import DocumentService from api.db.services.file2document_service import File2DocumentService from api.utils.api_utils import ( get_data_error_result, - get_json_result, - get_request_json, server_error_response, - validate_request, ) from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers from common import settings -from common.constants import RetCode, TaskStatus from common.misc_utils import thread_pool_exec -from rag.nlp import search @manager.route("/get/", methods=["GET"]) # noqa: F821 @@ -74,56 +69,3 @@ async def download_attachment(attachment_id): except Exception as e: return server_error_response(e) - -@manager.route("/change_parser", methods=["POST"]) # noqa: F821 -@login_required -@validate_request("doc_id") -async def change_parser(): - req = await get_request_json() - if not DocumentService.accessible(req["doc_id"], current_user.id): - return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR) - - e, doc = DocumentService.get_by_id(req["doc_id"]) - if not e: - return get_data_error_result(message="Document not found!") - - def reset_doc(): - nonlocal doc - e = DocumentService.update_by_id(doc.id, {"pipeline_id": req["pipeline_id"], "parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": TaskStatus.UNSTART.value}) - if not e: - return get_data_error_result(message="Document not found!") - if doc.token_num > 0: - e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duration * -1) - if not e: - return get_data_error_result(message="Document not found!") - tenant_id = DocumentService.get_tenant_id(req["doc_id"]) - if not tenant_id: - return get_data_error_result(message="Tenant not found!") - 
DocumentService.delete_chunk_images(doc, tenant_id) - if settings.docStoreConn.index_exist(search.index_name(tenant_id), doc.kb_id): - settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id) - return None - - try: - if "pipeline_id" in req and req["pipeline_id"] != "": - if doc.pipeline_id == req["pipeline_id"]: - return get_json_result(data=True) - DocumentService.update_by_id(doc.id, {"pipeline_id": req["pipeline_id"]}) - reset_doc() - return get_json_result(data=True) - - if doc.parser_id.lower() == req["parser_id"].lower(): - if "parser_config" in req: - if req["parser_config"] == doc.parser_config: - return get_json_result(data=True) - else: - return get_json_result(data=True) - - if (doc.type == FileType.VISUAL and req["parser_id"] != "picture") or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation"): - return get_data_error_result(message="Not supported yet!") - if "parser_config" in req: - DocumentService.update_parser_config(doc.id, req["parser_config"]) - reset_doc() - return get_json_result(data=True) - except Exception as e: - return server_error_response(e) diff --git a/api/apps/restful_apis/document_api.py b/api/apps/restful_apis/document_api.py index f9687bfea5..3a3f3cd30f 100644 --- a/api/apps/restful_apis/document_api.py +++ b/api/apps/restful_apis/document_api.py @@ -24,10 +24,11 @@ from peewee import OperationalError from pydantic import ValidationError from api.apps import login_required -from api.apps.services.document_api_service import validate_document_update_fields, map_doc_keys, \ - map_doc_keys_with_run_status, update_document_name_only, update_chunk_method_only, update_document_status_only from api.constants import FILE_NAME_LEN_LIMIT, IMG_BASE64_PREFIX -from api.db import FileType, VALID_FILE_TYPES +from api.apps.services.document_api_service import validate_document_update_fields, map_doc_keys, \ + map_doc_keys_with_run_status, update_document_name_only, 
update_chunk_method, update_document_status_only, \ + reset_document_for_reparse +from api.db import VALID_FILE_TYPES, FileType from api.db.services import duplicate_name from api.db.services.doc_metadata_service import DocMetadataService from api.db.db_models import Task @@ -204,16 +205,26 @@ async def update_document(tenant_id, dataset_id, document_id): if error := update_document_name_only(document_id, req["name"]): return error + # "parser_id" provided but does not match with existing doc's file type + if "parser_id" in req and ((doc.type == FileType.VISUAL and req["parser_id"] != "picture") + or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")): + return get_data_error_result(message="Not supported yet!") + # parser config provided (already validated in UpdateDocumentReq), update it if update_doc_req.parser_config: + req["parser_config"].update(update_doc_req.parser_config.ext) DocumentService.update_parser_config(doc.id, req["parser_config"]) + # pipeline_id provided - reset document for reparse + if update_doc_req.pipeline_id: + if error := reset_document_for_reparse(doc, tenant_id, pipeline_id=update_doc_req.pipeline_id): + return error # chunk method provided - the update method will check if it's different with existing one - if update_doc_req.chunk_method: - if error := update_chunk_method_only(req, doc, dataset_id, tenant_id): + elif update_doc_req.chunk_method: + if error := update_chunk_method(req, doc, tenant_id): return error - if "enabled" in req: # already checked in UpdateDocumentReq - it's int if it's present + if "enabled" in req: # already checked in UpdateDocumentReq - it's int if present # "enabled" flag provided, the update method will check if it's changed and then update if so if error := update_document_status_only(int(req["enabled"]), doc, kb): return error diff --git a/api/apps/services/document_api_service.py b/api/apps/services/document_api_service.py index 82dfa37e35..59abbd2507 100644 --- 
a/api/apps/services/document_api_service.py +++ b/api/apps/services/document_api_service.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import logging + from api.db.services.document_service import DocumentService from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService @@ -58,7 +60,7 @@ def update_document_name_only(document_id, req_doc_name): ) return None -def update_chunk_method_only(req, doc, dataset_id, tenant_id): +def update_chunk_method(req, doc, tenant_id): """ Update chunk method only (without validation). @@ -69,28 +71,56 @@ def update_chunk_method_only(req, doc, dataset_id, tenant_id): Args: req: The request dictionary containing chunk_method and parser_config. doc: The document model from the database. - dataset_id: The ID of the dataset containing the document. tenant_id: The tenant ID for the document store. Returns: None if successful, or an error result dictionary if failed. """ if doc.parser_id.lower() != req["chunk_method"].lower(): - # if chunk method changed - e = DocumentService.update_by_id( - doc.id, - { - "parser_id": req["chunk_method"], - "progress": 0, - "progress_msg": "", - "run": TaskStatus.UNSTART.value, - }, - ) - if not e: - return get_error_data_result(message="Document not found!") + # if chunk method changed, reset document for reparse + result = reset_document_for_reparse(doc, tenant_id, parser_id=req["chunk_method"]) + if result: + return result if not req.get("parser_config"): req["parser_config"] = get_parser_config(req["chunk_method"], req.get("parser_config")) DocumentService.update_parser_config(doc.id, req["parser_config"]) + return None + + +def reset_document_for_reparse(doc, tenant_id, parser_id=None, pipeline_id=None): + """ + Reset document for reparsing. 
+ + Updates the parser_id and/or pipeline_id for a document, resets its progress, + clears existing chunks from the document store, and removes chunk images. + + Args: + doc: The document model from the database. + tenant_id: The tenant ID for the document store. + parser_id: Optional new parser_id (chunk method). If None, keeps existing. + pipeline_id: Optional new pipeline_id. If None, keeps existing. + + Returns: + None if successful, or an error result dictionary if failed. + """ + + # Build update fields + update_fields = { + "progress": 0, + "progress_msg": "", + "run": TaskStatus.UNSTART.value, + } + if parser_id is not None: + update_fields["parser_id"] = parser_id + if pipeline_id is not None: + update_fields["pipeline_id"] = pipeline_id + + # Update document + e = DocumentService.update_by_id(doc.id, update_fields) + if not e: + return get_error_data_result(message="Document not found!") + + # Delete chunks from document store if doc.token_num > 0: e = DocumentService.increment_chunk_num( doc.id, @@ -98,12 +128,20 @@ def update_chunk_method_only(req, doc, dataset_id, tenant_id): doc.token_num * -1, doc.chunk_num * -1, doc.process_duration * -1, - ) + ) if not e: return get_error_data_result(message="Document not found!") - settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), dataset_id) + settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id) + + # Delete chunk images + try: + DocumentService.delete_chunk_images(doc, tenant_id) + except Exception as e: + logging.error(f"error when delete chunk images:{e}") + return None + def update_document_status_only(status:int, doc, kb): """ Update document status only (without validation). 
diff --git a/api/utils/validation_utils.py b/api/utils/validation_utils.py index 4f3ed490d6..3c680aa50c 100644 --- a/api/utils/validation_utils.py +++ b/api/utils/validation_utils.py @@ -411,6 +411,7 @@ class UpdateDocumentReq(Base): model_config = ConfigDict(extra='ignore') name: Annotated[str | None, Field(default=None, max_length=65535)] chunk_method: Annotated[str | None, Field(default=None, max_length=65535)] + pipeline_id: Annotated[str | None, Field(default=None, max_length=65535)] enabled: Annotated[int | None, Field(default=None, ge=0, le=1)] chunk_count: Annotated[int | None, Field(default=None, ge=0)] token_count: Annotated[int | None, Field(default=None, ge=0)] diff --git a/test/testcases/test_web_api/test_common.py b/test/testcases/test_web_api/test_common.py index cfe9c1ce63..383dd1b918 100644 --- a/test/testcases/test_web_api/test_common.py +++ b/test/testcases/test_web_api/test_common.py @@ -451,6 +451,12 @@ def document_change_status(auth, dataset_id, payload=None, *, headers=HEADERS, d return res.json() +def document_update(auth, dataset_id, doc_id, payload=None, *, headers=HEADERS, data=None): + """Update document via PATCH /api/v1/datasets/{dataset_id}/documents/{doc_id}""" + res = requests.patch(url=f"{HOST_ADDRESS}{DATASETS_URL}/{dataset_id}/documents/{doc_id}", headers=headers, auth=auth, json=payload, data=data) + return res.json() + + def document_thumbnails(auth, params=None, *, headers=HEADERS, data=None): """Get document thumbnails. 
diff --git a/test/testcases/test_web_api/test_document_app/conftest.py b/test/testcases/test_web_api/test_document_app/conftest.py index 5af8d26277..78b5a5fdf8 100644 --- a/test/testcases/test_web_api/test_document_app/conftest.py +++ b/test/testcases/test_web_api/test_document_app/conftest.py @@ -204,8 +204,9 @@ def document_rest_api_module(monkeypatch): document_api_service_mod.map_doc_keys_with_run_status = _map_doc_keys_with_run_status document_api_service_mod.update_document_name_only = lambda *_args, **_kwargs: None - document_api_service_mod.update_chunk_method_only = lambda *_args, **_kwargs: None + document_api_service_mod.update_chunk_method = lambda *_args, **_kwargs: None document_api_service_mod.update_document_status_only = lambda *_args, **_kwargs: None + document_api_service_mod.reset_document_for_reparse = lambda *_args, **_kwargs: None monkeypatch.setitem(sys.modules, "api.apps.services.document_api_service", document_api_service_mod) module_path = repo_root / "api" / "apps" / "restful_apis" / "document_api.py" diff --git a/test/testcases/test_web_api/test_document_app/test_document_metadata.py b/test/testcases/test_web_api/test_document_app/test_document_metadata.py index 6e77983e9a..bb69ef9803 100644 --- a/test/testcases/test_web_api/test_document_app/test_document_metadata.py +++ b/test/testcases/test_web_api/test_document_app/test_document_metadata.py @@ -26,8 +26,10 @@ from test_common import ( document_update_metadata_setting, bulk_upload_documents, delete_document, + document_update, ) +from common.constants import RetCode from configs import INVALID_API_TOKEN from libs.auth import RAGFlowWebApiAuth @@ -155,6 +157,57 @@ class TestDocumentMetadata: assert info_res["data"]["docs"][0]["status"] == "1", info_res + @pytest.mark.p2 + def test_update_document_change_parser(self, WebApiAuth, add_document_func): + """Test updating document chunk_method via PATCH /api/v1/datasets/{dataset_id}/documents/{doc_id}.""" + dataset_id, doc_id = add_document_func + + # Get 
initial document info + res = document_infos(WebApiAuth, dataset_id, {"doc_ids": [doc_id]}) + + assert res["code"] == 0, res + original_parser_id = res["data"]["docs"][0].get("parser_id") + + res = document_update(WebApiAuth, dataset_id, doc_id, {"chunk_method": "invalid_chunk_method"}) + assert res["code"] == 102, res + assert res["message"] == "Field: - Message: <`chunk_method` invalid_chunk_method doesn't exist> - Value: ", res + + # Change to a different chunk method + # valid_chunk_method = {"naive", "manual", "qa", "table", "paper", "book", "laws", "presentation", "picture", "one", "knowledge_graph", "email", "tag"} + new_parser_id = "naive" + if original_parser_id == new_parser_id: + new_parser_id = "paper" + document_update(WebApiAuth, dataset_id, doc_id, {"chunk_method": new_parser_id}) + + # Verify the document was updated + res = document_infos(WebApiAuth, dataset_id, {"doc_ids": [doc_id]}) + + assert res["code"] == 0, res + assert res["data"]["docs"][0]["chunk_method"] == new_parser_id, res + + + @pytest.mark.p2 + def test_update_document_change_pipeline(self, WebApiAuth, add_document_func): + """Test updating document pipeline via PATCH /api/v1/datasets/{dataset_id}/documents/{doc_id}.""" + dataset_id, doc_id = add_document_func + + # Get initial document info + res = document_infos(WebApiAuth, dataset_id, {"doc_ids": [doc_id]}) + assert res["code"] == 0, res + original_pipeline_id = res["data"]["docs"][0].get("pipeline_id") + + # Change to a different pipeline (if available) + # Note: This test assumes there's at least one other pipeline available + new_pipeline_id = "general" if original_pipeline_id != "general" else "resume" + res = document_update(WebApiAuth, dataset_id, doc_id, {"pipeline_id": new_pipeline_id}) + assert res["code"] == 0, res + + # Verify the document was updated + res = document_infos(WebApiAuth, dataset_id, {"doc_ids": [doc_id]}) + assert res["code"] == 0, res + assert res["data"]["docs"][0]["pipeline_id"] == new_pipeline_id, res + + class 
TestDocumentMetadataNegative: @pytest.mark.p2 def test_filter_missing_kb_id(self, WebApiAuth, add_document_func): @@ -292,7 +345,7 @@ class TestDocumentMetadataUnit: module = document_app_module monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None)) res = _run(module.get("doc1")) - assert res["code"] == module.RetCode.DATA_ERROR + assert res["code"] == RetCode.DATA_ERROR assert "Document not found!" in res["message"] async def fake_thread_pool_exec(*_args, **_kwargs): @@ -356,164 +409,6 @@ class TestDocumentMetadataUnit: assert res["code"] == 500 assert "download boom" in res["message"] - def test_change_parser_guards_and_reset_update_failure_unit(self, document_app_module, monkeypatch): - module = document_app_module - - monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)}) - - async def req_auth_fail(): - return {"doc_id": "doc1", "parser_id": "naive", "pipeline_id": "pipe2"} - - monkeypatch.setattr(module, "get_request_json", req_auth_fail) - monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: False) - res = _run(module.change_parser.__wrapped__()) - assert res["code"] == module.RetCode.AUTHENTICATION_ERROR - - monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: True) - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None)) - res = _run(module.change_parser.__wrapped__()) - assert res["code"] == module.RetCode.DATA_ERROR - assert "Document not found!" 
in res["message"] - - async def req_same_pipeline(): - return {"doc_id": "doc1", "parser_id": "naive", "pipeline_id": "pipe1"} - - doc_same = SimpleNamespace( - id="doc1", - pipeline_id="pipe1", - parser_id="naive", - parser_config={"k": "v"}, - token_num=0, - chunk_num=0, - process_duration=0, - kb_id="kb1", - type="doc", - name="doc.txt", - ) - monkeypatch.setattr(module, "get_request_json", req_same_pipeline) - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, doc_same)) - res = _run(module.change_parser.__wrapped__()) - assert res["code"] == 0 - - calls = [] - - async def req_pipeline_change(): - return {"doc_id": "doc1", "parser_id": "naive", "pipeline_id": "pipe2"} - - doc = SimpleNamespace( - id="doc1", - pipeline_id="pipe1", - parser_id="naive", - parser_config={}, - token_num=0, - chunk_num=0, - process_duration=0, - kb_id="kb1", - type="doc", - name="doc.txt", - ) - - def fake_update_by_id(doc_id, payload): - calls.append((doc_id, payload)) - return True - - monkeypatch.setattr(module, "get_request_json", req_pipeline_change) - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, doc)) - monkeypatch.setattr(module.DocumentService, "update_by_id", fake_update_by_id) - res = _run(module.change_parser.__wrapped__()) - assert res["code"] == 0 - assert calls[0][1] == {"pipeline_id": "pipe2"} - assert calls[1][1]["run"] == module.TaskStatus.UNSTART.value - - doc.token_num = 3 - doc.chunk_num = 2 - doc.process_duration = 9 - monkeypatch.setattr(module.DocumentService, "increment_chunk_num", lambda *_args, **_kwargs: False) - res = _run(module.change_parser.__wrapped__()) - assert res["code"] == 0 - - monkeypatch.setattr(module.DocumentService, "increment_chunk_num", lambda *_args, **_kwargs: True) - monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: None) - res = _run(module.change_parser.__wrapped__()) - assert res["code"] == 0 - - side_effects = {"img": [], "delete": []} - - 
class _DocStore: - def index_exist(self, _idx, _kb_id): - return True - - def delete(self, where, _idx, kb_id): - side_effects["delete"].append((where["doc_id"], kb_id)) - - monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: "tenant1") - monkeypatch.setattr(module.DocumentService, "delete_chunk_images", lambda _doc, _tenant: side_effects["img"].append((_doc.id, _tenant))) - monkeypatch.setattr(module.search, "index_name", lambda tenant_id: f"idx_{tenant_id}") - monkeypatch.setattr(module.settings, "docStoreConn", _DocStore()) - res = _run(module.change_parser.__wrapped__()) - assert res["code"] == 0 - assert ("doc1", "tenant1") in side_effects["img"] - assert ("doc1", "kb1") in side_effects["delete"] - - async def req_same_parser_with_cfg(): - return {"doc_id": "doc1", "parser_id": "naive", "parser_config": {"a": 1}} - - doc_same_parser = SimpleNamespace( - id="doc1", - pipeline_id="pipe1", - parser_id="naive", - parser_config={"a": 1}, - token_num=0, - chunk_num=0, - process_duration=0, - kb_id="kb1", - type="doc", - name="doc.txt", - ) - monkeypatch.setattr(module, "get_request_json", req_same_parser_with_cfg) - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, doc_same_parser)) - res = _run(module.change_parser.__wrapped__()) - assert res["code"] == 0 - - async def req_same_parser_no_cfg(): - return {"doc_id": "doc1", "parser_id": "naive"} - - monkeypatch.setattr(module, "get_request_json", req_same_parser_no_cfg) - res = _run(module.change_parser.__wrapped__()) - assert res["code"] == 0 - - parser_cfg_updates = [] - - async def req_parser_update(): - return {"doc_id": "doc1", "parser_id": "paper", "pipeline_id": "", "parser_config": {"beta": True}} - - doc_parser_update = SimpleNamespace( - id="doc1", - pipeline_id="pipe1", - parser_id="naive", - parser_config={"alpha": 1}, - token_num=0, - chunk_num=0, - process_duration=0, - kb_id="kb1", - type="doc", - name="doc.txt", - ) - monkeypatch.setattr(module, 
"get_request_json", req_parser_update) - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, doc_parser_update)) - monkeypatch.setattr(module.DocumentService, "update_parser_config", lambda doc_id, cfg: parser_cfg_updates.append((doc_id, cfg))) - monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: True) - res = _run(module.change_parser.__wrapped__()) - assert res["code"] == 0 - assert parser_cfg_updates == [("doc1", {"beta": True})] - - def raise_parser_config(*_args, **_kwargs): - raise RuntimeError("parser boom") - - monkeypatch.setattr(module.DocumentService, "update_parser_config", raise_parser_config) - res = _run(module.change_parser.__wrapped__()) - assert res["code"] == 500 - assert "parser boom" in res["message"] @pytest.mark.skip(reason="Moved to /api/v1/documents/images/") def test_get_image_success_and_exception_unit(self, document_app_module, monkeypatch): diff --git a/web/src/hooks/parser-config-utils.ts b/web/src/hooks/parser-config-utils.ts new file mode 100644 index 0000000000..bc617cfde1 --- /dev/null +++ b/web/src/hooks/parser-config-utils.ts @@ -0,0 +1,90 @@ +/** + * Utility functions for extracting parser and raptor config extensions. + * These functions extract known fields from parser/raptor config objects + * and merge unknown fields into the `ext` field for flexible configuration. + */ + +/** + * Extracts Raptor configuration with extra fields merged into ext. 
+ * @param raptorConfig - The raptor configuration object + * @returns Processed raptor config with extra fields in ext + */ +export const extractRaptorConfigExt = ( + raptorConfig: Record | undefined, +) => { + if (!raptorConfig) return raptorConfig; + const { + use_raptor, + prompt, + max_token, + threshold, + max_cluster, + random_seed, + auto_disable_for_structured_data, + ext, + ...raptorExt + } = raptorConfig; + return { + use_raptor, + prompt, + max_token, + threshold, + max_cluster, + random_seed, + auto_disable_for_structured_data, + ext: { ...ext, ...raptorExt }, + }; +}; + +/** + * Extracts Parser configuration with extra fields merged into ext. + * @param parserConfig - The parser configuration object + * @returns Processed parser config with extra fields in ext + */ +export const extractParserConfigExt = ( + parserConfig: Record | undefined, +) => { + if (!parserConfig) return parserConfig; + const { + auto_keywords, + auto_questions, + chunk_token_num, + delimiter, + graphrag, + html4excel, + layout_recognize, + raptor, + tag_kb_ids, + topn_tags, + filename_embd_weight, + task_page_size, + pages, + children_delimiter, + use_parent_child, + enable_children, + ext, + ...parserExt + } = parserConfig; + return { + auto_keywords, + auto_questions, + chunk_token_num, + delimiter, + graphrag, + html4excel, + layout_recognize, + raptor: extractRaptorConfigExt(raptor), + tag_kb_ids, + topn_tags, + filename_embd_weight, + task_page_size, + pages, + parent_child: enable_children + ? { + children_delimiter, + use_parent_child: use_parent_child ?? 
enable_children, + } + : undefined, + ext: { ...ext, ...parserExt }, + }; +}; diff --git a/web/src/hooks/use-document-request.ts b/web/src/hooks/use-document-request.ts index 3ac6b9735f..65257ae56e 100644 --- a/web/src/hooks/use-document-request.ts +++ b/web/src/hooks/use-document-request.ts @@ -16,6 +16,7 @@ import { import i18n from '@/locales/config'; import { EMPTY_METADATA_FIELD } from '@/pages/dataset/dataset/use-select-filters'; import kbService, { + changeDocumentParser, changeDocumentsStatus, createDocument, deleteDocument, @@ -38,6 +39,7 @@ import { useGetPaginationWithRouter, useHandleSearchChange, } from './logic-hooks'; +import { extractParserConfigExt } from './parser-config-utils'; import { useGetKnowledgeSearchParams, useSetPaginationParams, @@ -393,19 +395,33 @@ export const useSetDocumentParser = () => { parserId, pipelineId, documentId, + datasetId, parserConfig, }: { parserId: string; pipelineId: string; documentId: string; - parserConfig: IChangeParserConfigRequestBody; + datasetId: string; + parserConfig?: IChangeParserConfigRequestBody; }) => { - const { data } = await kbService.documentChangeParser({ - parser_id: parserId, - pipeline_id: pipelineId, - doc_id: documentId, - parser_config: parserConfig, - }); + // Build update payload + const updateData: Record = {}; + if (parserId) { + updateData.chunk_method = parserId; + } + if (pipelineId) { + updateData.pipeline_id = pipelineId; + } + + if (parserConfig) { + updateData.parser_config = extractParserConfigExt(parserConfig); + } + + const { data } = await changeDocumentParser( + datasetId, + documentId, + updateData, + ); if (data.code === 0) { queryClient.invalidateQueries({ queryKey: [DocumentApiAction.FetchDocumentList], diff --git a/web/src/hooks/use-knowledge-request.ts b/web/src/hooks/use-knowledge-request.ts index 853f3750a5..2c7040d565 100644 --- a/web/src/hooks/use-knowledge-request.ts +++ b/web/src/hooks/use-knowledge-request.ts @@ -37,10 +37,10 @@ import { 
useGetPaginationWithRouter, useHandleSearchChange, } from './logic-hooks'; +import { extractParserConfigExt } from './parser-config-utils'; import { useSetPaginationParams } from './route-hook'; export const enum KnowledgeApiAction { - TestRetrieval = 'testRetrieval', FetchKnowledgeListByPage = 'fetchKnowledgeListByPage', CreateKnowledge = 'createKnowledge', DeleteKnowledge = 'deleteKnowledge', @@ -258,81 +258,6 @@ export const useUpdateKnowledge = (shouldFetchList = false) => { const knowledgeBaseId = useKnowledgeBaseId(); const queryClient = useQueryClient(); - const extractRaptorConfigExt = ( - raptorConfig: Record | undefined, - ) => { - if (!raptorConfig) return raptorConfig; - const { - use_raptor, - prompt, - max_token, - threshold, - max_cluster, - random_seed, - auto_disable_for_structured_data, - ext, - ...raptorExt - } = raptorConfig; - return { - use_raptor, - prompt, - max_token, - threshold, - max_cluster, - random_seed, - auto_disable_for_structured_data, - ext: { ...ext, ...raptorExt }, - }; - }; - - const extractParserConfigExt = ( - parserConfig: Record | undefined, - ) => { - if (!parserConfig) return parserConfig; - const { - auto_keywords, - auto_questions, - chunk_token_num, - delimiter, - graphrag, - html4excel, - layout_recognize, - raptor, - tag_kb_ids, - topn_tags, - filename_embd_weight, - task_page_size, - pages, - children_delimiter, - use_parent_child, - enable_children, - ext, - ...parserExt - } = parserConfig; - return { - auto_keywords, - auto_questions, - chunk_token_num, - delimiter, - graphrag, - html4excel, - layout_recognize, - raptor: extractRaptorConfigExt(raptor), - tag_kb_ids, - topn_tags, - filename_embd_weight, - task_page_size, - pages, - parent_child: enable_children - ? { - children_delimiter, - use_parent_child: use_parent_child ?? 
enable_children, - } - : undefined, - ext: { ...ext, ...parserExt }, - }; - }; - const { data, isPending: loading, @@ -376,6 +301,7 @@ export const useUpdateKnowledge = (shouldFetchList = false) => { parser_config: extractParserConfigExt(parser_config), ...omit(ext, ['kb_id']), }; + const { data = {} } = await updateKb(kbId, requestBody); if (data.code === 0) { message.success(i18n.t(`message.updated`)); diff --git a/web/src/interfaces/request/document.ts b/web/src/interfaces/request/document.ts index f0e693207d..4f16b155d2 100644 --- a/web/src/interfaces/request/document.ts +++ b/web/src/interfaces/request/document.ts @@ -11,6 +11,18 @@ export interface IChangeParserConfigRequestBody { image_table_context_window?: number; image_context_size?: number; table_context_size?: number; + // Metadata fields + metadata?: Array<{ + key?: string; + description?: string; + enum?: string[]; + }>; + built_in_metadata?: Array<{ + key?: string; + description?: string; + enum?: string[]; + }>; + enable_metadata?: boolean; } export interface IChangeParserRequestBody { diff --git a/web/src/pages/dataset/dataset/use-change-document-parser.ts b/web/src/pages/dataset/dataset/use-change-document-parser.ts index 0457fad84c..cfa358cc10 100644 --- a/web/src/pages/dataset/dataset/use-change-document-parser.ts +++ b/web/src/pages/dataset/dataset/use-change-document-parser.ts @@ -16,11 +16,12 @@ export const useChangeDocumentParser = () => { const onChangeParserOk = useCallback( async (parserConfigInfo: IChangeParserRequestBody) => { - if (record?.id) { + if (record?.id && record?.dataset_id) { const ret = await setDocumentParser({ parserId: parserConfigInfo.parser_id, pipelineId: parserConfigInfo.pipeline_id, documentId: record?.id, + datasetId: record?.dataset_id, parserConfig: parserConfigInfo.parser_config, }); if (ret === 0) { @@ -28,7 +29,7 @@ export const useChangeDocumentParser = () => { } } }, - [record?.id, setDocumentParser, hideChangeParserModal], + [record?.id, 
record?.dataset_id, setDocumentParser, hideChangeParserModal], ); const handleShowChangeParserModal = useCallback( diff --git a/web/src/services/knowledge-service.ts b/web/src/services/knowledge-service.ts index 4e570f8676..2397a72563 100644 --- a/web/src/services/knowledge-service.ts +++ b/web/src/services/knowledge-service.ts @@ -328,6 +328,12 @@ export const renameDocument = ( data: { name?: string }, ) => request.patch(api.documentRename(datasetId, documentId), { data }); +export const changeDocumentParser = ( + datasetId: string, + documentId: string, + data: { name?: string }, +) => request.patch(api.documentChangeParser(datasetId, documentId), { data }); + export const deleteDocument = (datasetId: string, documentIds: string[]) => request.delete(api.documentDelete(datasetId), { data: { ids: documentIds } }); diff --git a/web/src/utils/api.ts b/web/src/utils/api.ts index c2f19d97e5..0220bfa220 100644 --- a/web/src/utils/api.ts +++ b/web/src/utils/api.ts @@ -122,7 +122,8 @@ export default { documentIngest: `${restAPIv1}/documents/ingest`, documentCreate: (datasetId: string) => `${restAPIv1}/datasets/${datasetId}/documents?type=empty`, - documentChangeParser: `${webAPI}/document/change_parser`, + documentChangeParser: (datasetId: string, documentId: string) => + `${restAPIv1}/datasets/${datasetId}/documents/${documentId}`, documentThumbnails: `${restAPIv1}/thumbnails`, getDocumentFile: `${webAPI}/document/get`, getDocumentFileDownload: (docId: string) =>