From d43aebe70184e867c83a27b1731f3f59a8dd1d49 Mon Sep 17 00:00:00 2001 From: PandaMan Date: Thu, 26 Feb 2026 10:25:48 +0800 Subject: [PATCH] Fix/13142 auto metadata (#13217) ### What problem does this PR solve? Close #13142 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/apps/sdk/dataset.py | 115 ++++++++++++++++ api/utils/validation_utils.py | 18 +++ sdk/python/ragflow_sdk/ragflow.py | 25 +++- .../test_auto_metadata.py | 126 ++++++++++++++++++ 4 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 test/testcases/test_sdk_api/test_dataset_mangement/test_auto_metadata.py diff --git a/api/apps/sdk/dataset.py b/api/apps/sdk/dataset.py index d0d7ff0c66..6538d3a336 100644 --- a/api/apps/sdk/dataset.py +++ b/api/apps/sdk/dataset.py @@ -40,6 +40,7 @@ from api.utils.api_utils import ( verify_embedding_availability, ) from api.utils.validation_utils import ( + AutoMetadataConfig, CreateDatasetReq, DeleteDatasetReq, ListDatasetReq, @@ -119,6 +120,24 @@ async def create(tenant_id): req, err = await validate_and_parse_json_request(request, CreateDatasetReq) if err is not None: return get_error_argument_result(err) + # Map auto_metadata_config (if provided) into parser_config structure + auto_meta = req.pop("auto_metadata_config", None) + if auto_meta: + parser_cfg = req.get("parser_config") or {} + fields = [] + for f in auto_meta.get("fields", []): + fields.append( + { + "name": f.get("name", ""), + "type": f.get("type", ""), + "description": f.get("description"), + "examples": f.get("examples"), + "restrict_values": f.get("restrict_values", False), + } + ) + parser_cfg["metadata"] = fields + parser_cfg["enable_metadata"] = auto_meta.get("enabled", True) + req["parser_config"] = parser_cfg e, req = KnowledgebaseService.create_with_name( name = req.pop("name", None), tenant_id = tenant_id, @@ -341,6 +360,25 @@ async def update(tenant_id, dataset_id): return get_error_permission_result( message=f"User '{tenant_id}' lacks 
permission for dataset '{dataset_id}'") + # Map auto_metadata_config into parser_config if present + auto_meta = req.pop("auto_metadata_config", None) + if auto_meta: + parser_cfg = req.get("parser_config") or {} + fields = [] + for f in auto_meta.get("fields", []): + fields.append( + { + "name": f.get("name", ""), + "type": f.get("type", ""), + "description": f.get("description"), + "examples": f.get("examples"), + "restrict_values": f.get("restrict_values", False), + } + ) + parser_cfg["metadata"] = fields + parser_cfg["enable_metadata"] = auto_meta.get("enabled", True) + req["parser_config"] = parser_cfg + if req.get("parser_config"): req["parser_config"] = deep_merge(kb.parser_config, req["parser_config"]) @@ -488,6 +526,83 @@ def list_datasets(tenant_id): return get_error_data_result(message="Database operation failed") +@manager.route("/datasets/<dataset_id>/auto_metadata", methods=["GET"]) # noqa: F821 +@token_required +def get_auto_metadata(tenant_id, dataset_id): + """ + Get auto-metadata configuration for a dataset. 
+ """ + try: + kb = KnowledgebaseService.get_or_none(id=dataset_id, tenant_id=tenant_id) + if kb is None: + return get_error_permission_result( + message=f"User '{tenant_id}' lacks permission for dataset '{dataset_id}'" + ) + + parser_cfg = kb.parser_config or {} + metadata = parser_cfg.get("metadata") or [] + enabled = parser_cfg.get("enable_metadata", bool(metadata)) + # Normalize to AutoMetadataConfig-like JSON + fields = [] + for f in metadata: + if not isinstance(f, dict): + continue + fields.append( + { + "name": f.get("name", ""), + "type": f.get("type", ""), + "description": f.get("description"), + "examples": f.get("examples"), + "restrict_values": f.get("restrict_values", False), + } + ) + return get_result(data={"enabled": enabled, "fields": fields}) + except OperationalError as e: + logging.exception(e) + return get_error_data_result(message="Database operation failed") + + +@manager.route("/datasets/<dataset_id>/auto_metadata", methods=["PUT"]) # noqa: F821 +@token_required +async def update_auto_metadata(tenant_id, dataset_id): + """ + Update auto-metadata configuration for a dataset. 
+ """ + cfg, err = await validate_and_parse_json_request(request, AutoMetadataConfig) + if err is not None: + return get_error_argument_result(err) + + try: + kb = KnowledgebaseService.get_or_none(id=dataset_id, tenant_id=tenant_id) + if kb is None: + return get_error_permission_result( + message=f"User '{tenant_id}' lacks permission for dataset '{dataset_id}'" + ) + + parser_cfg = kb.parser_config or {} + fields = [] + for f in cfg.get("fields", []): + fields.append( + { + "name": f.get("name", ""), + "type": f.get("type", ""), + "description": f.get("description"), + "examples": f.get("examples"), + "restrict_values": f.get("restrict_values", False), + } + ) + parser_cfg["metadata"] = fields + parser_cfg["enable_metadata"] = cfg.get("enabled", True) + + if not KnowledgebaseService.update_by_id(kb.id, {"parser_config": parser_cfg}): + return get_error_data_result(message="Update auto-metadata error.(Database error)") + + return get_result(data={"enabled": parser_cfg["enable_metadata"], "fields": fields}) + except OperationalError as e: + logging.exception(e) + return get_error_data_result(message="Database operation failed") + + +@manager.route('/datasets/<dataset_id>/knowledge_graph', methods=['GET']) # noqa: F821 @token_required async def knowledge_graph(tenant_id, dataset_id): diff --git a/api/utils/validation_utils.py b/api/utils/validation_utils.py index d6178e641f..9e0b39aae2 100644 --- a/api/utils/validation_utils.py +++ b/api/utils/validation_utils.py @@ -344,6 +344,23 @@ class GraphragConfig(Base): resolution: Annotated[bool, Field(default=False)] +class AutoMetadataField(Base): + """Schema for a single auto-metadata field configuration.""" + + name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=255), Field(...)] + type: Annotated[Literal["string", "list", "time"], Field(...)] + description: Annotated[str | None, Field(default=None, max_length=65535)] + examples: Annotated[list[str] | None, Field(default=None)] + restrict_values: 
Annotated[bool, Field(default=False)] + + +class AutoMetadataConfig(Base): + """Top-level auto-metadata configuration attached to a dataset.""" + + enabled: Annotated[bool, Field(default=True)] + fields: Annotated[list[AutoMetadataField], Field(default_factory=list)] + + class ParserConfig(Base): auto_keywords: Annotated[int, Field(default=0, ge=0, le=32)] auto_questions: Annotated[int, Field(default=0, ge=0, le=10)] @@ -370,6 +387,7 @@ class CreateDatasetReq(Base): parse_type: Annotated[int | None, Field(default=None, ge=0, le=64)] pipeline_id: Annotated[str | None, Field(default=None, min_length=32, max_length=32, serialization_alias="pipeline_id")] parser_config: Annotated[ParserConfig | None, Field(default=None)] + auto_metadata_config: Annotated[AutoMetadataConfig | None, Field(default=None)] @field_validator("avatar", mode="after") @classmethod diff --git a/sdk/python/ragflow_sdk/ragflow.py b/sdk/python/ragflow_sdk/ragflow.py index 7d2bd31ee3..764bf8d7ec 100644 --- a/sdk/python/ragflow_sdk/ragflow.py +++ b/sdk/python/ragflow_sdk/ragflow.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Optional, Any import requests @@ -58,6 +58,7 @@ class RAGFlow: permission: str = "me", chunk_method: str = "naive", parser_config: Optional[DataSet.ParserConfig] = None, + auto_metadata_config: Optional[dict[str, Any]] = None, ) -> DataSet: payload = { "name": name, @@ -69,6 +70,8 @@ class RAGFlow: } if parser_config is not None: payload["parser_config"] = parser_config.to_json() + if auto_metadata_config is not None: + payload["auto_metadata_config"] = auto_metadata_config res = self.post("/datasets", payload) res = res.json() @@ -108,6 +111,26 @@ class RAGFlow: return result_list raise Exception(res["message"]) + def get_auto_metadata(self, dataset_id: str) -> dict[str, Any]: + """ + Retrieve auto-metadata configuration for a dataset via SDK. 
+ """ + res = self.get(f"/datasets/{dataset_id}/auto_metadata") + res = res.json() + if res.get("code") == 0: + return res["data"] + raise Exception(res["message"]) + + def update_auto_metadata(self, dataset_id: str, **config: Any) -> dict[str, Any]: + """ + Update auto-metadata configuration for a dataset via SDK. + """ + res = self.put(f"/datasets/{dataset_id}/auto_metadata", config) + res = res.json() + if res.get("code") == 0: + return res["data"] + raise Exception(res["message"]) + def create_chat(self, name: str, avatar: str = "", dataset_ids=None, llm: Chat.LLM | None = None, prompt: Chat.Prompt | None = None) -> Chat: if dataset_ids is None: dataset_ids = [] diff --git a/test/testcases/test_sdk_api/test_dataset_mangement/test_auto_metadata.py b/test/testcases/test_sdk_api/test_dataset_mangement/test_auto_metadata.py new file mode 100644 index 0000000000..2d2dd92461 --- /dev/null +++ b/test/testcases/test_sdk_api/test_dataset_mangement/test_auto_metadata.py @@ -0,0 +1,126 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import pytest + + +@pytest.mark.usefixtures("clear_datasets") +class TestAutoMetadataOnCreate: + @pytest.mark.p1 + def test_create_dataset_with_auto_metadata(self, client): + payload = { + "name": "auto_metadata_create", + "auto_metadata_config": { + "enabled": True, + "fields": [ + { + "name": "author", + "type": "string", + "description": "The author of the document", + "examples": ["John Doe", "Jane Smith"], + "restrict_values": False, + }, + { + "name": "category", + "type": "list", + "description": "Document category", + "examples": ["Technical", "Business"], + "restrict_values": True, + }, + ], + }, + } + dataset = client.create_dataset(**payload) + # The SDK should expose parser_config via internal properties or metadata; + # we rely on the HTTP API for verification via get_auto_metadata. + cfg = client.get_auto_metadata(dataset_id=dataset.id) + assert cfg["enabled"] is True + assert len(cfg["fields"]) == 2 + names = {f["name"] for f in cfg["fields"]} + assert names == {"author", "category"} + + +@pytest.mark.usefixtures("clear_datasets") +class TestAutoMetadataOnUpdate: + @pytest.mark.p1 + def test_update_auto_metadata_via_dataset_update(self, client, add_dataset_func): + dataset = add_dataset_func + + # Initially set auto-metadata via dataset.update + payload = { + "auto_metadata_config": { + "enabled": True, + "fields": [ + { + "name": "tags", + "type": "list", + "description": "Document tags", + "examples": ["AI", "ML", "RAG"], + "restrict_values": False, + } + ], + } + } + dataset.update(payload) + + cfg = client.get_auto_metadata(dataset_id=dataset.id) + assert cfg["enabled"] is True + assert len(cfg["fields"]) == 1 + assert cfg["fields"][0]["name"] == "tags" + assert cfg["fields"][0]["type"] == "list" + + # Disable auto-metadata and replace fields + update_cfg = { + "enabled": False, + "fields": [ + { + "name": "year", + "type": "time", + "description": "Publication year", + "examples": None, + "restrict_values": False, + } + ], + } + 
client.update_auto_metadata(dataset_id=dataset.id, **update_cfg) + + cfg2 = client.get_auto_metadata(dataset_id=dataset.id) + assert cfg2["enabled"] is False + assert len(cfg2["fields"]) == 1 + assert cfg2["fields"][0]["name"] == "year" + assert cfg2["fields"][0]["type"] == "time" + + +@pytest.mark.usefixtures("clear_datasets") +class TestAutoMetadataValidation: + @pytest.mark.p2 + def test_invalid_field_type_rejected(self, client): + payload = { + "name": "auto_metadata_invalid_type", + "auto_metadata_config": { + "enabled": True, + "fields": [ + { + "name": "invalid_type", + "type": "unknown", # invalid literal + } + ], + }, + } + with pytest.raises(Exception) as exc_info: + client.create_dataset(**payload) + msg = str(exc_info.value) + # Pydantic literal_error message should appear + assert "Input should be" in msg or "literal_error" in msg +