mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
Fix/13142 auto metadata (#13217)
### What problem does this PR solve?

Close #13142

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@@ -40,6 +40,7 @@ from api.utils.api_utils import (
|
||||
verify_embedding_availability,
|
||||
)
|
||||
from api.utils.validation_utils import (
|
||||
AutoMetadataConfig,
|
||||
CreateDatasetReq,
|
||||
DeleteDatasetReq,
|
||||
ListDatasetReq,
|
||||
@@ -119,6 +120,24 @@ async def create(tenant_id):
|
||||
req, err = await validate_and_parse_json_request(request, CreateDatasetReq)
|
||||
if err is not None:
|
||||
return get_error_argument_result(err)
|
||||
# Map auto_metadata_config (if provided) into parser_config structure
|
||||
auto_meta = req.pop("auto_metadata_config", None)
|
||||
if auto_meta:
|
||||
parser_cfg = req.get("parser_config") or {}
|
||||
fields = []
|
||||
for f in auto_meta.get("fields", []):
|
||||
fields.append(
|
||||
{
|
||||
"name": f.get("name", ""),
|
||||
"type": f.get("type", ""),
|
||||
"description": f.get("description"),
|
||||
"examples": f.get("examples"),
|
||||
"restrict_values": f.get("restrict_values", False),
|
||||
}
|
||||
)
|
||||
parser_cfg["metadata"] = fields
|
||||
parser_cfg["enable_metadata"] = auto_meta.get("enabled", True)
|
||||
req["parser_config"] = parser_cfg
|
||||
e, req = KnowledgebaseService.create_with_name(
|
||||
name = req.pop("name", None),
|
||||
tenant_id = tenant_id,
|
||||
@@ -341,6 +360,25 @@ async def update(tenant_id, dataset_id):
|
||||
return get_error_permission_result(
|
||||
message=f"User '{tenant_id}' lacks permission for dataset '{dataset_id}'")
|
||||
|
||||
# Map auto_metadata_config into parser_config if present
|
||||
auto_meta = req.pop("auto_metadata_config", None)
|
||||
if auto_meta:
|
||||
parser_cfg = req.get("parser_config") or {}
|
||||
fields = []
|
||||
for f in auto_meta.get("fields", []):
|
||||
fields.append(
|
||||
{
|
||||
"name": f.get("name", ""),
|
||||
"type": f.get("type", ""),
|
||||
"description": f.get("description"),
|
||||
"examples": f.get("examples"),
|
||||
"restrict_values": f.get("restrict_values", False),
|
||||
}
|
||||
)
|
||||
parser_cfg["metadata"] = fields
|
||||
parser_cfg["enable_metadata"] = auto_meta.get("enabled", True)
|
||||
req["parser_config"] = parser_cfg
|
||||
|
||||
if req.get("parser_config"):
|
||||
req["parser_config"] = deep_merge(kb.parser_config, req["parser_config"])
|
||||
|
||||
@@ -488,6 +526,83 @@ def list_datasets(tenant_id):
|
||||
return get_error_data_result(message="Database operation failed")
|
||||
|
||||
|
||||
@manager.route("/datasets/<dataset_id>/auto_metadata", methods=["GET"])  # noqa: F821
@token_required
def get_auto_metadata(tenant_id, dataset_id):
    """
    Return the auto-metadata configuration stored on a dataset.

    The configuration is read from the dataset's ``parser_config``: the
    ``metadata`` entry supplies the field list and ``enable_metadata`` the
    on/off flag. When the flag is not stored, it falls back to whether any
    metadata entries exist.

    Args:
        tenant_id: Tenant the dataset must belong to.
        dataset_id: Dataset identifier taken from the URL.

    Returns:
        A ``get_result`` response whose data is
        ``{"enabled": ..., "fields": [...]}``; a permission error response
        when no dataset matches the id/tenant pair; a data error response
        when the lookup raises ``OperationalError``.
    """
    try:
        dataset = KnowledgebaseService.get_or_none(id=dataset_id, tenant_id=tenant_id)
        if dataset is None:
            return get_error_permission_result(
                message=f"User '{tenant_id}' lacks permission for dataset '{dataset_id}'"
            )

        stored_cfg = dataset.parser_config or {}
        raw_fields = stored_cfg.get("metadata") or []
        is_enabled = stored_cfg.get("enable_metadata", bool(raw_fields))

        # Shape every stored entry like an AutoMetadataConfig field; entries
        # that are not dicts are dropped.
        normalized = [
            {
                "name": entry.get("name", ""),
                "type": entry.get("type", ""),
                "description": entry.get("description"),
                "examples": entry.get("examples"),
                "restrict_values": entry.get("restrict_values", False),
            }
            for entry in raw_fields
            if isinstance(entry, dict)
        ]
        return get_result(data={"enabled": is_enabled, "fields": normalized})
    except OperationalError as e:
        logging.exception(e)
        return get_error_data_result(message="Database operation failed")
|
||||
|
||||
|
||||
@manager.route("/datasets/<dataset_id>/auto_metadata", methods=["PUT"])  # noqa: F821
@token_required
async def update_auto_metadata(tenant_id, dataset_id):
    """
    Replace the auto-metadata configuration of a dataset.

    The request body is validated against ``AutoMetadataConfig``. The
    validated field list overwrites ``parser_config["metadata"]`` and the
    ``enabled`` flag overwrites ``parser_config["enable_metadata"]``; the
    other keys of the stored ``parser_config`` are kept.

    Args:
        tenant_id: Tenant the dataset must belong to.
        dataset_id: Dataset identifier taken from the URL.

    Returns:
        A ``get_result`` response echoing the saved
        ``{"enabled": ..., "fields": [...]}``; an argument error response
        when validation fails; a permission error response when no dataset
        matches; a data error response when the update fails or raises
        ``OperationalError``.
    """
    payload, error = await validate_and_parse_json_request(request, AutoMetadataConfig)
    if error is not None:
        return get_error_argument_result(error)

    try:
        dataset = KnowledgebaseService.get_or_none(id=dataset_id, tenant_id=tenant_id)
        if dataset is None:
            return get_error_permission_result(
                message=f"User '{tenant_id}' lacks permission for dataset '{dataset_id}'"
            )

        merged_cfg = dataset.parser_config or {}
        normalized = [
            {
                "name": spec.get("name", ""),
                "type": spec.get("type", ""),
                "description": spec.get("description"),
                "examples": spec.get("examples"),
                "restrict_values": spec.get("restrict_values", False),
            }
            for spec in payload.get("fields", [])
        ]
        merged_cfg["metadata"] = normalized
        merged_cfg["enable_metadata"] = payload.get("enabled", True)

        saved = KnowledgebaseService.update_by_id(dataset.id, {"parser_config": merged_cfg})
        if not saved:
            return get_error_data_result(message="Update auto-metadata error.(Database error)")

        return get_result(data={"enabled": merged_cfg["enable_metadata"], "fields": normalized})
    except OperationalError as e:
        logging.exception(e)
        return get_error_data_result(message="Database operation failed")
|
||||
|
||||
|
||||
@manager.route('/datasets/<dataset_id>/knowledge_graph', methods=['GET']) # noqa: F821
|
||||
@token_required
|
||||
async def knowledge_graph(tenant_id, dataset_id):
|
||||
|
||||
@@ -344,6 +344,23 @@ class GraphragConfig(Base):
|
||||
resolution: Annotated[bool, Field(default=False)]
|
||||
|
||||
|
||||
class AutoMetadataField(Base):
    """Schema for a single auto-metadata field configuration."""

    # Required field name; surrounding whitespace is stripped and the result
    # must be 1-255 characters long.
    name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=255), Field(...)]
    # Required value kind; only "string", "list" or "time" are accepted.
    type: Annotated[Literal["string", "list", "time"], Field(...)]
    # Optional free-text description, at most 65535 characters.
    description: Annotated[str | None, Field(default=None, max_length=65535)]
    # Optional list of example values (strings).
    examples: Annotated[list[str] | None, Field(default=None)]
    # Defaults to False.
    # NOTE(review): presumably restricts extracted values to `examples` - confirm against the extraction code.
    restrict_values: Annotated[bool, Field(default=False)]
|
||||
|
||||
|
||||
class AutoMetadataConfig(Base):
    """Top-level auto-metadata configuration attached to a dataset."""

    # On/off flag for auto-metadata; defaults to True when omitted.
    enabled: Annotated[bool, Field(default=True)]
    # Field definitions; defaults to a fresh empty list when omitted.
    fields: Annotated[list[AutoMetadataField], Field(default_factory=list)]
|
||||
|
||||
|
||||
class ParserConfig(Base):
|
||||
auto_keywords: Annotated[int, Field(default=0, ge=0, le=32)]
|
||||
auto_questions: Annotated[int, Field(default=0, ge=0, le=10)]
|
||||
@@ -370,6 +387,7 @@ class CreateDatasetReq(Base):
|
||||
parse_type: Annotated[int | None, Field(default=None, ge=0, le=64)]
|
||||
pipeline_id: Annotated[str | None, Field(default=None, min_length=32, max_length=32, serialization_alias="pipeline_id")]
|
||||
parser_config: Annotated[ParserConfig | None, Field(default=None)]
|
||||
auto_metadata_config: Annotated[AutoMetadataConfig | None, Field(default=None)]
|
||||
|
||||
@field_validator("avatar", mode="after")
|
||||
@classmethod
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional
|
||||
from typing import Optional, Any
|
||||
|
||||
import requests
|
||||
|
||||
@@ -58,6 +58,7 @@ class RAGFlow:
|
||||
permission: str = "me",
|
||||
chunk_method: str = "naive",
|
||||
parser_config: Optional[DataSet.ParserConfig] = None,
|
||||
auto_metadata_config: Optional[dict[str, Any]] = None,
|
||||
) -> DataSet:
|
||||
payload = {
|
||||
"name": name,
|
||||
@@ -69,6 +70,8 @@ class RAGFlow:
|
||||
}
|
||||
if parser_config is not None:
|
||||
payload["parser_config"] = parser_config.to_json()
|
||||
if auto_metadata_config is not None:
|
||||
payload["auto_metadata_config"] = auto_metadata_config
|
||||
|
||||
res = self.post("/datasets", payload)
|
||||
res = res.json()
|
||||
@@ -108,6 +111,26 @@ class RAGFlow:
|
||||
return result_list
|
||||
raise Exception(res["message"])
|
||||
|
||||
def get_auto_metadata(self, dataset_id: str) -> dict[str, Any]:
    """
    Fetch the auto-metadata configuration of a dataset.

    Issues ``GET /datasets/{dataset_id}/auto_metadata`` and unwraps the
    JSON envelope.

    Args:
        dataset_id: Identifier of the dataset to query.

    Returns:
        The ``data`` member of the response.

    Raises:
        Exception: carrying the response ``message`` when ``code`` is not 0.
    """
    response = self.get(f"/datasets/{dataset_id}/auto_metadata")
    body = response.json()
    if body.get("code") != 0:
        raise Exception(body["message"])
    return body["data"]
|
||||
|
||||
def update_auto_metadata(self, dataset_id: str, **config: Any) -> dict[str, Any]:
    """
    Send a new auto-metadata configuration for a dataset.

    Issues ``PUT /datasets/{dataset_id}/auto_metadata`` with the keyword
    arguments as the request payload and unwraps the JSON envelope.

    Args:
        dataset_id: Identifier of the dataset to update.
        **config: Configuration entries forwarded unchanged as the payload.

    Returns:
        The ``data`` member of the response.

    Raises:
        Exception: carrying the response ``message`` when ``code`` is not 0.
    """
    response = self.put(f"/datasets/{dataset_id}/auto_metadata", config)
    body = response.json()
    if body.get("code") != 0:
        raise Exception(body["message"])
    return body["data"]
|
||||
|
||||
def create_chat(self, name: str, avatar: str = "", dataset_ids=None, llm: Chat.LLM | None = None, prompt: Chat.Prompt | None = None) -> Chat:
|
||||
if dataset_ids is None:
|
||||
dataset_ids = []
|
||||
|
||||
@@ -0,0 +1,126 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("clear_datasets")
class TestAutoMetadataOnCreate:
    """Auto-metadata passed at dataset creation can be read back afterwards."""

    @pytest.mark.p1
    def test_create_dataset_with_auto_metadata(self, client):
        author_field = {
            "name": "author",
            "type": "string",
            "description": "The author of the document",
            "examples": ["John Doe", "Jane Smith"],
            "restrict_values": False,
        }
        category_field = {
            "name": "category",
            "type": "list",
            "description": "Document category",
            "examples": ["Technical", "Business"],
            "restrict_values": True,
        }

        dataset = client.create_dataset(
            name="auto_metadata_create",
            auto_metadata_config={
                "enabled": True,
                "fields": [author_field, category_field],
            },
        )

        # Verification goes through the auto_metadata endpoint rather than
        # through attributes of the returned dataset object.
        stored = client.get_auto_metadata(dataset_id=dataset.id)
        assert stored["enabled"] is True
        assert len(stored["fields"]) == 2
        assert {field["name"] for field in stored["fields"]} == {"author", "category"}
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("clear_datasets")
class TestAutoMetadataOnUpdate:
    """Auto-metadata can be set via dataset.update and replaced via the SDK."""

    @pytest.mark.p1
    def test_update_auto_metadata_via_dataset_update(self, client, add_dataset_func):
        dataset = add_dataset_func

        # Step 1: set the configuration through the generic dataset update.
        tags_field = {
            "name": "tags",
            "type": "list",
            "description": "Document tags",
            "examples": ["AI", "ML", "RAG"],
            "restrict_values": False,
        }
        dataset.update(
            {
                "auto_metadata_config": {
                    "enabled": True,
                    "fields": [tags_field],
                }
            }
        )

        after_update = client.get_auto_metadata(dataset_id=dataset.id)
        assert after_update["enabled"] is True
        assert len(after_update["fields"]) == 1
        first = after_update["fields"][0]
        assert first["name"] == "tags"
        assert first["type"] == "list"

        # Step 2: disable auto-metadata and swap the field list through the
        # dedicated endpoint.
        year_field = {
            "name": "year",
            "type": "time",
            "description": "Publication year",
            "examples": None,
            "restrict_values": False,
        }
        client.update_auto_metadata(
            dataset_id=dataset.id,
            enabled=False,
            fields=[year_field],
        )

        after_replace = client.get_auto_metadata(dataset_id=dataset.id)
        assert after_replace["enabled"] is False
        assert len(after_replace["fields"]) == 1
        replaced = after_replace["fields"][0]
        assert replaced["name"] == "year"
        assert replaced["type"] == "time"
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("clear_datasets")
class TestAutoMetadataValidation:
    """Invalid auto-metadata definitions are rejected at dataset creation."""

    @pytest.mark.p2
    def test_invalid_field_type_rejected(self, client):
        bad_field = {
            "name": "invalid_type",
            "type": "unknown",  # not one of the accepted literals
        }

        with pytest.raises(Exception) as exc_info:
            client.create_dataset(
                name="auto_metadata_invalid_type",
                auto_metadata_config={"enabled": True, "fields": [bad_field]},
            )

        # The error text is expected to carry the Pydantic literal_error message.
        message = str(exc_info.value)
        assert "Input should be" in message or "literal_error" in message
|
||||
|
||||
Reference in New Issue
Block a user