Files
ragflow/test/unit_test/rag/svr/test_table_metadata_aggregation.py
euvre d9a04ef702 fix: support auto mode in table parser document metadata aggregation (#15780)
### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (the default), all
columns default to the `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` is written to the KB
`parser_config` during `chunk()`, but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-06-08 19:08:23 +08:00

265 lines
9.8 KiB
Python

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Unit tests for aggregate_table_doc_metadata."""
import pytest
from rag.utils.table_es_metadata import aggregate_table_doc_metadata, merge_table_parser_config_from_kb
@pytest.fixture
def es_engine(monkeypatch):
    """Set both alternative doc-engine flags to ``False`` for the test.

    Patches ``DOC_ENGINE_INFINITY`` and ``DOC_ENGINE_OCEANBASE`` on the
    ``settings`` object reached through ``rag.utils.table_es_metadata``.
    Judging by the fixture name this is meant to select the Elasticsearch
    code path -- confirm against the module under test.
    """
    monkeypatch.setattr("rag.utils.table_es_metadata.settings.DOC_ENGINE_INFINITY", False)
    monkeypatch.setattr("rag.utils.table_es_metadata.settings.DOC_ENGINE_OCEANBASE", False)
@pytest.fixture
def infinity_engine(monkeypatch):
    """Set ``DOC_ENGINE_INFINITY`` to ``True`` and ``DOC_ENGINE_OCEANBASE`` to ``False``.

    Both flags are patched on the ``settings`` object reached through
    ``rag.utils.table_es_metadata``. Judging by the fixture name this is
    meant to select the Infinity code path -- confirm against the module
    under test.
    """
    monkeypatch.setattr("rag.utils.table_es_metadata.settings.DOC_ENGINE_INFINITY", True)
    monkeypatch.setattr("rag.utils.table_es_metadata.settings.DOC_ENGINE_OCEANBASE", False)
def _table_task(**kb_extra):
return {
"parser_id": "table",
"parser_config": {},
"kb_parser_config": {
"table_column_mode": "manual",
"table_column_roles": {"country": "metadata", "category": "metadata"},
"table_column_names": ["country", "category"],
"field_map": {
"country_tks": "country",
"category_tks": "category",
},
**kb_extra,
},
}
class TestAggregateTableManualDocMetadata:
    """Behavioural tests for ``aggregate_table_doc_metadata``.

    Each test builds a task dict (``parser_id``, ``parser_config``,
    ``kb_parser_config``) and a list of chunk dicts, then checks the
    column -> values mapping the function returns.
    """

    def test_aggregate_manual_mode_happy_path(self, es_engine):
        """Manual mode, both columns ``"metadata"``: values come back in
        first-seen order and the repeated row adds nothing new."""
        task = _table_task()
        chunks = [
            {
                "country_raw": "Brazil",
                "category_raw": "Economy",
                "country_tks": "x",
                "category_tks": "y",
            },
            {
                "country_raw": "Turkey",
                "category_raw": "Disaster",
                "country_tks": "x",
                "category_tks": "y",
            },
            {
                "country_raw": "Brazil",
                "category_raw": "Economy",
                "country_tks": "x",
                "category_tks": "y",
            },
        ]
        out = aggregate_table_doc_metadata(chunks, task)
        assert out["country"] == ["Brazil", "Turkey"]
        assert out["category"] == ["Economy", "Disaster"]

    def test_aggregate_auto_mode_returns_data(self, es_engine):
        """Explicit ``"auto"`` mode with no roles still aggregates the column."""
        task = {
            "parser_id": "table",
            "parser_config": {},
            "kb_parser_config": {
                "table_column_mode": "auto",
                "table_column_names": ["country"],
                "field_map": {"country_tks": "country"},
            },
        }
        chunks = [{"country_raw": "Brazil", "country_tks": "x"}]
        out = aggregate_table_doc_metadata(chunks, task)
        assert out == {"country": ["Brazil"]}

    def test_aggregate_auto_mode_all_columns_both(self, es_engine):
        """Auto mode with two columns and no roles: both columns are aggregated."""
        task = {
            "parser_id": "table",
            "parser_config": {},
            "kb_parser_config": {
                "table_column_mode": "auto",
                "table_column_names": ["country", "category"],
                "field_map": {"country_tks": "country", "category_tks": "category"},
            },
        }
        chunks = [
            {"country_raw": "Brazil", "country_tks": "x", "category_raw": "Economy", "category_tks": "y"},
            {"country_raw": "Turkey", "country_tks": "x", "category_raw": "Disaster", "category_tks": "y"},
        ]
        out = aggregate_table_doc_metadata(chunks, task)
        assert out["country"] == ["Brazil", "Turkey"]
        assert out["category"] == ["Economy", "Disaster"]

    def test_aggregate_no_mode_defaults_to_auto(self, es_engine):
        """When table_column_mode is missing, it defaults to 'auto'."""
        task = {
            "parser_id": "table",
            "parser_config": {},
            "kb_parser_config": {
                "table_column_names": ["country"],
                "field_map": {"country_tks": "country"},
            },
        }
        chunks = [{"country_raw": "Brazil", "country_tks": "x"}]
        out = aggregate_table_doc_metadata(chunks, task)
        assert out == {"country": ["Brazil"]}

    def test_aggregate_no_mode_no_columns_returns_empty(self, es_engine):
        """No mode and no column names/roles -> empty (nothing to aggregate)."""
        task = {
            "parser_id": "table",
            "parser_config": {},
            "kb_parser_config": {},
        }
        assert aggregate_table_doc_metadata([{}], task) == {}

    def test_aggregate_no_metadata_columns(self, es_engine):
        """Manual mode where the only column has role ``"indexing"``: result is empty."""
        task = {
            "parser_id": "table",
            "parser_config": {},
            "kb_parser_config": {
                "table_column_mode": "manual",
                "table_column_roles": {"country": "indexing"},
                "table_column_names": ["country"],
            },
        }
        assert aggregate_table_doc_metadata([{"country_tks": "x"}], task) == {}

    def test_aggregate_prefers_raw_over_tks(self, es_engine):
        """A chunk carrying both ``country_raw`` and ``country_tks`` yields the ``_raw`` value."""
        task = _table_task()
        # Narrow the shared baseline to a single metadata column.
        task["kb_parser_config"]["table_column_roles"] = {"country": "metadata"}
        task["kb_parser_config"]["table_column_names"] = ["country"]
        chunks = [{"country_raw": "Brazil", "country_tks": ["brazil"]}]
        out = aggregate_table_doc_metadata(chunks, task)
        assert out == {"country": ["Brazil"]}

    def test_aggregate_tks_fallback(self, es_engine):
        """With no ``country_raw`` in the chunk, the ``country_tks`` value is used."""
        task = _table_task()
        # Narrow the shared baseline to a single metadata column.
        task["kb_parser_config"]["table_column_roles"] = {"country": "metadata"}
        task["kb_parser_config"]["table_column_names"] = ["country"]
        chunks = [{"country_tks": ["brazil"]}]
        out = aggregate_table_doc_metadata(chunks, task)
        assert out == {"country": ["brazil"]}

    def test_aggregate_partial_roles_defaults_to_both(self, es_engine):
        """Manual mode with a role only for ``country`` (``"indexing"``):
        the role-less ``city`` column is aggregated, ``country`` is not."""
        task = {
            "parser_id": "table",
            "parser_config": {},
            "kb_parser_config": {
                "table_column_mode": "manual",
                "table_column_roles": {"country": "indexing"},
                "table_column_names": ["country", "city"],
                "field_map": {"city_tks": "city"},
            },
        }
        chunks = [{"city_raw": "SP", "city_tks": "t", "country_tks": "x"}]
        out = aggregate_table_doc_metadata(chunks, task)
        assert out == {"city": ["SP"]}
        assert "country" not in out

    def test_aggregate_empty_roles_all_columns_both(self, es_engine):
        """Manual mode with an empty roles mapping: every listed column appears in the result."""
        task = {
            "parser_id": "table",
            "parser_config": {},
            "kb_parser_config": {
                "table_column_mode": "manual",
                "table_column_roles": {},
                "table_column_names": ["country", "city"],
                "field_map": {"country_tks": "country", "city_tks": "city"},
            },
        }
        chunks = [
            {"country_raw": "BR", "city_raw": "SP", "country_tks": "x", "city_tks": "y"},
        ]
        out = aggregate_table_doc_metadata(chunks, task)
        # Separate asserts so a failure names the column that is missing.
        assert "country" in out
        assert "city" in out

    def test_aggregate_deduplicates_values(self, es_engine):
        """A value repeated across chunks is reported once, in first-seen order."""
        task = _table_task()
        # Narrow the shared baseline to a single metadata column.
        task["kb_parser_config"]["table_column_roles"] = {"country": "metadata"}
        task["kb_parser_config"]["table_column_names"] = ["country"]
        chunks = [
            {"country_raw": "US", "country_tks": "x"},
            {"country_raw": "UK", "country_tks": "y"},
            {"country_raw": "US", "country_tks": "x"},
        ]
        out = aggregate_table_doc_metadata(chunks, task)
        assert out["country"] == ["US", "UK"]

    def test_aggregate_kb_reload_field_map(self, es_engine, monkeypatch):
        """Task snapshot without ``field_map`` while a stubbed KB service offers one.

        ``_knowledgebase_service_cls`` is patched to return a stub whose
        ``get_by_id`` yields a KB with a ``field_map`` in its
        ``parser_config``; the aggregation is expected to return the
        ``country`` value. Presumably the function reloads the map through
        that service -- confirm against the implementation.
        """
        from unittest.mock import MagicMock

        class MockKBS:
            @staticmethod
            def get_by_id(kid):
                kb = MagicMock()
                kb.parser_config = {"field_map": {"country_tks": "country"}}
                return True, kb

        monkeypatch.setattr(
            "rag.utils.table_es_metadata._knowledgebase_service_cls",
            lambda: MockKBS,
        )
        task = {
            "parser_id": "table",
            "parser_config": {},
            "kb_parser_config": {
                "table_column_mode": "manual",
                "table_column_roles": {"country": "metadata"},
                "table_column_names": ["country"],
            },
            "kb_id": "kb-1",
        }
        chunks = [{"country_raw": "X", "country_tks": "t"}]
        out = aggregate_table_doc_metadata(chunks, task)
        assert out == {"country": ["X"]}

    def test_merge_infinity_chunk_data(self, infinity_engine):
        """With the Infinity flag on, values nested under each chunk's
        ``chunk_data`` dict are aggregated."""
        task = {
            "parser_id": "table",
            "parser_config": {},
            "kb_parser_config": {
                "table_column_mode": "manual",
                "table_column_roles": {"country": "both"},
                "table_column_names": ["country"],
            },
        }
        chunks = [
            {"chunk_data": {"country": "US"}},
            {"chunk_data": {"country": "UK"}},
        ]
        out = aggregate_table_doc_metadata(chunks, task)
        assert out == {"country": ["US", "UK"]}
class TestMergeTableParserConfigFromKbExtra:
    """Merge tests also covered in helpers file; keep one explicit case for aggregation module."""

    def test_merge_preserves_parser_config_when_parser_not_table(self):
        """For a non-table parser (``"naive"``) the task's own ``parser_config``
        is returned as-is, without the KB's table settings."""
        task = {
            "parser_id": "naive",
            "parser_config": {"a": 1},
            "kb_parser_config": {"table_column_mode": "manual"},
        }
        assert merge_table_parser_config_from_kb(task) == {"a": 1}