Files
ragflow/rag/utils/raptor_utils.py
Jack f0cb7a544b Refactor: Task Executor (#15154)
### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit tests for the smaller functions
3. Layered design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use the refactored version; the original version can
still be selected by changing an environment variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
2026-05-27 21:54:17 +08:00

253 lines
8.5 KiB
Python

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Utility functions for Raptor processing decisions.
"""
import json
import logging
from typing import Optional
import xxhash
# Tree-builder identifiers; RAPTOR_TREE_BUILDER is the fallback used by
# get_raptor_tree_builder() when the configuration names none.
RAPTOR_TREE_BUILDER = "raptor"
PSI_TREE_BUILDER = "psi"
SUPPORTED_TREE_BUILDERS = {RAPTOR_TREE_BUILDER, PSI_TREE_BUILDER}
# Clustering-method identifiers; GMM_CLUSTERING_METHOD is the fallback used by
# get_raptor_clustering_method() when the configuration names none.
GMM_CLUSTERING_METHOD = "gmm"
AHC_CLUSTERING_METHOD = "ahc"
SUPPORTED_CLUSTERING_METHODS = {GMM_CLUSTERING_METHOD, AHC_CLUSTERING_METHOD}
# File extensions for structured data types (lowercase, with leading dot,
# which is the form is_structured_file_type() normalizes its input to).
EXCEL_EXTENSIONS = {".xls", ".xlsx", ".xlsm", ".xlsb"}
CSV_EXTENSIONS = {".csv", ".tsv"}
STRUCTURED_EXTENSIONS = EXCEL_EXTENSIONS | CSV_EXTENSIONS
def get_raptor_tree_builder(raptor_config: dict | None) -> str:
    """Resolve which tree builder a RAPTOR configuration selects.

    The value is looked up in ``raptor_config["ext"]["tree_builder"]`` first,
    then in the top-level ``raptor_config["tree_builder"]``; when both are
    missing or empty, ``RAPTOR_TREE_BUILDER`` is used.

    Args:
        raptor_config: RAPTOR configuration dict; ``None`` is treated as empty.

    Returns:
        The resolved tree-builder name.

    Raises:
        ValueError: If the resolved name is not in ``SUPPORTED_TREE_BUILDERS``.
    """
    config = raptor_config or {}
    ext_section = config.get("ext") or {}

    selected = ext_section.get("tree_builder")
    if not selected:
        selected = config.get("tree_builder")
    if not selected:
        selected = RAPTOR_TREE_BUILDER

    if selected in SUPPORTED_TREE_BUILDERS:
        return selected
    raise ValueError(f"Unsupported RAPTOR tree builder: {selected}")
def get_raptor_clustering_method(raptor_config: dict | None) -> str:
    """Resolve which clustering method a RAPTOR configuration selects.

    The value is looked up in ``raptor_config["ext"]["clustering_method"]``
    first, then in the top-level ``raptor_config["clustering_method"]``; when
    both are missing or empty, ``GMM_CLUSTERING_METHOD`` is used.

    Args:
        raptor_config: RAPTOR configuration dict; ``None`` is treated as empty.

    Returns:
        The resolved clustering-method name.

    Raises:
        ValueError: If the resolved name is not in
            ``SUPPORTED_CLUSTERING_METHODS``.
    """
    config = raptor_config or {}
    ext_section = config.get("ext") or {}

    selected = ext_section.get("clustering_method")
    if not selected:
        selected = config.get("clustering_method")
    if not selected:
        selected = GMM_CLUSTERING_METHOD

    if selected in SUPPORTED_CLUSTERING_METHODS:
        return selected
    raise ValueError(f"Unsupported RAPTOR clustering method: {selected}")
def _as_extra_dict(extra) -> dict:
"""Normalize a chunk extra payload into a dictionary."""
if isinstance(extra, dict):
return extra
if isinstance(extra, str) and extra:
# Try standard JSON first (double quotes)
try:
parsed = json.loads(extra)
return parsed if isinstance(parsed, dict) else {}
except json.JSONDecodeError:
last_exc = True
# Fallback: try parsing Python dict literal (single quotes)
try:
import ast
parsed = ast.literal_eval(extra)
if isinstance(parsed, dict):
return parsed
except (ValueError, SyntaxError):
last_exc = True
logging.warning(
"Ignoring malformed RAPTOR extra payload while collecting chunk metadata: %s",
extra[:200],
exc_info=last_exc,
)
return {}
return {}
def _has_raptor_marker(marker) -> bool:
"""Return whether a chunk marker identifies a RAPTOR summary chunk."""
if isinstance(marker, list):
return any(str(item) == RAPTOR_TREE_BUILDER for item in marker)
return str(marker) == RAPTOR_TREE_BUILDER
def _raptor_methods_from_fields(fields: dict, extra: dict | None = None) -> set[str]:
"""Read RAPTOR builder methods from stored chunk fields."""
extra = extra if extra is not None else _as_extra_dict(fields.get("extra"))
method = extra.get("raptor_method") or RAPTOR_TREE_BUILDER
if isinstance(method, list):
return {str(item) for item in method if item}
return {str(method)} if method else set()
def collect_raptor_methods(field_map: dict) -> set[str]:
    """Gather the tree-builder methods found on RAPTOR summary chunks.

    Args:
        field_map: Mapping whose values are per-chunk field dicts.

    Returns:
        The union of method names over every chunk whose ``raptor_kwd`` marker
        (read from the chunk fields, else from its decoded ``extra`` payload)
        identifies it as a RAPTOR summary chunk.
    """
    found = set()
    for chunk_fields in field_map.values():
        payload = _as_extra_dict(chunk_fields.get("extra"))
        tag = chunk_fields.get("raptor_kwd") or payload.get("raptor_kwd")
        if _has_raptor_marker(tag):
            found |= _raptor_methods_from_fields(chunk_fields, payload)
    return found
def collect_raptor_chunk_ids(field_map: dict, exclude_methods: set[str] | None = None) -> set[str]:
"""Collect RAPTOR summary chunk IDs, optionally excluding some methods."""
chunk_ids = set()
exclude_methods = exclude_methods or set()
for chunk_id, fields in field_map.items():
extra = _as_extra_dict(fields.get("extra"))
marker = fields.get("raptor_kwd") or extra.get("raptor_kwd")
if _has_raptor_marker(marker):
if _raptor_methods_from_fields(fields, extra).issubset(exclude_methods):
continue
chunk_ids.add(chunk_id)
return chunk_ids
def make_raptor_summary_chunk_id(content: str, doc_id: str) -> str:
    """Derive the deterministic ID of a generated RAPTOR summary chunk.

    The ID is the hex xxh64 digest of ``content`` followed by the string form
    of ``doc_id``, encoded as UTF-8.
    """
    seed = content + str(doc_id)
    digest = xxhash.xxh64(seed.encode("utf-8"))
    return digest.hexdigest()
def is_structured_file_type(file_type: Optional[str]) -> bool:
    """
    Tell whether a file extension denotes structured data (Excel, CSV, TSV).

    Args:
        file_type: File extension, with or without the leading dot and in any
            letter case (e.g., ".xlsx", "CSV").

    Returns:
        True if the normalized extension is in STRUCTURED_EXTENSIONS;
        False otherwise, including for None or an empty string.
    """
    if not file_type:
        return False

    # Compare in the canonical form used by the extension sets.
    suffix = file_type.lower()
    suffix = suffix if suffix.startswith(".") else f".{suffix}"
    return suffix in STRUCTURED_EXTENSIONS
def is_tabular_pdf(parser_id: str = "", parser_config: Optional[dict] = None) -> bool:
    """
    Tell whether the parser settings treat a document as tabular data.

    Args:
        parser_id: Parser ID (e.g., "table", "naive"); matched
            case-insensitively against "table".
        parser_config: Parser configuration dict; None is treated as empty.

    Returns:
        True if the table parser is selected or the "html4excel" option is
        truthy, False otherwise.
    """
    # The dedicated table parser always means tabular content.
    if parser_id and parser_id.lower() == "table":
        return True

    # Otherwise it depends on the Excel-like table parsing option.
    settings = parser_config or {}
    return bool(settings.get("html4excel", False))
def should_skip_raptor(
    file_type: Optional[str] = None,
    parser_id: str = "",
    parser_config: Optional[dict] = None,
    raptor_config: Optional[dict] = None
) -> bool:
    """
    Decide whether Raptor should be skipped for a given document.

    Raptor is skipped automatically for:
    1. Structured files (Excel and CSV/TSV extensions)
    2. PDFs parsed as tabular data (table parser or html4excel)

    Setting raptor_config["auto_disable_for_structured_data"] to False turns
    this automatic skipping off entirely.

    Args:
        file_type: File extension (e.g., ".xlsx", ".pdf")
        parser_id: Parser ID being used
        parser_config: Parser configuration dict
        raptor_config: Raptor configuration dict (may carry the
            auto_disable_for_structured_data override)

    Returns:
        True if Raptor should be skipped, False otherwise
    """
    parser_settings = parser_config or {}
    raptor_settings = raptor_config or {}

    # Only an explicit False disables the automatic skip.
    auto_disable = raptor_settings.get("auto_disable_for_structured_data", True)
    if auto_disable is False:
        logging.info("Raptor auto-disable is turned off via configuration")
        return False

    # Excel/CSV files.
    if is_structured_file_type(file_type):
        logging.info(f"Skipping Raptor for structured file type: {file_type}")
        return True

    # Tabular PDFs.
    is_pdf = bool(file_type) and file_type.lower() in (".pdf", "pdf")
    if is_pdf and is_tabular_pdf(parser_id, parser_settings):
        logging.info(f"Skipping Raptor for tabular PDF (parser_id={parser_id})")
        return True

    return False
def get_skip_reason(
    file_type: Optional[str] = None,
    parser_id: str = "",
    parser_config: Optional[dict] = None,
    raptor_config: Optional[dict] = None,
) -> str:
    """
    Get a human-readable reason why Raptor was skipped.

    Args:
        file_type: File extension
        parser_id: Parser ID being used
        parser_config: Parser configuration dict
        raptor_config: Optional Raptor configuration dict. When its
            "auto_disable_for_structured_data" entry is explicitly False,
            should_skip_raptor() never skips, so no reason is reported.
            Omitting it keeps the previous behavior.

    Returns:
        Reason string, or empty string if Raptor should not be skipped
    """
    parser_config = parser_config or {}
    raptor_config = raptor_config or {}

    # Stay in agreement with should_skip_raptor(): with auto-disable turned
    # off nothing is skipped, so there is no reason to give.
    if raptor_config.get("auto_disable_for_structured_data", True) is False:
        return ""

    if is_structured_file_type(file_type):
        return f"Structured data file ({file_type}) - Raptor auto-disabled"

    if file_type and file_type.lower() in (".pdf", "pdf"):
        if is_tabular_pdf(parser_id, parser_config):
            return f"Tabular PDF (parser={parser_id}) - Raptor auto-disabled"

    return ""