tag/dev-20260622/common/metadata_utils.py

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import ast
import logging
from typing import Any, Callable, Dict

import json_repair


def convert_conditions(metadata_condition):
    """Translate an API-style metadata condition into ``meta_filter`` filters.

    Args:
        metadata_condition: ``None`` or a mapping that may carry a
            ``"conditions"`` list. Each condition must provide the keys
            ``"comparison_operator"``, ``"name"`` and ``"value"``.

    Returns:
        A list of ``{"op": ..., "key": ..., "value": ...}`` dicts. Operators
        with a symbolic alias are replaced by that alias; every other operator
        is passed through unchanged.

    Raises:
        KeyError: If a condition lacks one of the required keys.
    """
    symbol_for = {
        "is": "=",
        "not is": "≠",
        ">=": "≥",
        "<=": "≤",
        "!=": "≠"
    }
    source = {} if metadata_condition is None else metadata_condition

    converted = []
    for entry in source.get("conditions", []):
        operator = entry["comparison_operator"]
        converted.append({
            "op": symbol_for.get(operator, operator),
            "key": entry["name"],
            "value": entry["value"],
        })
    return converted


def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
    """Return the ids of documents whose metadata satisfies ``filters``.

    Args:
        metas: Mapping ``metadata key -> {metadata value -> iterable of doc ids}``.
        filters: Conditions shaped like ``{"key": ..., "op": ..., "value": ...}``.
            Supported operators: ``=``, ``≠``, ``>``, ``<``, ``≥``, ``≤``,
            ``contains``, ``not contains``, ``in``, ``not in``, ``start with``,
            ``end with``, ``empty`` and ``not empty``. An unknown operator
            matches nothing.
        logic: ``"and"`` intersects the per-condition matches; any other value
            unions them.

    Returns:
        A list of matching doc ids (built from a set, so order is unspecified).
        ``[]`` when nothing matches or ``filters`` is empty.

    Raises:
        KeyError: If a filter lacks ``"key"``, ``"op"`` or ``"value"``.
    """
    comparison_ops = ("=", "≠", ">", "<", "≥", "≤")
    doc_ids = None

    def normalize_string_values(value):
        # Lower-case strings (and strings inside lists); leave everything else as is.
        if isinstance(value, str):
            return value.lower()
        if isinstance(value, list):
            return [item.lower() if isinstance(item, str) else item for item in value]
        return value

    def looks_like_date(text):
        # Strict shape check for YYYY-MM-DD: exactly 10 chars, dashes at the
        # right positions, digits elsewhere. Calendar validity is not checked.
        return (
            len(text) == 10
            and text[4] == "-"
            and text[7] == "-"
            and text[:4].isdigit()
            and text[5:7].isdigit()
            and text[8:10].isdigit()
        )

    def filter_out(v2docs, operator, value):
        ids = []
        for stored, docids in v2docs.items():
            # Work on per-record copies. Coercing the shared query ``value`` in
            # place would leak one record's literal_eval result into the next
            # record and make the outcome depend on dict iteration order.
            lhs, rhs = stored, value

            if operator in comparison_ops:
                lhs_text = str(lhs).strip()
                rhs_text = str(rhs).strip()

                if looks_like_date(rhs_text):
                    if not looks_like_date(lhs_text):
                        # Date query against a non-date record: no match.
                        continue
                    # Both sides are YYYY-MM-DD, so string comparison orders
                    # them chronologically.
                    lhs, rhs = lhs_text, rhs_text
                else:
                    # Best-effort typed comparison: parse both sides as Python
                    # literals. If the record fails to parse, the query value is
                    # left untouched as well (same try block), so the pair
                    # falls back to a string comparison.
                    try:
                        if isinstance(lhs, list):
                            lhs = lhs[0]
                        lhs = ast.literal_eval(lhs)
                        rhs = ast.literal_eval(rhs)
                    except Exception:
                        pass

                    # Case-insensitive comparison for whatever is still a string.
                    if isinstance(lhs, str):
                        lhs = lhs.lower()
                    if isinstance(rhs, str):
                        rhs = rhs.lower()
            else:
                # Non-comparison operators only get case normalisation.
                lhs = normalize_string_values(lhs)
                rhs = normalize_string_values(rhs)

            matched = False
            try:
                if operator == "contains":
                    if isinstance(lhs, list):
                        matched = any(str(i).find(rhs) >= 0 for i in lhs)
                    else:
                        matched = str(lhs).find(rhs) >= 0
                elif operator == "not contains":
                    if isinstance(lhs, list):
                        matched = all(str(i).find(rhs) == -1 for i in lhs)
                    else:
                        matched = str(lhs).find(rhs) == -1
                elif operator == "in":
                    if isinstance(lhs, list):
                        matched = all(i in rhs for i in lhs)
                    else:
                        matched = lhs in rhs
                elif operator == "not in":
                    if isinstance(lhs, list):
                        matched = all(i not in rhs for i in lhs)
                    else:
                        matched = lhs not in rhs
                elif operator == "start with":
                    prefix = str(rhs).lower()
                    if isinstance(lhs, list):
                        matched = "".join([str(i).lower() for i in lhs]).startswith(prefix)
                    else:
                        matched = str(lhs).lower().startswith(prefix)
                elif operator == "end with":
                    suffix = str(rhs).lower()
                    if isinstance(lhs, list):
                        matched = "".join([str(i).lower() for i in lhs]).endswith(suffix)
                    else:
                        matched = str(lhs).lower().endswith(suffix)
                elif operator == "empty":
                    matched = not lhs
                elif operator == "not empty":
                    matched = bool(lhs)
                elif operator == "=":
                    matched = lhs == rhs
                elif operator == "≠":
                    matched = lhs != rhs
                elif operator == ">":
                    matched = lhs > rhs
                elif operator == "<":
                    matched = lhs < rhs
                elif operator == "≥":
                    matched = lhs >= rhs
                elif operator == "≤":
                    matched = lhs <= rhs
            except Exception:
                # Deliberate best-effort: incomparable types (e.g. str vs int)
                # count as "no match" instead of aborting the whole filter.
                pass

            if matched:
                ids.extend(docids)
        return ids

    for f in filters:
        k = f["key"]
        if k not in metas:
            # Key not found in metas: treat as no match
            ids = []
        else:
            ids = filter_out(metas[k], f["op"], f["value"])

        if doc_ids is None:
            doc_ids = set(ids)
        elif logic == "and":
            doc_ids = doc_ids & set(ids)
            if not doc_ids:
                # An empty intersection can never grow again; stop early.
                logging.debug(f"meta_filter filters={filters}, logic={logic}, early return []")
                return []
        else:
            doc_ids = doc_ids | set(ids)
    return list(doc_ids or [])


async def apply_meta_data_filter(
        meta_data_filter: dict | None,
        metas: dict | None = None,
        question: str = "",
        chat_mdl: Any = None,
        base_doc_ids: list[str] | None = None,
        manual_value_resolver: Callable[[dict], dict] | None = None,
        kb_ids: list[str] | None = None,
        metas_loader: Callable[[], dict] | None = None,
) -> list[str] | None:
    """
    Apply metadata filtering rules and return the filtered doc_ids.

    meta_data_filter supports three modes (``meta_data_filter["method"]``):
    - auto: generate filter conditions via LLM (gen_meta_filter)
    - semi_auto: generate conditions using selected metadata keys only
    - manual: directly filter based on provided conditions

    When ``kb_ids`` is supplied and there is at least one condition, the
    conditions are first handed to
    ``DocMetadataService.filter_doc_ids_by_meta_pushdown``. If that call
    raises or returns ``None``, the in-memory ``meta_filter`` path is used as
    the fallback, so callers without a KB scope behave as before.

    ``metas`` may be supplied eagerly or via ``metas_loader``. The loader is
    invoked at most once, and only when the metadata dict is actually needed:
    for the LLM context in ``auto`` / ``semi_auto`` modes, or for the
    in-memory fallback. An empty condition list never triggers the loader.

    Args:
        meta_data_filter: Filter configuration; falsy means "no filtering".
        metas: Pre-loaded metadata in the shape ``meta_filter`` expects.
        question: User question forwarded to ``gen_meta_filter``.
        chat_mdl: Chat model forwarded to ``gen_meta_filter``.
        base_doc_ids: Doc ids the result starts from; matches are appended.
        manual_value_resolver: Optional hook applied to every manual filter
            before evaluation.
        kb_ids: Knowledge-base ids enabling the push-down path.
        metas_loader: Lazy producer of ``metas`` used when ``metas`` is None.

    Returns:
        list of doc_ids (``base_doc_ids`` plus matches), ["-999"] when manual
        filters yield no result, or None when auto/semi_auto filters return
        empty.
    """
    from rag.prompts.generator import gen_meta_filter  # move from the top of the file to avoid circular import

    doc_ids = list(base_doc_ids) if base_doc_ids else []

    if not meta_data_filter:
        return doc_ids

    method = meta_data_filter.get("method")

    # Memoised metadata loader. ``_get_metas`` materialises the dict at most
    # once per call; downstream branches that never reach an in-memory eval
    # leave the loader untouched.
    cached_metas: dict | None = metas

    def _get_metas() -> dict:
        nonlocal cached_metas
        if cached_metas is None:
            # ``or {}`` keeps the memo valid (and downstream code safe) even if
            # the loader hands back None.
            cached_metas = (metas_loader() if metas_loader else None) or {}
        return cached_metas

    def _run_metadata_filter(conditions: list[dict], logic: str) -> list[str]:
        """Run conditions through the push-down service when possible, in-memory otherwise."""
        if not conditions:
            # Nothing to evaluate: the in-memory filter would return [] anyway,
            # so skip it and avoid materialising the metadata dict.
            return []

        if kb_ids:
            try:
                from api.db.services.doc_metadata_service import DocMetadataService
                pushed_down_ids = DocMetadataService.filter_doc_ids_by_meta_pushdown(kb_ids, conditions, logic)
                logging.debug(f"Doc ids filtered by metadata: {pushed_down_ids}")
                # None signals "push-down could not serve this request".
                if pushed_down_ids is not None:
                    return pushed_down_ids
            except Exception as e:
                # Best-effort: any push-down failure degrades to the in-memory path.
                logging.error(f"Metadata filter push down errored: {e}")

        # In-memory fallback
        logging.debug("Metadata filter falls back to in-memory filter")
        return meta_filter(_get_metas(), conditions, logic)

    if method == "auto":
        filters: dict = await gen_meta_filter(chat_mdl, _get_metas(), question)
        logging.debug(f"Metadata filter(auto) generated: {filters}")
        doc_ids.extend(_run_metadata_filter(filters["conditions"], filters.get("logic", "and")))
        if not doc_ids:
            return None
    elif method == "semi_auto":
        selected_keys = []
        constraints = {}
        # Entries are either a bare key or {"key": ..., "op": ...}; an "op"
        # pins the operator the LLM must use for that key.
        for item in meta_data_filter.get("semi_auto", []):
            if isinstance(item, str):
                selected_keys.append(item)
            elif isinstance(item, dict):
                key = item.get("key")
                op = item.get("op")
                selected_keys.append(key)
                if op:
                    constraints[key] = op

        if selected_keys:
            current_metas = _get_metas()
            filtered_metas = {key: current_metas[key] for key in selected_keys if key in current_metas}
            if filtered_metas:
                filters: dict = await gen_meta_filter(chat_mdl, filtered_metas, question, constraints=constraints)
                logging.debug(f"Metadata filter(semi_auto) generated: {filters}")
                doc_ids.extend(_run_metadata_filter(filters["conditions"], filters.get("logic", "and")))
                if not doc_ids:
                    return None
    elif method == "manual":
        filters = meta_data_filter.get("manual") or []
        if manual_value_resolver:
            filters = [manual_value_resolver(flt) for flt in filters]
        logging.debug(f"Metadata filter(manual): {filters}")
        doc_ids.extend(_run_metadata_filter(filters, meta_data_filter.get("logic", "and")))
        if filters and not doc_ids:
            # Sentinel id: filters were given but nothing matched.
            doc_ids = ["-999"]

    logging.debug(f"apply_meta_data_filter meta_filter={meta_data_filter}, returning doc_ids={doc_ids}")
    return doc_ids


def _try_meta_pushdown(
        kb_ids: list[str],
        conditions: list[dict],
        logic: str,
) -> list[str] | None:
    """Attempt the ES push-down path; return ``None`` to fall back in-memory.

    Lazy-imports ``DocMetadataService`` so this module stays usable in
    environments where the API/db layer hasn't been wired up (e.g. unit tests
    that exercise ``meta_filter`` directly).
    """
    try:
        from api.db.services.doc_metadata_service import DocMetadataService
    except Exception as e:
        logging.debug(f"[apply_meta_data_filter] push-down disabled, import failed: {e}")
        return None
    try:
        return DocMetadataService.filter_doc_ids_by_meta_pushdown(kb_ids, conditions, logic)
    except Exception as e:
        logging.warning(f"[apply_meta_data_filter] push-down errored, falling back: {e}")
        return None


def dedupe_list(values: list) -> list:
    """Drop duplicates from ``values`` while keeping first-seen order.

    Two items count as duplicates when their ``str()`` forms are equal, so
    unhashable items are supported and e.g. ``1`` and ``"1"`` collapse into
    whichever came first.
    """
    first_by_text = {}
    for candidate in values:
        # setdefault keeps the earliest item for each textual form; dicts
        # preserve insertion order, which yields first-seen ordering.
        first_by_text.setdefault(str(candidate), candidate)
    return list(first_by_text.values())


def update_metadata_to(metadata, meta):
    """Merge ``meta`` into ``metadata`` in place and return ``metadata``.

    Args:
        metadata: Target mapping; mutated by this call.
        meta: Mapping to merge, or a JSON-ish string parsed with
            ``json_repair.loads``. Falsy values, unparsable strings and
            non-dict results leave ``metadata`` untouched.

    Merge rules per key:
        - Only string values and lists are considered; lists are reduced to
          their string items and de-duplicated, and lists left empty are
          skipped.
        - A key missing from ``metadata`` is simply set.
        - If the existing value is a list, the new value(s) are added to it
          and the result is de-duplicated.
        - Otherwise the existing value is overwritten.
    """
    if not meta:
        return metadata

    if isinstance(meta, str):
        try:
            meta = json_repair.loads(meta)
        except Exception:
            logging.error("Meta data format error.")
            return metadata

    if not isinstance(meta, dict):
        return metadata

    for field, incoming in meta.items():
        if isinstance(incoming, list):
            text_items = [entry for entry in incoming if isinstance(entry, str)]
            if not text_items:
                continue
            incoming = dedupe_list(text_items)
        elif not isinstance(incoming, str):
            continue

        current = metadata[field] if field in metadata else None
        if not isinstance(current, list):
            # Covers both "key absent" and "existing scalar": plain assignment.
            metadata[field] = incoming
            continue

        # Existing list: grow it in place, then store the de-duplicated copy.
        if isinstance(incoming, list):
            current.extend(incoming)
        else:
            current.append(incoming)
        metadata[field] = dedupe_list(current)

    return metadata


def metadata_schema(metadata: dict | list | None) -> Dict[str, Any]:
    if not metadata:
        return {}
    properties = {}

    for item in metadata:
        key = item.get("key")
        if not key:
            continue

        prop_schema = {
            "description": item.get("description", "")
        }
        if "enum" in item and item["enum"]:
            prop_schema["enum"] = item["enum"]
            prop_schema["type"] = "string"

        properties[key] = prop_schema

    json_schema = {
        "type": "object",
        "properties": properties,
    }

    json_schema["additionalProperties"] = False
    return json_schema


def _is_json_schema(obj: dict) -> bool:
    if not isinstance(obj, dict):
        return False
    if "$schema" in obj:
        return True
    return obj.get("type") == "object" and isinstance(obj.get("properties"), dict)


def _is_metadata_list(obj: list) -> bool:
    if not isinstance(obj, list) or not obj:
        return False
    for item in obj:
        if not isinstance(item, dict):
            return False
        key = item.get("key")
        if not isinstance(key, str) or not key:
            return False
        if "enum" in item and item["enum"] is not None and not isinstance(item["enum"], list):
            return False
        if "description" in item and item["description"] is not None and not isinstance(item["description"], str):
            return False
        if "descriptions" in item and item["descriptions"] is not None and not isinstance(item["descriptions"], str):
            return False
    return True


def turn2jsonschema(obj: dict | list) -> Dict[str, Any]:
    if isinstance(obj, dict) and _is_json_schema(obj):
        return obj
    if isinstance(obj, list) and _is_metadata_list(obj):
        normalized = []
        for item in obj:
            description = item.get("description") or item.get("descriptions") or ""
            normalized_item = {
                "key": item.get("key"),
                "description": description,
            }
            if "enum" in item and item["enum"] is not None:
                normalized_item["enum"] = item["enum"]
            normalized.append(normalized_item)
        return metadata_schema(normalized)
    return {}