# ragflow/rag/graphrag/ner/dep_relation_extractor.py

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
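"""
Dependency-based relation extractor aligned with semantica.

Extracts typed relations from a spaCy dependency parse, with:
- Multi-hop inference (A→B→C transitivity)
- Negation filtering
- Per-pattern confidence scores (fixed per extraction pattern; multi-hop
  results are discounted from the weaker of their two premises)
- Multi-occurrence entity matching
- A same-sentence co-occurrence fallback ("related_to")
"""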
from typing import Dict, List, Optional

from .types import Entity, Relation

# Language-specific dependency label mappings.
# Each language maps a semantic role (pass_subj, subj, agent, dobj, prep_obj)
# to one of the rule shapes understood by DepRelationExtractor._get_by_role:
#   "dep"                    - a child of the root carrying this dep label
#   ("dep", "child_dep")     - a child labelled "dep", then the first of its
#                              children labelled "child_dep"
#   ("dep", None, "marker")  - a child labelled "dep" whose subtree contains a
#                              token whose lemma or text equals "marker"
# A role that is missing for a language is simply never matched.
# NOTE(review): "root_verb_child" (de) is not read by any code visible in this
# file - the German path hard-codes "oc"; confirm whether the key is needed.
# NOTE(review): the ("case", ...) prep_obj rules look for a root child labelled
# "case"; in UD-style parses "case" is usually a dependent of the obl/nmod
# token rather than its head - verify these rules against real parser output.
_LANG_DEP_RULES: Dict[str, Dict[str, object]] = {
    "en": {"pass_subj": "nsubjpass", "subj": "nsubj",
           "agent": ("agent", "pobj"),
           "dobj": "dobj", "prep_obj": ("prep", "pobj")},
    "de": {"subj": "sb",
           "agent": ("sbp", "nk"),
           "prep_obj": ("mo", "nk"),
           "root_verb_child": "oc"},  # German ROOT is aux, real verb is "oc"
    "fr": {"pass_subj": "nsubj:pass", "subj": "nsubj",
           "agent": "obl:agent",
           "dobj": "obj", "prep_obj": ("case", "obl")},
    "es": {"subj": "nsubj",
           "agent": "obj",
           "prep_obj": ("case", "obl")},
    "pt": {"pass_subj": "nsubj:pass", "subj": "nsubj",
           "agent": "obl:agent",
           "dobj": "obj", "prep_obj": ("case", "obl")},
    "zh": {"subj": "nsubj",
           "agent": ("nmod:prep", None, "由"),  # case "由" marks agent
           "prep_obj": ("case", "nmod")},
    "ja": {"subj": "nsubj",
           "agent": ("obl", None, "によって"),  # "によって" marks agent
           "prep_obj": ("case", "obl")},
}

# Multi-hop inference rules: if A rel1 B and B rel2 C then A rel3 C.
# Layout: {rel1: {rel2: rel3}}; consumed by DepRelationExtractor._infer_multi_hop.
# NOTE(review): "ceo_of" followed by "located_in" yields "works_for" with the
# location as object - confirm this is the intended inference.
_MULTI_HOP: Dict[str, Dict[str, str]] = {
    "ceo_of": {"is_subsidiary_of": "works_for", "located_in": "works_for"},
    "works_for": {"is_subsidiary_of": "works_for"},
    "founded_by": {"is_subsidiary_of": "founded_by"},
}

# Verb -> relation-type table read by DepRelationExtractor._lookup.
# A key is either a bare lower-cased verb lemma ("join") or
# "<verb lemma>+<preposition / case marker>" ("found+by").
# NOTE(review): a few keys do not follow the "<verb>+<prep>" shape
# ("subsidiar+y", "sich+befinden", "tener+sede"); it is unclear whether the
# lookups in this file can ever produce them - confirm or remove.
_VERB_RELATIONS: Dict[str, str] = {
    # English
    "found+by": "founded_by", "co-found+by": "founded_by",
    "establish+by": "founded_by", "create+by": "founded_by",
    "set+up": "founded_by", "start+by": "founded_by",
    "work+for": "works_for", "employ+by": "works_for",
    "hire+by": "works_for", "join": "works_for",
    "lead+by": "works_for", "manage+by": "works_for",
    "head+by": "works_for", "run+by": "works_for",
    "own+by": "owns", "develop+by": "develops",
    "write+by": "wrote", "publish+by": "published",
    "invest+in": "invests_in", "partner+with": "partners_with",
    "collaborate+with": "collaborates_with",
    "merge+with": "merged_with", "subsidiar+y": "is_subsidiary_of",
    "base+in": "located_in", "locate+in": "located_in",
    "situate+in": "located_in", "headquarter+in": "located_in",
    "bear+in": "born_in", "bear+on": "born_in",
    "acquire+by": "acquired", "buy+by": "acquired",
    # German (de): spaCy lemmas
    "gründen+von": "founded_by", "errichten+von": "founded_by",
    "arbeiten+für": "works_for", "beschäftigen+bei": "works_for",
    "anstellen+bei": "works_for", "sich+befinden": "located_in",
    "liegen+in": "located_in", "sitzen+in": "located_in",
    "gebären+in": "born_in", "gebären+am": "born_in",
    "erwerben+durch": "acquired", "kaufen+durch": "acquired",
    "übernehmen+durch": "acquired",
    # French (fr): spaCy lemmas
    "fonder+par": "founded_by", "créer+par": "founded_by",
    "établir+par": "founded_by",
    "travailler+pour": "works_for", "employer+par": "works_for",
    "embaucher+par": "works_for",
    "situer+à": "located_in", "baser+à": "located_in",
    "implanter+à": "located_in",
    "naître+à": "born_in",
    "acquérir+par": "acquired", "racheter+par": "acquired",
    # Spanish + Portuguese (shared lemmas, no duplicate keys)
    "fundar+por": "founded_by", "crear+por": "founded_by",
    "criar+por": "founded_by",
    "establecer+por": "founded_by", "estabelecer+por": "founded_by",
    "trabajar+para": "works_for", "trabalhar+para": "works_for",
    "emplear+por": "works_for", "empregar+por": "works_for",
    "contratar+por": "works_for",
    "ubicar+en": "located_in", "situar+en": "located_in",
    "localizar+em": "located_in", "situar+em": "located_in",
    "sediar+em": "located_in", "tener+sede": "located_in",
    "nacer+en": "born_in", "nascer+em": "born_in",
    "adquirir+por": "acquired", "comprar+por": "acquired",
    # Chinese: verb + "由" (agent marker) or "被" (passive)
    "创立+由": "founded_by", "创建+由": "founded_by",
    "成立+由": "founded_by", "创办+由": "founded_by",
    "设立+由": "founded_by",
    "任职+于": "works_for", "就职+于": "works_for",
    "工作+在": "works_for", "位于+在": "located_in",
    "坐落+在": "located_in", "总部设+在": "located_in",
    "出生+在": "born_in", "出生+于": "born_in",
    "收购+由": "acquired", "并购+由": "acquired",
    # Japanese: verb + "によって" (agent marker)
    "設立+によって": "founded_by", "創立+によって": "founded_by",
    "勤務+で": "works_for", "在籍+で": "works_for",
    "位置+に": "located_in", "所在+に": "located_in",
    "本社+を": "located_in",
    "出生+に": "born_in",
    "買収+によって": "acquired",
}

# Job-title keyword -> relation types emitted by
# DepRelationExtractor._extract_copula for "<subject> is <title> <prep> <object>"
# sentences. Keywords are tested as substrings of the lower-cased title lemma
# in insertion order, and only the first matching keyword is used.
_COPULA_TITLE_MAP: Dict[str, List[str]] = {
    "ceo": ["ceo_of", "works_for"], "cto": ["works_for"],
    "cfo": ["works_for"], "coo": ["works_for"],
    "vp": ["works_for"], "director": ["works_for"],
    "manager": ["works_for"], "engineer": ["works_for"],
    "employee": ["works_for"],
    "founder": ["founded_by"], "co-founder": ["founded_by"],
}


class DepRelationExtractor:
    """Extract typed relations using dependency parse — semantica-aligned."""

    def __init__(self, language: str = "en",
                 confidence_threshold: float = 0.3,
                 max_distance: int = 100):
        """Store the extractor configuration.

        Args:
            language: Language code used to pick the dependency-label rules
                (see ``_roles``).
            confidence_threshold: Relations scoring below this value are
                dropped by ``extract``.
            max_distance: Largest character distance allowed between two
                entities for a co-occurrence relation.
        """
        (self.language,
         self.confidence_threshold,
         self.max_distance) = language, confidence_threshold, max_distance

    def extract(self, text: str, entities: List[Entity],
                doc=None, **options) -> List[Relation]:
        semantica_rels = []
        if doc is not None:
            semantica_rels = self._extract_with_dep(text, doc, entities)
        semantica_rels.extend(self._extract_cooccurrence(text, entities))
        semantica_rels = self._infer_multi_hop(semantica_rels)
        semantica_rels = self._deduplicate(semantica_rels)
        return [r for r in semantica_rels if r.confidence >= self.confidence_threshold]

    # ------------------------------------------------------------------
    # Multi-hop inference (transitive property propagation)
    # ------------------------------------------------------------------

    @staticmethod
    def _infer_multi_hop(relations: List[Relation]) -> List[Relation]:
        """Infer transitive relations: A→B→C ⇒ A→C."""
        by_subj: Dict[str, List[Relation]] = {}
        for r in relations:
            if r.predicate == "related_to":
                continue
            by_subj.setdefault(r.subject.text.lower(), []).append(r)

        inferred = []
        for r in relations:
            if r.predicate == "related_to":
                continue
            obj_key = r.obj.text.lower()
            if obj_key in by_subj:
                for r2 in by_subj[obj_key]:
                    if r2.predicate in _MULTI_HOP.get(r.predicate, {}):
                        inferred_rel = _MULTI_HOP[r.predicate][r2.predicate]
                        if inferred_rel:
                            inferred.append(Relation(
                                subject=r.subject, predicate=inferred_rel,
                                obj=r2.obj, confidence=min(r.confidence, r2.confidence) * 0.9,
                                metadata={"method": "multi_hop",
                                          "via": f"{r.predicate}→{r2.predicate}"},
                            ))
        return relations + inferred

    # ------------------------------------------------------------------
    # Dependency extraction
    # ------------------------------------------------------------------

    # ------------------------------------------------------------------
    # Language-aware role mapping
    # ------------------------------------------------------------------

    def _roles(self) -> Dict[str, object]:
        """Return the role → dependency-rule table for the current language.

        The values are plain dep labels or tuples (see ``_LANG_DEP_RULES``),
        which is why the value type is ``object`` rather than ``str``.
        A language code without an entry falls back to the English table.
        """
        return _LANG_DEP_RULES.get(self.language, _LANG_DEP_RULES["en"])

    def _get_by_role(self, root, role: str, entity_map) -> list:
        """Collect the entities filling a semantic role under ``root``.

        The rule for ``role`` comes from ``_roles()`` and is either a dep
        label, a ``(dep, child_dep)`` pair, or a ``(dep, child_dep, marker)``
        triple whose marker must occur (as lemma or text) in the child's
        subtree.

        Returns:
            A list of ``(entity, prep)`` tuples. ``prep`` is the lower-cased
            lemma of the matched child when ``role`` is ``"prep_obj"`` and the
            rule is a tuple; otherwise it is ``None``.
        """
        rule = self._roles().get(role)
        if rule is None:
            return []

        matches = []

        # Simple rule: every child with the label contributes its entity.
        if isinstance(rule, str):
            for child in root.children:
                if child.dep_ != rule:
                    continue
                entity = self._entity_from_subtree(child, entity_map)
                if entity:
                    matches.append((entity, None))
            return matches

        if not isinstance(rule, tuple):
            return matches

        head_dep, leaf_dep = rule[0], rule[1]
        # Optional case marker (e.g. "由" for zh, "によって" for ja).
        marker = rule[2] if len(rule) > 2 else None
        wants_prep = role == "prep_obj"

        for child in root.children:
            if child.dep_ != head_dep:
                continue
            if marker and not any(
                tok.lemma_ == marker or tok.text == marker
                for tok in child.subtree
            ):
                continue

            if leaf_dep is None:
                # The labelled child itself carries the entity.
                source = child
            else:
                # Only the first grandchild with the wanted label is used.
                source = next(
                    (gc for gc in child.children if gc.dep_ == leaf_dep), None)
                if source is None:
                    continue

            entity = self._entity_from_subtree(source, entity_map)
            if entity:
                matches.append(
                    (entity, child.lemma_.lower() if wants_prep else None))
        return matches

    def _extract_with_dep(self, text, doc, entities) -> List[Relation]:
        relations = []
        entity_map = self._build_entity_map_multi(entities)
        is_de = self.language == "de"

        for sent in doc.sents:
            for token in sent:
                # German: ROOT is aux verb, real verb is "oc" child
                if is_de:
                    if token.dep_ != "ROOT":
                        continue
                    for c in token.children:
                        if c.dep_ == "oc":
                            # German: args attach to aux (ROOT), not main verb (oc)
                            # Pass both: root aux for args, oc for verb lemma
                            relations.extend(self._extract_from_root(text, c, entity_map, aux_root=token))
                    continue

                if token.dep_ != "ROOT":
                    continue
                relations.extend(self._extract_from_root(text, token, entity_map))
                if token.lemma_ == "be":
                    relations.extend(self._extract_copula(text, token, entity_map))

        return relations

    def _extract_from_root(self, text, root, entity_map, aux_root=None) -> List[Relation]:
        """Derive typed relations from one verb token and its arguments.

        Args:
            text: Source text. Not used by this method.
            root: Token supplying the verb lemma (falls back to its text when
                the lemma is empty).
            entity_map: Lower-cased entity text -> list of entities.
            aux_root: Optional second token whose children are also searched
                for arguments (used for German by ``_extract_with_dep``).

        Returns:
            Relations found through three patterns: passive with an explicit
            agent (confidence 0.90), active with a direct or prepositional
            object (0.85), and passive with a prepositional object but no
            agent (0.85). Returns an empty list when a negation child is
            present.
        """
        relations = []
        # Fall back to text when lemma is empty (zh, ja don't have lemmatizers)
        verb_lemma = (root.lemma_ or root.text).lower()
        # For languages like German where args attach to aux verb
        check = root if aux_root is None else aux_root

        # Negation: only the direct children of `check` are inspected.
        if any(c.dep_ in ("neg", "advmod:neg") for c in check.children):
            return relations

        # Extract roles (check both the main verb and optional aux parent)
        def first(lst):
            # First entity of a [(entity, prep), ...] list, or None.
            return lst[0][0] if lst else None
        def get_roles(token):
            # (subj, pass_subj, dobj, agent, prep_obj list, has an "aux" child)
            return (
                first(self._get_by_role(token, "subj", entity_map)),
                first(self._get_by_role(token, "pass_subj", entity_map)),
                first(self._get_by_role(token, "dobj", entity_map)),
                first(self._get_by_role(token, "agent", entity_map)),
                self._get_by_role(token, "prep_obj", entity_map),
                any(c.dep_ == "aux" for c in token.children),
            )

        s1, sp1, d1, a1, p1, h1 = get_roles(root)
        s2, sp2, d2, a2, p2, h2 = (None, None, None, None, [], False)
        if aux_root:
            s2, sp2, d2, a2, p2, h2 = get_roles(aux_root)

        # Merge: prefer found roles from aux if main verb lacks them
        nsubj = s1 or s2
        nsubjpass = sp1 or sp2
        dobj = d1 or d2
        agent_entity = a1 or a2
        prep_list = p1 + p2
        # Always True when an aux_root was supplied.
        has_aux = h1 or h2 or aux_root is not None
        has_explicit_agent = agent_entity is not None

        # Detect passive:
        # - explicit pass_subj (en, fr, pt)
        # - subj + agent + aux (Spanish-style)
        # - subj + agent for languages with agent marker (zh, ja)
        is_passive_candidate = has_explicit_agent and (has_aux or self.language in ("zh", "ja"))

        # In a passive candidate the plain subject is reinterpreted as the
        # passive subject and no active-voice relation is produced.
        effective_nsubjpass = nsubjpass or (nsubj if is_passive_candidate else None)
        effective_nsubj = nsubj if not is_passive_candidate else None

        # Passive: X was founded/acquired by Y
        if effective_nsubjpass and agent_entity:
            prep = ""
            # Try language-appropriate prepositions/case markers; the first
            # candidate with a table entry for this verb wins.
            candidates = ("by", "von", "par", "por", "durch", "由", "によって")
            for candidate in candidates:
                if self._lookup(verb_lemma, candidate):
                    prep = candidate
                    break
            rel_type = self._lookup(verb_lemma, prep) if prep else None
            if rel_type:
                # founded_by / acquired keep the passive subject as relation
                # subject; every other type puts the agent first.
                # NOTE(review): for "employ+by" / "hire+by" -> works_for this
                # yields (agent, works_for, passive subject), i.e. presumably
                # (employer, works_for, employee) - confirm this is intended.
                if rel_type in ("founded_by", "acquired"):
                    subj, obj = effective_nsubjpass, agent_entity
                else:
                    subj, obj = agent_entity, effective_nsubjpass
                relations.append(self._make_rel(subj, rel_type, obj, 0.90, "passive", verb_lemma))

        # Active: X VERB Y or X VERB prep Y
        if effective_nsubj:
            if dobj:
                # Bare-verb lookup (no preposition), e.g. "join".
                rt = self._lookup(verb_lemma, None)
                if rt:
                    relations.append(self._make_rel(effective_nsubj, rt, dobj, 0.85, "active", verb_lemma))
            for prep_entity, prep_l in prep_list:
                rt = self._lookup(verb_lemma, prep_l)
                if rt:
                    relations.append(self._make_rel(effective_nsubj, rt, prep_entity, 0.85,
                                                    "active_prep", verb_lemma, prep=prep_l))

        # Passive with prep ("is based in")
        if effective_nsubjpass and prep_list and not agent_entity:
            for prep_entity, prep_l in prep_list:
                rt = self._lookup(verb_lemma, prep_l)
                if not rt:
                    # Fallback key "be+<verb>+<prep>".
                    # NOTE(review): no key of that shape is visible in
                    # _VERB_RELATIONS, so this lookup looks like it never hits.
                    rt = self._lookup("be+" + verb_lemma, prep_l)
                if rt:
                    relations.append(self._make_rel(effective_nsubjpass, rt, prep_entity, 0.85,
                                                    "passive_prep", verb_lemma, prep=prep_l))

        return relations

    @staticmethod
    def _make_rel(subj, pred, obj, conf, method, verb, prep=""):
        m = {"method": method, "verb": verb}
        if prep:
            m["prep"] = prep
        return Relation(subject=subj, predicate=pred, obj=obj,
                         confidence=conf, metadata=m)

    @staticmethod
    def _already_has(rels, subj, pred, obj) -> bool:
        for r in rels:
            if r.subject.text == subj.text and r.predicate == pred and r.obj.text == obj.text:
                return True
        return False

    def _extract_copula(self, text, root, entity_map) -> List[Relation]:
        """Extract relations from "<subject> is <title> <prep> <object>".

        The subject comes from the language's ``subj`` rule. The title is the
        lemma of an ``attr``/``pred`` child of ``root``; the object is the
        entity found under the first child of one of the title's
        ``prep``/``mo``/``case`` children (any dep label is accepted for that
        child). The first (title, object) pair that resolves to an entity is
        used and the scan stops there, so a later preposition that resolves to
        nothing, or to a different entity, can no longer overwrite it.

        The title is then matched against ``_COPULA_TITLE_MAP`` (substring,
        first keyword wins) and one relation per mapped type is emitted with
        confidence 0.88. ``founded_by`` is oriented like the passive branch of
        ``_extract_from_root``: the organisation is the relation subject and
        the founder is the object.

        Returns:
            The relations found, or an empty list when there is no subject,
            no title/object pair, or no matching keyword.
        """
        subjects = self._get_by_role(root, "subj", entity_map)
        subj = subjects[0][0] if subjects else None
        if not subj:
            return []

        complement_deps = ("attr", "pred")    # attr = en, pred = de
        marker_deps = ("prep", "mo", "case")  # prep = en, mo = de, case = UD

        title_lemma = None
        prep_obj = None
        for complement in root.children:
            if complement.dep_ not in complement_deps:
                continue
            for marker in complement.children:
                if marker.dep_ not in marker_deps:
                    continue
                # Only the first child of the preposition is considered.
                first_child = next(iter(marker.children), None)
                if first_child is None:
                    continue
                candidate = self._entity_from_subtree(first_child, entity_map)
                if candidate:
                    title_lemma = complement.lemma_.lower()
                    prep_obj = candidate
                    break
            if prep_obj:
                break

        if not title_lemma or not prep_obj:
            return []

        relations = []
        for keyword, rel_types in _COPULA_TITLE_MAP.items():
            if keyword not in title_lemma:
                continue
            for rel_type in rel_types:
                if rel_type == "founded_by":
                    # "X is the founder of Y"  =>  Y founded_by X
                    head, tail = prep_obj, subj
                else:
                    head, tail = subj, prep_obj
                relations.append(Relation(
                    subject=head, predicate=rel_type, obj=tail,
                    confidence=0.88, context=text,
                    metadata={"method": "copula", "title": title_lemma},
                ))
            break
        return relations

    # ------------------------------------------------------------------
    # Better entity map: multi-occurrence aware
    # ------------------------------------------------------------------

    @staticmethod
    def _build_entity_map_multi(entities: List[Entity]) -> Dict[str, List[Entity]]:
        """Build entity map that keeps ALL occurrences per name."""
        result: Dict[str, List[Entity]] = {}
        for e in entities:
            key = e.text.lower()
            result.setdefault(key, []).append(e)
            cleaned = e.text.rstrip(".,;:!?").strip().lower()
            if cleaned != key:
                result.setdefault(cleaned, []).append(e)
        return result

    @staticmethod
    def _find_best_entity(key: str, entity_map: Dict[str, List[Entity]],
                          fallback_text: str = "") -> Optional[Entity]:
        """Find the best entity match. If multiple, prefer the one whose
        text is an exact match for fallback_text, or the first one."""
        entries = entity_map.get(key.lower(), [])
        if not entries:
            return None
        if len(entries) == 1:
            return entries[0]
        # Prefer exact text match
        for e in entries:
            if e.text.lower() == fallback_text.lower():
                return e
        return entries[0]

    # ------------------------------------------------------------------
    # Argument extraction helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _get_child_entity(token, dep, entity_map):
        for c in token.children:
            if c.dep_ == dep:
                return DepRelationExtractor._entity_from_subtree(c, entity_map)
        return None

    @staticmethod
    def _get_agent_pobj(root, entity_map):
        for c in root.children:
            if c.dep_ == "agent":
                for gc in c.children:
                    if gc.dep_ == "pobj":
                        return DepRelationExtractor._entity_from_subtree(gc, entity_map)
        return None

    @staticmethod
    def _get_prep_objs(root, entity_map):
        results = []
        for c in root.children:
            if c.dep_ == "prep":
                prep_lemma = c.lemma_.lower()
                for gc in c.children:
                    if gc.dep_ == "pobj":
                        ent = DepRelationExtractor._entity_from_subtree(gc, entity_map)
                        if ent:
                            results.append((prep_lemma, ent))
        return results

    @staticmethod
    def _entity_from_subtree(token, entity_map) -> Optional[Entity]:
        """Match token's subtree against entity map. Uses character positions
        for conjunction handling."""
        min_char = token.idx
        max_char = token.idx + len(token.text)
        for t in token.subtree:
            if t.dep_ not in ("prep", "punct", "det", "aux", "auxpass", "cc", "conj"):
                if t.idx < min_char:
                    min_char = t.idx
                end = t.idx + len(t.text)
                if end > max_char:
                    max_char = end
        text = token.doc.text[min_char:max_char].strip()
        key = text.lower()
        # Try multi-map lookup
        entries = entity_map.get(key, [])
        if not entries:
            for sep in (" and ", " or ", ", "):
                if sep in key:
                    entries = entity_map.get(key.split(sep)[0].strip(), [])
                    if entries:
                        break
        if not entries:
            for ek, ev in entity_map.items():
                if ek in key or key in ek:
                    entries = ev
                    break
        if entries:
            return entries[0]
        return None

    @staticmethod
    def _lookup(verb: str, prep: Optional[str] = None) -> Optional[str]:
        """Look up the relation type for a verb, optionally with a preposition.

        The table key is ``"<verb>+<prep>"`` when ``prep`` is non-empty and the
        bare verb otherwise. Returns ``None`` for unknown keys.
        """
        table_key = f"{verb}+{prep}" if prep else verb
        return _VERB_RELATIONS.get(table_key)

    @staticmethod
    def _deduplicate(relations: List[Relation]) -> List[Relation]:
        seen = set()
        result = []
        for r in relations:
            key = (r.subject.text.lower(), r.predicate, r.obj.text.lower())
            rev = (r.obj.text.lower(), r.predicate, r.subject.text.lower())
            if key in seen or rev in seen:
                continue
            seen.add(key)
            result.append(r)
        return result

    # ------------------------------------------------------------------
    # Co-occurrence
    # ------------------------------------------------------------------

    def _extract_cooccurrence(self, text, entities) -> List[Relation]:
        if len(entities) < 2:
            return []
        import re as _re
        spans = [(m.start(), m.end())
                 for m in _re.finditer(r'[^.!?]+(?:[.!?](?=\s|$))+', text)]

        def same_sent(c1, c2):
            return any(ss <= c1 < se and ss <= c2 < se for ss, se in spans)

        rels = []
        for i in range(len(entities)):
            for j in range(i + 1, len(entities)):
                e1, e2 = entities[i], entities[j]
                if not same_sent(e1.start_char, e2.start_char):
                    continue
                if abs(e2.start_char - e1.end_char) > self.max_distance:
                    continue
                cs = max(0, min(e1.start_char, e2.start_char) - 20)
                ce = min(len(text), max(e1.end_char, e2.end_char) + 20)
                rels.append(Relation(
                    subject=e1, predicate="related_to", obj=e2,
                    confidence=0.4, context=text[cs:ce],
                    metadata={"method": "cooccurrence"},
                ))
        return rels