2026-05-11 12:59:59 +08:00
|
|
|
|
#
|
|
|
|
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
|
|
|
|
#
|
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
|
#
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
#
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
|
#
|
|
|
|
|
|
"""
|
|
|
|
|
|
spaCy-based entity and relationship extractor for GraphRAG.
|
|
|
|
|
|
|
|
|
|
|
|
Combines techniques from **LinearRAG** and **MGranRAG**:
|
|
|
|
|
|
|
|
|
|
|
|
* **Entity extraction** uses MGranRAG's multi-pass stacking algorithm
|
|
|
|
|
|
(hyphen/apostrophe merging → capitalised-word merging → continuous
|
|
|
|
|
|
noun/number merging) combined with spaCy NER, then deduplicated via
|
|
|
|
|
|
``ner_all_keywords``.
|
|
|
|
|
|
* **Relationship inference** follows LinearRAG's *relation-free* approach:
|
|
|
|
|
|
entities co-occurring in the same sentence (or nearby sentences) are
|
|
|
|
|
|
linked by implicit semantic edges whose description is the shared
|
|
|
|
|
|
sentence text (semantic bridging). Edge weights are optionally TF-
|
|
|
|
|
|
normalised.
|
|
|
|
|
|
|
|
|
|
|
|
No LLM calls are needed for the extraction step itself. The LLM is only
|
|
|
|
|
|
used downstream (inherited from ``Extractor``) for merging / summarising
|
|
|
|
|
|
duplicate entity descriptions when the same entity appears in multiple
|
|
|
|
|
|
chunks.
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import logging
|
|
|
|
|
|
from collections import defaultdict
|
|
|
|
|
|
|
|
|
|
|
|
from rag.graphrag.general.extractor import Extractor
|
|
|
|
|
|
from rag.llm.chat_model import Base as CompletionLLM
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# spaCy model loading (lazy, module-level singleton)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Cached spaCy pipeline; stays ``None`` until ``_load_spacy_model`` has loaded
# a model successfully.
_nlp = None
# Name of the model currently held in ``_nlp`` ("" while nothing is loaded).
# ``_load_spacy_model`` compares against it to decide whether to reuse the cache.
_nlp_model_name = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _load_spacy_model(model_name: str = "en_core_web_sm"):
    """Load (or return the cached) spaCy language model.

    The loaded pipeline is stored in the module-level ``_nlp`` /
    ``_nlp_model_name`` pair.  A call with the same *model_name* returns the
    cached pipeline; a call with a different name loads that model and
    replaces the cache.  If ``spacy.load`` raises ``OSError`` (treated here as
    "model not installed"), the model is downloaded with
    ``spacy.cli.download`` and loaded a second time.

    Parameters
    ----------
    model_name : str
        Name of the spaCy model to load.

    Returns
    -------
    The loaded spaCy pipeline object.

    Raises
    ------
    ImportError
        If spaCy itself cannot be imported.  The original import failure is
        attached as ``__cause__``.
    """
    global _nlp, _nlp_model_name
    if _nlp is not None and _nlp_model_name == model_name:
        return _nlp
    try:
        import spacy
    except ImportError as err:
        # Name the model that was actually requested in the install hint and
        # keep the underlying import failure attached for debugging.
        raise ImportError(
            "spaCy is required for the spacy GraphRAG method. "
            f"Install it with: pip install spacy && python -m spacy download {model_name}"
        ) from err
    try:
        _nlp = spacy.load(model_name)
        logging.info("Loaded spaCy model '%s'", model_name)
    except OSError:
        logging.warning(
            "spaCy model '%s' not found; downloading automatically …", model_name
        )
        from spacy.cli import download as spacy_download

        spacy_download(model_name)
        # Any error from this second attempt propagates to the caller; the
        # previously cached pipeline (if any) is left untouched in that case.
        _nlp = spacy.load(model_name)
        logging.info("Downloaded and loaded spaCy model '%s'", model_name)
    _nlp_model_name = model_name
    return _nlp
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# spaCy ↔ application entity-type mapping
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# spaCy's built-in entity labels → the application-level types used by
|
|
|
|
|
|
# ``DEFAULT_ENTITY_TYPES``. Labels not listed here fall through to
|
|
|
|
|
|
# ``"category"``.
|
|
|
|
|
|
SPACY_TO_APP_ENTITY_TYPE: dict[str, str] = {
    # People and organisations.
    "PERSON": "person",
    "ORG": "organization",
    # Geopolitical entities, locations and facilities all map to "geo".
    "GPE": "geo",
    "LOC": "geo",
    "FAC": "geo",
    "EVENT": "event",
    # Everything below that is not temporal is folded into "category".
    "PRODUCT": "category",
    "WORK_OF_ART": "category",
    "LAW": "category",
    "LANGUAGE": "category",
    "NORP": "category",
    "MONEY": "category",
    "QUANTITY": "category",
    # Temporal expressions are treated as events.
    "TIME": "event",
    "DATE": "event",
}
|
|
|
|
|
|
|
|
|
|
|
|
# Labels to skip entirely (from LinearRAG: ordinals / cardinals are rarely
|
|
|
|
|
|
# useful as graph nodes).
|
|
|
|
|
|
# Checked by ``get_ner``: entities carrying one of these labels are dropped.
_SKIP_SPACY_LABELS: set[str] = {"ORDINAL", "CARDINAL"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# MGranRAG-style multi-pass keyword extraction
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def _has_uppercase(text: str) -> bool:
    """Return ``True`` when *text* contains at least one uppercase character."""
    for ch in text:
        if ch.isupper():
            return True
    return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _replace_word(word: str) -> str:
    """Remove the spaces around hyphens and before ``'s`` / ``'S`` (from MGranRAG).

    The substitutions are applied one after another, in the order listed.
    """
    substitutions = (
        (" - ", "-"),
        (" -", "-"),
        ("- ", "-"),
        (" 's", "'s"),
        (" 'S", "'S"),
    )
    for spaced, tight in substitutions:
        word = word.replace(spaced, tight)
    return word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_keywords(spacy_doc) -> set[str]:
    """MGranRAG-style 3-pass stacking keyword extraction.

    *spacy_doc* only needs to be an iterable of tokens exposing ``text``,
    ``shape_`` and ``pos_``.

    Phase 1 — Hyphen / apostrophe merging:
        A ``-`` or ``'x``-shaped token (tagged ``PUNCT`` / ``PART``) is glued
        onto the preceding token, and the token following a hyphen is glued
        on as well.  The merged phrase is labelled ``NP`` (e.g.
        ``New-York``, ``cat's``).  A joiner with nothing to its left (for
        example a leading ``-`` bullet) is discarded and does not swallow
        the token that follows it.

    Phase 2 — Capitalised-word merging:
        A token whose ``shape_`` contains ``X`` (i.e. it contains an
        uppercase letter), or a function word (ADP, CCONJ, DET, PART), is
        appended to the previous entry when that entry is capitalised,
        producing phrases like ``King of England``.  Merged results are
        labelled ``NX`` unless already ``PROPN``.

    Phase 3 — Continuous noun / number merging:
        Consecutive entries with POS in ``[PROPN, NOUN, NUM, NX, NP]``
        are merged and labelled ``NNN`` (unless already ``PROPN``).

    Finally, trailing lowercase words that are not PROPN / NOUN / NUM / PART
    are stripped from a phrase (the first word is always kept), and
    lowercase coordinating conjunctions inside a phrase cause it to be split
    so that each part is also extracted individually (e.g. ``Bob and Lucy``
    → ``Bob``, ``Lucy``).

    Returns
    -------
    set[str]
        The extracted keyword strings.
    """
    # ── Phase 1: hyphen / apostrophe ──────────────────────────────────
    f1_word: list[str] = []
    f1_shape: list[str] = []
    f1_pos: list[str] = []
    f1_pos_list: list[list[str]] = []
    f1_word_list: list[list[str]] = []

    # True while the token right after a hyphen still has to be attached.
    is_right = False
    for token in spacy_doc:
        is_joiner = token.shape_ in ("'x", "-") and token.pos_ in ("PUNCT", "PART")
        if is_joiner:
            if not f1_word:
                # Nothing to attach to: drop the joiner, and do NOT arm
                # ``is_right`` — otherwise the next real token would be
                # silently discarded as well.
                continue
            if token.shape_ == "-":
                is_right = True
            attach = True
        elif is_right:
            # ``is_right`` is only ever set when a left-hand word exists.
            is_right = False
            attach = True
        else:
            attach = False

        if attach:
            f1_word[-1] += token.text
            f1_pos[-1] = "NP"
            f1_pos_list[-1].append(token.pos_)
            f1_word_list[-1].append(token.text)
        else:
            f1_word.append(token.text)
            f1_shape.append(token.shape_)
            f1_pos.append(token.pos_)
            f1_pos_list.append([token.pos_])
            f1_word_list.append([token.text])

    # ── Phase 2: capitalised-word merging ───────────────────────────
    f2_word: list[str] = []
    f2_shape: list[str] = []
    f2_pos: list[str] = []
    f2_pos_list: list[list[str]] = []
    f2_word_list: list[list[str]] = []

    for cw, cs, cp, cpl, cwl in zip(f1_word, f1_shape, f1_pos, f1_pos_list, f1_word_list):
        mergeable = "X" in cs or cp in ("ADP", "CCONJ", "DET", "PART")
        if mergeable and f2_word and "X" in f2_shape[-1]:
            # Merge with previous capitalised token.
            f2_word[-1] += " " + cw
            f2_shape[-1] += "X"
            if f2_pos[-1] != "PROPN":
                f2_pos[-1] = "NX"
            f2_pos_list[-1].extend(cpl)
            f2_word_list[-1].extend(cwl)
        else:
            f2_word.append(cw)
            # A capitalised token that opens a new entry gets a "Start" suffix.
            f2_shape.append(cs + "Start" if "X" in cs else cs)
            f2_pos.append(cp)
            f2_pos_list.append(cpl)
            f2_word_list.append(cwl)

    # ── Phase 3: continuous noun / number merging ───────────────────
    f3_word: list[str] = []
    f3_shape: list[str] = []
    f3_pos: list[str] = []
    f3_pos_list: list[list[str]] = []
    f3_word_list: list[list[str]] = []

    _noun_pos = {"PROPN", "NOUN", "NUM", "NX", "NP"}
    _noun_pos_ext = _noun_pos | {"NNN"}

    for cw, cs, cp, cpl, cwl in zip(f2_word, f2_shape, f2_pos, f2_pos_list, f2_word_list):
        if cp in _noun_pos and f3_word and f3_pos[-1] in _noun_pos_ext:
            f3_word[-1] += " " + cw
            f3_shape[-1] += "X"
            if f3_pos[-1] != "PROPN":
                f3_pos[-1] = "NNN"
            f3_pos_list[-1].extend(cpl)
            f3_word_list[-1].extend(cwl)
        else:
            f3_word.append(cw)
            f3_shape.append(cs)
            f3_pos.append(cp)
            f3_pos_list.append(cpl)
            f3_word_list.append(cwl)

    # ── Final keyword collection ────────────────────────────────────
    keywords: set[str] = set()
    tail_keep_pos = ("PROPN", "NOUN", "NUM", "PART")
    for cw, cp, cpl, cwl in zip(f3_word, f3_pos, f3_pos_list, f3_word_list):
        if cp not in _noun_pos_ext:
            continue

        # Truncate trailing lowercase non-noun / non-number words.
        if cwl and not _has_uppercase(cwl[-1]) and cpl[-1] not in tail_keep_pos:
            # Walk back from the end until a word worth keeping is found.
            # The first word is always kept, so a phrase such as
            # ``Paris in the`` collapses to ``Paris`` (not ``Paris in``).
            keep = len(cwl)
            while keep > 1 and not (
                cpl[keep - 1] in tail_keep_pos or _has_uppercase(cwl[keep - 1])
            ):
                keep -= 1
            keywords.add(_replace_word(" ".join(cwl[:keep])))
        else:
            keywords.add(_replace_word(cw))

        # Split on coordinating conjunctions (and/or) inside merged
        # phrases so that individual proper nouns are also extracted
        # (e.g. ``Bob and Lucy`` → ``Bob``, ``Lucy``).
        # NOTE(review): when the phrase holds no conjunction this re-adds the
        # whole, untruncated phrase — confirm whether that is intended.
        if any(p in ("PROPN", "NOUN", "NUM") for p in cpl):
            cur_kws: list[str] = []
            for pos, text in zip(cpl, cwl):
                if pos == "CCONJ" and text and text[0].islower():
                    if cur_kws:
                        keywords.add(_replace_word(" ".join(cur_kws)))
                    cur_kws = []
                else:
                    cur_kws.append(text)
            if cur_kws:
                keywords.add(_replace_word(" ".join(cur_kws)))

    return keywords
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_ner(spacy_doc) -> dict[str, str]:
    """Map every named-entity string in *spacy_doc* to its spaCy label.

    Entities whose label is in ``_SKIP_SPACY_LABELS`` are ignored.  Entity
    text spanning several lines is split on ``"\\n"`` and each non-empty,
    stripped line is recorded separately under the same label.  When the
    same text occurs more than once, the label of the last occurrence wins.
    """
    labelled: dict[str, str] = {}
    for span in spacy_doc.ents:
        label = span.label_
        if label in _SKIP_SPACY_LABELS:
            continue
        pieces = (piece.strip() for piece in span.text.strip().split("\n"))
        labelled.update({piece: label for piece in pieces if piece})
    return labelled
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def ner_all_keywords(spacy_doc) -> set[str]:
    """Merge rule-based keywords with spaCy NER entity texts (MGranRAG).

    The result is a new set holding:

    - every keyword produced by the 3-pass stacking algorithm
      (``extract_keywords``), and
    - every entity text reported by ``get_ner``.
    """
    rule_based = extract_keywords(spacy_doc)
    ner_texts = set(get_ner(spacy_doc))
    return rule_based | ner_texts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Main extractor class
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
class GraphExtractor(Extractor):
    """Extract entities and relationships using spaCy (no LLM calls).

    Entity extraction
        MGranRAG's ``ner_all_keywords`` combines a 3-pass stacking
        keyword algorithm with spaCy NER, yielding broader coverage than
        NER alone (e.g. it catches compound nouns, hyphenated terms, and
        multi-word proper nouns that NER might miss).

    Relationship inference
        LinearRAG's *relation-free* semantic bridging: entities
        co-occurring in the same sentence (or within
        ``max_sentence_distance`` sentences) are linked by an implicit
        edge.  Node and edge descriptions are currently emitted as empty
        strings; the sentence-based description helpers
        (``_keyword_sent_text``, ``_cooccurrence_description``) are kept on
        the class but are not called.

        Optionally, edge weights are TF-based (LinearRAG).  Each entity gets
        ``tf = count(entity_in_chunk) / sum(all_entity_counts_in_chunk)``
        and an edge weighs ``max(tf(a) + tf(b), 0.01)``.

    The ``llm_invoker`` is only used downstream for merging / summarising
    duplicate descriptions (inherited from ``Extractor``).

    Parameters
    ----------
    llm_invoker : CompletionLLM
        LLM handle (used only for description summarisation, not extraction).
    language : str
        Language hint.
    entity_types : list[str] | None
        Application-level entity types to keep.  Entities whose mapped
        type is not in this list are discarded.
    spacy_model : str
        Name of the spaCy model to load (default ``en_core_web_sm``).
    max_sentence_distance : int
        Maximum difference between the sentence indices of two entities
        for them to be linked.  ``0`` links only entities found in the same
        sentence; the default ``1`` also links entities in directly
        adjacent sentences.
    relationship_strength : int
        Default weight assigned to every inferred relationship when
        ``use_tf_weight`` is ``False``.
    use_tf_weight : bool
        If ``True``, use TF-based weighting (LinearRAG-style) for
        edge weights instead of the fixed ``relationship_strength``.
    """

    def __init__(
        self,
        llm_invoker: CompletionLLM,
        language: str | None = "English",
        entity_types: list[str] | None = None,
        spacy_model: str = "en_core_web_sm",
        max_sentence_distance: int = 1,
        relationship_strength: int = 1,
        use_tf_weight: bool = False,
    ):
        super().__init__(llm_invoker, language, entity_types)
        self._spacy_model_name = spacy_model
        self._max_sentence_distance = max_sentence_distance
        self._relationship_strength = relationship_strength
        self._use_tf_weight = use_tf_weight
        # Eagerly load the model so import errors surface early.
        self._nlp = _load_spacy_model(spacy_model)

    # ------------------------------------------------------------------
    # Public interface – called by ``Extractor.__call__``
    # ------------------------------------------------------------------

    async def _process_single_content(
        self,
        chunk_key_dp: tuple[str, str],
        chunk_seq: int,
        num_chunks: int,
        out_results,
        task_id="",
    ):
        """Process one chunk through spaCy NER + keyword stacking + co-occurrence.

        Parameters
        ----------
        chunk_key_dp : tuple[str, str]
            ``(chunk_key, content)``; the key is stored as ``source_id`` on
            every emitted node and edge.
        chunk_seq : int
            Zero-based position of the chunk (only used in the progress message).
        num_chunks : int
            Total number of chunks (only used for progress reporting).
        out_results : list
            Receives one ``(nodes, edges, token_count)`` tuple for this chunk.
        task_id : str
            Not used by this implementation.
        """
        chunk_key = chunk_key_dp[0]
        content = chunk_key_dp[1]
        doc = self._nlp(content)

        # ── 1. Entity extraction (MGranRAG: ner_all_keywords) ────────
        # Keyword text → spaCy label, for the keywords that NER recognised.
        ner_label_map: dict[str, str] = get_ner(doc)
        all_keywords = ner_all_keywords(doc)
        # Build the allowed-type set once instead of once per keyword.
        allowed_types = self._entity_types_set

        ent_records: dict[str, dict] = {}  # entity_name_upper → record
        ent_by_sent: dict[int, list[dict]] = defaultdict(list)

        for kw in all_keywords:
            kw_upper = kw.strip().upper()
            if not kw_upper:
                continue

            # Entity type: the NER label when available, POS heuristics otherwise.
            spacy_label = ner_label_map.get(kw)
            if spacy_label:
                app_type = SPACY_TO_APP_ENTITY_TYPE.get(spacy_label, "category")
            else:
                app_type = self._infer_type_from_pos(doc, kw)

            if app_type not in allowed_types:
                continue

            # Sentence the keyword is attributed to (drives co-occurrence).
            sent_idx = self._keyword_sent_idx(doc, kw)

            # The description is left empty; the sentence-based variant
            # (``_keyword_sent_text``) is currently disabled.
            ent_record = dict(
                entity_name=kw_upper,
                entity_type=app_type.upper(),
                description="",
                source_id=chunk_key,
            )
            # A keyword may appear multiple times; keep the first.
            if kw_upper not in ent_records:
                ent_records[kw_upper] = ent_record
                ent_by_sent[sent_idx].append(ent_record)

        maybe_nodes: dict[str, list[dict]] = {
            name: [rec] for name, rec in ent_records.items()
        }

        # ── 2. Relationship inference (LinearRAG: sentence co-occurrence) ─
        maybe_edges: dict[tuple, list[dict]] = {}

        # Pre-compute TF weights if needed (LinearRAG).
        entity_tf: dict[str, float] = {}
        if self._use_tf_weight:
            # Upper-case the chunk once and count every entity once.
            content_upper = content.upper()
            counts = {name: content_upper.count(name) for name in ent_records}
            total_count = sum(counts.values())
            entity_tf = {
                name: (count / total_count if total_count > 0 else 0.0)
                for name, count in counts.items()
            }

        seen_pairs: set[tuple[str, str]] = set()
        for si in sorted(ent_by_sent.keys()):
            ents_in_range = list(ent_by_sent[si])
            # Expand with nearby sentences.
            for offset in range(1, self._max_sentence_distance + 1):
                for nb_si in (si + offset, si - offset):
                    if nb_si in ent_by_sent:
                        ents_in_range.extend(ent_by_sent[nb_si])
            # Deduplicate by entity name, keeping first-seen order.
            names = list(dict.fromkeys(e["entity_name"] for e in ents_in_range))

            for a_idx, name_a in enumerate(names):
                for name_b in names[a_idx + 1:]:
                    pair = tuple(sorted([name_a, name_b]))
                    if pair in seen_pairs:
                        continue
                    seen_pairs.add(pair)

                    # Edge weight: TF-based (LinearRAG) or fixed.
                    if self._use_tf_weight:
                        w = entity_tf.get(name_a, 0.0) + entity_tf.get(name_b, 0.0)
                        weight = max(w, 0.01)
                    else:
                        weight = self._relationship_strength

                    # The description is left empty; the sentence-based variant
                    # (``_cooccurrence_description``) is currently disabled.
                    # Keywords for the edge: the two entity names.
                    maybe_edges[pair] = [
                        dict(
                            src_id=pair[0],
                            tgt_id=pair[1],
                            weight=weight,
                            description="",
                            keywords=[name_a, name_b],
                            source_id=chunk_key,
                        )
                    ]

        token_count = len(doc)
        out_results.append((maybe_nodes, maybe_edges, token_count))
        if self.callback:
            self.callback(
                0.5 + 0.1 * len(out_results) / num_chunks,
                msg=f"[spacy] Entities extraction of chunk {chunk_seq+1} "
                f"{len(out_results)}/{num_chunks} done, "
                f"{len(maybe_nodes)} nodes, {len(maybe_edges)} edges, "
                f"{token_count} tokens.",
            )

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @property
    def _entity_types_set(self) -> set[str]:
        """Lower-cased set of ``self._entity_types``."""
        return {t.lower() for t in self._entity_types}

    @staticmethod
    def _infer_type_from_pos(doc, keyword: str) -> str:
        """Infer an application-level entity type from POS tags when the
        keyword was found by the stacking algorithm but not by NER.

        The first token whose upper-cased text equals the keyword, or starts
        with the keyword's first word, decides: ``PROPN`` → ``"person"``,
        ``NOUN`` → ``"category"``, ``NUM`` → ``"event"``.  For any other POS,
        or when no token matches, a keyword containing an uppercase letter
        is typed ``"person"`` and everything else ``"category"``.
        """
        kw_upper = keyword.upper()
        kw_words = kw_upper.split()
        # An empty / whitespace-only keyword has no first word to match on;
        # skip the token scan instead of raising IndexError.
        if kw_words:
            first_word = kw_words[0]
            for token in doc:
                token_upper = token.text.upper()
                if token_upper == kw_upper or token_upper.startswith(first_word):
                    if token.pos_ == "PROPN":
                        return "person"
                    if token.pos_ == "NOUN":
                        return "category"
                    if token.pos_ == "NUM":
                        return "event"
                    break
        # Fallback: check for uppercase → likely a named entity.
        if _has_uppercase(keyword):
            return "person"
        return "category"

    @staticmethod
    def _keyword_sent_idx(doc, keyword: str) -> int:
        """Return the index of the first sentence containing *keyword*.

        Matching is a case-insensitive substring test; ``0`` is returned
        when no sentence matches.
        """
        kw_lower = keyword.lower()
        for i, sent in enumerate(doc.sents):
            if kw_lower in sent.text.lower():
                return i
        return 0

    @staticmethod
    def _keyword_sent_text(doc, keyword: str) -> str | None:
        """Return the stripped text of the first sentence containing
        *keyword* (LinearRAG semantic bridging), or ``None`` if there is none."""
        kw_lower = keyword.lower()
        for sent in doc.sents:
            if kw_lower in sent.text.lower():
                return sent.text.strip()
        return None

    @staticmethod
    def _cooccurrence_description(doc, head_name: str, tail_name: str) -> str:
        """Derive a relationship description using sentence co-occurrence
        (LinearRAG) with dependency-path enhancement as fallback.

        If both entities appear in the same sentence, that sentence is
        used as the description (semantic bridging).  Otherwise, try to
        find a lowest common ancestor in the dependency tree.  Failing that,
        return the first sentence mentioning the head entity, and as a last
        resort a generic statement.
        """
        head_lower = head_name.lower()
        tail_lower = tail_name.lower()

        # Primary: shared sentence text (LinearRAG semantic bridging).
        for sent in doc.sents:
            sent_lower = sent.text.lower()
            if head_lower in sent_lower and tail_lower in sent_lower:
                return sent.text.strip()

        # Fallback: dependency path via LCA.
        head_tok = GraphExtractor._find_token_by_text(doc, head_name)
        tail_tok = GraphExtractor._find_token_by_text(doc, tail_name)
        if head_tok is not None and tail_tok is not None:
            path_tail = list(GraphExtractor._ancestor_path(tail_tok))
            # First node on the head's path that also lies on the tail's path.
            lca = next(
                (h for h in GraphExtractor._ancestor_path(head_tok) if h in path_tail),
                None,
            )
            if lca is not None and lca is not head_tok and lca is not tail_tok:
                return f"{head_name} is related to {tail_name} via '{lca.lemma_}'"

        # Final fallback: the sentence mentioning the head entity.
        head_sent = GraphExtractor._find_sent_for_text(doc, head_lower)
        if head_sent is not None:
            return head_sent.text.strip()

        return f"{head_name} is related to {tail_name}"

    @staticmethod
    def _find_token_by_text(doc, ent_name: str):
        """Return the root token of the first spaCy entity whose text matches
        *ent_name* (case-insensitively), else the first token with that
        text, else ``None``."""
        target = ent_name.upper()
        for ent in doc.ents:
            if ent.text.strip().upper() == target:
                return ent.root
        # Fallback: token-level match for keywords not in doc.ents.
        for token in doc:
            if token.text.strip().upper() == target:
                return token
        return None

    @staticmethod
    def _find_sent_for_text(doc, text_lower: str):
        """Return the first sentence ``Span`` whose lower-cased text contains
        *text_lower*, or ``None``."""
        for sent in doc.sents:
            if text_lower in sent.text.lower():
                return sent
        return None

    @staticmethod
    def _ancestor_path(token):
        """Yield *token* followed by each of its ``ancestors``."""
        yield token
        yield from token.ancestors
|