mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-02 00:35:46 +08:00
Follow #15486 Co-authored-by: limuting <limuting233@gmail.com> Co-authored-by: lutianyi <lutianyi233@163.com> Co-authored-by: justinychuang <huangyicheng@soulcode.cn> Co-authored-by: maybehokori <138367708+maybehokori@users.noreply.github.com>
323 lines
13 KiB
Python
323 lines
13 KiB
Python
import importlib.util
|
|
import sys
|
|
from pathlib import Path
|
|
from types import ModuleType
|
|
|
|
|
|
def _load_somark_parser(monkeypatch):
    """Import ``deepdoc/parser/somark_parser.py`` as a standalone module.

    Lightweight stand-ins for ``deepdoc``, ``deepdoc.parser``,
    ``deepdoc.parser.pdf_parser`` and ``deepdoc.parser.utils`` are registered
    in ``sys.modules`` first, so the real ``deepdoc/__init__.py`` (and its
    ``beartype_this_package()`` call plus the heavy dependency chain) is never
    executed. Every registration goes through ``monkeypatch.setitem`` and is
    therefore rolled back when the test finishes.

    Same approach as test_mineru_parser.py / test_opendataloader_parser.py.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture.

    Returns:
        The executed ``somark_parser`` module object.
    """
    # The repository root is four directory levels above this test file.
    deepdoc_dir = Path(__file__).resolve().parents[4] / "deepdoc"
    parser_dir = deepdoc_dir / "parser"

    # Empty package shells: only ``__path__`` is set so they look like packages.
    for package_name, package_dir in (("deepdoc", deepdoc_dir), ("deepdoc.parser", parser_dir)):
        package_stub = ModuleType(package_name)
        package_stub.__path__ = [str(package_dir)]
        monkeypatch.setitem(sys.modules, package_name, package_stub)

    # Placeholder standing in for the real RAGFlowPdfParser class.
    class _RAGFlowPdfParser:
        pass

    pdf_parser_stub = ModuleType("deepdoc.parser.pdf_parser")
    pdf_parser_stub.RAGFlowPdfParser = _RAGFlowPdfParser
    monkeypatch.setitem(sys.modules, "deepdoc.parser.pdf_parser", pdf_parser_stub)

    # Outline extraction is replaced by a no-op that always yields an empty list.
    utils_stub = ModuleType("deepdoc.parser.utils")
    utils_stub.extract_pdf_outlines = lambda *_args, **_kwargs: []
    monkeypatch.setitem(sys.modules, "deepdoc.parser.utils", utils_stub)

    # Load the parser source file under a private, test-only module name.
    alias = "test_somark_parser_unit_module"
    spec = importlib.util.spec_from_file_location(alias, parser_dir / "somark_parser.py")
    loaded = importlib.util.module_from_spec(spec)
    # Register before executing so the module is visible in sys.modules while
    # its own top-level code runs.
    monkeypatch.setitem(sys.modules, alias, loaded)
    spec.loader.exec_module(loaded)
    return loaded
|
|
|
|
|
|
def _make_parser(m, **feature_kwargs):
    """Instantiate ``m.SoMarkParser`` with placeholder connection settings.

    No network traffic is triggered: ``__init__`` only stores attributes,
    while ``check_installation()`` is the call that reaches the network.

    Args:
        m: The loaded somark_parser module (anything exposing ``SoMarkParser``).
        **feature_kwargs: Extra keyword arguments forwarded to the constructor.

    Returns:
        Whatever ``m.SoMarkParser(...)`` returns.
    """
    connection = {
        "base_url": "https://example.invalid/api/v1",
        "api_key": "",
    }
    build = m.SoMarkParser
    # Two separate splats: a duplicated keyword still raises TypeError.
    return build(**connection, **feature_kwargs)
|
|
|
|
|
|
def _sample_pages():
    """Return a single-page payload shared by the section tests.

    The page holds seven blocks: text, title, figure, table, equation, a
    ``cate_item`` (table-of-contents entry, expected to be discarded) and a
    second figure that has no ``bbox`` key (expected to be skipped).
    """
    blocks = [
        {"type": "text", "content": "hello world", "bbox": [10, 20, 100, 40]},
        {"type": "title", "content": "Chapter 1", "title_level": 1, "bbox": [10, 5, 200, 18]},
        {"type": "figure", "content": "Figure caption from understanding", "bbox": [50, 50, 300, 300]},
        {"type": "table", "content": "<table><tr><td>a</td></tr></table>", "bbox": [10, 400, 500, 600]},
        {"type": "equation", "content": "E=mc^2", "bbox": [10, 650, 200, 680]},
        {"type": "cate_item", "content": "should be discarded", "bbox": [0, 0, 10, 10]},
        # Deliberately has no bbox key at all.
        {"type": "figure", "content": "no bbox -> skip"},
    ]
    page = {
        "page_num": 0,
        "page_size": {"w": 600, "h": 800},
        "blocks": blocks,
    }
    return [page]
|
|
|
|
|
|
# ---------------------------------------------------------------------
|
|
# Type-mapping integrity (regression guard)
|
|
# ---------------------------------------------------------------------
|
|
|
|
|
|
def test_type_mapping_covers_every_non_discarded_block_type(monkeypatch):
    """Guard against SoMark block types that lack an internal mapping.

    Apart from the members of ALWAYS_DISCARDED and the header/footer pair
    (governed by keep_header_footer), each SoMarkBlockType member has to be a
    key of SOMARK_TYPE_TO_RAGFLOW. Without this check a newly added enum
    member would quietly fall back to "text".
    """
    somark = _load_somark_parser(monkeypatch)
    block_types = somark.SoMarkBlockType
    flag_controlled = {block_types.HEADER, block_types.FOOTER}

    must_be_mapped = [
        candidate
        for candidate in block_types
        if candidate not in somark.ALWAYS_DISCARDED and candidate not in flag_controlled
    ]
    for btype in must_be_mapped:
        assert btype in somark.SOMARK_TYPE_TO_RAGFLOW, f"{btype} missing from SOMARK_TYPE_TO_RAGFLOW"
|
|
|
|
|
|
def test_mapping_values_are_known_internal_layout_types(monkeypatch):
    """Each value of SOMARK_TYPE_TO_RAGFLOW has to be one of the layout types
    understood by the downstream rag/flow consumers and by chunking."""
    somark = _load_somark_parser(monkeypatch)
    known_layouts = frozenset(("text", "image", "table", "code", "equation"))

    mapping = somark.SOMARK_TYPE_TO_RAGFLOW
    for btype in mapping:
        internal = mapping[btype]
        assert internal in known_layouts, f"{btype} -> {internal!r} is not a known internal type"
|
|
|
|
|
|
def test_always_discarded_contains_toc_and_blank(monkeypatch):
    """CATE, CATE_ITEM and BLANK all have to be members of ALWAYS_DISCARDED.

    Table-of-contents entries that slipped through would pollute the
    knowledge base with chapter titles repeated as chunks.
    """
    somark = _load_somark_parser(monkeypatch)
    dropped = somark.ALWAYS_DISCARDED

    for member_name in ("CATE", "CATE_ITEM", "BLANK"):
        assert getattr(somark.SoMarkBlockType, member_name) in dropped
|
|
|
|
|
|
# ---------------------------------------------------------------------
|
|
# _resolve_internal_type — all branches
|
|
# ---------------------------------------------------------------------
|
|
|
|
|
|
def test_resolve_internal_type_discards_toc_and_blank(monkeypatch):
    """_resolve_internal_type yields None for CATE, CATE_ITEM and BLANK."""
    somark = _load_somark_parser(monkeypatch)
    parser = _make_parser(somark)
    block_types = somark.SoMarkBlockType

    for btype in (block_types.CATE, block_types.CATE_ITEM, block_types.BLANK):
        assert parser._resolve_internal_type(btype) is None
|
|
|
|
|
|
def test_resolve_internal_type_header_footer_dropped_by_default(monkeypatch):
    """Without keep_header_footer, HEADER and FOOTER both resolve to None."""
    somark = _load_somark_parser(monkeypatch)
    # No feature kwargs: keep_header_footer keeps its default (False).
    parser = _make_parser(somark)
    block_types = somark.SoMarkBlockType

    for btype in (block_types.HEADER, block_types.FOOTER):
        assert parser._resolve_internal_type(btype) is None
|
|
|
|
|
|
def test_resolve_internal_type_header_footer_kept_when_flagged(monkeypatch):
    """With keep_header_footer=True, HEADER and FOOTER resolve to "text"."""
    somark = _load_somark_parser(monkeypatch)
    parser = _make_parser(somark, keep_header_footer=True)
    block_types = somark.SoMarkBlockType

    for btype in (block_types.HEADER, block_types.FOOTER):
        assert parser._resolve_internal_type(btype) == "text"
|
|
|
|
|
|
def test_resolve_internal_type_unknown_falls_back_to_text(monkeypatch):
    """An unrecognised block type resolves to ``text``.

    Should SoMark ship a new block type before the mapping catches up, a
    wrong layout label is preferable to silently losing the content.
    """
    somark = _load_somark_parser(monkeypatch)
    parser = _make_parser(somark)

    resolved = parser._resolve_internal_type("some_brand_new_type")
    assert resolved == "text"
|
|
|
|
|
|
def test_resolve_internal_type_image_blocks(monkeypatch):
    """FIGURE, CS, QRCODE and STAMP each resolve to 'image'.

    They need to share the crop() recovery path; otherwise figures would be
    lost on the naive path.
    """
    somark = _load_somark_parser(monkeypatch)
    parser = _make_parser(somark)
    block_types = somark.SoMarkBlockType

    image_like = [
        block_types.FIGURE,
        block_types.CS,
        block_types.QRCODE,
        block_types.STAMP,
    ]
    for btype in image_like:
        resolved = parser._resolve_internal_type(btype)
        assert resolved == "image", btype
|
|
|
|
|
|
# ---------------------------------------------------------------------
|
|
# _block_text
|
|
# ---------------------------------------------------------------------
|
|
|
|
|
|
def test_block_text_image_returns_empty_string(monkeypatch):
    """_block_text gives an empty string for an image-typed block; the figure
    itself is recovered later from the rendered page by crop()."""
    somark = _load_somark_parser(monkeypatch)
    figure_block = {
        "type": somark.SoMarkBlockType.FIGURE.value,
        "content": "ignored",
    }

    produced = somark.SoMarkParser._block_text(figure_block, "image")
    assert produced == ""
|
|
|
|
|
|
def test_block_text_title_prepends_markdown_hashes(monkeypatch):
    """A title block with title_level=2 is rendered as "## <content>"."""
    somark = _load_somark_parser(monkeypatch)
    title_block = {
        "type": somark.SoMarkBlockType.TITLE.value,
        "content": "Hello",
        "title_level": 2,
    }

    produced = somark.SoMarkParser._block_text(title_block, "text")
    assert produced == "## Hello"
|
|
|
|
|
|
def test_block_text_title_without_level_returns_plain_content(monkeypatch):
    """A title block lacking title_level is rendered as its bare content."""
    somark = _load_somark_parser(monkeypatch)
    # The title_level key is intentionally left out.
    title_block = {
        "type": somark.SoMarkBlockType.TITLE.value,
        "content": "Hello",
    }

    produced = somark.SoMarkParser._block_text(title_block, "text")
    assert produced == "Hello"
|
|
|
|
|
|
def test_block_text_text_strips_whitespace(monkeypatch):
    """Surrounding whitespace is stripped from a text block's content."""
    somark = _load_somark_parser(monkeypatch)
    padded_block = {
        "type": somark.SoMarkBlockType.TEXT.value,
        "content": " hi ",
    }

    produced = somark.SoMarkParser._block_text(padded_block, "text")
    assert produced == "hi"
|
|
|
|
|
|
# ---------------------------------------------------------------------
|
|
# _transfer_to_sections — tuple shape contract
|
|
# ---------------------------------------------------------------------
|
|
|
|
|
|
def test_transfer_to_sections_naive_path_returns_2_tuples(monkeypatch):
    """With no parse_method given (the path consumed by naive.py), every
    section has to be a (text, line_tag) 2-tuple."""
    somark = _load_somark_parser(monkeypatch)
    parser = _make_parser(somark)

    sections = parser._transfer_to_sections(_sample_pages())

    malformed = [sec for sec in sections if not (isinstance(sec, tuple) and len(sec) == 2)]
    assert not malformed, "naive path must emit (text, line_tag) 2-tuples"
    # Seven blocks go in; the cate_item is discarded and the bbox-less figure
    # is skipped, leaving five sections.
    assert len(sections) == 5
|
|
|
|
|
|
def test_transfer_to_sections_pipeline_path_returns_3_tuples(monkeypatch):
    """With parse_method='pipeline' (the rag/flow consumer), every section has
    to be a typed (text, layout_type, line_tag) 3-tuple, mirroring MinerU's
    contract."""
    somark = _load_somark_parser(monkeypatch)
    parser = _make_parser(somark)

    sections = parser._transfer_to_sections(_sample_pages(), parse_method="pipeline")

    malformed = [sec for sec in sections if not (isinstance(sec, tuple) and len(sec) == 3)]
    assert not malformed, "pipeline path must emit (text, layout_type, line_tag) 3-tuples"

    # The layout types should mirror the variety of input blocks rather than
    # all collapsing to "text".
    layout_types = set()
    for sec in sections:
        layout_types.add(sec[1])
    expected_layouts = {"text", "image", "table", "equation"}
    assert expected_layouts <= layout_types, f"expected diverse layout types, got {layout_types}"
|
|
|
|
|
|
def test_transfer_to_sections_naive_image_carries_caption_and_tag(monkeypatch):
    """On the naive path an image section keeps its caption and position tag
    inside the text field.

    The caption keeps the text unique (avoiding chunk-id hash collisions
    across figures), and the embedded tag lets tokenize_chunks()->crop()
    still recover the figure. The pos field is empty by design.
    """
    somark = _load_somark_parser(monkeypatch)
    parser = _make_parser(somark)

    sections = parser._transfer_to_sections(_sample_pages())
    # Image sections are picked out by their empty second field.
    image_secs = []
    for sec in sections:
        if sec[1] == "":
            image_secs.append(sec)

    assert len(image_secs) == 1
    text = image_secs[0][0]
    assert "Figure caption from understanding" in text, "caption must be in text"
    assert all(marker in text for marker in ("@@", "##")), "position tag must be embedded in text"
|
|
|
|
|
|
def test_transfer_to_sections_pipeline_image_keeps_caption_and_typed_position(monkeypatch):
    """On the pipeline path an image section carries the caption as text (for
    semantic retrieval), the "image" layout type, and a separate line_tag
    that crop(poss) can use."""
    somark = _load_somark_parser(monkeypatch)
    parser = _make_parser(somark)

    sections = parser._transfer_to_sections(_sample_pages(), parse_method="pipeline")
    image_secs = []
    for sec in sections:
        if sec[1] == "image":
            image_secs.append(sec)

    assert len(image_secs) == 1
    caption, layout, line_tag = image_secs[0]
    assert caption == "Figure caption from understanding"
    assert layout == "image"
    has_open_marker = line_tag.startswith("@@")
    assert has_open_marker and line_tag.endswith("##")
|
|
|
|
|
|
def test_transfer_to_sections_discards_cate_item_in_both_modes(monkeypatch):
    """The cate_item block's content appears in no section, on either the
    naive (parse_method=None) or the pipeline path."""
    somark = _load_somark_parser(monkeypatch)
    parser = _make_parser(somark)
    marker = "should be discarded"

    for mode in (None, "pipeline"):
        sections = parser._transfer_to_sections(_sample_pages(), parse_method=mode)
        leaked = []
        for sec in sections:
            # A falsy text field is treated as an empty string.
            if marker in (sec[0] or ""):
                leaked.append(sec)
        assert not leaked, f"cate_item leaked in mode={mode}: {leaked}"
|
|
|
|
|
|
def test_transfer_to_sections_skips_image_block_without_bbox(monkeypatch):
    """A figure block with no bbox yields no section at all.

    Without a bbox crop() has nothing to recover, so an empty section would
    only pollute the chunk stream.
    """
    somark = _load_somark_parser(monkeypatch)
    parser = _make_parser(somark)

    bbox_less_figure = {"type": "figure", "content": "no bbox"}
    single_page = {
        "page_num": 0,
        "page_size": {"w": 600, "h": 800},
        "blocks": [bbox_less_figure],
    }

    produced = parser._transfer_to_sections([single_page])
    assert produced == []
|
|
|
|
|
|
def test_transfer_to_sections_keeps_header_footer_when_flagged(monkeypatch):
    """With keep_header_footer=True, the header and footer contents both show
    up in the text of the emitted sections."""
    somark = _load_somark_parser(monkeypatch)
    parser = _make_parser(somark, keep_header_footer=True)

    header_block = {"type": "header", "content": "doc title", "bbox": [0, 0, 100, 10]}
    footer_block = {"type": "footer", "content": "page 1", "bbox": [0, 790, 100, 800]}
    single_page = {
        "page_num": 0,
        "page_size": {"w": 600, "h": 800},
        "blocks": [header_block, footer_block],
    }

    sections = parser._transfer_to_sections([single_page])
    texts = []
    for sec in sections:
        texts.append(sec[0])
    assert any("doc title" in body for body in texts)
    assert any("page 1" in body for body in texts)
|
|
|
|
|
|
# ---------------------------------------------------------------------
|
|
# _line_tag format
|
|
# ---------------------------------------------------------------------
|
|
|
|
|
|
def test_line_tag_format(monkeypatch):
    """Pin the tag layout: ``@@``, then the 1-based page number, x0, x1, y0
    and y1 separated by tab characters, then ``##``.

    This is the contract that the downstream extract_positions() / crop()
    code parses.
    """
    somark = _load_somark_parser(monkeypatch)
    parser = _make_parser(somark)

    box = {
        "page_idx": 0,
        "bbox": [10, 20, 100, 40],
        "page_size": {"w": 600, "h": 800},
    }
    tag = parser._line_tag(box)

    assert tag.startswith("@@1\t"), "page index must be 1-based"
    assert tag.endswith("##")

    # Peel off the @@ / ## markers, then split into the tab-separated fields.
    without_markers = tag.strip("@").strip("#")
    parts = without_markers.split("\t")
    assert len(parts) == 5, f"expected 5 tab-separated parts, got {parts}"

    # Absent page_images, _line_tag uses the raw bbox coordinates.
    _page, x0, x1, y0, y1 = parts
    assert float(x0) == 10.0
    assert float(x1) == 100.0
    assert float(y0) == 20.0  # top
    assert float(y1) == 40.0  # bottom
|