Files
ragflow/test/unit_test/deepdoc/parser/test_somark_parser.py
Lynn 400476f0b3 Feat: SoMark (#16482)
Follow #15486
Co-authored-by: limuting <limuting233@gmail.com>
Co-authored-by: lutianyi <lutianyi233@163.com>
Co-authored-by: justinychuang <huangyicheng@soulcode.cn>
Co-authored-by: maybehokori <138367708+maybehokori@users.noreply.github.com>
2026-07-01 13:29:28 +08:00

323 lines
13 KiB
Python

import importlib.util
import sys
from pathlib import Path
from types import ModuleType
def _load_somark_parser(monkeypatch):
    """Import ``deepdoc/parser/somark_parser.py`` by file path and return it.

    Placeholder modules for ``deepdoc``, ``deepdoc.parser``,
    ``deepdoc.parser.pdf_parser`` and ``deepdoc.parser.utils`` are put into
    ``sys.modules`` (via ``monkeypatch.setitem``) before the parser source is
    executed, so the load bypasses deepdoc/__init__.py's
    beartype_this_package() and the heavy deepdoc dependency chain.
    Follows the same pattern as test_mineru_parser.py /
    test_opendataloader_parser.py.
    """
    root = Path(__file__).resolve().parents[4]
    deepdoc_dir = root / "deepdoc"
    parser_dir = deepdoc_dir / "parser"

    class _RAGFlowPdfParser:
        pass

    def _no_outlines(*_args, **_kwargs):
        # Stand-in for extract_pdf_outlines: always reports no outlines.
        return []

    deepdoc_stub = ModuleType("deepdoc")
    deepdoc_stub.__path__ = [str(deepdoc_dir)]

    parser_stub = ModuleType("deepdoc.parser")
    parser_stub.__path__ = [str(parser_dir)]

    pdf_parser_stub = ModuleType("deepdoc.parser.pdf_parser")
    pdf_parser_stub.RAGFlowPdfParser = _RAGFlowPdfParser

    utils_stub = ModuleType("deepdoc.parser.utils")
    utils_stub.extract_pdf_outlines = _no_outlines

    # Each stub is registered under the dotted name it was created with.
    for stub in (deepdoc_stub, parser_stub, pdf_parser_stub, utils_stub):
        monkeypatch.setitem(sys.modules, stub.__name__, stub)

    module_name = "test_somark_parser_unit_module"
    spec = importlib.util.spec_from_file_location(
        module_name,
        parser_dir / "somark_parser.py",
    )
    loaded = importlib.util.module_from_spec(spec)
    # The module is placed in sys.modules before its body is executed.
    monkeypatch.setitem(sys.modules, module_name, loaded)
    spec.loader.exec_module(loaded)
    return loaded
def _make_parser(m, **feature_kwargs):
"""Build a SoMarkParser instance without triggering any network call.
__init__ only sets attributes; check_installation() is what hits the network."""
return m.SoMarkParser(
base_url="https://example.invalid/api/v1",
api_key="",
**feature_kwargs,
)
def _sample_pages():
"""A minimal pages payload mixing text, title, figure, equation, table,
toc (discarded), and an image block with no bbox (must be skipped)."""
return [
{
"page_num": 0,
"page_size": {"w": 600, "h": 800},
"blocks": [
{"type": "text", "content": "hello world", "bbox": [10, 20, 100, 40]},
{"type": "title", "content": "Chapter 1", "title_level": 1, "bbox": [10, 5, 200, 18]},
{"type": "figure", "content": "Figure caption from understanding", "bbox": [50, 50, 300, 300]},
{"type": "table", "content": "<table><tr><td>a</td></tr></table>", "bbox": [10, 400, 500, 600]},
{"type": "equation", "content": "E=mc^2", "bbox": [10, 650, 200, 680]},
{"type": "cate_item", "content": "should be discarded", "bbox": [0, 0, 10, 10]},
{"type": "figure", "content": "no bbox -> skip"}, # no bbox at all
],
}
]
# ---------------------------------------------------------------------
# Type-mapping integrity (regression guard)
# ---------------------------------------------------------------------
def test_type_mapping_covers_every_non_discarded_block_type(monkeypatch):
    """Regression guard: every mappable SoMark block type has a mapping.

    A block type is exempt only when it is in ALWAYS_DISCARDED or is a
    header/footer (those obey keep_header_footer). Anything else must appear
    in SOMARK_TYPE_TO_RAGFLOW; a type added to the enum without a mapping
    would silently fall back to "text", and this test makes that omission
    explicit.
    """
    mod = _load_somark_parser(monkeypatch)
    block_types = mod.SoMarkBlockType
    flag_controlled = {block_types.HEADER, block_types.FOOTER}
    for btype in block_types:
        needs_mapping = btype not in mod.ALWAYS_DISCARDED and btype not in flag_controlled
        if needs_mapping:
            assert btype in mod.SOMARK_TYPE_TO_RAGFLOW, f"{btype} missing from SOMARK_TYPE_TO_RAGFLOW"
def test_mapping_values_are_known_internal_layout_types(monkeypatch):
    """Every value in SOMARK_TYPE_TO_RAGFLOW is a recognised layout type.

    The accepted set is the layout types that downstream rag/flow consumers
    (and chunking) understand.
    """
    mod = _load_somark_parser(monkeypatch)
    known_layouts = frozenset(("text", "image", "table", "code", "equation"))
    mapping = mod.SOMARK_TYPE_TO_RAGFLOW
    for btype, internal in mapping.items():
        assert internal in known_layouts, f"{btype} -> {internal!r} is not a known internal type"
def test_always_discarded_contains_toc_and_blank(monkeypatch):
    """CATE, CATE_ITEM and BLANK are all members of ALWAYS_DISCARDED.

    Table-of-contents items must be discarded; if they leaked through, the
    knowledge base would be polluted with chapter titles repeated as chunks.
    """
    mod = _load_somark_parser(monkeypatch)
    block_types = mod.SoMarkBlockType
    for dropped in (block_types.CATE, block_types.CATE_ITEM, block_types.BLANK):
        assert dropped in mod.ALWAYS_DISCARDED
# ---------------------------------------------------------------------
# _resolve_internal_type — all branches
# ---------------------------------------------------------------------
def test_resolve_internal_type_discards_toc_and_blank(monkeypatch):
    """_resolve_internal_type returns None for CATE, CATE_ITEM and BLANK."""
    mod = _load_somark_parser(monkeypatch)
    parser = _make_parser(mod)
    block_types = mod.SoMarkBlockType
    for dropped in (block_types.CATE, block_types.CATE_ITEM, block_types.BLANK):
        assert parser._resolve_internal_type(dropped) is None
def test_resolve_internal_type_header_footer_dropped_by_default(monkeypatch):
    """HEADER and FOOTER resolve to None when the parser uses its defaults."""
    mod = _load_somark_parser(monkeypatch)
    # No feature kwargs passed: keep_header_footer=False (default).
    parser = _make_parser(mod)
    for edge_type in (mod.SoMarkBlockType.HEADER, mod.SoMarkBlockType.FOOTER):
        assert parser._resolve_internal_type(edge_type) is None
def test_resolve_internal_type_header_footer_kept_when_flagged(monkeypatch):
    """With keep_header_footer=True, HEADER and FOOTER resolve to "text"."""
    mod = _load_somark_parser(monkeypatch)
    parser = _make_parser(mod, keep_header_footer=True)
    for edge_type in (mod.SoMarkBlockType.HEADER, mod.SoMarkBlockType.FOOTER):
        assert parser._resolve_internal_type(edge_type) == "text"
def test_resolve_internal_type_unknown_falls_back_to_text(monkeypatch):
    """An unrecognised block type resolves to ``text``.

    Should SoMark introduce a new block type before the mapping is updated,
    falling back to ``text`` is preferred: silent loss is worse than a wrong
    layout label.
    """
    mod = _load_somark_parser(monkeypatch)
    parser = _make_parser(mod)
    resolved = parser._resolve_internal_type("some_brand_new_type")
    assert resolved == "text"
def test_resolve_internal_type_image_blocks(monkeypatch):
    """FIGURE, CS, QRCODE and STAMP all resolve to 'image'.

    They must share the crop() recovery path; otherwise figures would be
    lost on the naive path.
    """
    mod = _load_somark_parser(monkeypatch)
    parser = _make_parser(mod)
    block_types = mod.SoMarkBlockType
    image_like = (
        block_types.FIGURE,
        block_types.CS,
        block_types.QRCODE,
        block_types.STAMP,
    )
    for btype in image_like:
        resolved = parser._resolve_internal_type(btype)
        assert resolved == "image", btype
# ---------------------------------------------------------------------
# _block_text
# ---------------------------------------------------------------------
def test_block_text_image_returns_empty_string(monkeypatch):
    """_block_text yields "" for a block whose internal type is "image".

    Image-typed blocks contribute no text here; the figure is later
    recovered from the rendered page by crop().
    """
    mod = _load_somark_parser(monkeypatch)
    figure_block = {
        "type": mod.SoMarkBlockType.FIGURE.value,
        "content": "ignored",
    }
    rendered = mod.SoMarkParser._block_text(figure_block, "image")
    assert rendered == ""
def test_block_text_title_prepends_markdown_hashes(monkeypatch):
    """A title block with title_level=2 is rendered as "## Hello"."""
    mod = _load_somark_parser(monkeypatch)
    title_block = {
        "type": mod.SoMarkBlockType.TITLE.value,
        "content": "Hello",
        "title_level": 2,
    }
    rendered = mod.SoMarkParser._block_text(title_block, "text")
    assert rendered == "## Hello"
def test_block_text_title_without_level_returns_plain_content(monkeypatch):
    """A title block lacking title_level is rendered as its bare content."""
    mod = _load_somark_parser(monkeypatch)
    # The "title_level" key is intentionally absent.
    title_block = {
        "type": mod.SoMarkBlockType.TITLE.value,
        "content": "Hello",
    }
    rendered = mod.SoMarkParser._block_text(title_block, "text")
    assert rendered == "Hello"
def test_block_text_text_strips_whitespace(monkeypatch):
    """Surrounding whitespace in a text block's content is removed."""
    mod = _load_somark_parser(monkeypatch)
    text_block = {
        "type": mod.SoMarkBlockType.TEXT.value,
        "content": " hi ",
    }
    rendered = mod.SoMarkParser._block_text(text_block, "text")
    assert rendered == "hi"
# ---------------------------------------------------------------------
# _transfer_to_sections — tuple shape contract
# ---------------------------------------------------------------------
def test_transfer_to_sections_naive_path_returns_2_tuples(monkeypatch):
    """Without a parse_method, sections come back as (text, line_tag) pairs.

    parse_method=None (or anything not in {manual, pipeline}) is the path
    consumed by naive.py, which expects 2-tuples.
    """
    mod = _load_somark_parser(monkeypatch)
    parser = _make_parser(mod)
    sections = parser._transfer_to_sections(_sample_pages())
    all_pairs = all(isinstance(sec, tuple) and len(sec) == 2 for sec in sections)
    assert all_pairs, "naive path must emit (text, line_tag) 2-tuples"
    # The sample has 7 blocks; the cate_item is discarded and the figure
    # without a bbox is skipped, which leaves 5 valid sections.
    assert len(sections) == 5
def test_transfer_to_sections_pipeline_path_returns_3_tuples(monkeypatch):
    """parse_method='pipeline' yields (text, layout_type, line_tag) triples.

    This is the typed shape the rag/flow consumer expects, mirroring
    MinerU's contract.
    """
    mod = _load_somark_parser(monkeypatch)
    parser = _make_parser(mod)
    sections = parser._transfer_to_sections(_sample_pages(), parse_method="pipeline")
    all_triples = all(isinstance(sec, tuple) and len(sec) == 3 for sec in sections)
    assert all_triples, "pipeline path must emit (text, layout_type, line_tag) 3-tuples"
    # The layout types must reflect the variety of blocks in the sample
    # rather than collapsing everything to "text".
    required = {"text", "image", "table", "equation"}
    layout_types = {sec[1] for sec in sections}
    assert required <= layout_types, f"expected diverse layout types, got {layout_types}"
def test_transfer_to_sections_naive_image_carries_caption_and_tag(monkeypatch):
    """On the naive path an image section packs caption and tag into its text.

    The text field must hold a unique caption (to avoid chunk-id hash
    collision across figures) and also embed the position tag so that
    tokenize_chunks()->crop() can still recover the figure. The pos field is
    empty by design, which is how the image section is located here.
    """
    mod = _load_somark_parser(monkeypatch)
    parser = _make_parser(mod)
    sections = parser._transfer_to_sections(_sample_pages())
    empty_pos_sections = [sec for sec in sections if sec[1] == ""]
    assert len(empty_pos_sections) == 1
    text = empty_pos_sections[0][0]
    assert "Figure caption from understanding" in text, "caption must be in text"
    has_tag_markers = "@@" in text and "##" in text
    assert has_tag_markers, "position tag must be embedded in text"
def test_transfer_to_sections_pipeline_image_keeps_caption_and_typed_position(monkeypatch):
    """On the pipeline path the image triple keeps caption, type and tag apart.

    The caption text is retained for semantic retrieval, and a real,
    separate line_tag is supplied for crop(poss).
    """
    mod = _load_somark_parser(monkeypatch)
    parser = _make_parser(mod)
    sections = parser._transfer_to_sections(_sample_pages(), parse_method="pipeline")
    image_sections = [sec for sec in sections if sec[1] == "image"]
    assert len(image_sections) == 1
    caption, layout, line_tag = image_sections[0]
    assert caption == "Figure caption from understanding"
    assert layout == "image"
    assert line_tag.startswith("@@") and line_tag.endswith("##")
def test_transfer_to_sections_discards_cate_item_in_both_modes(monkeypatch):
    """The cate_item text is absent from the output for None and 'pipeline'."""
    mod = _load_somark_parser(monkeypatch)
    parser = _make_parser(mod)
    marker = "should be discarded"
    for mode in (None, "pipeline"):
        sections = parser._transfer_to_sections(_sample_pages(), parse_method=mode)
        # A falsy text field is treated as the empty string for the search.
        leaked = [sec for sec in sections if marker in (sec[0] or "")]
        assert leaked == [], f"cate_item leaked in mode={mode}: {leaked}"
def test_transfer_to_sections_skips_image_block_without_bbox(monkeypatch):
    """A page whose only block is a bbox-less figure produces no sections.

    No bbox means crop() can't recover anything; emitting an empty section
    would only pollute the chunk stream.
    """
    mod = _load_somark_parser(monkeypatch)
    parser = _make_parser(mod)
    bboxless_figure = {"type": "figure", "content": "no bbox"}
    page = {
        "page_num": 0,
        "page_size": {"w": 600, "h": 800},
        "blocks": [bboxless_figure],
    }
    sections = parser._transfer_to_sections([page])
    assert sections == []
def test_transfer_to_sections_keeps_header_footer_when_flagged(monkeypatch):
    """With keep_header_footer=True, header and footer text reaches the output.

    Both blocks should pass through as text sections.
    """
    mod = _load_somark_parser(monkeypatch)
    parser = _make_parser(mod, keep_header_footer=True)
    header_block = {"type": "header", "content": "doc title", "bbox": [0, 0, 100, 10]}
    footer_block = {"type": "footer", "content": "page 1", "bbox": [0, 790, 100, 800]}
    page = {
        "page_num": 0,
        "page_size": {"w": 600, "h": 800},
        "blocks": [header_block, footer_block],
    }
    sections = parser._transfer_to_sections([page])
    texts = [sec[0] for sec in sections]
    assert any("doc title" in text for text in texts)
    assert any("page 1" in text for text in texts)
# ---------------------------------------------------------------------
# _line_tag format
# ---------------------------------------------------------------------
def test_line_tag_format(monkeypatch):
    """_line_tag emits ``@@<page1based>\\t<x0>\\t<x1>\\t<y0>\\t<y1>##``.

    That layout is the contract that downstream extract_positions() / crop()
    parse.
    """
    mod = _load_somark_parser(monkeypatch)
    parser = _make_parser(mod)
    box = {
        "page_idx": 0,
        "bbox": [10, 20, 100, 40],
        "page_size": {"w": 600, "h": 800},
    }
    tag = parser._line_tag(box)
    assert tag.startswith("@@1\t"), "page index must be 1-based"
    assert tag.endswith("##")
    payload = tag.strip("@").strip("#")
    parts = payload.split("\t")
    assert len(parts) == 5, f"expected 5 tab-separated parts, got {parts}"
    # With no page_images available, _line_tag uses the raw bbox coordinates.
    x0, x1, y0, y1 = (float(part) for part in parts[1:5])
    assert x0 == 10.0
    assert x1 == 100.0
    assert y0 == 20.0  # top
    assert y1 == 40.0  # bottom