Files
ragflow/test/unit_test/deepdoc/parser/test_mineru_parser.py
Jonathan Chang 9d1006e4ec fix: The output of the parser in the ingestion pipeline contains HTML tags (#14920)
## Summary
This change fixes ingestion quality issues where MinerU parser output
may contain HTML fragments (for example, table-related tags like `<tr>`,
`<td>`, `<br>`), which were previously passed directly into
chunking/tokenization and degraded chunk quality.

The fix adds a sanitization step in the MinerU parser path so parsed
sections are normalized to clean text before chunking.

## Change Type (select all)
- [x] Bug fix
- [x] Ingestion pipeline improvement
- [x] Parser/chunking quality fix

## Related Issue
- https://github.com/infiniflow/ragflow/issues/14831
2026-05-25 16:06:36 +08:00

69 lines
2.4 KiB
Python

import importlib.util
import logging
import sys
from pathlib import Path
from types import ModuleType
def _load_mineru_parser(monkeypatch):
repo_root = Path(__file__).resolve().parents[4]
deepdoc_mod = ModuleType("deepdoc")
deepdoc_mod.__path__ = [str(repo_root / "deepdoc")]
monkeypatch.setitem(sys.modules, "deepdoc", deepdoc_mod)
parser_mod = ModuleType("deepdoc.parser")
parser_mod.__path__ = [str(repo_root / "deepdoc" / "parser")]
monkeypatch.setitem(sys.modules, "deepdoc.parser", parser_mod)
pdf_parser_mod = ModuleType("deepdoc.parser.pdf_parser")
class _RAGFlowPdfParser:
pass
pdf_parser_mod.RAGFlowPdfParser = _RAGFlowPdfParser
monkeypatch.setitem(sys.modules, "deepdoc.parser.pdf_parser", pdf_parser_mod)
utils_mod = ModuleType("deepdoc.parser.utils")
utils_mod.extract_pdf_outlines = lambda *_args, **_kwargs: []
monkeypatch.setitem(sys.modules, "deepdoc.parser.utils", utils_mod)
module_name = "test_mineru_parser_unit_module"
module_path = repo_root / "deepdoc" / "parser" / "mineru_parser.py"
spec = importlib.util.spec_from_file_location(module_name, module_path)
module = importlib.util.module_from_spec(spec)
monkeypatch.setitem(sys.modules, module_name, module)
spec.loader.exec_module(module)
return module
def test_sanitize_section_text_removes_escaped_html_tags(monkeypatch):
module = _load_mineru_parser(monkeypatch)
text = "&lt;table&gt;&lt;tr&gt;&lt;td&gt;Alpha&lt;/td&gt;&lt;td&gt;Beta&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;"
sanitized = module.MinerUParser._sanitize_section_text(text)
assert sanitized == "AlphaBeta"
assert "<td>" not in sanitized
assert "</td>" not in sanitized
def test_transfer_to_sections_logs_sections_dropped_after_sanitization(monkeypatch, caplog):
module = _load_mineru_parser(monkeypatch)
parser = module.MinerUParser()
outputs = [
{
"type": module.MinerUContentType.TEXT,
"text": "&lt;td&gt;&lt;/td&gt;",
"page_idx": 0,
"bbox": (0, 0, 1, 1),
}
]
with caplog.at_level(logging.DEBUG, logger=parser.logger.name):
sections = parser._transfer_to_sections(outputs, parse_method="pipeline")
assert sections == []
assert "Skip section after sanitization" in caplog.text
assert f"type={module.MinerUContentType.TEXT}" in caplog.text