fix: Strip HTML tags from MinerU parser output in the ingestion pipeline (#14920)

## Summary
This change fixes ingestion quality issues where MinerU parser output
may contain HTML fragments (for example, table-related tags like `<tr>`,
`<td>`, `<br>`), which were previously passed directly into
chunking/tokenization and degraded chunk quality.

The fix adds a sanitization step in the MinerU parser path so parsed
sections are normalized to clean text before chunking.

## Change Type (select all)
- [x] Bug fix
- [x] Ingestion pipeline improvement
- [x] Parser/chunking quality fix

## Related Issue
- https://github.com/infiniflow/ragflow/issues/14831
This commit is contained in:
Jonathan Chang
2026-05-24 22:06:36 -10:00
committed by GitHub
parent e6068a7f7e
commit 9d1006e4ec
5 changed files with 235 additions and 15 deletions

View File

@@ -14,6 +14,7 @@
# limitations under the License.
#
import json
import html
import logging
import os
import re
@@ -205,6 +206,26 @@ class MinerUParser(RAGFlowPdfParser):
except Exception:
return False
@staticmethod
def _sanitize_section_text(section: str) -> str:
    """Normalize a MinerU text block to plain text before chunking.

    MinerU may return HTML fragments (e.g. table_body with <tr>/<td>/<br>).
    Keep human-readable text while removing tag noise that hurts chunking.

    Args:
        section: Raw block text, possibly containing HTML tags, comments
            and character references. Any falsy value yields "".

    Returns:
        The text with markup removed, runs of blanks collapsed, at most one
        empty line between paragraphs, and outer whitespace stripped.
    """
    if not section:
        return ""

    # Character references are decoded first, so entity-escaped markup
    # (e.g. "&lt;td&gt;") is stripped exactly like literal markup.
    section = html.unescape(section)

    # Preserve rough structure before dropping tags.
    section = re.sub(r"(?is)<\s*br\s*/?\s*>", "\n", section)
    section = re.sub(r"(?is)</\s*(p|div|li|tr|h[1-6]|table|caption)\s*>", "\n", section)
    # A closing cell becomes a blank so neighbouring cells do not fuse into
    # one token ("<td>A</td><td>B</td>" -> "A B", not "AB").
    section = re.sub(r"(?is)</\s*t[dh]\s*>", " ", section)

    # Drop comments, then anything that is syntactically a tag. A tag must
    # start with a letter (optionally after "/", "!" or "?") right after "<",
    # so plain-text comparisons such as "a < 5 and b > 3" survive intact.
    section = re.sub(r"(?s)<!--.*?-->", "", section)
    section = re.sub(r"(?is)<(?:/\s*|[!?])?[a-z][^>]*>", "", section)

    # Collapse whitespace while preserving line boundaries.
    section = re.sub(r"[ \t]+\n", "\n", section)
    section = re.sub(r"\n{3,}", "\n\n", section)
    section = re.sub(r"[ \t]{2,}", " ", section)
    return section.strip()
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
reason = ""
@@ -659,6 +680,11 @@ class MinerUParser(RAGFlowPdfParser):
case MinerUContentType.DISCARDED:
continue # Skip discarded blocks entirely
section = self._sanitize_section_text(section)
if not section:
self.logger.debug("[MinerU] Skip section after sanitization: type=%s", output.get("type"))
continue
if section and parse_method in {"manual", "pipeline"}:
sections.append((section, output["type"], self._line_tag(output)))
elif section and parse_method == "paper":