fix: The output of the parser in the ingestion pipeline contains HTML tags (#14920)

## Summary

This change fixes ingestion quality issues where MinerU parser output may contain HTML fragments (for example, table-related tags like `<tr>`, `<td>`, `<br>`), which were previously passed directly into chunking/tokenization and degraded chunk quality. The fix adds a sanitization step in the MinerU parser path so parsed sections are normalized to clean text before chunking.

## Change Type (select all)

- [x] Bug fix
- [x] Ingestion pipeline improvement
- [x] Parser/chunking quality fix

## Related Issue

- https://github.com/infiniflow/ragflow/issues/14831
2026-06-29 15:31:05 +08:00 · 2026-05-24 22:06:36 -10:00
parent e6068a7f7e
commit 9d1006e4ec
5 changed files with 235 additions and 15 deletions
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -14,6 +14,7 @@
 #  limitations under the License.
 #
 import json
+import html
 import logging
 import os
 import re
@@ -205,6 +206,26 @@ class MinerUParser(RAGFlowPdfParser):
        except Exception:
            return False

+    @staticmethod
+    def _sanitize_section_text(section: str) -> str:
+        """Return a MinerU text block as plain text, ready for chunking.
+
+        Tags are stripped *before* entities are decoded, and only ``<`` followed by a
+        tag name counts as a tag, so text like ``p &lt; 0.05`` or ``1 < 2 > 0`` survives.
+        """
+        if not section:
+            return ""
+        # Keep rough structure: newline per block/row end, a space between table cells.
+        section = re.sub(r"(?is)<\s*br\s*/?\s*>", "\n", section)
+        section = re.sub(r"(?is)</\s*(p|div|li|tr|h[1-6]|table|caption)\s*>", "\n", section)
+        section = re.sub(r"(?is)</\s*t[dh]\s*>", " ", section)
+        section = re.sub(r"(?is)<!--.*?-->|<[!?/]?[a-z][^>]*>", "", section)
+        section = html.unescape(section)  # decode last so "&lt;" is never parsed as a tag
+        # Collapse whitespace while preserving line boundaries.
+        section = re.sub(r"[ \t]+\n", "\n", section)
+        section = re.sub(r"\n{3,}", "\n\n", section)
+        return re.sub(r"[ \t]{2,}", " ", section).strip()
+
    def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
        reason = ""

@@ -659,6 +680,11 @@ class MinerUParser(RAGFlowPdfParser):
                case MinerUContentType.DISCARDED:
                    continue  # Skip discarded blocks entirely

+            section = self._sanitize_section_text(section)
+            if not section:
+                self.logger.debug("[MinerU] Skip section after sanitization: type=%s", output.get("type"))
+                continue
+
            if section and parse_method in {"manual", "pipeline"}:
                sections.append((section, output["type"], self._line_tag(output)))
            elif section and parse_method == "paper":