mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
fix: The output of the parser in the ingestion pipeline contains HTML tags (#14920)
## Summary

This change fixes ingestion quality issues where MinerU parser output may contain HTML fragments (for example, table-related tags such as `<tr>`, `<td>`, and `<br>`). These fragments were previously passed directly into chunking/tokenization and degraded chunk quality. The fix adds a sanitization step to the MinerU parser path so that parsed sections are normalized to clean text before chunking.

## Change Type (select all)

- [x] Bug fix
- [x] Ingestion pipeline improvement
- [x] Parser/chunking quality fix

## Related Issue

- https://github.com/infiniflow/ragflow/issues/14831
This commit is contained in:
@@ -14,6 +14,7 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
import json
|
||||
import html
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
@@ -205,6 +206,26 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _sanitize_section_text(section: str) -> str:
|
||||
"""Normalize MinerU text blocks before chunking.
|
||||
|
||||
MinerU may return HTML fragments (e.g. table_body with <tr>/<td>/<br>).
|
||||
Keep human-readable text while removing tag noise that hurts chunking.
|
||||
"""
|
||||
if not section:
|
||||
return ""
|
||||
section = html.unescape(section)
|
||||
# Preserve rough structure before dropping tags.
|
||||
section = re.sub(r"(?is)<\s*br\s*/?\s*>", "\n", section)
|
||||
section = re.sub(r"(?is)</\s*(p|div|li|tr|h[1-6]|table|caption)\s*>", "\n", section)
|
||||
section = re.sub(r"(?is)<[^>]+>", "", section)
|
||||
# Collapse whitespace while preserving line boundaries.
|
||||
section = re.sub(r"[ \t]+\n", "\n", section)
|
||||
section = re.sub(r"\n{3,}", "\n\n", section)
|
||||
section = re.sub(r"[ \t]{2,}", " ", section)
|
||||
return section.strip()
|
||||
|
||||
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
|
||||
reason = ""
|
||||
|
||||
@@ -659,6 +680,11 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
case MinerUContentType.DISCARDED:
|
||||
continue # Skip discarded blocks entirely
|
||||
|
||||
section = self._sanitize_section_text(section)
|
||||
if not section:
|
||||
self.logger.debug("[MinerU] Skip section after sanitization: type=%s", output.get("type"))
|
||||
continue
|
||||
|
||||
if section and parse_method in {"manual", "pipeline"}:
|
||||
sections.append((section, output["type"], self._line_tag(output)))
|
||||
elif section and parse_method == "paper":
|
||||
|
||||
Reference in New Issue
Block a user