Fix MinerU table option sanitization (#16118)

Follow-up to issue #14831 and PR #14920 to fix the table option handling:
when table recognition is enabled, do not sanitize HTML tags.
This commit is contained in:
Wang Qi
2026-06-17 13:06:07 +08:00
committed by GitHub
parent 9bd53ce675
commit 2290bb0023

View File

@@ -654,7 +654,7 @@ class MinerUParser(RAGFlowPdfParser):
item[key] = str((subdir / item[key]).resolve())
return data
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None, table_enable: bool = False):
sections = []
for output in outputs:
match output.get("type"):
@@ -691,7 +691,8 @@ class MinerUParser(RAGFlowPdfParser):
self.logger.debug("[MinerU] Skip unsupported section type=%s", output.get("type"))
continue
section = self._sanitize_section_text(section)
if not table_enable:
section = self._sanitize_section_text(section)
if not section:
self.logger.debug("[MinerU] Skip section after sanitization: type=%s", output.get("type"))
continue
@@ -837,7 +838,7 @@ class MinerUParser(RAGFlowPdfParser):
except Exception as e:
self.logger.warning(f"[MinerU] VLM image enhancement failed: {e}. Continuing without descriptions.")
return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
return self._transfer_to_sections(outputs, parse_method, enable_table), self._transfer_to_tables(outputs)
finally:
if temp_pdf and temp_pdf.exists():
try: