From f0dd12289c12ed7be74a470982532d3e604c9c87 Mon Sep 17 00:00:00 2001 From: Magicbook1108 Date: Mon, 2 Mar 2026 13:18:57 +0800 Subject: [PATCH] Feat: add preprocess parameters for ingestion pipeline (#13300) ### What problem does this PR solve? Feat: add preprocess parameters for ingestion pipeline ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- rag/flow/parser/parser.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 206f4ede47..e42d007d08 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -304,8 +304,10 @@ class Parser(ProcessBase): self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.") conf = self._param.setups["pdf"] self.set_output("output_format", conf["output_format"]) - abstract_enabled = kwargs.get("abstract", False) - author_enabled = kwargs.get("author", False) + + abstract_enabled = "abstract" in self._param.setups["pdf"].get("preprocess", []) + author_enabled = "author" in self._param.setups["pdf"].get("preprocess", []) + title_enabled = "title" in self._param.setups["pdf"].get("preprocess", []) raw_parse_method = conf.get("parse_method", "") parser_model_name = None @@ -388,8 +390,6 @@ class Parser(ProcessBase): if position_tag: # Extract position information from TCADP's position tag # Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}## - import re - match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag) if match: pn, x0, x1, top, bott = match.groups() @@ -477,6 +477,8 @@ class Parser(ProcessBase): b["doc_type_kwd"] = "image" elif layout == "table": b["doc_type_kwd"] = "table" + if title_enabled and "title" in str(b.get("layout_type", "").lower()): + b["title"] = True # Get authors if author_enabled: @@ -540,7 +542,6 @@ class Parser(ProcessBase): if abstract_idx is not None: bboxes[abstract_idx]["abstract"] = True - if conf.get("output_format") == "json": self.set_output("json", bboxes) if conf.get("output_format") == "markdown": @@ -654,7 +655,7 @@ class Parser(ProcessBase): for text, image, html in main_sections: section = {"text": text, "image": image} text_key = text.strip() if isinstance(text, str) else "" - if text_key and text_key in title_texts: + if text_key and text_key in title_texts and "title" in self._param.setups["word"].get("preprocess", []): section["title"] = True sections.append(section) tbls.append(((None, html), "")) @@ -761,7 +762,7 @@ class Parser(ProcessBase): "text": section_text, } text_key = section_text.strip() if isinstance(section_text, str) else "" - if text_key and text_key in title_texts: + if text_key and text_key in title_texts and "title" in self._param.setups["text&markdown"].get("preprocess", []): json_result["title"] = True images = [] @@ -1013,6 +1014,7 @@ class Parser(ProcessBase): call_kwargs = dict(kwargs) call_kwargs.pop("name", None) call_kwargs.pop("blob", None) + await thread_pool_exec(function_map[p_type], name, blob, **call_kwargs) done = True break