From f0dd12289c12ed7be74a470982532d3e604c9c87 Mon Sep 17 00:00:00 2001
From: Magicbook1108 <newyorkupperbay@gmail.com>
Date: Mon, 2 Mar 2026 13:18:57 +0800
Subject: [PATCH] Feat: add preprocess parameters for ingestion pipeline
 (#13300)

### What problem does this PR solve?
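Feat: add preprocess parameters for ingestion pipeline.

The parser now reads an optional `preprocess` list from each parser setup
instead of relying on call-time keyword arguments:

- PDF: `abstract` and `author` detection are enabled when those names appear
  in `setups["pdf"]["preprocess"]` (previously they came from the `abstract`
  and `author` kwargs). A new `title` option marks boxes whose `layout_type`
  contains "title" with `title = True`.
- Word and text&markdown: sections are flagged as titles only when `title` is
  listed in the corresponding setup's `preprocess` list.

When `preprocess` is absent it defaults to an empty list, so none of these
options are enabled. A redundant function-local `import re` is also removed.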

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 rag/flow/parser/parser.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py
index 206f4ede47..e42d007d08 100644
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -304,8 +304,10 @@ class Parser(ProcessBase):
         self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
         conf = self._param.setups["pdf"]
         self.set_output("output_format", conf["output_format"])
-        abstract_enabled = kwargs.get("abstract", False)
-        author_enabled = kwargs.get("author", False)
+
+        abstract_enabled = "abstract" in self._param.setups["pdf"].get("preprocess", [])
+        author_enabled = "author" in self._param.setups["pdf"].get("preprocess", [])
+        title_enabled = "title" in self._param.setups["pdf"].get("preprocess", [])
 
         raw_parse_method = conf.get("parse_method", "")
         parser_model_name = None
@@ -388,8 +390,6 @@ class Parser(ProcessBase):
                 if position_tag:
                     # Extract position information from TCADP's position tag
                     # Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
-                    import re
-
                     match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
                     if match:
                         pn, x0, x1, top, bott = match.groups()
@@ -477,6 +477,8 @@ class Parser(ProcessBase):
                 b["doc_type_kwd"] = "image"
             elif layout == "table":
                 b["doc_type_kwd"] = "table"
+            if title_enabled and "title" in str(b.get("layout_type", "").lower()):
+                b["title"] = True
 
         # Get authors
         if author_enabled:
@@ -540,7 +542,6 @@ class Parser(ProcessBase):
             if abstract_idx is not None:
                 bboxes[abstract_idx]["abstract"] = True
 
-
         if conf.get("output_format") == "json":
             self.set_output("json", bboxes)
         if conf.get("output_format") == "markdown":
@@ -654,7 +655,7 @@ class Parser(ProcessBase):
             for text, image, html in main_sections:
                 section = {"text": text, "image": image}
                 text_key = text.strip() if isinstance(text, str) else ""
-                if text_key and text_key in title_texts:
+                if text_key and text_key in title_texts and "title" in self._param.setups["word"].get("preprocess", []):
                     section["title"] = True
                 sections.append(section)
                 tbls.append(((None, html), ""))
@@ -761,7 +762,7 @@ class Parser(ProcessBase):
                     "text": section_text,
                 }
                 text_key = section_text.strip() if isinstance(section_text, str) else ""
-                if text_key and text_key in title_texts:
+                if text_key and text_key in title_texts and "title" in self._param.setups["text&markdown"].get("preprocess", []):
                     json_result["title"] = True
 
                 images = []
@@ -1013,6 +1014,7 @@ class Parser(ProcessBase):
             call_kwargs = dict(kwargs)
             call_kwargs.pop("name", None)
             call_kwargs.pop("blob", None)
+
             await thread_pool_exec(function_map[p_type], name, blob, **call_kwargs)
             done = True
             break