Feat: add preprocess parameters for ingestion pipeline (#13300)

### What problem does this PR solve?

Feat: add preprocess parameters for ingestion pipeline

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-06-29 23:41:12 +08:00 · 2026-03-02 13:18:57 +08:00
parent 7fc97da610
commit f0dd12289c
1 changed file with 9 additions and 7 deletions
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -304,8 +304,10 @@ class Parser(ProcessBase):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
        conf = self._param.setups["pdf"]
        self.set_output("output_format", conf["output_format"])
-        abstract_enabled = kwargs.get("abstract", False)
-        author_enabled = kwargs.get("author", False)
+
+        abstract_enabled = "abstract" in self._param.setups["pdf"].get("preprocess", [])
+        author_enabled = "author" in self._param.setups["pdf"].get("preprocess", [])
+        title_enabled = "title" in self._param.setups["pdf"].get("preprocess", [])

        raw_parse_method = conf.get("parse_method", "")
        parser_model_name = None
@@ -388,8 +390,6 @@ class Parser(ProcessBase):
                if position_tag:
                    # Extract position information from TCADP's position tag
                    # Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
-                    import re
-
                    match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
                    if match:
                        pn, x0, x1, top, bott = match.groups()
@@ -477,6 +477,8 @@ class Parser(ProcessBase):
                b["doc_type_kwd"] = "image"
            elif layout == "table":
                b["doc_type_kwd"] = "table"
+            if title_enabled and "title" in str(b.get("layout_type", "").lower()):
+                b["title"] = True

        # Get authors
        if author_enabled:
@@ -540,7 +542,6 @@ class Parser(ProcessBase):
            if abstract_idx is not None:
                bboxes[abstract_idx]["abstract"] = True

-
        if conf.get("output_format") == "json":
            self.set_output("json", bboxes)
        if conf.get("output_format") == "markdown":
@@ -654,7 +655,7 @@ class Parser(ProcessBase):
            for text, image, html in main_sections:
                section = {"text": text, "image": image}
                text_key = text.strip() if isinstance(text, str) else ""
-                if text_key and text_key in title_texts:
+                if text_key and text_key in title_texts and "title" in self._param.setups["word"].get("preprocess", []):
                    section["title"] = True
                sections.append(section)
                tbls.append(((None, html), ""))
@@ -761,7 +762,7 @@ class Parser(ProcessBase):
                    "text": section_text,
                }
                text_key = section_text.strip() if isinstance(section_text, str) else ""
-                if text_key and text_key in title_texts:
+                if text_key and text_key in title_texts and "title" in self._param.setups["text&markdown"].get("preprocess", []):
                    json_result["title"] = True

                images = []
@@ -1013,6 +1014,7 @@ class Parser(ProcessBase):
            call_kwargs = dict(kwargs)
            call_kwargs.pop("name", None)
            call_kwargs.pop("blob", None)
+
            await thread_pool_exec(function_map[p_type], name, blob, **call_kwargs)
            done = True
            break