Feat: add preprocess parameters for ingestion pipeline (#13300)

### What problem does this PR solve?
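Feat: add preprocess parameters for the ingestion pipeline. The parser now reads a `preprocess` list from each setup to decide which annotations to apply: `title`, `author`, and `abstract` for `pdf`, and `title` for `word` and `text&markdown`. For PDFs this replaces the `abstract`/`author` keyword arguments previously read at call time.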

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Magicbook1108
2026-03-02 13:18:57 +08:00
committed by GitHub
parent 7fc97da610
commit f0dd12289c

View File

@@ -304,8 +304,10 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
conf = self._param.setups["pdf"]
self.set_output("output_format", conf["output_format"])
abstract_enabled = kwargs.get("abstract", False)
author_enabled = kwargs.get("author", False)
abstract_enabled = "abstract" in self._param.setups["pdf"].get("preprocess", [])
author_enabled = "author" in self._param.setups["pdf"].get("preprocess", [])
title_enabled = "title" in self._param.setups["pdf"].get("preprocess", [])
raw_parse_method = conf.get("parse_method", "")
parser_model_name = None
@@ -388,8 +390,6 @@ class Parser(ProcessBase):
if position_tag:
# Extract position information from TCADP's position tag
# Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
import re
match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
if match:
pn, x0, x1, top, bott = match.groups()
@@ -477,6 +477,8 @@ class Parser(ProcessBase):
b["doc_type_kwd"] = "image"
elif layout == "table":
b["doc_type_kwd"] = "table"
if title_enabled and "title" in str(b.get("layout_type", "").lower()):
b["title"] = True
# Get authors
if author_enabled:
@@ -540,7 +542,6 @@ class Parser(ProcessBase):
if abstract_idx is not None:
bboxes[abstract_idx]["abstract"] = True
if conf.get("output_format") == "json":
self.set_output("json", bboxes)
if conf.get("output_format") == "markdown":
@@ -654,7 +655,7 @@ class Parser(ProcessBase):
for text, image, html in main_sections:
section = {"text": text, "image": image}
text_key = text.strip() if isinstance(text, str) else ""
if text_key and text_key in title_texts:
if text_key and text_key in title_texts and "title" in self._param.setups["word"].get("preprocess", []):
section["title"] = True
sections.append(section)
tbls.append(((None, html), ""))
@@ -761,7 +762,7 @@ class Parser(ProcessBase):
"text": section_text,
}
text_key = section_text.strip() if isinstance(section_text, str) else ""
if text_key and text_key in title_texts:
if text_key and text_key in title_texts and "title" in self._param.setups["text&markdown"].get("preprocess", []):
json_result["title"] = True
images = []
@@ -1013,6 +1014,7 @@ class Parser(ProcessBase):
call_kwargs = dict(kwargs)
call_kwargs.pop("name", None)
call_kwargs.pop("blob", None)
await thread_pool_exec(function_map[p_type], name, blob, **call_kwargs)
done = True
break