Feat: Refactor pipeline (#13826)

### What problem does this PR solve? ### Type of change - [x] New Feature (non-breaking change which adds functionality) - [x] Refactoring --------- Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-29 15:31:05 +08:00 · 2026-04-03 19:26:45 +08:00
parent 6d9430a125
commit 69264b3a70
71 changed files with 3055 additions and 990 deletions
--- a/deepdoc/parser/docling_parser.py
+++ b/deepdoc/parser/docling_parser.py
@@ -41,6 +41,8 @@ except Exception:
    class RAGFlowPdfParser:  
        pass

+from deepdoc.parser.utils import extract_pdf_outlines
+

 class DoclingContentType(str, Enum):
    IMAGE = "image"
@@ -242,7 +244,7 @@ class DoclingParser(RAGFlowPdfParser):
                continue
            
            tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else ""
-            if parse_method == "manual":
+            if parse_method in {"manual", "pipeline"}:
                sections.append((section, typ, tag))
            elif parse_method == "paper":
                sections.append((section + tag, typ))
@@ -311,7 +313,7 @@ class DoclingParser(RAGFlowPdfParser):
        txt = (text or "").strip()
        if not txt:
            return []
-        if parse_method == "manual":
+        if parse_method in {"manual", "pipeline"}:
            return [(txt, DoclingContentType.TEXT.value, "")]
        if parse_method == "paper":
            return [(txt, DoclingContentType.TEXT.value)]
@@ -455,6 +457,7 @@ class DoclingParser(RAGFlowPdfParser):
        docling_server_url: Optional[str] = None,
        request_timeout: Optional[int] = None,
    ):
+        self.outlines = extract_pdf_outlines(binary if binary is not None else filepath)

        if not self.check_installation(docling_server_url=docling_server_url):
            raise RuntimeError("Docling not available, please install `docling`")
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -35,6 +35,7 @@ from PIL import Image
 from strenum import StrEnum

 from deepdoc.parser.pdf_parser import RAGFlowPdfParser
+from deepdoc.parser.utils import extract_pdf_outlines

 LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
 if LOCK_KEY_pdfplumber not in sys.modules:
@@ -576,7 +577,7 @@ class MinerUParser(RAGFlowPdfParser):
                case MinerUContentType.DISCARDED:
                    continue  # Skip discarded blocks entirely

-            if section and parse_method == "manual":
+            if section and parse_method in {"manual", "pipeline"}:
                sections.append((section, output["type"], self._line_tag(output)))
            elif section and parse_method == "paper":
                sections.append((section + self._line_tag(output), output["type"]))
@@ -602,6 +603,7 @@ class MinerUParser(RAGFlowPdfParser):
    ) -> tuple:
        import shutil

+        self.outlines = extract_pdf_outlines(binary if binary is not None else filepath)
        temp_pdf = None
        created_tmp_dir = False

--- a/deepdoc/parser/paddleocr_parser.py
+++ b/deepdoc/parser/paddleocr_parser.py
@@ -36,6 +36,8 @@ except Exception:
    class RAGFlowPdfParser:
        pass

+from deepdoc.parser.utils import extract_pdf_outlines
+

 AlgorithmType = Literal["PaddleOCR-VL"]
 SectionTuple = tuple[str, ...]
@@ -253,6 +255,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
        **kwargs: Any,
    ) -> ParseResult:
        """Parse PDF document using PaddleOCR API."""
+        self.outlines = extract_pdf_outlines(binary if binary is not None else filepath)
        # Create configuration - pass all kwargs to capture VL config parameters
        config_dict = {
            "api_url": api_url if api_url is not None else self.api_url,
@@ -409,7 +412,7 @@ class PaddleOCRParser(RAGFlowPdfParser):

                    tag = f"@@{page_idx + 1}\t{left // self._ZOOMIN}\t{right // self._ZOOMIN}\t{top // self._ZOOMIN}\t{bottom // self._ZOOMIN}##"

-                    if parse_method == "manual":
+                    if parse_method in {"manual", "pipeline"}:
                        sections.append((block_content, label, tag))
                    elif parse_method == "paper":
                        sections.append((block_content + tag, label))
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -42,6 +42,7 @@ from common.misc_utils import pip_install_torch
 from deepdoc.vision import OCR, AscendLayoutRecognizer, LayoutRecognizer, Recognizer, TableStructureRecognizer
 from rag.nlp import rag_tokenizer
 from rag.prompts.generator import vision_llm_describe_prompt
+from deepdoc.parser.utils import extract_pdf_outlines
 from common import settings


@@ -1582,28 +1583,6 @@ class RAGFlowPdfParser:
            logging.exception(f"RAGFlowPdfParser __images__, exception: {e}")
        logging.info(f"__images__ dedupe_chars cost {timer() - start}s")

-        self.outlines = []
-        try:
-            with pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm)) as pdf:
-                self.pdf = pdf
-
-                outlines = self.pdf.outline
-
-                def dfs(arr, depth):
-                    for a in arr:
-                        if isinstance(a, dict):
-                            self.outlines.append((a["/Title"], depth))
-                            continue
-                        dfs(a, depth + 1)
-
-                dfs(outlines, 0)
-
-        except Exception as e:
-            logging.warning(f"Outlines exception: {e}")
-
-        if not self.outlines:
-            logging.warning("Miss outlines")
-
        logging.debug("Images converted.")
        self.is_english = [
            re.search(r"[ a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i])))))
@@ -1711,6 +1690,7 @@ class RAGFlowPdfParser:
        if auto_rotate_tables is None:
            auto_rotate_tables = os.getenv("TABLE_AUTO_ROTATE", "true").lower() in ("true", "1", "yes")

+        self.outlines = extract_pdf_outlines(fnm)
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin, auto_rotate=auto_rotate_tables)
@@ -1722,6 +1702,7 @@ class RAGFlowPdfParser:

    def parse_into_bboxes(self, fnm, callback=None, zoomin=3):
        start = timer()
+        self.outlines = extract_pdf_outlines(fnm)
        self.__images__(fnm, zoomin, callback=callback)
        if callback:
            callback(0.40, "OCR finished ({:.2f}s)".format(timer() - start))
@@ -1969,27 +1950,14 @@ class RAGFlowPdfParser:

 class PlainParser:
    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
-        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename))
            for page in self.pdf.pages[from_page:to_page]:
                lines.extend([t for t in page.extract_text().split("\n")])
-
-            outlines = self.pdf.outline
-
-            def dfs(arr, depth):
-                for a in arr:
-                    if isinstance(a, dict):
-                        self.outlines.append((a["/Title"], depth))
-                        continue
-                    dfs(a, depth + 1)
-
-            dfs(outlines, 0)
        except Exception:
            logging.exception("Outlines exception")
-        if not self.outlines:
-            logging.warning("Miss outlines")
+        self.outlines = extract_pdf_outlines(filename)

        return [(line, "") for line in lines], []

--- a/deepdoc/parser/tcadp_parser.py
+++ b/deepdoc/parser/tcadp_parser.py
@@ -39,6 +39,7 @@ from tencentcloud.lkeap.v20240522 import lkeap_client, models

 from common.config_utils import get_base_config
 from deepdoc.parser.pdf_parser import RAGFlowPdfParser
+from deepdoc.parser.utils import extract_pdf_outlines


 class TencentCloudAPIClient:
@@ -392,6 +393,7 @@ class TCADPParser(RAGFlowPdfParser):
    ) -> tuple:
        """Parse PDF document"""

+        self.outlines = extract_pdf_outlines(binary if binary else filepath)
        temp_file = None
        created_tmp_dir = False

--- a/deepdoc/parser/utils.py
+++ b/deepdoc/parser/utils.py
@@ -14,6 +14,10 @@
 #  limitations under the License.
 #

+from io import BytesIO
+
+from pypdf import PdfReader as pdf2_read
+
 from rag.nlp import find_codec


@@ -30,3 +34,21 @@ def get_text(fnm: str, binary=None) -> str:
                    break
                txt += line
    return txt
+
+
+def extract_pdf_outlines(source):
+    """Flatten the bookmark (outline) tree of a PDF into a list.
+
+    Args:
+        source: A file path (``str``) or the raw PDF content; anything that
+            is not a ``str`` is wrapped in ``BytesIO`` before being read.
+
+    Returns:
+        A list of ``(title, depth, page_number)`` tuples in traversal order.
+        ``depth`` is 0 for top-level entries and grows by one per nested
+        list; ``page_number`` is the destination page index plus one.
+        Entries whose destination page cannot be resolved are skipped.
+        An empty list is returned when the PDF has no outline or when it
+        cannot be read at all (best-effort: this function does not raise).
+    """
+    import logging  # local import keeps this helper self-contained
+
+    outlines = []
+    try:
+        with pdf2_read(source if isinstance(source, str) else BytesIO(source)) as pdf:
+
+            def dfs(nodes, depth):
+                for node in nodes:
+                    if isinstance(node, list):
+                        # A nested list holds the children of the preceding entry.
+                        dfs(node, depth + 1)
+                        continue
+                    try:
+                        page_index = pdf.get_destination_page_number(node)
+                    except Exception as e:
+                        # One broken bookmark must not discard the whole outline.
+                        logging.debug("Outline entry skipped, page lookup failed: %s", e)
+                        continue
+                    if page_index is None:
+                        # The reader could not map the destination to a page;
+                        # `None + 1` would otherwise abort the entire extraction.
+                        logging.debug("Outline entry skipped, destination page not found")
+                        continue
+                    outlines.append((node["/Title"], depth, page_index + 1))
+
+            dfs(pdf.outline, 0)
+    except Exception as e:
+        # Outlines are optional metadata: report the failure and fall back to
+        # "no outlines" instead of failing the surrounding parse.
+        logging.warning("Outlines exception: %s", e)
+        return []
+    return outlines