Feat: Refact pipeline (#13826)

### What problem does this PR solve?

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring

---------

Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Magicbook1108
2026-04-03 19:26:45 +08:00
committed by GitHub
parent 6d9430a125
commit 69264b3a70
71 changed files with 3055 additions and 990 deletions

View File

@@ -41,6 +41,8 @@ except Exception:
class RAGFlowPdfParser:
pass
from deepdoc.parser.utils import extract_pdf_outlines
class DoclingContentType(str, Enum):
IMAGE = "image"
@@ -242,7 +244,7 @@ class DoclingParser(RAGFlowPdfParser):
continue
tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else ""
if parse_method == "manual":
if parse_method in {"manual", "pipeline"}:
sections.append((section, typ, tag))
elif parse_method == "paper":
sections.append((section + tag, typ))
@@ -311,7 +313,7 @@ class DoclingParser(RAGFlowPdfParser):
txt = (text or "").strip()
if not txt:
return []
if parse_method == "manual":
if parse_method in {"manual", "pipeline"}:
return [(txt, DoclingContentType.TEXT.value, "")]
if parse_method == "paper":
return [(txt, DoclingContentType.TEXT.value)]
@@ -455,6 +457,7 @@ class DoclingParser(RAGFlowPdfParser):
docling_server_url: Optional[str] = None,
request_timeout: Optional[int] = None,
):
self.outlines = extract_pdf_outlines(binary if binary is not None else filepath)
if not self.check_installation(docling_server_url=docling_server_url):
raise RuntimeError("Docling not available, please install `docling`")

View File

@@ -35,6 +35,7 @@ from PIL import Image
from strenum import StrEnum
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
from deepdoc.parser.utils import extract_pdf_outlines
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
@@ -576,7 +577,7 @@ class MinerUParser(RAGFlowPdfParser):
case MinerUContentType.DISCARDED:
continue # Skip discarded blocks entirely
if section and parse_method == "manual":
if section and parse_method in {"manual", "pipeline"}:
sections.append((section, output["type"], self._line_tag(output)))
elif section and parse_method == "paper":
sections.append((section + self._line_tag(output), output["type"]))
@@ -602,6 +603,7 @@ class MinerUParser(RAGFlowPdfParser):
) -> tuple:
import shutil
self.outlines = extract_pdf_outlines(binary if binary is not None else filepath)
temp_pdf = None
created_tmp_dir = False

View File

@@ -36,6 +36,8 @@ except Exception:
class RAGFlowPdfParser:
pass
from deepdoc.parser.utils import extract_pdf_outlines
AlgorithmType = Literal["PaddleOCR-VL"]
SectionTuple = tuple[str, ...]
@@ -253,6 +255,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
**kwargs: Any,
) -> ParseResult:
"""Parse PDF document using PaddleOCR API."""
self.outlines = extract_pdf_outlines(binary if binary is not None else filepath)
# Create configuration - pass all kwargs to capture VL config parameters
config_dict = {
"api_url": api_url if api_url is not None else self.api_url,
@@ -409,7 +412,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
tag = f"@@{page_idx + 1}\t{left // self._ZOOMIN}\t{right // self._ZOOMIN}\t{top // self._ZOOMIN}\t{bottom // self._ZOOMIN}##"
if parse_method == "manual":
if parse_method in {"manual", "pipeline"}:
sections.append((block_content, label, tag))
elif parse_method == "paper":
sections.append((block_content + tag, label))

View File

@@ -42,6 +42,7 @@ from common.misc_utils import pip_install_torch
from deepdoc.vision import OCR, AscendLayoutRecognizer, LayoutRecognizer, Recognizer, TableStructureRecognizer
from rag.nlp import rag_tokenizer
from rag.prompts.generator import vision_llm_describe_prompt
from deepdoc.parser.utils import extract_pdf_outlines
from common import settings
@@ -1582,28 +1583,6 @@ class RAGFlowPdfParser:
logging.exception(f"RAGFlowPdfParser __images__, exception: {e}")
logging.info(f"__images__ dedupe_chars cost {timer() - start}s")
self.outlines = []
try:
with pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm)) as pdf:
self.pdf = pdf
outlines = self.pdf.outline
def dfs(arr, depth):
for a in arr:
if isinstance(a, dict):
self.outlines.append((a["/Title"], depth))
continue
dfs(a, depth + 1)
dfs(outlines, 0)
except Exception as e:
logging.warning(f"Outlines exception: {e}")
if not self.outlines:
logging.warning("Miss outlines")
logging.debug("Images converted.")
self.is_english = [
re.search(r"[ a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i])))))
@@ -1711,6 +1690,7 @@ class RAGFlowPdfParser:
if auto_rotate_tables is None:
auto_rotate_tables = os.getenv("TABLE_AUTO_ROTATE", "true").lower() in ("true", "1", "yes")
self.outlines = extract_pdf_outlines(fnm)
self.__images__(fnm, zoomin)
self._layouts_rec(zoomin)
self._table_transformer_job(zoomin, auto_rotate=auto_rotate_tables)
@@ -1722,6 +1702,7 @@ class RAGFlowPdfParser:
def parse_into_bboxes(self, fnm, callback=None, zoomin=3):
start = timer()
self.outlines = extract_pdf_outlines(fnm)
self.__images__(fnm, zoomin, callback=callback)
if callback:
callback(0.40, "OCR finished ({:.2f}s)".format(timer() - start))
@@ -1969,27 +1950,14 @@ class RAGFlowPdfParser:
class PlainParser:
def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
self.outlines = []
lines = []
try:
self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename))
for page in self.pdf.pages[from_page:to_page]:
lines.extend([t for t in page.extract_text().split("\n")])
outlines = self.pdf.outline
def dfs(arr, depth):
for a in arr:
if isinstance(a, dict):
self.outlines.append((a["/Title"], depth))
continue
dfs(a, depth + 1)
dfs(outlines, 0)
except Exception:
logging.exception("Outlines exception")
if not self.outlines:
logging.warning("Miss outlines")
self.outlines = extract_pdf_outlines(filename)
return [(line, "") for line in lines], []

View File

@@ -39,6 +39,7 @@ from tencentcloud.lkeap.v20240522 import lkeap_client, models
from common.config_utils import get_base_config
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
from deepdoc.parser.utils import extract_pdf_outlines
class TencentCloudAPIClient:
@@ -392,6 +393,7 @@ class TCADPParser(RAGFlowPdfParser):
) -> tuple:
"""Parse PDF document"""
self.outlines = extract_pdf_outlines(binary if binary else filepath)
temp_file = None
created_tmp_dir = False

View File

@@ -14,6 +14,10 @@
# limitations under the License.
#
from io import BytesIO
from pypdf import PdfReader as pdf2_read
from rag.nlp import find_codec
@@ -30,3 +34,21 @@ def get_text(fnm: str, binary=None) -> str:
break
txt += line
return txt
def extract_pdf_outlines(source):
    """Extract the outline (bookmark tree) of a PDF as a flat list.

    Args:
        source: A filesystem path (``str`` or ``os.PathLike``) or the raw PDF
            content as a bytes-like object.

    Returns:
        A list of ``(title, depth, page_number)`` tuples in document order.
        ``depth`` is 0 for top-level entries and grows by one for each level
        of nesting; ``page_number`` is 1-based. An empty list is returned
        when ``source`` is ``None``, when it cannot be read as a PDF, or when
        the document has no usable outline entries.
    """
    # Function-scope imports keep this helper self-contained regardless of
    # what the surrounding module already imports.
    import logging
    from os import PathLike

    if source is None:
        return []

    outlines = []

    def dfs(pdf, nodes, depth):
        for node in nodes:
            # pypdf represents the children of the preceding entry as a
            # nested list, so a list means "one level deeper".
            if isinstance(node, list):
                dfs(pdf, node, depth + 1)
                continue
            try:
                title = node["/Title"]
                page_index = pdf.get_destination_page_number(node)
            except Exception as e:
                # One malformed bookmark must not discard the whole outline.
                logging.debug("Skip unreadable outline entry: %s", e)
                continue
            if page_index is None:
                # The destination does not resolve to a page of this
                # document; there is no page number to report for it.
                logging.debug("Skip outline entry without a target page: %r", title)
                continue
            outlines.append((title, depth, page_index + 1))

    try:
        stream = source if isinstance(source, (str, PathLike)) else BytesIO(source)
        with pdf2_read(stream) as pdf:
            dfs(pdf, pdf.outline, 0)
    except Exception as e:
        # Outline extraction is best-effort: report the problem and let the
        # caller continue without outlines instead of failing the parse.
        logging.warning("Outlines exception: %s", e)
        return []
    return outlines