mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
Feat: Refact pipeline (#13826)
### What problem does this PR solve?

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring

---------

Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -41,6 +41,8 @@ except Exception:
|
||||
class RAGFlowPdfParser:
|
||||
pass
|
||||
|
||||
from deepdoc.parser.utils import extract_pdf_outlines
|
||||
|
||||
|
||||
class DoclingContentType(str, Enum):
|
||||
IMAGE = "image"
|
||||
@@ -242,7 +244,7 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
continue
|
||||
|
||||
tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else ""
|
||||
if parse_method == "manual":
|
||||
if parse_method in {"manual", "pipeline"}:
|
||||
sections.append((section, typ, tag))
|
||||
elif parse_method == "paper":
|
||||
sections.append((section + tag, typ))
|
||||
@@ -311,7 +313,7 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
txt = (text or "").strip()
|
||||
if not txt:
|
||||
return []
|
||||
if parse_method == "manual":
|
||||
if parse_method in {"manual", "pipeline"}:
|
||||
return [(txt, DoclingContentType.TEXT.value, "")]
|
||||
if parse_method == "paper":
|
||||
return [(txt, DoclingContentType.TEXT.value)]
|
||||
@@ -455,6 +457,7 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
docling_server_url: Optional[str] = None,
|
||||
request_timeout: Optional[int] = None,
|
||||
):
|
||||
self.outlines = extract_pdf_outlines(binary if binary is not None else filepath)
|
||||
|
||||
if not self.check_installation(docling_server_url=docling_server_url):
|
||||
raise RuntimeError("Docling not available, please install `docling`")
|
||||
|
||||
@@ -35,6 +35,7 @@ from PIL import Image
|
||||
from strenum import StrEnum
|
||||
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
from deepdoc.parser.utils import extract_pdf_outlines
|
||||
|
||||
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
|
||||
if LOCK_KEY_pdfplumber not in sys.modules:
|
||||
@@ -576,7 +577,7 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
case MinerUContentType.DISCARDED:
|
||||
continue # Skip discarded blocks entirely
|
||||
|
||||
if section and parse_method == "manual":
|
||||
if section and parse_method in {"manual", "pipeline"}:
|
||||
sections.append((section, output["type"], self._line_tag(output)))
|
||||
elif section and parse_method == "paper":
|
||||
sections.append((section + self._line_tag(output), output["type"]))
|
||||
@@ -602,6 +603,7 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
) -> tuple:
|
||||
import shutil
|
||||
|
||||
self.outlines = extract_pdf_outlines(binary if binary is not None else filepath)
|
||||
temp_pdf = None
|
||||
created_tmp_dir = False
|
||||
|
||||
|
||||
@@ -36,6 +36,8 @@ except Exception:
|
||||
class RAGFlowPdfParser:
|
||||
pass
|
||||
|
||||
from deepdoc.parser.utils import extract_pdf_outlines
|
||||
|
||||
|
||||
AlgorithmType = Literal["PaddleOCR-VL"]
|
||||
SectionTuple = tuple[str, ...]
|
||||
@@ -253,6 +255,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
**kwargs: Any,
|
||||
) -> ParseResult:
|
||||
"""Parse PDF document using PaddleOCR API."""
|
||||
self.outlines = extract_pdf_outlines(binary if binary is not None else filepath)
|
||||
# Create configuration - pass all kwargs to capture VL config parameters
|
||||
config_dict = {
|
||||
"api_url": api_url if api_url is not None else self.api_url,
|
||||
@@ -409,7 +412,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
|
||||
tag = f"@@{page_idx + 1}\t{left // self._ZOOMIN}\t{right // self._ZOOMIN}\t{top // self._ZOOMIN}\t{bottom // self._ZOOMIN}##"
|
||||
|
||||
if parse_method == "manual":
|
||||
if parse_method in {"manual", "pipeline"}:
|
||||
sections.append((block_content, label, tag))
|
||||
elif parse_method == "paper":
|
||||
sections.append((block_content + tag, label))
|
||||
|
||||
@@ -42,6 +42,7 @@ from common.misc_utils import pip_install_torch
|
||||
from deepdoc.vision import OCR, AscendLayoutRecognizer, LayoutRecognizer, Recognizer, TableStructureRecognizer
|
||||
from rag.nlp import rag_tokenizer
|
||||
from rag.prompts.generator import vision_llm_describe_prompt
|
||||
from deepdoc.parser.utils import extract_pdf_outlines
|
||||
from common import settings
|
||||
|
||||
|
||||
@@ -1582,28 +1583,6 @@ class RAGFlowPdfParser:
|
||||
logging.exception(f"RAGFlowPdfParser __images__, exception: {e}")
|
||||
logging.info(f"__images__ dedupe_chars cost {timer() - start}s")
|
||||
|
||||
self.outlines = []
|
||||
try:
|
||||
with pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm)) as pdf:
|
||||
self.pdf = pdf
|
||||
|
||||
outlines = self.pdf.outline
|
||||
|
||||
def dfs(arr, depth):
|
||||
for a in arr:
|
||||
if isinstance(a, dict):
|
||||
self.outlines.append((a["/Title"], depth))
|
||||
continue
|
||||
dfs(a, depth + 1)
|
||||
|
||||
dfs(outlines, 0)
|
||||
|
||||
except Exception as e:
|
||||
logging.warning(f"Outlines exception: {e}")
|
||||
|
||||
if not self.outlines:
|
||||
logging.warning("Miss outlines")
|
||||
|
||||
logging.debug("Images converted.")
|
||||
self.is_english = [
|
||||
re.search(r"[ a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i])))))
|
||||
@@ -1711,6 +1690,7 @@ class RAGFlowPdfParser:
|
||||
if auto_rotate_tables is None:
|
||||
auto_rotate_tables = os.getenv("TABLE_AUTO_ROTATE", "true").lower() in ("true", "1", "yes")
|
||||
|
||||
self.outlines = extract_pdf_outlines(fnm)
|
||||
self.__images__(fnm, zoomin)
|
||||
self._layouts_rec(zoomin)
|
||||
self._table_transformer_job(zoomin, auto_rotate=auto_rotate_tables)
|
||||
@@ -1722,6 +1702,7 @@ class RAGFlowPdfParser:
|
||||
|
||||
def parse_into_bboxes(self, fnm, callback=None, zoomin=3):
|
||||
start = timer()
|
||||
self.outlines = extract_pdf_outlines(fnm)
|
||||
self.__images__(fnm, zoomin, callback=callback)
|
||||
if callback:
|
||||
callback(0.40, "OCR finished ({:.2f}s)".format(timer() - start))
|
||||
@@ -1969,27 +1950,14 @@ class RAGFlowPdfParser:
|
||||
|
||||
class PlainParser:
|
||||
def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
|
||||
self.outlines = []
|
||||
lines = []
|
||||
try:
|
||||
self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename))
|
||||
for page in self.pdf.pages[from_page:to_page]:
|
||||
lines.extend([t for t in page.extract_text().split("\n")])
|
||||
|
||||
outlines = self.pdf.outline
|
||||
|
||||
def dfs(arr, depth):
|
||||
for a in arr:
|
||||
if isinstance(a, dict):
|
||||
self.outlines.append((a["/Title"], depth))
|
||||
continue
|
||||
dfs(a, depth + 1)
|
||||
|
||||
dfs(outlines, 0)
|
||||
except Exception:
|
||||
logging.exception("Outlines exception")
|
||||
if not self.outlines:
|
||||
logging.warning("Miss outlines")
|
||||
self.outlines = extract_pdf_outlines(filename)
|
||||
|
||||
return [(line, "") for line in lines], []
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ from tencentcloud.lkeap.v20240522 import lkeap_client, models
|
||||
|
||||
from common.config_utils import get_base_config
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
from deepdoc.parser.utils import extract_pdf_outlines
|
||||
|
||||
|
||||
class TencentCloudAPIClient:
|
||||
@@ -392,6 +393,7 @@ class TCADPParser(RAGFlowPdfParser):
|
||||
) -> tuple:
|
||||
"""Parse PDF document"""
|
||||
|
||||
self.outlines = extract_pdf_outlines(binary if binary else filepath)
|
||||
temp_file = None
|
||||
created_tmp_dir = False
|
||||
|
||||
|
||||
@@ -14,6 +14,10 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
from pypdf import PdfReader as pdf2_read
|
||||
|
||||
from rag.nlp import find_codec
|
||||
|
||||
|
||||
@@ -30,3 +34,21 @@ def get_text(fnm: str, binary=None) -> str:
|
||||
break
|
||||
txt += line
|
||||
return txt
|
||||
|
||||
|
||||
def extract_pdf_outlines(source):
    """Return the bookmark outline of a PDF as a flat list.

    Extraction is best-effort: this function never raises. Any failure to
    open or walk the document is logged and yields an empty list.

    Args:
        source: Path to a PDF file (``str`` or ``os.PathLike``), or the raw
            PDF content as a bytes-like object.

    Returns:
        A list of ``(title, depth, page_number)`` tuples in document order.
        ``depth`` is 0 for top-level entries and increases by one for each
        level of nesting; ``page_number`` is 1-based. Entries whose title or
        target page cannot be resolved are skipped. The list is empty when
        the document has no outline or cannot be read.
    """
    # Imported locally so the helper does not depend on the module's
    # top-level import block.
    import logging
    import os

    try:
        is_path = isinstance(source, (str, os.PathLike))
        with pdf2_read(source if is_path else BytesIO(source)) as pdf:
            outlines = []

            def collect(nodes, depth):
                for node in nodes:
                    # Children of an entry are delivered as a nested list
                    # that follows the entry itself.
                    if isinstance(node, list):
                        collect(node, depth + 1)
                        continue
                    try:
                        title = node["/Title"]
                        page_index = pdf.get_destination_page_number(node)
                    except Exception as e:
                        # One malformed bookmark must not discard the rest
                        # of the outline.
                        logging.debug("Skip unreadable PDF outline entry: %s", e)
                        continue
                    if page_index is None:
                        # The reader could not map the destination to a
                        # page; adding 1 to None would abort the whole walk.
                        logging.debug("Skip PDF outline entry %r: destination page not found", title)
                        continue
                    outlines.append((title, depth, page_index + 1))

            collect(pdf.outline, 0)
            return outlines
    except Exception as e:
        logging.warning("Failed to extract PDF outlines: %s", e)
        return []
|
||||
|
||||
Reference in New Issue
Block a user