mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
Feat: introduce minimum type check for pipeline (#14354)
### What problem does this PR solve?

Feat: introduce minimum type check for pipeline

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@@ -38,6 +38,7 @@ from deepdoc.parser.tcadp_parser import TCADPParser
|
||||
from rag.app.naive import Docx
|
||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||
from rag.flow.parser.pdf_chunk_metadata import (
|
||||
extract_pdf_positions,
|
||||
normalize_pdf_items_metadata,
|
||||
reorder_multi_column_bboxes,
|
||||
)
|
||||
@@ -558,7 +559,12 @@ class Parser(ProcessBase):
|
||||
first_outline_page = pdf_parser.outlines[0][2]
|
||||
split_at = len(bboxes)
|
||||
for i, item in enumerate(bboxes):
|
||||
if item["page_number"] >= first_outline_page:
|
||||
page_number = item.get("page_number")
|
||||
if page_number is None:
|
||||
positions = extract_pdf_positions(item)
|
||||
if positions:
|
||||
page_number = positions[0][0]
|
||||
if page_number is not None and page_number >= first_outline_page:
|
||||
split_at = i
|
||||
break
|
||||
toc_bboxes, _ = remove_toc(bboxes[:split_at])
|
||||
|
||||
@@ -72,6 +72,7 @@ def extract_pdf_positions(item):
|
||||
return []
|
||||
|
||||
positions = _extract_raw_positions(item)
|
||||
uses_position_tag = isinstance(item.get("position_tag"), str) and bool(item.get("position_tag"))
|
||||
ref_page_number = item.get("page_number")
|
||||
ref_page_number = int(ref_page_number) if isinstance(ref_page_number, (int, float)) else None
|
||||
if ref_page_number is not None and ref_page_number <= 0:
|
||||
@@ -85,7 +86,9 @@ def extract_pdf_positions(item):
|
||||
page_number = pos[0][-1] if isinstance(pos[0], list) else pos[0]
|
||||
try:
|
||||
page_number = int(page_number)
|
||||
if ref_page_number is not None and page_number == ref_page_number - 1:
|
||||
if uses_position_tag:
|
||||
page_number += 1
|
||||
elif ref_page_number is not None and page_number == ref_page_number - 1:
|
||||
page_number = ref_page_number
|
||||
elif page_number <= 0:
|
||||
page_number += 1
|
||||
|
||||
@@ -68,7 +68,8 @@ class Tokenizer(ProcessBase):
|
||||
embd_model_config = get_tenant_default_model_by_type(self._canvas._tenant_id, LLMType.EMBEDDING)
|
||||
embedding_model = LLMBundle(self._canvas._tenant_id, embd_model_config)
|
||||
texts = []
|
||||
for c in chunks:
|
||||
valid_pairs = []
|
||||
for i, c in enumerate(chunks):
|
||||
txt = ""
|
||||
if isinstance(self._param.fields, str):
|
||||
self._param.fields=[self._param.fields]
|
||||
@@ -78,7 +79,15 @@ class Tokenizer(ProcessBase):
|
||||
txt += f
|
||||
elif isinstance(f, list):
|
||||
txt += "\n".join(f)
|
||||
texts.append(re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt))
|
||||
cleaned_txt = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt).strip()
|
||||
if not cleaned_txt:
|
||||
continue
|
||||
texts.append(cleaned_txt)
|
||||
valid_pairs.append((i, c))
|
||||
|
||||
if not texts:
|
||||
return chunks, token_count
|
||||
|
||||
vts, c = embedding_model.encode([name])
|
||||
token_count += c
|
||||
tts = np.concatenate([vts[0] for _ in range(len(texts))], axis=0)
|
||||
@@ -104,8 +113,8 @@ class Tokenizer(ProcessBase):
|
||||
title_w = float(self._param.filename_embd_weight)
|
||||
vects = (title_w * tts + (1 - title_w) * cnts) if len(tts) == len(cnts) else cnts
|
||||
|
||||
assert len(vects) == len(chunks)
|
||||
for i, ck in enumerate(chunks):
|
||||
assert len(vects) == len(valid_pairs)
|
||||
for i, (_, ck) in enumerate(valid_pairs):
|
||||
v = vects[i].tolist()
|
||||
ck["q_%d_vec" % len(v)] = v
|
||||
return chunks, token_count
|
||||
|
||||
Reference in New Issue
Block a user