Feat: introduce minimum type check for pipeline (#14354)

### What problem does this PR solve?

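Feat: introduce a minimal type check for the pipeline. The parser now falls back to the first position returned by `extract_pdf_positions` when a bbox has no `page_number`; `extract_pdf_positions` increments the page number by one for items that carry a non-empty `position_tag`; and the tokenizer skips chunks whose text is empty after stripping table tags, attaching embedding vectors only to the remaining chunks.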

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Magicbook1108
2026-04-24 21:12:50 +08:00
committed by GitHub
parent 1c244df90d
commit 25089600d0
3 changed files with 24 additions and 6 deletions

View File

@@ -38,6 +38,7 @@ from deepdoc.parser.tcadp_parser import TCADPParser
from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.pdf_chunk_metadata import (
extract_pdf_positions,
normalize_pdf_items_metadata,
reorder_multi_column_bboxes,
)
@@ -558,7 +559,12 @@ class Parser(ProcessBase):
first_outline_page = pdf_parser.outlines[0][2]
split_at = len(bboxes)
for i, item in enumerate(bboxes):
if item["page_number"] >= first_outline_page:
page_number = item.get("page_number")
if page_number is None:
positions = extract_pdf_positions(item)
if positions:
page_number = positions[0][0]
if page_number is not None and page_number >= first_outline_page:
split_at = i
break
toc_bboxes, _ = remove_toc(bboxes[:split_at])

View File

@@ -72,6 +72,7 @@ def extract_pdf_positions(item):
return []
positions = _extract_raw_positions(item)
uses_position_tag = isinstance(item.get("position_tag"), str) and bool(item.get("position_tag"))
ref_page_number = item.get("page_number")
ref_page_number = int(ref_page_number) if isinstance(ref_page_number, (int, float)) else None
if ref_page_number is not None and ref_page_number <= 0:
@@ -85,7 +86,9 @@ def extract_pdf_positions(item):
page_number = pos[0][-1] if isinstance(pos[0], list) else pos[0]
try:
page_number = int(page_number)
if ref_page_number is not None and page_number == ref_page_number - 1:
if uses_position_tag:
page_number += 1
elif ref_page_number is not None and page_number == ref_page_number - 1:
page_number = ref_page_number
elif page_number <= 0:
page_number += 1

View File

@@ -68,7 +68,8 @@ class Tokenizer(ProcessBase):
embd_model_config = get_tenant_default_model_by_type(self._canvas._tenant_id, LLMType.EMBEDDING)
embedding_model = LLMBundle(self._canvas._tenant_id, embd_model_config)
texts = []
for c in chunks:
valid_pairs = []
for i, c in enumerate(chunks):
txt = ""
if isinstance(self._param.fields, str):
self._param.fields=[self._param.fields]
@@ -78,7 +79,15 @@ class Tokenizer(ProcessBase):
txt += f
elif isinstance(f, list):
txt += "\n".join(f)
texts.append(re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt))
cleaned_txt = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt).strip()
if not cleaned_txt:
continue
texts.append(cleaned_txt)
valid_pairs.append((i, c))
if not texts:
return chunks, token_count
vts, c = embedding_model.encode([name])
token_count += c
tts = np.concatenate([vts[0] for _ in range(len(texts))], axis=0)
@@ -104,8 +113,8 @@ class Tokenizer(ProcessBase):
title_w = float(self._param.filename_embd_weight)
vects = (title_w * tts + (1 - title_w) * cnts) if len(tts) == len(cnts) else cnts
assert len(vects) == len(chunks)
for i, ck in enumerate(chunks):
assert len(vects) == len(valid_pairs)
for i, (_, ck) in enumerate(valid_pairs):
v = vects[i].tolist()
ck["q_%d_vec" % len(v)] = v
return chunks, token_count