diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index d1fd7ead38..4583b52263 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -38,6 +38,7 @@ from deepdoc.parser.tcadp_parser import TCADPParser from rag.app.naive import Docx from rag.flow.base import ProcessBase, ProcessParamBase from rag.flow.parser.pdf_chunk_metadata import ( + extract_pdf_positions, normalize_pdf_items_metadata, reorder_multi_column_bboxes, ) @@ -558,7 +559,12 @@ class Parser(ProcessBase): first_outline_page = pdf_parser.outlines[0][2] split_at = len(bboxes) for i, item in enumerate(bboxes): - if item["page_number"] >= first_outline_page: + page_number = item.get("page_number") + if page_number is None: + positions = extract_pdf_positions(item) + if positions: + page_number = positions[0][0] + if page_number is not None and page_number >= first_outline_page: split_at = i break toc_bboxes, _ = remove_toc(bboxes[:split_at]) diff --git a/rag/flow/parser/pdf_chunk_metadata.py b/rag/flow/parser/pdf_chunk_metadata.py index 175ac3772e..74921b201b 100644 --- a/rag/flow/parser/pdf_chunk_metadata.py +++ b/rag/flow/parser/pdf_chunk_metadata.py @@ -72,6 +72,7 @@ def extract_pdf_positions(item): return [] positions = _extract_raw_positions(item) + uses_position_tag = isinstance(item.get("position_tag"), str) and bool(item.get("position_tag")) ref_page_number = item.get("page_number") ref_page_number = int(ref_page_number) if isinstance(ref_page_number, (int, float)) else None if ref_page_number is not None and ref_page_number <= 0: @@ -85,7 +86,9 @@ def extract_pdf_positions(item): page_number = pos[0][-1] if isinstance(pos[0], list) else pos[0] try: page_number = int(page_number) - if ref_page_number is not None and page_number == ref_page_number - 1: + if uses_position_tag: + page_number += 1 + elif ref_page_number is not None and page_number == ref_page_number - 1: page_number = ref_page_number elif page_number <= 0: page_number += 1 diff --git 
a/rag/flow/tokenizer/tokenizer.py b/rag/flow/tokenizer/tokenizer.py index 9992ca722b..467594a312 100644 --- a/rag/flow/tokenizer/tokenizer.py +++ b/rag/flow/tokenizer/tokenizer.py @@ -68,7 +68,8 @@ class Tokenizer(ProcessBase): embd_model_config = get_tenant_default_model_by_type(self._canvas._tenant_id, LLMType.EMBEDDING) embedding_model = LLMBundle(self._canvas._tenant_id, embd_model_config) texts = [] - for c in chunks: + valid_pairs = [] + for i, c in enumerate(chunks): txt = "" if isinstance(self._param.fields, str): self._param.fields=[self._param.fields] @@ -78,7 +79,15 @@ class Tokenizer(ProcessBase): txt += f elif isinstance(f, list): txt += "\n".join(f) - texts.append(re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt)) + cleaned_txt = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt).strip() + if not cleaned_txt: + continue + texts.append(cleaned_txt) + valid_pairs.append((i, c)) + + if not texts: + return chunks, token_count + vts, c = embedding_model.encode([name]) token_count += c tts = np.concatenate([vts[0] for _ in range(len(texts))], axis=0) @@ -104,8 +113,8 @@ class Tokenizer(ProcessBase): title_w = float(self._param.filename_embd_weight) vects = (title_w * tts + (1 - title_w) * cnts) if len(tts) == len(cnts) else cnts - assert len(vects) == len(chunks) - for i, ck in enumerate(chunks): + assert len(vects) == len(valid_pairs) + for i, (_, ck) in enumerate(valid_pairs): v = vects[i].tolist() ck["q_%d_vec" % len(v)] = v return chunks, token_count