diff --git a/rag/flow/chunker/title_chunker/common.py b/rag/flow/chunker/title_chunker/common.py index 89981a83de..0ca6549a96 100644 --- a/rag/flow/chunker/title_chunker/common.py +++ b/rag/flow/chunker/title_chunker/common.py @@ -73,25 +73,61 @@ class BaseTitleChunker(ABC): def extract_line_records(self): - # Normalize all upstream payloads into an ordered record stream. - # Level resolution and chunk construction operate on this stream only, - # so strategy code does not depend on source-specific output layouts. + """ + Normalize all upstream input payloads into a unified ordered record stream. + All level resolution and chunk construction logic operates on this standard stream, + decoupling downstream chunking strategies from different upstream output formats. + """ + import logging + logger = logging.getLogger(__name__) + + payload = None + # Extract raw content payload based on upstream output format type if self.from_upstream.output_format == "markdown": payload = self.from_upstream.markdown_result or "" - return [{"text": line, "doc_type_kwd": "text", "img_id": None, "layout": "", PDF_POSITIONS_KEY: []} for line in payload.split("\n") if line] - - if self.from_upstream.output_format == "text": + elif self.from_upstream.output_format == "text": payload = self.from_upstream.text_result or "" - return [{"text": line, "doc_type_kwd": "text", "img_id": None, "layout": "", PDF_POSITIONS_KEY: []} for line in payload.split("\n") if line] - - if self.from_upstream.output_format == "html": + elif self.from_upstream.output_format == "html": payload = self.from_upstream.html_result or "" - return [{"text": line, "doc_type_kwd": "text", "img_id": None, "layout": "", PDF_POSITIONS_KEY: []} for line in payload.split("\n") if line] + + # Boundary robustness fix: explicit None check to distinguish `None` and empty string "" + # Prevents empty payload from unexpectedly falling through to structured chunk branch + if payload is not None: + lines = payload.split("\n") + input_line_count = len(lines) + + # Format-branched text processing to preserve original document semantics + # Plain text: perform full whitespace stripping and invalid empty line filtering + if self.from_upstream.output_format == "text": + clean_lines = [line.strip() for line in lines if line.strip()] + # Markdown & HTML: retain original indentation/spacing, only filter pure blank lines + else: + clean_lines = [line for line in lines if line.strip()] + + output_line_count = len(clean_lines) + # Production observability log: added format dimension per project coding guidelines + logger.info( + f"payload filter: format={self.from_upstream.output_format} before={input_line_count} after={output_line_count}" + ) + + return [ + { + "text": line, + "doc_type_kwd": "text", + "img_id": None, + "layout": "", + PDF_POSITIONS_KEY: [] + } + for line in clean_lines + ] + # Return empty array directly for null payload to block invalid branch fallthrough + return [] items = self.from_upstream.chunks if self.from_upstream.output_format == "chunks" else self.from_upstream.json_result return [ { - "text": str(item.get("text") or ""), + # Serialization fix: avoid None value being converted into literal "None" string + "text": item.get("text") or "", "doc_type_kwd": str(item.get("doc_type_kwd") or "text"), "img_id": item.get("img_id"), "layout": "{} {}".format(item.get("layout_type", ""), item.get("layoutno", "")).strip(), @@ -100,7 +136,6 @@ class BaseTitleChunker(ABC): for item in items or [] ] - def extract_outlines(self): file = self.from_upstream.file or {} source = (