From 020068dd16bead96b1cae9b90b004442f0e159ff Mon Sep 17 00:00:00 2001 From: tunsuy <957126743@qq.com> Date: Wed, 4 Mar 2026 21:42:02 +0800 Subject: [PATCH] =?UTF-8?q?Fix:=20preserve=20field=20boundaries=20in=20chu?= =?UTF-8?q?nked=20documents=20from=20MySQL=E2=80=A6=20(#13369)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? When multiple columns are used as content columns in RDBMS connector, the generated document text gets chunked by TxtParser which strips newline delimiters during merge. This causes field names and values from different columns to be concatenated without any separator, making the content unreadable. Changes: - txt_parser.py: restore newline separator when merging adjacent text segments within a chunk, so that split sections are not directly concatenated - rdbms_connector.py: use double newline between fields and place field value on a new line after the field name bracket, giving TxtParser clearer boundaries to work with Closes #13001 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) Co-authored-by: tunsuytang --- common/data_source/rdbms_connector.py | 8 ++++---- deepdoc/parser/txt_parser.py | 5 ++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/common/data_source/rdbms_connector.py b/common/data_source/rdbms_connector.py index 2902041bd5..05628501c6 100644 --- a/common/data_source/rdbms_connector.py +++ b/common/data_source/rdbms_connector.py @@ -204,11 +204,11 @@ class RDBMSConnector(LoadConnector, PollConnector): value = row_dict[col] if isinstance(value, (dict, list)): value = json.dumps(value, ensure_ascii=False) - # Use brackets around field name to ensure it's distinguishable - # after chunking (TxtParser strips \n delimiters during merge) - content_parts.append(f"【{col}】: {value}") + # Use brackets around field name and put value on a new line + # so that TxtParser preserves field boundaries after chunking. + content_parts.append(f"【{col}】:\n{value}") - content = "\n".join(content_parts) + content = "\n\n".join(content_parts) if self.id_column and self.id_column in row_dict: doc_id = f"{self.db_type}:{self.database}:{row_dict[self.id_column]}" diff --git a/deepdoc/parser/txt_parser.py b/deepdoc/parser/txt_parser.py index 64e200cbc6..6abf8591da 100644 --- a/deepdoc/parser/txt_parser.py +++ b/deepdoc/parser/txt_parser.py @@ -40,7 +40,10 @@ class RAGFlowTxtParser: cks.append(t) tk_nums.append(tnum) else: - cks[-1] += t + if cks[-1]: + cks[-1] += "\n" + t + else: + cks[-1] += t tk_nums[-1] += tnum dels = []