From 020068dd16bead96b1cae9b90b004442f0e159ff Mon Sep 17 00:00:00 2001
From: tunsuy <957126743@qq.com>
Date: Wed, 4 Mar 2026 21:42:02 +0800
Subject: [PATCH] =?UTF-8?q?Fix:=20preserve=20field=20boundaries=20in=20chu?=
 =?UTF-8?q?nked=20documents=20from=20MySQL=E2=80=A6=20(#13369)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What problem does this PR solve?

When multiple columns are used as content columns in the RDBMS connector,
the generated document text is chunked by TxtParser, which strips
newline delimiters during merge. This causes field names and values from
different columns to be concatenated without any separator, making the
content unreadable.

Changes:
- txt_parser.py: restore the newline separator when merging adjacent text
segments within a chunk, so that split sections are not directly
concatenated
- rdbms_connector.py: use a double newline between fields and place each
field value on a new line after the bracketed field name, giving TxtParser
clearer boundaries to work with

Closes #13001

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: tunsuytang <tunsuytang@tencent.com>
---
 common/data_source/rdbms_connector.py | 8 ++++----
 deepdoc/parser/txt_parser.py          | 5 ++++-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/common/data_source/rdbms_connector.py b/common/data_source/rdbms_connector.py
index 2902041bd5..05628501c6 100644
--- a/common/data_source/rdbms_connector.py
+++ b/common/data_source/rdbms_connector.py
@@ -204,11 +204,11 @@ class RDBMSConnector(LoadConnector, PollConnector):
                 value = row_dict[col]
                 if isinstance(value, (dict, list)):
                     value = json.dumps(value, ensure_ascii=False)
-                # Use brackets around field name to ensure it's distinguishable
-                # after chunking (TxtParser strips \n delimiters during merge)
-                content_parts.append(f"【{col}】: {value}")
+                # Use brackets around field name and put value on a new line
+                # so that TxtParser preserves field boundaries after chunking.
+                content_parts.append(f"【{col}】:\n{value}")
         
-        content = "\n".join(content_parts)
+        content = "\n\n".join(content_parts)
         
         if self.id_column and self.id_column in row_dict:
             doc_id = f"{self.db_type}:{self.database}:{row_dict[self.id_column]}"
diff --git a/deepdoc/parser/txt_parser.py b/deepdoc/parser/txt_parser.py
index 64e200cbc6..6abf8591da 100644
--- a/deepdoc/parser/txt_parser.py
+++ b/deepdoc/parser/txt_parser.py
@@ -40,7 +40,10 @@ class RAGFlowTxtParser:
                 cks.append(t)
                 tk_nums.append(tnum)
             else:
-                cks[-1] += t
+                if cks[-1]:
+                    cks[-1] += "\n" + t
+                else:
+                    cks[-1] += t
                 tk_nums[-1] += tnum
 
         dels = []