fix(html_parser): preserve original text when splitting oversized blocks (#16052)

### Bug

`RAGFlowHtmlParser.chunk_block()` splits an oversized block by slicing the **tokenized** string and storing the joined tokens:

```python
tks_str = rag_tokenizer.tokenize(block)
...
tokens = tks_str.split(" ")
while start < len(tokens):
    chunks.append(" ".join(tokens[start:start + chunk_token_num]))  # tokenized form, not source
```

On the default (Elasticsearch) backend `rag_tokenizer.tokenize` transforms text: it lowercases/stems Latin words and inserts spaces between CJK characters. So any text block longer than `chunk_token_num` tokens is stored as garbled, lowercased, space-segmented text instead of the source content. The small-block branch correctly stores the original `block`, so only oversized blocks are corrupted. This affects HTML and EPUB ingestion (both go through `chunk_block`), degrading retrieved chunks and the answers generated from them.

### Real tokenizer behavior (infinity-sdk 0.7.0, ES backend)

```
tokenize("Hello World FOO Bar Baz Qux Jumps") -> "hello world foo bar baz qux jump"  # lowercased + stemmed
tokenize("你好世界这是一个测试") -> "你好 世界 这是 一个 测试"  # spaces inserted
```

### Fix

Split the **original** text: break it into atoms (whitespace-delimited runs for space-separated scripts, per-character for spaceless scripts such as Chinese) and pack them into pieces of at most `chunk_token_num` tokens. This preserves the source characters and still splits scripts that have no whitespace — a plain whitespace split would leave CJK as one un-splittable chunk.

### Proof (real tokenizer, before/after)

Running the old vs new split against the real `infinity.rag_tokenizer`:

```
ENGLISH "Hello World FOO Bar Baz Qux Jumps Over Lazy Dogs" (chunk_token_num=4)
  OLD: ['hello world foo bar', 'baz qux jump over', 'lazi dog']  # lowercased + stemmed
  NEW: ['Hello World FOO Bar ', 'Baz Qux Jumps Over ', 'Lazy Dogs']  # preserved; each <= 4 tokens
  NEW preserves text exactly: True

CHINESE "你好世界这是一个测试用例需要被切分成多个块" (chunk_token_num=3)
  OLD: ['你好 世界 这是', '一个 测试用例 需要', ...]  # spurious spaces
  NEW: ['你好世', '界这是', '一个测', ...]  # preserved; each <= 3 tokens
  NEW preserves text exactly: True
```

### Tests

Added `test/unit_test/deepdoc/parser/test_html_parser.py` (English + Chinese oversized blocks, plus small-block merge). Before the fix the two oversized tests fail (English shows lowercasing, Chinese shows inserted spaces); after the fix all pass. `ruff check` clean.
2026-06-29 15:31:05 +08:00 · 2026-06-25 04:43:35 -04:00
parent edfa9be67f
commit 091417980e
2 changed files with 216 additions and 11 deletions
--- a/deepdoc/parser/html_parser.py
+++ b/deepdoc/parser/html_parser.py
@@ -16,6 +16,8 @@
 #

 from rag.nlp import find_codec, rag_tokenizer
+import logging
+import re
 import uuid
 import chardet
 from bs4 import BeautifulSoup, NavigableString, Tag, Comment
@@ -176,6 +178,74 @@ class RAGFlowHtmlParser:
            block_content.append(current_content)
        return block_content, table_info_list

+    # Characters from scripts written without spaces between words (CJK, kana,
+    # Hangul). These must be split per-character, since whitespace is not a
+    # usable word boundary for them.
+    _SPACELESS = (
+        "぀-ヿ"  # Hiragana, Katakana
+        "㐀-䶿"  # CJK Extension A
+        "一-鿿"  # CJK Unified Ideographs
+        "豈-﫿"  # CJK Compatibility Ideographs
+        "가-힯"  # Hangul syllables
+    )
+    _ATOM_RE = re.compile(r"[{s}]|[^\s{s}]+|\s+".format(s=_SPACELESS))
+
+    @classmethod
+    def _token_count(cls, text):
+        if not text:
+            return 0
+        tks_str = rag_tokenizer.tokenize(text)
+        return len(tks_str.split(" ")) if tks_str else 0
+
+    @classmethod
+    def _split_oversized_block(cls, block, chunk_token_num):
+        # Split the ORIGINAL text into pieces of at most chunk_token_num tokens,
+        # preserving the source characters. Break on whitespace for
+        # space-delimited scripts and per-character for scripts that have no
+        # spaces (e.g. Chinese), so both are split without mangling the text.
+        pieces = []
+        current = ""
+        current_tokens = 0
+        # Spaceless scripts yield many repeated single-character atoms, so cache
+        # the token count per distinct atom to avoid re-tokenizing each one.
+        token_cache = {}
+
+        def atom_token_count(atom):
+            if atom.isspace():
+                return 0
+            if atom not in token_cache:
+                token_cache[atom] = cls._token_count(atom)
+            return token_cache[atom]
+
+        for atom in cls._ATOM_RE.findall(block):
+            atom_tokens = atom_token_count(atom)
+            if current and current_tokens + atom_tokens > chunk_token_num:
+                pieces.append(current)
+                current = ""
+                current_tokens = 0
+            if atom_tokens > chunk_token_num and not atom.isspace():
+                # A single atom longer than the budget (e.g. a very long
+                # unbroken token): fall back to fixed character windows.
+                logging.debug(
+                    "html_parser: atom of %d chars exceeds chunk_token_num=%d; "
+                    "falling back to character windows",
+                    len(atom),
+                    chunk_token_num,
+                )
+                for i in range(0, len(atom), chunk_token_num):
+                    pieces.append(atom[i:i + chunk_token_num])
+                continue
+            current += atom
+            current_tokens += atom_tokens
+        if current:
+            pieces.append(current)
+        logging.debug(
+            "html_parser: split oversized block of %d chars into %d pieces",
+            len(block),
+            len(pieces),
+        )
+        return pieces
+
    @classmethod
    def chunk_block(cls, block_txt_list, chunk_token_num=512):
        chunks = []
@@ -183,20 +253,13 @@ class RAGFlowHtmlParser:
        current_token_count = 0

        for block in block_txt_list:
-            tks_str = rag_tokenizer.tokenize(block)
-            block_token_count = len(tks_str.split(" ")) if tks_str else 0
+            block_token_count = cls._token_count(block)
            if block_token_count > chunk_token_num:
                if current_block:
                    chunks.append(current_block)
-                start = 0
-                tokens = tks_str.split(" ")
-                while start < len(tokens):
-                    end = start + chunk_token_num
-                    split_tokens = tokens[start:end]
-                    chunks.append(" ".join(split_tokens))
-                    start = end
-                current_block = ""
-                current_token_count = 0
+                    current_block = ""
+                    current_token_count = 0
+                chunks.extend(cls._split_oversized_block(block, chunk_token_num))
            else:
                if current_token_count + block_token_count <= chunk_token_num:
                    current_block += ("\n" if current_block else "") + block