fix(deepdoc): parse bodyless HTML fragments (#16423)

2026-07-04 01:29:35 +08:00 · 2026-07-01 12:15:22 +05:30
parent 9bf57600cf
commit b42414b64a
2 changed files with 12 additions and 1 deletion
--- a/deepdoc/parser/html_parser.py
+++ b/deepdoc/parser/html_parser.py
@@ -70,7 +70,10 @@ class RAGFlowHtmlParser:
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

-        cls.read_text_recursively(soup.body, temp_sections, chunk_token_num=chunk_token_num)
+        root = soup.body or soup
+        if soup.body is None:
+            logging.debug("html_parser: parsing HTML fragment without <body>; falling back to soup root")
+        cls.read_text_recursively(root, temp_sections, chunk_token_num=chunk_token_num)
        block_txt_list, table_list = cls.merge_block_text(temp_sections)
        sections = cls.chunk_block(block_txt_list, chunk_token_num=chunk_token_num)
        for table in table_list:
--- a/test/unit_test/deepdoc/parser/test_html_parser.py
+++ b/test/unit_test/deepdoc/parser/test_html_parser.py
@@ -140,3 +140,11 @@ def test_small_blocks_are_merged_unchanged():

    assert "Alpha Beta" in "".join(chunks)
    assert "Gamma" in "".join(chunks)
+
+
+def test_parser_txt_extracts_bodyless_html_fragment():
+    # A fragment with no <body> wrapper must still yield its heading and paragraph text.
+    fragment = "<h1>Title</h1><p>Fragment text</p>"
+    joined = "\n".join(RAGFlowHtmlParser.parser_txt(fragment, chunk_token_num=512))
+    assert "# Title" in joined
+    assert "Fragment text" in joined