From b42414b64a49cad9718da46d258573cd6d389db0 Mon Sep 17 00:00:00 2001 From: Harsh Kashyap Date: Wed, 1 Jul 2026 12:15:22 +0530 Subject: [PATCH] fix(deepdoc): parse bodyless HTML fragments (#16423) --- deepdoc/parser/html_parser.py | 5 ++++- test/unit_test/deepdoc/parser/test_html_parser.py | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/deepdoc/parser/html_parser.py b/deepdoc/parser/html_parser.py index f8524abf36..c28fa99607 100644 --- a/deepdoc/parser/html_parser.py +++ b/deepdoc/parser/html_parser.py @@ -70,7 +70,10 @@ class RAGFlowHtmlParser: for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): comment.extract() - cls.read_text_recursively(soup.body, temp_sections, chunk_token_num=chunk_token_num) + root = soup.body or soup + if soup.body is None: + logging.debug("html_parser: parsing HTML fragment without <body>; falling back to soup root") + cls.read_text_recursively(root, temp_sections, chunk_token_num=chunk_token_num) block_txt_list, table_list = cls.merge_block_text(temp_sections) sections = cls.chunk_block(block_txt_list, chunk_token_num=chunk_token_num) for table in table_list: diff --git a/test/unit_test/deepdoc/parser/test_html_parser.py b/test/unit_test/deepdoc/parser/test_html_parser.py index 02aff9c67b..e3a135a845 100644 --- a/test/unit_test/deepdoc/parser/test_html_parser.py +++ b/test/unit_test/deepdoc/parser/test_html_parser.py @@ -140,3 +140,11 @@ def test_small_blocks_are_merged_unchanged(): assert "Alpha Beta" in "".join(chunks) assert "Gamma" in "".join(chunks) + + +def test_parser_txt_extracts_bodyless_html_fragment(): + chunks = RAGFlowHtmlParser.parser_txt("

<h1>Title</h1><p>Fragment text</p>

", chunk_token_num=512) + + joined = "\n".join(chunks) + assert "# Title" in joined + assert "Fragment text" in joined