fix(deepdoc): parse bodyless HTML fragments (#16423)

This commit is contained in:
Harsh Kashyap
2026-07-01 12:15:22 +05:30
committed by GitHub
parent 9bf57600cf
commit b42414b64a
2 changed files with 12 additions and 1 deletion

View File

@@ -70,7 +70,10 @@ class RAGFlowHtmlParser:
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
comment.extract()
cls.read_text_recursively(soup.body, temp_sections, chunk_token_num=chunk_token_num)
root = soup.body or soup
if soup.body is None:
logging.debug("html_parser: parsing HTML fragment without <body>; falling back to soup root")
cls.read_text_recursively(root, temp_sections, chunk_token_num=chunk_token_num)
block_txt_list, table_list = cls.merge_block_text(temp_sections)
sections = cls.chunk_block(block_txt_list, chunk_token_num=chunk_token_num)
for table in table_list:

View File

@@ -140,3 +140,11 @@ def test_small_blocks_are_merged_unchanged():
assert "Alpha Beta" in "".join(chunks)
assert "Gamma" in "".join(chunks)
def test_parser_txt_extracts_bodyless_html_fragment():
    """Verify parser_txt returns the content of markup that has no <body> tag.

    The input is a bare heading plus paragraph; the joined chunks must
    contain the heading as "# Title" and the paragraph text.
    """
    fragment = "<h1>Title</h1><p>Fragment text</p>"
    parsed = RAGFlowHtmlParser.parser_txt(fragment, chunk_token_num=512)
    combined = "\n".join(parsed)
    # Check the heading first, then the paragraph body.
    for expected in ("# Title", "Fragment text"):
        assert expected in combined