mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-04 01:29:35 +08:00
fix(deepdoc): parse bodyless HTML fragments (#16423)
This commit is contained in:
@@ -70,7 +70,10 @@ class RAGFlowHtmlParser:
|
||||
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
|
||||
comment.extract()
|
||||
|
||||
cls.read_text_recursively(soup.body, temp_sections, chunk_token_num=chunk_token_num)
|
||||
root = soup.body or soup
|
||||
if soup.body is None:
|
||||
logging.debug("html_parser: parsing HTML fragment without <body>; falling back to soup root")
|
||||
cls.read_text_recursively(root, temp_sections, chunk_token_num=chunk_token_num)
|
||||
block_txt_list, table_list = cls.merge_block_text(temp_sections)
|
||||
sections = cls.chunk_block(block_txt_list, chunk_token_num=chunk_token_num)
|
||||
for table in table_list:
|
||||
|
||||
@@ -140,3 +140,11 @@ def test_small_blocks_are_merged_unchanged():
|
||||
|
||||
assert "Alpha Beta" in "".join(chunks)
|
||||
assert "Gamma" in "".join(chunks)
|
||||
|
||||
|
||||
def test_parser_txt_extracts_bodyless_html_fragment():
    """Check that parser_txt handles markup that has no <body> wrapper.

    The input is a bare heading plus paragraph; the joined output is
    expected to contain the heading as "# Title" and the paragraph text.
    """
    fragment = "<h1>Title</h1><p>Fragment text</p>"
    parsed = RAGFlowHtmlParser.parser_txt(fragment, chunk_token_num=512)

    combined = "\n".join(parsed)
    for expected in ("# Title", "Fragment text"):
        assert expected in combined
|
||||
|
||||
Reference in New Issue
Block a user