mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-04 18:45:38 +08:00
Feat: support doc for pipeline parser in word (#14005)
### What problem does this PR solve? Feat: support doc for pipeline parser in word ### Type of change - [x] New Feature (non-breaking change which adds functionality) <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Added support for processing legacy Word `.doc` file formats, extending document compatibility. * **Bug Fixes** * Enhanced error handling during document parsing to improve reliability and prevent processing failures.
This commit is contained in:
@@ -741,6 +741,41 @@ class Parser(ProcessBase):
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
|
||||
conf = self._param.setups["word"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
if re.search(r"\.doc$", name, re.IGNORECASE):
|
||||
self.set_output("file", {**kwargs.get("file", {}), "outlines": []})
|
||||
try:
|
||||
from tika import parser as tika_parser
|
||||
except Exception as e:
|
||||
msg = f"tika not available: {e}. Unsupported .doc parsing."
|
||||
self.callback(0.8, msg)
|
||||
logging.warning(f"{msg} for {name}.")
|
||||
return
|
||||
|
||||
doc_parsed = tika_parser.from_buffer(io.BytesIO(blob))
|
||||
content = doc_parsed.get("content")
|
||||
if content is None:
|
||||
msg = f"tika.parser got empty content from {name}."
|
||||
self.callback(0.8, msg)
|
||||
logging.warning(msg)
|
||||
return
|
||||
|
||||
sections = [line.strip() for line in content.splitlines() if line and line.strip()]
|
||||
if conf.get("remove_toc"):
|
||||
sections = remove_toc_word(sections, outlines)
|
||||
|
||||
if conf.get("output_format") == "json":
|
||||
self.set_output(
|
||||
"json",
|
||||
[{"text": line, "image": None, "doc_type_kwd": "text"} for line in sections],
|
||||
)
|
||||
elif conf.get("output_format") == "markdown":
|
||||
# Tika gives us plain text lines, so join with blank lines to preserve paragraph boundaries in markdown.
|
||||
self.set_output("markdown", "\n\n".join(sections))
|
||||
|
||||
self.callback(0.8, "Finish parsing.")
|
||||
return
|
||||
|
||||
docx_parser = Docx()
|
||||
|
||||
# Extract heading-based outlines for metadata and TOC removal.
|
||||
@@ -784,7 +819,7 @@ class Parser(ProcessBase):
|
||||
markdown_text = docx_parser.to_markdown(name, binary=blob)
|
||||
if conf.get("remove_toc"):
|
||||
markdown_text = "\n".join(remove_toc_word(markdown_text.split("\n"), outlines))
|
||||
|
||||
|
||||
self.set_output("markdown", markdown_text)
|
||||
|
||||
def _slides(self, name, blob, **kwargs):
|
||||
|
||||
Reference in New Issue
Block a user