Feat: support doc for pipeline parser in word (#14005)

### What problem does this PR solve?

Feat: support doc for pipeline parser in word

### Type of change

- [x] New Feature (non-breaking change which adds functionality)



<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

* **New Features**
* Added support for processing legacy Word `.doc` file formats,
extending document compatibility.

* **Bug Fixes**
* Enhanced error handling during document parsing to improve reliability
and prevent processing failures.
This commit is contained in:
Magicbook1108
2026-04-09 16:40:42 +08:00
committed by GitHub
parent 8d52ef2893
commit 107fe6cf90

View File

@@ -741,6 +741,41 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
conf = self._param.setups["word"]
self.set_output("output_format", conf["output_format"])
if re.search(r"\.doc$", name, re.IGNORECASE):
self.set_output("file", {**kwargs.get("file", {}), "outlines": []})
try:
from tika import parser as tika_parser
except Exception as e:
msg = f"tika not available: {e}. Unsupported .doc parsing."
self.callback(0.8, msg)
logging.warning(f"{msg} for {name}.")
return
doc_parsed = tika_parser.from_buffer(io.BytesIO(blob))
content = doc_parsed.get("content")
if content is None:
msg = f"tika.parser got empty content from {name}."
self.callback(0.8, msg)
logging.warning(msg)
return
sections = [line.strip() for line in content.splitlines() if line and line.strip()]
if conf.get("remove_toc"):
sections = remove_toc_word(sections, outlines)
if conf.get("output_format") == "json":
self.set_output(
"json",
[{"text": line, "image": None, "doc_type_kwd": "text"} for line in sections],
)
elif conf.get("output_format") == "markdown":
# Tika gives us plain text lines, so join with blank lines to preserve paragraph boundaries in markdown.
self.set_output("markdown", "\n\n".join(sections))
self.callback(0.8, "Finish parsing.")
return
docx_parser = Docx()
# Extract heading-based outlines for metadata and TOC removal.
@@ -784,7 +819,7 @@ class Parser(ProcessBase):
markdown_text = docx_parser.to_markdown(name, binary=blob)
if conf.get("remove_toc"):
markdown_text = "\n".join(remove_toc_word(markdown_text.split("\n"), outlines))
self.set_output("markdown", markdown_text)
def _slides(self, name, blob, **kwargs):