Feat: support legacy .doc files in the pipeline parser's Word handler (#14005)

### What problem does this PR solve?

Feat: support legacy Word `.doc` files in the pipeline parser's Word handler.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

## Summary by CodeRabbit

* **New Features**
  * Added support for processing legacy Word `.doc` file formats, extending document compatibility.
* **Bug Fixes**
  * Enhanced error handling during document parsing to improve reliability and prevent processing failures.
2026-07-04 18:45:38 +08:00 · 2026-04-09 16:40:42 +08:00
parent 8d52ef2893
commit 107fe6cf90
1 changed files with 36 additions and 1 deletions
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -741,6 +741,41 @@ class Parser(ProcessBase):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
        conf = self._param.setups["word"]
        self.set_output("output_format", conf["output_format"])
+        
+        if re.search(r"\.doc$", name, re.IGNORECASE):
+            self.set_output("file", {**kwargs.get("file", {}), "outlines": []})
+            try:
+                from tika import parser as tika_parser
+            except Exception as e:
+                msg = f"tika not available: {e}. Unsupported .doc parsing."
+                self.callback(0.8, msg)
+                logging.warning(f"{msg} for {name}.")
+                return
+
+            doc_parsed = tika_parser.from_buffer(io.BytesIO(blob))
+            content = doc_parsed.get("content")
+            if content is None:
+                msg = f"tika.parser got empty content from {name}."
+                self.callback(0.8, msg)
+                logging.warning(msg)
+                return
+
+            sections = [line.strip() for line in content.splitlines() if line and line.strip()]
+            if conf.get("remove_toc"):
+                sections = remove_toc_word(sections, outlines)
+
+            if conf.get("output_format") == "json":
+                self.set_output(
+                    "json",
+                    [{"text": line, "image": None, "doc_type_kwd": "text"} for line in sections],
+                )
+            elif conf.get("output_format") == "markdown":
+                # Tika gives us plain text lines, so join with blank lines to preserve paragraph boundaries in markdown.
+                self.set_output("markdown", "\n\n".join(sections))
+
+            self.callback(0.8, "Finish parsing.")
+            return
+
        docx_parser = Docx()

        # Extract heading-based outlines for metadata and TOC removal.
@@ -784,7 +819,7 @@ class Parser(ProcessBase):
            markdown_text = docx_parser.to_markdown(name, binary=blob)
            if conf.get("remove_toc"):
                markdown_text = "\n".join(remove_toc_word(markdown_text.split("\n"), outlines))
-                 
+
            self.set_output("markdown", markdown_text)

    def _slides(self, name, blob, **kwargs):