diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index bbce185c39..81562781d8 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -741,6 +741,41 @@ class Parser(ProcessBase): self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document") conf = self._param.setups["word"] self.set_output("output_format", conf["output_format"]) + + if re.search(r"\.doc$", name, re.IGNORECASE): + self.set_output("file", {**kwargs.get("file", {}), "outlines": []}) + try: + from tika import parser as tika_parser + except Exception as e: + msg = f"tika not available: {e}. Unsupported .doc parsing." + self.callback(0.8, msg) + logging.warning(f"{msg} for {name}.") + return + + doc_parsed = tika_parser.from_buffer(io.BytesIO(blob)) + content = doc_parsed.get("content") + if content is None: + msg = f"tika.parser got empty content from {name}." + self.callback(0.8, msg) + logging.warning(msg) + return + + sections = [line.strip() for line in content.splitlines() if line and line.strip()] + if conf.get("remove_toc"): + sections = remove_toc_word(sections, outlines) + + if conf.get("output_format") == "json": + self.set_output( + "json", + [{"text": line, "image": None, "doc_type_kwd": "text"} for line in sections], + ) + elif conf.get("output_format") == "markdown": + # Tika gives us plain text lines, so join with blank lines to preserve paragraph boundaries in markdown. + self.set_output("markdown", "\n\n".join(sections)) + + self.callback(0.8, "Finish parsing.") + return + docx_parser = Docx() # Extract heading-based outlines for metadata and TOC removal. @@ -784,7 +819,7 @@ class Parser(ProcessBase): markdown_text = docx_parser.to_markdown(name, binary=blob) if conf.get("remove_toc"): markdown_text = "\n".join(remove_toc_word(markdown_text.split("\n"), outlines)) - + self.set_output("markdown", markdown_text) def _slides(self, name, blob, **kwargs):