Refact: refact on parser structure (#14012)

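### What problem does this PR solve?

Refact: refactor the parser structure. The `word` parser is split into separate `doc` and `docx` parsers, `txt` files move from the markdown parser to the `text&code` parser, and the code and HTML parsers now default to JSON output.

### Type of change

- [x] Refactoring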
2026-07-01 00:05:43 +08:00 · 2026-04-10 10:03:44 +08:00
parent cd04467b9b
commit 27329b40ed
6 changed files with 110 additions and 49 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -891,6 +891,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
        callback(0.1, "Start to parse.")
        sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。；！？"))
        sections = _normalize_section_text_for_rtl_presentation_forms(sections)
+        print("\n", "-"*150, "\n")
+        print(sections)
+        print("\n", "-"*150, "\n")
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE):
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -66,7 +66,11 @@ class ParserParam(ProcessParamBase):
                "markdown",
                "html",
            ],
-            "word": [
+            "doc": [
+                "json",
+                "markdown",
+            ],
+            "docx": [
                "json",
                "markdown",
            ],
@@ -80,11 +84,11 @@ class ParserParam(ProcessParamBase):
                "text",
                "json",
            ],
-            "text&markdown": [
+            "markdown": [
                "text",
                "json",
            ],
-            "code": [
+            "text&code": [
                "text",
                "json",
            ],
@@ -121,21 +125,28 @@ class ParserParam(ProcessParamBase):
                    "csv",
                ],
            },
-            "word": {
+            "doc": {
                "remove_toc": False,
                "suffix": [
                    "doc",
+                ],
+                "output_format": "json",
+            },
+            "docx": {
+                "remove_toc": False,
+                "suffix": [
                    "docx",
                ],
                "output_format": "json",
            },
-            "text&markdown": {
-                "suffix": ["md", "markdown", "mdx", "txt"],
+            "markdown": {
+                "suffix": ["md", "markdown", "mdx"],
                "remove_toc": False,
                "output_format": "json",
            },
-            "code": {
+            "text&code": {
                "suffix": [
+                    "txt",
                    "py",
                    "js",
                    "java",
@@ -150,12 +161,12 @@ class ParserParam(ProcessParamBase):
                    "kt",
                    "sql",
                ],
-                "output_format": "text",
+                "output_format": "json",
            },
            "html": {
                "suffix": ["htm", "html"],
                "remove_toc": "false",
-                "output_format": "text",
+                "output_format": "json",
            },
            "slides": {
                "parse_method": "deepdoc",  # deepdoc/tcadp_parser
@@ -235,10 +246,15 @@ class ParserParam(ProcessParamBase):
            spreadsheet_output_format = spreadsheet_config.get("output_format", "")
            self.check_valid_value(spreadsheet_output_format, "Spreadsheet output format abnormal.", self.allowed_output_format["spreadsheet"])

-        doc_config = self.setups.get("word", "")
+        doc_config = self.setups.get("doc", "")
        if doc_config:
            doc_output_format = doc_config.get("output_format", "")
-            self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["word"])
+            self.check_valid_value(doc_output_format, "DOC output format abnormal.", self.allowed_output_format["doc"])
+
+        docx_config = self.setups.get("docx", "")
+        if docx_config:
+            docx_output_format = docx_config.get("output_format", "")
+            self.check_valid_value(docx_output_format, "DOCX output format abnormal.", self.allowed_output_format["docx"])

        slides_config = self.setups.get("slides", "")
        if slides_config:
@@ -251,15 +267,15 @@ class ParserParam(ProcessParamBase):
            if image_parse_method not in ["ocr"]:
                self.check_empty(image_config.get("lang", ""), "Image VLM language")

-        text_config = self.setups.get("text&markdown", "")
+        text_config = self.setups.get("markdown", "")
        if text_config:
            text_output_format = text_config.get("output_format", "")
-            self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text&markdown"])
+            self.check_valid_value(text_output_format, "Markdown output format abnormal.", self.allowed_output_format["markdown"])

-        code_config = self.setups.get("code", "")
+        code_config = self.setups.get("text&code", "")
        if code_config:
            code_output_format = code_config.get("output_format", "")
-            self.check_valid_value(code_output_format, "Code output format abnormal.", self.allowed_output_format["code"])
+            self.check_valid_value(code_output_format, "Text&Code output format abnormal.", self.allowed_output_format["text&code"])

        html_config = self.setups.get("html", "")
        if html_config:
@@ -733,10 +749,27 @@ class Parser(ProcessBase):
            elif conf.get("output_format") == "markdown":
                self.set_output("markdown", spreadsheet_parser.markdown(blob))

-    def _word(self, name, blob, **kwargs):
-        """Parse doc/docx files and optionally remove table-of-contents content."""
-        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
-        conf = self._param.setups["word"]
+    def _doc(self, name, blob, **kwargs):
+        """Parse DOC files into text/json sections."""
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOC document")
+        conf = self._param.setups["doc"]
+        self.set_output("output_format", conf["output_format"])
+
+        from tika import parser as tika_parser
+
+        parsed = tika_parser.from_buffer(io.BytesIO(blob))
+        sections = [line for line in parsed["content"].split("\n") if line]
+
+        if conf.get("output_format") == "json":
+            self.set_output("json", [{"text": section, "doc_type_kwd": "text"} for section in sections])
+            return
+
+        self.set_output("markdown", "\n".join(sections))
+
+    def _docx(self, name, blob, **kwargs):
+        """Parse DOCX files and optionally remove table-of-contents content."""
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOCX document")
+        conf = self._param.setups["docx"]
        self.set_output("output_format", conf["output_format"])
        
        if re.search(r"\.doc$", name, re.IGNORECASE):
@@ -885,14 +918,14 @@ class Parser(ProcessBase):
                self.set_output("json", sections)

    def _markdown(self, name, blob, **kwargs):
-        """Parse markdown and txt files into text/json sections."""
+        """Parse markdown files into text/json sections."""
        from functools import reduce

        from rag.app.naive import Markdown as naive_markdown_parser
        from rag.nlp import concat_img

        self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
-        conf = self._param.setups["text&markdown"]
+        conf = self._param.setups["markdown"]
        self.set_output("output_format", conf["output_format"])

        markdown_parser = naive_markdown_parser()
@@ -903,11 +936,6 @@ class Parser(ProcessBase):
            delimiter=conf.get("delimiter"),
            return_section_images=True,
        )
-        if name.lower().endswith(".txt") and conf.get("remove_toc") == "true":
-            sections, kept_indices = remove_toc(sections)
-            if section_images:
-                section_images = [section_images[i] for i in kept_indices if i < len(section_images)]
-
        if conf.get("output_format") == "json":
            json_results = []

@@ -937,11 +965,15 @@ class Parser(ProcessBase):
            self.set_output("text", "\n".join([section_text for section_text, _ in sections]))

    def _code(self, name, blob, **kwargs):
-        """Parse source code files as plain text chunks."""
-        self.callback(random.randint(1, 5) / 100.0, "Start to work on a code or plain text file.")
-        conf = self._param.setups["code"]
+        """Parse text and source code files as plain text chunks."""
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a text or code file.")
+        conf = self._param.setups["text&code"]
        self.set_output("output_format", conf["output_format"])

+        print("\n\n")
+        print(conf.get("output_format"))
+        print("\n\n")
+        
        sections = TxtParser()(
            name,
            blob,
@@ -952,6 +984,10 @@ class Parser(ProcessBase):
            self.set_output("json", [{"text": section[0], "doc_type_kwd": "text"} for section in sections if section[0]])
            return

+        print("\n", "-"*150, "\n")
+        print(sections)
+        print("\n", "-"*150, "\n")
+
        self.set_output("text", "\n".join([section[0] for section in sections if section[0]]))

    def _html(self, name, blob, **kwargs):
@@ -1199,12 +1235,13 @@ class Parser(ProcessBase):
        """Dispatch the current file to the matching parser branch by suffix."""
        function_map = {
            "pdf": self._pdf,
-            "text&markdown": self._markdown,
-            "code": self._code,
+            "markdown": self._markdown,
+            "text&code": self._code,
            "html": self._html,
            "spreadsheet": self._spreadsheet,
            "slides": self._slides,
-            "word": self._word,
+            "doc": self._doc,
+            "docx": self._docx,
            "image": self._image,
            "audio": self._audio,
            "video": self._video,
--- a/web/src/locales/en.ts
+++ b/web/src/locales/en.ts
@@ -2251,10 +2251,11 @@ This process aggregates variables from multiple branches into a single variable
        spreadsheet: 'Spreadsheet',
        image: 'Image',
        email: 'Email',
-        'text&markdown': 'Text & Markup',
-        code: 'Code',
+        markdown: 'Markdown',
+        'text&code': 'Text & Code',
        html: 'HTML',
-        word: 'Word',
+        doc: 'DOC',
+        docx: 'DOCX',
        slides: 'PPTX',
        audio: 'Audio',
        video: 'Video',
--- a/web/src/locales/zh.ts
+++ b/web/src/locales/zh.ts
@@ -1950,10 +1950,11 @@ General：实体和关系提取提示来自 GitHub - microsoft/graphrag：基于
        spreadsheet: '表格',
        image: '图片',
        email: '邮件',
-        'text&markdown': '文本与标记',
-        code: '代码',
+        markdown: 'Markdown',
+        'text&code': '文本与代码',
        html: 'HTML',
-        word: 'Word',
+        doc: 'DOC',
+        docx: 'DOCX',
        slides: 'PPTX',
        audio: '音频',
        video: '视频',
--- a/web/src/pages/agent/constant/pipeline.tsx
+++ b/web/src/pages/agent/constant/pipeline.tsx
@@ -9,10 +9,11 @@ export enum FileType {
  Spreadsheet = 'spreadsheet',
  Image = 'image',
  Email = 'email',
-  TextMarkdown = 'text&markdown',
-  Code = 'code',
+  TextMarkdown = 'markdown',
+  Code = 'text&code',
  Html = 'html',
-  Docx = 'word',
+  Doc = 'doc',
+  Docx = 'docx',
  PowerPoint = 'slides',
  Video = 'video',
  Audio = 'audio',
@@ -41,6 +42,11 @@ export enum TextMarkdownOutputFormat {
  Text = 'text',
 }

+export enum TextJsonOutputFormat {
+  Text = 'text',
+  Json = 'json',
+}
+
 export enum DocxOutputFormat {
  Markdown = 'markdown',
  Json = 'json',
@@ -64,8 +70,9 @@ export const OutputFormatMap = {
  [FileType.Image]: ImageOutputFormat,
  [FileType.Email]: EmailOutputFormat,
  [FileType.TextMarkdown]: TextMarkdownOutputFormat,
-  [FileType.Code]: TextMarkdownOutputFormat,
-  [FileType.Html]: TextMarkdownOutputFormat,
+  [FileType.Code]: TextJsonOutputFormat,
+  [FileType.Html]: TextJsonOutputFormat,
+  [FileType.Doc]: DocxOutputFormat,
  [FileType.Docx]: DocxOutputFormat,
  [FileType.PowerPoint]: PptOutputFormat,
  [FileType.Video]: VideoOutputFormat,
@@ -78,8 +85,9 @@ export const InitialOutputFormatMap = {
  [FileType.Image]: ImageOutputFormat.Text,
  [FileType.Email]: EmailOutputFormat.Text,
  [FileType.TextMarkdown]: TextMarkdownOutputFormat.Text,
-  [FileType.Code]: TextMarkdownOutputFormat.Text,
-  [FileType.Html]: TextMarkdownOutputFormat.Text,
+  [FileType.Code]: TextJsonOutputFormat.Json,
+  [FileType.Html]: TextJsonOutputFormat.Json,
+  [FileType.Doc]: DocxOutputFormat.Json,
  [FileType.Docx]: DocxOutputFormat.Json,
  [FileType.PowerPoint]: PptOutputFormat.Json,
  [FileType.Video]: VideoOutputFormat.Text,
@@ -216,12 +224,17 @@ export const initialParserValues = {
    },
    {
      fileFormat: FileType.Code,
-      output_format: TextMarkdownOutputFormat.Text,
+      output_format: TextJsonOutputFormat.Json,
      preprocess: PreprocessValue.main_content,
    },
    {
      fileFormat: FileType.Html,
-      output_format: TextMarkdownOutputFormat.Text,
+      output_format: TextJsonOutputFormat.Json,
+      preprocess: PreprocessValue.main_content,
+    },
+    {
+      fileFormat: FileType.Doc,
+      output_format: DocxOutputFormat.Json,
      preprocess: PreprocessValue.main_content,
    },
    {
@@ -340,8 +353,9 @@ export const FileTypeSuffixMap = {
  [FileType.Spreadsheet]: ['xls', 'xlsx', 'csv'],
  [FileType.Image]: ['jpg', 'jpeg', 'png', 'gif'],
  [FileType.Email]: ['eml', 'msg'],
-  [FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'],
+  [FileType.TextMarkdown]: ['md', 'markdown', 'mdx'],
  [FileType.Code]: [
+    'txt',
    'py',
    'js',
    'java',
@@ -357,7 +371,8 @@ export const FileTypeSuffixMap = {
    'sql',
  ],
  [FileType.Html]: ['htm', 'html'],
-  [FileType.Docx]: ['doc', 'docx'],
+  [FileType.Doc]: ['doc'],
+  [FileType.Docx]: ['docx'],
  [FileType.PowerPoint]: ['pptx', 'ppt'],
  [FileType.Video]: ['mp4', 'avi', 'mkv'],
  [FileType.Audio]: [
--- a/web/src/pages/agent/form/parser-form/index.tsx
+++ b/web/src/pages/agent/form/parser-form/index.tsx
@@ -82,6 +82,10 @@ const PreprocessOptionConfigsMap: Partial<
    { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
    { value: PreprocessValue.section_title },
  ],
+  [FileType.Doc]: [
+    { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
+    { value: PreprocessValue.section_title },
+  ],
  [FileType.Docx]: [
    { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
    { value: PreprocessValue.section_title },