Feat: add button to turn off vlm parsing (#14125)

### What problem does this PR solve?

Feat: add button to turn off vlm parsing

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: chanx <1243304602@qq.com>
2026-07-04 18:45:38 +08:00 · 2026-04-15 19:06:00 +08:00
parent dce0b1c030
commit 944a90d645
12 changed files with 120 additions and 31 deletions
--- a/rag/flow/chunker/title_chunker/common.py
+++ b/rag/flow/chunker/title_chunker/common.py
@@ -234,10 +234,6 @@ class BaseTitleChunker(ABC):
        return self.resolve_outline_levels(line_records) or self.resolve_frequency_levels(line_records)


-    def resolve_manual_levels(self, line_records):
-        return self.resolve_title_levels(line_records)["levels"]
-
-
    def build_chunks_from_record_groups(self, record_groups):
        # Strategy code decides record grouping. This method materializes each
        # group into the output chunk representation. For PDF-like inputs, the
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -110,6 +110,7 @@ class ParserParam(ProcessParamBase):
            "pdf": {
                "parse_method": "deepdoc",  # deepdoc/plain_text/tcadp_parser/vlm
                "lang": "Chinese",
+                "flatten_media_to_text": False,
                "remove_toc": False,
                "suffix": [
                    "pdf",
@@ -118,6 +119,7 @@ class ParserParam(ProcessParamBase):
            },
            "spreadsheet": {
                "parse_method": "deepdoc",  # deepdoc/tcadp_parser
+                "flatten_media_to_text": False,
                "output_format": "html",
                "suffix": [
                    "xls",
@@ -133,6 +135,7 @@ class ParserParam(ProcessParamBase):
                "output_format": "json",
            },
            "docx": {
+                "flatten_media_to_text": False,
                "remove_toc": False,
                "suffix": [
                    "docx",
@@ -140,6 +143,7 @@ class ParserParam(ProcessParamBase):
                "output_format": "json",
            },
            "markdown": {
+                "flatten_media_to_text": False,
                "suffix": ["md", "markdown", "mdx"],
                "remove_toc": False,
                "output_format": "json",
@@ -312,6 +316,7 @@ class Parser(ProcessBase):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
        conf = self._param.setups["pdf"]
        self.set_output("output_format", conf["output_format"])
+        flatten_media_to_text = conf.get("flatten_media_to_text")
        pdf_parser = None

        # Optional PDF post-processing flags applied after parsing.
@@ -571,7 +576,9 @@ class Parser(ProcessBase):
                layout_counters[layout] = seq + 1
                b["layoutno"] = f"{layout}-{seq}"

-            if layout == "table":
+            if flatten_media_to_text:
+                b["doc_type_kwd"] = "text"
+            elif layout == "table":
                b["doc_type_kwd"] = "table"
            elif layout == "figure":
                b["doc_type_kwd"] = "image"
@@ -668,6 +675,7 @@ class Parser(ProcessBase):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
        conf = self._param.setups["spreadsheet"]
        self.set_output("output_format", conf["output_format"])
+        flatten_media_to_text = conf.get("flatten_media_to_text")

        parse_method = conf.get("parse_method", "deepdoc")

@@ -723,7 +731,12 @@ class Parser(ProcessBase):
                # Add tables as text
                for table in tables:
                    if table:
-                        result.append({"text": table, "doc_type_kwd": "table"})
+                        result.append(
+                            {
+                                "text": table,
+                                "doc_type_kwd": "text" if flatten_media_to_text else "table",
+                            }
+                        )

                self.set_output("json", result)

@@ -771,6 +784,7 @@ class Parser(ProcessBase):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOCX document")
        conf = self._param.setups["docx"]
        self.set_output("output_format", conf["output_format"])
+        flatten_media_to_text = conf.get("flatten_media_to_text")
        
        if re.search(r"\.doc$", name, re.IGNORECASE):
            self.set_output("file", {**kwargs.get("file", {}), "outlines": []})
@@ -823,7 +837,7 @@ class Parser(ProcessBase):
                    {
                        "text": text,
                        "image": image,
-                        "doc_type_kwd": "image" if image is not None else "text",
+                        "doc_type_kwd": "text" if flatten_media_to_text or image is None else "image",
                    }
                )
                if html:
@@ -831,7 +845,7 @@ class Parser(ProcessBase):
                        {
                            "text": html,
                            "image": None,
-                            "doc_type_kwd": "table",
+                            "doc_type_kwd": "text" if flatten_media_to_text else "table",
                        }
                    )
            enhance_media_sections_with_vision(
@@ -927,6 +941,7 @@ class Parser(ProcessBase):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
        conf = self._param.setups["markdown"]
        self.set_output("output_format", conf["output_format"])
+        flatten_media_to_text = conf.get("flatten_media_to_text")

        markdown_parser = naive_markdown_parser()
        sections, tables, section_images = markdown_parser(
@@ -952,7 +967,11 @@ class Parser(ProcessBase):
                    # If multiple images found, combine them using concat_img
                    combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
                    json_result["image"] = combined_image
-                json_result["doc_type_kwd"] = "image" if json_result.get("image") is not None else "text"
+                json_result["doc_type_kwd"] = (
+                    "text"
+                    if flatten_media_to_text or json_result.get("image") is None
+                    else "image"
+                )
                json_results.append(json_result)

            for table in tables:
@@ -961,7 +980,7 @@ class Parser(ProcessBase):
                    json_results.append(
                        {
                            "text": table_text,
-                            "doc_type_kwd": "table",
+                            "doc_type_kwd": "text" if flatten_media_to_text else "table",
                        }
                    )

--- a/web/src/locales/en.ts
+++ b/web/src/locales/en.ts
@@ -1575,6 +1575,9 @@ Best for: Documents with flowing, contextually connected content — such as boo
      oneChunkTitle: 'Note',
      oneChunkDescription:
        'All parsed sections will be merged in order into a single chunk.',
+      flattenMediaToText: 'Disable vision model',
+      flattenMediaToTextTip:
+        'Treat image and table sections as plain text and skip vision enhancement.',
      enableChildrenDelimiters: 'Child chunk are used for retrieval',
      merge: 'Merge',
      split: 'Split',
--- a/web/src/locales/zh.ts
+++ b/web/src/locales/zh.ts
@@ -1324,6 +1324,8 @@ General：实体和关系提取提示来自 GitHub - microsoft/graphrag：基于
      oneChunkTitle: 'Note',
      oneChunkDescription:
        '所有解析后的 sections 会按原始顺序合并为 1 个 chunk。',
+      flattenMediaToText: '禁用视觉模型',
+      flattenMediaToTextTip: '将图片和表格区块按普通文本处理，并跳过视觉增强。',
      merge: '合并',
      split: '拆分',
      script: '脚本',
--- a/web/src/pages/agent/constant/pipeline.tsx
+++ b/web/src/pages/agent/constant/pipeline.tsx
@@ -197,12 +197,14 @@ export const initialParserValues = {
      output_format: PdfOutputFormat.Json,
      parse_method: ParseDocumentType.DeepDOC,
      preprocess: PreprocessValue.main_content,
+      flatten_media_to_text: false,
    },
    {
      fileFormat: FileType.Spreadsheet,
      output_format: SpreadsheetOutputFormat.Html,
      parse_method: ParseDocumentType.DeepDOC,
      preprocess: PreprocessValue.main_content,
+      flatten_media_to_text: false,
    },
    {
      fileFormat: FileType.Image,
@@ -221,6 +223,7 @@ export const initialParserValues = {
      fileFormat: FileType.TextMarkdown,
      output_format: TextMarkdownOutputFormat.Text,
      preprocess: PreprocessValue.main_content,
+      flatten_media_to_text: false,
    },
    {
      fileFormat: FileType.Code,
@@ -241,6 +244,7 @@ export const initialParserValues = {
      fileFormat: FileType.Docx,
      output_format: DocxOutputFormat.Json,
      preprocess: PreprocessValue.main_content,
+      flatten_media_to_text: false,
    },
    {
      fileFormat: FileType.PowerPoint,
--- a/web/src/pages/agent/form/parser-form/common-form-fields.tsx
+++ b/web/src/pages/agent/form/parser-form/common-form-fields.tsx
@@ -88,6 +88,29 @@ export function LargeModelFormField({
  );
 }

+export function FlattenMediaToTextFormField({ prefix }: CommonProps) {
+  const { t } = useTranslation();
+  return (
+    <RAGFlowFormItem
+      name={buildFieldNameWithPrefix(`flatten_media_to_text`, prefix)}
+      label={t('flow.flattenMediaToText')}
+      tooltip={t('flow.flattenMediaToTextTip')}
+      horizontal={true}
+      labelClassName="w-full"
+      valueClassName="w-8"
+    >
+      {(field) => (
+        <Switch
+          checked={field.value}
+          onCheckedChange={(checked) => {
+            field.onChange?.(checked);
+          }}
+        />
+      )}
+    </RAGFlowFormItem>
+  );
+}
+
 export function TwoColumnCheckFormField({ prefix }: CommonProps) {
  const { t } = useTranslation();
  return (
--- a/web/src/pages/agent/form/parser-form/index.tsx
+++ b/web/src/pages/agent/form/parser-form/index.tsx
@@ -154,6 +154,7 @@ export const FormSchema = z.object({
      lang: z.string().optional(),
      fields: z.array(z.string()).optional(),
      vlm: z.object({ llm_id: z.string().optional() }).optional(),
+      flatten_media_to_text: z.boolean().optional(),
      system_prompt: z.string().optional(),
      table_result_type: z.string().optional(),
      markdown_image_response_type: z.string().optional(),
--- a/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx
+++ b/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx
@@ -11,6 +11,7 @@ import { useEffect, useMemo } from 'react';
 import { useFormContext, useWatch } from 'react-hook-form';
 import { useTranslation } from 'react-i18next';
 import {
+  FlattenMediaToTextFormField,
  LanguageFormField,
  LargeModelFormField,
  ParserMethodFormField,
@@ -42,6 +43,9 @@ export function PdfFormFields({ prefix }: CommonProps) {
  const parseMethod = useWatch({
    name: parseMethodName,
  });
+  const flattenMediaToText = useWatch({
+    name: buildFieldNameWithPrefix('flatten_media_to_text', prefix),
+  });

  const languageShown = useMemo(() => {
    return (
@@ -101,11 +105,13 @@ export function PdfFormFields({ prefix }: CommonProps) {
      <TwoColumnCheckFormField prefix={prefix} />
      <RmdirFormField prefix={prefix} />
      <ParserMethodFormField prefix={prefix}></ParserMethodFormField>
-
-      <LargeModelFormField
-        prefix={prefix}
-        options={modelOptions}
-      ></LargeModelFormField>
+      <FlattenMediaToTextFormField prefix={prefix} />
+      {!flattenMediaToText && (
+        <LargeModelFormField
+          prefix={prefix}
+          options={modelOptions}
+        ></LargeModelFormField>
+      )}
      {languageShown && <LanguageFormField prefix={prefix}></LanguageFormField>}
      {tcadpOptionsShown && (
        <>
--- a/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx
+++ b/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx
@@ -11,6 +11,7 @@ import { useEffect, useMemo } from 'react';
 import { useFormContext, useWatch } from 'react-hook-form';
 import { useTranslation } from 'react-i18next';
 import {
+  FlattenMediaToTextFormField,
  LargeModelFormField,
  ParserMethodFormField,
 } from './common-form-fields';
@@ -39,6 +40,9 @@ export function SpreadsheetFormFields({ prefix }: CommonProps) {
  const parseMethod = useWatch({
    name: parseMethodName,
  });
+  const flattenMediaToText = useWatch({
+    name: buildFieldNameWithPrefix('flatten_media_to_text', prefix),
+  });

  // Spreadsheet only supports DeepDOC and TCADPParser
  const optionsWithoutLLM = [
@@ -97,10 +101,13 @@ export function SpreadsheetFormFields({ prefix }: CommonProps) {
        prefix={prefix}
        optionsWithoutLLM={optionsWithoutLLM}
      ></ParserMethodFormField>
-      <LargeModelFormField
-        prefix={prefix}
-        options={modelOptions}
-      ></LargeModelFormField>
+      <FlattenMediaToTextFormField prefix={prefix} />
+      {!flattenMediaToText && (
+        <LargeModelFormField
+          prefix={prefix}
+          options={modelOptions}
+        ></LargeModelFormField>
+      )}
      {tcadpOptionsShown && (
        <>
          <RAGFlowFormItem
--- a/web/src/pages/agent/form/parser-form/text-html-form-fields.tsx
+++ b/web/src/pages/agent/form/parser-form/text-html-form-fields.tsx
@@ -1,20 +1,32 @@
 import { LlmModelType } from '@/constants/knowledge';
 import { useComposeLlmOptionsByModelTypes } from '@/hooks/use-llm-request';
-import { LargeModelFormField, RmdirFormField } from './common-form-fields';
+import { useWatch } from 'react-hook-form';
+import {
+  FlattenMediaToTextFormField,
+  LargeModelFormField,
+  RmdirFormField,
+} from './common-form-fields';
 import { CommonProps } from './interface';
+import { buildFieldNameWithPrefix } from './utils';

 export function TextMarkdownFormFields({ prefix }: CommonProps) {
  const modelOptions = useComposeLlmOptionsByModelTypes([
    LlmModelType.Image2text,
  ]);
+  const flattenMediaToText = useWatch({
+    name: buildFieldNameWithPrefix('flatten_media_to_text', prefix),
+  });

  return (
    <>
      <RmdirFormField prefix={prefix} />
-      <LargeModelFormField
-        prefix={prefix}
-        options={modelOptions}
-      ></LargeModelFormField>
+      <FlattenMediaToTextFormField prefix={prefix} />
+      {!flattenMediaToText && (
+        <LargeModelFormField
+          prefix={prefix}
+          options={modelOptions}
+        ></LargeModelFormField>
+      )}
    </>
  );
 }
--- a/web/src/pages/agent/form/parser-form/word-form-fields.tsx
+++ b/web/src/pages/agent/form/parser-form/word-form-fields.tsx
@@ -1,24 +1,32 @@
 import { LlmModelType } from '@/constants/knowledge';
 import { useComposeLlmOptionsByModelTypes } from '@/hooks/use-llm-request';
+import { useWatch } from 'react-hook-form';
 import {
+  FlattenMediaToTextFormField,
  LargeModelFormField,
  OutputFormatFormFieldProps,
  RmdirFormField,
 } from './common-form-fields';
+import { buildFieldNameWithPrefix } from './utils';

 export function WordFormFields({ prefix }: OutputFormatFormFieldProps) {
  const modelOptions = useComposeLlmOptionsByModelTypes([
    LlmModelType.Image2text,
  ]);
+  const flattenMediaToText = useWatch({
+    name: buildFieldNameWithPrefix('flatten_media_to_text', prefix),
+  });

  return (
    <>
      <RmdirFormField prefix={prefix} />
-      {/* Multimodal Model */}
-      <LargeModelFormField
-        prefix={prefix}
-        options={modelOptions}
-      ></LargeModelFormField>
+      <FlattenMediaToTextFormField prefix={prefix} />
+      {!flattenMediaToText && (
+        <LargeModelFormField
+          prefix={prefix}
+          options={modelOptions}
+        ></LargeModelFormField>
+      )}
    </>
  );
 }
--- a/web/src/pages/agent/utils.ts
+++ b/web/src/pages/agent/utils.ts
@@ -228,6 +228,7 @@ function transformParserParams(params: ParserFormSchemaType) {
            parse_method: cur.parse_method,
            lang: cur.lang,
            vlm: { llm_id: cur.vlm?.llm_id },
+            flatten_media_to_text: cur.flatten_media_to_text,
            enable_multi_column: cur.enable_multi_column,
            remove_toc: cur.remove_toc,
          };
@@ -243,6 +244,7 @@ function transformParserParams(params: ParserFormSchemaType) {
            ...filteredSetup,
            parse_method: cur.parse_method,
            vlm: { llm_id: cur.vlm?.llm_id },
+            flatten_media_to_text: cur.flatten_media_to_text,
          };
          // Only include TCADP parameters if TCADP Parser is selected
          if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
@@ -277,10 +279,16 @@ function transformParserParams(params: ParserFormSchemaType) {
            fields: cur.fields,
          };
          break;
-        case FileType.Video:
        case FileType.Docx:
-        case FileType.Audio:
        case FileType.TextMarkdown:
+          filteredSetup = {
+            ...filteredSetup,
+            vlm: { llm_id: cur.vlm?.llm_id },
+            flatten_media_to_text: cur.flatten_media_to_text,
+          };
+          break;
+        case FileType.Video:
+        case FileType.Audio:
          filteredSetup = {
            ...filteredSetup,
            vlm: { llm_id: cur.vlm?.llm_id },