From 944a90d645abf70b09b080438f5d10c391c65c04 Mon Sep 17 00:00:00 2001 From: Magicbook1108 Date: Wed, 15 Apr 2026 19:06:00 +0800 Subject: [PATCH] Feat: add button to turn off vlm parsing (#14125) ### What problem does this PR solve? Feat: add button to turn off vlm parsing ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: chanx <1243304602@qq.com> --- rag/flow/chunker/title_chunker/common.py | 4 --- rag/flow/parser/parser.py | 31 +++++++++++++++---- web/src/locales/en.ts | 3 ++ web/src/locales/zh.ts | 2 ++ web/src/pages/agent/constant/pipeline.tsx | 4 +++ .../form/parser-form/common-form-fields.tsx | 23 ++++++++++++++ .../pages/agent/form/parser-form/index.tsx | 1 + .../form/parser-form/pdf-form-fields.tsx | 16 +++++++--- .../parser-form/spreadsheet-form-fields.tsx | 15 ++++++--- .../parser-form/text-html-form-fields.tsx | 22 ++++++++++--- .../form/parser-form/word-form-fields.tsx | 18 ++++++++--- web/src/pages/agent/utils.ts | 12 +++++-- 12 files changed, 120 insertions(+), 31 deletions(-) diff --git a/rag/flow/chunker/title_chunker/common.py b/rag/flow/chunker/title_chunker/common.py index e02389aa87..95a19fc3ed 100644 --- a/rag/flow/chunker/title_chunker/common.py +++ b/rag/flow/chunker/title_chunker/common.py @@ -234,10 +234,6 @@ class BaseTitleChunker(ABC): return self.resolve_outline_levels(line_records) or self.resolve_frequency_levels(line_records) - def resolve_manual_levels(self, line_records): - return self.resolve_title_levels(line_records)["levels"] - - def build_chunks_from_record_groups(self, record_groups): # Strategy code decides record grouping. This method materializes each # group into the output chunk representation. For PDF-like inputs, the diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index cf756649b7..3f98a7efed 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -110,6 +110,7 @@ class ParserParam(ProcessParamBase): "pdf": { "parse_method": "deepdoc", # deepdoc/plain_text/tcadp_parser/vlm "lang": "Chinese", + "flatten_media_to_text": False, "remove_toc": False, "suffix": [ "pdf", @@ -118,6 +119,7 @@ class ParserParam(ProcessParamBase): }, "spreadsheet": { "parse_method": "deepdoc", # deepdoc/tcadp_parser + "flatten_media_to_text": False, "output_format": "html", "suffix": [ "xls", @@ -133,6 +135,7 @@ class ParserParam(ProcessParamBase): "output_format": "json", }, "docx": { + "flatten_media_to_text": False, "remove_toc": False, "suffix": [ "docx", @@ -140,6 +143,7 @@ class ParserParam(ProcessParamBase): "output_format": "json", }, "markdown": { + "flatten_media_to_text": False, "suffix": ["md", "markdown", "mdx"], "remove_toc": False, "output_format": "json", @@ -312,6 +316,7 @@ class Parser(ProcessBase): self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.") conf = self._param.setups["pdf"] self.set_output("output_format", conf["output_format"]) + flatten_media_to_text = conf.get("flatten_media_to_text") pdf_parser = None # Optional PDF post-processing flags applied after parsing. @@ -571,7 +576,9 @@ class Parser(ProcessBase): layout_counters[layout] = seq + 1 b["layoutno"] = f"{layout}-{seq}" - if layout == "table": + if flatten_media_to_text: + b["doc_type_kwd"] = "text" + elif layout == "table": b["doc_type_kwd"] = "table" elif layout == "figure": b["doc_type_kwd"] = "image" @@ -668,6 +675,7 @@ class Parser(ProcessBase): self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.") conf = self._param.setups["spreadsheet"] self.set_output("output_format", conf["output_format"]) + flatten_media_to_text = conf.get("flatten_media_to_text") parse_method = conf.get("parse_method", "deepdoc") @@ -723,7 +731,12 @@ class Parser(ProcessBase): # Add tables as text for table in tables: if table: - result.append({"text": table, "doc_type_kwd": "table"}) + result.append( + { + "text": table, + "doc_type_kwd": "text" if flatten_media_to_text else "table", + } + ) self.set_output("json", result) @@ -771,6 +784,7 @@ class Parser(ProcessBase): self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOCX document") conf = self._param.setups["docx"] self.set_output("output_format", conf["output_format"]) + flatten_media_to_text = conf.get("flatten_media_to_text") if re.search(r"\.doc$", name, re.IGNORECASE): self.set_output("file", {**kwargs.get("file", {}), "outlines": []}) @@ -823,7 +837,7 @@ class Parser(ProcessBase): { "text": text, "image": image, - "doc_type_kwd": "image" if image is not None else "text", + "doc_type_kwd": "text" if flatten_media_to_text or image is None else "image", } ) if html: @@ -831,7 +845,7 @@ class Parser(ProcessBase): { "text": html, "image": None, - "doc_type_kwd": "table", + "doc_type_kwd": "text" if flatten_media_to_text else "table", } ) enhance_media_sections_with_vision( @@ -927,6 +941,7 @@ class Parser(ProcessBase): self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.") conf = self._param.setups["markdown"] self.set_output("output_format", conf["output_format"]) + flatten_media_to_text = conf.get("flatten_media_to_text") markdown_parser = naive_markdown_parser() sections, tables, section_images = markdown_parser( @@ -952,7 +967,11 @@ class Parser(ProcessBase): # If multiple images found, combine them using concat_img combined_image = reduce(concat_img, images) if len(images) > 1 else images[0] json_result["image"] = combined_image - json_result["doc_type_kwd"] = "image" if json_result.get("image") is not None else "text" + json_result["doc_type_kwd"] = ( + "text" + if flatten_media_to_text or json_result.get("image") is None + else "image" + ) json_results.append(json_result) for table in tables: @@ -961,7 +980,7 @@ class Parser(ProcessBase): json_results.append( { "text": table_text, - "doc_type_kwd": "table", + "doc_type_kwd": "text" if flatten_media_to_text else "table", } ) diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index 33dc1a63fd..a2dea44bcd 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -1575,6 +1575,9 @@ Best for: Documents with flowing, contextually connected content — such as boo oneChunkTitle: 'Note', oneChunkDescription: 'All parsed sections will be merged in order into a single chunk.', + flattenMediaToText: 'Disable vision model', + flattenMediaToTextTip: + 'Treat image and table sections as plain text and skip vision enhancement.', enableChildrenDelimiters: 'Child chunk are used for retrieval', merge: 'Merge', split: 'Split', diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index 924fc9147a..8251667840 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -1324,6 +1324,8 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于 oneChunkTitle: 'Note', oneChunkDescription: '所有解析后的 sections 会按原始顺序合并为 1 个 chunk。', + flattenMediaToText: '禁用视觉模型', + flattenMediaToTextTip: '将图片和表格区块按普通文本处理,并跳过视觉增强。', merge: '合并', split: '拆分', script: '脚本', diff --git a/web/src/pages/agent/constant/pipeline.tsx b/web/src/pages/agent/constant/pipeline.tsx index 67eca573f3..307dab82dc 100644 --- a/web/src/pages/agent/constant/pipeline.tsx +++ b/web/src/pages/agent/constant/pipeline.tsx @@ -197,12 +197,14 @@ export const initialParserValues = { output_format: PdfOutputFormat.Json, parse_method: ParseDocumentType.DeepDOC, preprocess: PreprocessValue.main_content, + flatten_media_to_text: false, }, { fileFormat: FileType.Spreadsheet, output_format: SpreadsheetOutputFormat.Html, parse_method: ParseDocumentType.DeepDOC, preprocess: PreprocessValue.main_content, + flatten_media_to_text: false, }, { fileFormat: FileType.Image, @@ -221,6 +223,7 @@ export const initialParserValues = { fileFormat: FileType.TextMarkdown, output_format: TextMarkdownOutputFormat.Text, preprocess: PreprocessValue.main_content, + flatten_media_to_text: false, }, { fileFormat: FileType.Code, @@ -241,6 +244,7 @@ export const initialParserValues = { fileFormat: FileType.Docx, output_format: DocxOutputFormat.Json, preprocess: PreprocessValue.main_content, + flatten_media_to_text: false, }, { fileFormat: FileType.PowerPoint, diff --git a/web/src/pages/agent/form/parser-form/common-form-fields.tsx b/web/src/pages/agent/form/parser-form/common-form-fields.tsx index c0f0082d36..de4757573b 100644 --- a/web/src/pages/agent/form/parser-form/common-form-fields.tsx +++ b/web/src/pages/agent/form/parser-form/common-form-fields.tsx @@ -88,6 +88,29 @@ export function LargeModelFormField({ ); } +export function FlattenMediaToTextFormField({ prefix }: CommonProps) { + const { t } = useTranslation(); + return ( + + {(field) => ( + { + field.onChange?.(checked); + }} + /> + )} + + ); +} + export function TwoColumnCheckFormField({ prefix }: CommonProps) { const { t } = useTranslation(); return ( diff --git a/web/src/pages/agent/form/parser-form/index.tsx b/web/src/pages/agent/form/parser-form/index.tsx index fc49cbdb17..1aa32a83e9 100644 --- a/web/src/pages/agent/form/parser-form/index.tsx +++ b/web/src/pages/agent/form/parser-form/index.tsx @@ -154,6 +154,7 @@ export const FormSchema = z.object({ lang: z.string().optional(), fields: z.array(z.string()).optional(), vlm: z.object({ llm_id: z.string().optional() }).optional(), + flatten_media_to_text: z.boolean().optional(), system_prompt: z.string().optional(), table_result_type: z.string().optional(), markdown_image_response_type: z.string().optional(), diff --git a/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx b/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx index 33366a9bfc..94eb516e54 100644 --- a/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx +++ b/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx @@ -11,6 +11,7 @@ import { useEffect, useMemo } from 'react'; import { useFormContext, useWatch } from 'react-hook-form'; import { useTranslation } from 'react-i18next'; import { + FlattenMediaToTextFormField, LanguageFormField, LargeModelFormField, ParserMethodFormField, @@ -42,6 +43,9 @@ export function PdfFormFields({ prefix }: CommonProps) { const parseMethod = useWatch({ name: parseMethodName, }); + const flattenMediaToText = useWatch({ + name: buildFieldNameWithPrefix('flatten_media_to_text', prefix), + }); const languageShown = useMemo(() => { return ( @@ -101,11 +105,13 @@ export function PdfFormFields({ prefix }: CommonProps) { - - + + {!flattenMediaToText && ( + + )} {languageShown && } {tcadpOptionsShown && ( <> diff --git a/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx b/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx index 091de1d19a..d7566c3b05 100644 --- a/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx +++ b/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx @@ -11,6 +11,7 @@ import { useEffect, useMemo } from 'react'; import { useFormContext, useWatch } from 'react-hook-form'; import { useTranslation } from 'react-i18next'; import { + FlattenMediaToTextFormField, LargeModelFormField, ParserMethodFormField, } from './common-form-fields'; @@ -39,6 +40,9 @@ export function SpreadsheetFormFields({ prefix }: CommonProps) { const parseMethod = useWatch({ name: parseMethodName, }); + const flattenMediaToText = useWatch({ + name: buildFieldNameWithPrefix('flatten_media_to_text', prefix), + }); // Spreadsheet only supports DeepDOC and TCADPParser const optionsWithoutLLM = [ @@ -97,10 +101,13 @@ export function SpreadsheetFormFields({ prefix }: CommonProps) { prefix={prefix} optionsWithoutLLM={optionsWithoutLLM} > - + + {!flattenMediaToText && ( + + )} {tcadpOptionsShown && ( <> - + + {!flattenMediaToText && ( + + )} ); } diff --git a/web/src/pages/agent/form/parser-form/word-form-fields.tsx b/web/src/pages/agent/form/parser-form/word-form-fields.tsx index 1db5783b6f..a2808d7ba0 100644 --- a/web/src/pages/agent/form/parser-form/word-form-fields.tsx +++ b/web/src/pages/agent/form/parser-form/word-form-fields.tsx @@ -1,24 +1,32 @@ import { LlmModelType } from '@/constants/knowledge'; import { useComposeLlmOptionsByModelTypes } from '@/hooks/use-llm-request'; +import { useWatch } from 'react-hook-form'; import { + FlattenMediaToTextFormField, LargeModelFormField, OutputFormatFormFieldProps, RmdirFormField, } from './common-form-fields'; +import { buildFieldNameWithPrefix } from './utils'; export function WordFormFields({ prefix }: OutputFormatFormFieldProps) { const modelOptions = useComposeLlmOptionsByModelTypes([ LlmModelType.Image2text, ]); + const flattenMediaToText = useWatch({ + name: buildFieldNameWithPrefix('flatten_media_to_text', prefix), + }); return ( <> - {/* Multimodal Model */} - + + {!flattenMediaToText && ( + + )} ); } diff --git a/web/src/pages/agent/utils.ts b/web/src/pages/agent/utils.ts index 94d8ab9cdf..5b21780741 100644 --- a/web/src/pages/agent/utils.ts +++ b/web/src/pages/agent/utils.ts @@ -228,6 +228,7 @@ function transformParserParams(params: ParserFormSchemaType) { parse_method: cur.parse_method, lang: cur.lang, vlm: { llm_id: cur.vlm?.llm_id }, + flatten_media_to_text: cur.flatten_media_to_text, enable_multi_column: cur.enable_multi_column, remove_toc: cur.remove_toc, }; @@ -243,6 +244,7 @@ function transformParserParams(params: ParserFormSchemaType) { ...filteredSetup, parse_method: cur.parse_method, vlm: { llm_id: cur.vlm?.llm_id }, + flatten_media_to_text: cur.flatten_media_to_text, }; // Only include TCADP parameters if TCADP Parser is selected if (cur.parse_method?.toLowerCase() === 'tcadp parser') { @@ -277,10 +279,16 @@ function transformParserParams(params: ParserFormSchemaType) { fields: cur.fields, }; break; - case FileType.Video: case FileType.Docx: - case FileType.Audio: case FileType.TextMarkdown: + filteredSetup = { + ...filteredSetup, + vlm: { llm_id: cur.vlm?.llm_id }, + flatten_media_to_text: cur.flatten_media_to_text, + }; + break; + case FileType.Video: + case FileType.Audio: filteredSetup = { ...filteredSetup, vlm: { llm_id: cur.vlm?.llm_id },