Feat: add button to turn off vlm parsing (#14125)

### What problem does this PR solve?

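Feat: add a button to turn off VLM (vision model) parsing. When enabled, image and table sections are treated as plain text and vision enhancement is skipped.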

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: chanx <1243304602@qq.com>
This commit is contained in:
Magicbook1108
2026-04-15 19:06:00 +08:00
committed by GitHub
parent dce0b1c030
commit 944a90d645
12 changed files with 120 additions and 31 deletions

View File

@@ -234,10 +234,6 @@ class BaseTitleChunker(ABC):
return self.resolve_outline_levels(line_records) or self.resolve_frequency_levels(line_records)
def resolve_manual_levels(self, line_records):
return self.resolve_title_levels(line_records)["levels"]
def build_chunks_from_record_groups(self, record_groups):
# Strategy code decides record grouping. This method materializes each
# group into the output chunk representation. For PDF-like inputs, the

View File

@@ -110,6 +110,7 @@ class ParserParam(ProcessParamBase):
"pdf": {
"parse_method": "deepdoc", # deepdoc/plain_text/tcadp_parser/vlm
"lang": "Chinese",
"flatten_media_to_text": False,
"remove_toc": False,
"suffix": [
"pdf",
@@ -118,6 +119,7 @@ class ParserParam(ProcessParamBase):
},
"spreadsheet": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser
"flatten_media_to_text": False,
"output_format": "html",
"suffix": [
"xls",
@@ -133,6 +135,7 @@ class ParserParam(ProcessParamBase):
"output_format": "json",
},
"docx": {
"flatten_media_to_text": False,
"remove_toc": False,
"suffix": [
"docx",
@@ -140,6 +143,7 @@ class ParserParam(ProcessParamBase):
"output_format": "json",
},
"markdown": {
"flatten_media_to_text": False,
"suffix": ["md", "markdown", "mdx"],
"remove_toc": False,
"output_format": "json",
@@ -312,6 +316,7 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
conf = self._param.setups["pdf"]
self.set_output("output_format", conf["output_format"])
flatten_media_to_text = conf.get("flatten_media_to_text")
pdf_parser = None
# Optional PDF post-processing flags applied after parsing.
@@ -571,7 +576,9 @@ class Parser(ProcessBase):
layout_counters[layout] = seq + 1
b["layoutno"] = f"{layout}-{seq}"
if layout == "table":
if flatten_media_to_text:
b["doc_type_kwd"] = "text"
elif layout == "table":
b["doc_type_kwd"] = "table"
elif layout == "figure":
b["doc_type_kwd"] = "image"
@@ -668,6 +675,7 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
conf = self._param.setups["spreadsheet"]
self.set_output("output_format", conf["output_format"])
flatten_media_to_text = conf.get("flatten_media_to_text")
parse_method = conf.get("parse_method", "deepdoc")
@@ -723,7 +731,12 @@ class Parser(ProcessBase):
# Add tables as text
for table in tables:
if table:
result.append({"text": table, "doc_type_kwd": "table"})
result.append(
{
"text": table,
"doc_type_kwd": "text" if flatten_media_to_text else "table",
}
)
self.set_output("json", result)
@@ -771,6 +784,7 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOCX document")
conf = self._param.setups["docx"]
self.set_output("output_format", conf["output_format"])
flatten_media_to_text = conf.get("flatten_media_to_text")
if re.search(r"\.doc$", name, re.IGNORECASE):
self.set_output("file", {**kwargs.get("file", {}), "outlines": []})
@@ -823,7 +837,7 @@ class Parser(ProcessBase):
{
"text": text,
"image": image,
"doc_type_kwd": "image" if image is not None else "text",
"doc_type_kwd": "text" if flatten_media_to_text or image is None else "image",
}
)
if html:
@@ -831,7 +845,7 @@ class Parser(ProcessBase):
{
"text": html,
"image": None,
"doc_type_kwd": "table",
"doc_type_kwd": "text" if flatten_media_to_text else "table",
}
)
enhance_media_sections_with_vision(
@@ -927,6 +941,7 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
conf = self._param.setups["markdown"]
self.set_output("output_format", conf["output_format"])
flatten_media_to_text = conf.get("flatten_media_to_text")
markdown_parser = naive_markdown_parser()
sections, tables, section_images = markdown_parser(
@@ -952,7 +967,11 @@ class Parser(ProcessBase):
# If multiple images found, combine them using concat_img
combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
json_result["image"] = combined_image
json_result["doc_type_kwd"] = "image" if json_result.get("image") is not None else "text"
json_result["doc_type_kwd"] = (
"text"
if flatten_media_to_text or json_result.get("image") is None
else "image"
)
json_results.append(json_result)
for table in tables:
@@ -961,7 +980,7 @@ class Parser(ProcessBase):
json_results.append(
{
"text": table_text,
"doc_type_kwd": "table",
"doc_type_kwd": "text" if flatten_media_to_text else "table",
}
)

View File

@@ -1575,6 +1575,9 @@ Best for: Documents with flowing, contextually connected content — such as boo
oneChunkTitle: 'Note',
oneChunkDescription:
'All parsed sections will be merged in order into a single chunk.',
flattenMediaToText: 'Disable vision model',
flattenMediaToTextTip:
'Treat image and table sections as plain text and skip vision enhancement.',
enableChildrenDelimiters: 'Child chunk are used for retrieval',
merge: 'Merge',
split: 'Split',

View File

@@ -1324,6 +1324,8 @@ General实体和关系提取提示来自 GitHub - microsoft/graphrag基于
oneChunkTitle: 'Note',
oneChunkDescription:
'所有解析后的 sections 会按原始顺序合并为 1 个 chunk。',
flattenMediaToText: '禁用视觉模型',
flattenMediaToTextTip: '将图片和表格区块按普通文本处理,并跳过视觉增强。',
merge: '合并',
split: '拆分',
script: '脚本',

View File

@@ -197,12 +197,14 @@ export const initialParserValues = {
output_format: PdfOutputFormat.Json,
parse_method: ParseDocumentType.DeepDOC,
preprocess: PreprocessValue.main_content,
flatten_media_to_text: false,
},
{
fileFormat: FileType.Spreadsheet,
output_format: SpreadsheetOutputFormat.Html,
parse_method: ParseDocumentType.DeepDOC,
preprocess: PreprocessValue.main_content,
flatten_media_to_text: false,
},
{
fileFormat: FileType.Image,
@@ -221,6 +223,7 @@ export const initialParserValues = {
fileFormat: FileType.TextMarkdown,
output_format: TextMarkdownOutputFormat.Text,
preprocess: PreprocessValue.main_content,
flatten_media_to_text: false,
},
{
fileFormat: FileType.Code,
@@ -241,6 +244,7 @@ export const initialParserValues = {
fileFormat: FileType.Docx,
output_format: DocxOutputFormat.Json,
preprocess: PreprocessValue.main_content,
flatten_media_to_text: false,
},
{
fileFormat: FileType.PowerPoint,

View File

@@ -88,6 +88,29 @@ export function LargeModelFormField({
);
}
/**
 * Switch form field bound to the `flatten_media_to_text` setting.
 *
 * The label and tooltip come from the `flow.flattenMediaToText` and
 * `flow.flattenMediaToTextTip` translation keys.
 *
 * @param prefix - Forwarded to `buildFieldNameWithPrefix` to build the
 *   form field name.
 */
export function FlattenMediaToTextFormField({ prefix }: CommonProps) {
  const { t } = useTranslation();

  return (
    <RAGFlowFormItem
      name={buildFieldNameWithPrefix('flatten_media_to_text', prefix)}
      label={t('flow.flattenMediaToText')}
      tooltip={t('flow.flattenMediaToTextTip')}
      horizontal={true}
      labelClassName="w-full"
      valueClassName="w-8"
    >
      {(field) => {
        // Coerce to a boolean so an unset (undefined) value renders as "off"
        // and the switch always receives a defined `checked` prop.
        const isChecked = Boolean(field.value);

        return (
          <Switch
            checked={isChecked}
            onCheckedChange={(checked) => {
              field.onChange?.(checked);
            }}
          />
        );
      }}
    </RAGFlowFormItem>
  );
}
export function TwoColumnCheckFormField({ prefix }: CommonProps) {
const { t } = useTranslation();
return (

View File

@@ -154,6 +154,7 @@ export const FormSchema = z.object({
lang: z.string().optional(),
fields: z.array(z.string()).optional(),
vlm: z.object({ llm_id: z.string().optional() }).optional(),
flatten_media_to_text: z.boolean().optional(),
system_prompt: z.string().optional(),
table_result_type: z.string().optional(),
markdown_image_response_type: z.string().optional(),

View File

@@ -11,6 +11,7 @@ import { useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import {
FlattenMediaToTextFormField,
LanguageFormField,
LargeModelFormField,
ParserMethodFormField,
@@ -42,6 +43,9 @@ export function PdfFormFields({ prefix }: CommonProps) {
const parseMethod = useWatch({
name: parseMethodName,
});
const flattenMediaToText = useWatch({
name: buildFieldNameWithPrefix('flatten_media_to_text', prefix),
});
const languageShown = useMemo(() => {
return (
@@ -101,11 +105,13 @@ export function PdfFormFields({ prefix }: CommonProps) {
<TwoColumnCheckFormField prefix={prefix} />
<RmdirFormField prefix={prefix} />
<ParserMethodFormField prefix={prefix}></ParserMethodFormField>
<LargeModelFormField
prefix={prefix}
options={modelOptions}
></LargeModelFormField>
<FlattenMediaToTextFormField prefix={prefix} />
{!flattenMediaToText && (
<LargeModelFormField
prefix={prefix}
options={modelOptions}
></LargeModelFormField>
)}
{languageShown && <LanguageFormField prefix={prefix}></LanguageFormField>}
{tcadpOptionsShown && (
<>

View File

@@ -11,6 +11,7 @@ import { useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import {
FlattenMediaToTextFormField,
LargeModelFormField,
ParserMethodFormField,
} from './common-form-fields';
@@ -39,6 +40,9 @@ export function SpreadsheetFormFields({ prefix }: CommonProps) {
const parseMethod = useWatch({
name: parseMethodName,
});
const flattenMediaToText = useWatch({
name: buildFieldNameWithPrefix('flatten_media_to_text', prefix),
});
// Spreadsheet only supports DeepDOC and TCADPParser
const optionsWithoutLLM = [
@@ -97,10 +101,13 @@ export function SpreadsheetFormFields({ prefix }: CommonProps) {
prefix={prefix}
optionsWithoutLLM={optionsWithoutLLM}
></ParserMethodFormField>
<LargeModelFormField
prefix={prefix}
options={modelOptions}
></LargeModelFormField>
<FlattenMediaToTextFormField prefix={prefix} />
{!flattenMediaToText && (
<LargeModelFormField
prefix={prefix}
options={modelOptions}
></LargeModelFormField>
)}
{tcadpOptionsShown && (
<>
<RAGFlowFormItem

View File

@@ -1,20 +1,32 @@
import { LlmModelType } from '@/constants/knowledge';
import { useComposeLlmOptionsByModelTypes } from '@/hooks/use-llm-request';
import { LargeModelFormField, RmdirFormField } from './common-form-fields';
import { useWatch } from 'react-hook-form';
import {
FlattenMediaToTextFormField,
LargeModelFormField,
RmdirFormField,
} from './common-form-fields';
import { CommonProps } from './interface';
import { buildFieldNameWithPrefix } from './utils';
/**
 * Parser form fields for text/markdown files.
 *
 * Renders the `RmdirFormField`, the `flatten_media_to_text` toggle, and the
 * model picker. The picker is rendered once, and only while the toggle is off.
 *
 * @param prefix - Forwarded to each field and to `buildFieldNameWithPrefix`
 *   to build the watched field name.
 */
export function TextMarkdownFormFields({ prefix }: CommonProps) {
  const modelOptions = useComposeLlmOptionsByModelTypes([
    LlmModelType.Image2text,
  ]);

  // Watch the toggle so the model picker can be shown or hidden as it changes.
  const flattenMediaToText = useWatch({
    name: buildFieldNameWithPrefix('flatten_media_to_text', prefix),
  });

  return (
    <>
      <RmdirFormField prefix={prefix} />
      <FlattenMediaToTextFormField prefix={prefix} />
      {!flattenMediaToText && (
        <LargeModelFormField
          prefix={prefix}
          options={modelOptions}
        ></LargeModelFormField>
      )}
    </>
  );
}

View File

@@ -1,24 +1,32 @@
import { LlmModelType } from '@/constants/knowledge';
import { useComposeLlmOptionsByModelTypes } from '@/hooks/use-llm-request';
import { useWatch } from 'react-hook-form';
import {
FlattenMediaToTextFormField,
LargeModelFormField,
OutputFormatFormFieldProps,
RmdirFormField,
} from './common-form-fields';
import { buildFieldNameWithPrefix } from './utils';
/**
 * Parser form fields for Word (DOCX) files.
 *
 * Renders the `RmdirFormField`, the `flatten_media_to_text` toggle, and the
 * multimodal model picker. The picker is rendered once, and only while the
 * toggle is off.
 *
 * @param prefix - Forwarded to each field and to `buildFieldNameWithPrefix`
 *   to build the watched field name.
 */
export function WordFormFields({ prefix }: OutputFormatFormFieldProps) {
  const modelOptions = useComposeLlmOptionsByModelTypes([
    LlmModelType.Image2text,
  ]);

  // Watch the toggle so the model picker can be shown or hidden as it changes.
  const flattenMediaToText = useWatch({
    name: buildFieldNameWithPrefix('flatten_media_to_text', prefix),
  });

  return (
    <>
      <RmdirFormField prefix={prefix} />
      <FlattenMediaToTextFormField prefix={prefix} />
      {/* Multimodal Model */}
      {!flattenMediaToText && (
        <LargeModelFormField
          prefix={prefix}
          options={modelOptions}
        ></LargeModelFormField>
      )}
    </>
  );
}

View File

@@ -228,6 +228,7 @@ function transformParserParams(params: ParserFormSchemaType) {
parse_method: cur.parse_method,
lang: cur.lang,
vlm: { llm_id: cur.vlm?.llm_id },
flatten_media_to_text: cur.flatten_media_to_text,
enable_multi_column: cur.enable_multi_column,
remove_toc: cur.remove_toc,
};
@@ -243,6 +244,7 @@ function transformParserParams(params: ParserFormSchemaType) {
...filteredSetup,
parse_method: cur.parse_method,
vlm: { llm_id: cur.vlm?.llm_id },
flatten_media_to_text: cur.flatten_media_to_text,
};
// Only include TCADP parameters if TCADP Parser is selected
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
@@ -277,10 +279,16 @@ function transformParserParams(params: ParserFormSchemaType) {
fields: cur.fields,
};
break;
case FileType.Video:
case FileType.Docx:
case FileType.Audio:
case FileType.TextMarkdown:
filteredSetup = {
...filteredSetup,
vlm: { llm_id: cur.vlm?.llm_id },
flatten_media_to_text: cur.flatten_media_to_text,
};
break;
case FileType.Video:
case FileType.Audio:
filteredSetup = {
...filteredSetup,
vlm: { llm_id: cur.vlm?.llm_id },