diff --git a/rag/flow/chunker/title_chunker/common.py b/rag/flow/chunker/title_chunker/common.py
index e02389aa87..95a19fc3ed 100644
--- a/rag/flow/chunker/title_chunker/common.py
+++ b/rag/flow/chunker/title_chunker/common.py
@@ -234,10 +234,6 @@ class BaseTitleChunker(ABC):
return self.resolve_outline_levels(line_records) or self.resolve_frequency_levels(line_records)
- def resolve_manual_levels(self, line_records):
- return self.resolve_title_levels(line_records)["levels"]
-
-
def build_chunks_from_record_groups(self, record_groups):
# Strategy code decides record grouping. This method materializes each
# group into the output chunk representation. For PDF-like inputs, the
diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py
index cf756649b7..3f98a7efed 100644
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -110,6 +110,7 @@ class ParserParam(ProcessParamBase):
"pdf": {
"parse_method": "deepdoc", # deepdoc/plain_text/tcadp_parser/vlm
"lang": "Chinese",
+ "flatten_media_to_text": False,
"remove_toc": False,
"suffix": [
"pdf",
@@ -118,6 +119,7 @@ class ParserParam(ProcessParamBase):
},
"spreadsheet": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser
+ "flatten_media_to_text": False,
"output_format": "html",
"suffix": [
"xls",
@@ -133,6 +135,7 @@ class ParserParam(ProcessParamBase):
"output_format": "json",
},
"docx": {
+ "flatten_media_to_text": False,
"remove_toc": False,
"suffix": [
"docx",
@@ -140,6 +143,7 @@ class ParserParam(ProcessParamBase):
"output_format": "json",
},
"markdown": {
+ "flatten_media_to_text": False,
"suffix": ["md", "markdown", "mdx"],
"remove_toc": False,
"output_format": "json",
@@ -312,6 +316,7 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
conf = self._param.setups["pdf"]
self.set_output("output_format", conf["output_format"])
+ flatten_media_to_text = conf.get("flatten_media_to_text")
pdf_parser = None
# Optional PDF post-processing flags applied after parsing.
@@ -571,7 +576,9 @@ class Parser(ProcessBase):
layout_counters[layout] = seq + 1
b["layoutno"] = f"{layout}-{seq}"
- if layout == "table":
+ if flatten_media_to_text:
+ b["doc_type_kwd"] = "text"
+ elif layout == "table":
b["doc_type_kwd"] = "table"
elif layout == "figure":
b["doc_type_kwd"] = "image"
@@ -668,6 +675,7 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
conf = self._param.setups["spreadsheet"]
self.set_output("output_format", conf["output_format"])
+ flatten_media_to_text = conf.get("flatten_media_to_text")
parse_method = conf.get("parse_method", "deepdoc")
@@ -723,7 +731,12 @@ class Parser(ProcessBase):
# Add tables as text
for table in tables:
if table:
- result.append({"text": table, "doc_type_kwd": "table"})
+ result.append(
+ {
+ "text": table,
+ "doc_type_kwd": "text" if flatten_media_to_text else "table",
+ }
+ )
self.set_output("json", result)
@@ -771,6 +784,7 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOCX document")
conf = self._param.setups["docx"]
self.set_output("output_format", conf["output_format"])
+ flatten_media_to_text = conf.get("flatten_media_to_text")
if re.search(r"\.doc$", name, re.IGNORECASE):
self.set_output("file", {**kwargs.get("file", {}), "outlines": []})
@@ -823,7 +837,7 @@ class Parser(ProcessBase):
{
"text": text,
"image": image,
- "doc_type_kwd": "image" if image is not None else "text",
+ "doc_type_kwd": "text" if flatten_media_to_text or image is None else "image",
}
)
if html:
@@ -831,7 +845,7 @@ class Parser(ProcessBase):
{
"text": html,
"image": None,
- "doc_type_kwd": "table",
+ "doc_type_kwd": "text" if flatten_media_to_text else "table",
}
)
enhance_media_sections_with_vision(
@@ -927,6 +941,7 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
conf = self._param.setups["markdown"]
self.set_output("output_format", conf["output_format"])
+ flatten_media_to_text = conf.get("flatten_media_to_text")
markdown_parser = naive_markdown_parser()
sections, tables, section_images = markdown_parser(
@@ -952,7 +967,11 @@ class Parser(ProcessBase):
# If multiple images found, combine them using concat_img
combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
json_result["image"] = combined_image
- json_result["doc_type_kwd"] = "image" if json_result.get("image") is not None else "text"
+ json_result["doc_type_kwd"] = (
+ "text"
+ if flatten_media_to_text or json_result.get("image") is None
+ else "image"
+ )
json_results.append(json_result)
for table in tables:
@@ -961,7 +980,7 @@ class Parser(ProcessBase):
json_results.append(
{
"text": table_text,
- "doc_type_kwd": "table",
+ "doc_type_kwd": "text" if flatten_media_to_text else "table",
}
)
diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts
index 33dc1a63fd..a2dea44bcd 100644
--- a/web/src/locales/en.ts
+++ b/web/src/locales/en.ts
@@ -1575,6 +1575,9 @@ Best for: Documents with flowing, contextually connected content — such as boo
oneChunkTitle: 'Note',
oneChunkDescription:
'All parsed sections will be merged in order into a single chunk.',
+ flattenMediaToText: 'Disable vision model',
+ flattenMediaToTextTip:
+ 'Treat image and table sections as plain text and skip vision enhancement.',
enableChildrenDelimiters: 'Child chunk are used for retrieval',
merge: 'Merge',
split: 'Split',
diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts
index 924fc9147a..8251667840 100644
--- a/web/src/locales/zh.ts
+++ b/web/src/locales/zh.ts
@@ -1324,6 +1324,8 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于
oneChunkTitle: 'Note',
oneChunkDescription:
'所有解析后的 sections 会按原始顺序合并为 1 个 chunk。',
+ flattenMediaToText: '禁用视觉模型',
+ flattenMediaToTextTip: '将图片和表格区块按普通文本处理,并跳过视觉增强。',
merge: '合并',
split: '拆分',
script: '脚本',
diff --git a/web/src/pages/agent/constant/pipeline.tsx b/web/src/pages/agent/constant/pipeline.tsx
index 67eca573f3..307dab82dc 100644
--- a/web/src/pages/agent/constant/pipeline.tsx
+++ b/web/src/pages/agent/constant/pipeline.tsx
@@ -197,12 +197,14 @@ export const initialParserValues = {
output_format: PdfOutputFormat.Json,
parse_method: ParseDocumentType.DeepDOC,
preprocess: PreprocessValue.main_content,
+ flatten_media_to_text: false,
},
{
fileFormat: FileType.Spreadsheet,
output_format: SpreadsheetOutputFormat.Html,
parse_method: ParseDocumentType.DeepDOC,
preprocess: PreprocessValue.main_content,
+ flatten_media_to_text: false,
},
{
fileFormat: FileType.Image,
@@ -221,6 +223,7 @@ export const initialParserValues = {
fileFormat: FileType.TextMarkdown,
output_format: TextMarkdownOutputFormat.Text,
preprocess: PreprocessValue.main_content,
+ flatten_media_to_text: false,
},
{
fileFormat: FileType.Code,
@@ -241,6 +244,7 @@ export const initialParserValues = {
fileFormat: FileType.Docx,
output_format: DocxOutputFormat.Json,
preprocess: PreprocessValue.main_content,
+ flatten_media_to_text: false,
},
{
fileFormat: FileType.PowerPoint,
diff --git a/web/src/pages/agent/form/parser-form/common-form-fields.tsx b/web/src/pages/agent/form/parser-form/common-form-fields.tsx
index c0f0082d36..de4757573b 100644
--- a/web/src/pages/agent/form/parser-form/common-form-fields.tsx
+++ b/web/src/pages/agent/form/parser-form/common-form-fields.tsx
@@ -88,6 +88,29 @@ export function LargeModelFormField({
);
}
+export function FlattenMediaToTextFormField({ prefix }: CommonProps) { // Form field reporting a `checked` value; presumably bound to the prefixed `flatten_media_to_text` setup field (binding not visible in this view) — confirm.
+ const { t } = useTranslation();
+ return (
+
+ {(field) => (
+ {
+ field.onChange?.(checked); // Forward the new checked state to the form field; `?.` makes this a no-op when no onChange handler is provided.
+ }}
+ />
+ )}
+
+ );
+}
+
export function TwoColumnCheckFormField({ prefix }: CommonProps) {
const { t } = useTranslation();
return (
diff --git a/web/src/pages/agent/form/parser-form/index.tsx b/web/src/pages/agent/form/parser-form/index.tsx
index fc49cbdb17..1aa32a83e9 100644
--- a/web/src/pages/agent/form/parser-form/index.tsx
+++ b/web/src/pages/agent/form/parser-form/index.tsx
@@ -154,6 +154,7 @@ export const FormSchema = z.object({
lang: z.string().optional(),
fields: z.array(z.string()).optional(),
vlm: z.object({ llm_id: z.string().optional() }).optional(),
+ flatten_media_to_text: z.boolean().optional(),
system_prompt: z.string().optional(),
table_result_type: z.string().optional(),
markdown_image_response_type: z.string().optional(),
diff --git a/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx b/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx
index 33366a9bfc..94eb516e54 100644
--- a/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx
+++ b/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx
@@ -11,6 +11,7 @@ import { useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import {
+ FlattenMediaToTextFormField,
LanguageFormField,
LargeModelFormField,
ParserMethodFormField,
@@ -42,6 +43,9 @@ export function PdfFormFields({ prefix }: CommonProps) {
const parseMethod = useWatch({
name: parseMethodName,
});
+ const flattenMediaToText = useWatch({
+ name: buildFieldNameWithPrefix('flatten_media_to_text', prefix),
+ });
const languageShown = useMemo(() => {
return (
@@ -101,11 +105,13 @@ export function PdfFormFields({ prefix }: CommonProps) {
-
-
+
+ {!flattenMediaToText && (
+
+ )}
{languageShown && }
{tcadpOptionsShown && (
<>
diff --git a/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx b/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx
index 091de1d19a..d7566c3b05 100644
--- a/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx
+++ b/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx
@@ -11,6 +11,7 @@ import { useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import {
+ FlattenMediaToTextFormField,
LargeModelFormField,
ParserMethodFormField,
} from './common-form-fields';
@@ -39,6 +40,9 @@ export function SpreadsheetFormFields({ prefix }: CommonProps) {
const parseMethod = useWatch({
name: parseMethodName,
});
+ const flattenMediaToText = useWatch({
+ name: buildFieldNameWithPrefix('flatten_media_to_text', prefix),
+ });
// Spreadsheet only supports DeepDOC and TCADPParser
const optionsWithoutLLM = [
@@ -97,10 +101,13 @@ export function SpreadsheetFormFields({ prefix }: CommonProps) {
prefix={prefix}
optionsWithoutLLM={optionsWithoutLLM}
>
-
+
+ {!flattenMediaToText && (
+
+ )}
{tcadpOptionsShown && (
<>
-
+
+ {!flattenMediaToText && (
+
+ )}
>
);
}
diff --git a/web/src/pages/agent/form/parser-form/word-form-fields.tsx b/web/src/pages/agent/form/parser-form/word-form-fields.tsx
index 1db5783b6f..a2808d7ba0 100644
--- a/web/src/pages/agent/form/parser-form/word-form-fields.tsx
+++ b/web/src/pages/agent/form/parser-form/word-form-fields.tsx
@@ -1,24 +1,32 @@
import { LlmModelType } from '@/constants/knowledge';
import { useComposeLlmOptionsByModelTypes } from '@/hooks/use-llm-request';
+import { useWatch } from 'react-hook-form';
import {
+ FlattenMediaToTextFormField,
LargeModelFormField,
OutputFormatFormFieldProps,
RmdirFormField,
} from './common-form-fields';
+import { buildFieldNameWithPrefix } from './utils';
export function WordFormFields({ prefix }: OutputFormatFormFieldProps) {
const modelOptions = useComposeLlmOptionsByModelTypes([
LlmModelType.Image2text,
]);
+ const flattenMediaToText = useWatch({ // Watch this setup's `flatten_media_to_text` form value so the component can react to it.
+ name: buildFieldNameWithPrefix('flatten_media_to_text', prefix),
+ });
return (
<>
- {/* Multimodal Model */}
-
+
+ {!flattenMediaToText && ( // Rendered only while the flag is falsy; the wrapped element is not visible in this view (presumably the model selector — confirm).
+
+ )}
>
);
}
diff --git a/web/src/pages/agent/utils.ts b/web/src/pages/agent/utils.ts
index 94d8ab9cdf..5b21780741 100644
--- a/web/src/pages/agent/utils.ts
+++ b/web/src/pages/agent/utils.ts
@@ -228,6 +228,7 @@ function transformParserParams(params: ParserFormSchemaType) {
parse_method: cur.parse_method,
lang: cur.lang,
vlm: { llm_id: cur.vlm?.llm_id },
+ flatten_media_to_text: cur.flatten_media_to_text,
enable_multi_column: cur.enable_multi_column,
remove_toc: cur.remove_toc,
};
@@ -243,6 +244,7 @@ function transformParserParams(params: ParserFormSchemaType) {
...filteredSetup,
parse_method: cur.parse_method,
vlm: { llm_id: cur.vlm?.llm_id },
+ flatten_media_to_text: cur.flatten_media_to_text,
};
// Only include TCADP parameters if TCADP Parser is selected
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
@@ -277,10 +279,16 @@ function transformParserParams(params: ParserFormSchemaType) {
fields: cur.fields,
};
break;
- case FileType.Video:
case FileType.Docx:
- case FileType.Audio:
case FileType.TextMarkdown:
+ filteredSetup = {
+ ...filteredSetup,
+ vlm: { llm_id: cur.vlm?.llm_id },
+ flatten_media_to_text: cur.flatten_media_to_text,
+ };
+ break;
+ case FileType.Video:
+ case FileType.Audio:
filteredSetup = {
...filteredSetup,
vlm: { llm_id: cur.vlm?.llm_id },