From 87a87a7122cf63f818d9cf95e26fe4145e6a9d84 Mon Sep 17 00:00:00 2001 From: Magicbook1108 Date: Fri, 10 Apr 2026 13:11:22 +0800 Subject: [PATCH] Feat: pipeline support ONE chunking method (#14024) ### What problem does this PR solve? Feat: pipeline support ONE chunking method ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: Yingfeng --- rag/flow/chunker/token_chunker.py | 18 +++ rag/flow/parser/parser.py | 8 -- web/src/locales/en.ts | 4 + web/src/locales/zh.ts | 4 + web/src/pages/agent/constant/pipeline.tsx | 1 + .../agent/form/token-chunker-form/index.tsx | 125 ++++++++++-------- web/src/pages/agent/utils.ts | 5 +- 7 files changed, 102 insertions(+), 63 deletions(-) diff --git a/rag/flow/chunker/token_chunker.py b/rag/flow/chunker/token_chunker.py index 15d7891d19..7df4b43005 100644 --- a/rag/flow/chunker/token_chunker.py +++ b/rag/flow/chunker/token_chunker.py @@ -32,6 +32,7 @@ from rag.nlp import naive_merge class TokenChunkerParam(ProcessParamBase): def __init__(self): super().__init__() + self.delimiter_mode = "token_size" self.chunk_token_size = 512 self.delimiters = ["\n"] self.overlapped_percent = 0 @@ -40,6 +41,7 @@ class TokenChunkerParam(ProcessParamBase): self.image_context_size = 0 def check(self): + self.check_valid_value(self.delimiter_mode, "Delimiter mode abnormal.", ["token_size", "delimiter", "one"]) if self.delimiters is None: self.delimiters = [] elif isinstance(self.delimiters, str): @@ -310,6 +312,10 @@ class TokenChunker(ProcessBase): overlapped_percent = normalize_overlapped_percent(self._param.overlapped_percent) if from_upstream.output_format in ["markdown", "text", "html"]: payload = getattr(from_upstream, f"{from_upstream.output_format}_result") or "" + if self._param.delimiter_mode == "one": + self.set_output("chunks", [{"text": payload}] if payload.strip() else []) + self.callback(1, "Done.") + return cks = _split_text_by_pattern(payload, delimiter_pattern) if delimiter_pattern else naive_merge( payload, self._param.chunk_token_size, @@ -334,6 +340,18 @@ class TokenChunker(ProcessBase): # json json_result = from_upstream.json_result or [] + if self._param.delimiter_mode == "one": + sections = [] + for item in json_result: + text = item.get("text") + if not isinstance(text, str): + text = item.get("content_with_weight") + if isinstance(text, str) and text.strip(): + sections.append(text) + merged_text = "\n".join(sections) + self.set_output("chunks", [{"text": merged_text}] if merged_text.strip() else []) + self.callback(1, "Done.") + return # Structured JSON input is normalized first, then optionally enriched with # media context, and finally merged only when delimiter splitting is inactive. chunks = _build_json_chunks(json_result, delimiter_pattern) diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index e9c06cb879..6f2f26d041 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -969,10 +969,6 @@ class Parser(ProcessBase): self.callback(random.randint(1, 5) / 100.0, "Start to work on a text or code file.") conf = self._param.setups["text&code"] self.set_output("output_format", conf["output_format"]) - - print("\n\n") - print(conf.get("output_format")) - print("\n\n") sections = TxtParser()( name, @@ -984,10 +980,6 @@ class Parser(ProcessBase): self.set_output("json", [{"text": section[0], "doc_type_kwd": "text"} for section in sections if section[0]]) return - print("\n", "-"*150, "\n") - print(sections) - print("\n", "-"*150, "\n") - self.set_output("text", "\n".join([section[0] for section in sections if section[0]])) def _html(self, name, blob, **kwargs): diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index b72a096c48..ef1f687f86 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -1554,6 +1554,10 @@ Example: Virtual Hosted Style`, tab: 'Tab', space: 'Space', delimiters: 'Delimiters', + one: 'One', + oneChunkTitle: 'Note', + oneChunkDescription: + 'All parsed sections will be merged in order into a single chunk.', enableChildrenDelimiters: 'Child chunk are used for retrieval', merge: 'Merge', split: 'Split', diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index 1eb2d30cc2..5e749d80af 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -1303,6 +1303,10 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于 tab: '制表符', space: '空格', delimiters: '分隔符', + one: 'One', + oneChunkTitle: 'Note', + oneChunkDescription: + '所有解析后的 sections 会按原始顺序合并为 1 个 chunk。', merge: '合并', split: '拆分', script: '脚本', diff --git a/web/src/pages/agent/constant/pipeline.tsx b/web/src/pages/agent/constant/pipeline.tsx index 7578da3965..757c69c4b2 100644 --- a/web/src/pages/agent/constant/pipeline.tsx +++ b/web/src/pages/agent/constant/pipeline.tsx @@ -255,6 +255,7 @@ export const initialTokenChunkerValues = { outputs: { chunks: { type: 'Array', value: [] }, }, + delimiter_mode: 'token_size', chunk_token_size: 512, overlapped_percent: 0, delimiters: [{ value: '\n' }], diff --git a/web/src/pages/agent/form/token-chunker-form/index.tsx b/web/src/pages/agent/form/token-chunker-form/index.tsx index 14b4fd567b..1e0ceaf010 100644 --- a/web/src/pages/agent/form/token-chunker-form/index.tsx +++ b/web/src/pages/agent/form/token-chunker-form/index.tsx @@ -6,7 +6,7 @@ import { BlockButton, Button } from '@/components/ui/button'; import { Form, FormControl, FormField, FormItem } from '@/components/ui/form'; import { Switch } from '@/components/ui/switch'; import { zodResolver } from '@hookform/resolvers/zod'; -import { Trash2 } from 'lucide-react'; +import { Info, Trash2 } from 'lucide-react'; import { memo } from 'react'; import { useFieldArray, useForm } from 'react-hook-form'; import { useTranslation } from 'react-i18next'; @@ -36,7 +36,7 @@ export const FormSchema = z.object({ }), ), overlapped_percent: z.number(), - delimiter_mode: z.enum(['token_size', 'delimiter']).optional(), + delimiter_mode: z.enum(['token_size', 'delimiter', 'one']).optional(), }); export type TokenChunkerFormSchemaType = z.infer; @@ -50,7 +50,7 @@ const TokenChunkerForm = ({ node }: INextOperatorForm) => { delimiter_mode: defaultValues.delimiter_mode || 'token_size', }; - const form = useForm({ + const form = useForm({ defaultValues: formDefaultValues, resolver: zodResolver(FormSchema), }); @@ -81,6 +81,7 @@ const TokenChunkerForm = ({ node }: INextOperatorForm) => { options: [ { label: 'Token Size', value: 'token_size' }, { label: t('flow.delimiters'), value: 'delimiter' }, + { label: t('flow.one'), value: 'one' }, ], }} /> @@ -141,58 +142,74 @@ const TokenChunkerForm = ({ node }: INextOperatorForm) => { )} -
-
- {t('flow.enableChildrenDelimiters')} - - ( - - - - - - )} - /> -
- - {form.getValues('enable_children') && ( -
- {childrenDelimiters.fields.map((field, index) => ( -
- - - - - -
- ))} - - childrenDelimiters.append({ value: '\n' })} - > - {t('common.add')} - + {delimiterMode === 'one' && ( +
+ +
+
+ {t('flow.oneChunkTitle')} +
+

+ {t('flow.oneChunkDescription')} +

- )} -
+ + )} + + {delimiterMode !== 'one' && ( +
+
+ {t('flow.enableChildrenDelimiters')} + + ( + + + + + + )} + /> +
+ + {form.getValues('enable_children') && ( +
+ {childrenDelimiters.fields.map((field, index) => ( +
+ + + + + +
+ ))} + + childrenDelimiters.append({ value: '\n' })} + > + {t('common.add')} + +
+ )} +
+ )}
diff --git a/web/src/pages/agent/utils.ts b/web/src/pages/agent/utils.ts index d9586af8c3..94d8ab9cdf 100644 --- a/web/src/pages/agent/utils.ts +++ b/web/src/pages/agent/utils.ts @@ -303,7 +303,10 @@ function transformTokenChunkerParams(params: TokenChunkerFormSchemaType) { const imageTableContextWindow = Number(image_table_context_window || 0); return { ...rest, - overlapped_percent: Number(params.overlapped_percent) / 100, + overlapped_percent: + params.delimiter_mode === 'one' + ? 0 + : Number(params.overlapped_percent) / 100, delimiters: params.delimiter_mode === 'delimiter' ? transformObjectArrayToPureArray(params.delimiters, 'value')