Feat: pipeline supports ONE chunking method (#14024)

### What problem does this PR solve?

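Feat: the pipeline's Token Chunker supports a new ONE chunking method (`delimiter_mode: "one"`). In this mode, all parsed sections are merged in order into a single chunk instead of being split by token size or by delimiters.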

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
This commit is contained in:
Magicbook1108
2026-04-10 13:11:22 +08:00
committed by GitHub
parent a37605cbd2
commit 87a87a7122
7 changed files with 102 additions and 63 deletions

View File

@@ -32,6 +32,7 @@ from rag.nlp import naive_merge
class TokenChunkerParam(ProcessParamBase):
def __init__(self):
super().__init__()
self.delimiter_mode = "token_size"
self.chunk_token_size = 512
self.delimiters = ["\n"]
self.overlapped_percent = 0
@@ -40,6 +41,7 @@ class TokenChunkerParam(ProcessParamBase):
self.image_context_size = 0
def check(self):
self.check_valid_value(self.delimiter_mode, "Delimiter mode abnormal.", ["token_size", "delimiter", "one"])
if self.delimiters is None:
self.delimiters = []
elif isinstance(self.delimiters, str):
@@ -310,6 +312,10 @@ class TokenChunker(ProcessBase):
overlapped_percent = normalize_overlapped_percent(self._param.overlapped_percent)
if from_upstream.output_format in ["markdown", "text", "html"]:
payload = getattr(from_upstream, f"{from_upstream.output_format}_result") or ""
if self._param.delimiter_mode == "one":
self.set_output("chunks", [{"text": payload}] if payload.strip() else [])
self.callback(1, "Done.")
return
cks = _split_text_by_pattern(payload, delimiter_pattern) if delimiter_pattern else naive_merge(
payload,
self._param.chunk_token_size,
@@ -334,6 +340,18 @@ class TokenChunker(ProcessBase):
# json
json_result = from_upstream.json_result or []
if self._param.delimiter_mode == "one":
sections = []
for item in json_result:
text = item.get("text")
if not isinstance(text, str):
text = item.get("content_with_weight")
if isinstance(text, str) and text.strip():
sections.append(text)
merged_text = "\n".join(sections)
self.set_output("chunks", [{"text": merged_text}] if merged_text.strip() else [])
self.callback(1, "Done.")
return
# Structured JSON input is normalized first, then optionally enriched with
# media context, and finally merged only when delimiter splitting is inactive.
chunks = _build_json_chunks(json_result, delimiter_pattern)

View File

@@ -969,10 +969,6 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a text or code file.")
conf = self._param.setups["text&code"]
self.set_output("output_format", conf["output_format"])
print("\n\n")
print(conf.get("output_format"))
print("\n\n")
sections = TxtParser()(
name,
@@ -984,10 +980,6 @@ class Parser(ProcessBase):
self.set_output("json", [{"text": section[0], "doc_type_kwd": "text"} for section in sections if section[0]])
return
print("\n", "-"*150, "\n")
print(sections)
print("\n", "-"*150, "\n")
self.set_output("text", "\n".join([section[0] for section in sections if section[0]]))
def _html(self, name, blob, **kwargs):

View File

@@ -1554,6 +1554,10 @@ Example: Virtual Hosted Style`,
tab: 'Tab',
space: 'Space',
delimiters: 'Delimiters',
one: 'One',
oneChunkTitle: 'Note',
oneChunkDescription:
'All parsed sections will be merged in order into a single chunk.',
enableChildrenDelimiters: 'Child chunk are used for retrieval',
merge: 'Merge',
split: 'Split',

View File

@@ -1303,6 +1303,10 @@ General实体和关系提取提示来自 GitHub - microsoft/graphrag基于
tab: '制表符',
space: '空格',
delimiters: '分隔符',
one: 'One',
oneChunkTitle: 'Note',
oneChunkDescription:
'所有解析后的 sections 会按原始顺序合并为 1 个 chunk。',
merge: '合并',
split: '拆分',
script: '脚本',

View File

@@ -255,6 +255,7 @@ export const initialTokenChunkerValues = {
outputs: {
chunks: { type: 'Array<Object>', value: [] },
},
delimiter_mode: 'token_size',
chunk_token_size: 512,
overlapped_percent: 0,
delimiters: [{ value: '\n' }],

View File

@@ -6,7 +6,7 @@ import { BlockButton, Button } from '@/components/ui/button';
import { Form, FormControl, FormField, FormItem } from '@/components/ui/form';
import { Switch } from '@/components/ui/switch';
import { zodResolver } from '@hookform/resolvers/zod';
import { Trash2 } from 'lucide-react';
import { Info, Trash2 } from 'lucide-react';
import { memo } from 'react';
import { useFieldArray, useForm } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
@@ -36,7 +36,7 @@ export const FormSchema = z.object({
}),
),
overlapped_percent: z.number(),
delimiter_mode: z.enum(['token_size', 'delimiter']).optional(),
delimiter_mode: z.enum(['token_size', 'delimiter', 'one']).optional(),
});
export type TokenChunkerFormSchemaType = z.infer<typeof FormSchema>;
@@ -50,7 +50,7 @@ const TokenChunkerForm = ({ node }: INextOperatorForm) => {
delimiter_mode: defaultValues.delimiter_mode || 'token_size',
};
const form = useForm<SplitterFormSchemaType>({
const form = useForm<TokenChunkerFormSchemaType>({
defaultValues: formDefaultValues,
resolver: zodResolver(FormSchema),
});
@@ -81,6 +81,7 @@ const TokenChunkerForm = ({ node }: INextOperatorForm) => {
options: [
{ label: 'Token Size', value: 'token_size' },
{ label: t('flow.delimiters'), value: 'delimiter' },
{ label: t('flow.one'), value: 'one' },
],
}}
/>
@@ -141,58 +142,74 @@ const TokenChunkerForm = ({ node }: INextOperatorForm) => {
</>
)}
<fieldset>
<div className="mb-2 flex justify-between items-center gap-1">
<span>{t('flow.enableChildrenDelimiters')}</span>
<FormField
control={form.control}
name="enable_children"
render={({ field: { value, onChange, ...restProps } }) => (
<FormItem>
<FormControl>
<Switch
checked={value}
onCheckedChange={onChange}
{...restProps}
/>
</FormControl>
</FormItem>
)}
/>
</div>
{form.getValues('enable_children') && (
<div className="space-y-4">
{childrenDelimiters.fields.map((field, index) => (
<div key={field.id} className="flex items-center gap-2">
<RAGFlowFormItem
name={`children_delimiters.${index}.value`}
label="children_delimiter"
labelClassName="!hidden"
className="flex-auto space-y-0"
>
<DelimiterInput className="!m-0"></DelimiterInput>
</RAGFlowFormItem>
<Button
type="button"
variant="ghost"
onClick={() => childrenDelimiters.remove(index)}
>
<Trash2 />
</Button>
</div>
))}
<BlockButton
onClick={() => childrenDelimiters.append({ value: '\n' })}
>
{t('common.add')}
</BlockButton>
{delimiterMode === 'one' && (
<div className="flex items-start gap-3 rounded-lg border bg-muted/50 px-4 py-3">
<Info className="mt-0.5 h-4 w-4 shrink-0 text-foreground/70" />
<div className="space-y-1">
<div className="text-sm font-medium">
{t('flow.oneChunkTitle')}
</div>
<p className="text-sm text-muted-foreground">
{t('flow.oneChunkDescription')}
</p>
</div>
)}
</fieldset>
</div>
)}
{delimiterMode !== 'one' && (
<fieldset>
<div className="mb-2 flex justify-between items-center gap-1">
<span>{t('flow.enableChildrenDelimiters')}</span>
<FormField
control={form.control}
name="enable_children"
render={({ field: { value, onChange, ...restProps } }) => (
<FormItem>
<FormControl>
<Switch
checked={value}
onCheckedChange={onChange}
{...restProps}
/>
</FormControl>
</FormItem>
)}
/>
</div>
{form.getValues('enable_children') && (
<div className="space-y-4">
{childrenDelimiters.fields.map((field, index) => (
<div key={field.id} className="flex items-center gap-2">
<RAGFlowFormItem
name={`children_delimiters.${index}.value`}
label="children_delimiter"
labelClassName="!hidden"
className="flex-auto space-y-0"
>
<DelimiterInput className="!m-0"></DelimiterInput>
</RAGFlowFormItem>
<Button
type="button"
variant="ghost"
onClick={() => childrenDelimiters.remove(index)}
>
<Trash2 />
</Button>
</div>
))}
<BlockButton
onClick={() => childrenDelimiters.append({ value: '\n' })}
>
{t('common.add')}
</BlockButton>
</div>
)}
</fieldset>
)}
</FormWrapper>
<div className="p-5">
<Output list={outputList}></Output>

View File

@@ -303,7 +303,10 @@ function transformTokenChunkerParams(params: TokenChunkerFormSchemaType) {
const imageTableContextWindow = Number(image_table_context_window || 0);
return {
...rest,
overlapped_percent: Number(params.overlapped_percent) / 100,
overlapped_percent:
params.delimiter_mode === 'one'
? 0
: Number(params.overlapped_percent) / 100,
delimiters:
params.delimiter_mode === 'delimiter'
? transformObjectArrayToPureArray(params.delimiters, 'value')