mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-05 10:58:34 +08:00
Feat: pipeline support ONE chunking method (#14024)
### What problem does this PR solve?

Feat: pipeline support ONE chunking method

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
This commit is contained in:
@@ -32,6 +32,7 @@ from rag.nlp import naive_merge
|
||||
class TokenChunkerParam(ProcessParamBase):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.delimiter_mode = "token_size"
|
||||
self.chunk_token_size = 512
|
||||
self.delimiters = ["\n"]
|
||||
self.overlapped_percent = 0
|
||||
@@ -40,6 +41,7 @@ class TokenChunkerParam(ProcessParamBase):
|
||||
self.image_context_size = 0
|
||||
|
||||
def check(self):
|
||||
self.check_valid_value(self.delimiter_mode, "Delimiter mode abnormal.", ["token_size", "delimiter", "one"])
|
||||
if self.delimiters is None:
|
||||
self.delimiters = []
|
||||
elif isinstance(self.delimiters, str):
|
||||
@@ -310,6 +312,10 @@ class TokenChunker(ProcessBase):
|
||||
overlapped_percent = normalize_overlapped_percent(self._param.overlapped_percent)
|
||||
if from_upstream.output_format in ["markdown", "text", "html"]:
|
||||
payload = getattr(from_upstream, f"{from_upstream.output_format}_result") or ""
|
||||
if self._param.delimiter_mode == "one":
|
||||
self.set_output("chunks", [{"text": payload}] if payload.strip() else [])
|
||||
self.callback(1, "Done.")
|
||||
return
|
||||
cks = _split_text_by_pattern(payload, delimiter_pattern) if delimiter_pattern else naive_merge(
|
||||
payload,
|
||||
self._param.chunk_token_size,
|
||||
@@ -334,6 +340,18 @@ class TokenChunker(ProcessBase):
|
||||
|
||||
# json
|
||||
json_result = from_upstream.json_result or []
|
||||
if self._param.delimiter_mode == "one":
|
||||
sections = []
|
||||
for item in json_result:
|
||||
text = item.get("text")
|
||||
if not isinstance(text, str):
|
||||
text = item.get("content_with_weight")
|
||||
if isinstance(text, str) and text.strip():
|
||||
sections.append(text)
|
||||
merged_text = "\n".join(sections)
|
||||
self.set_output("chunks", [{"text": merged_text}] if merged_text.strip() else [])
|
||||
self.callback(1, "Done.")
|
||||
return
|
||||
# Structured JSON input is normalized first, then optionally enriched with
|
||||
# media context, and finally merged only when delimiter splitting is inactive.
|
||||
chunks = _build_json_chunks(json_result, delimiter_pattern)
|
||||
|
||||
@@ -969,10 +969,6 @@ class Parser(ProcessBase):
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a text or code file.")
|
||||
conf = self._param.setups["text&code"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
print("\n\n")
|
||||
print(conf.get("output_format"))
|
||||
print("\n\n")
|
||||
|
||||
sections = TxtParser()(
|
||||
name,
|
||||
@@ -984,10 +980,6 @@ class Parser(ProcessBase):
|
||||
self.set_output("json", [{"text": section[0], "doc_type_kwd": "text"} for section in sections if section[0]])
|
||||
return
|
||||
|
||||
print("\n", "-"*150, "\n")
|
||||
print(sections)
|
||||
print("\n", "-"*150, "\n")
|
||||
|
||||
self.set_output("text", "\n".join([section[0] for section in sections if section[0]]))
|
||||
|
||||
def _html(self, name, blob, **kwargs):
|
||||
|
||||
@@ -1554,6 +1554,10 @@ Example: Virtual Hosted Style`,
|
||||
tab: 'Tab',
|
||||
space: 'Space',
|
||||
delimiters: 'Delimiters',
|
||||
one: 'One',
|
||||
oneChunkTitle: 'Note',
|
||||
oneChunkDescription:
|
||||
'All parsed sections will be merged in order into a single chunk.',
|
||||
enableChildrenDelimiters: 'Child chunk are used for retrieval',
|
||||
merge: 'Merge',
|
||||
split: 'Split',
|
||||
|
||||
@@ -1303,6 +1303,10 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于
|
||||
tab: '制表符',
|
||||
space: '空格',
|
||||
delimiters: '分隔符',
|
||||
one: 'One',
|
||||
oneChunkTitle: 'Note',
|
||||
oneChunkDescription:
|
||||
'所有解析后的 sections 会按原始顺序合并为 1 个 chunk。',
|
||||
merge: '合并',
|
||||
split: '拆分',
|
||||
script: '脚本',
|
||||
|
||||
@@ -255,6 +255,7 @@ export const initialTokenChunkerValues = {
|
||||
outputs: {
|
||||
chunks: { type: 'Array<Object>', value: [] },
|
||||
},
|
||||
delimiter_mode: 'token_size',
|
||||
chunk_token_size: 512,
|
||||
overlapped_percent: 0,
|
||||
delimiters: [{ value: '\n' }],
|
||||
|
||||
@@ -6,7 +6,7 @@ import { BlockButton, Button } from '@/components/ui/button';
|
||||
import { Form, FormControl, FormField, FormItem } from '@/components/ui/form';
|
||||
import { Switch } from '@/components/ui/switch';
|
||||
import { zodResolver } from '@hookform/resolvers/zod';
|
||||
import { Trash2 } from 'lucide-react';
|
||||
import { Info, Trash2 } from 'lucide-react';
|
||||
import { memo } from 'react';
|
||||
import { useFieldArray, useForm } from 'react-hook-form';
|
||||
import { useTranslation } from 'react-i18next';
|
||||
@@ -36,7 +36,7 @@ export const FormSchema = z.object({
|
||||
}),
|
||||
),
|
||||
overlapped_percent: z.number(),
|
||||
delimiter_mode: z.enum(['token_size', 'delimiter']).optional(),
|
||||
delimiter_mode: z.enum(['token_size', 'delimiter', 'one']).optional(),
|
||||
});
|
||||
|
||||
export type TokenChunkerFormSchemaType = z.infer<typeof FormSchema>;
|
||||
@@ -50,7 +50,7 @@ const TokenChunkerForm = ({ node }: INextOperatorForm) => {
|
||||
delimiter_mode: defaultValues.delimiter_mode || 'token_size',
|
||||
};
|
||||
|
||||
const form = useForm<SplitterFormSchemaType>({
|
||||
const form = useForm<TokenChunkerFormSchemaType>({
|
||||
defaultValues: formDefaultValues,
|
||||
resolver: zodResolver(FormSchema),
|
||||
});
|
||||
@@ -81,6 +81,7 @@ const TokenChunkerForm = ({ node }: INextOperatorForm) => {
|
||||
options: [
|
||||
{ label: 'Token Size', value: 'token_size' },
|
||||
{ label: t('flow.delimiters'), value: 'delimiter' },
|
||||
{ label: t('flow.one'), value: 'one' },
|
||||
],
|
||||
}}
|
||||
/>
|
||||
@@ -141,58 +142,74 @@ const TokenChunkerForm = ({ node }: INextOperatorForm) => {
|
||||
</>
|
||||
)}
|
||||
|
||||
<fieldset>
|
||||
<div className="mb-2 flex justify-between items-center gap-1">
|
||||
<span>{t('flow.enableChildrenDelimiters')}</span>
|
||||
|
||||
<FormField
|
||||
control={form.control}
|
||||
name="enable_children"
|
||||
render={({ field: { value, onChange, ...restProps } }) => (
|
||||
<FormItem>
|
||||
<FormControl>
|
||||
<Switch
|
||||
checked={value}
|
||||
onCheckedChange={onChange}
|
||||
{...restProps}
|
||||
/>
|
||||
</FormControl>
|
||||
</FormItem>
|
||||
)}
|
||||
/>
|
||||
</div>
|
||||
|
||||
{form.getValues('enable_children') && (
|
||||
<div className="space-y-4">
|
||||
{childrenDelimiters.fields.map((field, index) => (
|
||||
<div key={field.id} className="flex items-center gap-2">
|
||||
<RAGFlowFormItem
|
||||
name={`children_delimiters.${index}.value`}
|
||||
label="children_delimiter"
|
||||
labelClassName="!hidden"
|
||||
className="flex-auto space-y-0"
|
||||
>
|
||||
<DelimiterInput className="!m-0"></DelimiterInput>
|
||||
</RAGFlowFormItem>
|
||||
|
||||
<Button
|
||||
type="button"
|
||||
variant="ghost"
|
||||
onClick={() => childrenDelimiters.remove(index)}
|
||||
>
|
||||
<Trash2 />
|
||||
</Button>
|
||||
</div>
|
||||
))}
|
||||
|
||||
<BlockButton
|
||||
onClick={() => childrenDelimiters.append({ value: '\n' })}
|
||||
>
|
||||
{t('common.add')}
|
||||
</BlockButton>
|
||||
{delimiterMode === 'one' && (
|
||||
<div className="flex items-start gap-3 rounded-lg border bg-muted/50 px-4 py-3">
|
||||
<Info className="mt-0.5 h-4 w-4 shrink-0 text-foreground/70" />
|
||||
<div className="space-y-1">
|
||||
<div className="text-sm font-medium">
|
||||
{t('flow.oneChunkTitle')}
|
||||
</div>
|
||||
<p className="text-sm text-muted-foreground">
|
||||
{t('flow.oneChunkDescription')}
|
||||
</p>
|
||||
</div>
|
||||
)}
|
||||
</fieldset>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{delimiterMode !== 'one' && (
|
||||
<fieldset>
|
||||
<div className="mb-2 flex justify-between items-center gap-1">
|
||||
<span>{t('flow.enableChildrenDelimiters')}</span>
|
||||
|
||||
<FormField
|
||||
control={form.control}
|
||||
name="enable_children"
|
||||
render={({ field: { value, onChange, ...restProps } }) => (
|
||||
<FormItem>
|
||||
<FormControl>
|
||||
<Switch
|
||||
checked={value}
|
||||
onCheckedChange={onChange}
|
||||
{...restProps}
|
||||
/>
|
||||
</FormControl>
|
||||
</FormItem>
|
||||
)}
|
||||
/>
|
||||
</div>
|
||||
|
||||
{form.getValues('enable_children') && (
|
||||
<div className="space-y-4">
|
||||
{childrenDelimiters.fields.map((field, index) => (
|
||||
<div key={field.id} className="flex items-center gap-2">
|
||||
<RAGFlowFormItem
|
||||
name={`children_delimiters.${index}.value`}
|
||||
label="children_delimiter"
|
||||
labelClassName="!hidden"
|
||||
className="flex-auto space-y-0"
|
||||
>
|
||||
<DelimiterInput className="!m-0"></DelimiterInput>
|
||||
</RAGFlowFormItem>
|
||||
|
||||
<Button
|
||||
type="button"
|
||||
variant="ghost"
|
||||
onClick={() => childrenDelimiters.remove(index)}
|
||||
>
|
||||
<Trash2 />
|
||||
</Button>
|
||||
</div>
|
||||
))}
|
||||
|
||||
<BlockButton
|
||||
onClick={() => childrenDelimiters.append({ value: '\n' })}
|
||||
>
|
||||
{t('common.add')}
|
||||
</BlockButton>
|
||||
</div>
|
||||
)}
|
||||
</fieldset>
|
||||
)}
|
||||
</FormWrapper>
|
||||
<div className="p-5">
|
||||
<Output list={outputList}></Output>
|
||||
|
||||
@@ -303,7 +303,10 @@ function transformTokenChunkerParams(params: TokenChunkerFormSchemaType) {
|
||||
const imageTableContextWindow = Number(image_table_context_window || 0);
|
||||
return {
|
||||
...rest,
|
||||
overlapped_percent: Number(params.overlapped_percent) / 100,
|
||||
overlapped_percent:
|
||||
params.delimiter_mode === 'one'
|
||||
? 0
|
||||
: Number(params.overlapped_percent) / 100,
|
||||
delimiters:
|
||||
params.delimiter_mode === 'delimiter'
|
||||
? transformObjectArrayToPureArray(params.delimiters, 'value')
|
||||
|
||||
Reference in New Issue
Block a user