From 75a5548b85e7c1d27fc42f76aabbe1a26c646cdd Mon Sep 17 00:00:00 2001 From: Magicbook1108 Date: Thu, 23 Apr 2026 18:55:55 +0800 Subject: [PATCH] Feat: optimize title chunk (#14325) ### What problem does this PR solve? Feat: optimize title chunk 1. Add a new option, "Use root as H0 heading", so that the text of the first (root) chunk is prepended to every remaining chunk and the root chunk itself is no longer emitted separately. 2. Update the resume agent template and rename the ingestion pipeline templates to lowercase file names. ### Type of change - [x] New Feature (non-breaking change which adds functionality) img_v3_02111_63b04951-b3d7-4001-a08b-539db6d5298g image --- ...Book.json => ingestion_pipeline_book.json} | 0 ...l.json => ingestion_pipeline_general.json} | 0 ...Laws.json => ingestion_pipeline_laws.json} | 0 ...al.json => ingestion_pipeline_manual.json} | 0 ...e_One.json => ingestion_pipeline_one.json} | 0 ...per.json => ingestion_pipeline_paper.json} | 0 ...me.json => ingestion_pipeline_resume.json} | 49 +++++++------- rag/flow/chunker/title_chunker/common.py | 16 ++++- web/src/locales/en.ts | 3 + web/src/pages/agent/constant/pipeline.tsx | 2 + .../agent/form/title-chunker-form/hook.ts | 1 + .../agent/form/title-chunker-form/index.tsx | 65 ++++++++++++++----- web/src/pages/agent/utils.ts | 1 + 13 files changed, 96 insertions(+), 41 deletions(-) rename agent/templates/{ingestion_pipeline_Book.json => ingestion_pipeline_book.json} (100%) rename agent/templates/{ingestion_pipeline_General.json => ingestion_pipeline_general.json} (100%) rename agent/templates/{ingestion_pipeline_Laws.json => ingestion_pipeline_laws.json} (100%) rename agent/templates/{ingestion_pipeline_Manual.json => ingestion_pipeline_manual.json} (100%) rename agent/templates/{ingestion_pipeline_One.json => ingestion_pipeline_one.json} (100%) rename agent/templates/{ingestion_pipeline_Paper.json => ingestion_pipeline_paper.json} (100%) rename agent/templates/{ingestion_pipeline_Resume.json => ingestion_pipeline_resume.json} (98%) diff --git a/agent/templates/ingestion_pipeline_Book.json 
b/agent/templates/ingestion_pipeline_book.json similarity index 100% rename from agent/templates/ingestion_pipeline_Book.json rename to agent/templates/ingestion_pipeline_book.json diff --git a/agent/templates/ingestion_pipeline_General.json b/agent/templates/ingestion_pipeline_general.json similarity index 100% rename from agent/templates/ingestion_pipeline_General.json rename to agent/templates/ingestion_pipeline_general.json diff --git a/agent/templates/ingestion_pipeline_Laws.json b/agent/templates/ingestion_pipeline_laws.json similarity index 100% rename from agent/templates/ingestion_pipeline_Laws.json rename to agent/templates/ingestion_pipeline_laws.json diff --git a/agent/templates/ingestion_pipeline_Manual.json b/agent/templates/ingestion_pipeline_manual.json similarity index 100% rename from agent/templates/ingestion_pipeline_Manual.json rename to agent/templates/ingestion_pipeline_manual.json diff --git a/agent/templates/ingestion_pipeline_One.json b/agent/templates/ingestion_pipeline_one.json similarity index 100% rename from agent/templates/ingestion_pipeline_One.json rename to agent/templates/ingestion_pipeline_one.json diff --git a/agent/templates/ingestion_pipeline_Paper.json b/agent/templates/ingestion_pipeline_paper.json similarity index 100% rename from agent/templates/ingestion_pipeline_Paper.json rename to agent/templates/ingestion_pipeline_paper.json diff --git a/agent/templates/ingestion_pipeline_Resume.json b/agent/templates/ingestion_pipeline_resume.json similarity index 98% rename from agent/templates/ingestion_pipeline_Resume.json rename to agent/templates/ingestion_pipeline_resume.json index 7b8d989957..cb35eb2043 100644 --- a/agent/templates/ingestion_pipeline_Resume.json +++ b/agent/templates/ingestion_pipeline_resume.json @@ -242,13 +242,14 @@ "include_heading_content": false, "levels": [ [ - 
"^\\s*(?i:(?:\\d+[\\.\\)]\\s*)?(?:EDUCATION|ACADEMIC\\s*BACKGROUND|ACADEMIC\\s*HISTORY|EDUCATIONAL\\s*BACKGROUND|RELEVANT\\s*COURSEWORK|COURSEWORK|EXPERIENCE|WORK\\s*EXPERIENCE|PROFESSIONAL\\s*EXPERIENCE|RELEVANT\\s*EXPERIENCE|EMPLOYMENT\\s*HISTORY|CAREER\\s*HISTORY|INTERNSHIP\\s*EXPERIENCE|PROJECTS|PROJECT\\s*EXPERIENCE|ACADEMIC\\s*PROJECTS|PROFESSIONAL\\s*PROJECTS|SKILLS|TECHNICAL\\s*SKILLS|CORE\\s*COMPETENCIES|COMPETENCIES|QUALIFICATIONS|SUMMARY\\s*OF\\s*QUALIFICATIONS|CERTIFICATIONS|LICENSES|CERTIFICATES|AWARDS|HONORS|HONOURS|ACHIEVEMENTS|PUBLICATIONS|RESEARCH|RESEARCH\\s*EXPERIENCE|LEADERSHIP|LEADERSHIP\\s*EXPERIENCE|ACTIVITIES|EXTRACURRICULAR\\s*ACTIVITIES|ACTIVITIES\\s*(?:&|AND)\\s*SKILLS|INVOLVEMENT|CAMPUS\\s*INVOLVEMENT|VOLUNTEER\\s*EXPERIENCE|VOLUNTEERING|COMMUNITY\\s*SERVICE|LANGUAGES|INTERESTS|HOBBIES|PROFILE|PROFESSIONAL\\s*PROFILE|SUMMARY|PROFESSIONAL\\s*SUMMARY|CAREER\\s*SUMMARY|OBJECTIVE|CAREER\\s*OBJECTIVE|PERSONAL\\s*INFORMATION|CONTACT\\s*INFORMATION|ADDITIONAL\\s*INFORMATION|TRAINING))\\s*[:\uff1a]?\\s*$" + 
"^\\s*(?i:(?:\\d+[\\.\\)]\\s*)?(?:EDUCATION|ACADEMIC\\s*BACKGROUND|ACADEMIC\\s*HISTORY|EDUCATIONAL\\s*BACKGROUND|RELEVANT\\s*COURSEWORK|COURSEWORK|EXPERIENCE|WORK\\s*EXPERIENCE|PROFESSIONAL\\s*EXPERIENCE|RELEVANT\\s*EXPERIENCE|EMPLOYMENT\\s*HISTORY|CAREER\\s*HISTORY|INTERNSHIP\\s*EXPERIENCE|PROJECTS|PROJECT\\s*EXPERIENCE|ACADEMIC\\s*PROJECTS|PROFESSIONAL\\s*PROJECTS|SKILLS|TECHNICAL\\s*SKILLS|CORE\\s*COMPETENCIES|COMPETENCIES|QUALIFICATIONS|SUMMARY\\s*OF\\s*QUALIFICATIONS|CERTIFICATIONS|LICENSES|CERTIFICATES|AWARDS|HONORS|HONOURS|ACHIEVEMENTS|PUBLICATIONS|RESEARCH|RESEARCH\\s*EXPERIENCE|LEADERSHIP|LEADERSHIP\\s*EXPERIENCE|ACTIVITIES|EXTRACURRICULAR\\s*ACTIVITIES|ACTIVITIES\\s*(?:&|AND)\\s*SKILLS|INVOLVEMENT|CAMPUS\\s*INVOLVEMENT|VOLUNTEER\\s*EXPERIENCE|VOLUNTEERING|COMMUNITY\\s*SERVICE|LANGUAGES|INTERESTS|HOBBIES|PROFILE|PROFESSIONAL\\s*PROFILE|SUMMARY|PROFESSIONAL\\s*SUMMARY|CAREER\\s*SUMMARY|OBJECTIVE|CAREER\\s*OBJECTIVE|PERSONAL\\s*INFORMATION|CONTACT\\s*INFORMATION|ADDITIONAL\\s*INFORMATION|TRAINING))\\s*[:\uff1a]?\\s*$" ], [ 
"^\\s*(?:\\d+[\\.\u3001\\)]\\s*)?(?:\u6559\u80b2\u80cc\u666f|\u6559\u80b2\u7ecf\u5386|\u5b66\u5386\u80cc\u666f|\u5b66\u672f\u80cc\u666f|\u6280\u672f\u80cc\u666f|\u5de5\u4f5c\u7ecf\u5386|\u5de5\u4f5c\u7ecf\u9a8c|\u5b9e\u4e60\u7ecf\u5386|\u9879\u76ee\u7ecf\u5386|\u9879\u76ee\u7ecf\u9a8c|\u79d1\u7814\u7ecf\u5386|\u7814\u7a76\u7ecf\u5386|\u6821\u56ed\u7ecf\u5386|\u5b9e\u8df5\u7ecf\u5386|\u4e13\u4e1a\u7ecf\u5386|\u804c\u4e1a\u7ecf\u5386|\u6280\u80fd|\u4e13\u4e1a\u6280\u80fd|\u6280\u80fd\u7279\u957f|\u6838\u5fc3\u6280\u80fd|\u6280\u672f\u6808|\u4e2a\u4eba\u6280\u80fd|\u5de5\u4f5c\u6280\u80fd|\u804c\u4e1a\u6280\u80fd|\u6280\u80fd\u4e0e\u8bc4\u4ef7|\u6280\u80fd\u4e0e\u81ea\u6211\u8bc4\u4ef7|\u5de5\u4f5c\u6280\u80fd\u4e0e\u81ea\u6211\u8bc4\u4ef7|\u804c\u4e1a\u6280\u80fd\u4e0e\u81ea\u6211\u8bc4\u4ef7|\u8bc1\u4e66|\u8d44\u683c\u8bc1\u4e66|\u804c\u4e1a\u8d44\u683c|\u8d44\u8d28\u8bc1\u4e66|\u83b7\u5956\u60c5\u51b5|\u83b7\u5956\u7ecf\u5386|\u8363\u8a89|\u8363\u8a89\u5956\u9879|\u5956\u9879|\u79d1\u7814\u6210\u679c|\u8bba\u6587\u53d1\u8868|\u53d1\u8868\u8bba\u6587|\u9886\u5bfc\u7ecf\u5386|\u5b66\u751f\u5de5\u4f5c|\u6821\u56ed\u6d3b\u52a8|\u793e\u56e2\u7ecf\u5386|\u6d3b\u52a8\u7ecf\u5386|\u5fd7\u613f\u7ecf\u5386|\u5fd7\u613f\u670d\u52a1|\u793e\u4f1a\u5b9e\u8df5|\u8bed\u8a00\u80fd\u529b|\u8bed\u8a00|\u81ea\u6211\u8bc4\u4ef7|\u4e2a\u4eba\u8bc4\u4ef7|\u81ea\u6211\u603b\u7ed3|\u4e2a\u4eba\u603b\u7ed3|\u4e2a\u4eba\u4f18\u52bf|\u4e2a\u4eba\u7b80\u4ecb|\u4e2a\u4eba\u4fe1\u606f|\u57fa\u672c\u4fe1\u606f|\u8054\u7cfb\u65b9\u5f0f|\u6c42\u804c\u610f\u5411|\u5e94\u8058\u610f\u5411|\u804c\u4e1a\u76ee\u6807|\u6c42\u804c\u76ee\u6807|\u5174\u8da3\u7231\u597d|\u5174\u8da3\u7279\u957f|\u57f9\u8bad\u7ecf\u5386|\u5176\u4ed6\u4fe1\u606f|\u9644\u52a0\u4fe1\u606f)\\s*[:\uff1a]?\\s*$" ] ], - "method": "hierarchy" + "method": "hierarchy", + "root_chunk_as_heading": true } }, "upstream": [ @@ -299,16 +300,6 @@ "target": "TitleChunker:FlatMiceFix", "targetHandle": "end" }, - { - "data": { - "isHovered": 
false - }, - "id": "xy-edge__TitleChunker:FlatMiceFixstart-Extractor:ThreeDrinksActend", - "source": "TitleChunker:FlatMiceFix", - "sourceHandle": "start", - "target": "Extractor:ThreeDrinksAct", - "targetHandle": "end" - }, { "data": { "isHovered": false @@ -321,6 +312,19 @@ "targetHandle": "end", "type": "buttonEdge", "zIndex": 1001 + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__TitleChunker:FlatMiceFixstart-Extractor:ThreeDrinksActend", + "markerEnd": "logo", + "source": "TitleChunker:FlatMiceFix", + "sourceHandle": "start", + "target": "Extractor:ThreeDrinksAct", + "targetHandle": "end", + "type": "buttonEdge", + "zIndex": 1001 } ], "nodes": [ @@ -331,7 +335,7 @@ }, "id": "File", "measured": { - "height": 50, + "height": 49, "width": 200 }, "position": { @@ -460,7 +464,7 @@ "dragging": false, "id": "Parser:HipSignsRhyme", "measured": { - "height": 198, + "height": 197, "width": 200 }, "position": { @@ -489,12 +493,12 @@ "dragging": false, "id": "Tokenizer:KindHandsWin", "measured": { - "height": 114, + "height": 113, "width": 200 }, "position": { - "x": 876.4654525205967, - "y": 189.1906747329592 + "x": 883.0243372012395, + "y": 156.39625132974524 }, "selected": false, "sourcePosition": "right", @@ -514,6 +518,7 @@ } }, "promote_first_heading_to_root": false, + "root_chunk_as_heading": true, "rules": [ { "levels": [ @@ -537,14 +542,14 @@ "dragging": false, "id": "TitleChunker:FlatMiceFix", "measured": { - "height": 74, + "height": 73, "width": 200 }, "position": { "x": 572.7908769627791, "y": 141.55515313482098 }, - "selected": false, + "selected": true, "sourcePosition": "right", "targetPosition": "left", "type": "chunkerNode" @@ -580,12 +585,12 @@ "dragging": false, "id": "Extractor:ThreeDrinksAct", "measured": { - "height": 90, + "height": 89, "width": 200 }, "position": { - "x": 583.3659219536569, - "y": 274.7600100230409 + "x": 623.8123774842874, + "y": 236.49984938595793 }, "selected": false, "sourcePosition": "right", diff --git 
a/rag/flow/chunker/title_chunker/common.py b/rag/flow/chunker/title_chunker/common.py index 95a19fc3ed..89981a83de 100644 --- a/rag/flow/chunker/title_chunker/common.py +++ b/rag/flow/chunker/title_chunker/common.py @@ -41,6 +41,7 @@ class TitleChunkerParam(ProcessParamBase): self.levels = [] self.hierarchy = None self.include_heading_content = False + self.root_chunk_as_heading = False def check(self): if self.method in {"hierarchy", "group"}: @@ -240,13 +241,13 @@ class BaseTitleChunker(ABC): # chunk box is defined by merged source positions and the text payload # is normalized by removing parser tags. if self.from_upstream.output_format in ["markdown", "text", "html"]: - return [ + chunks = [ {"text": "".join(record["text"] + "\n" for record in records)} for records in record_groups if records ] - return [ + chunks = [ ( { "text": RAGFlowPdfParser.remove_tag("".join(record["text"] + "\n" for record in records)), @@ -264,6 +265,17 @@ class BaseTitleChunker(ABC): for records in record_groups if records ] + + if self.param.root_chunk_as_heading and len(chunks) > 1: + root_chunk = chunks[0] + root_text = root_chunk.get("text", "") + + for ck in chunks[1:]: + ck['text'] = root_text + "\n" + ck.get("text", "") + + return chunks[1:] + + return chunks async def set_chunks(self, chunks): diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index a2dea44bcd..5c0ff38c61 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -1513,6 +1513,9 @@ Example: Virtual Hosted Style`, includeHeadingContent: 'Include heading content', includeHeadingContentTip: 'When enabled, content directly under a heading is kept as its own chunk. Child chunks keep only the heading path.', + rootAsHeading: 'Use root as H0 heading', + rootAsHeadingTip: + 'Treat the root node as a H0 heading when building the hierarchy', hierarchyTip: `Build a heading tree and produce self-contained chunks, each carrying its full ancestor heading path (e.g. 
Part 1 › Chapter 3 › Section 2 + body text).\n Best for: Documents with independent, structurally significant sections — such as legal statutes, regulations, contracts, and technical specifications — where each chunk must be identifiable by its structural position even without surrounding context.`, groupTip: `Split the document flat at a chosen heading level and automatically merge adjacent small sections to preserve content continuity. No parent-heading path is injected.\n diff --git a/web/src/pages/agent/constant/pipeline.tsx b/web/src/pages/agent/constant/pipeline.tsx index 307dab82dc..8271838f18 100644 --- a/web/src/pages/agent/constant/pipeline.tsx +++ b/web/src/pages/agent/constant/pipeline.tsx @@ -333,6 +333,7 @@ export const initialTitleChunkerValues = { method: 'hierarchy', hierarchy: Hierarchy.H3, include_heading_content: false, + root_chunk_as_heading: false, rules: rules, }; @@ -340,6 +341,7 @@ export const initialGroupValues = { method: 'group', hierarchy: '0', include_heading_content: false, + root_chunk_as_heading: false, rules: rules, }; diff --git a/web/src/pages/agent/form/title-chunker-form/hook.ts b/web/src/pages/agent/form/title-chunker-form/hook.ts index fca7ce9093..481d425f46 100644 --- a/web/src/pages/agent/form/title-chunker-form/hook.ts +++ b/web/src/pages/agent/form/title-chunker-form/hook.ts @@ -128,6 +128,7 @@ function transformApiResponseToForm( method, hierarchy, include_heading_content: Boolean(apiData.include_heading_content), + root_chunk_as_heading: Boolean(apiData.root_chunk_as_heading), rules, }; } diff --git a/web/src/pages/agent/form/title-chunker-form/index.tsx b/web/src/pages/agent/form/title-chunker-form/index.tsx index b800c4f023..0f6723577d 100644 --- a/web/src/pages/agent/form/title-chunker-form/index.tsx +++ b/web/src/pages/agent/form/title-chunker-form/index.tsx @@ -29,6 +29,7 @@ import { transformApiResponseToForm, useDynamicHierarchyOptions } from './hook'; type FormModeValues = { hierarchy?: string; 
include_heading_content?: boolean; + root_chunk_as_heading?: boolean; rules: Array<{ levels: Array<{ expression: string }> }>; }; @@ -60,6 +61,7 @@ export const FormSchema = z.object({ method: z.enum(['hierarchy', 'group']), hierarchy: z.string().optional(), include_heading_content: z.boolean().optional(), + root_chunk_as_heading: z.boolean().optional(), rules: rulesSchema, }); @@ -221,12 +223,14 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => { hierarchyModeValues.current = { hierarchy: hierarchyValue, include_heading_content: form.getValues('include_heading_content'), + root_chunk_as_heading: form.getValues('root_chunk_as_heading'), rules: rulesValue, }; } else if (currentMode === 'group') { groupValues.current = { hierarchy: hierarchyValue, include_heading_content: form.getValues('include_heading_content'), + root_chunk_as_heading: form.getValues('root_chunk_as_heading'), rules: rulesValue, }; } @@ -239,6 +243,7 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => { method: 'group', hierarchy: modeValues?.hierarchy ?? 
'0', include_heading_content: false, + root_chunk_as_heading: false, rules: modeValues?.rules || initialGroupValues.rules, }); } else { @@ -251,12 +256,14 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => { hierarchy: modeValues.hierarchy || defaultHierarchy, include_heading_content: modeValues.include_heading_content || false, + root_chunk_as_heading: modeValues.root_chunk_as_heading || false, rules: modeValues.rules, }); } else { const newModeValues: FormModeValues = { hierarchy: defaultHierarchy, include_heading_content: false, + root_chunk_as_heading: false, rules: JSON.parse(JSON.stringify(initialTitleChunkerValues.rules)), }; @@ -264,6 +271,7 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => { method: method, hierarchy: defaultHierarchy, include_heading_content: newModeValues.include_heading_content, + root_chunk_as_heading: newModeValues.root_chunk_as_heading, rules: newModeValues.rules, }); } @@ -323,23 +331,46 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => { {method === 'hierarchy' && ( - - {(field) => ( - { - field.onChange?.(checked); - }} - /> - )} - + <> + + {(field) => ( + { + field.onChange?.(checked); + }} + /> + )} + + + + {(field) => ( + { + field.onChange?.(checked); + }} + /> + )} + + )} {/* {method === 'group' ? ( diff --git a/web/src/pages/agent/utils.ts b/web/src/pages/agent/utils.ts index 5b21780741..d77948d93a 100644 --- a/web/src/pages/agent/utils.ts +++ b/web/src/pages/agent/utils.ts @@ -338,6 +338,7 @@ function transformTitleChunkerParams(params: TitleChunkerFormSchemaType) { method: params.method, hierarchy: Number(params.hierarchy || 0), include_heading_content: Boolean(params.include_heading_content), + root_chunk_as_heading: Boolean(params.root_chunk_as_heading), levels, }; }