Feat: optimize title chunk (#14325)

### What problem does this PR solve?

Feat: optimize title chunk
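1. Add a new switch, "Use root as H0 heading", to the title chunker. When it
is enabled and more than one chunk is produced, the text of the first (root)
chunk is prepended to every remaining chunk, and the root chunk itself is no
longer emitted as a separate chunk.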
2. Update the resume agent template to enable this option (`root_chunk_as_heading: true`).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)


<img width="700" alt="img_v3_02111_63b04951-b3d7-4001-a08b-539db6d5298g"
src="https://github.com/user-attachments/assets/4179ac4d-90e7-4353-9b93-d649a455e634"
/>

<img width="700" alt="image"
src="https://github.com/user-attachments/assets/c0ba0f3c-05aa-4f2c-b418-e808ca1a2641"
/>
This commit is contained in:
Magicbook1108
2026-04-23 18:55:55 +08:00
committed by GitHub
parent ba47c13eb5
commit 75a5548b85
13 changed files with 96 additions and 41 deletions

View File

@@ -242,13 +242,14 @@
"include_heading_content": false,
"levels": [
[
"^\\s*(?i:(?:\\d+[\\.\\)]\\s*)?(?:EDUCATION|ACADEMIC\\s*BACKGROUND|ACADEMIC\\s*HISTORY|EDUCATIONAL\\s*BACKGROUND|RELEVANT\\s*COURSEWORK|COURSEWORK|EXPERIENCE|WORK\\s*EXPERIENCE|PROFESSIONAL\\s*EXPERIENCE|RELEVANT\\s*EXPERIENCE|EMPLOYMENT\\s*HISTORY|CAREER\\s*HISTORY|INTERNSHIP\\s*EXPERIENCE|PROJECTS|PROJECT\\s*EXPERIENCE|ACADEMIC\\s*PROJECTS|PROFESSIONAL\\s*PROJECTS|SKILLS|TECHNICAL\\s*SKILLS|CORE\\s*COMPETENCIES|COMPETENCIES|QUALIFICATIONS|SUMMARY\\s*OF\\s*QUALIFICATIONS|CERTIFICATIONS|LICENSES|CERTIFICATES|AWARDS|HONORS|HONOURS|ACHIEVEMENTS|PUBLICATIONS|RESEARCH|RESEARCH\\s*EXPERIENCE|LEADERSHIP|LEADERSHIP\\s*EXPERIENCE|ACTIVITIES|EXTRACURRICULAR\\s*ACTIVITIES|ACTIVITIES\\s*(?:&|AND)\\s*SKILLS|INVOLVEMENT|CAMPUS\\s*INVOLVEMENT|VOLUNTEER\\s*EXPERIENCE|VOLUNTEERING|COMMUNITY\\s*SERVICE|LANGUAGES|INTERESTS|HOBBIES|PROFILE|PROFESSIONAL\\s*PROFILE|SUMMARY|PROFESSIONAL\\s*SUMMARY|CAREER\\s*SUMMARY|OBJECTIVE|CAREER\\s*OBJECTIVE|PERSONAL\\s*INFORMATION|CONTACT\\s*INFORMATION|ADDITIONAL\\s*INFORMATION|TRAINING))\\s*[:\uff1a]?\\s*$"
"^\\s*(?i:(?:\\d+[\\.\\)]\\s*)?(?:EDUCATION|ACADEMIC\\s*BACKGROUND|ACADEMIC\\s*HISTORY|EDUCATIONAL\\s*BACKGROUND|RELEVANT\\s*COURSEWORK|COURSEWORK|EXPERIENCE|WORK\\s*EXPERIENCE|PROFESSIONAL\\s*EXPERIENCE|RELEVANT\\s*EXPERIENCE|EMPLOYMENT\\s*HISTORY|CAREER\\s*HISTORY|INTERNSHIP\\s*EXPERIENCE|PROJECTS|PROJECT\\s*EXPERIENCE|ACADEMIC\\s*PROJECTS|PROFESSIONAL\\s*PROJECTS|SKILLS|TECHNICAL\\s*SKILLS|CORE\\s*COMPETENCIES|COMPETENCIES|QUALIFICATIONS|SUMMARY\\s*OF\\s*QUALIFICATIONS|CERTIFICATIONS|LICENSES|CERTIFICATES|AWARDS|HONORS|HONOURS|ACHIEVEMENTS|PUBLICATIONS|RESEARCH|RESEARCH\\s*EXPERIENCE|LEADERSHIP|LEADERSHIP\\s*EXPERIENCE|ACTIVITIES|EXTRACURRICULAR\\s*ACTIVITIES|ACTIVITIES\\s*(?:&|AND)\\s*SKILLS|INVOLVEMENT|CAMPUS\\s*INVOLVEMENT|VOLUNTEER\\s*EXPERIENCE|VOLUNTEERING|COMMUNITY\\s*SERVICE|LANGUAGES|INTERESTS|HOBBIES|PROFILE|PROFESSIONAL\\s*PROFILE|SUMMARY|PROFESSIONAL\\s*SUMMARY|CAREER\\s*SUMMARY|OBJECTIVE|CAREER\\s*OBJECTIVE|PERSONAL\\s*INFORMATION|CONTACT\\s*INFORMATION|ADDITIONAL\\s*INFORMATION|TRAINING))\\s*[:\uff1a]?\\s*$"
],
[
"^\\s*(?:\\d+[\\.\u3001\\)]\\s*)?(?:\u6559\u80b2\u80cc\u666f|\u6559\u80b2\u7ecf\u5386|\u5b66\u5386\u80cc\u666f|\u5b66\u672f\u80cc\u666f|\u6280\u672f\u80cc\u666f|\u5de5\u4f5c\u7ecf\u5386|\u5de5\u4f5c\u7ecf\u9a8c|\u5b9e\u4e60\u7ecf\u5386|\u9879\u76ee\u7ecf\u5386|\u9879\u76ee\u7ecf\u9a8c|\u79d1\u7814\u7ecf\u5386|\u7814\u7a76\u7ecf\u5386|\u6821\u56ed\u7ecf\u5386|\u5b9e\u8df5\u7ecf\u5386|\u4e13\u4e1a\u7ecf\u5386|\u804c\u4e1a\u7ecf\u5386|\u6280\u80fd|\u4e13\u4e1a\u6280\u80fd|\u6280\u80fd\u7279\u957f|\u6838\u5fc3\u6280\u80fd|\u6280\u672f\u6808|\u4e2a\u4eba\u6280\u80fd|\u5de5\u4f5c\u6280\u80fd|\u804c\u4e1a\u6280\u80fd|\u6280\u80fd\u4e0e\u8bc4\u4ef7|\u6280\u80fd\u4e0e\u81ea\u6211\u8bc4\u4ef7|\u5de5\u4f5c\u6280\u80fd\u4e0e\u81ea\u6211\u8bc4\u4ef7|\u804c\u4e1a\u6280\u80fd\u4e0e\u81ea\u6211\u8bc4\u4ef7|\u8bc1\u4e66|\u8d44\u683c\u8bc1\u4e66|\u804c\u4e1a\u8d44\u683c|\u8d44\u8d28\u8bc1\u4e66|\u83b7\u5956\u60c5\u51b5|\u83b7\u5956\u7ecf\u5386|\u8363\u8a89|\u8363\u8a89\u5956\u9879|\u5956\u9879|\u79d1\u7814\u6210\u679c|\u8bba\u6587\u53d1\u8868|\u53d1\u8868\u8bba\u6587|\u9886\u5bfc\u7ecf\u5386|\u5b66\u751f\u5de5\u4f5c|\u6821\u56ed\u6d3b\u52a8|\u793e\u56e2\u7ecf\u5386|\u6d3b\u52a8\u7ecf\u5386|\u5fd7\u613f\u7ecf\u5386|\u5fd7\u613f\u670d\u52a1|\u793e\u4f1a\u5b9e\u8df5|\u8bed\u8a00\u80fd\u529b|\u8bed\u8a00|\u81ea\u6211\u8bc4\u4ef7|\u4e2a\u4eba\u8bc4\u4ef7|\u81ea\u6211\u603b\u7ed3|\u4e2a\u4eba\u603b\u7ed3|\u4e2a\u4eba\u4f18\u52bf|\u4e2a\u4eba\u7b80\u4ecb|\u4e2a\u4eba\u4fe1\u606f|\u57fa\u672c\u4fe1\u606f|\u8054\u7cfb\u65b9\u5f0f|\u6c42\u804c\u610f\u5411|\u5e94\u8058\u610f\u5411|\u804c\u4e1a\u76ee\u6807|\u6c42\u804c\u76ee\u6807|\u5174\u8da3\u7231\u597d|\u5174\u8da3\u7279\u957f|\u57f9\u8bad\u7ecf\u5386|\u5176\u4ed6\u4fe1\u606f|\u9644\u52a0\u4fe1\u606f)\\s*[:\uff1a]?\\s*$"
]
],
"method": "hierarchy"
"method": "hierarchy",
"root_chunk_as_heading": true
}
},
"upstream": [
@@ -299,16 +300,6 @@
"target": "TitleChunker:FlatMiceFix",
"targetHandle": "end"
},
{
"data": {
"isHovered": false
},
"id": "xy-edge__TitleChunker:FlatMiceFixstart-Extractor:ThreeDrinksActend",
"source": "TitleChunker:FlatMiceFix",
"sourceHandle": "start",
"target": "Extractor:ThreeDrinksAct",
"targetHandle": "end"
},
{
"data": {
"isHovered": false
@@ -321,6 +312,19 @@
"targetHandle": "end",
"type": "buttonEdge",
"zIndex": 1001
},
{
"data": {
"isHovered": false
},
"id": "xy-edge__TitleChunker:FlatMiceFixstart-Extractor:ThreeDrinksActend",
"markerEnd": "logo",
"source": "TitleChunker:FlatMiceFix",
"sourceHandle": "start",
"target": "Extractor:ThreeDrinksAct",
"targetHandle": "end",
"type": "buttonEdge",
"zIndex": 1001
}
],
"nodes": [
@@ -331,7 +335,7 @@
},
"id": "File",
"measured": {
"height": 50,
"height": 49,
"width": 200
},
"position": {
@@ -460,7 +464,7 @@
"dragging": false,
"id": "Parser:HipSignsRhyme",
"measured": {
"height": 198,
"height": 197,
"width": 200
},
"position": {
@@ -489,12 +493,12 @@
"dragging": false,
"id": "Tokenizer:KindHandsWin",
"measured": {
"height": 114,
"height": 113,
"width": 200
},
"position": {
"x": 876.4654525205967,
"y": 189.1906747329592
"x": 883.0243372012395,
"y": 156.39625132974524
},
"selected": false,
"sourcePosition": "right",
@@ -514,6 +518,7 @@
}
},
"promote_first_heading_to_root": false,
"root_chunk_as_heading": true,
"rules": [
{
"levels": [
@@ -537,14 +542,14 @@
"dragging": false,
"id": "TitleChunker:FlatMiceFix",
"measured": {
"height": 74,
"height": 73,
"width": 200
},
"position": {
"x": 572.7908769627791,
"y": 141.55515313482098
},
"selected": false,
"selected": true,
"sourcePosition": "right",
"targetPosition": "left",
"type": "chunkerNode"
@@ -580,12 +585,12 @@
"dragging": false,
"id": "Extractor:ThreeDrinksAct",
"measured": {
"height": 90,
"height": 89,
"width": 200
},
"position": {
"x": 583.3659219536569,
"y": 274.7600100230409
"x": 623.8123774842874,
"y": 236.49984938595793
},
"selected": false,
"sourcePosition": "right",

View File

@@ -41,6 +41,7 @@ class TitleChunkerParam(ProcessParamBase):
self.levels = []
self.hierarchy = None
self.include_heading_content = False
self.root_chunk_as_heading = False
def check(self):
if self.method in {"hierarchy", "group"}:
@@ -240,13 +241,13 @@ class BaseTitleChunker(ABC):
# chunk box is defined by merged source positions and the text payload
# is normalized by removing parser tags.
if self.from_upstream.output_format in ["markdown", "text", "html"]:
return [
chunks = [
{"text": "".join(record["text"] + "\n" for record in records)}
for records in record_groups
if records
]
return [
chunks = [
(
{
"text": RAGFlowPdfParser.remove_tag("".join(record["text"] + "\n" for record in records)),
@@ -264,6 +265,17 @@ class BaseTitleChunker(ABC):
for records in record_groups
if records
]
if self.param.root_chunk_as_heading and len(chunks) > 1:
root_chunk = chunks[0]
root_text = root_chunk.get("text", "")
for ck in chunks[1:]:
ck['text'] = root_text + "\n" + ck.get("text", "")
return chunks[1:]
return chunks
async def set_chunks(self, chunks):

View File

@@ -1513,6 +1513,9 @@ Example: Virtual Hosted Style`,
includeHeadingContent: 'Include heading content',
includeHeadingContentTip:
'When enabled, content directly under a heading is kept as its own chunk. Child chunks keep only the heading path.',
rootAsHeading: 'Use root as H0 heading',
rootAsHeadingTip:
'Treat the root node as a H0 heading when building the hierarchy',
hierarchyTip: `Build a heading tree and produce self-contained chunks, each carrying its full ancestor heading path (e.g. Part 1 Chapter 3 Section 2 + body text).\n
Best for: Documents with independent, structurally significant sections — such as legal statutes, regulations, contracts, and technical specifications — where each chunk must be identifiable by its structural position even without surrounding context.`,
groupTip: `Split the document flat at a chosen heading level and automatically merge adjacent small sections to preserve content continuity. No parent-heading path is injected.\n

View File

@@ -333,6 +333,7 @@ export const initialTitleChunkerValues = {
method: 'hierarchy',
hierarchy: Hierarchy.H3,
include_heading_content: false,
root_chunk_as_heading: false,
rules: rules,
};
@@ -340,6 +341,7 @@ export const initialGroupValues = {
method: 'group',
hierarchy: '0',
include_heading_content: false,
root_chunk_as_heading: false,
rules: rules,
};

View File

@@ -128,6 +128,7 @@ function transformApiResponseToForm(
method,
hierarchy,
include_heading_content: Boolean(apiData.include_heading_content),
root_chunk_as_heading: Boolean(apiData.root_chunk_as_heading),
rules,
};
}

View File

@@ -29,6 +29,7 @@ import { transformApiResponseToForm, useDynamicHierarchyOptions } from './hook';
type FormModeValues = {
hierarchy?: string;
include_heading_content?: boolean;
root_chunk_as_heading?: boolean;
rules: Array<{ levels: Array<{ expression: string }> }>;
};
@@ -60,6 +61,7 @@ export const FormSchema = z.object({
method: z.enum(['hierarchy', 'group']),
hierarchy: z.string().optional(),
include_heading_content: z.boolean().optional(),
root_chunk_as_heading: z.boolean().optional(),
rules: rulesSchema,
});
@@ -221,12 +223,14 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => {
hierarchyModeValues.current = {
hierarchy: hierarchyValue,
include_heading_content: form.getValues('include_heading_content'),
root_chunk_as_heading: form.getValues('root_chunk_as_heading'),
rules: rulesValue,
};
} else if (currentMode === 'group') {
groupValues.current = {
hierarchy: hierarchyValue,
include_heading_content: form.getValues('include_heading_content'),
root_chunk_as_heading: form.getValues('root_chunk_as_heading'),
rules: rulesValue,
};
}
@@ -239,6 +243,7 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => {
method: 'group',
hierarchy: modeValues?.hierarchy ?? '0',
include_heading_content: false,
root_chunk_as_heading: false,
rules: modeValues?.rules || initialGroupValues.rules,
});
} else {
@@ -251,12 +256,14 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => {
hierarchy: modeValues.hierarchy || defaultHierarchy,
include_heading_content:
modeValues.include_heading_content || false,
root_chunk_as_heading: modeValues.root_chunk_as_heading || false,
rules: modeValues.rules,
});
} else {
const newModeValues: FormModeValues = {
hierarchy: defaultHierarchy,
include_heading_content: false,
root_chunk_as_heading: false,
rules: JSON.parse(JSON.stringify(initialTitleChunkerValues.rules)),
};
@@ -264,6 +271,7 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => {
method: method,
hierarchy: defaultHierarchy,
include_heading_content: newModeValues.include_heading_content,
root_chunk_as_heading: newModeValues.root_chunk_as_heading,
rules: newModeValues.rules,
});
}
@@ -323,23 +331,46 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => {
<SelectWithSearch options={hierarchyOptions}></SelectWithSearch>
</RAGFlowFormItem>
{method === 'hierarchy' && (
<RAGFlowFormItem
name="include_heading_content"
label={t('flow.includeHeadingContent', 'Include heading content')}
tooltip={t('flow.includeHeadingContentTip')}
horizontal={true}
labelClassName="w-full"
valueClassName="w-8"
>
{(field) => (
<Switch
checked={field.value}
onCheckedChange={(checked) => {
field.onChange?.(checked);
}}
/>
)}
</RAGFlowFormItem>
<>
<RAGFlowFormItem
name="include_heading_content"
label={t('flow.includeHeadingContent', 'Include heading content')}
tooltip={t('flow.includeHeadingContentTip')}
horizontal={true}
labelClassName="w-full"
valueClassName="w-8"
>
{(field) => (
<Switch
checked={field.value}
onCheckedChange={(checked) => {
field.onChange?.(checked);
}}
/>
)}
</RAGFlowFormItem>
<RAGFlowFormItem
name="root_chunk_as_heading"
label={t('flow.rootAsHeading', 'Use root as heading')}
tooltip={t(
'flow.rootAsHeadingTip',
'Treat the root node as a H0 heading when building the hierarchy',
)}
horizontal={true}
labelClassName="w-full"
valueClassName="w-8"
>
{(field) => (
<Switch
checked={field.value}
onCheckedChange={(checked) => {
field.onChange?.(checked);
}}
/>
)}
</RAGFlowFormItem>
</>
)}
{/* {method === 'group' ? (
<Card>

View File

@@ -338,6 +338,7 @@ function transformTitleChunkerParams(params: TitleChunkerFormSchemaType) {
method: params.method,
hierarchy: Number(params.hierarchy || 0),
include_heading_content: Boolean(params.include_heading_content),
root_chunk_as_heading: Boolean(params.root_chunk_as_heading),
levels,
};
}