mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
Feat: optimize title chunk (#14325)
### What problem does this PR solve?

Feat: optimize title chunk

1. Add a new switch to enable "Use root chunk as H0 heading", so that the text of the first (root) chunk is prepended to every remaining chunk.
2. Update the resume agent template.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

<img width="700" alt="img_v3_02111_63b04951-b3d7-4001-a08b-539db6d5298g" src="https://github.com/user-attachments/assets/4179ac4d-90e7-4353-9b93-d649a455e634" />
<img width="700" alt="image" src="https://github.com/user-attachments/assets/c0ba0f3c-05aa-4f2c-b418-e808ca1a2641" />
This commit is contained in:
@@ -242,13 +242,14 @@
|
||||
"include_heading_content": false,
|
||||
"levels": [
|
||||
[
|
||||
"^\\s*(?i:(?:\\d+[\\.\\)]\\s*)?(?:EDUCATION|ACADEMIC\\s*BACKGROUND|ACADEMIC\\s*HISTORY|EDUCATIONAL\\s*BACKGROUND|RELEVANT\\s*COURSEWORK|COURSEWORK|EXPERIENCE|WORK\\s*EXPERIENCE|PROFESSIONAL\\s*EXPERIENCE|RELEVANT\\s*EXPERIENCE|EMPLOYMENT\\s*HISTORY|CAREER\\s*HISTORY|INTERNSHIP\\s*EXPERIENCE|PROJECTS|PROJECT\\s*EXPERIENCE|ACADEMIC\\s*PROJECTS|PROFESSIONAL\\s*PROJECTS|SKILLS|TECHNICAL\\s*SKILLS|CORE\\s*COMPETENCIES|COMPETENCIES|QUALIFICATIONS|SUMMARY\\s*OF\\s*QUALIFICATIONS|CERTIFICATIONS|LICENSES|CERTIFICATES|AWARDS|HONORS|HONOURS|ACHIEVEMENTS|PUBLICATIONS|RESEARCH|RESEARCH\\s*EXPERIENCE|LEADERSHIP|LEADERSHIP\\s*EXPERIENCE|ACTIVITIES|EXTRACURRICULAR\\s*ACTIVITIES|ACTIVITIES\\s*(?:&|AND)\\s*SKILLS|INVOLVEMENT|CAMPUS\\s*INVOLVEMENT|VOLUNTEER\\s*EXPERIENCE|VOLUNTEERING|COMMUNITY\\s*SERVICE|LANGUAGES|INTERESTS|HOBBIES|PROFILE|PROFESSIONAL\\s*PROFILE|SUMMARY|PROFESSIONAL\\s*SUMMARY|CAREER\\s*SUMMARY|OBJECTIVE|CAREER\\s*OBJECTIVE|PERSONAL\\s*INFORMATION|CONTACT\\s*INFORMATION|ADDITIONAL\\s*INFORMATION|TRAINING))\\s*[:\uff1a]?\\s*$"
|
||||
"^\\s*(?i:(?:\\d+[\\.\\)]\\s*)?(?:EDUCATION|ACADEMIC\\s*BACKGROUND|ACADEMIC\\s*HISTORY|EDUCATIONAL\\s*BACKGROUND|RELEVANT\\s*COURSEWORK|COURSEWORK|EXPERIENCE|WORK\\s*EXPERIENCE|PROFESSIONAL\\s*EXPERIENCE|RELEVANT\\s*EXPERIENCE|EMPLOYMENT\\s*HISTORY|CAREER\\s*HISTORY|INTERNSHIP\\s*EXPERIENCE|PROJECTS|PROJECT\\s*EXPERIENCE|ACADEMIC\\s*PROJECTS|PROFESSIONAL\\s*PROJECTS|SKILLS|TECHNICAL\\s*SKILLS|CORE\\s*COMPETENCIES|COMPETENCIES|QUALIFICATIONS|SUMMARY\\s*OF\\s*QUALIFICATIONS|CERTIFICATIONS|LICENSES|CERTIFICATES|AWARDS|HONORS|HONOURS|ACHIEVEMENTS|PUBLICATIONS|RESEARCH|RESEARCH\\s*EXPERIENCE|LEADERSHIP|LEADERSHIP\\s*EXPERIENCE|ACTIVITIES|EXTRACURRICULAR\\s*ACTIVITIES|ACTIVITIES\\s*(?:&|AND)\\s*SKILLS|INVOLVEMENT|CAMPUS\\s*INVOLVEMENT|VOLUNTEER\\s*EXPERIENCE|VOLUNTEERING|COMMUNITY\\s*SERVICE|LANGUAGES|INTERESTS|HOBBIES|PROFILE|PROFESSIONAL\\s*PROFILE|SUMMARY|PROFESSIONAL\\s*SUMMARY|CAREER\\s*SUMMARY|OBJECTIVE|CAREER\\s*OBJECTIVE|PERSONAL\\s*INFORMATION|CONTACT\\s*INFORMATION|ADDITIONAL\\s*INFORMATION|TRAINING))\\s*[:\uff1a]?\\s*$"
|
||||
],
|
||||
[
|
||||
"^\\s*(?:\\d+[\\.\u3001\\)]\\s*)?(?:\u6559\u80b2\u80cc\u666f|\u6559\u80b2\u7ecf\u5386|\u5b66\u5386\u80cc\u666f|\u5b66\u672f\u80cc\u666f|\u6280\u672f\u80cc\u666f|\u5de5\u4f5c\u7ecf\u5386|\u5de5\u4f5c\u7ecf\u9a8c|\u5b9e\u4e60\u7ecf\u5386|\u9879\u76ee\u7ecf\u5386|\u9879\u76ee\u7ecf\u9a8c|\u79d1\u7814\u7ecf\u5386|\u7814\u7a76\u7ecf\u5386|\u6821\u56ed\u7ecf\u5386|\u5b9e\u8df5\u7ecf\u5386|\u4e13\u4e1a\u7ecf\u5386|\u804c\u4e1a\u7ecf\u5386|\u6280\u80fd|\u4e13\u4e1a\u6280\u80fd|\u6280\u80fd\u7279\u957f|\u6838\u5fc3\u6280\u80fd|\u6280\u672f\u6808|\u4e2a\u4eba\u6280\u80fd|\u5de5\u4f5c\u6280\u80fd|\u804c\u4e1a\u6280\u80fd|\u6280\u80fd\u4e0e\u8bc4\u4ef7|\u6280\u80fd\u4e0e\u81ea\u6211\u8bc4\u4ef7|\u5de5\u4f5c\u6280\u80fd\u4e0e\u81ea\u6211\u8bc4\u4ef7|\u804c\u4e1a\u6280\u80fd\u4e0e\u81ea\u6211\u8bc4\u4ef7|\u8bc1\u4e66|\u8d44\u683c\u8bc1\u4e66|\u804c\u4e1a\u8d44\u683c|\u8d44\u8d28\u8bc1\u4e66|\u83b7\u5956\u60c5\u51b5|\u83b7\u5956\u7ecf\u5386|\u8363\u8a89|\u8363\u8a89\u5956\u9879|\u5956\u9879|\u79d1\u7814\u6210\u679c|\u8bba\u6587\u53d1\u8868|\u53d1\u8868\u8bba\u6587|\u9886\u5bfc\u7ecf\u5386|\u5b66\u751f\u5de5\u4f5c|\u6821\u56ed\u6d3b\u52a8|\u793e\u56e2\u7ecf\u5386|\u6d3b\u52a8\u7ecf\u5386|\u5fd7\u613f\u7ecf\u5386|\u5fd7\u613f\u670d\u52a1|\u793e\u4f1a\u5b9e\u8df5|\u8bed\u8a00\u80fd\u529b|\u8bed\u8a00|\u81ea\u6211\u8bc4\u4ef7|\u4e2a\u4eba\u8bc4\u4ef7|\u81ea\u6211\u603b\u7ed3|\u4e2a\u4eba\u603b\u7ed3|\u4e2a\u4eba\u4f18\u52bf|\u4e2a\u4eba\u7b80\u4ecb|\u4e2a\u4eba\u4fe1\u606f|\u57fa\u672c\u4fe1\u606f|\u8054\u7cfb\u65b9\u5f0f|\u6c42\u804c\u610f\u5411|\u5e94\u8058\u610f\u5411|\u804c\u4e1a\u76ee\u6807|\u6c42\u804c\u76ee\u6807|\u5174\u8da3\u7231\u597d|\u5174\u8da3\u7279\u957f|\u57f9\u8bad\u7ecf\u5386|\u5176\u4ed6\u4fe1\u606f|\u9644\u52a0\u4fe1\u606f)\\s*[:\uff1a]?\\s*$"
|
||||
]
|
||||
],
|
||||
"method": "hierarchy"
|
||||
"method": "hierarchy",
|
||||
"root_chunk_as_heading": true
|
||||
}
|
||||
},
|
||||
"upstream": [
|
||||
@@ -299,16 +300,6 @@
|
||||
"target": "TitleChunker:FlatMiceFix",
|
||||
"targetHandle": "end"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"isHovered": false
|
||||
},
|
||||
"id": "xy-edge__TitleChunker:FlatMiceFixstart-Extractor:ThreeDrinksActend",
|
||||
"source": "TitleChunker:FlatMiceFix",
|
||||
"sourceHandle": "start",
|
||||
"target": "Extractor:ThreeDrinksAct",
|
||||
"targetHandle": "end"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"isHovered": false
|
||||
@@ -321,6 +312,19 @@
|
||||
"targetHandle": "end",
|
||||
"type": "buttonEdge",
|
||||
"zIndex": 1001
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"isHovered": false
|
||||
},
|
||||
"id": "xy-edge__TitleChunker:FlatMiceFixstart-Extractor:ThreeDrinksActend",
|
||||
"markerEnd": "logo",
|
||||
"source": "TitleChunker:FlatMiceFix",
|
||||
"sourceHandle": "start",
|
||||
"target": "Extractor:ThreeDrinksAct",
|
||||
"targetHandle": "end",
|
||||
"type": "buttonEdge",
|
||||
"zIndex": 1001
|
||||
}
|
||||
],
|
||||
"nodes": [
|
||||
@@ -331,7 +335,7 @@
|
||||
},
|
||||
"id": "File",
|
||||
"measured": {
|
||||
"height": 50,
|
||||
"height": 49,
|
||||
"width": 200
|
||||
},
|
||||
"position": {
|
||||
@@ -460,7 +464,7 @@
|
||||
"dragging": false,
|
||||
"id": "Parser:HipSignsRhyme",
|
||||
"measured": {
|
||||
"height": 198,
|
||||
"height": 197,
|
||||
"width": 200
|
||||
},
|
||||
"position": {
|
||||
@@ -489,12 +493,12 @@
|
||||
"dragging": false,
|
||||
"id": "Tokenizer:KindHandsWin",
|
||||
"measured": {
|
||||
"height": 114,
|
||||
"height": 113,
|
||||
"width": 200
|
||||
},
|
||||
"position": {
|
||||
"x": 876.4654525205967,
|
||||
"y": 189.1906747329592
|
||||
"x": 883.0243372012395,
|
||||
"y": 156.39625132974524
|
||||
},
|
||||
"selected": false,
|
||||
"sourcePosition": "right",
|
||||
@@ -514,6 +518,7 @@
|
||||
}
|
||||
},
|
||||
"promote_first_heading_to_root": false,
|
||||
"root_chunk_as_heading": true,
|
||||
"rules": [
|
||||
{
|
||||
"levels": [
|
||||
@@ -537,14 +542,14 @@
|
||||
"dragging": false,
|
||||
"id": "TitleChunker:FlatMiceFix",
|
||||
"measured": {
|
||||
"height": 74,
|
||||
"height": 73,
|
||||
"width": 200
|
||||
},
|
||||
"position": {
|
||||
"x": 572.7908769627791,
|
||||
"y": 141.55515313482098
|
||||
},
|
||||
"selected": false,
|
||||
"selected": true,
|
||||
"sourcePosition": "right",
|
||||
"targetPosition": "left",
|
||||
"type": "chunkerNode"
|
||||
@@ -580,12 +585,12 @@
|
||||
"dragging": false,
|
||||
"id": "Extractor:ThreeDrinksAct",
|
||||
"measured": {
|
||||
"height": 90,
|
||||
"height": 89,
|
||||
"width": 200
|
||||
},
|
||||
"position": {
|
||||
"x": 583.3659219536569,
|
||||
"y": 274.7600100230409
|
||||
"x": 623.8123774842874,
|
||||
"y": 236.49984938595793
|
||||
},
|
||||
"selected": false,
|
||||
"sourcePosition": "right",
|
||||
@@ -41,6 +41,7 @@ class TitleChunkerParam(ProcessParamBase):
|
||||
self.levels = []
|
||||
self.hierarchy = None
|
||||
self.include_heading_content = False
|
||||
self.root_chunk_as_heading = False
|
||||
|
||||
def check(self):
|
||||
if self.method in {"hierarchy", "group"}:
|
||||
@@ -240,13 +241,13 @@ class BaseTitleChunker(ABC):
|
||||
# chunk box is defined by merged source positions and the text payload
|
||||
# is normalized by removing parser tags.
|
||||
if self.from_upstream.output_format in ["markdown", "text", "html"]:
|
||||
return [
|
||||
chunks = [
|
||||
{"text": "".join(record["text"] + "\n" for record in records)}
|
||||
for records in record_groups
|
||||
if records
|
||||
]
|
||||
|
||||
return [
|
||||
chunks = [
|
||||
(
|
||||
{
|
||||
"text": RAGFlowPdfParser.remove_tag("".join(record["text"] + "\n" for record in records)),
|
||||
@@ -264,6 +265,17 @@ class BaseTitleChunker(ABC):
|
||||
for records in record_groups
|
||||
if records
|
||||
]
|
||||
|
||||
if self.param.root_chunk_as_heading and len(chunks) > 1:
|
||||
root_chunk = chunks[0]
|
||||
root_text = root_chunk.get("text", "")
|
||||
|
||||
for ck in chunks[1:]:
|
||||
ck['text'] = root_text + "\n" + ck.get("text", "")
|
||||
|
||||
return chunks[1:]
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
async def set_chunks(self, chunks):
|
||||
|
||||
@@ -1513,6 +1513,9 @@ Example: Virtual Hosted Style`,
|
||||
includeHeadingContent: 'Include heading content',
|
||||
includeHeadingContentTip:
|
||||
'When enabled, content directly under a heading is kept as its own chunk. Child chunks keep only the heading path.',
|
||||
rootAsHeading: 'Use root as H0 heading',
|
||||
rootAsHeadingTip:
|
||||
'Treat the root node as a H0 heading when building the hierarchy',
|
||||
hierarchyTip: `Build a heading tree and produce self-contained chunks, each carrying its full ancestor heading path (e.g. Part 1 › Chapter 3 › Section 2 + body text).\n
|
||||
Best for: Documents with independent, structurally significant sections — such as legal statutes, regulations, contracts, and technical specifications — where each chunk must be identifiable by its structural position even without surrounding context.`,
|
||||
groupTip: `Split the document flat at a chosen heading level and automatically merge adjacent small sections to preserve content continuity. No parent-heading path is injected.\n
|
||||
|
||||
@@ -333,6 +333,7 @@ export const initialTitleChunkerValues = {
|
||||
method: 'hierarchy',
|
||||
hierarchy: Hierarchy.H3,
|
||||
include_heading_content: false,
|
||||
root_chunk_as_heading: false,
|
||||
rules: rules,
|
||||
};
|
||||
|
||||
@@ -340,6 +341,7 @@ export const initialGroupValues = {
|
||||
method: 'group',
|
||||
hierarchy: '0',
|
||||
include_heading_content: false,
|
||||
root_chunk_as_heading: false,
|
||||
rules: rules,
|
||||
};
|
||||
|
||||
|
||||
@@ -128,6 +128,7 @@ function transformApiResponseToForm(
|
||||
method,
|
||||
hierarchy,
|
||||
include_heading_content: Boolean(apiData.include_heading_content),
|
||||
root_chunk_as_heading: Boolean(apiData.root_chunk_as_heading),
|
||||
rules,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -29,6 +29,7 @@ import { transformApiResponseToForm, useDynamicHierarchyOptions } from './hook';
|
||||
type FormModeValues = {
|
||||
hierarchy?: string;
|
||||
include_heading_content?: boolean;
|
||||
root_chunk_as_heading?: boolean;
|
||||
rules: Array<{ levels: Array<{ expression: string }> }>;
|
||||
};
|
||||
|
||||
@@ -60,6 +61,7 @@ export const FormSchema = z.object({
|
||||
method: z.enum(['hierarchy', 'group']),
|
||||
hierarchy: z.string().optional(),
|
||||
include_heading_content: z.boolean().optional(),
|
||||
root_chunk_as_heading: z.boolean().optional(),
|
||||
rules: rulesSchema,
|
||||
});
|
||||
|
||||
@@ -221,12 +223,14 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => {
|
||||
hierarchyModeValues.current = {
|
||||
hierarchy: hierarchyValue,
|
||||
include_heading_content: form.getValues('include_heading_content'),
|
||||
root_chunk_as_heading: form.getValues('root_chunk_as_heading'),
|
||||
rules: rulesValue,
|
||||
};
|
||||
} else if (currentMode === 'group') {
|
||||
groupValues.current = {
|
||||
hierarchy: hierarchyValue,
|
||||
include_heading_content: form.getValues('include_heading_content'),
|
||||
root_chunk_as_heading: form.getValues('root_chunk_as_heading'),
|
||||
rules: rulesValue,
|
||||
};
|
||||
}
|
||||
@@ -239,6 +243,7 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => {
|
||||
method: 'group',
|
||||
hierarchy: modeValues?.hierarchy ?? '0',
|
||||
include_heading_content: false,
|
||||
root_chunk_as_heading: false,
|
||||
rules: modeValues?.rules || initialGroupValues.rules,
|
||||
});
|
||||
} else {
|
||||
@@ -251,12 +256,14 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => {
|
||||
hierarchy: modeValues.hierarchy || defaultHierarchy,
|
||||
include_heading_content:
|
||||
modeValues.include_heading_content || false,
|
||||
root_chunk_as_heading: modeValues.root_chunk_as_heading || false,
|
||||
rules: modeValues.rules,
|
||||
});
|
||||
} else {
|
||||
const newModeValues: FormModeValues = {
|
||||
hierarchy: defaultHierarchy,
|
||||
include_heading_content: false,
|
||||
root_chunk_as_heading: false,
|
||||
rules: JSON.parse(JSON.stringify(initialTitleChunkerValues.rules)),
|
||||
};
|
||||
|
||||
@@ -264,6 +271,7 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => {
|
||||
method: method,
|
||||
hierarchy: defaultHierarchy,
|
||||
include_heading_content: newModeValues.include_heading_content,
|
||||
root_chunk_as_heading: newModeValues.root_chunk_as_heading,
|
||||
rules: newModeValues.rules,
|
||||
});
|
||||
}
|
||||
@@ -323,23 +331,46 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => {
|
||||
<SelectWithSearch options={hierarchyOptions}></SelectWithSearch>
|
||||
</RAGFlowFormItem>
|
||||
{method === 'hierarchy' && (
|
||||
<RAGFlowFormItem
|
||||
name="include_heading_content"
|
||||
label={t('flow.includeHeadingContent', 'Include heading content')}
|
||||
tooltip={t('flow.includeHeadingContentTip')}
|
||||
horizontal={true}
|
||||
labelClassName="w-full"
|
||||
valueClassName="w-8"
|
||||
>
|
||||
{(field) => (
|
||||
<Switch
|
||||
checked={field.value}
|
||||
onCheckedChange={(checked) => {
|
||||
field.onChange?.(checked);
|
||||
}}
|
||||
/>
|
||||
)}
|
||||
</RAGFlowFormItem>
|
||||
<>
|
||||
<RAGFlowFormItem
|
||||
name="include_heading_content"
|
||||
label={t('flow.includeHeadingContent', 'Include heading content')}
|
||||
tooltip={t('flow.includeHeadingContentTip')}
|
||||
horizontal={true}
|
||||
labelClassName="w-full"
|
||||
valueClassName="w-8"
|
||||
>
|
||||
{(field) => (
|
||||
<Switch
|
||||
checked={field.value}
|
||||
onCheckedChange={(checked) => {
|
||||
field.onChange?.(checked);
|
||||
}}
|
||||
/>
|
||||
)}
|
||||
</RAGFlowFormItem>
|
||||
|
||||
<RAGFlowFormItem
|
||||
name="root_chunk_as_heading"
|
||||
label={t('flow.rootAsHeading', 'Use root as heading')}
|
||||
tooltip={t(
|
||||
'flow.rootAsHeadingTip',
|
||||
'Treat the root node as a H0 heading when building the hierarchy',
|
||||
)}
|
||||
horizontal={true}
|
||||
labelClassName="w-full"
|
||||
valueClassName="w-8"
|
||||
>
|
||||
{(field) => (
|
||||
<Switch
|
||||
checked={field.value}
|
||||
onCheckedChange={(checked) => {
|
||||
field.onChange?.(checked);
|
||||
}}
|
||||
/>
|
||||
)}
|
||||
</RAGFlowFormItem>
|
||||
</>
|
||||
)}
|
||||
{/* {method === 'group' ? (
|
||||
<Card>
|
||||
|
||||
@@ -338,6 +338,7 @@ function transformTitleChunkerParams(params: TitleChunkerFormSchemaType) {
|
||||
method: params.method,
|
||||
hierarchy: Number(params.hierarchy || 0),
|
||||
include_heading_content: Boolean(params.include_heading_content),
|
||||
root_chunk_as_heading: Boolean(params.root_chunk_as_heading),
|
||||
levels,
|
||||
};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user