From 1031aebc8f8dfea2f35806212acd5309f7fa4466 Mon Sep 17 00:00:00 2001 From: chanx <1243304602@qq.com> Date: Tue, 14 Apr 2026 15:22:03 +0800 Subject: [PATCH] feat(file): Add file ancestor directory lookup feature by go (#14037) ### What problem does this PR solve? feat(file): Add file ancestor directory lookup feature by go ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- internal/handler/file.go | 36 +++ internal/router/router.go | 1 + web/src/locales/en.ts | 16 +- web/src/locales/zh.ts | 16 +- .../form/parser-form/common-form-fields.tsx | 10 +- .../pages/agent/form/parser-form/index.tsx | 239 +++++++++--------- .../agent/form/title-chunker-form/index.tsx | 39 ++- 7 files changed, 224 insertions(+), 133 deletions(-) diff --git a/internal/handler/file.go b/internal/handler/file.go index 37beaa2055..195733146e 100644 --- a/internal/handler/file.go +++ b/internal/handler/file.go @@ -219,6 +219,41 @@ func (h *FileHandler) GetAllParentFolders(c *gin.Context) { }) } +// GetFileAncestors gets all ancestor folders of a file (matches Python /files/{id}/ancestors) +// @Summary Get File Ancestors +// @Description Get all ancestor folders in path from file to root +// @Tags file +// @Accept json +// @Produce json +// @Param id path string true "file ID" +// @Success 200 {object} map[string]interface{} +// @Router /api/v1/files/{id}/ancestors [get] +func (h *FileHandler) GetFileAncestors(c *gin.Context) { + _, errorCode, errorMessage := GetUser(c) + if errorCode != common.CodeSuccess { + jsonError(c, errorCode, errorMessage) + return + } + + fileID := c.Param("id") + if fileID == "" { + jsonError(c, common.CodeBadRequest, "file id is required") + return + } + + parentFolders, err := h.fileService.GetAllParentFolders(fileID) + if err != nil { + jsonError(c, common.CodeServerError, err.Error()) + return + } + + c.JSON(http.StatusOK, gin.H{ + "code": common.CodeSuccess, + "data": gin.H{"parent_folders": parentFolders}, + "message": 
common.CodeSuccess.Message(), + }) +} + type CreateFolderRequest struct { Name string `json:"name" binding:"required"` ParentID string `json:"parent_id"` @@ -385,6 +420,7 @@ type MoveFileRequest struct { // - dest_file_id only: move files to a new folder (names unchanged) // - new_name only: rename a single file in place (no storage operation) // - both: move and rename simultaneously +// // @Tags file // @Accept json // @Produce json diff --git a/internal/router/router.go b/internal/router/router.go index db61d7702e..af255675cf 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -196,6 +196,7 @@ func (r *Router) Setup(engine *gin.Engine) { file.GET("", r.fileHandler.ListFiles) file.DELETE("", r.fileHandler.DeleteFiles) file.POST("/move", r.fileHandler.MoveFiles) + file.GET("/:id/ancestors", r.fileHandler.GetFileAncestors) file.GET("/:id", r.fileHandler.Download) } diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index 56b4121aef..f3fc463727 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -1507,6 +1507,19 @@ Example: Virtual Hosted Style`, author: 'Author', sectionTitle: 'Section title', }, + includeHeadingContent: 'Include heading content', + includeHeadingContentTip: + 'When enabled, content directly under a heading is kept as its own chunk. Child chunks keep only the heading path.', + hierarchyTip: `Build a heading tree and produce self-contained chunks, each carrying its full ancestor heading path (e.g. Part 1 › Chapter 3 › Section 2 + body text).
+Best for: Documents with independent, structurally significant sections — such as legal statutes, regulations, contracts, and technical specifications — where each chunk must be identifiable by its structural position even without surrounding context.`, + groupTip: `Split the document flat at a chosen heading level and automatically merge adjacent small sections to preserve content continuity. No parent-heading path is injected.
+Best for: Documents with flowing, contextually connected content — such as books, manuals, reports, and articles — where adjacent paragraphs should stay together to maintain narrative coherence.`, + enableMultiColumn: 'Enable multi-column', + enableMultiColumnTip: + 'Detect and parse multi-column page layouts to preserve the correct reading order. Turn this on for PDFs or documents with two-column or newspaper-style layouts.', + removeToc: 'Remove original table of contents', + removeTocTip: + 'Remove the table of contents included in the original PDF, so it is not parsed as regular content or chunked for retrieval.', autoPlay: 'Auto play audio', downloadFileTypeTip: 'The file type to download', downloadFileType: 'Download file type', @@ -2244,7 +2257,7 @@ This process aggregates variables from multiple branches into a single variable tokenChunkerDescription: 'Split text into chunks by token length with optional delimiters and overlap.', titleChunkerDescription: - 'Split documents into sections by title hierarchy with regex rules for finer control.', + 'Split documents into sections by title hierarchy. 
Define heading levels with regex rules, then choose Hierarchy or Group mode to control how chunks are structured.', titleChunker: 'Title Chunker', extractor: 'Transformer', extractorDescription: @@ -2267,6 +2280,7 @@ This process aggregates variables from multiple branches into a single variable }, fields: 'Field', addParser: 'Add Parser', + group: 'Group', hierarchy: 'Hierarchy', regularExpressions: 'Regular Expressions', overlappedPercent: 'Overlapped percent (%)', diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index c297603296..bb4918fcf1 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -1253,6 +1253,19 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于 author: '作者', sectionTitle: '章节标题', }, + includeHeadingContent: '包含标题内容', + includeHeadingContentTip: + '启用后,标题下的直接内容将作为一个独立的块保留。子块仅保留标题路径。', + hierarchyTip: `构建标题树并生成独立的块,每个块携带其完整的祖先标题路径(例如 第1部分 › 第3章 › 第2节 + 正文)。
+适用场景:具有独立的、结构性重要章节的文档——如法律条款、法规、合同和技术规范——其中每个块即使没有上下文也能通过其结构位置来识别。`, + groupTip: `在选定的标题级别将文档扁平分割,并自动合并相邻的小节以保持内容连续性。不注入父标题路径。
+适用场景:具有流动性的、内容相关联的文档——如书籍、手册、报告和文章——其中相邻段落应保持在一起以维持叙述连贯性。`, + enableMultiColumn: '启用多栏', + enableMultiColumnTip: + '检测并解析多栏页面布局以保持正确的阅读顺序。对于具有双栏或报纸式布局的PDF或文档,请开启此功能。', + removeToc: '移除原始目录', + removeTocTip: + '移除原始PDF中包含的目录,这样它就不会被解析为常规内容或作为检索块。', autoPlay: '自动播放', downloadFileTypeTip: '文件下载的类型', downloadFileType: '文件类型', @@ -1943,7 +1956,7 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于 tokenChunkerDescription: '根据分词器长度将文本拆分成块,并带有可选的分隔符和重叠。', titleChunkerDescription: - '使用正则表达式规则按标题层次结构将文档拆分成多个部分,以实现更精细的控制。', + '按标题层级拆分文档。通过正则表达式定义各级标题,再选择层级或分组模式控制切片方式。', titleChunker: '按标题分块', extractor: '提取器', extractorDescription: @@ -1966,6 +1979,7 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于 }, fields: '字段', addParser: '增加解析器', + group: '聚合', hierarchy: '层次结构', regularExpressions: '正则表达式', overlappedPercent: '重叠百分比(%)', diff --git a/web/src/pages/agent/form/parser-form/common-form-fields.tsx b/web/src/pages/agent/form/parser-form/common-form-fields.tsx index 3d75306306..71f29750c1 100644 --- a/web/src/pages/agent/form/parser-form/common-form-fields.tsx +++ b/web/src/pages/agent/form/parser-form/common-form-fields.tsx @@ -93,9 +93,10 @@ export function TwoColumnCheckFormField({ prefix }: CommonProps) { return ( {(field) => ( {(field) => ( -> = { - [FileType.PDF]: [ - { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, - { value: PreprocessValue.abstract }, - { value: PreprocessValue.author }, - { value: PreprocessValue.section_title }, - ], - [FileType.PowerPoint]: [ - { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, - ], - [FileType.Spreadsheet]: [ - { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, - ], - [FileType.TextMarkdown]: [ - { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, - { value: PreprocessValue.section_title }, - ], - [FileType.Code]: [{ value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }], - [FileType.Html]: [ - { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, - { value: PreprocessValue.section_title 
}, - ], - [FileType.Doc]: [ - { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, - { value: PreprocessValue.section_title }, - ], - [FileType.Docx]: [ - { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, - { value: PreprocessValue.section_title }, - ], -}; +// const PreprocessOptionConfigsMap: Partial< +// Record +// > = { +// [FileType.PDF]: [ +// { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, +// { value: PreprocessValue.abstract }, +// { value: PreprocessValue.author }, +// { value: PreprocessValue.section_title }, +// ], +// [FileType.PowerPoint]: [ +// { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, +// ], +// [FileType.Spreadsheet]: [ +// { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, +// ], +// [FileType.TextMarkdown]: [ +// { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, +// { value: PreprocessValue.section_title }, +// ], +// [FileType.Code]: [{ value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }], +// [FileType.Html]: [ +// { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, +// { value: PreprocessValue.section_title }, +// ], +// [FileType.Doc]: [ +// { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, +// { value: PreprocessValue.section_title }, +// ], +// [FileType.Docx]: [ +// { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, +// { value: PreprocessValue.section_title }, +// ], +// }; -function getPreprocessOptionConfigs(fileType?: FileType) { - if (!fileType) { - return DefaultPreprocessOptionConfigs; - } +// function getPreprocessOptionConfigs(fileType?: FileType) { +// if (!fileType) { +// return DefaultPreprocessOptionConfigs; +// } - return PreprocessOptionConfigsMap[fileType] ?? DefaultPreprocessOptionConfigs; -} +// return PreprocessOptionConfigsMap[fileType] ?? 
DefaultPreprocessOptionConfigs; +// } -function normalizePreprocessValuesByFileType( - fileType: FileType | undefined, - values: string[] | undefined, -) { - const optionConfigs = getPreprocessOptionConfigs(fileType); - const allowedValueSet = new Set(optionConfigs.map((x) => x.value)); - const requiredValues = optionConfigs - .filter((x) => x.required) - .map((x) => x.value); - const normalizedOptionalValues = (Array.isArray(values) ? values : []).filter( - (value) => allowedValueSet.has(value as PreprocessValue), - ) as PreprocessValue[]; +// function normalizePreprocessValuesByFileType( +// fileType: FileType | undefined, +// values: string[] | undefined, +// ) { +// const optionConfigs = getPreprocessOptionConfigs(fileType); +// const allowedValueSet = new Set(optionConfigs.map((x) => x.value)); +// const requiredValues = optionConfigs +// .filter((x) => x.required) +// .map((x) => x.value); +// const normalizedOptionalValues = (Array.isArray(values) ? values : []).filter( +// (value) => allowedValueSet.has(value as PreprocessValue), +// ) as PreprocessValue[]; - return Array.from( - new Set([...requiredValues, ...normalizedOptionalValues]), - ); -} +// return Array.from( +// new Set([...requiredValues, ...normalizedOptionalValues]), +// ); +// } -function isSameStringArray(a: string[] | undefined, b: string[]) { - if (!a || a.length !== b.length) { - return false; - } +// function isSameStringArray(a: string[] | undefined, b: string[]) { +// if (!a || a.length !== b.length) { +// return false; +// } - return a.every((item, idx) => item === b[idx]); -} +// return a.every((item, idx) => item === b[idx]); +// } const FileFormatWidgetMap = { [FileType.PDF]: PdfFormFields, @@ -151,7 +148,7 @@ export const FormSchema = z.object({ setups: z.array( z.object({ fileFormat: z.string().nullish(), - preprocess: z.array(z.string()).optional(), + // preprocess: z.array(z.string()).optional(), output_format: z.string().optional(), parse_method: z.string().optional(), lang: 
z.string().optional(), @@ -212,56 +209,56 @@ function ParserItem({ [form, index], ); - const handlePreprocessChange = useCallback( - (value: PreprocessValue[]) => { - form.setValue(`setups.${index}.preprocess`, value, { - shouldDirty: true, - shouldValidate: true, - shouldTouch: true, - }); - }, - [form, index], - ); + // const handlePreprocessChange = useCallback( + // (value: PreprocessValue[]) => { + // form.setValue(`setups.${index}.preprocess`, value, { + // shouldDirty: true, + // shouldValidate: true, + // shouldTouch: true, + // }); + // }, + // [form, index], + // ); - const preprocessOptions = useMemo(() => { - const optionConfigs = getPreprocessOptionConfigs(fileFormat as FileType); + // const preprocessOptions = useMemo(() => { + // const optionConfigs = getPreprocessOptionConfigs(fileFormat as FileType); - return optionConfigs.map((optionConfig) => { - const labelMap: Record = { - [MAIN_CONTENT_PREPROCESS_VALUE]: t('flow.preprocess.mainContent'), - [PreprocessValue.section_title]: t('flow.preprocess.sectionTitle'), - [PreprocessValue.abstract]: t('flow.preprocess.abstract'), - [PreprocessValue.author]: t('flow.preprocess.author'), - }; + // return optionConfigs.map((optionConfig) => { + // const labelMap: Record = { + // [MAIN_CONTENT_PREPROCESS_VALUE]: t('flow.preprocess.mainContent'), + // [PreprocessValue.section_title]: t('flow.preprocess.sectionTitle'), + // [PreprocessValue.abstract]: t('flow.preprocess.abstract'), + // [PreprocessValue.author]: t('flow.preprocess.author'), + // }; - const label = labelMap[optionConfig.value] || optionConfig.value; + // const label = labelMap[optionConfig.value] || optionConfig.value; - return { - value: optionConfig.value, - disabled: optionConfig.required, - label: label, - }; - }); - }, [fileFormat, t]); + // return { + // value: optionConfig.value, + // disabled: optionConfig.required, + // label: label, + // }; + // }); + // }, [fileFormat, t]); - useEffect(() => { - const currentPreprocessValues = 
form.getValues( - `setups.${index}.preprocess`, - ) as string[] | undefined; - const normalizedPreprocessValues = normalizePreprocessValuesByFileType( - fileFormat as FileType, - currentPreprocessValues, - ); + // useEffect(() => { + // const currentPreprocessValues = form.getValues( + // `setups.${index}.preprocess`, + // ) as string[] | undefined; + // const normalizedPreprocessValues = normalizePreprocessValuesByFileType( + // fileFormat as FileType, + // currentPreprocessValues, + // ); - if ( - !isSameStringArray(currentPreprocessValues, normalizedPreprocessValues) - ) { - form.setValue(`setups.${index}.preprocess`, normalizedPreprocessValues, { - shouldDirty: false, - shouldValidate: true, - }); - } - }, [fileFormat, form, index]); + // if ( + // !isSameStringArray(currentPreprocessValues, normalizedPreprocessValues) + // ) { + // form.setValue(`setups.${index}.preprocess`, normalizedPreprocessValues, { + // shouldDirty: false, + // shouldValidate: true, + // }); + // } + // }, [fileFormat, form, index]); return (
- @@ -320,7 +317,7 @@ function ParserItem({ options={preprocessOptions} > )} - + */} {index < fieldLength - 1 && }
); @@ -351,10 +348,10 @@ const ParserForm = ({ node }: INextOperatorForm) => { parse_method: '', lang: '', fields: [], - llm_id: '', + vlm: { llm_id: '' }, table_result_type: '', markdown_image_response_type: '', - preprocess: [], + // preprocess: [], }); }, [append]); diff --git a/web/src/pages/agent/form/title-chunker-form/index.tsx b/web/src/pages/agent/form/title-chunker-form/index.tsx index 2a974a9067..b5e7d64d50 100644 --- a/web/src/pages/agent/form/title-chunker-form/index.tsx +++ b/web/src/pages/agent/form/title-chunker-form/index.tsx @@ -8,7 +8,7 @@ import { Form } from '@/components/ui/form'; import { Input } from '@/components/ui/input'; import { zodResolver } from '@hookform/resolvers/zod'; import { Trash2 } from 'lucide-react'; -import { memo, useEffect, useRef } from 'react'; +import { memo, useEffect, useRef, useState } from 'react'; import { useFieldArray, useForm, useFormContext } from 'react-hook-form'; import { useTranslation } from 'react-i18next'; import { z } from 'zod'; @@ -197,6 +197,7 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => { }); const isInitialized = useRef(false); const initialMode = useRef(undefined); + const [showAllTip, setShowAllTip] = useState(true); const method = form.watch('method'); const name = 'rules'; @@ -210,6 +211,7 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => { } if (method !== initialMode.current) { + setShowAllTip(true); const currentMode = initialMode.current; const hierarchyValue = form.getValues('hierarchy'); const rulesValue = form.getValues('rules'); @@ -290,6 +292,34 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => { ], }} /> + {/*
+ {method === 'hierarchy' && t('flow.hierarchyTip')} + {method === 'group' && t('flow.groupTip')} +
*/} +
setShowAllTip(!showAllTip)} + > +
+ + {/* {method === 'hierarchy' && t('flow.hierarchyTip')} + {method === 'group' && t('flow.groupTip')} */} + + {/* + {showAllTip ? '▲' : ''} + */} +
+
@@ -297,12 +327,9 @@ const TitleChunkerForm = ({ node }: INextOperatorForm) => { {(field) => (