diff --git a/api/apps/restful_apis/document_api.py b/api/apps/restful_apis/document_api.py index 9d10ac237f..e23e2c47a2 100644 --- a/api/apps/restful_apis/document_api.py +++ b/api/apps/restful_apis/document_api.py @@ -585,9 +585,25 @@ async def _upload_local_documents(kb, tenant_id): logging.error(msg) return get_error_data_result(message=msg, code=RetCode.ARGUMENT_ERROR) + # Parse optional parser_config overrides from form data + parser_config_override = None + raw_parser_config = form.get("parser_config") + if raw_parser_config: + try: + parsed = json.loads(raw_parser_config) + if isinstance(parsed, dict): + # Only allow known table column config keys to prevent arbitrary overrides + allowed_keys = {"table_column_mode", "table_column_roles"} + parser_config_override = {k: v for k, v in parsed.items() if k in allowed_keys} + if not parser_config_override: + parser_config_override = None + except (json.JSONDecodeError, TypeError): + parser_config_override = None + err, files = await thread_pool_exec( FileService.upload_document, kb, file_objs, tenant_id, - parent_path=form.get("parent_path") + parent_path=form.get("parent_path"), + parser_config_override=parser_config_override, ) if err: msg = "\n".join(err) diff --git a/api/db/services/file_service.py b/api/db/services/file_service.py index 7c5945d8af..5a36f57eaf 100644 --- a/api/db/services/file_service.py +++ b/api/db/services/file_service.py @@ -455,7 +455,7 @@ class FileService(CommonService): @classmethod @DB.connection_context() - def upload_document(self, kb, file_objs, user_id, src="local", parent_path: str | None = None): + def upload_document(self, kb, file_objs, user_id, src="local", parent_path: str | None = None, parser_config_override: dict | None = None): root_folder = self.get_root_folder(user_id) pf_id = root_folder["id"] self.init_knowledgebase_docs(pf_id, user_id) @@ -464,6 +464,13 @@ class FileService(CommonService): safe_parent_path = sanitize_path(parent_path) + # Merge 
parser_config_override with KB parser_config if provided + base_parser_config = kb.parser_config or {} + if parser_config_override and isinstance(parser_config_override, dict): + merged_parser_config = {**base_parser_config, **parser_config_override} + else: + merged_parser_config = base_parser_config + err, files = [], [] for file in file_objs: doc_id = file.id if hasattr(file, "id") else get_uuid() @@ -529,7 +536,7 @@ class FileService(CommonService): "kb_id": kb.id, "parser_id": self.get_parser(filetype, filename, kb.parser_id), "pipeline_id": kb.pipeline_id, - "parser_config": kb.parser_config, + "parser_config": merged_parser_config, "created_by": user_id, "type": filetype, "name": filename, diff --git a/web/src/components/file-upload-dialog/index.tsx b/web/src/components/file-upload-dialog/index.tsx index 251c52f4cb..ced2a011ea 100644 --- a/web/src/components/file-upload-dialog/index.tsx +++ b/web/src/components/file-upload-dialog/index.tsx @@ -6,17 +6,42 @@ import { DialogHeader, DialogTitle, } from '@/components/ui/dialog'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; import { IModalProps } from '@/interfaces/common'; +import { extractTableColumns, isTableFile } from '@/utils/table-column-extract'; import { zodResolver } from '@hookform/resolvers/zod'; import { TFunction } from 'i18next'; +import { useCallback, useEffect, useState } from 'react'; import { useForm } from 'react-hook-form'; import { useTranslation } from 'react-i18next'; import { z } from 'zod'; import { FileUploader } from '../file-uploader'; import { RAGFlowFormItem } from '../ragflow-form'; import { Form } from '../ui/form'; +import { Label } from '../ui/label'; +import { RadioGroup, RadioGroupItem } from '../ui/radio-group'; import { Switch } from '../ui/switch'; +const ROLE_OPTIONS = [ + { value: 'both', labelKey: 'knowledgeConfiguration.tableColumnRoleBoth' }, + { + value: 'indexing', + labelKey: 
'knowledgeConfiguration.tableColumnRoleIndexing', + }, + { + value: 'metadata', + labelKey: 'knowledgeConfiguration.tableColumnRoleMetadata', + }, +] as const; + +export type TableColumnRoles = Record; + function buildUploadFormSchema(t: TFunction) { const FormSchema = z.object({ parseOnCreation: z.boolean().optional(), @@ -31,6 +56,10 @@ function buildUploadFormSchema(t: TFunction) { ), ) .min(1, { message: t('fileManager.pleaseUploadAtLeastOneFile') }), + tableColumnMode: z.enum(['auto', 'manual']).optional(), + tableColumnRoles: z + .record(z.enum(['indexing', 'metadata', 'both'])) + .optional(), }); return FormSchema; @@ -45,8 +74,13 @@ const UploadFormId = 'UploadFormId'; type UploadFormProps = { submit: (values?: UploadFormSchemaType) => void; showParseOnCreation?: boolean; + isTableParser?: boolean; }; -function UploadForm({ submit, showParseOnCreation }: UploadFormProps) { +function UploadForm({ + submit, + showParseOnCreation, + isTableParser, +}: UploadFormProps) { const { t } = useTranslation(); const FormSchema = buildUploadFormSchema(t); @@ -56,9 +90,64 @@ function UploadForm({ submit, showParseOnCreation }: UploadFormProps) { defaultValues: { parseOnCreation: false, fileList: [], + tableColumnMode: 'auto', + tableColumnRoles: {}, }, }); + const [extractedColumns, setExtractedColumns] = useState([]); + const [columnMode, setColumnMode] = useState<'auto' | 'manual'>('auto'); + const [columnRoles, setColumnRoles] = useState({}); + + const handleFilesChange = useCallback( + async (files: any[]) => { + if (!isTableParser || !files || files.length === 0) { + setExtractedColumns([]); + return; + } + + // Extract columns from every table file and merge them into one de-duplicated set + const allColumns = new Set(); + for (const f of files) { + const file = f instanceof File ? 
f : f.file; + if (file && isTableFile(file)) { + const cols = await extractTableColumns(file); + cols.forEach((c) => allColumns.add(c)); + } + } + setExtractedColumns(Array.from(allColumns)); + }, + [isTableParser], + ); + + const handleModeChange = (value: 'auto' | 'manual') => { + setColumnMode(value); + form.setValue('tableColumnMode', value); + }; + + const handleRoleChange = (col: string, role: string) => { + const updated = { + ...columnRoles, + [col]: role as 'indexing' | 'metadata' | 'both', + }; + setColumnRoles(updated); + form.setValue('tableColumnRoles', updated); + }; + + // Sync column roles to form when columns are extracted + useEffect(() => { + if (columnMode === 'manual' && extractedColumns.length > 0) { + const roles: TableColumnRoles = {}; + extractedColumns.forEach((col) => { + roles[col] = columnRoles[col] || 'both'; + }); + setColumnRoles(roles); + form.setValue('tableColumnRoles', roles); + } + }, [extractedColumns, columnMode]); // eslint-disable-line react-hooks/exhaustive-deps + + const showColumnConfig = isTableParser && extractedColumns.length > 0; + return (
( { + field.onChange(files); + handleFilesChange(files); + }} accept={{}} data-testid="dataset-upload-dropzone" /> )} + + {showColumnConfig && ( +
+
+ + +
+ + +
+
+ + +
+
+
+ + {columnMode === 'auto' && ( +

+ {t('knowledgeConfiguration.tableColumnModeAutoDescription')} +

+ )} + + {columnMode === 'manual' && ( +
+

+ {t('knowledgeConfiguration.tableColumnRolesTip')} +

+
+ {extractedColumns.map((col) => ( +
+ + +
+ ))} +
+
+ )} +
+ )} ); } type FileUploadDialogProps = IModalProps & - Pick; + Pick; export function FileUploadDialog({ hideModal, onOk, loading, showParseOnCreation = false, + isTableParser = false, }: FileUploadDialogProps) { const { t } = useTranslation(); return ( - + {t('fileManager.uploadFile')} - {/* - - {t('fileManager.local')} - {t('fileManager.s3')} - - - - - {t('common.comingSoon')} - */} - + {t('common.save')} diff --git a/web/src/hooks/use-document-request.ts b/web/src/hooks/use-document-request.ts index 96ad1b0e2c..9727f4b626 100644 --- a/web/src/hooks/use-document-request.ts +++ b/web/src/hooks/use-document-request.ts @@ -69,9 +69,13 @@ export const useUploadNextDocument = () => { data, isPending: loading, mutateAsync, - } = useMutation, Error, File[]>({ + } = useMutation< + ResponseType, + Error, + { fileList: File[]; parserConfig?: Record } + >({ mutationKey: [DocumentApiAction.UploadDocument], - mutationFn: async (fileList) => { + mutationFn: async ({ fileList, parserConfig }) => { if (!id) { return { code: 500, message: 'Dataset ID is required' }; } @@ -79,6 +83,9 @@ export const useUploadNextDocument = () => { fileList.forEach((file: any) => { formData.append('file', file); }); + if (parserConfig) { + formData.append('parser_config', JSON.stringify(parserConfig)); + } try { const ret = await uploadDocument(id, formData); @@ -100,7 +107,13 @@ export const useUploadNextDocument = () => { }, }); - return { uploadDocument: mutateAsync, loading, data }; + const upload = useCallback( + (fileList: File[], parserConfig?: Record) => + mutateAsync({ fileList, parserConfig }), + [mutateAsync], + ); + + return { uploadDocument: upload, loading, data }; }; export const useFetchDocumentList = (loop = true) => { diff --git a/web/src/pages/dataset/dataset/index.tsx b/web/src/pages/dataset/dataset/index.tsx index 16af309392..cfe388e810 100644 --- a/web/src/pages/dataset/dataset/index.tsx +++ b/web/src/pages/dataset/dataset/index.tsx @@ -242,6 +242,7 @@ export default function 
Dataset() { onOk={onDocumentUploadOk} loading={documentUploadLoading} showParseOnCreation + isTableParser={knowledgeBase?.chunk_method === 'table'} > )} {createVisible && ( diff --git a/web/src/pages/dataset/dataset/use-upload-document.ts b/web/src/pages/dataset/dataset/use-upload-document.ts index b1dc167f6f..0722582109 100644 --- a/web/src/pages/dataset/dataset/use-upload-document.ts +++ b/web/src/pages/dataset/dataset/use-upload-document.ts @@ -17,9 +17,27 @@ export const useHandleUploadDocument = () => { const { runDocumentByIds } = useRunDocument(); const onDocumentUploadOk = useCallback( - async ({ fileList, parseOnCreation }: UploadFormSchemaType) => { + async ({ + fileList, + parseOnCreation, + tableColumnMode, + tableColumnRoles, + }: UploadFormSchemaType) => { if (fileList.length > 0) { - const ret = await uploadDocument(fileList); + // Build parser_config if column roles are configured + let parserConfig: Record | undefined; + if ( + tableColumnMode === 'manual' && + tableColumnRoles && + Object.keys(tableColumnRoles).length > 0 + ) { + parserConfig = { + table_column_mode: 'manual', + table_column_roles: tableColumnRoles, + }; + } + + const ret = await uploadDocument(fileList as File[], parserConfig); // Check for success (code === 0) or partial success (code === 500 with some files) const isSuccess = ret?.code === 0; diff --git a/web/src/utils/table-column-extract.ts b/web/src/utils/table-column-extract.ts new file mode 100644 index 0000000000..f7343aba3a --- /dev/null +++ b/web/src/utils/table-column-extract.ts @@ -0,0 +1,76 @@ +import Papa from 'papaparse'; +import * as XLSX from 'xlsx'; + +/** + * Extracts column headers from a CSV or Excel file. + * Returns an empty array if the file type is not supported or headers cannot be read. + */ +export async function extractTableColumns(file: File): Promise { + const ext = file.name.split('.').pop()?.toLowerCase() ?? 
''; + + if (ext === 'csv') { + return extractCsvColumns(file); + } + + if (['xlsx', 'xls'].includes(ext)) { + return extractExcelColumns(file); + } + + return []; +} + +function extractCsvColumns(file: File): Promise { + return new Promise((resolve) => { + Papa.parse(file, { + preview: 1, // Only read the first row (header) + header: true, + skipEmptyLines: true, + complete(results) { + const fields = results.meta?.fields ?? []; + resolve(fields.filter((f) => f.trim().length > 0)); + }, + error() { + resolve([]); + }, + }); + }); +} + +function extractExcelColumns(file: File): Promise { + return new Promise((resolve) => { + const reader = new FileReader(); + reader.onload = (e) => { + try { + const data = new Uint8Array(e.target?.result as ArrayBuffer); + const workbook = XLSX.read(data, { type: 'array', sheetRows: 1 }); + const firstSheetName = workbook.SheetNames[0]; + if (!firstSheetName) { + resolve([]); + return; + } + const sheet = workbook.Sheets[firstSheetName]; + const rows = XLSX.utils.sheet_to_json(sheet, { header: 1 }); + if (rows.length > 0) { + const headers = rows[0] + .map((h) => String(h ?? '').trim()) + .filter((h) => h.length > 0); + resolve(headers); + } else { + resolve([]); + } + } catch { + resolve([]); + } + }; + reader.onerror = () => resolve([]); + reader.readAsArrayBuffer(file); + }); +} + +/** + * Check if a file is a table file (CSV or Excel). + */ +export function isTableFile(file: File): boolean { + const ext = file.name.split('.').pop()?.toLowerCase() ?? ''; + return ['csv', 'xlsx', 'xls'].includes(ext); +}