mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
Fix: table parser metadata (#15127)
### What problem does this PR solve? This PR improves the table upload flow for CSV/Excel files by allowing table column role configuration at upload time. Previously, users had to: 1. Upload and parse a table file. 2. Open parser settings and manually set table column roles. 3. Re-parse the file for the roles to take effect. This was inefficient and required an unnecessary second parse. With this change: 1. When the knowledge base uses table parsing, the upload dialog extracts CSV/Excel headers client-side. 2. Users can choose Auto mode or Manual mode. 3. In Manual mode, users can assign per-column roles before upload. 4. The selected parser config is sent with the upload request and applied server-side during document creation. Result: configured table column roles are applied from the first parse. ### Type of change - [x] New Feature (non-breaking change which adds functionality) Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
This commit is contained in:
@@ -585,9 +585,25 @@ async def _upload_local_documents(kb, tenant_id):
|
||||
logging.error(msg)
|
||||
return get_error_data_result(message=msg, code=RetCode.ARGUMENT_ERROR)
|
||||
|
||||
# Parse optional parser_config overrides from form data
|
||||
parser_config_override = None
|
||||
raw_parser_config = form.get("parser_config")
|
||||
if raw_parser_config:
|
||||
try:
|
||||
parsed = json.loads(raw_parser_config)
|
||||
if isinstance(parsed, dict):
|
||||
# Only allow known table column config keys to prevent arbitrary overrides
|
||||
allowed_keys = {"table_column_mode", "table_column_roles"}
|
||||
parser_config_override = {k: v for k, v in parsed.items() if k in allowed_keys}
|
||||
if not parser_config_override:
|
||||
parser_config_override = None
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
parser_config_override = None
|
||||
|
||||
err, files = await thread_pool_exec(
|
||||
FileService.upload_document, kb, file_objs, tenant_id,
|
||||
parent_path=form.get("parent_path")
|
||||
parent_path=form.get("parent_path"),
|
||||
parser_config_override=parser_config_override,
|
||||
)
|
||||
if err:
|
||||
msg = "\n".join(err)
|
||||
|
||||
@@ -455,7 +455,7 @@ class FileService(CommonService):
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def upload_document(self, kb, file_objs, user_id, src="local", parent_path: str | None = None):
|
||||
def upload_document(self, kb, file_objs, user_id, src="local", parent_path: str | None = None, parser_config_override: dict | None = None):
|
||||
root_folder = self.get_root_folder(user_id)
|
||||
pf_id = root_folder["id"]
|
||||
self.init_knowledgebase_docs(pf_id, user_id)
|
||||
@@ -464,6 +464,13 @@ class FileService(CommonService):
|
||||
|
||||
safe_parent_path = sanitize_path(parent_path)
|
||||
|
||||
# Merge parser_config_override with KB parser_config if provided
|
||||
base_parser_config = kb.parser_config or {}
|
||||
if parser_config_override and isinstance(parser_config_override, dict):
|
||||
merged_parser_config = {**base_parser_config, **parser_config_override}
|
||||
else:
|
||||
merged_parser_config = base_parser_config
|
||||
|
||||
err, files = [], []
|
||||
for file in file_objs:
|
||||
doc_id = file.id if hasattr(file, "id") else get_uuid()
|
||||
@@ -529,7 +536,7 @@ class FileService(CommonService):
|
||||
"kb_id": kb.id,
|
||||
"parser_id": self.get_parser(filetype, filename, kb.parser_id),
|
||||
"pipeline_id": kb.pipeline_id,
|
||||
"parser_config": kb.parser_config,
|
||||
"parser_config": merged_parser_config,
|
||||
"created_by": user_id,
|
||||
"type": filetype,
|
||||
"name": filename,
|
||||
|
||||
@@ -6,17 +6,42 @@ import {
|
||||
DialogHeader,
|
||||
DialogTitle,
|
||||
} from '@/components/ui/dialog';
|
||||
import {
|
||||
Select,
|
||||
SelectContent,
|
||||
SelectItem,
|
||||
SelectTrigger,
|
||||
SelectValue,
|
||||
} from '@/components/ui/select';
|
||||
import { IModalProps } from '@/interfaces/common';
|
||||
import { extractTableColumns, isTableFile } from '@/utils/table-column-extract';
|
||||
import { zodResolver } from '@hookform/resolvers/zod';
|
||||
import { TFunction } from 'i18next';
|
||||
import { useCallback, useEffect, useState } from 'react';
|
||||
import { useForm } from 'react-hook-form';
|
||||
import { useTranslation } from 'react-i18next';
|
||||
import { z } from 'zod';
|
||||
import { FileUploader } from '../file-uploader';
|
||||
import { RAGFlowFormItem } from '../ragflow-form';
|
||||
import { Form } from '../ui/form';
|
||||
import { Label } from '../ui/label';
|
||||
import { RadioGroup, RadioGroupItem } from '../ui/radio-group';
|
||||
import { Switch } from '../ui/switch';
|
||||
|
||||
const ROLE_OPTIONS = [
|
||||
{ value: 'both', labelKey: 'knowledgeConfiguration.tableColumnRoleBoth' },
|
||||
{
|
||||
value: 'indexing',
|
||||
labelKey: 'knowledgeConfiguration.tableColumnRoleIndexing',
|
||||
},
|
||||
{
|
||||
value: 'metadata',
|
||||
labelKey: 'knowledgeConfiguration.tableColumnRoleMetadata',
|
||||
},
|
||||
] as const;
|
||||
|
||||
export type TableColumnRoles = Record<string, 'indexing' | 'metadata' | 'both'>;
|
||||
|
||||
function buildUploadFormSchema(t: TFunction) {
|
||||
const FormSchema = z.object({
|
||||
parseOnCreation: z.boolean().optional(),
|
||||
@@ -31,6 +56,10 @@ function buildUploadFormSchema(t: TFunction) {
|
||||
),
|
||||
)
|
||||
.min(1, { message: t('fileManager.pleaseUploadAtLeastOneFile') }),
|
||||
tableColumnMode: z.enum(['auto', 'manual']).optional(),
|
||||
tableColumnRoles: z
|
||||
.record(z.enum(['indexing', 'metadata', 'both']))
|
||||
.optional(),
|
||||
});
|
||||
|
||||
return FormSchema;
|
||||
@@ -45,8 +74,13 @@ const UploadFormId = 'UploadFormId';
|
||||
type UploadFormProps = {
|
||||
submit: (values?: UploadFormSchemaType) => void;
|
||||
showParseOnCreation?: boolean;
|
||||
isTableParser?: boolean;
|
||||
};
|
||||
function UploadForm({ submit, showParseOnCreation }: UploadFormProps) {
|
||||
function UploadForm({
|
||||
submit,
|
||||
showParseOnCreation,
|
||||
isTableParser,
|
||||
}: UploadFormProps) {
|
||||
const { t } = useTranslation();
|
||||
const FormSchema = buildUploadFormSchema(t);
|
||||
|
||||
@@ -56,9 +90,64 @@ function UploadForm({ submit, showParseOnCreation }: UploadFormProps) {
|
||||
defaultValues: {
|
||||
parseOnCreation: false,
|
||||
fileList: [],
|
||||
tableColumnMode: 'auto',
|
||||
tableColumnRoles: {},
|
||||
},
|
||||
});
|
||||
|
||||
const [extractedColumns, setExtractedColumns] = useState<string[]>([]);
|
||||
const [columnMode, setColumnMode] = useState<'auto' | 'manual'>('auto');
|
||||
const [columnRoles, setColumnRoles] = useState<TableColumnRoles>({});
|
||||
|
||||
const handleFilesChange = useCallback(
|
||||
async (files: any[]) => {
|
||||
if (!isTableParser || !files || files.length === 0) {
|
||||
setExtractedColumns([]);
|
||||
return;
|
||||
}
|
||||
|
||||
// Collect the union of column headers across every uploaded table file
|
||||
const allColumns = new Set<string>();
|
||||
for (const f of files) {
|
||||
const file = f instanceof File ? f : f.file;
|
||||
if (file && isTableFile(file)) {
|
||||
const cols = await extractTableColumns(file);
|
||||
cols.forEach((c) => allColumns.add(c));
|
||||
}
|
||||
}
|
||||
setExtractedColumns(Array.from(allColumns));
|
||||
},
|
||||
[isTableParser],
|
||||
);
|
||||
|
||||
const handleModeChange = (value: 'auto' | 'manual') => {
|
||||
setColumnMode(value);
|
||||
form.setValue('tableColumnMode', value);
|
||||
};
|
||||
|
||||
const handleRoleChange = (col: string, role: string) => {
|
||||
const updated = {
|
||||
...columnRoles,
|
||||
[col]: role as 'indexing' | 'metadata' | 'both',
|
||||
};
|
||||
setColumnRoles(updated);
|
||||
form.setValue('tableColumnRoles', updated);
|
||||
};
|
||||
|
||||
// Sync column roles to form when columns are extracted
|
||||
useEffect(() => {
|
||||
if (columnMode === 'manual' && extractedColumns.length > 0) {
|
||||
const roles: TableColumnRoles = {};
|
||||
extractedColumns.forEach((col) => {
|
||||
roles[col] = columnRoles[col] || 'both';
|
||||
});
|
||||
setColumnRoles(roles);
|
||||
form.setValue('tableColumnRoles', roles);
|
||||
}
|
||||
}, [extractedColumns, columnMode]); // eslint-disable-line react-hooks/exhaustive-deps
|
||||
|
||||
const showColumnConfig = isTableParser && extractedColumns.length > 0;
|
||||
|
||||
return (
|
||||
<Form {...form}>
|
||||
<form
|
||||
@@ -84,47 +173,117 @@ function UploadForm({ submit, showParseOnCreation }: UploadFormProps) {
|
||||
{(field) => (
|
||||
<FileUploader
|
||||
value={field.value}
|
||||
onValueChange={field.onChange}
|
||||
onValueChange={(files) => {
|
||||
field.onChange(files);
|
||||
handleFilesChange(files);
|
||||
}}
|
||||
accept={{}}
|
||||
data-testid="dataset-upload-dropzone"
|
||||
/>
|
||||
)}
|
||||
</RAGFlowFormItem>
|
||||
|
||||
{showColumnConfig && (
|
||||
<div className="space-y-3 border rounded-md p-3">
|
||||
<div className="space-y-2">
|
||||
<Label className="text-sm font-medium">
|
||||
{t('knowledgeConfiguration.tableColumnMode')}
|
||||
</Label>
|
||||
<RadioGroup
|
||||
value={columnMode}
|
||||
onValueChange={handleModeChange}
|
||||
className="flex gap-4"
|
||||
>
|
||||
<div className="flex items-center space-x-2">
|
||||
<RadioGroupItem value="auto" id="upload-mode-auto" />
|
||||
<label
|
||||
htmlFor="upload-mode-auto"
|
||||
className="text-sm font-normal cursor-pointer"
|
||||
>
|
||||
{t('knowledgeConfiguration.tableColumnModeAuto')}
|
||||
</label>
|
||||
</div>
|
||||
<div className="flex items-center space-x-2">
|
||||
<RadioGroupItem value="manual" id="upload-mode-manual" />
|
||||
<label
|
||||
htmlFor="upload-mode-manual"
|
||||
className="text-sm font-normal cursor-pointer"
|
||||
>
|
||||
{t('knowledgeConfiguration.tableColumnModeManual')}
|
||||
</label>
|
||||
</div>
|
||||
</RadioGroup>
|
||||
</div>
|
||||
|
||||
{columnMode === 'auto' && (
|
||||
<p className="text-sm text-muted-foreground">
|
||||
{t('knowledgeConfiguration.tableColumnModeAutoDescription')}
|
||||
</p>
|
||||
)}
|
||||
|
||||
{columnMode === 'manual' && (
|
||||
<div className="space-y-2">
|
||||
<p className="text-sm text-muted-foreground">
|
||||
{t('knowledgeConfiguration.tableColumnRolesTip')}
|
||||
</p>
|
||||
<div className="space-y-2 max-h-[200px] overflow-y-auto">
|
||||
{extractedColumns.map((col) => (
|
||||
<div key={col} className="flex items-center gap-3">
|
||||
<Label className="min-w-[120px] shrink-0 text-sm font-normal truncate">
|
||||
{col}
|
||||
</Label>
|
||||
<Select
|
||||
value={columnRoles[col] || 'both'}
|
||||
onValueChange={(value) => handleRoleChange(col, value)}
|
||||
>
|
||||
<SelectTrigger className="w-[140px]">
|
||||
<SelectValue />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
{ROLE_OPTIONS.map((opt) => (
|
||||
<SelectItem key={opt.value} value={opt.value}>
|
||||
{t(opt.labelKey)}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</form>
|
||||
</Form>
|
||||
);
|
||||
}
|
||||
|
||||
type FileUploadDialogProps = IModalProps<UploadFormSchemaType> &
|
||||
Pick<UploadFormProps, 'showParseOnCreation'>;
|
||||
Pick<UploadFormProps, 'showParseOnCreation' | 'isTableParser'>;
|
||||
export function FileUploadDialog({
|
||||
hideModal,
|
||||
onOk,
|
||||
loading,
|
||||
showParseOnCreation = false,
|
||||
isTableParser = false,
|
||||
}: FileUploadDialogProps) {
|
||||
const { t } = useTranslation();
|
||||
|
||||
return (
|
||||
<Dialog open onOpenChange={hideModal}>
|
||||
<DialogContent data-testid="dataset-upload-modal">
|
||||
<DialogContent
|
||||
data-testid="dataset-upload-modal"
|
||||
className="max-h-[85vh] overflow-y-auto"
|
||||
>
|
||||
<DialogHeader>
|
||||
<DialogTitle>{t('fileManager.uploadFile')}</DialogTitle>
|
||||
</DialogHeader>
|
||||
{/* <Tabs defaultValue="account">
|
||||
<TabsList className="grid w-full grid-cols-2 mb-4">
|
||||
<TabsTrigger value="account">{t('fileManager.local')}</TabsTrigger>
|
||||
<TabsTrigger value="password">{t('fileManager.s3')}</TabsTrigger>
|
||||
</TabsList>
|
||||
<TabsContent value="account">
|
||||
<UploadForm
|
||||
submit={onOk!}
|
||||
showParseOnCreation={showParseOnCreation}
|
||||
></UploadForm>
|
||||
</TabsContent>
|
||||
<TabsContent value="password">{t('common.comingSoon')}</TabsContent>
|
||||
</Tabs> */}
|
||||
<UploadForm submit={onOk!} showParseOnCreation={showParseOnCreation} />
|
||||
<UploadForm
|
||||
submit={onOk!}
|
||||
showParseOnCreation={showParseOnCreation}
|
||||
isTableParser={isTableParser}
|
||||
/>
|
||||
<DialogFooter>
|
||||
<ButtonLoading type="submit" loading={loading} form={UploadFormId}>
|
||||
{t('common.save')}
|
||||
|
||||
@@ -69,9 +69,13 @@ export const useUploadNextDocument = () => {
|
||||
data,
|
||||
isPending: loading,
|
||||
mutateAsync,
|
||||
} = useMutation<ResponseType<IDocumentInfo[]>, Error, File[]>({
|
||||
} = useMutation<
|
||||
ResponseType<IDocumentInfo[]>,
|
||||
Error,
|
||||
{ fileList: File[]; parserConfig?: Record<string, any> }
|
||||
>({
|
||||
mutationKey: [DocumentApiAction.UploadDocument],
|
||||
mutationFn: async (fileList) => {
|
||||
mutationFn: async ({ fileList, parserConfig }) => {
|
||||
if (!id) {
|
||||
return { code: 500, message: 'Dataset ID is required' };
|
||||
}
|
||||
@@ -79,6 +83,9 @@ export const useUploadNextDocument = () => {
|
||||
fileList.forEach((file: any) => {
|
||||
formData.append('file', file);
|
||||
});
|
||||
if (parserConfig) {
|
||||
formData.append('parser_config', JSON.stringify(parserConfig));
|
||||
}
|
||||
|
||||
try {
|
||||
const ret = await uploadDocument(id, formData);
|
||||
@@ -100,7 +107,13 @@ export const useUploadNextDocument = () => {
|
||||
},
|
||||
});
|
||||
|
||||
return { uploadDocument: mutateAsync, loading, data };
|
||||
const upload = useCallback(
|
||||
(fileList: File[], parserConfig?: Record<string, any>) =>
|
||||
mutateAsync({ fileList, parserConfig }),
|
||||
[mutateAsync],
|
||||
);
|
||||
|
||||
return { uploadDocument: upload, loading, data };
|
||||
};
|
||||
|
||||
export const useFetchDocumentList = (loop = true) => {
|
||||
|
||||
@@ -242,6 +242,7 @@ export default function Dataset() {
|
||||
onOk={onDocumentUploadOk}
|
||||
loading={documentUploadLoading}
|
||||
showParseOnCreation
|
||||
isTableParser={knowledgeBase?.chunk_method === 'table'}
|
||||
></FileUploadDialog>
|
||||
)}
|
||||
{createVisible && (
|
||||
|
||||
@@ -17,9 +17,27 @@ export const useHandleUploadDocument = () => {
|
||||
const { runDocumentByIds } = useRunDocument();
|
||||
|
||||
const onDocumentUploadOk = useCallback(
|
||||
async ({ fileList, parseOnCreation }: UploadFormSchemaType) => {
|
||||
async ({
|
||||
fileList,
|
||||
parseOnCreation,
|
||||
tableColumnMode,
|
||||
tableColumnRoles,
|
||||
}: UploadFormSchemaType) => {
|
||||
if (fileList.length > 0) {
|
||||
const ret = await uploadDocument(fileList);
|
||||
// Build parser_config if column roles are configured
|
||||
let parserConfig: Record<string, any> | undefined;
|
||||
if (
|
||||
tableColumnMode === 'manual' &&
|
||||
tableColumnRoles &&
|
||||
Object.keys(tableColumnRoles).length > 0
|
||||
) {
|
||||
parserConfig = {
|
||||
table_column_mode: 'manual',
|
||||
table_column_roles: tableColumnRoles,
|
||||
};
|
||||
}
|
||||
|
||||
const ret = await uploadDocument(fileList as File[], parserConfig);
|
||||
|
||||
// Check for success (code === 0) or partial success (code === 500 with some files)
|
||||
const isSuccess = ret?.code === 0;
|
||||
|
||||
76
web/src/utils/table-column-extract.ts
Normal file
76
web/src/utils/table-column-extract.ts
Normal file
@@ -0,0 +1,76 @@
|
||||
import Papa from 'papaparse';
|
||||
import * as XLSX from 'xlsx';
|
||||
|
||||
/**
 * Extracts column headers from a CSV or Excel file.
 *
 * The format is chosen from the file name's extension (case-insensitive):
 * `csv` is handled by `extractCsvColumns`, `xlsx` / `xls` by
 * `extractExcelColumns`. A name that contains no `.` has no extension and is
 * treated as unsupported, so a file literally named `csv` is not parsed.
 *
 * @param file - The file whose header row should be read.
 * @returns The header names, or an empty array if the file type is not
 * supported or headers cannot be read.
 */
export async function extractTableColumns(file: File): Promise<string[]> {
  const dotIndex = file.name.lastIndexOf('.');
  // Without a dot there is no extension; do not mistake the whole name for one.
  const ext =
    dotIndex === -1 ? '' : file.name.slice(dotIndex + 1).toLowerCase();

  switch (ext) {
    case 'csv':
      return extractCsvColumns(file);
    case 'xlsx':
    case 'xls':
      return extractExcelColumns(file);
    default:
      return [];
  }
}
|
||||
|
||||
/**
 * Reads the header names of a CSV file with Papa Parse.
 *
 * Parsing runs with `header: true` and `preview: 1`, and the names are taken
 * from `results.meta.fields`. Names that are empty or whitespace-only are
 * dropped; the remaining names are returned as reported (not trimmed).
 * A failure reported through the `error` callback resolves to an empty array.
 */
function extractCsvColumns(file: File): Promise<string[]> {
  return new Promise((resolve) => {
    Papa.parse(file, {
      header: true,
      preview: 1, // Stop after the first row; only the header names are needed
      skipEmptyLines: true,
      complete: (results) => {
        const headerNames = results.meta?.fields ?? [];
        const nonBlank = headerNames.filter((name) => name.trim().length > 0);
        resolve(nonBlank);
      },
      error: () => {
        resolve([]);
      },
    });
  });
}
|
||||
|
||||
/**
 * Reads the header row of the first worksheet in an Excel file.
 *
 * The file is loaded as an ArrayBuffer through `FileReader` and opened with
 * `XLSX.read` using `sheetRows: 1`. Each cell of the first row is converted to
 * a string and trimmed, and empty results are dropped. A read error, a
 * workbook without sheets, a sheet without rows, or any exception thrown while
 * parsing all resolve to an empty array.
 */
function extractExcelColumns(file: File): Promise<string[]> {
  return new Promise((resolve) => {
    const reader = new FileReader();

    reader.onerror = () => resolve([]);

    reader.onload = (e) => {
      let headers: string[] = [];
      try {
        const bytes = new Uint8Array(e.target?.result as ArrayBuffer);
        const workbook = XLSX.read(bytes, { type: 'array', sheetRows: 1 });
        const firstSheetName = workbook.SheetNames[0];
        if (firstSheetName) {
          const rows = XLSX.utils.sheet_to_json<string[]>(
            workbook.Sheets[firstSheetName],
            { header: 1 },
          );
          if (rows.length > 0) {
            headers = rows[0]
              .map((cell) => String(cell ?? '').trim())
              .filter((cell) => cell.length > 0);
          }
        }
      } catch {
        // Anything thrown while parsing means the headers are unreadable.
        headers = [];
      }
      resolve(headers);
    };

    reader.readAsArrayBuffer(file);
  });
}
|
||||
|
||||
/**
 * Check if a file is a table file (CSV or Excel).
 *
 * The decision is based only on the file name's extension, compared
 * case-insensitively against `csv`, `xlsx` and `xls`. A name that contains no
 * `.` has no extension and is never a table file, so a file literally named
 * `csv` returns `false`.
 *
 * @param file - The file to classify.
 * @returns `true` when the extension is `csv`, `xlsx` or `xls`.
 */
export function isTableFile(file: File): boolean {
  const dotIndex = file.name.lastIndexOf('.');
  if (dotIndex === -1) {
    // No dot: the whole name must not be mistaken for an extension.
    return false;
  }
  const ext = file.name.slice(dotIndex + 1).toLowerCase();
  return ext === 'csv' || ext === 'xlsx' || ext === 'xls';
}
|
||||
Reference in New Issue
Block a user