Fix: table parser metadata (#15127)

### What problem does this PR solve?

This PR improves the table upload flow for CSV/Excel files by allowing
table column role configuration at upload time.

Previously, users had to:
1. Upload and parse a table file.
2. Open parser settings and manually set table column roles.
3. Re-parse the file for the roles to take effect.

This was inefficient and required an unnecessary second parse.

With this change:
1. When the knowledge base uses table parsing, the upload dialog
extracts CSV/Excel headers client-side.
2. Users can choose Auto mode or Manual mode.
3. In Manual mode, users can assign per-column roles before upload.
4. The selected parser config is sent with the upload request and
applied server-side during document creation.

Result: configured table column roles are applied from the first parse.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
This commit is contained in:
Ahmad Intisar
2026-05-25 13:05:38 +05:00
committed by GitHub
parent e7d45dd645
commit e6068a7f7e
7 changed files with 316 additions and 26 deletions

View File

@@ -585,9 +585,25 @@ async def _upload_local_documents(kb, tenant_id):
logging.error(msg)
return get_error_data_result(message=msg, code=RetCode.ARGUMENT_ERROR)
# Parse optional parser_config overrides from form data.
# Only the table-column keys may be overridden, and each value must carry the
# JSON type the upload dialog sends (a string mode, an object of roles).
# Anything else is dropped so the dataset-level parser_config applies unchanged.
# NOTE(review): the accepted value types are inferred from the web client's
# payload - confirm no other client sends a different shape.
parser_config_override = None
raw_parser_config = form.get("parser_config")
if raw_parser_config:
    try:
        parsed = json.loads(raw_parser_config)
    except (json.JSONDecodeError, TypeError):
        # Malformed input must not fail the upload, but leave a trace so a
        # silently ignored column configuration can be diagnosed.
        logging.warning("Ignoring malformed parser_config override in upload request")
        parsed = None
    if isinstance(parsed, dict):
        # Whitelisted key -> required type of its value.
        allowed_types = {"table_column_mode": str, "table_column_roles": dict}
        parser_config_override = {
            k: v
            for k, v in parsed.items()
            if k in allowed_types and isinstance(v, allowed_types[k])
        } or None
err, files = await thread_pool_exec(
FileService.upload_document, kb, file_objs, tenant_id,
parent_path=form.get("parent_path")
parent_path=form.get("parent_path"),
parser_config_override=parser_config_override,
)
if err:
msg = "\n".join(err)

View File

@@ -455,7 +455,7 @@ class FileService(CommonService):
@classmethod
@DB.connection_context()
def upload_document(self, kb, file_objs, user_id, src="local", parent_path: str | None = None):
def upload_document(self, kb, file_objs, user_id, src="local", parent_path: str | None = None, parser_config_override: dict | None = None):
root_folder = self.get_root_folder(user_id)
pf_id = root_folder["id"]
self.init_knowledgebase_docs(pf_id, user_id)
@@ -464,6 +464,13 @@ class FileService(CommonService):
safe_parent_path = sanitize_path(parent_path)
# Layer the per-upload override (when a non-empty dict was supplied) over the
# dataset's parser_config; otherwise the dataset config is used as-is.
base_parser_config = kb.parser_config or {}
use_override = bool(parser_config_override) and isinstance(parser_config_override, dict)
merged_parser_config = (
    {**base_parser_config, **parser_config_override} if use_override else base_parser_config
)
err, files = [], []
for file in file_objs:
doc_id = file.id if hasattr(file, "id") else get_uuid()
@@ -529,7 +536,7 @@ class FileService(CommonService):
"kb_id": kb.id,
"parser_id": self.get_parser(filetype, filename, kb.parser_id),
"pipeline_id": kb.pipeline_id,
"parser_config": kb.parser_config,
"parser_config": merged_parser_config,
"created_by": user_id,
"type": filetype,
"name": filename,

View File

@@ -6,17 +6,42 @@ import {
DialogHeader,
DialogTitle,
} from '@/components/ui/dialog';
import {
Select,
SelectContent,
SelectItem,
SelectTrigger,
SelectValue,
} from '@/components/ui/select';
import { IModalProps } from '@/interfaces/common';
import { extractTableColumns, isTableFile } from '@/utils/table-column-extract';
import { zodResolver } from '@hookform/resolvers/zod';
import { TFunction } from 'i18next';
import { useCallback, useEffect, useState } from 'react';
import { useForm } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { z } from 'zod';
import { FileUploader } from '../file-uploader';
import { RAGFlowFormItem } from '../ragflow-form';
import { Form } from '../ui/form';
import { Label } from '../ui/label';
import { RadioGroup, RadioGroupItem } from '../ui/radio-group';
import { Switch } from '../ui/switch';
/**
 * Roles that can be assigned to a table column, each paired with the i18n key
 * of its label. This list is the single source of truth for the role values.
 */
const ROLE_OPTIONS = [
  { value: 'both', labelKey: 'knowledgeConfiguration.tableColumnRoleBoth' },
  {
    value: 'indexing',
    labelKey: 'knowledgeConfiguration.tableColumnRoleIndexing',
  },
  {
    value: 'metadata',
    labelKey: 'knowledgeConfiguration.tableColumnRoleMetadata',
  },
] as const;

/** Union of role values, derived from ROLE_OPTIONS so the two cannot drift apart. */
type TableColumnRole = (typeof ROLE_OPTIONS)[number]['value'];

/** Maps a column header name to the role selected for it. */
export type TableColumnRoles = Record<string, TableColumnRole>;
function buildUploadFormSchema(t: TFunction) {
const FormSchema = z.object({
parseOnCreation: z.boolean().optional(),
@@ -31,6 +56,10 @@ function buildUploadFormSchema(t: TFunction) {
),
)
.min(1, { message: t('fileManager.pleaseUploadAtLeastOneFile') }),
tableColumnMode: z.enum(['auto', 'manual']).optional(),
tableColumnRoles: z
.record(z.enum(['indexing', 'metadata', 'both']))
.optional(),
});
return FormSchema;
@@ -45,8 +74,13 @@ const UploadFormId = 'UploadFormId';
// Props accepted by UploadForm.
type UploadFormProps = {
// Callback that receives the form values.
// NOTE(review): presumably invoked when the form is submitted; the call site is outside this view.
submit: (values?: UploadFormSchemaType) => void;
// NOTE(review): presumably toggles the parse-on-creation control; its use is outside this view.
showParseOnCreation?: boolean;
// When true, headers are extracted from selected CSV/Excel files and the
// column-mode / column-role controls are shown once any column is found.
isTableParser?: boolean;
};
function UploadForm({ submit, showParseOnCreation }: UploadFormProps) {
function UploadForm({
submit,
showParseOnCreation,
isTableParser,
}: UploadFormProps) {
const { t } = useTranslation();
const FormSchema = buildUploadFormSchema(t);
@@ -56,9 +90,64 @@ function UploadForm({ submit, showParseOnCreation }: UploadFormProps) {
defaultValues: {
parseOnCreation: false,
fileList: [],
tableColumnMode: 'auto',
tableColumnRoles: {},
},
});
const [extractedColumns, setExtractedColumns] = useState<string[]>([]);
const [columnMode, setColumnMode] = useState<'auto' | 'manual'>('auto');
const [columnRoles, setColumnRoles] = useState<TableColumnRoles>({});
// Collects the union of header names across every selected CSV/Excel file so
// each column can be assigned a role before upload. Clears the list when the
// dataset is not using the table parser or no file is selected.
const handleFilesChange = useCallback(
  async (files: any[]) => {
    if (!isTableParser || !files || files.length === 0) {
      setExtractedColumns([]);
      return;
    }
    // Entries are either File objects or wrappers exposing the File as `.file`.
    const tableFiles = files
      .map((f) => (f instanceof File ? f : f.file))
      .filter((file) => file && isTableFile(file));
    // Header extraction is independent per file, so run the reads concurrently.
    // Promise.all keeps results in selection order, so the column order is the
    // same as reading the files one after another.
    const columnsPerFile = await Promise.all(
      tableFiles.map((file) => extractTableColumns(file)),
    );
    const allColumns = new Set<string>();
    for (const cols of columnsPerFile) {
      for (const c of cols) {
        allColumns.add(c);
      }
    }
    setExtractedColumns(Array.from(allColumns));
  },
  [isTableParser],
);
// Mirrors the chosen column mode into the form value and the local state that
// drives which part of the column-config panel is rendered.
const handleModeChange = (value: 'auto' | 'manual') => {
  form.setValue('tableColumnMode', value);
  setColumnMode(value);
};
// Records the role picked for a single column, keeping local state and the
// form value in step.
const handleRoleChange = (col: string, role: string) => {
  const nextRoles = {
    ...columnRoles,
    [col]: role as 'indexing' | 'metadata' | 'both',
  };
  form.setValue('tableColumnRoles', nextRoles);
  setColumnRoles(nextRoles);
};
// Keep the role map (local state and form value) aligned with the columns
// currently extracted from the selected files while in manual mode.
// The map is rebuilt even when no columns remain: otherwise roles from a
// previously selected table file would stay in the form and be submitted
// with an upload that no longer contains those columns.
useEffect(() => {
  if (columnMode !== 'manual') {
    return;
  }
  const roles: TableColumnRoles = {};
  extractedColumns.forEach((col) => {
    // Preserve a role the user already picked; new columns default to 'both'.
    roles[col] = columnRoles[col] ?? 'both';
  });
  setColumnRoles(roles);
  form.setValue('tableColumnRoles', roles);
}, [extractedColumns, columnMode]); // eslint-disable-line react-hooks/exhaustive-deps
const showColumnConfig = isTableParser && extractedColumns.length > 0;
return (
<Form {...form}>
<form
@@ -84,47 +173,117 @@ function UploadForm({ submit, showParseOnCreation }: UploadFormProps) {
{(field) => (
<FileUploader
value={field.value}
onValueChange={field.onChange}
onValueChange={(files) => {
field.onChange(files);
handleFilesChange(files);
}}
accept={{}}
data-testid="dataset-upload-dropzone"
/>
)}
</RAGFlowFormItem>
{showColumnConfig && (
<div className="space-y-3 border rounded-md p-3">
<div className="space-y-2">
<Label className="text-sm font-medium">
{t('knowledgeConfiguration.tableColumnMode')}
</Label>
<RadioGroup
value={columnMode}
onValueChange={handleModeChange}
className="flex gap-4"
>
<div className="flex items-center space-x-2">
<RadioGroupItem value="auto" id="upload-mode-auto" />
<label
htmlFor="upload-mode-auto"
className="text-sm font-normal cursor-pointer"
>
{t('knowledgeConfiguration.tableColumnModeAuto')}
</label>
</div>
<div className="flex items-center space-x-2">
<RadioGroupItem value="manual" id="upload-mode-manual" />
<label
htmlFor="upload-mode-manual"
className="text-sm font-normal cursor-pointer"
>
{t('knowledgeConfiguration.tableColumnModeManual')}
</label>
</div>
</RadioGroup>
</div>
{columnMode === 'auto' && (
<p className="text-sm text-muted-foreground">
{t('knowledgeConfiguration.tableColumnModeAutoDescription')}
</p>
)}
{columnMode === 'manual' && (
<div className="space-y-2">
<p className="text-sm text-muted-foreground">
{t('knowledgeConfiguration.tableColumnRolesTip')}
</p>
<div className="space-y-2 max-h-[200px] overflow-y-auto">
{extractedColumns.map((col) => (
<div key={col} className="flex items-center gap-3">
<Label className="min-w-[120px] shrink-0 text-sm font-normal truncate">
{col}
</Label>
<Select
value={columnRoles[col] || 'both'}
onValueChange={(value) => handleRoleChange(col, value)}
>
<SelectTrigger className="w-[140px]">
<SelectValue />
</SelectTrigger>
<SelectContent>
{ROLE_OPTIONS.map((opt) => (
<SelectItem key={opt.value} value={opt.value}>
{t(opt.labelKey)}
</SelectItem>
))}
</SelectContent>
</Select>
</div>
))}
</div>
</div>
)}
</div>
)}
</form>
</Form>
);
}
type FileUploadDialogProps = IModalProps<UploadFormSchemaType> &
Pick<UploadFormProps, 'showParseOnCreation'>;
Pick<UploadFormProps, 'showParseOnCreation' | 'isTableParser'>;
export function FileUploadDialog({
hideModal,
onOk,
loading,
showParseOnCreation = false,
isTableParser = false,
}: FileUploadDialogProps) {
const { t } = useTranslation();
return (
<Dialog open onOpenChange={hideModal}>
<DialogContent data-testid="dataset-upload-modal">
<DialogContent
data-testid="dataset-upload-modal"
className="max-h-[85vh] overflow-y-auto"
>
<DialogHeader>
<DialogTitle>{t('fileManager.uploadFile')}</DialogTitle>
</DialogHeader>
{/* <Tabs defaultValue="account">
<TabsList className="grid w-full grid-cols-2 mb-4">
<TabsTrigger value="account">{t('fileManager.local')}</TabsTrigger>
<TabsTrigger value="password">{t('fileManager.s3')}</TabsTrigger>
</TabsList>
<TabsContent value="account">
<UploadForm
submit={onOk!}
showParseOnCreation={showParseOnCreation}
></UploadForm>
</TabsContent>
<TabsContent value="password">{t('common.comingSoon')}</TabsContent>
</Tabs> */}
<UploadForm submit={onOk!} showParseOnCreation={showParseOnCreation} />
<UploadForm
submit={onOk!}
showParseOnCreation={showParseOnCreation}
isTableParser={isTableParser}
/>
<DialogFooter>
<ButtonLoading type="submit" loading={loading} form={UploadFormId}>
{t('common.save')}

View File

@@ -69,9 +69,13 @@ export const useUploadNextDocument = () => {
data,
isPending: loading,
mutateAsync,
} = useMutation<ResponseType<IDocumentInfo[]>, Error, File[]>({
} = useMutation<
ResponseType<IDocumentInfo[]>,
Error,
{ fileList: File[]; parserConfig?: Record<string, any> }
>({
mutationKey: [DocumentApiAction.UploadDocument],
mutationFn: async (fileList) => {
mutationFn: async ({ fileList, parserConfig }) => {
if (!id) {
return { code: 500, message: 'Dataset ID is required' };
}
@@ -79,6 +83,9 @@ export const useUploadNextDocument = () => {
fileList.forEach((file: any) => {
formData.append('file', file);
});
if (parserConfig) {
formData.append('parser_config', JSON.stringify(parserConfig));
}
try {
const ret = await uploadDocument(id, formData);
@@ -100,7 +107,13 @@ export const useUploadNextDocument = () => {
},
});
return { uploadDocument: mutateAsync, loading, data };
const upload = useCallback(
(fileList: File[], parserConfig?: Record<string, any>) =>
mutateAsync({ fileList, parserConfig }),
[mutateAsync],
);
return { uploadDocument: upload, loading, data };
};
export const useFetchDocumentList = (loop = true) => {

View File

@@ -242,6 +242,7 @@ export default function Dataset() {
onOk={onDocumentUploadOk}
loading={documentUploadLoading}
showParseOnCreation
isTableParser={knowledgeBase?.chunk_method === 'table'}
></FileUploadDialog>
)}
{createVisible && (

View File

@@ -17,9 +17,27 @@ export const useHandleUploadDocument = () => {
const { runDocumentByIds } = useRunDocument();
const onDocumentUploadOk = useCallback(
async ({ fileList, parseOnCreation }: UploadFormSchemaType) => {
async ({
fileList,
parseOnCreation,
tableColumnMode,
tableColumnRoles,
}: UploadFormSchemaType) => {
if (fileList.length > 0) {
const ret = await uploadDocument(fileList);
// Build parser_config if column roles are configured
let parserConfig: Record<string, any> | undefined;
if (
tableColumnMode === 'manual' &&
tableColumnRoles &&
Object.keys(tableColumnRoles).length > 0
) {
parserConfig = {
table_column_mode: 'manual',
table_column_roles: tableColumnRoles,
};
}
const ret = await uploadDocument(fileList as File[], parserConfig);
// Check for success (code === 0) or partial success (code === 500 with some files)
const isSuccess = ret?.code === 0;

View File

@@ -0,0 +1,76 @@
import Papa from 'papaparse';
import * as XLSX from 'xlsx';
/**
 * Extracts column headers from a CSV or Excel file.
 *
 * The format is chosen from the file-name extension (case-insensitive):
 * `csv` is read as CSV, `xlsx` / `xls` as Excel.
 *
 * @param file - File whose header row should be read.
 * @returns The header names, or an empty array if the file type is not
 *   supported or headers cannot be read.
 */
export async function extractTableColumns(file: File): Promise<string[]> {
  // A name without a dot has no extension. `split('.').pop()` would return the
  // whole name, so a file literally called "csv" would be parsed as CSV.
  const dotIndex = file.name.lastIndexOf('.');
  const ext =
    dotIndex === -1 ? '' : file.name.slice(dotIndex + 1).toLowerCase();
  if (ext === 'csv') {
    return extractCsvColumns(file);
  }
  if (ext === 'xlsx' || ext === 'xls') {
    return extractExcelColumns(file);
  }
  return [];
}
/**
 * Reads the header names of a CSV file with PapaParse.
 * Never rejects: a parse error resolves to an empty array.
 * Header names that are blank after trimming are dropped.
 */
function extractCsvColumns(file: File): Promise<string[]> {
  const isNonBlank = (name: string): boolean => name.trim().length > 0;

  return new Promise((resolve) => {
    Papa.parse(file, {
      header: true,
      preview: 1, // Only read the first row (header)
      skipEmptyLines: true,
      complete: (results) => {
        const headerNames = results.meta?.fields ?? [];
        resolve(headerNames.filter(isNonBlank));
      },
      error: () => {
        resolve([]);
      },
    });
  });
}
/**
 * Reads the first row of the first worksheet of an Excel file and returns its
 * non-blank cells as trimmed header names.
 * Never rejects: a read or parse failure, a workbook without sheets, or an
 * empty sheet all resolve to an empty array.
 */
function extractExcelColumns(file: File): Promise<string[]> {
  return new Promise((resolve) => {
    const reader = new FileReader();

    reader.onerror = () => resolve([]);

    reader.onload = (event) => {
      let headers: string[] = [];
      try {
        const bytes = new Uint8Array(event.target?.result as ArrayBuffer);
        // sheetRows: 1 asks the reader for the first row only.
        const workbook = XLSX.read(bytes, { type: 'array', sheetRows: 1 });
        const sheetName = workbook.SheetNames[0];
        if (sheetName) {
          const rows = XLSX.utils.sheet_to_json<string[]>(
            workbook.Sheets[sheetName],
            { header: 1 },
          );
          if (rows.length > 0) {
            headers = rows[0]
              .map((cell) => String(cell ?? '').trim())
              .filter((name) => name.length > 0);
          }
        }
      } catch {
        headers = [];
      }
      resolve(headers);
    };

    reader.readAsArrayBuffer(file);
  });
}
/**
 * Check if a file is a table file (CSV or Excel).
 *
 * The decision is based on the file-name extension only (case-insensitive).
 *
 * @param file - File to classify.
 * @returns `true` for `csv`, `xlsx` and `xls` extensions; `false` otherwise,
 *   including names that contain no dot at all.
 */
export function isTableFile(file: File): boolean {
  // Without a dot there is no extension. `split('.').pop()` would return the
  // whole name, so a file literally called "csv" would count as a table file.
  const dotIndex = file.name.lastIndexOf('.');
  if (dotIndex === -1) {
    return false;
  }
  const ext = file.name.slice(dotIndex + 1).toLowerCase();
  return ['csv', 'xlsx', 'xls'].includes(ext);
}