feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
  - `tree_builder="psi"`: rank-aware Psi-style tree builder using original
    embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
  - `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- RAPTOR UI settings for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}
```

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
This commit is contained in:
CaptainTimon
2026-05-11 15:42:31 -10:00
committed by GitHub
parent 415169d497
commit 2717ee283f
21 changed files with 1722 additions and 140 deletions

View File

@@ -17,7 +17,7 @@ import { DocumentParserType, ParseType } from '@/constants/knowledge';
import { useFetchKnowledgeBaseConfiguration } from '@/hooks/use-knowledge-request';
import { IModalProps } from '@/interfaces/common';
import { IParserConfig } from '@/interfaces/database/document';
import { IChangeParserConfigRequestBody } from '@/interfaces/request/document';
import { IChangeParserRequestBody } from '@/interfaces/request/document';
import { MetadataType } from '@/pages/dataset/components/metedata/constant';
import {
AutoMetadata,
@@ -28,7 +28,6 @@ import {
} from '@/pages/dataset/dataset-setting/configuration/common-item';
import { zodResolver } from '@hookform/resolvers/zod';
import omit from 'lodash/omit';
import {} from 'module';
import { useEffect, useMemo } from 'react';
import { useForm, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
@@ -56,10 +55,7 @@ import {
const FormId = 'ChunkMethodDialogForm';
interface IProps extends IModalProps<{
parserId: string;
parserConfig: IChangeParserConfigRequestBody;
}> {
interface IProps extends IModalProps<IChangeParserRequestBody> {
loading: boolean;
parserId: string;
pipelineId?: string;
@@ -126,16 +122,19 @@ export function ChunkMethodDialog({
mineru_formula_enable: z.boolean().optional(),
mineru_table_enable: z.boolean().optional(),
mineru_lang: z.string().optional(),
// raptor: z
// .object({
// use_raptor: z.boolean().optional(),
// prompt: z.string().optional().optional(),
// max_token: z.coerce.number().optional(),
// threshold: z.coerce.number().optional(),
// max_cluster: z.coerce.number().optional(),
// random_seed: z.coerce.number().optional(),
// })
// .optional(),
raptor: z
.object({
use_raptor: z.boolean().optional(),
prompt: z.string().optional(),
max_token: z.coerce.number().optional(),
threshold: z.coerce.number().optional(),
max_cluster: z.coerce.number().optional(),
random_seed: z.coerce.number().optional(),
scope: z.string().optional(),
clustering_method: z.enum(['gmm', 'ahc']).optional(),
tree_builder: z.enum(['raptor', 'psi']).optional(),
})
.optional(),
// graphrag: z.object({
// use_graphrag: z.boolean().optional(),
// }),

View File

@@ -23,14 +23,17 @@ export function useDefaultParserValues() {
mineru_formula_enable: true,
mineru_table_enable: true,
mineru_lang: 'English',
// raptor: {
// use_raptor: false,
// prompt: t('knowledgeConfiguration.promptText'),
// max_token: 256,
// threshold: 0.1,
// max_cluster: 64,
// random_seed: 0,
// },
raptor: {
use_raptor: false,
prompt: t('knowledgeConfiguration.promptText'),
max_token: 256,
threshold: 0.1,
max_cluster: 64,
random_seed: 0,
scope: 'file',
clustering_method: 'gmm',
tree_builder: 'raptor',
},
// graphrag: {
// use_graphrag: false,
// },

View File

@@ -8,7 +8,7 @@ import {
} from '@/pages/dataset/dataset/generate-button/generate';
import random from 'lodash/random';
import { Shuffle } from 'lucide-react';
import { useCallback } from 'react';
import { useCallback, useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form';
import { SliderInputFormField } from '../slider-input-form-field';
import {
@@ -50,10 +50,10 @@ export const showTagItems = (parserId: DocumentParserType) => {
const UseRaptorField = 'parser_config.raptor.use_raptor';
const RandomSeedField = 'parser_config.raptor.random_seed';
const MaxTokenField = 'parser_config.raptor.max_token';
const ThresholdField = 'parser_config.raptor.threshold';
const MaxCluster = 'parser_config.raptor.max_cluster';
const Prompt = 'parser_config.raptor.prompt';
const ClusteringMethodField = 'parser_config.raptor.clustering_method';
const ClusteringMethodExtField = 'parser_config.raptor.ext.clustering_method';
const TreeBuilderField = 'parser_config.raptor.tree_builder';
const MaxClusterMax = 1024;
// The three types "table", "resume" and "one" do not display this configuration.
@@ -67,17 +67,48 @@ const RaptorFormFields = ({
const form = useFormContext();
const { t } = useTranslate('knowledgeConfiguration');
const useRaptor = useWatch({ name: UseRaptorField });
const clusteringMethod = useWatch({ name: ClusteringMethodField });
const extClusteringMethod = useWatch({ name: ClusteringMethodExtField });
const selectedClusteringMethod = useMemo(
() =>
(clusteringMethod ??
extClusteringMethod ??
form.getValues(ClusteringMethodField) ??
form.getValues(ClusteringMethodExtField) ??
'gmm') as 'gmm' | 'ahc',
[clusteringMethod, extClusteringMethod, form],
);
const handleGenerate = useCallback(() => {
form.setValue(RandomSeedField, random(10000));
}, [form]);
const handleClusteringMethodChange = useCallback(
(method: 'gmm' | 'ahc') => {
form.setValue(ClusteringMethodField, method, {
shouldDirty: true,
shouldValidate: true,
});
form.setValue(TreeBuilderField, 'raptor', {
shouldDirty: true,
shouldValidate: true,
});
},
[form],
);
useEffect(() => {
if (!clusteringMethod && !extClusteringMethod) {
handleClusteringMethodChange('gmm');
}
}, [clusteringMethod, extClusteringMethod, handleClusteringMethodChange]);
return (
<>
<FormField
control={form.control}
name={UseRaptorField}
render={({ field }) => {
render={() => {
return (
<FormItem
defaultChecked={false}
@@ -209,11 +240,61 @@ const RaptorFormFields = ({
sliderTestId="ds-settings-raptor-threshold-slider"
numberInputTestId="ds-settings-raptor-threshold-input"
></SliderInputFormField>
<FormField
control={form.control}
name={ClusteringMethodField}
render={({ field }) => {
return (
<FormItem className=" items-center space-y-0 ">
<div className="flex items-start">
<FormLabel
tooltip={t('clusteringMethodTip')}
className="text-sm whitespace-nowrap w-1/4"
>
{t('clusteringMethod')}
</FormLabel>
<div className="w-3/4">
<FormControl>
<Radio.Group
{...field}
value={selectedClusteringMethod}
onChange={(value) =>
handleClusteringMethodChange(value as 'gmm' | 'ahc')
}
>
<div
className={'flex gap-4 w-full text-text-secondary '}
>
<Radio
value="gmm"
testId="ds-settings-raptor-clustering-method-option-gmm"
>
{t('clusteringMethodGmm')}
</Radio>
<Radio
value="ahc"
testId="ds-settings-raptor-clustering-method-option-ahc"
>
{t('clusteringMethodAhc')}
</Radio>
</div>
</Radio.Group>
</FormControl>
</div>
</div>
<div className="flex pt-1">
<div className="w-1/4"></div>
<FormMessage />
</div>
</FormItem>
);
}}
/>
<SliderInputFormField
name={'parser_config.raptor.max_cluster'}
label={t('maxCluster')}
tooltip={t('maxClusterTip')}
max={1024}
max={MaxClusterMax}
min={1}
layout={FormLayout.Horizontal}
sliderTestId="ds-settings-raptor-max-cluster-slider"

View File

@@ -13,6 +13,7 @@ type RadioProps = {
checked?: boolean;
disabled?: boolean;
onChange?: (checked: boolean) => void;
testId?: string;
children?: React.ReactNode;
} & Omit<
React.InputHTMLAttributes<HTMLInputElement>,
@@ -25,6 +26,7 @@ function Radio({
checked,
disabled,
onChange,
testId,
children,
...props
}: RadioProps) {
@@ -65,6 +67,7 @@ function Radio({
onChange={handleChange}
disabled={mergedDisabled}
className={cn('peer absolute size-[1px] opacity-0', className)}
data-testid={testId}
{...props}
name={groupContext?.name}
/>
@@ -151,9 +154,11 @@ const Group = React.forwardRef<HTMLDivElement, RadioGroupProps>(
)}
>
{React.Children.map(children, (child) => {
if (!React.isValidElement<RadioProps>(child)) return child;
if (!React.isValidElement<RadioProps>(child)) {
return child;
}
return React.cloneElement(child, {
disabled: disabled || child.props?.disabled,
disabled: disabled || child.props.disabled,
});
})}
</div>