mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-05 10:58:34 +08:00
feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)
### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
  - `tree_builder="psi"`: rank-aware Psi-style tree builder using original embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
  - `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing `extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder` are serialized through `parser_config.raptor.ext`, avoiding backend validation errors for extra top-level RAPTOR fields.
Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}
```

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
This commit is contained in:
@@ -21,10 +21,17 @@ export const extractRaptorConfigExt = (
|
||||
max_cluster,
|
||||
random_seed,
|
||||
scope,
|
||||
clustering_method,
|
||||
tree_builder,
|
||||
auto_disable_for_structured_data,
|
||||
ext,
|
||||
...raptorExt
|
||||
} = raptorConfig;
|
||||
const extClusteringMethod = ext?.clustering_method;
|
||||
const normalizedClusteringMethod =
|
||||
clustering_method ?? extClusteringMethod ?? 'gmm';
|
||||
const normalizedTreeBuilder = tree_builder ?? ext?.tree_builder ?? 'raptor';
|
||||
|
||||
return {
|
||||
use_raptor,
|
||||
prompt,
|
||||
@@ -34,7 +41,12 @@ export const extractRaptorConfigExt = (
|
||||
random_seed,
|
||||
scope,
|
||||
auto_disable_for_structured_data,
|
||||
ext: { ...ext, ...raptorExt },
|
||||
ext: {
|
||||
...ext,
|
||||
...raptorExt,
|
||||
clustering_method: normalizedClusteringMethod,
|
||||
tree_builder: normalizedTreeBuilder,
|
||||
},
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
45
web/src/hooks/tests/parser-config-utils.test.ts
Normal file
45
web/src/hooks/tests/parser-config-utils.test.ts
Normal file
@@ -0,0 +1,45 @@
|
||||
import { extractParserConfigExt } from '../parser-config-utils';
|
||||
|
||||
describe('extractParserConfigExt', () => {
|
||||
it('serializes RAPTOR clustering fields through ext for API compatibility', () => {
|
||||
const result = extractParserConfigExt({
|
||||
raptor: {
|
||||
use_raptor: true,
|
||||
prompt: 'Summarize {cluster_content}',
|
||||
max_token: 256,
|
||||
threshold: 0.1,
|
||||
max_cluster: 317,
|
||||
random_seed: 0,
|
||||
scope: 'file',
|
||||
clustering_method: 'ahc',
|
||||
tree_builder: 'raptor',
|
||||
},
|
||||
});
|
||||
|
||||
expect(result?.raptor).not.toHaveProperty('clustering_method');
|
||||
expect(result?.raptor).not.toHaveProperty('tree_builder');
|
||||
expect(result?.raptor?.ext).toMatchObject({
|
||||
clustering_method: 'ahc',
|
||||
tree_builder: 'raptor',
|
||||
});
|
||||
});
|
||||
|
||||
it('preserves existing RAPTOR ext clustering values when the top-level field is absent', () => {
|
||||
const result = extractParserConfigExt({
|
||||
raptor: {
|
||||
max_cluster: 512,
|
||||
ext: {
|
||||
clustering_method: 'ahc',
|
||||
tree_builder: 'raptor',
|
||||
psi_bucket_size: 1024,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(result?.raptor?.ext).toMatchObject({
|
||||
clustering_method: 'ahc',
|
||||
tree_builder: 'raptor',
|
||||
psi_bucket_size: 1024,
|
||||
});
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user