feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
  - `tree_builder="psi"`: rank-aware Psi-style tree builder using
    original embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
  - `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}
```

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
This commit is contained in:
CaptainTimon
2026-05-11 15:42:31 -10:00
committed by GitHub
parent 415169d497
commit 2717ee283f
21 changed files with 1722 additions and 140 deletions

View File

@@ -21,10 +21,17 @@ export const extractRaptorConfigExt = (
max_cluster,
random_seed,
scope,
clustering_method,
tree_builder,
auto_disable_for_structured_data,
ext,
...raptorExt
} = raptorConfig;
const extClusteringMethod = ext?.clustering_method;
const normalizedClusteringMethod =
clustering_method ?? extClusteringMethod ?? 'gmm';
const normalizedTreeBuilder = tree_builder ?? ext?.tree_builder ?? 'raptor';
return {
use_raptor,
prompt,
@@ -34,7 +41,12 @@ export const extractRaptorConfigExt = (
random_seed,
scope,
auto_disable_for_structured_data,
ext: { ...ext, ...raptorExt },
ext: {
...ext,
...raptorExt,
clustering_method: normalizedClusteringMethod,
tree_builder: normalizedTreeBuilder,
},
};
};

View File

@@ -0,0 +1,45 @@
import { extractParserConfigExt } from '../parser-config-utils';
/**
 * Specs for how `extractParserConfigExt` serializes the RAPTOR
 * `clustering_method` / `tree_builder` settings: both are expected under
 * `raptor.ext`, never as top-level `raptor` properties.
 */
describe('extractParserConfigExt', () => {
  it('serializes RAPTOR clustering fields through ext for API compatibility', () => {
    // The values given at the top level of the input must reappear under ext.
    const relocatedFields = {
      clustering_method: 'ahc',
      tree_builder: 'raptor',
    };

    const serialized = extractParserConfigExt({
      raptor: {
        use_raptor: true,
        prompt: 'Summarize {cluster_content}',
        max_token: 256,
        threshold: 0.1,
        max_cluster: 317,
        random_seed: 0,
        scope: 'file',
        clustering_method: 'ahc',
        tree_builder: 'raptor',
      },
    });
    const raptorSection = serialized?.raptor;

    // Neither field may remain as a direct property of the RAPTOR section.
    ['clustering_method', 'tree_builder'].forEach((fieldName) => {
      expect(raptorSection).not.toHaveProperty(fieldName);
    });
    expect(raptorSection?.ext).toMatchObject(relocatedFields);
  });

  it('preserves existing RAPTOR ext clustering values when the top-level field is absent', () => {
    // Everything already stored in ext (including unrelated keys such as
    // psi_bucket_size) is expected to come back unchanged.
    const untouchedExt = {
      clustering_method: 'ahc',
      tree_builder: 'raptor',
      psi_bucket_size: 1024,
    };

    const serialized = extractParserConfigExt({
      raptor: {
        max_cluster: 512,
        ext: {
          clustering_method: 'ahc',
          tree_builder: 'raptor',
          psi_bucket_size: 1024,
        },
      },
    });

    expect(serialized?.raptor?.ext).toMatchObject(untouchedExt);
  });
});