fix(rag/raptor): handle max_cluster edge case in GMM cluster selection (#16199)

### What problem does this PR solve?
`_get_optimal_clusters` in `rag/raptor.py` had two edge-case issues in
GMM cluster-count selection:
1. It used `np.arange(1, max_clusters)`, which never evaluates the
upper-bound candidate (`max_clusters`).
2. When the effective `max_clusters` was `1`, the candidate list was
empty and `argmin` crashed.

This PR makes candidate evaluation inclusive (`1..max_clusters`) and
guards the single-cluster case by returning `1` directly.

### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)

### Validation
- `pytest test/unit_test/rag/test_raptor_psi_tree_builder.py
--config-file pyproject.toml -q`
- `ruff check rag/raptor.py
test/unit_test/rag/test_raptor_psi_tree_builder.py`

### Tests added
- Regression test for `max_cluster == 1` path (no crash, returns 1)
- Regression test verifying upper-bound candidate is evaluated and can
be selected

_AI-assistance disclosure: parts of this change (bug triage and test
scaffolding) were drafted with AI assistance and fully reviewed and
verified by me._

---------

Co-authored-by: Harsh Kashyap <harshkashyap@Harshs-MacBook-Pro.local>
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Harsh Kashyap
2026-06-23 06:07:26 -07:00
committed by GitHub
parent 706e0d2d06
commit b4a8a90c73
2 changed files with 49 additions and 2 deletions

View File

@@ -207,6 +207,46 @@ def test_unknown_clustering_method_is_rejected(raptor_module):
_make_raptor(raptor_module, clustering_method="psi")
@pytest.mark.p2
def test_get_optimal_clusters_handles_max_cluster_equal_one(raptor_module):
    """With ``max_cluster=1`` the cluster-count selection must yield 1."""
    builder = _make_raptor(raptor_module, max_cluster=1)
    # Four 2-D points forming two tight pairs.
    points = np.array([[0.0, 0.0], [0.1, 0.0], [1.0, 1.0], [1.1, 1.1]])

    chosen = builder._get_optimal_clusters(points, random_state=0)

    assert chosen == 1
@pytest.mark.p2
def test_get_optimal_clusters_evaluates_upper_bound_candidate(monkeypatch, raptor_module):
    """The upper-bound candidate (``max_cluster``) must be fitted and selectable.

    ``GaussianMixture`` on the module under test is replaced by a stub whose
    BIC decreases as the component count grows; the test expects the largest
    candidate (3) to be returned and candidates 1, 2, 3 to be tried in order.
    """
    builder = _make_raptor(raptor_module, max_cluster=3)
    seen_components = []
    # Canned BIC per component count: strictly decreasing, lowest at 3.
    bic_by_components = {1: 30.0, 2: 20.0, 3: 10.0}

    class RecordingGaussianMixture:
        """Stub mixture that records each requested size and returns a canned BIC."""

        def __init__(self, n_components, random_state=None):
            seen_components.append(n_components)
            self.n_components = n_components

        def fit(self, embeddings):
            # No real fitting; return self so calls can be chained.
            return self

        def bic(self, embeddings):
            return bic_by_components[self.n_components]

    monkeypatch.setattr(raptor_module, "GaussianMixture", RecordingGaussianMixture)
    points = np.array([[0.0, 0.0], [0.1, 0.0], [1.0, 1.0], [1.1, 1.1]])

    chosen = builder._get_optimal_clusters(points, random_state=0)

    assert chosen == 3
    assert seen_components == [1, 2, 3]
def test_psi_tree_builder_ranks_all_leaf_pairs_by_original_cosine_similarity(raptor_module):
raptor = _make_raptor(raptor_module)
leaves = [