diff --git a/rag/nlp/search.py b/rag/nlp/search.py index e20cece79b..8201bb370c 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -560,6 +560,31 @@ class Dealer: rag_tokenizer.tokenize(ans).split(), rag_tokenizer.tokenize(inst).split()) + @staticmethod + def _rerank_window(page_size: int, top: int = 0) -> int: + """Candidate-window size shared by retrieval's block fetch and slice. + + ``retrieval`` reuses this value BOTH as the backend block size and as + the modulus for extracting a single page from a (re)ranked block:: + + req["page"] = global_offset // window # which block to fetch + begin = global_offset % window # where the page starts + + For those two to agree the window MUST be an exact multiple of + ``page_size``; otherwise blocks and pages drift apart and deep + pagination silently drops results and returns short pages. + + The window targets a provider-friendly pool of ~64 candidates, bounded + by ``top`` when given (i.e. when an external reranker is active), and is + always rounded UP to a whole number of pages to preserve the invariant. + """ + if page_size <= 1: + return min(30, top) if top > 0 else 30 + window = math.ceil(64 / page_size) * page_size + if top > 0: + window = min(window, math.ceil(top / page_size) * page_size) + return window + async def retrieval( self, question, @@ -582,12 +607,12 @@ class Dealer: if not question: return ranks - # Keep the historical windowing strategy by default, but when an external - # reranker is enabled cap candidate count by both top_k and provider-safe 64. - RERANK_LIMIT = math.ceil(64 / page_size) * page_size if page_size > 1 else 1 - RERANK_LIMIT = max(30, RERANK_LIMIT) - if rerank_mdl and top > 0: - RERANK_LIMIT = min(RERANK_LIMIT, top, 64) + # Candidate window for block-based pagination. It MUST stay a multiple + # of page_size so the block fetched (global_offset // RERANK_LIMIT) and + # the in-block page slice (global_offset % RERANK_LIMIT) stay aligned; + # see _rerank_window. When an external reranker is active the pool is + # also bounded by top. + RERANK_LIMIT = self._rerank_window(page_size, top if rerank_mdl else 0) page = max(page, 1) global_offset = (page - 1) * page_size req = { diff --git a/test/unit_test/rag/test_search_pagination.py b/test/unit_test/rag/test_search_pagination.py new file mode 100644 index 0000000000..75a33d1adb --- /dev/null +++ b/test/unit_test/rag/test_search_pagination.py @@ -0,0 +1,135 @@ +# +# Copyright 2024 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Unit tests for the block-based pagination window used by Dealer.retrieval. + +retrieval reuses RERANK_LIMIT both as the backend block size +(req["page"] = global_offset // RERANK_LIMIT) and as the modulus for slicing a +page out of a (re)ranked block (begin = global_offset % RERANK_LIMIT). If the +window is not an exact multiple of page_size, blocks and pages drift apart, so +deep pages silently drop results and come back short. These tests pin that +invariant and verify cross-block pagination loses nothing. +""" +import math +import sys +import types + +import pytest + +# Stub the heavy / circular-importing dependencies before importing search, +# mirroring test_rank_feature_scores.py so the module imports in isolation. +_fake_query = types.ModuleType("rag.nlp.query") + + +class _DummyFulltextQueryer: + pass + + +_fake_query.FulltextQueryer = _DummyFulltextQueryer +sys.modules.setdefault("rag.nlp.query", _fake_query) +sys.modules.setdefault("common.settings", types.ModuleType("common.settings")) + +from rag.nlp.search import Dealer # noqa: E402 + +_rerank_window = Dealer._rerank_window + +# (page_size, top) combinations, including the common page sizes (10, 30) that +# do NOT divide 64 -- the exact case the old `min(..., 64)` clamp broke -- plus +# tiny / large / page-aligned tops. +GRID = [ + (page_size, top) + for page_size in (1, 5, 7, 10, 30, 50, 64) + for top in (0, 5, 30, 50, 55, 64, 100, 1024) +] + + +def _paginate(total, page_size, top, rerank): + """Replay retrieval's block-fetch + in-block slice over `total` candidates. + + Returns the concatenated global positions actually surfaced across every + page, exactly as Dealer.retrieval would emit them. + """ + window = _rerank_window(page_size, top if rerank else 0) + # The backend caps the candidate pool at `top` when an external reranker is + # active; otherwise the whole result set is windowed. + cap = min(total, top) if (rerank and top > 0) else total + surfaced = [] + page = 1 + while (page - 1) * page_size < cap: + global_offset = (page - 1) * page_size + block_index = global_offset // window + block_start = block_index * window + block = list(range(block_start, min(block_start + window, cap))) + begin = global_offset % window + surfaced.extend(block[begin:begin + page_size]) + page += 1 + return window, cap, surfaced + + +@pytest.mark.parametrize("page_size,top", GRID) +def test_window_is_page_aligned(page_size, top): + """The window must be a positive whole multiple of page_size.""" + for rerank in (False, True): + window = _rerank_window(page_size, top if rerank else 0) + assert window >= 1 + if page_size > 1: + assert window % page_size == 0, (page_size, top, rerank, window) + + +@pytest.mark.parametrize("page_size,top", GRID) +def test_pagination_loses_nothing(page_size, top): + """Walking every page reconstructs the candidate pool exactly: in order, + no gaps, no duplicates, and no short interior pages.""" + total = 250 + for rerank in (False, True): + window, cap, surfaced = _paginate(total, page_size, top, rerank) + assert surfaced == list(range(cap)), ( + f"page_size={page_size} top={top} rerank={rerank} window={window} " + f"cap={cap}: missing={sorted(set(range(cap)) - set(surfaced))[:10]} " + f"dups={len(surfaced) != len(set(surfaced))}" + ) + + +@pytest.mark.p1 +def test_reported_regression_page7_not_short(): + """The reported case: page_size=10, top=1024, reranker on. Page 7 (global + offset 60) used to return only 4 of 10 results because the window was + clamped to 64 (not a multiple of 10).""" + page_size, top = 10, 1024 + window = _rerank_window(page_size, top) + assert window % page_size == 0 + assert window >= 64 # still a provider-friendly ~64 candidate pool + + total = 250 + _, cap, surfaced = _paginate(total, page_size, top, rerank=True) + # Page 7 spans global positions 60..69 and must be full and contiguous. + assert surfaced[60:70] == list(range(60, 70)) + assert len(surfaced) == cap + + +@pytest.mark.p1 +def test_matches_legacy_window_on_non_buggy_paths(): + """Where the old formula already produced a page-aligned value, the new + window is unchanged (no behavioral regression on the non-buggy paths).""" + def legacy(page_size, top, rerank): + limit = math.ceil(64 / page_size) * page_size if page_size > 1 else 1 + limit = max(30, limit) + if rerank and top > 0: + limit = min(limit, top, 64) + return limit + + for page_size in (1, 5, 7, 10, 30, 50, 64): + # The non-rerank path was always page-aligned already -> must match. + assert _rerank_window(page_size, 0) == legacy(page_size, 0, False)