From 39b194453dd85336dd5a4ec6640e2d38f31ce775 Mon Sep 17 00:00:00 2001 From: Ju Boxiang <140971685+Dr1985@users.noreply.github.com> Date: Wed, 24 Jun 2026 13:20:07 +0800 Subject: [PATCH] Fix: paginate get_flatted_meta_by_kbs to support datasets with >10k documents (#16034) (#16095) --- api/db/services/doc_metadata_service.py | 66 +++++++++++++++---------- 1 file changed, 40 insertions(+), 26 deletions(-) diff --git a/api/db/services/doc_metadata_service.py b/api/db/services/doc_metadata_service.py index ccc66925bd..f07c8fb052 100644 --- a/api/db/services/doc_metadata_service.py +++ b/api/db/services/doc_metadata_service.py @@ -774,31 +774,40 @@ class DocMetadataService: condition = {"kb_id": kb_ids} order_by = OrderByExpr() + if not settings.DOC_ENGINE_INFINITY: + order_by.asc("id") - # Query with large limit - results = settings.docStoreConn.search( - select_fields=["*"], # Get all fields - highlight_fields=[], - condition=condition, - match_expressions=[], - order_by=order_by, - offset=0, - limit=10000, - index_names=index_name, - knowledgebase_ids=kb_ids - ) + # Paginate to support datasets with more than 10,000 documents. + page_size = 1000 + offset = 0 + all_results = [] + while True: + batch = settings.docStoreConn.search( + select_fields=["*"], + highlight_fields=[], + condition=condition, + match_expressions=[], + order_by=order_by, + offset=offset, + limit=page_size, + index_names=index_name, + knowledgebase_ids=kb_ids + ) + batch_docs = list(cls._iter_search_results(batch)) + if not batch_docs: + break + all_results.extend(batch_docs) + logging.debug( + "[get_flatted_meta_by_kbs] offset=%d batch=%d total=%d kb_ids=%s", + offset, len(batch_docs), len(all_results), kb_ids, + ) + if len(batch_docs) < page_size: + break + offset += page_size - logging.debug(f"[get_flatted_meta_by_kbs] index_name: {index_name}, kb_ids: {kb_ids}") - logging.debug(f"[get_flatted_meta_by_kbs] results type: {type(results)}") - - # Aggregate metadata + # Aggregate metadata over all retrieved results meta = {} - doc_count = 0 - - # Use helper to iterate over results in any format - for doc_id, doc in cls._iter_search_results(results): - doc_count += 1 - # Extract metadata fields (exclude system fields) + for doc_id, doc in all_results: doc_meta = cls._extract_metadata(doc) for k, v in doc_meta.items(): @@ -814,14 +823,19 @@ class DocMetadataService: meta[k][sv] = [] meta[k][sv].append(doc_id) - if doc_count >= 10000: - logging.warning(f"[get_flatted_meta_by_kbs] Results hit the 10000 limit for KBs {kb_ids}.") + doc_count = len(all_results) + if doc_count >= 100000: + logging.warning( + "[get_flatted_meta_by_kbs] Large result set: %d documents for KBs %s. " + "Consider performance impact.", doc_count, kb_ids, + ) - logging.debug(f"[get_flatted_meta_by_kbs] KBs: {kb_ids}, Returning metadata: {meta}") + logging.debug("[get_flatted_meta_by_kbs] KBs: %s, Retrieved %d documents, metadata: %s", + kb_ids, doc_count, meta) return meta except Exception as e: - logging.error(f"Error getting flattened metadata for KBs {kb_ids}: {e}") + logging.error("Error getting flattened metadata for KBs %s: %s", kb_ids, e) return {} @classmethod