Fix: paginate get_flatted_meta_by_kbs to support datasets with >10k documents (#16034) (#16095)

This commit is contained in:
Ju Boxiang
2026-06-24 13:20:07 +08:00
committed by GitHub
parent 14565b289a
commit 39b194453d

View File

@@ -774,31 +774,40 @@ class DocMetadataService:
condition = {"kb_id": kb_ids}
order_by = OrderByExpr()
if not settings.DOC_ENGINE_INFINITY:
order_by.asc("id")
# Query with large limit
results = settings.docStoreConn.search(
select_fields=["*"], # Get all fields
highlight_fields=[],
condition=condition,
match_expressions=[],
order_by=order_by,
offset=0,
limit=10000,
index_names=index_name,
knowledgebase_ids=kb_ids
)
# Paginate to support datasets with more than 10,000 documents.
page_size = 1000
offset = 0
all_results = []
while True:
batch = settings.docStoreConn.search(
select_fields=["*"],
highlight_fields=[],
condition=condition,
match_expressions=[],
order_by=order_by,
offset=offset,
limit=page_size,
index_names=index_name,
knowledgebase_ids=kb_ids
)
batch_docs = list(cls._iter_search_results(batch))
if not batch_docs:
break
all_results.extend(batch_docs)
logging.debug(
"[get_flatted_meta_by_kbs] offset=%d batch=%d total=%d kb_ids=%s",
offset, len(batch_docs), len(all_results), kb_ids,
)
if len(batch_docs) < page_size:
break
offset += page_size
logging.debug(f"[get_flatted_meta_by_kbs] index_name: {index_name}, kb_ids: {kb_ids}")
logging.debug(f"[get_flatted_meta_by_kbs] results type: {type(results)}")
# Aggregate metadata
# Aggregate metadata over all retrieved results
meta = {}
doc_count = 0
# Use helper to iterate over results in any format
for doc_id, doc in cls._iter_search_results(results):
doc_count += 1
# Extract metadata fields (exclude system fields)
for doc_id, doc in all_results:
doc_meta = cls._extract_metadata(doc)
for k, v in doc_meta.items():
@@ -814,14 +823,19 @@ class DocMetadataService:
meta[k][sv] = []
meta[k][sv].append(doc_id)
if doc_count >= 10000:
logging.warning(f"[get_flatted_meta_by_kbs] Results hit the 10000 limit for KBs {kb_ids}.")
doc_count = len(all_results)
if doc_count >= 100000:
logging.warning(
"[get_flatted_meta_by_kbs] Large result set: %d documents for KBs %s. "
"Consider performance impact.", doc_count, kb_ids,
)
logging.debug(f"[get_flatted_meta_by_kbs] KBs: {kb_ids}, Returning metadata: {meta}")
logging.debug("[get_flatted_meta_by_kbs] KBs: %s, Retrieved %d documents, metadata: %s",
kb_ids, doc_count, meta)
return meta
except Exception as e:
logging.error(f"Error getting flattened metadata for KBs {kb_ids}: {e}")
logging.error("Error getting flattened metadata for KBs %s: %s", kb_ids, e)
return {}
@classmethod