mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
This commit is contained in:
@@ -774,31 +774,40 @@ class DocMetadataService:
|
||||
|
||||
condition = {"kb_id": kb_ids}
|
||||
order_by = OrderByExpr()
|
||||
if not settings.DOC_ENGINE_INFINITY:
|
||||
order_by.asc("id")
|
||||
|
||||
# Query with large limit
|
||||
results = settings.docStoreConn.search(
|
||||
select_fields=["*"], # Get all fields
|
||||
highlight_fields=[],
|
||||
condition=condition,
|
||||
match_expressions=[],
|
||||
order_by=order_by,
|
||||
offset=0,
|
||||
limit=10000,
|
||||
index_names=index_name,
|
||||
knowledgebase_ids=kb_ids
|
||||
)
|
||||
# Paginate to support datasets with more than 10,000 documents.
|
||||
page_size = 1000
|
||||
offset = 0
|
||||
all_results = []
|
||||
while True:
|
||||
batch = settings.docStoreConn.search(
|
||||
select_fields=["*"],
|
||||
highlight_fields=[],
|
||||
condition=condition,
|
||||
match_expressions=[],
|
||||
order_by=order_by,
|
||||
offset=offset,
|
||||
limit=page_size,
|
||||
index_names=index_name,
|
||||
knowledgebase_ids=kb_ids
|
||||
)
|
||||
batch_docs = list(cls._iter_search_results(batch))
|
||||
if not batch_docs:
|
||||
break
|
||||
all_results.extend(batch_docs)
|
||||
logging.debug(
|
||||
"[get_flatted_meta_by_kbs] offset=%d batch=%d total=%d kb_ids=%s",
|
||||
offset, len(batch_docs), len(all_results), kb_ids,
|
||||
)
|
||||
if len(batch_docs) < page_size:
|
||||
break
|
||||
offset += page_size
|
||||
|
||||
logging.debug(f"[get_flatted_meta_by_kbs] index_name: {index_name}, kb_ids: {kb_ids}")
|
||||
logging.debug(f"[get_flatted_meta_by_kbs] results type: {type(results)}")
|
||||
|
||||
# Aggregate metadata
|
||||
# Aggregate metadata over all retrieved results
|
||||
meta = {}
|
||||
doc_count = 0
|
||||
|
||||
# Use helper to iterate over results in any format
|
||||
for doc_id, doc in cls._iter_search_results(results):
|
||||
doc_count += 1
|
||||
# Extract metadata fields (exclude system fields)
|
||||
for doc_id, doc in all_results:
|
||||
doc_meta = cls._extract_metadata(doc)
|
||||
|
||||
for k, v in doc_meta.items():
|
||||
@@ -814,14 +823,19 @@ class DocMetadataService:
|
||||
meta[k][sv] = []
|
||||
meta[k][sv].append(doc_id)
|
||||
|
||||
if doc_count >= 10000:
|
||||
logging.warning(f"[get_flatted_meta_by_kbs] Results hit the 10000 limit for KBs {kb_ids}.")
|
||||
doc_count = len(all_results)
|
||||
if doc_count >= 100000:
|
||||
logging.warning(
|
||||
"[get_flatted_meta_by_kbs] Large result set: %d documents for KBs %s. "
|
||||
"Consider performance impact.", doc_count, kb_ids,
|
||||
)
|
||||
|
||||
logging.debug(f"[get_flatted_meta_by_kbs] KBs: {kb_ids}, Returning metadata: {meta}")
|
||||
logging.debug("[get_flatted_meta_by_kbs] KBs: %s, Retrieved %d documents, metadata: %s",
|
||||
kb_ids, doc_count, meta)
|
||||
return meta
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Error getting flattened metadata for KBs {kb_ids}: {e}")
|
||||
logging.error("Error getting flattened metadata for KBs %s: %s", kb_ids, e)
|
||||
return {}
|
||||
|
||||
@classmethod
|
||||
|
||||
Reference in New Issue
Block a user