Refactor: Consolidation WEB API & HTTP API for document infos (#14239)

### What problem does this PR solve?

Before consolidation:
- Web API: POST /v1/document/infos
- HTTP API: GET /api/v1/datasets/<dataset_id>/documents

After consolidation, a single RESTful API:
GET /api/v1/datasets/<dataset_id>/documents?ids=id1&ids=id2

### Type of change

- [x] Refactoring
This commit is contained in:
Jack
2026-04-21 19:35:11 +08:00
committed by GitHub
parent 779deadf76
commit 2d05475693
9 changed files with 60 additions and 59 deletions

View File

@@ -75,11 +75,36 @@ def list_document(auth, dataset_id):
return res.json()
def get_docs_info(auth, dataset_id, doc_ids=None, doc_id=None):
    """
    Get document information by IDs.

    Sends GET {HOST_ADDRESS}/api/v1/datasets/<dataset_id>/documents with the
    requested document IDs as query parameters.

    Args:
        auth: Value sent in the "Authorization" request header
        dataset_id: Dataset ID
        doc_ids: List of document IDs (use for multiple) - exclusive with doc_id
        doc_id: Single document ID (use for one) - exclusive with doc_ids

    Returns:
        The JSON-decoded response body.

    Raises:
        ValueError: If both doc_id and doc_ids are provided
    """
    # Validate that id and ids are not used together
    if doc_id and doc_ids:
        raise ValueError("Cannot use both 'id' and 'ids' parameters at the same time.")

    authorization = {"Authorization": auth}

    # Query parameters are kept as a list of (key, value) pairs rather than a
    # dict so that the "ids" key can repeat: ?ids=id1&ids=id2
    params = []
    if doc_ids:
        # Multiple IDs
        params.extend(("ids", one_id) for one_id in doc_ids)
    elif doc_id:
        # Single ID
        params.append(("id", doc_id))
    # When neither argument is given, the request carries no ID filter.

    # Use /api/v1 prefix for dataset API
    url = f"{HOST_ADDRESS}/api/v1/datasets/{dataset_id}/documents"
    res = requests.get(url=url, headers=authorization, params=params)
    return res.json()

View File

@@ -48,14 +48,14 @@ def test_parse_txt_document(get_auth):
for doc in res['data']['docs']:
doc_id_list.append(doc['id'])
res = get_docs_info(get_auth, doc_id_list)
res = get_docs_info(get_auth, dataset_id, doc_ids=doc_id_list)
print(doc_id_list)
doc_count = len(doc_id_list)
res = parse_docs(get_auth, doc_id_list)
start_ts = timer()
while True:
res = get_docs_info(get_auth, doc_id_list)
res = get_docs_info(get_auth, dataset_id, doc_ids=doc_id_list)
finished_count = 0
for doc_info in res['data']:
if doc_info['progress'] == 1: