Files
ragflow/api/apps/sdk/doc.py

575 lines
22 KiB
Python
Raw Normal View History

#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
Feat/configurable metadata display (#13464) ### What problem does this PR solve? Currently, RAGFlow's Search and Chat interfaces display only raw vectorized text chunks during retrieval, without contextual information about their source documents. Users cannot see document titles, page numbers, upload dates, or custom metadata fields that would help them understand and trust the retrieved results. This PR introduces an **optional metadata display feature** that enriches retrieved chunks with document-level metadata in both the Search tab and Chatbot interface. **Key improvements:** - **Search results**: Display document metadata as styled badges beneath chunk snippets - **Chat citations**: Show metadata in citation popovers and reference lists for better source context - **LLM context**: Metadata is injected into the LLM prompt to enable more accurate, citation-aware responses - **External API support**: Applications using RAGFlow's SDK retrieval endpoints (`/v1/retrieval`, `/v1/searchbots/retrieval_test`) can opt-in via request parameters - **User control**: Multi-select dropdown UI allows users to choose which metadata fields to display **Implementation approach:** - ✅ Reuses existing `DocMetadataService` infrastructure (no new database tables or indices) - ✅ Settings stored in existing JSON configuration fields (`search_config.reference_metadata`, `prompt_config.reference_metadata`) - ✅ No database migrations required - ✅ Disabled by default (fully opt-in and backward-compatible) - ✅ Dynamic metadata field selection populated from actual document metadata keys - ✅ Fixed critical bug where Python's builtin `set()` was shadowed by a route handler function **Modified endpoints (all backward-compatible):** - `POST /v1/retrieval` (Public SDK) - `POST /v1/searchbots/retrieval_test` (Searchbots) - `POST /v1/chunk/retrieval_test` (UI/Internal) - Chat completions endpoints (via `extra_body.reference_metadata` or `prompt_config`) ### Type of change - [x] New Feature 
(non-breaking change which adds functionality) ###Images - <img width="879" height="1275" alt="image" src="https://github.com/user-attachments/assets/95b2d731-31ae-45a1-b081-bf5893f52aeb" /> <br><br> <br><br> <img width="1532" height="362" alt="image" src="https://github.com/user-attachments/assets/9cebc65b-b7a7-459f-b25e-3b13fa9b638e" /> <br><br> <br><br> <img width="2586" height="1320" alt="image" src="https://github.com/user-attachments/assets/2153d493-d899-461f-a7a9-041391e07776" /> --------- Co-authored-by: Cursor Agent <cursoragent@cursor.com> Co-authored-by: Attili-sys <Attili-sys@users.noreply.github.com> Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
2026-04-30 18:13:27 +03:00
import logging
from io import BytesIO
from quart import request, send_file
from api.db.db_models import APIToken, Document, Task
from api.db.joint_services.tenant_model_service import get_model_config_by_id, get_model_config_by_type_and_name, get_tenant_default_model_by_type
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from api.db.services.task_service import TaskService, cancel_all_task_of, queue_tasks
from api.db.services.tenant_llm_service import TenantLLMService
from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_request_json, get_result, server_error_response, token_required
from common import settings
from common.constants import LLMType, RetCode, TaskStatus
from common.metadata_utils import convert_conditions, meta_filter
from rag.app.tag import label_question
from rag.nlp import search
from rag.prompts.generator import cross_languages, keyword_extraction
# NOTE(review): presumably the maximum number of files accepted in one upload
# request; it is not referenced in this part of the file — confirm with callers.
MAXIMUM_OF_UPLOADING_FILES = 256
Feat/configurable metadata display (#13464) ### What problem does this PR solve? Currently, RAGFlow's Search and Chat interfaces display only raw vectorized text chunks during retrieval, without contextual information about their source documents. Users cannot see document titles, page numbers, upload dates, or custom metadata fields that would help them understand and trust the retrieved results. This PR introduces an **optional metadata display feature** that enriches retrieved chunks with document-level metadata in both the Search tab and Chatbot interface. **Key improvements:** - **Search results**: Display document metadata as styled badges beneath chunk snippets - **Chat citations**: Show metadata in citation popovers and reference lists for better source context - **LLM context**: Metadata is injected into the LLM prompt to enable more accurate, citation-aware responses - **External API support**: Applications using RAGFlow's SDK retrieval endpoints (`/v1/retrieval`, `/v1/searchbots/retrieval_test`) can opt-in via request parameters - **User control**: Multi-select dropdown UI allows users to choose which metadata fields to display **Implementation approach:** - ✅ Reuses existing `DocMetadataService` infrastructure (no new database tables or indices) - ✅ Settings stored in existing JSON configuration fields (`search_config.reference_metadata`, `prompt_config.reference_metadata`) - ✅ No database migrations required - ✅ Disabled by default (fully opt-in and backward-compatible) - ✅ Dynamic metadata field selection populated from actual document metadata keys - ✅ Fixed critical bug where Python's builtin `set()` was shadowed by a route handler function **Modified endpoints (all backward-compatible):** - `POST /v1/retrieval` (Public SDK) - `POST /v1/searchbots/retrieval_test` (Searchbots) - `POST /v1/chunk/retrieval_test` (UI/Internal) - Chat completions endpoints (via `extra_body.reference_metadata` or `prompt_config`) ### Type of change - [x] New Feature 
(non-breaking change which adds functionality) ###Images - <img width="879" height="1275" alt="image" src="https://github.com/user-attachments/assets/95b2d731-31ae-45a1-b081-bf5893f52aeb" /> <br><br> <br><br> <img width="1532" height="362" alt="image" src="https://github.com/user-attachments/assets/9cebc65b-b7a7-459f-b25e-3b13fa9b638e" /> <br><br> <br><br> <img width="2586" height="1320" alt="image" src="https://github.com/user-attachments/assets/2153d493-d899-461f-a7a9-041391e07776" /> --------- Co-authored-by: Cursor Agent <cursoragent@cursor.com> Co-authored-by: Attili-sys <Attili-sys@users.noreply.github.com> Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
2026-04-30 18:13:27 +03:00
from api.utils.reference_metadata_utils import (
enrich_chunks_with_document_metadata,
resolve_reference_metadata_preferences,
)
def _resolve_reference_metadata(req: dict, search_config: dict | None = None):
    """Resolve the reference-metadata preferences for a request.

    Module-local alias: both arguments are forwarded unchanged to
    ``resolve_reference_metadata_preferences`` and its result is returned
    as-is.

    Args:
        req: Parsed request payload.
        search_config: Optional search configuration; ``None`` when absent.
    """
    preferences = resolve_reference_metadata_preferences(req, search_config)
    return preferences
def _enrich_chunks_with_document_metadata(chunks: list[dict], metadata_fields=None) -> None:
    """Forward to ``enrich_chunks_with_document_metadata`` and return nothing.

    Args:
        chunks: Chunk dictionaries handed to the helper.
            NOTE(review): presumably enriched in place, since the helper's
            return value is discarded here — confirm against the helper.
        metadata_fields: Optional field selection passed through unchanged.
    """
    enrich_chunks_with_document_metadata(chunks, metadata_fields)
    return None
@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["GET"])  # noqa: F821
@token_required
async def download(tenant_id, dataset_id, document_id):
    """
    Download a document from a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    produces:
      - application/octet-stream
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document to download.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Document file stream.
        schema:
          type: file
      400:
        description: Error message.
        schema:
          type: object
    """
    if not document_id:
        return get_error_data_result(message="Specify document_id please.")

    # The dataset must belong to the tenant resolved from the API token.
    owned_datasets = KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id)
    if not owned_datasets:
        return get_error_data_result(message=f"You do not own the dataset {dataset_id}.")

    # The document must live inside that dataset.
    matches = DocumentService.query(kb_id=dataset_id, id=document_id)
    if not matches:
        return get_error_data_result(message=f"The dataset not own the document {document_id}.")

    # Resolve where the file is stored and fetch its bytes.
    bucket, object_name = File2DocumentService.get_storage_address(doc_id=document_id)
    payload = settings.STORAGE_IMPL.get(bucket, object_name)
    if not payload:
        return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR)

    # Stream the bytes back as an attachment named after the document, with a
    # generic binary MIME type.
    return await send_file(
        BytesIO(payload),
        as_attachment=True,
        attachment_filename=matches[0].name,
        mimetype="application/octet-stream",
    )
@manager.route("/documents/<document_id>", methods=["GET"])  # noqa: F821
async def download_doc(document_id):
    # Download a document by ID, authenticated with a beta API token.
    #
    # Flow: read the token from the Authorization header, resolve it to exactly
    # one tenant through APIToken, verify that tenant owns the knowledge base
    # the document belongs to, and only then read the file from storage.
    # Every failure returns an error payload; storage is never touched before
    # the ownership check passes.

    # Default to "" so a missing header yields the same "not valid" response as
    # a malformed one instead of raising AttributeError on None.
    auth_parts = request.headers.get("Authorization", "").split()
    if len(auth_parts) != 2:
        return get_error_data_result(message="Authorization is not valid!")
    token = auth_parts[1]

    logging.info("Beta API token lookup attempted for document download")
    objs = APIToken.query(beta=token)
    if not objs:
        logging.warning("Beta API token lookup failed for document download: invalid API key")
        # NOTE(review): the trailing '"' inside this message looks like a typo;
        # it is kept byte-for-byte because clients or tests may match on it.
        return get_error_data_result(message='Authentication error: API key is invalid!"')
    if len(objs) > 1:
        # A token that maps to several records cannot identify a single tenant.
        logging.error("Beta API token lookup is ambiguous for document download: matches=%s", len(objs))
        return get_error_data_result(message="Authentication error: API key configuration is ambiguous.")
    tenant_id = objs[0].tenant_id
    logging.info("Beta API token authorized for document download: tenant_id=%s", tenant_id)

    if not document_id:
        return get_error_data_result(message="Specify document_id please.")
    doc = DocumentService.query(id=document_id)
    if not doc:
        return get_error_data_result(message=f"The dataset not own the document {document_id}.")

    # Bind the requested document back to the token's tenant before any
    # storage access, so a valid token cannot reach another tenant's files.
    if not KnowledgebaseService.query(id=doc[0].kb_id, tenant_id=tenant_id):
        logging.warning(
            "cross-tenant access denied for document download: tenant_id=%s kb_id=%s document_id=%s",
            tenant_id,
            doc[0].kb_id,
            document_id,
        )
        return get_error_data_result(message="You do not have access to this document.")

    # Resolve where the file is stored and fetch its bytes.
    bucket, object_name = File2DocumentService.get_storage_address(doc_id=document_id)
    file_stream = settings.STORAGE_IMPL.get(bucket, object_name)
    if not file_stream:
        return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR)

    # Send the bytes as an attachment named after the document, with a generic
    # binary MIME type.
    return await send_file(
        BytesIO(file_stream),
        as_attachment=True,
        attachment_filename=doc[0].name,
        mimetype="application/octet-stream",
    )
# Human-readable message and machine-readable error code returned by
# stop_parsing() when a requested document is not in the RUNNING state.
DOC_STOP_PARSING_INVALID_STATE_MESSAGE = "Can't stop parsing document that has not started or already completed"
DOC_STOP_PARSING_INVALID_STATE_ERROR_CODE = "DOC_STOP_PARSING_INVALID_STATE"
@manager.route("/datasets/<dataset_id>/chunks", methods=["POST"])  # noqa: F821
@token_required
async def parse(tenant_id, dataset_id):
    """
    Start parsing documents into chunks.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: body
        name: body
        description: Parsing parameters.
        required: true
        schema:
          type: object
          properties:
            document_ids:
              type: array
              items:
                type: string
              description: List of document IDs to parse.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Parsing started successfully.
        schema:
          type: object
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")

    req = await get_request_json()
    requested_ids = req.get("document_ids")
    if not requested_ids:
        return get_error_data_result("`document_ids` is required")

    # Duplicates are reported back to the caller; each unique ID is handled once.
    unique_doc_ids, duplicate_messages = check_duplicate_ids(requested_ids, "document")

    not_found = []
    success_count = 0
    for doc_id in unique_doc_ids:
        # IDs that do not belong to this dataset are collected and reported
        # after the loop; the remaining documents are still processed.
        if not DocumentService.query(id=doc_id, kb_id=dataset_id):
            not_found.append(doc_id)
            continue

        # Reset the progress counters and mark the document as running, but
        # only when it is not already running; zero updated rows means another
        # parse is in flight for this document.
        info = {"run": "1", "progress": 0, "progress_msg": "", "chunk_num": 0, "token_num": 0}
        updated = DocumentService.filter_update(
            [
                Document.id == doc_id,
                ((Document.run.is_null(True)) | (Document.run != TaskStatus.RUNNING.value)),
            ],
            info,
        )
        if updated == 0:
            return get_error_data_result("Can't parse document that is currently being processed")

        # Drop chunks and tasks left over from a previous run before re-queueing.
        settings.docStoreConn.delete({"doc_id": doc_id}, search.index_name(tenant_id), dataset_id)
        TaskService.filter_delete([Task.doc_id == doc_id])

        # Re-read the row so the queued task sees the values written above.
        _, doc_row = DocumentService.get_by_id(doc_id)
        doc = doc_row.to_dict()
        doc["tenant_id"] = tenant_id
        bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
        queue_tasks(doc, bucket, name, 0)
        success_count += 1

    # Missing documents take precedence over duplicate-ID reporting.
    if not_found:
        return get_result(message=f"Documents not found: {not_found}", code=RetCode.DATA_ERROR)
    if duplicate_messages:
        if success_count > 0:
            return get_result(
                message=f"Partially parsed {success_count} documents with {len(duplicate_messages)} errors",
                data={"success_count": success_count, "errors": duplicate_messages},
            )
        return get_error_data_result(message=";".join(duplicate_messages))
    return get_result()
@manager.route("/datasets/<dataset_id>/chunks", methods=["DELETE"])  # noqa: F821
@token_required
async def stop_parsing(tenant_id, dataset_id):
    """
    Stop parsing documents into chunks.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: body
        name: body
        description: Stop parsing parameters.
        required: true
        schema:
          type: object
          properties:
            document_ids:
              type: array
              items:
                type: string
              description: List of document IDs to stop parsing.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Parsing stopped successfully.
        schema:
          type: object
    """
    dataset_is_accessible = KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id)
    if not dataset_is_accessible:
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")

    payload = await get_request_json()
    if not payload.get("document_ids"):
        return get_error_data_result("`document_ids` is required")

    # Duplicates are reported back to the caller; each unique ID is handled once.
    target_ids, duplicate_messages = check_duplicate_ids(payload.get("document_ids"), "document")

    stopped = 0
    for target_id in target_ids:
        matches = DocumentService.query(id=target_id, kb_id=dataset_id)
        if not matches:
            return get_error_data_result(message=f"You don't own the document {target_id}.")

        document = matches[0]
        # Only a document that is currently running can be stopped.
        if document.run != TaskStatus.RUNNING.value:
            return construct_json_result(
                code=RetCode.DATA_ERROR,
                message=DOC_STOP_PARSING_INVALID_STATE_MESSAGE,
                data={"error_code": DOC_STOP_PARSING_INVALID_STATE_ERROR_CODE},
            )

        # Send cancellation signal via Redis to stop background task
        cancel_all_task_of(target_id)
        # Mark the document as stopped and reset its counters, then remove the
        # chunks already written for it.
        DocumentService.update_by_id(target_id, {"run": "2", "progress": 0, "chunk_num": 0})
        settings.docStoreConn.delete({"doc_id": document.id}, search.index_name(tenant_id), dataset_id)
        stopped += 1

    if not duplicate_messages:
        return get_result()
    if stopped > 0:
        return get_result(
            message=f"Partially stopped {stopped} documents with {len(duplicate_messages)} errors",
            data={"success_count": stopped, "errors": duplicate_messages},
        )
    return get_error_data_result(message=";".join(duplicate_messages))
@manager.route("/retrieval", methods=["POST"]) # noqa: F821
@token_required
async def retrieval_test(tenant_id):
"""
Retrieve chunks based on a query.
---
tags:
- Retrieval
security:
- ApiKeyAuth: []
parameters:
- in: body
name: body
description: Retrieval parameters.
required: true
schema:
type: object
properties:
dataset_ids:
type: array
items:
type: string
required: true
description: List of dataset IDs to search in.
question:
type: string
required: true
description: Query string.
document_ids:
type: array
items:
type: string
description: List of document IDs to filter.
similarity_threshold:
type: number
format: float
description: Similarity threshold.
vector_similarity_weight:
type: number
format: float
description: Vector similarity weight.
top_k:
type: integer
description: Maximum number of chunks to return.
highlight:
type: boolean
description: Whether to highlight matched content.
metadata_condition:
type: object
description: metadata filter condition.
- in: header
name: Authorization
type: string
required: true
description: Bearer token for authentication.
responses:
200:
description: Retrieval results.
schema:
type: object
properties:
chunks:
type: array
items:
type: object
properties:
id:
type: string
description: Chunk ID.
content:
type: string
description: Chunk content.
document_id:
type: string
description: ID of the document.
dataset_id:
type: string
description: ID of the dataset.
similarity:
type: number
format: float
description: Similarity score.
"""
req = await get_request_json()
if not req.get("dataset_ids"):
return get_error_data_result("`dataset_ids` is required.")
kb_ids = req["dataset_ids"]
if not isinstance(kb_ids, list):
return get_error_data_result("`dataset_ids` should be a list")
for id in kb_ids:
if not KnowledgebaseService.accessible(kb_id=id, user_id=tenant_id):
return get_error_data_result(f"You don't own the dataset {id}.")
kbs = KnowledgebaseService.get_by_ids(kb_ids)
embd_nms = list(set([TenantLLMService.split_model_name_and_factory(kb.embd_id)[0] for kb in kbs])) # remove vendor suffix for comparison
if len(embd_nms) != 1:
return get_result(
message='Datasets use different embedding models."',
code=RetCode.DATA_ERROR,
)
if "question" not in req:
return get_error_data_result("`question` is required.")
page = int(req.get("page", 1))
size = int(req.get("page_size", 30))
question = req["question"]
# Trim whitespace and validate question
if isinstance(question, str):
question = question.strip()
# Return empty result if question is empty or whitespace-only
if not question:
return get_result(data={"total": 0, "chunks": [], "doc_aggs": {}})
doc_ids = req.get("document_ids", [])
use_kg = req.get("use_kg", False)
toc_enhance = req.get("toc_enhance", False)
langs = req.get("cross_languages", [])
if not isinstance(doc_ids, list):
return get_error_data_result("`documents` should be a list")
if doc_ids:
doc_ids_list = KnowledgebaseService.list_documents_by_ids(kb_ids)
for doc_id in doc_ids:
if doc_id not in doc_ids_list:
return get_error_data_result(f"The datasets don't own the document {doc_id}")
if not doc_ids:
metadata_condition = req.get("metadata_condition")
if metadata_condition:
metas = DocMetadataService.get_flatted_meta_by_kbs(kb_ids)
doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
# If metadata_condition has conditions but no docs match, return empty result
if not doc_ids and metadata_condition.get("conditions"):
return get_result(data={"total": 0, "chunks": [], "doc_aggs": {}})
if metadata_condition and not doc_ids:
doc_ids = ["-999"]
else:
# If doc_ids is None all documents of the datasets are used
doc_ids = None
similarity_threshold = float(req.get("similarity_threshold", 0.2))
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
top = int(req.get("top_k", 1024))
fix: file logs not displayed in dataset ingestion page (#14479) ### What problem does this PR solve? ## Summary Fixed a bug where the **File Logs** tab in the dataset ingestion page always showed "No logs" even after files were parsed successfully. ## Root Cause Both the **File Logs** and **Dataset Logs** tabs on the frontend called the same backend endpoint `/datasets/{dataset_id}/ingestions`. However, the backend only queried `get_dataset_logs_by_kb_id`, which hard-filtered records by `document_id == GRAPH_RAPTOR_FAKE_DOC_ID` (dataset-level logs). As a result, real file-level logs were never returned, causing the table to appear empty. ## Changes ### Backend - **`api/apps/restful_apis/dataset_api.py`** - Added two new query parameters to `list_ingestion_logs`: - `log_type` — `"file"` or `"dataset"` (default: `"dataset"`) - `keywords` — search keyword for filtering by document / task name - **`api/apps/services/dataset_api_service.py`** - Updated `list_ingestion_logs` signature to accept `log_type` and `keywords`. - Added conditional routing: - When `log_type == "file"`, call `PipelineOperationLogService.get_file_logs_by_kb_id` - Otherwise, call `PipelineOperationLogService.get_dataset_logs_by_kb_id` - **`api/db/services/pipeline_operation_log_service.py`** - Extended `get_dataset_logs_by_kb_id` with an optional `keywords` parameter so dataset logs can also be searched. ### Frontend - **`web/src/pages/dataset/dataset-overview/hook.ts`** - Removed the separate API function switching (`listPipelineDatasetLogs` vs `listDataPipelineLogDocument`). - Unified both tabs to call `listDataPipelineLogDocument` with the new `log_type` query parameter (`"file"` or `"dataset"`). - Ensured `keywords` and filter values are passed through correctly. 
## Behavior After Fix

| Tab | `log_type` | Returned Records | Searchable Field |
|---|---|---|---|
| File Logs | `file` | Real document-level logs | `document_name` (file name) |
| Dataset Logs | `dataset` | GraphRAG / RAPTOR / MindMap logs | `document_name` (task type) |

### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --------- Signed-off-by: noob <yixiao121314@outlook.com> Co-authored-by: Wang Qi <wangq8@outlook.com> Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
2026-04-29 14:10:24 +00:00
if top <= 0:
return get_error_data_result("`top_k` must be greater than 0")
highlight_val = req.get("highlight", None)
if highlight_val is None:
highlight = False
elif isinstance(highlight_val, bool):
highlight = highlight_val
elif isinstance(highlight_val, str):
if highlight_val.lower() in ["true", "false"]:
highlight = highlight_val.lower() == "true"
else:
return get_error_data_result("`highlight` should be a boolean")
else:
return get_error_data_result("`highlight` should be a boolean")
Feat/configurable metadata display (#13464) ### What problem does this PR solve? Currently, RAGFlow's Search and Chat interfaces display only raw vectorized text chunks during retrieval, without contextual information about their source documents. Users cannot see document titles, page numbers, upload dates, or custom metadata fields that would help them understand and trust the retrieved results. This PR introduces an **optional metadata display feature** that enriches retrieved chunks with document-level metadata in both the Search tab and Chatbot interface. **Key improvements:** - **Search results**: Display document metadata as styled badges beneath chunk snippets - **Chat citations**: Show metadata in citation popovers and reference lists for better source context - **LLM context**: Metadata is injected into the LLM prompt to enable more accurate, citation-aware responses - **External API support**: Applications using RAGFlow's SDK retrieval endpoints (`/v1/retrieval`, `/v1/searchbots/retrieval_test`) can opt-in via request parameters - **User control**: Multi-select dropdown UI allows users to choose which metadata fields to display **Implementation approach:** - ✅ Reuses existing `DocMetadataService` infrastructure (no new database tables or indices) - ✅ Settings stored in existing JSON configuration fields (`search_config.reference_metadata`, `prompt_config.reference_metadata`) - ✅ No database migrations required - ✅ Disabled by default (fully opt-in and backward-compatible) - ✅ Dynamic metadata field selection populated from actual document metadata keys - ✅ Fixed critical bug where Python's builtin `set()` was shadowed by a route handler function **Modified endpoints (all backward-compatible):** - `POST /v1/retrieval` (Public SDK) - `POST /v1/searchbots/retrieval_test` (Searchbots) - `POST /v1/chunk/retrieval_test` (UI/Internal) - Chat completions endpoints (via `extra_body.reference_metadata` or `prompt_config`) ### Type of change - [x] New Feature 
(non-breaking change which adds functionality) ### Images - <img width="879" height="1275" alt="image" src="https://github.com/user-attachments/assets/95b2d731-31ae-45a1-b081-bf5893f52aeb" /> <br><br> <br><br> <img width="1532" height="362" alt="image" src="https://github.com/user-attachments/assets/9cebc65b-b7a7-459f-b25e-3b13fa9b638e" /> <br><br> <br><br> <img width="2586" height="1320" alt="image" src="https://github.com/user-attachments/assets/2153d493-d899-461f-a7a9-041391e07776" /> --------- Co-authored-by: Cursor Agent <cursoragent@cursor.com> Co-authored-by: Attili-sys <Attili-sys@users.noreply.github.com> Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
2026-04-30 18:13:27 +03:00
include_metadata, metadata_fields = _resolve_reference_metadata(req)
try:
tenant_ids = list(set([kb.tenant_id for kb in kbs]))
e, kb = KnowledgebaseService.get_by_id(kb_ids[0])
if not e:
return get_error_data_result(message="Dataset not found!")
if kb.tenant_embd_id:
embd_model_config = get_model_config_by_id(kb.tenant_embd_id)
else:
embd_model_config = get_model_config_by_type_and_name(kb.tenant_id, LLMType.EMBEDDING, kb.embd_id)
embd_mdl = LLMBundle(kb.tenant_id, embd_model_config)
rerank_mdl = None
if req.get("tenant_rerank_id"):
rerank_model_config = get_model_config_by_id(req["tenant_rerank_id"])
rerank_mdl = LLMBundle(kb.tenant_id, rerank_model_config)
elif req.get("rerank_id"):
rerank_model_config = get_model_config_by_type_and_name(kb.tenant_id, LLMType.RERANK, req["rerank_id"])
rerank_mdl = LLMBundle(kb.tenant_id, rerank_model_config)
if langs:
question = await cross_languages(kb.tenant_id, None, question, langs)
if req.get("keyword", False):
chat_model_config = get_tenant_default_model_by_type(kb.tenant_id, LLMType.CHAT)
chat_mdl = LLMBundle(kb.tenant_id, chat_model_config)
question += await keyword_extraction(chat_mdl, question)
ranks = await settings.retriever.retrieval(
question,
embd_mdl,
tenant_ids,
kb_ids,
page,
size,
similarity_threshold,
vector_similarity_weight,
top,
doc_ids,
rerank_mdl=rerank_mdl,
highlight=highlight,
rank_feature=label_question(question, kbs),
)
if toc_enhance:
chat_model_config = get_tenant_default_model_by_type(kb.tenant_id, LLMType.CHAT)
chat_mdl = LLMBundle(kb.tenant_id, chat_model_config)
cks = await settings.retriever.retrieval_by_toc(question, ranks["chunks"], tenant_ids, chat_mdl, size)
if cks:
ranks["chunks"] = cks
ranks["chunks"] = settings.retriever.retrieval_by_children(ranks["chunks"], tenant_ids)
if use_kg:
chat_model_config = get_tenant_default_model_by_type(kb.tenant_id, LLMType.CHAT)
ck = await settings.kg_retriever.retrieval(question, [k.tenant_id for k in kbs], kb_ids, embd_mdl, LLMBundle(kb.tenant_id, chat_model_config))
if ck["content_with_weight"]:
ranks["chunks"].insert(0, ck)
for c in ranks["chunks"]:
c.pop("vector", None)
Feat/configurable metadata display (#13464) ### What problem does this PR solve? Currently, RAGFlow's Search and Chat interfaces display only raw vectorized text chunks during retrieval, without contextual information about their source documents. Users cannot see document titles, page numbers, upload dates, or custom metadata fields that would help them understand and trust the retrieved results. This PR introduces an **optional metadata display feature** that enriches retrieved chunks with document-level metadata in both the Search tab and Chatbot interface. **Key improvements:** - **Search results**: Display document metadata as styled badges beneath chunk snippets - **Chat citations**: Show metadata in citation popovers and reference lists for better source context - **LLM context**: Metadata is injected into the LLM prompt to enable more accurate, citation-aware responses - **External API support**: Applications using RAGFlow's SDK retrieval endpoints (`/v1/retrieval`, `/v1/searchbots/retrieval_test`) can opt-in via request parameters - **User control**: Multi-select dropdown UI allows users to choose which metadata fields to display **Implementation approach:** - ✅ Reuses existing `DocMetadataService` infrastructure (no new database tables or indices) - ✅ Settings stored in existing JSON configuration fields (`search_config.reference_metadata`, `prompt_config.reference_metadata`) - ✅ No database migrations required - ✅ Disabled by default (fully opt-in and backward-compatible) - ✅ Dynamic metadata field selection populated from actual document metadata keys - ✅ Fixed critical bug where Python's builtin `set()` was shadowed by a route handler function **Modified endpoints (all backward-compatible):** - `POST /v1/retrieval` (Public SDK) - `POST /v1/searchbots/retrieval_test` (Searchbots) - `POST /v1/chunk/retrieval_test` (UI/Internal) - Chat completions endpoints (via `extra_body.reference_metadata` or `prompt_config`) ### Type of change - [x] New Feature 
(non-breaking change which adds functionality) ### Images - <img width="879" height="1275" alt="image" src="https://github.com/user-attachments/assets/95b2d731-31ae-45a1-b081-bf5893f52aeb" /> <br><br> <br><br> <img width="1532" height="362" alt="image" src="https://github.com/user-attachments/assets/9cebc65b-b7a7-459f-b25e-3b13fa9b638e" /> <br><br> <br><br> <img width="2586" height="1320" alt="image" src="https://github.com/user-attachments/assets/2153d493-d899-461f-a7a9-041391e07776" /> --------- Co-authored-by: Cursor Agent <cursoragent@cursor.com> Co-authored-by: Attili-sys <Attili-sys@users.noreply.github.com> Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
2026-04-30 18:13:27 +03:00
if include_metadata:
logging.info(
"sdk.retrieval reference_metadata enabled dataset_ids=%s fields=%s chunks=%s",
kb_ids,
sorted(metadata_fields) if metadata_fields else None,
len(ranks["chunks"]),
)
enrich_chunks_with_document_metadata(ranks["chunks"], metadata_fields)
##rename keys
renamed_chunks = []
for chunk in ranks["chunks"]:
key_mapping = {
"chunk_id": "id",
"content_with_weight": "content",
"doc_id": "document_id",
"important_kwd": "important_keywords",
"question_kwd": "questions",
"docnm_kwd": "document_keyword",
"kb_id": "dataset_id",
}
rename_chunk = {}
for key, value in chunk.items():
new_key = key_mapping.get(key, key)
rename_chunk[new_key] = value
renamed_chunks.append(rename_chunk)
ranks["chunks"] = renamed_chunks
return get_result(data=ranks)
except Exception as e:
if str(e).find("not_found") > 0:
return get_result(
message="No chunk found! Check the chunk status please!",
code=RetCode.DATA_ERROR,
)
return server_error_response(e)