From faf77a5a8a91b31481bf819bca40ca0a4526d85d Mon Sep 17 00:00:00 2001 From: kpdev <156195510+kiannidev@users.noreply.github.com> Date: Fri, 22 May 2026 00:19:53 -0700 Subject: [PATCH] feat(evaluation): track token usage in evaluation results (#13487) ## Summary Implements the TODO in `evaluation_service.py`: **Track token usage** in evaluation results. ## Changes - **Import** `num_tokens_from_string` from `common.token_utils` - **Prompt tokens**: Use the full prompt returned by `async_chat` when available (includes system prompt + knowledge base + query), otherwise fall back to the question token count - **Completion tokens**: Count tokens in the generated answer - **Storage**: Store `token_usage` as `{prompt_tokens, completion_tokens, total_tokens}` in each `EvaluationResult` instead of `None` ## Why The evaluation pipeline previously saved `token_usage: None` for every result. This change allows downstream consumers (e.g. evaluation dashboards, cost tracking) to see approximate token usage per test case using the same tokenizer (tiktoken cl100k_base) used elsewhere in RAGFlow. 
## Testing - Adds unit tests for `token_usage` in `test/unit_test/api/db/services/test_evaluation_service_token_usage.py`; existing evaluation flow unchanged - Token counting uses existing `num_tokens_from_string` utility --------- Co-authored-by: kiannidev --- api/db/services/evaluation_service.py | 30 +++- .../test_evaluation_service_token_usage.py | 170 ++++++++++++++++++ 2 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 test/unit_test/api/db/services/test_evaluation_service_token_usage.py diff --git a/api/db/services/evaluation_service.py b/api/db/services/evaluation_service.py index 48255512f5..8a4878ad3a 100644 --- a/api/db/services/evaluation_service.py +++ b/api/db/services/evaluation_service.py @@ -39,6 +39,7 @@ from api.db.services.dialog_service import DialogService from common.misc_utils import get_uuid from common.time_utils import current_timestamp from common.constants import StatusEnum +from common.token_utils import num_tokens_from_string class EvaluationService(CommonService): @@ -417,6 +418,12 @@ class EvaluationService(CommonService): answer = ans.get("answer", "") retrieved_chunks = ans.get("reference", {}).get("chunks", []) break + else: + ans = {} + logging.warning( + "Evaluation case %s produced no answer from chat; token_usage will reflect empty output", + case.get("id", "unknown"), + ) execution_time = timer() - start_time @@ -430,6 +437,27 @@ class EvaluationService(CommonService): dialog=dialog ) + # Track token usage: use full prompt from async_chat when available. + # Note: Counts use tiktoken (cl100k_base), which matches OpenAI models but is an + # approximation for other providers (Anthropic, local models, etc.). Downstream + # consumers should treat these values as estimates for cost tracking. 
+ full_prompt = ans.get("prompt", "") + if full_prompt: + prompt_tokens = num_tokens_from_string(full_prompt) + else: + logging.debug( + "Evaluation case %s: ans has no 'prompt' key; using question-only count " + "(undercounts system + retrieved context)", + case.get("id", "unknown"), + ) + prompt_tokens = num_tokens_from_string(case.get("question", "") or "") + completion_tokens = num_tokens_from_string(answer or "") + token_usage = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + } + # Save result result_id = get_uuid() result = { @@ -440,7 +468,7 @@ class EvaluationService(CommonService): "retrieved_chunks": retrieved_chunks, "metrics": metrics, "execution_time": execution_time, - "token_usage": None, # TODO: Track token usage + "token_usage": token_usage, "create_time": current_timestamp() } diff --git a/test/unit_test/api/db/services/test_evaluation_service_token_usage.py b/test/unit_test/api/db/services/test_evaluation_service_token_usage.py new file mode 100644 index 0000000000..ecf17140f5 --- /dev/null +++ b/test/unit_test/api/db/services/test_evaluation_service_token_usage.py @@ -0,0 +1,170 @@ +# +# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use it except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+"""Unit tests for EvaluationService token_usage tracking and summary-metric aggregation."""
+
+import pytest
+from types import SimpleNamespace
+
+from api.db.services.evaluation_service import EvaluationService
+
+
+@pytest.fixture
+def mock_dialog():
+    """Return a SimpleNamespace stand-in for a dialog with kb_ids, prompt_config, llm_id and tenant_id set."""
+    return SimpleNamespace(
+        kb_ids=["kb-1"],
+        prompt_config={"quote": True},
+        llm_id="test-llm",
+        tenant_id="tenant-1",
+    )
+
+
+@pytest.fixture
+def minimal_case():
+    """Return an evaluation case dict with a question and no reference answer or relevant chunk ids."""
+    return {
+        "id": "case-1",
+        "question": "What is the capital of France?",
+        "reference_answer": None,
+        "relevant_chunk_ids": None,
+    }
+
+
+@pytest.mark.p2
+def test_token_usage_structure_when_prompt_available(monkeypatch, mock_dialog, minimal_case):
+    """token_usage holds positive prompt/completion counts summing to total_tokens when the chat result has 'prompt'."""
+    captured_create = {}  # filled with the kwargs passed to the patched EvaluationResult.create
+
+    async def mock_async_chat(dialog, messages, stream, **kwargs):
+        # Stand-in for async_chat: yields a single result that carries the full 'prompt' string
+        yield {
+            "answer": "Paris is the capital of France.",
+            "reference": {"chunks": []},
+            "prompt": "System instructions here.\n\nKnowledge: Some context.\n\nQuery: What is the capital of France?",
+        }
+
+    def capture_create(**kwargs):
+        captured_create.update(kwargs)  # record instead of persisting, so assertions can inspect the row
+
+    monkeypatch.setattr(
+        "api.db.services.dialog_service.async_chat",
+        mock_async_chat,
+    )
+    monkeypatch.setattr(
+        "api.db.services.evaluation_service.EvaluationResult.create",
+        capture_create,
+    )
+
+    result = EvaluationService._evaluate_single_case("run-1", minimal_case, mock_dialog)
+
+    assert result is not None
+    assert "token_usage" in captured_create
+    token_usage = captured_create["token_usage"]
+    assert "prompt_tokens" in token_usage
+    assert "completion_tokens" in token_usage
+    assert "total_tokens" in token_usage
+    assert token_usage["total_tokens"] == token_usage["prompt_tokens"] + token_usage["completion_tokens"]
+    assert token_usage["prompt_tokens"] > 0
+    assert token_usage["completion_tokens"] > 0
+
+
+@pytest.mark.p2
+def test_token_usage_fallback_when_prompt_missing(monkeypatch, mock_dialog, minimal_case):
+    """prompt_tokens falls back to a positive question-only count when the chat result has no 'prompt' key."""
+    captured_create = {}  # filled with the kwargs passed to the patched EvaluationResult.create
+
+    async def mock_async_chat_no_prompt(dialog, messages, stream, **kwargs):
+        # Simulate response without 'prompt' (e.g. async_chat_solo)
+        yield {
+            "answer": "Paris.",
+            "reference": {"chunks": []},
+        }
+
+    def capture_create(**kwargs):
+        captured_create.update(kwargs)  # record instead of persisting, so assertions can inspect the row
+
+    monkeypatch.setattr(
+        "api.db.services.dialog_service.async_chat",
+        mock_async_chat_no_prompt,
+    )
+    monkeypatch.setattr(
+        "api.db.services.evaluation_service.EvaluationResult.create",
+        capture_create,
+    )
+
+    result = EvaluationService._evaluate_single_case("run-1", minimal_case, mock_dialog)
+
+    assert result is not None
+    assert "token_usage" in captured_create
+    token_usage = captured_create["token_usage"]
+    assert "prompt_tokens" in token_usage
+    assert "completion_tokens" in token_usage
+    assert "total_tokens" in token_usage
+    assert token_usage["total_tokens"] == token_usage["prompt_tokens"] + token_usage["completion_tokens"]
+    # Fallback counts the non-empty question, so the count must be strictly positive (">= 0" could never fail)
+    assert token_usage["prompt_tokens"] > 0
+    assert token_usage["completion_tokens"] > 0
+
+
+@pytest.mark.p2
+def test_token_usage_no_answer_logs_warning(monkeypatch, mock_dialog, minimal_case, caplog):
+    """When chat yields nothing, token_usage is still recorded (question-only prompt, zero completion) and a warning is logged."""
+    captured_create = {}  # filled with the kwargs passed to the patched EvaluationResult.create
+
+    async def mock_async_chat_empty(dialog, messages, stream, **kwargs):
+        # Simulate async_chat that yields no items at all; the unreachable yield keeps this an async generator
+        if False:
+            yield {}
+
+    def capture_create(**kwargs):
+        captured_create.update(kwargs)  # record instead of persisting, so assertions can inspect the row
+
+    monkeypatch.setattr(
+        "api.db.services.dialog_service.async_chat",
+        mock_async_chat_empty,
+    )
+    monkeypatch.setattr(
+        "api.db.services.evaluation_service.EvaluationResult.create",
+        capture_create,
+    )
+
+    with caplog.at_level("WARNING"):
+        result = EvaluationService._evaluate_single_case("run-1", minimal_case, mock_dialog)
+
+    assert result is not None
+    token_usage = captured_create["token_usage"]
+    # No answer tokens in this case; the prompt count falls back to the non-empty question, so it is positive
+    assert token_usage["completion_tokens"] == 0
+    assert token_usage["prompt_tokens"] > 0
+    assert token_usage["total_tokens"] == token_usage["prompt_tokens"]
+    assert any("produced no answer from chat" in msg for msg in caplog.messages)
+
+
+@pytest.mark.p2
+def test_compute_summary_metrics_aggregates_metrics():
+    """_compute_summary_metrics reports total_cases and the mean of execution_time and of each numeric metric."""
+    results = [
+        {"execution_time": 1.0, "metrics": {"precision": 0.5, "answer_length": 10}},
+        {"execution_time": 3.0, "metrics": {"precision": 1.0, "answer_length": 20}},
+    ]
+
+    summary = EvaluationService._compute_summary_metrics(results)
+
+    assert summary["total_cases"] == 2
+    assert summary["avg_execution_time"] == pytest.approx(2.0)
+    assert summary["avg_precision"] == pytest.approx(0.75)
+    assert summary["avg_answer_length"] == pytest.approx(15.0)