feat(evaluation): track token usage in evaluation results (#13487)

## Summary

Implements the TODO in `evaluation_service.py`: **Track token usage** in evaluation results.

## Changes

- **Import** `num_tokens_from_string` from `common.token_utils`
- **Prompt tokens**: Use the full prompt returned by `async_chat` when available (includes system prompt + knowledge base + query), otherwise fall back to the question token count
- **Completion tokens**: Count tokens in the generated answer
- **Storage**: Store `token_usage` as `{prompt_tokens, completion_tokens, total_tokens}` in each `EvaluationResult` instead of `None`

## Why

The evaluation pipeline previously saved `token_usage: None` for every result. This change allows downstream consumers (e.g. evaluation dashboards, cost tracking) to see approximate token usage per test case using the same tokenizer (tiktoken cl100k_base) used elsewhere in RAGFlow.

## Testing

- No new tests added; existing evaluation flow unchanged
- Token counting uses existing `num_tokens_from_string` utility

---------

Co-authored-by: kiannidev <kiannidev@users.noreply.github.com>
2026-06-29 23:41:12 +08:00 · 2026-05-22 00:19:53 -07:00
parent b1ef5d365f
commit faf77a5a8a
2 changed files with 199 additions and 1 deletions
--- a/api/db/services/evaluation_service.py
+++ b/api/db/services/evaluation_service.py
@@ -39,6 +39,7 @@ from api.db.services.dialog_service import DialogService
 from common.misc_utils import get_uuid
 from common.time_utils import current_timestamp
 from common.constants import StatusEnum
+from common.token_utils import num_tokens_from_string


 class EvaluationService(CommonService):
@@ -417,6 +418,12 @@ class EvaluationService(CommonService):
                    answer = ans.get("answer", "")
                    retrieved_chunks = ans.get("reference", {}).get("chunks", [])
                    break
+            else:
+                ans = {}
+                logging.warning(
+                    "Evaluation case %s produced no answer from chat; token_usage will reflect empty output",
+                    case.get("id", "unknown"),
+                )

            execution_time = timer() - start_time

@@ -430,6 +437,27 @@ class EvaluationService(CommonService):
                dialog=dialog
            )

+            # Track token usage: use full prompt from async_chat when available.
+            # Note: Counts use tiktoken (cl100k_base), which matches OpenAI models but is an
+            # approximation for other providers (Anthropic, local models, etc.). Downstream
+            # consumers should treat these values as estimates for cost tracking.
+            full_prompt = ans.get("prompt", "")
+            if full_prompt:
+                prompt_tokens = num_tokens_from_string(full_prompt)
+            else:
+                logging.debug(
+                    "Evaluation case %s: ans has no 'prompt' key; using question-only count "
+                    "(undercounts system + retrieved context)",
+                    case.get("id", "unknown"),
+                )
+                prompt_tokens = num_tokens_from_string(case.get("question", "") or "")
+            completion_tokens = num_tokens_from_string(answer or "")
+            token_usage = {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens,
+            }
+
            # Save result
            result_id = get_uuid()
            result = {
@@ -440,7 +468,7 @@ class EvaluationService(CommonService):
                "retrieved_chunks": retrieved_chunks,
                "metrics": metrics,
                "execution_time": execution_time,
-                "token_usage": None,  # TODO: Track token usage
+                "token_usage": token_usage,
                "create_time": current_timestamp()
            }