mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-03 01:01:56 +08:00
### What problem does this PR solve?
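Agent (Canvas) runs did not report token usage in the SSE stream, and their
Langfuse generations lacked the prompt/completion split and the session/user
correlation. This PR aggregates the usage of every LLM call in a run and
reports it once; the Summary below gives the full background.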
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [x] Other (please describe):
## Summary
Agent (Canvas) runs previously did not surface token usage in the SSE
stream, and RAGFlow's own Langfuse generations for agent runs were
missing the prompt/completion split and the session/user correlation.
This made it impossible for an external caller (or Langfuse) to
reconcile an agent turn's cost with the upstream provider (e.g.
OpenRouter), because a single turn can issue several distinct LLM calls
(query rewriting / cross-language translation, multi-round tool
reasoning, nested sub-agents, and the final answer).
This PR introduces a per-run token usage sink so that **every** LLM call
in a run is aggregated and reported once, and enriches Langfuse
generations with the prompt/completion split plus session/user
attributes.
## What changes
### 1. Per-run token usage sink (`common/token_utils.py`)
- Adds two `contextvars`: `token_usage_sink` (a mutable per-run
accumulator) and `langfuse_run_attrs` (session_id/user_id for the run).
- Adds `record_run_token_usage(...)` (thread-safe via a lock, because
`thread_pool_exec` copies the context into worker threads that share the
sink dict) and `usage_from_response(...)` which extracts a
`{prompt_tokens, completion_tokens, total_tokens}` split from
OpenAI/OpenRouter-style responses.
### 2. Provider layer captures the prompt/completion split
(`rag/llm/chat_model.py`)
- `LiteLLMBase` and `Base` now store `self.last_usage`
(prompt/completion/total) for the most recent chat call, in both the
plain and tool-calling paths.
- Streaming requests set `stream_options.include_usage = True` (LiteLLM
path) so the authoritative usage arrives on the final chunk; this is
read even on the usage-only chunk that carries no `choices`.
- Fixes a multi-round accounting bug in `*_with_tools`: token totals
were **overwritten** by each round (`total_tokens = tol`) instead of
accumulated, undercounting multi-round tool conversations. Each round is
now committed to a running aggregate.
### 3. LLMBundle reports usage once, per call
(`api/db/services/llm_service.py`)
- New `_report_usage(total_tokens)` records the call's usage into the
active run sink and returns the prompt/completion/total split for
Langfuse. The split is only used when it is consistent with the
authoritative total; otherwise only the total is reported.
- All three chat entry points (`async_chat`, `async_chat_streamly`,
`async_chat_streamly_delta`) now emit `usage_details` with
`input`/`output`/`total` instead of total-only.
- `_start_langfuse_observation` now applies `session_id`/`user_id` from
the per-run context (`langfuse_run_attrs`) so agent-run generations are
correctly grouped, even though agent LLMBundles are constructed without
those attributes.
### 4. Canvas installs the sink and emits the aggregate
(`agent/canvas.py`)
- `Canvas.run()` installs a fresh `token_usage_sink` and
`langfuse_run_attrs` (from `user_id`/`session_id`) at the start of every
turn.
- `message_end` now includes an aggregated `usage` object:
`{prompt_tokens, completion_tokens, total_tokens, calls}` covering all
LLM calls in the run.
### 5. Pass session id into the run
(`api/db/services/canvas_service.py`)
- `completion()` forwards `session_id` to `Canvas.run()` for Langfuse
session correlation.
## Why a context variable
LLM calls in an agent run originate from many places that each build
their own `LLMBundle` (e.g. `cross_languages`/`keyword_extraction`
helpers, the Agent component, and nested sub-agents invoked as tools). A
run-scoped context variable is the only non-invasive chokepoint that
captures all of them exactly once, including nested agents (which run in
the same async context) and thread-pool tools (the executor copies the
context).
## Behavior / compatibility
- No public API or wire-format removal: `message_end` gains an
additional optional `usage` field; existing consumers are unaffected.
- When a provider does not return authoritative usage, behavior falls
back to the previous token estimate (total only, no split).
- Non-agent flows (Dataflow `Pipeline`, sync `Graph.run`) are untouched.
## Testing
- [x] Simple agent answer: `message_end.usage.total_tokens` matches
provider usage.
- [x] Agent with cross-language retrieval: aggregate equals the sum of
both provider calls.
- [x] Tool-calling agent (multi-round): total accumulates across rounds.
- [x] Nested agent (agent-as-tool): sub-agent tokens included in the
parent run total.
- [x] Langfuse: agent generations show input/output split and are
grouped by session/user.
---------
Co-authored-by: yzc <yuzhichang@gmail.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
186 lines
6.9 KiB
Python
186 lines
6.9 KiB
Python
#
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import contextvars
|
|
import hashlib
|
|
import logging
|
|
import os
|
|
import shutil
|
|
import threading
|
|
import tiktoken
|
|
|
|
from common.file_utils import get_project_base_directory
|
|
|
|
|
|
def _ensure_tiktoken_cache() -> str:
    """Point tiktoken's cache at the project base directory and seed it.

    Sets ``TIKTOKEN_CACHE_DIR`` to the project base directory. If the bundled
    ``ragflow_deps/cl100k_base.tiktoken`` file exists and the cache entry does
    not, the bundled file is copied into the cache directory.

    Returns:
        The cache directory path (the project base directory).
    """
    base_dir = get_project_base_directory()
    os.environ["TIKTOKEN_CACHE_DIR"] = base_dir

    source_path = get_project_base_directory("ragflow_deps", "cl100k_base.tiktoken")

    # The cache entry is named after the SHA-1 hex digest of the download URL.
    # NOTE(review): presumably this is the file name tiktoken looks up in its
    # cache directory -- confirm against tiktoken's loader.
    url_digest = hashlib.sha1("https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken".encode()).hexdigest()
    target_path = os.path.join(base_dir, url_digest)

    # Nothing to seed from, or the cache entry is already in place.
    if not os.path.exists(source_path):
        return base_dir
    if os.path.exists(target_path):
        return base_dir

    shutil.copyfile(source_path, target_path)
    return base_dir
|
|
|
|
|
|
# Prepare the cache directory and export it through the environment before the
# encoding is loaded on the last line of this section.
tiktoken_cache_dir = _ensure_tiktoken_cache()
os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir

# Module-wide encoder used by the token helpers in this file. The encoding is
# selected by name ("cl100k_base") rather than looked up from a model name.
encoder = tiktoken.get_encoding("cl100k_base")
|
|
|
|
|
|
# Run-scoped token usage accumulator.
#
# Canvas.run installs a mutable dict in this variable at the start of every turn,
# and each LLMBundle chat call adds its provider-reported usage to that dict. That
# makes it the one place where usage is summed across all LLM calls of a run
# (query rewriting, cross-language translation, tool reasoning, the final streamed
# answer), no matter which component or helper made the call.
#
# The default of None means "no tracked run is active"; callers must do nothing
# in that case.
token_usage_sink: contextvars.ContextVar = contextvars.ContextVar(
    "ragflow_token_usage_sink",
    default=None,
)

# Run-scoped Langfuse correlation attributes, e.g. {"session_id": ..., "user_id": ...}.
# Canvas.run installs them so that RAGFlow's own Langfuse generations can be grouped
# by session and user, even though the agent's LLMBundles are built without them.
langfuse_run_attrs: contextvars.ContextVar = contextvars.ContextVar(
    "ragflow_langfuse_run_attrs",
    default=None,
)

# Serializes updates to the sink. Concurrent tool calls (asyncio.gather plus
# thread_pool_exec, which copies the context so worker threads see the same sink
# dict) would otherwise race on the read-modify-write of the counters.
_sink_lock = threading.Lock()
|
|
|
|
|
|
def record_run_token_usage(prompt_tokens: int = 0, completion_tokens: int = 0, total_tokens: int = 0) -> None:
    """Add a single LLM call's token usage to the active run sink, if any.

    Safe to call from anywhere: when no run sink is installed it does nothing.
    The update is all-or-nothing: if a value cannot be converted to ``int`` or
    the sink lacks one of the expected counters, the sink is left untouched and
    the failure is logged at debug level.

    Args:
        prompt_tokens: Prompt (input) tokens of the call; falsy values count as 0.
        completion_tokens: Completion (output) tokens of the call; falsy values count as 0.
        total_tokens: Total tokens of the call; falsy values count as 0.
    """
    sink = token_usage_sink.get()
    if sink is None:
        return
    try:
        # Coerce every input before touching the sink, so a malformed value cannot
        # leave the counters half-updated.
        prompt = int(prompt_tokens or 0)
        completion = int(completion_tokens or 0)
        total = int(total_tokens or 0)
        with _sink_lock:
            # Read all counters first (a missing key fails here, before any
            # write), then commit the new values together.
            updated = {
                "prompt_tokens": sink["prompt_tokens"] + prompt,
                "completion_tokens": sink["completion_tokens"] + completion,
                "total_tokens": sink["total_tokens"] + total,
                "calls": sink["calls"] + 1,
            }
            sink.update(updated)
    except Exception:
        # Never let usage bookkeeping break a request; log at debug so a malformed
        # sink or token value is still traceable without adding noise.
        logging.debug("Failed to record run token usage", exc_info=True)
|
|
|
|
|
|
def usage_from_response(resp) -> dict:
    """Extract a {prompt_tokens, completion_tokens, total_tokens} split from an LLM response.

    Handles OpenAI/OpenRouter-style ``resp.usage`` objects and dict variants. Missing
    fields default to 0; ``total_tokens`` falls back to prompt+completion when absent.
    A field whose value cannot be converted to ``int`` is treated as missing instead
    of raising.

    Args:
        resp: The response object or dict; ``None`` is accepted.

    Returns:
        A new dict with integer ``prompt_tokens``, ``completion_tokens`` and
        ``total_tokens`` entries.
    """
    out = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
    if resp is None:
        return out

    try:
        usage = getattr(resp, "usage", None)
        if usage is None and isinstance(resp, dict):
            usage = resp.get("usage")
    except Exception:
        usage = None
    if usage is None:
        return out

    def _get(obj, *names):
        """Return the first truthy, int-convertible field among ``names``, else 0."""
        for n in names:
            try:
                v = obj.get(n) if isinstance(obj, dict) else getattr(obj, n, None)
            except Exception:
                v = None
            if not v:
                continue
            try:
                return int(v)
            except (TypeError, ValueError, OverflowError):
                # Malformed value (e.g. a non-numeric string or NaN): fall through
                # to the next alias rather than failing the caller.
                continue
        return 0

    out["prompt_tokens"] = _get(usage, "prompt_tokens", "input_tokens")
    out["completion_tokens"] = _get(usage, "completion_tokens", "output_tokens")
    out["total_tokens"] = _get(usage, "total_tokens")
    if not out["total_tokens"]:
        out["total_tokens"] = out["prompt_tokens"] + out["completion_tokens"]
    return out
|
|
|
|
|
|
def num_tokens_from_string(string: str) -> int:
    """Count the tokens the module-level encoder produces for ``string``.

    Returns 0 if encoding raises for any reason.
    """
    try:
        return len(encoder.encode(string))
    except Exception:
        return 0
|
|
|
|
|
|
def total_token_count_from_response(resp):
    """
    Extract token count from LLM response in various formats.

    Handles None responses and different response structures from various LLM providers.
    Returns 0 if token count cannot be determined.

    Sources are tried in this order, and the first one that yields a value wins:

    1. attribute ``resp.usage.total_tokens``
    2. attribute ``resp.usage_metadata.total_tokens``
    3. attribute ``resp.meta.billed_units.input_tokens``
    4. dict ``resp["usage"]["total_tokens"]``
    5. dict ``resp["usage"]["input_tokens"] + resp["usage"]["output_tokens"]``
    6. dict ``resp["meta"]["tokens"]["input_tokens"] + resp["meta"]["tokens"]["output_tokens"]``

    A source whose value is ``None``, or whose lookup fails in any way (for
    example ``{"usage": None}``), is skipped rather than raising.
    """
    if resp is None:
        return 0

    # Attribute-style responses (SDK objects).
    attribute_paths = (
        ("usage", "total_tokens"),
        ("usage_metadata", "total_tokens"),
        ("meta", "billed_units", "input_tokens"),
    )
    for path in attribute_paths:
        try:
            node = resp
            for name in path:
                node = getattr(node, name)
        except Exception:
            continue
        if node is not None:
            return node

    if not isinstance(resp, dict):
        return 0

    def _dig(*keys):
        """Follow ``keys`` through nested containers; None if any step fails."""
        node = resp
        for key in keys:
            try:
                node = node[key]
            except Exception:
                return None
        return node

    # Dict-style responses (raw JSON payloads).
    total = _dig("usage", "total_tokens")
    if total is not None:
        return total

    for prefix in (("usage",), ("meta", "tokens")):
        prompt = _dig(*prefix, "input_tokens")
        completion = _dig(*prefix, "output_tokens")
        if prompt is None or completion is None:
            continue
        try:
            return prompt + completion
        except Exception:
            # Values that cannot be added: try the next layout.
            continue
    return 0
|
|
|
|
|
|
def truncate(string: str, max_len: int) -> str:
    """Return ``string`` cut down to its first ``max_len`` encoder tokens.

    The text is encoded with the module-level encoder, the token list is sliced
    to ``max_len`` entries, and the kept tokens are decoded back to text.
    """
    token_ids = encoder.encode(string)
    kept_ids = token_ids[:max_len]
    return encoder.decode(kept_ids)
|