fix: add legacy chat/completions mode (#16014)

### What problem does this PR solve?
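Adds an opt-in legacy mode for /chat/completions, enabled by the
`legacy` request flag (default: false), that restores v0.23.0-style
streaming output: `start_to_think`/`end_to_think` events are converted
back into raw `<think>`/`</think>` markers, and each streamed chunk
carries the cumulative answer text so far rather than only the delta.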

### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
buua436
2026-06-16 10:34:06 +08:00
committed by GitHub
parent efdd58df66
commit 8e235b7b95
3 changed files with 59 additions and 19 deletions

View File

@@ -1229,6 +1229,11 @@ async def session_completion(chat_id_in_arg=""):
dia.llm_id = tenant_info.llm_id
merge_generation_config(dia, chat_model_config)
legacy = _get_bool_request_flag(
req,
"legacy",
default=False,
)
stream_mode = req.pop("stream", True)
def _format_answer(ans):
@@ -1242,10 +1247,53 @@ async def session_completion(chat_id_in_arg=""):
"""Yield SSE-formatted chunks from the async chat generator."""
nonlocal dia, msg, req, conv
try:
async for ans in async_chat(dia, msg, True, session_id=session_id, **req):
ans = _format_answer(ans)
payload = _sanitize_json_floats({"code": 0, "message": "", "data": ans})
yield "data:" + json.dumps(payload, ensure_ascii=False) + "\n\n"
if legacy:
# v0.23.0-style streaming: emit accumulated answer text and
# reconstruct raw <think>...</think> markers from the newer
# start_to_think/end_to_think events.
legacy_answer = ""
final_answer = None
async for ans in async_chat(dia, msg, True, session_id=session_id, **req):
ans = _format_answer(ans)
if ans.get("final"):
final_answer = ans
continue
if ans.get("start_to_think"):
legacy_answer += "<think>"
legacy_chunk = {**ans, "answer": legacy_answer}
legacy_chunk.pop("start_to_think", None)
legacy_chunk.pop("end_to_think", None)
payload = _sanitize_json_floats({"code": 0, "message": "", "data": legacy_chunk})
yield "data:" + json.dumps(payload, ensure_ascii=False) + "\n\n"
continue
if ans.get("end_to_think"):
legacy_answer += "</think>"
legacy_chunk = {**ans, "answer": legacy_answer}
legacy_chunk.pop("start_to_think", None)
legacy_chunk.pop("end_to_think", None)
payload = _sanitize_json_floats({"code": 0, "message": "", "data": legacy_chunk})
yield "data:" + json.dumps(payload, ensure_ascii=False) + "\n\n"
continue
delta = ans.get("answer") or ""
if not delta:
continue
legacy_answer += delta
legacy_chunk = {**ans, "answer": legacy_answer}
legacy_chunk.pop("start_to_think", None)
legacy_chunk.pop("end_to_think", None)
payload = _sanitize_json_floats({"code": 0, "message": "", "data": legacy_chunk})
yield "data:" + json.dumps(payload, ensure_ascii=False) + "\n\n"
if final_answer is not None:
final_chunk = {**final_answer, "answer": final_answer.get("answer") or legacy_answer}
final_chunk.pop("start_to_think", None)
final_chunk.pop("end_to_think", None)
payload = _sanitize_json_floats({"code": 0, "message": "", "data": final_chunk})
yield "data:" + json.dumps(payload, ensure_ascii=False) + "\n\n"
else:
async for ans in async_chat(dia, msg, True, session_id=session_id, **req):
ans = _format_answer(ans)
payload = _sanitize_json_floats({"code": 0, "message": "", "data": ans})
yield "data:" + json.dumps(payload, ensure_ascii=False) + "\n\n"
if conv is not None:
await thread_pool_exec(ConversationService.update_by_id, conv.id, conv.to_dict())
except Exception as ex:

View File

@@ -900,6 +900,7 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
final = await decorate_answer(_extract_visible_answer(thought + full_answer))
final["final"] = True
final["audio_binary"] = None
final["answer"] = ""
yield final
else:
if llm_model_config["model_type"] == "chat":
@@ -1715,6 +1716,7 @@ async def async_ask(question, kb_ids, tenant_id, chat_llm_name=None, search_conf
full_answer = last_state.full_text if last_state else ""
final = await decorate_answer(_extract_visible_answer(full_answer))
final["final"] = True
final["answer"] = ""
yield final