fix: add legacy chat/completions mode (#16014)

### What problem does this PR solve?

Adds a legacy mode for `/chat/completions` that restores v0.23.0-style output by converting `start_to_think`/`end_to_think` events back into raw `<think></think>` markers and streaming the cumulative answer text.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-06-29 15:31:05 +08:00 · 2026-06-16 10:34:06 +08:00
parent efdd58df66
commit 8e235b7b95
3 changed files with 59 additions and 19 deletions
--- a/api/apps/restful_apis/chat_api.py
+++ b/api/apps/restful_apis/chat_api.py
@@ -1229,6 +1229,11 @@ async def session_completion(chat_id_in_arg=""):
            dia.llm_id = tenant_info.llm_id
            merge_generation_config(dia, chat_model_config)

+        legacy = _get_bool_request_flag(
+            req,
+            "legacy",
+            default=False,
+        )
        stream_mode = req.pop("stream", True)

        def _format_answer(ans):
@@ -1242,10 +1247,53 @@ async def session_completion(chat_id_in_arg=""):
            """Yield SSE-formatted chunks from the async chat generator."""
            nonlocal dia, msg, req, conv
            try:
-                async for ans in async_chat(dia, msg, True, session_id=session_id, **req):
-                    ans = _format_answer(ans)
-                    payload = _sanitize_json_floats({"code": 0, "message": "", "data": ans})
-                    yield "data:" + json.dumps(payload, ensure_ascii=False) + "\n\n"
+                if legacy:
+                    # v0.23.0-style streaming: emit accumulated answer text and
+                    # reconstruct raw <think>...</think> markers from the newer
+                    # start_to_think/end_to_think events.
+                    legacy_answer = ""
+                    final_answer = None
+                    async for ans in async_chat(dia, msg, True, session_id=session_id, **req):
+                        ans = _format_answer(ans)
+                        if ans.get("final"):
+                            final_answer = ans
+                            continue
+                        if ans.get("start_to_think"):
+                            legacy_answer += "<think>"
+                            legacy_chunk = {**ans, "answer": legacy_answer}
+                            legacy_chunk.pop("start_to_think", None)
+                            legacy_chunk.pop("end_to_think", None)
+                            payload = _sanitize_json_floats({"code": 0, "message": "", "data": legacy_chunk})
+                            yield "data:" + json.dumps(payload, ensure_ascii=False) + "\n\n"
+                            continue
+                        if ans.get("end_to_think"):
+                            legacy_answer += "</think>"
+                            legacy_chunk = {**ans, "answer": legacy_answer}
+                            legacy_chunk.pop("start_to_think", None)
+                            legacy_chunk.pop("end_to_think", None)
+                            payload = _sanitize_json_floats({"code": 0, "message": "", "data": legacy_chunk})
+                            yield "data:" + json.dumps(payload, ensure_ascii=False) + "\n\n"
+                            continue
+                        delta = ans.get("answer") or ""
+                        if not delta:
+                            continue
+                        legacy_answer += delta
+                        legacy_chunk = {**ans, "answer": legacy_answer}
+                        legacy_chunk.pop("start_to_think", None)
+                        legacy_chunk.pop("end_to_think", None)
+                        payload = _sanitize_json_floats({"code": 0, "message": "", "data": legacy_chunk})
+                        yield "data:" + json.dumps(payload, ensure_ascii=False) + "\n\n"
+                    if final_answer is not None:
+                        final_chunk = {**final_answer, "answer": final_answer.get("answer") or legacy_answer}
+                        final_chunk.pop("start_to_think", None)
+                        final_chunk.pop("end_to_think", None)
+                        payload = _sanitize_json_floats({"code": 0, "message": "", "data": final_chunk})
+                        yield "data:" + json.dumps(payload, ensure_ascii=False) + "\n\n"
+                else:
+                    async for ans in async_chat(dia, msg, True, session_id=session_id, **req):
+                        ans = _format_answer(ans)
+                        payload = _sanitize_json_floats({"code": 0, "message": "", "data": ans})
+                        yield "data:" + json.dumps(payload, ensure_ascii=False) + "\n\n"
                if conv is not None:
                    await thread_pool_exec(ConversationService.update_by_id, conv.id, conv.to_dict())
            except Exception as ex:
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@@ -900,6 +900,7 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
            final = await decorate_answer(_extract_visible_answer(thought + full_answer))
            final["final"] = True
            final["audio_binary"] = None
+            final["answer"] = ""
            yield final
    else:
        if llm_model_config["model_type"] == "chat":
@@ -1715,6 +1716,7 @@ async def async_ask(question, kb_ids, tenant_id, chat_llm_name=None, search_conf
    full_answer = last_state.full_text if last_state else ""
    final = await decorate_answer(_extract_visible_answer(full_answer))
    final["final"] = True
+    final["answer"] = ""
    yield final