From d6660cf156d546656207814cd580c44e7f9dbbbc Mon Sep 17 00:00:00 2001 From: Qinsanz <49357907+Qinsanz@users.noreply.github.com> Date: Mon, 11 May 2026 12:05:24 +0800 Subject: [PATCH] fix(keyword_extraction): accept Chinese commas/semicolons/newlines as keyword delimiters (#14540) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What Widen the keyword delimiter in `rag/svr/task_executor.py`: both `build_chunks` (LLM `keyword_extraction` cache parsing) and `run_dataflow` (chunk-level `keywords` ingestion) now split on `, , ; ; 、 \r \n` instead of only the ASCII comma. ## Why `rag/prompts/keyword_prompt.md` instructs the LLM: > The keywords are delimited by ENGLISH COMMA. In practice, Chinese-leaning models (Qwen / Tongyi-Qianwen, GLM, etc.) frequently ignore this instruction when the source content is Chinese and emit Chinese commas (`,`) instead. Result: `cached.split(",")` sees the full LLM output as a *single* keyword. Repro: `auto_keywords>=4` + Chinese docs + `qwen-plus@Tongyi-Qianwen`. We observed entries in `important_kwd` like `"功能介绍,配置说明,参数详解,问题排查"` — one bucket instead of four. ## Impact - Silent data-quality bug; no exception thrown. - BM25 `important_kwd^30` boost effectively stops firing — the indexed term is the whole list, never matches user query tokens. - Any downstream aggregating `important_kwd` (tagging, analytics, candidate-keyword review UIs) sees garbage. ## Compatibility - Pure widening of the splitter; ASCII-comma-only outputs continue to work identically. - No schema / API change. ## Test plan Manually verified against `qwen-plus@Tongyi-Qianwen` with `auto_keywords=10` on Chinese .txt files: - Before: `important_kwd` contains one element per chunk that is the full LLM string with `,`-separated phrases inside. - After: `important_kwd` contains N elements, one per phrase, as the LLM intended. 
--- rag/svr/task_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 2568aa036b..8ce913e79f 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -385,7 +385,7 @@ async def build_chunks(task, progress_callback): cached = await keyword_extraction(chat_mdl, d["content_with_weight"], topn) set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, "keywords", {"topn": topn}) if cached: - d["important_kwd"] = cached.split(",") + d["important_kwd"] = [k for k in re.split(r"[,,;;、\r\n]+", cached) if k.strip()] d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"])) return @@ -775,7 +775,7 @@ async def run_dataflow(task: dict): del ck["questions"] if "keywords" in ck: if "important_tks" not in ck: - ck["important_kwd"] = ck["keywords"].split(",") + ck["important_kwd"] = [k for k in re.split(r"[,,;;、\r\n]+", ck["keywords"]) if k.strip()] ck["important_tks"] = rag_tokenizer.tokenize(str(ck["keywords"])) del ck["keywords"] if "summary" in ck: