From b3891ba6a429b671cc109e56f3f589cf1908c7e0 Mon Sep 17 00:00:00 2001 From: Magicbook1108 Date: Tue, 21 Apr 2026 12:17:57 +0800 Subject: [PATCH] Fix audio/video in pipeline (#14241) ### What problem does this PR solve? Fix audio/video handling in the pipeline parser: read `llm_id` from the nested `vlm` entry of the audio and video setups, both in `ParserParam` validation and in the `Parser` audio/video paths, instead of from the top level of each setup. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/flow/parser/parser.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 3f98a7efed..d1fd7ead38 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -288,12 +288,13 @@ class ParserParam(ProcessParamBase): audio_config = self.setups.get("audio", "") if audio_config: - self.check_empty(audio_config.get("llm_id"), "Audio VLM") + audio_vlm = audio_config.get("vlm") or {} + self.check_empty(audio_vlm.get("llm_id"), "Audio VLM") video_config = self.setups.get("video", "") if video_config: - self.check_empty(video_config.get("llm_id"), "Video VLM") - + video_vlm = video_config.get("vlm") or {} + self.check_empty(video_vlm.get("llm_id"), "Video VLM") email_config = self.setups.get("email", "") if email_config: email_output_format = email_config.get("output_format", "") @@ -1076,13 +1077,14 @@ class Parser(ProcessBase): self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.") conf = self._param.setups["audio"] + vlm = conf.get("vlm") self.set_output("output_format", conf["output_format"]) _, ext = os.path.splitext(name) with tempfile.NamedTemporaryFile(suffix=ext) as tmpf: tmpf.write(blob) tmpf.flush() tmp_path = os.path.abspath(tmpf.name) - seq2txt_model_config = get_model_config_by_type_and_name(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT, conf["llm_id"]) + seq2txt_model_config = get_model_config_by_type_and_name(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT, vlm["llm_id"]) seq2txt_mdl = LLMBundle(self._canvas.get_tenant_id(), seq2txt_model_config) txt = seq2txt_mdl.transcription(tmp_path) @@ -1093,8 +1095,9 @@ 
class Parser(ProcessBase): self.callback(random.randint(1, 5) / 100.0, "Start to work on an video.") conf = self._param.setups["video"] + vlm = conf.get("vlm") self.set_output("output_format", conf["output_format"]) - cv_model_config = get_model_config_by_type_and_name(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, conf["llm_id"]) + cv_model_config = get_model_config_by_type_and_name(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, vlm["llm_id"]) cv_mdl = LLMBundle(self._canvas.get_tenant_id(), cv_model_config) video_prompt = str(conf.get("prompt", "") or "") txt = asyncio.run(cv_mdl.async_chat(system="", history=[], gen_conf={}, video_bytes=blob, filename=name, video_prompt=video_prompt))