Fix audio/video in pipeline (#14241)

### What problem does this PR solve?

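Fix audio/video parsing in the pipeline: the parser now reads the model ID from the nested `vlm` setting (`setups["audio"]["vlm"]["llm_id"]` and `setups["video"]["vlm"]["llm_id"]`), both when validating the parser parameters and when looking up the speech-to-text / image-to-text model.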

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Magicbook1108
2026-04-21 12:17:57 +08:00
committed by GitHub
parent 8aab158942
commit b3891ba6a4

View File

@@ -288,12 +288,13 @@ class ParserParam(ProcessParamBase):
audio_config = self.setups.get("audio", "")
if audio_config:
self.check_empty(audio_config.get("llm_id"), "Audio VLM")
audio_vlm = audio_config.get("vlm") or {}
self.check_empty(audio_vlm.get("llm_id"), "Audio VLM")
video_config = self.setups.get("video", "")
if video_config:
self.check_empty(video_config.get("llm_id"), "Video VLM")
video_vlm = video_config.get("vlm") or {}
self.check_empty(video_vlm.get("llm_id"), "Video VLM")
email_config = self.setups.get("email", "")
if email_config:
email_output_format = email_config.get("output_format", "")
@@ -1076,13 +1077,14 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.")
conf = self._param.setups["audio"]
vlm = conf.get("vlm")
self.set_output("output_format", conf["output_format"])
_, ext = os.path.splitext(name)
with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
tmpf.write(blob)
tmpf.flush()
tmp_path = os.path.abspath(tmpf.name)
seq2txt_model_config = get_model_config_by_type_and_name(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT, conf["llm_id"])
seq2txt_model_config = get_model_config_by_type_and_name(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT, vlm["llm_id"])
seq2txt_mdl = LLMBundle(self._canvas.get_tenant_id(), seq2txt_model_config)
txt = seq2txt_mdl.transcription(tmp_path)
@@ -1093,8 +1095,9 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on an video.")
conf = self._param.setups["video"]
vlm = conf.get("vlm")
self.set_output("output_format", conf["output_format"])
cv_model_config = get_model_config_by_type_and_name(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, conf["llm_id"])
cv_model_config = get_model_config_by_type_and_name(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, vlm["llm_id"])
cv_mdl = LLMBundle(self._canvas.get_tenant_id(), cv_model_config)
video_prompt = str(conf.get("prompt", "") or "")
txt = asyncio.run(cv_mdl.async_chat(system="", history=[], gen_conf={}, video_bytes=blob, filename=name, video_prompt=video_prompt))