From c6e3a2e713c1aae9f4f051e370a6cc396c0e6e61 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=88=98=E5=BA=B7=E4=BC=9F?= <71975350+lksr1201@users.noreply.github.com>
Date: Tue, 19 May 2026 12:28:31 +0800
Subject: [PATCH] Fix: MinerU vlm-http-client backend output file detection (#14240)

## Problem

When using MinerU with `vlm-http-client` backend, the parser fails to find the output files because they are located in a `vlm/` subdirectory, but the `_read_output` method doesn't check this location.

## Error Message

    [ERROR]MinerU not found. [MinerU] Missing output file, tried: ...

## Root Cause

The MinerU API with `vlm-http-client` backend returns output files in the following structure:

    output_dir/
      vlm/
        filename_content_list.json
        filename.md
        images/

However, the `_read_output` method in `mineru_parser.py` only checks:

1. `output_dir/filename_content_list.json`
2. `output_dir/sanitized_filename_content_list.json`
3. `output_dir/sanitized_filename/sanitized_filename_content_list.json`

It doesn't check the `vlm/` subdirectory.

## Solution

Added two additional fallback paths to check the `vlm/` subdirectory:

- `output_dir/vlm/filename_content_list.json`
- `output_dir/vlm/sanitized_filename_content_list.json`

## Testing

Tested with MinerU API using `vlm-http-client` backend. The parser now successfully finds and processes the output files.

## Related This issue occurs specifically when using: - MinerU backend: `vlm-http-client` - MinerU server URL configured for remote vLLM inference --- deepdoc/parser/mineru_parser.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 90c33573ce..b369f9122a 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -539,6 +539,21 @@ class MinerUParser(RAGFlowPdfParser): if nested_alt.exists(): subdir = nested_alt.parent json_file = nested_alt + else: + # Try vlm subdirectory (for vlm-http-client backend) + vlm_path = output_dir / "vlm" / f"{file_stem}_content_list.json" + self.logger.info(f"[MinerU] Trying vlm subdirectory: {vlm_path}") + attempted.append(vlm_path) + if vlm_path.exists(): + subdir = vlm_path.parent + json_file = vlm_path + else: + vlm_safe = output_dir / "vlm" / f"{safe_stem}_content_list.json" + self.logger.info(f"[MinerU] Trying vlm subdirectory with sanitized name: {vlm_safe}") + attempted.append(vlm_safe) + if vlm_safe.exists(): + subdir = vlm_safe.parent + json_file = vlm_safe if not json_file: parse_subdir = None