Fix: markdown parser in pipeline (#14032)

### What problem does this PR solve?

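Fix: markdown parser in pipeline. Parsed tables are now included in the parser output: each non-empty table is appended to the JSON results as an entry with `doc_type_kwd` set to `"table"`, and table text is appended to the plain-text output. Empty section texts are also skipped when building the text output.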

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Magicbook1108
2026-04-10 14:11:14 +08:00
committed by GitHub
parent 9ce293a736
commit 18cafff790
2 changed files with 15 additions and 2 deletions

View File

@@ -936,6 +936,7 @@ class Parser(ProcessBase):
delimiter=conf.get("delimiter"),
return_section_images=True,
)
if conf.get("output_format") == "json":
json_results = []
@@ -954,6 +955,16 @@ class Parser(ProcessBase):
json_result["doc_type_kwd"] = "image" if json_result.get("image") is not None else "text"
json_results.append(json_result)
for table in tables:
table_text = table[0][1] if table and table[0] else ""
if table_text:
json_results.append(
{
"text": table_text,
"doc_type_kwd": "table",
}
)
enhance_media_sections_with_vision(
json_results,
self._canvas._tenant_id,
@@ -962,7 +973,9 @@ class Parser(ProcessBase):
)
self.set_output("json", json_results)
else:
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
texts = [section_text for section_text, _ in sections if section_text]
texts.extend(table[0][1] for table in tables if table and table[0] and table[0][1])
self.set_output("text", "\n".join(texts))
def _code(self, name, blob, **kwargs):
"""Parse text and source code files as plain text chunks."""

View File

@@ -39,7 +39,7 @@ export enum EmailOutputFormat {
}
export enum TextMarkdownOutputFormat {
Text = 'text',
Text = 'json',
}
export enum TextJsonOutputFormat {