diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 6f2f26d041..cf756649b7 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -936,6 +936,7 @@ class Parser(ProcessBase): delimiter=conf.get("delimiter"), return_section_images=True, ) + if conf.get("output_format") == "json": json_results = [] @@ -954,6 +955,16 @@ class Parser(ProcessBase): json_result["doc_type_kwd"] = "image" if json_result.get("image") is not None else "text" json_results.append(json_result) + for table in tables: + table_text = table[0][1] if table and table[0] else "" + if table_text: + json_results.append( + { + "text": table_text, + "doc_type_kwd": "table", + } + ) + enhance_media_sections_with_vision( json_results, self._canvas._tenant_id, @@ -962,7 +973,9 @@ class Parser(ProcessBase): ) self.set_output("json", json_results) else: - self.set_output("text", "\n".join([section_text for section_text, _ in sections])) + texts = [section_text for section_text, _ in sections if section_text] + texts.extend(table[0][1] for table in tables if table and table[0] and table[0][1]) + self.set_output("text", "\n".join(texts)) def _code(self, name, blob, **kwargs): """Parse text and source code files as plain text chunks.""" diff --git a/web/src/pages/agent/constant/pipeline.tsx b/web/src/pages/agent/constant/pipeline.tsx index 757c69c4b2..67eca573f3 100644 --- a/web/src/pages/agent/constant/pipeline.tsx +++ b/web/src/pages/agent/constant/pipeline.tsx @@ -39,7 +39,7 @@ export enum EmailOutputFormat { } export enum TextMarkdownOutputFormat { - Text = 'text', + Text = 'json', } export enum TextJsonOutputFormat {