diff --git a/rag/app/naive.py b/rag/app/naive.py index f885fcbab2..25b715b6ed 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -891,6 +891,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca callback(0.1, "Start to parse.") sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?")) sections = _normalize_section_text_for_rtl_presentation_forms(sections) + print("\n", "-"*150, "\n") + print(sections) + print("\n", "-"*150, "\n") callback(0.8, "Finish parsing.") elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE): diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 182f07595f..e9c06cb879 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -66,7 +66,11 @@ class ParserParam(ProcessParamBase): "markdown", "html", ], - "word": [ + "doc": [ + "json", + "markdown", + ], + "docx": [ "json", "markdown", ], @@ -80,11 +84,11 @@ class ParserParam(ProcessParamBase): "text", "json", ], - "text&markdown": [ + "markdown": [ "text", "json", ], - "code": [ + "text&code": [ "text", "json", ], @@ -121,21 +125,28 @@ class ParserParam(ProcessParamBase): "csv", ], }, - "word": { + "doc": { "remove_toc": False, "suffix": [ "doc", + ], + "output_format": "json", + }, + "docx": { + "remove_toc": False, + "suffix": [ "docx", ], "output_format": "json", }, - "text&markdown": { - "suffix": ["md", "markdown", "mdx", "txt"], + "markdown": { + "suffix": ["md", "markdown", "mdx"], "remove_toc": False, "output_format": "json", }, - "code": { + "text&code": { "suffix": [ + "txt", "py", "js", "java", @@ -150,12 +161,12 @@ class ParserParam(ProcessParamBase): "kt", "sql", ], - "output_format": "text", + "output_format": "json", }, "html": { "suffix": ["htm", "html"], "remove_toc": "false", - "output_format": "text", + "output_format": "json", }, "slides": { "parse_method": "deepdoc", # deepdoc/tcadp_parser @@ -235,10 +246,15 @@ class ParserParam(ProcessParamBase): spreadsheet_output_format = spreadsheet_config.get("output_format", "") self.check_valid_value(spreadsheet_output_format, "Spreadsheet output format abnormal.", self.allowed_output_format["spreadsheet"]) - doc_config = self.setups.get("word", "") + doc_config = self.setups.get("doc", "") if doc_config: doc_output_format = doc_config.get("output_format", "") - self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["word"]) + self.check_valid_value(doc_output_format, "DOC output format abnormal.", self.allowed_output_format["doc"]) + + docx_config = self.setups.get("docx", "") + if docx_config: + docx_output_format = docx_config.get("output_format", "") + self.check_valid_value(docx_output_format, "DOCX output format abnormal.", self.allowed_output_format["docx"]) slides_config = self.setups.get("slides", "") if slides_config: @@ -251,15 +267,15 @@ class ParserParam(ProcessParamBase): if image_parse_method not in ["ocr"]: self.check_empty(image_config.get("lang", ""), "Image VLM language") - text_config = self.setups.get("text&markdown", "") + text_config = self.setups.get("markdown", "") if text_config: text_output_format = text_config.get("output_format", "") - self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text&markdown"]) + self.check_valid_value(text_output_format, "Markdown output format abnormal.", self.allowed_output_format["markdown"]) - code_config = self.setups.get("code", "") + code_config = self.setups.get("text&code", "") if code_config: code_output_format = code_config.get("output_format", "") - self.check_valid_value(code_output_format, "Code output format abnormal.", self.allowed_output_format["code"]) + self.check_valid_value(code_output_format, "Text&Code output format abnormal.", self.allowed_output_format["text&code"]) html_config = self.setups.get("html", "") if html_config: @@ -733,10 +749,27 @@ class Parser(ProcessBase): elif conf.get("output_format") == "markdown": self.set_output("markdown", spreadsheet_parser.markdown(blob)) - def _word(self, name, blob, **kwargs): - """Parse doc/docx files and optionally remove table-of-contents content.""" - self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document") - conf = self._param.setups["word"] + def _doc(self, name, blob, **kwargs): + """Parse DOC files into text/json sections.""" + self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOC document") + conf = self._param.setups["doc"] + self.set_output("output_format", conf["output_format"]) + + from tika import parser as tika_parser + + parsed = tika_parser.from_buffer(io.BytesIO(blob)) + sections = [line for line in parsed["content"].split("\n") if line] + + if conf.get("output_format") == "json": + self.set_output("json", [{"text": section, "doc_type_kwd": "text"} for section in sections]) + return + + self.set_output("markdown", "\n".join(sections)) + + def _docx(self, name, blob, **kwargs): + """Parse DOCX files and optionally remove table-of-contents content.""" + self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOCX document") + conf = self._param.setups["docx"] self.set_output("output_format", conf["output_format"]) if re.search(r"\.doc$", name, re.IGNORECASE): @@ -885,14 +918,14 @@ class Parser(ProcessBase): self.set_output("json", sections) def _markdown(self, name, blob, **kwargs): - """Parse markdown and txt files into text/json sections.""" + """Parse markdown files into text/json sections.""" from functools import reduce from rag.app.naive import Markdown as naive_markdown_parser from rag.nlp import concat_img self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.") - conf = self._param.setups["text&markdown"] + conf = self._param.setups["markdown"] self.set_output("output_format", conf["output_format"]) markdown_parser = naive_markdown_parser() @@ -903,11 +936,6 @@ class Parser(ProcessBase): delimiter=conf.get("delimiter"), return_section_images=True, ) - if name.lower().endswith(".txt") and conf.get("remove_toc") == "true": - sections, kept_indices = remove_toc(sections) - if section_images: - section_images = [section_images[i] for i in kept_indices if i < len(section_images)] - if conf.get("output_format") == "json": json_results = [] @@ -937,11 +965,15 @@ class Parser(ProcessBase): self.set_output("text", "\n".join([section_text for section_text, _ in sections])) def _code(self, name, blob, **kwargs): - """Parse source code files as plain text chunks.""" - self.callback(random.randint(1, 5) / 100.0, "Start to work on a code or plain text file.") - conf = self._param.setups["code"] + """Parse text and source code files as plain text chunks.""" + self.callback(random.randint(1, 5) / 100.0, "Start to work on a text or code file.") + conf = self._param.setups["text&code"] self.set_output("output_format", conf["output_format"]) + print("\n\n") + print(conf.get("output_format")) + print("\n\n") + sections = TxtParser()( name, blob, @@ -952,6 +984,10 @@ class Parser(ProcessBase): self.set_output("json", [{"text": section[0], "doc_type_kwd": "text"} for section in sections if section[0]]) return + print("\n", "-"*150, "\n") + print(sections) + print("\n", "-"*150, "\n") + self.set_output("text", "\n".join([section[0] for section in sections if section[0]])) def _html(self, name, blob, **kwargs): @@ -1199,12 +1235,13 @@ class Parser(ProcessBase): """Dispatch the current file to the matching parser branch by suffix.""" function_map = { "pdf": self._pdf, - "text&markdown": self._markdown, - "code": self._code, + "markdown": self._markdown, + "text&code": self._code, "html": self._html, "spreadsheet": self._spreadsheet, "slides": self._slides, - "word": self._word, + "doc": self._doc, + "docx": self._docx, "image": self._image, "audio": self._audio, "video": self._video, diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index 87e73c3d91..b72a096c48 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -2251,10 +2251,11 @@ This process aggregates variables from multiple branches into a single variable spreadsheet: 'Spreadsheet', image: 'Image', email: 'Email', - 'text&markdown': 'Text & Markup', - code: 'Code', + markdown: 'Markdown', + 'text&code': 'Text & Code', html: 'HTML', - word: 'Word', + doc: 'DOC', + docx: 'DOCX', slides: 'PPTX', audio: 'Audio', video: 'Video', diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index 919316df4f..1eb2d30cc2 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -1950,10 +1950,11 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于 spreadsheet: '表格', image: '图片', email: '邮件', - 'text&markdown': '文本与标记', - code: '代码', + markdown: 'Markdown', + 'text&code': '文本与代码', html: 'HTML', - word: 'Word', + doc: 'DOC', + docx: 'DOCX', slides: 'PPTX', audio: '音频', video: '视频', diff --git a/web/src/pages/agent/constant/pipeline.tsx b/web/src/pages/agent/constant/pipeline.tsx index f5996660e8..7578da3965 100644 --- a/web/src/pages/agent/constant/pipeline.tsx +++ b/web/src/pages/agent/constant/pipeline.tsx @@ -9,10 +9,11 @@ export enum FileType { Spreadsheet = 'spreadsheet', Image = 'image', Email = 'email', - TextMarkdown = 'text&markdown', - Code = 'code', + TextMarkdown = 'markdown', + Code = 'text&code', Html = 'html', - Docx = 'word', + Doc = 'doc', + Docx = 'docx', PowerPoint = 'slides', Video = 'video', Audio = 'audio', @@ -41,6 +42,11 @@ export enum TextMarkdownOutputFormat { Text = 'text', } +export enum TextJsonOutputFormat { + Text = 'text', + Json = 'json', +} + export enum DocxOutputFormat { Markdown = 'markdown', Json = 'json', @@ -64,8 +70,9 @@ export const OutputFormatMap = { [FileType.Image]: ImageOutputFormat, [FileType.Email]: EmailOutputFormat, [FileType.TextMarkdown]: TextMarkdownOutputFormat, - [FileType.Code]: TextMarkdownOutputFormat, - [FileType.Html]: TextMarkdownOutputFormat, + [FileType.Code]: TextJsonOutputFormat, + [FileType.Html]: TextJsonOutputFormat, + [FileType.Doc]: DocxOutputFormat, [FileType.Docx]: DocxOutputFormat, [FileType.PowerPoint]: PptOutputFormat, [FileType.Video]: VideoOutputFormat, @@ -78,8 +85,9 @@ export const InitialOutputFormatMap = { [FileType.Image]: ImageOutputFormat.Text, [FileType.Email]: EmailOutputFormat.Text, [FileType.TextMarkdown]: TextMarkdownOutputFormat.Text, - [FileType.Code]: TextMarkdownOutputFormat.Text, - [FileType.Html]: TextMarkdownOutputFormat.Text, + [FileType.Code]: TextJsonOutputFormat.Json, + [FileType.Html]: TextJsonOutputFormat.Json, + [FileType.Doc]: DocxOutputFormat.Json, [FileType.Docx]: DocxOutputFormat.Json, [FileType.PowerPoint]: PptOutputFormat.Json, [FileType.Video]: VideoOutputFormat.Text, @@ -216,12 +224,17 @@ export const initialParserValues = { }, { fileFormat: FileType.Code, - output_format: TextMarkdownOutputFormat.Text, + output_format: TextJsonOutputFormat.Json, preprocess: PreprocessValue.main_content, }, { fileFormat: FileType.Html, - output_format: TextMarkdownOutputFormat.Text, + output_format: TextJsonOutputFormat.Json, + preprocess: PreprocessValue.main_content, + }, + { + fileFormat: FileType.Doc, + output_format: DocxOutputFormat.Json, preprocess: PreprocessValue.main_content, }, { @@ -340,8 +353,9 @@ export const FileTypeSuffixMap = { [FileType.Spreadsheet]: ['xls', 'xlsx', 'csv'], [FileType.Image]: ['jpg', 'jpeg', 'png', 'gif'], [FileType.Email]: ['eml', 'msg'], - [FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'], + [FileType.TextMarkdown]: ['md', 'markdown', 'mdx'], [FileType.Code]: [ + 'txt', 'py', 'js', 'java', @@ -357,7 +371,8 @@ export const FileTypeSuffixMap = { 'sql', ], [FileType.Html]: ['htm', 'html'], - [FileType.Docx]: ['doc', 'docx'], + [FileType.Doc]: ['doc'], + [FileType.Docx]: ['docx'], [FileType.PowerPoint]: ['pptx', 'ppt'], [FileType.Video]: ['mp4', 'avi', 'mkv'], [FileType.Audio]: [ diff --git a/web/src/pages/agent/form/parser-form/index.tsx b/web/src/pages/agent/form/parser-form/index.tsx index d3f78b1cd7..b21f5ff668 100644 --- a/web/src/pages/agent/form/parser-form/index.tsx +++ b/web/src/pages/agent/form/parser-form/index.tsx @@ -82,6 +82,10 @@ const PreprocessOptionConfigsMap: Partial< { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, { value: PreprocessValue.section_title }, ], + [FileType.Doc]: [ + { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, + { value: PreprocessValue.section_title }, + ], [FileType.Docx]: [ { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, { value: PreprocessValue.section_title },