mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-01 00:05:43 +08:00
Refact: refact on parser structure (#14012)
### What problem does this PR solve? Refact: refact on parser structure ### Type of change - [x] Refactoring
This commit is contained in:
@@ -891,6 +891,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
callback(0.1, "Start to parse.")
|
||||
sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?"))
|
||||
sections = _normalize_section_text_for_rtl_presentation_forms(sections)
|
||||
print("\n", "-"*150, "\n")
|
||||
print(sections)
|
||||
print("\n", "-"*150, "\n")
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE):
|
||||
|
||||
@@ -66,7 +66,11 @@ class ParserParam(ProcessParamBase):
|
||||
"markdown",
|
||||
"html",
|
||||
],
|
||||
"word": [
|
||||
"doc": [
|
||||
"json",
|
||||
"markdown",
|
||||
],
|
||||
"docx": [
|
||||
"json",
|
||||
"markdown",
|
||||
],
|
||||
@@ -80,11 +84,11 @@ class ParserParam(ProcessParamBase):
|
||||
"text",
|
||||
"json",
|
||||
],
|
||||
"text&markdown": [
|
||||
"markdown": [
|
||||
"text",
|
||||
"json",
|
||||
],
|
||||
"code": [
|
||||
"text&code": [
|
||||
"text",
|
||||
"json",
|
||||
],
|
||||
@@ -121,21 +125,28 @@ class ParserParam(ProcessParamBase):
|
||||
"csv",
|
||||
],
|
||||
},
|
||||
"word": {
|
||||
"doc": {
|
||||
"remove_toc": False,
|
||||
"suffix": [
|
||||
"doc",
|
||||
],
|
||||
"output_format": "json",
|
||||
},
|
||||
"docx": {
|
||||
"remove_toc": False,
|
||||
"suffix": [
|
||||
"docx",
|
||||
],
|
||||
"output_format": "json",
|
||||
},
|
||||
"text&markdown": {
|
||||
"suffix": ["md", "markdown", "mdx", "txt"],
|
||||
"markdown": {
|
||||
"suffix": ["md", "markdown", "mdx"],
|
||||
"remove_toc": False,
|
||||
"output_format": "json",
|
||||
},
|
||||
"code": {
|
||||
"text&code": {
|
||||
"suffix": [
|
||||
"txt",
|
||||
"py",
|
||||
"js",
|
||||
"java",
|
||||
@@ -150,12 +161,12 @@ class ParserParam(ProcessParamBase):
|
||||
"kt",
|
||||
"sql",
|
||||
],
|
||||
"output_format": "text",
|
||||
"output_format": "json",
|
||||
},
|
||||
"html": {
|
||||
"suffix": ["htm", "html"],
|
||||
"remove_toc": "false",
|
||||
"output_format": "text",
|
||||
"output_format": "json",
|
||||
},
|
||||
"slides": {
|
||||
"parse_method": "deepdoc", # deepdoc/tcadp_parser
|
||||
@@ -235,10 +246,15 @@ class ParserParam(ProcessParamBase):
|
||||
spreadsheet_output_format = spreadsheet_config.get("output_format", "")
|
||||
self.check_valid_value(spreadsheet_output_format, "Spreadsheet output format abnormal.", self.allowed_output_format["spreadsheet"])
|
||||
|
||||
doc_config = self.setups.get("word", "")
|
||||
doc_config = self.setups.get("doc", "")
|
||||
if doc_config:
|
||||
doc_output_format = doc_config.get("output_format", "")
|
||||
self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["word"])
|
||||
self.check_valid_value(doc_output_format, "DOC output format abnormal.", self.allowed_output_format["doc"])
|
||||
|
||||
docx_config = self.setups.get("docx", "")
|
||||
if docx_config:
|
||||
docx_output_format = docx_config.get("output_format", "")
|
||||
self.check_valid_value(docx_output_format, "DOCX output format abnormal.", self.allowed_output_format["docx"])
|
||||
|
||||
slides_config = self.setups.get("slides", "")
|
||||
if slides_config:
|
||||
@@ -251,15 +267,15 @@ class ParserParam(ProcessParamBase):
|
||||
if image_parse_method not in ["ocr"]:
|
||||
self.check_empty(image_config.get("lang", ""), "Image VLM language")
|
||||
|
||||
text_config = self.setups.get("text&markdown", "")
|
||||
text_config = self.setups.get("markdown", "")
|
||||
if text_config:
|
||||
text_output_format = text_config.get("output_format", "")
|
||||
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text&markdown"])
|
||||
self.check_valid_value(text_output_format, "Markdown output format abnormal.", self.allowed_output_format["markdown"])
|
||||
|
||||
code_config = self.setups.get("code", "")
|
||||
code_config = self.setups.get("text&code", "")
|
||||
if code_config:
|
||||
code_output_format = code_config.get("output_format", "")
|
||||
self.check_valid_value(code_output_format, "Code output format abnormal.", self.allowed_output_format["code"])
|
||||
self.check_valid_value(code_output_format, "Text&Code output format abnormal.", self.allowed_output_format["text&code"])
|
||||
|
||||
html_config = self.setups.get("html", "")
|
||||
if html_config:
|
||||
@@ -733,10 +749,27 @@ class Parser(ProcessBase):
|
||||
elif conf.get("output_format") == "markdown":
|
||||
self.set_output("markdown", spreadsheet_parser.markdown(blob))
|
||||
|
||||
def _word(self, name, blob, **kwargs):
|
||||
"""Parse doc/docx files and optionally remove table-of-contents content."""
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
|
||||
conf = self._param.setups["word"]
|
||||
def _doc(self, name, blob, **kwargs):
|
||||
"""Parse DOC files into text/json sections."""
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOC document")
|
||||
conf = self._param.setups["doc"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
from tika import parser as tika_parser
|
||||
|
||||
parsed = tika_parser.from_buffer(io.BytesIO(blob))
|
||||
sections = [line for line in parsed["content"].split("\n") if line]
|
||||
|
||||
if conf.get("output_format") == "json":
|
||||
self.set_output("json", [{"text": section, "doc_type_kwd": "text"} for section in sections])
|
||||
return
|
||||
|
||||
self.set_output("markdown", "\n".join(sections))
|
||||
|
||||
def _docx(self, name, blob, **kwargs):
|
||||
"""Parse DOCX files and optionally remove table-of-contents content."""
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOCX document")
|
||||
conf = self._param.setups["docx"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
if re.search(r"\.doc$", name, re.IGNORECASE):
|
||||
@@ -885,14 +918,14 @@ class Parser(ProcessBase):
|
||||
self.set_output("json", sections)
|
||||
|
||||
def _markdown(self, name, blob, **kwargs):
|
||||
"""Parse markdown and txt files into text/json sections."""
|
||||
"""Parse markdown files into text/json sections."""
|
||||
from functools import reduce
|
||||
|
||||
from rag.app.naive import Markdown as naive_markdown_parser
|
||||
from rag.nlp import concat_img
|
||||
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
|
||||
conf = self._param.setups["text&markdown"]
|
||||
conf = self._param.setups["markdown"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
markdown_parser = naive_markdown_parser()
|
||||
@@ -903,11 +936,6 @@ class Parser(ProcessBase):
|
||||
delimiter=conf.get("delimiter"),
|
||||
return_section_images=True,
|
||||
)
|
||||
if name.lower().endswith(".txt") and conf.get("remove_toc") == "true":
|
||||
sections, kept_indices = remove_toc(sections)
|
||||
if section_images:
|
||||
section_images = [section_images[i] for i in kept_indices if i < len(section_images)]
|
||||
|
||||
if conf.get("output_format") == "json":
|
||||
json_results = []
|
||||
|
||||
@@ -937,11 +965,15 @@ class Parser(ProcessBase):
|
||||
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
|
||||
|
||||
def _code(self, name, blob, **kwargs):
|
||||
"""Parse source code files as plain text chunks."""
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a code or plain text file.")
|
||||
conf = self._param.setups["code"]
|
||||
"""Parse text and source code files as plain text chunks."""
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a text or code file.")
|
||||
conf = self._param.setups["text&code"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
print("\n\n")
|
||||
print(conf.get("output_format"))
|
||||
print("\n\n")
|
||||
|
||||
sections = TxtParser()(
|
||||
name,
|
||||
blob,
|
||||
@@ -952,6 +984,10 @@ class Parser(ProcessBase):
|
||||
self.set_output("json", [{"text": section[0], "doc_type_kwd": "text"} for section in sections if section[0]])
|
||||
return
|
||||
|
||||
print("\n", "-"*150, "\n")
|
||||
print(sections)
|
||||
print("\n", "-"*150, "\n")
|
||||
|
||||
self.set_output("text", "\n".join([section[0] for section in sections if section[0]]))
|
||||
|
||||
def _html(self, name, blob, **kwargs):
|
||||
@@ -1199,12 +1235,13 @@ class Parser(ProcessBase):
|
||||
"""Dispatch the current file to the matching parser branch by suffix."""
|
||||
function_map = {
|
||||
"pdf": self._pdf,
|
||||
"text&markdown": self._markdown,
|
||||
"code": self._code,
|
||||
"markdown": self._markdown,
|
||||
"text&code": self._code,
|
||||
"html": self._html,
|
||||
"spreadsheet": self._spreadsheet,
|
||||
"slides": self._slides,
|
||||
"word": self._word,
|
||||
"doc": self._doc,
|
||||
"docx": self._docx,
|
||||
"image": self._image,
|
||||
"audio": self._audio,
|
||||
"video": self._video,
|
||||
|
||||
@@ -2251,10 +2251,11 @@ This process aggregates variables from multiple branches into a single variable
|
||||
spreadsheet: 'Spreadsheet',
|
||||
image: 'Image',
|
||||
email: 'Email',
|
||||
'text&markdown': 'Text & Markup',
|
||||
code: 'Code',
|
||||
markdown: 'Markdown',
|
||||
'text&code': 'Text & Code',
|
||||
html: 'HTML',
|
||||
word: 'Word',
|
||||
doc: 'DOC',
|
||||
docx: 'DOCX',
|
||||
slides: 'PPTX',
|
||||
audio: 'Audio',
|
||||
video: 'Video',
|
||||
|
||||
@@ -1950,10 +1950,11 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于
|
||||
spreadsheet: '表格',
|
||||
image: '图片',
|
||||
email: '邮件',
|
||||
'text&markdown': '文本与标记',
|
||||
code: '代码',
|
||||
markdown: 'Markdown',
|
||||
'text&code': '文本与代码',
|
||||
html: 'HTML',
|
||||
word: 'Word',
|
||||
doc: 'DOC',
|
||||
docx: 'DOCX',
|
||||
slides: 'PPTX',
|
||||
audio: '音频',
|
||||
video: '视频',
|
||||
|
||||
@@ -9,10 +9,11 @@ export enum FileType {
|
||||
Spreadsheet = 'spreadsheet',
|
||||
Image = 'image',
|
||||
Email = 'email',
|
||||
TextMarkdown = 'text&markdown',
|
||||
Code = 'code',
|
||||
TextMarkdown = 'markdown',
|
||||
Code = 'text&code',
|
||||
Html = 'html',
|
||||
Docx = 'word',
|
||||
Doc = 'doc',
|
||||
Docx = 'docx',
|
||||
PowerPoint = 'slides',
|
||||
Video = 'video',
|
||||
Audio = 'audio',
|
||||
@@ -41,6 +42,11 @@ export enum TextMarkdownOutputFormat {
|
||||
Text = 'text',
|
||||
}
|
||||
|
||||
export enum TextJsonOutputFormat {
|
||||
Text = 'text',
|
||||
Json = 'json',
|
||||
}
|
||||
|
||||
export enum DocxOutputFormat {
|
||||
Markdown = 'markdown',
|
||||
Json = 'json',
|
||||
@@ -64,8 +70,9 @@ export const OutputFormatMap = {
|
||||
[FileType.Image]: ImageOutputFormat,
|
||||
[FileType.Email]: EmailOutputFormat,
|
||||
[FileType.TextMarkdown]: TextMarkdownOutputFormat,
|
||||
[FileType.Code]: TextMarkdownOutputFormat,
|
||||
[FileType.Html]: TextMarkdownOutputFormat,
|
||||
[FileType.Code]: TextJsonOutputFormat,
|
||||
[FileType.Html]: TextJsonOutputFormat,
|
||||
[FileType.Doc]: DocxOutputFormat,
|
||||
[FileType.Docx]: DocxOutputFormat,
|
||||
[FileType.PowerPoint]: PptOutputFormat,
|
||||
[FileType.Video]: VideoOutputFormat,
|
||||
@@ -78,8 +85,9 @@ export const InitialOutputFormatMap = {
|
||||
[FileType.Image]: ImageOutputFormat.Text,
|
||||
[FileType.Email]: EmailOutputFormat.Text,
|
||||
[FileType.TextMarkdown]: TextMarkdownOutputFormat.Text,
|
||||
[FileType.Code]: TextMarkdownOutputFormat.Text,
|
||||
[FileType.Html]: TextMarkdownOutputFormat.Text,
|
||||
[FileType.Code]: TextJsonOutputFormat.Json,
|
||||
[FileType.Html]: TextJsonOutputFormat.Json,
|
||||
[FileType.Doc]: DocxOutputFormat.Json,
|
||||
[FileType.Docx]: DocxOutputFormat.Json,
|
||||
[FileType.PowerPoint]: PptOutputFormat.Json,
|
||||
[FileType.Video]: VideoOutputFormat.Text,
|
||||
@@ -216,12 +224,17 @@ export const initialParserValues = {
|
||||
},
|
||||
{
|
||||
fileFormat: FileType.Code,
|
||||
output_format: TextMarkdownOutputFormat.Text,
|
||||
output_format: TextJsonOutputFormat.Json,
|
||||
preprocess: PreprocessValue.main_content,
|
||||
},
|
||||
{
|
||||
fileFormat: FileType.Html,
|
||||
output_format: TextMarkdownOutputFormat.Text,
|
||||
output_format: TextJsonOutputFormat.Json,
|
||||
preprocess: PreprocessValue.main_content,
|
||||
},
|
||||
{
|
||||
fileFormat: FileType.Doc,
|
||||
output_format: DocxOutputFormat.Json,
|
||||
preprocess: PreprocessValue.main_content,
|
||||
},
|
||||
{
|
||||
@@ -340,8 +353,9 @@ export const FileTypeSuffixMap = {
|
||||
[FileType.Spreadsheet]: ['xls', 'xlsx', 'csv'],
|
||||
[FileType.Image]: ['jpg', 'jpeg', 'png', 'gif'],
|
||||
[FileType.Email]: ['eml', 'msg'],
|
||||
[FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'],
|
||||
[FileType.TextMarkdown]: ['md', 'markdown', 'mdx'],
|
||||
[FileType.Code]: [
|
||||
'txt',
|
||||
'py',
|
||||
'js',
|
||||
'java',
|
||||
@@ -357,7 +371,8 @@ export const FileTypeSuffixMap = {
|
||||
'sql',
|
||||
],
|
||||
[FileType.Html]: ['htm', 'html'],
|
||||
[FileType.Docx]: ['doc', 'docx'],
|
||||
[FileType.Doc]: ['doc'],
|
||||
[FileType.Docx]: ['docx'],
|
||||
[FileType.PowerPoint]: ['pptx', 'ppt'],
|
||||
[FileType.Video]: ['mp4', 'avi', 'mkv'],
|
||||
[FileType.Audio]: [
|
||||
|
||||
@@ -82,6 +82,10 @@ const PreprocessOptionConfigsMap: Partial<
|
||||
{ value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
|
||||
{ value: PreprocessValue.section_title },
|
||||
],
|
||||
[FileType.Doc]: [
|
||||
{ value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
|
||||
{ value: PreprocessValue.section_title },
|
||||
],
|
||||
[FileType.Docx]: [
|
||||
{ value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
|
||||
{ value: PreprocessValue.section_title },
|
||||
|
||||
Reference in New Issue
Block a user