Refactor: refactor parser structure (#14012)

### What problem does this PR solve?

Refactor: refactor the parser structure

### Type of change

- [x] Refactoring
This commit is contained in:
Magicbook1108
2026-04-10 10:03:44 +08:00
committed by GitHub
parent cd04467b9b
commit 27329b40ed
6 changed files with 110 additions and 49 deletions

View File

@@ -891,6 +891,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
callback(0.1, "Start to parse.")
sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?"))
sections = _normalize_section_text_for_rtl_presentation_forms(sections)
print("\n", "-"*150, "\n")
print(sections)
print("\n", "-"*150, "\n")
callback(0.8, "Finish parsing.")
elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE):

View File

@@ -66,7 +66,11 @@ class ParserParam(ProcessParamBase):
"markdown",
"html",
],
"word": [
"doc": [
"json",
"markdown",
],
"docx": [
"json",
"markdown",
],
@@ -80,11 +84,11 @@ class ParserParam(ProcessParamBase):
"text",
"json",
],
"text&markdown": [
"markdown": [
"text",
"json",
],
"code": [
"text&code": [
"text",
"json",
],
@@ -121,21 +125,28 @@ class ParserParam(ProcessParamBase):
"csv",
],
},
"word": {
"doc": {
"remove_toc": False,
"suffix": [
"doc",
],
"output_format": "json",
},
"docx": {
"remove_toc": False,
"suffix": [
"docx",
],
"output_format": "json",
},
"text&markdown": {
"suffix": ["md", "markdown", "mdx", "txt"],
"markdown": {
"suffix": ["md", "markdown", "mdx"],
"remove_toc": False,
"output_format": "json",
},
"code": {
"text&code": {
"suffix": [
"txt",
"py",
"js",
"java",
@@ -150,12 +161,12 @@ class ParserParam(ProcessParamBase):
"kt",
"sql",
],
"output_format": "text",
"output_format": "json",
},
"html": {
"suffix": ["htm", "html"],
"remove_toc": "false",
"output_format": "text",
"output_format": "json",
},
"slides": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser
@@ -235,10 +246,15 @@ class ParserParam(ProcessParamBase):
spreadsheet_output_format = spreadsheet_config.get("output_format", "")
self.check_valid_value(spreadsheet_output_format, "Spreadsheet output format abnormal.", self.allowed_output_format["spreadsheet"])
doc_config = self.setups.get("word", "")
doc_config = self.setups.get("doc", "")
if doc_config:
doc_output_format = doc_config.get("output_format", "")
self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["word"])
self.check_valid_value(doc_output_format, "DOC output format abnormal.", self.allowed_output_format["doc"])
docx_config = self.setups.get("docx", "")
if docx_config:
docx_output_format = docx_config.get("output_format", "")
self.check_valid_value(docx_output_format, "DOCX output format abnormal.", self.allowed_output_format["docx"])
slides_config = self.setups.get("slides", "")
if slides_config:
@@ -251,15 +267,15 @@ class ParserParam(ProcessParamBase):
if image_parse_method not in ["ocr"]:
self.check_empty(image_config.get("lang", ""), "Image VLM language")
text_config = self.setups.get("text&markdown", "")
text_config = self.setups.get("markdown", "")
if text_config:
text_output_format = text_config.get("output_format", "")
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text&markdown"])
self.check_valid_value(text_output_format, "Markdown output format abnormal.", self.allowed_output_format["markdown"])
code_config = self.setups.get("code", "")
code_config = self.setups.get("text&code", "")
if code_config:
code_output_format = code_config.get("output_format", "")
self.check_valid_value(code_output_format, "Code output format abnormal.", self.allowed_output_format["code"])
self.check_valid_value(code_output_format, "Text&Code output format abnormal.", self.allowed_output_format["text&code"])
html_config = self.setups.get("html", "")
if html_config:
@@ -733,10 +749,27 @@ class Parser(ProcessBase):
elif conf.get("output_format") == "markdown":
self.set_output("markdown", spreadsheet_parser.markdown(blob))
def _word(self, name, blob, **kwargs):
"""Parse doc/docx files and optionally remove table-of-contents content."""
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
conf = self._param.setups["word"]
def _doc(self, name, blob, **kwargs):
    """Parse a DOC file with Tika and publish it as json or markdown.

    The extracted text is split on newlines and empty lines are dropped.
    When the configured ``output_format`` is ``"json"`` each remaining line
    becomes a ``{"text": ..., "doc_type_kwd": "text"}`` item stored under the
    ``json`` output; for any other value the lines are joined with newlines
    and stored under the ``markdown`` output.

    Args:
        name: File name of the document (not used by this method).
        blob: Raw bytes of the DOC file.
        **kwargs: Accepted for signature parity with other parsers; unused.
    """
    self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOC document")
    conf = self._param.setups["doc"]
    self.set_output("output_format", conf["output_format"])

    # Imported lazily so the dependency is only required when a DOC file
    # is actually parsed.
    from tika import parser as tika_parser

    parsed = tika_parser.from_buffer(io.BytesIO(blob))
    # Tika reports ``content`` as None (or omits it) when no text could be
    # extracted, e.g. for an empty or image-only document; treat that as an
    # empty document instead of failing on ``None.split``.
    content = (parsed or {}).get("content") or ""
    sections = [line for line in content.split("\n") if line]

    if conf.get("output_format") == "json":
        self.set_output("json", [{"text": section, "doc_type_kwd": "text"} for section in sections])
        return

    self.set_output("markdown", "\n".join(sections))
def _docx(self, name, blob, **kwargs):
"""Parse DOCX files and optionally remove table-of-contents content."""
self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOCX document")
conf = self._param.setups["docx"]
self.set_output("output_format", conf["output_format"])
if re.search(r"\.doc$", name, re.IGNORECASE):
@@ -885,14 +918,14 @@ class Parser(ProcessBase):
self.set_output("json", sections)
def _markdown(self, name, blob, **kwargs):
"""Parse markdown and txt files into text/json sections."""
"""Parse markdown files into text/json sections."""
from functools import reduce
from rag.app.naive import Markdown as naive_markdown_parser
from rag.nlp import concat_img
self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
conf = self._param.setups["text&markdown"]
conf = self._param.setups["markdown"]
self.set_output("output_format", conf["output_format"])
markdown_parser = naive_markdown_parser()
@@ -903,11 +936,6 @@ class Parser(ProcessBase):
delimiter=conf.get("delimiter"),
return_section_images=True,
)
if name.lower().endswith(".txt") and conf.get("remove_toc") == "true":
sections, kept_indices = remove_toc(sections)
if section_images:
section_images = [section_images[i] for i in kept_indices if i < len(section_images)]
if conf.get("output_format") == "json":
json_results = []
@@ -937,11 +965,15 @@ class Parser(ProcessBase):
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
def _code(self, name, blob, **kwargs):
"""Parse source code files as plain text chunks."""
self.callback(random.randint(1, 5) / 100.0, "Start to work on a code or plain text file.")
conf = self._param.setups["code"]
"""Parse text and source code files as plain text chunks."""
self.callback(random.randint(1, 5) / 100.0, "Start to work on a text or code file.")
conf = self._param.setups["text&code"]
self.set_output("output_format", conf["output_format"])
print("\n\n")
print(conf.get("output_format"))
print("\n\n")
sections = TxtParser()(
name,
blob,
@@ -952,6 +984,10 @@ class Parser(ProcessBase):
self.set_output("json", [{"text": section[0], "doc_type_kwd": "text"} for section in sections if section[0]])
return
print("\n", "-"*150, "\n")
print(sections)
print("\n", "-"*150, "\n")
self.set_output("text", "\n".join([section[0] for section in sections if section[0]]))
def _html(self, name, blob, **kwargs):
@@ -1199,12 +1235,13 @@ class Parser(ProcessBase):
"""Dispatch the current file to the matching parser branch by suffix."""
function_map = {
"pdf": self._pdf,
"text&markdown": self._markdown,
"code": self._code,
"markdown": self._markdown,
"text&code": self._code,
"html": self._html,
"spreadsheet": self._spreadsheet,
"slides": self._slides,
"word": self._word,
"doc": self._doc,
"docx": self._docx,
"image": self._image,
"audio": self._audio,
"video": self._video,

View File

@@ -2251,10 +2251,11 @@ This process aggregates variables from multiple branches into a single variable
spreadsheet: 'Spreadsheet',
image: 'Image',
email: 'Email',
'text&markdown': 'Text & Markup',
code: 'Code',
markdown: 'Markdown',
'text&code': 'Text & Code',
html: 'HTML',
word: 'Word',
doc: 'DOC',
docx: 'DOCX',
slides: 'PPTX',
audio: 'Audio',
video: 'Video',

View File

@@ -1950,10 +1950,11 @@ General实体和关系提取提示来自 GitHub - microsoft/graphrag基于
spreadsheet: '表格',
image: '图片',
email: '邮件',
'text&markdown': '文本与标记',
code: '代码',
markdown: 'Markdown',
'text&code': '文本与代码',
html: 'HTML',
word: 'Word',
doc: 'DOC',
docx: 'DOCX',
slides: 'PPTX',
audio: '音频',
video: '视频',

View File

@@ -9,10 +9,11 @@ export enum FileType {
Spreadsheet = 'spreadsheet',
Image = 'image',
Email = 'email',
TextMarkdown = 'text&markdown',
Code = 'code',
TextMarkdown = 'markdown',
Code = 'text&code',
Html = 'html',
Docx = 'word',
Doc = 'doc',
Docx = 'docx',
PowerPoint = 'slides',
Video = 'video',
Audio = 'audio',
@@ -41,6 +42,11 @@ export enum TextMarkdownOutputFormat {
Text = 'text',
}
// Output formats restricted to plain text or JSON.
export enum TextJsonOutputFormat {
Text = 'text',
Json = 'json',
}
export enum DocxOutputFormat {
Markdown = 'markdown',
Json = 'json',
@@ -64,8 +70,9 @@ export const OutputFormatMap = {
[FileType.Image]: ImageOutputFormat,
[FileType.Email]: EmailOutputFormat,
[FileType.TextMarkdown]: TextMarkdownOutputFormat,
[FileType.Code]: TextMarkdownOutputFormat,
[FileType.Html]: TextMarkdownOutputFormat,
[FileType.Code]: TextJsonOutputFormat,
[FileType.Html]: TextJsonOutputFormat,
[FileType.Doc]: DocxOutputFormat,
[FileType.Docx]: DocxOutputFormat,
[FileType.PowerPoint]: PptOutputFormat,
[FileType.Video]: VideoOutputFormat,
@@ -78,8 +85,9 @@ export const InitialOutputFormatMap = {
[FileType.Image]: ImageOutputFormat.Text,
[FileType.Email]: EmailOutputFormat.Text,
[FileType.TextMarkdown]: TextMarkdownOutputFormat.Text,
[FileType.Code]: TextMarkdownOutputFormat.Text,
[FileType.Html]: TextMarkdownOutputFormat.Text,
[FileType.Code]: TextJsonOutputFormat.Json,
[FileType.Html]: TextJsonOutputFormat.Json,
[FileType.Doc]: DocxOutputFormat.Json,
[FileType.Docx]: DocxOutputFormat.Json,
[FileType.PowerPoint]: PptOutputFormat.Json,
[FileType.Video]: VideoOutputFormat.Text,
@@ -216,12 +224,17 @@ export const initialParserValues = {
},
{
fileFormat: FileType.Code,
output_format: TextMarkdownOutputFormat.Text,
output_format: TextJsonOutputFormat.Json,
preprocess: PreprocessValue.main_content,
},
{
fileFormat: FileType.Html,
output_format: TextMarkdownOutputFormat.Text,
output_format: TextJsonOutputFormat.Json,
preprocess: PreprocessValue.main_content,
},
{
fileFormat: FileType.Doc,
output_format: DocxOutputFormat.Json,
preprocess: PreprocessValue.main_content,
},
{
@@ -340,8 +353,9 @@ export const FileTypeSuffixMap = {
[FileType.Spreadsheet]: ['xls', 'xlsx', 'csv'],
[FileType.Image]: ['jpg', 'jpeg', 'png', 'gif'],
[FileType.Email]: ['eml', 'msg'],
[FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'],
[FileType.TextMarkdown]: ['md', 'markdown', 'mdx'],
[FileType.Code]: [
'txt',
'py',
'js',
'java',
@@ -357,7 +371,8 @@ export const FileTypeSuffixMap = {
'sql',
],
[FileType.Html]: ['htm', 'html'],
[FileType.Docx]: ['doc', 'docx'],
[FileType.Doc]: ['doc'],
[FileType.Docx]: ['docx'],
[FileType.PowerPoint]: ['pptx', 'ppt'],
[FileType.Video]: ['mp4', 'avi', 'mkv'],
[FileType.Audio]: [

View File

@@ -82,6 +82,10 @@ const PreprocessOptionConfigsMap: Partial<
{ value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
{ value: PreprocessValue.section_title },
],
[FileType.Doc]: [
{ value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
{ value: PreprocessValue.section_title },
],
[FileType.Docx]: [
{ value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
{ value: PreprocessValue.section_title },