mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
Feat: add button for remove header & footer in pipeline (#14486)
### What problem does this PR solve?

Feat: add a button to remove headers and footers in the pipeline.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@@ -52,7 +52,7 @@ class RAGFlowHtmlParser:
|
||||
raise TypeError("txt type should be string!")
|
||||
|
||||
temp_sections = []
|
||||
soup = BeautifulSoup(txt, "html5lib")
|
||||
soup = BeautifulSoup(txt, "html.parser")
|
||||
# delete <style> tag
|
||||
for style_tag in soup.find_all(["style", "script"]):
|
||||
style_tag.decompose()
|
||||
@@ -210,4 +210,3 @@ class RAGFlowHtmlParser:
|
||||
chunks.append(current_block)
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
@@ -46,6 +46,9 @@ from rag.flow.parser.schema import ParserFromUpstream
|
||||
from rag.flow.parser.utils import (
|
||||
enhance_media_sections_with_vision,
|
||||
extract_word_outlines,
|
||||
extract_docx_header_footer_texts,
|
||||
remove_header_footer_docx_sections,
|
||||
remove_header_footer_html_blob,
|
||||
remove_toc,
|
||||
remove_toc_pdf,
|
||||
remove_toc_word,
|
||||
@@ -113,6 +116,7 @@ class ParserParam(ProcessParamBase):
|
||||
"lang": "Chinese",
|
||||
"flatten_media_to_text": False,
|
||||
"remove_toc": False,
|
||||
"remove_header_footer": False,
|
||||
"suffix": [
|
||||
"pdf",
|
||||
],
|
||||
@@ -130,6 +134,7 @@ class ParserParam(ProcessParamBase):
|
||||
},
|
||||
"doc": {
|
||||
"remove_toc": False,
|
||||
"remove_header_footer": False,
|
||||
"suffix": [
|
||||
"doc",
|
||||
],
|
||||
@@ -138,6 +143,7 @@ class ParserParam(ProcessParamBase):
|
||||
"docx": {
|
||||
"flatten_media_to_text": False,
|
||||
"remove_toc": False,
|
||||
"remove_header_footer": False,
|
||||
"suffix": [
|
||||
"docx",
|
||||
],
|
||||
@@ -170,7 +176,8 @@ class ParserParam(ProcessParamBase):
|
||||
},
|
||||
"html": {
|
||||
"suffix": ["htm", "html"],
|
||||
"remove_toc": "false",
|
||||
"remove_toc": False,
|
||||
"remove_header_footer": False,
|
||||
"output_format": "json",
|
||||
},
|
||||
"slides": {
|
||||
@@ -321,10 +328,6 @@ class Parser(ProcessBase):
|
||||
flatten_media_to_text = conf.get("flatten_media_to_text")
|
||||
pdf_parser = None
|
||||
|
||||
# Optional PDF post-processing flags applied after parsing.
|
||||
abstract_enabled = "abstract" in conf.get("preprocess", [])
|
||||
author_enabled = "author" in conf.get("preprocess", [])
|
||||
|
||||
# Normalize parser selection and optional provider-specific model name.
|
||||
raw_parse_method = conf.get("parse_method", "")
|
||||
parser_model_name = None
|
||||
@@ -587,7 +590,6 @@ class Parser(ProcessBase):
|
||||
if image is not None:
|
||||
box["image"] = image
|
||||
bboxes.append(box)
|
||||
|
||||
# Vision parser treats each page as a large image block.
|
||||
else:
|
||||
if conf.get("parse_method"):
|
||||
@@ -634,19 +636,15 @@ class Parser(ProcessBase):
|
||||
toc_bboxes, _ = remove_toc(bboxes[:split_at])
|
||||
bboxes = toc_bboxes + bboxes[split_at:]
|
||||
|
||||
normalize_bboxes = []
|
||||
# Normalize shared bbox fields for downstream consumers.
|
||||
layout_counters = {}
|
||||
for b in bboxes:
|
||||
raw_layout = str(b.get("layout_type") or "").strip()
|
||||
has_layout = bool(raw_layout)
|
||||
layout = re.sub(r"\s+", " ", raw_layout) if has_layout else "text"
|
||||
b["layout_type"] = layout
|
||||
|
||||
if not b.get("layoutno"):
|
||||
seq = layout_counters.get(layout, 0)
|
||||
layout_counters[layout] = seq + 1
|
||||
b["layoutno"] = f"{layout}-{seq}"
|
||||
|
||||
if conf.get("remove_header_footer") and re.search(r"(header|footer|number)", raw_layout, re.I):
|
||||
continue
|
||||
if flatten_media_to_text:
|
||||
b["doc_type_kwd"] = "text"
|
||||
elif layout == "table":
|
||||
@@ -657,67 +655,8 @@ class Parser(ProcessBase):
|
||||
b["doc_type_kwd"] = "image"
|
||||
else:
|
||||
b["doc_type_kwd"] = "text"
|
||||
|
||||
# Mark likely author blocks near the title when enabled.
|
||||
if author_enabled:
|
||||
def _begin(txt):
|
||||
if not isinstance(txt, str):
|
||||
return False
|
||||
return re.match(
|
||||
r"[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
|
||||
txt.lower().strip(),
|
||||
)
|
||||
|
||||
i = 0
|
||||
while i < min(32, len(bboxes) - 1):
|
||||
b = bboxes[i]
|
||||
i += 1
|
||||
layout_type = b.get("layout_type", "")
|
||||
layoutno = b.get("layoutno", "")
|
||||
is_title = "title" in str(layout_type).lower() or "title" in str(layoutno).lower()
|
||||
if not is_title:
|
||||
continue
|
||||
|
||||
title_txt = b.get("text", "")
|
||||
if _begin(title_txt):
|
||||
break
|
||||
|
||||
for j in range(3):
|
||||
next_idx = i + j
|
||||
if next_idx >= len(bboxes):
|
||||
break
|
||||
candidate = bboxes[next_idx].get("text", "")
|
||||
if _begin(candidate):
|
||||
break
|
||||
if isinstance(candidate, str) and "@" in candidate:
|
||||
break
|
||||
bboxes[next_idx]["author"] = True
|
||||
break
|
||||
|
||||
# Mark the abstract block when enabled.
|
||||
if abstract_enabled:
|
||||
i = 0
|
||||
abstract_idx = None
|
||||
while i + 1 < min(32, len(bboxes)):
|
||||
b = bboxes[i]
|
||||
i += 1
|
||||
txt = b.get("text", "")
|
||||
if not isinstance(txt, str):
|
||||
continue
|
||||
txt = txt.lower().strip()
|
||||
if re.match(r"(abstract|摘要)", txt):
|
||||
if len(txt.split()) > 32 or len(txt) > 64:
|
||||
abstract_idx = i - 1
|
||||
break
|
||||
next_txt = bboxes[i].get("text", "") if i < len(bboxes) else ""
|
||||
if isinstance(next_txt, str):
|
||||
next_txt = next_txt.lower().strip()
|
||||
if len(next_txt.split()) > 32 or len(next_txt) > 64:
|
||||
abstract_idx = i
|
||||
i += 1
|
||||
break
|
||||
if abstract_idx is not None:
|
||||
bboxes[abstract_idx]["abstract"] = True
|
||||
normalize_bboxes.append(b)
|
||||
bboxes = normalize_bboxes
|
||||
|
||||
enhance_media_sections_with_vision(
|
||||
bboxes,
|
||||
@@ -900,6 +839,9 @@ class Parser(ProcessBase):
|
||||
# JSON output keeps text/image blocks and appends table HTML as table items.
|
||||
if conf.get("output_format") == "json":
|
||||
main_sections = docx_parser(name, binary=blob)
|
||||
if conf.get("remove_header_footer"):
|
||||
header_footer_texts = extract_docx_header_footer_texts(binary=blob)
|
||||
main_sections = remove_header_footer_docx_sections(main_sections, header_footer_texts)
|
||||
if conf.get("remove_toc"):
|
||||
main_sections = remove_toc_word(main_sections, outlines)
|
||||
sections = []
|
||||
@@ -931,6 +873,10 @@ class Parser(ProcessBase):
|
||||
# Markdown output removes TOC on plain markdown lines before writing back.
|
||||
elif conf.get("output_format") == "markdown":
|
||||
markdown_text = docx_parser.to_markdown(name, binary=blob)
|
||||
if conf.get("remove_header_footer"):
|
||||
header_footer_texts = extract_docx_header_footer_texts(binary=blob)
|
||||
markdown_lines = remove_header_footer_docx_sections(markdown_text.split("\n"), header_footer_texts)
|
||||
markdown_text = "\n".join(markdown_lines)
|
||||
if conf.get("remove_toc"):
|
||||
markdown_text = "\n".join(remove_toc_word(markdown_text.split("\n"), outlines))
|
||||
|
||||
@@ -1091,8 +1037,11 @@ class Parser(ProcessBase):
|
||||
conf = self._param.setups["html"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
if conf.get("remove_header_footer"):
|
||||
blob = remove_header_footer_html_blob(blob)
|
||||
|
||||
sections = HtmlParser()(name, blob, int(conf.get("chunk_token_num", 512)))
|
||||
if conf.get("remove_toc") == "true":
|
||||
if conf.get("remove_toc"):
|
||||
sections, _ = remove_toc(sections)
|
||||
if conf.get("output_format") == "json":
|
||||
self.set_output("json", [{"text": section, "doc_type_kwd": "text"} for section in sections if section])
|
||||
|
||||
@@ -72,7 +72,6 @@ def extract_pdf_positions(item):
|
||||
return []
|
||||
|
||||
positions = _extract_raw_positions(item)
|
||||
uses_position_tag = isinstance(item.get("position_tag"), str) and bool(item.get("position_tag"))
|
||||
ref_page_number = item.get("page_number")
|
||||
ref_page_number = int(ref_page_number) if isinstance(ref_page_number, (int, float)) else None
|
||||
if ref_page_number is not None and ref_page_number <= 0:
|
||||
@@ -86,9 +85,7 @@ def extract_pdf_positions(item):
|
||||
page_number = pos[0][-1] if isinstance(pos[0], list) else pos[0]
|
||||
try:
|
||||
page_number = int(page_number)
|
||||
if uses_position_tag:
|
||||
page_number += 1
|
||||
elif ref_page_number is not None and page_number == ref_page_number - 1:
|
||||
if ref_page_number is not None and page_number == ref_page_number - 1:
|
||||
page_number = ref_page_number
|
||||
elif page_number <= 0:
|
||||
page_number += 1
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
import re
|
||||
from io import BytesIO
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from docx import Document
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from api.db.joint_services.tenant_model_service import (
|
||||
@@ -34,6 +35,48 @@ def remove_toc(items):
|
||||
return [items[i] for i in kept_indices], kept_indices
|
||||
|
||||
|
||||
def extract_docx_header_footer_texts(filename=None, binary=None):
    """Collect the distinct texts found in a Word document's headers and footers.

    Args:
        filename: Handed to ``Document`` when ``binary`` is ``None``.
        binary: Raw document bytes. When given, they are wrapped in a
            ``BytesIO`` and opened instead of ``filename``.

    Returns:
        A set of non-empty strings, one per header/footer paragraph or table
        cell, with every whitespace run collapsed to a single space and the
        ends stripped.
    """
    source = filename if binary is None else BytesIO(binary)
    document = Document(source)

    def _squash(raw):
        # Same normalization for paragraphs and cells so both compare equal later.
        return re.sub(r"\s+", " ", raw).strip()

    collected = set()
    for sect in document.sections:
        for part in (sect.header, sect.footer):
            raw_texts = [para.text for para in part.paragraphs]
            raw_texts.extend(
                cell.text
                for tbl in part.tables
                for row in tbl.rows
                for cell in row.cells
            )
            for raw in raw_texts:
                cleaned = _squash(raw)
                if cleaned:
                    collected.add(cleaned)
    return collected
|
||||
|
||||
|
||||
def remove_header_footer_docx_sections(items, header_footer_texts):
    """Drop items whose normalized text equals a known header/footer text.

    Args:
        items: Parsed sections/lines; each one's text is read via ``_item_text``.
        header_footer_texts: Collection of normalized header/footer strings.
            When it is empty (or otherwise falsy) ``items`` is returned as-is.

    Returns:
        ``items`` itself when there is nothing to filter against, otherwise a
        new list holding the items that did not match, in their original order.
    """
    if not header_footer_texts:
        return items

    def _matches_header_footer(entry):
        raw = _item_text(entry)
        if not isinstance(raw, str):
            # Non-string text can never match; such items are always kept.
            return False
        collapsed = re.sub(r"\s+", " ", raw).strip()
        return bool(collapsed) and collapsed in header_footer_texts

    return [entry for entry in items if not _matches_header_footer(entry)]
|
||||
|
||||
|
||||
def remove_header_footer_html_blob(blob):
    """Strip header/footer elements from an HTML document.

    The markup is parsed with BeautifulSoup's ``html.parser``; every tag named
    ``header`` or ``footer``, and every tag whose ``role`` attribute is
    ``banner`` or ``contentinfo``, is removed together with its contents.

    Args:
        blob: HTML markup to clean, passed straight to ``BeautifulSoup``.

    Returns:
        The remaining document serialized to a string and encoded as UTF-8 bytes.
    """
    chrome_tag_names = {"header", "footer"}
    chrome_roles = {"banner", "contentinfo"}

    def _is_page_chrome(tag):
        return tag.name in chrome_tag_names or tag.get("role") in chrome_roles

    parsed = BeautifulSoup(blob, "html.parser")
    for node in parsed.find_all(_is_page_chrome):
        node.decompose()
    return str(parsed).encode("utf-8")
|
||||
|
||||
|
||||
def extract_word_outlines(filename, binary=None):
|
||||
doc = Document(filename) if binary is None else Document(BytesIO(binary))
|
||||
outlines = []
|
||||
|
||||
@@ -1527,6 +1527,7 @@ Best for: Documents with flowing, contextually connected content — such as boo
|
||||
removeToc: 'Remove original table of contents',
|
||||
removeTocTip:
|
||||
'Remove the table of contents included in the original PDF, so it is not parsed as regular content or chunked for retrieval.',
|
||||
removeHeaderFooter: 'Remove header and footer',
|
||||
autoPlay: 'Auto play audio',
|
||||
downloadFileTypeTip: 'The file type to download',
|
||||
downloadFileType: 'Download file type',
|
||||
|
||||
@@ -1278,6 +1278,7 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于
|
||||
removeToc: '移除原始目录',
|
||||
removeTocTip:
|
||||
'移除原始PDF中包含的目录,这样它就不会被解析为常规内容或作为检索块。',
|
||||
removeHeaderFooter: '移除页眉页脚',
|
||||
autoPlay: '自动播放',
|
||||
downloadFileTypeTip: '文件下载的类型',
|
||||
downloadFileType: '文件类型',
|
||||
|
||||
@@ -198,6 +198,7 @@ export const initialParserValues = {
|
||||
parse_method: ParseDocumentType.DeepDOC,
|
||||
preprocess: PreprocessValue.main_content,
|
||||
flatten_media_to_text: false,
|
||||
remove_header_footer: false,
|
||||
},
|
||||
{
|
||||
fileFormat: FileType.Spreadsheet,
|
||||
@@ -234,17 +235,21 @@ export const initialParserValues = {
|
||||
fileFormat: FileType.Html,
|
||||
output_format: TextJsonOutputFormat.Json,
|
||||
preprocess: PreprocessValue.main_content,
|
||||
remove_header_footer: false,
|
||||
},
|
||||
{
|
||||
fileFormat: FileType.Doc,
|
||||
output_format: DocxOutputFormat.Json,
|
||||
preprocess: PreprocessValue.main_content,
|
||||
flatten_media_to_text: false,
|
||||
remove_header_footer: false,
|
||||
},
|
||||
{
|
||||
fileFormat: FileType.Docx,
|
||||
output_format: DocxOutputFormat.Json,
|
||||
preprocess: PreprocessValue.main_content,
|
||||
flatten_media_to_text: false,
|
||||
remove_header_footer: false,
|
||||
},
|
||||
{
|
||||
fileFormat: FileType.PowerPoint,
|
||||
|
||||
@@ -157,6 +157,28 @@ export function RmdirFormField({ prefix }: CommonProps) {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Switch form field bound to the `remove_header_footer` setting.
 *
 * The field name is built from `prefix` with `buildFieldNameWithPrefix`, and
 * the label comes from the `flow.removeHeaderFooter` translation key.
 */
export function RemoveHeaderFooterFormField({ prefix }: CommonProps) {
  const { t } = useTranslation();
  const fieldName = buildFieldNameWithPrefix(`remove_header_footer`, prefix);
  const fieldLabel = t('flow.removeHeaderFooter');

  return (
    <RAGFlowFormItem
      name={fieldName}
      label={fieldLabel}
      horizontal
      labelClassName="w-full"
      valueClassName="w-8"
    >
      {(field) => (
        <Switch
          checked={field.value}
          onCheckedChange={(checked) => {
            // Forward the new state only if the form supplied a change handler.
            field.onChange?.(checked);
          }}
        />
      )}
    </RAGFlowFormItem>
  );
}
|
||||
|
||||
export function LanguageFormField({ prefix }: CommonProps) {
|
||||
const { t } = useTranslation();
|
||||
|
||||
|
||||
@@ -127,6 +127,7 @@ const FileFormatWidgetMap = {
|
||||
[FileType.PDF]: PdfFormFields,
|
||||
[FileType.Spreadsheet]: SpreadsheetFormFields,
|
||||
[FileType.PowerPoint]: PptFormFields,
|
||||
[FileType.Doc]: WordFormFields,
|
||||
[FileType.Docx]: WordFormFields,
|
||||
[FileType.Video]: VideoFormFields,
|
||||
[FileType.Audio]: AudioFormFields,
|
||||
@@ -160,6 +161,7 @@ export const FormSchema = z.object({
|
||||
markdown_image_response_type: z.string().optional(),
|
||||
enable_multi_column: z.boolean().optional(),
|
||||
remove_toc: z.boolean().optional(),
|
||||
remove_header_footer: z.boolean().optional(),
|
||||
}),
|
||||
),
|
||||
});
|
||||
@@ -352,6 +354,7 @@ const ParserForm = ({ node }: INextOperatorForm) => {
|
||||
vlm: { llm_id: '' },
|
||||
table_result_type: '',
|
||||
markdown_image_response_type: '',
|
||||
remove_header_footer: false,
|
||||
// preprocess: [],
|
||||
});
|
||||
}, [append]);
|
||||
|
||||
@@ -15,6 +15,7 @@ import {
|
||||
LanguageFormField,
|
||||
LargeModelFormField,
|
||||
ParserMethodFormField,
|
||||
RemoveHeaderFooterFormField,
|
||||
RmdirFormField,
|
||||
TwoColumnCheckFormField,
|
||||
} from './common-form-fields';
|
||||
@@ -104,6 +105,7 @@ export function PdfFormFields({ prefix }: CommonProps) {
|
||||
<>
|
||||
<TwoColumnCheckFormField prefix={prefix} />
|
||||
<RmdirFormField prefix={prefix} />
|
||||
<RemoveHeaderFooterFormField prefix={prefix} />
|
||||
<ParserMethodFormField prefix={prefix}></ParserMethodFormField>
|
||||
<FlattenMediaToTextFormField prefix={prefix} />
|
||||
{!flattenMediaToText && (
|
||||
|
||||
@@ -4,6 +4,7 @@ import { useWatch } from 'react-hook-form';
|
||||
import {
|
||||
FlattenMediaToTextFormField,
|
||||
LargeModelFormField,
|
||||
RemoveHeaderFooterFormField,
|
||||
RmdirFormField,
|
||||
} from './common-form-fields';
|
||||
import { CommonProps } from './interface';
|
||||
@@ -32,5 +33,10 @@ export function TextMarkdownFormFields({ prefix }: CommonProps) {
|
||||
}
|
||||
|
||||
/**
 * Parser options rendered for HTML files: the `RmdirFormField` toggle followed
 * by the `RemoveHeaderFooterFormField` toggle, both scoped to `prefix`.
 */
export function HtmlFormFields({ prefix }: CommonProps) {
  // The earlier single-field `return <RmdirFormField … />;` is dropped: left in
  // place it would exit first and the header/footer toggle would never render.
  return (
    <>
      <RmdirFormField prefix={prefix} />
      <RemoveHeaderFooterFormField prefix={prefix} />
    </>
  );
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import {
|
||||
FlattenMediaToTextFormField,
|
||||
LargeModelFormField,
|
||||
OutputFormatFormFieldProps,
|
||||
RemoveHeaderFooterFormField,
|
||||
RmdirFormField,
|
||||
} from './common-form-fields';
|
||||
import { buildFieldNameWithPrefix } from './utils';
|
||||
@@ -20,6 +21,7 @@ export function WordFormFields({ prefix }: OutputFormatFormFieldProps) {
|
||||
return (
|
||||
<>
|
||||
<RmdirFormField prefix={prefix} />
|
||||
<RemoveHeaderFooterFormField prefix={prefix} />
|
||||
<FlattenMediaToTextFormField prefix={prefix} />
|
||||
{!flattenMediaToText && (
|
||||
<LargeModelFormField
|
||||
|
||||
@@ -231,6 +231,7 @@ function transformParserParams(params: ParserFormSchemaType) {
|
||||
flatten_media_to_text: cur.flatten_media_to_text,
|
||||
enable_multi_column: cur.enable_multi_column,
|
||||
remove_toc: cur.remove_toc,
|
||||
remove_header_footer: cur.remove_header_footer || false,
|
||||
};
|
||||
// Only include TCADP parameters if TCADP Parser is selected
|
||||
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
|
||||
@@ -279,7 +280,29 @@ function transformParserParams(params: ParserFormSchemaType) {
|
||||
fields: cur.fields,
|
||||
};
|
||||
break;
|
||||
case FileType.Doc:
|
||||
filteredSetup = {
|
||||
...filteredSetup,
|
||||
vlm: { llm_id: cur.vlm?.llm_id },
|
||||
flatten_media_to_text: cur.flatten_media_to_text,
|
||||
remove_header_footer: cur.remove_header_footer || false,
|
||||
};
|
||||
break;
|
||||
case FileType.Docx:
|
||||
filteredSetup = {
|
||||
...filteredSetup,
|
||||
vlm: { llm_id: cur.vlm?.llm_id },
|
||||
flatten_media_to_text: cur.flatten_media_to_text,
|
||||
remove_header_footer: cur.remove_header_footer || false,
|
||||
};
|
||||
break;
|
||||
case FileType.Html:
|
||||
filteredSetup = {
|
||||
...filteredSetup,
|
||||
remove_toc: cur.remove_toc,
|
||||
remove_header_footer: cur.remove_header_footer || false,
|
||||
};
|
||||
break;
|
||||
case FileType.TextMarkdown:
|
||||
filteredSetup = {
|
||||
...filteredSetup,
|
||||
|
||||
Reference in New Issue
Block a user