Feat: add button for remove header & footer in pipeline (#14486)

### What problem does this PR solve?

Feat: add a button to remove headers & footers in the pipeline

### Type of change


- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Magicbook1108
2026-04-30 12:30:41 +08:00
committed by GitHub
parent 2932b65da6
commit bb3b99f0a5
13 changed files with 135 additions and 82 deletions

View File

@@ -52,7 +52,7 @@ class RAGFlowHtmlParser:
raise TypeError("txt type should be string!")
temp_sections = []
soup = BeautifulSoup(txt, "html5lib")
soup = BeautifulSoup(txt, "html.parser")
# delete <style> tag
for style_tag in soup.find_all(["style", "script"]):
style_tag.decompose()
@@ -210,4 +210,3 @@ class RAGFlowHtmlParser:
chunks.append(current_block)
return chunks

View File

@@ -46,6 +46,9 @@ from rag.flow.parser.schema import ParserFromUpstream
from rag.flow.parser.utils import (
enhance_media_sections_with_vision,
extract_word_outlines,
extract_docx_header_footer_texts,
remove_header_footer_docx_sections,
remove_header_footer_html_blob,
remove_toc,
remove_toc_pdf,
remove_toc_word,
@@ -113,6 +116,7 @@ class ParserParam(ProcessParamBase):
"lang": "Chinese",
"flatten_media_to_text": False,
"remove_toc": False,
"remove_header_footer": False,
"suffix": [
"pdf",
],
@@ -130,6 +134,7 @@ class ParserParam(ProcessParamBase):
},
"doc": {
"remove_toc": False,
"remove_header_footer": False,
"suffix": [
"doc",
],
@@ -138,6 +143,7 @@ class ParserParam(ProcessParamBase):
"docx": {
"flatten_media_to_text": False,
"remove_toc": False,
"remove_header_footer": False,
"suffix": [
"docx",
],
@@ -170,7 +176,8 @@ class ParserParam(ProcessParamBase):
},
"html": {
"suffix": ["htm", "html"],
"remove_toc": "false",
"remove_toc": False,
"remove_header_footer": False,
"output_format": "json",
},
"slides": {
@@ -321,10 +328,6 @@ class Parser(ProcessBase):
flatten_media_to_text = conf.get("flatten_media_to_text")
pdf_parser = None
# Optional PDF post-processing flags applied after parsing.
abstract_enabled = "abstract" in conf.get("preprocess", [])
author_enabled = "author" in conf.get("preprocess", [])
# Normalize parser selection and optional provider-specific model name.
raw_parse_method = conf.get("parse_method", "")
parser_model_name = None
@@ -587,7 +590,6 @@ class Parser(ProcessBase):
if image is not None:
box["image"] = image
bboxes.append(box)
# Vision parser treats each page as a large image block.
else:
if conf.get("parse_method"):
@@ -634,19 +636,15 @@ class Parser(ProcessBase):
toc_bboxes, _ = remove_toc(bboxes[:split_at])
bboxes = toc_bboxes + bboxes[split_at:]
normalize_bboxes = []
# Normalize shared bbox fields for downstream consumers.
layout_counters = {}
for b in bboxes:
raw_layout = str(b.get("layout_type") or "").strip()
has_layout = bool(raw_layout)
layout = re.sub(r"\s+", " ", raw_layout) if has_layout else "text"
b["layout_type"] = layout
if not b.get("layoutno"):
seq = layout_counters.get(layout, 0)
layout_counters[layout] = seq + 1
b["layoutno"] = f"{layout}-{seq}"
if conf.get("remove_header_footer") and re.search(r"(header|footer|number)", raw_layout, re.I):
continue
if flatten_media_to_text:
b["doc_type_kwd"] = "text"
elif layout == "table":
@@ -657,67 +655,8 @@ class Parser(ProcessBase):
b["doc_type_kwd"] = "image"
else:
b["doc_type_kwd"] = "text"
# Mark likely author blocks near the title when enabled.
if author_enabled:
def _begin(txt):
if not isinstance(txt, str):
return False
return re.match(
r"[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
txt.lower().strip(),
)
i = 0
while i < min(32, len(bboxes) - 1):
b = bboxes[i]
i += 1
layout_type = b.get("layout_type", "")
layoutno = b.get("layoutno", "")
is_title = "title" in str(layout_type).lower() or "title" in str(layoutno).lower()
if not is_title:
continue
title_txt = b.get("text", "")
if _begin(title_txt):
break
for j in range(3):
next_idx = i + j
if next_idx >= len(bboxes):
break
candidate = bboxes[next_idx].get("text", "")
if _begin(candidate):
break
if isinstance(candidate, str) and "@" in candidate:
break
bboxes[next_idx]["author"] = True
break
# Mark the abstract block when enabled.
if abstract_enabled:
i = 0
abstract_idx = None
while i + 1 < min(32, len(bboxes)):
b = bboxes[i]
i += 1
txt = b.get("text", "")
if not isinstance(txt, str):
continue
txt = txt.lower().strip()
if re.match(r"(abstract|摘要)", txt):
if len(txt.split()) > 32 or len(txt) > 64:
abstract_idx = i - 1
break
next_txt = bboxes[i].get("text", "") if i < len(bboxes) else ""
if isinstance(next_txt, str):
next_txt = next_txt.lower().strip()
if len(next_txt.split()) > 32 or len(next_txt) > 64:
abstract_idx = i
i += 1
break
if abstract_idx is not None:
bboxes[abstract_idx]["abstract"] = True
normalize_bboxes.append(b)
bboxes = normalize_bboxes
enhance_media_sections_with_vision(
bboxes,
@@ -900,6 +839,9 @@ class Parser(ProcessBase):
# JSON output keeps text/image blocks and appends table HTML as table items.
if conf.get("output_format") == "json":
main_sections = docx_parser(name, binary=blob)
if conf.get("remove_header_footer"):
header_footer_texts = extract_docx_header_footer_texts(binary=blob)
main_sections = remove_header_footer_docx_sections(main_sections, header_footer_texts)
if conf.get("remove_toc"):
main_sections = remove_toc_word(main_sections, outlines)
sections = []
@@ -931,6 +873,10 @@ class Parser(ProcessBase):
# Markdown output removes TOC on plain markdown lines before writing back.
elif conf.get("output_format") == "markdown":
markdown_text = docx_parser.to_markdown(name, binary=blob)
if conf.get("remove_header_footer"):
header_footer_texts = extract_docx_header_footer_texts(binary=blob)
markdown_lines = remove_header_footer_docx_sections(markdown_text.split("\n"), header_footer_texts)
markdown_text = "\n".join(markdown_lines)
if conf.get("remove_toc"):
markdown_text = "\n".join(remove_toc_word(markdown_text.split("\n"), outlines))
@@ -1091,8 +1037,11 @@ class Parser(ProcessBase):
conf = self._param.setups["html"]
self.set_output("output_format", conf["output_format"])
if conf.get("remove_header_footer"):
blob = remove_header_footer_html_blob(blob)
sections = HtmlParser()(name, blob, int(conf.get("chunk_token_num", 512)))
if conf.get("remove_toc") == "true":
if conf.get("remove_toc"):
sections, _ = remove_toc(sections)
if conf.get("output_format") == "json":
self.set_output("json", [{"text": section, "doc_type_kwd": "text"} for section in sections if section])

View File

@@ -72,7 +72,6 @@ def extract_pdf_positions(item):
return []
positions = _extract_raw_positions(item)
uses_position_tag = isinstance(item.get("position_tag"), str) and bool(item.get("position_tag"))
ref_page_number = item.get("page_number")
ref_page_number = int(ref_page_number) if isinstance(ref_page_number, (int, float)) else None
if ref_page_number is not None and ref_page_number <= 0:
@@ -86,9 +85,7 @@ def extract_pdf_positions(item):
page_number = pos[0][-1] if isinstance(pos[0], list) else pos[0]
try:
page_number = int(page_number)
if uses_position_tag:
page_number += 1
elif ref_page_number is not None and page_number == ref_page_number - 1:
if ref_page_number is not None and page_number == ref_page_number - 1:
page_number = ref_page_number
elif page_number <= 0:
page_number += 1

View File

@@ -16,6 +16,7 @@
import re
from io import BytesIO
from bs4 import BeautifulSoup
from docx import Document
from api.db.services.llm_service import LLMBundle
from api.db.joint_services.tenant_model_service import (
@@ -34,6 +35,48 @@ def remove_toc(items):
return [items[i] for i in kept_indices], kept_indices
def extract_docx_header_footer_texts(filename=None, binary=None):
    """Collect the distinct header and footer texts of a .docx document.

    The document is opened from ``filename`` when ``binary`` is None;
    otherwise it is opened from the in-memory bytes in ``binary``.

    For every section, the text of each paragraph and each table cell found
    in the section's header and footer is whitespace-normalized (runs of
    whitespace collapsed to one space, ends stripped). Empty results are
    dropped.

    Returns:
        A set of the normalized, non-empty text strings.
    """
    source = filename if binary is None else BytesIO(binary)
    doc = Document(source)

    collected = set()
    for section in doc.sections:
        for part in (section.header, section.footer):
            # Gather the raw strings first (paragraphs, then table cells) so
            # the normalization below is written only once.
            raw_texts = [paragraph.text for paragraph in part.paragraphs]
            raw_texts.extend(
                cell.text
                for table in part.tables
                for row in table.rows
                for cell in row.cells
            )
            for raw in raw_texts:
                cleaned = re.sub(r"\s+", " ", raw).strip()
                if cleaned:
                    collected.add(cleaned)
    return collected
def remove_header_footer_docx_sections(items, header_footer_texts):
    """Drop items whose normalized text exactly matches a header/footer text.

    Args:
        items: Parsed entries; each one's text is obtained through the
            module's ``_item_text`` helper.
        header_footer_texts: Collection of whitespace-normalized header and
            footer strings to filter out.

    Returns:
        ``items`` itself when ``header_footer_texts`` is empty or None;
        otherwise a new list holding only the entries that were kept.
        Entries whose text is not a string, or is empty after normalization,
        are always kept.
    """
    if not header_footer_texts:
        return items

    def _matches_header_or_footer(entry):
        raw = _item_text(entry)
        if not isinstance(raw, str):
            return False
        # Same normalization as applied to the header/footer texts:
        # collapse whitespace runs to one space and strip the ends.
        collapsed = re.sub(r"\s+", " ", raw).strip()
        return bool(collapsed) and collapsed in header_footer_texts

    return [entry for entry in items if not _matches_header_or_footer(entry)]
def remove_header_footer_html_blob(blob):
    """Strip header/footer elements from an HTML document.

    Parses ``blob`` with the ``html.parser`` backend and removes every
    ``<header>`` and ``<footer>`` element, plus any element whose ``role``
    attribute is ``banner`` or ``contentinfo``.

    Returns:
        The remaining document, serialized and encoded as UTF-8 bytes.
    """
    chrome_tag_names = {"header", "footer"}
    chrome_roles = {"banner", "contentinfo"}

    def _is_header_or_footer(tag):
        return tag.name in chrome_tag_names or tag.get("role") in chrome_roles

    document = BeautifulSoup(blob, "html.parser")
    for node in document.find_all(_is_header_or_footer):
        node.decompose()
    return str(document).encode("utf-8")
def extract_word_outlines(filename, binary=None):
doc = Document(filename) if binary is None else Document(BytesIO(binary))
outlines = []

View File

@@ -1527,6 +1527,7 @@ Best for: Documents with flowing, contextually connected content — such as boo
removeToc: 'Remove original table of contents',
removeTocTip:
'Remove the table of contents included in the original PDF, so it is not parsed as regular content or chunked for retrieval.',
removeHeaderFooter: 'Remove header and footer',
autoPlay: 'Auto play audio',
downloadFileTypeTip: 'The file type to download',
downloadFileType: 'Download file type',

View File

@@ -1278,6 +1278,7 @@ General实体和关系提取提示来自 GitHub - microsoft/graphrag基于
removeToc: '移除原始目录',
removeTocTip:
'移除原始PDF中包含的目录这样它就不会被解析为常规内容或作为检索块。',
removeHeaderFooter: '移除页眉页脚',
autoPlay: '自动播放',
downloadFileTypeTip: '文件下载的类型',
downloadFileType: '文件类型',

View File

@@ -198,6 +198,7 @@ export const initialParserValues = {
parse_method: ParseDocumentType.DeepDOC,
preprocess: PreprocessValue.main_content,
flatten_media_to_text: false,
remove_header_footer: false,
},
{
fileFormat: FileType.Spreadsheet,
@@ -234,17 +235,21 @@ export const initialParserValues = {
fileFormat: FileType.Html,
output_format: TextJsonOutputFormat.Json,
preprocess: PreprocessValue.main_content,
remove_header_footer: false,
},
{
fileFormat: FileType.Doc,
output_format: DocxOutputFormat.Json,
preprocess: PreprocessValue.main_content,
flatten_media_to_text: false,
remove_header_footer: false,
},
{
fileFormat: FileType.Docx,
output_format: DocxOutputFormat.Json,
preprocess: PreprocessValue.main_content,
flatten_media_to_text: false,
remove_header_footer: false,
},
{
fileFormat: FileType.PowerPoint,

View File

@@ -157,6 +157,28 @@ export function RmdirFormField({ prefix }: CommonProps) {
);
}
/**
 * Switch-style form field bound to the `remove_header_footer` setting.
 * The field name is built from `prefix` via `buildFieldNameWithPrefix`.
 */
export function RemoveHeaderFooterFormField({ prefix }: CommonProps) {
  const { t } = useTranslation();
  const fieldName = buildFieldNameWithPrefix(`remove_header_footer`, prefix);

  return (
    <RAGFlowFormItem
      name={fieldName}
      label={t('flow.removeHeaderFooter')}
      horizontal={true}
      labelClassName="w-full"
      valueClassName="w-8"
    >
      {(field) => {
        // Forward the new switch state to the form field, if it has a handler.
        const handleToggle = (checked: boolean) => {
          field.onChange?.(checked);
        };
        return <Switch checked={field.value} onCheckedChange={handleToggle} />;
      }}
    </RAGFlowFormItem>
  );
}
export function LanguageFormField({ prefix }: CommonProps) {
const { t } = useTranslation();

View File

@@ -127,6 +127,7 @@ const FileFormatWidgetMap = {
[FileType.PDF]: PdfFormFields,
[FileType.Spreadsheet]: SpreadsheetFormFields,
[FileType.PowerPoint]: PptFormFields,
[FileType.Doc]: WordFormFields,
[FileType.Docx]: WordFormFields,
[FileType.Video]: VideoFormFields,
[FileType.Audio]: AudioFormFields,
@@ -160,6 +161,7 @@ export const FormSchema = z.object({
markdown_image_response_type: z.string().optional(),
enable_multi_column: z.boolean().optional(),
remove_toc: z.boolean().optional(),
remove_header_footer: z.boolean().optional(),
}),
),
});
@@ -352,6 +354,7 @@ const ParserForm = ({ node }: INextOperatorForm) => {
vlm: { llm_id: '' },
table_result_type: '',
markdown_image_response_type: '',
remove_header_footer: false,
// preprocess: [],
});
}, [append]);

View File

@@ -15,6 +15,7 @@ import {
LanguageFormField,
LargeModelFormField,
ParserMethodFormField,
RemoveHeaderFooterFormField,
RmdirFormField,
TwoColumnCheckFormField,
} from './common-form-fields';
@@ -104,6 +105,7 @@ export function PdfFormFields({ prefix }: CommonProps) {
<>
<TwoColumnCheckFormField prefix={prefix} />
<RmdirFormField prefix={prefix} />
<RemoveHeaderFooterFormField prefix={prefix} />
<ParserMethodFormField prefix={prefix}></ParserMethodFormField>
<FlattenMediaToTextFormField prefix={prefix} />
{!flattenMediaToText && (

View File

@@ -4,6 +4,7 @@ import { useWatch } from 'react-hook-form';
import {
FlattenMediaToTextFormField,
LargeModelFormField,
RemoveHeaderFooterFormField,
RmdirFormField,
} from './common-form-fields';
import { CommonProps } from './interface';
@@ -32,5 +33,10 @@ export function TextMarkdownFormFields({ prefix }: CommonProps) {
}
export function HtmlFormFields({ prefix }: CommonProps) {
return <RmdirFormField prefix={prefix} />;
return (
<>
<RmdirFormField prefix={prefix} />
<RemoveHeaderFooterFormField prefix={prefix} />
</>
);
}

View File

@@ -5,6 +5,7 @@ import {
FlattenMediaToTextFormField,
LargeModelFormField,
OutputFormatFormFieldProps,
RemoveHeaderFooterFormField,
RmdirFormField,
} from './common-form-fields';
import { buildFieldNameWithPrefix } from './utils';
@@ -20,6 +21,7 @@ export function WordFormFields({ prefix }: OutputFormatFormFieldProps) {
return (
<>
<RmdirFormField prefix={prefix} />
<RemoveHeaderFooterFormField prefix={prefix} />
<FlattenMediaToTextFormField prefix={prefix} />
{!flattenMediaToText && (
<LargeModelFormField

View File

@@ -231,6 +231,7 @@ function transformParserParams(params: ParserFormSchemaType) {
flatten_media_to_text: cur.flatten_media_to_text,
enable_multi_column: cur.enable_multi_column,
remove_toc: cur.remove_toc,
remove_header_footer: cur.remove_header_footer || false,
};
// Only include TCADP parameters if TCADP Parser is selected
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
@@ -279,7 +280,29 @@ function transformParserParams(params: ParserFormSchemaType) {
fields: cur.fields,
};
break;
case FileType.Doc:
filteredSetup = {
...filteredSetup,
vlm: { llm_id: cur.vlm?.llm_id },
flatten_media_to_text: cur.flatten_media_to_text,
remove_header_footer: cur.remove_header_footer || false,
};
break;
case FileType.Docx:
filteredSetup = {
...filteredSetup,
vlm: { llm_id: cur.vlm?.llm_id },
flatten_media_to_text: cur.flatten_media_to_text,
remove_header_footer: cur.remove_header_footer || false,
};
break;
case FileType.Html:
filteredSetup = {
...filteredSetup,
remove_toc: cur.remove_toc,
remove_header_footer: cur.remove_header_footer || false,
};
break;
case FileType.TextMarkdown:
filteredSetup = {
...filteredSetup,