mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-02 08:45:42 +08:00
### What problem does this PR solve? Package refactor and PDF post process. ### Type of change - [x] Refactoring --------- Co-authored-by: Claude <noreply@anthropic.com>
437 lines
12 KiB
Go
437 lines
12 KiB
Go
package post
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"math"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
|
|
pdftype "ragflow/internal/deepdoc/parser/pdf/type"
|
|
"ragflow/internal/deepdoc/parser/pdf/util"
|
|
)
|
|
|
|
// ── Config ─────────────────────────────────────────────────────────────
|
|
|
|
// Keys recognised in a PipelineConfig. PostProcess reads each of them
// through the typed PipelineConfig accessors.
const (
	// ConfigKeyPageWidth is read as a float64; multi-column reordering is
	// attempted only when the value is greater than zero.
	ConfigKeyPageWidth = "page_width"
	// ConfigKeyZoom is read as a float64 (default 1.0, non-positive values
	// are replaced by 1.0); the page width is divided by it.
	ConfigKeyZoom = "zoom"
	// ConfigKeyOutlines is read as a []pdftype.Outline; when non-empty it
	// takes precedence over the outlines stored on the ParseResult.
	ConfigKeyOutlines = "outlines"
	// ConfigKeyFlattenMediaToText is read as a bool and forwarded to
	// assignDocTypeKwd.
	ConfigKeyFlattenMediaToText = "flatten_media_to_text"
	// ConfigKeyTenantID is read as a string; the vision step runs only
	// when it and ConfigKeyVLMLLMID are both non-empty.
	ConfigKeyTenantID = "tenant_id"
	// ConfigKeyVLMLLMID is read as a string; see ConfigKeyTenantID.
	ConfigKeyVLMLLMID = "vlm_llm_id"
	// ConfigKeyRemoveTOC is read as a bool; table-of-contents removal is
	// performed only when it is true.
	ConfigKeyRemoveTOC = "remove_toc"
)
|
|
|
|
// PipelineConfig carries post-processing parameters as an untyped
// key/value map. Values are meant to be read through the typed accessor
// methods (Float64, Bool, String, Outlines), which fall back to a default
// when a key is missing or holds a value of an unexpected type.
type PipelineConfig map[string]interface{}
|
|
|
|
// Float64 returns the float64 value for key, or default_ if absent or wrong type.
|
|
func (c PipelineConfig) Float64(key string, default_ float64) float64 {
|
|
if c == nil {
|
|
return default_
|
|
}
|
|
v, ok := c[key]
|
|
if !ok {
|
|
return default_
|
|
}
|
|
f, ok := v.(float64)
|
|
if !ok {
|
|
return default_
|
|
}
|
|
return f
|
|
}
|
|
|
|
// Bool returns the bool value for key. Returns false if absent or wrong type.
|
|
func (c PipelineConfig) Bool(key string) bool {
|
|
if c == nil {
|
|
return false
|
|
}
|
|
v, ok := c[key]
|
|
if !ok {
|
|
return false
|
|
}
|
|
b, ok := v.(bool)
|
|
if !ok {
|
|
return false
|
|
}
|
|
return b
|
|
}
|
|
|
|
// Outlines returns the []pdftype.Outline value for ConfigKeyOutlines.
|
|
func (c PipelineConfig) Outlines() []pdftype.Outline {
|
|
if c == nil {
|
|
return nil
|
|
}
|
|
v, ok := c[ConfigKeyOutlines]
|
|
if !ok {
|
|
return nil
|
|
}
|
|
o, ok := v.([]pdftype.Outline)
|
|
if !ok {
|
|
return nil
|
|
}
|
|
return o
|
|
}
|
|
|
|
// String returns the string value for key. Returns "" if absent or wrong type.
|
|
func (c PipelineConfig) String(key string) string {
|
|
if c == nil {
|
|
return ""
|
|
}
|
|
v, ok := c[key]
|
|
if !ok {
|
|
return ""
|
|
}
|
|
s, ok := v.(string)
|
|
if !ok {
|
|
return ""
|
|
}
|
|
return s
|
|
}
|
|
|
|
// ── Patterns ───────────────────────────────────────────────────────────
|
|
|
|
// headerFooterPattern recognises layout types that are treated as page
// furniture and dropped by filterHeaderFooter. The expression is
// unanchored and case-sensitive, so any layout type that merely contains
// one of the words matches. The Python counterpart (parser.py:637) uses
// r"(header|footer|number)"; this port additionally matches "reference".
var headerFooterPattern = regexp.MustCompile(`(header|footer|number|reference)`)
|
|
|
|
// tocTitlePattern recognises outline titles that mark a table-of-contents
// (or acknowledgement) entry. The whole title must equal one of the
// alternatives, compared case-insensitively.
// Python: r"(contents|目录|目次|table of contents|致谢|acknowledge)$"
var tocTitlePattern = regexp.MustCompile(`(?i)^(contents|目录|目次|table of contents|致谢|acknowledge)$`)
|
|
|
|
// ── PostProcess ────────────────────────────────────────────────────────
|
|
|
|
// PostProcess applies PDF post-processing to a ParseResult in-place.
|
|
// The config map controls which features to enable.
|
|
//
|
|
// Execution order (matches Python _pdf):
|
|
// 1. reorderMultiColumn — if page_width > 0
|
|
// 2. removeTOCByOutlines — if outlines present
|
|
// 3. normalizeLayoutType — always
|
|
// 4. filterHeaderFooter — always
|
|
// 5. assignDocTypeKwd — always (respects flatten_media_to_text)
|
|
// 6. enhanceWithVision — if image_describer present
|
|
func PostProcess(ctx context.Context, result *pdftype.ParseResult, config PipelineConfig) error {
|
|
if result == nil {
|
|
return errors.New("PostProcess: nil result")
|
|
}
|
|
if config == nil {
|
|
config = PipelineConfig{}
|
|
}
|
|
|
|
// 1. Multi-column reorder
|
|
pw := config.Float64(ConfigKeyPageWidth, 0)
|
|
if pw > 0 {
|
|
zoom := config.Float64(ConfigKeyZoom, 1.0)
|
|
if zoom <= 0 {
|
|
zoom = 1.0
|
|
}
|
|
reorderMultiColumn(result, pw, zoom)
|
|
}
|
|
|
|
// 2. Remove TOC pages (only when explicitly enabled).
|
|
// Outlines from config take precedence; otherwise read from ParseResult.
|
|
outlines := config.Outlines()
|
|
if len(outlines) == 0 {
|
|
outlines = result.Outlines
|
|
}
|
|
if config.Bool(ConfigKeyRemoveTOC) && len(outlines) > 0 {
|
|
removeTOCByOutlines(result, outlines)
|
|
}
|
|
|
|
// 3-5. Always-on steps
|
|
normalizeLayoutType(result)
|
|
filterHeaderFooter(result)
|
|
assignDocTypeKwd(result, config.Bool(ConfigKeyFlattenMediaToText))
|
|
|
|
// 6. VLM enhancement
|
|
tenantID := config.String(ConfigKeyTenantID)
|
|
vlmLLMID := config.String(ConfigKeyVLMLLMID)
|
|
if tenantID != "" && vlmLLMID != "" {
|
|
describer, err := resolveImageDescriber(tenantID, vlmLLMID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err := enhanceWithVision(ctx, result, describer); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// resolveImageDescriber resolves a VLM model from tenant config and returns
|
|
// an ImageDescriber. Corresponds to Python's
|
|
// get_model_config_from_provider_instance + LLMBundle.
|
|
// resolveImageDescriber resolves a VLM model from tenant config and returns
|
|
// an ImageDescriber. The implementation is assigned by init() in
|
|
// post_steps_cgo.go (production) or post_steps_no_cgo.go (stub).
|
|
// Overridable in tests.
|
|
var resolveImageDescriber func(tenantID, llmID string) (ImageDescriber, error)
|
|
|
|
// SetImageDescriberResolver sets the factory that creates an ImageDescriber
|
|
// from tenant/LLM configuration. Higher layers (e.g. EE extensions or the
|
|
// PDF document pipeline entry point) register the real implementation via
|
|
// init(). If never called, PostProcess skips VLM enhancement.
|
|
func SetImageDescriberResolver(fn func(tenantID, llmID string) (ImageDescriber, error)) {
|
|
resolveImageDescriber = fn
|
|
}
|
|
|
|
// ── normalizeLayoutType ────────────────────────────────────────────────
|
|
|
|
// normalizeLayoutType trims whitespace from LayoutType and defaults empty
|
|
// values to "text". Matches Python's layout_type normalization in parser.py.
|
|
func normalizeLayoutType(result *pdftype.ParseResult) {
|
|
for i := range result.Sections {
|
|
lt := strings.TrimSpace(result.Sections[i].LayoutType)
|
|
if lt == "" {
|
|
lt = "text"
|
|
}
|
|
result.Sections[i].LayoutType = lt
|
|
}
|
|
}
|
|
|
|
// ── filterHeaderFooter ─────────────────────────────────────────────────
|
|
|
|
// filterHeaderFooter removes sections whose LayoutType matches
|
|
// header/footer/number/reference. Python: remove_header_footer config.
|
|
func filterHeaderFooter(result *pdftype.ParseResult) {
|
|
sections := result.Sections[:0]
|
|
for _, s := range result.Sections {
|
|
if headerFooterPattern.MatchString(strings.TrimSpace(s.LayoutType)) {
|
|
continue
|
|
}
|
|
sections = append(sections, s)
|
|
}
|
|
result.Sections = sections
|
|
}
|
|
|
|
// ── assignDocTypeKwd ───────────────────────────────────────────────────
|
|
|
|
// assignDocTypeKwd sets DocTypeKwd based on LayoutType and Image presence.
|
|
// When flatten is true, all sections become "text" and Image is cleared —
|
|
// this matches Python where flatten_media_to_text and VLM are mutually
|
|
// exclusive. Python: parser.py:639-648.
|
|
func assignDocTypeKwd(result *pdftype.ParseResult, flatten bool) {
|
|
for i := range result.Sections {
|
|
s := &result.Sections[i]
|
|
if flatten {
|
|
s.DocTypeKwd = "text"
|
|
s.Image = ""
|
|
continue
|
|
}
|
|
lt := strings.TrimSpace(s.LayoutType)
|
|
switch lt {
|
|
case "table":
|
|
s.DocTypeKwd = "table"
|
|
case "figure":
|
|
s.DocTypeKwd = "image"
|
|
default:
|
|
if lt == "" && s.Image != "" {
|
|
s.DocTypeKwd = "image"
|
|
} else {
|
|
s.DocTypeKwd = "text"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// ── enhanceWithVision ──────────────────────────────────────────────────
|
|
|
|
// enhanceWithVision adds VLM-generated descriptions to image/table sections.
|
|
func enhanceWithVision(ctx context.Context, result *pdftype.ParseResult, describer ImageDescriber) error {
|
|
if describer == nil {
|
|
return nil
|
|
}
|
|
if len(result.Sections) == 0 {
|
|
return nil
|
|
}
|
|
|
|
sem := make(chan struct{}, maxDescribeConcurrency)
|
|
var wg sync.WaitGroup
|
|
|
|
for i := range result.Sections {
|
|
s := &result.Sections[i]
|
|
if s.DocTypeKwd != "table" && s.DocTypeKwd != "image" {
|
|
continue
|
|
}
|
|
if s.Image == "" {
|
|
continue
|
|
}
|
|
|
|
wg.Add(1)
|
|
sem <- struct{}{}
|
|
go func(idx int, imgB64 string, origText string) {
|
|
defer wg.Done()
|
|
defer func() { <-sem }()
|
|
|
|
img, err := util.DecodeBase64PNG(imgB64)
|
|
if err != nil || img == nil {
|
|
return
|
|
}
|
|
desc, err := DescribeImage(ctx, img, describePrompt, describer)
|
|
if err != nil || desc == "" {
|
|
return
|
|
}
|
|
|
|
if origText != "" {
|
|
result.Sections[idx].Text = origText + "\n" + desc
|
|
} else {
|
|
result.Sections[idx].Text = desc
|
|
}
|
|
}(i, s.Image, s.Text)
|
|
}
|
|
wg.Wait()
|
|
|
|
return nil
|
|
}
|
|
|
|
// ── removeTOCByOutlines ────────────────────────────────────────────────
|
|
|
|
// removeTOCByOutlines removes sections whose page numbers fall inside
|
|
// TOC page ranges identified by PDF outlines.
|
|
func removeTOCByOutlines(result *pdftype.ParseResult, outlines []pdftype.Outline) {
|
|
if len(outlines) == 0 {
|
|
return
|
|
}
|
|
tocPage, contentPage := findTOCPageRange(outlines)
|
|
if contentPage <= tocPage {
|
|
return
|
|
}
|
|
sections := result.Sections[:0]
|
|
for _, s := range result.Sections {
|
|
pg := sectionPage(s)
|
|
if pg >= tocPage && pg < contentPage {
|
|
continue
|
|
}
|
|
sections = append(sections, s)
|
|
}
|
|
result.Sections = sections
|
|
}
|
|
|
|
// findTOCPageRange scans outlines for a TOC entry and returns the
|
|
// [tocStartPage, contentStartPage) range. Returns (0, 0) when not found.
|
|
func findTOCPageRange(outlines []pdftype.Outline) (tocPage, contentPage int) {
|
|
trimSplit:
|
|
for i, o := range outlines {
|
|
title := strings.TrimSpace(o.Title)
|
|
if idx := strings.Index(title, "@@"); idx >= 0 {
|
|
title = strings.TrimSpace(title[:idx])
|
|
}
|
|
if !tocTitlePattern.MatchString(strings.ToLower(title)) {
|
|
continue
|
|
}
|
|
tocPage = o.PageNumber
|
|
for _, next := range outlines[i+1:] {
|
|
if next.Level != o.Level {
|
|
continue
|
|
}
|
|
nt := strings.TrimSpace(next.Title)
|
|
if idx := strings.Index(nt, "@@"); idx >= 0 {
|
|
nt = strings.TrimSpace(nt[:idx])
|
|
}
|
|
if tocTitlePattern.MatchString(strings.ToLower(nt)) {
|
|
continue
|
|
}
|
|
contentPage = next.PageNumber
|
|
break trimSplit
|
|
}
|
|
break
|
|
}
|
|
return
|
|
}
|
|
|
|
// sectionPage returns the first page number of a Section, or 0.
|
|
func sectionPage(s pdftype.Section) int {
|
|
for _, p := range s.Positions {
|
|
for _, pn := range p.PageNumbers {
|
|
return pn
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// ── reorderMultiColumn ─────────────────────────────────────────────────
|
|
|
|
// reorderMultiColumn reorders text sections in multi-column layouts.
|
|
// If median text column width >= page width / 2 (single-column layout),
|
|
// the input order is preserved.
|
|
//
|
|
// Python: reorder_multi_column_bboxes + sort_X_by_page
|
|
func reorderMultiColumn(result *pdftype.ParseResult, pageWidth, zoom float64) {
|
|
if len(result.Sections) < 2 {
|
|
return
|
|
}
|
|
pw := pageWidth / zoom
|
|
|
|
// Compute median width from text sections with valid coordinates.
|
|
var widths []float64
|
|
for _, s := range result.Sections {
|
|
if s.LayoutType != "text" {
|
|
continue
|
|
}
|
|
if len(s.Positions) == 0 {
|
|
continue
|
|
}
|
|
w := s.Positions[0].Right - s.Positions[0].Left
|
|
if w > 0 {
|
|
widths = append(widths, w)
|
|
}
|
|
}
|
|
if len(widths) == 0 {
|
|
return
|
|
}
|
|
sort.Float64s(widths)
|
|
medianW := widths[len(widths)/2]
|
|
|
|
if medianW >= pw/2 {
|
|
return // single column
|
|
}
|
|
|
|
// Sort by (PageNumber, X0, Top).
|
|
sort.Slice(result.Sections, func(i, j int) bool {
|
|
pi := sectionPage(result.Sections[i])
|
|
pj := sectionPage(result.Sections[j])
|
|
if pi != pj {
|
|
return pi < pj
|
|
}
|
|
xi := sectionX0(result.Sections[i])
|
|
xj := sectionX0(result.Sections[j])
|
|
if math.Abs(xi-xj) > 1e-6 {
|
|
return xi < xj
|
|
}
|
|
return sectionTop(result.Sections[i]) < sectionTop(result.Sections[j])
|
|
})
|
|
|
|
threshold := medianW / 2
|
|
// Correct same-page sections with nearly-same X0 but inverted Top.
|
|
for i := len(result.Sections) - 1; i >= 1; i-- {
|
|
for j := i - 1; j >= 0; j-- {
|
|
if math.Abs(sectionX0(result.Sections[j+1])-sectionX0(result.Sections[j])) < threshold &&
|
|
sectionTop(result.Sections[j+1]) < sectionTop(result.Sections[j]) &&
|
|
sectionPage(result.Sections[j+1]) == sectionPage(result.Sections[j]) {
|
|
result.Sections[j], result.Sections[j+1] = result.Sections[j+1], result.Sections[j]
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func sectionX0(s pdftype.Section) float64 {
|
|
for _, p := range s.Positions {
|
|
return p.Left
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func sectionTop(s pdftype.Section) float64 {
|
|
for _, p := range s.Positions {
|
|
return p.Top
|
|
}
|
|
return 0
|
|
}
|