Files
ragflow/internal/deepdoc/parser/pdf/post/post_steps.go
Jack 98323e7910 Refactor: oss parser go refactor (#16391)
### What problem does this PR solve?

Package refactor and PDF post process.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-29 18:46:41 +08:00

437 lines
12 KiB
Go

package post
import (
"context"
"errors"
"math"
"regexp"
"sort"
"strings"
"sync"
pdftype "ragflow/internal/deepdoc/parser/pdf/type"
"ragflow/internal/deepdoc/parser/pdf/util"
)
// ── Config ─────────────────────────────────────────────────────────────
// Config keys for PipelineConfig.
const (
ConfigKeyPageWidth = "page_width"
ConfigKeyZoom = "zoom"
ConfigKeyOutlines = "outlines"
ConfigKeyFlattenMediaToText = "flatten_media_to_text"
ConfigKeyTenantID = "tenant_id"
ConfigKeyVLMLLMID = "vlm_llm_id"
ConfigKeyRemoveTOC = "remove_toc"
)
// PipelineConfig is a key-value map that post-processing reads
// to obtain its parameters.
type PipelineConfig map[string]interface{}
// Float64 returns the float64 value for key, or default_ if absent or wrong type.
func (c PipelineConfig) Float64(key string, default_ float64) float64 {
if c == nil {
return default_
}
v, ok := c[key]
if !ok {
return default_
}
f, ok := v.(float64)
if !ok {
return default_
}
return f
}
// Bool returns the bool value for key. Returns false if absent or wrong type.
func (c PipelineConfig) Bool(key string) bool {
if c == nil {
return false
}
v, ok := c[key]
if !ok {
return false
}
b, ok := v.(bool)
if !ok {
return false
}
return b
}
// Outlines returns the []pdftype.Outline value for ConfigKeyOutlines.
func (c PipelineConfig) Outlines() []pdftype.Outline {
if c == nil {
return nil
}
v, ok := c[ConfigKeyOutlines]
if !ok {
return nil
}
o, ok := v.([]pdftype.Outline)
if !ok {
return nil
}
return o
}
// String returns the string value for key. Returns "" if absent or wrong type.
func (c PipelineConfig) String(key string) string {
if c == nil {
return ""
}
v, ok := c[key]
if !ok {
return ""
}
s, ok := v.(string)
if !ok {
return ""
}
return s
}
// ── Patterns ───────────────────────────────────────────────────────────
// headerFooterPattern matches layout types that should be treated as
// page furniture (Python: r"(header|footer|number)" in parser.py:637).
var headerFooterPattern = regexp.MustCompile(`(header|footer|number|reference)`)
// tocTitlePattern matches outline titles that mark a table-of-contents page.
// Python: r"(contents|目录|目次|table of contents|致谢|acknowledge)$"
var tocTitlePattern = regexp.MustCompile(`(?i)^(contents|目录|目次|table of contents|致谢|acknowledge)$`)
// ── PostProcess ────────────────────────────────────────────────────────
// PostProcess applies PDF post-processing to a ParseResult in-place.
// The config map controls which features to enable.
//
// Execution order (matches Python _pdf):
// 1. reorderMultiColumn — if page_width > 0
// 2. removeTOCByOutlines — if outlines present
// 3. normalizeLayoutType — always
// 4. filterHeaderFooter — always
// 5. assignDocTypeKwd — always (respects flatten_media_to_text)
// 6. enhanceWithVision — if image_describer present
func PostProcess(ctx context.Context, result *pdftype.ParseResult, config PipelineConfig) error {
if result == nil {
return errors.New("PostProcess: nil result")
}
if config == nil {
config = PipelineConfig{}
}
// 1. Multi-column reorder
pw := config.Float64(ConfigKeyPageWidth, 0)
if pw > 0 {
zoom := config.Float64(ConfigKeyZoom, 1.0)
if zoom <= 0 {
zoom = 1.0
}
reorderMultiColumn(result, pw, zoom)
}
// 2. Remove TOC pages (only when explicitly enabled).
// Outlines from config take precedence; otherwise read from ParseResult.
outlines := config.Outlines()
if len(outlines) == 0 {
outlines = result.Outlines
}
if config.Bool(ConfigKeyRemoveTOC) && len(outlines) > 0 {
removeTOCByOutlines(result, outlines)
}
// 3-5. Always-on steps
normalizeLayoutType(result)
filterHeaderFooter(result)
assignDocTypeKwd(result, config.Bool(ConfigKeyFlattenMediaToText))
// 6. VLM enhancement
tenantID := config.String(ConfigKeyTenantID)
vlmLLMID := config.String(ConfigKeyVLMLLMID)
if tenantID != "" && vlmLLMID != "" {
describer, err := resolveImageDescriber(tenantID, vlmLLMID)
if err != nil {
return err
}
if err := enhanceWithVision(ctx, result, describer); err != nil {
return err
}
}
return nil
}
// resolveImageDescriber resolves a VLM model from tenant config and returns
// an ImageDescriber. Corresponds to Python's
// get_model_config_from_provider_instance + LLMBundle.
// resolveImageDescriber resolves a VLM model from tenant config and returns
// an ImageDescriber. The implementation is assigned by init() in
// post_steps_cgo.go (production) or post_steps_no_cgo.go (stub).
// Overridable in tests.
var resolveImageDescriber func(tenantID, llmID string) (ImageDescriber, error)
// SetImageDescriberResolver sets the factory that creates an ImageDescriber
// from tenant/LLM configuration. Higher layers (e.g. EE extensions or the
// PDF document pipeline entry point) register the real implementation via
// init(). If never called, PostProcess skips VLM enhancement.
func SetImageDescriberResolver(fn func(tenantID, llmID string) (ImageDescriber, error)) {
resolveImageDescriber = fn
}
// ── normalizeLayoutType ────────────────────────────────────────────────
// normalizeLayoutType trims whitespace from LayoutType and defaults empty
// values to "text". Matches Python's layout_type normalization in parser.py.
func normalizeLayoutType(result *pdftype.ParseResult) {
for i := range result.Sections {
lt := strings.TrimSpace(result.Sections[i].LayoutType)
if lt == "" {
lt = "text"
}
result.Sections[i].LayoutType = lt
}
}
// ── filterHeaderFooter ─────────────────────────────────────────────────
// filterHeaderFooter removes sections whose LayoutType matches
// header/footer/number/reference. Python: remove_header_footer config.
func filterHeaderFooter(result *pdftype.ParseResult) {
sections := result.Sections[:0]
for _, s := range result.Sections {
if headerFooterPattern.MatchString(strings.TrimSpace(s.LayoutType)) {
continue
}
sections = append(sections, s)
}
result.Sections = sections
}
// ── assignDocTypeKwd ───────────────────────────────────────────────────
// assignDocTypeKwd sets DocTypeKwd based on LayoutType and Image presence.
// When flatten is true, all sections become "text" and Image is cleared —
// this matches Python where flatten_media_to_text and VLM are mutually
// exclusive. Python: parser.py:639-648.
func assignDocTypeKwd(result *pdftype.ParseResult, flatten bool) {
for i := range result.Sections {
s := &result.Sections[i]
if flatten {
s.DocTypeKwd = "text"
s.Image = ""
continue
}
lt := strings.TrimSpace(s.LayoutType)
switch lt {
case "table":
s.DocTypeKwd = "table"
case "figure":
s.DocTypeKwd = "image"
default:
if lt == "" && s.Image != "" {
s.DocTypeKwd = "image"
} else {
s.DocTypeKwd = "text"
}
}
}
}
// ── enhanceWithVision ──────────────────────────────────────────────────
// enhanceWithVision adds VLM-generated descriptions to image/table sections.
func enhanceWithVision(ctx context.Context, result *pdftype.ParseResult, describer ImageDescriber) error {
if describer == nil {
return nil
}
if len(result.Sections) == 0 {
return nil
}
sem := make(chan struct{}, maxDescribeConcurrency)
var wg sync.WaitGroup
for i := range result.Sections {
s := &result.Sections[i]
if s.DocTypeKwd != "table" && s.DocTypeKwd != "image" {
continue
}
if s.Image == "" {
continue
}
wg.Add(1)
sem <- struct{}{}
go func(idx int, imgB64 string, origText string) {
defer wg.Done()
defer func() { <-sem }()
img, err := util.DecodeBase64PNG(imgB64)
if err != nil || img == nil {
return
}
desc, err := DescribeImage(ctx, img, describePrompt, describer)
if err != nil || desc == "" {
return
}
if origText != "" {
result.Sections[idx].Text = origText + "\n" + desc
} else {
result.Sections[idx].Text = desc
}
}(i, s.Image, s.Text)
}
wg.Wait()
return nil
}
// ── removeTOCByOutlines ────────────────────────────────────────────────
// removeTOCByOutlines removes sections whose page numbers fall inside
// TOC page ranges identified by PDF outlines.
func removeTOCByOutlines(result *pdftype.ParseResult, outlines []pdftype.Outline) {
if len(outlines) == 0 {
return
}
tocPage, contentPage := findTOCPageRange(outlines)
if contentPage <= tocPage {
return
}
sections := result.Sections[:0]
for _, s := range result.Sections {
pg := sectionPage(s)
if pg >= tocPage && pg < contentPage {
continue
}
sections = append(sections, s)
}
result.Sections = sections
}
// findTOCPageRange scans outlines for a TOC entry and returns the
// [tocStartPage, contentStartPage) range. Returns (0, 0) when not found.
func findTOCPageRange(outlines []pdftype.Outline) (tocPage, contentPage int) {
trimSplit:
for i, o := range outlines {
title := strings.TrimSpace(o.Title)
if idx := strings.Index(title, "@@"); idx >= 0 {
title = strings.TrimSpace(title[:idx])
}
if !tocTitlePattern.MatchString(strings.ToLower(title)) {
continue
}
tocPage = o.PageNumber
for _, next := range outlines[i+1:] {
if next.Level != o.Level {
continue
}
nt := strings.TrimSpace(next.Title)
if idx := strings.Index(nt, "@@"); idx >= 0 {
nt = strings.TrimSpace(nt[:idx])
}
if tocTitlePattern.MatchString(strings.ToLower(nt)) {
continue
}
contentPage = next.PageNumber
break trimSplit
}
break
}
return
}
// sectionPage returns the first page number of a Section, or 0.
func sectionPage(s pdftype.Section) int {
for _, p := range s.Positions {
for _, pn := range p.PageNumbers {
return pn
}
}
return 0
}
// ── reorderMultiColumn ─────────────────────────────────────────────────
// reorderMultiColumn reorders text sections in multi-column layouts.
// If median text column width >= page width / 2 (single-column layout),
// the input order is preserved.
//
// Python: reorder_multi_column_bboxes + sort_X_by_page
func reorderMultiColumn(result *pdftype.ParseResult, pageWidth, zoom float64) {
if len(result.Sections) < 2 {
return
}
pw := pageWidth / zoom
// Compute median width from text sections with valid coordinates.
var widths []float64
for _, s := range result.Sections {
if s.LayoutType != "text" {
continue
}
if len(s.Positions) == 0 {
continue
}
w := s.Positions[0].Right - s.Positions[0].Left
if w > 0 {
widths = append(widths, w)
}
}
if len(widths) == 0 {
return
}
sort.Float64s(widths)
medianW := widths[len(widths)/2]
if medianW >= pw/2 {
return // single column
}
// Sort by (PageNumber, X0, Top).
sort.Slice(result.Sections, func(i, j int) bool {
pi := sectionPage(result.Sections[i])
pj := sectionPage(result.Sections[j])
if pi != pj {
return pi < pj
}
xi := sectionX0(result.Sections[i])
xj := sectionX0(result.Sections[j])
if math.Abs(xi-xj) > 1e-6 {
return xi < xj
}
return sectionTop(result.Sections[i]) < sectionTop(result.Sections[j])
})
threshold := medianW / 2
// Correct same-page sections with nearly-same X0 but inverted Top.
for i := len(result.Sections) - 1; i >= 1; i-- {
for j := i - 1; j >= 0; j-- {
if math.Abs(sectionX0(result.Sections[j+1])-sectionX0(result.Sections[j])) < threshold &&
sectionTop(result.Sections[j+1]) < sectionTop(result.Sections[j]) &&
sectionPage(result.Sections[j+1]) == sectionPage(result.Sections[j]) {
result.Sections[j], result.Sections[j+1] = result.Sections[j+1], result.Sections[j]
}
}
}
}
func sectionX0(s pdftype.Section) float64 {
for _, p := range s.Positions {
return p.Left
}
return 0
}
func sectionTop(s pdftype.Section) float64 {
for _, p := range s.Positions {
return p.Top
}
return 0
}