Files
ragflow/internal/deepdoc/parser/pdf/layout/boxes_sections.go
Jack 98323e7910 Refactor: oss parser go refactor (#16391)
### What problem does this PR solve?

Package refactor and PDF post process.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-29 18:46:41 +08:00

175 lines
5.6 KiB
Go

package layout
import (
"sort"
"strings"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
util "ragflow/internal/deepdoc/parser/pdf/util"
)
// ResolvePageSpan works out on which page a box ends and what its bottom
// coordinate is relative to that page, for boxes that may overflow the page
// they start on.
//
// pageNum is the starting page and bottom is the box bottom measured from the
// top of that page. pageHeights maps a page number to its height.
//
// The inputs are returned unchanged when pageHeights is nil, when the starting
// page has no entry or a non-positive height, or when bottom does not exceed
// the starting page's height.
//
// Otherwise the function walks forward page by page, subtracting the current
// page height from the bottom and advancing toPage, until the remainder fits.
// If the height of the page just advanced onto is missing or non-positive the
// walk stops right there: the span has already been extended by the last known
// height exactly once, which also guarantees termination.
func ResolvePageSpan(pageNum int, bottom float64, pageHeights map[int]float64) (toPage int, newBottom float64) {
	if pageHeights == nil {
		return pageNum, bottom
	}
	height, known := pageHeights[pageNum]
	if !known || height <= 0 || bottom <= height {
		return pageNum, bottom
	}

	toPage, newBottom = pageNum, bottom
	for newBottom > height {
		// Consume the current page and move on to the next one.
		newBottom -= height
		toPage++

		// An unknown or invalid height for the page we just moved onto ends
		// the walk (the Python reference's _line_tag while-loop break path).
		next, found := pageHeights[toPage]
		if !found || next <= 0 {
			break
		}
		height = next
	}
	return toPage, newBottom
}
// BoxesToSections turns layout boxes into sections that carry position tags.
//
// Each box's text is whitespace-trimmed; boxes that end up empty are skipped.
// pageHeights provides the PDF-point height of each page (image height / zoom)
// and is handed to ResolvePageSpan so that a box extending past its page gets
// a multi-page position tag and a bottom coordinate relative to the last page.
//
// Every emitted section has exactly one Position listing all pages from the
// box's starting page through the resolved end page.
//
// Python equivalent: output consumed by naive.py::chunk()
func BoxesToSections(boxes []pdf.TextBox, pageHeights map[int]float64) []pdf.Section {
	out := make([]pdf.Section, 0, len(boxes))
	for i := range boxes {
		box := &boxes[i]
		text := strings.TrimSpace(box.Text)
		if text == "" {
			continue
		}

		first := box.PageNumber
		last, bottom := ResolvePageSpan(first, box.Bottom, pageHeights)

		// ResolvePageSpan only ever advances the page, so last >= first and a
		// box confined to one page yields a single-element list.
		pages := make([]int, 0, last-first+1)
		for p := first; p <= last; p++ {
			pages = append(pages, p)
		}

		// Single-page and multi-page boxes use different tag formatters.
		var tag string
		if last == first {
			tag = util.FormatPositionTag(first, box.X0, box.X1, box.Top, bottom)
		} else {
			tag = util.FormatPositionTagRange(first, last, box.X0, box.X1, box.Top, bottom)
		}

		out = append(out, pdf.Section{
			Text:        text,
			PositionTag: tag,
			LayoutType:  box.LayoutType,
			Positions: []pdf.Position{{
				PageNumbers: pages,
				Left:        box.X0,
				Right:       box.X1,
				Top:         box.Top,
				Bottom:      bottom,
			}},
		})
	}
	return out
}
// NormalizeSectionPositions fills in the Positions field of every section that
// has none but does have a PositionTag, by parsing the tag with
// util.ExtractPositions. Sections that already carry Positions, or that have
// an empty PositionTag, are left untouched. The slice is modified in place.
//
// This mirrors the Python normalize_pdf_items_metadata: position metadata is
// canonicalized from the string tag format into the typed []Position form.
//
// Call this AFTER Parse() returns, right before the sections are consumed
// (e.g. serialized to JSON or handed to the chunker). It is deliberately kept
// out of the parser pipeline because sections may originate from several
// sources (deepdoc, MinerU, Docling, JSON deserialization, etc.).
func NormalizeSectionPositions(sections []pdf.Section) {
	for i := range sections {
		sec := &sections[i]
		if len(sec.Positions) > 0 || sec.PositionTag == "" {
			// Already populated, or nothing to parse from.
			continue
		}
		sec.Positions = util.ExtractPositions(sec.PositionTag)
	}
}
// SortByPageThenY sorts boxes in place, ordering by page number first, then by
// a vertical coordinate, then by X0.
//
// The vertical coordinate is Top when sortByTop is true and Bottom otherwise.
// The sort uses sort.Slice, so the relative order of boxes that compare equal
// on all three keys is not guaranteed to be preserved.
func SortByPageThenY(boxes []pdf.TextBox, sortByTop bool) {
	sort.Slice(boxes, func(i, j int) bool {
		lhs, rhs := &boxes[i], &boxes[j]
		if lhs.PageNumber != rhs.PageNumber {
			return lhs.PageNumber < rhs.PageNumber
		}

		// Pick the vertical key requested by the caller.
		yl, yr := lhs.Bottom, rhs.Bottom
		if sortByTop {
			yl, yr = lhs.Top, rhs.Top
		}
		if yl != yr {
			return yl < yr
		}
		return lhs.X0 < rhs.X0
	})
}
// SectionsToMarkdown renders sections as a single markdown string.
//
//   - A title section is preceded by "\n## ".
//   - A figure section with a non-empty Image is emitted as
//     "\n![Image](data:image/png;base64,<Image>)" and contributes nothing else
//     (neither its text nor a trailing newline).
//   - Every other section, including a figure without an Image, contributes
//     its Text followed by a newline.
//
// This mirrors the Python parser.py:665-671 markdown output path.
func SectionsToMarkdown(sections []pdf.Section) string {
	var md strings.Builder
	for i := range sections {
		sec := &sections[i]
		embedsImage := sec.LayoutType == pdf.LayoutTypeFigure && sec.Image != ""

		if sec.LayoutType == pdf.LayoutTypeTitle {
			md.WriteString("\n## ")
		}
		if embedsImage {
			md.WriteString("\n![Image](data:image/png;base64," + sec.Image + ")")
			continue
		}
		md.WriteString(sec.Text + "\n")
	}
	return md.String()
}
// SectionsToJSON converts sections into a slice of Python-compatible dicts.
//
// Each dict has the keys text, layout_type, doc_type_kwd, _pdf_positions and
// image. The _pdf_positions value is a list with one entry per Position, each
// encoded as [pages, left, right, top, bottom] where pages is the list of page
// numbers. The key mirrors Python's PDF_POSITIONS_KEY constant, the canonical
// position format consumed by the chunker's extract_pdf_positions.
//
// A section without positions gets an empty (non-nil) list.
//
// This mirrors the Python parser.py:662 set_output("json", bboxes) path.
func SectionsToJSON(sections []pdf.Section) []map[string]any {
	out := make([]map[string]any, 0, len(sections))
	for i := range sections {
		sec := &sections[i]

		positions := make([][]any, 0, len(sec.Positions))
		for _, pos := range sec.Positions {
			// Page numbers are boxed individually so the list is []any.
			pages := make([]any, 0, len(pos.PageNumbers))
			for _, n := range pos.PageNumbers {
				pages = append(pages, n)
			}
			positions = append(positions, []any{pages, pos.Left, pos.Right, pos.Top, pos.Bottom})
		}

		out = append(out, map[string]any{
			"text":           sec.Text,
			"layout_type":    sec.LayoutType,
			"doc_type_kwd":   sec.DocTypeKwd,
			"_pdf_positions": positions,
			"image":          sec.Image,
		})
	}
	return out
}