// ragflow/internal/deepdoc/parser/pdf/layout/boxes_sections.go

package layout

import (
	"sort"
	"strings"

	pdf "ragflow/internal/deepdoc/parser/pdf/type"
	util "ragflow/internal/deepdoc/parser/pdf/util"
)

// ResolvePageSpan determines the last page a box reaches and the box's bottom
// coordinate relative to that page, given a bottom measured from the top of
// the starting page pageNum.
//
// The inputs are returned unchanged when pageHeights is nil, has no entry for
// pageNum, holds a zero or negative height for pageNum, or when bottom does
// not exceed the starting page's height.
//
// Otherwise the height of each page is subtracted from bottom while advancing
// one page at a time, until the remainder fits within the current page. When
// the next page's height is missing or non-positive, the walk still advances
// onto that page once (subtracting the last valid height) and then stops, so
// the loop can never run past the known pages.
func ResolvePageSpan(pageNum int, bottom float64, pageHeights map[int]float64) (toPage int, newBottom float64) {
	// Looking up a key in a nil map reports "not found", so the nil case is
	// covered by the same guard as a missing starting page.
	curHeight, known := pageHeights[pageNum]
	if !known || curHeight <= 0 || bottom <= curHeight {
		return pageNum, bottom
	}

	page, rest := pageNum, bottom
	for rest > curHeight {
		// Consume the current page and move on to the following one.
		rest -= curHeight
		page++

		nextHeight, found := pageHeights[page]
		if !found || nextHeight <= 0 {
			// Unknown or invalid height: keep the single step already
			// taken with the last valid height and stop walking.
			break
		}
		curHeight = nextHeight
	}
	return page, rest
}

// BoxesToSections turns layout boxes into sections carrying a position tag
// and a typed position.
//
// Boxes whose text is empty after trimming surrounding whitespace are
// dropped. For the remaining boxes, ResolvePageSpan is consulted with
// pageHeights to find the last page the box reaches and its bottom on that
// page; a box confined to one page gets a single-page tag, while a box that
// runs past its page gets a page-range tag and lists every page from its
// first to its last.
//
// According to the original port notes, pageHeights holds each page's height
// in PDF points (image height / zoom) and the result is consumed by Python's
// naive.py::chunk().
func BoxesToSections(boxes []pdf.TextBox, pageHeights map[int]float64) []pdf.Section {
	out := make([]pdf.Section, 0, len(boxes))
	for _, box := range boxes {
		text := strings.TrimSpace(box.Text)
		if text == "" {
			continue
		}

		firstPage := box.PageNumber
		lastPage, bottom := ResolvePageSpan(firstPage, box.Bottom, pageHeights)

		// Every page touched by the box, inclusive on both ends; this is a
		// single entry when the box stays on its starting page.
		pages := make([]int, 0, lastPage-firstPage+1)
		for p := firstPage; p <= lastPage; p++ {
			pages = append(pages, p)
		}

		var tag string
		if lastPage == firstPage {
			tag = util.FormatPositionTag(firstPage, box.X0, box.X1, box.Top, bottom)
		} else {
			tag = util.FormatPositionTagRange(firstPage, lastPage, box.X0, box.X1, box.Top, bottom)
		}

		position := pdf.Position{
			PageNumbers: pages,
			Left:        box.X0,
			Right:       box.X1,
			Top:         box.Top,
			Bottom:      bottom,
		}
		out = append(out, pdf.Section{
			Text:        text,
			PositionTag: tag,
			LayoutType:  box.LayoutType,
			Positions:   []pdf.Position{position},
		})
	}
	return out
}

// NormalizeSectionPositions fills in the Positions field, in place, for every
// section that has none but does carry a non-empty PositionTag, by parsing the
// tag with util.ExtractPositions. Sections that already have positions, or
// that have no tag to parse, are left untouched.
//
// The original notes describe this as the counterpart of Python's
// normalize_pdf_items_metadata: it canonicalizes the string tag into the typed
// []Position form. They also advise calling it after Parse() returns and just
// before the sections are consumed (serialization, chunking), rather than
// inside the parser pipeline, because sections may originate from several
// sources (deepdoc, MinerU, Docling, JSON deserialization, ...).
func NormalizeSectionPositions(sections []pdf.Section) {
	for idx := range sections {
		sec := &sections[idx]
		if len(sec.Positions) > 0 || sec.PositionTag == "" {
			// Already populated, or nothing to parse from.
			continue
		}
		sec.Positions = util.ExtractPositions(sec.PositionTag)
	}
}

// SortByPageThenY sorts boxes in place by page number, then by a vertical
// coordinate, then by X0. The vertical coordinate is Top when sortByTop is
// true and Bottom otherwise. The sort uses sort.Slice, so the relative order
// of boxes that compare equal on all three keys is not guaranteed.
func SortByPageThenY(boxes []pdf.TextBox, sortByTop bool) {
	// vertical returns the secondary sort key of the box at idx.
	vertical := func(idx int) float64 {
		if sortByTop {
			return boxes[idx].Top
		}
		return boxes[idx].Bottom
	}

	sort.Slice(boxes, func(a, b int) bool {
		ya, yb := vertical(a), vertical(b)
		switch {
		case boxes[a].PageNumber != boxes[b].PageNumber:
			return boxes[a].PageNumber < boxes[b].PageNumber
		case ya != yb:
			return ya < yb
		default:
			return boxes[a].X0 < boxes[b].X0
		}
	})
}

// SectionsToMarkdown renders sections as a single markdown string.
//
//   - A title section is written as "\n## " followed by its text and a
//     newline.
//   - A figure section with a non-empty Image is written as
//     "\n![Image](data:image/png;base64,<Image>)"; its text is skipped and no
//     trailing newline is added.
//   - Every other section (including a figure without an image) is written as
//     its text followed by a newline.
//
// The original notes state this mirrors the markdown output path of the
// Python parser (parser.py:665-671).
func SectionsToMarkdown(sections []pdf.Section) string {
	var md strings.Builder
	for i := range sections {
		sec := &sections[i]

		if sec.LayoutType == pdf.LayoutTypeTitle {
			md.WriteString("\n## ")
		}

		isImageFigure := sec.LayoutType == pdf.LayoutTypeFigure && sec.Image != ""
		if isImageFigure {
			// Inline the image as a data URI and move on without emitting
			// the section text.
			md.WriteString("\n![Image](data:image/png;base64,")
			md.WriteString(sec.Image)
			md.WriteString(")")
			continue
		}

		md.WriteString(sec.Text)
		md.WriteString("\n")
	}
	return md.String()
}

// SectionsToJSON converts sections into a slice of generic maps, one per
// section and in the same order, suitable for JSON encoding.
//
// Each map has the keys "text", "layout_type", "doc_type_kwd",
// "_pdf_positions" and "image". "_pdf_positions" is a list with one entry per
// position, each entry being [pages, left, right, top, bottom] where pages is
// the list of page numbers. The list is always non-nil, so a section without
// positions yields an empty list rather than a nil value.
//
// The original notes state that "_pdf_positions" corresponds to Python's
// PDF_POSITIONS_KEY constant read by the chunker's extract_pdf_positions, and
// that this function mirrors the Python parser's set_output("json", bboxes)
// path (parser.py:662).
func SectionsToJSON(sections []pdf.Section) []map[string]any {
	out := make([]map[string]any, 0, len(sections))
	for _, sec := range sections {
		encoded := make([][]any, 0, len(sec.Positions))
		for _, pos := range sec.Positions {
			// Page numbers are re-wrapped element by element so the list is
			// a []any like the rest of the entry.
			pageList := make([]any, 0, len(pos.PageNumbers))
			for _, num := range pos.PageNumbers {
				pageList = append(pageList, num)
			}
			encoded = append(encoded, []any{pageList, pos.Left, pos.Right, pos.Top, pos.Bottom})
		}

		entry := map[string]any{
			"text":           sec.Text,
			"layout_type":    sec.LayoutType,
			"doc_type_kwd":   sec.DocTypeKwd,
			"_pdf_positions": encoded,
			"image":          sec.Image,
		}
		out = append(out, entry)
	}
	return out
}