ragflow/internal/deepdoc/parser/pdf/position.go

package parser

import (
	"fmt"
	"log/slog"
	"regexp"
	"strconv"
	"strings"
)

// @@ page position tag regex patterns.
//
// Python: pdf_parser.py:1868 remove_tag, 1872 extract_positions

// posTagPattern matches the full @@...## tag including coordinates.
// Format: @@{page_range}\t{left}\t{right}\t{top}\t{bottom}##
var posTagPattern = regexp.MustCompile(`@@[0-9-]+\t[0-9.\t]+##`)

// ExtractPositions parses @@ position tags from a text string.
//
// Each tag has format:
//
//	@@{page_range}\t{left}\t{right}\t{top}\t{bottom}##
//
// page_range can be a single page ("3") or a range ("0-2").
// Pages are zero-indexed in the returned values (subtracting 1 from PDF page numbers).
//
// Python: pdf_parser.py:1872 extract_positions()
//
// Example:
//
//	text := "Some text @@0-1\t50.0\t300.0\t200.0\t400.0## more text"
//	poss := ExtractPositions(text)
//	// poss[0] = Position{PageNumbers: [-1, 0], Left: 50.0, Right: 300.0, Top: 200.0, Bottom: 400.0}
func ExtractPositions(text string) []Position {
	var poss []Position
	for _, tag := range posTagPattern.FindAllString(text, -1) {
		cleaned := strings.TrimPrefix(strings.TrimSuffix(tag, "##"), "@@")
		parts := strings.Split(cleaned, "\t")
		if len(parts) != 5 {
			continue
		}

		// Parse page range
		var pageNums []int
		for _, p := range strings.Split(parts[0], "-") {
			n, err := strconv.Atoi(p)
			if err != nil {
				slog.Warn("ExtractPositions: invalid page number in tag", "tag", tag, "part", p, "err", err)
				continue
			}
			pageNums = append(pageNums, n-1) // 0-index
		}

		left, err := strconv.ParseFloat(parts[1], 64)
		if err != nil {
			slog.Warn("ExtractPositions: invalid left coordinate", "tag", tag, "err", err)
			continue
		}
		right, err := strconv.ParseFloat(parts[2], 64)
		if err != nil {
			slog.Warn("ExtractPositions: invalid right coordinate", "tag", tag, "err", err)
			continue
		}
		top, err := strconv.ParseFloat(parts[3], 64)
		if err != nil {
			slog.Warn("ExtractPositions: invalid top coordinate", "tag", tag, "err", err)
			continue
		}
		bottom, err := strconv.ParseFloat(parts[4], 64)
		if err != nil {
			slog.Warn("ExtractPositions: invalid bottom coordinate", "tag", tag, "err", err)
			continue
		}

		poss = append(poss, Position{
			PageNumbers: pageNums,
			Left:        left,
			Right:       right,
			Top:         top,
			Bottom:      bottom,
		})
	}
	return poss
}

// FormatPositionTag creates a @@ position tag string from page number and bounding box.
//
// Reverse of ExtractPositions. Used when converting PDF engine
// bboxes back to RAGFlow position tag format.
//
// Example:
//
//	tag := FormatPositionTag(0, 50.0, 300.0, 200.0, 400.0)
//	// "@@0-0\t50.0\t300.0\t200.0\t400.0##"
func FormatPositionTag(pageNum int, left, right, top, bottom float64) string {
	return fmt.Sprintf("@@%d\t%.1f\t%.1f\t%.1f\t%.1f##",
		pageNum+1, left, right, top, bottom)
}

// FormatPositionTagRange creates a @@ position tag for multi-page content.
//
// Example:
//
//	tag := FormatPositionTagRange(0, 2, 50.0, 300.0, 200.0, 400.0)
//	// "@@0-2\t50.0\t300.0\t200.0\t400.0##"
func FormatPositionTagRange(fromPage, toPage int, left, right, top, bottom float64) string {
	return fmt.Sprintf("@@%d-%d\t%.1f\t%.1f\t%.1f\t%.1f##",
		fromPage+1, toPage+1, left, right, top, bottom)
}