Files
ragflow/internal/deepdoc/parser/pdf/geometry.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

HTTP API based on an ONNX model.
Migrate pdf_parser.py to Go.

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

301 lines
8.0 KiB
Go

package parser
import (
	"image"
	"math"
	"sort"
	"unicode/utf8"
)
// CharWidth returns the average width of one character in c, computed as
// (X1 - X0) divided by the number of characters in c.Text.
//
// Characters are counted as Unicode code points (runes), not bytes, so that
// multi-byte UTF-8 text such as CJK produces the same divisor as Python's
// len(text). Returns 0 if Text is empty.
//
// Python: pdf_parser.py:107 __char_width()
//
// Example:
//
//	c := TextChar{X0: 50, X1: 58, Text: "A"}
//	w := CharWidth(c) // (58-50)/1 = 8
func CharWidth(c TextChar) float64 {
	// len(c.Text) would count bytes; a 3-byte CJK character would then be
	// reported as one third of its real width.
	n := utf8.RuneCountInString(c.Text)
	if n == 0 {
		return 0
	}
	return (c.X1 - c.X0) / float64(n)
}
// CharHeight reports how tall the character c is: the distance from its Top
// edge to its Bottom edge.
//
// Python: pdf_parser.py:110 __height()
//
// Example:
//
//	c := TextChar{Top: 200, Bottom: 212}
//	h := CharHeight(c) // 212-200 = 12
func CharHeight(c TextChar) float64 {
	top, bottom := c.Top, c.Bottom
	return bottom - top
}
// XDis returns the smallest of three horizontal gaps between characters a
// and b:
//
//   - |a.X1 - b.X0|: a's right edge to b's left edge,
//   - |a.X0 - b.X1|: a's left edge to b's right edge,
//   - |(a.X0 + a.X1) - (b.X0 + b.X1)| / 2: distance between the centers.
//
// Used to determine if the two characters belong to the same text line.
//
// Python: pdf_parser.py:113 _x_dis()
//
// Example:
//
//	a := TextChar{X0: 50, X1: 58}
//	b := TextChar{X0: 60, X1: 68}
//	d := XDis(a, b) // min(|58-60|=2, |50-68|=18, |108-128|/2=10) = 2
func XDis(a, b TextChar) float64 {
	rightToLeft := math.Abs(a.X1 - b.X0)
	leftToRight := math.Abs(a.X0 - b.X1)
	centerGap := math.Abs(a.X0+a.X1-b.X0-b.X1) / 2
	return min(rightToLeft, leftToRight, centerGap)
}
// YDis returns the signed vertical distance from the centerline of a to the
// centerline of b. The result is positive when b lies below a.
//
// Python: pdf_parser.py:116 _y_dis()
//
// Example:
//
//	a := TextChar{Top: 100, Bottom: 112}
//	b := TextChar{Top: 114, Bottom: 126}
//	d := YDis(a, b) // (114+126-100-112)/2 = 14
func YDis(a, b TextChar) float64 {
	// Twice the centerline offset; halved on return.
	doubled := b.Top + b.Bottom - a.Top - a.Bottom
	return doubled / 2
}
// BoxWidth reports the horizontal extent of text box b (X1 - X0).
func BoxWidth(b TextBox) float64 {
	left, right := b.X0, b.X1
	return right - left
}
// BoxHeight reports the vertical extent of text box b (Bottom - Top).
func BoxHeight(b TextBox) float64 {
	top, bottom := b.Top, b.Bottom
	return bottom - top
}
// BoxYDis returns the signed vertical distance from the centerline of b1 to
// the centerline of b2. The result is positive when b2 lies below b1.
func BoxYDis(b1, b2 TextBox) float64 {
	// Twice the centerline offset; halved on return.
	doubled := b2.Top + b2.Bottom - b1.Top - b1.Bottom
	return doubled / 2
}
// BoxXDis returns the smallest of three horizontal gaps between boxes b1 and
// b2: b1's right edge to b2's left edge, b1's left edge to b2's right edge,
// and the distance between the two horizontal centers.
func BoxXDis(b1, b2 TextBox) float64 {
	rightToLeft := math.Abs(b1.X1 - b2.X0)
	leftToRight := math.Abs(b1.X0 - b2.X1)
	centerGap := math.Abs(b1.X0+b1.X1-b2.X0-b2.X1) / 2
	return min(rightToLeft, leftToRight, centerGap)
}
// ── Rectangular interface and overlap helpers ──────────────────────────

// Rectangular is any 2D axis-aligned rectangle that can report its bounds.
//
// The helpers in this file that consume a Rectangular (Area, OverlapInter,
// OverlapX, ...) treat (x0, y0) as the minimum corner and (x1, y1) as the
// maximum corner; a rectangle with x1 <= x0 or y1 <= y0 has zero area.
type Rectangular interface {
// Bounds returns the rectangle's corner coordinates in the order
// x0, y0, x1, y1.
Bounds() (x0, y0, x1, y1 float64)
}
// Area computes width * height of r from its reported bounds.
// A rectangle whose x1 <= x0 or y1 <= y0 is degenerate and has area 0.
func Area(r Rectangular) float64 {
	left, top, right, bottom := r.Bounds()
	if right <= left {
		return 0
	}
	if bottom <= top {
		return 0
	}
	return (right - left) * (bottom - top)
}
// rectOverlapInter computes the area shared by rectangle A (x0a, y0a, x1a, y1a)
// and rectangle B (x0b, y0b, x1b, y1b), both axis-aligned.
// The result is 0 when the rectangles do not overlap (touching edges count as
// no overlap) or when either rectangle is degenerate.
func rectOverlapInter(x0a, y0a, x1a, y1a, x0b, y0b, x1b, y1b float64) float64 {
	// Horizontal extent of the intersection.
	left, right := max(x0a, x0b), min(x1a, x1b)
	if left >= right {
		return 0
	}
	// Vertical extent of the intersection.
	top, bottom := max(y0a, y0b), min(y1a, y1b)
	if top >= bottom {
		return 0
	}
	return (right - left) * (bottom - top)
}
// OverlapInter returns the raw (unnormalized) intersection area of a and b,
// obtained by passing both rectangles' bounds to rectOverlapInter.
func OverlapInter(a, b Rectangular) float64 {
	aLeft, aTop, aRight, aBottom := a.Bounds()
	bLeft, bTop, bRight, bBottom := b.Bounds()
	shared := rectOverlapInter(
		aLeft, aTop, aRight, aBottom,
		bLeft, bTop, bRight, bBottom,
	)
	return shared
}
// OverlapRatio divides the intersection area of a and b by the area of denom.
// The result is 0 when a and b do not intersect or when denom has zero area.
// Area(denom) is only evaluated once an intersection has been found.
func OverlapRatio(a, b, denom Rectangular) float64 {
	shared := OverlapInter(a, b)
	if shared <= 0 {
		return 0
	}
	switch base := Area(denom); {
	case base <= 0:
		return 0
	default:
		return shared / base
	}
}
// OverlapRatioA is OverlapRatio with a itself as the denominator:
// intersection(a, b) / Area(a).
func OverlapRatioA(a, b Rectangular) float64 {
	denom := a
	return OverlapRatio(a, b, denom)
}
// OverlapRatioMax divides the intersection area of a and b by the area of
// whichever rectangle is larger: intersection(a,b) / max(Area(a), Area(b)).
// The result is 0 when there is no intersection or the larger area is zero.
func OverlapRatioMax(a, b Rectangular) float64 {
	shared := OverlapInter(a, b)
	if shared <= 0 {
		return 0
	}
	areaA, areaB := Area(a), Area(b)
	larger := max(areaA, areaB)
	if larger <= 0 {
		return 0
	}
	return shared / larger
}
// OverlapX measures how much a and b overlap along the X axis only, ignoring
// their vertical extent.
//
// Ratio = overlap_width / max(1, min(width(a), width(b))). The floor of 1 on
// the denominator keeps very narrow rectangles from inflating the ratio.
//
// Python: pdf_parser.py:964-965 overlap calculation in _naive_vertical_merge
func OverlapX(a, b Rectangular) float64 {
	aLeft, _, aRight, _ := a.Bounds()
	bLeft, _, bRight, _ := b.Bounds()

	// Width of the shared X interval, clamped to 0 when the spans are disjoint.
	shared := math.Max(0, math.Min(aRight, bRight)-math.Max(aLeft, bLeft))
	narrower := math.Min(aRight-aLeft, bRight-bLeft)
	return shared / math.Max(1, narrower)
}
// SortXByPage orders boxes by page number, then X0, then Top, and afterwards
// repairs a layout artifact: neighbouring boxes on the same page whose X0
// values differ by less than threshold are reordered so the one with the
// smaller Top comes first.
//
// The slice is sorted in place and the same slice is returned.
//
// The repair step is an insertion-style pass: for each position i, the
// adjacent pairs (j, j+1) are visited from j = i down to 0, so a box can
// bubble all the way to the front of its near-equal-X0 run. Boxes with fully
// identical sort keys keep their input order (stable sort, as Python's
// sorted() does).
//
// Python: pdf_parser.py:178 sort_X_by_page()
func SortXByPage(boxes []TextBox, threshold float64) []TextBox {
	sort.SliceStable(boxes, func(i, j int) bool {
		a, b := &boxes[i], &boxes[j]
		if a.PageNumber != b.PageNumber {
			return a.PageNumber < b.PageNumber
		}
		if a.X0 != b.X0 {
			return a.X0 < b.X0
		}
		return a.Top < b.Top
	})

	for i := 0; i < len(boxes)-1; i++ {
		for j := i; j >= 0; j-- {
			prev, next := &boxes[j], &boxes[j+1]
			if next.PageNumber == prev.PageNumber &&
				math.Abs(next.X0-prev.X0) < threshold &&
				next.Top < prev.Top {
				boxes[j], boxes[j+1] = boxes[j+1], boxes[j]
			}
		}
	}
	return boxes
}
// MedianCharHeight returns the median of CharHeight over chars, or 10 when
// chars is empty. It corresponds to Python's np.median(char height) in
// __images__ (pdf_parser.py:1552) and serves as a reference unit for vertical
// spacing decisions.
func MedianCharHeight(chars []TextChar) float64 {
	heights := make([]float64, 0, len(chars))
	for i := range chars {
		heights = append(heights, CharHeight(chars[i]))
	}
	return medianFloat64(heights, 10)
}
// MedianCharWidth returns the median of CharWidth over chars, or 5 when chars
// is empty. It corresponds to Python's np.median(char width) in __images__
// (pdf_parser.py:1553).
func MedianCharWidth(chars []TextChar) float64 {
	widths := make([]float64, 0, len(chars))
	for i := range chars {
		widths = append(widths, CharWidth(chars[i]))
	}
	return medianFloat64(widths, 5)
}
// MedianHeight returns the median of (Bottom - Top) over boxes.
// Falls back to 10 if boxes is empty.
//
// Python: np.median([b["bottom"]-b["top"] for b in bxs]) or 10
// in _naive_vertical_merge:941
func MedianHeight(boxes []TextBox) float64 {
	heights := make([]float64, 0, len(boxes))
	for i := range boxes {
		heights = append(heights, boxes[i].Bottom-boxes[i].Top)
	}
	return medianFloat64(heights, 10)
}
// medianFloat64 returns the median of vals, or fallback when vals is empty.
// For an even number of values the two middle values are averaged.
//
// Note: vals is sorted in place as a side effect.
func medianFloat64(vals []float64, fallback float64) float64 {
	n := len(vals)
	if n == 0 {
		return fallback
	}
	sort.Float64s(vals)
	mid := n / 2
	if n%2 == 1 {
		return vals[mid]
	}
	return (vals[mid-1] + vals[mid]) / 2
}
// rect is a minimal axis-aligned rectangle used for overlap calculations.
// Its coordinates live in whatever space the caller works in (pixels or PDF
// points).
type rect struct {
	x0, y0 float64 // first corner
	x1, y1 float64 // second corner
}

// Bounds returns the stored coordinates in the order x0, y0, x1, y1.
func (r rect) Bounds() (x0, y0, x1, y1 float64) {
	return r.x0, r.y0, r.x1, r.y1
}
// rectOverlap is the rect-typed convenience form of OverlapRatioMax:
// area(intersection) / max(area(a), area(b)).
// It yields 0 when a and b do not overlap.
func rectOverlap(a, b rect) float64 {
	ratio := OverlapRatioMax(a, b)
	return ratio
}
// fastCrop copies the region [x0, x1) x [y0, y1) of src into a freshly
// allocated *image.RGBA whose bounds start at (0, 0).
//
// The requested region is first clamped to src.Bounds(). If nothing remains
// (the region lies outside src, or its corners are inverted), a 1x1 RGBA
// image is returned instead.
//
// When src is an *image.RGBA each row is transferred with a single copy of
// its Pix bytes; any other image type is converted pixel by pixel through
// At/Set.
func fastCrop(src image.Image, x0, y0, x1, y1 int) *image.RGBA {
	// Build the rectangle by hand: image.Rect would swap inverted corners,
	// whereas an inverted request must be treated as empty.
	region := image.Rectangle{
		Min: image.Point{X: x0, Y: y0},
		Max: image.Point{X: x1, Y: y1},
	}.Intersect(src.Bounds())
	if region.Empty() {
		return image.NewRGBA(image.Rect(0, 0, 1, 1))
	}

	width, height := region.Dx(), region.Dy()
	out := image.NewRGBA(image.Rect(0, 0, width, height))

	rgba, isRGBA := src.(*image.RGBA)
	if !isRGBA {
		// Generic path: let Set convert each source colour to RGBA.
		for y := region.Min.Y; y < region.Max.Y; y++ {
			for x := region.Min.X; x < region.Max.X; x++ {
				out.Set(x-region.Min.X, y-region.Min.Y, src.At(x, y))
			}
		}
		return out
	}

	// Fast path: both images store 4 bytes per pixel, so a row is one
	// contiguous run of bytes in each Pix slice.
	rowBytes := 4 * width
	for row := 0; row < height; row++ {
		from := rgba.PixOffset(region.Min.X, region.Min.Y+row)
		to := row * out.Stride
		copy(out.Pix[to:to+rowBytes], rgba.Pix[from:from+rowBytes])
	}
	return out
}