mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
### What problem does this PR solve?

HTTP API based on ONNX model. Port pdf_parser.py to Go.

### Type of change

- [x] Refactoring
301 lines
8.0 KiB
Go
301 lines
8.0 KiB
Go
package parser
|
|
|
|
import (
|
|
"image"
|
|
"math"
|
|
"sort"
|
|
)
|
|
|
|
// CharWidth returns the average character width: (x1 - x0) / len(text).
|
|
// Returns 0 if text is empty.
|
|
//
|
|
// Python: pdf_parser.py:107 __char_width()
|
|
//
|
|
// Example:
|
|
//
|
|
// c := TextChar{X0: 50, X1: 58, Text: "A"}
|
|
// w := CharWidth(c) // (58-50)/1 = 8
|
|
func CharWidth(c TextChar) float64 {
|
|
if len(c.Text) == 0 {
|
|
return 0
|
|
}
|
|
return (c.X1 - c.X0) / float64(len(c.Text))
|
|
}
|
|
|
|
// CharHeight returns the character height in PDF points.
|
|
//
|
|
// Python: pdf_parser.py:110 __height()
|
|
//
|
|
// Example:
|
|
//
|
|
// c := TextChar{Top: 200, Bottom: 212}
|
|
// h := CharHeight(c) // 212-200 = 12
|
|
func CharHeight(c TextChar) float64 {
|
|
return c.Bottom - c.Top
|
|
}
|
|
|
|
// XDis computes the minimum horizontal distance between two characters.
|
|
// Used to determine if they belong to the same text line.
|
|
//
|
|
// Python: pdf_parser.py:113 _x_dis()
|
|
//
|
|
// Example:
|
|
//
|
|
// a := TextChar{X0: 50, X1: 58}
|
|
// b := TextChar{X0: 60, X1: 68}
|
|
// d := XDis(a, b) // min(|58-60|=2, |50-68|=18, |108-128|/2=10) = 2
|
|
func XDis(a, b TextChar) float64 {
|
|
return min(
|
|
math.Abs(a.X1-b.X0),
|
|
min(math.Abs(a.X0-b.X1), math.Abs(a.X0+a.X1-b.X0-b.X1)/2),
|
|
)
|
|
}
|
|
|
|
// YDis computes the vertical distance between two characters' centerlines.
|
|
// Positive means b is below a.
|
|
//
|
|
// Python: pdf_parser.py:116 _y_dis()
|
|
//
|
|
// Example:
|
|
//
|
|
// a := TextChar{Top: 100, Bottom: 112}
|
|
// b := TextChar{Top: 114, Bottom: 126}
|
|
// d := YDis(a, b) // (114+126-100-112)/2 = 14
|
|
func YDis(a, b TextChar) float64 {
|
|
return (b.Top + b.Bottom - a.Top - a.Bottom) / 2
|
|
}
|
|
|
|
// BoxWidth returns the width of a text box.
|
|
func BoxWidth(b TextBox) float64 {
|
|
return b.X1 - b.X0
|
|
}
|
|
|
|
// BoxHeight returns the height of a text box.
|
|
func BoxHeight(b TextBox) float64 {
|
|
return b.Bottom - b.Top
|
|
}
|
|
|
|
// BoxYDis computes vertical centerline distance between boxes.
|
|
// Positive means b2 is below b1.
|
|
func BoxYDis(b1, b2 TextBox) float64 {
|
|
return (b2.Top + b2.Bottom - b1.Top - b1.Bottom) / 2
|
|
}
|
|
|
|
// BoxXDis computes horizontal distance between boxes.
|
|
func BoxXDis(b1, b2 TextBox) float64 {
|
|
return min(
|
|
math.Abs(b1.X1-b2.X0),
|
|
min(math.Abs(b1.X0-b2.X1), math.Abs(b1.X0+b1.X1-b2.X0-b2.X1)/2),
|
|
)
|
|
}
|
|
|
|
// ── Rectangular interface and overlap helpers ──────────────────────────
|
|
|
|
// Rectangular is any 2D axis-aligned rectangle that can report its bounds.
//
// Area treats a rectangle whose x1 <= x0 or y1 <= y0 as degenerate and
// reports zero area for it.
type Rectangular interface {
	// Bounds returns the rectangle's coordinates in the order
	// (x0, y0, x1, y1).
	Bounds() (x0, y0, x1, y1 float64)
}
|
|
|
|
// Area returns the area of a Rectangular. Returns 0 for degenerate rects.
|
|
func Area(r Rectangular) float64 {
|
|
x0, y0, x1, y1 := r.Bounds()
|
|
if x1 <= x0 || y1 <= y0 {
|
|
return 0
|
|
}
|
|
return (x1 - x0) * (y1 - y0)
|
|
}
|
|
|
|
// rectOverlapInter computes the area shared by rectangle a
// (x0a, y0a)-(x1a, y1a) and rectangle b (x0b, y0b)-(x1b, y1b).
// The result is 0 when the rectangles do not overlap, merely touch, or
// either one is degenerate.
func rectOverlapInter(x0a, y0a, x1a, y1a, x0b, y0b, x1b, y1b float64) float64 {
	// Horizontal extent of the intersection.
	left, right := max(x0a, x0b), min(x1a, x1b)
	if right <= left {
		return 0
	}
	// Vertical extent of the intersection.
	top, bottom := max(y0a, y0b), min(y1a, y1b)
	if bottom <= top {
		return 0
	}
	return (right - left) * (bottom - top)
}
|
|
|
|
// OverlapInter returns the raw intersection area of two rectangles.
|
|
func OverlapInter(a, b Rectangular) float64 {
|
|
ax0, ay0, ax1, ay1 := a.Bounds()
|
|
bx0, by0, bx1, by1 := b.Bounds()
|
|
return rectOverlapInter(ax0, ay0, ax1, ay1, bx0, by0, bx1, by1)
|
|
}
|
|
|
|
// OverlapRatio returns intersection(a,b) / Area(denom).
|
|
// Returns 0 when denom has zero area or there is no intersection.
|
|
func OverlapRatio(a, b, denom Rectangular) float64 {
|
|
inter := OverlapInter(a, b)
|
|
if inter <= 0 {
|
|
return 0
|
|
}
|
|
d := Area(denom)
|
|
if d <= 0 {
|
|
return 0
|
|
}
|
|
return inter / d
|
|
}
|
|
|
|
// OverlapRatioA returns intersection(a,b) / Area(a).
//
// It delegates to OverlapRatio with a as the denominator rectangle, so the
// result is 0 when a and b do not intersect or when a has zero area.
func OverlapRatioA(a, b Rectangular) float64 {
	return OverlapRatio(a, b, a)
}
|
|
|
|
// OverlapRatioMax returns intersection(a,b) / max(Area(a), Area(b)).
|
|
func OverlapRatioMax(a, b Rectangular) float64 {
|
|
inter := OverlapInter(a, b)
|
|
if inter <= 0 {
|
|
return 0
|
|
}
|
|
d := max(Area(a), Area(b))
|
|
if d <= 0 {
|
|
return 0
|
|
}
|
|
return inter / d
|
|
}
|
|
|
|
// OverlapX returns the horizontal (X-axis only) overlap ratio between two rectangles.
|
|
// Ratio = overlap_width / max(1, min(width(a), width(b))).
|
|
//
|
|
// Python: pdf_parser.py:964-965 overlap calculation in _naive_vertical_merge
|
|
func OverlapX(a, b Rectangular) float64 {
|
|
ax0, _, ax1, _ := a.Bounds()
|
|
bx0, _, bx1, _ := b.Bounds()
|
|
overlap := math.Max(0, math.Min(ax1, bx1)-math.Max(ax0, bx0))
|
|
wA := ax1 - ax0
|
|
wB := bx1 - bx0
|
|
minWidth := math.Max(1, math.Min(wA, wB))
|
|
return overlap / minWidth
|
|
}
|
|
|
|
// SortXByPage sorts boxes by page_number, then x0, then top.
|
|
// After sorting, corrects for same-page boxes that have nearly the same x0
|
|
// but inverted top ordering (a layout artifact).
|
|
//
|
|
// Python: pdf_parser.py:178 sort_X_by_page()
|
|
func SortXByPage(boxes []TextBox, threshold float64) []TextBox {
|
|
sort.Slice(boxes, func(i, j int) bool {
|
|
if boxes[i].PageNumber != boxes[j].PageNumber {
|
|
return boxes[i].PageNumber < boxes[j].PageNumber
|
|
}
|
|
if boxes[i].X0 != boxes[j].X0 {
|
|
return boxes[i].X0 < boxes[j].X0
|
|
}
|
|
return boxes[i].Top < boxes[j].Top
|
|
})
|
|
|
|
for i := len(boxes) - 1; i >= 1; i-- {
|
|
for j := i - 1; j >= 0; j-- {
|
|
if math.Abs(boxes[j+1].X0-boxes[j].X0) < threshold &&
|
|
boxes[j+1].Top < boxes[j].Top &&
|
|
boxes[j+1].PageNumber == boxes[j].PageNumber {
|
|
boxes[j], boxes[j+1] = boxes[j+1], boxes[j]
|
|
}
|
|
}
|
|
}
|
|
return boxes
|
|
}
|
|
|
|
// MedianCharHeight computes the median character height for a page,
|
|
// matching Python's np.median(char height) in __images__ (pdf_parser.py:1552).
|
|
// Used as a reference unit for vertical spacing decisions.
|
|
func MedianCharHeight(chars []TextChar) float64 {
|
|
heights := make([]float64, len(chars))
|
|
for i, c := range chars {
|
|
heights[i] = CharHeight(c)
|
|
}
|
|
return medianFloat64(heights, 10)
|
|
}
|
|
|
|
// MedianCharWidth computes the median character width for a page,
|
|
// matching Python's np.median(char width) in __images__ (pdf_parser.py:1553).
|
|
func MedianCharWidth(chars []TextChar) float64 {
|
|
widths := make([]float64, len(chars))
|
|
for i, c := range chars {
|
|
widths[i] = CharWidth(c)
|
|
}
|
|
return medianFloat64(widths, 5)
|
|
}
|
|
|
|
// MedianHeight computes the median height of a set of text boxes.
|
|
// Falls back to 10 if list is empty.
|
|
//
|
|
// Python: np.median([b["bottom"]-b["top"] for b in bxs]) or 10
|
|
// in _naive_vertical_merge:941
|
|
func MedianHeight(boxes []TextBox) float64 {
|
|
heights := make([]float64, len(boxes))
|
|
for i, b := range boxes {
|
|
heights[i] = b.Bottom - b.Top
|
|
}
|
|
return medianFloat64(heights, 10)
|
|
}
|
|
|
|
// medianFloat64 computes the median of vals, returning fallback when vals is
// empty. For an even count the two middle values are averaged.
//
// vals is sorted in place, so the caller's slice is reordered.
func medianFloat64(vals []float64, fallback float64) float64 {
	count := len(vals)
	if count == 0 {
		return fallback
	}
	sort.Float64s(vals)
	mid := count / 2
	if count%2 == 1 {
		return vals[mid]
	}
	return (vals[mid-1] + vals[mid]) / 2
}
|
|
|
|
// rect is a minimal axis-aligned rectangle used for overlap calculations.
// Its coordinates live in whatever space the caller uses (pixel or PDF
// points).
type rect struct {
	x0, y0 float64
	x1, y1 float64
}

// Bounds returns the rectangle's coordinates in the order (x0, y0, x1, y1).
func (r rect) Bounds() (float64, float64, float64, float64) {
	return r.x0, r.y0, r.x1, r.y1
}
|
|
|
|
// rectOverlap returns the overlap ratio between two rects.
// Ratio = area(intersection) / max(area(a), area(b)).
// Returns 0 when there is no overlap.
//
// It delegates to OverlapRatioMax, passing the rect values as Rectangular.
func rectOverlap(a, b rect) float64 {
	return OverlapRatioMax(a, b)
}
|
|
|
|
// fastCrop copies the region (x0, y0)-(x1, y1) of src into a freshly
// allocated *image.RGBA whose bounds start at (0, 0).
//
// The requested region is first clamped to src.Bounds(). If nothing remains
// (the request lies outside the image or its corners are inverted), a blank
// 1x1 image is returned instead.
//
// *image.RGBA sources are copied one row at a time straight from the Pix
// buffer; every other image type is copied pixel by pixel through At/Set.
func fastCrop(src image.Image, x0, y0, x1, y1 int) *image.RGBA {
	// Built as a literal on purpose: image.Rect would swap inverted corners,
	// whereas an inverted request must stay empty.
	region := image.Rectangle{
		Min: image.Point{X: x0, Y: y0},
		Max: image.Point{X: x1, Y: y1},
	}.Intersect(src.Bounds())
	if region.Empty() {
		return image.NewRGBA(image.Rect(0, 0, 1, 1))
	}

	width, height := region.Dx(), region.Dy()
	dst := image.NewRGBA(image.Rect(0, 0, width, height))

	switch s := src.(type) {
	case *image.RGBA:
		// 4 bytes (R, G, B, A) per pixel.
		rowBytes := 4 * width
		for row := 0; row < height; row++ {
			from := s.PixOffset(region.Min.X, region.Min.Y+row)
			to := dst.PixOffset(0, row)
			copy(dst.Pix[to:to+rowBytes], s.Pix[from:from+rowBytes])
		}
	default:
		for row := 0; row < height; row++ {
			for col := 0; col < width; col++ {
				dst.Set(col, row, src.At(region.Min.X+col, region.Min.Y+row))
			}
		}
	}
	return dst
}
|