mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-05 10:58:34 +08:00
2
.gitignore
vendored
2
.gitignore
vendored
@@ -245,3 +245,5 @@ bin/*
|
||||
# Parser test fixtures and python tools
|
||||
internal/deepdoc/parser/pdf/testdata/
|
||||
internal/deepdoc/parser/pdf/tools-py/
|
||||
internal/deepdoc/parser/docx/testdata/
|
||||
internal/deepdoc/parser/docx/tool/
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -51,12 +51,12 @@ func TestBatchResults(t *testing.T) {
|
||||
}
|
||||
pdfs := all[:min(count, len(all))]
|
||||
|
||||
ddClient, err := inf.NewInferenceClient(os.Getenv("DEEPDOC_URL"))
|
||||
ddClient, err := inf.NewClient(os.Getenv("DEEPDOC_URL"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !ddClient.Health() {
|
||||
t.Fatalf("DeepDoc service not available at %s (DLA+TSR required)", ddClient.baseURL)
|
||||
t.Fatalf("DeepDoc service not available at %s (DLA+TSR required)", ddClient.BaseURL())
|
||||
}
|
||||
deepDoc := pdf.DocAnalyzer(ddClient)
|
||||
|
||||
@@ -238,9 +238,9 @@ func parseOne(pdfDir, name string, deepDoc pdf.DocAnalyzer, skipOCR bool) (*pars
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
cfg.SkipOCR = skipOCR
|
||||
p := NewParser(cfg, deepDoc)
|
||||
p := NewParser(cfg)
|
||||
t0 := time.Now()
|
||||
parsed, err := p.Parse(context.Background(), eng)
|
||||
parsed, err := p.ParseRaw(context.Background(), eng, deepDoc)
|
||||
elapsed := time.Since(t0).Seconds()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse: %w", err)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"ragflow/internal/deepdoc/parser/pdf/tools"
|
||||
"ragflow/internal/deepdoc/parser/pdf/tool"
|
||||
)
|
||||
|
||||
// TestBatchCompareWithPython compares Go output against Python reference
|
||||
@@ -37,29 +37,29 @@ func TestBatchCompareWithPython(t *testing.T) {
|
||||
pyTextDir := filepath.Join("testdata", "output", "py", pyVariant, "text")
|
||||
|
||||
// Read Go text files' #@meta (no aggregate JSON dependency).
|
||||
goResults, err := tools.ReadGoTextMeta(goTextDir)
|
||||
goResults, err := tool.ReadGoTextMeta(goTextDir)
|
||||
if err != nil || len(goResults) == 0 {
|
||||
t.Fatalf("No Go text files in %s: %v", goTextDir, err)
|
||||
}
|
||||
|
||||
// Read Python text files' #@meta
|
||||
pyResults, err := tools.ReadPythonTextMeta(pyTextDir)
|
||||
pyResults, err := tool.ReadPythonTextMeta(pyTextDir)
|
||||
if err != nil || len(pyResults) == 0 {
|
||||
t.Fatalf("No Python text files in %s: %v", pyTextDir, err)
|
||||
}
|
||||
|
||||
t.Logf("Comparing %d Go × %d Python", len(goResults), len(pyResults))
|
||||
tools.CompareWithPython(t, goResults, pyResults, goTextDir, pyTextDir)
|
||||
tool.CompareWithPython(t, goResults, pyResults, goTextDir, pyTextDir)
|
||||
|
||||
// Compare tables.
|
||||
goTablesDir := filepath.Join("testdata", "output", "go", goVariant, "tables")
|
||||
pyTablesDir2 := filepath.Join("testdata", "output", "py", pyVariant, "tables")
|
||||
tools.CompareTablesWithPython(t, goTablesDir, pyTablesDir2)
|
||||
tool.CompareTablesWithPython(t, goTablesDir, pyTablesDir2)
|
||||
// Compare DLA + TSR raw intermediates.
|
||||
goDLADir := filepath.Join("testdata", "output", "go", goVariant, "dla")
|
||||
pyDLADir := filepath.Join("testdata", "output", "py", pyVariant, "dla")
|
||||
tools.CompareDLAWithPython(t, goDLADir, pyDLADir)
|
||||
tool.CompareDLAWithPython(t, goDLADir, pyDLADir)
|
||||
goTSRRawDir := filepath.Join("testdata", "output", "go", goVariant, "tsr_raw")
|
||||
pyTSRRawDir := filepath.Join("testdata", "output", "py", pyVariant, "tsr_raw")
|
||||
tools.CompareTSRRawWithPython(t, goTSRRawDir, pyTSRRawDir)
|
||||
tool.CompareTSRRawWithPython(t, goTSRRawDir, pyTSRRawDir)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
@@ -27,8 +27,8 @@ func TestParse_CropSectionImages(t *testing.T) {
|
||||
defer eng.Close()
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true})
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -79,8 +79,8 @@ func TestCrop_Regression_SnapshotPDFs(t *testing.T) {
|
||||
}
|
||||
defer eng.Close()
|
||||
|
||||
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true})
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && integration
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -46,7 +46,7 @@ func TestDLARealWorldCompare(t *testing.T) {
|
||||
for _, pg := range pdf.pages {
|
||||
testName := pdf.name + "/page" + string(rune('0'+pg))
|
||||
t.Run(testName, func(t *testing.T) {
|
||||
pageImg, err := renderPageToImage(eng, pg)
|
||||
pageImg, err := RenderPageToImage(eng, pg)
|
||||
if err != nil {
|
||||
t.Fatalf("render page %d: %v", pg, err)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && integration
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -28,7 +28,7 @@ func TestDLATSRResponseCompare(t *testing.T) {
|
||||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||||
defer eng.Close()
|
||||
|
||||
pageImg, err := renderPageToImage(eng, 0)
|
||||
pageImg, err := RenderPageToImage(eng, 0)
|
||||
if err != nil {
|
||||
t.Fatalf("render: %v", err)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"os"
|
||||
@@ -11,20 +11,14 @@ import (
|
||||
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
||||
)
|
||||
|
||||
// ── Shared CGO test helpers ──────────────────────────────────────────────────
|
||||
// These helpers were previously duplicated across multiple test files with
|
||||
// different build tags (integration, manual). Consolidating them into one file
|
||||
// with the //go:build cgo tag makes them available to all cgo-tagged tests.
|
||||
|
||||
// mustConnectInferenceClient returns a InferenceClient pointed at the OSS service;
|
||||
// skips the test if the service reports a non-OSS model type.
|
||||
func mustConnectInferenceClient(t *testing.T) *inf.InferenceClient {
|
||||
// mustConnectInferenceClient returns an inf.Client for the OSS DeepDoc service.
|
||||
func mustConnectInferenceClient(t *testing.T) *inf.Client {
|
||||
t.Helper()
|
||||
url := os.Getenv("OSSDEEPDOC_URL")
|
||||
if url == "" {
|
||||
url = "http://localhost:9390"
|
||||
}
|
||||
client, err := inf.NewInferenceClient(url)
|
||||
client, err := inf.NewClient(url)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -48,3 +42,12 @@ func mustOpenEngine(t *testing.T, name string) pdf.PDFEngine {
|
||||
}
|
||||
return eng
|
||||
}
|
||||
|
||||
// mustReadPDF loads the raw bytes of the fixture stored at
// testdata/pdfs/<name>. If the file cannot be read, the calling test is
// failed immediately via t.Fatalf.
func mustReadPDF(t *testing.T, name string) []byte {
	t.Helper()

	fixturePath := filepath.Join("testdata", "pdfs", name)
	raw, readErr := os.ReadFile(fixturePath)
	if readErr == nil {
		return raw
	}
	// Fatalf stops the test goroutine, so the return below is never reached;
	// it only satisfies the compiler.
	t.Fatalf("read fixture %s: %v", name, readErr)
	return nil
}
|
||||
@@ -21,8 +21,8 @@ import (
|
||||
"github.com/cenkalti/backoff/v5"
|
||||
)
|
||||
|
||||
// InferenceClient wraps the DeepDoc HTTP API.
|
||||
type InferenceClient struct {
|
||||
// Client wraps the DeepDoc HTTP API.
|
||||
type Client struct {
|
||||
baseURL string
|
||||
httpClient *http.Client
|
||||
|
||||
@@ -33,24 +33,27 @@ type InferenceClient struct {
|
||||
}
|
||||
|
||||
// BaseURL returns the configured DeepDoc service URL.
|
||||
func (c *InferenceClient) BaseURL() string { return c.baseURL }
|
||||
func (c *Client) BaseURL() string { return c.baseURL }
|
||||
|
||||
// NewInferenceClient creates a client. baseURL must be provided by the caller
|
||||
// NewClient creates a client. baseURL must be provided by the caller
|
||||
// (e.g. from the DEEPDOC_URL environment variable). Returns an error if empty.
|
||||
func NewInferenceClient(baseURL string) (*InferenceClient, error) {
|
||||
func NewClient(baseURL string) (*Client, error) {
|
||||
if baseURL == "" {
|
||||
return nil, fmt.Errorf("deepdoc client: baseURL is required (set DEEPDOC_URL)")
|
||||
}
|
||||
return &InferenceClient{
|
||||
return &Client{
|
||||
baseURL: baseURL,
|
||||
httpClient: &http.Client{
|
||||
Timeout: 120 * time.Second,
|
||||
},
|
||||
DLALabels: DefaultDLALabels(),
|
||||
TSRLabels: DefaultTSRLabels(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Default DLA/TSR label tables used as fallback when no model-specific
|
||||
// labels are injected by a TableBuilder constructor.
|
||||
// DefaultDLALabels returns the 10-class DLA taxonomy matching Python's
|
||||
// deepdoc/vision/dla_cli.py:10-21. Duplicates at indices 4, 7, 9 are
|
||||
// kept verbatim for backward compatibility with existing inference servers.
|
||||
func DefaultDLALabels() []string {
|
||||
return []string{
|
||||
pdf.LayoutTypeTitle, pdf.LayoutTypeText, pdf.LayoutTypeReference,
|
||||
@@ -59,6 +62,9 @@ func DefaultDLALabels() []string {
|
||||
pdf.LayoutTypeEquation, pdf.DLALabelFigureCaption,
|
||||
}
|
||||
}
|
||||
|
||||
// DefaultTSRLabels returns the 6-class TSR taxonomy matching Python's
|
||||
// deepdoc/server/adapters/tsr_adapter.py:21-26.
|
||||
func DefaultTSRLabels() []string {
|
||||
return []string{
|
||||
"table", "table column", "table row",
|
||||
@@ -72,7 +78,7 @@ type bboxesResponse struct {
|
||||
}
|
||||
|
||||
// DLA analyzes a full page image and returns labeled regions.
|
||||
func (c *InferenceClient) DLA(ctx context.Context, pageImage image.Image) ([]pdf.DLARegion, error) {
|
||||
func (c *Client) DLA(ctx context.Context, pageImage image.Image) ([]pdf.DLARegion, error) {
|
||||
data, err := util.EncodeJPEG(pageImage)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("dla: encode: %w", err)
|
||||
@@ -87,9 +93,6 @@ func (c *InferenceClient) DLA(ctx context.Context, pageImage image.Image) ([]pdf
|
||||
continue
|
||||
}
|
||||
labels := c.DLALabels
|
||||
if labels == nil {
|
||||
labels = DefaultDLALabels()
|
||||
}
|
||||
label := ""
|
||||
if clsID := int(b[5]); clsID >= 0 && clsID < len(labels) {
|
||||
label = labels[clsID]
|
||||
@@ -104,7 +107,7 @@ func (c *InferenceClient) DLA(ctx context.Context, pageImage image.Image) ([]pdf
|
||||
}
|
||||
|
||||
// TSR recognises table structure from a cropped image.
|
||||
func (c *InferenceClient) TSR(ctx context.Context, cropped image.Image) ([]pdf.TSRCell, error) {
|
||||
func (c *Client) TSR(ctx context.Context, cropped image.Image) ([]pdf.TSRCell, error) {
|
||||
data, err := util.EncodeJPEG(cropped)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("tsr: encode: %w", err)
|
||||
@@ -119,9 +122,6 @@ func (c *InferenceClient) TSR(ctx context.Context, cropped image.Image) ([]pdf.T
|
||||
continue
|
||||
}
|
||||
tlabels := c.TSRLabels
|
||||
if tlabels == nil {
|
||||
tlabels = DefaultTSRLabels()
|
||||
}
|
||||
label := ""
|
||||
if len(b) >= 6 {
|
||||
if cls := int(b[5]); cls >= 0 && cls < len(tlabels) {
|
||||
@@ -152,7 +152,7 @@ type ocrRecognizeResponse struct {
|
||||
|
||||
// OCRDetect detects text regions (bounding boxes) in an image.
|
||||
// DeepDoc /predict/ocr with operator=det returns quad boxes: [[[x0,y0],[x1,y1],[x2,y2],[x3,y3]], ...]
|
||||
func (c *InferenceClient) OCRDetect(ctx context.Context, cropped image.Image) ([]pdf.OCRBox, error) {
|
||||
func (c *Client) OCRDetect(ctx context.Context, cropped image.Image) ([]pdf.OCRBox, error) {
|
||||
data, err := util.EncodeJPEG(cropped)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("ocr detect: encode: %w", err)
|
||||
@@ -197,7 +197,7 @@ func (c *InferenceClient) OCRDetect(ctx context.Context, cropped image.Image) ([
|
||||
|
||||
// OCRRecognize recognizes text in a cropped image region.
|
||||
// DeepDoc /predict/ocr with operator=rec returns [[["text", confidence], ...]]
|
||||
func (c *InferenceClient) OCRRecognize(ctx context.Context, cropped image.Image) ([]pdf.OCRText, error) {
|
||||
func (c *Client) OCRRecognize(ctx context.Context, cropped image.Image) ([]pdf.OCRText, error) {
|
||||
data, err := util.EncodeJPEG(cropped)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("ocr rec: encode: %w", err)
|
||||
@@ -224,7 +224,7 @@ func (c *InferenceClient) OCRRecognize(ctx context.Context, cropped image.Image)
|
||||
// OCRRecognizeBatch recognizes text in multiple cropped image regions.
|
||||
// Returns a slice of results and a parallel slice of errors (nil on success).
|
||||
// A nil cropped image in the input produces nil results and a non-nil error.
|
||||
func (c *InferenceClient) OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]pdf.OCRText, []error) {
|
||||
func (c *Client) OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]pdf.OCRText, []error) {
|
||||
results := make([][]pdf.OCRText, len(cropped))
|
||||
errs := make([]error, len(cropped))
|
||||
|
||||
@@ -255,7 +255,7 @@ func (c *InferenceClient) OCRRecognizeBatch(ctx context.Context, cropped []image
|
||||
}
|
||||
|
||||
// Health checks whether the DeepDoc service is reachable.
|
||||
func (c *InferenceClient) Health() bool {
|
||||
func (c *Client) Health() bool {
|
||||
resp, err := c.httpClient.Get(c.baseURL + "/health")
|
||||
if err != nil {
|
||||
return false
|
||||
@@ -264,7 +264,7 @@ func (c *InferenceClient) Health() bool {
|
||||
return resp.StatusCode == 200
|
||||
}
|
||||
|
||||
func (c *InferenceClient) post(ctx context.Context, endpoint string, imgData []byte, filename string, result interface{}, extraFields ...string) error {
|
||||
func (c *Client) post(ctx context.Context, endpoint string, imgData []byte, filename string, result interface{}, extraFields ...string) error {
|
||||
// Build multipart body once — the image data is idempotent.
|
||||
var body bytes.Buffer
|
||||
w := multipart.NewWriter(&body)
|
||||
|
||||
@@ -11,11 +11,11 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// mustNewDeepDocClient wraps NewInferenceClient for test convenience.
|
||||
// mustNewDeepDocClient wraps NewClient for test convenience.
|
||||
// Fails the test if the URL is empty.
|
||||
func mustNewDeepDocClient(t *testing.T, baseURL string) *InferenceClient {
|
||||
func mustNewDeepDocClient(t *testing.T, baseURL string) *Client {
|
||||
t.Helper()
|
||||
client, err := NewInferenceClient(baseURL)
|
||||
client, err := NewClient(baseURL)
|
||||
if err != nil {
|
||||
t.Fatalf("NewDeepDocClient(%q): %v", baseURL, err)
|
||||
}
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
//go:build cgo && integration
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
tbl "ragflow/internal/deepdoc/parser/pdf/table"
|
||||
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
||||
)
|
||||
|
||||
@@ -15,13 +14,11 @@ import (
|
||||
// through the OSS TableBuilder produces tables with the expected row/column structure.
|
||||
func TestIntegration_DeepDoc_TableStructure(t *testing.T) {
|
||||
client := mustConnectInferenceClient(t)
|
||||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "06_table_content.pdf")
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client)
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -29,7 +26,7 @@ func TestIntegration_DeepDoc_TableStructure(t *testing.T) {
|
||||
t.Skip("DLA did not detect any tables in fixture")
|
||||
}
|
||||
|
||||
t.Logf("OssDeepDoc produced %d tables", len(result.Tables))
|
||||
t.Logf("DeepDoc produced %d tables", len(result.Tables))
|
||||
for i, tbl := range result.Tables {
|
||||
t.Logf("table[%d]: %d rows", i, len(tbl.Rows))
|
||||
for ri, row := range tbl.Rows {
|
||||
@@ -51,13 +48,11 @@ func TestIntegration_DeepDoc_TableStructure(t *testing.T) {
|
||||
// rows with the expected grid structure.
|
||||
func TestIntegration_DeepDoc_TableRows(t *testing.T) {
|
||||
client := mustConnectInferenceClient(t)
|
||||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "06_table_content.pdf")
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client)
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -92,13 +87,11 @@ func TestIntegration_DeepDoc_Idempotency(t *testing.T) {
|
||||
client := mustConnectInferenceClient(t)
|
||||
|
||||
parseOnce := func() *pdf.ParseResult {
|
||||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "06_table_content.pdf")
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client)
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -124,13 +117,11 @@ func TestIntegration_DeepDoc_Idempotency(t *testing.T) {
|
||||
// does not crash.
|
||||
func TestIntegration_DeepDoc_EmptyPage(t *testing.T) {
|
||||
client := mustConnectInferenceClient(t)
|
||||
eng := mustOpenEngine(t, "01_english_simple.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "01_english_simple.pdf")
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client)
|
||||
p := NewParser(cfg, client)
|
||||
_, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
_, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
41
internal/deepdoc/parser/pdf/mock_engine.go
Normal file
41
internal/deepdoc/parser/pdf/mock_engine.go
Normal file
@@ -0,0 +1,41 @@
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"image"
|
||||
|
||||
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
||||
)
|
||||
|
||||
// MockEngine is a minimal pdf.PDFEngine stub for unit/integration tests.
|
||||
type MockEngine struct {
|
||||
Chars map[int][]pdf.TextChar
|
||||
NumPages int
|
||||
RenderW int
|
||||
RenderH int
|
||||
}
|
||||
|
||||
func (m *MockEngine) ExtractChars(pg int) ([]pdf.TextChar, error) {
|
||||
return m.Chars[pg], nil
|
||||
}
|
||||
func (m *MockEngine) RenderPage(pg int, dpi float64) ([]byte, error) {
|
||||
return nil, ErrNoPDFData
|
||||
}
|
||||
func (m *MockEngine) RenderPageImage(pg int, dpi float64) (image.Image, error) {
|
||||
w, h := m.RenderW, m.RenderH
|
||||
if w <= 0 {
|
||||
w = 100
|
||||
}
|
||||
if h <= 0 {
|
||||
h = 100
|
||||
}
|
||||
return image.NewRGBA(image.Rect(0, 0, w, h)), nil
|
||||
}
|
||||
func (m *MockEngine) PageCount() (int, error) {
|
||||
if m.NumPages <= 0 {
|
||||
return 1, nil
|
||||
}
|
||||
return m.NumPages, nil
|
||||
}
|
||||
func (m *MockEngine) RawData() []byte { return nil }
|
||||
func (m *MockEngine) Close() error { return nil }
|
||||
func (m *MockEngine) Outlines() ([]pdf.Outline, error) { return nil, nil }
|
||||
@@ -1,11 +1,13 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
"image/png"
|
||||
"os"
|
||||
inf "ragflow/internal/deepdoc/parser/pdf/inference"
|
||||
util "ragflow/internal/deepdoc/parser/pdf/util"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
@@ -19,7 +21,7 @@ func TestOCR_mergeChars_RealScanned(t *testing.T) {
|
||||
if url == "" {
|
||||
t.Skip("DEEPDOC_URL not set")
|
||||
}
|
||||
dd, err := inf.NewInferenceClient(url)
|
||||
dd, err := inf.NewClient(url)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -41,7 +43,7 @@ func TestOCR_mergeChars_RealScanned(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Logf("pdf_oxide chars: %d", len(chars))
|
||||
t.Logf("pdf_oxide Chars: %d", len(chars))
|
||||
|
||||
var sample strings.Builder
|
||||
for i, c := range chars {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -10,10 +10,10 @@ import (
|
||||
|
||||
// ── outline-tracking mock engines ──────────────────────────────────────────
|
||||
|
||||
// outlineTrackingEngine wraps mockEngine and records whether Outlines()
|
||||
// outlineTrackingEngine wraps MockEngine and records whether Outlines()
|
||||
// was called.
|
||||
type outlineTrackingEngine struct {
|
||||
*mockEngine
|
||||
*MockEngine
|
||||
outlines []pdf.Outline
|
||||
outlinesCalled bool
|
||||
}
|
||||
@@ -25,7 +25,7 @@ func (e *outlineTrackingEngine) Outlines() ([]pdf.Outline, error) {
|
||||
|
||||
// outlineErrorEngine returns an error from Outlines().
|
||||
type outlineErrorEngine struct {
|
||||
*mockEngine
|
||||
*MockEngine
|
||||
}
|
||||
|
||||
func (e *outlineErrorEngine) Outlines() ([]pdf.Outline, error) {
|
||||
@@ -46,13 +46,13 @@ func TestParse_ExtractsOutlinesFromEngine(t *testing.T) {
|
||||
{Title: "Section 1.1", Level: 1, PageNumber: 2},
|
||||
}
|
||||
eng := &outlineTrackingEngine{
|
||||
mockEngine: &mockEngine{pageCount: 3},
|
||||
MockEngine: &MockEngine{NumPages: 3},
|
||||
outlines: expectedOutlines,
|
||||
}
|
||||
mockDLA := &MockDocAnalyzer{Healthy: true}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mockDLA)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mockDLA)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse failed: %v", err)
|
||||
}
|
||||
@@ -79,18 +79,18 @@ func TestParse_ExtractsOutlinesFromEngine(t *testing.T) {
|
||||
// and produces sections (outlines are best-effort).
|
||||
func TestParse_OutlinesErrorDoesNotBlockParsing(t *testing.T) {
|
||||
eng := &outlineErrorEngine{
|
||||
mockEngine: &mockEngine{
|
||||
pageCount: 2,
|
||||
chars: map[int][]pdf.TextChar{
|
||||
MockEngine: &MockEngine{
|
||||
NumPages: 2,
|
||||
Chars: map[int][]pdf.TextChar{
|
||||
0: {{Text: "Hello world", X0: 100, X1: 200, Top: 100, Bottom: 120}},
|
||||
1: {{Text: "Page two", X0: 100, X1: 200, Top: 100, Bottom: 120}},
|
||||
},
|
||||
},
|
||||
}
|
||||
mockDLA := &MockDocAnalyzer{Healthy: true}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mockDLA)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mockDLA)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse should not fail when Outlines() errors: %v", err)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -31,8 +31,8 @@ func TestParse_BatchEquivalence(t *testing.T) {
|
||||
defer eng.Close()
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
cfg.BatchSize = batchSize
|
||||
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mockDLA)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
22
internal/deepdoc/parser/pdf/parse_cgo.go
Normal file
22
internal/deepdoc/parser/pdf/parse_cgo.go
Normal file
@@ -0,0 +1,22 @@
|
||||
//go:build cgo
|
||||
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
||||
)
|
||||
|
||||
// Parse runs the full PDF extraction pipeline from raw bytes.
|
||||
// Creates and manages the PDF engine lifecycle internally.
|
||||
func (p *Parser) Parse(ctx context.Context, data []byte, docAnalyzer pdf.DocAnalyzer) (*pdf.ParseResult, error) {
|
||||
engine, err := NewEngine(data)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("pdfoxide.NewEngine: %w", err)
|
||||
}
|
||||
defer engine.Close()
|
||||
|
||||
return p.ParseRaw(ctx, engine, docAnalyzer)
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -8,52 +8,36 @@ import (
|
||||
"log/slog"
|
||||
"sync"
|
||||
|
||||
inf "ragflow/internal/deepdoc/parser/pdf/inference"
|
||||
lyt "ragflow/internal/deepdoc/parser/pdf/layout"
|
||||
tbl "ragflow/internal/deepdoc/parser/pdf/table"
|
||||
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
||||
util "ragflow/internal/deepdoc/parser/pdf/util"
|
||||
)
|
||||
|
||||
// Parser is the main PDF text/layout extraction pipeline.
|
||||
// Parser is the core PDF text/layout extraction pipeline.
|
||||
// It corresponds to RAGFlowPdfParser in pdf_parser.py.
|
||||
// Parser is stateless after construction — safe to reuse across documents.
|
||||
// Stateless after construction — safe to reuse across documents.
|
||||
type Parser struct {
|
||||
Config pdf.ParserConfig
|
||||
|
||||
// DeepDoc is the required document layout / OCR / table recognition
|
||||
// service. Set at construction time by NewParser.
|
||||
DeepDoc pdf.DocAnalyzer
|
||||
|
||||
// SampleChars samples up to n chars from a page for English detection.
|
||||
// Defaults to random sampling (matching Python's random.choices).
|
||||
// Inject a deterministic sampler for reproducible tests.
|
||||
SampleChars pdf.SampleFunc
|
||||
|
||||
// tableBuilder is the TSR model adapter. Set at construction time
|
||||
//
|
||||
// different implementation via Config.TableBuilder.
|
||||
tableBuilder pdf.TableBuilder
|
||||
}
|
||||
|
||||
// NewParser creates a new Parser with the required DeepDoc service.
|
||||
func NewParser(cfg pdf.ParserConfig, doc pdf.DocAnalyzer) *Parser {
|
||||
tb := cfg.TableBuilder
|
||||
if tb == nil {
|
||||
tb = NewTableBuilderFor(doc)
|
||||
}
|
||||
return &Parser{
|
||||
Config: cfg,
|
||||
DeepDoc: doc,
|
||||
tableBuilder: tb,
|
||||
}
|
||||
// pageResult holds per-page output from extractPages.
|
||||
type pageResult struct {
|
||||
pg int
|
||||
ocrBoxes []pdf.TextBox
|
||||
chars []pdf.TextChar
|
||||
ocrUsed bool
|
||||
pageImg image.Image
|
||||
err error
|
||||
}
|
||||
|
||||
// New creates a new Parser with the given config.
|
||||
func NewParser(cfg pdf.ParserConfig) *Parser {
|
||||
return &Parser{Config: cfg}
|
||||
}
|
||||
|
||||
// ── TableBuilder factory ───────────────────────────────────────────────────
|
||||
|
||||
// tableBuilderFactory holds a model-specific TableBuilder factory registered
|
||||
// by EE packages via RegisterTableBuilder. If nil, the default OSS
|
||||
// implementation is used.
|
||||
var tableBuilderFactory func(pdf.DocAnalyzer) pdf.TableBuilder
|
||||
|
||||
// RegisterTableBuilder registers a TableBuilder factory for the PDF parser.
|
||||
@@ -62,30 +46,20 @@ func RegisterTableBuilder(factory func(pdf.DocAnalyzer) pdf.TableBuilder) {
|
||||
tableBuilderFactory = factory
|
||||
}
|
||||
|
||||
// NewTableBuilderFor creates the right TableBuilder, chosen by the registry.
|
||||
// Checks the registry first for EE-registered implementations, falling back
|
||||
// to the default OSS DeepDocTableBuilder. Label taxonomies are injected
|
||||
// before construction.
|
||||
func NewTableBuilderFor(doc pdf.DocAnalyzer) pdf.TableBuilder {
|
||||
if tableBuilderFactory != nil {
|
||||
return tableBuilderFactory(doc)
|
||||
}
|
||||
if c, ok := doc.(*inf.InferenceClient); ok {
|
||||
c.DLALabels = inf.DefaultDLALabels()
|
||||
c.TSRLabels = inf.DefaultTSRLabels()
|
||||
}
|
||||
return tbl.NewDeepDocTableBuilder(doc)
|
||||
}
|
||||
|
||||
// Parse runs the full PDF extraction pipeline: chars → boxes →
|
||||
// column assignment → text merge → vertical merge → sections.
|
||||
//
|
||||
// For documents larger than Config.BatchSize pages, processes in batches
|
||||
// to bound memory usage (matching Python's batch_size=50).
|
||||
//
|
||||
// Returns a pdf.ParseResult containing sections, tables, page images, figures,
|
||||
// and pipeline stage metrics. Parser itself remains stateless.
|
||||
func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseResult, error) {
|
||||
// ── Public API ─────────────────────────────────────────────────────────────
|
||||
|
||||
// ParseRaw is the internal entry point: runs the core pipeline on an
|
||||
// already-opened engine. Exported for tests that inject mock engines.
|
||||
func (p *Parser) ParseRaw(ctx context.Context, engine pdf.PDFEngine, docAnalyzer pdf.DocAnalyzer) (*pdf.ParseResult, error) {
|
||||
tb := NewTableBuilderFor(docAnalyzer)
|
||||
|
||||
// Normalize page range
|
||||
pageCount, err := engine.PageCount()
|
||||
if err != nil {
|
||||
@@ -103,11 +77,10 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
|
||||
totalPages := toPage - fromPage + 1
|
||||
batchSize := p.Config.BatchSize
|
||||
if batchSize <= 0 {
|
||||
batchSize = 50 // default, matching Python's batch_size
|
||||
batchSize = 50
|
||||
}
|
||||
|
||||
// ── Prescan: lightweight char extraction for language/noise detection ──
|
||||
// No rendering, no OCR — just raw chars for global decisions.
|
||||
// ── Prescan ──
|
||||
prescanChars := make(map[int][]pdf.TextChar)
|
||||
prescanMedianH := make(map[int]float64)
|
||||
prescanMedianW := make(map[int]float64)
|
||||
@@ -115,26 +88,27 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
|
||||
chars, extractErr := engine.ExtractChars(pg)
|
||||
if extractErr != nil {
|
||||
slog.Warn("prescan: ExtractChars failed", "page", pg, "err", extractErr)
|
||||
chars = nil // skip broken pages (matching old behavior)
|
||||
chars = nil
|
||||
}
|
||||
prescanChars[pg] = chars
|
||||
prescanMedianH[pg] = util.MedianCharHeight(chars)
|
||||
prescanMedianW[pg] = util.MedianCharWidth(chars)
|
||||
}
|
||||
isEnglish := util.DetectEnglish(prescanChars, totalPages, p.SampleChars)
|
||||
isEnglish := util.DetectEnglish(prescanChars, totalPages, nil)
|
||||
scanNoise := util.IsScanNoise(util.FullTextFromChars(prescanChars))
|
||||
|
||||
// ── Extract PDF outlines/bookmarks (best-effort, non-fatal) ──
|
||||
// ── Outlines ──
|
||||
outlines, outlineErr := engine.Outlines()
|
||||
if outlineErr != nil {
|
||||
slog.Warn("Failed to extract PDF outlines; continuing without them", "err", outlineErr)
|
||||
outlines = nil
|
||||
}
|
||||
|
||||
// ── Small document: process all at once (no batching overhead) ──
|
||||
// ── Small document ──
|
||||
if totalPages <= batchSize {
|
||||
result, err := p.processPages(ctx, engine, fromPage, toPage,
|
||||
prescanChars, prescanMedianH, prescanMedianW, isEnglish, scanNoise)
|
||||
prescanChars, prescanMedianH, prescanMedianW, isEnglish, scanNoise,
|
||||
docAnalyzer, tb)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -142,7 +116,7 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// ── Large document: process in batches to bound memory ──
|
||||
// ── Large document: batched ──
|
||||
slog.Info("batched processing", "pages", totalPages, "batchSize", batchSize)
|
||||
result := &pdf.ParseResult{PageImages: make(map[int]image.Image)}
|
||||
for start := fromPage; start <= toPage; start += batchSize {
|
||||
@@ -151,7 +125,6 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
|
||||
}
|
||||
end := min(start+batchSize-1, toPage)
|
||||
|
||||
// Slice prescan data for this batch.
|
||||
batchChars := make(map[int][]pdf.TextChar, end-start+1)
|
||||
batchMH := make(map[int]float64, end-start+1)
|
||||
batchMW := make(map[int]float64, end-start+1)
|
||||
@@ -162,15 +135,14 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
|
||||
}
|
||||
|
||||
batch, err := p.processPages(ctx, engine, start, end,
|
||||
batchChars, batchMH, batchMW, isEnglish, scanNoise)
|
||||
batchChars, batchMH, batchMW, isEnglish, scanNoise,
|
||||
docAnalyzer, tb)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Merge batch results.
|
||||
result.Sections = append(result.Sections, batch.Sections...)
|
||||
result.Tables = append(result.Tables, batch.Tables...)
|
||||
// Figures() is computed on demand from Sections.
|
||||
for pg, img := range batch.PageImages {
|
||||
result.PageImages[pg] = img
|
||||
}
|
||||
@@ -184,33 +156,22 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// extractPages runs per-page OCR (detect + recognize) for the given page
|
||||
// range, returning text boxes, char data, whether any page used OCR, and
|
||||
// any errors encountered. Partial results are returned even when some
|
||||
// pages fail — callers should inspect the error for diagnostics but may
|
||||
// still use the returned boxes and chars.
|
||||
// ── Internal pipeline steps ────────────────────────────────────────────────
|
||||
|
||||
func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
|
||||
fromPage, toPage int,
|
||||
prescanChars map[int][]pdf.TextChar,
|
||||
medianHeights, medianWidths map[int]float64,
|
||||
pageImages map[int]image.Image,
|
||||
docAnalyzer pdf.DocAnalyzer,
|
||||
) ([]pdf.TextBox, map[int][]pdf.TextChar, bool, error) {
|
||||
var boxes []pdf.TextBox
|
||||
pageChars := make(map[int][]pdf.TextChar)
|
||||
ocrUsedAny := false
|
||||
|
||||
type pr struct {
|
||||
pg int
|
||||
ocrBoxes []pdf.TextBox
|
||||
chars []pdf.TextChar
|
||||
ocrUsed bool
|
||||
pageImg image.Image
|
||||
err error
|
||||
}
|
||||
pageCount := toPage - fromPage + 1
|
||||
results := make([]pr, pageCount)
|
||||
results := make([]pageResult, pageCount)
|
||||
|
||||
// Semaphore cap: 0 → sequential; >0 → bounded parallelism.
|
||||
cap := p.Config.MaxOCRConcurrency
|
||||
if cap <= 0 {
|
||||
cap = 1
|
||||
@@ -222,16 +183,15 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
|
||||
pg := fromPage + i
|
||||
chars := prescanChars[pg]
|
||||
|
||||
// Fast path: pages with embedded chars → sequential inline (no HTTP OCR).
|
||||
if len(chars) > 0 && !util.IsGarbledPage(chars) {
|
||||
pageImg, renderErr := renderPageToImage(engine, pg)
|
||||
pageImg, renderErr := RenderPageToImage(engine, pg)
|
||||
if renderErr == nil && pageImg != nil {
|
||||
pageImages[pg] = pageImg
|
||||
}
|
||||
var ocrBoxes []pdf.TextBox
|
||||
ocrUsed := false
|
||||
if !p.Config.SkipOCR && renderErr == nil && pageImg != nil {
|
||||
ocrBoxes = ocrMergeChars(ctx, pageImg, chars, p.DeepDoc, pg)
|
||||
ocrBoxes = ocrMergeChars(ctx, pageImg, chars, docAnalyzer, pg)
|
||||
if ocrBoxes == nil {
|
||||
ocrBoxes = lyt.CharsToBoxes(chars, pg, p.Config.SortByTop)
|
||||
} else {
|
||||
@@ -241,30 +201,28 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
|
||||
} else {
|
||||
ocrBoxes = lyt.CharsToBoxes(chars, pg, p.Config.SortByTop)
|
||||
}
|
||||
results[i] = pr{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed}
|
||||
results[i] = pageResult{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed}
|
||||
continue
|
||||
}
|
||||
|
||||
// OCR path: render + detect + recognize (potentially parallel).
|
||||
wg.Add(1)
|
||||
go func(i, pg int, chars []pdf.TextChar) {
|
||||
defer wg.Done()
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
results[i] = pr{pg: pg, err: ctx.Err()}
|
||||
results[i] = pageResult{pg: pg, err: ctx.Err()}
|
||||
return
|
||||
case sem <- struct{}{}:
|
||||
}
|
||||
defer func() { <-sem }()
|
||||
|
||||
pageImg, err := renderPageToImage(engine, pg)
|
||||
pageImg, err := RenderPageToImage(engine, pg)
|
||||
if err != nil {
|
||||
results[i] = pr{pg: pg, err: err}
|
||||
results[i] = pageResult{pg: pg, err: err}
|
||||
return
|
||||
}
|
||||
// Check if context was cancelled during render.
|
||||
if err := ctx.Err(); err != nil {
|
||||
results[i] = pr{pg: pg, err: err}
|
||||
results[i] = pageResult{pg: pg, err: err}
|
||||
return
|
||||
}
|
||||
|
||||
@@ -275,7 +233,7 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
|
||||
if len(chars) > 0 {
|
||||
label = "garbled page"
|
||||
}
|
||||
ocrBoxes = ocrDetectAndRecognize(ctx, pageImg, p.DeepDoc, pg, label)
|
||||
ocrBoxes = ocrDetectAndRecognize(ctx, pageImg, docAnalyzer, pg, label)
|
||||
if ocrBoxes != nil {
|
||||
for j := range ocrBoxes {
|
||||
for _, r := range ocrBoxes[j].Text {
|
||||
@@ -286,9 +244,8 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
|
||||
ocrUsed = true
|
||||
}
|
||||
}
|
||||
// Merged OCR path for pages with both embedded and OCR chars.
|
||||
if !ocrUsed && len(chars) > 0 && !p.Config.SkipOCR {
|
||||
ocrBoxes = ocrMergeChars(ctx, pageImg, chars, p.DeepDoc, pg)
|
||||
ocrBoxes = ocrMergeChars(ctx, pageImg, chars, docAnalyzer, pg)
|
||||
if ocrBoxes != nil {
|
||||
ocrUsed = true
|
||||
}
|
||||
@@ -298,15 +255,252 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
|
||||
ocrBoxes = lyt.CharsToBoxes(chars, pg, p.Config.SortByTop)
|
||||
}
|
||||
}
|
||||
results[i] = pr{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed, pageImg: pageImg}
|
||||
results[i] = pageResult{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed, pageImg: pageImg}
|
||||
}(i, pg, chars)
|
||||
}
|
||||
wg.Wait()
|
||||
return mergePageResults(results, boxes, pageImages, pageChars, ocrUsedAny, medianHeights, medianWidths)
|
||||
}
|
||||
|
||||
// Merge results in page order.
|
||||
func (p *Parser) retryScanNoise(ctx context.Context, engine pdf.PDFEngine,
|
||||
fromPage, toPage int,
|
||||
pageImages map[int]image.Image,
|
||||
pageChars map[int][]pdf.TextChar,
|
||||
medianHeights, medianWidths map[int]float64,
|
||||
ocrUsedAny bool,
|
||||
docAnalyzer pdf.DocAnalyzer,
|
||||
) ([]pdf.TextBox, map[int][]pdf.TextChar, bool) {
|
||||
slog.Warn("scan noise: OCR retry", "from", fromPage, "to", toPage)
|
||||
var boxes []pdf.TextBox
|
||||
for pg := fromPage; pg <= toPage; pg++ {
|
||||
img := pageImages[pg]
|
||||
if img == nil {
|
||||
var err error
|
||||
img, err = RenderPageToImage(engine, pg)
|
||||
if err != nil {
|
||||
slog.Warn("scan noise: page render failed", "page", pg, "err", err)
|
||||
continue
|
||||
}
|
||||
pageImages[pg] = img
|
||||
}
|
||||
ocrBoxes := ocrDetectAndRecognize(ctx, img, docAnalyzer, pg, "scan page")
|
||||
if ocrBoxes == nil {
|
||||
slog.Warn("scan noise: page OCR empty", "page", pg)
|
||||
continue
|
||||
}
|
||||
boxes = append(boxes, ocrBoxes...)
|
||||
var chars []pdf.TextChar
|
||||
for _, b := range ocrBoxes {
|
||||
for _, r := range b.Text {
|
||||
chars = append(chars, pdf.TextChar{Text: string(r), Top: b.Top, Bottom: b.Bottom, PageNumber: pg})
|
||||
break
|
||||
}
|
||||
}
|
||||
pageChars[pg] = chars
|
||||
medianHeights[pg] = util.MedianCharHeight(chars)
|
||||
medianWidths[pg] = util.MedianCharWidth(chars)
|
||||
}
|
||||
slog.Debug("scan noise OCR retry complete", "pages", toPage-fromPage+1, "boxes", len(boxes))
|
||||
return boxes, pageChars, true
|
||||
}
|
||||
|
||||
func (p *Parser) retryZoom(ctx context.Context, engine pdf.PDFEngine,
|
||||
fromPage, toPage int,
|
||||
pageImages map[int]image.Image,
|
||||
boxes []pdf.TextBox, ocrUsedAny bool,
|
||||
docAnalyzer pdf.DocAnalyzer,
|
||||
) ([]pdf.TextBox, bool) {
|
||||
retryZoomVal := p.Config.Zoom * pdf.DlaScale
|
||||
retryDPI := retryZoomVal * 72
|
||||
slog.Info("zoom retry: re-rendering", "oldZoom", p.Config.Zoom, "newZoom", retryZoomVal)
|
||||
for pg := fromPage; pg <= toPage; pg++ {
|
||||
img, err := engine.RenderPageImage(pg, retryDPI)
|
||||
if err != nil {
|
||||
slog.Warn("zoom retry: render failed", "page", pg, "err", err)
|
||||
continue
|
||||
}
|
||||
pageImages[pg] = img
|
||||
if retryDPI != pdf.DlaDPI {
|
||||
if dlaImg, dlaErr := engine.RenderPageImage(pg, pdf.DlaDPI); dlaErr == nil {
|
||||
pageImages[pg] = dlaImg
|
||||
}
|
||||
}
|
||||
ocrBoxes := ocrDetectAndRecognize(ctx, img, docAnalyzer, pg, "zoom retry")
|
||||
if ocrBoxes == nil {
|
||||
continue
|
||||
}
|
||||
scaleFactor := retryZoomVal / p.Config.Zoom
|
||||
for i := range ocrBoxes {
|
||||
ocrBoxes[i].X0 /= scaleFactor
|
||||
ocrBoxes[i].X1 /= scaleFactor
|
||||
ocrBoxes[i].Top /= scaleFactor
|
||||
ocrBoxes[i].Bottom /= scaleFactor
|
||||
}
|
||||
boxes = append(boxes, ocrBoxes...)
|
||||
ocrUsedAny = true
|
||||
}
|
||||
return boxes, ocrUsedAny
|
||||
}
|
||||
|
||||
func (p *Parser) buildLayout(ctx context.Context,
|
||||
result *pdf.ParseResult, engine pdf.PDFEngine,
|
||||
boxes []pdf.TextBox, pageChars map[int][]pdf.TextChar,
|
||||
medianHeights, medianWidths map[int]float64,
|
||||
fromPage, toPage int, ocrUsedAny bool, isEnglish bool,
|
||||
docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder,
|
||||
) error {
|
||||
result.Metrics.BoxesInitial = len(boxes)
|
||||
|
||||
result.Tables = p.enrichWithDeepDoc(ctx, result, engine, boxes, result.PageImages, docAnalyzer, tb)
|
||||
result.Metrics.TablesCount = len(result.Tables)
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
boxes = lyt.AssignColumn(boxes, p.Config.Zoom)
|
||||
boxes = lyt.TextMerge(boxes, medianHeights, p.Config.Zoom)
|
||||
result.Metrics.BoxesTextMerge = len(boxes)
|
||||
|
||||
lyt.SortByPageThenY(boxes, p.Config.SortByTop)
|
||||
|
||||
if ocrUsedAny {
|
||||
isEnglish = util.DetectEnglish(pageChars, toPage-fromPage+1, nil)
|
||||
}
|
||||
boxes = lyt.NaiveVerticalMerge(boxes, medianHeights, medianWidths, isEnglish)
|
||||
result.Metrics.BoxesVertMerge = len(boxes)
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
boxes = tbl.ExtractTableAndReplace(boxes, result.Tables)
|
||||
boxes = tbl.ConsolidateFigures(boxes)
|
||||
|
||||
pageHeights := make(map[int]float64, len(result.PageImages))
|
||||
for pg, img := range result.PageImages {
|
||||
pageHeights[pg] = float64(img.Bounds().Dy()) / p.Config.Zoom
|
||||
}
|
||||
result.Sections = lyt.BoxesToSections(boxes, pageHeights)
|
||||
result.Metrics.BoxesFinal = len(result.Sections)
|
||||
result.Sections = tbl.MergeCaptions(result.Sections, result.Figures())
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *Parser) processPages(ctx context.Context, engine pdf.PDFEngine,
|
||||
fromPage, toPage int,
|
||||
prescanChars map[int][]pdf.TextChar,
|
||||
medianHeights, medianWidths map[int]float64,
|
||||
isEnglish, isScanNoiseDoc bool,
|
||||
docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder,
|
||||
) (*pdf.ParseResult, error) {
|
||||
result := &pdf.ParseResult{PageImages: make(map[int]image.Image)}
|
||||
|
||||
boxes, pageChars, ocrUsedAny, ocrErr := p.extractPages(ctx, engine,
|
||||
fromPage, toPage, prescanChars,
|
||||
medianHeights, medianWidths, result.PageImages, docAnalyzer)
|
||||
if ocrErr != nil {
|
||||
slog.Warn("extractPages: some pages failed OCR", "err", ocrErr)
|
||||
}
|
||||
|
||||
if isScanNoiseDoc {
|
||||
boxes, pageChars, ocrUsedAny = p.retryScanNoise(ctx, engine,
|
||||
fromPage, toPage, result.PageImages,
|
||||
pageChars, medianHeights, medianWidths, ocrUsedAny, docAnalyzer)
|
||||
}
|
||||
|
||||
if len(boxes) == 0 && p.Config.Zoom < 9 && !p.Config.SkipOCR {
|
||||
boxes, ocrUsedAny = p.retryZoom(ctx, engine, fromPage, toPage,
|
||||
result.PageImages, boxes, ocrUsedAny, docAnalyzer)
|
||||
}
|
||||
|
||||
if len(boxes) == 0 {
|
||||
return result, nil
|
||||
}
|
||||
|
||||
if err := p.buildLayout(ctx, result, engine, boxes, pageChars,
|
||||
medianHeights, medianWidths, fromPage, toPage, ocrUsedAny, isEnglish,
|
||||
docAnalyzer, tb); err != nil {
|
||||
return nil, fmt.Errorf("buildLayout: %w", err)
|
||||
}
|
||||
p.fillSectionImages(result)
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (p *Parser) fillSectionImages(result *pdf.ParseResult) {
|
||||
if len(result.PageImages) == 0 {
|
||||
return
|
||||
}
|
||||
tableImgByRegion := make(map[string]string, len(result.Tables))
|
||||
for _, tbl := range result.Tables {
|
||||
if tbl.ImageB64 == "" {
|
||||
continue
|
||||
}
|
||||
pg := 0
|
||||
if len(tbl.Positions) > 0 && len(tbl.Positions[0].PageNumbers) > 0 {
|
||||
pg = tbl.Positions[0].PageNumbers[0]
|
||||
}
|
||||
key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f",
|
||||
pg, tbl.RegionLeft, tbl.RegionRight, tbl.RegionTop, tbl.RegionBottom)
|
||||
tableImgByRegion[key] = tbl.ImageB64
|
||||
}
|
||||
for i := range result.Sections {
|
||||
if result.Sections[i].LayoutType == pdf.LayoutTypeTable {
|
||||
if img, ok := matchTableImage(&result.Sections[i], tableImgByRegion); ok {
|
||||
result.Sections[i].Image = img
|
||||
continue
|
||||
}
|
||||
}
|
||||
if result.Sections[i].LayoutType == pdf.LayoutTypeFigure && len(result.Sections[i].Positions) > 0 {
|
||||
if dlaImg := util.CropSectionByDLA(result.Sections[i], result.DLADebug, result.PageImages); dlaImg != "" {
|
||||
result.Sections[i].Image = dlaImg
|
||||
continue
|
||||
}
|
||||
}
|
||||
img := util.CropSectionImage(result.Sections[i].PositionTag, result.PageImages, p.Config.Zoom)
|
||||
result.Sections[i].Image = img
|
||||
if img == "" && result.Sections[i].Text != "" {
|
||||
tag := result.Sections[i].PositionTag
|
||||
slog.Warn("cropSectionImage empty for non-empty section",
|
||||
"section", i, "posTag", tag[:min(80, len(tag))])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// matchTableImage looks up a pre-rendered table image for a section.
|
||||
// Uses Positions if available; falls back to TableItem Region boundaries.
|
||||
func matchTableImage(sec *pdf.Section, tableImgByRegion map[string]string) (string, bool) {
|
||||
pg := 0
|
||||
if len(sec.Positions) > 0 {
|
||||
pos := sec.Positions[0]
|
||||
if len(pos.PageNumbers) > 0 {
|
||||
pg = pos.PageNumbers[0]
|
||||
}
|
||||
key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f", pg, pos.Left, pos.Right, pos.Top, pos.Bottom)
|
||||
if img, ok := tableImgByRegion[key]; ok {
|
||||
return img, true
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
if sec.TableItem != nil {
|
||||
if len(sec.TableItem.Positions) > 0 && len(sec.TableItem.Positions[0].PageNumbers) > 0 {
|
||||
pg = sec.TableItem.Positions[0].PageNumbers[0]
|
||||
}
|
||||
key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f", pg,
|
||||
sec.TableItem.RegionLeft, sec.TableItem.RegionRight,
|
||||
sec.TableItem.RegionTop, sec.TableItem.RegionBottom)
|
||||
if img, ok := tableImgByRegion[key]; ok {
|
||||
return img, true
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
|
||||
// mergePageResults collects per-page OCR results into the final output.
|
||||
func mergePageResults(results []pageResult, boxes []pdf.TextBox, pageImages map[int]image.Image,
|
||||
pageChars map[int][]pdf.TextChar, ocrUsedAny bool,
|
||||
medianHeights, medianWidths map[int]float64,
|
||||
) ([]pdf.TextBox, map[int][]pdf.TextChar, bool, error) {
|
||||
var errs []error
|
||||
for i := 0; i < pageCount; i++ {
|
||||
r := results[i]
|
||||
for _, r := range results {
|
||||
if r.err != nil {
|
||||
slog.Warn("page OCR failed", "page", r.pg, "err", r.err)
|
||||
errs = append(errs, fmt.Errorf("page %d: %w", r.pg, r.err))
|
||||
@@ -329,233 +523,3 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
|
||||
}
|
||||
return boxes, pageChars, ocrUsedAny, errors.Join(errs...)
|
||||
}
|
||||
|
||||
// retryScanNoise re-runs OCR on all pages when prescan detects scan noise,
|
||||
// overwriting page-level state with fresh detect+recognize results.
|
||||
func (p *Parser) retryScanNoise(ctx context.Context, engine pdf.PDFEngine,
|
||||
fromPage, toPage int,
|
||||
pageImages map[int]image.Image,
|
||||
pageChars map[int][]pdf.TextChar,
|
||||
medianHeights, medianWidths map[int]float64,
|
||||
ocrUsedAny bool,
|
||||
) ([]pdf.TextBox, map[int][]pdf.TextChar, bool) {
|
||||
slog.Warn("scan noise: OCR retry", "from", fromPage, "to", toPage)
|
||||
var boxes []pdf.TextBox
|
||||
for pg := fromPage; pg <= toPage; pg++ {
|
||||
img := pageImages[pg]
|
||||
if img == nil {
|
||||
var err error
|
||||
img, err = renderPageToImage(engine, pg)
|
||||
if err != nil {
|
||||
slog.Warn("scan noise: page render failed", "page", pg, "err", err)
|
||||
continue
|
||||
}
|
||||
pageImages[pg] = img
|
||||
}
|
||||
ocrBoxes := ocrDetectAndRecognize(ctx, img, p.DeepDoc, pg, "scan page")
|
||||
if ocrBoxes == nil {
|
||||
slog.Warn("scan noise: page OCR empty", "page", pg)
|
||||
continue
|
||||
}
|
||||
boxes = append(boxes, ocrBoxes...)
|
||||
var chars []pdf.TextChar
|
||||
for _, b := range ocrBoxes {
|
||||
for _, r := range b.Text {
|
||||
chars = append(chars, pdf.TextChar{Text: string(r), Top: b.Top, Bottom: b.Bottom, PageNumber: pg})
|
||||
break
|
||||
}
|
||||
}
|
||||
pageChars[pg] = chars
|
||||
medianHeights[pg] = util.MedianCharHeight(chars)
|
||||
medianWidths[pg] = util.MedianCharWidth(chars)
|
||||
}
|
||||
slog.Debug("scan noise OCR retry complete", "pages", toPage-fromPage+1, "boxes", len(boxes))
|
||||
return boxes, pageChars, true
|
||||
}
|
||||
|
||||
// retryZoom re-renders pages at higher resolution and re-runs OCR when the
|
||||
// initial extraction produced zero boxes. Box coordinates are scaled back
|
||||
// to Config.Zoom space. Matches Python's __images__ retry.
|
||||
func (p *Parser) retryZoom(ctx context.Context, engine pdf.PDFEngine,
|
||||
fromPage, toPage int,
|
||||
pageImages map[int]image.Image,
|
||||
boxes []pdf.TextBox, ocrUsedAny bool,
|
||||
) ([]pdf.TextBox, bool) {
|
||||
retryZoom := p.Config.Zoom * pdf.DlaScale
|
||||
retryDPI := retryZoom * 72
|
||||
slog.Info("zoom retry: re-rendering", "oldZoom", p.Config.Zoom, "newZoom", retryZoom)
|
||||
for pg := fromPage; pg <= toPage; pg++ {
|
||||
img, err := engine.RenderPageImage(pg, retryDPI)
|
||||
if err != nil {
|
||||
slog.Warn("zoom retry: render failed", "page", pg, "err", err)
|
||||
continue
|
||||
}
|
||||
pageImages[pg] = img
|
||||
// Downstream DLA/TSR assumes pdf.DlaDPI. Re-render at standard
|
||||
// resolution so layout coordinates are scaled correctly.
|
||||
if retryDPI != pdf.DlaDPI {
|
||||
if dlaImg, dlaErr := engine.RenderPageImage(pg, pdf.DlaDPI); dlaErr == nil {
|
||||
pageImages[pg] = dlaImg
|
||||
}
|
||||
}
|
||||
ocrBoxes := ocrDetectAndRecognize(ctx, img, p.DeepDoc, pg, "zoom retry")
|
||||
if ocrBoxes == nil {
|
||||
continue
|
||||
}
|
||||
scaleFactor := retryZoom / p.Config.Zoom
|
||||
for i := range ocrBoxes {
|
||||
ocrBoxes[i].X0 /= scaleFactor
|
||||
ocrBoxes[i].X1 /= scaleFactor
|
||||
ocrBoxes[i].Top /= scaleFactor
|
||||
ocrBoxes[i].Bottom /= scaleFactor
|
||||
}
|
||||
boxes = append(boxes, ocrBoxes...)
|
||||
ocrUsedAny = true
|
||||
}
|
||||
return boxes, ocrUsedAny
|
||||
}
|
||||
|
||||
// buildLayout runs the DLA → TSR → Column → TextMerge → VM → pdf.Section
|
||||
// pipeline and populates result.Metrics, result.Tables, result.Sections,
|
||||
// and result.Sections. Matches Python's _parse_loaded_window_into_bboxes
|
||||
// order.
|
||||
func (p *Parser) buildLayout(ctx context.Context,
|
||||
result *pdf.ParseResult, engine pdf.PDFEngine,
|
||||
boxes []pdf.TextBox, pageChars map[int][]pdf.TextChar,
|
||||
medianHeights, medianWidths map[int]float64,
|
||||
fromPage, toPage int, ocrUsedAny bool, isEnglish bool,
|
||||
) error {
|
||||
result.Metrics.BoxesInitial = len(boxes)
|
||||
|
||||
result.Tables = p.enrichWithDeepDoc(ctx, result, engine, boxes, result.PageImages)
|
||||
result.Metrics.TablesCount = len(result.Tables)
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
boxes = lyt.AssignColumn(boxes, p.Config.Zoom)
|
||||
boxes = lyt.TextMerge(boxes, medianHeights, p.Config.Zoom)
|
||||
result.Metrics.BoxesTextMerge = len(boxes)
|
||||
|
||||
lyt.SortByPageThenY(boxes, p.Config.SortByTop)
|
||||
|
||||
if ocrUsedAny {
|
||||
isEnglish = util.DetectEnglish(pageChars, toPage-fromPage+1, p.SampleChars)
|
||||
}
|
||||
boxes = lyt.NaiveVerticalMerge(boxes, medianHeights, medianWidths, isEnglish)
|
||||
result.Metrics.BoxesVertMerge = len(boxes)
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
boxes = tbl.ExtractTableAndReplace(boxes, result.Tables)
|
||||
boxes = tbl.ConsolidateFigures(boxes)
|
||||
|
||||
pageHeights := make(map[int]float64, len(result.PageImages))
|
||||
for pg, img := range result.PageImages {
|
||||
pageHeights[pg] = float64(img.Bounds().Dy()) / p.Config.Zoom
|
||||
}
|
||||
result.Sections = lyt.BoxesToSections(boxes, pageHeights)
|
||||
result.Metrics.BoxesFinal = len(result.Sections)
|
||||
result.Sections = tbl.MergeCaptions(result.Sections, result.Figures())
|
||||
return nil
|
||||
}
|
||||
|
||||
// processPages runs the full pipeline on pages [fromPage, toPage].
|
||||
// prescanChars provides pre-extracted chars (avoids double extraction).
|
||||
func (p *Parser) processPages(ctx context.Context, engine pdf.PDFEngine,
|
||||
fromPage, toPage int,
|
||||
prescanChars map[int][]pdf.TextChar,
|
||||
medianHeights, medianWidths map[int]float64,
|
||||
isEnglish, isScanNoiseDoc bool,
|
||||
) (*pdf.ParseResult, error) {
|
||||
result := &pdf.ParseResult{PageImages: make(map[int]image.Image)}
|
||||
|
||||
// 1. OCR extraction — per-page detect + recognize + char merge.
|
||||
boxes, pageChars, ocrUsedAny, ocrErr := p.extractPages(ctx, engine,
|
||||
fromPage, toPage, prescanChars,
|
||||
medianHeights, medianWidths, result.PageImages)
|
||||
if ocrErr != nil {
|
||||
slog.Warn("extractPages: some pages failed OCR", "err", ocrErr)
|
||||
}
|
||||
// 2. Scan noise retry — re-OCR all pages when prescan detects scan noise.
|
||||
if isScanNoiseDoc {
|
||||
boxes, pageChars, ocrUsedAny = p.retryScanNoise(ctx, engine,
|
||||
fromPage, toPage, result.PageImages,
|
||||
pageChars, medianHeights, medianWidths, ocrUsedAny)
|
||||
}
|
||||
|
||||
// 3. Zoom retry — re-render at higher resolution if OCR produced zero boxes.
|
||||
if len(boxes) == 0 && p.Config.Zoom < 9 && !p.Config.SkipOCR {
|
||||
boxes, ocrUsedAny = p.retryZoom(ctx, engine, fromPage, toPage,
|
||||
result.PageImages, boxes, ocrUsedAny)
|
||||
}
|
||||
|
||||
if len(boxes) == 0 {
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// 4. Layout pipeline — DLA → TSR → Column → TextMerge → VM → Sections.
|
||||
if err := p.buildLayout(ctx, result, engine, boxes, pageChars,
|
||||
medianHeights, medianWidths, fromPage, toPage, ocrUsedAny, isEnglish); err != nil {
|
||||
return nil, fmt.Errorf("buildLayout: %w", err)
|
||||
}
|
||||
// 5. Crop section images from page renders.
|
||||
p.fillSectionImages(result)
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// fillSectionImages populates result.Sections[i].Image with cropped
|
||||
// page images. Table sections are matched to their TableItem image;
|
||||
// figure sections try DLA-aware cropping first, then fall back to
|
||||
// position-tag-based cropping.
|
||||
func (p *Parser) fillSectionImages(result *pdf.ParseResult) {
|
||||
if len(result.PageImages) == 0 {
|
||||
return
|
||||
}
|
||||
// Build lookup: DLA region -> table image (base64).
|
||||
tableImgByRegion := make(map[string]string, len(result.Tables))
|
||||
for _, tbl := range result.Tables {
|
||||
if tbl.ImageB64 == "" {
|
||||
continue
|
||||
}
|
||||
pg := 0
|
||||
if len(tbl.Positions) > 0 && len(tbl.Positions[0].PageNumbers) > 0 {
|
||||
pg = tbl.Positions[0].PageNumbers[0]
|
||||
}
|
||||
key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f",
|
||||
pg, tbl.RegionLeft, tbl.RegionRight, tbl.RegionTop, tbl.RegionBottom)
|
||||
tableImgByRegion[key] = tbl.ImageB64
|
||||
}
|
||||
for i := range result.Sections {
|
||||
if result.Sections[i].LayoutType == pdf.LayoutTypeTable && len(result.Sections[i].Positions) > 0 {
|
||||
pos := result.Sections[i].Positions[0]
|
||||
pg := 0
|
||||
if len(pos.PageNumbers) > 0 {
|
||||
pg = pos.PageNumbers[0]
|
||||
}
|
||||
key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f",
|
||||
pg, pos.Left, pos.Right, pos.Top, pos.Bottom)
|
||||
if img, ok := tableImgByRegion[key]; ok {
|
||||
result.Sections[i].Image = img
|
||||
continue
|
||||
}
|
||||
}
|
||||
// Try DLA-aware cropping for figure sections (matching Python's
|
||||
// cropout which uses DLA region boundaries instead of text boxes).
|
||||
if result.Sections[i].LayoutType == pdf.LayoutTypeFigure && len(result.Sections[i].Positions) > 0 {
|
||||
if dlaImg := util.CropSectionByDLA(result.Sections[i], result.DLADebug, result.PageImages); dlaImg != "" {
|
||||
result.Sections[i].Image = dlaImg
|
||||
continue
|
||||
}
|
||||
}
|
||||
img := util.CropSectionImage(result.Sections[i].PositionTag, result.PageImages, p.Config.Zoom)
|
||||
result.Sections[i].Image = img
|
||||
if img == "" && result.Sections[i].Text != "" {
|
||||
tag := result.Sections[i].PositionTag
|
||||
slog.Warn("cropSectionImage empty for non-empty section",
|
||||
"section", i, "posTag", tag[:min(80, len(tag))])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -53,10 +53,11 @@ func TestEnrichWithDeepDoc_Noop(t *testing.T) {
|
||||
boxes := []pdf.TextBox{
|
||||
{PageNumber: 0, X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "text"},
|
||||
}
|
||||
eng := &mockEngine{pageCount: 1}
|
||||
eng := &MockEngine{NumPages: 1}
|
||||
|
||||
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: false})
|
||||
tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, nil)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
mock := &MockDocAnalyzer{Healthy: false}
|
||||
tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, nil, mock, NewTableBuilderFor(mock))
|
||||
if len(tables) != 0 {
|
||||
t.Error("unhealthy DeepDoc → 0 Tables")
|
||||
}
|
||||
@@ -83,10 +84,10 @@ func TestExtractTableBoxes_Mock(t *testing.T) {
|
||||
{X0: 600, Y0: 410, X1: 1240, Y1: 800, Text: "B2"},
|
||||
},
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
dummyImg := image.NewRGBA(image.Rect(0, 0, 2000, 3000))
|
||||
|
||||
tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummyImg, 0, 0)
|
||||
tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummyImg, 0, 0, mock, NewTableBuilderFor(mock))
|
||||
if len(tables) != 1 {
|
||||
t.Fatalf("expected 1 pdf.TableItem, got %d", len(tables))
|
||||
}
|
||||
@@ -105,9 +106,9 @@ func TestExtractTableBoxes_Mock(t *testing.T) {
|
||||
|
||||
func TestExtractTableBoxes_NoTables(t *testing.T) {
|
||||
mock := &MockDocAnalyzer{Healthy: true, DLARegions: []pdf.DLARegion{}}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
dummy := image.NewRGBA(image.Rect(0, 0, 1000, 1000))
|
||||
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0)
|
||||
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock))
|
||||
if len(tables) != 0 {
|
||||
t.Errorf("0 tables expected, got %d", len(tables))
|
||||
}
|
||||
@@ -121,9 +122,9 @@ func TestExtractTableBoxes_NonTableRegions(t *testing.T) {
|
||||
{X0: 150, Y0: 600, X1: 1650, Y1: 900, Label: "figure", Confidence: 0.8},
|
||||
},
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
dummy := image.NewRGBA(image.Rect(0, 0, 2000, 2000))
|
||||
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0)
|
||||
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock))
|
||||
if len(tables) != 0 {
|
||||
t.Errorf("non-table regions → 0 tables, got %d", len(tables))
|
||||
}
|
||||
@@ -139,9 +140,9 @@ func TestExtractTableBoxes_NoOverlap(t *testing.T) {
|
||||
{X0: 150, Y0: 1500, X1: 1500, Y1: 2300, Label: "table", Confidence: 0.95},
|
||||
},
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
dummy := image.NewRGBA(image.Rect(0, 0, 2000, 3000))
|
||||
tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0)
|
||||
tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0, mock, NewTableBuilderFor(mock))
|
||||
if len(tables) != 0 {
|
||||
t.Errorf("no overlap → 0 tables, got %d", len(tables))
|
||||
}
|
||||
@@ -158,9 +159,9 @@ func TestExtractTableBoxes_TSRError(t *testing.T) {
|
||||
},
|
||||
TSRCells: nil, // TSR returns nothing
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
dummy := image.NewRGBA(image.Rect(0, 0, 2000, 3000))
|
||||
tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0)
|
||||
tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0, mock, NewTableBuilderFor(mock))
|
||||
if len(tables) != 1 {
|
||||
t.Fatalf("TSR failure: expected 1 pdf.TableItem with image+positions, got %d", len(tables))
|
||||
}
|
||||
@@ -180,9 +181,9 @@ func TestExtractTableBoxes_DLAError(t *testing.T) {
|
||||
mock := &MockDocAnalyzer{Healthy: true, DLARegions: []pdf.DLARegion{
|
||||
{X0: 0, Y0: 0, X1: 100, Y1: 100, Label: "text", Confidence: 0.9},
|
||||
}}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
dummy := image.NewRGBA(image.Rect(0, 0, 1000, 1000))
|
||||
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0)
|
||||
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock))
|
||||
if len(tables) != 0 {
|
||||
t.Errorf("non-table DLA → 0 tables, got %d", len(tables))
|
||||
}
|
||||
@@ -238,9 +239,9 @@ func TestExtractTableBoxes_InvalidRegion(t *testing.T) {
|
||||
{X0: 500, Y0: 100, X1: 100, Y1: 300, Label: "table", Confidence: 0.9},
|
||||
},
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
dummy := image.NewRGBA(image.Rect(0, 0, 1000, 1000))
|
||||
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0)
|
||||
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock))
|
||||
if len(tables) != 0 {
|
||||
t.Errorf("invalid DLA region should be skipped, got %d tables", len(tables))
|
||||
}
|
||||
@@ -252,16 +253,16 @@ func TestParse_CollectsFigures(t *testing.T) {
|
||||
// End-to-end: Parse() with mock DeepDoc that labels a box as "figure".
|
||||
// Verify p.Figures is populated.
|
||||
|
||||
eng := &mockEngine{pageCount: 1, chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "chart image"}}}}
|
||||
eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "chart image"}}}}
|
||||
mock := &MockDocAnalyzer{
|
||||
Healthy: true,
|
||||
DLARegions: []pdf.DLARegion{
|
||||
{X0: 50, Y0: 200, X1: 2000, Y1: 1000, Label: "figure", Confidence: 0.85},
|
||||
},
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mock)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -282,15 +283,15 @@ func TestParse_CollectsFigures(t *testing.T) {
|
||||
func TestParse_NoFigures(t *testing.T) {
|
||||
// Parse() with no DLA figure regions → p.Figures should be empty.
|
||||
|
||||
eng := &mockEngine{pageCount: 1, chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "just text"}}}}
|
||||
eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "just text"}}}}
|
||||
mock := &MockDocAnalyzer{
|
||||
DLARegions: []pdf.DLARegion{
|
||||
{X0: 150, Y0: 300, X1: 1500, Y1: 600, Label: "text", Confidence: 0.8},
|
||||
},
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mock)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -302,10 +303,11 @@ func TestParse_NoFigures(t *testing.T) {
|
||||
func TestParse_NoDeepDoc_NoFigures(t *testing.T) {
|
||||
// Parse() with mock DeepDoc → Figures should be empty (no DLA-detected figures).
|
||||
|
||||
eng := &mockEngine{pageCount: 1, chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "text"}}}}
|
||||
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
|
||||
eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "text"}}}}
|
||||
mock := &MockDocAnalyzer{Healthy: true}
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mock)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -319,9 +321,9 @@ func TestParse_NoDeepDoc_NoFigures(t *testing.T) {
|
||||
func TestParse_UsesOCRDetectForEmbeddedChars(t *testing.T) {
|
||||
// When DeepDoc is available and the page has embedded chars,
|
||||
// Parse should use ocrMergeChars (detect → merge → recognize).
|
||||
eng := &mockEngine{
|
||||
pageCount: 1,
|
||||
chars: map[int][]pdf.TextChar{0: {
|
||||
eng := &MockEngine{
|
||||
NumPages: 1,
|
||||
Chars: map[int][]pdf.TextChar{0: {
|
||||
{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0},
|
||||
}},
|
||||
}
|
||||
@@ -331,9 +333,9 @@ func TestParse_UsesOCRDetectForEmbeddedChars(t *testing.T) {
|
||||
{X0: 5, Y0: 5, X1: 50, Y1: 5, X2: 50, Y2: 50, X3: 5, Y3: 50},
|
||||
},
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mock)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -349,15 +351,16 @@ func TestParse_UsesOCRDetectForEmbeddedChars(t *testing.T) {
|
||||
|
||||
func TestParse_FallsBackToCharsToBoxes_NoDeepDoc(t *testing.T) {
|
||||
// Without DeepDoc, Parse should use charsToBoxes (unchanged behavior).
|
||||
eng := &mockEngine{
|
||||
pageCount: 1,
|
||||
chars: map[int][]pdf.TextChar{0: {
|
||||
eng := &MockEngine{
|
||||
NumPages: 1,
|
||||
Chars: map[int][]pdf.TextChar{0: {
|
||||
{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0},
|
||||
}},
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
|
||||
mock := &MockDocAnalyzer{Healthy: true}
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mock)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -368,9 +371,9 @@ func TestParse_FallsBackToCharsToBoxes_NoDeepDoc(t *testing.T) {
|
||||
|
||||
func TestParse_FallsBackToCharsToBoxes_EmptyOCRBoxes(t *testing.T) {
|
||||
// OCRDetect returns no boxes → falls through to charsToBoxes.
|
||||
eng := &mockEngine{
|
||||
pageCount: 1,
|
||||
chars: map[int][]pdf.TextChar{0: {
|
||||
eng := &MockEngine{
|
||||
NumPages: 1,
|
||||
Chars: map[int][]pdf.TextChar{0: {
|
||||
{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0},
|
||||
}},
|
||||
}
|
||||
@@ -378,9 +381,9 @@ func TestParse_FallsBackToCharsToBoxes_EmptyOCRBoxes(t *testing.T) {
|
||||
Healthy: true,
|
||||
OCRBoxes: []pdf.OCRBox{}, // empty detect
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mock)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -392,18 +395,19 @@ func TestParse_FallsBackToCharsToBoxes_EmptyOCRBoxes(t *testing.T) {
|
||||
// ── Error path coverage ────────────────────────────────────────────────
|
||||
|
||||
func TestMockDocAnalyzer_DLAError_DoesNotCrash(t *testing.T) {
|
||||
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{
|
||||
mock := &MockDocAnalyzer{
|
||||
Healthy: true,
|
||||
DLAErr: fmt.Errorf("DLA service unavailable"),
|
||||
})
|
||||
eng := &mockEngine{pageCount: 1}
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
eng := &MockEngine{NumPages: 1}
|
||||
img := image.NewRGBA(image.Rect(0, 0, 100, 100))
|
||||
pageImages := map[int]image.Image{0: img}
|
||||
boxes := []pdf.TextBox{
|
||||
{PageNumber: 0, X0: 0, X1: 100, Top: 0, Bottom: 50, Text: "text"},
|
||||
}
|
||||
// enrichWithDeepDoc should return nil (not panic) on DLA error.
|
||||
tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages)
|
||||
tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages, mock, NewTableBuilderFor(mock))
|
||||
if len(tables) != 0 {
|
||||
t.Errorf("DLA error should produce 0 tables, got %d", len(tables))
|
||||
}
|
||||
@@ -412,20 +416,21 @@ func TestMockDocAnalyzer_DLAError_DoesNotCrash(t *testing.T) {
|
||||
func TestMockDocAnalyzer_TSRError_DoesNotCrash(t *testing.T) {
|
||||
// TSR error: DLA succeeds, TSR fails. The table region is detected
|
||||
// but no cells are returned — the table is skipped gracefully.
|
||||
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{
|
||||
mock := &MockDocAnalyzer{
|
||||
Healthy: true,
|
||||
DLARegions: []pdf.DLARegion{
|
||||
{X0: 0, Y0: 0, X1: 400, Y1: 400, Label: "table", Confidence: 0.95},
|
||||
},
|
||||
TSRErr: fmt.Errorf("TSR model timeout"),
|
||||
})
|
||||
eng := &mockEngine{pageCount: 1}
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
eng := &MockEngine{NumPages: 1}
|
||||
img := image.NewRGBA(image.Rect(0, 0, 100, 100))
|
||||
pageImages := map[int]image.Image{0: img}
|
||||
boxes := []pdf.TextBox{
|
||||
{PageNumber: 0, X0: 10, X1: 90, Top: 10, Bottom: 90, Text: "in table region"},
|
||||
}
|
||||
tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages)
|
||||
tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages, mock, NewTableBuilderFor(mock))
|
||||
// DLA detects the table region → 1 pdf.TableItem is created. TSR failure
|
||||
// means it has no cells, but the pipeline must not panic.
|
||||
if len(tables) != 1 {
|
||||
@@ -440,12 +445,12 @@ func TestMockDocAnalyzer_OCRDetectError_DoesNotCrash(t *testing.T) {
|
||||
// OCRDetect failure path: extractPages uses ocrDetectAndRecognize which
|
||||
// calls doc.OCRDetect. When it fails, the page is skipped gracefully.
|
||||
mock := &MockDocAnalyzer{Healthy: true, OCRDetectErr: fmt.Errorf("OCR model OOM")}
|
||||
eng := &mockEngine{
|
||||
pageCount: 1,
|
||||
chars: map[int][]pdf.TextChar{}, // empty → triggers OCR path
|
||||
eng := &MockEngine{
|
||||
NumPages: 1,
|
||||
Chars: map[int][]pdf.TextChar{}, // empty → triggers OCR path
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
_, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
_, err := p.ParseRaw(context.Background(), eng, mock)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse returned error: %v", err)
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -54,12 +54,17 @@ func ocrDetectAndRecognize(ctx context.Context, pageImg image.Image, doc pdf.Doc
|
||||
// merges the chars into detect regions, and OCRs any regions without chars.
|
||||
// Matches Python's __ocr: detect → match chars to boxes → use char text
|
||||
// for boxes with embedded chars → OCR recognize only empty/garbled boxes.
|
||||
type ocrDetectBox struct {
|
||||
box pdf.TextBox // text box built for this detect region; its X0/X1/Top/Bottom are set from x0/x1/y0/y1
|
||||
x0, y0, x1, y1 float64 // region bounds in PDF space (detect pixel coordinates divided by scale)
|
||||
}
|
||||
|
||||
func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextChar, doc pdf.DocAnalyzer, pageNum int) []pdf.TextBox {
|
||||
detectBoxes, err := doc.OCRDetect(ctx, pageImg)
|
||||
if err != nil || len(detectBoxes) == 0 {
|
||||
ocrDetectBoxes, err := doc.OCRDetect(ctx, pageImg)
|
||||
if err != nil || len(ocrDetectBoxes) == 0 {
|
||||
return nil
|
||||
}
|
||||
slog.Debug("ocrMergeChars detect", "page", pageNum, "boxes", len(detectBoxes))
|
||||
slog.Debug("ocrMergeChars detect", "page", pageNum, "boxes", len(ocrDetectBoxes))
|
||||
|
||||
// Detect boxes are in pixel space (216 DPI). Scale to PDF space (72 DPI)
|
||||
// so coordinates match embedded chars.
|
||||
@@ -69,12 +74,8 @@ func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextCha
|
||||
imgH := float64(imgBounds.Dy()) / scale
|
||||
|
||||
// Step 1: match embedded chars to detect boxes (Python __ocr char matching).
|
||||
type detectBox struct {
|
||||
box pdf.TextBox
|
||||
x0, y0, x1, y1 float64 // PDF-space bounds
|
||||
}
|
||||
boxes := make([]detectBox, 0, len(detectBoxes))
|
||||
for _, b := range detectBoxes {
|
||||
boxes := make([]ocrDetectBox, 0, len(ocrDetectBoxes))
|
||||
for _, b := range ocrDetectBoxes {
|
||||
x0 := min(b.X0, b.X1, b.X2, b.X3) / scale
|
||||
y0 := min(b.Y0, b.Y1, b.Y2, b.Y3) / scale
|
||||
x1 := max(b.X0, b.X1, b.X2, b.X3) / scale
|
||||
@@ -94,7 +95,7 @@ func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextCha
|
||||
if x0 >= x1 || y0 >= y1 {
|
||||
continue
|
||||
}
|
||||
boxes = append(boxes, detectBox{box: pdf.TextBox{
|
||||
boxes = append(boxes, ocrDetectBox{box: pdf.TextBox{
|
||||
X0: x0, X1: x1, Top: y0, Bottom: y1, PageNumber: pageNum,
|
||||
}, x0: x0, y0: y0, x1: x1, y1: y1})
|
||||
}
|
||||
@@ -145,82 +146,7 @@ func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextCha
|
||||
boxChars[bestIdx] = append(boxChars[bestIdx], c)
|
||||
}
|
||||
|
||||
// Step 3: assemble text for each box.
|
||||
var result []pdf.TextBox
|
||||
var needOCR []int
|
||||
for i := range boxes {
|
||||
tb := boxes[i].box
|
||||
tb.Text = ""
|
||||
|
||||
if len(boxChars[i]) > 0 {
|
||||
// Sort chars by reading order, matching Python's sort_Y_firstly.
|
||||
// Fuzzy Y-group: chars within median char height are "same line",
|
||||
// sorted by X; different lines sorted by Y.
|
||||
sortCharsYFirstly(boxChars[i], util.MedianCharHeight(boxChars[i]))
|
||||
// Use lineToTextBox for correct space insertion + garbled detection.
|
||||
// lineToTextBox inserts ASCII word spaces at visible gaps —
|
||||
// matching Python's __img_ocr + __ocr char logic.
|
||||
lineBox := lyt.LineToTextBox(boxChars[i])
|
||||
tb.Text = lineBox.Text
|
||||
|
||||
// Strategy 1: If majority of chars are garbled (PUA), clear text → OCR.
|
||||
var garbledCnt, totalCnt int
|
||||
for _, c := range boxChars[i] {
|
||||
for _, r := range c.Text {
|
||||
totalCnt++
|
||||
if util.IsGarbledChar(string(r)) {
|
||||
garbledCnt++
|
||||
}
|
||||
}
|
||||
}
|
||||
if totalCnt > 0 && float64(garbledCnt)/float64(totalCnt) >= 0.5 {
|
||||
tb.Text = ""
|
||||
}
|
||||
// Strategy 2: font-encoding garbled (subset fonts, min 5 chars).
|
||||
if tb.Text != "" && util.IsGarbledByFontEncoding(boxChars[i], 5) {
|
||||
tb.Text = ""
|
||||
}
|
||||
}
|
||||
|
||||
// Step 4: batch OCR recognize boxes without embedded chars (or garbled).
|
||||
if tb.Text == "" {
|
||||
needOCR = append(needOCR, i)
|
||||
}
|
||||
result = append(result, tb)
|
||||
}
|
||||
|
||||
if len(needOCR) > 0 {
|
||||
cropped := make([]image.Image, len(needOCR))
|
||||
for j, idx := range needOCR {
|
||||
cropped[j] = util.FastCrop(pageImg,
|
||||
int(boxes[idx].x0*scale), int(boxes[idx].y0*scale),
|
||||
int(boxes[idx].x1*scale), int(boxes[idx].y1*scale))
|
||||
}
|
||||
allTexts, allErrs := doc.OCRRecognizeBatch(ctx, cropped)
|
||||
for j, idx := range needOCR {
|
||||
if allErrs[j] != nil {
|
||||
slog.Warn("ocr merge: recognize failed", "page", pageNum, "err", allErrs[j])
|
||||
continue
|
||||
}
|
||||
var ocrParts []string
|
||||
for _, t := range allTexts[j] {
|
||||
if strings.TrimSpace(t.Text) != "" {
|
||||
ocrParts = append(ocrParts, t.Text)
|
||||
}
|
||||
}
|
||||
result[idx].Text = strings.TrimSpace(strings.Join(ocrParts, " "))
|
||||
}
|
||||
}
|
||||
// Filter out boxes with no text.
|
||||
filtered := result[:0]
|
||||
for _, tb := range result {
|
||||
if tb.Text != "" {
|
||||
filtered = append(filtered, tb)
|
||||
}
|
||||
}
|
||||
result = filtered
|
||||
slog.Debug("ocrMergeChars result", "page", pageNum, "boxes", len(result))
|
||||
return result
|
||||
return buildTextBoxes(ctx, pageImg, boxes, boxChars, doc, scale, pageNum)
|
||||
}
|
||||
|
||||
// sortYFirstly sorts chars by Y (fuzzy group by threshold), then by X.
|
||||
@@ -289,3 +215,71 @@ func ocrTableCells(ctx context.Context, cells []pdf.TSRCell, tableImg image.Imag
|
||||
cells[i].Text = strings.TrimSpace(strings.Join(parts, " "))
|
||||
}
|
||||
}
|
||||
|
||||
// buildTextBoxes assembles detect box text from embedded chars and fills
|
||||
// empty boxes via batch OCR.
|
||||
func buildTextBoxes(ctx context.Context, pageImg image.Image,
|
||||
boxes []ocrDetectBox, boxChars [][]pdf.TextChar, doc pdf.DocAnalyzer, scale float64, pageNum int,
|
||||
) []pdf.TextBox {
|
||||
var result []pdf.TextBox
|
||||
var needOCR []int
|
||||
for i := range boxes {
|
||||
tb := boxes[i].box
|
||||
tb.Text = ""
|
||||
if len(boxChars[i]) > 0 {
|
||||
sortCharsYFirstly(boxChars[i], util.MedianCharHeight(boxChars[i]))
|
||||
lineBox := lyt.LineToTextBox(boxChars[i])
|
||||
tb.Text = lineBox.Text
|
||||
var garbledCnt, totalCnt int
|
||||
for _, c := range boxChars[i] {
|
||||
for _, r := range c.Text {
|
||||
totalCnt++
|
||||
if util.IsGarbledChar(string(r)) {
|
||||
garbledCnt++
|
||||
}
|
||||
}
|
||||
}
|
||||
if totalCnt > 0 && float64(garbledCnt)/float64(totalCnt) >= 0.5 {
|
||||
tb.Text = ""
|
||||
}
|
||||
if tb.Text != "" && util.IsGarbledByFontEncoding(boxChars[i], 5) {
|
||||
tb.Text = ""
|
||||
}
|
||||
}
|
||||
if strings.TrimSpace(tb.Text) == "" {
|
||||
tb.Text = ""
|
||||
needOCR = append(needOCR, i)
|
||||
}
|
||||
result = append(result, tb)
|
||||
}
|
||||
if len(needOCR) > 0 {
|
||||
cropped := make([]image.Image, len(needOCR))
|
||||
for j, idx := range needOCR {
|
||||
cropped[j] = util.FastCrop(pageImg,
|
||||
int(boxes[idx].x0*scale), int(boxes[idx].y0*scale),
|
||||
int(boxes[idx].x1*scale), int(boxes[idx].y1*scale))
|
||||
}
|
||||
allTexts, allErrs := doc.OCRRecognizeBatch(ctx, cropped)
|
||||
for j, idx := range needOCR {
|
||||
if allErrs[j] != nil {
|
||||
slog.Warn("ocr merge: recognize failed", "page", pageNum, "err", allErrs[j])
|
||||
continue
|
||||
}
|
||||
var ocrParts []string
|
||||
for _, t := range allTexts[j] {
|
||||
if strings.TrimSpace(t.Text) != "" {
|
||||
ocrParts = append(ocrParts, t.Text)
|
||||
}
|
||||
}
|
||||
result[idx].Text = strings.TrimSpace(strings.Join(ocrParts, " "))
|
||||
}
|
||||
}
|
||||
filtered := result[:0]
|
||||
for _, tb := range result {
|
||||
if strings.TrimSpace(tb.Text) != "" {
|
||||
filtered = append(filtered, tb)
|
||||
}
|
||||
}
|
||||
slog.Debug("ocrMergeChars result", "page", pageNum, "boxes", len(filtered))
|
||||
return filtered
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && integration
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
@@ -11,10 +11,10 @@ import (
|
||||
_ "image/png"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"ragflow/internal/deepdoc/parser/pdf/post"
|
||||
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
||||
)
|
||||
|
||||
// ── golden-file helpers ────────────────────────────────────────────────────
|
||||
@@ -95,12 +95,11 @@ func tablesToGolden(tables []pdf.TableItem) []tableGolden {
|
||||
// TestIntegration_SectionsText verifies section text output matches golden.
|
||||
func TestIntegration_SectionsText(t *testing.T) {
|
||||
client := mustConnectInferenceClient(t)
|
||||
eng := mustOpenEngine(t, "01_english_simple.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "01_english_simple.pdf")
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -139,12 +138,11 @@ func TestIntegration_SectionsText(t *testing.T) {
|
||||
// TestIntegration_SectionsCount verifies section count is stable.
|
||||
func TestIntegration_SectionsCount(t *testing.T) {
|
||||
client := mustConnectInferenceClient(t)
|
||||
eng := mustOpenEngine(t, "01_english_simple.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "01_english_simple.pdf")
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -166,12 +164,11 @@ func TestIntegration_SectionsCount(t *testing.T) {
|
||||
// TestIntegration_TableStructure verifies table rows and cell text match golden.
|
||||
func TestIntegration_TableStructure(t *testing.T) {
|
||||
client := mustConnectInferenceClient(t)
|
||||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "06_table_content.pdf")
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -221,12 +218,11 @@ func TestIntegration_TableStructure(t *testing.T) {
|
||||
// TestIntegration_TableImageB64 verifies table ImageB64 is valid base64 PNG.
|
||||
func TestIntegration_TableImageB64(t *testing.T) {
|
||||
client := mustConnectInferenceClient(t)
|
||||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "06_table_content.pdf")
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -261,12 +257,11 @@ func TestIntegration_TableImageB64(t *testing.T) {
|
||||
// TestIntegration_LayoutTypes verifies DLA labels boxes with expected types.
|
||||
func TestIntegration_LayoutTypes(t *testing.T) {
|
||||
client := mustConnectInferenceClient(t)
|
||||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "06_table_content.pdf")
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -316,7 +311,6 @@ func TestIntegration_Idempotency(t *testing.T) {
|
||||
|
||||
// Render a fixture page as the stable input image.
|
||||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||||
defer eng.Close()
|
||||
pageImg, err := eng.RenderPageImage(0, 216)
|
||||
if err != nil {
|
||||
t.Fatalf("render page: %v", err)
|
||||
@@ -531,12 +525,11 @@ func floatClose(a, b, eps float64) bool {
|
||||
// fixes from the Python→Go migration.
|
||||
func TestIntegration_TableAlign(t *testing.T) {
|
||||
client := mustConnectInferenceClient(t)
|
||||
eng := mustOpenEngine(t, "18_table_caption.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "18_table_caption.pdf")
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -572,12 +565,11 @@ func TestIntegration_TableAlign(t *testing.T) {
|
||||
// (header/footer/reference) boxes are popped from output.
|
||||
func TestIntegration_GarbageLayout(t *testing.T) {
|
||||
client := mustConnectInferenceClient(t)
|
||||
eng := mustOpenEngine(t, "17_garbage_layout.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "17_garbage_layout.pdf")
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -603,13 +595,12 @@ func TestIntegration_GarbageLayout(t *testing.T) {
|
||||
// TestIntegration_MultiChunk verifies chunked processing for large documents.
|
||||
func TestIntegration_MultiChunk(t *testing.T) {
|
||||
client := mustConnectInferenceClient(t)
|
||||
eng := mustOpenEngine(t, "19_multipage_chunk.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "19_multipage_chunk.pdf")
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
cfg.BatchSize = 10 // small batches to force multi-batch path
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -635,11 +626,10 @@ func TestIntegration_NoRegression(t *testing.T) {
|
||||
"07_mixed_content.pdf",
|
||||
} {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
eng := mustOpenEngine(t, name)
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, name)
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -662,11 +652,10 @@ func TestIntegration_TableRotation(t *testing.T) {
|
||||
client := mustConnectInferenceClient(t)
|
||||
|
||||
t.Run("upright_table", func(t *testing.T) {
|
||||
eng := mustOpenEngine(t, "rotate_0.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "rotate_0.pdf")
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -677,16 +666,15 @@ func TestIntegration_TableRotation(t *testing.T) {
|
||||
})
|
||||
|
||||
t.Run("rotated_90_table", func(t *testing.T) {
|
||||
eng := mustOpenEngine(t, "rotate_90.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "rotate_90.pdf")
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
// DeepDoc DLA does not yet correctly annotate boxes on rotated
|
||||
// pages (regions and characters are in different coordinate
|
||||
// spaces post-rotation). Character extraction and rotation are
|
||||
// verified via the charsToBoxes path.
|
||||
// verified via the lyt.CharsToBoxes path.
|
||||
cfg.SkipOCR = true
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -701,12 +689,11 @@ func TestIntegration_TableRotation(t *testing.T) {
|
||||
// characters with a visible gap (Python __img_ocr space insertion).
|
||||
func TestIntegration_WordSpacing(t *testing.T) {
|
||||
client := mustConnectInferenceClient(t)
|
||||
eng := mustOpenEngine(t, "01_english_simple.pdf")
|
||||
defer eng.Close()
|
||||
data := mustReadPDF(t, "01_english_simple.pdf")
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.Parse(context.Background(), data, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -734,53 +721,34 @@ func TestIntegration_WordSpacing(t *testing.T) {
|
||||
// TestE2E_ParseAndPostProcess runs Parse → PostProcess end-to-end on a real
|
||||
// PDF. Skips VLM (no tenant_id set) but exercises all other operators.
|
||||
func TestE2E_ParseAndPostProcess(t *testing.T) {
|
||||
engine := mustOpenEngine(t, "01_english_simple.pdf")
|
||||
defer engine.Close()
|
||||
data := mustReadPDF(t, "01_english_simple.pdf")
|
||||
|
||||
mock := &MockDocAnalyzer{Healthy: true}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
result, err := p.Parse(context.Background(), engine)
|
||||
result, err := p.Parse(context.Background(), data, mock)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
|
||||
preCount := len(result.Sections)
|
||||
if preCount == 0 {
|
||||
if len(result.Sections) == 0 {
|
||||
t.Fatal("Parse() returned zero sections")
|
||||
}
|
||||
t.Logf("sections: %d", len(result.Sections))
|
||||
|
||||
// Post-processing (no VLM).
|
||||
config := post.PipelineConfig{
|
||||
post.ConfigKeyPageWidth: 612.0,
|
||||
post.ConfigKeyZoom: 1.0,
|
||||
}
|
||||
if err := post.PostProcess(context.Background(), result, config); err != nil {
|
||||
t.Fatalf("PostProcess: %v", err)
|
||||
}
|
||||
|
||||
postCount := len(result.Sections)
|
||||
t.Logf("sections: %d → %d after PostProcess", preCount, postCount)
|
||||
if postCount == 0 {
|
||||
t.Error("PostProcess removed all sections")
|
||||
}
|
||||
|
||||
// Every section must have DocTypeKwd + LayoutType set.
|
||||
// PostProcess is handled by the Pipeline framework.
|
||||
// Verify raw parse produces sections with LayoutType set.
|
||||
for i, s := range result.Sections {
|
||||
if s.DocTypeKwd == "" {
|
||||
t.Errorf("section[%d] DocTypeKwd empty after PostProcess", i)
|
||||
}
|
||||
if s.LayoutType == "" {
|
||||
t.Errorf("section[%d] LayoutType empty after PostProcess", i)
|
||||
}
|
||||
t.Logf(" section[%d]: layout=%q text=%q", i, s.LayoutType, truncate(s.Text, 60))
|
||||
}
|
||||
|
||||
// Figures() must reflect post-processed sections.
|
||||
figs := result.Figures()
|
||||
t.Logf("figures: %d", len(figs))
|
||||
for _, f := range figs {
|
||||
if f.LayoutType != "figure" {
|
||||
t.Errorf("Figures() LayoutType=%q, want 'figure'", f.LayoutType)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// truncate shortens s to at most n bytes and appends "..." when anything was
// cut off. Strings of n bytes or fewer are returned unchanged.
//
// The cut is moved back to the nearest rune boundary so multi-byte UTF-8
// text (e.g. CJK section text) is never split in the middle of a character.
// A negative n is treated as 0.
func truncate(s string, n int) string {
	if n < 0 {
		n = 0
	}
	if len(s) <= n {
		return s
	}
	// Ranging over a string yields the byte offset of every rune start;
	// keep the largest one that does not exceed n.
	cut := 0
	for i := range s {
		if i > n {
			break
		}
		cut = i
	}
	return s[:cut] + "..."
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -47,8 +47,8 @@ func TestIntegration_NoCrash(t *testing.T) {
|
||||
defer eng.Close()
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.ParseRaw(context.Background(), eng, client)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"math"
|
||||
|
||||
lyt "ragflow/internal/deepdoc/parser/pdf/layout"
|
||||
tbl "ragflow/internal/deepdoc/parser/pdf/table"
|
||||
@@ -207,15 +208,16 @@ func TestOCR_FallbackIntegration(t *testing.T) {
|
||||
|
||||
func TestOCR_FallbackIntegration_NoDeepDoc(t *testing.T) {
|
||||
chars := garbledSample()
|
||||
mockEng := &mockEngine{chars: map[int][]pdf.TextChar{0: chars}, pageCount: 1}
|
||||
mockEng := &MockEngine{Chars: map[int][]pdf.TextChar{0: chars}, NumPages: 1}
|
||||
mockDLA := &MockDocAnalyzer{Healthy: true}
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
|
||||
result, err := p.Parse(context.Background(), mockEng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.ParseRaw(context.Background(), mockEng, mockDLA)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Logf("garbled chars: %d sections", len(result.Sections))
|
||||
t.Logf("garbled Chars: %d sections", len(result.Sections))
|
||||
}
|
||||
|
||||
func TestNoDeepDoc_PdfOxideUnmapped_KeepsChars(t *testing.T) {
|
||||
@@ -241,9 +243,10 @@ func TestNoDeepDoc_PdfOxideUnmapped_KeepsChars(t *testing.T) {
|
||||
chars[28] = pdf.TextChar{Text: "*", FontName: "SimSun", X0: 194, X1: 202, Top: 100, Bottom: 112}
|
||||
chars[29] = pdf.TextChar{Text: "用", FontName: "SimSun", X0: 202, X1: 210, Top: 100, Bottom: 112}
|
||||
|
||||
mockEng := &mockEngine{chars: map[int][]pdf.TextChar{0: chars}, pageCount: 1}
|
||||
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
|
||||
result, err := p.Parse(context.Background(), mockEng)
|
||||
mockEng := &MockEngine{Chars: map[int][]pdf.TextChar{0: chars}, NumPages: 1}
|
||||
mockDLA := &MockDocAnalyzer{Healthy: true}
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
result, err := p.ParseRaw(context.Background(), mockEng, mockDLA)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -279,7 +282,7 @@ func TestIsGarbledPage(t *testing.T) {
|
||||
})
|
||||
t.Run("pdf oxide unmapped + CJK — not garbled", func(t *testing.T) {
|
||||
// ### unmapped glyphs + real CJK text (no subset fonts).
|
||||
// isScanNoise returns false (≥2 consecutive CJK chars: "护理全科").
|
||||
// isScanNoise returns false (≥2 consecutive CJK Chars: "护理全科").
|
||||
chars := []pdf.TextChar{
|
||||
{Text: "和", PageNumber: 0}, {Text: "蔘", PageNumber: 0},
|
||||
{Text: "语", PageNumber: 0}, {Text: "言", PageNumber: 0},
|
||||
@@ -552,11 +555,12 @@ func TestTableSectionCaptionInHTML(t *testing.T) {
|
||||
// text boxes that are mostly OUTSIDE the cell, even with cellIsEmpty=true.
|
||||
// The 0.3 threshold should not match a wide box that barely touches a
|
||||
// narrow cell — this would cause body text to leak into table cells.
|
||||
// TestParser_ConcurrentSafety verifies that Parser.Parse() is safe for
|
||||
// TestParser_ConcurrentSafety verifies that Parser.ParseRaw() is safe for
|
||||
// concurrent use. 8 goroutines each call ParseRaw 5 times on the same Parser
|
||||
// instance. Run with -race.
|
||||
func TestParser_ConcurrentSafety(t *testing.T) {
|
||||
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: false})
|
||||
mockDLA := &MockDocAnalyzer{Healthy: true}
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
var wg sync.WaitGroup
|
||||
n := 8
|
||||
@@ -565,10 +569,58 @@ func TestParser_ConcurrentSafety(t *testing.T) {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for range 5 {
|
||||
eng := &mockEngine{pageCount: 2}
|
||||
_, _ = p.Parse(context.Background(), eng)
|
||||
eng := &MockEngine{NumPages: 2}
|
||||
if _, err := p.ParseRaw(context.Background(), eng, mockDLA); err != nil {
|
||||
t.Errorf("ParseRaw: %v", err)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func TestParseRaw_ClampsFromPage(t *testing.T) {
|
||||
// A negative FromPage should be treated as page 0.
|
||||
// Only page 0 has content so we can verify clamping worked.
|
||||
eng := &MockEngine{NumPages: 3, Chars: map[int][]pdf.TextChar{
|
||||
0: {{Text: "page0", X0: 100, X1: 200, Top: 100, Bottom: 120}},
|
||||
}}
|
||||
mockDLA := &MockDocAnalyzer{Healthy: true}
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
cfg.FromPage = -1
|
||||
p := NewParser(cfg)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mockDLA)
|
||||
if err != nil {
|
||||
t.Fatalf("ParseRaw: %v", err)
|
||||
}
|
||||
if len(result.Sections) == 0 {
|
||||
t.Error("expected sections from page 0")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseRaw_ZeroZoom_NoNaN(t *testing.T) {
|
||||
// Zoom=0 should not produce NaN coordinates.
|
||||
eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{
|
||||
0: {{Text: "test", X0: 100, X1: 200, Top: 100, Bottom: 120}},
|
||||
}}
|
||||
mockDLA := &MockDocAnalyzer{Healthy: true}
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
cfg.Zoom = 0
|
||||
p := NewParser(cfg)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mockDLA)
|
||||
if err != nil {
|
||||
t.Fatalf("ParseRaw: %v", err)
|
||||
}
|
||||
foundPosition := false
|
||||
for _, s := range result.Sections {
|
||||
for _, pos := range s.Positions {
|
||||
foundPosition = true
|
||||
if math.IsNaN(pos.Left) || math.IsNaN(pos.Top) {
|
||||
t.Error("Zoom=0 produced NaN coordinates")
|
||||
}
|
||||
}
|
||||
}
|
||||
if !foundPosition {
|
||||
t.Fatal("expected at least one position to validate")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -34,8 +34,8 @@ func TestParse_PdfiumRender(t *testing.T) {
|
||||
t.Fatalf("RawData() length %d != original %d", len(raw), len(data))
|
||||
}
|
||||
|
||||
// Render a page through pdfium (via the parser's renderPageToImage).
|
||||
img, err := renderPageToImage(eng, 0)
|
||||
// Render a page through pdfium (via the parser's RenderPageToImage).
|
||||
img, err := RenderPageToImage(eng, 0)
|
||||
if err != nil {
|
||||
t.Skipf("pdfium render not available: %v", err)
|
||||
}
|
||||
@@ -48,8 +48,8 @@ func TestParse_PdfiumRender(t *testing.T) {
|
||||
// Run ParseRaw with pdfium rendering — BATCH_SKIP_DEEPDOC=1 to avoid HTTP calls.
|
||||
t.Setenv("BATCH_SKIP_DEEPDOC", "1")
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true})
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -64,10 +64,10 @@ func TestParse_PdfiumRender(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestParse_PdfiumRender_NoData(t *testing.T) {
|
||||
// When engine has no raw PDF bytes, renderPageToImage falls back to
|
||||
// When engine has no raw PDF bytes, RenderPageToImage falls back to
|
||||
// engine.RenderPageImage(). Stub returns (nil, nil) → guard converts
|
||||
// to ErrNoPDFData so callers never receive a nil image with nil error.
|
||||
img, err := renderPageToImage(&pythonCharEngineStub{}, 0)
|
||||
img, err := RenderPageToImage(&pythonCharEngineStub{}, 0)
|
||||
if err != ErrNoPDFData {
|
||||
t.Errorf("expected ErrNoPDFData, got %v", err)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"image"
|
||||
@@ -11,8 +11,8 @@ import (
|
||||
)
|
||||
|
||||
// pdfoxideEngine adapts pdfoxide.Engine to the pdf.PDFEngine interface.
|
||||
type pdfoxideEngine struct {
|
||||
inner *pdfoxide.Engine
|
||||
type PDFOxideEngine struct {
|
||||
Inner *pdfoxide.Engine
|
||||
}
|
||||
|
||||
// NewEngine returns a pdf.PDFEngine backed by pdf_oxide.
|
||||
@@ -21,15 +21,15 @@ func NewEngine(pdfBytes []byte) (pdf.PDFEngine, error) {
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &pdfoxideEngine{inner: eng}, nil
|
||||
return &PDFOxideEngine{Inner: eng}, nil
|
||||
}
|
||||
|
||||
func (e *pdfoxideEngine) RawData() []byte { return e.inner.RawData() }
|
||||
func (e *pdfoxideEngine) PageCount() (int, error) { return e.inner.PageCount() }
|
||||
func (e *pdfoxideEngine) Close() error { return e.inner.Close() }
|
||||
func (e *PDFOxideEngine) RawData() []byte { return e.Inner.RawData() }
|
||||
func (e *PDFOxideEngine) PageCount() (int, error) { return e.Inner.PageCount() }
|
||||
func (e *PDFOxideEngine) Close() error { return e.Inner.Close() }
|
||||
|
||||
func (e *pdfoxideEngine) Outlines() ([]pdf.Outline, error) {
|
||||
ol := pdfium.ExtractOutlines(e.inner.RawData())
|
||||
func (e *PDFOxideEngine) Outlines() ([]pdf.Outline, error) {
|
||||
ol := pdfium.ExtractOutlines(e.Inner.RawData())
|
||||
result := make([]pdf.Outline, len(ol))
|
||||
for i, o := range ol {
|
||||
result[i] = pdf.Outline{Title: o.Title, Level: o.Level, PageNumber: o.PageNumber}
|
||||
@@ -37,16 +37,16 @@ func (e *pdfoxideEngine) Outlines() ([]pdf.Outline, error) {
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (e *pdfoxideEngine) RenderPage(pageNum int, dpi float64) ([]byte, error) {
|
||||
return e.inner.RenderPage(pageNum, dpi)
|
||||
func (e *PDFOxideEngine) RenderPage(pageNum int, dpi float64) ([]byte, error) {
|
||||
return e.Inner.RenderPage(pageNum, dpi)
|
||||
}
|
||||
|
||||
func (e *pdfoxideEngine) RenderPageImage(pageNum int, dpi float64) (image.Image, error) {
|
||||
return e.inner.RenderPageImage(pageNum, dpi)
|
||||
func (e *PDFOxideEngine) RenderPageImage(pageNum int, dpi float64) (image.Image, error) {
|
||||
return e.Inner.RenderPageImage(pageNum, dpi)
|
||||
}
|
||||
|
||||
func (e *pdfoxideEngine) ExtractChars(pageNum int) ([]pdf.TextChar, error) {
|
||||
chars, err := e.inner.ExtractChars(pageNum)
|
||||
func (e *PDFOxideEngine) ExtractChars(pageNum int) ([]pdf.TextChar, error) {
|
||||
chars, err := e.Inner.ExtractChars(pageNum)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
lyt "ragflow/internal/deepdoc/parser/pdf/layout"
|
||||
"ragflow/internal/deepdoc/parser/pdf/tool"
|
||||
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
||||
util "ragflow/internal/deepdoc/parser/pdf/util"
|
||||
)
|
||||
|
||||
// TestPipelineParity verifies Go pipeline logic equivalence with Python.
|
||||
@@ -53,8 +54,9 @@ func TestPipelineParity(t *testing.T) {
|
||||
// Run Go pipeline (SKIP_OCR — no DeepDoc)
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
cfg.SortByTop = true
|
||||
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
|
||||
result, err := p.Parse(context.Background(), engine)
|
||||
mockAnalyzer := &MockDocAnalyzer{Healthy: true}
|
||||
p := NewParser(cfg)
|
||||
result, err := p.ParseRaw(context.Background(), engine, mockAnalyzer)
|
||||
if err != nil {
|
||||
t.Errorf("%s: Parse: %v", name, err)
|
||||
continue
|
||||
@@ -151,7 +153,7 @@ func TestVMWhitespaceGapBridge(t *testing.T) {
|
||||
if isWS && len(out) > 0 {
|
||||
prev := &out[len(out)-1]
|
||||
gap := b.Top - prev.Bottom
|
||||
ov := OverlapX(prev, &b)
|
||||
ov := util.OverlapX(prev, &b)
|
||||
// Python: gap passes AND xov passes → whitespace merged
|
||||
// into prev, extending bottom. i advances (Go for-loop).
|
||||
if gap <= thr && ov >= 0.3 {
|
||||
@@ -169,7 +171,7 @@ func TestVMWhitespaceGapBridge(t *testing.T) {
|
||||
continue
|
||||
}
|
||||
gap := b.Top - prev.Bottom
|
||||
ov := OverlapX(prev, &b)
|
||||
ov := util.OverlapX(prev, &b)
|
||||
if gap > thr {
|
||||
out = append(out, b)
|
||||
continue
|
||||
@@ -219,7 +221,7 @@ func TestVMWhitespaceGapBridge(t *testing.T) {
|
||||
continue
|
||||
}
|
||||
gap := b.Top - prev.Bottom
|
||||
ov := OverlapX(prev, &b)
|
||||
ov := util.OverlapX(prev, &b)
|
||||
if gap > thr {
|
||||
out = append(out, b)
|
||||
continue
|
||||
@@ -250,18 +252,18 @@ func TestVMWhitespaceGapBridge(t *testing.T) {
|
||||
t.Logf("Gap with bridge: 420.16 - 406.79 = %.2f < %.2f = MERGE", 420.16-406.79, thr)
|
||||
|
||||
// The manual vWithWS (Python-like) and vNoWS (old Go pre-filter) still
|
||||
// differ — the mechanism is real. But production NaiveVerticalMerge now
|
||||
// differ — the mechanism is real. But production lyt.NaiveVerticalMerge now
|
||||
// handles whitespace inline (gap bridge), matching Python.
|
||||
if nWS == nNoWS {
|
||||
t.Error("Manual implementations should differ — the gap bridge mechanism is real")
|
||||
}
|
||||
|
||||
// Verify production NaiveVerticalMerge matches vWithWS (Python behavior).
|
||||
// Verify production lyt.NaiveVerticalMerge matches vWithWS (Python behavior).
|
||||
mhMap := map[int]float64{1: mh}
|
||||
mwMap := map[int]float64{1: 5}
|
||||
vmResult := lyt.NaiveVerticalMerge(boxes, mhMap, mwMap, false)
|
||||
t.Logf("NaiveVerticalMerge (production): %d sections", len(vmResult))
|
||||
t.Logf("lyt.NaiveVerticalMerge (production): %d sections", len(vmResult))
|
||||
if len(vmResult) != nWS {
|
||||
t.Errorf("NaiveVerticalMerge produced %d sections, want %d (Python-like with gap bridge)", len(vmResult), nWS)
|
||||
t.Errorf("lyt.NaiveVerticalMerge produced %d sections, want %d (Python-like with gap bridge)", len(vmResult), nWS)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,101 +0,0 @@
|
||||
package post
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"fmt"
|
||||
"image"
|
||||
"image/png"
|
||||
)
|
||||
|
||||
// ── chat driver interface (self-contained, avoids entity/models import) ──
|
||||
|
||||
// The types below are local copies of the chat-model contract so that this
// file does not have to import the model packages (the original note says
// that import chain would require CGO).
type (
	// ChatDriver is the one method a vision-capable chat backend must
	// provide for image description.
	ChatDriver interface {
		ChatWithMessages(modelName string, messages []ChatMessage, apiConfig *ChatAPIConfig, chatConfig *ChatConfig) (*ChatResponse, error)
	}

	// ChatMessage is a single chat turn; intended to mirror
	// modelModule.Message. Content may be a plain string or a list of
	// typed content parts.
	ChatMessage struct {
		Role       string           `json:"role"`
		Content    any              `json:"content"`
		ToolCallID string           `json:"tool_call_id,omitempty"`
		ToolCalls  []map[string]any `json:"tool_calls,omitempty"`
	}

	// ChatAPIConfig carries optional connection settings; intended to
	// mirror modelModule.APIConfig. A nil field means "not set".
	ChatAPIConfig struct {
		ApiKey  *string
		Region  *string
		BaseURL *string
	}

	// ChatConfig is the per-call chat configuration; intended to mirror
	// modelModule.ChatConfig. It has no fields here and callers may pass nil.
	ChatConfig struct{}

	// ChatResponse is the driver's reply; intended to mirror
	// modelModule.ChatResponse.
	ChatResponse struct {
		Answer        *string          `json:"answer"`
		ReasonContent *string          `json:"reason_content"`
		ToolCalls     []map[string]any `json:"tool_calls,omitempty"`
	}
)
|
||||
|
||||
// ── ModelImageDescriber ────────────────────────────────────────────────
|
||||
|
||||
// ModelImageDescriber implements ImageDescriber via any ChatDriver.
|
||||
type ModelImageDescriber struct {
|
||||
driver ChatDriver
|
||||
modelName string
|
||||
apiConfig *ChatAPIConfig
|
||||
maxTokens int
|
||||
}
|
||||
|
||||
// NewModelImageDescriber creates a ModelImageDescriber that calls the given
|
||||
// driver to describe images. maxTokens sets the response length limit (passed
|
||||
// as ChatConfig.MaxTokens); 0 means use provider default.
|
||||
func NewModelImageDescriber(d ChatDriver, name string, cfg *ChatAPIConfig, maxTokens int) *ModelImageDescriber {
|
||||
return &ModelImageDescriber{driver: d, modelName: name, apiConfig: cfg, maxTokens: maxTokens}
|
||||
}
|
||||
|
||||
// DescribeImage sends the image as a base64 data URL in an OpenAI-compatible
|
||||
// vision API request. Returns the model's text response.
|
||||
func (d *ModelImageDescriber) DescribeImage(ctx context.Context, img image.Image, prompt string) (string, error) {
|
||||
dataURL, err := encodeImageToBase64DataURL(img)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("image encode: %w", err)
|
||||
}
|
||||
|
||||
msgs := []ChatMessage{{
|
||||
Role: "user",
|
||||
Content: []interface{}{
|
||||
map[string]interface{}{"type": "text", "text": prompt},
|
||||
map[string]interface{}{"type": "image_url", "image_url": map[string]string{"url": dataURL}},
|
||||
},
|
||||
}}
|
||||
|
||||
var chatCfg *ChatConfig
|
||||
if d.maxTokens > 0 {
|
||||
chatCfg = &ChatConfig{}
|
||||
}
|
||||
resp, err := d.driver.ChatWithMessages(d.modelName, msgs, d.apiConfig, chatCfg)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("image describe: %w", err)
|
||||
}
|
||||
if resp.Answer == nil || *resp.Answer == "" {
|
||||
return "", errors.New("image describe: empty response")
|
||||
}
|
||||
return *resp.Answer, nil
|
||||
}
|
||||
|
||||
// encodeImageToBase64DataURL PNG-encodes img and returns it as a
// "data:image/png;base64,..." URL using standard base64 encoding.
//
// It returns an error when img is nil (png.Encode would otherwise panic on
// the nil interface) or when PNG encoding fails.
func encodeImageToBase64DataURL(img image.Image) (string, error) {
	if img == nil {
		return "", errors.New("nil image")
	}
	var buf bytes.Buffer
	if err := png.Encode(&buf, img); err != nil {
		return "", err
	}
	return "data:image/png;base64," + base64.StdEncoding.EncodeToString(buf.Bytes()), nil
}
|
||||
@@ -1,79 +0,0 @@
|
||||
package post
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"image"
|
||||
"image/color"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// ── mock ChatDriver ────────────────────────────────────────────────────
|
||||
|
||||
type mockChatDriver struct {
|
||||
answer string
|
||||
err error
|
||||
}
|
||||
|
||||
func (m *mockChatDriver) ChatWithMessages(_ string, _ []ChatMessage, _ *ChatAPIConfig, _ *ChatConfig) (*ChatResponse, error) {
|
||||
if m.err != nil {
|
||||
return nil, m.err
|
||||
}
|
||||
a := m.answer
|
||||
return &ChatResponse{Answer: &a}, nil
|
||||
}
|
||||
|
||||
// ── ModelImageDescriber tests ──────────────────────────────────────────
|
||||
|
||||
func TestModelImageDescriber_Success(t *testing.T) {
|
||||
img := newTestImage(100, 100)
|
||||
want := "A chart showing revenue growth."
|
||||
driver := &mockChatDriver{answer: want}
|
||||
desc := NewModelImageDescriber(driver, "gpt-4o", nil, 0)
|
||||
|
||||
got, err := desc.DescribeImage(context.Background(), img, "Describe this chart")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if got != want {
|
||||
t.Errorf("got %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestModelImageDescriber_DriverError(t *testing.T) {
|
||||
img := newTestImage(100, 100)
|
||||
driver := &mockChatDriver{err: errors.New("API rate limited")}
|
||||
desc := NewModelImageDescriber(driver, "gpt-4o", nil, 0)
|
||||
|
||||
_, err := desc.DescribeImage(context.Background(), img, "prompt")
|
||||
if err == nil {
|
||||
t.Fatal("expected error, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestModelImageDescriber_EmptyAnswer(t *testing.T) {
|
||||
img := newTestImage(100, 100)
|
||||
driver := &mockChatDriver{answer: ""}
|
||||
desc := NewModelImageDescriber(driver, "gpt-4o", nil, 0)
|
||||
|
||||
_, err := desc.DescribeImage(context.Background(), img, "prompt")
|
||||
if err == nil {
|
||||
t.Fatal("expected error for empty answer, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
// ── encodeImageToBase64DataURL tests ───────────────────────────────────
|
||||
|
||||
func TestEncodeImageToBase64DataURL(t *testing.T) {
|
||||
img := image.NewRGBA(image.Rect(0, 0, 1, 1))
|
||||
img.Set(0, 0, color.RGBA{R: 255, G: 0, B: 0, A: 255})
|
||||
|
||||
url, err := encodeImageToBase64DataURL(img)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !strings.HasPrefix(url, "data:image/png;base64,") {
|
||||
t.Errorf("missing data URL prefix: %s...", url[:min(50, len(url))])
|
||||
}
|
||||
}
|
||||
@@ -1,114 +0,0 @@
|
||||
package post
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
pdftype "ragflow/internal/deepdoc/parser/pdf/type"
|
||||
)
|
||||
|
||||
// ── Tests for remove_toc config flag ────────────────────────────────────────
|
||||
|
||||
// TestPostProcess_RemoveTOC_DisabledByConfig verifies that when
|
||||
// remove_toc=false, outlines are NOT used to remove TOC pages even
|
||||
// when outlines are present.
|
||||
func TestPostProcess_RemoveTOC_DisabledByConfig(t *testing.T) {
|
||||
result := newTestResult(
|
||||
makePosSection("目录内容 page1", 1, 100, 500, 100, 200),
|
||||
makePosSection("更多目录 page2", 2, 100, 500, 100, 200),
|
||||
makePosSection("第一章 正文", 3, 100, 500, 100, 200),
|
||||
makePosSection("第二章 正文", 5, 100, 500, 100, 200),
|
||||
)
|
||||
outlines := []pdftype.Outline{
|
||||
{Title: "目录", Level: 0, PageNumber: 1},
|
||||
{Title: "第一章", Level: 0, PageNumber: 3},
|
||||
{Title: "第二章", Level: 0, PageNumber: 5},
|
||||
}
|
||||
|
||||
config := PipelineConfig{
|
||||
ConfigKeyRemoveTOC: false,
|
||||
ConfigKeyOutlines: outlines,
|
||||
}
|
||||
err := PostProcess(context.Background(), result, config)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(result.Sections) != 4 {
|
||||
t.Errorf("remove_toc=false should keep all sections, got %d", len(result.Sections))
|
||||
}
|
||||
}
|
||||
|
||||
// TestPostProcess_RemoveTOC_EnabledByConfig verifies that when
|
||||
// remove_toc=true and outlines are present, TOC pages are removed.
|
||||
func TestPostProcess_RemoveTOC_EnabledByConfig(t *testing.T) {
|
||||
result := newTestResult(
|
||||
makePosSection("目录内容 page1", 1, 100, 500, 100, 200),
|
||||
makePosSection("更多目录 page2", 2, 100, 500, 100, 200),
|
||||
makePosSection("第一章 正文", 3, 100, 500, 100, 200),
|
||||
makePosSection("第二章 正文", 5, 100, 500, 100, 200),
|
||||
)
|
||||
outlines := []pdftype.Outline{
|
||||
{Title: "目录", Level: 0, PageNumber: 1},
|
||||
{Title: "第一章", Level: 0, PageNumber: 3},
|
||||
{Title: "第二章", Level: 0, PageNumber: 5},
|
||||
}
|
||||
|
||||
config := PipelineConfig{
|
||||
ConfigKeyRemoveTOC: true,
|
||||
ConfigKeyOutlines: outlines,
|
||||
}
|
||||
err := PostProcess(context.Background(), result, config)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(result.Sections) != 2 {
|
||||
t.Errorf("remove_toc=true should remove TOC pages, got %d sections", len(result.Sections))
|
||||
}
|
||||
for _, s := range result.Sections {
|
||||
for _, p := range s.Positions {
|
||||
for _, pn := range p.PageNumbers {
|
||||
if pn < 3 {
|
||||
t.Errorf("TOC page %d should have been removed: section %q", pn, s.Text)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestPostProcess_RemoveTOC_NoOutlines verifies that when no outlines
|
||||
// are passed, no TOC removal happens.
|
||||
func TestPostProcess_RemoveTOC_NoOutlines(t *testing.T) {
|
||||
result := newTestResult(
|
||||
makePosSection("目录内容", 1, 100, 500, 100, 200),
|
||||
makePosSection("第一章 正文", 3, 100, 500, 100, 200),
|
||||
)
|
||||
config := PipelineConfig{
|
||||
ConfigKeyRemoveTOC: true,
|
||||
}
|
||||
err := PostProcess(context.Background(), result, config)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(result.Sections) != 2 {
|
||||
t.Errorf("no outlines → all sections kept, got %d", len(result.Sections))
|
||||
}
|
||||
}
|
||||
|
||||
// TestPostProcess_RemoveTOC_EmptyOutlines verifies empty outlines array is no-op.
|
||||
func TestPostProcess_RemoveTOC_EmptyOutlines(t *testing.T) {
|
||||
result := newTestResult(
|
||||
makePosSection("目录", 1, 100, 500, 100, 200),
|
||||
makePosSection("正文", 2, 100, 500, 100, 200),
|
||||
)
|
||||
config := PipelineConfig{
|
||||
ConfigKeyRemoveTOC: true,
|
||||
ConfigKeyOutlines: []pdftype.Outline{},
|
||||
}
|
||||
err := PostProcess(context.Background(), result, config)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(result.Sections) != 2 {
|
||||
t.Errorf("empty outlines → all sections kept, got %d", len(result.Sections))
|
||||
}
|
||||
}
|
||||
@@ -1,436 +0,0 @@
|
||||
package post
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"math"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
pdftype "ragflow/internal/deepdoc/parser/pdf/type"
|
||||
"ragflow/internal/deepdoc/parser/pdf/util"
|
||||
)
|
||||
|
||||
// ── Config ─────────────────────────────────────────────────────────────
|
||||
|
||||
// Keys understood by PostProcess when reading a PipelineConfig.
const (
	// ConfigKeyPageWidth (float64): page width; a value > 0 enables
	// multi-column reordering.
	ConfigKeyPageWidth = "page_width"
	// ConfigKeyZoom (float64): zoom factor the page width is divided by;
	// missing or non-positive values are treated as 1.0.
	ConfigKeyZoom = "zoom"
	// ConfigKeyOutlines ([]pdftype.Outline): outlines used for TOC removal.
	ConfigKeyOutlines = "outlines"
	// ConfigKeyFlattenMediaToText (bool): mark every section as plain text
	// and drop its image.
	ConfigKeyFlattenMediaToText = "flatten_media_to_text"
	// ConfigKeyTenantID (string): tenant used to resolve the vision model.
	ConfigKeyTenantID = "tenant_id"
	// ConfigKeyVLMLLMID (string): identifier of the vision model to resolve.
	ConfigKeyVLMLLMID = "vlm_llm_id"
	// ConfigKeyRemoveTOC (bool): enable removal of table-of-contents pages.
	ConfigKeyRemoveTOC = "remove_toc"
)
|
||||
|
||||
// PipelineConfig is the loosely typed key/value bag from which
// post-processing reads its parameters. A nil PipelineConfig is valid and
// behaves like an empty one.
type PipelineConfig map[string]any

// Float64 returns the value stored under key when it is a float64.
// Any other situation — nil config, missing key, value of another type
// (including integer types) — yields default_.
func (c PipelineConfig) Float64(key string, default_ float64) float64 {
	// Indexing a nil map is safe, and a missing key yields a nil interface
	// that fails the type assertion, so one check covers every fallback.
	if f, ok := c[key].(float64); ok {
		return f
	}
	return default_
}

// Bool returns the value stored under key when it is a bool, and false for a
// nil config, a missing key, or a value of another type.
func (c PipelineConfig) Bool(key string) bool {
	flag, _ := c[key].(bool)
	return flag
}
|
||||
|
||||
// Outlines returns the []pdftype.Outline value for ConfigKeyOutlines.
|
||||
func (c PipelineConfig) Outlines() []pdftype.Outline {
|
||||
if c == nil {
|
||||
return nil
|
||||
}
|
||||
v, ok := c[ConfigKeyOutlines]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
o, ok := v.([]pdftype.Outline)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
return o
|
||||
}
|
||||
|
||||
// String returns the string value for key. Returns "" if absent or wrong type.
|
||||
func (c PipelineConfig) String(key string) string {
|
||||
if c == nil {
|
||||
return ""
|
||||
}
|
||||
v, ok := c[key]
|
||||
if !ok {
|
||||
return ""
|
||||
}
|
||||
s, ok := v.(string)
|
||||
if !ok {
|
||||
return ""
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// ── Patterns ───────────────────────────────────────────────────────────
|
||||
|
||||
var (
	// headerFooterPattern recognises layout types that count as page
	// furniture. It is unanchored, so it matches any layout type that
	// contains one of the words. The Python counterpart cited by the
	// original note (parser.py:637) is r"(header|footer|number)"; this
	// pattern additionally includes "reference".
	headerFooterPattern = regexp.MustCompile(`(header|footer|number|reference)`)

	// tocTitlePattern recognises outline titles that mark a
	// table-of-contents (or acknowledgement) entry. The whole title must
	// equal one of the alternatives, compared case-insensitively.
	// Python: r"(contents|目录|目次|table of contents|致谢|acknowledge)$"
	tocTitlePattern = regexp.MustCompile(`(?i)^(contents|目录|目次|table of contents|致谢|acknowledge)$`)
)
|
||||
|
||||
// ── PostProcess ────────────────────────────────────────────────────────
|
||||
|
||||
// PostProcess applies PDF post-processing to a ParseResult in-place.
|
||||
// The config map controls which features to enable.
|
||||
//
|
||||
// Execution order (matches Python _pdf):
|
||||
// 1. reorderMultiColumn — if page_width > 0
|
||||
// 2. removeTOCByOutlines — if outlines present
|
||||
// 3. normalizeLayoutType — always
|
||||
// 4. filterHeaderFooter — always
|
||||
// 5. assignDocTypeKwd — always (respects flatten_media_to_text)
|
||||
// 6. enhanceWithVision — if image_describer present
|
||||
func PostProcess(ctx context.Context, result *pdftype.ParseResult, config PipelineConfig) error {
|
||||
if result == nil {
|
||||
return errors.New("PostProcess: nil result")
|
||||
}
|
||||
if config == nil {
|
||||
config = PipelineConfig{}
|
||||
}
|
||||
|
||||
// 1. Multi-column reorder
|
||||
pw := config.Float64(ConfigKeyPageWidth, 0)
|
||||
if pw > 0 {
|
||||
zoom := config.Float64(ConfigKeyZoom, 1.0)
|
||||
if zoom <= 0 {
|
||||
zoom = 1.0
|
||||
}
|
||||
reorderMultiColumn(result, pw, zoom)
|
||||
}
|
||||
|
||||
// 2. Remove TOC pages (only when explicitly enabled).
|
||||
// Outlines from config take precedence; otherwise read from ParseResult.
|
||||
outlines := config.Outlines()
|
||||
if len(outlines) == 0 {
|
||||
outlines = result.Outlines
|
||||
}
|
||||
if config.Bool(ConfigKeyRemoveTOC) && len(outlines) > 0 {
|
||||
removeTOCByOutlines(result, outlines)
|
||||
}
|
||||
|
||||
// 3-5. Always-on steps
|
||||
normalizeLayoutType(result)
|
||||
filterHeaderFooter(result)
|
||||
assignDocTypeKwd(result, config.Bool(ConfigKeyFlattenMediaToText))
|
||||
|
||||
// 6. VLM enhancement
|
||||
tenantID := config.String(ConfigKeyTenantID)
|
||||
vlmLLMID := config.String(ConfigKeyVLMLLMID)
|
||||
if tenantID != "" && vlmLLMID != "" {
|
||||
describer, err := resolveImageDescriber(tenantID, vlmLLMID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := enhanceWithVision(ctx, result, describer); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// resolveImageDescriber resolves a VLM model from tenant config and returns
|
||||
// an ImageDescriber. Corresponds to Python's
|
||||
// get_model_config_from_provider_instance + LLMBundle.
|
||||
// resolveImageDescriber resolves a VLM model from tenant config and returns
|
||||
// an ImageDescriber. The implementation is assigned by init() in
|
||||
// post_steps_cgo.go (production) or post_steps_no_cgo.go (stub).
|
||||
// Overridable in tests.
|
||||
var resolveImageDescriber func(tenantID, llmID string) (ImageDescriber, error)
|
||||
|
||||
// SetImageDescriberResolver sets the factory that creates an ImageDescriber
|
||||
// from tenant/LLM configuration. Higher layers (e.g. EE extensions or the
|
||||
// PDF document pipeline entry point) register the real implementation via
|
||||
// init(). If never called, PostProcess skips VLM enhancement.
|
||||
func SetImageDescriberResolver(fn func(tenantID, llmID string) (ImageDescriber, error)) {
|
||||
resolveImageDescriber = fn
|
||||
}
|
||||
|
||||
// ── normalizeLayoutType ────────────────────────────────────────────────
|
||||
|
||||
// normalizeLayoutType trims whitespace from LayoutType and defaults empty
|
||||
// values to "text". Matches Python's layout_type normalization in parser.py.
|
||||
func normalizeLayoutType(result *pdftype.ParseResult) {
|
||||
for i := range result.Sections {
|
||||
lt := strings.TrimSpace(result.Sections[i].LayoutType)
|
||||
if lt == "" {
|
||||
lt = "text"
|
||||
}
|
||||
result.Sections[i].LayoutType = lt
|
||||
}
|
||||
}
|
||||
|
||||
// ── filterHeaderFooter ─────────────────────────────────────────────────
|
||||
|
||||
// filterHeaderFooter removes sections whose LayoutType matches
|
||||
// header/footer/number/reference. Python: remove_header_footer config.
|
||||
func filterHeaderFooter(result *pdftype.ParseResult) {
|
||||
sections := result.Sections[:0]
|
||||
for _, s := range result.Sections {
|
||||
if headerFooterPattern.MatchString(strings.TrimSpace(s.LayoutType)) {
|
||||
continue
|
||||
}
|
||||
sections = append(sections, s)
|
||||
}
|
||||
result.Sections = sections
|
||||
}
|
||||
|
||||
// ── assignDocTypeKwd ───────────────────────────────────────────────────
|
||||
|
||||
// assignDocTypeKwd sets DocTypeKwd based on LayoutType and Image presence.
|
||||
// When flatten is true, all sections become "text" and Image is cleared —
|
||||
// this matches Python where flatten_media_to_text and VLM are mutually
|
||||
// exclusive. Python: parser.py:639-648.
|
||||
func assignDocTypeKwd(result *pdftype.ParseResult, flatten bool) {
|
||||
for i := range result.Sections {
|
||||
s := &result.Sections[i]
|
||||
if flatten {
|
||||
s.DocTypeKwd = "text"
|
||||
s.Image = ""
|
||||
continue
|
||||
}
|
||||
lt := strings.TrimSpace(s.LayoutType)
|
||||
switch lt {
|
||||
case "table":
|
||||
s.DocTypeKwd = "table"
|
||||
case "figure":
|
||||
s.DocTypeKwd = "image"
|
||||
default:
|
||||
if lt == "" && s.Image != "" {
|
||||
s.DocTypeKwd = "image"
|
||||
} else {
|
||||
s.DocTypeKwd = "text"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── enhanceWithVision ──────────────────────────────────────────────────
|
||||
|
||||
// enhanceWithVision adds VLM-generated descriptions to image/table sections.
|
||||
func enhanceWithVision(ctx context.Context, result *pdftype.ParseResult, describer ImageDescriber) error {
|
||||
if describer == nil {
|
||||
return nil
|
||||
}
|
||||
if len(result.Sections) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
sem := make(chan struct{}, maxDescribeConcurrency)
|
||||
var wg sync.WaitGroup
|
||||
|
||||
for i := range result.Sections {
|
||||
s := &result.Sections[i]
|
||||
if s.DocTypeKwd != "table" && s.DocTypeKwd != "image" {
|
||||
continue
|
||||
}
|
||||
if s.Image == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
wg.Add(1)
|
||||
sem <- struct{}{}
|
||||
go func(idx int, imgB64 string, origText string) {
|
||||
defer wg.Done()
|
||||
defer func() { <-sem }()
|
||||
|
||||
img, err := util.DecodeBase64PNG(imgB64)
|
||||
if err != nil || img == nil {
|
||||
return
|
||||
}
|
||||
desc, err := DescribeImage(ctx, img, describePrompt, describer)
|
||||
if err != nil || desc == "" {
|
||||
return
|
||||
}
|
||||
|
||||
if origText != "" {
|
||||
result.Sections[idx].Text = origText + "\n" + desc
|
||||
} else {
|
||||
result.Sections[idx].Text = desc
|
||||
}
|
||||
}(i, s.Image, s.Text)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ── removeTOCByOutlines ────────────────────────────────────────────────
|
||||
|
||||
// removeTOCByOutlines removes sections whose page numbers fall inside
|
||||
// TOC page ranges identified by PDF outlines.
|
||||
func removeTOCByOutlines(result *pdftype.ParseResult, outlines []pdftype.Outline) {
|
||||
if len(outlines) == 0 {
|
||||
return
|
||||
}
|
||||
tocPage, contentPage := findTOCPageRange(outlines)
|
||||
if contentPage <= tocPage {
|
||||
return
|
||||
}
|
||||
sections := result.Sections[:0]
|
||||
for _, s := range result.Sections {
|
||||
pg := sectionPage(s)
|
||||
if pg >= tocPage && pg < contentPage {
|
||||
continue
|
||||
}
|
||||
sections = append(sections, s)
|
||||
}
|
||||
result.Sections = sections
|
||||
}
|
||||
|
||||
// findTOCPageRange scans outlines for a TOC entry and returns the
|
||||
// [tocStartPage, contentStartPage) range. Returns (0, 0) when not found.
|
||||
func findTOCPageRange(outlines []pdftype.Outline) (tocPage, contentPage int) {
|
||||
trimSplit:
|
||||
for i, o := range outlines {
|
||||
title := strings.TrimSpace(o.Title)
|
||||
if idx := strings.Index(title, "@@"); idx >= 0 {
|
||||
title = strings.TrimSpace(title[:idx])
|
||||
}
|
||||
if !tocTitlePattern.MatchString(strings.ToLower(title)) {
|
||||
continue
|
||||
}
|
||||
tocPage = o.PageNumber
|
||||
for _, next := range outlines[i+1:] {
|
||||
if next.Level != o.Level {
|
||||
continue
|
||||
}
|
||||
nt := strings.TrimSpace(next.Title)
|
||||
if idx := strings.Index(nt, "@@"); idx >= 0 {
|
||||
nt = strings.TrimSpace(nt[:idx])
|
||||
}
|
||||
if tocTitlePattern.MatchString(strings.ToLower(nt)) {
|
||||
continue
|
||||
}
|
||||
contentPage = next.PageNumber
|
||||
break trimSplit
|
||||
}
|
||||
break
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// sectionPage returns the first page number of a Section, or 0.
|
||||
func sectionPage(s pdftype.Section) int {
|
||||
for _, p := range s.Positions {
|
||||
for _, pn := range p.PageNumbers {
|
||||
return pn
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// ── reorderMultiColumn ─────────────────────────────────────────────────
|
||||
|
||||
// reorderMultiColumn reorders text sections in multi-column layouts.
|
||||
// If median text column width >= page width / 2 (single-column layout),
|
||||
// the input order is preserved.
|
||||
//
|
||||
// Python: reorder_multi_column_bboxes + sort_X_by_page
|
||||
func reorderMultiColumn(result *pdftype.ParseResult, pageWidth, zoom float64) {
|
||||
if len(result.Sections) < 2 {
|
||||
return
|
||||
}
|
||||
pw := pageWidth / zoom
|
||||
|
||||
// Compute median width from text sections with valid coordinates.
|
||||
var widths []float64
|
||||
for _, s := range result.Sections {
|
||||
if s.LayoutType != "text" {
|
||||
continue
|
||||
}
|
||||
if len(s.Positions) == 0 {
|
||||
continue
|
||||
}
|
||||
w := s.Positions[0].Right - s.Positions[0].Left
|
||||
if w > 0 {
|
||||
widths = append(widths, w)
|
||||
}
|
||||
}
|
||||
if len(widths) == 0 {
|
||||
return
|
||||
}
|
||||
sort.Float64s(widths)
|
||||
medianW := widths[len(widths)/2]
|
||||
|
||||
if medianW >= pw/2 {
|
||||
return // single column
|
||||
}
|
||||
|
||||
// Sort by (PageNumber, X0, Top).
|
||||
sort.Slice(result.Sections, func(i, j int) bool {
|
||||
pi := sectionPage(result.Sections[i])
|
||||
pj := sectionPage(result.Sections[j])
|
||||
if pi != pj {
|
||||
return pi < pj
|
||||
}
|
||||
xi := sectionX0(result.Sections[i])
|
||||
xj := sectionX0(result.Sections[j])
|
||||
if math.Abs(xi-xj) > 1e-6 {
|
||||
return xi < xj
|
||||
}
|
||||
return sectionTop(result.Sections[i]) < sectionTop(result.Sections[j])
|
||||
})
|
||||
|
||||
threshold := medianW / 2
|
||||
// Correct same-page sections with nearly-same X0 but inverted Top.
|
||||
for i := len(result.Sections) - 1; i >= 1; i-- {
|
||||
for j := i - 1; j >= 0; j-- {
|
||||
if math.Abs(sectionX0(result.Sections[j+1])-sectionX0(result.Sections[j])) < threshold &&
|
||||
sectionTop(result.Sections[j+1]) < sectionTop(result.Sections[j]) &&
|
||||
sectionPage(result.Sections[j+1]) == sectionPage(result.Sections[j]) {
|
||||
result.Sections[j], result.Sections[j+1] = result.Sections[j+1], result.Sections[j]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func sectionX0(s pdftype.Section) float64 {
|
||||
for _, p := range s.Positions {
|
||||
return p.Left
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func sectionTop(s pdftype.Section) float64 {
|
||||
for _, p := range s.Positions {
|
||||
return p.Top
|
||||
}
|
||||
return 0
|
||||
}
|
||||
@@ -1,434 +0,0 @@
|
||||
package post
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
pdftype "ragflow/internal/deepdoc/parser/pdf/type"
|
||||
)
|
||||
|
||||
// ── helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
// dummyBase64PNG is a valid 50×50 red pixel PNG, base64-encoded.
|
||||
const dummyBase64PNG = "iVBORw0KGgoAAAANSUhEUgAAADIAAAAyCAIAAACRXR/mAAAAUElEQVR4nOzOsREAEAAAMefsvzILaL6iSCbI2uNH83XgTqvQKrQKrUKr0Cq0Cq1Cq9AqtAqtQqvQKrQKrUKr0Cq0Cq1Cq9AqtAqt4gQAAP//miQBZqrF+JAAAAAASUVORK5CYII="
|
||||
|
||||
func newTestResult(sections ...pdftype.Section) *pdftype.ParseResult {
|
||||
return &pdftype.ParseResult{Sections: sections}
|
||||
}
|
||||
|
||||
func makePosSection(text string, page int, x0, x1, top, bottom float64) pdftype.Section {
|
||||
return pdftype.Section{
|
||||
Text: text,
|
||||
LayoutType: "text",
|
||||
Positions: []pdftype.Position{{PageNumbers: []int{page}, Left: x0, Right: x1, Top: top, Bottom: bottom}},
|
||||
}
|
||||
}
|
||||
|
||||
// ── normalizeLayoutType ────────────────────────────────────────────────
|
||||
|
||||
func TestNormalizeLayoutType(t *testing.T) {
|
||||
result := newTestResult(
|
||||
pdftype.Section{Text: "a", LayoutType: ""},
|
||||
pdftype.Section{Text: "b", LayoutType: " "},
|
||||
pdftype.Section{Text: "c", LayoutType: "table"},
|
||||
pdftype.Section{Text: "d", LayoutType: " figure "},
|
||||
pdftype.Section{Text: "e", LayoutType: "text"},
|
||||
)
|
||||
normalizeLayoutType(result)
|
||||
want := []string{"text", "text", "table", "figure", "text"}
|
||||
for i, s := range result.Sections {
|
||||
if s.LayoutType != want[i] {
|
||||
t.Errorf("Sections[%d]: got %q, want %q", i, s.LayoutType, want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── filterHeaderFooter ─────────────────────────────────────────────────
|
||||
|
||||
func TestFilterHeaderFooter(t *testing.T) {
|
||||
result := newTestResult(
|
||||
pdftype.Section{Text: "Page 1", LayoutType: "header"},
|
||||
pdftype.Section{Text: "Chapter 1", LayoutType: "text"},
|
||||
pdftype.Section{LayoutType: "footer"},
|
||||
pdftype.Section{LayoutType: "number"},
|
||||
pdftype.Section{Text: "Body", LayoutType: "text"},
|
||||
pdftype.Section{Text: "reference item", LayoutType: "reference"},
|
||||
)
|
||||
filterHeaderFooter(result)
|
||||
if len(result.Sections) != 2 {
|
||||
t.Fatalf("expected 2 sections, got %d: %+v", len(result.Sections), result.Sections)
|
||||
}
|
||||
if result.Sections[0].Text != "Chapter 1" || result.Sections[1].Text != "Body" {
|
||||
t.Errorf("wrong sections kept: %+v", result.Sections)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFilterHeaderFooter_Empty(t *testing.T) {
|
||||
result := newTestResult()
|
||||
filterHeaderFooter(result)
|
||||
if len(result.Sections) != 0 {
|
||||
t.Error("expected empty result")
|
||||
}
|
||||
}
|
||||
|
||||
// ── assignDocTypeKwd ───────────────────────────────────────────────────
|
||||
|
||||
func TestAssignDocTypeKwd_Normal(t *testing.T) {
|
||||
result := newTestResult(
|
||||
pdftype.Section{Text: "a", LayoutType: "table"},
|
||||
pdftype.Section{Text: "b", LayoutType: "figure"},
|
||||
pdftype.Section{Text: "c", LayoutType: "equation"},
|
||||
pdftype.Section{Text: "d", LayoutType: "", Image: dummyBase64PNG},
|
||||
pdftype.Section{Text: "e", LayoutType: "text"},
|
||||
pdftype.Section{Text: "f", LayoutType: ""},
|
||||
)
|
||||
assignDocTypeKwd(result, false)
|
||||
want := []string{"table", "image", "text", "image", "text", "text"}
|
||||
for i, s := range result.Sections {
|
||||
if s.DocTypeKwd != want[i] {
|
||||
t.Errorf("Sections[%d]: got %q, want %q", i, s.DocTypeKwd, want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAssignDocTypeKwd_Flatten(t *testing.T) {
|
||||
result := newTestResult(
|
||||
pdftype.Section{Text: "a", LayoutType: "table", DocTypeKwd: "table", Image: dummyBase64PNG},
|
||||
pdftype.Section{Text: "b", LayoutType: "figure", DocTypeKwd: "image", Image: dummyBase64PNG},
|
||||
pdftype.Section{Text: "c", LayoutType: "text", DocTypeKwd: "text"},
|
||||
)
|
||||
assignDocTypeKwd(result, true)
|
||||
for _, s := range result.Sections {
|
||||
if s.DocTypeKwd != "text" {
|
||||
t.Errorf("expected all 'text', got %q", s.DocTypeKwd)
|
||||
}
|
||||
if s.Image != "" {
|
||||
t.Error("flatten should clear Image to prevent VLM enhancement")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── enhanceWithVision ──────────────────────────────────────────────────
|
||||
|
||||
func TestEnhanceWithVision_NoOp(t *testing.T) {
|
||||
result := newTestResult(
|
||||
pdftype.Section{Text: "original", Image: dummyBase64PNG, DocTypeKwd: "table"},
|
||||
)
|
||||
_ = enhanceWithVision(context.Background(), result, nil)
|
||||
if result.Sections[0].Text != "original" {
|
||||
t.Errorf("text changed when describer is nil: %q", result.Sections[0].Text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnhanceWithVision_Success(t *testing.T) {
|
||||
want := "A table showing Q1 revenue."
|
||||
desc := &mockImageDescriber{describe: want}
|
||||
|
||||
result := newTestResult(
|
||||
pdftype.Section{Text: "", Image: dummyBase64PNG, DocTypeKwd: "table"},
|
||||
)
|
||||
if err := enhanceWithVision(context.Background(), result, desc); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if result.Sections[0].Text != want {
|
||||
t.Errorf("text not enhanced: got %q", result.Sections[0].Text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnhanceWithVision_SkipText(t *testing.T) {
|
||||
desc := &mockImageDescriber{describe: "should not be called"}
|
||||
|
||||
result := newTestResult(
|
||||
pdftype.Section{Text: "plain text", DocTypeKwd: "text", Image: ""},
|
||||
)
|
||||
if err := enhanceWithVision(context.Background(), result, desc); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if result.Sections[0].Text != "plain text" {
|
||||
t.Errorf("text changed: %q", result.Sections[0].Text)
|
||||
}
|
||||
}
|
||||
|
||||
// ── removeTOCByOutlines ────────────────────────────────────────────────
|
||||
|
||||
func TestRemoveTOCByOutlines_Removes(t *testing.T) {
|
||||
outlines := []pdftype.Outline{
|
||||
{Title: "Chapter 1 Introduction", Level: 0, PageNumber: 1},
|
||||
{Title: "目录", Level: 0, PageNumber: 3},
|
||||
{Title: "Chapter 2 Methods", Level: 0, PageNumber: 5},
|
||||
}
|
||||
result := newTestResult(
|
||||
makePosSection("s1", 1, 50, 550, 100, 120),
|
||||
makePosSection("s2", 2, 50, 550, 100, 120),
|
||||
makePosSection("toc1", 3, 50, 550, 100, 120),
|
||||
makePosSection("toc2", 4, 50, 550, 100, 120),
|
||||
makePosSection("body1", 5, 50, 550, 100, 120),
|
||||
makePosSection("body2", 6, 50, 550, 100, 120),
|
||||
)
|
||||
removeTOCByOutlines(result, outlines)
|
||||
if len(result.Sections) != 4 {
|
||||
t.Fatalf("expected 4 sections, got %d", len(result.Sections))
|
||||
}
|
||||
if result.Sections[0].Text != "s1" || result.Sections[1].Text != "s2" {
|
||||
t.Error("pre-TOC pages should be kept")
|
||||
}
|
||||
if result.Sections[2].Text != "body1" || result.Sections[3].Text != "body2" {
|
||||
t.Error("post-TOC pages should be kept")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRemoveTOCByOutlines_NoMatch(t *testing.T) {
|
||||
outlines := []pdftype.Outline{
|
||||
{Title: "1. Introduction", Level: 0, PageNumber: 1},
|
||||
{Title: "2. Background", Level: 0, PageNumber: 3},
|
||||
}
|
||||
result := newTestResult(
|
||||
makePosSection("s1", 1, 50, 550, 100, 120),
|
||||
makePosSection("s2", 2, 50, 550, 100, 120),
|
||||
)
|
||||
removeTOCByOutlines(result, outlines)
|
||||
if len(result.Sections) != 2 {
|
||||
t.Errorf("expected 2 sections, got %d (no TOC should mean no removal)", len(result.Sections))
|
||||
}
|
||||
}
|
||||
|
||||
func TestRemoveTOCByOutlines_NilOutlines(t *testing.T) {
|
||||
result := newTestResult(makePosSection("a", 1, 50, 550, 100, 120))
|
||||
removeTOCByOutlines(result, nil)
|
||||
if len(result.Sections) != 1 {
|
||||
t.Errorf("nil outlines should be no-op: got %d sections", len(result.Sections))
|
||||
}
|
||||
}
|
||||
|
||||
func TestRemoveTOCByOutlines_EmptyOutlines(t *testing.T) {
|
||||
result := newTestResult(makePosSection("a", 1, 50, 550, 100, 120))
|
||||
removeTOCByOutlines(result, []pdftype.Outline{})
|
||||
if len(result.Sections) != 1 {
|
||||
t.Errorf("empty outlines should be no-op: got %d sections", len(result.Sections))
|
||||
}
|
||||
}
|
||||
|
||||
func TestRemoveTOCByOutlines_NoNext(t *testing.T) {
|
||||
outlines := []pdftype.Outline{
|
||||
{Title: "目录", Level: 0, PageNumber: 2},
|
||||
}
|
||||
result := newTestResult(
|
||||
makePosSection("toc", 2, 50, 550, 100, 120),
|
||||
makePosSection("body", 3, 50, 550, 100, 120),
|
||||
)
|
||||
removeTOCByOutlines(result, outlines)
|
||||
if len(result.Sections) != 2 {
|
||||
t.Errorf("no next outline → keep all sections: got %d", len(result.Sections))
|
||||
}
|
||||
}
|
||||
|
||||
// ── reorderMultiColumn ─────────────────────────────────────────────────
|
||||
|
||||
func TestReorderMultiColumn_SingleCol(t *testing.T) {
|
||||
result := newTestResult(
|
||||
makePosSection("B", 0, 50, 550, 200, 220),
|
||||
makePosSection("A", 0, 50, 550, 100, 120),
|
||||
)
|
||||
reorderMultiColumn(result, 600.0, 1.0)
|
||||
// medianW=500 >= 300 → single col, order preserved
|
||||
if result.Sections[0].Text != "B" {
|
||||
t.Fatal("single column should preserve original order")
|
||||
}
|
||||
}
|
||||
|
||||
func TestReorderMultiColumn_MultiCol(t *testing.T) {
|
||||
result := newTestResult(
|
||||
makePosSection("B", 0, 300, 500, 100, 120),
|
||||
makePosSection("A", 0, 50, 250, 100, 120),
|
||||
)
|
||||
reorderMultiColumn(result, 600.0, 1.0)
|
||||
if result.Sections[0].Positions[0].Left > result.Sections[1].Positions[0].Left {
|
||||
t.Log("multi-column: sections reordered")
|
||||
}
|
||||
}
|
||||
|
||||
func TestReorderMultiColumn_Empty(t *testing.T) {
|
||||
result := newTestResult()
|
||||
reorderMultiColumn(result, 600.0, 1.0)
|
||||
if len(result.Sections) != 0 {
|
||||
t.Error("empty sections should remain empty")
|
||||
}
|
||||
}
|
||||
|
||||
func TestReorderMultiColumn_NoText(t *testing.T) {
|
||||
result := newTestResult(
|
||||
pdftype.Section{Text: "t1", LayoutType: "table", Positions: []pdftype.Position{{PageNumbers: []int{0}, Left: 300, Right: 500, Top: 100, Bottom: 120}}},
|
||||
pdftype.Section{Text: "t2", LayoutType: "table", Positions: []pdftype.Position{{PageNumbers: []int{0}, Left: 50, Right: 250, Top: 100, Bottom: 120}}},
|
||||
)
|
||||
reorderMultiColumn(result, 600.0, 1.0)
|
||||
if len(result.Sections) != 2 {
|
||||
t.Fatal("expected 2 sections")
|
||||
}
|
||||
}
|
||||
|
||||
// ── PostProcess integration ────────────────────────────────────────────
|
||||
|
||||
func TestPostProcess_FullPipeline(t *testing.T) {
|
||||
// Simulates post-processing after Parse(): all features enabled.
|
||||
result := newTestResult(
|
||||
// Page 1: TOC — should be removed
|
||||
pdftype.Section{Text: "目录", LayoutType: "text", Positions: []pdftype.Position{{PageNumbers: []int{1}, Left: 50, Right: 550, Top: 100, Bottom: 120}}},
|
||||
pdftype.Section{Text: "Chapter 1 ... 1", LayoutType: "text", Positions: []pdftype.Position{{PageNumbers: []int{1}, Left: 50, Right: 550, Top: 120, Bottom: 140}}},
|
||||
// Page 1: header — should be removed
|
||||
pdftype.Section{Text: "Page 1", LayoutType: "header", Positions: []pdftype.Position{{PageNumbers: []int{1}, Left: 500, Right: 550, Top: 10, Bottom: 20}}},
|
||||
// Page 3: actual content
|
||||
pdftype.Section{Text: "Introduction text", LayoutType: "", Positions: []pdftype.Position{{PageNumbers: []int{3}, Left: 50, Right: 550, Top: 100, Bottom: 120}}},
|
||||
pdftype.Section{Text: "Row1 Col1 Row1 Col2", LayoutType: "table", Positions: []pdftype.Position{{PageNumbers: []int{3}, Left: 50, Right: 550, Top: 200, Bottom: 300}}, Image: dummyBase64PNG},
|
||||
pdftype.Section{Text: "Chart description", LayoutType: "figure", Positions: []pdftype.Position{{PageNumbers: []int{3}, Left: 50, Right: 550, Top: 300, Bottom: 400}}, Image: dummyBase64PNG},
|
||||
// Page 4: footer — should be removed
|
||||
pdftype.Section{Text: "Confidential", LayoutType: "footer", Positions: []pdftype.Position{{PageNumbers: []int{4}, Left: 50, Right: 550, Top: 700, Bottom: 720}}},
|
||||
)
|
||||
|
||||
outlines := []pdftype.Outline{
|
||||
{Title: "目录", Level: 0, PageNumber: 1},
|
||||
{Title: "Chapter 1 Introduction", Level: 0, PageNumber: 3},
|
||||
}
|
||||
|
||||
wantVLM := "This table shows quarterly revenue data with 2 columns."
|
||||
describer := &mockImageDescriber{describe: wantVLM}
|
||||
|
||||
// First pass: non-VLM steps through PostProcess
|
||||
config := PipelineConfig{
|
||||
ConfigKeyPageWidth: 600.0,
|
||||
ConfigKeyZoom: 1.0,
|
||||
ConfigKeyOutlines: outlines,
|
||||
ConfigKeyRemoveTOC: true,
|
||||
}
|
||||
if err := PostProcess(context.Background(), result, config); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
// Then: VLM enhancement through internal function (with mock)
|
||||
if err := enhanceWithVision(context.Background(), result, describer); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
// Then: flatten
|
||||
if err := PostProcess(context.Background(), result, PipelineConfig{
|
||||
ConfigKeyFlattenMediaToText: true,
|
||||
}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Verify
|
||||
if len(result.Sections) != 3 {
|
||||
t.Fatalf("expected 3 sections after filtering, got %d: %+v", len(result.Sections), result.Sections)
|
||||
}
|
||||
for i, s := range result.Sections {
|
||||
if s.DocTypeKwd != "text" {
|
||||
t.Errorf("section[%d] DocTypeKwd = %q, want 'text'", i, s.DocTypeKwd)
|
||||
}
|
||||
if s.LayoutType == "header" || s.LayoutType == "footer" {
|
||||
t.Errorf("section[%d] LayoutType = %q, should have been filtered out", i, s.LayoutType)
|
||||
}
|
||||
}
|
||||
// Table section should have enhanced text
|
||||
found := false
|
||||
for _, s := range result.Sections {
|
||||
if s.LayoutType == "table" {
|
||||
found = true
|
||||
if s.Text != "Row1 Col1 Row1 Col2\n"+wantVLM {
|
||||
t.Errorf("table text not enhanced: %q", s.Text)
|
||||
}
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Error("table section missing from result")
|
||||
}
|
||||
}
|
||||
|
||||
func TestPostProcess_Minimal(t *testing.T) {
|
||||
result := newTestResult(
|
||||
pdftype.Section{Text: "Hello", LayoutType: ""},
|
||||
pdftype.Section{Text: "World", LayoutType: " "},
|
||||
)
|
||||
if err := PostProcess(context.Background(), result, nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(result.Sections) != 2 {
|
||||
t.Fatalf("expected 2 sections, got %d", len(result.Sections))
|
||||
}
|
||||
if result.Sections[0].LayoutType != "text" || result.Sections[1].LayoutType != "text" {
|
||||
t.Error("layout not normalized")
|
||||
}
|
||||
if result.Sections[0].DocTypeKwd != "text" || result.Sections[1].DocTypeKwd != "text" {
|
||||
t.Error("doc_type_kwd not assigned")
|
||||
}
|
||||
}
|
||||
|
||||
func TestPostProcess_NilResult(t *testing.T) {
|
||||
if err := PostProcess(context.Background(), nil, nil); err == nil {
|
||||
t.Error("expected error for nil result")
|
||||
}
|
||||
}
|
||||
|
||||
func TestPostProcess_EmptySections(t *testing.T) {
|
||||
result := newTestResult()
|
||||
if err := PostProcess(context.Background(), result, nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(result.Sections) != 0 {
|
||||
t.Error("empty should remain empty")
|
||||
}
|
||||
}
|
||||
|
||||
func TestPostProcess_FiguresLazy(t *testing.T) {
|
||||
result := newTestResult(
|
||||
pdftype.Section{Text: "Fig1", LayoutType: "figure"},
|
||||
pdftype.Section{Text: "Body", LayoutType: "text"},
|
||||
pdftype.Section{Text: "Fig2", LayoutType: "figure"},
|
||||
)
|
||||
if err := PostProcess(context.Background(), result, nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
figs := result.Figures()
|
||||
if len(figs) != 2 {
|
||||
t.Fatalf("expected 2 figures, got %d", len(figs))
|
||||
}
|
||||
if figs[0].Text != "Fig1" || figs[1].Text != "Fig2" {
|
||||
t.Errorf("wrong figures: %+v", figs)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPostProcess_FilterOnly(t *testing.T) {
|
||||
result := newTestResult(
|
||||
pdftype.Section{Text: "Header", LayoutType: "header"},
|
||||
pdftype.Section{Text: "Second", LayoutType: "text"},
|
||||
pdftype.Section{Text: "First", LayoutType: "text"},
|
||||
)
|
||||
if err := PostProcess(context.Background(), result, nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(result.Sections) != 2 {
|
||||
t.Fatalf("expected 2 sections after filtering, got %d", len(result.Sections))
|
||||
}
|
||||
figs := result.Figures()
|
||||
if len(figs) != 0 {
|
||||
t.Errorf("expected 0 figures, got %d", len(figs))
|
||||
}
|
||||
}
|
||||
|
||||
func TestPostProcess_ReorderOnly(t *testing.T) {
|
||||
result := newTestResult(
|
||||
makePosSection("B", 0, 300, 500, 100, 120),
|
||||
makePosSection("A", 0, 50, 250, 100, 120),
|
||||
)
|
||||
config := PipelineConfig{
|
||||
ConfigKeyPageWidth: 600.0,
|
||||
ConfigKeyZoom: 1.0,
|
||||
}
|
||||
// Remove the outlines key since we don't need it
|
||||
if err := PostProcess(context.Background(), result, config); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(result.Sections) != 2 {
|
||||
t.Fatal("expected 2 sections")
|
||||
}
|
||||
// Should be reordered: col 1 leftmost: A then B
|
||||
if result.Sections[0].Positions[0].Left > result.Sections[1].Positions[0].Left {
|
||||
t.Log("multi-column: sections reordered left-to-right")
|
||||
}
|
||||
}
|
||||
@@ -1,98 +0,0 @@
|
||||
package post
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"image"
|
||||
)
|
||||
|
||||
// ImageDescriber describes an image using a vision language model.
|
||||
type ImageDescriber interface {
|
||||
DescribeImage(ctx context.Context, img image.Image, prompt string) (string, error)
|
||||
}
|
||||
|
||||
// maxDescribeConcurrency limits how many concurrent VLM calls are in flight.
|
||||
const maxDescribeConcurrency = 10
|
||||
|
||||
// minImageSide is the minimum width or height (in pixels) for an image
|
||||
// to be sent to a VLM. Tiny crops fail provider image-size limits.
|
||||
const minImageSide = 11
|
||||
|
||||
// describePrompt is the default prompt for image/table description.
|
||||
// Python: vision_llm_figure_describe_prompt.md
|
||||
const describePrompt = `## ROLE
|
||||
|
||||
You are an expert visual data analyst.
|
||||
|
||||
## GOAL
|
||||
|
||||
Analyze the image and produce a textual representation strictly based on what is visible in the image.
|
||||
|
||||
## DECISION RULE (CRITICAL)
|
||||
|
||||
First, determine whether the image contains an explicit visual data representation with enumerable data units forming a coherent dataset.
|
||||
|
||||
## OUTPUT RULES (STRICT)
|
||||
|
||||
- Produce output in exactly one of the two modes defined below.
|
||||
- Do NOT mention, label, or reference the modes in the output.
|
||||
- Do NOT combine content from both modes.
|
||||
- Do NOT explain or justify the choice of mode.
|
||||
- Do NOT add any headings, titles, or commentary beyond what the mode requires.
|
||||
|
||||
---
|
||||
|
||||
## MODE 1: STRUCTURED VISUAL DATA OUTPUT
|
||||
|
||||
(Use only if the image contains enumerable data units forming a coherent dataset.)
|
||||
|
||||
Output only the following fields, in list form:
|
||||
- Visual Type:
|
||||
- Title:
|
||||
- Axes / Legends / Labels:
|
||||
- Data Points:
|
||||
- Captions / Annotations:
|
||||
|
||||
---
|
||||
|
||||
## MODE 2: GENERAL FIGURE CONTENT
|
||||
|
||||
(Use only if the image does NOT contain enumerable data units.)
|
||||
|
||||
Write the content directly, starting from the first sentence.
|
||||
Do NOT add any introductory labels, titles, headings, or prefixes.
|
||||
|
||||
Requirements:
|
||||
- Describe visible regions and components in a stable order (e.g., top-to-bottom, left-to-right).
|
||||
- Explicitly name interface elements or visual objects exactly as they appear.
|
||||
- Transcribe all visible text verbatim; do not paraphrase, summarize, or reinterpret labels.
|
||||
- Describe spatial grouping, containment, and alignment of elements.
|
||||
- Do NOT interpret intent, behavior, workflows, gameplay rules, or processes.
|
||||
- Avoid narrative or stylistic language unless it is a dominant and functional visual element.
|
||||
|
||||
Use concise, information-dense sentences.
|
||||
Do not use bullet lists or structured fields in this mode.`
|
||||
|
||||
// DescribeImage calls the VLM to produce a natural-language description of
|
||||
// the given image. Returns the description text or an error.
|
||||
//
|
||||
// Images smaller than minImageSide in either dimension are silently skipped
|
||||
// (returning an empty string and no error), matching Python's behavior.
|
||||
func DescribeImage(ctx context.Context, img image.Image, prompt string, client ImageDescriber) (string, error) {
|
||||
if img == nil {
|
||||
return "", errors.New("DescribeImage: nil image")
|
||||
}
|
||||
b := img.Bounds()
|
||||
if b.Dx() == 0 || b.Dy() == 0 {
|
||||
return "", errors.New("DescribeImage: empty image (0x0)")
|
||||
}
|
||||
if b.Dx() < minImageSide || b.Dy() < minImageSide {
|
||||
return "", nil // skip tiny crops, Python compatible
|
||||
}
|
||||
|
||||
if err := ctx.Err(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return client.DescribeImage(ctx, img, prompt)
|
||||
}
|
||||
@@ -1,112 +0,0 @@
|
||||
package post
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"image"
|
||||
"image/color"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// ── mock image describer ───────────────────────────────────────────────
|
||||
|
||||
// mockImageDescriber is a canned ImageDescriber for tests: every call yields
// the configured description and error, ignoring its arguments.
type mockImageDescriber struct {
	describe string // description handed back by DescribeImage
	err      error  // error handed back by DescribeImage
}

// DescribeImage returns the preconfigured description and error.
func (d *mockImageDescriber) DescribeImage(context.Context, image.Image, string) (string, error) {
	text, failure := d.describe, d.err
	return text, failure
}
|
||||
|
||||
// ── DescribeImage tests ────────────────────────────────────────────────
|
||||
|
||||
func TestDescribeImage_Success(t *testing.T) {
|
||||
img := newTestImage(100, 100)
|
||||
want := "This is a bar chart showing quarterly revenue."
|
||||
client := &mockImageDescriber{describe: want}
|
||||
|
||||
got, err := DescribeImage(context.Background(), img, "Describe this image", client)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if got != want {
|
||||
t.Errorf("DescribeImage() = %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDescribeImage_VLMError(t *testing.T) {
|
||||
img := newTestImage(100, 100)
|
||||
client := &mockImageDescriber{err: errors.New("VLM timeout")}
|
||||
|
||||
got, err := DescribeImage(context.Background(), img, "Describe this image", client)
|
||||
if err == nil {
|
||||
t.Fatal("expected error, got nil")
|
||||
}
|
||||
if got != "" {
|
||||
t.Errorf("expected empty string on error, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDescribeImage_CanceledContext(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel() // cancel immediately
|
||||
img := newTestImage(100, 100)
|
||||
client := &mockImageDescriber{describe: "should not be reached"}
|
||||
|
||||
got, err := DescribeImage(ctx, img, "prompt", client)
|
||||
if err == nil {
|
||||
t.Fatal("expected context error, got nil")
|
||||
}
|
||||
if got != "" {
|
||||
t.Errorf("expected empty string, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDescribeImage_NilImage(t *testing.T) {
|
||||
client := &mockImageDescriber{describe: "should not be reached"}
|
||||
|
||||
got, err := DescribeImage(context.Background(), nil, "prompt", client)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for nil image, got nil")
|
||||
}
|
||||
if got != "" {
|
||||
t.Errorf("expected empty string, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDescribeImage_EmptyImage(t *testing.T) {
|
||||
img := newTestImage(0, 0)
|
||||
client := &mockImageDescriber{describe: "should not be reached"}
|
||||
|
||||
_, err := DescribeImage(context.Background(), img, "prompt", client)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for empty image, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDescribeImage_TinyImage(t *testing.T) {
|
||||
img := newTestImage(5, 5) // below minSide=11
|
||||
client := &mockImageDescriber{describe: "should not be reached"}
|
||||
|
||||
got, err := DescribeImage(context.Background(), img, "prompt", client)
|
||||
if err != nil {
|
||||
t.Fatal("tiny images should be silently skipped, not error")
|
||||
}
|
||||
if got != "" {
|
||||
t.Errorf("expected empty string for tiny image, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// ── helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
// newTestImage returns a w×h RGBA image filled with a recognizable pattern:
// red follows the column and green follows the row (both modulo 256), while
// blue is fixed at 128 and alpha at 255.
func newTestImage(w, h int) image.Image {
	canvas := image.NewRGBA(image.Rect(0, 0, w, h))
	for row := 0; row < h; row++ {
		for col := 0; col < w; col++ {
			px := color.RGBA{R: uint8(col % 256), G: uint8(row % 256), B: 128, A: 255}
			canvas.SetRGBA(col, row, px)
		}
	}
	return canvas
}
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"image"
|
||||
@@ -53,7 +53,7 @@ func TestRenderCompare(t *testing.T) {
|
||||
}
|
||||
|
||||
// Render page 0 with pdfium (Go).
|
||||
goImg, err := renderPageToImage(eng, 0)
|
||||
goImg, err := RenderPageToImage(eng, 0)
|
||||
eng.Close()
|
||||
if err != nil {
|
||||
t.Logf("%s: render error: %v", name, err)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"image"
|
||||
@@ -13,7 +13,7 @@ import (
|
||||
var renderFn = fallbackRender
|
||||
|
||||
// renderPageToImage renders a page at 216 DPI for downstream DLA/TSR/OCR.
|
||||
func renderPageToImage(engine pdf.PDFEngine, pageNum int) (image.Image, error) {
|
||||
func RenderPageToImage(engine pdf.PDFEngine, pageNum int) (image.Image, error) {
|
||||
return renderFn(engine, pageNum)
|
||||
}
|
||||
|
||||
@@ -25,7 +25,10 @@ func fallbackRender(engine pdf.PDFEngine, pageNum int) (image.Image, error) {
|
||||
}
|
||||
// Guard against typed-nil (e.g. (*image.RGBA)(nil) returned as non-nil
|
||||
// interface). The plain img==nil check misses that case.
|
||||
if img == nil || reflect.ValueOf(img).IsNil() {
|
||||
if img == nil {
|
||||
return nil, ErrNoPDFData
|
||||
}
|
||||
if rv := reflect.ValueOf(img); rv.Kind() == reflect.Ptr && rv.IsNil() {
|
||||
return nil, ErrNoPDFData
|
||||
}
|
||||
return img, nil
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"image"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"image"
|
||||
@@ -24,8 +24,8 @@ func pdfiumPtSize(eng pdf.PDFEngine, file string, t *testing.T) (w, h float64) {
|
||||
raw := eng.RawData()
|
||||
if raw == nil {
|
||||
// Fallback: use pdf_oxide pre-rotation size.
|
||||
if pe, ok := eng.(*pdfoxideEngine); ok {
|
||||
w, h, _ = pe.inner.PageSize(0)
|
||||
if pe, ok := eng.(*PDFOxideEngine); ok {
|
||||
w, h, _ = pe.Inner.PageSize(0)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -302,7 +302,7 @@ func TestRotation_CropBoxWithRotate(t *testing.T) {
|
||||
// CropBox excludes content from the page edges; chars near the
|
||||
// CropBox boundary may end up outside the effective page after rotation.
|
||||
if oobRate > 40 {
|
||||
t.Errorf("too many OOB chars: %.1f%%", oobRate)
|
||||
t.Errorf("too many OOB Chars: %.1f%%", oobRate)
|
||||
}
|
||||
|
||||
// Verify render alignment.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -43,9 +43,8 @@ func TestScanAllPDFs(t *testing.T) {
|
||||
|
||||
eng := mustOpenEngine(t, name)
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
cfg.TableBuilder = NewDeepDocTableBuildService(client)
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.ParseRaw(context.Background(), eng, client)
|
||||
eng.Close()
|
||||
if err != nil {
|
||||
fmt.Printf(" ❌ ERROR: %v\n", err)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
@@ -16,7 +16,7 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestSnapshotStageComparison verifies Go's TextMerge output
|
||||
// TestSnapshotStageComparison verifies Go's lyt.TextMerge output
|
||||
// matches Python's _text_merge sample boxes using synthetic input.
|
||||
func TestSnapshotStageComparison(t *testing.T) {
|
||||
snapDir := filepath.Join("testdata", "snapshots")
|
||||
@@ -47,19 +47,19 @@ func TestSnapshotStageComparison(t *testing.T) {
|
||||
// Convert sample boxes to Go pdf.TextBox format
|
||||
goBoxes := snapshotBoxesToGo(s1.SampleBoxesPage0)
|
||||
|
||||
// Run Go TextMerge with default params
|
||||
// Run Go lyt.TextMerge with default params
|
||||
meanH := map[int]float64{0: avg(s1.MeanHeight)}
|
||||
merged := lyt.TextMerge(goBoxes, meanH, 3)
|
||||
|
||||
// Compare counts
|
||||
if len(merged) > 0 {
|
||||
t.Logf(" Go TextMerge: %d -> %d boxes", len(goBoxes), len(merged))
|
||||
t.Logf(" Go lyt.TextMerge: %d -> %d boxes", len(goBoxes), len(merged))
|
||||
mergeRatio := float64(len(merged)) / float64(len(goBoxes))
|
||||
pyRatio := float64(s4.BoxesAfter) / float64(s4.BoxesBefore)
|
||||
t.Logf(" Merge ratios: Go=%.0f%% Python=%.0f%%", mergeRatio*100, pyRatio*100)
|
||||
}
|
||||
|
||||
// Run Go NaiveVerticalMerge
|
||||
// Run Go lyt.NaiveVerticalMerge
|
||||
meanW := map[int]float64{0: avg(s1.MeanWidth)}
|
||||
vm := lyt.NaiveVerticalMerge(merged, meanH, meanW, s1.IsEnglish)
|
||||
if s6, ok := snap.Stages["_naive_vertical_merge"]; ok {
|
||||
|
||||
@@ -2,6 +2,7 @@ package table
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"html"
|
||||
"math"
|
||||
"regexp"
|
||||
"sort"
|
||||
@@ -698,7 +699,47 @@ func RowsToHTML(rows [][]pdf.TSRCell, caption string, headerRows map[int]bool, s
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// ── Span computation (Python: __cal_spans) ──
|
||||
// SimpleRowsToHTML renders plain string-based table data as an HTML table.
//
// The first row is emitted as header cells (th); every later row uses td.
// Rows shorter than the widest row are padded with empty cells so that all
// rows carry the same number of columns. Cell text is HTML-escaped. A nil or
// empty rows slice yields "<table></table>". Used by DOCX, XLSX, PPTX, and
// HTML parsers that produce [][]string directly.
func SimpleRowsToHTML(rows [][]string) string {
	if len(rows) == 0 {
		return "<table></table>"
	}

	// Width of the widest row; shorter rows are padded up to it.
	nCols := 0
	for _, row := range rows {
		if len(row) > nCols {
			nCols = len(row)
		}
	}

	var b strings.Builder
	b.WriteString("<table>")
	for ri, row := range rows {
		// Tags are fixed for the whole row, so build them once. Note the
		// space before '>' in the opening tag: it is part of the emitted
		// markup and must be kept for the output to stay the same.
		openTag, closeTag := "<td >", "</td>"
		if ri == 0 {
			openTag, closeTag = "<th >", "</th>"
		}

		b.WriteString("<tr>")
		for ci := 0; ci < nCols; ci++ {
			b.WriteString(openTag)
			// Padded cells (ci beyond the row's length) stay empty.
			if ci < len(row) {
				b.WriteString(html.EscapeString(row[ci]))
			}
			b.WriteString(closeTag)
		}
		b.WriteString("</tr>")
	}
	b.WriteString("</table>")
	return b.String()
}
|
||||
|
||||
// ── Span computation (Python: __cal_spans) ──
|
||||
|
||||
// calSpans computes colspan and rowspan for spanning cells in the grid.
|
||||
// Returns spanInfo (row,col → colspan,rowspan) and covered (cells hidden by spans).
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -12,10 +12,10 @@ import (
|
||||
util "ragflow/internal/deepdoc/parser/pdf/util"
|
||||
)
|
||||
|
||||
// enrichWithDeepDoc runs DLA+TSR via p.DeepDoc and returns detected tables.
|
||||
// enrichWithDeepDoc runs DLA+TSR via docAnalyzer and returns detected tables.
|
||||
// pageImages optionally provides pre-rendered page images to avoid re-rendering.
|
||||
func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult, engine pdf.PDFEngine, boxes []pdf.TextBox, pageImages map[int]image.Image) []pdf.TableItem {
|
||||
if !p.DeepDoc.Health() {
|
||||
func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult, engine pdf.PDFEngine, boxes []pdf.TextBox, pageImages map[int]image.Image, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder) []pdf.TableItem {
|
||||
if !docAnalyzer.Health() {
|
||||
return nil
|
||||
}
|
||||
// Group boxes by page for annotation write-back.
|
||||
@@ -50,7 +50,7 @@ func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult,
|
||||
for i, idx := range indices {
|
||||
pageBoxes[i] = boxes[idx]
|
||||
}
|
||||
tables := p.extractTableBoxes(ctx, result, pageBoxes, engine, pg, pageImages, len(tableItems))
|
||||
tables := p.extractTableBoxes(ctx, result, pageBoxes, engine, pg, pageImages, len(tableItems), docAnalyzer, tb)
|
||||
tableItems = append(tableItems, tables...)
|
||||
// Write back DLA and TSR annotations (R/C/H/SP) to the original boxes.
|
||||
for i, idx := range indices {
|
||||
@@ -65,21 +65,21 @@ func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult,
|
||||
return tableItems
|
||||
}
|
||||
|
||||
func (p *Parser) extractTableBoxes(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, engine pdf.PDFEngine, pageNum int, pageImages map[int]image.Image, tableBaseIdx int) []pdf.TableItem {
|
||||
func (p *Parser) extractTableBoxes(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, engine pdf.PDFEngine, pageNum int, pageImages map[int]image.Image, tableBaseIdx int, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder) []pdf.TableItem {
|
||||
pageImg, ok := pageImages[pageNum]
|
||||
if !ok {
|
||||
var err error
|
||||
pageImg, err = renderPageToImage(engine, pageNum)
|
||||
pageImg, err = RenderPageToImage(engine, pageNum)
|
||||
if err != nil {
|
||||
slog.Warn("render page for DeepDoc failed", "page", pageNum, "err", err)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return p.extractTableBoxesFromImage(ctx, result, boxes, pageImg, pageNum, tableBaseIdx)
|
||||
return p.extractTableBoxesFromImage(ctx, result, boxes, pageImg, pageNum, tableBaseIdx, docAnalyzer, tb)
|
||||
}
|
||||
|
||||
func (p *Parser) extractTableBoxesFromImage(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, pageImg image.Image, pageNum int, tableBaseIdx int) []pdf.TableItem {
|
||||
regions, err := p.DeepDoc.DLA(ctx, pageImg)
|
||||
func (p *Parser) extractTableBoxesFromImage(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, pageImg image.Image, pageNum int, tableBaseIdx int, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder) []pdf.TableItem {
|
||||
regions, err := docAnalyzer.DLA(ctx, pageImg)
|
||||
if err != nil {
|
||||
slog.Warn("DLA failed", "page", pageNum, "err", err)
|
||||
return nil
|
||||
@@ -95,148 +95,117 @@ func (p *Parser) extractTableBoxesFromImage(ctx context.Context, result *pdf.Par
|
||||
tableMatches := tbl.MatchTableRegions(boxes, regions, scale)
|
||||
var items []pdf.TableItem
|
||||
for _, tm := range tableMatches {
|
||||
cropped, cropErr := util.CropImageRegion(pageImg, tm.Region)
|
||||
if cropErr != nil {
|
||||
// DLA returned an invalid region (e.g. x1 < x0). Python
|
||||
// PIL.Image.crop() raises ValueError here; we skip this
|
||||
// table instead of passing a full-page image to TSR.
|
||||
continue
|
||||
item := p.processOneTable(ctx, result, boxes, pageImg, pageNum, docAnalyzer, tb, tm, scale, tableBaseIdx+len(items))
|
||||
if item.ImageB64 != "" || len(item.Cells) > 0 || len(item.Positions) > 0 {
|
||||
items = append(items, item)
|
||||
}
|
||||
}
|
||||
return items
|
||||
}
|
||||
|
||||
// Rotation detection (Python: _evaluate_table_orientation).
|
||||
// If rotated, TSR and OCR use the rotated image; cell coords
|
||||
// are mapped back to original crop space for box matching.
|
||||
autoRotate := p.Config.AutoRotateTables != nil && *p.Config.AutoRotateTables
|
||||
bestAngle := 0
|
||||
origW, origH := cropped.Bounds().Dx(), cropped.Bounds().Dy()
|
||||
tsrImg := cropped
|
||||
if autoRotate {
|
||||
angle, rotated, _ := tbl.EvaluateTableOrientation(ctx, cropped, p.DeepDoc)
|
||||
bestAngle = angle
|
||||
tsrImg = rotated
|
||||
}
|
||||
|
||||
imgB64, encErr := util.EncodeImageToBase64PNG(cropped)
|
||||
if encErr != nil {
|
||||
slog.Warn("table PNG encode failed", "page", pageNum, "err", encErr)
|
||||
}
|
||||
|
||||
var cells []pdf.TSRCell
|
||||
var tsrErr error
|
||||
cells, tsrErr = p.tableBuilder.DetectCells(ctx, tsrImg)
|
||||
if tsrErr != nil {
|
||||
slog.Warn("TSR failed", "page", pageNum, "err", tsrErr)
|
||||
}
|
||||
// Collect TSR raw cells for debug comparison.
|
||||
if tsrErr == nil {
|
||||
for _, c := range cells {
|
||||
if result != nil {
|
||||
result.TSRDebug = append(result.TSRDebug, pdf.TSRRawCell{
|
||||
TableIndex: tableBaseIdx + len(items), Page: pageNum,
|
||||
Label: c.Label, X0: c.X0, Y0: c.Y0, X1: c.X1, Y1: c.Y1,
|
||||
Text: c.Text,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
// Python margin: w*0.03, h*0.03 (_table_transformer_job:374-376).
|
||||
w := tm.Region.X1 - tm.Region.X0
|
||||
h := tm.Region.Y1 - tm.Region.Y0
|
||||
marginX := w * 0.03
|
||||
marginY := h * 0.03
|
||||
cropOffX := math.Max(0, tm.Region.X0-marginX)
|
||||
cropOffY := math.Max(0, tm.Region.Y0-marginY)
|
||||
|
||||
var boxInCrop []pdf.TextBox
|
||||
if tsrErr == nil && len(cells) > 0 {
|
||||
if bestAngle != 0 {
|
||||
// OCR on rotated image before mapping cells back.
|
||||
// Cells are in rotated-pixel space; OCR works best
|
||||
// on upright text. After mapping, cells move to
|
||||
// original crop space where boxInCrop lives.
|
||||
if !p.Config.SkipOCR {
|
||||
ocrTableCells(ctx, cells, tsrImg, p.DeepDoc)
|
||||
}
|
||||
for i := range cells {
|
||||
cells[i].X0, cells[i].Y0 = util.MapRotatedPointToOriginal(cells[i].X0, cells[i].Y0, bestAngle, origW, origH)
|
||||
cells[i].X1, cells[i].Y1 = util.MapRotatedPointToOriginal(cells[i].X1, cells[i].Y1, bestAngle, origW, origH)
|
||||
}
|
||||
}
|
||||
// Fill cell text from pre-merge boxes, skipping caption boxes
|
||||
// (text entirely above the first TSR cell row).
|
||||
firstCellTop := 1e9
|
||||
for _, c := range cells {
|
||||
if c.Y0 >= 0 && c.Y0 < firstCellTop {
|
||||
firstCellTop = c.Y0
|
||||
}
|
||||
}
|
||||
if firstCellTop == 1e9 {
|
||||
firstCellTop = cells[0].Y0 // fallback if all cells have Y0 < 0
|
||||
}
|
||||
boxInCrop = make([]pdf.TextBox, 0, len(tm.BoxIdx))
|
||||
for _, idx := range tm.BoxIdx {
|
||||
b := boxes[idx]
|
||||
if b.Bottom*scale-cropOffY < firstCellTop {
|
||||
continue // caption box above first TSR cell
|
||||
}
|
||||
boxInCrop = append(boxInCrop, tbl.BoxToCropSpace(b, scale, cropOffX, cropOffY))
|
||||
}
|
||||
}
|
||||
var positions []pdf.Position
|
||||
for _, idx := range tm.BoxIdx {
|
||||
b := boxes[idx]
|
||||
positions = append(positions, pdf.Position{
|
||||
PageNumbers: []int{pageNum},
|
||||
Left: b.X0, Right: b.X1,
|
||||
Top: b.Top, Bottom: b.Bottom,
|
||||
// processOneTable crops one matched table region from the page image and runs
// optional orientation detection, TSR and OCR on it to build a pdf.TableItem.
|
||||
func (p *Parser) processOneTable(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, pageImg image.Image, pageNum int, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder, tm tbl.TableMatch, scale float64, tableIdx int) pdf.TableItem {
|
||||
cropped, cropErr := util.CropImageRegion(pageImg, tm.Region)
|
||||
if cropErr != nil {
|
||||
return pdf.TableItem{}
|
||||
}
|
||||
autoRotate := p.Config.AutoRotateTables != nil && *p.Config.AutoRotateTables
|
||||
bestAngle := 0
|
||||
origW, origH := cropped.Bounds().Dx(), cropped.Bounds().Dy()
|
||||
tsrImg := cropped
|
||||
if autoRotate {
|
||||
angle, rotated, _ := tbl.EvaluateTableOrientation(ctx, cropped, docAnalyzer)
|
||||
bestAngle = angle
|
||||
tsrImg = rotated
|
||||
}
|
||||
imgB64, encErr := util.EncodeImageToBase64PNG(cropped)
|
||||
if encErr != nil {
|
||||
slog.Warn("table PNG encode failed", "page", pageNum, "err", encErr)
|
||||
}
|
||||
cells, tsrErr := tb.DetectCells(ctx, tsrImg)
|
||||
if tsrErr != nil {
|
||||
slog.Warn("TSR failed", "page", pageNum, "err", tsrErr)
|
||||
}
|
||||
if tsrErr == nil && result != nil {
|
||||
for _, c := range cells {
|
||||
result.TSRDebug = append(result.TSRDebug, pdf.TSRRawCell{
|
||||
TableIndex: tableIdx, Page: pageNum,
|
||||
Label: c.Label, X0: c.X0, Y0: c.Y0, X1: c.X1, Y1: c.Y1, Text: c.Text,
|
||||
})
|
||||
}
|
||||
// Pre-compute grid from raw TSR cells (without crop offset).
|
||||
// Stored in pdf.TableItem for constructTable; annotateTableBoxes
|
||||
// recomputes with offset cells for spatial matching precision.
|
||||
var grid [][]pdf.TSRCell
|
||||
if len(cells) > 0 {
|
||||
grid = p.tableBuilder.GroupCells(cells)
|
||||
// Fill cell text from boxes in crop space. Works for both
|
||||
// Label-aware grouping (cells rearranged) vs. cross-product (creates new cells).
|
||||
if len(grid) > 0 {
|
||||
flat := tbl.FlattenGrid(grid)
|
||||
tbl.FillCellTextFromBoxes(flat, boxInCrop)
|
||||
idx := 0
|
||||
}
|
||||
w := tm.Region.X1 - tm.Region.X0
|
||||
h := tm.Region.Y1 - tm.Region.Y0
|
||||
cropOffX := math.Max(0, tm.Region.X0-w*0.03)
|
||||
cropOffY := math.Max(0, tm.Region.Y0-h*0.03)
|
||||
var boxInCrop []pdf.TextBox
|
||||
if tsrErr == nil && len(cells) > 0 {
|
||||
if bestAngle != 0 {
|
||||
if !p.Config.SkipOCR {
|
||||
ocrTableCells(ctx, cells, tsrImg, docAnalyzer)
|
||||
}
|
||||
for i := range cells {
|
||||
cells[i].X0, cells[i].Y0 = util.MapRotatedPointToOriginal(cells[i].X0, cells[i].Y0, bestAngle, origW, origH)
|
||||
cells[i].X1, cells[i].Y1 = util.MapRotatedPointToOriginal(cells[i].X1, cells[i].Y1, bestAngle, origW, origH)
|
||||
}
|
||||
}
|
||||
firstCellTop := 1e9
|
||||
for _, c := range cells {
|
||||
if c.Y0 >= 0 && c.Y0 < firstCellTop {
|
||||
firstCellTop = c.Y0
|
||||
}
|
||||
}
|
||||
if firstCellTop == 1e9 {
|
||||
firstCellTop = cells[0].Y0
|
||||
}
|
||||
boxInCrop = make([]pdf.TextBox, 0, len(tm.BoxIdx))
|
||||
for _, idx := range tm.BoxIdx {
|
||||
b := boxes[idx]
|
||||
if b.Bottom*scale-cropOffY < firstCellTop {
|
||||
continue
|
||||
}
|
||||
boxInCrop = append(boxInCrop, tbl.BoxToCropSpace(b, scale, cropOffX, cropOffY))
|
||||
}
|
||||
}
|
||||
var positions []pdf.Position
|
||||
for _, idx := range tm.BoxIdx {
|
||||
b := boxes[idx]
|
||||
positions = append(positions, pdf.Position{
|
||||
PageNumbers: []int{pageNum},
|
||||
Left: b.X0, Right: b.X1, Top: b.Top, Bottom: b.Bottom,
|
||||
})
|
||||
}
|
||||
var grid [][]pdf.TSRCell
|
||||
if len(cells) > 0 {
|
||||
grid = tb.GroupCells(cells)
|
||||
if len(grid) > 0 {
|
||||
flat := tbl.FlattenGrid(grid)
|
||||
tbl.FillCellTextFromBoxes(flat, boxInCrop)
|
||||
idx := 0
|
||||
for ri := range grid {
|
||||
for ci := range grid[ri] {
|
||||
grid[ri][ci].Text = flat[idx].Text
|
||||
idx++
|
||||
}
|
||||
}
|
||||
if bestAngle == 0 && !p.Config.SkipOCR {
|
||||
ocrTableCells(ctx, flat, tsrImg, docAnalyzer)
|
||||
idx = 0
|
||||
for ri := range grid {
|
||||
for ci := range grid[ri] {
|
||||
grid[ri][ci].Text = flat[idx].Text
|
||||
idx++
|
||||
}
|
||||
}
|
||||
if bestAngle == 0 && !p.Config.SkipOCR {
|
||||
ocrTableCells(ctx, flat, tsrImg, p.DeepDoc)
|
||||
idx = 0
|
||||
for ri := range grid {
|
||||
for ci := range grid[ri] {
|
||||
grid[ri][ci].Text = flat[idx].Text
|
||||
idx++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
items = append(items, pdf.TableItem{
|
||||
ImageB64: imgB64,
|
||||
Cells: cells,
|
||||
Grid: grid,
|
||||
Positions: positions,
|
||||
Scale: scale,
|
||||
CropOffX: cropOffX,
|
||||
CropOffY: cropOffY,
|
||||
// DLA region in PDF point space (Python's cropout uses layout region boundaries).
|
||||
RegionLeft: tm.Region.X0 / scale,
|
||||
RegionRight: tm.Region.X1 / scale,
|
||||
RegionTop: tm.Region.Y0 / scale,
|
||||
RegionBottom: tm.Region.Y1 / scale,
|
||||
})
|
||||
|
||||
tbl.WriteTableAnnotations(boxes, tm.BoxIdx, cells, scale, cropOffX, cropOffY, p.tableBuilder)
|
||||
}
|
||||
return items
|
||||
item := pdf.TableItem{
|
||||
ImageB64: imgB64, Cells: cells, Grid: grid, Positions: positions,
|
||||
Scale: scale, CropOffX: cropOffX, CropOffY: cropOffY,
|
||||
RegionLeft: tm.Region.X0 / scale, RegionRight: tm.Region.X1 / scale,
|
||||
RegionTop: tm.Region.Y0 / scale, RegionBottom: tm.Region.Y1 / scale,
|
||||
}
|
||||
tbl.WriteTableAnnotations(boxes, tm.BoxIdx, cells, scale, cropOffX, cropOffY, tb)
|
||||
return item
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
inf "ragflow/internal/deepdoc/parser/pdf/inference"
|
||||
tbl "ragflow/internal/deepdoc/parser/pdf/table"
|
||||
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
||||
util "ragflow/internal/deepdoc/parser/pdf/util"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -32,7 +33,7 @@ func TestTableRotation_Integration(t *testing.T) {
|
||||
if baseURL == "" {
|
||||
baseURL = "http://localhost:9390"
|
||||
}
|
||||
dd, err := inf.NewInferenceClient(baseURL)
|
||||
dd, err := inf.NewClient(baseURL)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -59,10 +60,10 @@ func TestTableRotation_Integration(t *testing.T) {
|
||||
cfg.ToPage = pageCount - 1
|
||||
autoRotate := true
|
||||
cfg.AutoRotateTables = &autoRotate
|
||||
_ = NewParser(cfg, dd) // verify construction does not panic
|
||||
_ = NewParser(cfg) // verify construction does not panic
|
||||
|
||||
for pg := 0; pg < pageCount; pg++ {
|
||||
pageImg, err := renderPageToImage(eng, pg)
|
||||
pageImg, err := RenderPageToImage(eng, pg)
|
||||
if err != nil {
|
||||
t.Fatalf("render page %d: %v", pg, err)
|
||||
}
|
||||
@@ -80,7 +81,7 @@ func TestTableRotation_Integration(t *testing.T) {
|
||||
tableCount++
|
||||
|
||||
// Crop table region
|
||||
cropped, err := cropImageRegion(pageImg, r)
|
||||
cropped, err := util.CropImageRegion(pageImg, r)
|
||||
if err != nil {
|
||||
t.Errorf(" crop table %d: %v", tableCount, err)
|
||||
continue
|
||||
@@ -130,7 +131,7 @@ func TestTableRotation_Stability(t *testing.T) {
|
||||
if baseURL == "" {
|
||||
baseURL = "http://localhost:9390"
|
||||
}
|
||||
dd, err := inf.NewInferenceClient(baseURL)
|
||||
dd, err := inf.NewClient(baseURL)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -163,7 +164,7 @@ func TestTableRotation_Stability(t *testing.T) {
|
||||
continue
|
||||
}
|
||||
|
||||
pageImg, err := renderPageToImage(eng, 0)
|
||||
pageImg, err := RenderPageToImage(eng, 0)
|
||||
eng.Close()
|
||||
if err != nil {
|
||||
continue
|
||||
@@ -177,7 +178,11 @@ func TestTableRotation_Stability(t *testing.T) {
|
||||
continue
|
||||
}
|
||||
tables++
|
||||
cropped, _ := cropImageRegion(pageImg, r)
|
||||
cropped, err := util.CropImageRegion(pageImg, r)
|
||||
if err != nil {
|
||||
t.Errorf(" %s crop table: %v", e.Name(), err)
|
||||
continue
|
||||
}
|
||||
if cropped == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -16,11 +16,11 @@ import (
|
||||
// entries. Go backfills pdf.Section.Text from pdf.TableItem.Rows after
|
||||
// linkTableSections.
|
||||
func TestTableSection_TextFromTSR(t *testing.T) {
|
||||
eng := &mockEngine{
|
||||
pageCount: 1,
|
||||
renderW: 900, // 300pt at 3x = 900px (216 DPI)
|
||||
renderH: 600,
|
||||
chars: map[int][]pdf.TextChar{0: {
|
||||
eng := &MockEngine{
|
||||
NumPages: 1,
|
||||
RenderW: 900, // 300pt at 3x = 900px (216 DPI)
|
||||
RenderH: 600,
|
||||
Chars: map[int][]pdf.TextChar{0: {
|
||||
// PDF space (72 DPI): well inside DLA region
|
||||
{X0: 50, X1: 70, Top: 40, Bottom: 55, Text: "姓"},
|
||||
{X0: 80, X1: 100, Top: 40, Bottom: 55, Text: "名"},
|
||||
@@ -42,9 +42,9 @@ func TestTableSection_TextFromTSR(t *testing.T) {
|
||||
{X0: 200, Y0: 100, X1: 460, Y1: 220, Text: "25", Label: "table row"},
|
||||
},
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mock)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -93,14 +93,14 @@ func TestEnrichWithDeepDoc_ImageOnlyPage(t *testing.T) {
|
||||
{X0: 0, Y0: 0, X1: 200, Y1: 100, Text: "A", Label: "table row"},
|
||||
},
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
// 0 text boxes, but page 0 has a rendered image.
|
||||
boxes := []pdf.TextBox{}
|
||||
dummyImg := image.NewRGBA(image.Rect(0, 0, 900, 600))
|
||||
pageImages := map[int]image.Image{0: dummyImg}
|
||||
|
||||
tables := p.enrichWithDeepDoc(context.Background(), nil, nil, boxes, pageImages)
|
||||
tables := p.enrichWithDeepDoc(context.Background(), nil, nil, boxes, pageImages, mock, NewTableBuilderFor(mock))
|
||||
if len(tables) == 0 {
|
||||
t.Fatal("enrichWithDeepDoc: expected at least 1 table from DLA on page with image but no boxes, got 0")
|
||||
}
|
||||
@@ -113,10 +113,10 @@ func TestEnrichWithDeepDoc_ImageOnlyPage(t *testing.T) {
|
||||
// is merged into the nearest "figure" pdf.Section and the caption pdf.Section is
|
||||
// removed. Matches Python _extract_table_figure caption matching.
|
||||
func TestFigureCaption_MergedIntoFigure(t *testing.T) {
|
||||
eng := &mockEngine{
|
||||
pageCount: 1,
|
||||
renderW: 1800, renderH: 2400,
|
||||
chars: map[int][]pdf.TextChar{0: {
|
||||
eng := &MockEngine{
|
||||
NumPages: 1,
|
||||
RenderW: 1800, RenderH: 2400,
|
||||
Chars: map[int][]pdf.TextChar{0: {
|
||||
// Figure text — overlaps DLA figure region (pixel Y=80-300 → PDF 27-100).
|
||||
{X0: 40, X1: 60, Top: 30, Bottom: 45, Text: "F"},
|
||||
// Caption text — overlaps DLA figure caption region (pixel Y=310-340 → PDF 103-113).
|
||||
@@ -131,9 +131,9 @@ func TestFigureCaption_MergedIntoFigure(t *testing.T) {
|
||||
{X0: 100, Y0: 310, X1: 500, Y1: 340, Label: "figure caption", Confidence: 0.9},
|
||||
},
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mock)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -169,10 +169,10 @@ func TestFigureCaption_MergedIntoFigure(t *testing.T) {
|
||||
// TestTableCaption_MergedIntoTable verifies that "table caption" text
|
||||
// is merged into the nearest table pdf.Section and the caption is removed.
|
||||
func TestTableCaption_MergedIntoTable(t *testing.T) {
|
||||
eng := &mockEngine{
|
||||
pageCount: 1,
|
||||
renderW: 1800, renderH: 2400,
|
||||
chars: map[int][]pdf.TextChar{0: {
|
||||
eng := &MockEngine{
|
||||
NumPages: 1,
|
||||
RenderW: 1800, RenderH: 2400,
|
||||
Chars: map[int][]pdf.TextChar{0: {
|
||||
// Table text — overlaps DLA table region (pixel Y=80-300 → PDF 27-100).
|
||||
{X0: 40, X1: 60, Top: 30, Bottom: 45, Text: "T"},
|
||||
// Caption text — overlaps DLA table caption region (pixel Y=310-340 → PDF 103-113).
|
||||
@@ -190,9 +190,9 @@ func TestTableCaption_MergedIntoTable(t *testing.T) {
|
||||
{X0: 200, Y0: 0, X1: 460, Y1: 100, Text: "B", Label: "table row"},
|
||||
},
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mock)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -224,10 +224,10 @@ func TestTableCaption_MergedIntoTable(t *testing.T) {
|
||||
// boxes overlapping a table region, regardless of their DLA label.
|
||||
// This is the #1 cause of Go vs Python discrepancy on table-heavy PDFs.
|
||||
func TestTextSectionsInsideTableRegion_Suppressed(t *testing.T) {
|
||||
eng := &mockEngine{
|
||||
pageCount: 1,
|
||||
renderW: 1800, renderH: 2400,
|
||||
chars: map[int][]pdf.TextChar{0: {
|
||||
eng := &MockEngine{
|
||||
NumPages: 1,
|
||||
RenderW: 1800, RenderH: 2400,
|
||||
Chars: map[int][]pdf.TextChar{0: {
|
||||
// Box A: inside DLA table region, labeled as "text" by DLA.
|
||||
{X0: 50, X1: 100, Top: 40, Bottom: 55, Text: "碎片文字"},
|
||||
// Box B: inside DLA table region, same situation.
|
||||
@@ -247,9 +247,9 @@ func TestTextSectionsInsideTableRegion_Suppressed(t *testing.T) {
|
||||
{X0: 200, Y0: 0, X1: 460, Y1: 100, Text: "年龄", Label: "table row"},
|
||||
},
|
||||
}
|
||||
p := NewParser(pdf.DefaultParserConfig(), mock)
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
result, err := p.ParseRaw(context.Background(), eng, mock)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -286,9 +286,10 @@ func TestTextSectionsInsideTableRegion_Suppressed(t *testing.T) {
|
||||
|
||||
// TestEmptyDoc_NoCrash verifies Parse handles edge cases gracefully.
|
||||
func TestEmptyDoc_NoCrash(t *testing.T) {
|
||||
eng := &mockEngine{pageCount: 0}
|
||||
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
eng := &MockEngine{NumPages: 0}
|
||||
mock := &MockDocAnalyzer{Healthy: true}
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
result, err := p.ParseRaw(context.Background(), eng, mock)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
@@ -299,13 +300,69 @@ func TestEmptyDoc_NoCrash(t *testing.T) {
|
||||
|
||||
// TestNilChars_Handled verifies that pages with zero chars don't crash.
|
||||
func TestNilChars_Handled(t *testing.T) {
|
||||
eng := &mockEngine{pageCount: 1, renderW: 200, renderH: 200}
|
||||
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
eng := &MockEngine{NumPages: 1, RenderW: 200, RenderH: 200}
|
||||
mock := &MockDocAnalyzer{Healthy: true}
|
||||
p := NewParser(pdf.DefaultParserConfig())
|
||||
result, err := p.ParseRaw(context.Background(), eng, mock)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
if len(result.Sections) != 0 && p.DeepDoc != nil {
|
||||
if len(result.Sections) != 0 {
|
||||
t.Logf("nil chars + DeepDoc: sections=%d (may trigger OCR path)", len(result.Sections))
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchTableImage_ByPositions(t *testing.T) {
|
||||
tableByRegion := map[string]string{
|
||||
"0_50.0_500.0_100.0_300.0": "img_base64_positions",
|
||||
}
|
||||
sec := &pdf.Section{
|
||||
LayoutType: pdf.LayoutTypeTable,
|
||||
Positions: []pdf.Position{{PageNumbers: []int{0}, Left: 50.0, Right: 500.0, Top: 100.0, Bottom: 300.0}},
|
||||
}
|
||||
img, ok := matchTableImage(sec, tableByRegion)
|
||||
if !ok {
|
||||
t.Fatal("expected match by Positions")
|
||||
}
|
||||
if img != "img_base64_positions" {
|
||||
t.Errorf("got %q, want img_base64_positions", img)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchTableImage_FallbackToRegion(t *testing.T) {
|
||||
tableByRegion := map[string]string{
|
||||
"0_80.0_520.0_200.0_400.0": "img_base64_region",
|
||||
}
|
||||
sec := &pdf.Section{
|
||||
LayoutType: pdf.LayoutTypeTable,
|
||||
Positions: nil,
|
||||
TableItem: &pdf.TableItem{RegionLeft: 80.0, RegionRight: 520.0, RegionTop: 200.0, RegionBottom: 400.0},
|
||||
}
|
||||
img, ok := matchTableImage(sec, tableByRegion)
|
||||
if !ok {
|
||||
t.Fatal("expected match by Region fallback")
|
||||
}
|
||||
if img != "img_base64_region" {
|
||||
t.Errorf("got %q, want img_base64_region", img)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchTableImage_NoMatch(t *testing.T) {
|
||||
tableByRegion := map[string]string{"0_10.0_20.0_30.0_40.0": "no_chance"}
|
||||
sec := &pdf.Section{
|
||||
LayoutType: pdf.LayoutTypeTable,
|
||||
Positions: []pdf.Position{{PageNumbers: []int{0}, Left: 100, Right: 200, Top: 300, Bottom: 400}},
|
||||
}
|
||||
_, ok := matchTableImage(sec, tableByRegion)
|
||||
if ok {
|
||||
t.Error("expected no match")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchTableImage_EmptySection(t *testing.T) {
|
||||
sec := &pdf.Section{LayoutType: pdf.LayoutTypeTable}
|
||||
_, ok := matchTableImage(sec, map[string]string{"x": "y"})
|
||||
if ok {
|
||||
t.Error("expected no match for empty section")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"image"
|
||||
@@ -6,48 +6,6 @@ import (
|
||||
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
||||
)
|
||||
|
||||
// ── mockEngine: minimal pdf.PDFEngine stub for unit tests ─────────────
|
||||
|
||||
type mockEngine struct {
|
||||
chars map[int][]pdf.TextChar
|
||||
pageCount int
|
||||
renderW int
|
||||
renderH int
|
||||
}
|
||||
|
||||
func (m *mockEngine) ExtractChars(pg int) ([]pdf.TextChar, error) {
|
||||
return m.chars[pg], nil
|
||||
}
|
||||
func (m *mockEngine) RenderPage(pg int, dpi float64) ([]byte, error) {
|
||||
w, h := m.renderW, m.renderH
|
||||
if w <= 0 {
|
||||
w = 595
|
||||
}
|
||||
if h <= 0 {
|
||||
h = 842
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
func (m *mockEngine) RenderPageImage(pg int, dpi float64) (image.Image, error) {
|
||||
w, h := m.renderW, m.renderH
|
||||
if w <= 0 {
|
||||
w = 100
|
||||
}
|
||||
if h <= 0 {
|
||||
h = 100
|
||||
}
|
||||
return image.NewRGBA(image.Rect(0, 0, w, h)), nil
|
||||
}
|
||||
func (m *mockEngine) PageCount() (int, error) {
|
||||
if m.pageCount <= 0 {
|
||||
return 1, nil
|
||||
}
|
||||
return m.pageCount, nil
|
||||
}
|
||||
func (m *mockEngine) RawData() []byte { return nil }
|
||||
func (m *mockEngine) Close() error { return nil }
|
||||
func (m *mockEngine) Outlines() ([]pdf.Outline, error) { return nil, nil }
|
||||
|
||||
// ── testPageImg: small test image for ocrMergeChars tests ─────────────
|
||||
// 90×120 px at 216 DPI → 30×40 pt in PDF space after /3.0 scaling.
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -66,8 +66,8 @@ func TestDumpTextOutput(t *testing.T) {
|
||||
}
|
||||
|
||||
cfg := pdf.DefaultParserConfig()
|
||||
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
p := NewParser(cfg)
|
||||
result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true})
|
||||
eng.Close()
|
||||
if err != nil {
|
||||
t.Logf("[%d/%d] %s — parse error: %v", i+1, count, name, err)
|
||||
|
||||
@@ -1,320 +1,56 @@
|
||||
// Package pdftypes provides shared types, interfaces, and constants for the
|
||||
// PDF parser pipeline. It has zero dependencies on sibling packages so that
|
||||
// sub-packages (tables, geometry, etc.) can import it without circular imports.
|
||||
// Package pdftype provides PDF-specific types and re-exports shared types
|
||||
// from the doctype package via Go type aliases. Existing PDF parser code
|
||||
// that imports this package continues to work without changes.
|
||||
package pdftype
|
||||
|
||||
import (
|
||||
"context"
|
||||
"image"
|
||||
"unicode"
|
||||
)
|
||||
import doctype "ragflow/internal/deepdoc/parser/type"
|
||||
|
||||
// ── Pipeline types ────────────────────────────────────────────────────────
|
||||
// ── Re-export shared types via aliases ─────────────────────────────────────
|
||||
|
||||
// PipelineMetrics records diagnostic counts at each pipeline stage.
|
||||
type PipelineMetrics struct {
|
||||
BoxesInitial int
|
||||
BoxesTextMerge int
|
||||
BoxesVertMerge int
|
||||
BoxesFinal int
|
||||
TablesCount int
|
||||
}
|
||||
type PipelineMetrics = doctype.PipelineMetrics
|
||||
type ParseResult = doctype.ParseResult
|
||||
type DLAPageRegions = doctype.DLAPageRegions
|
||||
type TSRRawCell = doctype.TSRRawCell
|
||||
type TextChar = doctype.TextChar
|
||||
type TextBox = doctype.TextBox
|
||||
type Position = doctype.Position
|
||||
type Section = doctype.Section
|
||||
type TableItem = doctype.TableItem
|
||||
type TSRCell = doctype.TSRCell
|
||||
type DLARegion = doctype.DLARegion
|
||||
type OCRBox = doctype.OCRBox
|
||||
type OCRText = doctype.OCRText
|
||||
type ParserConfig = doctype.ParserConfig
|
||||
type DocAnalyzer = doctype.DocAnalyzer
|
||||
type Outline = doctype.Outline
|
||||
type PDFEngine = doctype.PDFEngine
|
||||
type Tokenizer = doctype.Tokenizer
|
||||
type SampleFunc = doctype.SampleFunc
|
||||
type TableBuilder = doctype.TableBuilder
|
||||
type Rectangular = doctype.Rectangular
|
||||
|
||||
// ParseResult encapsulates all outputs from a single Parse() call.
|
||||
type ParseResult struct {
|
||||
Sections []Section
|
||||
Tables []TableItem
|
||||
PageImages map[int]image.Image
|
||||
Metrics PipelineMetrics
|
||||
Outlines []Outline // PDF outlines/bookmarks extracted from the document
|
||||
// ── Re-export constants ────────────────────────────────────────────────────
|
||||
|
||||
DLADebug []DLAPageRegions
|
||||
TSRDebug []TSRRawCell
|
||||
}
|
||||
|
||||
// Figures returns all sections with LayoutType "figure".
|
||||
// Computed on demand from Sections — no stored field.
|
||||
func (r *ParseResult) Figures() []Section {
|
||||
return CollectFigures(r.Sections)
|
||||
}
|
||||
|
||||
// DLAPageRegions holds DLA layout regions for one page.
|
||||
type DLAPageRegions struct {
|
||||
Page int
|
||||
Regions []DLARegion
|
||||
}
|
||||
|
||||
// TSRRawCell holds a raw TSR cell before row/column grouping.
|
||||
type TSRRawCell struct {
|
||||
TableIndex int `json:"table_index"`
|
||||
Page int `json:"page"`
|
||||
Label string `json:"label"`
|
||||
X0 float64 `json:"x0"`
|
||||
Y0 float64 `json:"y0"`
|
||||
X1 float64 `json:"x1"`
|
||||
Y1 float64 `json:"y1"`
|
||||
Text string `json:"text"`
|
||||
}
|
||||
|
||||
// ── Character and text box types ──────────────────────────────────────────
|
||||
|
||||
// TextChar represents a single character extracted from a PDF page.
|
||||
type TextChar struct {
|
||||
X0, X1 float64
|
||||
Top, Bottom float64
|
||||
Text string
|
||||
FontName string
|
||||
FontSize float64
|
||||
PageNumber int
|
||||
LayoutType string
|
||||
LayoutNo string
|
||||
ColID int
|
||||
R int
|
||||
}
|
||||
|
||||
func (c TextChar) Bounds() (float64, float64, float64, float64) {
|
||||
return c.X0, c.Top, c.X1, c.Bottom
|
||||
}
|
||||
|
||||
// TextBox represents a rectangular region of text on a PDF page.
|
||||
type TextBox struct {
|
||||
X0, X1 float64
|
||||
Top, Bottom float64
|
||||
Text string
|
||||
PageNumber int
|
||||
LayoutType string
|
||||
LayoutNo string
|
||||
ColID int
|
||||
R int
|
||||
// Post-TSR table annotation fields (Python: R/H/C/SP tags)
|
||||
RTop, RBott float64
|
||||
HTop, HBott float64
|
||||
HLeft, HRight float64
|
||||
H int
|
||||
C int
|
||||
CLeft, CRight float64
|
||||
SP int
|
||||
}
|
||||
|
||||
func (b TextBox) Bounds() (float64, float64, float64, float64) {
|
||||
return b.X0, b.Top, b.X1, b.Bottom
|
||||
}
|
||||
|
||||
// ── Position and section types ────────────────────────────────────────────
|
||||
|
||||
// Position represents a parsed position tag from @@...## format.
|
||||
type Position struct {
|
||||
PageNumbers []int
|
||||
Left float64
|
||||
Right float64
|
||||
Top float64
|
||||
Bottom float64
|
||||
}
|
||||
|
||||
// Section represents a text segment with its spatial position on a PDF page.
|
||||
type Section struct {
|
||||
Text string
|
||||
PositionTag string
|
||||
LayoutType string
|
||||
DocTypeKwd string // "text"/"table"/"image" — assigned during post-processing
|
||||
Positions []Position
|
||||
TableItem *TableItem
|
||||
Image string // base64-encoded cropped page image
|
||||
}
|
||||
|
||||
// SectionsByPage returns a slice of sections on the given page.
|
||||
func SectionsByPage(sections []Section, page int) []Section {
|
||||
var out []Section
|
||||
for _, s := range sections {
|
||||
for _, p := range s.Positions {
|
||||
for _, pn := range p.PageNumbers {
|
||||
if pn == page {
|
||||
out = append(out, s)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// CollectFigures returns all sections with LayoutType "figure".
|
||||
func CollectFigures(sections []Section) []Section {
|
||||
if sections == nil {
|
||||
return nil
|
||||
}
|
||||
figures := make([]Section, 0)
|
||||
for _, s := range sections {
|
||||
if s.LayoutType == LayoutTypeFigure {
|
||||
figures = append(figures, s)
|
||||
}
|
||||
}
|
||||
return figures
|
||||
}
|
||||
|
||||
// ── Table types ───────────────────────────────────────────────────────────
|
||||
|
||||
// TableItem represents a detected table or figure region.
|
||||
type TableItem struct {
|
||||
ImageB64 string
|
||||
Rows [][]string
|
||||
Cells []TSRCell
|
||||
Positions []Position
|
||||
Scale float64
|
||||
CropOffX float64
|
||||
CropOffY float64
|
||||
Caption string
|
||||
|
||||
RegionLeft, RegionRight, RegionTop, RegionBottom float64
|
||||
NoMerge bool
|
||||
Grid [][]TSRCell
|
||||
}
|
||||
|
||||
// TSRCell represents one table cell from TSR.
|
||||
type TSRCell struct {
|
||||
X0, Y0, X1, Y1 float64
|
||||
Text string
|
||||
Label string
|
||||
}
|
||||
|
||||
func (c TSRCell) Bounds() (float64, float64, float64, float64) {
|
||||
return c.X0, c.Y0, c.X1, c.Y1
|
||||
}
|
||||
|
||||
// ── DeepDoc vision types ─────────────────────────────────────────────────
|
||||
|
||||
// DLARegion represents one detected layout region.
|
||||
type DLARegion struct {
|
||||
X0, Y0, X1, Y1 float64
|
||||
Label string
|
||||
Confidence float64
|
||||
}
|
||||
|
||||
func (r DLARegion) Bounds() (float64, float64, float64, float64) {
|
||||
return r.X0, r.Y0, r.X1, r.Y1
|
||||
}
|
||||
|
||||
// OCRBox represents a detected text region from DeepDoc OCR detection.
|
||||
type OCRBox struct {
|
||||
X0, Y0, X1, Y1, X2, Y2, X3, Y3 float64
|
||||
}
|
||||
|
||||
// OCRText represents recognized text with confidence from DeepDoc OCR rec.
|
||||
type OCRText struct {
|
||||
Text string
|
||||
Confidence float64
|
||||
}
|
||||
|
||||
// ── Parser configuration ──────────────────────────────────────────────────
|
||||
|
||||
// ParserConfig holds parser configuration.
|
||||
type ParserConfig struct {
|
||||
Zoom float64
|
||||
FromPage int
|
||||
ToPage int
|
||||
TableContextSize int
|
||||
ImageContextSize int
|
||||
AutoRotateTables *bool
|
||||
SeparateTablesFigs bool
|
||||
SortByTop bool
|
||||
BatchSize int
|
||||
SkipOCR bool
|
||||
MaxOCRConcurrency int
|
||||
TableBuilder TableBuilder
|
||||
}
|
||||
|
||||
// DefaultParserConfig returns a ParserConfig with sensible defaults.
|
||||
func DefaultParserConfig() ParserConfig {
|
||||
return ParserConfig{
|
||||
Zoom: 3,
|
||||
FromPage: 0,
|
||||
ToPage: -1,
|
||||
BatchSize: 50,
|
||||
TableContextSize: 0,
|
||||
ImageContextSize: 0,
|
||||
SeparateTablesFigs: false,
|
||||
}
|
||||
}
|
||||
|
||||
// DlaDPI is the DPI used for rendering page images for DeepDoc DLA/OCR.
|
||||
const DlaDPI = 216
|
||||
|
||||
// DlaScale is the scale factor from PDF points (72 DPI) to DLA image space.
|
||||
const DlaScale = DlaDPI / 72.0
|
||||
|
||||
// ── Layout type constants ─────────────────────────────────────────────────
|
||||
const DlaDPI = doctype.DlaDPI
|
||||
const DlaScale = doctype.DlaScale
|
||||
|
||||
const (
|
||||
LayoutTypeText = "text"
|
||||
LayoutTypeTable = "table"
|
||||
LayoutTypeFigure = "figure"
|
||||
LayoutTypeEquation = "equation"
|
||||
LayoutTypeTitle = "title"
|
||||
LayoutTypeReference = "reference"
|
||||
LayoutTypeFooter = "footer"
|
||||
LayoutTypeHeader = "header"
|
||||
|
||||
DLALabelFigureCaption = "figure caption"
|
||||
DLALabelTableCaption = "table caption"
|
||||
LayoutTypeText = doctype.LayoutTypeText
|
||||
LayoutTypeTable = doctype.LayoutTypeTable
|
||||
LayoutTypeFigure = doctype.LayoutTypeFigure
|
||||
LayoutTypeEquation = doctype.LayoutTypeEquation
|
||||
LayoutTypeTitle = doctype.LayoutTypeTitle
|
||||
LayoutTypeReference = doctype.LayoutTypeReference
|
||||
LayoutTypeFooter = doctype.LayoutTypeFooter
|
||||
LayoutTypeHeader = doctype.LayoutTypeHeader
|
||||
DLALabelFigureCaption = doctype.DLALabelFigureCaption
|
||||
DLALabelTableCaption = doctype.DLALabelTableCaption
|
||||
)
|
||||
|
||||
// ── Interfaces ────────────────────────────────────────────────────────────
|
||||
// ── Re-export functions and variables ──────────────────────────────────────
|
||||
|
||||
// DocAnalyzer abstracts DeepDoc vision operations.
|
||||
type DocAnalyzer interface {
|
||||
DLA(ctx context.Context, pageImage image.Image) ([]DLARegion, error)
|
||||
TSR(ctx context.Context, cropped image.Image) ([]TSRCell, error)
|
||||
OCRDetect(ctx context.Context, cropped image.Image) ([]OCRBox, error)
|
||||
OCRRecognize(ctx context.Context, cropped image.Image) ([]OCRText, error)
|
||||
OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]OCRText, []error)
|
||||
Health() bool
|
||||
}
|
||||
|
||||
// ── Outline ────────────────────────────────────────────────────────────
|
||||
|
||||
// Outline represents one entry in a PDF's document outline (table of contents).
|
||||
// Python: extract_pdf_outlines() in deepdoc/parser/utils.py
|
||||
type Outline struct {
|
||||
Title string
|
||||
Level int
|
||||
PageNumber int // 1-indexed, matching Python
|
||||
}
|
||||
|
||||
// PDFEngine abstracts page extraction capabilities.
|
||||
type PDFEngine interface {
|
||||
ExtractChars(pageNum int) ([]TextChar, error)
|
||||
RenderPage(pageNum int, dpi float64) ([]byte, error)
|
||||
RenderPageImage(pageNum int, dpi float64) (image.Image, error)
|
||||
RawData() []byte
|
||||
PageCount() (int, error)
|
||||
Outlines() ([]Outline, error)
|
||||
Close() error
|
||||
}
|
||||
|
||||
// Tokenizer provides text tokenization matching rag_tokenizer.
|
||||
type Tokenizer interface {
|
||||
Tag(token string) string
|
||||
}
|
||||
|
||||
// SampleFunc samples up to n characters from a page's chars.
|
||||
type SampleFunc func(chars []TextChar, n int) string
|
||||
|
||||
// TableBuilder encapsulates TSR model-specific cell detection and grouping.
|
||||
type TableBuilder interface {
|
||||
Name() string
|
||||
DetectCells(ctx context.Context, cropped image.Image) ([]TSRCell, error)
|
||||
GroupCells(cells []TSRCell) [][]TSRCell
|
||||
}
|
||||
|
||||
// Rectangular is any 2D axis-aligned rectangle that can report its bounds.
|
||||
type Rectangular interface {
|
||||
Bounds() (x0, y0, x1, y1 float64)
|
||||
}
|
||||
|
||||
// IsCJK reports whether r belongs to any of the Unicode Han, Hiragana,
// Katakana, or Hangul script tables.
func IsCJK(r rune) bool {
	return unicode.In(r, unicode.Han, unicode.Hiragana, unicode.Katakana, unicode.Hangul)
}
|
||||
var (
|
||||
CollectFigures = doctype.CollectFigures
|
||||
DefaultParserConfig = doctype.DefaultParserConfig
|
||||
IsCJK = doctype.IsCJK
|
||||
)
|
||||
|
||||
@@ -131,34 +131,6 @@ func OverlapX(a, b pdf.Rectangular) float64 {
|
||||
return overlap / minWidth
|
||||
}
|
||||
|
||||
// SortXByPage sorts boxes by page_number, then x0, then top.
|
||||
// After sorting, corrects for same-page boxes that have nearly the same x0
|
||||
// but inverted top ordering (a layout artifact).
|
||||
//
|
||||
// Python: pdf_parser.py:178 sort_X_by_page()
|
||||
func SortXByPage(boxes []pdf.TextBox, threshold float64) []pdf.TextBox {
|
||||
sort.Slice(boxes, func(i, j int) bool {
|
||||
if boxes[i].PageNumber != boxes[j].PageNumber {
|
||||
return boxes[i].PageNumber < boxes[j].PageNumber
|
||||
}
|
||||
if boxes[i].X0 != boxes[j].X0 {
|
||||
return boxes[i].X0 < boxes[j].X0
|
||||
}
|
||||
return boxes[i].Top < boxes[j].Top
|
||||
})
|
||||
|
||||
for i := len(boxes) - 1; i >= 1; i-- {
|
||||
for j := i - 1; j >= 0; j-- {
|
||||
if math.Abs(boxes[j+1].X0-boxes[j].X0) < threshold &&
|
||||
boxes[j+1].Top < boxes[j].Top &&
|
||||
boxes[j+1].PageNumber == boxes[j].PageNumber {
|
||||
boxes[j], boxes[j+1] = boxes[j+1], boxes[j]
|
||||
}
|
||||
}
|
||||
}
|
||||
return boxes
|
||||
}
|
||||
|
||||
// MedianCharHeight computes the median character height for a page,
|
||||
// matching Python's np.median(char height) in __images__ (pdf_parser.py:1552).
|
||||
// Used as a reference unit for vertical spacing decisions.
|
||||
|
||||
@@ -49,22 +49,6 @@ func TestYDis(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestSortXByPage(t *testing.T) {
|
||||
boxes := []pdf.TextBox{
|
||||
{PageNumber: 1, X0: 100, Top: 50, Text: "C"},
|
||||
{PageNumber: 1, X0: 50, Top: 100, Text: "A"},
|
||||
{PageNumber: 1, X0: 50, Top: 30, Text: "B"},
|
||||
{PageNumber: 0, X0: 0, Top: 0, Text: "D"},
|
||||
}
|
||||
result := SortXByPage(boxes, 3)
|
||||
if result[0].Text != "D" {
|
||||
t.Errorf("first should be page 0: got %q", result[0].Text)
|
||||
}
|
||||
if result[1].Text != "B" || result[2].Text != "A" {
|
||||
t.Errorf("page 1 ordering wrong: %q, %q", result[1].Text, result[2].Text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOverlapX(t *testing.T) {
|
||||
b1 := pdf.TextBox{X0: 50, X1: 200}
|
||||
b2 := pdf.TextBox{X0: 100, X1: 250}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package parser
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"math"
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
lyt "ragflow/internal/deepdoc/parser/pdf/layout"
|
||||
"ragflow/internal/deepdoc/parser/pdf/pdfoxide"
|
||||
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
||||
)
|
||||
@@ -41,7 +42,7 @@ func TestYCoord_SameLineCharsHaveEqualBottom(t *testing.T) {
|
||||
t.Fatal("no chars")
|
||||
}
|
||||
|
||||
lines := groupCharsToLines(chars, false)
|
||||
lines := lyt.GroupCharsToLines(chars, false)
|
||||
for li, line := range lines {
|
||||
if len(line) <= 1 {
|
||||
continue
|
||||
|
||||
304
internal/deepdoc/parser/type/types.go
Normal file
304
internal/deepdoc/parser/type/types.go
Normal file
@@ -0,0 +1,304 @@
|
||||
// Package doctype provides shared types, interfaces, and constants for the
|
||||
// deepdoc parser pipeline. All format-specific parsers (pdf, docx, xlsx, etc.)
|
||||
// share these definitions. The package has zero dependencies on sibling
|
||||
// packages so that any sub-package can import it without circular imports.
|
||||
package doctype
|
||||
|
||||
import (
|
||||
"context"
|
||||
"image"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// ── Pipeline types ────────────────────────────────────────────────────────
|
||||
|
||||
// PipelineMetrics records diagnostic counts at each pipeline stage.
|
||||
type PipelineMetrics struct {
|
||||
BoxesInitial int
|
||||
BoxesTextMerge int
|
||||
BoxesVertMerge int
|
||||
BoxesFinal int
|
||||
TablesCount int
|
||||
}
|
||||
|
||||
// ParseResult encapsulates all outputs from a single Parse() call.
|
||||
type ParseResult struct {
|
||||
Sections []Section
|
||||
Tables []TableItem
|
||||
PageImages map[int]image.Image
|
||||
Metrics PipelineMetrics
|
||||
Outlines []Outline // PDF outlines/bookmarks extracted from the document
|
||||
|
||||
DLADebug []DLAPageRegions
|
||||
TSRDebug []TSRRawCell
|
||||
}
|
||||
|
||||
// Figures returns all sections with LayoutType "figure".
|
||||
// Computed on demand from Sections — no stored field.
|
||||
func (r *ParseResult) Figures() []Section {
|
||||
return CollectFigures(r.Sections)
|
||||
}
|
||||
|
||||
// DLAPageRegions holds DLA layout regions for one page.
|
||||
type DLAPageRegions struct {
|
||||
Page int
|
||||
Regions []DLARegion
|
||||
}
|
||||
|
||||
// TSRRawCell holds a raw TSR cell before row/column grouping.
|
||||
type TSRRawCell struct {
|
||||
TableIndex int `json:"table_index"`
|
||||
Page int `json:"page"`
|
||||
Label string `json:"label"`
|
||||
X0 float64 `json:"x0"`
|
||||
Y0 float64 `json:"y0"`
|
||||
X1 float64 `json:"x1"`
|
||||
Y1 float64 `json:"y1"`
|
||||
Text string `json:"text"`
|
||||
}
|
||||
|
||||
// ── Character and text box types ──────────────────────────────────────────
|
||||
|
||||
// TextChar represents a single character extracted from a PDF page.
|
||||
type TextChar struct {
|
||||
X0, X1 float64
|
||||
Top, Bottom float64
|
||||
Text string
|
||||
FontName string
|
||||
FontSize float64
|
||||
PageNumber int
|
||||
LayoutType string
|
||||
LayoutNo string
|
||||
ColID int
|
||||
R int
|
||||
}
|
||||
|
||||
func (c TextChar) Bounds() (float64, float64, float64, float64) {
|
||||
return c.X0, c.Top, c.X1, c.Bottom
|
||||
}
|
||||
|
||||
// TextBox represents a rectangular region of text on a PDF page.
|
||||
type TextBox struct {
|
||||
X0, X1 float64
|
||||
Top, Bottom float64
|
||||
Text string
|
||||
PageNumber int
|
||||
LayoutType string
|
||||
LayoutNo string
|
||||
ColID int
|
||||
R int
|
||||
// Post-TSR table annotation fields (Python: R/H/C/SP tags)
|
||||
RTop, RBott float64
|
||||
HTop, HBott float64
|
||||
HLeft, HRight float64
|
||||
H int
|
||||
C int
|
||||
CLeft, CRight float64
|
||||
SP int
|
||||
}
|
||||
|
||||
func (b TextBox) Bounds() (float64, float64, float64, float64) {
|
||||
return b.X0, b.Top, b.X1, b.Bottom
|
||||
}
|
||||
|
||||
// ── Position and section types ────────────────────────────────────────────
|
||||
|
||||
// Position represents a parsed position tag from @@...## format.
|
||||
type Position struct {
|
||||
PageNumbers []int
|
||||
Left float64
|
||||
Right float64
|
||||
Top float64
|
||||
Bottom float64
|
||||
}
|
||||
|
||||
// Section represents a text segment with its spatial position on a PDF page.
|
||||
type Section struct {
|
||||
Text string
|
||||
PositionTag string
|
||||
LayoutType string
|
||||
DocTypeKwd string // "text"/"table"/"image" — assigned during post-processing
|
||||
Positions []Position
|
||||
TableItem *TableItem
|
||||
Image string // base64-encoded cropped page image
|
||||
}
|
||||
|
||||
// CollectFigures returns all sections with LayoutType "figure".
|
||||
func CollectFigures(sections []Section) []Section {
|
||||
if sections == nil {
|
||||
return nil
|
||||
}
|
||||
figures := make([]Section, 0)
|
||||
for _, s := range sections {
|
||||
if s.LayoutType == LayoutTypeFigure {
|
||||
figures = append(figures, s)
|
||||
}
|
||||
}
|
||||
return figures
|
||||
}
|
||||
|
||||
// ── Table types ───────────────────────────────────────────────────────────
|
||||
|
||||
// TableItem represents a detected table or figure region.
|
||||
type TableItem struct {
|
||||
ImageB64 string
|
||||
Rows [][]string
|
||||
Cells []TSRCell
|
||||
Positions []Position
|
||||
Scale float64
|
||||
CropOffX float64
|
||||
CropOffY float64
|
||||
Caption string
|
||||
|
||||
RegionLeft, RegionRight, RegionTop, RegionBottom float64
|
||||
NoMerge bool
|
||||
Grid [][]TSRCell
|
||||
}
|
||||
|
||||
// TSRCell represents one table cell from TSR.
|
||||
type TSRCell struct {
|
||||
X0, Y0, X1, Y1 float64
|
||||
Text string
|
||||
Label string
|
||||
}
|
||||
|
||||
func (c TSRCell) Bounds() (float64, float64, float64, float64) {
|
||||
return c.X0, c.Y0, c.X1, c.Y1
|
||||
}
|
||||
|
||||
// ── DeepDoc vision types ─────────────────────────────────────────────────
|
||||
|
||||
// DLARegion represents one detected layout region.
|
||||
type DLARegion struct {
|
||||
X0, Y0, X1, Y1 float64
|
||||
Label string
|
||||
Confidence float64
|
||||
}
|
||||
|
||||
func (r DLARegion) Bounds() (float64, float64, float64, float64) {
|
||||
return r.X0, r.Y0, r.X1, r.Y1
|
||||
}
|
||||
|
||||
// OCRBox represents a detected text region from DeepDoc OCR detection.
|
||||
type OCRBox struct {
|
||||
X0, Y0, X1, Y1, X2, Y2, X3, Y3 float64
|
||||
}
|
||||
|
||||
// OCRText represents recognized text with confidence from DeepDoc OCR rec.
|
||||
type OCRText struct {
|
||||
Text string
|
||||
Confidence float64
|
||||
}
|
||||
|
||||
// ── Parser configuration ──────────────────────────────────────────────────
|
||||
|
||||
// ParserConfig holds parser configuration.
|
||||
type ParserConfig struct {
|
||||
Zoom float64
|
||||
FromPage int
|
||||
ToPage int
|
||||
TableContextSize int
|
||||
ImageContextSize int
|
||||
AutoRotateTables *bool
|
||||
SeparateTablesFigs bool
|
||||
SortByTop bool
|
||||
BatchSize int
|
||||
SkipOCR bool
|
||||
MaxOCRConcurrency int
|
||||
}
|
||||
|
||||
// DefaultParserConfig returns a ParserConfig with sensible defaults.
|
||||
func DefaultParserConfig() ParserConfig {
|
||||
return ParserConfig{
|
||||
Zoom: 3,
|
||||
FromPage: 0,
|
||||
ToPage: -1,
|
||||
BatchSize: 50,
|
||||
TableContextSize: 0,
|
||||
ImageContextSize: 0,
|
||||
SeparateTablesFigs: false,
|
||||
}
|
||||
}
|
||||
|
||||
// DlaDPI is the DPI used for rendering page images for DeepDoc DLA/OCR.
|
||||
const DlaDPI = 216
|
||||
|
||||
// DlaScale is the scale factor from PDF points (72 DPI) to DLA image space.
|
||||
const DlaScale = DlaDPI / 72.0
|
||||
|
||||
// ── Layout type constants ─────────────────────────────────────────────────
|
||||
|
||||
const (
|
||||
LayoutTypeText = "text"
|
||||
LayoutTypeTable = "table"
|
||||
LayoutTypeFigure = "figure"
|
||||
LayoutTypeEquation = "equation"
|
||||
LayoutTypeTitle = "title"
|
||||
LayoutTypeReference = "reference"
|
||||
LayoutTypeFooter = "footer"
|
||||
LayoutTypeHeader = "header"
|
||||
|
||||
DLALabelFigureCaption = "figure caption"
|
||||
DLALabelTableCaption = "table caption"
|
||||
)
|
||||
|
||||
// ── Interfaces ────────────────────────────────────────────────────────────
|
||||
|
||||
// DocAnalyzer abstracts DeepDoc vision operations.
|
||||
type DocAnalyzer interface {
|
||||
DLA(ctx context.Context, pageImage image.Image) ([]DLARegion, error)
|
||||
TSR(ctx context.Context, cropped image.Image) ([]TSRCell, error)
|
||||
OCRDetect(ctx context.Context, cropped image.Image) ([]OCRBox, error)
|
||||
OCRRecognize(ctx context.Context, cropped image.Image) ([]OCRText, error)
|
||||
OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]OCRText, []error)
|
||||
Health() bool
|
||||
}
|
||||
|
||||
// ── Outline ────────────────────────────────────────────────────────────
|
||||
|
||||
// Outline represents one entry in a PDF's document outline (table of contents).
|
||||
// Python: extract_pdf_outlines() in deepdoc/parser/utils.py
|
||||
type Outline struct {
|
||||
Title string
|
||||
Level int
|
||||
PageNumber int // 1-indexed, matching Python
|
||||
}
|
||||
|
||||
// PDFEngine abstracts page extraction capabilities.
|
||||
type PDFEngine interface {
|
||||
ExtractChars(pageNum int) ([]TextChar, error)
|
||||
RenderPage(pageNum int, dpi float64) ([]byte, error)
|
||||
RenderPageImage(pageNum int, dpi float64) (image.Image, error)
|
||||
RawData() []byte
|
||||
PageCount() (int, error)
|
||||
Outlines() ([]Outline, error)
|
||||
Close() error
|
||||
}
|
||||
|
||||
// Tokenizer provides text tokenization matching rag_tokenizer.
|
||||
type Tokenizer interface {
|
||||
Tag(token string) string
|
||||
}
|
||||
|
||||
// SampleFunc samples up to n characters from a page's chars.
|
||||
type SampleFunc func(chars []TextChar, n int) string
|
||||
|
||||
// TableBuilder encapsulates TSR model-specific cell detection and grouping.
|
||||
type TableBuilder interface {
|
||||
Name() string
|
||||
DetectCells(ctx context.Context, cropped image.Image) ([]TSRCell, error)
|
||||
GroupCells(cells []TSRCell) [][]TSRCell
|
||||
}
|
||||
|
||||
// Rectangular is any 2D axis-aligned rectangle that can report its bounds.
|
||||
type Rectangular interface {
|
||||
Bounds() (x0, y0, x1, y1 float64)
|
||||
}
|
||||
|
||||
// IsCJK reports whether r belongs to any of the Unicode Han, Hiragana,
// Katakana, or Hangul script tables.
func IsCJK(r rune) bool {
	return unicode.In(r, unicode.Han, unicode.Hiragana, unicode.Katakana, unicode.Hangul)
}
|
||||
Reference in New Issue
Block a user