Feat/oss parser no post (#16464)

### Summary

Remove dead code
This commit is contained in:
Jack
2026-07-02 09:46:33 +08:00
committed by GitHub
parent 133b1e15fd
commit 5bc4753d1e
51 changed files with 1381 additions and 2680 deletions

2
.gitignore vendored
View File

@@ -245,3 +245,5 @@ bin/*
# Parser test fixtures and python tools
internal/deepdoc/parser/pdf/testdata/
internal/deepdoc/parser/pdf/tools-py/
internal/deepdoc/parser/docx/testdata/
internal/deepdoc/parser/docx/tool/

View File

@@ -1,6 +1,6 @@
//go:build cgo && manual
package parser
package pdf
import (
"context"
@@ -51,12 +51,12 @@ func TestBatchResults(t *testing.T) {
}
pdfs := all[:min(count, len(all))]
ddClient, err := inf.NewInferenceClient(os.Getenv("DEEPDOC_URL"))
ddClient, err := inf.NewClient(os.Getenv("DEEPDOC_URL"))
if err != nil {
t.Fatal(err)
}
if !ddClient.Health() {
t.Fatalf("DeepDoc service not available at %s (DLA+TSR required)", ddClient.baseURL)
t.Fatalf("DeepDoc service not available at %s (DLA+TSR required)", ddClient.BaseURL())
}
deepDoc := pdf.DocAnalyzer(ddClient)
@@ -238,9 +238,9 @@ func parseOne(pdfDir, name string, deepDoc pdf.DocAnalyzer, skipOCR bool) (*pars
cfg := pdf.DefaultParserConfig()
cfg.SkipOCR = skipOCR
p := NewParser(cfg, deepDoc)
p := NewParser(cfg)
t0 := time.Now()
parsed, err := p.Parse(context.Background(), eng)
parsed, err := p.ParseRaw(context.Background(), eng, deepDoc)
elapsed := time.Since(t0).Seconds()
if err != nil {
return nil, fmt.Errorf("parse: %w", err)

View File

@@ -1,6 +1,6 @@
//go:build manual
package parser
package pdf
import (
"log/slog"
@@ -8,7 +8,7 @@ import (
"path/filepath"
"testing"
"ragflow/internal/deepdoc/parser/pdf/tools"
"ragflow/internal/deepdoc/parser/pdf/tool"
)
// TestBatchCompareWithPython compares Go output against Python reference
@@ -37,29 +37,29 @@ func TestBatchCompareWithPython(t *testing.T) {
pyTextDir := filepath.Join("testdata", "output", "py", pyVariant, "text")
// Read Go text files' #@meta (no aggregate JSON dependency).
goResults, err := tools.ReadGoTextMeta(goTextDir)
goResults, err := tool.ReadGoTextMeta(goTextDir)
if err != nil || len(goResults) == 0 {
t.Fatalf("No Go text files in %s: %v", goTextDir, err)
}
// Read Python text files' #@meta
pyResults, err := tools.ReadPythonTextMeta(pyTextDir)
pyResults, err := tool.ReadPythonTextMeta(pyTextDir)
if err != nil || len(pyResults) == 0 {
t.Fatalf("No Python text files in %s: %v", pyTextDir, err)
}
t.Logf("Comparing %d Go × %d Python", len(goResults), len(pyResults))
tools.CompareWithPython(t, goResults, pyResults, goTextDir, pyTextDir)
tool.CompareWithPython(t, goResults, pyResults, goTextDir, pyTextDir)
// Compare tables.
goTablesDir := filepath.Join("testdata", "output", "go", goVariant, "tables")
pyTablesDir2 := filepath.Join("testdata", "output", "py", pyVariant, "tables")
tools.CompareTablesWithPython(t, goTablesDir, pyTablesDir2)
tool.CompareTablesWithPython(t, goTablesDir, pyTablesDir2)
// Compare DLA + TSR raw intermediates.
goDLADir := filepath.Join("testdata", "output", "go", goVariant, "dla")
pyDLADir := filepath.Join("testdata", "output", "py", pyVariant, "dla")
tools.CompareDLAWithPython(t, goDLADir, pyDLADir)
tool.CompareDLAWithPython(t, goDLADir, pyDLADir)
goTSRRawDir := filepath.Join("testdata", "output", "go", goVariant, "tsr_raw")
pyTSRRawDir := filepath.Join("testdata", "output", "py", pyVariant, "tsr_raw")
tools.CompareTSRRawWithPython(t, goTSRRawDir, pyTSRRawDir)
tool.CompareTSRRawWithPython(t, goTSRRawDir, pyTSRRawDir)
}

View File

@@ -1,6 +1,6 @@
//go:build cgo && manual
package parser
package pdf
import (
"bytes"
@@ -27,8 +27,8 @@ func TestParse_CropSectionImages(t *testing.T) {
defer eng.Close()
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true})
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -79,8 +79,8 @@ func TestCrop_Regression_SnapshotPDFs(t *testing.T) {
}
defer eng.Close()
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
result, err := p.Parse(context.Background(), eng)
p := NewParser(pdf.DefaultParserConfig())
result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true})
if err != nil {
t.Fatalf("Parse: %v", err)
}

View File

@@ -1,6 +1,6 @@
//go:build cgo && integration
package parser
package pdf
import (
"context"
@@ -46,7 +46,7 @@ func TestDLARealWorldCompare(t *testing.T) {
for _, pg := range pdf.pages {
testName := pdf.name + "/page" + string(rune('0'+pg))
t.Run(testName, func(t *testing.T) {
pageImg, err := renderPageToImage(eng, pg)
pageImg, err := RenderPageToImage(eng, pg)
if err != nil {
t.Fatalf("render page %d: %v", pg, err)
}

View File

@@ -1,6 +1,6 @@
//go:build cgo && integration
package parser
package pdf
import (
"context"
@@ -28,7 +28,7 @@ func TestDLATSRResponseCompare(t *testing.T) {
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
pageImg, err := renderPageToImage(eng, 0)
pageImg, err := RenderPageToImage(eng, 0)
if err != nil {
t.Fatalf("render: %v", err)
}

View File

@@ -1,6 +1,6 @@
//go:build cgo
package parser
package pdf
import (
"os"
@@ -11,20 +11,14 @@ import (
pdf "ragflow/internal/deepdoc/parser/pdf/type"
)
// ── Shared CGO test helpers ──────────────────────────────────────────────────
// These helpers were previously duplicated across multiple test files with
// different build tags (integration, manual). Consolidating them into one file
// with the //go:build cgo tag makes them available to all cgo-tagged tests.
// mustConnectInferenceClient returns a InferenceClient pointed at the OSS service;
// skips the test if the service reports a non-OSS model type.
func mustConnectInferenceClient(t *testing.T) *inf.InferenceClient {
// mustConnectInferenceClient returns an inference client for the OSS DeepDoc service.
func mustConnectInferenceClient(t *testing.T) *inf.Client {
t.Helper()
url := os.Getenv("OSSDEEPDOC_URL")
if url == "" {
url = "http://localhost:9390"
}
client, err := inf.NewInferenceClient(url)
client, err := inf.NewClient(url)
if err != nil {
t.Fatal(err)
}
@@ -48,3 +42,12 @@ func mustOpenEngine(t *testing.T, name string) pdf.PDFEngine {
}
return eng
}
// mustReadPDF loads the fixture testdata/pdfs/<name> and returns its raw
// bytes. Any read error fails the calling test immediately via t.Fatalf.
func mustReadPDF(t *testing.T, name string) []byte {
	t.Helper()

	fixturePath := filepath.Join("testdata", "pdfs", name)
	raw, readErr := os.ReadFile(fixturePath)
	if readErr != nil {
		t.Fatalf("read fixture %s: %v", name, readErr)
	}
	return raw
}

View File

@@ -21,8 +21,8 @@ import (
"github.com/cenkalti/backoff/v5"
)
// InferenceClient wraps the DeepDoc HTTP API.
type InferenceClient struct {
// Client wraps the DeepDoc HTTP API.
type Client struct {
baseURL string
httpClient *http.Client
@@ -33,24 +33,27 @@ type InferenceClient struct {
}
// BaseURL returns the configured DeepDoc service URL.
func (c *InferenceClient) BaseURL() string { return c.baseURL }
func (c *Client) BaseURL() string { return c.baseURL }
// NewInferenceClient creates a client. baseURL must be provided by the caller
// NewClient creates a client. baseURL must be provided by the caller
// (e.g. from the DEEPDOC_URL environment variable). Returns an error if empty.
func NewInferenceClient(baseURL string) (*InferenceClient, error) {
func NewClient(baseURL string) (*Client, error) {
if baseURL == "" {
return nil, fmt.Errorf("deepdoc client: baseURL is required (set DEEPDOC_URL)")
}
return &InferenceClient{
return &Client{
baseURL: baseURL,
httpClient: &http.Client{
Timeout: 120 * time.Second,
},
DLALabels: DefaultDLALabels(),
TSRLabels: DefaultTSRLabels(),
}, nil
}
// Default DLA/TSR label tables used as fallback when no model-specific
// labels are injected by a TableBuilder constructor.
// DefaultDLALabels returns the 10-class DLA taxonomy matching Python's
// deepdoc/vision/dla_cli.py:10-21. Duplicates at indices 4, 7, 9 are
// kept verbatim for backward compatibility with existing inference servers.
func DefaultDLALabels() []string {
return []string{
pdf.LayoutTypeTitle, pdf.LayoutTypeText, pdf.LayoutTypeReference,
@@ -59,6 +62,9 @@ func DefaultDLALabels() []string {
pdf.LayoutTypeEquation, pdf.DLALabelFigureCaption,
}
}
// DefaultTSRLabels returns the 6-class TSR taxonomy matching Python's
// deepdoc/server/adapters/tsr_adapter.py:21-26.
func DefaultTSRLabels() []string {
return []string{
"table", "table column", "table row",
@@ -72,7 +78,7 @@ type bboxesResponse struct {
}
// DLA analyzes a full page image and returns labeled regions.
func (c *InferenceClient) DLA(ctx context.Context, pageImage image.Image) ([]pdf.DLARegion, error) {
func (c *Client) DLA(ctx context.Context, pageImage image.Image) ([]pdf.DLARegion, error) {
data, err := util.EncodeJPEG(pageImage)
if err != nil {
return nil, fmt.Errorf("dla: encode: %w", err)
@@ -87,9 +93,6 @@ func (c *InferenceClient) DLA(ctx context.Context, pageImage image.Image) ([]pdf
continue
}
labels := c.DLALabels
if labels == nil {
labels = DefaultDLALabels()
}
label := ""
if clsID := int(b[5]); clsID >= 0 && clsID < len(labels) {
label = labels[clsID]
@@ -104,7 +107,7 @@ func (c *InferenceClient) DLA(ctx context.Context, pageImage image.Image) ([]pdf
}
// TSR recognises table structure from a cropped image.
func (c *InferenceClient) TSR(ctx context.Context, cropped image.Image) ([]pdf.TSRCell, error) {
func (c *Client) TSR(ctx context.Context, cropped image.Image) ([]pdf.TSRCell, error) {
data, err := util.EncodeJPEG(cropped)
if err != nil {
return nil, fmt.Errorf("tsr: encode: %w", err)
@@ -119,9 +122,6 @@ func (c *InferenceClient) TSR(ctx context.Context, cropped image.Image) ([]pdf.T
continue
}
tlabels := c.TSRLabels
if tlabels == nil {
tlabels = DefaultTSRLabels()
}
label := ""
if len(b) >= 6 {
if cls := int(b[5]); cls >= 0 && cls < len(tlabels) {
@@ -152,7 +152,7 @@ type ocrRecognizeResponse struct {
// OCRDetect detects text regions (bounding boxes) in an image.
// DeepDoc /predict/ocr with operator=det returns quad boxes: [[[x0,y0],[x1,y1],[x2,y2],[x3,y3]], ...]
func (c *InferenceClient) OCRDetect(ctx context.Context, cropped image.Image) ([]pdf.OCRBox, error) {
func (c *Client) OCRDetect(ctx context.Context, cropped image.Image) ([]pdf.OCRBox, error) {
data, err := util.EncodeJPEG(cropped)
if err != nil {
return nil, fmt.Errorf("ocr detect: encode: %w", err)
@@ -197,7 +197,7 @@ func (c *InferenceClient) OCRDetect(ctx context.Context, cropped image.Image) ([
// OCRRecognize recognizes text in a cropped image region.
// DeepDoc /predict/ocr with operator=rec returns [[["text", confidence], ...]]
func (c *InferenceClient) OCRRecognize(ctx context.Context, cropped image.Image) ([]pdf.OCRText, error) {
func (c *Client) OCRRecognize(ctx context.Context, cropped image.Image) ([]pdf.OCRText, error) {
data, err := util.EncodeJPEG(cropped)
if err != nil {
return nil, fmt.Errorf("ocr rec: encode: %w", err)
@@ -224,7 +224,7 @@ func (c *InferenceClient) OCRRecognize(ctx context.Context, cropped image.Image)
// OCRRecognizeBatch recognizes text in multiple cropped image regions.
// Returns a slice of results and a parallel slice of errors (nil on success).
// A nil cropped image in the input produces nil results and a non-nil error.
func (c *InferenceClient) OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]pdf.OCRText, []error) {
func (c *Client) OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]pdf.OCRText, []error) {
results := make([][]pdf.OCRText, len(cropped))
errs := make([]error, len(cropped))
@@ -255,7 +255,7 @@ func (c *InferenceClient) OCRRecognizeBatch(ctx context.Context, cropped []image
}
// Health checks whether the DeepDoc service is reachable.
func (c *InferenceClient) Health() bool {
func (c *Client) Health() bool {
resp, err := c.httpClient.Get(c.baseURL + "/health")
if err != nil {
return false
@@ -264,7 +264,7 @@ func (c *InferenceClient) Health() bool {
return resp.StatusCode == 200
}
func (c *InferenceClient) post(ctx context.Context, endpoint string, imgData []byte, filename string, result interface{}, extraFields ...string) error {
func (c *Client) post(ctx context.Context, endpoint string, imgData []byte, filename string, result interface{}, extraFields ...string) error {
// Build multipart body once — the image data is idempotent.
var body bytes.Buffer
w := multipart.NewWriter(&body)

View File

@@ -11,11 +11,11 @@ import (
"testing"
)
// mustNewDeepDocClient wraps NewInferenceClient for test convenience.
// mustNewDeepDocClient wraps NewClient for test convenience.
// Fails the test if the URL is empty.
func mustNewDeepDocClient(t *testing.T, baseURL string) *InferenceClient {
func mustNewDeepDocClient(t *testing.T, baseURL string) *Client {
t.Helper()
client, err := NewInferenceClient(baseURL)
client, err := NewClient(baseURL)
if err != nil {
t.Fatalf("NewDeepDocClient(%q): %v", baseURL, err)
}

View File

@@ -1,13 +1,12 @@
//go:build cgo && integration
package parser
package pdf
import (
"context"
"strings"
"testing"
tbl "ragflow/internal/deepdoc/parser/pdf/table"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
)
@@ -15,13 +14,11 @@ import (
// through the OSS TableBuilder produces tables with the expected row/column structure.
func TestIntegration_DeepDoc_TableStructure(t *testing.T) {
client := mustConnectInferenceClient(t)
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
data := mustReadPDF(t, "06_table_content.pdf")
cfg := pdf.DefaultParserConfig()
cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client)
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -29,7 +26,7 @@ func TestIntegration_DeepDoc_TableStructure(t *testing.T) {
t.Skip("DLA did not detect any tables in fixture")
}
t.Logf("OssDeepDoc produced %d tables", len(result.Tables))
t.Logf("DeepDoc produced %d tables", len(result.Tables))
for i, tbl := range result.Tables {
t.Logf("table[%d]: %d rows", i, len(tbl.Rows))
for ri, row := range tbl.Rows {
@@ -51,13 +48,11 @@ func TestIntegration_DeepDoc_TableStructure(t *testing.T) {
// rows with the expected grid structure.
func TestIntegration_DeepDoc_TableRows(t *testing.T) {
client := mustConnectInferenceClient(t)
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
data := mustReadPDF(t, "06_table_content.pdf")
cfg := pdf.DefaultParserConfig()
cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client)
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -92,13 +87,11 @@ func TestIntegration_DeepDoc_Idempotency(t *testing.T) {
client := mustConnectInferenceClient(t)
parseOnce := func() *pdf.ParseResult {
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
data := mustReadPDF(t, "06_table_content.pdf")
cfg := pdf.DefaultParserConfig()
cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client)
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -124,13 +117,11 @@ func TestIntegration_DeepDoc_Idempotency(t *testing.T) {
// does not crash.
func TestIntegration_DeepDoc_EmptyPage(t *testing.T) {
client := mustConnectInferenceClient(t)
eng := mustOpenEngine(t, "01_english_simple.pdf")
defer eng.Close()
data := mustReadPDF(t, "01_english_simple.pdf")
cfg := pdf.DefaultParserConfig()
cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client)
p := NewParser(cfg, client)
_, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
_, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}

View File

@@ -1,4 +1,4 @@
package parser
package pdf
import (
"context"

View File

@@ -0,0 +1,41 @@
package pdf
import (
"image"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
)
// MockEngine is a minimal pdf.PDFEngine stub for unit/integration tests.
// All behaviour is driven by its exported fields; the zero value is usable.
type MockEngine struct {
	// Chars maps a page index to the characters ExtractChars returns for it.
	Chars map[int][]pdf.TextChar
	// NumPages is the value reported by PageCount; values <= 0 are reported as 1.
	NumPages int
	// RenderW and RenderH give the size of the blank image produced by
	// RenderPageImage; a value <= 0 falls back to 100.
	RenderW, RenderH int
}
// ExtractChars returns the characters stored in m.Chars for page pg. Pages
// without an entry yield a nil slice; the error is always nil.
func (m *MockEngine) ExtractChars(pg int) ([]pdf.TextChar, error) {
	pageChars := m.Chars[pg]
	return pageChars, nil
}
// RenderPage is not supported by the mock: it ignores both arguments and
// always returns nil bytes together with ErrNoPDFData.
func (m *MockEngine) RenderPage(pg int, dpi float64) ([]byte, error) {
	var noBytes []byte
	return noBytes, ErrNoPDFData
}
// RenderPageImage returns a freshly allocated blank RGBA image. Its size is
// RenderW x RenderH, with each dimension defaulting to 100 when the
// configured value is not positive. pg and dpi are ignored; the error is
// always nil.
func (m *MockEngine) RenderPageImage(pg int, dpi float64) (image.Image, error) {
	width, height := 100, 100
	if m.RenderW > 0 {
		width = m.RenderW
	}
	if m.RenderH > 0 {
		height = m.RenderH
	}
	bounds := image.Rect(0, 0, width, height)
	return image.NewRGBA(bounds), nil
}
// PageCount reports NumPages when it is positive and 1 otherwise, so a
// zero-value MockEngine behaves like a single-page document. The error is
// always nil.
func (m *MockEngine) PageCount() (int, error) {
	if m.NumPages > 0 {
		return m.NumPages, nil
	}
	return 1, nil
}
// RawData always returns nil: the mock carries no underlying PDF bytes.
func (m *MockEngine) RawData() []byte {
	return nil
}
// Close is a no-op for the mock and always returns nil.
func (m *MockEngine) Close() error {
	return nil
}
// Outlines reports no outline entries: it always returns a nil slice and a
// nil error. Tests that need outlines can embed MockEngine and override it.
func (m *MockEngine) Outlines() ([]pdf.Outline, error) {
	return nil, nil
}

View File

@@ -1,11 +1,13 @@
//go:build cgo && manual
package parser
package pdf
import (
"context"
"image/png"
"os"
inf "ragflow/internal/deepdoc/parser/pdf/inference"
util "ragflow/internal/deepdoc/parser/pdf/util"
"strings"
"testing"
)
@@ -19,7 +21,7 @@ func TestOCR_mergeChars_RealScanned(t *testing.T) {
if url == "" {
t.Skip("DEEPDOC_URL not set")
}
dd, err := inf.NewInferenceClient(url)
dd, err := inf.NewClient(url)
if err != nil {
t.Fatal(err)
}
@@ -41,7 +43,7 @@ func TestOCR_mergeChars_RealScanned(t *testing.T) {
if err != nil {
t.Fatal(err)
}
t.Logf("pdf_oxide chars: %d", len(chars))
t.Logf("pdf_oxide Chars: %d", len(chars))
var sample strings.Builder
for i, c := range chars {

View File

@@ -1,6 +1,6 @@
//go:build cgo
package parser
package pdf
import (
"context"

View File

@@ -1,4 +1,4 @@
package parser
package pdf
import (
"context"
@@ -10,10 +10,10 @@ import (
// ── outline-tracking mock engines ──────────────────────────────────────────
// outlineTrackingEngine wraps mockEngine and records whether Outlines()
// outlineTrackingEngine wraps MockEngine and records whether Outlines()
// was called.
type outlineTrackingEngine struct {
*mockEngine
*MockEngine
outlines []pdf.Outline
outlinesCalled bool
}
@@ -25,7 +25,7 @@ func (e *outlineTrackingEngine) Outlines() ([]pdf.Outline, error) {
// outlineErrorEngine returns an error from Outlines().
type outlineErrorEngine struct {
*mockEngine
*MockEngine
}
func (e *outlineErrorEngine) Outlines() ([]pdf.Outline, error) {
@@ -46,13 +46,13 @@ func TestParse_ExtractsOutlinesFromEngine(t *testing.T) {
{Title: "Section 1.1", Level: 1, PageNumber: 2},
}
eng := &outlineTrackingEngine{
mockEngine: &mockEngine{pageCount: 3},
MockEngine: &MockEngine{NumPages: 3},
outlines: expectedOutlines,
}
mockDLA := &MockDocAnalyzer{Healthy: true}
p := NewParser(pdf.DefaultParserConfig(), mockDLA)
p := NewParser(pdf.DefaultParserConfig())
result, err := p.Parse(context.Background(), eng)
result, err := p.ParseRaw(context.Background(), eng, mockDLA)
if err != nil {
t.Fatalf("Parse failed: %v", err)
}
@@ -79,18 +79,18 @@ func TestParse_ExtractsOutlinesFromEngine(t *testing.T) {
// and produces sections (outlines are best-effort).
func TestParse_OutlinesErrorDoesNotBlockParsing(t *testing.T) {
eng := &outlineErrorEngine{
mockEngine: &mockEngine{
pageCount: 2,
chars: map[int][]pdf.TextChar{
MockEngine: &MockEngine{
NumPages: 2,
Chars: map[int][]pdf.TextChar{
0: {{Text: "Hello world", X0: 100, X1: 200, Top: 100, Bottom: 120}},
1: {{Text: "Page two", X0: 100, X1: 200, Top: 100, Bottom: 120}},
},
},
}
mockDLA := &MockDocAnalyzer{Healthy: true}
p := NewParser(pdf.DefaultParserConfig(), mockDLA)
p := NewParser(pdf.DefaultParserConfig())
result, err := p.Parse(context.Background(), eng)
result, err := p.ParseRaw(context.Background(), eng, mockDLA)
if err != nil {
t.Fatalf("Parse should not fail when Outlines() errors: %v", err)
}

View File

@@ -1,6 +1,6 @@
//go:build cgo && manual
package parser
package pdf
import (
"context"
@@ -31,8 +31,8 @@ func TestParse_BatchEquivalence(t *testing.T) {
defer eng.Close()
cfg := pdf.DefaultParserConfig()
cfg.BatchSize = batchSize
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.ParseRaw(context.Background(), eng, mockDLA)
if err != nil {
t.Fatal(err)
}

View File

@@ -0,0 +1,22 @@
//go:build cgo
package pdf
import (
"context"
"fmt"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
)
// Parse runs the full PDF extraction pipeline from raw bytes.
//
// It opens an engine over data with NewEngine, hands that engine to ParseRaw
// together with docAnalyzer, and closes the engine before returning, so the
// caller never manages the engine lifecycle. A failure to open the engine is
// returned wrapped; otherwise the result and error of ParseRaw are returned
// unchanged.
func (p *Parser) Parse(ctx context.Context, data []byte, docAnalyzer pdf.DocAnalyzer) (*pdf.ParseResult, error) {
	eng, openErr := NewEngine(data)
	if openErr != nil {
		return nil, fmt.Errorf("pdfoxide.NewEngine: %w", openErr)
	}
	defer eng.Close()

	result, parseErr := p.ParseRaw(ctx, eng, docAnalyzer)
	return result, parseErr
}

View File

@@ -1,4 +1,4 @@
package parser
package pdf
import (
"context"
@@ -8,52 +8,36 @@ import (
"log/slog"
"sync"
inf "ragflow/internal/deepdoc/parser/pdf/inference"
lyt "ragflow/internal/deepdoc/parser/pdf/layout"
tbl "ragflow/internal/deepdoc/parser/pdf/table"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
util "ragflow/internal/deepdoc/parser/pdf/util"
)
// Parser is the main PDF text/layout extraction pipeline.
// Parser is the core PDF text/layout extraction pipeline.
// It corresponds to RAGFlowPdfParser in pdf_parser.py.
// Parser is stateless after construction — safe to reuse across documents.
// Stateless after construction — safe to reuse across documents.
type Parser struct {
Config pdf.ParserConfig
// DeepDoc is the required document layout / OCR / table recognition
// service. Set at construction time by NewParser.
DeepDoc pdf.DocAnalyzer
// SampleChars samples up to n chars from a page for English detection.
// Defaults to random sampling (matching Python's random.choices).
// Inject a deterministic sampler for reproducible tests.
SampleChars pdf.SampleFunc
// tableBuilder is the TSR model adapter. Set at construction time
//
// different implementation via Config.TableBuilder.
tableBuilder pdf.TableBuilder
}
// NewParser creates a new Parser with the required DeepDoc service.
func NewParser(cfg pdf.ParserConfig, doc pdf.DocAnalyzer) *Parser {
tb := cfg.TableBuilder
if tb == nil {
tb = NewTableBuilderFor(doc)
}
return &Parser{
Config: cfg,
DeepDoc: doc,
tableBuilder: tb,
}
// pageResult holds per-page output from extractPages. One value is recorded
// per page in the requested range, whether the page was handled inline or by
// an OCR goroutine.
type pageResult struct {
// pg is the page index this result belongs to.
pg int
// ocrBoxes are the text boxes produced for the page.
ocrBoxes []pdf.TextBox
// chars are the characters associated with the page.
chars []pdf.TextChar
// ocrUsed records whether OCR output was used for this page.
ocrUsed bool
// pageImg is the rendered page image, when the producer attached one.
pageImg image.Image
// err is set when processing the page failed (render error or cancelled context).
err error
}
// NewParser creates a new Parser with the given config.
//
// The returned Parser only stores cfg; the document analyzer is supplied per
// call to Parse / ParseRaw rather than at construction time.
func NewParser(cfg pdf.ParserConfig) *Parser {
	return &Parser{Config: cfg}
}
// ── TableBuilder factory ───────────────────────────────────────────────────
// tableBuilderFactory holds a model-specific TableBuilder factory registered
// by EE packages via RegisterTableBuilder. If nil, the default OSS
// implementation is used.
var tableBuilderFactory func(pdf.DocAnalyzer) pdf.TableBuilder
// RegisterTableBuilder registers a TableBuilder factory for the PDF parser.
@@ -62,30 +46,20 @@ func RegisterTableBuilder(factory func(pdf.DocAnalyzer) pdf.TableBuilder) {
tableBuilderFactory = factory
}
// NewTableBuilderFor creates the right TableBuilder, chosen by the registry.
// Checks the registry first for EE-registered implementations, falling back
// to the default OSS DeepDocTableBuilder. Label taxonomies are injected
// before construction.
func NewTableBuilderFor(doc pdf.DocAnalyzer) pdf.TableBuilder {
if tableBuilderFactory != nil {
return tableBuilderFactory(doc)
}
if c, ok := doc.(*inf.InferenceClient); ok {
c.DLALabels = inf.DefaultDLALabels()
c.TSRLabels = inf.DefaultTSRLabels()
}
return tbl.NewDeepDocTableBuilder(doc)
}
// Parse runs the full PDF extraction pipeline: chars → boxes →
// column assignment → text merge → vertical merge → sections.
//
// For documents larger than Config.BatchSize pages, processes in batches
// to bound memory usage (matching Python's batch_size=50).
//
// Returns a pdf.ParseResult containing sections, tables, page images, figures,
// and pipeline stage metrics. Parser itself remains stateless.
func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseResult, error) {
// ── Public API ─────────────────────────────────────────────────────────────
// ParseRaw is the internal entry point: runs the core pipeline on an
// already-opened engine. Exported for tests that inject mock engines.
func (p *Parser) ParseRaw(ctx context.Context, engine pdf.PDFEngine, docAnalyzer pdf.DocAnalyzer) (*pdf.ParseResult, error) {
tb := NewTableBuilderFor(docAnalyzer)
// Normalize page range
pageCount, err := engine.PageCount()
if err != nil {
@@ -103,11 +77,10 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
totalPages := toPage - fromPage + 1
batchSize := p.Config.BatchSize
if batchSize <= 0 {
batchSize = 50 // default, matching Python's batch_size
batchSize = 50
}
// ── Prescan: lightweight char extraction for language/noise detection ──
// No rendering, no OCR — just raw chars for global decisions.
// ── Prescan ──
prescanChars := make(map[int][]pdf.TextChar)
prescanMedianH := make(map[int]float64)
prescanMedianW := make(map[int]float64)
@@ -115,26 +88,27 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
chars, extractErr := engine.ExtractChars(pg)
if extractErr != nil {
slog.Warn("prescan: ExtractChars failed", "page", pg, "err", extractErr)
chars = nil // skip broken pages (matching old behavior)
chars = nil
}
prescanChars[pg] = chars
prescanMedianH[pg] = util.MedianCharHeight(chars)
prescanMedianW[pg] = util.MedianCharWidth(chars)
}
isEnglish := util.DetectEnglish(prescanChars, totalPages, p.SampleChars)
isEnglish := util.DetectEnglish(prescanChars, totalPages, nil)
scanNoise := util.IsScanNoise(util.FullTextFromChars(prescanChars))
// ── Extract PDF outlines/bookmarks (best-effort, non-fatal) ──
// ── Outlines ──
outlines, outlineErr := engine.Outlines()
if outlineErr != nil {
slog.Warn("Failed to extract PDF outlines; continuing without them", "err", outlineErr)
outlines = nil
}
// ── Small document: process all at once (no batching overhead) ──
// ── Small document ──
if totalPages <= batchSize {
result, err := p.processPages(ctx, engine, fromPage, toPage,
prescanChars, prescanMedianH, prescanMedianW, isEnglish, scanNoise)
prescanChars, prescanMedianH, prescanMedianW, isEnglish, scanNoise,
docAnalyzer, tb)
if err != nil {
return nil, err
}
@@ -142,7 +116,7 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
return result, nil
}
// ── Large document: process in batches to bound memory ──
// ── Large document: batched ──
slog.Info("batched processing", "pages", totalPages, "batchSize", batchSize)
result := &pdf.ParseResult{PageImages: make(map[int]image.Image)}
for start := fromPage; start <= toPage; start += batchSize {
@@ -151,7 +125,6 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
}
end := min(start+batchSize-1, toPage)
// Slice prescan data for this batch.
batchChars := make(map[int][]pdf.TextChar, end-start+1)
batchMH := make(map[int]float64, end-start+1)
batchMW := make(map[int]float64, end-start+1)
@@ -162,15 +135,14 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
}
batch, err := p.processPages(ctx, engine, start, end,
batchChars, batchMH, batchMW, isEnglish, scanNoise)
batchChars, batchMH, batchMW, isEnglish, scanNoise,
docAnalyzer, tb)
if err != nil {
return nil, err
}
// Merge batch results.
result.Sections = append(result.Sections, batch.Sections...)
result.Tables = append(result.Tables, batch.Tables...)
// Figures() is computed on demand from Sections.
for pg, img := range batch.PageImages {
result.PageImages[pg] = img
}
@@ -184,33 +156,22 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
return result, nil
}
// extractPages runs per-page OCR (detect + recognize) for the given page
// range, returning text boxes, char data, whether any page used OCR, and
// any errors encountered. Partial results are returned even when some
// pages fail — callers should inspect the error for diagnostics but may
// still use the returned boxes and chars.
// ── Internal pipeline steps ────────────────────────────────────────────────
func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
fromPage, toPage int,
prescanChars map[int][]pdf.TextChar,
medianHeights, medianWidths map[int]float64,
pageImages map[int]image.Image,
docAnalyzer pdf.DocAnalyzer,
) ([]pdf.TextBox, map[int][]pdf.TextChar, bool, error) {
var boxes []pdf.TextBox
pageChars := make(map[int][]pdf.TextChar)
ocrUsedAny := false
type pr struct {
pg int
ocrBoxes []pdf.TextBox
chars []pdf.TextChar
ocrUsed bool
pageImg image.Image
err error
}
pageCount := toPage - fromPage + 1
results := make([]pr, pageCount)
results := make([]pageResult, pageCount)
// Semaphore cap: 0 → sequential; >0 → bounded parallelism.
cap := p.Config.MaxOCRConcurrency
if cap <= 0 {
cap = 1
@@ -222,16 +183,15 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
pg := fromPage + i
chars := prescanChars[pg]
// Fast path: pages with embedded chars → sequential inline (no HTTP OCR).
if len(chars) > 0 && !util.IsGarbledPage(chars) {
pageImg, renderErr := renderPageToImage(engine, pg)
pageImg, renderErr := RenderPageToImage(engine, pg)
if renderErr == nil && pageImg != nil {
pageImages[pg] = pageImg
}
var ocrBoxes []pdf.TextBox
ocrUsed := false
if !p.Config.SkipOCR && renderErr == nil && pageImg != nil {
ocrBoxes = ocrMergeChars(ctx, pageImg, chars, p.DeepDoc, pg)
ocrBoxes = ocrMergeChars(ctx, pageImg, chars, docAnalyzer, pg)
if ocrBoxes == nil {
ocrBoxes = lyt.CharsToBoxes(chars, pg, p.Config.SortByTop)
} else {
@@ -241,30 +201,28 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
} else {
ocrBoxes = lyt.CharsToBoxes(chars, pg, p.Config.SortByTop)
}
results[i] = pr{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed}
results[i] = pageResult{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed}
continue
}
// OCR path: render + detect + recognize (potentially parallel).
wg.Add(1)
go func(i, pg int, chars []pdf.TextChar) {
defer wg.Done()
select {
case <-ctx.Done():
results[i] = pr{pg: pg, err: ctx.Err()}
results[i] = pageResult{pg: pg, err: ctx.Err()}
return
case sem <- struct{}{}:
}
defer func() { <-sem }()
pageImg, err := renderPageToImage(engine, pg)
pageImg, err := RenderPageToImage(engine, pg)
if err != nil {
results[i] = pr{pg: pg, err: err}
results[i] = pageResult{pg: pg, err: err}
return
}
// Check if context was cancelled during render.
if err := ctx.Err(); err != nil {
results[i] = pr{pg: pg, err: err}
results[i] = pageResult{pg: pg, err: err}
return
}
@@ -275,7 +233,7 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
if len(chars) > 0 {
label = "garbled page"
}
ocrBoxes = ocrDetectAndRecognize(ctx, pageImg, p.DeepDoc, pg, label)
ocrBoxes = ocrDetectAndRecognize(ctx, pageImg, docAnalyzer, pg, label)
if ocrBoxes != nil {
for j := range ocrBoxes {
for _, r := range ocrBoxes[j].Text {
@@ -286,9 +244,8 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
ocrUsed = true
}
}
// Merged OCR path for pages with both embedded and OCR chars.
if !ocrUsed && len(chars) > 0 && !p.Config.SkipOCR {
ocrBoxes = ocrMergeChars(ctx, pageImg, chars, p.DeepDoc, pg)
ocrBoxes = ocrMergeChars(ctx, pageImg, chars, docAnalyzer, pg)
if ocrBoxes != nil {
ocrUsed = true
}
@@ -298,15 +255,252 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
ocrBoxes = lyt.CharsToBoxes(chars, pg, p.Config.SortByTop)
}
}
results[i] = pr{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed, pageImg: pageImg}
results[i] = pageResult{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed, pageImg: pageImg}
}(i, pg, chars)
}
wg.Wait()
return mergePageResults(results, boxes, pageImages, pageChars, ocrUsedAny, medianHeights, medianWidths)
}
// Merge results in page order.
// retryScanNoise runs OCR detect+recognize over every page in
// [fromPage, toPage] and rebuilds the per-page state from the OCR output.
//
// Pages with no entry in pageImages are rendered and cached there first. For
// each page whose OCR yields boxes, pageChars, medianHeights and medianWidths
// are overwritten with values derived from those boxes. Pages that fail to
// render, or for which OCR returns nil, are logged and left untouched.
//
// It returns only the boxes collected during this retry, the updated
// pageChars map, and true. The incoming ocrUsedAny value is not consulted.
func (p *Parser) retryScanNoise(ctx context.Context, engine pdf.PDFEngine,
	fromPage, toPage int,
	pageImages map[int]image.Image,
	pageChars map[int][]pdf.TextChar,
	medianHeights, medianWidths map[int]float64,
	ocrUsedAny bool,
	docAnalyzer pdf.DocAnalyzer,
) ([]pdf.TextBox, map[int][]pdf.TextChar, bool) {
	slog.Warn("scan noise: OCR retry", "from", fromPage, "to", toPage)

	var retried []pdf.TextBox
	for pg := fromPage; pg <= toPage; pg++ {
		// Reuse the cached render when there is one; otherwise render and cache.
		pageImg := pageImages[pg]
		if pageImg == nil {
			rendered, renderErr := RenderPageToImage(engine, pg)
			if renderErr != nil {
				slog.Warn("scan noise: page render failed", "page", pg, "err", renderErr)
				continue
			}
			pageImg = rendered
			pageImages[pg] = rendered
		}

		pageBoxes := ocrDetectAndRecognize(ctx, pageImg, docAnalyzer, pg, "scan page")
		if pageBoxes == nil {
			slog.Warn("scan noise: page OCR empty", "page", pg)
			continue
		}
		retried = append(retried, pageBoxes...)

		// Build one synthetic char per OCR box from the first rune of its
		// text; boxes with empty text contribute nothing.
		var synth []pdf.TextChar
		for _, box := range pageBoxes {
			for _, first := range box.Text {
				synth = append(synth, pdf.TextChar{
					Text:       string(first),
					Top:        box.Top,
					Bottom:     box.Bottom,
					PageNumber: pg,
				})
				break
			}
		}
		pageChars[pg] = synth
		medianHeights[pg] = util.MedianCharHeight(synth)
		medianWidths[pg] = util.MedianCharWidth(synth)
	}

	slog.Debug("scan noise OCR retry complete", "pages", toPage-fromPage+1, "boxes", len(retried))
	return retried, pageChars, true
}
// retryZoom re-renders pages [fromPage, toPage] at Config.Zoom*pdf.DlaScale
// and re-runs OCR on the enlarged renders. OCR box coordinates are divided by
// the zoom ratio so they end up in Config.Zoom space again.
//
// pageImages[pg] is replaced for every page that renders successfully. When
// the retry DPI differs from pdf.DlaDPI the page is rendered a second time at
// pdf.DlaDPI and that render is stored instead, because downstream DLA/TSR
// assumes pdf.DlaDPI; if the second render fails the retry-resolution image
// is kept.
//
// The loop stops as soon as ctx is done, so a cancelled parse does not keep
// paying for high-resolution renders of the remaining pages.
//
// It returns boxes with the new OCR boxes appended, and ocrUsedAny set to true
// if at least one page produced OCR boxes (otherwise the value passed in).
func (p *Parser) retryZoom(ctx context.Context, engine pdf.PDFEngine,
	fromPage, toPage int,
	pageImages map[int]image.Image,
	boxes []pdf.TextBox, ocrUsedAny bool,
	docAnalyzer pdf.DocAnalyzer,
) ([]pdf.TextBox, bool) {
	retryZoomVal := p.Config.Zoom * pdf.DlaScale
	retryDPI := retryZoomVal * 72
	// Loop-invariant ratio between retry resolution and Config.Zoom.
	scaleFactor := retryZoomVal / p.Config.Zoom
	slog.Info("zoom retry: re-rendering", "oldZoom", p.Config.Zoom, "newZoom", retryZoomVal)

	for pg := fromPage; pg <= toPage; pg++ {
		// Rendering is the expensive step; bail out before it once the
		// caller has cancelled.
		if err := ctx.Err(); err != nil {
			slog.Warn("zoom retry: cancelled", "page", pg, "err", err)
			break
		}

		img, err := engine.RenderPageImage(pg, retryDPI)
		if err != nil {
			slog.Warn("zoom retry: render failed", "page", pg, "err", err)
			continue
		}

		// Prefer a standard-resolution render for the stored page image so
		// layout coordinates downstream are scaled correctly.
		stored := img
		if retryDPI != pdf.DlaDPI {
			if dlaImg, dlaErr := engine.RenderPageImage(pg, pdf.DlaDPI); dlaErr == nil {
				stored = dlaImg
			}
		}
		pageImages[pg] = stored

		// OCR always runs on the high-resolution render.
		ocrBoxes := ocrDetectAndRecognize(ctx, img, docAnalyzer, pg, "zoom retry")
		if ocrBoxes == nil {
			continue
		}
		for i := range ocrBoxes {
			ocrBoxes[i].X0 /= scaleFactor
			ocrBoxes[i].X1 /= scaleFactor
			ocrBoxes[i].Top /= scaleFactor
			ocrBoxes[i].Bottom /= scaleFactor
		}
		boxes = append(boxes, ocrBoxes...)
		ocrUsedAny = true
	}
	return boxes, ocrUsedAny
}
// buildLayout turns the extracted text boxes into result.Sections and fills
// result.Tables and result.Metrics along the way.
//
// Stages run in a fixed order: enrichWithDeepDoc, column assignment, text
// merge, page/Y sort, naive vertical merge, table extraction, figure
// consolidation, section building, caption merge. A box count is recorded in
// result.Metrics after the stages noted below.
//
// It returns ctx.Err() if ctx is done after enrichWithDeepDoc or after the
// vertical merge, and nil otherwise.
func (p *Parser) buildLayout(ctx context.Context,
result *pdf.ParseResult, engine pdf.PDFEngine,
boxes []pdf.TextBox, pageChars map[int][]pdf.TextChar,
medianHeights, medianWidths map[int]float64,
fromPage, toPage int, ocrUsedAny bool, isEnglish bool,
docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder,
) error {
result.Metrics.BoxesInitial = len(boxes)
result.Tables = p.enrichWithDeepDoc(ctx, result, engine, boxes, result.PageImages, docAnalyzer, tb)
result.Metrics.TablesCount = len(result.Tables)
// First cancellation checkpoint: stop before the merge stages.
if err := ctx.Err(); err != nil {
return err
}
boxes = lyt.AssignColumn(boxes, p.Config.Zoom)
boxes = lyt.TextMerge(boxes, medianHeights, p.Config.Zoom)
result.Metrics.BoxesTextMerge = len(boxes)
lyt.SortByPageThenY(boxes, p.Config.SortByTop)
// When OCR contributed text, the caller's isEnglish value is replaced by a
// fresh detection over pageChars.
if ocrUsedAny {
isEnglish = util.DetectEnglish(pageChars, toPage-fromPage+1, nil)
}
boxes = lyt.NaiveVerticalMerge(boxes, medianHeights, medianWidths, isEnglish)
result.Metrics.BoxesVertMerge = len(boxes)
// Second cancellation checkpoint: stop before table/figure handling.
if err := ctx.Err(); err != nil {
return err
}
boxes = tbl.ExtractTableAndReplace(boxes, result.Tables)
boxes = tbl.ConsolidateFigures(boxes)
// Page heights are the rendered image heights in pixels divided by Config.Zoom.
pageHeights := make(map[int]float64, len(result.PageImages))
for pg, img := range result.PageImages {
pageHeights[pg] = float64(img.Bounds().Dy()) / p.Config.Zoom
}
result.Sections = lyt.BoxesToSections(boxes, pageHeights)
// BoxesFinal is the section count before captions are merged.
result.Metrics.BoxesFinal = len(result.Sections)
result.Sections = tbl.MergeCaptions(result.Sections, result.Figures())
return nil
}
// processPages runs the full pipeline on pages [fromPage, toPage]:
// extraction, optional retries, layout building and section image cropping.
//
// prescanChars, medianHeights, medianWidths, isEnglish and isScanNoiseDoc are
// handed through to the stages below. An error from extractPages is only
// logged; processing continues with whatever boxes were produced.
//
// If no boxes remain after the retries, the result is returned as-is with a
// nil error. A buildLayout failure is wrapped and returned with a nil result.
func (p *Parser) processPages(ctx context.Context, engine pdf.PDFEngine,
fromPage, toPage int,
prescanChars map[int][]pdf.TextChar,
medianHeights, medianWidths map[int]float64,
isEnglish, isScanNoiseDoc bool,
docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder,
) (*pdf.ParseResult, error) {
result := &pdf.ParseResult{PageImages: make(map[int]image.Image)}
// 1. Per-page extraction; page images are collected into result.PageImages.
boxes, pageChars, ocrUsedAny, ocrErr := p.extractPages(ctx, engine,
fromPage, toPage, prescanChars,
medianHeights, medianWidths, result.PageImages, docAnalyzer)
if ocrErr != nil {
slog.Warn("extractPages: some pages failed OCR", "err", ocrErr)
}
// 2. Scan noise retry: re-OCR all pages; this replaces boxes from step 1.
if isScanNoiseDoc {
boxes, pageChars, ocrUsedAny = p.retryScanNoise(ctx, engine,
fromPage, toPage, result.PageImages,
pageChars, medianHeights, medianWidths, ocrUsedAny, docAnalyzer)
}
// 3. Zoom retry: only when there are still no boxes, Zoom is below 9 and
// OCR is not disabled.
if len(boxes) == 0 && p.Config.Zoom < 9 && !p.Config.SkipOCR {
boxes, ocrUsedAny = p.retryZoom(ctx, engine, fromPage, toPage,
result.PageImages, boxes, ocrUsedAny, docAnalyzer)
}
// Nothing to lay out: return the result without sections.
if len(boxes) == 0 {
return result, nil
}
// 4. Layout pipeline: fills result.Tables, result.Sections and result.Metrics.
if err := p.buildLayout(ctx, result, engine, boxes, pageChars,
medianHeights, medianWidths, fromPage, toPage, ocrUsedAny, isEnglish,
docAnalyzer, tb); err != nil {
return nil, fmt.Errorf("buildLayout: %w", err)
}
// 5. Attach an image to every section.
p.fillSectionImages(result)
return result, nil
}
// fillSectionImages sets Image on every entry of result.Sections.
//
// Table sections first try the pre-rendered image of the matching table
// (looked up through matchTableImage). Figure sections that have positions
// first try util.CropSectionByDLA. Anything still without an image falls back
// to util.CropSectionImage on the section's position tag; an empty crop for a
// section that has text is logged. Nothing happens when result.PageImages is
// empty.
func (p *Parser) fillSectionImages(result *pdf.ParseResult) {
	if len(result.PageImages) == 0 {
		return
	}

	// Index pre-rendered table images by "page_left_right_top_bottom".
	// Tables without an image are left out of the index.
	imgByRegion := make(map[string]string, len(result.Tables))
	for _, table := range result.Tables {
		if table.ImageB64 == "" {
			continue
		}
		page := 0
		if pos := table.Positions; len(pos) > 0 && len(pos[0].PageNumbers) > 0 {
			page = pos[0].PageNumbers[0]
		}
		regionKey := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f",
			page, table.RegionLeft, table.RegionRight, table.RegionTop, table.RegionBottom)
		imgByRegion[regionKey] = table.ImageB64
	}

	for i := range result.Sections {
		sec := &result.Sections[i]

		// Layout-specific sources take priority; a miss falls through to the
		// generic crop below.
		switch sec.LayoutType {
		case pdf.LayoutTypeTable:
			if tableImg, found := matchTableImage(sec, imgByRegion); found {
				sec.Image = tableImg
				continue
			}
		case pdf.LayoutTypeFigure:
			if len(sec.Positions) > 0 {
				if dlaImg := util.CropSectionByDLA(*sec, result.DLADebug, result.PageImages); dlaImg != "" {
					sec.Image = dlaImg
					continue
				}
			}
		}

		cropped := util.CropSectionImage(sec.PositionTag, result.PageImages, p.Config.Zoom)
		sec.Image = cropped
		if cropped == "" && sec.Text != "" {
			posTag := sec.PositionTag
			slog.Warn("cropSectionImage empty for non-empty section",
				"section", i, "posTag", posTag[:min(80, len(posTag))])
		}
	}
}
// matchTableImage returns the pre-rendered table image registered for sec in
// tableImgByRegion, keyed by "page_left_right_top_bottom".
//
// When sec has positions, only the first position is used to build the key,
// and a miss is final. The TableItem region is consulted only when sec has no
// positions at all. The page defaults to 0 when no page number is available.
func matchTableImage(sec *pdf.Section, tableImgByRegion map[string]string) (string, bool) {
	const keyFormat = "%d_%.1f_%.1f_%.1f_%.1f"

	if len(sec.Positions) > 0 {
		first := sec.Positions[0]
		page := 0
		if len(first.PageNumbers) > 0 {
			page = first.PageNumbers[0]
		}
		img, found := tableImgByRegion[fmt.Sprintf(keyFormat, page, first.Left, first.Right, first.Top, first.Bottom)]
		return img, found
	}

	item := sec.TableItem
	if item == nil {
		return "", false
	}
	page := 0
	if len(item.Positions) > 0 && len(item.Positions[0].PageNumbers) > 0 {
		page = item.Positions[0].PageNumbers[0]
	}
	img, found := tableImgByRegion[fmt.Sprintf(keyFormat, page,
		item.RegionLeft, item.RegionRight, item.RegionTop, item.RegionBottom)]
	return img, found
}
// mergePageResults collects per-page OCR results into the final output.
func mergePageResults(results []pageResult, boxes []pdf.TextBox, pageImages map[int]image.Image,
pageChars map[int][]pdf.TextChar, ocrUsedAny bool,
medianHeights, medianWidths map[int]float64,
) ([]pdf.TextBox, map[int][]pdf.TextChar, bool, error) {
var errs []error
for i := 0; i < pageCount; i++ {
r := results[i]
for _, r := range results {
if r.err != nil {
slog.Warn("page OCR failed", "page", r.pg, "err", r.err)
errs = append(errs, fmt.Errorf("page %d: %w", r.pg, r.err))
@@ -329,233 +523,3 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
}
return boxes, pageChars, ocrUsedAny, errors.Join(errs...)
}
// retryScanNoise re-runs OCR on all pages when prescan detects scan noise,
// overwriting page-level state with fresh detect+recognize results.
func (p *Parser) retryScanNoise(ctx context.Context, engine pdf.PDFEngine,
fromPage, toPage int,
pageImages map[int]image.Image,
pageChars map[int][]pdf.TextChar,
medianHeights, medianWidths map[int]float64,
ocrUsedAny bool,
) ([]pdf.TextBox, map[int][]pdf.TextChar, bool) {
slog.Warn("scan noise: OCR retry", "from", fromPage, "to", toPage)
var boxes []pdf.TextBox
for pg := fromPage; pg <= toPage; pg++ {
img := pageImages[pg]
if img == nil {
var err error
img, err = renderPageToImage(engine, pg)
if err != nil {
slog.Warn("scan noise: page render failed", "page", pg, "err", err)
continue
}
pageImages[pg] = img
}
ocrBoxes := ocrDetectAndRecognize(ctx, img, p.DeepDoc, pg, "scan page")
if ocrBoxes == nil {
slog.Warn("scan noise: page OCR empty", "page", pg)
continue
}
boxes = append(boxes, ocrBoxes...)
var chars []pdf.TextChar
for _, b := range ocrBoxes {
for _, r := range b.Text {
chars = append(chars, pdf.TextChar{Text: string(r), Top: b.Top, Bottom: b.Bottom, PageNumber: pg})
break
}
}
pageChars[pg] = chars
medianHeights[pg] = util.MedianCharHeight(chars)
medianWidths[pg] = util.MedianCharWidth(chars)
}
slog.Debug("scan noise OCR retry complete", "pages", toPage-fromPage+1, "boxes", len(boxes))
return boxes, pageChars, true
}
// retryZoom re-renders pages at higher resolution and re-runs OCR when the
// initial extraction produced zero boxes. Box coordinates are scaled back
// to Config.Zoom space. Matches Python's __images__ retry.
func (p *Parser) retryZoom(ctx context.Context, engine pdf.PDFEngine,
fromPage, toPage int,
pageImages map[int]image.Image,
boxes []pdf.TextBox, ocrUsedAny bool,
) ([]pdf.TextBox, bool) {
retryZoom := p.Config.Zoom * pdf.DlaScale
retryDPI := retryZoom * 72
slog.Info("zoom retry: re-rendering", "oldZoom", p.Config.Zoom, "newZoom", retryZoom)
for pg := fromPage; pg <= toPage; pg++ {
img, err := engine.RenderPageImage(pg, retryDPI)
if err != nil {
slog.Warn("zoom retry: render failed", "page", pg, "err", err)
continue
}
pageImages[pg] = img
// Downstream DLA/TSR assumes pdf.DlaDPI. Re-render at standard
// resolution so layout coordinates are scaled correctly.
if retryDPI != pdf.DlaDPI {
if dlaImg, dlaErr := engine.RenderPageImage(pg, pdf.DlaDPI); dlaErr == nil {
pageImages[pg] = dlaImg
}
}
ocrBoxes := ocrDetectAndRecognize(ctx, img, p.DeepDoc, pg, "zoom retry")
if ocrBoxes == nil {
continue
}
scaleFactor := retryZoom / p.Config.Zoom
for i := range ocrBoxes {
ocrBoxes[i].X0 /= scaleFactor
ocrBoxes[i].X1 /= scaleFactor
ocrBoxes[i].Top /= scaleFactor
ocrBoxes[i].Bottom /= scaleFactor
}
boxes = append(boxes, ocrBoxes...)
ocrUsedAny = true
}
return boxes, ocrUsedAny
}
// buildLayout runs the DLA → TSR → Column → TextMerge → VM → pdf.Section
// pipeline and populates result.Metrics, result.Tables, and
// result.Sections. Matches Python's _parse_loaded_window_into_bboxes
// order.
func (p *Parser) buildLayout(ctx context.Context,
result *pdf.ParseResult, engine pdf.PDFEngine,
boxes []pdf.TextBox, pageChars map[int][]pdf.TextChar,
medianHeights, medianWidths map[int]float64,
fromPage, toPage int, ocrUsedAny bool, isEnglish bool,
) error {
result.Metrics.BoxesInitial = len(boxes)
result.Tables = p.enrichWithDeepDoc(ctx, result, engine, boxes, result.PageImages)
result.Metrics.TablesCount = len(result.Tables)
if err := ctx.Err(); err != nil {
return err
}
boxes = lyt.AssignColumn(boxes, p.Config.Zoom)
boxes = lyt.TextMerge(boxes, medianHeights, p.Config.Zoom)
result.Metrics.BoxesTextMerge = len(boxes)
lyt.SortByPageThenY(boxes, p.Config.SortByTop)
if ocrUsedAny {
isEnglish = util.DetectEnglish(pageChars, toPage-fromPage+1, p.SampleChars)
}
boxes = lyt.NaiveVerticalMerge(boxes, medianHeights, medianWidths, isEnglish)
result.Metrics.BoxesVertMerge = len(boxes)
if err := ctx.Err(); err != nil {
return err
}
boxes = tbl.ExtractTableAndReplace(boxes, result.Tables)
boxes = tbl.ConsolidateFigures(boxes)
pageHeights := make(map[int]float64, len(result.PageImages))
for pg, img := range result.PageImages {
pageHeights[pg] = float64(img.Bounds().Dy()) / p.Config.Zoom
}
result.Sections = lyt.BoxesToSections(boxes, pageHeights)
result.Metrics.BoxesFinal = len(result.Sections)
result.Sections = tbl.MergeCaptions(result.Sections, result.Figures())
return nil
}
// processPages runs the full pipeline on pages [fromPage, toPage].
// prescanChars provides pre-extracted chars (avoids double extraction).
func (p *Parser) processPages(ctx context.Context, engine pdf.PDFEngine,
fromPage, toPage int,
prescanChars map[int][]pdf.TextChar,
medianHeights, medianWidths map[int]float64,
isEnglish, isScanNoiseDoc bool,
) (*pdf.ParseResult, error) {
result := &pdf.ParseResult{PageImages: make(map[int]image.Image)}
// 1. OCR extraction — per-page detect + recognize + char merge.
boxes, pageChars, ocrUsedAny, ocrErr := p.extractPages(ctx, engine,
fromPage, toPage, prescanChars,
medianHeights, medianWidths, result.PageImages)
if ocrErr != nil {
slog.Warn("extractPages: some pages failed OCR", "err", ocrErr)
}
// 2. Scan noise retry — re-OCR all pages when prescan detects scan noise.
if isScanNoiseDoc {
boxes, pageChars, ocrUsedAny = p.retryScanNoise(ctx, engine,
fromPage, toPage, result.PageImages,
pageChars, medianHeights, medianWidths, ocrUsedAny)
}
// 3. Zoom retry — re-render at higher resolution if OCR produced zero boxes.
if len(boxes) == 0 && p.Config.Zoom < 9 && !p.Config.SkipOCR {
boxes, ocrUsedAny = p.retryZoom(ctx, engine, fromPage, toPage,
result.PageImages, boxes, ocrUsedAny)
}
if len(boxes) == 0 {
return result, nil
}
// 4. Layout pipeline — DLA → TSR → Column → TextMerge → VM → Sections.
if err := p.buildLayout(ctx, result, engine, boxes, pageChars,
medianHeights, medianWidths, fromPage, toPage, ocrUsedAny, isEnglish); err != nil {
return nil, fmt.Errorf("buildLayout: %w", err)
}
// 5. Crop section images from page renders.
p.fillSectionImages(result)
return result, nil
}
// fillSectionImages populates result.Sections[i].Image with cropped
// page images. Table sections are matched to their TableItem image;
// figure sections try DLA-aware cropping first, then fall back to
// position-tag-based cropping.
func (p *Parser) fillSectionImages(result *pdf.ParseResult) {
if len(result.PageImages) == 0 {
return
}
// Build lookup: DLA region -> table image (base64).
tableImgByRegion := make(map[string]string, len(result.Tables))
for _, tbl := range result.Tables {
if tbl.ImageB64 == "" {
continue
}
pg := 0
if len(tbl.Positions) > 0 && len(tbl.Positions[0].PageNumbers) > 0 {
pg = tbl.Positions[0].PageNumbers[0]
}
key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f",
pg, tbl.RegionLeft, tbl.RegionRight, tbl.RegionTop, tbl.RegionBottom)
tableImgByRegion[key] = tbl.ImageB64
}
for i := range result.Sections {
if result.Sections[i].LayoutType == pdf.LayoutTypeTable && len(result.Sections[i].Positions) > 0 {
pos := result.Sections[i].Positions[0]
pg := 0
if len(pos.PageNumbers) > 0 {
pg = pos.PageNumbers[0]
}
key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f",
pg, pos.Left, pos.Right, pos.Top, pos.Bottom)
if img, ok := tableImgByRegion[key]; ok {
result.Sections[i].Image = img
continue
}
}
// Try DLA-aware cropping for figure sections (matching Python's
// cropout which uses DLA region boundaries instead of text boxes).
if result.Sections[i].LayoutType == pdf.LayoutTypeFigure && len(result.Sections[i].Positions) > 0 {
if dlaImg := util.CropSectionByDLA(result.Sections[i], result.DLADebug, result.PageImages); dlaImg != "" {
result.Sections[i].Image = dlaImg
continue
}
}
img := util.CropSectionImage(result.Sections[i].PositionTag, result.PageImages, p.Config.Zoom)
result.Sections[i].Image = img
if img == "" && result.Sections[i].Text != "" {
tag := result.Sections[i].PositionTag
slog.Warn("cropSectionImage empty for non-empty section",
"section", i, "posTag", tag[:min(80, len(tag))])
}
}
}

View File

@@ -1,4 +1,4 @@
package parser
package pdf
import (
"context"
@@ -53,10 +53,11 @@ func TestEnrichWithDeepDoc_Noop(t *testing.T) {
boxes := []pdf.TextBox{
{PageNumber: 0, X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "text"},
}
eng := &mockEngine{pageCount: 1}
eng := &MockEngine{NumPages: 1}
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: false})
tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, nil)
p := NewParser(pdf.DefaultParserConfig())
mock := &MockDocAnalyzer{Healthy: false}
tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, nil, mock, NewTableBuilderFor(mock))
if len(tables) != 0 {
t.Error("unhealthy DeepDoc → 0 Tables")
}
@@ -83,10 +84,10 @@ func TestExtractTableBoxes_Mock(t *testing.T) {
{X0: 600, Y0: 410, X1: 1240, Y1: 800, Text: "B2"},
},
}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
dummyImg := image.NewRGBA(image.Rect(0, 0, 2000, 3000))
tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummyImg, 0, 0)
tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummyImg, 0, 0, mock, NewTableBuilderFor(mock))
if len(tables) != 1 {
t.Fatalf("expected 1 pdf.TableItem, got %d", len(tables))
}
@@ -105,9 +106,9 @@ func TestExtractTableBoxes_Mock(t *testing.T) {
func TestExtractTableBoxes_NoTables(t *testing.T) {
mock := &MockDocAnalyzer{Healthy: true, DLARegions: []pdf.DLARegion{}}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
dummy := image.NewRGBA(image.Rect(0, 0, 1000, 1000))
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0)
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock))
if len(tables) != 0 {
t.Errorf("0 tables expected, got %d", len(tables))
}
@@ -121,9 +122,9 @@ func TestExtractTableBoxes_NonTableRegions(t *testing.T) {
{X0: 150, Y0: 600, X1: 1650, Y1: 900, Label: "figure", Confidence: 0.8},
},
}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
dummy := image.NewRGBA(image.Rect(0, 0, 2000, 2000))
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0)
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock))
if len(tables) != 0 {
t.Errorf("non-table regions → 0 tables, got %d", len(tables))
}
@@ -139,9 +140,9 @@ func TestExtractTableBoxes_NoOverlap(t *testing.T) {
{X0: 150, Y0: 1500, X1: 1500, Y1: 2300, Label: "table", Confidence: 0.95},
},
}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
dummy := image.NewRGBA(image.Rect(0, 0, 2000, 3000))
tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0)
tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0, mock, NewTableBuilderFor(mock))
if len(tables) != 0 {
t.Errorf("no overlap → 0 tables, got %d", len(tables))
}
@@ -158,9 +159,9 @@ func TestExtractTableBoxes_TSRError(t *testing.T) {
},
TSRCells: nil, // TSR returns nothing
}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
dummy := image.NewRGBA(image.Rect(0, 0, 2000, 3000))
tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0)
tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0, mock, NewTableBuilderFor(mock))
if len(tables) != 1 {
t.Fatalf("TSR failure: expected 1 pdf.TableItem with image+positions, got %d", len(tables))
}
@@ -180,9 +181,9 @@ func TestExtractTableBoxes_DLAError(t *testing.T) {
mock := &MockDocAnalyzer{Healthy: true, DLARegions: []pdf.DLARegion{
{X0: 0, Y0: 0, X1: 100, Y1: 100, Label: "text", Confidence: 0.9},
}}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
dummy := image.NewRGBA(image.Rect(0, 0, 1000, 1000))
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0)
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock))
if len(tables) != 0 {
t.Errorf("non-table DLA → 0 tables, got %d", len(tables))
}
@@ -238,9 +239,9 @@ func TestExtractTableBoxes_InvalidRegion(t *testing.T) {
{X0: 500, Y0: 100, X1: 100, Y1: 300, Label: "table", Confidence: 0.9},
},
}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
dummy := image.NewRGBA(image.Rect(0, 0, 1000, 1000))
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0)
tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock))
if len(tables) != 0 {
t.Errorf("invalid DLA region should be skipped, got %d tables", len(tables))
}
@@ -252,16 +253,16 @@ func TestParse_CollectsFigures(t *testing.T) {
// End-to-end: Parse() with mock DeepDoc that labels a box as "figure".
// Verify p.Figures is populated.
eng := &mockEngine{pageCount: 1, chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "chart image"}}}}
eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "chart image"}}}}
mock := &MockDocAnalyzer{
Healthy: true,
DLARegions: []pdf.DLARegion{
{X0: 50, Y0: 200, X1: 2000, Y1: 1000, Label: "figure", Confidence: 0.85},
},
}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
result, err := p.Parse(context.Background(), eng)
result, err := p.ParseRaw(context.Background(), eng, mock)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -282,15 +283,15 @@ func TestParse_CollectsFigures(t *testing.T) {
func TestParse_NoFigures(t *testing.T) {
// Parse() with no DLA figure regions → p.Figures should be empty.
eng := &mockEngine{pageCount: 1, chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "just text"}}}}
eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "just text"}}}}
mock := &MockDocAnalyzer{
DLARegions: []pdf.DLARegion{
{X0: 150, Y0: 300, X1: 1500, Y1: 600, Label: "text", Confidence: 0.8},
},
}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
result, err := p.Parse(context.Background(), eng)
result, err := p.ParseRaw(context.Background(), eng, mock)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -302,10 +303,11 @@ func TestParse_NoFigures(t *testing.T) {
func TestParse_NoDeepDoc_NoFigures(t *testing.T) {
// Parse() with mock DeepDoc → Figures should be empty (no DLA-detected figures).
eng := &mockEngine{pageCount: 1, chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "text"}}}}
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "text"}}}}
mock := &MockDocAnalyzer{Healthy: true}
p := NewParser(pdf.DefaultParserConfig())
result, err := p.Parse(context.Background(), eng)
result, err := p.ParseRaw(context.Background(), eng, mock)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -319,9 +321,9 @@ func TestParse_NoDeepDoc_NoFigures(t *testing.T) {
func TestParse_UsesOCRDetectForEmbeddedChars(t *testing.T) {
// When DeepDoc is available and the page has embedded chars,
// Parse should use ocrMergeChars (detect → merge → recognize).
eng := &mockEngine{
pageCount: 1,
chars: map[int][]pdf.TextChar{0: {
eng := &MockEngine{
NumPages: 1,
Chars: map[int][]pdf.TextChar{0: {
{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0},
}},
}
@@ -331,9 +333,9 @@ func TestParse_UsesOCRDetectForEmbeddedChars(t *testing.T) {
{X0: 5, Y0: 5, X1: 50, Y1: 5, X2: 50, Y2: 50, X3: 5, Y3: 50},
},
}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
result, err := p.Parse(context.Background(), eng)
result, err := p.ParseRaw(context.Background(), eng, mock)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -349,15 +351,16 @@ func TestParse_UsesOCRDetectForEmbeddedChars(t *testing.T) {
func TestParse_FallsBackToCharsToBoxes_NoDeepDoc(t *testing.T) {
// Without DeepDoc, Parse should use charsToBoxes (unchanged behavior).
eng := &mockEngine{
pageCount: 1,
chars: map[int][]pdf.TextChar{0: {
eng := &MockEngine{
NumPages: 1,
Chars: map[int][]pdf.TextChar{0: {
{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0},
}},
}
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
mock := &MockDocAnalyzer{Healthy: true}
p := NewParser(pdf.DefaultParserConfig())
result, err := p.Parse(context.Background(), eng)
result, err := p.ParseRaw(context.Background(), eng, mock)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -368,9 +371,9 @@ func TestParse_FallsBackToCharsToBoxes_NoDeepDoc(t *testing.T) {
func TestParse_FallsBackToCharsToBoxes_EmptyOCRBoxes(t *testing.T) {
// OCRDetect returns no boxes → falls through to charsToBoxes.
eng := &mockEngine{
pageCount: 1,
chars: map[int][]pdf.TextChar{0: {
eng := &MockEngine{
NumPages: 1,
Chars: map[int][]pdf.TextChar{0: {
{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0},
}},
}
@@ -378,9 +381,9 @@ func TestParse_FallsBackToCharsToBoxes_EmptyOCRBoxes(t *testing.T) {
Healthy: true,
OCRBoxes: []pdf.OCRBox{}, // empty detect
}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
result, err := p.Parse(context.Background(), eng)
result, err := p.ParseRaw(context.Background(), eng, mock)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -392,18 +395,19 @@ func TestParse_FallsBackToCharsToBoxes_EmptyOCRBoxes(t *testing.T) {
// ── Error path coverage ────────────────────────────────────────────────
func TestMockDocAnalyzer_DLAError_DoesNotCrash(t *testing.T) {
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{
mock := &MockDocAnalyzer{
Healthy: true,
DLAErr: fmt.Errorf("DLA service unavailable"),
})
eng := &mockEngine{pageCount: 1}
}
p := NewParser(pdf.DefaultParserConfig())
eng := &MockEngine{NumPages: 1}
img := image.NewRGBA(image.Rect(0, 0, 100, 100))
pageImages := map[int]image.Image{0: img}
boxes := []pdf.TextBox{
{PageNumber: 0, X0: 0, X1: 100, Top: 0, Bottom: 50, Text: "text"},
}
// enrichWithDeepDoc should return nil (not panic) on DLA error.
tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages)
tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages, mock, NewTableBuilderFor(mock))
if len(tables) != 0 {
t.Errorf("DLA error should produce 0 tables, got %d", len(tables))
}
@@ -412,20 +416,21 @@ func TestMockDocAnalyzer_DLAError_DoesNotCrash(t *testing.T) {
func TestMockDocAnalyzer_TSRError_DoesNotCrash(t *testing.T) {
// TSR error: DLA succeeds, TSR fails. The table region is detected
// but no cells are returned — the table is skipped gracefully.
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{
mock := &MockDocAnalyzer{
Healthy: true,
DLARegions: []pdf.DLARegion{
{X0: 0, Y0: 0, X1: 400, Y1: 400, Label: "table", Confidence: 0.95},
},
TSRErr: fmt.Errorf("TSR model timeout"),
})
eng := &mockEngine{pageCount: 1}
}
p := NewParser(pdf.DefaultParserConfig())
eng := &MockEngine{NumPages: 1}
img := image.NewRGBA(image.Rect(0, 0, 100, 100))
pageImages := map[int]image.Image{0: img}
boxes := []pdf.TextBox{
{PageNumber: 0, X0: 10, X1: 90, Top: 10, Bottom: 90, Text: "in table region"},
}
tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages)
tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages, mock, NewTableBuilderFor(mock))
// DLA detects the table region → 1 pdf.TableItem is created. TSR failure
// means it has no cells, but the pipeline must not panic.
if len(tables) != 1 {
@@ -440,12 +445,12 @@ func TestMockDocAnalyzer_OCRDetectError_DoesNotCrash(t *testing.T) {
// OCRDetect failure path: extractPages uses ocrDetectAndRecognize which
// calls doc.OCRDetect. When it fails, the page is skipped gracefully.
mock := &MockDocAnalyzer{Healthy: true, OCRDetectErr: fmt.Errorf("OCR model OOM")}
eng := &mockEngine{
pageCount: 1,
chars: map[int][]pdf.TextChar{}, // empty → triggers OCR path
eng := &MockEngine{
NumPages: 1,
Chars: map[int][]pdf.TextChar{}, // empty → triggers OCR path
}
p := NewParser(pdf.DefaultParserConfig(), mock)
_, err := p.Parse(context.Background(), eng)
p := NewParser(pdf.DefaultParserConfig())
_, err := p.ParseRaw(context.Background(), eng, mock)
if err != nil {
t.Fatalf("Parse returned error: %v", err)
}

View File

@@ -1,4 +1,4 @@
package parser
package pdf
import (
"context"
@@ -54,12 +54,17 @@ func ocrDetectAndRecognize(ctx context.Context, pageImg image.Image, doc pdf.Doc
// merges the chars into detect regions, and OCRs any regions without chars.
// Matches Python's __ocr: detect → match chars to boxes → use char text
// for boxes with embedded chars → OCR recognize only empty/garbled boxes.
type ocrDetectBox struct { // one OCR detect region, as built by ocrMergeChars
box pdf.TextBox // text box for the region; its Text is filled in later
x0, y0, x1, y1 float64 // PDF-space bounds
}
func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextChar, doc pdf.DocAnalyzer, pageNum int) []pdf.TextBox {
detectBoxes, err := doc.OCRDetect(ctx, pageImg)
if err != nil || len(detectBoxes) == 0 {
ocrDetectBoxes, err := doc.OCRDetect(ctx, pageImg)
if err != nil || len(ocrDetectBoxes) == 0 {
return nil
}
slog.Debug("ocrMergeChars detect", "page", pageNum, "boxes", len(detectBoxes))
slog.Debug("ocrMergeChars detect", "page", pageNum, "boxes", len(ocrDetectBoxes))
// Detect boxes are in pixel space (216 DPI). Scale to PDF space (72 DPI)
// so coordinates match embedded chars.
@@ -69,12 +74,8 @@ func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextCha
imgH := float64(imgBounds.Dy()) / scale
// Step 1: match embedded chars to detect boxes (Python __ocr char matching).
type detectBox struct {
box pdf.TextBox
x0, y0, x1, y1 float64 // PDF-space bounds
}
boxes := make([]detectBox, 0, len(detectBoxes))
for _, b := range detectBoxes {
boxes := make([]ocrDetectBox, 0, len(ocrDetectBoxes))
for _, b := range ocrDetectBoxes {
x0 := min(b.X0, b.X1, b.X2, b.X3) / scale
y0 := min(b.Y0, b.Y1, b.Y2, b.Y3) / scale
x1 := max(b.X0, b.X1, b.X2, b.X3) / scale
@@ -94,7 +95,7 @@ func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextCha
if x0 >= x1 || y0 >= y1 {
continue
}
boxes = append(boxes, detectBox{box: pdf.TextBox{
boxes = append(boxes, ocrDetectBox{box: pdf.TextBox{
X0: x0, X1: x1, Top: y0, Bottom: y1, PageNumber: pageNum,
}, x0: x0, y0: y0, x1: x1, y1: y1})
}
@@ -145,82 +146,7 @@ func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextCha
boxChars[bestIdx] = append(boxChars[bestIdx], c)
}
// Step 3: assemble text for each box.
var result []pdf.TextBox
var needOCR []int
for i := range boxes {
tb := boxes[i].box
tb.Text = ""
if len(boxChars[i]) > 0 {
// Sort chars by reading order, matching Python's sort_Y_firstly.
// Fuzzy Y-group: chars within median char height are "same line",
// sorted by X; different lines sorted by Y.
sortCharsYFirstly(boxChars[i], util.MedianCharHeight(boxChars[i]))
// Use lineToTextBox for correct space insertion + garbled detection.
// lineToTextBox inserts ASCII word spaces at visible gaps —
// matching Python's __img_ocr + __ocr char logic.
lineBox := lyt.LineToTextBox(boxChars[i])
tb.Text = lineBox.Text
// Strategy 1: If majority of chars are garbled (PUA), clear text → OCR.
var garbledCnt, totalCnt int
for _, c := range boxChars[i] {
for _, r := range c.Text {
totalCnt++
if util.IsGarbledChar(string(r)) {
garbledCnt++
}
}
}
if totalCnt > 0 && float64(garbledCnt)/float64(totalCnt) >= 0.5 {
tb.Text = ""
}
// Strategy 2: font-encoding garbled (subset fonts, min 5 chars).
if tb.Text != "" && util.IsGarbledByFontEncoding(boxChars[i], 5) {
tb.Text = ""
}
}
// Step 4: batch OCR recognize boxes without embedded chars (or garbled).
if tb.Text == "" {
needOCR = append(needOCR, i)
}
result = append(result, tb)
}
if len(needOCR) > 0 {
cropped := make([]image.Image, len(needOCR))
for j, idx := range needOCR {
cropped[j] = util.FastCrop(pageImg,
int(boxes[idx].x0*scale), int(boxes[idx].y0*scale),
int(boxes[idx].x1*scale), int(boxes[idx].y1*scale))
}
allTexts, allErrs := doc.OCRRecognizeBatch(ctx, cropped)
for j, idx := range needOCR {
if allErrs[j] != nil {
slog.Warn("ocr merge: recognize failed", "page", pageNum, "err", allErrs[j])
continue
}
var ocrParts []string
for _, t := range allTexts[j] {
if strings.TrimSpace(t.Text) != "" {
ocrParts = append(ocrParts, t.Text)
}
}
result[idx].Text = strings.TrimSpace(strings.Join(ocrParts, " "))
}
}
// Filter out boxes with no text.
filtered := result[:0]
for _, tb := range result {
if tb.Text != "" {
filtered = append(filtered, tb)
}
}
result = filtered
slog.Debug("ocrMergeChars result", "page", pageNum, "boxes", len(result))
return result
return buildTextBoxes(ctx, pageImg, boxes, boxChars, doc, scale, pageNum)
}
// sortYFirstly sorts chars by Y (fuzzy group by threshold), then by X.
@@ -289,3 +215,71 @@ func ocrTableCells(ctx context.Context, cells []pdf.TSRCell, tableImg image.Imag
cells[i].Text = strings.TrimSpace(strings.Join(parts, " "))
}
}
// buildTextBoxes assembles the text of every detect box from the embedded
// chars assigned to it and fills the boxes that end up blank via batch OCR.
//
// boxes and boxChars are parallel slices: boxChars[i] holds the chars matched
// to boxes[i]. scale converts the boxes' PDF-space bounds back to pixel
// coordinates of pageImg for cropping. pageNum is only used for logging.
// Boxes whose text is still blank after OCR are dropped from the result.
func buildTextBoxes(ctx context.Context, pageImg image.Image,
	boxes []ocrDetectBox, boxChars [][]pdf.TextChar, doc pdf.DocAnalyzer, scale float64, pageNum int,
) []pdf.TextBox {
	// One entry per box is appended below, so result[i] corresponds to boxes[i].
	result := make([]pdf.TextBox, 0, len(boxes))
	var needOCR []int // indices of boxes whose text must come from OCR

	for i := range boxes {
		tb := boxes[i].box
		tb.Text = ""
		if len(boxChars[i]) > 0 {
			// Sort chars into reading order: chars within a median char
			// height are treated as the same line and ordered by X,
			// different lines are ordered by Y.
			sortCharsYFirstly(boxChars[i], util.MedianCharHeight(boxChars[i]))
			// LineToTextBox handles word-space insertion at visible gaps.
			lineBox := lyt.LineToTextBox(boxChars[i])
			tb.Text = lineBox.Text

			// Strategy 1: if at least half of the runes are garbled,
			// discard the embedded text so the box falls back to OCR.
			var garbledCnt, totalCnt int
			for _, c := range boxChars[i] {
				for _, r := range c.Text {
					totalCnt++
					if util.IsGarbledChar(string(r)) {
						garbledCnt++
					}
				}
			}
			if totalCnt > 0 && float64(garbledCnt)/float64(totalCnt) >= 0.5 {
				tb.Text = ""
			}
			// Strategy 2: font-encoding garbling (minimum 5 chars).
			if tb.Text != "" && util.IsGarbledByFontEncoding(boxChars[i], 5) {
				tb.Text = ""
			}
		}
		// Whitespace-only text counts as empty and is sent to OCR.
		if strings.TrimSpace(tb.Text) == "" {
			tb.Text = ""
			needOCR = append(needOCR, i)
		}
		result = append(result, tb)
	}

	if len(needOCR) > 0 {
		cropped := make([]image.Image, len(needOCR))
		for j, idx := range needOCR {
			cropped[j] = util.FastCrop(pageImg,
				int(boxes[idx].x0*scale), int(boxes[idx].y0*scale),
				int(boxes[idx].x1*scale), int(boxes[idx].y1*scale))
		}
		allTexts, allErrs := doc.OCRRecognizeBatch(ctx, cropped)
		for j, idx := range needOCR {
			// Length checks keep a short (or nil) result slice from the
			// analyzer from panicking; such boxes simply stay empty.
			if j < len(allErrs) && allErrs[j] != nil {
				slog.Warn("ocr merge: recognize failed", "page", pageNum, "err", allErrs[j])
				continue
			}
			if j >= len(allTexts) {
				continue
			}
			var ocrParts []string
			for _, t := range allTexts[j] {
				if strings.TrimSpace(t.Text) != "" {
					ocrParts = append(ocrParts, t.Text)
				}
			}
			result[idx].Text = strings.TrimSpace(strings.Join(ocrParts, " "))
		}
	}

	// In-place filter: drop boxes that still have no text.
	filtered := result[:0]
	for _, tb := range result {
		if strings.TrimSpace(tb.Text) != "" {
			filtered = append(filtered, tb)
		}
	}
	slog.Debug("ocrMergeChars result", "page", pageNum, "boxes", len(filtered))
	return filtered
}

View File

@@ -1,4 +1,4 @@
package parser
package pdf
import (
"context"

View File

@@ -1,6 +1,6 @@
//go:build cgo && integration
package parser
package pdf
import (
"bytes"
@@ -11,10 +11,10 @@ import (
_ "image/png"
"os"
"path/filepath"
"ragflow/internal/deepdoc/parser/pdf/post"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
"strings"
"testing"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
)
// ── golden-file helpers ────────────────────────────────────────────────────
@@ -95,12 +95,11 @@ func tablesToGolden(tables []pdf.TableItem) []tableGolden {
// TestIntegration_SectionsText verifies section text output matches golden.
func TestIntegration_SectionsText(t *testing.T) {
client := mustConnectInferenceClient(t)
eng := mustOpenEngine(t, "01_english_simple.pdf")
defer eng.Close()
data := mustReadPDF(t, "01_english_simple.pdf")
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -139,12 +138,11 @@ func TestIntegration_SectionsText(t *testing.T) {
// TestIntegration_SectionsCount verifies section count is stable.
func TestIntegration_SectionsCount(t *testing.T) {
client := mustConnectInferenceClient(t)
eng := mustOpenEngine(t, "01_english_simple.pdf")
defer eng.Close()
data := mustReadPDF(t, "01_english_simple.pdf")
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -166,12 +164,11 @@ func TestIntegration_SectionsCount(t *testing.T) {
// TestIntegration_TableStructure verifies table rows and cell text match golden.
func TestIntegration_TableStructure(t *testing.T) {
client := mustConnectInferenceClient(t)
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
data := mustReadPDF(t, "06_table_content.pdf")
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -221,12 +218,11 @@ func TestIntegration_TableStructure(t *testing.T) {
// TestIntegration_TableImageB64 verifies table ImageB64 is valid base64 PNG.
func TestIntegration_TableImageB64(t *testing.T) {
client := mustConnectInferenceClient(t)
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
data := mustReadPDF(t, "06_table_content.pdf")
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -261,12 +257,11 @@ func TestIntegration_TableImageB64(t *testing.T) {
// TestIntegration_LayoutTypes verifies DLA labels boxes with expected types.
func TestIntegration_LayoutTypes(t *testing.T) {
client := mustConnectInferenceClient(t)
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
data := mustReadPDF(t, "06_table_content.pdf")
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -316,7 +311,6 @@ func TestIntegration_Idempotency(t *testing.T) {
// Render a fixture page as the stable input image.
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
pageImg, err := eng.RenderPageImage(0, 216)
if err != nil {
t.Fatalf("render page: %v", err)
@@ -531,12 +525,11 @@ func floatClose(a, b, eps float64) bool {
// fixes from the Python→Go migration.
func TestIntegration_TableAlign(t *testing.T) {
client := mustConnectInferenceClient(t)
eng := mustOpenEngine(t, "18_table_caption.pdf")
defer eng.Close()
data := mustReadPDF(t, "18_table_caption.pdf")
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -572,12 +565,11 @@ func TestIntegration_TableAlign(t *testing.T) {
// (header/footer/reference) boxes are popped from output.
func TestIntegration_GarbageLayout(t *testing.T) {
client := mustConnectInferenceClient(t)
eng := mustOpenEngine(t, "17_garbage_layout.pdf")
defer eng.Close()
data := mustReadPDF(t, "17_garbage_layout.pdf")
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -603,13 +595,12 @@ func TestIntegration_GarbageLayout(t *testing.T) {
// TestIntegration_MultiChunk verifies chunked processing for large documents.
func TestIntegration_MultiChunk(t *testing.T) {
client := mustConnectInferenceClient(t)
eng := mustOpenEngine(t, "19_multipage_chunk.pdf")
defer eng.Close()
data := mustReadPDF(t, "19_multipage_chunk.pdf")
cfg := pdf.DefaultParserConfig()
cfg.BatchSize = 10 // small batches to force multi-batch path
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -635,11 +626,10 @@ func TestIntegration_NoRegression(t *testing.T) {
"07_mixed_content.pdf",
} {
t.Run(name, func(t *testing.T) {
eng := mustOpenEngine(t, name)
defer eng.Close()
data := mustReadPDF(t, name)
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -662,11 +652,10 @@ func TestIntegration_TableRotation(t *testing.T) {
client := mustConnectInferenceClient(t)
t.Run("upright_table", func(t *testing.T) {
eng := mustOpenEngine(t, "rotate_0.pdf")
defer eng.Close()
data := mustReadPDF(t, "rotate_0.pdf")
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -677,16 +666,15 @@ func TestIntegration_TableRotation(t *testing.T) {
})
t.Run("rotated_90_table", func(t *testing.T) {
eng := mustOpenEngine(t, "rotate_90.pdf")
defer eng.Close()
data := mustReadPDF(t, "rotate_90.pdf")
cfg := pdf.DefaultParserConfig()
// DeepDoc DLA does not yet correctly annotate boxes on rotated
// pages (regions and characters are in different coordinate
// spaces post-rotation). Character extraction and rotation are
// verified via the charsToBoxes path.
// verified via the lyt.CharsToBoxes path.
cfg.SkipOCR = true
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -701,12 +689,11 @@ func TestIntegration_TableRotation(t *testing.T) {
// characters with a visible gap (Python __img_ocr space insertion).
func TestIntegration_WordSpacing(t *testing.T) {
client := mustConnectInferenceClient(t)
eng := mustOpenEngine(t, "01_english_simple.pdf")
defer eng.Close()
data := mustReadPDF(t, "01_english_simple.pdf")
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.Parse(context.Background(), data, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -734,53 +721,34 @@ func TestIntegration_WordSpacing(t *testing.T) {
// TestE2E_ParseAndPostProcess runs Parse → PostProcess end-to-end on a real
// PDF. Skips VLM (no tenant_id set) but exercises all other operators.
func TestE2E_ParseAndPostProcess(t *testing.T) {
engine := mustOpenEngine(t, "01_english_simple.pdf")
defer engine.Close()
data := mustReadPDF(t, "01_english_simple.pdf")
mock := &MockDocAnalyzer{Healthy: true}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
result, err := p.Parse(context.Background(), engine)
result, err := p.Parse(context.Background(), data, mock)
if err != nil {
t.Fatalf("Parse: %v", err)
}
preCount := len(result.Sections)
if preCount == 0 {
if len(result.Sections) == 0 {
t.Fatal("Parse() returned zero sections")
}
t.Logf("sections: %d", len(result.Sections))
// Post-processing (no VLM).
config := post.PipelineConfig{
post.ConfigKeyPageWidth: 612.0,
post.ConfigKeyZoom: 1.0,
}
if err := post.PostProcess(context.Background(), result, config); err != nil {
t.Fatalf("PostProcess: %v", err)
}
postCount := len(result.Sections)
t.Logf("sections: %d → %d after PostProcess", preCount, postCount)
if postCount == 0 {
t.Error("PostProcess removed all sections")
}
// Every section must have DocTypeKwd + LayoutType set.
// PostProcess is handled by the Pipeline framework.
// Verify raw parse produces sections with LayoutType set.
for i, s := range result.Sections {
if s.DocTypeKwd == "" {
t.Errorf("section[%d] DocTypeKwd empty after PostProcess", i)
}
if s.LayoutType == "" {
t.Errorf("section[%d] LayoutType empty after PostProcess", i)
}
t.Logf(" section[%d]: layout=%q text=%q", i, s.LayoutType, truncate(s.Text, 60))
}
// Figures() must reflect post-processed sections.
figs := result.Figures()
t.Logf("figures: %d", len(figs))
for _, f := range figs {
if f.LayoutType != "figure" {
t.Errorf("Figures() LayoutType=%q, want 'figure'", f.LayoutType)
}
}
}
// truncate shortens s to at most n bytes and appends "..." when anything was
// cut. The cut is moved back to the nearest rune boundary so that multi-byte
// UTF-8 text (e.g. CJK section text) is never split mid-character. Strings of
// n bytes or fewer are returned unchanged.
func truncate(s string, n int) string {
	if len(s) <= n {
		return s
	}
	// Ranging over a string yields the byte offset of each rune start;
	// keep the last offset that does not exceed n.
	cut := 0
	for i := range s {
		if i > n {
			break
		}
		cut = i
	}
	return s[:cut] + "..."
}

View File

@@ -1,6 +1,6 @@
//go:build cgo && manual
package parser
package pdf
import (
"context"
@@ -47,8 +47,8 @@ func TestIntegration_NoCrash(t *testing.T) {
defer eng.Close()
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.ParseRaw(context.Background(), eng, client)
if err != nil {
t.Fatalf("Parse: %v", err)
}

View File

@@ -1,4 +1,4 @@
package parser
package pdf
import (
"context"
@@ -6,6 +6,7 @@ import (
"strings"
"sync"
"testing"
"math"
lyt "ragflow/internal/deepdoc/parser/pdf/layout"
tbl "ragflow/internal/deepdoc/parser/pdf/table"
@@ -207,15 +208,16 @@ func TestOCR_FallbackIntegration(t *testing.T) {
func TestOCR_FallbackIntegration_NoDeepDoc(t *testing.T) {
chars := garbledSample()
mockEng := &mockEngine{chars: map[int][]pdf.TextChar{0: chars}, pageCount: 1}
mockEng := &MockEngine{Chars: map[int][]pdf.TextChar{0: chars}, NumPages: 1}
mockDLA := &MockDocAnalyzer{Healthy: true}
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
result, err := p.Parse(context.Background(), mockEng)
p := NewParser(cfg)
result, err := p.ParseRaw(context.Background(), mockEng, mockDLA)
if err != nil {
t.Fatal(err)
}
t.Logf("garbled chars: %d sections", len(result.Sections))
t.Logf("garbled Chars: %d sections", len(result.Sections))
}
func TestNoDeepDoc_PdfOxideUnmapped_KeepsChars(t *testing.T) {
@@ -241,9 +243,10 @@ func TestNoDeepDoc_PdfOxideUnmapped_KeepsChars(t *testing.T) {
chars[28] = pdf.TextChar{Text: "*", FontName: "SimSun", X0: 194, X1: 202, Top: 100, Bottom: 112}
chars[29] = pdf.TextChar{Text: "用", FontName: "SimSun", X0: 202, X1: 210, Top: 100, Bottom: 112}
mockEng := &mockEngine{chars: map[int][]pdf.TextChar{0: chars}, pageCount: 1}
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
result, err := p.Parse(context.Background(), mockEng)
mockEng := &MockEngine{Chars: map[int][]pdf.TextChar{0: chars}, NumPages: 1}
mockDLA := &MockDocAnalyzer{Healthy: true}
p := NewParser(pdf.DefaultParserConfig())
result, err := p.ParseRaw(context.Background(), mockEng, mockDLA)
if err != nil {
t.Fatal(err)
}
@@ -279,7 +282,7 @@ func TestIsGarbledPage(t *testing.T) {
})
t.Run("pdf oxide unmapped + CJK — not garbled", func(t *testing.T) {
// ### unmapped glyphs + real CJK text (no subset fonts).
// isScanNoise returns false (≥2 consecutive CJK chars: "护理全科").
// isScanNoise returns false (≥2 consecutive CJK Chars: "护理全科").
chars := []pdf.TextChar{
{Text: "和", PageNumber: 0}, {Text: "蔘", PageNumber: 0},
{Text: "语", PageNumber: 0}, {Text: "言", PageNumber: 0},
@@ -552,11 +555,12 @@ func TestTableSectionCaptionInHTML(t *testing.T) {
// text boxes that are mostly OUTSIDE the cell, even with cellIsEmpty=true.
// The 0.3 threshold should not match a wide box that barely touches a
// narrow cell — this would cause body text to leak into table cells.
// TestParser_ConcurrentSafety verifies that Parser.Parse() is safe for
// TestParser_ConcurrentSafety verifies that Parser.ParseRaw() is safe for
// concurrent use. 8 goroutines each call ParseRaw 5 times on the same Parser
// instance. Run with -race.
func TestParser_ConcurrentSafety(t *testing.T) {
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: false})
mockDLA := &MockDocAnalyzer{Healthy: true}
p := NewParser(pdf.DefaultParserConfig())
var wg sync.WaitGroup
n := 8
@@ -565,10 +569,58 @@ func TestParser_ConcurrentSafety(t *testing.T) {
go func() {
defer wg.Done()
for range 5 {
eng := &mockEngine{pageCount: 2}
_, _ = p.Parse(context.Background(), eng)
eng := &MockEngine{NumPages: 2}
if _, err := p.ParseRaw(context.Background(), eng, mockDLA); err != nil {
t.Errorf("ParseRaw: %v", err)
}
}
}()
}
wg.Wait()
}
func TestParseRaw_ClampsFromPage(t *testing.T) {
// A negative FromPage should be treated as page 0.
// Only page 0 has content so we can verify clamping worked.
eng := &MockEngine{NumPages: 3, Chars: map[int][]pdf.TextChar{
0: {{Text: "page0", X0: 100, X1: 200, Top: 100, Bottom: 120}},
}}
mockDLA := &MockDocAnalyzer{Healthy: true}
cfg := pdf.DefaultParserConfig()
cfg.FromPage = -1
p := NewParser(cfg)
result, err := p.ParseRaw(context.Background(), eng, mockDLA)
if err != nil {
t.Fatalf("ParseRaw: %v", err)
}
if len(result.Sections) == 0 {
t.Error("expected sections from page 0")
}
}
// TestParseRaw_ZeroZoom_NoNaN verifies that Zoom=0 does not leak non-finite
// coordinates into section positions.
//
// Both NaN and ±Inf are rejected: in Go, dividing a non-zero float by zero
// yields ±Inf and only 0/0 yields NaN, so with the non-zero coordinates used
// here a NaN-only check would not notice an unguarded division by the zoom.
func TestParseRaw_ZeroZoom_NoNaN(t *testing.T) {
	eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{
		0: {{Text: "test", X0: 100, X1: 200, Top: 100, Bottom: 120}},
	}}
	mockDLA := &MockDocAnalyzer{Healthy: true}
	cfg := pdf.DefaultParserConfig()
	cfg.Zoom = 0
	p := NewParser(cfg)
	result, err := p.ParseRaw(context.Background(), eng, mockDLA)
	if err != nil {
		t.Fatalf("ParseRaw: %v", err)
	}

	nonFinite := func(v float64) bool { return math.IsNaN(v) || math.IsInf(v, 0) }

	foundPosition := false
	for _, s := range result.Sections {
		for _, pos := range s.Positions {
			foundPosition = true
			if nonFinite(pos.Left) || nonFinite(pos.Top) {
				t.Errorf("Zoom=0 produced non-finite coordinates: left=%v top=%v", pos.Left, pos.Top)
			}
		}
	}
	// Without at least one position the loop above proves nothing.
	if !foundPosition {
		t.Fatal("expected at least one position to validate")
	}
}

View File

@@ -1,6 +1,6 @@
//go:build cgo && manual
package parser
package pdf
import (
"context"
@@ -34,8 +34,8 @@ func TestParse_PdfiumRender(t *testing.T) {
t.Fatalf("RawData() length %d != original %d", len(raw), len(data))
}
// Render a page through pdfium (via the parser's renderPageToImage).
img, err := renderPageToImage(eng, 0)
// Render a page through pdfium (via the parser's RenderPageToImage).
img, err := RenderPageToImage(eng, 0)
if err != nil {
t.Skipf("pdfium render not available: %v", err)
}
@@ -48,8 +48,8 @@ func TestParse_PdfiumRender(t *testing.T) {
// Run Parse with pdfium rendering — BATCH_SKIP_DEEPDOC=1 to avoid HTTP calls.
t.Setenv("BATCH_SKIP_DEEPDOC", "1")
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true})
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -64,10 +64,10 @@ func TestParse_PdfiumRender(t *testing.T) {
}
func TestParse_PdfiumRender_NoData(t *testing.T) {
// When engine has no raw PDF bytes, renderPageToImage falls back to
// When engine has no raw PDF bytes, RenderPageToImage falls back to
// engine.RenderPageImage(). Stub returns (nil, nil) → guard converts
// to ErrNoPDFData so callers never receive a nil image with nil error.
img, err := renderPageToImage(&pythonCharEngineStub{}, 0)
img, err := RenderPageToImage(&pythonCharEngineStub{}, 0)
if err != ErrNoPDFData {
t.Errorf("expected ErrNoPDFData, got %v", err)
}

View File

@@ -1,6 +1,6 @@
//go:build cgo
package parser
package pdf
import (
"image"
@@ -11,8 +11,8 @@ import (
)
// PDFOxideEngine adapts pdfoxide.Engine to the pdf.PDFEngine interface.
type pdfoxideEngine struct {
inner *pdfoxide.Engine
type PDFOxideEngine struct {
Inner *pdfoxide.Engine
}
// NewEngine returns a pdf.PDFEngine backed by pdf_oxide.
@@ -21,15 +21,15 @@ func NewEngine(pdfBytes []byte) (pdf.PDFEngine, error) {
if err != nil {
return nil, err
}
return &pdfoxideEngine{inner: eng}, nil
return &PDFOxideEngine{Inner: eng}, nil
}
func (e *pdfoxideEngine) RawData() []byte { return e.inner.RawData() }
func (e *pdfoxideEngine) PageCount() (int, error) { return e.inner.PageCount() }
func (e *pdfoxideEngine) Close() error { return e.inner.Close() }
func (e *PDFOxideEngine) RawData() []byte { return e.Inner.RawData() }
func (e *PDFOxideEngine) PageCount() (int, error) { return e.Inner.PageCount() }
func (e *PDFOxideEngine) Close() error { return e.Inner.Close() }
func (e *pdfoxideEngine) Outlines() ([]pdf.Outline, error) {
ol := pdfium.ExtractOutlines(e.inner.RawData())
func (e *PDFOxideEngine) Outlines() ([]pdf.Outline, error) {
ol := pdfium.ExtractOutlines(e.Inner.RawData())
result := make([]pdf.Outline, len(ol))
for i, o := range ol {
result[i] = pdf.Outline{Title: o.Title, Level: o.Level, PageNumber: o.PageNumber}
@@ -37,16 +37,16 @@ func (e *pdfoxideEngine) Outlines() ([]pdf.Outline, error) {
return result, nil
}
func (e *pdfoxideEngine) RenderPage(pageNum int, dpi float64) ([]byte, error) {
return e.inner.RenderPage(pageNum, dpi)
func (e *PDFOxideEngine) RenderPage(pageNum int, dpi float64) ([]byte, error) {
return e.Inner.RenderPage(pageNum, dpi)
}
func (e *pdfoxideEngine) RenderPageImage(pageNum int, dpi float64) (image.Image, error) {
return e.inner.RenderPageImage(pageNum, dpi)
func (e *PDFOxideEngine) RenderPageImage(pageNum int, dpi float64) (image.Image, error) {
return e.Inner.RenderPageImage(pageNum, dpi)
}
func (e *pdfoxideEngine) ExtractChars(pageNum int) ([]pdf.TextChar, error) {
chars, err := e.inner.ExtractChars(pageNum)
func (e *PDFOxideEngine) ExtractChars(pageNum int) ([]pdf.TextChar, error) {
chars, err := e.Inner.ExtractChars(pageNum)
if err != nil {
return nil, err
}

View File

@@ -1,6 +1,6 @@
//go:build cgo && manual
package parser
package pdf
import (
"context"
@@ -13,6 +13,7 @@ import (
lyt "ragflow/internal/deepdoc/parser/pdf/layout"
"ragflow/internal/deepdoc/parser/pdf/tool"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
util "ragflow/internal/deepdoc/parser/pdf/util"
)
// TestPipelineParity verifies Go pipeline logic equivalence with Python.
@@ -53,8 +54,9 @@ func TestPipelineParity(t *testing.T) {
// Run Go pipeline (SKIP_OCR — no DeepDoc)
cfg := pdf.DefaultParserConfig()
cfg.SortByTop = true
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
result, err := p.Parse(context.Background(), engine)
mockAnalyzer := &MockDocAnalyzer{Healthy: true}
p := NewParser(cfg)
result, err := p.ParseRaw(context.Background(), engine, mockAnalyzer)
if err != nil {
t.Errorf("%s: Parse: %v", name, err)
continue
@@ -151,7 +153,7 @@ func TestVMWhitespaceGapBridge(t *testing.T) {
if isWS && len(out) > 0 {
prev := &out[len(out)-1]
gap := b.Top - prev.Bottom
ov := OverlapX(prev, &b)
ov := util.OverlapX(prev, &b)
// Python: gap passes AND xov passes → whitespace merged
// into prev, extending bottom. i advances (Go for-loop).
if gap <= thr && ov >= 0.3 {
@@ -169,7 +171,7 @@ func TestVMWhitespaceGapBridge(t *testing.T) {
continue
}
gap := b.Top - prev.Bottom
ov := OverlapX(prev, &b)
ov := util.OverlapX(prev, &b)
if gap > thr {
out = append(out, b)
continue
@@ -219,7 +221,7 @@ func TestVMWhitespaceGapBridge(t *testing.T) {
continue
}
gap := b.Top - prev.Bottom
ov := OverlapX(prev, &b)
ov := util.OverlapX(prev, &b)
if gap > thr {
out = append(out, b)
continue
@@ -250,18 +252,18 @@ func TestVMWhitespaceGapBridge(t *testing.T) {
t.Logf("Gap with bridge: 420.16 - 406.79 = %.2f < %.2f = MERGE", 420.16-406.79, thr)
// The manual vWithWS (Python-like) and vNoWS (old Go pre-filter) still
// differ — the mechanism is real. But production NaiveVerticalMerge now
// differ — the mechanism is real. But production lyt.NaiveVerticalMerge now
// handles whitespace inline (gap bridge), matching Python.
if nWS == nNoWS {
t.Error("Manual implementations should differ — the gap bridge mechanism is real")
}
// Verify production NaiveVerticalMerge matches vWithWS (Python behavior).
// Verify production lyt.NaiveVerticalMerge matches vWithWS (Python behavior).
mhMap := map[int]float64{1: mh}
mwMap := map[int]float64{1: 5}
vmResult := lyt.NaiveVerticalMerge(boxes, mhMap, mwMap, false)
t.Logf("NaiveVerticalMerge (production): %d sections", len(vmResult))
t.Logf("lyt.NaiveVerticalMerge (production): %d sections", len(vmResult))
if len(vmResult) != nWS {
t.Errorf("NaiveVerticalMerge produced %d sections, want %d (Python-like with gap bridge)", len(vmResult), nWS)
t.Errorf("lyt.NaiveVerticalMerge produced %d sections, want %d (Python-like with gap bridge)", len(vmResult), nWS)
}
}

View File

@@ -1,101 +0,0 @@
package post
import (
"bytes"
"context"
"encoding/base64"
"errors"
"fmt"
"image"
"image/png"
)
// ── chat driver interface (self-contained, avoids entity/models import) ──
// ChatDriver is the subset of modelModule.ModelDriver needed to call a
// vision-capable chat API. Defined here to keep model_image_describer.go
// self-contained and avoid import chains that require CGO.
//
// ModelImageDescriber.DescribeImage is the caller in this file: it passes a
// single user message, forwards its stored apiConfig unchanged, and passes a
// nil chatConfig unless a positive token limit was configured, so
// implementations must tolerate a nil chatConfig.
type ChatDriver interface {
ChatWithMessages(modelName string, messages []ChatMessage, apiConfig *ChatAPIConfig, chatConfig *ChatConfig) (*ChatResponse, error)
}
// ChatMessage is this package's copy of modelModule.Message: one chat turn.
// Content is untyped because it can hold either plain text or a list of
// typed parts (DescribeImage sends a text part plus an image_url part).
type ChatMessage struct {
	Role       string           `json:"role"`
	Content    any              `json:"content"`
	ToolCallID string           `json:"tool_call_id,omitempty"`
	ToolCalls  []map[string]any `json:"tool_calls,omitempty"`
}
// ChatAPIConfig mirrors modelModule.APIConfig.
// All fields are optional pointers; presumably nil means "use the driver's
// default" — verify against the driver implementation. This package only
// stores the value and forwards it to ChatDriver.ChatWithMessages.
type ChatAPIConfig struct {
ApiKey *string
Region *string
BaseURL *string
}
// ChatConfig mirrors modelModule.ChatConfig (may be nil).
// It declares no fields here, so it cannot carry per-request settings such as
// a token limit; callers can only distinguish a nil from a non-nil value.
type ChatConfig struct{}
// ChatResponse is this package's copy of modelModule.ChatResponse.
// Answer is a pointer so that a missing answer (nil) can be told apart from
// an empty one; DescribeImage treats both as an error.
type ChatResponse struct {
	Answer        *string          `json:"answer"`
	ReasonContent *string          `json:"reason_content"`
	ToolCalls     []map[string]any `json:"tool_calls,omitempty"`
}
// ── ModelImageDescriber ────────────────────────────────────────────────
// ModelImageDescriber implements ImageDescriber via any ChatDriver.
// Values are created with NewModelImageDescriber; the fields are set once
// there and only read by DescribeImage.
type ModelImageDescriber struct {
// driver performs the actual chat call.
driver ChatDriver
// modelName is forwarded to ChatWithMessages as the model to use.
modelName string
// apiConfig is forwarded unchanged; it may be nil.
apiConfig *ChatAPIConfig
// maxTokens > 0 makes DescribeImage pass a non-nil ChatConfig.
maxTokens int
}
// NewModelImageDescriber builds a ModelImageDescriber that asks driver d to
// describe images with the model called name, forwarding cfg on every call.
//
// maxTokens is stored as given; 0 means use the provider default.
// NOTE(review): ChatConfig declares no fields in this file, so the limit is
// not actually transmitted — DescribeImage only switches between a nil and
// an empty ChatConfig based on it.
func NewModelImageDescriber(d ChatDriver, name string, cfg *ChatAPIConfig, maxTokens int) *ModelImageDescriber {
	describer := new(ModelImageDescriber)
	describer.driver = d
	describer.modelName = name
	describer.apiConfig = cfg
	describer.maxTokens = maxTokens
	return describer
}
// DescribeImage sends img as a base64 PNG data URL, together with prompt, in
// an OpenAI-compatible vision chat request and returns the model's answer.
//
// It returns an error when ctx is already done, when the image cannot be
// encoded, when the driver fails, or when the driver returns no answer
// (nil response, nil Answer, or empty string).
func (d *ModelImageDescriber) DescribeImage(ctx context.Context, img image.Image, prompt string) (string, error) {
	// ChatWithMessages takes no context, so this is the only point where
	// cancellation can be honoured: skip the encode and the remote call.
	if err := ctx.Err(); err != nil {
		return "", fmt.Errorf("image describe: %w", err)
	}

	dataURL, err := encodeImageToBase64DataURL(img)
	if err != nil {
		return "", fmt.Errorf("image encode: %w", err)
	}

	// One user message with two parts: the text prompt and the image.
	msgs := []ChatMessage{{
		Role: "user",
		Content: []any{
			map[string]any{"type": "text", "text": prompt},
			map[string]any{"type": "image_url", "image_url": map[string]string{"url": dataURL}},
		},
	}}

	// ChatConfig has no fields; a positive maxTokens only makes it non-nil.
	var chatCfg *ChatConfig
	if d.maxTokens > 0 {
		chatCfg = &ChatConfig{}
	}

	resp, err := d.driver.ChatWithMessages(d.modelName, msgs, d.apiConfig, chatCfg)
	if err != nil {
		return "", fmt.Errorf("image describe: %w", err)
	}
	// A driver returning (nil, nil) is treated like an empty answer rather
	// than dereferencing a nil response.
	if resp == nil || resp.Answer == nil || *resp.Answer == "" {
		return "", errors.New("image describe: empty response")
	}
	return *resp.Answer, nil
}
// encodeImageToBase64DataURL encodes img as PNG and returns it as a
// "data:image/png;base64,..." URL. It returns an error when img is nil
// (instead of panicking inside the PNG encoder) or when encoding fails.
func encodeImageToBase64DataURL(img image.Image) (string, error) {
	if img == nil {
		return "", errors.New("nil image")
	}
	var buf bytes.Buffer
	if err := png.Encode(&buf, img); err != nil {
		return "", err
	}
	return "data:image/png;base64," + base64.StdEncoding.EncodeToString(buf.Bytes()), nil
}

View File

@@ -1,79 +0,0 @@
package post
import (
"context"
"errors"
"image"
"image/color"
"strings"
"testing"
)
// ── mock ChatDriver ────────────────────────────────────────────────────
// mockChatDriver is a canned ChatDriver: it returns err when set, otherwise
// a response whose Answer is answer. All call arguments are ignored.
type mockChatDriver struct {
	answer string
	err    error
}

// ChatWithMessages implements ChatDriver with the canned result.
func (m *mockChatDriver) ChatWithMessages(string, []ChatMessage, *ChatAPIConfig, *ChatConfig) (*ChatResponse, error) {
	if m.err == nil {
		// Copy so the response does not point into the mock itself.
		reply := m.answer
		return &ChatResponse{Answer: &reply}, nil
	}
	return nil, m.err
}
// ── ModelImageDescriber tests ──────────────────────────────────────────
// TestModelImageDescriber_Success checks that the driver's answer is returned
// unchanged when the chat call succeeds.
func TestModelImageDescriber_Success(t *testing.T) {
	const want = "A chart showing revenue growth."
	picture := newTestImage(100, 100)
	describer := NewModelImageDescriber(&mockChatDriver{answer: want}, "gpt-4o", nil, 0)

	got, err := describer.DescribeImage(context.Background(), picture, "Describe this chart")
	switch {
	case err != nil:
		t.Fatalf("unexpected error: %v", err)
	case got != want:
		t.Errorf("got %q, want %q", got, want)
	}
}
// TestModelImageDescriber_DriverError checks that a failure reported by the
// driver surfaces as an error from DescribeImage.
func TestModelImageDescriber_DriverError(t *testing.T) {
	picture := newTestImage(100, 100)
	failing := &mockChatDriver{err: errors.New("API rate limited")}
	describer := NewModelImageDescriber(failing, "gpt-4o", nil, 0)

	if _, err := describer.DescribeImage(context.Background(), picture, "prompt"); err == nil {
		t.Fatal("expected error, got nil")
	}
}
// TestModelImageDescriber_EmptyAnswer checks that an empty answer from the
// driver is reported as an error rather than returned as a description.
func TestModelImageDescriber_EmptyAnswer(t *testing.T) {
	picture := newTestImage(100, 100)
	silent := &mockChatDriver{answer: ""}
	describer := NewModelImageDescriber(silent, "gpt-4o", nil, 0)

	if _, err := describer.DescribeImage(context.Background(), picture, "prompt"); err == nil {
		t.Fatal("expected error for empty answer, got nil")
	}
}
// ── encodeImageToBase64DataURL tests ───────────────────────────────────
// TestEncodeImageToBase64DataURL encodes a single red pixel and checks that
// the result carries the PNG data-URL prefix.
func TestEncodeImageToBase64DataURL(t *testing.T) {
	const prefix = "data:image/png;base64,"

	pixel := image.NewRGBA(image.Rect(0, 0, 1, 1))
	pixel.Set(0, 0, color.RGBA{R: 255, G: 0, B: 0, A: 255})

	got, err := encodeImageToBase64DataURL(pixel)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if strings.HasPrefix(got, prefix) {
		return
	}
	// Only the first 50 bytes are shown to keep the failure message short.
	t.Errorf("missing data URL prefix: %s...", got[:min(50, len(got))])
}

View File

@@ -1,114 +0,0 @@
package post
import (
"context"
"testing"
pdftype "ragflow/internal/deepdoc/parser/pdf/type"
)
// ── Tests for remove_toc config flag ────────────────────────────────────────
// TestPostProcess_RemoveTOC_DisabledByConfig verifies that when
// remove_toc=false, outlines are NOT used to remove TOC pages even
// when outlines are present.
func TestPostProcess_RemoveTOC_DisabledByConfig(t *testing.T) {
result := newTestResult(
makePosSection("目录内容 page1", 1, 100, 500, 100, 200),
makePosSection("更多目录 page2", 2, 100, 500, 100, 200),
makePosSection("第一章 正文", 3, 100, 500, 100, 200),
makePosSection("第二章 正文", 5, 100, 500, 100, 200),
)
outlines := []pdftype.Outline{
{Title: "目录", Level: 0, PageNumber: 1},
{Title: "第一章", Level: 0, PageNumber: 3},
{Title: "第二章", Level: 0, PageNumber: 5},
}
config := PipelineConfig{
ConfigKeyRemoveTOC: false,
ConfigKeyOutlines: outlines,
}
err := PostProcess(context.Background(), result, config)
if err != nil {
t.Fatal(err)
}
if len(result.Sections) != 4 {
t.Errorf("remove_toc=false should keep all sections, got %d", len(result.Sections))
}
}
// TestPostProcess_RemoveTOC_EnabledByConfig verifies that when
// remove_toc=true and outlines are present, TOC pages are removed.
func TestPostProcess_RemoveTOC_EnabledByConfig(t *testing.T) {
result := newTestResult(
makePosSection("目录内容 page1", 1, 100, 500, 100, 200),
makePosSection("更多目录 page2", 2, 100, 500, 100, 200),
makePosSection("第一章 正文", 3, 100, 500, 100, 200),
makePosSection("第二章 正文", 5, 100, 500, 100, 200),
)
outlines := []pdftype.Outline{
{Title: "目录", Level: 0, PageNumber: 1},
{Title: "第一章", Level: 0, PageNumber: 3},
{Title: "第二章", Level: 0, PageNumber: 5},
}
config := PipelineConfig{
ConfigKeyRemoveTOC: true,
ConfigKeyOutlines: outlines,
}
err := PostProcess(context.Background(), result, config)
if err != nil {
t.Fatal(err)
}
if len(result.Sections) != 2 {
t.Errorf("remove_toc=true should remove TOC pages, got %d sections", len(result.Sections))
}
for _, s := range result.Sections {
for _, p := range s.Positions {
for _, pn := range p.PageNumbers {
if pn < 3 {
t.Errorf("TOC page %d should have been removed: section %q", pn, s.Text)
}
}
}
}
}
// TestPostProcess_RemoveTOC_NoOutlines checks that remove_toc=true alone is
// not enough: without outlines in the config (or on the result) nothing is
// removed.
func TestPostProcess_RemoveTOC_NoOutlines(t *testing.T) {
	res := newTestResult(
		makePosSection("目录内容", 1, 100, 500, 100, 200),
		makePosSection("第一章 正文", 3, 100, 500, 100, 200),
	)
	cfg := PipelineConfig{ConfigKeyRemoveTOC: true}

	if err := PostProcess(context.Background(), res, cfg); err != nil {
		t.Fatal(err)
	}
	if got := len(res.Sections); got != 2 {
		t.Errorf("no outlines → all sections kept, got %d", got)
	}
}
// TestPostProcess_RemoveTOC_EmptyOutlines checks that an explicitly empty
// outline slice behaves like no outlines at all: every section is kept.
func TestPostProcess_RemoveTOC_EmptyOutlines(t *testing.T) {
	res := newTestResult(
		makePosSection("目录", 1, 100, 500, 100, 200),
		makePosSection("正文", 2, 100, 500, 100, 200),
	)
	cfg := PipelineConfig{
		ConfigKeyRemoveTOC: true,
		ConfigKeyOutlines:  []pdftype.Outline{},
	}

	if err := PostProcess(context.Background(), res, cfg); err != nil {
		t.Fatal(err)
	}
	if got := len(res.Sections); got != 2 {
		t.Errorf("empty outlines → all sections kept, got %d", got)
	}
}

View File

@@ -1,436 +0,0 @@
package post
import (
"context"
"errors"
"math"
"regexp"
"sort"
"strings"
"sync"
pdftype "ragflow/internal/deepdoc/parser/pdf/type"
"ragflow/internal/deepdoc/parser/pdf/util"
)
// ── Config ─────────────────────────────────────────────────────────────
// Config keys for PipelineConfig. The notes on each key describe how
// PostProcess reads it; a value stored with a different Go type is treated
// as absent by the typed accessors.
const (
// float64; multi-column reordering runs only when the value is > 0.
ConfigKeyPageWidth = "page_width"
// float64; the page width is divided by it. Missing or <= 0 means 1.0.
ConfigKeyZoom = "zoom"
// []pdftype.Outline; when empty, PostProcess falls back to result.Outlines.
ConfigKeyOutlines = "outlines"
// bool; when true every section becomes doc type "text" and its Image is cleared.
ConfigKeyFlattenMediaToText = "flatten_media_to_text"
// string; VLM enhancement is attempted only when this and vlm_llm_id are both non-empty.
ConfigKeyTenantID = "tenant_id"
// string; identifier passed to the image-describer resolver together with tenant_id.
ConfigKeyVLMLLMID = "vlm_llm_id"
// bool; TOC page removal runs only when true and outlines are available.
ConfigKeyRemoveTOC = "remove_toc"
)
// PipelineConfig carries the post-processing parameters as a loosely typed
// key/value map. Use the ConfigKey* constants as keys and the typed accessor
// methods to read values.
type PipelineConfig map[string]any
// Float64 looks up key and returns its value when it is stored as a float64.
// It returns default_ when the config is nil, the key is missing, or the
// stored value has any other type.
func (c PipelineConfig) Float64(key string, default_ float64) float64 {
	// Indexing a nil map is safe and yields a nil value, which fails the
	// type assertion just like a missing key does.
	if f, ok := c[key].(float64); ok {
		return f
	}
	return default_
}
// Bool looks up key and returns its value when it is stored as a bool.
// A nil config, a missing key, or a value of another type all yield false.
func (c PipelineConfig) Bool(key string) bool {
	// A failed assertion leaves flag at its zero value, false.
	flag, _ := c[key].(bool)
	return flag
}
// Outlines returns the value stored under ConfigKeyOutlines when it is a
// []pdftype.Outline. A nil config, a missing key, or a value of another type
// all yield nil.
func (c PipelineConfig) Outlines() []pdftype.Outline {
	// A failed assertion leaves entries at its zero value, a nil slice.
	entries, _ := c[ConfigKeyOutlines].([]pdftype.Outline)
	return entries
}
// String looks up key and returns its value when it is stored as a string.
// A nil config, a missing key, or a value of another type all yield "".
func (c PipelineConfig) String(key string) string {
	// A failed assertion leaves text at its zero value, the empty string.
	text, _ := c[key].(string)
	return text
}
// ── Patterns ───────────────────────────────────────────────────────────
// headerFooterPattern matches layout types that should be treated as
// page furniture and dropped by filterHeaderFooter.
//
// The Python reference cited here (r"(header|footer|number)" in
// parser.py:637) has three alternatives; this Go pattern additionally
// matches "reference". The pattern is unanchored, so any layout type that
// merely contains one of these words matches as well.
var headerFooterPattern = regexp.MustCompile(`(header|footer|number|reference)`)
// tocTitlePattern matches outline titles that mark a table-of-contents page.
// Python: r"(contents|目录|目次|table of contents|致谢|acknowledge)$"
//
// Unlike the Python expression quoted above, this pattern is anchored at both
// ends and case-insensitive, so the whole title must equal one of the
// alternatives.
var tocTitlePattern = regexp.MustCompile(`(?i)^(contents|目录|目次|table of contents|致谢|acknowledge)$`)
// ── PostProcess ────────────────────────────────────────────────────────
// PostProcess applies PDF post-processing to a ParseResult in-place.
// The config map controls which features to enable; a nil config behaves
// like an empty one.
//
// Execution order (matches Python _pdf):
//  1. reorderMultiColumn  — if page_width > 0
//  2. removeTOCByOutlines — if remove_toc is true and outlines are available
//     (config outlines first, otherwise result.Outlines)
//  3. normalizeLayoutType — always
//  4. filterHeaderFooter  — always
//  5. assignDocTypeKwd    — always (respects flatten_media_to_text)
//  6. enhanceWithVision   — if tenant_id and vlm_llm_id are both set and an
//     image-describer resolver has been registered
//
// It returns an error when result is nil, when the resolver fails, or when
// enhanceWithVision reports one.
func PostProcess(ctx context.Context, result *pdftype.ParseResult, config PipelineConfig) error {
	if result == nil {
		return errors.New("PostProcess: nil result")
	}
	if config == nil {
		config = PipelineConfig{}
	}

	// 1. Multi-column reorder.
	if pw := config.Float64(ConfigKeyPageWidth, 0); pw > 0 {
		zoom := config.Float64(ConfigKeyZoom, 1.0)
		if zoom <= 0 {
			zoom = 1.0
		}
		reorderMultiColumn(result, pw, zoom)
	}

	// 2. Remove TOC pages (only when explicitly enabled).
	// Outlines from config take precedence; otherwise read from ParseResult.
	outlines := config.Outlines()
	if len(outlines) == 0 {
		outlines = result.Outlines
	}
	if config.Bool(ConfigKeyRemoveTOC) && len(outlines) > 0 {
		removeTOCByOutlines(result, outlines)
	}

	// 3-5. Always-on steps.
	normalizeLayoutType(result)
	filterHeaderFooter(result)
	assignDocTypeKwd(result, config.Bool(ConfigKeyFlattenMediaToText))

	// 6. VLM enhancement.
	tenantID := config.String(ConfigKeyTenantID)
	vlmLLMID := config.String(ConfigKeyVLMLLMID)
	if tenantID == "" || vlmLLMID == "" {
		return nil
	}
	// Calling a nil function value panics, so skip the step when no resolver
	// has been registered instead of crashing the whole pipeline.
	resolve := resolveImageDescriber
	if resolve == nil {
		return nil
	}
	describer, err := resolve(tenantID, vlmLLMID)
	if err != nil {
		return err
	}
	return enhanceWithVision(ctx, result, describer)
}
// resolveImageDescriber resolves a VLM model from tenant config and returns
// an ImageDescriber. Corresponds to Python's
// get_model_config_from_provider_instance + LLMBundle.
//
// The variable is nil until something assigns it. According to the original
// note, the implementation is assigned by init() in post_steps_cgo.go
// (production) or post_steps_no_cgo.go (stub) — those files are not visible
// from here, so confirm before relying on it. Overridable in tests.
var resolveImageDescriber func(tenantID, llmID string) (ImageDescriber, error)
// SetImageDescriberResolver sets the factory that creates an ImageDescriber
// from tenant/LLM configuration. Higher layers (e.g. EE extensions or the
// PDF document pipeline entry point) register the real implementation via
// init().
//
// The assignment is a plain, unsynchronized write to a package-level
// variable, so it presumably has to happen before PostProcess runs
// concurrently — confirm against the callers.
// NOTE(review): the resolver stays nil until it is assigned; register one
// before running PostProcess with tenant_id and vlm_llm_id configured.
func SetImageDescriberResolver(fn func(tenantID, llmID string) (ImageDescriber, error)) {
resolveImageDescriber = fn
}
// ── normalizeLayoutType ────────────────────────────────────────────────
// normalizeLayoutType rewrites every section's LayoutType in place: the value
// is whitespace-trimmed, and a value that is empty after trimming becomes
// "text". Mirrors the layout_type normalization in Python's parser.py.
func normalizeLayoutType(result *pdftype.ParseResult) {
	for i := range result.Sections {
		sec := &result.Sections[i]
		if trimmed := strings.TrimSpace(sec.LayoutType); trimmed != "" {
			sec.LayoutType = trimmed
		} else {
			sec.LayoutType = "text"
		}
	}
}
// ── filterHeaderFooter ─────────────────────────────────────────────────
// filterHeaderFooter drops, in place, every section whose trimmed LayoutType
// matches headerFooterPattern (header/footer/number/reference). The relative
// order of the remaining sections is unchanged.
// Python: remove_header_footer config.
func filterHeaderFooter(result *pdftype.ParseResult) {
	// Reuse the backing array; the write index never passes the read index.
	kept := result.Sections[:0]
	for i := range result.Sections {
		sec := result.Sections[i]
		if !headerFooterPattern.MatchString(strings.TrimSpace(sec.LayoutType)) {
			kept = append(kept, sec)
		}
	}
	result.Sections = kept
}
// ── assignDocTypeKwd ───────────────────────────────────────────────────
// assignDocTypeKwd fills in DocTypeKwd for every section.
//
// With flatten set, each section is marked "text" and its Image is cleared,
// matching Python where flatten_media_to_text and VLM are mutually exclusive.
// Otherwise the trimmed LayoutType decides: "table" → "table", "figure" →
// "image", an empty layout with a non-empty Image → "image", anything else →
// "text". Python: parser.py:639-648.
func assignDocTypeKwd(result *pdftype.ParseResult, flatten bool) {
	for i := range result.Sections {
		sec := &result.Sections[i]
		if flatten {
			sec.DocTypeKwd, sec.Image = "text", ""
			continue
		}

		kwd := "text"
		switch layout := strings.TrimSpace(sec.LayoutType); {
		case layout == "table":
			kwd = "table"
		case layout == "figure", layout == "" && sec.Image != "":
			kwd = "image"
		}
		sec.DocTypeKwd = kwd
	}
}
// ── enhanceWithVision ──────────────────────────────────────────────────
// enhanceWithVision adds VLM-generated descriptions to image/table sections.
//
// Only sections whose DocTypeKwd is "table" or "image" and whose Image is
// non-empty are processed. It is a no-op when describer is nil or there are
// no sections. Each qualifying section is handled in its own goroutine; the
// sem channel caps the number in flight at maxDescribeConcurrency, and the
// call blocks until all of them have finished.
//
// Enhancement is best-effort: a section whose image fails to decode, or whose
// description call fails or comes back empty, keeps its original Text, and
// the function always returns nil.
func enhanceWithVision(ctx context.Context, result *pdftype.ParseResult, describer ImageDescriber) error {
if describer == nil {
return nil
}
if len(result.Sections) == 0 {
return nil
}
// Buffered channel used as a counting semaphore.
sem := make(chan struct{}, maxDescribeConcurrency)
var wg sync.WaitGroup
for i := range result.Sections {
s := &result.Sections[i]
if s.DocTypeKwd != "table" && s.DocTypeKwd != "image" {
continue
}
if s.Image == "" {
continue
}
wg.Add(1)
// Acquire a slot before spawning; this blocks while the limit is reached.
sem <- struct{}{}
// Image and Text are passed by value; the goroutine writes only
// result.Sections[idx], so workers never touch the same element.
go func(idx int, imgB64 string, origText string) {
defer wg.Done()
defer func() { <-sem }()
img, err := util.DecodeBase64PNG(imgB64)
if err != nil || img == nil {
return
}
desc, err := DescribeImage(ctx, img, describePrompt, describer)
if err != nil || desc == "" {
return
}
// Append the description after existing text, or use it as the text.
if origText != "" {
result.Sections[idx].Text = origText + "\n" + desc
} else {
result.Sections[idx].Text = desc
}
}(i, s.Image, s.Text)
}
wg.Wait()
return nil
}
// ── removeTOCByOutlines ────────────────────────────────────────────────
// removeTOCByOutlines drops, in place, every section whose first page number
// lies in the half-open TOC range reported by findTOCPageRange. Nothing is
// removed when outlines is empty or when the range is empty or inverted
// (content page <= TOC page).
func removeTOCByOutlines(result *pdftype.ParseResult, outlines []pdftype.Outline) {
	if len(outlines) == 0 {
		return
	}
	first, end := findTOCPageRange(outlines)
	if end <= first {
		return
	}

	// Reuse the backing array; the write index never passes the read index.
	kept := result.Sections[:0]
	for _, sec := range result.Sections {
		if page := sectionPage(sec); page < first || page >= end {
			kept = append(kept, sec)
		}
	}
	result.Sections = kept
}
// findTOCPageRange locates the first outline whose title matches
// tocTitlePattern and returns the half-open page range
// [tocPage, contentPage) it covers.
//
// tocPage is that outline's page number. contentPage is the page number of
// the first later outline that has the same Level and is not itself a TOC
// title. Titles are trimmed, cut at the first "@@", trimmed again and
// lower-cased before matching.
//
// It returns (0, 0) when no TOC outline exists, and (tocPage, 0) when a TOC
// outline exists but no qualifying later outline follows it.
func findTOCPageRange(outlines []pdftype.Outline) (tocPage, contentPage int) {
	isTOC := func(raw string) bool {
		title := strings.TrimSpace(raw)
		if cut := strings.Index(title, "@@"); cut >= 0 {
			title = strings.TrimSpace(title[:cut])
		}
		return tocTitlePattern.MatchString(strings.ToLower(title))
	}

	for i := range outlines {
		if !isTOC(outlines[i].Title) {
			continue
		}
		tocPage = outlines[i].PageNumber
		for _, sibling := range outlines[i+1:] {
			if sibling.Level == outlines[i].Level && !isTOC(sibling.Title) {
				contentPage = sibling.PageNumber
				break
			}
		}
		// Only the first TOC outline is considered.
		return tocPage, contentPage
	}
	return tocPage, contentPage
}
// sectionPage returns the first page number recorded for a Section: the first
// entry of the first Position that has any PageNumbers. It returns 0 when no
// position carries a page number.
func sectionPage(s pdftype.Section) int {
	for i := range s.Positions {
		if pages := s.Positions[i].PageNumbers; len(pages) > 0 {
			return pages[0]
		}
	}
	return 0
}
// ── reorderMultiColumn ─────────────────────────────────────────────────
// reorderMultiColumn reorders sections in multi-column layouts.
// If median text column width >= page width / 2 (single-column layout),
// the input order is preserved. The order is also left untouched when there
// are fewer than two sections or when no "text" section has a positive width.
//
// pageWidth is divided by zoom before the comparison. When reordering does
// happen, ALL sections are sorted, not only the text ones.
//
// Python: reorder_multi_column_bboxes + sort_X_by_page
func reorderMultiColumn(result *pdftype.ParseResult, pageWidth, zoom float64) {
if len(result.Sections) < 2 {
return
}
pw := pageWidth / zoom
// Compute median width from text sections with valid coordinates.
// NOTE(review): the comparison is against the exact string "text";
// PostProcess calls this before normalizeLayoutType, so sections with an
// empty or padded LayoutType are not counted — confirm this is intended.
var widths []float64
for _, s := range result.Sections {
if s.LayoutType != "text" {
continue
}
if len(s.Positions) == 0 {
continue
}
// Width is taken from the first position only.
w := s.Positions[0].Right - s.Positions[0].Left
if w > 0 {
widths = append(widths, w)
}
}
if len(widths) == 0 {
return
}
sort.Float64s(widths)
// For an even count this picks the upper of the two middle values.
medianW := widths[len(widths)/2]
if medianW >= pw/2 {
return // single column
}
// Sort by (PageNumber, X0, Top). X0 values closer than 1e-6 count as equal.
// sort.Slice is not stable, so fully tied sections may change relative order.
sort.Slice(result.Sections, func(i, j int) bool {
pi := sectionPage(result.Sections[i])
pj := sectionPage(result.Sections[j])
if pi != pj {
return pi < pj
}
xi := sectionX0(result.Sections[i])
xj := sectionX0(result.Sections[j])
if math.Abs(xi-xj) > 1e-6 {
return xi < xj
}
return sectionTop(result.Sections[i]) < sectionTop(result.Sections[j])
})
// Sections whose X0 differ by less than half the median width are treated
// as belonging to the same column.
threshold := medianW / 2
// Correct same-page sections with nearly-same X0 but inverted Top.
// Repeated passes of adjacent swaps: neighbours j and j+1 are exchanged when
// they share a page and a column but the later one sits higher on the page.
for i := len(result.Sections) - 1; i >= 1; i-- {
for j := i - 1; j >= 0; j-- {
if math.Abs(sectionX0(result.Sections[j+1])-sectionX0(result.Sections[j])) < threshold &&
sectionTop(result.Sections[j+1]) < sectionTop(result.Sections[j]) &&
sectionPage(result.Sections[j+1]) == sectionPage(result.Sections[j]) {
result.Sections[j], result.Sections[j+1] = result.Sections[j+1], result.Sections[j]
}
}
}
}
// sectionX0 returns the Left coordinate of the section's first position, or 0
// when the section has no positions.
func sectionX0(s pdftype.Section) float64 {
	if len(s.Positions) == 0 {
		return 0
	}
	return s.Positions[0].Left
}
// sectionTop returns the Top coordinate of the section's first position, or 0
// when the section has no positions.
func sectionTop(s pdftype.Section) float64 {
	if len(s.Positions) == 0 {
		return 0
	}
	return s.Positions[0].Top
}

View File

@@ -1,434 +0,0 @@
package post
import (
"context"
"testing"
pdftype "ragflow/internal/deepdoc/parser/pdf/type"
)
// ── helpers ──────────────────────────────────────────────────────────────
// dummyBase64PNG is a small base64-encoded PNG (described as a valid 50×50
// red image) used as Section.Image so the vision tests have something to decode.
const dummyBase64PNG = "iVBORw0KGgoAAAANSUhEUgAAADIAAAAyCAIAAACRXR/mAAAAUElEQVR4nOzOsREAEAAAMefsvzILaL6iSCbI2uNH83XgTqvQKrQKrUKr0Cq0Cq1Cq9AqtAqtQqvQKrQKrUKr0Cq0Cq1Cq9AqtAqt4gQAAP//miQBZqrF+JAAAAAASUVORK5CYII="
// newTestResult builds a ParseResult that holds exactly the given sections.
func newTestResult(sections ...pdftype.Section) *pdftype.ParseResult {
	res := new(pdftype.ParseResult)
	res.Sections = sections
	return res
}
// makePosSection builds a "text" section with a single position on the given
// page, bounded horizontally by x0..x1 and vertically by top..bottom.
func makePosSection(text string, page int, x0, x1, top, bottom float64) pdftype.Section {
	pos := pdftype.Position{
		PageNumbers: []int{page},
		Left:        x0,
		Right:       x1,
		Top:         top,
		Bottom:      bottom,
	}
	return pdftype.Section{
		Text:       text,
		LayoutType: "text",
		Positions:  []pdftype.Position{pos},
	}
}
// ── normalizeLayoutType ────────────────────────────────────────────────
// TestNormalizeLayoutType checks that blank layout types default to "text"
// and that surrounding whitespace is trimmed from the others.
func TestNormalizeLayoutType(t *testing.T) {
	cases := []struct{ text, layout, want string }{
		{"a", "", "text"},
		{"b", " ", "text"},
		{"c", "table", "table"},
		{"d", " figure ", "figure"},
		{"e", "text", "text"},
	}

	res := newTestResult()
	for _, c := range cases {
		res.Sections = append(res.Sections, pdftype.Section{Text: c.text, LayoutType: c.layout})
	}

	normalizeLayoutType(res)

	for i, c := range cases {
		if got := res.Sections[i].LayoutType; got != c.want {
			t.Errorf("Sections[%d]: got %q, want %q", i, got, c.want)
		}
	}
}
// ── filterHeaderFooter ─────────────────────────────────────────────────
// TestFilterHeaderFooter checks that header, footer, number and reference
// sections are removed and that the two text sections survive in order.
func TestFilterHeaderFooter(t *testing.T) {
	res := newTestResult(
		pdftype.Section{Text: "Page 1", LayoutType: "header"},
		pdftype.Section{Text: "Chapter 1", LayoutType: "text"},
		pdftype.Section{LayoutType: "footer"},
		pdftype.Section{LayoutType: "number"},
		pdftype.Section{Text: "Body", LayoutType: "text"},
		pdftype.Section{Text: "reference item", LayoutType: "reference"},
	)

	filterHeaderFooter(res)

	if n := len(res.Sections); n != 2 {
		t.Fatalf("expected 2 sections, got %d: %+v", n, res.Sections)
	}
	first, second := res.Sections[0].Text, res.Sections[1].Text
	if !(first == "Chapter 1" && second == "Body") {
		t.Errorf("wrong sections kept: %+v", res.Sections)
	}
}
// TestFilterHeaderFooter_Empty checks that filtering a result without
// sections leaves it empty.
func TestFilterHeaderFooter_Empty(t *testing.T) {
	res := newTestResult()
	filterHeaderFooter(res)
	if remaining := len(res.Sections); remaining != 0 {
		t.Error("expected empty result")
	}
}
// ── assignDocTypeKwd ───────────────────────────────────────────────────
func TestAssignDocTypeKwd_Normal(t *testing.T) {
result := newTestResult(
pdftype.Section{Text: "a", LayoutType: "table"},
pdftype.Section{Text: "b", LayoutType: "figure"},
pdftype.Section{Text: "c", LayoutType: "equation"},
pdftype.Section{Text: "d", LayoutType: "", Image: dummyBase64PNG},
pdftype.Section{Text: "e", LayoutType: "text"},
pdftype.Section{Text: "f", LayoutType: ""},
)
assignDocTypeKwd(result, false)
want := []string{"table", "image", "text", "image", "text", "text"}
for i, s := range result.Sections {
if s.DocTypeKwd != want[i] {
t.Errorf("Sections[%d]: got %q, want %q", i, s.DocTypeKwd, want[i])
}
}
}
// TestAssignDocTypeKwd_Flatten checks that flattening marks every section as
// "text" and clears its Image.
func TestAssignDocTypeKwd_Flatten(t *testing.T) {
	res := newTestResult(
		pdftype.Section{Text: "a", LayoutType: "table", DocTypeKwd: "table", Image: dummyBase64PNG},
		pdftype.Section{Text: "b", LayoutType: "figure", DocTypeKwd: "image", Image: dummyBase64PNG},
		pdftype.Section{Text: "c", LayoutType: "text", DocTypeKwd: "text"},
	)

	assignDocTypeKwd(res, true)

	for i := range res.Sections {
		sec := res.Sections[i]
		if kwd := sec.DocTypeKwd; kwd != "text" {
			t.Errorf("expected all 'text', got %q", kwd)
		}
		if len(sec.Image) > 0 {
			t.Error("flatten should clear Image to prevent VLM enhancement")
		}
	}
}
// ── enhanceWithVision ──────────────────────────────────────────────────
// TestEnhanceWithVision_NoOp checks that a nil describer leaves the section
// text untouched and does not produce an error.
func TestEnhanceWithVision_NoOp(t *testing.T) {
	result := newTestResult(
		pdftype.Section{Text: "original", Image: dummyBase64PNG, DocTypeKwd: "table"},
	)
	// The error used to be discarded; a nil describer must be a clean no-op.
	if err := enhanceWithVision(context.Background(), result, nil); err != nil {
		t.Fatalf("nil describer should be a no-op, got error: %v", err)
	}
	if result.Sections[0].Text != "original" {
		t.Errorf("text changed when describer is nil: %q", result.Sections[0].Text)
	}
}
func TestEnhanceWithVision_Success(t *testing.T) {
want := "A table showing Q1 revenue."
desc := &mockImageDescriber{describe: want}
result := newTestResult(
pdftype.Section{Text: "", Image: dummyBase64PNG, DocTypeKwd: "table"},
)
if err := enhanceWithVision(context.Background(), result, desc); err != nil {
t.Fatal(err)
}
if result.Sections[0].Text != want {
t.Errorf("text not enhanced: got %q", result.Sections[0].Text)
}
}
func TestEnhanceWithVision_SkipText(t *testing.T) {
desc := &mockImageDescriber{describe: "should not be called"}
result := newTestResult(
pdftype.Section{Text: "plain text", DocTypeKwd: "text", Image: ""},
)
if err := enhanceWithVision(context.Background(), result, desc); err != nil {
t.Fatal(err)
}
if result.Sections[0].Text != "plain text" {
t.Errorf("text changed: %q", result.Sections[0].Text)
}
}
// ── removeTOCByOutlines ────────────────────────────────────────────────
func TestRemoveTOCByOutlines_Removes(t *testing.T) {
outlines := []pdftype.Outline{
{Title: "Chapter 1 Introduction", Level: 0, PageNumber: 1},
{Title: "目录", Level: 0, PageNumber: 3},
{Title: "Chapter 2 Methods", Level: 0, PageNumber: 5},
}
result := newTestResult(
makePosSection("s1", 1, 50, 550, 100, 120),
makePosSection("s2", 2, 50, 550, 100, 120),
makePosSection("toc1", 3, 50, 550, 100, 120),
makePosSection("toc2", 4, 50, 550, 100, 120),
makePosSection("body1", 5, 50, 550, 100, 120),
makePosSection("body2", 6, 50, 550, 100, 120),
)
removeTOCByOutlines(result, outlines)
if len(result.Sections) != 4 {
t.Fatalf("expected 4 sections, got %d", len(result.Sections))
}
if result.Sections[0].Text != "s1" || result.Sections[1].Text != "s2" {
t.Error("pre-TOC pages should be kept")
}
if result.Sections[2].Text != "body1" || result.Sections[3].Text != "body2" {
t.Error("post-TOC pages should be kept")
}
}
func TestRemoveTOCByOutlines_NoMatch(t *testing.T) {
outlines := []pdftype.Outline{
{Title: "1. Introduction", Level: 0, PageNumber: 1},
{Title: "2. Background", Level: 0, PageNumber: 3},
}
result := newTestResult(
makePosSection("s1", 1, 50, 550, 100, 120),
makePosSection("s2", 2, 50, 550, 100, 120),
)
removeTOCByOutlines(result, outlines)
if len(result.Sections) != 2 {
t.Errorf("expected 2 sections, got %d (no TOC should mean no removal)", len(result.Sections))
}
}
// TestRemoveTOCByOutlines_NilOutlines checks that nil outlines are a no-op.
func TestRemoveTOCByOutlines_NilOutlines(t *testing.T) {
	res := newTestResult(makePosSection("a", 1, 50, 550, 100, 120))

	removeTOCByOutlines(res, nil)

	if n := len(res.Sections); n != 1 {
		t.Errorf("nil outlines should be no-op: got %d sections", n)
	}
}
// TestRemoveTOCByOutlines_EmptyOutlines checks that an empty outline slice is
// a no-op.
func TestRemoveTOCByOutlines_EmptyOutlines(t *testing.T) {
	res := newTestResult(makePosSection("a", 1, 50, 550, 100, 120))

	removeTOCByOutlines(res, []pdftype.Outline{})

	if n := len(res.Sections); n != 1 {
		t.Errorf("empty outlines should be no-op: got %d sections", n)
	}
}
func TestRemoveTOCByOutlines_NoNext(t *testing.T) {
outlines := []pdftype.Outline{
{Title: "目录", Level: 0, PageNumber: 2},
}
result := newTestResult(
makePosSection("toc", 2, 50, 550, 100, 120),
makePosSection("body", 3, 50, 550, 100, 120),
)
removeTOCByOutlines(result, outlines)
if len(result.Sections) != 2 {
t.Errorf("no next outline → keep all sections: got %d", len(result.Sections))
}
}
// ── reorderMultiColumn ─────────────────────────────────────────────────
// TestReorderMultiColumn_SingleCol checks that wide sections (median width
// 500 against a half page width of 300) are treated as a single column and
// keep their input order.
func TestReorderMultiColumn_SingleCol(t *testing.T) {
	res := newTestResult(
		makePosSection("B", 0, 50, 550, 200, 220),
		makePosSection("A", 0, 50, 550, 100, 120),
	)

	reorderMultiColumn(res, 600.0, 1.0)

	if first := res.Sections[0].Text; first != "B" {
		t.Fatal("single column should preserve original order")
	}
}
// TestReorderMultiColumn_MultiCol checks that two narrow sections on the same
// page (width 200 against a half page width of 300) are reordered so the
// left-hand column comes first.
//
// The previous version only called t.Log and therefore could never fail.
func TestReorderMultiColumn_MultiCol(t *testing.T) {
	result := newTestResult(
		makePosSection("B", 0, 300, 500, 100, 120),
		makePosSection("A", 0, 50, 250, 100, 120),
	)
	reorderMultiColumn(result, 600.0, 1.0)
	if len(result.Sections) != 2 {
		t.Fatalf("expected 2 sections, got %d", len(result.Sections))
	}
	got := []string{result.Sections[0].Text, result.Sections[1].Text}
	if got[0] != "A" || got[1] != "B" {
		t.Errorf("multi-column sections not ordered left-to-right: got %v, want [A B]", got)
	}
}
// TestReorderMultiColumn_Empty checks that reordering a result without
// sections neither panics nor adds anything.
func TestReorderMultiColumn_Empty(t *testing.T) {
	res := newTestResult()

	reorderMultiColumn(res, 600.0, 1.0)

	if n := len(res.Sections); n != 0 {
		t.Error("empty sections should remain empty")
	}
}
// TestReorderMultiColumn_NoText checks that when no section has layout type
// "text" there is no width sample, so the input order is left untouched.
//
// The previous version only checked the section count, which reordering can
// never change.
func TestReorderMultiColumn_NoText(t *testing.T) {
	result := newTestResult(
		pdftype.Section{Text: "t1", LayoutType: "table", Positions: []pdftype.Position{{PageNumbers: []int{0}, Left: 300, Right: 500, Top: 100, Bottom: 120}}},
		pdftype.Section{Text: "t2", LayoutType: "table", Positions: []pdftype.Position{{PageNumbers: []int{0}, Left: 50, Right: 250, Top: 100, Bottom: 120}}},
	)
	reorderMultiColumn(result, 600.0, 1.0)
	if len(result.Sections) != 2 {
		t.Fatal("expected 2 sections")
	}
	if result.Sections[0].Text != "t1" || result.Sections[1].Text != "t2" {
		t.Errorf("order should be preserved without text sections: got [%s %s]",
			result.Sections[0].Text, result.Sections[1].Text)
	}
}
// ── PostProcess integration ────────────────────────────────────────────
func TestPostProcess_FullPipeline(t *testing.T) {
// Simulates post-processing after Parse(): all features enabled.
result := newTestResult(
// Page 1: TOC — should be removed
pdftype.Section{Text: "目录", LayoutType: "text", Positions: []pdftype.Position{{PageNumbers: []int{1}, Left: 50, Right: 550, Top: 100, Bottom: 120}}},
pdftype.Section{Text: "Chapter 1 ... 1", LayoutType: "text", Positions: []pdftype.Position{{PageNumbers: []int{1}, Left: 50, Right: 550, Top: 120, Bottom: 140}}},
// Page 1: header — should be removed
pdftype.Section{Text: "Page 1", LayoutType: "header", Positions: []pdftype.Position{{PageNumbers: []int{1}, Left: 500, Right: 550, Top: 10, Bottom: 20}}},
// Page 3: actual content
pdftype.Section{Text: "Introduction text", LayoutType: "", Positions: []pdftype.Position{{PageNumbers: []int{3}, Left: 50, Right: 550, Top: 100, Bottom: 120}}},
pdftype.Section{Text: "Row1 Col1 Row1 Col2", LayoutType: "table", Positions: []pdftype.Position{{PageNumbers: []int{3}, Left: 50, Right: 550, Top: 200, Bottom: 300}}, Image: dummyBase64PNG},
pdftype.Section{Text: "Chart description", LayoutType: "figure", Positions: []pdftype.Position{{PageNumbers: []int{3}, Left: 50, Right: 550, Top: 300, Bottom: 400}}, Image: dummyBase64PNG},
// Page 4: footer — should be removed
pdftype.Section{Text: "Confidential", LayoutType: "footer", Positions: []pdftype.Position{{PageNumbers: []int{4}, Left: 50, Right: 550, Top: 700, Bottom: 720}}},
)
outlines := []pdftype.Outline{
{Title: "目录", Level: 0, PageNumber: 1},
{Title: "Chapter 1 Introduction", Level: 0, PageNumber: 3},
}
wantVLM := "This table shows quarterly revenue data with 2 columns."
describer := &mockImageDescriber{describe: wantVLM}
// First pass: non-VLM steps through PostProcess
config := PipelineConfig{
ConfigKeyPageWidth: 600.0,
ConfigKeyZoom: 1.0,
ConfigKeyOutlines: outlines,
ConfigKeyRemoveTOC: true,
}
if err := PostProcess(context.Background(), result, config); err != nil {
t.Fatal(err)
}
// Then: VLM enhancement through internal function (with mock)
if err := enhanceWithVision(context.Background(), result, describer); err != nil {
t.Fatal(err)
}
// Then: flatten
if err := PostProcess(context.Background(), result, PipelineConfig{
ConfigKeyFlattenMediaToText: true,
}); err != nil {
t.Fatal(err)
}
// Verify
if len(result.Sections) != 3 {
t.Fatalf("expected 3 sections after filtering, got %d: %+v", len(result.Sections), result.Sections)
}
for i, s := range result.Sections {
if s.DocTypeKwd != "text" {
t.Errorf("section[%d] DocTypeKwd = %q, want 'text'", i, s.DocTypeKwd)
}
if s.LayoutType == "header" || s.LayoutType == "footer" {
t.Errorf("section[%d] LayoutType = %q, should have been filtered out", i, s.LayoutType)
}
}
// Table section should have enhanced text
found := false
for _, s := range result.Sections {
if s.LayoutType == "table" {
found = true
if s.Text != "Row1 Col1 Row1 Col2\n"+wantVLM {
t.Errorf("table text not enhanced: %q", s.Text)
}
}
}
if !found {
t.Error("table section missing from result")
}
}
// TestPostProcess_Minimal checks that a nil config still runs the always-on
// steps: blank layout types become "text" and doc types are assigned.
func TestPostProcess_Minimal(t *testing.T) {
	res := newTestResult(
		pdftype.Section{Text: "Hello", LayoutType: ""},
		pdftype.Section{Text: "World", LayoutType: " "},
	)

	err := PostProcess(context.Background(), res, nil)
	if err != nil {
		t.Fatal(err)
	}
	if n := len(res.Sections); n != 2 {
		t.Fatalf("expected 2 sections, got %d", n)
	}

	first, second := res.Sections[0], res.Sections[1]
	if !(first.LayoutType == "text" && second.LayoutType == "text") {
		t.Error("layout not normalized")
	}
	if !(first.DocTypeKwd == "text" && second.DocTypeKwd == "text") {
		t.Error("doc_type_kwd not assigned")
	}
}
// TestPostProcess_NilResult checks that a nil result is rejected with an
// error instead of being dereferenced.
func TestPostProcess_NilResult(t *testing.T) {
	err := PostProcess(context.Background(), nil, nil)
	if err == nil {
		t.Error("expected error for nil result")
	}
}
// TestPostProcess_EmptySections checks that a result without sections passes
// through the pipeline unchanged.
func TestPostProcess_EmptySections(t *testing.T) {
	res := newTestResult()

	err := PostProcess(context.Background(), res, nil)
	if err != nil {
		t.Fatal(err)
	}
	if n := len(res.Sections); n != 0 {
		t.Error("empty should remain empty")
	}
}
// TestPostProcess_FiguresLazy checks that Figures() on a post-processed
// result returns the two figure sections in their original order.
func TestPostProcess_FiguresLazy(t *testing.T) {
	res := newTestResult(
		pdftype.Section{Text: "Fig1", LayoutType: "figure"},
		pdftype.Section{Text: "Body", LayoutType: "text"},
		pdftype.Section{Text: "Fig2", LayoutType: "figure"},
	)

	err := PostProcess(context.Background(), res, nil)
	if err != nil {
		t.Fatal(err)
	}

	figures := res.Figures()
	if n := len(figures); n != 2 {
		t.Fatalf("expected 2 figures, got %d", n)
	}
	if !(figures[0].Text == "Fig1" && figures[1].Text == "Fig2") {
		t.Errorf("wrong figures: %+v", figures)
	}
}
// TestPostProcess_FilterOnly checks that, with a nil config, the header
// section is filtered out and no figures are reported.
func TestPostProcess_FilterOnly(t *testing.T) {
	res := newTestResult(
		pdftype.Section{Text: "Header", LayoutType: "header"},
		pdftype.Section{Text: "Second", LayoutType: "text"},
		pdftype.Section{Text: "First", LayoutType: "text"},
	)

	err := PostProcess(context.Background(), res, nil)
	if err != nil {
		t.Fatal(err)
	}
	if n := len(res.Sections); n != 2 {
		t.Fatalf("expected 2 sections after filtering, got %d", n)
	}
	if n := len(res.Figures()); n != 0 {
		t.Errorf("expected 0 figures, got %d", n)
	}
}
// TestPostProcess_ReorderOnly checks that supplying only page_width and zoom
// triggers multi-column reordering: two narrow sections on the same page end
// up left-to-right.
//
// The previous version only called t.Log and therefore could never fail.
func TestPostProcess_ReorderOnly(t *testing.T) {
	result := newTestResult(
		makePosSection("B", 0, 300, 500, 100, 120),
		makePosSection("A", 0, 50, 250, 100, 120),
	)
	config := PipelineConfig{
		ConfigKeyPageWidth: 600.0,
		ConfigKeyZoom:      1.0,
	}
	if err := PostProcess(context.Background(), result, config); err != nil {
		t.Fatal(err)
	}
	if len(result.Sections) != 2 {
		t.Fatal("expected 2 sections")
	}
	// Left column first: A (x0=50) must precede B (x0=300).
	got := []string{result.Sections[0].Text, result.Sections[1].Text}
	if got[0] != "A" || got[1] != "B" {
		t.Errorf("multi-column sections not ordered left-to-right: got %v, want [A B]", got)
	}
}

View File

@@ -1,98 +0,0 @@
package post
import (
"context"
"errors"
"image"
)
// ImageDescriber describes an image using a vision language model.
//
// DescribeImage receives the decoded image together with the prompt text and
// returns the generated description, or an error.
type ImageDescriber interface {
DescribeImage(ctx context.Context, img image.Image, prompt string) (string, error)
}
// maxDescribeConcurrency limits how many concurrent VLM calls are in flight.
// enhanceWithVision uses it as the capacity of its semaphore channel.
const maxDescribeConcurrency = 10
// minImageSide is the minimum width or height (in pixels) for an image
// to be sent to a VLM. Tiny crops fail provider image-size limits.
// DescribeImage returns an empty description and no error for an image with
// either side below this value.
const minImageSide = 11
// describePrompt is the default prompt sent to the VLM for every image/table
// section that enhanceWithVision describes.
// Python: vision_llm_figure_describe_prompt.md
const describePrompt = `## ROLE
You are an expert visual data analyst.
## GOAL
Analyze the image and produce a textual representation strictly based on what is visible in the image.
## DECISION RULE (CRITICAL)
First, determine whether the image contains an explicit visual data representation with enumerable data units forming a coherent dataset.
## OUTPUT RULES (STRICT)
- Produce output in exactly one of the two modes defined below.
- Do NOT mention, label, or reference the modes in the output.
- Do NOT combine content from both modes.
- Do NOT explain or justify the choice of mode.
- Do NOT add any headings, titles, or commentary beyond what the mode requires.
---
## MODE 1: STRUCTURED VISUAL DATA OUTPUT
(Use only if the image contains enumerable data units forming a coherent dataset.)
Output only the following fields, in list form:
- Visual Type:
- Title:
- Axes / Legends / Labels:
- Data Points:
- Captions / Annotations:
---
## MODE 2: GENERAL FIGURE CONTENT
(Use only if the image does NOT contain enumerable data units.)
Write the content directly, starting from the first sentence.
Do NOT add any introductory labels, titles, headings, or prefixes.
Requirements:
- Describe visible regions and components in a stable order (e.g., top-to-bottom, left-to-right).
- Explicitly name interface elements or visual objects exactly as they appear.
- Transcribe all visible text verbatim; do not paraphrase, summarize, or reinterpret labels.
- Describe spatial grouping, containment, and alignment of elements.
- Do NOT interpret intent, behavior, workflows, gameplay rules, or processes.
- Avoid narrative or stylistic language unless it is a dominant and functional visual element.
Use concise, information-dense sentences.
Do not use bullet lists or structured fields in this mode.`
// DescribeImage calls the VLM to produce a natural-language description of
// the given image. Returns the description text or an error.
//
// Checks run in this order:
//   - a nil image, or one with a zero width or height, is an error;
//   - an image smaller than minImageSide in either dimension is silently
//     skipped (empty string, no error), matching Python's behavior;
//   - a context that is already done yields its error;
//   - a nil client is an error rather than a nil-interface panic.
//
// Otherwise the result of client.DescribeImage is returned unchanged.
func DescribeImage(ctx context.Context, img image.Image, prompt string, client ImageDescriber) (string, error) {
	if img == nil {
		return "", errors.New("DescribeImage: nil image")
	}
	bounds := img.Bounds()
	width, height := bounds.Dx(), bounds.Dy()
	if width == 0 || height == 0 {
		return "", errors.New("DescribeImage: empty image (0x0)")
	}
	if width < minImageSide || height < minImageSide {
		return "", nil // skip tiny crops, Python compatible
	}
	if err := ctx.Err(); err != nil {
		return "", err
	}
	// Checked last so the skip and cancellation results above stay exactly as
	// they were for callers that pass no client.
	if client == nil {
		return "", errors.New("DescribeImage: nil describer")
	}
	return client.DescribeImage(ctx, img, prompt)
}

View File

@@ -1,112 +0,0 @@
package post
import (
"context"
"errors"
"image"
"image/color"
"testing"
)
// ── mock image describer ───────────────────────────────────────────────
// mockImageDescriber is a canned ImageDescriber for tests: every call returns
// the configured description and error, ignoring its arguments.
type mockImageDescriber struct {
	describe string // description handed back to the caller
	err      error  // error handed back to the caller
}

// DescribeImage returns the preset description and error.
func (d *mockImageDescriber) DescribeImage(context.Context, image.Image, string) (string, error) {
	return d.describe, d.err
}
// ── DescribeImage tests ────────────────────────────────────────────────
func TestDescribeImage_Success(t *testing.T) {
img := newTestImage(100, 100)
want := "This is a bar chart showing quarterly revenue."
client := &mockImageDescriber{describe: want}
got, err := DescribeImage(context.Background(), img, "Describe this image", client)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got != want {
t.Errorf("DescribeImage() = %q, want %q", got, want)
}
}
func TestDescribeImage_VLMError(t *testing.T) {
img := newTestImage(100, 100)
client := &mockImageDescriber{err: errors.New("VLM timeout")}
got, err := DescribeImage(context.Background(), img, "Describe this image", client)
if err == nil {
t.Fatal("expected error, got nil")
}
if got != "" {
t.Errorf("expected empty string on error, got %q", got)
}
}
func TestDescribeImage_CanceledContext(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
cancel() // cancel immediately
img := newTestImage(100, 100)
client := &mockImageDescriber{describe: "should not be reached"}
got, err := DescribeImage(ctx, img, "prompt", client)
if err == nil {
t.Fatal("expected context error, got nil")
}
if got != "" {
t.Errorf("expected empty string, got %q", got)
}
}
// TestDescribeImage_NilImage checks that a nil image is rejected with an
// error and an empty description.
func TestDescribeImage_NilImage(t *testing.T) {
	describer := &mockImageDescriber{describe: "should not be reached"}

	got, err := DescribeImage(context.Background(), nil, "prompt", describer)

	if err == nil {
		t.Fatal("expected error for nil image, got nil")
	}
	if len(got) != 0 {
		t.Errorf("expected empty string, got %q", got)
	}
}
func TestDescribeImage_EmptyImage(t *testing.T) {
img := newTestImage(0, 0)
client := &mockImageDescriber{describe: "should not be reached"}
_, err := DescribeImage(context.Background(), img, "prompt", client)
if err == nil {
t.Fatal("expected error for empty image, got nil")
}
}
func TestDescribeImage_TinyImage(t *testing.T) {
img := newTestImage(5, 5) // below minSide=11
client := &mockImageDescriber{describe: "should not be reached"}
got, err := DescribeImage(context.Background(), img, "prompt", client)
if err != nil {
t.Fatal("tiny images should be silently skipped, not error")
}
if got != "" {
t.Errorf("expected empty string for tiny image, got %q", got)
}
}
// ── helpers ────────────────────────────────────────────────────────────
// newTestImage builds a w×h RGBA image anchored at the origin. The pixel at
// (x, y) is fully opaque with R = x mod 256, G = y mod 256 and B = 128, so a
// pixel's color identifies where it came from.
func newTestImage(w, h int) image.Image {
	canvas := image.NewRGBA(image.Rect(0, 0, w, h))
	for row := 0; row < h; row++ {
		green := uint8(row % 256)
		for col := 0; col < w; col++ {
			pixel := color.RGBA{R: uint8(col % 256), G: green, B: 128, A: 255}
			canvas.Set(col, row, pixel)
		}
	}
	return canvas
}

View File

@@ -1,6 +1,6 @@
//go:build cgo && manual
package parser
package pdf
import (
"image"
@@ -53,7 +53,7 @@ func TestRenderCompare(t *testing.T) {
}
// Render page 0 with pdfium (Go).
goImg, err := renderPageToImage(eng, 0)
goImg, err := RenderPageToImage(eng, 0)
eng.Close()
if err != nil {
t.Logf("%s: render error: %v", name, err)

View File

@@ -1,4 +1,4 @@
package parser
package pdf
import (
"image"
@@ -13,7 +13,7 @@ import (
var renderFn = fallbackRender
// RenderPageToImage renders a page at 216 DPI for downstream DLA/TSR/OCR.
func renderPageToImage(engine pdf.PDFEngine, pageNum int) (image.Image, error) {
func RenderPageToImage(engine pdf.PDFEngine, pageNum int) (image.Image, error) {
return renderFn(engine, pageNum)
}
@@ -25,7 +25,10 @@ func fallbackRender(engine pdf.PDFEngine, pageNum int) (image.Image, error) {
}
// Guard against typed-nil (e.g. (*image.RGBA)(nil) returned as non-nil
// interface). The plain img==nil check misses that case.
if img == nil || reflect.ValueOf(img).IsNil() {
if img == nil {
return nil, ErrNoPDFData
}
if rv := reflect.ValueOf(img); rv.Kind() == reflect.Ptr && rv.IsNil() {
return nil, ErrNoPDFData
}
return img, nil

View File

@@ -1,6 +1,6 @@
//go:build cgo
package parser
package pdf
import (
"image"

View File

@@ -1,6 +1,6 @@
//go:build cgo && manual
package parser
package pdf
import (
"image"
@@ -24,8 +24,8 @@ func pdfiumPtSize(eng pdf.PDFEngine, file string, t *testing.T) (w, h float64) {
raw := eng.RawData()
if raw == nil {
// Fallback: use pdf_oxide pre-rotation size.
if pe, ok := eng.(*pdfoxideEngine); ok {
w, h, _ = pe.inner.PageSize(0)
if pe, ok := eng.(*PDFOxideEngine); ok {
w, h, _ = pe.Inner.PageSize(0)
}
return
}
@@ -302,7 +302,7 @@ func TestRotation_CropBoxWithRotate(t *testing.T) {
// CropBox excludes content from the page edges; chars near the
// CropBox boundary may end up outside the effective page after rotation.
if oobRate > 40 {
t.Errorf("too many OOB chars: %.1f%%", oobRate)
t.Errorf("too many OOB Chars: %.1f%%", oobRate)
}
// Verify render alignment.

View File

@@ -1,6 +1,6 @@
//go:build cgo && manual
package parser
package pdf
import (
"context"
@@ -43,9 +43,8 @@ func TestScanAllPDFs(t *testing.T) {
eng := mustOpenEngine(t, name)
cfg := pdf.DefaultParserConfig()
cfg.TableBuilder = NewDeepDocTableBuildService(client)
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.ParseRaw(context.Background(), eng, client)
eng.Close()
if err != nil {
fmt.Printf(" ❌ ERROR: %v\n", err)

View File

@@ -1,6 +1,6 @@
//go:build manual
package parser
package pdf
import (
"encoding/json"
@@ -16,7 +16,7 @@ import (
"testing"
)
// TestSnapshotStageComparison verifies Go's TextMerge output
// TestSnapshotStageComparison verifies Go's lyt.TextMerge output
// matches Python's _text_merge sample boxes using synthetic input.
func TestSnapshotStageComparison(t *testing.T) {
snapDir := filepath.Join("testdata", "snapshots")
@@ -47,19 +47,19 @@ func TestSnapshotStageComparison(t *testing.T) {
// Convert sample boxes to Go pdf.TextBox format
goBoxes := snapshotBoxesToGo(s1.SampleBoxesPage0)
// Run Go TextMerge with default params
// Run Go lyt.TextMerge with default params
meanH := map[int]float64{0: avg(s1.MeanHeight)}
merged := lyt.TextMerge(goBoxes, meanH, 3)
// Compare counts
if len(merged) > 0 {
t.Logf(" Go TextMerge: %d -> %d boxes", len(goBoxes), len(merged))
t.Logf(" Go lyt.TextMerge: %d -> %d boxes", len(goBoxes), len(merged))
mergeRatio := float64(len(merged)) / float64(len(goBoxes))
pyRatio := float64(s4.BoxesAfter) / float64(s4.BoxesBefore)
t.Logf(" Merge ratios: Go=%.0f%% Python=%.0f%%", mergeRatio*100, pyRatio*100)
}
// Run Go NaiveVerticalMerge
// Run Go lyt.NaiveVerticalMerge
meanW := map[int]float64{0: avg(s1.MeanWidth)}
vm := lyt.NaiveVerticalMerge(merged, meanH, meanW, s1.IsEnglish)
if s6, ok := snap.Stages["_naive_vertical_merge"]; ok {

View File

@@ -2,6 +2,7 @@ package table
import (
"fmt"
"html"
"math"
"regexp"
"sort"
@@ -698,7 +699,47 @@ func RowsToHTML(rows [][]pdf.TSRCell, caption string, headerRows map[int]bool, s
return b.String()
}
// ── Span computation (Python: __cal_spans) ──
// SimpleRowsToHTML renders plain string rows as an HTML table.
//
// Row 0 is emitted with <th > cells and every later row with <td > cells.
// Rows shorter than the widest row are padded with empty cells so that all
// rows have the same number of columns. Cell text is HTML-escaped. An empty
// or nil input yields "<table></table>".
func SimpleRowsToHTML(rows [][]string) string {
	if len(rows) == 0 {
		return "<table></table>"
	}

	// The widest row determines the column count for the whole table.
	width := 0
	for i := range rows {
		if n := len(rows[i]); n > width {
			width = n
		}
	}

	var sb strings.Builder
	sb.WriteString("<table>")
	for rowIdx, cells := range rows {
		openTag, closeTag := "<td >", "</td>"
		if rowIdx == 0 {
			openTag, closeTag = "<th >", "</th>"
		}
		sb.WriteString("<tr>")
		for col := 0; col < width; col++ {
			sb.WriteString(openTag)
			if col < len(cells) {
				sb.WriteString(html.EscapeString(cells[col]))
			}
			sb.WriteString(closeTag)
		}
		sb.WriteString("</tr>")
	}
	sb.WriteString("</table>")
	return sb.String()
}
// ── Span computation (Python: __cal_spans) ──
// calSpans computes colspan and rowspan for spanning cells in the grid.
// Returns spanInfo (row,col → colspan,rowspan) and covered (cells hidden by spans).

View File

@@ -1,4 +1,4 @@
package parser
package pdf
import (
"context"
@@ -12,10 +12,10 @@ import (
util "ragflow/internal/deepdoc/parser/pdf/util"
)
// enrichWithDeepDoc runs DLA+TSR via p.DeepDoc and returns detected tables.
// enrichWithDeepDoc runs DLA+TSR via docAnalyzer and returns detected tables.
// pageImages optionally provides pre-rendered page images to avoid re-rendering.
func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult, engine pdf.PDFEngine, boxes []pdf.TextBox, pageImages map[int]image.Image) []pdf.TableItem {
if !p.DeepDoc.Health() {
func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult, engine pdf.PDFEngine, boxes []pdf.TextBox, pageImages map[int]image.Image, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder) []pdf.TableItem {
if !docAnalyzer.Health() {
return nil
}
// Group boxes by page for annotation write-back.
@@ -50,7 +50,7 @@ func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult,
for i, idx := range indices {
pageBoxes[i] = boxes[idx]
}
tables := p.extractTableBoxes(ctx, result, pageBoxes, engine, pg, pageImages, len(tableItems))
tables := p.extractTableBoxes(ctx, result, pageBoxes, engine, pg, pageImages, len(tableItems), docAnalyzer, tb)
tableItems = append(tableItems, tables...)
// Write back DLA and TSR annotations (R/C/H/SP) to the original boxes.
for i, idx := range indices {
@@ -65,21 +65,21 @@ func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult,
return tableItems
}
func (p *Parser) extractTableBoxes(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, engine pdf.PDFEngine, pageNum int, pageImages map[int]image.Image, tableBaseIdx int) []pdf.TableItem {
func (p *Parser) extractTableBoxes(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, engine pdf.PDFEngine, pageNum int, pageImages map[int]image.Image, tableBaseIdx int, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder) []pdf.TableItem {
pageImg, ok := pageImages[pageNum]
if !ok {
var err error
pageImg, err = renderPageToImage(engine, pageNum)
pageImg, err = RenderPageToImage(engine, pageNum)
if err != nil {
slog.Warn("render page for DeepDoc failed", "page", pageNum, "err", err)
return nil
}
}
return p.extractTableBoxesFromImage(ctx, result, boxes, pageImg, pageNum, tableBaseIdx)
return p.extractTableBoxesFromImage(ctx, result, boxes, pageImg, pageNum, tableBaseIdx, docAnalyzer, tb)
}
func (p *Parser) extractTableBoxesFromImage(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, pageImg image.Image, pageNum int, tableBaseIdx int) []pdf.TableItem {
regions, err := p.DeepDoc.DLA(ctx, pageImg)
func (p *Parser) extractTableBoxesFromImage(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, pageImg image.Image, pageNum int, tableBaseIdx int, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder) []pdf.TableItem {
regions, err := docAnalyzer.DLA(ctx, pageImg)
if err != nil {
slog.Warn("DLA failed", "page", pageNum, "err", err)
return nil
@@ -95,148 +95,117 @@ func (p *Parser) extractTableBoxesFromImage(ctx context.Context, result *pdf.Par
tableMatches := tbl.MatchTableRegions(boxes, regions, scale)
var items []pdf.TableItem
for _, tm := range tableMatches {
cropped, cropErr := util.CropImageRegion(pageImg, tm.Region)
if cropErr != nil {
// DLA returned an invalid region (e.g. x1 < x0). Python
// PIL.Image.crop() raises ValueError here; we skip this
// table instead of passing a full-page image to TSR.
continue
item := p.processOneTable(ctx, result, boxes, pageImg, pageNum, docAnalyzer, tb, tm, scale, tableBaseIdx+len(items))
if item.ImageB64 != "" || len(item.Cells) > 0 || len(item.Positions) > 0 {
items = append(items, item)
}
}
return items
}
// Rotation detection (Python: _evaluate_table_orientation).
// If rotated, TSR and OCR use the rotated image; cell coords
// are mapped back to original crop space for box matching.
autoRotate := p.Config.AutoRotateTables != nil && *p.Config.AutoRotateTables
bestAngle := 0
origW, origH := cropped.Bounds().Dx(), cropped.Bounds().Dy()
tsrImg := cropped
if autoRotate {
angle, rotated, _ := tbl.EvaluateTableOrientation(ctx, cropped, p.DeepDoc)
bestAngle = angle
tsrImg = rotated
}
imgB64, encErr := util.EncodeImageToBase64PNG(cropped)
if encErr != nil {
slog.Warn("table PNG encode failed", "page", pageNum, "err", encErr)
}
var cells []pdf.TSRCell
var tsrErr error
cells, tsrErr = p.tableBuilder.DetectCells(ctx, tsrImg)
if tsrErr != nil {
slog.Warn("TSR failed", "page", pageNum, "err", tsrErr)
}
// Collect TSR raw cells for debug comparison.
if tsrErr == nil {
for _, c := range cells {
if result != nil {
result.TSRDebug = append(result.TSRDebug, pdf.TSRRawCell{
TableIndex: tableBaseIdx + len(items), Page: pageNum,
Label: c.Label, X0: c.X0, Y0: c.Y0, X1: c.X1, Y1: c.Y1,
Text: c.Text,
})
}
}
}
// Python margin: w*0.03, h*0.03 (_table_transformer_job:374-376).
w := tm.Region.X1 - tm.Region.X0
h := tm.Region.Y1 - tm.Region.Y0
marginX := w * 0.03
marginY := h * 0.03
cropOffX := math.Max(0, tm.Region.X0-marginX)
cropOffY := math.Max(0, tm.Region.Y0-marginY)
var boxInCrop []pdf.TextBox
if tsrErr == nil && len(cells) > 0 {
if bestAngle != 0 {
// OCR on rotated image before mapping cells back.
// Cells are in rotated-pixel space; OCR works best
// on upright text. After mapping, cells move to
// original crop space where boxInCrop lives.
if !p.Config.SkipOCR {
ocrTableCells(ctx, cells, tsrImg, p.DeepDoc)
}
for i := range cells {
cells[i].X0, cells[i].Y0 = util.MapRotatedPointToOriginal(cells[i].X0, cells[i].Y0, bestAngle, origW, origH)
cells[i].X1, cells[i].Y1 = util.MapRotatedPointToOriginal(cells[i].X1, cells[i].Y1, bestAngle, origW, origH)
}
}
// Fill cell text from pre-merge boxes, skipping caption boxes
// (text entirely above the first TSR cell row).
firstCellTop := 1e9
for _, c := range cells {
if c.Y0 >= 0 && c.Y0 < firstCellTop {
firstCellTop = c.Y0
}
}
if firstCellTop == 1e9 {
firstCellTop = cells[0].Y0 // fallback if all cells have Y0 < 0
}
boxInCrop = make([]pdf.TextBox, 0, len(tm.BoxIdx))
for _, idx := range tm.BoxIdx {
b := boxes[idx]
if b.Bottom*scale-cropOffY < firstCellTop {
continue // caption box above first TSR cell
}
boxInCrop = append(boxInCrop, tbl.BoxToCropSpace(b, scale, cropOffX, cropOffY))
}
}
var positions []pdf.Position
for _, idx := range tm.BoxIdx {
b := boxes[idx]
positions = append(positions, pdf.Position{
PageNumbers: []int{pageNum},
Left: b.X0, Right: b.X1,
Top: b.Top, Bottom: b.Bottom,
// processOneTable handles DLA+TSR+OCR for a single table region match.
func (p *Parser) processOneTable(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, pageImg image.Image, pageNum int, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder, tm tbl.TableMatch, scale float64, tableIdx int) pdf.TableItem {
cropped, cropErr := util.CropImageRegion(pageImg, tm.Region)
if cropErr != nil {
return pdf.TableItem{}
}
autoRotate := p.Config.AutoRotateTables != nil && *p.Config.AutoRotateTables
bestAngle := 0
origW, origH := cropped.Bounds().Dx(), cropped.Bounds().Dy()
tsrImg := cropped
if autoRotate {
angle, rotated, _ := tbl.EvaluateTableOrientation(ctx, cropped, docAnalyzer)
bestAngle = angle
tsrImg = rotated
}
imgB64, encErr := util.EncodeImageToBase64PNG(cropped)
if encErr != nil {
slog.Warn("table PNG encode failed", "page", pageNum, "err", encErr)
}
cells, tsrErr := tb.DetectCells(ctx, tsrImg)
if tsrErr != nil {
slog.Warn("TSR failed", "page", pageNum, "err", tsrErr)
}
if tsrErr == nil && result != nil {
for _, c := range cells {
result.TSRDebug = append(result.TSRDebug, pdf.TSRRawCell{
TableIndex: tableIdx, Page: pageNum,
Label: c.Label, X0: c.X0, Y0: c.Y0, X1: c.X1, Y1: c.Y1, Text: c.Text,
})
}
// Pre-compute grid from raw TSR cells (without crop offset).
// Stored in pdf.TableItem for constructTable; annotateTableBoxes
// recomputes with offset cells for spatial matching precision.
var grid [][]pdf.TSRCell
if len(cells) > 0 {
grid = p.tableBuilder.GroupCells(cells)
// Fill cell text from boxes in crop space. Works for both
// Label-aware grouping (cells rearranged) vs. cross-product (creates new cells).
if len(grid) > 0 {
flat := tbl.FlattenGrid(grid)
tbl.FillCellTextFromBoxes(flat, boxInCrop)
idx := 0
}
w := tm.Region.X1 - tm.Region.X0
h := tm.Region.Y1 - tm.Region.Y0
cropOffX := math.Max(0, tm.Region.X0-w*0.03)
cropOffY := math.Max(0, tm.Region.Y0-h*0.03)
var boxInCrop []pdf.TextBox
if tsrErr == nil && len(cells) > 0 {
if bestAngle != 0 {
if !p.Config.SkipOCR {
ocrTableCells(ctx, cells, tsrImg, docAnalyzer)
}
for i := range cells {
cells[i].X0, cells[i].Y0 = util.MapRotatedPointToOriginal(cells[i].X0, cells[i].Y0, bestAngle, origW, origH)
cells[i].X1, cells[i].Y1 = util.MapRotatedPointToOriginal(cells[i].X1, cells[i].Y1, bestAngle, origW, origH)
}
}
firstCellTop := 1e9
for _, c := range cells {
if c.Y0 >= 0 && c.Y0 < firstCellTop {
firstCellTop = c.Y0
}
}
if firstCellTop == 1e9 {
firstCellTop = cells[0].Y0
}
boxInCrop = make([]pdf.TextBox, 0, len(tm.BoxIdx))
for _, idx := range tm.BoxIdx {
b := boxes[idx]
if b.Bottom*scale-cropOffY < firstCellTop {
continue
}
boxInCrop = append(boxInCrop, tbl.BoxToCropSpace(b, scale, cropOffX, cropOffY))
}
}
var positions []pdf.Position
for _, idx := range tm.BoxIdx {
b := boxes[idx]
positions = append(positions, pdf.Position{
PageNumbers: []int{pageNum},
Left: b.X0, Right: b.X1, Top: b.Top, Bottom: b.Bottom,
})
}
var grid [][]pdf.TSRCell
if len(cells) > 0 {
grid = tb.GroupCells(cells)
if len(grid) > 0 {
flat := tbl.FlattenGrid(grid)
tbl.FillCellTextFromBoxes(flat, boxInCrop)
idx := 0
for ri := range grid {
for ci := range grid[ri] {
grid[ri][ci].Text = flat[idx].Text
idx++
}
}
if bestAngle == 0 && !p.Config.SkipOCR {
ocrTableCells(ctx, flat, tsrImg, docAnalyzer)
idx = 0
for ri := range grid {
for ci := range grid[ri] {
grid[ri][ci].Text = flat[idx].Text
idx++
}
}
if bestAngle == 0 && !p.Config.SkipOCR {
ocrTableCells(ctx, flat, tsrImg, p.DeepDoc)
idx = 0
for ri := range grid {
for ci := range grid[ri] {
grid[ri][ci].Text = flat[idx].Text
idx++
}
}
}
}
}
items = append(items, pdf.TableItem{
ImageB64: imgB64,
Cells: cells,
Grid: grid,
Positions: positions,
Scale: scale,
CropOffX: cropOffX,
CropOffY: cropOffY,
// DLA region in PDF point space (Python's cropout uses layout region boundaries).
RegionLeft: tm.Region.X0 / scale,
RegionRight: tm.Region.X1 / scale,
RegionTop: tm.Region.Y0 / scale,
RegionBottom: tm.Region.Y1 / scale,
})
tbl.WriteTableAnnotations(boxes, tm.BoxIdx, cells, scale, cropOffX, cropOffY, p.tableBuilder)
}
return items
item := pdf.TableItem{
ImageB64: imgB64, Cells: cells, Grid: grid, Positions: positions,
Scale: scale, CropOffX: cropOffX, CropOffY: cropOffY,
RegionLeft: tm.Region.X0 / scale, RegionRight: tm.Region.X1 / scale,
RegionTop: tm.Region.Y0 / scale, RegionBottom: tm.Region.Y1 / scale,
}
tbl.WriteTableAnnotations(boxes, tm.BoxIdx, cells, scale, cropOffX, cropOffY, tb)
return item
}

View File

@@ -1,6 +1,6 @@
//go:build cgo && manual
package parser
package pdf
import (
"context"
@@ -9,6 +9,7 @@ import (
inf "ragflow/internal/deepdoc/parser/pdf/inference"
tbl "ragflow/internal/deepdoc/parser/pdf/table"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
util "ragflow/internal/deepdoc/parser/pdf/util"
"testing"
)
@@ -32,7 +33,7 @@ func TestTableRotation_Integration(t *testing.T) {
if baseURL == "" {
baseURL = "http://localhost:9390"
}
dd, err := inf.NewInferenceClient(baseURL)
dd, err := inf.NewClient(baseURL)
if err != nil {
t.Fatal(err)
}
@@ -59,10 +60,10 @@ func TestTableRotation_Integration(t *testing.T) {
cfg.ToPage = pageCount - 1
autoRotate := true
cfg.AutoRotateTables = &autoRotate
_ = NewParser(cfg, dd) // verify construction does not panic
_ = NewParser(cfg) // verify construction does not panic
for pg := 0; pg < pageCount; pg++ {
pageImg, err := renderPageToImage(eng, pg)
pageImg, err := RenderPageToImage(eng, pg)
if err != nil {
t.Fatalf("render page %d: %v", pg, err)
}
@@ -80,7 +81,7 @@ func TestTableRotation_Integration(t *testing.T) {
tableCount++
// Crop table region
cropped, err := cropImageRegion(pageImg, r)
cropped, err := util.CropImageRegion(pageImg, r)
if err != nil {
t.Errorf(" crop table %d: %v", tableCount, err)
continue
@@ -130,7 +131,7 @@ func TestTableRotation_Stability(t *testing.T) {
if baseURL == "" {
baseURL = "http://localhost:9390"
}
dd, err := inf.NewInferenceClient(baseURL)
dd, err := inf.NewClient(baseURL)
if err != nil {
t.Fatal(err)
}
@@ -163,7 +164,7 @@ func TestTableRotation_Stability(t *testing.T) {
continue
}
pageImg, err := renderPageToImage(eng, 0)
pageImg, err := RenderPageToImage(eng, 0)
eng.Close()
if err != nil {
continue
@@ -177,7 +178,11 @@ func TestTableRotation_Stability(t *testing.T) {
continue
}
tables++
cropped, _ := cropImageRegion(pageImg, r)
cropped, err := util.CropImageRegion(pageImg, r)
if err != nil {
t.Errorf(" %s crop table: %v", e.Name(), err)
continue
}
if cropped == nil {
continue
}

View File

@@ -1,4 +1,4 @@
package parser
package pdf
import (
"context"
@@ -16,11 +16,11 @@ import (
// entries. Go backfills pdf.Section.Text from pdf.TableItem.Rows after
// linkTableSections.
func TestTableSection_TextFromTSR(t *testing.T) {
eng := &mockEngine{
pageCount: 1,
renderW: 900, // 300pt at 3x = 900px (216 DPI)
renderH: 600,
chars: map[int][]pdf.TextChar{0: {
eng := &MockEngine{
NumPages: 1,
RenderW: 900, // 300pt at 3x = 900px (216 DPI)
RenderH: 600,
Chars: map[int][]pdf.TextChar{0: {
// PDF space (72 DPI): well inside DLA region
{X0: 50, X1: 70, Top: 40, Bottom: 55, Text: "姓"},
{X0: 80, X1: 100, Top: 40, Bottom: 55, Text: "名"},
@@ -42,9 +42,9 @@ func TestTableSection_TextFromTSR(t *testing.T) {
{X0: 200, Y0: 100, X1: 460, Y1: 220, Text: "25", Label: "table row"},
},
}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
result, err := p.Parse(context.Background(), eng)
result, err := p.ParseRaw(context.Background(), eng, mock)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -93,14 +93,14 @@ func TestEnrichWithDeepDoc_ImageOnlyPage(t *testing.T) {
{X0: 0, Y0: 0, X1: 200, Y1: 100, Text: "A", Label: "table row"},
},
}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
// 0 text boxes, but page 0 has a rendered image.
boxes := []pdf.TextBox{}
dummyImg := image.NewRGBA(image.Rect(0, 0, 900, 600))
pageImages := map[int]image.Image{0: dummyImg}
tables := p.enrichWithDeepDoc(context.Background(), nil, nil, boxes, pageImages)
tables := p.enrichWithDeepDoc(context.Background(), nil, nil, boxes, pageImages, mock, NewTableBuilderFor(mock))
if len(tables) == 0 {
t.Fatal("enrichWithDeepDoc: expected at least 1 table from DLA on page with image but no boxes, got 0")
}
@@ -113,10 +113,10 @@ func TestEnrichWithDeepDoc_ImageOnlyPage(t *testing.T) {
// is merged into the nearest "figure" pdf.Section and the caption pdf.Section is
// removed. Matches Python _extract_table_figure caption matching.
func TestFigureCaption_MergedIntoFigure(t *testing.T) {
eng := &mockEngine{
pageCount: 1,
renderW: 1800, renderH: 2400,
chars: map[int][]pdf.TextChar{0: {
eng := &MockEngine{
NumPages: 1,
RenderW: 1800, RenderH: 2400,
Chars: map[int][]pdf.TextChar{0: {
// Figure text — overlaps DLA figure region (pixel Y=80-300 → PDF 27-100).
{X0: 40, X1: 60, Top: 30, Bottom: 45, Text: "F"},
// Caption text — overlaps DLA figure caption region (pixel Y=310-340 → PDF 103-113).
@@ -131,9 +131,9 @@ func TestFigureCaption_MergedIntoFigure(t *testing.T) {
{X0: 100, Y0: 310, X1: 500, Y1: 340, Label: "figure caption", Confidence: 0.9},
},
}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
result, err := p.Parse(context.Background(), eng)
result, err := p.ParseRaw(context.Background(), eng, mock)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -169,10 +169,10 @@ func TestFigureCaption_MergedIntoFigure(t *testing.T) {
// TestTableCaption_MergedIntoTable verifies that "table caption" text
// is merged into the nearest table pdf.Section and the caption is removed.
func TestTableCaption_MergedIntoTable(t *testing.T) {
eng := &mockEngine{
pageCount: 1,
renderW: 1800, renderH: 2400,
chars: map[int][]pdf.TextChar{0: {
eng := &MockEngine{
NumPages: 1,
RenderW: 1800, RenderH: 2400,
Chars: map[int][]pdf.TextChar{0: {
// Table text — overlaps DLA table region (pixel Y=80-300 → PDF 27-100).
{X0: 40, X1: 60, Top: 30, Bottom: 45, Text: "T"},
// Caption text — overlaps DLA table caption region (pixel Y=310-340 → PDF 103-113).
@@ -190,9 +190,9 @@ func TestTableCaption_MergedIntoTable(t *testing.T) {
{X0: 200, Y0: 0, X1: 460, Y1: 100, Text: "B", Label: "table row"},
},
}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
result, err := p.Parse(context.Background(), eng)
result, err := p.ParseRaw(context.Background(), eng, mock)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -224,10 +224,10 @@ func TestTableCaption_MergedIntoTable(t *testing.T) {
// boxes overlapping a table region, regardless of their DLA label.
// This is the #1 cause of Go vs Python discrepancy on table-heavy PDFs.
func TestTextSectionsInsideTableRegion_Suppressed(t *testing.T) {
eng := &mockEngine{
pageCount: 1,
renderW: 1800, renderH: 2400,
chars: map[int][]pdf.TextChar{0: {
eng := &MockEngine{
NumPages: 1,
RenderW: 1800, RenderH: 2400,
Chars: map[int][]pdf.TextChar{0: {
// Box A: inside DLA table region, labeled as "text" by DLA.
{X0: 50, X1: 100, Top: 40, Bottom: 55, Text: "碎片文字"},
// Box B: inside DLA table region, same situation.
@@ -247,9 +247,9 @@ func TestTextSectionsInsideTableRegion_Suppressed(t *testing.T) {
{X0: 200, Y0: 0, X1: 460, Y1: 100, Text: "年龄", Label: "table row"},
},
}
p := NewParser(pdf.DefaultParserConfig(), mock)
p := NewParser(pdf.DefaultParserConfig())
result, err := p.Parse(context.Background(), eng)
result, err := p.ParseRaw(context.Background(), eng, mock)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -286,9 +286,10 @@ func TestTextSectionsInsideTableRegion_Suppressed(t *testing.T) {
// TestEmptyDoc_NoCrash verifies Parse handles edge cases gracefully.
func TestEmptyDoc_NoCrash(t *testing.T) {
eng := &mockEngine{pageCount: 0}
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
result, err := p.Parse(context.Background(), eng)
eng := &MockEngine{NumPages: 0}
mock := &MockDocAnalyzer{Healthy: true}
p := NewParser(pdf.DefaultParserConfig())
result, err := p.ParseRaw(context.Background(), eng, mock)
if err != nil {
t.Fatalf("Parse: %v", err)
}
@@ -299,13 +300,69 @@ func TestEmptyDoc_NoCrash(t *testing.T) {
// TestNilChars_Handled verifies zero-chars pages don't crash.
func TestNilChars_Handled(t *testing.T) {
eng := &mockEngine{pageCount: 1, renderW: 200, renderH: 200}
p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
result, err := p.Parse(context.Background(), eng)
eng := &MockEngine{NumPages: 1, RenderW: 200, RenderH: 200}
mock := &MockDocAnalyzer{Healthy: true}
p := NewParser(pdf.DefaultParserConfig())
result, err := p.ParseRaw(context.Background(), eng, mock)
if err != nil {
t.Fatalf("Parse: %v", err)
}
if len(result.Sections) != 0 && p.DeepDoc != nil {
if len(result.Sections) != 0 {
t.Logf("nil chars + DeepDoc: sections=%d (may trigger OCR path)", len(result.Sections))
}
}
// TestMatchTableImage_ByPositions checks that a table section is matched to
// its image through its first Position: the lookup key in this fixture
// mirrors that position's page, Left, Right, Top and Bottom values.
func TestMatchTableImage_ByPositions(t *testing.T) {
	const storedImg = "img_base64_positions"
	lookup := map[string]string{"0_50.0_500.0_100.0_300.0": storedImg}
	section := &pdf.Section{
		LayoutType: pdf.LayoutTypeTable,
		Positions: []pdf.Position{
			{PageNumbers: []int{0}, Left: 50.0, Right: 500.0, Top: 100.0, Bottom: 300.0},
		},
	}

	got, found := matchTableImage(section, lookup)
	if !found {
		t.Fatal("expected match by Positions")
	}
	if got != storedImg {
		t.Errorf("got %q, want img_base64_positions", got)
	}
}
// TestMatchTableImage_FallbackToRegion checks that a section without
// Positions is still matched via the Region* fields of its TableItem.
func TestMatchTableImage_FallbackToRegion(t *testing.T) {
	const storedImg = "img_base64_region"
	lookup := map[string]string{"0_80.0_520.0_200.0_400.0": storedImg}
	region := &pdf.TableItem{RegionLeft: 80.0, RegionRight: 520.0, RegionTop: 200.0, RegionBottom: 400.0}
	section := &pdf.Section{
		LayoutType: pdf.LayoutTypeTable,
		Positions:  nil,
		TableItem:  region,
	}

	got, found := matchTableImage(section, lookup)
	if !found {
		t.Fatal("expected match by Region fallback")
	}
	if got != storedImg {
		t.Errorf("got %q, want img_base64_region", got)
	}
}
// TestMatchTableImage_NoMatch checks that a section whose position does not
// correspond to any stored key reports no match.
func TestMatchTableImage_NoMatch(t *testing.T) {
	lookup := map[string]string{"0_10.0_20.0_30.0_40.0": "no_chance"}
	elsewhere := pdf.Position{PageNumbers: []int{0}, Left: 100, Right: 200, Top: 300, Bottom: 400}
	section := &pdf.Section{
		LayoutType: pdf.LayoutTypeTable,
		Positions:  []pdf.Position{elsewhere},
	}

	if _, found := matchTableImage(section, lookup); found {
		t.Error("expected no match")
	}
}
// TestMatchTableImage_EmptySection checks that a section with neither
// Positions nor a TableItem never matches.
func TestMatchTableImage_EmptySection(t *testing.T) {
	lookup := map[string]string{"x": "y"}
	bare := &pdf.Section{LayoutType: pdf.LayoutTypeTable}

	if _, found := matchTableImage(bare, lookup); found {
		t.Error("expected no match for empty section")
	}
}

View File

@@ -1,4 +1,4 @@
package parser
package pdf
import (
"image"
@@ -6,48 +6,6 @@ import (
pdf "ragflow/internal/deepdoc/parser/pdf/type"
)
// ── mockEngine: minimal pdf.PDFEngine stub for unit tests ─────────────
// mockEngine is an in-memory engine stub for unit tests; every value it
// serves comes from the fields below.
type mockEngine struct {
chars map[int][]pdf.TextChar // characters returned by ExtractChars, keyed by page number
pageCount int // value reported by PageCount; values <= 0 are reported as 1
renderW int // width of the image built by RenderPageImage; values <= 0 fall back to 100
renderH int // height of the image built by RenderPageImage; values <= 0 fall back to 100
}
// ExtractChars returns the characters registered for page pg. A page with no
// entry yields a nil slice; the error is always nil.
func (m *mockEngine) ExtractChars(pg int) ([]pdf.TextChar, error) {
	pageChars := m.chars[pg]
	return pageChars, nil
}
// RenderPage is a no-op stub: the mock produces no encoded page bytes, so it
// always returns (nil, nil) regardless of pg and dpi.
//
// The previous body computed clamped width/height values that were never
// used; that dead code has been removed without changing the result.
func (m *mockEngine) RenderPage(pg int, dpi float64) ([]byte, error) {
	return nil, nil
}
// RenderPageImage returns a blank RGBA image of renderW×renderH pixels.
// A non-positive dimension is replaced by 100. pg and dpi are ignored and
// the error is always nil.
func (m *mockEngine) RenderPageImage(pg int, dpi float64) (image.Image, error) {
	const fallbackSide = 100

	width := m.renderW
	if width <= 0 {
		width = fallbackSide
	}
	height := m.renderH
	if height <= 0 {
		height = fallbackSide
	}

	blank := image.NewRGBA(image.Rect(0, 0, width, height))
	return blank, nil
}
// PageCount reports the configured page count, or 1 when the configured
// value is zero or negative. The error is always nil.
func (m *mockEngine) PageCount() (int, error) {
	if n := m.pageCount; n > 0 {
		return n, nil
	}
	return 1, nil
}
// RawData always returns nil: the mock holds no underlying file bytes.
func (m *mockEngine) RawData() []byte { return nil }
// Close is a no-op that always succeeds.
func (m *mockEngine) Close() error { return nil }
// Outlines always returns no outlines and no error.
func (m *mockEngine) Outlines() ([]pdf.Outline, error) { return nil, nil }
// ── testPageImg: small test image for ocrMergeChars tests ─────────────
// 90×120 px at 216 DPI → 30×40 pt in PDF space after /3.0 scaling.

View File

@@ -1,6 +1,6 @@
//go:build cgo && manual
package parser
package pdf
import (
"context"
@@ -66,8 +66,8 @@ func TestDumpTextOutput(t *testing.T) {
}
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
result, err := p.Parse(context.Background(), eng)
p := NewParser(cfg)
result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true})
eng.Close()
if err != nil {
t.Logf("[%d/%d] %s — parse error: %v", i+1, count, name, err)

View File

@@ -1,320 +1,56 @@
// Package pdftypes provides shared types, interfaces, and constants for the
// PDF parser pipeline. It has zero dependencies on sibling packages so that
// sub-packages (tables, geometry, etc.) can import it without circular imports.
// Package pdftype provides PDF-specific types and re-exports shared types
// from the doctype package via Go type aliases. Existing PDF parser code
// that imports this package continues to work without changes.
package pdftype
import (
"context"
"image"
"unicode"
)
import doctype "ragflow/internal/deepdoc/parser/type"
// ── Pipeline types ────────────────────────────────────────────────────────
// ── Re-export shared types via aliases ─────────────────────────────────────
// PipelineMetrics records diagnostic counts at each pipeline stage.
type PipelineMetrics struct {
BoxesInitial int
BoxesTextMerge int
BoxesVertMerge int
BoxesFinal int
TablesCount int
}
type PipelineMetrics = doctype.PipelineMetrics
type ParseResult = doctype.ParseResult
type DLAPageRegions = doctype.DLAPageRegions
type TSRRawCell = doctype.TSRRawCell
type TextChar = doctype.TextChar
type TextBox = doctype.TextBox
type Position = doctype.Position
type Section = doctype.Section
type TableItem = doctype.TableItem
type TSRCell = doctype.TSRCell
type DLARegion = doctype.DLARegion
type OCRBox = doctype.OCRBox
type OCRText = doctype.OCRText
type ParserConfig = doctype.ParserConfig
type DocAnalyzer = doctype.DocAnalyzer
type Outline = doctype.Outline
type PDFEngine = doctype.PDFEngine
type Tokenizer = doctype.Tokenizer
type SampleFunc = doctype.SampleFunc
type TableBuilder = doctype.TableBuilder
type Rectangular = doctype.Rectangular
// ParseResult encapsulates all outputs from a single Parse() call.
type ParseResult struct {
Sections []Section
Tables []TableItem
PageImages map[int]image.Image
Metrics PipelineMetrics
Outlines []Outline // PDF outlines/bookmarks extracted from the document
// ── Re-export constants ────────────────────────────────────────────────────
DLADebug []DLAPageRegions
TSRDebug []TSRRawCell
}
// Figures returns all sections with LayoutType "figure".
// Computed on demand from Sections — no stored field.
// It delegates to CollectFigures, so the result is nil when r.Sections is
// nil and a non-nil (possibly empty) slice otherwise.
func (r *ParseResult) Figures() []Section {
return CollectFigures(r.Sections)
}
// DLAPageRegions holds DLA layout regions for one page.
type DLAPageRegions struct {
Page int // page the regions belong to — NOTE(review): index base not visible here; confirm against the producer
Regions []DLARegion // layout regions reported for that page
}
// TSRRawCell holds a raw TSR cell before row/column grouping.
// The struct tags define the snake_case keys used when it is encoded as JSON.
type TSRRawCell struct {
TableIndex int `json:"table_index"` // index of the table the cell belongs to
Page int `json:"page"` // page the table was found on
Label string `json:"label"` // label copied from the TSR cell
X0 float64 `json:"x0"` // cell rectangle, copied from the TSR cell's X0/Y0/X1/Y1
Y0 float64 `json:"y0"`
X1 float64 `json:"x1"`
Y1 float64 `json:"y1"`
Text string `json:"text"` // text copied from the TSR cell
}
// ── Character and text box types ──────────────────────────────────────────
// TextChar represents a single character extracted from a PDF page.
type TextChar struct {
X0, X1 float64 // horizontal extent of the character
Top, Bottom float64 // vertical extent of the character
Text string
FontName string
FontSize float64
PageNumber int
LayoutType string
LayoutNo string
ColID int
R int
}
// Bounds returns the character's rectangle in the order X0, Top, X1, Bottom.
func (c TextChar) Bounds() (float64, float64, float64, float64) {
return c.X0, c.Top, c.X1, c.Bottom
}
// TextBox represents a rectangular region of text on a PDF page.
type TextBox struct {
X0, X1 float64 // horizontal extent of the box
Top, Bottom float64 // vertical extent of the box
Text string
PageNumber int
LayoutType string
LayoutNo string
ColID int
R int
// Post-TSR table annotation fields (Python: R/H/C/SP tags)
RTop, RBott float64
HTop, HBott float64
HLeft, HRight float64
H int
C int
CLeft, CRight float64
SP int
}
// Bounds returns the box's rectangle in the order X0, Top, X1, Bottom.
func (b TextBox) Bounds() (float64, float64, float64, float64) {
return b.X0, b.Top, b.X1, b.Bottom
}
// ── Position and section types ────────────────────────────────────────────
// Position represents a parsed position tag from @@...## format.
type Position struct {
// PageNumbers lists the pages the position refers to; being a slice, one
// position may name more than one page.
PageNumbers []int
// Left, Right, Top and Bottom are the edges of the rectangle.
// NOTE(review): units and coordinate origin are not visible here — confirm.
Left float64
Right float64
Top float64
Bottom float64
}
// Section represents a text segment with its spatial position on a PDF page.
type Section struct {
// Text is the segment's text content.
Text string
// PositionTag is the position tag string of the segment.
// NOTE(review): presumably the raw "@@...##" tag that Positions is parsed from — confirm.
PositionTag string
// LayoutType is the layout label; CollectFigures compares it with LayoutTypeFigure.
LayoutType string
DocTypeKwd string // "text"/"table"/"image" — assigned during post-processing
// Positions holds the parsed positions; SectionsByPage scans their PageNumbers.
Positions []Position
// TableItem points to associated table data; being a pointer it may be nil.
TableItem *TableItem
Image string // base64-encoded cropped page image
}
// SectionsByPage returns the sections that have at least one position on the
// given page, in their original order.
//
// Each matching section appears exactly once in the result, even when several
// of its Positions (or several entries of one Position's PageNumbers) name the
// page. The result is nil when nothing matches.
func SectionsByPage(sections []Section, page int) []Section {
	var out []Section
nextSection:
	for _, s := range sections {
		for _, p := range s.Positions {
			for _, pn := range p.PageNumbers {
				if pn == page {
					out = append(out, s)
					// A plain break would only leave the innermost loop and
					// let a later Position on the same page add s again.
					continue nextSection
				}
			}
		}
	}
	return out
}
// CollectFigures filters sections down to those whose LayoutType equals
// LayoutTypeFigure, preserving order.
//
// A nil input yields nil; any non-nil input yields a non-nil slice, which is
// empty when there are no figures.
func CollectFigures(sections []Section) []Section {
	if sections == nil {
		return nil
	}
	figures := []Section{}
	for i := range sections {
		if sections[i].LayoutType != LayoutTypeFigure {
			continue
		}
		figures = append(figures, sections[i])
	}
	return figures
}
// ── Table types ───────────────────────────────────────────────────────────
// TableItem represents a detected table or figure region.
type TableItem struct {
// ImageB64 holds image data as a string.
// NOTE(review): presumably a base64-encoded crop of the region — confirm.
ImageB64 string
// Rows holds the content as rows of cell strings.
Rows [][]string
// Cells is the flat list of TSR cells.
Cells []TSRCell
// Positions holds the region's positions.
Positions []Position
// Scale, CropOffX and CropOffY are numeric crop parameters.
// NOTE(review): presumably the scale and offset mapping crop coordinates back to the page — confirm.
Scale float64
CropOffX float64
CropOffY float64
// Caption is the caption text attached to the region.
Caption string
// RegionLeft/Right/Top/Bottom are the edges of the region rectangle.
RegionLeft, RegionRight, RegionTop, RegionBottom float64
// NoMerge is a boolean flag.
// NOTE(review): presumably excludes the item from merging — confirm against callers.
NoMerge bool
// Grid holds TSR cells as a slice of slices, the same shape that
// TableBuilder.GroupCells returns.
Grid [][]TSRCell
}
// TSRCell is a single table cell reported by TSR: a rectangle plus its text
// and label.
type TSRCell struct {
	X0 float64
	Y0 float64
	X1 float64
	Y1 float64

	Text  string
	Label string
}

// Bounds reports the cell rectangle as (X0, Y0, X1, Y1).
func (c TSRCell) Bounds() (x0, y0, x1, y1 float64) {
	x0, y0 = c.X0, c.Y0
	x1, y1 = c.X1, c.Y1
	return x0, y0, x1, y1
}
// ── DeepDoc vision types ─────────────────────────────────────────────────
// DLARegion is a single detected layout region: a rectangle with its label
// and confidence value.
type DLARegion struct {
	X0 float64
	Y0 float64
	X1 float64
	Y1 float64

	Label      string
	Confidence float64
}

// Bounds reports the region rectangle as (X0, Y0, X1, Y1).
func (r DLARegion) Bounds() (x0, y0, x1, y1 float64) {
	x0, y0 = r.X0, r.Y0
	x1, y1 = r.X1, r.Y1
	return x0, y0, x1, y1
}
// OCRBox represents a detected text region from DeepDoc OCR detection.
// It stores four (x, y) coordinate pairs, (X0, Y0) through (X3, Y3).
// NOTE(review): presumably the corners of a quadrilateral; the corner order
// is not visible here — confirm.
type OCRBox struct {
X0, Y0, X1, Y1, X2, Y2, X3, Y3 float64
}
// OCRText represents recognized text with confidence from DeepDoc OCR rec.
type OCRText struct {
// Text is the recognized string.
Text string
// Confidence is the score attached to Text.
// NOTE(review): value range (e.g. 0..1) is not visible here — confirm.
Confidence float64
}
// ── Parser configuration ──────────────────────────────────────────────────
// ParserConfig holds parser configuration.
// DefaultParserConfig sets Zoom, ToPage and BatchSize to non-zero values and
// leaves every other field at its zero value.
type ParserConfig struct {
// Zoom is a float setting; DefaultParserConfig uses 3.
Zoom float64
// FromPage and ToPage bound the page range; the defaults are 0 and -1.
// NOTE(review): -1 presumably means "through the last page" — confirm.
FromPage int
ToPage int
// TableContextSize and ImageContextSize default to 0.
// NOTE(review): what the sizes measure is not visible here — confirm.
TableContextSize int
ImageContextSize int
// AutoRotateTables is a *bool, so nil (unset) is distinguishable from false.
AutoRotateTables *bool
// SeparateTablesFigs defaults to false.
SeparateTablesFigs bool
// SortByTop is a boolean switch; its effect is not visible here.
SortByTop bool
// BatchSize defaults to 50.
BatchSize int
// SkipOCR is a boolean switch.
// NOTE(review): presumably disables the OCR stage — confirm.
SkipOCR bool
// MaxOCRConcurrency is an integer limit.
// NOTE(review): treatment of the zero value is not visible here — confirm.
MaxOCRConcurrency int
// TableBuilder is the pluggable TSR cell detection/grouping implementation.
TableBuilder TableBuilder
}
// DefaultParserConfig builds the baseline configuration: Zoom 3, ToPage -1
// and BatchSize 50. FromPage, TableContextSize, ImageContextSize and
// SeparateTablesFigs keep their zero values (0, 0, 0, false), as does every
// other field.
func DefaultParserConfig() ParserConfig {
	var cfg ParserConfig
	cfg.Zoom = 3
	cfg.ToPage = -1
	cfg.BatchSize = 50
	return cfg
}
// DlaDPI is the DPI used for rendering page images for DeepDoc DLA/OCR.
const DlaDPI = 216
// DlaScale is the scale factor from PDF points (72 DPI) to DLA image space.
// With DlaDPI = 216 the expression evaluates to 216 / 72.0 = 3.0.
const DlaScale = DlaDPI / 72.0
// ── Layout type constants ─────────────────────────────────────────────────
// DlaDPI and DlaScale re-exported from the doctype package.
const DlaDPI = doctype.DlaDPI
const DlaScale = doctype.DlaScale
// Layout type labels and DLA caption labels. The block lists each name twice:
// once bound to its string literal and once bound to the doctype constant of
// the same name.
const (
// Layout type labels as string literals.
LayoutTypeText = "text"
LayoutTypeTable = "table"
LayoutTypeFigure = "figure"
LayoutTypeEquation = "equation"
LayoutTypeTitle = "title"
LayoutTypeReference = "reference"
LayoutTypeFooter = "footer"
LayoutTypeHeader = "header"
// DLA caption labels as string literals.
DLALabelFigureCaption = "figure caption"
DLALabelTableCaption = "table caption"
// The same names re-exported from doctype.
LayoutTypeText = doctype.LayoutTypeText
LayoutTypeTable = doctype.LayoutTypeTable
LayoutTypeFigure = doctype.LayoutTypeFigure
LayoutTypeEquation = doctype.LayoutTypeEquation
LayoutTypeTitle = doctype.LayoutTypeTitle
LayoutTypeReference = doctype.LayoutTypeReference
LayoutTypeFooter = doctype.LayoutTypeFooter
LayoutTypeHeader = doctype.LayoutTypeHeader
DLALabelFigureCaption = doctype.DLALabelFigureCaption
DLALabelTableCaption = doctype.DLALabelTableCaption
)
// ── Interfaces ────────────────────────────────────────────────────────────
// ── Re-export functions and variables ──────────────────────────────────────
// DocAnalyzer abstracts DeepDoc vision operations.
// Every operation except Health takes a context plus one or more images.
type DocAnalyzer interface {
// DLA takes a page image and returns layout regions.
DLA(ctx context.Context, pageImage image.Image) ([]DLARegion, error)
// TSR takes a cropped image and returns table cells.
TSR(ctx context.Context, cropped image.Image) ([]TSRCell, error)
// OCRDetect takes a cropped image and returns detected text boxes.
OCRDetect(ctx context.Context, cropped image.Image) ([]OCRBox, error)
// OCRRecognize takes a cropped image and returns recognized texts.
OCRRecognize(ctx context.Context, cropped image.Image) ([]OCRText, error)
// OCRRecognizeBatch is the multi-image form; it returns one slice of results
// and one slice of errors.
// NOTE(review): presumably both are index-aligned with the input — confirm.
OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]OCRText, []error)
// Health reports a boolean health status.
// NOTE(review): presumably whether the backing service is reachable — confirm.
Health() bool
}
// ── Outline ────────────────────────────────────────────────────────────
// Outline represents one entry in a PDF's document outline (table of contents).
// Python: extract_pdf_outlines() in deepdoc/parser/utils.py
type Outline struct {
// Title is the entry's title text.
Title string
// Level is the entry's nesting level.
// NOTE(review): whether the top level is 0 or 1 is not visible here — confirm.
Level int
PageNumber int // 1-indexed, matching Python
}
// PDFEngine abstracts page extraction capabilities.
// NOTE(review): the numbering base of pageNum is not visible here — confirm.
type PDFEngine interface {
// ExtractChars returns the characters of one page.
ExtractChars(pageNum int) ([]TextChar, error)
// RenderPage renders one page at the given DPI and returns bytes.
// NOTE(review): the byte encoding (e.g. PNG) is not visible here — confirm.
RenderPage(pageNum int, dpi float64) ([]byte, error)
// RenderPageImage renders one page at the given DPI as an image.Image.
RenderPageImage(pageNum int, dpi float64) (image.Image, error)
// RawData returns a byte slice.
// NOTE(review): presumably the raw PDF file contents — confirm.
RawData() []byte
// PageCount returns the number of pages.
PageCount() (int, error)
// Outlines returns the document outline entries.
Outlines() ([]Outline, error)
// Close releases the engine and may return an error.
Close() error
}
// Tokenizer provides text tokenization matching rag_tokenizer.
type Tokenizer interface {
// Tag maps a token to a tag string.
// NOTE(review): the tag vocabulary is not visible here — confirm.
Tag(token string) string
}
// SampleFunc samples up to n characters from a page's chars.
// It receives the page's TextChar slice and the limit n and returns the
// sample as a string.
type SampleFunc func(chars []TextChar, n int) string
// TableBuilder encapsulates TSR model-specific cell detection and grouping.
type TableBuilder interface {
// Name returns the builder's name string.
Name() string
// DetectCells takes a cropped image and returns the TSR cells found in it.
DetectCells(ctx context.Context, cropped image.Image) ([]TSRCell, error)
// GroupCells arranges a flat cell list into a slice of slices.
// NOTE(review): presumably one inner slice per table row — confirm.
GroupCells(cells []TSRCell) [][]TSRCell
}
// Rectangular is any 2D axis-aligned rectangle that can report its bounds.
// TextChar, TextBox, TSRCell and DLARegion each define a matching Bounds
// method.
type Rectangular interface {
// Bounds returns the rectangle as (x0, y0, x1, y1).
Bounds() (x0, y0, x1, y1 float64)
}
// IsCJK reports whether r belongs to one of the Han, Hiragana, Katakana or
// Hangul Unicode scripts.
func IsCJK(r rune) bool {
	switch {
	case unicode.Is(unicode.Han, r),
		unicode.Is(unicode.Hiragana, r),
		unicode.Is(unicode.Katakana, r),
		unicode.Is(unicode.Hangul, r):
		return true
	}
	return false
}
// Functions re-exported from doctype. They are bound as package-level
// variables holding the doctype function values, so calls through these
// names invoke the doctype implementations.
var (
CollectFigures = doctype.CollectFigures
DefaultParserConfig = doctype.DefaultParserConfig
IsCJK = doctype.IsCJK
)

View File

@@ -131,34 +131,6 @@ func OverlapX(a, b pdf.Rectangular) float64 {
return overlap / minWidth
}
// SortXByPage orders boxes by page number, then x0, then top, and afterwards
// repairs same-page neighbours whose x0 values are nearly equal but whose
// tops are in inverted order (a layout artifact).
//
// The slice is sorted in place and the same slice is returned. threshold is
// the exclusive upper bound on |x0 difference| for two neighbours to count as
// sharing a column.
//
// Python: pdf_parser.py:178 sort_X_by_page()
func SortXByPage(boxes []pdf.TextBox, threshold float64) []pdf.TextBox {
	sort.Slice(boxes, func(i, j int) bool {
		a, b := &boxes[i], &boxes[j]
		switch {
		case a.PageNumber != b.PageNumber:
			return a.PageNumber < b.PageNumber
		case a.X0 != b.X0:
			return a.X0 < b.X0
		default:
			return a.Top < b.Top
		}
	})

	// Backward passes over adjacent pairs: each pass starts one element
	// earlier than the previous one and walks down to the front.
	for end := len(boxes) - 1; end >= 1; end-- {
		for k := end - 1; k >= 0; k-- {
			upper, lower := &boxes[k], &boxes[k+1]
			sameColumn := math.Abs(lower.X0-upper.X0) < threshold
			inverted := lower.Top < upper.Top
			samePage := lower.PageNumber == upper.PageNumber
			if sameColumn && inverted && samePage {
				boxes[k], boxes[k+1] = boxes[k+1], boxes[k]
			}
		}
	}
	return boxes
}
// MedianCharHeight computes the median character height for a page,
// matching Python's np.median(char height) in __images__ (pdf_parser.py:1552).
// Used as a reference unit for vertical spacing decisions.

View File

@@ -49,22 +49,6 @@ func TestYDis(t *testing.T) {
}
}
// TestSortXByPage checks that SortXByPage puts the page-0 box first and, on
// page 1, orders the two boxes sharing x0=50 by ascending top.
func TestSortXByPage(t *testing.T) {
	input := []pdf.TextBox{
		{PageNumber: 1, X0: 100, Top: 50, Text: "C"},
		{PageNumber: 1, X0: 50, Top: 100, Text: "A"},
		{PageNumber: 1, X0: 50, Top: 30, Text: "B"},
		{PageNumber: 0, X0: 0, Top: 0, Text: "D"},
	}

	sorted := SortXByPage(input, 3)

	if first := sorted[0].Text; first != "D" {
		t.Errorf("first should be page 0: got %q", first)
	}
	second, third := sorted[1].Text, sorted[2].Text
	if !(second == "B" && third == "A") {
		t.Errorf("page 1 ordering wrong: %q, %q", second, third)
	}
}
func TestOverlapX(t *testing.T) {
b1 := pdf.TextBox{X0: 50, X1: 200}
b2 := pdf.TextBox{X0: 100, X1: 250}

View File

@@ -1,6 +1,6 @@
//go:build cgo && manual
package parser
package pdf
import (
"math"
@@ -8,6 +8,7 @@ import (
"path/filepath"
"testing"
lyt "ragflow/internal/deepdoc/parser/pdf/layout"
"ragflow/internal/deepdoc/parser/pdf/pdfoxide"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
)
@@ -41,7 +42,7 @@ func TestYCoord_SameLineCharsHaveEqualBottom(t *testing.T) {
t.Fatal("no chars")
}
lines := groupCharsToLines(chars, false)
lines := lyt.GroupCharsToLines(chars, false)
for li, line := range lines {
if len(line) <= 1 {
continue

View File

@@ -0,0 +1,304 @@
// Package doctype provides shared types, interfaces, and constants for the
// deepdoc parser pipeline. All format-specific parsers (pdf, docx, xlsx, etc.)
// share these definitions. The package has zero dependencies on sibling
// packages so that any sub-package can import it without circular imports.
package doctype
import (
"context"
"image"
"unicode"
)
// ── Pipeline types ────────────────────────────────────────────────────────
// PipelineMetrics records diagnostic counts at each pipeline stage.
// All fields are plain integer counters.
type PipelineMetrics struct {
// BoxesInitial, BoxesTextMerge, BoxesVertMerge and BoxesFinal are box counts.
// NOTE(review): presumably taken before merging, after text merge, after
// vertical merge and at the end of the pipeline respectively — confirm.
BoxesInitial int
BoxesTextMerge int
BoxesVertMerge int
BoxesFinal int
// TablesCount is the number of tables.
TablesCount int
}
// ParseResult encapsulates all outputs from a single Parse() call.
// It bundles the sections and tables with page images, stage metrics,
// document outlines and the raw DLA/TSR debug data.
type ParseResult struct {
// Sections holds the parsed sections; Figures() filters this slice.
Sections []Section
// Tables holds the table items of the result.
Tables []TableItem
// PageImages maps an integer key to a page image.
// NOTE(review): whether the key is 0- or 1-based is not visible here — confirm.
PageImages map[int]image.Image
// Metrics carries the pipeline-stage counters.
Metrics PipelineMetrics
Outlines []Outline // PDF outlines/bookmarks extracted from the document
// DLADebug holds DLA regions grouped per page (see DLAPageRegions).
DLADebug []DLAPageRegions
// TSRDebug holds raw TSR cells before row/column grouping (see TSRRawCell).
TSRDebug []TSRRawCell
}
// Figures reports the figure sections of the result.
// Nothing is cached: each call re-filters r.Sections through CollectFigures.
func (r *ParseResult) Figures() []Section {
	figures := CollectFigures(r.Sections)
	return figures
}
// DLAPageRegions holds DLA layout regions for one page.
type DLAPageRegions struct {
// Page identifies the page the regions belong to.
// NOTE(review): 0- vs 1-based numbering is not visible here — confirm.
Page int
// Regions lists the layout regions recorded for that page.
Regions []DLARegion
}
// TSRRawCell holds a raw TSR cell before row/column grouping.
// Every field carries a snake_case JSON tag.
type TSRRawCell struct {
// TableIndex identifies the table the cell belongs to.
TableIndex int `json:"table_index"`
// Page identifies the cell's page (numbering base not visible here).
Page int `json:"page"`
// Label is the cell's label string.
// NOTE(review): presumably the class reported by the TSR model — confirm.
Label string `json:"label"`
// X0, Y0, X1, Y1 are the cell rectangle coordinates.
// NOTE(review): coordinate space (page vs. cropped image) is not visible here.
X0 float64 `json:"x0"`
Y0 float64 `json:"y0"`
X1 float64 `json:"x1"`
Y1 float64 `json:"y1"`
// Text is the text associated with the cell.
Text string `json:"text"`
}
// ── Character and text box types ──────────────────────────────────────────
// TextChar describes one character taken from a PDF page: its rectangle,
// its text and font, and the layout bookkeeping attached to it.
type TextChar struct {
	// Horizontal extent.
	X0 float64
	X1 float64
	// Vertical extent.
	Top    float64
	Bottom float64

	Text       string
	FontName   string
	FontSize   float64
	PageNumber int
	LayoutType string
	LayoutNo   string
	ColID      int
	R          int
}

// Bounds reports the character rectangle as (X0, Top, X1, Bottom), i.e. in
// (x0, y0, x1, y1) order.
func (c TextChar) Bounds() (x0, y0, x1, y1 float64) {
	x0, x1 = c.X0, c.X1
	y0, y1 = c.Top, c.Bottom
	return x0, y0, x1, y1
}
// TextBox describes a rectangular run of text on a PDF page, together with
// the layout labels and table annotations attached to it.
type TextBox struct {
	// Horizontal extent.
	X0 float64
	X1 float64
	// Vertical extent.
	Top    float64
	Bottom float64

	Text       string
	PageNumber int
	LayoutType string
	LayoutNo   string
	ColID      int
	R          int

	// Post-TSR table annotation fields (Python: R/H/C/SP tags)
	RTop   float64
	RBott  float64
	HTop   float64
	HBott  float64
	HLeft  float64
	HRight float64
	H      int
	C      int
	CLeft  float64
	CRight float64
	SP     int
}

// Bounds reports the box rectangle as (X0, Top, X1, Bottom), i.e. in
// (x0, y0, x1, y1) order.
func (b TextBox) Bounds() (x0, y0, x1, y1 float64) {
	x0, x1 = b.X0, b.X1
	y0, y1 = b.Top, b.Bottom
	return x0, y0, x1, y1
}
// ── Position and section types ────────────────────────────────────────────
// Position represents a parsed position tag from @@...## format.
type Position struct {
// PageNumbers lists the pages the position refers to; being a slice, one
// position may name more than one page.
PageNumbers []int
// Left, Right, Top and Bottom are the edges of the rectangle.
// NOTE(review): units and coordinate origin are not visible here — confirm.
Left float64
Right float64
Top float64
Bottom float64
}
// Section represents a text segment with its spatial position on a PDF page.
type Section struct {
// Text is the segment's text content.
Text string
// PositionTag is the position tag string of the segment.
// NOTE(review): presumably the raw "@@...##" tag that Positions is parsed from — confirm.
PositionTag string
// LayoutType is the layout label; CollectFigures compares it with LayoutTypeFigure.
LayoutType string
DocTypeKwd string // "text"/"table"/"image" — assigned during post-processing
// Positions holds the parsed positions of the segment.
Positions []Position
// TableItem points to associated table data; being a pointer it may be nil.
TableItem *TableItem
Image string // base64-encoded cropped page image
}
// CollectFigures filters sections down to those whose LayoutType equals
// LayoutTypeFigure, preserving order.
//
// A nil input yields nil; any non-nil input yields a non-nil slice, which is
// empty when there are no figures.
func CollectFigures(sections []Section) []Section {
	if sections == nil {
		return nil
	}
	figures := []Section{}
	for i := range sections {
		if sections[i].LayoutType != LayoutTypeFigure {
			continue
		}
		figures = append(figures, sections[i])
	}
	return figures
}
// ── Table types ───────────────────────────────────────────────────────────
// TableItem represents a detected table or figure region.
type TableItem struct {
// ImageB64 holds image data as a string.
// NOTE(review): presumably a base64-encoded crop of the region — confirm.
ImageB64 string
// Rows holds the content as rows of cell strings.
Rows [][]string
// Cells is the flat list of TSR cells.
Cells []TSRCell
// Positions holds the region's positions.
Positions []Position
// Scale, CropOffX and CropOffY are numeric crop parameters.
// NOTE(review): presumably the scale and offset mapping crop coordinates back to the page — confirm.
Scale float64
CropOffX float64
CropOffY float64
// Caption is the caption text attached to the region.
Caption string
// RegionLeft/Right/Top/Bottom are the edges of the region rectangle.
RegionLeft, RegionRight, RegionTop, RegionBottom float64
// NoMerge is a boolean flag.
// NOTE(review): presumably excludes the item from merging — confirm against callers.
NoMerge bool
// Grid holds TSR cells as a slice of slices, the same shape that
// TableBuilder.GroupCells returns.
Grid [][]TSRCell
}
// TSRCell is a single table cell reported by TSR: a rectangle plus its text
// and label.
type TSRCell struct {
	X0 float64
	Y0 float64
	X1 float64
	Y1 float64

	Text  string
	Label string
}

// Bounds reports the cell rectangle as (X0, Y0, X1, Y1).
func (c TSRCell) Bounds() (x0, y0, x1, y1 float64) {
	x0, y0 = c.X0, c.Y0
	x1, y1 = c.X1, c.Y1
	return x0, y0, x1, y1
}
// ── DeepDoc vision types ─────────────────────────────────────────────────
// DLARegion is a single detected layout region: a rectangle with its label
// and confidence value.
type DLARegion struct {
	X0 float64
	Y0 float64
	X1 float64
	Y1 float64

	Label      string
	Confidence float64
}

// Bounds reports the region rectangle as (X0, Y0, X1, Y1).
func (r DLARegion) Bounds() (x0, y0, x1, y1 float64) {
	x0, y0 = r.X0, r.Y0
	x1, y1 = r.X1, r.Y1
	return x0, y0, x1, y1
}
// OCRBox represents a detected text region from DeepDoc OCR detection.
// It stores four (x, y) coordinate pairs, (X0, Y0) through (X3, Y3).
// NOTE(review): presumably the corners of a quadrilateral; the corner order
// is not visible here — confirm.
type OCRBox struct {
X0, Y0, X1, Y1, X2, Y2, X3, Y3 float64
}
// OCRText represents recognized text with confidence from DeepDoc OCR rec.
type OCRText struct {
// Text is the recognized string.
Text string
// Confidence is the score attached to Text.
// NOTE(review): value range (e.g. 0..1) is not visible here — confirm.
Confidence float64
}
// ── Parser configuration ──────────────────────────────────────────────────
// ParserConfig holds parser configuration.
// DefaultParserConfig sets Zoom, ToPage and BatchSize to non-zero values and
// leaves every other field at its zero value.
type ParserConfig struct {
// Zoom is a float setting; DefaultParserConfig uses 3.
Zoom float64
// FromPage and ToPage bound the page range; the defaults are 0 and -1.
// NOTE(review): -1 presumably means "through the last page" — confirm.
FromPage int
ToPage int
// TableContextSize and ImageContextSize default to 0.
// NOTE(review): what the sizes measure is not visible here — confirm.
TableContextSize int
ImageContextSize int
// AutoRotateTables is a *bool, so nil (unset) is distinguishable from false.
AutoRotateTables *bool
// SeparateTablesFigs defaults to false.
SeparateTablesFigs bool
// SortByTop is a boolean switch; its effect is not visible here.
SortByTop bool
// BatchSize defaults to 50.
BatchSize int
// SkipOCR is a boolean switch.
// NOTE(review): presumably disables the OCR stage — confirm.
SkipOCR bool
// MaxOCRConcurrency is an integer limit.
// NOTE(review): treatment of the zero value is not visible here — confirm.
MaxOCRConcurrency int
}
// DefaultParserConfig builds the baseline configuration: Zoom 3, ToPage -1
// and BatchSize 50. FromPage, TableContextSize, ImageContextSize and
// SeparateTablesFigs keep their zero values (0, 0, 0, false), as does every
// other field.
func DefaultParserConfig() ParserConfig {
	var cfg ParserConfig
	cfg.Zoom = 3
	cfg.ToPage = -1
	cfg.BatchSize = 50
	return cfg
}
// Rendering constants for the page images handed to DeepDoc DLA/OCR.
const (
	// DlaDPI is the resolution, in dots per inch, at which page images are
	// rendered for DeepDoc DLA/OCR.
	DlaDPI = 216

	// DlaScale converts PDF points (72 per inch) into DLA image space; with
	// DlaDPI = 216 it evaluates to 3.0.
	DlaScale = DlaDPI / 72.0
)
// ── Layout type constants ─────────────────────────────────────────────────
// Layout type labels carried in the LayoutType fields.
const (
	LayoutTypeText      = "text"
	LayoutTypeTable     = "table"
	LayoutTypeFigure    = "figure"
	LayoutTypeEquation  = "equation"
	LayoutTypeTitle     = "title"
	LayoutTypeReference = "reference"
	LayoutTypeFooter    = "footer"
	LayoutTypeHeader    = "header"
)

// DLA labels for caption regions.
const (
	DLALabelFigureCaption = "figure caption"
	DLALabelTableCaption  = "table caption"
)
// ── Interfaces ────────────────────────────────────────────────────────────
// DocAnalyzer abstracts DeepDoc vision operations.
// Every operation except Health takes a context plus one or more images.
type DocAnalyzer interface {
// DLA takes a page image and returns layout regions.
DLA(ctx context.Context, pageImage image.Image) ([]DLARegion, error)
// TSR takes a cropped image and returns table cells.
TSR(ctx context.Context, cropped image.Image) ([]TSRCell, error)
// OCRDetect takes a cropped image and returns detected text boxes.
OCRDetect(ctx context.Context, cropped image.Image) ([]OCRBox, error)
// OCRRecognize takes a cropped image and returns recognized texts.
OCRRecognize(ctx context.Context, cropped image.Image) ([]OCRText, error)
// OCRRecognizeBatch is the multi-image form; it returns one slice of results
// and one slice of errors.
// NOTE(review): presumably both are index-aligned with the input — confirm.
OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]OCRText, []error)
// Health reports a boolean health status.
// NOTE(review): presumably whether the backing service is reachable — confirm.
Health() bool
}
// ── Outline ────────────────────────────────────────────────────────────
// Outline represents one entry in a PDF's document outline (table of contents).
// Python: extract_pdf_outlines() in deepdoc/parser/utils.py
type Outline struct {
// Title is the entry's title text.
Title string
// Level is the entry's nesting level.
// NOTE(review): whether the top level is 0 or 1 is not visible here — confirm.
Level int
PageNumber int // 1-indexed, matching Python
}
// PDFEngine abstracts page extraction capabilities.
// NOTE(review): the numbering base of pageNum is not visible here — confirm.
type PDFEngine interface {
// ExtractChars returns the characters of one page.
ExtractChars(pageNum int) ([]TextChar, error)
// RenderPage renders one page at the given DPI and returns bytes.
// NOTE(review): the byte encoding (e.g. PNG) is not visible here — confirm.
RenderPage(pageNum int, dpi float64) ([]byte, error)
// RenderPageImage renders one page at the given DPI as an image.Image.
RenderPageImage(pageNum int, dpi float64) (image.Image, error)
// RawData returns a byte slice.
// NOTE(review): presumably the raw PDF file contents — confirm.
RawData() []byte
// PageCount returns the number of pages.
PageCount() (int, error)
// Outlines returns the document outline entries.
Outlines() ([]Outline, error)
// Close releases the engine and may return an error.
Close() error
}
// Tokenizer provides text tokenization matching rag_tokenizer.
type Tokenizer interface {
// Tag maps a token to a tag string.
// NOTE(review): the tag vocabulary is not visible here — confirm.
Tag(token string) string
}
// SampleFunc samples up to n characters from a page's chars.
// It receives the page's TextChar slice and the limit n and returns the
// sample as a string.
type SampleFunc func(chars []TextChar, n int) string
// TableBuilder encapsulates TSR model-specific cell detection and grouping.
type TableBuilder interface {
// Name returns the builder's name string.
Name() string
// DetectCells takes a cropped image and returns the TSR cells found in it.
DetectCells(ctx context.Context, cropped image.Image) ([]TSRCell, error)
// GroupCells arranges a flat cell list into a slice of slices.
// NOTE(review): presumably one inner slice per table row — confirm.
GroupCells(cells []TSRCell) [][]TSRCell
}
// Rectangular is any 2D axis-aligned rectangle that can report its bounds.
// TextChar, TextBox, TSRCell and DLARegion each define a matching Bounds
// method.
type Rectangular interface {
// Bounds returns the rectangle as (x0, y0, x1, y1).
Bounds() (x0, y0, x1, y1 float64)
}
// IsCJK reports whether r belongs to one of the Han, Hiragana, Katakana or
// Hangul Unicode scripts.
func IsCJK(r rune) bool {
	return unicode.In(r,
		unicode.Han,
		unicode.Hiragana,
		unicode.Katakana,
		unicode.Hangul,
	)
}