From 5bc4753d1e2b712deac91ecaa9b7840eee636319 Mon Sep 17 00:00:00 2001 From: Jack Date: Thu, 2 Jul 2026 09:46:33 +0800 Subject: [PATCH] Feat/oss parser no post (#16464) ### Summary Remove dead code --- .gitignore | 2 + .../deepdoc/parser/pdf/batch_smoke_test.go | 10 +- internal/deepdoc/parser/pdf/compare_test.go | 16 +- .../parser/pdf/crop_integration_test.go | 10 +- .../deepdoc/parser/pdf/dla_real_world_test.go | 4 +- .../parser/pdf/dla_tsr_compare_test.go | 4 +- ...st_helpers_cgo_test.go => helpers_test.go} | 23 +- .../deepdoc/parser/pdf/inference/client.go | 42 +- .../parser/pdf/inference/client_test.go | 6 +- .../pdf/inference_client_integration_test.go | 37 +- ..._analyzer_test.go => mock_doc_analyzer.go} | 2 +- internal/deepdoc/parser/pdf/mock_engine.go | 41 ++ internal/deepdoc/parser/pdf/ocr_merge_test.go | 8 +- .../parser/pdf/ocr_recognize_batch_test.go | 2 +- .../parser/pdf/outline_extraction_test.go | 24 +- .../deepdoc/parser/pdf/page_batch_test.go | 6 +- internal/deepdoc/parser/pdf/parse_cgo.go | 22 + internal/deepdoc/parser/pdf/parser.go | 612 +++++++++--------- .../deepdoc/parser/pdf/parser_mock_test.go | 115 ++-- internal/deepdoc/parser/pdf/parser_ocr.go | 168 +++-- .../deepdoc/parser/pdf/parser_ocr_test.go | 2 +- .../pdf/parser_pipeline_integration_test.go | 142 ++-- .../parser/pdf/parser_pipeline_manual_test.go | 6 +- internal/deepdoc/parser/pdf/parser_test.go | 78 ++- .../parser/pdf/pdfium_integration_test.go | 14 +- .../deepdoc/parser/pdf/pdfoxide_bridge.go | 30 +- .../parser/pdf/pipeline_parity_test.go | 22 +- .../parser/pdf/post/model_image_describer.go | 101 --- .../pdf/post/model_image_describer_test.go | 79 --- .../pdf/post/outline_postprocess_test.go | 114 ---- .../deepdoc/parser/pdf/post/post_steps.go | 436 ------------- .../parser/pdf/post/post_steps_test.go | 434 ------------- .../parser/pdf/post/vision_describe.go | 98 --- .../parser/pdf/post/vision_describe_test.go | 112 ---- .../deepdoc/parser/pdf/render_compare_test.go | 4 +- 
internal/deepdoc/parser/pdf/renderer.go | 9 +- .../deepdoc/parser/pdf/renderer_pdfium.go | 2 +- internal/deepdoc/parser/pdf/rotate_test.go | 8 +- .../deepdoc/parser/pdf/scan_all_pdfs_test.go | 7 +- internal/deepdoc/parser/pdf/snapshot_test.go | 10 +- .../parser/pdf/table/table_construct.go | 43 +- internal/deepdoc/parser/pdf/table_extract.go | 251 ++++--- .../pdf/table_rotate_integration_test.go | 21 +- .../deepdoc/parser/pdf/table_section_test.go | 127 +++- .../deepdoc/parser/pdf/test_helpers_test.go | 44 +- internal/deepdoc/parser/pdf/text_dump_test.go | 6 +- internal/deepdoc/parser/pdf/type/types.go | 354 ++-------- internal/deepdoc/parser/pdf/util/geometry.go | 28 - .../deepdoc/parser/pdf/util/geometry_test.go | 16 - internal/deepdoc/parser/pdf/ycoord_test.go | 5 +- internal/deepdoc/parser/type/types.go | 304 +++++++++ 51 files changed, 1381 insertions(+), 2680 deletions(-) rename internal/deepdoc/parser/pdf/{test_helpers_cgo_test.go => helpers_test.go} (53%) rename internal/deepdoc/parser/pdf/{mock_doc_analyzer_test.go => mock_doc_analyzer.go} (99%) create mode 100644 internal/deepdoc/parser/pdf/mock_engine.go create mode 100644 internal/deepdoc/parser/pdf/parse_cgo.go delete mode 100644 internal/deepdoc/parser/pdf/post/model_image_describer.go delete mode 100644 internal/deepdoc/parser/pdf/post/model_image_describer_test.go delete mode 100644 internal/deepdoc/parser/pdf/post/outline_postprocess_test.go delete mode 100644 internal/deepdoc/parser/pdf/post/post_steps.go delete mode 100644 internal/deepdoc/parser/pdf/post/post_steps_test.go delete mode 100644 internal/deepdoc/parser/pdf/post/vision_describe.go delete mode 100644 internal/deepdoc/parser/pdf/post/vision_describe_test.go create mode 100644 internal/deepdoc/parser/type/types.go diff --git a/.gitignore b/.gitignore index 3cc9ffc5a5..2a0629d120 100644 --- a/.gitignore +++ b/.gitignore @@ -245,3 +245,5 @@ bin/* # Parser test fixtures and python tools internal/deepdoc/parser/pdf/testdata/ 
internal/deepdoc/parser/pdf/tools-py/ +internal/deepdoc/parser/docx/testdata/ +internal/deepdoc/parser/docx/tool/ diff --git a/internal/deepdoc/parser/pdf/batch_smoke_test.go b/internal/deepdoc/parser/pdf/batch_smoke_test.go index c3870a0ba4..2a5bd817f7 100644 --- a/internal/deepdoc/parser/pdf/batch_smoke_test.go +++ b/internal/deepdoc/parser/pdf/batch_smoke_test.go @@ -1,6 +1,6 @@ //go:build cgo && manual -package parser +package pdf import ( "context" @@ -51,12 +51,12 @@ func TestBatchResults(t *testing.T) { } pdfs := all[:min(count, len(all))] - ddClient, err := inf.NewInferenceClient(os.Getenv("DEEPDOC_URL")) + ddClient, err := inf.NewClient(os.Getenv("DEEPDOC_URL")) if err != nil { t.Fatal(err) } if !ddClient.Health() { - t.Fatalf("DeepDoc service not available at %s (DLA+TSR required)", ddClient.baseURL) + t.Fatalf("DeepDoc service not available at %s (DLA+TSR required)", ddClient.BaseURL()) } deepDoc := pdf.DocAnalyzer(ddClient) @@ -238,9 +238,9 @@ func parseOne(pdfDir, name string, deepDoc pdf.DocAnalyzer, skipOCR bool) (*pars cfg := pdf.DefaultParserConfig() cfg.SkipOCR = skipOCR - p := NewParser(cfg, deepDoc) + p := NewParser(cfg) t0 := time.Now() - parsed, err := p.Parse(context.Background(), eng) + parsed, err := p.ParseRaw(context.Background(), eng, deepDoc) elapsed := time.Since(t0).Seconds() if err != nil { return nil, fmt.Errorf("parse: %w", err) diff --git a/internal/deepdoc/parser/pdf/compare_test.go b/internal/deepdoc/parser/pdf/compare_test.go index 44a845132a..6df639ce48 100644 --- a/internal/deepdoc/parser/pdf/compare_test.go +++ b/internal/deepdoc/parser/pdf/compare_test.go @@ -1,6 +1,6 @@ //go:build manual -package parser +package pdf import ( "log/slog" @@ -8,7 +8,7 @@ import ( "path/filepath" "testing" - "ragflow/internal/deepdoc/parser/pdf/tools" + "ragflow/internal/deepdoc/parser/pdf/tool" ) // TestBatchCompareWithPython compares Go output against Python reference @@ -37,29 +37,29 @@ func TestBatchCompareWithPython(t *testing.T) { 
pyTextDir := filepath.Join("testdata", "output", "py", pyVariant, "text") // Read Go text files' #@meta (no aggregate JSON dependency). - goResults, err := tools.ReadGoTextMeta(goTextDir) + goResults, err := tool.ReadGoTextMeta(goTextDir) if err != nil || len(goResults) == 0 { t.Fatalf("No Go text files in %s: %v", goTextDir, err) } // Read Python text files' #@meta - pyResults, err := tools.ReadPythonTextMeta(pyTextDir) + pyResults, err := tool.ReadPythonTextMeta(pyTextDir) if err != nil || len(pyResults) == 0 { t.Fatalf("No Python text files in %s: %v", pyTextDir, err) } t.Logf("Comparing %d Go × %d Python", len(goResults), len(pyResults)) - tools.CompareWithPython(t, goResults, pyResults, goTextDir, pyTextDir) + tool.CompareWithPython(t, goResults, pyResults, goTextDir, pyTextDir) // Compare tables. goTablesDir := filepath.Join("testdata", "output", "go", goVariant, "tables") pyTablesDir2 := filepath.Join("testdata", "output", "py", pyVariant, "tables") - tools.CompareTablesWithPython(t, goTablesDir, pyTablesDir2) + tool.CompareTablesWithPython(t, goTablesDir, pyTablesDir2) // Compare DLA + TSR raw intermediates. 
goDLADir := filepath.Join("testdata", "output", "go", goVariant, "dla") pyDLADir := filepath.Join("testdata", "output", "py", pyVariant, "dla") - tools.CompareDLAWithPython(t, goDLADir, pyDLADir) + tool.CompareDLAWithPython(t, goDLADir, pyDLADir) goTSRRawDir := filepath.Join("testdata", "output", "go", goVariant, "tsr_raw") pyTSRRawDir := filepath.Join("testdata", "output", "py", pyVariant, "tsr_raw") - tools.CompareTSRRawWithPython(t, goTSRRawDir, pyTSRRawDir) + tool.CompareTSRRawWithPython(t, goTSRRawDir, pyTSRRawDir) } diff --git a/internal/deepdoc/parser/pdf/crop_integration_test.go b/internal/deepdoc/parser/pdf/crop_integration_test.go index 72e90d3cbb..beb5ff25b2 100644 --- a/internal/deepdoc/parser/pdf/crop_integration_test.go +++ b/internal/deepdoc/parser/pdf/crop_integration_test.go @@ -1,6 +1,6 @@ //go:build cgo && manual -package parser +package pdf import ( "bytes" @@ -27,8 +27,8 @@ func TestParse_CropSectionImages(t *testing.T) { defer eng.Close() cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, &MockDocAnalyzer{Healthy: true}) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true}) if err != nil { t.Fatalf("Parse: %v", err) } @@ -79,8 +79,8 @@ func TestCrop_Regression_SnapshotPDFs(t *testing.T) { } defer eng.Close() - p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true}) - result, err := p.Parse(context.Background(), eng) + p := NewParser(pdf.DefaultParserConfig()) + result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true}) if err != nil { t.Fatalf("Parse: %v", err) } diff --git a/internal/deepdoc/parser/pdf/dla_real_world_test.go b/internal/deepdoc/parser/pdf/dla_real_world_test.go index e64512dee0..773ab3af37 100644 --- a/internal/deepdoc/parser/pdf/dla_real_world_test.go +++ b/internal/deepdoc/parser/pdf/dla_real_world_test.go @@ -1,6 +1,6 @@ //go:build cgo && integration -package parser 
+package pdf import ( "context" @@ -46,7 +46,7 @@ func TestDLARealWorldCompare(t *testing.T) { for _, pg := range pdf.pages { testName := pdf.name + "/page" + string(rune('0'+pg)) t.Run(testName, func(t *testing.T) { - pageImg, err := renderPageToImage(eng, pg) + pageImg, err := RenderPageToImage(eng, pg) if err != nil { t.Fatalf("render page %d: %v", pg, err) } diff --git a/internal/deepdoc/parser/pdf/dla_tsr_compare_test.go b/internal/deepdoc/parser/pdf/dla_tsr_compare_test.go index 31584945a5..32e5f91337 100644 --- a/internal/deepdoc/parser/pdf/dla_tsr_compare_test.go +++ b/internal/deepdoc/parser/pdf/dla_tsr_compare_test.go @@ -1,6 +1,6 @@ //go:build cgo && integration -package parser +package pdf import ( "context" @@ -28,7 +28,7 @@ func TestDLATSRResponseCompare(t *testing.T) { eng := mustOpenEngine(t, "06_table_content.pdf") defer eng.Close() - pageImg, err := renderPageToImage(eng, 0) + pageImg, err := RenderPageToImage(eng, 0) if err != nil { t.Fatalf("render: %v", err) } diff --git a/internal/deepdoc/parser/pdf/test_helpers_cgo_test.go b/internal/deepdoc/parser/pdf/helpers_test.go similarity index 53% rename from internal/deepdoc/parser/pdf/test_helpers_cgo_test.go rename to internal/deepdoc/parser/pdf/helpers_test.go index c84538fe48..7b8142eb7c 100644 --- a/internal/deepdoc/parser/pdf/test_helpers_cgo_test.go +++ b/internal/deepdoc/parser/pdf/helpers_test.go @@ -1,6 +1,6 @@ //go:build cgo -package parser +package pdf import ( "os" @@ -11,20 +11,14 @@ import ( pdf "ragflow/internal/deepdoc/parser/pdf/type" ) -// ── Shared CGO test helpers ────────────────────────────────────────────────── -// These helpers were previously duplicated across multiple test files with -// different build tags (integration, manual). Consolidating them into one file -// with the //go:build cgo tag makes them available to all cgo-tagged tests. 
- -// mustConnectInferenceClient returns a InferenceClient pointed at the OSS service; -// skips the test if the service reports a non-OSS model type. -func mustConnectInferenceClient(t *testing.T) *inf.InferenceClient { +// mustConnectInferenceClient returns a InferenceClient for the OSS DeepDoc service. +func mustConnectInferenceClient(t *testing.T) *inf.Client { t.Helper() url := os.Getenv("OSSDEEPDOC_URL") if url == "" { url = "http://localhost:9390" } - client, err := inf.NewInferenceClient(url) + client, err := inf.NewClient(url) if err != nil { t.Fatal(err) } @@ -48,3 +42,12 @@ func mustOpenEngine(t *testing.T, name string) pdf.PDFEngine { } return eng } + +func mustReadPDF(t *testing.T, name string) []byte { + t.Helper() + data, err := os.ReadFile(filepath.Join("testdata", "pdfs", name)) + if err != nil { + t.Fatalf("read fixture %s: %v", name, err) + } + return data +} diff --git a/internal/deepdoc/parser/pdf/inference/client.go b/internal/deepdoc/parser/pdf/inference/client.go index e7e5e48b43..22f367de43 100644 --- a/internal/deepdoc/parser/pdf/inference/client.go +++ b/internal/deepdoc/parser/pdf/inference/client.go @@ -21,8 +21,8 @@ import ( "github.com/cenkalti/backoff/v5" ) -// InferenceClient wraps the DeepDoc HTTP API. -type InferenceClient struct { +// Client wraps the DeepDoc HTTP API. +type Client struct { baseURL string httpClient *http.Client @@ -33,24 +33,27 @@ type InferenceClient struct { } // BaseURL returns the configured DeepDoc service URL. -func (c *InferenceClient) BaseURL() string { return c.baseURL } +func (c *Client) BaseURL() string { return c.baseURL } -// NewInferenceClient creates a client. baseURL must be provided by the caller +// NewClient creates a client. baseURL must be provided by the caller // (e.g. from the DEEPDOC_URL environment variable). Returns an error if empty. 
-func NewInferenceClient(baseURL string) (*InferenceClient, error) { +func NewClient(baseURL string) (*Client, error) { if baseURL == "" { return nil, fmt.Errorf("deepdoc client: baseURL is required (set DEEPDOC_URL)") } - return &InferenceClient{ + return &Client{ baseURL: baseURL, httpClient: &http.Client{ Timeout: 120 * time.Second, }, + DLALabels: DefaultDLALabels(), + TSRLabels: DefaultTSRLabels(), }, nil } -// Default DLA/TSR label tables used as fallback when no model-specific -// labels are injected by a TableBuilder constructor. +// DefaultDLALabels returns the 10-class DLA taxonomy matching Python's +// deepdoc/vision/dla_cli.py:10-21. Duplicates at indices 4, 7, 9 are +// kept verbatim for backward compatibility with existing inference servers. func DefaultDLALabels() []string { return []string{ pdf.LayoutTypeTitle, pdf.LayoutTypeText, pdf.LayoutTypeReference, @@ -59,6 +62,9 @@ func DefaultDLALabels() []string { pdf.LayoutTypeEquation, pdf.DLALabelFigureCaption, } } + +// DefaultTSRLabels returns the 6-class TSR taxonomy matching Python's +// deepdoc/server/adapters/tsr_adapter.py:21-26. func DefaultTSRLabels() []string { return []string{ "table", "table column", "table row", @@ -72,7 +78,7 @@ type bboxesResponse struct { } // DLA analyzes a full page image and returns labeled regions. 
-func (c *InferenceClient) DLA(ctx context.Context, pageImage image.Image) ([]pdf.DLARegion, error) { +func (c *Client) DLA(ctx context.Context, pageImage image.Image) ([]pdf.DLARegion, error) { data, err := util.EncodeJPEG(pageImage) if err != nil { return nil, fmt.Errorf("dla: encode: %w", err) @@ -87,9 +93,6 @@ func (c *InferenceClient) DLA(ctx context.Context, pageImage image.Image) ([]pdf continue } labels := c.DLALabels - if labels == nil { - labels = DefaultDLALabels() - } label := "" if clsID := int(b[5]); clsID >= 0 && clsID < len(labels) { label = labels[clsID] @@ -104,7 +107,7 @@ func (c *InferenceClient) DLA(ctx context.Context, pageImage image.Image) ([]pdf } // TSR recognises table structure from a cropped image. -func (c *InferenceClient) TSR(ctx context.Context, cropped image.Image) ([]pdf.TSRCell, error) { +func (c *Client) TSR(ctx context.Context, cropped image.Image) ([]pdf.TSRCell, error) { data, err := util.EncodeJPEG(cropped) if err != nil { return nil, fmt.Errorf("tsr: encode: %w", err) @@ -119,9 +122,6 @@ func (c *InferenceClient) TSR(ctx context.Context, cropped image.Image) ([]pdf.T continue } tlabels := c.TSRLabels - if tlabels == nil { - tlabels = DefaultTSRLabels() - } label := "" if len(b) >= 6 { if cls := int(b[5]); cls >= 0 && cls < len(tlabels) { @@ -152,7 +152,7 @@ type ocrRecognizeResponse struct { // OCRDetect detects text regions (bounding boxes) in an image. // DeepDoc /predict/ocr with operator=det returns quad boxes: [[[x0,y0],[x1,y1],[x2,y2],[x3,y3]], ...] -func (c *InferenceClient) OCRDetect(ctx context.Context, cropped image.Image) ([]pdf.OCRBox, error) { +func (c *Client) OCRDetect(ctx context.Context, cropped image.Image) ([]pdf.OCRBox, error) { data, err := util.EncodeJPEG(cropped) if err != nil { return nil, fmt.Errorf("ocr detect: encode: %w", err) @@ -197,7 +197,7 @@ func (c *InferenceClient) OCRDetect(ctx context.Context, cropped image.Image) ([ // OCRRecognize recognizes text in a cropped image region. 
// DeepDoc /predict/ocr with operator=rec returns [[["text", confidence], ...]] -func (c *InferenceClient) OCRRecognize(ctx context.Context, cropped image.Image) ([]pdf.OCRText, error) { +func (c *Client) OCRRecognize(ctx context.Context, cropped image.Image) ([]pdf.OCRText, error) { data, err := util.EncodeJPEG(cropped) if err != nil { return nil, fmt.Errorf("ocr rec: encode: %w", err) @@ -224,7 +224,7 @@ func (c *InferenceClient) OCRRecognize(ctx context.Context, cropped image.Image) // OCRRecognizeBatch recognizes text in multiple cropped image regions. // Returns a slice of results and a parallel slice of errors (nil on success). // A nil cropped image in the input produces nil results and a non-nil error. -func (c *InferenceClient) OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]pdf.OCRText, []error) { +func (c *Client) OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]pdf.OCRText, []error) { results := make([][]pdf.OCRText, len(cropped)) errs := make([]error, len(cropped)) @@ -255,7 +255,7 @@ func (c *InferenceClient) OCRRecognizeBatch(ctx context.Context, cropped []image } // Health checks whether the DeepDoc service is reachable. -func (c *InferenceClient) Health() bool { +func (c *Client) Health() bool { resp, err := c.httpClient.Get(c.baseURL + "/health") if err != nil { return false @@ -264,7 +264,7 @@ func (c *InferenceClient) Health() bool { return resp.StatusCode == 200 } -func (c *InferenceClient) post(ctx context.Context, endpoint string, imgData []byte, filename string, result interface{}, extraFields ...string) error { +func (c *Client) post(ctx context.Context, endpoint string, imgData []byte, filename string, result interface{}, extraFields ...string) error { // Build multipart body once — the image data is idempotent. 
var body bytes.Buffer w := multipart.NewWriter(&body) diff --git a/internal/deepdoc/parser/pdf/inference/client_test.go b/internal/deepdoc/parser/pdf/inference/client_test.go index 24ccf2c349..6f8e8bc016 100644 --- a/internal/deepdoc/parser/pdf/inference/client_test.go +++ b/internal/deepdoc/parser/pdf/inference/client_test.go @@ -11,11 +11,11 @@ import ( "testing" ) -// mustNewDeepDocClient wraps NewInferenceClient for test convenience. +// mustNewDeepDocClient wraps NewClient for test convenience. // Fails the test if the URL is empty. -func mustNewDeepDocClient(t *testing.T, baseURL string) *InferenceClient { +func mustNewDeepDocClient(t *testing.T, baseURL string) *Client { t.Helper() - client, err := NewInferenceClient(baseURL) + client, err := NewClient(baseURL) if err != nil { t.Fatalf("NewDeepDocClient(%q): %v", baseURL, err) } diff --git a/internal/deepdoc/parser/pdf/inference_client_integration_test.go b/internal/deepdoc/parser/pdf/inference_client_integration_test.go index fc3d343772..836968c630 100644 --- a/internal/deepdoc/parser/pdf/inference_client_integration_test.go +++ b/internal/deepdoc/parser/pdf/inference_client_integration_test.go @@ -1,13 +1,12 @@ //go:build cgo && integration -package parser +package pdf import ( "context" "strings" "testing" - tbl "ragflow/internal/deepdoc/parser/pdf/table" pdf "ragflow/internal/deepdoc/parser/pdf/type" ) @@ -15,13 +14,11 @@ import ( // through the OSS TableBuilder produces tables with the expected row/column structure. 
func TestIntegration_DeepDoc_TableStructure(t *testing.T) { client := mustConnectInferenceClient(t) - eng := mustOpenEngine(t, "06_table_content.pdf") - defer eng.Close() + data := mustReadPDF(t, "06_table_content.pdf") cfg := pdf.DefaultParserConfig() - cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client) - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -29,7 +26,7 @@ func TestIntegration_DeepDoc_TableStructure(t *testing.T) { t.Skip("DLA did not detect any tables in fixture") } - t.Logf("OssDeepDoc produced %d tables", len(result.Tables)) + t.Logf("DeepDoc produced %d tables", len(result.Tables)) for i, tbl := range result.Tables { t.Logf("table[%d]: %d rows", i, len(tbl.Rows)) for ri, row := range tbl.Rows { @@ -51,13 +48,11 @@ func TestIntegration_DeepDoc_TableStructure(t *testing.T) { // rows with the expected grid structure. 
func TestIntegration_DeepDoc_TableRows(t *testing.T) { client := mustConnectInferenceClient(t) - eng := mustOpenEngine(t, "06_table_content.pdf") - defer eng.Close() + data := mustReadPDF(t, "06_table_content.pdf") cfg := pdf.DefaultParserConfig() - cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client) - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -92,13 +87,11 @@ func TestIntegration_DeepDoc_Idempotency(t *testing.T) { client := mustConnectInferenceClient(t) parseOnce := func() *pdf.ParseResult { - eng := mustOpenEngine(t, "06_table_content.pdf") - defer eng.Close() + data := mustReadPDF(t, "06_table_content.pdf") cfg := pdf.DefaultParserConfig() - cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client) - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -124,13 +117,11 @@ func TestIntegration_DeepDoc_Idempotency(t *testing.T) { // does not crash. 
func TestIntegration_DeepDoc_EmptyPage(t *testing.T) { client := mustConnectInferenceClient(t) - eng := mustOpenEngine(t, "01_english_simple.pdf") - defer eng.Close() + data := mustReadPDF(t, "01_english_simple.pdf") cfg := pdf.DefaultParserConfig() - cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client) - p := NewParser(cfg, client) - _, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + _, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } diff --git a/internal/deepdoc/parser/pdf/mock_doc_analyzer_test.go b/internal/deepdoc/parser/pdf/mock_doc_analyzer.go similarity index 99% rename from internal/deepdoc/parser/pdf/mock_doc_analyzer_test.go rename to internal/deepdoc/parser/pdf/mock_doc_analyzer.go index 08d6906501..173f238cd3 100644 --- a/internal/deepdoc/parser/pdf/mock_doc_analyzer_test.go +++ b/internal/deepdoc/parser/pdf/mock_doc_analyzer.go @@ -1,4 +1,4 @@ -package parser +package pdf import ( "context" diff --git a/internal/deepdoc/parser/pdf/mock_engine.go b/internal/deepdoc/parser/pdf/mock_engine.go new file mode 100644 index 0000000000..b8034f1459 --- /dev/null +++ b/internal/deepdoc/parser/pdf/mock_engine.go @@ -0,0 +1,41 @@ +package pdf + +import ( + "image" + + pdf "ragflow/internal/deepdoc/parser/pdf/type" +) + +// MockEngine is a minimal pdf.PDFEngine stub for unit/integration tests. 
+type MockEngine struct { + Chars map[int][]pdf.TextChar + NumPages int + RenderW int + RenderH int +} + +func (m *MockEngine) ExtractChars(pg int) ([]pdf.TextChar, error) { + return m.Chars[pg], nil +} +func (m *MockEngine) RenderPage(pg int, dpi float64) ([]byte, error) { + return nil, ErrNoPDFData +} +func (m *MockEngine) RenderPageImage(pg int, dpi float64) (image.Image, error) { + w, h := m.RenderW, m.RenderH + if w <= 0 { + w = 100 + } + if h <= 0 { + h = 100 + } + return image.NewRGBA(image.Rect(0, 0, w, h)), nil +} +func (m *MockEngine) PageCount() (int, error) { + if m.NumPages <= 0 { + return 1, nil + } + return m.NumPages, nil +} +func (m *MockEngine) RawData() []byte { return nil } +func (m *MockEngine) Close() error { return nil } +func (m *MockEngine) Outlines() ([]pdf.Outline, error) { return nil, nil } diff --git a/internal/deepdoc/parser/pdf/ocr_merge_test.go b/internal/deepdoc/parser/pdf/ocr_merge_test.go index 7d8caa182d..fe34dfe0fa 100644 --- a/internal/deepdoc/parser/pdf/ocr_merge_test.go +++ b/internal/deepdoc/parser/pdf/ocr_merge_test.go @@ -1,11 +1,13 @@ //go:build cgo && manual -package parser +package pdf import ( "context" "image/png" "os" + inf "ragflow/internal/deepdoc/parser/pdf/inference" + util "ragflow/internal/deepdoc/parser/pdf/util" "strings" "testing" ) @@ -19,7 +21,7 @@ func TestOCR_mergeChars_RealScanned(t *testing.T) { if url == "" { t.Skip("DEEPDOC_URL not set") } - dd, err := inf.NewInferenceClient(url) + dd, err := inf.NewClient(url) if err != nil { t.Fatal(err) } @@ -41,7 +43,7 @@ func TestOCR_mergeChars_RealScanned(t *testing.T) { if err != nil { t.Fatal(err) } - t.Logf("pdf_oxide chars: %d", len(chars)) + t.Logf("pdf_oxide Chars: %d", len(chars)) var sample strings.Builder for i, c := range chars { diff --git a/internal/deepdoc/parser/pdf/ocr_recognize_batch_test.go b/internal/deepdoc/parser/pdf/ocr_recognize_batch_test.go index 08546ed1f8..6c6f834304 100644 --- a/internal/deepdoc/parser/pdf/ocr_recognize_batch_test.go 
+++ b/internal/deepdoc/parser/pdf/ocr_recognize_batch_test.go @@ -1,6 +1,6 @@ //go:build cgo -package parser +package pdf import ( "context" diff --git a/internal/deepdoc/parser/pdf/outline_extraction_test.go b/internal/deepdoc/parser/pdf/outline_extraction_test.go index 46b3de033f..552b819b58 100644 --- a/internal/deepdoc/parser/pdf/outline_extraction_test.go +++ b/internal/deepdoc/parser/pdf/outline_extraction_test.go @@ -1,4 +1,4 @@ -package parser +package pdf import ( "context" @@ -10,10 +10,10 @@ import ( // ── outline-tracking mock engines ────────────────────────────────────────── -// outlineTrackingEngine wraps mockEngine and records whether Outlines() +// outlineTrackingEngine wraps MockEngine and records whether Outlines() // was called. type outlineTrackingEngine struct { - *mockEngine + *MockEngine outlines []pdf.Outline outlinesCalled bool } @@ -25,7 +25,7 @@ func (e *outlineTrackingEngine) Outlines() ([]pdf.Outline, error) { // outlineErrorEngine returns an error from Outlines(). type outlineErrorEngine struct { - *mockEngine + *MockEngine } func (e *outlineErrorEngine) Outlines() ([]pdf.Outline, error) { @@ -46,13 +46,13 @@ func TestParse_ExtractsOutlinesFromEngine(t *testing.T) { {Title: "Section 1.1", Level: 1, PageNumber: 2}, } eng := &outlineTrackingEngine{ - mockEngine: &mockEngine{pageCount: 3}, + MockEngine: &MockEngine{NumPages: 3}, outlines: expectedOutlines, } mockDLA := &MockDocAnalyzer{Healthy: true} - p := NewParser(pdf.DefaultParserConfig(), mockDLA) + p := NewParser(pdf.DefaultParserConfig()) - result, err := p.Parse(context.Background(), eng) + result, err := p.ParseRaw(context.Background(), eng, mockDLA) if err != nil { t.Fatalf("Parse failed: %v", err) } @@ -79,18 +79,18 @@ func TestParse_ExtractsOutlinesFromEngine(t *testing.T) { // and produces sections (outlines are best-effort). 
func TestParse_OutlinesErrorDoesNotBlockParsing(t *testing.T) { eng := &outlineErrorEngine{ - mockEngine: &mockEngine{ - pageCount: 2, - chars: map[int][]pdf.TextChar{ + MockEngine: &MockEngine{ + NumPages: 2, + Chars: map[int][]pdf.TextChar{ 0: {{Text: "Hello world", X0: 100, X1: 200, Top: 100, Bottom: 120}}, 1: {{Text: "Page two", X0: 100, X1: 200, Top: 100, Bottom: 120}}, }, }, } mockDLA := &MockDocAnalyzer{Healthy: true} - p := NewParser(pdf.DefaultParserConfig(), mockDLA) + p := NewParser(pdf.DefaultParserConfig()) - result, err := p.Parse(context.Background(), eng) + result, err := p.ParseRaw(context.Background(), eng, mockDLA) if err != nil { t.Fatalf("Parse should not fail when Outlines() errors: %v", err) } diff --git a/internal/deepdoc/parser/pdf/page_batch_test.go b/internal/deepdoc/parser/pdf/page_batch_test.go index 0b1489f3c3..8b0c83b06a 100644 --- a/internal/deepdoc/parser/pdf/page_batch_test.go +++ b/internal/deepdoc/parser/pdf/page_batch_test.go @@ -1,6 +1,6 @@ //go:build cgo && manual -package parser +package pdf import ( "context" @@ -31,8 +31,8 @@ func TestParse_BatchEquivalence(t *testing.T) { defer eng.Close() cfg := pdf.DefaultParserConfig() cfg.BatchSize = batchSize - p := NewParser(cfg, &MockDocAnalyzer{Healthy: true}) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.ParseRaw(context.Background(), eng, mockDLA) if err != nil { t.Fatal(err) } diff --git a/internal/deepdoc/parser/pdf/parse_cgo.go b/internal/deepdoc/parser/pdf/parse_cgo.go new file mode 100644 index 0000000000..aae70ae232 --- /dev/null +++ b/internal/deepdoc/parser/pdf/parse_cgo.go @@ -0,0 +1,22 @@ +//go:build cgo + +package pdf + +import ( + "context" + "fmt" + + pdf "ragflow/internal/deepdoc/parser/pdf/type" +) + +// Parse runs the full PDF extraction pipeline from raw bytes. +// Creates and manages the PDF engine lifecycle internally. 
+func (p *Parser) Parse(ctx context.Context, data []byte, docAnalyzer pdf.DocAnalyzer) (*pdf.ParseResult, error) { + engine, err := NewEngine(data) + if err != nil { + return nil, fmt.Errorf("pdfoxide.NewEngine: %w", err) + } + defer engine.Close() + + return p.ParseRaw(ctx, engine, docAnalyzer) +} diff --git a/internal/deepdoc/parser/pdf/parser.go b/internal/deepdoc/parser/pdf/parser.go index f731f4e445..fc3ac96c73 100644 --- a/internal/deepdoc/parser/pdf/parser.go +++ b/internal/deepdoc/parser/pdf/parser.go @@ -1,4 +1,4 @@ -package parser +package pdf import ( "context" @@ -8,52 +8,36 @@ import ( "log/slog" "sync" - inf "ragflow/internal/deepdoc/parser/pdf/inference" lyt "ragflow/internal/deepdoc/parser/pdf/layout" tbl "ragflow/internal/deepdoc/parser/pdf/table" pdf "ragflow/internal/deepdoc/parser/pdf/type" util "ragflow/internal/deepdoc/parser/pdf/util" ) -// Parser is the main PDF text/layout extraction pipeline. +// Parser is the core PDF text/layout extraction pipeline. // It corresponds to RAGFlowPdfParser in pdf_parser.py. -// Parser is stateless after construction — safe to reuse across documents. +// Stateless after construction — safe to reuse across documents. type Parser struct { Config pdf.ParserConfig - - // DeepDoc is the required document layout / OCR / table recognition - // service. Set at construction time by NewParser. - DeepDoc pdf.DocAnalyzer - - // SampleChars samples up to n chars from a page for English detection. - // Defaults to random sampling (matching Python's random.choices). - // Inject a deterministic sampler for reproducible tests. - SampleChars pdf.SampleFunc - - // tableBuilder is the TSR model adapter. Set at construction time - // - // different implementation via Config.TableBuilder. - tableBuilder pdf.TableBuilder } -// NewParser creates a new Parser with the required DeepDoc service. 
-func NewParser(cfg pdf.ParserConfig, doc pdf.DocAnalyzer) *Parser { - tb := cfg.TableBuilder - if tb == nil { - tb = NewTableBuilderFor(doc) - } - return &Parser{ - Config: cfg, - DeepDoc: doc, - tableBuilder: tb, - } +// pageResult holds per-page output from extractPages. +type pageResult struct { + pg int + ocrBoxes []pdf.TextBox + chars []pdf.TextChar + ocrUsed bool + pageImg image.Image + err error +} + +// New creates a new Parser with the given config. +func NewParser(cfg pdf.ParserConfig) *Parser { + return &Parser{Config: cfg} } // ── TableBuilder factory ─────────────────────────────────────────────────── -// tableBuilderFactory holds a model-specific TableBuilder factory registered -// by EE packages via RegisterTableBuilder. If nil, the default OSS -// implementation is used. var tableBuilderFactory func(pdf.DocAnalyzer) pdf.TableBuilder // RegisterTableBuilder registers a TableBuilder factory for the PDF parser. @@ -62,30 +46,20 @@ func RegisterTableBuilder(factory func(pdf.DocAnalyzer) pdf.TableBuilder) { tableBuilderFactory = factory } -// NewTableBuilderFor creates the right TableBuilder, chosen by the registry. -// Checks the registry first for EE-registered implementations, falling back -// to the default OSS DeepDocTableBuilder. Label taxonomies are injected -// before construction. func NewTableBuilderFor(doc pdf.DocAnalyzer) pdf.TableBuilder { if tableBuilderFactory != nil { return tableBuilderFactory(doc) } - if c, ok := doc.(*inf.InferenceClient); ok { - c.DLALabels = inf.DefaultDLALabels() - c.TSRLabels = inf.DefaultTSRLabels() - } return tbl.NewDeepDocTableBuilder(doc) } -// Parse runs the full PDF extraction pipeline: chars → boxes → -// column assignment → text merge → vertical merge → sections. -// -// For documents larger than Config.BatchSize pages, processes in batches -// to bound memory usage (matching Python's batch_size=50). 
-// -// Returns a pdf.ParseResult containing sections, tables, page images, figures, -// and pipeline stage metrics. Parser itself remains stateless. -func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseResult, error) { +// ── Public API ───────────────────────────────────────────────────────────── + +// ParseRaw is the internal entry point: runs the core pipeline on an +// already-opened engine. Exported for tests that inject mock engines. +func (p *Parser) ParseRaw(ctx context.Context, engine pdf.PDFEngine, docAnalyzer pdf.DocAnalyzer) (*pdf.ParseResult, error) { + tb := NewTableBuilderFor(docAnalyzer) + // Normalize page range pageCount, err := engine.PageCount() if err != nil { @@ -103,11 +77,10 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes totalPages := toPage - fromPage + 1 batchSize := p.Config.BatchSize if batchSize <= 0 { - batchSize = 50 // default, matching Python's batch_size + batchSize = 50 } - // ── Prescan: lightweight char extraction for language/noise detection ── - // No rendering, no OCR — just raw chars for global decisions. 
+ // ── Prescan ── prescanChars := make(map[int][]pdf.TextChar) prescanMedianH := make(map[int]float64) prescanMedianW := make(map[int]float64) @@ -115,26 +88,27 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes chars, extractErr := engine.ExtractChars(pg) if extractErr != nil { slog.Warn("prescan: ExtractChars failed", "page", pg, "err", extractErr) - chars = nil // skip broken pages (matching old behavior) + chars = nil } prescanChars[pg] = chars prescanMedianH[pg] = util.MedianCharHeight(chars) prescanMedianW[pg] = util.MedianCharWidth(chars) } - isEnglish := util.DetectEnglish(prescanChars, totalPages, p.SampleChars) + isEnglish := util.DetectEnglish(prescanChars, totalPages, nil) scanNoise := util.IsScanNoise(util.FullTextFromChars(prescanChars)) - // ── Extract PDF outlines/bookmarks (best-effort, non-fatal) ── + // ── Outlines ── outlines, outlineErr := engine.Outlines() if outlineErr != nil { slog.Warn("Failed to extract PDF outlines; continuing without them", "err", outlineErr) outlines = nil } - // ── Small document: process all at once (no batching overhead) ── + // ── Small document ── if totalPages <= batchSize { result, err := p.processPages(ctx, engine, fromPage, toPage, - prescanChars, prescanMedianH, prescanMedianW, isEnglish, scanNoise) + prescanChars, prescanMedianH, prescanMedianW, isEnglish, scanNoise, + docAnalyzer, tb) if err != nil { return nil, err } @@ -142,7 +116,7 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes return result, nil } - // ── Large document: process in batches to bound memory ── + // ── Large document: batched ── slog.Info("batched processing", "pages", totalPages, "batchSize", batchSize) result := &pdf.ParseResult{PageImages: make(map[int]image.Image)} for start := fromPage; start <= toPage; start += batchSize { @@ -151,7 +125,6 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes } end := min(start+batchSize-1, toPage) - 
// Slice prescan data for this batch. batchChars := make(map[int][]pdf.TextChar, end-start+1) batchMH := make(map[int]float64, end-start+1) batchMW := make(map[int]float64, end-start+1) @@ -162,15 +135,14 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes } batch, err := p.processPages(ctx, engine, start, end, - batchChars, batchMH, batchMW, isEnglish, scanNoise) + batchChars, batchMH, batchMW, isEnglish, scanNoise, + docAnalyzer, tb) if err != nil { return nil, err } - // Merge batch results. result.Sections = append(result.Sections, batch.Sections...) result.Tables = append(result.Tables, batch.Tables...) - // Figures() is computed on demand from Sections. for pg, img := range batch.PageImages { result.PageImages[pg] = img } @@ -184,33 +156,22 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes return result, nil } -// extractPages runs per-page OCR (detect + recognize) for the given page -// range, returning text boxes, char data, whether any page used OCR, and -// any errors encountered. Partial results are returned even when some -// pages fail — callers should inspect the error for diagnostics but may -// still use the returned boxes and chars. 
+// ── Internal pipeline steps ──────────────────────────────────────────────── + func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine, fromPage, toPage int, prescanChars map[int][]pdf.TextChar, medianHeights, medianWidths map[int]float64, pageImages map[int]image.Image, + docAnalyzer pdf.DocAnalyzer, ) ([]pdf.TextBox, map[int][]pdf.TextChar, bool, error) { var boxes []pdf.TextBox pageChars := make(map[int][]pdf.TextChar) ocrUsedAny := false - type pr struct { - pg int - ocrBoxes []pdf.TextBox - chars []pdf.TextChar - ocrUsed bool - pageImg image.Image - err error - } pageCount := toPage - fromPage + 1 - results := make([]pr, pageCount) + results := make([]pageResult, pageCount) - // Semaphore cap: 0 → sequential; >0 → bounded parallelism. cap := p.Config.MaxOCRConcurrency if cap <= 0 { cap = 1 @@ -222,16 +183,15 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine, pg := fromPage + i chars := prescanChars[pg] - // Fast path: pages with embedded chars → sequential inline (no HTTP OCR). if len(chars) > 0 && !util.IsGarbledPage(chars) { - pageImg, renderErr := renderPageToImage(engine, pg) + pageImg, renderErr := RenderPageToImage(engine, pg) if renderErr == nil && pageImg != nil { pageImages[pg] = pageImg } var ocrBoxes []pdf.TextBox ocrUsed := false if !p.Config.SkipOCR && renderErr == nil && pageImg != nil { - ocrBoxes = ocrMergeChars(ctx, pageImg, chars, p.DeepDoc, pg) + ocrBoxes = ocrMergeChars(ctx, pageImg, chars, docAnalyzer, pg) if ocrBoxes == nil { ocrBoxes = lyt.CharsToBoxes(chars, pg, p.Config.SortByTop) } else { @@ -241,30 +201,28 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine, } else { ocrBoxes = lyt.CharsToBoxes(chars, pg, p.Config.SortByTop) } - results[i] = pr{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed} + results[i] = pageResult{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed} continue } - // OCR path: render + detect + recognize (potentially parallel). 
wg.Add(1) go func(i, pg int, chars []pdf.TextChar) { defer wg.Done() select { case <-ctx.Done(): - results[i] = pr{pg: pg, err: ctx.Err()} + results[i] = pageResult{pg: pg, err: ctx.Err()} return case sem <- struct{}{}: } defer func() { <-sem }() - pageImg, err := renderPageToImage(engine, pg) + pageImg, err := RenderPageToImage(engine, pg) if err != nil { - results[i] = pr{pg: pg, err: err} + results[i] = pageResult{pg: pg, err: err} return } - // Check if context was cancelled during render. if err := ctx.Err(); err != nil { - results[i] = pr{pg: pg, err: err} + results[i] = pageResult{pg: pg, err: err} return } @@ -275,7 +233,7 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine, if len(chars) > 0 { label = "garbled page" } - ocrBoxes = ocrDetectAndRecognize(ctx, pageImg, p.DeepDoc, pg, label) + ocrBoxes = ocrDetectAndRecognize(ctx, pageImg, docAnalyzer, pg, label) if ocrBoxes != nil { for j := range ocrBoxes { for _, r := range ocrBoxes[j].Text { @@ -286,9 +244,8 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine, ocrUsed = true } } - // Merged OCR path for pages with both embedded and OCR chars. if !ocrUsed && len(chars) > 0 && !p.Config.SkipOCR { - ocrBoxes = ocrMergeChars(ctx, pageImg, chars, p.DeepDoc, pg) + ocrBoxes = ocrMergeChars(ctx, pageImg, chars, docAnalyzer, pg) if ocrBoxes != nil { ocrUsed = true } @@ -298,15 +255,252 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine, ocrBoxes = lyt.CharsToBoxes(chars, pg, p.Config.SortByTop) } } - results[i] = pr{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed, pageImg: pageImg} + results[i] = pageResult{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed, pageImg: pageImg} }(i, pg, chars) } wg.Wait() + return mergePageResults(results, boxes, pageImages, pageChars, ocrUsedAny, medianHeights, medianWidths) +} - // Merge results in page order. 
+func (p *Parser) retryScanNoise(ctx context.Context, engine pdf.PDFEngine, + fromPage, toPage int, + pageImages map[int]image.Image, + pageChars map[int][]pdf.TextChar, + medianHeights, medianWidths map[int]float64, + ocrUsedAny bool, + docAnalyzer pdf.DocAnalyzer, +) ([]pdf.TextBox, map[int][]pdf.TextChar, bool) { + slog.Warn("scan noise: OCR retry", "from", fromPage, "to", toPage) + var boxes []pdf.TextBox + for pg := fromPage; pg <= toPage; pg++ { + img := pageImages[pg] + if img == nil { + var err error + img, err = RenderPageToImage(engine, pg) + if err != nil { + slog.Warn("scan noise: page render failed", "page", pg, "err", err) + continue + } + pageImages[pg] = img + } + ocrBoxes := ocrDetectAndRecognize(ctx, img, docAnalyzer, pg, "scan page") + if ocrBoxes == nil { + slog.Warn("scan noise: page OCR empty", "page", pg) + continue + } + boxes = append(boxes, ocrBoxes...) + var chars []pdf.TextChar + for _, b := range ocrBoxes { + for _, r := range b.Text { + chars = append(chars, pdf.TextChar{Text: string(r), Top: b.Top, Bottom: b.Bottom, PageNumber: pg}) + break + } + } + pageChars[pg] = chars + medianHeights[pg] = util.MedianCharHeight(chars) + medianWidths[pg] = util.MedianCharWidth(chars) + } + slog.Debug("scan noise OCR retry complete", "pages", toPage-fromPage+1, "boxes", len(boxes)) + return boxes, pageChars, true +} + +func (p *Parser) retryZoom(ctx context.Context, engine pdf.PDFEngine, + fromPage, toPage int, + pageImages map[int]image.Image, + boxes []pdf.TextBox, ocrUsedAny bool, + docAnalyzer pdf.DocAnalyzer, +) ([]pdf.TextBox, bool) { + retryZoomVal := p.Config.Zoom * pdf.DlaScale + retryDPI := retryZoomVal * 72 + slog.Info("zoom retry: re-rendering", "oldZoom", p.Config.Zoom, "newZoom", retryZoomVal) + for pg := fromPage; pg <= toPage; pg++ { + img, err := engine.RenderPageImage(pg, retryDPI) + if err != nil { + slog.Warn("zoom retry: render failed", "page", pg, "err", err) + continue + } + pageImages[pg] = img + if retryDPI != pdf.DlaDPI { + 
if dlaImg, dlaErr := engine.RenderPageImage(pg, pdf.DlaDPI); dlaErr == nil { + pageImages[pg] = dlaImg + } + } + ocrBoxes := ocrDetectAndRecognize(ctx, img, docAnalyzer, pg, "zoom retry") + if ocrBoxes == nil { + continue + } + scaleFactor := retryZoomVal / p.Config.Zoom + for i := range ocrBoxes { + ocrBoxes[i].X0 /= scaleFactor + ocrBoxes[i].X1 /= scaleFactor + ocrBoxes[i].Top /= scaleFactor + ocrBoxes[i].Bottom /= scaleFactor + } + boxes = append(boxes, ocrBoxes...) + ocrUsedAny = true + } + return boxes, ocrUsedAny +} + +func (p *Parser) buildLayout(ctx context.Context, + result *pdf.ParseResult, engine pdf.PDFEngine, + boxes []pdf.TextBox, pageChars map[int][]pdf.TextChar, + medianHeights, medianWidths map[int]float64, + fromPage, toPage int, ocrUsedAny bool, isEnglish bool, + docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder, +) error { + result.Metrics.BoxesInitial = len(boxes) + + result.Tables = p.enrichWithDeepDoc(ctx, result, engine, boxes, result.PageImages, docAnalyzer, tb) + result.Metrics.TablesCount = len(result.Tables) + if err := ctx.Err(); err != nil { + return err + } + + boxes = lyt.AssignColumn(boxes, p.Config.Zoom) + boxes = lyt.TextMerge(boxes, medianHeights, p.Config.Zoom) + result.Metrics.BoxesTextMerge = len(boxes) + + lyt.SortByPageThenY(boxes, p.Config.SortByTop) + + if ocrUsedAny { + isEnglish = util.DetectEnglish(pageChars, toPage-fromPage+1, nil) + } + boxes = lyt.NaiveVerticalMerge(boxes, medianHeights, medianWidths, isEnglish) + result.Metrics.BoxesVertMerge = len(boxes) + if err := ctx.Err(); err != nil { + return err + } + + boxes = tbl.ExtractTableAndReplace(boxes, result.Tables) + boxes = tbl.ConsolidateFigures(boxes) + + pageHeights := make(map[int]float64, len(result.PageImages)) + for pg, img := range result.PageImages { + pageHeights[pg] = float64(img.Bounds().Dy()) / p.Config.Zoom + } + result.Sections = lyt.BoxesToSections(boxes, pageHeights) + result.Metrics.BoxesFinal = len(result.Sections) + result.Sections = 
tbl.MergeCaptions(result.Sections, result.Figures()) + return nil +} + +func (p *Parser) processPages(ctx context.Context, engine pdf.PDFEngine, + fromPage, toPage int, + prescanChars map[int][]pdf.TextChar, + medianHeights, medianWidths map[int]float64, + isEnglish, isScanNoiseDoc bool, + docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder, +) (*pdf.ParseResult, error) { + result := &pdf.ParseResult{PageImages: make(map[int]image.Image)} + + boxes, pageChars, ocrUsedAny, ocrErr := p.extractPages(ctx, engine, + fromPage, toPage, prescanChars, + medianHeights, medianWidths, result.PageImages, docAnalyzer) + if ocrErr != nil { + slog.Warn("extractPages: some pages failed OCR", "err", ocrErr) + } + + if isScanNoiseDoc { + boxes, pageChars, ocrUsedAny = p.retryScanNoise(ctx, engine, + fromPage, toPage, result.PageImages, + pageChars, medianHeights, medianWidths, ocrUsedAny, docAnalyzer) + } + + if len(boxes) == 0 && p.Config.Zoom < 9 && !p.Config.SkipOCR { + boxes, ocrUsedAny = p.retryZoom(ctx, engine, fromPage, toPage, + result.PageImages, boxes, ocrUsedAny, docAnalyzer) + } + + if len(boxes) == 0 { + return result, nil + } + + if err := p.buildLayout(ctx, result, engine, boxes, pageChars, + medianHeights, medianWidths, fromPage, toPage, ocrUsedAny, isEnglish, + docAnalyzer, tb); err != nil { + return nil, fmt.Errorf("buildLayout: %w", err) + } + p.fillSectionImages(result) + return result, nil +} + +func (p *Parser) fillSectionImages(result *pdf.ParseResult) { + if len(result.PageImages) == 0 { + return + } + tableImgByRegion := make(map[string]string, len(result.Tables)) + for _, tbl := range result.Tables { + if tbl.ImageB64 == "" { + continue + } + pg := 0 + if len(tbl.Positions) > 0 && len(tbl.Positions[0].PageNumbers) > 0 { + pg = tbl.Positions[0].PageNumbers[0] + } + key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f", + pg, tbl.RegionLeft, tbl.RegionRight, tbl.RegionTop, tbl.RegionBottom) + tableImgByRegion[key] = tbl.ImageB64 + } + for i := range result.Sections { + if 
result.Sections[i].LayoutType == pdf.LayoutTypeTable { + if img, ok := matchTableImage(&result.Sections[i], tableImgByRegion); ok { + result.Sections[i].Image = img + continue + } + } + if result.Sections[i].LayoutType == pdf.LayoutTypeFigure && len(result.Sections[i].Positions) > 0 { + if dlaImg := util.CropSectionByDLA(result.Sections[i], result.DLADebug, result.PageImages); dlaImg != "" { + result.Sections[i].Image = dlaImg + continue + } + } + img := util.CropSectionImage(result.Sections[i].PositionTag, result.PageImages, p.Config.Zoom) + result.Sections[i].Image = img + if img == "" && result.Sections[i].Text != "" { + tag := result.Sections[i].PositionTag + slog.Warn("cropSectionImage empty for non-empty section", + "section", i, "posTag", tag[:min(80, len(tag))]) + } + } +} + +// matchTableImage looks up a pre-rendered table image for a section. +// Uses Positions if available; falls back to TableItem Region boundaries. +func matchTableImage(sec *pdf.Section, tableImgByRegion map[string]string) (string, bool) { + pg := 0 + if len(sec.Positions) > 0 { + pos := sec.Positions[0] + if len(pos.PageNumbers) > 0 { + pg = pos.PageNumbers[0] + } + key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f", pg, pos.Left, pos.Right, pos.Top, pos.Bottom) + if img, ok := tableImgByRegion[key]; ok { + return img, true + } + return "", false + } + if sec.TableItem != nil { + if len(sec.TableItem.Positions) > 0 && len(sec.TableItem.Positions[0].PageNumbers) > 0 { + pg = sec.TableItem.Positions[0].PageNumbers[0] + } + key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f", pg, + sec.TableItem.RegionLeft, sec.TableItem.RegionRight, + sec.TableItem.RegionTop, sec.TableItem.RegionBottom) + if img, ok := tableImgByRegion[key]; ok { + return img, true + } + } + return "", false +} + +// mergePageResults collects per-page OCR results into the final output. 
+func mergePageResults(results []pageResult, boxes []pdf.TextBox, pageImages map[int]image.Image, + pageChars map[int][]pdf.TextChar, ocrUsedAny bool, + medianHeights, medianWidths map[int]float64, +) ([]pdf.TextBox, map[int][]pdf.TextChar, bool, error) { var errs []error - for i := 0; i < pageCount; i++ { - r := results[i] + for _, r := range results { if r.err != nil { slog.Warn("page OCR failed", "page", r.pg, "err", r.err) errs = append(errs, fmt.Errorf("page %d: %w", r.pg, r.err)) @@ -329,233 +523,3 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine, } return boxes, pageChars, ocrUsedAny, errors.Join(errs...) } - -// retryScanNoise re-runs OCR on all pages when prescan detects scan noise, -// overwriting page-level state with fresh detect+recognize results. -func (p *Parser) retryScanNoise(ctx context.Context, engine pdf.PDFEngine, - fromPage, toPage int, - pageImages map[int]image.Image, - pageChars map[int][]pdf.TextChar, - medianHeights, medianWidths map[int]float64, - ocrUsedAny bool, -) ([]pdf.TextBox, map[int][]pdf.TextChar, bool) { - slog.Warn("scan noise: OCR retry", "from", fromPage, "to", toPage) - var boxes []pdf.TextBox - for pg := fromPage; pg <= toPage; pg++ { - img := pageImages[pg] - if img == nil { - var err error - img, err = renderPageToImage(engine, pg) - if err != nil { - slog.Warn("scan noise: page render failed", "page", pg, "err", err) - continue - } - pageImages[pg] = img - } - ocrBoxes := ocrDetectAndRecognize(ctx, img, p.DeepDoc, pg, "scan page") - if ocrBoxes == nil { - slog.Warn("scan noise: page OCR empty", "page", pg) - continue - } - boxes = append(boxes, ocrBoxes...) 
- var chars []pdf.TextChar - for _, b := range ocrBoxes { - for _, r := range b.Text { - chars = append(chars, pdf.TextChar{Text: string(r), Top: b.Top, Bottom: b.Bottom, PageNumber: pg}) - break - } - } - pageChars[pg] = chars - medianHeights[pg] = util.MedianCharHeight(chars) - medianWidths[pg] = util.MedianCharWidth(chars) - } - slog.Debug("scan noise OCR retry complete", "pages", toPage-fromPage+1, "boxes", len(boxes)) - return boxes, pageChars, true -} - -// retryZoom re-renders pages at higher resolution and re-runs OCR when the -// initial extraction produced zero boxes. Box coordinates are scaled back -// to Config.Zoom space. Matches Python's __images__ retry. -func (p *Parser) retryZoom(ctx context.Context, engine pdf.PDFEngine, - fromPage, toPage int, - pageImages map[int]image.Image, - boxes []pdf.TextBox, ocrUsedAny bool, -) ([]pdf.TextBox, bool) { - retryZoom := p.Config.Zoom * pdf.DlaScale - retryDPI := retryZoom * 72 - slog.Info("zoom retry: re-rendering", "oldZoom", p.Config.Zoom, "newZoom", retryZoom) - for pg := fromPage; pg <= toPage; pg++ { - img, err := engine.RenderPageImage(pg, retryDPI) - if err != nil { - slog.Warn("zoom retry: render failed", "page", pg, "err", err) - continue - } - pageImages[pg] = img - // Downstream DLA/TSR assumes pdf.DlaDPI. Re-render at standard - // resolution so layout coordinates are scaled correctly. - if retryDPI != pdf.DlaDPI { - if dlaImg, dlaErr := engine.RenderPageImage(pg, pdf.DlaDPI); dlaErr == nil { - pageImages[pg] = dlaImg - } - } - ocrBoxes := ocrDetectAndRecognize(ctx, img, p.DeepDoc, pg, "zoom retry") - if ocrBoxes == nil { - continue - } - scaleFactor := retryZoom / p.Config.Zoom - for i := range ocrBoxes { - ocrBoxes[i].X0 /= scaleFactor - ocrBoxes[i].X1 /= scaleFactor - ocrBoxes[i].Top /= scaleFactor - ocrBoxes[i].Bottom /= scaleFactor - } - boxes = append(boxes, ocrBoxes...) 
- ocrUsedAny = true - } - return boxes, ocrUsedAny -} - -// buildLayout runs the DLA → TSR → Column → TextMerge → VM → pdf.Section -// pipeline and populates result.Metrics, result.Tables, result.Sections, -// and result.Sections. Matches Python's _parse_loaded_window_into_bboxes -// order. -func (p *Parser) buildLayout(ctx context.Context, - result *pdf.ParseResult, engine pdf.PDFEngine, - boxes []pdf.TextBox, pageChars map[int][]pdf.TextChar, - medianHeights, medianWidths map[int]float64, - fromPage, toPage int, ocrUsedAny bool, isEnglish bool, -) error { - result.Metrics.BoxesInitial = len(boxes) - - result.Tables = p.enrichWithDeepDoc(ctx, result, engine, boxes, result.PageImages) - result.Metrics.TablesCount = len(result.Tables) - if err := ctx.Err(); err != nil { - return err - } - - boxes = lyt.AssignColumn(boxes, p.Config.Zoom) - boxes = lyt.TextMerge(boxes, medianHeights, p.Config.Zoom) - result.Metrics.BoxesTextMerge = len(boxes) - - lyt.SortByPageThenY(boxes, p.Config.SortByTop) - - if ocrUsedAny { - isEnglish = util.DetectEnglish(pageChars, toPage-fromPage+1, p.SampleChars) - } - boxes = lyt.NaiveVerticalMerge(boxes, medianHeights, medianWidths, isEnglish) - result.Metrics.BoxesVertMerge = len(boxes) - if err := ctx.Err(); err != nil { - return err - } - - boxes = tbl.ExtractTableAndReplace(boxes, result.Tables) - boxes = tbl.ConsolidateFigures(boxes) - - pageHeights := make(map[int]float64, len(result.PageImages)) - for pg, img := range result.PageImages { - pageHeights[pg] = float64(img.Bounds().Dy()) / p.Config.Zoom - } - result.Sections = lyt.BoxesToSections(boxes, pageHeights) - result.Metrics.BoxesFinal = len(result.Sections) - result.Sections = tbl.MergeCaptions(result.Sections, result.Figures()) - return nil -} - -// processPages runs the full pipeline on pages [fromPage, toPage]. -// prescanChars provides pre-extracted chars (avoids double extraction). 
-func (p *Parser) processPages(ctx context.Context, engine pdf.PDFEngine, - fromPage, toPage int, - prescanChars map[int][]pdf.TextChar, - medianHeights, medianWidths map[int]float64, - isEnglish, isScanNoiseDoc bool, -) (*pdf.ParseResult, error) { - result := &pdf.ParseResult{PageImages: make(map[int]image.Image)} - - // 1. OCR extraction — per-page detect + recognize + char merge. - boxes, pageChars, ocrUsedAny, ocrErr := p.extractPages(ctx, engine, - fromPage, toPage, prescanChars, - medianHeights, medianWidths, result.PageImages) - if ocrErr != nil { - slog.Warn("extractPages: some pages failed OCR", "err", ocrErr) - } - // 2. Scan noise retry — re-OCR all pages when prescan detects scan noise. - if isScanNoiseDoc { - boxes, pageChars, ocrUsedAny = p.retryScanNoise(ctx, engine, - fromPage, toPage, result.PageImages, - pageChars, medianHeights, medianWidths, ocrUsedAny) - } - - // 3. Zoom retry — re-render at higher resolution if OCR produced zero boxes. - if len(boxes) == 0 && p.Config.Zoom < 9 && !p.Config.SkipOCR { - boxes, ocrUsedAny = p.retryZoom(ctx, engine, fromPage, toPage, - result.PageImages, boxes, ocrUsedAny) - } - - if len(boxes) == 0 { - return result, nil - } - - // 4. Layout pipeline — DLA → TSR → Column → TextMerge → VM → Sections. - if err := p.buildLayout(ctx, result, engine, boxes, pageChars, - medianHeights, medianWidths, fromPage, toPage, ocrUsedAny, isEnglish); err != nil { - return nil, fmt.Errorf("buildLayout: %w", err) - } - // 5. Crop section images from page renders. - p.fillSectionImages(result) - - return result, nil -} - -// fillSectionImages populates result.Sections[i].Image with cropped -// page images. Table sections are matched to their TableItem image; -// figure sections try DLA-aware cropping first, then fall back to -// position-tag-based cropping. -func (p *Parser) fillSectionImages(result *pdf.ParseResult) { - if len(result.PageImages) == 0 { - return - } - // Build lookup: DLA region -> table image (base64). 
- tableImgByRegion := make(map[string]string, len(result.Tables)) - for _, tbl := range result.Tables { - if tbl.ImageB64 == "" { - continue - } - pg := 0 - if len(tbl.Positions) > 0 && len(tbl.Positions[0].PageNumbers) > 0 { - pg = tbl.Positions[0].PageNumbers[0] - } - key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f", - pg, tbl.RegionLeft, tbl.RegionRight, tbl.RegionTop, tbl.RegionBottom) - tableImgByRegion[key] = tbl.ImageB64 - } - for i := range result.Sections { - if result.Sections[i].LayoutType == pdf.LayoutTypeTable && len(result.Sections[i].Positions) > 0 { - pos := result.Sections[i].Positions[0] - pg := 0 - if len(pos.PageNumbers) > 0 { - pg = pos.PageNumbers[0] - } - key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f", - pg, pos.Left, pos.Right, pos.Top, pos.Bottom) - if img, ok := tableImgByRegion[key]; ok { - result.Sections[i].Image = img - continue - } - } - // Try DLA-aware cropping for figure sections (matching Python's - // cropout which uses DLA region boundaries instead of text boxes). 
- if result.Sections[i].LayoutType == pdf.LayoutTypeFigure && len(result.Sections[i].Positions) > 0 { - if dlaImg := util.CropSectionByDLA(result.Sections[i], result.DLADebug, result.PageImages); dlaImg != "" { - result.Sections[i].Image = dlaImg - continue - } - } - img := util.CropSectionImage(result.Sections[i].PositionTag, result.PageImages, p.Config.Zoom) - result.Sections[i].Image = img - if img == "" && result.Sections[i].Text != "" { - tag := result.Sections[i].PositionTag - slog.Warn("cropSectionImage empty for non-empty section", - "section", i, "posTag", tag[:min(80, len(tag))]) - } - } -} diff --git a/internal/deepdoc/parser/pdf/parser_mock_test.go b/internal/deepdoc/parser/pdf/parser_mock_test.go index ae1a0998fb..11a0091fc3 100644 --- a/internal/deepdoc/parser/pdf/parser_mock_test.go +++ b/internal/deepdoc/parser/pdf/parser_mock_test.go @@ -1,4 +1,4 @@ -package parser +package pdf import ( "context" @@ -53,10 +53,11 @@ func TestEnrichWithDeepDoc_Noop(t *testing.T) { boxes := []pdf.TextBox{ {PageNumber: 0, X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "text"}, } - eng := &mockEngine{pageCount: 1} + eng := &MockEngine{NumPages: 1} - p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: false}) - tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, nil) + p := NewParser(pdf.DefaultParserConfig()) + mock := &MockDocAnalyzer{Healthy: false} + tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, nil, mock, NewTableBuilderFor(mock)) if len(tables) != 0 { t.Error("unhealthy DeepDoc → 0 Tables") } @@ -83,10 +84,10 @@ func TestExtractTableBoxes_Mock(t *testing.T) { {X0: 600, Y0: 410, X1: 1240, Y1: 800, Text: "B2"}, }, } - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) dummyImg := image.NewRGBA(image.Rect(0, 0, 2000, 3000)) - tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummyImg, 0, 0) + tables := 
p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummyImg, 0, 0, mock, NewTableBuilderFor(mock)) if len(tables) != 1 { t.Fatalf("expected 1 pdf.TableItem, got %d", len(tables)) } @@ -105,9 +106,9 @@ func TestExtractTableBoxes_Mock(t *testing.T) { func TestExtractTableBoxes_NoTables(t *testing.T) { mock := &MockDocAnalyzer{Healthy: true, DLARegions: []pdf.DLARegion{}} - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) dummy := image.NewRGBA(image.Rect(0, 0, 1000, 1000)) - tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0) + tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock)) if len(tables) != 0 { t.Errorf("0 tables expected, got %d", len(tables)) } @@ -121,9 +122,9 @@ func TestExtractTableBoxes_NonTableRegions(t *testing.T) { {X0: 150, Y0: 600, X1: 1650, Y1: 900, Label: "figure", Confidence: 0.8}, }, } - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) dummy := image.NewRGBA(image.Rect(0, 0, 2000, 2000)) - tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0) + tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock)) if len(tables) != 0 { t.Errorf("non-table regions → 0 tables, got %d", len(tables)) } @@ -139,9 +140,9 @@ func TestExtractTableBoxes_NoOverlap(t *testing.T) { {X0: 150, Y0: 1500, X1: 1500, Y1: 2300, Label: "table", Confidence: 0.95}, }, } - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) dummy := image.NewRGBA(image.Rect(0, 0, 2000, 3000)) - tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0) + tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0, mock, NewTableBuilderFor(mock)) if len(tables) != 0 { t.Errorf("no overlap → 0 tables, got %d", len(tables)) } @@ 
-158,9 +159,9 @@ func TestExtractTableBoxes_TSRError(t *testing.T) { }, TSRCells: nil, // TSR returns nothing } - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) dummy := image.NewRGBA(image.Rect(0, 0, 2000, 3000)) - tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0) + tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0, mock, NewTableBuilderFor(mock)) if len(tables) != 1 { t.Fatalf("TSR failure: expected 1 pdf.TableItem with image+positions, got %d", len(tables)) } @@ -180,9 +181,9 @@ func TestExtractTableBoxes_DLAError(t *testing.T) { mock := &MockDocAnalyzer{Healthy: true, DLARegions: []pdf.DLARegion{ {X0: 0, Y0: 0, X1: 100, Y1: 100, Label: "text", Confidence: 0.9}, }} - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) dummy := image.NewRGBA(image.Rect(0, 0, 1000, 1000)) - tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0) + tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock)) if len(tables) != 0 { t.Errorf("non-table DLA → 0 tables, got %d", len(tables)) } @@ -238,9 +239,9 @@ func TestExtractTableBoxes_InvalidRegion(t *testing.T) { {X0: 500, Y0: 100, X1: 100, Y1: 300, Label: "table", Confidence: 0.9}, }, } - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) dummy := image.NewRGBA(image.Rect(0, 0, 1000, 1000)) - tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0) + tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock)) if len(tables) != 0 { t.Errorf("invalid DLA region should be skipped, got %d tables", len(tables)) } @@ -252,16 +253,16 @@ func TestParse_CollectsFigures(t *testing.T) { // End-to-end: Parse() with mock DeepDoc that labels a box as "figure". 
// Verify p.Figures is populated. - eng := &mockEngine{pageCount: 1, chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "chart image"}}}} + eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "chart image"}}}} mock := &MockDocAnalyzer{ Healthy: true, DLARegions: []pdf.DLARegion{ {X0: 50, Y0: 200, X1: 2000, Y1: 1000, Label: "figure", Confidence: 0.85}, }, } - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) - result, err := p.Parse(context.Background(), eng) + result, err := p.ParseRaw(context.Background(), eng, mock) if err != nil { t.Fatalf("Parse: %v", err) } @@ -282,15 +283,15 @@ func TestParse_CollectsFigures(t *testing.T) { func TestParse_NoFigures(t *testing.T) { // Parse() with no DLA figure regions → p.Figures should be empty. - eng := &mockEngine{pageCount: 1, chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "just text"}}}} + eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "just text"}}}} mock := &MockDocAnalyzer{ DLARegions: []pdf.DLARegion{ {X0: 150, Y0: 300, X1: 1500, Y1: 600, Label: "text", Confidence: 0.8}, }, } - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) - result, err := p.Parse(context.Background(), eng) + result, err := p.ParseRaw(context.Background(), eng, mock) if err != nil { t.Fatalf("Parse: %v", err) } @@ -302,10 +303,11 @@ func TestParse_NoFigures(t *testing.T) { func TestParse_NoDeepDoc_NoFigures(t *testing.T) { // Parse() with mock DeepDoc → Figures should be empty (no DLA-detected figures). 
- eng := &mockEngine{pageCount: 1, chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "text"}}}} - p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true}) + eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "text"}}}} + mock := &MockDocAnalyzer{Healthy: true} + p := NewParser(pdf.DefaultParserConfig()) - result, err := p.Parse(context.Background(), eng) + result, err := p.ParseRaw(context.Background(), eng, mock) if err != nil { t.Fatalf("Parse: %v", err) } @@ -319,9 +321,9 @@ func TestParse_NoDeepDoc_NoFigures(t *testing.T) { func TestParse_UsesOCRDetectForEmbeddedChars(t *testing.T) { // When DeepDoc is available and the page has embedded chars, // Parse should use ocrMergeChars (detect → merge → recognize). - eng := &mockEngine{ - pageCount: 1, - chars: map[int][]pdf.TextChar{0: { + eng := &MockEngine{ + NumPages: 1, + Chars: map[int][]pdf.TextChar{0: { {X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0}, }}, } @@ -331,9 +333,9 @@ func TestParse_UsesOCRDetectForEmbeddedChars(t *testing.T) { {X0: 5, Y0: 5, X1: 50, Y1: 5, X2: 50, Y2: 50, X3: 5, Y3: 50}, }, } - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) - result, err := p.Parse(context.Background(), eng) + result, err := p.ParseRaw(context.Background(), eng, mock) if err != nil { t.Fatalf("Parse: %v", err) } @@ -349,15 +351,16 @@ func TestParse_UsesOCRDetectForEmbeddedChars(t *testing.T) { func TestParse_FallsBackToCharsToBoxes_NoDeepDoc(t *testing.T) { // Without DeepDoc, Parse should use charsToBoxes (unchanged behavior). 
- eng := &mockEngine{ - pageCount: 1, - chars: map[int][]pdf.TextChar{0: { + eng := &MockEngine{ + NumPages: 1, + Chars: map[int][]pdf.TextChar{0: { {X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0}, }}, } - p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true}) + mock := &MockDocAnalyzer{Healthy: true} + p := NewParser(pdf.DefaultParserConfig()) - result, err := p.Parse(context.Background(), eng) + result, err := p.ParseRaw(context.Background(), eng, mock) if err != nil { t.Fatalf("Parse: %v", err) } @@ -368,9 +371,9 @@ func TestParse_FallsBackToCharsToBoxes_NoDeepDoc(t *testing.T) { func TestParse_FallsBackToCharsToBoxes_EmptyOCRBoxes(t *testing.T) { // OCRDetect returns no boxes → falls through to charsToBoxes. - eng := &mockEngine{ - pageCount: 1, - chars: map[int][]pdf.TextChar{0: { + eng := &MockEngine{ + NumPages: 1, + Chars: map[int][]pdf.TextChar{0: { {X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0}, }}, } @@ -378,9 +381,9 @@ func TestParse_FallsBackToCharsToBoxes_EmptyOCRBoxes(t *testing.T) { Healthy: true, OCRBoxes: []pdf.OCRBox{}, // empty detect } - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) - result, err := p.Parse(context.Background(), eng) + result, err := p.ParseRaw(context.Background(), eng, mock) if err != nil { t.Fatalf("Parse: %v", err) } @@ -392,18 +395,19 @@ func TestParse_FallsBackToCharsToBoxes_EmptyOCRBoxes(t *testing.T) { // ── Error path coverage ──────────────────────────────────────────────── func TestMockDocAnalyzer_DLAError_DoesNotCrash(t *testing.T) { - p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{ + mock := &MockDocAnalyzer{ Healthy: true, DLAErr: fmt.Errorf("DLA service unavailable"), - }) - eng := &mockEngine{pageCount: 1} + } + p := NewParser(pdf.DefaultParserConfig()) + eng := &MockEngine{NumPages: 1} img := image.NewRGBA(image.Rect(0, 0, 100, 100)) pageImages := map[int]image.Image{0: img} boxes := 
[]pdf.TextBox{ {PageNumber: 0, X0: 0, X1: 100, Top: 0, Bottom: 50, Text: "text"}, } // enrichWithDeepDoc should return nil (not panic) on DLA error. - tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages) + tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages, mock, NewTableBuilderFor(mock)) if len(tables) != 0 { t.Errorf("DLA error should produce 0 tables, got %d", len(tables)) } @@ -412,20 +416,21 @@ func TestMockDocAnalyzer_DLAError_DoesNotCrash(t *testing.T) { func TestMockDocAnalyzer_TSRError_DoesNotCrash(t *testing.T) { // TSR error: DLA succeeds, TSR fails. The table region is detected // but no cells are returned — the table is skipped gracefully. - p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{ + mock := &MockDocAnalyzer{ Healthy: true, DLARegions: []pdf.DLARegion{ {X0: 0, Y0: 0, X1: 400, Y1: 400, Label: "table", Confidence: 0.95}, }, TSRErr: fmt.Errorf("TSR model timeout"), - }) - eng := &mockEngine{pageCount: 1} + } + p := NewParser(pdf.DefaultParserConfig()) + eng := &MockEngine{NumPages: 1} img := image.NewRGBA(image.Rect(0, 0, 100, 100)) pageImages := map[int]image.Image{0: img} boxes := []pdf.TextBox{ {PageNumber: 0, X0: 10, X1: 90, Top: 10, Bottom: 90, Text: "in table region"}, } - tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages) + tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages, mock, NewTableBuilderFor(mock)) // DLA detects the table region → 1 pdf.TableItem is created. TSR failure // means it has no cells, but the pipeline must not panic. if len(tables) != 1 { @@ -440,12 +445,12 @@ func TestMockDocAnalyzer_OCRDetectError_DoesNotCrash(t *testing.T) { // OCRDetect failure path: extractPages uses ocrDetectAndRecognize which // calls doc.OCRDetect. When it fails, the page is skipped gracefully. 
mock := &MockDocAnalyzer{Healthy: true, OCRDetectErr: fmt.Errorf("OCR model OOM")} - eng := &mockEngine{ - pageCount: 1, - chars: map[int][]pdf.TextChar{}, // empty → triggers OCR path + eng := &MockEngine{ + NumPages: 1, + Chars: map[int][]pdf.TextChar{}, // empty → triggers OCR path } - p := NewParser(pdf.DefaultParserConfig(), mock) - _, err := p.Parse(context.Background(), eng) + p := NewParser(pdf.DefaultParserConfig()) + _, err := p.ParseRaw(context.Background(), eng, mock) if err != nil { t.Fatalf("Parse returned error: %v", err) } diff --git a/internal/deepdoc/parser/pdf/parser_ocr.go b/internal/deepdoc/parser/pdf/parser_ocr.go index b9ae837b34..fb803aedff 100644 --- a/internal/deepdoc/parser/pdf/parser_ocr.go +++ b/internal/deepdoc/parser/pdf/parser_ocr.go @@ -1,4 +1,4 @@ -package parser +package pdf import ( "context" @@ -54,12 +54,17 @@ func ocrDetectAndRecognize(ctx context.Context, pageImg image.Image, doc pdf.Doc // merges the chars into detect regions, and OCRs any regions without chars. // Matches Python's __ocr: detect → match chars to boxes → use char text // for boxes with embedded chars → OCR recognize only empty/garbled boxes. +type ocrDetectBox struct { + box pdf.TextBox + x0, y0, x1, y1 float64 +} + func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextChar, doc pdf.DocAnalyzer, pageNum int) []pdf.TextBox { - detectBoxes, err := doc.OCRDetect(ctx, pageImg) - if err != nil || len(detectBoxes) == 0 { + ocrDetectBoxes, err := doc.OCRDetect(ctx, pageImg) + if err != nil || len(ocrDetectBoxes) == 0 { return nil } - slog.Debug("ocrMergeChars detect", "page", pageNum, "boxes", len(detectBoxes)) + slog.Debug("ocrMergeChars detect", "page", pageNum, "boxes", len(ocrDetectBoxes)) // Detect boxes are in pixel space (216 DPI). Scale to PDF space (72 DPI) // so coordinates match embedded chars. 
@@ -69,12 +74,8 @@ func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextCha imgH := float64(imgBounds.Dy()) / scale // Step 1: match embedded chars to detect boxes (Python __ocr char matching). - type detectBox struct { - box pdf.TextBox - x0, y0, x1, y1 float64 // PDF-space bounds - } - boxes := make([]detectBox, 0, len(detectBoxes)) - for _, b := range detectBoxes { + boxes := make([]ocrDetectBox, 0, len(ocrDetectBoxes)) + for _, b := range ocrDetectBoxes { x0 := min(b.X0, b.X1, b.X2, b.X3) / scale y0 := min(b.Y0, b.Y1, b.Y2, b.Y3) / scale x1 := max(b.X0, b.X1, b.X2, b.X3) / scale @@ -94,7 +95,7 @@ func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextCha if x0 >= x1 || y0 >= y1 { continue } - boxes = append(boxes, detectBox{box: pdf.TextBox{ + boxes = append(boxes, ocrDetectBox{box: pdf.TextBox{ X0: x0, X1: x1, Top: y0, Bottom: y1, PageNumber: pageNum, }, x0: x0, y0: y0, x1: x1, y1: y1}) } @@ -145,82 +146,7 @@ func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextCha boxChars[bestIdx] = append(boxChars[bestIdx], c) } - // Step 3: assemble text for each box. - var result []pdf.TextBox - var needOCR []int - for i := range boxes { - tb := boxes[i].box - tb.Text = "" - - if len(boxChars[i]) > 0 { - // Sort chars by reading order, matching Python's sort_Y_firstly. - // Fuzzy Y-group: chars within median char height are "same line", - // sorted by X; different lines sorted by Y. - sortCharsYFirstly(boxChars[i], util.MedianCharHeight(boxChars[i])) - // Use lineToTextBox for correct space insertion + garbled detection. - // lineToTextBox inserts ASCII word spaces at visible gaps — - // matching Python's __img_ocr + __ocr char logic. - lineBox := lyt.LineToTextBox(boxChars[i]) - tb.Text = lineBox.Text - - // Strategy 1: If majority of chars are garbled (PUA), clear text → OCR. 
- var garbledCnt, totalCnt int - for _, c := range boxChars[i] { - for _, r := range c.Text { - totalCnt++ - if util.IsGarbledChar(string(r)) { - garbledCnt++ - } - } - } - if totalCnt > 0 && float64(garbledCnt)/float64(totalCnt) >= 0.5 { - tb.Text = "" - } - // Strategy 2: font-encoding garbled (subset fonts, min 5 chars). - if tb.Text != "" && util.IsGarbledByFontEncoding(boxChars[i], 5) { - tb.Text = "" - } - } - - // Step 4: batch OCR recognize boxes without embedded chars (or garbled). - if tb.Text == "" { - needOCR = append(needOCR, i) - } - result = append(result, tb) - } - - if len(needOCR) > 0 { - cropped := make([]image.Image, len(needOCR)) - for j, idx := range needOCR { - cropped[j] = util.FastCrop(pageImg, - int(boxes[idx].x0*scale), int(boxes[idx].y0*scale), - int(boxes[idx].x1*scale), int(boxes[idx].y1*scale)) - } - allTexts, allErrs := doc.OCRRecognizeBatch(ctx, cropped) - for j, idx := range needOCR { - if allErrs[j] != nil { - slog.Warn("ocr merge: recognize failed", "page", pageNum, "err", allErrs[j]) - continue - } - var ocrParts []string - for _, t := range allTexts[j] { - if strings.TrimSpace(t.Text) != "" { - ocrParts = append(ocrParts, t.Text) - } - } - result[idx].Text = strings.TrimSpace(strings.Join(ocrParts, " ")) - } - } - // Filter out boxes with no text. - filtered := result[:0] - for _, tb := range result { - if tb.Text != "" { - filtered = append(filtered, tb) - } - } - result = filtered - slog.Debug("ocrMergeChars result", "page", pageNum, "boxes", len(result)) - return result + return buildTextBoxes(ctx, pageImg, boxes, boxChars, doc, scale, pageNum) } // sortYFirstly sorts chars by Y (fuzzy group by threshold), then by X. @@ -289,3 +215,71 @@ func ocrTableCells(ctx context.Context, cells []pdf.TSRCell, tableImg image.Imag cells[i].Text = strings.TrimSpace(strings.Join(parts, " ")) } } + +// buildTextBoxes assembles detect box text from embedded chars and fills +// empty boxes via batch OCR. 
+func buildTextBoxes(ctx context.Context, pageImg image.Image, + boxes []ocrDetectBox, boxChars [][]pdf.TextChar, doc pdf.DocAnalyzer, scale float64, pageNum int, +) []pdf.TextBox { + var result []pdf.TextBox + var needOCR []int + for i := range boxes { + tb := boxes[i].box + tb.Text = "" + if len(boxChars[i]) > 0 { + sortCharsYFirstly(boxChars[i], util.MedianCharHeight(boxChars[i])) + lineBox := lyt.LineToTextBox(boxChars[i]) + tb.Text = lineBox.Text + var garbledCnt, totalCnt int + for _, c := range boxChars[i] { + for _, r := range c.Text { + totalCnt++ + if util.IsGarbledChar(string(r)) { + garbledCnt++ + } + } + } + if totalCnt > 0 && float64(garbledCnt)/float64(totalCnt) >= 0.5 { + tb.Text = "" + } + if tb.Text != "" && util.IsGarbledByFontEncoding(boxChars[i], 5) { + tb.Text = "" + } + } + if strings.TrimSpace(tb.Text) == "" { + tb.Text = "" + needOCR = append(needOCR, i) + } + result = append(result, tb) + } + if len(needOCR) > 0 { + cropped := make([]image.Image, len(needOCR)) + for j, idx := range needOCR { + cropped[j] = util.FastCrop(pageImg, + int(boxes[idx].x0*scale), int(boxes[idx].y0*scale), + int(boxes[idx].x1*scale), int(boxes[idx].y1*scale)) + } + allTexts, allErrs := doc.OCRRecognizeBatch(ctx, cropped) + for j, idx := range needOCR { + if allErrs[j] != nil { + slog.Warn("ocr merge: recognize failed", "page", pageNum, "err", allErrs[j]) + continue + } + var ocrParts []string + for _, t := range allTexts[j] { + if strings.TrimSpace(t.Text) != "" { + ocrParts = append(ocrParts, t.Text) + } + } + result[idx].Text = strings.TrimSpace(strings.Join(ocrParts, " ")) + } + } + filtered := result[:0] + for _, tb := range result { + if strings.TrimSpace(tb.Text) != "" { + filtered = append(filtered, tb) + } + } + slog.Debug("ocrMergeChars result", "page", pageNum, "boxes", len(filtered)) + return filtered +} diff --git a/internal/deepdoc/parser/pdf/parser_ocr_test.go b/internal/deepdoc/parser/pdf/parser_ocr_test.go index 78efad4fcd..a5b4308f30 100644 --- 
a/internal/deepdoc/parser/pdf/parser_ocr_test.go +++ b/internal/deepdoc/parser/pdf/parser_ocr_test.go @@ -1,4 +1,4 @@ -package parser +package pdf import ( "context" diff --git a/internal/deepdoc/parser/pdf/parser_pipeline_integration_test.go b/internal/deepdoc/parser/pdf/parser_pipeline_integration_test.go index eb5facf679..cad7169937 100644 --- a/internal/deepdoc/parser/pdf/parser_pipeline_integration_test.go +++ b/internal/deepdoc/parser/pdf/parser_pipeline_integration_test.go @@ -1,6 +1,6 @@ //go:build cgo && integration -package parser +package pdf import ( "bytes" @@ -11,10 +11,10 @@ import ( _ "image/png" "os" "path/filepath" - "ragflow/internal/deepdoc/parser/pdf/post" - pdf "ragflow/internal/deepdoc/parser/pdf/type" "strings" "testing" + + pdf "ragflow/internal/deepdoc/parser/pdf/type" ) // ── golden-file helpers ──────────────────────────────────────────────────── @@ -95,12 +95,11 @@ func tablesToGolden(tables []pdf.TableItem) []tableGolden { // TestIntegration_SectionsText verifies section text output matches golden. func TestIntegration_SectionsText(t *testing.T) { client := mustConnectInferenceClient(t) - eng := mustOpenEngine(t, "01_english_simple.pdf") - defer eng.Close() + data := mustReadPDF(t, "01_english_simple.pdf") cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -139,12 +138,11 @@ func TestIntegration_SectionsText(t *testing.T) { // TestIntegration_SectionsCount verifies section count is stable. 
func TestIntegration_SectionsCount(t *testing.T) { client := mustConnectInferenceClient(t) - eng := mustOpenEngine(t, "01_english_simple.pdf") - defer eng.Close() + data := mustReadPDF(t, "01_english_simple.pdf") cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -166,12 +164,11 @@ func TestIntegration_SectionsCount(t *testing.T) { // TestIntegration_TableStructure verifies table rows and cell text match golden. func TestIntegration_TableStructure(t *testing.T) { client := mustConnectInferenceClient(t) - eng := mustOpenEngine(t, "06_table_content.pdf") - defer eng.Close() + data := mustReadPDF(t, "06_table_content.pdf") cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -221,12 +218,11 @@ func TestIntegration_TableStructure(t *testing.T) { // TestIntegration_TableImageB64 verifies table ImageB64 is valid base64 PNG. func TestIntegration_TableImageB64(t *testing.T) { client := mustConnectInferenceClient(t) - eng := mustOpenEngine(t, "06_table_content.pdf") - defer eng.Close() + data := mustReadPDF(t, "06_table_content.pdf") cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -261,12 +257,11 @@ func TestIntegration_TableImageB64(t *testing.T) { // TestIntegration_LayoutTypes verifies DLA labels boxes with expected types. 
func TestIntegration_LayoutTypes(t *testing.T) { client := mustConnectInferenceClient(t) - eng := mustOpenEngine(t, "06_table_content.pdf") - defer eng.Close() + data := mustReadPDF(t, "06_table_content.pdf") cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -316,7 +311,6 @@ func TestIntegration_Idempotency(t *testing.T) { // Render a fixture page as the stable input image. eng := mustOpenEngine(t, "06_table_content.pdf") - defer eng.Close() pageImg, err := eng.RenderPageImage(0, 216) if err != nil { t.Fatalf("render page: %v", err) @@ -531,12 +525,11 @@ func floatClose(a, b, eps float64) bool { // fixes from the Python→Go migration. func TestIntegration_TableAlign(t *testing.T) { client := mustConnectInferenceClient(t) - eng := mustOpenEngine(t, "18_table_caption.pdf") - defer eng.Close() + data := mustReadPDF(t, "18_table_caption.pdf") cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -572,12 +565,11 @@ func TestIntegration_TableAlign(t *testing.T) { // (header/footer/reference) boxes are popped from output. 
func TestIntegration_GarbageLayout(t *testing.T) { client := mustConnectInferenceClient(t) - eng := mustOpenEngine(t, "17_garbage_layout.pdf") - defer eng.Close() + data := mustReadPDF(t, "17_garbage_layout.pdf") cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -603,13 +595,12 @@ func TestIntegration_GarbageLayout(t *testing.T) { // TestIntegration_MultiChunk verifies chunked processing for large documents. func TestIntegration_MultiChunk(t *testing.T) { client := mustConnectInferenceClient(t) - eng := mustOpenEngine(t, "19_multipage_chunk.pdf") - defer eng.Close() + data := mustReadPDF(t, "19_multipage_chunk.pdf") cfg := pdf.DefaultParserConfig() cfg.BatchSize = 10 // small batches to force multi-batch path - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -635,11 +626,10 @@ func TestIntegration_NoRegression(t *testing.T) { "07_mixed_content.pdf", } { t.Run(name, func(t *testing.T) { - eng := mustOpenEngine(t, name) - defer eng.Close() + data := mustReadPDF(t, name) cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -662,11 +652,10 @@ func TestIntegration_TableRotation(t *testing.T) { client := mustConnectInferenceClient(t) t.Run("upright_table", func(t *testing.T) { - eng := mustOpenEngine(t, "rotate_0.pdf") - defer eng.Close() + data := mustReadPDF(t, "rotate_0.pdf") cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := 
p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -677,16 +666,15 @@ func TestIntegration_TableRotation(t *testing.T) { }) t.Run("rotated_90_table", func(t *testing.T) { - eng := mustOpenEngine(t, "rotate_90.pdf") - defer eng.Close() + data := mustReadPDF(t, "rotate_90.pdf") cfg := pdf.DefaultParserConfig() // DeepDoc DLA does not yet correctly annotate boxes on rotated // pages (regions and characters are in different coordinate // spaces post-rotation). Character extraction and rotation are - // verified via the charsToBoxes path. + // verified via the lyt.CharsToBoxes path. cfg.SkipOCR = true - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -701,12 +689,11 @@ func TestIntegration_TableRotation(t *testing.T) { // characters with a visible gap (Python __img_ocr space insertion). func TestIntegration_WordSpacing(t *testing.T) { client := mustConnectInferenceClient(t) - eng := mustOpenEngine(t, "01_english_simple.pdf") - defer eng.Close() + data := mustReadPDF(t, "01_english_simple.pdf") cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.Parse(context.Background(), data, client) if err != nil { t.Fatalf("Parse: %v", err) } @@ -734,53 +721,34 @@ func TestIntegration_WordSpacing(t *testing.T) { // TestE2E_ParseAndPostProcess runs Parse → PostProcess end-to-end on a real // PDF. Skips VLM (no tenant_id set) but exercises all other operators. 
func TestE2E_ParseAndPostProcess(t *testing.T) { - engine := mustOpenEngine(t, "01_english_simple.pdf") - defer engine.Close() + data := mustReadPDF(t, "01_english_simple.pdf") mock := &MockDocAnalyzer{Healthy: true} - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) - result, err := p.Parse(context.Background(), engine) + result, err := p.Parse(context.Background(), data, mock) if err != nil { t.Fatalf("Parse: %v", err) } - preCount := len(result.Sections) - if preCount == 0 { + if len(result.Sections) == 0 { t.Fatal("Parse() returned zero sections") } + t.Logf("sections: %d", len(result.Sections)) - // Post-processing (no VLM). - config := post.PipelineConfig{ - post.ConfigKeyPageWidth: 612.0, - post.ConfigKeyZoom: 1.0, - } - if err := post.PostProcess(context.Background(), result, config); err != nil { - t.Fatalf("PostProcess: %v", err) - } - - postCount := len(result.Sections) - t.Logf("sections: %d → %d after PostProcess", preCount, postCount) - if postCount == 0 { - t.Error("PostProcess removed all sections") - } - - // Every section must have DocTypeKwd + LayoutType set. + // PostProcess is handled by the Pipeline framework. + // Verify raw parse produces sections with LayoutType set. for i, s := range result.Sections { - if s.DocTypeKwd == "" { - t.Errorf("section[%d] DocTypeKwd empty after PostProcess", i) - } - if s.LayoutType == "" { - t.Errorf("section[%d] LayoutType empty after PostProcess", i) - } + t.Logf(" section[%d]: layout=%q text=%q", i, s.LayoutType, truncate(s.Text, 60)) } - // Figures() must reflect post-processed sections. figs := result.Figures() t.Logf("figures: %d", len(figs)) - for _, f := range figs { - if f.LayoutType != "figure" { - t.Errorf("Figures() LayoutType=%q, want 'figure'", f.LayoutType) - } - } +} + +func truncate(s string, n int) string { + if len(s) <= n { + return s + } + return s[:n] + "..." 
} diff --git a/internal/deepdoc/parser/pdf/parser_pipeline_manual_test.go b/internal/deepdoc/parser/pdf/parser_pipeline_manual_test.go index 9c2edfa522..d36ac20c3c 100644 --- a/internal/deepdoc/parser/pdf/parser_pipeline_manual_test.go +++ b/internal/deepdoc/parser/pdf/parser_pipeline_manual_test.go @@ -1,6 +1,6 @@ //go:build cgo && manual -package parser +package pdf import ( "context" @@ -47,8 +47,8 @@ func TestIntegration_NoCrash(t *testing.T) { defer eng.Close() cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.ParseRaw(context.Background(), eng, client) if err != nil { t.Fatalf("Parse: %v", err) } diff --git a/internal/deepdoc/parser/pdf/parser_test.go b/internal/deepdoc/parser/pdf/parser_test.go index e703d69a33..0c2fe026a6 100644 --- a/internal/deepdoc/parser/pdf/parser_test.go +++ b/internal/deepdoc/parser/pdf/parser_test.go @@ -1,4 +1,4 @@ -package parser +package pdf import ( "context" @@ -6,6 +6,7 @@ import ( "strings" "sync" "testing" + "math" lyt "ragflow/internal/deepdoc/parser/pdf/layout" tbl "ragflow/internal/deepdoc/parser/pdf/table" @@ -207,15 +208,16 @@ func TestOCR_FallbackIntegration(t *testing.T) { func TestOCR_FallbackIntegration_NoDeepDoc(t *testing.T) { chars := garbledSample() - mockEng := &mockEngine{chars: map[int][]pdf.TextChar{0: chars}, pageCount: 1} + mockEng := &MockEngine{Chars: map[int][]pdf.TextChar{0: chars}, NumPages: 1} + mockDLA := &MockDocAnalyzer{Healthy: true} cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, &MockDocAnalyzer{Healthy: true}) - result, err := p.Parse(context.Background(), mockEng) + p := NewParser(cfg) + result, err := p.ParseRaw(context.Background(), mockEng, mockDLA) if err != nil { t.Fatal(err) } - t.Logf("garbled chars: %d sections", len(result.Sections)) + t.Logf("garbled Chars: %d sections", len(result.Sections)) } func TestNoDeepDoc_PdfOxideUnmapped_KeepsChars(t *testing.T) { @@ -241,9 
+243,10 @@ func TestNoDeepDoc_PdfOxideUnmapped_KeepsChars(t *testing.T) { chars[28] = pdf.TextChar{Text: "*", FontName: "SimSun", X0: 194, X1: 202, Top: 100, Bottom: 112} chars[29] = pdf.TextChar{Text: "用", FontName: "SimSun", X0: 202, X1: 210, Top: 100, Bottom: 112} - mockEng := &mockEngine{chars: map[int][]pdf.TextChar{0: chars}, pageCount: 1} - p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true}) - result, err := p.Parse(context.Background(), mockEng) + mockEng := &MockEngine{Chars: map[int][]pdf.TextChar{0: chars}, NumPages: 1} + mockDLA := &MockDocAnalyzer{Healthy: true} + p := NewParser(pdf.DefaultParserConfig()) + result, err := p.ParseRaw(context.Background(), mockEng, mockDLA) if err != nil { t.Fatal(err) } @@ -279,7 +282,7 @@ func TestIsGarbledPage(t *testing.T) { }) t.Run("pdf oxide unmapped + CJK — not garbled", func(t *testing.T) { // ### unmapped glyphs + real CJK text (no subset fonts). - // isScanNoise returns false (≥2 consecutive CJK chars: "护理全科"). + // isScanNoise returns false (≥2 consecutive CJK Chars: "护理全科"). chars := []pdf.TextChar{ {Text: "和", PageNumber: 0}, {Text: "蔘", PageNumber: 0}, {Text: "语", PageNumber: 0}, {Text: "言", PageNumber: 0}, @@ -552,11 +555,12 @@ func TestTableSectionCaptionInHTML(t *testing.T) { // text boxes that are mostly OUTSIDE the cell, even with cellIsEmpty=true. // The 0.3 threshold should not match a wide box that barely touches a // narrow cell — this would cause body text to leak into table cells. -// TestParser_ConcurrentSafety verifies that Parser.Parse() is safe for +// TestParser_ConcurrentSafety verifies that Parser.ParseRaw() is safe for // concurrent use. 8 goroutines each call Parse 5 times on the same Parser // instance. Run with -race. 
func TestParser_ConcurrentSafety(t *testing.T) { - p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: false}) + mockDLA := &MockDocAnalyzer{Healthy: true} + p := NewParser(pdf.DefaultParserConfig()) var wg sync.WaitGroup n := 8 @@ -565,10 +569,58 @@ func TestParser_ConcurrentSafety(t *testing.T) { go func() { defer wg.Done() for range 5 { - eng := &mockEngine{pageCount: 2} - _, _ = p.Parse(context.Background(), eng) + eng := &MockEngine{NumPages: 2} + if _, err := p.ParseRaw(context.Background(), eng, mockDLA); err != nil { + t.Errorf("ParseRaw: %v", err) + } } }() } wg.Wait() } + +func TestParseRaw_ClampsFromPage(t *testing.T) { + // A negative FromPage should be treated as page 0. + // Only page 0 has content so we can verify clamping worked. + eng := &MockEngine{NumPages: 3, Chars: map[int][]pdf.TextChar{ + 0: {{Text: "page0", X0: 100, X1: 200, Top: 100, Bottom: 120}}, + }} + mockDLA := &MockDocAnalyzer{Healthy: true} + cfg := pdf.DefaultParserConfig() + cfg.FromPage = -1 + p := NewParser(cfg) + result, err := p.ParseRaw(context.Background(), eng, mockDLA) + if err != nil { + t.Fatalf("ParseRaw: %v", err) + } + if len(result.Sections) == 0 { + t.Error("expected sections from page 0") + } +} + +func TestParseRaw_ZeroZoom_NoNaN(t *testing.T) { + // Zoom=0 should not produce NaN coordinates. 
+ eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{ + 0: {{Text: "test", X0: 100, X1: 200, Top: 100, Bottom: 120}}, + }} + mockDLA := &MockDocAnalyzer{Healthy: true} + cfg := pdf.DefaultParserConfig() + cfg.Zoom = 0 + p := NewParser(cfg) + result, err := p.ParseRaw(context.Background(), eng, mockDLA) + if err != nil { + t.Fatalf("ParseRaw: %v", err) + } + foundPosition := false + for _, s := range result.Sections { + for _, pos := range s.Positions { + foundPosition = true + if math.IsNaN(pos.Left) || math.IsNaN(pos.Top) { + t.Error("Zoom=0 produced NaN coordinates") + } + } + } + if !foundPosition { + t.Fatal("expected at least one position to validate") + } +} diff --git a/internal/deepdoc/parser/pdf/pdfium_integration_test.go b/internal/deepdoc/parser/pdf/pdfium_integration_test.go index 3c20fea653..300564db31 100644 --- a/internal/deepdoc/parser/pdf/pdfium_integration_test.go +++ b/internal/deepdoc/parser/pdf/pdfium_integration_test.go @@ -1,6 +1,6 @@ //go:build cgo && manual -package parser +package pdf import ( "context" @@ -34,8 +34,8 @@ func TestParse_PdfiumRender(t *testing.T) { t.Fatalf("RawData() length %d != original %d", len(raw), len(data)) } - // Render a page through pdfium (via the parser's renderPageToImage). - img, err := renderPageToImage(eng, 0) + // Render a page through pdfium (via the parser's RenderPageToImage). + img, err := RenderPageToImage(eng, 0) if err != nil { t.Skipf("pdfium render not available: %v", err) } @@ -48,8 +48,8 @@ func TestParse_PdfiumRender(t *testing.T) { // Run Parse with pdfium rendering — BATCH_SKIP_DEEPDOC=1 to avoid HTTP calls. 
t.Setenv("BATCH_SKIP_DEEPDOC", "1") cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, &MockDocAnalyzer{Healthy: true}) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true}) if err != nil { t.Fatalf("Parse: %v", err) } @@ -64,10 +64,10 @@ func TestParse_PdfiumRender(t *testing.T) { } func TestParse_PdfiumRender_NoData(t *testing.T) { - // When engine has no raw PDF bytes, renderPageToImage falls back to + // When engine has no raw PDF bytes, RenderPageToImage falls back to // engine.RenderPageImage(). Stub returns (nil, nil) → guard converts // to ErrNoPDFData so callers never receive a nil image with nil error. - img, err := renderPageToImage(&pythonCharEngineStub{}, 0) + img, err := RenderPageToImage(&pythonCharEngineStub{}, 0) if err != ErrNoPDFData { t.Errorf("expected ErrNoPDFData, got %v", err) } diff --git a/internal/deepdoc/parser/pdf/pdfoxide_bridge.go b/internal/deepdoc/parser/pdf/pdfoxide_bridge.go index 195c88f4fc..340d634076 100644 --- a/internal/deepdoc/parser/pdf/pdfoxide_bridge.go +++ b/internal/deepdoc/parser/pdf/pdfoxide_bridge.go @@ -1,6 +1,6 @@ //go:build cgo -package parser +package pdf import ( "image" @@ -11,8 +11,8 @@ import ( ) // pdfoxideEngine adapts pdfoxide.Engine to the pdf.PDFEngine interface. -type pdfoxideEngine struct { - inner *pdfoxide.Engine +type PDFOxideEngine struct { + Inner *pdfoxide.Engine } // NewEngine returns a pdf.PDFEngine backed by pdf_oxide. 
@@ -21,15 +21,15 @@ func NewEngine(pdfBytes []byte) (pdf.PDFEngine, error) { if err != nil { return nil, err } - return &pdfoxideEngine{inner: eng}, nil + return &PDFOxideEngine{Inner: eng}, nil } -func (e *pdfoxideEngine) RawData() []byte { return e.inner.RawData() } -func (e *pdfoxideEngine) PageCount() (int, error) { return e.inner.PageCount() } -func (e *pdfoxideEngine) Close() error { return e.inner.Close() } +func (e *PDFOxideEngine) RawData() []byte { return e.Inner.RawData() } +func (e *PDFOxideEngine) PageCount() (int, error) { return e.Inner.PageCount() } +func (e *PDFOxideEngine) Close() error { return e.Inner.Close() } -func (e *pdfoxideEngine) Outlines() ([]pdf.Outline, error) { - ol := pdfium.ExtractOutlines(e.inner.RawData()) +func (e *PDFOxideEngine) Outlines() ([]pdf.Outline, error) { + ol := pdfium.ExtractOutlines(e.Inner.RawData()) result := make([]pdf.Outline, len(ol)) for i, o := range ol { result[i] = pdf.Outline{Title: o.Title, Level: o.Level, PageNumber: o.PageNumber} @@ -37,16 +37,16 @@ func (e *pdfoxideEngine) Outlines() ([]pdf.Outline, error) { return result, nil } -func (e *pdfoxideEngine) RenderPage(pageNum int, dpi float64) ([]byte, error) { - return e.inner.RenderPage(pageNum, dpi) +func (e *PDFOxideEngine) RenderPage(pageNum int, dpi float64) ([]byte, error) { + return e.Inner.RenderPage(pageNum, dpi) } -func (e *pdfoxideEngine) RenderPageImage(pageNum int, dpi float64) (image.Image, error) { - return e.inner.RenderPageImage(pageNum, dpi) +func (e *PDFOxideEngine) RenderPageImage(pageNum int, dpi float64) (image.Image, error) { + return e.Inner.RenderPageImage(pageNum, dpi) } -func (e *pdfoxideEngine) ExtractChars(pageNum int) ([]pdf.TextChar, error) { - chars, err := e.inner.ExtractChars(pageNum) +func (e *PDFOxideEngine) ExtractChars(pageNum int) ([]pdf.TextChar, error) { + chars, err := e.Inner.ExtractChars(pageNum) if err != nil { return nil, err } diff --git a/internal/deepdoc/parser/pdf/pipeline_parity_test.go 
b/internal/deepdoc/parser/pdf/pipeline_parity_test.go index 9ac1b56bfc..8bfb6e062c 100644 --- a/internal/deepdoc/parser/pdf/pipeline_parity_test.go +++ b/internal/deepdoc/parser/pdf/pipeline_parity_test.go @@ -1,6 +1,6 @@ //go:build cgo && manual -package parser +package pdf import ( "context" @@ -13,6 +13,7 @@ import ( lyt "ragflow/internal/deepdoc/parser/pdf/layout" "ragflow/internal/deepdoc/parser/pdf/tool" pdf "ragflow/internal/deepdoc/parser/pdf/type" + util "ragflow/internal/deepdoc/parser/pdf/util" ) // TestPipelineParity verifies Go pipeline logic equivalence with Python. @@ -53,8 +54,9 @@ func TestPipelineParity(t *testing.T) { // Run Go pipeline (SKIP_OCR — no DeepDoc) cfg := pdf.DefaultParserConfig() cfg.SortByTop = true - p := NewParser(cfg, &MockDocAnalyzer{Healthy: true}) - result, err := p.Parse(context.Background(), engine) + mockAnalyzer := &MockDocAnalyzer{Healthy: true} + p := NewParser(cfg) + result, err := p.ParseRaw(context.Background(), engine, mockAnalyzer) if err != nil { t.Errorf("%s: Parse: %v", name, err) continue @@ -151,7 +153,7 @@ func TestVMWhitespaceGapBridge(t *testing.T) { if isWS && len(out) > 0 { prev := &out[len(out)-1] gap := b.Top - prev.Bottom - ov := OverlapX(prev, &b) + ov := util.OverlapX(prev, &b) // Python: gap passes AND xov passes → whitespace merged // into prev, extending bottom. i advances (Go for-loop). 
if gap <= thr && ov >= 0.3 { @@ -169,7 +171,7 @@ func TestVMWhitespaceGapBridge(t *testing.T) { continue } gap := b.Top - prev.Bottom - ov := OverlapX(prev, &b) + ov := util.OverlapX(prev, &b) if gap > thr { out = append(out, b) continue @@ -219,7 +221,7 @@ func TestVMWhitespaceGapBridge(t *testing.T) { continue } gap := b.Top - prev.Bottom - ov := OverlapX(prev, &b) + ov := util.OverlapX(prev, &b) if gap > thr { out = append(out, b) continue @@ -250,18 +252,18 @@ func TestVMWhitespaceGapBridge(t *testing.T) { t.Logf("Gap with bridge: 420.16 - 406.79 = %.2f < %.2f = MERGE", 420.16-406.79, thr) // The manual vWithWS (Python-like) and vNoWS (old Go pre-filter) still - // differ — the mechanism is real. But production NaiveVerticalMerge now + // differ — the mechanism is real. But production lyt.NaiveVerticalMerge now // handles whitespace inline (gap bridge), matching Python. if nWS == nNoWS { t.Error("Manual implementations should differ — the gap bridge mechanism is real") } - // Verify production NaiveVerticalMerge matches vWithWS (Python behavior). + // Verify production lyt.NaiveVerticalMerge matches vWithWS (Python behavior). 
mhMap := map[int]float64{1: mh} mwMap := map[int]float64{1: 5} vmResult := lyt.NaiveVerticalMerge(boxes, mhMap, mwMap, false) - t.Logf("NaiveVerticalMerge (production): %d sections", len(vmResult)) + t.Logf("lyt.NaiveVerticalMerge (production): %d sections", len(vmResult)) if len(vmResult) != nWS { - t.Errorf("NaiveVerticalMerge produced %d sections, want %d (Python-like with gap bridge)", len(vmResult), nWS) + t.Errorf("lyt.NaiveVerticalMerge produced %d sections, want %d (Python-like with gap bridge)", len(vmResult), nWS) } } diff --git a/internal/deepdoc/parser/pdf/post/model_image_describer.go b/internal/deepdoc/parser/pdf/post/model_image_describer.go deleted file mode 100644 index cd1d65065a..0000000000 --- a/internal/deepdoc/parser/pdf/post/model_image_describer.go +++ /dev/null @@ -1,101 +0,0 @@ -package post - -import ( - "bytes" - "context" - "encoding/base64" - "errors" - "fmt" - "image" - "image/png" -) - -// ── chat driver interface (self-contained, avoids entity/models import) ── - -// ChatDriver is the subset of modelModule.ModelDriver needed to call a -// vision-capable chat API. Defined here to keep model_image_describer.go -// self-contained and avoid import chains that require CGO. -type ChatDriver interface { - ChatWithMessages(modelName string, messages []ChatMessage, apiConfig *ChatAPIConfig, chatConfig *ChatConfig) (*ChatResponse, error) -} - -// ChatMessage mirrors modelModule.Message. -type ChatMessage struct { - Role string `json:"role"` - Content interface{} `json:"content"` - ToolCallID string `json:"tool_call_id,omitempty"` - ToolCalls []map[string]interface{} `json:"tool_calls,omitempty"` -} - -// ChatAPIConfig mirrors modelModule.APIConfig. -type ChatAPIConfig struct { - ApiKey *string - Region *string - BaseURL *string -} - -// ChatConfig mirrors modelModule.ChatConfig (may be nil). -type ChatConfig struct{} - -// ChatResponse mirrors modelModule.ChatResponse. 
-type ChatResponse struct { - Answer *string `json:"answer"` - ReasonContent *string `json:"reason_content"` - ToolCalls []map[string]interface{} `json:"tool_calls,omitempty"` -} - -// ── ModelImageDescriber ──────────────────────────────────────────────── - -// ModelImageDescriber implements ImageDescriber via any ChatDriver. -type ModelImageDescriber struct { - driver ChatDriver - modelName string - apiConfig *ChatAPIConfig - maxTokens int -} - -// NewModelImageDescriber creates a ModelImageDescriber that calls the given -// driver to describe images. maxTokens sets the response length limit (passed -// as ChatConfig.MaxTokens); 0 means use provider default. -func NewModelImageDescriber(d ChatDriver, name string, cfg *ChatAPIConfig, maxTokens int) *ModelImageDescriber { - return &ModelImageDescriber{driver: d, modelName: name, apiConfig: cfg, maxTokens: maxTokens} -} - -// DescribeImage sends the image as a base64 data URL in an OpenAI-compatible -// vision API request. Returns the model's text response. -func (d *ModelImageDescriber) DescribeImage(ctx context.Context, img image.Image, prompt string) (string, error) { - dataURL, err := encodeImageToBase64DataURL(img) - if err != nil { - return "", fmt.Errorf("image encode: %w", err) - } - - msgs := []ChatMessage{{ - Role: "user", - Content: []interface{}{ - map[string]interface{}{"type": "text", "text": prompt}, - map[string]interface{}{"type": "image_url", "image_url": map[string]string{"url": dataURL}}, - }, - }} - - var chatCfg *ChatConfig - if d.maxTokens > 0 { - chatCfg = &ChatConfig{} - } - resp, err := d.driver.ChatWithMessages(d.modelName, msgs, d.apiConfig, chatCfg) - if err != nil { - return "", fmt.Errorf("image describe: %w", err) - } - if resp.Answer == nil || *resp.Answer == "" { - return "", errors.New("image describe: empty response") - } - return *resp.Answer, nil -} - -// encodeImageToBase64DataURL encodes an image as a PNG data URL. 
-func encodeImageToBase64DataURL(img image.Image) (string, error) { - var buf bytes.Buffer - if err := png.Encode(&buf, img); err != nil { - return "", err - } - return "data:image/png;base64," + base64.StdEncoding.EncodeToString(buf.Bytes()), nil -} diff --git a/internal/deepdoc/parser/pdf/post/model_image_describer_test.go b/internal/deepdoc/parser/pdf/post/model_image_describer_test.go deleted file mode 100644 index 1307b5600c..0000000000 --- a/internal/deepdoc/parser/pdf/post/model_image_describer_test.go +++ /dev/null @@ -1,79 +0,0 @@ -package post - -import ( - "context" - "errors" - "image" - "image/color" - "strings" - "testing" -) - -// ── mock ChatDriver ──────────────────────────────────────────────────── - -type mockChatDriver struct { - answer string - err error -} - -func (m *mockChatDriver) ChatWithMessages(_ string, _ []ChatMessage, _ *ChatAPIConfig, _ *ChatConfig) (*ChatResponse, error) { - if m.err != nil { - return nil, m.err - } - a := m.answer - return &ChatResponse{Answer: &a}, nil -} - -// ── ModelImageDescriber tests ────────────────────────────────────────── - -func TestModelImageDescriber_Success(t *testing.T) { - img := newTestImage(100, 100) - want := "A chart showing revenue growth." 
- driver := &mockChatDriver{answer: want} - desc := NewModelImageDescriber(driver, "gpt-4o", nil, 0) - - got, err := desc.DescribeImage(context.Background(), img, "Describe this chart") - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if got != want { - t.Errorf("got %q, want %q", got, want) - } -} - -func TestModelImageDescriber_DriverError(t *testing.T) { - img := newTestImage(100, 100) - driver := &mockChatDriver{err: errors.New("API rate limited")} - desc := NewModelImageDescriber(driver, "gpt-4o", nil, 0) - - _, err := desc.DescribeImage(context.Background(), img, "prompt") - if err == nil { - t.Fatal("expected error, got nil") - } -} - -func TestModelImageDescriber_EmptyAnswer(t *testing.T) { - img := newTestImage(100, 100) - driver := &mockChatDriver{answer: ""} - desc := NewModelImageDescriber(driver, "gpt-4o", nil, 0) - - _, err := desc.DescribeImage(context.Background(), img, "prompt") - if err == nil { - t.Fatal("expected error for empty answer, got nil") - } -} - -// ── encodeImageToBase64DataURL tests ─────────────────────────────────── - -func TestEncodeImageToBase64DataURL(t *testing.T) { - img := image.NewRGBA(image.Rect(0, 0, 1, 1)) - img.Set(0, 0, color.RGBA{R: 255, G: 0, B: 0, A: 255}) - - url, err := encodeImageToBase64DataURL(img) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if !strings.HasPrefix(url, "data:image/png;base64,") { - t.Errorf("missing data URL prefix: %s...", url[:min(50, len(url))]) - } -} diff --git a/internal/deepdoc/parser/pdf/post/outline_postprocess_test.go b/internal/deepdoc/parser/pdf/post/outline_postprocess_test.go deleted file mode 100644 index 9df88ee17c..0000000000 --- a/internal/deepdoc/parser/pdf/post/outline_postprocess_test.go +++ /dev/null @@ -1,114 +0,0 @@ -package post - -import ( - "context" - "testing" - - pdftype "ragflow/internal/deepdoc/parser/pdf/type" -) - -// ── Tests for remove_toc config flag ──────────────────────────────────────── - -// 
TestPostProcess_RemoveTOC_DisabledByConfig verifies that when -// remove_toc=false, outlines are NOT used to remove TOC pages even -// when outlines are present. -func TestPostProcess_RemoveTOC_DisabledByConfig(t *testing.T) { - result := newTestResult( - makePosSection("目录内容 page1", 1, 100, 500, 100, 200), - makePosSection("更多目录 page2", 2, 100, 500, 100, 200), - makePosSection("第一章 正文", 3, 100, 500, 100, 200), - makePosSection("第二章 正文", 5, 100, 500, 100, 200), - ) - outlines := []pdftype.Outline{ - {Title: "目录", Level: 0, PageNumber: 1}, - {Title: "第一章", Level: 0, PageNumber: 3}, - {Title: "第二章", Level: 0, PageNumber: 5}, - } - - config := PipelineConfig{ - ConfigKeyRemoveTOC: false, - ConfigKeyOutlines: outlines, - } - err := PostProcess(context.Background(), result, config) - if err != nil { - t.Fatal(err) - } - if len(result.Sections) != 4 { - t.Errorf("remove_toc=false should keep all sections, got %d", len(result.Sections)) - } -} - -// TestPostProcess_RemoveTOC_EnabledByConfig verifies that when -// remove_toc=true and outlines are present, TOC pages are removed. 
-func TestPostProcess_RemoveTOC_EnabledByConfig(t *testing.T) { - result := newTestResult( - makePosSection("目录内容 page1", 1, 100, 500, 100, 200), - makePosSection("更多目录 page2", 2, 100, 500, 100, 200), - makePosSection("第一章 正文", 3, 100, 500, 100, 200), - makePosSection("第二章 正文", 5, 100, 500, 100, 200), - ) - outlines := []pdftype.Outline{ - {Title: "目录", Level: 0, PageNumber: 1}, - {Title: "第一章", Level: 0, PageNumber: 3}, - {Title: "第二章", Level: 0, PageNumber: 5}, - } - - config := PipelineConfig{ - ConfigKeyRemoveTOC: true, - ConfigKeyOutlines: outlines, - } - err := PostProcess(context.Background(), result, config) - if err != nil { - t.Fatal(err) - } - if len(result.Sections) != 2 { - t.Errorf("remove_toc=true should remove TOC pages, got %d sections", len(result.Sections)) - } - for _, s := range result.Sections { - for _, p := range s.Positions { - for _, pn := range p.PageNumbers { - if pn < 3 { - t.Errorf("TOC page %d should have been removed: section %q", pn, s.Text) - } - } - } - } -} - -// TestPostProcess_RemoveTOC_NoOutlines verifies that when no outlines -// are passed, no TOC removal happens. -func TestPostProcess_RemoveTOC_NoOutlines(t *testing.T) { - result := newTestResult( - makePosSection("目录内容", 1, 100, 500, 100, 200), - makePosSection("第一章 正文", 3, 100, 500, 100, 200), - ) - config := PipelineConfig{ - ConfigKeyRemoveTOC: true, - } - err := PostProcess(context.Background(), result, config) - if err != nil { - t.Fatal(err) - } - if len(result.Sections) != 2 { - t.Errorf("no outlines → all sections kept, got %d", len(result.Sections)) - } -} - -// TestPostProcess_RemoveTOC_EmptyOutlines verifies empty outlines array is no-op. 
-func TestPostProcess_RemoveTOC_EmptyOutlines(t *testing.T) { - result := newTestResult( - makePosSection("目录", 1, 100, 500, 100, 200), - makePosSection("正文", 2, 100, 500, 100, 200), - ) - config := PipelineConfig{ - ConfigKeyRemoveTOC: true, - ConfigKeyOutlines: []pdftype.Outline{}, - } - err := PostProcess(context.Background(), result, config) - if err != nil { - t.Fatal(err) - } - if len(result.Sections) != 2 { - t.Errorf("empty outlines → all sections kept, got %d", len(result.Sections)) - } -} diff --git a/internal/deepdoc/parser/pdf/post/post_steps.go b/internal/deepdoc/parser/pdf/post/post_steps.go deleted file mode 100644 index 0180084def..0000000000 --- a/internal/deepdoc/parser/pdf/post/post_steps.go +++ /dev/null @@ -1,436 +0,0 @@ -package post - -import ( - "context" - "errors" - "math" - "regexp" - "sort" - "strings" - "sync" - - pdftype "ragflow/internal/deepdoc/parser/pdf/type" - "ragflow/internal/deepdoc/parser/pdf/util" -) - -// ── Config ───────────────────────────────────────────────────────────── - -// Config keys for PipelineConfig. -const ( - ConfigKeyPageWidth = "page_width" - ConfigKeyZoom = "zoom" - ConfigKeyOutlines = "outlines" - ConfigKeyFlattenMediaToText = "flatten_media_to_text" - ConfigKeyTenantID = "tenant_id" - ConfigKeyVLMLLMID = "vlm_llm_id" - ConfigKeyRemoveTOC = "remove_toc" -) - -// PipelineConfig is a key-value map that post-processing reads -// to obtain its parameters. -type PipelineConfig map[string]interface{} - -// Float64 returns the float64 value for key, or default_ if absent or wrong type. -func (c PipelineConfig) Float64(key string, default_ float64) float64 { - if c == nil { - return default_ - } - v, ok := c[key] - if !ok { - return default_ - } - f, ok := v.(float64) - if !ok { - return default_ - } - return f -} - -// Bool returns the bool value for key. Returns false if absent or wrong type. 
-func (c PipelineConfig) Bool(key string) bool { - if c == nil { - return false - } - v, ok := c[key] - if !ok { - return false - } - b, ok := v.(bool) - if !ok { - return false - } - return b -} - -// Outlines returns the []pdftype.Outline value for ConfigKeyOutlines. -func (c PipelineConfig) Outlines() []pdftype.Outline { - if c == nil { - return nil - } - v, ok := c[ConfigKeyOutlines] - if !ok { - return nil - } - o, ok := v.([]pdftype.Outline) - if !ok { - return nil - } - return o -} - -// String returns the string value for key. Returns "" if absent or wrong type. -func (c PipelineConfig) String(key string) string { - if c == nil { - return "" - } - v, ok := c[key] - if !ok { - return "" - } - s, ok := v.(string) - if !ok { - return "" - } - return s -} - -// ── Patterns ─────────────────────────────────────────────────────────── - -// headerFooterPattern matches layout types that should be treated as -// page furniture (Python: r"(header|footer|number)" in parser.py:637). -var headerFooterPattern = regexp.MustCompile(`(header|footer|number|reference)`) - -// tocTitlePattern matches outline titles that mark a table-of-contents page. -// Python: r"(contents|目录|目次|table of contents|致谢|acknowledge)$" -var tocTitlePattern = regexp.MustCompile(`(?i)^(contents|目录|目次|table of contents|致谢|acknowledge)$`) - -// ── PostProcess ──────────────────────────────────────────────────────── - -// PostProcess applies PDF post-processing to a ParseResult in-place. -// The config map controls which features to enable. -// -// Execution order (matches Python _pdf): -// 1. reorderMultiColumn — if page_width > 0 -// 2. removeTOCByOutlines — if outlines present -// 3. normalizeLayoutType — always -// 4. filterHeaderFooter — always -// 5. assignDocTypeKwd — always (respects flatten_media_to_text) -// 6. 
enhanceWithVision — if image_describer present -func PostProcess(ctx context.Context, result *pdftype.ParseResult, config PipelineConfig) error { - if result == nil { - return errors.New("PostProcess: nil result") - } - if config == nil { - config = PipelineConfig{} - } - - // 1. Multi-column reorder - pw := config.Float64(ConfigKeyPageWidth, 0) - if pw > 0 { - zoom := config.Float64(ConfigKeyZoom, 1.0) - if zoom <= 0 { - zoom = 1.0 - } - reorderMultiColumn(result, pw, zoom) - } - - // 2. Remove TOC pages (only when explicitly enabled). - // Outlines from config take precedence; otherwise read from ParseResult. - outlines := config.Outlines() - if len(outlines) == 0 { - outlines = result.Outlines - } - if config.Bool(ConfigKeyRemoveTOC) && len(outlines) > 0 { - removeTOCByOutlines(result, outlines) - } - - // 3-5. Always-on steps - normalizeLayoutType(result) - filterHeaderFooter(result) - assignDocTypeKwd(result, config.Bool(ConfigKeyFlattenMediaToText)) - - // 6. VLM enhancement - tenantID := config.String(ConfigKeyTenantID) - vlmLLMID := config.String(ConfigKeyVLMLLMID) - if tenantID != "" && vlmLLMID != "" { - describer, err := resolveImageDescriber(tenantID, vlmLLMID) - if err != nil { - return err - } - if err := enhanceWithVision(ctx, result, describer); err != nil { - return err - } - } - - return nil -} - -// resolveImageDescriber resolves a VLM model from tenant config and returns -// an ImageDescriber. Corresponds to Python's -// get_model_config_from_provider_instance + LLMBundle. -// resolveImageDescriber resolves a VLM model from tenant config and returns -// an ImageDescriber. The implementation is assigned by init() in -// post_steps_cgo.go (production) or post_steps_no_cgo.go (stub). -// Overridable in tests. -var resolveImageDescriber func(tenantID, llmID string) (ImageDescriber, error) - -// SetImageDescriberResolver sets the factory that creates an ImageDescriber -// from tenant/LLM configuration. Higher layers (e.g. 
EE extensions or the -// PDF document pipeline entry point) register the real implementation via -// init(). If never called, PostProcess skips VLM enhancement. -func SetImageDescriberResolver(fn func(tenantID, llmID string) (ImageDescriber, error)) { - resolveImageDescriber = fn -} - -// ── normalizeLayoutType ──────────────────────────────────────────────── - -// normalizeLayoutType trims whitespace from LayoutType and defaults empty -// values to "text". Matches Python's layout_type normalization in parser.py. -func normalizeLayoutType(result *pdftype.ParseResult) { - for i := range result.Sections { - lt := strings.TrimSpace(result.Sections[i].LayoutType) - if lt == "" { - lt = "text" - } - result.Sections[i].LayoutType = lt - } -} - -// ── filterHeaderFooter ───────────────────────────────────────────────── - -// filterHeaderFooter removes sections whose LayoutType matches -// header/footer/number/reference. Python: remove_header_footer config. -func filterHeaderFooter(result *pdftype.ParseResult) { - sections := result.Sections[:0] - for _, s := range result.Sections { - if headerFooterPattern.MatchString(strings.TrimSpace(s.LayoutType)) { - continue - } - sections = append(sections, s) - } - result.Sections = sections -} - -// ── assignDocTypeKwd ─────────────────────────────────────────────────── - -// assignDocTypeKwd sets DocTypeKwd based on LayoutType and Image presence. -// When flatten is true, all sections become "text" and Image is cleared — -// this matches Python where flatten_media_to_text and VLM are mutually -// exclusive. Python: parser.py:639-648. 
-func assignDocTypeKwd(result *pdftype.ParseResult, flatten bool) { - for i := range result.Sections { - s := &result.Sections[i] - if flatten { - s.DocTypeKwd = "text" - s.Image = "" - continue - } - lt := strings.TrimSpace(s.LayoutType) - switch lt { - case "table": - s.DocTypeKwd = "table" - case "figure": - s.DocTypeKwd = "image" - default: - if lt == "" && s.Image != "" { - s.DocTypeKwd = "image" - } else { - s.DocTypeKwd = "text" - } - } - } -} - -// ── enhanceWithVision ────────────────────────────────────────────────── - -// enhanceWithVision adds VLM-generated descriptions to image/table sections. -func enhanceWithVision(ctx context.Context, result *pdftype.ParseResult, describer ImageDescriber) error { - if describer == nil { - return nil - } - if len(result.Sections) == 0 { - return nil - } - - sem := make(chan struct{}, maxDescribeConcurrency) - var wg sync.WaitGroup - - for i := range result.Sections { - s := &result.Sections[i] - if s.DocTypeKwd != "table" && s.DocTypeKwd != "image" { - continue - } - if s.Image == "" { - continue - } - - wg.Add(1) - sem <- struct{}{} - go func(idx int, imgB64 string, origText string) { - defer wg.Done() - defer func() { <-sem }() - - img, err := util.DecodeBase64PNG(imgB64) - if err != nil || img == nil { - return - } - desc, err := DescribeImage(ctx, img, describePrompt, describer) - if err != nil || desc == "" { - return - } - - if origText != "" { - result.Sections[idx].Text = origText + "\n" + desc - } else { - result.Sections[idx].Text = desc - } - }(i, s.Image, s.Text) - } - wg.Wait() - - return nil -} - -// ── removeTOCByOutlines ──────────────────────────────────────────────── - -// removeTOCByOutlines removes sections whose page numbers fall inside -// TOC page ranges identified by PDF outlines. 
-func removeTOCByOutlines(result *pdftype.ParseResult, outlines []pdftype.Outline) { - if len(outlines) == 0 { - return - } - tocPage, contentPage := findTOCPageRange(outlines) - if contentPage <= tocPage { - return - } - sections := result.Sections[:0] - for _, s := range result.Sections { - pg := sectionPage(s) - if pg >= tocPage && pg < contentPage { - continue - } - sections = append(sections, s) - } - result.Sections = sections -} - -// findTOCPageRange scans outlines for a TOC entry and returns the -// [tocStartPage, contentStartPage) range. Returns (0, 0) when not found. -func findTOCPageRange(outlines []pdftype.Outline) (tocPage, contentPage int) { -trimSplit: - for i, o := range outlines { - title := strings.TrimSpace(o.Title) - if idx := strings.Index(title, "@@"); idx >= 0 { - title = strings.TrimSpace(title[:idx]) - } - if !tocTitlePattern.MatchString(strings.ToLower(title)) { - continue - } - tocPage = o.PageNumber - for _, next := range outlines[i+1:] { - if next.Level != o.Level { - continue - } - nt := strings.TrimSpace(next.Title) - if idx := strings.Index(nt, "@@"); idx >= 0 { - nt = strings.TrimSpace(nt[:idx]) - } - if tocTitlePattern.MatchString(strings.ToLower(nt)) { - continue - } - contentPage = next.PageNumber - break trimSplit - } - break - } - return -} - -// sectionPage returns the first page number of a Section, or 0. -func sectionPage(s pdftype.Section) int { - for _, p := range s.Positions { - for _, pn := range p.PageNumbers { - return pn - } - } - return 0 -} - -// ── reorderMultiColumn ───────────────────────────────────────────────── - -// reorderMultiColumn reorders text sections in multi-column layouts. -// If median text column width >= page width / 2 (single-column layout), -// the input order is preserved. 
-// -// Python: reorder_multi_column_bboxes + sort_X_by_page -func reorderMultiColumn(result *pdftype.ParseResult, pageWidth, zoom float64) { - if len(result.Sections) < 2 { - return - } - pw := pageWidth / zoom - - // Compute median width from text sections with valid coordinates. - var widths []float64 - for _, s := range result.Sections { - if s.LayoutType != "text" { - continue - } - if len(s.Positions) == 0 { - continue - } - w := s.Positions[0].Right - s.Positions[0].Left - if w > 0 { - widths = append(widths, w) - } - } - if len(widths) == 0 { - return - } - sort.Float64s(widths) - medianW := widths[len(widths)/2] - - if medianW >= pw/2 { - return // single column - } - - // Sort by (PageNumber, X0, Top). - sort.Slice(result.Sections, func(i, j int) bool { - pi := sectionPage(result.Sections[i]) - pj := sectionPage(result.Sections[j]) - if pi != pj { - return pi < pj - } - xi := sectionX0(result.Sections[i]) - xj := sectionX0(result.Sections[j]) - if math.Abs(xi-xj) > 1e-6 { - return xi < xj - } - return sectionTop(result.Sections[i]) < sectionTop(result.Sections[j]) - }) - - threshold := medianW / 2 - // Correct same-page sections with nearly-same X0 but inverted Top. 
- for i := len(result.Sections) - 1; i >= 1; i-- { - for j := i - 1; j >= 0; j-- { - if math.Abs(sectionX0(result.Sections[j+1])-sectionX0(result.Sections[j])) < threshold && - sectionTop(result.Sections[j+1]) < sectionTop(result.Sections[j]) && - sectionPage(result.Sections[j+1]) == sectionPage(result.Sections[j]) { - result.Sections[j], result.Sections[j+1] = result.Sections[j+1], result.Sections[j] - } - } - } -} - -func sectionX0(s pdftype.Section) float64 { - for _, p := range s.Positions { - return p.Left - } - return 0 -} - -func sectionTop(s pdftype.Section) float64 { - for _, p := range s.Positions { - return p.Top - } - return 0 -} diff --git a/internal/deepdoc/parser/pdf/post/post_steps_test.go b/internal/deepdoc/parser/pdf/post/post_steps_test.go deleted file mode 100644 index b348f09b8a..0000000000 --- a/internal/deepdoc/parser/pdf/post/post_steps_test.go +++ /dev/null @@ -1,434 +0,0 @@ -package post - -import ( - "context" - "testing" - - pdftype "ragflow/internal/deepdoc/parser/pdf/type" -) - -// ── helpers ────────────────────────────────────────────────────────────── - -// dummyBase64PNG is a valid 50×50 red pixel PNG, base64-encoded. 
-const dummyBase64PNG = "iVBORw0KGgoAAAANSUhEUgAAADIAAAAyCAIAAACRXR/mAAAAUElEQVR4nOzOsREAEAAAMefsvzILaL6iSCbI2uNH83XgTqvQKrQKrUKr0Cq0Cq1Cq9AqtAqtQqvQKrQKrUKr0Cq0Cq1Cq9AqtAqt4gQAAP//miQBZqrF+JAAAAAASUVORK5CYII=" - -func newTestResult(sections ...pdftype.Section) *pdftype.ParseResult { - return &pdftype.ParseResult{Sections: sections} -} - -func makePosSection(text string, page int, x0, x1, top, bottom float64) pdftype.Section { - return pdftype.Section{ - Text: text, - LayoutType: "text", - Positions: []pdftype.Position{{PageNumbers: []int{page}, Left: x0, Right: x1, Top: top, Bottom: bottom}}, - } -} - -// ── normalizeLayoutType ──────────────────────────────────────────────── - -func TestNormalizeLayoutType(t *testing.T) { - result := newTestResult( - pdftype.Section{Text: "a", LayoutType: ""}, - pdftype.Section{Text: "b", LayoutType: " "}, - pdftype.Section{Text: "c", LayoutType: "table"}, - pdftype.Section{Text: "d", LayoutType: " figure "}, - pdftype.Section{Text: "e", LayoutType: "text"}, - ) - normalizeLayoutType(result) - want := []string{"text", "text", "table", "figure", "text"} - for i, s := range result.Sections { - if s.LayoutType != want[i] { - t.Errorf("Sections[%d]: got %q, want %q", i, s.LayoutType, want[i]) - } - } -} - -// ── filterHeaderFooter ───────────────────────────────────────────────── - -func TestFilterHeaderFooter(t *testing.T) { - result := newTestResult( - pdftype.Section{Text: "Page 1", LayoutType: "header"}, - pdftype.Section{Text: "Chapter 1", LayoutType: "text"}, - pdftype.Section{LayoutType: "footer"}, - pdftype.Section{LayoutType: "number"}, - pdftype.Section{Text: "Body", LayoutType: "text"}, - pdftype.Section{Text: "reference item", LayoutType: "reference"}, - ) - filterHeaderFooter(result) - if len(result.Sections) != 2 { - t.Fatalf("expected 2 sections, got %d: %+v", len(result.Sections), result.Sections) - } - if result.Sections[0].Text != "Chapter 1" || result.Sections[1].Text != "Body" { - t.Errorf("wrong sections kept: 
%+v", result.Sections) - } -} - -func TestFilterHeaderFooter_Empty(t *testing.T) { - result := newTestResult() - filterHeaderFooter(result) - if len(result.Sections) != 0 { - t.Error("expected empty result") - } -} - -// ── assignDocTypeKwd ─────────────────────────────────────────────────── - -func TestAssignDocTypeKwd_Normal(t *testing.T) { - result := newTestResult( - pdftype.Section{Text: "a", LayoutType: "table"}, - pdftype.Section{Text: "b", LayoutType: "figure"}, - pdftype.Section{Text: "c", LayoutType: "equation"}, - pdftype.Section{Text: "d", LayoutType: "", Image: dummyBase64PNG}, - pdftype.Section{Text: "e", LayoutType: "text"}, - pdftype.Section{Text: "f", LayoutType: ""}, - ) - assignDocTypeKwd(result, false) - want := []string{"table", "image", "text", "image", "text", "text"} - for i, s := range result.Sections { - if s.DocTypeKwd != want[i] { - t.Errorf("Sections[%d]: got %q, want %q", i, s.DocTypeKwd, want[i]) - } - } -} - -func TestAssignDocTypeKwd_Flatten(t *testing.T) { - result := newTestResult( - pdftype.Section{Text: "a", LayoutType: "table", DocTypeKwd: "table", Image: dummyBase64PNG}, - pdftype.Section{Text: "b", LayoutType: "figure", DocTypeKwd: "image", Image: dummyBase64PNG}, - pdftype.Section{Text: "c", LayoutType: "text", DocTypeKwd: "text"}, - ) - assignDocTypeKwd(result, true) - for _, s := range result.Sections { - if s.DocTypeKwd != "text" { - t.Errorf("expected all 'text', got %q", s.DocTypeKwd) - } - if s.Image != "" { - t.Error("flatten should clear Image to prevent VLM enhancement") - } - } -} - -// ── enhanceWithVision ────────────────────────────────────────────────── - -func TestEnhanceWithVision_NoOp(t *testing.T) { - result := newTestResult( - pdftype.Section{Text: "original", Image: dummyBase64PNG, DocTypeKwd: "table"}, - ) - _ = enhanceWithVision(context.Background(), result, nil) - if result.Sections[0].Text != "original" { - t.Errorf("text changed when describer is nil: %q", result.Sections[0].Text) - } -} - -func 
TestEnhanceWithVision_Success(t *testing.T) { - want := "A table showing Q1 revenue." - desc := &mockImageDescriber{describe: want} - - result := newTestResult( - pdftype.Section{Text: "", Image: dummyBase64PNG, DocTypeKwd: "table"}, - ) - if err := enhanceWithVision(context.Background(), result, desc); err != nil { - t.Fatal(err) - } - if result.Sections[0].Text != want { - t.Errorf("text not enhanced: got %q", result.Sections[0].Text) - } -} - -func TestEnhanceWithVision_SkipText(t *testing.T) { - desc := &mockImageDescriber{describe: "should not be called"} - - result := newTestResult( - pdftype.Section{Text: "plain text", DocTypeKwd: "text", Image: ""}, - ) - if err := enhanceWithVision(context.Background(), result, desc); err != nil { - t.Fatal(err) - } - if result.Sections[0].Text != "plain text" { - t.Errorf("text changed: %q", result.Sections[0].Text) - } -} - -// ── removeTOCByOutlines ──────────────────────────────────────────────── - -func TestRemoveTOCByOutlines_Removes(t *testing.T) { - outlines := []pdftype.Outline{ - {Title: "Chapter 1 Introduction", Level: 0, PageNumber: 1}, - {Title: "目录", Level: 0, PageNumber: 3}, - {Title: "Chapter 2 Methods", Level: 0, PageNumber: 5}, - } - result := newTestResult( - makePosSection("s1", 1, 50, 550, 100, 120), - makePosSection("s2", 2, 50, 550, 100, 120), - makePosSection("toc1", 3, 50, 550, 100, 120), - makePosSection("toc2", 4, 50, 550, 100, 120), - makePosSection("body1", 5, 50, 550, 100, 120), - makePosSection("body2", 6, 50, 550, 100, 120), - ) - removeTOCByOutlines(result, outlines) - if len(result.Sections) != 4 { - t.Fatalf("expected 4 sections, got %d", len(result.Sections)) - } - if result.Sections[0].Text != "s1" || result.Sections[1].Text != "s2" { - t.Error("pre-TOC pages should be kept") - } - if result.Sections[2].Text != "body1" || result.Sections[3].Text != "body2" { - t.Error("post-TOC pages should be kept") - } -} - -func TestRemoveTOCByOutlines_NoMatch(t *testing.T) { - outlines := 
[]pdftype.Outline{ - {Title: "1. Introduction", Level: 0, PageNumber: 1}, - {Title: "2. Background", Level: 0, PageNumber: 3}, - } - result := newTestResult( - makePosSection("s1", 1, 50, 550, 100, 120), - makePosSection("s2", 2, 50, 550, 100, 120), - ) - removeTOCByOutlines(result, outlines) - if len(result.Sections) != 2 { - t.Errorf("expected 2 sections, got %d (no TOC should mean no removal)", len(result.Sections)) - } -} - -func TestRemoveTOCByOutlines_NilOutlines(t *testing.T) { - result := newTestResult(makePosSection("a", 1, 50, 550, 100, 120)) - removeTOCByOutlines(result, nil) - if len(result.Sections) != 1 { - t.Errorf("nil outlines should be no-op: got %d sections", len(result.Sections)) - } -} - -func TestRemoveTOCByOutlines_EmptyOutlines(t *testing.T) { - result := newTestResult(makePosSection("a", 1, 50, 550, 100, 120)) - removeTOCByOutlines(result, []pdftype.Outline{}) - if len(result.Sections) != 1 { - t.Errorf("empty outlines should be no-op: got %d sections", len(result.Sections)) - } -} - -func TestRemoveTOCByOutlines_NoNext(t *testing.T) { - outlines := []pdftype.Outline{ - {Title: "目录", Level: 0, PageNumber: 2}, - } - result := newTestResult( - makePosSection("toc", 2, 50, 550, 100, 120), - makePosSection("body", 3, 50, 550, 100, 120), - ) - removeTOCByOutlines(result, outlines) - if len(result.Sections) != 2 { - t.Errorf("no next outline → keep all sections: got %d", len(result.Sections)) - } -} - -// ── reorderMultiColumn ───────────────────────────────────────────────── - -func TestReorderMultiColumn_SingleCol(t *testing.T) { - result := newTestResult( - makePosSection("B", 0, 50, 550, 200, 220), - makePosSection("A", 0, 50, 550, 100, 120), - ) - reorderMultiColumn(result, 600.0, 1.0) - // medianW=500 >= 300 → single col, order preserved - if result.Sections[0].Text != "B" { - t.Fatal("single column should preserve original order") - } -} - -func TestReorderMultiColumn_MultiCol(t *testing.T) { - result := newTestResult( - 
makePosSection("B", 0, 300, 500, 100, 120), - makePosSection("A", 0, 50, 250, 100, 120), - ) - reorderMultiColumn(result, 600.0, 1.0) - if result.Sections[0].Positions[0].Left > result.Sections[1].Positions[0].Left { - t.Log("multi-column: sections reordered") - } -} - -func TestReorderMultiColumn_Empty(t *testing.T) { - result := newTestResult() - reorderMultiColumn(result, 600.0, 1.0) - if len(result.Sections) != 0 { - t.Error("empty sections should remain empty") - } -} - -func TestReorderMultiColumn_NoText(t *testing.T) { - result := newTestResult( - pdftype.Section{Text: "t1", LayoutType: "table", Positions: []pdftype.Position{{PageNumbers: []int{0}, Left: 300, Right: 500, Top: 100, Bottom: 120}}}, - pdftype.Section{Text: "t2", LayoutType: "table", Positions: []pdftype.Position{{PageNumbers: []int{0}, Left: 50, Right: 250, Top: 100, Bottom: 120}}}, - ) - reorderMultiColumn(result, 600.0, 1.0) - if len(result.Sections) != 2 { - t.Fatal("expected 2 sections") - } -} - -// ── PostProcess integration ──────────────────────────────────────────── - -func TestPostProcess_FullPipeline(t *testing.T) { - // Simulates post-processing after Parse(): all features enabled. - result := newTestResult( - // Page 1: TOC — should be removed - pdftype.Section{Text: "目录", LayoutType: "text", Positions: []pdftype.Position{{PageNumbers: []int{1}, Left: 50, Right: 550, Top: 100, Bottom: 120}}}, - pdftype.Section{Text: "Chapter 1 ... 
1", LayoutType: "text", Positions: []pdftype.Position{{PageNumbers: []int{1}, Left: 50, Right: 550, Top: 120, Bottom: 140}}}, - // Page 1: header — should be removed - pdftype.Section{Text: "Page 1", LayoutType: "header", Positions: []pdftype.Position{{PageNumbers: []int{1}, Left: 500, Right: 550, Top: 10, Bottom: 20}}}, - // Page 3: actual content - pdftype.Section{Text: "Introduction text", LayoutType: "", Positions: []pdftype.Position{{PageNumbers: []int{3}, Left: 50, Right: 550, Top: 100, Bottom: 120}}}, - pdftype.Section{Text: "Row1 Col1 Row1 Col2", LayoutType: "table", Positions: []pdftype.Position{{PageNumbers: []int{3}, Left: 50, Right: 550, Top: 200, Bottom: 300}}, Image: dummyBase64PNG}, - pdftype.Section{Text: "Chart description", LayoutType: "figure", Positions: []pdftype.Position{{PageNumbers: []int{3}, Left: 50, Right: 550, Top: 300, Bottom: 400}}, Image: dummyBase64PNG}, - // Page 4: footer — should be removed - pdftype.Section{Text: "Confidential", LayoutType: "footer", Positions: []pdftype.Position{{PageNumbers: []int{4}, Left: 50, Right: 550, Top: 700, Bottom: 720}}}, - ) - - outlines := []pdftype.Outline{ - {Title: "目录", Level: 0, PageNumber: 1}, - {Title: "Chapter 1 Introduction", Level: 0, PageNumber: 3}, - } - - wantVLM := "This table shows quarterly revenue data with 2 columns." 
- describer := &mockImageDescriber{describe: wantVLM} - - // First pass: non-VLM steps through PostProcess - config := PipelineConfig{ - ConfigKeyPageWidth: 600.0, - ConfigKeyZoom: 1.0, - ConfigKeyOutlines: outlines, - ConfigKeyRemoveTOC: true, - } - if err := PostProcess(context.Background(), result, config); err != nil { - t.Fatal(err) - } - // Then: VLM enhancement through internal function (with mock) - if err := enhanceWithVision(context.Background(), result, describer); err != nil { - t.Fatal(err) - } - // Then: flatten - if err := PostProcess(context.Background(), result, PipelineConfig{ - ConfigKeyFlattenMediaToText: true, - }); err != nil { - t.Fatal(err) - } - - // Verify - if len(result.Sections) != 3 { - t.Fatalf("expected 3 sections after filtering, got %d: %+v", len(result.Sections), result.Sections) - } - for i, s := range result.Sections { - if s.DocTypeKwd != "text" { - t.Errorf("section[%d] DocTypeKwd = %q, want 'text'", i, s.DocTypeKwd) - } - if s.LayoutType == "header" || s.LayoutType == "footer" { - t.Errorf("section[%d] LayoutType = %q, should have been filtered out", i, s.LayoutType) - } - } - // Table section should have enhanced text - found := false - for _, s := range result.Sections { - if s.LayoutType == "table" { - found = true - if s.Text != "Row1 Col1 Row1 Col2\n"+wantVLM { - t.Errorf("table text not enhanced: %q", s.Text) - } - } - } - if !found { - t.Error("table section missing from result") - } -} - -func TestPostProcess_Minimal(t *testing.T) { - result := newTestResult( - pdftype.Section{Text: "Hello", LayoutType: ""}, - pdftype.Section{Text: "World", LayoutType: " "}, - ) - if err := PostProcess(context.Background(), result, nil); err != nil { - t.Fatal(err) - } - if len(result.Sections) != 2 { - t.Fatalf("expected 2 sections, got %d", len(result.Sections)) - } - if result.Sections[0].LayoutType != "text" || result.Sections[1].LayoutType != "text" { - t.Error("layout not normalized") - } - if result.Sections[0].DocTypeKwd != 
"text" || result.Sections[1].DocTypeKwd != "text" { - t.Error("doc_type_kwd not assigned") - } -} - -func TestPostProcess_NilResult(t *testing.T) { - if err := PostProcess(context.Background(), nil, nil); err == nil { - t.Error("expected error for nil result") - } -} - -func TestPostProcess_EmptySections(t *testing.T) { - result := newTestResult() - if err := PostProcess(context.Background(), result, nil); err != nil { - t.Fatal(err) - } - if len(result.Sections) != 0 { - t.Error("empty should remain empty") - } -} - -func TestPostProcess_FiguresLazy(t *testing.T) { - result := newTestResult( - pdftype.Section{Text: "Fig1", LayoutType: "figure"}, - pdftype.Section{Text: "Body", LayoutType: "text"}, - pdftype.Section{Text: "Fig2", LayoutType: "figure"}, - ) - if err := PostProcess(context.Background(), result, nil); err != nil { - t.Fatal(err) - } - figs := result.Figures() - if len(figs) != 2 { - t.Fatalf("expected 2 figures, got %d", len(figs)) - } - if figs[0].Text != "Fig1" || figs[1].Text != "Fig2" { - t.Errorf("wrong figures: %+v", figs) - } -} - -func TestPostProcess_FilterOnly(t *testing.T) { - result := newTestResult( - pdftype.Section{Text: "Header", LayoutType: "header"}, - pdftype.Section{Text: "Second", LayoutType: "text"}, - pdftype.Section{Text: "First", LayoutType: "text"}, - ) - if err := PostProcess(context.Background(), result, nil); err != nil { - t.Fatal(err) - } - if len(result.Sections) != 2 { - t.Fatalf("expected 2 sections after filtering, got %d", len(result.Sections)) - } - figs := result.Figures() - if len(figs) != 0 { - t.Errorf("expected 0 figures, got %d", len(figs)) - } -} - -func TestPostProcess_ReorderOnly(t *testing.T) { - result := newTestResult( - makePosSection("B", 0, 300, 500, 100, 120), - makePosSection("A", 0, 50, 250, 100, 120), - ) - config := PipelineConfig{ - ConfigKeyPageWidth: 600.0, - ConfigKeyZoom: 1.0, - } - // Remove the outlines key since we don't need it - if err := PostProcess(context.Background(), result, 
config); err != nil { - t.Fatal(err) - } - if len(result.Sections) != 2 { - t.Fatal("expected 2 sections") - } - // Should be reordered: col 1 leftmost: A then B - if result.Sections[0].Positions[0].Left > result.Sections[1].Positions[0].Left { - t.Log("multi-column: sections reordered left-to-right") - } -} diff --git a/internal/deepdoc/parser/pdf/post/vision_describe.go b/internal/deepdoc/parser/pdf/post/vision_describe.go deleted file mode 100644 index 0475f51774..0000000000 --- a/internal/deepdoc/parser/pdf/post/vision_describe.go +++ /dev/null @@ -1,98 +0,0 @@ -package post - -import ( - "context" - "errors" - "image" -) - -// ImageDescriber describes an image using a vision language model. -type ImageDescriber interface { - DescribeImage(ctx context.Context, img image.Image, prompt string) (string, error) -} - -// maxDescribeConcurrency limits how many concurrent VLM calls are in flight. -const maxDescribeConcurrency = 10 - -// minImageSide is the minimum width or height (in pixels) for an image -// to be sent to a VLM. Tiny crops fail provider image-size limits. -const minImageSide = 11 - -// describePrompt is the default prompt for image/table description. -// Python: vision_llm_figure_describe_prompt.md -const describePrompt = `## ROLE - -You are an expert visual data analyst. - -## GOAL - -Analyze the image and produce a textual representation strictly based on what is visible in the image. - -## DECISION RULE (CRITICAL) - -First, determine whether the image contains an explicit visual data representation with enumerable data units forming a coherent dataset. - -## OUTPUT RULES (STRICT) - -- Produce output in exactly one of the two modes defined below. -- Do NOT mention, label, or reference the modes in the output. -- Do NOT combine content from both modes. -- Do NOT explain or justify the choice of mode. -- Do NOT add any headings, titles, or commentary beyond what the mode requires. 
- ---- - -## MODE 1: STRUCTURED VISUAL DATA OUTPUT - -(Use only if the image contains enumerable data units forming a coherent dataset.) - -Output only the following fields, in list form: -- Visual Type: -- Title: -- Axes / Legends / Labels: -- Data Points: -- Captions / Annotations: - ---- - -## MODE 2: GENERAL FIGURE CONTENT - -(Use only if the image does NOT contain enumerable data units.) - -Write the content directly, starting from the first sentence. -Do NOT add any introductory labels, titles, headings, or prefixes. - -Requirements: -- Describe visible regions and components in a stable order (e.g., top-to-bottom, left-to-right). -- Explicitly name interface elements or visual objects exactly as they appear. -- Transcribe all visible text verbatim; do not paraphrase, summarize, or reinterpret labels. -- Describe spatial grouping, containment, and alignment of elements. -- Do NOT interpret intent, behavior, workflows, gameplay rules, or processes. -- Avoid narrative or stylistic language unless it is a dominant and functional visual element. - -Use concise, information-dense sentences. -Do not use bullet lists or structured fields in this mode.` - -// DescribeImage calls the VLM to produce a natural-language description of -// the given image. Returns the description text or an error. -// -// Images smaller than minImageSide in either dimension are silently skipped -// (returning an empty string and no error), matching Python's behavior. 
-func DescribeImage(ctx context.Context, img image.Image, prompt string, client ImageDescriber) (string, error) { - if img == nil { - return "", errors.New("DescribeImage: nil image") - } - b := img.Bounds() - if b.Dx() == 0 || b.Dy() == 0 { - return "", errors.New("DescribeImage: empty image (0x0)") - } - if b.Dx() < minImageSide || b.Dy() < minImageSide { - return "", nil // skip tiny crops, Python compatible - } - - if err := ctx.Err(); err != nil { - return "", err - } - - return client.DescribeImage(ctx, img, prompt) -} diff --git a/internal/deepdoc/parser/pdf/post/vision_describe_test.go b/internal/deepdoc/parser/pdf/post/vision_describe_test.go deleted file mode 100644 index 9f208d15a9..0000000000 --- a/internal/deepdoc/parser/pdf/post/vision_describe_test.go +++ /dev/null @@ -1,112 +0,0 @@ -package post - -import ( - "context" - "errors" - "image" - "image/color" - "testing" -) - -// ── mock image describer ─────────────────────────────────────────────── - -type mockImageDescriber struct { - describe string - err error -} - -func (m *mockImageDescriber) DescribeImage(_ context.Context, _ image.Image, _ string) (string, error) { - return m.describe, m.err -} - -// ── DescribeImage tests ──────────────────────────────────────────────── - -func TestDescribeImage_Success(t *testing.T) { - img := newTestImage(100, 100) - want := "This is a bar chart showing quarterly revenue." 
- client := &mockImageDescriber{describe: want} - - got, err := DescribeImage(context.Background(), img, "Describe this image", client) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if got != want { - t.Errorf("DescribeImage() = %q, want %q", got, want) - } -} - -func TestDescribeImage_VLMError(t *testing.T) { - img := newTestImage(100, 100) - client := &mockImageDescriber{err: errors.New("VLM timeout")} - - got, err := DescribeImage(context.Background(), img, "Describe this image", client) - if err == nil { - t.Fatal("expected error, got nil") - } - if got != "" { - t.Errorf("expected empty string on error, got %q", got) - } -} - -func TestDescribeImage_CanceledContext(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - cancel() // cancel immediately - img := newTestImage(100, 100) - client := &mockImageDescriber{describe: "should not be reached"} - - got, err := DescribeImage(ctx, img, "prompt", client) - if err == nil { - t.Fatal("expected context error, got nil") - } - if got != "" { - t.Errorf("expected empty string, got %q", got) - } -} - -func TestDescribeImage_NilImage(t *testing.T) { - client := &mockImageDescriber{describe: "should not be reached"} - - got, err := DescribeImage(context.Background(), nil, "prompt", client) - if err == nil { - t.Fatal("expected error for nil image, got nil") - } - if got != "" { - t.Errorf("expected empty string, got %q", got) - } -} - -func TestDescribeImage_EmptyImage(t *testing.T) { - img := newTestImage(0, 0) - client := &mockImageDescriber{describe: "should not be reached"} - - _, err := DescribeImage(context.Background(), img, "prompt", client) - if err == nil { - t.Fatal("expected error for empty image, got nil") - } -} - -func TestDescribeImage_TinyImage(t *testing.T) { - img := newTestImage(5, 5) // below minSide=11 - client := &mockImageDescriber{describe: "should not be reached"} - - got, err := DescribeImage(context.Background(), img, "prompt", client) - if err != nil { 
- t.Fatal("tiny images should be silently skipped, not error") - } - if got != "" { - t.Errorf("expected empty string for tiny image, got %q", got) - } -} - -// ── helpers ──────────────────────────────────────────────────────────── - -func newTestImage(w, h int) image.Image { - img := image.NewRGBA(image.Rect(0, 0, w, h)) - // Fill with a recognizable pattern. - for y := 0; y < h; y++ { - for x := 0; x < w; x++ { - img.Set(x, y, color.RGBA{R: uint8(x % 256), G: uint8(y % 256), B: 128, A: 255}) - } - } - return img -} diff --git a/internal/deepdoc/parser/pdf/render_compare_test.go b/internal/deepdoc/parser/pdf/render_compare_test.go index 6c2446d615..8e3069b791 100644 --- a/internal/deepdoc/parser/pdf/render_compare_test.go +++ b/internal/deepdoc/parser/pdf/render_compare_test.go @@ -1,6 +1,6 @@ //go:build cgo && manual -package parser +package pdf import ( "image" @@ -53,7 +53,7 @@ func TestRenderCompare(t *testing.T) { } // Render page 0 with pdfium (Go). - goImg, err := renderPageToImage(eng, 0) + goImg, err := RenderPageToImage(eng, 0) eng.Close() if err != nil { t.Logf("%s: render error: %v", name, err) diff --git a/internal/deepdoc/parser/pdf/renderer.go b/internal/deepdoc/parser/pdf/renderer.go index e409cad2a5..0f8a13938f 100644 --- a/internal/deepdoc/parser/pdf/renderer.go +++ b/internal/deepdoc/parser/pdf/renderer.go @@ -1,4 +1,4 @@ -package parser +package pdf import ( "image" @@ -13,7 +13,7 @@ import ( var renderFn = fallbackRender // renderPageToImage renders a page at 216 DPI for downstream DLA/TSR/OCR. -func renderPageToImage(engine pdf.PDFEngine, pageNum int) (image.Image, error) { +func RenderPageToImage(engine pdf.PDFEngine, pageNum int) (image.Image, error) { return renderFn(engine, pageNum) } @@ -25,7 +25,10 @@ func fallbackRender(engine pdf.PDFEngine, pageNum int) (image.Image, error) { } // Guard against typed-nil (e.g. (*image.RGBA)(nil) returned as non-nil // interface). The plain img==nil check misses that case. 
- if img == nil || reflect.ValueOf(img).IsNil() { + if img == nil { + return nil, ErrNoPDFData + } + if rv := reflect.ValueOf(img); rv.Kind() == reflect.Ptr && rv.IsNil() { return nil, ErrNoPDFData } return img, nil diff --git a/internal/deepdoc/parser/pdf/renderer_pdfium.go b/internal/deepdoc/parser/pdf/renderer_pdfium.go index 0e8869f657..2305a61a9f 100644 --- a/internal/deepdoc/parser/pdf/renderer_pdfium.go +++ b/internal/deepdoc/parser/pdf/renderer_pdfium.go @@ -1,6 +1,6 @@ //go:build cgo -package parser +package pdf import ( "image" diff --git a/internal/deepdoc/parser/pdf/rotate_test.go b/internal/deepdoc/parser/pdf/rotate_test.go index 44680cbbec..0cf5c1b719 100644 --- a/internal/deepdoc/parser/pdf/rotate_test.go +++ b/internal/deepdoc/parser/pdf/rotate_test.go @@ -1,6 +1,6 @@ //go:build cgo && manual -package parser +package pdf import ( "image" @@ -24,8 +24,8 @@ func pdfiumPtSize(eng pdf.PDFEngine, file string, t *testing.T) (w, h float64) { raw := eng.RawData() if raw == nil { // Fallback: use pdf_oxide pre-rotation size. - if pe, ok := eng.(*pdfoxideEngine); ok { - w, h, _ = pe.inner.PageSize(0) + if pe, ok := eng.(*PDFOxideEngine); ok { + w, h, _ = pe.Inner.PageSize(0) } return } @@ -302,7 +302,7 @@ func TestRotation_CropBoxWithRotate(t *testing.T) { // CropBox excludes content from the page edges; chars near the // CropBox boundary may end up outside the effective page after rotation. if oobRate > 40 { - t.Errorf("too many OOB chars: %.1f%%", oobRate) + t.Errorf("too many OOB Chars: %.1f%%", oobRate) } // Verify render alignment. 
diff --git a/internal/deepdoc/parser/pdf/scan_all_pdfs_test.go b/internal/deepdoc/parser/pdf/scan_all_pdfs_test.go index ae4bc7a499..f958f8a495 100644 --- a/internal/deepdoc/parser/pdf/scan_all_pdfs_test.go +++ b/internal/deepdoc/parser/pdf/scan_all_pdfs_test.go @@ -1,6 +1,6 @@ //go:build cgo && manual -package parser +package pdf import ( "context" @@ -43,9 +43,8 @@ func TestScanAllPDFs(t *testing.T) { eng := mustOpenEngine(t, name) cfg := pdf.DefaultParserConfig() - cfg.TableBuilder = NewDeepDocTableBuildService(client) - p := NewParser(cfg, client) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.ParseRaw(context.Background(), eng, client) eng.Close() if err != nil { fmt.Printf(" ❌ ERROR: %v\n", err) diff --git a/internal/deepdoc/parser/pdf/snapshot_test.go b/internal/deepdoc/parser/pdf/snapshot_test.go index 1343ac2b16..d3f7b5c807 100644 --- a/internal/deepdoc/parser/pdf/snapshot_test.go +++ b/internal/deepdoc/parser/pdf/snapshot_test.go @@ -1,6 +1,6 @@ //go:build manual -package parser +package pdf import ( "encoding/json" @@ -16,7 +16,7 @@ import ( "testing" ) -// TestSnapshotStageComparison verifies Go's TextMerge output +// TestSnapshotStageComparison verifies Go's lyt.TextMerge output // matches Python's _text_merge sample boxes using synthetic input. 
func TestSnapshotStageComparison(t *testing.T) { snapDir := filepath.Join("testdata", "snapshots") @@ -47,19 +47,19 @@ func TestSnapshotStageComparison(t *testing.T) { // Convert sample boxes to Go pdf.TextBox format goBoxes := snapshotBoxesToGo(s1.SampleBoxesPage0) - // Run Go TextMerge with default params + // Run Go lyt.TextMerge with default params meanH := map[int]float64{0: avg(s1.MeanHeight)} merged := lyt.TextMerge(goBoxes, meanH, 3) // Compare counts if len(merged) > 0 { - t.Logf(" Go TextMerge: %d -> %d boxes", len(goBoxes), len(merged)) + t.Logf(" Go lyt.TextMerge: %d -> %d boxes", len(goBoxes), len(merged)) mergeRatio := float64(len(merged)) / float64(len(goBoxes)) pyRatio := float64(s4.BoxesAfter) / float64(s4.BoxesBefore) t.Logf(" Merge ratios: Go=%.0f%% Python=%.0f%%", mergeRatio*100, pyRatio*100) } - // Run Go NaiveVerticalMerge + // Run Go lyt.NaiveVerticalMerge meanW := map[int]float64{0: avg(s1.MeanWidth)} vm := lyt.NaiveVerticalMerge(merged, meanH, meanW, s1.IsEnglish) if s6, ok := snap.Stages["_naive_vertical_merge"]; ok { diff --git a/internal/deepdoc/parser/pdf/table/table_construct.go b/internal/deepdoc/parser/pdf/table/table_construct.go index fdd8ad8d1b..bb7509e95d 100644 --- a/internal/deepdoc/parser/pdf/table/table_construct.go +++ b/internal/deepdoc/parser/pdf/table/table_construct.go @@ -2,6 +2,7 @@ package table import ( "fmt" + "html" "math" "regexp" "sort" @@ -698,7 +699,47 @@ func RowsToHTML(rows [][]pdf.TSRCell, caption string, headerRows map[int]bool, s return b.String() } -// ── Span computation (Python: __cal_spans) ── +// SimpleRowsToHTML converts plain string-based table data to an HTML table. +// The first row is treated as a header (). Used by DOCX, XLSX, PPTX, +// and HTML parsers that produce [][]string directly. +func SimpleRowsToHTML(rows [][]string) string { + if len(rows) == 0 { + return "
" + } + nCols := 0 + for _, row := range rows { + if len(row) > nCols { + nCols = len(row) + } + } + var b strings.Builder + b.WriteString("") + for ri, row := range rows { + b.WriteString("") + tag := "td" + if ri == 0 { + tag = "th" + } + for ci := 0; ci < nCols; ci++ { + text := "" + if ci < len(row) { + text = row[ci] + } + b.WriteString("<") + b.WriteString(tag) + b.WriteString(" >") + b.WriteString(html.EscapeString(text)) + b.WriteString("") + } + b.WriteString("") + } + b.WriteString("
") + return b.String() +} + +// Span computation (Python: __cal_spans) ── // calSpans computes colspan and rowspan for spanning cells in the grid. // Returns spanInfo (row,col → colspan,rowspan) and covered (cells hidden by spans). diff --git a/internal/deepdoc/parser/pdf/table_extract.go b/internal/deepdoc/parser/pdf/table_extract.go index f90deae187..a65f631d24 100644 --- a/internal/deepdoc/parser/pdf/table_extract.go +++ b/internal/deepdoc/parser/pdf/table_extract.go @@ -1,4 +1,4 @@ -package parser +package pdf import ( "context" @@ -12,10 +12,10 @@ import ( util "ragflow/internal/deepdoc/parser/pdf/util" ) -// enrichWithDeepDoc runs DLA+TSR via p.DeepDoc and returns detected tables. +// enrichWithDeepDoc runs DLA+TSR via docAnalyzer and returns detected tables. // pageImages optionally provides pre-rendered page images to avoid re-rendering. -func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult, engine pdf.PDFEngine, boxes []pdf.TextBox, pageImages map[int]image.Image) []pdf.TableItem { - if !p.DeepDoc.Health() { +func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult, engine pdf.PDFEngine, boxes []pdf.TextBox, pageImages map[int]image.Image, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder) []pdf.TableItem { + if !docAnalyzer.Health() { return nil } // Group boxes by page for annotation write-back. @@ -50,7 +50,7 @@ func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult, for i, idx := range indices { pageBoxes[i] = boxes[idx] } - tables := p.extractTableBoxes(ctx, result, pageBoxes, engine, pg, pageImages, len(tableItems)) + tables := p.extractTableBoxes(ctx, result, pageBoxes, engine, pg, pageImages, len(tableItems), docAnalyzer, tb) tableItems = append(tableItems, tables...) // Write back DLA and TSR annotations (R/C/H/SP) to the original boxes. 
for i, idx := range indices { @@ -65,21 +65,21 @@ func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult, return tableItems } -func (p *Parser) extractTableBoxes(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, engine pdf.PDFEngine, pageNum int, pageImages map[int]image.Image, tableBaseIdx int) []pdf.TableItem { +func (p *Parser) extractTableBoxes(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, engine pdf.PDFEngine, pageNum int, pageImages map[int]image.Image, tableBaseIdx int, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder) []pdf.TableItem { pageImg, ok := pageImages[pageNum] if !ok { var err error - pageImg, err = renderPageToImage(engine, pageNum) + pageImg, err = RenderPageToImage(engine, pageNum) if err != nil { slog.Warn("render page for DeepDoc failed", "page", pageNum, "err", err) return nil } } - return p.extractTableBoxesFromImage(ctx, result, boxes, pageImg, pageNum, tableBaseIdx) + return p.extractTableBoxesFromImage(ctx, result, boxes, pageImg, pageNum, tableBaseIdx, docAnalyzer, tb) } -func (p *Parser) extractTableBoxesFromImage(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, pageImg image.Image, pageNum int, tableBaseIdx int) []pdf.TableItem { - regions, err := p.DeepDoc.DLA(ctx, pageImg) +func (p *Parser) extractTableBoxesFromImage(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, pageImg image.Image, pageNum int, tableBaseIdx int, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder) []pdf.TableItem { + regions, err := docAnalyzer.DLA(ctx, pageImg) if err != nil { slog.Warn("DLA failed", "page", pageNum, "err", err) return nil @@ -95,148 +95,117 @@ func (p *Parser) extractTableBoxesFromImage(ctx context.Context, result *pdf.Par tableMatches := tbl.MatchTableRegions(boxes, regions, scale) var items []pdf.TableItem for _, tm := range tableMatches { - cropped, cropErr := util.CropImageRegion(pageImg, tm.Region) - if cropErr != nil { - // DLA returned an 
invalid region (e.g. x1 < x0). Python - // PIL.Image.crop() raises ValueError here; we skip this - // table instead of passing a full-page image to TSR. - continue + item := p.processOneTable(ctx, result, boxes, pageImg, pageNum, docAnalyzer, tb, tm, scale, tableBaseIdx+len(items)) + if item.ImageB64 != "" || len(item.Cells) > 0 || len(item.Positions) > 0 { + items = append(items, item) } + } + return items +} - // Rotation detection (Python: _evaluate_table_orientation). - // If rotated, TSR and OCR use the rotated image; cell coords - // are mapped back to original crop space for box matching. - autoRotate := p.Config.AutoRotateTables != nil && *p.Config.AutoRotateTables - bestAngle := 0 - origW, origH := cropped.Bounds().Dx(), cropped.Bounds().Dy() - tsrImg := cropped - if autoRotate { - angle, rotated, _ := tbl.EvaluateTableOrientation(ctx, cropped, p.DeepDoc) - bestAngle = angle - tsrImg = rotated - } - - imgB64, encErr := util.EncodeImageToBase64PNG(cropped) - if encErr != nil { - slog.Warn("table PNG encode failed", "page", pageNum, "err", encErr) - } - - var cells []pdf.TSRCell - var tsrErr error - cells, tsrErr = p.tableBuilder.DetectCells(ctx, tsrImg) - if tsrErr != nil { - slog.Warn("TSR failed", "page", pageNum, "err", tsrErr) - } - // Collect TSR raw cells for debug comparison. - if tsrErr == nil { - for _, c := range cells { - if result != nil { - result.TSRDebug = append(result.TSRDebug, pdf.TSRRawCell{ - TableIndex: tableBaseIdx + len(items), Page: pageNum, - Label: c.Label, X0: c.X0, Y0: c.Y0, X1: c.X1, Y1: c.Y1, - Text: c.Text, - }) - } - } - } - // Python margin: w*0.03, h*0.03 (_table_transformer_job:374-376). 
- w := tm.Region.X1 - tm.Region.X0 - h := tm.Region.Y1 - tm.Region.Y0 - marginX := w * 0.03 - marginY := h * 0.03 - cropOffX := math.Max(0, tm.Region.X0-marginX) - cropOffY := math.Max(0, tm.Region.Y0-marginY) - - var boxInCrop []pdf.TextBox - if tsrErr == nil && len(cells) > 0 { - if bestAngle != 0 { - // OCR on rotated image before mapping cells back. - // Cells are in rotated-pixel space; OCR works best - // on upright text. After mapping, cells move to - // original crop space where boxInCrop lives. - if !p.Config.SkipOCR { - ocrTableCells(ctx, cells, tsrImg, p.DeepDoc) - } - for i := range cells { - cells[i].X0, cells[i].Y0 = util.MapRotatedPointToOriginal(cells[i].X0, cells[i].Y0, bestAngle, origW, origH) - cells[i].X1, cells[i].Y1 = util.MapRotatedPointToOriginal(cells[i].X1, cells[i].Y1, bestAngle, origW, origH) - } - } - // Fill cell text from pre-merge boxes, skipping caption boxes - // (text entirely above the first TSR cell row). - firstCellTop := 1e9 - for _, c := range cells { - if c.Y0 >= 0 && c.Y0 < firstCellTop { - firstCellTop = c.Y0 - } - } - if firstCellTop == 1e9 { - firstCellTop = cells[0].Y0 // fallback if all cells have Y0 < 0 - } - boxInCrop = make([]pdf.TextBox, 0, len(tm.BoxIdx)) - for _, idx := range tm.BoxIdx { - b := boxes[idx] - if b.Bottom*scale-cropOffY < firstCellTop { - continue // caption box above first TSR cell - } - boxInCrop = append(boxInCrop, tbl.BoxToCropSpace(b, scale, cropOffX, cropOffY)) - } - } - var positions []pdf.Position - for _, idx := range tm.BoxIdx { - b := boxes[idx] - positions = append(positions, pdf.Position{ - PageNumbers: []int{pageNum}, - Left: b.X0, Right: b.X1, - Top: b.Top, Bottom: b.Bottom, +// processOneTable handles DLA+TSR+OCR for a single table region match. 
+func (p *Parser) processOneTable(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, pageImg image.Image, pageNum int, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder, tm tbl.TableMatch, scale float64, tableIdx int) pdf.TableItem { + cropped, cropErr := util.CropImageRegion(pageImg, tm.Region) + if cropErr != nil { + return pdf.TableItem{} + } + autoRotate := p.Config.AutoRotateTables != nil && *p.Config.AutoRotateTables + bestAngle := 0 + origW, origH := cropped.Bounds().Dx(), cropped.Bounds().Dy() + tsrImg := cropped + if autoRotate { + angle, rotated, _ := tbl.EvaluateTableOrientation(ctx, cropped, docAnalyzer) + bestAngle = angle + tsrImg = rotated + } + imgB64, encErr := util.EncodeImageToBase64PNG(cropped) + if encErr != nil { + slog.Warn("table PNG encode failed", "page", pageNum, "err", encErr) + } + cells, tsrErr := tb.DetectCells(ctx, tsrImg) + if tsrErr != nil { + slog.Warn("TSR failed", "page", pageNum, "err", tsrErr) + } + if tsrErr == nil && result != nil { + for _, c := range cells { + result.TSRDebug = append(result.TSRDebug, pdf.TSRRawCell{ + TableIndex: tableIdx, Page: pageNum, + Label: c.Label, X0: c.X0, Y0: c.Y0, X1: c.X1, Y1: c.Y1, Text: c.Text, }) } - // Pre-compute grid from raw TSR cells (without crop offset). - // Stored in pdf.TableItem for constructTable; annotateTableBoxes - // recomputes with offset cells for spatial matching precision. - var grid [][]pdf.TSRCell - if len(cells) > 0 { - grid = p.tableBuilder.GroupCells(cells) - // Fill cell text from boxes in crop space. Works for both - // Label-aware grouping (cells rearranged) vs. cross-product (creates new cells). 
- if len(grid) > 0 { - flat := tbl.FlattenGrid(grid) - tbl.FillCellTextFromBoxes(flat, boxInCrop) - idx := 0 + } + w := tm.Region.X1 - tm.Region.X0 + h := tm.Region.Y1 - tm.Region.Y0 + cropOffX := math.Max(0, tm.Region.X0-w*0.03) + cropOffY := math.Max(0, tm.Region.Y0-h*0.03) + var boxInCrop []pdf.TextBox + if tsrErr == nil && len(cells) > 0 { + if bestAngle != 0 { + if !p.Config.SkipOCR { + ocrTableCells(ctx, cells, tsrImg, docAnalyzer) + } + for i := range cells { + cells[i].X0, cells[i].Y0 = util.MapRotatedPointToOriginal(cells[i].X0, cells[i].Y0, bestAngle, origW, origH) + cells[i].X1, cells[i].Y1 = util.MapRotatedPointToOriginal(cells[i].X1, cells[i].Y1, bestAngle, origW, origH) + } + } + firstCellTop := 1e9 + for _, c := range cells { + if c.Y0 >= 0 && c.Y0 < firstCellTop { + firstCellTop = c.Y0 + } + } + if firstCellTop == 1e9 { + firstCellTop = cells[0].Y0 + } + boxInCrop = make([]pdf.TextBox, 0, len(tm.BoxIdx)) + for _, idx := range tm.BoxIdx { + b := boxes[idx] + if b.Bottom*scale-cropOffY < firstCellTop { + continue + } + boxInCrop = append(boxInCrop, tbl.BoxToCropSpace(b, scale, cropOffX, cropOffY)) + } + } + var positions []pdf.Position + for _, idx := range tm.BoxIdx { + b := boxes[idx] + positions = append(positions, pdf.Position{ + PageNumbers: []int{pageNum}, + Left: b.X0, Right: b.X1, Top: b.Top, Bottom: b.Bottom, + }) + } + var grid [][]pdf.TSRCell + if len(cells) > 0 { + grid = tb.GroupCells(cells) + if len(grid) > 0 { + flat := tbl.FlattenGrid(grid) + tbl.FillCellTextFromBoxes(flat, boxInCrop) + idx := 0 + for ri := range grid { + for ci := range grid[ri] { + grid[ri][ci].Text = flat[idx].Text + idx++ + } + } + if bestAngle == 0 && !p.Config.SkipOCR { + ocrTableCells(ctx, flat, tsrImg, docAnalyzer) + idx = 0 for ri := range grid { for ci := range grid[ri] { grid[ri][ci].Text = flat[idx].Text idx++ } } - if bestAngle == 0 && !p.Config.SkipOCR { - ocrTableCells(ctx, flat, tsrImg, p.DeepDoc) - idx = 0 - for ri := range grid { - for ci := range 
grid[ri] { - grid[ri][ci].Text = flat[idx].Text - idx++ - } - } - } } } - items = append(items, pdf.TableItem{ - ImageB64: imgB64, - Cells: cells, - Grid: grid, - Positions: positions, - Scale: scale, - CropOffX: cropOffX, - CropOffY: cropOffY, - // DLA region in PDF point space (Python's cropout uses layout region boundaries). - RegionLeft: tm.Region.X0 / scale, - RegionRight: tm.Region.X1 / scale, - RegionTop: tm.Region.Y0 / scale, - RegionBottom: tm.Region.Y1 / scale, - }) - - tbl.WriteTableAnnotations(boxes, tm.BoxIdx, cells, scale, cropOffX, cropOffY, p.tableBuilder) } - return items + item := pdf.TableItem{ + ImageB64: imgB64, Cells: cells, Grid: grid, Positions: positions, + Scale: scale, CropOffX: cropOffX, CropOffY: cropOffY, + RegionLeft: tm.Region.X0 / scale, RegionRight: tm.Region.X1 / scale, + RegionTop: tm.Region.Y0 / scale, RegionBottom: tm.Region.Y1 / scale, + } + tbl.WriteTableAnnotations(boxes, tm.BoxIdx, cells, scale, cropOffX, cropOffY, tb) + return item } diff --git a/internal/deepdoc/parser/pdf/table_rotate_integration_test.go b/internal/deepdoc/parser/pdf/table_rotate_integration_test.go index 6163608f45..551600e47c 100644 --- a/internal/deepdoc/parser/pdf/table_rotate_integration_test.go +++ b/internal/deepdoc/parser/pdf/table_rotate_integration_test.go @@ -1,6 +1,6 @@ //go:build cgo && manual -package parser +package pdf import ( "context" @@ -9,6 +9,7 @@ import ( inf "ragflow/internal/deepdoc/parser/pdf/inference" tbl "ragflow/internal/deepdoc/parser/pdf/table" pdf "ragflow/internal/deepdoc/parser/pdf/type" + util "ragflow/internal/deepdoc/parser/pdf/util" "testing" ) @@ -32,7 +33,7 @@ func TestTableRotation_Integration(t *testing.T) { if baseURL == "" { baseURL = "http://localhost:9390" } - dd, err := inf.NewInferenceClient(baseURL) + dd, err := inf.NewClient(baseURL) if err != nil { t.Fatal(err) } @@ -59,10 +60,10 @@ func TestTableRotation_Integration(t *testing.T) { cfg.ToPage = pageCount - 1 autoRotate := true cfg.AutoRotateTables = 
&autoRotate - _ = NewParser(cfg, dd) // verify construction does not panic + _ = NewParser(cfg) // verify construction does not panic for pg := 0; pg < pageCount; pg++ { - pageImg, err := renderPageToImage(eng, pg) + pageImg, err := RenderPageToImage(eng, pg) if err != nil { t.Fatalf("render page %d: %v", pg, err) } @@ -80,7 +81,7 @@ func TestTableRotation_Integration(t *testing.T) { tableCount++ // Crop table region - cropped, err := cropImageRegion(pageImg, r) + cropped, err := util.CropImageRegion(pageImg, r) if err != nil { t.Errorf(" crop table %d: %v", tableCount, err) continue @@ -130,7 +131,7 @@ func TestTableRotation_Stability(t *testing.T) { if baseURL == "" { baseURL = "http://localhost:9390" } - dd, err := inf.NewInferenceClient(baseURL) + dd, err := inf.NewClient(baseURL) if err != nil { t.Fatal(err) } @@ -163,7 +164,7 @@ func TestTableRotation_Stability(t *testing.T) { continue } - pageImg, err := renderPageToImage(eng, 0) + pageImg, err := RenderPageToImage(eng, 0) eng.Close() if err != nil { continue @@ -177,7 +178,11 @@ func TestTableRotation_Stability(t *testing.T) { continue } tables++ - cropped, _ := cropImageRegion(pageImg, r) + cropped, err := util.CropImageRegion(pageImg, r) + if err != nil { + t.Errorf(" %s crop table: %v", e.Name(), err) + continue + } if cropped == nil { continue } diff --git a/internal/deepdoc/parser/pdf/table_section_test.go b/internal/deepdoc/parser/pdf/table_section_test.go index fd400f3ab0..3ca41a41d3 100644 --- a/internal/deepdoc/parser/pdf/table_section_test.go +++ b/internal/deepdoc/parser/pdf/table_section_test.go @@ -1,4 +1,4 @@ -package parser +package pdf import ( "context" @@ -16,11 +16,11 @@ import ( // entries. Go backfills pdf.Section.Text from pdf.TableItem.Rows after // linkTableSections. 
func TestTableSection_TextFromTSR(t *testing.T) { - eng := &mockEngine{ - pageCount: 1, - renderW: 900, // 300pt at 3x = 900px (216 DPI) - renderH: 600, - chars: map[int][]pdf.TextChar{0: { + eng := &MockEngine{ + NumPages: 1, + RenderW: 900, // 300pt at 3x = 900px (216 DPI) + RenderH: 600, + Chars: map[int][]pdf.TextChar{0: { // PDF space (72 DPI): well inside DLA region {X0: 50, X1: 70, Top: 40, Bottom: 55, Text: "姓"}, {X0: 80, X1: 100, Top: 40, Bottom: 55, Text: "名"}, @@ -42,9 +42,9 @@ func TestTableSection_TextFromTSR(t *testing.T) { {X0: 200, Y0: 100, X1: 460, Y1: 220, Text: "25", Label: "table row"}, }, } - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) - result, err := p.Parse(context.Background(), eng) + result, err := p.ParseRaw(context.Background(), eng, mock) if err != nil { t.Fatalf("Parse: %v", err) } @@ -93,14 +93,14 @@ func TestEnrichWithDeepDoc_ImageOnlyPage(t *testing.T) { {X0: 0, Y0: 0, X1: 200, Y1: 100, Text: "A", Label: "table row"}, }, } - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) // 0 text boxes, but page 0 has a rendered image. boxes := []pdf.TextBox{} dummyImg := image.NewRGBA(image.Rect(0, 0, 900, 600)) pageImages := map[int]image.Image{0: dummyImg} - tables := p.enrichWithDeepDoc(context.Background(), nil, nil, boxes, pageImages) + tables := p.enrichWithDeepDoc(context.Background(), nil, nil, boxes, pageImages, mock, NewTableBuilderFor(mock)) if len(tables) == 0 { t.Fatal("enrichWithDeepDoc: expected at least 1 table from DLA on page with image but no boxes, got 0") } @@ -113,10 +113,10 @@ func TestEnrichWithDeepDoc_ImageOnlyPage(t *testing.T) { // is merged into the nearest "figure" pdf.Section and the caption pdf.Section is // removed. Matches Python _extract_table_figure caption matching. 
func TestFigureCaption_MergedIntoFigure(t *testing.T) { - eng := &mockEngine{ - pageCount: 1, - renderW: 1800, renderH: 2400, - chars: map[int][]pdf.TextChar{0: { + eng := &MockEngine{ + NumPages: 1, + RenderW: 1800, RenderH: 2400, + Chars: map[int][]pdf.TextChar{0: { // Figure text — overlaps DLA figure region (pixel Y=80-300 → PDF 27-100). {X0: 40, X1: 60, Top: 30, Bottom: 45, Text: "F"}, // Caption text — overlaps DLA figure caption region (pixel Y=310-340 → PDF 103-113). @@ -131,9 +131,9 @@ func TestFigureCaption_MergedIntoFigure(t *testing.T) { {X0: 100, Y0: 310, X1: 500, Y1: 340, Label: "figure caption", Confidence: 0.9}, }, } - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) - result, err := p.Parse(context.Background(), eng) + result, err := p.ParseRaw(context.Background(), eng, mock) if err != nil { t.Fatalf("Parse: %v", err) } @@ -169,10 +169,10 @@ func TestFigureCaption_MergedIntoFigure(t *testing.T) { // TestTableCaption_MergedIntoTable verifies that "table caption" text // is merged into the nearest table pdf.Section and the caption is removed. func TestTableCaption_MergedIntoTable(t *testing.T) { - eng := &mockEngine{ - pageCount: 1, - renderW: 1800, renderH: 2400, - chars: map[int][]pdf.TextChar{0: { + eng := &MockEngine{ + NumPages: 1, + RenderW: 1800, RenderH: 2400, + Chars: map[int][]pdf.TextChar{0: { // Table text — overlaps DLA table region (pixel Y=80-300 → PDF 27-100). {X0: 40, X1: 60, Top: 30, Bottom: 45, Text: "T"}, // Caption text — overlaps DLA table caption region (pixel Y=310-340 → PDF 103-113). 
@@ -190,9 +190,9 @@ func TestTableCaption_MergedIntoTable(t *testing.T) { {X0: 200, Y0: 0, X1: 460, Y1: 100, Text: "B", Label: "table row"}, }, } - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) - result, err := p.Parse(context.Background(), eng) + result, err := p.ParseRaw(context.Background(), eng, mock) if err != nil { t.Fatalf("Parse: %v", err) } @@ -224,10 +224,10 @@ func TestTableCaption_MergedIntoTable(t *testing.T) { // boxes overlapping a table region, regardless of their DLA label. // This is the #1 cause of Go vs Python discrepancy on table-heavy PDFs. func TestTextSectionsInsideTableRegion_Suppressed(t *testing.T) { - eng := &mockEngine{ - pageCount: 1, - renderW: 1800, renderH: 2400, - chars: map[int][]pdf.TextChar{0: { + eng := &MockEngine{ + NumPages: 1, + RenderW: 1800, RenderH: 2400, + Chars: map[int][]pdf.TextChar{0: { // Box A: inside DLA table region, labeled as "text" by DLA. {X0: 50, X1: 100, Top: 40, Bottom: 55, Text: "碎片文字"}, // Box B: inside DLA table region, same situation. @@ -247,9 +247,9 @@ func TestTextSectionsInsideTableRegion_Suppressed(t *testing.T) { {X0: 200, Y0: 0, X1: 460, Y1: 100, Text: "年龄", Label: "table row"}, }, } - p := NewParser(pdf.DefaultParserConfig(), mock) + p := NewParser(pdf.DefaultParserConfig()) - result, err := p.Parse(context.Background(), eng) + result, err := p.ParseRaw(context.Background(), eng, mock) if err != nil { t.Fatalf("Parse: %v", err) } @@ -286,9 +286,10 @@ func TestTextSectionsInsideTableRegion_Suppressed(t *testing.T) { // TestEmptyDoc_NoCrash verifies Parse handles edge cases gracefully. 
func TestEmptyDoc_NoCrash(t *testing.T) { - eng := &mockEngine{pageCount: 0} - p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true}) - result, err := p.Parse(context.Background(), eng) + eng := &MockEngine{NumPages: 0} + mock := &MockDocAnalyzer{Healthy: true} + p := NewParser(pdf.DefaultParserConfig()) + result, err := p.ParseRaw(context.Background(), eng, mock) if err != nil { t.Fatalf("Parse: %v", err) } @@ -299,13 +300,69 @@ func TestEmptyDoc_NoCrash(t *testing.T) { // TestNilChars_handled verifies zero-chars pages don't crash. func TestNilChars_Handled(t *testing.T) { - eng := &mockEngine{pageCount: 1, renderW: 200, renderH: 200} - p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true}) - result, err := p.Parse(context.Background(), eng) + eng := &MockEngine{NumPages: 1, RenderW: 200, RenderH: 200} + mock := &MockDocAnalyzer{Healthy: true} + p := NewParser(pdf.DefaultParserConfig()) + result, err := p.ParseRaw(context.Background(), eng, mock) if err != nil { t.Fatalf("Parse: %v", err) } - if len(result.Sections) != 0 && p.DeepDoc != nil { + if len(result.Sections) != 0 { t.Logf("nil chars + DeepDoc: sections=%d (may trigger OCR path)", len(result.Sections)) } } + +func TestMatchTableImage_ByPositions(t *testing.T) { + tableByRegion := map[string]string{ + "0_50.0_500.0_100.0_300.0": "img_base64_positions", + } + sec := &pdf.Section{ + LayoutType: pdf.LayoutTypeTable, + Positions: []pdf.Position{{PageNumbers: []int{0}, Left: 50.0, Right: 500.0, Top: 100.0, Bottom: 300.0}}, + } + img, ok := matchTableImage(sec, tableByRegion) + if !ok { + t.Fatal("expected match by Positions") + } + if img != "img_base64_positions" { + t.Errorf("got %q, want img_base64_positions", img) + } +} + +func TestMatchTableImage_FallbackToRegion(t *testing.T) { + tableByRegion := map[string]string{ + "0_80.0_520.0_200.0_400.0": "img_base64_region", + } + sec := &pdf.Section{ + LayoutType: pdf.LayoutTypeTable, + Positions: nil, + TableItem: 
&pdf.TableItem{RegionLeft: 80.0, RegionRight: 520.0, RegionTop: 200.0, RegionBottom: 400.0}, + } + img, ok := matchTableImage(sec, tableByRegion) + if !ok { + t.Fatal("expected match by Region fallback") + } + if img != "img_base64_region" { + t.Errorf("got %q, want img_base64_region", img) + } +} + +func TestMatchTableImage_NoMatch(t *testing.T) { + tableByRegion := map[string]string{"0_10.0_20.0_30.0_40.0": "no_chance"} + sec := &pdf.Section{ + LayoutType: pdf.LayoutTypeTable, + Positions: []pdf.Position{{PageNumbers: []int{0}, Left: 100, Right: 200, Top: 300, Bottom: 400}}, + } + _, ok := matchTableImage(sec, tableByRegion) + if ok { + t.Error("expected no match") + } +} + +func TestMatchTableImage_EmptySection(t *testing.T) { + sec := &pdf.Section{LayoutType: pdf.LayoutTypeTable} + _, ok := matchTableImage(sec, map[string]string{"x": "y"}) + if ok { + t.Error("expected no match for empty section") + } +} diff --git a/internal/deepdoc/parser/pdf/test_helpers_test.go b/internal/deepdoc/parser/pdf/test_helpers_test.go index f5937a333f..81ea5bcbd0 100644 --- a/internal/deepdoc/parser/pdf/test_helpers_test.go +++ b/internal/deepdoc/parser/pdf/test_helpers_test.go @@ -1,4 +1,4 @@ -package parser +package pdf import ( "image" @@ -6,48 +6,6 @@ import ( pdf "ragflow/internal/deepdoc/parser/pdf/type" ) -// ── mockEngine: minimal pdf.PDFEngine stub for unit tests ───────────── - -type mockEngine struct { - chars map[int][]pdf.TextChar - pageCount int - renderW int - renderH int -} - -func (m *mockEngine) ExtractChars(pg int) ([]pdf.TextChar, error) { - return m.chars[pg], nil -} -func (m *mockEngine) RenderPage(pg int, dpi float64) ([]byte, error) { - w, h := m.renderW, m.renderH - if w <= 0 { - w = 595 - } - if h <= 0 { - h = 842 - } - return nil, nil -} -func (m *mockEngine) RenderPageImage(pg int, dpi float64) (image.Image, error) { - w, h := m.renderW, m.renderH - if w <= 0 { - w = 100 - } - if h <= 0 { - h = 100 - } - return image.NewRGBA(image.Rect(0, 0, w, h)), nil 
-} -func (m *mockEngine) PageCount() (int, error) { - if m.pageCount <= 0 { - return 1, nil - } - return m.pageCount, nil -} -func (m *mockEngine) RawData() []byte { return nil } -func (m *mockEngine) Close() error { return nil } -func (m *mockEngine) Outlines() ([]pdf.Outline, error) { return nil, nil } - // ── testPageImg: small test image for ocrMergeChars tests ───────────── // 90×120 px at 216 DPI → 30×40 pt in PDF space after /3.0 scaling. diff --git a/internal/deepdoc/parser/pdf/text_dump_test.go b/internal/deepdoc/parser/pdf/text_dump_test.go index c798fa9411..baa7fabd3b 100644 --- a/internal/deepdoc/parser/pdf/text_dump_test.go +++ b/internal/deepdoc/parser/pdf/text_dump_test.go @@ -1,6 +1,6 @@ //go:build cgo && manual -package parser +package pdf import ( "context" @@ -66,8 +66,8 @@ func TestDumpTextOutput(t *testing.T) { } cfg := pdf.DefaultParserConfig() - p := NewParser(cfg, &MockDocAnalyzer{Healthy: true}) - result, err := p.Parse(context.Background(), eng) + p := NewParser(cfg) + result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true}) eng.Close() if err != nil { t.Logf("[%d/%d] %s — parse error: %v", i+1, count, name, err) diff --git a/internal/deepdoc/parser/pdf/type/types.go b/internal/deepdoc/parser/pdf/type/types.go index 58b89bf26a..d5fa0cdfa3 100644 --- a/internal/deepdoc/parser/pdf/type/types.go +++ b/internal/deepdoc/parser/pdf/type/types.go @@ -1,320 +1,56 @@ -// Package pdftypes provides shared types, interfaces, and constants for the -// PDF parser pipeline. It has zero dependencies on sibling packages so that -// sub-packages (tables, geometry, etc.) can import it without circular imports. +// Package pdftype provides PDF-specific types and re-exports shared types +// from the doctype package via Go type aliases. Existing PDF parser code +// that imports this package continues to work without changes. 
package pdftype -import ( - "context" - "image" - "unicode" -) +import doctype "ragflow/internal/deepdoc/parser/type" -// ── Pipeline types ──────────────────────────────────────────────────────── +// ── Re-export shared types via aliases ───────────────────────────────────── -// PipelineMetrics records diagnostic counts at each pipeline stage. -type PipelineMetrics struct { - BoxesInitial int - BoxesTextMerge int - BoxesVertMerge int - BoxesFinal int - TablesCount int -} +type PipelineMetrics = doctype.PipelineMetrics +type ParseResult = doctype.ParseResult +type DLAPageRegions = doctype.DLAPageRegions +type TSRRawCell = doctype.TSRRawCell +type TextChar = doctype.TextChar +type TextBox = doctype.TextBox +type Position = doctype.Position +type Section = doctype.Section +type TableItem = doctype.TableItem +type TSRCell = doctype.TSRCell +type DLARegion = doctype.DLARegion +type OCRBox = doctype.OCRBox +type OCRText = doctype.OCRText +type ParserConfig = doctype.ParserConfig +type DocAnalyzer = doctype.DocAnalyzer +type Outline = doctype.Outline +type PDFEngine = doctype.PDFEngine +type Tokenizer = doctype.Tokenizer +type SampleFunc = doctype.SampleFunc +type TableBuilder = doctype.TableBuilder +type Rectangular = doctype.Rectangular -// ParseResult encapsulates all outputs from a single Parse() call. -type ParseResult struct { - Sections []Section - Tables []TableItem - PageImages map[int]image.Image - Metrics PipelineMetrics - Outlines []Outline // PDF outlines/bookmarks extracted from the document +// ── Re-export constants ──────────────────────────────────────────────────── - DLADebug []DLAPageRegions - TSRDebug []TSRRawCell -} - -// Figures returns all sections with LayoutType "figure". -// Computed on demand from Sections — no stored field. -func (r *ParseResult) Figures() []Section { - return CollectFigures(r.Sections) -} - -// DLAPageRegions holds DLA layout regions for one page. 
-type DLAPageRegions struct { - Page int - Regions []DLARegion -} - -// TSRRawCell holds a raw TSR cell before row/column grouping. -type TSRRawCell struct { - TableIndex int `json:"table_index"` - Page int `json:"page"` - Label string `json:"label"` - X0 float64 `json:"x0"` - Y0 float64 `json:"y0"` - X1 float64 `json:"x1"` - Y1 float64 `json:"y1"` - Text string `json:"text"` -} - -// ── Character and text box types ────────────────────────────────────────── - -// TextChar represents a single character extracted from a PDF page. -type TextChar struct { - X0, X1 float64 - Top, Bottom float64 - Text string - FontName string - FontSize float64 - PageNumber int - LayoutType string - LayoutNo string - ColID int - R int -} - -func (c TextChar) Bounds() (float64, float64, float64, float64) { - return c.X0, c.Top, c.X1, c.Bottom -} - -// TextBox represents a rectangular region of text on a PDF page. -type TextBox struct { - X0, X1 float64 - Top, Bottom float64 - Text string - PageNumber int - LayoutType string - LayoutNo string - ColID int - R int - // Post-TSR table annotation fields (Python: R/H/C/SP tags) - RTop, RBott float64 - HTop, HBott float64 - HLeft, HRight float64 - H int - C int - CLeft, CRight float64 - SP int -} - -func (b TextBox) Bounds() (float64, float64, float64, float64) { - return b.X0, b.Top, b.X1, b.Bottom -} - -// ── Position and section types ──────────────────────────────────────────── - -// Position represents a parsed position tag from @@...## format. -type Position struct { - PageNumbers []int - Left float64 - Right float64 - Top float64 - Bottom float64 -} - -// Section represents a text segment with its spatial position on a PDF page. 
-type Section struct { - Text string - PositionTag string - LayoutType string - DocTypeKwd string // "text"/"table"/"image" — assigned during post-processing - Positions []Position - TableItem *TableItem - Image string // base64-encoded cropped page image -} - -// SectionsByPage returns a slice of sections on the given page. -func SectionsByPage(sections []Section, page int) []Section { - var out []Section - for _, s := range sections { - for _, p := range s.Positions { - for _, pn := range p.PageNumbers { - if pn == page { - out = append(out, s) - break - } - } - } - } - return out -} - -// CollectFigures returns all sections with LayoutType "figure". -func CollectFigures(sections []Section) []Section { - if sections == nil { - return nil - } - figures := make([]Section, 0) - for _, s := range sections { - if s.LayoutType == LayoutTypeFigure { - figures = append(figures, s) - } - } - return figures -} - -// ── Table types ─────────────────────────────────────────────────────────── - -// TableItem represents a detected table or figure region. -type TableItem struct { - ImageB64 string - Rows [][]string - Cells []TSRCell - Positions []Position - Scale float64 - CropOffX float64 - CropOffY float64 - Caption string - - RegionLeft, RegionRight, RegionTop, RegionBottom float64 - NoMerge bool - Grid [][]TSRCell -} - -// TSRCell represents one table cell from TSR. -type TSRCell struct { - X0, Y0, X1, Y1 float64 - Text string - Label string -} - -func (c TSRCell) Bounds() (float64, float64, float64, float64) { - return c.X0, c.Y0, c.X1, c.Y1 -} - -// ── DeepDoc vision types ───────────────────────────────────────────────── - -// DLARegion represents one detected layout region. -type DLARegion struct { - X0, Y0, X1, Y1 float64 - Label string - Confidence float64 -} - -func (r DLARegion) Bounds() (float64, float64, float64, float64) { - return r.X0, r.Y0, r.X1, r.Y1 -} - -// OCRBox represents a detected text region from DeepDoc OCR detection. 
-type OCRBox struct { - X0, Y0, X1, Y1, X2, Y2, X3, Y3 float64 -} - -// OCRText represents recognized text with confidence from DeepDoc OCR rec. -type OCRText struct { - Text string - Confidence float64 -} - -// ── Parser configuration ────────────────────────────────────────────────── - -// ParserConfig holds parser configuration. -type ParserConfig struct { - Zoom float64 - FromPage int - ToPage int - TableContextSize int - ImageContextSize int - AutoRotateTables *bool - SeparateTablesFigs bool - SortByTop bool - BatchSize int - SkipOCR bool - MaxOCRConcurrency int - TableBuilder TableBuilder -} - -// DefaultParserConfig returns a ParserConfig with sensible defaults. -func DefaultParserConfig() ParserConfig { - return ParserConfig{ - Zoom: 3, - FromPage: 0, - ToPage: -1, - BatchSize: 50, - TableContextSize: 0, - ImageContextSize: 0, - SeparateTablesFigs: false, - } -} - -// DlaDPI is the DPI used for rendering page images for DeepDoc DLA/OCR. -const DlaDPI = 216 - -// DlaScale is the scale factor from PDF points (72 DPI) to DLA image space. 
-const DlaScale = DlaDPI / 72.0 - -// ── Layout type constants ───────────────────────────────────────────────── +const DlaDPI = doctype.DlaDPI +const DlaScale = doctype.DlaScale const ( - LayoutTypeText = "text" - LayoutTypeTable = "table" - LayoutTypeFigure = "figure" - LayoutTypeEquation = "equation" - LayoutTypeTitle = "title" - LayoutTypeReference = "reference" - LayoutTypeFooter = "footer" - LayoutTypeHeader = "header" - - DLALabelFigureCaption = "figure caption" - DLALabelTableCaption = "table caption" + LayoutTypeText = doctype.LayoutTypeText + LayoutTypeTable = doctype.LayoutTypeTable + LayoutTypeFigure = doctype.LayoutTypeFigure + LayoutTypeEquation = doctype.LayoutTypeEquation + LayoutTypeTitle = doctype.LayoutTypeTitle + LayoutTypeReference = doctype.LayoutTypeReference + LayoutTypeFooter = doctype.LayoutTypeFooter + LayoutTypeHeader = doctype.LayoutTypeHeader + DLALabelFigureCaption = doctype.DLALabelFigureCaption + DLALabelTableCaption = doctype.DLALabelTableCaption ) -// ── Interfaces ──────────────────────────────────────────────────────────── +// ── Re-export functions and variables ────────────────────────────────────── -// DocAnalyzer abstracts DeepDoc vision operations. -type DocAnalyzer interface { - DLA(ctx context.Context, pageImage image.Image) ([]DLARegion, error) - TSR(ctx context.Context, cropped image.Image) ([]TSRCell, error) - OCRDetect(ctx context.Context, cropped image.Image) ([]OCRBox, error) - OCRRecognize(ctx context.Context, cropped image.Image) ([]OCRText, error) - OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]OCRText, []error) - Health() bool -} - -// ── Outline ──────────────────────────────────────────────────────────── - -// Outline represents one entry in a PDF's document outline (table of contents). 
-// Python: extract_pdf_outlines() in deepdoc/parser/utils.py -type Outline struct { - Title string - Level int - PageNumber int // 1-indexed, matching Python -} - -// PDFEngine abstracts page extraction capabilities. -type PDFEngine interface { - ExtractChars(pageNum int) ([]TextChar, error) - RenderPage(pageNum int, dpi float64) ([]byte, error) - RenderPageImage(pageNum int, dpi float64) (image.Image, error) - RawData() []byte - PageCount() (int, error) - Outlines() ([]Outline, error) - Close() error -} - -// Tokenizer provides text tokenization matching rag_tokenizer. -type Tokenizer interface { - Tag(token string) string -} - -// SampleFunc samples up to n characters from a page's chars. -type SampleFunc func(chars []TextChar, n int) string - -// TableBuilder encapsulates TSR model-specific cell detection and grouping. -type TableBuilder interface { - Name() string - DetectCells(ctx context.Context, cropped image.Image) ([]TSRCell, error) - GroupCells(cells []TSRCell) [][]TSRCell -} - -// Rectangular is any 2D axis-aligned rectangle that can report its bounds. -type Rectangular interface { - Bounds() (x0, y0, x1, y1 float64) -} - -// IsCJK reports whether r is a CJK character. -func IsCJK(r rune) bool { - return unicode.Is(unicode.Han, r) || - unicode.Is(unicode.Hiragana, r) || - unicode.Is(unicode.Katakana, r) || - unicode.Is(unicode.Hangul, r) -} +var ( + CollectFigures = doctype.CollectFigures + DefaultParserConfig = doctype.DefaultParserConfig + IsCJK = doctype.IsCJK +) diff --git a/internal/deepdoc/parser/pdf/util/geometry.go b/internal/deepdoc/parser/pdf/util/geometry.go index 51e9e30bfe..66ed7ad26c 100644 --- a/internal/deepdoc/parser/pdf/util/geometry.go +++ b/internal/deepdoc/parser/pdf/util/geometry.go @@ -131,34 +131,6 @@ func OverlapX(a, b pdf.Rectangular) float64 { return overlap / minWidth } -// SortXByPage sorts boxes by page_number, then x0, then top. 
-// After sorting, corrects for same-page boxes that have nearly the same x0 -// but inverted top ordering (a layout artifact). -// -// Python: pdf_parser.py:178 sort_X_by_page() -func SortXByPage(boxes []pdf.TextBox, threshold float64) []pdf.TextBox { - sort.Slice(boxes, func(i, j int) bool { - if boxes[i].PageNumber != boxes[j].PageNumber { - return boxes[i].PageNumber < boxes[j].PageNumber - } - if boxes[i].X0 != boxes[j].X0 { - return boxes[i].X0 < boxes[j].X0 - } - return boxes[i].Top < boxes[j].Top - }) - - for i := len(boxes) - 1; i >= 1; i-- { - for j := i - 1; j >= 0; j-- { - if math.Abs(boxes[j+1].X0-boxes[j].X0) < threshold && - boxes[j+1].Top < boxes[j].Top && - boxes[j+1].PageNumber == boxes[j].PageNumber { - boxes[j], boxes[j+1] = boxes[j+1], boxes[j] - } - } - } - return boxes -} - // MedianCharHeight computes the median character height for a page, // matching Python's np.median(char height) in __images__ (pdf_parser.py:1552). // Used as a reference unit for vertical spacing decisions. 
diff --git a/internal/deepdoc/parser/pdf/util/geometry_test.go b/internal/deepdoc/parser/pdf/util/geometry_test.go index 89dca3c567..1a45650b8e 100644 --- a/internal/deepdoc/parser/pdf/util/geometry_test.go +++ b/internal/deepdoc/parser/pdf/util/geometry_test.go @@ -49,22 +49,6 @@ func TestYDis(t *testing.T) { } } -func TestSortXByPage(t *testing.T) { - boxes := []pdf.TextBox{ - {PageNumber: 1, X0: 100, Top: 50, Text: "C"}, - {PageNumber: 1, X0: 50, Top: 100, Text: "A"}, - {PageNumber: 1, X0: 50, Top: 30, Text: "B"}, - {PageNumber: 0, X0: 0, Top: 0, Text: "D"}, - } - result := SortXByPage(boxes, 3) - if result[0].Text != "D" { - t.Errorf("first should be page 0: got %q", result[0].Text) - } - if result[1].Text != "B" || result[2].Text != "A" { - t.Errorf("page 1 ordering wrong: %q, %q", result[1].Text, result[2].Text) - } -} - func TestOverlapX(t *testing.T) { b1 := pdf.TextBox{X0: 50, X1: 200} b2 := pdf.TextBox{X0: 100, X1: 250} diff --git a/internal/deepdoc/parser/pdf/ycoord_test.go b/internal/deepdoc/parser/pdf/ycoord_test.go index c49ea17b31..f45a35e334 100644 --- a/internal/deepdoc/parser/pdf/ycoord_test.go +++ b/internal/deepdoc/parser/pdf/ycoord_test.go @@ -1,6 +1,6 @@ //go:build cgo && manual -package parser +package pdf import ( "math" @@ -8,6 +8,7 @@ import ( "path/filepath" "testing" + lyt "ragflow/internal/deepdoc/parser/pdf/layout" "ragflow/internal/deepdoc/parser/pdf/pdfoxide" pdf "ragflow/internal/deepdoc/parser/pdf/type" ) @@ -41,7 +42,7 @@ func TestYCoord_SameLineCharsHaveEqualBottom(t *testing.T) { t.Fatal("no chars") } - lines := groupCharsToLines(chars, false) + lines := lyt.GroupCharsToLines(chars, false) for li, line := range lines { if len(line) <= 1 { continue diff --git a/internal/deepdoc/parser/type/types.go b/internal/deepdoc/parser/type/types.go new file mode 100644 index 0000000000..946c722924 --- /dev/null +++ b/internal/deepdoc/parser/type/types.go @@ -0,0 +1,304 @@ +// Package doctype provides shared types, interfaces, and constants 
for the +// deepdoc parser pipeline. All format-specific parsers (pdf, docx, xlsx, etc.) +// share these definitions. The package has zero dependencies on sibling +// packages so that any sub-package can import it without circular imports. +package doctype + +import ( + "context" + "image" + "unicode" +) + +// ── Pipeline types ──────────────────────────────────────────────────────── + +// PipelineMetrics records diagnostic counts at each pipeline stage. +type PipelineMetrics struct { + BoxesInitial int + BoxesTextMerge int + BoxesVertMerge int + BoxesFinal int + TablesCount int +} + +// ParseResult encapsulates all outputs from a single Parse() call. +type ParseResult struct { + Sections []Section + Tables []TableItem + PageImages map[int]image.Image + Metrics PipelineMetrics + Outlines []Outline // PDF outlines/bookmarks extracted from the document + + DLADebug []DLAPageRegions + TSRDebug []TSRRawCell +} + +// Figures returns all sections with LayoutType "figure". +// Computed on demand from Sections — no stored field. +func (r *ParseResult) Figures() []Section { + return CollectFigures(r.Sections) +} + +// DLAPageRegions holds DLA layout regions for one page. +type DLAPageRegions struct { + Page int + Regions []DLARegion +} + +// TSRRawCell holds a raw TSR cell before row/column grouping. +type TSRRawCell struct { + TableIndex int `json:"table_index"` + Page int `json:"page"` + Label string `json:"label"` + X0 float64 `json:"x0"` + Y0 float64 `json:"y0"` + X1 float64 `json:"x1"` + Y1 float64 `json:"y1"` + Text string `json:"text"` +} + +// ── Character and text box types ────────────────────────────────────────── + +// TextChar represents a single character extracted from a PDF page. 
type TextChar struct {
	// Horizontal and vertical extent; see Bounds for the reporting order.
	X0     float64
	X1     float64
	Top    float64
	Bottom float64

	Text       string
	FontName   string
	FontSize   float64
	PageNumber int
	LayoutType string
	LayoutNo   string
	ColID      int
	R          int
}

// Bounds reports the character rectangle as (X0, Top, X1, Bottom), which is
// the shape required by the Rectangular interface.
func (c TextChar) Bounds() (float64, float64, float64, float64) {
	return c.X0, c.Top, c.X1, c.Bottom
}

// TextBox is a rectangular run of text located on a PDF page.
type TextBox struct {
	// Horizontal and vertical extent; see Bounds for the reporting order.
	X0     float64
	X1     float64
	Top    float64
	Bottom float64

	Text       string
	PageNumber int
	LayoutType string
	LayoutNo   string
	ColID      int
	R          int

	// Post-TSR table annotation fields (Python: R/H/C/SP tags)
	RTop   float64
	RBott  float64
	HTop   float64
	HBott  float64
	HLeft  float64
	HRight float64
	H      int
	C      int
	CLeft  float64
	CRight float64
	SP     int
}

// Bounds reports the box rectangle as (X0, Top, X1, Bottom), which is the
// shape required by the Rectangular interface.
func (b TextBox) Bounds() (float64, float64, float64, float64) {
	return b.X0, b.Top, b.X1, b.Bottom
}

// ── Position and section types ────────────────────────────────────────────

// Position is the decoded form of a position tag written in @@...## format.
type Position struct {
	PageNumbers []int
	Left        float64
	Right       float64
	Top         float64
	Bottom      float64
}

// Section is a text segment together with its spatial position on a PDF page.
type Section struct {
	Text        string
	PositionTag string
	LayoutType  string
	DocTypeKwd  string // "text"/"table"/"image" — assigned during post-processing
	Positions   []Position
	TableItem   *TableItem
	Image       string // base64-encoded cropped page image
}

// CollectFigures returns, in their original order, the sections whose
// LayoutType equals LayoutTypeFigure.
//
// A nil input yields nil. Any non-nil input yields a non-nil slice, which is
// empty when no section matches.
func CollectFigures(sections []Section) []Section {
	if sections == nil {
		return nil
	}
	matched := []Section{}
	for i := range sections {
		if sections[i].LayoutType != LayoutTypeFigure {
			continue
		}
		matched = append(matched, sections[i])
	}
	return matched
}

// ── Table types ───────────────────────────────────────────────────────────

// TableItem describes a detected table or figure region.
type TableItem struct {
	ImageB64  string
	Rows      [][]string
	Cells     []TSRCell
	Positions []Position
	Scale     float64
	CropOffX  float64
	CropOffY  float64
	Caption   string

	RegionLeft   float64
	RegionRight  float64
	RegionTop    float64
	RegionBottom float64
	NoMerge      bool
	Grid         [][]TSRCell
}

// TSRCell is a single table cell produced by TSR.
type TSRCell struct {
	X0, Y0, X1, Y1 float64
	Text, Label    string
}

// Bounds reports the cell rectangle as (X0, Y0, X1, Y1).
func (c TSRCell) Bounds() (float64, float64, float64, float64) {
	return c.X0, c.Y0, c.X1, c.Y1
}

// ── DeepDoc vision types ─────────────────────────────────────────────────

// DLARegion is a single detected layout region.
type DLARegion struct {
	X0, Y0, X1, Y1 float64
	Label          string
	Confidence     float64
}

// Bounds reports the region rectangle as (X0, Y0, X1, Y1).
func (r DLARegion) Bounds() (float64, float64, float64, float64) {
	return r.X0, r.Y0, r.X1, r.Y1
}

// OCRBox is a text region found by DeepDoc OCR detection, stored as four
// (X, Y) coordinate pairs.
type OCRBox struct {
	X0, Y0 float64
	X1, Y1 float64
	X2, Y2 float64
	X3, Y3 float64
}

// OCRText is text recognized by DeepDoc OCR rec, paired with its confidence.
type OCRText struct {
	Text       string
	Confidence float64
}

// ── Parser configuration ──────────────────────────────────────────────────

// ParserConfig carries the parser's tunable settings.
type ParserConfig struct {
	Zoom               float64
	FromPage, ToPage   int
	TableContextSize   int
	ImageContextSize   int
	AutoRotateTables   *bool
	SeparateTablesFigs bool
	SortByTop          bool
	BatchSize          int
	SkipOCR            bool
	MaxOCRConcurrency  int
}

// DefaultParserConfig returns the baseline ParserConfig: Zoom 3, ToPage -1
// and BatchSize 50. Every other field is left at its zero value.
func DefaultParserConfig() ParserConfig {
	var cfg ParserConfig
	cfg.Zoom = 3
	cfg.ToPage = -1
	cfg.BatchSize = 50
	return cfg
}

const (
	// DlaDPI is the DPI used for rendering page images for DeepDoc DLA/OCR.
	DlaDPI = 216

	// DlaScale is the scale factor from PDF points (72 DPI) to DLA image space.
	DlaScale = DlaDPI / 72.0
)

// ── Layout type constants ─────────────────────────────────────────────────

// Layout type names, as compared against Section.LayoutType by CollectFigures.
const (
	LayoutTypeText      = "text"
	LayoutTypeTable     = "table"
	LayoutTypeFigure    = "figure"
	LayoutTypeEquation  = "equation"
	LayoutTypeTitle     = "title"
	LayoutTypeReference = "reference"
	LayoutTypeFooter    = "footer"
	LayoutTypeHeader    = "header"
)

// DLA label names for caption regions.
const (
	DLALabelFigureCaption = "figure caption"
	DLALabelTableCaption  = "table caption"
)

// ── Interfaces ────────────────────────────────────────────────────────────

// DocAnalyzer is the abstraction over DeepDoc vision operations.
type DocAnalyzer interface {
	// DLA returns the layout regions detected on a page image.
	DLA(ctx context.Context, pageImage image.Image) ([]DLARegion, error)
	// TSR returns the table cells detected in a cropped image.
	TSR(ctx context.Context, cropped image.Image) ([]TSRCell, error)
	// OCRDetect returns the text regions detected in a cropped image.
	OCRDetect(ctx context.Context, cropped image.Image) ([]OCRBox, error)
	// OCRRecognize returns the text recognized in a cropped image.
	OCRRecognize(ctx context.Context, cropped image.Image) ([]OCRText, error)
	// OCRRecognizeBatch is the multi-image form of OCRRecognize; it returns a
	// slice of results and a slice of errors.
	OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]OCRText, []error)
	Health() bool
}

// ── Outline ────────────────────────────────────────────────────────────

// Outline is one entry of a PDF's document outline (table of contents).
// Python: extract_pdf_outlines() in deepdoc/parser/utils.py
type Outline struct {
	Title      string
	Level      int
	PageNumber int // 1-indexed, matching Python
}

// PDFEngine is the abstraction over page extraction capabilities.
type PDFEngine interface {
	ExtractChars(pageNum int) ([]TextChar, error)
	RenderPage(pageNum int, dpi float64) ([]byte, error)
	RenderPageImage(pageNum int, dpi float64) (image.Image, error)
	RawData() []byte
	PageCount() (int, error)
	Outlines() ([]Outline, error)
	Close() error
}

// Tokenizer supplies text tokenization matching rag_tokenizer.
type Tokenizer interface {
	Tag(token string) string
}

// SampleFunc samples up to n characters from a page's chars.
type SampleFunc func(chars []TextChar, n int) string

// TableBuilder wraps the cell detection and grouping that are specific to a
// TSR model.
type TableBuilder interface {
	Name() string
	DetectCells(ctx context.Context, cropped image.Image) ([]TSRCell, error)
	GroupCells(cells []TSRCell) [][]TSRCell
}

// Rectangular is implemented by any 2D axis-aligned rectangle able to report
// its bounds.
type Rectangular interface {
	Bounds() (x0, y0, x1, y1 float64)
}

// IsCJK reports whether r is a CJK character, meaning a rune in the Han,
// Hiragana, Katakana or Hangul script tables.
func IsCJK(r rune) bool {
	return unicode.In(r, unicode.Han, unicode.Hiragana, unicode.Katakana, unicode.Hangul)
}