Feat/oss parser no post (#16464)

### Summary Remove dead code
2026-07-05 10:58:34 +08:00 · 2026-07-02 09:46:33 +08:00
parent 133b1e15fd
commit 5bc4753d1e
51 changed files with 1381 additions and 2680 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -245,3 +245,5 @@ bin/*
 # Parser test fixtures and python tools
 internal/deepdoc/parser/pdf/testdata/
 internal/deepdoc/parser/pdf/tools-py/
+internal/deepdoc/parser/docx/testdata/
+internal/deepdoc/parser/docx/tool/
--- a/internal/deepdoc/parser/pdf/batch_smoke_test.go
+++ b/internal/deepdoc/parser/pdf/batch_smoke_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && manual

-package parser
+package pdf

 import (
 	"context"
@@ -51,12 +51,12 @@ func TestBatchResults(t *testing.T) {
 	}
 	pdfs := all[:min(count, len(all))]

-	ddClient, err := inf.NewInferenceClient(os.Getenv("DEEPDOC_URL"))
+	ddClient, err := inf.NewClient(os.Getenv("DEEPDOC_URL"))
 	if err != nil {
 		t.Fatal(err)
 	}
 	if !ddClient.Health() {
-		t.Fatalf("DeepDoc service not available at %s (DLA+TSR required)", ddClient.baseURL)
+		t.Fatalf("DeepDoc service not available at %s (DLA+TSR required)", ddClient.BaseURL())
 	}
 	deepDoc := pdf.DocAnalyzer(ddClient)

@@ -238,9 +238,9 @@ func parseOne(pdfDir, name string, deepDoc pdf.DocAnalyzer, skipOCR bool) (*pars

 	cfg := pdf.DefaultParserConfig()
 	cfg.SkipOCR = skipOCR
-	p := NewParser(cfg, deepDoc)
+	p := NewParser(cfg)
 	t0 := time.Now()
-	parsed, err := p.Parse(context.Background(), eng)
+	parsed, err := p.ParseRaw(context.Background(), eng, deepDoc)
 	elapsed := time.Since(t0).Seconds()
 	if err != nil {
 		return nil, fmt.Errorf("parse: %w", err)
--- a/internal/deepdoc/parser/pdf/compare_test.go
+++ b/internal/deepdoc/parser/pdf/compare_test.go
@@ -1,6 +1,6 @@
 //go:build manual

-package parser
+package pdf

 import (
 	"log/slog"
@@ -8,7 +8,7 @@ import (
 	"path/filepath"
 	"testing"

-	"ragflow/internal/deepdoc/parser/pdf/tools"
+	"ragflow/internal/deepdoc/parser/pdf/tool"
 )

 // TestBatchCompareWithPython compares Go output against Python reference
@@ -37,29 +37,29 @@ func TestBatchCompareWithPython(t *testing.T) {
 	pyTextDir := filepath.Join("testdata", "output", "py", pyVariant, "text")

 	// Read Go text files' #@meta (no aggregate JSON dependency).
-	goResults, err := tools.ReadGoTextMeta(goTextDir)
+	goResults, err := tool.ReadGoTextMeta(goTextDir)
 	if err != nil || len(goResults) == 0 {
 		t.Fatalf("No Go text files in %s: %v", goTextDir, err)
 	}

 	// Read Python text files' #@meta
-	pyResults, err := tools.ReadPythonTextMeta(pyTextDir)
+	pyResults, err := tool.ReadPythonTextMeta(pyTextDir)
 	if err != nil || len(pyResults) == 0 {
 		t.Fatalf("No Python text files in %s: %v", pyTextDir, err)
 	}

 	t.Logf("Comparing %d Go × %d Python", len(goResults), len(pyResults))
-	tools.CompareWithPython(t, goResults, pyResults, goTextDir, pyTextDir)
+	tool.CompareWithPython(t, goResults, pyResults, goTextDir, pyTextDir)

 	// Compare tables.
 	goTablesDir := filepath.Join("testdata", "output", "go", goVariant, "tables")
 	pyTablesDir2 := filepath.Join("testdata", "output", "py", pyVariant, "tables")
-	tools.CompareTablesWithPython(t, goTablesDir, pyTablesDir2)
+	tool.CompareTablesWithPython(t, goTablesDir, pyTablesDir2)
 	// Compare DLA + TSR raw intermediates.
 	goDLADir := filepath.Join("testdata", "output", "go", goVariant, "dla")
 	pyDLADir := filepath.Join("testdata", "output", "py", pyVariant, "dla")
-	tools.CompareDLAWithPython(t, goDLADir, pyDLADir)
+	tool.CompareDLAWithPython(t, goDLADir, pyDLADir)
 	goTSRRawDir := filepath.Join("testdata", "output", "go", goVariant, "tsr_raw")
 	pyTSRRawDir := filepath.Join("testdata", "output", "py", pyVariant, "tsr_raw")
-	tools.CompareTSRRawWithPython(t, goTSRRawDir, pyTSRRawDir)
+	tool.CompareTSRRawWithPython(t, goTSRRawDir, pyTSRRawDir)
 }
--- a/internal/deepdoc/parser/pdf/crop_integration_test.go
+++ b/internal/deepdoc/parser/pdf/crop_integration_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && manual

-package parser
+package pdf

 import (
 	"bytes"
@@ -27,8 +27,8 @@ func TestParse_CropSectionImages(t *testing.T) {
 	defer eng.Close()

 	cfg := pdf.DefaultParserConfig()
-	p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
-	result, err := p.Parse(context.Background(), eng)
+	p := NewParser(cfg)
+	result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true})
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -79,8 +79,8 @@ func TestCrop_Regression_SnapshotPDFs(t *testing.T) {
 			}
 			defer eng.Close()

-			p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
-			result, err := p.Parse(context.Background(), eng)
+			p := NewParser(pdf.DefaultParserConfig())
+			result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true})
 			if err != nil {
 				t.Fatalf("Parse: %v", err)
 			}
--- a/internal/deepdoc/parser/pdf/dla_real_world_test.go
+++ b/internal/deepdoc/parser/pdf/dla_real_world_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && integration

-package parser
+package pdf

 import (
 	"context"
@@ -46,7 +46,7 @@ func TestDLARealWorldCompare(t *testing.T) {
 		for _, pg := range pdf.pages {
 			testName := pdf.name + "/page" + string(rune('0'+pg))
 			t.Run(testName, func(t *testing.T) {
-				pageImg, err := renderPageToImage(eng, pg)
+				pageImg, err := RenderPageToImage(eng, pg)
 				if err != nil {
 					t.Fatalf("render page %d: %v", pg, err)
 				}
--- a/internal/deepdoc/parser/pdf/dla_tsr_compare_test.go
+++ b/internal/deepdoc/parser/pdf/dla_tsr_compare_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && integration

-package parser
+package pdf

 import (
 	"context"
@@ -28,7 +28,7 @@ func TestDLATSRResponseCompare(t *testing.T) {
 	eng := mustOpenEngine(t, "06_table_content.pdf")
 	defer eng.Close()

-	pageImg, err := renderPageToImage(eng, 0)
+	pageImg, err := RenderPageToImage(eng, 0)
 	if err != nil {
 		t.Fatalf("render: %v", err)
 	}
--- a/internal/deepdoc/parser/pdf/test_helpers_cgo_test.go
+++ b/internal/deepdoc/parser/pdf/test_helpers_cgo_test.go
@@ -1,6 +1,6 @@
 //go:build cgo

-package parser
+package pdf

 import (
 	"os"
@@ -11,20 +11,14 @@ import (
 	pdf "ragflow/internal/deepdoc/parser/pdf/type"
 )

-// ── Shared CGO test helpers ──────────────────────────────────────────────────
-// These helpers were previously duplicated across multiple test files with
-// different build tags (integration, manual). Consolidating them into one file
-// with the //go:build cgo tag makes them available to all cgo-tagged tests.
-
-// mustConnectInferenceClient returns a InferenceClient pointed at the OSS service;
-// skips the test if the service reports a non-OSS model type.
-func mustConnectInferenceClient(t *testing.T) *inf.InferenceClient {
+// mustConnectInferenceClient returns a InferenceClient for the OSS DeepDoc service.
+func mustConnectInferenceClient(t *testing.T) *inf.Client {
 	t.Helper()
 	url := os.Getenv("OSSDEEPDOC_URL")
 	if url == "" {
 		url = "http://localhost:9390"
 	}
-	client, err := inf.NewInferenceClient(url)
+	client, err := inf.NewClient(url)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -48,3 +42,12 @@ func mustOpenEngine(t *testing.T, name string) pdf.PDFEngine {
 	}
 	return eng
 }
+
+func mustReadPDF(t *testing.T, name string) []byte {
+	t.Helper()
+	data, err := os.ReadFile(filepath.Join("testdata", "pdfs", name))
+	if err != nil {
+		t.Fatalf("read fixture %s: %v", name, err)
+	}
+	return data
+}
--- a/internal/deepdoc/parser/pdf/inference/client.go
+++ b/internal/deepdoc/parser/pdf/inference/client.go
@@ -21,8 +21,8 @@ import (
 	"github.com/cenkalti/backoff/v5"
 )

-// InferenceClient wraps the DeepDoc HTTP API.
-type InferenceClient struct {
+// Client wraps the DeepDoc HTTP API.
+type Client struct {
 	baseURL    string
 	httpClient *http.Client

@@ -33,24 +33,27 @@ type InferenceClient struct {
 }

 // BaseURL returns the configured DeepDoc service URL.
-func (c *InferenceClient) BaseURL() string { return c.baseURL }
+func (c *Client) BaseURL() string { return c.baseURL }

-// NewInferenceClient creates a client.  baseURL must be provided by the caller
+// NewClient creates a client.  baseURL must be provided by the caller
 // (e.g. from the DEEPDOC_URL environment variable).  Returns an error if empty.
-func NewInferenceClient(baseURL string) (*InferenceClient, error) {
+func NewClient(baseURL string) (*Client, error) {
 	if baseURL == "" {
 		return nil, fmt.Errorf("deepdoc client: baseURL is required (set DEEPDOC_URL)")
 	}
-	return &InferenceClient{
+	return &Client{
 		baseURL: baseURL,
 		httpClient: &http.Client{
 			Timeout: 120 * time.Second,
 		},
+		DLALabels: DefaultDLALabels(),
+		TSRLabels: DefaultTSRLabels(),
 	}, nil
 }

-// Default DLA/TSR label tables used as fallback when no model-specific
-// labels are injected by a TableBuilder constructor.
+// DefaultDLALabels returns the 10-class DLA taxonomy matching Python's
+// deepdoc/vision/dla_cli.py:10-21.  Duplicates at indices 4, 7, 9 are
+// kept verbatim for backward compatibility with existing inference servers.
 func DefaultDLALabels() []string {
 	return []string{
 		pdf.LayoutTypeTitle, pdf.LayoutTypeText, pdf.LayoutTypeReference,
@@ -59,6 +62,9 @@ func DefaultDLALabels() []string {
 		pdf.LayoutTypeEquation, pdf.DLALabelFigureCaption,
 	}
 }
+
+// DefaultTSRLabels returns the 6-class TSR taxonomy matching Python's
+// deepdoc/server/adapters/tsr_adapter.py:21-26.
 func DefaultTSRLabels() []string {
 	return []string{
 		"table", "table column", "table row",
@@ -72,7 +78,7 @@ type bboxesResponse struct {
 }

 // DLA analyzes a full page image and returns labeled regions.
-func (c *InferenceClient) DLA(ctx context.Context, pageImage image.Image) ([]pdf.DLARegion, error) {
+func (c *Client) DLA(ctx context.Context, pageImage image.Image) ([]pdf.DLARegion, error) {
 	data, err := util.EncodeJPEG(pageImage)
 	if err != nil {
 		return nil, fmt.Errorf("dla: encode: %w", err)
@@ -87,9 +93,6 @@ func (c *InferenceClient) DLA(ctx context.Context, pageImage image.Image) ([]pdf
 			continue
 		}
 		labels := c.DLALabels
-		if labels == nil {
-			labels = DefaultDLALabels()
-		}
 		label := ""
 		if clsID := int(b[5]); clsID >= 0 && clsID < len(labels) {
 			label = labels[clsID]
@@ -104,7 +107,7 @@ func (c *InferenceClient) DLA(ctx context.Context, pageImage image.Image) ([]pdf
 }

 // TSR recognises table structure from a cropped image.
-func (c *InferenceClient) TSR(ctx context.Context, cropped image.Image) ([]pdf.TSRCell, error) {
+func (c *Client) TSR(ctx context.Context, cropped image.Image) ([]pdf.TSRCell, error) {
 	data, err := util.EncodeJPEG(cropped)
 	if err != nil {
 		return nil, fmt.Errorf("tsr: encode: %w", err)
@@ -119,9 +122,6 @@ func (c *InferenceClient) TSR(ctx context.Context, cropped image.Image) ([]pdf.T
 			continue
 		}
 		tlabels := c.TSRLabels
-		if tlabels == nil {
-			tlabels = DefaultTSRLabels()
-		}
 		label := ""
 		if len(b) >= 6 {
 			if cls := int(b[5]); cls >= 0 && cls < len(tlabels) {
@@ -152,7 +152,7 @@ type ocrRecognizeResponse struct {

 // OCRDetect detects text regions (bounding boxes) in an image.
 // DeepDoc /predict/ocr with operator=det returns quad boxes: [[[x0,y0],[x1,y1],[x2,y2],[x3,y3]], ...]
-func (c *InferenceClient) OCRDetect(ctx context.Context, cropped image.Image) ([]pdf.OCRBox, error) {
+func (c *Client) OCRDetect(ctx context.Context, cropped image.Image) ([]pdf.OCRBox, error) {
 	data, err := util.EncodeJPEG(cropped)
 	if err != nil {
 		return nil, fmt.Errorf("ocr detect: encode: %w", err)
@@ -197,7 +197,7 @@ func (c *InferenceClient) OCRDetect(ctx context.Context, cropped image.Image) ([

 // OCRRecognize recognizes text in a cropped image region.
 // DeepDoc /predict/ocr with operator=rec returns [[["text", confidence], ...]]
-func (c *InferenceClient) OCRRecognize(ctx context.Context, cropped image.Image) ([]pdf.OCRText, error) {
+func (c *Client) OCRRecognize(ctx context.Context, cropped image.Image) ([]pdf.OCRText, error) {
 	data, err := util.EncodeJPEG(cropped)
 	if err != nil {
 		return nil, fmt.Errorf("ocr rec: encode: %w", err)
@@ -224,7 +224,7 @@ func (c *InferenceClient) OCRRecognize(ctx context.Context, cropped image.Image)
 // OCRRecognizeBatch recognizes text in multiple cropped image regions.
 // Returns a slice of results and a parallel slice of errors (nil on success).
 // A nil cropped image in the input produces nil results and a non-nil error.
-func (c *InferenceClient) OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]pdf.OCRText, []error) {
+func (c *Client) OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]pdf.OCRText, []error) {
 	results := make([][]pdf.OCRText, len(cropped))
 	errs := make([]error, len(cropped))

@@ -255,7 +255,7 @@ func (c *InferenceClient) OCRRecognizeBatch(ctx context.Context, cropped []image
 }

 // Health checks whether the DeepDoc service is reachable.
-func (c *InferenceClient) Health() bool {
+func (c *Client) Health() bool {
 	resp, err := c.httpClient.Get(c.baseURL + "/health")
 	if err != nil {
 		return false
@@ -264,7 +264,7 @@ func (c *InferenceClient) Health() bool {
 	return resp.StatusCode == 200
 }

-func (c *InferenceClient) post(ctx context.Context, endpoint string, imgData []byte, filename string, result interface{}, extraFields ...string) error {
+func (c *Client) post(ctx context.Context, endpoint string, imgData []byte, filename string, result interface{}, extraFields ...string) error {
 	// Build multipart body once — the image data is idempotent.
 	var body bytes.Buffer
 	w := multipart.NewWriter(&body)
--- a/internal/deepdoc/parser/pdf/inference/client_test.go
+++ b/internal/deepdoc/parser/pdf/inference/client_test.go
@@ -11,11 +11,11 @@ import (
 	"testing"
 )

-// mustNewDeepDocClient wraps NewInferenceClient for test convenience.
+// mustNewDeepDocClient wraps NewClient for test convenience.
 // Fails the test if the URL is empty.
-func mustNewDeepDocClient(t *testing.T, baseURL string) *InferenceClient {
+func mustNewDeepDocClient(t *testing.T, baseURL string) *Client {
 	t.Helper()
-	client, err := NewInferenceClient(baseURL)
+	client, err := NewClient(baseURL)
 	if err != nil {
 		t.Fatalf("NewDeepDocClient(%q): %v", baseURL, err)
 	}
--- a/internal/deepdoc/parser/pdf/inference_client_integration_test.go
+++ b/internal/deepdoc/parser/pdf/inference_client_integration_test.go
@@ -1,13 +1,12 @@
 //go:build cgo && integration

-package parser
+package pdf

 import (
 	"context"
 	"strings"
 	"testing"

-	tbl "ragflow/internal/deepdoc/parser/pdf/table"
 	pdf "ragflow/internal/deepdoc/parser/pdf/type"
 )

@@ -15,13 +14,11 @@ import (
 // through the OSS TableBuilder produces tables with the expected row/column structure.
 func TestIntegration_DeepDoc_TableStructure(t *testing.T) {
 	client := mustConnectInferenceClient(t)
-	eng := mustOpenEngine(t, "06_table_content.pdf")
-	defer eng.Close()
+	data := mustReadPDF(t, "06_table_content.pdf")

 	cfg := pdf.DefaultParserConfig()
-	cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client)
-	p := NewParser(cfg, client)
-	result, err := p.Parse(context.Background(), eng)
+	p := NewParser(cfg)
+	result, err := p.Parse(context.Background(), data, client)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -29,7 +26,7 @@ func TestIntegration_DeepDoc_TableStructure(t *testing.T) {
 		t.Skip("DLA did not detect any tables in fixture")
 	}

-	t.Logf("OssDeepDoc produced %d tables", len(result.Tables))
+	t.Logf("DeepDoc produced %d tables", len(result.Tables))
 	for i, tbl := range result.Tables {
 		t.Logf("table[%d]: %d rows", i, len(tbl.Rows))
 		for ri, row := range tbl.Rows {
@@ -51,13 +48,11 @@ func TestIntegration_DeepDoc_TableStructure(t *testing.T) {
 // rows with the expected grid structure.
 func TestIntegration_DeepDoc_TableRows(t *testing.T) {
 	client := mustConnectInferenceClient(t)
-	eng := mustOpenEngine(t, "06_table_content.pdf")
-	defer eng.Close()
+	data := mustReadPDF(t, "06_table_content.pdf")

 	cfg := pdf.DefaultParserConfig()
-	cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client)
-	p := NewParser(cfg, client)
-	result, err := p.Parse(context.Background(), eng)
+	p := NewParser(cfg)
+	result, err := p.Parse(context.Background(), data, client)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -92,13 +87,11 @@ func TestIntegration_DeepDoc_Idempotency(t *testing.T) {
 	client := mustConnectInferenceClient(t)

 	parseOnce := func() *pdf.ParseResult {
-		eng := mustOpenEngine(t, "06_table_content.pdf")
-		defer eng.Close()
+		data := mustReadPDF(t, "06_table_content.pdf")

 		cfg := pdf.DefaultParserConfig()
-		cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client)
-		p := NewParser(cfg, client)
-		result, err := p.Parse(context.Background(), eng)
+		p := NewParser(cfg)
+		result, err := p.Parse(context.Background(), data, client)
 		if err != nil {
 			t.Fatalf("Parse: %v", err)
 		}
@@ -124,13 +117,11 @@ func TestIntegration_DeepDoc_Idempotency(t *testing.T) {
 // does not crash.
 func TestIntegration_DeepDoc_EmptyPage(t *testing.T) {
 	client := mustConnectInferenceClient(t)
-	eng := mustOpenEngine(t, "01_english_simple.pdf")
-	defer eng.Close()
+	data := mustReadPDF(t, "01_english_simple.pdf")

 	cfg := pdf.DefaultParserConfig()
-	cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client)
-	p := NewParser(cfg, client)
-	_, err := p.Parse(context.Background(), eng)
+	p := NewParser(cfg)
+	_, err := p.Parse(context.Background(), data, client)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
--- a/internal/deepdoc/parser/pdf/mock_doc_analyzer_test.go
+++ b/internal/deepdoc/parser/pdf/mock_doc_analyzer_test.go
@@ -1,4 +1,4 @@
-package parser
+package pdf

 import (
 	"context"
--- a/internal/deepdoc/parser/pdf/mock_engine.go
+++ b/internal/deepdoc/parser/pdf/mock_engine.go
@@ -0,0 +1,41 @@
+package pdf
+
+import (
+	"image"
+
+	pdf "ragflow/internal/deepdoc/parser/pdf/type"
+)
+
+// MockEngine is a minimal pdf.PDFEngine stub for unit/integration tests.
+type MockEngine struct {
+	Chars    map[int][]pdf.TextChar
+	NumPages int
+	RenderW  int
+	RenderH  int
+}
+
+func (m *MockEngine) ExtractChars(pg int) ([]pdf.TextChar, error) {
+	return m.Chars[pg], nil
+}
+func (m *MockEngine) RenderPage(pg int, dpi float64) ([]byte, error) {
+	return nil, ErrNoPDFData
+}
+func (m *MockEngine) RenderPageImage(pg int, dpi float64) (image.Image, error) {
+	w, h := m.RenderW, m.RenderH
+	if w <= 0 {
+		w = 100
+	}
+	if h <= 0 {
+		h = 100
+	}
+	return image.NewRGBA(image.Rect(0, 0, w, h)), nil
+}
+func (m *MockEngine) PageCount() (int, error) {
+	if m.NumPages <= 0 {
+		return 1, nil
+	}
+	return m.NumPages, nil
+}
+func (m *MockEngine) RawData() []byte                  { return nil }
+func (m *MockEngine) Close() error                     { return nil }
+func (m *MockEngine) Outlines() ([]pdf.Outline, error) { return nil, nil }
--- a/internal/deepdoc/parser/pdf/ocr_merge_test.go
+++ b/internal/deepdoc/parser/pdf/ocr_merge_test.go
@@ -1,11 +1,13 @@
 //go:build cgo && manual

-package parser
+package pdf

 import (
 	"context"
 	"image/png"
 	"os"
+	inf "ragflow/internal/deepdoc/parser/pdf/inference"
+	util "ragflow/internal/deepdoc/parser/pdf/util"
 	"strings"
 	"testing"
 )
@@ -19,7 +21,7 @@ func TestOCR_mergeChars_RealScanned(t *testing.T) {
 	if url == "" {
 		t.Skip("DEEPDOC_URL not set")
 	}
-	dd, err := inf.NewInferenceClient(url)
+	dd, err := inf.NewClient(url)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -41,7 +43,7 @@ func TestOCR_mergeChars_RealScanned(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
-	t.Logf("pdf_oxide chars: %d", len(chars))
+	t.Logf("pdf_oxide Chars: %d", len(chars))

 	var sample strings.Builder
 	for i, c := range chars {
--- a/internal/deepdoc/parser/pdf/ocr_recognize_batch_test.go
+++ b/internal/deepdoc/parser/pdf/ocr_recognize_batch_test.go
@@ -1,6 +1,6 @@
 //go:build cgo

-package parser
+package pdf

 import (
 	"context"
--- a/internal/deepdoc/parser/pdf/outline_extraction_test.go
+++ b/internal/deepdoc/parser/pdf/outline_extraction_test.go
@@ -1,4 +1,4 @@
-package parser
+package pdf

 import (
 	"context"
@@ -10,10 +10,10 @@ import (

 // ── outline-tracking mock engines ──────────────────────────────────────────

-// outlineTrackingEngine wraps mockEngine and records whether Outlines()
+// outlineTrackingEngine wraps MockEngine and records whether Outlines()
 // was called.
 type outlineTrackingEngine struct {
-	*mockEngine
+	*MockEngine
 	outlines       []pdf.Outline
 	outlinesCalled bool
 }
@@ -25,7 +25,7 @@ func (e *outlineTrackingEngine) Outlines() ([]pdf.Outline, error) {

 // outlineErrorEngine returns an error from Outlines().
 type outlineErrorEngine struct {
-	*mockEngine
+	*MockEngine
 }

 func (e *outlineErrorEngine) Outlines() ([]pdf.Outline, error) {
@@ -46,13 +46,13 @@ func TestParse_ExtractsOutlinesFromEngine(t *testing.T) {
 		{Title: "Section 1.1", Level: 1, PageNumber: 2},
 	}
 	eng := &outlineTrackingEngine{
-		mockEngine: &mockEngine{pageCount: 3},
+		MockEngine: &MockEngine{NumPages: 3},
 		outlines:   expectedOutlines,
 	}
 	mockDLA := &MockDocAnalyzer{Healthy: true}
-	p := NewParser(pdf.DefaultParserConfig(), mockDLA)
+	p := NewParser(pdf.DefaultParserConfig())

-	result, err := p.Parse(context.Background(), eng)
+	result, err := p.ParseRaw(context.Background(), eng, mockDLA)
 	if err != nil {
 		t.Fatalf("Parse failed: %v", err)
 	}
@@ -79,18 +79,18 @@ func TestParse_ExtractsOutlinesFromEngine(t *testing.T) {
 // and produces sections (outlines are best-effort).
 func TestParse_OutlinesErrorDoesNotBlockParsing(t *testing.T) {
 	eng := &outlineErrorEngine{
-		mockEngine: &mockEngine{
-			pageCount: 2,
-			chars: map[int][]pdf.TextChar{
+		MockEngine: &MockEngine{
+			NumPages: 2,
+			Chars: map[int][]pdf.TextChar{
 				0: {{Text: "Hello world", X0: 100, X1: 200, Top: 100, Bottom: 120}},
 				1: {{Text: "Page two", X0: 100, X1: 200, Top: 100, Bottom: 120}},
 			},
 		},
 	}
 	mockDLA := &MockDocAnalyzer{Healthy: true}
-	p := NewParser(pdf.DefaultParserConfig(), mockDLA)
+	p := NewParser(pdf.DefaultParserConfig())

-	result, err := p.Parse(context.Background(), eng)
+	result, err := p.ParseRaw(context.Background(), eng, mockDLA)
 	if err != nil {
 		t.Fatalf("Parse should not fail when Outlines() errors: %v", err)
 	}
--- a/internal/deepdoc/parser/pdf/page_batch_test.go
+++ b/internal/deepdoc/parser/pdf/page_batch_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && manual

-package parser
+package pdf

 import (
 	"context"
@@ -31,8 +31,8 @@ func TestParse_BatchEquivalence(t *testing.T) {
 		defer eng.Close()
 		cfg := pdf.DefaultParserConfig()
 		cfg.BatchSize = batchSize
-		p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
-		result, err := p.Parse(context.Background(), eng)
+		p := NewParser(cfg)
+		result, err := p.ParseRaw(context.Background(), eng, mockDLA)
 		if err != nil {
 			t.Fatal(err)
 		}
--- a/internal/deepdoc/parser/pdf/parse_cgo.go
+++ b/internal/deepdoc/parser/pdf/parse_cgo.go
@@ -0,0 +1,22 @@
+//go:build cgo
+
+package pdf
+
+import (
+	"context"
+	"fmt"
+
+	pdf "ragflow/internal/deepdoc/parser/pdf/type"
+)
+
+// Parse runs the full PDF extraction pipeline from raw bytes.
+// Creates and manages the PDF engine lifecycle internally.
+func (p *Parser) Parse(ctx context.Context, data []byte, docAnalyzer pdf.DocAnalyzer) (*pdf.ParseResult, error) {
+	engine, err := NewEngine(data)
+	if err != nil {
+		return nil, fmt.Errorf("pdfoxide.NewEngine: %w", err)
+	}
+	defer engine.Close()
+
+	return p.ParseRaw(ctx, engine, docAnalyzer)
+}
--- a/internal/deepdoc/parser/pdf/parser.go
+++ b/internal/deepdoc/parser/pdf/parser.go
@@ -1,4 +1,4 @@
-package parser
+package pdf

 import (
 	"context"
@@ -8,52 +8,36 @@ import (
 	"log/slog"
 	"sync"

-	inf "ragflow/internal/deepdoc/parser/pdf/inference"
 	lyt "ragflow/internal/deepdoc/parser/pdf/layout"
 	tbl "ragflow/internal/deepdoc/parser/pdf/table"
 	pdf "ragflow/internal/deepdoc/parser/pdf/type"
 	util "ragflow/internal/deepdoc/parser/pdf/util"
 )

-// Parser is the main PDF text/layout extraction pipeline.
+// Parser is the core PDF text/layout extraction pipeline.
 // It corresponds to RAGFlowPdfParser in pdf_parser.py.
-// Parser is stateless after construction — safe to reuse across documents.
+// Stateless after construction — safe to reuse across documents.
 type Parser struct {
 	Config pdf.ParserConfig
-
-	// DeepDoc is the required document layout / OCR / table recognition
-	// service. Set at construction time by NewParser.
-	DeepDoc pdf.DocAnalyzer
-
-	// SampleChars samples up to n chars from a page for English detection.
-	// Defaults to random sampling (matching Python's random.choices).
-	// Inject a deterministic sampler for reproducible tests.
-	SampleChars pdf.SampleFunc
-
-	// tableBuilder is the TSR model adapter. Set at construction time
-	//
-	// different implementation via Config.TableBuilder.
-	tableBuilder pdf.TableBuilder
 }

-// NewParser creates a new Parser with the required DeepDoc service.
-func NewParser(cfg pdf.ParserConfig, doc pdf.DocAnalyzer) *Parser {
-	tb := cfg.TableBuilder
-	if tb == nil {
-		tb = NewTableBuilderFor(doc)
-	}
-	return &Parser{
-		Config:       cfg,
-		DeepDoc:      doc,
-		tableBuilder: tb,
-	}
+// pageResult holds per-page output from extractPages.
+type pageResult struct {
+	pg       int
+	ocrBoxes []pdf.TextBox
+	chars    []pdf.TextChar
+	ocrUsed  bool
+	pageImg  image.Image
+	err      error
+}
+
+// New creates a new Parser with the given config.
+func NewParser(cfg pdf.ParserConfig) *Parser {
+	return &Parser{Config: cfg}
 }

 // ── TableBuilder factory ───────────────────────────────────────────────────

-// tableBuilderFactory holds a model-specific TableBuilder factory registered
-// by EE packages via RegisterTableBuilder. If nil, the default OSS
-// implementation is used.
 var tableBuilderFactory func(pdf.DocAnalyzer) pdf.TableBuilder

 // RegisterTableBuilder registers a TableBuilder factory for the PDF parser.
@@ -62,30 +46,20 @@ func RegisterTableBuilder(factory func(pdf.DocAnalyzer) pdf.TableBuilder) {
 	tableBuilderFactory = factory
 }

-// NewTableBuilderFor creates the right TableBuilder, chosen by the registry.
-// Checks the registry first for EE-registered implementations, falling back
-// to the default OSS DeepDocTableBuilder. Label taxonomies are injected
-// before construction.
 func NewTableBuilderFor(doc pdf.DocAnalyzer) pdf.TableBuilder {
 	if tableBuilderFactory != nil {
 		return tableBuilderFactory(doc)
 	}
-	if c, ok := doc.(*inf.InferenceClient); ok {
-		c.DLALabels = inf.DefaultDLALabels()
-		c.TSRLabels = inf.DefaultTSRLabels()
-	}
 	return tbl.NewDeepDocTableBuilder(doc)
 }

-// Parse runs the full PDF extraction pipeline: chars → boxes →
-// column assignment → text merge → vertical merge → sections.
-//
-// For documents larger than Config.BatchSize pages, processes in batches
-// to bound memory usage (matching Python's batch_size=50).
-//
-// Returns a pdf.ParseResult containing sections, tables, page images, figures,
-// and pipeline stage metrics. Parser itself remains stateless.
-func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseResult, error) {
+// ── Public API ─────────────────────────────────────────────────────────────
+
+// ParseRaw is the internal entry point: runs the core pipeline on an
+// already-opened engine. Exported for tests that inject mock engines.
+func (p *Parser) ParseRaw(ctx context.Context, engine pdf.PDFEngine, docAnalyzer pdf.DocAnalyzer) (*pdf.ParseResult, error) {
+	tb := NewTableBuilderFor(docAnalyzer)
+
 	// Normalize page range
 	pageCount, err := engine.PageCount()
 	if err != nil {
@@ -103,11 +77,10 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
 	totalPages := toPage - fromPage + 1
 	batchSize := p.Config.BatchSize
 	if batchSize <= 0 {
-		batchSize = 50 // default, matching Python's batch_size
+		batchSize = 50
 	}

-	// ── Prescan: lightweight char extraction for language/noise detection ──
-	// No rendering, no OCR — just raw chars for global decisions.
+	// ── Prescan ──
 	prescanChars := make(map[int][]pdf.TextChar)
 	prescanMedianH := make(map[int]float64)
 	prescanMedianW := make(map[int]float64)
@@ -115,26 +88,27 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
 		chars, extractErr := engine.ExtractChars(pg)
 		if extractErr != nil {
 			slog.Warn("prescan: ExtractChars failed", "page", pg, "err", extractErr)
-			chars = nil // skip broken pages (matching old behavior)
+			chars = nil
 		}
 		prescanChars[pg] = chars
 		prescanMedianH[pg] = util.MedianCharHeight(chars)
 		prescanMedianW[pg] = util.MedianCharWidth(chars)
 	}
-	isEnglish := util.DetectEnglish(prescanChars, totalPages, p.SampleChars)
+	isEnglish := util.DetectEnglish(prescanChars, totalPages, nil)
 	scanNoise := util.IsScanNoise(util.FullTextFromChars(prescanChars))

-	// ── Extract PDF outlines/bookmarks (best-effort, non-fatal) ──
+	// ── Outlines ──
 	outlines, outlineErr := engine.Outlines()
 	if outlineErr != nil {
 		slog.Warn("Failed to extract PDF outlines; continuing without them", "err", outlineErr)
 		outlines = nil
 	}

-	// ── Small document: process all at once (no batching overhead) ──
+	// ── Small document ──
 	if totalPages <= batchSize {
 		result, err := p.processPages(ctx, engine, fromPage, toPage,
-			prescanChars, prescanMedianH, prescanMedianW, isEnglish, scanNoise)
+			prescanChars, prescanMedianH, prescanMedianW, isEnglish, scanNoise,
+			docAnalyzer, tb)
 		if err != nil {
 			return nil, err
 		}
@@ -142,7 +116,7 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
 		return result, nil
 	}

-	// ── Large document: process in batches to bound memory ──
+	// ── Large document: batched ──
 	slog.Info("batched processing", "pages", totalPages, "batchSize", batchSize)
 	result := &pdf.ParseResult{PageImages: make(map[int]image.Image)}
 	for start := fromPage; start <= toPage; start += batchSize {
@@ -151,7 +125,6 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
 		}
 		end := min(start+batchSize-1, toPage)

-		// Slice prescan data for this batch.
 		batchChars := make(map[int][]pdf.TextChar, end-start+1)
 		batchMH := make(map[int]float64, end-start+1)
 		batchMW := make(map[int]float64, end-start+1)
@@ -162,15 +135,14 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
 		}

 		batch, err := p.processPages(ctx, engine, start, end,
-			batchChars, batchMH, batchMW, isEnglish, scanNoise)
+			batchChars, batchMH, batchMW, isEnglish, scanNoise,
+			docAnalyzer, tb)
 		if err != nil {
 			return nil, err
 		}

-		// Merge batch results.
 		result.Sections = append(result.Sections, batch.Sections...)
 		result.Tables = append(result.Tables, batch.Tables...)
-		// Figures() is computed on demand from Sections.
 		for pg, img := range batch.PageImages {
 			result.PageImages[pg] = img
 		}
@@ -184,33 +156,22 @@ func (p *Parser) Parse(ctx context.Context, engine pdf.PDFEngine) (*pdf.ParseRes
 	return result, nil
 }

-// extractPages runs per-page OCR (detect + recognize) for the given page
-// range, returning text boxes, char data, whether any page used OCR, and
-// any errors encountered.  Partial results are returned even when some
-// pages fail — callers should inspect the error for diagnostics but may
-// still use the returned boxes and chars.
+// ── Internal pipeline steps ────────────────────────────────────────────────
+
 func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
 	fromPage, toPage int,
 	prescanChars map[int][]pdf.TextChar,
 	medianHeights, medianWidths map[int]float64,
 	pageImages map[int]image.Image,
+	docAnalyzer pdf.DocAnalyzer,
 ) ([]pdf.TextBox, map[int][]pdf.TextChar, bool, error) {
 	var boxes []pdf.TextBox
 	pageChars := make(map[int][]pdf.TextChar)
 	ocrUsedAny := false

-	type pr struct {
-		pg       int
-		ocrBoxes []pdf.TextBox
-		chars    []pdf.TextChar
-		ocrUsed  bool
-		pageImg  image.Image
-		err      error
-	}
 	pageCount := toPage - fromPage + 1
-	results := make([]pr, pageCount)
+	results := make([]pageResult, pageCount)

-	// Semaphore cap: 0 → sequential; >0 → bounded parallelism.
 	cap := p.Config.MaxOCRConcurrency
 	if cap <= 0 {
 		cap = 1
@@ -222,16 +183,15 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
 		pg := fromPage + i
 		chars := prescanChars[pg]

-		// Fast path: pages with embedded chars → sequential inline (no HTTP OCR).
 		if len(chars) > 0 && !util.IsGarbledPage(chars) {
-			pageImg, renderErr := renderPageToImage(engine, pg)
+			pageImg, renderErr := RenderPageToImage(engine, pg)
 			if renderErr == nil && pageImg != nil {
 				pageImages[pg] = pageImg
 			}
 			var ocrBoxes []pdf.TextBox
 			ocrUsed := false
 			if !p.Config.SkipOCR && renderErr == nil && pageImg != nil {
-				ocrBoxes = ocrMergeChars(ctx, pageImg, chars, p.DeepDoc, pg)
+				ocrBoxes = ocrMergeChars(ctx, pageImg, chars, docAnalyzer, pg)
 				if ocrBoxes == nil {
 					ocrBoxes = lyt.CharsToBoxes(chars, pg, p.Config.SortByTop)
 				} else {
@@ -241,30 +201,28 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
 			} else {
 				ocrBoxes = lyt.CharsToBoxes(chars, pg, p.Config.SortByTop)
 			}
-			results[i] = pr{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed}
+			results[i] = pageResult{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed}
 			continue
 		}

-		// OCR path: render + detect + recognize (potentially parallel).
 		wg.Add(1)
 		go func(i, pg int, chars []pdf.TextChar) {
 			defer wg.Done()
 			select {
 			case <-ctx.Done():
-				results[i] = pr{pg: pg, err: ctx.Err()}
+				results[i] = pageResult{pg: pg, err: ctx.Err()}
 				return
 			case sem <- struct{}{}:
 			}
 			defer func() { <-sem }()

-			pageImg, err := renderPageToImage(engine, pg)
+			pageImg, err := RenderPageToImage(engine, pg)
 			if err != nil {
-				results[i] = pr{pg: pg, err: err}
+				results[i] = pageResult{pg: pg, err: err}
 				return
 			}
-			// Check if context was cancelled during render.
 			if err := ctx.Err(); err != nil {
-				results[i] = pr{pg: pg, err: err}
+				results[i] = pageResult{pg: pg, err: err}
 				return
 			}

@@ -275,7 +233,7 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
 				if len(chars) > 0 {
 					label = "garbled page"
 				}
-				ocrBoxes = ocrDetectAndRecognize(ctx, pageImg, p.DeepDoc, pg, label)
+				ocrBoxes = ocrDetectAndRecognize(ctx, pageImg, docAnalyzer, pg, label)
 				if ocrBoxes != nil {
 					for j := range ocrBoxes {
 						for _, r := range ocrBoxes[j].Text {
@@ -286,9 +244,8 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
 					ocrUsed = true
 				}
 			}
-			// Merged OCR path for pages with both embedded and OCR chars.
 			if !ocrUsed && len(chars) > 0 && !p.Config.SkipOCR {
-				ocrBoxes = ocrMergeChars(ctx, pageImg, chars, p.DeepDoc, pg)
+				ocrBoxes = ocrMergeChars(ctx, pageImg, chars, docAnalyzer, pg)
 				if ocrBoxes != nil {
 					ocrUsed = true
 				}
@@ -298,15 +255,252 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
 					ocrBoxes = lyt.CharsToBoxes(chars, pg, p.Config.SortByTop)
 				}
 			}
-			results[i] = pr{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed, pageImg: pageImg}
+			results[i] = pageResult{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed, pageImg: pageImg}
 		}(i, pg, chars)
 	}
 	wg.Wait()
+	return mergePageResults(results, boxes, pageImages, pageChars, ocrUsedAny, medianHeights, medianWidths)
+}

-	// Merge results in page order.
+func (p *Parser) retryScanNoise(ctx context.Context, engine pdf.PDFEngine,
+	fromPage, toPage int,
+	pageImages map[int]image.Image,
+	pageChars map[int][]pdf.TextChar,
+	medianHeights, medianWidths map[int]float64,
+	ocrUsedAny bool,
+	docAnalyzer pdf.DocAnalyzer,
+) ([]pdf.TextBox, map[int][]pdf.TextChar, bool) {
+	slog.Warn("scan noise: OCR retry", "from", fromPage, "to", toPage)
+	var boxes []pdf.TextBox
+	for pg := fromPage; pg <= toPage; pg++ {
+		img := pageImages[pg]
+		if img == nil {
+			var err error
+			img, err = RenderPageToImage(engine, pg)
+			if err != nil {
+				slog.Warn("scan noise: page render failed", "page", pg, "err", err)
+				continue
+			}
+			pageImages[pg] = img
+		}
+		ocrBoxes := ocrDetectAndRecognize(ctx, img, docAnalyzer, pg, "scan page")
+		if ocrBoxes == nil {
+			slog.Warn("scan noise: page OCR empty", "page", pg)
+			continue
+		}
+		boxes = append(boxes, ocrBoxes...)
+		var chars []pdf.TextChar
+		for _, b := range ocrBoxes {
+			for _, r := range b.Text {
+				chars = append(chars, pdf.TextChar{Text: string(r), Top: b.Top, Bottom: b.Bottom, PageNumber: pg})
+				break
+			}
+		}
+		pageChars[pg] = chars
+		medianHeights[pg] = util.MedianCharHeight(chars)
+		medianWidths[pg] = util.MedianCharWidth(chars)
+	}
+	slog.Debug("scan noise OCR retry complete", "pages", toPage-fromPage+1, "boxes", len(boxes))
+	return boxes, pageChars, true
+}
+
+func (p *Parser) retryZoom(ctx context.Context, engine pdf.PDFEngine,
+	fromPage, toPage int,
+	pageImages map[int]image.Image,
+	boxes []pdf.TextBox, ocrUsedAny bool,
+	docAnalyzer pdf.DocAnalyzer,
+) ([]pdf.TextBox, bool) {
+	retryZoomVal := p.Config.Zoom * pdf.DlaScale
+	retryDPI := retryZoomVal * 72
+	slog.Info("zoom retry: re-rendering", "oldZoom", p.Config.Zoom, "newZoom", retryZoomVal)
+	for pg := fromPage; pg <= toPage; pg++ {
+		img, err := engine.RenderPageImage(pg, retryDPI)
+		if err != nil {
+			slog.Warn("zoom retry: render failed", "page", pg, "err", err)
+			continue
+		}
+		pageImages[pg] = img
+		if retryDPI != pdf.DlaDPI {
+			if dlaImg, dlaErr := engine.RenderPageImage(pg, pdf.DlaDPI); dlaErr == nil {
+				pageImages[pg] = dlaImg
+			}
+		}
+		ocrBoxes := ocrDetectAndRecognize(ctx, img, docAnalyzer, pg, "zoom retry")
+		if ocrBoxes == nil {
+			continue
+		}
+		scaleFactor := retryZoomVal / p.Config.Zoom
+		for i := range ocrBoxes {
+			ocrBoxes[i].X0 /= scaleFactor
+			ocrBoxes[i].X1 /= scaleFactor
+			ocrBoxes[i].Top /= scaleFactor
+			ocrBoxes[i].Bottom /= scaleFactor
+		}
+		boxes = append(boxes, ocrBoxes...)
+		ocrUsedAny = true
+	}
+	return boxes, ocrUsedAny
+}
+
+func (p *Parser) buildLayout(ctx context.Context,
+	result *pdf.ParseResult, engine pdf.PDFEngine,
+	boxes []pdf.TextBox, pageChars map[int][]pdf.TextChar,
+	medianHeights, medianWidths map[int]float64,
+	fromPage, toPage int, ocrUsedAny bool, isEnglish bool,
+	docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder,
+) error {
+	result.Metrics.BoxesInitial = len(boxes)
+
+	result.Tables = p.enrichWithDeepDoc(ctx, result, engine, boxes, result.PageImages, docAnalyzer, tb)
+	result.Metrics.TablesCount = len(result.Tables)
+	if err := ctx.Err(); err != nil {
+		return err
+	}
+
+	boxes = lyt.AssignColumn(boxes, p.Config.Zoom)
+	boxes = lyt.TextMerge(boxes, medianHeights, p.Config.Zoom)
+	result.Metrics.BoxesTextMerge = len(boxes)
+
+	lyt.SortByPageThenY(boxes, p.Config.SortByTop)
+
+	if ocrUsedAny {
+		isEnglish = util.DetectEnglish(pageChars, toPage-fromPage+1, nil)
+	}
+	boxes = lyt.NaiveVerticalMerge(boxes, medianHeights, medianWidths, isEnglish)
+	result.Metrics.BoxesVertMerge = len(boxes)
+	if err := ctx.Err(); err != nil {
+		return err
+	}
+
+	boxes = tbl.ExtractTableAndReplace(boxes, result.Tables)
+	boxes = tbl.ConsolidateFigures(boxes)
+
+	pageHeights := make(map[int]float64, len(result.PageImages))
+	for pg, img := range result.PageImages {
+		pageHeights[pg] = float64(img.Bounds().Dy()) / p.Config.Zoom
+	}
+	result.Sections = lyt.BoxesToSections(boxes, pageHeights)
+	result.Metrics.BoxesFinal = len(result.Sections)
+	result.Sections = tbl.MergeCaptions(result.Sections, result.Figures())
+	return nil
+}
+
+func (p *Parser) processPages(ctx context.Context, engine pdf.PDFEngine,
+	fromPage, toPage int,
+	prescanChars map[int][]pdf.TextChar,
+	medianHeights, medianWidths map[int]float64,
+	isEnglish, isScanNoiseDoc bool,
+	docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder,
+) (*pdf.ParseResult, error) {
+	result := &pdf.ParseResult{PageImages: make(map[int]image.Image)}
+
+	boxes, pageChars, ocrUsedAny, ocrErr := p.extractPages(ctx, engine,
+		fromPage, toPage, prescanChars,
+		medianHeights, medianWidths, result.PageImages, docAnalyzer)
+	if ocrErr != nil {
+		slog.Warn("extractPages: some pages failed OCR", "err", ocrErr)
+	}
+
+	if isScanNoiseDoc {
+		boxes, pageChars, ocrUsedAny = p.retryScanNoise(ctx, engine,
+			fromPage, toPage, result.PageImages,
+			pageChars, medianHeights, medianWidths, ocrUsedAny, docAnalyzer)
+	}
+
+	if len(boxes) == 0 && p.Config.Zoom < 9 && !p.Config.SkipOCR {
+		boxes, ocrUsedAny = p.retryZoom(ctx, engine, fromPage, toPage,
+			result.PageImages, boxes, ocrUsedAny, docAnalyzer)
+	}
+
+	if len(boxes) == 0 {
+		return result, nil
+	}
+
+	if err := p.buildLayout(ctx, result, engine, boxes, pageChars,
+		medianHeights, medianWidths, fromPage, toPage, ocrUsedAny, isEnglish,
+		docAnalyzer, tb); err != nil {
+		return nil, fmt.Errorf("buildLayout: %w", err)
+	}
+	p.fillSectionImages(result)
+	return result, nil
+}
+
+func (p *Parser) fillSectionImages(result *pdf.ParseResult) {
+	if len(result.PageImages) == 0 {
+		return
+	}
+	tableImgByRegion := make(map[string]string, len(result.Tables))
+	for _, tbl := range result.Tables {
+		if tbl.ImageB64 == "" {
+			continue
+		}
+		pg := 0
+		if len(tbl.Positions) > 0 && len(tbl.Positions[0].PageNumbers) > 0 {
+			pg = tbl.Positions[0].PageNumbers[0]
+		}
+		key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f",
+			pg, tbl.RegionLeft, tbl.RegionRight, tbl.RegionTop, tbl.RegionBottom)
+		tableImgByRegion[key] = tbl.ImageB64
+	}
+	for i := range result.Sections {
+		if result.Sections[i].LayoutType == pdf.LayoutTypeTable {
+			if img, ok := matchTableImage(&result.Sections[i], tableImgByRegion); ok {
+				result.Sections[i].Image = img
+				continue
+			}
+		}
+		if result.Sections[i].LayoutType == pdf.LayoutTypeFigure && len(result.Sections[i].Positions) > 0 {
+			if dlaImg := util.CropSectionByDLA(result.Sections[i], result.DLADebug, result.PageImages); dlaImg != "" {
+				result.Sections[i].Image = dlaImg
+				continue
+			}
+		}
+		img := util.CropSectionImage(result.Sections[i].PositionTag, result.PageImages, p.Config.Zoom)
+		result.Sections[i].Image = img
+		if img == "" && result.Sections[i].Text != "" {
+			tag := result.Sections[i].PositionTag
+			slog.Warn("cropSectionImage empty for non-empty section",
+				"section", i, "posTag", tag[:min(80, len(tag))])
+		}
+	}
+}
+
+// matchTableImage looks up a pre-rendered table image for a section.
+// Uses Positions if available; falls back to TableItem Region boundaries.
+func matchTableImage(sec *pdf.Section, tableImgByRegion map[string]string) (string, bool) {
+	pg := 0
+	if len(sec.Positions) > 0 {
+		pos := sec.Positions[0]
+		if len(pos.PageNumbers) > 0 {
+			pg = pos.PageNumbers[0]
+		}
+		key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f", pg, pos.Left, pos.Right, pos.Top, pos.Bottom)
+		if img, ok := tableImgByRegion[key]; ok {
+			return img, true
+		}
+		return "", false
+	}
+	if sec.TableItem != nil {
+		if len(sec.TableItem.Positions) > 0 && len(sec.TableItem.Positions[0].PageNumbers) > 0 {
+			pg = sec.TableItem.Positions[0].PageNumbers[0]
+		}
+		key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f", pg,
+			sec.TableItem.RegionLeft, sec.TableItem.RegionRight,
+			sec.TableItem.RegionTop, sec.TableItem.RegionBottom)
+		if img, ok := tableImgByRegion[key]; ok {
+			return img, true
+		}
+	}
+	return "", false
+}
+
+// mergePageResults collects per-page OCR results into the final output.
+func mergePageResults(results []pageResult, boxes []pdf.TextBox, pageImages map[int]image.Image,
+	pageChars map[int][]pdf.TextChar, ocrUsedAny bool,
+	medianHeights, medianWidths map[int]float64,
+) ([]pdf.TextBox, map[int][]pdf.TextChar, bool, error) {
 	var errs []error
-	for i := 0; i < pageCount; i++ {
-		r := results[i]
+	for _, r := range results {
 		if r.err != nil {
 			slog.Warn("page OCR failed", "page", r.pg, "err", r.err)
 			errs = append(errs, fmt.Errorf("page %d: %w", r.pg, r.err))
@@ -329,233 +523,3 @@ func (p *Parser) extractPages(ctx context.Context, engine pdf.PDFEngine,
 	}
 	return boxes, pageChars, ocrUsedAny, errors.Join(errs...)
 }
-
-// retryScanNoise re-runs OCR on all pages when prescan detects scan noise,
-// overwriting page-level state with fresh detect+recognize results.
-func (p *Parser) retryScanNoise(ctx context.Context, engine pdf.PDFEngine,
-	fromPage, toPage int,
-	pageImages map[int]image.Image,
-	pageChars map[int][]pdf.TextChar,
-	medianHeights, medianWidths map[int]float64,
-	ocrUsedAny bool,
-) ([]pdf.TextBox, map[int][]pdf.TextChar, bool) {
-	slog.Warn("scan noise: OCR retry", "from", fromPage, "to", toPage)
-	var boxes []pdf.TextBox
-	for pg := fromPage; pg <= toPage; pg++ {
-		img := pageImages[pg]
-		if img == nil {
-			var err error
-			img, err = renderPageToImage(engine, pg)
-			if err != nil {
-				slog.Warn("scan noise: page render failed", "page", pg, "err", err)
-				continue
-			}
-			pageImages[pg] = img
-		}
-		ocrBoxes := ocrDetectAndRecognize(ctx, img, p.DeepDoc, pg, "scan page")
-		if ocrBoxes == nil {
-			slog.Warn("scan noise: page OCR empty", "page", pg)
-			continue
-		}
-		boxes = append(boxes, ocrBoxes...)
-		var chars []pdf.TextChar
-		for _, b := range ocrBoxes {
-			for _, r := range b.Text {
-				chars = append(chars, pdf.TextChar{Text: string(r), Top: b.Top, Bottom: b.Bottom, PageNumber: pg})
-				break
-			}
-		}
-		pageChars[pg] = chars
-		medianHeights[pg] = util.MedianCharHeight(chars)
-		medianWidths[pg] = util.MedianCharWidth(chars)
-	}
-	slog.Debug("scan noise OCR retry complete", "pages", toPage-fromPage+1, "boxes", len(boxes))
-	return boxes, pageChars, true
-}
-
-// retryZoom re-renders pages at higher resolution and re-runs OCR when the
-// initial extraction produced zero boxes.  Box coordinates are scaled back
-// to Config.Zoom space.  Matches Python's __images__ retry.
-func (p *Parser) retryZoom(ctx context.Context, engine pdf.PDFEngine,
-	fromPage, toPage int,
-	pageImages map[int]image.Image,
-	boxes []pdf.TextBox, ocrUsedAny bool,
-) ([]pdf.TextBox, bool) {
-	retryZoom := p.Config.Zoom * pdf.DlaScale
-	retryDPI := retryZoom * 72
-	slog.Info("zoom retry: re-rendering", "oldZoom", p.Config.Zoom, "newZoom", retryZoom)
-	for pg := fromPage; pg <= toPage; pg++ {
-		img, err := engine.RenderPageImage(pg, retryDPI)
-		if err != nil {
-			slog.Warn("zoom retry: render failed", "page", pg, "err", err)
-			continue
-		}
-		pageImages[pg] = img
-		// Downstream DLA/TSR assumes pdf.DlaDPI. Re-render at standard
-		// resolution so layout coordinates are scaled correctly.
-		if retryDPI != pdf.DlaDPI {
-			if dlaImg, dlaErr := engine.RenderPageImage(pg, pdf.DlaDPI); dlaErr == nil {
-				pageImages[pg] = dlaImg
-			}
-		}
-		ocrBoxes := ocrDetectAndRecognize(ctx, img, p.DeepDoc, pg, "zoom retry")
-		if ocrBoxes == nil {
-			continue
-		}
-		scaleFactor := retryZoom / p.Config.Zoom
-		for i := range ocrBoxes {
-			ocrBoxes[i].X0 /= scaleFactor
-			ocrBoxes[i].X1 /= scaleFactor
-			ocrBoxes[i].Top /= scaleFactor
-			ocrBoxes[i].Bottom /= scaleFactor
-		}
-		boxes = append(boxes, ocrBoxes...)
-		ocrUsedAny = true
-	}
-	return boxes, ocrUsedAny
-}
-
-// buildLayout runs the DLA → TSR → Column → TextMerge → VM → pdf.Section
-// pipeline and populates result.Metrics, result.Tables, result.Sections,
-// and result.Sections.  Matches Python's _parse_loaded_window_into_bboxes
-// order.
-func (p *Parser) buildLayout(ctx context.Context,
-	result *pdf.ParseResult, engine pdf.PDFEngine,
-	boxes []pdf.TextBox, pageChars map[int][]pdf.TextChar,
-	medianHeights, medianWidths map[int]float64,
-	fromPage, toPage int, ocrUsedAny bool, isEnglish bool,
-) error {
-	result.Metrics.BoxesInitial = len(boxes)
-
-	result.Tables = p.enrichWithDeepDoc(ctx, result, engine, boxes, result.PageImages)
-	result.Metrics.TablesCount = len(result.Tables)
-	if err := ctx.Err(); err != nil {
-		return err
-	}
-
-	boxes = lyt.AssignColumn(boxes, p.Config.Zoom)
-	boxes = lyt.TextMerge(boxes, medianHeights, p.Config.Zoom)
-	result.Metrics.BoxesTextMerge = len(boxes)
-
-	lyt.SortByPageThenY(boxes, p.Config.SortByTop)
-
-	if ocrUsedAny {
-		isEnglish = util.DetectEnglish(pageChars, toPage-fromPage+1, p.SampleChars)
-	}
-	boxes = lyt.NaiveVerticalMerge(boxes, medianHeights, medianWidths, isEnglish)
-	result.Metrics.BoxesVertMerge = len(boxes)
-	if err := ctx.Err(); err != nil {
-		return err
-	}
-
-	boxes = tbl.ExtractTableAndReplace(boxes, result.Tables)
-	boxes = tbl.ConsolidateFigures(boxes)
-
-	pageHeights := make(map[int]float64, len(result.PageImages))
-	for pg, img := range result.PageImages {
-		pageHeights[pg] = float64(img.Bounds().Dy()) / p.Config.Zoom
-	}
-	result.Sections = lyt.BoxesToSections(boxes, pageHeights)
-	result.Metrics.BoxesFinal = len(result.Sections)
-	result.Sections = tbl.MergeCaptions(result.Sections, result.Figures())
-	return nil
-}
-
-// processPages runs the full pipeline on pages [fromPage, toPage].
-// prescanChars provides pre-extracted chars (avoids double extraction).
-func (p *Parser) processPages(ctx context.Context, engine pdf.PDFEngine,
-	fromPage, toPage int,
-	prescanChars map[int][]pdf.TextChar,
-	medianHeights, medianWidths map[int]float64,
-	isEnglish, isScanNoiseDoc bool,
-) (*pdf.ParseResult, error) {
-	result := &pdf.ParseResult{PageImages: make(map[int]image.Image)}
-
-	// 1. OCR extraction — per-page detect + recognize + char merge.
-	boxes, pageChars, ocrUsedAny, ocrErr := p.extractPages(ctx, engine,
-		fromPage, toPage, prescanChars,
-		medianHeights, medianWidths, result.PageImages)
-	if ocrErr != nil {
-		slog.Warn("extractPages: some pages failed OCR", "err", ocrErr)
-	}
-	// 2. Scan noise retry — re-OCR all pages when prescan detects scan noise.
-	if isScanNoiseDoc {
-		boxes, pageChars, ocrUsedAny = p.retryScanNoise(ctx, engine,
-			fromPage, toPage, result.PageImages,
-			pageChars, medianHeights, medianWidths, ocrUsedAny)
-	}
-
-	// 3. Zoom retry — re-render at higher resolution if OCR produced zero boxes.
-	if len(boxes) == 0 && p.Config.Zoom < 9 && !p.Config.SkipOCR {
-		boxes, ocrUsedAny = p.retryZoom(ctx, engine, fromPage, toPage,
-			result.PageImages, boxes, ocrUsedAny)
-	}
-
-	if len(boxes) == 0 {
-		return result, nil
-	}
-
-	// 4. Layout pipeline — DLA → TSR → Column → TextMerge → VM → Sections.
-	if err := p.buildLayout(ctx, result, engine, boxes, pageChars,
-		medianHeights, medianWidths, fromPage, toPage, ocrUsedAny, isEnglish); err != nil {
-		return nil, fmt.Errorf("buildLayout: %w", err)
-	}
-	// 5. Crop section images from page renders.
-	p.fillSectionImages(result)
-
-	return result, nil
-}
-
-// fillSectionImages populates result.Sections[i].Image with cropped
-// page images. Table sections are matched to their TableItem image;
-// figure sections try DLA-aware cropping first, then fall back to
-// position-tag-based cropping.
-func (p *Parser) fillSectionImages(result *pdf.ParseResult) {
-	if len(result.PageImages) == 0 {
-		return
-	}
-	// Build lookup: DLA region -> table image (base64).
-	tableImgByRegion := make(map[string]string, len(result.Tables))
-	for _, tbl := range result.Tables {
-		if tbl.ImageB64 == "" {
-			continue
-		}
-		pg := 0
-		if len(tbl.Positions) > 0 && len(tbl.Positions[0].PageNumbers) > 0 {
-			pg = tbl.Positions[0].PageNumbers[0]
-		}
-		key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f",
-			pg, tbl.RegionLeft, tbl.RegionRight, tbl.RegionTop, tbl.RegionBottom)
-		tableImgByRegion[key] = tbl.ImageB64
-	}
-	for i := range result.Sections {
-		if result.Sections[i].LayoutType == pdf.LayoutTypeTable && len(result.Sections[i].Positions) > 0 {
-			pos := result.Sections[i].Positions[0]
-			pg := 0
-			if len(pos.PageNumbers) > 0 {
-				pg = pos.PageNumbers[0]
-			}
-			key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f",
-				pg, pos.Left, pos.Right, pos.Top, pos.Bottom)
-			if img, ok := tableImgByRegion[key]; ok {
-				result.Sections[i].Image = img
-				continue
-			}
-		}
-		// Try DLA-aware cropping for figure sections (matching Python's
-		// cropout which uses DLA region boundaries instead of text boxes).
-		if result.Sections[i].LayoutType == pdf.LayoutTypeFigure && len(result.Sections[i].Positions) > 0 {
-			if dlaImg := util.CropSectionByDLA(result.Sections[i], result.DLADebug, result.PageImages); dlaImg != "" {
-				result.Sections[i].Image = dlaImg
-				continue
-			}
-		}
-		img := util.CropSectionImage(result.Sections[i].PositionTag, result.PageImages, p.Config.Zoom)
-		result.Sections[i].Image = img
-		if img == "" && result.Sections[i].Text != "" {
-			tag := result.Sections[i].PositionTag
-			slog.Warn("cropSectionImage empty for non-empty section",
-				"section", i, "posTag", tag[:min(80, len(tag))])
-		}
-	}
-}
--- a/internal/deepdoc/parser/pdf/parser_mock_test.go
+++ b/internal/deepdoc/parser/pdf/parser_mock_test.go
@@ -1,4 +1,4 @@
-package parser
+package pdf

 import (
 	"context"
@@ -53,10 +53,11 @@ func TestEnrichWithDeepDoc_Noop(t *testing.T) {
 	boxes := []pdf.TextBox{
 		{PageNumber: 0, X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "text"},
 	}
-	eng := &mockEngine{pageCount: 1}
+	eng := &MockEngine{NumPages: 1}

-	p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: false})
-	tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, nil)
+	p := NewParser(pdf.DefaultParserConfig())
+	mock := &MockDocAnalyzer{Healthy: false}
+	tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, nil, mock, NewTableBuilderFor(mock))
 	if len(tables) != 0 {
 		t.Error("unhealthy DeepDoc → 0 Tables")
 	}
@@ -83,10 +84,10 @@ func TestExtractTableBoxes_Mock(t *testing.T) {
 			{X0: 600, Y0: 410, X1: 1240, Y1: 800, Text: "B2"},
 		},
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())
 	dummyImg := image.NewRGBA(image.Rect(0, 0, 2000, 3000))

-	tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummyImg, 0, 0)
+	tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummyImg, 0, 0, mock, NewTableBuilderFor(mock))
 	if len(tables) != 1 {
 		t.Fatalf("expected 1 pdf.TableItem, got %d", len(tables))
 	}
@@ -105,9 +106,9 @@ func TestExtractTableBoxes_Mock(t *testing.T) {

 func TestExtractTableBoxes_NoTables(t *testing.T) {
 	mock := &MockDocAnalyzer{Healthy: true, DLARegions: []pdf.DLARegion{}}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())
 	dummy := image.NewRGBA(image.Rect(0, 0, 1000, 1000))
-	tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0)
+	tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock))
 	if len(tables) != 0 {
 		t.Errorf("0 tables expected, got %d", len(tables))
 	}
@@ -121,9 +122,9 @@ func TestExtractTableBoxes_NonTableRegions(t *testing.T) {
 			{X0: 150, Y0: 600, X1: 1650, Y1: 900, Label: "figure", Confidence: 0.8},
 		},
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())
 	dummy := image.NewRGBA(image.Rect(0, 0, 2000, 2000))
-	tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0)
+	tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock))
 	if len(tables) != 0 {
 		t.Errorf("non-table regions → 0 tables, got %d", len(tables))
 	}
@@ -139,9 +140,9 @@ func TestExtractTableBoxes_NoOverlap(t *testing.T) {
 			{X0: 150, Y0: 1500, X1: 1500, Y1: 2300, Label: "table", Confidence: 0.95},
 		},
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())
 	dummy := image.NewRGBA(image.Rect(0, 0, 2000, 3000))
-	tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0)
+	tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0, mock, NewTableBuilderFor(mock))
 	if len(tables) != 0 {
 		t.Errorf("no overlap → 0 tables, got %d", len(tables))
 	}
@@ -158,9 +159,9 @@ func TestExtractTableBoxes_TSRError(t *testing.T) {
 		},
 		TSRCells: nil, // TSR returns nothing
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())
 	dummy := image.NewRGBA(image.Rect(0, 0, 2000, 3000))
-	tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0)
+	tables := p.extractTableBoxesFromImage(context.Background(), nil, boxes, dummy, 0, 0, mock, NewTableBuilderFor(mock))
 	if len(tables) != 1 {
 		t.Fatalf("TSR failure: expected 1 pdf.TableItem with image+positions, got %d", len(tables))
 	}
@@ -180,9 +181,9 @@ func TestExtractTableBoxes_DLAError(t *testing.T) {
 	mock := &MockDocAnalyzer{Healthy: true, DLARegions: []pdf.DLARegion{
 		{X0: 0, Y0: 0, X1: 100, Y1: 100, Label: "text", Confidence: 0.9},
 	}}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())
 	dummy := image.NewRGBA(image.Rect(0, 0, 1000, 1000))
-	tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0)
+	tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock))
 	if len(tables) != 0 {
 		t.Errorf("non-table DLA → 0 tables, got %d", len(tables))
 	}
@@ -238,9 +239,9 @@ func TestExtractTableBoxes_InvalidRegion(t *testing.T) {
 			{X0: 500, Y0: 100, X1: 100, Y1: 300, Label: "table", Confidence: 0.9},
 		},
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())
 	dummy := image.NewRGBA(image.Rect(0, 0, 1000, 1000))
-	tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0)
+	tables := p.extractTableBoxesFromImage(context.Background(), nil, nil, dummy, 0, 0, mock, NewTableBuilderFor(mock))
 	if len(tables) != 0 {
 		t.Errorf("invalid DLA region should be skipped, got %d tables", len(tables))
 	}
@@ -252,16 +253,16 @@ func TestParse_CollectsFigures(t *testing.T) {
 	// End-to-end: Parse() with mock DeepDoc that labels a box as "figure".
 	// Verify p.Figures is populated.

-	eng := &mockEngine{pageCount: 1, chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "chart image"}}}}
+	eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "chart image"}}}}
 	mock := &MockDocAnalyzer{
 		Healthy: true,
 		DLARegions: []pdf.DLARegion{
 			{X0: 50, Y0: 200, X1: 2000, Y1: 1000, Label: "figure", Confidence: 0.85},
 		},
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())

-	result, err := p.Parse(context.Background(), eng)
+	result, err := p.ParseRaw(context.Background(), eng, mock)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -282,15 +283,15 @@ func TestParse_CollectsFigures(t *testing.T) {
 func TestParse_NoFigures(t *testing.T) {
 	// Parse() with no DLA figure regions → p.Figures should be empty.

-	eng := &mockEngine{pageCount: 1, chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "just text"}}}}
+	eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "just text"}}}}
 	mock := &MockDocAnalyzer{
 		DLARegions: []pdf.DLARegion{
 			{X0: 150, Y0: 300, X1: 1500, Y1: 600, Label: "text", Confidence: 0.8},
 		},
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())

-	result, err := p.Parse(context.Background(), eng)
+	result, err := p.ParseRaw(context.Background(), eng, mock)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -302,10 +303,11 @@ func TestParse_NoFigures(t *testing.T) {
 func TestParse_NoDeepDoc_NoFigures(t *testing.T) {
 	// Parse() with mock DeepDoc → Figures should be empty (no DLA-detected figures).

-	eng := &mockEngine{pageCount: 1, chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "text"}}}}
-	p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
+	eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "text"}}}}
+	mock := &MockDocAnalyzer{Healthy: true}
+	p := NewParser(pdf.DefaultParserConfig())

-	result, err := p.Parse(context.Background(), eng)
+	result, err := p.ParseRaw(context.Background(), eng, mock)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -319,9 +321,9 @@ func TestParse_NoDeepDoc_NoFigures(t *testing.T) {
 func TestParse_UsesOCRDetectForEmbeddedChars(t *testing.T) {
 	// When DeepDoc is available and the page has embedded chars,
 	// Parse should use ocrMergeChars (detect → merge → recognize).
-	eng := &mockEngine{
-		pageCount: 1,
-		chars: map[int][]pdf.TextChar{0: {
+	eng := &MockEngine{
+		NumPages: 1,
+		Chars: map[int][]pdf.TextChar{0: {
 			{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0},
 		}},
 	}
@@ -331,9 +333,9 @@ func TestParse_UsesOCRDetectForEmbeddedChars(t *testing.T) {
 			{X0: 5, Y0: 5, X1: 50, Y1: 5, X2: 50, Y2: 50, X3: 5, Y3: 50},
 		},
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())

-	result, err := p.Parse(context.Background(), eng)
+	result, err := p.ParseRaw(context.Background(), eng, mock)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -349,15 +351,16 @@ func TestParse_UsesOCRDetectForEmbeddedChars(t *testing.T) {

 func TestParse_FallsBackToCharsToBoxes_NoDeepDoc(t *testing.T) {
 	// Without DeepDoc, Parse should use charsToBoxes (unchanged behavior).
-	eng := &mockEngine{
-		pageCount: 1,
-		chars: map[int][]pdf.TextChar{0: {
+	eng := &MockEngine{
+		NumPages: 1,
+		Chars: map[int][]pdf.TextChar{0: {
 			{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0},
 		}},
 	}
-	p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
+	mock := &MockDocAnalyzer{Healthy: true}
+	p := NewParser(pdf.DefaultParserConfig())

-	result, err := p.Parse(context.Background(), eng)
+	result, err := p.ParseRaw(context.Background(), eng, mock)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -368,9 +371,9 @@ func TestParse_FallsBackToCharsToBoxes_NoDeepDoc(t *testing.T) {

 func TestParse_FallsBackToCharsToBoxes_EmptyOCRBoxes(t *testing.T) {
 	// OCRDetect returns no boxes → falls through to charsToBoxes.
-	eng := &mockEngine{
-		pageCount: 1,
-		chars: map[int][]pdf.TextChar{0: {
+	eng := &MockEngine{
+		NumPages: 1,
+		Chars: map[int][]pdf.TextChar{0: {
 			{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0},
 		}},
 	}
@@ -378,9 +381,9 @@ func TestParse_FallsBackToCharsToBoxes_EmptyOCRBoxes(t *testing.T) {
 		Healthy:  true,
 		OCRBoxes: []pdf.OCRBox{}, // empty detect
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())

-	result, err := p.Parse(context.Background(), eng)
+	result, err := p.ParseRaw(context.Background(), eng, mock)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -392,18 +395,19 @@ func TestParse_FallsBackToCharsToBoxes_EmptyOCRBoxes(t *testing.T) {
 // ── Error path coverage ────────────────────────────────────────────────

 func TestMockDocAnalyzer_DLAError_DoesNotCrash(t *testing.T) {
-	p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{
+	mock := &MockDocAnalyzer{
 		Healthy: true,
 		DLAErr:  fmt.Errorf("DLA service unavailable"),
-	})
-	eng := &mockEngine{pageCount: 1}
+	}
+	p := NewParser(pdf.DefaultParserConfig())
+	eng := &MockEngine{NumPages: 1}
 	img := image.NewRGBA(image.Rect(0, 0, 100, 100))
 	pageImages := map[int]image.Image{0: img}
 	boxes := []pdf.TextBox{
 		{PageNumber: 0, X0: 0, X1: 100, Top: 0, Bottom: 50, Text: "text"},
 	}
 	// enrichWithDeepDoc should return nil (not panic) on DLA error.
-	tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages)
+	tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages, mock, NewTableBuilderFor(mock))
 	if len(tables) != 0 {
 		t.Errorf("DLA error should produce 0 tables, got %d", len(tables))
 	}
@@ -412,20 +416,21 @@ func TestMockDocAnalyzer_DLAError_DoesNotCrash(t *testing.T) {
 func TestMockDocAnalyzer_TSRError_DoesNotCrash(t *testing.T) {
 	// TSR error: DLA succeeds, TSR fails.  The table region is detected
 	// but no cells are returned — the table is skipped gracefully.
-	p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{
+	mock := &MockDocAnalyzer{
 		Healthy: true,
 		DLARegions: []pdf.DLARegion{
 			{X0: 0, Y0: 0, X1: 400, Y1: 400, Label: "table", Confidence: 0.95},
 		},
 		TSRErr: fmt.Errorf("TSR model timeout"),
-	})
-	eng := &mockEngine{pageCount: 1}
+	}
+	p := NewParser(pdf.DefaultParserConfig())
+	eng := &MockEngine{NumPages: 1}
 	img := image.NewRGBA(image.Rect(0, 0, 100, 100))
 	pageImages := map[int]image.Image{0: img}
 	boxes := []pdf.TextBox{
 		{PageNumber: 0, X0: 10, X1: 90, Top: 10, Bottom: 90, Text: "in table region"},
 	}
-	tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages)
+	tables := p.enrichWithDeepDoc(context.Background(), nil, eng, boxes, pageImages, mock, NewTableBuilderFor(mock))
 	// DLA detects the table region → 1 pdf.TableItem is created.  TSR failure
 	// means it has no cells, but the pipeline must not panic.
 	if len(tables) != 1 {
@@ -440,12 +445,12 @@ func TestMockDocAnalyzer_OCRDetectError_DoesNotCrash(t *testing.T) {
 	// OCRDetect failure path: extractPages uses ocrDetectAndRecognize which
 	// calls doc.OCRDetect.  When it fails, the page is skipped gracefully.
 	mock := &MockDocAnalyzer{Healthy: true, OCRDetectErr: fmt.Errorf("OCR model OOM")}
-	eng := &mockEngine{
-		pageCount: 1,
-		chars:     map[int][]pdf.TextChar{}, // empty → triggers OCR path
+	eng := &MockEngine{
+		NumPages: 1,
+		Chars:    map[int][]pdf.TextChar{}, // empty → triggers OCR path
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
-	_, err := p.Parse(context.Background(), eng)
+	p := NewParser(pdf.DefaultParserConfig())
+	_, err := p.ParseRaw(context.Background(), eng, mock)
 	if err != nil {
 		t.Fatalf("Parse returned error: %v", err)
 	}
--- a/internal/deepdoc/parser/pdf/parser_ocr.go
+++ b/internal/deepdoc/parser/pdf/parser_ocr.go
@@ -1,4 +1,4 @@
-package parser
+package pdf

 import (
 	"context"
@@ -54,12 +54,17 @@ func ocrDetectAndRecognize(ctx context.Context, pageImg image.Image, doc pdf.Doc
 // merges the chars into detect regions, and OCRs any regions without chars.
 // Matches Python's __ocr: detect → match chars to boxes → use char text
 // for boxes with embedded chars → OCR recognize only empty/garbled boxes.
+type ocrDetectBox struct {
+	box            pdf.TextBox
+	x0, y0, x1, y1 float64
+}
+
 func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextChar, doc pdf.DocAnalyzer, pageNum int) []pdf.TextBox {
-	detectBoxes, err := doc.OCRDetect(ctx, pageImg)
-	if err != nil || len(detectBoxes) == 0 {
+	ocrDetectBoxes, err := doc.OCRDetect(ctx, pageImg)
+	if err != nil || len(ocrDetectBoxes) == 0 {
 		return nil
 	}
-	slog.Debug("ocrMergeChars detect", "page", pageNum, "boxes", len(detectBoxes))
+	slog.Debug("ocrMergeChars detect", "page", pageNum, "boxes", len(ocrDetectBoxes))

 	// Detect boxes are in pixel space (216 DPI).  Scale to PDF space (72 DPI)
 	// so coordinates match embedded chars.
@@ -69,12 +74,8 @@ func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextCha
 	imgH := float64(imgBounds.Dy()) / scale

 	// Step 1: match embedded chars to detect boxes (Python __ocr char matching).
-	type detectBox struct {
-		box            pdf.TextBox
-		x0, y0, x1, y1 float64 // PDF-space bounds
-	}
-	boxes := make([]detectBox, 0, len(detectBoxes))
-	for _, b := range detectBoxes {
+	boxes := make([]ocrDetectBox, 0, len(ocrDetectBoxes))
+	for _, b := range ocrDetectBoxes {
 		x0 := min(b.X0, b.X1, b.X2, b.X3) / scale
 		y0 := min(b.Y0, b.Y1, b.Y2, b.Y3) / scale
 		x1 := max(b.X0, b.X1, b.X2, b.X3) / scale
@@ -94,7 +95,7 @@ func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextCha
 		if x0 >= x1 || y0 >= y1 {
 			continue
 		}
-		boxes = append(boxes, detectBox{box: pdf.TextBox{
+		boxes = append(boxes, ocrDetectBox{box: pdf.TextBox{
 			X0: x0, X1: x1, Top: y0, Bottom: y1, PageNumber: pageNum,
 		}, x0: x0, y0: y0, x1: x1, y1: y1})
 	}
@@ -145,82 +146,7 @@ func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextCha
 		boxChars[bestIdx] = append(boxChars[bestIdx], c)
 	}

-	// Step 3: assemble text for each box.
-	var result []pdf.TextBox
-	var needOCR []int
-	for i := range boxes {
-		tb := boxes[i].box
-		tb.Text = ""
-
-		if len(boxChars[i]) > 0 {
-			// Sort chars by reading order, matching Python's sort_Y_firstly.
-			// Fuzzy Y-group: chars within median char height are "same line",
-			// sorted by X; different lines sorted by Y.
-			sortCharsYFirstly(boxChars[i], util.MedianCharHeight(boxChars[i]))
-			// Use lineToTextBox for correct space insertion + garbled detection.
-			// lineToTextBox inserts ASCII word spaces at visible gaps —
-			// matching Python's __img_ocr + __ocr char logic.
-			lineBox := lyt.LineToTextBox(boxChars[i])
-			tb.Text = lineBox.Text
-
-			// Strategy 1: If majority of chars are garbled (PUA), clear text → OCR.
-			var garbledCnt, totalCnt int
-			for _, c := range boxChars[i] {
-				for _, r := range c.Text {
-					totalCnt++
-					if util.IsGarbledChar(string(r)) {
-						garbledCnt++
-					}
-				}
-			}
-			if totalCnt > 0 && float64(garbledCnt)/float64(totalCnt) >= 0.5 {
-				tb.Text = ""
-			}
-			// Strategy 2: font-encoding garbled (subset fonts, min 5 chars).
-			if tb.Text != "" && util.IsGarbledByFontEncoding(boxChars[i], 5) {
-				tb.Text = ""
-			}
-		}
-
-		// Step 4: batch OCR recognize boxes without embedded chars (or garbled).
-		if tb.Text == "" {
-			needOCR = append(needOCR, i)
-		}
-		result = append(result, tb)
-	}
-
-	if len(needOCR) > 0 {
-		cropped := make([]image.Image, len(needOCR))
-		for j, idx := range needOCR {
-			cropped[j] = util.FastCrop(pageImg,
-				int(boxes[idx].x0*scale), int(boxes[idx].y0*scale),
-				int(boxes[idx].x1*scale), int(boxes[idx].y1*scale))
-		}
-		allTexts, allErrs := doc.OCRRecognizeBatch(ctx, cropped)
-		for j, idx := range needOCR {
-			if allErrs[j] != nil {
-				slog.Warn("ocr merge: recognize failed", "page", pageNum, "err", allErrs[j])
-				continue
-			}
-			var ocrParts []string
-			for _, t := range allTexts[j] {
-				if strings.TrimSpace(t.Text) != "" {
-					ocrParts = append(ocrParts, t.Text)
-				}
-			}
-			result[idx].Text = strings.TrimSpace(strings.Join(ocrParts, " "))
-		}
-	}
-	// Filter out boxes with no text.
-	filtered := result[:0]
-	for _, tb := range result {
-		if tb.Text != "" {
-			filtered = append(filtered, tb)
-		}
-	}
-	result = filtered
-	slog.Debug("ocrMergeChars result", "page", pageNum, "boxes", len(result))
-	return result
+	return buildTextBoxes(ctx, pageImg, boxes, boxChars, doc, scale, pageNum)
 }

 // sortYFirstly sorts chars by Y (fuzzy group by threshold), then by X.
@@ -289,3 +215,71 @@ func ocrTableCells(ctx context.Context, cells []pdf.TSRCell, tableImg image.Imag
 		cells[i].Text = strings.TrimSpace(strings.Join(parts, " "))
 	}
 }
+
+// buildTextBoxes assembles detect box text from embedded chars and fills
+// empty boxes via batch OCR.
+func buildTextBoxes(ctx context.Context, pageImg image.Image,
+	boxes []ocrDetectBox, boxChars [][]pdf.TextChar, doc pdf.DocAnalyzer, scale float64, pageNum int,
+) []pdf.TextBox {
+	var result []pdf.TextBox
+	var needOCR []int
+	for i := range boxes {
+		tb := boxes[i].box
+		tb.Text = ""
+		if len(boxChars[i]) > 0 {
+			sortCharsYFirstly(boxChars[i], util.MedianCharHeight(boxChars[i]))
+			lineBox := lyt.LineToTextBox(boxChars[i])
+			tb.Text = lineBox.Text
+			var garbledCnt, totalCnt int
+			for _, c := range boxChars[i] {
+				for _, r := range c.Text {
+					totalCnt++
+					if util.IsGarbledChar(string(r)) {
+						garbledCnt++
+					}
+				}
+			}
+			if totalCnt > 0 && float64(garbledCnt)/float64(totalCnt) >= 0.5 {
+				tb.Text = ""
+			}
+			if tb.Text != "" && util.IsGarbledByFontEncoding(boxChars[i], 5) {
+				tb.Text = ""
+			}
+		}
+		if strings.TrimSpace(tb.Text) == "" {
+			tb.Text = ""
+			needOCR = append(needOCR, i)
+		}
+		result = append(result, tb)
+	}
+	if len(needOCR) > 0 {
+		cropped := make([]image.Image, len(needOCR))
+		for j, idx := range needOCR {
+			cropped[j] = util.FastCrop(pageImg,
+				int(boxes[idx].x0*scale), int(boxes[idx].y0*scale),
+				int(boxes[idx].x1*scale), int(boxes[idx].y1*scale))
+		}
+		allTexts, allErrs := doc.OCRRecognizeBatch(ctx, cropped)
+		for j, idx := range needOCR {
+			if allErrs[j] != nil {
+				slog.Warn("ocr merge: recognize failed", "page", pageNum, "err", allErrs[j])
+				continue
+			}
+			var ocrParts []string
+			for _, t := range allTexts[j] {
+				if strings.TrimSpace(t.Text) != "" {
+					ocrParts = append(ocrParts, t.Text)
+				}
+			}
+			result[idx].Text = strings.TrimSpace(strings.Join(ocrParts, " "))
+		}
+	}
+	filtered := result[:0]
+	for _, tb := range result {
+		if strings.TrimSpace(tb.Text) != "" {
+			filtered = append(filtered, tb)
+		}
+	}
+	slog.Debug("ocrMergeChars result", "page", pageNum, "boxes", len(filtered))
+	return filtered
+}
--- a/internal/deepdoc/parser/pdf/parser_ocr_test.go
+++ b/internal/deepdoc/parser/pdf/parser_ocr_test.go
@@ -1,4 +1,4 @@
-package parser
+package pdf

 import (
 	"context"
--- a/internal/deepdoc/parser/pdf/parser_pipeline_integration_test.go
+++ b/internal/deepdoc/parser/pdf/parser_pipeline_integration_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && integration

-package parser
+package pdf

 import (
 	"bytes"
@@ -11,10 +11,10 @@ import (
 	_ "image/png"
 	"os"
 	"path/filepath"
-	"ragflow/internal/deepdoc/parser/pdf/post"
-	pdf "ragflow/internal/deepdoc/parser/pdf/type"
 	"strings"
 	"testing"
+
+	pdf "ragflow/internal/deepdoc/parser/pdf/type"
 )

 // ── golden-file helpers ────────────────────────────────────────────────────
@@ -95,12 +95,11 @@ func tablesToGolden(tables []pdf.TableItem) []tableGolden {
 // TestIntegration_SectionsText verifies section text output matches golden.
 func TestIntegration_SectionsText(t *testing.T) {
 	client := mustConnectInferenceClient(t)
-	eng := mustOpenEngine(t, "01_english_simple.pdf")
-	defer eng.Close()
+	data := mustReadPDF(t, "01_english_simple.pdf")

 	cfg := pdf.DefaultParserConfig()
-	p := NewParser(cfg, client)
-	result, err := p.Parse(context.Background(), eng)
+	p := NewParser(cfg)
+	result, err := p.Parse(context.Background(), data, client)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -139,12 +138,11 @@ func TestIntegration_SectionsText(t *testing.T) {
 // TestIntegration_SectionsCount verifies section count is stable.
 func TestIntegration_SectionsCount(t *testing.T) {
 	client := mustConnectInferenceClient(t)
-	eng := mustOpenEngine(t, "01_english_simple.pdf")
-	defer eng.Close()
+	data := mustReadPDF(t, "01_english_simple.pdf")

 	cfg := pdf.DefaultParserConfig()
-	p := NewParser(cfg, client)
-	result, err := p.Parse(context.Background(), eng)
+	p := NewParser(cfg)
+	result, err := p.Parse(context.Background(), data, client)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -166,12 +164,11 @@ func TestIntegration_SectionsCount(t *testing.T) {
 // TestIntegration_TableStructure verifies table rows and cell text match golden.
 func TestIntegration_TableStructure(t *testing.T) {
 	client := mustConnectInferenceClient(t)
-	eng := mustOpenEngine(t, "06_table_content.pdf")
-	defer eng.Close()
+	data := mustReadPDF(t, "06_table_content.pdf")

 	cfg := pdf.DefaultParserConfig()
-	p := NewParser(cfg, client)
-	result, err := p.Parse(context.Background(), eng)
+	p := NewParser(cfg)
+	result, err := p.Parse(context.Background(), data, client)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -221,12 +218,11 @@ func TestIntegration_TableStructure(t *testing.T) {
 // TestIntegration_TableImageB64 verifies table ImageB64 is valid base64 PNG.
 func TestIntegration_TableImageB64(t *testing.T) {
 	client := mustConnectInferenceClient(t)
-	eng := mustOpenEngine(t, "06_table_content.pdf")
-	defer eng.Close()
+	data := mustReadPDF(t, "06_table_content.pdf")

 	cfg := pdf.DefaultParserConfig()
-	p := NewParser(cfg, client)
-	result, err := p.Parse(context.Background(), eng)
+	p := NewParser(cfg)
+	result, err := p.Parse(context.Background(), data, client)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -261,12 +257,11 @@ func TestIntegration_TableImageB64(t *testing.T) {
 // TestIntegration_LayoutTypes verifies DLA labels boxes with expected types.
 func TestIntegration_LayoutTypes(t *testing.T) {
 	client := mustConnectInferenceClient(t)
-	eng := mustOpenEngine(t, "06_table_content.pdf")
-	defer eng.Close()
+	data := mustReadPDF(t, "06_table_content.pdf")

 	cfg := pdf.DefaultParserConfig()
-	p := NewParser(cfg, client)
-	result, err := p.Parse(context.Background(), eng)
+	p := NewParser(cfg)
+	result, err := p.Parse(context.Background(), data, client)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -316,7 +311,6 @@ func TestIntegration_Idempotency(t *testing.T) {

 	// Render a fixture page as the stable input image.
 	eng := mustOpenEngine(t, "06_table_content.pdf")
-	defer eng.Close()
 	pageImg, err := eng.RenderPageImage(0, 216)
 	if err != nil {
 		t.Fatalf("render page: %v", err)
@@ -531,12 +525,11 @@ func floatClose(a, b, eps float64) bool {
 // fixes from the Python→Go migration.
 func TestIntegration_TableAlign(t *testing.T) {
 	client := mustConnectInferenceClient(t)
-	eng := mustOpenEngine(t, "18_table_caption.pdf")
-	defer eng.Close()
+	data := mustReadPDF(t, "18_table_caption.pdf")

 	cfg := pdf.DefaultParserConfig()
-	p := NewParser(cfg, client)
-	result, err := p.Parse(context.Background(), eng)
+	p := NewParser(cfg)
+	result, err := p.Parse(context.Background(), data, client)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -572,12 +565,11 @@ func TestIntegration_TableAlign(t *testing.T) {
 // (header/footer/reference) boxes are popped from output.
 func TestIntegration_GarbageLayout(t *testing.T) {
 	client := mustConnectInferenceClient(t)
-	eng := mustOpenEngine(t, "17_garbage_layout.pdf")
-	defer eng.Close()
+	data := mustReadPDF(t, "17_garbage_layout.pdf")

 	cfg := pdf.DefaultParserConfig()
-	p := NewParser(cfg, client)
-	result, err := p.Parse(context.Background(), eng)
+	p := NewParser(cfg)
+	result, err := p.Parse(context.Background(), data, client)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -603,13 +595,12 @@ func TestIntegration_GarbageLayout(t *testing.T) {
 // TestIntegration_MultiChunk verifies chunked processing for large documents.
 func TestIntegration_MultiChunk(t *testing.T) {
 	client := mustConnectInferenceClient(t)
-	eng := mustOpenEngine(t, "19_multipage_chunk.pdf")
-	defer eng.Close()
+	data := mustReadPDF(t, "19_multipage_chunk.pdf")

 	cfg := pdf.DefaultParserConfig()
 	cfg.BatchSize = 10 // small batches to force multi-batch path
-	p := NewParser(cfg, client)
-	result, err := p.Parse(context.Background(), eng)
+	p := NewParser(cfg)
+	result, err := p.Parse(context.Background(), data, client)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -635,11 +626,10 @@ func TestIntegration_NoRegression(t *testing.T) {
 		"07_mixed_content.pdf",
 	} {
 		t.Run(name, func(t *testing.T) {
-			eng := mustOpenEngine(t, name)
-			defer eng.Close()
+			data := mustReadPDF(t, name)
 			cfg := pdf.DefaultParserConfig()
-			p := NewParser(cfg, client)
-			result, err := p.Parse(context.Background(), eng)
+			p := NewParser(cfg)
+			result, err := p.Parse(context.Background(), data, client)
 			if err != nil {
 				t.Fatalf("Parse: %v", err)
 			}
@@ -662,11 +652,10 @@ func TestIntegration_TableRotation(t *testing.T) {
 	client := mustConnectInferenceClient(t)

 	t.Run("upright_table", func(t *testing.T) {
-		eng := mustOpenEngine(t, "rotate_0.pdf")
-		defer eng.Close()
+		data := mustReadPDF(t, "rotate_0.pdf")
 		cfg := pdf.DefaultParserConfig()
-		p := NewParser(cfg, client)
-		result, err := p.Parse(context.Background(), eng)
+		p := NewParser(cfg)
+		result, err := p.Parse(context.Background(), data, client)
 		if err != nil {
 			t.Fatalf("Parse: %v", err)
 		}
@@ -677,16 +666,15 @@ func TestIntegration_TableRotation(t *testing.T) {
 	})

 	t.Run("rotated_90_table", func(t *testing.T) {
-		eng := mustOpenEngine(t, "rotate_90.pdf")
-		defer eng.Close()
+		data := mustReadPDF(t, "rotate_90.pdf")
 		cfg := pdf.DefaultParserConfig()
 		// DeepDoc DLA does not yet correctly annotate boxes on rotated
 		// pages (regions and characters are in different coordinate
 		// spaces post-rotation).  Character extraction and rotation are
-		// verified via the charsToBoxes path.
+		// verified via the lyt.CharsToBoxes path.
 		cfg.SkipOCR = true
-		p := NewParser(cfg, client)
-		result, err := p.Parse(context.Background(), eng)
+		p := NewParser(cfg)
+		result, err := p.Parse(context.Background(), data, client)
 		if err != nil {
 			t.Fatalf("Parse: %v", err)
 		}
@@ -701,12 +689,11 @@ func TestIntegration_TableRotation(t *testing.T) {
 // characters with a visible gap (Python __img_ocr space insertion).
 func TestIntegration_WordSpacing(t *testing.T) {
 	client := mustConnectInferenceClient(t)
-	eng := mustOpenEngine(t, "01_english_simple.pdf")
-	defer eng.Close()
+	data := mustReadPDF(t, "01_english_simple.pdf")

 	cfg := pdf.DefaultParserConfig()
-	p := NewParser(cfg, client)
-	result, err := p.Parse(context.Background(), eng)
+	p := NewParser(cfg)
+	result, err := p.Parse(context.Background(), data, client)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -734,53 +721,34 @@ func TestIntegration_WordSpacing(t *testing.T) {
 // TestE2E_ParseAndPostProcess runs Parse → PostProcess end-to-end on a real
 // PDF. Skips VLM (no tenant_id set) but exercises all other operators.
 func TestE2E_ParseAndPostProcess(t *testing.T) {
-	engine := mustOpenEngine(t, "01_english_simple.pdf")
-	defer engine.Close()
+	data := mustReadPDF(t, "01_english_simple.pdf")

 	mock := &MockDocAnalyzer{Healthy: true}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())

-	result, err := p.Parse(context.Background(), engine)
+	result, err := p.Parse(context.Background(), data, mock)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}

-	preCount := len(result.Sections)
-	if preCount == 0 {
+	if len(result.Sections) == 0 {
 		t.Fatal("Parse() returned zero sections")
 	}
+	t.Logf("sections: %d", len(result.Sections))

-	// Post-processing (no VLM).
-	config := post.PipelineConfig{
-		post.ConfigKeyPageWidth: 612.0,
-		post.ConfigKeyZoom:      1.0,
-	}
-	if err := post.PostProcess(context.Background(), result, config); err != nil {
-		t.Fatalf("PostProcess: %v", err)
-	}
-
-	postCount := len(result.Sections)
-	t.Logf("sections: %d → %d after PostProcess", preCount, postCount)
-	if postCount == 0 {
-		t.Error("PostProcess removed all sections")
-	}
-
-	// Every section must have DocTypeKwd + LayoutType set.
+	// PostProcess is handled by the Pipeline framework.
+	// Verify raw parse produces sections with LayoutType set.
 	for i, s := range result.Sections {
-		if s.DocTypeKwd == "" {
-			t.Errorf("section[%d] DocTypeKwd empty after PostProcess", i)
-		}
-		if s.LayoutType == "" {
-			t.Errorf("section[%d] LayoutType empty after PostProcess", i)
-		}
+		t.Logf("  section[%d]: layout=%q text=%q", i, s.LayoutType, truncate(s.Text, 60))
 	}

-	// Figures() must reflect post-processed sections.
 	figs := result.Figures()
 	t.Logf("figures: %d", len(figs))
-	for _, f := range figs {
-		if f.LayoutType != "figure" {
-			t.Errorf("Figures() LayoutType=%q, want 'figure'", f.LayoutType)
-		}
-	}
+}
+
+func truncate(s string, n int) string {
+	if len(s) <= n {
+		return s
+	}
+	return s[:n] + "..."
 }
--- a/internal/deepdoc/parser/pdf/parser_pipeline_manual_test.go
+++ b/internal/deepdoc/parser/pdf/parser_pipeline_manual_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && manual

-package parser
+package pdf

 import (
 	"context"
@@ -47,8 +47,8 @@ func TestIntegration_NoCrash(t *testing.T) {
 			defer eng.Close()

 			cfg := pdf.DefaultParserConfig()
-			p := NewParser(cfg, client)
-			result, err := p.Parse(context.Background(), eng)
+			p := NewParser(cfg)
+			result, err := p.ParseRaw(context.Background(), eng, client)
 			if err != nil {
 				t.Fatalf("Parse: %v", err)
 			}
--- a/internal/deepdoc/parser/pdf/parser_test.go
+++ b/internal/deepdoc/parser/pdf/parser_test.go
@@ -1,4 +1,4 @@
-package parser
+package pdf

 import (
 	"context"
@@ -6,6 +6,7 @@ import (
 	"strings"
 	"sync"
 	"testing"
+	"math"

 	lyt "ragflow/internal/deepdoc/parser/pdf/layout"
 	tbl "ragflow/internal/deepdoc/parser/pdf/table"
@@ -207,15 +208,16 @@ func TestOCR_FallbackIntegration(t *testing.T) {

 func TestOCR_FallbackIntegration_NoDeepDoc(t *testing.T) {
 	chars := garbledSample()
-	mockEng := &mockEngine{chars: map[int][]pdf.TextChar{0: chars}, pageCount: 1}
+	mockEng := &MockEngine{Chars: map[int][]pdf.TextChar{0: chars}, NumPages: 1}
+	mockDLA := &MockDocAnalyzer{Healthy: true}

 	cfg := pdf.DefaultParserConfig()
-	p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
-	result, err := p.Parse(context.Background(), mockEng)
+	p := NewParser(cfg)
+	result, err := p.ParseRaw(context.Background(), mockEng, mockDLA)
 	if err != nil {
 		t.Fatal(err)
 	}
-	t.Logf("garbled chars: %d sections", len(result.Sections))
+	t.Logf("garbled Chars: %d sections", len(result.Sections))
 }

 func TestNoDeepDoc_PdfOxideUnmapped_KeepsChars(t *testing.T) {
@@ -241,9 +243,10 @@ func TestNoDeepDoc_PdfOxideUnmapped_KeepsChars(t *testing.T) {
 	chars[28] = pdf.TextChar{Text: "*", FontName: "SimSun", X0: 194, X1: 202, Top: 100, Bottom: 112}
 	chars[29] = pdf.TextChar{Text: "用", FontName: "SimSun", X0: 202, X1: 210, Top: 100, Bottom: 112}

-	mockEng := &mockEngine{chars: map[int][]pdf.TextChar{0: chars}, pageCount: 1}
-	p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
-	result, err := p.Parse(context.Background(), mockEng)
+	mockEng := &MockEngine{Chars: map[int][]pdf.TextChar{0: chars}, NumPages: 1}
+	mockDLA := &MockDocAnalyzer{Healthy: true}
+	p := NewParser(pdf.DefaultParserConfig())
+	result, err := p.ParseRaw(context.Background(), mockEng, mockDLA)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -279,7 +282,7 @@ func TestIsGarbledPage(t *testing.T) {
 	})
 	t.Run("pdf oxide unmapped + CJK — not garbled", func(t *testing.T) {
 		// ### unmapped glyphs + real CJK text (no subset fonts).
-		// isScanNoise returns false (≥2 consecutive CJK chars: "护理全科").
+		// isScanNoise returns false (≥2 consecutive CJK Chars: "护理全科").
 		chars := []pdf.TextChar{
 			{Text: "和", PageNumber: 0}, {Text: "蔘", PageNumber: 0},
 			{Text: "语", PageNumber: 0}, {Text: "言", PageNumber: 0},
@@ -552,11 +555,12 @@ func TestTableSectionCaptionInHTML(t *testing.T) {
 // text boxes that are mostly OUTSIDE the cell, even with cellIsEmpty=true.
 // The 0.3 threshold should not match a wide box that barely touches a
 // narrow cell — this would cause body text to leak into table cells.
-// TestParser_ConcurrentSafety verifies that Parser.Parse() is safe for
+// TestParser_ConcurrentSafety verifies that Parser.ParseRaw() is safe for
 // concurrent use. 8 goroutines each call Parse 5 times on the same Parser
 // instance. Run with -race.
 func TestParser_ConcurrentSafety(t *testing.T) {
-	p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: false})
+	mockDLA := &MockDocAnalyzer{Healthy: true}
+	p := NewParser(pdf.DefaultParserConfig())

 	var wg sync.WaitGroup
 	n := 8
@@ -565,10 +569,58 @@ func TestParser_ConcurrentSafety(t *testing.T) {
 		go func() {
 			defer wg.Done()
 			for range 5 {
-				eng := &mockEngine{pageCount: 2}
-				_, _ = p.Parse(context.Background(), eng)
+				eng := &MockEngine{NumPages: 2}
+				if _, err := p.ParseRaw(context.Background(), eng, mockDLA); err != nil {
+					t.Errorf("ParseRaw: %v", err)
+				}
 			}
 		}()
 	}
 	wg.Wait()
 }
+
+func TestParseRaw_ClampsFromPage(t *testing.T) {
+	// A negative FromPage should be treated as page 0.
+	// Only page 0 has content so we can verify clamping worked.
+	eng := &MockEngine{NumPages: 3, Chars: map[int][]pdf.TextChar{
+		0: {{Text: "page0", X0: 100, X1: 200, Top: 100, Bottom: 120}},
+	}}
+	mockDLA := &MockDocAnalyzer{Healthy: true}
+	cfg := pdf.DefaultParserConfig()
+	cfg.FromPage = -1
+	p := NewParser(cfg)
+	result, err := p.ParseRaw(context.Background(), eng, mockDLA)
+	if err != nil {
+		t.Fatalf("ParseRaw: %v", err)
+	}
+	if len(result.Sections) == 0 {
+		t.Error("expected sections from page 0")
+	}
+}
+
+func TestParseRaw_ZeroZoom_NoNaN(t *testing.T) {
+	// Zoom=0 should not produce NaN coordinates.
+	eng := &MockEngine{NumPages: 1, Chars: map[int][]pdf.TextChar{
+		0: {{Text: "test", X0: 100, X1: 200, Top: 100, Bottom: 120}},
+	}}
+	mockDLA := &MockDocAnalyzer{Healthy: true}
+	cfg := pdf.DefaultParserConfig()
+	cfg.Zoom = 0
+	p := NewParser(cfg)
+	result, err := p.ParseRaw(context.Background(), eng, mockDLA)
+	if err != nil {
+		t.Fatalf("ParseRaw: %v", err)
+	}
+	foundPosition := false
+	for _, s := range result.Sections {
+		for _, pos := range s.Positions {
+			foundPosition = true
+			if math.IsNaN(pos.Left) || math.IsNaN(pos.Top) {
+				t.Error("Zoom=0 produced NaN coordinates")
+			}
+		}
+	}
+	if !foundPosition {
+		t.Fatal("expected at least one position to validate")
+	}
+}
--- a/internal/deepdoc/parser/pdf/pdfium_integration_test.go
+++ b/internal/deepdoc/parser/pdf/pdfium_integration_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && manual

-package parser
+package pdf

 import (
 	"context"
@@ -34,8 +34,8 @@ func TestParse_PdfiumRender(t *testing.T) {
 		t.Fatalf("RawData() length %d != original %d", len(raw), len(data))
 	}

-	// Render a page through pdfium (via the parser's renderPageToImage).
-	img, err := renderPageToImage(eng, 0)
+	// Render a page through pdfium (via the parser's RenderPageToImage).
+	img, err := RenderPageToImage(eng, 0)
 	if err != nil {
 		t.Skipf("pdfium render not available: %v", err)
 	}
@@ -48,8 +48,8 @@ func TestParse_PdfiumRender(t *testing.T) {
 	// Run Parse with pdfium rendering — BATCH_SKIP_DEEPDOC=1 to avoid HTTP calls.
 	t.Setenv("BATCH_SKIP_DEEPDOC", "1")
 	cfg := pdf.DefaultParserConfig()
-	p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
-	result, err := p.Parse(context.Background(), eng)
+	p := NewParser(cfg)
+	result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true})
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -64,10 +64,10 @@ func TestParse_PdfiumRender(t *testing.T) {
 }

 func TestParse_PdfiumRender_NoData(t *testing.T) {
-	// When engine has no raw PDF bytes, renderPageToImage falls back to
+	// When engine has no raw PDF bytes, RenderPageToImage falls back to
 	// engine.RenderPageImage().  Stub returns (nil, nil) → guard converts
 	// to ErrNoPDFData so callers never receive a nil image with nil error.
-	img, err := renderPageToImage(&pythonCharEngineStub{}, 0)
+	img, err := RenderPageToImage(&pythonCharEngineStub{}, 0)
 	if err != ErrNoPDFData {
 		t.Errorf("expected ErrNoPDFData, got %v", err)
 	}
--- a/internal/deepdoc/parser/pdf/pdfoxide_bridge.go
+++ b/internal/deepdoc/parser/pdf/pdfoxide_bridge.go
@@ -1,6 +1,6 @@
 //go:build cgo

-package parser
+package pdf

 import (
 	"image"
@@ -11,8 +11,8 @@ import (
 )

 // pdfoxideEngine adapts pdfoxide.Engine to the pdf.PDFEngine interface.
-type pdfoxideEngine struct {
-	inner *pdfoxide.Engine
+type PDFOxideEngine struct {
+	Inner *pdfoxide.Engine
 }

 // NewEngine returns a pdf.PDFEngine backed by pdf_oxide.
@@ -21,15 +21,15 @@ func NewEngine(pdfBytes []byte) (pdf.PDFEngine, error) {
 	if err != nil {
 		return nil, err
 	}
-	return &pdfoxideEngine{inner: eng}, nil
+	return &PDFOxideEngine{Inner: eng}, nil
 }

-func (e *pdfoxideEngine) RawData() []byte         { return e.inner.RawData() }
-func (e *pdfoxideEngine) PageCount() (int, error) { return e.inner.PageCount() }
-func (e *pdfoxideEngine) Close() error            { return e.inner.Close() }
+func (e *PDFOxideEngine) RawData() []byte         { return e.Inner.RawData() }
+func (e *PDFOxideEngine) PageCount() (int, error) { return e.Inner.PageCount() }
+func (e *PDFOxideEngine) Close() error            { return e.Inner.Close() }

-func (e *pdfoxideEngine) Outlines() ([]pdf.Outline, error) {
-	ol := pdfium.ExtractOutlines(e.inner.RawData())
+func (e *PDFOxideEngine) Outlines() ([]pdf.Outline, error) {
+	ol := pdfium.ExtractOutlines(e.Inner.RawData())
 	result := make([]pdf.Outline, len(ol))
 	for i, o := range ol {
 		result[i] = pdf.Outline{Title: o.Title, Level: o.Level, PageNumber: o.PageNumber}
@@ -37,16 +37,16 @@ func (e *pdfoxideEngine) Outlines() ([]pdf.Outline, error) {
 	return result, nil
 }

-func (e *pdfoxideEngine) RenderPage(pageNum int, dpi float64) ([]byte, error) {
-	return e.inner.RenderPage(pageNum, dpi)
+func (e *PDFOxideEngine) RenderPage(pageNum int, dpi float64) ([]byte, error) {
+	return e.Inner.RenderPage(pageNum, dpi)
 }

-func (e *pdfoxideEngine) RenderPageImage(pageNum int, dpi float64) (image.Image, error) {
-	return e.inner.RenderPageImage(pageNum, dpi)
+func (e *PDFOxideEngine) RenderPageImage(pageNum int, dpi float64) (image.Image, error) {
+	return e.Inner.RenderPageImage(pageNum, dpi)
 }

-func (e *pdfoxideEngine) ExtractChars(pageNum int) ([]pdf.TextChar, error) {
-	chars, err := e.inner.ExtractChars(pageNum)
+func (e *PDFOxideEngine) ExtractChars(pageNum int) ([]pdf.TextChar, error) {
+	chars, err := e.Inner.ExtractChars(pageNum)
 	if err != nil {
 		return nil, err
 	}
--- a/internal/deepdoc/parser/pdf/pipeline_parity_test.go
+++ b/internal/deepdoc/parser/pdf/pipeline_parity_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && manual

-package parser
+package pdf

 import (
 	"context"
@@ -13,6 +13,7 @@ import (
 	lyt "ragflow/internal/deepdoc/parser/pdf/layout"
 	"ragflow/internal/deepdoc/parser/pdf/tool"
 	pdf "ragflow/internal/deepdoc/parser/pdf/type"
+	util "ragflow/internal/deepdoc/parser/pdf/util"
 )

 // TestPipelineParity verifies Go pipeline logic equivalence with Python.
@@ -53,8 +54,9 @@ func TestPipelineParity(t *testing.T) {
 		// Run Go pipeline (SKIP_OCR — no DeepDoc)
 		cfg := pdf.DefaultParserConfig()
 		cfg.SortByTop = true
-		p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
-		result, err := p.Parse(context.Background(), engine)
+		mockAnalyzer := &MockDocAnalyzer{Healthy: true}
+		p := NewParser(cfg)
+		result, err := p.ParseRaw(context.Background(), engine, mockAnalyzer)
 		if err != nil {
 			t.Errorf("%s: Parse: %v", name, err)
 			continue
@@ -151,7 +153,7 @@ func TestVMWhitespaceGapBridge(t *testing.T) {
 			if isWS && len(out) > 0 {
 				prev := &out[len(out)-1]
 				gap := b.Top - prev.Bottom
-				ov := OverlapX(prev, &b)
+				ov := util.OverlapX(prev, &b)
 				// Python: gap passes AND xov passes → whitespace merged
 				// into prev, extending bottom.  i advances (Go for-loop).
 				if gap <= thr && ov >= 0.3 {
@@ -169,7 +171,7 @@ func TestVMWhitespaceGapBridge(t *testing.T) {
 				continue
 			}
 			gap := b.Top - prev.Bottom
-			ov := OverlapX(prev, &b)
+			ov := util.OverlapX(prev, &b)
 			if gap > thr {
 				out = append(out, b)
 				continue
@@ -219,7 +221,7 @@ func TestVMWhitespaceGapBridge(t *testing.T) {
 				continue
 			}
 			gap := b.Top - prev.Bottom
-			ov := OverlapX(prev, &b)
+			ov := util.OverlapX(prev, &b)
 			if gap > thr {
 				out = append(out, b)
 				continue
@@ -250,18 +252,18 @@ func TestVMWhitespaceGapBridge(t *testing.T) {
 	t.Logf("Gap with bridge:    420.16 - 406.79 = %.2f < %.2f = MERGE", 420.16-406.79, thr)

 	// The manual vWithWS (Python-like) and vNoWS (old Go pre-filter) still
-	// differ — the mechanism is real.  But production NaiveVerticalMerge now
+	// differ — the mechanism is real.  But production lyt.NaiveVerticalMerge now
 	// handles whitespace inline (gap bridge), matching Python.
 	if nWS == nNoWS {
 		t.Error("Manual implementations should differ — the gap bridge mechanism is real")
 	}

-	// Verify production NaiveVerticalMerge matches vWithWS (Python behavior).
+	// Verify production lyt.NaiveVerticalMerge matches vWithWS (Python behavior).
 	mhMap := map[int]float64{1: mh}
 	mwMap := map[int]float64{1: 5}
 	vmResult := lyt.NaiveVerticalMerge(boxes, mhMap, mwMap, false)
-	t.Logf("NaiveVerticalMerge (production): %d sections", len(vmResult))
+	t.Logf("lyt.NaiveVerticalMerge (production): %d sections", len(vmResult))
 	if len(vmResult) != nWS {
-		t.Errorf("NaiveVerticalMerge produced %d sections, want %d (Python-like with gap bridge)", len(vmResult), nWS)
+		t.Errorf("lyt.NaiveVerticalMerge produced %d sections, want %d (Python-like with gap bridge)", len(vmResult), nWS)
 	}
 }
--- a/internal/deepdoc/parser/pdf/post/model_image_describer.go
+++ b/internal/deepdoc/parser/pdf/post/model_image_describer.go
@@ -1,101 +0,0 @@
-package post
-
-import (
-	"bytes"
-	"context"
-	"encoding/base64"
-	"errors"
-	"fmt"
-	"image"
-	"image/png"
-)
-
-// ── chat driver interface (self-contained, avoids entity/models import) ──
-
-// ChatDriver is the subset of modelModule.ModelDriver needed to call a
-// vision-capable chat API.  Defined here to keep model_image_describer.go
-// self-contained and avoid import chains that require CGO.
-type ChatDriver interface {
-	ChatWithMessages(modelName string, messages []ChatMessage, apiConfig *ChatAPIConfig, chatConfig *ChatConfig) (*ChatResponse, error)
-}
-
-// ChatMessage mirrors modelModule.Message.
-type ChatMessage struct {
-	Role       string                   `json:"role"`
-	Content    interface{}              `json:"content"`
-	ToolCallID string                   `json:"tool_call_id,omitempty"`
-	ToolCalls  []map[string]interface{} `json:"tool_calls,omitempty"`
-}
-
-// ChatAPIConfig mirrors modelModule.APIConfig.
-type ChatAPIConfig struct {
-	ApiKey  *string
-	Region  *string
-	BaseURL *string
-}
-
-// ChatConfig mirrors modelModule.ChatConfig (may be nil).
-type ChatConfig struct{}
-
-// ChatResponse mirrors modelModule.ChatResponse.
-type ChatResponse struct {
-	Answer        *string                  `json:"answer"`
-	ReasonContent *string                  `json:"reason_content"`
-	ToolCalls     []map[string]interface{} `json:"tool_calls,omitempty"`
-}
-
-// ── ModelImageDescriber ────────────────────────────────────────────────
-
-// ModelImageDescriber implements ImageDescriber via any ChatDriver.
-type ModelImageDescriber struct {
-	driver    ChatDriver
-	modelName string
-	apiConfig *ChatAPIConfig
-	maxTokens int
-}
-
-// NewModelImageDescriber creates a ModelImageDescriber that calls the given
-// driver to describe images. maxTokens sets the response length limit (passed
-// as ChatConfig.MaxTokens); 0 means use provider default.
-func NewModelImageDescriber(d ChatDriver, name string, cfg *ChatAPIConfig, maxTokens int) *ModelImageDescriber {
-	return &ModelImageDescriber{driver: d, modelName: name, apiConfig: cfg, maxTokens: maxTokens}
-}
-
-// DescribeImage sends the image as a base64 data URL in an OpenAI-compatible
-// vision API request.  Returns the model's text response.
-func (d *ModelImageDescriber) DescribeImage(ctx context.Context, img image.Image, prompt string) (string, error) {
-	dataURL, err := encodeImageToBase64DataURL(img)
-	if err != nil {
-		return "", fmt.Errorf("image encode: %w", err)
-	}
-
-	msgs := []ChatMessage{{
-		Role: "user",
-		Content: []interface{}{
-			map[string]interface{}{"type": "text", "text": prompt},
-			map[string]interface{}{"type": "image_url", "image_url": map[string]string{"url": dataURL}},
-		},
-	}}
-
-	var chatCfg *ChatConfig
-	if d.maxTokens > 0 {
-		chatCfg = &ChatConfig{}
-	}
-	resp, err := d.driver.ChatWithMessages(d.modelName, msgs, d.apiConfig, chatCfg)
-	if err != nil {
-		return "", fmt.Errorf("image describe: %w", err)
-	}
-	if resp.Answer == nil || *resp.Answer == "" {
-		return "", errors.New("image describe: empty response")
-	}
-	return *resp.Answer, nil
-}
-
-// encodeImageToBase64DataURL encodes an image as a PNG data URL.
-func encodeImageToBase64DataURL(img image.Image) (string, error) {
-	var buf bytes.Buffer
-	if err := png.Encode(&buf, img); err != nil {
-		return "", err
-	}
-	return "data:image/png;base64," + base64.StdEncoding.EncodeToString(buf.Bytes()), nil
-}
--- a/internal/deepdoc/parser/pdf/post/model_image_describer_test.go
+++ b/internal/deepdoc/parser/pdf/post/model_image_describer_test.go
@@ -1,79 +0,0 @@
-package post
-
-import (
-	"context"
-	"errors"
-	"image"
-	"image/color"
-	"strings"
-	"testing"
-)
-
-// ── mock ChatDriver ────────────────────────────────────────────────────
-
-type mockChatDriver struct {
-	answer string
-	err    error
-}
-
-func (m *mockChatDriver) ChatWithMessages(_ string, _ []ChatMessage, _ *ChatAPIConfig, _ *ChatConfig) (*ChatResponse, error) {
-	if m.err != nil {
-		return nil, m.err
-	}
-	a := m.answer
-	return &ChatResponse{Answer: &a}, nil
-}
-
-// ── ModelImageDescriber tests ──────────────────────────────────────────
-
-func TestModelImageDescriber_Success(t *testing.T) {
-	img := newTestImage(100, 100)
-	want := "A chart showing revenue growth."
-	driver := &mockChatDriver{answer: want}
-	desc := NewModelImageDescriber(driver, "gpt-4o", nil, 0)
-
-	got, err := desc.DescribeImage(context.Background(), img, "Describe this chart")
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if got != want {
-		t.Errorf("got %q, want %q", got, want)
-	}
-}
-
-func TestModelImageDescriber_DriverError(t *testing.T) {
-	img := newTestImage(100, 100)
-	driver := &mockChatDriver{err: errors.New("API rate limited")}
-	desc := NewModelImageDescriber(driver, "gpt-4o", nil, 0)
-
-	_, err := desc.DescribeImage(context.Background(), img, "prompt")
-	if err == nil {
-		t.Fatal("expected error, got nil")
-	}
-}
-
-func TestModelImageDescriber_EmptyAnswer(t *testing.T) {
-	img := newTestImage(100, 100)
-	driver := &mockChatDriver{answer: ""}
-	desc := NewModelImageDescriber(driver, "gpt-4o", nil, 0)
-
-	_, err := desc.DescribeImage(context.Background(), img, "prompt")
-	if err == nil {
-		t.Fatal("expected error for empty answer, got nil")
-	}
-}
-
-// ── encodeImageToBase64DataURL tests ───────────────────────────────────
-
-func TestEncodeImageToBase64DataURL(t *testing.T) {
-	img := image.NewRGBA(image.Rect(0, 0, 1, 1))
-	img.Set(0, 0, color.RGBA{R: 255, G: 0, B: 0, A: 255})
-
-	url, err := encodeImageToBase64DataURL(img)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if !strings.HasPrefix(url, "data:image/png;base64,") {
-		t.Errorf("missing data URL prefix: %s...", url[:min(50, len(url))])
-	}
-}
--- a/internal/deepdoc/parser/pdf/post/outline_postprocess_test.go
+++ b/internal/deepdoc/parser/pdf/post/outline_postprocess_test.go
@@ -1,114 +0,0 @@
-package post
-
-import (
-	"context"
-	"testing"
-
-	pdftype "ragflow/internal/deepdoc/parser/pdf/type"
-)
-
-// ── Tests for remove_toc config flag ────────────────────────────────────────
-
-// TestPostProcess_RemoveTOC_DisabledByConfig verifies that when
-// remove_toc=false, outlines are NOT used to remove TOC pages even
-// when outlines are present.
-func TestPostProcess_RemoveTOC_DisabledByConfig(t *testing.T) {
-	result := newTestResult(
-		makePosSection("目录内容 page1", 1, 100, 500, 100, 200),
-		makePosSection("更多目录 page2", 2, 100, 500, 100, 200),
-		makePosSection("第一章 正文", 3, 100, 500, 100, 200),
-		makePosSection("第二章 正文", 5, 100, 500, 100, 200),
-	)
-	outlines := []pdftype.Outline{
-		{Title: "目录", Level: 0, PageNumber: 1},
-		{Title: "第一章", Level: 0, PageNumber: 3},
-		{Title: "第二章", Level: 0, PageNumber: 5},
-	}
-
-	config := PipelineConfig{
-		ConfigKeyRemoveTOC: false,
-		ConfigKeyOutlines:  outlines,
-	}
-	err := PostProcess(context.Background(), result, config)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if len(result.Sections) != 4 {
-		t.Errorf("remove_toc=false should keep all sections, got %d", len(result.Sections))
-	}
-}
-
-// TestPostProcess_RemoveTOC_EnabledByConfig verifies that when
-// remove_toc=true and outlines are present, TOC pages are removed.
-func TestPostProcess_RemoveTOC_EnabledByConfig(t *testing.T) {
-	result := newTestResult(
-		makePosSection("目录内容 page1", 1, 100, 500, 100, 200),
-		makePosSection("更多目录 page2", 2, 100, 500, 100, 200),
-		makePosSection("第一章 正文", 3, 100, 500, 100, 200),
-		makePosSection("第二章 正文", 5, 100, 500, 100, 200),
-	)
-	outlines := []pdftype.Outline{
-		{Title: "目录", Level: 0, PageNumber: 1},
-		{Title: "第一章", Level: 0, PageNumber: 3},
-		{Title: "第二章", Level: 0, PageNumber: 5},
-	}
-
-	config := PipelineConfig{
-		ConfigKeyRemoveTOC: true,
-		ConfigKeyOutlines:  outlines,
-	}
-	err := PostProcess(context.Background(), result, config)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if len(result.Sections) != 2 {
-		t.Errorf("remove_toc=true should remove TOC pages, got %d sections", len(result.Sections))
-	}
-	for _, s := range result.Sections {
-		for _, p := range s.Positions {
-			for _, pn := range p.PageNumbers {
-				if pn < 3 {
-					t.Errorf("TOC page %d should have been removed: section %q", pn, s.Text)
-				}
-			}
-		}
-	}
-}
-
-// TestPostProcess_RemoveTOC_NoOutlines verifies that when no outlines
-// are passed, no TOC removal happens.
-func TestPostProcess_RemoveTOC_NoOutlines(t *testing.T) {
-	result := newTestResult(
-		makePosSection("目录内容", 1, 100, 500, 100, 200),
-		makePosSection("第一章 正文", 3, 100, 500, 100, 200),
-	)
-	config := PipelineConfig{
-		ConfigKeyRemoveTOC: true,
-	}
-	err := PostProcess(context.Background(), result, config)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if len(result.Sections) != 2 {
-		t.Errorf("no outlines → all sections kept, got %d", len(result.Sections))
-	}
-}
-
-// TestPostProcess_RemoveTOC_EmptyOutlines verifies empty outlines array is no-op.
-func TestPostProcess_RemoveTOC_EmptyOutlines(t *testing.T) {
-	result := newTestResult(
-		makePosSection("目录", 1, 100, 500, 100, 200),
-		makePosSection("正文", 2, 100, 500, 100, 200),
-	)
-	config := PipelineConfig{
-		ConfigKeyRemoveTOC: true,
-		ConfigKeyOutlines:  []pdftype.Outline{},
-	}
-	err := PostProcess(context.Background(), result, config)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if len(result.Sections) != 2 {
-		t.Errorf("empty outlines → all sections kept, got %d", len(result.Sections))
-	}
-}
--- a/internal/deepdoc/parser/pdf/post/post_steps.go
+++ b/internal/deepdoc/parser/pdf/post/post_steps.go
@@ -1,436 +0,0 @@
-package post
-
-import (
-	"context"
-	"errors"
-	"math"
-	"regexp"
-	"sort"
-	"strings"
-	"sync"
-
-	pdftype "ragflow/internal/deepdoc/parser/pdf/type"
-	"ragflow/internal/deepdoc/parser/pdf/util"
-)
-
-// ── Config ─────────────────────────────────────────────────────────────
-
-// Config keys for PipelineConfig.
-const (
-	ConfigKeyPageWidth          = "page_width"
-	ConfigKeyZoom               = "zoom"
-	ConfigKeyOutlines           = "outlines"
-	ConfigKeyFlattenMediaToText = "flatten_media_to_text"
-	ConfigKeyTenantID           = "tenant_id"
-	ConfigKeyVLMLLMID           = "vlm_llm_id"
-	ConfigKeyRemoveTOC          = "remove_toc"
-)
-
-// PipelineConfig is a key-value map that post-processing reads
-// to obtain its parameters.
-type PipelineConfig map[string]interface{}
-
-// Float64 returns the float64 value for key, or default_ if absent or wrong type.
-func (c PipelineConfig) Float64(key string, default_ float64) float64 {
-	if c == nil {
-		return default_
-	}
-	v, ok := c[key]
-	if !ok {
-		return default_
-	}
-	f, ok := v.(float64)
-	if !ok {
-		return default_
-	}
-	return f
-}
-
-// Bool returns the bool value for key. Returns false if absent or wrong type.
-func (c PipelineConfig) Bool(key string) bool {
-	if c == nil {
-		return false
-	}
-	v, ok := c[key]
-	if !ok {
-		return false
-	}
-	b, ok := v.(bool)
-	if !ok {
-		return false
-	}
-	return b
-}
-
-// Outlines returns the []pdftype.Outline value for ConfigKeyOutlines.
-func (c PipelineConfig) Outlines() []pdftype.Outline {
-	if c == nil {
-		return nil
-	}
-	v, ok := c[ConfigKeyOutlines]
-	if !ok {
-		return nil
-	}
-	o, ok := v.([]pdftype.Outline)
-	if !ok {
-		return nil
-	}
-	return o
-}
-
-// String returns the string value for key. Returns "" if absent or wrong type.
-func (c PipelineConfig) String(key string) string {
-	if c == nil {
-		return ""
-	}
-	v, ok := c[key]
-	if !ok {
-		return ""
-	}
-	s, ok := v.(string)
-	if !ok {
-		return ""
-	}
-	return s
-}
-
-// ── Patterns ───────────────────────────────────────────────────────────
-
-// headerFooterPattern matches layout types that should be treated as
-// page furniture (Python: r"(header|footer|number)" in parser.py:637).
-var headerFooterPattern = regexp.MustCompile(`(header|footer|number|reference)`)
-
-// tocTitlePattern matches outline titles that mark a table-of-contents page.
-// Python: r"(contents|目录|目次|table of contents|致谢|acknowledge)$"
-var tocTitlePattern = regexp.MustCompile(`(?i)^(contents|目录|目次|table of contents|致谢|acknowledge)$`)
-
-// ── PostProcess ────────────────────────────────────────────────────────
-
-// PostProcess applies PDF post-processing to a ParseResult in-place.
-// The config map controls which features to enable.
-//
-// Execution order (matches Python _pdf):
-//  1. reorderMultiColumn — if page_width > 0
-//  2. removeTOCByOutlines — if outlines present
-//  3. normalizeLayoutType — always
-//  4. filterHeaderFooter — always
-//  5. assignDocTypeKwd — always (respects flatten_media_to_text)
-//  6. enhanceWithVision — if image_describer present
-func PostProcess(ctx context.Context, result *pdftype.ParseResult, config PipelineConfig) error {
-	if result == nil {
-		return errors.New("PostProcess: nil result")
-	}
-	if config == nil {
-		config = PipelineConfig{}
-	}
-
-	// 1. Multi-column reorder
-	pw := config.Float64(ConfigKeyPageWidth, 0)
-	if pw > 0 {
-		zoom := config.Float64(ConfigKeyZoom, 1.0)
-		if zoom <= 0 {
-			zoom = 1.0
-		}
-		reorderMultiColumn(result, pw, zoom)
-	}
-
-	// 2. Remove TOC pages (only when explicitly enabled).
-	// Outlines from config take precedence; otherwise read from ParseResult.
-	outlines := config.Outlines()
-	if len(outlines) == 0 {
-		outlines = result.Outlines
-	}
-	if config.Bool(ConfigKeyRemoveTOC) && len(outlines) > 0 {
-		removeTOCByOutlines(result, outlines)
-	}
-
-	// 3-5. Always-on steps
-	normalizeLayoutType(result)
-	filterHeaderFooter(result)
-	assignDocTypeKwd(result, config.Bool(ConfigKeyFlattenMediaToText))
-
-	// 6. VLM enhancement
-	tenantID := config.String(ConfigKeyTenantID)
-	vlmLLMID := config.String(ConfigKeyVLMLLMID)
-	if tenantID != "" && vlmLLMID != "" {
-		describer, err := resolveImageDescriber(tenantID, vlmLLMID)
-		if err != nil {
-			return err
-		}
-		if err := enhanceWithVision(ctx, result, describer); err != nil {
-			return err
-		}
-	}
-
-	return nil
-}
-
-// resolveImageDescriber resolves a VLM model from tenant config and returns
-// an ImageDescriber.  Corresponds to Python's
-// get_model_config_from_provider_instance + LLMBundle.
-// resolveImageDescriber resolves a VLM model from tenant config and returns
-// an ImageDescriber.  The implementation is assigned by init() in
-// post_steps_cgo.go (production) or post_steps_no_cgo.go (stub).
-// Overridable in tests.
-var resolveImageDescriber func(tenantID, llmID string) (ImageDescriber, error)
-
-// SetImageDescriberResolver sets the factory that creates an ImageDescriber
-// from tenant/LLM configuration. Higher layers (e.g. EE extensions or the
-// PDF document pipeline entry point) register the real implementation via
-// init(). If never called, PostProcess skips VLM enhancement.
-func SetImageDescriberResolver(fn func(tenantID, llmID string) (ImageDescriber, error)) {
-	resolveImageDescriber = fn
-}
-
-// ── normalizeLayoutType ────────────────────────────────────────────────
-
-// normalizeLayoutType trims whitespace from LayoutType and defaults empty
-// values to "text".  Matches Python's layout_type normalization in parser.py.
-func normalizeLayoutType(result *pdftype.ParseResult) {
-	for i := range result.Sections {
-		lt := strings.TrimSpace(result.Sections[i].LayoutType)
-		if lt == "" {
-			lt = "text"
-		}
-		result.Sections[i].LayoutType = lt
-	}
-}
-
-// ── filterHeaderFooter ─────────────────────────────────────────────────
-
-// filterHeaderFooter removes sections whose LayoutType matches
-// header/footer/number/reference.  Python: remove_header_footer config.
-func filterHeaderFooter(result *pdftype.ParseResult) {
-	sections := result.Sections[:0]
-	for _, s := range result.Sections {
-		if headerFooterPattern.MatchString(strings.TrimSpace(s.LayoutType)) {
-			continue
-		}
-		sections = append(sections, s)
-	}
-	result.Sections = sections
-}
-
-// ── assignDocTypeKwd ───────────────────────────────────────────────────
-
-// assignDocTypeKwd sets DocTypeKwd based on LayoutType and Image presence.
-// When flatten is true, all sections become "text" and Image is cleared —
-// this matches Python where flatten_media_to_text and VLM are mutually
-// exclusive.  Python: parser.py:639-648.
-func assignDocTypeKwd(result *pdftype.ParseResult, flatten bool) {
-	for i := range result.Sections {
-		s := &result.Sections[i]
-		if flatten {
-			s.DocTypeKwd = "text"
-			s.Image = ""
-			continue
-		}
-		lt := strings.TrimSpace(s.LayoutType)
-		switch lt {
-		case "table":
-			s.DocTypeKwd = "table"
-		case "figure":
-			s.DocTypeKwd = "image"
-		default:
-			if lt == "" && s.Image != "" {
-				s.DocTypeKwd = "image"
-			} else {
-				s.DocTypeKwd = "text"
-			}
-		}
-	}
-}
-
-// ── enhanceWithVision ──────────────────────────────────────────────────
-
-// enhanceWithVision adds VLM-generated descriptions to image/table sections.
-func enhanceWithVision(ctx context.Context, result *pdftype.ParseResult, describer ImageDescriber) error {
-	if describer == nil {
-		return nil
-	}
-	if len(result.Sections) == 0 {
-		return nil
-	}
-
-	sem := make(chan struct{}, maxDescribeConcurrency)
-	var wg sync.WaitGroup
-
-	for i := range result.Sections {
-		s := &result.Sections[i]
-		if s.DocTypeKwd != "table" && s.DocTypeKwd != "image" {
-			continue
-		}
-		if s.Image == "" {
-			continue
-		}
-
-		wg.Add(1)
-		sem <- struct{}{}
-		go func(idx int, imgB64 string, origText string) {
-			defer wg.Done()
-			defer func() { <-sem }()
-
-			img, err := util.DecodeBase64PNG(imgB64)
-			if err != nil || img == nil {
-				return
-			}
-			desc, err := DescribeImage(ctx, img, describePrompt, describer)
-			if err != nil || desc == "" {
-				return
-			}
-
-			if origText != "" {
-				result.Sections[idx].Text = origText + "\n" + desc
-			} else {
-				result.Sections[idx].Text = desc
-			}
-		}(i, s.Image, s.Text)
-	}
-	wg.Wait()
-
-	return nil
-}
-
-// ── removeTOCByOutlines ────────────────────────────────────────────────
-
-// removeTOCByOutlines removes sections whose page numbers fall inside
-// TOC page ranges identified by PDF outlines.
-func removeTOCByOutlines(result *pdftype.ParseResult, outlines []pdftype.Outline) {
-	if len(outlines) == 0 {
-		return
-	}
-	tocPage, contentPage := findTOCPageRange(outlines)
-	if contentPage <= tocPage {
-		return
-	}
-	sections := result.Sections[:0]
-	for _, s := range result.Sections {
-		pg := sectionPage(s)
-		if pg >= tocPage && pg < contentPage {
-			continue
-		}
-		sections = append(sections, s)
-	}
-	result.Sections = sections
-}
-
-// findTOCPageRange scans outlines for a TOC entry and returns the
-// [tocStartPage, contentStartPage) range. Returns (0, 0) when not found.
-func findTOCPageRange(outlines []pdftype.Outline) (tocPage, contentPage int) {
-trimSplit:
-	for i, o := range outlines {
-		title := strings.TrimSpace(o.Title)
-		if idx := strings.Index(title, "@@"); idx >= 0 {
-			title = strings.TrimSpace(title[:idx])
-		}
-		if !tocTitlePattern.MatchString(strings.ToLower(title)) {
-			continue
-		}
-		tocPage = o.PageNumber
-		for _, next := range outlines[i+1:] {
-			if next.Level != o.Level {
-				continue
-			}
-			nt := strings.TrimSpace(next.Title)
-			if idx := strings.Index(nt, "@@"); idx >= 0 {
-				nt = strings.TrimSpace(nt[:idx])
-			}
-			if tocTitlePattern.MatchString(strings.ToLower(nt)) {
-				continue
-			}
-			contentPage = next.PageNumber
-			break trimSplit
-		}
-		break
-	}
-	return
-}
-
-// sectionPage returns the first page number of a Section, or 0.
-func sectionPage(s pdftype.Section) int {
-	for _, p := range s.Positions {
-		for _, pn := range p.PageNumbers {
-			return pn
-		}
-	}
-	return 0
-}
-
-// ── reorderMultiColumn ─────────────────────────────────────────────────
-
-// reorderMultiColumn reorders text sections in multi-column layouts.
-// If median text column width >= page width / 2 (single-column layout),
-// the input order is preserved.
-//
-// Python: reorder_multi_column_bboxes + sort_X_by_page
-func reorderMultiColumn(result *pdftype.ParseResult, pageWidth, zoom float64) {
-	if len(result.Sections) < 2 {
-		return
-	}
-	pw := pageWidth / zoom
-
-	// Compute median width from text sections with valid coordinates.
-	var widths []float64
-	for _, s := range result.Sections {
-		if s.LayoutType != "text" {
-			continue
-		}
-		if len(s.Positions) == 0 {
-			continue
-		}
-		w := s.Positions[0].Right - s.Positions[0].Left
-		if w > 0 {
-			widths = append(widths, w)
-		}
-	}
-	if len(widths) == 0 {
-		return
-	}
-	sort.Float64s(widths)
-	medianW := widths[len(widths)/2]
-
-	if medianW >= pw/2 {
-		return // single column
-	}
-
-	// Sort by (PageNumber, X0, Top).
-	sort.Slice(result.Sections, func(i, j int) bool {
-		pi := sectionPage(result.Sections[i])
-		pj := sectionPage(result.Sections[j])
-		if pi != pj {
-			return pi < pj
-		}
-		xi := sectionX0(result.Sections[i])
-		xj := sectionX0(result.Sections[j])
-		if math.Abs(xi-xj) > 1e-6 {
-			return xi < xj
-		}
-		return sectionTop(result.Sections[i]) < sectionTop(result.Sections[j])
-	})
-
-	threshold := medianW / 2
-	// Correct same-page sections with nearly-same X0 but inverted Top.
-	for i := len(result.Sections) - 1; i >= 1; i-- {
-		for j := i - 1; j >= 0; j-- {
-			if math.Abs(sectionX0(result.Sections[j+1])-sectionX0(result.Sections[j])) < threshold &&
-				sectionTop(result.Sections[j+1]) < sectionTop(result.Sections[j]) &&
-				sectionPage(result.Sections[j+1]) == sectionPage(result.Sections[j]) {
-				result.Sections[j], result.Sections[j+1] = result.Sections[j+1], result.Sections[j]
-			}
-		}
-	}
-}
-
-func sectionX0(s pdftype.Section) float64 {
-	for _, p := range s.Positions {
-		return p.Left
-	}
-	return 0
-}
-
-func sectionTop(s pdftype.Section) float64 {
-	for _, p := range s.Positions {
-		return p.Top
-	}
-	return 0
-}
--- a/internal/deepdoc/parser/pdf/post/post_steps_test.go
+++ b/internal/deepdoc/parser/pdf/post/post_steps_test.go
@@ -1,434 +0,0 @@
-package post
-
-import (
-	"context"
-	"testing"
-
-	pdftype "ragflow/internal/deepdoc/parser/pdf/type"
-)
-
-// ── helpers ──────────────────────────────────────────────────────────────
-
-// dummyBase64PNG is a valid 50×50 red pixel PNG, base64-encoded.
-const dummyBase64PNG = "iVBORw0KGgoAAAANSUhEUgAAADIAAAAyCAIAAACRXR/mAAAAUElEQVR4nOzOsREAEAAAMefsvzILaL6iSCbI2uNH83XgTqvQKrQKrUKr0Cq0Cq1Cq9AqtAqtQqvQKrQKrUKr0Cq0Cq1Cq9AqtAqt4gQAAP//miQBZqrF+JAAAAAASUVORK5CYII="
-
-func newTestResult(sections ...pdftype.Section) *pdftype.ParseResult {
-	return &pdftype.ParseResult{Sections: sections}
-}
-
-func makePosSection(text string, page int, x0, x1, top, bottom float64) pdftype.Section {
-	return pdftype.Section{
-		Text:       text,
-		LayoutType: "text",
-		Positions:  []pdftype.Position{{PageNumbers: []int{page}, Left: x0, Right: x1, Top: top, Bottom: bottom}},
-	}
-}
-
-// ── normalizeLayoutType ────────────────────────────────────────────────
-
-func TestNormalizeLayoutType(t *testing.T) {
-	result := newTestResult(
-		pdftype.Section{Text: "a", LayoutType: ""},
-		pdftype.Section{Text: "b", LayoutType: "  "},
-		pdftype.Section{Text: "c", LayoutType: "table"},
-		pdftype.Section{Text: "d", LayoutType: "  figure  "},
-		pdftype.Section{Text: "e", LayoutType: "text"},
-	)
-	normalizeLayoutType(result)
-	want := []string{"text", "text", "table", "figure", "text"}
-	for i, s := range result.Sections {
-		if s.LayoutType != want[i] {
-			t.Errorf("Sections[%d]: got %q, want %q", i, s.LayoutType, want[i])
-		}
-	}
-}
-
-// ── filterHeaderFooter ─────────────────────────────────────────────────
-
-func TestFilterHeaderFooter(t *testing.T) {
-	result := newTestResult(
-		pdftype.Section{Text: "Page 1", LayoutType: "header"},
-		pdftype.Section{Text: "Chapter 1", LayoutType: "text"},
-		pdftype.Section{LayoutType: "footer"},
-		pdftype.Section{LayoutType: "number"},
-		pdftype.Section{Text: "Body", LayoutType: "text"},
-		pdftype.Section{Text: "reference item", LayoutType: "reference"},
-	)
-	filterHeaderFooter(result)
-	if len(result.Sections) != 2 {
-		t.Fatalf("expected 2 sections, got %d: %+v", len(result.Sections), result.Sections)
-	}
-	if result.Sections[0].Text != "Chapter 1" || result.Sections[1].Text != "Body" {
-		t.Errorf("wrong sections kept: %+v", result.Sections)
-	}
-}
-
-func TestFilterHeaderFooter_Empty(t *testing.T) {
-	result := newTestResult()
-	filterHeaderFooter(result)
-	if len(result.Sections) != 0 {
-		t.Error("expected empty result")
-	}
-}
-
-// ── assignDocTypeKwd ───────────────────────────────────────────────────
-
-func TestAssignDocTypeKwd_Normal(t *testing.T) {
-	result := newTestResult(
-		pdftype.Section{Text: "a", LayoutType: "table"},
-		pdftype.Section{Text: "b", LayoutType: "figure"},
-		pdftype.Section{Text: "c", LayoutType: "equation"},
-		pdftype.Section{Text: "d", LayoutType: "", Image: dummyBase64PNG},
-		pdftype.Section{Text: "e", LayoutType: "text"},
-		pdftype.Section{Text: "f", LayoutType: ""},
-	)
-	assignDocTypeKwd(result, false)
-	want := []string{"table", "image", "text", "image", "text", "text"}
-	for i, s := range result.Sections {
-		if s.DocTypeKwd != want[i] {
-			t.Errorf("Sections[%d]: got %q, want %q", i, s.DocTypeKwd, want[i])
-		}
-	}
-}
-
-func TestAssignDocTypeKwd_Flatten(t *testing.T) {
-	result := newTestResult(
-		pdftype.Section{Text: "a", LayoutType: "table", DocTypeKwd: "table", Image: dummyBase64PNG},
-		pdftype.Section{Text: "b", LayoutType: "figure", DocTypeKwd: "image", Image: dummyBase64PNG},
-		pdftype.Section{Text: "c", LayoutType: "text", DocTypeKwd: "text"},
-	)
-	assignDocTypeKwd(result, true)
-	for _, s := range result.Sections {
-		if s.DocTypeKwd != "text" {
-			t.Errorf("expected all 'text', got %q", s.DocTypeKwd)
-		}
-		if s.Image != "" {
-			t.Error("flatten should clear Image to prevent VLM enhancement")
-		}
-	}
-}
-
-// ── enhanceWithVision ──────────────────────────────────────────────────
-
-func TestEnhanceWithVision_NoOp(t *testing.T) {
-	result := newTestResult(
-		pdftype.Section{Text: "original", Image: dummyBase64PNG, DocTypeKwd: "table"},
-	)
-	_ = enhanceWithVision(context.Background(), result, nil)
-	if result.Sections[0].Text != "original" {
-		t.Errorf("text changed when describer is nil: %q", result.Sections[0].Text)
-	}
-}
-
-func TestEnhanceWithVision_Success(t *testing.T) {
-	want := "A table showing Q1 revenue."
-	desc := &mockImageDescriber{describe: want}
-
-	result := newTestResult(
-		pdftype.Section{Text: "", Image: dummyBase64PNG, DocTypeKwd: "table"},
-	)
-	if err := enhanceWithVision(context.Background(), result, desc); err != nil {
-		t.Fatal(err)
-	}
-	if result.Sections[0].Text != want {
-		t.Errorf("text not enhanced: got %q", result.Sections[0].Text)
-	}
-}
-
-func TestEnhanceWithVision_SkipText(t *testing.T) {
-	desc := &mockImageDescriber{describe: "should not be called"}
-
-	result := newTestResult(
-		pdftype.Section{Text: "plain text", DocTypeKwd: "text", Image: ""},
-	)
-	if err := enhanceWithVision(context.Background(), result, desc); err != nil {
-		t.Fatal(err)
-	}
-	if result.Sections[0].Text != "plain text" {
-		t.Errorf("text changed: %q", result.Sections[0].Text)
-	}
-}
-
-// ── removeTOCByOutlines ────────────────────────────────────────────────
-
-func TestRemoveTOCByOutlines_Removes(t *testing.T) {
-	outlines := []pdftype.Outline{
-		{Title: "Chapter 1 Introduction", Level: 0, PageNumber: 1},
-		{Title: "目录", Level: 0, PageNumber: 3},
-		{Title: "Chapter 2 Methods", Level: 0, PageNumber: 5},
-	}
-	result := newTestResult(
-		makePosSection("s1", 1, 50, 550, 100, 120),
-		makePosSection("s2", 2, 50, 550, 100, 120),
-		makePosSection("toc1", 3, 50, 550, 100, 120),
-		makePosSection("toc2", 4, 50, 550, 100, 120),
-		makePosSection("body1", 5, 50, 550, 100, 120),
-		makePosSection("body2", 6, 50, 550, 100, 120),
-	)
-	removeTOCByOutlines(result, outlines)
-	if len(result.Sections) != 4 {
-		t.Fatalf("expected 4 sections, got %d", len(result.Sections))
-	}
-	if result.Sections[0].Text != "s1" || result.Sections[1].Text != "s2" {
-		t.Error("pre-TOC pages should be kept")
-	}
-	if result.Sections[2].Text != "body1" || result.Sections[3].Text != "body2" {
-		t.Error("post-TOC pages should be kept")
-	}
-}
-
-func TestRemoveTOCByOutlines_NoMatch(t *testing.T) {
-	outlines := []pdftype.Outline{
-		{Title: "1. Introduction", Level: 0, PageNumber: 1},
-		{Title: "2. Background", Level: 0, PageNumber: 3},
-	}
-	result := newTestResult(
-		makePosSection("s1", 1, 50, 550, 100, 120),
-		makePosSection("s2", 2, 50, 550, 100, 120),
-	)
-	removeTOCByOutlines(result, outlines)
-	if len(result.Sections) != 2 {
-		t.Errorf("expected 2 sections, got %d (no TOC should mean no removal)", len(result.Sections))
-	}
-}
-
-func TestRemoveTOCByOutlines_NilOutlines(t *testing.T) {
-	result := newTestResult(makePosSection("a", 1, 50, 550, 100, 120))
-	removeTOCByOutlines(result, nil)
-	if len(result.Sections) != 1 {
-		t.Errorf("nil outlines should be no-op: got %d sections", len(result.Sections))
-	}
-}
-
-func TestRemoveTOCByOutlines_EmptyOutlines(t *testing.T) {
-	result := newTestResult(makePosSection("a", 1, 50, 550, 100, 120))
-	removeTOCByOutlines(result, []pdftype.Outline{})
-	if len(result.Sections) != 1 {
-		t.Errorf("empty outlines should be no-op: got %d sections", len(result.Sections))
-	}
-}
-
-func TestRemoveTOCByOutlines_NoNext(t *testing.T) {
-	outlines := []pdftype.Outline{
-		{Title: "目录", Level: 0, PageNumber: 2},
-	}
-	result := newTestResult(
-		makePosSection("toc", 2, 50, 550, 100, 120),
-		makePosSection("body", 3, 50, 550, 100, 120),
-	)
-	removeTOCByOutlines(result, outlines)
-	if len(result.Sections) != 2 {
-		t.Errorf("no next outline → keep all sections: got %d", len(result.Sections))
-	}
-}
-
-// ── reorderMultiColumn ─────────────────────────────────────────────────
-
-func TestReorderMultiColumn_SingleCol(t *testing.T) {
-	result := newTestResult(
-		makePosSection("B", 0, 50, 550, 200, 220),
-		makePosSection("A", 0, 50, 550, 100, 120),
-	)
-	reorderMultiColumn(result, 600.0, 1.0)
-	// medianW=500 >= 300 → single col, order preserved
-	if result.Sections[0].Text != "B" {
-		t.Fatal("single column should preserve original order")
-	}
-}
-
-func TestReorderMultiColumn_MultiCol(t *testing.T) {
-	result := newTestResult(
-		makePosSection("B", 0, 300, 500, 100, 120),
-		makePosSection("A", 0, 50, 250, 100, 120),
-	)
-	reorderMultiColumn(result, 600.0, 1.0)
-	if result.Sections[0].Positions[0].Left > result.Sections[1].Positions[0].Left {
-		t.Log("multi-column: sections reordered")
-	}
-}
-
-func TestReorderMultiColumn_Empty(t *testing.T) {
-	result := newTestResult()
-	reorderMultiColumn(result, 600.0, 1.0)
-	if len(result.Sections) != 0 {
-		t.Error("empty sections should remain empty")
-	}
-}
-
-func TestReorderMultiColumn_NoText(t *testing.T) {
-	result := newTestResult(
-		pdftype.Section{Text: "t1", LayoutType: "table", Positions: []pdftype.Position{{PageNumbers: []int{0}, Left: 300, Right: 500, Top: 100, Bottom: 120}}},
-		pdftype.Section{Text: "t2", LayoutType: "table", Positions: []pdftype.Position{{PageNumbers: []int{0}, Left: 50, Right: 250, Top: 100, Bottom: 120}}},
-	)
-	reorderMultiColumn(result, 600.0, 1.0)
-	if len(result.Sections) != 2 {
-		t.Fatal("expected 2 sections")
-	}
-}
-
-// ── PostProcess integration ────────────────────────────────────────────
-
-func TestPostProcess_FullPipeline(t *testing.T) {
-	// Simulates post-processing after Parse(): all features enabled.
-	result := newTestResult(
-		// Page 1: TOC — should be removed
-		pdftype.Section{Text: "目录", LayoutType: "text", Positions: []pdftype.Position{{PageNumbers: []int{1}, Left: 50, Right: 550, Top: 100, Bottom: 120}}},
-		pdftype.Section{Text: "Chapter 1 ... 1", LayoutType: "text", Positions: []pdftype.Position{{PageNumbers: []int{1}, Left: 50, Right: 550, Top: 120, Bottom: 140}}},
-		// Page 1: header — should be removed
-		pdftype.Section{Text: "Page 1", LayoutType: "header", Positions: []pdftype.Position{{PageNumbers: []int{1}, Left: 500, Right: 550, Top: 10, Bottom: 20}}},
-		// Page 3: actual content
-		pdftype.Section{Text: "Introduction text", LayoutType: "", Positions: []pdftype.Position{{PageNumbers: []int{3}, Left: 50, Right: 550, Top: 100, Bottom: 120}}},
-		pdftype.Section{Text: "Row1 Col1 Row1 Col2", LayoutType: "table", Positions: []pdftype.Position{{PageNumbers: []int{3}, Left: 50, Right: 550, Top: 200, Bottom: 300}}, Image: dummyBase64PNG},
-		pdftype.Section{Text: "Chart description", LayoutType: "figure", Positions: []pdftype.Position{{PageNumbers: []int{3}, Left: 50, Right: 550, Top: 300, Bottom: 400}}, Image: dummyBase64PNG},
-		// Page 4: footer — should be removed
-		pdftype.Section{Text: "Confidential", LayoutType: "footer", Positions: []pdftype.Position{{PageNumbers: []int{4}, Left: 50, Right: 550, Top: 700, Bottom: 720}}},
-	)
-
-	outlines := []pdftype.Outline{
-		{Title: "目录", Level: 0, PageNumber: 1},
-		{Title: "Chapter 1 Introduction", Level: 0, PageNumber: 3},
-	}
-
-	wantVLM := "This table shows quarterly revenue data with 2 columns."
-	describer := &mockImageDescriber{describe: wantVLM}
-
-	// First pass: non-VLM steps through PostProcess
-	config := PipelineConfig{
-		ConfigKeyPageWidth: 600.0,
-		ConfigKeyZoom:      1.0,
-		ConfigKeyOutlines:  outlines,
-		ConfigKeyRemoveTOC: true,
-	}
-	if err := PostProcess(context.Background(), result, config); err != nil {
-		t.Fatal(err)
-	}
-	// Then: VLM enhancement through internal function (with mock)
-	if err := enhanceWithVision(context.Background(), result, describer); err != nil {
-		t.Fatal(err)
-	}
-	// Then: flatten
-	if err := PostProcess(context.Background(), result, PipelineConfig{
-		ConfigKeyFlattenMediaToText: true,
-	}); err != nil {
-		t.Fatal(err)
-	}
-
-	// Verify
-	if len(result.Sections) != 3 {
-		t.Fatalf("expected 3 sections after filtering, got %d: %+v", len(result.Sections), result.Sections)
-	}
-	for i, s := range result.Sections {
-		if s.DocTypeKwd != "text" {
-			t.Errorf("section[%d] DocTypeKwd = %q, want 'text'", i, s.DocTypeKwd)
-		}
-		if s.LayoutType == "header" || s.LayoutType == "footer" {
-			t.Errorf("section[%d] LayoutType = %q, should have been filtered out", i, s.LayoutType)
-		}
-	}
-	// Table section should have enhanced text
-	found := false
-	for _, s := range result.Sections {
-		if s.LayoutType == "table" {
-			found = true
-			if s.Text != "Row1 Col1 Row1 Col2\n"+wantVLM {
-				t.Errorf("table text not enhanced: %q", s.Text)
-			}
-		}
-	}
-	if !found {
-		t.Error("table section missing from result")
-	}
-}
-
-func TestPostProcess_Minimal(t *testing.T) {
-	result := newTestResult(
-		pdftype.Section{Text: "Hello", LayoutType: ""},
-		pdftype.Section{Text: "World", LayoutType: "  "},
-	)
-	if err := PostProcess(context.Background(), result, nil); err != nil {
-		t.Fatal(err)
-	}
-	if len(result.Sections) != 2 {
-		t.Fatalf("expected 2 sections, got %d", len(result.Sections))
-	}
-	if result.Sections[0].LayoutType != "text" || result.Sections[1].LayoutType != "text" {
-		t.Error("layout not normalized")
-	}
-	if result.Sections[0].DocTypeKwd != "text" || result.Sections[1].DocTypeKwd != "text" {
-		t.Error("doc_type_kwd not assigned")
-	}
-}
-
-func TestPostProcess_NilResult(t *testing.T) {
-	if err := PostProcess(context.Background(), nil, nil); err == nil {
-		t.Error("expected error for nil result")
-	}
-}
-
-func TestPostProcess_EmptySections(t *testing.T) {
-	result := newTestResult()
-	if err := PostProcess(context.Background(), result, nil); err != nil {
-		t.Fatal(err)
-	}
-	if len(result.Sections) != 0 {
-		t.Error("empty should remain empty")
-	}
-}
-
-func TestPostProcess_FiguresLazy(t *testing.T) {
-	result := newTestResult(
-		pdftype.Section{Text: "Fig1", LayoutType: "figure"},
-		pdftype.Section{Text: "Body", LayoutType: "text"},
-		pdftype.Section{Text: "Fig2", LayoutType: "figure"},
-	)
-	if err := PostProcess(context.Background(), result, nil); err != nil {
-		t.Fatal(err)
-	}
-	figs := result.Figures()
-	if len(figs) != 2 {
-		t.Fatalf("expected 2 figures, got %d", len(figs))
-	}
-	if figs[0].Text != "Fig1" || figs[1].Text != "Fig2" {
-		t.Errorf("wrong figures: %+v", figs)
-	}
-}
-
-func TestPostProcess_FilterOnly(t *testing.T) {
-	result := newTestResult(
-		pdftype.Section{Text: "Header", LayoutType: "header"},
-		pdftype.Section{Text: "Second", LayoutType: "text"},
-		pdftype.Section{Text: "First", LayoutType: "text"},
-	)
-	if err := PostProcess(context.Background(), result, nil); err != nil {
-		t.Fatal(err)
-	}
-	if len(result.Sections) != 2 {
-		t.Fatalf("expected 2 sections after filtering, got %d", len(result.Sections))
-	}
-	figs := result.Figures()
-	if len(figs) != 0 {
-		t.Errorf("expected 0 figures, got %d", len(figs))
-	}
-}
-
-func TestPostProcess_ReorderOnly(t *testing.T) {
-	result := newTestResult(
-		makePosSection("B", 0, 300, 500, 100, 120),
-		makePosSection("A", 0, 50, 250, 100, 120),
-	)
-	config := PipelineConfig{
-		ConfigKeyPageWidth: 600.0,
-		ConfigKeyZoom:      1.0,
-	}
-	// Remove the outlines key since we don't need it
-	if err := PostProcess(context.Background(), result, config); err != nil {
-		t.Fatal(err)
-	}
-	if len(result.Sections) != 2 {
-		t.Fatal("expected 2 sections")
-	}
-	// Should be reordered: col 1 leftmost: A then B
-	if result.Sections[0].Positions[0].Left > result.Sections[1].Positions[0].Left {
-		t.Log("multi-column: sections reordered left-to-right")
-	}
-}
--- a/internal/deepdoc/parser/pdf/post/vision_describe.go
+++ b/internal/deepdoc/parser/pdf/post/vision_describe.go
@@ -1,98 +0,0 @@
-package post
-
-import (
-	"context"
-	"errors"
-	"image"
-)
-
-// ImageDescriber describes an image using a vision language model.
-type ImageDescriber interface {
-	DescribeImage(ctx context.Context, img image.Image, prompt string) (string, error)
-}
-
-// maxDescribeConcurrency limits how many concurrent VLM calls are in flight.
-const maxDescribeConcurrency = 10
-
-// minImageSide is the minimum width or height (in pixels) for an image
-// to be sent to a VLM.  Tiny crops fail provider image-size limits.
-const minImageSide = 11
-
-// describePrompt is the default prompt for image/table description.
-// Python: vision_llm_figure_describe_prompt.md
-const describePrompt = `## ROLE
-
-You are an expert visual data analyst.
-
-## GOAL
-
-Analyze the image and produce a textual representation strictly based on what is visible in the image.
-
-## DECISION RULE (CRITICAL)
-
-First, determine whether the image contains an explicit visual data representation with enumerable data units forming a coherent dataset.
-
-## OUTPUT RULES (STRICT)
-
- Produce output in exactly one of the two modes defined below.
- Do NOT mention, label, or reference the modes in the output.
- Do NOT combine content from both modes.
- Do NOT explain or justify the choice of mode.
- Do NOT add any headings, titles, or commentary beyond what the mode requires.
-
---
-
-## MODE 1: STRUCTURED VISUAL DATA OUTPUT
-
-(Use only if the image contains enumerable data units forming a coherent dataset.)
-
-Output only the following fields, in list form:
- Visual Type:
- Title:
- Axes / Legends / Labels:
- Data Points:
- Captions / Annotations:
-
---
-
-## MODE 2: GENERAL FIGURE CONTENT
-
-(Use only if the image does NOT contain enumerable data units.)
-
-Write the content directly, starting from the first sentence.
-Do NOT add any introductory labels, titles, headings, or prefixes.
-
-Requirements:
- Describe visible regions and components in a stable order (e.g., top-to-bottom, left-to-right).
- Explicitly name interface elements or visual objects exactly as they appear.
- Transcribe all visible text verbatim; do not paraphrase, summarize, or reinterpret labels.
- Describe spatial grouping, containment, and alignment of elements.
- Do NOT interpret intent, behavior, workflows, gameplay rules, or processes.
- Avoid narrative or stylistic language unless it is a dominant and functional visual element.
-
-Use concise, information-dense sentences.
-Do not use bullet lists or structured fields in this mode.`
-
-// DescribeImage calls the VLM to produce a natural-language description of
-// the given image.  Returns the description text or an error.
-//
-// Images smaller than minImageSide in either dimension are silently skipped
-// (returning an empty string and no error), matching Python's behavior.
-func DescribeImage(ctx context.Context, img image.Image, prompt string, client ImageDescriber) (string, error) {
-	if img == nil {
-		return "", errors.New("DescribeImage: nil image")
-	}
-	b := img.Bounds()
-	if b.Dx() == 0 || b.Dy() == 0 {
-		return "", errors.New("DescribeImage: empty image (0x0)")
-	}
-	if b.Dx() < minImageSide || b.Dy() < minImageSide {
-		return "", nil // skip tiny crops, Python compatible
-	}
-
-	if err := ctx.Err(); err != nil {
-		return "", err
-	}
-
-	return client.DescribeImage(ctx, img, prompt)
-}
--- a/internal/deepdoc/parser/pdf/post/vision_describe_test.go
+++ b/internal/deepdoc/parser/pdf/post/vision_describe_test.go
@@ -1,112 +0,0 @@
-package post
-
-import (
-	"context"
-	"errors"
-	"image"
-	"image/color"
-	"testing"
-)
-
-// ── mock image describer ───────────────────────────────────────────────
-
-type mockImageDescriber struct {
-	describe string
-	err      error
-}
-
-func (m *mockImageDescriber) DescribeImage(_ context.Context, _ image.Image, _ string) (string, error) {
-	return m.describe, m.err
-}
-
-// ── DescribeImage tests ────────────────────────────────────────────────
-
-func TestDescribeImage_Success(t *testing.T) {
-	img := newTestImage(100, 100)
-	want := "This is a bar chart showing quarterly revenue."
-	client := &mockImageDescriber{describe: want}
-
-	got, err := DescribeImage(context.Background(), img, "Describe this image", client)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if got != want {
-		t.Errorf("DescribeImage() = %q, want %q", got, want)
-	}
-}
-
-func TestDescribeImage_VLMError(t *testing.T) {
-	img := newTestImage(100, 100)
-	client := &mockImageDescriber{err: errors.New("VLM timeout")}
-
-	got, err := DescribeImage(context.Background(), img, "Describe this image", client)
-	if err == nil {
-		t.Fatal("expected error, got nil")
-	}
-	if got != "" {
-		t.Errorf("expected empty string on error, got %q", got)
-	}
-}
-
-func TestDescribeImage_CanceledContext(t *testing.T) {
-	ctx, cancel := context.WithCancel(context.Background())
-	cancel() // cancel immediately
-	img := newTestImage(100, 100)
-	client := &mockImageDescriber{describe: "should not be reached"}
-
-	got, err := DescribeImage(ctx, img, "prompt", client)
-	if err == nil {
-		t.Fatal("expected context error, got nil")
-	}
-	if got != "" {
-		t.Errorf("expected empty string, got %q", got)
-	}
-}
-
-func TestDescribeImage_NilImage(t *testing.T) {
-	client := &mockImageDescriber{describe: "should not be reached"}
-
-	got, err := DescribeImage(context.Background(), nil, "prompt", client)
-	if err == nil {
-		t.Fatal("expected error for nil image, got nil")
-	}
-	if got != "" {
-		t.Errorf("expected empty string, got %q", got)
-	}
-}
-
-func TestDescribeImage_EmptyImage(t *testing.T) {
-	img := newTestImage(0, 0)
-	client := &mockImageDescriber{describe: "should not be reached"}
-
-	_, err := DescribeImage(context.Background(), img, "prompt", client)
-	if err == nil {
-		t.Fatal("expected error for empty image, got nil")
-	}
-}
-
-func TestDescribeImage_TinyImage(t *testing.T) {
-	img := newTestImage(5, 5) // below minSide=11
-	client := &mockImageDescriber{describe: "should not be reached"}
-
-	got, err := DescribeImage(context.Background(), img, "prompt", client)
-	if err != nil {
-		t.Fatal("tiny images should be silently skipped, not error")
-	}
-	if got != "" {
-		t.Errorf("expected empty string for tiny image, got %q", got)
-	}
-}
-
-// ── helpers ────────────────────────────────────────────────────────────
-
-func newTestImage(w, h int) image.Image {
-	img := image.NewRGBA(image.Rect(0, 0, w, h))
-	// Fill with a recognizable pattern.
-	for y := 0; y < h; y++ {
-		for x := 0; x < w; x++ {
-			img.Set(x, y, color.RGBA{R: uint8(x % 256), G: uint8(y % 256), B: 128, A: 255})
-		}
-	}
-	return img
-}
--- a/internal/deepdoc/parser/pdf/render_compare_test.go
+++ b/internal/deepdoc/parser/pdf/render_compare_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && manual

-package parser
+package pdf

 import (
 	"image"
@@ -53,7 +53,7 @@ func TestRenderCompare(t *testing.T) {
 		}

 		// Render page 0 with pdfium (Go).
-		goImg, err := renderPageToImage(eng, 0)
+		goImg, err := RenderPageToImage(eng, 0)
 		eng.Close()
 		if err != nil {
 			t.Logf("%s: render error: %v", name, err)
--- a/internal/deepdoc/parser/pdf/renderer.go
+++ b/internal/deepdoc/parser/pdf/renderer.go
@@ -1,4 +1,4 @@
-package parser
+package pdf

 import (
 	"image"
@@ -13,7 +13,7 @@ import (
 var renderFn = fallbackRender

 // renderPageToImage renders a page at 216 DPI for downstream DLA/TSR/OCR.
-func renderPageToImage(engine pdf.PDFEngine, pageNum int) (image.Image, error) {
+func RenderPageToImage(engine pdf.PDFEngine, pageNum int) (image.Image, error) {
 	return renderFn(engine, pageNum)
 }

@@ -25,7 +25,10 @@ func fallbackRender(engine pdf.PDFEngine, pageNum int) (image.Image, error) {
 	}
 	// Guard against typed-nil (e.g. (*image.RGBA)(nil) returned as non-nil
 	// interface).  The plain img==nil check misses that case.
-	if img == nil || reflect.ValueOf(img).IsNil() {
+	if img == nil {
+		return nil, ErrNoPDFData
+	}
+	if rv := reflect.ValueOf(img); rv.Kind() == reflect.Ptr && rv.IsNil() {
 		return nil, ErrNoPDFData
 	}
 	return img, nil
--- a/internal/deepdoc/parser/pdf/renderer_pdfium.go
+++ b/internal/deepdoc/parser/pdf/renderer_pdfium.go
@@ -1,6 +1,6 @@
 //go:build cgo

-package parser
+package pdf

 import (
 	"image"
--- a/internal/deepdoc/parser/pdf/rotate_test.go
+++ b/internal/deepdoc/parser/pdf/rotate_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && manual

-package parser
+package pdf

 import (
 	"image"
@@ -24,8 +24,8 @@ func pdfiumPtSize(eng pdf.PDFEngine, file string, t *testing.T) (w, h float64) {
 	raw := eng.RawData()
 	if raw == nil {
 		// Fallback: use pdf_oxide pre-rotation size.
-		if pe, ok := eng.(*pdfoxideEngine); ok {
-			w, h, _ = pe.inner.PageSize(0)
+		if pe, ok := eng.(*PDFOxideEngine); ok {
+			w, h, _ = pe.Inner.PageSize(0)
 		}
 		return
 	}
@@ -302,7 +302,7 @@ func TestRotation_CropBoxWithRotate(t *testing.T) {
 	// CropBox excludes content from the page edges; chars near the
 	// CropBox boundary may end up outside the effective page after rotation.
 	if oobRate > 40 {
-		t.Errorf("too many OOB chars: %.1f%%", oobRate)
+		t.Errorf("too many OOB Chars: %.1f%%", oobRate)
 	}

 	// Verify render alignment.
--- a/internal/deepdoc/parser/pdf/scan_all_pdfs_test.go
+++ b/internal/deepdoc/parser/pdf/scan_all_pdfs_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && manual

-package parser
+package pdf

 import (
 	"context"
@@ -43,9 +43,8 @@ func TestScanAllPDFs(t *testing.T) {

 		eng := mustOpenEngine(t, name)
 		cfg := pdf.DefaultParserConfig()
-		cfg.TableBuilder = NewDeepDocTableBuildService(client)
-		p := NewParser(cfg, client)
-		result, err := p.Parse(context.Background(), eng)
+		p := NewParser(cfg)
+		result, err := p.ParseRaw(context.Background(), eng, client)
 		eng.Close()
 		if err != nil {
 			fmt.Printf("  ❌ ERROR: %v\n", err)
--- a/internal/deepdoc/parser/pdf/snapshot_test.go
+++ b/internal/deepdoc/parser/pdf/snapshot_test.go
@@ -1,6 +1,6 @@
 //go:build manual

-package parser
+package pdf

 import (
 	"encoding/json"
@@ -16,7 +16,7 @@ import (
 	"testing"
 )

-// TestSnapshotStageComparison verifies Go's TextMerge output
+// TestSnapshotStageComparison verifies Go's lyt.TextMerge output
 // matches Python's _text_merge sample boxes using synthetic input.
 func TestSnapshotStageComparison(t *testing.T) {
 	snapDir := filepath.Join("testdata", "snapshots")
@@ -47,19 +47,19 @@ func TestSnapshotStageComparison(t *testing.T) {
 			// Convert sample boxes to Go pdf.TextBox format
 			goBoxes := snapshotBoxesToGo(s1.SampleBoxesPage0)

-			// Run Go TextMerge with default params
+			// Run Go lyt.TextMerge with default params
 			meanH := map[int]float64{0: avg(s1.MeanHeight)}
 			merged := lyt.TextMerge(goBoxes, meanH, 3)

 			// Compare counts
 			if len(merged) > 0 {
-				t.Logf("  Go TextMerge: %d -> %d boxes", len(goBoxes), len(merged))
+				t.Logf("  Go lyt.TextMerge: %d -> %d boxes", len(goBoxes), len(merged))
 				mergeRatio := float64(len(merged)) / float64(len(goBoxes))
 				pyRatio := float64(s4.BoxesAfter) / float64(s4.BoxesBefore)
 				t.Logf("  Merge ratios: Go=%.0f%% Python=%.0f%%", mergeRatio*100, pyRatio*100)
 			}

-			// Run Go NaiveVerticalMerge
+			// Run Go lyt.NaiveVerticalMerge
 			meanW := map[int]float64{0: avg(s1.MeanWidth)}
 			vm := lyt.NaiveVerticalMerge(merged, meanH, meanW, s1.IsEnglish)
 			if s6, ok := snap.Stages["_naive_vertical_merge"]; ok {
--- a/internal/deepdoc/parser/pdf/table/table_construct.go
+++ b/internal/deepdoc/parser/pdf/table/table_construct.go
@@ -2,6 +2,7 @@ package table

 import (
 	"fmt"
+	"html"
 	"math"
 	"regexp"
 	"sort"
@@ -698,7 +699,47 @@ func RowsToHTML(rows [][]pdf.TSRCell, caption string, headerRows map[int]bool, s
 	return b.String()
 }

-// ── Span computation (Python: __cal_spans) ──
+// SimpleRowsToHTML converts plain string-based table data to an HTML table.
+// The first row is treated as a header (<th>).  Used by DOCX, XLSX, PPTX,
+// and HTML parsers that produce [][]string directly.
+func SimpleRowsToHTML(rows [][]string) string {
+	if len(rows) == 0 {
+		return "<table></table>"
+	}
+	nCols := 0
+	for _, row := range rows {
+		if len(row) > nCols {
+			nCols = len(row)
+		}
+	}
+	var b strings.Builder
+	b.WriteString("<table>")
+	for ri, row := range rows {
+		b.WriteString("<tr>")
+		tag := "td"
+		if ri == 0 {
+			tag = "th"
+		}
+		for ci := 0; ci < nCols; ci++ {
+			text := ""
+			if ci < len(row) {
+				text = row[ci]
+			}
+			b.WriteString("<")
+			b.WriteString(tag)
+			b.WriteString(" >")
+			b.WriteString(html.EscapeString(text))
+			b.WriteString("</")
+			b.WriteString(tag)
+			b.WriteString(">")
+		}
+		b.WriteString("</tr>")
+	}
+	b.WriteString("</table>")
+	return b.String()
+}
+
+// Span computation (Python: __cal_spans) ──

 // calSpans computes colspan and rowspan for spanning cells in the grid.
 // Returns spanInfo (row,col → colspan,rowspan) and covered (cells hidden by spans).
--- a/internal/deepdoc/parser/pdf/table_extract.go
+++ b/internal/deepdoc/parser/pdf/table_extract.go
@@ -1,4 +1,4 @@
-package parser
+package pdf

 import (
 	"context"
@@ -12,10 +12,10 @@ import (
 	util "ragflow/internal/deepdoc/parser/pdf/util"
 )

-// enrichWithDeepDoc runs DLA+TSR via p.DeepDoc and returns detected tables.
+// enrichWithDeepDoc runs DLA+TSR via docAnalyzer and returns detected tables.
 // pageImages optionally provides pre-rendered page images to avoid re-rendering.
-func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult, engine pdf.PDFEngine, boxes []pdf.TextBox, pageImages map[int]image.Image) []pdf.TableItem {
-	if !p.DeepDoc.Health() {
+func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult, engine pdf.PDFEngine, boxes []pdf.TextBox, pageImages map[int]image.Image, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder) []pdf.TableItem {
+	if !docAnalyzer.Health() {
 		return nil
 	}
 	// Group boxes by page for annotation write-back.
@@ -50,7 +50,7 @@ func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult,
 		for i, idx := range indices {
 			pageBoxes[i] = boxes[idx]
 		}
-		tables := p.extractTableBoxes(ctx, result, pageBoxes, engine, pg, pageImages, len(tableItems))
+		tables := p.extractTableBoxes(ctx, result, pageBoxes, engine, pg, pageImages, len(tableItems), docAnalyzer, tb)
 		tableItems = append(tableItems, tables...)
 		// Write back DLA and TSR annotations (R/C/H/SP) to the original boxes.
 		for i, idx := range indices {
@@ -65,21 +65,21 @@ func (p *Parser) enrichWithDeepDoc(ctx context.Context, result *pdf.ParseResult,
 	return tableItems
 }

-func (p *Parser) extractTableBoxes(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, engine pdf.PDFEngine, pageNum int, pageImages map[int]image.Image, tableBaseIdx int) []pdf.TableItem {
+func (p *Parser) extractTableBoxes(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, engine pdf.PDFEngine, pageNum int, pageImages map[int]image.Image, tableBaseIdx int, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder) []pdf.TableItem {
 	pageImg, ok := pageImages[pageNum]
 	if !ok {
 		var err error
-		pageImg, err = renderPageToImage(engine, pageNum)
+		pageImg, err = RenderPageToImage(engine, pageNum)
 		if err != nil {
 			slog.Warn("render page for DeepDoc failed", "page", pageNum, "err", err)
 			return nil
 		}
 	}
-	return p.extractTableBoxesFromImage(ctx, result, boxes, pageImg, pageNum, tableBaseIdx)
+	return p.extractTableBoxesFromImage(ctx, result, boxes, pageImg, pageNum, tableBaseIdx, docAnalyzer, tb)
 }

-func (p *Parser) extractTableBoxesFromImage(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, pageImg image.Image, pageNum int, tableBaseIdx int) []pdf.TableItem {
-	regions, err := p.DeepDoc.DLA(ctx, pageImg)
+func (p *Parser) extractTableBoxesFromImage(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, pageImg image.Image, pageNum int, tableBaseIdx int, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder) []pdf.TableItem {
+	regions, err := docAnalyzer.DLA(ctx, pageImg)
 	if err != nil {
 		slog.Warn("DLA failed", "page", pageNum, "err", err)
 		return nil
@@ -95,148 +95,117 @@ func (p *Parser) extractTableBoxesFromImage(ctx context.Context, result *pdf.Par
 	tableMatches := tbl.MatchTableRegions(boxes, regions, scale)
 	var items []pdf.TableItem
 	for _, tm := range tableMatches {
-		cropped, cropErr := util.CropImageRegion(pageImg, tm.Region)
-		if cropErr != nil {
-			// DLA returned an invalid region (e.g. x1 < x0).  Python
-			// PIL.Image.crop() raises ValueError here; we skip this
-			// table instead of passing a full-page image to TSR.
-			continue
+		item := p.processOneTable(ctx, result, boxes, pageImg, pageNum, docAnalyzer, tb, tm, scale, tableBaseIdx+len(items))
+		if item.ImageB64 != "" || len(item.Cells) > 0 || len(item.Positions) > 0 {
+			items = append(items, item)
 		}
+	}
+	return items
+}

-		// Rotation detection (Python: _evaluate_table_orientation).
-		// If rotated, TSR and OCR use the rotated image; cell coords
-		// are mapped back to original crop space for box matching.
-		autoRotate := p.Config.AutoRotateTables != nil && *p.Config.AutoRotateTables
-		bestAngle := 0
-		origW, origH := cropped.Bounds().Dx(), cropped.Bounds().Dy()
-		tsrImg := cropped
-		if autoRotate {
-			angle, rotated, _ := tbl.EvaluateTableOrientation(ctx, cropped, p.DeepDoc)
-			bestAngle = angle
-			tsrImg = rotated
-		}
-
-		imgB64, encErr := util.EncodeImageToBase64PNG(cropped)
-		if encErr != nil {
-			slog.Warn("table PNG encode failed", "page", pageNum, "err", encErr)
-		}
-
-		var cells []pdf.TSRCell
-		var tsrErr error
-		cells, tsrErr = p.tableBuilder.DetectCells(ctx, tsrImg)
-		if tsrErr != nil {
-			slog.Warn("TSR failed", "page", pageNum, "err", tsrErr)
-		}
-		// Collect TSR raw cells for debug comparison.
-		if tsrErr == nil {
-			for _, c := range cells {
-				if result != nil {
-					result.TSRDebug = append(result.TSRDebug, pdf.TSRRawCell{
-						TableIndex: tableBaseIdx + len(items), Page: pageNum,
-						Label: c.Label, X0: c.X0, Y0: c.Y0, X1: c.X1, Y1: c.Y1,
-						Text: c.Text,
-					})
-				}
-			}
-		}
-		// Python margin: w*0.03, h*0.03 (_table_transformer_job:374-376).
-		w := tm.Region.X1 - tm.Region.X0
-		h := tm.Region.Y1 - tm.Region.Y0
-		marginX := w * 0.03
-		marginY := h * 0.03
-		cropOffX := math.Max(0, tm.Region.X0-marginX)
-		cropOffY := math.Max(0, tm.Region.Y0-marginY)
-
-		var boxInCrop []pdf.TextBox
-		if tsrErr == nil && len(cells) > 0 {
-			if bestAngle != 0 {
-				// OCR on rotated image before mapping cells back.
-				// Cells are in rotated-pixel space; OCR works best
-				// on upright text.  After mapping, cells move to
-				// original crop space where boxInCrop lives.
-				if !p.Config.SkipOCR {
-					ocrTableCells(ctx, cells, tsrImg, p.DeepDoc)
-				}
-				for i := range cells {
-					cells[i].X0, cells[i].Y0 = util.MapRotatedPointToOriginal(cells[i].X0, cells[i].Y0, bestAngle, origW, origH)
-					cells[i].X1, cells[i].Y1 = util.MapRotatedPointToOriginal(cells[i].X1, cells[i].Y1, bestAngle, origW, origH)
-				}
-			}
-			// Fill cell text from pre-merge boxes, skipping caption boxes
-			// (text entirely above the first TSR cell row).
-			firstCellTop := 1e9
-			for _, c := range cells {
-				if c.Y0 >= 0 && c.Y0 < firstCellTop {
-					firstCellTop = c.Y0
-				}
-			}
-			if firstCellTop == 1e9 {
-				firstCellTop = cells[0].Y0 // fallback if all cells have Y0 < 0
-			}
-			boxInCrop = make([]pdf.TextBox, 0, len(tm.BoxIdx))
-			for _, idx := range tm.BoxIdx {
-				b := boxes[idx]
-				if b.Bottom*scale-cropOffY < firstCellTop {
-					continue // caption box above first TSR cell
-				}
-				boxInCrop = append(boxInCrop, tbl.BoxToCropSpace(b, scale, cropOffX, cropOffY))
-			}
-		}
-		var positions []pdf.Position
-		for _, idx := range tm.BoxIdx {
-			b := boxes[idx]
-			positions = append(positions, pdf.Position{
-				PageNumbers: []int{pageNum},
-				Left:        b.X0, Right: b.X1,
-				Top: b.Top, Bottom: b.Bottom,
+// processOneTable handles DLA+TSR+OCR for a single table region match.
+func (p *Parser) processOneTable(ctx context.Context, result *pdf.ParseResult, boxes []pdf.TextBox, pageImg image.Image, pageNum int, docAnalyzer pdf.DocAnalyzer, tb pdf.TableBuilder, tm tbl.TableMatch, scale float64, tableIdx int) pdf.TableItem {
+	cropped, cropErr := util.CropImageRegion(pageImg, tm.Region)
+	if cropErr != nil {
+		return pdf.TableItem{}
+	}
+	autoRotate := p.Config.AutoRotateTables != nil && *p.Config.AutoRotateTables
+	bestAngle := 0
+	origW, origH := cropped.Bounds().Dx(), cropped.Bounds().Dy()
+	tsrImg := cropped
+	if autoRotate {
+		angle, rotated, _ := tbl.EvaluateTableOrientation(ctx, cropped, docAnalyzer)
+		bestAngle = angle
+		tsrImg = rotated
+	}
+	imgB64, encErr := util.EncodeImageToBase64PNG(cropped)
+	if encErr != nil {
+		slog.Warn("table PNG encode failed", "page", pageNum, "err", encErr)
+	}
+	cells, tsrErr := tb.DetectCells(ctx, tsrImg)
+	if tsrErr != nil {
+		slog.Warn("TSR failed", "page", pageNum, "err", tsrErr)
+	}
+	if tsrErr == nil && result != nil {
+		for _, c := range cells {
+			result.TSRDebug = append(result.TSRDebug, pdf.TSRRawCell{
+				TableIndex: tableIdx, Page: pageNum,
+				Label: c.Label, X0: c.X0, Y0: c.Y0, X1: c.X1, Y1: c.Y1, Text: c.Text,
 			})
 		}
-		// Pre-compute grid from raw TSR cells (without crop offset).
-		// Stored in pdf.TableItem for constructTable; annotateTableBoxes
-		// recomputes with offset cells for spatial matching precision.
-		var grid [][]pdf.TSRCell
-		if len(cells) > 0 {
-			grid = p.tableBuilder.GroupCells(cells)
-			// Fill cell text from boxes in crop space. Works for both
-			// Label-aware grouping (cells rearranged) vs. cross-product (creates new cells).
-			if len(grid) > 0 {
-				flat := tbl.FlattenGrid(grid)
-				tbl.FillCellTextFromBoxes(flat, boxInCrop)
-				idx := 0
+	}
+	w := tm.Region.X1 - tm.Region.X0
+	h := tm.Region.Y1 - tm.Region.Y0
+	cropOffX := math.Max(0, tm.Region.X0-w*0.03)
+	cropOffY := math.Max(0, tm.Region.Y0-h*0.03)
+	var boxInCrop []pdf.TextBox
+	if tsrErr == nil && len(cells) > 0 {
+		if bestAngle != 0 {
+			if !p.Config.SkipOCR {
+				ocrTableCells(ctx, cells, tsrImg, docAnalyzer)
+			}
+			for i := range cells {
+				cells[i].X0, cells[i].Y0 = util.MapRotatedPointToOriginal(cells[i].X0, cells[i].Y0, bestAngle, origW, origH)
+				cells[i].X1, cells[i].Y1 = util.MapRotatedPointToOriginal(cells[i].X1, cells[i].Y1, bestAngle, origW, origH)
+			}
+		}
+		firstCellTop := 1e9
+		for _, c := range cells {
+			if c.Y0 >= 0 && c.Y0 < firstCellTop {
+				firstCellTop = c.Y0
+			}
+		}
+		if firstCellTop == 1e9 {
+			firstCellTop = cells[0].Y0
+		}
+		boxInCrop = make([]pdf.TextBox, 0, len(tm.BoxIdx))
+		for _, idx := range tm.BoxIdx {
+			b := boxes[idx]
+			if b.Bottom*scale-cropOffY < firstCellTop {
+				continue
+			}
+			boxInCrop = append(boxInCrop, tbl.BoxToCropSpace(b, scale, cropOffX, cropOffY))
+		}
+	}
+	var positions []pdf.Position
+	for _, idx := range tm.BoxIdx {
+		b := boxes[idx]
+		positions = append(positions, pdf.Position{
+			PageNumbers: []int{pageNum},
+			Left:        b.X0, Right: b.X1, Top: b.Top, Bottom: b.Bottom,
+		})
+	}
+	var grid [][]pdf.TSRCell
+	if len(cells) > 0 {
+		grid = tb.GroupCells(cells)
+		if len(grid) > 0 {
+			flat := tbl.FlattenGrid(grid)
+			tbl.FillCellTextFromBoxes(flat, boxInCrop)
+			idx := 0
+			for ri := range grid {
+				for ci := range grid[ri] {
+					grid[ri][ci].Text = flat[idx].Text
+					idx++
+				}
+			}
+			if bestAngle == 0 && !p.Config.SkipOCR {
+				ocrTableCells(ctx, flat, tsrImg, docAnalyzer)
+				idx = 0
 				for ri := range grid {
 					for ci := range grid[ri] {
 						grid[ri][ci].Text = flat[idx].Text
 						idx++
 					}
 				}
-				if bestAngle == 0 && !p.Config.SkipOCR {
-					ocrTableCells(ctx, flat, tsrImg, p.DeepDoc)
-					idx = 0
-					for ri := range grid {
-						for ci := range grid[ri] {
-							grid[ri][ci].Text = flat[idx].Text
-							idx++
-						}
-					}
-				}
 			}
 		}
-		items = append(items, pdf.TableItem{
-			ImageB64:  imgB64,
-			Cells:     cells,
-			Grid:      grid,
-			Positions: positions,
-			Scale:     scale,
-			CropOffX:  cropOffX,
-			CropOffY:  cropOffY,
-			// DLA region in PDF point space (Python's cropout uses layout region boundaries).
-			RegionLeft:   tm.Region.X0 / scale,
-			RegionRight:  tm.Region.X1 / scale,
-			RegionTop:    tm.Region.Y0 / scale,
-			RegionBottom: tm.Region.Y1 / scale,
-		})
-
-		tbl.WriteTableAnnotations(boxes, tm.BoxIdx, cells, scale, cropOffX, cropOffY, p.tableBuilder)
 	}
-	return items
+	item := pdf.TableItem{
+		ImageB64: imgB64, Cells: cells, Grid: grid, Positions: positions,
+		Scale: scale, CropOffX: cropOffX, CropOffY: cropOffY,
+		RegionLeft: tm.Region.X0 / scale, RegionRight: tm.Region.X1 / scale,
+		RegionTop: tm.Region.Y0 / scale, RegionBottom: tm.Region.Y1 / scale,
+	}
+	tbl.WriteTableAnnotations(boxes, tm.BoxIdx, cells, scale, cropOffX, cropOffY, tb)
+	return item
 }
--- a/internal/deepdoc/parser/pdf/table_rotate_integration_test.go
+++ b/internal/deepdoc/parser/pdf/table_rotate_integration_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && manual

-package parser
+package pdf

 import (
 	"context"
@@ -9,6 +9,7 @@ import (
 	inf "ragflow/internal/deepdoc/parser/pdf/inference"
 	tbl "ragflow/internal/deepdoc/parser/pdf/table"
 	pdf "ragflow/internal/deepdoc/parser/pdf/type"
+	util "ragflow/internal/deepdoc/parser/pdf/util"
 	"testing"
 )

@@ -32,7 +33,7 @@ func TestTableRotation_Integration(t *testing.T) {
 	if baseURL == "" {
 		baseURL = "http://localhost:9390"
 	}
-	dd, err := inf.NewInferenceClient(baseURL)
+	dd, err := inf.NewClient(baseURL)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -59,10 +60,10 @@ func TestTableRotation_Integration(t *testing.T) {
 	cfg.ToPage = pageCount - 1
 	autoRotate := true
 	cfg.AutoRotateTables = &autoRotate
-	_ = NewParser(cfg, dd) // verify construction does not panic
+	_ = NewParser(cfg) // verify construction does not panic

 	for pg := 0; pg < pageCount; pg++ {
-		pageImg, err := renderPageToImage(eng, pg)
+		pageImg, err := RenderPageToImage(eng, pg)
 		if err != nil {
 			t.Fatalf("render page %d: %v", pg, err)
 		}
@@ -80,7 +81,7 @@ func TestTableRotation_Integration(t *testing.T) {
 			tableCount++

 			// Crop table region
-			cropped, err := cropImageRegion(pageImg, r)
+			cropped, err := util.CropImageRegion(pageImg, r)
 			if err != nil {
 				t.Errorf("  crop table %d: %v", tableCount, err)
 				continue
@@ -130,7 +131,7 @@ func TestTableRotation_Stability(t *testing.T) {
 	if baseURL == "" {
 		baseURL = "http://localhost:9390"
 	}
-	dd, err := inf.NewInferenceClient(baseURL)
+	dd, err := inf.NewClient(baseURL)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -163,7 +164,7 @@ func TestTableRotation_Stability(t *testing.T) {
 			continue
 		}

-		pageImg, err := renderPageToImage(eng, 0)
+		pageImg, err := RenderPageToImage(eng, 0)
 		eng.Close()
 		if err != nil {
 			continue
@@ -177,7 +178,11 @@ func TestTableRotation_Stability(t *testing.T) {
 				continue
 			}
 			tables++
-			cropped, _ := cropImageRegion(pageImg, r)
+			cropped, err := util.CropImageRegion(pageImg, r)
+			if err != nil {
+				t.Errorf("  %s crop table: %v", e.Name(), err)
+				continue
+			}
 			if cropped == nil {
 				continue
 			}
--- a/internal/deepdoc/parser/pdf/table_section_test.go
+++ b/internal/deepdoc/parser/pdf/table_section_test.go
@@ -1,4 +1,4 @@
-package parser
+package pdf

 import (
 	"context"
@@ -16,11 +16,11 @@ import (
 // entries. Go backfills pdf.Section.Text from pdf.TableItem.Rows after
 // linkTableSections.
 func TestTableSection_TextFromTSR(t *testing.T) {
-	eng := &mockEngine{
-		pageCount: 1,
-		renderW:   900, // 300pt at 3x = 900px (216 DPI)
-		renderH:   600,
-		chars: map[int][]pdf.TextChar{0: {
+	eng := &MockEngine{
+		NumPages: 1,
+		RenderW:  900, // 300pt at 3x = 900px (216 DPI)
+		RenderH:  600,
+		Chars: map[int][]pdf.TextChar{0: {
 			// PDF space (72 DPI): well inside DLA region
 			{X0: 50, X1: 70, Top: 40, Bottom: 55, Text: "姓"},
 			{X0: 80, X1: 100, Top: 40, Bottom: 55, Text: "名"},
@@ -42,9 +42,9 @@ func TestTableSection_TextFromTSR(t *testing.T) {
 			{X0: 200, Y0: 100, X1: 460, Y1: 220, Text: "25", Label: "table row"},
 		},
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())

-	result, err := p.Parse(context.Background(), eng)
+	result, err := p.ParseRaw(context.Background(), eng, mock)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -93,14 +93,14 @@ func TestEnrichWithDeepDoc_ImageOnlyPage(t *testing.T) {
 			{X0: 0, Y0: 0, X1: 200, Y1: 100, Text: "A", Label: "table row"},
 		},
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())

 	// 0 text boxes, but page 0 has a rendered image.
 	boxes := []pdf.TextBox{}
 	dummyImg := image.NewRGBA(image.Rect(0, 0, 900, 600))
 	pageImages := map[int]image.Image{0: dummyImg}

-	tables := p.enrichWithDeepDoc(context.Background(), nil, nil, boxes, pageImages)
+	tables := p.enrichWithDeepDoc(context.Background(), nil, nil, boxes, pageImages, mock, NewTableBuilderFor(mock))
 	if len(tables) == 0 {
 		t.Fatal("enrichWithDeepDoc: expected at least 1 table from DLA on page with image but no boxes, got 0")
 	}
@@ -113,10 +113,10 @@ func TestEnrichWithDeepDoc_ImageOnlyPage(t *testing.T) {
 // is merged into the nearest "figure" pdf.Section and the caption pdf.Section is
 // removed. Matches Python _extract_table_figure caption matching.
 func TestFigureCaption_MergedIntoFigure(t *testing.T) {
-	eng := &mockEngine{
-		pageCount: 1,
-		renderW:   1800, renderH: 2400,
-		chars: map[int][]pdf.TextChar{0: {
+	eng := &MockEngine{
+		NumPages: 1,
+		RenderW:  1800, RenderH: 2400,
+		Chars: map[int][]pdf.TextChar{0: {
 			// Figure text — overlaps DLA figure region (pixel Y=80-300 → PDF 27-100).
 			{X0: 40, X1: 60, Top: 30, Bottom: 45, Text: "F"},
 			// Caption text — overlaps DLA figure caption region (pixel Y=310-340 → PDF 103-113).
@@ -131,9 +131,9 @@ func TestFigureCaption_MergedIntoFigure(t *testing.T) {
 			{X0: 100, Y0: 310, X1: 500, Y1: 340, Label: "figure caption", Confidence: 0.9},
 		},
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())

-	result, err := p.Parse(context.Background(), eng)
+	result, err := p.ParseRaw(context.Background(), eng, mock)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -169,10 +169,10 @@ func TestFigureCaption_MergedIntoFigure(t *testing.T) {
 // TestTableCaption_MergedIntoTable verifies that "table caption" text
 // is merged into the nearest table pdf.Section and the caption is removed.
 func TestTableCaption_MergedIntoTable(t *testing.T) {
-	eng := &mockEngine{
-		pageCount: 1,
-		renderW:   1800, renderH: 2400,
-		chars: map[int][]pdf.TextChar{0: {
+	eng := &MockEngine{
+		NumPages: 1,
+		RenderW:  1800, RenderH: 2400,
+		Chars: map[int][]pdf.TextChar{0: {
 			// Table text — overlaps DLA table region (pixel Y=80-300 → PDF 27-100).
 			{X0: 40, X1: 60, Top: 30, Bottom: 45, Text: "T"},
 			// Caption text — overlaps DLA table caption region (pixel Y=310-340 → PDF 103-113).
@@ -190,9 +190,9 @@ func TestTableCaption_MergedIntoTable(t *testing.T) {
 			{X0: 200, Y0: 0, X1: 460, Y1: 100, Text: "B", Label: "table row"},
 		},
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())

-	result, err := p.Parse(context.Background(), eng)
+	result, err := p.ParseRaw(context.Background(), eng, mock)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -224,10 +224,10 @@ func TestTableCaption_MergedIntoTable(t *testing.T) {
 // boxes overlapping a table region, regardless of their DLA label.
 // This is the #1 cause of Go vs Python discrepancy on table-heavy PDFs.
 func TestTextSectionsInsideTableRegion_Suppressed(t *testing.T) {
-	eng := &mockEngine{
-		pageCount: 1,
-		renderW:   1800, renderH: 2400,
-		chars: map[int][]pdf.TextChar{0: {
+	eng := &MockEngine{
+		NumPages: 1,
+		RenderW:  1800, RenderH: 2400,
+		Chars: map[int][]pdf.TextChar{0: {
 			// Box A: inside DLA table region, labeled as "text" by DLA.
 			{X0: 50, X1: 100, Top: 40, Bottom: 55, Text: "碎片文字"},
 			// Box B: inside DLA table region, same situation.
@@ -247,9 +247,9 @@ func TestTextSectionsInsideTableRegion_Suppressed(t *testing.T) {
 			{X0: 200, Y0: 0, X1: 460, Y1: 100, Text: "年龄", Label: "table row"},
 		},
 	}
-	p := NewParser(pdf.DefaultParserConfig(), mock)
+	p := NewParser(pdf.DefaultParserConfig())

-	result, err := p.Parse(context.Background(), eng)
+	result, err := p.ParseRaw(context.Background(), eng, mock)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -286,9 +286,10 @@ func TestTextSectionsInsideTableRegion_Suppressed(t *testing.T) {

 // TestEmptyDoc_NoCrash verifies Parse handles edge cases gracefully.
 func TestEmptyDoc_NoCrash(t *testing.T) {
-	eng := &mockEngine{pageCount: 0}
-	p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
-	result, err := p.Parse(context.Background(), eng)
+	eng := &MockEngine{NumPages: 0}
+	mock := &MockDocAnalyzer{Healthy: true}
+	p := NewParser(pdf.DefaultParserConfig())
+	result, err := p.ParseRaw(context.Background(), eng, mock)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
@@ -299,13 +300,69 @@ func TestEmptyDoc_NoCrash(t *testing.T) {

 // TestNilChars_handled verifies zero-chars pages don't crash.
 func TestNilChars_Handled(t *testing.T) {
-	eng := &mockEngine{pageCount: 1, renderW: 200, renderH: 200}
-	p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
-	result, err := p.Parse(context.Background(), eng)
+	eng := &MockEngine{NumPages: 1, RenderW: 200, RenderH: 200}
+	mock := &MockDocAnalyzer{Healthy: true}
+	p := NewParser(pdf.DefaultParserConfig())
+	result, err := p.ParseRaw(context.Background(), eng, mock)
 	if err != nil {
 		t.Fatalf("Parse: %v", err)
 	}
-	if len(result.Sections) != 0 && p.DeepDoc != nil {
+	if len(result.Sections) != 0 {
 		t.Logf("nil chars + DeepDoc: sections=%d (may trigger OCR path)", len(result.Sections))
 	}
 }
+
+func TestMatchTableImage_ByPositions(t *testing.T) {
+	tableByRegion := map[string]string{
+		"0_50.0_500.0_100.0_300.0": "img_base64_positions",
+	}
+	sec := &pdf.Section{
+		LayoutType: pdf.LayoutTypeTable,
+		Positions:  []pdf.Position{{PageNumbers: []int{0}, Left: 50.0, Right: 500.0, Top: 100.0, Bottom: 300.0}},
+	}
+	img, ok := matchTableImage(sec, tableByRegion)
+	if !ok {
+		t.Fatal("expected match by Positions")
+	}
+	if img != "img_base64_positions" {
+		t.Errorf("got %q, want img_base64_positions", img)
+	}
+}
+
+func TestMatchTableImage_FallbackToRegion(t *testing.T) {
+	tableByRegion := map[string]string{
+		"0_80.0_520.0_200.0_400.0": "img_base64_region",
+	}
+	sec := &pdf.Section{
+		LayoutType: pdf.LayoutTypeTable,
+		Positions:  nil,
+		TableItem:  &pdf.TableItem{RegionLeft: 80.0, RegionRight: 520.0, RegionTop: 200.0, RegionBottom: 400.0},
+	}
+	img, ok := matchTableImage(sec, tableByRegion)
+	if !ok {
+		t.Fatal("expected match by Region fallback")
+	}
+	if img != "img_base64_region" {
+		t.Errorf("got %q, want img_base64_region", img)
+	}
+}
+
+func TestMatchTableImage_NoMatch(t *testing.T) {
+	tableByRegion := map[string]string{"0_10.0_20.0_30.0_40.0": "no_chance"}
+	sec := &pdf.Section{
+		LayoutType: pdf.LayoutTypeTable,
+		Positions:  []pdf.Position{{PageNumbers: []int{0}, Left: 100, Right: 200, Top: 300, Bottom: 400}},
+	}
+	_, ok := matchTableImage(sec, tableByRegion)
+	if ok {
+		t.Error("expected no match")
+	}
+}
+
+func TestMatchTableImage_EmptySection(t *testing.T) {
+	sec := &pdf.Section{LayoutType: pdf.LayoutTypeTable}
+	_, ok := matchTableImage(sec, map[string]string{"x": "y"})
+	if ok {
+		t.Error("expected no match for empty section")
+	}
+}
--- a/internal/deepdoc/parser/pdf/test_helpers_test.go
+++ b/internal/deepdoc/parser/pdf/test_helpers_test.go
@@ -1,4 +1,4 @@
-package parser
+package pdf

 import (
 	"image"
@@ -6,48 +6,6 @@ import (
 	pdf "ragflow/internal/deepdoc/parser/pdf/type"
 )

-// ── mockEngine: minimal pdf.PDFEngine stub for unit tests ─────────────
-
-type mockEngine struct {
-	chars     map[int][]pdf.TextChar
-	pageCount int
-	renderW   int
-	renderH   int
-}
-
-func (m *mockEngine) ExtractChars(pg int) ([]pdf.TextChar, error) {
-	return m.chars[pg], nil
-}
-func (m *mockEngine) RenderPage(pg int, dpi float64) ([]byte, error) {
-	w, h := m.renderW, m.renderH
-	if w <= 0 {
-		w = 595
-	}
-	if h <= 0 {
-		h = 842
-	}
-	return nil, nil
-}
-func (m *mockEngine) RenderPageImage(pg int, dpi float64) (image.Image, error) {
-	w, h := m.renderW, m.renderH
-	if w <= 0 {
-		w = 100
-	}
-	if h <= 0 {
-		h = 100
-	}
-	return image.NewRGBA(image.Rect(0, 0, w, h)), nil
-}
-func (m *mockEngine) PageCount() (int, error) {
-	if m.pageCount <= 0 {
-		return 1, nil
-	}
-	return m.pageCount, nil
-}
-func (m *mockEngine) RawData() []byte                  { return nil }
-func (m *mockEngine) Close() error                     { return nil }
-func (m *mockEngine) Outlines() ([]pdf.Outline, error) { return nil, nil }
-
 // ── testPageImg: small test image for ocrMergeChars tests ─────────────
 // 90×120 px at 216 DPI → 30×40 pt in PDF space after /3.0 scaling.

--- a/internal/deepdoc/parser/pdf/text_dump_test.go
+++ b/internal/deepdoc/parser/pdf/text_dump_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && manual

-package parser
+package pdf

 import (
 	"context"
@@ -66,8 +66,8 @@ func TestDumpTextOutput(t *testing.T) {
 		}

 		cfg := pdf.DefaultParserConfig()
-		p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
-		result, err := p.Parse(context.Background(), eng)
+		p := NewParser(cfg)
+		result, err := p.ParseRaw(context.Background(), eng, &MockDocAnalyzer{Healthy: true})
 		eng.Close()
 		if err != nil {
 			t.Logf("[%d/%d] %s — parse error: %v", i+1, count, name, err)
--- a/internal/deepdoc/parser/pdf/type/types.go
+++ b/internal/deepdoc/parser/pdf/type/types.go
@@ -1,320 +1,56 @@
-// Package pdftypes provides shared types, interfaces, and constants for the
-// PDF parser pipeline. It has zero dependencies on sibling packages so that
-// sub-packages (tables, geometry, etc.) can import it without circular imports.
+// Package pdftype provides PDF-specific types and re-exports shared types
+// from the doctype package via Go type aliases.  Existing PDF parser code
+// that imports this package continues to work without changes.
 package pdftype

-import (
-	"context"
-	"image"
-	"unicode"
-)
+import doctype "ragflow/internal/deepdoc/parser/type"

-// ── Pipeline types ────────────────────────────────────────────────────────
+// ── Re-export shared types via aliases ─────────────────────────────────────

-// PipelineMetrics records diagnostic counts at each pipeline stage.
-type PipelineMetrics struct {
-	BoxesInitial   int
-	BoxesTextMerge int
-	BoxesVertMerge int
-	BoxesFinal     int
-	TablesCount    int
-}
+type PipelineMetrics = doctype.PipelineMetrics
+type ParseResult = doctype.ParseResult
+type DLAPageRegions = doctype.DLAPageRegions
+type TSRRawCell = doctype.TSRRawCell
+type TextChar = doctype.TextChar
+type TextBox = doctype.TextBox
+type Position = doctype.Position
+type Section = doctype.Section
+type TableItem = doctype.TableItem
+type TSRCell = doctype.TSRCell
+type DLARegion = doctype.DLARegion
+type OCRBox = doctype.OCRBox
+type OCRText = doctype.OCRText
+type ParserConfig = doctype.ParserConfig
+type DocAnalyzer = doctype.DocAnalyzer
+type Outline = doctype.Outline
+type PDFEngine = doctype.PDFEngine
+type Tokenizer = doctype.Tokenizer
+type SampleFunc = doctype.SampleFunc
+type TableBuilder = doctype.TableBuilder
+type Rectangular = doctype.Rectangular

-// ParseResult encapsulates all outputs from a single Parse() call.
-type ParseResult struct {
-	Sections   []Section
-	Tables     []TableItem
-	PageImages map[int]image.Image
-	Metrics    PipelineMetrics
-	Outlines   []Outline // PDF outlines/bookmarks extracted from the document
+// ── Re-export constants ────────────────────────────────────────────────────

-	DLADebug []DLAPageRegions
-	TSRDebug []TSRRawCell
-}
-
-// Figures returns all sections with LayoutType "figure".
-// Computed on demand from Sections — no stored field.
-func (r *ParseResult) Figures() []Section {
-	return CollectFigures(r.Sections)
-}
-
-// DLAPageRegions holds DLA layout regions for one page.
-type DLAPageRegions struct {
-	Page    int
-	Regions []DLARegion
-}
-
-// TSRRawCell holds a raw TSR cell before row/column grouping.
-type TSRRawCell struct {
-	TableIndex int     `json:"table_index"`
-	Page       int     `json:"page"`
-	Label      string  `json:"label"`
-	X0         float64 `json:"x0"`
-	Y0         float64 `json:"y0"`
-	X1         float64 `json:"x1"`
-	Y1         float64 `json:"y1"`
-	Text       string  `json:"text"`
-}
-
-// ── Character and text box types ──────────────────────────────────────────
-
-// TextChar represents a single character extracted from a PDF page.
-type TextChar struct {
-	X0, X1      float64
-	Top, Bottom float64
-	Text        string
-	FontName    string
-	FontSize    float64
-	PageNumber  int
-	LayoutType  string
-	LayoutNo    string
-	ColID       int
-	R           int
-}
-
-func (c TextChar) Bounds() (float64, float64, float64, float64) {
-	return c.X0, c.Top, c.X1, c.Bottom
-}
-
-// TextBox represents a rectangular region of text on a PDF page.
-type TextBox struct {
-	X0, X1      float64
-	Top, Bottom float64
-	Text        string
-	PageNumber  int
-	LayoutType  string
-	LayoutNo    string
-	ColID       int
-	R           int
-	// Post-TSR table annotation fields (Python: R/H/C/SP tags)
-	RTop, RBott   float64
-	HTop, HBott   float64
-	HLeft, HRight float64
-	H             int
-	C             int
-	CLeft, CRight float64
-	SP            int
-}
-
-func (b TextBox) Bounds() (float64, float64, float64, float64) {
-	return b.X0, b.Top, b.X1, b.Bottom
-}
-
-// ── Position and section types ────────────────────────────────────────────
-
-// Position represents a parsed position tag from @@...## format.
-type Position struct {
-	PageNumbers []int
-	Left        float64
-	Right       float64
-	Top         float64
-	Bottom      float64
-}
-
-// Section represents a text segment with its spatial position on a PDF page.
-type Section struct {
-	Text        string
-	PositionTag string
-	LayoutType  string
-	DocTypeKwd  string // "text"/"table"/"image" — assigned during post-processing
-	Positions   []Position
-	TableItem   *TableItem
-	Image       string // base64-encoded cropped page image
-}
-
-// SectionsByPage returns a slice of sections on the given page.
-func SectionsByPage(sections []Section, page int) []Section {
-	var out []Section
-	for _, s := range sections {
-		for _, p := range s.Positions {
-			for _, pn := range p.PageNumbers {
-				if pn == page {
-					out = append(out, s)
-					break
-				}
-			}
-		}
-	}
-	return out
-}
-
-// CollectFigures returns all sections with LayoutType "figure".
-func CollectFigures(sections []Section) []Section {
-	if sections == nil {
-		return nil
-	}
-	figures := make([]Section, 0)
-	for _, s := range sections {
-		if s.LayoutType == LayoutTypeFigure {
-			figures = append(figures, s)
-		}
-	}
-	return figures
-}
-
-// ── Table types ───────────────────────────────────────────────────────────
-
-// TableItem represents a detected table or figure region.
-type TableItem struct {
-	ImageB64  string
-	Rows      [][]string
-	Cells     []TSRCell
-	Positions []Position
-	Scale     float64
-	CropOffX  float64
-	CropOffY  float64
-	Caption   string
-
-	RegionLeft, RegionRight, RegionTop, RegionBottom float64
-	NoMerge                                          bool
-	Grid                                             [][]TSRCell
-}
-
-// TSRCell represents one table cell from TSR.
-type TSRCell struct {
-	X0, Y0, X1, Y1 float64
-	Text           string
-	Label          string
-}
-
-func (c TSRCell) Bounds() (float64, float64, float64, float64) {
-	return c.X0, c.Y0, c.X1, c.Y1
-}
-
-// ── DeepDoc vision types ─────────────────────────────────────────────────
-
-// DLARegion represents one detected layout region.
-type DLARegion struct {
-	X0, Y0, X1, Y1 float64
-	Label          string
-	Confidence     float64
-}
-
-func (r DLARegion) Bounds() (float64, float64, float64, float64) {
-	return r.X0, r.Y0, r.X1, r.Y1
-}
-
-// OCRBox represents a detected text region from DeepDoc OCR detection.
-type OCRBox struct {
-	X0, Y0, X1, Y1, X2, Y2, X3, Y3 float64
-}
-
-// OCRText represents recognized text with confidence from DeepDoc OCR rec.
-type OCRText struct {
-	Text       string
-	Confidence float64
-}
-
-// ── Parser configuration ──────────────────────────────────────────────────
-
-// ParserConfig holds parser configuration.
-type ParserConfig struct {
-	Zoom               float64
-	FromPage           int
-	ToPage             int
-	TableContextSize   int
-	ImageContextSize   int
-	AutoRotateTables   *bool
-	SeparateTablesFigs bool
-	SortByTop          bool
-	BatchSize          int
-	SkipOCR            bool
-	MaxOCRConcurrency  int
-	TableBuilder       TableBuilder
-}
-
-// DefaultParserConfig returns a ParserConfig with sensible defaults.
-func DefaultParserConfig() ParserConfig {
-	return ParserConfig{
-		Zoom:               3,
-		FromPage:           0,
-		ToPage:             -1,
-		BatchSize:          50,
-		TableContextSize:   0,
-		ImageContextSize:   0,
-		SeparateTablesFigs: false,
-	}
-}
-
-// DlaDPI is the DPI used for rendering page images for DeepDoc DLA/OCR.
-const DlaDPI = 216
-
-// DlaScale is the scale factor from PDF points (72 DPI) to DLA image space.
-const DlaScale = DlaDPI / 72.0
-
-// ── Layout type constants ─────────────────────────────────────────────────
+const DlaDPI = doctype.DlaDPI
+const DlaScale = doctype.DlaScale

 const (
-	LayoutTypeText      = "text"
-	LayoutTypeTable     = "table"
-	LayoutTypeFigure    = "figure"
-	LayoutTypeEquation  = "equation"
-	LayoutTypeTitle     = "title"
-	LayoutTypeReference = "reference"
-	LayoutTypeFooter    = "footer"
-	LayoutTypeHeader    = "header"
-
-	DLALabelFigureCaption = "figure caption"
-	DLALabelTableCaption  = "table caption"
+	LayoutTypeText        = doctype.LayoutTypeText
+	LayoutTypeTable       = doctype.LayoutTypeTable
+	LayoutTypeFigure      = doctype.LayoutTypeFigure
+	LayoutTypeEquation    = doctype.LayoutTypeEquation
+	LayoutTypeTitle       = doctype.LayoutTypeTitle
+	LayoutTypeReference   = doctype.LayoutTypeReference
+	LayoutTypeFooter      = doctype.LayoutTypeFooter
+	LayoutTypeHeader      = doctype.LayoutTypeHeader
+	DLALabelFigureCaption = doctype.DLALabelFigureCaption
+	DLALabelTableCaption  = doctype.DLALabelTableCaption
 )

-// ── Interfaces ────────────────────────────────────────────────────────────
+// ── Re-export functions and variables ──────────────────────────────────────

-// DocAnalyzer abstracts DeepDoc vision operations.
-type DocAnalyzer interface {
-	DLA(ctx context.Context, pageImage image.Image) ([]DLARegion, error)
-	TSR(ctx context.Context, cropped image.Image) ([]TSRCell, error)
-	OCRDetect(ctx context.Context, cropped image.Image) ([]OCRBox, error)
-	OCRRecognize(ctx context.Context, cropped image.Image) ([]OCRText, error)
-	OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]OCRText, []error)
-	Health() bool
-}
-
-// ── Outline ────────────────────────────────────────────────────────────
-
-// Outline represents one entry in a PDF's document outline (table of contents).
-// Python: extract_pdf_outlines() in deepdoc/parser/utils.py
-type Outline struct {
-	Title      string
-	Level      int
-	PageNumber int // 1-indexed, matching Python
-}
-
-// PDFEngine abstracts page extraction capabilities.
-type PDFEngine interface {
-	ExtractChars(pageNum int) ([]TextChar, error)
-	RenderPage(pageNum int, dpi float64) ([]byte, error)
-	RenderPageImage(pageNum int, dpi float64) (image.Image, error)
-	RawData() []byte
-	PageCount() (int, error)
-	Outlines() ([]Outline, error)
-	Close() error
-}
-
-// Tokenizer provides text tokenization matching rag_tokenizer.
-type Tokenizer interface {
-	Tag(token string) string
-}
-
-// SampleFunc samples up to n characters from a page's chars.
-type SampleFunc func(chars []TextChar, n int) string
-
-// TableBuilder encapsulates TSR model-specific cell detection and grouping.
-type TableBuilder interface {
-	Name() string
-	DetectCells(ctx context.Context, cropped image.Image) ([]TSRCell, error)
-	GroupCells(cells []TSRCell) [][]TSRCell
-}
-
-// Rectangular is any 2D axis-aligned rectangle that can report its bounds.
-type Rectangular interface {
-	Bounds() (x0, y0, x1, y1 float64)
-}
-
-// IsCJK reports whether r is a CJK character.
-func IsCJK(r rune) bool {
-	return unicode.Is(unicode.Han, r) ||
-		unicode.Is(unicode.Hiragana, r) ||
-		unicode.Is(unicode.Katakana, r) ||
-		unicode.Is(unicode.Hangul, r)
-}
+var (
+	CollectFigures      = doctype.CollectFigures
+	DefaultParserConfig = doctype.DefaultParserConfig
+	IsCJK               = doctype.IsCJK
+)
--- a/internal/deepdoc/parser/pdf/util/geometry.go
+++ b/internal/deepdoc/parser/pdf/util/geometry.go
@@ -131,34 +131,6 @@ func OverlapX(a, b pdf.Rectangular) float64 {
 	return overlap / minWidth
 }

-// SortXByPage sorts boxes by page_number, then x0, then top.
-// After sorting, corrects for same-page boxes that have nearly the same x0
-// but inverted top ordering (a layout artifact).
-//
-// Python: pdf_parser.py:178 sort_X_by_page()
-func SortXByPage(boxes []pdf.TextBox, threshold float64) []pdf.TextBox {
-	sort.Slice(boxes, func(i, j int) bool {
-		if boxes[i].PageNumber != boxes[j].PageNumber {
-			return boxes[i].PageNumber < boxes[j].PageNumber
-		}
-		if boxes[i].X0 != boxes[j].X0 {
-			return boxes[i].X0 < boxes[j].X0
-		}
-		return boxes[i].Top < boxes[j].Top
-	})
-
-	for i := len(boxes) - 1; i >= 1; i-- {
-		for j := i - 1; j >= 0; j-- {
-			if math.Abs(boxes[j+1].X0-boxes[j].X0) < threshold &&
-				boxes[j+1].Top < boxes[j].Top &&
-				boxes[j+1].PageNumber == boxes[j].PageNumber {
-				boxes[j], boxes[j+1] = boxes[j+1], boxes[j]
-			}
-		}
-	}
-	return boxes
-}
-
 // MedianCharHeight computes the median character height for a page,
 // matching Python's np.median(char height) in __images__ (pdf_parser.py:1552).
 // Used as a reference unit for vertical spacing decisions.
--- a/internal/deepdoc/parser/pdf/util/geometry_test.go
+++ b/internal/deepdoc/parser/pdf/util/geometry_test.go
@@ -49,22 +49,6 @@ func TestYDis(t *testing.T) {
 	}
 }

-func TestSortXByPage(t *testing.T) {
-	boxes := []pdf.TextBox{
-		{PageNumber: 1, X0: 100, Top: 50, Text: "C"},
-		{PageNumber: 1, X0: 50, Top: 100, Text: "A"},
-		{PageNumber: 1, X0: 50, Top: 30, Text: "B"},
-		{PageNumber: 0, X0: 0, Top: 0, Text: "D"},
-	}
-	result := SortXByPage(boxes, 3)
-	if result[0].Text != "D" {
-		t.Errorf("first should be page 0: got %q", result[0].Text)
-	}
-	if result[1].Text != "B" || result[2].Text != "A" {
-		t.Errorf("page 1 ordering wrong: %q, %q", result[1].Text, result[2].Text)
-	}
-}
-
 func TestOverlapX(t *testing.T) {
 	b1 := pdf.TextBox{X0: 50, X1: 200}
 	b2 := pdf.TextBox{X0: 100, X1: 250}
--- a/internal/deepdoc/parser/pdf/ycoord_test.go
+++ b/internal/deepdoc/parser/pdf/ycoord_test.go
@@ -1,6 +1,6 @@
 //go:build cgo && manual

-package parser
+package pdf

 import (
 	"math"
@@ -8,6 +8,7 @@ import (
 	"path/filepath"
 	"testing"

+	lyt "ragflow/internal/deepdoc/parser/pdf/layout"
 	"ragflow/internal/deepdoc/parser/pdf/pdfoxide"
 	pdf "ragflow/internal/deepdoc/parser/pdf/type"
 )
@@ -41,7 +42,7 @@ func TestYCoord_SameLineCharsHaveEqualBottom(t *testing.T) {
 		t.Fatal("no chars")
 	}

-	lines := groupCharsToLines(chars, false)
+	lines := lyt.GroupCharsToLines(chars, false)
 	for li, line := range lines {
 		if len(line) <= 1 {
 			continue
--- a/internal/deepdoc/parser/type/types.go
+++ b/internal/deepdoc/parser/type/types.go
@@ -0,0 +1,304 @@
+// Package doctype provides shared types, interfaces, and constants for the
+// deepdoc parser pipeline.  All format-specific parsers (pdf, docx, xlsx, etc.)
+// share these definitions.  The package has zero dependencies on sibling
+// packages so that any sub-package can import it without circular imports.
+package doctype
+
+import (
+	"context"
+	"image"
+	"unicode"
+)
+
+// ── Pipeline types ────────────────────────────────────────────────────────
+
+// PipelineMetrics records diagnostic counts at each pipeline stage.
+type PipelineMetrics struct {
+	BoxesInitial   int
+	BoxesTextMerge int
+	BoxesVertMerge int
+	BoxesFinal     int
+	TablesCount    int
+}
+
+// ParseResult encapsulates all outputs from a single Parse() call.
+type ParseResult struct {
+	Sections   []Section
+	Tables     []TableItem
+	PageImages map[int]image.Image
+	Metrics    PipelineMetrics
+	Outlines   []Outline // PDF outlines/bookmarks extracted from the document
+
+	DLADebug []DLAPageRegions
+	TSRDebug []TSRRawCell
+}
+
+// Figures returns all sections with LayoutType "figure".
+// Computed on demand from Sections — no stored field.
+func (r *ParseResult) Figures() []Section {
+	return CollectFigures(r.Sections)
+}
+
+// DLAPageRegions holds DLA layout regions for one page.
+type DLAPageRegions struct {
+	Page    int
+	Regions []DLARegion
+}
+
+// TSRRawCell holds a raw TSR cell before row/column grouping.
+type TSRRawCell struct {
+	TableIndex int     `json:"table_index"`
+	Page       int     `json:"page"`
+	Label      string  `json:"label"`
+	X0         float64 `json:"x0"`
+	Y0         float64 `json:"y0"`
+	X1         float64 `json:"x1"`
+	Y1         float64 `json:"y1"`
+	Text       string  `json:"text"`
+}
+
+// ── Character and text box types ──────────────────────────────────────────
+
+// TextChar represents a single character extracted from a PDF page.
+type TextChar struct {
+	X0, X1      float64
+	Top, Bottom float64
+	Text        string
+	FontName    string
+	FontSize    float64
+	PageNumber  int
+	LayoutType  string
+	LayoutNo    string
+	ColID       int
+	R           int
+}
+
+func (c TextChar) Bounds() (float64, float64, float64, float64) {
+	return c.X0, c.Top, c.X1, c.Bottom
+}
+
+// TextBox represents a rectangular region of text on a PDF page.
+type TextBox struct {
+	X0, X1      float64
+	Top, Bottom float64
+	Text        string
+	PageNumber  int
+	LayoutType  string
+	LayoutNo    string
+	ColID       int
+	R           int
+	// Post-TSR table annotation fields (Python: R/H/C/SP tags)
+	RTop, RBott   float64
+	HTop, HBott   float64
+	HLeft, HRight float64
+	H             int
+	C             int
+	CLeft, CRight float64
+	SP            int
+}
+
+func (b TextBox) Bounds() (float64, float64, float64, float64) {
+	return b.X0, b.Top, b.X1, b.Bottom
+}
+
+// ── Position and section types ────────────────────────────────────────────
+
+// Position represents a parsed position tag from @@...## format.
+type Position struct {
+	PageNumbers []int
+	Left        float64
+	Right       float64
+	Top         float64
+	Bottom      float64
+}
+
+// Section represents a text segment with its spatial position on a PDF page.
+type Section struct {
+	Text        string
+	PositionTag string
+	LayoutType  string
+	DocTypeKwd  string // "text"/"table"/"image" — assigned during post-processing
+	Positions   []Position
+	TableItem   *TableItem
+	Image       string // base64-encoded cropped page image
+}
+
+// CollectFigures returns all sections with LayoutType "figure".
+func CollectFigures(sections []Section) []Section {
+	if sections == nil {
+		return nil
+	}
+	figures := make([]Section, 0)
+	for _, s := range sections {
+		if s.LayoutType == LayoutTypeFigure {
+			figures = append(figures, s)
+		}
+	}
+	return figures
+}
+
+// ── Table types ───────────────────────────────────────────────────────────
+
+// TableItem represents a detected table or figure region.
+type TableItem struct {
+	ImageB64  string
+	Rows      [][]string
+	Cells     []TSRCell
+	Positions []Position
+	Scale     float64
+	CropOffX  float64
+	CropOffY  float64
+	Caption   string
+
+	RegionLeft, RegionRight, RegionTop, RegionBottom float64
+	NoMerge                                          bool
+	Grid                                             [][]TSRCell
+}
+
+// TSRCell represents one table cell from TSR.
+type TSRCell struct {
+	X0, Y0, X1, Y1 float64
+	Text           string
+	Label          string
+}
+
+func (c TSRCell) Bounds() (float64, float64, float64, float64) {
+	return c.X0, c.Y0, c.X1, c.Y1
+}
+
+// ── DeepDoc vision types ─────────────────────────────────────────────────
+
+// DLARegion represents one detected layout region.
+type DLARegion struct {
+	X0, Y0, X1, Y1 float64
+	Label          string
+	Confidence     float64
+}
+
+func (r DLARegion) Bounds() (float64, float64, float64, float64) {
+	return r.X0, r.Y0, r.X1, r.Y1
+}
+
+// OCRBox represents a detected text region from DeepDoc OCR detection.
+type OCRBox struct {
+	X0, Y0, X1, Y1, X2, Y2, X3, Y3 float64
+}
+
+// OCRText represents recognized text with confidence from DeepDoc OCR rec.
+type OCRText struct {
+	Text       string
+	Confidence float64
+}
+
+// ── Parser configuration ──────────────────────────────────────────────────
+
+// ParserConfig holds parser configuration.
+type ParserConfig struct {
+	Zoom               float64
+	FromPage           int
+	ToPage             int
+	TableContextSize   int
+	ImageContextSize   int
+	AutoRotateTables   *bool
+	SeparateTablesFigs bool
+	SortByTop          bool
+	BatchSize          int
+	SkipOCR            bool
+	MaxOCRConcurrency  int
+}
+
+// DefaultParserConfig returns a ParserConfig with sensible defaults.
+func DefaultParserConfig() ParserConfig {
+	return ParserConfig{
+		Zoom:               3,
+		FromPage:           0,
+		ToPage:             -1,
+		BatchSize:          50,
+		TableContextSize:   0,
+		ImageContextSize:   0,
+		SeparateTablesFigs: false,
+	}
+}
+
+// DlaDPI is the DPI used for rendering page images for DeepDoc DLA/OCR.
+const DlaDPI = 216
+
+// DlaScale is the scale factor from PDF points (72 DPI) to DLA image space.
+const DlaScale = DlaDPI / 72.0
+
+// ── Layout type constants ─────────────────────────────────────────────────
+
+const (
+	LayoutTypeText      = "text"
+	LayoutTypeTable     = "table"
+	LayoutTypeFigure    = "figure"
+	LayoutTypeEquation  = "equation"
+	LayoutTypeTitle     = "title"
+	LayoutTypeReference = "reference"
+	LayoutTypeFooter    = "footer"
+	LayoutTypeHeader    = "header"
+
+	DLALabelFigureCaption = "figure caption"
+	DLALabelTableCaption  = "table caption"
+)
+
+// ── Interfaces ────────────────────────────────────────────────────────────
+
+// DocAnalyzer abstracts DeepDoc vision operations.
+type DocAnalyzer interface {
+	DLA(ctx context.Context, pageImage image.Image) ([]DLARegion, error)
+	TSR(ctx context.Context, cropped image.Image) ([]TSRCell, error)
+	OCRDetect(ctx context.Context, cropped image.Image) ([]OCRBox, error)
+	OCRRecognize(ctx context.Context, cropped image.Image) ([]OCRText, error)
+	OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]OCRText, []error)
+	Health() bool
+}
+
+// ── Outline ────────────────────────────────────────────────────────────
+
+// Outline represents one entry in a PDF's document outline (table of contents).
+// Python: extract_pdf_outlines() in deepdoc/parser/utils.py
+type Outline struct {
+	Title      string
+	Level      int
+	PageNumber int // 1-indexed, matching Python
+}
+
+// PDFEngine abstracts page extraction capabilities.
+type PDFEngine interface {
+	ExtractChars(pageNum int) ([]TextChar, error)
+	RenderPage(pageNum int, dpi float64) ([]byte, error)
+	RenderPageImage(pageNum int, dpi float64) (image.Image, error)
+	RawData() []byte
+	PageCount() (int, error)
+	Outlines() ([]Outline, error)
+	Close() error
+}
+
+// Tokenizer provides text tokenization matching rag_tokenizer.
+type Tokenizer interface {
+	Tag(token string) string
+}
+
+// SampleFunc samples up to n characters from a page's chars.
+type SampleFunc func(chars []TextChar, n int) string
+
+// TableBuilder encapsulates TSR model-specific cell detection and grouping.
+type TableBuilder interface {
+	Name() string
+	DetectCells(ctx context.Context, cropped image.Image) ([]TSRCell, error)
+	GroupCells(cells []TSRCell) [][]TSRCell
+}
+
+// Rectangular is any 2D axis-aligned rectangle that can report its bounds.
+type Rectangular interface {
+	Bounds() (x0, y0, x1, y1 float64)
+}
+
+// IsCJK reports whether r is a CJK character.
+func IsCJK(r rune) bool {
+	return unicode.Is(unicode.Han, r) ||
+		unicode.Is(unicode.Hiragana, r) ||
+		unicode.Is(unicode.Katakana, r) ||
+		unicode.Is(unicode.Hangul, r)
+}