Files
ragflow/internal/deepdoc/parser/pdf/python_char_adapter.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

HTTP API based on an ONNX model.
Migrate pdf_parser.py to Go.

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

91 lines
2.7 KiB
Go

package parser
import (
"encoding/json"
"fmt"
"image"
"os"
)
// PythonCharEngine implements PDFEngine by serving characters that were
// loaded ahead of time from a charspy/{pdf}.json file exported by
// dump_py_results.py, instead of extracting them from a PDF.
//
// It is used for pipeline parity testing: the Go pipeline receives the same
// input chars as the Python pipeline, so any difference in pipeline output
// is a Go pipeline logic bug.
type PythonCharEngine struct {
// chars maps a 0-based page index to the characters loaded for that page.
chars map[int][]TextChar
// pages is the number of pages found in the loaded JSON file.
pages int
}
// LoadPythonChars reads a charspy/{name}.json file and builds a
// PythonCharEngine from it.
//
// The file is expected to hold a JSON object whose "pages" array has one
// entry per page; each entry is an array of character objects with the keys
// "text", "x0", "x1", "top", "bottom", "fontname" and "size". The position of
// a page in that array becomes both its key in the engine and the PageNumber
// of every TextChar on it, so page numbers are 0-based. A file without a
// "pages" key yields an engine with zero pages.
//
// It returns a nil engine and a wrapped error when the file cannot be read or
// cannot be decoded. The parse error names jsonPath so that a malformed file
// can be identified when many charspy files are loaded in one run.
func LoadPythonChars(jsonPath string) (*PythonCharEngine, error) {
	raw, err := os.ReadFile(jsonPath)
	if err != nil {
		// The error returned by os.ReadFile already names the path.
		return nil, fmt.Errorf("read charspy json: %w", err)
	}

	// charRecord mirrors one character object of the exported JSON.
	type charRecord struct {
		Text     string  `json:"text"`
		X0       float64 `json:"x0"`
		X1       float64 `json:"x1"`
		Top      float64 `json:"top"`
		Bottom   float64 `json:"bottom"`
		FontName string  `json:"fontname"`
		Size     float64 `json:"size"`
	}
	var doc struct {
		Pages [][]charRecord `json:"pages"`
	}
	if err := json.Unmarshal(raw, &doc); err != nil {
		// json errors carry no file name, so add it here.
		return nil, fmt.Errorf("parse charspy json %q: %w", jsonPath, err)
	}

	byPage := make(map[int][]TextChar, len(doc.Pages))
	for pageIdx, records := range doc.Pages {
		// make keeps a page without characters as an empty, non-nil slice.
		converted := make([]TextChar, len(records))
		for i, rec := range records {
			converted[i] = TextChar{
				Text:       rec.Text,
				X0:         rec.X0,
				X1:         rec.X1,
				Top:        rec.Top,
				Bottom:     rec.Bottom,
				FontName:   rec.FontName,
				FontSize:   rec.Size,
				PageNumber: pageIdx,
			}
		}
		byPage[pageIdx] = converted
	}

	return &PythonCharEngine{chars: byPage, pages: len(doc.Pages)}, nil
}
// ExtractChars returns the characters loaded for page pageNum, where pages
// are numbered from 0. A pageNum outside [0, page count) yields a nil slice
// and an error. The returned slice is the engine's own slice, not a copy.
func (e *PythonCharEngine) ExtractChars(pageNum int) ([]TextChar, error) {
	if inRange := 0 <= pageNum && pageNum < e.pages; !inRange {
		return nil, fmt.Errorf("page %d out of range [0, %d)", pageNum, e.pages)
	}
	pageChars := e.chars[pageNum]
	return pageChars, nil
}
// RenderPage is not supported by this engine: it ignores pageNum and dpi and
// always returns nil bytes together with an error. Rendering is not used in
// parity tests.
func (e *PythonCharEngine) RenderPage(pageNum int, dpi float64) ([]byte, error) {
	var rendered []byte // stays nil; no page is ever rendered
	return rendered, fmt.Errorf("PythonCharEngine: RenderPage not supported")
}
// RenderPageImage is not supported by this engine: it ignores pageNum and dpi
// and always returns a nil image together with an error. Rendering is not
// used in parity tests.
func (e *PythonCharEngine) RenderPageImage(pageNum int, dpi float64) (image.Image, error) {
	var rendered image.Image // stays nil; no page is ever rendered
	return rendered, fmt.Errorf("PythonCharEngine: RenderPageImage not supported")
}
// PageCount reports how many pages the engine holds. The error result is
// always nil.
func (e *PythonCharEngine) PageCount() (int, error) {
	count := e.pages
	return count, nil
}
// RawData always returns nil: the engine only supplies pre-loaded chars for
// pipeline parity tests and holds no PDF bytes.
func (e *PythonCharEngine) RawData() []byte {
	return nil
}
// Close does nothing and always returns nil; the engine holds no resources
// that need releasing.
func (e *PythonCharEngine) Close() error {
	var err error // never set: there is nothing that could fail
	return err
}