mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
### What problem does this PR solve? HTTP API based on an ONNX model. Ports pdf_parser.py to Go. ### Type of change - [x] Refactoring
91 lines
2.7 KiB
Go
91 lines
2.7 KiB
Go
package parser
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"image"
|
|
"os"
|
|
)
|
|
|
|
// PythonCharEngine implements PDFEngine by loading chars from a
|
|
// charspy/{pdf}.json file exported by dump_py_results.py.
|
|
// It is used for pipeline parity testing — same input chars as Python,
|
|
// so any difference in pipeline output is a Go pipeline logic bug.
|
|
type PythonCharEngine struct {
|
|
chars map[int][]TextChar // pageNum → chars
|
|
pages int
|
|
}
|
|
|
|
// LoadPythonChars loads chars from a charspy/{name}.json file.
|
|
func LoadPythonChars(jsonPath string) (*PythonCharEngine, error) {
|
|
data, err := os.ReadFile(jsonPath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read charspy json: %w", err)
|
|
}
|
|
var wrapper struct {
|
|
Pages [][]struct {
|
|
Text string `json:"text"`
|
|
X0 float64 `json:"x0"`
|
|
X1 float64 `json:"x1"`
|
|
Top float64 `json:"top"`
|
|
Bottom float64 `json:"bottom"`
|
|
FontName string `json:"fontname"`
|
|
Size float64 `json:"size"`
|
|
} `json:"pages"`
|
|
}
|
|
if err := json.Unmarshal(data, &wrapper); err != nil {
|
|
return nil, fmt.Errorf("parse charspy json: %w", err)
|
|
}
|
|
|
|
chars := make(map[int][]TextChar, len(wrapper.Pages))
|
|
for pg, pageChars := range wrapper.Pages {
|
|
result := make([]TextChar, len(pageChars))
|
|
for i, c := range pageChars {
|
|
result[i] = TextChar{
|
|
Text: c.Text,
|
|
X0: c.X0,
|
|
X1: c.X1,
|
|
Top: c.Top,
|
|
Bottom: c.Bottom,
|
|
FontName: c.FontName,
|
|
FontSize: c.Size,
|
|
PageNumber: pg,
|
|
}
|
|
}
|
|
chars[pg] = result
|
|
}
|
|
return &PythonCharEngine{chars: chars, pages: len(wrapper.Pages)}, nil
|
|
}
|
|
|
|
// ExtractChars returns all characters for the given page (0-indexed).
|
|
func (e *PythonCharEngine) ExtractChars(pageNum int) ([]TextChar, error) {
|
|
if pageNum < 0 || pageNum >= e.pages {
|
|
return nil, fmt.Errorf("page %d out of range [0, %d)", pageNum, e.pages)
|
|
}
|
|
return e.chars[pageNum], nil
|
|
}
|
|
|
|
// RenderPage returns a 1x1 placeholder PNG (not used in parity tests).
|
|
func (e *PythonCharEngine) RenderPage(pageNum int, dpi float64) ([]byte, error) {
|
|
return nil, fmt.Errorf("PythonCharEngine: RenderPage not supported")
|
|
}
|
|
|
|
// RenderPageImage returns a 1x1 placeholder image (not used in parity tests).
|
|
func (e *PythonCharEngine) RenderPageImage(pageNum int, dpi float64) (image.Image, error) {
|
|
return nil, fmt.Errorf("PythonCharEngine: RenderPageImage not supported")
|
|
}
|
|
|
|
// PageCount returns the number of pages.
|
|
func (e *PythonCharEngine) PageCount() (int, error) {
|
|
return e.pages, nil
|
|
}
|
|
|
|
// RawData returns nil — this engine only supplies pre-loaded chars
|
|
// for pipeline parity tests and does not hold PDF bytes.
|
|
func (e *PythonCharEngine) RawData() []byte { return nil }
|
|
|
|
// Close is a no-op.
|
|
func (e *PythonCharEngine) Close() error {
|
|
return nil
|
|
}
|