mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-04 18:45:38 +08:00
Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve? Http API based on onnx model. pdf_parser.py to golang ### Type of change - [x] Refactoring
This commit is contained in:
90
internal/deepdoc/parser/pdf/python_char_adapter.go
Normal file
90
internal/deepdoc/parser/pdf/python_char_adapter.go
Normal file
@@ -0,0 +1,90 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"image"
|
||||
"os"
|
||||
)
|
||||
|
||||
// PythonCharEngine implements PDFEngine by loading chars from a
|
||||
// charspy/{pdf}.json file exported by dump_py_results.py.
|
||||
// It is used for pipeline parity testing — same input chars as Python,
|
||||
// so any difference in pipeline output is a Go pipeline logic bug.
|
||||
type PythonCharEngine struct {
|
||||
chars map[int][]TextChar // pageNum → chars
|
||||
pages int
|
||||
}
|
||||
|
||||
// LoadPythonChars loads chars from a charspy/{name}.json file.
|
||||
func LoadPythonChars(jsonPath string) (*PythonCharEngine, error) {
|
||||
data, err := os.ReadFile(jsonPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read charspy json: %w", err)
|
||||
}
|
||||
var wrapper struct {
|
||||
Pages [][]struct {
|
||||
Text string `json:"text"`
|
||||
X0 float64 `json:"x0"`
|
||||
X1 float64 `json:"x1"`
|
||||
Top float64 `json:"top"`
|
||||
Bottom float64 `json:"bottom"`
|
||||
FontName string `json:"fontname"`
|
||||
Size float64 `json:"size"`
|
||||
} `json:"pages"`
|
||||
}
|
||||
if err := json.Unmarshal(data, &wrapper); err != nil {
|
||||
return nil, fmt.Errorf("parse charspy json: %w", err)
|
||||
}
|
||||
|
||||
chars := make(map[int][]TextChar, len(wrapper.Pages))
|
||||
for pg, pageChars := range wrapper.Pages {
|
||||
result := make([]TextChar, len(pageChars))
|
||||
for i, c := range pageChars {
|
||||
result[i] = TextChar{
|
||||
Text: c.Text,
|
||||
X0: c.X0,
|
||||
X1: c.X1,
|
||||
Top: c.Top,
|
||||
Bottom: c.Bottom,
|
||||
FontName: c.FontName,
|
||||
FontSize: c.Size,
|
||||
PageNumber: pg,
|
||||
}
|
||||
}
|
||||
chars[pg] = result
|
||||
}
|
||||
return &PythonCharEngine{chars: chars, pages: len(wrapper.Pages)}, nil
|
||||
}
|
||||
|
||||
// ExtractChars returns all characters for the given page (0-indexed).
|
||||
func (e *PythonCharEngine) ExtractChars(pageNum int) ([]TextChar, error) {
|
||||
if pageNum < 0 || pageNum >= e.pages {
|
||||
return nil, fmt.Errorf("page %d out of range [0, %d)", pageNum, e.pages)
|
||||
}
|
||||
return e.chars[pageNum], nil
|
||||
}
|
||||
|
||||
// RenderPage returns a 1x1 placeholder PNG (not used in parity tests).
|
||||
func (e *PythonCharEngine) RenderPage(pageNum int, dpi float64) ([]byte, error) {
|
||||
return nil, fmt.Errorf("PythonCharEngine: RenderPage not supported")
|
||||
}
|
||||
|
||||
// RenderPageImage returns a 1x1 placeholder image (not used in parity tests).
|
||||
func (e *PythonCharEngine) RenderPageImage(pageNum int, dpi float64) (image.Image, error) {
|
||||
return nil, fmt.Errorf("PythonCharEngine: RenderPageImage not supported")
|
||||
}
|
||||
|
||||
// PageCount returns the number of pages.
|
||||
func (e *PythonCharEngine) PageCount() (int, error) {
|
||||
return e.pages, nil
|
||||
}
|
||||
|
||||
// RawData returns nil — this engine only supplies pre-loaded chars
|
||||
// for pipeline parity tests and does not hold PDF bytes.
|
||||
func (e *PythonCharEngine) RawData() []byte { return nil }
|
||||
|
||||
// Close is a no-op.
|
||||
func (e *PythonCharEngine) Close() error {
|
||||
return nil
|
||||
}
|
||||
Reference in New Issue
Block a user