Files
ragflow/internal/deepdoc/parser/pdf/pdfium/pdfium.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

Http API based on onnx model.
pdf_parser.py to golang

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

166 lines
5.4 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Package pdfium renders PDF pages using the system's libpdfium.so
// (bundled with pypdfium2). It exists solely to replace pdf_oxide's
// RenderPageRaw for use cases where image quality matters for downstream
// OCR/DLA — pdf_oxide still handles all text/char/table extraction.
package pdfium
/*
#cgo LDFLAGS: -L/home/shenyushi/cc-workspace/ragflow/.venv/lib/python3.13/site-packages/pypdfium2_raw -lpdfium -lm -lpthread -ldl
#cgo linux LDFLAGS: -Wl,-rpath,/home/shenyushi/cc-workspace/ragflow/.venv/lib/python3.13/site-packages/pypdfium2_raw
#include <stdint.h>
#include <stdlib.h>
typedef struct FPDF_DOCUMENT__ { int unused; } *FPDF_DOCUMENT;
typedef struct FPDF_PAGE__ { int unused; } *FPDF_PAGE;
typedef struct FPDF_BITMAP__ { int unused; } *FPDF_BITMAP;
extern void FPDF_InitLibrary(void);
extern FPDF_DOCUMENT FPDF_LoadMemDocument(const void* data_buf, int size, const char* password);
extern void FPDF_CloseDocument(FPDF_DOCUMENT document);
extern int FPDF_GetPageCount(FPDF_DOCUMENT document);
extern FPDF_PAGE FPDF_LoadPage(FPDF_DOCUMENT document, int page_index);
extern void FPDF_ClosePage(FPDF_PAGE page);
extern double FPDF_GetPageWidth(FPDF_PAGE page);
extern double FPDF_GetPageHeight(FPDF_PAGE page);
extern FPDF_BITMAP FPDFBitmap_Create(int width, int height, int alpha);
extern void FPDFBitmap_Destroy(FPDF_BITMAP bitmap);
extern void FPDF_RenderPageBitmap(FPDF_BITMAP bitmap, FPDF_PAGE page,
int start_x, int start_y, int size_x, int size_y,
int rotate, int flags);
extern void* FPDFBitmap_GetBuffer(FPDF_BITMAP bitmap);
extern int FPDFBitmap_GetWidth(FPDF_BITMAP bitmap);
extern int FPDFBitmap_GetHeight(FPDF_BITMAP bitmap);
extern int FPDFBitmap_GetStride(FPDF_BITMAP bitmap);
*/
import "C"
import (
"fmt"
"image"
"image/color"
"math"
"sync"
"unsafe"
)
var initOnce sync.Once
// pdfiumMu serializes all pdfium C API access. pdfium is NOT thread-safe —
// concurrent calls to FPDF_LoadPage / FPDF_RenderPageBitmap corrupt the
// global heap, causing SIGSEGV. See TestPdfiumConcurrentSafety.
var pdfiumMu sync.Mutex
// Init initializes the PDFium library. Safe to call multiple times.
func Init() { initOnce.Do(func() { C.FPDF_InitLibrary() }) }
// PageSize returns the page dimensions in PDF points (1/72 inch) as seen
// after rotation. For a page with /Rotate 90 on A4, this returns ~842×595
// (swapped from the MediaBox 595×842). The call is cheap — it opens the
// document and page, reads dimensions, then closes.
func PageSize(pdfData []byte, pageIdx int) (width, height float64, err error) {
Init()
pdfiumMu.Lock()
defer pdfiumMu.Unlock()
_, _, pw, ph, closeAll, err := openPage(pdfData, pageIdx)
if err != nil {
return 0, 0, err
}
closeAll()
return pw, ph, nil
}
// RenderPage renders a single page of a PDF to an *image.RGBA at the given DPI.
// pdfData is the raw PDF bytes, pageIdx is 0-based.
func RenderPage(pdfData []byte, pageIdx int, dpi float64) (*image.RGBA, error) {
Init()
pdfiumMu.Lock()
defer pdfiumMu.Unlock()
_, page, pw, ph, closeAll, err := openPage(pdfData, pageIdx)
if err != nil {
return nil, err
}
defer closeAll()
scale := dpi / 72.0
pxW := int(math.Round(pw * scale))
pxH := int(math.Round(ph * scale))
bitmap := C.FPDFBitmap_Create(C.int(pxW), C.int(pxH), 1) // 1 = RGBA
if bitmap == nil {
return nil, fmt.Errorf("pdfium: FPDFBitmap_Create(%d,%d) returned nil", pxW, pxH)
}
defer C.FPDFBitmap_Destroy(bitmap)
// Fill with opaque white before rendering, so transparent areas
// (e.g. outside crop box) are white rather than undefined.
stride := int(C.FPDFBitmap_GetStride(bitmap))
buf := C.FPDFBitmap_GetBuffer(bitmap)
pixels := (*[1 << 30]byte)(unsafe.Pointer(buf))[: pxH*stride : pxH*stride]
for i := range pixels {
pixels[i] = 255
}
// FPDF_ANNOT (0x01) — render annotations.
// LCD text AA (0x02) is left off; default text smoothing is sufficient.
C.FPDF_RenderPageBitmap(bitmap, page, 0, 0, C.int(pxW), C.int(pxH), 0, 0x01)
// pdfium outputs BGRA; convert to RGBA.
img := image.NewRGBA(image.Rect(0, 0, pxW, pxH))
for y := 0; y < pxH; y++ {
for x := 0; x < pxW; x++ {
off := y*stride + x*4
img.SetRGBA(x, y, color.RGBA{
R: pixels[off+2], // B
G: pixels[off+1], // G
B: pixels[off], // R
A: 255,
})
}
}
return img, nil
}
// openPage opens a document and page, returning post-rotation dimensions
// and a cleanup function. Callers must call closeAll() to free resources.
func openPage(pdfData []byte, pageIdx int) (
doc C.FPDF_DOCUMENT,
page C.FPDF_PAGE,
pw, ph float64,
closeAll func(),
err error,
) {
cData := C.CBytes(pdfData)
doc = C.FPDF_LoadMemDocument(unsafe.Pointer(cData), C.int(len(pdfData)), nil)
if doc == nil {
C.free(cData)
err = fmt.Errorf("pdfium: FPDF_LoadMemDocument returned nil")
return
}
page = C.FPDF_LoadPage(doc, C.int(pageIdx))
if page == nil {
C.FPDF_CloseDocument(doc)
C.free(cData)
err = fmt.Errorf("pdfium: FPDF_LoadPage(%d) returned nil", pageIdx)
return
}
pw = float64(C.FPDF_GetPageWidth(page))
ph = float64(C.FPDF_GetPageHeight(page))
if pw <= 0 || ph <= 0 {
C.FPDF_ClosePage(page)
C.FPDF_CloseDocument(doc)
C.free(cData)
err = fmt.Errorf("pdfium: invalid page dimensions %.1fx%.1f", pw, ph)
return
}
closeAll = func() {
C.FPDF_ClosePage(page)
C.FPDF_CloseDocument(doc)
C.free(cData)
}
return
}