Fix: use .a to replace .so for pdfium/pdf_oxide/office_oxide (#16496)

### Summary

Use .a to replace .so for pdfium/pdf_oxide/office_oxide

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Jack
2026-07-01 21:21:31 +08:00
committed by GitHub
parent b0e6007131
commit 42a0faad18
5 changed files with 187 additions and 216 deletions

View File

@@ -234,10 +234,7 @@ jobs:
PKGS=$(go list ./... 2>/dev/null \
| grep -v '/internal/storage$' \
| grep -v '/internal/tokenizer$' \
| grep -v '/internal/handler$' \
| grep -v '/internal/deepdoc/parser/pdf/pdfium' \
| grep -v '/internal/deepdoc/parser/pdf/pdfoxide' \
| grep -v '/internal/deepdoc/parser/pdf' || true)
| grep -v '/internal/handler$' || true)
if [ -z "$PKGS" ]; then
./build.sh --test
else
@@ -680,10 +677,7 @@ jobs:
PKGS=$(go list ./... 2>/dev/null \
| grep -v '/internal/storage$' \
| grep -v '/internal/tokenizer$' \
| grep -v '/internal/handler$' \
| grep -v '/internal/deepdoc/parser/pdf/pdfium' \
| grep -v '/internal/deepdoc/parser/pdf/pdfoxide' \
| grep -v '/internal/deepdoc/parser/pdf' || true)
| grep -v '/internal/handler$' || true)
if [ -z "$PKGS" ]; then
./build.sh --test
else

324
build.sh
View File

@@ -22,17 +22,41 @@ RAGFLOW_CLI_BINARY="$PROJECT_ROOT/bin/ragflow-cli"
# Strip symbols from Go binaries (set via --strip / -s)
STRIP_SYMBOLS=""
# office_oxide native library settings
OFFICE_OXIDE_PREFIX="${HOME}/.office_oxide"
# Native static library settings. These are the user-cache paths (~/ragflow-native-libs/).
# If a dependency is missing from the cache but present under /opt/ragflow-native-libs/
# (pre-seeded in the CI runner image), it is copied into the cache, so no network
# download (download_deps.py) is needed.
SYSTEM_DEPS="/opt/ragflow-native-libs"
# office_oxide native library settings — static linking
OFFICE_OXIDE_PREFIX="${HOME}/ragflow-native-libs/office_oxide"
OFFICE_OXIDE_VERSION="0.1.2"
# pdfium native library settings (from pypdfium2_raw PyPI wheel)
PDFIUM_PREFIX="${HOME}/.pdfium"
PDFIUM_VERSION="0.5.0"
# pdfium native library settings — static linking (kognitos/pdfium-static)
PDFIUM_STATIC_PREFIX="${HOME}/ragflow-native-libs/pdfium-static"
PDFIUM_STATIC_VERSION="7809"
# pdf_oxide native library settings (from GitHub Release)
PDF_OXIDE_PREFIX="${HOME}/.pdf_oxide"
PDF_OXIDE_VERSION="0.3.63"
# pdf_oxide native library settings — static linking (go-ffi tarball)
PDF_OXIDE_PREFIX="${HOME}/ragflow-native-libs/pdf_oxide"
PDF_OXIDE_VERSION="0.3.67"
# Copy a dependency from the system pre-seed directory to the user cache.
# Globals:   HOME (read), SYSTEM_DEPS (read)
# Arguments: $1 - dependency directory name, e.g. "pdfium-static",
#                 "pdf_oxide", "office_oxide"
# Outputs:   one progress line on stdout when a copy is performed
# Returns:   0 if the dep was copied or already exists in cache, 1 otherwise
#            (empty name, no system copy available, or the copy failed).
_seed_from_system() {
  local dep_name="${1:-}"
  # An empty name would resolve to the cache root itself and be reported as
  # "already cached"; treat it as "nothing to seed" instead.
  [ -n "$dep_name" ] || return 1

  local dep_dir="${HOME}/ragflow-native-libs/${dep_name}"
  local sys_dir="${SYSTEM_DEPS}/${dep_name}"

  if [ -d "$dep_dir" ]; then
    return 0  # already cached
  fi
  if [ ! -d "$sys_dir" ]; then
    return 1
  fi

  echo "  ${dep_name} → ${sys_dir} (system)"
  mkdir -p "$(dirname "$dep_dir")" || return 1

  # Copy into a staging directory and rename it into place afterwards. The
  # cache check above only tests that the directory exists, so a failed or
  # interrupted copy straight into $dep_dir would be mistaken for a valid
  # cache entry on every later run.
  local staging="${dep_dir}.partial.$$"
  rm -rf -- "$staging"
  if cp -R -- "$sys_dir" "$staging" && mv -- "$staging" "$dep_dir"; then
    return 0
  fi
  rm -rf -- "$staging"
  return 1
}
echo -e "${GREEN}=== RAGFlow Go Server Build Script ===${NC}"
@@ -138,14 +162,9 @@ _download_and_extract() {
# Check / install office_oxide native library (Rust → C FFI library)
check_office_oxide_deps() {
print_section "Checking office_oxide native library"
_seed_from_system "office_oxide"
local lib_file header_path
case "$(uname -s)" in
Linux) lib_file="liboffice_oxide.so" ;;
Darwin) lib_file="liboffice_oxide.dylib" ;;
*) echo -e "${RED}Unsupported OS for office_oxide${NC}"; return 1 ;;
esac
local lib_file="liboffice_oxide.a"
local lib_path="${OFFICE_OXIDE_PREFIX}/lib/${lib_file}"
local header_path="${OFFICE_OXIDE_PREFIX}/include/office_oxide_c/office_oxide.h"
@@ -154,177 +173,65 @@ check_office_oxide_deps() {
return 0
fi
echo "office_oxide native library not found. Installing..."
# Map platform to the release asset name. Note: the GitHub release archives
# omit the version number from the native-* asset filenames.
local asset_name
case "$(uname -s)" in
Linux)
case "$(uname -m)" in
x86_64) asset_name="native-linux-x86_64" ;;
aarch64|arm64) asset_name="native-linux-aarch64" ;;
*) echo -e "${RED}Unsupported arch: $(uname -m)${NC}"; return 1 ;;
esac
;;
Darwin)
case "$(uname -m)" in
x86_64) asset_name="native-macos-x86_64" ;;
aarch64|arm64) asset_name="native-macos-aarch64" ;;
*) echo -e "${RED}Unsupported arch: $(uname -m)${NC}"; return 1 ;;
esac
;;
esac
local release_url="https://github.com/yfedoseev/office_oxide/releases/download/v${OFFICE_OXIDE_VERSION}/${asset_name}.tar.gz"
mkdir -p "${OFFICE_OXIDE_PREFIX}"
_download_and_extract "$release_url" "${OFFICE_OXIDE_PREFIX}"
if [ ! -f "$lib_path" ]; then
echo -e "${YELLOW}Warning: Failed to install office_oxide native library (missing ${lib_path})${NC}"
echo " Try: curl -fsSL ${release_url} | tar xzf - -C ${OFFICE_OXIDE_PREFIX}"
return 1
fi
echo -e "${GREEN}✓ office_oxide native library installed${NC}"
echo -e "${RED}Error: office_oxide native library not found${NC}"
echo " Expected: ${lib_path}"
echo " Run: uv run download_deps.py"
echo " Or manually download: https://github.com/yfedoseev/office_oxide/releases/download/v${OFFICE_OXIDE_VERSION}/native-linux-x86_64.tar.gz"
exit 1
}
# Check / install pdfium native library (libpdfium.so from pypdfium2_raw wheel).
# Check pdfium static library (must be pre-installed via download_deps.py or CI image).
check_pdfium_deps() {
# 1. Check .venv (uv sync provides pypdfium2_raw).
local venv_py="${PROJECT_ROOT}/.venv/bin/python3"
if [ -x "$venv_py" ]; then
local venv_so=$("$venv_py" -c "import pypdfium2_raw,os;print(os.path.join(os.path.dirname(pypdfium2_raw.__file__),'libpdfium.so'))" 2>/dev/null)
if [ -n "$venv_so" ] && [ -f "$venv_so" ]; then
echo " pdfium → ${venv_so} (.venv)"
export CGO_LDFLAGS="$CGO_LDFLAGS -L$(dirname "$venv_so") -Wl,-rpath,$(dirname "$venv_so")"
export LD_LIBRARY_PATH="$(dirname "$venv_so"):${LD_LIBRARY_PATH}"
return 0
fi
fi
_seed_from_system "pdfium-static"
local lib_path="${PDFIUM_STATIC_PREFIX}/lib/libpdfium.a"
# 2. Check cache.
local lib_path="${PDFIUM_PREFIX}/libpdfium.so"
if [ -f "$lib_path" ]; then
echo " pdfium ${PDFIUM_PREFIX}"
echo " pdfium (static)${PDFIUM_STATIC_PREFIX}"
return 0
fi
echo " pdfium not found, installing..."
# 3. Map platform to PyPI wheel platform tag.
local whl_platform
case "$(uname -s)" in
Linux)
case "$(uname -m)" in
x86_64) whl_platform="manylinux_2_17_x86_64.manylinux2014_x86_64" ;;
aarch64|arm64) whl_platform="manylinux_2_17_aarch64.manylinux2014_aarch64" ;;
*) echo " pdfium → unsupported arch"; return 1 ;;
esac
;;
Darwin)
case "$(uname -m)" in
x86_64) whl_platform="macosx_11_0_x86_64" ;;
arm64) whl_platform="macosx_11_0_arm64" ;;
*) echo " pdfium → unsupported arch"; return 1 ;;
esac
;;
*) echo " pdfium → unsupported OS"; return 1 ;;
esac
# 4. Download .whl from PyPI and extract libpdfium.so (zero pip dependency).
local whl_url
whl_url=$(curl -fsSL "https://pypi.org/pypi/pypdfium2_raw/${PDFIUM_VERSION}/json" 2>/dev/null \
| grep -o '"url":"[^"]*'${whl_platform}'[^"]*"' | head -1 | cut -d'"' -f4)
if [ -n "$whl_url" ] && { command -v curl >/dev/null 2>&1 || command -v wget >/dev/null 2>&1; }; then
local tmp_whl="$(mktemp)"
if command -v curl >/dev/null 2>&1; then
curl -fsSL "$whl_url" -o "$tmp_whl"
else
wget -q "$whl_url" -O "$tmp_whl"
fi
mkdir -p "${PDFIUM_PREFIX}"
# Wheel is a zip; extract libpdfium.so via python3 or unzip.
if command -v python3 >/dev/null 2>&1; then
python3 -c "
import zipfile, os, shutil
with zipfile.ZipFile('$tmp_whl') as z:
for n in z.namelist():
if n.endswith('libpdfium.so'):
z.extract(n, '${PDFIUM_PREFIX}')
os.rename(os.path.join('${PDFIUM_PREFIX}', n), '$lib_path')
# Remove empty pypdfium2_raw dir
d = os.path.join('${PDFIUM_PREFIX}', 'pypdfium2_raw')
if os.path.isdir(d): shutil.rmtree(d, ignore_errors=True)
break
" 2>/dev/null
elif command -v unzip >/dev/null 2>&1; then
unzip -q -o "$tmp_whl" -d "${PDFIUM_PREFIX}" 'pypdfium2_raw/libpdfium.so' 2>/dev/null
[ -f "${PDFIUM_PREFIX}/pypdfium2_raw/libpdfium.so" ] && mv "${PDFIUM_PREFIX}/pypdfium2_raw/libpdfium.so" "$lib_path"
rm -rf "${PDFIUM_PREFIX}/pypdfium2_raw"
fi
rm -f "$tmp_whl"
fi
if [ -f "$lib_path" ]; then
echo -e "${GREEN}✓ pdfium installed to ${PDFIUM_PREFIX}${NC}"
else
echo " pdfium → install failed (requires .venv, curl/wget + python3, or pre-cached ~/.pdfium)"
return 1
fi
echo " pdfium (static) not found"
echo " Expected: ${lib_path}"
echo " Run: uv run download_deps.py"
echo " Or: curl -fsSL https://github.com/kognitos/pdfium-static/releases/download/chromium%2F${PDFIUM_STATIC_VERSION}/pdfium-linux-x64-static.tgz | tar xz -C ${PDFIUM_STATIC_PREFIX}"
return 1
}
# Check / install pdf_oxide native library (Rust -> C FFI library).
# Check / install pdf_oxide static library (go-ffi tarball from GitHub Release).
check_pdf_oxide_deps() {
local lib_path="${PDF_OXIDE_PREFIX}/libpdf_oxide.so"
if [ -f "$lib_path" ]; then
echo " pdf_oxide → ${PDF_OXIDE_PREFIX} (shared)"
return 0
fi
# Also check for static library (user's local installation).
local static_path="${PDF_OXIDE_PREFIX}/libpdf_oxide.a"
if [ -f "$static_path" ]; then
echo " pdf_oxide → ${PDF_OXIDE_PREFIX} (static)"
return 0
fi
echo " pdf_oxide not found, installing..."
# Map platform to the release asset name.
local asset_name
_seed_from_system "pdf_oxide"
# Map platform to tarball-internal subdirectory.
local platform_subdir
case "$(uname -s)" in
Linux)
case "$(uname -m)" in
x86_64) asset_name="libpdf_oxide-v${PDF_OXIDE_VERSION}-linux-x86_64" ;;
aarch64|arm64) asset_name="libpdf_oxide-v${PDF_OXIDE_VERSION}-linux-aarch64" ;;
*) echo " pdf_oxide → unsupported arch"; return 1 ;;
x86_64) platform_subdir="linux_amd64" ;;
aarch64|arm64) platform_subdir="linux_arm64" ;;
*) echo " pdf_oxide (static) → unsupported arch"; return 1 ;;
esac
;;
Darwin)
case "$(uname -m)" in
x86_64) asset_name="libpdf_oxide-v${PDF_OXIDE_VERSION}-darwin-x86_64" ;;
arm64) asset_name="libpdf_oxide-v${PDF_OXIDE_VERSION}-darwin-arm64" ;;
*) echo " pdf_oxide → unsupported arch"; return 1 ;;
x86_64) platform_subdir="darwin_amd64" ;;
arm64) platform_subdir="darwin_arm64" ;;
*) echo " pdf_oxide (static) → unsupported arch"; return 1 ;;
esac
;;
*) echo " pdf_oxide → unsupported OS"; return 1 ;;
*) echo " pdf_oxide (static) → unsupported OS"; return 1 ;;
esac
local release_url="https://github.com/yfedoseev/pdf_oxide/releases/download/v${PDF_OXIDE_VERSION}/${asset_name}.tar.gz"
mkdir -p "${PDF_OXIDE_PREFIX}"
_download_and_extract "$release_url" "${PDF_OXIDE_PREFIX}"
local lib_path="${PDF_OXIDE_PREFIX}/lib/${platform_subdir}/libpdf_oxide.a"
if [ -f "$lib_path" ]; then
echo -e "${GREEN} pdf_oxide installed to ${PDF_OXIDE_PREFIX}${NC}"
else
echo " pdf_oxide → install failed"
return 1
echo " pdf_oxide (static) → ${PDF_OXIDE_PREFIX}"
return 0
fi
echo " pdf_oxide (static) not found"
echo " Expected: ${lib_path}"
echo " Run: uv run download_deps.py"
echo " Or: curl -fsSL https://github.com/yfedoseev/pdf_oxide/releases/download/v${PDF_OXIDE_VERSION}/pdf_oxide-go-ffi-linux-amd64.tar.gz | tar xz -C ${PDF_OXIDE_PREFIX}"
return 1
}
# Build C++ static library
@@ -405,7 +312,6 @@ build_go() {
eval "$install_cmd"
fi
check_office_oxide_deps || true
setup_cgo_env
local strip_flags=()
@@ -446,44 +352,70 @@ build_go() {
echo -e "${GREEN}✓ Go ingestor built successfully: $INGESTOR_BINARY${NC}"
}
# Configure CGO flags for native libraries.
# setup_cgo_env — base: -I and -L paths only, no -l flags (those live in
# each package's own #cgo LDFLAGS pragma). Safe to call even when native
# libs are absent — just skips the paths that don't exist.
# setup_cgo_env_pdf — pdfium / pdf_oxide -L paths. Non-fatal when libs
# are missing. Only called by run_go_tests.
# Configure CGO flags for native libraries (office_oxide, pdfium, pdf_oxide).
# All three are statically linked — no LD_LIBRARY_PATH or -Wl,-rpath needed.
setup_cgo_env() {
# ── office_oxide (header + search path only, no -loffice_oxide) ───
if [ -f "${OFFICE_OXIDE_PREFIX}/include/office_oxide_c/office_oxide.h" ]; then
export CGO_CFLAGS="-I${OFFICE_OXIDE_PREFIX}/include/office_oxide_c${CGO_CFLAGS:+ $CGO_CFLAGS}"
fi
if [ -f "${OFFICE_OXIDE_PREFIX}/lib/liboffice_oxide.so" ] || [ -f "${OFFICE_OXIDE_PREFIX}/lib/liboffice_oxide.dylib" ]; then
export CGO_LDFLAGS="-L${OFFICE_OXIDE_PREFIX}/lib${CGO_LDFLAGS:+ $CGO_LDFLAGS}"
export LD_LIBRARY_PATH="${OFFICE_OXIDE_PREFIX}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
fi
# ── office_oxide ──────────────────────────────────────────────────
check_office_oxide_deps
export CGO_CFLAGS="-I${OFFICE_OXIDE_PREFIX}/include/office_oxide_c${CGO_CFLAGS:+ $CGO_CFLAGS}"
export CGO_LDFLAGS="${OFFICE_OXIDE_PREFIX}/lib/liboffice_oxide.a"
echo "CGO_CFLAGS: $CGO_CFLAGS"
echo "CGO_LDFLAGS: $CGO_LDFLAGS"
}
setup_cgo_env_pdf() {
# ── pdfium ────────────────────────────────────────────────────────
check_pdfium_deps || true
if [ -f "${PDFIUM_PREFIX}/libpdfium.so" ]; then
export CGO_LDFLAGS="$CGO_LDFLAGS -L${PDFIUM_PREFIX}"
export LD_LIBRARY_PATH="${PDFIUM_PREFIX}:${LD_LIBRARY_PATH}"
check_pdfium_deps || return 1
export CGO_LDFLAGS="$CGO_LDFLAGS ${PDFIUM_STATIC_PREFIX}/lib/libpdfium.a"
# Linux: Chromium-built objects use Clang's .eh_frame format which GNU ld
# cannot merge. Use lld (LLVM linker) which handles them correctly.
# --allow-multiple-definition: pdf_oxide and office_oxide are both Rust
# staticlibs that embed the Rust runtime; linking them together produces
# duplicate rust_eh_personality symbols.
if [ "$(uname -s)" = "Linux" ]; then
if ! command -v ld.lld >/dev/null 2>&1; then
echo -e "${RED}Error: ld.lld not found. Install with: sudo apt install lld-20${NC}"
echo " lld is required to static-link Chromium-built pdfium (.eh_frame format)"
return 1
fi
export CGO_LDFLAGS="$CGO_LDFLAGS \
${PDFIUM_STATIC_PREFIX}/lib/libc++.a \
${PDFIUM_STATIC_PREFIX}/lib/libc++abi.a \
-fuse-ld=lld -Wl,--allow-multiple-definition"
fi
# ── pdf_oxide ─────────────────────────────────────────────────────
check_pdf_oxide_deps || true
if [ -f "${PDF_OXIDE_PREFIX}/libpdf_oxide.so" ]; then
export CGO_LDFLAGS="$CGO_LDFLAGS -L${PDF_OXIDE_PREFIX}"
export LD_LIBRARY_PATH="${PDF_OXIDE_PREFIX}:${LD_LIBRARY_PATH}"
elif [ -f "${PDF_OXIDE_PREFIX}/libpdf_oxide.a" ]; then
export CGO_LDFLAGS="$CGO_LDFLAGS ${PDF_OXIDE_PREFIX}/libpdf_oxide.a"
fi
check_pdf_oxide_deps || return 1
# The go-ffi tarball places the .a under lib/<platform_subdir>/.
local pdf_oxide_subdir
case "$(uname -s)" in
Linux)
case "$(uname -m)" in
x86_64) pdf_oxide_subdir="linux_amd64" ;;
aarch64|arm64) pdf_oxide_subdir="linux_arm64" ;;
*) echo "pdf_oxide: unsupported arch"; return 1 ;;
esac
;;
Darwin)
case "$(uname -m)" in
x86_64) pdf_oxide_subdir="darwin_amd64" ;;
arm64) pdf_oxide_subdir="darwin_arm64" ;;
*) echo "pdf_oxide: unsupported arch"; return 1 ;;
esac
;;
esac
export CGO_LDFLAGS="$CGO_LDFLAGS ${PDF_OXIDE_PREFIX}/lib/${pdf_oxide_subdir}/libpdf_oxide.a"
echo "CGO_LDFLAGS (with PDF): $CGO_LDFLAGS"
# ── platform-specific system libraries ────────────────────────────
case "$(uname -s)" in
Linux)
export CGO_LDFLAGS="$CGO_LDFLAGS -lm -lpthread -ldl -lrt -lgcc_s -lutil -lc"
;;
Darwin)
export CGO_LDFLAGS="$CGO_LDFLAGS \
-framework CoreFoundation -framework Security \
-framework SystemConfiguration -liconv -lresolv"
;;
esac
echo "CGO_CFLAGS: $CGO_CFLAGS"
echo "CGO_LDFLAGS: $CGO_LDFLAGS"
}
# Run Go unit tests with the same CGO env as `build_go`. Pass any extra args
@@ -492,9 +424,7 @@ run_go_tests() {
print_section "Running Go tests"
cd "$PROJECT_ROOT"
check_office_oxide_deps || true
setup_cgo_env
setup_cgo_env_pdf
if [ "$#" -eq 0 ]; then
set -- ./...
@@ -534,10 +464,6 @@ run() {
cd "$PROJECT_ROOT"
# Set LD_LIBRARY_PATH for native libraries that were linked at build time.
# Libraries are only in the search path when they were present during build.
setup_cgo_env
# admin_server must be running before ragflow_server, otherwise ragflow_server's
# heartbeats to admin will error out (see internal/development.md).
print_section "Starting admin server (background)"

View File

@@ -1,13 +1,11 @@
//go:build cgo
// Package pdfium renders PDF pages using the system's libpdfium.so
// (bundled with pypdfium2). It exists solely to replace pdf_oxide's
// Package pdfium renders PDF pages using libpdfium (statically linked
// at build time via CGO_LDFLAGS). It exists solely to replace pdf_oxide's
// RenderPageRaw for use cases where image quality matters for downstream
// OCR/DLA — pdf_oxide still handles all text/char/table extraction.
package pdfium
/*
#cgo LDFLAGS: -lpdfium -lm -lpthread -ldl
#cgo LDFLAGS: -lm -lpthread -ldl
#include <stdint.h>
#include <stdlib.h>

View File

@@ -27,12 +27,24 @@ docker compose -f docker/docker-compose-base.yml up -d
./build.sh -s --go
```
> **Note**: If you use IDEs like GoLand to run/debug directly (via Run/Debug buttons), or run `go build` / `go run` from command line, you must set the following two CGO environment variables in your run configuration or shell:
> **Note**: If you use IDEs like GoLand to run/debug directly (via Run/Debug buttons), or run `go build` / `go run` from command line, run `uv run download_deps.py` first to download and extract the native static libraries into `~/ragflow-native-libs` (`build.sh` only checks for them; it does not download them). Then set the following CGO environment variables in your run configuration or shell:
>
> ```bash
> export CGO_CFLAGS="-I${HOME}/.office_oxide/include/office_oxide_c"
> export CGO_LDFLAGS="-L${HOME}/.office_oxide/lib -loffice_oxide -Wl,-rpath,${HOME}/.office_oxide/lib"
> RAGFLOW_DEPS="${HOME}/ragflow-native-libs"
> PLATFORM="linux_amd64" # or darwin_amd64, linux_arm64, darwin_arm64
>
> export CGO_CFLAGS="-I${RAGFLOW_DEPS}/office_oxide/include/office_oxide_c"
> export CGO_LDFLAGS="\
> ${RAGFLOW_DEPS}/office_oxide/lib/liboffice_oxide.a \
> ${RAGFLOW_DEPS}/pdfium-static/lib/libpdfium.a \
> ${RAGFLOW_DEPS}/pdfium-static/lib/libc++.a \
> ${RAGFLOW_DEPS}/pdfium-static/lib/libc++abi.a \
> ${RAGFLOW_DEPS}/pdf_oxide/lib/${PLATFORM}/libpdf_oxide.a \
> -fuse-ld=lld -Wl,--allow-multiple-definition \
> -lm -lpthread -ldl -lrt -lgcc_s -lutil -lc"
> ```
>
> All three native libraries are statically linked — no `LD_LIBRARY_PATH` or `-Wl,-rpath` needed.
## 3. Run Go Version RAGFlow
Note: admin_server must be started first; otherwise, ragflow_server will encounter errors when sending heartbeats.
@@ -85,7 +97,7 @@ Type \? for help, \q to quit
RAGFlow(api/default)> REGISTER USER 'aaa@aaa.com' AS 'aaa' PASSWORD 'aaa';
Register successfully
RAGFlow(api/default)> login user 'aaa@aaa.com';
password for aaa@aaa.com: Password:
password for aaa@aaa.com: Password:
Login user aaa@aaa.com successfully
RAGFlow(api/default)> logout;
SUCCESS

View File

@@ -67,6 +67,15 @@ def get_urls(use_china_mirrors=False) -> list[Union[str, list[str]]]:
# compatibility contract.
"https://github.com/browserbase/stagehand/releases/download/stagehand-server-v3/v3.7.2/stagehand-server-v3-linux-x64",
"https://github.com/browserbase/stagehand/releases/download/stagehand-server-v3/v3.7.2/stagehand-server-v3-linux-arm64",
# Native static libraries for Go build (pdfium, pdf_oxide, office_oxide)
# Used by build.sh's check_*_deps functions — pre-downloaded to avoid
# network access during CI.
["https://github.com/kognitos/pdfium-static/releases/download/chromium%2F7809/pdfium-linux-x64-static.tgz",
"pdfium-linux-x64-static.tgz"],
["https://github.com/yfedoseev/pdf_oxide/releases/download/v0.3.67/pdf_oxide-go-ffi-linux-amd64.tar.gz",
"pdf_oxide-go-ffi-linux-amd64.tar.gz"],
["https://github.com/yfedoseev/office_oxide/releases/download/v0.1.2/native-linux-x86_64.tar.gz",
"office_oxide-linux-x86_64.tar.gz"],
]
else:
return [
@@ -95,6 +104,15 @@ def get_urls(use_china_mirrors=False) -> list[Union[str, list[str]]]:
# compatibility contract.
"https://github.com/browserbase/stagehand/releases/download/stagehand-server-v3/v3.7.2/stagehand-server-v3-linux-x64",
"https://github.com/browserbase/stagehand/releases/download/stagehand-server-v3/v3.7.2/stagehand-server-v3-linux-arm64",
# Native static libraries for Go build (pdfium, pdf_oxide, office_oxide)
# Used by build.sh's check_*_deps functions — pre-downloaded to avoid
# network access during CI.
["https://github.com/kognitos/pdfium-static/releases/download/chromium%2F7809/pdfium-linux-x64-static.tgz",
"pdfium-linux-x64-static.tgz"],
["https://github.com/yfedoseev/pdf_oxide/releases/download/v0.3.67/pdf_oxide-go-ffi-linux-amd64.tar.gz",
"pdf_oxide-go-ffi-linux-amd64.tar.gz"],
["https://github.com/yfedoseev/office_oxide/releases/download/v0.1.2/native-linux-x86_64.tar.gz",
"office_oxide-linux-x86_64.tar.gz"],
]
@@ -136,6 +154,29 @@ if __name__ == "__main__":
if not os.path.exists(filename):
urllib.request.urlretrieve(download_url, filename)
# Extract native static libraries to ~/ragflow-native-libs for Go build.
# Ensures build.sh can find them without network access.
native_deps_dir = os.path.expanduser("~/ragflow-native-libs")
extractions = [
("pdfium-linux-x64-static.tgz", "pdfium-static"),
("pdf_oxide-go-ffi-linux-amd64.tar.gz", "pdf_oxide"),
("office_oxide-linux-x86_64.tar.gz", "office_oxide"),
]
import tarfile
for archive, subdir in extractions:
archive_path = os.path.join(os.getcwd(), archive)
if not os.path.isfile(archive_path):
print(f" Skipping extraction: {archive} not found")
continue
target = os.path.join(native_deps_dir, subdir)
if os.path.isdir(target):
print(f"{subdir} already extracted to {target}")
continue
os.makedirs(target, exist_ok=True)
print(f" Extracting {archive}{target}")
with tarfile.open(archive_path) as tf:
tf.extractall(target)
local_dir = os.path.abspath("nltk_data")
for data in ["wordnet", "punkt", "punkt_tab"]:
print(f"Downloading nltk {data}...")