diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 42ebb779f4..f5fbaa2f7d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -234,10 +234,7 @@ jobs: PKGS=$(go list ./... 2>/dev/null \ | grep -v '/internal/storage$' \ | grep -v '/internal/tokenizer$' \ - | grep -v '/internal/handler$' \ - | grep -v '/internal/deepdoc/parser/pdf/pdfium' \ - | grep -v '/internal/deepdoc/parser/pdf/pdfoxide' \ - | grep -v '/internal/deepdoc/parser/pdf' || true) + | grep -v '/internal/handler$' || true) if [ -z "$PKGS" ]; then ./build.sh --test else @@ -680,10 +677,7 @@ jobs: PKGS=$(go list ./... 2>/dev/null \ | grep -v '/internal/storage$' \ | grep -v '/internal/tokenizer$' \ - | grep -v '/internal/handler$' \ - | grep -v '/internal/deepdoc/parser/pdf/pdfium' \ - | grep -v '/internal/deepdoc/parser/pdf/pdfoxide' \ - | grep -v '/internal/deepdoc/parser/pdf' || true) + | grep -v '/internal/handler$' || true) if [ -z "$PKGS" ]; then ./build.sh --test else diff --git a/build.sh b/build.sh index e13fb23a1f..46a1f8003b 100755 --- a/build.sh +++ b/build.sh @@ -22,17 +22,41 @@ RAGFLOW_CLI_BINARY="$PROJECT_ROOT/bin/ragflow-cli" # Strip symbols from Go binaries (set via --strip / -s) STRIP_SYMBOLS="" -# office_oxide native library settings -OFFICE_OXIDE_PREFIX="${HOME}/.office_oxide" +# Native static library settings. These are the user-cache paths (~/ragflow-native-libs/). +# If /opt/ragflow-native-libs/ exists (pre-seeded in CI runner image), it takes priority +# and skips the network (download_deps.py) fallback. +SYSTEM_DEPS="/opt/ragflow-native-libs" + +# office_oxide native library settings — static linking +OFFICE_OXIDE_PREFIX="${HOME}/ragflow-native-libs/office_oxide" OFFICE_OXIDE_VERSION="0.1.2" -# pdfium native library settings (from pypdfium2_raw PyPI wheel) -PDFIUM_PREFIX="${HOME}/.pdfium" -PDFIUM_VERSION="0.5.0" +# pdfium native library settings — static linking (kognitos/pdfium-static) +PDFIUM_STATIC_PREFIX="${HOME}/ragflow-native-libs/pdfium-static" +PDFIUM_STATIC_VERSION="7809" -# pdf_oxide native library settings (from GitHub Release) -PDF_OXIDE_PREFIX="${HOME}/.pdf_oxide" -PDF_OXIDE_VERSION="0.3.63" +# pdf_oxide native library settings — static linking (go-ffi tarball) +PDF_OXIDE_PREFIX="${HOME}/ragflow-native-libs/pdf_oxide" +PDF_OXIDE_VERSION="0.3.67" + +# Copy a dependency from the system pre-seed directory to the user cache. +# Returns 0 if the dep was copied or already exists in cache, 1 otherwise. +_seed_from_system() { + local dep_name="$1" # e.g. "pdfium-static", "pdf_oxide", "office_oxide" + local dep_dir="${HOME}/ragflow-native-libs/${dep_name}" + local sys_dir="${SYSTEM_DEPS}/${dep_name}" + + if [ -d "$dep_dir" ]; then + return 0 # already cached + fi + if [ -d "$sys_dir" ]; then + echo " ${dep_name} → ${sys_dir} (system)" + mkdir -p "$(dirname "$dep_dir")" + cp -r "$sys_dir" "$dep_dir" + return 0 + fi + return 1 +} echo -e "${GREEN}=== RAGFlow Go Server Build Script ===${NC}" @@ -138,14 +162,9 @@ _download_and_extract() { # Check / install office_oxide native library (Rust → C FFI library) check_office_oxide_deps() { print_section "Checking office_oxide native library" + _seed_from_system "office_oxide" - local lib_file header_path - case "$(uname -s)" in - Linux) lib_file="liboffice_oxide.so" ;; - Darwin) lib_file="liboffice_oxide.dylib" ;; - *) echo -e "${RED}Unsupported OS for office_oxide${NC}"; return 1 ;; - esac - + local lib_file="liboffice_oxide.a" local lib_path="${OFFICE_OXIDE_PREFIX}/lib/${lib_file}" local header_path="${OFFICE_OXIDE_PREFIX}/include/office_oxide_c/office_oxide.h" @@ -154,177 +173,65 @@ check_office_oxide_deps() { return 0 fi - echo "office_oxide native library not found. Installing..." - - # Map platform to the release asset name. Note: the GitHub release archives - # omit the version number from the native-* asset filenames. - local asset_name - case "$(uname -s)" in - Linux) - case "$(uname -m)" in - x86_64) asset_name="native-linux-x86_64" ;; - aarch64|arm64) asset_name="native-linux-aarch64" ;; - *) echo -e "${RED}Unsupported arch: $(uname -m)${NC}"; return 1 ;; - esac - ;; - Darwin) - case "$(uname -m)" in - x86_64) asset_name="native-macos-x86_64" ;; - aarch64|arm64) asset_name="native-macos-aarch64" ;; - *) echo -e "${RED}Unsupported arch: $(uname -m)${NC}"; return 1 ;; - esac - ;; - esac - - local release_url="https://github.com/yfedoseev/office_oxide/releases/download/v${OFFICE_OXIDE_VERSION}/${asset_name}.tar.gz" - - mkdir -p "${OFFICE_OXIDE_PREFIX}" - _download_and_extract "$release_url" "${OFFICE_OXIDE_PREFIX}" - - if [ ! -f "$lib_path" ]; then - echo -e "${YELLOW}Warning: Failed to install office_oxide native library (missing ${lib_path})${NC}" - echo " Try: curl -fsSL ${release_url} | tar xzf - -C ${OFFICE_OXIDE_PREFIX}" - return 1 - fi - - echo -e "${GREEN}✓ office_oxide native library installed${NC}" + echo -e "${RED}Error: office_oxide native library not found${NC}" + echo " Expected: ${lib_path}" + echo " Run: uv run download_deps.py" + echo " Or manually download: https://github.com/yfedoseev/office_oxide/releases/download/v${OFFICE_OXIDE_VERSION}/native-linux-x86_64.tar.gz" + exit 1 } -# Check / install pdfium native library (libpdfium.so from pypdfium2_raw wheel). +# Check pdfium static library (must be pre-installed via download_deps.py or CI image). check_pdfium_deps() { - # 1. Check .venv (uv sync provides pypdfium2_raw). - local venv_py="${PROJECT_ROOT}/.venv/bin/python3" - if [ -x "$venv_py" ]; then - local venv_so=$("$venv_py" -c "import pypdfium2_raw,os;print(os.path.join(os.path.dirname(pypdfium2_raw.__file__),'libpdfium.so'))" 2>/dev/null) - if [ -n "$venv_so" ] && [ -f "$venv_so" ]; then - echo " pdfium → ${venv_so} (.venv)" - export CGO_LDFLAGS="$CGO_LDFLAGS -L$(dirname "$venv_so") -Wl,-rpath,$(dirname "$venv_so")" - export LD_LIBRARY_PATH="$(dirname "$venv_so"):${LD_LIBRARY_PATH}" - return 0 - fi - fi + _seed_from_system "pdfium-static" + local lib_path="${PDFIUM_STATIC_PREFIX}/lib/libpdfium.a" - # 2. Check cache. - local lib_path="${PDFIUM_PREFIX}/libpdfium.so" if [ -f "$lib_path" ]; then - echo " pdfium → ${PDFIUM_PREFIX}" + echo " pdfium (static) → ${PDFIUM_STATIC_PREFIX}" return 0 fi - echo " pdfium not found, installing..." - - # 3. Map platform to PyPI wheel platform tag. - local whl_platform - case "$(uname -s)" in - Linux) - case "$(uname -m)" in - x86_64) whl_platform="manylinux_2_17_x86_64.manylinux2014_x86_64" ;; - aarch64|arm64) whl_platform="manylinux_2_17_aarch64.manylinux2014_aarch64" ;; - *) echo " pdfium → unsupported arch"; return 1 ;; - esac - ;; - Darwin) - case "$(uname -m)" in - x86_64) whl_platform="macosx_11_0_x86_64" ;; - arm64) whl_platform="macosx_11_0_arm64" ;; - *) echo " pdfium → unsupported arch"; return 1 ;; - esac - ;; - *) echo " pdfium → unsupported OS"; return 1 ;; - esac - - # 4. Download .whl from PyPI and extract libpdfium.so (zero pip dependency). - local whl_url - whl_url=$(curl -fsSL "https://pypi.org/pypi/pypdfium2_raw/${PDFIUM_VERSION}/json" 2>/dev/null \ - | grep -o '"url":"[^"]*'${whl_platform}'[^"]*"' | head -1 | cut -d'"' -f4) - - if [ -n "$whl_url" ] && { command -v curl >/dev/null 2>&1 || command -v wget >/dev/null 2>&1; }; then - local tmp_whl="$(mktemp)" - if command -v curl >/dev/null 2>&1; then - curl -fsSL "$whl_url" -o "$tmp_whl" - else - wget -q "$whl_url" -O "$tmp_whl" - fi - mkdir -p "${PDFIUM_PREFIX}" - # Wheel is a zip; extract libpdfium.so via python3 or unzip. - if command -v python3 >/dev/null 2>&1; then - python3 -c " -import zipfile, os, shutil -with zipfile.ZipFile('$tmp_whl') as z: - for n in z.namelist(): - if n.endswith('libpdfium.so'): - z.extract(n, '${PDFIUM_PREFIX}') - os.rename(os.path.join('${PDFIUM_PREFIX}', n), '$lib_path') - # Remove empty pypdfium2_raw dir - d = os.path.join('${PDFIUM_PREFIX}', 'pypdfium2_raw') - if os.path.isdir(d): shutil.rmtree(d, ignore_errors=True) - break -" 2>/dev/null - elif command -v unzip >/dev/null 2>&1; then - unzip -q -o "$tmp_whl" -d "${PDFIUM_PREFIX}" 'pypdfium2_raw/libpdfium.so' 2>/dev/null - [ -f "${PDFIUM_PREFIX}/pypdfium2_raw/libpdfium.so" ] && mv "${PDFIUM_PREFIX}/pypdfium2_raw/libpdfium.so" "$lib_path" - rm -rf "${PDFIUM_PREFIX}/pypdfium2_raw" - fi - rm -f "$tmp_whl" - fi - - if [ -f "$lib_path" ]; then - echo -e "${GREEN}✓ pdfium installed to ${PDFIUM_PREFIX}${NC}" - else - echo " pdfium → install failed (requires .venv, curl/wget + python3, or pre-cached ~/.pdfium)" - return 1 - fi + echo " pdfium (static) not found" + echo " Expected: ${lib_path}" + echo " Run: uv run download_deps.py" + echo " Or: curl -fsSL https://github.com/kognitos/pdfium-static/releases/download/chromium%2F${PDFIUM_STATIC_VERSION}/pdfium-linux-x64-static.tgz | tar xz -C ${PDFIUM_STATIC_PREFIX}" + return 1 } -# Check / install pdf_oxide native library (Rust -> C FFI library). +# Check / install pdf_oxide static library (go-ffi tarball from GitHub Release). check_pdf_oxide_deps() { - local lib_path="${PDF_OXIDE_PREFIX}/libpdf_oxide.so" - - if [ -f "$lib_path" ]; then - echo " pdf_oxide → ${PDF_OXIDE_PREFIX} (shared)" - return 0 - fi - - # Also check for static library (user's local installation). - local static_path="${PDF_OXIDE_PREFIX}/libpdf_oxide.a" - if [ -f "$static_path" ]; then - echo " pdf_oxide → ${PDF_OXIDE_PREFIX} (static)" - return 0 - fi - - echo " pdf_oxide not found, installing..." - - # Map platform to the release asset name. - local asset_name + _seed_from_system "pdf_oxide" + # Map platform to tarball-internal subdirectory. + local platform_subdir case "$(uname -s)" in Linux) case "$(uname -m)" in - x86_64) asset_name="libpdf_oxide-v${PDF_OXIDE_VERSION}-linux-x86_64" ;; - aarch64|arm64) asset_name="libpdf_oxide-v${PDF_OXIDE_VERSION}-linux-aarch64" ;; - *) echo " pdf_oxide → unsupported arch"; return 1 ;; + x86_64) platform_subdir="linux_amd64" ;; + aarch64|arm64) platform_subdir="linux_arm64" ;; + *) echo " pdf_oxide (static) → unsupported arch"; return 1 ;; esac ;; Darwin) case "$(uname -m)" in - x86_64) asset_name="libpdf_oxide-v${PDF_OXIDE_VERSION}-darwin-x86_64" ;; - arm64) asset_name="libpdf_oxide-v${PDF_OXIDE_VERSION}-darwin-arm64" ;; - *) echo " pdf_oxide → unsupported arch"; return 1 ;; + x86_64) platform_subdir="darwin_amd64" ;; + arm64) platform_subdir="darwin_arm64" ;; + *) echo " pdf_oxide (static) → unsupported arch"; return 1 ;; esac ;; - *) echo " pdf_oxide → unsupported OS"; return 1 ;; + *) echo " pdf_oxide (static) → unsupported OS"; return 1 ;; esac - local release_url="https://github.com/yfedoseev/pdf_oxide/releases/download/v${PDF_OXIDE_VERSION}/${asset_name}.tar.gz" - - mkdir -p "${PDF_OXIDE_PREFIX}" - _download_and_extract "$release_url" "${PDF_OXIDE_PREFIX}" + local lib_path="${PDF_OXIDE_PREFIX}/lib/${platform_subdir}/libpdf_oxide.a" if [ -f "$lib_path" ]; then - echo -e "${GREEN}✓ pdf_oxide installed to ${PDF_OXIDE_PREFIX}${NC}" - else - echo " pdf_oxide → install failed" - return 1 + echo " pdf_oxide (static) → ${PDF_OXIDE_PREFIX}" + return 0 fi + + echo " pdf_oxide (static) not found" + echo " Expected: ${lib_path}" + echo " Run: uv run download_deps.py" + echo " Or: curl -fsSL https://github.com/yfedoseev/pdf_oxide/releases/download/v${PDF_OXIDE_VERSION}/pdf_oxide-go-ffi-linux-amd64.tar.gz | tar xz -C ${PDF_OXIDE_PREFIX}" + return 1 } # Build C++ static library @@ -405,7 +312,6 @@ build_go() { eval "$install_cmd" fi - check_office_oxide_deps || true setup_cgo_env local strip_flags=() @@ -446,44 +352,70 @@ build_go() { echo -e "${GREEN}✓ Go ingestor built successfully: $INGESTOR_BINARY${NC}" } -# Configure CGO flags for native libraries. -# setup_cgo_env — base: -I and -L paths only, no -l flags (those live in -# each package's own #cgo LDFLAGS pragma). Safe to call even when native -# libs are absent — just skips the paths that don't exist. -# setup_cgo_env_pdf — pdfium / pdf_oxide -L paths. Non-fatal when libs -# are missing. Only called by run_go_tests. +# Configure CGO flags for native libraries (office_oxide, pdfium, pdf_oxide). +# All three are statically linked — no LD_LIBRARY_PATH or -Wl,-rpath needed. setup_cgo_env() { - # ── office_oxide (header + search path only, no -loffice_oxide) ─── - if [ -f "${OFFICE_OXIDE_PREFIX}/include/office_oxide_c/office_oxide.h" ]; then - export CGO_CFLAGS="-I${OFFICE_OXIDE_PREFIX}/include/office_oxide_c${CGO_CFLAGS:+ $CGO_CFLAGS}" - fi - if [ -f "${OFFICE_OXIDE_PREFIX}/lib/liboffice_oxide.so" ] || [ -f "${OFFICE_OXIDE_PREFIX}/lib/liboffice_oxide.dylib" ]; then - export CGO_LDFLAGS="-L${OFFICE_OXIDE_PREFIX}/lib${CGO_LDFLAGS:+ $CGO_LDFLAGS}" - export LD_LIBRARY_PATH="${OFFICE_OXIDE_PREFIX}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" - fi + # ── office_oxide ────────────────────────────────────────────────── + check_office_oxide_deps + export CGO_CFLAGS="-I${OFFICE_OXIDE_PREFIX}/include/office_oxide_c${CGO_CFLAGS:+ $CGO_CFLAGS}" + export CGO_LDFLAGS="${OFFICE_OXIDE_PREFIX}/lib/liboffice_oxide.a" - echo "CGO_CFLAGS: $CGO_CFLAGS" - echo "CGO_LDFLAGS: $CGO_LDFLAGS" -} - -setup_cgo_env_pdf() { # ── pdfium ──────────────────────────────────────────────────────── - check_pdfium_deps || true - if [ -f "${PDFIUM_PREFIX}/libpdfium.so" ]; then - export CGO_LDFLAGS="$CGO_LDFLAGS -L${PDFIUM_PREFIX}" - export LD_LIBRARY_PATH="${PDFIUM_PREFIX}:${LD_LIBRARY_PATH}" + check_pdfium_deps || return 1 + export CGO_LDFLAGS="$CGO_LDFLAGS ${PDFIUM_STATIC_PREFIX}/lib/libpdfium.a" + # Linux: Chromium-built objects use Clang's .eh_frame format which GNU ld + # cannot merge. Use lld (LLVM linker) which handles them correctly. + # --allow-multiple-definition: pdf_oxide and office_oxide are both Rust + # staticlibs that embed the Rust runtime; linking them together produces + # duplicate rust_eh_personality symbols. + if [ "$(uname -s)" = "Linux" ]; then + if ! command -v ld.lld >/dev/null 2>&1; then + echo -e "${RED}Error: ld.lld not found. Install with: sudo apt install lld-20${NC}" + echo " lld is required to static-link Chromium-built pdfium (.eh_frame format)" + return 1 + fi + export CGO_LDFLAGS="$CGO_LDFLAGS \ + ${PDFIUM_STATIC_PREFIX}/lib/libc++.a \ + ${PDFIUM_STATIC_PREFIX}/lib/libc++abi.a \ + -fuse-ld=lld -Wl,--allow-multiple-definition" fi # ── pdf_oxide ───────────────────────────────────────────────────── - check_pdf_oxide_deps || true - if [ -f "${PDF_OXIDE_PREFIX}/libpdf_oxide.so" ]; then - export CGO_LDFLAGS="$CGO_LDFLAGS -L${PDF_OXIDE_PREFIX}" - export LD_LIBRARY_PATH="${PDF_OXIDE_PREFIX}:${LD_LIBRARY_PATH}" - elif [ -f "${PDF_OXIDE_PREFIX}/libpdf_oxide.a" ]; then - export CGO_LDFLAGS="$CGO_LDFLAGS ${PDF_OXIDE_PREFIX}/libpdf_oxide.a" - fi + check_pdf_oxide_deps || return 1 + # The go-ffi tarball places the .a under lib//. + local pdf_oxide_subdir + case "$(uname -s)" in + Linux) + case "$(uname -m)" in + x86_64) pdf_oxide_subdir="linux_amd64" ;; + aarch64|arm64) pdf_oxide_subdir="linux_arm64" ;; + *) echo "pdf_oxide: unsupported arch"; return 1 ;; + esac + ;; + Darwin) + case "$(uname -m)" in + x86_64) pdf_oxide_subdir="darwin_amd64" ;; + arm64) pdf_oxide_subdir="darwin_arm64" ;; + *) echo "pdf_oxide: unsupported arch"; return 1 ;; + esac + ;; + esac + export CGO_LDFLAGS="$CGO_LDFLAGS ${PDF_OXIDE_PREFIX}/lib/${pdf_oxide_subdir}/libpdf_oxide.a" - echo "CGO_LDFLAGS (with PDF): $CGO_LDFLAGS" + # ── platform-specific system libraries ──────────────────────────── + case "$(uname -s)" in + Linux) + export CGO_LDFLAGS="$CGO_LDFLAGS -lm -lpthread -ldl -lrt -lgcc_s -lutil -lc" + ;; + Darwin) + export CGO_LDFLAGS="$CGO_LDFLAGS \ + -framework CoreFoundation -framework Security \ + -framework SystemConfiguration -liconv -lresolv" + ;; + esac + + echo "CGO_CFLAGS: $CGO_CFLAGS" + echo "CGO_LDFLAGS: $CGO_LDFLAGS" } # Run Go unit tests with the same CGO env as `build_go`. Pass any extra args @@ -492,9 +424,7 @@ run_go_tests() { print_section "Running Go tests" cd "$PROJECT_ROOT" - check_office_oxide_deps || true setup_cgo_env - setup_cgo_env_pdf if [ "$#" -eq 0 ]; then set -- ./... @@ -534,10 +464,6 @@ run() { cd "$PROJECT_ROOT" - # Set LD_LIBRARY_PATH for native libraries that were linked at build time. - # Libraries are only in the search path when they were present during build. - setup_cgo_env - # admin_server must be running before ragflow_server, otherwise ragflow_server's # heartbeats to admin will error out (see internal/development.md). print_section "Starting admin server (background)" diff --git a/internal/deepdoc/parser/pdf/pdfium/pdfium.go b/internal/deepdoc/parser/pdf/pdfium/pdfium.go index 93b91e4a2a..9fbeeeabe6 100644 --- a/internal/deepdoc/parser/pdf/pdfium/pdfium.go +++ b/internal/deepdoc/parser/pdf/pdfium/pdfium.go @@ -1,13 +1,11 @@ -//go:build cgo - -// Package pdfium renders PDF pages using the system's libpdfium.so -// (bundled with pypdfium2). It exists solely to replace pdf_oxide's +// Package pdfium renders PDF pages using libpdfium (statically linked +// at build time via CGO_LDFLAGS). It exists solely to replace pdf_oxide's // RenderPageRaw for use cases where image quality matters for downstream // OCR/DLA — pdf_oxide still handles all text/char/table extraction. package pdfium /* -#cgo LDFLAGS: -lpdfium -lm -lpthread -ldl +#cgo LDFLAGS: -lm -lpthread -ldl #include #include diff --git a/internal/development.md b/internal/development.md index ea1e11fe63..0ab8468677 100644 --- a/internal/development.md +++ b/internal/development.md @@ -27,12 +27,24 @@ docker compose -f docker/docker-compose-base.yml up -d ./build.sh -s --go ``` -> **Note**: If you use IDEs like GoLand to run/debug directly (via Run/Debug buttons), or run `go build` / `go run` from command line, you must set the following two CGO environment variables in your run configuration or shell: +> **Note**: If you use IDEs like GoLand to run/debug directly (via Run/Debug buttons), or run `go build` / `go run` from command line, run `./build.sh --go` first to download native dependencies. Then set the following CGO environment variables in your run configuration or shell: > > ```bash -> export CGO_CFLAGS="-I${HOME}/.office_oxide/include/office_oxide_c" -> export CGO_LDFLAGS="-L${HOME}/.office_oxide/lib -loffice_oxide -Wl,-rpath,${HOME}/.office_oxide/lib" +> RAGFLOW_DEPS="${HOME}/ragflow-native-libs" +> PLATFORM="linux_amd64" # or darwin_amd64, linux_arm64, darwin_arm64 +> +> export CGO_CFLAGS="-I${RAGFLOW_DEPS}/office_oxide/include/office_oxide_c" +> export CGO_LDFLAGS="\ +> ${RAGFLOW_DEPS}/office_oxide/lib/liboffice_oxide.a \ +> ${RAGFLOW_DEPS}/pdfium-static/lib/libpdfium.a \ +> ${RAGFLOW_DEPS}/pdfium-static/lib/libc++.a \ +> ${RAGFLOW_DEPS}/pdfium-static/lib/libc++abi.a \ +> ${RAGFLOW_DEPS}/pdf_oxide/lib/${PLATFORM}/libpdf_oxide.a \ +> -fuse-ld=lld \ +> -lm -lpthread -ldl -lrt -lgcc_s -lutil -lc" > ``` +> +> All three native libraries are statically linked — no `LD_LIBRARY_PATH` or `-Wl,-rpath` needed. ## 3. Run Go Version RAGFlow Note: admin_server must be started first; otherwise, ragflow_server will encounter errors when sending heartbeats. @@ -85,7 +97,7 @@ Type \? for help, \q to quit RAGFlow(api/default)> REGISTER USER 'aaa@aaa.com' AS 'aaa' PASSWORD 'aaa'; Register successfully RAGFlow(api/default)> login user 'aaa@aaa.com'; -password for aaa@aaa.com: Password: +password for aaa@aaa.com: Password: Login user aaa@aaa.com successfully RAGFlow(api/default)> logout; SUCCESS diff --git a/ragflow_deps/download_deps.py b/ragflow_deps/download_deps.py index 066e65a6fe..cf3839098e 100644 --- a/ragflow_deps/download_deps.py +++ b/ragflow_deps/download_deps.py @@ -67,6 +67,15 @@ def get_urls(use_china_mirrors=False) -> list[Union[str, list[str]]]: # compatibility contract. "https://github.com/browserbase/stagehand/releases/download/stagehand-server-v3/v3.7.2/stagehand-server-v3-linux-x64", "https://github.com/browserbase/stagehand/releases/download/stagehand-server-v3/v3.7.2/stagehand-server-v3-linux-arm64", + # Native static libraries for Go build (pdfium, pdf_oxide, office_oxide) + # Used by build.sh's check_*_deps functions — pre-downloaded to avoid + # network access during CI. + ["https://github.com/kognitos/pdfium-static/releases/download/chromium%2F7809/pdfium-linux-x64-static.tgz", + "pdfium-linux-x64-static.tgz"], + ["https://github.com/yfedoseev/pdf_oxide/releases/download/v0.3.67/pdf_oxide-go-ffi-linux-amd64.tar.gz", + "pdf_oxide-go-ffi-linux-amd64.tar.gz"], + ["https://github.com/yfedoseev/office_oxide/releases/download/v0.1.2/native-linux-x86_64.tar.gz", + "office_oxide-linux-x86_64.tar.gz"], ] else: return [ @@ -95,6 +104,15 @@ def get_urls(use_china_mirrors=False) -> list[Union[str, list[str]]]: # compatibility contract. "https://github.com/browserbase/stagehand/releases/download/stagehand-server-v3/v3.7.2/stagehand-server-v3-linux-x64", "https://github.com/browserbase/stagehand/releases/download/stagehand-server-v3/v3.7.2/stagehand-server-v3-linux-arm64", + # Native static libraries for Go build (pdfium, pdf_oxide, office_oxide) + # Used by build.sh's check_*_deps functions — pre-downloaded to avoid + # network access during CI. + ["https://github.com/kognitos/pdfium-static/releases/download/chromium%2F7809/pdfium-linux-x64-static.tgz", + "pdfium-linux-x64-static.tgz"], + ["https://github.com/yfedoseev/pdf_oxide/releases/download/v0.3.67/pdf_oxide-go-ffi-linux-amd64.tar.gz", + "pdf_oxide-go-ffi-linux-amd64.tar.gz"], + ["https://github.com/yfedoseev/office_oxide/releases/download/v0.1.2/native-linux-x86_64.tar.gz", + "office_oxide-linux-x86_64.tar.gz"], ] @@ -136,6 +154,29 @@ if __name__ == "__main__": if not os.path.exists(filename): urllib.request.urlretrieve(download_url, filename) + # Extract native static libraries to ~/ragflow-native-libs for Go build. + # Ensures build.sh can find them without network access. + native_deps_dir = os.path.expanduser("~/ragflow-native-libs") + extractions = [ + ("pdfium-linux-x64-static.tgz", "pdfium-static"), + ("pdf_oxide-go-ffi-linux-amd64.tar.gz", "pdf_oxide"), + ("office_oxide-linux-x86_64.tar.gz", "office_oxide"), + ] + import tarfile + for archive, subdir in extractions: + archive_path = os.path.join(os.getcwd(), archive) + if not os.path.isfile(archive_path): + print(f" Skipping extraction: {archive} not found") + continue + target = os.path.join(native_deps_dir, subdir) + if os.path.isdir(target): + print(f" ✓ {subdir} already extracted to {target}") + continue + os.makedirs(target, exist_ok=True) + print(f" Extracting {archive} → {target}") + with tarfile.open(archive_path) as tf: + tf.extractall(target) + local_dir = os.path.abspath("nltk_data") for data in ["wordnet", "punkt", "punkt_tab"]: print(f"Downloading nltk {data}...")