diff --git a/build.sh b/build.sh
index 349ac645fa..2d4b7546f0 100755
--- a/build.sh
+++ b/build.sh
@@ -84,10 +84,17 @@ build_go() {
         exit 1
     fi
 
-    # Check for pcre2 library
-    if [ -f "/usr/lib/x86_64-linux-gnu/libpcre2-8.a" ] || [ -f "/usr/local/lib/libpcre2-8.a" ]; then
+    # Check for pcre2 library — known Linux paths + macOS Homebrew (Apple Silicon
+    # at /opt/homebrew, Intel Macs at /usr/local).
+    if [ -f "/usr/lib/x86_64-linux-gnu/libpcre2-8.a" ] \
+        || [ -f "/usr/local/lib/libpcre2-8.a" ] \
+        || [ -f "/opt/homebrew/lib/libpcre2-8.a" ]; then
         echo "✓ pcre2 library found"
     else
+        if [ "$(uname)" = "Darwin" ]; then
+            echo -e "${RED}Error: libpcre2-8.a not found. Install with: brew install pcre2${NC}"
+            exit 1
+        fi
         echo -e "${YELLOW}Warning: libpcre2-8.a not found. You may need to install libpcre2-dev:${NC}"
         sudo apt -y install libpcre2-dev
     fi
diff --git a/internal/binding/rag_analyzer.go b/internal/binding/rag_analyzer.go
index f1386f51a8..38f02b640b 100644
--- a/internal/binding/rag_analyzer.go
+++ b/internal/binding/rag_analyzer.go
@@ -19,7 +19,9 @@ package rag_analyzer
 /*
 #cgo CXXFLAGS: -std=c++20 -I${SRCDIR}/..
 #cgo linux LDFLAGS: ${SRCDIR}/../cpp/cmake-build-release/librag_tokenizer_c_api.a -lstdc++ -lm -lpthread /usr/lib/x86_64-linux-gnu/libpcre2-8.a
-#cgo darwin LDFLAGS: ${SRCDIR}/../cpp/cmake-build-release/librag_tokenizer_c_api.a -lstdc++ -lm -lpthread /usr/local/lib/libpcre2-8.a
+// Apple Silicon: Homebrew installs to /opt/homebrew; Intel Macs keep /usr/local.
+#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/../cpp/cmake-build-release/librag_tokenizer_c_api.a -lstdc++ -lm -lpthread /opt/homebrew/lib/libpcre2-8.a
+#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/../cpp/cmake-build-release/librag_tokenizer_c_api.a -lstdc++ -lm -lpthread /usr/local/lib/libpcre2-8.a
 #include
 #include "../cpp/rag_analyzer_c_api.h"
 
diff --git a/internal/cpp/CMakeLists.txt b/internal/cpp/CMakeLists.txt
index 9c4b4f5e29..bcd96a5fe9 100644
--- a/internal/cpp/CMakeLists.txt
+++ b/internal/cpp/CMakeLists.txt
@@ -3,6 +3,40 @@ project(rag_tokenizer)
 
 set(CMAKE_CXX_STANDARD 23)
 
+# macOS dependency discovery — Homebrew installs headers and libs under a
+# prefix that is NOT on the compiler's default search path (Apple Silicon:
+# /opt/homebrew, Intel: /usr/local). Linux is left completely untouched:
+# the infinity_builder image already ships pcre2 + simde where the
+# toolchain finds them, so adding paths there risks shadowing them.
+if(APPLE)
+    execute_process(
+        COMMAND brew --prefix
+        OUTPUT_VARIABLE HOMEBREW_PREFIX
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE BREW_RC
+    )
+    if(BREW_RC EQUAL 0 AND HOMEBREW_PREFIX)
+        message(STATUS "macOS detected; Homebrew prefix: ${HOMEBREW_PREFIX}")
+        include_directories(SYSTEM "${HOMEBREW_PREFIX}/include")
+        link_directories("${HOMEBREW_PREFIX}/lib")
+    endif()
+endif()
+
+# Resolve libpcre2-8.
+# - Linux: keep upstream's bare `libpcre2-8.a` token verbatim. The linker
+#   resolves it from its own default search path, which the
+#   infinity_builder image populates. find_library() does NOT see that
+#   path (pcre2 is built from source there), so calling it here would
+#   break the CI build that worked before.
+# - macOS: the bare token fails (libpcre2-8.a is under the Homebrew
+#   prefix, off the default path), so resolve the full path explicitly.
+if(APPLE)
+    find_library(PCRE2_LIB NAMES libpcre2-8.a pcre2-8 HINTS "${HOMEBREW_PREFIX}/lib" REQUIRED)
+else()
+    set(PCRE2_LIB libpcre2-8.a)
+endif()
+message(STATUS "PCRE2 library: ${PCRE2_LIB}")
+
 # Option to enable AddressSanitizer
 option(ENABLE_ASAN "Enable AddressSanitizer" OFF)
 
@@ -88,7 +122,7 @@ add_executable(rag_tokenizer
     ${darts_src}
     ${re2_src})
 
-target_link_libraries(rag_tokenizer stdc++ m libpcre2-8.a)
+target_link_libraries(rag_tokenizer stdc++ m ${PCRE2_LIB})
 target_include_directories(rag_tokenizer PUBLIC "${CMAKE_SOURCE_DIR}")
 set_target_properties(rag_tokenizer PROPERTIES
     CXX_STANDARD 20
@@ -118,7 +152,7 @@ add_library(rag_tokenizer_c_api STATIC
     ${re2_src}
 )
 
-target_link_libraries(rag_tokenizer_c_api stdc++ libm.a libpcre2-8.a)
+target_link_libraries(rag_tokenizer_c_api stdc++ libm.a ${PCRE2_LIB})
 target_include_directories(rag_tokenizer_c_api PUBLIC "${CMAKE_SOURCE_DIR}")
 set_target_properties(rag_tokenizer_c_api PROPERTIES
     CXX_STANDARD 20
@@ -130,7 +164,7 @@ add_executable(rag_analyzer_c_test
     rag_analyzer_c_test.cpp
 )
 
-target_link_libraries(rag_analyzer_c_test rag_tokenizer_c_api stdc++ libm.a libpcre2-8.a)
+target_link_libraries(rag_analyzer_c_test rag_tokenizer_c_api stdc++ libm.a ${PCRE2_LIB})
 target_include_directories(rag_analyzer_c_test PUBLIC "${CMAKE_SOURCE_DIR}")
 set_target_properties(rag_analyzer_c_test PROPERTIES
     CXX_STANDARD 20
diff --git a/internal/cpp/rag_analyzer.cpp b/internal/cpp/rag_analyzer.cpp
index c52ab5745f..658afec6f3 100644
--- a/internal/cpp/rag_analyzer.cpp
+++ b/internal/cpp/rag_analyzer.cpp
@@ -22,6 +22,7 @@
 #include "re2/re2.h"
 
 #include
+#include <sstream> // std::ostringstream / std::istringstream — explicit for libc++ (macOS)
 #include
 #include
 #include
@@ -143,7 +144,10 @@ std::string Join(const std::vector &tokens, int start, int end, const std::st
 
 template
 std::string Join(const std::vector &tokens, int start, const std::string &delim = " ") {
-    return Join(tokens, start, tokens.size(), delim);
+    // C++23 strict overload resolution refuses the implicit size_t → int
+    // narrowing conversion; the explicit cast makes the 4-arg overload above
+    // unambiguous on libc++ (macOS) without changing behaviour on libstdc++.
+    return Join(tokens, start, static_cast<int>(tokens.size()), delim);
 }
 
 std::string Join(const TermList &tokens, int start, int end, const std::string &delim = " ") {
diff --git a/internal/cpp/wordnet_lemmatizer.cpp b/internal/cpp/wordnet_lemmatizer.cpp
index d267beeba5..d92c3463f0 100644
--- a/internal/cpp/wordnet_lemmatizer.cpp
+++ b/internal/cpp/wordnet_lemmatizer.cpp
@@ -15,6 +15,7 @@
 #include "wordnet_lemmatizer.h"
 #include
 #include
+#include <sstream> // std::istringstream — pulled in transitively on libstdc++ (Linux), must be explicit on libc++ (macOS)
 
 namespace fs = std::filesystem;
 