mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-03 17:21:59 +08:00
Go: add office_oxide and parse docx file. (#15976)
### What problem does this PR solve? As title. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
101
build.sh
101
build.sh
@@ -18,6 +18,10 @@ RAGFLOW_SERVER_BINARY="$PROJECT_ROOT/bin/server_main"
|
||||
ADMIN_SERVER_BINARY="$PROJECT_ROOT/bin/admin_server"
|
||||
RAGFLOW_CLI_BINARY="$PROJECT_ROOT/bin/ragflow_cli"
|
||||
|
||||
# office_oxide native library settings
|
||||
OFFICE_OXIDE_PREFIX="${HOME}/.office_oxide"
|
||||
OFFICE_OXIDE_VERSION="0.1.2"
|
||||
|
||||
echo -e "${GREEN}=== RAGFlow Go Server Build Script ===${NC}"
|
||||
|
||||
# Function to print section headers
|
||||
@@ -54,6 +58,79 @@ check_go_deps() {
|
||||
echo "✓ Required tools are available"
|
||||
}
|
||||
|
||||
# Download a .tar.gz archive and unpack it into an existing directory.
#   $1 - URL of the archive to fetch
#   $2 - directory the archive is extracted into (passed to `tar -C`)
# curl is preferred; wget is used when curl is not installed. The script exits
# with status 1 when neither tool is available, or when the download or the
# extraction fails. The temporary archive file is removed on every path.
_download_and_extract() {
    local url="$1" target_dir="$2"
    echo "Downloading ${url} ..."

    local tmpfile
    if ! tmpfile="$(mktemp)"; then
        echo -e "${RED}Error: could not create a temporary file for the download${NC}"
        exit 1
    fi

    # Capture the status instead of letting a failure abort (or be ignored)
    # mid-function, so the temporary file is always cleaned up below.
    local status=0
    if command -v curl >/dev/null 2>&1; then
        curl -fsSL "$url" -o "$tmpfile" || status=$?
    elif command -v wget >/dev/null 2>&1; then
        wget -q "$url" -O "$tmpfile" || status=$?
    else
        rm -f "$tmpfile"
        echo -e "${RED}Error: need curl or wget to download office_oxide${NC}"
        exit 1
    fi

    # Only unpack an archive that was actually downloaded.
    if [ "$status" -eq 0 ]; then
        tar xzf "$tmpfile" -C "$target_dir" || status=$?
    fi
    rm -f "$tmpfile"

    if [ "$status" -ne 0 ]; then
        echo -e "${RED}Error: failed to download or extract ${url}${NC}"
        exit 1
    fi
}
|
||||
|
||||
# Check / install office_oxide native library (Rust → C FFI library)
#
# Looks for the shared library and its C header under OFFICE_OXIDE_PREFIX and
# returns 0 when both are present. Otherwise the prebuilt release archive for
# the current OS/architecture (version OFFICE_OXIDE_VERSION) is downloaded and
# extracted into OFFICE_OXIDE_PREFIX. The script exits with status 1 on an
# unsupported OS or architecture, or when either file is still missing after
# the installation attempt.
check_office_oxide_deps() {
    print_section "Checking office_oxide native library"

    # The OS decides both the shared-library file name and the asset name.
    local os_name lib_file asset_os
    os_name="$(uname -s)"
    case "$os_name" in
        Linux)  lib_file="liboffice_oxide.so";    asset_os="linux" ;;
        Darwin) lib_file="liboffice_oxide.dylib"; asset_os="macos" ;;
        *) echo -e "${RED}Unsupported OS for office_oxide${NC}"; exit 1 ;;
    esac

    local lib_path="${OFFICE_OXIDE_PREFIX}/lib/${lib_file}"
    local header_path="${OFFICE_OXIDE_PREFIX}/include/office_oxide_c/office_oxide.h"

    if [ -f "$lib_path" ] && [ -f "$header_path" ]; then
        echo "✓ office_oxide native library found at ${OFFICE_OXIDE_PREFIX}"
        return 0
    fi

    echo "office_oxide native library not found. Installing..."

    # Map platform to the release asset name. Note: the GitHub release archives
    # omit the version number from the native-* asset filenames.
    local machine asset_arch
    machine="$(uname -m)"
    case "$machine" in
        x86_64)        asset_arch="x86_64" ;;
        aarch64|arm64) asset_arch="aarch64" ;;
        *) echo -e "${RED}Unsupported arch: ${machine}${NC}"; exit 1 ;;
    esac
    local asset_name="native-${asset_os}-${asset_arch}"

    local release_url="https://github.com/yfedoseev/office_oxide/releases/download/v${OFFICE_OXIDE_VERSION}/${asset_name}.tar.gz"

    mkdir -p "${OFFICE_OXIDE_PREFIX}"
    _download_and_extract "$release_url" "${OFFICE_OXIDE_PREFIX}"

    # Verify the same two files the pre-check requires: the build needs the
    # header (CGO_CFLAGS) as well as the library (CGO_LDFLAGS).
    local required
    for required in "$lib_path" "$header_path"; do
        if [ ! -f "$required" ]; then
            echo -e "${RED}Error: Failed to install office_oxide native library (missing ${required})${NC}"
            echo "  Try: curl -fsSL ${release_url} | tar xzf - -C ${OFFICE_OXIDE_PREFIX}"
            exit 1
        fi
    done

    echo -e "${GREEN}✓ office_oxide native library installed${NC}"
}
|
||||
|
||||
# Build C++ static library
|
||||
build_cpp() {
|
||||
print_section "Building C++ static library"
|
||||
@@ -103,11 +180,26 @@ build_go() {
|
||||
echo -e "${YELLOW}Warning: libpcre2-8 not found. You may need to install libpcre2-dev:${NC}"
|
||||
sudo apt -y install libpcre2-dev
|
||||
fi
|
||||
|
||||
|
||||
# Check / install office_oxide native library
|
||||
check_office_oxide_deps
|
||||
|
||||
# Export CGO flags so go build can find office_oxide headers and library
|
||||
export CGO_CFLAGS="-I${OFFICE_OXIDE_PREFIX}/include/office_oxide_c${CGO_CFLAGS:+ $CGO_CFLAGS}"
|
||||
echo "Exporting CGO_CFLAGS: $CGO_CFLAGS"
|
||||
export CGO_LDFLAGS="-L${OFFICE_OXIDE_PREFIX}/lib -loffice_oxide -Wl,-rpath,${OFFICE_OXIDE_PREFIX}/lib${CGO_LDFLAGS:+ $CGO_LDFLAGS}"
|
||||
echo "Exporting CGO_LDFLAGS: $CGO_LDFLAGS"
|
||||
|
||||
echo "Building RAGFlow binary: $RAGFLOW_SERVER_BINARY, $ADMIN_SERVER_BINARY, and $RAGFLOW_CLI_BINARY"
|
||||
GOPROXY=${GOPROXY:-https://goproxy.cn,https://proxy.golang.org,direct} CGO_ENABLED=1 go build -o "$RAGFLOW_SERVER_BINARY" cmd/server_main.go
|
||||
GOPROXY=${GOPROXY:-https://goproxy.cn,https://proxy.golang.org,direct} CGO_ENABLED=1 go build -o "$ADMIN_SERVER_BINARY" cmd/admin_server.go
|
||||
GOPROXY=${GOPROXY:-https://goproxy.cn,https://proxy.golang.org,direct} CGO_ENABLED=1 go build -o "$RAGFLOW_CLI_BINARY" cmd/ragflow_cli.go
|
||||
GOPROXY=${GOPROXY:-https://goproxy.cn,https://proxy.golang.org,direct} CGO_ENABLED=1 \
|
||||
CGO_CFLAGS="$CGO_CFLAGS" CGO_LDFLAGS="$CGO_LDFLAGS" \
|
||||
go build -o "$RAGFLOW_SERVER_BINARY" cmd/server_main.go
|
||||
GOPROXY=${GOPROXY:-https://goproxy.cn,https://proxy.golang.org,direct} CGO_ENABLED=1 \
|
||||
CGO_CFLAGS="$CGO_CFLAGS" CGO_LDFLAGS="$CGO_LDFLAGS" \
|
||||
go build -o "$ADMIN_SERVER_BINARY" cmd/admin_server.go
|
||||
GOPROXY=${GOPROXY:-https://goproxy.cn,https://proxy.golang.org,direct} CGO_ENABLED=1 \
|
||||
CGO_CFLAGS="$CGO_CFLAGS" CGO_LDFLAGS="$CGO_LDFLAGS" \
|
||||
go build -o "$RAGFLOW_CLI_BINARY" cmd/ragflow_cli.go
|
||||
|
||||
if [ ! -f "$RAGFLOW_SERVER_BINARY" ]; then
|
||||
echo -e "${RED}Error: Failed to build RAGFlow server binary${NC}"
|
||||
@@ -183,6 +275,7 @@ DEPENDENCIES:
|
||||
- go >= 1.24
|
||||
- g++ with C++17/23 support
|
||||
- libpcre2-dev
|
||||
- office_oxide native library (auto-downloaded on first build)
|
||||
EOF
|
||||
}
|
||||
|
||||
|
||||
3
go.mod
3
go.mod
@@ -26,6 +26,8 @@ require (
|
||||
github.com/redis/go-redis/v9 v9.18.0
|
||||
github.com/siongui/gojianfan v0.0.0-20210926212422-2f175ac615de
|
||||
github.com/spf13/viper v1.18.2
|
||||
github.com/yfedoseev/office_oxide/go v0.1.2
|
||||
github.com/yfedoseev/pdf_oxide/go v0.3.63
|
||||
go.uber.org/zap v1.27.1
|
||||
golang.org/x/crypto v0.49.0
|
||||
golang.org/x/net v0.51.0
|
||||
@@ -58,6 +60,7 @@ require (
|
||||
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/ebitengine/purego v0.10.1 // indirect
|
||||
github.com/elastic/elastic-transport-go/v8 v8.8.0 // indirect
|
||||
github.com/fsnotify/fsnotify v1.7.0 // indirect
|
||||
github.com/gabriel-vasile/mimetype v1.4.2 // indirect
|
||||
|
||||
6
go.sum
6
go.sum
@@ -69,6 +69,8 @@ github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/r
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||
github.com/ebitengine/purego v0.10.1 h1:dewVBCBT2GaMu1SrNTYxQhgQBethzfhiwvZiLGP/qyY=
|
||||
github.com/ebitengine/purego v0.10.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
|
||||
github.com/elastic/elastic-transport-go/v8 v8.8.0 h1:7k1Ua+qluFr6p1jfJjGDl97ssJS/P7cHNInzfxgBQAo=
|
||||
github.com/elastic/elastic-transport-go/v8 v8.8.0/go.mod h1:YLHer5cj0csTzNFXoNQ8qhtGY1GTvSqPnKWKaqQE3Hk=
|
||||
github.com/elastic/go-elasticsearch/v8 v8.19.1 h1:0iEGt5/Ds9MNVxEp3hqLsXdbe6SjleaVHONg/FuR09Q=
|
||||
@@ -252,6 +254,10 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
|
||||
github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
|
||||
github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
|
||||
github.com/yfedoseev/office_oxide/go v0.1.2 h1:LnyVGXgJJF4tanuRUYVHZNn8e+IwGvOqtIFmQGDjPE4=
|
||||
github.com/yfedoseev/office_oxide/go v0.1.2/go.mod h1:YLtMlKUkRCp/Q96wsy7D6yoBKDeJnP66UH+c9Bb+E+M=
|
||||
github.com/yfedoseev/pdf_oxide/go v0.3.63 h1:6qlNQdaiGBGlo70je1fApQcCjeKg6AVUSUo+URCLl/s=
|
||||
github.com/yfedoseev/pdf_oxide/go v0.3.63/go.mod h1:QbJ/nLbez0al2EnqEdEPIlGflFprWmiuUM4mo9rNNOI=
|
||||
github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0=
|
||||
github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA=
|
||||
go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
|
||||
|
||||
@@ -29,6 +29,8 @@ import (
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"ragflow/internal/ingestion"
|
||||
"ragflow/internal/ingestion/parser"
|
||||
"ragflow/internal/utility"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -3249,6 +3251,22 @@ func (c *CLI) UserParseLocalFile(cmd *Command) (ResponseIf, error) {
|
||||
docParseModel = ""
|
||||
}
|
||||
|
||||
fileType := utility.GetFileType(filename)
|
||||
|
||||
fileParser, err := parser.GetParser(fileType)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fileContent, err := os.ReadFile(filename)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read dsl file: %w", err)
|
||||
}
|
||||
|
||||
if err = fileParser.Parse(filename, fileContent); err != nil {
|
||||
return nil, formatRequestError("parse local file", err)
|
||||
}
|
||||
|
||||
var result SimpleResponse
|
||||
result.Code = 0
|
||||
result.Message = fmt.Sprintf("Success to parse local file %q, vision: %v, chat: %v, asr: %v, ocr: %v, embedding: %v, doc_parse: %v", filename, visionModel, chatModel, asrModel, ocrModel, embeddingModel, docParseModel)
|
||||
|
||||
@@ -7,7 +7,7 @@ docker compose -f docker/docker-compose-base.yml up -d
|
||||
```
|
||||
|
||||
## 2. Build Go Version RAGFlow
|
||||
- First build (includes C++ dependencies):
|
||||
- First build (includes C++ dependencies and office_oxide native library):
|
||||
|
||||
```bash
|
||||
./build.sh --cpp
|
||||
@@ -19,6 +19,13 @@ docker compose -f docker/docker-compose-base.yml up -d
|
||||
./build.sh --go
|
||||
```
|
||||
|
||||
> **Note**: If you run or debug directly from an IDE such as GoLand (via its Run/Debug buttons), or invoke `go build` / `go run` from the command line instead of using `./build.sh`, you must set the following two CGO environment variables in your run configuration or shell:
|
||||
>
|
||||
> ```bash
|
||||
> export CGO_CFLAGS="-I${HOME}/.office_oxide/include/office_oxide_c"
|
||||
> export CGO_LDFLAGS="-L${HOME}/.office_oxide/lib -loffice_oxide -Wl,-rpath,${HOME}/.office_oxide/lib"
|
||||
> ```
|
||||
|
||||
## 3. Run Go Version RAGFlow
|
||||
Note: admin_server must be started first; otherwise, ragflow_server will encounter errors when sending heartbeats.
|
||||
|
||||
|
||||
35
internal/ingestion/parser/doc_parser.go
Normal file
35
internal/ingestion/parser/doc_parser.go
Normal file
@@ -0,0 +1,35 @@
|
||||
//
|
||||
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
package parser
|
||||
|
||||
import "fmt"
|
||||
|
||||
// DOCParser is the parser type for ".doc" files. It holds no state.
type DOCParser struct{}

// NewDOCParser returns a new, ready-to-use DOCParser.
func NewDOCParser() *DOCParser {
	return new(DOCParser)
}

// Parse is currently a placeholder: it prints the name of the file to
// standard output and returns nil without inspecting data.
func (*DOCParser) Parse(filename string, data []byte) error {
	fmt.Printf("Parsing DOC file: %s\n", filename)
	return nil
}

// String returns the name of the parser, "DOCParser".
func (*DOCParser) String() string {
	const name = "DOCParser"
	return name
}
|
||||
64
internal/ingestion/parser/docx_parser.go
Normal file
64
internal/ingestion/parser/docx_parser.go
Normal file
@@ -0,0 +1,64 @@
|
||||
//
|
||||
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
package parser
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
officeOxide "github.com/yfedoseev/office_oxide/go"
|
||||
)
|
||||
|
||||
type DOCXParser struct {
|
||||
}
|
||||
|
||||
func NewDOCXParser() *DOCXParser {
|
||||
return &DOCXParser{}
|
||||
}
|
||||
|
||||
func (p *DOCXParser) Parse(filename string, data []byte) error {
|
||||
|
||||
fmt.Printf("Parsing DOCX file: %s\n", filename)
|
||||
doc, err := officeOxide.OpenFromBytes(data, "docx")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer doc.Close()
|
||||
|
||||
docFormat, err := doc.Format()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Println("Document format:", docFormat)
|
||||
|
||||
docContext, err := doc.PlainText()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Println("Document context:", docContext)
|
||||
|
||||
md, err := doc.ToMarkdown()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Println("Document Markdown:", md)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *DOCXParser) String() string {
|
||||
return "DOCXParser"
|
||||
}
|
||||
35
internal/ingestion/parser/pdf_parser.go
Normal file
35
internal/ingestion/parser/pdf_parser.go
Normal file
@@ -0,0 +1,35 @@
|
||||
//
|
||||
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
package parser
|
||||
|
||||
import "fmt"
|
||||
|
||||
// PDFParser is the parser type for ".pdf" files. It holds no state.
type PDFParser struct{}

// NewPDFParser returns a new, ready-to-use PDFParser.
func NewPDFParser() *PDFParser {
	return new(PDFParser)
}

// Parse is currently a placeholder: it prints the name of the file to
// standard output and returns nil without inspecting data.
func (*PDFParser) Parse(filename string, data []byte) error {
	fmt.Printf("Parsing PDF file: %s\n", filename)
	return nil
}

// String returns the name of the parser, "PDFParser".
func (*PDFParser) String() string {
	const name = "PDFParser"
	return name
}
|
||||
35
internal/ingestion/parser/ppt_parser.go
Normal file
35
internal/ingestion/parser/ppt_parser.go
Normal file
@@ -0,0 +1,35 @@
|
||||
//
|
||||
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
package parser
|
||||
|
||||
import "fmt"
|
||||
|
||||
// PPTParser is the parser type for ".ppt" files. It holds no state.
type PPTParser struct{}

// NewPPTParser returns a new, ready-to-use PPTParser.
func NewPPTParser() *PPTParser {
	return new(PPTParser)
}

// Parse is currently a placeholder: it prints the name of the file to
// standard output and returns nil without inspecting data.
func (*PPTParser) Parse(filename string, data []byte) error {
	fmt.Printf("Parsing PPT file: %s\n", filename)
	return nil
}

// String returns the name of the parser, "PPTParser".
func (*PPTParser) String() string {
	const name = "PPTParser"
	return name
}
|
||||
35
internal/ingestion/parser/pptx_parser.go
Normal file
35
internal/ingestion/parser/pptx_parser.go
Normal file
@@ -0,0 +1,35 @@
|
||||
//
|
||||
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
package parser
|
||||
|
||||
import "fmt"
|
||||
|
||||
// PPTXParser is the parser type for ".pptx" files. It holds no state.
type PPTXParser struct{}

// NewPPTXParser returns a new, ready-to-use PPTXParser.
func NewPPTXParser() *PPTXParser {
	return new(PPTXParser)
}

// Parse is currently a placeholder: it prints the name of the file to
// standard output and returns nil without inspecting data.
func (*PPTXParser) Parse(filename string, data []byte) error {
	fmt.Printf("Parsing PPTX file: %s\n", filename)
	return nil
}

// String returns the name of the parser, "PPTXParser".
func (*PPTXParser) String() string {
	const name = "PPTXParser"
	return name
}
|
||||
51
internal/ingestion/parser/type.go
Normal file
51
internal/ingestion/parser/type.go
Normal file
@@ -0,0 +1,51 @@
|
||||
//
|
||||
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
package parser
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"ragflow/internal/utility"
|
||||
)
|
||||
|
||||
func GetParser(fileType utility.FileType) (FileParser, error) {
|
||||
switch fileType {
|
||||
case utility.FileTypePPTX:
|
||||
return NewPPTXParser(), nil
|
||||
case utility.FileTypePPT:
|
||||
return NewPPTParser(), nil
|
||||
case utility.FileTypeXLSX:
|
||||
return NewXLSXParser(), nil
|
||||
case utility.FileTypeXLS:
|
||||
return NewXLSParser(), nil
|
||||
case utility.FileTypeDOCX:
|
||||
return NewDOCXParser(), nil
|
||||
case utility.FileTypeDOC:
|
||||
return NewDOCParser(), nil
|
||||
case utility.FileTypePDF:
|
||||
return NewPDFParser(), nil
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported file type: %s", fileType)
|
||||
}
|
||||
}
|
||||
|
||||
// FileParser is the interface implemented by every per-format parser that
// GetParser can return.
|
||||
type FileParser interface {
|
||||
// Parse processes the raw file content in data; filename names the file
// the content was read from. It returns a non-nil error when parsing fails.
|
||||
Parse(filename string, data []byte) error
|
||||
|
||||
// String returns the name of the parser (for example "DOCXParser").
String() string
|
||||
}
|
||||
35
internal/ingestion/parser/xls_parser.go
Normal file
35
internal/ingestion/parser/xls_parser.go
Normal file
@@ -0,0 +1,35 @@
|
||||
//
|
||||
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
package parser
|
||||
|
||||
import "fmt"
|
||||
|
||||
// XLSParser is the parser type for ".xls" files. It holds no state.
type XLSParser struct{}

// NewXLSParser returns a new, ready-to-use XLSParser.
func NewXLSParser() *XLSParser {
	return new(XLSParser)
}

// Parse is currently a placeholder: it prints the name of the file to
// standard output and returns nil without inspecting data.
func (*XLSParser) Parse(filename string, data []byte) error {
	fmt.Printf("Parsing XLS file: %s\n", filename)
	return nil
}

// String returns the name of the parser, "XLSParser".
func (*XLSParser) String() string {
	const name = "XLSParser"
	return name
}
|
||||
35
internal/ingestion/parser/xlsx_parser.go
Normal file
35
internal/ingestion/parser/xlsx_parser.go
Normal file
@@ -0,0 +1,35 @@
|
||||
//
|
||||
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
package parser
|
||||
|
||||
import "fmt"
|
||||
|
||||
// XLSXParser is the parser type for ".xlsx" files. It holds no state.
type XLSXParser struct{}

// NewXLSXParser returns a new, ready-to-use XLSXParser.
func NewXLSXParser() *XLSXParser {
	return new(XLSXParser)
}

// Parse is currently a placeholder: it prints the name of the file to
// standard output and returns nil without inspecting data.
func (*XLSXParser) Parse(filename string, data []byte) error {
	fmt.Printf("Parsing XLSX file: %s\n", filename)
	return nil
}

// String returns the name of the parser, "XLSXParser".
func (*XLSXParser) String() string {
	const name = "XLSXParser"
	return name
}
|
||||
@@ -375,7 +375,7 @@ func (s *FileService) UploadFile(tenantID, parentID string, files []*multipart.F
|
||||
Name: uniqueName,
|
||||
Location: &location,
|
||||
Size: int64(len(data)),
|
||||
Type: fileType,
|
||||
Type: string(fileType),
|
||||
SourceType: "",
|
||||
}
|
||||
|
||||
|
||||
@@ -22,13 +22,20 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// FileType is a string-based type naming the category of a file, such as
// "pdf", "docx" or "other". GetFileType and FilenameType return values of
// this type.
type FileType string
|
||||
|
||||
// File type values. Each constant's value is the lowercase name shown in its
// string literal.
const (
|
||||
// NOTE(review): the six untyped declarations that follow duplicate names that
// are declared again below with the FileType type; they look like the
// pre-change side of this diff hunk — confirm against the real file.
FileTypePDF = "pdf"
|
||||
FileTypeDOC = "doc"
|
||||
FileTypeVISUAL = "visual"
|
||||
FileTypeAURAL = "aural"
|
||||
FileTypeFOLDER = "folder"
|
||||
FileTypeOTHER = "other"
|
||||
// Typed declarations: every constant below has type FileType.
FileTypePDF FileType = "pdf"
|
||||
FileTypeDOC FileType = "doc"
|
||||
FileTypeDOCX FileType = "docx"
|
||||
FileTypePPT FileType = "ppt"
|
||||
FileTypePPTX FileType = "pptx"
|
||||
FileTypeXLS FileType = "xls"
|
||||
FileTypeXLSX FileType = "xlsx"
|
||||
FileTypeVISUAL FileType = "visual"
|
||||
FileTypeAURAL FileType = "aural"
|
||||
FileTypeFOLDER FileType = "folder"
|
||||
FileTypeOTHER FileType = "other"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -50,7 +57,37 @@ func normalizeFilename(filename string) (string, bool) {
|
||||
return strings.ToLower(base), true
|
||||
}
|
||||
|
||||
func FilenameType(filename string) string {
|
||||
func GetFileType(filename string) FileType {
|
||||
|
||||
ext := filepath.Ext(filename)
|
||||
var suffix string
|
||||
if len(ext) > 0 && ext[0] == '.' {
|
||||
suffix = strings.ToLower(ext[1:])
|
||||
} else {
|
||||
suffix = strings.ToLower(ext)
|
||||
}
|
||||
|
||||
switch suffix {
|
||||
case "pdf":
|
||||
return FileTypePDF
|
||||
case "xls":
|
||||
return FileTypeXLS
|
||||
case "xlsx":
|
||||
return FileTypeXLSX
|
||||
case "doc":
|
||||
return FileTypeDOC
|
||||
case "docx":
|
||||
return FileTypeDOCX
|
||||
case "ppt":
|
||||
return FileTypePPT
|
||||
case "pptx":
|
||||
return FileTypePPTX
|
||||
default:
|
||||
return FileTypeOTHER
|
||||
}
|
||||
}
|
||||
|
||||
func FilenameType(filename string) FileType {
|
||||
normalized, ok := normalizeFilename(filename)
|
||||
if !ok {
|
||||
return FileTypeOTHER
|
||||
@@ -216,7 +253,7 @@ var FORCE_ATTACHMENT_CONTENT_TYPES = map[string]bool{
|
||||
"image/svg+xml": true,
|
||||
"application/xhtml+xml": true,
|
||||
"text/xml": true,
|
||||
"application/xml": true,
|
||||
"application/xml": true,
|
||||
"multipart/related": true,
|
||||
}
|
||||
|
||||
@@ -241,7 +278,7 @@ func GetContentType(ext string, fileType string) string {
|
||||
return contentType
|
||||
}
|
||||
fallbackPrefix := "application"
|
||||
if fileType == FileTypeVISUAL {
|
||||
if fileType == string(FileTypeVISUAL) {
|
||||
fallbackPrefix = "image"
|
||||
}
|
||||
return fallbackPrefix + "/" + normalizedExt
|
||||
|
||||
Reference in New Issue
Block a user