feat(go-api): implement dataset document upload API (#16295)

## Summary
Migrated the dataset document upload API (`POST
/api/v1/datasets/:dataset_id/documents`) from Python to the Go backend.
It supports local file uploads (`type=local`), web page ingestion
(`type=web`), and empty document creation (`type=empty`).

## Changes
- **Router**: Registered `POST /api/v1/datasets/:dataset_id/documents`
route.
- **Handler**: Implemented `UploadDocuments` handler and its routing
functions (`uploadLocalDocuments`, `uploadWebDocument`,
`uploadEmptyDocument`).
- **Service**: Implemented `UploadLocalDocuments`, `UploadWebDocument`,
and `UploadEmptyDocument` in `DocumentService`.
- **Refactoring**: Moved permission checking logic to a shared helper
for reuse in file and document services.
- **Tests**: Added comprehensive unit tests for the new handler and
service upload paths.

## Verification
Ran and passed the test suite for service and handler packages:
- `go test ./internal/service`
- `go test ./internal/handler`
This commit is contained in:
Hz_
2026-06-25 13:36:49 +08:00
committed by GitHub
parent ced51114f4
commit a6cc3023c5
12 changed files with 1161 additions and 70 deletions

View File

@@ -20,9 +20,11 @@ import (
"encoding/json"
"errors"
"fmt"
"io"
"mime"
"mime/multipart"
"net/http"
"net/url"
"path/filepath"
"ragflow/internal/common"
"ragflow/internal/entity"
@@ -59,6 +61,9 @@ type documentServiceIface interface {
GetDocumentMetadataByID(docID string) (map[string]interface{}, error)
GetDocumentArtifact(filename string) (*service.ArtifactResponse, error)
GetDocumentPreview(docID string) (*service.DocumentPreview, error)
UploadLocalDocuments(kb *entity.Knowledgebase, tenantID string, files []*multipart.FileHeader, parentPath string, parserConfigOverride map[string]interface{}) ([]map[string]interface{}, []string)
UploadWebDocument(kb *entity.Knowledgebase, tenantID, name, url string) (map[string]interface{}, common.ErrorCode, error)
UploadEmptyDocument(kb *entity.Knowledgebase, tenantID, name string) (map[string]interface{}, common.ErrorCode, error)
DownloadDocument(datasetID, docID string) (*service.DownloadDocumentResp, error)
UpdateDatasetDocument(userID, datasetID, documentID string, req *service.UpdateDatasetDocumentRequest, present map[string]bool) (*service.UpdateDatasetDocumentResponse, common.ErrorCode, error)
BatchUpdateDocumentMetadatas(datasetID string, selector *service.DocumentMetadataSelector, updates []service.DocumentMetadataUpdate, deletes []service.DocumentMetadataDelete) (*service.BatchUpdateDocumentMetadatasResponse, common.ErrorCode, error)
@@ -532,6 +537,197 @@ func (h *DocumentHandler) ListDocuments(c *gin.Context) {
})
}
// UploadDocuments is the entry point for dataset document uploads.
//
// It resolves the caller through GetUser, loads the dataset named by the
// "dataset_id" path parameter, checks the caller's team permission on it, and
// then dispatches on the "type" query parameter (case-insensitive, defaulting
// to "local") to the local-file, web-page, or empty-document upload path.
// Every failure is reported through jsonError.
func (h *DocumentHandler) UploadDocuments(c *gin.Context) {
	user, userCode, userMsg := GetUser(c)
	if userCode != common.CodeSuccess {
		jsonError(c, userCode, userMsg)
		return
	}

	// The authenticated user's ID is used as the tenant ID downstream.
	tenantID := user.ID
	datasetID := c.Param("dataset_id")
	kind := strings.ToLower(c.DefaultQuery("type", "local"))

	kb, lookupErr := h.datasetService.GetKnowledgebaseByID(datasetID)
	if lookupErr != nil || kb == nil {
		jsonError(c, common.CodeDataError, fmt.Sprintf("Can't find the dataset with ID %s!", datasetID))
		return
	}
	if allowed := h.datasetService.CheckKBTeamPermission(kb, tenantID); !allowed {
		jsonError(c, common.CodeAuthenticationError, "No authorization.")
		return
	}

	// The upload type is validated only after the dataset and permission
	// checks, so those errors take precedence over an unknown type.
	var upload func(*gin.Context, *entity.Knowledgebase, string)
	switch kind {
	case "local":
		upload = h.uploadLocalDocuments
	case "web":
		upload = h.uploadWebDocument
	case "empty":
		upload = h.uploadEmptyDocument
	default:
		jsonError(c, common.CodeArgumentError, `"type" must be one of "local", "web", or "empty".`)
		return
	}
	upload(c, kb, tenantID)
}
// uploadLocalDocuments handles type=local uploads.
//
// It expects a multipart form with one or more "file" parts, rejects empty or
// over-long (more than 255 bytes) file names, forwards the files together with
// the optional "parent_path" and allow-listed "parser_config" override to the
// document service, and writes the result. When the service reports both
// created documents and per-file errors, the response data is an object with
// "documents" and "errors"; otherwise it is the bare document list. With
// return_raw_files=true the service rows are returned without key mapping.
func (h *DocumentHandler) uploadLocalDocuments(c *gin.Context, kb *entity.Knowledgebase, tenantID string) {
	form, formErr := c.MultipartForm()
	if formErr != nil || form == nil || len(form.File["file"]) == 0 {
		jsonError(c, common.CodeArgumentError, "No file part!")
		return
	}

	uploads := form.File["file"]
	for _, header := range uploads {
		if header == nil || header.Filename == "" {
			jsonError(c, common.CodeArgumentError, "No file selected!")
			return
		}
		// len on a string counts bytes, which is what the limit is expressed in.
		if len(header.Filename) > 255 {
			jsonError(c, common.CodeArgumentError, "File name must be 255 bytes or less.")
			return
		}
	}

	override := allowListedParserConfigOverride(c.PostForm("parser_config"))
	data, errMsgs := h.documentService.UploadLocalDocuments(kb, tenantID, uploads, c.PostForm("parent_path"), override)

	if len(data) == 0 {
		if len(errMsgs) > 0 {
			jsonError(c, common.CodeServerError, strings.Join(errMsgs, "\n"))
			return
		}
		jsonError(c, common.CodeDataError, "There seems to be an issue with your file format. please verify it is correct and not corrupted.")
		return
	}

	// Choose the document representation: raw service rows on request,
	// otherwise the public key mapping.
	var docs interface{} = data
	if strings.ToLower(c.DefaultQuery("return_raw_files", "false")) != "true" {
		mapped := make([]map[string]interface{}, len(data))
		for idx := range data {
			mapped[idx] = mapDocKeysWithRunStatus(data[idx])
		}
		docs = mapped
	}

	// Partial success: report the created documents alongside per-file errors.
	if len(errMsgs) > 0 {
		jsonSuccess(c, gin.H{"documents": docs, "errors": errMsgs})
		return
	}
	jsonSuccess(c, docs)
}

// allowListedParserConfigOverride extracts the optional parser_config override
// from a raw form value, keeping only the allow-listed table column keys.
// Blank, malformed, or non-object input yields nil instead of an error so
// that a bad override never fails the whole upload request (the original
// inline comment notes Python ignores such input as well).
func allowListedParserConfigOverride(raw string) map[string]interface{} {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return nil
	}
	var cfg map[string]interface{}
	if err := json.Unmarshal([]byte(raw), &cfg); err != nil || cfg == nil {
		return nil
	}
	var picked map[string]interface{}
	for _, key := range []string{"table_column_mode", "table_column_roles"} {
		val, present := cfg[key]
		if !present {
			continue
		}
		if picked == nil {
			picked = map[string]interface{}{}
		}
		picked[key] = val
	}
	return picked
}
// uploadEmptyDocument handles type=empty uploads.
//
// It reads an optional JSON body of the form {"name": "..."}, requires a
// non-blank name of at most 255 bytes, asks the document service to create
// the empty document, and responds with the mapped document on success.
func (h *DocumentHandler) uploadEmptyDocument(c *gin.Context, kb *entity.Knowledgebase, tenantID string) {
	var payload struct {
		Name string `json:"name"`
	}
	// io.EOF means the body was empty; that case is left to the name check
	// below. Any other bind error is surfaced as-is, so a malformed body is
	// not reported as a missing file name.
	bindErr := c.ShouldBindJSON(&payload)
	if bindErr != nil && !errors.Is(bindErr, io.EOF) {
		jsonError(c, common.CodeArgumentError, "Invalid JSON body: "+bindErr.Error())
		return
	}

	docName := strings.TrimSpace(payload.Name)
	switch {
	case docName == "":
		jsonError(c, common.CodeArgumentError, "File name can't be empty.")
		return
	case len(docName) > 255: // len on a string counts bytes
		jsonError(c, common.CodeArgumentError, "File name must be 255 bytes or less.")
		return
	}

	doc, code, svcErr := h.documentService.UploadEmptyDocument(kb, tenantID, docName)
	if svcErr != nil {
		jsonError(c, code, svcErr.Error())
		return
	}
	jsonSuccess(c, mapDocKeysWithRunStatus(doc))
}
// uploadWebDocument handles type=web uploads.
//
// It reads the "name" and "url" form fields, requires both to be non-blank,
// limits the name to 255 bytes, requires the URL to be an http/https URL with
// a host, and then asks the document service to ingest the page. On success
// the mapped document is returned; service failures are reported with the
// error code the service supplied.
func (h *DocumentHandler) uploadWebDocument(c *gin.Context, kb *entity.Knowledgebase, tenantID string) {
	name := strings.TrimSpace(c.PostForm("name"))
	// Trim the URL once, up front. isValidHTTPURL trims before parsing, so
	// passing the untrimmed form value on to the service would hand it a
	// string that differs from the one that was actually validated.
	rawURL := strings.TrimSpace(c.PostForm("url"))
	if name == "" {
		jsonError(c, common.CodeArgumentError, `Lack of "name"`)
		return
	}
	if rawURL == "" {
		jsonError(c, common.CodeArgumentError, `Lack of "url"`)
		return
	}
	// len on a string counts bytes, which is what the limit is expressed in.
	if len(name) > 255 {
		jsonError(c, common.CodeArgumentError, "File name must be 255 bytes or less.")
		return
	}
	if !isValidHTTPURL(rawURL) {
		jsonError(c, common.CodeArgumentError, "The URL format is invalid")
		return
	}
	data, code, err := h.documentService.UploadWebDocument(kb, tenantID, name, rawURL)
	if err != nil {
		jsonError(c, code, err.Error())
		return
	}
	jsonSuccess(c, mapDocKeysWithRunStatus(data))
}
// jsonSuccess replies with HTTP 200 and the standard success envelope:
// "code" set to common.CodeSuccess, "message" set to "success", and the
// supplied payload under "data".
func jsonSuccess(c *gin.Context, data interface{}) {
	envelope := gin.H{
		"data":    data,
		"message": "success",
		"code":    common.CodeSuccess,
	}
	c.JSON(http.StatusOK, envelope)
}
// mapDocKeysWithRunStatus converts a freshly-created document's raw row into
// the public response shape.
//
// Four keys are always emitted under their public names, even when absent
// from raw (the value is then nil): chunk_num→chunk_count,
// token_num→token_count, kb_id→dataset_id, parser_id→chunk_method. "run" is
// always the label "UNSTART". A fixed set of other keys is copied through
// unchanged when present; everything else in raw is dropped.
// Mirrors Python map_doc_keys_with_run_status / map_doc_keys.
func mapDocKeysWithRunStatus(raw map[string]interface{}) map[string]interface{} {
	// Each entry is {raw key, public key}.
	renamed := [...][2]string{
		{"chunk_num", "chunk_count"},
		{"token_num", "token_count"},
		{"kb_id", "dataset_id"},
		{"parser_id", "chunk_method"},
	}
	passthrough := [...]string{
		"id", "name", "type", "size", "suffix", "source_type",
		"created_by", "parser_config", "location", "pipeline_id", "content_hash",
	}

	mapped := make(map[string]interface{}, len(renamed)+len(passthrough)+1)
	for _, pair := range renamed {
		mapped[pair[1]] = raw[pair[0]]
	}
	mapped["run"] = "UNSTART"
	for _, key := range passthrough {
		if value, present := raw[key]; present {
			mapped[key] = value
		}
	}
	return mapped
}
// isValidHTTPURL reports whether raw, after trimming surrounding whitespace,
// parses as a URL with an http or https scheme and a non-empty host.
// Mirrors Python is_valid_url.
func isValidHTTPURL(raw string) bool {
	parsed, parseErr := url.Parse(strings.TrimSpace(raw))
	if parseErr != nil || parsed.Host == "" {
		return false
	}
	switch parsed.Scheme {
	case "http", "https":
		return true
	default:
		return false
	}
}
func (h *DocumentHandler) DownloadDocument(c *gin.Context) {
datasetID := c.Param("dataset_id")
docID := c.Param("document_id")

View File

@@ -17,6 +17,7 @@
package handler
import (
"bytes"
"encoding/json"
"fmt"
"mime/multipart"
@@ -45,6 +46,11 @@ type fakeDocumentService struct {
metadataErr error
metadataKBID string
metadataDocIDs []string
uploadLocalData []map[string]interface{}
uploadLocalErrs []string
uploadLocalKB *entity.Knowledgebase
uploadLocalPath string
uploadOverride map[string]interface{}
ingestCode common.ErrorCode
ingestErr error
ingestUserID string
@@ -60,6 +66,8 @@ func (f *fakeDocumentService) Ingest(userID string, req *service.IngestDocumentR
return common.CodeSuccess, nil
}
// uploadTestDatasetID is the "dataset_id" path parameter that
// setupUploadContext puts on every upload test request.
// NOTE(review): setupUploadHandlerDB seeds the knowledgebase under the same
// hex digits without dashes — presumably the dataset lookup normalizes the
// dashed form; confirm against GetKnowledgebaseByID.
const uploadTestDatasetID = "123e4567-e89b-12d3-a456-426614174000"
// UpdateDatasetDocument is a stub: it ignores its arguments and returns a nil
// response, common.CodeSuccess, and a nil error.
func (f *fakeDocumentService) UpdateDatasetDocument(userID, datasetID, documentID string, req *service.UpdateDatasetDocumentRequest, present map[string]bool) (*service.UpdateDatasetDocumentResponse, common.ErrorCode, error) {
return nil, common.CodeSuccess, nil
}
@@ -163,6 +171,18 @@ func (f *fakeDocumentService) DeleteDocumentAllMetadata(docID string) error {
// GetDocumentMetadataByID is a stub: it ignores docID and returns nil
// metadata with a nil error.
func (f *fakeDocumentService) GetDocumentMetadataByID(docID string) (map[string]interface{}, error) {
return nil, nil
}
// UploadLocalDocuments records the knowledgebase, parent path, and
// parser-config override it was called with so tests can inspect them, then
// returns the canned uploadLocalData / uploadLocalErrs configured on the fake.
// The tenantID and files arguments are ignored.
func (f *fakeDocumentService) UploadLocalDocuments(kb *entity.Knowledgebase, tenantID string, files []*multipart.FileHeader, parentPath string, parserConfigOverride map[string]interface{}) ([]map[string]interface{}, []string) {
	f.uploadOverride = parserConfigOverride
	f.uploadLocalPath = parentPath
	f.uploadLocalKB = kb

	docs, errs := f.uploadLocalData, f.uploadLocalErrs
	return docs, errs
}
// UploadWebDocument is an unimplemented stub: it ignores its arguments and
// always returns common.CodeServerError with a "not implemented" error.
func (f *fakeDocumentService) UploadWebDocument(kb *entity.Knowledgebase, tenantID, name, url string) (map[string]interface{}, common.ErrorCode, error) {
return nil, common.CodeServerError, fmt.Errorf("not implemented")
}
// UploadEmptyDocument is an unimplemented stub: it ignores its arguments and
// always returns common.CodeServerError with a "not implemented" error.
func (f *fakeDocumentService) UploadEmptyDocument(kb *entity.Knowledgebase, tenantID, name string) (map[string]interface{}, common.ErrorCode, error) {
return nil, common.CodeServerError, fmt.Errorf("not implemented")
}
func (f *fakeDocumentService) ListIngestionTasks(userID string, datasetID *string, page, pageSize int) ([]*entity.IngestionTask, error) {
return nil, nil
@@ -189,6 +209,81 @@ func setupGinContextWithUser(method, path, body string) (*gin.Context, *httptest
return c, w
}
// setupUploadHandlerDB opens an in-memory SQLite database, migrates the user,
// tenant, user-tenant, and knowledgebase tables, and seeds one row of each:
// user "user-1", tenant "tenant-1", a user-tenant link carrying the given
// role, and a team-permission knowledgebase owned by "tenant-1".
// Any failure aborts the calling test.
func setupUploadHandlerDB(t *testing.T, role string) *gorm.DB {
	t.Helper()

	db, openErr := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{TranslateError: true})
	if openErr != nil {
		t.Fatalf("failed to open sqlite: %v", openErr)
	}

	migrateErr := db.AutoMigrate(
		&entity.User{},
		&entity.Tenant{},
		&entity.UserTenant{},
		&entity.Knowledgebase{},
	)
	if migrateErr != nil {
		t.Fatalf("failed to migrate: %v", migrateErr)
	}

	pipelineID := "pipe-1"
	valid := string(entity.StatusValid)

	// Rows are inserted in this order; the label names the row in the
	// failure message.
	seeds := []struct {
		label string
		row   interface{}
	}{
		{"user", &entity.User{ID: "user-1", Nickname: "test", Email: "test@test.com", Password: sptr("x")}},
		{"tenant", &entity.Tenant{ID: "tenant-1", LLMID: "llm-1", EmbdID: "embd-1", ASRID: "asr-1", Status: sptr(valid)}},
		{"user_tenant", &entity.UserTenant{ID: "ut-1", UserID: "user-1", TenantID: "tenant-1", Role: role, Status: sptr(valid)}},
		{"knowledgebase", &entity.Knowledgebase{
			ID:           "123e4567e89b12d3a456426614174000",
			TenantID:     "tenant-1",
			Name:         "kb-upload",
			EmbdID:       "embd-1",
			CreatedBy:    "user-1",
			Permission:   string(entity.TenantPermissionTeam),
			ParserID:     "naive",
			PipelineID:   &pipelineID,
			ParserConfig: entity.JSONMap{"base": "cfg"},
			Status:       sptr(valid),
		}},
	}
	for _, seed := range seeds {
		if err := db.Create(seed.row).Error; err != nil {
			t.Fatalf("insert %s: %v", seed.label, err)
		}
	}
	return db
}
// setupUploadContext builds a gin test context carrying a multipart POST to
// path. The form contains every entry of fields plus a single "file" part
// named fileName with fileContent as its body. The context is populated with
// user "user-1" (under both "user" and "user_id") and the dataset_id path
// parameter uploadTestDatasetID. It returns the context and the recorder that
// captures the response.
func setupUploadContext(t *testing.T, path string, fields map[string]string, fileName string, fileContent []byte) (*gin.Context, *httptest.ResponseRecorder) {
	t.Helper()
	gin.SetMode(gin.TestMode)

	payload := &bytes.Buffer{}
	mw := multipart.NewWriter(payload)
	for key, value := range fields {
		if err := mw.WriteField(key, value); err != nil {
			t.Fatalf("write field %s: %v", key, err)
		}
	}
	filePart, err := mw.CreateFormFile("file", fileName)
	if err != nil {
		t.Fatalf("create form file: %v", err)
	}
	if _, err = filePart.Write(fileContent); err != nil {
		t.Fatalf("write form file: %v", err)
	}
	// Closing the writer emits the terminating multipart boundary.
	if err = mw.Close(); err != nil {
		t.Fatalf("close writer: %v", err)
	}

	request := httptest.NewRequest(http.MethodPost, path, payload)
	request.Header.Set("Content-Type", mw.FormDataContentType())

	recorder := httptest.NewRecorder()
	ctx, _ := gin.CreateTestContext(recorder)
	ctx.Request = request
	ctx.Params = gin.Params{{Key: "dataset_id", Value: uploadTestDatasetID}}
	ctx.Set("user", &entity.User{ID: "user-1"})
	ctx.Set("user_id", "user-1")
	return ctx, recorder
}
func setupDocumentIngestRoute(userID string, svc *fakeDocumentService) *gin.Engine {
gin.SetMode(gin.TestMode)
h := &DocumentHandler{
@@ -233,6 +328,115 @@ func TestDeleteDocumentsHandler_Success(t *testing.T) {
}
}
func TestUploadDocumentsHandler_LocalUsesFullKBAndIgnoresBadParserConfig(t *testing.T) {
db := setupUploadHandlerDB(t, "normal")
orig := dao.DB
dao.DB = db
t.Cleanup(func() { dao.DB = orig })
fake := &fakeDocumentService{
uploadLocalData: []map[string]interface{}{
{"id": "doc-1", "kb_id": "ds-1", "parser_id": "naive", "chunk_num": int64(0), "token_num": int64(0), "name": "a.txt"},
},
}
h := &DocumentHandler{
documentService: fake,
datasetService: service.NewDatasetService(),
}
c, w := setupUploadContext(t, "/api/v1/datasets/ds-1/documents?type=local", map[string]string{
"parent_path": "nested/path",
"parser_config": "{bad json",
}, "a.txt", []byte("abc"))
h.UploadDocuments(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
}
if fake.uploadLocalKB == nil {
t.Fatalf("UploadLocalDocuments was not called, response=%s", w.Body.String())
}
if fake.uploadLocalKB.TenantID != "tenant-1" || fake.uploadLocalKB.Name != "kb-upload" || fake.uploadLocalKB.ParserID != "naive" {
t.Fatalf("incomplete kb passed to service: %+v", fake.uploadLocalKB)
}
if fake.uploadLocalPath != "nested/path" {
t.Fatalf("parent path=%q, want nested/path", fake.uploadLocalPath)
}
if fake.uploadOverride != nil {
t.Fatalf("bad parser_config should be ignored, got %v", fake.uploadOverride)
}
}
// TestUploadDocumentsHandler_LocalReturnsPartialSuccess checks that when the
// service reports both a created document and a per-file error, the handler
// still answers with the success code and wraps the result as
// {"documents": [...], "errors": [...]}.
func TestUploadDocumentsHandler_LocalReturnsPartialSuccess(t *testing.T) {
	db := setupUploadHandlerDB(t, "normal")
	orig := dao.DB
	dao.DB = db
	t.Cleanup(func() { dao.DB = orig })

	fake := &fakeDocumentService{
		uploadLocalData: []map[string]interface{}{
			{"id": "doc-1", "kb_id": "ds-1", "parser_id": "naive", "chunk_num": int64(0), "token_num": int64(0), "name": "ok.txt"},
		},
		uploadLocalErrs: []string{"bad.exe: This type of file has not been supported yet!"},
	}
	h := &DocumentHandler{
		documentService: fake,
		datasetService:  service.NewDatasetService(),
	}

	c, w := setupUploadContext(t, "/api/v1/datasets/ds-1/documents?type=local", nil, "ok.txt", []byte("abc"))
	h.UploadDocuments(c)

	if w.Code != http.StatusOK {
		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
	}
	var resp map[string]interface{}
	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
		t.Fatalf("unmarshal response: %v", err)
	}
	if resp["code"] != float64(common.CodeSuccess) {
		t.Fatalf("expected success for partial upload, got %v", resp)
	}

	// Use comma-ok assertions so that an unexpected response shape fails the
	// test with a readable message instead of panicking.
	data, ok := resp["data"].(map[string]interface{})
	if !ok {
		t.Fatalf("expected data to be an object, got %T: %v", resp["data"], resp["data"])
	}
	docs, ok := data["documents"].([]interface{})
	if !ok || len(docs) != 1 {
		t.Fatalf("expected one successful document, got %v", data["documents"])
	}
	errs, ok := data["errors"].([]interface{})
	if !ok || len(errs) != 1 {
		t.Fatalf("expected one file error, got %v", data["errors"])
	}
}
func TestUploadDocumentsHandler_DeniesNonNormalTeamRole(t *testing.T) {
db := setupUploadHandlerDB(t, "admin")
orig := dao.DB
dao.DB = db
t.Cleanup(func() { dao.DB = orig })
fake := &fakeDocumentService{}
h := &DocumentHandler{
documentService: fake,
datasetService: service.NewDatasetService(),
}
c, w := setupUploadContext(t, "/api/v1/datasets/ds-1/documents?type=local", nil, "a.txt", []byte("abc"))
h.UploadDocuments(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d", w.Code)
}
var resp map[string]interface{}
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
t.Fatalf("unmarshal response: %v", err)
}
if resp["code"] == float64(common.CodeSuccess) {
t.Fatalf("expected authorization error, got %v", resp)
}
if fake.uploadLocalKB != nil {
t.Fatal("service should not be called on denied upload")
}
}
func TestDeleteDocumentsHandler_DeleteAll(t *testing.T) {
gin.SetMode(gin.TestMode)