feat(go-api): Align document metadata batch APIs and upload_info with Python (#16269)

## Summary

  Align the Go implementations of these APIs with the Python behavior:

  - `POST /api/v1/datasets/:dataset_id/metadata/update`
  - `PATCH /api/v1/datasets/:dataset_id/documents/metadatas`
  - `POST /api/v1/documents/upload`

  ## What changed

  - Added the Go routes and handlers for the 3 APIs.
  - Aligned batch document metadata updates with Python semantics:
    - support `match` in update items
    - support list append / replace behavior
    - support deleting specific list values
    - remove metadata entirely when it becomes empty
    - create metadata for documents that previously had none when updates apply
    - count `updated` only when a document actually changes
  - Aligned `documents/upload` file uploads with Python-style `upload_info` behavior:
    - store upload-info blobs in the per-user downloads bucket
    - return lightweight upload descriptors instead of normal file-management responses
  - Improved URL upload behavior:
    - SSRF-guarded fetch with redirect validation
    - redirect limit aligned to Python behavior
    - normalize filename and MIME type
    - add `.pdf` when the fetched content is PDF
    - normalize HTML content into readable text instead of storing raw HTML shells

  ## Validation

  ### Unit tests

  Passed:

  - `go test ./internal/service`
  - `go test ./internal/handler`

  Also verified targeted cases for:

  - batch metadata update semantics
  - upload_info URL handling
  - upload_info download bucket behavior

  ### curl checks

  Verified the new Go endpoints with `curl` and compared the response shape and behavior with Python for:

  - `POST /api/v1/datasets/{dataset_id}/metadata/update`
  - `PATCH /api/v1/datasets/{dataset_id}/documents/metadatas`
  - `POST /api/v1/documents/upload`

  The Go responses were checked against Python for:
  - argument validation
  - success response shape
  - metadata update results
  - upload_info result structure
  - file vs URL input handling
This commit is contained in:
Hz_
2026-06-24 14:52:47 +08:00
committed by GitHub
parent 97718ec779
commit e35860ad74
8 changed files with 1427 additions and 11 deletions

View File

@@ -21,6 +21,7 @@ import (
"errors"
"fmt"
"mime"
"mime/multipart"
"net/http"
"path/filepath"
"ragflow/internal/common"
@@ -60,6 +61,9 @@ type documentServiceIface interface {
GetDocumentPreview(docID string) (*service.DocumentPreview, error)
DownloadDocument(datasetID, docID string) (*service.DownloadDocumentResp, error)
UpdateDatasetDocument(userID, datasetID, documentID string, req *service.UpdateDatasetDocumentRequest, present map[string]bool) (*service.UpdateDatasetDocumentResponse, common.ErrorCode, error)
BatchUpdateDocumentMetadatas(datasetID string, selector *service.DocumentMetadataSelector, updates []service.DocumentMetadataUpdate, deletes []service.DocumentMetadataDelete) (*service.BatchUpdateDocumentMetadatasResponse, common.ErrorCode, error)
UploadDocumentInfos(userID string, files []*multipart.FileHeader) ([]map[string]interface{}, common.ErrorCode, error)
UploadDocumentInfoByURL(userID, rawURL string) (map[string]interface{}, common.ErrorCode, error)
ListIngestionTasks(userID string, datasetID *string, page, pageSize int) ([]*entity.IngestionTask, error)
IngestDocuments(datasetID, userID string, docIDs []string) ([]*service.ParseDocumentResponse, error)
StopIngestionTasks(tasks []string, userID string) ([]*entity.IngestionTask, error)
@@ -1296,3 +1300,123 @@ func (h *DocumentHandler) UpdateDatasetDocument(c *gin.Context) {
"data": data,
})
}
// UploadInfo handles an upload-info request whose input is either multipart
// file(s) under the form field "file" or a remote document named by the "url"
// query parameter. Exactly one of the two inputs must be supplied.
//
// Outcomes:
//   - authentication failure: the code and message returned by GetUser
//   - unparsable multipart body, both inputs, or neither input:
//     common.CodeArgumentError
//   - service failure: the code and error text returned by the service
//   - success: HTTP 200 with "data" holding the service result; for file
//     uploads a one-element result is unwrapped to that single element
func (h *DocumentHandler) UploadInfo(c *gin.Context) {
	user, errorCode, errorMessage := GetUser(c)
	if errorCode != common.CodeSuccess {
		jsonError(c, errorCode, errorMessage)
		return
	}

	// A request that is not multipart is acceptable here because the caller
	// may be using the ?url= form instead, so only genuine parse failures are
	// rejected. Match the net/http sentinel rather than its message text: the
	// wording of an error string is not a stable contract.
	form, err := c.MultipartForm()
	if err != nil && !errors.Is(err, http.ErrNotMultipart) {
		jsonError(c, common.CodeArgumentError, "Failed to parse multipart form: "+err.Error())
		return
	}

	var fileHeaders []*multipart.FileHeader
	if form != nil && form.File != nil {
		fileHeaders = form.File["file"]
	}
	rawURL := strings.TrimSpace(c.Query("url"))

	if len(fileHeaders) > 0 && rawURL != "" {
		jsonError(c, common.CodeArgumentError, "Provide either multipart file(s) or ?url=..., not both.")
		return
	}
	if len(fileHeaders) == 0 && rawURL == "" {
		jsonError(c, common.CodeArgumentError, "Missing input: provide multipart file(s) or url")
		return
	}

	if rawURL != "" {
		data, code, err := h.documentService.UploadDocumentInfoByURL(user.ID, rawURL)
		if err != nil {
			jsonError(c, code, err.Error())
			return
		}
		c.JSON(http.StatusOK, gin.H{
			"code":    common.CodeSuccess,
			"data":    data,
			"message": "success",
		})
		return
	}

	data, code, err := h.documentService.UploadDocumentInfos(user.ID, fileHeaders)
	if err != nil {
		jsonError(c, code, err.Error())
		return
	}

	// A single descriptor is returned as an object, anything else as a list.
	var payload interface{}
	if len(data) == 1 {
		payload = data[0]
	} else {
		payload = data
	}
	c.JSON(http.StatusOK, gin.H{
		"code":    common.CodeSuccess,
		"data":    payload,
		"message": "success",
	})
}
// documentMetadataBatchRequest is the JSON request body decoded by
// handleBatchUpdateDocumentMetadatas. Every field may be absent from the
// payload; the handler substitutes an empty value for any field left nil.
type documentMetadataBatchRequest struct {
// Selector is passed to the service as-is; presumably it narrows which
// documents of the dataset are affected — confirm against the service.
Selector *service.DocumentMetadataSelector `json:"selector"`
// Updates lists the metadata update operations forwarded to the service.
Updates []service.DocumentMetadataUpdate `json:"updates"`
// Deletes lists the metadata delete operations forwarded to the service.
Deletes []service.DocumentMetadataDelete `json:"deletes"`
}
// MetadataBatchUpdate is a gin handler that delegates entirely to
// handleBatchUpdateDocumentMetadatas.
// NOTE(review): presumably bound to POST
// /api/v1/datasets/:dataset_id/metadata/update — confirm in the router.
func (h *DocumentHandler) MetadataBatchUpdate(c *gin.Context) {
h.handleBatchUpdateDocumentMetadatas(c)
}
// UpdateDocumentMetadatas is a gin handler that delegates entirely to
// handleBatchUpdateDocumentMetadatas.
// NOTE(review): presumably bound to PATCH
// /api/v1/datasets/:dataset_id/documents/metadatas — confirm in the router.
func (h *DocumentHandler) UpdateDocumentMetadatas(c *gin.Context) {
h.handleBatchUpdateDocumentMetadatas(c)
}
// handleBatchUpdateDocumentMetadatas authenticates the caller, checks access
// to the dataset named by the "dataset_id" path parameter, decodes a
// documentMetadataBatchRequest from the JSON body and forwards it to the
// document service.
//
// Outcomes:
//   - authentication failure: the code and message returned by GetUser
//   - blank dataset_id: common.CodeArgumentError
//   - dataset not accessible to the user, or undecodable body:
//     common.CodeDataError
//   - service failure: the code and error text returned by the service
//   - success: HTTP 200 with the service response under "data"
func (h *DocumentHandler) handleBatchUpdateDocumentMetadatas(c *gin.Context) {
	currentUser, authCode, authMsg := GetUser(c)
	if authCode != common.CodeSuccess {
		jsonError(c, authCode, authMsg)
		return
	}

	// Cases are evaluated top to bottom, so the access check only runs once
	// the ID is known to be non-empty.
	datasetID := strings.TrimSpace(c.Param("dataset_id"))
	switch {
	case datasetID == "":
		jsonError(c, common.CodeArgumentError, "dataset_id is required")
		return
	case !h.datasetService.Accessible(datasetID, currentUser.ID):
		jsonError(c, common.CodeDataError, "You don't own the dataset "+datasetID+".")
		return
	}

	var body documentMetadataBatchRequest
	bindErr := c.ShouldBindJSON(&body)
	if bindErr != nil {
		jsonError(c, common.CodeDataError, bindErr.Error())
		return
	}

	// Hand the service non-nil values for whatever the payload left out.
	selector := body.Selector
	if selector == nil {
		selector = &service.DocumentMetadataSelector{}
	}
	updates := body.Updates
	if updates == nil {
		updates = []service.DocumentMetadataUpdate{}
	}
	deletes := body.Deletes
	if deletes == nil {
		deletes = []service.DocumentMetadataDelete{}
	}

	result, svcCode, svcErr := h.documentService.BatchUpdateDocumentMetadatas(datasetID, selector, updates, deletes)
	if svcErr != nil {
		jsonError(c, svcCode, svcErr.Error())
		return
	}

	c.JSON(http.StatusOK, gin.H{
		"code":    common.CodeSuccess,
		"data":    result,
		"message": "success",
	})
}

View File

@@ -19,6 +19,7 @@ package handler
import (
"encoding/json"
"fmt"
"mime/multipart"
"net/http"
"net/http/httptest"
"strings"
@@ -49,6 +50,15 @@ type fakeDocumentService struct {
// UpdateDatasetDocument is a test stub that ignores its arguments and returns
// a nil response, common.CodeSuccess and a nil error.
func (f *fakeDocumentService) UpdateDatasetDocument(userID, datasetID, documentID string, req *service.UpdateDatasetDocumentRequest, present map[string]bool) (*service.UpdateDatasetDocumentResponse, common.ErrorCode, error) {
return nil, common.CodeSuccess, nil
}
// BatchUpdateDocumentMetadatas is a test stub that ignores its arguments and
// returns a nil response, common.CodeSuccess and a nil error.
func (f *fakeDocumentService) BatchUpdateDocumentMetadatas(datasetID string, selector *service.DocumentMetadataSelector, updates []service.DocumentMetadataUpdate, deletes []service.DocumentMetadataDelete) (*service.BatchUpdateDocumentMetadatasResponse, common.ErrorCode, error) {
return nil, common.CodeSuccess, nil
}
// UploadDocumentInfos is a test stub that ignores its arguments and returns a
// nil slice, common.CodeSuccess and a nil error.
func (f *fakeDocumentService) UploadDocumentInfos(userID string, files []*multipart.FileHeader) ([]map[string]interface{}, common.ErrorCode, error) {
return nil, common.CodeSuccess, nil
}
// UploadDocumentInfoByURL is a test stub that ignores its arguments and
// returns a nil map, common.CodeSuccess and a nil error.
func (f *fakeDocumentService) UploadDocumentInfoByURL(userID, rawURL string) (map[string]interface{}, common.ErrorCode, error) {
return nil, common.CodeSuccess, nil
}
func (f *fakeDocumentService) GetDocumentArtifact(filename string) (*service.ArtifactResponse, error) {
if filename == "error.txt" {