fix(go-api): sync document handler interface and enforce preview acce… (#15688)

### Description

This PR syncs the `documentServiceIface` interface and introduces
handler methods for document preview, artifact fetching, and downloading
in the Go API. It also ensures that strict dataset alignment and access
checks are enforced when retrieving or downloading documents.

Furthermore, this PR introduces comprehensive unit tests for both the
newly added Handler and Service methods to ensure robustness and prevent
future regressions.

### Key Changes
* **Router & Handler Integration**: 
  * Added and wired new API endpoints in `internal/router/router.go`.
* Synchronized the `documentServiceIface` with `GetDocumentArtifact`,
`GetDocumentPreview`, and `DownloadDocument`.
* Implemented handlers for these endpoints in
`internal/handler/document.go`.
* **Access & Validation Enforcement**: 
* Refactored `internal/service/document.go` to strictly check if a
document belongs to the requested dataset before allowing downloads or
previews.
* Added robust artifact file sanitization (`sanitizeArtifactFilename`)
and attachment handling (`shouldForceArtifactAttachment`).
* **Comprehensive Unit Testing**:
* **Handler Layer (`internal/handler/document_test.go`)**: Added mock
service implementations and Gin router tests covering success,
not-found, and internal error states for all 3 new endpoints.
* **Service Layer (`internal/service/document_test.go`)**: Added
extensive business logic tests including dataset mismatch checks,
non-existent document checks, and artifact file validation.
This commit is contained in:
Hz_
2026-06-08 11:37:06 +08:00
committed by GitHub
parent b05d5a5228
commit 074c331cdf
5 changed files with 573 additions and 0 deletions

View File

@@ -25,6 +25,7 @@ import (
"path/filepath"
"ragflow/internal/common"
"ragflow/internal/entity"
"ragflow/internal/utility"
"strconv"
"strings"
"time"
@@ -55,6 +56,9 @@ type documentServiceIface interface {
DeleteDocumentMetadata(docID string, keys []string) error
DeleteDocumentAllMetadata(docID string) error
GetDocumentMetadataByID(docID string) (map[string]interface{}, error)
GetDocumentArtifact(filename string) (*service.ArtifactResponse, error)
GetDocumentPreview(docID string) (*service.DocumentPreview, error)
DownloadDocument(datasetID, docID string) (*service.DownloadDocumentResp, error)
}
// DocumentHandler document handler
@@ -198,6 +202,68 @@ func (h *DocumentHandler) GetDocumentImage(c *gin.Context) {
c.Data(http.StatusOK, contentType, data)
}
func (h *DocumentHandler) GetDocumentArtifact(c *gin.Context) {
filename := c.Param("filename")
artifact, err := h.documentService.GetDocumentArtifact(filename)
if err != nil {
switch {
case errors.Is(err, service.ErrArtifactInvalidFilename),
errors.Is(err, service.ErrArtifactInvalidFileType),
errors.Is(err, service.ErrArtifactNotFound):
c.JSON(http.StatusOK, gin.H{
"code": common.CodeDataError,
"message": err.Error(),
})
default:
c.JSON(http.StatusOK, gin.H{
"code": common.CodeExceptionError,
"data": nil,
"message": err.Error(),
})
}
return
}
c.Header("Content-Type", artifact.ContentType)
if artifact.ForceAttachment {
c.Header("X-Content-Type-Options", "nosniff")
c.Header("Content-Disposition", "attachment")
} else {
c.Header("Content-Disposition", fmt.Sprintf(`inline; filename="%s"`, artifact.SafeFilename))
}
c.Data(http.StatusOK, artifact.ContentType, artifact.Data)
}
func (h *DocumentHandler) GetDocumentPreview(c *gin.Context) {
docID := c.Param("id")
if docID == "" {
jsonError(c, common.CodeParamError, "id is required")
return
}
preview, err := h.documentService.GetDocumentPreview(docID)
if err != nil {
c.JSON(http.StatusOK, gin.H{
"code": common.CodeDataError,
"message": "Document not found!",
})
return
}
ext := utility.GetFileExtension(preview.FileName)
if preview.ContentType != "" {
c.Header("Content-Type", preview.ContentType)
}
if utility.ShouldForceAttachment(ext, preview.ContentType) {
c.Header("X-Content-Type-Options", "nosniff")
c.Header("Content-Disposition", "attachment")
}
c.Data(http.StatusOK, preview.ContentType, preview.Data)
}
// UpdateDocument update document
// @Summary Update Document
// @Description Update document info
@@ -382,6 +448,40 @@ func (h *DocumentHandler) ListDocuments(c *gin.Context) {
})
}
func (h *DocumentHandler) DownloadDocument(c *gin.Context) {
datasetID := c.Param("dataset_id")
docID := c.Param("document_id")
if docID == "" {
c.JSON(http.StatusOK, gin.H{
"code": common.CodeDataError,
"message": "Specify document_id please.",
})
return
}
if datasetID == "" {
c.JSON(http.StatusOK, gin.H{
"code": common.CodeDataError,
"message": fmt.Sprintf("The dataset not own the document %s.", docID),
})
return
}
res, err := h.documentService.DownloadDocument(datasetID, docID)
if err != nil {
c.JSON(http.StatusOK, gin.H{
"code": common.CodeDataError,
"message": err.Error(),
})
return
}
c.Header("Content-Type", res.ContentType)
c.Header("Content-Disposition", fmt.Sprintf(`attachment; filename="%s"`, res.FileName))
c.Data(http.StatusOK, res.ContentType, res.Data)
}
func mapDocumentListItem(doc *entity.DocumentListItem, metaFields map[string]interface{}) map[string]interface{} {
item := map[string]interface{}{
"id": doc.ID,

View File

@@ -42,6 +42,40 @@ type fakeDocumentService struct {
stopErr error
}
func (f *fakeDocumentService) GetDocumentArtifact(filename string) (*service.ArtifactResponse, error) {
if filename == "error.txt" {
return nil, service.ErrArtifactNotFound
}
if filename == "unexpected.txt" {
return nil, fmt.Errorf("unexpected error")
}
return &service.ArtifactResponse{
Data: []byte("artifact content"),
ContentType: "text/plain",
SafeFilename: "safe.txt",
ForceAttachment: false,
}, nil
}
func (f *fakeDocumentService) GetDocumentPreview(docID string) (*service.DocumentPreview, error) {
if docID == "not-found" {
return nil, fmt.Errorf("not found")
}
return &service.DocumentPreview{
Data: []byte("preview content"),
ContentType: "text/plain",
FileName: "preview.txt",
}, nil
}
func (f *fakeDocumentService) DownloadDocument(datasetID, docID string) (*service.DownloadDocumentResp, error) {
if docID == "not-found" {
return nil, fmt.Errorf("not found")
}
return &service.DownloadDocumentResp{
Data: []byte("document data"),
ContentType: "application/pdf",
FileName: "doc.pdf",
}, nil
}
func (f *fakeDocumentService) CreateDocument(req *service.CreateDocumentRequest) (*entity.Document, error) {
return nil, nil
}
@@ -442,3 +476,146 @@ func TestStopParseDocumentsHandler_NotAccessible(t *testing.T) {
t.Fatal("expected error for no authorization")
}
}
func TestGetDocumentArtifact_Success(t *testing.T) {
gin.SetMode(gin.TestMode)
h := &DocumentHandler{
documentService: &fakeDocumentService{},
}
c, w := setupGinContextWithUser("GET", "/api/v1/documents/artifact/test.txt", "")
c.Params = gin.Params{{Key: "filename", Value: "test.txt"}}
h.GetDocumentArtifact(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d", w.Code)
}
if w.Header().Get("Content-Type") != "text/plain" {
t.Fatalf("unexpected content type: %s", w.Header().Get("Content-Type"))
}
if w.Body.String() != "artifact content" {
t.Fatalf("unexpected body: %s", w.Body.String())
}
}
func TestGetDocumentArtifact_NotFound(t *testing.T) {
gin.SetMode(gin.TestMode)
h := &DocumentHandler{
documentService: &fakeDocumentService{},
}
c, w := setupGinContextWithUser("GET", "/api/v1/documents/artifact/error.txt", "")
c.Params = gin.Params{{Key: "filename", Value: "error.txt"}}
h.GetDocumentArtifact(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d", w.Code)
}
var resp map[string]interface{}
json.Unmarshal(w.Body.Bytes(), &resp)
if resp["code"] != float64(common.CodeDataError) {
t.Fatalf("expected code %d, got %v", common.CodeDataError, resp["code"])
}
}
func TestGetDocumentArtifact_UnexpectedError(t *testing.T) {
gin.SetMode(gin.TestMode)
h := &DocumentHandler{
documentService: &fakeDocumentService{},
}
c, w := setupGinContextWithUser("GET", "/api/v1/documents/artifact/unexpected.txt", "")
c.Params = gin.Params{{Key: "filename", Value: "unexpected.txt"}}
h.GetDocumentArtifact(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d", w.Code)
}
var resp map[string]interface{}
json.Unmarshal(w.Body.Bytes(), &resp)
if resp["code"] != float64(common.CodeExceptionError) {
t.Fatalf("expected code %d, got %v", common.CodeExceptionError, resp["code"])
}
}
func TestGetDocumentPreview_Success(t *testing.T) {
gin.SetMode(gin.TestMode)
h := &DocumentHandler{
documentService: &fakeDocumentService{},
}
c, w := setupGinContextWithUser("GET", "/api/v1/documents/doc-1/preview", "")
c.Params = gin.Params{{Key: "id", Value: "doc-1"}}
h.GetDocumentPreview(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d", w.Code)
}
if w.Header().Get("Content-Type") != "text/plain" {
t.Fatalf("unexpected content type: %s", w.Header().Get("Content-Type"))
}
if w.Body.String() != "preview content" {
t.Fatalf("unexpected body: %s", w.Body.String())
}
}
func TestGetDocumentPreview_NotFound(t *testing.T) {
gin.SetMode(gin.TestMode)
h := &DocumentHandler{
documentService: &fakeDocumentService{},
}
c, w := setupGinContextWithUser("GET", "/api/v1/documents/not-found/preview", "")
c.Params = gin.Params{{Key: "id", Value: "not-found"}}
h.GetDocumentPreview(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d", w.Code)
}
var resp map[string]interface{}
json.Unmarshal(w.Body.Bytes(), &resp)
if resp["code"] != float64(common.CodeDataError) {
t.Fatalf("expected code %d, got %v", common.CodeDataError, resp["code"])
}
}
func TestDownloadDocument_Success(t *testing.T) {
gin.SetMode(gin.TestMode)
h := &DocumentHandler{
documentService: &fakeDocumentService{},
}
c, w := setupGinContextWithUser("GET", "/api/v1/datasets/ds-1/documents/doc-1", "")
c.Params = gin.Params{{Key: "dataset_id", Value: "ds-1"}, {Key: "document_id", Value: "doc-1"}}
h.DownloadDocument(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d", w.Code)
}
if w.Header().Get("Content-Type") != "application/pdf" {
t.Fatalf("unexpected content type: %s", w.Header().Get("Content-Type"))
}
if w.Body.String() != "document data" {
t.Fatalf("unexpected body: %s", w.Body.String())
}
}
func TestDownloadDocument_NotFound(t *testing.T) {
gin.SetMode(gin.TestMode)
h := &DocumentHandler{
documentService: &fakeDocumentService{},
}
c, w := setupGinContextWithUser("GET", "/api/v1/datasets/ds-1/documents/not-found", "")
c.Params = gin.Params{{Key: "dataset_id", Value: "ds-1"}, {Key: "document_id", Value: "not-found"}}
h.DownloadDocument(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d", w.Code)
}
var resp map[string]interface{}
json.Unmarshal(w.Body.Bytes(), &resp)
if resp["code"] != float64(common.CodeDataError) {
t.Fatalf("expected code %d, got %v", common.CodeDataError, resp["code"])
}
}

View File

@@ -207,6 +207,8 @@ func (r *Router) Setup(engine *gin.Engine) {
{
documents.POST("", r.documentHandler.CreateDocument)
documents.GET("", r.documentHandler.ListDocuments)
documents.GET("/artifact/:filename", r.documentHandler.GetDocumentArtifact)
documents.GET("/:id/preview", r.documentHandler.GetDocumentPreview)
documents.GET("/:id", r.documentHandler.GetDocumentByID)
documents.PUT("/:id", r.documentHandler.UpdateDocument)
documents.DELETE("/:id", r.documentHandler.DeleteDocument)
@@ -247,6 +249,7 @@ func (r *Router) Setup(engine *gin.Engine) {
// Dataset documents
datasets.GET("/:dataset_id/documents", r.documentHandler.ListDocuments)
datasets.GET("/:dataset_id/documents/:document_id", r.documentHandler.DownloadDocument)
datasets.DELETE("/:dataset_id/documents", r.documentHandler.DeleteDocuments)
// Dataset document chunk

View File

@@ -19,10 +19,14 @@ package service
import (
"context"
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
"ragflow/internal/common"
"ragflow/internal/entity"
"ragflow/internal/storage"
"ragflow/internal/utility"
"regexp"
"sort"
"strings"
@@ -117,6 +121,51 @@ type ThumbnailResponse struct {
KbID string `json:"kb_id"`
}
type ArtifactResponse struct {
Data []byte
ContentType string
SafeFilename string
ForceAttachment bool
}
var (
ErrArtifactInvalidFilename = errors.New("Invalid filename.")
ErrArtifactInvalidFileType = errors.New("Invalid file type.")
ErrArtifactNotFound = errors.New("Artifact not found.")
)
var artifactContentTypes = map[string]string{
".png": "image/png",
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".svg": "image/svg+xml",
".pdf": "application/pdf",
".csv": "text/csv",
".json": "application/json",
".html": "text/html",
}
var artifactForceAttachmentExtensions = map[string]struct{}{
".htm": {},
".html": {},
".shtml": {},
".xht": {},
".xhtml": {},
".xml": {},
".mhtml": {},
".svg": {},
}
var artifactForceAttachmentContentTypes = map[string]struct{}{
"text/html": {},
"image/svg+xml": {},
"application/xhtml+xml": {},
"text/xml": {},
"application/xml": {},
"multipart/related": {},
}
var artifactUnsafeFilenameChars = regexp.MustCompile(`[^\pL\pN_.-]`)
// GetDocumentImage retrieves an image object from storage.
func (s *DocumentService) GetDocumentImage(imageID string) ([]byte, error) {
parts := strings.Split(imageID, "-")
@@ -132,6 +181,186 @@ func (s *DocumentService) GetDocumentImage(imageID string) ([]byte, error) {
return storageImpl.Get(parts[0], parts[1])
}
// GetDocumentArtifact retrieves a sandbox artifact from object storage.
func (s *DocumentService) GetDocumentArtifact(filename string) (*ArtifactResponse, error) {
basename := filepath.Base(filename)
if basename != filename || strings.Contains(filename, "/") || strings.Contains(filename, "\\") {
return nil, ErrArtifactInvalidFilename
}
ext := strings.ToLower(filepath.Ext(basename))
contentType, ok := artifactContentTypes[ext]
if !ok {
return nil, ErrArtifactInvalidFileType
}
storageImpl := storage.GetStorageFactory().GetStorage()
if storageImpl == nil {
return nil, fmt.Errorf("storage not initialized")
}
bucket := sandboxArtifactBucket()
if !storageImpl.ObjExist(bucket, basename) {
return nil, ErrArtifactNotFound
}
data, err := storageImpl.Get(bucket, basename)
if err != nil {
return nil, err
}
if len(data) == 0 {
return nil, ErrArtifactNotFound
}
return &ArtifactResponse{
Data: data,
ContentType: contentType,
SafeFilename: sanitizeArtifactFilename(basename),
ForceAttachment: shouldForceArtifactAttachment(ext, contentType),
}, nil
}
func sandboxArtifactBucket() string {
if bucket := os.Getenv("SANDBOX_ARTIFACT_BUCKET"); bucket != "" {
return bucket
}
return "sandbox-artifacts"
}
func sanitizeArtifactFilename(filename string) string {
return artifactUnsafeFilenameChars.ReplaceAllString(filename, "_")
}
func shouldForceArtifactAttachment(ext, contentType string) bool {
if _, ok := artifactForceAttachmentExtensions[strings.ToLower(ext)]; ok {
return true
}
_, ok := artifactForceAttachmentContentTypes[strings.ToLower(contentType)]
return ok
}
type DocumentPreview struct {
Data []byte
ContentType string
FileName string
}
func (s *DocumentService) GetDocumentPreview(docID string) (*DocumentPreview, error) {
doc, err := s.documentDAO.GetByID(docID)
if err != nil {
return nil, err
}
bucket, name, err := s.GetDocumentStorageAddress(doc)
if err != nil {
return nil, err
}
storageImpl := storage.GetStorageFactory().GetStorage()
if storageImpl == nil {
return nil, fmt.Errorf("storage not initialized")
}
data, err := storageImpl.Get(bucket, name)
if err != nil {
return nil, err
}
if len(data) == 0 {
return nil, ErrArtifactNotFound
}
fileName := ""
if doc.Name != nil {
fileName = *doc.Name
}
ext := utility.GetFileExtension(fileName)
contentType := utility.GetContentType(ext, doc.Type)
return &DocumentPreview{
Data: data,
ContentType: contentType,
FileName: fileName,
}, nil
}
func (s *DocumentService) GetDocumentStorageAddress(doc *entity.Document) (string, string, error) {
if doc == nil {
return "", "", fmt.Errorf("document is nil")
}
file2DocumentDAO := dao.NewFile2DocumentDAO()
fileDAO := dao.NewFileDAO()
mappings, err := file2DocumentDAO.GetByDocumentID(doc.ID)
if err != nil {
return "", "", err
}
if len(mappings) > 0 && mappings[0].FileID != nil {
file, err := fileDAO.GetByID(*mappings[0].FileID)
if err != nil {
return "", "", err
}
if file.SourceType == "" || entity.FileSource(file.SourceType) == entity.FileSourceLocal {
if file.Location == nil || *file.Location == "" {
return "", "", fmt.Errorf("file location is empty")
}
return file.ParentID, *file.Location, nil
}
}
if doc.Location == nil || *doc.Location == "" {
return "", "", fmt.Errorf("document location is empty")
}
return doc.KbID, *doc.Location, nil
}
type DownloadDocumentResp struct {
Data []byte
FileName string
ContentType string
}
func (s *DocumentService) DownloadDocument(datasetID, docID string) (*DownloadDocumentResp, error) {
if docID == "" {
return nil, fmt.Errorf("Specify document_id please.")
}
doc, err := s.documentDAO.GetByID(docID)
if err != nil || doc.KbID != datasetID {
return nil, fmt.Errorf("The dataset not own the document %s.", docID)
}
bucket, name, err := s.GetDocumentStorageAddress(doc)
if err != nil {
return nil, err
}
storageImpl := storage.GetStorageFactory().GetStorage()
if storageImpl == nil {
return nil, fmt.Errorf("storage not initialized")
}
data, err := storageImpl.Get(bucket, name)
if err != nil {
return nil, err
}
if len(data) == 0 {
return nil, fmt.Errorf("This file is empty.")
}
fileName := ""
if doc.Name != nil {
fileName = *doc.Name
}
return &DownloadDocumentResp{
Data: data,
FileName: fileName,
ContentType: "application/octet-stream",
}, nil
}
// CreateDocument create document
func (s *DocumentService) CreateDocument(req *CreateDocumentRequest) (*entity.Document, error) {
document := &entity.Document{

View File

@@ -855,3 +855,67 @@ func TestCleanupFileReferences_SharedFileSurvives(t *testing.T) {
t.Fatalf("expected 1 f2d for doc-2, got %d", len(mappings))
}
}
func TestArtifactHelpers(t *testing.T) {
// Test sanitizeArtifactFilename
safe := sanitizeArtifactFilename("test@#file.txt")
if safe != "test__file.txt" {
t.Errorf("expected test__file.txt, got %s", safe)
}
// Test shouldForceArtifactAttachment
if !shouldForceArtifactAttachment(".html", "text/html") {
t.Error("expected true for .html")
}
if shouldForceArtifactAttachment(".txt", "text/plain") {
t.Error("expected false for .txt")
}
}
func TestGetDocumentArtifact_InvalidFilename(t *testing.T) {
svc := testDocumentService(t)
_, err := svc.GetDocumentArtifact("../test.txt")
if err != ErrArtifactInvalidFilename {
t.Errorf("expected ErrArtifactInvalidFilename, got %v", err)
}
}
func TestGetDocumentArtifact_InvalidFileType(t *testing.T) {
svc := testDocumentService(t)
_, err := svc.GetDocumentArtifact("test.exe")
if err != ErrArtifactInvalidFileType {
t.Errorf("expected ErrArtifactInvalidFileType, got %v", err)
}
}
func TestGetDocumentPreview_DocumentNotFound(t *testing.T) {
db := setupServiceTestDB(t)
pushServiceDB(t, db)
svc := testDocumentService(t)
_, err := svc.GetDocumentPreview("nonexistent")
if err == nil {
t.Error("expected error for nonexistent document")
}
}
func TestDownloadDocument_MissingDocID(t *testing.T) {
svc := testDocumentService(t)
_, err := svc.DownloadDocument("ds-1", "")
if err == nil {
t.Error("expected error for missing docID")
}
}
func TestDownloadDocument_WrongDataset(t *testing.T) {
db := setupServiceTestDB(t)
pushServiceDB(t, db)
insertTestKB(t, "kb-1", "tenant-1", 1, 5, 2)
insertTestDoc(t, "doc-1", "kb-1", 5, 2)
svc := testDocumentService(t)
_, err := svc.DownloadDocument("wrong-ds", "doc-1")
if err == nil {
t.Error("expected error for wrong dataset")
}
}