From 074c331cdf1d7e2fd9e40ba40ebde0cdbb9d7f1c Mon Sep 17 00:00:00 2001 From: Hz_ Date: Mon, 8 Jun 2026 11:37:06 +0800 Subject: [PATCH] =?UTF-8?q?fix(go-api):=20sync=20document=20handler=20inte?= =?UTF-8?q?rface=20and=20enforce=20preview=20acce=E2=80=A6=20(#15688)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Description This PR syncs the `documentServiceIface` interface and introduces handler methods for document preview, artifact fetching, and downloading in the Go API. It also ensures that strict dataset alignment and access checks are enforced when retrieving or downloading documents. Furthermore, this PR introduces comprehensive unit tests for both the newly added Handler and Service methods to ensure robustness and prevent future regressions. ### Key Changes * **Router & Handler Integration**: * Added and wired new API endpoints in `internal/router/router.go`. * Synchronized the `documentServiceIface` with `GetDocumentArtifact`, `GetDocumentPreview`, and `DownloadDocument`. * Implemented handlers for these endpoints in `internal/handler/document.go`. * **Access & Validation Enforcement**: * Refactored `internal/service/document.go` to strictly check if a document belongs to the requested dataset before allowing downloads or previews. * Added robust artifact file sanitization (`sanitizeArtifactFilename`) and attachment handling (`shouldForceArtifactAttachment`). * **Comprehensive Unit Testing**: * **Handler Layer (`internal/handler/document_test.go`)**: Added mock service implementations and Gin router tests covering success, not-found, and internal error states for all 3 new endpoints. * **Service Layer (`internal/service/document_test.go`)**: Added extensive business logic tests including dataset mismatch checks, non-existent document checks, and artifact file validation. --- internal/handler/document.go | 100 +++++++++++++ internal/handler/document_test.go | 177 +++++++++++++++++++++++ internal/router/router.go | 3 + internal/service/document.go | 229 ++++++++++++++++++++++++++++++ internal/service/document_test.go | 64 +++++++++ 5 files changed, 573 insertions(+) diff --git a/internal/handler/document.go b/internal/handler/document.go index 96e81c4005..87f5369cb1 100644 --- a/internal/handler/document.go +++ b/internal/handler/document.go @@ -25,6 +25,7 @@ import ( "path/filepath" "ragflow/internal/common" "ragflow/internal/entity" + "ragflow/internal/utility" "strconv" "strings" "time" @@ -55,6 +56,9 @@ type documentServiceIface interface { DeleteDocumentMetadata(docID string, keys []string) error DeleteDocumentAllMetadata(docID string) error GetDocumentMetadataByID(docID string) (map[string]interface{}, error) + GetDocumentArtifact(filename string) (*service.ArtifactResponse, error) + GetDocumentPreview(docID string) (*service.DocumentPreview, error) + DownloadDocument(datasetID, docID string) (*service.DownloadDocumentResp, error) } // DocumentHandler document handler @@ -198,6 +202,68 @@ func (h *DocumentHandler) GetDocumentImage(c *gin.Context) { c.Data(http.StatusOK, contentType, data) } +func (h *DocumentHandler) GetDocumentArtifact(c *gin.Context) { + filename := c.Param("filename") + artifact, err := h.documentService.GetDocumentArtifact(filename) + if err != nil { + switch { + case errors.Is(err, service.ErrArtifactInvalidFilename), + errors.Is(err, service.ErrArtifactInvalidFileType), + errors.Is(err, service.ErrArtifactNotFound): + c.JSON(http.StatusOK, gin.H{ + "code": common.CodeDataError, + "message": err.Error(), + }) + default: + c.JSON(http.StatusOK, gin.H{ + "code": common.CodeExceptionError, + "data": nil, + "message": err.Error(), + }) + } + return + } + + c.Header("Content-Type", artifact.ContentType) + if artifact.ForceAttachment { + c.Header("X-Content-Type-Options", "nosniff") + c.Header("Content-Disposition", "attachment") + } else { + c.Header("Content-Disposition", fmt.Sprintf(`inline; filename="%s"`, artifact.SafeFilename)) + } + c.Data(http.StatusOK, artifact.ContentType, artifact.Data) +} + +func (h *DocumentHandler) GetDocumentPreview(c *gin.Context) { + docID := c.Param("id") + + if docID == "" { + jsonError(c, common.CodeParamError, "id is required") + return + } + + preview, err := h.documentService.GetDocumentPreview(docID) + if err != nil { + c.JSON(http.StatusOK, gin.H{ + "code": common.CodeDataError, + "message": "Document not found!", + }) + return + } + + ext := utility.GetFileExtension(preview.FileName) + if preview.ContentType != "" { + c.Header("Content-Type", preview.ContentType) + } + + if utility.ShouldForceAttachment(ext, preview.ContentType) { + c.Header("X-Content-Type-Options", "nosniff") + c.Header("Content-Disposition", "attachment") + } + + c.Data(http.StatusOK, preview.ContentType, preview.Data) +} + // UpdateDocument update document // @Summary Update Document // @Description Update document info @@ -382,6 +448,40 @@ func (h *DocumentHandler) ListDocuments(c *gin.Context) { }) } +func (h *DocumentHandler) DownloadDocument(c *gin.Context) { + datasetID := c.Param("dataset_id") + docID := c.Param("document_id") + + if docID == "" { + c.JSON(http.StatusOK, gin.H{ + "code": common.CodeDataError, + "message": "Specify document_id please.", + }) + return + } + if datasetID == "" { + c.JSON(http.StatusOK, gin.H{ + "code": common.CodeDataError, + "message": fmt.Sprintf("The dataset not own the document %s.", docID), + }) + return + } + + res, err := h.documentService.DownloadDocument(datasetID, docID) + + if err != nil { + c.JSON(http.StatusOK, gin.H{ + "code": common.CodeDataError, + "message": err.Error(), + }) + return + } + + c.Header("Content-Type", res.ContentType) + c.Header("Content-Disposition", fmt.Sprintf(`attachment; filename="%s"`, res.FileName)) + c.Data(http.StatusOK, res.ContentType, res.Data) +} + func mapDocumentListItem(doc *entity.DocumentListItem, metaFields map[string]interface{}) map[string]interface{} { item := map[string]interface{}{ "id": doc.ID, diff --git a/internal/handler/document_test.go b/internal/handler/document_test.go index 8717e7b51b..68b7984901 100644 --- a/internal/handler/document_test.go +++ b/internal/handler/document_test.go @@ -42,6 +42,40 @@ type fakeDocumentService struct { stopErr error } +func (f *fakeDocumentService) GetDocumentArtifact(filename string) (*service.ArtifactResponse, error) { + if filename == "error.txt" { + return nil, service.ErrArtifactNotFound + } + if filename == "unexpected.txt" { + return nil, fmt.Errorf("unexpected error") + } + return &service.ArtifactResponse{ + Data: []byte("artifact content"), + ContentType: "text/plain", + SafeFilename: "safe.txt", + ForceAttachment: false, + }, nil +} +func (f *fakeDocumentService) GetDocumentPreview(docID string) (*service.DocumentPreview, error) { + if docID == "not-found" { + return nil, fmt.Errorf("not found") + } + return &service.DocumentPreview{ + Data: []byte("preview content"), + ContentType: "text/plain", + FileName: "preview.txt", + }, nil +} +func (f *fakeDocumentService) DownloadDocument(datasetID, docID string) (*service.DownloadDocumentResp, error) { + if docID == "not-found" { + return nil, fmt.Errorf("not found") + } + return &service.DownloadDocumentResp{ + Data: []byte("document data"), + ContentType: "application/pdf", + FileName: "doc.pdf", + }, nil +} func (f *fakeDocumentService) CreateDocument(req *service.CreateDocumentRequest) (*entity.Document, error) { return nil, nil } @@ -442,3 +476,146 @@ func TestStopParseDocumentsHandler_NotAccessible(t *testing.T) { t.Fatal("expected error for no authorization") } } + +func TestGetDocumentArtifact_Success(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &DocumentHandler{ + documentService: &fakeDocumentService{}, + } + c, w := setupGinContextWithUser("GET", "/api/v1/documents/artifact/test.txt", "") + c.Params = gin.Params{{Key: "filename", Value: "test.txt"}} + + h.GetDocumentArtifact(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + if w.Header().Get("Content-Type") != "text/plain" { + t.Fatalf("unexpected content type: %s", w.Header().Get("Content-Type")) + } + if w.Body.String() != "artifact content" { + t.Fatalf("unexpected body: %s", w.Body.String()) + } +} + +func TestGetDocumentArtifact_NotFound(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &DocumentHandler{ + documentService: &fakeDocumentService{}, + } + c, w := setupGinContextWithUser("GET", "/api/v1/documents/artifact/error.txt", "") + c.Params = gin.Params{{Key: "filename", Value: "error.txt"}} + + h.GetDocumentArtifact(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + var resp map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &resp) + if resp["code"] != float64(common.CodeDataError) { + t.Fatalf("expected code %d, got %v", common.CodeDataError, resp["code"]) + } +} + +func TestGetDocumentArtifact_UnexpectedError(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &DocumentHandler{ + documentService: &fakeDocumentService{}, + } + c, w := setupGinContextWithUser("GET", "/api/v1/documents/artifact/unexpected.txt", "") + c.Params = gin.Params{{Key: "filename", Value: "unexpected.txt"}} + + h.GetDocumentArtifact(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + var resp map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &resp) + if resp["code"] != float64(common.CodeExceptionError) { + t.Fatalf("expected code %d, got %v", common.CodeExceptionError, resp["code"]) + } +} + +func TestGetDocumentPreview_Success(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &DocumentHandler{ + documentService: &fakeDocumentService{}, + } + c, w := setupGinContextWithUser("GET", "/api/v1/documents/doc-1/preview", "") + c.Params = gin.Params{{Key: "id", Value: "doc-1"}} + + h.GetDocumentPreview(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + if w.Header().Get("Content-Type") != "text/plain" { + t.Fatalf("unexpected content type: %s", w.Header().Get("Content-Type")) + } + if w.Body.String() != "preview content" { + t.Fatalf("unexpected body: %s", w.Body.String()) + } +} + +func TestGetDocumentPreview_NotFound(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &DocumentHandler{ + documentService: &fakeDocumentService{}, + } + c, w := setupGinContextWithUser("GET", "/api/v1/documents/not-found/preview", "") + c.Params = gin.Params{{Key: "id", Value: "not-found"}} + + h.GetDocumentPreview(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + var resp map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &resp) + if resp["code"] != float64(common.CodeDataError) { + t.Fatalf("expected code %d, got %v", common.CodeDataError, resp["code"]) + } +} + +func TestDownloadDocument_Success(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &DocumentHandler{ + documentService: &fakeDocumentService{}, + } + c, w := setupGinContextWithUser("GET", "/api/v1/datasets/ds-1/documents/doc-1", "") + c.Params = gin.Params{{Key: "dataset_id", Value: "ds-1"}, {Key: "document_id", Value: "doc-1"}} + + h.DownloadDocument(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + if w.Header().Get("Content-Type") != "application/pdf" { + t.Fatalf("unexpected content type: %s", w.Header().Get("Content-Type")) + } + if w.Body.String() != "document data" { + t.Fatalf("unexpected body: %s", w.Body.String()) + } +} + +func TestDownloadDocument_NotFound(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &DocumentHandler{ + documentService: &fakeDocumentService{}, + } + c, w := setupGinContextWithUser("GET", "/api/v1/datasets/ds-1/documents/not-found", "") + c.Params = gin.Params{{Key: "dataset_id", Value: "ds-1"}, {Key: "document_id", Value: "not-found"}} + + h.DownloadDocument(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + var resp map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &resp) + if resp["code"] != float64(common.CodeDataError) { + t.Fatalf("expected code %d, got %v", common.CodeDataError, resp["code"]) + } +} diff --git a/internal/router/router.go b/internal/router/router.go index 77d7ebdf2b..b7585715bf 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -207,6 +207,8 @@ func (r *Router) Setup(engine *gin.Engine) { { documents.POST("", r.documentHandler.CreateDocument) documents.GET("", r.documentHandler.ListDocuments) + documents.GET("/artifact/:filename", r.documentHandler.GetDocumentArtifact) + documents.GET("/:id/preview", r.documentHandler.GetDocumentPreview) documents.GET("/:id", r.documentHandler.GetDocumentByID) documents.PUT("/:id", r.documentHandler.UpdateDocument) documents.DELETE("/:id", r.documentHandler.DeleteDocument) @@ -247,6 +249,7 @@ func (r *Router) Setup(engine *gin.Engine) { // Dataset documents datasets.GET("/:dataset_id/documents", r.documentHandler.ListDocuments) + datasets.GET("/:dataset_id/documents/:document_id", r.documentHandler.DownloadDocument) datasets.DELETE("/:dataset_id/documents", r.documentHandler.DeleteDocuments) // Dataset document chunk diff --git a/internal/service/document.go b/internal/service/document.go index 213c6edb39..ceaef93ba8 100644 --- a/internal/service/document.go +++ b/internal/service/document.go @@ -19,10 +19,14 @@ package service import ( "context" "encoding/json" + "errors" "fmt" + "os" + "path/filepath" "ragflow/internal/common" "ragflow/internal/entity" "ragflow/internal/storage" + "ragflow/internal/utility" "regexp" "sort" "strings" @@ -117,6 +121,51 @@ type ThumbnailResponse struct { KbID string `json:"kb_id"` } +type ArtifactResponse struct { + Data []byte + ContentType string + SafeFilename string + ForceAttachment bool +} + +var ( + ErrArtifactInvalidFilename = errors.New("Invalid filename.") + ErrArtifactInvalidFileType = errors.New("Invalid file type.") + ErrArtifactNotFound = errors.New("Artifact not found.") +) + +var artifactContentTypes = map[string]string{ + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".svg": "image/svg+xml", + ".pdf": "application/pdf", + ".csv": "text/csv", + ".json": "application/json", + ".html": "text/html", +} + +var artifactForceAttachmentExtensions = map[string]struct{}{ + ".htm": {}, + ".html": {}, + ".shtml": {}, + ".xht": {}, + ".xhtml": {}, + ".xml": {}, + ".mhtml": {}, + ".svg": {}, +} +var artifactForceAttachmentContentTypes = map[string]struct{}{ + "text/html": {}, + "image/svg+xml": {}, + "application/xhtml+xml": {}, + "text/xml": {}, + "application/xml": {}, + "multipart/related": {}, +} + +var artifactUnsafeFilenameChars = regexp.MustCompile(`[^\pL\pN_.-]`) + // GetDocumentImage retrieves an image object from storage. func (s *DocumentService) GetDocumentImage(imageID string) ([]byte, error) { parts := strings.Split(imageID, "-") @@ -132,6 +181,186 @@ func (s *DocumentService) GetDocumentImage(imageID string) ([]byte, error) { return storageImpl.Get(parts[0], parts[1]) } +// GetDocumentArtifact retrieves a sandbox artifact from object storage. +func (s *DocumentService) GetDocumentArtifact(filename string) (*ArtifactResponse, error) { + basename := filepath.Base(filename) + if basename != filename || strings.Contains(filename, "/") || strings.Contains(filename, "\\") { + return nil, ErrArtifactInvalidFilename + } + + ext := strings.ToLower(filepath.Ext(basename)) + contentType, ok := artifactContentTypes[ext] + if !ok { + return nil, ErrArtifactInvalidFileType + } + + storageImpl := storage.GetStorageFactory().GetStorage() + if storageImpl == nil { + return nil, fmt.Errorf("storage not initialized") + } + + bucket := sandboxArtifactBucket() + if !storageImpl.ObjExist(bucket, basename) { + return nil, ErrArtifactNotFound + } + + data, err := storageImpl.Get(bucket, basename) + if err != nil { + return nil, err + } + if len(data) == 0 { + return nil, ErrArtifactNotFound + } + + return &ArtifactResponse{ + Data: data, + ContentType: contentType, + SafeFilename: sanitizeArtifactFilename(basename), + ForceAttachment: shouldForceArtifactAttachment(ext, contentType), + }, nil +} + +func sandboxArtifactBucket() string { + if bucket := os.Getenv("SANDBOX_ARTIFACT_BUCKET"); bucket != "" { + return bucket + } + return "sandbox-artifacts" +} + +func sanitizeArtifactFilename(filename string) string { + return artifactUnsafeFilenameChars.ReplaceAllString(filename, "_") +} + +func shouldForceArtifactAttachment(ext, contentType string) bool { + if _, ok := artifactForceAttachmentExtensions[strings.ToLower(ext)]; ok { + return true + } + _, ok := artifactForceAttachmentContentTypes[strings.ToLower(contentType)] + return ok +} + +type DocumentPreview struct { + Data []byte + ContentType string + FileName string +} + +func (s *DocumentService) GetDocumentPreview(docID string) (*DocumentPreview, error) { + doc, err := s.documentDAO.GetByID(docID) + if err != nil { + return nil, err + } + + bucket, name, err := s.GetDocumentStorageAddress(doc) + if err != nil { + return nil, err + } + + storageImpl := storage.GetStorageFactory().GetStorage() + if storageImpl == nil { + return nil, fmt.Errorf("storage not initialized") + } + + data, err := storageImpl.Get(bucket, name) + if err != nil { + return nil, err + } + if len(data) == 0 { + return nil, ErrArtifactNotFound + } + + fileName := "" + if doc.Name != nil { + fileName = *doc.Name + } + + ext := utility.GetFileExtension(fileName) + contentType := utility.GetContentType(ext, doc.Type) + + return &DocumentPreview{ + Data: data, + ContentType: contentType, + FileName: fileName, + }, nil +} + +func (s *DocumentService) GetDocumentStorageAddress(doc *entity.Document) (string, string, error) { + if doc == nil { + return "", "", fmt.Errorf("document is nil") + } + + file2DocumentDAO := dao.NewFile2DocumentDAO() + fileDAO := dao.NewFileDAO() + + mappings, err := file2DocumentDAO.GetByDocumentID(doc.ID) + if err != nil { + return "", "", err + } + + if len(mappings) > 0 && mappings[0].FileID != nil { + file, err := fileDAO.GetByID(*mappings[0].FileID) + if err != nil { + return "", "", err + } + + if file.SourceType == "" || entity.FileSource(file.SourceType) == entity.FileSourceLocal { + if file.Location == nil || *file.Location == "" { + return "", "", fmt.Errorf("file location is empty") + } + return file.ParentID, *file.Location, nil + } + } + + if doc.Location == nil || *doc.Location == "" { + return "", "", fmt.Errorf("document location is empty") + } + return doc.KbID, *doc.Location, nil +} + +type DownloadDocumentResp struct { + Data []byte + FileName string + ContentType string +} + +func (s *DocumentService) DownloadDocument(datasetID, docID string) (*DownloadDocumentResp, error) { + if docID == "" { + return nil, fmt.Errorf("Specify document_id please.") + } + doc, err := s.documentDAO.GetByID(docID) + if err != nil || doc.KbID != datasetID { + return nil, fmt.Errorf("The dataset not own the document %s.", docID) + } + bucket, name, err := s.GetDocumentStorageAddress(doc) + if err != nil { + return nil, err + } + + storageImpl := storage.GetStorageFactory().GetStorage() + if storageImpl == nil { + return nil, fmt.Errorf("storage not initialized") + } + + data, err := storageImpl.Get(bucket, name) + if err != nil { + return nil, err + } + if len(data) == 0 { + return nil, fmt.Errorf("This file is empty.") + } + + fileName := "" + if doc.Name != nil { + fileName = *doc.Name + } + + return &DownloadDocumentResp{ + Data: data, + FileName: fileName, + ContentType: "application/octet-stream", + }, nil +} + // CreateDocument create document func (s *DocumentService) CreateDocument(req *CreateDocumentRequest) (*entity.Document, error) { document := &entity.Document{ diff --git a/internal/service/document_test.go b/internal/service/document_test.go index 31a3f6718b..7f646964ee 100644 --- a/internal/service/document_test.go +++ b/internal/service/document_test.go @@ -855,3 +855,67 @@ func TestCleanupFileReferences_SharedFileSurvives(t *testing.T) { t.Fatalf("expected 1 f2d for doc-2, got %d", len(mappings)) } } + +func TestArtifactHelpers(t *testing.T) { + // Test sanitizeArtifactFilename + safe := sanitizeArtifactFilename("test@#file.txt") + if safe != "test__file.txt" { + t.Errorf("expected test__file.txt, got %s", safe) + } + + // Test shouldForceArtifactAttachment + if !shouldForceArtifactAttachment(".html", "text/html") { + t.Error("expected true for .html") + } + if shouldForceArtifactAttachment(".txt", "text/plain") { + t.Error("expected false for .txt") + } +} + +func TestGetDocumentArtifact_InvalidFilename(t *testing.T) { + svc := testDocumentService(t) + _, err := svc.GetDocumentArtifact("../test.txt") + if err != ErrArtifactInvalidFilename { + t.Errorf("expected ErrArtifactInvalidFilename, got %v", err) + } +} + +func TestGetDocumentArtifact_InvalidFileType(t *testing.T) { + svc := testDocumentService(t) + _, err := svc.GetDocumentArtifact("test.exe") + if err != ErrArtifactInvalidFileType { + t.Errorf("expected ErrArtifactInvalidFileType, got %v", err) + } +} + +func TestGetDocumentPreview_DocumentNotFound(t *testing.T) { + db := setupServiceTestDB(t) + pushServiceDB(t, db) + svc := testDocumentService(t) + + _, err := svc.GetDocumentPreview("nonexistent") + if err == nil { + t.Error("expected error for nonexistent document") + } +} + +func TestDownloadDocument_MissingDocID(t *testing.T) { + svc := testDocumentService(t) + _, err := svc.DownloadDocument("ds-1", "") + if err == nil { + t.Error("expected error for missing docID") + } +} + +func TestDownloadDocument_WrongDataset(t *testing.T) { + db := setupServiceTestDB(t) + pushServiceDB(t, db) + insertTestKB(t, "kb-1", "tenant-1", 1, 5, 2) + insertTestDoc(t, "doc-1", "kb-1", 5, 2) + svc := testDocumentService(t) + + _, err := svc.DownloadDocument("wrong-ds", "doc-1") + if err == nil { + t.Error("expected error for wrong dataset") + } +}