Files
ragflow/internal/service/file_test.go
Hz_ e35860ad74 feat(go-api): Align document metadata batch APIs and upload_info with Python (#16269)
## Summary

  Align the Go implementations of these APIs with the Python behavior:

  - `POST /api/v1/datasets/:dataset_id/metadata/update`
  - `PATCH /api/v1/datasets/:dataset_id/documents/metadatas`
  - `POST /api/v1/documents/upload`

  ## What changed

  - Added the Go routes and handlers for the 3 APIs.
  - Aligned batch document metadata updates with Python semantics:
    - support `match` in update items
    - support list append / replace behavior
    - support deleting specific list values
    - remove metadata entirely when it becomes empty
- create metadata for documents that previously had none when updates
apply
    - count `updated` only when a document actually changes
- Aligned `documents/upload` file uploads with Python-style
`upload_info` behavior:
    - store upload-info blobs in the per-user downloads bucket
- return lightweight upload descriptors instead of normal
file-management responses
  - Improved URL upload behavior:
    - SSRF-guarded fetch with redirect validation
    - redirect limit aligned to Python behavior
    - normalize filename and MIME type
    - add `.pdf` when the fetched content is PDF
- normalize HTML content into readable text instead of storing raw HTML
shells

  ## Validation

  ### Unit tests

  Passed:

  - `go test ./internal/service`
  - `go test ./internal/handler`

  Also verified targeted cases for:

  - batch metadata update semantics
  - upload_info URL handling
  - upload_info download bucket behavior

  ### curl checks

Verified the new Go endpoints with `curl` and compared the response
shape and behavior with Python for:

  - `POST /api/v1/datasets/{dataset_id}/metadata/update`
  - `PATCH /api/v1/datasets/{dataset_id}/documents/metadatas`
  - `POST /api/v1/documents/upload`

  The Go responses were checked against Python for:
  - argument validation
  - success response shape
  - metadata update results
  - upload_info result structure
  - file vs URL input handling
2026-06-24 14:52:47 +08:00

263 lines
7.1 KiB
Go

package service
import (
"bytes"
"errors"
"io"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
"ragflow/internal/storage"
)
// fakeStorage mocks storage.Storage for testing DownloadAgentFile.
type fakeStorage struct {
lastBucket string
lastFnm string
blob []byte
err error
exists bool
}
func (f *fakeStorage) Health() bool {
return true
}
func (f *fakeStorage) Put(bucket, fnm string, binary []byte, tenantID ...string) error {
f.lastBucket = bucket
f.lastFnm = fnm
f.blob = binary
f.exists = true
return f.err
}
func (f *fakeStorage) Get(bucket, fnm string, tenantID ...string) ([]byte, error) {
f.lastBucket = bucket
f.lastFnm = fnm
return f.blob, f.err
}
func (f *fakeStorage) Remove(bucket, fnm string, tenantID ...string) error {
panic("not implemented in fakeStorage")
}
func (f *fakeStorage) ObjExist(bucket, fnm string, tenantID ...string) bool {
return f.exists && f.lastBucket == bucket && f.lastFnm == fnm
}
func (f *fakeStorage) GetPresignedURL(bucket, fnm string, expires time.Duration, tenantID ...string) (string, error) {
panic("not implemented in fakeStorage")
}
func (f *fakeStorage) BucketExists(bucket string) bool {
panic("not implemented in fakeStorage")
}
func (f *fakeStorage) RemoveBucket(bucket string) error {
panic("not implemented in fakeStorage")
}
func (f *fakeStorage) Copy(srcBucket, srcPath, destBucket, destPath string) bool {
panic("not implemented in fakeStorage")
}
func (f *fakeStorage) Move(srcBucket, srcPath, destBucket, destPath string) bool {
panic("not implemented in fakeStorage")
}
func TestFileService_DownloadAgentFile_Success(t *testing.T) {
// Setup mock storage
expectedBlob := []byte("fake file content")
mockStorage := &fakeStorage{
blob: expectedBlob,
err: nil,
}
factory := storage.GetStorageFactory()
originalStorage := factory.GetStorage()
factory.SetStorage(mockStorage)
t.Cleanup(func() {
factory.SetStorage(originalStorage)
})
svc := NewFileService()
tenantID := "tenant123"
location := "file-abc.txt"
blob, err := svc.DownloadAgentFile(tenantID, location)
if err != nil {
t.Fatalf("expected no error, got %v", err)
}
if mockStorage.lastBucket != "tenant123-downloads" {
t.Errorf("expected bucket 'tenant123-downloads', got %q", mockStorage.lastBucket)
}
if mockStorage.lastFnm != location {
t.Errorf("expected fnm %q, got %q", location, mockStorage.lastFnm)
}
if !bytes.Equal(blob, expectedBlob) {
t.Errorf("expected blob %v, got %v", expectedBlob, blob)
}
}
func TestFileService_DownloadAgentFile_Error(t *testing.T) {
// Setup mock storage
expectedErr := errors.New("not found")
mockStorage := &fakeStorage{
blob: nil,
err: expectedErr,
}
factory := storage.GetStorageFactory()
originalStorage := factory.GetStorage()
factory.SetStorage(mockStorage)
t.Cleanup(func() {
factory.SetStorage(originalStorage)
})
svc := NewFileService()
tenantID := "tenant123"
location := "file-abc.txt"
blob, err := svc.DownloadAgentFile(tenantID, location)
if err == nil {
t.Fatalf("expected error, got nil")
}
if !errors.Is(err, expectedErr) {
t.Errorf("expected error %v, got %v", expectedErr, err)
}
if blob != nil {
t.Errorf("expected nil blob, got %v", blob)
}
}
func TestFileService_UploadFromURL_PDFAddsExtensionAndStoresToDownloads(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/pdf")
_, _ = w.Write([]byte("%PDF-1.7 fake pdf"))
}))
defer server.Close()
origAssert := assertURLSafe
origPinned := pinnedHTTPClient
assertURLSafe = func(rawURL string) (string, string, error) {
return "127.0.0.1", "127.0.0.1", nil
}
pinnedHTTPClient = func(hostname, resolvedIP string, timeout time.Duration) *http.Client {
return server.Client()
}
t.Cleanup(func() {
assertURLSafe = origAssert
pinnedHTTPClient = origPinned
})
mockStorage := &fakeStorage{}
factory := storage.GetStorageFactory()
originalStorage := factory.GetStorage()
factory.SetStorage(mockStorage)
t.Cleanup(func() { factory.SetStorage(originalStorage) })
svc := NewFileService()
resp, err := svc.UploadFromURL("tenant123", server.URL+"/report")
if err != nil {
t.Fatalf("UploadFromURL failed: %v", err)
}
if mockStorage.lastBucket != "tenant123-downloads" {
t.Fatalf("bucket = %q", mockStorage.lastBucket)
}
if resp["name"] != "report.pdf" {
t.Fatalf("name = %#v, want report.pdf", resp["name"])
}
if resp["mime_type"] != "application/pdf" {
t.Fatalf("mime_type = %#v", resp["mime_type"])
}
if resp["id"] != mockStorage.lastFnm {
t.Fatalf("id = %#v, stored key = %q", resp["id"], mockStorage.lastFnm)
}
}
func TestFileService_UploadFromURL_HTMLNormalizesReadableContent(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html; charset=utf-8")
_, _ = w.Write([]byte(`<html><head><script>bad()</script></head><body><div>Hello</div><p>World</p></body></html>`))
}))
defer server.Close()
origAssert := assertURLSafe
origPinned := pinnedHTTPClient
assertURLSafe = func(rawURL string) (string, string, error) {
return "127.0.0.1", "127.0.0.1", nil
}
pinnedHTTPClient = func(hostname, resolvedIP string, timeout time.Duration) *http.Client {
return server.Client()
}
t.Cleanup(func() {
assertURLSafe = origAssert
pinnedHTTPClient = origPinned
})
mockStorage := &fakeStorage{}
factory := storage.GetStorageFactory()
originalStorage := factory.GetStorage()
factory.SetStorage(mockStorage)
t.Cleanup(func() { factory.SetStorage(originalStorage) })
svc := NewFileService()
resp, err := svc.UploadFromURL("tenant123", server.URL+"/page")
if err != nil {
t.Fatalf("UploadFromURL failed: %v", err)
}
stored := string(mockStorage.blob)
if strings.Contains(strings.ToLower(stored), "<html") || strings.Contains(strings.ToLower(stored), "<script") {
t.Fatalf("stored html was not normalized: %q", stored)
}
if !strings.Contains(stored, "Hello") || !strings.Contains(stored, "World") {
t.Fatalf("stored normalized text missing content: %q", stored)
}
if resp["mime_type"] != "text/html" {
t.Fatalf("mime_type = %#v", resp["mime_type"])
}
}
func TestNormalizeUploadInfoContent_PDFTakesPrecedenceOverHTML(t *testing.T) {
filename, contentType, data := normalizeUploadInfoContent(
"report",
"text/html",
[]byte("%PDF-1.7 fake pdf"),
)
if filename != "report.pdf" {
t.Fatalf("filename = %q, want report.pdf", filename)
}
if contentType != "application/pdf" {
t.Fatalf("contentType = %q, want application/pdf", contentType)
}
if !bytes.Equal(data, []byte("%PDF-1.7 fake pdf")) {
t.Fatalf("pdf bytes were unexpectedly transformed: %q", string(data))
}
}
func TestReadUploadInfoData_RejectsOversizedInput(t *testing.T) {
reader := io.LimitReader(zeroReader{}, maxRemoteFileSize+1)
_, err := readUploadInfoData(reader)
if err == nil {
t.Fatal("expected oversized input to be rejected")
}
if !strings.Contains(err.Error(), "file size exceeds") {
t.Fatalf("err = %v, want size limit message", err)
}
}
type zeroReader struct{}
func (zeroReader) Read(p []byte) (int, error) {
for i := range p {
p[i] = 0
}
return len(p), nil
}