ragflow/internal/service/document.go

//
//  Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
//  Licensed under the Apache License, Version 2.0 (the "License");
//  you may not use this file except in compliance with the License.
//  You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS,
//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//  See the License for the specific language governing permissions and
//  limitations under the License.
//

package service

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"ragflow/internal/common"
	"ragflow/internal/entity"
	"ragflow/internal/storage"
	"ragflow/internal/utility"
	"regexp"
	"sort"
	"strings"
	"time"

	"ragflow/internal/cache"
	"ragflow/internal/dao"
	"ragflow/internal/engine"

	"gorm.io/gorm"
	"ragflow/internal/server"
)

// DocumentService groups the collaborators used by the document operations in
// this file: DAOs for documents, knowledgebases, ingestion tasks (and their
// logs), tasks and file-to-document mappings, plus the document engine handle,
// the configured engine type and the metadata service.
//
// Instances are normally created with NewDocumentService, which populates
// every field. docEngine and metadataSvc are nil-checked in
// deleteDocEngineData, so at least that code path tolerates them being unset.
type DocumentService struct {
	documentDAO         *dao.DocumentDAO
	kbDAO               *dao.KnowledgebaseDAO
	ingestionTaskDAO    *dao.IngestionTaskDAO
	ingestionTaskLogDAO *dao.IngestionTaskLogDAO
	docEngine           engine.DocEngine
	engineType          server.EngineType
	metadataSvc         *MetadataService
	taskDAO             *dao.TaskDAO
	file2DocumentDAO    *dao.File2DocumentDAO
}

// NewDocumentService returns a DocumentService with every collaborator set:
// freshly constructed DAOs, a new MetadataService, the engine returned by
// engine.Get and the document-engine type read from the server configuration.
func NewDocumentService() *DocumentService {
	cfg := server.GetConfig()

	svc := new(DocumentService)
	svc.documentDAO = dao.NewDocumentDAO()
	svc.ingestionTaskDAO = dao.NewIngestionTaskDAO()
	svc.ingestionTaskLogDAO = dao.NewIngestionTaskLogDAO()
	svc.kbDAO = dao.NewKnowledgebaseDAO()
	svc.docEngine = engine.Get()
	svc.engineType = cfg.DocEngine.Type
	svc.metadataSvc = NewMetadataService()
	svc.taskDAO = dao.NewTaskDAO()
	svc.file2DocumentDAO = dao.NewFile2DocumentDAO()
	return svc
}

// CreateDocumentRequest carries the fields CreateDocument copies onto a new
// document row. Name, KbID, ParserID and CreatedBy are tagged
// binding:"required" (presumably enforced by the request-binding layer —
// confirm against the handler). Type is copied as-is and Source is stored as
// the document's SourceType.
type CreateDocumentRequest struct {
	Name      string `json:"name" binding:"required"`
	KbID      string `json:"kb_id" binding:"required"`
	ParserID  string `json:"parser_id" binding:"required"`
	CreatedBy string `json:"created_by" binding:"required"`
	Type      string `json:"type"`
	Source    string `json:"source"`
}

// UpdateDocumentRequest is a partial update for a document. Every field is a
// pointer: UpdateDocument only applies the fields that are non-nil and leaves
// the corresponding document columns untouched otherwise.
type UpdateDocumentRequest struct {
	Name        *string  `json:"name"`
	Run         *string  `json:"run"`
	TokenNum    *int64   `json:"token_num"`
	ChunkNum    *int64   `json:"chunk_num"`
	Progress    *float64 `json:"progress"`
	ProgressMsg *string  `json:"progress_msg"`
}

// DocumentResponse is the JSON view of a document produced by toResponse.
// Most fields are copied verbatim from entity.Document; CreatedAt and
// UpdatedAt are the document's create/update timestamps rendered with the
// layout "2006-01-02 15:04:05" (empty when the source timestamp is nil).
// Pointer fields tagged omitempty are omitted from the JSON when nil.
type DocumentResponse struct {
	ID              string  `json:"id"`
	Name            *string `json:"name,omitempty"`
	KbID            string  `json:"kb_id"`
	ParserID        string  `json:"parser_id"`
	PipelineID      *string `json:"pipeline_id,omitempty"`
	Type            string  `json:"type"`
	SourceType      string  `json:"source_type"`
	CreatedBy       string  `json:"created_by"`
	Location        *string `json:"location,omitempty"`
	Size            int64   `json:"size"`
	TokenNum        int64   `json:"token_num"`
	ChunkNum        int64   `json:"chunk_num"`
	Progress        float64 `json:"progress"`
	ProgressMsg     *string `json:"progress_msg,omitempty"`
	ProcessDuration float64 `json:"process_duration"`
	Suffix          string  `json:"suffix"`
	Run             *string `json:"run,omitempty"`
	Status          *string `json:"status,omitempty"`
	CreatedAt       string  `json:"created_at"`
	UpdatedAt       string  `json:"updated_at"`
}

// ThumbnailResponse is the result of GetThumbnail: the document's ID, its
// knowledgebase ID and its Thumbnail value (omitted from JSON when nil).
type ThumbnailResponse struct {
	ID        string  `json:"id"`
	Thumbnail *string `json:"thumbnail,omitempty"`
	KbID      string  `json:"kb_id"`
}

// ArtifactResponse is the result of GetDocumentArtifact.
//
// Data holds the raw object bytes, ContentType the MIME type looked up from
// the file extension, SafeFilename the name after sanitizeArtifactFilename,
// and ForceAttachment the verdict of shouldForceArtifactAttachment.
// NOTE(review): ForceAttachment is presumably used by the HTTP layer to pick
// "attachment" over "inline" disposition — confirm against the handler.
type ArtifactResponse struct {
	Data            []byte
	ContentType     string
	SafeFilename    string
	ForceAttachment bool
}

// Sentinel errors returned by the artifact lookup in this file.
//
// GetDocumentArtifact returns ErrArtifactInvalidFilename when the name is not
// a bare base name, ErrArtifactInvalidFileType when the extension is not in
// artifactContentTypes, and ErrArtifactNotFound when the object is missing or
// empty. GetDocumentPreview also returns ErrArtifactNotFound for empty data.
var (
	ErrArtifactInvalidFilename = errors.New("Invalid filename.")
	ErrArtifactInvalidFileType = errors.New("Invalid file type.")
	ErrArtifactNotFound        = errors.New("Artifact not found.")
)

// artifactContentTypes maps a lower-cased file extension (including the
// leading dot) to the content type reported for that artifact. It also acts
// as an allow-list: GetDocumentArtifact rejects any extension that has no
// entry here with ErrArtifactInvalidFileType.
var artifactContentTypes = map[string]string{
	".png":  "image/png",
	".jpg":  "image/jpeg",
	".jpeg": "image/jpeg",
	".svg":  "image/svg+xml",
	".pdf":  "application/pdf",
	".csv":  "text/csv",
	".json": "application/json",
	".html": "text/html",
}

// artifactForceAttachmentExtensions is the set of lower-cased file extensions
// for which shouldForceArtifactAttachment reports true regardless of the
// content type.
var artifactForceAttachmentExtensions = map[string]struct{}{
	".htm":   {},
	".html":  {},
	".shtml": {},
	".xht":   {},
	".xhtml": {},
	".xml":   {},
	".mhtml": {},
	".svg":   {},
}
// artifactForceAttachmentContentTypes is the set of lower-cased content types
// for which shouldForceArtifactAttachment reports true when the extension
// itself is not already in artifactForceAttachmentExtensions.
var artifactForceAttachmentContentTypes = map[string]struct{}{
	"text/html":             {},
	"image/svg+xml":         {},
	"application/xhtml+xml": {},
	"text/xml":              {},
	"application/xml":       {},
	"multipart/related":     {},
}

// artifactUnsafeFilenameChars matches any single character that is not a
// Unicode letter, a Unicode number, an underscore, a dot or a hyphen. It is
// compiled once at package scope and used by sanitizeArtifactFilename.
var artifactUnsafeFilenameChars = regexp.MustCompile(`[^\pL\pN_.-]`)

// GetDocumentImage fetches the bytes of an image object from storage.
//
// imageID must consist of exactly two non-empty segments joined by a single
// "-"; the first segment is passed to the storage backend as the bucket and
// the second as the object name. Any other shape yields "Image not found.".
// An error is also returned when no storage backend is available, and any
// error from the backend's Get is passed through unchanged.
func (s *DocumentService) GetDocumentImage(imageID string) ([]byte, error) {
	segments := strings.Split(imageID, "-")
	wellFormed := len(segments) == 2 && segments[0] != "" && segments[1] != ""
	if !wellFormed {
		return nil, fmt.Errorf("Image not found.")
	}
	bucket, objectName := segments[0], segments[1]

	store := storage.GetStorageFactory().GetStorage()
	if store == nil {
		return nil, fmt.Errorf("storage not initialized")
	}

	return store.Get(bucket, objectName)
}

// GetDocumentArtifact loads a sandbox artifact from object storage.
//
// filename must be a bare base name: anything that differs from its own
// filepath.Base or contains "/" or "\" is rejected with
// ErrArtifactInvalidFilename. The lower-cased extension must have an entry in
// artifactContentTypes, otherwise ErrArtifactInvalidFileType is returned.
// The object is read from the bucket named by sandboxArtifactBucket; a
// missing or zero-length object yields ErrArtifactNotFound, and storage
// errors are passed through. On success the response carries the data, the
// mapped content type, a sanitized filename and the force-attachment flag.
func (s *DocumentService) GetDocumentArtifact(filename string) (*ArtifactResponse, error) {
	name := filepath.Base(filename)
	// Reject anything that could address an object outside the flat bucket.
	if name != filename || strings.ContainsAny(filename, `/\`) {
		return nil, ErrArtifactInvalidFilename
	}

	extension := strings.ToLower(filepath.Ext(name))
	mimeType, allowed := artifactContentTypes[extension]
	if !allowed {
		return nil, ErrArtifactInvalidFileType
	}

	store := storage.GetStorageFactory().GetStorage()
	if store == nil {
		return nil, fmt.Errorf("storage not initialized")
	}

	bucket := sandboxArtifactBucket()
	if exists := store.ObjExist(bucket, name); !exists {
		return nil, ErrArtifactNotFound
	}

	payload, err := store.Get(bucket, name)
	switch {
	case err != nil:
		return nil, err
	case len(payload) == 0:
		// An empty object is treated the same as a missing one.
		return nil, ErrArtifactNotFound
	}

	artifact := &ArtifactResponse{
		Data:            payload,
		ContentType:     mimeType,
		SafeFilename:    sanitizeArtifactFilename(name),
		ForceAttachment: shouldForceArtifactAttachment(extension, mimeType),
	}
	return artifact, nil
}

// sandboxArtifactBucket returns the bucket that holds sandbox artifacts: the
// value of the SANDBOX_ARTIFACT_BUCKET environment variable when it is set to
// a non-empty string, otherwise "sandbox-artifacts".
func sandboxArtifactBucket() string {
	const fallback = "sandbox-artifacts"

	configured := os.Getenv("SANDBOX_ARTIFACT_BUCKET")
	if configured == "" {
		return fallback
	}
	return configured
}

// sanitizeArtifactFilename returns filename with every character matched by
// artifactUnsafeFilenameChars replaced by an underscore. Each offending
// character is replaced individually, so runs are not collapsed.
func sanitizeArtifactFilename(filename string) string {
	return artifactUnsafeFilenameChars.ReplaceAllString(filename, "_")
}

// shouldForceArtifactAttachment reports whether an artifact is flagged for
// forced attachment: true when the lower-cased extension is listed in
// artifactForceAttachmentExtensions, or failing that when the lower-cased
// content type is listed in artifactForceAttachmentContentTypes.
func shouldForceArtifactAttachment(ext, contentType string) bool {
	_, byExtension := artifactForceAttachmentExtensions[strings.ToLower(ext)]
	if byExtension {
		return true
	}
	_, byContentType := artifactForceAttachmentContentTypes[strings.ToLower(contentType)]
	return byContentType
}

// DocumentPreview is the result of GetDocumentPreview: the stored bytes of
// the document, the content type derived from the file extension and document
// type, and the document's name (empty when the document has no name).
type DocumentPreview struct {
	Data        []byte
	ContentType string
	FileName    string
}

// GetDocumentPreview loads the stored bytes of document docID together with
// the information needed to serve them.
//
// The storage location is resolved with GetDocumentStorageAddress. Lookup,
// address-resolution and storage errors are returned unchanged; a missing
// storage backend yields "storage not initialized" and zero-length data
// yields ErrArtifactNotFound. The content type is derived from the extension
// of the document name and the document type via the utility helpers.
func (s *DocumentService) GetDocumentPreview(docID string) (*DocumentPreview, error) {
	document, err := s.documentDAO.GetByID(docID)
	if err != nil {
		return nil, err
	}

	bucket, objectName, err := s.GetDocumentStorageAddress(document)
	if err != nil {
		return nil, err
	}

	store := storage.GetStorageFactory().GetStorage()
	if store == nil {
		return nil, fmt.Errorf("storage not initialized")
	}

	content, err := store.Get(bucket, objectName)
	if err != nil {
		return nil, err
	}
	if len(content) == 0 {
		return nil, ErrArtifactNotFound
	}

	// A document without a name is previewed with an empty file name.
	var displayName string
	if namePtr := document.Name; namePtr != nil {
		displayName = *namePtr
	}
	extension := utility.GetFileExtension(displayName)

	preview := &DocumentPreview{
		Data:        content,
		ContentType: utility.GetContentType(extension, document.Type),
		FileName:    displayName,
	}
	return preview, nil
}

// GetDocumentStorageAddress resolves where a document's bytes live in object
// storage and returns (bucket, object name, error).
//
// When the document has a file-to-document mapping whose first entry carries
// a file ID, and that file's source type is empty or local, the address is
// the file's ParentID and Location; an empty file location is an error.
// In every other case the address falls back to the document's KbID and
// Location; an empty document location is an error. A nil doc and any DAO
// failure are reported as errors as well.
func (s *DocumentService) GetDocumentStorageAddress(doc *entity.Document) (string, string, error) {
	if doc == nil {
		return "", "", fmt.Errorf("document is nil")
	}

	mappingDAO := dao.NewFile2DocumentDAO()
	fileDAO := dao.NewFileDAO()

	links, err := mappingDAO.GetByDocumentID(doc.ID)
	if err != nil {
		return "", "", err
	}

	// Only the first mapping is consulted.
	if len(links) != 0 {
		if fileID := links[0].FileID; fileID != nil {
			linkedFile, err := fileDAO.GetByID(*fileID)
			if err != nil {
				return "", "", err
			}

			isLocal := linkedFile.SourceType == "" ||
				entity.FileSource(linkedFile.SourceType) == entity.FileSourceLocal
			if isLocal {
				if loc := linkedFile.Location; loc != nil && *loc != "" {
					return linkedFile.ParentID, *loc, nil
				}
				return "", "", fmt.Errorf("file location is empty")
			}
		}
	}

	// Fallback: address the object through the document itself.
	if loc := doc.Location; loc != nil && *loc != "" {
		return doc.KbID, *loc, nil
	}
	return "", "", fmt.Errorf("document location is empty")
}

// DownloadDocumentResp is the result of DownloadDocument: the stored bytes,
// the document name (empty when the document has no name) and the content
// type, which DownloadDocument always sets to "application/octet-stream".
type DownloadDocumentResp struct {
	Data        []byte
	FileName    string
	ContentType string
}

// DownloadDocument returns the stored bytes of document docID, provided the
// document belongs to dataset datasetID.
//
// An empty docID is rejected. A failed lookup and a dataset mismatch are both
// reported with the same "not own the document" error. Address-resolution and
// storage errors are returned unchanged; a missing storage backend and
// zero-length data each produce their own error. The content type of the
// response is always "application/octet-stream".
func (s *DocumentService) DownloadDocument(datasetID, docID string) (*DownloadDocumentResp, error) {
	if docID == "" {
		return nil, fmt.Errorf("Specify document_id please.")
	}

	doc, err := s.documentDAO.GetByID(docID)
	// doc is only inspected when the lookup succeeded.
	ownedByDataset := err == nil && doc.KbID == datasetID
	if !ownedByDataset {
		return nil, fmt.Errorf("The dataset not own the document %s.", docID)
	}

	bucket, objectName, err := s.GetDocumentStorageAddress(doc)
	if err != nil {
		return nil, err
	}

	store := storage.GetStorageFactory().GetStorage()
	if store == nil {
		return nil, fmt.Errorf("storage not initialized")
	}

	content, err := store.Get(bucket, objectName)
	if err != nil {
		return nil, err
	}
	if len(content) == 0 {
		return nil, fmt.Errorf("This file is empty.")
	}

	var downloadName string
	if namePtr := doc.Name; namePtr != nil {
		downloadName = *namePtr
	}

	resp := &DownloadDocumentResp{
		Data:        content,
		FileName:    downloadName,
		ContentType: "application/octet-stream",
	}
	return resp, nil
}

// CreateDocument persists a new document built from req and returns it.
//
// The document's name points at req.Name, its suffix is fixed to ".doc" and
// its status starts as "0". A DAO failure is wrapped and returned.
func (s *DocumentService) CreateDocument(req *CreateDocumentRequest) (*entity.Document, error) {
	initialStatus := "0"

	document := new(entity.Document)
	document.Name = &req.Name
	document.KbID = req.KbID
	document.ParserID = req.ParserID
	document.CreatedBy = req.CreatedBy
	document.Type = req.Type
	document.SourceType = req.Source
	document.Suffix = ".doc"
	document.Status = &initialStatus

	err := s.documentDAO.Create(document)
	if err != nil {
		return nil, fmt.Errorf("failed to create document: %w", err)
	}
	return document, nil
}

// GetDocumentByID loads the document with the given id and converts it to a
// DocumentResponse. A lookup error is returned unchanged.
func (s *DocumentService) GetDocumentByID(id string) (*DocumentResponse, error) {
	doc, lookupErr := s.documentDAO.GetByID(id)
	if lookupErr != nil {
		return nil, lookupErr
	}
	resp := s.toResponse(doc)
	return resp, nil
}

// UpdateDocument applies a partial update to the document with the given id.
//
// The document is loaded first; only the non-nil fields of req overwrite the
// loaded values, and the result is saved through the document DAO. Lookup and
// save errors are returned unchanged.
func (s *DocumentService) UpdateDocument(id string, req *UpdateDocumentRequest) error {
	doc, lookupErr := s.documentDAO.GetByID(id)
	if lookupErr != nil {
		return lookupErr
	}

	// Pointer-typed columns take the request pointer as-is.
	if name := req.Name; name != nil {
		doc.Name = name
	}
	if run := req.Run; run != nil {
		doc.Run = run
	}
	// Value-typed columns take the pointed-to value.
	if tokens := req.TokenNum; tokens != nil {
		doc.TokenNum = *tokens
	}
	if chunks := req.ChunkNum; chunks != nil {
		doc.ChunkNum = *chunks
	}
	if progress := req.Progress; progress != nil {
		doc.Progress = *progress
	}
	if msg := req.ProgressMsg; msg != nil {
		doc.ProgressMsg = msg
	}

	return s.documentDAO.Update(doc)
}

// DeleteDocument deletes the document identified by id. It is a thin wrapper
// around deleteDocumentFull, which removes the document's tasks, engine data,
// database row (with KB counter updates) and file references, and it returns
// whatever error that call returns.
func (s *DocumentService) DeleteDocument(id string) error {
	return s.deleteDocumentFull(id)
}

// DeleteDocuments removes several documents from dataset datasetID on behalf
// of userID and returns how many were deleted successfully.
//
//	ids:       explicit document IDs to delete (ignored when deleteAll is set).
//	deleteAll: delete every document whose kb_id equals datasetID.
//
// The call fails up front when the dataset is not accessible to the user,
// when the ID query fails, or (for explicit ids) when validateDocsInDataset
// rejects the list. Failures of individual deletions are logged and skipped;
// they do not fail the call.
func (s *DocumentService) DeleteDocuments(ids []string, deleteAll bool, datasetID, userID string) (int, error) {
	// Access check comes first so nothing is queried for foreign datasets.
	if accessible := s.kbDAO.Accessible(datasetID, userID); !accessible {
		return 0, fmt.Errorf("You don't own the dataset %s.", datasetID)
	}

	// With deleteAll the caller-supplied ids are replaced by the dataset's IDs.
	if deleteAll {
		query := dao.DB.Model(&entity.Document{}).Where("kb_id = ?", datasetID)
		if err := query.Pluck("id", &ids).Error; err != nil {
			return 0, fmt.Errorf("failed to query documents: %w", err)
		}
	}
	if len(ids) == 0 {
		return 0, nil
	}

	// Deduplicate before validating: validation compares result counts.
	ids = common.Deduplicate(ids)

	// IDs obtained via deleteAll are already scoped to the dataset.
	if !deleteAll {
		_, err := s.validateDocsInDataset(ids, datasetID)
		if err != nil {
			return 0, err
		}
	}

	var removed int
	for i := range ids {
		err := s.deleteDocumentFull(ids[i])
		if err == nil {
			removed++
			continue
		}
		common.Logger.Warn(fmt.Sprintf("DeleteDocuments: failed to delete %s: %v", ids[i], err))
	}

	return removed, nil
}

// deleteDocumentFull removes a document and everything attached to it, in
// this order: its task rows, its document-engine data, the document row
// (together with the KB counter update) and finally its file references.
//
// Failing to resolve the document or its knowledgebase, or failing to delete
// the document row, aborts with an error. A task-deletion failure is only
// logged; the engine and file-reference steps do not report errors.
func (s *DocumentService) deleteDocumentFull(docID string) error {
	doc, kb, resolveErr := s.resolveDocAndKB(docID)
	if resolveErr != nil {
		return resolveErr
	}

	// Task cleanup is best-effort.
	_, taskErr := s.taskDAO.DeleteByDocIDs([]string{docID})
	if taskErr != nil {
		common.Logger.Warn(fmt.Sprintf("failed to delete tasks for %s: %v", docID, taskErr))
	}

	s.deleteDocEngineData(docID, kb.TenantID, doc.KbID)

	if recordErr := s.deleteDocRecordWithCounters(doc, kb.ID); recordErr != nil {
		return recordErr
	}

	s.cleanupFileReferences(docID)
	return nil
}

// RemoveDocumentKeepFile deletes a document's tasks, its document-engine data
// and the document row (updating the KB counters doc_num/chunk_num/token_num)
// but does NOT touch the underlying file record, its storage blob or the
// file2document mappings; cleaning up those mappings is left to the caller.
// Mirrors Python DocumentService.remove_document.
//
// Resolution and row-deletion errors are returned; a task-deletion failure is
// only logged.
func (s *DocumentService) RemoveDocumentKeepFile(docID string) error {
	doc, kb, resolveErr := s.resolveDocAndKB(docID)
	if resolveErr != nil {
		return resolveErr
	}

	_, taskErr := s.taskDAO.DeleteByDocIDs([]string{docID})
	if taskErr != nil {
		common.Logger.Warn(fmt.Sprintf("RemoveDocumentKeepFile: failed to delete tasks for %s: %v", docID, taskErr))
	}

	s.deleteDocEngineData(docID, kb.TenantID, doc.KbID)

	return s.deleteDocRecordWithCounters(doc, kb.ID)
}

// InsertDocument inserts doc and bumps doc_num on the knowledgebase named by
// doc.KbID, both inside one transaction. Mirrors Python
// DocumentService.insert. The document's ID and timestamps are expected to be
// populated by the caller / model hooks before insertion.
//
// The transaction is rolled back (an error is returned from the callback)
// when the insert fails, when the counter update fails, or when the counter
// update touches zero rows, i.e. the knowledgebase does not exist.
func (s *DocumentService) InsertDocument(doc *entity.Document) error {
	insertAndCount := func(tx *gorm.DB) error {
		if createErr := tx.Create(doc).Error; createErr != nil {
			return fmt.Errorf("failed to create document: %w", createErr)
		}

		// documents.kb_id has no DB-level FK, so the insert can succeed for a
		// KB that does not exist. The counter update would then report no
		// error while touching 0 rows, leaving doc_num out of sync; checking
		// RowsAffected turns that into a rollback.
		bump := tx.Model(&entity.Knowledgebase{}).
			Where("id = ?", doc.KbID).
			Update("doc_num", gorm.Expr("doc_num + 1"))

		switch {
		case bump.Error != nil:
			return fmt.Errorf("failed to increment doc_num for KB %s: %w", doc.KbID, bump.Error)
		case bump.RowsAffected == 0:
			return fmt.Errorf("knowledgebase %s not found", doc.KbID)
		}
		return nil
	}

	return dao.DB.Transaction(insertAndCount)
}

// resolveDocAndKB fetches the document docID and then the knowledgebase its
// KbID points to. Either lookup failing yields a wrapped error and nil for
// both results.
func (s *DocumentService) resolveDocAndKB(docID string) (*entity.Document, *entity.Knowledgebase, error) {
	doc, docErr := s.documentDAO.GetByID(docID)
	if docErr != nil {
		return nil, nil, fmt.Errorf("document not found: %w", docErr)
	}

	kb, kbErr := s.kbDAO.GetByID(doc.KbID)
	if kbErr != nil {
		return nil, nil, fmt.Errorf("knowledgebase not found: %w", kbErr)
	}

	return doc, kb, nil
}

// deleteDocEngineData removes a document's chunks, and then its metadata,
// from the document engine. It does nothing when no engine is configured.
//
// Chunks are deleted by doc_id from the index "ragflow_<tenantID>" within
// kbID; a failure is logged and does not stop the metadata step. Metadata is
// only deleted when a metadata service is present, and the error of that call
// is deliberately dropped.
func (s *DocumentService) deleteDocEngineData(docID, tenantID, kbID string) {
	docEngine := s.docEngine
	if docEngine == nil {
		return
	}

	ctx := context.Background()
	index := "ragflow_" + tenantID
	filter := map[string]interface{}{"doc_id": docID}

	_, chunkErr := docEngine.DeleteChunks(ctx, filter, index, kbID)
	if chunkErr != nil {
		common.Logger.Warn(fmt.Sprintf("deleteDocEngineData: failed to delete chunks for %s: %v", docID, chunkErr))
	}

	if s.metadataSvc == nil {
		return
	}
	_ = s.DeleteDocumentAllMetadata(docID) // logs internally
}

// deleteDocRecordWithCounters deletes the document row and, within the same
// transaction, subtracts the document's contribution from the counters of
// knowledgebase kbID (doc_num by 1, chunk_num and token_num by the document's
// own values).
//
// The counters are only touched when the delete actually removed a row, so a
// retry or a concurrent delete cannot decrement twice. A failed delete
// returns an error; a failed counter update is only logged and the callback
// still returns nil.
func (s *DocumentService) deleteDocRecordWithCounters(doc *entity.Document, kbID string) error {
	return dao.DB.Transaction(func(tx *gorm.DB) error {
		removal := tx.Where("id = ?", doc.ID).Delete(&entity.Document{})
		if err := removal.Error; err != nil {
			return fmt.Errorf("failed to delete document %s: %w", doc.ID, err)
		}
		if removal.RowsAffected == 0 {
			// Somebody else already removed the row: leave the counters alone.
			return nil
		}

		counterDeltas := map[string]interface{}{
			"doc_num":   gorm.Expr("doc_num - 1"),
			"chunk_num": gorm.Expr("chunk_num - ?", doc.ChunkNum),
			"token_num": gorm.Expr("token_num - ?", doc.TokenNum),
		}
		err := tx.Model(&entity.Knowledgebase{}).
			Where("id = ?", kbID).
			Updates(counterDeltas).Error
		if err != nil {
			common.Logger.Warn(fmt.Sprintf("deleteDocRecordWithCounters: failed to decrement KB %s: %v", kbID, err))
		}
		return nil
	})
}

// cleanupFileReferences drops the file2document mappings of docID and then,
// for every file those mappings pointed at, deletes the file record and its
// storage blob, but only when no other mapping still references that file.
//
// Every step is best-effort: failures are logged and the function moves on.
// The blob is removed only after the file record was deleted successfully,
// and only when the file has a non-empty location and a storage backend is
// available.
func (s *DocumentService) cleanupFileReferences(docID string) {
	links, err := s.file2DocumentDAO.GetByDocumentID(docID)
	if err != nil {
		common.Logger.Warn(fmt.Sprintf("cleanupFileReferences: failed to get f2d mappings for %s: %v", docID, err))
	}
	if len(links) == 0 {
		return
	}

	// Gather the distinct file IDs in first-seen order.
	visited := make(map[string]struct{}, len(links))
	fileIDs := make([]string, 0, len(links))
	for _, link := range links {
		if link.FileID == nil {
			continue
		}
		id := *link.FileID
		if _, dup := visited[id]; dup {
			continue
		}
		visited[id] = struct{}{}
		fileIDs = append(fileIDs, id)
	}

	// Remove this document's mappings before counting what is left per file.
	if err := s.file2DocumentDAO.DeleteByDocumentID(docID); err != nil {
		common.Logger.Warn(fmt.Sprintf("cleanupFileReferences: failed to delete f2d for %s: %v", docID, err))
	}

	for _, fileID := range fileIDs {
		stillLinked, err := s.file2DocumentDAO.GetByFileID(fileID)
		if err != nil {
			common.Logger.Warn(fmt.Sprintf("cleanupFileReferences: failed to check remaining f2d for %s: %v", fileID, err))
			continue
		}
		if len(stillLinked) != 0 {
			// Another document still uses this file.
			continue
		}

		fileDAO := dao.NewFileDAO()
		file, err := fileDAO.GetByID(fileID)
		if err != nil || file == nil {
			common.Logger.Warn(fmt.Sprintf("cleanupFileReferences: file not found %s: %v", fileID, err))
			continue
		}

		if _, err := fileDAO.DeleteByIDs([]string{fileID}); err != nil {
			common.Logger.Warn(fmt.Sprintf("cleanupFileReferences: failed to delete file %s: %v", fileID, err))
			continue // keep the blob so the live file row still has its object
		}

		if file.Location == nil || *file.Location == "" {
			continue
		}
		store := storage.GetStorageFactory().GetStorage()
		if store == nil {
			continue
		}
		if err := store.Remove(file.ParentID, *file.Location); err != nil {
			common.Logger.Warn(fmt.Sprintf("cleanupFileReferences: failed to remove blob %s/%s: %v", file.ParentID, *file.Location, err))
		}
	}
}

// ListDocuments returns one page of documents as DocumentResponse values
// together with the total reported by the DAO. page is 1-based: the offset
// passed to the DAO is (page-1)*pageSize. A DAO error is returned unchanged.
func (s *DocumentService) ListDocuments(page, pageSize int) ([]*DocumentResponse, int64, error) {
	skip := pageSize * (page - 1)

	docs, total, err := s.documentDAO.List(skip, pageSize)
	if err != nil {
		return nil, 0, err
	}

	out := make([]*DocumentResponse, 0, len(docs))
	for _, d := range docs {
		out = append(out, s.toResponse(d))
	}
	return out, total, nil
}

// GetThumbnail loads document docID and returns its ID, knowledgebase ID and
// thumbnail value. A lookup error is returned unchanged.
func (s *DocumentService) GetThumbnail(docID string) (*ThumbnailResponse, error) {
	doc, lookupErr := s.documentDAO.GetByID(docID)
	if lookupErr != nil {
		return nil, lookupErr
	}

	return &ThumbnailResponse{
		ID:        doc.ID,
		Thumbnail: doc.Thumbnail,
		KbID:      doc.KbID,
	}, nil
}

// ListDocumentsByDatasetID returns one page of the documents of knowledgebase
// kbID plus the total reported by the DAO. page is 1-based: the offset passed
// to the DAO is (page-1)*pageSize. The items are handed back in a freshly
// allocated slice; a DAO error is returned unchanged.
func (s *DocumentService) ListDocumentsByDatasetID(kbID string, page, pageSize int) ([]*entity.DocumentListItem, int64, error) {
	skip := pageSize * (page - 1)

	items, total, err := s.documentDAO.ListByKBID(kbID, skip, pageSize)
	if err != nil {
		return nil, 0, err
	}

	out := make([]*entity.DocumentListItem, len(items))
	for idx := range items {
		out[idx] = items[idx]
	}
	return out, total, nil
}

// GetDocumentsByAuthorID returns one page of the documents of an author as
// DocumentResponse values plus the total reported by the DAO. The numeric
// authorID is passed to the DAO in its decimal string form. page is 1-based:
// the offset is (page-1)*pageSize. A DAO error is returned unchanged.
func (s *DocumentService) GetDocumentsByAuthorID(authorID, page, pageSize int) ([]*DocumentResponse, int64, error) {
	skip := pageSize * (page - 1)
	author := fmt.Sprint(authorID)

	docs, total, err := s.documentDAO.GetByAuthorID(author, skip, pageSize)
	if err != nil {
		return nil, 0, err
	}

	out := make([]*DocumentResponse, 0, len(docs))
	for _, d := range docs {
		out = append(out, s.toResponse(d))
	}
	return out, total, nil
}

// ListIngestionTasks returns one page of the ingestion tasks of userID. When
// datasetID is non-nil the listing is additionally restricted to that
// dataset. page is 1-based: the offset is (page-1)*pageSize. On a DAO error
// the task slice is discarded and only the error is returned.
func (s *DocumentService) ListIngestionTasks(userID string, datasetID *string, page, pageSize int) ([]*entity.IngestionTask, error) {
	skip := pageSize * (page - 1)

	var (
		found []*entity.IngestionTask
		err   error
	)
	switch {
	case datasetID != nil:
		found, err = s.ingestionTaskDAO.ListByUserIDAndDatasetID(userID, *datasetID, skip, pageSize)
	default:
		found, err = s.ingestionTaskDAO.ListByUserID(userID, skip, pageSize)
	}
	if err != nil {
		return nil, err
	}
	return found, nil
}

// ParseDocumentResponse reports the outcome for a single document of an
// IngestDocuments / ParseDocuments call. Result is free text: either an error
// message explaining why the document was skipped or, for a task queued by
// IngestDocuments, a string of the form "task_id: <id>".
type ParseDocumentResponse struct {
	DocumentID string `json:"document_id"`
	Result     string `json:"result"`
}

// IngestDocuments creates an ingestion task for each distinct document in
// docIDs and publishes it on the "tasks.RAGFLOW" queue.
//
// The returned slice has one entry per processed document: an error message
// when the lookup or the task creation failed (such documents are skipped),
// or "task_id: <id>" once the task was published. An empty ID list is an
// error. A failure to marshal or publish a task message aborts the whole
// call: the error is returned and the responses collected so far are dropped.
func (s *DocumentService) IngestDocuments(datasetID, userID string, docIDs []string) ([]*ParseDocumentResponse, error) {
	uniqueDocIDs := common.Deduplicate(docIDs)
	if len(uniqueDocIDs) == 0 {
		return nil, fmt.Errorf("no documents to parse")
	}

	var responses []*ParseDocumentResponse
	// report records the outcome for one document.
	report := func(docID, result string) {
		responses = append(responses, &ParseDocumentResponse{
			DocumentID: docID,
			Result:     result,
		})
	}

	for _, docID := range uniqueDocIDs {
		// The document must exist before a task is created for it.
		doc, err := s.documentDAO.GetByID(docID)
		if err != nil {
			report(docID, err.Error())
			continue
		}
		if doc == nil {
			report(docID, "no such document")
			continue
		}

		// Persist the task first; the queue message only carries its ID.
		task, err := s.ingestionTaskDAO.CheckAndCreate(&entity.IngestionTask{
			DocumentID: docID,
			UserID:     userID,
			DatasetID:  datasetID,
			Schema:     nil,
			Status:     common.CREATED,
		})
		if err != nil {
			report(docID, err.Error())
			continue
		}

		queue := engine.GetMessageQueueEngine()

		payload, err := json.Marshal(common.TaskMessage{
			TaskID:   task.ID,
			TaskType: common.TaskTypeIngestionTask,
		})
		if err != nil {
			return nil, err
		}

		if err = queue.PublishTask("tasks.RAGFLOW", payload); err != nil {
			return nil, err
		}

		report(docID, fmt.Sprintf("task_id: %s", task.ID))
	}

	common.Info(fmt.Sprintf("parse documents, dataset: %s, documents: %v", datasetID, docIDs))
	return responses, nil
}

// StopIngestionTasks marks each task in tasks as stopping through the
// ingestion task DAO and returns the tasks the DAO handed back, in input
// order. The first DAO error aborts the call and is returned with a nil
// slice. userID is currently not used by this method.
func (s *DocumentService) StopIngestionTasks(tasks []string, userID string) ([]*entity.IngestionTask, error) {
	var stopped []*entity.IngestionTask

	for idx := range tasks {
		updated, err := s.ingestionTaskDAO.SetStoppingByAPIServer(tasks[idx])
		if err != nil {
			return nil, err
		}
		stopped = append(stopped, updated)
	}

	return stopped, nil
}

// RemoveIngestionTasks asks the ingestion task DAO to remove each task in
// tasks on behalf of userID. It returns one record per task with the keys
// "task_id" and "remove"; the latter is "success" or "fail: <error>". The
// error result is always nil: per-task failures are reported in the records.
func (s *DocumentService) RemoveIngestionTasks(tasks []string, userID string) ([]map[string]string, error) {
	var outcomes []map[string]string

	for _, taskID := range tasks {
		status := "success"
		if _, err := s.ingestionTaskDAO.RemoveByAPIServerOrAdminServer(taskID, &userID); err != nil {
			status = fmt.Sprintf("fail: %s", err.Error())
		}

		outcomes = append(outcomes, map[string]string{
			"task_id": taskID,
			"remove":  status,
		})
	}

	return outcomes, nil
}

// ParseDocuments validates the distinct documents in docIDs for parsing.
//
// In its current form the method only reports documents that cannot be
// parsed: a lookup failure, a missing document, or a document whose status is
// set and differs from "0". Documents that pass these checks produce no
// response entry, because task creation and queueing are not implemented yet
// (see the commented-out sketch below). An empty ID list is an error.
// userID is currently unused.
func (s *DocumentService) ParseDocuments(datasetID, userID string, docIDs []string) ([]*ParseDocumentResponse, error) {
	// Intended flow: create a parse task per document, save it to the task
	// table, then send it to the message queue.

	uniqueDocIDs := common.Deduplicate(docIDs)
	if len(uniqueDocIDs) == 0 {
		return nil, fmt.Errorf("no documents to parse")
	}

	var responses []*ParseDocumentResponse
	// report records why one document was rejected.
	report := func(docID, result string) {
		responses = append(responses, &ParseDocumentResponse{
			DocumentID: docID,
			Result:     result,
		})
	}

	for _, docID := range uniqueDocIDs {
		doc, err := s.documentDAO.GetByID(docID)
		switch {
		case err != nil:
			report(docID, err.Error())
		case doc == nil:
			report(docID, "no such document")
		case doc.Status != nil && *doc.Status != "0":
			report(docID, fmt.Sprintf("document %s is already parsed", docID))
		default:
			// create task for each document
			//task := &entity.IngestionTask{
			//	ID:         utility.GenerateToken(),
			//	DocumentID: docID,
			//	UserID:     userID,
			//}

			// save the task to database
			//err = s.ingestionTaskDAO.Create(task)
			//if err != nil {
			//	errorMessage := err.Error()
			//	responses = append(responses, &ParseDocumentResponse{
			//		DocumentID: docID,
			//		Result:     &errorMessage,
			//	})
			//	continue
			//}

			// Send task to message queue
		}
	}

	common.Info(fmt.Sprintf("parse documents, dataset: %s, documents: %v", datasetID, docIDs))
	return responses, nil
}

// StopParseDocuments cancels parsing for the given documents of a dataset.
//
// The IDs are deduplicated and validated against datasetID first; an empty
// list or a failed validation aborts the call. Each document is then passed
// to cancelDocParse. The result map always contains "success_count" and, when
// at least one cancellation failed, "errors" with the failure messages.
func (s *DocumentService) StopParseDocuments(datasetID string, docIDs []string) (map[string]interface{}, error) {
	uniqueIDs := common.Deduplicate(docIDs)
	if len(uniqueIDs) == 0 {
		return nil, fmt.Errorf("no document IDs provided")
	}

	docs, err := s.validateDocsInDataset(uniqueIDs, datasetID)
	if err != nil {
		return nil, err
	}

	var failures []string
	succeeded := 0
	for _, doc := range docs {
		if err := s.cancelDocParse(doc); err != nil {
			failures = append(failures, err.Error())
		} else {
			succeeded++
		}
	}

	summary := map[string]interface{}{"success_count": succeeded}
	if len(failures) != 0 {
		summary["errors"] = failures
	}
	return summary, nil
}

// validateDocsInDataset fetches the documents for docIDs and verifies that
// every one of them exists and belongs to datasetID, returning the resolved
// documents on success.
//
// The function does not deduplicate docIDs itself: existence is checked by
// comparing the number of fetched documents with len(docIDs), so callers are
// expected to pass an already deduplicated list.
func (s *DocumentService) validateDocsInDataset(docIDs []string, datasetID string) ([]*entity.Document, error) {
	docs, err := s.documentDAO.GetByIDs(docIDs)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch documents: %w", err)
	}
	if len(docs) != len(docIDs) {
		return nil, fmt.Errorf("some document IDs not found in dataset %s", datasetID)
	}

	// Collect the IDs of documents that live in a different dataset.
	var foreign []string
	for idx := range docs {
		if docs[idx].KbID != datasetID {
			foreign = append(foreign, docs[idx].ID)
		}
	}
	if len(foreign) == 0 {
		return docs, nil
	}
	return nil, fmt.Errorf("these documents do not belong to dataset %s: %v", datasetID, foreign)
}

// cancelDocParse requests cancellation of a document's parsing.
//
// A document is cancellable when its run status is RUNNING or CANCEL, or when
// at least one of its tasks has progress below 1. For a cancellable document
// a "<taskID>-cancel" key is written to Redis for every task (best-effort:
// skipped when no Redis client is available, and the result of each write is
// not checked) and the document's run status is set to CANCEL.
//
// It returns an error when the tasks cannot be loaded, when the document is
// not in a cancellable state, or when the status update fails. Underlying
// errors are wrapped with %w so callers can inspect them with errors.Is/As.
func (s *DocumentService) cancelDocParse(doc *entity.Document) error {
	tasks, taskErr := s.taskDAO.GetByDocID(doc.ID)
	if taskErr != nil {
		return fmt.Errorf("failed to get tasks for %s: %w", doc.ID, taskErr)
	}

	hasUnfinishedTask := false
	for _, t := range tasks {
		if t.Progress < 1 {
			hasUnfinishedTask = true
			break
		}
	}

	runCancellable := doc.Run != nil &&
		(*doc.Run == string(entity.TaskStatusRunning) || *doc.Run == string(entity.TaskStatusCancel))
	if !runCancellable && !hasUnfinishedTask {
		return fmt.Errorf("can't stop parsing document that has not started or already completed")
	}

	// Set Redis cancel signal for each task (best-effort). The client check
	// is loop-invariant, so it is done once instead of per task.
	if redisClient := cache.Get(); redisClient != nil {
		for _, t := range tasks {
			redisClient.Set(fmt.Sprintf("%s-cancel", t.ID), "x", 0)
		}
	}

	if upErr := s.documentDAO.UpdateByID(doc.ID, map[string]interface{}{"run": string(entity.TaskStatusCancel)}); upErr != nil {
		return fmt.Errorf("failed to update document %s: %w", doc.ID, upErr)
	}
	return nil
}

// toResponse converts an entity.Document into a DocumentResponse.
//
// All fields are copied as-is except the timestamps: CreateTime and
// UpdateTime are rendered with the layout "2006-01-02 15:04:05" (an empty
// string when nil). Values above 1000000000000 are taken to be milliseconds
// and divided by 1000; smaller values are used as seconds.
func (s *DocumentService) toResponse(doc *entity.Document) *DocumentResponse {
	// formatTimestamp accepts both second-based and millisecond-based values.
	formatTimestamp := func(raw *int64) string {
		if raw == nil {
			return ""
		}
		seconds := *raw
		if seconds > 1000000000000 {
			seconds /= 1000
		}
		return time.Unix(seconds, 0).Format("2006-01-02 15:04:05")
	}

	resp := new(DocumentResponse)
	resp.ID = doc.ID
	resp.Name = doc.Name
	resp.KbID = doc.KbID
	resp.ParserID = doc.ParserID
	resp.PipelineID = doc.PipelineID
	resp.Type = doc.Type
	resp.SourceType = doc.SourceType
	resp.CreatedBy = doc.CreatedBy
	resp.Location = doc.Location
	resp.Size = doc.Size
	resp.TokenNum = doc.TokenNum
	resp.ChunkNum = doc.ChunkNum
	resp.Progress = doc.Progress
	resp.ProgressMsg = doc.ProgressMsg
	resp.ProcessDuration = doc.ProcessDuration
	resp.Suffix = doc.Suffix
	resp.Run = doc.Run
	resp.Status = doc.Status
	resp.CreatedAt = formatTimestamp(doc.CreateTime)
	resp.UpdatedAt = formatTimestamp(doc.UpdateTime)
	return resp
}

// GetMetadataSummaryRequest is the request payload for a metadata summary:
// the knowledgebase ID (tagged binding:"required") and an optional list of
// document IDs.
// NOTE(review): presumably bound by the HTTP handler and forwarded to
// GetMetadataSummary — the struct is not referenced in the visible code.
type GetMetadataSummaryRequest struct {
	KBID   string   `json:"kb_id" binding:"required"`
	DocIDs []string `json:"doc_ids"`
}

// GetMetadataSummaryResponse is the response payload for a metadata summary;
// Summary is serialized under the JSON key "summary".
// NOTE(review): presumably filled from the map returned by
// GetMetadataSummary — the struct is not referenced in the visible code.
type GetMetadataSummaryResponse struct {
	Summary map[string]interface{} `json:"summary"`
}

// GetMetadataSummary returns an aggregated view of the metadata of the
// documents in knowledgebase kbID, restricted to docIDs as interpreted by the
// metadata service.
//
// The tenant is resolved from kbID, up to 1000 metadata records are requested
// from the metadata service, and the records are folded into a single map by
// aggregateMetadata. It returns an error when the metadata service is not
// initialized (other code in this service treats it as optional) or when the
// tenant lookup or the search fails.
func (s *DocumentService) GetMetadataSummary(kbID string, docIDs []string) (map[string]interface{}, error) {
	if s.metadataSvc == nil {
		return nil, fmt.Errorf("metadata service not initialized")
	}

	tenantID, err := s.metadataSvc.GetTenantIDByKBID(kbID)
	if err != nil {
		return nil, err
	}

	// Upper bound on the number of records fetched for the summary.
	const maxRecords = 1000
	searchResult, err := s.metadataSvc.SearchMetadata(kbID, tenantID, docIDs, maxRecords)
	if err != nil {
		return nil, err
	}

	// Aggregate metadata from results
	return aggregateMetadata(searchResult.MetadataRecords), nil
}

// SetDocumentMetadata writes meta for the document docID through the document
// engine's UpdateMetadata call. The document is loaded first to obtain its
// knowledge base ID, which in turn is used to resolve the tenant ID.
//
// Each failure is wrapped with a step-specific message ("document not found",
// "failed to get tenant ID", "failed to update metadata").
// NOTE(review): nil is passed as UpdateMetadata's first argument; if that
// parameter is a context.Context, a non-nil context would be preferable.
func (s *DocumentService) SetDocumentMetadata(docID string, meta map[string]interface{}) error {
	document, lookupErr := s.documentDAO.GetByID(docID)
	if lookupErr != nil {
		return fmt.Errorf("document not found: %w", lookupErr)
	}

	tenantID, tenantErr := s.metadataSvc.GetTenantIDByKBID(document.KbID)
	if tenantErr != nil {
		return fmt.Errorf("failed to get tenant ID: %w", tenantErr)
	}

	if updateErr := s.docEngine.UpdateMetadata(nil, docID, document.KbID, meta, tenantID); updateErr != nil {
		return fmt.Errorf("failed to update metadata: %w", updateErr)
	}
	return nil
}

// DeleteDocumentMetadata removes the given metadata keys from the document
// docID through the document engine's DeleteMetadataKeys call. The document is
// loaded first to obtain its knowledge base ID, which is then used to resolve
// the tenant ID.
//
// Each failure is wrapped with a step-specific message ("document not found",
// "failed to get tenant ID", "failed to delete metadata").
func (s *DocumentService) DeleteDocumentMetadata(docID string, keys []string) error {
	document, lookupErr := s.documentDAO.GetByID(docID)
	if lookupErr != nil {
		return fmt.Errorf("document not found: %w", lookupErr)
	}

	tenantID, tenantErr := s.metadataSvc.GetTenantIDByKBID(document.KbID)
	if tenantErr != nil {
		return fmt.Errorf("failed to get tenant ID: %w", tenantErr)
	}

	if deleteErr := s.docEngine.DeleteMetadataKeys(nil, docID, document.KbID, keys, tenantID); deleteErr != nil {
		return fmt.Errorf("failed to delete metadata: %w", deleteErr)
	}
	return nil
}

// DeleteDocumentAllMetadata removes the whole metadata entry of the document
// docID by calling the document engine's DeleteMetadata with a condition that
// matches on "id" and "kb_id". The first value returned by DeleteMetadata is
// discarded.
//
// Each failure is wrapped with a step-specific message ("document not found",
// "failed to get tenant ID", "failed to delete document metadata").
func (s *DocumentService) DeleteDocumentAllMetadata(docID string) error {
	document, lookupErr := s.documentDAO.GetByID(docID)
	if lookupErr != nil {
		return fmt.Errorf("document not found: %w", lookupErr)
	}

	tenantID, tenantErr := s.metadataSvc.GetTenantIDByKBID(document.KbID)
	if tenantErr != nil {
		return fmt.Errorf("failed to get tenant ID: %w", tenantErr)
	}

	// Match exactly this document inside its knowledge base.
	match := map[string]interface{}{
		"id":    docID,
		"kb_id": document.KbID,
	}
	if _, deleteErr := s.docEngine.DeleteMetadata(nil, match, tenantID); deleteErr != nil {
		return fmt.Errorf("failed to delete document metadata: %w", deleteErr)
	}
	return nil
}

// GetDocumentMetadataByID returns the metadata fields stored for the document
// docID. It loads the document to learn its knowledge base ID, resolves the
// tenant, and searches the metadata records for that single document. When
// the search yields no record an empty, non-nil map is returned; otherwise the
// first record is handed to ExtractMetaFields and its result is returned.
//
// A failed document lookup is wrapped as "document not found"; tenant and
// search errors are returned unchanged.
func (s *DocumentService) GetDocumentMetadataByID(docID string) (map[string]interface{}, error) {
	document, lookupErr := s.documentDAO.GetByID(docID)
	if lookupErr != nil {
		return nil, fmt.Errorf("document not found: %w", lookupErr)
	}

	tenantID, tenantErr := s.metadataSvc.GetTenantIDByKBID(document.KbID)
	if tenantErr != nil {
		return nil, tenantErr
	}

	found, searchErr := s.metadataSvc.SearchMetadata(document.KbID, tenantID, []string{docID}, 1)
	if searchErr != nil {
		return nil, searchErr
	}

	records := found.MetadataRecords
	if len(records) == 0 {
		// Nothing stored for this document.
		return make(map[string]interface{}), nil
	}
	return ExtractMetaFields(records[0])
}

// GetMetadataByKBs collects the metadata of the given knowledge bases into a
// map of the form fieldName -> value (formatted with %v) -> document IDs that
// carry that value. The fields "kb_id" and "id" are skipped, list values
// contribute one entry per element, and nil values are ignored.
//
// An empty kbIDs yields an empty, non-nil map without searching. Records for
// which ExtractDocumentID reports no ID, or whose meta_fields cannot be
// decoded, are skipped silently.
func (s *DocumentService) GetMetadataByKBs(kbIDs []string) (map[string]interface{}, error) {
	if len(kbIDs) == 0 {
		return make(map[string]interface{}), nil
	}

	found, err := s.metadataSvc.SearchMetadataByKBs(kbIDs, 10000)
	if err != nil {
		return nil, err
	}
	records := found.MetadataRecords

	// With more than one record, a []byte meta_fields on the first record is
	// decoded up front with ParseAllLengthPrefixedJSON; the decoded objects
	// are then matched to records by position.
	var preParsed []map[string]interface{}
	if len(records) > 1 {
		if blob, isBytes := records[0]["meta_fields"].([]byte); isBytes {
			preParsed = ParseAllLengthPrefixedJSON(blob)
		}
	}

	docsByFieldValue := make(map[string]map[string][]string)
	for pos, record := range records {
		docID, hasID := ExtractDocumentID(record)
		if !hasID {
			continue
		}

		var fields map[string]interface{}
		if pos < len(preParsed) {
			// Positional match against the pre-parsed objects.
			fields = preParsed[pos]
		} else if rawFields := record["meta_fields"]; rawFields != nil {
			// Otherwise decode this record's own meta_fields value.
			switch typed := rawFields.(type) {
			case map[string]interface{}:
				fields = typed
			case string:
				if json.Unmarshal([]byte(typed), &fields) != nil {
					continue
				}
			case []byte:
				// Plain JSON first; fall back to the length-prefixed layout.
				if json.Unmarshal(typed, &fields) != nil {
					fields = ParseLengthPrefixedJSON(typed)
				}
			default:
				continue
			}
		}
		if fields == nil {
			continue
		}

		for name, rawValue := range fields {
			if name == "kb_id" || name == "id" {
				continue
			}

			docsByValue, known := docsByFieldValue[name]
			if !known {
				docsByValue = make(map[string][]string)
				docsByFieldValue[name] = docsByValue
			}

			// Treat a scalar as a one-element list.
			items, isList := rawValue.([]interface{})
			if !isList {
				items = []interface{}{rawValue}
			}
			for _, item := range items {
				if item == nil {
					continue
				}
				key := fmt.Sprintf("%v", item)
				docsByValue[key] = append(docsByValue[key], docID)
			}
		}
	}

	// Widen the value type to interface{} to satisfy the return signature.
	out := make(map[string]interface{}, len(docsByFieldValue))
	for name, docsByValue := range docsByFieldValue {
		out[name] = docsByValue
	}
	return out, nil
}

// valueInfo holds count and order of first appearance.
//
// count is the number of times an item has been seen; firstOrder is the value
// of a running counter captured when the item was first seen. aggregateMetadata
// uses the pair to sort by count descending, then by firstOrder ascending.
type valueInfo struct {
	count      int
	firstOrder int
}

// aggregateMetadata summarizes the "meta_fields" entry of every chunk.
//
// The result maps each metadata field name to
//
//	{"type": <dominant type>, "values": [][2]interface{}{{value, count}, ...}}
//
// where value is the %v rendering of a non-nil field value (list values
// contribute one entry per element), values are ordered by count descending
// and then by first appearance, and the dominant type is the most frequent
// non-empty result of getMetaValueType for that field ("string" when no type
// was recorded). Ties between types are resolved in favor of the type seen
// first, so the output does not depend on map iteration order.
//
// Chunks without "meta_fields", or whose value cannot be decoded, are skipped.
func aggregateMetadata(chunks []map[string]interface{}) map[string]interface{} {
	type valueCount struct {
		value string
		info  valueInfo
	}

	// summary: fieldName -> value -> occurrences and first-seen order.
	summary := make(map[string]map[string]valueInfo)
	// typeCounter: fieldName -> type name -> occurrences and first-seen order.
	typeCounter := make(map[string]map[string]valueInfo)
	orderCounter := 0

	for _, chunk := range chunks {
		// A nil map (nothing decodable) simply yields no iterations.
		for k, v := range metaFieldsFromChunkValue(chunk["meta_fields"]) {
			if v == nil {
				continue
			}

			if valueType := getMetaValueType(v); valueType != "" {
				tallies, ok := typeCounter[k]
				if !ok {
					tallies = make(map[string]valueInfo)
					typeCounter[k] = tallies
				}
				tally, seen := tallies[valueType]
				if !seen {
					// Number of distinct types already recorded for this field.
					tally.firstOrder = len(tallies)
				}
				tally.count++
				tallies[valueType] = tally
			}

			// Treat a scalar as a one-element list.
			values, isList := v.([]interface{})
			if !isList {
				values = []interface{}{v}
			}
			for _, vv := range values {
				if vv == nil {
					continue
				}
				sv := fmt.Sprintf("%v", vv)

				counts, ok := summary[k]
				if !ok {
					counts = make(map[string]valueInfo)
					summary[k] = counts
				}
				info, seen := counts[sv]
				if !seen {
					info.firstOrder = orderCounter
					orderCounter++
				}
				info.count++
				counts[sv] = info
			}
		}
	}

	result := make(map[string]interface{}, len(summary))
	for k, counts := range summary {
		ordered := make([]valueCount, 0, len(counts))
		for val, info := range counts {
			ordered = append(ordered, valueCount{value: val, info: info})
		}
		// firstOrder is unique per value, so this comparison is a total order
		// and the result is deterministic despite the map iteration above.
		sort.Slice(ordered, func(i, j int) bool {
			a, b := ordered[i].info, ordered[j].info
			if a.count != b.count {
				return a.count > b.count
			}
			return a.firstOrder < b.firstOrder
		})

		// Most frequent type wins; on equal counts the earliest-seen type wins.
		valueType := "string"
		var best valueInfo
		for t, tally := range typeCounter[k] {
			if tally.count > best.count ||
				(tally.count == best.count && tally.firstOrder < best.firstOrder) {
				best = tally
				valueType = t
			}
		}

		outputValues := make([][2]interface{}, len(ordered))
		for i, entry := range ordered {
			outputValues[i] = [2]interface{}{entry.value, entry.info.count}
		}

		result[k] = map[string]interface{}{
			"type":   valueType,
			"values": outputValues,
		}
	}

	return result
}

// metaFieldsFromChunkValue decodes a chunk's "meta_fields" value, which may be
// a ready-made map, a JSON string, or a byte slice. It returns nil when the
// value is nil, of another type, or an undecodable JSON string.
func metaFieldsFromChunkValue(raw interface{}) map[string]interface{} {
	switch v := raw.(type) {
	case map[string]interface{}:
		return v
	case string:
		var fields map[string]interface{}
		if err := json.Unmarshal([]byte(v), &fields); err != nil {
			return nil
		}
		return fields
	case []byte:
		// Prefer the length-prefixed layout; fall back to plain JSON.
		if merged := mergeLengthPrefixedMetaFields(v); len(merged) > 0 {
			return merged
		}
		return fallbackMetaFieldsFromBytes(v)
	default:
		return nil
	}
}

// mergeLengthPrefixedMetaFields reads raw as a sequence of
// [4-byte little-endian length][JSON object] records and merges all objects
// into one map. When a key occurs in several records its values are combined
// into a list. It returns an empty map when no record could be decoded.
func mergeLengthPrefixedMetaFields(raw []byte) map[string]interface{} {
	const (
		prefixSize    = 4
		maxRecordSize = 10000 // larger (or zero) lengths are treated as invalid
		resyncWindow  = 100   // how far to look ahead for '{' after an invalid length
	)

	merged := make(map[string]interface{})
	offset := 0
	for offset+prefixSize <= len(raw) {
		length := uint32(raw[offset]) | uint32(raw[offset+1])<<8 |
			uint32(raw[offset+2])<<16 | uint32(raw[offset+3])<<24

		if length == 0 || length > maxRecordSize {
			// Look for the next '{' at or after offset, within the window.
			// NOTE(review): after jumping there the loop reads a length prefix
			// at the '{' itself, so this only resynchronizes if those bytes
			// happen to decode to a plausible length — confirm the intent.
			next := -1
			for i := offset; i < len(raw) && i < offset+resyncWindow; i++ {
				if raw[i] == '{' {
					next = i
					break
				}
			}
			if next > offset {
				offset = next
				continue
			}
			break
		}

		start := offset + prefixSize
		end := start + int(length)
		if end > len(raw) {
			// Truncated final record: try to decode whatever is left.
			end = len(raw)
		}

		var record map[string]interface{}
		if err := json.Unmarshal(raw[start:end], &record); err == nil {
			for k, v := range record {
				existing, seen := merged[k]
				if !seen {
					merged[k] = v
					continue
				}
				list, isList := existing.([]interface{})
				if !isList {
					merged[k] = []interface{}{existing, v}
					continue
				}
				if more, ok := v.([]interface{}); ok {
					merged[k] = append(list, more...)
				} else {
					merged[k] = append(list, v)
				}
			}
		}

		offset = end
	}
	return merged
}

// fallbackMetaFieldsFromBytes decodes raw as a single JSON object, first
// dropping any bytes that precede the first '{'. If decoding fails, the
// payload is returned verbatim under the key "raw".
func fallbackMetaFieldsFromBytes(raw []byte) map[string]interface{} {
	payload := raw
	for i, b := range raw {
		if b == '{' {
			// payload already equals raw when i == 0.
			payload = raw[i:]
			break
		}
	}

	var fields map[string]interface{}
	if err := json.Unmarshal(payload, &fields); err != nil {
		return map[string]interface{}{"raw": string(payload)}
	}
	return fields
}

// getMetaValueType classifies a metadata value for the summary output:
//
//   - nil or an empty []interface{} -> "" (no type recorded)
//   - a non-empty []interface{}     -> "list"
//   - signed integers and floats    -> "number"
//   - a string isTimeString accepts -> "time"
//   - everything else, bool included -> "string"
func getMetaValueType(value interface{}) string {
	switch typed := value.(type) {
	case nil:
		return ""
	case []interface{}:
		if len(typed) == 0 {
			return ""
		}
		return "list"
	case int, int8, int16, int32, int64, float32, float64:
		return "number"
	case string:
		if isTimeString(typed) {
			return "time"
		}
	}
	return "string"
}

// metaTimeStringPattern matches exactly "YYYY-MM-DDTHH:MM:SS" (digits only,
// no fractional seconds and no time zone suffix). It is compiled once at
// package initialization instead of on every isTimeString call.
var metaTimeStringPattern = regexp.MustCompile(`^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$`)

// isTimeString reports whether s has the shape of an ISO 8601 local datetime,
// "YYYY-MM-DDTHH:MM:SS". Only the shape is checked: field ranges (month 13,
// hour 99, ...) are not validated.
func isTimeString(s string) bool {
	return metaTimeStringPattern.MatchString(s)
}