diff --git a/cmd/server_main.go b/cmd/server_main.go index e4a634e72a..ab14dacd87 100644 --- a/cmd/server_main.go +++ b/cmd/server_main.go @@ -166,8 +166,7 @@ func startServer(config *server.Config) { // Initialize service layer userService := service.NewUserService() documentService := service.NewDocumentService() - datasetsService := service.NewDatasetsService() - kbService := service.NewKnowledgebaseService() + datasetsService := service.NewDatasetService() chunkService := service.NewChunkService() llmService := service.NewLLMService() tenantService := service.NewTenantService() @@ -187,10 +186,9 @@ func startServer(config *server.Config) { authHandler := handler.NewAuthHandler() userHandler := handler.NewUserHandler(userService) tenantHandler := handler.NewTenantHandler(tenantService, userService) - documentHandler := handler.NewDocumentHandler(documentService) + documentHandler := handler.NewDocumentHandler(documentService, datasetsService) datasetsHandler := handler.NewDatasetsHandler(datasetsService) systemHandler := handler.NewSystemHandler(systemService) - kbHandler := handler.NewKnowledgebaseHandler(kbService, userService, documentService) chunkHandler := handler.NewChunkHandler(chunkService, userService) llmHandler := handler.NewLLMHandler(llmService, userService) chatHandler := handler.NewChatHandler(chatService, userService) @@ -203,7 +201,7 @@ func startServer(config *server.Config) { providerHandler := handler.NewProviderHandler(userService, modelProviderService) // Initialize router - r := router.NewRouter(authHandler, userHandler, tenantHandler, documentHandler, datasetsHandler, systemHandler, kbHandler, chunkHandler, llmHandler, chatHandler, chatSessionHandler, connectorHandler, searchHandler, fileHandler, memoryHandler, skillSearchHandler, providerHandler) + r := router.NewRouter(authHandler, userHandler, tenantHandler, documentHandler, datasetsHandler, systemHandler, chunkHandler, llmHandler, chatHandler, chatSessionHandler, connectorHandler, searchHandler, fileHandler, memoryHandler, skillSearchHandler, providerHandler) // Create Gin engine ginEngine := gin.New() diff --git a/internal/cli/client.go b/internal/cli/client.go index f28b59464e..679e1d19b5 100644 --- a/internal/cli/client.go +++ b/internal/cli/client.go @@ -203,6 +203,8 @@ func (c *RAGFlowClient) ExecuteUserCommand(cmd *Command) (ResponseIf, error) { return c.RunBenchmark(cmd) case "list_datasets": return c.ListDatasets(cmd) + case "list_dataset_documents": + return c.ListDatasetDocumentUserCommand(cmd) case "search_on_datasets": return c.SearchOnDatasets(cmd) case "create_token": diff --git a/internal/cli/lexer.go b/internal/cli/lexer.go index 5f0bf18287..6df63fde0c 100644 --- a/internal/cli/lexer.go +++ b/internal/cli/lexer.go @@ -431,6 +431,8 @@ func (l *Lexer) lookupIdent(ident string) Token { return Token{Type: TokenChunks, Value: ident} case "DOCUMENT": return Token{Type: TokenDocument, Value: ident} + case "DOCUMENTS": + return Token{Type: TokenDocuments, Value: ident} case "TAGS": return Token{Type: TokenTag, Value: ident} case "REGION": diff --git a/internal/cli/response.go b/internal/cli/response.go index ffdd18f1d7..440345be9d 100644 --- a/internal/cli/response.go +++ b/internal/cli/response.go @@ -85,6 +85,42 @@ func (r *CommonDataResponse) PrintOut() { } } +type ListDocumentsResponse struct { + Code int `json:"code"` + Data map[string]interface{} `json:"data"` + Message string `json:"message"` + Duration float64 + OutputFormat OutputFormat +} + +func (r *ListDocumentsResponse) Type() string { + return "list_documents" +} + +func (r *ListDocumentsResponse) TimeCost() float64 { + return r.Duration +} + +func (r *ListDocumentsResponse) SetOutputFormat(format OutputFormat) { + r.OutputFormat = format +} + +func (r *ListDocumentsResponse) PrintOut() { + if r.Code == 0 { + total := r.Data["total"].(float64) + fmt.Printf("Total: %0.0f\n", total) + docs := r.Data["docs"].([]interface{}) + table := make([]map[string]interface{}, 0) + for _, doc := range docs { + table = append(table, doc.(map[string]interface{})) + } + PrintTableSimpleByFormat(table, r.OutputFormat) + } else { + fmt.Println("ERROR") + fmt.Printf("%d, %s\n", r.Code, r.Message) + } +} + type SimpleResponse struct { Code int `json:"code"` Message string `json:"message"` diff --git a/internal/cli/types.go b/internal/cli/types.go index bbcf09a432..e5bf55d9fc 100644 --- a/internal/cli/types.go +++ b/internal/cli/types.go @@ -149,6 +149,7 @@ const ( TokenChunk TokenChunks TokenDocument + TokenDocuments TokenTag TokenRegion TokenURL diff --git a/internal/cli/user_command.go b/internal/cli/user_command.go index e99912a400..1d4cc970ae 100644 --- a/internal/cli/user_command.go +++ b/internal/cli/user_command.go @@ -391,7 +391,63 @@ func (c *RAGFlowClient) ListDatasets(cmd *Command) (ResponseIf, error) { var result CommonResponse if err = json.Unmarshal(resp.Body, &result); err != nil { - return nil, fmt.Errorf("list users failed: invalid JSON (%w)", err) + return nil, fmt.Errorf("list datasets failed: invalid JSON (%w)", err) + } + + if result.Code != 0 { + return nil, fmt.Errorf("%s", result.Message) + } + result.Duration = resp.Duration + + return &result, nil +} + +// ListDatasetDocumentUserCommand lists dataset documents +func (c *RAGFlowClient) ListDatasetDocumentUserCommand(cmd *Command) (ResponseIf, error) { + if c.ServerType != "user" { + return nil, fmt.Errorf("this command is only allowed in USER mode") + } + + // Check for benchmark iterations + iterations := 1 + if val, ok := cmd.Params["iterations"].(int); ok && val > 1 { + iterations = val + } + + // Determine auth kind based on whether API token is being used + if c.HTTPClient.LoginToken == "" && !c.HTTPClient.useAPIToken { + return nil, fmt.Errorf("no authorization") + } + + datasetID, ok := cmd.Params["dataset_id"].(string) + if !ok { + return nil, fmt.Errorf("no dataset id") + } + + page := 1 + pageSize := 10 + keywords := "" + returnEmptyMetadata := "true" + url := fmt.Sprintf("/datasets/%s/documents?page=%d&page_size=%d&keywords=%s&return_empty_metadata=%s", datasetID, page, pageSize, keywords, returnEmptyMetadata) + + if iterations > 1 { + // Benchmark mode - return raw result for benchmark stats + return c.HTTPClient.RequestWithIterations("GET", url, "web", nil, nil, iterations) + } + + // Normal mode + resp, err := c.HTTPClient.Request("GET", url, "web", nil, nil) + if err != nil { + return nil, fmt.Errorf("failed to list documents: %w", err) + } + + if resp.StatusCode != 200 { + return nil, fmt.Errorf("failed to list documents: HTTP %d, body: %s", resp.StatusCode, string(resp.Body)) + } + + var result ListDocumentsResponse + if err = json.Unmarshal(resp.Body, &result); err != nil { + return nil, fmt.Errorf("list documents failed: invalid JSON (%w)", err) } if result.Code != 0 { diff --git a/internal/cli/user_parser.go b/internal/cli/user_parser.go index 7b40ee9b2c..38b2d2d29d 100644 --- a/internal/cli/user_parser.go +++ b/internal/cli/user_parser.go @@ -136,6 +136,8 @@ func (p *Parser) parseListCommand() (*Command, error) { return NewCommand("list_environments"), nil case TokenDatasets: return p.parseListDatasets() + case TokenDocuments: + return p.parseListDatasetDocuments() case TokenAgents: return p.parseListAgents() case TokenTokens: @@ -181,6 +183,31 @@ func (p *Parser) parseListDatasets() (*Command, error) { return cmd, nil } +func (p *Parser) parseListDatasetDocuments() (*Command, error) { + p.nextToken() // consume DOCUMENTS + + if p.curToken.Type != TokenFrom { + return nil, fmt.Errorf("expected FROM") + } + p.nextToken() + + datasetID, err := p.parseQuotedString() + if err != nil { + return nil, err + } + p.nextToken() + + cmd := NewCommand("list_dataset_documents") + cmd.Params["dataset_id"] = datasetID + + // Semicolon is optional for UNSET TOKEN + if p.curToken.Type == TokenSemicolon { + p.nextToken() + } + + return cmd, nil +} + func (p *Parser) parseListAgents() (*Command, error) { p.nextToken() // consume AGENTS diff --git a/internal/handler/datasets.go b/internal/handler/datasets.go index f740212329..aa0d896cb0 100644 --- a/internal/handler/datasets.go +++ b/internal/handler/datasets.go @@ -30,7 +30,7 @@ import ( // DatasetsHandler handles the RESTful dataset endpoints. type DatasetsHandler struct { - datasetsService *service.DatasetsService + datasetsService *service.DatasetService } type listDatasetsExt struct { @@ -40,7 +40,7 @@ type listDatasetsExt struct { } // NewDatasetsHandler creates a new datasets handler. -func NewDatasetsHandler(datasetsService *service.DatasetsService) *DatasetsHandler { +func NewDatasetsHandler(datasetsService *service.DatasetService) *DatasetsHandler { return &DatasetsHandler{datasetsService: datasetsService} } diff --git a/internal/handler/document.go b/internal/handler/document.go index a4152c07dc..fd7e775dfa 100644 --- a/internal/handler/document.go +++ b/internal/handler/document.go @@ -32,12 +32,14 @@ import ( // DocumentHandler document handler type DocumentHandler struct { documentService *service.DocumentService + datasetService *service.DatasetService } // NewDocumentHandler create document handler -func NewDocumentHandler(documentService *service.DocumentService) *DocumentHandler { +func NewDocumentHandler(documentService *service.DocumentService, datasetService *service.DatasetService) *DocumentHandler { return &DocumentHandler{ documentService: documentService, + datasetService: datasetService, } } @@ -198,35 +200,22 @@ func (h *DocumentHandler) DeleteDocument(c *gin.Context) { } // ListDocuments document list -// @Summary Document List -// @Description Get paginated document list -// @Tags documents -// @Accept json -// @Produce json -// @Param page query int false "page number" default(1) -// @Param page_size query int false "items per page" default(10) -// @Success 200 {object} map[string]interface{} -// @Router /api/v1/document/list [post] + func (h *DocumentHandler) ListDocuments(c *gin.Context) { - _, errorCode, errorMessage := GetUser(c) - if errorCode != common.CodeSuccess { - jsonError(c, errorCode, errorMessage) + + datasetID := c.Param("dataset_id") + pageStr := c.Query("page") + pageSizeStr := c.Query("page_size") + page, _ := strconv.Atoi(pageStr) + pageSize, _ := strconv.Atoi(pageSizeStr) + + userID := c.GetString("user_id") + + if !h.datasetService.Accessible(datasetID, userID) { + jsonError(c, common.CodeAuthenticationError, "No authorization.") return } - kbID := c.Query("kb_id") - if kbID == "" { - c.JSON(http.StatusOK, gin.H{ - "code": 1, - "message": "Lack of KB ID", - "data": false, - }) - return - } - - page, _ := strconv.Atoi(c.DefaultQuery("page", "1")) - pageSize, _ := strconv.Atoi(c.DefaultQuery("page_size", "10")) - if page < 1 { page = 1 } @@ -235,7 +224,7 @@ func (h *DocumentHandler) ListDocuments(c *gin.Context) { } // Use kbID to filter documents - documents, total, err := h.documentService.ListDocumentsByKBID(kbID, page, pageSize) + documents, total, err := h.documentService.ListDocumentsByDatasetID(datasetID, page, pageSize) if err != nil { c.JSON(http.StatusOK, gin.H{ "code": 1, @@ -482,4 +471,37 @@ func (h *DocumentHandler) SetMeta(c *gin.Context) { "message": "success", "data": true, }) -} \ No newline at end of file +} + +type ParseDocumentRequest struct { + Documents []string `json:"documents" binding:"required"` + DatasetID string `json:"dataset_id" binding:"required"` +} + +func (h *DocumentHandler) ParseDocuments(c *gin.Context) { + var req ParseDocumentRequest + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusOK, gin.H{ + "code": common.CodeBadRequest, + "message": err.Error(), + }) + return + } + + userID := c.GetString("user_id") + + if !h.datasetService.Accessible(req.DatasetID, userID) { + jsonError(c, common.CodeAuthenticationError, "No authorization to access the dataset.") + return + } + + err := h.documentService.ParseDocuments(req.DatasetID, userID, req.Documents) + if err != nil { + jsonError(c, common.CodeExceptionError, err.Error()) + return + } + c.JSON(http.StatusOK, gin.H{ + "code": 0, + "message": "success", + }) +} diff --git a/internal/router/router.go b/internal/router/router.go index 4354551b9a..6b858a89f7 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -50,7 +50,6 @@ func NewRouter( documentHandler *handler.DocumentHandler, datasetsHandler *handler.DatasetsHandler, systemHandler *handler.SystemHandler, - knowledgebaseHandler *handler.KnowledgebaseHandler, chunkHandler *handler.ChunkHandler, llmHandler *handler.LLMHandler, chatHandler *handler.ChatHandler, @@ -63,23 +62,22 @@ func NewRouter( providerHandler *handler.ProviderHandler, ) *Router { return &Router{ - authHandler: authHandler, - userHandler: userHandler, - tenantHandler: tenantHandler, - documentHandler: documentHandler, - datasetsHandler: datasetsHandler, - systemHandler: systemHandler, - knowledgebaseHandler: knowledgebaseHandler, - chunkHandler: chunkHandler, - llmHandler: llmHandler, - chatHandler: chatHandler, - chatSessionHandler: chatSessionHandler, - connectorHandler: connectorHandler, - searchHandler: searchHandler, - fileHandler: fileHandler, - memoryHandler: memoryHandler, - skillSearchHandler: skillSearchHandler, - providerHandler: providerHandler, + authHandler: authHandler, + userHandler: userHandler, + tenantHandler: tenantHandler, + documentHandler: documentHandler, + datasetsHandler: datasetsHandler, + systemHandler: systemHandler, + chunkHandler: chunkHandler, + llmHandler: llmHandler, + chatHandler: chatHandler, + chatSessionHandler: chatSessionHandler, + connectorHandler: connectorHandler, + searchHandler: searchHandler, + fileHandler: fileHandler, + memoryHandler: memoryHandler, + skillSearchHandler: skillSearchHandler, + providerHandler: providerHandler, } } @@ -159,6 +157,7 @@ func (r *Router) Setup(engine *gin.Engine) { documents.GET("/:id", r.documentHandler.GetDocumentByID) documents.PUT("/:id", r.documentHandler.UpdateDocument) documents.DELETE("/:id", r.documentHandler.DeleteDocument) + documents.POST("/parse", r.documentHandler.ParseDocuments) } // Chat routes @@ -177,6 +176,9 @@ func (r *Router) Setup(engine *gin.Engine) { datasets.POST("", r.datasetsHandler.CreateDataset) datasets.DELETE("", r.datasetsHandler.DeleteDatasets) datasets.POST("/search", r.chunkHandler.RetrievalTest) + + // Dataset documents + datasets.GET("/:dataset_id/documents", r.documentHandler.ListDocuments) } // Search routes diff --git a/internal/service/datasets.go b/internal/service/dataset.go similarity index 95% rename from internal/service/datasets.go rename to internal/service/dataset.go index c163f891e4..19be742525 100644 --- a/internal/service/datasets.go +++ b/internal/service/dataset.go @@ -57,8 +57,8 @@ var ( datasetChunkMethodErrorMessage = "Input should be 'naive', 'book', 'email', 'laws', 'manual', 'one', 'paper', 'picture', 'presentation', 'qa', 'resume', 'table' or 'tag'" ) -// DatasetsService implements the RESTful dataset APIs from dataset_api.py. -type DatasetsService struct { +// DatasetService implements the RESTful dataset APIs from dataset_api.py. +type DatasetService struct { kbDAO *dao.KnowledgebaseDAO documentDAO *dao.DocumentDAO connectorDAO *dao.ConnectorDAO @@ -66,9 +66,9 @@ type DatasetsService struct { tenantLLMDAO *dao.TenantLLMDAO } -// NewDatasetsService creates a new datasets service. -func NewDatasetsService() *DatasetsService { - return &DatasetsService{ +// NewDatasetService creates a new datasets service. +func NewDatasetService() *DatasetService { + return &DatasetService{ kbDAO: dao.NewKnowledgebaseDAO(), documentDAO: dao.NewDocumentDAO(), connectorDAO: dao.NewConnectorDAO(), @@ -108,7 +108,7 @@ type CreateDatasetRequest struct { } // ListDatasets lists datasets with pagination and filtering. -func (s *DatasetsService) ListDatasets(id, name string, page, pageSize int, orderby string, desc bool, keywords string, ownerIDs []string, parserID, userID string) ([]map[string]interface{}, int64, common.ErrorCode, error) { +func (s *DatasetService) ListDatasets(id, name string, page, pageSize int, orderby string, desc bool, keywords string, ownerIDs []string, parserID, userID string) ([]map[string]interface{}, int64, common.ErrorCode, error) { id = strings.TrimSpace(id) if id != "" { normalizedID, err := normalizeDatasetUUID1(id) @@ -190,7 +190,7 @@ func (s *DatasetsService) ListDatasets(id, name string, page, pageSize int, orde } // CreateDataset creates a new dataset. -func (s *DatasetsService) CreateDataset(req *CreateDatasetRequest, tenantID string) (map[string]interface{}, common.ErrorCode, error) { +func (s *DatasetService) CreateDataset(req *CreateDatasetRequest, tenantID string) (map[string]interface{}, common.ErrorCode, error) { if !isValidString(req.Name) { return nil, common.CodeDataError, errors.New("Dataset name must be string.") } @@ -441,7 +441,7 @@ func (s *DatasetsService) CreateDataset(req *CreateDatasetRequest, tenantID stri } // DeleteDatasets deletes multiple datasets. -func (s *DatasetsService) DeleteDatasets(ids []string, deleteAll bool, tenantID string) (map[string]interface{}, common.ErrorCode, error) { +func (s *DatasetService) DeleteDatasets(ids []string, deleteAll bool, tenantID string) (map[string]interface{}, common.ErrorCode, error) { normalizedIDs := make([]string, 0, len(ids)) seenIDs := make(map[string]struct{}, len(ids)) @@ -521,7 +521,7 @@ func (s *DatasetsService) DeleteDatasets(ids []string, deleteAll bool, tenantID } // GetDataset gets a single dataset with its size and linked connectors. -func (s *DatasetsService) GetDataset(datasetID, userID string) (map[string]interface{}, common.ErrorCode, error) { +func (s *DatasetService) GetDataset(datasetID, userID string) (map[string]interface{}, common.ErrorCode, error) { datasetID = strings.TrimSpace(datasetID) if datasetID == "" { return nil, common.CodeDataError, errors.New("Lack of \"Dataset ID\"") @@ -559,7 +559,12 @@ func (s *DatasetsService) GetDataset(datasetID, userID string) (map[string]inter return data, common.CodeSuccess, nil } -func (s *DatasetsService) deleteDataset(tenantID string, kb *entity.Knowledgebase) error { +// Accessible checks if a knowledge base is accessible by a user +func (s *DatasetService) Accessible(kbID, userID string) bool { + return s.kbDAO.Accessible(kbID, userID) +} + +func (s *DatasetService) deleteDataset(tenantID string, kb *entity.Knowledgebase) error { return dao.DB.Transaction(func(tx *gorm.DB) error { var documents []entity.Document if err := tx.Where("kb_id = ?", kb.ID).Find(&documents).Error; err != nil { @@ -706,7 +711,7 @@ func normalizeDatasetUUID1(id string) (string, error) { return strings.ReplaceAll(parsedUUID.String(), "-", ""), nil } -func (s *DatasetsService) verifyEmbeddingAvailability(embdID string, tenantID string) (bool, string) { +func (s *DatasetService) verifyEmbeddingAvailability(embdID string, tenantID string) (bool, string) { modelName, _, provider, err := parseModelName(embdID) if err != nil { return false, "Embedding model identifier must follow @ format" diff --git a/internal/service/document.go b/internal/service/document.go index d625bef484..29ed2d4b69 100644 --- a/internal/service/document.go +++ b/internal/service/document.go @@ -175,8 +175,8 @@ func (s *DocumentService) ListDocuments(page, pageSize int) ([]*DocumentResponse return responses, total, nil } -// ListDocumentsByKBID list documents by knowledge base ID -func (s *DocumentService) ListDocumentsByKBID(kbID string, page, pageSize int) ([]*DocumentResponse, int64, error) { +// ListDocumentsByDatasetID list documents by knowledge base ID +func (s *DocumentService) ListDocumentsByDatasetID(kbID string, page, pageSize int) ([]*DocumentResponse, int64, error) { offset := (page - 1) * pageSize documents, total, err := s.documentDAO.ListByKBID(kbID, offset, pageSize) if err != nil { @@ -207,6 +207,13 @@ func (s *DocumentService) GetDocumentsByAuthorID(authorID, page, pageSize int) ( return responses, total, nil } +func (s *DocumentService) ParseDocuments(datasetID, userID string, docIDs []string) error { + // create document parse id + // save to task table + // send to message queue + return nil +} + // toResponse convert model.Document to DocumentResponse func (s *DocumentService) toResponse(doc *entity.Document) *DocumentResponse { createdAt := ""