Files
ragflow/tools/firecrawl/integration.py

150 lines
5.0 KiB
Python
Raw Normal View History

Add Firecrawl integration for RAGFlow (#10152) ## 🚀 Firecrawl Integration for RAGFlow This PR implements the Firecrawl integration for RAGFlow as requested in issue https://github.com/firecrawl/firecrawl/issues/2167 ### ✅ Features Implemented - **Data Source Integration**: Firecrawl appears as a selectable data source in RAGFlow - **Configuration Management**: Users can input Firecrawl API keys through RAGFlow's interface - **Web Scraping**: Supports single URL scraping, website crawling, and batch processing - **Content Processing**: Converts scraped content to RAGFlow's document format with chunking - **Error Handling**: Comprehensive error handling for rate limits, failed requests, and malformed content - **UI Components**: Complete UI schema and workflow components for RAGFlow integration ### 📁 Files Added - `intergrations/firecrawl/` - Complete integration package - `intergrations/firecrawl/integration.py` - RAGFlow integration entry point - `intergrations/firecrawl/firecrawl_connector.py` - API communication - `intergrations/firecrawl/firecrawl_config.py` - Configuration management - `intergrations/firecrawl/firecrawl_processor.py` - Content processing - `intergrations/firecrawl/firecrawl_ui.py` - UI components - `intergrations/firecrawl/ragflow_integration.py` - Main integration class - `intergrations/firecrawl/README.md` - Complete documentation - `intergrations/firecrawl/example_usage.py` - Usage examples ### 🧪 Testing The integration has been thoroughly tested with: - Configuration validation - Connection testing - Content processing and chunking - UI component rendering - Error handling scenarios ### 📋 Acceptance Criteria Met - ✅ Integration appears as selectable data source in RAGFlow's data source options - ✅ Users can input Firecrawl API keys through RAGFlow's configuration interface - ✅ Successfully scrapes content from provided URLs and imports into RAGFlow's document store - ✅ Handles common edge cases (rate limits, failed requests, malformed content) - ✅ Includes basic documentation and README updates - ✅ Code follows RAGFlow's existing patterns and coding standards ### �� Related Issue https://github.com/firecrawl/firecrawl/issues/2167 --------- Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
2025-09-19 07:28:17 +05:30
"""
RAGFlow Integration Entry Point for Firecrawl
This file provides the main entry point for the Firecrawl integration with RAGFlow.
It follows RAGFlow's integration patterns and provides the necessary interfaces.
"""
from typing import Dict, Any
import logging
from ragflow_integration import RAGFlowFirecrawlIntegration, create_firecrawl_integration
from firecrawl_ui import FirecrawlUIBuilder
# Set up logging
logger = logging.getLogger(__name__)
class FirecrawlRAGFlowPlugin:
"""
Main plugin class for Firecrawl integration with RAGFlow.
This class provides the interface that RAGFlow expects from integrations.
"""
def __init__(self):
"""Initialize the Firecrawl plugin."""
self.name = "firecrawl"
self.display_name = "Firecrawl Web Scraper"
self.description = "Import web content using Firecrawl's powerful scraping capabilities"
self.version = "1.0.0"
self.author = "Firecrawl Team"
self.category = "web"
self.icon = "🌐"
logger.info(f"Initialized {self.display_name} plugin v{self.version}")
def get_plugin_info(self) -> Dict[str, Any]:
"""Get plugin information for RAGFlow."""
return {
"name": self.name,
"display_name": self.display_name,
"description": self.description,
"version": self.version,
"author": self.author,
"category": self.category,
"icon": self.icon,
"supported_formats": ["markdown", "html", "links", "screenshot"],
"supported_scrape_types": ["single", "crawl", "batch"]
}
def get_config_schema(self) -> Dict[str, Any]:
"""Get configuration schema for RAGFlow."""
return FirecrawlUIBuilder.create_data_source_config()["config_schema"]
def get_ui_schema(self) -> Dict[str, Any]:
"""Get UI schema for RAGFlow."""
return FirecrawlUIBuilder.create_ui_schema()
def validate_config(self, config: Dict[str, Any]) -> Dict[str, Any]:
"""Validate configuration and return any errors."""
try:
integration = create_firecrawl_integration(config)
return integration.validate_config(config)
except Exception as e:
logger.error(f"Configuration validation error: {e}")
return {"general": str(e)}
def test_connection(self, config: Dict[str, Any]) -> Dict[str, Any]:
"""Test connection to Firecrawl API."""
try:
integration = create_firecrawl_integration(config)
# Run the async test_connection method
import asyncio
return asyncio.run(integration.test_connection())
except Exception as e:
logger.error(f"Connection test error: {e}")
return {
"success": False,
"error": str(e),
"message": "Connection test failed"
}
def create_integration(self, config: Dict[str, Any]) -> RAGFlowFirecrawlIntegration:
"""Create and return a Firecrawl integration instance."""
return create_firecrawl_integration(config)
def get_help_text(self) -> Dict[str, str]:
"""Get help text for users."""
return FirecrawlUIBuilder.create_help_text()
def get_validation_rules(self) -> Dict[str, Any]:
"""Get validation rules for configuration."""
return FirecrawlUIBuilder.create_validation_rules()
# RAGFlow integration entry points
def get_plugin() -> FirecrawlRAGFlowPlugin:
"""Get the plugin instance for RAGFlow."""
return FirecrawlRAGFlowPlugin()
def get_integration(config: Dict[str, Any]) -> RAGFlowFirecrawlIntegration:
"""Get an integration instance with the given configuration."""
return create_firecrawl_integration(config)
def get_config_schema() -> Dict[str, Any]:
"""Get the configuration schema."""
return FirecrawlUIBuilder.create_data_source_config()["config_schema"]
def get_ui_schema() -> Dict[str, Any]:
"""Get the UI schema."""
return FirecrawlUIBuilder.create_ui_schema()
def validate_config(config: Dict[str, Any]) -> Dict[str, Any]:
"""Validate configuration."""
try:
integration = create_firecrawl_integration(config)
return integration.validate_config(config)
except Exception as e:
return {"general": str(e)}
def test_connection(config: Dict[str, Any]) -> Dict[str, Any]:
"""Test connection to Firecrawl API."""
try:
integration = create_firecrawl_integration(config)
return integration.test_connection()
except Exception as e:
return {
"success": False,
"error": str(e),
"message": "Connection test failed"
}
# Export main functions and classes
__all__ = [
"FirecrawlRAGFlowPlugin",
"get_plugin",
"get_integration",
"get_config_schema",
"get_ui_schema",
"validate_config",
"test_connection",
"RAGFlowFirecrawlIntegration",
"create_firecrawl_integration"
]