mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-05 19:08:38 +08:00
177 lines
8.2 KiB
Python
177 lines
8.2 KiB
Python
"""
UI components for Firecrawl integration in RAGFlow.
"""
|
|
|
|
from typing import Dict, Any, List, Optional
|
|
from dataclasses import dataclass
|
|
|
|
|
|
@dataclass
class FirecrawlUIComponent:
    """A single node in a Firecrawl UI component tree.

    Each node carries a type tag, a free-form property mapping, and an
    optional list of nested ``FirecrawlUIComponent`` children.
    """

    # Identifier for the kind of component this node represents.
    component_type: str
    # Arbitrary properties attached to the component, keyed by name.
    props: Dict[str, Any]
    # Nested child components; defaults to None when none are supplied.
    children: Optional[List["FirecrawlUIComponent"]] = None
|
|
|
|
|
|
class FirecrawlUIBuilder:
    """Factory of plain-dict UI descriptions for the Firecrawl integration.

    Every method is static and returns a freshly built dictionary, so
    callers may mutate the result without affecting later calls.
    """

    @staticmethod
    def create_data_source_config() -> Dict[str, Any]:
        """Build the Firecrawl data-source descriptor.

        Returns:
            A dict with identity metadata (name, display name, description,
            icon, category, version, author) and a ``config_schema`` whose
            properties are ``api_key``, ``api_url``, ``max_retries``,
            ``timeout`` and ``rate_limit_delay``. Only ``api_key`` is listed
            in the schema's ``required`` array.
        """
        api_key_field = {
            "type": "string",
            "title": "Firecrawl API Key",
            "description": "Your Firecrawl API key (starts with 'fc-')",
            "format": "password",
            "required": True,
        }
        api_url_field = {
            "type": "string",
            "title": "API URL",
            "description": "Firecrawl API endpoint",
            "default": "https://api.firecrawl.dev",
            "required": False,
        }
        max_retries_field = {
            "type": "integer",
            "title": "Max Retries",
            "description": "Maximum number of retry attempts",
            "default": 3,
            "minimum": 1,
            "maximum": 10,
        }
        timeout_field = {
            "type": "integer",
            "title": "Timeout (seconds)",
            "description": "Request timeout in seconds",
            "default": 30,
            "minimum": 5,
            "maximum": 300,
        }
        rate_limit_field = {
            "type": "number",
            "title": "Rate Limit Delay",
            "description": "Delay between requests in seconds",
            "default": 1.0,
            "minimum": 0.1,
            "maximum": 10.0,
        }

        config_schema = {
            "type": "object",
            "properties": {
                "api_key": api_key_field,
                "api_url": api_url_field,
                "max_retries": max_retries_field,
                "timeout": timeout_field,
                "rate_limit_delay": rate_limit_field,
            },
            "required": ["api_key"],
        }

        descriptor: Dict[str, Any] = {
            "name": "firecrawl",
            "display_name": "Firecrawl Web Scraper",
            "description": "Import web content using Firecrawl's powerful scraping capabilities",
            "icon": "🌐",
            "category": "web",
            "version": "1.0.0",
            "author": "Firecrawl Team",
            "config_schema": config_schema,
        }
        return descriptor

    @staticmethod
    def create_scraping_form() -> Dict[str, Any]:
        """Build the form description used to configure a scraping job.

        Returns:
            A dict of type ``"form"`` whose ``fields`` list contains, in
            order: ``urls``, ``scrape_type``, ``formats``, ``crawl_limit``
            and ``extract_options``. ``crawl_limit`` carries a ``condition``
            entry referring to ``scrape_type`` equal to ``"crawl"``.
        """
        urls_field = {
            "name": "urls",
            "type": "array",
            "title": "URLs to Scrape",
            "description": "Enter URLs to scrape (one per line)",
            "items": {"type": "string", "format": "uri"},
            "required": True,
            "minItems": 1,
        }
        scrape_type_field = {
            "name": "scrape_type",
            "type": "string",
            "title": "Scrape Type",
            "description": "Choose scraping method",
            "enum": ["single", "crawl", "batch"],
            "enumNames": ["Single URL", "Crawl Website", "Batch URLs"],
            "default": "single",
            "required": True,
        }
        formats_field = {
            "name": "formats",
            "type": "array",
            "title": "Output Formats",
            "description": "Select output formats",
            "items": {
                "type": "string",
                "enum": ["markdown", "html", "links", "screenshot"],
            },
            "default": ["markdown", "html"],
            "required": True,
        }
        crawl_limit_field = {
            "name": "crawl_limit",
            "type": "integer",
            "title": "Crawl Limit",
            "description": "Maximum number of pages to crawl (for crawl type)",
            "default": 100,
            "minimum": 1,
            "maximum": 1000,
            "condition": {"field": "scrape_type", "equals": "crawl"},
        }

        extraction_properties = {
            "extractMainContent": {
                "type": "boolean",
                "title": "Extract Main Content Only",
                "default": True,
            },
            "excludeTags": {
                "type": "array",
                "title": "Exclude Tags",
                "description": "HTML tags to exclude",
                "items": {"type": "string"},
                "default": ["nav", "footer", "header", "aside"],
            },
            "includeTags": {
                "type": "array",
                "title": "Include Tags",
                "description": "HTML tags to include",
                "items": {"type": "string"},
                "default": ["main", "article", "section", "div", "p"],
            },
        }
        extract_options_field = {
            "name": "extract_options",
            "type": "object",
            "title": "Extraction Options",
            "description": "Advanced extraction settings",
            "properties": extraction_properties,
        }

        form: Dict[str, Any] = {
            "type": "form",
            "title": "Firecrawl Web Scraping",
            "description": "Configure web scraping parameters",
            "fields": [
                urls_field,
                scrape_type_field,
                formats_field,
                crawl_limit_field,
                extract_options_field,
            ],
        }
        return form

    @staticmethod
    def create_progress_component() -> Dict[str, Any]:
        """Build the progress-tracking component description.

        Returns:
            A dict of type ``"progress"`` whose ``properties`` enable the
            percentage, ETA and details flags.
        """
        display_flags = {
            "show_percentage": True,
            "show_eta": True,
            "show_details": True,
        }
        component: Dict[str, Any] = {
            "type": "progress",
            "title": "Scraping Progress",
            "description": "Track the progress of your web scraping job",
            "properties": display_flags,
        }
        return component

    @staticmethod
    def create_results_view() -> Dict[str, Any]:
        """Build the results-display component description.

        Returns:
            A dict of type ``"results"`` whose ``properties`` enable the
            preview, metadata, editing and chunk flags.
        """
        display_flags = {
            "show_preview": True,
            "show_metadata": True,
            "allow_editing": True,
            "show_chunks": True,
        }
        component: Dict[str, Any] = {
            "type": "results",
            "title": "Scraping Results",
            "description": "View and manage scraped content",
            "properties": display_flags,
        }
        return component

    @staticmethod
    def create_error_handler() -> Dict[str, Any]:
        """Build the error-handling component description.

        Returns:
            A dict of type ``"error_handler"`` whose ``properties`` show the
            retry button and error details, leave ``auto_retry`` off, and
            set ``max_retries`` to 3.
        """
        retry_settings = {
            "show_retry_button": True,
            "show_error_details": True,
            "auto_retry": False,
            "max_retries": 3,
        }
        component: Dict[str, Any] = {
            "type": "error_handler",
            "title": "Error Handling",
            "description": "Handle scraping errors and retries",
            "properties": retry_settings,
        }
        return component

    @staticmethod
    def create_validation_rules() -> Dict[str, Any]:
        """Build the validation rules for user-supplied Firecrawl settings.

        Returns:
            A dict with a regex ``pattern`` and ``message`` for URLs and for
            API keys, and a ``min``/``max`` range with ``message`` for the
            rate-limit delay.
        """
        url_rule = {
            "pattern": r"^https?://.+",
            "message": "URL must start with http:// or https://",
        }
        api_key_rule = {
            "pattern": r"^fc-[a-zA-Z0-9]+$",
            "message": "API key must start with 'fc-' followed by alphanumeric characters",
        }
        rate_limit_rule = {
            "min": 0.1,
            "max": 10.0,
            "message": "Rate limit delay must be between 0.1 and 10.0 seconds",
        }
        rules: Dict[str, Any] = {
            "url_validation": url_rule,
            "api_key_validation": api_key_rule,
            "rate_limit_validation": rate_limit_rule,
        }
        return rules

    @staticmethod
    def create_help_text() -> Dict[str, str]:
        """Build the user-facing help strings, keyed by topic.

        Returns:
            A dict mapping ``api_key_help``, ``url_help``, ``crawl_help``,
            ``formats_help`` and ``extract_help`` to their help sentences.
        """
        help_text: Dict[str, str] = {}
        help_text["api_key_help"] = "Get your API key from https://firecrawl.dev. Sign up for a free account to get started."
        help_text["url_help"] = "Enter the URLs you want to scrape. You can add multiple URLs for batch processing."
        help_text["crawl_help"] = "Crawling will follow links from the starting URL and scrape all accessible pages within the limit."
        help_text["formats_help"] = "Choose the output formats you need. Markdown is recommended for RAG processing."
        help_text["extract_help"] = "Extraction options help filter content to get only the main content without navigation and ads."
        return help_text

    @staticmethod
    def create_ui_schema() -> Dict[str, Any]:
        """Assemble the complete UI schema from the other factory methods.

        Returns:
            A dict holding the schema ``version``, the five ``components``
            (data source config, scraping form, progress component, results
            view, error handler), the ``validation_rules``, the
            ``help_text`` and the ordered ``workflow`` step names.
        """
        builder = FirecrawlUIBuilder

        components = {
            "data_source_config": builder.create_data_source_config(),
            "scraping_form": builder.create_scraping_form(),
            "progress_component": builder.create_progress_component(),
            "results_view": builder.create_results_view(),
            "error_handler": builder.create_error_handler(),
        }
        workflow_steps = [
            "configure_data_source",
            "setup_scraping_parameters",
            "start_scraping_job",
            "monitor_progress",
            "review_results",
            "import_to_ragflow",
        ]

        ui_schema: Dict[str, Any] = {
            "version": "1.0.0",
            "components": components,
            "validation_rules": builder.create_validation_rules(),
            "help_text": builder.create_help_text(),
            "workflow": workflow_steps,
        }
        return ui_schema
|