From a99d29da330412108c857af6991b70ff4d5a8088 Mon Sep 17 00:00:00 2001 From: zlei9 Date: Sun, 29 Mar 2026 08:32:38 +0800 Subject: [PATCH] Initial commit with translated description --- SKILL.md | 42 +++++++++++++++++ _meta.json | 6 +++ references/api.md | 85 +++++++++++++++++++++++++++++++++ scripts/crawl.py | 117 ++++++++++++++++++++++++++++++++++++++++++++++ scripts/scrape.py | 89 +++++++++++++++++++++++++++++++++++ scripts/search.py | 77 ++++++++++++++++++++++++++++++ 6 files changed, 416 insertions(+) create mode 100644 SKILL.md create mode 100644 _meta.json create mode 100644 references/api.md create mode 100644 scripts/crawl.py create mode 100644 scripts/scrape.py create mode 100644 scripts/search.py diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..46d0792 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,42 @@ +--- +name: firecrawl +description: "通过Firecrawl API进行网络搜索和抓取。在需要搜索网页、抓取网站(包括JS重型页面)、爬取整个站点或从网页提取结构化数据时使用。需要FIRECRAWL_API_KEY环境变量。" +--- + +# Firecrawl + +Web search and scraping via Firecrawl API. + +## Prerequisites + +Set `FIRECRAWL_API_KEY` in your environment or `.env` file: +```bash +export FIRECRAWL_API_KEY=fc-xxxxxxxxxx +``` + +## Quick Start + +### Search the web +```bash +firecrawl_search "your search query" --limit 10 +``` + +### Scrape a single page +```bash +firecrawl_scrape "https://example.com" +``` + +### Crawl an entire site +```bash +firecrawl_crawl "https://example.com" --max-pages 50 +``` + +## API Reference + +See [references/api.md](references/api.md) for detailed API documentation and advanced options. 
+ +## Scripts + +- `scripts/search.py` - Search the web with Firecrawl +- `scripts/scrape.py` - Scrape a single URL +- `scripts/crawl.py` - Crawl an entire website diff --git a/_meta.json b/_meta.json new file mode 100644 index 0000000..b8a629d --- /dev/null +++ b/_meta.json @@ -0,0 +1,6 @@ +{ + "ownerId": "kn72sgg184w49a3mwz6fhdfc5s800zf0", + "slug": "firecrawl-search", + "version": "1.0.0", + "publishedAt": 1769544521483 +} \ No newline at end of file diff --git a/references/api.md b/references/api.md new file mode 100644 index 0000000..26d6953 --- /dev/null +++ b/references/api.md @@ -0,0 +1,85 @@ +# Firecrawl API Reference + +## Environment + +Set your API key: +```bash +export FIRECRAWL_API_KEY=fc-xxxxxxxxxx +``` + +## API Endpoints + +### Search +```bash +POST https://api.firecrawl.dev/v1/search +``` + +Request body: +```json +{ + "query": "search terms", + "limit": 10, + "lang": "en", + "country": "us" +} +``` + +### Scrape +```bash +POST https://api.firecrawl.dev/v1/scrape +``` + +Request body: +```json +{ + "url": "https://example.com", + "formats": ["markdown", "html", "screenshot"], + "onlyMainContent": true, + "includeTags": ["h1", "p", "article"], + "excludeTags": ["nav", "footer", "aside"] +} +``` + +### Crawl +```bash +POST https://api.firecrawl.dev/v1/crawl +``` + +Request body: +```json +{ + "url": "https://example.com", + "limit": 50, + "excludePaths": ["/blog", "/admin"], + "scrapeOptions": { + "formats": ["markdown"], + "onlyMainContent": true + } +} +``` + +Check status: +```bash +GET https://api.firecrawl.dev/v1/crawl/{job_id} +``` + +## Response Format + +All responses follow this structure: +```json +{ + "success": true, + "data": { ... }, + "status": "completed" +} +``` + +## Rate Limits + +- Search: Check your Firecrawl dashboard +- Scrape: Check your Firecrawl dashboard +- Crawl: Check your Firecrawl dashboard + +## Pricing + +See https://firecrawl.dev/pricing for current rates. 
#!/usr/bin/env python3
"""Firecrawl crawl script for crawling entire websites.

Starts a crawl job via the Firecrawl /v1/crawl API, optionally polls it to
completion, and prints either a per-page summary or the raw JSON response.

Requires the FIRECRAWL_API_KEY environment variable.
"""
import argparse
import json
import os
import sys
import time
import urllib.request
from urllib.error import HTTPError, URLError


def start_crawl(url: str, max_pages: int = 50, exclude_paths: list = None):
    """Start a crawl job.

    Args:
        url: URL to start crawling from.
        max_pages: Maximum number of pages to crawl (API "limit" field).
        exclude_paths: Optional list of path patterns to skip (e.g. ["/blog"]).

    Returns:
        Parsed JSON response from the API (contains the job "id" on success).

    Exits the process with status 1 when the API key is missing or the
    request fails.
    """
    api_key = os.environ.get("FIRECRAWL_API_KEY")
    if not api_key:
        print("Error: FIRECRAWL_API_KEY not set", file=sys.stderr)
        sys.exit(1)

    req_url = "https://api.firecrawl.dev/v1/crawl"

    payload = {
        "url": url,
        "limit": max_pages,
        "scrapeOptions": {
            "formats": ["markdown"],
            "onlyMainContent": True
        }
    }

    if exclude_paths:
        payload["excludePaths"] = exclude_paths

    data = json.dumps(payload).encode()

    req = urllib.request.Request(
        req_url,
        data=data,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        },
        method="POST"
    )

    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read().decode())
    except HTTPError as e:
        print(f"Error: {e.code} - {e.reason}", file=sys.stderr)
        sys.exit(1)
    except URLError as e:
        # Network-level failure (DNS, refused connection, timeout).
        print(f"Error: {e.reason}", file=sys.stderr)
        sys.exit(1)


def check_crawl_status(job_id: str):
    """Check crawl job status.

    Args:
        job_id: Job id returned by start_crawl().

    Returns:
        Parsed JSON status document ("status" plus, when finished, "data").

    Exits the process with status 1 when the API key is missing or the
    request fails (previously a missing key sent "Bearer None" and any
    HTTPError escaped as a raw traceback).
    """
    api_key = os.environ.get("FIRECRAWL_API_KEY")
    if not api_key:
        print("Error: FIRECRAWL_API_KEY not set", file=sys.stderr)
        sys.exit(1)

    req_url = f"https://api.firecrawl.dev/v1/crawl/{job_id}"

    req = urllib.request.Request(
        req_url,
        headers={"Authorization": f"Bearer {api_key}"}
    )

    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read().decode())
    except HTTPError as e:
        print(f"Error: {e.code} - {e.reason}", file=sys.stderr)
        sys.exit(1)
    except URLError as e:
        print(f"Error: {e.reason}", file=sys.stderr)
        sys.exit(1)


def main():
    """CLI entry point: start a crawl and optionally wait for completion."""
    parser = argparse.ArgumentParser(description="Crawl a website via Firecrawl")
    parser.add_argument("url", help="URL to start crawling from")
    parser.add_argument("--max-pages", type=int, default=50, help="Max pages to crawl")
    parser.add_argument("--exclude-paths", nargs="*", default=None,
                        help="Path patterns to exclude, e.g. /blog /admin")
    parser.add_argument("--wait", action="store_true", help="Wait for completion")
    parser.add_argument("--json", action="store_true", help="Output raw JSON")

    args = parser.parse_args()

    # Start crawl (exclude_paths was previously accepted by start_crawl but
    # never passed from the CLI).
    result = start_crawl(args.url, args.max_pages, args.exclude_paths)

    if not result.get("success"):
        print("Error: Failed to start crawl", file=sys.stderr)
        print(json.dumps(result, indent=2))
        sys.exit(1)

    job_id = result.get("id")
    print(f"Crawl started: {job_id}")

    if not args.wait:
        print(f"Check status with: firecrawl_crawl_status {job_id}")
        return

    # Poll every 2s until the job reaches a terminal state.
    print("Waiting for completion...")
    while True:
        status = check_crawl_status(job_id)

        if status.get("status") in ["completed", "failed", "cancelled"]:
            break

        print(f"Status: {status.get('status')}...")
        time.sleep(2)

    if args.json:
        print(json.dumps(status, indent=2))
    else:
        print(f"\nCrawl {status.get('status')}")
        if "data" in status:
            print(f"Pages crawled: {len(status['data'])}")
            for page in status["data"]:
                print(f"\n{'='*60}")
                print(f"URL: {page.get('metadata', {}).get('sourceURL', 'N/A')}")
                if "markdown" in page:
                    preview = page["markdown"][:300] + "..." if len(page["markdown"]) > 300 else page["markdown"]
                    print(f"Preview: {preview}")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Firecrawl scrape script for single URLs.

Scrapes one page via the Firecrawl /v1/scrape API and prints either a
human-readable summary (title, URL, markdown/html content) or raw JSON.

Requires the FIRECRAWL_API_KEY environment variable.
"""
import argparse
import json
import os
import sys
import urllib.request
from urllib.error import HTTPError, URLError


def scrape(url: str, formats: list = None, only_main: bool = True):
    """Scrape a URL using Firecrawl.

    Args:
        url: Page URL to scrape.
        formats: Output formats to request (defaults to ["markdown"]).
        only_main: Ask the API to strip navigation/boilerplate content.

    Returns:
        Parsed JSON response from the API.

    Exits the process with status 1 when the API key is missing or the
    request fails.
    """
    api_key = os.environ.get("FIRECRAWL_API_KEY")
    if not api_key:
        print("Error: FIRECRAWL_API_KEY not set", file=sys.stderr)
        sys.exit(1)

    formats = formats or ["markdown"]

    req_url = "https://api.firecrawl.dev/v1/scrape"

    data = json.dumps({
        "url": url,
        "formats": formats,
        "onlyMainContent": only_main
    }).encode()

    req = urllib.request.Request(
        req_url,
        data=data,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        },
        method="POST"
    )

    try:
        # Scraping JS-heavy pages can be slow; allow a generous timeout.
        with urllib.request.urlopen(req, timeout=60) as resp:
            return json.loads(resp.read().decode())
    except HTTPError as e:
        print(f"Error: {e.code} - {e.reason}", file=sys.stderr)
        print(e.read().decode(), file=sys.stderr)
        sys.exit(1)
    except URLError as e:
        # Network-level failure (DNS, refused connection, timeout) —
        # previously escaped as an unhandled traceback.
        print(f"Error: {e.reason}", file=sys.stderr)
        sys.exit(1)


def main():
    """CLI entry point: scrape one URL and print the result."""
    parser = argparse.ArgumentParser(description="Scrape a URL via Firecrawl")
    parser.add_argument("url", help="URL to scrape")
    parser.add_argument("--html", action="store_true", help="Include HTML format")
    # NOTE(review): store_true with default=True means markdown is ALWAYS
    # requested and the flag itself is a no-op; kept for CLI compatibility.
    parser.add_argument("--markdown", action="store_true", default=True, help="Include markdown format")
    parser.add_argument("--screenshot", action="store_true", help="Include screenshot")
    parser.add_argument("--json", action="store_true", help="Output raw JSON")

    args = parser.parse_args()

    formats = []
    if args.html:
        formats.append("html")
    if args.markdown:
        formats.append("markdown")
    if args.screenshot:
        formats.append("screenshot")

    result = scrape(args.url, formats)

    if args.json:
        print(json.dumps(result, indent=2))
        return

    # Pretty print results
    if result.get("success") and "data" in result:
        data = result["data"]
        print(f"Title: {data.get('metadata', {}).get('title', 'N/A')}")
        print(f"URL: {data.get('metadata', {}).get('sourceURL', args.url)}")
        print(f"\n{'='*60}\n")

        if "markdown" in data:
            print(data["markdown"])
        elif "html" in data:
            # Cap HTML output to keep terminal output manageable.
            print(data["html"][:5000])
    else:
        print("Error: Failed to scrape")
        print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Firecrawl web search script.

Searches the web through the Firecrawl /v1/search API and prints either a
human-readable result list or raw JSON.

Requires the FIRECRAWL_API_KEY environment variable.
"""
import argparse
import json
import os
import sys
import urllib.request
from urllib.error import HTTPError, URLError


def search(query: str, limit: int = 10, lang: str = "en", country: str = "us"):
    """Search the web using Firecrawl.

    Args:
        query: Search terms.
        limit: Maximum number of results to request.
        lang: Search language code (default "en"; previously hard-coded).
        country: Search country code (default "us"; previously hard-coded).

    Returns:
        Parsed JSON response from the API.

    Exits the process with status 1 when the API key is missing or the
    request fails.
    """
    api_key = os.environ.get("FIRECRAWL_API_KEY")
    if not api_key:
        print("Error: FIRECRAWL_API_KEY not set", file=sys.stderr)
        sys.exit(1)

    url = "https://api.firecrawl.dev/v1/search"

    data = json.dumps({
        "query": query,
        "limit": limit,
        "lang": lang,
        "country": country
    }).encode()

    req = urllib.request.Request(
        url,
        data=data,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        },
        method="POST"
    )

    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read().decode())
    except HTTPError as e:
        print(f"Error: {e.code} - {e.reason}", file=sys.stderr)
        print(e.read().decode(), file=sys.stderr)
        sys.exit(1)
    except URLError as e:
        # Network-level failure (DNS, refused connection, timeout) —
        # previously escaped as an unhandled traceback.
        print(f"Error: {e.reason}", file=sys.stderr)
        sys.exit(1)


def main():
    """CLI entry point: run one search and print the results."""
    parser = argparse.ArgumentParser(description="Search the web via Firecrawl")
    parser.add_argument("query", help="Search query")
    parser.add_argument("--limit", type=int, default=10, help="Max results")
    parser.add_argument("--lang", default="en", help="Search language code")
    parser.add_argument("--country", default="us", help="Search country code")
    parser.add_argument("--json", action="store_true", help="Output raw JSON")

    args = parser.parse_args()

    result = search(args.query, args.limit, args.lang, args.country)

    if args.json:
        print(json.dumps(result, indent=2))
        return

    # Pretty print results
    if result.get("success") and "data" in result:
        for item in result["data"]:
            print(f"\n{'='*60}")
            print(f"Title: {item.get('title', 'N/A')}")
            print(f"URL: {item.get('url', 'N/A')}")
            print(f"Description: {item.get('description', 'N/A')}")
            if "markdown" in item:
                content = item["markdown"][:500] + "..." if len(item["markdown"]) > 500 else item["markdown"]
                print(f"Content preview:\n{content}")
    else:
        print("No results found")
        print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()