Initial commit with translated description

This commit is contained in:
2026-03-29 08:32:38 +08:00
commit a99d29da33
6 changed files with 416 additions and 0 deletions

89
scripts/scrape.py Normal file
View File

@@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""Firecrawl scrape script for single URLs."""
import argparse
import json
import os
import sys
import urllib.request
from urllib.error import HTTPError
def scrape(url: str, formats: list = None, only_main: bool = True):
    """Scrape a single URL through the Firecrawl v1 scrape endpoint.

    Args:
        url: Page to scrape.
        formats: Output formats to request. ``None`` or an empty list falls
            back to ``["markdown"]``.
        only_main: Sent as ``onlyMainContent`` in the request payload.

    Returns:
        The decoded JSON response body.

    Exits:
        Writes a message to stderr and terminates the process with status 1
        when ``FIRECRAWL_API_KEY`` is unset, the API answers with an HTTP
        error, the request fails at the network level, or the response body
        is not valid JSON.
    """
    api_key = os.environ.get("FIRECRAWL_API_KEY")
    if not api_key:
        print("Error: FIRECRAWL_API_KEY not set", file=sys.stderr)
        sys.exit(1)

    payload = json.dumps({
        "url": url,
        "formats": formats or ["markdown"],
        "onlyMainContent": only_main,
    }).encode()
    req = urllib.request.Request(
        "https://api.firecrawl.dev/v1/scrape",
        data=payload,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        method="POST",
    )

    try:
        with urllib.request.urlopen(req, timeout=60) as resp:
            body = resp.read()
    except HTTPError as e:
        print(f"Error: {e.code} - {e.reason}", file=sys.stderr)
        # errors="replace": a non-UTF-8 error page must not mask the HTTP failure.
        print(e.read().decode(errors="replace"), file=sys.stderr)
        sys.exit(1)
    except OSError as e:
        # URLError (DNS failure, refused connection) and socket timeouts are
        # OSError subclasses; report them the same way as HTTP errors instead
        # of letting a traceback escape.
        print(f"Error: request failed - {e}", file=sys.stderr)
        sys.exit(1)

    try:
        return json.loads(body.decode())
    except ValueError as e:
        # JSONDecodeError and UnicodeDecodeError both derive from ValueError.
        print(f"Error: invalid JSON response - {e}", file=sys.stderr)
        sys.exit(1)
def main():
    """Command-line entry point: scrape one URL and print the result.

    Parses the command line, builds the list of requested formats from the
    ``--html`` / ``--markdown`` / ``--screenshot`` flags, and calls
    ``scrape``. With ``--json`` the raw response is dumped as indented JSON.
    Otherwise the title and source URL from the response metadata are printed,
    followed by the markdown content (or, if no markdown is present, the first
    5000 characters of the HTML).

    Markdown is requested by default; ``--no-markdown`` turns it off so that
    HTML-only or screenshot-only requests are possible.

    Exits:
        Status 2 (via ``parser.error``) when every format has been disabled.
        Status 1, with the response written to stderr, when the response does
        not report success or carries no ``data``.
    """
    parser = argparse.ArgumentParser(description="Scrape a URL via Firecrawl")
    parser.add_argument("url", help="URL to scrape")
    parser.add_argument("--html", action="store_true", help="Include HTML format")
    parser.add_argument("--markdown", action="store_true", default=True, help="Include markdown format")
    # --markdown defaults to True, so without this switch markdown could never
    # be excluded and the HTML-only output path below was unreachable.
    parser.add_argument("--no-markdown", dest="markdown", action="store_false", help="Do not request markdown format")
    parser.add_argument("--screenshot", action="store_true", help="Include screenshot")
    parser.add_argument("--json", action="store_true", help="Output raw JSON")
    args = parser.parse_args()

    formats = []
    if args.html:
        formats.append("html")
    if args.markdown:
        formats.append("markdown")
    if args.screenshot:
        formats.append("screenshot")
    if not formats:
        # An empty list would be silently replaced by markdown inside scrape(),
        # contradicting --no-markdown; reject it explicitly instead.
        parser.error("at least one of --html, --markdown or --screenshot is required")

    result = scrape(args.url, formats)

    if args.json:
        print(json.dumps(result, indent=2))
        return

    if not (result.get("success") and "data" in result):
        # Report failures on stderr with a non-zero status so that shell
        # callers can detect them.
        print("Error: Failed to scrape", file=sys.stderr)
        print(json.dumps(result, indent=2), file=sys.stderr)
        sys.exit(1)

    # Pretty print results
    data = result["data"]
    metadata = data.get("metadata", {})
    print(f"Title: {metadata.get('title', 'N/A')}")
    print(f"URL: {metadata.get('sourceURL', args.url)}")
    print(f"\n{'='*60}\n")
    if "markdown" in data:
        print(data["markdown"])
    elif "html" in data:
        # Only the first 5000 characters of the HTML are shown.
        print(data["html"][:5000])
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()