Initial commit with translated description
This commit is contained in:
89
scripts/scrape.py
Normal file
89
scripts/scrape.py
Normal file
@@ -0,0 +1,89 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Firecrawl scrape script for single URLs."""
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
from urllib.error import HTTPError
|
||||
|
||||
|
||||
def scrape(url: str, formats: list = None, only_main: bool = True):
|
||||
"""Scrape a URL using Firecrawl."""
|
||||
api_key = os.environ.get("FIRECRAWL_API_KEY")
|
||||
if not api_key:
|
||||
print("Error: FIRECRAWL_API_KEY not set", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
formats = formats or ["markdown"]
|
||||
|
||||
req_url = "https://api.firecrawl.dev/v1/scrape"
|
||||
|
||||
data = json.dumps({
|
||||
"url": url,
|
||||
"formats": formats,
|
||||
"onlyMainContent": only_main
|
||||
}).encode()
|
||||
|
||||
req = urllib.request.Request(
|
||||
req_url,
|
||||
data=data,
|
||||
headers={
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
method="POST"
|
||||
)
|
||||
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=60) as resp:
|
||||
result = json.loads(resp.read().decode())
|
||||
return result
|
||||
except HTTPError as e:
|
||||
print(f"Error: {e.code} - {e.reason}", file=sys.stderr)
|
||||
print(e.read().decode(), file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Scrape a URL via Firecrawl")
|
||||
parser.add_argument("url", help="URL to scrape")
|
||||
parser.add_argument("--html", action="store_true", help="Include HTML format")
|
||||
parser.add_argument("--markdown", action="store_true", default=True, help="Include markdown format")
|
||||
parser.add_argument("--screenshot", action="store_true", help="Include screenshot")
|
||||
parser.add_argument("--json", action="store_true", help="Output raw JSON")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
formats = []
|
||||
if args.html:
|
||||
formats.append("html")
|
||||
if args.markdown:
|
||||
formats.append("markdown")
|
||||
if args.screenshot:
|
||||
formats.append("screenshot")
|
||||
|
||||
result = scrape(args.url, formats)
|
||||
|
||||
if args.json:
|
||||
print(json.dumps(result, indent=2))
|
||||
return
|
||||
|
||||
# Pretty print results
|
||||
if result.get("success") and "data" in result:
|
||||
data = result["data"]
|
||||
print(f"Title: {data.get('metadata', {}).get('title', 'N/A')}")
|
||||
print(f"URL: {data.get('metadata', {}).get('sourceURL', args.url)}")
|
||||
print(f"\n{'='*60}\n")
|
||||
|
||||
if "markdown" in data:
|
||||
print(data["markdown"])
|
||||
elif "html" in data:
|
||||
print(data["html"][:5000])
|
||||
else:
|
||||
print("Error: Failed to scrape")
|
||||
print(json.dumps(result, indent=2))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user