Files
liranudi_web-pilot/scripts/download_file.py

191 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""Download files from URLs. Handles PDFs, images, documents, and any binary content.
Usage:
python3 download_file.py <url> [--output DIR] [--filename NAME]
Flags:
--output DIR Directory to save to (default: /tmp/downloads)
--filename NAME Override filename (auto-detected from URL/headers if omitted)
--proxy URL Route the request through an HTTP(S) proxy (optional)
--user-agent UA Override the default User-Agent string (optional)
Outputs JSON {status, path, filename, size_bytes, content_type}.
Detects file type from Content-Type header and URL. For PDFs, also extracts
text if possible (requires pdfplumber or falls back to basic extraction).
"""
import argparse
import json
import os
import re
import sys
import urllib.parse
import requests
def json_error(message: str) -> str:
    """Serialize *message* into the tool's standard JSON error envelope."""
    payload = {"error": message}
    return json.dumps(payload, indent=2, ensure_ascii=False)
# Default request headers: a desktop-browser User-Agent, since some servers
# refuse or alter responses for obvious script clients.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
# File types we handle specially
# Maps MIME type -> short tag for content whose text we can extract.
# NOTE(review): not referenced elsewhere in this file's visible code —
# download() checks for "pdf" in the Content-Type string directly; confirm
# whether an external caller uses this table before removing it.
TEXT_EXTRACTABLE = {
    "application/pdf": "pdf",
}
def guess_filename(url: str, resp: "requests.Response") -> str:
    """Determine a safe filename from Content-Disposition, the URL path, or Content-Type.

    Args:
        url: The URL that was requested.
        resp: The response whose headers are consulted (only ``resp.headers``
            is read, so any object with a dict-like ``headers`` works).

    Returns:
        A bare filename with no directory components. Every candidate is
        passed through ``os.path.basename`` so a hostile header such as
        ``Content-Disposition: filename=../../x`` cannot escape the
        output directory.
    """
    cd = resp.headers.get("Content-Disposition", "")
    if "filename" in cd:
        # RFC 5987 extended form first: filename*=UTF-8''percent%20encoded.ext
        # (the old regex captured the whole "charset''value" string verbatim).
        m = re.search(r"filename\*\s*=\s*[^']*'[^']*'([^;]+)", cd)
        if m:
            name = urllib.parse.unquote(m.group(1).strip().strip("\"'"))
            name = os.path.basename(name)
            if name:
                return name
        # Plain form: filename="name.ext" or filename=name.ext
        m = re.search(r'filename\s*=\s*["\']?([^"\';]+)', cd)
        if m:
            name = os.path.basename(m.group(1).strip())
            if name:
                return name
    # Fall back to the last path segment of the URL, if it looks like a file.
    parsed = urllib.parse.urlparse(url)
    path_name = os.path.basename(parsed.path)
    if path_name and "." in path_name:
        # basename again after unquoting: %2F would reintroduce separators.
        return os.path.basename(urllib.parse.unquote(path_name))
    # Last resort: derive an extension from the Content-Type.
    ct = resp.headers.get("Content-Type", "")
    ext_map = {
        "application/pdf": "download.pdf",
        "image/png": "download.png",
        "image/jpeg": "download.jpg",
        "image/gif": "download.gif",
        "image/webp": "download.webp",
        "application/zip": "download.zip",
        "text/html": "download.html",
        "text/plain": "download.txt",
        "application/json": "download.json",
    }
    for mime, name in ext_map.items():
        if mime in ct:
            return name
    return "download.bin"
def extract_pdf_text(filepath: str) -> str:
    """Best-effort text extraction from a PDF. Returns "" on any failure.

    Tries pdfplumber first, then PyPDF2. The original only caught
    ImportError, so a corrupt or unreadable PDF raised through despite the
    documented "empty string on failure" contract; parse errors are now
    swallowed too, falling through to the next backend.
    """
    # Backend 1: pdfplumber.
    try:
        import pdfplumber
        with pdfplumber.open(filepath) as pdf:
            parts = [t for t in (page.extract_text() for page in pdf.pages) if t]
        return "\n\n".join(parts)
    except Exception:
        # Library missing or file unparseable — try the next backend.
        pass
    # Backend 2: PyPDF2.
    try:
        from PyPDF2 import PdfReader
        reader = PdfReader(filepath)
        parts = [t for t in (page.extract_text() for page in reader.pages) if t]
        return "\n\n".join(parts)
    except Exception:
        pass
    return ""
def download(url: str, output_dir: str = "/tmp/downloads", filename: str = None,
             proxy: str = None, user_agent: str = None) -> dict:
    """Download *url* into *output_dir* and return a JSON-ready result dict.

    Args:
        url: The URL to fetch.
        output_dir: Directory to save into (created if missing).
        filename: Override the auto-detected filename; only its basename
            is used.
        proxy: Optional proxy URL applied to both http and https.
        user_agent: Optional User-Agent override.

    Returns:
        dict with status, path, filename, size_bytes, content_type, url,
        plus redirect_url / extracted_text keys when applicable.

    Raises:
        requests.HTTPError: on non-2xx responses.
        requests.RequestException: on network-level failures.
    """
    os.makedirs(output_dir, exist_ok=True)
    headers = HEADERS.copy()
    if user_agent:
        headers["User-Agent"] = user_agent
    proxies = {"http": proxy, "https": proxy} if proxy else {}
    try:
        resp = requests.get(url, headers=headers, timeout=30, stream=True,
                            allow_redirects=True, proxies=proxies)
    except requests.exceptions.SSLError:
        # SECURITY: retrying with verify=False disables TLS certificate
        # checking and allows man-in-the-middle interception. Kept as the
        # original's deliberate best-effort fallback for broken certs, but
        # it only fires after a verified request has already failed.
        resp = requests.get(url, headers=headers, timeout=30, stream=True,
                            allow_redirects=True, proxies=proxies, verify=False)
    try:
        resp.raise_for_status()
        if not filename:
            filename = guess_filename(url, resp)
        # Keep only the basename so a hostile Content-Disposition header or
        # a --filename value with directory parts cannot write outside
        # output_dir.
        filename = os.path.basename(filename) or "download.bin"
        filepath = os.path.join(output_dir, filename)
        # Avoid overwriting — append a numeric suffix until the name is free.
        base, ext = os.path.splitext(filepath)
        counter = 1
        while os.path.exists(filepath):
            filepath = f"{base}_{counter}{ext}"
            counter += 1
        # Stream to disk in 8 KiB chunks so large files never sit in memory.
        total = 0
        with open(filepath, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
                total += len(chunk)
        content_type = resp.headers.get("Content-Type", "unknown")
    finally:
        # Always release the streamed connection — the original leaked it,
        # including when raise_for_status() aborted the download.
        resp.close()
    result = {
        "status": "downloaded",
        "path": filepath,
        "filename": os.path.basename(filepath),
        "size_bytes": total,
        "content_type": content_type,
        "url": url,
    }
    # Surface the final URL if the server redirected us.
    if resp.url != url:
        result["redirect_url"] = resp.url
    # Best-effort text extraction for PDFs (by header or by extension).
    if "pdf" in content_type.lower() or filepath.lower().endswith(".pdf"):
        text = extract_pdf_text(filepath)
        if text:
            result["extracted_text"] = text
            result["extracted_chars"] = len(text)
        else:
            result["extracted_text"] = ""
            result["note"] = "PDF text extraction failed. Install pdfplumber or PyPDF2 for text extraction."
    return result
def main():
    """CLI entry point: parse arguments, run the download, print JSON."""
    parser = argparse.ArgumentParser(description="Download files from URLs")
    parser.add_argument("url", help="URL to download")
    parser.add_argument("--output", default="/tmp/downloads", help="Output directory (default: /tmp/downloads)")
    parser.add_argument("--filename", default=None, help="Override filename")
    parser.add_argument("--proxy", help="Proxy URL (e.g., http://proxy:8080)")
    parser.add_argument("--user-agent", help="Override User-Agent string")
    args = parser.parse_args()
    try:
        result = download(args.url, args.output, args.filename, args.proxy, args.user_agent)
        print(json.dumps(result, indent=2, ensure_ascii=False))
    except Exception as e:
        # Emit the standardized JSON error AND exit non-zero — the original
        # exited 0 on failure, so shell/CI callers could not detect errors
        # (this is also the only use of the file-level `sys` import).
        print(json_error(f"Download failed: {str(e)}"))
        sys.exit(1)


if __name__ == "__main__":
    main()