Initial commit with translated description
This commit is contained in:
190
scripts/download_file.py
Normal file
190
scripts/download_file.py
Normal file
@@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Download files from URLs. Handles PDFs, images, documents, and any binary content.

Usage:
    python3 download_file.py <url> [--output DIR] [--filename NAME]
                             [--proxy URL] [--user-agent UA]

Flags:
    --output DIR      Directory to save to (default: /tmp/downloads)
    --filename NAME   Override filename (auto-detected from headers/URL if omitted)
    --proxy URL       Proxy URL used for both HTTP and HTTPS requests
    --user-agent UA   Override the default User-Agent string

Outputs JSON {status, path, filename, size_bytes, content_type, url}, plus
redirect_url when the request was redirected. On failure, outputs {error}.

The filename is detected from the Content-Disposition header, then the URL path,
then the Content-Type header. For PDFs, text is also extracted when pdfplumber
or PyPDF2 is installed; otherwise extracted_text is empty and a note is added.
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.parse
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def json_error(message: str) -> str:
    """Serialize *message* as the script's standard JSON error payload.

    The payload is an object with a single ``"error"`` key, pretty-printed
    with a two-space indent and with non-ASCII characters left unescaped.
    """
    payload = {"error": message}
    return json.dumps(payload, ensure_ascii=False, indent=2)
|
||||
|
||||
# Default request headers: a desktop-Chrome-on-Linux User-Agent string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
}

# File types we handle specially: MIME type -> short kind label.
# NOTE(review): not referenced by the code visible in this file — confirm usage.
TEXT_EXTRACTABLE = {"application/pdf": "pdf"}
|
||||
|
||||
|
||||
def guess_filename(url: str, resp: "requests.Response") -> str:
    """Determine a safe filename from Content-Disposition, URL, or Content-Type.

    Sources are tried in this order:

    1. ``filename*=`` in Content-Disposition (``charset'lang'percent-encoded``).
    2. ``filename=`` in Content-Disposition (quoted or bare).
    3. The last segment of the URL path, if it contains a dot.
    4. A generic ``download.<ext>`` name chosen from the Content-Type header.
    5. ``"download.bin"``.

    Header and URL values come from the remote server, so every candidate is
    reduced to a bare filename (no directory components) before it is returned.

    Args:
        url: The URL that was requested.
        resp: Response object; only ``resp.headers.get(...)`` is used.

    Returns:
        A filename with no path separators.
    """
    cd = resp.headers.get("Content-Disposition", "")
    if cd:
        # Extended form first: it carries the real (possibly non-ASCII) name.
        extended = re.search(r"filename\*\s*=\s*([^;]+)", cd, re.IGNORECASE)
        if extended:
            raw = extended.group(1).strip().strip('"')
            charset, first_quote, rest = raw.partition("'")
            _language, second_quote, encoded = rest.partition("'")
            if first_quote and second_quote:
                try:
                    decoded = urllib.parse.unquote(
                        encoded, encoding=charset or "utf-8", errors="strict"
                    )
                except (LookupError, UnicodeDecodeError):
                    # Unknown charset or bad bytes: fall back to lenient UTF-8.
                    decoded = urllib.parse.unquote(encoded)
            else:
                decoded = urllib.parse.unquote(raw)
            name = _safe_basename(decoded)
            if name:
                return name

        # Plain form; `filename\s*=` cannot match `filename*=`.
        plain = re.search(r'filename\s*=\s*(?:"([^"]*)"|([^;]*))', cd, re.IGNORECASE)
        if plain:
            value = plain.group(1) if plain.group(1) is not None else plain.group(2)
            name = _safe_basename(value.strip().strip("'\""))
            if name:
                return name

    # Extract from URL path. Decode *before* taking the basename so that an
    # encoded separator (%2F) cannot smuggle directory components through.
    parsed = urllib.parse.urlparse(url)
    url_name = _safe_basename(urllib.parse.unquote(parsed.path))
    if "." in url_name:
        return url_name

    # Fall back to content type (MIME types are case-insensitive).
    ct = resp.headers.get("Content-Type", "").lower()
    ext_map = {
        "application/pdf": "download.pdf",
        "image/png": "download.png",
        "image/jpeg": "download.jpg",
        "image/gif": "download.gif",
        "image/webp": "download.webp",
        "application/zip": "download.zip",
        "text/html": "download.html",
        "text/plain": "download.txt",
        "application/json": "download.json",
    }
    for mime, name in ext_map.items():
        if mime in ct:
            return name

    return "download.bin"


def _safe_basename(name: str) -> str:
    """Reduce *name* to a bare filename, or return "" if nothing usable is left."""
    printable = "".join(ch for ch in name if ch.isprintable())
    # Treat backslashes as separators too, so Windows-style paths are stripped.
    base = os.path.basename(printable.replace("\\", "/")).strip()
    return "" if base in {"", ".", ".."} else base
|
||||
|
||||
|
||||
def extract_pdf_text(filepath: str) -> str:
    """Try to extract text from a PDF. Returns empty string on failure.

    Backends are tried in order: pdfplumber, then PyPDF2. A backend that is
    not installed, raises while parsing (e.g. a corrupt or encrypted file),
    or yields no text is skipped and the next one is tried.

    Args:
        filepath: Path to the PDF file on disk.

    Returns:
        Page texts joined by blank lines, or "" if no backend produced text.
    """

    def _with_pdfplumber() -> str:
        import pdfplumber

        with pdfplumber.open(filepath) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        return "\n\n".join(t for t in page_texts if t)

    def _with_pypdf2() -> str:
        from PyPDF2 import PdfReader

        reader = PdfReader(filepath)
        page_texts = [page.extract_text() for page in reader.pages]
        return "\n\n".join(t for t in page_texts if t)

    for backend in (_with_pdfplumber, _with_pypdf2):
        try:
            text = backend()
        except Exception:
            # Deliberate best-effort: a missing library or an unparseable PDF
            # must not turn a successful download into a failure.
            continue
        if text:
            return text

    return ""
|
||||
|
||||
|
||||
def download(url: str, output_dir: str = "/tmp/downloads", filename: str = None,
             proxy: str = None, user_agent: str = None) -> dict:
    """Download *url* into *output_dir* and return a summary dict.

    Args:
        url: URL to fetch.
        output_dir: Directory to save into; created if missing.
        filename: Name to save as. When omitted, it is guessed from the
            response via ``guess_filename`` and reduced to a bare filename.
        proxy: Proxy URL applied to both HTTP and HTTPS, if given.
        user_agent: Replacement for the default User-Agent header, if given.

    Returns:
        Dict with ``status``, ``path``, ``filename``, ``size_bytes``,
        ``content_type`` and ``url``. Optional keys: ``redirect_url`` (final
        URL differs from *url*), ``ssl_verified``/``warning`` (certificate
        verification had to be skipped), and for PDFs ``extracted_text`` with
        either ``extracted_chars`` or ``note``.

    Raises:
        requests.exceptions.RequestException: On connection errors or a
            non-2xx status (via ``raise_for_status``).
        OSError: If the output directory or file cannot be written.
    """
    os.makedirs(output_dir, exist_ok=True)

    headers = HEADERS.copy()
    if user_agent:
        headers["User-Agent"] = user_agent

    proxies = {"http": proxy, "https": proxy} if proxy else {}

    request_kwargs = {
        "headers": headers,
        "timeout": 30,
        "stream": True,
        "allow_redirects": True,
        "proxies": proxies,
    }
    ssl_verified = True
    try:
        resp = requests.get(url, **request_kwargs)
    except requests.exceptions.SSLError:
        # Retry without SSL verification if certs are broken.
        # SECURITY: this accepts any certificate, so the content may have been
        # tampered with in transit. Kept as best-effort, but surfaced in the
        # result so callers can decide whether to trust the file.
        ssl_verified = False
        resp = requests.get(url, verify=False, **request_kwargs)

    # stream=True keeps the connection open until the body is consumed or the
    # response is closed, so always close it — including on error paths.
    try:
        resp.raise_for_status()

        if not filename:
            # The guessed name derives from server-controlled data; never let
            # it carry directory components out of output_dir.
            guessed = guess_filename(url, resp).replace("\\", "/")
            filename = os.path.basename(guessed)
            if filename in ("", ".", ".."):
                filename = "download.bin"

        out, filepath = _open_unique(os.path.join(output_dir, filename))

        # Stream to disk
        total = 0
        try:
            with out:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:
                        out.write(chunk)
                        total += len(chunk)
        except BaseException:
            # Do not leave a truncated file behind that looks like a download.
            try:
                os.remove(filepath)
            except OSError:
                pass
            raise

        content_type = resp.headers.get("Content-Type", "unknown")
        final_url = resp.url
    finally:
        resp.close()

    result = {
        "status": "downloaded",
        "path": filepath,
        "filename": os.path.basename(filepath),
        "size_bytes": total,
        "content_type": content_type,
        "url": url,
    }

    # Add redirect URL if redirected
    if final_url != url:
        result["redirect_url"] = final_url

    if not ssl_verified:
        result["ssl_verified"] = False
        result["warning"] = "TLS certificate verification failed; file was downloaded without verification."

    # Extract text from PDFs
    if "pdf" in content_type.lower() or filepath.lower().endswith(".pdf"):
        text = extract_pdf_text(filepath)
        if text:
            result["extracted_text"] = text
            result["extracted_chars"] = len(text)
        else:
            result["extracted_text"] = ""
            result["note"] = "PDF text extraction failed. Install pdfplumber or PyPDF2 for text extraction."

    return result


def _open_unique(filepath: str):
    """Open *filepath* for exclusive binary writing without overwriting.

    On a name collision, ``_1``, ``_2``, ... is inserted before the extension.
    Exclusive-create mode makes the existence check and the creation a single
    step, so concurrent downloads cannot claim the same name.

    Returns:
        A ``(file_object, final_path)`` tuple.
    """
    base, ext = os.path.splitext(filepath)
    candidate = filepath
    counter = 1
    while True:
        try:
            return open(candidate, "xb"), candidate
        except FileExistsError:
            candidate = f"{base}_{counter}{ext}"
            counter += 1
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: parse arguments, download, print JSON.

    On success the result dict is printed as JSON. On any failure a JSON
    ``{"error": ...}`` object is printed and the process exits with status 1,
    so shell callers can detect the failure without parsing the output.
    """
    parser = argparse.ArgumentParser(description="Download files from URLs")
    parser.add_argument("url", help="URL to download")
    parser.add_argument("--output", default="/tmp/downloads", help="Output directory (default: /tmp/downloads)")
    parser.add_argument("--filename", default=None, help="Override filename")
    parser.add_argument("--proxy", help="Proxy URL (e.g., http://proxy:8080)")
    parser.add_argument("--user-agent", help="Override User-Agent string")
    args = parser.parse_args()

    try:
        result = download(args.url, args.output, args.filename, args.proxy, args.user_agent)
    except Exception as e:
        # Top-level boundary: report every failure in the standard JSON error
        # shape, then signal it through the exit status.
        print(json_error(f"Download failed: {e}"))
        sys.exit(1)

    print(json.dumps(result, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user