Initial commit with translated description

2026-03-29 10:22:50 +08:00
commit 84d54a6ee9
7 changed files with 1596 additions and 0 deletions
--- a/scripts/download_file.py
+++ b/scripts/download_file.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+"""Download files from URLs. Handles PDFs, images, documents, and any binary content.
+
+Usage:
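+    python3 download_file.py <url> [--output DIR] [--filename NAME]
+                                   [--proxy URL] [--user-agent STRING]
+
+Flags:
+    --output DIR          Directory to save to (default: /tmp/downloads)
+    --filename NAME       Override filename (auto-detected from URL/headers if omitted)
+    --proxy URL           Proxy URL used for both HTTP and HTTPS (e.g., http://proxy:8080)
+    --user-agent STRING   Override the default User-Agent string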
+
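+Outputs JSON {status, path, filename, size_bytes, content_type, url} on success
+(plus redirect_url when the request was redirected), or {error} on failure.
+Detects file type from the Content-Type header and URL. For PDFs, also adds
+extracted_text when text can be extracted (tries pdfplumber, then PyPDF2; if
+neither is installed the text is left empty and a note is added).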
+"""
+
+import argparse
+import json
+import os
+import re
+import sys
+import urllib.parse
+
+import requests
+
+
+def json_error(message: str) -> str:
+    """Serialize *message* as the script's standard ``{"error": ...}`` JSON payload."""
+    payload = {"error": message}
+    return json.dumps(payload, ensure_ascii=False, indent=2)
+
+# Default HTTP request headers. download() works on a copy of this dict and
+# replaces "User-Agent" only when the caller supplies one.
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+}
+
+# File types we handle specially
+# Maps a MIME type to a short type tag.
+# NOTE(review): nothing in the visible part of this file reads this mapping
+# (PDF detection in download() uses substring checks) — confirm it is still needed.
+TEXT_EXTRACTABLE = {
+    "application/pdf": "pdf",
+}
+
+
+def _strip_path_components(name: str) -> str:
+    """Reduce *name* to a bare filename, or "" if nothing usable remains."""
+    # Treat backslashes as separators too so Windows-style paths are flattened.
+    leaf = os.path.basename(name.strip().replace("\\", "/"))
+    return "" if leaf in ("", ".", "..") else leaf
+
+
+def guess_filename(url: str, resp: requests.Response) -> str:
+    """Determine a safe filename from Content-Disposition, URL, or Content-Type.
+
+    Sources are tried in order; the first one that yields a usable name wins:
+      1. ``filename*=`` (extended, percent-encoded form) in Content-Disposition
+      2. ``filename=`` in Content-Disposition
+      3. the last path segment of *url*, if it contains a dot
+      4. a fixed ``download.<ext>`` name chosen from Content-Type
+      5. ``download.bin``
+
+    The header and URL are controlled by the remote side, so directory
+    components (``../``, absolute paths, backslashes) are always stripped and
+    only the final path component is returned.
+
+    Args:
+        url: The URL that was requested.
+        resp: The response; only ``resp.headers`` is read.
+
+    Returns:
+        A filename without any directory part.
+    """
+    cd = resp.headers.get("Content-Disposition", "")
+
+    # Extended form: filename*=charset'language'percent-encoded-value.
+    # It must be checked before the plain form, otherwise the charset token
+    # (e.g. "UTF-8") would be mistaken for the filename.
+    extended = re.search(r"filename\*=([^';]*)'[^']*'([^;]+)", cd)
+    if extended:
+        charset = extended.group(1).strip() or "utf-8"
+        encoded = extended.group(2).strip().strip('"')
+        try:
+            decoded = urllib.parse.unquote(encoded, encoding=charset, errors="replace")
+        except LookupError:
+            # Unknown charset label: fall back to the UTF-8 default.
+            decoded = urllib.parse.unquote(encoded)
+        name = _strip_path_components(decoded)
+        if name:
+            return name
+
+    # Plain form: filename="name" (quotes optional).
+    plain = re.search(r'filename=["\']?([^"\';]+)', cd)
+    if plain:
+        name = _strip_path_components(plain.group(1))
+        if name:
+            return name
+
+    # Extract from URL path
+    path_name = os.path.basename(urllib.parse.urlparse(url).path)
+    if path_name and "." in path_name:
+        # Percent-decoding can reintroduce "/" (from %2F), so strip again afterwards.
+        name = _strip_path_components(urllib.parse.unquote(path_name))
+        if name:
+            return name
+
+    # Fall back to content type (media types are case-insensitive)
+    ct = resp.headers.get("Content-Type", "").lower()
+    ext_map = {
+        "application/pdf": "download.pdf",
+        "image/png": "download.png",
+        "image/jpeg": "download.jpg",
+        "image/gif": "download.gif",
+        "image/webp": "download.webp",
+        "application/zip": "download.zip",
+        "text/html": "download.html",
+        "text/plain": "download.txt",
+        "application/json": "download.json",
+    }
+    for mime, name in ext_map.items():
+        if mime in ct:
+            return name
+
+    return "download.bin"
+
+
+def extract_pdf_text(filepath: str) -> str:
+    """Try to extract text from a PDF. Returns empty string on failure.
+
+    Backends are tried in order: pdfplumber, then PyPDF2. A backend is skipped
+    when it is not installed, when it raises while reading the file (for
+    example on a corrupt or encrypted PDF), or when it yields no text.
+
+    Args:
+        filepath: Path of the PDF file on disk.
+
+    Returns:
+        Page texts joined by blank lines, or "" if no backend produced text.
+    """
+    # Try pdfplumber first
+    try:
+        import pdfplumber
+    except ImportError:
+        pdfplumber = None
+
+    if pdfplumber is not None:
+        try:
+            with pdfplumber.open(filepath) as pdf:
+                pages = [page.extract_text() for page in pdf.pages]
+            text = "\n\n".join(t for t in pages if t)
+            if text:
+                return text
+        except Exception:
+            # Extraction is best-effort: a parser error must not fail the
+            # caller, so fall through to the next backend.
+            pass
+
+    # Try PyPDF2
+    try:
+        from PyPDF2 import PdfReader
+    except ImportError:
+        PdfReader = None
+
+    if PdfReader is not None:
+        try:
+            reader = PdfReader(filepath)
+            pages = [page.extract_text() for page in reader.pages]
+            return "\n\n".join(t for t in pages if t)
+        except Exception:
+            # Same best-effort contract as above.
+            pass
+
+    return ""
+
+
+def _open_without_clobbering(filepath: str):
+    """Create a new binary file at *filepath*, appending _1, _2, ... if it is taken.
+
+    Returns the open file object and the path that was actually created.
+    """
+    base, ext = os.path.splitext(filepath)
+    candidate = filepath
+    counter = 1
+    while True:
+        try:
+            # Mode "x" fails if the path already exists, so checking and
+            # creating happen in one step instead of exists()-then-open().
+            return open(candidate, "xb"), candidate
+        except FileExistsError:
+            candidate = f"{base}_{counter}{ext}"
+            counter += 1
+
+
+def download(url: str, output_dir: str = "/tmp/downloads", filename: str = None,
+            proxy: str = None, user_agent: str = None) -> dict:
+    """Download *url* into *output_dir* and describe the result.
+
+    Args:
+        url: URL to fetch (redirects are followed).
+        output_dir: Directory to save into; created if missing.
+        filename: Name to save as. When omitted, guess_filename() picks one
+            and any directory components in the guess are discarded.
+        proxy: Proxy URL applied to both http and https requests.
+        user_agent: Replacement for the default User-Agent header.
+
+    Returns:
+        A dict with "status", "path", "filename", "size_bytes", "content_type"
+        and "url". Optional keys: "redirect_url" (final URL differs from *url*),
+        "ssl_verified" (False when the certificate check had to be skipped),
+        and for PDFs "extracted_text" plus "extracted_chars" or "note".
+
+    Raises:
+        requests.exceptions.RequestException: On connection errors or a
+            non-success HTTP status (via ``raise_for_status``).
+        OSError: If the output directory or file cannot be created or written.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+
+    headers = HEADERS.copy()
+    if user_agent:
+        headers["User-Agent"] = user_agent
+
+    proxies = {}
+    if proxy:
+        proxies = {"http": proxy, "https": proxy}
+
+    ssl_verified = True
+    try:
+        resp = requests.get(url, headers=headers, timeout=30, stream=True,
+                           allow_redirects=True, proxies=proxies)
+    except requests.exceptions.SSLError:
+        # Retry without SSL verification if certs are broken.
+        # SECURITY: this accepts any certificate, so the content may come from
+        # an impersonated host. The downgrade is reported in the result.
+        ssl_verified = False
+        resp = requests.get(url, headers=headers, timeout=30, stream=True,
+                           allow_redirects=True, proxies=proxies, verify=False)
+
+    # A streamed response keeps its connection until closed; the with-block
+    # releases it on every exit path, including raise_for_status() failures.
+    with resp:
+        resp.raise_for_status()
+
+        if not filename:
+            # The guess comes from remote-controlled data: keep only the last
+            # path component so it cannot escape output_dir.
+            guessed = guess_filename(url, resp).replace("\\", "/")
+            filename = os.path.basename(guessed)
+            if filename in ("", ".", ".."):
+                filename = "download.bin"
+
+        # Avoid overwriting — add suffix if exists
+        out_file, filepath = _open_without_clobbering(os.path.join(output_dir, filename))
+
+        # Stream to disk
+        total = 0
+        try:
+            with out_file:
+                for chunk in resp.iter_content(chunk_size=8192):
+                    out_file.write(chunk)
+                    total += len(chunk)
+        except BaseException:
+            # Do not leave a truncated file behind that looks like a finished download.
+            try:
+                os.remove(filepath)
+            except OSError:
+                pass
+            raise
+
+        content_type = resp.headers.get("Content-Type", "unknown")
+        final_url = resp.url
+
+    result = {
+        "status": "downloaded",
+        "path": filepath,
+        "filename": os.path.basename(filepath),
+        "size_bytes": total,
+        "content_type": content_type,
+        "url": url,
+    }
+
+    if not ssl_verified:
+        result["ssl_verified"] = False
+
+    # Add redirect URL if redirected
+    if final_url != url:
+        result["redirect_url"] = final_url
+
+    # Extract text from PDFs
+    if "pdf" in content_type.lower() or filepath.lower().endswith(".pdf"):
+        text = extract_pdf_text(filepath)
+        if text:
+            result["extracted_text"] = text
+            result["extracted_chars"] = len(text)
+        else:
+            result["extracted_text"] = ""
+            result["note"] = "PDF text extraction failed. Install pdfplumber or PyPDF2 for text extraction."
+
+    return result
+
+
+def main():
+    """CLI entry point: parse arguments, run the download, print a JSON report.
+
+    On success the result dict from download() is printed as JSON. On failure
+    a ``{"error": ...}`` JSON object is printed to stdout and the process exits
+    with status 1 so shell callers can detect the failure.
+    """
+    parser = argparse.ArgumentParser(description="Download files from URLs")
+    parser.add_argument("url", help="URL to download")
+    parser.add_argument("--output", default="/tmp/downloads", help="Output directory (default: /tmp/downloads)")
+    parser.add_argument("--filename", default=None, help="Override filename")
+    parser.add_argument("--proxy", help="Proxy URL (e.g., http://proxy:8080)")
+    parser.add_argument("--user-agent", help="Override User-Agent string")
+    args = parser.parse_args()
+
+    try:
+        result = download(args.url, args.output, args.filename, args.proxy, args.user_agent)
+        print(json.dumps(result, indent=2, ensure_ascii=False))
+    except Exception as e:
+        # Top-level boundary: report any failure in the same JSON channel the
+        # caller parses, then signal it through the exit status as well.
+        print(json_error(f"Download failed: {str(e)}"))
+        sys.exit(1)
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()