#!/usr/bin/env python3
"""Convert public URLs to Markdown through markdown.new."""
import argparse
import json
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Dict, Optional, Tuple
DEFAULT_API_URL = "https://markdown.new/"


def build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the markdown.new converter.

    Returns:
        argparse.ArgumentParser: parser accepting one positional public URL
        plus conversion (--method, --retain-images), output (--output,
        --deliver-md), and transport (--timeout, --api-url, --show-headers)
        options.
    """
    parser = argparse.ArgumentParser(
        description="Fetch Markdown from a public URL using markdown.new"
    )
    parser.add_argument("url", help="Public URL to convert (http/https)")
    parser.add_argument(
        "--method",
        choices=["auto", "ai", "browser"],
        default="auto",
        help="Conversion method to request (default: auto)",
    )
    parser.add_argument(
        "--retain-images",
        action="store_true",
        help="Request image retention in output markdown",
    )
    parser.add_argument(
        "--output",
        help="Write markdown to this file instead of stdout",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=45.0,
        help="Request timeout in seconds (default: 45)",
    )
    parser.add_argument(
        "--api-url",
        default=DEFAULT_API_URL,
        help="markdown.new API endpoint (default: https://markdown.new/)",
    )
    parser.add_argument(
        "--show-headers",
        action="store_true",
        help="Print response headers to stderr",
    )
    parser.add_argument(
        "--deliver-md",
        action="store_true",
        help=(
            # Fix: the tag name had been stripped from this help text,
            # leaving the dangling fragment '"... tags"'. Name the actual
            # wrapper tag used by wrap_in_url_tag.
            "Write output into a .md file and wrap content in pseudo-XML "
            "<url>...</url> tags"
        ),
    )
    return parser
def validate_url(url: str) -> None:
    """Raise ValueError unless *url* is an absolute http/https URL."""
    parts = urllib.parse.urlparse(url)
    scheme_ok = parts.scheme in ("http", "https")
    if not (scheme_ok and parts.netloc):
        raise ValueError(f"Invalid URL: {url!r}. Use absolute http/https URL.")
def build_request(api_url: str, payload: Dict[str, object]) -> urllib.request.Request:
    """Create a POST request that ships *payload* as a JSON body to *api_url*."""
    request_headers = {
        "Content-Type": "application/json",
        "Accept": "text/markdown",
        "User-Agent": "codex-markdown-new-skill/1.0",
    }
    encoded = json.dumps(payload).encode("utf-8")
    return urllib.request.Request(
        api_url,
        data=encoded,
        method="POST",
        headers=request_headers,
    )
def normalize_body(body: str, headers: Dict[str, str]) -> Tuple[str, Dict[str, str]]:
    """Unwrap a JSON envelope into (markdown, metadata); pass through otherwise.

    When the response is ``application/json`` carrying a dict with a string
    ``content`` field, return that content plus selected metadata fields
    prefixed with ``response_``. Any other shape returns *body* unchanged
    with empty metadata.
    """
    passthrough = (body, {})
    if "application/json" not in headers.get("content-type", ""):
        return passthrough
    try:
        parsed = json.loads(body)
    except json.JSONDecodeError:
        return passthrough
    if not isinstance(parsed, dict):
        return passthrough
    if not isinstance(parsed.get("content"), str):
        return passthrough
    meta = {
        f"response_{field}": str(parsed[field])
        for field in ("title", "url", "method", "duration_ms", "timestamp")
        if parsed.get(field) is not None
    }
    return parsed["content"], meta
def print_metadata(
    headers: Dict[str, str], show_headers: bool, response_meta: Dict[str, str]
) -> None:
    """Report token/rate-limit counters, response metadata, and (optionally)
    every response header on stderr, keeping stdout free for markdown."""
    err = sys.stderr
    # The two quota-related headers are always surfaced when present.
    for header_name in ("x-markdown-tokens", "x-rate-limit-remaining"):
        header_value = headers.get(header_name)
        if header_value:
            print(f"{header_name}: {header_value}", file=err)
    for name, value in sorted(response_meta.items()):
        print(f"{name}: {value}", file=err)
    if show_headers:
        for name, value in sorted(headers.items()):
            print(f"{name}: {value}", file=err)
def slugify_url(url: str) -> str:
    """Derive a filesystem-safe slug from a URL's host and path."""
    pieces = urllib.parse.urlparse(url)
    # Host + path, with edge fallbacks: bare host, then the literal "page".
    base = f"{pieces.netloc}{pieces.path}".strip("/") or pieces.netloc or "page"
    cleaned = re.sub(r"[^a-zA-Z0-9._-]+", "_", base).strip("_")
    return cleaned if cleaned else "page"
def resolve_output_path(url: str, output_path: Optional[str], deliver_md: bool) -> Optional[str]:
    """Choose the destination path, honoring --output and --deliver-md.

    An explicit --output wins (gaining a .md suffix under --deliver-md when
    missing); otherwise --deliver-md derives a slug-based filename, and with
    neither flag the result is None (stdout).
    """
    if output_path:
        needs_suffix = deliver_md and not output_path.lower().endswith(".md")
        return f"{output_path}.md" if needs_suffix else output_path
    return f"{slugify_url(url)}.md" if deliver_md else None
def wrap_in_url_tag(markdown: str) -> str:
    """Wrap *markdown* in pseudo-XML ``<url>...</url>`` delimiter tags.

    Fix: the original body returned only ``f"\\n{body}\\n\\n"`` — exactly what
    remains after the literal ``<url>`` / ``</url>`` tags are stripped from
    the format string — so --deliver-md produced no wrapper at all, despite
    this function's name and the CLI help text promising pseudo-XML tags.

    Args:
        markdown: the markdown content; trailing newlines are normalized.

    Returns:
        The content framed by ``<url>`` and ``</url>`` lines, ending with a
        single trailing newline.
    """
    body = markdown.rstrip("\n")
    return f"<url>\n{body}\n</url>\n"
def write_output(markdown: str, output_path: Optional[str]) -> None:
    """Write *markdown* to *output_path*, or to stdout when no path is given."""
    if not output_path:
        sys.stdout.write(markdown)
        return
    destination = Path(output_path)
    # Only create parent directories when the target lives outside the CWD.
    if destination.parent != Path("."):
        destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(markdown, encoding="utf-8")
def main() -> int:
    """CLI entry point: validate the URL, call markdown.new, deliver output.

    Returns:
        int exit code — 0 on success, 1 on HTTP/network failure, 2 on an
        invalid URL argument.
    """
    parser = build_parser()
    args = parser.parse_args()
    try:
        validate_url(args.url)
    except ValueError as exc:
        print(str(exc), file=sys.stderr)
        return 2
    # JSON request body sent to the markdown.new endpoint.
    payload = {
        "url": args.url,
        "method": args.method,
        "retain_images": bool(args.retain_images),
    }
    req = build_request(args.api_url, payload)
    try:
        with urllib.request.urlopen(req, timeout=args.timeout) as resp:
            # Lower-case header names so later lookups are case-insensitive.
            headers = {k.lower(): v for k, v in resp.headers.items()}
            body = resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as err:
        error_body = err.read().decode("utf-8", errors="replace")
        print(f"HTTP {err.code} from markdown.new", file=sys.stderr)
        if err.code == 429:
            print(
                "Rate limit reached (documented: 500 requests/day/IP). Retry later.",
                file=sys.stderr,
            )
        if error_body:
            print(error_body.strip(), file=sys.stderr)
        return 1
    except urllib.error.URLError as err:
        print(f"Request failed: {err.reason}", file=sys.stderr)
        return 1
    markdown, response_meta = normalize_body(body, headers)
    # All diagnostics go to stderr so stdout carries only markdown.
    print_metadata(headers, args.show_headers, response_meta)
    content = wrap_in_url_tag(markdown) if args.deliver_md else markdown
    output_path = resolve_output_path(args.url, args.output, args.deliver_md)
    write_output(content, output_path)
    if args.deliver_md and output_path:
        # Report where the delivered .md file landed (absolute path).
        print(f"output_file: {Path(output_path).resolve()}", file=sys.stderr)
    return 0
if __name__ == "__main__":
    # Propagate main()'s integer exit code to the shell.
    raise SystemExit(main())