commit c027104b0b719d67d68ae99da41dd48ce9327c62 Author: zlei9 Date: Sun Mar 29 10:19:02 2026 +0800 Initial commit with translated description diff --git a/README.md b/README.md new file mode 100644 index 0000000..e8c1999 --- /dev/null +++ b/README.md @@ -0,0 +1,83 @@ +# Markdown.new Skill + +Single-skill repository for `markdown-new` - official Cloudflare URL-to-Markdown service ([markdown.new](https://markdown.new/)) converted into a skill. + +Skill entrypoint: +- `markdown-new/SKILL.md` + +## What It Does + +`markdown-new` converts public web pages into LLM-ready Markdown using [markdown.new](https://markdown.new), with: +- URL-to-Markdown conversion for summarization, extraction, RAG, and archiving +- conversion fallback control (`auto`, `ai`, `browser`) +- optional image retention +- optional wrapped delivery mode for downstream parsing + +## Path Resolution (Important) + +- Relative paths such as `scripts/markdown_new_fetch.py` are relative to the skill directory. +- Do not run `python3 scripts/markdown_new_fetch.py ...` from workspace root unless `scripts/` exists there. +- Safe command from any current directory: + +```bash +python3 ~/.codex/skills/markdown-new/scripts/markdown_new_fetch.py 'https://example.com' +``` + +## Modes + +### Conversion Modes (`--method`) +- `auto`: default pipeline, fastest successful path +- `ai`: force Workers AI conversion path +- `browser`: force Browser Rendering for JS-heavy pages + +### Output Modes +- default: print Markdown to stdout +- `--output <path>`: write Markdown to file +- `--deliver-md`: write `.md` output with wrapped content; useful for reasoning LLMs on long reads because it reduces format confusion: + +```text +<url> +...markdown... +</url> +``` + +If `--deliver-md` is used without `--output`, filename is auto-generated from the URL. + +## How It Works + +1. Validate the input URL (`http/https`). +2. Call `POST https://markdown.new/` with `url`, `method`, and `retain_images`. +3. 
Accept response as either raw markdown or JSON with markdown in `content`. +4. Normalize metadata and choose output behavior. +5. Return stdout by default, `--output` for files, and `--deliver-md` for wrapped `.md` packets. + +## Install Paths + +- Codex (macOS/Linux): `~/.codex/skills/markdown-new` +- Claude Code (macOS/Linux): `~/.claude/skills/markdown-new` + +## Install on macOS/Linux (single command) + +### Codex + +```bash +mkdir -p ~/.codex/skills && rm -rf ~/.codex/skills/markdown-new && cp -R /Users/pro16/Dropbox/experiments/skills-i-use/markdown-new ~/.codex/skills/ +``` + +### Claude Code + +```bash +mkdir -p ~/.claude/skills && rm -rf ~/.claude/skills/markdown-new && cp -R /Users/pro16/Dropbox/experiments/skills-i-use/markdown-new ~/.claude/skills/ +``` + +## Quick Usage + +```bash +python3 scripts/markdown_new_fetch.py 'https://example.com' +python3 scripts/markdown_new_fetch.py 'https://example.com' --method browser --retain-images --output page.md +python3 scripts/markdown_new_fetch.py 'https://example.com' --deliver-md +``` + +## Credits + +- `webservervis` for the markdown conversion service powering this skill. diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..62d1ec5 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,90 @@ +--- +name: markdown-new +description: "使用markdown.new将公共网页转换为干净的Markdown,用于AI工作流。" +--- + +# Markdown.new + +Use this skill to convert public URLs into LLM-ready Markdown via [markdown.new](https://markdown.new). + +## Path Resolution (Critical) + +- Resolve relative paths like `scripts/...` and `references/...` from the skill directory, not workspace root. +- If current directory is unknown, use an absolute script path. 
+ +```bash +python3 ~/.codex/skills/markdown-new/scripts/markdown_new_fetch.py 'https://example.com' +``` + +```bash +cd ~/.codex/skills/markdown-new +python3 scripts/markdown_new_fetch.py 'https://example.com' +``` + +Avoid this pattern from an arbitrary workspace root: + +```bash +python3 scripts/markdown_new_fetch.py 'https://example.com' +``` + +## Workflow + +1. Validate the input URL is public `http` or `https`. +2. Run `scripts/markdown_new_fetch.py` with `--method auto` first. +3. Re-run with `--method browser` if output misses JS-rendered content. +4. Enable `--retain-images` only when image links are required. +5. Capture response metadata (`x-markdown-tokens`, `x-rate-limit-remaining`, and JSON metadata when present) for downstream planning. + +## Quick Start + +Commands below assume current directory is the skill root (`~/.codex/skills/markdown-new`). + +```bash +python3 scripts/markdown_new_fetch.py 'https://example.com' > page.md +``` + +```bash +python3 scripts/markdown_new_fetch.py 'https://example.com' --method browser --retain-images --output page.md +``` + +```bash +python3 scripts/markdown_new_fetch.py 'https://example.com' --deliver-md +``` + +## Method Selection + +- `auto`: default. Let markdown.new use its fastest successful pipeline. +- `ai`: force Workers AI HTML-to-Markdown conversion. +- `browser`: force headless browser rendering for JS-heavy pages. + +Use `auto` first, then retry with `browser` only when needed. + +## Delivery Mode + +- Use `--deliver-md` to force file output in `.md` format. +- In delivery mode, content is wrapped as: + - `<url>` + - `...markdown...` + - `</url>` +- If `--output` is omitted, the script auto-generates a filename from the URL. 
+ +## API Modes + +- Prefix mode: + - `https://markdown.new/https://example.com?method=browser&retain_images=true` +- POST mode: + - `POST https://markdown.new/` + - JSON body: `{"url":"https://example.com","method":"auto","retain_images":false}` + +Prefer POST mode for automation and explicit parameters. + +## Limits And Safety + +- Treat `429` as rate limiting (documented limit: 500 requests/day/IP). +- Convert only publicly accessible pages. +- Respect `robots.txt`, terms of service, and copyright constraints. +- Do not treat markdown.new output as guaranteed complete for every page; verify critical extractions. + +## References + +- `references/markdown-new-api.md` diff --git a/_meta.json b/_meta.json new file mode 100644 index 0000000..48d3d93 --- /dev/null +++ b/_meta.json @@ -0,0 +1,6 @@ +{ + "ownerId": "kn7a3xe57dnqxgg5kfp78twfj98022a5", + "slug": "markdown-convert", + "version": "1.0.0", + "publishedAt": 1771528764013 +} \ No newline at end of file diff --git a/agents/openai.yaml b/agents/openai.yaml new file mode 100644 index 0000000..d292299 --- /dev/null +++ b/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Markdown.new" + short_description: "Convert web URLs to LLM-ready Markdown" + default_prompt: "Use $markdown-new to fetch clean Markdown from public URLs for summarization, RAG, or extraction tasks." 
diff --git a/references/markdown-new-api.md b/references/markdown-new-api.md new file mode 100644 index 0000000..a90e77b --- /dev/null +++ b/references/markdown-new-api.md @@ -0,0 +1,56 @@ +# markdown.new API Reference Notes + +## Endpoints + +- Prefix conversion: `GET https://markdown.new/<url>` +- API conversion: `POST https://markdown.new/` + +## POST Request + +- Content type: `application/json` +- Body fields: + - `url` (string, required): public URL to convert + - `method` (string, optional): `auto` (default), `ai`, or `browser` + - `retain_images` (boolean, optional): `false` (default) + +Example: + +```json +{ + "url": "https://example.com", + "method": "auto", + "retain_images": false +} +``` + +## Response + +- Status: `200` on success +- Prefix mode typically returns Markdown (`text/markdown`) +- POST mode may return JSON (`application/json`) with Markdown in `content` +- Common headers: + - `x-markdown-tokens`: estimated token count for the returned Markdown + - `x-rate-limit-remaining`: remaining requests for current daily quota + +## Conversion Pipeline (as documented) + +1. Request native Markdown via `Accept: text/markdown` +2. Fall back to Workers AI `toMarkdown()` when HTML is returned +3. Fall back to Browser Rendering for JS-heavy pages + +## Operational Notes + +- Documented rate limit: 500 requests/day per IP +- `429` indicates rate-limit exhaustion +- Public URLs only; authenticated/paywalled pages may fail +- Browser rendering usually adds latency compared with `auto`/`ai` + +## Skill Script Notes + +- `scripts/markdown_new_fetch.py --deliver-md` writes a `.md` file and wraps the markdown body with pseudo-XML tags: + +```text + +...markdown... 
#!/usr/bin/env python3
# File: scripts/markdown_new_fetch.py
"""Convert public URLs to Markdown through markdown.new.

The script POSTs a JSON payload (``url``, ``method``, ``retain_images``) to
the markdown.new endpoint, accepts either a raw Markdown body or a JSON
envelope carrying the Markdown in ``content``, reports response metadata on
stderr, and writes the Markdown to stdout or to a file.
"""

import argparse
import http.client
import json
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Dict, Optional, Sequence, Tuple


DEFAULT_API_URL = "https://markdown.new/"


def build_parser() -> argparse.ArgumentParser:
    """Return the command-line parser for the fetch script."""
    parser = argparse.ArgumentParser(
        description="Fetch Markdown from a public URL using markdown.new"
    )
    parser.add_argument("url", help="Public URL to convert (http/https)")
    parser.add_argument(
        "--method",
        choices=["auto", "ai", "browser"],
        default="auto",
        help="Conversion method to request (default: auto)",
    )
    parser.add_argument(
        "--retain-images",
        action="store_true",
        help="Request image retention in output markdown",
    )
    parser.add_argument(
        "--output",
        help="Write markdown to this file instead of stdout",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=45.0,
        help="Request timeout in seconds (default: 45)",
    )
    parser.add_argument(
        "--api-url",
        default=DEFAULT_API_URL,
        help="markdown.new API endpoint (default: https://markdown.new/)",
    )
    parser.add_argument(
        "--show-headers",
        action="store_true",
        help="Print response headers to stderr",
    )
    parser.add_argument(
        "--deliver-md",
        action="store_true",
        help=(
            "Write output into a .md file and wrap content in pseudo-XML "
            "<url>...</url> tags"
        ),
    )
    return parser


def validate_url(url: str) -> None:
    """Raise ``ValueError`` unless *url* is an absolute http/https URL.

    A URL is accepted when its scheme is ``http`` or ``https`` and it has a
    non-empty network location.
    """
    parsed = urllib.parse.urlparse(url)
    if parsed.scheme not in {"http", "https"} or not parsed.netloc:
        raise ValueError(f"Invalid URL: {url!r}. Use absolute http/https URL.")


def build_request(api_url: str, payload: Dict[str, object]) -> urllib.request.Request:
    """Build the JSON POST request for *api_url* carrying *payload*.

    Raises:
        ValueError: if *api_url* has no scheme (raised by
            ``urllib.request.Request``).
    """
    data = json.dumps(payload).encode("utf-8")
    return urllib.request.Request(
        api_url,
        data=data,
        method="POST",
        headers={
            "Content-Type": "application/json",
            "Accept": "text/markdown",
            "User-Agent": "codex-markdown-new-skill/1.0",
        },
    )


def normalize_body(body: str, headers: Dict[str, str]) -> Tuple[str, Dict[str, str]]:
    """Extract Markdown and metadata from a response body.

    Args:
        body: decoded response text.
        headers: response headers keyed by lower-cased name.

    Returns:
        ``(markdown, metadata)``. When the response is declared as JSON,
        parses to an object, and holds a string under ``content``, that string
        is the markdown and selected fields are returned as
        ``response_<field>`` strings. In every other case the body is returned
        unchanged with empty metadata.
    """
    # Media types are case-insensitive, so compare in lower case.
    content_type = headers.get("content-type", "").lower()
    if "application/json" not in content_type:
        return body, {}

    try:
        payload = json.loads(body)
    except json.JSONDecodeError:
        # Declared JSON but unparsable: pass the raw text through.
        return body, {}

    if not isinstance(payload, dict):
        return body, {}

    markdown = payload.get("content")
    if not isinstance(markdown, str):
        return body, {}

    metadata: Dict[str, str] = {}
    for key in ("title", "url", "method", "duration_ms", "timestamp"):
        value = payload.get(key)
        if value is not None:
            metadata[f"response_{key}"] = str(value)

    return markdown, metadata


def print_metadata(
    headers: Dict[str, str], show_headers: bool, response_meta: Dict[str, str]
) -> None:
    """Print response metadata to stderr.

    Emits the ``x-markdown-tokens`` and ``x-rate-limit-remaining`` headers
    when present and non-empty, then every *response_meta* entry in key
    order, then (only if *show_headers*) every header in key order.
    """
    tokens = headers.get("x-markdown-tokens")
    remaining = headers.get("x-rate-limit-remaining")

    if tokens:
        print(f"x-markdown-tokens: {tokens}", file=sys.stderr)
    if remaining:
        print(f"x-rate-limit-remaining: {remaining}", file=sys.stderr)
    for key, value in sorted(response_meta.items()):
        print(f"{key}: {value}", file=sys.stderr)

    if show_headers:
        for key, value in sorted(headers.items()):
            print(f"{key}: {value}", file=sys.stderr)


def slugify_url(url: str) -> str:
    """Derive a filename-safe slug from the host and path of *url*.

    Runs of characters outside ``[a-zA-Z0-9._-]`` collapse to a single
    underscore; the query string and fragment are ignored. Falls back to
    ``"page"`` when nothing usable remains.
    """
    parsed = urllib.parse.urlparse(url)
    raw = f"{parsed.netloc}{parsed.path}".strip("/")
    if not raw:
        raw = parsed.netloc or "page"
    slug = re.sub(r"[^a-zA-Z0-9._-]+", "_", raw).strip("_")
    return slug or "page"


def resolve_output_path(url: str, output_path: Optional[str], deliver_md: bool) -> Optional[str]:
    """Decide where the Markdown should be written.

    Returns:
        * *output_path* when given, with ``.md`` appended in delivery mode if
          it does not already end in ``.md`` (case-insensitive);
        * a slug-based ``<slug>.md`` name in delivery mode without a path;
        * ``None`` (meaning stdout) otherwise.
    """
    if output_path:
        if deliver_md and not output_path.lower().endswith(".md"):
            output_path = f"{output_path}.md"
        return output_path

    if deliver_md:
        return f"{slugify_url(url)}.md"

    return None


def wrap_in_url_tag(markdown: str) -> str:
    """Wrap *markdown* in pseudo-XML ``<url>`` tags for delivery mode.

    Trailing newlines of the body are removed so the closing tag always sits
    on the line directly after the content.
    """
    body = markdown.rstrip("\n")
    return f"<url>\n{body}\n</url>\n"


def write_output(markdown: str, output_path: Optional[str]) -> None:
    """Write *markdown* to *output_path*, or to stdout when it is falsy.

    Missing parent directories of *output_path* are created.

    Raises:
        OSError: if the directory or file cannot be created or written.
    """
    if output_path:
        path = Path(output_path)
        if path.parent != Path("."):
            path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            f.write(markdown)
        return
    sys.stdout.write(markdown)


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Run the command-line workflow and return a process exit code.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        ``0`` on success, ``2`` when the target URL is invalid, ``1`` when the
        request fails or the output cannot be written.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    try:
        validate_url(args.url)
    except ValueError as exc:
        print(str(exc), file=sys.stderr)
        return 2

    payload = {
        "url": args.url,
        "method": args.method,
        "retain_images": bool(args.retain_images),
    }

    try:
        req = build_request(args.api_url, payload)
        with urllib.request.urlopen(req, timeout=args.timeout) as resp:
            headers = {k.lower(): v for k, v in resp.headers.items()}
            body = resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as err:
        error_body = err.read().decode("utf-8", errors="replace")
        print(f"HTTP {err.code} from markdown.new", file=sys.stderr)
        if err.code == 429:
            print(
                "Rate limit reached (documented: 500 requests/day/IP). Retry later.",
                file=sys.stderr,
            )
        if error_body:
            print(error_body.strip(), file=sys.stderr)
        return 1
    except urllib.error.URLError as err:
        print(f"Request failed: {err.reason}", file=sys.stderr)
        return 1
    except (OSError, ValueError, http.client.HTTPException) as err:
        # Not wrapped in URLError: timeouts/resets while reading the body,
        # truncated responses, and a malformed --api-url (ValueError).
        print(f"Request failed: {err}", file=sys.stderr)
        return 1

    markdown, response_meta = normalize_body(body, headers)
    print_metadata(headers, args.show_headers, response_meta)

    content = wrap_in_url_tag(markdown) if args.deliver_md else markdown
    output_path = resolve_output_path(args.url, args.output, args.deliver_md)
    try:
        write_output(content, output_path)
    except OSError as err:
        print(f"Could not write output: {err}", file=sys.stderr)
        return 1

    if args.deliver_md and output_path:
        print(f"output_file: {Path(output_path).resolve()}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())