commit 948eac953852475f3c643b85c7a9e8ae8d5c9889 Author: zlei9 Date: Sun Mar 29 14:39:59 2026 +0800 Initial commit with translated description diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..11b9e6c --- /dev/null +++ b/SKILL.md @@ -0,0 +1,44 @@ +--- +name: youtube-transcript +description: "获取和总结YouTube视频转录。当被要求总结、转录或从YouTube视频提取内容时使用。" +--- + +# YouTube Transcript + +Fetch transcripts from YouTube videos and optionally summarize them. + +## Quick Start + +```bash +python3 scripts/fetch_transcript.py VIDEO_ID_OR_URL [languages] +``` + +**Examples:** +```bash +python3 scripts/fetch_transcript.py dQw4w9WgXcQ +python3 scripts/fetch_transcript.py "https://www.youtube.com/watch?v=dQw4w9WgXcQ" +python3 scripts/fetch_transcript.py dQw4w9WgXcQ "fr,en,de" +``` + +**Output:** JSON with `video_id`, `title`, `author`, `language`, `entries`, `full_text`, and timestamped `transcript` array. + +## Workflow + +1. Run `fetch_transcript.py` with video ID or URL +2. Script checks VPN, brings it up if needed +3. Returns JSON with full transcript text +4. Summarize the `full_text` field as needed + +## Language Codes + +Default priority: `en, fr, de, es, it, pt, nl` + +Override with second argument: `python3 scripts/fetch_transcript.py VIDEO_ID "ja,ko,zh"` + +## Setup & Configuration + +See [references/SETUP.md](references/SETUP.md) for: +- Python dependencies installation +- WireGuard VPN configuration (required for cloud VPS) +- Troubleshooting common errors +- Alternative proxy options diff --git a/_meta.json b/_meta.json new file mode 100644 index 0000000..cf270e1 --- /dev/null +++ b/_meta.json @@ -0,0 +1,6 @@ +{ + "ownerId": "kn71x3dvswfagd3c57tytkx1s98016s2", + "slug": "youtube-transcript", + "version": "1.0.1", + "publishedAt": 1769532361984 +} \ No newline at end of file diff --git a/references/SETUP.md b/references/SETUP.md new file mode 100644 index 0000000..e7dbe8f --- /dev/null +++ b/references/SETUP.md @@ -0,0 +1,98 @@ +# YouTube Transcript - Setup Guide + +## Why Residential IP? 
+ +YouTube blocks cloud provider IPs (AWS, Hetzner, GCP, Azure, etc.) from accessing transcripts. Requests from these IPs get 403/429 errors or bot detection. + +**Solution:** Route requests through a residential IP via WireGuard VPN to a home router. + +## Prerequisites + +- Python 3.x installed +- WireGuard installed on VPS +- Access to a residential network (home router with WireGuard support) + +## 1. Install Python Dependencies + +```bash +pip3 install youtube-transcript-api requests +``` + +## 2. Configure WireGuard VPN + +You need a WireGuard server on a residential network (home router, NAS, etc.). + +### On Your Home Router (Server) + +```bash +# Generate keys +wg genkey | tee /etc/wireguard/privatekey | wg pubkey > /etc/wireguard/publickey + +# Configure interface (e.g., /etc/wireguard/wg0.conf or via LuCI/OpenWRT) +[Interface] +PrivateKey = ROUTER_PRIVATE_KEY +Address = 10.100.0.1/24 +ListenPort = 51820 + +[Peer] +PublicKey = VPS_PUBLIC_KEY +AllowedIPs = 10.100.0.2/32 +``` + +Enable masquerading/NAT so the VPS can route traffic through your home IP. + +### On Your VPS (Client) + +```bash +# Generate keys +wg genkey | tee /etc/wireguard/privatekey | wg pubkey > /etc/wireguard/publickey + +# Configure /etc/wireguard/wg0.conf +[Interface] +PrivateKey = VPS_PRIVATE_KEY +Address = 10.100.0.2/24 +Table = 51820 + +[Peer] +PublicKey = ROUTER_PUBLIC_KEY +Endpoint = HOME_PUBLIC_IP_OR_HOSTNAME:51820 +AllowedIPs = 0.0.0.0/0 +PersistentKeepalive = 25 +``` + +### Bring Up VPN + +```bash +wg-quick up wg0 +ip rule add from 10.100.0.2 table 51820 +``` + +### Verify + +```bash +curl --interface 10.100.0.2 ifconfig.me # Should show your home IP +``` + +## 3. 
Configure Script (If Needed)
+
+Edit `scripts/fetch_transcript.py` and adjust:
+```python
+VPN_INTERFACE = "wg0"  # Your WireGuard interface name
+VPN_SOURCE_IP = "10.100.0.2"  # Your VPS's WireGuard IP
+```
+
+## Troubleshooting
+
+| Error | Cause | Solution |
+|-------|-------|----------|
+| VPN not available | WireGuard down | Script auto-retries; check `wg show` |
+| Transcripts disabled | Creator disabled captions | No workaround |
+| No transcript found | No captions in requested languages | Try different language codes |
+| RequestBlocked | VPN not routing properly | Verify `curl --interface 10.100.0.2 ifconfig.me` shows residential IP |
+
+## Alternatives to WireGuard
+
+If you can't set up WireGuard:
+- **SSH tunnel**: `ssh -D 1080 user@home-server` + configure SOCKS proxy
+- **Residential proxy service**: Bright Data, Oxylabs, SmartProxy (paid)
+- **Tailscale/ZeroTier**: Easier setup than raw WireGuard
diff --git a/scripts/fetch_transcript.py b/scripts/fetch_transcript.py
new file mode 100644
index 0000000..067cad2
--- /dev/null
+++ b/scripts/fetch_transcript.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+"""Fetch a YouTube transcript via a residential IP (WireGuard VPN).
+
+Usage: fetch_transcript.py VIDEO_ID_OR_URL [languages]
+
+Prints a single JSON object on stdout: the transcript plus metadata on
+success, or an object with an "error" key (exit status 1) on failure.
+"""
+
+import json
+import re
+import subprocess
+import sys
+
+import requests
+from requests.adapters import HTTPAdapter
+from youtube_transcript_api import YouTubeTranscriptApi
+from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
+
+VPN_INTERFACE = "wg0"
+VPN_SOURCE_IP = "10.100.0.2"
+# Policy-routing table that traffic sourced from VPN_SOURCE_IP is sent to.
+VPN_ROUTING_TABLE = "51820"
+LANGUAGES = ["en", "fr", "de", "es", "it", "pt", "nl"]
+
+# An 11-character video ID inside a known URL shape, or a bare 11-character ID.
+_VIDEO_ID_PATTERNS = (
+    re.compile(r"(?:v=|/v/|youtu\.be/|/embed/|/shorts/|/live/)([a-zA-Z0-9_-]{11})"),
+    re.compile(r"^([a-zA-Z0-9_-]{11})$"),
+)
+
+
+class SourceIPAdapter(HTTPAdapter):
+    """Transport adapter that binds outgoing connections to one source IP."""
+
+    def __init__(self, source_ip, **kwargs):
+        # Set first: HTTPAdapter.__init__ calls init_poolmanager().
+        self.source_ip = source_ip
+        super().__init__(**kwargs)
+
+    def init_poolmanager(self, *args, **kwargs):
+        # Port 0 lets the OS pick an ephemeral source port.
+        kwargs["source_address"] = (self.source_ip, 0)
+        super().init_poolmanager(*args, **kwargs)
+
+
+def check_vpn():
+    """Report whether the WireGuard interface is up and has handshaken.
+
+    Returns an ``(ok, message)`` tuple. Only the presence of a
+    "latest handshake" line in the ``wg show`` output is checked, not the
+    age of that handshake.
+    """
+    try:
+        result = subprocess.run(
+            ["wg", "show", VPN_INTERFACE],
+            capture_output=True, text=True, timeout=5
+        )
+    except (OSError, subprocess.SubprocessError) as e:
+        # `wg` could not be executed, or it timed out.
+        return False, str(e)
+    if result.returncode != 0:
+        return False, "VPN interface not found"
+    if "latest handshake" not in result.stdout:
+        return False, "No VPN handshake established"
+    return True, "VPN connected"
+
+
+def bring_up_vpn():
+    """Try to bring the VPN up, then report its state via check_vpn().
+
+    Returns an ``(ok, message)`` tuple. Exit codes of the two setup
+    commands are deliberately ignored (best effort); the follow-up check
+    decides whether the VPN is usable.
+    """
+    try:
+        subprocess.run(["wg-quick", "up", VPN_INTERFACE], capture_output=True, timeout=10)
+        subprocess.run(
+            ["ip", "rule", "add", "from", VPN_SOURCE_IP, "table", VPN_ROUTING_TABLE],
+            capture_output=True, timeout=5
+        )
+    except (OSError, subprocess.SubprocessError) as e:
+        return False, str(e)
+    return check_vpn()
+
+
+def extract_video_id(url_or_id):
+    """Return the 11-character video ID found in *url_or_id*.
+
+    Recognizes ``v=``, ``/v/``, ``youtu.be/``, ``/embed/``, ``/shorts/``
+    and ``/live/`` URLs as well as a bare ID. Input matching none of these
+    is returned whitespace-stripped but otherwise unchanged.
+    """
+    candidate = url_or_id.strip()
+    for pattern in _VIDEO_ID_PATTERNS:
+        match = pattern.search(candidate)
+        if match:
+            return match.group(1)
+    return candidate
+
+
+def _fetch_via_vpn(video_id, languages=None):
+    """Fetch the raw transcript object over a session bound to VPN_SOURCE_IP."""
+    if languages is None:
+        languages = LANGUAGES
+
+    session = requests.Session()
+    session.mount("http://", SourceIPAdapter(VPN_SOURCE_IP))
+    session.mount("https://", SourceIPAdapter(VPN_SOURCE_IP))
+
+    api = YouTubeTranscriptApi(http_client=session)
+    return api.fetch(video_id, languages=languages)
+
+
+def _to_entries(fetched):
+    """Convert fetched snippets into JSON-serializable dicts."""
+    return [
+        {"text": entry.text, "start": entry.start, "duration": entry.duration}
+        for entry in fetched
+    ]
+
+
+def fetch_transcript(video_id, languages=None):
+    """Fetch a transcript as a list of ``{"text", "start", "duration"}`` dicts.
+
+    *languages* is a priority-ordered list of language codes; ``None`` means
+    the module default LANGUAGES.
+    """
+    return _to_entries(_fetch_via_vpn(video_id, languages))
+
+
+def get_video_title(video_id):
+    """Look up ``(title, author)`` via the noembed.com oEmbed endpoint.
+
+    Best effort: returns ``("Unknown", "Unknown")`` when the request or the
+    JSON decoding fails. This request is not bound to the VPN source IP.
+    """
+    try:
+        resp = requests.get(
+            "https://noembed.com/embed",
+            # Passed as a parameter so the nested URL is percent-encoded.
+            params={"url": f"https://www.youtube.com/watch?v={video_id}"},
+            timeout=10
+        )
+        data = resp.json()
+    except (requests.RequestException, ValueError):
+        return "Unknown", "Unknown"
+    if not isinstance(data, dict):
+        return "Unknown", "Unknown"
+    return data.get("title", "Unknown"), data.get("author_name", "Unknown")
+
+
+def main():
+    """Command-line entry point; prints one JSON object and exits 1 on error."""
+    if len(sys.argv) < 2:
+        print(json.dumps({"error": "Usage: fetch_transcript.py VIDEO_ID_OR_URL [languages]"}))
+        sys.exit(1)
+
+    video_input = sys.argv[1]
+    languages = LANGUAGES
+    if len(sys.argv) > 2:
+        # Tolerate spaces and stray commas ("fr, en,"); empty input falls
+        # back to the defaults.
+        requested = [code.strip() for code in sys.argv[2].split(",") if code.strip()]
+        languages = requested or LANGUAGES
+
+    # Extract video ID
+    video_id = extract_video_id(video_input)
+
+    # Check VPN
+    vpn_ok, vpn_msg = check_vpn()
+    if not vpn_ok:
+        vpn_ok, vpn_msg = bring_up_vpn()
+        if not vpn_ok:
+            print(json.dumps({"error": f"VPN not available: {vpn_msg}"}))
+            sys.exit(1)
+
+    # Get title
+    title, author = get_video_title(video_id)
+
+    # Fetch transcript
+    try:
+        fetched = _fetch_via_vpn(video_id, languages)
+        transcript = _to_entries(fetched)
+        full_text = " ".join(entry["text"] for entry in transcript)
+        # Report the language actually served, which need not be the first
+        # one requested; fall back to the old behavior if it is unavailable.
+        language = getattr(fetched, "language_code", None) or languages[0]
+
+        print(json.dumps({
+            "video_id": video_id,
+            "title": title,
+            "author": author,
+            "language": language if transcript else None,
+            "entries": len(transcript),
+            "full_text": full_text,
+            "transcript": transcript
+        }))
+    except TranscriptsDisabled:
+        print(json.dumps({"error": "Transcripts are disabled for this video", "video_id": video_id}))
+        sys.exit(1)
+    except NoTranscriptFound:
+        print(json.dumps({"error": f"No transcript found in languages: {languages}", "video_id": video_id}))
+        sys.exit(1)
+    except Exception as e:
+        print(json.dumps({"error": str(e), "video_id": video_id}))
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()