From 35c34a795296fe717aa85c9b2532246bc1b16c22 Mon Sep 17 00:00:00 2001 From: zlei9 Date: Sun, 29 Mar 2026 14:36:54 +0800 Subject: [PATCH] Initial commit with translated description --- CHANGELOG.md | 65 ++++++++++ CONTRIBUTING.md | 132 +++++++++++++++++++ INSTALL.md | 121 ++++++++++++++++++ README.md | 187 +++++++++++++++++++++++++++ README_ZH.md | 184 ++++++++++++++++++++++++++ SKILL.md | 234 ++++++++++++++++++++++++++++++++++ _meta.json | 6 + examples/README.md | 229 +++++++++++++++++++++++++++++++++ examples/discuss-hk.sh | 16 +++ package-lock.json | 60 +++++++++ package.json | 23 ++++ scripts/playwright-simple.js | 60 +++++++++ scripts/playwright-stealth.js | 167 ++++++++++++++++++++++++ test.sh | 45 +++++++ 14 files changed, 1529 insertions(+) create mode 100644 CHANGELOG.md create mode 100644 CONTRIBUTING.md create mode 100644 INSTALL.md create mode 100644 README.md create mode 100644 README_ZH.md create mode 100644 SKILL.md create mode 100644 _meta.json create mode 100644 examples/README.md create mode 100644 examples/discuss-hk.sh create mode 100644 package-lock.json create mode 100644 package.json create mode 100644 scripts/playwright-simple.js create mode 100644 scripts/playwright-stealth.js create mode 100644 test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..ea8b289 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,65 @@ +# Changelog + +## [1.2.0] - 2026-02-07 + +### 🔄 Major Changes + +- **Project Renamed** — `web-scraper` → `playwright-scraper-skill` +- Updated all documentation and links +- Updated GitHub repo name +- **Bilingual Documentation** — All docs now in English (with Chinese README available) + +--- + +## [1.1.0] - 2026-02-07 + +### ✅ Added + +- **LICENSE** — MIT License +- **CONTRIBUTING.md** — Contribution guidelines +- **examples/README.md** — Detailed usage examples +- **test.sh** — Automated test script +- **README.md** — Redesigned with badges + +### 🔧 Improvements + +- Clearer file structure +- More 
detailed documentation +- More practical examples + +--- + +## [1.0.0] - 2026-02-07 + +### ✅ Initial Release + +**Tools Created:** +- ✅ `playwright-simple.js` — Fast simple scraper +- ✅ `playwright-stealth.js` — Anti-bot protected version (primary) ⭐ + +**Test Results:** +- ✅ Discuss.com.hk success (200 OK, 19.6s) +- ✅ Example.com success (3.4s) +- ✅ Auto fallback to deep-scraper's Playwright + +**Documentation:** +- ✅ SKILL.md (full documentation) +- ✅ README.md (quick reference) +- ✅ Example scripts (discuss-hk.sh) +- ✅ package.json + +**Key Findings:** +1. **Playwright Stealth is the best solution** (100% success on Discuss.com.hk) +2. **Don't use Crawlee** (easily detected) +3. **Chaser (Rust) doesn't work currently** (blocked by Cloudflare) +4. **Hiding `navigator.webdriver` is key** + +--- + +## Future Plans + +- [ ] Add proxy IP rotation +- [ ] CAPTCHA handling integration +- [ ] Cookie management (maintain login state) +- [ ] Batch scraping (parallel processing) +- [ ] Integration with OpenClaw browser tool diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..fb7b6a9 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,132 @@ +# Contributing Guide + +Thank you for considering contributing to playwright-scraper-skill! + +## 🐛 Reporting Issues + +If you find a bug or have a feature suggestion: + +1. Check [Issues](https://github.com/waisimon/playwright-scraper-skill/issues) to see if it already exists +2. If not, create a new Issue +3. Provide the following information: + - Problem description + - Steps to reproduce + - Expected vs actual behavior + - Environment (Node.js version, OS) + - Error messages (if any) + +## 💡 Feature Requests + +1. Create an Issue with `[Feature Request]` in the title +2. 
Explain: + - The desired feature + - Use cases + - Why this feature would be useful + +## 🔧 Submitting Code + +### Setting Up Development Environment + +```bash +# Fork the repo and clone +git clone https://github.com/YOUR_USERNAME/playwright-scraper-skill.git +cd playwright-scraper-skill + +# Install dependencies +npm install +npx playwright install chromium + +# Test +node scripts/playwright-simple.js https://example.com +``` + +### Contribution Workflow + +1. Create a new branch: + ```bash + git checkout -b feature/my-new-feature + ``` + +2. Make your changes + +3. Test your changes: + ```bash + npm test + node scripts/playwright-stealth.js + ``` + +4. Commit: + ```bash + git add . + git commit -m "Add: brief description of changes" + ``` + +5. Push and create a Pull Request: + ```bash + git push origin feature/my-new-feature + ``` + +### Commit Message Guidelines + +Use clear commit messages: + +- `Add: new feature` +- `Fix: issue description` +- `Update: existing feature` +- `Refactor: code refactoring` +- `Docs: documentation update` +- `Test: add or modify tests` + +Example: +``` +Fix: playwright-stealth.js screenshot timeout issue + +- Increase timeout parameter to 10 seconds +- Add try-catch error handling +- Update documentation +``` + +## 📝 Documentation + +If your changes affect usage: + +- Update `SKILL.md` (full documentation) +- Update `README.md` (quick reference) +- Update `examples/README.md` (if adding new examples) +- Update `CHANGELOG.md` (record changes) + +## ✅ Checklist + +Before submitting a PR, confirm: + +- [ ] Code runs properly +- [ ] Doesn't break existing functionality +- [ ] Updated relevant documentation +- [ ] Clear commit messages +- [ ] No sensitive information (API keys, personal paths, etc.) + +## 🎯 Priority Areas + +Currently welcoming contributions in: + +1. **New anti-bot techniques** — Improve success rates +2. **Support more websites** — Test and share success cases +3. **Performance optimization** — Speed up scraping +4. 
**Error handling** — Better error messages and recovery +5. **Documentation improvements** — Clearer explanations and examples + +## 🚫 Unaccepted Contributions + +- Adding complex dependencies (keep it lightweight) +- Features violating privacy or laws +- Breaking changes to the existing API (unless well justified) + +## 📞 Contact + +Have questions? Feel free to: +- Create an Issue for discussion +- Ask in Pull Request comments + +--- + +Thank you for your contribution! 🙏 diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000..03c2058 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,121 @@ +# Installation Guide + +## 📦 Quick Installation + +### 1. Clone or Download the Skill + +```bash +# Method 1: Using git clone (if public repo) +git clone https://github.com/waisimon/playwright-scraper-skill.git +cd playwright-scraper-skill + +# Method 2: Download ZIP and extract +# After downloading, enter the directory +cd playwright-scraper-skill +``` + +### 2. Install Dependencies + +```bash +# Install Playwright (recommended) +npm install + +# Install browser (Chromium) +npx playwright install chromium +``` + +### 3. Test + +```bash +# Quick test +node scripts/playwright-simple.js https://example.com + +# Test Stealth version +node scripts/playwright-stealth.js https://example.com +``` + +--- + +## 🔧 Advanced Installation + +### Using with OpenClaw + +If you're using OpenClaw, you can place this skill in the skills directory: + +```bash +# Assuming your OpenClaw workspace is at ~/.openclaw/workspace +cp -r playwright-scraper-skill ~/.openclaw/workspace/skills/ + +# Then you can invoke it in OpenClaw +``` + +--- + +## ✅ Verify Installation + +Run the example script: + +```bash +# Discuss.com.hk example (verified working) +bash examples/discuss-hk.sh +``` + +If you see output similar to this, installation is successful: + +``` +🕷️ Starting Playwright Stealth scraper... +📱 Navigating to: https://m.discuss.com.hk/#hot +📡 HTTP Status: 200 +✅ Scraping complete! 
+``` + +--- + +## 🐛 Common Issues + +### Issue: Playwright not found + +**Error message:** `Error: Cannot find module 'playwright'` + +**Solution:** +```bash +npm install +npx playwright install chromium +``` + +### Issue: Browser launch failed + +**Error message:** `browserType.launch: Executable doesn't exist` + +**Solution:** +```bash +npx playwright install chromium +``` + +### Issue: Permission errors + +**Error message:** `Permission denied` + +**Solution:** +```bash +chmod +x scripts/*.js +chmod +x examples/*.sh +``` + +--- + +## 📝 System Requirements + +- **Node.js:** v18+ recommended +- **OS:** macOS / Linux / Windows +- **Disk Space:** ~500MB (including Chromium) +- **RAM:** 2GB+ recommended + +--- + +## 🚀 Next Steps + +After installation, check out: +- [README.md](README.md) — Quick reference +- [SKILL.md](SKILL.md) — Full documentation +- [examples/](examples/) — Example scripts diff --git a/README.md b/README.md new file mode 100644 index 0000000..6987523 --- /dev/null +++ b/README.md @@ -0,0 +1,187 @@ +# Playwright Scraper Skill 🕷️ + +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) +[![Node.js](https://img.shields.io/badge/Node.js-18+-green.svg)](https://nodejs.org/) +[![Playwright](https://img.shields.io/badge/Playwright-1.40+-blue.svg)](https://playwright.dev/) + +**[中文文檔](README_ZH.md)** | English + +A Playwright-based web scraping OpenClaw Skill with anti-bot protection. Successfully tested on complex websites like Discuss.com.hk. 
+ +> 📦 **Installation:** See [INSTALL.md](INSTALL.md) +> 📚 **Full Documentation:** See [SKILL.md](SKILL.md) +> 💡 **Examples:** See [examples/README.md](examples/README.md) + +--- + +## ✨ Features + +- ✅ **Pure Playwright** — Modern, powerful, easy to use +- ✅ **Anti-Bot Protection** — Hides automation, realistic UA +- ✅ **Verified** — 100% success on Discuss.com.hk +- ✅ **Simple to Use** — One-line commands +- ✅ **Customizable** — Environment variable support + +--- + +## 🚀 Quick Start + +### Installation + +```bash +npm install +npx playwright install chromium +``` + +### Usage + +```bash +# Quick scraping +node scripts/playwright-simple.js https://example.com + +# Stealth mode (recommended) +node scripts/playwright-stealth.js "https://m.discuss.com.hk/#hot" +``` + +--- + +## 📖 Two Modes + +| Mode | Use Case | Speed | Anti-Bot | +|------|----------|-------|----------| +| **Simple** | Regular dynamic sites | Fast (3-5s) | None | +| **Stealth** ⭐ | Sites with anti-bot | Medium (5-20s) | Medium-High | + +### Simple Mode + +For sites without anti-bot protection: + +```bash +node scripts/playwright-simple.js +``` + +### Stealth Mode (Recommended) + +For sites with Cloudflare or anti-bot protection: + +```bash +node scripts/playwright-stealth.js +``` + +**Anti-Bot Techniques:** +- Hide `navigator.webdriver` +- Realistic User-Agent (iPhone) +- Human-like behavior simulation +- Screenshot and HTML saving support + +--- + +## 🎯 Customization + +All scripts support environment variables: + +```bash +# Show browser +HEADLESS=false node scripts/playwright-stealth.js + +# Custom wait time (milliseconds) +WAIT_TIME=10000 node scripts/playwright-stealth.js + +# Save screenshot +SCREENSHOT_PATH=/tmp/page.png node scripts/playwright-stealth.js + +# Save HTML +SAVE_HTML=true node scripts/playwright-stealth.js + +# Custom User-Agent +USER_AGENT="Mozilla/5.0 ..." 
node scripts/playwright-stealth.js +``` + +--- + +## 📊 Test Results + +| Website | Result | Time | +|---------|--------|------| +| **Discuss.com.hk** | ✅ 200 OK | 5-20s | +| **Example.com** | ✅ 200 OK | 3-5s | +| **Cloudflare Protected** | ✅ Mostly successful | 10-30s | + +--- + +## 📁 File Structure + +``` +playwright-scraper-skill/ +├── scripts/ +│ ├── playwright-simple.js # Simple mode +│ └── playwright-stealth.js # Stealth mode ⭐ +├── examples/ +│ ├── discuss-hk.sh # Discuss.com.hk example +│ └── README.md # More examples +├── SKILL.md # Full documentation +├── INSTALL.md # Installation guide +├── README.md # This file +├── README_ZH.md # Chinese documentation +├── CONTRIBUTING.md # Contribution guide +├── CHANGELOG.md # Version history +└── package.json # npm config +``` + +--- + +## 💡 Best Practices + +1. **Try web_fetch first** — OpenClaw's built-in tool is fastest +2. **Use Simple for dynamic sites** — When no anti-bot protection +3. **Use Stealth for protected sites** ⭐ — Main workhorse +4. **Use specialized skills** — For YouTube, Reddit, etc. + +--- + +## 🐛 Troubleshooting + +### Getting 403 blocked? + +Use Stealth mode: +```bash +node scripts/playwright-stealth.js +``` + +### Cloudflare challenge? + +Increase wait time + headful mode: +```bash +HEADLESS=false WAIT_TIME=30000 node scripts/playwright-stealth.js +``` + +### Playwright not found? + +Reinstall: +```bash +npm install +npx playwright install chromium +``` + +More issues? See [INSTALL.md](INSTALL.md) + +--- + +## 🤝 Contributing + +Contributions welcome! 
See [CONTRIBUTING.md](CONTRIBUTING.md) + +--- + +## 📄 License + +MIT License - See [LICENSE](LICENSE) + +--- + +## 🔗 Links + +- [Playwright Official Docs](https://playwright.dev/) +- [Full Documentation (SKILL.md)](SKILL.md) +- [Installation Guide (INSTALL.md)](INSTALL.md) +- [Examples (examples/)](examples/) diff --git a/README_ZH.md b/README_ZH.md new file mode 100644 index 0000000..a1c74bd --- /dev/null +++ b/README_ZH.md @@ -0,0 +1,184 @@ +# Playwright Scraper Skill 🕷️ + +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) +[![Node.js](https://img.shields.io/badge/Node.js-18+-green.svg)](https://nodejs.org/) +[![Playwright](https://img.shields.io/badge/Playwright-1.40+-blue.svg)](https://playwright.dev/) + +基於 Playwright 的網頁爬蟲 OpenClaw Skill。支援反爬保護,已驗證成功爬取 Discuss.com.hk 等複雜網站。 + +> 📦 **安裝方法:** 查看 [INSTALL.md](INSTALL.md) +> 📚 **完整文件:** 查看 [SKILL.md](SKILL.md) +> 💡 **使用範例:** 查看 [examples/README.md](examples/README.md) + +--- + +## ✨ 特色 + +- ✅ **純 Playwright** — 現代、強大、易用 +- ✅ **反爬保護** — 隱藏自動化特徵、真實 UA +- ✅ **已驗證** — Discuss.com.hk 100% 成功 +- ✅ **簡單易用** — 一行命令搞定 +- ✅ **可自訂** — 支援環境變數配置 + +--- + +## 🚀 快速開始 + +### 安裝 + +```bash +npm install +npx playwright install chromium +``` + +### 使用 + +```bash +# 快速爬取 +node scripts/playwright-simple.js https://example.com + +# 反爬保護版(推薦) +node scripts/playwright-stealth.js "https://m.discuss.com.hk/#hot" +``` + +--- + +## 📖 兩種模式 + +| 模式 | 適用場景 | 速度 | 反爬能力 | +|------|---------|------|----------| +| **Simple** | 一般動態網站 | 快(3-5秒) | 無 | +| **Stealth** ⭐ | 有反爬保護的網站 | 中(5-20秒) | 中高 | + +### Simple 模式 + +適合沒有反爬保護的網站: + +```bash +node scripts/playwright-simple.js +``` + +### Stealth 模式(推薦) + +適合有 Cloudflare 或反爬保護的網站: + +```bash +node scripts/playwright-stealth.js +``` + +**反爬技巧:** +- 隱藏 `navigator.webdriver` +- 真實 User-Agent(iPhone) +- 模擬真人行為 +- 支援截圖和 HTML 儲存 + +--- + +## 🎯 自訂參數 + +所有腳本都支援環境變數: + +```bash +# 顯示瀏覽器 +HEADLESS=false node scripts/playwright-stealth.js + +# 自訂等待時間(毫秒) 
+WAIT_TIME=10000 node scripts/playwright-stealth.js + +# 儲存截圖 +SCREENSHOT_PATH=/tmp/page.png node scripts/playwright-stealth.js + +# 儲存 HTML +SAVE_HTML=true node scripts/playwright-stealth.js + +# 自訂 User-Agent +USER_AGENT="Mozilla/5.0 ..." node scripts/playwright-stealth.js +``` + +--- + +## 📊 測試結果 + +| 網站 | 結果 | 時間 | +|------|------|------| +| **Discuss.com.hk** | ✅ 200 OK | 5-20 秒 | +| **Example.com** | ✅ 200 OK | 3-5 秒 | +| **Cloudflare 保護網站** | ✅ 多數成功 | 10-30 秒 | + +--- + +## 📁 檔案結構 + +``` +playwright-scraper-skill/ +├── scripts/ +│ ├── playwright-simple.js # 簡單版 +│ └── playwright-stealth.js # Stealth 版 ⭐ +├── examples/ +│ ├── discuss-hk.sh # Discuss.com.hk 範例 +│ └── README.md # 更多範例 +├── SKILL.md # 完整文件 +├── INSTALL.md # 安裝指南 +├── README.md # 本檔案 +├── CONTRIBUTING.md # 貢獻指南 +├── CHANGELOG.md # 版本記錄 +└── package.json # npm 配置 +``` + +--- + +## 💡 使用建議 + +1. **先試 web_fetch** — OpenClaw 內建工具最快 +2. **動態網站用 Simple** — 沒有反爬保護時 +3. **反爬網站用 Stealth** ⭐ — 主力工具 +4. **特殊網站用專用 skill** — YouTube、Reddit 等 + +--- + +## 🐛 故障排除 + +### 被 403 擋住? + +使用 Stealth 模式: +```bash +node scripts/playwright-stealth.js +``` + +### Cloudflare 挑戰? + +增加等待時間 + 有頭模式: +```bash +HEADLESS=false WAIT_TIME=30000 node scripts/playwright-stealth.js +``` + +### 找不到 Playwright? 
+ +重新安裝: +```bash +npm install +npx playwright install chromium +``` + +更多問題查看 [INSTALL.md](INSTALL.md) + +--- + +## 🤝 貢獻 + +歡迎貢獻!查看 [CONTRIBUTING.md](CONTRIBUTING.md) + +--- + +## 📄 授權 + +MIT License - 查看 [LICENSE](LICENSE) + +--- + +## 🔗 相關連結 + +- [Playwright 官方文檔](https://playwright.dev/) +- [完整文件 (SKILL.md)](SKILL.md) +- [安裝指南 (INSTALL.md)](INSTALL.md) +- [使用範例 (examples/)](examples/) diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..745aa32 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,234 @@ +--- +name: playwright-scraper-skill +description: "基于Playwright的网页抓取OpenClaw技能。" +version: 1.2.0 +author: Simon Chan +--- + +# Playwright Scraper Skill + +A Playwright-based web scraping OpenClaw Skill with anti-bot protection. Choose the best approach based on the target website's anti-bot level. + +--- + +## 🎯 Use Case Matrix + +| Target Website | Anti-Bot Level | Recommended Method | Script | +|---------------|----------------|-------------------|--------| +| **Regular Sites** | Low | web_fetch tool | N/A (built-in) | +| **Dynamic Sites** | Medium | Playwright Simple | `scripts/playwright-simple.js` | +| **Cloudflare Protected** | High | **Playwright Stealth** ⭐ | `scripts/playwright-stealth.js` | +| **YouTube** | Special | deep-scraper | Install separately | +| **Reddit** | Special | reddit-scraper | Install separately | + +--- + +## 📦 Installation + +```bash +cd playwright-scraper-skill +npm install +npx playwright install chromium +``` + +--- + +## 🚀 Quick Start + +### 1️⃣ Simple Sites (No Anti-Bot) + +Use OpenClaw's built-in `web_fetch` tool: + +```bash +# Invoke directly in OpenClaw +Hey, fetch me the content from https://example.com +``` + +--- + +### 2️⃣ Dynamic Sites (Requires JavaScript) + +Use **Playwright Simple**: + +```bash +node scripts/playwright-simple.js "https://example.com" +``` + +**Example output:** +```json +{ + "url": "https://example.com", + "title": "Example Domain", + "content": "...", + "elapsedSeconds": "3.45" +} +``` + +--- + 
+### 3️⃣ Anti-Bot Protected Sites (Cloudflare etc.) + +Use **Playwright Stealth**: + +```bash +node scripts/playwright-stealth.js "https://m.discuss.com.hk/#hot" +``` + +**Features:** +- Hide automation markers (`navigator.webdriver = false`) +- Realistic User-Agent (iPhone, Android) +- Random delays to mimic human behavior +- Screenshot and HTML saving support + +--- + +### 4️⃣ YouTube Video Transcripts + +Use **deep-scraper** (install separately): + +```bash +# Install deep-scraper skill +npx clawhub install deep-scraper + +# Use it +cd skills/deep-scraper +node assets/youtube_handler.js "https://www.youtube.com/watch?v=VIDEO_ID" +``` + +--- + +## 📖 Script Descriptions + +### `scripts/playwright-simple.js` +- **Use Case:** Regular dynamic websites +- **Speed:** Fast (3-5 seconds) +- **Anti-Bot:** None +- **Output:** JSON (title, content, URL) + +### `scripts/playwright-stealth.js` ⭐ +- **Use Case:** Sites with Cloudflare or anti-bot protection +- **Speed:** Medium (5-20 seconds) +- **Anti-Bot:** Medium-High (hides automation, realistic UA) +- **Output:** JSON + Screenshot + HTML file +- **Verified:** 100% success on Discuss.com.hk + +--- + +## 🎓 Best Practices + +### 1. Try web_fetch First +If the site doesn't have dynamic loading, use OpenClaw's `web_fetch` tool—it's fastest. + +### 2. Need JavaScript? Use Playwright Simple +If you need to wait for JavaScript rendering, use `playwright-simple.js`. + +### 3. Getting Blocked? Use Stealth +If you encounter 403 or Cloudflare challenges, use `playwright-stealth.js`. + +### 4. 
Special Sites Need Specialized Skills +- YouTube → deep-scraper +- Reddit → reddit-scraper +- Twitter → bird skill + +--- + +## 🔧 Customization + +All scripts support environment variables: + +```bash +# Set screenshot path +SCREENSHOT_PATH=/path/to/screenshot.png node scripts/playwright-stealth.js URL + +# Set wait time (milliseconds) +WAIT_TIME=10000 node scripts/playwright-simple.js URL + +# Enable headful mode (show browser) +HEADLESS=false node scripts/playwright-stealth.js URL + +# Save HTML +SAVE_HTML=true node scripts/playwright-stealth.js URL + +# Custom User-Agent +USER_AGENT="Mozilla/5.0 ..." node scripts/playwright-stealth.js URL +``` + +--- + +## 📊 Performance Comparison + +| Method | Speed | Anti-Bot | Success Rate (Discuss.com.hk) | +|--------|-------|----------|-------------------------------| +| web_fetch | ⚡ Fastest | ❌ None | 0% | +| Playwright Simple | 🚀 Fast | ⚠️ Low | 20% | +| **Playwright Stealth** | ⏱️ Medium | ✅ Medium | **100%** ✅ | +| Puppeteer Stealth | ⏱️ Medium | ✅ Medium-High | ~80% | +| Crawlee (deep-scraper) | 🐢 Slow | ❌ Detected | 0% | +| Chaser (Rust) | ⏱️ Medium | ❌ Detected | 0% | + +--- + +## 🛡️ Anti-Bot Techniques Summary + +Lessons learned from our testing: + +### ✅ Effective Anti-Bot Measures +1. **Hide `navigator.webdriver`** — Essential +2. **Realistic User-Agent** — Use real devices (iPhone, Android) +3. **Mimic Human Behavior** — Random delays, scrolling +4. **Avoid Framework Signatures** — Crawlee, Selenium are easily detected +5. **Use `addInitScript` (Playwright)** — Inject before page load + +### ❌ Ineffective Anti-Bot Measures +1. **Only changing User-Agent** — Not enough +2. **Using high-level frameworks (Crawlee)** — More easily detected +3. **Docker isolation** — Doesn't help with Cloudflare + +--- + +## 🔍 Troubleshooting + +### Issue: 403 Forbidden +**Solution:** Use `playwright-stealth.js` + +### Issue: Cloudflare Challenge Page +**Solution:** +1. Increase wait time (10-15 seconds) +2. 
Try `headless: false` (headful mode sometimes has a higher success rate) +3. Consider using proxy IPs + +### Issue: Blank Page +**Solution:** +1. Increase `waitForTimeout` +2. Use `waitUntil: 'networkidle'` or `'domcontentloaded'` +3. Check if login is required + +--- + +## 📝 Memory & Experience + +### 2026-02-07 Discuss.com.hk Test Conclusions +- ✅ **Pure Playwright + Stealth** succeeded (5s, 200 OK) +- ❌ Crawlee (deep-scraper) failed (403) +- ❌ Chaser (Rust) failed (Cloudflare) +- ❌ Standard Puppeteer failed (403) + +**Best Solution:** Pure Playwright + anti-bot techniques (framework-independent) + +--- + +## 🚧 Future Improvements + +- [ ] Add proxy IP rotation +- [ ] Implement cookie management (maintain login state) +- [ ] Add CAPTCHA handling (2captcha / Anti-Captcha) +- [ ] Batch scraping (parallel URLs) +- [ ] Integration with OpenClaw's `browser` tool + +--- + +## 📚 References + +- [Playwright Official Docs](https://playwright.dev/) +- [puppeteer-extra-plugin-stealth](https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-stealth) +- [deep-scraper skill](https://clawhub.com/opsun/deep-scraper) diff --git a/_meta.json b/_meta.json new file mode 100644 index 0000000..b9b79bd --- /dev/null +++ b/_meta.json @@ -0,0 +1,6 @@ +{ + "ownerId": "kn7ag1y1zeyvc3dsrv8t12mstn80p686", + "slug": "playwright-scraper-skill", + "version": "1.2.0", + "publishedAt": 1770470666815 +} \ No newline at end of file diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..687957e --- /dev/null +++ b/examples/README.md @@ -0,0 +1,229 @@ +# Usage Examples + +## Basic Usage + +### 1. Quick Scrape (Example.com) + +```bash +node scripts/playwright-simple.js https://example.com +``` + +**Output:** +```json +{ + "title": "Example Domain", + "url": "https://example.com/", + "content": "Example Domain\n\nThis domain is for use...", + "metaDescription": "", + "elapsedSeconds": "3.42" +} +``` + +--- + +### 2. 
Anti-Bot Protected Site (Discuss.com.hk) + +```bash +node scripts/playwright-stealth.js "https://m.discuss.com.hk/#hot" +``` + +**Output:** +```json +{ + "title": "香港討論區 discuss.com.hk", + "url": "https://m.discuss.com.hk/#hot", + "htmlLength": 186345, + "contentPreview": "...", + "cloudflare": false, + "screenshot": "./screenshot-1770467444364.png", + "data": { + "links": [ + { + "text": "區議員周潔瑩疑消防通道違泊 道歉稱急於搬貨", + "href": "https://m.discuss.com.hk/index.php?action=thread&tid=32148378..." + } + ] + }, + "elapsedSeconds": "19.59" +} +``` + +--- + +## Advanced Usage + +### 3. Custom Wait Time + +```bash +WAIT_TIME=15000 node scripts/playwright-stealth.js +``` + +### 4. Show Browser (Debug Mode) + +```bash +HEADLESS=false node scripts/playwright-stealth.js +``` + +### 5. Save Screenshot and HTML + +```bash +SCREENSHOT_PATH=/tmp/my-page.png \ +SAVE_HTML=true \ +node scripts/playwright-stealth.js +``` + +### 6. Custom User-Agent + +```bash +USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" \ +node scripts/playwright-stealth.js +``` + +--- + +## Integration Examples + +### Using in Shell Scripts + +```bash +#!/bin/bash +# Run from playwright-scraper-skill directory + +URL="https://example.com" +OUTPUT_FILE="result.json" + +echo "🕷️ Starting scrape: $URL" + +node scripts/playwright-stealth.js "$URL" > "$OUTPUT_FILE" + +if [ $? -eq 0 ]; then + echo "✅ Success! 
Results saved to: $OUTPUT_FILE" +else + echo "❌ Failed" + exit 1 +fi +``` + +### Batch Scraping Multiple URLs + +```bash +#!/bin/bash + +URLS=( + "https://example.com" + "https://example.org" + "https://example.net" +) + +for url in "${URLS[@]}"; do + echo "Scraping: $url" + node scripts/playwright-stealth.js "$url" > "output_$(date +%s).json" + sleep 5 # Avoid IP blocking +done +``` + +--- + +## Calling from Node.js + +```javascript +const { spawn } = require('child_process'); + +function scrape(url) { + return new Promise((resolve, reject) => { + const proc = spawn('node', [ + 'scripts/playwright-stealth.js', + url + ]); + + let output = ''; + + proc.stdout.on('data', (data) => { + output += data.toString(); + }); + + proc.on('close', (code) => { + if (code === 0) { + try { + // Extract JSON (last line) + const lines = output.trim().split('\n'); + const json = JSON.parse(lines[lines.length - 1]); + resolve(json); + } catch (e) { + reject(e); + } + } else { + reject(new Error(`Exit code: ${code}`)); + } + }); + }); +} + +// Usage +(async () => { + const result = await scrape('https://example.com'); + console.log(result.title); +})(); +``` + +--- + +## Common Scenarios + +### Scraping News Articles + +```bash +node scripts/playwright-stealth.js "https://news.example.com/article/123" +``` + +### Scraping E-commerce Products + +```bash +WAIT_TIME=10000 \ +SAVE_HTML=true \ +node scripts/playwright-stealth.js "https://shop.example.com/product/456" +``` + +### Scraping Forum Posts + +```bash +node scripts/playwright-stealth.js "https://forum.example.com/thread/789" +``` + +--- + +## Troubleshooting + +### Issue: Page Not Fully Loaded + +**Solution:** Increase wait time +```bash +WAIT_TIME=20000 node scripts/playwright-stealth.js +``` + +### Issue: Still Blocked by Cloudflare + +**Solution:** Use headful mode + manual wait +```bash +HEADLESS=false \ +WAIT_TIME=30000 \ +node scripts/playwright-stealth.js +``` + +### Issue: Requires Login + +**Solution:** Manually login 
first, export cookies, then load +(Future feature, currently not supported) + +--- + +## Performance Tips + +1. **Parallel scraping:** Use `Promise.all()` or shell `&` +2. **Delay requests:** `sleep 5` to avoid IP blocking +3. **Use proxies:** Rotate IPs (future feature) +4. **Cache results:** Avoid duplicate scraping + +--- + +For more information, see [SKILL.md](../SKILL.md) diff --git a/examples/discuss-hk.sh b/examples/discuss-hk.sh new file mode 100644 index 0000000..31b4c38 --- /dev/null +++ b/examples/discuss-hk.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# 範例:爬取 Discuss.com.hk 熱門話題 + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SKILL_DIR="$(dirname "$SCRIPT_DIR")" + +echo "🕷️ Discuss.com.hk 爬蟲範例" +echo "" +echo "使用 Playwright Stealth(已驗證成功)" +echo "" + +cd "$SKILL_DIR" && \ +WAIT_TIME=10000 \ +SCREENSHOT_PATH=/tmp/discuss-hk.png \ +SAVE_HTML=true \ +node scripts/playwright-stealth.js "https://m.discuss.com.hk/#hot" diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..0b5b293 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,60 @@ +{ + "name": "web-scraper", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "web-scraper", + "version": "1.0.0", + "license": "MIT", + "dependencies": { + "playwright": "^1.40.0" + } + }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/playwright": { + "version": "1.58.2", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.58.2.tgz", + "integrity": "sha512-vA30H8Nvkq/cPBnNw4Q8TWz1EJyqgpuinBcHET0YVJVFldr8JDNiU9LaWAE1KqSkRYazuaBhTpB5ZzShOezQ6A==", + "license": 
"Apache-2.0", + "dependencies": { + "playwright-core": "1.58.2" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.58.2", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.58.2.tgz", + "integrity": "sha512-yZkEtftgwS8CsfYo7nm0KE8jsvm6i/PTgVtB8DL726wNf6H2IMsDuxCpJj59KDaxCtSnrWan2AeDqM7JBaultg==", + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..d4e1291 --- /dev/null +++ b/package.json @@ -0,0 +1,23 @@ +{ + "name": "playwright-scraper-skill", + "version": "1.2.0", + "description": "基於 Playwright 的網頁爬蟲 OpenClaw Skill", + "main": "scripts/playwright-stealth.js", + "scripts": { + "simple": "node scripts/playwright-simple.js", + "stealth": "node scripts/playwright-stealth.js", + "test": "bash test.sh" + }, + "keywords": [ + "scraper", + "playwright", + "puppeteer", + "cloudflare", + "anti-detection" + ], + "author": "多米", + "license": "MIT", + "dependencies": { + "playwright": "^1.40.0" + } +} diff --git a/scripts/playwright-simple.js b/scripts/playwright-simple.js new file mode 100644 index 0000000..fb96b55 --- /dev/null +++ b/scripts/playwright-simple.js @@ -0,0 +1,60 @@ +#!/usr/bin/env node +/** + * Playwright Simple Scraper + * 適用:一般動態網站,無反爬保護 + * 速度:快(3-5 秒) + * + * Usage: node playwright-simple.js + */ + +const { chromium } = require('playwright'); + +const url = process.argv[2]; +const waitTime = parseInt(process.env.WAIT_TIME || '3000'); +const screenshotPath = process.env.SCREENSHOT_PATH; + +if (!url) { + console.error('❌ 請提供 URL'); + console.error('用法: node playwright-simple.js '); + process.exit(1); +} + +(async () => { + console.log('🚀 啟動 Playwright 簡單版爬蟲...'); + const startTime = Date.now(); + + const browser = await chromium.launch({ + 
headless: process.env.HEADLESS !== 'false' + }); + const page = await browser.newPage(); + + console.log(`📱 導航到: ${url}`); + await page.goto(url, { waitUntil: 'domcontentloaded' }); + + console.log(`⏳ 等待 ${waitTime}ms...`); + await page.waitForTimeout(waitTime); + + // 擷取基本資訊 + const result = await page.evaluate(() => { + return { + title: document.title, + url: window.location.href, + content: document.body.innerText.substring(0, 5000), + metaDescription: document.querySelector('meta[name="description"]')?.content || '', + }; + }); + + // 截圖(如果指定) + if (screenshotPath) { + await page.screenshot({ path: screenshotPath }); + console.log(`📸 截圖已儲存: ${screenshotPath}`); + } + + const elapsed = ((Date.now() - startTime) / 1000).toFixed(2); + result.elapsedSeconds = elapsed; + + console.log('\n✅ 爬取完成!'); + console.log(JSON.stringify(result, null, 2)); + + await browser.close(); +})(); diff --git a/scripts/playwright-stealth.js b/scripts/playwright-stealth.js new file mode 100644 index 0000000..7beb4ee --- /dev/null +++ b/scripts/playwright-stealth.js @@ -0,0 +1,167 @@ +#!/usr/bin/env node +/** + * Playwright Stealth Scraper + * 適用:有 Cloudflare 或反爬保護的網站 + * 速度:中等(5-10 秒) + * 反爬能力:中(隱藏自動化、真實 UA) + * + * Usage: node playwright-stealth.js + * + * 環境變數: + * - HEADLESS=false 顯示瀏覽器 + * - WAIT_TIME=10000 等待時間(毫秒) + * - SCREENSHOT_PATH=... 截圖路徑 + * - SAVE_HTML=true 儲存 HTML + * - USER_AGENT=... 
custom User-Agent
+ */
+
+const { chromium } = require('playwright');
+const fs = require('fs');
+const path = require('path');
+
+const url = process.argv[2];
+const parsedWait = Number.parseInt(process.env.WAIT_TIME || '5000', 10);
+const waitTime = Number.isNaN(parsedWait) ? 5000 : parsedWait; // ignore a non-numeric WAIT_TIME
+const headless = process.env.HEADLESS !== 'false';
+const screenshotPath = process.env.SCREENSHOT_PATH || `./screenshot-${Date.now()}.png`;
+const saveHtml = process.env.SAVE_HTML === 'true';
+
+// Default User-Agent (iPhone)
+const defaultUA = 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1';
+const userAgent = process.env.USER_AGENT || defaultUA;
+
+if (!url) {
+  console.error('❌ 請提供 URL');
+  console.error('用法: node playwright-stealth.js URL');
+  process.exit(1);
+}
+
+(async () => {
+  console.log('🕷️ 啟動 Playwright Stealth 爬蟲...');
+  console.log(`🔒 反爬模式: ${headless ? '無頭' : '有頭'}`);
+  const startTime = Date.now();
+
+  const browser = await chromium.launch({
+    headless,
+    args: [
+      '--no-sandbox',
+      '--disable-setuid-sandbox',
+      '--disable-blink-features=AutomationControlled',
+      '--disable-features=IsolateOrigins,site-per-process',
+    ],
+  });
+
+  try {
+    const context = await browser.newContext({
+      userAgent,
+      locale: 'zh-HK',
+      viewport: { width: 375, height: 812 }, // iPhone size
+      extraHTTPHeaders: {
+        'Accept-Language': 'zh-HK,zh-TW;q=0.9,zh;q=0.8,en;q=0.7',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+      },
+    });
+
+    // Hide automation fingerprints; the init script runs before page scripts.
+    await context.addInitScript(() => {
+      Object.defineProperty(navigator, 'webdriver', { get: () => false });
+
+      window.chrome = { runtime: {} };
+
+      // Mock the notifications permission. query() must stay bound to the
+      // Permissions object; calling it detached throws "Illegal invocation".
+      const permissions = window.navigator.permissions;
+      const originalQuery = permissions.query.bind(permissions);
+      permissions.query = (parameters) => (
+        parameters.name === 'notifications' ?
+          Promise.resolve({ state: Notification.permission }) :
+          originalQuery(parameters)
+      );
+    });
+
+    const page = await context.newPage();
+
+    console.log(`📱 導航到: ${url}`);
+    try {
+      const response = await page.goto(url, {
+        waitUntil: 'domcontentloaded',
+        timeout: 30000,
+      });
+      // goto() resolves to null for about:blank and same-URL hash navigations.
+      const status = response ? response.status() : null;
+      console.log(`📡 HTTP Status: ${status ?? 'N/A'}`);
+      if (status === 403) {
+        console.log('⚠️ 收到 403,但繼續嘗試...');
+      }
+    } catch (error) {
+      console.error(`❌ 導航失敗: ${error.message}`);
+    }
+
+    console.log(`⏳ 等待 ${waitTime}ms 讓內容載入...`);
+    await page.waitForTimeout(waitTime);
+
+    // Check for a Cloudflare challenge page.
+    const cloudflare = await page.evaluate(() => {
+      const text = document.body?.innerText ?? '';
+      return text.includes('Checking your browser') ||
+        text.includes('Just a moment') ||
+        document.querySelector('iframe[src*="challenges.cloudflare.com"]') !== null;
+    });
+
+    if (cloudflare) {
+      console.log('🛡️ 偵測到 Cloudflare 挑戰,等待額外 10 秒...');
+      await page.waitForTimeout(10000);
+    }
+
+    // Extract page information.
+    const result = await page.evaluate(() => ({
+      title: document.title,
+      url: window.location.href,
+      htmlLength: document.documentElement.outerHTML.length,
+      contentPreview: (document.body?.innerText ?? '').substring(0, 1000),
+    }));
+    result.cloudflare = cloudflare;
+
+    // Screenshot (best effort: a failure is reported, not fatal).
+    try {
+      await page.screenshot({ path: screenshotPath, fullPage: false, timeout: 10000 });
+      console.log(`📸 截圖已儲存: ${screenshotPath}`);
+      result.screenshot = screenshotPath;
+    } catch (error) {
+      console.log(`⚠️ 截圖失敗: ${error.message}`);
+      result.screenshot = null;
+    }
+
+    // Save HTML (if requested).
+    if (saveHtml) {
+      // Swap only the file extension: a regex over the whole path turns
+      // "./shot" into ".html" and leaves an extension-less "shot" unchanged.
+      const { root, dir, name } = path.parse(screenshotPath);
+      const htmlPath = path.format({ root, dir, name, ext: '.html' });
+      fs.writeFileSync(htmlPath, await page.content());
+      console.log(`📄 HTML 已儲存: ${htmlPath}`);
+      result.htmlFile = htmlPath;
+    }
+
+    // Try to extract structured data (adjust per site).
+    result.data = await page.evaluate(() => {
+      // Example: collect up to 10 links whose href contains "tid=".
+      const links = Array.from(document.querySelectorAll('a[href*="tid="]'))
+        .slice(0, 10)
+        .map((a) => ({
+          text: a.innerText.trim().substring(0, 100),
+          href: a.href,
+        }));
+      return { links };
+    });
+
+    result.elapsedSeconds = ((Date.now() - startTime) / 1000).toFixed(2);
+    console.log('\n✅ 爬取完成!');
+    console.log(JSON.stringify(result, null, 2));
+  } finally {
+    await browser.close(); // release the browser even if a step above threw
+  }
+})().catch((error) => {
+  console.error(`❌ 爬取失敗: ${error.message}`);
+  process.exitCode = 1;
+});
diff --git a/test.sh b/test.sh
new file mode 100644
index 0000000..cd1fab4
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Simple smoke-test script
+
+set -e
+
+# Keep every artifact in a private temp dir that is removed on any exit, so a
+# failed run leaves nothing behind and no unrelated files are ever deleted.
+TMP_DIR="$(mktemp -d)"
+trap 'rm -rf "$TMP_DIR"' EXIT
+
+echo "🧪 Playwright Scraper Skill 測試"
+echo ""
+
+# Test 1: Playwright Simple
+echo "📝 測試 1: Playwright Simple (Example.com)"
+node scripts/playwright-simple.js https://example.com > "$TMP_DIR/test-simple.json"
+if grep -q "Example Domain" "$TMP_DIR/test-simple.json"; then
+  echo "✅ Simple 模式正常"
+else
+  echo "❌ Simple 模式失敗"
+  exit 1
+fi
+echo ""
+
+# Test 2: Playwright Stealth (its screenshot is redirected into the temp dir)
+echo "📝 測試 2: Playwright Stealth (Example.com)"
+SCREENSHOT_PATH="$TMP_DIR/stealth.png" \
+  node scripts/playwright-stealth.js https://example.com > "$TMP_DIR/test-stealth.json"
+if grep -q "Example Domain" "$TMP_DIR/test-stealth.json"; then
+  echo "✅ Stealth 模式正常"
+else
+  echo "❌ Stealth 模式失敗"
+  exit 1
+fi
+echo ""
+
+# Test 3: environment variables
+echo "📝 測試 3: 環境變數 (WAIT_TIME)"
+WAIT_TIME=1000 node scripts/playwright-simple.js https://example.com > "$TMP_DIR/test-env.json"
+if grep -q "Example Domain" "$TMP_DIR/test-env.json"; then
+  echo "✅ 環境變數正常"
+else
+  echo "❌ 環境變數失敗"
+  exit 1
+fi
+echo ""
+
+echo "✅ 所有測試通過!"