Files
waisimon_playwright-scraper…/scripts/playwright-stealth.js

168 lines
5.4 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
/**
* Playwright Stealth Scraper
* 適用:有 Cloudflare 或反爬保護的網站
* 速度中等5-10 秒)
* 反爬能力:中(隱藏自動化、真實 UA
*
* Usage: node playwright-stealth.js <URL>
*
* 環境變數:
* - HEADLESS=false 顯示瀏覽器
* - WAIT_TIME=10000 等待時間(毫秒)
* - SCREENSHOT_PATH=... 截圖路徑
* - SAVE_HTML=true 儲存 HTML
* - USER_AGENT=... 自訂 User-Agent
*/
const { chromium } = require('playwright');
const fs = require('fs');
const path = require('path');
const url = process.argv[2];
const waitTime = parseInt(process.env.WAIT_TIME || '5000');
const headless = process.env.HEADLESS !== 'false';
const screenshotPath = process.env.SCREENSHOT_PATH || `./screenshot-${Date.now()}.png`;
const saveHtml = process.env.SAVE_HTML === 'true';
// 預設 User-AgentiPhone
const defaultUA = 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1';
const userAgent = process.env.USER_AGENT || defaultUA;
if (!url) {
console.error('❌ 請提供 URL');
console.error('用法: node playwright-stealth.js <URL>');
process.exit(1);
}
(async () => {
console.log('🕷️ 啟動 Playwright Stealth 爬蟲...');
console.log(`🔒 反爬模式: ${headless ? '無頭' : '有頭'}`);
const startTime = Date.now();
const browser = await chromium.launch({
headless: headless,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-blink-features=AutomationControlled',
'--disable-features=IsolateOrigins,site-per-process',
],
});
const context = await browser.newContext({
userAgent: userAgent,
locale: 'zh-HK',
viewport: { width: 375, height: 812 }, // iPhone size
extraHTTPHeaders: {
'Accept-Language': 'zh-HK,zh-TW;q=0.9,zh;q=0.8,en;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
},
});
// 隱藏自動化特徵
await context.addInitScript(() => {
Object.defineProperty(navigator, 'webdriver', {
get: () => false,
});
window.chrome = { runtime: {} };
// Mock permissions
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters) => (
parameters.name === 'notifications' ?
Promise.resolve({ state: Notification.permission }) :
originalQuery(parameters)
);
});
const page = await context.newPage();
console.log(`📱 導航到: ${url}`);
try {
const response = await page.goto(url, {
waitUntil: 'domcontentloaded',
timeout: 30000,
});
console.log(`📡 HTTP Status: ${response.status()}`);
if (response.status() === 403) {
console.log('⚠️ 收到 403但繼續嘗試...');
}
} catch (error) {
console.error(`❌ 導航失敗: ${error.message}`);
}
console.log(`⏳ 等待 ${waitTime}ms 讓內容載入...`);
await page.waitForTimeout(waitTime);
// 檢查 Cloudflare
const cloudflare = await page.evaluate(() => {
return document.body.innerText.includes('Checking your browser') ||
document.body.innerText.includes('Just a moment') ||
document.querySelector('iframe[src*="challenges.cloudflare.com"]') !== null;
});
if (cloudflare) {
console.log('🛡️ 偵測到 Cloudflare 挑戰,等待額外 10 秒...');
await page.waitForTimeout(10000);
}
// 擷取資訊
const result = await page.evaluate(() => {
return {
title: document.title,
url: window.location.href,
htmlLength: document.documentElement.outerHTML.length,
contentPreview: document.body.innerText.substring(0, 1000),
};
});
result.cloudflare = cloudflare;
// 截圖
try {
await page.screenshot({ path: screenshotPath, fullPage: false, timeout: 10000 });
console.log(`📸 截圖已儲存: ${screenshotPath}`);
result.screenshot = screenshotPath;
} catch (error) {
console.log(`⚠️ 截圖失敗: ${error.message}`);
result.screenshot = null;
}
// 儲存 HTML如果需要
if (saveHtml) {
const htmlPath = screenshotPath.replace(/\.[^.]+$/, '.html');
const html = await page.content();
fs.writeFileSync(htmlPath, html);
console.log(`📄 HTML 已儲存: ${htmlPath}`);
result.htmlFile = htmlPath;
}
// 嘗試提取結構化資料(依網站調整)
const customData = await page.evaluate(() => {
// 範例:提取所有連結
const links = Array.from(document.querySelectorAll('a[href*="tid="]'))
.slice(0, 10)
.map(a => ({
text: a.innerText.trim().substring(0, 100),
href: a.href,
}));
return { links };
});
result.data = customData;
const elapsed = ((Date.now() - startTime) / 1000).toFixed(2);
result.elapsedSeconds = elapsed;
console.log('\n✅ 爬取完成!');
console.log(JSON.stringify(result, null, 2));
await browser.close();
})();