Initial commit with translated description

This commit is contained in:
2026-03-29 14:36:54 +08:00
commit 35c34a7952
14 changed files with 1529 additions and 0 deletions

View File

@@ -0,0 +1,60 @@
#!/usr/bin/env node
/**
* Playwright Simple Scraper
* 適用:一般動態網站,無反爬保護
* 速度3-5 秒)
*
* Usage: node playwright-simple.js <URL>
*/
const { chromium } = require('playwright');
const url = process.argv[2];
const waitTime = parseInt(process.env.WAIT_TIME || '3000');
const screenshotPath = process.env.SCREENSHOT_PATH;
if (!url) {
console.error('❌ 請提供 URL');
console.error('用法: node playwright-simple.js <URL>');
process.exit(1);
}
(async () => {
console.log('🚀 啟動 Playwright 簡單版爬蟲...');
const startTime = Date.now();
const browser = await chromium.launch({
headless: process.env.HEADLESS !== 'false'
});
const page = await browser.newPage();
console.log(`📱 導航到: ${url}`);
await page.goto(url, { waitUntil: 'domcontentloaded' });
console.log(`⏳ 等待 ${waitTime}ms...`);
await page.waitForTimeout(waitTime);
// 擷取基本資訊
const result = await page.evaluate(() => {
return {
title: document.title,
url: window.location.href,
content: document.body.innerText.substring(0, 5000),
metaDescription: document.querySelector('meta[name="description"]')?.content || '',
};
});
// 截圖(如果指定)
if (screenshotPath) {
await page.screenshot({ path: screenshotPath });
console.log(`📸 截圖已儲存: ${screenshotPath}`);
}
const elapsed = ((Date.now() - startTime) / 1000).toFixed(2);
result.elapsedSeconds = elapsed;
console.log('\n✅ 爬取完成!');
console.log(JSON.stringify(result, null, 2));
await browser.close();
})();

View File

@@ -0,0 +1,167 @@
#!/usr/bin/env node
/**
* Playwright Stealth Scraper
* 適用:有 Cloudflare 或反爬保護的網站
* 速度中等5-10 秒)
* 反爬能力:中(隱藏自動化、真實 UA
*
* Usage: node playwright-stealth.js <URL>
*
* 環境變數:
* - HEADLESS=false 顯示瀏覽器
* - WAIT_TIME=10000 等待時間(毫秒)
* - SCREENSHOT_PATH=... 截圖路徑
* - SAVE_HTML=true 儲存 HTML
* - USER_AGENT=... 自訂 User-Agent
*/
const { chromium } = require('playwright');
const fs = require('fs');
const path = require('path');
const url = process.argv[2];
const waitTime = parseInt(process.env.WAIT_TIME || '5000');
const headless = process.env.HEADLESS !== 'false';
const screenshotPath = process.env.SCREENSHOT_PATH || `./screenshot-${Date.now()}.png`;
const saveHtml = process.env.SAVE_HTML === 'true';
// 預設 User-AgentiPhone
const defaultUA = 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1';
const userAgent = process.env.USER_AGENT || defaultUA;
if (!url) {
console.error('❌ 請提供 URL');
console.error('用法: node playwright-stealth.js <URL>');
process.exit(1);
}
(async () => {
console.log('🕷️ 啟動 Playwright Stealth 爬蟲...');
console.log(`🔒 反爬模式: ${headless ? '無頭' : '有頭'}`);
const startTime = Date.now();
const browser = await chromium.launch({
headless: headless,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-blink-features=AutomationControlled',
'--disable-features=IsolateOrigins,site-per-process',
],
});
const context = await browser.newContext({
userAgent: userAgent,
locale: 'zh-HK',
viewport: { width: 375, height: 812 }, // iPhone size
extraHTTPHeaders: {
'Accept-Language': 'zh-HK,zh-TW;q=0.9,zh;q=0.8,en;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
},
});
// 隱藏自動化特徵
await context.addInitScript(() => {
Object.defineProperty(navigator, 'webdriver', {
get: () => false,
});
window.chrome = { runtime: {} };
// Mock permissions
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters) => (
parameters.name === 'notifications' ?
Promise.resolve({ state: Notification.permission }) :
originalQuery(parameters)
);
});
const page = await context.newPage();
console.log(`📱 導航到: ${url}`);
try {
const response = await page.goto(url, {
waitUntil: 'domcontentloaded',
timeout: 30000,
});
console.log(`📡 HTTP Status: ${response.status()}`);
if (response.status() === 403) {
console.log('⚠️ 收到 403但繼續嘗試...');
}
} catch (error) {
console.error(`❌ 導航失敗: ${error.message}`);
}
console.log(`⏳ 等待 ${waitTime}ms 讓內容載入...`);
await page.waitForTimeout(waitTime);
// 檢查 Cloudflare
const cloudflare = await page.evaluate(() => {
return document.body.innerText.includes('Checking your browser') ||
document.body.innerText.includes('Just a moment') ||
document.querySelector('iframe[src*="challenges.cloudflare.com"]') !== null;
});
if (cloudflare) {
console.log('🛡️ 偵測到 Cloudflare 挑戰,等待額外 10 秒...');
await page.waitForTimeout(10000);
}
// 擷取資訊
const result = await page.evaluate(() => {
return {
title: document.title,
url: window.location.href,
htmlLength: document.documentElement.outerHTML.length,
contentPreview: document.body.innerText.substring(0, 1000),
};
});
result.cloudflare = cloudflare;
// 截圖
try {
await page.screenshot({ path: screenshotPath, fullPage: false, timeout: 10000 });
console.log(`📸 截圖已儲存: ${screenshotPath}`);
result.screenshot = screenshotPath;
} catch (error) {
console.log(`⚠️ 截圖失敗: ${error.message}`);
result.screenshot = null;
}
// 儲存 HTML如果需要
if (saveHtml) {
const htmlPath = screenshotPath.replace(/\.[^.]+$/, '.html');
const html = await page.content();
fs.writeFileSync(htmlPath, html);
console.log(`📄 HTML 已儲存: ${htmlPath}`);
result.htmlFile = htmlPath;
}
// 嘗試提取結構化資料(依網站調整)
const customData = await page.evaluate(() => {
// 範例:提取所有連結
const links = Array.from(document.querySelectorAll('a[href*="tid="]'))
.slice(0, 10)
.map(a => ({
text: a.innerText.trim().substring(0, 100),
href: a.href,
}));
return { links };
});
result.data = customData;
const elapsed = ((Date.now() - startTime) / 1000).toFixed(2);
result.elapsedSeconds = elapsed;
console.log('\n✅ 爬取完成!');
console.log(JSON.stringify(result, null, 2));
await browser.close();
})();