Files
yuhangch_cctv-news-fetcher/scripts/news_crawler.js

171 lines
6.7 KiB
JavaScript
Raw Normal View History

const { parse } = require('node-html-parser');
async function fetchOlderNews(date) {
const url = `https://cctv.cntv.cn/lm/xinwenlianbo/${date}.shtml`;
try {
const response = await fetch(url);
const text = await response.text();
const rawList = text.match(/title_array_01\((.*)/g) || [];
const pageUrls = rawList.slice(1).map(item => item.match(/(http.*)/)?.[0].split('\'')[0] || '');
const headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'no-cache',
'Cookie': 'cna=DLYSGBDthG4CAbRVCNxSxGT6',
'Host': 'tv.cctv.com',
'Pragma': 'no-cache',
'Proxy-Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
};
const data = await Promise.all(pageUrls.map(async pageUrl => {
try {
const pageResponse = await fetch(pageUrl, { headers });
const pageText = await pageResponse.text();
const soup = parse(pageText);
const title = soup.querySelector('h3')?.text.replace('[视频]', '').trim() || '';
const content = soup.querySelector('.cnt_bd')?.text.replace(/\n/g, ' ').trim() || '';
return { date, title, content };
} catch (err) {
console.error(`Error fetching page ${pageUrl}:`, err.message);
return null;
}
}));
return data.filter(item => item !== null);
} catch (err) {
console.error(`Error fetching older news for ${date}:`, err.message);
return [];
}
}
async function fetchMidNews(date) {
const url = `https://cctv.cntv.cn/lm/xinwenlianbo/${date}.shtml`;
try {
const response = await fetch(url);
const text = await response.text();
const soup = parse(text);
const pageUrls = soup.querySelectorAll('#contentELMT1368521805488378 li a')
.slice(1)
.map(a => a.getAttribute('href') || '');
const headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'no-cache',
'Cookie': 'cna=DLYSGBDthG4CAbRVCNxSxGT6',
'Host': 'tv.cctv.com',
'Pragma': 'no-cache',
'Proxy-Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
};
const data = await Promise.all(pageUrls.map(async pageUrl => {
try {
const pageResponse = await fetch(pageUrl, { headers });
const pageText = await pageResponse.text();
const soup = parse(pageText);
const title = soup.querySelector('h3')?.text.replace('[视频]', '').trim() || '';
const content = soup.querySelector('.cnt_bd')?.text.replace(/\n/g, ' ').trim() || '';
return { date, title, content };
} catch (err) {
console.error(`Error fetching page ${pageUrl}:`, err.message);
return null;
}
}));
return data.filter(item => item !== null);
} catch (err) {
console.error(`Error fetching mid news for ${date}:`, err.message);
return [];
}
}
async function fetchRecentNews(date) {
const url = `https://tv.cctv.com/lm/xwlb/day/${date}.shtml`;
try {
const response = await fetch(url);
const text = await response.text();
const soup = parse(text);
const pageUrls = soup.querySelectorAll('li a').slice(1).map(a => a.getAttribute('href') || '');
const headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'no-cache',
'Cookie': 'cna=DLYSGBDthG4CAbRVCNxSxGT6',
'Host': 'tv.cctv.com',
'Pragma': 'no-cache',
'Proxy-Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
};
const data = await Promise.all(pageUrls.map(async pageUrl => {
try {
const pageResponse = await fetch(pageUrl, { headers });
const pageText = await pageResponse.text();
const soup = parse(pageText);
const title = soup.querySelector('h3')?.text.replace('[视频]', '').trim() || soup.querySelector('.tit')?.text.trim() || '';
const content = soup.querySelector('.cnt_bd')?.text.replace(/\n/g, ' ').trim() || soup.querySelector('.content_area')?.text.trim() || '';
return { date, title, content };
} catch (err) {
console.error(`Error fetching page ${pageUrl}:`, err.message);
return null;
}
}));
return data.filter(item => item !== null);
} catch (err) {
console.error(`Error fetching recent news for ${date}:`, err.message);
return [];
}
}
async function main() {
let date = process.argv[2];
if (!date) {
const today = new Date();
date = today.toISOString().slice(0, 10).replace(/-/g, '');
}
console.log(`Fetching news for date: ${date}...`);
// Try recent first, as per original logic
let news = await fetchRecentNews(date);
if (news.length === 0) {
console.log("No news found via recent crawler, trying mid...");
news = await fetchMidNews(date);
}
if (news.length === 0) {
console.log("No news found via mid crawler, trying older...");
news = await fetchOlderNews(date);
}
console.log(JSON.stringify(news, null, 2));
}
if (require.main === module) {
main().catch(err => {
console.error("Critical error:", err);
process.exit(1);
});
}
module.exports = {
fetchRecentNews,
fetchMidNews,
fetchOlderNews
};