171 lines
6.7 KiB
JavaScript
171 lines
6.7 KiB
JavaScript
|
|
const { parse } = require('node-html-parser');
|
||
|
|
|
||
|
|
async function fetchOlderNews(date) {
|
||
|
|
const url = `https://cctv.cntv.cn/lm/xinwenlianbo/${date}.shtml`;
|
||
|
|
try {
|
||
|
|
const response = await fetch(url);
|
||
|
|
const text = await response.text();
|
||
|
|
|
||
|
|
const rawList = text.match(/title_array_01\((.*)/g) || [];
|
||
|
|
const pageUrls = rawList.slice(1).map(item => item.match(/(http.*)/)?.[0].split('\'')[0] || '');
|
||
|
|
|
||
|
|
const headers = {
|
||
|
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
|
||
|
|
'Accept-Encoding': 'gzip, deflate',
|
||
|
|
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
||
|
|
'Cache-Control': 'no-cache',
|
||
|
|
'Cookie': 'cna=DLYSGBDthG4CAbRVCNxSxGT6',
|
||
|
|
'Host': 'tv.cctv.com',
|
||
|
|
'Pragma': 'no-cache',
|
||
|
|
'Proxy-Connection': 'keep-alive',
|
||
|
|
'Upgrade-Insecure-Requests': '1',
|
||
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
|
||
|
|
};
|
||
|
|
|
||
|
|
const data = await Promise.all(pageUrls.map(async pageUrl => {
|
||
|
|
try {
|
||
|
|
const pageResponse = await fetch(pageUrl, { headers });
|
||
|
|
const pageText = await pageResponse.text();
|
||
|
|
const soup = parse(pageText);
|
||
|
|
const title = soup.querySelector('h3')?.text.replace('[视频]', '').trim() || '';
|
||
|
|
const content = soup.querySelector('.cnt_bd')?.text.replace(/\n/g, ' ').trim() || '';
|
||
|
|
return { date, title, content };
|
||
|
|
} catch (err) {
|
||
|
|
console.error(`Error fetching page ${pageUrl}:`, err.message);
|
||
|
|
return null;
|
||
|
|
}
|
||
|
|
}));
|
||
|
|
|
||
|
|
return data.filter(item => item !== null);
|
||
|
|
} catch (err) {
|
||
|
|
console.error(`Error fetching older news for ${date}:`, err.message);
|
||
|
|
return [];
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
async function fetchMidNews(date) {
|
||
|
|
const url = `https://cctv.cntv.cn/lm/xinwenlianbo/${date}.shtml`;
|
||
|
|
try {
|
||
|
|
const response = await fetch(url);
|
||
|
|
const text = await response.text();
|
||
|
|
const soup = parse(text);
|
||
|
|
|
||
|
|
const pageUrls = soup.querySelectorAll('#contentELMT1368521805488378 li a')
|
||
|
|
.slice(1)
|
||
|
|
.map(a => a.getAttribute('href') || '');
|
||
|
|
|
||
|
|
const headers = {
|
||
|
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
|
||
|
|
'Accept-Encoding': 'gzip, deflate',
|
||
|
|
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
||
|
|
'Cache-Control': 'no-cache',
|
||
|
|
'Cookie': 'cna=DLYSGBDthG4CAbRVCNxSxGT6',
|
||
|
|
'Host': 'tv.cctv.com',
|
||
|
|
'Pragma': 'no-cache',
|
||
|
|
'Proxy-Connection': 'keep-alive',
|
||
|
|
'Upgrade-Insecure-Requests': '1',
|
||
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
|
||
|
|
};
|
||
|
|
|
||
|
|
const data = await Promise.all(pageUrls.map(async pageUrl => {
|
||
|
|
try {
|
||
|
|
const pageResponse = await fetch(pageUrl, { headers });
|
||
|
|
const pageText = await pageResponse.text();
|
||
|
|
const soup = parse(pageText);
|
||
|
|
const title = soup.querySelector('h3')?.text.replace('[视频]', '').trim() || '';
|
||
|
|
const content = soup.querySelector('.cnt_bd')?.text.replace(/\n/g, ' ').trim() || '';
|
||
|
|
return { date, title, content };
|
||
|
|
} catch (err) {
|
||
|
|
console.error(`Error fetching page ${pageUrl}:`, err.message);
|
||
|
|
return null;
|
||
|
|
}
|
||
|
|
}));
|
||
|
|
|
||
|
|
return data.filter(item => item !== null);
|
||
|
|
} catch (err) {
|
||
|
|
console.error(`Error fetching mid news for ${date}:`, err.message);
|
||
|
|
return [];
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
async function fetchRecentNews(date) {
|
||
|
|
const url = `https://tv.cctv.com/lm/xwlb/day/${date}.shtml`;
|
||
|
|
try {
|
||
|
|
const response = await fetch(url);
|
||
|
|
const text = await response.text();
|
||
|
|
const soup = parse(text);
|
||
|
|
|
||
|
|
const pageUrls = soup.querySelectorAll('li a').slice(1).map(a => a.getAttribute('href') || '');
|
||
|
|
|
||
|
|
const headers = {
|
||
|
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
|
||
|
|
'Accept-Encoding': 'gzip, deflate',
|
||
|
|
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
||
|
|
'Cache-Control': 'no-cache',
|
||
|
|
'Cookie': 'cna=DLYSGBDthG4CAbRVCNxSxGT6',
|
||
|
|
'Host': 'tv.cctv.com',
|
||
|
|
'Pragma': 'no-cache',
|
||
|
|
'Proxy-Connection': 'keep-alive',
|
||
|
|
'Upgrade-Insecure-Requests': '1',
|
||
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
|
||
|
|
};
|
||
|
|
|
||
|
|
const data = await Promise.all(pageUrls.map(async pageUrl => {
|
||
|
|
try {
|
||
|
|
const pageResponse = await fetch(pageUrl, { headers });
|
||
|
|
const pageText = await pageResponse.text();
|
||
|
|
const soup = parse(pageText);
|
||
|
|
const title = soup.querySelector('h3')?.text.replace('[视频]', '').trim() || soup.querySelector('.tit')?.text.trim() || '';
|
||
|
|
const content = soup.querySelector('.cnt_bd')?.text.replace(/\n/g, ' ').trim() || soup.querySelector('.content_area')?.text.trim() || '';
|
||
|
|
return { date, title, content };
|
||
|
|
} catch (err) {
|
||
|
|
console.error(`Error fetching page ${pageUrl}:`, err.message);
|
||
|
|
return null;
|
||
|
|
}
|
||
|
|
}));
|
||
|
|
|
||
|
|
return data.filter(item => item !== null);
|
||
|
|
} catch (err) {
|
||
|
|
console.error(`Error fetching recent news for ${date}:`, err.message);
|
||
|
|
return [];
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
async function main() {
|
||
|
|
let date = process.argv[2];
|
||
|
|
if (!date) {
|
||
|
|
const today = new Date();
|
||
|
|
date = today.toISOString().slice(0, 10).replace(/-/g, '');
|
||
|
|
}
|
||
|
|
|
||
|
|
console.log(`Fetching news for date: ${date}...`);
|
||
|
|
|
||
|
|
// Try recent first, as per original logic
|
||
|
|
let news = await fetchRecentNews(date);
|
||
|
|
|
||
|
|
if (news.length === 0) {
|
||
|
|
console.log("No news found via recent crawler, trying mid...");
|
||
|
|
news = await fetchMidNews(date);
|
||
|
|
}
|
||
|
|
|
||
|
|
if (news.length === 0) {
|
||
|
|
console.log("No news found via mid crawler, trying older...");
|
||
|
|
news = await fetchOlderNews(date);
|
||
|
|
}
|
||
|
|
|
||
|
|
console.log(JSON.stringify(news, null, 2));
|
||
|
|
}
|
||
|
|
|
||
|
|
if (require.main === module) {
|
||
|
|
main().catch(err => {
|
||
|
|
console.error("Critical error:", err);
|
||
|
|
process.exit(1);
|
||
|
|
});
|
||
|
|
}
|
||
|
|
|
||
|
|
module.exports = {
|
||
|
|
fetchRecentNews,
|
||
|
|
fetchMidNews,
|
||
|
|
fetchOlderNews
|
||
|
|
};
|