Initial commit with translated description
This commit is contained in:
170
scripts/news_crawler.js
Normal file
170
scripts/news_crawler.js
Normal file
@@ -0,0 +1,170 @@
|
||||
const { parse } = require('node-html-parser');
|
||||
|
||||
/**
 * Crawl one day's news items from the script-generated index page on
 * cctv.cntv.cn.
 *
 * The index carries its article links inside `title_array_01(...)` calls, so
 * they are pulled out with a regular expression instead of through the DOM.
 *
 * @param {string} date - Day to fetch; interpolated verbatim into the index
 *   URL (the CLI in this file passes a YYYYMMDD string).
 * @returns {Promise<Array<{date: string, title: string, content: string}>>}
 *   One entry per article that was fetched successfully. Failures are logged
 *   with console.error; a failing index yields an empty array and a failing
 *   article is simply left out.
 */
async function fetchOlderNews(date) {
  const url = `https://cctv.cntv.cn/lm/xinwenlianbo/${date}.shtml`;

  // Browser-like headers for the article requests. There is deliberately no
  // `Host` entry: fetch derives it from each URL, and a fixed value would be
  // wrong for every article that is not served from that one host.
  const headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Cookie': 'cna=DLYSGBDthG4CAbRVCNxSxGT6',
    'Pragma': 'no-cache',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
  };

  try {
    const response = await fetch(url);
    if (!response.ok) {
      // An error page must not be mistaken for an (empty) index.
      throw new Error(`HTTP ${response.status}`);
    }
    const text = await response.text();

    const rawList = text.match(/title_array_01\((.*)/g) || [];
    // The first match is skipped.
    // NOTE(review): presumably it is not a regular news item — confirm.
    const pageUrls = rawList
      .slice(1)
      // Keep everything from "http" up to the closing single quote.
      .map((item) => item.match(/(http.*)/)?.[0].split('\'')[0] || '')
      // A call without a URL would only make fetch('') throw; drop it here.
      .filter((pageUrl) => pageUrl !== '');

    const data = await Promise.all(pageUrls.map(async (pageUrl) => {
      try {
        const pageResponse = await fetch(pageUrl, { headers });
        if (!pageResponse.ok) {
          // Do not emit an entry scraped from a 404/5xx error page.
          throw new Error(`HTTP ${pageResponse.status}`);
        }
        const pageText = await pageResponse.text();
        const soup = parse(pageText);
        const title = soup.querySelector('h3')?.text.replace('[视频]', '').trim() || '';
        const content = soup.querySelector('.cnt_bd')?.text.replace(/\n/g, ' ').trim() || '';
        return { date, title, content };
      } catch (err) {
        // One broken article must not discard the rest of the day.
        console.error(`Error fetching page ${pageUrl}:`, err.message);
        return null;
      }
    }));

    return data.filter((item) => item !== null);
  } catch (err) {
    console.error(`Error fetching older news for ${date}:`, err.message);
    return [];
  }
}
|
||||
|
||||
/**
 * Crawl one day's news items from the cctv.cntv.cn index page whose article
 * links live in the `#contentELMT1368521805488378` list.
 *
 * @param {string} date - Day to fetch; interpolated verbatim into the index
 *   URL (the CLI in this file passes a YYYYMMDD string).
 * @returns {Promise<Array<{date: string, title: string, content: string}>>}
 *   One entry per article that was fetched successfully. Failures are logged
 *   with console.error; a failing index yields an empty array and a failing
 *   article is simply left out.
 */
async function fetchMidNews(date) {
  const url = `https://cctv.cntv.cn/lm/xinwenlianbo/${date}.shtml`;

  // Browser-like headers for the article requests. There is deliberately no
  // `Host` entry: fetch derives it from each URL, and a fixed value would be
  // wrong for every article that is not served from that one host.
  const headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Cookie': 'cna=DLYSGBDthG4CAbRVCNxSxGT6',
    'Pragma': 'no-cache',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
  };

  try {
    const response = await fetch(url);
    if (!response.ok) {
      // An error page must not be mistaken for an (empty) index.
      throw new Error(`HTTP ${response.status}`);
    }
    const text = await response.text();
    const soup = parse(text);

    // The first link is skipped.
    // NOTE(review): presumably it is not a regular news item — confirm.
    const pageUrls = soup.querySelectorAll('#contentELMT1368521805488378 li a')
      .slice(1)
      .map((a) => a.getAttribute('href') || '')
      // Anchors without an href would only make fetch('') throw.
      .filter((href) => href !== '')
      // Node's fetch has no document base, so relative and protocol-relative
      // links have to be resolved against the index URL first.
      .map((href) => {
        try {
          return new URL(href, url);
        } catch {
          return null;
        }
      })
      // Only http(s) targets can be crawled (drops e.g. `javascript:` links).
      .filter((resolved) => resolved !== null && /^https?:$/.test(resolved.protocol))
      .map((resolved) => resolved.href);

    const data = await Promise.all(pageUrls.map(async (pageUrl) => {
      try {
        const pageResponse = await fetch(pageUrl, { headers });
        if (!pageResponse.ok) {
          // Do not emit an entry scraped from a 404/5xx error page.
          throw new Error(`HTTP ${pageResponse.status}`);
        }
        const pageText = await pageResponse.text();
        const page = parse(pageText);
        const title = page.querySelector('h3')?.text.replace('[视频]', '').trim() || '';
        const content = page.querySelector('.cnt_bd')?.text.replace(/\n/g, ' ').trim() || '';
        return { date, title, content };
      } catch (err) {
        // One broken article must not discard the rest of the day.
        console.error(`Error fetching page ${pageUrl}:`, err.message);
        return null;
      }
    }));

    return data.filter((item) => item !== null);
  } catch (err) {
    console.error(`Error fetching mid news for ${date}:`, err.message);
    return [];
  }
}
|
||||
|
||||
/**
 * Crawl one day's news items from the tv.cctv.com daily index page.
 *
 * Article pages are scraped with two selector pairs: `h3` / `.cnt_bd` first,
 * then `.tit` / `.content_area` when the first pair yields nothing.
 *
 * @param {string} date - Day to fetch; interpolated verbatim into the index
 *   URL (the CLI in this file passes a YYYYMMDD string).
 * @returns {Promise<Array<{date: string, title: string, content: string}>>}
 *   One entry per article that was fetched successfully. Failures are logged
 *   with console.error; a failing index yields an empty array and a failing
 *   article is simply left out.
 */
async function fetchRecentNews(date) {
  const url = `https://tv.cctv.com/lm/xwlb/day/${date}.shtml`;

  // Browser-like headers for the article requests. There is deliberately no
  // `Host` entry: fetch derives it from each URL, and a fixed value would be
  // wrong for every article that is not served from that one host.
  const headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Cookie': 'cna=DLYSGBDthG4CAbRVCNxSxGT6',
    'Pragma': 'no-cache',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
  };

  try {
    const response = await fetch(url);
    if (!response.ok) {
      // The `li a` selector below is broad enough to match links on an error
      // page, so a non-OK index has to be rejected before it is parsed.
      throw new Error(`HTTP ${response.status}`);
    }
    const text = await response.text();
    const soup = parse(text);

    // The first link is skipped.
    // NOTE(review): presumably it is not a regular news item — confirm.
    const pageUrls = soup.querySelectorAll('li a')
      .slice(1)
      .map((a) => a.getAttribute('href') || '')
      // Anchors without an href would only make fetch('') throw.
      .filter((href) => href !== '')
      // Node's fetch has no document base, so relative and protocol-relative
      // links have to be resolved against the index URL first.
      .map((href) => {
        try {
          return new URL(href, url);
        } catch {
          return null;
        }
      })
      // Only http(s) targets can be crawled (drops e.g. `javascript:` links).
      .filter((resolved) => resolved !== null && /^https?:$/.test(resolved.protocol))
      .map((resolved) => resolved.href);

    const data = await Promise.all(pageUrls.map(async (pageUrl) => {
      try {
        const pageResponse = await fetch(pageUrl, { headers });
        if (!pageResponse.ok) {
          // Do not emit an entry scraped from a 404/5xx error page.
          throw new Error(`HTTP ${pageResponse.status}`);
        }
        const pageText = await pageResponse.text();
        const page = parse(pageText);
        const title = page.querySelector('h3')?.text.replace('[视频]', '').trim()
          || page.querySelector('.tit')?.text.trim()
          || '';
        const content = page.querySelector('.cnt_bd')?.text.replace(/\n/g, ' ').trim()
          || page.querySelector('.content_area')?.text.trim()
          || '';
        return { date, title, content };
      } catch (err) {
        // One broken article must not discard the rest of the day.
        console.error(`Error fetching page ${pageUrl}:`, err.message);
        return null;
      }
    }));

    return data.filter((item) => item !== null);
  } catch (err) {
    console.error(`Error fetching recent news for ${date}:`, err.message);
    return [];
  }
}
|
||||
|
||||
/**
 * CLI entry point: print one day's news as pretty-printed JSON.
 *
 * The day is taken from the first command-line argument; when that is missing
 * or empty, the date part of the current instant's ISO (UTC) timestamp is used
 * with the dashes removed. The crawlers are tried in the order recent, mid,
 * older, and the first non-empty result is printed.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const [, , cliDate] = process.argv;
  const date = cliDate
    ? cliDate
    : new Date().toISOString().slice(0, 10).replace(/-/g, '');

  console.log(`Fetching news for date: ${date}...`);

  // Try recent first; each fallback runs only while the result is still empty.
  const fallbacks = [
    ['No news found via recent crawler, trying mid...', fetchMidNews],
    ['No news found via mid crawler, trying older...', fetchOlderNews],
  ];

  let news = await fetchRecentNews(date);
  for (const [notice, crawl] of fallbacks) {
    if (news.length !== 0) {
      break;
    }
    console.log(notice);
    news = await crawl(date);
  }

  console.log(JSON.stringify(news, null, 2));
}
|
||||
|
||||
if (require.main === module) {
|
||||
main().catch(err => {
|
||||
console.error("Critical error:", err);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
fetchRecentNews,
|
||||
fetchMidNews,
|
||||
fetchOlderNews
|
||||
};
|
||||
Reference in New Issue
Block a user