const { parse } = require('node-html-parser');

/**
 * Browser-like headers sent with every article-page request.
 *
 * No `Host` header is set here: article URLs are not guaranteed to live on a
 * single host, and the HTTP client derives the correct `Host` from each URL.
 */
const ARTICLE_REQUEST_HEADERS = Object.freeze({
  'Accept':
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  'Accept-Encoding': 'gzip, deflate',
  'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
  'Cache-Control': 'no-cache',
  'Cookie': 'cna=DLYSGBDthG4CAbRVCNxSxGT6',
  'Pragma': 'no-cache',
  'Proxy-Connection': 'keep-alive',
  'Upgrade-Insecure-Requests': '1',
  'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
});

/**
 * Fetch a URL and return its body as text.
 *
 * @param {string} url - Absolute URL to request.
 * @param {object} [init] - Optional `fetch` init (e.g. `{ headers }`).
 * @returns {Promise<string>} The response body.
 * @throws {Error} When the request fails or the status is not 2xx, so that
 *   error pages are never parsed as if they were real content.
 */
async function fetchText(url, init) {
  const response = await fetch(url, init);
  if (!response.ok) {
    throw new Error(`HTTP ${response.status} ${response.statusText}`.trim());
  }
  return response.text();
}

/**
 * Pull the title and body text out of a parsed article page.
 *
 * @param {object} root - Root node returned by `parse()`.
 * @param {object} [options]
 * @param {boolean} [options.useFallbackSelectors=false] - Also try `.tit` and
 *   `.content_area` when the primary `h3` / `.cnt_bd` selectors yield nothing.
 * @returns {{ title: string, content: string }} Empty strings when not found.
 */
function extractArticle(root, { useFallbackSelectors = false } = {}) {
  let title = root.querySelector('h3')?.text.replace('[视频]', '').trim() || '';
  let content = root.querySelector('.cnt_bd')?.text.replace(/\n/g, ' ').trim() || '';

  if (useFallbackSelectors) {
    if (!title) {
      title = root.querySelector('.tit')?.text.trim() || '';
    }
    if (!content) {
      content = root.querySelector('.content_area')?.text.trim() || '';
    }
  }

  return { title, content };
}

/**
 * Download and parse every article page concurrently.
 *
 * A page that fails (network error, non-2xx status) is logged and dropped;
 * it never aborts the whole batch.
 *
 * @param {string[]} pageUrls - Article URLs; empty entries are ignored.
 * @param {string} date - Date string copied into every result.
 * @param {object} [options] - Forwarded to `extractArticle`.
 * @returns {Promise<Array<{ date: string, title: string, content: string }>>}
 */
async function fetchArticles(pageUrls, date, options) {
  // An empty URL can never be fetched; skip it instead of logging a
  // guaranteed failure for every link that had no usable href.
  const targets = pageUrls.filter(Boolean);

  const results = await Promise.all(
    targets.map(async (pageUrl) => {
      try {
        const html = await fetchText(pageUrl, { headers: ARTICLE_REQUEST_HEADERS });
        return { date, ...extractArticle(parse(html), options) };
      } catch (err) {
        console.error(`Error fetching page ${pageUrl}:`, err.message);
        return null;
      }
    }),
  );

  return results.filter((item) => item !== null);
}

/**
 * Crawl the oldest page layout, where article URLs are embedded in
 * `title_array_01(...)` script calls on the daily index page.
 *
 * @param {string} date - Date as used in the index URL (e.g. `20100101`).
 * @returns {Promise<Array<{ date: string, title: string, content: string }>>}
 *   Parsed articles; an empty array when the index page cannot be fetched.
 */
async function fetchOlderNews(date) {
  const url = `https://cctv.cntv.cn/lm/xinwenlianbo/${date}.shtml`;
  try {
    const text = await fetchText(url);
    const rawList = text.match(/title_array_01\((.*)/g) || [];
    // The first match is skipped, as the crawler has always done
    // (presumably it is not a regular news item — TODO confirm).
    const pageUrls = rawList
      .slice(1)
      .map((item) => item.match(/(http.*)/)?.[0].split('\'')[0] || '');
    return await fetchArticles(pageUrls, date);
  } catch (err) {
    console.error(`Error fetching older news for ${date}:`, err.message);
    return [];
  }
}

/**
 * Crawl the intermediate page layout, where article links sit in the
 * `#contentELMT1368521805488378` list on the daily index page.
 *
 * @param {string} date - Date as used in the index URL (e.g. `20130601`).
 * @returns {Promise<Array<{ date: string, title: string, content: string }>>}
 *   Parsed articles; an empty array when the index page cannot be fetched.
 */
async function fetchMidNews(date) {
  const url = `https://cctv.cntv.cn/lm/xinwenlianbo/${date}.shtml`;
  try {
    const soup = parse(await fetchText(url));
    // First link is skipped, matching the other crawlers.
    const pageUrls = soup
      .querySelectorAll('#contentELMT1368521805488378 li a')
      .slice(1)
      .map((a) => a.getAttribute('href') || '');
    return await fetchArticles(pageUrls, date);
  } catch (err) {
    console.error(`Error fetching mid news for ${date}:`, err.message);
    return [];
  }
}

/**
 * Crawl the current page layout on tv.cctv.com.
 *
 * Article pages in this layout may use `.tit` / `.content_area` instead of
 * `h3` / `.cnt_bd`, so the fallback selectors are enabled.
 *
 * @param {string} date - Date as used in the index URL (e.g. `20240101`).
 * @returns {Promise<Array<{ date: string, title: string, content: string }>>}
 *   Parsed articles; an empty array when the index page cannot be fetched.
 */
async function fetchRecentNews(date) {
  const url = `https://tv.cctv.com/lm/xwlb/day/${date}.shtml`;
  try {
    const soup = parse(await fetchText(url));
    // First link is skipped, as the crawler has always done
    // (presumably it is not a regular news item — TODO confirm).
    const pageUrls = soup
      .querySelectorAll('li a')
      .slice(1)
      .map((a) => a.getAttribute('href') || '');
    return await fetchArticles(pageUrls, date, { useFallbackSelectors: true });
  } catch (err) {
    console.error(`Error fetching recent news for ${date}:`, err.message);
    return [];
  }
}

/**
 * CLI entry point: fetch the news for the date given as the first argument
 * (defaulting to today) and print it as pretty-printed JSON.
 *
 * Crawlers are tried from newest to oldest layout until one returns results.
 *
 * @returns {Promise<void>}
 */
async function main() {
  let date = process.argv[2];
  if (!date) {
    // NOTE(review): toISOString() is UTC, so around midnight this may differ
    // from the local calendar date — confirm this is the intended default.
    const today = new Date();
    date = today.toISOString().slice(0, 10).replace(/-/g, '');
  }

  console.log(`Fetching news for date: ${date}...`);

  // Try recent first, as per original logic
  let news = await fetchRecentNews(date);

  if (news.length === 0) {
    console.log("No news found via recent crawler, trying mid...");
    news = await fetchMidNews(date);
  }

  if (news.length === 0) {
    console.log("No news found via mid crawler, trying older...");
    news = await fetchOlderNews(date);
  }

  console.log(JSON.stringify(news, null, 2));
}

if (require.main === module) {
  main().catch((err) => {
    console.error("Critical error:", err);
    process.exit(1);
  });
}

module.exports = { fetchRecentNews, fetchMidNews, fetchOlderNews };