// 2026-04-07 15:43:13 +08:00
//
// 120 lines
// 3.4 KiB
// JavaScript

/**
* 每日爬取主脚本
* 执行截图 + 数据获取 + 数据提取
*/
const { chromium } = require('playwright');
const fs = require('fs');
const fsp = require('fs/promises');
const path = require('path');
// Root directory of the crawl: the BASE_DIR environment variable when it is
// set to a non-empty value, otherwise the current working directory.
const BASE_DIR = path.resolve(process.env.BASE_DIR || process.cwd());
// Output folders under BASE_DIR: pic/ receives the screenshots and data/ the
// extracted JSON files.
const [PIC_DIR, DATA_DIR] = ['pic', 'data'].map((dirName) => path.join(BASE_DIR, dirName));
// Page that is crawled.
const TARGET_URL = 'http://bjjs.zjw.beijing.gov.cn/eportal/ui?pageId=307749';
/**
 * Returns today's date as a `YYYY-MM-DD` string in the process's local time
 * zone.
 *
 * The date is assembled from local-time components rather than taken from
 * `toISOString()`, which reports the UTC calendar date and therefore yields
 * the previous (or next) day whenever local time and UTC fall on different
 * dates.
 *
 * @returns {string} The local calendar date, zero-padded, e.g. `2024-01-05`.
 */
function getToday() {
  const now = new Date();
  const year = String(now.getFullYear()).padStart(4, '0');
  const month = String(now.getMonth() + 1).padStart(2, '0'); // getMonth() is 0-based
  const day = String(now.getDate()).padStart(2, '0');
  return `${year}-${month}-${day}`;
}
/**
 * Runs one crawl of TARGET_URL.
 *
 * Steps:
 *   1. Ensure PIC_DIR and DATA_DIR exist.
 *   2. Open the page in headless Chromium and save a full-page screenshot to
 *      PIC_DIR/<date>.png.
 *   3. Inject public/define.js and public/extract.js (relative to BASE_DIR)
 *      into the page and call the page-side `extractData()`.
 *   4. Write the result, wrapped with date/timestamp/source metadata, to
 *      DATA_DIR/<date>.json.
 *
 * A failure inside the browser session is logged and reported through
 * `process.exitCode = 1`; the browser is closed in every case. Errors raised
 * before the session starts (directory creation, browser launch) or while
 * closing the browser reject the returned promise.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const today = getToday();
  console.log('==========================================');
  console.log(`开始爬取: ${today}`);
  console.log(`目标URL: ${TARGET_URL}`);
  console.log('==========================================\n');

  const picPath = path.join(PIC_DIR, `${today}.png`);
  const dataPath = path.join(DATA_DIR, `${today}.json`);

  // Make sure the output directories exist. A recursive mkdir succeeds when
  // the directory is already there, so no separate existence check is needed.
  await fsp.mkdir(PIC_DIR, { recursive: true });
  await fsp.mkdir(DATA_DIR, { recursive: true });

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  try {
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 }
    });
    const page = await context.newPage();

    // Default timeouts (ms) for page actions and navigations.
    page.setDefaultTimeout(60000);
    page.setDefaultNavigationTimeout(60000);

    // Open the target page and wait until the network has gone idle.
    await page.goto(TARGET_URL, {
      waitUntil: 'networkidle',
      timeout: 60000
    });

    // Wait for the DOM to be ready.
    await page.waitForLoadState('domcontentloaded');

    // Extra 3 s grace period for dynamically rendered content.
    await page.waitForTimeout(3000);

    // Full-page screenshot.
    await page.screenshot({
      path: picPath,
      fullPage: true
    });
    console.log(` ✓ 截图已保存: ${picPath}`);

    // Inject the helper scripts in order: define.js first, then extract.js.
    for (const scriptName of ['define.js', 'extract.js']) {
      const scriptPath = path.join(BASE_DIR, 'public', scriptName);
      const scriptSource = await fsp.readFile(scriptPath, 'utf-8');
      await page.addScriptTag({ content: scriptSource });
    }
    console.log(' ✓ 数据提取脚本已注入');

    // Run the extraction inside the page and bring the result back as a JSON
    // string. extractData() is not defined in this file; presumably it is
    // provided by the scripts injected above.
    const data = await page.evaluate(() => {
      return JSON.stringify(extractData());
    });
    console.log(' ✓ 数据已提取');

    const content = JSON.stringify({
      date: today,
      timestamp: Date.now(),
      source: TARGET_URL,
      data: JSON.parse(data)
    }, null, 2);

    // Save the JSON document. fs/promises is used instead of the Bun-only
    // global `Bun.write` so the script runs under Node.js as well as Bun.
    await fsp.writeFile(dataPath, content, 'utf-8');
    console.log(`\n ✓ 数据已保存: ${dataPath}`);
  } catch (error) {
    console.error(` ✗ 操作失败: ${error.message}`);
    // Do not call process.exit() here: it terminates the process immediately
    // and would skip the `finally` block, leaving the browser unclosed.
    // Setting exitCode keeps the failure status while allowing cleanup.
    process.exitCode = 1;
    return;
  } finally {
    await browser.close();
  }

  console.log('\n==========================================');
  console.log(`爬取完成: ${today}`);
  console.log(`截图文件: ${picPath}`);
  console.log(`数据文件: ${dataPath}`);
  console.log('==========================================');
}
// Entry point: start the crawl. Any rejection that escapes main() is treated
// as fatal: it is logged and the process exits with status 1.
const reportFatal = (failure) => {
  console.error('执行失败:', failure);
  process.exit(1);
};
main().catch(reportFatal);