129 lines
3.8 KiB
JavaScript
129 lines
3.8 KiB
JavaScript
/**
 * 每日爬取主脚本
 * 执行截图 + 数据获取 + 数据提取
 */
|
|
const { chromium } = require('playwright');
|
|
|
|
const fs = require('fs');
|
|
const fsp = require('fs/promises');
|
|
const path = require('path');
|
|
|
|
// Root directory for inputs and outputs: the BASE_DIR environment variable when it is
// set to a non-empty value, otherwise the current working directory. Always absolute.
const BASE_DIR = path.resolve(process.env.BASE_DIR || process.cwd());

// Output directories for screenshots (`pic`) and extracted JSON (`data`), both under BASE_DIR.
const [PIC_DIR, DATA_DIR] = ['pic', 'data'].map((dirName) => path.join(BASE_DIR, dirName));

// Page that is screenshotted and scraped.
const TARGET_URL = 'http://bjjs.zjw.beijing.gov.cn/eportal/ui?pageId=307749';
|
|
|
|
/**
 * Format a date as `YYYY-MM-DD` using the local-time calendar fields.
 *
 * @param {Date} date - The date to format.
 * @returns {string} The date with month and day zero-padded to two digits.
 * @throws {RangeError} If `date` is an invalid Date (its time value is NaN).
 */
function formatDate(date) {
  // An invalid Date would otherwise format as "NaN-NaN-NaN", and that string is
  // used to build output file names; fail loudly instead.
  if (Number.isNaN(date.getTime())) {
    throw new RangeError('formatDate: invalid Date');
  }

  const year = date.getFullYear();
  // getMonth() is zero-based, so add 1 to get the calendar month.
  const month = String(date.getMonth() + 1).padStart(2, '0');
  const day = String(date.getDate()).padStart(2, '0');
  return `${year}-${month}-${day}`;
}
|
|
|
|
/**
 * Compute the date under which a run's results are stored.
 *
 * Reporting rule: a run executed on a given day records the data of the
 * previous day, so the result is one calendar day (local time) before `now`.
 *
 * @param {Date} [now=new Date()] - Reference moment; defaults to the current time.
 * @returns {string} The previous day's date formatted by `formatDate`.
 */
function getDataDate(now = new Date()) {
  // Work on a copy so the caller's Date object is never mutated.
  const previousDay = new Date(now.getTime());
  // setDate() with an out-of-range value rolls over month and year boundaries.
  previousDay.setDate(previousDay.getDate() - 1);
  return formatDate(previousDay);
}
|
|
|
|
/**
 * Run one crawl: open TARGET_URL in headless Chromium, save a full-page
 * screenshot to PIC_DIR, inject the extraction scripts from `public/`, and
 * write the extracted data as JSON to DATA_DIR. Both output files are named
 * after the date returned by `getDataDate()`.
 *
 * Failures inside the browser session are logged and reported through
 * `process.exitCode = 1`; the browser is closed in every case.
 *
 * @returns {Promise<void>} Resolves when the run has finished (successfully or not).
 */
async function main() {
  const TIMEOUT_MS = 60000;
  const dataDate = getDataDate();

  console.log('==========================================');
  console.log(`开始爬取(落地日期): ${dataDate}`);
  console.log(`目标URL: ${TARGET_URL}`);
  console.log('==========================================\n');

  const picPath = path.join(PIC_DIR, `${dataDate}.png`);
  const dataPath = path.join(DATA_DIR, `${dataDate}.json`);

  // Make sure the output directories exist. With `recursive: true`, mkdirSync
  // does not fail for a directory that already exists, so no prior check is needed.
  for (const outputDir of [PIC_DIR, DATA_DIR]) {
    fs.mkdirSync(outputDir, { recursive: true });
  }

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 },
    });
    const page = await context.newPage();

    // Apply the same timeout to actions and navigations.
    page.setDefaultTimeout(TIMEOUT_MS);
    page.setDefaultNavigationTimeout(TIMEOUT_MS);

    // Open the target page and wait until the network goes idle.
    await page.goto(TARGET_URL, {
      waitUntil: 'networkidle',
      timeout: TIMEOUT_MS,
    });

    // Wait for the page to finish loading.
    await page.waitForLoadState('domcontentloaded');

    // Extra 3-second pause so dynamically loaded content can finish rendering.
    await page.waitForTimeout(3000);

    // Full-page screenshot.
    await page.screenshot({
      path: picPath,
      fullPage: true,
    });
    console.log(` ✓ 截图已保存: ${picPath}`);

    // Inject the helper scripts in order; they are read from BASE_DIR/public.
    for (const scriptName of ['define.js', 'extract.js']) {
      const scriptPath = path.join(BASE_DIR, 'public', scriptName);
      const scriptSource = await fsp.readFile(scriptPath, 'utf-8');
      await page.addScriptTag({ content: scriptSource });
    }
    console.log(' ✓ 数据提取脚本已注入');

    // Run the extraction inside the page. `extractData` is not defined in this
    // file; presumably it is provided by the injected scripts (verify against
    // public/define.js and public/extract.js). The result crosses the page
    // boundary as a JSON string.
    const serializedData = await page.evaluate(() => {
      return JSON.stringify(extractData());
    });
    console.log(' ✓ 数据已提取');

    const content = JSON.stringify({
      date: dataDate,
      timestamp: Date.now(),
      source: TARGET_URL,
      data: JSON.parse(serializedData),
    }, null, 2);

    // Save the result. Written through fs/promises (already imported by this
    // file) so the script does not depend on the Bun-only global `Bun`.
    await fsp.writeFile(dataPath, content);
    console.log(`\n ✓ 数据已保存: ${dataPath}`);
  } catch (error) {
    console.error(` ✗ 操作失败: ${error.message}`);
    // Do not call process.exit() here: it would end the process before the
    // `finally` block below gets to close the browser. Record the failure in
    // the exit code and return, which also skips the completion summary.
    process.exitCode = 1;
    return;
  } finally {
    await browser.close();
  }

  console.log('\n==========================================');
  console.log(`爬取完成(落地日期): ${dataDate}`);
  console.log(`截图文件: ${picPath}`);
  console.log(`数据文件: ${dataPath}`);
  console.log('==========================================');
}
|
|
|
|
// Start the crawl. Any rejection that escapes main() is logged and the process
// exits with status 1.
main().catch((fatalError) => {
  console.error('执行失败:', fatalError);
  process.exit(1);
});
|