/**
 * Main script for the daily crawl.
 * Takes the screenshot, fetches the page data and extracts it.
 */
const { chromium } = require('playwright');

const fs = require('fs');
const fsp = require('fs/promises');
const path = require('path');

// Root directory for inputs and outputs; overridable through the BASE_DIR
// environment variable, otherwise the current working directory.
const BASE_DIR = path.resolve(process.env.BASE_DIR || process.cwd());
// Output directory for screenshots.
const PIC_DIR = path.join(BASE_DIR, 'pic');
// Output directory for extracted JSON data.
const DATA_DIR = path.join(BASE_DIR, 'data');

// Page that is crawled on every run.
const TARGET_URL = 'http://bjjs.zjw.beijing.gov.cn/eportal/ui?pageId=307749';
/**
 * Formats a date as `YYYY-MM-DD` using the local time zone.
 *
 * @param {Date} date - The date to format.
 * @returns {string} The local calendar date, month and day zero-padded to two digits.
 * @throws {TypeError} If `date` is not a valid Date; otherwise the result
 *   would be the string "NaN-NaN-NaN".
 */
function formatDate(date) {
  if (!(date instanceof Date) || Number.isNaN(date.getTime())) {
    throw new TypeError('formatDate expects a valid Date');
  }

  const year = date.getFullYear();
  // getMonth() is zero-based, hence the + 1.
  const month = String(date.getMonth() + 1).padStart(2, '0');
  const day = String(date.getDate()).padStart(2, '0');
  return `${year}-${month}-${day}`;
}
/**
 * Returns the date under which a run's output is filed.
 *
 * Reporting convention: a run executed today records the previous day's data,
 * so the result is the local calendar day before `now`.
 *
 * @param {Date} [now] - Reference moment; defaults to the current time.
 * @returns {string} The previous local day as formatted by `formatDate`.
 */
function getDataDate(now = new Date()) {
  // Work on a copy so the caller's Date object is never mutated.
  const date = new Date(now.getTime());
  // setDate() normalises day 0 to the last day of the previous month.
  date.setDate(date.getDate() - 1);
  return formatDate(date);
}
/**
 * Runs one crawl of TARGET_URL.
 *
 * Steps: ensure the output directories exist, open the page in headless
 * Chromium, force every <table> to be displayed, save a full-page screenshot
 * to `pic/<date>.png`, inject `public/define.js` and `public/extract.js`,
 * call the page-level `extractData()` and save the result together with some
 * metadata to `data/<date>.json`.
 *
 * @returns {Promise<void>} Resolves once both output files have been written.
 * @throws Rethrows any error raised while crawling, after logging it and
 *   closing the browser.
 */
async function main() {
  const dataDate = getDataDate();

  console.log('==========================================');
  console.log(`开始爬取(落地日期): ${dataDate}`);
  console.log(`目标URL: ${TARGET_URL}`);
  console.log('==========================================\n');

  const picPath = path.join(PIC_DIR, `${dataDate}.png`);
  const dataPath = path.join(DATA_DIR, `${dataDate}.json`);

  // Make sure the output directories exist. With `recursive: true` an
  // already existing directory is not an error, so no existence check is needed.
  for (const dir of [PIC_DIR, DATA_DIR]) {
    fs.mkdirSync(dir, { recursive: true });
  }

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  try {
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 }
    });

    const page = await context.newPage();

    // Timeouts for actions and navigations (milliseconds).
    page.setDefaultTimeout(60000);
    page.setDefaultNavigationTimeout(60000);

    // Open the target page.
    await page.goto(TARGET_URL, {
      waitUntil: 'networkidle',
      timeout: 60000
    });

    // Wait for the page to finish loading.
    await page.waitForLoadState('domcontentloaded');

    // Extra 3 seconds to let dynamic content finish rendering.
    await page.waitForTimeout(3000);

    // Force every <table> on the page to `display: table` before the screenshot.
    await page.evaluate(() => {
      const tables = document.querySelectorAll('table');
      tables.forEach(table => {
        table.style.display = 'table';
      });
    });

    // Full-page screenshot.
    await page.screenshot({
      path: picPath,
      fullPage: true
    });

    console.log(` ✓ 截图已保存: ${picPath}`);

    // Inject the extraction scripts in order; extract.js is injected after define.js.
    for (const p of ['define.js', 'extract.js']) {
      const injectJsPath = path.join(BASE_DIR, 'public', p);
      const injectJsContent = await fsp.readFile(injectJsPath, 'utf-8');
      await page.addScriptTag({ content: injectJsContent });
    }
    console.log(' ✓ 数据提取脚本已注入');

    // Run the page-level extractData() (not defined in this file; presumably
    // provided by the injected scripts) and bring its result back as JSON text.
    const data = await page.evaluate(() => {
      return JSON.stringify(extractData());
    });
    console.log(' ✓ 数据已提取');

    const content = JSON.stringify({
      date: dataDate,
      timestamp: Date.now(),
      source: TARGET_URL,
      data: JSON.parse(data)
    }, null, 2);

    // Save the result. fs/promises is used instead of the Bun-only `Bun.write`
    // global so the script also runs under plain Node.js.
    await fsp.writeFile(dataPath, content);
    console.log(`\n ✓ 数据已保存: ${dataPath}`);

  } catch (error) {
    console.error(` ✗ 操作失败: ${error.message}`);
    // Rethrow instead of calling process.exit() here: exiting inside `catch`
    // would terminate the process before `finally` runs and skip
    // browser.close(). The caller decides the exit status.
    throw error;
  } finally {
    await browser.close();
  }

  console.log('\n==========================================');
  console.log(`爬取完成(落地日期): ${dataDate}`);
  console.log(`截图文件: ${picPath}`);
  console.log(`数据文件: ${dataPath}`);
  console.log('==========================================');
}
// Script entry point: run the crawl and exit with status 1 if it rejects.
main().catch(err => {
  console.error('执行失败:', err);
  process.exit(1);
});