137 lines
4.0 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
 * Daily crawl main script.
 * Takes a screenshot, fetches the page data, and extracts it.
 */
const { chromium } = require('playwright');
const fs = require('fs');
const fsp = require('fs/promises');
const path = require('path');
// Root directory for the script's inputs and outputs: the BASE_DIR environment
// variable when it is set to a non-empty value, otherwise the current working
// directory. Resolved to an absolute path.
const BASE_DIR = path.resolve(process.env.BASE_DIR || process.cwd());
// Directory that receives the full-page screenshots (<date>.png).
const PIC_DIR = path.join(BASE_DIR, 'pic');
// Directory that receives the extracted data files (<date>.json).
const DATA_DIR = path.join(BASE_DIR, 'data');
// Page that is screenshotted and scraped.
const TARGET_URL = 'http://bjjs.zjw.beijing.gov.cn/eportal/ui?pageId=307749';
/**
 * Formats a Date as `YYYY-MM-DD` using its local-time components.
 * Month and day are zero-padded to two digits; the year is not padded.
 *
 * @param {Date} date - The date to format.
 * @returns {string} The formatted date string.
 */
function formatDate(date) {
  const pad2 = (value) => String(value).padStart(2, '0');
  // getMonth() is zero-based, hence the +1.
  return [date.getFullYear(), pad2(date.getMonth() + 1), pad2(date.getDate())].join('-');
}
/**
 * Returns the date under which this run's output is filed.
 * Reporting convention: a run executed today records the previous day's data,
 * so this is yesterday's local date.
 *
 * @returns {string} Yesterday's date as formatted by formatDate().
 */
function getDataDate() {
  const now = new Date();
  const previousDay = new Date(now.getTime());
  // setDate() rolls over month/year boundaries (e.g. day 0 -> last day of the previous month).
  previousDay.setDate(now.getDate() - 1);
  return formatDate(previousDay);
}
/**
 * Crawl entry point: screenshots TARGET_URL, extracts its data inside the page,
 * and writes both artifacts, named after the data date returned by getDataDate().
 *
 * Steps: ensure PIC_DIR and DATA_DIR exist, launch headless Chromium, load the
 * page, force every <table> to `display: table`, save a full-page screenshot to
 * PIC_DIR, inject `public/define.js` and `public/extract.js` from BASE_DIR, call
 * `extractData()` in the page (presumably defined by the injected scripts), and
 * write the result as pretty-printed JSON to DATA_DIR.
 *
 * If any step inside the crawl fails, the error message is logged, the browser
 * is still closed, `process.exitCode` is set to 1, and the completion summary is
 * skipped. Failures while creating directories, launching the browser, or
 * closing it reject the returned promise.
 *
 * @returns {Promise<void>} Settles after the browser has been closed.
 */
async function main() {
  const dataDate = getDataDate();
  console.log('==========================================');
  console.log(`开始爬取(落地日期): ${dataDate}`);
  console.log(`目标URL: ${TARGET_URL}`);
  console.log('==========================================\n');

  const picPath = path.join(PIC_DIR, `${dataDate}.png`);
  const dataPath = path.join(DATA_DIR, `${dataDate}.json`);

  // Make sure the output directories exist. With `recursive: true`, mkdirSync
  // does not fail when the directory is already there, so no existsSync
  // pre-check is needed.
  for (const dir of [PIC_DIR, DATA_DIR]) {
    fs.mkdirSync(dir, { recursive: true });
  }

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  try {
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 }
    });
    const page = await context.newPage();

    // Apply a 60 s timeout to page operations and navigations.
    page.setDefaultTimeout(60000);
    page.setDefaultNavigationTimeout(60000);

    // Open the target page and wait for the network to go idle.
    await page.goto(TARGET_URL, {
      waitUntil: 'networkidle',
      timeout: 60000
    });
    // Wait for the page to finish loading.
    await page.waitForLoadState('domcontentloaded');
    // Wait an extra 3 seconds so dynamically loaded content can render.
    await page.waitForTimeout(3000);

    // Set the inline display of every <table> to `table` before capturing.
    await page.evaluate(() => {
      const tables = document.querySelectorAll('table');
      tables.forEach(table => {
        table.style.display = 'table';
      });
    });

    // Take the full-page screenshot.
    await page.screenshot({
      path: picPath,
      fullPage: true
    });
    console.log(` ✓ 截图已保存: ${picPath}`);

    // Inject the extraction scripts in order (define.js first, then extract.js).
    for (const p of ['define.js', 'extract.js']) {
      const injectJsPath = path.join(BASE_DIR, 'public', p);
      const injectJsContent = await fsp.readFile(injectJsPath, 'utf-8');
      await page.addScriptTag({ content: injectJsContent });
    }
    console.log(' ✓ 数据提取脚本已注入');

    // Run the extraction in the page; the result comes back as a JSON string.
    const data = await page.evaluate(() => {
      return JSON.stringify(extractData());
    });
    console.log(' ✓ 数据已提取');

    const content = JSON.stringify({
      date: dataDate,
      timestamp: Date.now(),
      source: TARGET_URL,
      data: JSON.parse(data)
    }, null, 2);

    // Save the extracted content. fs/promises is used instead of the Bun-only
    // `Bun.write` global so the script also runs under Node.js.
    await fsp.writeFile(dataPath, content, 'utf-8');
    console.log(`\n ✓ 数据已保存: ${dataPath}`);
  } catch (error) {
    console.error(` ✗ 操作失败: ${error.message}`);
    // Record the failure without calling process.exit() here: exiting
    // synchronously would skip the `finally` block and leave the browser open.
    process.exitCode = 1;
    return;
  } finally {
    await browser.close();
  }

  console.log('\n==========================================');
  console.log(`爬取完成(落地日期): ${dataDate}`);
  console.log(`截图文件: ${picPath}`);
  console.log(`数据文件: ${dataPath}`);
  console.log('==========================================');
}
// Script entry point: run the crawl. If the promise returned by main() rejects,
// log the error and terminate the process with exit status 1.
main().catch(err => {
console.error('执行失败:', err);
process.exit(1);
});