diff --git a/.env b/.env index cadad16..15b7127 100644 --- a/.env +++ b/.env @@ -1,2 +1,2 @@ -BASE_DIR=/app/houseDream -PORT=8080 \ No newline at end of file +BASE_DIR=/Users/liyanyan/study/house-data-collect +PORT=8888 \ No newline at end of file diff --git a/README.md b/README.md index 2a0dc6e..bc18465 100644 --- a/README.md +++ b/README.md @@ -57,27 +57,38 @@ ### 依赖安装 ```bash cd /app/houseDream -npm install +bun install ``` ### 手动运行 ```bash # 启动 Web 服务器 -npm run server +bun run start + +# 开发模式(文件变更自动重启) +bun run dev # 执行完整爬取(截图+数据提取) -npm run daily +bun run daily # 仅截图 -npm run screenshot +bun run screenshot +``` + +### Playwright 浏览器安装 + +首次安装依赖后请执行: + +```bash +bunx playwright install chromium ``` ### PM2 管理(推荐) ```bash # 启动服务 -pm2 start server.js --name houseDream +pm2 start "bun run start" --name houseDream # 查看状态 pm2 list @@ -109,7 +120,7 @@ pm2 startup ## 技术栈 - **爬虫**: Playwright (Chromium) -- **后端**: Node.js + 原生 HTTP 模块 +- **后端**: Bun + Bun.serve - **前端**: HTML5 + CSS3 + Vanilla JavaScript - **进程管理**: PM2 @@ -117,13 +128,15 @@ pm2 startup | 变量 | 默认值 | 说明 | |------|--------|------| +| `BASE_DIR` | 当前工作目录 | 项目根目录(用于定位 data/pic/web/public) | +| `HOST` | 127.0.0.1 | 服务器监听地址 | | `PORT` | 8080 | 服务器监听端口 | ## 注意事项 1. 数据文件永久保留,不会自动清理 2. 页面使用 Playwright 获取完整渲染后的内容 -3. 服务器默认绑定到 localhost,如需外网访问请修改 server.js 中的监听地址 +3. 默认监听地址为 127.0.0.1,可通过环境变量 HOST 修改 ## 许可证 diff --git a/bun.lock b/bun.lock new file mode 100644 index 0000000..55473d7 --- /dev/null +++ b/bun.lock @@ -0,0 +1,19 @@ +{ + "lockfileVersion": 1, + "configVersion": 0, + "workspaces": { + "": { + "name": "housedream", + "dependencies": { + "playwright": "^1.40.0", + }, + }, + }, + "packages": { + "fsevents": ["fsevents@2.3.2", "http://mirrors.tencentyun.com/npm/fsevents/-/fsevents-2.3.2.tgz", { "os": "darwin" }, "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA=="], + + "playwright": ["playwright@1.58.2", "http://mirrors.tencentyun.com/npm/playwright/-/playwright-1.58.2.tgz", { "dependencies": { "playwright-core": "1.58.2" }, "optionalDependencies": { "fsevents": "2.3.2" }, "bin": "cli.js" }, "sha512-vA30H8Nvkq/cPBnNw4Q8TWz1EJyqgpuinBcHET0YVJVFldr8JDNiU9LaWAE1KqSkRYazuaBhTpB5ZzShOezQ6A=="], + + "playwright-core": ["playwright-core@1.58.2", "http://mirrors.tencentyun.com/npm/playwright-core/-/playwright-core-1.58.2.tgz", { "bin": "cli.js" }, "sha512-yZkEtftgwS8CsfYo7nm0KE8jsvm6i/PTgVtB8DL726wNf6H2IMsDuxCpJj59KDaxCtSnrWan2AeDqM7JBaultg=="], + } +} diff --git a/cron_daily.sh b/cron_daily.sh new file mode 100755 index 0000000..2302180 --- /dev/null +++ b/cron_daily.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_DIR="$(cd "$(dirname "$0")" && pwd)" +cd "$PROJECT_DIR" + +if ! command -v bun >/dev/null 2>&1; then + echo "[ERROR] bun 未安装或不在 PATH 中" + exit 1 +fi + +if ! command -v git >/dev/null 2>&1; then + echo "[ERROR] git 未安装或不在 PATH 中" + exit 1 +fi + +if [[ ! -d .git ]]; then + echo "[ERROR] 当前目录不是 Git 仓库: $PROJECT_DIR" + exit 1 +fi + +echo "[INFO] 项目目录: $PROJECT_DIR" +echo "[INFO] 开始执行 daily 采集..." +bun run daily + +echo "[INFO] 添加产物到暂存区..." +git add data pic + +if git diff --cached --quiet; then + echo "[INFO] 没有新增变更,跳过提交和推送" + exit 0 +fi + +COMMIT_MSG="chore: daily data update $(date +%F)" +echo "[INFO] 提交变更: $COMMIT_MSG" +git commit -m "$COMMIT_MSG" + +echo "[INFO] 推送到远端..." +git push + +echo "[INFO] 执行完成" diff --git a/package-lock.json b/package-lock.json deleted file mode 100644 index 37ce143..0000000 --- a/package-lock.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "name": "housedream", - "version": "1.0.0", - "lockfileVersion": 3, - "requires": true, - "packages": { - "": { - "name": "housedream", - "version": "1.0.0", - "license": "ISC", - "dependencies": { - "playwright": "^1.58.2" - } - }, - "node_modules/fsevents": { - "version": "2.3.2", - "resolved": "http://mirrors.tencentyun.com/npm/fsevents/-/fsevents-2.3.2.tgz", - "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", - "hasInstallScript": true, - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": "^8.16.0 || ^10.6.0 || >=11.0.0" - } - }, - "node_modules/playwright": { - "version": "1.58.2", - "resolved": "http://mirrors.tencentyun.com/npm/playwright/-/playwright-1.58.2.tgz", - "integrity": "sha512-vA30H8Nvkq/cPBnNw4Q8TWz1EJyqgpuinBcHET0YVJVFldr8JDNiU9LaWAE1KqSkRYazuaBhTpB5ZzShOezQ6A==", - "license": "Apache-2.0", - "dependencies": { - "playwright-core": "1.58.2" - }, - "bin": { - "playwright": "cli.js" - }, - "engines": { - "node": ">=18" - }, - "optionalDependencies": { - "fsevents": "2.3.2" - } - }, - "node_modules/playwright-core": { - "version": "1.58.2", - "resolved": "http://mirrors.tencentyun.com/npm/playwright-core/-/playwright-core-1.58.2.tgz", - "integrity": "sha512-yZkEtftgwS8CsfYo7nm0KE8jsvm6i/PTgVtB8DL726wNf6H2IMsDuxCpJj59KDaxCtSnrWan2AeDqM7JBaultg==", - "license": "Apache-2.0", - "bin": { - "playwright-core": "cli.js" - }, - "engines": { - "node": ">=18" - } - } - } -} diff --git a/package.json b/package.json index 58c4ff5..1070c3f 100644 --- a/package.json +++ b/package.json @@ -3,10 +3,16 @@ "version": "1.0.0", "description": "北京市房地产数据监控系统 - 自动爬取、提取、可视化展示", "main": "server.js", + "packageManager": "bun@1.2.13", + "engines": { + "bun": ">=1.1.0" + }, "scripts": { - "server": "node server.js", - "screenshot": "node scripts/screenshot.js", - "daily": "node scripts/daily.js" + "dev": "bun --watch server.js", + "start": "bun server.js", + "server": "bun run start", + "screenshot": "bun scripts/screenshot.js", + "daily": "bun scripts/daily.js" }, "keywords": [ "房地产", @@ -18,7 +24,6 @@ "author": "", "license": "ISC", "dependencies": { - "dotenv": "^17.3.1", "playwright": "^1.40.0" } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml deleted file mode 100644 index 5642a64..0000000 --- a/pnpm-lock.yaml +++ /dev/null @@ -1,52 +0,0 @@ -lockfileVersion: '9.0' - -settings: - autoInstallPeers: true - excludeLinksFromLockfile: false - -importers: - - .: - dependencies: - dotenv: - specifier: ^17.3.1 - version: 17.3.1 - playwright: - specifier: ^1.40.0 - version: 1.58.2 - -packages: - - dotenv@17.3.1: - resolution: {integrity: sha512-IO8C/dzEb6O3F9/twg6ZLXz164a2fhTnEWb95H23Dm4OuN+92NmEAlTrupP9VW6Jm3sO26tQlqyvyi4CsnY9GA==} - engines: {node: '>=12'} - - fsevents@2.3.2: - resolution: {integrity: sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==} - engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} - os: [darwin] - - playwright-core@1.58.2: - resolution: {integrity: sha512-yZkEtftgwS8CsfYo7nm0KE8jsvm6i/PTgVtB8DL726wNf6H2IMsDuxCpJj59KDaxCtSnrWan2AeDqM7JBaultg==} - engines: {node: '>=18'} - hasBin: true - - playwright@1.58.2: - resolution: {integrity: sha512-vA30H8Nvkq/cPBnNw4Q8TWz1EJyqgpuinBcHET0YVJVFldr8JDNiU9LaWAE1KqSkRYazuaBhTpB5ZzShOezQ6A==} - engines: {node: '>=18'} - hasBin: true - -snapshots: - - dotenv@17.3.1: {} - - fsevents@2.3.2: - optional: true - - playwright-core@1.58.2: {} - - playwright@1.58.2: - dependencies: - playwright-core: 1.58.2 - optionalDependencies: - fsevents: 2.3.2 diff --git a/scripts/daily.js b/scripts/daily.js index a464a90..461a88d 100644 --- a/scripts/daily.js +++ b/scripts/daily.js @@ -5,12 +5,10 @@ const { chromium } = require('playwright'); const fs = require('fs'); +const fsp = require('fs/promises'); const path = require('path'); -const dotenv = require('dotenv'); -dotenv.config({path: ['.env.local', '.env'], quiet: true}); - -const BASE_DIR = process.env.BASE_DIR; +const BASE_DIR = path.resolve(process.env.BASE_DIR || process.cwd()); const PIC_DIR = path.join(BASE_DIR, 'pic'); const DATA_DIR = path.join(BASE_DIR, 'data'); @@ -77,9 +75,9 @@ async function main() { console.log(` ✓ 截图已保存: ${picPath}`); - for (let p of ['define.js', 'extract.js']) { + for (const p of ['define.js', 'extract.js']) { const injectJsPath = path.join(BASE_DIR, 'public', p); - const injectJsContent = fs.readFileSync(injectJsPath, 'utf-8'); + const injectJsContent = await fsp.readFile(injectJsPath, 'utf-8'); await page.addScriptTag({ content: injectJsContent }); } console.log(' ✓ 数据提取脚本已注入'); @@ -98,7 +96,7 @@ async function main() { }, null, 2); // 保存原始内容 - fs.writeFileSync(dataPath, content, 'utf-8'); + await Bun.write(dataPath, content); console.log(`\n ✓ 数据已保存: ${dataPath}`); } catch (error) { diff --git a/scripts/screenshot.js b/scripts/screenshot.js index 75f31c9..a56e72f 100644 --- a/scripts/screenshot.js +++ b/scripts/screenshot.js @@ -5,11 +5,8 @@ const { chromium } = require('playwright'); const fs = require('fs'); const path = require('path'); -const dotenv = require('dotenv'); -dotenv.config({path: ['.env.local', '.env'], quiet: true}); - -const BASE_DIR = process.env.BASE_DIR; +const BASE_DIR = path.resolve(process.env.BASE_DIR || process.cwd()); const PIC_DIR = path.join(BASE_DIR, 'pic'); const TARGET_URL = 'http://bjjs.zjw.beijing.gov.cn/eportal/ui?pageId=307749'; diff --git a/server.js b/server.js index a95096a..14acf01 100644 --- a/server.js +++ b/server.js @@ -1,173 +1,154 @@ /** - * Web服务器脚本 + * Bun Web服务器脚本 * 提供静态文件服务和API接口 */ -const http = require('http'); const fs = require('fs'); const path = require('path'); -const url = require('url'); -const dotenv = require('dotenv'); -dotenv.config({path: ['.env.local', '.env'], quiet: true}); - -console.log('环境变量:', process.env.BASE_DIR); - -const BASE_DIR = process.env.BASE_DIR; +const BASE_DIR = path.resolve(process.env.BASE_DIR || process.cwd()); const WEB_DIR = path.join(BASE_DIR, 'web'); const DATA_DIR = path.join(BASE_DIR, 'data'); const PIC_DIR = path.join(BASE_DIR, 'pic'); +const PUBLIC_DIR = path.join(BASE_DIR, 'public'); -const PORT = process.env.PORT || 8080; +const PORT = Number(process.env.PORT || 8080); +const HOST = process.env.HOST || '127.0.0.1'; -// MIME类型映射 -const mimeTypes = { - '.html': 'text/html', - '.css': 'text/css', - '.js': 'application/javascript', - '.json': 'application/json', - '.png': 'image/png', - '.jpg': 'image/jpeg', - '.gif': 'image/gif', - '.ico': 'image/x-icon' +const CORS_HEADERS = { + 'Access-Control-Allow-Origin': '*', + 'Access-Control-Allow-Methods': 'GET, POST, OPTIONS', + 'Access-Control-Allow-Headers': 'Content-Type' }; -// 获取MIME类型 -function getMimeType(filePath) { - const ext = path.extname(filePath).toLowerCase(); - return mimeTypes[ext] || 'application/octet-stream'; -} - -// 读取文件 -function readFile(filePath) { - return new Promise((resolve, reject) => { - fs.readFile(filePath, (err, data) => { - if (err) reject(err); - else resolve(data); - }); +function json(data, status = 200) { + return Response.json(data, { + status, + headers: CORS_HEADERS }); } -// 列出可用日期 +function text(body, status = 200) { + return new Response(body, { + status, + headers: CORS_HEADERS + }); +} + +function resolveSafePath(baseDir, requestPath) { + const relativePath = requestPath.replace(/^\/+/, ''); + const resolvedPath = path.resolve(baseDir, relativePath); + + if (resolvedPath === baseDir || resolvedPath.startsWith(`${baseDir}${path.sep}`)) { + return resolvedPath; + } + + return null; +} + function listAvailableDates() { try { const files = fs.readdirSync(DATA_DIR); return files - .filter(f => f.endsWith('.json') && !f.includes('_raw') && !f.includes('test')) - .map(f => f.replace('.json', '')) + .filter((file) => file.endsWith('.json') && !file.includes('_raw') && !file.includes('test')) + .map((file) => file.replace('.json', '')) .sort() .reverse(); - } catch (err) { + } catch { return []; } } -// 创建服务器 -const server = http.createServer(async (req, res) => { - const parsedUrl = url.parse(req.url, true); - let pathname = parsedUrl.pathname; - - // 设置CORS头 - res.setHeader('Access-Control-Allow-Origin', '*'); - res.setHeader('Access-Control-Allow-Methods', 'GET, POST, OPTIONS'); - res.setHeader('Access-Control-Allow-Headers', 'Content-Type'); - - if (req.method === 'OPTIONS') { - res.writeHead(200); - res.end(); - return; +async function serveFile(filePath) { + const file = Bun.file(filePath); + + if (!(await file.exists())) { + return text('Not Found', 404); } - - try { - // API: 列出可用日期 - if (pathname === '/api/dates') { - const dates = listAvailableDates(); - res.writeHead(200, { 'Content-Type': 'application/json' }); - res.end(JSON.stringify({ dates })); - return; + + return new Response(file, { + status: 200, + headers: CORS_HEADERS + }); +} + +const server = Bun.serve({ + hostname: HOST, + port: PORT, + async fetch(req) { + if (req.method === 'OPTIONS') { + return new Response(null, { + status: 204, + headers: CORS_HEADERS + }); } - - // API: 获取指定日期数据 - if (pathname.startsWith('/api/data/')) { - const date = pathname.replace('/api/data/', ''); - const filePath = path.join(DATA_DIR, `${date}.json`); - - if (fs.existsSync(filePath)) { - const data = await readFile(filePath); - res.writeHead(200, { 'Content-Type': 'application/json' }); - res.end(data); - } else { - res.writeHead(404, { 'Content-Type': 'application/json' }); - res.end(JSON.stringify({ error: '数据不存在' })); + + try { + const { pathname } = new URL(req.url); + + if (pathname === '/api/dates') { + return json({ dates: listAvailableDates() }); } - return; - } - - // 静态文件服务 - if (pathname === '/') { - pathname = '/index.html'; - } - - // 处理 data 和 pic 路径 - let filePath; - if (pathname.startsWith('/data/')) { - filePath = path.join(DATA_DIR, pathname.replace('/data/', '')); - } else if (pathname.startsWith('/pic/')) { - filePath = path.join(PIC_DIR, pathname.replace('/pic/', '')); - } else if (pathname.startsWith('/public/')) { - filePath = path.join(BASE_DIR, pathname); - } else { - filePath = path.join(WEB_DIR, pathname); - } - - // 安全检查:防止目录遍历 - if (!filePath.startsWith(BASE_DIR)) { - res.writeHead(403, { 'Content-Type': 'text/plain' }); - res.end('Forbidden'); - return; - } - - const data = await readFile(filePath); - const mimeType = getMimeType(filePath); - - res.writeHead(200, { 'Content-Type': mimeType }); - res.end(data); - - } catch (err) { - console.error('文件读取错误:', err); - if (err.code === 'ENOENT') { - res.writeHead(404, { 'Content-Type': 'text/plain' }); - res.end('Not Found'); - } else { - console.error('服务器错误:', err); - res.writeHead(500, { 'Content-Type': 'text/plain' }); - res.end('Internal Server Error'); + + if (pathname.startsWith('/api/data/')) { + const date = pathname.replace('/api/data/', ''); + const filePath = resolveSafePath(DATA_DIR, `${date}.json`); + + if (!filePath) { + return json({ error: '非法路径' }, 403); + } + + const file = Bun.file(filePath); + if (!(await file.exists())) { + return json({ error: '数据不存在' }, 404); + } + + return new Response(file, { + status: 200, + headers: { + ...CORS_HEADERS, + 'Content-Type': 'application/json; charset=utf-8' + } + }); + } + + let filePath; + if (pathname === '/') { + filePath = path.join(WEB_DIR, 'index.html'); + } else if (pathname.startsWith('/data/')) { + filePath = resolveSafePath(DATA_DIR, pathname.slice('/data/'.length)); + } else if (pathname.startsWith('/pic/')) { + filePath = resolveSafePath(PIC_DIR, pathname.slice('/pic/'.length)); + } else if (pathname.startsWith('/public/')) { + filePath = resolveSafePath(PUBLIC_DIR, pathname.slice('/public/'.length)); + } else { + filePath = resolveSafePath(WEB_DIR, pathname); + } + + if (!filePath) { + return text('Forbidden', 403); + } + + return serveFile(filePath); + } catch (error) { + console.error('服务器错误:', error); + return text('Internal Server Error', 500); } } }); -// 启动服务器 -server.listen(PORT, 'localhost', () => { - console.log('=========================================='); - console.log('北京市房地产数据监控服务器已启动'); - console.log(`访问地址: http://localhost:${PORT}`); - console.log('按 Ctrl+C 停止服务器'); - console.log('=========================================='); -}); +console.log('=========================================='); +console.log('北京市房地产数据监控服务器已启动'); +console.log(`访问地址: http://${HOST}:${PORT}`); +console.log('按 Ctrl+C 停止服务器'); +console.log('=========================================='); -// 优雅关闭 -process.on('SIGTERM', () => { - console.log('\nSIGTERM\n正在关闭服务器...'); - server.close(() => { - console.log('服务器已关闭'); - process.exit(0); - }); -}); +function shutdown(signal) { + console.log(`\n${signal}\n正在关闭服务器...`); + server.stop(true); + console.log('服务器已关闭'); + process.exit(0); +} -process.on('SIGINT', () => { - console.log('\nSIGINT\n正在关闭服务器...'); - server.close(() => { - console.log('服务器已关闭'); - process.exit(0); - }); -}); +process.on('SIGTERM', () => shutdown('SIGTERM')); +process.on('SIGINT', () => shutdown('SIGINT')); diff --git a/startup.sh b/startup.sh index b7bde30..4d06487 100755 --- a/startup.sh +++ b/startup.sh @@ -2,4 +2,4 @@ # 启动 houseDream # cd /app/houseDream -pnpm run server \ No newline at end of file +bun run start