From 20900f008f1acd4044b6f58969a03a7516af33e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=B2=A9=E5=B2=A9?= Date: Wed, 27 May 2026 13:42:17 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E4=B8=BB=E7=89=88=E6=9C=AC=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E4=BB=98=E6=AC=BE=E4=BA=BA=E7=BB=B4=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 + README.md | 131 ++++++++-------- index_fk.go | 418 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 479 insertions(+), 73 deletions(-) create mode 100644 index_fk.go diff --git a/.gitignore b/.gitignore index c62241d..5089621 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ source 执行结果记录_**.txt *.log *.exe +dist +.idea +.DS_Store \ No newline at end of file diff --git a/README.md b/README.md index 1e13515..32b0a78 100644 --- a/README.md +++ b/README.md @@ -2,97 +2,81 @@ 基于 Go 的 PDF 回执拆分与分类工具。 -功能说明: +## 1. 功能概览 + - 扫描输入目录中的 PDF 文件 -- 将回执按页面内多条回执进行拆分并导出图片 -- 从文本中提取“收款人全称”并按名称分类到输出目录 -- 识别失败时可使用 OCR 兜底 +- 按页面内回执数量拆分并导出图片 +- 从文本中提取“收/付款人全称”并按名称分类到输出目录 +- 文本提取失败时可使用 OCR 兜底 - 生成执行结果记录文件(含每个 PDF 的目录与图片明细) -## 1. 项目结构 +## 2. 目录结构 -- index.go: Go 主程序 -- go.mod / go.sum: Go 模块依赖 -- source/: 默认输入目录(放 PDF) -- output/: 默认输出目录(分类后的图片) +- index.go:主程序入口(收款人版本) +- index_fk.go:主程序入口(付款人版本) +- go.mod / go.sum:Go 模块依赖 +- source/:默认输入目录(放 PDF) +- output/:默认输出目录(分类后的图片) +- dist/:构建产物目录 -## 2. 运行环境 +## 3. 运行环境 -- Windows - Go 1.19+ -- C 编译器(用于 cgo,go-fitz 依赖) +- C 编译器(go-fitz 依赖 cgo) - 可选:Tesseract OCR(用于识别扫描件) -当前项目依赖: +当前依赖: - github.com/gen2brain/go-fitz v1.20.2 -## 3. 工具链配置(Windows) +## 4. 
本地运行(Go) -如果构建时报错 `gcc not found`,可使用 LLVM-MinGW。 +### 4.1 直接运行 index.go -示例(已验证可用路径): - -1) 安装 LLVM-MinGW(任选一种) -- 使用 winget -- 手动下载并解压到 C:/llvm-mingw - -2) 配置 Go 使用 clang 作为 cgo 编译器(PowerShell) - -```powershell -go env -w CC="C:\llvm-mingw\llvm-mingw-20260519-ucrt-x86_64\bin\clang.exe" -go env -w CXX="C:\llvm-mingw\llvm-mingw-20260519-ucrt-x86_64\bin\clang++.exe" -go env -w CGO_ENABLED=1 +```bash +go run index.go ``` -3) 检查配置 +默认目录: +- 输入:./source +- 输出:./output -```powershell -go env CC CXX CGO_ENABLED +### 4.2 指定目录(示例) + +```bash +go run index.go -i ./source -o ./output2 ``` -## 4. 编译 +## 5. Windows 打包 + +### 5.1 Windows 本机构建(PowerShell) ```powershell go mod tidy -go build index.go +go build -o dist\pdf-picker.exe index_fk.go ``` -编译成功后会生成可执行文件: -- index.exe +### 5.2 macOS 交叉编译 Windows exe(本次已验证) -## 5. 运行 +先安装 MinGW: -### 5.1 默认目录 - -```powershell -.\index.exe +```bash +brew install mingw-w64 ``` -等价于: -- 输入目录:./source -- 输出目录:./output +打包命令: -### 5.2 指定目录(短参数) - -```powershell -.\index.exe -i ./source -o ./output2 +```bash +mkdir -p dist && CGO_ENABLED=1 GOOS=windows GOARCH=amd64 CC=x86_64-w64-mingw32-gcc CXX=x86_64-w64-mingw32-g++ go build -o dist/pdf-picker.exe index_fk.go ``` -### 5.3 指定目录(位置参数) +产物: +- dist/pdf-picker.exe -```powershell -.\index.exe ./source ./output2 -``` +## 6. 输出说明 -优先级: -- 命名参数(-i/-o 或 --input/--output) -- 位置参数 -- 默认目录(source/output) - -## 6. 输出结果 - -运行结束后会看到控制台统计,并在项目根目录生成记录文件: +运行后会生成: +- 控制台统计信息 - 执行结果记录_YYYYMMDD_HHMMSS.txt 记录内容包括: @@ -103,38 +87,39 @@ go build index.go ## 7. OCR 说明 -程序在文本提取失败时会尝试调用: +文本提取失败时会尝试调用: ```text tesseract stdout -l chi_sim+eng ``` -如果本机未安装 tesseract,OCR 兜底会自动跳过,不影响程序主流程(但扫描件识别率会下降)。 +未安装 tesseract 时会自动跳过 OCR,不影响主流程(但扫描件识别率会下降)。 ## 8. 
常见问题 -### Q1: 构建时报 `gcc not found` +### Q1:构建时报 gcc not found 原因:缺少 cgo 编译器。 -处理:按第 3 节安装并配置 LLVM-MinGW。 -### Q2: 构建时出现大量 `ld.lld: warning: duplicate symbol` +处理: +- Windows:安装 LLVM-MinGW 或 MinGW-w64,并配置 CC/CXX +- macOS 交叉编译:安装 mingw-w64 -这是链接阶段警告,若最终退出码为 0 且生成 index.exe,可继续使用。 +### Q2:没有识别出“付款人全称” -### Q3: 没有识别出“收款人全称” - -- 可能是 PDF 为扫描件且 OCR 不可用 -- 可能是单据模板字段格式与当前规则不一致 +可能原因: +- PDF 为扫描件且 OCR 不可用 +- 单据模板字段格式与当前规则不一致 可通过增强正则规则或补充 OCR 环境提升识别率。 ## 9. 快速开始 -```powershell +```bash # 1) 放入 PDF 到 source -# 2) 编译 -go build index.go -# 3) 运行 -.\index.exe -o ./output2 +# 2) 运行 +go run index_fk.go + +# 3) 或直接打包 Windows exe(macOS 交叉编译) +mkdir -p dist && CGO_ENABLED=1 GOOS=windows GOARCH=amd64 CC=x86_64-w64-mingw32-gcc CXX=x86_64-w64-mingw32-g++ go build -o dist/pdf-picker.exe index_fk.go ```
diff --git a/index_fk.go b/index_fk.go
new file mode 100644
index 0000000..c8f1e97
--- /dev/null
+++ b/index_fk.go
@@ -0,0 +1,536 @@
+// Command index_fk splits China Construction Bank online-banking e-receipt
+// PDFs into one PNG per receipt and files each image under a directory named
+// after the payer's full name (付款人全称) found in the receipt text.
+package main
+
+import (
+	"bytes"
+	"flag"
+	"fmt"
+	"image"
+	"image/draw"
+	"image/png"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"regexp"
+	"sort"
+	"strings"
+	"time"
+
+	fitz "github.com/gen2brain/go-fitz"
+)
+
+const (
+	// receiptHeader is the title text that starts every receipt on a page.
+	receiptHeader = "中国建设银行网上银行电子回执"
+
+	// payerLabel is the field label that precedes the payer's full name.
+	payerLabel = "付款人全称"
+
+	// unknownPayerDir is the directory name used when no payer name is found.
+	unknownPayerDir = "未识别付款人"
+
+	// labelTrimSet lists the characters stripped from the ends of a field value.
+	labelTrimSet = " ::"
+)
+
+// processResult summarises the images written for a single PDF.
+type processResult struct {
+	// SuccessCount is the number of receipts whose payer name was recognised.
+	SuccessCount int
+	// FailedCount is the number of receipts filed under unknownPayerDir.
+	FailedCount int
+	// PerDirCounts maps each target directory name to the images placed in it.
+	PerDirCounts map[string]int
+}
+
+var (
+	invalidPathChars = regexp.MustCompile(`[\\/:*?"<>|]`)
+	spacePattern     = regexp.MustCompile(`\s+`)
+	payeeStopPattern = regexp.MustCompile(`别名|账号|开户行|大写金额|小写金额|用途|钞汇标志|摘要|重要提示|收款人全称`)
+
+	// payerValueCutPattern marks where a regex-captured value stops being the name.
+	payerValueCutPattern = regexp.MustCompile(`\s{2,}|金额|开户行|账号|日期`)
+
+	// payerLabelPatterns are tried in order against the whitespace-preserving text.
+	// They are compiled once here instead of on every extractPayeeName call.
+	payerLabelPatterns = []*regexp.Regexp{
+		regexp.MustCompile(`付款人全称\s*[::]\s*([^\n\r]+)`),
+		regexp.MustCompile(`付款人\s*全称\s*[::]\s*([^\n\r]+)`),
+		regexp.MustCompile(`付\s*款\s*人\s*全\s*称\s*[::]?\s*([^\n\r]+)`),
+		regexp.MustCompile(`付款人全称\s+([^\n\r]+)`),
+	}
+)
+
+// safeDirName turns an extracted payer name into a single safe path element:
+// characters that are invalid in Windows file names become "_", whitespace runs
+// collapse to one space, and the result is capped at 120 runes. Names that are
+// empty or consist only of dots and spaces map to unknownPayerDir.
+func safeDirName(name string) string {
+	cleaned := invalidPathChars.ReplaceAllString(strings.TrimSpace(name), "_")
+	cleaned = spacePattern.ReplaceAllString(cleaned, " ")
+	// "." and ".." would resolve to the output directory or its parent once
+	// joined into a path, so they must never be used as a directory name.
+	if strings.Trim(cleaned, ". ") == "" {
+		return unknownPayerDir
+	}
+	if runes := []rune(cleaned); len(runes) > 120 {
+		return string(runes[:120])
+	}
+	return cleaned
+}
+
+// extractPayeeName returns the full name found after payerLabel in text, or ""
+// when none is found. Despite its name, the label it searches for is the payer
+// label (付款人全称).
+//
+// Three strategies are tried in order: a whitespace-free scan, the
+// payerLabelPatterns regular expressions, and a line-by-line fallback.
+func extractPayeeName(text string) string {
+	if strings.TrimSpace(text) == "" {
+		return ""
+	}
+
+	// Ideographic spaces are not matched by \s, so normalise them first.
+	normalized := strings.ReplaceAll(text, "\u3000", " ")
+
+	if name := payerFromCompactText(normalized); name != "" {
+		return name
+	}
+	if name := payerFromPatterns(normalized); name != "" {
+		return name
+	}
+	return payerFromLines(normalized)
+}
+
+// payerFromCompactText removes all whitespace, then takes the text between
+// payerLabel and the next label matched by payeeStopPattern.
+func payerFromCompactText(normalized string) string {
+	compact := spacePattern.ReplaceAllString(normalized, "")
+	_, tail, found := strings.Cut(compact, payerLabel)
+	if !found {
+		return ""
+	}
+	tail = strings.TrimLeft(tail, labelTrimSet)
+	// Only cut when something precedes the stop label; a tail that starts
+	// with a label is returned whole.
+	if stop := payeeStopPattern.FindStringIndex(tail); stop != nil && stop[0] > 0 {
+		tail = tail[:stop[0]]
+	}
+	return strings.Trim(tail, labelTrimSet)
+}
+
+// payerFromPatterns applies payerLabelPatterns in order and returns the first
+// non-empty capture, truncated at payerValueCutPattern.
+func payerFromPatterns(normalized string) string {
+	for _, re := range payerLabelPatterns {
+		match := re.FindStringSubmatch(normalized)
+		if len(match) < 2 {
+			continue
+		}
+		value := strings.TrimSpace(match[1])
+		value = strings.Trim(payerValueCutPattern.Split(value, 2)[0], labelTrimSet)
+		if value != "" {
+			return value
+		}
+	}
+	return ""
+}
+
+// payerFromLines looks for a non-blank line containing payerLabel and returns
+// the rest of that line or, when that is empty, the next non-blank line.
+func payerFromLines(normalized string) string {
+	var lines []string
+	for _, line := range strings.Split(normalized, "\n") {
+		if line = strings.TrimSpace(line); line != "" {
+			lines = append(lines, line)
+		}
+	}
+	for i, line := range lines {
+		_, after, found := strings.Cut(line, payerLabel)
+		if !found {
+			continue
+		}
+		if after = strings.Trim(after, labelTrimSet); after != "" {
+			return after
+		}
+		if i+1 < len(lines) {
+			if next := strings.Trim(lines[i+1], labelTrimSet); next != "" {
+				return next
+			}
+		}
+	}
+	return ""
+}
+
+// extractTextViaOCR runs the tesseract CLI on imagePath and returns its
+// standard output. Any failure, including tesseract not being installed,
+// yields "" so that OCR stays an optional fallback.
+func extractTextViaOCR(imagePath string) string {
+	var out bytes.Buffer
+	cmd := exec.Command("tesseract", imagePath, "stdout", "-l", "chi_sim+eng")
+	cmd.Stdout = &out
+	if err := cmd.Run(); err != nil {
+		return ""
+	}
+	return out.String()
+}
+
+// cropImage copies the rect region of src into a new RGBA image whose origin
+// is (0, 0).
+func cropImage(src image.Image, rect image.Rectangle) image.Image {
+	dst := image.NewRGBA(image.Rect(0, 0, rect.Dx(), rect.Dy()))
+	// draw.Draw copies whole rows for common image types instead of converting
+	// one pixel at a time through At/Set.
+	draw.Draw(dst, dst.Bounds(), src, rect.Min, draw.Src)
+	return dst
+}
+
+// splitPageTextByHeader splits pageText into one segment per receiptHeader
+// occurrence, each segment starting with the header. Text before the first
+// header is dropped. Pages with at most one header are returned unsplit, and
+// blank pages yield a single empty segment.
+func splitPageTextByHeader(pageText string) []string {
+	if strings.TrimSpace(pageText) == "" {
+		return []string{""}
+	}
+	parts := strings.Split(pageText, receiptHeader)
+	if len(parts) <= 2 {
+		return []string{pageText}
+	}
+	segments := make([]string, 0, len(parts)-1)
+	for _, part := range parts[1:] {
+		segments = append(segments, receiptHeader+part)
+	}
+	return segments
+}
+
+// splitPageImageByHeaderCount cuts img into count horizontal bands of nearly
+// equal height, top to bottom. With count <= 1, or when no band has positive
+// height, the original image is returned as the only element.
+func splitPageImageByHeaderCount(img image.Image, count int) []image.Image {
+	if count <= 1 {
+		return []image.Image{img}
+	}
+	b := img.Bounds()
+	segments := make([]image.Image, 0, count)
+	for i := 0; i < count; i++ {
+		y0 := b.Min.Y + b.Dy()*i/count
+		y1 := b.Min.Y + b.Dy()*(i+1)/count
+		if y1 <= y0 {
+			continue
+		}
+		segments = append(segments, cropImage(img, image.Rect(b.Min.X, y0, b.Max.X, y1)))
+	}
+	if len(segments) == 0 {
+		return []image.Image{img}
+	}
+	return segments
+}
+
+// writePNG encodes img as a PNG file at path. A partially written file is
+// removed when encoding or closing fails.
+func writePNG(path string, img image.Image) error {
+	f, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	encodeErr := png.Encode(f, img)
+	closeErr := f.Close()
+	if encodeErr == nil && closeErr == nil {
+		return nil
+	}
+	// Best-effort cleanup; the encode/close error is the one worth reporting.
+	_ = os.Remove(path)
+	if encodeErr != nil {
+		return encodeErr
+	}
+	return closeErr
+}
+
+// uniqueImagePath returns a path for name inside dir that is not taken yet,
+// appending _1, _2, ... before the extension when name already exists.
+func uniqueImagePath(dir, name string) string {
+	ext := filepath.Ext(name)
+	stem := strings.TrimSuffix(name, ext)
+	candidate := filepath.Join(dir, name)
+	for n := 1; ; n++ {
+		// Any Stat error (normally "not exist") means the name is usable; a
+		// real I/O problem will surface when the caller renames onto it.
+		if _, err := os.Stat(candidate); err != nil {
+			return candidate
+		}
+		candidate = filepath.Join(dir, fmt.Sprintf("%s_%d%s", stem, n, ext))
+	}
+}
+
+// saveReceipt writes segment as imageName, determines the payer from
+// receiptText (falling back to OCR on the written image) and moves the file
+// into the payer's directory under outputDir. It returns the directory name
+// used and whether a payer name was recognised.
+func saveReceipt(segment image.Image, receiptText, outputDir, imageName string) (string, bool, error) {
+	// The image is staged in outputDir first because OCR needs a file on disk.
+	stagedPath := filepath.Join(outputDir, imageName)
+	if err := writePNG(stagedPath, segment); err != nil {
+		return "", false, err
+	}
+
+	payerName := extractPayeeName(receiptText)
+	if payerName == "" {
+		payerName = extractPayeeName(extractTextViaOCR(stagedPath))
+	}
+	recognized := payerName != ""
+	if !recognized {
+		payerName = unknownPayerDir
+	}
+
+	dirName := safeDirName(payerName)
+	targetDir := filepath.Join(outputDir, dirName)
+	err := os.MkdirAll(targetDir, os.ModePerm)
+	if err == nil {
+		err = os.Rename(stagedPath, uniqueImagePath(targetDir, imageName))
+	}
+	if err != nil {
+		// Do not leave the staged image behind in the output root.
+		_ = os.Remove(stagedPath)
+		return "", false, err
+	}
+	return dirName, recognized, nil
+}
+
+// processPDF renders every page of pdfPath at 200 DPI, splits each page into
+// one image per receipt and files the images under outputDir by payer name.
+// Processing stops at the first page or image that fails; the returned result
+// is only meaningful when the error is nil.
+func processPDF(pdfPath, outputDir string) (processResult, error) {
+	res := processResult{PerDirCounts: map[string]int{}}
+
+	doc, err := fitz.New(pdfPath)
+	if err != nil {
+		return res, fmt.Errorf("打开 PDF 失败: %w", err)
+	}
+	defer doc.Close()
+
+	baseName := strings.TrimSuffix(filepath.Base(pdfPath), filepath.Ext(pdfPath))
+	totalPages := doc.NumPage()
+	for pageIndex := 0; pageIndex < totalPages; pageIndex++ {
+		pageText, textErr := doc.Text(pageIndex)
+		if textErr != nil {
+			// Text extraction is best effort; OCR may still recover the name.
+			pageText = ""
+		}
+
+		// The number of headers on the page decides how many bands to cut.
+		headerCount := strings.Count(pageText, receiptHeader)
+		if headerCount <= 0 {
+			headerCount = 1
+		}
+
+		img, imgErr := doc.ImageDPI(pageIndex, 200)
+		if imgErr != nil {
+			return res, fmt.Errorf("渲染第 %d 页失败: %w", pageIndex+1, imgErr)
+		}
+
+		imgSegments := splitPageImageByHeaderCount(img, headerCount)
+		textSegments := splitPageTextByHeader(pageText)
+
+		for i, segment := range imgSegments {
+			receiptText := pageText
+			if i < len(textSegments) {
+				receiptText = textSegments[i]
+			}
+
+			imageName := fmt.Sprintf("%s_p%03d_r%03d.png", baseName, pageIndex+1, i+1)
+			dirName, recognized, saveErr := saveReceipt(segment, receiptText, outputDir, imageName)
+			if saveErr != nil {
+				return res, fmt.Errorf("保存第 %d 页第 %d 张回执失败: %w", pageIndex+1, i+1, saveErr)
+			}
+
+			if recognized {
+				res.SuccessCount++
+			} else {
+				res.FailedCount++
+			}
+			res.PerDirCounts[dirName]++
+		}
+	}
+
+	return res, nil
+}
+
+// writeExecutionReport writes lines to a timestamped report file in baseDir
+// and returns the path of the file it created.
+func writeExecutionReport(baseDir string, lines []string) (string, error) {
+	name := fmt.Sprintf("执行结果记录_%s.txt", time.Now().Format("20060102_150405"))
+	reportPath := filepath.Join(baseDir, name)
+	content := strings.Join(lines, "\n") + "\n"
+	if err := os.WriteFile(reportPath, []byte(content), 0o644); err != nil {
+		return "", err
+	}
+	return reportPath, nil
+}
+
+// absOrClean returns path as an absolute path, or its cleaned form when the
+// absolute path cannot be determined.
+func absOrClean(path string) string {
+	abs, err := filepath.Abs(path)
+	if err != nil {
+		return filepath.Clean(path)
+	}
+	return abs
+}
+
+// resolveIODirs parses the command line and returns the absolute input and
+// output directories. Named flags (--input/-i, --output/-o) win over
+// positional arguments, which win over the defaults baseDir/source and
+// baseDir/output. It registers flags on the global flag set, so it must be
+// called only once per process.
+func resolveIODirs(baseDir string) (string, string) {
+	inputOpt := flag.String("input", "", "输入目录")
+	outputOpt := flag.String("output", "", "输出目录")
+	inputShort := flag.String("i", "", "输入目录")
+	outputShort := flag.String("o", "", "输出目录")
+	flag.Parse()
+
+	// Missing positional arguments are simply treated as empty.
+	var posInput, posOutput string
+	if args := flag.Args(); len(args) >= 1 {
+		posInput = args[0]
+		if len(args) >= 2 {
+			posOutput = args[1]
+		}
+	}
+
+	sourceDir := firstNonEmpty(*inputOpt, *inputShort, posInput)
+	if sourceDir == "" {
+		sourceDir = filepath.Join(baseDir, "source")
+	}
+	outputDir := firstNonEmpty(*outputOpt, *outputShort, posOutput)
+	if outputDir == "" {
+		outputDir = filepath.Join(baseDir, "output")
+	}
+
+	return absOrClean(sourceDir), absOrClean(outputDir)
+}
+
+// firstNonEmpty returns the first value that is not blank, unchanged, or ""
+// when every value is blank.
+func firstNonEmpty(values ...string) string {
+	for _, v := range values {
+		if strings.TrimSpace(v) != "" {
+			return v
+		}
+	}
+	return ""
+}
+
+// main classifies every PDF in the input directory and writes an execution
+// report to the current working directory.
+func main() {
+	baseDir, wdErr := os.Getwd()
+	if wdErr != nil {
+		fmt.Printf("获取当前目录失败: %v\n", wdErr)
+		os.Exit(1)
+	}
+
+	sourceDir, outputDir := resolveIODirs(baseDir)
+	if err := os.MkdirAll(sourceDir, os.ModePerm); err != nil {
+		fmt.Printf("创建输入目录失败: %v\n", err)
+		os.Exit(1)
+	}
+	if err := os.MkdirAll(outputDir, os.ModePerm); err != nil {
+		fmt.Printf("创建输出目录失败: %v\n", err)
+		os.Exit(1)
+	}
+
+	entries, readErr := os.ReadDir(sourceDir)
+	if readErr != nil {
+		fmt.Printf("读取输入目录失败: %v\n", readErr)
+		os.Exit(1)
+	}
+
+	var pdfFiles []string
+	for _, e := range entries {
+		if !e.IsDir() && strings.EqualFold(filepath.Ext(e.Name()), ".pdf") {
+			pdfFiles = append(pdfFiles, filepath.Join(sourceDir, e.Name()))
+		}
+	}
+	sort.Strings(pdfFiles)
+
+	if len(pdfFiles) == 0 {
+		fmt.Printf("未在目录中发现 PDF: %s\n", sourceDir)
+		fmt.Println("请将 PDF 文件放到 source 目录后重试。")
+		return
+	}
+
+	totalOK := 0
+	totalUnknown := 0
+	failedPDFs := 0
+	reportLines := []string{
+		fmt.Sprintf("执行时间: %s", time.Now().Format("2006-01-02 15:04:05")),
+		fmt.Sprintf("输入目录: %s", sourceDir),
+		fmt.Sprintf("输出目录: %s", outputDir),
+		"",
+	}
+
+	for _, pdfPath := range pdfFiles {
+		pdfName := filepath.Base(pdfPath)
+		fmt.Printf("处理文件: %s\n", pdfName)
+		res, err := processPDF(pdfPath, outputDir)
+		if err != nil {
+			fmt.Printf(" 处理失败: %v\n", err)
+			// Record the failure so the report accounts for every input PDF.
+			failedPDFs++
+			reportLines = append(reportLines,
+				fmt.Sprintf("PDF: %s", pdfName),
+				fmt.Sprintf(" 处理失败: %v", err),
+				"",
+			)
+			continue
+		}
+
+		fmt.Printf(" 识别到付款人全称的图片数: %d\n", res.SuccessCount)
+		fmt.Printf(" 未识别付款人全称的图片数: %d\n", res.FailedCount)
+
+		totalOK += res.SuccessCount
+		totalUnknown += res.FailedCount
+
+		reportLines = append(reportLines,
+			fmt.Sprintf("PDF: %s", pdfName),
+			fmt.Sprintf(" 拆分目录数: %d", len(res.PerDirCounts)),
+			fmt.Sprintf(" 识别到付款人全称的图片数: %d", res.SuccessCount),
+			fmt.Sprintf(" 未识别付款人全称的图片数: %d", res.FailedCount),
+			" 目录明细:",
+		)
+
+		dirNames := make([]string, 0, len(res.PerDirCounts))
+		for dir := range res.PerDirCounts {
+			dirNames = append(dirNames, dir)
+		}
+		sort.Strings(dirNames)
+		for _, dir := range dirNames {
+			reportLines = append(reportLines, fmt.Sprintf(" - %s: %d 张", dir, res.PerDirCounts[dir]))
+		}
+		reportLines = append(reportLines, "")
+	}
+
+	fmt.Println("\n处理完成")
+	fmt.Printf("识别到付款人全称的图片数: %d\n", totalOK)
+	fmt.Printf("未识别付款人全称的图片数: %d\n", totalUnknown)
+	fmt.Printf("处理失败的 PDF 数: %d\n", failedPDFs)
+	fmt.Printf("输出目录: %s\n", outputDir)
+
+	reportLines = append(reportLines,
+		"汇总:",
+		fmt.Sprintf(" 识别到付款人全称的图片总数: %d", totalOK),
+		fmt.Sprintf(" 未识别付款人全称的图片总数: %d", totalUnknown),
+		fmt.Sprintf(" 处理失败的 PDF 数: %d", failedPDFs),
+	)
+
+	reportPath, err := writeExecutionReport(baseDir, reportLines)
+	if err != nil {
+		fmt.Printf("写入执行结果记录失败: %v\n", err)
+		os.Exit(1)
+	}
+	fmt.Printf("执行结果记录文件: %s\n", reportPath)
+}