From 7878c7bee6b478d387e1fc03a85c5b8ea15329fa Mon Sep 17 00:00:00 2001 From: liyanyan <215952619@qq.com> Date: Tue, 26 May 2026 21:36:35 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0golang=E7=89=88=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 6 + README.md | 140 ++++++++++++++++++ go.mod | 5 + go.sum | 2 + index.go | 418 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 571 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 go.mod create mode 100644 go.sum create mode 100644 index.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c62241d --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +__pycache__ +output +source +执行结果记录_**.txt +*.log +*.exe diff --git a/README.md b/README.md new file mode 100644 index 0000000..1e13515 --- /dev/null +++ b/README.md @@ -0,0 +1,140 @@ +# pdf-picker + +基于 Go 的 PDF 回执拆分与分类工具。 + +功能说明: +- 扫描输入目录中的 PDF 文件 +- 将回执按页面内多条回执进行拆分并导出图片 +- 从文本中提取“收款人全称”并按名称分类到输出目录 +- 识别失败时可使用 OCR 兜底 +- 生成执行结果记录文件(含每个 PDF 的目录与图片明细) + +## 1. 项目结构 + +- index.go: Go 主程序 +- go.mod / go.sum: Go 模块依赖 +- source/: 默认输入目录(放 PDF) +- output/: 默认输出目录(分类后的图片) + +## 2. 运行环境 + +- Windows +- Go 1.19+ +- C 编译器(用于 cgo,go-fitz 依赖) +- 可选:Tesseract OCR(用于识别扫描件) + +当前项目依赖: +- github.com/gen2brain/go-fitz v1.20.2 + +## 3. 工具链配置(Windows) + +如果构建时报错 `gcc not found`,可使用 LLVM-MinGW。 + +示例(已验证可用路径): + +1) 安装 LLVM-MinGW(任选一种) +- 使用 winget +- 手动下载并解压到 C:/llvm-mingw + +2) 配置 Go 使用 clang 作为 cgo 编译器(PowerShell) + +```powershell +go env -w CC="C:\llvm-mingw\llvm-mingw-20260519-ucrt-x86_64\bin\clang.exe" +go env -w CXX="C:\llvm-mingw\llvm-mingw-20260519-ucrt-x86_64\bin\clang++.exe" +go env -w CGO_ENABLED=1 +``` + +3) 检查配置 + +```powershell +go env CC CXX CGO_ENABLED +``` + +## 4. 编译 + +```powershell +go mod tidy +go build index.go +``` + +编译成功后会生成可执行文件: +- index.exe + +## 5. 
运行 + +### 5.1 默认目录 + +```powershell +.\index.exe +``` + +等价于: +- 输入目录:./source +- 输出目录:./output + +### 5.2 指定目录(短参数) + +```powershell +.\index.exe -i ./source -o ./output2 +``` + +### 5.3 指定目录(位置参数) + +```powershell +.\index.exe ./source ./output2 +``` + +优先级: +- 命名参数(-i/-o 或 --input/--output) +- 位置参数 +- 默认目录(source/output) + +## 6. 输出结果 + +运行结束后会看到控制台统计,并在当前工作目录生成记录文件: + +- 执行结果记录_YYYYMMDD_HHMMSS.txt + +记录内容包括: +- 每个 PDF 拆分目录数 +- 每个目录对应图片数量 +- 识别成功/失败数量 +- 全部汇总 + +## 7. OCR 说明 + +程序在文本提取失败时会尝试调用: + +```text +tesseract <图片路径> stdout -l chi_sim+eng +``` + +如果本机未安装 tesseract,OCR 兜底会自动跳过,不影响程序主流程(但扫描件识别率会下降)。 + +## 8. 常见问题 + +### Q1: 构建时报 `gcc not found` + +原因:缺少 cgo 编译器。 +处理:按第 3 节安装并配置 LLVM-MinGW。 + +### Q2: 构建时出现大量 `ld.lld: warning: duplicate symbol` + +这是链接阶段警告,若最终退出码为 0 且生成 index.exe,可继续使用。 + +### Q3: 没有识别出“收款人全称” + +- 可能是 PDF 为扫描件且 OCR 不可用 +- 可能是单据模板字段格式与当前规则不一致 + +可通过增强正则规则或补充 OCR 环境提升识别率。 + +## 9. 快速开始 + +```powershell +# 1) 放入 PDF 到 source +# 2) 编译 +go build index.go +# 3) 运行 +.\index.exe -o ./output2 +``` diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..6b60c61 --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module pdf-picker + +go 1.19 + +require github.com/gen2brain/go-fitz v1.20.2 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..edb0cd0 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +github.com/gen2brain/go-fitz v1.20.2 h1:4FPJCU/ImQ32oojBsYn/+oTkRORxbAhAA+Yw1Fm97MA= +github.com/gen2brain/go-fitz v1.20.2/go.mod h1:YbQPODTC/UnQ/RK4JyD3zfpDQ19UKiV85nMMT3XpT0s= diff --git a/index.go b/index.go new file mode 100644 index 0000000..9c43947 --- /dev/null +++ b/index.go @@ -0,0 +1,418 @@ +package main + +import ( + "bytes" + "flag" + "fmt" + "image" + "image/png" + "os" + "os/exec" + "path/filepath" + "regexp" + "sort" + "strings" + "time" + + fitz "github.com/gen2brain/go-fitz" +) + +const receiptHeader = "中国建设银行网上银行电子回执" + +type processResult struct { + SuccessCount int + FailedCount int + PerDirCounts map[string]int +} + +var ( + 
invalidPathChars = regexp.MustCompile(`[\\/:*?"<>|]`) + spacePattern = regexp.MustCompile(`\s+`) + payeeStopPattern = regexp.MustCompile(`别名|账号|开户行|大写金额|小写金额|用途|钞汇标志|摘要|重要提示|付款人全称`) +) + +func safeDirName(name string) string { + cleaned := invalidPathChars.ReplaceAllString(strings.TrimSpace(name), "_") + cleaned = spacePattern.ReplaceAllString(cleaned, " ") + if cleaned == "" { + return "未识别收款人" + } + if len([]rune(cleaned)) > 120 { + return string([]rune(cleaned)[:120]) + } + return cleaned +} + +func extractPayeeName(text string) string { + if strings.TrimSpace(text) == "" { + return "" + } + + normalized := strings.ReplaceAll(text, "\u3000", " ") + compact := spacePattern.ReplaceAllString(normalized, "") + + if idx := strings.Index(compact, "收款人全称"); idx >= 0 { + tail := strings.TrimLeft(compact[idx+len("收款人全称"):], "::") + if tail != "" { + if stop := payeeStopPattern.FindStringIndex(tail); stop != nil && stop[0] > 0 { + tail = tail[:stop[0]] + } + tail = strings.Trim(tail, " ::") + if tail != "" { + return tail + } + } + } + + patterns := []string{ + `收款人全称\s*[::]\s*([^\n\r]+)`, + `收款人\s*全称\s*[::]\s*([^\n\r]+)`, + `收\s*款\s*人\s*全\s*称\s*[::]?\s*([^\n\r]+)`, + `收款人全称\s+([^\n\r]+)`, + } + for _, p := range patterns { + re := regexp.MustCompile(p) + match := re.FindStringSubmatch(normalized) + if len(match) > 1 { + value := strings.TrimSpace(match[1]) + cutRe := regexp.MustCompile(`\s{2,}|金额|开户行|账号|日期`) + value = strings.Trim(cutRe.Split(value, 2)[0], " ::") + if value != "" { + return value + } + } + } + + lines := strings.Split(normalized, "\n") + filtered := make([]string, 0, len(lines)) + for _, line := range lines { + line = strings.TrimSpace(line) + if line != "" { + filtered = append(filtered, line) + } + } + for i, line := range filtered { + if strings.Contains(line, "收款人全称") { + after := strings.Trim(strings.SplitN(line, "收款人全称", 2)[1], " ::") + if after != "" { + return after + } + if i+1 < len(filtered) { + candidate := strings.Trim(filtered[i+1], " ::") 
+ if candidate != "" { + return candidate + } + } + } + } + + return "" +} + +func extractTextViaOCR(imagePath string) string { + cmd := exec.Command("tesseract", imagePath, "stdout", "-l", "chi_sim+eng") + var out bytes.Buffer + cmd.Stdout = &out + cmd.Stderr = nil + if err := cmd.Run(); err != nil { + return "" + } + return out.String() +} + +func cropImage(src image.Image, rect image.Rectangle) image.Image { + rgba := image.NewRGBA(image.Rect(0, 0, rect.Dx(), rect.Dy())) + for y := rect.Min.Y; y < rect.Max.Y; y++ { + for x := rect.Min.X; x < rect.Max.X; x++ { + rgba.Set(x-rect.Min.X, y-rect.Min.Y, src.At(x, y)) + } + } + return rgba +} + +func splitPageTextByHeader(pageText string) []string { + if strings.TrimSpace(pageText) == "" { + return []string{""} + } + count := strings.Count(pageText, receiptHeader) + if count <= 1 { + return []string{pageText} + } + + parts := strings.Split(pageText, receiptHeader) + segments := make([]string, 0, count) + for i := 1; i < len(parts); i++ { + segments = append(segments, receiptHeader+parts[i]) + } + if len(segments) == 0 { + return []string{pageText} + } + return segments +} + +func splitPageImageByHeaderCount(img image.Image, count int) []image.Image { + b := img.Bounds() + if count <= 1 { + return []image.Image{img} + } + height := b.Dy() + width := b.Dx() + segments := make([]image.Image, 0, count) + for i := 0; i < count; i++ { + y0 := b.Min.Y + (height*i)/count + y1 := b.Min.Y + (height*(i+1))/count + if y1 <= y0 { + continue + } + rect := image.Rect(b.Min.X, y0, b.Min.X+width, y1) + segments = append(segments, cropImage(img, rect)) + } + if len(segments) == 0 { + return []image.Image{img} + } + return segments +} + +func processPDF(pdfPath, outputDir string) (processResult, error) { + res := processResult{PerDirCounts: map[string]int{}} + + doc, err := fitz.New(pdfPath) + if err != nil { + return res, err + } + defer doc.Close() + + totalPages := doc.NumPage() + for pageIndex := 0; pageIndex < totalPages; 
pageIndex++ { + pageText, textErr := doc.Text(pageIndex) + if textErr != nil { + pageText = "" + } + + headerCount := strings.Count(pageText, receiptHeader) + if headerCount <= 0 { + headerCount = 1 + } + + img, imgErr := doc.ImageDPI(pageIndex, 200) + if imgErr != nil { + return res, imgErr + } + + imgSegments := splitPageImageByHeaderCount(img, headerCount) + textSegments := splitPageTextByHeader(pageText) + + for i, segment := range imgSegments { + receiptIndex := i + 1 + imageName := fmt.Sprintf("%s_p%03d_r%03d.png", strings.TrimSuffix(filepath.Base(pdfPath), filepath.Ext(pdfPath)), pageIndex+1, receiptIndex) + tempImagePath := filepath.Join(outputDir, imageName) + + f, createErr := os.Create(tempImagePath) + if createErr != nil { + return res, createErr + } + encodeErr := png.Encode(f, segment) + closeErr := f.Close() + if encodeErr != nil { + return res, encodeErr + } + if closeErr != nil { + return res, closeErr + } + + receiptText := pageText + if i < len(textSegments) { + receiptText = textSegments[i] + } + payeeName := extractPayeeName(receiptText) + + if payeeName == "" { + ocrText := extractTextViaOCR(tempImagePath) + payeeName = extractPayeeName(ocrText) + } + + if payeeName == "" { + payeeName = "未识别收款人" + res.FailedCount++ + } else { + res.SuccessCount++ + } + + targetDirName := safeDirName(payeeName) + targetDir := filepath.Join(outputDir, targetDirName) + if mkErr := os.MkdirAll(targetDir, os.ModePerm); mkErr != nil { + return res, mkErr + } + res.PerDirCounts[targetDirName]++ + + finalImagePath := filepath.Join(targetDir, imageName) + if _, statErr := os.Stat(finalImagePath); statErr == nil { + finalImagePath = filepath.Join(targetDir, + fmt.Sprintf("%s_p%03d_r%03d_%d.png", strings.TrimSuffix(filepath.Base(pdfPath), filepath.Ext(pdfPath)), pageIndex+1, receiptIndex, os.Getpid())) + } + + if renameErr := os.Rename(tempImagePath, finalImagePath); renameErr != nil { + return res, renameErr + } + } + } + + return res, nil +} + +func 
writeExecutionReport(baseDir string, lines []string) (string, error) { + timestamp := time.Now().Format("20060102_150405") + reportPath := filepath.Join(baseDir, fmt.Sprintf("执行结果记录_%s.txt", timestamp)) + content := strings.Join(lines, "\n") + "\n" + if err := os.WriteFile(reportPath, []byte(content), 0o644); err != nil { + return "", err + } + return reportPath, nil +} + +func resolveIODirs(baseDir string) (string, string) { + defaultSourceDir := filepath.Join(baseDir, "source") + defaultOutputDir := filepath.Join(baseDir, "output") + + inputOpt := flag.String("input", "", "输入目录") + outputOpt := flag.String("output", "", "输出目录") + inputShort := flag.String("i", "", "输入目录") + outputShort := flag.String("o", "", "输出目录") + flag.Parse() + + args := flag.Args() + posInput := "" + posOutput := "" + if len(args) >= 1 { + posInput = args[0] + } + if len(args) >= 2 { + posOutput = args[1] + } + + inputRaw := firstNonEmpty(*inputOpt, *inputShort, posInput) + outputRaw := firstNonEmpty(*outputOpt, *outputShort, posOutput) + + sourceDir := defaultSourceDir + if inputRaw != "" { + sourceDir = inputRaw + } + outputDir := defaultOutputDir + if outputRaw != "" { + outputDir = outputRaw + } + + sourceAbs, _ := filepath.Abs(sourceDir) + outputAbs, _ := filepath.Abs(outputDir) + return sourceAbs, outputAbs +} + +func firstNonEmpty(values ...string) string { + for _, v := range values { + if strings.TrimSpace(v) != "" { + return v + } + } + return "" +} + +func main() { + baseDir, wdErr := os.Getwd() + if wdErr != nil { + fmt.Printf("获取当前目录失败: %v\n", wdErr) + os.Exit(1) + } + + sourceDir, outputDir := resolveIODirs(baseDir) + if err := os.MkdirAll(sourceDir, os.ModePerm); err != nil { + fmt.Printf("创建输入目录失败: %v\n", err) + os.Exit(1) + } + if err := os.MkdirAll(outputDir, os.ModePerm); err != nil { + fmt.Printf("创建输出目录失败: %v\n", err) + os.Exit(1) + } + + entries, readErr := os.ReadDir(sourceDir) + if readErr != nil { + fmt.Printf("读取输入目录失败: %v\n", readErr) + os.Exit(1) + } + + 
pdfFiles := make([]string, 0) + for _, e := range entries { + if e.IsDir() { + continue + } + if strings.EqualFold(filepath.Ext(e.Name()), ".pdf") { + pdfFiles = append(pdfFiles, filepath.Join(sourceDir, e.Name())) + } + } + sort.Strings(pdfFiles) + + if len(pdfFiles) == 0 { + fmt.Printf("未在目录中发现 PDF: %s\n", sourceDir) + fmt.Println("请将 PDF 文件放到 source 目录后重试。") + return + } + + totalOK := 0 + totalUnknown := 0 + reportLines := []string{ + fmt.Sprintf("执行时间: %s", time.Now().Format("2006-01-02 15:04:05")), + fmt.Sprintf("输入目录: %s", sourceDir), + fmt.Sprintf("输出目录: %s", outputDir), + "", + } + + for _, pdfPath := range pdfFiles { + fmt.Printf("处理文件: %s\n", filepath.Base(pdfPath)) + res, err := processPDF(pdfPath, outputDir) + if err != nil { + fmt.Printf(" 处理失败: %v\n", err) + continue + } + + fmt.Printf(" 识别到收款人全称的图片数: %d\n", res.SuccessCount) + fmt.Printf(" 未识别收款人全称的图片数: %d\n", res.FailedCount) + + totalOK += res.SuccessCount + totalUnknown += res.FailedCount + + reportLines = append(reportLines, + fmt.Sprintf("PDF: %s", filepath.Base(pdfPath)), + fmt.Sprintf(" 拆分目录数: %d", len(res.PerDirCounts)), + fmt.Sprintf(" 识别到收款人全称的图片数: %d", res.SuccessCount), + fmt.Sprintf(" 未识别收款人全称的图片数: %d", res.FailedCount), + " 目录明细:", + ) + + dirNames := make([]string, 0, len(res.PerDirCounts)) + for dir := range res.PerDirCounts { + dirNames = append(dirNames, dir) + } + sort.Strings(dirNames) + for _, dir := range dirNames { + reportLines = append(reportLines, fmt.Sprintf(" - %s: %d 张", dir, res.PerDirCounts[dir])) + } + reportLines = append(reportLines, "") + } + + fmt.Println("\n处理完成") + fmt.Printf("识别到收款人全称的图片数: %d\n", totalOK) + fmt.Printf("未识别收款人全称的图片数: %d\n", totalUnknown) + fmt.Printf("输出目录: %s\n", outputDir) + + reportLines = append(reportLines, + "汇总:", + fmt.Sprintf(" 识别到收款人全称的图片总数: %d", totalOK), + fmt.Sprintf(" 未识别收款人全称的图片总数: %d", totalUnknown), + ) + + reportPath, err := writeExecutionReport(baseDir, reportLines) + if err != nil { + fmt.Printf("写入执行结果记录失败: %v\n", err) + 
os.Exit(1) + } + fmt.Printf("执行结果记录文件: %s\n", reportPath) +}