// Command main splits China Construction Bank online-banking e-receipt PDFs
// into one PNG image per receipt and files each image into a directory named
// after the receipt's payee ("收款人全称"). A plain-text run report is written
// to the current working directory when processing finishes.
package main

import (
	"bytes"
	"flag"
	"fmt"
	"image"
	"image/draw"
	"image/png"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"time"

	fitz "github.com/gen2brain/go-fitz"
)

const (
	// receiptHeader is the title text that starts every receipt on a page.
	receiptHeader = "中国建设银行网上银行电子回执"
	// payeeLabel is the field label that precedes the payee's full name.
	payeeLabel = "收款人全称"
	// unknownPayeeDir is the directory used when no payee name can be found.
	unknownPayeeDir = "未识别收款人"
	// maxDirNameRunes caps the length of a generated directory name.
	maxDirNameRunes = 120
	// labelTrimCutset lists the characters stripped around an extracted value:
	// space, ASCII colon and full-width colon (U+FF1A). The escape is used so
	// the two colons cannot be confused with each other in an editor.
	labelTrimCutset = " :\uFF1A"
	// renderDPI is the resolution used when rasterising PDF pages.
	renderDPI = 200
)

// processResult summarises the outcome of processing a single PDF.
type processResult struct {
	SuccessCount int            // receipts whose payee name was recognised
	FailedCount  int            // receipts filed under unknownPayeeDir
	PerDirCounts map[string]int // number of images stored per target directory
}

// All patterns are compiled once at package scope; none is built per call.
var (
	invalidPathChars = regexp.MustCompile(`[\\/:*?"<>|]`)
	spacePattern     = regexp.MustCompile(`\s+`)
	payeeStopPattern = regexp.MustCompile(`别名|账号|开户行|大写金额|小写金额|用途|钞汇标志|摘要|重要提示|付款人全称`)
	payeeCutPattern  = regexp.MustCompile(`\s{2,}|金额|开户行|账号|日期`)

	// payeeLinePatterns are tried in order against the whitespace-preserving
	// text; each captures the rest of the line after the payee label.
	payeeLinePatterns = []*regexp.Regexp{
		regexp.MustCompile(`收款人全称\s*[:\x{FF1A}]\s*([^\n\r]+)`),
		regexp.MustCompile(`收款人\s*全称\s*[:\x{FF1A}]\s*([^\n\r]+)`),
		regexp.MustCompile(`收\s*款\s*人\s*全\s*称\s*[:\x{FF1A}]?\s*([^\n\r]+)`),
		regexp.MustCompile(`收款人全称\s+([^\n\r]+)`),
	}
)

// safeDirName turns a payee name into a single, safe path component.
//
// Characters matched by invalidPathChars become "_", whitespace runs collapse
// to one space, and the result is truncated to maxDirNameRunes runes. An empty
// result, "." or ".." yields unknownPayeeDir so that joining the name onto the
// output directory can never resolve to the output directory itself or to its
// parent.
func safeDirName(name string) string {
	cleaned := invalidPathChars.ReplaceAllString(strings.TrimSpace(name), "_")
	cleaned = spacePattern.ReplaceAllString(cleaned, " ")
	if runes := []rune(cleaned); len(runes) > maxDirNameRunes {
		cleaned = string(runes[:maxDirNameRunes])
	}
	switch cleaned {
	case "", ".", "..":
		return unknownPayeeDir
	}
	return cleaned
}

// extractPayeeName returns the value following the payeeLabel field in text,
// or "" when none is found. Both ASCII and full-width colons are accepted as
// the separator between label and value.
//
// Three strategies are tried in order:
//  1. Remove all whitespace, take what follows the label and cut it at the
//     first keyword in payeeStopPattern.
//  2. Match payeeLinePatterns against the original layout and cut the
//     captured line at payeeCutPattern.
//  3. Scan non-empty lines for the label and use the remainder of that line,
//     or the next line when the remainder is empty.
func extractPayeeName(text string) string {
	if strings.TrimSpace(text) == "" {
		return ""
	}
	// Ideographic spaces are not matched by \s, so normalise them first.
	normalized := strings.ReplaceAll(text, "\u3000", " ")

	// Strategy 1: whitespace-free text.
	compact := spacePattern.ReplaceAllString(normalized, "")
	if idx := strings.Index(compact, payeeLabel); idx >= 0 {
		tail := strings.TrimLeft(compact[idx+len(payeeLabel):], labelTrimCutset)
		if tail != "" {
			// Only cut when the stop keyword is not at position 0; otherwise
			// the whole tail is kept, as before.
			if stop := payeeStopPattern.FindStringIndex(tail); stop != nil && stop[0] > 0 {
				tail = tail[:stop[0]]
			}
			if tail = strings.Trim(tail, labelTrimCutset); tail != "" {
				return tail
			}
		}
	}

	// Strategy 2: line-oriented patterns on the original layout.
	for _, re := range payeeLinePatterns {
		match := re.FindStringSubmatch(normalized)
		if len(match) < 2 {
			continue
		}
		value := payeeCutPattern.Split(strings.TrimSpace(match[1]), 2)[0]
		if value = strings.Trim(value, labelTrimCutset); value != "" {
			return value
		}
	}

	// Strategy 3: same line after the label, else the following line.
	var lines []string
	for _, line := range strings.Split(normalized, "\n") {
		if line = strings.TrimSpace(line); line != "" {
			lines = append(lines, line)
		}
	}
	for i, line := range lines {
		if !strings.Contains(line, payeeLabel) {
			continue
		}
		after := strings.Trim(strings.SplitN(line, payeeLabel, 2)[1], labelTrimCutset)
		if after != "" {
			return after
		}
		if i+1 < len(lines) {
			if next := strings.Trim(lines[i+1], labelTrimCutset); next != "" {
				return next
			}
		}
	}
	return ""
}

// extractTextViaOCR runs the external "tesseract" command on imagePath with
// the "chi_sim+eng" language setting and returns what it prints to stdout.
// OCR is a best-effort fallback: any failure to run the command yields "".
func extractTextViaOCR(imagePath string) string {
	cmd := exec.Command("tesseract", imagePath, "stdout", "-l", "chi_sim+eng")
	var out bytes.Buffer
	cmd.Stdout = &out
	if err := cmd.Run(); err != nil {
		return ""
	}
	return out.String()
}

// cropImage copies the rect region of src into a new RGBA image whose origin
// is (0, 0). Parts of rect outside src stay transparent.
func cropImage(src image.Image, rect image.Rectangle) image.Image {
	dst := image.NewRGBA(image.Rect(0, 0, rect.Dx(), rect.Dy()))
	draw.Draw(dst, dst.Bounds(), src, rect.Min, draw.Src)
	return dst
}

// splitPageTextByHeader splits a page's text into one segment per receipt.
//
// Blank text yields a single empty segment, and text with at most one
// receiptHeader is returned whole. Otherwise each segment starts with the
// header and runs to the next one; text before the first header is dropped.
func splitPageTextByHeader(pageText string) []string {
	if strings.TrimSpace(pageText) == "" {
		return []string{""}
	}
	parts := strings.Split(pageText, receiptHeader)
	// len(parts)-1 is the number of header occurrences.
	if len(parts) <= 2 {
		return []string{pageText}
	}
	segments := make([]string, 0, len(parts)-1)
	for _, body := range parts[1:] {
		segments = append(segments, receiptHeader+body)
	}
	return segments
}

// splitPageImageByHeaderCount cuts img into count horizontal bands of (as
// near as integer division allows) equal height, top to bottom. With
// count <= 1, or when no band has positive height, img is returned unchanged
// as the only element.
func splitPageImageByHeaderCount(img image.Image, count int) []image.Image {
	if count <= 1 {
		return []image.Image{img}
	}
	b := img.Bounds()
	height := b.Dy()
	segments := make([]image.Image, 0, count)
	for i := 0; i < count; i++ {
		y0 := b.Min.Y + (height*i)/count
		y1 := b.Min.Y + (height*(i+1))/count
		if y1 <= y0 {
			continue // image shorter than count rows: skip empty bands
		}
		segments = append(segments, cropImage(img, image.Rect(b.Min.X, y0, b.Max.X, y1)))
	}
	if len(segments) == 0 {
		return []image.Image{img}
	}
	return segments
}

// writePNG encodes img as PNG into a newly created (or truncated) file at
// path. An encode error takes precedence over a close error.
func writePNG(path string, img image.Image) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	encodeErr := png.Encode(f, img)
	closeErr := f.Close()
	if encodeErr != nil {
		return encodeErr
	}
	return closeErr
}

// uniquePath returns dir/name when nothing exists there, otherwise the first
// dir/<stem>_<n><ext> (n = 1, 2, ...) for which os.Stat reports an error.
// A counter is used rather than the process ID, which is constant within a
// run and can repeat across runs, so an existing file is never overwritten.
func uniquePath(dir, name string) string {
	candidate := filepath.Join(dir, name)
	ext := filepath.Ext(name)
	stem := strings.TrimSuffix(name, ext)
	for n := 1; ; n++ {
		if _, err := os.Stat(candidate); err != nil {
			return candidate
		}
		candidate = filepath.Join(dir, fmt.Sprintf("%s_%d%s", stem, n, ext))
	}
}

// storeReceipt saves one receipt image under outputDir/<payee>/imageName.
//
// The image is first written to outputDir so OCR can read it when receiptText
// contains no payee name, then moved into the payee directory. It returns the
// directory name used and whether a payee name was recognised. The temporary
// file is removed if any step fails.
func storeReceipt(segment image.Image, receiptText, outputDir, imageName string) (string, bool, error) {
	tempPath := filepath.Join(outputDir, imageName)
	moved := false
	defer func() {
		if !moved {
			_ = os.Remove(tempPath) // best-effort cleanup; the primary error is what matters
		}
	}()

	if err := writePNG(tempPath, segment); err != nil {
		return "", false, err
	}

	payee := extractPayeeName(receiptText)
	if payee == "" {
		payee = extractPayeeName(extractTextViaOCR(tempPath))
	}
	recognized := payee != ""

	dirName := safeDirName(payee) // maps "" to unknownPayeeDir
	targetDir := filepath.Join(outputDir, dirName)
	if err := os.MkdirAll(targetDir, os.ModePerm); err != nil {
		return "", false, err
	}
	if err := os.Rename(tempPath, uniquePath(targetDir, imageName)); err != nil {
		return "", false, err
	}
	moved = true
	return dirName, recognized, nil
}

// processPDF renders every page of pdfPath, splits each page into receipts
// and files one PNG per receipt under outputDir/<payee>/.
//
// The number of receipts on a page is the number of receiptHeader occurrences
// in its extracted text (at least one); the page image is cut into that many
// equal-height bands. A page whose text cannot be extracted is treated as one
// receipt with empty text. Processing stops at the first error; the returned
// result then reflects only the receipts stored before it.
func processPDF(pdfPath, outputDir string) (processResult, error) {
	res := processResult{PerDirCounts: map[string]int{}}
	doc, err := fitz.New(pdfPath)
	if err != nil {
		return res, fmt.Errorf("打开 PDF: %w", err)
	}
	defer doc.Close()

	baseName := strings.TrimSuffix(filepath.Base(pdfPath), filepath.Ext(pdfPath))
	totalPages := doc.NumPage()
	for pageIndex := 0; pageIndex < totalPages; pageIndex++ {
		pageText, textErr := doc.Text(pageIndex)
		if textErr != nil {
			pageText = "" // fall back to OCR for this page
		}
		headerCount := strings.Count(pageText, receiptHeader)
		if headerCount <= 0 {
			headerCount = 1
		}
		img, imgErr := doc.ImageDPI(pageIndex, renderDPI)
		if imgErr != nil {
			return res, fmt.Errorf("渲染第 %d 页: %w", pageIndex+1, imgErr)
		}
		imgSegments := splitPageImageByHeaderCount(img, headerCount)
		textSegments := splitPageTextByHeader(pageText)

		for i, segment := range imgSegments {
			// Use the matching text segment when there is one, else the whole page.
			receiptText := pageText
			if i < len(textSegments) {
				receiptText = textSegments[i]
			}
			imageName := fmt.Sprintf("%s_p%03d_r%03d.png", baseName, pageIndex+1, i+1)
			dirName, recognized, storeErr := storeReceipt(segment, receiptText, outputDir, imageName)
			if storeErr != nil {
				return res, storeErr
			}
			if recognized {
				res.SuccessCount++
			} else {
				res.FailedCount++
			}
			res.PerDirCounts[dirName]++
		}
	}
	return res, nil
}

// writeExecutionReport writes lines, newline-separated, to a timestamped
// "执行结果记录_<yyyymmdd_hhmmss>.txt" file in baseDir and returns its path.
func writeExecutionReport(baseDir string, lines []string) (string, error) {
	timestamp := time.Now().Format("20060102_150405")
	reportPath := filepath.Join(baseDir, fmt.Sprintf("执行结果记录_%s.txt", timestamp))
	content := strings.Join(lines, "\n") + "\n"
	if err := os.WriteFile(reportPath, []byte(content), 0o644); err != nil {
		return "", err
	}
	return reportPath, nil
}

// absOrRaw returns the absolute form of path, or path unchanged when it
// cannot be made absolute.
func absOrRaw(path string) string {
	abs, err := filepath.Abs(path)
	if err != nil {
		return path
	}
	return abs
}

// resolveIODirs parses the command line and returns the absolute input and
// output directories.
//
// Each directory is taken from the first non-blank of: the long flag
// (-input / -output), the short flag (-i / -o), the positional argument
// (first / second), and finally baseDir/source or baseDir/output. It
// registers flags on the default flag set, so it must be called only once.
func resolveIODirs(baseDir string) (string, string) {
	inputOpt := flag.String("input", "", "输入目录")
	outputOpt := flag.String("output", "", "输出目录")
	inputShort := flag.String("i", "", "输入目录")
	outputShort := flag.String("o", "", "输出目录")
	flag.Parse()

	var posInput, posOutput string
	args := flag.Args()
	if len(args) >= 1 {
		posInput = args[0]
	}
	if len(args) >= 2 {
		posOutput = args[1]
	}

	sourceDir := firstNonEmpty(*inputOpt, *inputShort, posInput, filepath.Join(baseDir, "source"))
	outputDir := firstNonEmpty(*outputOpt, *outputShort, posOutput, filepath.Join(baseDir, "output"))
	return absOrRaw(sourceDir), absOrRaw(outputDir)
}

// firstNonEmpty returns the first value that is not blank after trimming
// whitespace (the value itself is returned untrimmed), or "" if there is none.
func firstNonEmpty(values ...string) string {
	for _, v := range values {
		if strings.TrimSpace(v) != "" {
			return v
		}
	}
	return ""
}

// main processes every *.pdf file directly inside the input directory, prints
// per-file and overall counts, and writes the run report to the working
// directory. A PDF that fails is reported and skipped; the run continues.
func main() {
	baseDir, wdErr := os.Getwd()
	if wdErr != nil {
		fmt.Printf("获取当前目录失败: %v\n", wdErr)
		os.Exit(1)
	}
	sourceDir, outputDir := resolveIODirs(baseDir)
	if err := os.MkdirAll(sourceDir, os.ModePerm); err != nil {
		fmt.Printf("创建输入目录失败: %v\n", err)
		os.Exit(1)
	}
	if err := os.MkdirAll(outputDir, os.ModePerm); err != nil {
		fmt.Printf("创建输出目录失败: %v\n", err)
		os.Exit(1)
	}
	entries, readErr := os.ReadDir(sourceDir)
	if readErr != nil {
		fmt.Printf("读取输入目录失败: %v\n", readErr)
		os.Exit(1)
	}

	var pdfFiles []string
	for _, e := range entries {
		if e.IsDir() {
			continue
		}
		if strings.EqualFold(filepath.Ext(e.Name()), ".pdf") {
			pdfFiles = append(pdfFiles, filepath.Join(sourceDir, e.Name()))
		}
	}
	sort.Strings(pdfFiles)
	if len(pdfFiles) == 0 {
		fmt.Printf("未在目录中发现 PDF: %s\n", sourceDir)
		fmt.Println("请将 PDF 文件放到 source 目录后重试。")
		return
	}

	totalOK := 0
	totalUnknown := 0
	reportLines := []string{
		fmt.Sprintf("执行时间: %s", time.Now().Format("2006-01-02 15:04:05")),
		fmt.Sprintf("输入目录: %s", sourceDir),
		fmt.Sprintf("输出目录: %s", outputDir),
		"",
	}
	for _, pdfPath := range pdfFiles {
		pdfName := filepath.Base(pdfPath)
		fmt.Printf("处理文件: %s\n", pdfName)
		res, err := processPDF(pdfPath, outputDir)
		if err != nil {
			fmt.Printf("  处理失败: %v\n", err)
			// Record the failure so the report does not silently omit the file.
			reportLines = append(reportLines,
				fmt.Sprintf("PDF: %s", pdfName),
				fmt.Sprintf("  处理失败: %v", err),
				"",
			)
			continue
		}
		fmt.Printf("  识别到收款人全称的图片数: %d\n", res.SuccessCount)
		fmt.Printf("  未识别收款人全称的图片数: %d\n", res.FailedCount)
		totalOK += res.SuccessCount
		totalUnknown += res.FailedCount

		reportLines = append(reportLines,
			fmt.Sprintf("PDF: %s", pdfName),
			fmt.Sprintf("  拆分目录数: %d", len(res.PerDirCounts)),
			fmt.Sprintf("  识别到收款人全称的图片数: %d", res.SuccessCount),
			fmt.Sprintf("  未识别收款人全称的图片数: %d", res.FailedCount),
			"  目录明细:",
		)
		// Map iteration order is random; sort for a stable report.
		dirNames := make([]string, 0, len(res.PerDirCounts))
		for dir := range res.PerDirCounts {
			dirNames = append(dirNames, dir)
		}
		sort.Strings(dirNames)
		for _, dir := range dirNames {
			reportLines = append(reportLines, fmt.Sprintf("    - %s: %d 张", dir, res.PerDirCounts[dir]))
		}
		reportLines = append(reportLines, "")
	}

	fmt.Println("\n处理完成")
	fmt.Printf("识别到收款人全称的图片数: %d\n", totalOK)
	fmt.Printf("未识别收款人全称的图片数: %d\n", totalUnknown)
	fmt.Printf("输出目录: %s\n", outputDir)

	reportLines = append(reportLines,
		"汇总:",
		fmt.Sprintf("  识别到收款人全称的图片总数: %d", totalOK),
		fmt.Sprintf("  未识别收款人全称的图片总数: %d", totalUnknown),
	)
	reportPath, err := writeExecutionReport(baseDir, reportLines)
	if err != nil {
		fmt.Printf("写入执行结果记录失败: %v\n", err)
		os.Exit(1)
	}
	fmt.Printf("执行结果记录文件: %s\n", reportPath)
}