// pdf-picker/index.go

package main

import (
	"bytes"
	"flag"
	"fmt"
	"image"
	"image/png"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"time"

	fitz "github.com/gen2brain/go-fitz"
)

// receiptHeader is the title text ("China Construction Bank online banking
// electronic receipt") that is counted and split on to tell how many receipts a
// page contains and where each one starts in the extracted text.
const receiptHeader = "中国建设银行网上银行电子回执"

// processResult summarises what processPDF did with a single PDF file.
type processResult struct {
	// SuccessCount is the number of receipt images for which a payee name was
	// extracted (from the text layer or via OCR).
	SuccessCount int
	// FailedCount is the number of receipt images for which no payee name was
	// found; those are filed under the "未识别收款人" fallback name.
	FailedCount  int
	// PerDirCounts maps each target directory name to the number of receipt
	// images assigned to it.
	PerDirCounts map[string]int
}

// Regular expressions shared by the helpers below; compiled once at start-up.
var (
	// invalidPathChars matches the characters that safeDirName replaces with
	// "_": \ / : * ? " < > |
	invalidPathChars = regexp.MustCompile(`[\\/:*?"<>|]`)
	// spacePattern matches a run of one or more whitespace characters.
	spacePattern     = regexp.MustCompile(`\s+`)
	// payeeStopPattern matches the labels of other receipt fields (alias,
	// account number, bank, amounts, purpose, ...). extractPayeeName uses the
	// first match to decide where the payee name ends.
	payeeStopPattern = regexp.MustCompile(`别名|账号|开户行|大写金额|小写金额|用途|钞汇标志|摘要|重要提示|付款人全称`)
)

// safeDirName turns a payee name into a string that is safe to use as a single
// directory name.
//
// Characters matched by invalidPathChars are replaced with "_", every run of
// whitespace is collapsed into one space and the result is capped at 120 runes.
// Trailing dots and spaces are then stripped. This keeps "." and ".." from
// being returned (joined onto the output directory they would resolve to the
// directory itself or its parent) and avoids names ending in a dot or space,
// which Windows cannot create. If nothing usable remains, the fallback name
// "未识别收款人" is returned.
func safeDirName(name string) string {
	const (
		fallback = "未识别收款人"
		maxRunes = 120
	)

	cleaned := invalidPathChars.ReplaceAllString(strings.TrimSpace(name), "_")
	cleaned = spacePattern.ReplaceAllString(cleaned, " ")
	if runes := []rune(cleaned); len(runes) > maxRunes {
		cleaned = string(runes[:maxRunes])
	}

	// Trim only after truncating: the cut itself can expose a trailing space
	// or dot.
	cleaned = strings.TrimRight(cleaned, " .")
	if cleaned == "" {
		return fallback
	}
	return cleaned
}

// payeeLabelPatterns are the fallback expressions tried, in order, against the
// text with its line structure intact. Group 1 captures the rest of the line
// that follows the "收款人全称" label.
var payeeLabelPatterns = []*regexp.Regexp{
	regexp.MustCompile(`收款人全称\s*[:：]\s*([^\n\r]+)`),
	regexp.MustCompile(`收款人\s*全称\s*[:：]\s*([^\n\r]+)`),
	regexp.MustCompile(`收\s*款\s*人\s*全\s*称\s*[:：]?\s*([^\n\r]+)`),
	regexp.MustCompile(`收款人全称\s+([^\n\r]+)`),
}

// payeeCutPattern marks where a captured line stops being the payee name: a
// run of two or more whitespace characters or the label of another field.
var payeeCutPattern = regexp.MustCompile(`\s{2,}|金额|开户行|账号|日期`)

// extractPayeeName returns the value of the "收款人全称" (payee full name) field
// found in text, or "" when text is blank or no value can be located.
//
// Three strategies are tried in order and the first non-empty result wins:
//  1. strip all whitespace, take what follows the label and cut it at the first
//     field label matched by payeeStopPattern;
//  2. match payeeLabelPatterns against the text and cut the captured line with
//     payeeCutPattern;
//  3. scan line by line and use the remainder of the label's line or, if that
//     is empty, the next non-blank line.
func extractPayeeName(text string) string {
	if strings.TrimSpace(text) == "" {
		return ""
	}

	const (
		label   = "收款人全称"
		trimSet = " :："
	)

	// Ideographic (full-width) spaces are not matched by \s, so turn them into
	// ordinary spaces first.
	normalized := strings.ReplaceAll(text, "\u3000", " ")

	// Strategy 1: whitespace-free text, so labels broken up by spaces or line
	// breaks still match.
	compact := spacePattern.ReplaceAllString(normalized, "")
	if idx := strings.Index(compact, label); idx >= 0 {
		tail := strings.TrimLeft(compact[idx+len(label):], ":：")
		if tail != "" {
			// A stop label at offset 0 is ignored, so the tail is kept whole.
			// NOTE(review): in that case everything after the label is returned
			// as the name; confirm this is intended.
			if stop := payeeStopPattern.FindStringIndex(tail); stop != nil && stop[0] > 0 {
				tail = tail[:stop[0]]
			}
			if tail = strings.Trim(tail, trimSet); tail != "" {
				return tail
			}
		}
	}

	// Strategy 2: precompiled label patterns on the line-preserving text.
	for _, re := range payeeLabelPatterns {
		match := re.FindStringSubmatch(normalized)
		if len(match) < 2 {
			continue
		}
		value := strings.TrimSpace(match[1])
		value = strings.Trim(payeeCutPattern.Split(value, 2)[0], trimSet)
		if value != "" {
			return value
		}
	}

	// Strategy 3: line-by-line scan over the non-blank lines.
	rawLines := strings.Split(normalized, "\n")
	lines := make([]string, 0, len(rawLines))
	for _, line := range rawLines {
		if line = strings.TrimSpace(line); line != "" {
			lines = append(lines, line)
		}
	}
	for i, line := range lines {
		if !strings.Contains(line, label) {
			continue
		}
		if after := strings.Trim(strings.SplitN(line, label, 2)[1], trimSet); after != "" {
			return after
		}
		if i+1 < len(lines) {
			if candidate := strings.Trim(lines[i+1], trimSet); candidate != "" {
				return candidate
			}
		}
	}

	return ""
}

// extractTextViaOCR runs the external "tesseract" program on imagePath with the
// chi_sim+eng languages and returns what it wrote to standard output.
//
// Recognition is best effort: if the command cannot be started or exits with an
// error, "" is returned and the error is not reported.
func extractTextViaOCR(imagePath string) string {
	var recognized bytes.Buffer

	ocr := exec.Command("tesseract", imagePath, "stdout", "-l", "chi_sim+eng")
	ocr.Stdout = &recognized
	// ocr.Stderr is left nil, so tesseract's diagnostics are discarded.

	if runErr := ocr.Run(); runErr != nil {
		return ""
	}
	return recognized.String()
}

// cropImage copies the pixels of src that lie inside rect into a new RGBA
// image whose bounds start at (0, 0) and whose size is rect's width and height.
// Pixels are read with src.At, so whatever src reports for coordinates outside
// its own bounds is copied as-is.
func cropImage(src image.Image, rect image.Rectangle) image.Image {
	w, h := rect.Dx(), rect.Dy()
	out := image.NewRGBA(image.Rect(0, 0, w, h))

	// Walk destination coordinates and offset into the source rectangle.
	for dy := 0; dy < h; dy++ {
		for dx := 0; dx < w; dx++ {
			out.Set(dx, dy, src.At(rect.Min.X+dx, rect.Min.Y+dy))
		}
	}
	return out
}

// splitPageTextByHeader splits the text of one page into one string per
// receipt, using receiptHeader as the start marker.
//
// Blank text yields a single empty string. Text with at most one header is
// returned unchanged as the only element. Otherwise each element starts with
// receiptHeader and runs up to the next header; anything before the first
// header is dropped.
func splitPageTextByHeader(pageText string) []string {
	switch {
	case strings.TrimSpace(pageText) == "":
		return []string{""}
	case strings.Count(pageText, receiptHeader) <= 1:
		return []string{pageText}
	}

	// With two or more headers, Split yields the preamble followed by one piece
	// per header; skip the preamble and put the header back on each piece.
	bodies := strings.Split(pageText, receiptHeader)[1:]
	receipts := make([]string, len(bodies))
	for i, body := range bodies {
		receipts[i] = receiptHeader + body
	}
	return receipts
}

// splitPageImageByHeaderCount cuts img into count horizontal strips of (as near
// as integer division allows) equal height, top to bottom, and returns a
// cropped copy of each strip.
//
// When count is 1 or less, or when no strip has a positive height, the
// original image is returned as the only element.
func splitPageImageByHeaderCount(img image.Image, count int) []image.Image {
	bounds := img.Bounds()
	whole := []image.Image{img}
	if count <= 1 {
		return whole
	}

	pageHeight := bounds.Dy()
	left, right := bounds.Min.X, bounds.Min.X+bounds.Dx()

	strips := make([]image.Image, 0, count)
	top := bounds.Min.Y
	for k := 1; k <= count; k++ {
		// The bottom edge of strip k-1 is also the top edge of strip k.
		bottom := bounds.Min.Y + (pageHeight*k)/count
		if bottom > top {
			strips = append(strips, cropImage(img, image.Rect(left, top, right, bottom)))
		}
		top = bottom
	}

	if len(strips) == 0 {
		return whole
	}
	return strips
}

// processPDF renders every page of the PDF at pdfPath, cuts each page into one
// image per receipt and stores each image under
// outputDir/<payee directory>/<pdf name>_pNNN_rNNN.png.
//
// The number of receipts on a page is the number of receiptHeader occurrences
// in the page's text (at least one). The payee name is taken from the text
// layer first and from OCR of the receipt image if the text yields nothing.
//
// Processing stops at the first rendering or file-system error. The returned
// result then only reflects the receipts stored before the failure.
func processPDF(pdfPath, outputDir string) (processResult, error) {
	res := processResult{PerDirCounts: map[string]int{}}

	doc, err := fitz.New(pdfPath)
	if err != nil {
		return res, err
	}
	defer doc.Close()

	baseName := strings.TrimSuffix(filepath.Base(pdfPath), filepath.Ext(pdfPath))
	totalPages := doc.NumPage()
	for pageIndex := 0; pageIndex < totalPages; pageIndex++ {
		// A page whose text cannot be read is handled like a page without a
		// text layer: one receipt, payee name found via OCR.
		pageText, textErr := doc.Text(pageIndex)
		if textErr != nil {
			pageText = ""
		}

		headerCount := strings.Count(pageText, receiptHeader)
		if headerCount <= 0 {
			headerCount = 1
		}

		img, imgErr := doc.ImageDPI(pageIndex, 200)
		if imgErr != nil {
			return res, fmt.Errorf("render page %d: %w", pageIndex+1, imgErr)
		}

		imgSegments := splitPageImageByHeaderCount(img, headerCount)
		textSegments := splitPageTextByHeader(pageText)

		for i, segment := range imgSegments {
			// Fall back to the whole page text when there are fewer text
			// segments than image segments.
			receiptText := pageText
			if i < len(textSegments) {
				receiptText = textSegments[i]
			}
			imageName := fmt.Sprintf("%s_p%03d_r%03d.png", baseName, pageIndex+1, i+1)

			dirName, recognized, saveErr := saveReceiptImage(segment, receiptText, outputDir, imageName)
			if saveErr != nil {
				return res, fmt.Errorf("page %d receipt %d: %w", pageIndex+1, i+1, saveErr)
			}
			if recognized {
				res.SuccessCount++
			} else {
				res.FailedCount++
			}
			res.PerDirCounts[dirName]++
		}
	}

	return res, nil
}

// saveReceiptImage writes segment as a PNG, determines the payee name and moves
// the PNG into the payee's directory below outputDir. It returns the directory
// name used and whether a payee name was recognised. On failure the temporary
// PNG is removed again.
func saveReceiptImage(segment image.Image, receiptText, outputDir, imageName string) (string, bool, error) {
	// The image is written to the output root first because OCR needs a file.
	tempImagePath := filepath.Join(outputDir, imageName)
	if err := writePNGFile(tempImagePath, segment); err != nil {
		return "", false, err
	}
	moved := false
	defer func() {
		if !moved {
			_ = os.Remove(tempImagePath) // best-effort cleanup; the original error is what matters
		}
	}()

	payeeName := extractPayeeName(receiptText)
	if payeeName == "" {
		payeeName = extractPayeeName(extractTextViaOCR(tempImagePath))
	}
	recognized := payeeName != ""
	if !recognized {
		payeeName = "未识别收款人"
	}

	dirName := safeDirName(payeeName)
	targetDir := filepath.Join(outputDir, dirName)
	if err := os.MkdirAll(targetDir, os.ModePerm); err != nil {
		return "", false, err
	}

	if err := os.Rename(tempImagePath, unusedPath(targetDir, imageName)); err != nil {
		return "", false, err
	}
	moved = true
	return dirName, recognized, nil
}

// writePNGFile encodes img as PNG into a newly created file at path. A file
// that was created but could not be written completely is removed.
func writePNGFile(path string, img image.Image) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	encodeErr := png.Encode(f, img)
	closeErr := f.Close()
	if encodeErr == nil && closeErr == nil {
		return nil
	}
	_ = os.Remove(path) // do not leave a truncated image behind
	if encodeErr != nil {
		return fmt.Errorf("encode %s: %w", path, encodeErr)
	}
	return closeErr
}

// unusedPath returns dir/name, or dir/<stem>_<n><ext> with the smallest n >= 1
// for which os.Stat reports an error, so an existing file is never overwritten.
func unusedPath(dir, name string) string {
	ext := filepath.Ext(name)
	stem := strings.TrimSuffix(name, ext)

	candidate := filepath.Join(dir, name)
	for n := 1; ; n++ {
		// Any Stat error (normally "does not exist") ends the search; a real
		// problem with the path surfaces when the caller renames onto it.
		if _, statErr := os.Stat(candidate); statErr != nil {
			return candidate
		}
		candidate = filepath.Join(dir, fmt.Sprintf("%s_%d%s", stem, n, ext))
	}
}

// writeExecutionReport writes lines, each terminated by a newline, to a file
// named "执行结果记录_<yyyymmdd_hhmmss>.txt" inside baseDir (mode 0644) and
// returns the path of that file. On a write error it returns "" and the error.
func writeExecutionReport(baseDir string, lines []string) (string, error) {
	fileName := fmt.Sprintf("执行结果记录_%s.txt", time.Now().Format("20060102_150405"))
	target := filepath.Join(baseDir, fileName)

	// Join puts a newline between lines; the final one is appended here.
	payload := append([]byte(strings.Join(lines, "\n")), '\n')
	if writeErr := os.WriteFile(target, payload, 0o644); writeErr != nil {
		return "", writeErr
	}
	return target, nil
}

// resolveIODirs parses the command line and returns the absolute input and
// output directories, in that order.
//
// For each directory the first non-blank value wins, in this order: the long
// flag (-input / -output), the short flag (-i / -o), the positional argument
// (first for input, second for output), and finally baseDir/source or
// baseDir/output. The function registers flags on the default flag set and
// calls flag.Parse.
func resolveIODirs(baseDir string) (string, string) {
	var longIn, longOut, shortIn, shortOut string
	flag.StringVar(&longIn, "input", "", "输入目录")
	flag.StringVar(&longOut, "output", "", "输出目录")
	flag.StringVar(&shortIn, "i", "", "输入目录")
	flag.StringVar(&shortOut, "o", "", "输出目录")
	flag.Parse()

	// Only the first two positional arguments are looked at; missing ones stay
	// empty.
	var positional [2]string
	copy(positional[:], flag.Args())

	sourceDir := firstNonEmpty(longIn, shortIn, positional[0])
	if sourceDir == "" {
		sourceDir = filepath.Join(baseDir, "source")
	}
	outputDir := firstNonEmpty(longOut, shortOut, positional[1])
	if outputDir == "" {
		outputDir = filepath.Join(baseDir, "output")
	}

	// Errors from Abs are ignored here; the corresponding result is then
	// whatever Abs returned alongside the error.
	sourceAbs, _ := filepath.Abs(sourceDir)
	outputAbs, _ := filepath.Abs(outputDir)
	return sourceAbs, outputAbs
}

// firstNonEmpty returns the first argument that contains something other than
// whitespace, exactly as it was passed in (not trimmed). It returns "" when
// there is no such argument.
func firstNonEmpty(values ...string) string {
	for i := 0; i < len(values); i++ {
		if candidate := values[i]; len(strings.TrimSpace(candidate)) > 0 {
			return candidate
		}
	}
	return ""
}

// listPDFFiles returns the full paths of the non-directory entries of dir whose
// extension is ".pdf" (compared case-insensitively), sorted as strings.
func listPDFFiles(dir string) ([]string, error) {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return nil, err
	}
	pdfFiles := make([]string, 0, len(entries))
	for _, e := range entries {
		if e.IsDir() {
			continue
		}
		if strings.EqualFold(filepath.Ext(e.Name()), ".pdf") {
			pdfFiles = append(pdfFiles, filepath.Join(dir, e.Name()))
		}
	}
	sort.Strings(pdfFiles)
	return pdfFiles, nil
}

// main resolves the input and output directories (creating them if needed),
// runs processPDF on every PDF in the input directory, prints per-file and
// overall counts, and writes an execution report into the working directory.
//
// A PDF that fails does not stop the run. The failure is printed, recorded in
// the report under that PDF's name and counted in the summary.
func main() {
	baseDir, wdErr := os.Getwd()
	if wdErr != nil {
		fmt.Printf("获取当前目录失败: %v\n", wdErr)
		os.Exit(1)
	}

	sourceDir, outputDir := resolveIODirs(baseDir)
	if err := os.MkdirAll(sourceDir, os.ModePerm); err != nil {
		fmt.Printf("创建输入目录失败: %v\n", err)
		os.Exit(1)
	}
	if err := os.MkdirAll(outputDir, os.ModePerm); err != nil {
		fmt.Printf("创建输出目录失败: %v\n", err)
		os.Exit(1)
	}

	pdfFiles, readErr := listPDFFiles(sourceDir)
	if readErr != nil {
		fmt.Printf("读取输入目录失败: %v\n", readErr)
		os.Exit(1)
	}
	if len(pdfFiles) == 0 {
		fmt.Printf("未在目录中发现 PDF: %s\n", sourceDir)
		fmt.Println("请将 PDF 文件放到 source 目录后重试。")
		return
	}

	totalOK := 0
	totalUnknown := 0
	failedPDFs := 0
	reportLines := []string{
		fmt.Sprintf("执行时间: %s", time.Now().Format("2006-01-02 15:04:05")),
		fmt.Sprintf("输入目录: %s", sourceDir),
		fmt.Sprintf("输出目录: %s", outputDir),
		"",
	}

	for _, pdfPath := range pdfFiles {
		pdfName := filepath.Base(pdfPath)
		fmt.Printf("处理文件: %s\n", pdfName)

		res, err := processPDF(pdfPath, outputDir)
		if err != nil {
			fmt.Printf("    处理失败: %v\n", err)
			// Keep the failure in the report so the record is complete.
			failedPDFs++
			reportLines = append(reportLines,
				fmt.Sprintf("PDF: %s", pdfName),
				fmt.Sprintf("  处理失败: %v", err),
				"",
			)
			continue
		}

		fmt.Printf("    识别到收款人全称的图片数: %d\n", res.SuccessCount)
		fmt.Printf("    未识别收款人全称的图片数: %d\n", res.FailedCount)

		totalOK += res.SuccessCount
		totalUnknown += res.FailedCount

		reportLines = append(reportLines,
			fmt.Sprintf("PDF: %s", pdfName),
			fmt.Sprintf("  拆分目录数: %d", len(res.PerDirCounts)),
			fmt.Sprintf("  识别到收款人全称的图片数: %d", res.SuccessCount),
			fmt.Sprintf("  未识别收款人全称的图片数: %d", res.FailedCount),
			"  目录明细:",
		)

		// Map iteration order is random; sort for a stable report.
		dirNames := make([]string, 0, len(res.PerDirCounts))
		for dir := range res.PerDirCounts {
			dirNames = append(dirNames, dir)
		}
		sort.Strings(dirNames)
		for _, dir := range dirNames {
			reportLines = append(reportLines, fmt.Sprintf("    - %s: %d 张", dir, res.PerDirCounts[dir]))
		}
		reportLines = append(reportLines, "")
	}

	fmt.Println("\n处理完成")
	fmt.Printf("识别到收款人全称的图片数: %d\n", totalOK)
	fmt.Printf("未识别收款人全称的图片数: %d\n", totalUnknown)
	if failedPDFs > 0 {
		fmt.Printf("处理失败的 PDF 数: %d\n", failedPDFs)
	}
	fmt.Printf("输出目录: %s\n", outputDir)

	reportLines = append(reportLines,
		"汇总:",
		fmt.Sprintf("  识别到收款人全称的图片总数: %d", totalOK),
		fmt.Sprintf("  未识别收款人全称的图片总数: %d", totalUnknown),
	)
	if failedPDFs > 0 {
		reportLines = append(reportLines, fmt.Sprintf("  处理失败的 PDF 数: %d", failedPDFs))
	}

	reportPath, err := writeExecutionReport(baseDir, reportLines)
	if err != nil {
		fmt.Printf("写入执行结果记录失败: %v\n", err)
		os.Exit(1)
	}
	fmt.Printf("执行结果记录文件: %s\n", reportPath)
}