feat: 主版本增加付款人维度

2026-05-27 13:42:17 +08:00 · 2026-05-27 13:42:17 +08:00 · 20900f008f
commit 20900f008f
parent 7878c7bee6
3 changed files with 479 additions and 73 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,3 +4,6 @@ source
 执行结果记录_**.txt
 *.log
 *.exe
 dist
 .idea
 .DS_Store
--- a/README.md
+++ b/README.md
@ -2,97 +2,81 @@
 基于 Go 的 PDF 回执拆分与分类工具。
-功能说明：
+## 1. 功能概览
 - 扫描输入目录中的 PDF 文件
- 将回执按页面内多条回执进行拆分并导出图片
+- 按页面内回执数量拆分并导出图片
- 从文本中提取“收款人全称”并按名称分类到输出目录
+- 从文本中提取“收/付款人全称”并按名称分类到输出目录
- 识别失败时可使用 OCR 兜底
+- 文本提取失败时可使用 OCR 兜底
 - 生成执行结果记录文件（含每个 PDF 的目录与图片明细）
-## 1. 项目结构
+## 2. 目录结构
- index.go: Go 主程序
+- index.go：主程序入口（收款人版本）
- go.mod / go.sum: Go 模块依赖
+- index_fk.go：主程序入口（付款人版本）
- source/: 默认输入目录（放 PDF）
+- go.mod / go.sum：Go 模块依赖
- output/: 默认输出目录（分类后的图片）
+- source/：默认输入目录（放 PDF）
 - output/：默认输出目录（分类后的图片）
 - dist/：构建产物目录
-## 2. 运行环境
+## 3. 运行环境
 - Windows
 - Go 1.19+
- C 编译器（用于 cgo，go-fitz 依赖）
+- C 编译器（go-fitz 依赖 cgo）
 - 可选：Tesseract OCR（用于识别扫描件）
-当前项目依赖：
+当前依赖：
 - github.com/gen2brain/go-fitz v1.20.2
-## 3. 工具链配置（Windows）
+## 4. 本地运行（Go）
-如果构建时报错 `gcc not found`，可使用 LLVM-MinGW。
+### 4.1 直接运行 index.go
-示例（已验证可用路径）：
+```bash
-
+go run index.go
 1) 安装 LLVM-MinGW（任选一种）
 - 使用 winget
 - 手动下载并解压到 C:/llvm-mingw
 2) 配置 Go 使用 clang 作为 cgo 编译器（PowerShell）
 ```powershell
 go env -w CC="C:\llvm-mingw\llvm-mingw-20260519-ucrt-x86_64\bin\clang.exe"
 go env -w CXX="C:\llvm-mingw\llvm-mingw-20260519-ucrt-x86_64\bin\clang++.exe"
 go env -w CGO_ENABLED=1
 ```
-3) 检查配置
+默认目录：
 - 输入：./source
 - 输出：./output
-```powershell
+### 4.2 指定目录（示例）
-go env CC CXX CGO_ENABLED
+
 ```bash
 go run index.go -i ./source -o ./output2
 ```
-## 4. 编译
+## 5. Windows 打包
 ### 5.1 Windows 本机构建（PowerShell）
 ```powershell
 go mod tidy
-go build index.go
+go build -o dist\pdf-picker.exe index_fk.go
 ```
-编译成功后会生成可执行文件：
+### 5.2 macOS 交叉编译 Windows exe（本次已验证）
 - index.exe
-## 5. 运行
+先安装 MinGW：
-### 5.1 默认目录
+```bash
-
+brew install mingw-w64
 ```powershell
 .\index.exe
 ```
-等价于：
+打包命令：
 - 输入目录：./source
 - 输出目录：./output
-### 5.2 指定目录（短参数）
+```bash
-
+mkdir -p dist && CGO_ENABLED=1 GOOS=windows GOARCH=amd64 CC=x86_64-w64-mingw32-gcc CXX=x86_64-w64-mingw32-g++ go build -o dist/pdf-picker.exe index_fk.go
 ```powershell
 .\index.exe -i ./source -o ./output2
 ```
-### 5.3 指定目录（位置参数）
+产物：
 - dist/pdf-picker.exe
-```powershell
+## 6. 输出说明
 .\index.exe ./source ./output2
 ```
-优先级：
+运行后会生成：
 - 命名参数（-i/-o 或 --input/--output）
 - 位置参数
 - 默认目录（source/output）
 ## 6. 输出结果
 运行结束后会看到控制台统计，并在项目根目录生成记录文件：
 - 控制台统计信息
 - 执行结果记录_YYYYMMDD_HHMMSS.txt
 记录内容包括：
@ -103,38 +87,39 @@ go build index.go
 ## 7. OCR 说明
-程序在文本提取失败时会尝试调用：
+文本提取失败时会尝试调用：
 ```text
 tesseract <image> stdout -l chi_sim+eng
 ```
-如果本机未安装 tesseract，OCR 兜底会自动跳过，不影响程序主流程（但扫描件识别率会下降）。
+未安装 tesseract 时会自动跳过 OCR，不影响主流程（但扫描件识别率会下降）。
 ## 8. 常见问题
-### Q1: 构建时报 `gcc not found`
+### Q1：构建时报 gcc not found
 原因：缺少 cgo 编译器。
 处理：按第 3 节安装并配置 LLVM-MinGW。
-### Q2: 构建时出现大量 `ld.lld: warning: duplicate symbol`
+处理：
 - Windows：安装 LLVM-MinGW 或 MinGW-w64，并配置 CC/CXX
 - macOS 交叉编译：安装 mingw-w64
-这是链接阶段警告，若最终退出码为 0 且生成 index.exe，可继续使用。
+### Q2：没有识别出“付款人全称”
-### Q3: 没有识别出“收款人全称”
+可能原因：
-
+- PDF 为扫描件且 OCR 不可用
- 可能是 PDF 为扫描件且 OCR 不可用
+- 单据模板字段格式与当前规则不一致
 - 可能是单据模板字段格式与当前规则不一致
 可通过增强正则规则或补充 OCR 环境提升识别率。
 ## 9. 快速开始
-```powershell
+```bash
 # 1) 放入 PDF 到 source
-# 2) 编译
+# 2) 运行
-go build index.go
+go run index_fk.go
-# 3) 运行
+
-.\index.exe -o ./output2
+# 3) 或直接打包 Windows exe（macOS 交叉编译）
 mkdir -p dist && CGO_ENABLED=1 GOOS=windows GOARCH=amd64 CC=x86_64-w64-mingw32-gcc CXX=x86_64-w64-mingw32-g++ go build -o dist/pdf-picker.exe index_fk.go
 ```
--- a/index_fk.go
+++ b/index_fk.go
@ -0,0 +1,418 @@
 package main
 import (
 	"bytes"
 	"flag"
 	"fmt"
 	"image"
 	"image/png"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"regexp"
 	"sort"
 	"strings"
 	"time"
 	fitz "github.com/gen2brain/go-fitz"
 )
 // receiptHeader is the title text looked for in extracted page text. Its
 // occurrence count decides how many receipts a page is split into, and it is
 // the separator used when cutting the page text into per-receipt segments.
 const receiptHeader = "中国建设银行网上银行电子回执"
 // processResult carries the per-PDF counters produced by processPDF.
 type processResult struct {
 	// SuccessCount is the number of receipt images for which a payer name was extracted.
 	SuccessCount int
 	// FailedCount is the number of receipt images filed under the "unrecognized payer" name.
 	FailedCount  int
 	// PerDirCounts maps each target directory name to the number of images assigned to it.
 	PerDirCounts map[string]int
 }
 // Regular expressions compiled once and shared by the helpers in this file.
 var (
 	// invalidPathChars matches the characters safeDirName replaces with "_".
 	invalidPathChars = regexp.MustCompile(`[\\/:*?"<>|]`)
 	// spacePattern matches a run of whitespace; it is used both to collapse
 	// whitespace to a single space and to strip whitespace entirely.
 	spacePattern     = regexp.MustCompile(`\s+`)
 	// payeeStopPattern matches the field labels at which extractPayeeName cuts
 	// off the text that follows the payer-name label.
 	payeeStopPattern = regexp.MustCompile(`别名|账号|开户行|大写金额|小写金额|用途|钞汇标志|摘要|重要提示|收款人全称`)
 )
 // safeDirName turns an extracted payer name into a string that is safe to use
 // as a single directory name.
 //
 // Characters matched by invalidPathChars become "_", whitespace runs collapse
 // to one space, and the result is limited to 120 runes. Trailing dots and
 // spaces are then removed: Windows drops them when creating a directory, and a
 // name made only of dots ("." or "..") would resolve to the output directory
 // or its parent once joined into a path. If nothing usable remains, the
 // placeholder "未识别付款人" is returned.
 func safeDirName(name string) string {
 	const (
 		fallback = "未识别付款人"
 		maxRunes = 120
 	)
 	cleaned := invalidPathChars.ReplaceAllString(strings.TrimSpace(name), "_")
 	cleaned = spacePattern.ReplaceAllString(cleaned, " ")
 	if runes := []rune(cleaned); len(runes) > maxRunes {
 		cleaned = string(runes[:maxRunes])
 	}
 	// Trim after truncating, because the cut itself can expose a trailing space or dot.
 	cleaned = strings.TrimRight(cleaned, " .")
 	if cleaned == "" {
 		return fallback
 	}
 	return cleaned
 }
 // Patterns used by extractPayeeName's regexp fallback. They are compiled once
 // here instead of on every call.
 var (
 	// payerLabelPatterns are tried in order; group 1 captures the rest of the
 	// line that follows the payer-name label.
 	payerLabelPatterns = []*regexp.Regexp{
 		regexp.MustCompile(`付款人全称\s*[:：]\s*([^\n\r]+)`),
 		regexp.MustCompile(`付款人\s*全称\s*[:：]\s*([^\n\r]+)`),
 		regexp.MustCompile(`付\s*款\s*人\s*全\s*称\s*[:：]?\s*([^\n\r]+)`),
 		regexp.MustCompile(`付款人全称\s+([^\n\r]+)`),
 	}
 	// payerValueCutPattern marks where a captured line stops being the name.
 	payerValueCutPattern = regexp.MustCompile(`\s{2,}|金额|开户行|账号|日期`)
 )
 // extractPayeeName returns the payer's full name ("付款人全称") found in text,
 // or "" when no name can be located.
 //
 // Three strategies are tried in order and the first non-empty result wins:
 //  1. strip all whitespace, find the label, and take what follows up to the
 //     first label matched by payeeStopPattern;
 //  2. match payerLabelPatterns against the text and cut the captured line at
 //     payerValueCutPattern;
 //  3. scan non-blank lines for the label and take the remainder of that line,
 //     or failing that the next non-blank line.
 //
 // The function name says "payee" but the label it searches for is the payer's.
 func extractPayeeName(text string) string {
 	const label = "付款人全称"
 	if strings.TrimSpace(text) == "" {
 		return ""
 	}
 	// Full-width spaces are not matched by \s, so map them to ASCII spaces first.
 	normalized := strings.ReplaceAll(text, "\u3000", " ")
 	// Strategy 1: whitespace-free text, so a label split across lines or table
 	// cells still reads as one token. Whitespace inside the name is lost too.
 	compact := spacePattern.ReplaceAllString(normalized, "")
 	if idx := strings.Index(compact, label); idx >= 0 {
 		tail := strings.TrimLeft(compact[idx+len(label):], ":：")
 		// NOTE(review): when a stop label directly follows the payer label
 		// (offset 0), or no stop label exists, the whole remainder is kept.
 		// Confirm against real receipts whether that is intended.
 		if stop := payeeStopPattern.FindStringIndex(tail); stop != nil && stop[0] > 0 {
 			tail = tail[:stop[0]]
 		}
 		if tail = strings.Trim(tail, " :："); tail != "" {
 			return tail
 		}
 	}
 	// Strategy 2: line-oriented regexps on the normalized text.
 	for _, re := range payerLabelPatterns {
 		match := re.FindStringSubmatch(normalized)
 		if len(match) < 2 {
 			continue
 		}
 		value := strings.TrimSpace(match[1])
 		value = strings.Trim(payerValueCutPattern.Split(value, 2)[0], " :：")
 		if value != "" {
 			return value
 		}
 	}
 	// Strategy 3: the value may sit on the line after the label.
 	rawLines := strings.Split(normalized, "\n")
 	filtered := make([]string, 0, len(rawLines))
 	for _, line := range rawLines {
 		if line = strings.TrimSpace(line); line != "" {
 			filtered = append(filtered, line)
 		}
 	}
 	for i, line := range filtered {
 		pos := strings.Index(line, label)
 		if pos < 0 {
 			continue
 		}
 		if after := strings.Trim(line[pos+len(label):], " :："); after != "" {
 			return after
 		}
 		if i+1 < len(filtered) {
 			if candidate := strings.Trim(filtered[i+1], " :："); candidate != "" {
 				return candidate
 			}
 		}
 	}
 	return ""
 }
 // extractTextViaOCR runs the external "tesseract" command on imagePath with the
 // chi_sim+eng language set and returns whatever it wrote to standard output.
 // Any failure to run the command (not installed, non-zero exit, ...) yields "".
 func extractTextViaOCR(imagePath string) string {
 	var recognized bytes.Buffer
 	ocr := exec.Command("tesseract", imagePath, "stdout", "-l", "chi_sim+eng")
 	// Only stdout is captured; Stderr is left unset.
 	ocr.Stdout = &recognized
 	if runErr := ocr.Run(); runErr != nil {
 		return ""
 	}
 	return recognized.String()
 }
 // cropImage copies the pixels of src that lie inside rect into a new RGBA
 // image whose origin is (0, 0) and whose size is rect's width and height.
 func cropImage(src image.Image, rect image.Rectangle) image.Image {
 	cols, rows := rect.Dx(), rect.Dy()
 	dst := image.NewRGBA(image.Rect(0, 0, cols, rows))
 	// Walk destination coordinates and offset into the source rectangle.
 	for dy := 0; dy < rows; dy++ {
 		for dx := 0; dx < cols; dx++ {
 			dst.Set(dx, dy, src.At(rect.Min.X+dx, rect.Min.Y+dy))
 		}
 	}
 	return dst
 }
 // splitPageTextByHeader cuts a page's text into one segment per receipt.
 //
 // Blank text yields a single empty segment. Text containing fewer than two
 // receiptHeader occurrences is returned whole. Otherwise each segment starts
 // at a header and runs to just before the next one; anything in front of the
 // first header is discarded.
 func splitPageTextByHeader(pageText string) []string {
 	if strings.TrimSpace(pageText) == "" {
 		return []string{""}
 	}
 	if strings.Count(pageText, receiptHeader) < 2 {
 		return []string{pageText}
 	}
 	// From here on, rest always begins with a header.
 	rest := pageText[strings.Index(pageText, receiptHeader):]
 	var segments []string
 	for {
 		next := strings.Index(rest[len(receiptHeader):], receiptHeader)
 		if next < 0 {
 			segments = append(segments, rest)
 			break
 		}
 		end := len(receiptHeader) + next
 		segments = append(segments, rest[:end])
 		rest = rest[end:]
 	}
 	return segments
 }
 // splitPageImageByHeaderCount slices a page image into count horizontal bands
 // of (near) equal height, top to bottom, and returns them as cropped copies.
 //
 // With count <= 1 the original image is returned as the only element. Bands
 // that would have zero height are skipped, and if none remain the original
 // image is returned instead.
 func splitPageImageByHeaderCount(img image.Image, count int) []image.Image {
 	bounds := img.Bounds()
 	if count <= 1 {
 		return []image.Image{img}
 	}
 	pageHeight := bounds.Dy()
 	left, right := bounds.Min.X, bounds.Min.X+bounds.Dx()
 	bands := make([]image.Image, 0, count)
 	// Each band's lower edge becomes the next band's upper edge, so the integer
 	// rounding never leaves a gap or an overlap.
 	upper := bounds.Min.Y
 	for n := 1; n <= count; n++ {
 		lower := bounds.Min.Y + (pageHeight*n)/count
 		if lower > upper {
 			bands = append(bands, cropImage(img, image.Rect(left, upper, right, lower)))
 		}
 		upper = lower
 	}
 	if len(bands) == 0 {
 		return []image.Image{img}
 	}
 	return bands
 }
 // writePNGFile encodes img as a PNG at path and removes the partial file when
 // encoding or closing fails.
 func writePNGFile(path string, img image.Image) error {
 	f, err := os.Create(path)
 	if err != nil {
 		return err
 	}
 	encodeErr := png.Encode(f, img)
 	closeErr := f.Close()
 	if encodeErr == nil && closeErr == nil {
 		return nil
 	}
 	// Best-effort cleanup; the encode/close error is the one worth reporting.
 	_ = os.Remove(path)
 	if encodeErr != nil {
 		return encodeErr
 	}
 	return closeErr
 }
 // processPDF renders every page of pdfPath, splits each page into one image per
 // receipt, and files each image under outputDir/<payer name>/.
 //
 // Per page: the extracted text is searched for receiptHeader to decide how many
 // receipts the page holds (at least one); the page is rendered at 200 DPI and
 // cut into that many bands. Each band is first written to outputDir as
 // <pdf base name>_pNNN_rNNN.png, the payer name is taken from the matching text
 // segment (falling back to OCR of the written image), and the file is then
 // moved into the payer's directory. Receipts with no recognizable payer go to
 // "未识别付款人".
 //
 // It returns the counters for images that were filed. On error the counters
 // collected so far are returned together with the error, and the staged image
 // of the receipt being handled is removed so no stray file is left in outputDir.
 func processPDF(pdfPath, outputDir string) (processResult, error) {
 	res := processResult{PerDirCounts: map[string]int{}}
 	doc, err := fitz.New(pdfPath)
 	if err != nil {
 		return res, err
 	}
 	defer doc.Close()
 	baseName := strings.TrimSuffix(filepath.Base(pdfPath), filepath.Ext(pdfPath))
 	totalPages := doc.NumPage()
 	for pageIndex := 0; pageIndex < totalPages; pageIndex++ {
 		pageText, textErr := doc.Text(pageIndex)
 		if textErr != nil {
 			// Deliberately best-effort: without text the page is treated as a
 			// single receipt and the name comes from OCR.
 			pageText = ""
 		}
 		headerCount := strings.Count(pageText, receiptHeader)
 		if headerCount <= 0 {
 			headerCount = 1
 		}
 		img, imgErr := doc.ImageDPI(pageIndex, 200)
 		if imgErr != nil {
 			return res, fmt.Errorf("render page %d: %w", pageIndex+1, imgErr)
 		}
 		imgSegments := splitPageImageByHeaderCount(img, headerCount)
 		textSegments := splitPageTextByHeader(pageText)
 		for i, segment := range imgSegments {
 			stem := fmt.Sprintf("%s_p%03d_r%03d", baseName, pageIndex+1, i+1)
 			imageName := stem + ".png"
 			// Stage the image in outputDir: OCR needs a file on disk before
 			// the target directory is known.
 			tempImagePath := filepath.Join(outputDir, imageName)
 			if writeErr := writePNGFile(tempImagePath, segment); writeErr != nil {
 				return res, writeErr
 			}
 			receiptText := pageText
 			if i < len(textSegments) {
 				receiptText = textSegments[i]
 			}
 			payeeName := extractPayeeName(receiptText)
 			if payeeName == "" {
 				payeeName = extractPayeeName(extractTextViaOCR(tempImagePath))
 			}
 			recognized := payeeName != ""
 			if !recognized {
 				payeeName = "未识别付款人"
 			}
 			targetDirName := safeDirName(payeeName)
 			targetDir := filepath.Join(outputDir, targetDirName)
 			if mkErr := os.MkdirAll(targetDir, os.ModePerm); mkErr != nil {
 				_ = os.Remove(tempImagePath) // best-effort cleanup of the staged file
 				return res, mkErr
 			}
 			// Pick a name that is not taken yet. A numeric suffix is probed
 			// until free, so repeated collisions never overwrite each other.
 			finalImagePath := filepath.Join(targetDir, imageName)
 			for n := 1; ; n++ {
 				if _, statErr := os.Stat(finalImagePath); statErr != nil {
 					break
 				}
 				finalImagePath = filepath.Join(targetDir, fmt.Sprintf("%s_%d.png", stem, n))
 			}
 			if renameErr := os.Rename(tempImagePath, finalImagePath); renameErr != nil {
 				_ = os.Remove(tempImagePath) // best-effort cleanup of the staged file
 				return res, renameErr
 			}
 			// Count only once the image has reached its directory.
 			if recognized {
 				res.SuccessCount++
 			} else {
 				res.FailedCount++
 			}
 			res.PerDirCounts[targetDirName]++
 		}
 	}
 	return res, nil
 }
 // writeExecutionReport saves lines, newline-separated and newline-terminated,
 // to baseDir/执行结果记录_<YYYYMMDD_HHMMSS>.txt using the current local time.
 // It returns the path of the written file, or "" and the write error.
 func writeExecutionReport(baseDir string, lines []string) (string, error) {
 	stamp := time.Now().Format("20060102_150405")
 	target := filepath.Join(baseDir, "执行结果记录_"+stamp+".txt")
 	payload := []byte(strings.Join(lines, "\n") + "\n")
 	writeErr := os.WriteFile(target, payload, 0o644)
 	if writeErr != nil {
 		return "", writeErr
 	}
 	return target, nil
 }
 // resolveIODirs registers and parses the command-line flags and returns the
 // absolute input and output directories.
 //
 // For each directory the first non-blank value wins, in this order: the long
 // flag (-input / -output), the short flag (-i / -o), the positional argument
 // (first = input, second = output), and finally baseDir/source or
 // baseDir/output.
 func resolveIODirs(baseDir string) (string, string) {
 	longIn := flag.String("input", "", "输入目录")
 	longOut := flag.String("output", "", "输出目录")
 	shortIn := flag.String("i", "", "输入目录")
 	shortOut := flag.String("o", "", "输出目录")
 	flag.Parse()
 	// copy fills only as many slots as there are positional arguments; the
 	// rest stay "".
 	var positional [2]string
 	copy(positional[:], flag.Args())
 	choose := func(fallback string, candidates ...string) string {
 		if picked := firstNonEmpty(candidates...); picked != "" {
 			return picked
 		}
 		return fallback
 	}
 	sourceDir := choose(filepath.Join(baseDir, "source"), *longIn, *shortIn, positional[0])
 	outputDir := choose(filepath.Join(baseDir, "output"), *longOut, *shortOut, positional[1])
 	// The Abs errors are discarded; if Abs fails, an empty path is returned.
 	sourceAbs, _ := filepath.Abs(sourceDir)
 	outputAbs, _ := filepath.Abs(outputDir)
 	return sourceAbs, outputAbs
 }
 // firstNonEmpty returns the first argument that is not blank (empty or
 // whitespace only), unmodified, or "" when every argument is blank.
 func firstNonEmpty(values ...string) string {
 	for i := range values {
 		if len(strings.TrimSpace(values[i])) == 0 {
 			continue
 		}
 		return values[i]
 	}
 	return ""
 }
 // main resolves the input and output directories, processes every *.pdf file
 // found directly in the input directory (sorted by path), prints per-file and
 // overall statistics, and writes an execution report into the current working
 // directory.
 //
 // A PDF that fails to process does not stop the run. The failure is printed,
 // recorded in the report under that PDF's name, and counted in the summary.
 // The process exits with status 1 only when the directories cannot be
 // prepared or read, or when the report cannot be written.
 func main() {
 	baseDir, wdErr := os.Getwd()
 	if wdErr != nil {
 		fmt.Printf("获取当前目录失败: %v\n", wdErr)
 		os.Exit(1)
 	}
 	sourceDir, outputDir := resolveIODirs(baseDir)
 	if err := os.MkdirAll(sourceDir, os.ModePerm); err != nil {
 		fmt.Printf("创建输入目录失败: %v\n", err)
 		os.Exit(1)
 	}
 	if err := os.MkdirAll(outputDir, os.ModePerm); err != nil {
 		fmt.Printf("创建输出目录失败: %v\n", err)
 		os.Exit(1)
 	}
 	entries, readErr := os.ReadDir(sourceDir)
 	if readErr != nil {
 		fmt.Printf("读取输入目录失败: %v\n", readErr)
 		os.Exit(1)
 	}
 	// Only regular entries with a .pdf extension (any letter case) are taken;
 	// sub-directories are not descended into.
 	pdfFiles := make([]string, 0, len(entries))
 	for _, e := range entries {
 		if e.IsDir() {
 			continue
 		}
 		if strings.EqualFold(filepath.Ext(e.Name()), ".pdf") {
 			pdfFiles = append(pdfFiles, filepath.Join(sourceDir, e.Name()))
 		}
 	}
 	sort.Strings(pdfFiles)
 	if len(pdfFiles) == 0 {
 		fmt.Printf("未在目录中发现 PDF: %s\n", sourceDir)
 		fmt.Println("请将 PDF 文件放到 source 目录后重试。")
 		return
 	}
 	totalOK := 0
 	totalUnknown := 0
 	failedPDFs := 0
 	reportLines := []string{
 		fmt.Sprintf("执行时间: %s", time.Now().Format("2006-01-02 15:04:05")),
 		fmt.Sprintf("输入目录: %s", sourceDir),
 		fmt.Sprintf("输出目录: %s", outputDir),
 		"",
 	}
 	for _, pdfPath := range pdfFiles {
 		pdfName := filepath.Base(pdfPath)
 		fmt.Printf("处理文件: %s\n", pdfName)
 		res, err := processPDF(pdfPath, outputDir)
 		if err != nil {
 			fmt.Printf("    处理失败: %v\n", err)
 			// Keep the failure in the report so it is not visible on the
 			// console only.
 			failedPDFs++
 			reportLines = append(reportLines,
 				fmt.Sprintf("PDF: %s", pdfName),
 				fmt.Sprintf("  处理失败: %v", err),
 				"",
 			)
 			continue
 		}
 		fmt.Printf("    识别到付款人全称的图片数: %d\n", res.SuccessCount)
 		fmt.Printf("    未识别付款人全称的图片数: %d\n", res.FailedCount)
 		totalOK += res.SuccessCount
 		totalUnknown += res.FailedCount
 		reportLines = append(reportLines,
 			fmt.Sprintf("PDF: %s", pdfName),
 			fmt.Sprintf("  拆分目录数: %d", len(res.PerDirCounts)),
 			fmt.Sprintf("  识别到付款人全称的图片数: %d", res.SuccessCount),
 			fmt.Sprintf("  未识别付款人全称的图片数: %d", res.FailedCount),
 			"  目录明细:",
 		)
 		// Map iteration order is random; sort for a stable report.
 		dirNames := make([]string, 0, len(res.PerDirCounts))
 		for dir := range res.PerDirCounts {
 			dirNames = append(dirNames, dir)
 		}
 		sort.Strings(dirNames)
 		for _, dir := range dirNames {
 			reportLines = append(reportLines, fmt.Sprintf("    - %s: %d 张", dir, res.PerDirCounts[dir]))
 		}
 		reportLines = append(reportLines, "")
 	}
 	fmt.Println("\n处理完成")
 	fmt.Printf("识别到付款人全称的图片数: %d\n", totalOK)
 	fmt.Printf("未识别付款人全称的图片数: %d\n", totalUnknown)
 	if failedPDFs > 0 {
 		fmt.Printf("处理失败的 PDF 数: %d\n", failedPDFs)
 	}
 	fmt.Printf("输出目录: %s\n", outputDir)
 	reportLines = append(reportLines,
 		"汇总:",
 		fmt.Sprintf("  识别到付款人全称的图片总数: %d", totalOK),
 		fmt.Sprintf("  未识别付款人全称的图片总数: %d", totalUnknown),
 	)
 	if failedPDFs > 0 {
 		reportLines = append(reportLines, fmt.Sprintf("  处理失败的 PDF 数: %d", failedPDFs))
 	}
 	reportPath, err := writeExecutionReport(baseDir, reportLines)
 	if err != nil {
 		fmt.Printf("写入执行结果记录失败: %v\n", err)
 		os.Exit(1)
 	}
 	fmt.Printf("执行结果记录文件: %s\n", reportPath)
 }