添加golang版本

This commit is contained in:
liyanyan 2026-05-26 21:36:35 +08:00
parent 106ebc4c4a
commit 7878c7bee6
5 changed files with 571 additions and 0 deletions

6
.gitignore vendored Normal file
View File

@ -0,0 +1,6 @@
__pycache__
output
source
执行结果记录_**.txt
*.log
*.exe

140
README.md Normal file
View File

@ -0,0 +1,140 @@
# pdf-picker
基于 Go 的 PDF 回执拆分与分类工具。
功能说明:
- 扫描输入目录中的 PDF 文件
- 将回执按页面内多条回执进行拆分并导出图片
- 从文本中提取“收款人全称”并按名称分类到输出目录
- 识别失败时可使用 OCR 兜底
- 生成执行结果记录文件(含每个 PDF 的目录与图片明细)
## 1. 项目结构
- index.go: Go 主程序
- go.mod / go.sum: Go 模块依赖
- source/: 默认输入目录（放 PDF）
- output/: 默认输出目录(分类后的图片)
## 2. 运行环境
- Windows
- Go 1.19+
- C 编译器（用于 cgo，go-fitz 依赖）
- 可选：Tesseract OCR（用于识别扫描件）
当前项目依赖:
- github.com/gen2brain/go-fitz v1.20.2
## 3. 工具链配置（Windows）
如果构建时报错 `gcc not found`,可使用 LLVM-MinGW。
示例(已验证可用路径):
1) 安装 LLVM-MinGW（任选一种）
- 使用 winget
- 手动下载并解压到 C:/llvm-mingw
2) 配置 Go 使用 clang 作为 cgo 编译器（PowerShell）
```powershell
go env -w CC="C:\llvm-mingw\llvm-mingw-20260519-ucrt-x86_64\bin\clang.exe"
go env -w CXX="C:\llvm-mingw\llvm-mingw-20260519-ucrt-x86_64\bin\clang++.exe"
go env -w CGO_ENABLED=1
```
3) 检查配置
```powershell
go env CC CXX CGO_ENABLED
```
## 4. 编译
```powershell
go mod tidy
go build index.go
```
编译成功后会生成可执行文件:
- index.exe
## 5. 运行
### 5.1 默认目录
```powershell
.\index.exe
```
等价于:
- 输入目录:./source
- 输出目录:./output
### 5.2 指定目录(短参数)
```powershell
.\index.exe -i ./source -o ./output2
```
### 5.3 指定目录(位置参数)
```powershell
.\index.exe ./source ./output2
```
优先级:
- 命名参数（-i/-o 或 --input/--output）
- 位置参数
- 默认目录（source/output）
## 6. 输出结果
运行结束后会看到控制台统计,并在项目根目录生成记录文件:
- 执行结果记录_YYYYMMDD_HHMMSS.txt
记录内容包括:
- 每个 PDF 拆分目录数
- 每个目录对应图片数量
- 识别成功/失败数量
- 全部汇总
## 7. OCR 说明
程序在文本提取失败时会尝试调用:
```text
tesseract <image> stdout -l chi_sim+eng
```
如果本机未安装 tesseract，OCR 兜底会自动跳过，不影响程序主流程（但扫描件识别率会下降）。
## 8. 常见问题
### Q1: 构建时报 `gcc not found`
原因:缺少 cgo 编译器。
处理:按第 3 节安装并配置 LLVM-MinGW。
### Q2: 构建时出现大量 `ld.lld: warning: duplicate symbol`
这是链接阶段警告，若最终退出码为 0 且生成 index.exe，可继续使用。
### Q3: 没有识别出“收款人全称”
- 可能是 PDF 为扫描件且 OCR 不可用
- 可能是单据模板字段格式与当前规则不一致
可通过增强正则规则或补充 OCR 环境提升识别率。
## 9. 快速开始
```powershell
# 1) 放入 PDF 到 source
# 2) 编译
go build index.go
# 3) 运行
.\index.exe -o ./output2
```

5
go.mod Normal file
View File

@ -0,0 +1,5 @@
module pdf-picker
go 1.19
require github.com/gen2brain/go-fitz v1.20.2

2
go.sum Normal file
View File

@ -0,0 +1,2 @@
github.com/gen2brain/go-fitz v1.20.2 h1:4FPJCU/ImQ32oojBsYn/+oTkRORxbAhAA+Yw1Fm97MA=
github.com/gen2brain/go-fitz v1.20.2/go.mod h1:YbQPODTC/UnQ/RK4JyD3zfpDQ19UKiV85nMMT3XpT0s=

418
index.go Normal file
View File

@ -0,0 +1,418 @@
package main
import (
"bytes"
"flag"
"fmt"
"image"
"image/png"
"os"
"os/exec"
"path/filepath"
"regexp"
"sort"
"strings"
"time"
fitz "github.com/gen2brain/go-fitz"
)
// receiptHeader is the title text that marks the start of one receipt in the
// text extracted from a page. processPDF counts its occurrences to decide how
// many receipts a page holds, and splitPageTextByHeader splits the page text
// on it.
const receiptHeader = "中国建设银行网上银行电子回执"
// processResult summarizes what processPDF produced for a single PDF file.
type processResult struct {
// SuccessCount is the number of receipt images for which a payee name was
// extracted (from the page text or from OCR).
SuccessCount int
// FailedCount is the number of receipt images for which no payee name was
// found; those images are filed under the fallback directory name.
FailedCount int
// PerDirCounts maps each target directory name to the number of images
// filed under it.
PerDirCounts map[string]int
}
// Regular expressions shared by the helpers below; compiled once at package
// initialization.
var (
// invalidPathChars matches the characters safeDirName replaces with "_":
// backslash, slash, colon, asterisk, question mark, double quote, angle
// brackets and pipe.
invalidPathChars = regexp.MustCompile(`[\\/:*?"<>|]`)
// spacePattern matches a run of one or more whitespace characters.
spacePattern = regexp.MustCompile(`\s+`)
// payeeStopPattern matches the field labels extractPayeeName uses to find
// where the payee value ends in whitespace-stripped receipt text.
payeeStopPattern = regexp.MustCompile(`别名|账号|开户行|大写金额|小写金额|用途|钞汇标志|摘要|重要提示|付款人全称`)
)
// safeDirName turns a payee name into a string that can be used as a single
// directory name.
//
// The name is trimmed, every character matched by invalidPathChars is replaced
// with "_", whitespace runs (spacePattern) are collapsed to one space, and the
// result is capped at 120 runes. Trailing dots and spaces are then removed:
// Windows drops them from directory names, and a name made only of dots ("."
// or "..") would make filepath.Join resolve to the parent directories instead
// of a new sub-directory. If nothing usable is left, the fallback name
// "未识别收款人" is returned.
func safeDirName(name string) string {
	const (
		fallbackDir = "未识别收款人"
		maxRunes    = 120
	)

	cleaned := invalidPathChars.ReplaceAllString(strings.TrimSpace(name), "_")
	cleaned = spacePattern.ReplaceAllString(cleaned, " ")
	if runes := []rune(cleaned); len(runes) > maxRunes {
		cleaned = string(runes[:maxRunes])
	}
	// Trim after truncating so a cut in the middle of "word word" cannot leave
	// a trailing space behind.
	cleaned = strings.TrimRight(cleaned, ". ")
	if cleaned == "" {
		return fallbackDir
	}
	return cleaned
}
func extractPayeeName(text string) string {
if strings.TrimSpace(text) == "" {
return ""
}
normalized := strings.ReplaceAll(text, "\u3000", " ")
compact := spacePattern.ReplaceAllString(normalized, "")
if idx := strings.Index(compact, "收款人全称"); idx >= 0 {
tail := strings.TrimLeft(compact[idx+len("收款人全称"):], ":")
if tail != "" {
if stop := payeeStopPattern.FindStringIndex(tail); stop != nil && stop[0] > 0 {
tail = tail[:stop[0]]
}
tail = strings.Trim(tail, " :")
if tail != "" {
return tail
}
}
}
patterns := []string{
`收款人全称\s*[:]\s*([^\n\r]+)`,
`收款人\s*全称\s*[:]\s*([^\n\r]+)`,
`收\s*款\s*人\s*全\s*称\s*[:]?\s*([^\n\r]+)`,
`收款人全称\s+([^\n\r]+)`,
}
for _, p := range patterns {
re := regexp.MustCompile(p)
match := re.FindStringSubmatch(normalized)
if len(match) > 1 {
value := strings.TrimSpace(match[1])
cutRe := regexp.MustCompile(`\s{2,}|金额|开户行|账号|日期`)
value = strings.Trim(cutRe.Split(value, 2)[0], " :")
if value != "" {
return value
}
}
}
lines := strings.Split(normalized, "\n")
filtered := make([]string, 0, len(lines))
for _, line := range lines {
line = strings.TrimSpace(line)
if line != "" {
filtered = append(filtered, line)
}
}
for i, line := range filtered {
if strings.Contains(line, "收款人全称") {
after := strings.Trim(strings.SplitN(line, "收款人全称", 2)[1], " :")
if after != "" {
return after
}
if i+1 < len(filtered) {
candidate := strings.Trim(filtered[i+1], " :")
if candidate != "" {
return candidate
}
}
}
}
return ""
}
// extractTextViaOCR runs the external `tesseract` command on imagePath with
// the chi_sim+eng language set and returns whatever it wrote to standard
// output. Any failure to run the command (including tesseract not being
// installed or exiting with an error) yields "" so callers can treat OCR as
// an optional fallback.
func extractTextViaOCR(imagePath string) string {
	var recognized bytes.Buffer
	tesseract := exec.Command("tesseract", imagePath, "stdout", "-l", "chi_sim+eng")
	tesseract.Stdout, tesseract.Stderr = &recognized, nil
	if runErr := tesseract.Run(); runErr != nil {
		return ""
	}
	return recognized.String()
}
// cropImage copies the pixels of src inside rect into a new RGBA image whose
// bounds start at (0, 0) and whose size is rect.Dx() x rect.Dy(). The result
// never shares pixel memory with src. Parts of rect that lie outside src are
// left transparent black.
//
// When src is an *image.RGBA, whole rows are copied directly instead of going
// through the At/Set interfaces for every pixel; the output is the same either
// way. All other image types use the generic per-pixel conversion.
func cropImage(src image.Image, rect image.Rectangle) image.Image {
	dst := image.NewRGBA(image.Rect(0, 0, rect.Dx(), rect.Dy()))

	if rgbaSrc, ok := src.(*image.RGBA); ok && rgbaSrc != nil {
		// Only the part of rect that overlaps the source has pixels to copy;
		// dst is already zeroed everywhere else.
		visible := rect.Intersect(rgbaSrc.Bounds())
		rowLen := 4 * visible.Dx()
		for y := visible.Min.Y; y < visible.Max.Y; y++ {
			from := rgbaSrc.PixOffset(visible.Min.X, y)
			to := dst.PixOffset(visible.Min.X-rect.Min.X, y-rect.Min.Y)
			copy(dst.Pix[to:to+rowLen], rgbaSrc.Pix[from:from+rowLen])
		}
		return dst
	}

	for y := rect.Min.Y; y < rect.Max.Y; y++ {
		for x := rect.Min.X; x < rect.Max.X; x++ {
			dst.Set(x-rect.Min.X, y-rect.Min.Y, src.At(x, y))
		}
	}
	return dst
}
// splitPageTextByHeader cuts a page's text into one piece per receipt, using
// receiptHeader as the start marker.
//
// Blank text yields a single empty string. Text with at most one header is
// returned whole as a single element. Otherwise each element begins with
// receiptHeader followed by the text up to the next header; anything before
// the first header is discarded.
func splitPageTextByHeader(pageText string) []string {
	switch {
	case strings.TrimSpace(pageText) == "":
		return []string{""}
	case strings.Count(pageText, receiptHeader) <= 1:
		return []string{pageText}
	}

	// Element 0 of the split is the text in front of the first header.
	bodies := strings.Split(pageText, receiptHeader)[1:]
	receipts := make([]string, len(bodies))
	for idx, body := range bodies {
		receipts[idx] = receiptHeader + body
	}
	return receipts
}
// splitPageImageByHeaderCount slices img into count horizontal strips of
// (almost) equal height, top to bottom, each produced by cropImage.
//
// With count <= 1 the original image is returned as the only element. Strips
// that would have zero height (more strips requested than pixel rows) are
// skipped; if that leaves nothing, the original image is returned.
func splitPageImageByHeaderCount(img image.Image, count int) []image.Image {
	bounds := img.Bounds()
	if count <= 1 {
		return []image.Image{img}
	}

	pageHeight := bounds.Dy()
	left, right := bounds.Min.X, bounds.Min.X+bounds.Dx()
	strips := make([]image.Image, 0, count)

	// Each strip starts where the previous one ended, so only the bottom edge
	// has to be computed per iteration.
	top := bounds.Min.Y
	for n := 1; n <= count; n++ {
		bottom := bounds.Min.Y + (pageHeight*n)/count
		if bottom > top {
			strips = append(strips, cropImage(img, image.Rect(left, top, right, bottom)))
		}
		top = bottom
	}

	if len(strips) == 0 {
		return []image.Image{img}
	}
	return strips
}
// writePNGFile encodes img as PNG into a newly created file at path. If
// encoding or closing fails, the partial file is removed before the error is
// returned.
func writePNGFile(path string, img image.Image) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	encodeErr := png.Encode(f, img)
	closeErr := f.Close()
	if encodeErr == nil && closeErr == nil {
		return nil
	}
	_ = os.Remove(path) // best effort: do not leave a truncated image behind
	if encodeErr != nil {
		return encodeErr
	}
	return closeErr
}

// uniqueImagePath returns a path inside dir for an image called stem+".png"
// that does not exist yet. The plain name is preferred; on collision the
// process ID is appended, and if that name is taken as well a counter is added
// until a free name is found, so an existing image is never overwritten.
func uniqueImagePath(dir, stem string) string {
	exists := func(p string) bool {
		_, statErr := os.Stat(p)
		return statErr == nil
	}

	candidate := filepath.Join(dir, stem+".png")
	if !exists(candidate) {
		return candidate
	}
	pidStem := fmt.Sprintf("%s_%d", stem, os.Getpid())
	candidate = filepath.Join(dir, pidStem+".png")
	for n := 2; exists(candidate); n++ {
		candidate = filepath.Join(dir, fmt.Sprintf("%s_%d.png", pidStem, n))
	}
	return candidate
}

// processPDF renders every page of the PDF at pdfPath, splits each page into
// receipt images and files every image under outputDir/<payee directory>/.
//
// Per page, the number of receiptHeader occurrences in the extracted text
// decides how many strips the rendered page (200 DPI) is cut into. Each strip
// is first written to outputDir, the payee name is looked up in the matching
// text segment (falling back to OCR on the written image), and the file is
// then moved into the directory named by safeDirName. Images without a
// recognizable payee go to "未识别收款人".
//
// The returned processResult counts only images that were moved successfully.
// On error the result is incomplete and should not be used; the temporary
// image of the failing step is removed.
func processPDF(pdfPath, outputDir string) (processResult, error) {
	res := processResult{PerDirCounts: map[string]int{}}
	doc, err := fitz.New(pdfPath)
	if err != nil {
		return res, err
	}
	defer doc.Close()

	pdfStem := strings.TrimSuffix(filepath.Base(pdfPath), filepath.Ext(pdfPath))
	totalPages := doc.NumPage()
	for pageIndex := 0; pageIndex < totalPages; pageIndex++ {
		pageText, textErr := doc.Text(pageIndex)
		if textErr != nil {
			// Best effort: without text the page is treated as one receipt
			// and the payee is looked up through OCR.
			pageText = ""
		}
		headerCount := strings.Count(pageText, receiptHeader)
		if headerCount <= 0 {
			headerCount = 1
		}
		img, imgErr := doc.ImageDPI(pageIndex, 200)
		if imgErr != nil {
			return res, fmt.Errorf("渲染第 %d 页: %w", pageIndex+1, imgErr)
		}
		imgSegments := splitPageImageByHeaderCount(img, headerCount)
		textSegments := splitPageTextByHeader(pageText)
		for i, segment := range imgSegments {
			imageStem := fmt.Sprintf("%s_p%03d_r%03d", pdfStem, pageIndex+1, i+1)
			tempImagePath := filepath.Join(outputDir, imageStem+".png")
			if writeErr := writePNGFile(tempImagePath, segment); writeErr != nil {
				return res, writeErr
			}

			// Use the whole page text when there is no matching text segment.
			receiptText := pageText
			if i < len(textSegments) {
				receiptText = textSegments[i]
			}
			payeeName := extractPayeeName(receiptText)
			if payeeName == "" {
				payeeName = extractPayeeName(extractTextViaOCR(tempImagePath))
			}
			recognized := payeeName != ""
			if !recognized {
				payeeName = "未识别收款人"
			}

			targetDirName := safeDirName(payeeName)
			targetDir := filepath.Join(outputDir, targetDirName)
			if mkErr := os.MkdirAll(targetDir, os.ModePerm); mkErr != nil {
				_ = os.Remove(tempImagePath) // best effort cleanup
				return res, mkErr
			}
			finalImagePath := uniqueImagePath(targetDir, imageStem)
			if renameErr := os.Rename(tempImagePath, finalImagePath); renameErr != nil {
				_ = os.Remove(tempImagePath) // best effort cleanup
				return res, renameErr
			}

			if recognized {
				res.SuccessCount++
			} else {
				res.FailedCount++
			}
			res.PerDirCounts[targetDirName]++
		}
	}
	return res, nil
}
// writeExecutionReport stores lines, one per row and terminated by a final
// newline, in a file named "执行结果记录_<YYYYMMDD_HHMMSS>.txt" inside baseDir.
// It returns the path of the written file, or "" and the error if the file
// could not be written.
func writeExecutionReport(baseDir string, lines []string) (string, error) {
	stamp := time.Now().Format("20060102_150405")
	reportPath := filepath.Join(baseDir, fmt.Sprintf("执行结果记录_%s.txt", stamp))

	payload := append([]byte(strings.Join(lines, "\n")), '\n')
	writeErr := os.WriteFile(reportPath, payload, 0o644)
	if writeErr != nil {
		return "", writeErr
	}
	return reportPath, nil
}
// absOrClean returns the absolute form of path. If the absolute path cannot
// be determined, the cleaned path is returned instead so the caller never
// receives an empty directory name.
func absOrClean(path string) string {
	abs, err := filepath.Abs(path)
	if err != nil {
		return filepath.Clean(path)
	}
	return abs
}

// resolveIODirs determines the input and output directories from the command
// line and returns them as absolute paths (input first, output second).
//
// For each directory the first non-blank value wins, in this order:
//  1. the named flag (-input / -i, respectively -output / -o),
//  2. the positional argument (first = input, second = output),
//  3. the default baseDir/source, respectively baseDir/output.
//
// It registers its flags on the global flag set and calls flag.Parse, so it
// must be called at most once per process.
func resolveIODirs(baseDir string) (string, string) {
	inputOpt := flag.String("input", "", "输入目录")
	outputOpt := flag.String("output", "", "输出目录")
	inputShort := flag.String("i", "", "输入目录")
	outputShort := flag.String("o", "", "输出目录")
	flag.Parse()

	var posInput, posOutput string
	switch args := flag.Args(); {
	case len(args) >= 2:
		posInput, posOutput = args[0], args[1]
	case len(args) == 1:
		posInput = args[0]
	}

	sourceDir := firstNonEmpty(*inputOpt, *inputShort, posInput, filepath.Join(baseDir, "source"))
	outputDir := firstNonEmpty(*outputOpt, *outputShort, posOutput, filepath.Join(baseDir, "output"))
	return absOrClean(sourceDir), absOrClean(outputDir)
}
// firstNonEmpty returns the first argument that contains something other than
// whitespace. The value is returned exactly as passed in (not trimmed). If
// every argument is blank, or none are given, it returns "".
func firstNonEmpty(values ...string) string {
	for idx := 0; idx < len(values); idx++ {
		if candidate := values[idx]; len(strings.TrimSpace(candidate)) > 0 {
			return candidate
		}
	}
	return ""
}
// main is the command-line entry point.
//
// It resolves the input and output directories, creates them if necessary,
// processes every *.pdf file (case-insensitive extension) directly inside the
// input directory in sorted order, prints per-file and overall statistics and
// finally writes an execution report into the current working directory.
//
// A PDF that fails to process does not abort the run. The failure is printed,
// recorded in the report under that PDF's name, and counted in the summary.
// The process exits with status 1 only when the working directory, the
// directories or the report cannot be handled.
func main() {
	baseDir, wdErr := os.Getwd()
	if wdErr != nil {
		fmt.Printf("获取当前目录失败: %v\n", wdErr)
		os.Exit(1)
	}
	sourceDir, outputDir := resolveIODirs(baseDir)
	if err := os.MkdirAll(sourceDir, os.ModePerm); err != nil {
		fmt.Printf("创建输入目录失败: %v\n", err)
		os.Exit(1)
	}
	if err := os.MkdirAll(outputDir, os.ModePerm); err != nil {
		fmt.Printf("创建输出目录失败: %v\n", err)
		os.Exit(1)
	}
	entries, readErr := os.ReadDir(sourceDir)
	if readErr != nil {
		fmt.Printf("读取输入目录失败: %v\n", readErr)
		os.Exit(1)
	}

	pdfFiles := make([]string, 0, len(entries))
	for _, e := range entries {
		if !e.IsDir() && strings.EqualFold(filepath.Ext(e.Name()), ".pdf") {
			pdfFiles = append(pdfFiles, filepath.Join(sourceDir, e.Name()))
		}
	}
	sort.Strings(pdfFiles)
	if len(pdfFiles) == 0 {
		fmt.Printf("未在目录中发现 PDF: %s\n", sourceDir)
		fmt.Println("请将 PDF 文件放到 source 目录后重试。")
		return
	}

	totalOK, totalUnknown, totalFailedPDFs := 0, 0, 0
	reportLines := []string{
		fmt.Sprintf("执行时间: %s", time.Now().Format("2006-01-02 15:04:05")),
		fmt.Sprintf("输入目录: %s", sourceDir),
		fmt.Sprintf("输出目录: %s", outputDir),
		"",
	}
	for _, pdfPath := range pdfFiles {
		pdfName := filepath.Base(pdfPath)
		fmt.Printf("处理文件: %s\n", pdfName)
		res, err := processPDF(pdfPath, outputDir)
		if err != nil {
			fmt.Printf(" 处理失败: %v\n", err)
			// Keep the failure in the report as well; otherwise the record
			// would silently omit this PDF.
			totalFailedPDFs++
			reportLines = append(reportLines,
				fmt.Sprintf("PDF: %s", pdfName),
				fmt.Sprintf(" 处理失败: %v", err),
				"",
			)
			continue
		}
		fmt.Printf(" 识别到收款人全称的图片数: %d\n", res.SuccessCount)
		fmt.Printf(" 未识别收款人全称的图片数: %d\n", res.FailedCount)
		totalOK += res.SuccessCount
		totalUnknown += res.FailedCount
		reportLines = append(reportLines,
			fmt.Sprintf("PDF: %s", pdfName),
			fmt.Sprintf(" 拆分目录数: %d", len(res.PerDirCounts)),
			fmt.Sprintf(" 识别到收款人全称的图片数: %d", res.SuccessCount),
			fmt.Sprintf(" 未识别收款人全称的图片数: %d", res.FailedCount),
			" 目录明细:",
		)
		// Map iteration order is random; sort for a stable report.
		dirNames := make([]string, 0, len(res.PerDirCounts))
		for dir := range res.PerDirCounts {
			dirNames = append(dirNames, dir)
		}
		sort.Strings(dirNames)
		for _, dir := range dirNames {
			reportLines = append(reportLines, fmt.Sprintf(" - %s: %d 张", dir, res.PerDirCounts[dir]))
		}
		reportLines = append(reportLines, "")
	}

	fmt.Println("\n处理完成")
	fmt.Printf("识别到收款人全称的图片数: %d\n", totalOK)
	fmt.Printf("未识别收款人全称的图片数: %d\n", totalUnknown)
	if totalFailedPDFs > 0 {
		fmt.Printf("处理失败的 PDF 数: %d\n", totalFailedPDFs)
	}
	fmt.Printf("输出目录: %s\n", outputDir)
	reportLines = append(reportLines,
		"汇总:",
		fmt.Sprintf(" 识别到收款人全称的图片总数: %d", totalOK),
		fmt.Sprintf(" 未识别收款人全称的图片总数: %d", totalUnknown),
	)
	if totalFailedPDFs > 0 {
		reportLines = append(reportLines, fmt.Sprintf(" 处理失败的 PDF 数: %d", totalFailedPDFs))
	}
	reportPath, err := writeExecutionReport(baseDir, reportLines)
	if err != nil {
		fmt.Printf("写入执行结果记录失败: %v\n", err)
		os.Exit(1)
	}
	fmt.Printf("执行结果记录文件: %s\n", reportPath)
}