pdf-picker/index.go
2026-05-26 21:36:35 +08:00

419 lines
11 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package main
import (
"bytes"
"flag"
"fmt"
"image"
"image/png"
"os"
"os/exec"
"path/filepath"
"regexp"
"sort"
"strings"
"time"
fitz "github.com/gen2brain/go-fitz"
)
// receiptHeader is the title line of a China Construction Bank online-banking
// electronic receipt. Its occurrences in a page's extracted text are counted
// to decide how many receipts the page holds and where the text is split.
const receiptHeader = "中国建设银行网上银行电子回执"
// processResult summarises the outcome of processing a single PDF.
type processResult struct {
// SuccessCount is the number of receipt images for which a payee name was extracted.
SuccessCount int
// FailedCount is the number of receipt images for which no payee name could be extracted.
FailedCount int
// PerDirCounts maps each target directory name to the number of images assigned to it.
PerDirCounts map[string]int
}
// Regular expressions shared by the name-cleaning and payee-extraction helpers.
var (
// invalidPathChars matches any single one of \ / : * ? " < > | ;
// safeDirName replaces each match with "_".
invalidPathChars = regexp.MustCompile(`[\\/:*?"<>|]`)
// spacePattern matches a run of one or more whitespace characters.
spacePattern = regexp.MustCompile(`\s+`)
// payeeStopPattern matches the receipt field labels (alias, account number,
// bank of deposit, amount in words, amount in figures, purpose, cash/remittance
// flag, summary, important notice, payer full name) at which extractPayeeName
// cuts off the text that follows the payee label.
payeeStopPattern = regexp.MustCompile(`别名|账号|开户行|大写金额|小写金额|用途|钞汇标志|摘要|重要提示|付款人全称`)
)
// safeDirName turns a payee name into a string that is safe to use as a single
// directory name.
//
// Surrounding whitespace is trimmed, every character matched by
// invalidPathChars becomes "_", and each whitespace run collapses to one
// space. The result is limited to 120 runes. Trailing spaces and dots are then
// removed: they are not usable at the end of a Windows directory name, and
// removing them also turns "." and ".." into the empty string so that such a
// name can never make the joined path point at the output directory itself or
// at its parent. If nothing is left, the fallback name "未识别收款人" is
// returned.
func safeDirName(name string) string {
	const (
		fallback = "未识别收款人"
		maxRunes = 120
	)

	cleaned := invalidPathChars.ReplaceAllString(strings.TrimSpace(name), "_")
	cleaned = spacePattern.ReplaceAllString(cleaned, " ")

	// Truncate by runes, not bytes, so a multi-byte character is never split.
	if runes := []rune(cleaned); len(runes) > maxRunes {
		cleaned = string(runes[:maxRunes])
	}

	// Trim after truncating: the cut itself may expose a trailing space or dot.
	cleaned = strings.TrimRight(cleaned, " .")
	if cleaned == "" {
		return fallback
	}
	return cleaned
}
// Patterns used by extractPayeeName, compiled once at package scope rather
// than on every call. \x{FF1A} is the full-width colon.
var (
	// payeeLabelPatterns capture the rest of the line after the payee label,
	// from the strictest spelling of the label to the loosest.
	payeeLabelPatterns = []*regexp.Regexp{
		regexp.MustCompile(`收款人全称\s*[:\x{FF1A}]\s*([^\n\r]+)`),
		regexp.MustCompile(`收款人\s*全称\s*[:\x{FF1A}]\s*([^\n\r]+)`),
		regexp.MustCompile(`收\s*款\s*人\s*全\s*称\s*[:\x{FF1A}]?\s*([^\n\r]+)`),
		regexp.MustCompile(`收款人全称\s+([^\n\r]+)`),
	}
	// payeeCutPattern marks where a captured line stops being the payee name:
	// a gap of two or more whitespace characters or the start of another field.
	payeeCutPattern = regexp.MustCompile(`\s{2,}|金额|开户行|账号|日期`)
)

// extractPayeeName returns the value of the "收款人全称" (payee full name)
// field found in text, or "" when text is blank or no value can be found.
//
// Three strategies are tried in order and the first non-empty result wins:
//  1. strip all whitespace, take what follows the label and cut it at the
//     first label matched by payeeStopPattern;
//  2. match payeeLabelPatterns against the text and cut the captured line at
//     payeeCutPattern;
//  3. scan the non-empty lines for the label and take the rest of that line
//     or, if that is empty, the following line.
//
// Both the ASCII colon and the full-width colon (U+FF1A) are accepted as the
// separator after the label and are stripped from both ends of the result.
func extractPayeeName(text string) string {
	if strings.TrimSpace(text) == "" {
		return ""
	}

	const (
		label   = "收款人全称"
		colons  = ":\uff1a"
		trimSet = " " + colons
	)

	// Treat the ideographic space like an ordinary space.
	normalized := strings.ReplaceAll(text, "\u3000", " ")

	// Strategy 1: whitespace-free text, so a label or value that was broken up
	// by spaces or line breaks is still found.
	compact := spacePattern.ReplaceAllString(normalized, "")
	if idx := strings.Index(compact, label); idx >= 0 {
		tail := strings.TrimLeft(compact[idx+len(label):], colons)
		// A stop label at offset 0 is deliberately not used as a cut point,
		// because cutting there would leave nothing.
		// NOTE(review): in that case the whole remaining text is returned;
		// confirm this is the intended result for receipts with an empty payee.
		if stop := payeeStopPattern.FindStringIndex(tail); stop != nil && stop[0] > 0 {
			tail = tail[:stop[0]]
		}
		if tail = strings.Trim(tail, trimSet); tail != "" {
			return tail
		}
	}

	// Strategy 2: label patterns on the text with its line structure intact.
	for _, re := range payeeLabelPatterns {
		match := re.FindStringSubmatch(normalized)
		if len(match) < 2 {
			continue
		}
		value := strings.TrimSpace(match[1])
		value = strings.Trim(payeeCutPattern.Split(value, 2)[0], trimSet)
		if value != "" {
			return value
		}
	}

	// Strategy 3: line-by-line scan.
	var lines []string
	for _, line := range strings.Split(normalized, "\n") {
		if line = strings.TrimSpace(line); line != "" {
			lines = append(lines, line)
		}
	}
	for i, line := range lines {
		pos := strings.Index(line, label)
		if pos < 0 {
			continue
		}
		if after := strings.Trim(line[pos+len(label):], trimSet); after != "" {
			return after
		}
		if i+1 < len(lines) {
			if candidate := strings.Trim(lines[i+1], trimSet); candidate != "" {
				return candidate
			}
		}
	}
	return ""
}
// extractTextViaOCR runs the external "tesseract" program on imagePath with
// the "chi_sim+eng" language setting and returns whatever it wrote to standard
// output. OCR is a best-effort fallback: if the command cannot be started or
// exits with an error, the empty string is returned and the error is dropped.
func extractTextViaOCR(imagePath string) string {
	var recognized bytes.Buffer

	ocr := exec.Command("tesseract", imagePath, "stdout", "-l", "chi_sim+eng")
	ocr.Stdout = &recognized
	// Stderr is left at its nil zero value; tesseract's diagnostics are not collected.

	if ocr.Run() != nil {
		return ""
	}
	return recognized.String()
}
// cropImage copies the pixels of src that lie inside rect into a new RGBA
// image whose bounds start at (0, 0) and whose size is rect.Dx() by rect.Dy().
// The returned image does not share storage with src.
func cropImage(src image.Image, rect image.Rectangle) image.Image {
	w, h := rect.Dx(), rect.Dy()
	dst := image.NewRGBA(image.Rect(0, 0, w, h))
	// Walk destination coordinates and translate back into src.
	for dy := 0; dy < h; dy++ {
		for dx := 0; dx < w; dx++ {
			dst.Set(dx, dy, src.At(rect.Min.X+dx, rect.Min.Y+dy))
		}
	}
	return dst
}
// splitPageTextByHeader divides the text of one page into one piece per
// receipt, using receiptHeader as the start marker of each receipt.
//
// Blank text yields a single empty string. Text with at most one header is
// returned whole as a single piece. Otherwise each piece starts with the
// header and runs up to the next header; text before the first header is
// discarded.
func splitPageTextByHeader(pageText string) []string {
	if strings.TrimSpace(pageText) == "" {
		return []string{""}
	}

	// Splitting on the header yields one more piece than there are headers.
	pieces := strings.Split(pageText, receiptHeader)
	if len(pieces) <= 2 {
		return []string{pageText}
	}

	receipts := make([]string, len(pieces)-1)
	for i, body := range pieces[1:] {
		receipts[i] = receiptHeader + body
	}
	return receipts
}
// splitPageImageByHeaderCount cuts img into count horizontal bands of
// (near-)equal height, top to bottom, and returns them as separate images.
//
// With count <= 1 the original image is returned unchanged as the only
// element. Bands that would have zero height (count larger than the pixel
// height) are skipped; if every band is skipped the original image is
// returned as the only element.
func splitPageImageByHeaderCount(img image.Image, count int) []image.Image {
	bounds := img.Bounds()
	if count <= 1 {
		return []image.Image{img}
	}

	total := bounds.Dy()
	bands := make([]image.Image, 0, count)
	top := bounds.Min.Y
	for i := 1; i <= count; i++ {
		// Band boundaries use integer division, so the remainder rows are
		// distributed over the bands instead of being lost.
		bottom := bounds.Min.Y + (total*i)/count
		if bottom > top {
			band := image.Rect(bounds.Min.X, top, bounds.Min.X+bounds.Dx(), bottom)
			bands = append(bands, cropImage(img, band))
		}
		top = bottom
	}

	if len(bands) == 0 {
		return []image.Image{img}
	}
	return bands
}
// processPDF renders every page of the PDF at pdfPath, splits each page into
// one image per receipt and files each image under a sub-directory of
// outputDir named after the receipt's payee.
//
// For each page the embedded text is read first; the number of receiptHeader
// occurrences in it decides into how many bands the page image (rendered at
// 200 DPI) is cut. The payee is taken from the matching text segment and, if
// that yields nothing, from OCR of the band image. Receipts without a
// recognisable payee go to the "未识别收款人" directory.
//
// The returned processResult counts recognised and unrecognised receipts and
// the number of images per directory. Processing stops at the first error;
// the error says which page and receipt it belongs to, and the result returned
// with it only reflects the receipts completed before the failure.
func processPDF(pdfPath, outputDir string) (processResult, error) {
	res := processResult{PerDirCounts: map[string]int{}}
	doc, err := fitz.New(pdfPath)
	if err != nil {
		return res, err
	}
	defer doc.Close()

	baseName := strings.TrimSuffix(filepath.Base(pdfPath), filepath.Ext(pdfPath))
	totalPages := doc.NumPage()
	for pageIndex := 0; pageIndex < totalPages; pageIndex++ {
		// A text-extraction failure is not fatal: the page is then handled as
		// a single receipt and the payee is looked up through OCR.
		pageText, textErr := doc.Text(pageIndex)
		if textErr != nil {
			pageText = ""
		}
		headerCount := strings.Count(pageText, receiptHeader)
		if headerCount <= 0 {
			headerCount = 1
		}

		img, imgErr := doc.ImageDPI(pageIndex, 200)
		if imgErr != nil {
			return res, fmt.Errorf("page %d: render image: %w", pageIndex+1, imgErr)
		}
		imgSegments := splitPageImageByHeaderCount(img, headerCount)
		textSegments := splitPageTextByHeader(pageText)

		for i, segment := range imgSegments {
			// Fall back to the whole page text when there is no text segment
			// for this band.
			receiptText := pageText
			if i < len(textSegments) {
				receiptText = textSegments[i]
			}
			stem := fmt.Sprintf("%s_p%03d_r%03d", baseName, pageIndex+1, i+1)
			dirName, recognized, saveErr := saveReceipt(segment, receiptText, outputDir, stem)
			if saveErr != nil {
				return res, fmt.Errorf("page %d receipt %d: %w", pageIndex+1, i+1, saveErr)
			}
			if recognized {
				res.SuccessCount++
			} else {
				res.FailedCount++
			}
			res.PerDirCounts[dirName]++
		}
	}
	return res, nil
}

// saveReceipt writes segment as "<stem>.png" into outputDir, determines the
// payee from receiptText (or from OCR of the written image when the text gives
// nothing) and moves the file into the payee's directory. It returns the
// directory name used and whether a payee was recognised. If any step after
// writing the image fails, the temporary image is removed again.
func saveReceipt(segment image.Image, receiptText, outputDir, stem string) (dirName string, recognized bool, err error) {
	tempImagePath := filepath.Join(outputDir, stem+".png")
	if err = writePNGFile(tempImagePath, segment); err != nil {
		return "", false, err
	}
	defer func() {
		if err != nil {
			// Best-effort cleanup; the original error is the one worth reporting.
			os.Remove(tempImagePath)
		}
	}()

	payeeName := extractPayeeName(receiptText)
	if payeeName == "" {
		payeeName = extractPayeeName(extractTextViaOCR(tempImagePath))
	}
	recognized = payeeName != ""
	if !recognized {
		payeeName = "未识别收款人"
	}

	dirName = safeDirName(payeeName)
	targetDir := filepath.Join(outputDir, dirName)
	if err = os.MkdirAll(targetDir, os.ModePerm); err != nil {
		return "", false, err
	}

	// Never overwrite an existing image: append _1, _2, ... until the name is free.
	finalImagePath := filepath.Join(targetDir, stem+".png")
	for n := 1; ; n++ {
		if _, statErr := os.Stat(finalImagePath); statErr != nil {
			break
		}
		finalImagePath = filepath.Join(targetDir, fmt.Sprintf("%s_%d.png", stem, n))
	}

	if err = os.Rename(tempImagePath, finalImagePath); err != nil {
		return "", false, err
	}
	return dirName, recognized, nil
}

// writePNGFile encodes img as PNG into a newly created file at path. An encode
// error takes precedence over a close error; on either, the partial file is
// removed.
func writePNGFile(path string, img image.Image) (err error) {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer func() {
		if closeErr := f.Close(); err == nil {
			err = closeErr
		}
		if err != nil {
			// Best-effort cleanup of the incomplete file.
			os.Remove(path)
		}
	}()
	return png.Encode(f, img)
}
// writeExecutionReport writes lines, one per line and each terminated by a
// newline, to a new text file in baseDir. The file name is
// "执行结果记录_<timestamp>.txt", where the timestamp is the current local time
// formatted as YYYYMMDD_HHMMSS. It returns the path of the written file, or an
// empty path and the error if writing fails.
func writeExecutionReport(baseDir string, lines []string) (string, error) {
	stamp := time.Now().Format("20060102_150405")
	fileName := fmt.Sprintf("执行结果记录_%s.txt", stamp)
	reportPath := filepath.Join(baseDir, fileName)

	body := []byte(strings.Join(lines, "\n") + "\n")
	writeErr := os.WriteFile(reportPath, body, 0o644)
	if writeErr != nil {
		return "", writeErr
	}
	return reportPath, nil
}
// resolveIODirs determines the input and output directories from the command
// line and returns them as absolute paths (input first, output second).
//
// It registers the flags -input/-i and -output/-o on the global flag set and
// parses the command line, so it must be called only once per process. For
// each directory the first non-blank value among the long flag, the short
// flag and the positional argument (first positional = input, second =
// output) is used; if none is given the defaults are baseDir/source and
// baseDir/output.
func resolveIODirs(baseDir string) (string, string) {
	inputOpt := flag.String("input", "", "输入目录")
	outputOpt := flag.String("output", "", "输出目录")
	inputShort := flag.String("i", "", "输入目录")
	outputShort := flag.String("o", "", "输出目录")
	flag.Parse()

	// Up to two positional arguments; missing ones stay "".
	var positional [2]string
	copy(positional[:], flag.Args())

	sourceDir := firstNonEmpty(*inputOpt, *inputShort, positional[0])
	if sourceDir == "" {
		sourceDir = filepath.Join(baseDir, "source")
	}
	outputDir := firstNonEmpty(*outputOpt, *outputShort, positional[1])
	if outputDir == "" {
		outputDir = filepath.Join(baseDir, "output")
	}

	// Errors from filepath.Abs are discarded; whatever path string it returns
	// is handed back to the caller as is.
	sourceAbs, _ := filepath.Abs(sourceDir)
	outputAbs, _ := filepath.Abs(outputDir)
	return sourceAbs, outputAbs
}
// firstNonEmpty returns the first argument that contains something other than
// whitespace. The value is returned exactly as passed in (not trimmed). If
// every argument is blank, or there are no arguments, it returns "".
func firstNonEmpty(values ...string) string {
	for i := 0; i < len(values); i++ {
		if candidate := values[i]; len(strings.TrimSpace(candidate)) > 0 {
			return candidate
		}
	}
	return ""
}
// main processes every PDF in the input directory and writes an execution
// report into the current working directory.
//
// The input and output directories come from resolveIODirs and are created if
// missing. PDFs (matched by extension, case-insensitively, sub-directories
// ignored) are processed in sorted order. A PDF that fails to process does not
// stop the run: the failure is printed, recorded in the report and counted,
// and the next PDF is processed. The process exits with status 1 only when
// the working directory, the directories or the report cannot be handled.
func main() {
	baseDir, wdErr := os.Getwd()
	if wdErr != nil {
		fmt.Printf("获取当前目录失败: %v\n", wdErr)
		os.Exit(1)
	}
	sourceDir, outputDir := resolveIODirs(baseDir)
	if err := os.MkdirAll(sourceDir, os.ModePerm); err != nil {
		fmt.Printf("创建输入目录失败: %v\n", err)
		os.Exit(1)
	}
	if err := os.MkdirAll(outputDir, os.ModePerm); err != nil {
		fmt.Printf("创建输出目录失败: %v\n", err)
		os.Exit(1)
	}

	entries, readErr := os.ReadDir(sourceDir)
	if readErr != nil {
		fmt.Printf("读取输入目录失败: %v\n", readErr)
		os.Exit(1)
	}
	pdfFiles := make([]string, 0)
	for _, e := range entries {
		if e.IsDir() {
			continue
		}
		if strings.EqualFold(filepath.Ext(e.Name()), ".pdf") {
			pdfFiles = append(pdfFiles, filepath.Join(sourceDir, e.Name()))
		}
	}
	sort.Strings(pdfFiles)
	if len(pdfFiles) == 0 {
		fmt.Printf("未在目录中发现 PDF: %s\n", sourceDir)
		fmt.Println("请将 PDF 文件放到 source 目录后重试。")
		return
	}

	totalOK := 0
	totalUnknown := 0
	failedPDFs := 0
	reportLines := []string{
		fmt.Sprintf("执行时间: %s", time.Now().Format("2006-01-02 15:04:05")),
		fmt.Sprintf("输入目录: %s", sourceDir),
		fmt.Sprintf("输出目录: %s", outputDir),
		"",
	}
	for _, pdfPath := range pdfFiles {
		fmt.Printf("处理文件: %s\n", filepath.Base(pdfPath))
		res, err := processPDF(pdfPath, outputDir)
		if err != nil {
			fmt.Printf(" 处理失败: %v\n", err)
			// Record the failure so the report does not silently omit this PDF.
			failedPDFs++
			reportLines = append(reportLines,
				fmt.Sprintf("PDF: %s", filepath.Base(pdfPath)),
				fmt.Sprintf(" 处理失败: %v", err),
				"",
			)
			continue
		}
		fmt.Printf(" 识别到收款人全称的图片数: %d\n", res.SuccessCount)
		fmt.Printf(" 未识别收款人全称的图片数: %d\n", res.FailedCount)
		totalOK += res.SuccessCount
		totalUnknown += res.FailedCount
		reportLines = append(reportLines,
			fmt.Sprintf("PDF: %s", filepath.Base(pdfPath)),
			fmt.Sprintf(" 拆分目录数: %d", len(res.PerDirCounts)),
			fmt.Sprintf(" 识别到收款人全称的图片数: %d", res.SuccessCount),
			fmt.Sprintf(" 未识别收款人全称的图片数: %d", res.FailedCount),
			" 目录明细:",
		)
		// Map iteration order is random; sort for a stable report.
		dirNames := make([]string, 0, len(res.PerDirCounts))
		for dir := range res.PerDirCounts {
			dirNames = append(dirNames, dir)
		}
		sort.Strings(dirNames)
		for _, dir := range dirNames {
			reportLines = append(reportLines, fmt.Sprintf(" - %s: %d 张", dir, res.PerDirCounts[dir]))
		}
		reportLines = append(reportLines, "")
	}

	fmt.Println("\n处理完成")
	fmt.Printf("识别到收款人全称的图片数: %d\n", totalOK)
	fmt.Printf("未识别收款人全称的图片数: %d\n", totalUnknown)
	if failedPDFs > 0 {
		fmt.Printf("处理失败的 PDF 数: %d\n", failedPDFs)
	}
	fmt.Printf("输出目录: %s\n", outputDir)

	reportLines = append(reportLines,
		"汇总:",
		fmt.Sprintf(" 识别到收款人全称的图片总数: %d", totalOK),
		fmt.Sprintf(" 未识别收款人全称的图片总数: %d", totalUnknown),
	)
	// The extra summary line appears only when something failed, so the
	// report of a fully successful run keeps its previous layout.
	if failedPDFs > 0 {
		reportLines = append(reportLines, fmt.Sprintf(" 处理失败的 PDF 数: %d", failedPDFs))
	}
	reportPath, err := writeExecutionReport(baseDir, reportLines)
	if err != nil {
		fmt.Printf("写入执行结果记录失败: %v\n", err)
		os.Exit(1)
	}
	fmt.Printf("执行结果记录文件: %s\n", reportPath)
}