Files
office-data-matcher/matcher.go
sakuradairong 31a21d5364 fix: 修复审查发现的多个问题并补全开发环境
- 修复 MaxPreview=0 仍被覆盖为默认值的 bug
- 修复 API Endpoint 自动补全逻辑(避免 /v1/v1/chat/completions)
- 为 AI 配置与匹配状态字段增加并发锁
- AI 增强未匹配行改为按索引跟踪,避免重复行误判
- 无时间列时 AI 匹配 B 表行数可配置并增加截断警告
- 导出时防御参差不齐行导致的数组越界 panic
- Excel 读取时对单元格统一 TrimSpace
- 删除未使用的 minInt 函数
- 修复 wails.json 开发服务器地址为 http://localhost:5173
- 重新生成 Wails 前端绑定
- 新增 ai_test.go / export_test.go 单元测试
2026-06-23 20:55:32 +00:00

282 lines
6.7 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package main
import (
"encoding/csv"
"fmt"
"math"
"os"
"path/filepath"
"regexp"
"strings"
"time"
"github.com/xuri/excelize/v2"
)
// ---------- 健壮的时间解析 ----------
// timeFormats lists the time layouts accepted by parseTimeFlexible, covering
// the differing formats found in monthly and daily reports. The layouts are
// tried in slice order and the first one that parses the input wins, so the
// order of the entries is significant.
var timeFormats = []string{
	// Zero-padded year-month-day with a time of day.
	"2006-01-02 15:04:05",
	"2006-01-02 15:04",
	"2006/01/02 15:04:05",
	"2006/01/02 15:04",
	// Month and day without mandatory zero padding.
	"2006-1-2 15:04:05",
	"2006-1-2 15:04",
	"2006/1/2 15:04:05",
	"2006/1/2 15:04",
	// "T"-separated date and time.
	"2006-01-02T15:04:05",
	"2006/01/02T15:04:05",
	// Month/day/year ordering.
	"01/02/2006 15:04",
	"1/2/2006 15:04",
	// Date only.
	"2006-01-02",
	"2006/01/02",
}
// parseTimeFlexible parses timeStr by trying every layout in timeFormats in
// order and returning the first successful result. Surrounding whitespace is
// ignored. If no layout matches, it returns the zero time.Time together with
// an error that quotes the trimmed input.
func parseTimeFlexible(timeStr string) (time.Time, error) {
	trimmed := strings.TrimSpace(timeStr)
	for i := 0; i < len(timeFormats); i++ {
		parsed, err := time.Parse(timeFormats[i], trimmed)
		if err != nil {
			// This layout does not fit; fall through to the next one.
			continue
		}
		return parsed, nil
	}
	return time.Time{}, fmt.Errorf("无法解析时间格式: %s", trimmed)
}
// ---------- Levenshtein 距离算法 ----------
// levenshteinDistance returns the edit distance between runes1 and runes2:
// the minimum number of single-rune insertions, deletions and substitutions
// needed to turn one sequence into the other.
//
// Only one row of the dynamic-programming table is kept, so the extra memory
// used is len(runes2)+1 ints.
func levenshteinDistance(runes1, runes2 []rune) int {
	cols := len(runes2)

	// row[j] starts as the distance from the empty prefix of runes1 to the
	// first j runes of runes2, i.e. j insertions.
	row := make([]int, cols+1)
	for j := 0; j <= cols; j++ {
		row[j] = j
	}

	for i, a := range runes1 {
		// diag carries the previous row's value one column to the left of the
		// cell being computed; it must be saved before row is overwritten.
		diag := row[0]
		row[0] = i + 1
		for j, b := range runes2 {
			above := row[j+1]

			best := diag // substitution (free when the runes are equal)
			if a != b {
				best++
			}
			if del := above + 1; del < best {
				best = del
			}
			if ins := row[j] + 1; ins < best {
				best = ins
			}

			row[j+1] = best
			diag = above
		}
	}
	return row[cols]
}
// calcSimilarity returns a similarity score in [0, 1] for s1 and s2, computed
// as 1 - editDistance/maxRuneLength.
//
// When reg is non-nil every match of reg is removed from both strings first;
// a nil reg skips cleaning and compares the strings as given. When
// caseSensitive is false both strings are lower-cased before comparison.
// Two strings that are both empty after preprocessing score 1; if exactly one
// of them is empty the score is 0.
func calcSimilarity(s1, s2 string, reg *regexp.Regexp, caseSensitive bool) float64 {
	left, right := s1, s2
	if reg != nil {
		left, right = reg.ReplaceAllString(left, ""), reg.ReplaceAllString(right, "")
	}
	if !caseSensitive {
		left, right = strings.ToLower(left), strings.ToLower(right)
	}

	// Compare by rune so multi-byte characters count as one unit each.
	a, b := []rune(left), []rune(right)
	switch {
	case len(a) == 0 && len(b) == 0:
		return 1.0
	case len(a) == 0 || len(b) == 0:
		return 0.0
	}

	longest := math.Max(float64(len(a)), float64(len(b)))
	return 1.0 - float64(levenshteinDistance(a, b))/longest
}
// similarityFromCleaned returns a similarity score in [0, 1] for two strings
// that the caller has already cleaned, so no regex pass is applied here (this
// avoids cleaning the same string repeatedly). The score is
// 1 - editDistance/maxRuneLength. When caseSensitive is false both strings are
// lower-cased first. Two empty strings score 1; exactly one empty string
// scores 0.
func similarityFromCleaned(clean1, clean2 string, caseSensitive bool) float64 {
	if !caseSensitive {
		clean1, clean2 = strings.ToLower(clean1), strings.ToLower(clean2)
	}

	left, right := []rune(clean1), []rune(clean2)
	if len(left) == 0 || len(right) == 0 {
		// At least one side is empty: identical only if both are.
		if len(left) == len(right) {
			return 1.0
		}
		return 0.0
	}

	distance := levenshteinDistance(left, right)
	longest := math.Max(float64(len(left)), float64(len(right)))
	return 1.0 - float64(distance)/longest
}
// cleanWithRegex removes every match of reg from input and returns the result.
// A nil reg means "no cleaning": input is returned unchanged.
func cleanWithRegex(input string, reg *regexp.Regexp) string {
	cleaned := input
	if reg != nil {
		cleaned = reg.ReplaceAllString(input, "")
	}
	return cleaned
}
// ---------- 文件读取(通用)----------
// readRawRows reads the Excel or CSV file at path and returns its contents as
// a raw two-dimensional string slice whose first row is the header row.
// Files whose extension is ".csv" (compared case-insensitively) are read as
// CSV; every other extension is handed to the Excel reader.
func (a *App) readRawRows(path string) ([][]string, error) {
	if strings.ToLower(filepath.Ext(path)) == ".csv" {
		return a.readCSVRaw(path)
	}
	return a.readExcelRaw(path)
}
// readCSVRaw reads the CSV file at path and returns all of its records with
// every cell trimmed of surrounding whitespace; the first record is the
// header row.
//
// A leading UTF-8 byte order mark is stripped before parsing so it cannot end
// up glued to the first header cell. Records may have differing numbers of
// fields (ragged rows are accepted rather than rejected); callers should use
// getCell to index into a row safely.
//
// It returns an error if the file cannot be read, cannot be parsed as CSV, or
// contains fewer than two records (a header plus at least one data row).
// Underlying errors are wrapped and remain reachable via errors.Is/As.
func (a *App) readCSVRaw(path string) ([][]string, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("打开 CSV 文件失败: %w", err)
	}

	// encoding/csv does not skip a byte order mark; left in place it becomes
	// part of the first field and breaks header-name lookups.
	text := strings.TrimPrefix(string(raw), "\uFEFF")

	reader := csv.NewReader(strings.NewReader(text))
	reader.LazyQuotes = true
	reader.TrimLeadingSpace = true
	// A negative value disables the "same field count as the first record"
	// check, so short or long rows no longer abort the whole read.
	reader.FieldsPerRecord = -1

	allRows, err := reader.ReadAll()
	if err != nil {
		return nil, fmt.Errorf("读取 CSV 文件失败: %w", err)
	}
	if len(allRows) < 2 {
		return nil, fmt.Errorf("CSV 文件至少需要标题行和一条数据")
	}

	for _, row := range allRows {
		for j := range row {
			row[j] = strings.TrimSpace(row[j])
		}
	}
	return allRows, nil
}
// readExcelRaw opens the workbook at path with excelize, reads every row of
// the sheet at index 0 and returns the rows with each cell trimmed of
// surrounding whitespace; the first row is the header row.
//
// It returns an error if the workbook cannot be opened, the sheet cannot be
// read, or the sheet has fewer than two rows (a header plus at least one data
// row). Underlying errors are wrapped and remain reachable via errors.Is/As.
//
// NOTE(review): rows returned by excelize may differ in length; use getCell
// to index into a row safely — confirm against the excelize GetRows docs.
func (a *App) readExcelRaw(path string) ([][]string, error) {
	f, err := excelize.OpenFile(path)
	if err != nil {
		return nil, fmt.Errorf("打开 Excel 文件失败: %w", err)
	}
	defer f.Close()

	sheetName := f.GetSheetName(0)
	allRows, err := f.GetRows(sheetName)
	if err != nil {
		return nil, fmt.Errorf("读取工作表失败: %w", err)
	}
	if len(allRows) < 2 {
		return nil, fmt.Errorf("Excel 文件至少需要标题行和一条数据")
	}

	for _, row := range allRows {
		for j := range row {
			row[j] = strings.TrimSpace(row[j])
		}
	}
	return allRows, nil
}
// getCell returns the whitespace-trimmed cell at position idx of row. An
// index outside the row (negative or >= len(row)) yields the empty string
// instead of panicking.
func getCell(row []string, idx int) string {
	if idx >= 0 && idx < len(row) {
		return strings.TrimSpace(row[idx])
	}
	return ""
}
// ---------- 匹配准备 ----------
// matchPrep holds the intermediate results of match preparation, as built by
// prepareMatch: the loaded table data plus the effective (default-applied)
// matching parameters.
type matchPrep struct {
	// dataA and dataB are the data rows of tables A and B; prepareMatch
	// stores them with the header row already removed.
	dataA, dataB [][]string
	// reg is the compiled cleaning regex; nil means cleaning is skipped.
	reg *regexp.Regexp
	// timeWindow is the time window in hours (prepareMatch converts it with
	// time.Hour to obtain windowDuration).
	timeWindow float64
	// threshold is the effective match threshold.
	// NOTE(review): presumably a similarity score in (0, 1] — confirm at the use site.
	threshold float64
	// windowDuration is timeWindow expressed as a time.Duration.
	windowDuration time.Duration
	maxBRowsNoTime int // max number of table-B rows used for AI matching when there is no time column
}
// compileRegex compiles the user-supplied cleaning pattern.
//
// An empty pattern returns (nil, nil); callers treat a nil regex as "skip
// cleaning". An invalid pattern returns an error that wraps the regexp
// package's syntax error, so the cause stays reachable via errors.Is/As. On
// success the pattern is also echoed to standard output as a debug line.
func compileRegex(pattern string) (*regexp.Regexp, error) {
	if pattern == "" {
		return nil, nil
	}
	reg, err := regexp.Compile(pattern)
	if err != nil {
		return nil, fmt.Errorf("正则表达式格式错误,请检查: %w", err)
	}
	fmt.Printf("[DEBUG] 使用正则: '%s'\n", pattern)
	return reg, nil
}
// prepareMatch performs the setup steps of a match run: it compiles the
// cleaning regex, applies defaults to unset parameters, reads both input
// tables and records the headers and config on the App. According to the
// original comment it is shared by RunMatch and RunMatchWithAI.
//
// Non-positive TimeWindow, Threshold and MaxBRowsNoTime values are replaced
// by DefaultTimeWindowHours, DefaultThreshold and defaultMaxBNoTime
// respectively. The returned matchPrep holds the data rows of both tables
// with their header rows removed.
//
// It returns an error if the regex is invalid, either file cannot be read, or
// either table lacks a data row. File-read errors are wrapped so the
// underlying cause remains reachable via errors.Is/As.
func (a *App) prepareMatch(config MatchConfig) (*matchPrep, error) {
	reg, err := compileRegex(config.RegexPattern)
	if err != nil {
		return nil, err
	}

	tw := config.TimeWindow
	if tw <= 0 {
		tw = DefaultTimeWindowHours
	}
	th := config.Threshold
	if th <= 0 {
		th = DefaultThreshold
	}
	maxBRows := config.MaxBRowsNoTime
	if maxBRows <= 0 {
		maxBRows = defaultMaxBNoTime
	}

	a.emitProgress(0, 100, "正在读取 A 表...", "reading")
	rowsA, err := a.readRawRows(config.FileAPath)
	if err != nil {
		return nil, fmt.Errorf("读取 A 表失败: %w", err)
	}
	a.emitProgress(0, 100, "正在读取 B 表...", "reading")
	rowsB, err := a.readRawRows(config.FileBPath)
	if err != nil {
		return nil, fmt.Errorf("读取 B 表失败: %w", err)
	}

	// Guard the rowsX[0] / rowsX[1:] accesses below independently of what the
	// readers themselves enforce.
	if len(rowsA) < 2 {
		return nil, fmt.Errorf("A 表无有效数据行")
	}
	if len(rowsB) < 2 {
		return nil, fmt.Errorf("B 表无有效数据行")
	}

	// Headers and the config as passed in (before defaults) are stored under
	// dataMu.
	a.dataMu.Lock()
	a.headersA = rowsA[0]
	a.headersB = rowsB[0]
	a.lastConfig = config
	a.dataMu.Unlock()

	return &matchPrep{
		dataA:          rowsA[1:],
		dataB:          rowsB[1:],
		reg:            reg,
		timeWindow:     tw,
		threshold:      th,
		windowDuration: time.Duration(tw * float64(time.Hour)),
		maxBRowsNoTime: maxBRows,
	}, nil
}
// defaultMaxBNoTime is the default maximum number of table-B rows used when
// there is no time column; prepareMatch falls back to it when the configured
// MaxBRowsNoTime is not positive.
const defaultMaxBNoTime = 200
// defaultAIWindowPadH is the extra margin, in hours, added to the AI time
// window.
// NOTE(review): not referenced in this section of the file — confirm how it
// is applied at its use site.
const defaultAIWindowPadH = 3.0