- Merge remote improvements: generic AI API, row-level cache, CSV export, matchPrep, prompt truncation, O(1) cache index - Split app.go (1645 -> 5 files: app.go, cache.go, ai.go, matcher.go, export.go) - Remove V1 dead code: 6 methods, 4 helpers, ~300 lines - Fix AICache 3 bugs: TOCTOU saveToFile, silent loadFromFile, full-sort put - Extract 8 named constants (threshold, time window, batch size...) - Frontend: isRunning guard, buildMatchConfig dedup, CSS variables - Upgrade Go to 1.24.0
278 lines
6.5 KiB
Go
278 lines
6.5 KiB
Go
package main
|
||
|
||
import (
|
||
"encoding/csv"
|
||
"fmt"
|
||
"math"
|
||
"os"
|
||
"path/filepath"
|
||
"regexp"
|
||
"strings"
|
||
"time"
|
||
|
||
"github.com/xuri/excelize/v2"
|
||
)
|
||
|
||
// ---------- 健壮的时间解析 ----------
|
||
|
||
// timeFormats lists the layouts tried, in order, when parsing a timestamp
// cell; the first layout that parses wins. Per the original note, the variety
// covers the differing formats found in the monthly and daily reports.
//
// Layout elements "1" and "2" accept one- or two-digit months/days, so the
// non-padded layouts also accept padded input; the padded layouts are kept
// first so existing matches resolve exactly as before.
var timeFormats = []string{
	// Date and time, zero-padded month/day.
	"2006-01-02 15:04:05",
	"2006-01-02 15:04",
	"2006/01/02 15:04:05",
	"2006/01/02 15:04",
	// Date and time, non-padded month/day.
	"2006-1-2 15:04:05",
	"2006-1-2 15:04",
	"2006/1/2 15:04:05",
	"2006/1/2 15:04",
	// "T"-separated date and time.
	"2006-01-02T15:04:05",
	"2006/01/02T15:04:05",
	// Month-first dates with a time part.
	"01/02/2006 15:04",
	"1/2/2006 15:04",
	// Date only, zero-padded.
	"2006-01-02",
	"2006/01/02",
	// Date only, non-padded. Appended last so that values such as "2024-1-5"
	// parse just like their "2024-1-5 10:00" counterparts already do.
	"2006-1-2",
	"2006/1/2",
}
|
||
|
||
// parseTimeFlexible 使用多种格式尝试解析时间字符串
|
||
func parseTimeFlexible(timeStr string) (time.Time, error) {
|
||
timeStr = strings.TrimSpace(timeStr)
|
||
for _, format := range timeFormats {
|
||
if t, err := time.Parse(format, timeStr); err == nil {
|
||
return t, nil
|
||
}
|
||
}
|
||
return time.Time{}, fmt.Errorf("无法解析时间格式: %s", timeStr)
|
||
}
|
||
|
||
// ---------- Levenshtein 距离算法 ----------
|
||
|
||
// levenshteinDistance returns the edit distance between runes1 and runes2,
// counting insertions, deletions and substitutions as cost 1 each.
//
// Only a single row of the dynamic-programming table is kept, so the extra
// memory used is len(runes2)+1 ints.
func levenshteinDistance(runes1, runes2 []rune) int {
	rows, cols := len(runes1), len(runes2)

	// row[j] is the distance between the processed prefix of runes1 and
	// runes2[:j]; initially the prefix is empty, so the distance is j.
	row := make([]int, cols+1)
	for j := 0; j <= cols; j++ {
		row[j] = j
	}

	for i := 0; i < rows; i++ {
		// diag carries the previous row's value one column to the left.
		diag := row[0]
		row[0] = i + 1
		for j := 0; j < cols; j++ {
			above := row[j+1]

			// Substitution (free when the runes are equal).
			best := diag
			if runes1[i] != runes2[j] {
				best++
			}
			// Deletion from runes1.
			if del := above + 1; del < best {
				best = del
			}
			// Insertion into runes1.
			if ins := row[j] + 1; ins < best {
				best = ins
			}

			row[j+1] = best
			diag = above
		}
	}
	return row[cols]
}
|
||
|
||
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
||
|
||
// calcSimilarity 带自定义正则的相似度计算;reg 为 nil 时不做清洗直接比对
|
||
func calcSimilarity(s1, s2 string, reg *regexp.Regexp, caseSensitive bool) float64 {
|
||
clean1 := s1
|
||
clean2 := s2
|
||
if reg != nil {
|
||
clean1 = reg.ReplaceAllString(s1, "")
|
||
clean2 = reg.ReplaceAllString(s2, "")
|
||
}
|
||
if !caseSensitive {
|
||
clean1 = strings.ToLower(clean1)
|
||
clean2 = strings.ToLower(clean2)
|
||
}
|
||
|
||
r1 := []rune(clean1)
|
||
r2 := []rune(clean2)
|
||
|
||
if len(r1) == 0 && len(r2) == 0 {
|
||
return 1.0
|
||
}
|
||
if len(r1) == 0 || len(r2) == 0 {
|
||
return 0.0
|
||
}
|
||
|
||
dist := levenshteinDistance(r1, r2)
|
||
maxLen := math.Max(float64(len(r1)), float64(len(r2)))
|
||
return 1.0 - float64(dist)/maxLen
|
||
}
|
||
|
||
// similarityFromCleaned 基于已清洗字符串计算相似度(跳过 regex 步骤,避免重复清洗)
|
||
func similarityFromCleaned(clean1, clean2 string, caseSensitive bool) float64 {
|
||
if !caseSensitive {
|
||
clean1 = strings.ToLower(clean1)
|
||
clean2 = strings.ToLower(clean2)
|
||
}
|
||
r1 := []rune(clean1)
|
||
r2 := []rune(clean2)
|
||
if len(r1) == 0 && len(r2) == 0 {
|
||
return 1.0
|
||
}
|
||
if len(r1) == 0 || len(r2) == 0 {
|
||
return 0.0
|
||
}
|
||
dist := levenshteinDistance(r1, r2)
|
||
maxLen := math.Max(float64(len(r1)), float64(len(r2)))
|
||
return 1.0 - float64(dist)/maxLen
|
||
}
|
||
|
||
// cleanWithRegex removes every match of reg from input and returns the
// result. A nil reg means "no cleaning": input is returned unchanged.
func cleanWithRegex(input string, reg *regexp.Regexp) string {
	cleaned := input
	if reg != nil {
		cleaned = reg.ReplaceAllString(input, "")
	}
	return cleaned
}
|
||
|
||
// ---------- 文件读取(通用)----------
|
||
|
||
// readRawRows 读取 Excel/CSV 文件,返回原始二维字符串切片(row[0] = 表头)
|
||
func (a *App) readRawRows(path string) ([][]string, error) {
|
||
ext := strings.ToLower(filepath.Ext(path))
|
||
switch ext {
|
||
case ".csv":
|
||
return a.readCSVRaw(path)
|
||
default:
|
||
return a.readExcelRaw(path)
|
||
}
|
||
}
|
||
|
||
func (a *App) readCSVRaw(path string) ([][]string, error) {
|
||
f, err := os.Open(path)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("打开 CSV 文件失败: %v", err)
|
||
}
|
||
defer f.Close()
|
||
|
||
reader := csv.NewReader(f)
|
||
reader.LazyQuotes = true
|
||
reader.TrimLeadingSpace = true
|
||
allRows, err := reader.ReadAll()
|
||
if err != nil {
|
||
return nil, fmt.Errorf("读取 CSV 文件失败: %v", err)
|
||
}
|
||
if len(allRows) < 2 {
|
||
return nil, fmt.Errorf("CSV 文件至少需要标题行和一条数据")
|
||
}
|
||
for i := range allRows {
|
||
for j := range allRows[i] {
|
||
allRows[i][j] = strings.TrimSpace(allRows[i][j])
|
||
}
|
||
}
|
||
return allRows, nil
|
||
}
|
||
|
||
func (a *App) readExcelRaw(path string) ([][]string, error) {
|
||
f, err := excelize.OpenFile(path)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("打开 Excel 文件失败: %v", err)
|
||
}
|
||
defer f.Close()
|
||
|
||
sheetName := f.GetSheetName(0)
|
||
allRows, err := f.GetRows(sheetName)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("读取工作表失败: %v", err)
|
||
}
|
||
if len(allRows) < 2 {
|
||
return nil, fmt.Errorf("Excel 文件至少需要标题行和一条数据")
|
||
}
|
||
return allRows, nil
|
||
}
|
||
|
||
// getCell returns the whitespace-trimmed cell at position idx of row.
// An index outside the row (negative or past the end) yields "".
func getCell(row []string, idx int) string {
	if idx >= 0 && idx < len(row) {
		return strings.TrimSpace(row[idx])
	}
	return ""
}
|
||
|
||
// ---------- 匹配准备 ----------
|
||
|
||
// matchPrep holds the intermediate results of match preparation, as filled in
// by prepareMatch.
type matchPrep struct {
	// dataA holds the rows of table A with the header row removed.
	dataA [][]string
	// dataB holds the rows of table B with the header row removed.
	dataB [][]string
	// reg is the compiled cleaning pattern; nil means cleaning is skipped.
	reg *regexp.Regexp
	// timeWindow is the effective time window in hours.
	timeWindow float64
	// threshold is the effective similarity threshold.
	threshold float64
	// windowDuration is timeWindow expressed as a time.Duration.
	windowDuration time.Duration
}
|
||
|
||
// compileRegex compiles pattern into a regular expression.
//
// An empty pattern returns (nil, nil); a nil regexp means "skip cleaning".
// An invalid pattern returns an error that wraps the regexp package's error
// with %w, so callers can still reach the underlying syntax error. On success
// the pattern in use is printed to standard output as a debug line.
func compileRegex(pattern string) (*regexp.Regexp, error) {
	if pattern == "" {
		return nil, nil
	}
	reg, err := regexp.Compile(pattern)
	if err != nil {
		return nil, fmt.Errorf("正则表达式格式错误,请检查: %w", err)
	}
	fmt.Printf("[DEBUG] 使用正则: '%s'\n", pattern)
	return reg, nil
}
|
||
|
||
// prepareMatch 编译正则、读取文件、初始化默认值(RunMatch / RunMatchWithAI 共用)
|
||
func (a *App) prepareMatch(config MatchConfig) (*matchPrep, error) {
|
||
reg, err := compileRegex(config.RegexPattern)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
tw := config.TimeWindow
|
||
if tw <= 0 {
|
||
tw = DefaultTimeWindowHours
|
||
}
|
||
th := config.Threshold
|
||
if th <= 0 {
|
||
th = DefaultThreshold
|
||
}
|
||
|
||
a.emitProgress(0, 100, "正在读取 A 表...", "reading")
|
||
rowsA, err := a.readRawRows(config.FileAPath)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("读取 A 表失败: %v", err)
|
||
}
|
||
a.emitProgress(0, 100, "正在读取 B 表...", "reading")
|
||
rowsB, err := a.readRawRows(config.FileBPath)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("读取 B 表失败: %v", err)
|
||
}
|
||
if len(rowsA) < 2 {
|
||
return nil, fmt.Errorf("A 表无有效数据行")
|
||
}
|
||
if len(rowsB) < 2 {
|
||
return nil, fmt.Errorf("B 表无有效数据行")
|
||
}
|
||
|
||
a.dataMu.Lock()
|
||
a.headersA = rowsA[0]
|
||
a.headersB = rowsB[0]
|
||
a.lastConfig = config
|
||
a.dataMu.Unlock()
|
||
|
||
return &matchPrep{
|
||
dataA: rowsA[1:],
|
||
dataB: rowsB[1:],
|
||
reg: reg,
|
||
timeWindow: tw,
|
||
threshold: th,
|
||
windowDuration: time.Duration(tw * float64(time.Hour)),
|
||
}, nil
|
||
}
|
||
|
||
const (
	// defaultMaxBNoTime is the maximum number of table-B rows when there is
	// no time column.
	defaultMaxBNoTime = 200

	// defaultAIWindowPadH is the extra padding added to the AI time window,
	// in hours.
	defaultAIWindowPadH = 3.0
)
|