Files
office-data-matcher/app.go
sakuradairong 31a21d5364 fix: 修复审查发现的多个问题并补全开发环境
- 修复 MaxPreview=0 仍被覆盖为默认值的 bug
- 修复 API Endpoint 自动补全逻辑(避免 /v1/v1/chat/completions)
- 为 AI 配置与匹配状态字段增加并发锁
- AI 增强未匹配行改为按索引跟踪,避免重复行误判
- 无时间列时 AI 匹配 B 表行数可配置并增加截断警告
- 导出时防御参差不齐行导致的数组越界 panic
- Excel 读取时对单元格统一 TrimSpace
- 删除未使用的 minInt 函数
- 修复 wails.json 开发服务器地址为 http://localhost:5173
- 重新生成 Wails 前端绑定
- 新增 ai_test.go / export_test.go 单元测试
2026-06-23 20:55:32 +00:00

616 lines
18 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package main
import (
"context"
"encoding/json"
"fmt"
"math"
"path/filepath"
"sort"
"strings"
"sync"
"time"
"github.com/wailsapp/wails/v2/pkg/runtime"
)
// ---------- 常量 ----------
// Matching defaults and fixed AI request parameters.
const (
	// DefaultThreshold is the default similarity threshold for a match.
	DefaultThreshold = 0.65

	// DefaultTimeWindowHours is the default time window, in hours.
	DefaultTimeWindowHours = 12.0

	// DefaultBatchSize is the number of rows sent per AI batch call.
	DefaultBatchSize = 8

	// DefaultMaxPreview is the default number of rows previewed in the debug log.
	DefaultMaxPreview = 3

	// deepseekModel is the Deepseek model name.
	deepseekModel = "deepseek-chat"

	// deepseekTemperature is the AI temperature parameter.
	deepseekTemperature = 0.05

	// deepseekMaxTokens is the AI maximum token count.
	deepseekMaxTokens = 2048

	// cacheMaxSize is the maximum number of AI cache entries.
	cacheMaxSize = 500
)
// ---------- 数据结构 ----------
// MatchResult is one match record. It carries both the newer generic fields
// and the older fields kept for backward compatibility.
type MatchResult struct {
// New (generic) fields.
RowAData []string `json:"rowAData"` // all original columns of the table A row (new)
RowBKey string `json:"rowBKey"` // value of table B's match column (new)
ExtractValue string `json:"extractValue"` // target-column value extracted from table B (new)
// Legacy fields (backward compatibility).
MonthlyCellName string `json:"monthlyCellName"`
DailyCellID string `json:"dailyCellId"`
InterruptReason string `json:"interruptReason"`
// Common fields.
TimeDiff string `json:"timeDiff"`
SimilarityScore float64 `json:"similarityScore"`
AIMatched bool `json:"aiMatched"`
}
// ProgressPayload is the progress information sent with the "match-progress"
// event.
type ProgressPayload struct {
Current int `json:"current"`
Total int `json:"total"`
Message string `json:"message"`
Phase string `json:"phase"` // reading / matching / ai-enhancing / done
}
// MatchConfig is the complete matching configuration passed in by the frontend.
type MatchConfig struct {
// File paths.
FileAPath string `json:"fileAPath"`
FileBPath string `json:"fileBPath"`
// Table A column indices (-1 means the column is not used).
ColAMatchIndex int `json:"colAMatchIndex"` // table A match column
ColATimeIndex int `json:"colATimeIndex"` // table A time column (optional; -1 skips time pruning)
// Table B column indices.
ColBMatchIndex int `json:"colBMatchIndex"` // table B match column
ColBTimeIndex int `json:"colBTimeIndex"` // table B time column (optional; -1 skips time pruning)
ColBExtractIndex int `json:"colBExtractIndex"` // table B column whose value is extracted
// Cleaning and matching parameters.
RegexPattern string `json:"regexPattern"` // empty string = skip cleaning
TimeWindow float64 `json:"timeWindow"` // hours
Threshold float64 `json:"threshold"` // 0.0 - 1.0
// Extended options.
AllMatches bool `json:"allMatches"` // true = return every match (>= threshold) for an A row instead of only the best one
CaseSensitive bool `json:"caseSensitive"` // true = case-sensitive matching
SortBy string `json:"sortBy"` // "similarity" / "timeDiff" / "" = no sorting
MaxPreview int `json:"maxPreview"` // number of leading A rows whose comparison details go to the debug log; 0 = print none, negative = DefaultMaxPreview
MaxBRowsNoTime int `json:"maxBRowsNoTime"` // max table B rows used for AI matching when there is no time column; 0 = use the default (200 per the original note; applied outside this section)
ExportFormat string `json:"exportFormat"` // "xlsx" (default) / "csv"
IncludeHeader bool `json:"includeHeader"` // whether the export includes a header row
}
// AICacheInfo describes the AI cache state: its entry count and file path.
type AICacheInfo struct {
Count int `json:"count"`
FilePath string `json:"filePath"`
}
// ---------- App 结构体 ----------
// App is the application object. It holds the context saved by startup, the
// AI cache, the AI API settings and the state of the most recent match.
type App struct {
ctx context.Context
aiCache *AICache
// AI API settings; concurrent access must hold aiMu.
aiMu sync.RWMutex
apiKey string // AI API key (compatible with OpenAI / Deepseek / local models)
apiEndpoint string // API endpoint (default per the original note: https://api.deepseek.com/v1/chat/completions; applied outside this section)
apiModel string // model name (default per the original note: deepseek-chat; applied outside this section)
// Configuration and headers of the most recent match (used for export).
// NOTE(review): presumably guarded by dataMu; its use is outside this section.
dataMu sync.RWMutex
lastConfig MatchConfig
headersA []string
headersB []string
}
// NewApp returns a new App whose AI cache comes from newAICache. All other
// fields are left at their zero values.
func NewApp() *App {
	app := new(App)
	app.aiCache = newAICache()
	return app
}
// startup stores ctx on the App and logs the current AI cache entry count and
// cache file path.
func (a *App) startup(ctx context.Context) {
	a.ctx = ctx

	entries, cacheFile := a.aiCache.stat()
	fmt.Printf("[CACHE] AI 缓存已加载,当前 %d 条缓存记录 (文件: %s)\n", entries, cacheFile)
}
// emitProgress sends a "match-progress" event carrying a ProgressPayload to
// the frontend. It does nothing while no context has been stored yet.
func (a *App) emitProgress(current, total int, message, phase string) {
	if a.ctx != nil {
		payload := ProgressPayload{
			Current: current,
			Total:   total,
			Message: message,
			Phase:   phase,
		}
		runtime.EventsEmit(a.ctx, "match-progress", payload)
	}
}
// ---------- AI 配置 ----------
// SetDeepseekAPIKey stores the whitespace-trimmed Deepseek API key in memory
// only (kept for backward compatibility). A key that trims to empty clears the
// stored key. The returned string is a status message for the caller.
func (a *App) SetDeepseekAPIKey(key string) string {
	trimmed := strings.TrimSpace(key)

	a.aiMu.Lock()
	a.apiKey = trimmed
	a.aiMu.Unlock()

	if trimmed != "" {
		return "Deepseek API 密钥已设置"
	}
	return "已清除 Deepseek API 密钥"
}
// SetAIConfig updates the AI API configuration (endpoint, model, key) in one
// call.
//
// Each argument is trimmed of surrounding whitespace first. An argument that
// is empty after trimming leaves the corresponding stored value unchanged, so
// a whitespace-only value can no longer wipe an existing setting. The returned
// string is a status message showing the endpoint and model now in effect.
func (a *App) SetAIConfig(endpoint, model, key string) string {
	// Trim before the emptiness checks: checking the raw string let "   "
	// through and then stored "" over the previous value.
	endpoint = strings.TrimSpace(endpoint)
	model = strings.TrimSpace(model)
	key = strings.TrimSpace(key)

	a.aiMu.Lock()
	defer a.aiMu.Unlock()

	if endpoint != "" {
		a.apiEndpoint = endpoint
	}
	if model != "" {
		a.apiModel = model
	}
	if key != "" {
		a.apiKey = key
	}
	return fmt.Sprintf("AI 配置已更新 (端点=%s, 模型=%s)", a.apiEndpoint, a.apiModel)
}
// SetAPIKey stores the whitespace-trimmed AI API key in memory only. A key
// that trims to empty clears the stored key. The returned string is a status
// message for the caller.
func (a *App) SetAPIKey(key string) string {
	a.aiMu.Lock()
	defer a.aiMu.Unlock()

	if a.apiKey = strings.TrimSpace(key); a.apiKey != "" {
		return "AI API 密钥已设置"
	}
	return "已清除 AI API 密钥"
}
// GetDeepseekStatus reports whether an API key is currently stored.
func (a *App) GetDeepseekStatus() bool {
	a.aiMu.RLock()
	configured := len(a.apiKey) > 0
	a.aiMu.RUnlock()
	return configured
}
// GetAIStatus returns the AI API configuration state as a string map:
// "ready" is "true" or "false" depending on whether a key is stored, and
// "endpoint" / "model" carry the stored endpoint and model name.
func (a *App) GetAIStatus() map[string]string {
	a.aiMu.RLock()
	defer a.aiMu.RUnlock()

	hasKey := a.apiKey != ""
	status := make(map[string]string, 3)
	status["ready"] = fmt.Sprintf("%t", hasKey)
	status["endpoint"] = a.apiEndpoint
	status["model"] = a.apiModel
	return status
}
// ClearAICache clears the AI cache and returns a message stating how many
// entries the cache reported before it was cleared.
func (a *App) ClearAICache() string {
	removed, _ := a.aiCache.stat() // the cache file path is not needed here
	a.aiCache.clear()
	return fmt.Sprintf("已清除 %d 条 AI 缓存记录", removed)
}
// GetAICacheInfo returns the AI cache's entry count and file path as reported
// by the cache.
func (a *App) GetAICacheInfo() AICacheInfo {
	var info AICacheInfo
	info.Count, info.FilePath = a.aiCache.stat()
	return info
}
// ---------- 文件选择对话框 ----------
// OpenFileA opens the file dialog for choosing table A (the baseline table)
// and returns whatever openFileDialog returns.
func (a *App) OpenFileA() (string, error) {
	const dialogTitle = "选择 A 表文件(基准表)"
	return a.openFileDialog(dialogTitle)
}
// OpenFileB opens the file dialog for choosing table B (the data source table)
// and returns whatever openFileDialog returns.
func (a *App) OpenFileB() (string, error) {
	const dialogTitle = "选择 B 表文件(数据源表)"
	return a.openFileDialog(dialogTitle)
}
// openFileDialog shows a file-open dialog with the given title, filtered to
// *.xlsx, *.xls and *.csv files. It returns the dialog's result, or an empty
// string together with the error when the dialog call fails.
func (a *App) openFileDialog(title string) (string, error) {
	opts := runtime.OpenDialogOptions{
		Title: title,
		Filters: []runtime.FileFilter{{
			DisplayName: "Excel / CSV 文件 (*.xlsx, *.xls, *.csv)",
			Pattern:     "*.xlsx;*.xls;*.csv",
		}},
	}

	selected, err := runtime.OpenFileDialog(a.ctx, opts)
	if err != nil {
		return "", err
	}
	return selected, nil
}
// ParseHeaders reads the file and returns its first row, with every cell
// whitespace-trimmed, as the header list the frontend uses to render the
// column-mapping dropdowns.
//
// It returns an error when filePath is empty, when reading the rows fails, or
// when the file contains no rows. The header row is trimmed in place.
func (a *App) ParseHeaders(filePath string) ([]string, error) {
	if len(filePath) == 0 {
		return nil, fmt.Errorf("文件路径为空")
	}

	rows, readErr := a.readRawRows(filePath)
	switch {
	case readErr != nil:
		return nil, readErr
	case len(rows) == 0:
		return nil, fmt.Errorf("文件为空,无表头")
	}

	header := rows[0]
	for col, name := range header {
		header[col] = strings.TrimSpace(name)
	}
	fmt.Printf("[DEBUG] ParseHeaders: '%s' → %d 列 %v\n", filepath.Base(filePath), len(header), header)
	return header, nil
}
// ---------- 通用匹配引擎 ----------
// RunMatch takes a complete MatchConfig and runs the generic, column-index
// driven match: it prepares the data with prepareMatch, then returns the
// results and error from runMatchOnData. A preparation failure is returned
// with nil results.
func (a *App) RunMatch(config MatchConfig) ([]MatchResult, error) {
	prep, prepErr := a.prepareMatch(config)
	if prepErr != nil {
		return nil, prepErr
	}

	matches, _, matchErr := a.runMatchOnData(prep, config)
	return matches, matchErr
}
// runMatchOnData runs the similarity match on data already loaded into prep.
//
// Table B's match value (raw and regex-cleaned), parsed time and extract value
// are computed once per row up front. Each table A row with a non-empty match
// cell is then scored against every table B row whose cleaned match value is
// non-empty. When both rows carry a parseable time, pairs further apart than
// prep.windowDuration are skipped. Pairs scoring at least prep.threshold
// become results: all of them when config.AllMatches is set, otherwise only
// the highest-scoring one (the first one encountered wins ties).
//
// Progress is reported through emitProgress every 10 rows and on the last
// row, plus a final "done" event.
//
// It returns the results (ordered per config.SortBy when set), the set of
// table A row indices that produced at least one result (used by the AI
// enhancement stage to find unmatched rows), and an error that is always nil
// in this implementation.
func (a *App) runMatchOnData(prep *matchPrep, config MatchConfig) ([]MatchResult, map[int]bool, error) {
	useTime := config.ColATimeIndex >= 0 && config.ColBTimeIndex >= 0
	totalA := len(prep.dataA)
	totalB := len(prep.dataB)

	// Only a negative value falls back to the default; 0 disables the preview.
	maxPreview := config.MaxPreview
	if maxPreview < 0 {
		maxPreview = DefaultMaxPreview
	}

	// Precompute per-row table B values so the inner loop does no regex
	// replacement or time parsing.
	cleanedBMatch := make([]string, totalB)
	origBMatch := make([]string, totalB)
	parsedBTime := make([]time.Time, totalB)
	hasBTime := make([]bool, totalB)
	bExtractVal := make([]string, totalB)
	for bIdx, rowB := range prep.dataB {
		matchStrB := getCell(rowB, config.ColBMatchIndex)
		origBMatch[bIdx] = matchStrB
		if matchStrB != "" {
			cleanedBMatch[bIdx] = cleanWithRegex(matchStrB, prep.reg)
		}
		if useTime {
			if t, err := parseTimeFlexible(getCell(rowB, config.ColBTimeIndex)); err == nil {
				parsedBTime[bIdx] = t
				hasBTime[bIdx] = true
			}
		}
		bExtractVal[bIdx] = getCell(rowB, config.ColBExtractIndex)
	}

	var results []MatchResult
	matchedAIndices := make(map[int]bool)

	for i, rowA := range prep.dataA {
		if i%10 == 0 || i == totalA-1 {
			pct := (i + 1) * 100 / totalA
			a.emitProgress(i+1, totalA,
				fmt.Sprintf("匹配中 %d/%d (%d%%)...", i+1, totalA, pct), "matching")
		}

		matchStrA := getCell(rowA, config.ColAMatchIndex)
		if matchStrA == "" {
			continue
		}

		// hasTimeA can only become true when useTime is set.
		var timeA time.Time
		hasTimeA := false
		if useTime {
			if t, err := parseTimeFlexible(getCell(rowA, config.ColATimeIndex)); err == nil {
				timeA = t
				hasTimeA = true
			}
		}
		cleanA := cleanWithRegex(matchStrA, prep.reg)

		// Candidate matches for this A row. In best-only mode the slice holds
		// at most one entry and bestSimilarity is that entry's raw score.
		var candidates []MatchResult
		var bestSimilarity float64
		for bIdx := range prep.dataB {
			if cleanedBMatch[bIdx] == "" {
				continue
			}

			// Time pruning applies only when both rows have a parseable time;
			// otherwise the pair is compared with a zero time difference.
			var timeDiff time.Duration
			if hasTimeA && hasBTime[bIdx] {
				timeDiff = timeA.Sub(parsedBTime[bIdx])
				if timeDiff < -prep.windowDuration || timeDiff > prep.windowDuration {
					continue
				}
			}

			similarity := similarityFromCleaned(cleanA, cleanedBMatch[bIdx], config.CaseSensitive)
			if i < maxPreview {
				fmt.Printf("[DEBUG] | A[%d]='%s'→'%s' | B='%s'→'%s' | 相似度=%.4f\n",
					i, matchStrA, cleanA, origBMatch[bIdx], cleanedBMatch[bIdx], similarity)
			}
			if similarity < prep.threshold {
				continue
			}

			mr := MatchResult{
				RowAData:        rowA,
				RowBKey:         origBMatch[bIdx],
				ExtractValue:    bExtractVal[bIdx],
				TimeDiff:        formatTimeDiff(timeDiff),
				SimilarityScore: math.Round(similarity*10000) / 10000,
				AIMatched:       false,
			}
			switch {
			case config.AllMatches:
				candidates = append(candidates, mr)
			case len(candidates) == 0 || similarity > bestSimilarity:
				// Compare raw scores: SimilarityScore is rounded to 4 decimals,
				// so comparing against it could let a marginally lower raw
				// score displace the current best.
				candidates = []MatchResult{mr}
				bestSimilarity = similarity
			}
		}

		if len(candidates) == 0 {
			continue
		}
		if i < maxPreview {
			for _, c := range candidates {
				fmt.Printf("[DEBUG] ✓ 命中 | A='%s'→B='%s' | 相似度=%.4f\n",
					matchStrA, c.RowBKey, c.SimilarityScore)
			}
		}
		results = append(results, candidates...)
		matchedAIndices[i] = true
	}

	// Optional ordering. The stable sort keeps results with equal keys in
	// their original table A order.
	switch config.SortBy {
	case "similarity":
		sort.SliceStable(results, func(i, j int) bool {
			return results[i].SimilarityScore > results[j].SimilarityScore
		})
	case "timeDiff":
		sort.SliceStable(results, func(i, j int) bool {
			return parseTimeDiffDuration(results[i].TimeDiff) < parseTimeDiffDuration(results[j].TimeDiff)
		})
	}

	a.emitProgress(totalA, totalA,
		fmt.Sprintf("匹配完成!共匹配成功 %d 条记录", len(results)), "done")
	return results, matchedAIndices, nil
}
// RunMatchWithAI runs the basic match and then tries to fill in the table A
// rows it left unmatched by asking the AI API (configuration driven).
//
// Steps:
//  1. Run the basic match via runMatchOnData.
//  2. Collect the table A rows, by index, that produced no result.
//  3. Serve those rows from the row-level cache where possible, then send the
//     rest to the AI in batches of DefaultBatchSize. Each batch is paired with
//     the table B rows inside its time range (widened by the configured window
//     plus defaultAIWindowPadH hours), or with the first prep.maxBRowsNoTime
//     table B rows when the batch has no usable time.
//
// AI results are appended with AIMatched set, an empty RowBKey and a zero
// similarity score, and are written to the row-level cache. A batch whose API
// call or response parsing fails is logged, recorded and skipped; it does not
// fail the whole call. The final progress message lists the failed batches.
//
// It returns an error when no API key is set, or when preparation or the
// basic match fails.
func (a *App) RunMatchWithAI(config MatchConfig) ([]MatchResult, error) {
	a.aiMu.RLock()
	hasKey := a.apiKey != ""
	a.aiMu.RUnlock()
	if !hasKey {
		return nil, fmt.Errorf("请先设置 AI API 密钥")
	}

	prep, err := a.prepareMatch(config)
	if err != nil {
		return nil, err
	}

	// 1. Basic matching first.
	results, matchedAIndices, err := a.runMatchOnData(prep, config)
	if err != nil {
		return nil, err
	}

	// 2. Table A rows the basic match did not cover. Tracked by index so that
	// duplicate rows with identical content are not misjudged.
	var unmatchedA [][]string
	for i, row := range prep.dataA {
		if !matchedAIndices[i] {
			unmatchedA = append(unmatchedA, row)
		}
	}
	if len(unmatchedA) == 0 {
		a.emitProgress(1, 1, "全部已匹配,无需 AI 增强", "done")
		return results, nil
	}

	// 3. AI enhancement; the row-level cache is consulted first to reduce API
	// calls.
	useTime := config.ColATimeIndex >= 0 && config.ColBTimeIndex >= 0
	aiMatched := 0
	var failedBatches []int

	// rowCacheKey builds the row-level cache key from the row's match value
	// and, when time columns are in use, its time cell.
	rowCacheKey := func(row []string) string {
		matchVal := getCell(row, config.ColAMatchIndex)
		timeStr := ""
		if useTime {
			timeStr = getCell(row, config.ColATimeIndex)
		}
		return a.buildRowCacheKey(matchVal, timeStr, config)
	}

	// 3a. Rows found in the cache go straight into the results.
	var uncachedA [][]string
	cacheHits := 0
	for _, row := range unmatchedA {
		cachedVal, ok := a.aiCache.getRow(rowCacheKey(row))
		if !ok {
			uncachedA = append(uncachedA, row)
			continue
		}
		results = append(results, MatchResult{
			RowAData:        row,
			RowBKey:         "",
			ExtractValue:    cachedVal,
			SimilarityScore: 0,
			AIMatched:       true,
		})
		aiMatched++
		cacheHits++
	}
	if cacheHits > 0 {
		fmt.Printf("[CACHE] ✓ 行级缓存命中 %d 条,剩余 %d 条需 AI 处理\n", cacheHits, len(uncachedA))
	}
	if len(uncachedA) == 0 {
		a.emitProgress(1, 1,
			fmt.Sprintf("AI 增强完成!全部 %d 条命中缓存", cacheHits), "done")
		return results, nil
	}

	totalUnmatched := len(uncachedA)
	a.emitProgress(0, totalUnmatched,
		fmt.Sprintf("AI 增强匹配:%d 条命中缓存,%d 条需调用 AI...", cacheHits, totalUnmatched),
		"ai-enhancing")

	// Parse table B's time column once instead of once per batch. Rows whose
	// time cannot be parsed never qualify for a time-windowed batch, so they
	// are dropped here.
	type timedRow struct {
		row []string
		at  time.Time
	}
	var timedB []timedRow
	if useTime {
		timedB = make([]timedRow, 0, len(prep.dataB))
		for _, row := range prep.dataB {
			t, err := parseTimeFlexible(getCell(row, config.ColBTimeIndex))
			if err != nil {
				continue
			}
			timedB = append(timedB, timedRow{row: row, at: t})
		}
	}
	warnedTruncation := false // the truncation warning is logged only once

	for batchStart := 0; batchStart < totalUnmatched; batchStart += DefaultBatchSize {
		end := min(batchStart+DefaultBatchSize, totalUnmatched)
		batchNum := (batchStart / DefaultBatchSize) + 1
		a.emitProgress(batchStart+1, totalUnmatched,
			fmt.Sprintf("AI 分析中 %d/%d (第 %d 批)...", end, totalUnmatched, batchNum),
			"ai-enhancing")
		batch := uncachedA[batchStart:end]

		// Time range covered by this batch's table A rows. hasBatchTime can
		// only become true when useTime is set.
		var minTime, maxTime time.Time
		hasBatchTime := false
		if useTime {
			for _, row := range batch {
				t, err := parseTimeFlexible(getCell(row, config.ColATimeIndex))
				if err != nil {
					continue
				}
				if !hasBatchTime {
					minTime, maxTime = t, t
					hasBatchTime = true
					continue
				}
				if t.Before(minTime) {
					minTime = t
				}
				if t.After(maxTime) {
					maxTime = t
				}
			}
		}

		// Table B rows offered to the AI for this batch.
		var relevantB [][]string
		if hasBatchTime {
			// Configured window plus extra padding around the batch's span.
			padding := prep.windowDuration + time.Duration(defaultAIWindowPadH)*time.Hour
			ws := minTime.Add(-padding)
			we := maxTime.Add(padding)
			for _, tb := range timedB {
				if tb.at.Before(ws) || tb.at.After(we) {
					continue
				}
				relevantB = append(relevantB, tb.row)
			}
		} else {
			// Without a usable time, cap the table B rows to limit token use.
			if len(prep.dataB) > prep.maxBRowsNoTime && !warnedTruncation {
				fmt.Printf("[AI-WARN] 无时间列B 表共 %d 行AI 匹配仅取前 %d 行(可在高级设置调整)\n",
					len(prep.dataB), prep.maxBRowsNoTime)
				warnedTruncation = true
			}
			maxB := min(prep.maxBRowsNoTime, len(prep.dataB))
			relevantB = prep.dataB[:maxB]
		}

		// Build the prompt and call the AI API.
		prompt := a.buildGenericAIPrompt(batch, relevantB, config, prep.windowDuration, hasBatchTime)
		aiResp, err := a.callAIAPI(prompt)
		if err != nil {
			fmt.Printf("[AI-WARN] 第 %d 批 API 调用失败: %v\n", batchNum, err)
			failedBatches = append(failedBatches, batchNum)
			continue
		}

		// Parse the response. If the whole text is not valid JSON, retry on
		// the span from the first "{" to the last "}".
		var matchResp struct {
			Matches []struct {
				Index int    `json:"index"`
				Value string `json:"value"`
			} `json:"matches"`
		}
		parseErr := json.Unmarshal([]byte(aiResp), &matchResp)
		if parseErr != nil {
			if idx := strings.Index(aiResp, "{"); idx >= 0 {
				if endIdx := strings.LastIndex(aiResp, "}"); endIdx > idx {
					parseErr = json.Unmarshal([]byte(aiResp[idx:endIdx+1]), &matchResp)
				}
			}
		}
		if parseErr != nil {
			fmt.Printf("[AI-WARN] 响应解析失败 (第 %d 批): %s\n 原始响应: %.200s\n",
				batchNum, parseErr.Error(), aiResp)
			failedBatches = append(failedBatches, batchNum)
			continue
		}

		for _, item := range matchResp.Matches {
			idx := item.Index
			val := strings.TrimSpace(item.Value)
			// Ignore indices outside this batch and empty values.
			if idx < 0 || idx >= len(batch) || val == "" {
				continue
			}
			rowA := batch[idx]
			results = append(results, MatchResult{
				RowAData:        rowA,
				RowBKey:         "",
				ExtractValue:    val,
				SimilarityScore: 0,
				AIMatched:       true,
			})
			aiMatched++
			// Remember the answer in the row-level cache.
			a.aiCache.putRow(rowCacheKey(rowA), val)
		}
	}
	a.aiCache.saveToFile()

	// Completion message.
	msg := fmt.Sprintf("AI 增强完成!基础匹配 %d 条 + AI 补充 %d 条 = 共 %d 条",
		len(results)-aiMatched, aiMatched, len(results))
	if len(failedBatches) > 0 {
		msg += fmt.Sprintf("(警告:第 %v 批失败)", failedBatches)
	}
	a.emitProgress(totalUnmatched, totalUnmatched, msg, "done")
	return results, nil
}