Files
office-data-matcher/app.go
RainySY 40c3966e9a fix: 修复代码审查全部18项问题,重构导出与匹配引擎
A级(严重):
- ExportResults 支持 CSV 格式导出和 IncludeHeader 配置,使用实际表头名
- RunMatchWithAI 消除重复文件读取,提取 runMatchOnData() 内部函数
- AI 缓存文件权限收紧至 0600

B级(中等):
- 移除废弃代码约400行 (MonthlyReport/DailyReport/StartMatching/DeepseekEnhanceMatching)
- 替换自定义 parseCSVLine 为标准 encoding/csv
- GetAICacheInfo 返回命名结构体 AICacheInfo
- 时间差排序改为数值比较
- App.vue 提取 buildMatchConfig() 工厂函数消除配置重复
- AllMatches=false 时命中 1.0 相似度可提前结束 B 表循环

C级(轻微):
- 魔法数字提取为命名常量
- main.go 替换 println 为 log.Fatalf
- 清理未使用变量

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-11 14:27:15 +08:00

1239 lines
33 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package main
import (
"bytes"
"context"
"crypto/sha256"
"encoding/csv"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"math"
"net/http"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
"sync"
"time"
"github.com/wailsapp/wails/v2/pkg/runtime"
"github.com/xuri/excelize/v2"
)
// ---------- 数据结构 ----------
// MatchResult is a single match between a table-A row and a value taken
// from table B, as returned to the frontend.
type MatchResult struct {
// New, generalized fields.
RowAData []string `json:"rowAData"` // all original columns of the table-A row
RowBKey string `json:"rowBKey"` // value of the table-B match column (empty for AI matches)
ExtractValue string `json:"extractValue"` // target-column value extracted from table B
// Legacy fields, kept for backward compatibility.
MonthlyCellName string `json:"monthlyCellName"`
DailyCellID string `json:"dailyCellId"`
InterruptReason string `json:"interruptReason"`
// Common fields.
TimeDiff string `json:"timeDiff"` // A-time minus B-time, formatted by formatTimeDiff
SimilarityScore float64 `json:"similarityScore"` // similarity rounded to four decimals; 0 for AI matches
AIMatched bool `json:"aiMatched"` // true when the match came from the AI pass
}
// ProgressPayload is the payload of the "match-progress" event emitted to
// the frontend.
type ProgressPayload struct {
Current int `json:"current"`
Total int `json:"total"`
Message string `json:"message"`
Phase string `json:"phase"` // reading / matching / ai-enhancing / done
}
// MatchConfig is the complete matching configuration passed in by the frontend.
type MatchConfig struct {
// File paths.
FileAPath string `json:"fileAPath"`
FileBPath string `json:"fileBPath"`
// Table-A column indexes (-1 means "not used").
ColAMatchIndex int `json:"colAMatchIndex"` // table-A match column
ColATimeIndex int `json:"colATimeIndex"` // table-A time column (optional; -1 skips time pruning)
// Table-B column indexes.
ColBMatchIndex int `json:"colBMatchIndex"` // table-B match column
ColBTimeIndex int `json:"colBTimeIndex"` // table-B time column (optional; -1 skips time pruning)
ColBExtractIndex int `json:"colBExtractIndex"` // table-B target column to extract
// Cleaning and matching parameters.
RegexPattern string `json:"regexPattern"` // empty string = skip cleaning
TimeWindow float64 `json:"timeWindow"` // hours; values <= 0 fall back to defaultTimeWindowH
Threshold float64 `json:"threshold"` // 0.0 - 1.0; values <= 0 fall back to defaultThreshold
// Extended options.
AllMatches bool `json:"allMatches"` // true = return every match (>= threshold) for an A row instead of only the best
CaseSensitive bool `json:"caseSensitive"` // true = case-sensitive matching
SortBy string `json:"sortBy"` // "similarity" / "timeDiff" / "" = no sorting
// Number of leading A rows whose comparison details are printed to the debug log.
// NOTE(review): the original comment says 0 = print nothing, but runMatchOnData
// replaces values <= 0 with defaultMaxPreview — confirm which is intended.
MaxPreview int `json:"maxPreview"`
ExportFormat string `json:"exportFormat"` // "xlsx" (default) / "csv"
IncludeHeader bool `json:"includeHeader"` // whether the export includes a header row
}
// ---------- Deepseek API 类型 ----------
// deepseekMessage is one chat message (role + content) sent to the Deepseek
// chat-completions endpoint.
type deepseekMessage struct {
Role string `json:"role"`
Content string `json:"content"`
}
// deepseekRequest is the JSON request body posted by callDeepseekAPI.
type deepseekRequest struct {
Model string `json:"model"`
Messages []deepseekMessage `json:"messages"`
Temperature float64 `json:"temperature"`
MaxTokens int `json:"max_tokens,omitempty"`
}
// deepseekResponse holds the parts of the Deepseek response that
// callDeepseekAPI reads: the first choice's message content and the optional
// error message.
type deepseekResponse struct {
Choices []struct {
Message struct {
Content string `json:"content"`
} `json:"message"`
} `json:"choices"`
Error *struct {
Message string `json:"message"`
} `json:"error,omitempty"`
}
// ---------- AI 缓存 ----------
// AICacheInfo describes the AI cache for display in the frontend: the number
// of entries and the path of the backing file.
type AICacheInfo struct {
Count int `json:"count"`
FilePath string `json:"filePath"`
}
// AICacheEntry is a single cached AI response, keyed by the hash of the
// prompt that produced it.
type AICacheEntry struct {
PromptHash string `json:"promptHash"`
Response string `json:"response"`
CreatedAt int64 `json:"createdAt"` // Unix seconds; refreshed when the entry is overwritten
}
// AICache caches AI responses and persists them to a file in the OS temp
// directory.
type AICache struct {
Entries []AICacheEntry `json:"entries"`
mu sync.RWMutex // lowercase: must stay unexported so it is left out of JSON serialization
filePath string
maxSize int // maximum number of cached entries
}
// cacheFileName is the name of the cache file; newAICache joins it with
// os.TempDir() to form the full path.
const cacheFileName = "data-matcher-ai-cache.json"
// Default values used when the configuration leaves a setting unset.
const (
defaultThreshold = 0.65 // similarity threshold used when MatchConfig.Threshold <= 0
defaultTimeWindowH = 12.0 // time window in hours used when MatchConfig.TimeWindow <= 0
defaultBatchSize = 8 // number of unmatched A rows sent to the AI per request
defaultMaxPreview = 3 // number of leading A rows with debug output when MaxPreview <= 0
defaultMaxBNoTime = 200 // cap on B rows sent to the AI when no time filtering applies
defaultAIWindowPadH = 3.0 // extra hours added on each side of the AI time window
)
// newAICache builds a cache backed by a file in the OS temp directory,
// capped at 500 entries, and preloads whatever that file already contains.
func newAICache() *AICache {
	cache := new(AICache)
	cache.filePath = filepath.Join(os.TempDir(), cacheFileName)
	cache.maxSize = 500
	cache.loadFromFile()
	return cache
}
// loadFromFile loads the cache entries from disk.
//
// A missing or unreadable file leaves the cache untouched (empty on first
// use). The file is decoded into a temporary value and only copied into the
// cache on success, so a corrupt or truncated file can never leave
// partially decoded entries behind; the parse failure is logged instead of
// being swallowed silently.
func (c *AICache) loadFromFile() {
	data, err := os.ReadFile(c.filePath)
	if err != nil {
		return // file missing or unreadable: start with an empty cache
	}

	var decoded struct {
		Entries []AICacheEntry `json:"entries"`
	}
	if err := json.Unmarshal(data, &decoded); err != nil {
		fmt.Printf("[CACHE] 缓存文件解析失败,已忽略: %v\n", err)
		return
	}

	c.mu.Lock()
	defer c.mu.Unlock()
	c.Entries = decoded.Entries
}
// saveToFile serializes the cache under the read lock and writes it to the
// cache file with owner-only permissions. Failures are ignored on purpose:
// persistence is best-effort.
func (c *AICache) saveToFile() {
	payload, err := func() ([]byte, error) {
		c.mu.RLock()
		defer c.mu.RUnlock()
		return json.Marshal(c)
	}()
	if err == nil {
		_ = os.WriteFile(c.filePath, payload, 0600)
	}
}
// get looks up the cached response for hash. It returns the response and
// true on a hit, or "" and false on a miss.
func (c *AICache) get(hash string) (string, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	for _, entry := range c.Entries {
		if entry.PromptHash != hash {
			continue
		}
		return entry.Response, true
	}
	return "", false
}
// put stores response under hash while holding the write lock. An existing
// entry with the same hash is overwritten and its timestamp refreshed;
// otherwise a new entry is appended. When the cache grows past maxSize the
// entries are sorted newest-first and the oldest ones are dropped.
func (c *AICache) put(hash, response string) {
	c.mu.Lock()
	defer c.mu.Unlock()

	for idx := range c.Entries {
		existing := &c.Entries[idx]
		if existing.PromptHash != hash {
			continue
		}
		existing.Response = response
		existing.CreatedAt = time.Now().Unix()
		return
	}

	fresh := AICacheEntry{PromptHash: hash, Response: response, CreatedAt: time.Now().Unix()}
	c.Entries = append(c.Entries, fresh)
	if len(c.Entries) <= c.maxSize {
		return
	}

	// Over the limit: keep only the newest maxSize entries.
	sort.Slice(c.Entries, func(x, y int) bool {
		return c.Entries[y].CreatedAt < c.Entries[x].CreatedAt
	})
	c.Entries = c.Entries[:c.maxSize]
}
// clear drops every in-memory entry and removes the cache file; a failure
// to remove the file is ignored.
func (c *AICache) clear() {
	c.mu.Lock()
	c.Entries = nil
	_ = os.Remove(c.filePath)
	c.mu.Unlock()
}
// stat reports the number of cached entries and the path of the cache file.
func (c *AICache) stat() (count int, path string) {
	c.mu.RLock()
	count = len(c.Entries)
	path = c.filePath
	c.mu.RUnlock()
	return count, path
}
// ---------- App 结构体 ----------
// App is the application object whose exported methods are called by the
// frontend.
type App struct {
ctx context.Context // set by startup; used for events and dialogs
deepseekKey string // Deepseek API key, kept in memory only
aiCache *AICache
// Configuration and headers of the most recent match (used by export).
lastConfig MatchConfig
headersA []string
headersB []string
}
// NewApp returns an App whose AI cache has already been loaded from disk.
func NewApp() *App {
	app := new(App)
	app.aiCache = newAICache()
	return app
}
// startup stores the application context and logs how many AI cache
// entries were loaded and from which file.
func (a *App) startup(ctx context.Context) {
	a.ctx = ctx
	entries, file := a.aiCache.stat()
	fmt.Printf("[CACHE] AI 缓存已加载,当前 %d 条缓存记录 (文件: %s)\n", entries, file)
}
// emitProgress sends a "match-progress" event to the frontend. It does
// nothing while the application context has not been set.
func (a *App) emitProgress(current, total int, message, phase string) {
	if a.ctx != nil {
		payload := ProgressPayload{
			Current: current,
			Total:   total,
			Message: message,
			Phase:   phase,
		}
		runtime.EventsEmit(a.ctx, "match-progress", payload)
	}
}
// SetDeepseekAPIKey stores the trimmed Deepseek API key in memory only and
// returns a status message; an empty key clears the stored key.
func (a *App) SetDeepseekAPIKey(key string) string {
	trimmed := strings.TrimSpace(key)
	a.deepseekKey = trimmed
	if trimmed != "" {
		return "Deepseek API 密钥已设置"
	}
	return "已清除 Deepseek API 密钥"
}
// GetDeepseekStatus reports whether a Deepseek API key is currently set.
func (a *App) GetDeepseekStatus() bool {
	return len(a.deepseekKey) > 0
}
// ClearAICache empties the AI cache and returns a message stating how many
// entries it held before clearing.
func (a *App) ClearAICache() string {
	removed, _ := a.aiCache.stat()
	a.aiCache.clear()
	return fmt.Sprintf("已清除 %d 条 AI 缓存记录", removed)
}
// GetAICacheInfo returns the AI cache's entry count and file path.
func (a *App) GetAICacheInfo() AICacheInfo {
	var info AICacheInfo
	info.Count, info.FilePath = a.aiCache.stat()
	return info
}
// ---------- 文件选择对话框 ----------
// OpenMonthlyReport opens a file dialog for choosing the monthly report and
// returns the selected path. It delegates to openFileDialog so the file
// filters are defined in one place.
func (a *App) OpenMonthlyReport() (string, error) {
	return a.openFileDialog("选择月报文件")
}
// OpenDailyReport opens a file dialog for choosing the daily report and
// returns the selected path. It delegates to openFileDialog so the file
// filters are defined in one place.
func (a *App) OpenDailyReport() (string, error) {
	return a.openFileDialog("选择日报文件")
}
// OpenFileA opens a file dialog for choosing table A (the base table).
func (a *App) OpenFileA() (string, error) {
	const title = "选择 A 表文件(基准表)"
	return a.openFileDialog(title)
}
// OpenFileB opens a file dialog for choosing table B (the data-source table).
func (a *App) OpenFileB() (string, error) {
	const title = "选择 B 表文件(数据源表)"
	return a.openFileDialog(title)
}
// openFileDialog shows an open-file dialog with the given title, filtered
// to Excel and CSV files, and returns the selected path.
func (a *App) openFileDialog(title string) (string, error) {
	spreadsheetFilter := runtime.FileFilter{
		DisplayName: "Excel / CSV 文件 (*.xlsx, *.xls, *.csv)",
		Pattern:     "*.xlsx;*.xls;*.csv",
	}
	options := runtime.OpenDialogOptions{
		Title:   title,
		Filters: []runtime.FileFilter{spreadsheetFilter},
	}
	selected, err := runtime.OpenFileDialog(a.ctx, options)
	if err != nil {
		return "", err
	}
	return selected, nil
}
// ParseHeaders reads the first row of the file and returns it as the list
// of header names, each trimmed of surrounding whitespace. The frontend
// uses it to render the column-mapping drop-downs.
func (a *App) ParseHeaders(filePath string) ([]string, error) {
	if len(filePath) == 0 {
		return nil, fmt.Errorf("文件路径为空")
	}

	rows, err := a.readRawRows(filePath)
	switch {
	case err != nil:
		return nil, err
	case len(rows) == 0:
		return nil, fmt.Errorf("文件为空,无表头")
	}

	headerRow := rows[0]
	for idx, name := range headerRow {
		headerRow[idx] = strings.TrimSpace(name)
	}
	fmt.Printf("[DEBUG] ParseHeaders: '%s' → %d 列 %v\n", filepath.Base(filePath), len(headerRow), headerRow)
	return headerRow, nil
}
// nonChineseRegex matches every run of characters that are not in the
// Unicode Han script.
var nonChineseRegex = regexp.MustCompile(`[^\p{Han}]+`)
// CleanString removes every non-Han character from input, leaving only the
// Chinese characters.
func (a *App) CleanString(input string) string {
	hanOnly := nonChineseRegex.ReplaceAllString(input, "")
	return hanOnly
}
// ---------- 健壮的时间解析 ----------
// timeFormats lists the layouts parseTimeFlexible tries, in order. They
// cover the differing time formats of the monthly and daily reports.
var timeFormats = []string{
"2006-01-02 15:04:05",
"2006-01-02 15:04",
"2006/01/02 15:04:05",
"2006/01/02 15:04",
"2006-1-2 15:04:05",
"2006-1-2 15:04",
"2006/1/2 15:04:05",
"2006/1/2 15:04",
"2006-01-02T15:04:05",
"2006/01/02T15:04:05",
"01/02/2006 15:04", // month/day/year
"1/2/2006 15:04", // month/day/year without zero padding
"2006-01-02",
"2006/01/02",
}
// parseTimeFlexible trims timeStr and tries each layout in timeFormats in
// order, returning the first successful parse. It returns an error when no
// layout matches.
func parseTimeFlexible(timeStr string) (time.Time, error) {
	trimmed := strings.TrimSpace(timeStr)
	for i := 0; i < len(timeFormats); i++ {
		parsed, err := time.Parse(timeFormats[i], trimmed)
		if err != nil {
			continue
		}
		return parsed, nil
	}
	return time.Time{}, fmt.Errorf("无法解析时间格式: %s", trimmed)
}
// ---------- Levenshtein 距离算法 ----------
// levenshteinDistance returns the edit distance between s1 and s2, counted
// in runes (insertions, deletions and substitutions each cost 1). It keeps
// a single row of the dynamic-programming table to save memory.
func levenshteinDistance(s1, s2 string) int {
	left, right := []rune(s1), []rune(s2)

	row := make([]int, len(right)+1)
	for col := 0; col <= len(right); col++ {
		row[col] = col
	}

	for r, lc := range left {
		diag := row[0] // value of the cell up-left of the one being filled
		row[0] = r + 1
		for c, rc := range right {
			above := row[c+1]

			best := above + 1 // deletion
			if ins := row[c] + 1; ins < best {
				best = ins // insertion
			}
			subst := diag
			if lc != rc {
				subst++
			}
			if subst < best {
				best = subst // substitution or match
			}

			row[c+1] = best
			diag = above
		}
	}
	return row[len(right)]
}
// min returns the smaller of a and b. This package-level declaration takes
// precedence over the built-in min of newer Go versions within this package.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
// CalculateSimilarity returns the similarity of s1 and s2 after stripping
// all non-Han characters, based on the normalized Levenshtein distance and
// compared case-insensitively.
func (a *App) CalculateSimilarity(s1, s2 string) float64 {
	const caseSensitive = false
	return calcSimilarity(s1, s2, nonChineseRegex, caseSensitive)
}
// calcSimilarity computes the similarity of s1 and s2 as
// 1 - distance/maxLen, where distance is the Levenshtein distance and
// maxLen the rune length of the longer cleaned string.
//
// When reg is non-nil, everything it matches is removed from both strings
// first; when it is nil the strings are compared as given. Unless
// caseSensitive is true both strings are lower-cased. Two empty strings
// score 1.0; exactly one empty string scores 0.0.
func calcSimilarity(s1, s2 string, reg *regexp.Regexp, caseSensitive bool) float64 {
	left, right := s1, s2
	if reg != nil {
		left, right = reg.ReplaceAllString(left, ""), reg.ReplaceAllString(right, "")
	}
	if !caseSensitive {
		left, right = strings.ToLower(left), strings.ToLower(right)
	}

	lenLeft, lenRight := len([]rune(left)), len([]rune(right))
	switch {
	case lenLeft == 0 && lenRight == 0:
		return 1.0
	case lenLeft == 0 || lenRight == 0:
		return 0.0
	}

	longest := lenLeft
	if lenRight > longest {
		longest = lenRight
	}
	edits := levenshteinDistance(left, right)
	return 1.0 - float64(edits)/float64(longest)
}
// cleanWithRegex removes everything reg matches from input. A nil reg
// means "no cleaning" and returns input unchanged.
func cleanWithRegex(input string, reg *regexp.Regexp) string {
	if reg != nil {
		return reg.ReplaceAllString(input, "")
	}
	return input
}
// ---------- 文件读取(通用)----------
// readRawRows reads an Excel or CSV file and returns its raw cells as a
// two-dimensional string slice (row 0 is the header). Files with a ".csv"
// extension (case-insensitive) go to the CSV reader; everything else is
// treated as Excel.
func (a *App) readRawRows(path string) ([][]string, error) {
	if strings.ToLower(filepath.Ext(path)) == ".csv" {
		return a.readCSVRaw(path)
	}
	return a.readExcelRaw(path)
}
// readCSVRaw reads a CSV file and returns all rows with every cell trimmed
// of surrounding whitespace. Row 0 is the header; at least one data row is
// required.
//
// A leading UTF-8 byte-order mark is stripped before parsing. Excel writes
// one, and so does this application's own CSV export, and without this
// step it would become part of the first header cell. Underlying errors
// are wrapped with %w so callers can inspect them.
func (a *App) readCSVRaw(path string) ([][]string, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("打开 CSV 文件失败: %w", err)
	}
	data = bytes.TrimPrefix(data, []byte{0xEF, 0xBB, 0xBF})

	reader := csv.NewReader(bytes.NewReader(data))
	reader.LazyQuotes = true
	reader.TrimLeadingSpace = true
	allRows, err := reader.ReadAll()
	if err != nil {
		return nil, fmt.Errorf("读取 CSV 文件失败: %w", err)
	}
	if len(allRows) < 2 {
		return nil, fmt.Errorf("CSV 文件至少需要标题行和一条数据")
	}
	for i := range allRows {
		for j := range allRows[i] {
			allRows[i][j] = strings.TrimSpace(allRows[i][j])
		}
	}
	return allRows, nil
}
// readExcelRaw opens an Excel workbook and returns all rows of its first
// sheet. Row 0 is the header; at least one data row is required.
func (a *App) readExcelRaw(path string) ([][]string, error) {
	workbook, err := excelize.OpenFile(path)
	if err != nil {
		return nil, fmt.Errorf("打开 Excel 文件失败: %v", err)
	}
	defer workbook.Close()

	firstSheet := workbook.GetSheetName(0)
	rows, err := workbook.GetRows(firstSheet)
	switch {
	case err != nil:
		return nil, fmt.Errorf("读取工作表失败: %v", err)
	case len(rows) < 2:
		return nil, fmt.Errorf("Excel 文件至少需要标题行和一条数据")
	}
	return rows, nil
}
// getCell returns the whitespace-trimmed cell at idx, or "" when idx is
// negative or beyond the end of row.
func getCell(row []string, idx int) string {
	if idx >= 0 && idx < len(row) {
		return strings.TrimSpace(row[idx])
	}
	return ""
}
// RunMatch performs the generic, column-index-driven match described by
// config: it compiles the cleaning regex, applies defaults for the time
// window and threshold, reads both tables, remembers headers and config
// for a later export, and hands the data rows to runMatchOnData.
func (a *App) RunMatch(config MatchConfig) ([]MatchResult, error) {
	// Compile the cleaning regex (nil when no pattern is configured).
	pattern, err := compileRegex(config.RegexPattern)
	if err != nil {
		return nil, err
	}

	// Fall back to defaults for unset or non-positive parameters.
	windowHours := config.TimeWindow
	if windowHours <= 0 {
		windowHours = defaultTimeWindowH
	}
	minScore := config.Threshold
	if minScore <= 0 {
		minScore = defaultThreshold
	}

	// Read the raw tables.
	a.emitProgress(0, 100, "正在读取 A 表...", "reading")
	tableA, err := a.readRawRows(config.FileAPath)
	if err != nil {
		return nil, fmt.Errorf("读取 A 表失败: %v", err)
	}
	a.emitProgress(0, 100, "正在读取 B 表...", "reading")
	tableB, err := a.readRawRows(config.FileBPath)
	if err != nil {
		return nil, fmt.Errorf("读取 B 表失败: %v", err)
	}
	switch {
	case len(tableA) < 2:
		return nil, fmt.Errorf("A 表无有效数据行")
	case len(tableB) < 2:
		return nil, fmt.Errorf("B 表无有效数据行")
	}

	// Remember headers and config for the export step.
	a.headersA, a.headersB = tableA[0], tableB[0]
	a.lastConfig = config

	window := time.Duration(windowHours * float64(time.Hour))
	return a.runMatchOnData(tableA[1:], tableB[1:], pattern, config, windowHours, minScore, window)
}
// runMatchOnData runs the similarity match on rows that have already been
// read (internal helper; avoids re-reading the files).
//
// For every table-A row with a non-empty match cell it compares the cleaned
// match value against every table-B row. When both time columns are
// configured and the A row's time parses, B rows whose time is unparsable
// or outside ±windowDuration are skipped. Rows scoring >= threshold become
// candidates: all of them when config.AllMatches is set, otherwise only the
// best one (the scan stops early on a perfect 1.0 score). The results are
// optionally sorted according to config.SortBy.
//
// The fifth parameter (the time window in hours) is unused; windowDuration
// carries the same information.
//
// Compared with the naive nested loop, the per-row work on table B
// (match cell, cleaned value, parsed time) is done once up front instead
// of once per A row, and the best candidate is tracked by its raw
// similarity rather than the rounded score, so a slightly worse match can
// no longer replace a better one because of rounding.
func (a *App) runMatchOnData(dataA, dataB [][]string, reg *regexp.Regexp, config MatchConfig, _, threshold float64, windowDuration time.Duration) ([]MatchResult, error) {
	useTime := config.ColATimeIndex >= 0 && config.ColBTimeIndex >= 0
	totalA := len(dataA)
	useAllMatches := config.AllMatches
	maxPreview := config.MaxPreview
	if maxPreview <= 0 {
		maxPreview = defaultMaxPreview
	}

	// Precompute everything about table B that does not depend on the A row.
	type preparedB struct {
		row      []string
		matchStr string
		clean    string
		when     time.Time
		hasTime  bool
	}
	prepared := make([]preparedB, 0, len(dataB))
	for _, rowB := range dataB {
		matchStrB := getCell(rowB, config.ColBMatchIndex)
		if matchStrB == "" {
			continue
		}
		cleanB := cleanWithRegex(matchStrB, reg)
		if cleanB == "" {
			continue // nothing left to compare after cleaning
		}
		pb := preparedB{row: rowB, matchStr: matchStrB, clean: cleanB}
		if useTime {
			if t, err := parseTimeFlexible(getCell(rowB, config.ColBTimeIndex)); err == nil {
				pb.when, pb.hasTime = t, true
			}
		}
		prepared = append(prepared, pb)
	}

	var results []MatchResult
	for i, rowA := range dataA {
		if i%10 == 0 || i == totalA-1 {
			pct := (i + 1) * 100 / totalA
			a.emitProgress(i+1, totalA,
				fmt.Sprintf("匹配中 %d/%d (%d%%)...", i+1, totalA, pct), "matching")
		}

		matchStrA := getCell(rowA, config.ColAMatchIndex)
		if matchStrA == "" {
			continue
		}
		cleanA := cleanWithRegex(matchStrA, reg)
		if cleanA == "" {
			continue // an empty cleaned key can never match any B row
		}

		var timeA time.Time
		hasTimeA := false
		if useTime {
			if t, err := parseTimeFlexible(getCell(rowA, config.ColATimeIndex)); err == nil {
				timeA, hasTimeA = t, true
			}
		}

		// Collect every candidate match for this A row.
		var candidates []MatchResult
		bestSimilarity := 0.0 // raw (unrounded) score of candidates[0] in best-only mode
		for k := range prepared {
			b := &prepared[k]

			var timeDiff time.Duration
			if hasTimeA {
				if !b.hasTime {
					continue
				}
				td := timeA.Sub(b.when)
				if td < -windowDuration || td > windowDuration {
					continue
				}
				timeDiff = td
			}

			// Both values are already cleaned, so no regex is passed here;
			// the result equals calcSimilarity(matchStrA, matchStrB, reg, ...).
			similarity := calcSimilarity(cleanA, b.clean, nil, config.CaseSensitive)
			if i < maxPreview {
				fmt.Printf("[DEBUG] | A[%d]='%s'→'%s' | B='%s'→'%s' | 相似度=%.4f\n",
					i, matchStrA, cleanA, b.matchStr, b.clean, similarity)
			}
			if similarity < threshold {
				continue
			}

			mr := MatchResult{
				RowAData:        rowA,
				RowBKey:         b.matchStr,
				ExtractValue:    getCell(b.row, config.ColBExtractIndex),
				TimeDiff:        formatTimeDiff(timeDiff),
				SimilarityScore: math.Round(similarity*10000) / 10000,
				AIMatched:       false,
			}
			if useAllMatches {
				candidates = append(candidates, mr)
				continue
			}
			if len(candidates) == 0 || similarity > bestSimilarity {
				candidates = []MatchResult{mr}
				bestSimilarity = similarity
			}
			// A perfect match cannot be beaten: stop scanning table B.
			if similarity == 1.0 {
				break
			}
		}

		if len(candidates) > 0 {
			if i < maxPreview {
				for _, c := range candidates {
					fmt.Printf("[DEBUG] ✓ 命中 | A='%s'→B='%s' | 相似度=%.4f\n",
						matchStrA, c.RowBKey, c.SimilarityScore)
				}
			}
			results = append(results, candidates...)
		}
	}

	// Sort the results.
	switch config.SortBy {
	case "similarity":
		sort.Slice(results, func(i, j int) bool {
			return results[i].SimilarityScore > results[j].SimilarityScore
		})
	case "timeDiff":
		// Parse the formatted time difference back so the order is numeric.
		sort.Slice(results, func(i, j int) bool {
			return parseTimeDiffDuration(results[i].TimeDiff) < parseTimeDiffDuration(results[j].TimeDiff)
		})
	}

	a.emitProgress(totalA, totalA,
		fmt.Sprintf("匹配完成!共匹配成功 %d 条记录", len(results)), "done")
	return results, nil
}
// compileRegex compiles pattern. An empty pattern yields (nil, nil), which
// callers treat as "skip cleaning". A successful compile is logged; a
// failed one is returned as an error.
func compileRegex(pattern string) (*regexp.Regexp, error) {
	if len(pattern) == 0 {
		return nil, nil
	}
	compiled, compileErr := regexp.Compile(pattern)
	if compileErr == nil {
		fmt.Printf("[DEBUG] 使用正则: '%s'\n", pattern)
		return compiled, nil
	}
	return nil, fmt.Errorf("正则表达式格式错误,请检查: %v", compileErr)
}
// parseTimeDiffDuration converts a TimeDiff string such as "1h30m" or
// "-2m5s" back into a time.Duration so results can be sorted numerically.
// An empty or unparsable string yields 0.
func parseTimeDiffDuration(s string) time.Duration {
	if len(s) == 0 {
		return 0
	}
	negative := strings.HasPrefix(s, "-")
	if negative {
		s = s[1:]
	}
	parsed, err := time.ParseDuration(s)
	switch {
	case err != nil:
		return 0
	case negative:
		return -parsed
	}
	return parsed
}
// RunMatchWithAI runs the basic similarity match and then asks Deepseek to
// match the table-A rows the basic pass left unmatched (configuration
// driven).
//
// It requires an API key. Both tables are read once and shared between the
// basic pass and the AI pass. Unmatched A rows are sent to the AI in
// batches of defaultBatchSize; when time columns are in use, each batch
// only sees B rows within the configured window plus defaultAIWindowPadH
// hours around the batch's time span, otherwise at most defaultMaxBNoTime
// B rows. AI matches are appended with AIMatched set, an empty RowBKey and
// a similarity score of 0.
//
// A batch whose API call or response parsing fails is skipped, and both
// kinds of failure are logged (previously an API error was dropped without
// a trace). Table-B timestamps are parsed once rather than once per batch.
func (a *App) RunMatchWithAI(config MatchConfig) ([]MatchResult, error) {
	if a.deepseekKey == "" {
		return nil, fmt.Errorf("请先设置 Deepseek API 密钥")
	}

	// 1. Compile the cleaning regex.
	reg, err := compileRegex(config.RegexPattern)
	if err != nil {
		return nil, err
	}

	// 2. Fall back to defaults for unset parameters.
	timeWindow := config.TimeWindow
	if timeWindow <= 0 {
		timeWindow = defaultTimeWindowH
	}
	threshold := config.Threshold
	if threshold <= 0 {
		threshold = defaultThreshold
	}
	windowDuration := time.Duration(timeWindow * float64(time.Hour))

	// 3. Read the data once, for both the basic match and the AI pass.
	a.emitProgress(0, 100, "正在读取 A 表...", "reading")
	rowsA, err := a.readRawRows(config.FileAPath)
	if err != nil {
		return nil, fmt.Errorf("读取 A 表失败: %v", err)
	}
	a.emitProgress(0, 100, "正在读取 B 表...", "reading")
	rowsB, err := a.readRawRows(config.FileBPath)
	if err != nil {
		return nil, fmt.Errorf("读取 B 表失败: %v", err)
	}
	if len(rowsA) < 2 || len(rowsB) < 2 {
		return nil, fmt.Errorf("数据表无有效数据行")
	}

	// Remember headers and config for the export step.
	a.headersA = rowsA[0]
	a.headersB = rowsB[0]
	a.lastConfig = config
	dataA := rowsA[1:]
	dataB := rowsB[1:]

	// 4. Basic match on the rows already in memory.
	results, err := a.runMatchOnData(dataA, dataB, reg, config, timeWindow, threshold, windowDuration)
	if err != nil {
		return nil, err
	}

	// 5. Find the A rows the basic match did not cover. Rows are identified
	// by their full content joined with NUL separators.
	matchedSet := make(map[string]bool, len(results))
	for _, r := range results {
		matchedSet[strings.Join(r.RowAData, "\x00")] = true
	}
	var unmatchedA [][]string
	for _, row := range dataA {
		if !matchedSet[strings.Join(row, "\x00")] {
			unmatchedA = append(unmatchedA, row)
		}
	}
	if len(unmatchedA) == 0 {
		a.emitProgress(1, 1, "全部已匹配,无需 AI 增强", "done")
		return results, nil
	}

	// 6. AI-enhanced matching.
	useTime := config.ColATimeIndex >= 0 && config.ColBTimeIndex >= 0
	totalUnmatched := len(unmatchedA)
	a.emitProgress(0, totalUnmatched,
		fmt.Sprintf("AI 增强匹配:还有 %d 条未匹配记录,正在调用 Deepseek...", totalUnmatched),
		"ai-enhancing")

	// Parse the table-B timestamps once; rows with unparsable times are
	// never eligible for a time-filtered batch.
	type timedRow struct {
		row []string
		at  time.Time
	}
	var timedB []timedRow
	if useTime {
		timedB = make([]timedRow, 0, len(dataB))
		for _, row := range dataB {
			t, err := parseTimeFlexible(getCell(row, config.ColBTimeIndex))
			if err != nil {
				continue
			}
			timedB = append(timedB, timedRow{row: row, at: t})
		}
	}
	// User-configured window plus extra slack to cover the batch's time span.
	padding := windowDuration + time.Duration(defaultAIWindowPadH)*time.Hour

	aiMatched := 0
	for batchStart := 0; batchStart < totalUnmatched; batchStart += defaultBatchSize {
		end := batchStart + defaultBatchSize
		if end > totalUnmatched {
			end = totalUnmatched
		}
		batchNo := (batchStart / defaultBatchSize) + 1
		a.emitProgress(batchStart+1, totalUnmatched,
			fmt.Sprintf("AI 分析中 %d/%d (第 %d 批)...", end, totalUnmatched, batchNo),
			"ai-enhancing")
		batch := unmatchedA[batchStart:end]

		// Determine the time range covered by this batch of A rows.
		var minTime, maxTime time.Time
		hasBatchTime := false
		if useTime {
			for _, row := range batch {
				t, err := parseTimeFlexible(getCell(row, config.ColATimeIndex))
				if err != nil {
					continue
				}
				if !hasBatchTime {
					minTime, maxTime = t, t
					hasBatchTime = true
					continue
				}
				if t.Before(minTime) {
					minTime = t
				}
				if t.After(maxTime) {
					maxTime = t
				}
			}
		}

		// Select the B rows to show to the AI.
		var relevantB [][]string
		if hasBatchTime && useTime {
			ws := minTime.Add(-padding)
			we := maxTime.Add(padding)
			for _, tr := range timedB {
				if tr.at.Before(ws) || tr.at.After(we) {
					continue
				}
				relevantB = append(relevantB, tr.row)
			}
		} else {
			// Without time filtering, cap the B rows to limit token usage.
			maxB := defaultMaxBNoTime
			if len(dataB) < maxB {
				maxB = len(dataB)
			}
			relevantB = dataB[:maxB]
		}

		// Build the prompt and call the AI.
		prompt := a.buildGenericAIPrompt(batch, relevantB, config, windowDuration, hasBatchTime)
		aiResp, err := a.callDeepseekAPI(prompt)
		if err != nil {
			fmt.Printf("[AI-WARN] 调用 Deepseek 失败 (第 %d 批): %v\n", batchNo, err)
			continue
		}

		// Parse the AI response; if it is not pure JSON, retry on the
		// outermost {...} span.
		var matchResp struct {
			Matches []struct {
				Index int    `json:"index"`
				Value string `json:"value"`
			} `json:"matches"`
		}
		parseErr := json.Unmarshal([]byte(aiResp), &matchResp)
		if parseErr != nil {
			if idx := strings.Index(aiResp, "{"); idx >= 0 {
				if endIdx := strings.LastIndex(aiResp, "}"); endIdx > idx {
					parseErr = json.Unmarshal([]byte(aiResp[idx:endIdx+1]), &matchResp)
				}
			}
		}
		if parseErr != nil {
			fmt.Printf("[AI-WARN] 响应解析失败 (第 %d 批): %s\n 原始响应: %.200s\n",
				batchNo, parseErr.Error(), aiResp)
			continue
		}

		for _, item := range matchResp.Matches {
			idx := item.Index
			val := strings.TrimSpace(item.Value)
			if idx < 0 || idx >= len(batch) || val == "" {
				continue
			}
			results = append(results, MatchResult{
				RowAData:        batch[idx],
				RowBKey:         "",
				ExtractValue:    val,
				SimilarityScore: 0,
				AIMatched:       true,
			})
			aiMatched++
		}
	}

	a.emitProgress(totalUnmatched, totalUnmatched,
		fmt.Sprintf("AI 增强完成!基础匹配 %d 条 + AI 补充 %d 条 = 共 %d 条",
			len(results)-aiMatched, aiMatched, len(results)), "done")
	return results, nil
}
// buildGenericAIPrompt assembles the system and user messages for one AI
// matching request. The user message lists the matching rules, the
// required JSON answer format, the unmatched table-A rows (with their batch
// index) and the table-B reference rows with their target-column values.
// Time information is included only when hasTime is true.
func (a *App) buildGenericAIPrompt(unmatched, bRows [][]string, config MatchConfig, windowDuration time.Duration, hasTime bool) []deepseekMessage {
	var prompt strings.Builder

	// Task description and rules.
	prompt.WriteString("你是一个数据匹配专家。请根据以下 A 表记录,从 B 表数据中找出最匹配的记录。\n\n")
	prompt.WriteString("匹配规则:\n")
	prompt.WriteString("1. 根据文本相似度匹配(注意中文字段的核心含义,忽略字母数字前缀后缀)\n")
	if hasTime {
		fmt.Fprintf(&prompt, "2. 时间差应在 %.0f 小时内\n", windowDuration.Hours())
	}
	fmt.Fprintf(&prompt, "3. 返回匹配到的 B 表记录的目标列值(第 %d 列)\n\n", config.ColBExtractIndex+1)

	// Required answer format.
	prompt.WriteString("请严格按照以下 JSON 格式返回结果:\n")
	prompt.WriteString(`{"matches":[{"index":0,"value":"匹配到的目标列值"},{"index":1,"value":""}]}`)
	prompt.WriteString("\n")
	prompt.WriteString("如果某条无法匹配value 设为空字符串。\n\n")

	// Table-A rows that still need a match.
	fmt.Fprintf(&prompt, "A 表记录(需要匹配,共 %d 条):\n", len(unmatched))
	for idx := range unmatched {
		fmt.Fprintf(&prompt, "- 索引 %d: 「%s」", idx, getCell(unmatched[idx], config.ColAMatchIndex))
		if hasTime {
			fmt.Fprintf(&prompt, ", 时间=%s", getCell(unmatched[idx], config.ColATimeIndex))
		}
		prompt.WriteString("\n")
	}

	// Table-B reference rows.
	fmt.Fprintf(&prompt, "\nB 表参考数据(共 %d 条):\n", len(bRows))
	for idx := range bRows {
		key := getCell(bRows[idx], config.ColBMatchIndex)
		target := getCell(bRows[idx], config.ColBExtractIndex)
		fmt.Fprintf(&prompt, " 「%s」 → 目标列值: 「%s」", key, target)
		if hasTime {
			fmt.Fprintf(&prompt, ", 时间=%s", getCell(bRows[idx], config.ColBTimeIndex))
		}
		prompt.WriteString("\n")
	}
	prompt.WriteString("\n请返回 JSON 格式的匹配结果。")

	system := deepseekMessage{Role: "system", Content: "你是一个数据匹配专家。请严格按照 JSON 格式返回结果,不要添加额外说明。"}
	user := deepseekMessage{Role: "user", Content: prompt.String()}
	return []deepseekMessage{system, user}
}
// ---------- Deepseek AI 增强匹配 ----------
// hashPrompt returns the hex-encoded SHA-256 of the prompt messages, used
// as the cache key. Each role and each content is followed by a zero byte
// so field boundaries are part of the hash.
func hashPrompt(messages []deepseekMessage) string {
	separator := []byte{0}
	digest := sha256.New()
	for _, msg := range messages {
		digest.Write([]byte(msg.Role))
		digest.Write(separator)
		digest.Write([]byte(msg.Content))
		digest.Write(separator)
	}
	sum := digest.Sum(nil)
	return hex.EncodeToString(sum)
}
// callDeepseekAPI sends messages to the Deepseek chat-completions endpoint
// and returns the trimmed content of the first choice. Responses are cached
// by prompt hash: a cache hit is returned without any network call, and a
// fresh response is stored and persisted to disk.
//
// It returns an error when no API key is set, when the request cannot be
// built or sent, when the response cannot be read or decoded, when the API
// reports an error or a non-2xx status, or when no choice is returned.
// The request is bound to the application context when one is set, and has
// a 60-second timeout.
func (a *App) callDeepseekAPI(messages []deepseekMessage) (string, error) {
	if a.deepseekKey == "" {
		return "", fmt.Errorf("请先设置 Deepseek API 密钥")
	}

	hash := hashPrompt(messages)
	// Check the cache first.
	if cached, ok := a.aiCache.get(hash); ok {
		fmt.Printf("[CACHE] ✓ 命中 AI 缓存 (hash=%s)\n", hash[:12])
		return cached, nil
	}
	fmt.Printf("[CACHE] ✗ 缓存未命中 (hash=%s),调用 API...\n", hash[:12])

	reqBody := deepseekRequest{
		Model:       "deepseek-chat",
		Messages:    messages,
		Temperature: 0.05,
		MaxTokens:   2048,
	}
	bodyBytes, err := json.Marshal(reqBody)
	if err != nil {
		return "", fmt.Errorf("序列化请求失败: %w", err)
	}

	// The context is nil until startup has run; fall back to Background.
	ctx := a.ctx
	if ctx == nil {
		ctx = context.Background()
	}
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost,
		"https://api.deepseek.com/v1/chat/completions", bytes.NewReader(bodyBytes))
	if err != nil {
		return "", fmt.Errorf("创建请求失败: %v", err)
	}
	httpReq.Header.Set("Content-Type", "application/json")
	httpReq.Header.Set("Authorization", "Bearer "+a.deepseekKey)

	client := &http.Client{Timeout: 60 * time.Second}
	resp, err := client.Do(httpReq)
	if err != nil {
		return "", fmt.Errorf("调用 Deepseek API 失败: %v", err)
	}
	defer resp.Body.Close()

	respBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("读取 Deepseek 响应失败: %w", err)
	}
	statusOK := resp.StatusCode >= 200 && resp.StatusCode < 300

	var dr deepseekResponse
	if err := json.Unmarshal(respBytes, &dr); err != nil {
		if !statusOK {
			// Error pages are often not JSON; report the status instead.
			return "", fmt.Errorf("Deepseek API 返回 HTTP %d: %.200s", resp.StatusCode, respBytes)
		}
		return "", fmt.Errorf("解析 Deepseek 响应失败: %v", err)
	}
	if dr.Error != nil {
		return "", fmt.Errorf("Deepseek API 错误: %s", dr.Error.Message)
	}
	if !statusOK {
		return "", fmt.Errorf("Deepseek API 返回 HTTP %d", resp.StatusCode)
	}
	if len(dr.Choices) == 0 {
		return "", fmt.Errorf("Deepseek 未返回有效结果")
	}

	result := strings.TrimSpace(dr.Choices[0].Message.Content)
	// Store in the cache and persist it.
	a.aiCache.put(hash, result)
	a.aiCache.saveToFile()
	return result, nil
}
// formatTimeDiff renders a duration as a compact string such as
// "1h30m5s", "2m5s" or "7s", with a leading "-" for negative durations.
// Leading zero units are omitted and sub-second precision is dropped.
func formatTimeDiff(d time.Duration) string {
	magnitude, prefix := d, ""
	if d < 0 {
		magnitude, prefix = -d, "-"
	}

	h := int(magnitude.Hours())
	m := int(magnitude.Minutes()) % 60
	s := int(magnitude.Seconds()) % 60

	switch {
	case h > 0:
		return fmt.Sprintf("%s%dh%dm%ds", prefix, h, m, s)
	case m > 0:
		return fmt.Sprintf("%s%dm%ds", prefix, m, s)
	default:
		return fmt.Sprintf("%s%ds", prefix, s)
	}
}
// ---------- 导出结果 ----------
// ExportResults writes the match results to an Excel or CSV file chosen
// through a save dialog and returns the saved path. The format comes from
// the most recent match configuration ("csv" selects CSV, anything else
// XLSX). Cancelling the dialog returns an empty path and no error. The
// matching extension is appended when the chosen name lacks it.
func (a *App) ExportResults(results []MatchResult) (string, error) {
	if len(results) == 0 {
		return "", fmt.Errorf("没有匹配结果可以导出")
	}

	asCSV := a.lastConfig.ExportFormat == "csv"
	suffix, label, glob := ".xlsx", "Excel 文件 (*.xlsx)", "*.xlsx"
	if asCSV {
		suffix, label, glob = ".csv", "CSV 文件 (*.csv)", "*.csv"
	}

	dialog := runtime.SaveDialogOptions{
		Title:           "导出匹配结果",
		DefaultFilename: fmt.Sprintf("匹配结果_%s%s", time.Now().Format("20060102_150405"), suffix),
		Filters:         []runtime.FileFilter{{DisplayName: label, Pattern: glob}},
	}
	target, err := runtime.SaveFileDialog(a.ctx, dialog)
	switch {
	case err != nil:
		return "", fmt.Errorf("打开保存对话框失败: %v", err)
	case target == "":
		return "", nil
	}

	if !strings.HasSuffix(strings.ToLower(target), suffix) {
		target += suffix
	}
	if asCSV {
		return a.exportResultsCSV(results, target)
	}
	return a.exportResultsXLSX(results, target)
}
// exportHeaders builds the header row for an export with numACols table-A
// columns followed by the extracted-value column. The real table-A headers
// are used when at least numACols of them are known (blank ones become
// "ColN"); otherwise every column gets the fallback name "A-ColN".
func (a *App) exportHeaders(numACols int) []string {
	names := make([]string, 0, numACols+1)
	haveReal := len(a.headersA) >= numACols
	for col := 0; col < numACols; col++ {
		var name string
		switch {
		case !haveReal:
			name = fmt.Sprintf("A-Col%d", col+1)
		case a.headersA[col] == "":
			name = fmt.Sprintf("Col%d", col+1)
		default:
			name = a.headersA[col]
		}
		names = append(names, name)
	}
	return append(names, "匹配结果(由B表提取)")
}
// exportResultsXLSX writes results to an XLSX file at savePath and returns
// that path. Each row holds the table-A columns followed by the extracted
// value; a styled header row is written when the last configuration asks
// for one.
//
// The number of table-A columns is the widest RowAData across all results,
// not just the first one: rows may be ragged (the Excel reader can return
// shorter rows), and sizing by results[0] would either panic on a shorter
// row or silently truncate a longer one. Missing cells are left empty.
func (a *App) exportResultsXLSX(results []MatchResult, savePath string) (string, error) {
	f := excelize.NewFile()
	defer f.Close()

	sheetName := "匹配结果"
	f.SetSheetName("Sheet1", sheetName)

	numACols := 0
	for i := range results {
		if n := len(results[i].RowAData); n > numACols {
			numACols = n
		}
	}
	colLetter := func(n int) string { c, _ := excelize.ColumnNumberToName(n + 1); return c }
	headers := a.exportHeaders(numACols)
	extractCol := numACols // zero-based index of the extracted-value column

	// Header row.
	firstDataRow := 1
	if a.lastConfig.IncludeHeader {
		for i, h := range headers {
			f.SetCellValue(sheetName, fmt.Sprintf("%s1", colLetter(i)), h)
		}
		headerStyle, _ := f.NewStyle(&excelize.Style{
			Font: &excelize.Font{Bold: true, Size: 12, Color: "FFFFFF"},
			Fill: excelize.Fill{Type: "pattern", Pattern: 1, Color: []string{"4472C4"}},
		})
		f.SetCellStyle(sheetName, "A1", fmt.Sprintf("%s1", colLetter(extractCol)), headerStyle)
		firstDataRow = 2
	}

	// Data rows.
	for i := range results {
		r := &results[i]
		rowNum := firstDataRow + i
		for ci, cell := range r.RowAData {
			f.SetCellValue(sheetName, fmt.Sprintf("%s%d", colLetter(ci), rowNum), cell)
		}
		f.SetCellValue(sheetName, fmt.Sprintf("%s%d", colLetter(extractCol), rowNum), r.ExtractValue)
	}

	// Column widths.
	for ci := 0; ci <= numACols; ci++ {
		f.SetColWidth(sheetName, colLetter(ci), colLetter(ci), 22)
	}

	if err := f.SaveAs(savePath); err != nil {
		return "", fmt.Errorf("保存文件失败: %v", err)
	}
	return savePath, nil
}
// exportResultsCSV writes results to a CSV file at savePath and returns
// that path. The file starts with a UTF-8 BOM so Excel detects the
// encoding; each row holds the table-A columns followed by the extracted
// value, and a header row is written when the last configuration asks for
// one.
//
// The number of table-A columns is the widest RowAData across all results,
// not just the first one: rows may be ragged, and sizing by results[0]
// would either panic on a shorter row or silently truncate a longer one.
// Missing cells are written as empty fields so every row has the same
// number of columns.
func (a *App) exportResultsCSV(results []MatchResult, savePath string) (string, error) {
	var buf bytes.Buffer
	// UTF-8 BOM helps Excel recognize the encoding.
	buf.Write([]byte{0xEF, 0xBB, 0xBF})

	numACols := 0
	for i := range results {
		if n := len(results[i].RowAData); n > numACols {
			numACols = n
		}
	}
	headers := a.exportHeaders(numACols)

	// Header row.
	if a.lastConfig.IncludeHeader {
		for i, h := range headers {
			if i > 0 {
				buf.WriteByte(',')
			}
			buf.WriteString(csvEscape(h))
		}
		buf.WriteByte('\n')
	}

	// Data rows.
	for i := range results {
		r := &results[i]
		for ci := 0; ci < numACols; ci++ {
			if ci > 0 {
				buf.WriteByte(',')
			}
			if ci < len(r.RowAData) {
				buf.WriteString(csvEscape(r.RowAData[ci]))
			}
		}
		buf.WriteByte(',')
		buf.WriteString(csvEscape(r.ExtractValue))
		buf.WriteByte('\n')
	}

	if err := os.WriteFile(savePath, buf.Bytes(), 0600); err != nil {
		return "", fmt.Errorf("保存 CSV 文件失败: %v", err)
	}
	return savePath, nil
}
// csvEscape escapes one CSV field: a field containing a double quote,
// comma, line feed or carriage return is wrapped in double quotes with its
// inner quotes doubled; any other field is returned unchanged.
func csvEscape(s string) string {
	if !strings.ContainsAny(s, "\",\n\r") {
		return s
	}
	return "\"" + strings.ReplaceAll(s, "\"", "\"\"") + "\""
}