// office-data-matcher/app.go

package main

import (
	"context"
	"encoding/json"
	"fmt"
	"math"
	"path/filepath"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/wailsapp/wails/v2/pkg/runtime"
)

// ---------- 常量 ----------

// Default matching parameters and AI request settings.
const (
	// DefaultThreshold is the default similarity threshold for a match.
	DefaultThreshold = 0.65

	// DefaultTimeWindowHours is the default time window, in hours.
	DefaultTimeWindowHours = 12.0

	// DefaultBatchSize is the number of rows sent per batched AI call.
	DefaultBatchSize = 8

	// DefaultMaxPreview is the number of rows previewed in the debug log.
	DefaultMaxPreview = 3

	// deepseekModel is the Deepseek model name.
	deepseekModel = "deepseek-chat"

	// deepseekTemperature is the AI temperature parameter.
	deepseekTemperature = 0.05

	// deepseekMaxTokens is the AI maximum token count.
	deepseekMaxTokens = 2048

	// cacheMaxSize is the maximum number of AI cache entries.
	cacheMaxSize = 500
)

// ---------- 数据结构 ----------

// MatchResult is one matched pair returned to the frontend. It carries both
// the new generic fields and the legacy fields so older consumers keep working.
type MatchResult struct {
	// New, generalized fields.
	RowAData     []string `json:"rowAData"`    // all original columns of the table A row
	RowBKey      string   `json:"rowBKey"`     // value of the table B match column (left empty for AI-produced results)
	ExtractValue string   `json:"extractValue"` // value taken from the table B extract column

	// Legacy fields kept for backward compatibility.
	MonthlyCellName string `json:"monthlyCellName"`
	DailyCellID     string `json:"dailyCellId"`
	InterruptReason string `json:"interruptReason"`

	// Fields shared by both generations.
	TimeDiff        string  `json:"timeDiff"`        // formatted time difference between the A and B rows
	SimilarityScore float64 `json:"similarityScore"` // similarity rounded to 4 decimals; 0 for AI-produced results
	AIMatched       bool    `json:"aiMatched"`       // true when the result came from the AI pass or its cache
}

// ProgressPayload is the progress information emitted with the
// "match-progress" event.
type ProgressPayload struct {
	Current int    `json:"current"`
	Total   int    `json:"total"`
	Message string `json:"message"`
	Phase   string `json:"phase"` // reading / matching / ai-enhancing / done
}

// MatchConfig is the complete matching configuration passed in by the frontend.
type MatchConfig struct {
	// File paths.
	FileAPath string `json:"fileAPath"`
	FileBPath string `json:"fileBPath"`

	// Table A column indexes (-1 means "not used").
	ColAMatchIndex int `json:"colAMatchIndex"` // table A match column
	ColATimeIndex  int `json:"colATimeIndex"`  // table A time column (optional; -1 skips time pruning)

	// Table B column indexes.
	ColBMatchIndex   int `json:"colBMatchIndex"`   // table B match column
	ColBTimeIndex    int `json:"colBTimeIndex"`    // table B time column (optional; -1 skips time pruning)
	ColBExtractIndex int `json:"colBExtractIndex"` // table B column whose value is extracted

	// Cleaning and matching parameters.
	RegexPattern string  `json:"regexPattern"` // empty string = skip cleaning
	TimeWindow   float64 `json:"timeWindow"`   // hours
	Threshold    float64 `json:"threshold"`    // 0.0 - 1.0

	// Extended options.
	AllMatches    bool   `json:"allMatches"`    // true = return every match (>= threshold) for an A row instead of only the best
	CaseSensitive bool   `json:"caseSensitive"` // true = case-sensitive matching
	SortBy        string `json:"sortBy"`        // "similarity" / "timeDiff" / "" = no sorting
	MaxPreview     int `json:"maxPreview"`     // number of leading A rows whose comparisons are printed to the debug log; 0 = none, negative = DefaultMaxPreview
	MaxBRowsNoTime int `json:"maxBRowsNoTime"` // max table B rows sent to the AI when no time column is usable (0 = default of 200, per the original note)
	ExportFormat   string `json:"exportFormat"`  // "xlsx" (default) / "csv"
	IncludeHeader  bool   `json:"includeHeader"` // whether the export includes a header row
}

// AICacheInfo describes the AI cache state: how many entries it holds and
// the path of its backing file.
type AICacheInfo struct {
	Count    int    `json:"count"`
	FilePath string `json:"filePath"`
}

// ---------- App 结构体 ----------

// App holds the application state: the context saved by startup, the AI
// cache, the AI API settings, and data remembered from the latest match.
type App struct {
	ctx         context.Context // set by startup; emitProgress is a no-op while it is nil
	aiCache     *AICache

	// AI API configuration (guarded by aiMu because it is accessed concurrently).
	aiMu        sync.RWMutex
	apiKey      string // AI API key (compatible with OpenAI / Deepseek / local models)
	apiEndpoint string // API endpoint (original note: defaults to https://api.deepseek.com/v1/chat/completions; the defaulting is not visible in this section)
	apiModel    string // model name (original note: defaults to deepseek-chat; the defaulting is not visible in this section)

	// Configuration and headers of the most recent match (kept for export).
	dataMu      sync.RWMutex
	lastConfig  MatchConfig
	headersA    []string
	headersB    []string
}

// NewApp returns a new App whose AI cache has been created with newAICache.
func NewApp() *App {
	app := &App{}
	app.aiCache = newAICache()
	return app
}

// startup stores ctx on the App and logs the entry count and file path
// reported by the AI cache.
func (a *App) startup(ctx context.Context) {
	a.ctx = ctx

	entries, cacheFile := a.aiCache.stat()
	fmt.Printf("[CACHE] AI 缓存已加载，当前 %d 条缓存记录 (文件: %s)\n", entries, cacheFile)
}

// emitProgress sends a "match-progress" event carrying a ProgressPayload to
// the frontend. Nothing is emitted while the App has no context yet.
func (a *App) emitProgress(current, total int, message, phase string) {
	if a.ctx != nil {
		payload := ProgressPayload{
			Current: current,
			Total:   total,
			Message: message,
			Phase:   phase,
		}
		runtime.EventsEmit(a.ctx, "match-progress", payload)
	}
}

// ---------- AI 配置 ----------

// SetDeepseekAPIKey stores the whitespace-trimmed Deepseek API key in memory
// only (kept for backward compatibility). An empty key clears the stored one.
// The returned string is a status message for the caller.
func (a *App) SetDeepseekAPIKey(key string) string {
	trimmed := strings.TrimSpace(key)

	a.aiMu.Lock()
	a.apiKey = trimmed
	a.aiMu.Unlock()

	if trimmed == "" {
		return "已清除 Deepseek API 密钥"
	}
	return "Deepseek API 密钥已设置"
}

// SetAIConfig updates the AI API settings (endpoint, model and key) in one call.
//
// Each argument is trimmed of surrounding whitespace first. An argument that
// is empty after trimming leaves the corresponding stored value unchanged, so
// a blank or whitespace-only input can never wipe an existing setting.
// The returned string is a status message showing the endpoint and model now
// in effect; the key is never echoed.
func (a *App) SetAIConfig(endpoint, model, key string) string {
	endpoint = strings.TrimSpace(endpoint)
	model = strings.TrimSpace(model)
	key = strings.TrimSpace(key)

	a.aiMu.Lock()
	defer a.aiMu.Unlock()

	if endpoint != "" {
		a.apiEndpoint = endpoint
	}
	if model != "" {
		a.apiModel = model
	}
	if key != "" {
		a.apiKey = key
	}
	return fmt.Sprintf("AI 配置已更新 (端点=%s, 模型=%s)", a.apiEndpoint, a.apiModel)
}

// SetAPIKey stores the whitespace-trimmed AI API key in memory only. An empty
// key clears the stored one. The returned string is a status message.
func (a *App) SetAPIKey(key string) string {
	trimmed := strings.TrimSpace(key)

	a.aiMu.Lock()
	a.apiKey = trimmed
	a.aiMu.Unlock()

	if trimmed == "" {
		return "已清除 AI API 密钥"
	}
	return "AI API 密钥已设置"
}

// GetDeepseekStatus reports whether an API key is currently configured.
func (a *App) GetDeepseekStatus() bool {
	a.aiMu.RLock()
	configured := a.apiKey != ""
	a.aiMu.RUnlock()
	return configured
}

// GetAIStatus returns the AI API configuration state as a string map with
// three keys: "ready" ("true" when a key is set, otherwise "false"),
// "endpoint" and "model". The key itself is not included.
func (a *App) GetAIStatus() map[string]string {
	a.aiMu.RLock()
	key, endpoint, model := a.apiKey, a.apiEndpoint, a.apiModel
	a.aiMu.RUnlock()

	status := make(map[string]string, 3)
	status["ready"] = fmt.Sprint(key != "")
	status["endpoint"] = endpoint
	status["model"] = model
	return status
}

// ClearAICache empties the AI cache and returns a message stating how many
// entries it held before being cleared.
func (a *App) ClearAICache() string {
	// Only the entry count is needed; the file path is discarded.
	removed, _ := a.aiCache.stat()
	a.aiCache.clear()

	msg := fmt.Sprintf("已清除 %d 条 AI 缓存记录", removed)
	return msg
}

// GetAICacheInfo returns the AI cache's entry count and file path as
// reported by the cache.
func (a *App) GetAICacheInfo() AICacheInfo {
	var info AICacheInfo
	info.Count, info.FilePath = a.aiCache.stat()
	return info
}

// ---------- 文件选择对话框 ----------

// OpenFileA shows the file dialog for choosing table A (the base table) and
// returns whatever openFileDialog returns.
func (a *App) OpenFileA() (string, error) {
	const title = "选择 A 表文件（基准表）"
	return a.openFileDialog(title)
}

// OpenFileB shows the file dialog for choosing table B (the data source
// table) and returns whatever openFileDialog returns.
func (a *App) OpenFileB() (string, error) {
	const title = "选择 B 表文件（数据源表）"
	return a.openFileDialog(title)
}

// openFileDialog opens a file dialog with the given title, filtered to
// *.xlsx, *.xls and *.csv files. It returns the value reported by the dialog,
// or an empty string together with the error when the dialog call fails.
// NOTE(review): what the dialog reports when the user cancels is decided by
// the runtime package and is not visible here.
func (a *App) openFileDialog(title string) (string, error) {
	opts := runtime.OpenDialogOptions{
		Title: title,
		Filters: []runtime.FileFilter{{
			DisplayName: "Excel / CSV 文件 (*.xlsx, *.xls, *.csv)",
			Pattern:     "*.xlsx;*.xls;*.csv",
		}},
	}

	selected, err := runtime.OpenFileDialog(a.ctx, opts)
	if err != nil {
		return "", err
	}
	return selected, nil
}

// ParseHeaders reads the file at filePath and returns its first row, with
// every cell trimmed of surrounding whitespace, as the header list. The
// frontend uses it to render the column-mapping dropdowns.
//
// It returns an error when filePath is empty, when reading the rows fails,
// or when the file contains no rows at all.
func (a *App) ParseHeaders(filePath string) ([]string, error) {
	if filePath == "" {
		return nil, fmt.Errorf("文件路径为空")
	}

	rows, err := a.readRawRows(filePath)
	switch {
	case err != nil:
		return nil, err
	case len(rows) == 0:
		return nil, fmt.Errorf("文件为空，无表头")
	}

	// The header row is trimmed in place, exactly as the rows were returned.
	headers := rows[0]
	for i, h := range headers {
		headers[i] = strings.TrimSpace(h)
	}

	fmt.Printf("[DEBUG] ParseHeaders: '%s' → %d 列 %v\n", filepath.Base(filePath), len(headers), headers)
	return headers, nil
}

// ---------- 通用匹配引擎 ----------

// RunMatch runs the generic, column-index-driven matching described by
// config: it prepares the match with prepareMatch, then matches the prepared
// data with runMatchOnData. A preparation error is returned as is; the set of
// matched A-row indexes produced by runMatchOnData is discarded.
func (a *App) RunMatch(config MatchConfig) ([]MatchResult, error) {
	prepared, prepErr := a.prepareMatch(config)
	if prepErr != nil {
		return nil, prepErr
	}

	matches, _, runErr := a.runMatchOnData(prepared, config)
	return matches, runErr
}

// runMatchOnData runs the similarity matching pass over data that has already
// been loaded into prep.
//
// For every table A row with a non-empty match cell, the cleaned match value
// is compared against every table B row with a non-empty cleaned match value.
// When both time columns are configured and both rows have a parseable time,
// pairs further apart than prep.windowDuration are skipped; rows whose time
// cannot be parsed are not pruned. Pairs whose similarity reaches
// prep.threshold become results: all of them when config.AllMatches is set,
// otherwise only the single most similar one (the first wins on ties).
//
// It returns the results (optionally sorted per config.SortBy), plus the set
// of table A row indexes that produced at least one result, which the AI
// enhancement stage uses to find unmatched rows. The error is always nil in
// the current implementation.
//
// Progress is reported through emitProgress with phase "matching", followed
// by a final "done" event.
func (a *App) runMatchOnData(prep *matchPrep, config MatchConfig) ([]MatchResult, map[int]bool, error) {
	useTime := config.ColATimeIndex >= 0 && config.ColBTimeIndex >= 0
	totalA := len(prep.dataA)
	totalB := len(prep.dataB)

	// A negative preview count falls back to the default; 0 disables previews.
	maxPreview := config.MaxPreview
	if maxPreview < 0 {
		maxPreview = DefaultMaxPreview
	}

	// Precompute everything derived from table B once, so the inner loop does
	// no repeated regex cleaning or time parsing.
	cleanedBMatch := make([]string, totalB)
	origBMatch := make([]string, totalB)
	parsedBTime := make([]time.Time, totalB)
	hasBTime := make([]bool, totalB)
	bExtractVal := make([]string, totalB)
	for bIdx, rowB := range prep.dataB {
		matchStrB := getCell(rowB, config.ColBMatchIndex)
		origBMatch[bIdx] = matchStrB
		if matchStrB != "" {
			cleanedBMatch[bIdx] = cleanWithRegex(matchStrB, prep.reg)
		}
		if useTime {
			if t, err := parseTimeFlexible(getCell(rowB, config.ColBTimeIndex)); err == nil {
				parsedBTime[bIdx] = t
				hasBTime[bIdx] = true
			}
		}
		bExtractVal[bIdx] = getCell(rowB, config.ColBExtractIndex)
	}

	var results []MatchResult
	matchedAIndices := make(map[int]bool)

	for i, rowA := range prep.dataA {
		// Throttle progress events to every 10th row plus the last one.
		if i%10 == 0 || i == totalA-1 {
			pct := (i + 1) * 100 / totalA
			a.emitProgress(i+1, totalA,
				fmt.Sprintf("匹配中 %d/%d (%d%%)...", i+1, totalA, pct), "matching")
		}

		matchStrA := getCell(rowA, config.ColAMatchIndex)
		if matchStrA == "" {
			continue
		}

		// hasTimeA can only become true when useTime is true.
		var timeA time.Time
		hasTimeA := false
		if useTime {
			if t, err := parseTimeFlexible(getCell(rowA, config.ColATimeIndex)); err == nil {
				timeA, hasTimeA = t, true
			}
		}

		cleanA := cleanWithRegex(matchStrA, prep.reg)
		preview := i < maxPreview

		// Candidates collected for this A row. In best-match mode it holds at
		// most one entry and bestSim is that entry's unrounded similarity.
		var candidates []MatchResult
		var bestSim float64

		for bIdx := range prep.dataB {
			if cleanedBMatch[bIdx] == "" {
				continue
			}

			// Time pruning applies only when both sides have a usable time.
			var timeDiff time.Duration
			if hasTimeA && hasBTime[bIdx] {
				timeDiff = timeA.Sub(parsedBTime[bIdx])
				if timeDiff < -prep.windowDuration || timeDiff > prep.windowDuration {
					continue
				}
			}

			similarity := similarityFromCleaned(cleanA, cleanedBMatch[bIdx], config.CaseSensitive)

			if preview {
				fmt.Printf("[DEBUG] | A[%d]='%s'→'%s' | B='%s'→'%s' | 相似度=%.4f\n",
					i, matchStrA, cleanA, origBMatch[bIdx], cleanedBMatch[bIdx], similarity)
			}

			if similarity < prep.threshold {
				continue
			}

			mr := MatchResult{
				RowAData:        rowA,
				RowBKey:         origBMatch[bIdx],
				ExtractValue:    bExtractVal[bIdx],
				TimeDiff:        formatTimeDiff(timeDiff),
				SimilarityScore: math.Round(similarity*10000) / 10000,
				AIMatched:       false,
			}
			switch {
			case config.AllMatches:
				candidates = append(candidates, mr)
			case len(candidates) == 0 || similarity > bestSim:
				// Compare unrounded similarities: the stored score is rounded
				// to 4 decimals and would misrank near-equal candidates.
				candidates = []MatchResult{mr}
				bestSim = similarity
			}
		}

		if len(candidates) == 0 {
			continue
		}
		if preview {
			for _, c := range candidates {
				fmt.Printf("[DEBUG] ✓ 命中 | A='%s'→B='%s' | 相似度=%.4f\n",
					matchStrA, c.RowBKey, c.SimilarityScore)
			}
		}
		results = append(results, candidates...)
		matchedAIndices[i] = true
	}

	// Optional result ordering. A stable sort keeps the table A order among
	// results that compare equal.
	switch config.SortBy {
	case "similarity":
		sort.SliceStable(results, func(i, j int) bool {
			return results[i].SimilarityScore > results[j].SimilarityScore
		})
	case "timeDiff":
		sort.SliceStable(results, func(i, j int) bool {
			return parseTimeDiffDuration(results[i].TimeDiff) < parseTimeDiffDuration(results[j].TimeDiff)
		})
	}

	a.emitProgress(totalA, totalA,
		fmt.Sprintf("匹配完成！共匹配成功 %d 条记录", len(results)), "done")

	return results, matchedAIndices, nil
}

// RunMatchWithAI runs the basic matching pass and then asks the AI API to
// match the table A rows the basic pass could not cover.
//
// Steps:
//  1. Fail fast when no API key is configured.
//  2. Prepare the data and run the basic matching (runMatchOnData).
//  3. Collect the A rows without a basic match. Rows whose match cell is
//     empty are left out, mirroring the basic pass, so blank rows are never
//     sent to the AI or stored under an empty cache key.
//  4. Serve rows from the row-level AI cache where possible.
//  5. Send the remaining rows to the AI in batches of DefaultBatchSize. Each
//     batch is paired with the table B rows inside the batch's time range
//     (widened by the configured window plus defaultAIWindowPadH hours), or
//     with the first prep.maxBRowsNoTime B rows when no time is usable.
//  6. Append AI answers as results with AIMatched set, store them in the
//     row-level cache, and save the cache to file.
//
// A batch whose API call or response parsing fails is logged, reported in the
// final progress message, and skipped; it does not fail the whole run. The
// returned error is non-nil only for a missing API key or a failure in
// preparation or basic matching.
//
// NOTE(review): runMatchOnData already emits a "done" progress event before
// the "ai-enhancing" events emitted here.
func (a *App) RunMatchWithAI(config MatchConfig) ([]MatchResult, error) {
	a.aiMu.RLock()
	hasKey := a.apiKey != ""
	a.aiMu.RUnlock()
	if !hasKey {
		return nil, fmt.Errorf("请先设置 AI API 密钥")
	}

	prep, err := a.prepareMatch(config)
	if err != nil {
		return nil, err
	}

	// 1. Basic matching first.
	results, matchedAIndices, err := a.runMatchOnData(prep, config)
	if err != nil {
		return nil, err
	}

	// 2. A rows not covered by the basic pass. Selection is by row index so
	// duplicate rows with identical content are not confused with each other.
	var unmatchedA [][]string
	for i, row := range prep.dataA {
		if matchedAIndices[i] || getCell(row, config.ColAMatchIndex) == "" {
			continue
		}
		unmatchedA = append(unmatchedA, row)
	}

	if len(unmatchedA) == 0 {
		a.emitProgress(1, 1, "全部已匹配，无需 AI 增强", "done")
		return results, nil
	}

	// 3. AI enhancement; the row-level cache is consulted first to save API calls.
	useTime := config.ColATimeIndex >= 0 && config.ColBTimeIndex >= 0

	aiMatched := 0
	var failedBatches []int

	// 3a. Cache hits go straight into the results.
	var uncachedA [][]string
	cacheHits := 0
	for _, row := range unmatchedA {
		matchVal := getCell(row, config.ColAMatchIndex)
		timeStr := ""
		if useTime {
			timeStr = getCell(row, config.ColATimeIndex)
		}
		cacheKey := a.buildRowCacheKey(matchVal, timeStr, config)
		cachedVal, ok := a.aiCache.getRow(cacheKey)
		if !ok {
			uncachedA = append(uncachedA, row)
			continue
		}
		results = append(results, MatchResult{
			RowAData:        row,
			RowBKey:         "",
			ExtractValue:    cachedVal,
			SimilarityScore: 0,
			AIMatched:       true,
		})
		aiMatched++
		cacheHits++
	}

	if cacheHits > 0 {
		fmt.Printf("[CACHE] ✓ 行级缓存命中 %d 条，剩余 %d 条需 AI 处理\n", cacheHits, len(uncachedA))
	}

	if len(uncachedA) == 0 {
		a.emitProgress(1, 1,
			fmt.Sprintf("AI 增强完成！全部 %d 条命中缓存", cacheHits), "done")
		return results, nil
	}

	totalUnmatched := len(uncachedA)
	a.emitProgress(0, totalUnmatched,
		fmt.Sprintf("AI 增强匹配：%d 条命中缓存，%d 条需调用 AI...", cacheHits, totalUnmatched),
		"ai-enhancing")

	// Parse the table B times once instead of once per batch.
	var bTimes []time.Time
	var bTimeOK []bool
	if useTime {
		bTimes = make([]time.Time, len(prep.dataB))
		bTimeOK = make([]bool, len(prep.dataB))
		for i, row := range prep.dataB {
			if t, err := parseTimeFlexible(getCell(row, config.ColBTimeIndex)); err == nil {
				bTimes[i] = t
				bTimeOK[i] = true
			}
		}
	}

	warnedTruncation := false // the truncation warning is printed only once

	for batchStart := 0; batchStart < totalUnmatched; batchStart += DefaultBatchSize {
		end := min(batchStart+DefaultBatchSize, totalUnmatched)
		batchNum := batchStart/DefaultBatchSize + 1

		a.emitProgress(batchStart+1, totalUnmatched,
			fmt.Sprintf("AI 分析中 %d/%d (第 %d 批)...", end, totalUnmatched, batchNum),
			"ai-enhancing")

		batch := uncachedA[batchStart:end]

		// Time range covered by the A rows of this batch.
		var minTime, maxTime time.Time
		hasBatchTime := false
		if useTime {
			for _, row := range batch {
				t, err := parseTimeFlexible(getCell(row, config.ColATimeIndex))
				if err != nil {
					continue
				}
				switch {
				case !hasBatchTime:
					minTime, maxTime = t, t
					hasBatchTime = true
				case t.Before(minTime):
					minTime = t
				case t.After(maxTime):
					maxTime = t
				}
			}
		}

		// Candidate B rows: those inside the batch range widened by the user's
		// time window plus an extra margin, or a capped prefix of table B when
		// no time information is usable.
		var relevantB [][]string
		if hasBatchTime {
			padding := prep.windowDuration + time.Duration(defaultAIWindowPadH)*time.Hour
			ws := minTime.Add(-padding)
			we := maxTime.Add(padding)
			for i, row := range prep.dataB {
				if !bTimeOK[i] || bTimes[i].Before(ws) || bTimes[i].After(we) {
					continue
				}
				relevantB = append(relevantB, row)
			}
		} else {
			// Cap the number of B rows to keep token consumption under control.
			if len(prep.dataB) > prep.maxBRowsNoTime && !warnedTruncation {
				fmt.Printf("[AI-WARN] 无时间列，B 表共 %d 行，AI 匹配仅取前 %d 行（可在高级设置调整）\n",
					len(prep.dataB), prep.maxBRowsNoTime)
				warnedTruncation = true
			}
			// Clamp at 0 so a negative limit cannot cause a slice panic.
			maxB := max(0, min(prep.maxBRowsNoTime, len(prep.dataB)))
			relevantB = prep.dataB[:maxB]
		}

		// Without any candidate B rows there is nothing to match against, so
		// the API call is skipped.
		if len(relevantB) == 0 {
			fmt.Printf("[AI-WARN] 第 %d 批没有可用的 B 表候选行，跳过 AI 调用\n", batchNum)
			continue
		}

		// Build the prompt and call the AI.
		prompt := a.buildGenericAIPrompt(batch, relevantB, config, prep.windowDuration, hasBatchTime)
		aiResp, err := a.callAIAPI(prompt)
		if err != nil {
			fmt.Printf("[AI-WARN] 第 %d 批 API 调用失败: %v\n", batchNum, err)
			failedBatches = append(failedBatches, batchNum)
			continue
		}

		// Parse the response. When it is not pure JSON, retry on the span from
		// the first "{" to the last "}" to tolerate surrounding text.
		var matchResp struct {
			Matches []struct {
				Index int    `json:"index"`
				Value string `json:"value"`
			} `json:"matches"`
		}
		parseErr := json.Unmarshal([]byte(aiResp), &matchResp)
		if parseErr != nil {
			if idx := strings.Index(aiResp, "{"); idx >= 0 {
				if endIdx := strings.LastIndex(aiResp, "}"); endIdx > idx {
					parseErr = json.Unmarshal([]byte(aiResp[idx:endIdx+1]), &matchResp)
				}
			}
		}
		if parseErr != nil {
			fmt.Printf("[AI-WARN] 响应解析失败 (第 %d 批): %s\n   原始响应: %.200s\n",
				batchNum, parseErr.Error(), aiResp)
			failedBatches = append(failedBatches, batchNum)
			continue
		}

		// Accept at most one answer per batch row: the first valid one wins,
		// so a repeated index cannot produce duplicate results.
		seen := make(map[int]bool, len(batch))
		for _, item := range matchResp.Matches {
			idx := item.Index
			val := strings.TrimSpace(item.Value)
			if idx < 0 || idx >= len(batch) || val == "" || seen[idx] {
				continue
			}
			seen[idx] = true

			rowA := batch[idx]
			results = append(results, MatchResult{
				RowAData:        rowA,
				RowBKey:         "",
				ExtractValue:    val,
				SimilarityScore: 0,
				AIMatched:       true,
			})
			aiMatched++

			// Store the answer in the row-level cache.
			matchVal := getCell(rowA, config.ColAMatchIndex)
			timeStr := ""
			if useTime {
				timeStr = getCell(rowA, config.ColATimeIndex)
			}
			cacheKey := a.buildRowCacheKey(matchVal, timeStr, config)
			a.aiCache.putRow(cacheKey, val)
		}
	}
	a.aiCache.saveToFile()

	// Build the completion message.
	msg := fmt.Sprintf("AI 增强完成！基础匹配 %d 条 + AI 补充 %d 条 = 共 %d 条",
		len(results)-aiMatched, aiMatched, len(results))
	if len(failedBatches) > 0 {
		msg += fmt.Sprintf("（警告：第 %v 批失败）", failedBatches)
	}
	a.emitProgress(totalUnmatched, totalUnmatched, msg, "done")

	return results, nil
}