Go语言机器学习实战:构建分类模型
引言
机器学习是人工智能的核心领域之一,它使计算机能够从数据中学习并做出预测。Go语言以其高性能和并发能力,成为构建机器学习应用的理想选择。本文将介绍如何使用Go语言构建机器学习分类模型。
一、机器学习基础概念
1.1 监督学习与无监督学习
// 监督学习:有标签数据
// 无监督学习:无标签数据,聚类分析
// 分类问题:预测离散值(如猫/狗)
// 回归问题:预测连续值(如房价)
1.2 评估指标
// 准确率:正确预测的比例
// 精确率:预测为正例的样本中真正为正例的比例
// 召回率:真正为正例的样本中被预测为正例的比例
// F1分数:精确率和召回率的调和平均
二、决策树算法
2.1 实现决策树
// Decision-tree listing: a small binary classification tree that picks
// splits by Gini impurity and predicts by walking from the root to a leaf.
package main

import (
	"fmt"
	"sort"
)

// DecisionTreeNode is one node of a binary decision tree. Internal nodes
// route a sample by comparing row[feature] with threshold; leaf nodes carry
// the predicted class.
type DecisionTreeNode struct {
	feature   int
	threshold float64
	left      *DecisionTreeNode
	right     *DecisionTreeNode
	class     int
	isLeaf    bool
}

// giniImpurity returns 1 - sum(p_c^2) over the classes present in labels.
// An empty slice is treated as pure and yields 0.
func giniImpurity(labels []int) float64 {
	if len(labels) == 0 {
		return 0
	}
	counts := make(map[int]int)
	for _, label := range labels {
		counts[label]++
	}
	// Accumulate the squared counts as integers so the result does not
	// depend on the (random) order in which the map is iterated.
	sumSquares := 0
	for _, count := range counts {
		sumSquares += count * count
	}
	total := float64(len(labels))
	return 1 - float64(sumSquares)/(total*total)
}

// majorityClass returns the most frequent label. Ties go to the smallest
// label so the answer is independent of map iteration order. It returns 0
// for an empty slice.
func majorityClass(labels []int) int {
	counts := make(map[int]int)
	for _, label := range labels {
		counts[label]++
	}
	best, bestCount := 0, 0
	for class, count := range counts {
		if count > bestCount || (count == bestCount && class < best) {
			best, bestCount = class, count
		}
	}
	return best
}

// splitData partitions the rows (and their labels) into a left part where
// row[feature] <= threshold and a right part holding everything else.
// labels must have at least as many entries as data.
func splitData(data [][]float64, labels []int, feature int, threshold float64) ([][]float64, []int, [][]float64, []int) {
	var leftData, rightData [][]float64
	var leftLabels, rightLabels []int
	for i, row := range data {
		if row[feature] <= threshold {
			leftData = append(leftData, row)
			leftLabels = append(leftLabels, labels[i])
		} else {
			rightData = append(rightData, row)
			rightLabels = append(rightLabels, labels[i])
		}
	}
	return leftData, leftLabels, rightData, rightLabels
}

// findBestSplit tries every distinct value of every feature as a threshold
// and returns the (feature, threshold, gain) with the largest Gini gain.
// When no split improves on the current impurity it returns feature -1 and
// gain 0. The number of features is taken from the first row.
func findBestSplit(data [][]float64, labels []int) (int, float64, float64) {
	bestFeature := -1
	bestThreshold := 0.0
	bestGain := 0.0
	if len(data) == 0 {
		return bestFeature, bestThreshold, bestGain
	}

	numFeatures := len(data[0])
	currentImpurity := giniImpurity(labels)
	total := float64(len(labels))

	for feature := 0; feature < numFeatures; feature++ {
		// Collect the distinct values of this feature and sort them, so that
		// equal-gain candidates are always resolved the same way.
		seen := make(map[float64]struct{}, len(data))
		candidates := make([]float64, 0, len(data))
		for _, row := range data {
			v := row[feature]
			if _, ok := seen[v]; !ok {
				seen[v] = struct{}{}
				candidates = append(candidates, v)
			}
		}
		sort.Float64s(candidates)

		for _, value := range candidates {
			_, leftLabels, _, rightLabels := splitData(data, labels, feature, value)
			if len(leftLabels) == 0 || len(rightLabels) == 0 {
				continue
			}
			// Gini gain: parent impurity minus the size-weighted child impurities.
			leftWeight := float64(len(leftLabels)) / total
			rightWeight := float64(len(rightLabels)) / total
			gain := currentImpurity - leftWeight*giniImpurity(leftLabels) - rightWeight*giniImpurity(rightLabels)
			if gain > bestGain {
				bestGain = gain
				bestFeature = feature
				bestThreshold = value
			}
		}
	}
	return bestFeature, bestThreshold, bestGain
}

// buildTree recursively grows a tree from data/labels. Recursion stops with
// a majority-class leaf when depth reaches maxDepth, when there are no
// samples, or when no split yields a positive gain.
func buildTree(data [][]float64, labels []int, depth int, maxDepth int) *DecisionTreeNode {
	if depth >= maxDepth || len(labels) == 0 {
		return &DecisionTreeNode{class: majorityClass(labels), isLeaf: true}
	}

	feature, threshold, gain := findBestSplit(data, labels)
	if feature < 0 || gain <= 0 {
		return &DecisionTreeNode{class: majorityClass(labels), isLeaf: true}
	}

	leftData, leftLabels, rightData, rightLabels := splitData(data, labels, feature, threshold)
	return &DecisionTreeNode{
		feature:   feature,
		threshold: threshold,
		left:      buildTree(leftData, leftLabels, depth+1, maxDepth),
		right:     buildTree(rightData, rightLabels, depth+1, maxDepth),
	}
}

// Predict walks the tree from node and returns the class stored in the leaf
// that row reaches.
func (node *DecisionTreeNode) Predict(row []float64) int {
	if node.isLeaf {
		return node.class
	}
	if row[node.feature] <= node.threshold {
		return node.left.Predict(row)
	}
	return node.right.Predict(row)
}

func main() {
	// Sample data: features are [age, income]; the label says whether the
	// person bought (0/1).
	data := [][]float64{
		{25, 40000}, {35, 60000}, {45, 80000}, {20, 20000},
		{30, 50000}, {40, 70000}, {50, 90000}, {22, 30000},
	}
	labels := []int{0, 1, 1, 0, 0, 1, 1, 0}

	tree := buildTree(data, labels, 0, 3)

	testData := []float64{32, 55000}
	prediction := tree.Predict(testData)
	fmt.Printf("预测结果: %d\n", prediction)
}

// Article section heading that was fused onto the end of this listing in the
// source text, kept verbatim:
// 三、随机森林
3.1 实现随机森林
// Random-forest listing: bagging over decision trees with majority voting.
// DecisionTreeNode and buildTree are not defined in this listing; they come
// from the decision-tree listing.
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// RandomForest is an ensemble of decision trees, each trained on a
// bootstrap sample of the training data.
type RandomForest struct {
	trees    []*DecisionTreeNode
	numTrees int
	maxDepth int
	rng      *rand.Rand // per-forest source of randomness; nil falls back to the global one
}

// NewRandomForest returns an untrained forest that will grow numTrees trees
// of at most maxDepth levels. Each forest gets its own time-seeded generator
// instead of reseeding the process-wide one.
func NewRandomForest(numTrees, maxDepth int) *RandomForest {
	return &RandomForest{
		numTrees: numTrees,
		maxDepth: maxDepth,
		rng:      rand.New(rand.NewSource(time.Now().UnixNano())),
	}
}

// Fit trains the forest on data/labels. Any previously trained trees are
// discarded first, so calling Fit again retrains rather than growing the
// ensemble beyond numTrees.
func (rf *RandomForest) Fit(data [][]float64, labels []int) {
	rf.trees = nil
	if rf.numTrees <= 0 {
		return
	}

	intn := rand.Intn
	if rf.rng != nil {
		intn = rf.rng.Intn
	}

	rf.trees = make([]*DecisionTreeNode, 0, rf.numTrees)
	for i := 0; i < rf.numTrees; i++ {
		// Draw a bootstrap sample (with replacement) for this tree.
		sampleData, sampleLabels := resample(intn, data, labels)
		rf.trees = append(rf.trees, buildTree(sampleData, sampleLabels, 0, rf.maxDepth))
	}
}

// bootstrapSample draws len(data) rows with replacement, together with their
// labels, using the global math/rand generator.
func bootstrapSample(data [][]float64, labels []int) ([][]float64, []int) {
	return resample(rand.Intn, data, labels)
}

// resample draws len(data) row/label pairs with replacement; intn supplies a
// uniform index in [0, n). labels must have at least len(data) entries.
func resample(intn func(int) int, data [][]float64, labels []int) ([][]float64, []int) {
	n := len(data)
	sampleData := make([][]float64, n)
	sampleLabels := make([]int, n)
	for i := 0; i < n; i++ {
		idx := intn(n)
		sampleData[i] = data[idx]
		sampleLabels[i] = labels[idx]
	}
	return sampleData, sampleLabels
}

// Predict returns the class chosen by most trees. Ties go to the smallest
// class so the result does not depend on map iteration order. An untrained
// forest returns 0.
func (rf *RandomForest) Predict(row []float64) int {
	votes := make(map[int]int)
	for _, tree := range rf.trees {
		votes[tree.Predict(row)]++
	}

	bestClass, maxVotes := 0, 0
	for class, count := range votes {
		if count > maxVotes || (count == maxVotes && class < bestClass) {
			bestClass, maxVotes = class, count
		}
	}
	return bestClass
}

func main() {
	data := [][]float64{
		{25, 40000}, {35, 60000}, {45, 80000}, {20, 20000},
		{30, 50000}, {40, 70000}, {50, 90000}, {22, 30000},
	}
	labels := []int{0, 1, 1, 0, 0, 1, 1, 0}

	rf := NewRandomForest(10, 3)
	rf.Fit(data, labels)

	testData := []float64{32, 55000}
	prediction := rf.Predict(testData)
	fmt.Printf("随机森林预测结果: %d\n", prediction)
}

// Article section heading that was fused onto the end of this listing in the
// source text, kept verbatim:
// 四、支持向量机
4.1 SVM基础
// Linear SVM listing: a linear classifier trained one sample at a time with
// a hinge-style margin condition.
package main

import "fmt"

// SVM is a linear classifier f(x) = weights·x + bias. Predict returns +1 or
// -1, and main trains it with labels from that same set.
type SVM struct {
	weights []float64
	bias    float64
	lr      float64
}

// NewSVM returns a model with featureCount zero-initialised weights, zero
// bias and learning rate lr.
func NewSVM(featureCount int, lr float64) *SVM {
	return &SVM{
		weights: make([]float64, featureCount),
		bias:    0,
		lr:      lr,
	}
}

// trainOne performs one update step for a single sample. Samples whose
// margin label*f(x) is already at least 1 are left alone; otherwise the
// weights move by lr*label*x and the bias by lr*label. No regularisation
// term is applied. data must have at least len(weights) entries.
func (svm *SVM) trainOne(data []float64, label int) {
	y := float64(label) // convert once: Go does not mix int and float64 operands
	if y*svm.predictRaw(data) >= 1 {
		return
	}
	for i := range svm.weights {
		svm.weights[i] += svm.lr * y * data[i]
	}
	svm.bias += svm.lr * y
}

// predictRaw returns the raw decision value weights·data + bias.
func (svm *SVM) predictRaw(data []float64) float64 {
	result := svm.bias
	for i, w := range svm.weights {
		result += w * data[i]
	}
	return result
}

// Predict returns 1 when the decision value is non-negative and -1 otherwise.
func (svm *SVM) Predict(data []float64) int {
	if svm.predictRaw(data) >= 0 {
		return 1
	}
	return -1
}

func main() {
	// Linearly separable data.
	data := [][]float64{
		{1, 2}, {2, 3}, {3, 3},
		{2, 1}, {3, 2}, {4, 1},
	}
	labels := []int{1, 1, 1, -1, -1, -1}

	svm := NewSVM(2, 0.1)

	// Training: 100 passes over the data.
	for epoch := 0; epoch < 100; epoch++ {
		for i, row := range data {
			svm.trainOne(row, labels[i])
		}
	}

	testData := []float64{2.5, 2.5}
	prediction := svm.Predict(testData)
	fmt.Printf("SVM预测结果: %d\n", prediction)
}

// Article section heading that was fused onto the end of this listing in the
// source text, kept verbatim:
// 五、K近邻算法
5.1 KNN实现
// K-nearest-neighbours listing: classify a sample by majority vote among the
// k closest training rows under Euclidean distance.
package main

import (
	"fmt"
	"math"
	"sort"
)

// KNN holds the number of neighbours consulted for each prediction.
type KNN struct {
	k int
}

// NewKNN returns a classifier that votes among the k nearest neighbours.
func NewKNN(k int) *KNN {
	return &KNN{k: k}
}

// euclideanDistance returns the Euclidean distance between a and b.
// b must have at least len(a) entries.
func euclideanDistance(a, b []float64) float64 {
	var sum float64
	for i := range a {
		d := a[i] - b[i]
		sum += d * d
	}
	return math.Sqrt(sum)
}

// Predict returns the majority label among the k training rows closest to
// testData. k is capped at the number of training rows, so an oversized k
// votes over the whole training set instead of panicking. Equal distances
// keep their training order, and vote ties go to the smallest label. With no
// usable neighbours (k <= 0 or empty training data) it returns 0.
func (knn *KNN) Predict(trainData [][]float64, trainLabels []int, testData []float64) int {
	type neighbor struct {
		distance float64
		label    int
	}

	neighbors := make([]neighbor, 0, len(trainData))
	for i, row := range trainData {
		neighbors = append(neighbors, neighbor{
			distance: euclideanDistance(row, testData),
			label:    trainLabels[i],
		})
	}

	// Stable sort keeps equidistant rows in training order, which makes the
	// choice of the k nearest reproducible.
	sort.SliceStable(neighbors, func(i, j int) bool {
		return neighbors[i].distance < neighbors[j].distance
	})

	k := knn.k
	if k > len(neighbors) {
		k = len(neighbors)
	}

	votes := make(map[int]int)
	for i := 0; i < k; i++ {
		votes[neighbors[i].label]++
	}

	bestClass, maxVotes := 0, 0
	for class, count := range votes {
		if count > maxVotes || (count == maxVotes && class < bestClass) {
			bestClass, maxVotes = class, count
		}
	}
	return bestClass
}

func main() {
	data := [][]float64{
		{25, 40000}, {35, 60000}, {45, 80000}, {20, 20000},
		{30, 50000}, {40, 70000}, {50, 90000}, {22, 30000},
	}
	labels := []int{0, 1, 1, 0, 0, 1, 1, 0}

	knn := NewKNN(3)
	testData := []float64{32, 55000}
	prediction := knn.Predict(data, labels, testData)
	fmt.Printf("KNN预测结果: %d\n", prediction)
}

// Article section heading that was fused onto the end of this listing in the
// source text, kept verbatim:
// 六、模型评估
6.1 混淆矩阵
// Model-evaluation listing: a binary confusion matrix and the metrics
// derived from it.
package main

import (
	"fmt"
)

// ConfusionMatrix tallies binary predictions against the true labels.
// The result is indexed as matrix[true][predicted], i.e.
//
//	matrix[0][0] = TN, matrix[0][1] = FP
//	matrix[1][0] = FN, matrix[1][1] = TP
//
// It returns an error when the slices differ in length or when any label is
// not 0 or 1; such labels used to be dropped silently, which skewed every
// metric computed from the matrix.
func ConfusionMatrix(trueLabels, predictions []int) ([2][2]int, error) {
	if len(trueLabels) != len(predictions) {
		return [2][2]int{}, fmt.Errorf("长度不匹配")
	}

	var matrix [2][2]int
	for i, actual := range trueLabels {
		predicted := predictions[i]
		if (actual != 0 && actual != 1) || (predicted != 0 && predicted != 1) {
			return [2][2]int{}, fmt.Errorf("索引 %d 处的标签无效(仅支持 0 和 1): 真实=%d, 预测=%d", i, actual, predicted)
		}
		matrix[actual][predicted]++
	}
	return matrix, nil
}

// Accuracy returns (TP+TN)/total, or 0 for an empty matrix (instead of NaN).
func Accuracy(matrix [2][2]int) float64 {
	total := matrix[0][0] + matrix[0][1] + matrix[1][0] + matrix[1][1]
	if total == 0 {
		return 0
	}
	correct := matrix[0][0] + matrix[1][1]
	return float64(correct) / float64(total)
}

// Precision returns TP/(TP+FP), or 0 when nothing was predicted positive.
func Precision(matrix [2][2]int) float64 {
	predictedPositives := matrix[0][1] + matrix[1][1]
	if predictedPositives == 0 {
		return 0
	}
	return float64(matrix[1][1]) / float64(predictedPositives)
}

// Recall returns TP/(TP+FN), or 0 when there are no actual positives.
func Recall(matrix [2][2]int) float64 {
	actualPositives := matrix[1][0] + matrix[1][1]
	if actualPositives == 0 {
		return 0
	}
	return float64(matrix[1][1]) / float64(actualPositives)
}

// F1Score returns the harmonic mean of precision and recall, or 0 when both
// are 0.
func F1Score(matrix [2][2]int) float64 {
	precision := Precision(matrix)
	recall := Recall(matrix)
	if precision+recall == 0 {
		return 0
	}
	return 2 * precision * recall / (precision + recall)
}

func main() {
	trueLabels := []int{0, 1, 1, 0, 1, 0, 1, 0}
	predictions := []int{0, 1, 0, 0, 1, 1, 1, 0}

	matrix, err := ConfusionMatrix(trueLabels, predictions)
	if err != nil {
		fmt.Println("混淆矩阵计算失败:", err)
		return
	}

	fmt.Printf("混淆矩阵:\n")
	fmt.Printf("[[%d %d]\n", matrix[0][0], matrix[0][1])
	fmt.Printf(" [%d %d]]\n", matrix[1][0], matrix[1][1])

	fmt.Printf("准确率: %.2f\n", Accuracy(matrix))
	fmt.Printf("精确率: %.2f\n", Precision(matrix))
	fmt.Printf("召回率: %.2f\n", Recall(matrix))
	fmt.Printf("F1分数: %.2f\n", F1Score(matrix))
}

// Article section heading that was fused onto the end of this listing in the
// source text, kept verbatim:
// 七、交叉验证
7.1 K折交叉验证
// Cross-validation listing: shuffle a data set and cut it into k
// train/test folds.
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// kFoldSplit shuffles the samples and partitions them into k folds.
//
// Each returned fold is a 4-element slice:
//
//	fold[0] training rows
//	fold[1] training labels, one single-element row per sample
//	fold[2] test rows
//	fold[3] test labels, one single-element row per sample
//
// Labels are stored as float64 so that all four parts share one element
// type. Every sample lands in exactly one test set; the last fold absorbs
// the remainder when len(data) is not divisible by k.
//
// It returns nil when k <= 0, when data is empty, or when labels has fewer
// entries than data. A k larger than len(data) is reduced to len(data) so
// that no fold ends up with an empty test set.
func kFoldSplit(data [][]float64, labels []int, k int) [][][][]float64 {
	n := len(data)
	if k <= 0 || n == 0 || len(labels) < n {
		return nil
	}
	if k > n {
		k = n
	}

	// Shuffle indices with a local generator rather than reseeding the
	// process-wide one.
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	indices := make([]int, n)
	for i := range indices {
		indices[i] = i
	}
	rng.Shuffle(n, func(i, j int) {
		indices[i], indices[j] = indices[j], indices[i]
	})

	foldSize := n / k
	folds := make([][][][]float64, k)

	for i := 0; i < k; i++ {
		start := i * foldSize
		end := start + foldSize
		if i == k-1 {
			end = n
		}
		testCount := end - start

		trainData := make([][]float64, 0, n-testCount)
		trainLabels := make([][]float64, 0, n-testCount)
		testData := make([][]float64, 0, testCount)
		testLabels := make([][]float64, 0, testCount)

		for pos, idx := range indices {
			labelRow := []float64{float64(labels[idx])}
			if pos >= start && pos < end {
				testData = append(testData, data[idx])
				testLabels = append(testLabels, labelRow)
			} else {
				trainData = append(trainData, data[idx])
				trainLabels = append(trainLabels, labelRow)
			}
		}

		folds[i] = [][][]float64{trainData, trainLabels, testData, testLabels}
	}

	return folds
}

func main() {
	data := [][]float64{
		{1, 2}, {2, 3}, {3, 3}, {2, 1},
		{3, 2}, {4, 1}, {1, 1}, {4, 4},
	}
	labels := []int{1, 1, 1, -1, -1, -1, -1, 1}

	folds := kFoldSplit(data, labels, 4)

	for i, fold := range folds {
		fmt.Printf("Fold %d:\n", i+1)
		fmt.Printf("  训练集大小: %d\n", len(fold[0]))
		fmt.Printf("  测试集大小: %d\n", len(fold[2]))
	}
}

// Article section heading that was fused onto the end of this listing in the
// source text, kept verbatim:
// 八、总结
本文介绍了如何使用Go语言构建机器学习分类模型,包括:
- 决策树:基于基尼不纯度的决策树实现
- 随机森林:集成多个决策树提高准确性
- 支持向量机:基于合页损失(hinge loss)更新规则的线性SVM简化实现(未包含正则化项)
- K近邻:基于距离的分类算法
- 模型评估:混淆矩阵和评估指标
- 交叉验证:K折交叉验证
通过这些实现,你可以使用Go语言构建自己的机器学习模型,充分利用Go的性能优势。