DAY19 机器学习-平芜编程栈

# Imports (including timing support and what the entropy-weight TOPSIS needs)
import time  # used to measure training time
import warnings

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

warnings.filterwarnings('ignore')


# ---------------------- Entropy weight + TOPSIS evaluation ----------------------
def entropy_topsis_ranking(evaluation_df):
    """Rank models with the entropy weight method followed by TOPSIS.

    Args:
        evaluation_df: DataFrame with a '模型名称' column plus one numeric
            column per evaluation indicator. Columns named 'RMSE（美元）',
            'MAE（美元）' and '训练时间（秒）' are treated as "smaller is
            better"; every other indicator is treated as "larger is better".

    Returns:
        A tuple ``(result_df, weight_df)``:
        - ``result_df``: a copy of the input with the added columns
          'TOPSIS综合得分' (rounded to 4 decimals), '排名' (1 = best, ties
          share the smallest rank) and '各指标熵权' (a text summary of the
          weights), ordered as name, score, rank, indicators, weights.
        - ``weight_df``: one row per indicator with its entropy weight and
          the weight as a percentage string.

    Raises:
        ValueError: if the input has no rows or no indicator columns.

    Degenerate inputs are handled explicitly instead of producing NaN:
    an indicator with identical values for every model gets weight 0, if no
    indicator discriminates at all the weights are equal, and when every
    model coincides each one receives the neutral score 0.5.
    """
    name_col = '模型名称'
    cost_indicators = {'RMSE（美元）', 'MAE（美元）', '训练时间（秒）'}

    # 1. Separate the indicator matrix from the model-name column.
    indicator_df = evaluation_df.drop(columns=[name_col])
    indicator_names = indicator_df.columns.tolist()
    indicators = indicator_df.to_numpy(dtype=float, copy=True)
    m, n = indicators.shape  # m models, n indicators
    if m == 0 or n == 0:
        raise ValueError(
            "evaluation_df must contain at least one model and one indicator"
        )

    # 2. Turn cost indicators into benefit indicators: x' = max(x) - x.
    pos_indicators = indicators.copy()
    for col, col_name in enumerate(indicator_names):
        if col_name in cost_indicators:
            pos_indicators[:, col] = (
                pos_indicators[:, col].max() - pos_indicators[:, col]
            )

    # 3. Min-max normalisation. A constant column has zero range; dividing
    #    by 1 instead leaves it as all zeros without a divide-by-zero.
    min_vals = pos_indicators.min(axis=0)
    value_range = pos_indicators.max(axis=0) - min_vals
    is_constant = value_range == 0
    norm_indicators = (pos_indicators - min_vals) / np.where(
        is_constant, 1.0, value_range
    )

    # 4. Entropy weights.
    #    A constant column sums to 0 after normalisation; give it a uniform
    #    distribution (maximum entropy) rather than computing 0/0.
    col_sums = norm_indicators.sum(axis=0, keepdims=True)
    p = np.divide(
        norm_indicators,
        col_sums,
        out=np.full_like(norm_indicators, 1.0 / m),
        where=col_sums != 0,
    )
    p = np.where(p == 0, 1e-10, p)  # avoid log(0)
    if m > 1:
        entropy = -np.sum(p * np.log(p), axis=0) / np.log(m)
    else:
        # log(1) == 0: a single model carries no comparative information.
        entropy = np.ones(n)
    divergence = 1.0 - entropy
    divergence[is_constant] = 0.0  # exact zero, free of float round-off
    divergence_total = divergence.sum()
    if divergence_total > 0:
        weight = divergence / divergence_total
    else:
        # No indicator discriminates between models: fall back to equal weights.
        weight = np.full(n, 1.0 / n)

    # 5. TOPSIS: distance of every model to the ideal / anti-ideal solution.
    weighted_norm = norm_indicators * weight
    ideal_pos = weighted_norm.max(axis=0)
    ideal_neg = weighted_norm.min(axis=0)
    dist_pos = np.sqrt(np.sum((weighted_norm - ideal_pos) ** 2, axis=1))
    dist_neg = np.sqrt(np.sum((weighted_norm - ideal_neg) ** 2, axis=1))
    dist_total = dist_pos + dist_neg
    # Closeness score in [0, 1], larger is better. When all models coincide
    # both distances are 0; use the neutral value 0.5 instead of 0/0.
    score = np.divide(
        dist_neg,
        dist_total,
        out=np.full(m, 0.5),
        where=dist_total != 0,
    )

    # 6. Assemble the result tables.
    result_df = evaluation_df.copy()
    # Label each weight with its column name up to the full-width bracket,
    # e.g. 'RMSE（美元）' -> 'RMSE', so the summary follows the actual columns.
    weight_str = ", ".join(
        f"{col_name.split('（')[0]}:{w:.4f}"
        for col_name, w in zip(indicator_names, weight)
    )
    result_df['各指标熵权'] = weight_str
    result_df['TOPSIS综合得分'] = score.round(4)
    result_df['排名'] = (
        result_df['TOPSIS综合得分'].rank(ascending=False, method='min').astype(int)
    )

    col_order = ['模型名称', 'TOPSIS综合得分', '排名'] + indicator_names + ['各指标熵权']
    result_df = result_df[col_order]

    # Separate indicator -> weight table for display.
    weight_df = pd.DataFrame({
        '评估指标': indicator_names,
        '熵权': weight.round(4),
        '权重占比': [f"{w*100:.2f}%" for w in weight],
    })

    return result_df, weight_df


# ---------------------- 1. Load the data ----------------------
df = pd.read_csv(r'C:\Users\asus1\Desktop\PythonStudy\housing.csv')

# ---------------------- 2. Translate the English column names to Chinese ----------------------
column_mapping = {
    'longitude': '经度',
    'latitude': '纬度',
    'housing_median_age': '房屋年龄中位数',
    'total_rooms': '总房间数',
    'total_bedrooms': '总卧室数',
    'population': '人口数',
    'households': '家庭数',
    'median_income': '收入中位数',
    'median_house_value': '房屋价值中位数',
    'ocean_proximity': '海洋邻近度',
}
df_cn = df.rename(columns=column_mapping)
# ---------------------- 3. Data preprocessing ----------------------
print("\n=== 数据预处理 ===")

target_col = '房屋价值中位数'

# Numeric columns are identified before one-hot encoding so that the dummy
# columns created below are never treated as candidates for imputation.
numeric_cols = df_cn.select_dtypes(include=['int', 'float']).columns

# A row without a target value cannot be used for training or scoring;
# drop it instead of inventing a label.
n_missing_target = int(df_cn[target_col].isnull().sum())
if n_missing_target:
    df_cn = df_cn.dropna(subset=[target_col])
    print(f"{target_col} 有 {n_missing_target} 个缺失值，已删除对应样本")

# One-hot encode the categorical feature (ocean proximity).
df_cn = pd.get_dummies(df_cn, columns=['海洋邻近度'], drop_first=True)

# Separate the features from the target.
X = df_cn.drop(columns=[target_col])
y = df_cn[target_col]

# Split BEFORE imputing so that the fill values are learned from the
# training data only and no test-set statistic leaks into training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train = X_train.copy()
X_test = X_test.copy()

# Fill missing numeric feature values with the training-set median.
# NOTE: X itself is left untouched; below it is only used for its column names.
missing_cols = [
    col for col in numeric_cols
    if col != target_col and X[col].isnull().any()
]
for col in missing_cols:
    median_val = X_train[col].median()
    X_train[col] = X_train[col].fillna(median_val)
    X_test[col] = X_test[col].fillna(median_val)
    print(f"{col} 缺失值已用中位数 {median_val:.2f} 填补")
if not missing_cols and not n_missing_target:
    print("无缺失值")

print(f"训练集形状：{X_train.shape}, 测试集形状：{X_test.shape}")

# ---------------------- 4. Feature standardisation ----------------------
# Only the continuous features are standardised; the one-hot columns are
# passed through unchanged.
numeric_features = ['经度', '纬度', '房屋年龄中位数', '总房间数', '总卧室数', '人口数', '家庭数', '收入中位数']
X_train_numeric = X_train[numeric_features]
X_test_numeric = X_test[numeric_features]

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train_numeric_norm = scaler.fit_transform(X_train_numeric)
X_test_numeric_norm = scaler.transform(X_test_numeric)

# Rebuild the feature matrices: standardised numeric columns followed by
# the one-hot columns.
categorical_features = X.columns.difference(numeric_features).tolist()
X_train_categorical = X_train[categorical_features].values
X_test_categorical = X_test[categorical_features].values

X_train_norm = np.hstack([X_train_numeric_norm, X_train_categorical])
X_test_norm = np.hstack([X_test_numeric_norm, X_test_categorical])
# ---------------------- 5. Regression model training and evaluation (with training time) ----------------------
print("\n=== 模型训练与评估 ===")

# Models to compare, keyed by the display name used in the reports.
regressors = {
    '线性回归': LinearRegression(),
    '决策树回归': DecisionTreeRegressor(random_state=42, max_depth=10),
    '随机森林回归': RandomForestRegressor(random_state=42, n_estimators=100, max_depth=15),
    'XGBoost回归': XGBRegressor(random_state=42, eval_metric='rmse', n_estimators=100, max_depth=8),
    'LightGBM回归': LGBMRegressor(random_state=42, n_estimators=100, max_depth=8, verbose=-1),
}

results = []      # one metrics dict per model
preds_dict = {}   # model name -> test-set predictions

for name, model in regressors.items():
    print(f"\n正在训练：{name}")

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    model.fit(X_train_norm, y_train)
    train_time = time.perf_counter() - start_time

    # Predict on the held-out split and compute the metrics.
    y_pred = model.predict(X_test_norm)
    preds_dict[name] = y_pred

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    results.append({
        "模型名称": name,
        "R²（决定系数）": round(r2, 4),
        "RMSE（美元）": round(rmse, 2),
        "MAE（美元）": round(mae, 2),
        "训练时间（秒）": round(train_time, 4),
    })

    # Live progress line for this model.
    print(f"{name:10s} | 训练时间: {train_time:.4f}秒 | R²: {r2:.4f} | RMSE: {rmse:,.2f}美元 | MAE: {mae:,.2f}美元")

# Collect the metrics into a DataFrame and print the raw comparison table.
results_df = pd.DataFrame(results)
print("\n=== 各模型原始评估结果 ===")
print(results_df.to_string(index=False, formatters={
    'RMSE（美元）': '{:,.2f}'.format,
    'MAE（美元）': '{:,.2f}'.format,
    '训练时间（秒）': '{:.4f}'.format,
}))
# ---------------------- 6. Entropy weight + TOPSIS overall ranking ----------------------
print("\n=== 熵权法+TOPSIS综合评价结果 ===")
topsis_result, weight_df = entropy_topsis_ranking(results_df)

# Weight assigned to each evaluation indicator.
print("\n各评估指标的熵权分布：")
print(weight_df.to_string(index=False))

# Overall ranking: best score first, one row per model.
print("\n模型综合排名（按TOPSIS得分降序）：")
ranked_by_score = topsis_result.sort_values('TOPSIS综合得分', ascending=False)
topsis_result_sorted = ranked_by_score.drop_duplicates('模型名称')

display_cols = [
    '模型名称',
    'TOPSIS综合得分',
    '排名',
    'R²（决定系数）',
    'RMSE（美元）',
    'MAE（美元）',
    '训练时间（秒）',
]
topsis_display = topsis_result_sorted[display_cols].copy()

# Render the numeric columns as text for the printed table only; the sorted
# frame keeps its numeric values for the summary below.
column_formats = {
    'RMSE（美元）': '{:,.2f}'.format,
    'MAE（美元）': '{:,.2f}'.format,
    '训练时间（秒）': '{:.4f}'.format,
}
for column, to_text in column_formats.items():
    topsis_display[column] = topsis_display[column].apply(to_text)
print(topsis_display.to_string(index=False))

# Summary of the top-ranked model (first row after sorting).
top_row = topsis_result_sorted.iloc[0]
best_model = top_row['模型名称']
best_score = top_row['TOPSIS综合得分']
best_r2 = top_row['R²（决定系数）']
best_rmse = top_row['RMSE（美元）']

print(f"\n🏆 最优模型：{best_model}")
print(f"综合得分：{best_score:.4f}")
print(f"关键性能：R²={best_r2:.4f}, RMSE={best_rmse:,.2f}美元")
print(f"\n💡 提示：最优模型综合考虑了预测精度（R²、RMSE、MAE）和训练效率（训练时间），是平衡后的最佳选择")
DAY19 机器学习

LobeChat是否支持OAuth登录？第三方鉴权集成进展

如何在笔记本上运行50量子比特模拟？：不为人知的内存压缩黑科技

揭秘低代码平台事件绑定难题：3步实现无缝交互逻辑

别再盲目聚类了！空间转录组R语言最优算法选择指南

太月香学新书《中国传统香学》首发亮相

2025冬暖影展奔赴广州，以光影开启时空对话