# @浙大疏锦行
# 作业: 尝试用 SVD 处理心脏病数据集 (Assignment: apply SVD to the heart-disease dataset)
"""Heart-disease classification with TruncatedSVD dimensionality reduction.

Loads a local CSV, trains baseline Logistic Regression / Random Forest
models on the standardized features, sweeps SVD component counts to find
the best low-dimensional representation, and visualizes the comparison.
"""
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

warnings.filterwarnings('ignore')

# Fix the global seed so results are reproducible across runs.
np.random.seed(42)
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese glyphs in plots
plt.rcParams['axes.unicode_minus'] = False

# --------------------------
# 1. Load and preprocess the local dataset
# --------------------------
# Local dataset path (data.csv under E:\PythonStudy).
data_path = r'E:\PythonStudy\data.csv'
df = pd.read_csv(data_path)

# Basic dataset overview.
print("="*60)
print("数据集基本信息")
print("="*60)
print(f"数据集形状: {df.shape}")
print(f"\n数据集前5行:")
print(df.head())
print(f"\n数据集缺失值统计:")
print(df.isnull().sum())
print(f"\n数据集目标变量分布:")
print(df['target'].value_counts())

# Separate features and target (assumes the target column is named
# 'target'; adjust manually if the CSV uses a different name).
X = df.drop('target', axis=1)
y = df['target']

# Binarize the label (ensure a binary task: 0 = no disease, 1 = disease).
y = (y > 0).astype(int)

# Split FIRST, then fit the imputer/scaler on the training fold only.
# NOTE(fix): the original fitted SimpleImputer and StandardScaler on the
# full dataset before splitting, leaking test-set statistics into the
# preprocessing step. Stratified split keeps the class ratio consistent.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Median imputation is robust to outliers; standardization matters
# because SVD is sensitive to feature scale.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

print(f"\n训练集形状: {X_train.shape}, 测试集形状: {X_test.shape}")
print(f"训练集类别分布: 0={sum(y_train==0)}, 1={sum(y_train==1)}")
print(f"测试集类别分布: 0={sum(y_test==0)}, 1={sum(y_test==1)}")

# --------------------------
# 2. Baseline model (no dimensionality reduction)
# --------------------------
print("\n" + "="*60)
print("基准模型(原始特征)- 逻辑回归")
print("="*60)

# Logistic regression on the full standardized feature set.
lr_base = LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced')
lr_base.fit(X_train, y_train)

y_pred_base = lr_base.predict(X_test)
base_accuracy = accuracy_score(y_test, y_pred_base)

print(f"原始特征维度: {X_train.shape[1]}")
print(f"基准模型测试集精度: {base_accuracy:.4f}")
print("\n分类报告:")
print(classification_report(y_test, y_pred_base))

cm_base = confusion_matrix(y_test, y_pred_base)
print("混淆矩阵:")
print(cm_base)

# --------------------------
# 3. SVD reduction sweep + model training
# --------------------------
print("\n" + "="*60)
print("SVD降维后模型训练")
print("="*60)

# Candidate component counts: at most 15, and strictly fewer than the
# original feature count (TruncatedSVD requires n_components < n_features).
max_components = min(15, X_train.shape[1] - 1)
n_components_list = list(range(2, max_components + 1))
svd_accuracies = []
explained_variances = []

for n in n_components_list:
    # Project onto the top-n singular directions.
    svd = TruncatedSVD(n_components=n, random_state=42)
    X_train_svd = svd.fit_transform(X_train)
    X_test_svd = svd.transform(X_test)

    # Same hyperparameters as the baseline for a fair comparison.
    lr_svd = LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced')
    lr_svd.fit(X_train_svd, y_train)

    y_pred_svd = lr_svd.predict(X_test_svd)
    accuracy = accuracy_score(y_test, y_pred_svd)
    svd_accuracies.append(accuracy)

    # Cumulative explained-variance ratio = fraction of information kept.
    total_var = np.sum(svd.explained_variance_ratio_)
    explained_variances.append(total_var)

    print(f"维度={n:2d} | 精度={accuracy:.4f} | 累计解释方差={total_var:.4f}")

# Best dimension: highest accuracy; argmax breaks ties toward the
# smallest dimension because the list is in increasing order.
best_idx = np.argmax(svd_accuracies)
best_n = n_components_list[best_idx]
best_accuracy = svd_accuracies[best_idx]
best_var = explained_variances[best_idx]

print(f"\n最优SVD维度: {best_n}")
print(f"最优维度精度: {best_accuracy:.4f}")
print(f"精度变化: {best_accuracy - base_accuracy:.4f} ({'提升' if best_accuracy > base_accuracy else '下降'})")
print(f"最优维度累计解释方差: {best_var:.4f}")

# --------------------------
# 4. Detailed evaluation of the best SVD model
# --------------------------
print("\n" + "="*60)
print(f"最优SVD模型(维度={best_n})详细评估")
print("="*60)

# Refit at the best dimension so downstream sections can reuse the arrays.
svd_best = TruncatedSVD(n_components=best_n, random_state=42)
X_train_best = svd_best.fit_transform(X_train)
X_test_best = svd_best.transform(X_test)

lr_best = LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced')
lr_best.fit(X_train_best, y_train)
y_pred_best = lr_best.predict(X_test_best)

print("分类报告:")
print(classification_report(y_test, y_pred_best))

cm_best = confusion_matrix(y_test, y_pred_best)
print("混淆矩阵:")
print(cm_best)

# --------------------------
# 5. Visualization
# --------------------------
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# Plot 1: accuracy vs. number of SVD components.
ax1.plot(n_components_list, svd_accuracies, 'b-o', linewidth=2, markersize=6, label='SVD降维精度')
ax1.axhline(y=base_accuracy, color='r', linestyle='--', linewidth=2, label='基准模型精度')
ax1.axvline(x=best_n, color='g', linestyle=':', linewidth=2, label=f'最优维度={best_n}')
ax1.set_xlabel('SVD降维后的特征维度', fontsize=11)
ax1.set_ylabel('预测精度', fontsize=11)
ax1.set_title('SVD维度与预测精度关系', fontsize=12, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)
ax1.set_xticks(n_components_list)

# Plot 2: cumulative explained variance vs. number of components.
ax2.plot(n_components_list, explained_variances, 'orange', marker='s', linewidth=2, markersize=6)
ax2.axvline(x=best_n, color='g', linestyle=':', linewidth=2, label=f'最优维度={best_n}')
ax2.set_xlabel('SVD降维后的特征维度', fontsize=11)
ax2.set_ylabel('累计解释方差比', fontsize=11)
ax2.set_title('SVD维度与累计解释方差关系', fontsize=12, fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3)
ax2.set_xticks(n_components_list)

# Plot 3: baseline confusion-matrix heat map.
im1 = ax3.imshow(cm_base, interpolation='nearest', cmap=plt.cm.Blues)
ax3.set_title('基准模型混淆矩阵', fontsize=12, fontweight='bold')
ax3.set_xlabel('预测标签', fontsize=11)
ax3.set_ylabel('真实标签', fontsize=11)
ax3.set_xticks([0, 1])
ax3.set_yticks([0, 1])
ax3.set_xticklabels(['无疾病', '有疾病'])
ax3.set_yticklabels(['无疾病', '有疾病'])
# Annotate cell counts; white text on dark cells for contrast.
for i in range(2):
    for j in range(2):
        ax3.text(j, i, cm_base[i, j], ha='center', va='center',
                 color='white' if cm_base[i, j] > cm_base.max()/2 else 'black')

# Plot 4: best-SVD-model confusion-matrix heat map.
im2 = ax4.imshow(cm_best, interpolation='nearest', cmap=plt.cm.Greens)
ax4.set_title(f'最优SVD模型混淆矩阵(维度={best_n})', fontsize=12, fontweight='bold')
ax4.set_xlabel('预测标签', fontsize=11)
ax4.set_ylabel('真实标签', fontsize=11)
ax4.set_xticks([0, 1])
ax4.set_yticks([0, 1])
ax4.set_xticklabels(['无疾病', '有疾病'])
ax4.set_yticklabels(['无疾病', '有疾病'])
for i in range(2):
    for j in range(2):
        ax4.text(j, i, cm_best[i, j], ha='center', va='center',
                 color='white' if cm_best[i, j] > cm_best.max()/2 else 'black')

plt.tight_layout()
plt.savefig(r'E:\PythonStudy\SVD_heart_disease_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# --------------------------
# 6. Extra: Random Forest comparison (robustness of the SVD reduction)
# --------------------------
print("\n" + "="*60)
print("随机森林模型对比(验证SVD降维鲁棒性)")
print("="*60)

# Baseline Random Forest on the original standardized features.
rf_base = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_base.fit(X_train, y_train)
y_pred_rf_base = rf_base.predict(X_test)
rf_base_acc = accuracy_score(y_test, y_pred_rf_base)

# Random Forest on the best SVD projection.
rf_svd = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_svd.fit(X_train_best, y_train)
y_pred_rf_svd = rf_svd.predict(X_test_best)
rf_svd_acc = accuracy_score(y_test, y_pred_rf_svd)

print(f"随机森林-原始特征精度: {rf_base_acc:.4f}")
print(f"随机森林-SVD降维(维度={best_n})精度: {rf_svd_acc:.4f}")
print(f"精度变化: {rf_svd_acc - rf_base_acc:.4f}")

# --------------------------
# Final summary
# --------------------------
print("\n" + "="*60)
print("最终结果总结")
print("="*60)
print(f"1. 原始特征维度: {X_train.shape[1]}")
print(f"2. 最优SVD降维维度: {best_n}")
print(f"3. 逻辑回归模型:")
print(f"   - 原始特征精度: {base_accuracy:.4f}")
print(f"   - SVD降维精度: {best_accuracy:.4f}")
print(f"   - 精度变化: {best_accuracy - base_accuracy:.4f}")
print(f"4. 随机森林模型:")
print(f"   - 原始特征精度: {rf_base_acc:.4f}")
print(f"   - SVD降维精度: {rf_svd_acc:.4f}")
print(f"   - 精度变化: {rf_svd_acc - rf_base_acc:.4f}")
print(f"5. 最优SVD维度累计解释方差: {best_var:.4f}")
print(f"\n结论: SVD降维在{'保留' if best_accuracy >= base_accuracy-0.02 else '损失少量'}精度的前提下,将特征维度从{X_train.shape[1]}降至{best_n},有效简化了模型复杂度。")