# @浙大疏锦行
# 作业: 尝试用 SVD 处理心脏病数据集 (Assignment: apply SVD to the heart-disease dataset)
"""Heart-disease classification with TruncatedSVD dimensionality reduction.

Loads a local CSV, trains baseline Logistic Regression / Random Forest
models on the standardized features, sweeps SVD component counts to find
the best low-dimensional representation, and visualizes the comparison.
"""
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

warnings.filterwarnings('ignore')

# Fix the global seed so results are reproducible across runs.
np.random.seed(42)
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese glyphs in plots
plt.rcParams['axes.unicode_minus'] = False

# --------------------------
# 1. Load and preprocess the local dataset
# --------------------------
# Local dataset path (data.csv under E:\PythonStudy).
data_path = r'E:\PythonStudy\data.csv'
df = pd.read_csv(data_path)

# Basic dataset overview.
print("="*60)
print("数据集基本信息")
print("="*60)
print(f"数据集形状: {df.shape}")
print(f"\n数据集前5行:")
print(df.head())
print(f"\n数据集缺失值统计:")
print(df.isnull().sum())
print(f"\n数据集目标变量分布:")
print(df['target'].value_counts())

# Separate features and target (assumes the target column is named
# 'target'; adjust manually if the CSV uses a different name).
X = df.drop('target', axis=1)
y = df['target']

# Binarize the label (ensure a binary task: 0 = no disease, 1 = disease).
y = (y > 0).astype(int)

# Split FIRST, then fit the imputer/scaler on the training fold only.
# NOTE(fix): the original fitted SimpleImputer and StandardScaler on the
# full dataset before splitting, leaking test-set statistics into the
# preprocessing step. Stratified split keeps the class ratio consistent.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Median imputation is robust to outliers; standardization matters
# because SVD is sensitive to feature scale.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

print(f"\n训练集形状: {X_train.shape}, 测试集形状: {X_test.shape}")
print(f"训练集类别分布: 0={sum(y_train==0)}, 1={sum(y_train==1)}")
print(f"测试集类别分布: 0={sum(y_test==0)}, 1={sum(y_test==1)}")

# --------------------------
# 2. Baseline model (no dimensionality reduction)
# --------------------------
print("\n" + "="*60)
print("基准模型(原始特征)- 逻辑回归")
print("="*60)

# Logistic regression on the full standardized feature set.
lr_base = LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced')
lr_base.fit(X_train, y_train)

y_pred_base = lr_base.predict(X_test)
base_accuracy = accuracy_score(y_test, y_pred_base)

print(f"原始特征维度: {X_train.shape[1]}")
print(f"基准模型测试集精度: {base_accuracy:.4f}")
print("\n分类报告:")
print(classification_report(y_test, y_pred_base))

cm_base = confusion_matrix(y_test, y_pred_base)
print("混淆矩阵:")
print(cm_base)

# --------------------------
# 3. SVD reduction sweep + model training
# --------------------------
print("\n" + "="*60)
print("SVD降维后模型训练")
print("="*60)

# Candidate component counts: at most 15, and strictly fewer than the
# original feature count (TruncatedSVD requires n_components < n_features).
max_components = min(15, X_train.shape[1] - 1)
n_components_list = list(range(2, max_components + 1))
svd_accuracies = []
explained_variances = []

for n in n_components_list:
    # Project onto the top-n singular directions.
    svd = TruncatedSVD(n_components=n, random_state=42)
    X_train_svd = svd.fit_transform(X_train)
    X_test_svd = svd.transform(X_test)

    # Same hyperparameters as the baseline for a fair comparison.
    lr_svd = LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced')
    lr_svd.fit(X_train_svd, y_train)

    y_pred_svd = lr_svd.predict(X_test_svd)
    accuracy = accuracy_score(y_test, y_pred_svd)
    svd_accuracies.append(accuracy)

    # Cumulative explained-variance ratio = fraction of information kept.
    total_var = np.sum(svd.explained_variance_ratio_)
    explained_variances.append(total_var)

    print(f"维度={n:2d} | 精度={accuracy:.4f} | 累计解释方差={total_var:.4f}")

# Best dimension: highest accuracy; argmax breaks ties toward the
# smallest dimension because the list is in increasing order.
best_idx = np.argmax(svd_accuracies)
best_n = n_components_list[best_idx]
best_accuracy = svd_accuracies[best_idx]
best_var = explained_variances[best_idx]

print(f"\n最优SVD维度: {best_n}")
print(f"最优维度精度: {best_accuracy:.4f}")
print(f"精度变化: {best_accuracy - base_accuracy:.4f} ({'提升' if best_accuracy > base_accuracy else '下降'})")
print(f"最优维度累计解释方差: {best_var:.4f}")

# --------------------------
# 4. Detailed evaluation of the best SVD model
# --------------------------
print("\n" + "="*60)
print(f"最优SVD模型(维度={best_n})详细评估")
print("="*60)

# Refit at the best dimension so downstream sections can reuse the arrays.
svd_best = TruncatedSVD(n_components=best_n, random_state=42)
X_train_best = svd_best.fit_transform(X_train)
X_test_best = svd_best.transform(X_test)

lr_best = LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced')
lr_best.fit(X_train_best, y_train)
y_pred_best = lr_best.predict(X_test_best)

print("分类报告:")
print(classification_report(y_test, y_pred_best))

cm_best = confusion_matrix(y_test, y_pred_best)
print("混淆矩阵:")
print(cm_best)

# --------------------------
# 5. Visualization
# --------------------------
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# Plot 1: accuracy vs. number of SVD components.
ax1.plot(n_components_list, svd_accuracies, 'b-o', linewidth=2, markersize=6, label='SVD降维精度')
ax1.axhline(y=base_accuracy, color='r', linestyle='--', linewidth=2, label='基准模型精度')
ax1.axvline(x=best_n, color='g', linestyle=':', linewidth=2, label=f'最优维度={best_n}')
ax1.set_xlabel('SVD降维后的特征维度', fontsize=11)
ax1.set_ylabel('预测精度', fontsize=11)
ax1.set_title('SVD维度与预测精度关系', fontsize=12, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)
ax1.set_xticks(n_components_list)

# Plot 2: cumulative explained variance vs. number of components.
ax2.plot(n_components_list, explained_variances, 'orange', marker='s', linewidth=2, markersize=6)
ax2.axvline(x=best_n, color='g', linestyle=':', linewidth=2, label=f'最优维度={best_n}')
ax2.set_xlabel('SVD降维后的特征维度', fontsize=11)
ax2.set_ylabel('累计解释方差比', fontsize=11)
ax2.set_title('SVD维度与累计解释方差关系', fontsize=12, fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3)
ax2.set_xticks(n_components_list)

# Plot 3: baseline confusion-matrix heat map.
im1 = ax3.imshow(cm_base, interpolation='nearest', cmap=plt.cm.Blues)
ax3.set_title('基准模型混淆矩阵', fontsize=12, fontweight='bold')
ax3.set_xlabel('预测标签', fontsize=11)
ax3.set_ylabel('真实标签', fontsize=11)
ax3.set_xticks([0, 1])
ax3.set_yticks([0, 1])
ax3.set_xticklabels(['无疾病', '有疾病'])
ax3.set_yticklabels(['无疾病', '有疾病'])
# Annotate cell counts; white text on dark cells for contrast.
for i in range(2):
    for j in range(2):
        ax3.text(j, i, cm_base[i, j], ha='center', va='center',
                 color='white' if cm_base[i, j] > cm_base.max()/2 else 'black')

# Plot 4: best-SVD-model confusion-matrix heat map.
im2 = ax4.imshow(cm_best, interpolation='nearest', cmap=plt.cm.Greens)
ax4.set_title(f'最优SVD模型混淆矩阵(维度={best_n})', fontsize=12, fontweight='bold')
ax4.set_xlabel('预测标签', fontsize=11)
ax4.set_ylabel('真实标签', fontsize=11)
ax4.set_xticks([0, 1])
ax4.set_yticks([0, 1])
ax4.set_xticklabels(['无疾病', '有疾病'])
ax4.set_yticklabels(['无疾病', '有疾病'])
for i in range(2):
    for j in range(2):
        ax4.text(j, i, cm_best[i, j], ha='center', va='center',
                 color='white' if cm_best[i, j] > cm_best.max()/2 else 'black')

plt.tight_layout()
plt.savefig(r'E:\PythonStudy\SVD_heart_disease_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# --------------------------
# 6. Extra: Random Forest comparison (robustness of the SVD reduction)
# --------------------------
print("\n" + "="*60)
print("随机森林模型对比(验证SVD降维鲁棒性)")
print("="*60)

# Baseline Random Forest on the original standardized features.
rf_base = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_base.fit(X_train, y_train)
y_pred_rf_base = rf_base.predict(X_test)
rf_base_acc = accuracy_score(y_test, y_pred_rf_base)

# Random Forest on the best SVD projection.
rf_svd = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_svd.fit(X_train_best, y_train)
y_pred_rf_svd = rf_svd.predict(X_test_best)
rf_svd_acc = accuracy_score(y_test, y_pred_rf_svd)

print(f"随机森林-原始特征精度: {rf_base_acc:.4f}")
print(f"随机森林-SVD降维(维度={best_n})精度: {rf_svd_acc:.4f}")
print(f"精度变化: {rf_svd_acc - rf_base_acc:.4f}")

# --------------------------
# Final summary
# --------------------------
print("\n" + "="*60)
print("最终结果总结")
print("="*60)
print(f"1. 原始特征维度: {X_train.shape[1]}")
print(f"2. 最优SVD降维维度: {best_n}")
print(f"3. 逻辑回归模型:")
print(f"   - 原始特征精度: {base_accuracy:.4f}")
print(f"   - SVD降维精度: {best_accuracy:.4f}")
print(f"   - 精度变化: {best_accuracy - base_accuracy:.4f}")
print(f"4. 随机森林模型:")
print(f"   - 原始特征精度: {rf_base_acc:.4f}")
print(f"   - SVD降维精度: {rf_svd_acc:.4f}")
print(f"   - 精度变化: {rf_svd_acc - rf_base_acc:.4f}")
print(f"5. 最优SVD维度累计解释方差: {best_var:.4f}")
print(f"\n结论: SVD降维在{'保留' if best_accuracy >= base_accuracy-0.02 else '损失少量'}精度的前提下,将特征维度从{X_train.shape[1]}降至{best_n},有效简化了模型复杂度。")