Machine Learning Project 2: Parkinson's Disease Detection
Contents
Download the data
1. Importing the Required Packages
2. Data Loading
3. Feature Engineering
4. Building the Model
5. Evaluation and Visualization
6. Program Flow
7. Complete Code
1. Importing the Required Packages
# Imports
import numpy as np   # fundamental numerical computing library
import pandas as pd  # data handling library
from sklearn.preprocessing import MinMaxScaler  # feature scaling
from xgboost import XGBClassifier  # XGBoost classifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold  # data splitting and hyperparameter tuning
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, roc_auc_score)  # evaluation metrics
import matplotlib.pyplot as plt  # plotting
import seaborn as sns  # statistical visualization
import joblib  # model persistence
from datetime import datetime  # timestamp generation
2. Data Loading
def load_data(path):
    """Load and preprocess the data."""
    df = pd.read_csv(path)  # read the CSV file
    # Data-quality assertion
    assert 'status' in df.columns, "The data must contain a 'status' column"
    # Print key statistics (for debugging)
    print(f"Class distribution:\n{df['status'].value_counts()}")
    print(f"\nMissing values:\n{df.isnull().sum()}")
    # Fill missing values with the median (more robust to outliers than the mean);
    # numeric_only=True skips non-numeric columns such as 'name'
    df = df.fillna(df.median(numeric_only=True))
    return df
3. Feature Engineering
def feature_engineering(df):
    """Feature processing."""
    # Drop the label column and the irrelevant patient-name column
    features = df.drop(['status', 'name'], axis=1)
    labels = df['status'].values
    # MinMax scaling to the [-1, 1] range
    scaler = MinMaxScaler(feature_range=(-1, 1))
    features_scaled = scaler.fit_transform(features)
    return features_scaled, labels, scaler  # return the scaler for later inference
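The scaler is returned alongside the features because new samples must be transformed with the statistics fitted on the training data, never re-fitted. A minimal sketch of that inference-time reuse, assuming df has already been loaded with load_data (one row of that DataFrame stands in for a new patient record, purely for illustration):

# Sketch: reusing the fitted scaler on unseen data (illustrative only)
X, y, scaler = feature_engineering(df)
new_record = df.drop(['status', 'name'], axis=1).iloc[[0]]  # double brackets keep a 2-D shape
new_record_scaled = scaler.transform(new_record)            # transform only, do not fit again
print(new_record_scaled.shape)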
4. Building the Model
def optimize_model(X_train, y_train):
    """Tune XGBoost with a grid search."""
    # Expanded parameter grid (based on the literature and experiments)
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],  # finer-grained learning rates
        'max_depth': [3, 5, 7],              # tree-depth range
        'min_child_weight': [1, 3],          # minimum child weight
        'gamma': [0, 0.1],                   # minimum loss reduction for a split
        'subsample': [0.7, 0.9],             # row-sampling ratio
        'colsample_bytree': [0.7, 0.9],      # feature-sampling ratio
        'reg_alpha': [0, 0.1],               # L1 regularization
        'reg_lambda': [0.1, 1],              # L2 regularization
        'n_estimators': [100, 200]           # number of trees
    }
    # Stratified K-fold cross-validation (preserves the class distribution)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Base model (early stopping is omitted: GridSearchCV calls fit() without an
    # eval_set, so early_stopping_rounds would raise an error)
    base_model = XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',  # use log loss
        random_state=39
    )
    # Grid-search configuration
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=cv,
        scoring='roc_auc',  # optimize for AUC
        n_jobs=-1,          # use all CPU cores
        verbose=1           # print progress
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_
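The grid above contains 3 × 3 × 2^7 = 1,152 parameter combinations, so 5-fold cross-validation amounts to 5,760 model fits. That is workable on a dataset of this size, but if the search becomes too slow, a randomized search over the same grid is a common alternative. The helper below is only a sketch of that idea (optimize_model_random is a hypothetical name, not part of the original program):

from sklearn.model_selection import RandomizedSearchCV

def optimize_model_random(X_train, y_train, param_grid, n_iter=60):
    """Sketch: evaluate a random sample of the grid instead of all 1,152 combinations."""
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    base_model = XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=39
    )
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_grid,  # a plain grid is a valid distribution spec
        n_iter=n_iter,                   # number of sampled combinations
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_params_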
5. Evaluation and Visualization
def evaluate_model(model, X_test, y_test, feature_names):
    """Evaluate the model and visualize the results."""
    # Generate predictions
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]  # probability of the positive class
    # Print the classification report
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))
    # Print the AUC score
    print(f"\nAUC score: {roc_auc_score(y_test, y_proba):.4f}")
    # Confusion-matrix heatmap
    plt.figure(figsize=(6, 4))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Healthy', "Parkinson's"],
                yticklabels=['Healthy', "Parkinson's"])
    plt.title('Confusion Matrix')
    plt.show()
    # Feature-importance plot (feature_names is passed in instead of relying on a global df)
    plt.figure(figsize=(10, 6))
    feat_imp = pd.Series(model.feature_importances_, index=feature_names)
    feat_imp.nlargest(15).plot(kind='barh')
    plt.title('Top 15 Feature Importances')
    plt.tight_layout()
    plt.show()
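Since the grid search selects models by ROC AUC, it can also help to plot the ROC curve behind that single number. A small sketch of such a helper (plot_roc is a hypothetical addition, not part of the original script):

from sklearn.metrics import roc_curve

def plot_roc(model, X_test, y_test):
    """Sketch: plot the ROC curve corresponding to the AUC score printed above."""
    y_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f'AUC = {roc_auc_score(y_test, y_proba):.4f}')
    plt.plot([0, 1], [0, 1], linestyle='--', color='grey')  # chance level
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()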
6. Program Flow
# Main program flow
if __name__ == "__main__":
    # Load the data
    df = load_data('./data/parkinsons.data')
    # Feature engineering
    X, y, scaler = feature_engineering(df)
    # Train/test split (stratified sampling)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,  # preserve the class ratio
        random_state=39
    )
    # Hyperparameter tuning
    print("\nStarting hyperparameter search...")
    best_model, best_params = optimize_model(X_train, y_train)
    print(f"\nBest parameters: {best_params}")
    # Model evaluation
    feature_names = df.drop(['status', 'name'], axis=1).columns
    evaluate_model(best_model, X_test, y_test, feature_names)
    # Save the model (with a timestamp)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    model_path = f"parkinson_model_v{timestamp}.pkl"
    joblib.dump({'model': best_model, 'scaler': scaler}, model_path)
    print(f"\nModel saved to: {model_path}")
7. Complete Code
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from datetime import datetime
# 1. Data loading and preprocessing
def load_data(path):
    """Load and preprocess the data."""
    df = pd.read_csv(path)
    # Data-quality checks
    assert 'status' in df.columns, "The data must contain a 'status' column"
    print(f"Class distribution:\n{df['status'].value_counts()}")
    print(f"\nMissing values:\n{df.isnull().sum()}")
    # Identify numeric and non-numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
    print(f"\nNumeric columns: {list(numeric_cols)}")
    print(f"Non-numeric columns: {list(non_numeric_cols)}")
    # Fill the median for numeric columns only
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
    return df
# 2. Feature engineering
def feature_engineering(df):
    """Feature processing."""
    # Separate features and label
    features = df.drop(['status', 'name'], axis=1)  # drop the irrelevant name column
    labels = df['status'].values
    # Feature scaling
    scaler = MinMaxScaler(feature_range=(-1, 1))
    features_scaled = scaler.fit_transform(features)
    return features_scaled, labels, scaler
# 3. Model tuning
def optimize_model(X_train, y_train):
    """Tune XGBoost with a grid search."""
    # Refined parameter grid (based on the literature and experiments)
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],  # finer-grained learning rates
        'max_depth': [3, 5, 7],
        'min_child_weight': [1, 3],
        'gamma': [0, 0.1],  # gamma controls the minimum loss reduction for a split
        'subsample': [0.7, 0.9],
        'colsample_bytree': [0.7, 0.9],
        'reg_alpha': [0, 0.1],
        'reg_lambda': [0.1, 1],
        'n_estimators': [100, 200]
    }
    # Stratified K-fold cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Base model (no early stopping: GridSearchCV provides no eval_set at fit time)
    base_model = XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=39
    )
    # Grid-search configuration
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=cv,
        scoring='roc_auc',  # use AUC as the selection metric
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_
# 4. Evaluation and visualization
def evaluate_model(model, X_test, y_test, feature_names):
    """Evaluate the model and visualize the results."""
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))
    print(f"\nAUC score: {roc_auc_score(y_test, y_proba):.4f}")
    # Confusion-matrix heatmap
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.show()
    # Feature importances
    plt.figure(figsize=(10, 6))
    feat_imp = pd.Series(model.feature_importances_, index=feature_names)
    feat_imp.nlargest(15).plot(kind='barh')
    plt.title('Top 15 Feature Importances')
    plt.show()
# Main flow
if __name__ == "__main__":
    # Load the data
    df = load_data('./data/parkinsons.data')
    # Feature engineering
    X, y, scaler = feature_engineering(df)
    # Train/test split (stratified sampling)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,
        random_state=39
    )
    # Hyperparameter tuning
    print("\nStarting hyperparameter search...")
    best_model, best_params = optimize_model(X_train, y_train)
    print(f"\nBest parameters: {best_params}")
    # Model evaluation
    feature_names = df.drop(['status', 'name'], axis=1).columns
    evaluate_model(best_model, X_test, y_test, feature_names)
    # Save the model (with a timestamp)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    model_path = f"parkinson_model_v{timestamp}.pkl"
    joblib.dump({'model': best_model, 'scaler': scaler}, model_path)
    print(f"\nModel saved to: {model_path}")