# Imports used in this section (add only if not already imported earlier in the notebook)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor

# Define the ModelComparisonPlot class
class ModelComparisonPlot:
    def __init__(self, model_name):
        self.model_name = model_name

    def plot_comparison(self, y_val, y_pred, mse, mae, r2):
        # Create a figure with two subplots
        fig, axes = plt.subplots(1, 2, figsize=(11, 5))

        # Plot the predicted vs true values
        sns.regplot(x=y_val, y=y_pred, color='blue', scatter_kws={'alpha': 0.5}, ax=axes[0])
        axes[0].plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], 'k--', lw=2)
        axes[0].set_xlabel('True values', fontsize=12)
        axes[0].set_ylabel('Predicted values', fontsize=12)
        axes[0].set_title('Predicted vs true values')
        axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

        # Plot the residuals vs predicted values
        residuals = y_val - y_pred
        sns.residplot(x=y_pred, y=residuals, color='blue', scatter_kws={'alpha': 0.5}, ax=axes[1])
        axes[1].plot([y_val.min(), y_val.max()], [0, 0], 'k--', lw=2)
        axes[1].set_xlabel('Predicted values', fontsize=12)
        axes[1].set_ylabel('Residuals', fontsize=12)
        axes[1].set_title('Residual plot', fontsize=15)
        axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

        # Add a title to the figure
        fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(self.model_name), fontsize=15)

        # Adjust the spacing between subplots
        plt.subplots_adjust(wspace=0.4)

        # Display the figure with the title
        plt.show()
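The per-model plotting code further below builds a similar two-panel figure by hand; the helper above can also be called directly. A minimal usage sketch, assuming y_test, y_pred, mse, mae, and r2 have already been computed for some fitted model (the name string is just a label for the figure title):

# Sketch: reuse the helper instead of repeating the plotting code
plot = ModelComparisonPlot('SomeRegressor')  # any label; it appears in the figure title
plot.plot_comparison(y_test, y_pred, mse, mae, r2)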
One-hot encoding
# Assuming 'Entity' is a categorical variable
X_encoded = pd.get_dummies(df, columns=['Entity'], drop_first=True)
Define the target variable
# Assuming 'GDP per capita' is the target variable
X_encoded = X_encoded.fillna(0)  # Replace with your preferred imputation method
X = X_encoded.drop('GDP per capita', axis=1)
y = X_encoded['GDP per capita']
Split into training and test sets
# Assuming X and y are features and labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Machine learning code
ExtraTreesRegressor
# Create an Extra Trees Regressor model
model_ETR = ExtraTreesRegressor(max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300)

# Fit the model
model_ETR.fit(X_train, y_train)

# Make predictions
y_pred = model_ETR.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f"Model: {type(model_ETR).__name__}, mse: {mse}")
print(f"Model: {type(model_ETR).__name__}, mae: {mae}")
print(f"Model: {type(model_ETR).__name__}, r2: {r2}")
model_ETR_plot = ModelComparisonPlot('ExtraTreesRegressor')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")

# Plot the predicted vs true values
sns.regplot(x=y_test, y=y_pred, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals = y_test - y_pred
sns.residplot(x=y_pred, y=residuals, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(model_ETR_plot.model_name), fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()
DecisionTreeRegressor
# Create a decision tree regression model
dt_model = DecisionTreeRegressor()

# Fit the model
dt_model.fit(X_train, y_train)

# Make predictions
predictions = dt_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# Print evaluation metrics
print(f"Model: {type(dt_model).__name__}, mse: {mse}")
print(f"Model: {type(dt_model).__name__}, mae: {mae}")
print(f"Model: {type(dt_model).__name__}, r2: {r2}")
dt_model_plot = ModelComparisonPlot('DecisionTreeRegressor')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")

# Plot the predicted vs true values
sns.regplot(x=y_test, y=predictions, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals = y_test - predictions
sns.residplot(x=predictions, y=residuals, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(dt_model_plot.model_name), fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()
LinearRegression
# Create a linear regression model
linear_model = LinearRegression()

# Fit the model
linear_model.fit(X_train, y_train)

# Make predictions
predictions_linear = linear_model.predict(X_test)

# Evaluate the model
mse_linear = mean_squared_error(y_test, predictions_linear)
mae_linear = mean_absolute_error(y_test, predictions_linear)
r2_linear = r2_score(y_test, predictions_linear)

# Print evaluation metrics
print(f"Model: {type(linear_model).__name__}, mse: {mse_linear}")
print(f"Model: {type(linear_model).__name__}, mae: {mae_linear}")
print(f"Model: {type(linear_model).__name__}, r2: {r2_linear}")
linear_model_plot = ModelComparisonPlot('LinearRegression')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")

# Plot the predicted vs true values
sns.regplot(x=y_test, y=predictions_linear, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals_linear = y_test - predictions_linear
sns.residplot(x=predictions_linear, y=residuals_linear, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(linear_model_plot.model_name), fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()
KNeighborsRegressor
# Create a KNN regression model
knn_model = KNeighborsRegressor()

# Fit the model
knn_model.fit(X_train, y_train)

# Make predictions
predictions_knn = knn_model.predict(X_test)

# Evaluate the model
mse_knn = mean_squared_error(y_test, predictions_knn)
mae_knn = mean_absolute_error(y_test, predictions_knn)
r2_knn = r2_score(y_test, predictions_knn)

# Print evaluation metrics
print(f"Model: {type(knn_model).__name__}, mse: {mse_knn}")
print(f"Model: {type(knn_model).__name__}, mae: {mae_knn}")
print(f"Model: {type(knn_model).__name__}, r2: {r2_knn}")
knn_model_plot = ModelComparisonPlot('KNeighborsRegressor')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")

# Plot the predicted vs true values
sns.regplot(x=y_test, y=predictions_knn, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals_knn = y_test - predictions_knn
sns.residplot(x=predictions_knn, y=residuals_knn, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(knn_model_plot.model_name), fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()
XGBoost
# Convert 'Density' column to numeric
X_train['Density'] = pd.to_numeric(X_train['Density'], errors='coerce')
X_test['Density'] = pd.to_numeric(X_test['Density'], errors='coerce')

# Drop rows with missing values after conversion, and keep the labels aligned with the remaining rows
X_train = X_train.dropna()
X_test = X_test.dropna()
y_train = y_train.loc[X_train.index]
y_test = y_test.loc[X_test.index]

# Create an XGBoost regression model
xgb_model = XGBRegressor()

# Fit the model
xgb_model.fit(X_train, y_train)

# Make predictions
predictions_xgb = xgb_model.predict(X_test)

# Evaluate the model
mse_xgb = mean_squared_error(y_test, predictions_xgb)
mae_xgb = mean_absolute_error(y_test, predictions_xgb)
r2_xgb = r2_score(y_test, predictions_xgb)

# Print evaluation metrics
print(f"Model: {type(xgb_model).__name__}, mse: {mse_xgb}")
print(f"Model: {type(xgb_model).__name__}, mae: {mae_xgb}")
print(f"Model: {type(xgb_model).__name__}, r2: {r2_xgb}")
xgb_model_plot = ModelComparisonPlot('XGBRegressor')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")

# Plot the predicted vs true values
sns.regplot(x=y_test, y=predictions_xgb, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals_xgb = y_test - predictions_xgb
sns.residplot(x=predictions_xgb, y=residuals_xgb, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(xgb_model_plot.model_name), fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()
Gaussian Naive Bayes
# Convert continuous labels to binary categories
y_train_binary = (y_train > y_train.mean()).astype(int)
y_test_binary = (y_test > y_train.mean()).astype(int)

# Create a Naive Bayes model
nb_model = GaussianNB()

# Fit the model
nb_model.fit(X_train, y_train_binary)

# Make predictions
predictions = nb_model.predict(X_test)
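GaussianNB is a classifier, so the regression metrics used for the models above do not apply directly. A minimal evaluation sketch for the binarized labels, using accuracy_score from sklearn.metrics:

# Evaluate the classifier on the binarized test labels
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test_binary, predictions)
print(f"Model: {type(nb_model).__name__}, accuracy: {accuracy}")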
X_encoded['Density'] = pd.to_numeric(X_encoded['Density'], errors='coerce')
X_encoded = X_encoded.fillna(0)

# Drop 'gdp_growth' as before
X = X_encoded.drop('gdp_growth', axis=1)
y = X_encoded['gdp_growth']

# Assuming X and y are your features and labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of algorithms to check
algorithms = [LinearRegression(), DecisionTreeRegressor(), KNeighborsRegressor(), XGBRegressor()]

best_mse = float('inf')
best_model = None

# Loop through each algorithm
for model in algorithms:
    # Fit the model (cross_val_score below refits on each fold; this keeps best_model fitted for later use)
    model.fit(X_train, y_train)

    # Evaluate the model using cross-validation with mean squared error
    mse_scores = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    mean_mse = np.mean(mse_scores)

    # Print the cross-validation mean squared error
    print(f"{model.__class__.__name__} - Cross-Validation MSE: {mean_mse}")

    # Update the best model if the current model has lower mean squared error
    if mean_mse < best_mse:
        best_mse = mean_mse
        best_model = model

# Print the best model and its mean squared error
print("\nBest Model:")
print(best_model)
print("Best Cross-Validation MSE:", best_mse)

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")

# Plot the predicted vs true values
sns.regplot(x=y_test_binary, y=predictions, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test_binary.min(), y_test_binary.max()], [y_test_binary.min(), y_test_binary.max()], 'k--', lw=2)
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals = y_test_binary - predictions
sns.residplot(x=predictions, y=residuals, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test_binary.min(), y_test_binary.max()], [0, 0], 'k--', lw=2)
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\nGaussianNB', fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()