# Imports used in this section (add only if not already imported earlier in the notebook)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor

# Define the ModelComparisonPlot class
class ModelComparisonPlot:
    def __init__(self, model_name):
        self.model_name = model_name

    def plot_comparison(self, y_val, y_pred, mse, mae, r2):
        # Create a figure with two subplots
        fig, axes = plt.subplots(1, 2, figsize=(11, 5))

        # Plot the predicted vs true values
        sns.regplot(x=y_val, y=y_pred, color='blue', scatter_kws={'alpha': 0.5}, ax=axes[0])
        axes[0].plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], 'k--', lw=2)
        axes[0].set_xlabel('True values', fontsize=12)
        axes[0].set_ylabel('Predicted values', fontsize=12)
        axes[0].set_title('Predicted vs true values')
        axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

        # Plot the residuals vs predicted values
        residuals = y_val - y_pred
        sns.residplot(x=y_pred, y=residuals, color='blue', scatter_kws={'alpha': 0.5}, ax=axes[1])
        axes[1].plot([y_val.min(), y_val.max()], [0, 0], 'k--', lw=2)
        axes[1].set_xlabel('Predicted values', fontsize=12)
        axes[1].set_ylabel('Residuals', fontsize=12)
        axes[1].set_title('Residual plot', fontsize=15)
        axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

        # Add a title to the figure
        fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(self.model_name), fontsize=15)

        # Adjust the spacing between subplots
        plt.subplots_adjust(wspace=0.4)

        # Display the figure with the title
        plt.show()
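The per-model plotting code further below builds a similar two-panel figure by hand; the helper above can also be called directly. A minimal usage sketch, assuming y_test, y_pred, mse, mae, and r2 have already been computed for some fitted model (the name string is just a label for the figure title):

# Sketch: reuse the helper instead of repeating the plotting code
plot = ModelComparisonPlot('SomeRegressor')  # any label; it appears in the figure title
plot.plot_comparison(y_test, y_pred, mse, mae, r2)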
One-hot encoding
# Assuming 'Entity' is a categorical variable
X_encoded = pd.get_dummies(df, columns=['Entity'], drop_first=True)
Define the target variable
# Assuming 'GDP per capita' is the target variable
X_encoded = X_encoded.fillna(0)  # Replace with your preferred imputation method
X = X_encoded.drop('GDP per capita', axis=1)
y = X_encoded['GDP per capita']
Split into training and test sets
# Assuming X and y are features and labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Machine learning code
ExtraTreesRegressor
# Create an Extra Trees Regressor model
model_ETR = ExtraTreesRegressor(max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300)

# Fit the model
model_ETR.fit(X_train, y_train)

# Make predictions
y_pred = model_ETR.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f"Model: {type(model_ETR).__name__}, mse: {mse}")
print(f"Model: {type(model_ETR).__name__}, mae: {mae}")
print(f"Model: {type(model_ETR).__name__}, r2: {r2}")
model_ETR_plot = ModelComparisonPlot('ExtraTreesRegressor')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")

# Plot the predicted vs true values
sns.regplot(x=y_test, y=y_pred, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals = y_test - y_pred
sns.residplot(x=y_pred, y=residuals, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(model_ETR_plot.model_name), fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()
DecisionTreeRegressor
# Create a decision tree regression model
dt_model = DecisionTreeRegressor()

# Fit the model
dt_model.fit(X_train, y_train)

# Make predictions
predictions = dt_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# Print evaluation metrics
print(f"Model: {type(dt_model).__name__}, mse: {mse}")
print(f"Model: {type(dt_model).__name__}, mae: {mae}")
print(f"Model: {type(dt_model).__name__}, r2: {r2}")
dt_model_plot = ModelComparisonPlot('DecisionTreeRegressor')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")

# Plot the predicted vs true values
sns.regplot(x=y_test, y=predictions, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals = y_test - predictions
sns.residplot(x=predictions, y=residuals, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(dt_model_plot.model_name), fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()
LinearRegression
# Create a linear regression model
linear_model = LinearRegression()

# Fit the model
linear_model.fit(X_train, y_train)

# Make predictions
predictions_linear = linear_model.predict(X_test)

# Evaluate the model
mse_linear = mean_squared_error(y_test, predictions_linear)
mae_linear = mean_absolute_error(y_test, predictions_linear)
r2_linear = r2_score(y_test, predictions_linear)

# Print evaluation metrics
print(f"Model: {type(linear_model).__name__}, mse: {mse_linear}")
print(f"Model: {type(linear_model).__name__}, mae: {mae_linear}")
print(f"Model: {type(linear_model).__name__}, r2: {r2_linear}")
linear_model_plot = ModelComparisonPlot('LinearRegression')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")

# Plot the predicted vs true values
sns.regplot(x=y_test, y=predictions_linear, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals_linear = y_test - predictions_linear
sns.residplot(x=predictions_linear, y=residuals_linear, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(linear_model_plot.model_name), fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()
KNeighborsRegressor
# Create a KNN regression model
knn_model = KNeighborsRegressor()

# Fit the model
knn_model.fit(X_train, y_train)

# Make predictions
predictions_knn = knn_model.predict(X_test)

# Evaluate the model
mse_knn = mean_squared_error(y_test, predictions_knn)
mae_knn = mean_absolute_error(y_test, predictions_knn)
r2_knn = r2_score(y_test, predictions_knn)

# Print evaluation metrics
print(f"Model: {type(knn_model).__name__}, mse: {mse_knn}")
print(f"Model: {type(knn_model).__name__}, mae: {mae_knn}")
print(f"Model: {type(knn_model).__name__}, r2: {r2_knn}")
knn_model_plot = ModelComparisonPlot('KNeighborsRegressor')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")

# Plot the predicted vs true values
sns.regplot(x=y_test, y=predictions_knn, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals_knn = y_test - predictions_knn
sns.residplot(x=predictions_knn, y=residuals_knn, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(knn_model_plot.model_name), fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()
XGBoost
# Convert 'Density' column to numeric
X_train['Density'] = pd.to_numeric(X_train['Density'], errors='coerce')
X_test['Density'] = pd.to_numeric(X_test['Density'], errors='coerce')

# Drop rows with missing values after conversion, and keep the labels aligned with the remaining rows
X_train = X_train.dropna()
X_test = X_test.dropna()
y_train = y_train.loc[X_train.index]
y_test = y_test.loc[X_test.index]

# Create an XGBoost regression model
xgb_model = XGBRegressor()

# Fit the model
xgb_model.fit(X_train, y_train)

# Make predictions
predictions_xgb = xgb_model.predict(X_test)

# Evaluate the model
mse_xgb = mean_squared_error(y_test, predictions_xgb)
mae_xgb = mean_absolute_error(y_test, predictions_xgb)
r2_xgb = r2_score(y_test, predictions_xgb)

# Print evaluation metrics
print(f"Model: {type(xgb_model).__name__}, mse: {mse_xgb}")
print(f"Model: {type(xgb_model).__name__}, mae: {mae_xgb}")
print(f"Model: {type(xgb_model).__name__}, r2: {r2_xgb}")
xgb_model_plot = ModelComparisonPlot('XGBRegressor')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")

# Plot the predicted vs true values
sns.regplot(x=y_test, y=predictions_xgb, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals_xgb = y_test - predictions_xgb
sns.residplot(x=predictions_xgb, y=residuals_xgb, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(xgb_model_plot.model_name), fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()
Gaussian Naive Bayes
# Convert continuous labels to binary categories
y_train_binary = (y_train > y_train.mean()).astype(int)
y_test_binary = (y_test > y_train.mean()).astype(int)

# Create a Naive Bayes model
nb_model = GaussianNB()

# Fit the model
nb_model.fit(X_train, y_train_binary)

# Make predictions
predictions = nb_model.predict(X_test)
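GaussianNB is a classifier, so the regression metrics used for the models above do not apply directly. A minimal evaluation sketch for the binarized labels, using accuracy_score from sklearn.metrics:

# Evaluate the classifier on the binarized test labels
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test_binary, predictions)
print(f"Model: {type(nb_model).__name__}, accuracy: {accuracy}")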
X_encoded['Density'] = pd.to_numeric(X_encoded['Density'], errors='coerce')
X_encoded = X_encoded.fillna(0)

# Drop 'gdp_growth' as before
X = X_encoded.drop('gdp_growth', axis=1)
y = X_encoded['gdp_growth']

# Assuming X and y are your features and labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of algorithms to check
algorithms = [LinearRegression(), DecisionTreeRegressor(), KNeighborsRegressor(), XGBRegressor()]

best_mse = float('inf')
best_model = None

# Loop through each algorithm
for model in algorithms:
    # Fit the model (cross_val_score below refits on each fold; this keeps best_model fitted for later use)
    model.fit(X_train, y_train)

    # Evaluate the model using cross-validation with mean squared error
    mse_scores = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    mean_mse = np.mean(mse_scores)

    # Print the cross-validation mean squared error
    print(f"{model.__class__.__name__} - Cross-Validation MSE: {mean_mse}")

    # Update the best model if the current model has lower mean squared error
    if mean_mse < best_mse:
        best_mse = mean_mse
        best_model = model

# Print the best model and its mean squared error
print("\nBest Model:")
print(best_model)
print("Best Cross-Validation MSE:", best_mse)

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")

# Plot the predicted vs true values
sns.regplot(x=y_test_binary, y=predictions, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test_binary.min(), y_test_binary.max()], [y_test_binary.min(), y_test_binary.max()], 'k--', lw=2)
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals = y_test_binary - predictions
sns.residplot(x=predictions, y=residuals, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test_binary.min(), y_test_binary.max()], [0, 0], 'k--', lw=2)
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\nGaussianNB', fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()