随机森林实现以及网格搜索


1、网格搜索
在构建随机森林之前,首先要设置决策树的个数,树的深度等参数,因此可以利用网格搜索来确定。
导入需要的包:
import pandas as pd
from sklearn.metrics import mean_squared_error,r2_score
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.model_selection import train_test_split
import pickle
import seaborn as sns
from sklearn.tree import export_graphviz
#最优参数确定
def parameter_sure(file_path):
    """Grid-search two random-forest parameters by hand and plot the scores.

    Reads the Excel sheet at ``file_path`` (first row as header) and drops
    every row that contains a missing value. For each combination of
    ``min_samples_split`` and ``n_estimators`` in ``range(3, 11)``, a
    ``RandomForestRegressor`` with ``max_depth=5`` is fitted on each of 10
    shuffled folds. The mean R^2 over the folds is stored per combination
    and the resulting matrix is handed to ``R2_view``.

    Columns ``1:-1`` are used as features and the last column as the
    target; the first column is not used.

    Args:
        file_path: Path of the Excel file to load.
    """
    df = pd.read_excel(file_path, header=0)
    df.dropna(how='any', inplace=True)

    # Search ranges (end values are exclusive); edit these to change the grid.
    start1, end1 = 3, 11  # min_samples_split
    start2, end2 = 3, 11  # n_estimators
    min_samples_split_list = list(range(start1, end1))
    n_estimators_list = list(range(start2, end2))

    # Loop-invariant work is done once instead of inside the nested loops:
    # the array is extracted a single time, and the folds are materialised
    # once because the splitter has a fixed random_state and therefore
    # yields the same folds on every call.
    data = df.values
    features = data[:, 1:-1]
    target = data[:, -1]
    kf = KFold(n_splits=10, shuffle=True, random_state=12345)
    folds = list(kf.split(data))

    R2 = []
    for min_samples_split in min_samples_split_list:
        r2_list = []
        for n_estimators in n_estimators_list:
            fold_scores = []
            for train_index, test_index in folds:
                model = RandomForestRegressor(
                    min_samples_split=min_samples_split,
                    n_estimators=n_estimators,
                    max_depth=5,
                )
                model.fit(features[train_index], target[train_index])
                y_pre = model.predict(features[test_index])
                # R^2 is the evaluation metric.
                fold_scores.append(r2_score(target[test_index], y_pre))
            r2_list.append(np.mean(fold_scores))
        R2.append(r2_list)

    # Rows follow min_samples_split, columns follow n_estimators.
    R2_view(np.array(R2), min_samples_split_list, n_estimators_list)
#参数调优可视化
def R2_view(correction, labels1, labels2):
    """Display a score matrix as an annotated heat map.

    Args:
        correction: 2-D array of scores to draw.
        labels1: Tick labels placed along the y axis (axis title 'S').
        labels2: Tick labels placed along the x axis (axis title 'N').
    """
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })
    plt.figure(figsize=(10, 8))

    # Heat-map options: cell spacing, colour range and per-cell annotations.
    heatmap_options = dict(
        linewidths=0.05,
        vmax=0.93,
        vmin=0.85,
        annot=True,
        cbar=False,
        annot_kws={'size': 18, 'weight': 'bold'},
    )
    ax = sns.heatmap(correction, **heatmap_options)

    # The built-in colour bar is disabled above; one is attached by hand so
    # that the size of its tick labels can be set.
    colorbar = ax.figure.colorbar(ax.collections[0])
    colorbar.ax.tick_params(labelsize=18)

    plt.xlabel('N', fontsize=25)
    plt.ylabel('S', fontsize=25)

    # Ticks are shifted by 0.5 from the integer positions.
    x_positions = np.arange(len(labels2)) + 0.5
    y_positions = np.arange(len(labels1)) + 0.5
    plt.xticks(x_positions, labels2, rotation=0, fontsize=20)
    plt.yticks(y_positions, labels1, rotation=0, fontsize=20)
    plt.show()
示例效果如下:

当然也可以用已有的API来实现:
def grind_search(file_path):
    """Search the parameter grid with ``GridSearchCV`` and print the winner.

    Loads the Excel sheet at ``file_path``, drops rows with missing values,
    and evaluates ``min_samples_split`` and ``n_estimators`` over
    ``range(3, 21)`` each with 5 shuffled folds. The best parameter
    combination and its score are printed; nothing is returned.

    Args:
        file_path: Path of the Excel file to load.
    """
    frame = pd.read_excel(file_path, header=0)
    frame.dropna(how='any', inplace=True)

    # Columns 1:-1 are the features, the last column is the target.
    values = frame.values
    features, target = values[:, 1:-1], values[:, -1]

    folds = KFold(n_splits=5, shuffle=True, random_state=12345)
    candidates = {
        'min_samples_split': range(3, 21),
        'n_estimators': range(3, 21),
    }
    search = GridSearchCV(
        RandomForestRegressor(),
        param_grid=candidates,
        cv=folds,
    )
    search.fit(features, target)
    print(search.best_params_, search.best_score_)
2、随机森林回归
使用交叉验证,基于随机森林进行回归,示例代码如下:
#获取训练集和测试集
def get_train_test(file_path, use_folder=True):
    """Load the data set and produce train/test splits.

    The Excel sheet at ``file_path`` is read with its first row as header
    and rows containing missing values are dropped. Columns ``1:-1`` are
    the features and the last column is the target.

    Args:
        file_path: Path of the Excel file to load.
        use_folder: When true, iterate over 5 shuffled folds and call
            ``train`` on each one, carrying the best R^2 forward (starting
            from 0.5). When false, make a single 90/10 split with
            ``train_test_split`` and do not train.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` - the single split when
        ``use_folder`` is false, otherwise the split of the last fold.
    """
    frame = pd.read_excel(file_path, header=0)
    frame.dropna(how='any', inplace=True)
    data = frame.values

    if not use_folder:
        train_data = data[:, 1:-1]
        train_target = data[:, -1]
        x_train, x_test, y_train, y_test = train_test_split(
            train_data, train_target, test_size=0.1, random_state=12345
        )
        return x_train, x_test, y_train, y_test

    splitter = KFold(n_splits=5, shuffle=True, random_state=12345)
    best_r2 = 0.5
    for train_index, test_index in splitter.split(data):
        x_train = data[train_index, 1:-1]
        x_test = data[test_index, 1:-1]
        y_train = data[train_index, -1]
        y_test = data[test_index, -1]
        # train() returns the updated best score.
        best_r2 = train(x_train, x_test, y_train, y_test, best_r2)
    return x_train, x_test, y_train, y_test
#模型训练与保存
def train(x_train, x_test, y_train, y_test, best_r=0.5):
    """Fit a random forest and save it if it beats the best score so far.

    A ``RandomForestRegressor`` (``min_samples_split=10``,
    ``n_estimators=5``) is fitted on the training data and scored with R^2
    on the test data. When the score is strictly greater than ``best_r``
    the model is pickled to ``random_forest/random_forest.pickle``,
    overwriting any earlier file.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training targets.
        y_test: Test targets.
        best_r: Best R^2 seen so far; the model is saved only above it.

    Returns:
        The new R^2 if the model was saved, otherwise ``best_r`` unchanged.
    """
    model = RandomForestRegressor(min_samples_split=10, n_estimators=5)
    model.fit(x_train, y_train)
    r2_pre = r2_score(y_test, model.predict(x_test))

    # Written as "not greater" rather than "<=" so that a NaN score is
    # treated as no improvement and is never saved.
    if not r2_pre > best_r:
        return best_r

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('random_forest', exist_ok=True)
    with open('random_forest/random_forest.pickle', 'wb') as f:
        pickle.dump(model, f)
    return r2_pre
3、计算预测结果
#预测结果保存
def pre(file_path):
    """Predict with the saved model and write the results to a CSV file.

    The model is unpickled from ``random_forest/random_forest.pickle``. The
    Excel sheet at ``file_path`` is read with its first row as header, rows
    with missing values are dropped, and columns ``1:-1`` are fed to the
    model. The predictions are added as a new column and the frame is
    written to a GBK-encoded CSV file in the working directory.

    Args:
        file_path: Path of the Excel file to predict on.
    """
    # Load the previously saved model.
    with open('random_forest/random_forest.pickle', 'rb') as handle:
        model = pickle.load(handle)

    frame = pd.read_excel(file_path, header=0)
    frame.dropna(how='any', inplace=True)

    features = frame.values[:, 1:-1]
    frame['预测结果'] = model.predict(features)
    frame.to_csv('随机森林预测结果.csv', encoding='GBK')
其它例如GBM,xgboost等算法的实现和这个类似,只需要将模型修改即可。
好了,本篇文章就到这里了,希望对大家有所帮助,也感谢大家的支持。