手动实现随机森林并做数据实验

mac2024-04-18  29

获取波士顿房价数据集

import random

import numpy as np
from numpy import *
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Load the Boston housing dataset (506 samples, 13 features).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and REMOVED in
# 1.2; this script requires scikit-learn < 1.2, or switch to
# sklearn.datasets.fetch_california_housing and re-run the experiment.
boston = load_boston()

# Shapes observed in the original notebook run:
#   boston.data.shape   -> (506, 13)
#   boston.target.shape -> (506,)

搭建随机森林

建立随机森林类

import warnings

warnings.filterwarnings('ignore')

# joblib is only needed for parallel fitting/prediction; fall back to a
# sequential loop when it is not installed so the model still works.
try:
    from joblib import Parallel, delayed
except ImportError:
    Parallel = None
    delayed = None


class myrf:
    """Hand-rolled random-forest regressor: bagged CART regression trees.

    Trees are stored as nested dicts {'bestFeature', 'bestVal', 'left',
    'right'} whose leaves are float means of the target values.  Splitting
    minimizes the summed target variance of the two children.

    NOTE(review): this class relies on module-level imports from the
    surrounding script (numpy as np, train_test_split, r2_score).
    """

    # Class-level defaults; __init__ overwrites all of them per instance.
    trees = []              # fitted trees (nested dicts / float leaves)
    random_state = 0        # seed fed to numpy's global RNG
    n_estimators = 10       # number of trees in the forest
    max_features = 10       # features sampled (without replacement) per split
    max_depth = 10          # maximum tree depth
    min_change = 0.001      # minimum variance reduction required to split
    cur_tree = 0            # progress counter (shared across worker threads,
                            # so printed ordering may interleave)
    min_samples_split = 0   # minimum samples required to attempt a split
    min_samples_leaf = 0    # minimum samples required in each child
    sample_radio = 0.9      # fraction of the data used to grow each tree
    n_jobs = 10             # joblib worker count

    def __init__(self, random_state, n_estimators, max_features, max_depth,
                 min_change=0.001, min_samples_split=0, min_samples_leaf=0,
                 sample_radio=0.9, n_jobs=10):
        """Configure the forest and seed numpy's global RNG."""
        self.trees = []  # fresh list per instance (class attr would be shared)
        self.random_state = random_state
        np.random.seed(self.random_state)
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_change = min_change
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.sample_radio = sample_radio
        self.n_jobs = n_jobs

    def get_varience(self, dataSet):
        """Return var(y) * n for the last column of dataSet.

        The total-variance form (not divided by n) lets child scores be
        summed and compared directly against the parent's score.
        """
        return np.var(dataSet[:, -1]) * np.shape(dataSet)[0]

    def get_mean(self, dataSet):
        """Return the mean of the target column (used as a leaf value)."""
        return np.mean(dataSet[:, -1])

    def SplitDataSet(self, dataSet, feature, value):
        """Split rows on `feature` at `value`; return (greater, lower-or-equal).

        The data is sorted on the feature so the cut is a single index.
        `value` always comes from select_best_feature, which guarantees a
        row with a strictly larger feature value exists, so i+1 is in range.
        """
        dataSet = dataSet[dataSet[:, feature].argsort()]
        for i in range(np.shape(dataSet)[0]):
            if dataSet[i][feature] == value and dataSet[i + 1][feature] != value:
                return dataSet[i + 1:, :], dataSet[0:i + 1, :]

    def select_best_feature(self, dataSet):
        """Pick the (feature, threshold) minimizing children's total variance.

        Returns (None, leaf_mean) when the node is too small to split or the
        best split improves total variance by less than min_change.
        """
        feature_num = dataSet.shape[1] - 1
        # Random feature subset — the "random" in random forest.
        features = np.random.choice(feature_num, self.max_features,
                                    replace=False)
        bestS = np.inf      # best (lowest) summed child variance so far
        bestfeature = 0
        bestValue = 0
        S = self.get_varience(dataSet)
        # Too few samples to split: emit a leaf.
        if (np.shape(dataSet)[0] < self.min_samples_split
                or np.shape(dataSet)[0] < self.min_samples_leaf):
            return None, self.get_mean(dataSet)
        for feature in features:
            dataSet = dataSet[dataSet[:, feature].argsort()]
            for index in range(np.shape(dataSet)[0] - 1):
                # Skip duplicate feature values: only cut where the value changes.
                if dataSet[index][feature] == dataSet[index + 1][feature]:
                    continue
                data0 = dataSet[0:index + 1, :]
                data1 = dataSet[index + 1:, :]
                # Enforce the minimum leaf size on both children.
                if (np.shape(data0)[0] < self.min_samples_leaf
                        or np.shape(data1)[0] < self.min_samples_leaf):
                    continue
                newS = self.get_varience(data0) + self.get_varience(data1)
                if bestS > newS:
                    bestfeature = feature
                    bestValue = dataSet[index][feature]
                    bestS = newS
        # Improvement below threshold: stop splitting, emit a leaf.
        if (S - bestS) < self.min_change:
            return None, self.get_mean(dataSet)
        return bestfeature, bestValue

    def createTree(self, dataSet, max_level, flag=0):
        """Recursively grow one regression tree; flag == 0 marks the root call.

        Returns a nested dict, or a float leaf (the node's target mean).
        """
        if flag == 0:
            seqtree = self.cur_tree + 1
            self.cur_tree = seqtree
            print('正在搭建第' + str(seqtree) + '棵树...\n')
        bestfeature, bestValue = self.select_best_feature(dataSet)
        if bestfeature is None:
            if flag == 0:
                print('第' + str(seqtree) + '棵树搭建完成!')
            return bestValue
        retTree = {}
        max_level -= 1
        if max_level < 0:  # depth budget exhausted
            return self.get_mean(dataSet)
        retTree['bestFeature'] = bestfeature
        retTree['bestVal'] = bestValue
        lSet, rSet = self.SplitDataSet(dataSet, bestfeature, bestValue)
        # BUGFIX: the original passed self.max_depth here, resetting the depth
        # budget at every level so max_depth never actually bounded the tree.
        retTree['right'] = self.createTree(rSet, max_level, 1)
        retTree['left'] = self.createTree(lSet, max_level, 1)
        if flag == 0:
            print('第' + str(seqtree) + '棵树搭建完成!')
        return retTree

    def get_one_tree(self, dataSet):
        """Grow one tree on a sample_radio subsample and append it to the forest.

        NOTE(review): the same random_state is passed to train_test_split on
        every call, so every tree trains on the identical subsample — only the
        per-split feature subsets differ.  True bagging would vary the seed
        per tree; left unchanged to preserve the published results.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            dataSet[:, :-1], dataSet[:, -1],
            train_size=self.sample_radio, random_state=self.random_state)
        X_train = np.concatenate((X_train, y_train.reshape((-1, 1))), axis=1)
        self.trees.append(self.createTree(X_train, self.max_depth))

    def fit(self, X, Y):
        """Fit n_estimators trees (threaded via joblib when available)."""
        dataSet = np.concatenate((X, Y.reshape(-1, 1)), axis=-1)
        if Parallel is not None:
            Parallel(n_jobs=self.n_jobs, backend="threading")(
                delayed(self.get_one_tree)(dataSet)
                for _ in range(self.n_estimators))
        else:
            for _ in range(self.n_estimators):
                self.get_one_tree(dataSet)

    def treeForecast(self, tree, data):
        """Predict one sample by walking the tree to a float leaf.

        (The original also compared type(...) == 'float' — a type object
        against a string, always False — so those branches were dead; the
        isinstance check below is the single live base case.)
        """
        if not isinstance(tree, dict):
            return float(tree)
        if data[tree['bestFeature']] > tree['bestVal']:
            return self.treeForecast(tree['left'], data)
        return self.treeForecast(tree['right'], data)

    def createForeCast(self, tree, dataSet):
        """Return an (n, 1) matrix of one tree's predictions for dataSet."""
        seqtree = self.cur_tree + 1
        self.cur_tree = seqtree
        print('第' + str(seqtree) + '棵树正在预测...\n')
        l = len(dataSet)
        predict = np.mat(np.zeros((l, 1)))
        for i in range(l):
            predict[i, 0] = self.treeForecast(tree, dataSet[i, :])
        print('第' + str(seqtree) + '棵树预测完成!')
        return predict

    def unpdate_predict(self, predict, tree, X):
        """Add one tree's forecast into `predict` in place (kept for compat)."""
        predict += self.createForeCast(tree, X)

    def predict(self, X):
        """Average the per-tree forecasts; returns an (n, 1) matrix.

        BUGFIX: the original had every worker thread do `predict += ...` on
        one shared matrix — a data race.  Forecasts are now collected from
        the parallel section and summed afterwards.
        """
        self.cur_tree = 0
        l = len(X)
        predict = np.mat(np.zeros((l, 1)))
        if Parallel is not None:
            forecasts = Parallel(n_jobs=self.n_jobs, backend="threading")(
                delayed(self.createForeCast)(tree, X) for tree in self.trees)
        else:
            forecasts = [self.createForeCast(tree, X) for tree in self.trees]
        for f in forecasts:
            predict += f
        predict /= self.n_estimators
        return predict

    def get_score(self, target, X):
        """Return the R^2 score of the forest's predictions against target."""
        return r2_score(target, self.predict(X))

模型预测与评估

预测模型

# NOTE(review): `mycache` is not defined anywhere in this file — dead reference.
# rf2 = mycache(random_state=2, n_estimators=10, max_features=3, max_depth=10,
#               min_change=0.001, min_samples_split=20, n_jobs=10)

# Fit the forest on the full dataset and score it on the SAME data, so the
# R^2 below is a training score, not a generalization estimate.
rf1 = myrf(random_state=2, n_estimators=10, max_features=3, max_depth=10,
           min_change=0.001, min_samples_split=20, n_jobs=-1)
rf1.fit(boston.data, boston.target)
# Console output (thread-interleaved):
#   正在搭建第1棵树... ... 第10棵树搭建完成!

rf1.get_score(boston.target, boston.data)
# Console output (thread-interleaved):
#   第1棵树正在预测... ... 第10棵树预测完成!
# Result reported in the original run: 0.9302502640348399

0.9302502640348399
最新回复(0)