Fetch the Boston housing dataset
import numpy as np
from numpy import *
import random
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score

boston = load_boston()
boston.data.shape
(506, 13)
boston.target.shape
(506,)
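A caveat on the import above: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so the cell above only runs on older versions. On newer versions a rough substitute is the OpenML mirror of the same data; a minimal sketch (assuming the OpenML copy under data_id=531 is acceptable):

# Sketch for scikit-learn >= 1.2, where load_boston no longer exists.
from sklearn.datasets import fetch_openml
boston = fetch_openml(data_id=531, as_frame=False)    # OpenML mirror of the Boston data
boston.target = boston.target.astype(float)           # make sure the target is numeric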
Build the random forest
Define the random forest class
import warnings
warnings.filterwarnings('ignore')
from joblib import Parallel, delayed

class myrf:
    # class-level defaults; __init__ overrides them per instance
    trees = []
    random_state = 0
    n_estimators = 10
    max_features = 10
    max_depth = 10
    min_change = 0.001
    cur_tree = 0
    min_samples_split = 0
    min_samples_leaf = 0
    sample_radio = 0.9
    n_jobs = 10
    # total within-node squared error: np.var(y) * n == sum of squared deviations
    def get_varience(self, dataSet):
        return np.var(dataSet[:, -1]) * shape(dataSet)[0]

    # leaf prediction: mean target of the samples in this node
    def get_mean(self, dataSet):
        return np.mean(dataSet[:, -1])

    # split on `feature` at `value`: the first block returned holds the rows with
    # feature > value, the second block the rows with feature <= value
    def SplitDataSet(self, dataSet, feature, value):
        dataSet = dataSet[dataSet[:, feature].argsort()]
        for i in range(shape(dataSet)[0] - 1):
            if dataSet[i][feature] == value and dataSet[i + 1][feature] != value:
                return dataSet[i + 1:, :], dataSet[0:i + 1, :]
    # choose the best (feature, value) split among max_features randomly drawn features,
    # minimizing the summed within-child squared error; return (None, leaf value) when
    # the node is too small or the best split is not worth it
    def select_best_feature(self, dataSet):
        feature_num = dataSet.shape[1] - 1
        features = np.random.choice(feature_num, self.max_features, replace=False)
        bestS = inf
        bestfeature = 0
        bestValue = 0
        S = self.get_varience(dataSet)
        if shape(dataSet)[0] < self.min_samples_split or shape(dataSet)[0] < self.min_samples_leaf:
            return None, self.get_mean(dataSet)
        for feature in features:
            dataSet = dataSet[dataSet[:, feature].argsort()]
            for index in range(shape(dataSet)[0] - 1):
                # only consider split points between two different feature values
                if dataSet[index][feature] == dataSet[index + 1][feature]:
                    continue
                data0 = dataSet[0:index + 1, :]
                data1 = dataSet[index + 1:, :]
                if shape(data0)[0] < self.min_samples_leaf or shape(data1)[0] < self.min_samples_leaf:
                    continue
                newS = self.get_varience(data0) + self.get_varience(data1)
                if bestS > newS:
                    bestfeature = feature
                    bestValue = dataSet[index][feature]
                    bestS = newS
        # pre-pruning: if the best split barely reduces the error, return a leaf instead
        if (S - bestS) < self.min_change:
            return None, self.get_mean(dataSet)
        return bestfeature, bestValue
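    # Note on the split score: get_varience returns np.var(y) * n, i.e. the sum of squared
    # deviations from the node mean, so the chosen split is the one with the purest
    # children, and (S - bestS) < min_change is the pre-pruning rule above. A toy
    # illustration (made-up numbers, not from the dataset):
    #   y_left, y_right = np.array([1.0, 1.2, 0.9]), np.array([3.0, 3.3])
    #   score = np.var(y_left) * len(y_left) + np.var(y_right) * len(y_right)
    #   # np.var(y) * len(y) == ((y - y.mean()) ** 2).sum(), so lower means purer children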
    # recursively grow one regression tree as a nested dict; flag == 0 marks the root call
    def createTree(self, dataSet, max_level, flag=0):
        if flag == 0:
            seqtree = self.cur_tree + 1
            self.cur_tree = seqtree
            print('Building tree ' + str(seqtree) + '...\n')
        bestfeature, bestValue = self.select_best_feature(dataSet)
        if bestfeature is None:
            if flag == 0:
                print('Tree ' + str(seqtree) + ' finished!')
            return bestValue
        retTree = {}
        max_level -= 1
        if max_level < 0:
            return self.get_mean(dataSet)
        retTree['bestFeature'] = bestfeature
        retTree['bestVal'] = bestValue
        lSet, rSet = self.SplitDataSet(dataSet, bestfeature, bestValue)
        retTree['right'] = self.createTree(rSet, max_level, 1)
        retTree['left'] = self.createTree(lSet, max_level, 1)
        if flag == 0:
            print('Tree ' + str(seqtree) + ' finished!')
        return retTree
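    # Each fitted tree is a nested dict: interior nodes hold 'bestFeature'/'bestVal' and
    # 'left'/'right' subtrees, leaves are plain floats (node means). Since SplitDataSet
    # returns the rows with feature > value first, 'left' covers samples above the
    # threshold. Illustrative shape only (numbers are invented):
    #   {'bestFeature': 5, 'bestVal': 6.94,
    #    'left':  {'bestFeature': 12, 'bestVal': 14.4, 'left': 37.2, 'right': 22.1},
    #    'right': 17.8}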
    def __init__(self, random_state, n_estimators, max_features, max_depth, min_change=0.001,
                 min_samples_split=0, min_samples_leaf=0, sample_radio=0.9, n_jobs=10):
        self.trees = []
        self.random_state = random_state
        np.random.seed(self.random_state)
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_change = min_change
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.sample_radio = sample_radio
        self.n_jobs = n_jobs
    # draw a random subsample (sample_radio of the rows) and grow one tree on it
    def get_one_tree(self, dataSet):
        X_train, X_test, y_train, y_test = train_test_split(dataSet[:, :-1], dataSet[:, -1],
                                                            train_size=self.sample_radio,
                                                            random_state=self.random_state)
        X_train = np.concatenate((X_train, y_train.reshape((-1, 1))), axis=1)
        self.trees.append(self.createTree(X_train, self.max_depth))
    # grow n_estimators trees in parallel threads that all append to self.trees
    def fit(self, X, Y):
        dataSet = np.concatenate((X, Y.reshape(-1, 1)), axis=-1)
        Parallel(n_jobs=self.n_jobs, backend="threading")(
            delayed(self.get_one_tree)(dataSet) for _ in range(self.n_estimators))
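    # The backend="threading" choice matters: get_one_tree mutates self.trees in place, so
    # every worker must share this instance. A process-based backend (joblib's default,
    # loky) would build the trees in child processes and the appends would never reach the
    # parent. A process-safe variant (a sketch, not the code used here) would instead have
    # get_one_tree return its tree and collect the results:
    #   self.trees = list(Parallel(n_jobs=self.n_jobs)(
    #       delayed(self.get_one_tree)(dataSet) for _ in range(self.n_estimators)))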
    # walk one sample down a single tree and return the leaf value
    def treeForecast(self, tree, data):
        if not isinstance(tree, dict):
            return float(tree)
        if data[tree['bestFeature']] > tree['bestVal']:
            if not isinstance(tree['left'], dict):
                return float(tree['left'])
            else:
                return self.treeForecast(tree['left'], data)
        else:
            if not isinstance(tree['right'], dict):
                return float(tree['right'])
            else:
                return self.treeForecast(tree['right'], data)
    # predictions of a single tree for every row of dataSet
    def createForeCast(self, tree, dataSet):
        seqtree = self.cur_tree + 1
        self.cur_tree = seqtree
        print('Tree ' + str(seqtree) + ' predicting...\n')
        l = len(dataSet)
        predict = np.mat(zeros((l, 1)))
        for i in range(l):
            predict[i, 0] = self.treeForecast(tree, dataSet[i, :])
        print('Tree ' + str(seqtree) + ' prediction finished!')
        return predict
    # accumulate one tree's predictions into the shared prediction matrix
    def update_predict(self, predict, tree, X):
        predict += self.createForeCast(tree, X)
    # average the predictions of all trees, accumulated in parallel threads
    def predict(self, X):
        self.cur_tree = 0
        l = len(X)
        predict = np.mat(zeros((l, 1)))
        Parallel(n_jobs=self.n_jobs, backend="threading")(
            delayed(self.update_predict)(predict, tree, X) for tree in self.trees)
        predict /= self.n_estimators
        return predict
    # R^2 of the forest's predictions against the given targets
    def get_score(self, target, X):
        return r2_score(target, self.predict(X))
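As a sanity check it can help to compare against scikit-learn's reference implementation. A minimal sketch (the hyper-parameters only roughly mirror the ones used below, and the two models subsample differently, so the scores will not match exactly):

from sklearn.ensemble import RandomForestRegressor
sk_rf = RandomForestRegressor(n_estimators=10, max_features=3, max_depth=10, random_state=2)
sk_rf.fit(boston.data, boston.target)
sk_rf.score(boston.data, boston.target)   # R² on the training data, comparable to get_score below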
Model prediction and evaluation
Prediction model
rf1 = myrf(random_state=2, n_estimators=10, max_features=3, max_depth=10,
           min_change=0.001, min_samples_split=20, n_jobs=-1)
rf1.fit(boston.data, boston.target)
Building tree 1...
Building tree 2...
Building tree 3...
Building tree 4...
Tree 4 finished!
Building tree 5...
Tree 2 finished!Tree 3 finished!
Building tree 6...
Building tree 7...
Tree 1 finished!
Building tree 8...
Tree 6 finished!
Building tree 9...
Tree 5 finished!Tree 7 finished!
Tree 8 finished!
Building tree 10...
Tree 9 finished!
Tree 10 finished!
rf1.get_score(boston.target, boston.data)
Tree 1 predicting...
Tree 2 predicting...
Tree 3 predicting...
Tree 1 prediction finished!
Tree 4 predicting...
Tree 3 prediction finished!Tree 5 predicting...
Tree 6 predicting...
Tree 6 prediction finished!Tree 2 prediction finished!
Tree 7 predicting...
Tree 7 prediction finished!
Tree 8 predicting...
Tree 8 prediction finished!
Tree 5 prediction finished!
Tree 9 predicting...
Tree 4 prediction finished!Tree 9 prediction finished!
Tree 10 predicting...
Tree 10 prediction finished!
0.9302502640348399
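The 0.93 above is an R² computed on the same rows the forest was trained on, so it is an optimistic estimate. A held-out split gives a fairer check; a minimal sketch (rf2, X_te, y_te are illustrative names, and the exact score will vary):

X_tr, X_te, y_tr, y_te = train_test_split(boston.data, boston.target, test_size=0.2, random_state=2)
rf2 = myrf(random_state=2, n_estimators=10, max_features=3, max_depth=10,
           min_change=0.001, min_samples_split=20, n_jobs=-1)
rf2.fit(X_tr, y_tr)
rf2.get_score(y_te, X_te)   # R² on unseen rows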