Background
I recently took on a project that tunes a decision tree's hyperparameters with a genetic algorithm (GA). I had always used grid search for tuning and hadn't realized a GA could do the job too; since I had written quite a few GAs before, I accepted. I expected it to take some time, but if you already know the moving parts it's about two to three hours of work.
Algorithm
For a project you obviously reach for a library (no writing the tree from scratch), so the decision tree is sklearn's DecisionTreeClassifier. The GA pipeline itself is straightforward, so I wrote it by hand. Below I walk through how each step of the GA is implemented.
Initialization
I picked three parameters that matter most for a decision tree: "max_depth", "min_samples_split", and "max_leaf_nodes". The population is initialized by enumerating candidate values for each of the three:
def init():
    forest = []
    for max_depth in range(5, 31, 3):
        for min_samples_split in range(5, 25, 5):
            for max_leaf_nodes in range(5, 25, 5):
                forest.append(make_tree([max_depth, min_samples_split, max_leaf_nodes]))
    return forest
Selection
Cross-validated accuracy serves as the fitness score, from which cumulative selection probabilities are derived:
def tree_score(X, Y, clf):
    # fitness: mean accuracy over 5-fold cross-validation
    kf = KFold(n_splits=5)
    score = []
    for train_index, valid_index in kf.split(X):
        clf.fit(X[train_index], Y[train_index])
        pred = clf.predict(X[valid_index])
        score.append(accuracy_score(y_true=Y[valid_index], y_pred=pred))
    return np.mean(score)
def adaption(X, Y, forest):
    score = []
    for t in forest:
        score.append(tree_score(X, Y, t))
    # remember this generation's elite for later re-insertion
    best_pos = np.argmax(score)
    global BEST_TREE
    BEST_TREE = copy.deepcopy(forest[best_pos])
    # normalize the scores, then accumulate them into cumulative probabilities
    sm = np.sum(score)
    ada = score / sm
    for i in range(1, len(ada)):
        ada[i] = ada[i] + ada[i - 1]
    return ada
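The cumulative array that adaption returns is consumed by roulette-wheel selection: draw a uniform random number and keep the first individual whose cumulative probability reaches it. This is exactly what choose_trees does in the full listing below:

def choose_trees(forest, ada):
    sz = len(forest)
    result = []
    for i in range(sz):
        # spin the wheel once per slot in the next generation
        r = random.random()
        for j in range(len(ada)):
            if r <= ada[j]:
                result.append(copy.deepcopy(forest[j]))
                break
    return result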
One detail worth noting in the selection step is the elitist strategy: send the best individual of the current generation straight into the next generation. This helps a lot with the stability of the algorithm.
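In the full listing this lives in evulate_forest, which overwrites the worst individual of the new generation with the saved BEST_TREE. The core of the idea, as a minimal sketch:

# sketch: re-insert the elite saved by adaption() in place of the worst individual
worst_pos = np.argmin(score)
forest[worst_pos] = BEST_TREE
score[worst_pos] = tree_score(X, Y, BEST_TREE)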
Crossover
Crossover operates on the parameter vectors: given two trees clf1 and clf2, pick a random position p and swap the parameter segments after it:
def _cross_2_tree(t1, t2):
    sz = len(param)

    # read the current parameter values of both trees
    t1_param_value = _dict_get_value_list(t1.__dict__, param)
    t2_param_value = _dict_get_value_list(t2.__dict__, param)
    # pick a random crossover point and swap the tails
    pos = random.randint(0, sz - 1)
    t1_left = t1_param_value[0:pos + 1]
    t1_right = t1_param_value[pos + 1:]

    t2_left = t2_param_value[0:pos + 1]
    t2_right = t2_param_value[pos + 1:]

    t1_left.extend(t2_right)
    t2_left.extend(t1_right)
    return [make_tree(t1_left), make_tree(t2_left)]


def cross(forest):
    result = []
    sz = len(forest)
    for i in range(1, sz, 2):
        result.extend(_cross_2_tree(forest[i - 1], forest[i]))
    return result
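A quick check with hypothetical parameter vectors: crossing [8, 10, 5] and [14, 20, 10] at pos == 0 swaps everything after the first parameter.

# hypothetical check: cross two trees and inspect the offspring's parameters
t1 = make_tree([8, 10, 5])
t2 = make_tree([14, 20, 10])
for child in _cross_2_tree(t1, t2):
    print([child.__dict__[k] for k in param])
# with pos == 0 this prints [8, 20, 10] and [14, 10, 5]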
Mutation
This step uses a simple strategy: pick one parameter at random and add or subtract 1:
def variation(forest):
    result = []
    for t in forest:
        r = random.random()
        # note: an individual is kept unchanged with probability VAR_P,
        # i.e. it mutates with probability 1 - VAR_P
        if r < VAR_P:
            result.append(t)
            continue

        # mutate: nudge one randomly chosen parameter by +1 or -1
        sz = len(param)
        pos = random.randint(0, sz - 1)
        val = t.__dict__[param[pos]]
        up = random.random()

        if up > 0.5:
            val = val + 1
        else:
            val = val - 1

        # clamp to 2, a safe lower bound for all three parameters
        # (min_samples_split and max_leaf_nodes must be >= 2 in sklearn)
        if val < 2:
            val = 2
        t.__dict__[param[pos]] = val
        result.append(t)
    return result
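A quick sanity check with a hypothetical starting point:

# hypothetical check: one mutation pass over a single tree
t = make_tree([8, 10, 5])
t, = variation([t])
print([t.__dict__[k] for k in param])  # one parameter nudged by +/-1, or unchanged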
Full code
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import random
import copy
import matplotlib.pyplot as plt

param = ["max_depth", "min_samples_split", "max_leaf_nodes"]
epochs = 300
VAR_P = 0.4
BEST_TREE = None


def make_tree(param_value):
    p = dict(zip(param, param_value))
    return DecisionTreeClassifier(**p)


def init():
    forest = []
    for max_depth in range(5, 31, 3):
        for min_samples_split in range(5, 25, 5):
            for max_leaf_nodes in range(5, 25, 5):
                forest.append(make_tree([max_depth, min_samples_split, max_leaf_nodes]))
    return forest


def tree_score(X, Y, clf):
    kf = KFold(n_splits=5)
    score = []
    for train_index, valid_index in kf.split(X):
        clf.fit(X[train_index], Y[train_index])
        pred = clf.predict(X[valid_index])
        score.append(accuracy_score(y_true=Y[valid_index], y_pred=pred))
    return np.mean(score)


def evulate_forest(X, Y, forest):
    # elitism: replace the worst individual of the new generation with BEST_TREE
    score = []
    for t in forest:
        score.append(tree_score(X, Y, t))
    worse_pos = np.argmin(score)
    global BEST_TREE
    forest[worse_pos] = BEST_TREE
    score[worse_pos] = tree_score(X, Y, BEST_TREE)

    score.sort(reverse=True)
    return score, np.mean(score)


def adaption(X, Y, forest):
    score = []
    for t in forest:
        score.append(tree_score(X, Y, t))
    best_pos = np.argmax(score)
    global BEST_TREE
    BEST_TREE = copy.deepcopy(forest[best_pos])
    sm = np.sum(score)
    ada = score / sm
    for i in range(1, len(ada)):
        ada[i] = ada[i] + ada[i - 1]
    return ada


def choose_trees(forest, ada):
    sz = len(forest)
    result = []
    for i in range(sz):
        r = random.random()
        for j in range(len(ada)):
            if r <= ada[j]:
                result.append(copy.deepcopy(forest[j]))
                break
    return result


def _dict_get_value_list(mp, key_list):
    value_list = []
    for key in key_list:
        value_list.append(mp.get(key))
    return value_list


def _cross_2_tree(t1, t2):
    sz = len(param)

    t1_param_value = _dict_get_value_list(t1.__dict__, param)
    t2_param_value = _dict_get_value_list(t2.__dict__, param)
    pos = random.randint(0, sz - 1)
    t1_left = t1_param_value[0:pos + 1]
    t1_right = t1_param_value[pos + 1:]

    t2_left = t2_param_value[0:pos + 1]
    t2_right = t2_param_value[pos + 1:]

    t1_left.extend(t2_right)
    t2_left.extend(t1_right)
    return [make_tree(t1_left), make_tree(t2_left)]


def cross(forest):
    result = []
    sz = len(forest)
    for i in range(1, sz, 2):
        result.extend(_cross_2_tree(forest[i - 1], forest[i]))
    return result


def variation(forest):
    result = []
    for t in forest:
        r = random.random()
        if r < VAR_P:
            result.append(t)
            continue

        # mutate one parameter by +1 or -1
        sz = len(param)
        pos = random.randint(0, sz - 1)
        val = t.__dict__[param[pos]]
        up = random.random()

        if up > 0.5:
            val = val + 1
        else:
            val = val - 1

        if val < 2:
            val = 2
        t.__dict__[param[pos]] = val
        result.append(t)
    return result


df = pd.read_csv("../dataset/data.csv", index_col=0)
X = df.iloc[:, 1:].values
Y = df.iloc[:, 0].values
forest = init()

mean_score_arr = []

for i in range(epochs):
    ada = adaption(X, Y, forest)
    forest = choose_trees(forest, ada)
    forest = cross(forest)
    forest = variation(forest)
    score, mean = evulate_forest(X, Y, forest)
    mean_score_arr.append(mean)

    print(i, "/", epochs, ":")
    print("mean:", mean)

plt.plot(np.arange(len(mean_score_arr)), mean_score_arr)
plt.show()
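After the loop finishes, BEST_TREE holds the elite saved by the last adaption call. An optional inspection step (not part of the original script) to read off the hyperparameters it found:

# assumption: run after the training loop above
print({k: BEST_TREE.__dict__[k] for k in param})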
Summary
My takeaway: GA-based tuning feels underwhelming here; plain grid search would have been faster for a search space this small. Still, the idea itself is worth knowing.
Reposted from: https://www.cnblogs.com/oldBook/p/10656241.html