Author: hhh5460
Original post: https://www.cnblogs.com/hhh5460/p/10159331.html
Special thanks: the three figures in this post come from Morvan's tutorial at https://morvanzhou.github.io/
pandas is built on top of numpy, but the two are operated on differently, so the details of implementing these algorithms differ between them; this post records both versions. The short sketch after the notes below illustrates the main difference.
A few notes:
1) To keep the exposition simple, the simplest example (Example 1) is used.
2) The environment and the agent are separated, each implemented as a class.
3) Variables and functions are assigned to the environment or the agent in such a way that the Agent class needs no modification at all!
4) The interaction logic between the agent and the environment is closer to how it works in practice.
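Why the details differ: with pandas the Q table is a DataFrame indexed by state rows and action labels, while with numpy it is a plain array indexed by integer positions, so action names must be mapped to indices. A minimal sketch (not from the original post; the variable names are illustrative):

import numpy as np
import pandas as pd

states = range(6)            # positions 0..5
actions = ['left', 'right']

# pandas: label-based Q table -> look up by action name
Q_df = pd.DataFrame(0.0, index=states, columns=actions)
Q_df.loc[2, 'right'] += 0.1  # update state 2, action 'right'

# numpy: position-based Q table -> look up by action index
Q_np = np.zeros((len(states), len(actions)), dtype=np.float32)
Q_np[2, actions.index('right')] += 0.1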
〇、Demo
(Animated demo from the original post: the explorer 'o' steps along the '-----T' board until it reaches the treasure T and 'WIN!' is printed.)
一、pandas implementation
1. q-learning
class RLQLearning(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)

    def learn(self, alpha=0.01, gamma=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('q-learning algorithm')
        for _ in range(episode):
            s = self.env.reset()
            is_win = False
            while not is_win:
                a = self.observe(s, epsilon)
                r, s1, is_win = self.env.step(a)
                self.Q.loc[s, a] += alpha * (r + gamma * self.Q.loc[s1, self.env.get_valid_actions(s1)].max() - self.Q.loc[s, a])
                s = s1
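For reference, the one-line update inside the while loop is the standard q-learning rule, with the maximum restricted to the actions A(s') that are valid in the next state s':

$$ Q(s,a) \leftarrow Q(s,a) + \alpha \Big( r + \gamma \max_{a' \in A(s')} Q(s',a') - Q(s,a) \Big) $$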
2. SARSA
class RLSaras(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)

    def learn(self, alpha=0.01, gamma=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('SARSA algorithm')
        for _ in range(episode):
            s = self.env.reset()
            a = self.observe(s, epsilon)
            is_win = False
            while not is_win:
                r, s1, is_win = self.env.step(a)
                a1 = self.observe(s1, epsilon)
                self.Q.loc[s, a] += alpha * (r + gamma * self.Q.loc[s1, a1] - self.Q.loc[s, a])
                s, a = s1, a1
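The only change from q-learning is the target: SARSA is on-policy, so it bootstraps from the action a' that observe() actually picked for the next state, rather than from the best valid action:

$$ Q(s,a) \leftarrow Q(s,a) + \alpha \big( r + \gamma\, Q(s',a') - Q(s,a) \big) $$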
3. SARSA(lambda)
class RLSarasLambda(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)
        self.E = self.Q.copy()  # eligibility traces, same shape as the Q table

    def learn(self, alpha=0.01, gamma=0.9, lambda_=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('SARSA(lambda) algorithm; lambda_ is the trace-decay rate')
        for _ in range(episode):
            self.E *= 0
            s = self.env.reset()
            a = self.observe(s, epsilon)
            is_win = False
            while not is_win:
                r, s1, is_win = self.env.step(a)
                a1 = self.observe(s1, epsilon)
                delta = r + gamma * self.Q.loc[s1, a1] - self.Q.loc[s, a]
                # self.E.loc[s, a] += 1  # accumulating trace: works less well than the two lines below
                self.E.loc[s] *= 0       # replacing trace: clear the row of s, ...
                self.E.loc[s, a] = 1     # ...then mark the (s, a) just visited
                for s_ in self.env.states:
                    for a_ in self.env.actions:
                        self.Q.loc[s_, a_] += alpha * delta * self.E.loc[s_, a_]
                        self.E.loc[s_, a_] *= gamma * lambda_
                s, a = s1, a1
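Restated compactly: each step computes the TD error delta and then sweeps the whole table, so credit propagates to recently visited state-action pairs through the eligibility trace E. The code uses a replacing trace (the row of s is zeroed and E(s, a) is set to 1) rather than the commented-out accumulating trace:

$$ \delta = r + \gamma\, Q(s',a') - Q(s,a), \qquad Q(\tilde{s},\tilde{a}) \leftarrow Q(\tilde{s},\tilde{a}) + \alpha\,\delta\,E(\tilde{s},\tilde{a}), \qquad E(\tilde{s},\tilde{a}) \leftarrow \gamma\,\lambda\,E(\tilde{s},\tilde{a}) \quad \text{for all } (\tilde{s},\tilde{a}) $$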
4. Complete code
import pandas as pd
import random
import time


'''
-o---T
# T is the treasure's position, o is the explorer's position
'''

# Author: hhh5460
# Date: 20181221
# Place: Tai Zi Miao

class Env(object):
    '''Environment'''
    def __init__(self):
        '''Initialize'''
        self.board = list('-----T')
        self.states = range(6)
        self.actions = ['left', 'right']
        self.rewards = [0, 0, 0, 0, 0, 1]

    def get_valid_actions(self, state):
        '''Get all legal actions in the current state'''
        valid_actions = []
        if state != 5:  # every position except the last can move right
            valid_actions.append('right')
        if state != 0:  # every position except the first can move left
            valid_actions.append('left')
        return valid_actions

    def _step(self, action):
        '''Execute the action and move to the new state'''
        if action == 'right' and self.state != self.states[-1]:  # move right (+1) unless at the last position
            self.state += 1
        elif action == 'left' and self.state != self.states[0]:  # move left (-1) unless at the first position
            self.state -= 1

    def reset(self):
        '''Reset the environment and return state 0'''
        self.board = list('-----T')
        self.state = 0
        self.board[self.state] = 'o'
        print('\r ', end='')
        print('\r{}'.format(''.join(self.board)), end='')
        return self.state

    def step(self, action, step_time=0.1):
        '''Execute the action; return reward, new state, win flag'''
        self.board[self.state] = '-'  # erase the old position 'o'
        self._step(action)            # move to the new position
        self.board[self.state] = 'o'  # mark the new position

        reward = self.rewards[self.state]                      # reward
        is_win = [False, True][self.state == self.states[-1]]  # win flag
        if is_win:
            print('\r{} WIN!'.format(''.join(self.board)), end='')  # on a win, show the final frame
        else:
            print('\r{}'.format(''.join(self.board)), end='')
        time.sleep(step_time)

        return reward, self.state, is_win


class Agent(object):
    '''Agent'''
    def __init__(self, env):
        '''Initialize'''
        # the environment
        self.env = env
        # the brain: a Q table indexed by state (rows) and action (columns), stored as floats
        self.Q = pd.DataFrame(data=[[0. for _ in self.env.actions] for _ in self.env.states],
                              index=self.env.states,
                              columns=self.env.actions)

    def observe(self, state, epsilon=0.4):
        '''Observe'''
        # given the current state, choose an action according to some policy
        if random.uniform(0, 1) < epsilon:  # greedy (exploit)
            s = self.Q.loc[state].filter(items=self.env.get_valid_actions(state))
            action = random.choice(s[s == s.max()].index)  # there may be more than one maximum!
        else:                               # explore
            action = random.choice(self.env.get_valid_actions(state))
        return action

    def learn(self, *args, **kw):
        '''Learn'''
        pass

    def play(self, step_time=0.5):
        '''Play'''
        # training is done
        s = self.env.reset()
        is_win = False
        while not is_win:
            a = self.observe(s, epsilon=1.)  # epsilon=1. means 100% greedy, i.e. pure exploitation
            _, s1, is_win = self.env.step(a, step_time)
            s = s1
        print()

class RLQLearning(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)

    def learn(self, alpha=0.01, gamma=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('q-learning algorithm')
        for _ in range(episode):
            s = self.env.reset()
            is_win = False
            while not is_win:
                a = self.observe(s, epsilon)
                r, s1, is_win = self.env.step(a)
                self.Q.loc[s, a] += alpha * (r + gamma * self.Q.loc[s1, self.env.get_valid_actions(s1)].max() - self.Q.loc[s, a])
                s = s1

class RLSaras(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)

    def learn(self, alpha=0.01, gamma=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('SARSA algorithm')
        for _ in range(episode):
            s = self.env.reset()
            a = self.observe(s, epsilon)
            is_win = False
            while not is_win:
                r, s1, is_win = self.env.step(a)
                a1 = self.observe(s1, epsilon)
                self.Q.loc[s, a] += alpha * (r + gamma * self.Q.loc[s1, a1] - self.Q.loc[s, a])
                s, a = s1, a1

class RLSarasLambda(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)
        self.E = self.Q.copy()  # eligibility traces, same shape as the Q table

    def learn(self, alpha=0.01, gamma=0.9, lambda_=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('SARSA(lambda) algorithm; lambda_ is the trace-decay rate')
        for _ in range(episode):
            self.E *= 0
            s = self.env.reset()
            a = self.observe(s, epsilon)
            is_win = False
            while not is_win:
                r, s1, is_win = self.env.step(a)
                a1 = self.observe(s1, epsilon)
                delta = r + gamma * self.Q.loc[s1, a1] - self.Q.loc[s, a]
                # self.E.loc[s, a] += 1  # accumulating trace: works less well than the two lines below
                self.E.loc[s] *= 0       # replacing trace: clear the row of s, ...
                self.E.loc[s, a] = 1     # ...then mark the (s, a) just visited
                for s_ in self.env.states:
                    for a_ in self.env.actions:
                        self.Q.loc[s_, a_] += alpha * delta * self.E.loc[s_, a_]
                        self.E.loc[s_, a_] *= gamma * lambda_
                s, a = s1, a1


if __name__ == '__main__':
    env = Env()                  # the environment

    agent = RLQLearning(env)     # agent 1
    agent.learn(episode=13)      # learn first
    agent.play()                 # then play

    agent2 = RLSaras(env)        # agent 2
    agent2.learn(episode=13)     # learn first
    agent2.play()                # then play

    agent3 = RLSarasLambda(env)  # agent 3
    agent3.learn(episode=13)     # learn first
    agent3.play()                # then play
二、numpy implementation
1. q-learning
2. SARSA
3. SARSA(lambda)
The numpy versions of these three classes mirror the pandas implementations above; the main difference is that actions are handled as integer indices into the Q array rather than as column labels. All three appear in the complete code below.
4. Complete code
import numpy as np
import time


'''
-o---T
# T is the treasure's position, o is the explorer's position
'''

# Author: hhh5460
# Date: 20181221
# Place: Tai Zi Miao

class Env(object):
    '''Environment'''
    def __init__(self):
        '''Initialize'''
        self.board = list('-----T')
        self.states = range(6)
        self.actions = ['left', 'right']  # indices [0, 1]
        self.rewards = [0, 0, 0, 0, 0, 1]

    def get_valid_actions(self, state):
        '''Get all legal actions (as indices) in the current state'''
        valid_actions = []
        if state != self.states[0]:   # every position except the first can move left
            valid_actions.append(self.actions.index('left'))
        if state != self.states[-1]:  # every position except the last can move right
            valid_actions.append(self.actions.index('right'))
        return valid_actions

    def _step(self, action):
        '''Execute the action (index) and move to the new state'''
        if self.actions[action] == 'left' and self.state > self.states[0]:     # move left (-1) unless at the first position
            self.state = self.state - 1
        elif self.actions[action] == 'right' and self.state < self.states[-1]: # move right (+1) unless at the last position
            self.state = self.state + 1

    def reset(self):
        '''Reset the environment and return state 0'''
        self.board = list('-----T')
        self.state = 0
        self.board[self.state] = 'o'
        print('\r ', end='')
        print('\r{}'.format(''.join(self.board)), end='')
        return self.state

    def step(self, action, step_time=0.1):
        '''Execute the action; return reward, new state, win flag'''
        self.board[self.state] = '-'  # erase the old position 'o'
        self._step(action)            # move to the new position
        self.board[self.state] = 'o'  # mark the new position

        reward = self.rewards[self.state]                      # reward
        is_win = [False, True][self.state == self.states[-1]]  # win flag
        if is_win:
            print('\r{} WIN!'.format(''.join(self.board)), end='')  # on a win, show the final frame
        else:
            print('\r{}'.format(''.join(self.board)), end='')
        time.sleep(step_time)

        return reward, self.state, is_win


class Agent(object):
    '''Agent'''
    def __init__(self, env):
        '''Initialize'''
        # the environment
        self.env = env
        # the brain: a Q array indexed by [state, action index]
        self.Q = np.zeros((len(self.env.states), len(self.env.actions)), dtype=np.float32)

    def observe(self, state, epsilon=0.8):
        '''Observe'''
        # given the current state, choose an action (index) according to some policy
        valid_actions = self.env.get_valid_actions(state)
        arr = self.Q[state, valid_actions]
        if (np.random.uniform() > epsilon            # explore with probability 1 - epsilon, ...
                or arr.max() == 0                    # ...or when nothing has been learned yet, ...
                or len(arr[arr == arr.max()]) > 1):  # ...or when the valid actions are tied
            action = np.random.choice(valid_actions)  # explore
        else:
            action = self.Q[state].argmax()           # exploit
        return action

    def learn(self, alpha=0.01, gamma=0.9, episode=100, epsilon=0.8):
        '''Learn'''
        pass

    def play(self, step_time=0.5):
        '''Play'''
        # training is done
        s = self.env.reset()
        is_win = False
        while not is_win:
            a = self.observe(s, epsilon=1.)  # epsilon=1. means 100% greedy, i.e. pure exploitation
            _, s1, is_win = self.env.step(a, step_time)
            s = s1
        print()

class RLQLearning(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        '''Initialize'''
        super().__init__(env)

    def learn(self, alpha=0.01, gamma=0.9, episode=100, epsilon=0.8):
        '''Learn'''
        print('q-learning algorithm')
        for _ in range(episode):
            s = self.env.reset()
            is_win = False
            while not is_win:
                a = self.observe(s, epsilon)
                r, s1, is_win = self.env.step(a)
                self.Q[s, a] += alpha * (r + gamma * self.Q[s1, self.env.get_valid_actions(s1)].max() - self.Q[s, a])
                s = s1

class RLSaras(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)

    def learn(self, alpha=0.01, gamma=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('SARSA algorithm')
        for _ in range(episode):
            s = self.env.reset()
            a = self.observe(s, epsilon)
            is_win = False
            while not is_win:
                r, s1, is_win = self.env.step(a)
                a1 = self.observe(s1, epsilon)
                self.Q[s, a] += alpha * (r + gamma * self.Q[s1, a1] - self.Q[s, a])
                s, a = s1, a1

class RLSarasLambda(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)
        self.E = self.Q.copy()  # eligibility traces, same shape as the Q array

    def learn(self, alpha=0.01, gamma=0.9, lambda_=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('SARSA(lambda) algorithm; lambda_ is the trace-decay rate')
        for _ in range(episode):
            self.E *= 0
            s = self.env.reset()
            a = self.observe(s, epsilon)
            is_win = False
            while not is_win:
                r, s1, is_win = self.env.step(a)
                a1 = self.observe(s1, epsilon)
                delta = r + gamma * self.Q[s1, a1] - self.Q[s, a]
                # self.E[s, a] += 1  # accumulating trace: works less well than the two lines below
                self.E[s] *= 0       # replacing trace: clear the row of s, ...
                self.E[s, a] = 1     # ...then mark the (s, a) just visited
                for s_ in self.env.states:
                    for a_ in range(len(self.env.actions)):  # iterate over action indices!
                        self.Q[s_, a_] += alpha * delta * self.E[s_, a_]
                        self.E[s_, a_] *= gamma * lambda_
                s, a = s1, a1

if __name__ == '__main__':
    env = Env()                  # the environment
    agent = RLQLearning(env)     # agent 1
    agent.learn(episode=13)      # learn first
    agent.play()                 # then play

    agent2 = RLSaras(env)        # agent 2
    agent2.learn(episode=13)     # learn first
    agent2.play()                # then play

    agent3 = RLSarasLambda(env)  # agent 3
    agent3.learn(episode=13)     # learn first
    agent3.play()                # then play
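One detail to keep in mind when reusing these classes: in both implementations epsilon is the probability of acting greedily, not of exploring, which is the opposite of the usual epsilon-greedy convention. A minimal, self-contained illustration of the selection rule used by observe() (the helper below is hypothetical, not part of the original code):

import random

def epsilon_greedy_as_in_observe(epsilon, greedy_action, random_action):
    # NOTE: epsilon is the probability of the GREEDY choice, matching observe() above
    return greedy_action if random.uniform(0, 1) < epsilon else random_action

# With epsilon=0.4 (the pandas default), the agent exploits only about 40% of the time.
print(sum(epsilon_greedy_as_in_observe(0.4, 1, 0) for _ in range(10000)) / 10000)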