Q-learning
REF:
1.https://blog.csdn.net/itplus/article/details/9361915
一、Q-learning算法
1.

二、A Painless Q-learning Tutorial (一个 Q-learning 算法的简明教程)


- 其实,这个公式相当于上面提到的公式中alpha = 1的一种情况(若alpha = 0,Q表永远不会更新;取alpha = 1时,(1-alpha)*Q(s,a)项消失,恰好退化为 Q = R + gamma*maxQ)。
- 并且从该算法也可以看出,其实它是一直在探索的。毕竟该问题比较特殊,我的最终目标是走出去,所以没有利用这一回事。
程序如下:
import numpy as np
import random

# Environment model.  R[s, a] is the immediate reward for moving from room s
# to room a: -1 means "no door", 0 means a door, 100 means a door that leads
# to the exit (room 5).  Although phrased as state-action rewards, this is
# really a description of the building's layout.
R = np.matrix([[-1, -1, -1, -1,  0,  -1],
               [-1, -1, -1,  0, -1, 100],
               [-1, -1, -1,  0, -1,  -1],
               [-1,  0,  0, -1,  0,  -1],
               [ 0, -1, -1,  0, -1, 100],
               [-1,  0, -1, -1,  0, 100]])

GOAL_STATE = 5  # an episode ends when the agent reaches room 5


def train_q(rep=100, gamma=0.8):
    """Train a Q table with the simplified update Q(s,a) = r + gamma*max Q(s').

    This is the alpha = 1 special case of the general Q-learning update.
    `rep` is a fixed episode budget -- no convergence test is performed, the
    count is simply chosen by hand.  Actions are always picked uniformly at
    random among the available doors (pure exploration: since the only goal
    is to reach the exit, no exploitation is used during training).

    Returns the learned 6x6 Q matrix.
    """
    q = np.matrix(np.zeros([6, 6]))
    for _ in range(rep):
        # One episode: start in a random room, walk until the exit.
        state = random.randint(0, 5)
        while state != GOAL_STATE:
            # r >= 0 marks the rooms reachable through a door from `state`.
            doors = [a for a in range(6) if R[state, a] >= 0]
            nxt = doors[random.randint(0, len(doors) - 1)]
            q[state, nxt] = R[state, nxt] + gamma * q[nxt].max()
            state = nxt
    return q


def run_greedy_tests(q, episodes=10, max_steps=20):
    """Run greedy rollouts from random start rooms and print each path.

    Ties between equally-valued actions are broken uniformly at random;
    a rollout is abandoned (prints 'fails') after `max_steps` moves.
    """
    for i in range(episodes):
        print("episode: " + str(i + 1))
        state = random.randint(0, 5)
        print("the robot borns in " + str(state) + ".")
        count = 0
        while state != GOAL_STATE:
            # Guard against a badly trained table producing an endless loop.
            if count > max_steps:
                print('fails')
                break
            best_q = q[state].max()
            candidates = [a for a in range(6) if q[state, a] == best_q]
            state = candidates[random.randint(0, len(candidates) - 1)]
            print("the robot goes to " + str(state) + '.')
            count = count + 1


if __name__ == "__main__":
    run_greedy_tests(train_q())
执行结果如下:

现在严格按照最上面给出的q-learning公式来:
import numpy as np
import random


def _reward_matrix():
    """Environment model: r[s, a] = immediate reward for moving s -> a.

    -1 means "no door", 0 a door, 100 a door into the exit room (5).
    Although phrased as state-action rewards, this really encodes the
    building layout, which the agent is assumed to observe directly.
    """
    return np.matrix([[-1, -1, -1, -1,  0,  -1],
                      [-1, -1, -1,  0, -1, 100],
                      [-1, -1, -1,  0, -1,  -1],
                      [-1,  0,  0, -1,  0,  -1],
                      [ 0, -1, -1,  0, -1, 100],
                      [-1,  0, -1, -1,  0, 100]])


def train_q_alpha(rep=100, gamma=0.8, alpha=0.1):
    """Train a Q table with the full Q-learning update:

        Q(s,a) <- (1-alpha)*Q(s,a) + alpha*(r + gamma * max_a' Q(s',a'))

    Fixes two bugs in the original listing: the discount factor was
    misspelled (`gammma`) while the update used `gamma` (NameError), and
    the (1-alpha) term blended in Q(last_state, state) instead of the
    entry actually being updated, Q(state, action).

    `rep` is a fixed episode budget -- no convergence check is done, the
    count is simply chosen by hand.  Returns the 6x6 Q matrix.
    """
    r = _reward_matrix()
    q = np.matrix(np.zeros([6, 6]))
    for _ in range(rep):
        # One episode: random start room, walk until the exit (room 5).
        state = random.randint(0, 5)
        while state != 5:
            # r >= 0 marks the rooms reachable through a door from `state`.
            doors = [a for a in range(6) if r[state, a] >= 0]
            action = doors[random.randint(0, len(doors) - 1)]
            q[state, action] = ((1 - alpha) * q[state, action]
                                + alpha * (r[state, action]
                                           + gamma * q[action].max()))
            state = action
    return q


def demo(q, episodes=10, max_steps=20):
    """Greedy rollouts from random start rooms; prints the visited rooms.

    Ties between equally-valued actions are broken uniformly at random;
    a rollout is abandoned (prints 'fails') after `max_steps` moves.
    """
    for i in range(episodes):
        print("episode: " + str(i + 1))
        state = random.randint(0, 5)
        print("the robot borns in " + str(state) + ".")
        count = 0
        while state != 5:
            # Guard against a badly trained table producing an endless loop.
            if count > max_steps:
                print('fails')
                break
            best_q = q[state].max()
            candidates = [a for a in range(6) if q[state, a] == best_q]
            state = candidates[random.randint(0, len(candidates) - 1)]
            print("the robot goes to " + str(state) + '.')
            count = count + 1


if __name__ == "__main__":
    demo(train_q_alpha())
执行结果:

三:
浙公网安备 33010602011771号