Q-learning
REF:
1.https://blog.csdn.net/itplus/article/details/9361915
一、Q-learning算法
1.

二、A Painless Q-learning Tutorial (一个 Q-learning 算法的简明教程)


- 其实,这个公式相当于上面提到的公式中alpha = 1的一种情况(若alpha = 0,Q表永远不会更新;取alpha = 1时,(1-alpha)*Q(s,a)项消失,恰好退化为 Q = R + gamma*maxQ)。
- 并且从该算法也可以看出,其实它是一直在探索的。毕竟该问题比较特殊,我的最终目标是走出去,所以没有利用这一回事。
程序如下:
import numpy as np
import random

# Environment model.  R[s, a] is the immediate reward for moving from room s
# to room a: -1 means "no door", 0 means a door, 100 means a door that leads
# to the exit (room 5).  Although phrased as state-action rewards, this is
# really a description of the building's layout.
R = np.matrix([[-1, -1, -1, -1,  0,  -1],
               [-1, -1, -1,  0, -1, 100],
               [-1, -1, -1,  0, -1,  -1],
               [-1,  0,  0, -1,  0,  -1],
               [ 0, -1, -1,  0, -1, 100],
               [-1,  0, -1, -1,  0, 100]])

GOAL_STATE = 5  # an episode ends when the agent reaches room 5


def train_q(rep=100, gamma=0.8):
    """Train a Q table with the simplified update Q(s,a) = r + gamma*max Q(s').

    This is the alpha = 1 special case of the general Q-learning update.
    `rep` is a fixed episode budget -- no convergence test is performed, the
    count is simply chosen by hand.  Actions are always picked uniformly at
    random among the available doors (pure exploration: since the only goal
    is to reach the exit, no exploitation is used during training).

    Returns the learned 6x6 Q matrix.
    """
    q = np.matrix(np.zeros([6, 6]))
    for _ in range(rep):
        # One episode: start in a random room, walk until the exit.
        state = random.randint(0, 5)
        while state != GOAL_STATE:
            # r >= 0 marks the rooms reachable through a door from `state`.
            doors = [a for a in range(6) if R[state, a] >= 0]
            nxt = doors[random.randint(0, len(doors) - 1)]
            q[state, nxt] = R[state, nxt] + gamma * q[nxt].max()
            state = nxt
    return q


def run_greedy_tests(q, episodes=10, max_steps=20):
    """Run greedy rollouts from random start rooms and print each path.

    Ties between equally-valued actions are broken uniformly at random;
    a rollout is abandoned (prints 'fails') after `max_steps` moves.
    """
    for i in range(episodes):
        print("episode: " + str(i + 1))
        state = random.randint(0, 5)
        print("the robot borns in " + str(state) + ".")
        count = 0
        while state != GOAL_STATE:
            # Guard against a badly trained table producing an endless loop.
            if count > max_steps:
                print('fails')
                break
            best_q = q[state].max()
            candidates = [a for a in range(6) if q[state, a] == best_q]
            state = candidates[random.randint(0, len(candidates) - 1)]
            print("the robot goes to " + str(state) + '.')
            count = count + 1


if __name__ == "__main__":
    run_greedy_tests(train_q())
执行结果如下:

现在严格按照最上面给出的q-learning公式来:
import numpy as np
import random


def _reward_matrix():
    """Environment model: r[s, a] = immediate reward for moving s -> a.

    -1 means "no door", 0 a door, 100 a door into the exit room (5).
    Although phrased as state-action rewards, this really encodes the
    building layout, which the agent is assumed to observe directly.
    """
    return np.matrix([[-1, -1, -1, -1,  0,  -1],
                      [-1, -1, -1,  0, -1, 100],
                      [-1, -1, -1,  0, -1,  -1],
                      [-1,  0,  0, -1,  0,  -1],
                      [ 0, -1, -1,  0, -1, 100],
                      [-1,  0, -1, -1,  0, 100]])


def train_q_alpha(rep=100, gamma=0.8, alpha=0.1):
    """Train a Q table with the full Q-learning update:

        Q(s,a) <- (1-alpha)*Q(s,a) + alpha*(r + gamma * max_a' Q(s',a'))

    Fixes two bugs in the original listing: the discount factor was
    misspelled (`gammma`) while the update used `gamma` (NameError), and
    the (1-alpha) term blended in Q(last_state, state) instead of the
    entry actually being updated, Q(state, action).

    `rep` is a fixed episode budget -- no convergence check is done, the
    count is simply chosen by hand.  Returns the 6x6 Q matrix.
    """
    r = _reward_matrix()
    q = np.matrix(np.zeros([6, 6]))
    for _ in range(rep):
        # One episode: random start room, walk until the exit (room 5).
        state = random.randint(0, 5)
        while state != 5:
            # r >= 0 marks the rooms reachable through a door from `state`.
            doors = [a for a in range(6) if r[state, a] >= 0]
            action = doors[random.randint(0, len(doors) - 1)]
            q[state, action] = ((1 - alpha) * q[state, action]
                                + alpha * (r[state, action]
                                           + gamma * q[action].max()))
            state = action
    return q


def demo(q, episodes=10, max_steps=20):
    """Greedy rollouts from random start rooms; prints the visited rooms.

    Ties between equally-valued actions are broken uniformly at random;
    a rollout is abandoned (prints 'fails') after `max_steps` moves.
    """
    for i in range(episodes):
        print("episode: " + str(i + 1))
        state = random.randint(0, 5)
        print("the robot borns in " + str(state) + ".")
        count = 0
        while state != 5:
            # Guard against a badly trained table producing an endless loop.
            if count > max_steps:
                print('fails')
                break
            best_q = q[state].max()
            candidates = [a for a in range(6) if q[state, a] == best_q]
            state = candidates[random.randint(0, len(candidates) - 1)]
            print("the robot goes to " + str(state) + '.')
            count = count + 1


if __name__ == "__main__":
    demo(train_q_alpha())
执行结果:

三:
浙公网安备 33010602011771号