单曲循环903

导航

Q-learning

REF:

1.https://blog.csdn.net/itplus/article/details/9361915

一、Q-learning算法

1.

二、A Painless Q-learning Tutorial (一个 Q-learning 算法的简明教程)

- 其实,这个公式相当于上面提到的公式中 alpha = 1 的一种情况(当 alpha = 1 时,更新式 Q(s,a) ← (1-alpha)Q(s,a) + alpha(r + gamma·max Q) 就退化为 Q(s,a) = r + gamma·max Q)。

- 并且从该算法也可以看出,其实它是一直在探索的。毕竟该问题比较特殊,我的最终目标是走出去,所以没有利用这一回事。

程序如下:

import numpy as np
import random

# --- Environment / agent initialisation ---

# Q-table: q[s, a] = learned value of taking action a (column) in state s
# (row).  Starts at zero and is filled in by the training loop below.
# NOTE: np.matrix is deprecated in NumPy; a plain 2-D ndarray supports the
# same q[s, a] indexing and q[s].max() that the rest of the script uses.
q = np.zeros([6, 6])

# Immediate-reward table R[s, a]:
#   -1  -> no door between room s and room a (illegal move)
#    0  -> a door exists but entering room a gives no reward
#  100  -> the move reaches the goal (room 5, "outside")
# Although phrased as state-action rewards, this is really a model of the
# environment: the agent in a given room always knows the immediate reward
# of each action it can take from there.
r = np.array([[-1, -1, -1, -1,  0, -1 ],
              [-1, -1, -1,  0, -1, 100],
              [-1, -1, -1,  0, -1, -1 ],
              [-1,  0,  0, -1,  0, -1 ],
              [ 0, -1, -1,  0, -1, 100],
              [-1,  0, -1, -1,  0, 100]])

gamma = 0.8  # discount factor

# Number of training episodes.  No convergence test is performed; this
# count is simply chosen by hand (large enough for this tiny problem).
rep = 100


# Train the agent: run `rep` random-walk episodes, each starting in a random
# room and ending when room 5 (the exit) is reached, filling in the Q-table
# with the simplified update Q(s, a) = R(s, a) + gamma * max_a' Q(s', a').
for episode in range(rep):
    # one episode
    state = random.randint(0, 5)
    while state != 5:
        # Actions with R >= 0 are the rooms reachable through a door.
        doors = [a for a in range(6) if r[state, a] >= 0]
        # Pure exploration: pick one of the legal moves uniformly at random.
        chosen = random.choice(doors)
        # Simplified (alpha-free) Q-learning update.
        q[state, chosen] = r[state, chosen] + gamma * q[chosen].max()
        state = chosen
		

# Test the trained agent: 10 episodes, each from a random start room,
# always moving greedily to a maximum-Q action until the exit (room 5).
for trial in range(10):
    print("episode: " + str(trial + 1))

    # random initial state
    state = random.randint(0, 5)
    print("the robot borns in " + str(state) + ".")

    steps = 0
    while state != 5:
        # Safety valve: greedy play should reach the exit in a few moves,
        # so more than 20 steps means the Q-table was not learned properly.
        if steps > 20:
            print('fails')
            break

        # Collect every action tying for the maximum Q-value in this
        # state and break ties uniformly at random.
        best = max(q[state, a] for a in range(6))
        candidates = [a for a in range(6) if q[state, a] == best]
        state = random.choice(candidates)

        print("the robot goes to " + str(state) + '.')
        steps += 1

执行结果如下:

现在严格按照最上面给出的q-learning公式来:

import numpy as np
import random

# --- Second version: training with the full Q-learning update (learning rate) ---

# Q-table: q[s, a] = learned value of taking action a in state s.
# (np.matrix is deprecated; a plain ndarray supports the same indexing.)
q = np.zeros([6, 6])

# Immediate-reward / environment model R[s, a]:
#   -1 -> no door, 0 -> door but no reward, 100 -> move reaches goal room 5.
r = np.array([[-1, -1, -1, -1,  0, -1 ],
              [-1, -1, -1,  0, -1, 100],
              [-1, -1, -1,  0, -1, -1 ],
              [-1,  0,  0, -1,  0, -1 ],
              [ 0, -1, -1,  0, -1, 100],
              [-1,  0, -1, -1,  0, 100]])

# BUG FIX: was misspelled `gammma`, so the update below raised NameError.
gamma = 0.8  # discount factor
alpha = 0.1  # learning rate

# Number of training episodes; chosen by hand, no convergence check.
rep = 100


# Train the agent with the standard Q-learning update:
#   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (R(s, a) + gamma * max_a' Q(s', a'))
for i in range(rep):
    # one episode: random start room, walk until the exit (room 5)
    state = random.randint(0, 5)
    while state != 5:
        # Legal moves: actions with R >= 0 mean a door exists.
        r_pos_action = []
        for action in range(6):
            if r[state, action] >= 0:
                r_pos_action.append(action)

        next_state = r_pos_action[random.randint(0, len(r_pos_action) - 1)]
        # BUG FIX: the old value blended in must be Q(state, next_state)
        # itself, not Q(last_state, state) from the previous transition --
        # that is what the cited Q-learning formula prescribes.
        q[state, next_state] = (1 - alpha) * q[state, next_state] \
            + alpha * (r[state, next_state] + gamma * q[next_state].max())
        state = next_state
		

# Evaluate the trained agent over 10 episodes: start in a random room and
# follow a greedy (max-Q) policy, with random tie-breaking, until room 5.
for episode_no in range(10):
    print("episode: " + str(episode_no + 1))

    # random initial state
    state = random.randint(0, 5)
    print("the robot borns in " + str(state) + ".")

    moves = 0
    while state != 5:
        # Abort runaway episodes: a well-trained table reaches the exit
        # in far fewer than 20 greedy moves.
        if moves > 20:
            print('fails')
            break

        # Greedy action selection: find the top Q-value for this state,
        # then choose uniformly among all actions achieving it.
        top = max(q[state, a] for a in range(6))
        ties = [a for a in range(6) if q[state, a] == top]
        state = random.choice(ties)

        print("the robot goes to " + str(state) + '.')
        moves = moves + 1

执行结果:

三:

 

 

 

 

 

 

 

 

 

posted on 2018-04-10 10:14  单曲循环903  阅读(241)  评论(0)    收藏  举报