Q-learning Algorithm
1. Initialize a Q-table of size state × action
2. Select an action with an ε-greedy policy, and observe the reward and the next state
3. Update the Q-table using the reward and the next state (see the update rule below)
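The update in step 3 is the standard Q-learning rule; it is the same computation as the `target` line in the code below, with learning rate α (learning_rate) and discount factor γ (discount_factor):

Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]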

import gym
import numpy as np

# Create the CliffWalking-v0 environment
env = gym.make('CliffWalking-v0')

# Initialize the Q-table
state_space_size = env.observation_space.n
action_space_size = env.action_space.n
Q_table = np.zeros((state_space_size, action_space_size))
print(state_space_size, action_space_size)

# Hyperparameters
learning_rate = 0.1
discount_factor = 0.99
exploration_rate = 1.0
max_exploration_rate = 1.0
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

# Training episodes
num_episodes = 5000
max_steps_per_episode = 100

# Q-learning training loop
for episode in range(num_episodes):
    # Reset the environment (newer gym versions return an (obs, info) tuple)
    state = env.reset()
    if isinstance(state, tuple):
        state = state[0]
    done = False

    for step in range(max_steps_per_episode):
        # ε-greedy action selection
        exploration_rate_threshold = np.random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(Q_table[state, :])
        else:
            action = env.action_space.sample()

        # Take the action, observe the reward and the next state
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Update the Q-table: Q(s,a) += lr * (target - Q(s,a))
        target = reward + discount_factor * np.max(Q_table[new_state, :])
        Q_table[state, action] = Q_table[state, action] + learning_rate * (target - Q_table[state, action])

        state = new_state
        if done:
            break

    # Gradually decay the exploration rate
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

# Evaluate the trained (greedy) policy
for episode in range(3):
    state = env.reset()
    if isinstance(state, tuple):
        state = state[0]
    done = False
    total_rewards = 0  # reset per episode so the printed return is per-episode
    print(f"***** Episode {episode + 1} *****")
    while not done:
        action = np.argmax(Q_table[state, :])
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        total_rewards += reward
        env.render()  # on newer gym versions a render_mode may need to be passed to gym.make()
        state = new_state
    print(f"Total rewards in episode {episode + 1}: {total_rewards}")

env.close()
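As a sanity check: in CliffWalking-v0 the shortest safe path from start to goal takes 13 steps at -1 reward per step, so if training has converged each greedy test episode should report a return of about -13.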
Questions:
1. For games like chess, where the reward is only known at the end of the game, how do we define the return for each step?
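One common way to handle this (a sketch of my own, not part of these notes) is to give every intermediate move a reward of 0 and only assign a terminal reward such as +1 / -1 / 0 at the end of the game; the discounted return then propagates that outcome back to earlier moves, either through bootstrapping (as in the Q-learning target above) or by a Monte Carlo backward pass over the finished episode. A minimal Monte Carlo-style sketch, assuming a simple list of per-step rewards:

import numpy as np

def monte_carlo_returns(rewards, gamma=1.0):
    """Compute the discounted return G_t for every step of one episode.

    For a board game, `rewards` is typically all zeros except the final
    entry (+1 win, -1 loss, 0 draw); the backward pass spreads that
    terminal outcome to every earlier move.
    """
    returns = np.zeros(len(rewards))
    g = 0.0
    for t in reversed(range(len(rewards))):
        g = rewards[t] + gamma * g
        returns[t] = g
    return returns

# Example: a 5-move game that ends in a win (+1); intermediate moves get reward 0.
episode_rewards = [0, 0, 0, 0, 1]
print(monte_carlo_returns(episode_rewards, gamma=0.9))
# -> [0.6561, 0.729, 0.81, 0.9, 1.0]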