Q-learning Algorithm
1. Initialize a Q-table of size state × action
2. Select an action with an ε-greedy policy, and observe the reward and the next state
3. Update the Q-table using the reward and the next state (see the update rule below)
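The update in step 3 is the standard Q-learning rule; it is the same computation as the `target` line in the code below, with learning rate α (learning_rate) and discount factor γ (discount_factor):

Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]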

import gym
import numpy as np

# Create the CliffWalking-v0 environment
env = gym.make('CliffWalking-v0')

# Initialize the Q-table
state_space_size = env.observation_space.n
action_space_size = env.action_space.n
Q_table = np.zeros((state_space_size, action_space_size))
print(state_space_size, action_space_size)

# Hyperparameters
learning_rate = 0.1
discount_factor = 0.99
exploration_rate = 1.0
max_exploration_rate = 1.0
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

# Training episodes
num_episodes = 5000
max_steps_per_episode = 100

# Q-learning training loop
for episode in range(num_episodes):
    # Reset the environment (newer gym versions return an (obs, info) tuple)
    state = env.reset()
    if isinstance(state, tuple):
        state = state[0]
    done = False

    for step in range(max_steps_per_episode):
        # ε-greedy action selection
        exploration_rate_threshold = np.random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(Q_table[state, :])
        else:
            action = env.action_space.sample()

        # Take the action, observe the reward and the next state
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Update the Q-table: Q(s,a) += lr * (target - Q(s,a))
        target = reward + discount_factor * np.max(Q_table[new_state, :])
        Q_table[state, action] = Q_table[state, action] + learning_rate * (target - Q_table[state, action])

        state = new_state
        if done:
            break

    # Gradually decay the exploration rate
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

# Evaluate the trained (greedy) policy
for episode in range(3):
    state = env.reset()
    if isinstance(state, tuple):
        state = state[0]
    done = False
    total_rewards = 0  # reset per episode so the printed return is per-episode
    print(f"***** Episode {episode + 1} *****")
    while not done:
        action = np.argmax(Q_table[state, :])
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        total_rewards += reward
        env.render()  # on newer gym versions a render_mode may need to be passed to gym.make()
        state = new_state
    print(f"Total rewards in episode {episode + 1}: {total_rewards}")

env.close()
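As a sanity check: in CliffWalking-v0 the shortest safe path from start to goal takes 13 steps at -1 reward per step, so if training has converged each greedy test episode should report a return of about -13.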
Questions:
1. For games like chess, where the reward is only known at the end of the game, how do we define the return for each step?
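One common way to handle this (a sketch of my own, not part of these notes) is to give every intermediate move a reward of 0 and only assign a terminal reward such as +1 / -1 / 0 at the end of the game; the discounted return then propagates that outcome back to earlier moves, either through bootstrapping (as in the Q-learning target above) or by a Monte Carlo backward pass over the finished episode. A minimal Monte Carlo-style sketch, assuming a simple list of per-step rewards:

import numpy as np

def monte_carlo_returns(rewards, gamma=1.0):
    """Compute the discounted return G_t for every step of one episode.

    For a board game, `rewards` is typically all zeros except the final
    entry (+1 win, -1 loss, 0 draw); the backward pass spreads that
    terminal outcome to every earlier move.
    """
    returns = np.zeros(len(rewards))
    g = 0.0
    for t in reversed(range(len(rewards))):
        g = rewards[t] + gamma * g
        returns[t] = g
    return returns

# Example: a 5-move game that ends in a win (+1); intermediate moves get reward 0.
episode_rewards = [0, 0, 0, 0, 1]
print(monte_carlo_returns(episode_rewards, gamma=0.9))
# -> [0.6561, 0.729, 0.81, 0.9, 1.0]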