Reinforcement Learning Theory - Lecture 9 - Policy Gradient Methods

1. Basic idea of policy gradient

Previously, policies were represented as tables; now the policy is described in the form of a function instead, i.e. a parameterized policy $\pi(a \mid s, \theta)$ where $\theta$ is a parameter vector (for example, the weights of a neural network).
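A minimal sketch of such a function-form policy, assuming (to match the REINFORCE code in Section 5) a normalized (y, x) grid position as the two-dimensional input, a hidden width of 64, and 5 grid-world actions as the output; these sizes are assumptions, not from the original notes:

import torch
import torch.nn as nn

class PolicyNet(nn.Module):
    """A small MLP that maps a state feature vector to action probabilities."""
    def __init__(self, state_dim=2, hidden_dim=64, action_dim=5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1),  # each output row is a probability distribution over actions
        )

    def forward(self, x):
        return self.net(x)

# Example: probabilities of the 5 actions at the normalized position (0, 0)
# prb = PolicyNet()(torch.tensor([[0.0, 0.0]]))[0]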




2. Metric 1 - Average value
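In the standard formulation, the first metric is the average state value: a weighted average of the state values $v_\pi(s)$ under the parameterized policy $\pi(a \mid s, \theta)$,

$$\bar v_\pi = \sum_{s \in \mathcal S} d(s)\, v_\pi(s) = \mathbb E_{S \sim d}\big[v_\pi(S)\big],$$

where $d(s)$ is a probability distribution over states (for example uniform, or concentrated on a single start state $s_0$, in which case $\bar v_\pi = v_\pi(s_0)$). An equivalent sample-based form is $\bar v_\pi = \mathbb E\big[\sum_{t=0}^{\infty} \gamma^t R_{t+1}\big]$, the expected discounted return of a whole trajectory.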




3. Metric 2 - Average reward
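The second metric is the average one-step reward under the stationary distribution $d_\pi$ of the policy:

$$\bar r_\pi = \sum_{s} d_\pi(s)\, r_\pi(s), \qquad r_\pi(s) = \sum_{a} \pi(a \mid s, \theta) \sum_{r} p(r \mid s, a)\, r,$$

i.e. the reward earned per step in the long run, $\bar r_\pi = \lim_{n \to \infty} \frac{1}{n}\, \mathbb E\big[\sum_{k=1}^{n} R_{t+k}\big]$. In the discounted case, when $\bar v_\pi$ is also weighted by the stationary distribution $d_\pi$, the two metrics are equivalent up to a constant, $\bar r_\pi = (1-\gamma)\, \bar v_\pi$, so optimizing either one selects the same policies.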








4. Gradients of the metrics
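Both metrics lead to gradients of the same form (the policy gradient theorem):

$$\nabla_\theta J(\theta) = \sum_{s} \eta(s) \sum_{a} \nabla_\theta\, \pi(a \mid s, \theta)\, q_\pi(s, a) = \mathbb E_{S \sim \eta,\, A \sim \pi}\big[\nabla_\theta \ln \pi(A \mid S, \theta)\, q_\pi(S, A)\big],$$

where $\eta$ is a state distribution whose exact form depends on which metric is used and on whether the discounted or undiscounted case is considered. The second, log-likelihood form is the useful one: being an expectation, it can be approximated by sampling $(S, A)$, and it requires $\pi(a \mid s, \theta) > 0$ for all actions, which is why a softmax output layer is used in the policy network.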








5. Gradient-ascent algorithm (REINFORCE)
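Maximizing $J(\theta)$ by stochastic gradient ascent, and replacing the unknown $q_\pi(s_t, a_t)$ with the Monte-Carlo return $g_t$ of a sampled episode, gives the REINFORCE update

$$\theta_{t+1} = \theta_t + \alpha\, \nabla_\theta \ln \pi(a_t \mid s_t, \theta_t)\, g_t, \qquad g_t = \sum_{k=t+1}^{T} \gamma^{\,k-t-1} r_k.$$

In the PyTorch implementation below this ascent step is carried out by a standard (descent) optimizer, so the per-step loss is the negated term $-\ln \pi(a_t \mid s_t, \theta)\, g_t$; the losses of all steps of an episode are backpropagated and then applied with a single optimizer.step(). The code assumes import numpy as np and import torch at module level, and relies on members defined elsewhere in the class: obtain_episode_p rolls out an episode with the current network, env.state2pos converts a state index to a grid position, and gama, qvalue, policy, and writer are class attributes.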








    def reinforce(self, learning_rate=0.001, epochs=20000, episode_length=100):
        policy_net = PolicyNet()
        optimizer = torch.optim.Adam(policy_net.parameters(),
                                     lr=learning_rate)
        for epoch in range(epochs):
            # start_state = 0
            # y, x = self.env.state2pos(start_state) / self.env.size
            # Feed the start state (0, 0) into the policy network to get the action probability distribution prb:
            #   torch.tensor((0, 0), dtype=torch.float): creates a float tensor of shape (2,)
            #   .reshape(-1, 2): reshapes it to (1, 2), the network's batched input format
            #   policy_net(...): forward pass, outputs a probability distribution over the actions
            #   [0]: takes the first (and only) result in the batch
            prb = policy_net(torch.tensor((0, 0), dtype=torch.float).reshape(-1, 2))[0]
            # Sample the initial action from the predicted probability distribution.
            start_action = np.random.choice(np.arange(self.action_space_size),
                                            p=prb.detach().numpy())
            episode = self.obtain_episode_p(policy_net, 0, start_action)
            if len(episode) < 10:
                # Penalize episodes that terminate within 10 steps by starting
                # the return accumulation from -100.
                g = -100
            else:
                g = 0
            optimizer.zero_grad()

            # Walk the episode backwards, accumulating the discounted return g
            # and summing the policy-gradient loss of every visited step.
            for step in reversed(range(len(episode))):
                reward = episode[step]['reward']
                state = episode[step]['state']
                action = episode[step]['action']
                if len(episode) > 1000:
                    # Debug output for abnormally long episodes.
                    print(g, reward)
                g = self.gama * g + reward  # discounted return from this step onwards
                self.qvalue[state, action] = g  # Monte-Carlo estimate of q(s, a)
                y, x = self.env.state2pos(state) / self.env.size  # normalized (y, x) position
                prb = policy_net(torch.tensor((y, x), dtype=torch.float).reshape(-1, 2))[0]
                log_prob = torch.log(prb[action])
                loss = -log_prob * g  # negated so that a descent optimizer performs gradient ascent
                loss.backward()  # backpropagate; gradients accumulate until optimizer.step()
            self.writer.add_scalar('loss', float(loss.detach()), epoch)
            self.writer.add_scalar('g', g, epoch)
            self.writer.add_scalar('episode_length', len(episode), epoch)
            print(epoch, len(episode), g)
            optimizer.step()
        # After training, write the learned action probabilities of every state
        # back into the tabular policy for inspection.
        for s in range(self.state_space_size):
            y, x = self.env.state2pos(s) / self.env.size
            prb = policy_net(torch.tensor((y, x), dtype=torch.float).reshape(-1, 2))[0].detach().numpy()
            self.policy[s, :] = prb.copy()
        self.writer.close()
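Note the design choice of accumulating gradients over every step of an episode before calling optimizer.step() once: loss.backward() inside the loop only adds to the .grad buffers, so the single update uses the summed policy-gradient signal of the whole episode. The returns g are also cached in self.qvalue, and the final softmax outputs are copied into the tabular self.policy, so the learned function-form policy can be inspected with the same tools as the earlier tabular methods.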