Reinforcement Learning Theory - Lecture 8 - Value Function Approximation

When the state space is too large, the state values or action values have to be approximated by a parameterized function, which makes them tractable to store and learn.
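In the simplest (linear) case the approximation is

\[
\hat{v}(s, w) = \phi(s)^{\top} w ,
\]

where \(\phi(s)\) is a feature vector of the state and \(w\) is a parameter vector whose dimension is much smaller than the number of states; this is exactly what the gfv helpers and the weight vector w in the code below implement.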

1. Algorithm for state value estimation

1.1 Objective function


Here \(d_{\pi}\) is a weighting over states: a state with a larger weight contributes more to the objective, so the optimization drives its approximation error to be smaller.
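Written out in the course's notation, the objective weights each state's squared error by \(d_{\pi}\):

\[
J(w) = \mathbb{E}_{S \sim d_{\pi}}\big[\big(v_{\pi}(S) - \hat{v}(S, w)\big)^{2}\big]
     = \sum_{s} d_{\pi}(s)\,\big(v_{\pi}(s) - \hat{v}(s, w)\big)^{2} .
\]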


The \(P_{\pi}\) here is the same matrix as in the Bellman equation; each entry is the probability of transitioning from the current state to the next state under the policy \(\pi\).
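Concretely, \(d_{\pi}\) is taken as the stationary (long-run) distribution of the Markov chain induced by \(\pi\), i.e. the distribution satisfying

\[
d_{\pi}^{\top} = d_{\pi}^{\top} P_{\pi}, \qquad \text{equivalently} \qquad d_{\pi}(s') = \sum_{s} d_{\pi}(s)\,[P_{\pi}]_{s s'} .
\]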

1.2 Optimization algorithms
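The td_value_approximation method below implements the TD-learning algorithm with this objective and a linear approximator; one update step, written as a formula that mirrors that code, is

\[
w_{t+1} = w_{t} + \alpha_{t}\big[\,r_{t+1} + \gamma\,\phi(s_{t+1})^{\top} w_{t} - \phi(s_{t})^{\top} w_{t}\,\big]\,\phi(s_{t}) .
\]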




1.3 Selection of function approximators
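Two kinds of approximators appear in the code below: linear-in-feature models \(\hat{v}(s, w) = \phi(s)^{\top} w\), where \(\phi(s)\) is built from polynomial or Fourier features of the normalized grid coordinates (the gfv / gfv_a helpers), and a small neural network (the QNET used in the Deep Q-learning part). In the Fourier case each feature has the form

\[
\phi_{ij}(x, y) = \cos\!\big(\pi\,(i\,\bar{x} + j\,\bar{y})\big), \qquad i, j = 0, \dots, q ,
\]

with \(\bar{x}, \bar{y}\) the normalized coordinates and \(q\) the order (the ord argument).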




2. Sarsa with function approximation (action value)
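With an action-value approximator \(\hat{q}(s, a, w)\) (in the code, the gfv_a features dotted with \(w\)), the Sarsa update becomes

\[
w \leftarrow w + \alpha\big[\,r + \gamma\,\hat{q}(s', a', w) - \hat{q}(s, a, w)\,\big]\,\nabla_{w}\hat{q}(s, a, w) ,
\]

where \(a'\) is the action actually selected in \(s'\) by the current \(\epsilon\)-greedy policy.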


3. Q-learning with function approximation
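Q-learning with function approximation differs only in the target, which bootstraps from the greedy action in the next state:

\[
w \leftarrow w + \alpha\big[\,r + \gamma\,\max_{a}\hat{q}(s', a, w) - \hat{q}(s, a, w)\,\big]\,\nabla_{w}\hat{q}(s, a, w) .
\]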


4. Deep Q-learning
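Deep Q-learning replaces the hand-crafted features with a neural network \(\hat{q}(s, a, w)\) and minimizes the squared Bellman-optimality error against a periodically synchronized target network with parameters \(w_{T}\) (this is what the dqn method below does):

\[
J(w) = \mathbb{E}\Big[\big(R + \gamma\,\max_{a}\hat{q}(S', a, w_{T}) - \hat{q}(S, A, w)\big)^{2}\Big] .
\]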





5. Deep Q-learning - Experience replay
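Experience replay stores transitions once and then draws training minibatches from them uniformly at random, which breaks the correlation between consecutive samples and lets each transition be reused many times. In the dqn method below the same effect comes from get_data_iter, which shuffles one long behavior-policy episode into minibatches; a stand-alone buffer could look like the following sketch (a hypothetical helper, not part of the class below):

import random
from collections import deque

class ReplayBuffer:
    """Minimal replay-buffer sketch: store transitions, sample uniform minibatches."""

    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)  # oldest transitions are dropped first

    def add(self, state, action, reward, next_state):
        self.buffer.append((state, action, reward, next_state))

    def sample(self, batch_size):
        # uniform sampling approximates i.i.d. draws from the stored transitions
        return random.sample(self.buffer, batch_size)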



6. Deep Q-learning - implementation and example
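The implementation below consists of methods from the author's solver class. The surrounding file is assumed to import numpy as np, matplotlib.pyplot as plt, torch, and from torch.utils import data, and to define self.writer (a TensorBoard-style summary writer) as well as the QNET network used by the dqn method. QNET itself is not shown in this post; a minimal sketch of what it might look like (purely an assumption: a small MLP mapping a (y, x, action) triple to a single q-value, matching how q_net is called in dqn) is:

import torch

class QNET(torch.nn.Module):
    """Hypothetical minimal q-network: maps a (y, x, action) triple to one q-value."""

    def __init__(self, hidden=64):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(3, hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden, 1),
        )

    def forward(self, x):
        # inputs arrive as integer tensors of shape (batch, 3); cast to float for the MLP
        return self.net(x.float())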




    def gfv(self, fourier: bool, state: int, ord: int) -> np.ndarray:
        """
        get_feature_vector
        :param fourier: whether to use Fourier features (otherwise polynomial features)
        :param state: state index
        :param ord: highest order of the feature functions / Fourier order q (as in the book)
        :return: the feature vector evaluated at the given state
        """

        if state < 0 or state >= self.state_space_size:
            raise ValueError("Invalid state value")
        y, x = self.env.state2pos(state) + (1, 1)
        feature_vector = []
        if fourier:
            # scale the (1-based) grid coordinates into (0, 1]
            x_normalized = x / self.env.size
            y_normalized = y / self.env.size
            for i in range(ord + 1):
                for j in range(ord + 1):
                    feature_vector.append(np.cos(np.pi * (i * x_normalized + j * y_normalized)))

        else:
            # shift the coordinates by (size - 1) / 2 and rescale by (size - 1)
            x_normalized = (x - (self.env.size - 1) * 0.5) / (self.env.size - 1)
            y_normalized = (y - (self.env.size - 1) * 0.5) / (self.env.size - 1)
            for i in range(ord + 1):
                for j in range(i + 1):
                    feature_vector.append(y_normalized ** (ord - i) * x_normalized ** j)

        return np.array(feature_vector)

    def gfv_a(self, fourier: bool, state: int, action: int, ord: int) -> np.ndarray:
        """
        get_feature_vector_with_action
        :param fourier: whether to use Fourier features (otherwise polynomial features)
        :param state: state index
        :param action: action index
        :param ord: highest order of the feature functions / Fourier order q (as in the book)
        :return: the feature vector evaluated at the given state-action pair
        """

        if state < 0 or state >= self.state_space_size or action < 0 or action >= self.action_space_size:
            raise ValueError("Invalid state/action value")
        feature_vector = []
        y, x = self.env.state2pos(state) + (1, 1)

        if fourier:
            # scale the (1-based) grid coordinates and the action index into (0, 1]
            x_normalized = x / self.env.size
            y_normalized = y / self.env.size
            action_normalized = action / self.action_space_size
            for i in range(ord + 1):
                for j in range(ord + 1):
                    for k in range(ord + 1):
                        feature_vector.append(
                            np.cos(np.pi * (i * x_normalized + j * action_normalized + k * y_normalized)))

        else:
            # shift the state and action indices by half their range and rescale
            state_normalized = (state - (self.state_space_size - 1) * 0.5) / (self.state_space_size - 1)
            action_normalized = (action - (self.action_space_size - 1) * 0.5) / (self.action_space_size - 1)
            for i in range(ord + 1):
                for j in range(i + 1):
                    feature_vector.append(state_normalized ** (ord - i) * action_normalized ** j)
        return np.array(feature_vector)

    def td_value_approximation(self, learning_rate=0.0005, epochs=100000, fourier=True, ord=5):
        self.state_value = self.policy_evaluation(self.policy)  # reference state values for the RMSE curve
        if not isinstance(learning_rate, float) or not isinstance(epochs, int) or not isinstance(
                fourier, bool) or not isinstance(ord, int):
            raise TypeError("Invalid input type")
        if learning_rate <= 0 or epochs <= 0 or ord <= 0:
            raise ValueError("Invalid input value")
        episode_length = epochs
        start_state = np.random.randint(self.state_space_size)
        start_action = np.random.choice(np.arange(self.action_space_size),
                                        p=self.mean_policy[start_state])
        episode = self.obtain_episode(self.mean_policy, start_state, start_action, length=episode_length)
        dim = (ord + 1) ** 2 if fourier else np.arange(ord + 2).sum()
        w = np.random.default_rng().normal(size=dim)
        rmse = []
        value_approximation = np.zeros(self.state_space_size)
        for epoch in range(epochs):
            reward = episode[epoch]['reward']
            state = episode[epoch]['state']
            next_state = episode[epoch]['next_state']
            target = reward + self.gama * np.dot(self.gfv(fourier, next_state, ord), w)
            error = target - np.dot(self.gfv(fourier, state, ord), w)
            gradient = self.gfv(fourier, state, ord)
            w = w + learning_rate * error * gradient
            for state in range(self.state_space_size):
                value_approximation[state] = np.dot(self.gfv(fourier, state, ord), w)
            rmse.append(np.sqrt(np.mean((value_approximation - self.state_value) ** 2)))
            if epoch % 1000 == 0:
                print(epoch, rmse[-1])
        # the plotting below assumes a 5 x 5 grid world
        X, Y = np.meshgrid(np.arange(1, 6), np.arange(1, 6))
        Z = self.state_value.reshape(5, 5)
        Z1 = value_approximation.reshape(5, 5)
        # Plot the true state values and the approximation as 3D surfaces
        fig = plt.figure(figsize=(8, 6))  # figure size: 8 x 6
        ax = fig.add_subplot(121, projection='3d')
        ax.plot_surface(X, Y, Z)
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('State Value')
        z_min = -5
        z_max = -2
        ax.set_zlim(z_min, z_max)
        ax1 = fig.add_subplot(122, projection='3d')
        ax1.plot_surface(X, Y, Z1)
        ax1.set_xlabel('X')
        ax1.set_ylabel('Y')
        ax1.set_zlabel('Value Approximation')
        ax1.set_zlim(z_min, z_max)
        fig_rmse = plt.figure(figsize=(8, 6))  # figure size: 8 x 6
        ax_rmse = fig_rmse.add_subplot(111)

        # Plot the RMSE curve
        ax_rmse.plot(rmse)
        ax_rmse.set_title('RMSE')
        ax_rmse.set_xlabel('Epoch')
        ax_rmse.set_ylabel('RMSE')
        plt.show()
        return value_approximation

    def sarsa_function_approximation(self, learning_rate=0.0005, epsilon=0.1, num_episodes=100000, fourier=True, ord=5):
        # the Fourier variant of gfv_a returns (ord + 1) ** 3 features,
        # so the weight vector must have the matching dimension
        dim = (ord + 1) ** 3 if fourier else np.arange(ord + 2).sum()
        w = np.random.default_rng().normal(size=dim)

        qvalue_approximation = np.zeros((self.state_space_size, self.action_space_size))
        reward_list = []
        length_list = []
        rmse = []
        policy_rmse = []
        policy = self.mean_policy.copy()

        for episode in range(num_episodes):
            # epsilon = (epsilon - 1 / num_episodes) if epsilon > 0 else 0
            done = False
            self.env.reset()
            # derive the start state from the environment after reset instead of
            # carrying over the terminal state of the previous episode
            next_state = self.env.pos2state(self.env.agent_location)
            total_rewards = 0
            episode_length = 0
            while not done:
                state = next_state
                action = np.random.choice(np.arange(self.action_space_size),
                                          p=policy[state])
                _, reward, done, _, _ = self.env.step(action)
                episode_length += 1
                total_rewards += reward
                next_state = self.env.pos2state(self.env.agent_location)
                next_action = np.random.choice(np.arange(self.action_space_size),
                                               p=policy[next_state])
                target = reward + self.gama * np.dot(self.gfv_a(fourier, next_state, next_action, ord), w)
                error = target - np.dot(self.gfv_a(fourier, state, action, ord), w)
                gradient = self.gfv_a(fourier, state, action, ord)
                w = w + learning_rate * error * gradient
                # for state in range(self.state_space_size):
                #     for action in range(self.action_space_size):
                qvalue_approximation[state, action] = np.dot(self.gfv_a(fourier, state, action, ord), w)

                qvalue_star = qvalue_approximation[state].max()
                action_star = qvalue_approximation[state].tolist().index(qvalue_star)
                for a in range(self.action_space_size):
                    if a == action_star:
                        policy[state, a] = 1 - (
                                self.action_space_size - 1) / self.action_space_size * epsilon
                    else:
                        policy[state, a] = 1 / self.action_space_size * epsilon
            rmse.append(np.sqrt(np.mean((qvalue_approximation - self.qvalue) ** 2)))
            # policy_rmse.append(np.sqrt(np.mean((policy - self.policy) ** 2)))
            reward_list.append(total_rewards)
            length_list.append(episode_length)
            print("episode={},length={},reward={}".format(episode, episode_length, total_rewards))

        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(211)
        ax.plot(reward_list)
        ax.set_ylabel('total_reward')
        ax1 = fig.add_subplot(212)
        ax1.plot(length_list)
        ax1.set_xlabel('episode index')
        ax1.set_ylabel('episode length')
        fig_rmse = plt.figure(figsize=(8, 6))  # figure size: 8 x 6
        ax_rmse = fig_rmse.add_subplot(111)
        ax_rmse.plot(rmse, label='qvalue')
        # ax_rmse.plot(policy_rmse,label='policy')

        ax_rmse.set_title('RMSE')
        ax_rmse.set_xlabel('Epoch')
        ax_rmse.set_ylabel('RMSE')
        X, Y = np.meshgrid(np.arange(0, self.action_space_size), np.arange(0, self.state_space_size))
        Z = self.qvalue
        Z1 = qvalue_approximation
        print(Z.shape, Z1.shape, X.shape)

        # Plot the true q-values and the approximation as 3D surfaces
        fig = plt.figure(figsize=(8, 6))  # figure size: 8 x 6
        ax = fig.add_subplot(121, projection='3d')
        ax.plot_surface(X, Y, Z)
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('q Value')
        # z_min = -5
        # z_max = -2
        # ax.set_zlim(z_min, z_max)
        ax1 = fig.add_subplot(122, projection='3d')
        ax1.plot_surface(X, Y, Z1)
        ax1.set_xlabel('X')
        ax1.set_ylabel('Y')
        ax1.set_zlabel('qValue Approximation')

        plt.show()
        return qvalue_approximation

    def qlearning_function_approximation(self, learning_rate=0.0005, epsilon=0.1, num_episodes=100000, fourier=True,
                                         ord=15):
        # the Fourier variant of gfv_a returns (ord + 1) ** 3 features,
        # so the weight vector must have the matching dimension
        dim = (ord + 1) ** 3 if fourier else np.arange(ord + 2).sum()
        w = np.random.default_rng().normal(size=dim)

        qvalue_approximation = np.zeros((self.state_space_size, self.action_space_size))
        reward_list = []
        length_list = []
        rmse = []
        policy = self.mean_policy.copy()

        for episode in range(num_episodes):
            # epsilon = (epsilon - 1 / num_episodes) if epsilon > 0 else 0
            done = False
            self.env.reset()
            # derive the start state from the environment after reset instead of
            # carrying over the terminal state of the previous episode
            next_state = self.env.pos2state(self.env.agent_location)
            total_rewards = 0
            episode_length = 0
            while not done:
                state = next_state
                action = np.random.choice(np.arange(self.action_space_size),
                                          p=policy[state])
                _, reward, done, _, _ = self.env.step(action)
                episode_length += 1
                total_rewards += reward
                next_state = self.env.pos2state(self.env.agent_location)
                q_list = []
                for a in range(self.action_space_size):
                    q_list.append(np.dot(self.gfv_a(fourier, next_state, a, ord), w))

                target = reward + self.gama * np.array(q_list).max()
                error = target - np.dot(self.gfv_a(fourier, state, action, ord), w)
                gradient = self.gfv_a(fourier, state, action, ord)
                w = w + learning_rate * error * gradient
                for s in range(self.state_space_size):
                    for a in range(self.action_space_size):
                        qvalue_approximation[s, a] = np.dot(self.gfv_a(fourier, s, a, ord), w)
                qvalue_star = qvalue_approximation[state].max()
                action_star = qvalue_approximation[state].tolist().index(qvalue_star)
                for a in range(self.action_space_size):
                    if a == action_star:
                        policy[state, a] = 1 - (
                                self.action_space_size - 1) / self.action_space_size * epsilon
                    else:
                        policy[state, a] = 1 / self.action_space_size * epsilon
            self.writer.add_scalar('rmse', np.sqrt(np.mean((qvalue_approximation - self.qvalue) ** 2)), episode)
            self.writer.add_scalar('episode_length', episode_length, episode)
            self.writer.add_scalar('total_reward', total_rewards, episode)

            # policy_rmse.append(np.sqrt(np.mean((policy - self.policy) ** 2)))
            # reward_list.append(total_rewards)
            # length_list.append(episode_length)
            print("episode={},length={},reward={}".format(episode, episode_length, total_rewards))

        # fig = plt.figure(figsize=(10, 10))
        # ax = fig.add_subplot(211)
        # ax.plot(reward_list)
        # ax.set_ylabel('total_reward')
        # ax1 = fig.add_subplot(212)
        # ax1.plot(length_list)
        # ax1.set_xlabel('episode index')
        # ax1.set_ylabel('episode length')
        # fig_rmse = plt.figure(figsize=(8, 6))  # figure size: 8 x 6
        # ax_rmse = fig_rmse.add_subplot(111)
        # ax_rmse.plot(rmse, label='qvalue')
        # # ax_rmse.plot(policy_rmse,label='policy')
        #
        # ax_rmse.set_title('RMSE')
        # ax_rmse.set_xlabel('Epoch')
        # ax_rmse.set_ylabel('RMSE')
        X, Y = np.meshgrid(np.arange(0, self.action_space_size), np.arange(0, self.state_space_size))
        Z = self.qvalue
        Z1 = qvalue_approximation
        print(Z.shape, Z1.shape, X.shape)

        # Plot the true q-values and the approximation as 3D surfaces
        fig = plt.figure(figsize=(8, 6))  # figure size: 8 x 6
        ax = fig.add_subplot(121, projection='3d')
        ax.plot_surface(X, Y, Z)
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('q Value')
        z_min = -6
        z_max = 0
        ax.set_zlim(z_min, z_max)
        ax1 = fig.add_subplot(122, projection='3d')
        ax1.plot_surface(X, Y, Z1)
        ax1.set_xlabel('X')
        ax1.set_ylabel('Y')
        ax1.set_zlabel('qValue Approximation')
        ax1.set_zlim(z_min, z_max)

        # fig_rmse = plt.figure(figsize=(8, 6))  # figure size: 8 x 6
        # ax_rmse = fig_rmse.add_subplot(111)
        self.writer.close()
        plt.show()
        return qvalue_approximation

    def qvalue_function_approximation(self, learning_rate=0.00008, epsilon=0.1, num_episodes=1000000,
                                      fourier=True,
                                      ord=5):
        #BUG
        dim = (ord + 1) ** 3 if fourier else np.arange(ord + 2).sum()
        w = np.random.default_rng().normal(size=dim)
        qvalue_approximation = np.zeros(shape=(self.state_space_size, self.action_space_size))
        episode = self.obtain_episode(self.mean_policy, 0, 0, length=100000)

        for epoch in range(num_episodes):
            # epsilon = (epsilon - 1 / num_episodes) if epsilon > 0 else 0
            step = int(np.random.randint(low=0, high=len(episode)))
            reward = episode[step]['reward']
            state = episode[step]['state']
            action = episode[step]['action']
            next_action = episode[step]['next_action']
            next_state = episode[step]['next_state']
            target = reward + self.gama * np.dot(self.gfv_a(fourier, next_state, next_action, ord), w)
            error = target - np.dot(self.gfv_a(fourier, state, action, ord), w)
            gradient = self.gfv_a(fourier, state, action, ord)
            w = w + learning_rate * error * gradient
            for a in range(self.action_space_size):
                qvalue_approximation[state, a] = np.dot(self.gfv_a(fourier, state, a, ord), w)
            self.writer.add_scalar('rmse', np.sqrt(np.mean((qvalue_approximation - self.qvalue) ** 2)), epoch)
            if epoch % 1000 == 0:
                print(epoch, np.sqrt(np.mean((qvalue_approximation - self.qvalue) ** 2)))
        X, Y = np.meshgrid(np.arange(0, self.action_space_size), np.arange(0, self.state_space_size))
        Z = self.qvalue
        Z1 = qvalue_approximation
        print(Z.shape, Z1.shape, X.shape)

        # Plot the true q-values and the approximation as 3D surfaces
        fig = plt.figure(figsize=(8, 6))  # figure size: 8 x 6
        ax = fig.add_subplot(121, projection='3d')
        ax.plot_surface(X, Y, Z)
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('q Value')
        z_min = -6
        z_max = 0
        ax.set_zlim(z_min, z_max)
        ax1 = fig.add_subplot(122, projection='3d')
        ax1.plot_surface(X, Y, Z1)
        ax1.set_xlabel('X')
        ax1.set_ylabel('Y')
        ax1.set_zlabel('qValue Approximation')
        ax1.set_zlim(z_min, z_max)
        for i in range(self.state_space_size):
            for j in range(self.action_space_size):
                print("qvalue:{},approximation:{}".format(self.qvalue[i, j], qvalue_approximation[i, j]))

        self.writer.close()
        plt.show()
        return qvalue_approximation

    def get_data_iter(self, episode, batch_size=64, is_train=True):
        """构造一个PyTorch数据迭代器"""
        reward = []
        state_action = []
        next_state = []
        for i in range(len(episode)):
            reward.append(episode[i]['reward'])
            action = episode[i]['action']
            y, x = self.env.state2pos(episode[i]['state'])
            state_action.append((y, x, action))
            y, x = self.env.state2pos(episode[i]['next_state'])
            next_state.append((y, x))
        reward = torch.tensor(reward).reshape(-1, 1)
        state_action = torch.tensor(state_action)
        next_state = torch.tensor(next_state)
        data_arrays = (state_action, reward, next_state)
        dataset = data.TensorDataset(*data_arrays)
        return data.DataLoader(dataset, batch_size, shuffle=is_train, drop_last=False)

    def dqn(self, learning_rate=0.0015, episode_length=5000, epochs=600, batch_size=100, update_step=10):
        q_net = QNET()
        policy = self.policy.copy()
        state_value = self.state_value.copy()
        q_target_net = QNET()
        q_target_net.load_state_dict(q_net.state_dict())
        optimizer = torch.optim.SGD(q_net.parameters(),
                                    lr=learning_rate)
        episode = self.obtain_episode(self.mean_policy, 0, 0, length=episode_length)
        date_iter = self.get_data_iter(episode, batch_size)
        loss = torch.nn.MSELoss()
        approximation_q_value = np.zeros(shape=(self.state_space_size, self.action_space_size))
        i = 0
        rmse_list = []
        loss_list = []
        for epoch in range(epochs):
            for state_action, reward, next_state in date_iter:
                i += 1
                q_value = q_net(state_action)
                q_value_target = torch.empty((batch_size, 0))  # empty tensor to collect target-net q-values per action
                for action in range(self.action_space_size):
                    s_a = torch.cat((next_state, torch.full((batch_size, 1), action)), dim=1)
                    q_value_target = torch.cat((q_value_target, q_target_net(s_a)), dim=1)
                q_star = torch.max(q_value_target, dim=1, keepdim=True)[0]
                y_target_value = reward + self.gama * q_star
                l = loss(q_value, y_target_value)
                optimizer.zero_grad()  # gradients accumulate by default in PyTorch, so reset them explicitly
                l.backward()  # 反向传播更新参数
                optimizer.step()
                if i % update_step == 0 and i != 0:
                    q_target_net.load_state_dict(
                        q_net.state_dict())  # sync the target network with the training network
                    # policy = np.zeros(shape=(self.state_space_size, self.action_space_size))
            loss_list.append(float(l))
            print("loss:{},epoch:{}".format(l, epoch))
            self.policy = np.zeros(shape=(self.state_space_size, self.action_space_size))
            self.state_value = np.zeros(shape=self.state_space_size)

            for s in range(self.state_space_size):
                y, x = self.env.state2pos(s)
                for a in range(self.action_space_size):
                    approximation_q_value[s, a] = float(q_net(torch.tensor((y, x, a)).reshape(-1, 3)))
                q_star_index = approximation_q_value[s].argmax()
                self.policy[s, q_star_index] = 1
                self.state_value[s] = approximation_q_value[s, q_star_index]
            rmse_list.append(np.sqrt(np.mean((state_value - self.state_value) ** 2)))
            # policy_rmse = np.sqrt(np.mean((policy - self.policy) ** 2))
        fig_rmse = plt.figure(figsize=(8, 12))  # figure size: 8 x 12
        ax_rmse = fig_rmse.add_subplot(211)

        # Plot the RMSE curve
        ax_rmse.plot(rmse_list)
        ax_rmse.set_title('RMSE')
        ax_rmse.set_xlabel('Epoch')
        ax_rmse.set_ylabel('RMSE')
        self.writer.close()
        ax_loss = fig_rmse.add_subplot(212)

        ax_loss.plot(loss_list)
        ax_loss.set_title('loss')
        ax_loss.set_xlabel('Epoch')
        ax_loss.set_ylabel('Loss')
        plt.show()

    def obtain_episode_p(self, policy_net, start_state, start_action):
        """
        Generate an episode by following the policy represented by policy_net.
        :param policy_net: network that outputs action probabilities for a (normalized) state
        :param start_state: initial state
        :param start_action: initial action
        :return: a sequence of state, action, reward, next_state, next_action records
        """
        self.env.agent_location = self.env.state2pos(start_state)
        episode = []
        next_action = start_action
        next_state = start_state
        done = False
        while not done:
            state = next_state
            action = next_action
            _, reward, done, _, _ = self.env.step(action)
            next_state = self.env.pos2state(self.env.agent_location)
            y, x = self.env.state2pos(next_state) / self.env.size
            prb = policy_net(torch.tensor((y, x)).reshape(-1, 2))[0]

            next_action = np.random.choice(np.arange(self.action_space_size),
                                           p=prb.detach().numpy())
            episode.append({"state": state, "action": action, "reward": reward, "next_state": next_state,
                            "next_action": next_action})
        return episode