Reproducing the PPO Reinforcement Learning Algorithm with Baidu PaddlePaddle and PARL

  • Policy Gradient (PG)
    • Sample efficiency is low: sampling and optimization use one and the same policy (on-policy).
    • Training is unstable and prone to collapse (likely because consecutive samples are highly correlated). Errors get amplified: if a batch of bad samples happens to be collected, the policy becomes worse after interacting with the environment and then produces even more bad data, so it is hard to recover from the mistake and training collapses.
  • How can training be made more stable?
    • Introduce a trust region mechanism so that the computed gradient always stays within a safe range (i.e., gentler updates).
    • Or use the natural policy gradient, a second-order optimization method (SGD relies on a first-order approximation, so the resulting policy update is less accurate).
  • How can it be made off-policy (data collection and optimization use different policies)?
    • Use importance sampling, as TRPO does.
  • A further improvement on TRPO: ACKTR
    • Improves TRPO's computational efficiency: the inversion of the Fisher information matrix is replaced with Kronecker-factored approximate curvature (K-FAC), which speeds up training considerably.
  • A simplification of TRPO: Proximal Policy Optimization (PPO)
    • Combines the natural gradient with TRPO's loss function in an unconstrained form.
    • Performs similarly to TRPO, but is faster because it is optimized with first-order SGD-style methods.
  • PPO with clipping
    • Clips the probability ratio to limit how far each update can move, so the policy output never changes too aggressively and updates stay stable.
    • Most PPO implementations use this form.
    • A classic PG algorithm can be rewritten into the PPO form with only a few lines of code, so it is easy to implement (see the sketch after this list).
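To make the last bullet concrete, here is a minimal NumPy sketch of the clipped surrogate objective. The function name and arguments are illustrative and are not part of the PARL code below; in the code below the actual loss is built inside parl.algorithms.PPO.

import numpy as np


def ppo_clip_surrogate(logp, old_logp, adv, epsilon=0.2):
    """Clipped surrogate objective from the PPO paper (to be maximized)."""
    # importance sampling ratio pi_new(a|s) / pi_old(a|s)
    ratio = np.exp(logp - old_logp)
    # keep the ratio inside [1 - epsilon, 1 + epsilon]
    clipped_ratio = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
    # pessimistic bound: an update that pushes the ratio outside the clip range
    # earns no extra reward, so the policy cannot change too aggressively
    return np.mean(np.minimum(ratio * adv, clipped_ratio * adv))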

一、Main program

#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import gym
import numpy as np
import parl
from mujoco_agent import MujocoAgent
from mujoco_model import MujocoModel
from parl.utils import logger, action_mapping
from parl.utils.rl_utils import calc_gae, calc_discount_sum_rewards
from scaler import Scaler


def run_train_episode(env, agent, scaler):
    obs = env.reset()
    observes, actions, rewards, unscaled_obs = [], [], [], []
    step = 0.0
    scale, offset = scaler.get()
    scale[-1] = 1.0  # don't scale time step feature
    offset[-1] = 0.0  # don't offset time step feature
    while True:
        obs = obs.reshape((1, -1))
        obs = np.append(obs, [[step]], axis=1)  # add time step feature
        unscaled_obs.append(obs)
        obs = (obs - offset) * scale  # center and scale observations
        obs = obs.astype('float32')
        observes.append(obs)

        action = agent.policy_sample(obs)
        action = np.clip(action, -1.0, 1.0)
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])

        action = action.reshape((1, -1)).astype('float32')
        actions.append(action)

        obs, reward, done, _ = env.step(np.squeeze(action))
        rewards.append(reward)
        step += 1e-3  # increment time step feature

        if done:
            break

    return (np.concatenate(observes), np.concatenate(actions),
            np.array(rewards, dtype='float32'), np.concatenate(unscaled_obs))


def run_evaluate_episode(env, agent, scaler):
    obs = env.reset()
    rewards = []
    step = 0.0
    scale, offset = scaler.get()
    scale[-1] = 1.0  # don't scale time step feature
    offset[-1] = 0.0  # don't offset time step feature
    while True:
        obs = obs.reshape((1, -1))
        obs = np.append(obs, [[step]], axis=1)  # add time step feature
        obs = (obs - offset) * scale  # center and scale observations
        obs = obs.astype('float32')

        action = agent.policy_predict(obs)
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])

        obs, reward, done, _ = env.step(np.squeeze(action))
        rewards.append(reward)

        step += 1e-3  # increment time step feature

        if done:
            break
    return np.sum(rewards)


def collect_trajectories(env, agent, scaler, episodes):
    trajectories, all_unscaled_obs = [], []
    for e in range(episodes):
        obs, actions, rewards, unscaled_obs = run_train_episode(
            env, agent, scaler)
        trajectories.append({
            'obs': obs,
            'actions': actions,
            'rewards': rewards,
        })
        all_unscaled_obs.append(unscaled_obs)
    # update running statistics for scaling observations
    scaler.update(np.concatenate(all_unscaled_obs))
    return trajectories


def build_train_data(trajectories, agent):
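    # For each trajectory this prepares:
    #   - discounted returns  G_t = r_t + gamma * G_{t+1}   (targets for the value network)
    #   - GAE advantages      A_t = sum_l (gamma * lam)^l * delta_{t+l},
    #     where delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    # Rewards are scaled by (1 - gamma) first, which keeps the discounted return
    # on roughly the same scale as a single-step reward.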
    train_obs, train_actions, train_advantages, train_discount_sum_rewards = [], [], [], []
    for trajectory in trajectories:
        pred_values = agent.value_predict(trajectory['obs'])

        # scale rewards
        scale_rewards = trajectory['rewards'] * (1 - args.gamma)

        discount_sum_rewards = calc_discount_sum_rewards(
            scale_rewards, args.gamma).astype('float32')

        advantages = calc_gae(scale_rewards, pred_values, 0, args.gamma,
                              args.lam)

        # normalize advantages
        advantages = (advantages - advantages.mean()) / (
            advantages.std() + 1e-6)
        advantages = advantages.astype('float32')

        train_obs.append(trajectory['obs'])
        train_actions.append(trajectory['actions'])
        train_advantages.append(advantages)
        train_discount_sum_rewards.append(discount_sum_rewards)

    train_obs = np.concatenate(train_obs)
    train_actions = np.concatenate(train_actions)
    train_advantages = np.concatenate(train_advantages)
    train_discount_sum_rewards = np.concatenate(train_discount_sum_rewards)

    return train_obs, train_actions, train_advantages, train_discount_sum_rewards


def main():
    env = gym.make(args.env)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    obs_dim += 1  # add 1 to obs dim for time step feature

    scaler = Scaler(obs_dim)

    model = MujocoModel(obs_dim, act_dim)
    alg = parl.algorithms.PPO(
        model,
        act_dim=act_dim,
        policy_lr=model.policy_lr,
        value_lr=model.value_lr)
    agent = MujocoAgent(
        alg, obs_dim, act_dim, args.kl_targ, loss_type=args.loss_type)

    # run a few episodes to initialize scaler
    collect_trajectories(env, agent, scaler, episodes=5)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        trajectories = collect_trajectories(
            env, agent, scaler, episodes=args.episodes_per_batch)
        total_steps += sum([t['obs'].shape[0] for t in trajectories])
        total_train_rewards = sum([np.sum(t['rewards']) for t in trajectories])

        train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
            trajectories, agent)

        policy_loss, kl = agent.policy_learn(train_obs, train_actions,
                                             train_advantages)
        value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)

        logger.info(
            'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
            .format(total_steps, total_train_rewards / args.episodes_per_batch,
                    policy_loss, kl, value_loss))
        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            eval_reward = run_evaluate_episode(env, agent, scaler)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, eval_reward))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--env',
        type=str,
        help='Mujoco environment name',
        default='HalfCheetah-v2')
    parser.add_argument(
        '--gamma', type=float, help='Discount factor', default=0.995)
    parser.add_argument(
        '--lam',
        type=float,
        help='Lambda for Generalized Advantage Estimation',
        default=0.98)
    parser.add_argument(
        '--kl_targ', type=float, help='D_KL target value', default=0.003)
    parser.add_argument(
        '--episodes_per_batch',
        type=int,
        help='Number of episodes per training batch',
        default=5)
    parser.add_argument(
        '--loss_type',
        type=str,
        help="Choose loss type of PPO algorithm, 'CLIP' or 'KLPEN'",
        default='CLIP')
    parser.add_argument(
        '--train_total_steps',
        type=int,
        default=int(1e7),
        help='maximum training steps')
    parser.add_argument(
        '--test_every_steps',
        type=int,
        default=int(1e4),
        help='the step interval between two consecutive evaluations')

    args = parser.parse_args()

    main()
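Assuming the script above is saved as train.py next to mujoco_agent.py, mujoco_model.py and scaler.py (sections 二–四 below), and that gym with the MuJoCo environments is installed, training can be launched roughly as follows; the file name train.py is an assumption, it is not given in the original:

python train.py --env HalfCheetah-v2 --loss_type CLIP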

二、mujoco_agent

#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import parl
from parl import layers
from paddle import fluid
from parl.utils import logger


class MujocoAgent(parl.Agent):
    def __init__(self,
                 algorithm,
                 obs_dim,
                 act_dim,
                 kl_targ,
                 loss_type,
                 beta=1.0,
                 epsilon=0.2,
                 policy_learn_times=20,
                 value_learn_times=10,
                 value_batch_size=256):
        self.alg = algorithm
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        assert loss_type == 'CLIP' or loss_type == 'KLPEN'
        self.loss_type = loss_type
        super(MujocoAgent, self).__init__(algorithm)

        self.policy_learn_times = policy_learn_times
        # Adaptive kl penalty coefficient
        self.beta = beta
        self.kl_targ = kl_targ

        self.value_learn_times = value_learn_times
        self.value_batch_size = value_batch_size
        self.value_learn_buffer = None

    def build_program(self):
        self.policy_predict_program = fluid.Program()
        self.policy_sample_program = fluid.Program()
        self.policy_learn_program = fluid.Program()
        self.value_predict_program = fluid.Program()
        self.value_learn_program = fluid.Program()

        with fluid.program_guard(self.policy_sample_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            sampled_act = self.alg.sample(obs)
            self.policy_sample_output = [sampled_act]

        with fluid.program_guard(self.policy_predict_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            means = self.alg.predict(obs)
            self.policy_predict_output = [means]

        with fluid.program_guard(self.policy_learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            actions = layers.data(
                name='actions', shape=[self.act_dim], dtype='float32')
            advantages = layers.data(
                name='advantages', shape=[1], dtype='float32')
            if self.loss_type == 'KLPEN':
                beta = layers.data(name='beta', shape=[], dtype='float32')
                loss, kl = self.alg.policy_learn(obs, actions, advantages,
                                                 beta)
            else:
                loss, kl = self.alg.policy_learn(obs, actions, advantages)

            self.policy_learn_output = [loss, kl]

        with fluid.program_guard(self.value_predict_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            value = self.alg.value_predict(obs)
            self.value_predict_output = [value]

        with fluid.program_guard(self.value_learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            val = layers.data(name='val', shape=[], dtype='float32')
            value_loss = self.alg.value_learn(obs, val)
            self.value_learn_output = [value_loss]

    def policy_sample(self, obs):
        feed = {'obs': obs}
        sampled_act = self.fluid_executor.run(
            self.policy_sample_program,
            feed=feed,
            fetch_list=self.policy_sample_output)[0]
        return sampled_act

    def policy_predict(self, obs):
        feed = {'obs': obs}
        means = self.fluid_executor.run(
            self.policy_predict_program,
            feed=feed,
            fetch_list=self.policy_predict_output)[0]
        return means

    def value_predict(self, obs):
        feed = {'obs': obs}
        value = self.fluid_executor.run(
            self.value_predict_program,
            feed=feed,
            fetch_list=self.value_predict_output)[0]
        return value

    def _batch_policy_learn(self, obs, actions, advantages):
        if self.loss_type == 'KLPEN':
            feed = {
                'obs': obs,
                'actions': actions,
                'advantages': advantages,
                'beta': self.beta
            }
        else:
            feed = {'obs': obs, 'actions': actions, 'advantages': advantages}
        [loss, kl] = self.fluid_executor.run(
            self.policy_learn_program,
            feed=feed,
            fetch_list=self.policy_learn_output)
        return loss, kl

    def _batch_value_learn(self, obs, val):
        feed = {'obs': obs, 'val': val}
        value_loss = self.fluid_executor.run(
            self.value_learn_program,
            feed=feed,
            fetch_list=self.value_learn_output)[0]
        return value_loss

    def policy_learn(self, obs, actions, advantages):
        """ Learn policy:

        1. Sync parameters of policy model to old policy model
        2. Fix old policy model, and learn policy model multi times
        3. if use KLPEN loss, Adjust kl loss coefficient: beta
        """
        self.alg.sync_old_policy()

        all_loss, all_kl = [], []
        for _ in range(self.policy_learn_times):
            loss, kl = self._batch_policy_learn(obs, actions, advantages)
            all_loss.append(loss)
            all_kl.append(kl)

        if self.loss_type == 'KLPEN':
            # Adaptive KL penalty coefficient
            if kl > self.kl_targ * 2:
                self.beta = 1.5 * self.beta
            elif kl < self.kl_targ / 2:
                self.beta = self.beta / 1.5

        return np.mean(all_loss), np.mean(all_kl)

    def value_learn(self, obs, value):
        """ Fit model to current data batch + previous data batch
        """
        data_size = obs.shape[0]

        if self.value_learn_buffer is None:
            obs_train, value_train = obs, value
        else:
            obs_train = np.concatenate([obs, self.value_learn_buffer[0]])
            value_train = np.concatenate([value, self.value_learn_buffer[1]])
        self.value_learn_buffer = (obs, value)

        all_loss = []
        for _ in range(self.value_learn_times):
            random_ids = np.arange(obs_train.shape[0])
            np.random.shuffle(random_ids)
            shuffle_obs_train = obs_train[random_ids]
            shuffle_value_train = value_train[random_ids]
            start = 0
            while start < data_size:
                end = start + self.value_batch_size
                value_loss = self._batch_value_learn(
                    shuffle_obs_train[start:end, :],
                    shuffle_value_train[start:end])
                all_loss.append(value_loss)
                start += self.value_batch_size
        return np.mean(all_loss)
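Besides 'CLIP', the agent also supports the 'KLPEN' loss, where beta is adapted as in policy_learn above. Below is a minimal NumPy sketch of that variant; the names are illustrative and not part of the PARL API, and the per-sample KL values would come from the algorithm (e.g., the closed form for Gaussian policies).

import numpy as np


def ppo_klpen_surrogate(logp, old_logp, adv, kl, beta):
    # KL-penalty variant of the PPO objective (to be maximized):
    # importance-weighted advantage minus beta times the measured KL divergence
    ratio = np.exp(logp - old_logp)
    return np.mean(ratio * adv) - beta * np.mean(kl)


def adapt_beta(beta, kl, kl_targ):
    # same schedule as MujocoAgent.policy_learn: tighten the penalty when the
    # measured KL overshoots the target, relax it when KL stays well below it
    if kl > 2.0 * kl_targ:
        beta *= 1.5
    elif kl < kl_targ / 2.0:
        beta /= 1.5
    return beta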

三、mujoco_model

#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import parl
from parl import layers
from paddle import fluid
from paddle.fluid.param_attr import ParamAttr


class MujocoModel(parl.Model):
    def __init__(self, obs_dim, act_dim, init_logvar=-1.0):
        self.policy_model = PolicyModel(obs_dim, act_dim, init_logvar)
        self.value_model = ValueModel(obs_dim, act_dim)
        self.policy_lr = self.policy_model.lr
        self.value_lr = self.value_model.lr

    def policy(self, obs):
        return self.policy_model.policy(obs)

    def policy_sample(self, obs):
        return self.policy_model.sample(obs)

    def value(self, obs):
        return self.value_model.value(obs)


class PolicyModel(parl.Model):
    def __init__(self, obs_dim, act_dim, init_logvar):
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        hid1_size = obs_dim * 10
        hid3_size = act_dim * 10
        hid2_size = int(np.sqrt(hid1_size * hid3_size))

        self.lr = 9e-4 / np.sqrt(hid2_size)

        self.fc1 = layers.fc(size=hid1_size, act='tanh')
        self.fc2 = layers.fc(size=hid2_size, act='tanh')
        self.fc3 = layers.fc(size=hid3_size, act='tanh')
        self.fc4 = layers.fc(size=act_dim, act='tanh')

        self.logvars = layers.create_parameter(
            shape=[act_dim],
            dtype='float32',
            default_initializer=fluid.initializer.ConstantInitializer(
                init_logvar))

    def policy(self, obs):
        hid1 = self.fc1(obs)
        hid2 = self.fc2(hid1)
        hid3 = self.fc3(hid2)
        means = self.fc4(hid3)
        logvars = self.logvars()
        return means, logvars

    def sample(self, obs):
        means, logvars = self.policy(obs)
        sampled_act = means + (
            layers.exp(logvars / 2.0) *  # stddev
            layers.gaussian_random(shape=(self.act_dim, ), dtype='float32'))
        return sampled_act


class ValueModel(parl.Model):
    def __init__(self, obs_dim, act_dim):
        super(ValueModel, self).__init__()
        hid1_size = obs_dim * 10
        hid3_size = 5
        hid2_size = int(np.sqrt(hid1_size * hid3_size))

        self.lr = 1e-2 / np.sqrt(hid2_size)

        self.fc1 = layers.fc(size=hid1_size, act='tanh')
        self.fc2 = layers.fc(size=hid2_size, act='tanh')
        self.fc3 = layers.fc(size=hid3_size, act='tanh')
        self.fc4 = layers.fc(size=1)

    def value(self, obs):
        hid1 = self.fc1(obs)
        hid2 = self.fc2(hid1)
        hid3 = self.fc3(hid2)
        V = self.fc4(hid3)
        V = layers.squeeze(V, axes=[])
        return V
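A small NumPy analogue of PolicyModel.sample above, to show where exp(logvars / 2.0) comes from: for a diagonal Gaussian policy it is the per-dimension standard deviation, so adding scaled standard-normal noise to the means draws an action from N(means, diag(exp(logvars))). The values below are illustrative only.

import numpy as np

act_dim = 3
means = np.zeros(act_dim, dtype='float32')           # network output for one observation
logvars = np.full(act_dim, -1.0, dtype='float32')    # same constant init as init_logvar=-1.0
noise = np.random.randn(act_dim).astype('float32')   # standard normal noise
sampled_act = means + np.exp(logvars / 2.0) * noise  # sample from N(means, exp(logvars))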

四、scaler

# Third party code
#
# The following code are copied or modified from:
# https://github.com/pat-coady/trpo

import numpy as np
import scipy.signal

__all__ = ['Scaler']


class Scaler(object):
    """ Generate scale and offset based on running mean and stddev along axis=0

        offset = running mean
        scale = 1 / (stddev + 0.1) / 3 (i.e. 3x stddev = +/- 1.0)
    """

    def __init__(self, obs_dim):
        """
        Args:
            obs_dim: dimension of axis=1
        """
        self.vars = np.zeros(obs_dim)
        self.means = np.zeros(obs_dim)
        self.cnt = 0
        self.first_pass = True

    def update(self, x):
        """ Update running mean and variance (this is an exact method)
        Args:
            x: NumPy array, shape = (N, obs_dim)

        see: https://stats.stackexchange.com/questions/43159/how-to-calculate-pooled-
               variance-of-two-groups-given-known-group-variances-mean
        """
        if self.first_pass:
            self.means = np.mean(x, axis=0)
            self.vars = np.var(x, axis=0)
            self.cnt = x.shape[0]
            self.first_pass = False
        else:
            n = x.shape[0]
            new_data_var = np.var(x, axis=0)
            new_data_mean = np.mean(x, axis=0)
            new_data_mean_sq = np.square(new_data_mean)
            new_means = (
                (self.means * self.cnt) + (new_data_mean * n)) / (self.cnt + n)
            self.vars = (((self.cnt * (self.vars + np.square(self.means))) +
                          (n * (new_data_var + new_data_mean_sq))) /
                         (self.cnt + n) - np.square(new_means))
            self.vars = np.maximum(
                0.0, self.vars)  # occasionally goes negative, clip
            self.means = new_means
            self.cnt += n

    def get(self):
        """ returns 2-tuple: (scale, offset) """
        return 1 / (np.sqrt(self.vars) + 0.1) / 3, self.means
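A quick sanity check of the pooled-statistics update, assuming the code above is saved as scaler.py: feeding the data in two chunks should reproduce the exact mean and (population) variance of the concatenated data.

import numpy as np
from scaler import Scaler  # assumes the code above is saved as scaler.py

a = np.random.randn(100, 4)
b = np.random.randn(50, 4) + 1.0

s = Scaler(obs_dim=4)
s.update(a)
s.update(b)

all_data = np.concatenate([a, b])
assert np.allclose(s.means, all_data.mean(axis=0))
assert np.allclose(s.vars, all_data.var(axis=0))

scale, offset = s.get()  # scale = 1 / (3 * (stddev + 0.1)), offset = running mean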