Reproducing the PPO Reinforcement Learning Algorithm with Baidu PaddlePaddle and PARL
- PG (vanilla policy gradient)
    - Low sample efficiency: sampling and optimization use the same policy, so each batch of data can only be used on-policy
    - Training is unstable and prone to collapse (likely because the samples are highly correlated). Errors get amplified: if a bad batch happens to be sampled, the subsequent environment interaction only produces more bad data, making it hard to recover from the mistake, and training collapses
- How can training be made more stable?
    - Introduce a trust-region mechanism so that the computed gradient always stays within a safe range (gentler updates?)
    - Or use the natural policy gradient, a second-order optimization method (SGD relies on a first-order approximation, so the resulting policy update is inaccurate)
- How can it be made off-policy (data collection and optimization use different policies)?
    - Use importance sampling (e.g., TRPO)
- A further improvement on TRPO: ACKTR
    - Improves TRPO's computational efficiency by replacing the inversion of the Fisher information matrix with Kronecker-factored approximate curvature (K-FAC), which greatly speeds up training
- A simplification of TRPO: Proximal Policy Optimization (PPO)
    - Combines the natural gradient and the TRPO loss function in an unconstrained form
    - Performs about as well as TRPO, but is much faster because it is optimized with first-order SGD
    - PPO with clipping
        - Clips the probability ratio to limit how far the policy output can move in a single update, which stabilizes training (see the sketch after this list)
        - Most PPO implementations use this form
        - Only a few lines of code are needed to turn a vanilla PG implementation into PPO, so it is easy to implement
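To make the clipping idea concrete, here is a minimal NumPy sketch of the PPO-CLIP surrogate objective. It only illustrates the formula; in this project the actual loss is built inside parl.algorithms.PPO, and the function name and arguments below are made up for the example. epsilon=0.2 matches the default used by MujocoAgent later in this post.

import numpy as np

def ppo_clip_surrogate(logprob_new, logprob_old, advantages, epsilon=0.2):
    # importance sampling ratio: pi_new(a|s) / pi_old(a|s)
    ratio = np.exp(logprob_new - logprob_old)
    # keep the ratio inside [1 - epsilon, 1 + epsilon] so a single update
    # cannot move the policy too far away from the old policy
    clipped_ratio = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
    # take the pessimistic (minimum) of the two bounds; maximize this value
    # (or minimize its negative with SGD)
    surrogate = np.minimum(ratio * advantages, clipped_ratio * advantages)
    return surrogate.mean()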
1. Main program
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import gym
import numpy as np
import parl
from mujoco_agent import MujocoAgent
from mujoco_model import MujocoModel
from parl.utils import logger, action_mapping
from parl.utils.rl_utils import calc_gae, calc_discount_sum_rewards
from scaler import Scaler
def run_train_episode(env, agent, scaler):
obs = env.reset()
observes, actions, rewards, unscaled_obs = [], [], [], []
step = 0.0
scale, offset = scaler.get()
scale[-1] = 1.0 # don't scale time step feature
offset[-1] = 0.0 # don't offset time step feature
while True:
obs = obs.reshape((1, -1))
obs = np.append(obs, [[step]], axis=1) # add time step feature
unscaled_obs.append(obs)
obs = (obs - offset) * scale # center and scale observations
obs = obs.astype('float32')
observes.append(obs)
action = agent.policy_sample(obs)
action = np.clip(action, -1.0, 1.0)
action = action_mapping(action, env.action_space.low[0],
env.action_space.high[0])
action = action.reshape((1, -1)).astype('float32')
actions.append(action)
obs, reward, done, _ = env.step(np.squeeze(action))
rewards.append(reward)
step += 1e-3 # increment time step feature
if done:
break
return (np.concatenate(observes), np.concatenate(actions),
np.array(rewards, dtype='float32'), np.concatenate(unscaled_obs))
def run_evaluate_episode(env, agent, scaler):
obs = env.reset()
rewards = []
step = 0.0
scale, offset = scaler.get()
scale[-1] = 1.0 # don't scale time step feature
offset[-1] = 0.0 # don't offset time step feature
while True:
obs = obs.reshape((1, -1))
obs = np.append(obs, [[step]], axis=1) # add time step feature
obs = (obs - offset) * scale # center and scale observations
obs = obs.astype('float32')
action = agent.policy_predict(obs)
action = action_mapping(action, env.action_space.low[0],
env.action_space.high[0])
obs, reward, done, _ = env.step(np.squeeze(action))
rewards.append(reward)
step += 1e-3 # increment time step feature
if done:
break
return np.sum(rewards)
def collect_trajectories(env, agent, scaler, episodes):
trajectories, all_unscaled_obs = [], []
for e in range(episodes):
obs, actions, rewards, unscaled_obs = run_train_episode(
env, agent, scaler)
trajectories.append({
'obs': obs,
'actions': actions,
'rewards': rewards,
})
all_unscaled_obs.append(unscaled_obs)
# update running statistics for scaling observations
scaler.update(np.concatenate(all_unscaled_obs))
return trajectories
def build_train_data(trajectories, agent):
train_obs, train_actions, train_advantages, train_discount_sum_rewards = [], [], [], []
for trajectory in trajectories:
pred_values = agent.value_predict(trajectory['obs'])
# scale rewards
scale_rewards = trajectory['rewards'] * (1 - args.gamma)
discount_sum_rewards = calc_discount_sum_rewards(
scale_rewards, args.gamma).astype('float32')
advantages = calc_gae(scale_rewards, pred_values, 0, args.gamma,
args.lam)
# normalize advantages
advantages = (advantages - advantages.mean()) / (
advantages.std() + 1e-6)
advantages = advantages.astype('float32')
train_obs.append(trajectory['obs'])
train_actions.append(trajectory['actions'])
train_advantages.append(advantages)
train_discount_sum_rewards.append(discount_sum_rewards)
train_obs = np.concatenate(train_obs)
train_actions = np.concatenate(train_actions)
train_advantages = np.concatenate(train_advantages)
train_discount_sum_rewards = np.concatenate(train_discount_sum_rewards)
return train_obs, train_actions, train_advantages, train_discount_sum_rewards
def main():
env = gym.make(args.env)
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
obs_dim += 1 # add 1 to obs dim for time step feature
scaler = Scaler(obs_dim)
model = MujocoModel(obs_dim, act_dim)
alg = parl.algorithms.PPO(
model,
act_dim=act_dim,
policy_lr=model.policy_lr,
value_lr=model.value_lr)
agent = MujocoAgent(
alg, obs_dim, act_dim, args.kl_targ, loss_type=args.loss_type)
# run a few episodes to initialize scaler
collect_trajectories(env, agent, scaler, episodes=5)
test_flag = 0
total_steps = 0
while total_steps < args.train_total_steps:
trajectories = collect_trajectories(
env, agent, scaler, episodes=args.episodes_per_batch)
total_steps += sum([t['obs'].shape[0] for t in trajectories])
total_train_rewards = sum([np.sum(t['rewards']) for t in trajectories])
train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
trajectories, agent)
policy_loss, kl = agent.policy_learn(train_obs, train_actions,
train_advantages)
value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)
logger.info(
'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
.format(total_steps, total_train_rewards / args.episodes_per_batch,
policy_loss, kl, value_loss))
if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag:
test_flag += 1
eval_reward = run_evaluate_episode(env, agent, scaler)
logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, eval_reward))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'--env',
type=str,
help='Mujoco environment name',
default='HalfCheetah-v2')
parser.add_argument(
'--gamma', type=float, help='Discount factor', default=0.995)
parser.add_argument(
'--lam',
type=float,
help='Lambda for Generalized Advantage Estimation',
default=0.98)
parser.add_argument(
'--kl_targ', type=float, help='D_KL target value', default=0.003)
parser.add_argument(
'--episodes_per_batch',
type=int,
help='Number of episodes per training batch',
default=5)
parser.add_argument(
'--loss_type',
type=str,
help="Choose loss type of PPO algorithm, 'CLIP' or 'KLPEN'",
default='CLIP')
parser.add_argument(
'--train_total_steps',
type=int,
default=int(1e7),
help='maximum training steps')
parser.add_argument(
'--test_every_steps',
type=int,
default=int(1e4),
help='the step interval between two consecutive evaluations')
args = parser.parse_args()
main()
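For reference, the script leans on calc_gae and calc_discount_sum_rewards from parl.utils.rl_utils. The NumPy sketch below shows the standard formulas these helpers correspond to; it is illustrative only and not PARL's exact implementation.

import numpy as np

def discount_sum(x, discount):
    # y_t = x_t + discount * y_{t+1}, computed backwards over the trajectory
    y = np.zeros_like(x, dtype='float32')
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y

def gae(rewards, values, last_value, gamma, lam):
    # Generalized Advantage Estimation:
    #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    #   A_t     = sum_l (gamma * lam)^l * delta_{t+l}
    values = np.append(values, last_value)
    deltas = rewards + gamma * values[1:] - values[:-1]
    return discount_sum(deltas, gamma * lam)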
2. mujoco_agent
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import parl
from parl import layers
from paddle import fluid
from parl.utils import logger
class MujocoAgent(parl.Agent):
def __init__(self,
algorithm,
obs_dim,
act_dim,
kl_targ,
loss_type,
beta=1.0,
epsilon=0.2,
policy_learn_times=20,
value_learn_times=10,
value_batch_size=256):
self.alg = algorithm
self.obs_dim = obs_dim
self.act_dim = act_dim
assert loss_type == 'CLIP' or loss_type == 'KLPEN'
self.loss_type = loss_type
super(MujocoAgent, self).__init__(algorithm)
self.policy_learn_times = policy_learn_times
# Adaptive kl penalty coefficient
self.beta = beta
self.kl_targ = kl_targ
self.value_learn_times = value_learn_times
self.value_batch_size = value_batch_size
self.value_learn_buffer = None
def build_program(self):
self.policy_predict_program = fluid.Program()
self.policy_sample_program = fluid.Program()
self.policy_learn_program = fluid.Program()
self.value_predict_program = fluid.Program()
self.value_learn_program = fluid.Program()
with fluid.program_guard(self.policy_sample_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
sampled_act = self.alg.sample(obs)
self.policy_sample_output = [sampled_act]
with fluid.program_guard(self.policy_predict_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
means = self.alg.predict(obs)
self.policy_predict_output = [means]
with fluid.program_guard(self.policy_learn_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
actions = layers.data(
name='actions', shape=[self.act_dim], dtype='float32')
advantages = layers.data(
name='advantages', shape=[1], dtype='float32')
if self.loss_type == 'KLPEN':
beta = layers.data(name='beta', shape=[], dtype='float32')
loss, kl = self.alg.policy_learn(obs, actions, advantages,
beta)
else:
loss, kl = self.alg.policy_learn(obs, actions, advantages)
self.policy_learn_output = [loss, kl]
with fluid.program_guard(self.value_predict_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
value = self.alg.value_predict(obs)
self.value_predict_output = [value]
with fluid.program_guard(self.value_learn_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
val = layers.data(name='val', shape=[], dtype='float32')
value_loss = self.alg.value_learn(obs, val)
self.value_learn_output = [value_loss]
def policy_sample(self, obs):
feed = {'obs': obs}
sampled_act = self.fluid_executor.run(
self.policy_sample_program,
feed=feed,
fetch_list=self.policy_sample_output)[0]
return sampled_act
def policy_predict(self, obs):
feed = {'obs': obs}
means = self.fluid_executor.run(
self.policy_predict_program,
feed=feed,
fetch_list=self.policy_predict_output)[0]
return means
def value_predict(self, obs):
feed = {'obs': obs}
value = self.fluid_executor.run(
self.value_predict_program,
feed=feed,
fetch_list=self.value_predict_output)[0]
return value
def _batch_policy_learn(self, obs, actions, advantages):
if self.loss_type == 'KLPEN':
feed = {
'obs': obs,
'actions': actions,
'advantages': advantages,
'beta': self.beta
}
else:
feed = {'obs': obs, 'actions': actions, 'advantages': advantages}
[loss, kl] = self.fluid_executor.run(
self.policy_learn_program,
feed=feed,
fetch_list=self.policy_learn_output)
return loss, kl
def _batch_value_learn(self, obs, val):
feed = {'obs': obs, 'val': val}
value_loss = self.fluid_executor.run(
self.value_learn_program,
feed=feed,
fetch_list=self.value_learn_output)[0]
return value_loss
def policy_learn(self, obs, actions, advantages):
""" Learn policy:
1. Sync parameters of policy model to old policy model
2. Fix old policy model, and learn policy model multi times
3. if use KLPEN loss, Adjust kl loss coefficient: beta
"""
self.alg.sync_old_policy()
all_loss, all_kl = [], []
for _ in range(self.policy_learn_times):
loss, kl = self._batch_policy_learn(obs, actions, advantages)
all_loss.append(loss)
all_kl.append(kl)
if self.loss_type == 'KLPEN':
            # Adaptive KL penalty coefficient
if kl > self.kl_targ * 2:
self.beta = 1.5 * self.beta
elif kl < self.kl_targ / 2:
self.beta = self.beta / 1.5
return np.mean(all_loss), np.mean(all_kl)
def value_learn(self, obs, value):
""" Fit model to current data batch + previous data batch
"""
data_size = obs.shape[0]
if self.value_learn_buffer is None:
obs_train, value_train = obs, value
else:
obs_train = np.concatenate([obs, self.value_learn_buffer[0]])
value_train = np.concatenate([value, self.value_learn_buffer[1]])
self.value_learn_buffer = (obs, value)
all_loss = []
for _ in range(self.value_learn_times):
random_ids = np.arange(obs_train.shape[0])
np.random.shuffle(random_ids)
shuffle_obs_train = obs_train[random_ids]
shuffle_value_train = value_train[random_ids]
start = 0
while start < data_size:
end = start + self.value_batch_size
value_loss = self._batch_value_learn(
shuffle_obs_train[start:end, :],
shuffle_value_train[start:end])
all_loss.append(value_loss)
start += self.value_batch_size
return np.mean(all_loss)
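When the agent is created with loss_type='KLPEN', the surrogate objective replaces clipping with a KL penalty whose coefficient beta is adapted in policy_learn above. Below is a small NumPy sketch of that variant together with the same adaptation rule; the real objective is defined inside parl.algorithms.PPO, and the function names here are illustrative.

import numpy as np

def ppo_klpen_surrogate(logprob_new, logprob_old, advantages, kl, beta):
    # maximize E[ratio * advantage] minus a penalty beta * KL(old policy, new policy)
    ratio = np.exp(logprob_new - logprob_old)
    return (ratio * advantages).mean() - beta * kl

def adapt_beta(beta, kl, kl_targ):
    # same rule as MujocoAgent.policy_learn: enlarge beta if the policy moved
    # too far (KL too large), shrink it if the update was too conservative
    if kl > kl_targ * 2:
        beta = 1.5 * beta
    elif kl < kl_targ / 2:
        beta = beta / 1.5
    return beta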
3. mujoco_model
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import parl
from parl import layers
from paddle import fluid
from paddle.fluid.param_attr import ParamAttr
class MujocoModel(parl.Model):
def __init__(self, obs_dim, act_dim, init_logvar=-1.0):
self.policy_model = PolicyModel(obs_dim, act_dim, init_logvar)
self.value_model = ValueModel(obs_dim, act_dim)
self.policy_lr = self.policy_model.lr
self.value_lr = self.value_model.lr
def policy(self, obs):
return self.policy_model.policy(obs)
def policy_sample(self, obs):
return self.policy_model.sample(obs)
def value(self, obs):
return self.value_model.value(obs)
class PolicyModel(parl.Model):
def __init__(self, obs_dim, act_dim, init_logvar):
self.obs_dim = obs_dim
self.act_dim = act_dim
hid1_size = obs_dim * 10
hid3_size = act_dim * 10
hid2_size = int(np.sqrt(hid1_size * hid3_size))
self.lr = 9e-4 / np.sqrt(hid2_size)
self.fc1 = layers.fc(size=hid1_size, act='tanh')
self.fc2 = layers.fc(size=hid2_size, act='tanh')
self.fc3 = layers.fc(size=hid3_size, act='tanh')
self.fc4 = layers.fc(size=act_dim, act='tanh')
self.logvars = layers.create_parameter(
shape=[act_dim],
dtype='float32',
default_initializer=fluid.initializer.ConstantInitializer(
init_logvar))
def policy(self, obs):
hid1 = self.fc1(obs)
hid2 = self.fc2(hid1)
hid3 = self.fc3(hid2)
means = self.fc4(hid3)
logvars = self.logvars()
return means, logvars
def sample(self, obs):
means, logvars = self.policy(obs)
sampled_act = means + (
layers.exp(logvars / 2.0) * # stddev
layers.gaussian_random(shape=(self.act_dim, ), dtype='float32'))
return sampled_act
class ValueModel(parl.Model):
def __init__(self, obs_dim, act_dim):
super(ValueModel, self).__init__()
hid1_size = obs_dim * 10
hid3_size = 5
hid2_size = int(np.sqrt(hid1_size * hid3_size))
self.lr = 1e-2 / np.sqrt(hid2_size)
self.fc1 = layers.fc(size=hid1_size, act='tanh')
self.fc2 = layers.fc(size=hid2_size, act='tanh')
self.fc3 = layers.fc(size=hid3_size, act='tanh')
self.fc4 = layers.fc(size=1)
def value(self, obs):
hid1 = self.fc1(obs)
hid2 = self.fc2(hid1)
hid3 = self.fc3(hid2)
V = self.fc4(hid3)
V = layers.squeeze(V, axes=[])
return V
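PolicyModel defines a diagonal Gaussian policy: the network outputs the action means, while logvars is a learned, state-independent log-variance vector, and sample() draws actions as mean plus stddev times Gaussian noise. The NumPy sketch below restates that sampling rule and the corresponding log-probability, which is what the PPO ratio is built from (the ratio itself is computed inside parl.algorithms.PPO, not in this file; the helper names are made up for the example).

import numpy as np

def sample_action(means, logvars, rng=np.random):
    # same rule as PolicyModel.sample: mean + exp(logvar / 2) * standard normal noise
    return means + np.exp(logvars / 2.0) * rng.standard_normal(np.shape(means))

def gaussian_logprob(actions, means, logvars):
    # log pi(a|s) of a diagonal Gaussian, summed over action dimensions
    return -0.5 * np.sum(
        logvars + np.log(2.0 * np.pi) + (actions - means) ** 2 / np.exp(logvars),
        axis=-1)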
4. scaler
# Third party code
#
# The following code is copied or modified from:
# https://github.com/pat-coady/trpo
import numpy as np
import scipy.signal
__all__ = ['Scaler']
class Scaler(object):
""" Generate scale and offset based on running mean and stddev along axis=0
offset = running mean
scale = 1 / (stddev + 0.1) / 3 (i.e. 3x stddev = +/- 1.0)
"""
def __init__(self, obs_dim):
"""
Args:
obs_dim: dimension of axis=1
"""
self.vars = np.zeros(obs_dim)
self.means = np.zeros(obs_dim)
self.cnt = 0
self.first_pass = True
def update(self, x):
""" Update running mean and variance (this is an exact method)
Args:
x: NumPy array, shape = (N, obs_dim)
        see: https://stats.stackexchange.com/questions/43159/how-to-calculate-pooled-
variance-of-two-groups-given-known-group-variances-mean
"""
if self.first_pass:
self.means = np.mean(x, axis=0)
self.vars = np.var(x, axis=0)
self.cnt = x.shape[0]
self.first_pass = False
else:
n = x.shape[0]
new_data_var = np.var(x, axis=0)
new_data_mean = np.mean(x, axis=0)
new_data_mean_sq = np.square(new_data_mean)
new_means = (
(self.means * self.cnt) + (new_data_mean * n)) / (self.cnt + n)
self.vars = (((self.cnt * (self.vars + np.square(self.means))) +
(n * (new_data_var + new_data_mean_sq))) /
(self.cnt + n) - np.square(new_means))
self.vars = np.maximum(
0.0, self.vars) # occasionally goes negative, clip
self.means = new_means
self.cnt += n
def get(self):
""" returns 2-tuple: (scale, offset) """
return 1 / (np.sqrt(self.vars) + 0.1) / 3, self.means
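Finally, a short usage sketch of the Scaler with made-up data: update() folds a new batch into the running mean and variance using the pooled-statistics formula above, and get() returns (scale, offset) so that the training loop can normalize observations as (obs - offset) * scale, mapping roughly three standard deviations to a range of about +/-1 (the import assumes the class above is saved as scaler.py, which is how the main program imports it).

import numpy as np
from scaler import Scaler

scaler = Scaler(obs_dim=4)
fake_obs = np.random.randn(1000, 4) * 5.0 + 2.0   # made-up observation batch
scaler.update(fake_obs)

scale, offset = scaler.get()
normalized = (fake_obs - offset) * scale          # same transform as run_train_episode
print(normalized.mean(axis=0))                    # close to 0
print(normalized.std(axis=0))                     # close to 1/3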