import gym
from gym import wrappers
import time
import logz
import os.path as osp
import random
import numpy as np
import torch
from torch import nn

import dqn
from dqn_utils import PiecewiseSchedule, get_wrapper_by_name
from atari_wrappers import wrap_deepmind_ram


def weights_init(m):
    # Xavier-initialize weights and zero biases for any module that has them
    if hasattr(m, 'weight'):
        nn.init.xavier_uniform_(m.weight)
    if hasattr(m, 'bias') and m.bias is not None:
        nn.init.constant_(m.bias, 0)


class DQN(nn.Module):
    # MLP Q-network for Atari RAM observations
    def __init__(self, in_features, num_actions):
        super(DQN, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(in_features, out_features=256),
            nn.ReLU(True),
            nn.Linear(in_features=256, out_features=128),
            nn.ReLU(True),
            nn.Linear(in_features=128, out_features=64),
            nn.ReLU(True),
            nn.Linear(in_features=64, out_features=num_actions),
        )
        self.apply(weights_init)

    def forward(self, obs):
        out = obs.float() / 255  # convert 8-bit RAM state to floats in [0, 1]
        out = self.classifier(out)
        return out


def atari_learn(env, num_timesteps):
    # Rough estimate of the number of gradient updates
    # (one update every learning_freq=4 environment steps)
    num_iterations = float(num_timesteps) / 4.0

    # Learning rate schedule: hold at 1e-4, then anneal to 5e-5
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule(
        [
            (0, 1e-4 * lr_multiplier),
            (num_iterations / 10, 1e-4 * lr_multiplier),
            (num_iterations / 2, 5e-5 * lr_multiplier),
        ],
        outside_value=5e-5 * lr_multiplier
    )
    lr_lambda = lambda t: lr_schedule.value(t)

    optimizer = dqn.OptimizerSpec(
        constructor=torch.optim.Adam,
        kwargs=dict(eps=1e-4),
        lr_lambda=lr_lambda
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    # Epsilon-greedy exploration: anneal epsilon from 0.2 down to 0.01
    exploration_schedule = PiecewiseSchedule(
        [
            (0, 0.2),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ],
        outside_value=0.01
    )

    dqn.learn(
        env,
        q_func=DQN,
        optimizer_spec=optimizer,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=1,  # RAM observations need no frame stacking
        target_update_freq=10000,
        grad_norm_clipping=10
    )
    env.close()


def set_global_seeds(i):
    torch.manual_seed(i)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(i)
    np.random.seed(i)
    random.seed(i)


def get_env(env_name, exp_name, seed):
    env = gym.make(env_name)

    set_global_seeds(seed)
    env.seed(seed)

    # Set up logger
    logdir = 'dqn_' + exp_name + '_' + env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = osp.join('data', logdir)
    logdir = osp.join(logdir, '%d' % seed)
    logz.configure_output_dir(logdir)
    hyperparams = {'exp_name': exp_name, 'env_name': env_name}
    logz.save_hyperparams(hyperparams)

    expt_dir = '/tmp/hw3_vid_dir/'
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
    env = wrap_deepmind_ram(env)

    return env


def main():
    # Choose an Atari game.
    env_name = 'Pong-ram-v0'
    exp_name = 'Pong_double_dqn'  # you can use this to mark different experiments

    # Run training
    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    print('random seed = %d' % seed)
    env = get_env(env_name, exp_name, seed)
    atari_learn(env, num_timesteps=int(4e7))


if __name__ == "__main__":
    main()