python source code of model

Deep-Reinforcement-Learning-Hands-On-master
- Chapter08
  - lib
    - data.py
    - environ.py
    - validation.py
    - models.py
    - common.py
    - __init__.py
  - train_model_conv.py
  - run_model.py
  - data
    - unpack_data.sh
  - train_model.py
  - tests
    - test_environ.py
    - test_data.py
  - .gitignore
- Chapter13
  - wob_click_mm_play.py
  - environment.yml
  - ksy
    - rfp_client.ksy
    - fbs.ksy
    - rfp_server.ksy
  - adhoc
    - wob_test.py
    - wob_create.py
    - wd_tests.py
    - demo_dump.py
    - fbs_dump.py
    - start_docker_demo.sh
    - fbs_read.py
    - fbs_join.py
    - start_docker.sh
    - wob_clicks.py
  - wob_fixes
    - 01_wob_crash-fix.patch
    - readme.md
    - autopatch.sh
    - 02_reward_proxy_append_rewards.patch
  - wob_click_train.py
  - lib
    - ksy
      - rfp_server.py
      - fbs.py
      - rfp_client.py
      - __init__.py
    - model_vnc.py
    - common.py
    - __init__.py
    - wob_vnc.py
    - vnc_demo.py
  - wob_click_play.py
  - wob_click_mm_train.py
  - demos
  - .gitignore
- Chapter18
  - semi-final.sh
  - telegram-bot.py
  - play.py
  - lib
    - game.py
    - mcts.py
    - model.py
    - __init__.py
  - train.py
  - tournament
    - charts.ipynb
    - .ipynb_checkpoints
      - charts-checkpoint.ipynb
  - tests
    - test_model.py
    - __init__.py
    - test_game.py
  - .gitignore
- Chapter16
  - 02_cheetah_es.py
  - 03_cartpole_ga.py
  - 05_cheetah_ga_batch.py
  - 01_cartpole_es.py
  - .gitignore
  - not_converging
    - 02_breakout_es.py
    - 02_hyper.py
  - 04_cheetah_ga.py
- Chapter17
  - 02_imag.py
  - 01_a2c.py
  - play.py
  - lib
    - i2a.py
    - common.py
    - __init__.py
  - 03_i2a.py
  - .gitignore
- LICENSE
- Chapter12
  - use_model.py
  - train_scst.py
  - data_test.py
  - cor_reader.py
  - libbots
    - data.py
    - model.py
    - __init__.py
    - utils.py
    - cornell.py
  - telergam_bot.py
  - train_crossent.py
  - data
    - get_data.sh
    - .gitignore
  - tests
    - test_subtitles.py
    - test_data.py
  - .gitignore
- formulas
  - ch16
    - ch16-018.eps
    - ch16-007.eps
    - ch16-019.eps
    - ch16-011.eps
    - ch16-001.eps
    - ch16-010.eps
    - ch16-020.eps
    - ch16-012.eps
    - ch16-006.eps
    - ch16-005.eps
    - ch16-021.eps
    - ch16-015.eps
    - ch16-013.eps
    - ch16-022.eps
    - ch16-003.eps
    - ch16-008.eps
    - ch16-009.eps
    - ch16-017.eps
    - ch16-002.eps
    - ch16-004.eps
    - ch16-014.eps
    - ch16-016.eps
  - ch12.tex
  - ch01.tex
  - ch16.tex
  - ch07
    - ch07-005.eps
    - ch07-009.eps
    - ch07-008.eps
    - ch07-011.eps
    - ch07-010.eps
    - ch07-007.eps
    - ch07-004.eps
    - ch07-006.eps
    - ch07-001.eps
    - ch07-013.eps
    - ch07-002.eps
    - ch07-003.eps
    - ch07-012.eps
  - _template.tex
  - ch06.tex
  - clean.sh
  - ch10
    - ch10-013.eps
    - ch10-010.eps
    - ch10-014.eps
    - ch10-004.eps
    - ch10-002.eps
    - ch10-008.eps
    - ch10-009.eps
    - ch10-005.eps
    - ch10-007.eps
    - ch10-011.eps
    - ch10-015.eps
    - ch10-006.eps
    - ch10-001.eps
    - ch10-003.eps
    - ch10-012.eps
  - ch09.tex
  - ch05
    - ch05-024.eps
    - ch05-036.eps
    - ch05-038.eps
    - ch05-056.eps
    - ch05-043.eps
    - ch05-012.eps
    - ch05-050.eps
    - ch05-026.eps
    - ch05-002.eps
    - ch05-018.eps
    - ch05-029.eps
    - ch05-061.eps
    - ch05-010.eps
    - ch05-062.eps
    - ch05-037.eps
    - ch05-019.eps
    - ch05-032.eps
    - ch05-028.eps
    - ch05-004.eps
    - ch05-066.eps
    - ch05-046.eps
    - ch05-049.eps
    - ch05-063.eps
    - ch05-015.eps
    - ch05-057.eps
    - ch05-033.eps
    - ch05-005.eps
    - ch05-059.eps
    - ch05-014.eps
    - ch05-044.eps
    - ch05-054.eps
    - ch05-023.eps
    - ch05-065.eps
    - ch05-025.eps
    - ch05-021.eps
    - ch05-045.eps
    - ch05-055.eps
    - ch05-022.eps
    - ch05-020.eps
    - ch05-048.eps
    - ch05-051.eps
    - ch05-017.eps
    - ch05-040.eps
    - ch05-041.eps
    - ch05-027.eps
    - ch05-067.eps
    - ch05-031.eps
    - ch05-053.eps
    - ch05-001.eps
    - ch05-011.eps
    - ch05-035.eps
    - ch05-060.eps
    - ch05-039.eps
    - ch05-042.eps
    - ch05-058.eps
    - ch05-034.eps
    - ch05-052.eps
    - ch05-013.eps
    - ch05-006.eps
    - ch05-016.eps
    - ch05-047.eps
    - ch05-008.eps
    - ch05-030.eps
    - ch05-003.eps
    - ch05-007.eps
    - ch05-068.eps
    - ch05-064.eps
    - ch05-009.eps
  - ch05.tex
  - ch07.tex
  - ch18.tex
  - ch10.tex
  - ch09
    - ch09-005.eps
    - ch09-004.eps
    - ch09-002.eps
    - ch09-010.eps
    - ch09-009.eps
    - ch09-007.eps
    - ch09-001.eps
    - ch09-006.eps
    - ch09-003.eps
    - ch09-008.eps
  - ch14.tex
  - ch15.tex
  - ch06
    - ch06-001.eps
    - ch06-008.eps
    - ch06-009.eps
    - ch06-015.eps
    - ch06-002.eps
    - ch06-006.eps
    - ch06-014.eps
    - ch06-005.eps
    - ch06-011.eps
    - ch06-012.eps
    - ch06-013.eps
    - ch06-007.eps
    - ch06-010.eps
    - ch06-003.eps
    - ch06-004.eps
  - ch14
    - ch14-010.eps
    - ch14-022.eps
    - ch14-015.eps
    - ch14-018.eps
    - ch14-019.eps
    - ch14-008.eps
    - ch14-020.eps
    - ch14-009.eps
    - ch14-023.eps
    - ch14-003.eps
    - ch14-021.eps
    - ch14-014.eps
    - ch14-017.eps
    - ch14-005.eps
    - ch14-001.eps
    - ch14-011.eps
    - ch14-024.eps
    - ch14-013.eps
    - ch14-006.eps
    - ch14-016.eps
    - ch14-002.eps
    - ch14-004.eps
    - ch14-012.eps
    - ch14-007.eps
  - ch15
    - ch15-001.eps
    - ch15-010.eps
    - ch15-012.eps
    - ch15-009.eps
    - ch15-004.eps
    - ch15-018.eps
    - ch15-017.eps
    - ch15-005.eps
    - ch15-011.eps
    - ch15-014.eps
    - ch15-020.eps
    - ch15-002.eps
    - ch15-015.eps
    - ch15-013.eps
    - ch15-016.eps
    - ch15-006.eps
    - ch15-008.eps
    - ch15-007.eps
    - ch15-003.eps
    - ch15-019.eps
  - ch12
    - ch12-004.eps
    - ch12-003.eps
    - ch12-001.eps
    - ch12-005.eps
    - ch12-002.eps
  - ch04
    - ch04-07.eps
    - ch04-05.eps
    - ch04-03.eps
    - ch04-06.eps
    - ch04-02.eps
    - ch04-04.eps
    - ch04-01.eps
  - ch18
    - ch18-006.eps
    - ch18-009.eps
    - ch18-005.eps
    - ch18-003.eps
    - ch18-008.eps
    - ch18-002.eps
    - ch18-004.eps
    - ch18-007.eps
    - ch18-001.eps
  - ch01
    - ch01-01.eps
    - ch01-02.eps
    - ch01-03.eps
  - .gitignore
  - make.sh
  - ch04.tex
- Chapter10
  - 01_cartpole_pg.py
  - log.md
  - tmp
    - 00_pong_pg.py
    - 00_pong_pg-loc-baseline.py
  - 03_pong_a2c_rollouts.py
  - lib
    - common.py
    - __init__.py
  - 04_pong_r2.py
  - .gitignore
  - 02_pong_a2c.py
- Chapter15
  - 05_train_acktr.py
  - 04_train_ppo.py
  - 03_train_trpo.py
  - lib
    - trpo.py
    - model.py
    - common.py
    - __init__.py
    - kfac.py
  - 01_train_a2c.py
  - 02_play.py
  - .gitignore
- Chapter14
  - 05_play_ddpg.py
  - adhoc
    - record_a2c.sh
    - record_ddpg.sh
  - lib
    - model.py
    - common.py
    - __init__.py
  - 01_check_env.py
  - 02_train_a2c.py
  - 03_play_a2c.py
  - 06_train_d4pg.py
  - 04_train_ddpg.py
  - .gitignore
- download-roboschool.sh
- README.md
- install-roboschool.sh
- Chapter02
  - 01_agent_anatomy.py
  - 02_cartpole_random.py
  - 03_random_actionwrapper.py
  - 04_cartpole_random_monitor.py
- Chapter05
  - 01_frozenlake_v_iteration.py
  - 02_frozenlake_q_iteration.py
- Chapter03
  - 01_modules.py
  - 03_atari_gan.py
  - 02_tensorboard.py
  - .gitignore
- Chapter07
  - res
    - log-05_dqn_prio_replay.py.txt
    - Nov11_15-26-36_gpu-pong-double=True
    - log-06_dqn_dueling.py.txt
    - log-08_dqn_rainbow.py.txt
    - Nov11_09-55-16_gpu-pong-double=False
    - log-07_dqn_distrib.py.txt
    - Nov11_21-03-13_gpu-pong-noisy-net
      - events.out.tfevents.1510430593.gpu
    - Nov11_07-49-02_gpu-pong-basic
      - events.out.tfevents.1510382942.gpu
    - log-03_dqn_double.py.txt
    - Nov11_09-16-29_gpu-pong-2-step
      - events.out.tfevents.1510388189.gpu
    - Nov11_17-20-40_gpu-pong-prio-replay
      - events.out.tfevents.1510417240.gpu
    - Nov11_16-34-18_gpu-pong-distrib
    - log-04_dqn_noisy_net.py.txt
    - Nov11_15-35-13_gpu-pong-dueling
      - events.out.tfevents.1510410913.gpu
    - log-01_dqn_basic.py.txt
    - log-02_dqn_n_steps.py.txt
    - Nov11_22-50-56_gpu-pong-rainbow
      - events.out.tfevents.1510437056.gpu
    - .gitignore
  - 01_dqn_basic.py
  - adhoc
    - distr_test.py
    - commute.py
  - 06_dqn_dueling.py
  - 03_dqn_double.py
  - 08_dqn_rainbow.py
  - lib
    - dqn_model.py
    - common.py
    - __init__.py
  - 05_dqn_prio_replay.py
  - 07_dqn_distrib.py
  - 04_dqn_noisy_net.py
  - bench
    - simple_buffer_bench.py
    - prio_buffer_bench.py
  - 02_dqn_n_steps.py
- requirements.txt
- Chapter11
  - adhoc
    - sync_bench.py
    - distr_grad.py
    - distr_grad2.py
  - 01_a3c_data.py
  - lib
    - common.py
    - __init__.py
  - 02_a3c_grad.py
  - .gitignore
- .gitignore
- Chapter04
  - 04_frozenlake_nonslippery.py
  - 03_frozenlake_tweaked.py
  - 01_cartpole.py
  - 02_frozenlake_naive.py
- Chapter06
  - 01_frozenlake_q_learning.py
  - lib
    - wrappers.py
    - dqn_model.py
    - __init__.py
  - 02_dqn_pong.py
  - 03_dqn_play.py
- Chapter09
  - res
    - Dec06_16-05-54_gpu-pong-pg-bad
    - Dec03_13-48-07_gpu-cartpole-reinforce
      - events.out.tfevents.1512305287.gpu
    - Dec01_12-13-46_gpu-cartpole-dqn
      - events.out.tfevents.1512126826.gpu
    - Dec03_12-16-34_gpu-cartpole-reinforce-baseline
      - events.out.tfevents.1512299794.gpu
  - 05_pong_pg.py
  - lib
    - common.py
    - __init__.py
  - 03_cartpole_reinforce_baseline.py
  - 04_cartpole_pg.py
  - 01_cartpole_dqn.py
  - 02_cartpole_reinforce.py

import logging
import pickle
import numpy as np
from nltk.tokenize import TweetTokenizer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils

# Width of each token embedding vector; also the input size of the text LSTM
# in ModelMultimodal.
MM_EMBEDDINGS_DIM = 50
# Hidden state size of the text LSTM in ModelMultimodal.
MM_HIDDEN_SIZE = 128
# Default vocabulary capacity: number of embedding rows in ModelMultimodal and
# the dictionary limit of MultimodalPreprocessor.

MM_MAX_DICT_SIZE = 100
# Placeholder token; MultimodalPreprocessor registers it with id 0.
TOKEN_UNK = "#unk"


class Model(nn.Module):
    """
    Convolutional network with two linear heads: policy and value.

    Both heads read the same flattened convolutional features.

    :param input_shape: observation shape without the batch dimension; its
        first entry is used as the number of input channels
    :param n_actions: number of outputs of the policy head
    """

    def __init__(self, input_shape, n_actions):
        super().__init__()

        in_channels = input_shape[0]
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 64, kernel_size=5, stride=5),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=2),
            nn.ReLU(),
        )

        # The heads need to know how many features the conv stack emits
        n_features = self._get_conv_out(input_shape)
        self.policy = nn.Sequential(nn.Linear(n_features, n_actions))
        self.value = nn.Sequential(nn.Linear(n_features, 1))

    def _get_conv_out(self, shape):
        """
        Return the number of values the conv stack produces for one
        observation of the given shape (found by pushing zeros through it).
        """
        probe = torch.zeros(1, *shape)
        return int(self.conv(probe).numel())

    def forward(self, x):
        """
        :param x: batch of observations; converted to float and divided by 256
        :return: tuple (policy head output, value head output); the policy
            output is the raw linear output, no softmax is applied
        """
        scaled = x.float() / 256
        n_samples = scaled.size(0)
        features = self.conv(scaled).view(n_samples, -1)
        policy_out = self.policy(features)
        value_out = self.value(features)
        return policy_out, value_out


class ModelMultimodal(nn.Module):
    """
    Policy/value network fed by an image and a packed sequence of token ids.

    The image goes through a small convolutional stack, the token ids through
    an embedding layer and a single-layer LSTM. The flattened conv output and
    the final LSTM state (hidden and cell, hence ``MM_HIDDEN_SIZE*2``) are
    concatenated and passed to two linear heads.

    :param input_shape: image shape without the batch dimension; its first
        entry is used as the number of input channels
    :param n_actions: number of outputs of the policy head
    :param max_dict_size: number of rows in the embedding table, i.e. token
        ids must be below this value
    """
    def __init__(self, input_shape, n_actions, max_dict_size=MM_MAX_DICT_SIZE):
        super(ModelMultimodal, self).__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 64, 5, stride=5),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, stride=2),
            nn.ReLU(),
        )

        conv_out_size = self._get_conv_out(input_shape)

        self.emb = nn.Embedding(max_dict_size, MM_EMBEDDINGS_DIM)
        self.rnn = nn.LSTM(MM_EMBEDDINGS_DIM, MM_HIDDEN_SIZE, batch_first=True)

        self.policy = nn.Sequential(
            nn.Linear(conv_out_size + MM_HIDDEN_SIZE*2, n_actions),
        )

        self.value = nn.Sequential(
            nn.Linear(conv_out_size + MM_HIDDEN_SIZE*2, 1),
        )

    def _get_conv_out(self, shape):
        """
        Return the number of values the conv stack produces for one image of
        the given shape (found by pushing zeros through it).
        """
        # Only the output size is needed, so skip autograd bookkeeping
        with torch.no_grad():
            o = self.conv(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def _concat_features(self, img_out, rnn_hidden):
        """
        Join image features with the recurrent state along the feature axis.

        :param img_out: flattened conv features, one row per sample
        :param rnn_hidden: state returned by the RNN, either a single tensor
            or a tuple of tensors (the LSTM used here returns a tuple)
        :return: tensor with one row per sample
        """
        batch_size = img_out.size()[0]
        # The plain view() is valid because self.rnn is a single-layer,
        # unidirectional LSTM, so the leading state dimension is 1
        if isinstance(rnn_hidden, tuple):
            flat_h = [t.view(batch_size, -1) for t in rnn_hidden]
            rnn_h = torch.cat(flat_h, dim=1)
        else:
            rnn_h = rnn_hidden.view(batch_size, -1)
        return torch.cat((img_out, rnn_h), dim=1)

    def forward(self, x):
        """
        :param x: tuple (images, text) where text is a PackedSequence of
            token ids
        :return: tuple (policy head output, value head output); the policy
            output is the raw linear output, no softmax is applied
        """
        x_img, x_text = x
        assert isinstance(x_text, rnn_utils.PackedSequence)

        # deal with text data
        emb_out = self.emb(x_text.data)
        # Swap only the payload and keep every other field of the packed
        # sequence. Rebuilding it from data and batch_sizes alone would drop
        # the sorting indices (when present), and the LSTM state would then
        # come back in length-sorted order and be paired with the wrong images.
        emb_out_seq = x_text._replace(data=emb_out)
        _, rnn_h = self.rnn(emb_out_seq)

        # extract image features
        fx = x_img.float() / 256
        conv_out = self.conv(fx).view(fx.size()[0], -1)

        feats = self._concat_features(conv_out, rnn_h)
        return self.policy(feats), self.value(feats)


class MultimodalPreprocessor:
    """
    Turn (image, text) observations into tensors for ModelMultimodal.

    Text is tokenized and every new token gets the next free integer id until
    ``max_dict_size`` ids are in use; later unseen tokens map to id 0, the id
    of the ``#unk`` token. The dictionary therefore grows as a side effect of
    calling the object.

    :param max_dict_size: upper bound (exclusive) on token ids handed out
    :param device: device the produced tensors are moved to
    """
    # Logger name is left exactly as it was (spelling included) so existing
    # logging configuration keeps matching it
    log = logging.getLogger("MulitmodalPreprocessor")

    def __init__(self, max_dict_size=MM_MAX_DICT_SIZE, device="cpu"):
        self.max_dict_size = max_dict_size
        self.token_to_id = {TOKEN_UNK: 0}
        self.next_id = 1
        self.tokenizer = TweetTokenizer(preserve_case=True)
        self.device = device

    def __len__(self):
        """Number of tokens currently in the dictionary (``#unk`` included)."""
        return len(self.token_to_id)

    def __call__(self, batch):
        """
        Convert list of multimodal observations (tuples with image and text string) into the form suitable
        for ModelMultimodal to digest

        NOTE: the samples are reordered by decreasing token count before the
        tensors are built, so row ``i`` of the result is in general NOT
        ``batch[i]``. Callers pairing the result with per-sample data have to
        account for that.

        :param batch: non-empty iterable of (image, text) tuples
        :return: tuple (float image tensor, PackedSequence of token ids)
        :raises ValueError: if batch is empty
        """
        tokens_batch = []
        for img_obs, txt_obs in batch:
            tokens = self.tokenizer.tokenize(txt_obs)
            idx_obs = self.tokens_to_idx(tokens)
            tokens_batch.append((img_obs, idx_obs))
        if not tokens_batch:
            raise ValueError("Cannot preprocess an empty batch")
        # sort batch decreasing to seq len
        tokens_batch.sort(key=lambda p: len(p[1]), reverse=True)
        img_batch, seq_batch = zip(*tokens_batch)
        # An empty sequence is fed as one token: its row stays all zeros,
        # and 0 is the id of #unk
        lens = [max(len(seq), 1) for seq in seq_batch]

        # convert data into the target form
        # images: stack with numpy first, building a tensor straight from a
        # tuple of arrays goes through a slow element-wise path
        img_v = torch.FloatTensor(np.asarray(img_batch, dtype=np.float32)).to(self.device)
        # sequences, zero-padded to the longest one (first after sorting)
        seq_arr = np.zeros(shape=(len(seq_batch), lens[0]), dtype=np.int64)
        for idx, seq in enumerate(seq_batch):
            seq_arr[idx, :len(seq)] = seq
        seq_v = torch.LongTensor(seq_arr).to(self.device)
        seq_p = rnn_utils.pack_padded_sequence(seq_v, lens, batch_first=True)
        return img_v, seq_p

    def tokens_to_idx(self, tokens):
        """
        Map tokens to integer ids, registering unseen tokens while there is room.

        :param tokens: iterable of token strings
        :return: list of ids, one per token
        """
        res = []
        for token in tokens:
            idx = self.token_to_id.get(token)
            if idx is None:
                # >= rather than == so a restored next_id beyond the limit
                # cannot start handing out ids again
                if self.next_id >= self.max_dict_size:
                    self.log.warning("Maximum size of dict reached, token '%s' converted to #UNK token", token)
                    idx = 0
                else:
                    idx = self.next_id
                    self.next_id += 1
                    self.token_to_id[token] = idx
            res.append(idx)
        return res

    def save(self, file_name):
        """
        Pickle dictionary, size limit and next free id into file_name.
        The device is not stored.
        """
        with open(file_name, 'wb') as fd:
            pickle.dump(self.token_to_id, fd)
            pickle.dump(self.max_dict_size, fd)
            pickle.dump(self.next_id, fd)

    @classmethod
    def load(cls, file_name, device="cpu"):
        """
        Restore a preprocessor written by save().

        :param file_name: path of the pickle file
        :param device: device for the restored preprocessor
        :return: new instance of cls
        """
        # SECURITY: unpickling can execute arbitrary code, only load files
        # from a trusted source
        with open(file_name, "rb") as fd:
            token_to_id = pickle.load(fd)
            max_dict_size = pickle.load(fd)
            next_id = pickle.load(fd)

        res = cls(max_dict_size, device=device)
        res.token_to_id = token_to_id
        res.next_id = next_id
        return res


def train_demo(net, optimizer, batch, writer, step_idx, preprocessor, device="cpu"):
    """
    Train net on demonstration batch

    Runs one optimizer step minimising the cross-entropy between the first
    output of net and the demonstrated actions, and logs the loss.

    :param net: callable returning a sequence whose first item is the policy output
    :param optimizer: optimizer over the parameters of net
    :param batch: iterable of (observation, action index) pairs
    :param writer: object with an add_scalar(tag, value, step) method
    :param step_idx: step value passed to the writer
    :param preprocessor: callable turning a tuple of observations into the
        input of net; may return one object with a .to() method or a
        tuple/list of such objects
    :param device: device the inputs and reference actions are moved to
    """
    batch_obs, batch_act = zip(*batch)
    # NOTE(review): MultimodalPreprocessor in this file reorders its batch by
    # decreasing token count; with such a preprocessor the rows of policy_v
    # may not line up with batch_act - confirm and realign if needed.
    batch_v = preprocessor(batch_obs)
    if hasattr(batch_v, "to"):
        batch_v = batch_v.to(device)
    else:
        # Multi-part input (e.g. images plus packed text): a plain tuple has
        # no .to(), so move every part on its own
        batch_v = tuple(part.to(device) for part in batch_v)
    optimizer.zero_grad()
    ref_actions_v = torch.LongTensor(batch_act).to(device)
    policy_v = net(batch_v)[0]
    loss_v = F.cross_entropy(policy_v, ref_actions_v)
    loss_v.backward()
    optimizer.step()
    writer.add_scalar("demo_loss", loss_v.item(), step_idx)