python source code of dqn

cs294-112_hws-master
- hw1
  - load_policy.py
  - expert_data
    - Reacher-v2.json
    - Humanoid-v2.json
    - Ant-v2.json
    - Walker2d-v2.json
    - Hopper-v2.json
    - HalfCheetah-v2.json
  - report.md
  - report.py
  - run.sh
  - model.py
  - experiments
    - HalfCheetah-v2_smooth-l1
      - behavioral_cloning
        log.txt
        best_checkpoint
        m_best.ckpt-4615.index
        checkpoint
        m_best.ckpt-4615.meta
        checkpoint
        HalfCheetah-v2_smooth-l1.json
        m.ckpt-7100.meta
        m.ckpt-7100.index
        HalfCheetah-v2_smooth-l1_results.json
    - Walker2d-v2
      - dagger
        m.ckpt-28455.meta
        log.txt
        Walker2d-v2_results.json
        m.ckpt-28455.index
        best_checkpoint
        m_best.ckpt-15565.meta
        checkpoint
        m_best.ckpt-15565.index
        checkpoint
        Walker2d-v2.json
      - behavioral_cloning
        log.txt
        Walker2d-v2_results.json
        best_checkpoint
        m_best.ckpt-5680.meta
        checkpoint
        m_best.ckpt-5680.index
        checkpoint
        Walker2d-v2.json
        m.ckpt-7100.meta
        m.ckpt-7100.index
    - Reacher-v2_smooth-l1
      - behavioral_cloning
        log.txt
        best_checkpoint
        checkpoint
        m_best.ckpt-400.meta
        m_best.ckpt-400.index
        checkpoint
        Reacher-v2_smooth-l1.json
        m.ckpt-400.index
        m.ckpt-400.meta
        Reacher-v2_smooth-l1_results.json
    - HalfCheetah-v2
      - dagger
        log.txt
        best_checkpoint
        m_best.ckpt-33265.meta
        m_best.ckpt-33265.index
        checkpoint
        HalfCheetah-v2_results.json
        m.ckpt-40480.meta
        checkpoint
        m.ckpt-40480.index
        HalfCheetah-v2.json
      - behavioral_cloning
        log.txt
        best_checkpoint
        m_best.ckpt-4615.index
        checkpoint
        m_best.ckpt-4615.meta
        HalfCheetah-v2_results.json
        checkpoint
        m.ckpt-7100.meta
        m.ckpt-7100.index
        HalfCheetah-v2.json
    - Hopper-v2_smooth-l1
      - behavioral_cloning
        Hopper-v2_smooth-l1_results.json
        log.txt
        Hopper-v2_smooth-l1.json
        best_checkpoint
        m_best.ckpt-5680.meta
        checkpoint
        m_best.ckpt-5680.index
        checkpoint
        m.ckpt-7100.meta
        m.ckpt-7100.index
    - Walker2d-v2_smooth-l1
      - behavioral_cloning
        log.txt
        best_checkpoint
        m_best.ckpt-6035.index
        checkpoint
        m_best.ckpt-6035.meta
        Walker2d-v2_smooth-l1_results.json
        checkpoint
        m.ckpt-7100.meta
        m.ckpt-7100.index
        Walker2d-v2_smooth-l1.json
    - Reacher-v2
      - dagger
        log.txt
        best_checkpoint
        checkpoint
        m_best.ckpt-1540.meta
        m_best.ckpt-1540.index
        m.ckpt-2075.meta
        Reacher-v2_results.json
        Reacher-v2.json
        checkpoint
        m.ckpt-2075.index
      - behavioral_cloning
        log.txt
        best_checkpoint
        checkpoint
        m_best.ckpt-400.meta
        m_best.ckpt-400.index
        Reacher-v2_results.json
        Reacher-v2.json
        checkpoint
        m.ckpt-400.index
        m.ckpt-400.meta
    - Hopper-v2
      - dagger
        log.txt
        m.ckpt-31140.meta
        best_checkpoint
        checkpoint
        m_best.ckpt-6020.meta
        m_best.ckpt-6020.index
        m.ckpt-31140.index
        checkpoint
        Hopper-v2.json
        Hopper-v2_results.json
      - behavioral_cloning
        log.txt
        best_checkpoint
        m_best.ckpt-5680.meta
        checkpoint
        m_best.ckpt-5680.index
        checkpoint
        Hopper-v2.json
        m.ckpt-7100.meta
        m.ckpt-7100.index
        Hopper-v2_results.json
    - Ant-v2
      - dagger
        Ant-v2_results.json
        m.ckpt-39985.meta
        log.txt
        best_checkpoint
        m_best.ckpt-9405.meta
        checkpoint
        m_best.ckpt-9405.index
        Ant-v2.json
        checkpoint
        m.ckpt-39985.index
      - behavioral_cloning
        Ant-v2_results.json
        log.txt
        best_checkpoint
        m_best.ckpt-6745.index
        m_best.ckpt-6745.meta
        checkpoint
        Ant-v2.json
        checkpoint
        m.ckpt-7100.meta
        m.ckpt-7100.index
    - Humanoid-v2
      - dagger
        log.txt
        best_checkpoint
        checkpoint
        m_best.ckpt-10640.index
        Humanoid-v2.json
        checkpoint
        Humanoid-v2_results.json
        m.ckpt-10640.index
      - behavioral_cloning
        log.txt
        best_checkpoint
        m_best.ckpt-6390.index
        checkpoint
        Humanoid-v2.json
        checkpoint
        Humanoid-v2_results.json
        m.ckpt-7100.index
    - Humanoid-v2_smooth-l1
      - behavioral_cloning
        log.txt
        Humanoid-v2_smooth-l1.json
        best_checkpoint
        m_best.ckpt-6745.index
        checkpoint
        checkpoint
        m.ckpt-7100.index
        Humanoid-v2_smooth-l1_results.json
    - Ant-v2_smooth-l1
      - behavioral_cloning
        Ant-v2_smooth-l1.json
        log.txt
        best_checkpoint
        m_best.ckpt-6745.index
        checkpoint
        Ant-v2_smooth-l1_results.json
        checkpoint
        m.ckpt-7100.index
  - run_expert.py
  - demo.bash
  - get_data.sh
  - main.py
  - README.md
  - requirements.txt
  - experts
    - Hopper-v2.pkl
    - Reacher-v2.pkl
    - HalfCheetah-v2.pkl
    - Ant-v2.pkl
    - Walker2d-v2.pkl
  - tf_util.py
- LICENSE
- project
- hw5
  - sac
    - run_1.sh
    - environment.yml
    - nn.py
    - ref
    - results
    - logz.py
    - plot.py
    - sac.py
    - data
      - sac_HalfCheetah-v2_reparam
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - sac_HalfCheetah-v2_reinf
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - sac_Ant-v2_reparam
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - sac_Ant-v2_reparam_2qf
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
    - README.md
    - utils.py
    - run_2.sh
    - train_mujoco.py
  - exp
    - replay.py
    - ref
    - results
    - run_all.sh
    - exploration.py
    - ex_utils.py
    - density_model.py
    - logz.py
    - plot.py
    - pointmass.py
    - data
      - ac_PM_rbf_bc0.01_s8_sig0.2_PointMass-v0
        1
        74.npy
        42.npy
        36.npy
        61.npy
        0.npy
        3.npy
        51.npy
        54.npy
        18.npy
        45.npy
        38.npy
        48.npy
        57.npy
        log.txt
        47.npy
        77.npy
        32.npy
        79.npy
        85.npy
        4.npy
        55.npy
        33.npy
        50.npy
        13.npy
        59.npy
        91.npy
        89.npy
        25.npy
        28.npy
        65.npy
        26.npy
        93.npy
        82.npy
        44.npy
        34.npy
        11.npy
        43.npy
        9.npy
        15.npy
        98.npy
        2.npy
        params.json
        8.npy
        35.npy
        1.npy
        41.npy
        37.npy
        31.npy
        66.npy
        62.npy
        63.npy
        22.npy
        73.npy
        60.npy
        6.npy
        52.npy
        71.npy
        19.npy
        86.npy
        68.npy
        99.npy
        46.npy
        87.npy
        21.npy
        78.npy
        29.npy
        56.npy
        23.npy
        70.npy
        14.npy
        39.npy
        96.npy
        95.npy
        5.npy
        75.npy
        53.npy
        12.npy
        81.npy
        97.npy
        84.npy
        58.npy
        69.npy
        94.npy
        83.npy
        7.npy
        27.npy
        88.npy
        76.npy
        64.npy
        67.npy
        20.npy
        24.npy
        30.npy
        17.npy
        40.npy
        16.npy
        90.npy
        72.npy
        80.npy
        10.npy
        92.npy
        49.npy
        11
        74.npy
        42.npy
        36.npy
        61.npy
        0.npy
        3.npy
        51.npy
        54.npy
        18.npy
        45.npy
        38.npy
        48.npy
        57.npy
        log.txt
        47.npy
        77.npy
        32.npy
        79.npy
        85.npy
        4.npy
        55.npy
        33.npy
        50.npy
        13.npy
        59.npy
        91.npy
        89.npy
        25.npy
        28.npy
        65.npy
        26.npy
        93.npy
        82.npy
        44.npy
        34.npy
        11.npy
        43.npy
        9.npy
        15.npy
        98.npy
        2.npy
        params.json
        8.npy
        35.npy
        1.npy
        41.npy
        37.npy
        31.npy
        66.npy
        62.npy
        63.npy
        22.npy
        73.npy
        60.npy
        6.npy
        52.npy
        71.npy
        19.npy
        86.npy
        68.npy
        99.npy
        46.npy
        87.npy
        21.npy
        78.npy
        29.npy
        56.npy
        23.npy
        70.npy
        14.npy
        39.npy
        96.npy
        95.npy
        5.npy
        75.npy
        53.npy
        12.npy
        81.npy
        97.npy
        84.npy
        58.npy
        69.npy
        94.npy
        83.npy
        7.npy
        27.npy
        88.npy
        76.npy
        64.npy
        67.npy
        20.npy
        24.npy
        30.npy
        17.npy
        40.npy
        16.npy
        90.npy
        72.npy
        80.npy
        10.npy
        92.npy
        49.npy
        21
        74.npy
        42.npy
        36.npy
        61.npy
        0.npy
        3.npy
        51.npy
        54.npy
        18.npy
        45.npy
        38.npy
        48.npy
        57.npy
        log.txt
        47.npy
        77.npy
        32.npy
        79.npy
        85.npy
        4.npy
        55.npy
        33.npy
        50.npy
        13.npy
        59.npy
        91.npy
        89.npy
        25.npy
        28.npy
        65.npy
        26.npy
        93.npy
        82.npy
        44.npy
        34.npy
        11.npy
        43.npy
        9.npy
        15.npy
        98.npy
        2.npy
        params.json
        8.npy
        35.npy
        1.npy
        41.npy
        37.npy
        31.npy
        66.npy
        62.npy
        63.npy
        22.npy
        73.npy
        60.npy
        6.npy
        52.npy
        71.npy
        19.npy
        86.npy
        68.npy
        99.npy
        46.npy
        87.npy
        21.npy
        78.npy
        29.npy
        56.npy
        23.npy
        70.npy
        14.npy
        39.npy
        96.npy
        95.npy
        5.npy
        75.npy
        53.npy
        12.npy
        81.npy
        97.npy
        84.npy
        58.npy
        69.npy
        94.npy
        83.npy
        7.npy
        27.npy
        88.npy
        76.npy
        64.npy
        67.npy
        20.npy
        24.npy
        30.npy
        17.npy
        40.npy
        16.npy
        90.npy
        72.npy
        80.npy
        10.npy
        92.npy
        49.npy
      - ac_PM_ex2_s8_bc0.05_kl0.1_dlr0.001_dh8_dti1000_PointMass-v0
        1
        74.npy
        42.npy
        36.npy
        61.npy
        0.npy
        3.npy
        51.npy
        54.npy
        18.npy
        45.npy
        38.npy
        48.npy
        57.npy
        log.txt
        47.npy
        77.npy
        32.npy
        79.npy
        85.npy
        4.npy
        55.npy
        33.npy
        50.npy
        13.npy
        59.npy
        91.npy
        89.npy
        25.npy
        28.npy
        65.npy
        26.npy
        93.npy
        82.npy
        44.npy
        34.npy
        11.npy
        43.npy
        9.npy
        15.npy
        98.npy
        2.npy
        params.json
        8.npy
        35.npy
        1.npy
        41.npy
        37.npy
        31.npy
        66.npy
        62.npy
        63.npy
        22.npy
        73.npy
        60.npy
        6.npy
        52.npy
        71.npy
        19.npy
        86.npy
        68.npy
        99.npy
        46.npy
        87.npy
        21.npy
        78.npy
        29.npy
        56.npy
        23.npy
        70.npy
        14.npy
        39.npy
        96.npy
        95.npy
        5.npy
        75.npy
        53.npy
        12.npy
        81.npy
        97.npy
        84.npy
        58.npy
        69.npy
        94.npy
        83.npy
        7.npy
        27.npy
        88.npy
        76.npy
        64.npy
        67.npy
        20.npy
        24.npy
        30.npy
        17.npy
        40.npy
        16.npy
        90.npy
        72.npy
        80.npy
        10.npy
        92.npy
        49.npy
        11
        74.npy
        42.npy
        36.npy
        61.npy
        0.npy
        3.npy
        51.npy
        54.npy
        18.npy
        45.npy
        38.npy
        48.npy
        57.npy
        log.txt
        47.npy
        77.npy
        32.npy
        79.npy
        85.npy
        4.npy
        55.npy
        33.npy
        50.npy
        13.npy
        59.npy
        91.npy
        89.npy
        25.npy
        28.npy
        65.npy
        26.npy
        93.npy
        82.npy
        44.npy
        34.npy
        11.npy
        43.npy
        9.npy
        15.npy
        98.npy
        2.npy
        params.json
        8.npy
        35.npy
        1.npy
        41.npy
        37.npy
        31.npy
        66.npy
        62.npy
        63.npy
        22.npy
        73.npy
        60.npy
        6.npy
        52.npy
        71.npy
        19.npy
        86.npy
        68.npy
        99.npy
        46.npy
        87.npy
        21.npy
        78.npy
        29.npy
        56.npy
        23.npy
        70.npy
        14.npy
        39.npy
        96.npy
        95.npy
        5.npy
        75.npy
        53.npy
        12.npy
        81.npy
        97.npy
        84.npy
        58.npy
        69.npy
        94.npy
        83.npy
        7.npy
        27.npy
        88.npy
        76.npy
        64.npy
        67.npy
        20.npy
        24.npy
        30.npy
        17.npy
        40.npy
        16.npy
        90.npy
        72.npy
        80.npy
        10.npy
        92.npy
        49.npy
        21
        74.npy
        42.npy
        36.npy
        61.npy
        0.npy
        3.npy
        51.npy
        54.npy
        18.npy
        45.npy
        38.npy
        48.npy
        57.npy
        log.txt
        47.npy
        77.npy
        32.npy
        79.npy
        85.npy
        4.npy
        55.npy
        33.npy
        50.npy
        13.npy
        59.npy
        91.npy
        89.npy
        25.npy
        28.npy
        65.npy
        26.npy
        93.npy
        82.npy
        44.npy
        34.npy
        11.npy
        43.npy
        9.npy
        15.npy
        98.npy
        2.npy
        params.json
        8.npy
        35.npy
        1.npy
        41.npy
        37.npy
        31.npy
        66.npy
        62.npy
        63.npy
        22.npy
        73.npy
        60.npy
        6.npy
        52.npy
        71.npy
        19.npy
        86.npy
        68.npy
        99.npy
        46.npy
        87.npy
        21.npy
        78.npy
        29.npy
        56.npy
        23.npy
        70.npy
        14.npy
        39.npy
        96.npy
        95.npy
        5.npy
        75.npy
        53.npy
        12.npy
        81.npy
        97.npy
        84.npy
        58.npy
        69.npy
        94.npy
        83.npy
        7.npy
        27.npy
        88.npy
        76.npy
        64.npy
        67.npy
        20.npy
        24.npy
        30.npy
        17.npy
        40.npy
        16.npy
        90.npy
        72.npy
        80.npy
        10.npy
        92.npy
        49.npy
      - ac_HC_bc0_HalfCheetah-v2
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - ac_PM_bc0_s8_PointMass-v0
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - ac_PM_hist_bc0.01_s8_PointMass-v0
        1
        74.npy
        42.npy
        36.npy
        61.npy
        0.npy
        3.npy
        51.npy
        54.npy
        18.npy
        45.npy
        38.npy
        48.npy
        57.npy
        log.txt
        47.npy
        77.npy
        32.npy
        79.npy
        85.npy
        4.npy
        55.npy
        33.npy
        50.npy
        13.npy
        59.npy
        91.npy
        89.npy
        25.npy
        28.npy
        65.npy
        26.npy
        93.npy
        82.npy
        44.npy
        34.npy
        11.npy
        43.npy
        9.npy
        15.npy
        98.npy
        2.npy
        params.json
        8.npy
        35.npy
        1.npy
        41.npy
        37.npy
        31.npy
        66.npy
        62.npy
        63.npy
        22.npy
        73.npy
        60.npy
        6.npy
        52.npy
        71.npy
        19.npy
        86.npy
        68.npy
        99.npy
        46.npy
        87.npy
        21.npy
        78.npy
        29.npy
        56.npy
        23.npy
        70.npy
        14.npy
        39.npy
        96.npy
        95.npy
        5.npy
        75.npy
        53.npy
        12.npy
        81.npy
        97.npy
        84.npy
        58.npy
        69.npy
        94.npy
        83.npy
        7.npy
        27.npy
        88.npy
        76.npy
        64.npy
        67.npy
        20.npy
        24.npy
        30.npy
        17.npy
        40.npy
        16.npy
        90.npy
        72.npy
        80.npy
        10.npy
        92.npy
        49.npy
        11
        74.npy
        42.npy
        36.npy
        61.npy
        0.npy
        3.npy
        51.npy
        54.npy
        18.npy
        45.npy
        38.npy
        48.npy
        57.npy
        log.txt
        47.npy
        77.npy
        32.npy
        79.npy
        85.npy
        4.npy
        55.npy
        33.npy
        50.npy
        13.npy
        59.npy
        91.npy
        89.npy
        25.npy
        28.npy
        65.npy
        26.npy
        93.npy
        82.npy
        44.npy
        34.npy
        11.npy
        43.npy
        9.npy
        15.npy
        98.npy
        2.npy
        params.json
        8.npy
        35.npy
        1.npy
        41.npy
        37.npy
        31.npy
        66.npy
        62.npy
        63.npy
        22.npy
        73.npy
        60.npy
        6.npy
        52.npy
        71.npy
        19.npy
        86.npy
        68.npy
        99.npy
        46.npy
        87.npy
        21.npy
        78.npy
        29.npy
        56.npy
        23.npy
        70.npy
        14.npy
        39.npy
        96.npy
        95.npy
        5.npy
        75.npy
        53.npy
        12.npy
        81.npy
        97.npy
        84.npy
        58.npy
        69.npy
        94.npy
        83.npy
        7.npy
        27.npy
        88.npy
        76.npy
        64.npy
        67.npy
        20.npy
        24.npy
        30.npy
        17.npy
        40.npy
        16.npy
        90.npy
        72.npy
        80.npy
        10.npy
        92.npy
        49.npy
        21
        74.npy
        42.npy
        36.npy
        61.npy
        0.npy
        3.npy
        51.npy
        54.npy
        18.npy
        45.npy
        38.npy
        48.npy
        57.npy
        log.txt
        47.npy
        77.npy
        32.npy
        79.npy
        85.npy
        4.npy
        55.npy
        33.npy
        50.npy
        13.npy
        59.npy
        91.npy
        89.npy
        25.npy
        28.npy
        65.npy
        26.npy
        93.npy
        82.npy
        44.npy
        34.npy
        11.npy
        43.npy
        9.npy
        15.npy
        98.npy
        2.npy
        params.json
        8.npy
        35.npy
        1.npy
        41.npy
        37.npy
        31.npy
        66.npy
        62.npy
        63.npy
        22.npy
        73.npy
        60.npy
        6.npy
        52.npy
        71.npy
        19.npy
        86.npy
        68.npy
        99.npy
        46.npy
        87.npy
        21.npy
        78.npy
        29.npy
        56.npy
        23.npy
        70.npy
        14.npy
        39.npy
        96.npy
        95.npy
        5.npy
        75.npy
        53.npy
        12.npy
        81.npy
        97.npy
        84.npy
        58.npy
        69.npy
        94.npy
        83.npy
        7.npy
        27.npy
        88.npy
        76.npy
        64.npy
        67.npy
        20.npy
        24.npy
        30.npy
        17.npy
        40.npy
        16.npy
        90.npy
        72.npy
        80.npy
        10.npy
        92.npy
        49.npy
      - ac_HC_bc0.001_kl0.1_dlr0.005_dti1000_HalfCheetah-v2
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - ac_HC_bc0.0001_kl0.1_dlr0.005_dti10000_HalfCheetah-v2
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
    - README.md
    - requirements.txt
    - train_ac_exploration_f18.py
    - sparse_half_cheetah.py
  - meta
    - run_1.sh
    - point_mass.py
    - run_21.sh
    - run_25.sh
    - replay_buffer.py
    - run_24.sh
    - ref
    - plot_3.py
    - results
    - train_policy.py
    - run_31.sh
    - logz.py
    - point_mass_observed.py
    - plot.py
    - run_23.sh
    - data
      - pm_mlp_h-1_gs-1
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - pm_gru_h-30_gs-1
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - pm_gru_h-60_gs-10
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - pm-obs_mlp_h-1
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - pm_gru_h-60_gs-5
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - pm_gru_h-45_gs-1
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - pm_gru_h-1_gs-1
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - pm_mlp_h-30_gs-1
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - pm_gru_h-60_gs-1
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - pm_mlp_h-15_gs-1
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - pm_gru_h-15_gs-1
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - pm_mlp_h-60_gs-1
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
      - pm_mlp_h-45_gs-1
        1
        log.txt
        params.json
        11
        log.txt
        params.json
        21
        log.txt
        params.json
    - README.md
    - requirements.txt
    - run_32.sh
    - run_22.sh
- hw3
  - run_21.sh
  - results
  - lunar_lander.py
  - dqn_utils.py
  - train_ac_f18.py
  - plot_part1.py
  - logz.py
  - run_14.sh
  - atari_wrappers.py
  - plot.py
  - run_dqn_atari.py
  - old
    - run_dqn_lander.py
    - README.md
    - run_dqn_ram.py
  - run_11.sh
  - data
    - ac_1_100_CartPole-v0
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - ac_10_10_InvertedPendulum-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - ac_1_1_CartPole-v0
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - PongNoFrameskip-v4_dq_gamma-0_9
      - log.txt
    - PongNoFrameskip-v4_dq_gamma-0_999
      - log.txt
    - ac_10_10_HalfCheetah-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - ac_10_10_CartPole-v0
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - ac_100_1_CartPole-v0
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - PongNoFrameskip-v4_dq
      - log.txt
    - PongNoFrameskip-v4_bq
      - log.txt
  - run_12.sh
  - README.md
  - dqn.py
  - requirements.txt
  - run_13.sh
  - run_22.sh
- hw2
  - run_5.sh
  - README.tex.md
  - results
  - lunar_lander.py
  - tex
    - c27101b2bb742f3c10dfd4f7b27c299b.svg
    - 344d33eb616b74dbcae4e4d2aae62c94.svg
    - 6032ead988551c349c799b3f615ec60a.svg
  - run_93.sh
  - run_812.sh
  - run_811.sh
  - logz.py
  - run_7.sh
  - plot.py
  - run_4.sh
  - data
    - ip93_b-1000_lr-1e-3_pg-step-1_InvertedPendulum-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - lb_no_rtg_dna_CartPole-v0
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - sb_rtg_na_CartPole-v0
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - ip_b-1000_lr-1e-2_InvertedPendulum-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - lb_rtg_dna_CartPole-v0
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - hc12_b-30000_lr-0.005_HalfCheetah-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - ll_b-40000_lr-0.005_LunarLanderContinuous-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - hc2_b-50000_lr-0.02_rtg_nn_baseline_HalfCheetah-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - hc13_b-50000_lr-0.01_HalfCheetah-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - ip93_b-1000_lr-1e-3_pg-step-20_InvertedPendulum-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - hc12_b-30000_lr-0.01_HalfCheetah-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - ip93_b-1000_lr-1e-3_pg-step-5_InvertedPendulum-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - hc11_b-10000_lr-0.005_HalfCheetah-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - sb_no_rtg_dna_CartPole-v0
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - hc2_b-50000_lr-0.02_HalfCheetah-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - lb_rtg_na_CartPole-v0
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - hc13_b-50000_lr-0.005_HalfCheetah-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - hc11_b-10000_lr-0.01_HalfCheetah-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - hc11_b-10000_lr-0.02_HalfCheetah-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - hc12_b-30000_lr-0.02_HalfCheetah-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - hc2_b-50000_lr-0.02_rtg_HalfCheetah-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - sb_rtg_dna_CartPole-v0
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - hc13_b-50000_lr-0.02_HalfCheetah-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
    - hc2_b-50000_lr-0.02_nn_baseline_HalfCheetah-v2
      - 1
        log.txt
        params.json
      - 11
        log.txt
        params.json
      - 21
        log.txt
        params.json
  - run_813.sh
  - README.md
  - run_82.sh
  - train_pg_f18.py
  - requirements.txt
  - hw2_instructions.tex
- README.md
- hw4
  - plots
  - ref
  - run_all.sh
  - model_based_policy.py
  - plot.py
  - logger.py
  - data
    - HalfCheetah_q3_horizon15
      - log.txt
      - log.csv
    - HalfCheetah_q3_action128
      - log.txt
      - log.csv
    - HalfCheetah_q3_horizon20
      - log.txt
      - log.csv
    - HalfCheetah_q3_horizon10
      - log.txt
      - log.csv
    - HalfCheetah_q3_layers2
      - log.txt
      - log.csv
    - HalfCheetah_q3_default
      - log.txt
      - log.csv
    - HalfCheetah_q1_exp
      - log.txt
    - HalfCheetah_q2_exp
      - log.txt
      - log.csv
    - HalfCheetah_q3_action4096
      - log.txt
      - log.csv
    - HalfCheetah_q3_layers3
      - log.txt
      - log.csv
    - HalfCheetah_q3_action16384
      - log.txt
      - log.csv
    - HalfCheetah_q3_layers1
      - log.txt
      - log.csv
  - model_based_rl.py
  - half_cheetah_env.py
  - main.py
  - README.md
  - utils.py
  - requirements.txt
  - timer.py
  - tabulate.py
- .gitignore

"""This file includes a collection of utility functions that are useful for
implementing DQN."""
import gym
import tensorflow as tf
import numpy as np
import random

def huber_loss(x, delta=1.0):
    # https://en.wikipedia.org/wiki/Huber_loss
    return tf.where(
        tf.abs(x) < delta,
        tf.square(x) * 0.5,
        delta * (tf.abs(x) - 0.5 * delta)
    )

def sample_n_unique(sampling_f, n):
    """Helper function. Given a function `sampling_f` that returns
    comparable objects, sample n such unique objects.
    """
    res = []
    while len(res) < n:
        candidate = sampling_f()
        if candidate not in res:
            res.append(candidate)
    return res

class Schedule(object):
    def value(self, t):
        """Value of the schedule at time t"""
        raise NotImplementedError()

class ConstantSchedule(object):
    def __init__(self, value):
        """Value remains constant over time.
        Parameters
        ----------
        value: float
            Constant value of the schedule
        """
        self._v = value

    def value(self, t):
        """See Schedule.value"""
        return self._v

def linear_interpolation(l, r, alpha):
    return l + alpha * (r - l)

class PiecewiseSchedule(object):
    def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None):
        """Piecewise schedule.
        endpoints: [(int, int)]
            list of pairs `(time, value)` meanining that schedule should output
            `value` when `t==time`. All the values for time must be sorted in
            an increasing order. When t is between two times, e.g. `(time_a, value_a)`
            and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs
            `interpolation(value_a, value_b, alpha)` where alpha is a fraction of
            time passed between `time_a` and `time_b` for time `t`.
        interpolation: lambda float, float, float: float
            a function that takes value to the left and to the right of t according
            to the `endpoints`. Alpha is the fraction of distance from left endpoint to
            right endpoint that t has covered. See linear_interpolation for example.
        outside_value: float
            if the value is requested outside of all the intervals sepecified in
            `endpoints` this value is returned. If None then AssertionError is
            raised when outside value is requested.
        """
        idxes = [e[0] for e in endpoints]
        assert idxes == sorted(idxes)
        self._interpolation = interpolation
        self._outside_value = outside_value
        self._endpoints      = endpoints

    def value(self, t):
        """See Schedule.value"""
        for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if l_t <= t and t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return self._interpolation(l, r, alpha)

        # t does not belong to any of the pieces, so doom.
        assert self._outside_value is not None
        return self._outside_value

class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        """Linear interpolation between initial_p and final_p over
        schedule_timesteps. After this many timesteps pass final_p is
        returned.
        Parameters
        ----------
        schedule_timesteps: int
            Number of timesteps for which to linearly anneal initial_p
            to final_p
        initial_p: float
            initial output value
        final_p: float
            final output value
        """
        self.schedule_timesteps = schedule_timesteps
        self.final_p            = final_p
        self.initial_p          = initial_p

    def value(self, t):
        """See Schedule.value"""
        fraction  = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

def compute_exponential_averages(variables, decay):
    """Given a list of tensorflow scalar variables
    create ops corresponding to their exponential
    averages
    Parameters
    ----------
    variables: [tf.Tensor]
        List of scalar tensors.
    Returns
    -------
    averages: [tf.Tensor]
        List of scalar tensors corresponding to averages
        of al the `variables` (in order)
    apply_op: tf.runnable
        Op to be run to update the averages with current value
        of variables.
    """
    averager = tf.train.ExponentialMovingAverage(decay=decay)
    apply_op = averager.apply(variables)
    return [averager.average(v) for v in variables], apply_op

def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
    """Minimized `objective` using `optimizer` w.r.t. variables in
    `var_list` while ensure the norm of the gradients for each
    variable is clipped to `clip_val`
    """
    gradients = optimizer.compute_gradients(objective, var_list=var_list)
    for i, (grad, var) in enumerate(gradients):
        if grad is not None:
            gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
    return optimizer.apply_gradients(gradients)

def initialize_interdependent_variables(session, vars_list, feed_dict):
    """Initialize a list of variables one at a time, which is useful if
    initialization of some variables depends on initialization of the others.
    """
    vars_left = vars_list
    while len(vars_left) > 0:
        new_vars_left = []
        for v in vars_left:
            try:
                # If using an older version of TensorFlow, uncomment the line
                # below and comment out the line after it.
		#session.run(tf.initialize_variables([v]), feed_dict)
                session.run(tf.variables_initializer([v]), feed_dict)
            except tf.errors.FailedPreconditionError:
                new_vars_left.append(v)
        if len(new_vars_left) >= len(vars_left):
            # This can happend if the variables all depend on each other, or more likely if there's
            # another variable outside of the list, that still needs to be initialized. This could be
            # detected here, but life's finite.
            raise Exception("Cycle in variable dependencies, or extenrnal precondition unsatisfied.")
        else:
            vars_left = new_vars_left

def get_wrapper_by_name(env, classname):
    currentenv = env
    while True:
        if classname in currentenv.__class__.__name__:
            return currentenv
        elif isinstance(env, gym.Wrapper):
            currentenv = currentenv.env
        else:
            raise ValueError("Couldn't find wrapper named %s"%classname)

class ReplayBuffer(object):
    def __init__(self, size, frame_history_len, lander=False):
        """This is a memory efficient implementation of the replay buffer.

        The sepecific memory optimizations use here are:
            - only store each frame once rather than k times
              even if every observation normally consists of k last frames
            - store frames as np.uint8 (actually it is most time-performance
              to cast them back to float32 on GPU to minimize memory transfer
              time)
            - store frame_t and frame_(t+1) in the same buffer.

        For the tipical use case in Atari Deep RL buffer with 1M frames the total
        memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes

        Warning! Assumes that returning frame of zeros at the beginning
        of the episode, when there is less frames than `frame_history_len`,
        is acceptable.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        frame_history_len: int
            Number of memories to be retried for each observation.
        """
        self.lander = lander

        self.size = size
        self.frame_history_len = frame_history_len

        self.next_idx      = 0
        self.num_in_buffer = 0

        self.obs      = None
        self.action   = None
        self.reward   = None
        self.done     = None

    def can_sample(self, batch_size):
        """Returns true if `batch_size` different transitions can be sampled from the buffer."""
        return batch_size + 1 <= self.num_in_buffer

    def _encode_sample(self, idxes):
        obs_batch      = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0)
        act_batch      = self.action[idxes]
        rew_batch      = self.reward[idxes]
        next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0)
        done_mask      = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32)

        return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask


    def sample(self, batch_size):
        """Sample `batch_size` different transitions.

        i-th sample transition is the following:

        when observing `obs_batch[i]`, action `act_batch[i]` was taken,
        after which reward `rew_batch[i]` was received and subsequent
        observation  next_obs_batch[i] was observed, unless the epsiode
        was done which is represented by `done_mask[i]` which is equal
        to 1 if episode has ended as a result of that action.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.

        Returns
        -------
        obs_batch: np.array
            Array of shape
            (batch_size, img_h, img_w, img_c * frame_history_len)
            and dtype np.uint8
        act_batch: np.array
            Array of shape (batch_size,) and dtype np.int32
        rew_batch: np.array
            Array of shape (batch_size,) and dtype np.float32
        next_obs_batch: np.array
            Array of shape
            (batch_size, img_h, img_w, img_c * frame_history_len)
            and dtype np.uint8
        done_mask: np.array
            Array of shape (batch_size,) and dtype np.float32
        """
        assert self.can_sample(batch_size)
        idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size)
        return self._encode_sample(idxes)

    def encode_recent_observation(self):
        """Return the most recent `frame_history_len` frames.

        Returns
        -------
        observation: np.array
            Array of shape (img_h, img_w, img_c * frame_history_len)
            and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c]
            encodes frame at time `t - frame_history_len + i`
        """
        assert self.num_in_buffer > 0
        return self._encode_observation((self.next_idx - 1) % self.size)

    def _encode_observation(self, idx):
        end_idx   = idx + 1 # make noninclusive
        start_idx = end_idx - self.frame_history_len
        # this checks if we are using low-dimensional observations, such as RAM
        # state, in which case we just directly return the latest RAM.
        if len(self.obs.shape) == 2:
            return self.obs[end_idx-1]
        # if there weren't enough frames ever in the buffer for context
        if start_idx < 0 and self.num_in_buffer != self.size:
            start_idx = 0
        for idx in range(start_idx, end_idx - 1):
            if self.done[idx % self.size]:
                start_idx = idx + 1
        missing_context = self.frame_history_len - (end_idx - start_idx)
        # if zero padding is needed for missing context
        # or we are on the boundry of the buffer
        if start_idx < 0 or missing_context > 0:
            frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)]
            for idx in range(start_idx, end_idx):
                frames.append(self.obs[idx % self.size])
            return np.concatenate(frames, 2)
        else:
            # this optimization has potential to saves about 30% compute time \o/
            img_h, img_w = self.obs.shape[1], self.obs.shape[2]
            return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1)

    def store_frame(self, frame):
        """Store a single frame in the buffer at the next available index, overwriting
        old frames if necessary.

        Parameters
        ----------
        frame: np.array
            Array of shape (img_h, img_w, img_c) and dtype np.uint8
            the frame to be stored

        Returns
        -------
        idx: int
            Index at which the frame is stored. To be used for `store_effect` later.
        """
        if self.obs is None:
            self.obs      = np.empty([self.size] + list(frame.shape), dtype=np.float32 if self.lander else np.uint8)
            self.action   = np.empty([self.size],                     dtype=np.int32)
            self.reward   = np.empty([self.size],                     dtype=np.float32)
            self.done     = np.empty([self.size],                     dtype=np.bool)
        self.obs[self.next_idx] = frame

        ret = self.next_idx
        self.next_idx = (self.next_idx + 1) % self.size
        self.num_in_buffer = min(self.size, self.num_in_buffer + 1)

        return ret

    def store_effect(self, idx, action, reward, done):
        """Store effects of action taken after obeserving frame stored
        at index idx. The reason `store_frame` and `store_effect` is broken
        up into two functions is so that once can call `encode_recent_observation`
        in between.

        Paramters
        ---------
        idx: int
            Index in buffer of recently observed frame (returned by `store_frame`).
        action: int
            Action that was performed upon observing this frame.
        reward: float
            Reward that was received when the actions was performed.
        done: bool
            True if episode was finished after performing that action.
        """
        self.action[idx] = action
        self.reward[idx] = reward
        self.done[idx]   = done