Python source code of imagenet_to_gcs.py

tpu_models-master
- LICENSE
- models
  - samples
    - core
      - get_started
        custom_tpuestimator.py
        iris_data_tpu.py
  - official
    - squeezenet
      - data_pipeline.py
      - squeezenet_model.py
      - squeezenet_main.py
      - configs
        squeezenet_config.py
        __init__.py
    - densenet
      - vgg_preprocessing.py
      - densenet_model.py
      - README.md
      - densenet_imagenet.py
    - unet3d
      - input_reader.py
      - tpu_executor.py
      - unet_model.py
      - metrics.py
      - params_dict.py
      - data_preprocess
        convert_lits.py
        convert_lits_nii_to_npy.py
      - unet_config.py
      - unet_main.py
      - README.md
      - requirements.txt
      - export_saved_model.py
      - saved_model_inference.py
    - detection
      - utils
        autoaugment_utils.py
        input_utils.py
        spatial_transform.py
        __init__.py
        object_detection
        balanced_positive_negative_sampler.py
        region_similarity_calculator.py
        ops.py
        visualization_utils.py
        shape_utils.py
        target_assigner.py
        preprocessor.py
        minibatch_sampler.py
        box_list_ops.py
        box_list.py
        faster_rcnn_box_coder.py
        argmax_matcher.py
        matcher.py
        box_coder.py
        __init__.py
        box_utils.py
        segm_utils.py
      - modeling
        retinanet_model.py
        learning_rates.py
        factory.py
        base_model.py
        postprocess.py
        losses.py
        architecture
        efficientnet_builder.py
        efficientnet_model.py
        fpn.py
        identity.py
        factory.py
        seresnext.py
        resnet.py
        heads.py
        nasfpn.py
        __init__.py
        utils.py
        nn_ops.py
        serving.py
        __init__.py
        checkpoint_utils.py
        model_builder.py
      - executor
        tpu_executor.py
        __init__.py
      - dataloader
        input_reader.py
        anchor.py
        retinanet_parser.py
        factory.py
        __init__.py
        tf_example_decoder.py
        shapemask_parser.py
        mode_keys.py
      - evaluation
        factory.py
        coco_utils.py
        __init__.py
        coco_evaluator.py
      - k8s
        retinanet_k8s.yaml
      - configs
        yaml
        4.8.14_r152_pretrained_640_feat_300_drop_0.9_aug3.yaml
        2.0.5_nasfpn_resnet50_shorter.yaml
        4.1.1_effnet_b5_45k_90k_1024_feat_300.yaml
        2.6.1_r50_features_250_drop_0.7.yaml
        3.1.0_balanced_resnet50_step_45k.yaml
        2.3.1b_efficientnet_b5_resumed.yaml
        2.6.4_r50_features_250_drop_0.85.yaml
        1.0.0_retinanet.yaml
        4.2.2_effnet_b7_45k_90k_1024_feat_256_drop_0.9.yaml
        2.6.5_r50_features_300_drop_0.85.yaml
        2.0.0_nasfpn_resnet50.yaml
        4.5.5_effnet_b6_45k_90k_1024_feat_256_drop_0.85.yaml
        2.4.7_r50_cosine_10k_300_feat_drop_0.5.yaml
        4.1.0_effnet_b5_45k_90k_1024_batch_32.yaml
        2.1.0_nasfpn_resnet200_aug_v1.yaml
        2.6.10_r50_features_400_drop_0.5_1024.yaml
        4.1.10_effnet_b5_45k_90k_1024_feat_350_drop_0.9.yaml
        4.9.1_r101_step_45k_60k_feat_300_drop_0.9_aug0.yaml
        2.5.2_effnet_b5_cosine_10k_frozen_15.yaml
        2.0.4_nasfpn_resnet50_aug_v3.yaml
        2.2.0_nasfpn_seresnext50.yaml
        1.0.4_1024x768.yaml
        4.8.4_r50_step_45k_90k_1024.yaml
        4.0.3_r200_step_45k_90k_1024_drop_0.9.yaml
        4.1.6_effnet_b5_45k_90k_1024_feat_256_drop_0.9.yaml
        2.5.0_effnet_b5_cosine_10k.yaml
        4.5.7_effnet_b6_45k_90k_1024_feat_300_drop_0.85.yaml
        2.4.4_r50_cosine_10k_1024x1024.yaml
        4.8.0_r200_step_45k_90k_640_drop_0.9_aug1.yaml
        2.1.1_nasfpn_resnet200_aug_v3.yaml
        2.3.2_nasfpn_effnet_b7_clr.yaml
        4.8.5_r50_step_45k_90k_feat_300_drop_0.9.yaml
        2.4.5_r50_cosine_10k_128_feat.yaml
        2.4.6_r50_cosine_10k_384_feat.yaml
        4.3.0_effnet_b5_45k_90k_1024_frozen_25.yaml
        4.8.1_r152_step_45k_90k_800_drop_0.9_feat_300.yaml
        4.1.9_effnet_b5_45k_90k_1024_feat_350_drop_0.85.yaml
        4.5.9_effnet_b6_45k_90k_1024_feat_350_drop_0.85.yaml
        retinanet_autoaugment.yaml
        4.4.4_effnet_b7_45k_90k_1024_feat_300_drop_0.85.yaml
        2.6.0_r50_step_45k.yaml
        2.0.6_nasfpn_resnet50_cosine_lr1.yaml
        4.9.2_r200_pretrained_640_drop_0.9_aug1.yaml
        2.0.3_nasfpn_resnet200.yaml
        4.1.3_effnet_b5_45k_90k_1024_feat_400.yaml
        4.5.0_effnet_b6_45k_90k_1024_batch_32.yaml
        2.3.1_efficientnet_b5_pretrain.yaml
        4.5.1_effnet_b6_45k_90k_1024_feat_300.yaml
        4.1.5_effnet_b5_45k_90k_1024_feat_256_drop_0.85.yaml
        2.6.9_r50_features_512_drop_0.7_1024.yaml
        retinanet_nasfpn.yaml
        2.0.9a_nasfpn_resnet50_cosine_lr5.yaml
        4.0.2_r152_step_45k_90k_1024_drop_0.9.yaml
        4.8.8_r200_pretrained_640_drop_0.9_aug1.yaml
        1.1.0_added_lr_schedule.yaml
        4.8.9_r152_pretrained_640_drop_0.9_aug3.yaml
        2.3.3a_nasfpn_effnet_b7_resumed.yaml
        4.8.11_r101_pretrained_1024_feat_300_aug2.yaml
        4.2.3_effnet_b7_45k_90k_1024_feat_300_drop_0.9.yaml
        2.4.3_r50_cosine_10k_separable_aug_v3.yaml
        2.0.9_nasfpn_resnet50_cosine_lr4.yaml
        1.0.5_1024x1024.yaml
        2.6.7_r50_features_400_drop_0.7_1024.yaml
        1.3.1_resnet152.yaml
        2.3.3_nasfpn_effnet_b7_batch_32.yaml
        1.3.2_resnet200.yaml
        4.7.1_r50_cosine_10k_feat_300_drop_0.7.yaml
        4.0.0_r50_step_45k_90k_1024.yaml
        1.0.2_changed_schedule.yaml
        4.9.4_r152_pretrained_640_feat_300_drop_0.9_aug2.yaml
        2.1.4_nasfpn_resnet200_1024_aug_v3.yaml
        2.0.1_nasfpn_resnet101.yaml
        1.3.0_resnet101.yaml
        4.9.0_r50_step_45k_60k_feat_300_drop_0.9_aug2.yaml
        2.4.2_r50_cosine_10k_separable_aug_v1.yaml
        3.0.1_balanced_resnet50_cosine_20k.yaml
        3.0.2_balanced_resnet50_cosine_30k.yaml
        2.0.8_nasfpn_resnet50_cosine_lr3.yaml
        4.8.13_r200_pretrained_640_feat_300_drop_0.9_aug1.yaml
        4.7.0_r50_cosine_10k_1024.yaml
        4.8.6_r50_step_45k_90k_feat_300_drop_0.9_aug2.yaml
        4.6.0_r50_step_45k_60k_1024.yaml
        4.1.4_effnet_b5_45k_90k_1024_separable.yaml
        4.1.2_effnet_b5_45k_90k_1024_feat_350.yaml
        1.2.1_autoaugment_v1.yaml
        3.1.1_balanced_resnet50_step_30k.yaml
        4.9.3_r152_pretrained_640_drop_0.9_aug3.yaml
        2.6.2_r50_features_300_drop_0.7.yaml
        4.4.5_effnet_b7_45k_90k_1024_feat_350_drop_0.9.yaml
        2.5.1_effnet_b5_cosine_10k_frozen_10.yaml
        4.8.10_r101_pretrained_1024_feat_350_aug3.yaml
        2.5.4_effnet_b5_cosine_10k_frozen_15_640.yaml
        4.8.12_r200_pretrained_feat_300_aug3.yaml
        4.8.3_r101_step_45k_90k_1024_feat_300_aug2.yaml
        2.4.8_r50_cosine_10k_300_feat_drop_0.7.yaml
        2.5.5_effnet_b5_cosine_10k_frozen_15_640_sep.yaml
        1.2.0_autoaugment_v0.yaml
        2.1.2_nasfpn_resnet200_1024.yaml
        4.1.7_effnet_b5_45k_90k_1024_feat_300_drop_0.85.yaml
        4.8.2_r101_step_45k_90k_1024_feat_350_aug3.yaml
        4.5.4_effnet_b6_45k_90k_1024_separable.yaml
        4.5.3_effnet_b6_45k_90k_1024_feat_400.yaml
        2.3.0_nasfpn_efficientnet_b5.yaml
        3.0.0_balanced_resnet50_cosine_10k.yaml
        4.5.6_effnet_b6_45k_90k_1024_feat_256_drop_0.9.yaml
        2.6.6_r50_features_400_drop_0.85.yaml
        4.6.1_r50_step_45k_60k_1024_drop_0.9.yaml
        2.0.7_nasfpn_resnet50_cosine_lr2.yaml
        1.2.2_autoaugment_v2.yaml
        4.2.4_effnet_b7_45k_90k_1024_frozen_25.yaml
        2.5.3_effnet_b5_cosine_10k_frozen_20.yaml
        1.2.3_autoaugment_v3.yaml
        2.4.1_r50_cosine_10k_separable_aug_v0.yaml
        2.5.6_effnet_b5_cosine_10k_frozen_15_640_sep_drop.yaml
        4.5.2_effnet_b6_45k_90k_1024_feat_350.yaml
        4.4.2_effnet_b7_45k_90k_1024_feat_256_drop_0.85.yaml
        4.4.3_effnet_b7_45k_90k_1024_feat_300_drop_0.9.yaml
        4.1.8_effnet_b5_45k_90k_1024_feat_300_drop_0.9.yaml
        4.4.1_effnet_b7_45k_90k_1024_feat_256_drop_0.9.yaml
        2.6.3_r50_features_400_drop_0.7.yaml
        1.0.1_fixed_val_batch.yaml
        4.0.1_r101_step_45k_90k_1024_drop_0.9.yaml
        4.5.10_effnet_b6_45k_90k_1024_feat_350_drop_0.9.yaml
        2.4.0_r50_cosine_10k.yaml
        4.8.7_r50_step_45k_90k_feat_350_drop_0.85_aug0.yaml
        2.0.2_nasfpn_resnet152.yaml
        4.4.0_effnet_b7_45k_90k_1024_batch_32.yaml
        4.2.1_effnet_b7_45k_90k_1024_frozen_20.yaml
        4.5.8_effnet_b6_45k_90k_1024_feat_300_drop_0.9.yaml
        4.4.6_effnet_b7_45k_90k_1024_feat_350_drop_0.85.yaml
        4.2.0_effnet_b7_45k_90k_1024.yaml
        1.0.3_constant.yaml
        2.1.3_nasfpn_resnet200_1024_aug_v1.yaml
        2.6.8_r50_features_400_drop_0.85_1024.yaml
        factory.py
        retinanet_config.py
        __init__.py
      - main.py
      - README.md
      - export_saved_model.py
    - transformer
      - README.md
    - mask_rcnn
      - learning_rates.py
      - postprocess_ops.py
      - fpn.py
      - evaluation.py
      - mask_rcnn_main.py
      - anchors.py
      - mask_rcnn_model.py
      - losses.py
      - resnet.py
      - distributed_executer.py
      - dataloader.py
      - coco_utils.py
      - tpu_normalization.py
      - heads.py
      - training_ops.py
      - spatial_transform_ops.py
      - serving.py
      - configs
        mask_rcnn_config.py
        cloud
        v2-128.yaml
        v3-128.yaml
        v3-8.yaml
        v2-32.yaml
        v3-32.yaml
        v2-8.yaml
        __init__.py
      - README.md
      - mask_rcnn_k8s.yaml
      - object_detection
        balanced_positive_negative_sampler.py
        region_similarity_calculator.py
        ops.py
        visualization_utils.py
        shape_utils.py
        target_assigner.py
        preprocessor.py
        minibatch_sampler.py
        box_list.py
        faster_rcnn_box_coder.py
        argmax_matcher.py
        matcher.py
        box_coder.py
        __init__.py
        tf_example_decoder.py
      - box_utils.py
      - export_saved_model.py
      - preprocess_ops.py
      - roi_ops.py
      - coco_metric.py
    - resnet
      - imagenet_input.py
      - lars_util.py
      - resnet_preprocessing.py
      - resnet_k8s.yaml
      - resnet_model_test.py
      - resnet_model.py
      - configs
        resnet101.yaml
        resnet_config.py
        resnet152.yaml
        cloud
        v2-128.yaml
        v3-1024.yaml
        v3-128.yaml
        v3-8.yaml
        v2-32.yaml
        v3-512.yaml
        v3-64.yaml
        v3-32.yaml
        v2-8.yaml
        v2-512.yaml
        v3-2048.yaml
        v3-256.yaml
        v2-256.yaml
        __init__.py
        resnet200.yaml
      - __init__.py
      - README.md
      - resnet_main.py
      - benchmark
        read_training_time.py
        resnet_benchmark.py
        __init__.py
        README.md
    - amoeba_net
      - amoeba_net.py
      - inception_preprocessing.py
      - network_utils_test.py
      - amoeba_net_k8s.yaml
      - amoeba_net_model.py
      - README.md
      - tf_hub.py
      - model_specs.py
      - model_builder.py
      - network_utils.py
    - retinanet
      - retinanet_model.py
      - retinanet_segmentation_main.py
      - retinanet_segmentation_model.py
      - retinanet_architecture.py
      - evaluation.py
      - anchors.py
      - postprocess.py
      - retinanet_k8s.yaml
      - dataloader.py
      - retinanet_tensorrt.py
      - README.md
      - object_detection
        region_similarity_calculator.py
        shape_utils.py
        target_assigner.py
        preprocessor.py
        box_list.py
        faster_rcnn_box_coder.py
        argmax_matcher.py
        matcher.py
        box_coder.py
        __init__.py
        tf_example_decoder.py
      - retinanet_main.py
      - coco_metric.py
    - mobilenet
      - vgg_preprocessing.py
      - mobilenet_model.py
      - mobilenet.py
      - inception_preprocessing.py
      - configs
        mobilenet_config.py
        cloud
        v2-128.yaml
        v3-1024.yaml
        v3-128.yaml
        v3-8.yaml
        v2-32.yaml
        v3-512.yaml
        v3-64.yaml
        v3-32.yaml
        v2-8.yaml
        v2-512.yaml
        v3-2048.yaml
        v3-256.yaml
        v2-256.yaml
        __init__.py
      - README.md
      - supervised_images.py
    - __init__.py
    - mnist
      - mnist_tpu.py
      - README.md
    - efficientnet
      - eval_ckpt_main.py
      - autoaugment.py
      - edgetpu
        efficientnet_edgetpu_builder.py
        __init__.py
        README.md
      - efficientnet_builder.py
      - efficientnet_model.py
      - g3doc
      - imagenet_input.py
      - preprocessing.py
      - main.py
      - README.md
      - utils.py
      - export_model.py
    - mnasnet
      - eval_ckpt_main.py
      - mnasnet_models.py
      - post_quantization.py
      - g3doc
      - imagenet_input.py
      - mixnet
        g3doc
        mixnet_builder.py
        custom_layers.py
        __init__.py
        README.md
        mixnet_model.py
      - preprocessing.py
      - mnasnet_main.py
      - configs
        mnasnet_config.py
        cloud
        v3-8.yaml
        v2-32.yaml
        v3-32.yaml
        v2-8.yaml
        gpu.yaml
        __init__.py
      - README.md
      - mnas_utils.py
      - mnasnet_model.py
  - common
    - tpu_profiler_hook.py
    - inference_warmup.py
    - __init__.py
    - imagenet.py
  - setup.py
  - hyperparameters
    - flags_to_params.py
    - common_hparams_flags.py
    - params_dict.py
    - common_tpu_flags.py
    - __init__.py
  - experimental
    - mnist_keras
      - mnist.py
    - keras_colab
      - shakespeare_lstm.py
      - README.md
    - keras_application
      - application_model.py
    - dcgan
      - dcgan_main.py
      - mnist_input.py
      - cifar_model.py
      - mnist_model.py
      - README.md
      - cifar_input.py
    - qanet
      - data.py
      - preprocess.py
      - run.py
      - run_lib.py
      - testdata
        train-v1.1.json
        vocab.vec
        train_0000
      - model.py
      - README.md
      - utils.py
    - distribution_strategy
      - imagenet_input.py
      - resnet_preprocessing.py
      - resnet_model.py
      - resnet_estimator.py
    - resnet50_keras
      - resnet50_tf2.py
      - model_saving_utils.py
      - resnet50.py
      - imagenet_input.py
      - resnet_preprocessing.py
      - resnet50_ctl_tf1.py
      - resnet50_ctl_tf2.py
      - resnet_model.py
      - README.md
      - resnet50_test.py
    - cifar_keras
      - README.md
      - cifar_keras.py
    - inception
      - inception_v3.py
      - inception_v2_tpu_model.py
      - vgg_preprocessing.py
      - inception_v4_model.py
      - inception_v3_old.py
      - inception_preprocessing.py
      - inception_v3_k8s.yaml
      - inception_v2.py
      - inception_v4.py
      - imagenet.py
    - densenet_keras
      - densenet_keras_model.py
      - vgg_preprocessing.py
      - densenet_keras_imagenet.py
      - README.md
    - inference
      - setup-pool.sh
      - load_test_client.py
      - api_config.yaml
      - openapi.yaml
      - api_descriptor.pb
    - ncf
      - ncf_main.py
      - README.md
    - deeplab
      - data_pipeline.py
      - model.py
      - main.py
      - README.md
    - show_and_tell
      - show_and_tell_tpu_test.py
      - README
      - show_and_tell_model.py
      - inputs.py
      - image_processing.py
      - train.py
      - configuration.py
      - image_embedding.py
    - mnist_jupyter
      - launch.sh
- benchmarks
  - ResNet-50_v1.5_Performance_Comparison_TensorFlow_1.12_GCP.md
- README.md
- scripts
  - train_on_fold.sh
  - prepare_folds.sh
  - add_extra_data.sh
  - export_fold.sh
  - prepare_tfrecords_v3_balanced.sh
  - prepare_tfrecords_v1.sh
  - train_on_dataset.sh
  - build_validation.py
  - filter_dataset.py
  - prepare_datasets.sh
  - extra
    - class-ids-human-body-parts-and-mammal.txt
  - upload_files_to_gcs.sh
  - prepare_tfrecords_v2_removed_crowd.sh
  - kfold_split.py
  - gen_tfrecords.py
  - split_classes.py
  - prepare_tfrecords_v4_pseudo_labels.sh
  - export_saved_model.sh
  - blacklist.txt
  - build_leaf_classes_list.py
  - prepare_tfrecords_v5_balanced_pseudo_labels.sh
  - inference
    - merge_all_subs.sh
    - trim_sub_by_threshold.py
    - add_parents.py
    - ready_models.txt
    - gen_sub.py
    - cloud_inference.py
    - docker_run.sh
    - partial_inference.py
    - docker_build.sh
    - all_models.yml
    - gen_all_subs.sh
    - drop_parents.py
    - soft_nms.pyx
    - merge_subs.py
    - join_subs.py
    - trim_sub_decimal_digits.py
    - inference.py
    - trim_sub_by_num_of_predicts.py
    - join_predicted_parts.py
    - requirements.txt
    - Dockerfile
  - export_models.py
  - export_all_folds.sh
  - .gitignore
  - gen_coco_val_json.py
  - debug.py
  - add_data_from_testset.py
- tools
  - diagnostics
    - diagnostics.py
  - datasets
    - download_and_preprocess_coco.sh
    - download_and_preprocess_coco_k8s.yaml
    - create_coco_tf_record.py
    - imagenet_to_gcs.py
    - tfrecords_to_bigtable.py
    - README.md
    - imagenet_to_gcs_k8s.yaml
    - jpeg_to_tf_record.py
  - ctpu
    - ctrl
      - resourcemgmt.go
      - gce_test.go
      - ctrl_test.go
      - gcloud_cli_test.go
      - tpu_test.go
      - servicemgmt.go
      - ctrl.go
      - gcloud_cli.go
      - tpu.go
      - resourcemgmt_test.go
      - gce.go
    - commands
      - status.go
      - common.go
      - status_test.go
      - tpu_locations.go
      - config_cmd.go
      - tpu_size_test.go
      - list_test.go
      - pause_test.go
      - delete.go
      - quota.go
      - common_test.go
      - up_test.go
      - pause.go
      - list.go
      - up.go
      - auth_test.go
      - delete_test.go
      - tpu_size.go
      - version.go
      - tf_versions.go
      - restart.go
      - auth.go
      - tpu_locations_test.go
      - restart_test.go
    - config
      - config_gce.go
      - config.go
      - devshell_test.go
      - testdata
        gcloud
        corrupted2
        application_default_credentials.json
        README.md
        active_config
        corrupted
        application_default_credentials.json
        README.md
        active_config
        configurations
        config_default
        clean
        application_default_credentials.json
        README.md
        active_config
        configurations
        config_ctpu9
        no_config
        README.md
        incomplete
        application_default_credentials.json
        README.md
        active_config
        configurations
        config_ctpu9
        no_app_creds
        README.md
        active_config
        configurations
        config_ctpu9
      - config_gcloud_test.go
      - config_gcloud.go
      - config_test.go
      - devshell.go
    - tutorial.md
    - README.md
    - main.go
    - .gitignore
  - docker
    - Dockerfile.util
    - Dockerfile
  - colab
    - BUILD
  - kubernetes
    - tensorboard_k8s.yaml
    - tpu_profiler_k8s.yaml
- .gitignore

# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

r"""Script to download the Imagenet dataset and upload to gcs.

To run the script, set up a virtualenv with the following libraries installed.
- `gcloud`: Follow the instructions on
  [cloud SDK docs](https://cloud.google.com/sdk/downloads) followed by
  installing the python api using `pip install gcloud`.
- `google-cloud-storage`: Install with `pip install google-cloud-storage`
- `tensorflow`: Install with `pip install tensorflow`

Once you have all the above libraries set up, you should register on the
[Imagenet website](http://image-net.org/download-images) to get your
username and access_key.

Make sure you have around 300GB of disk space available on the machine where
you're running this script. You can run the script using the following command.
```
python imagenet_to_gcs.py \
  --project="TEST_PROJECT" \
  --gcs_output_path="gs://TEST_BUCKET/IMAGENET_DIR" \
  --local_scratch_dir="./imagenet" \
  --imagenet_username=FILL_ME_IN \
  --imagenet_access_key=FILL_ME_IN
```

Optionally if the raw data has already been downloaded you can provide a direct
`raw_data_dir` path. If a raw data directory is provided it should be in
the format:
- Training images: train/n03062245/n03062245_4620.JPEG
- Validation Images: validation/ILSVRC2012_val_00000001.JPEG
- Validation Labels: synset_labels.txt
"""

import math
import os
import random
import tarfile
import urllib

from absl import app
from absl import flags
import tensorflow as tf

from google.cloud import storage

# Command-line flags. They are registered with absl when this module is
# imported and are read through `FLAGS` below.
flags.DEFINE_string(
    'project', None, 'Google cloud project id for uploading the dataset.')
flags.DEFINE_string(
    'gcs_output_path', None, 'GCS path for uploading the dataset.')
# Also used as the parent directory for the generated TFRecord shards.
flags.DEFINE_string(
    'local_scratch_dir', None, 'Scratch directory path for temporary files.')
flags.DEFINE_string(
    'raw_data_dir', None, 'Directory path for raw Imagenet dataset. '
    'Should have train and validation subdirectories inside it.')
flags.DEFINE_string(
    'imagenet_username', None, 'Username for Imagenet.org account')
flags.DEFINE_string(
    'imagenet_access_key', None, 'Access Key for Imagenet.org account')
flags.DEFINE_boolean(
    'gcs_upload', True, 'Set to false to not upload to gcs.')

FLAGS = flags.FLAGS

# Remote locations: the directory serving the raw ImageNet 2012 archives and
# the text file holding the validation synset labels.
BASE_URL = 'http://www.image-net.org/challenges/LSVRC/2012/nnoupb/'
LABELS_URL = 'https://raw.githubusercontent.com/tensorflow/models/master/research/inception/inception/data/imagenet_2012_validation_synset_labels.txt'  # pylint: disable=line-too-long

# Archive names (appended to BASE_URL and reused as the local file names) and
# the local file name under which the validation labels are stored.
TRAINING_FILE = 'ILSVRC2012_img_train.tar'
VALIDATION_FILE = 'ILSVRC2012_img_val.tar'
LABELS_FILE = 'synset_labels.txt'

# Number of TFRecord shards written for each split.
TRAINING_SHARDS = 1024
VALIDATION_SHARDS = 128

# Sub-directory names used under both the raw data directory and the local
# scratch directory; also reused as the TFRecord file-name prefix.
TRAINING_DIRECTORY = 'train'
VALIDATION_DIRECTORY = 'validation'


def _check_or_create_dir(directory):
  """Make sure `directory` exists, creating it when it is missing."""
  if tf.gfile.Exists(directory):
    return
  tf.gfile.MakeDirs(directory)


def download_dataset(raw_data_dir):
  """Download the Imagenet dataset and unpack it under `raw_data_dir`.

  The training tarball, the validation labels and the validation tarball are
  fetched in that order. The training tarball holds one inner tarball per
  synset; each inner tarball is unpacked into its own sub-directory of
  `<raw_data_dir>/train` and then deleted. Validation images are unpacked
  into `<raw_data_dir>/validation`.

  Args:
    raw_data_dir: string, directory that receives the downloaded archives and
      the extracted images. It is created if it does not exist.
  """
  # `urlretrieve` moved between Python 2 (`urllib`) and Python 3
  # (`urllib.request`); resolve whichever one this interpreter provides.
  try:
    from urllib.request import urlretrieve  # pylint: disable=g-import-not-at-top
  except ImportError:
    from urllib import urlretrieve  # pylint: disable=g-import-not-at-top

  def _download(url, filename):
    """Download the dataset at the provided filepath."""
    urlretrieve(url, filename)

  def _get_members(filename):
    """Get all members of a tarfile."""
    # The context manager closes the archive even if reading the index fails.
    with tarfile.open(filename) as tar:
      return tar.getmembers()

  def _untar_file(filename, directory, member=None):
    """Untar a file (or a single member of it) at the provided directory."""
    _check_or_create_dir(directory)
    # NOTE(review): the archives are fetched over plain HTTP and extracted
    # without validating member paths; only run this against a trusted source.
    with tarfile.open(filename) as tar:
      if member is None:
        tar.extractall(path=directory)
      else:
        tar.extract(member, path=directory)

  # Check if raw_data_dir exists
  _check_or_create_dir(raw_data_dir)

  # Download the training data
  tf.logging.info('Downloading the training set. This may take a few hours.')
  directory = os.path.join(raw_data_dir, TRAINING_DIRECTORY)
  filename = os.path.join(raw_data_dir, TRAINING_FILE)
  _download(BASE_URL + TRAINING_FILE, filename)

  # The training tarball contains multiple tar balls inside it. Extract them
  # in order to create a clean directory structure.
  for member in _get_members(filename):
    # Inner tarballs are named after their synset; the part before the first
    # '.' becomes the sub-directory name.
    subdirectory = os.path.join(directory, member.name.split('.')[0])
    sub_tarfile = os.path.join(subdirectory, member.name)

    _untar_file(filename, subdirectory, member)
    _untar_file(sub_tarfile, subdirectory)
    os.remove(sub_tarfile)

  # Download synset_labels for validation set
  tf.logging.info('Downloading the validation labels.')
  _download(LABELS_URL, os.path.join(raw_data_dir, LABELS_FILE))

  # Download the validation data
  tf.logging.info('Downloading the validation set. This may take a few hours.')
  directory = os.path.join(raw_data_dir, VALIDATION_DIRECTORY)
  filename = os.path.join(raw_data_dir, VALIDATION_FILE)
  _download(BASE_URL + VALIDATION_FILE, filename)
  _untar_file(filename, directory)


def _int64_feature(value):
  """Wrap an integer, or a list of integers, in an int64 Feature proto."""
  values = value if isinstance(value, list) else [value]
  int64_list = tf.train.Int64List(value=values)
  return tf.train.Feature(int64_list=int64_list)


def _bytes_feature(value):
  """Wrapper for inserting bytes features into Example proto.

  Args:
    value: bytes, or a text string. Text is encoded as UTF-8 first, so that
      callers passing plain string literals also work on Python 3, where
      `str` is not `bytes`.

  Returns:
    A `tf.train.Feature` holding a single-element bytes list.
  """
  # `type(u'')` is `unicode` on Python 2 and `str` on Python 3, so only real
  # text gets encoded; byte strings pass through untouched.
  if isinstance(value, type(u'')):
    value = value.encode('utf-8')
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _convert_to_example(filename, image_buffer, label, synset, height, width):
  """Assemble the Example proto describing one image.

  Args:
    filename: string, path to an image file, e.g., '/path/to/example.JPG'
    image_buffer: string, JPEG encoding of RGB image
    label: integer, identifier for the ground truth for the network
    synset: string, unique WordNet ID specifying the label, e.g., 'n02323233'
    height: integer, image height in pixels
    width: integer, image width in pixels
  Returns:
    Example proto
  """
  # Every image is recorded as a 3-channel RGB JPEG; only the base name of
  # the source path is stored.
  feature_map = {
      'image/height': _int64_feature(height),
      'image/width': _int64_feature(width),
      'image/colorspace': _bytes_feature('RGB'),
      'image/channels': _int64_feature(3),
      'image/class/label': _int64_feature(label),
      'image/class/synset': _bytes_feature(synset),
      'image/format': _bytes_feature('JPEG'),
      'image/filename': _bytes_feature(os.path.basename(filename)),
      'image/encoded': _bytes_feature(image_buffer),
  }
  features = tf.train.Features(feature=feature_map)
  return tf.train.Example(features=features)


def _is_png(filename):
  """Determine if a file contains a PNG format image.

  Args:
    filename: string, path of the image file.

  Returns:
    boolean indicating if the image is a PNG.
  """
  # File list from:
  # https://github.com/cytsai/ilsvrc-cmyk-image-list
  return 'n02105855_2933.JPEG' in filename


def _is_cmyk(filename):
  """Determine if file contains a CMYK JPEG format image.

  Args:
    filename: string, path of the image file.

  Returns:
    boolean indicating if the image is a JPEG encoded with CMYK color space.
  """
  # File list from:
  # https://github.com/cytsai/ilsvrc-cmyk-image-list
  blacklist = set(['n01739381_1309.JPEG', 'n02077923_14822.JPEG',
                   'n02447366_23489.JPEG', 'n02492035_15739.JPEG',
                   'n02747177_10752.JPEG', 'n03018349_4028.JPEG',
                   'n03062245_4620.JPEG', 'n03347037_9675.JPEG',
                   'n03467068_12171.JPEG', 'n03529860_11437.JPEG',
                   'n03544143_17228.JPEG', 'n03633091_5218.JPEG',
                   'n03710637_5125.JPEG', 'n03961711_5286.JPEG',
                   'n04033995_2932.JPEG', 'n04258138_17003.JPEG',
                   'n04264628_27969.JPEG', 'n04336792_7448.JPEG',
                   'n04371774_5854.JPEG', 'n04596742_4225.JPEG',
                   'n07583066_647.JPEG', 'n13037406_4650.JPEG'])
  return os.path.basename(filename) in blacklist


class ImageCoder(object):
  """Bundles the TensorFlow ops used to re-encode and decode images."""

  def __init__(self):
    # One Session is shared by every conversion defined below.
    self._sess = tf.Session()

    # PNG bytes -> RGB JPEG bytes.
    self._png_data = tf.placeholder(dtype=tf.string)
    png_pixels = tf.image.decode_png(self._png_data, channels=3)
    self._png_to_jpeg = tf.image.encode_jpeg(
        png_pixels, format='rgb', quality=100)

    # CMYK JPEG bytes -> RGB JPEG bytes.
    self._cmyk_data = tf.placeholder(dtype=tf.string)
    cmyk_pixels = tf.image.decode_jpeg(self._cmyk_data, channels=0)
    self._cmyk_to_rgb = tf.image.encode_jpeg(
        cmyk_pixels, format='rgb', quality=100)

    # RGB JPEG bytes -> decoded 3-channel image.
    self._decode_jpeg_data = tf.placeholder(dtype=tf.string)
    self._decode_jpeg = tf.image.decode_jpeg(
        self._decode_jpeg_data, channels=3)

  def png_to_jpeg(self, image_data):
    """Re-encode PNG bytes as JPEG bytes."""
    feed = {self._png_data: image_data}
    return self._sess.run(self._png_to_jpeg, feed_dict=feed)

  def cmyk_to_rgb(self, image_data):
    """Re-encode CMYK JPEG bytes as RGB JPEG bytes."""
    feed = {self._cmyk_data: image_data}
    return self._sess.run(self._cmyk_to_rgb, feed_dict=feed)

  def decode_jpeg(self, image_data):
    """Decode JPEG bytes and check the result is a rank-3, 3-channel image."""
    feed = {self._decode_jpeg_data: image_data}
    decoded = self._sess.run(self._decode_jpeg, feed_dict=feed)
    assert len(decoded.shape) == 3
    assert decoded.shape[2] == 3
    return decoded


def _process_image(filename, coder):
  """Process a single image file.

  The file is read as raw bytes, re-encoded to an RGB JPEG when it is the
  known PNG or one of the known CMYK images, and then decoded once to obtain
  its dimensions.

  Args:
    filename: string, path to an image file e.g., '/path/to/example.JPG'.
    coder: instance of ImageCoder to provide TensorFlow image coding utils.
  Returns:
    image_buffer: string, JPEG encoding of RGB image.
    height: integer, image height in pixels.
    width: integer, image width in pixels.
  """
  # Read the image file. Binary mode is required: image bytes are not valid
  # text, and a text-mode read would try to decode them on Python 3.
  with tf.gfile.FastGFile(filename, 'rb') as f:
    image_data = f.read()

  # Clean the dirty data.
  if _is_png(filename):
    # 1 image is a PNG.
    tf.logging.info('Converting PNG to JPEG for %s', filename)
    image_data = coder.png_to_jpeg(image_data)
  elif _is_cmyk(filename):
    # 22 JPEG images are in CMYK colorspace.
    tf.logging.info('Converting CMYK to RGB for %s', filename)
    image_data = coder.cmyk_to_rgb(image_data)

  # Decode the RGB JPEG.
  image = coder.decode_jpeg(image_data)

  # Check that image converted to RGB
  assert len(image.shape) == 3
  assert image.shape[2] == 3
  height, width = image.shape[0], image.shape[1]

  return image_data, height, width


def _process_image_files_batch(coder, output_file, filenames, synsets, labels):
  """Processes and saves list of images as TFRecords.

  Args:
    coder: instance of ImageCoder to provide TensorFlow image coding utils.
    output_file: string, path of the TFRecord file to write.
    filenames: list of strings; each string is a path to an image file
    synsets: list of strings; each string is a unique WordNet ID
    labels: map of string to integer; id for all synset labels
  """
  # The context manager closes the writer even when an image fails to
  # process, so the file handle is not leaked.
  with tf.python_io.TFRecordWriter(output_file) as writer:
    for filename, synset in zip(filenames, synsets):
      image_buffer, height, width = _process_image(filename, coder)
      label = labels[synset]
      example = _convert_to_example(filename, image_buffer, label,
                                    synset, height, width)
      writer.write(example.SerializeToString())


def _process_dataset(filenames, synsets, labels, output_directory, prefix,
                     num_shards):
  """Processes and saves list of images as TFRecords.

  The inputs are split into `num_shards` consecutive chunks and each chunk is
  written to its own file named '<prefix>-<shard>-of-<num_shards>' inside
  `output_directory`.

  Args:
    filenames: list of strings; each string is a path to an image file
    synsets: list of strings; each string is a unique WordNet ID
    labels: map of string to integer; id for all synset labels
    output_directory: path where output files should be created
    prefix: string; prefix for each file
    num_shards: number of chunks to split the filenames into

  Returns:
    files: list of tf-record filepaths created from processing the dataset.
  """
  _check_or_create_dir(output_directory)
  # Integer ceiling division. `len(...) / num_shards` truncates under Python 2
  # before any rounding can happen, which would leave the tail of the list
  # out of every shard; this form rounds up on both Python 2 and Python 3.
  chunksize = -(-len(filenames) // num_shards)
  coder = ImageCoder()

  files = []

  for shard in range(num_shards):
    start = shard * chunksize
    end = start + chunksize
    chunk_files = filenames[start:end]
    chunk_synsets = synsets[start:end]
    output_file = os.path.join(
        output_directory, '%s-%.5d-of-%.5d' % (prefix, shard, num_shards))
    _process_image_files_batch(coder, output_file, chunk_files,
                               chunk_synsets, labels)
    tf.logging.info('Finished writing file: %s', output_file)
    files.append(output_file)
  return files


def convert_to_tf_records(raw_data_dir):
  """Convert the Imagenet dataset into TF-Record dumps.

  Training images are globbed from `<raw_data_dir>/TRAINING_DIRECTORY/*/*.JPEG`
  and labelled by their parent directory name. Validation images are globbed
  from `<raw_data_dir>/VALIDATION_DIRECTORY/*.JPEG`, sorted, and paired
  line-by-line with the synsets listed in `<raw_data_dir>/LABELS_FILE`.
  Records are written under `FLAGS.local_scratch_dir`.

  Args:
    raw_data_dir: path to the directory holding the raw dataset.

  Returns:
    A `(training_records, validation_records)` tuple; each element is the
    list of TFRecord file paths returned by `_process_dataset`.
  """

  # Shuffle training records to ensure we are distributing classes
  # across the batches. The fixed seed makes the permutation repeatable for
  # a given glob order.
  random.seed(0)
  def make_shuffle_idx(n):
    # random.shuffle works in place and needs a mutable sequence; on
    # Python 3 `range(n)` is immutable, so materialise it as a list first.
    order = list(range(n))
    random.shuffle(order)
    return order

  # Glob all the training files
  training_files = tf.gfile.Glob(
      os.path.join(raw_data_dir, TRAINING_DIRECTORY, '*', '*.JPEG'))

  # Get training file synset labels from the directory name
  training_synsets = [
      os.path.basename(os.path.dirname(f)) for f in training_files]

  # Apply the same permutation to both lists so they stay aligned.
  training_shuffle_idx = make_shuffle_idx(len(training_files))
  training_files = [training_files[i] for i in training_shuffle_idx]
  training_synsets = [training_synsets[i] for i in training_shuffle_idx]

  # Glob all the validation files
  validation_files = sorted(tf.gfile.Glob(
      os.path.join(raw_data_dir, VALIDATION_DIRECTORY, '*.JPEG')))

  # Get validation file synset labels from labels.txt. Read inside a `with`
  # block so the file handle is closed deterministically.
  with tf.gfile.FastGFile(
      os.path.join(raw_data_dir, LABELS_FILE), 'r') as labels_file:
    validation_synsets = labels_file.read().splitlines()

  # Create unique ids for all synsets; ids start at 1 and follow the sorted
  # synset order.
  labels = {v: k + 1 for k, v in enumerate(
      sorted(set(validation_synsets + training_synsets)))}

  # Create training data
  tf.logging.info('Processing the training data.')
  training_records = _process_dataset(
      training_files, training_synsets, labels,
      os.path.join(FLAGS.local_scratch_dir, TRAINING_DIRECTORY),
      TRAINING_DIRECTORY, TRAINING_SHARDS)

  # Create validation data
  tf.logging.info('Processing the validation data.')
  validation_records = _process_dataset(
      validation_files, validation_synsets, labels,
      os.path.join(FLAGS.local_scratch_dir, VALIDATION_DIRECTORY),
      VALIDATION_DIRECTORY, VALIDATION_SHARDS)

  return training_records, validation_records


def upload_to_gcs(training_records, validation_records):
  """Upload TF-Record files to GCS, at provided path.

  The destination is read from `FLAGS.gcs_output_path` and the client is
  created for `FLAGS.project`. Every file is uploaded under the same key
  prefix, named after its local basename.

  Args:
    training_records: list of local paths to the training TFRecord files.
    validation_records: list of local paths to the validation TFRecord files.
  """

  # Find the GCS bucket_name and key_prefix for dataset files. The slice
  # drops the first five characters, i.e. assumes the path starts with
  # 'gs://'.
  path_parts = FLAGS.gcs_output_path[5:].split('/', 1)
  bucket_name = path_parts[0]
  if len(path_parts) == 1 or not path_parts[1]:
    # 'gs://bucket' and 'gs://bucket/' both mean "no prefix". Without the
    # emptiness check the latter would yield a prefix of '/' and every
    # object name would start with a stray slash.
    key_prefix = ''
  elif path_parts[1].endswith('/'):
    key_prefix = path_parts[1]
  else:
    key_prefix = path_parts[1] + '/'

  client = storage.Client(project=FLAGS.project)
  bucket = client.get_bucket(bucket_name)

  def _upload_files(filenames):
    """Upload a list of files into a specific subdirectory."""
    for i, filename in enumerate(sorted(filenames)):
      blob = bucket.blob(key_prefix + os.path.basename(filename))
      blob.upload_from_filename(filename)
      # Only log every 20th upload to keep the output readable.
      if not i % 20:
        tf.logging.info('Finished uploading file: %s' % filename)

  # Upload training dataset
  tf.logging.info('Uploading the training data.')
  _upload_files(training_records)

  # Upload validation dataset
  tf.logging.info('Uploading the validation data.')
  _upload_files(validation_records)


def main(argv):  # pylint: disable=unused-argument
  """Validate flags, then download, convert and optionally upload the data.

  Args:
    argv: positional command-line arguments; unused.

  Raises:
    ValueError: if uploading is requested without a project or without a
      `gs://` output path, or if no local scratch directory is given.
  """
  tf.logging.set_verbosity(tf.logging.INFO)

  # These flags are only required when an upload was requested.
  if FLAGS.gcs_upload:
    if FLAGS.project is None:
      raise ValueError('GCS Project must be provided.')
    if FLAGS.gcs_output_path is None:
      raise ValueError('GCS output path must be provided.')
    if not FLAGS.gcs_output_path.startswith('gs://'):
      raise ValueError('GCS output path must start with gs://')

  if FLAGS.local_scratch_dir is None:
    raise ValueError('Scratch directory path must be provided.')

  # Fall back to downloading into the scratch directory when no local copy
  # of the raw data was supplied.
  source_dir = FLAGS.raw_data_dir
  if source_dir is None:
    source_dir = os.path.join(FLAGS.local_scratch_dir, 'raw_data')
    tf.logging.info('Downloading data to raw_data_dir: %s' % source_dir)
    download_dataset(source_dir)

  # Write the TFRecord shards for both splits.
  train_shards, eval_shards = convert_to_tf_records(source_dir)

  # Push the shards to GCS only on request.
  if FLAGS.gcs_upload:
    upload_to_gcs(train_shards, eval_shards)


# Script entry point: when this file is executed directly (not imported),
# hand `main` to app.run.
if __name__ == '__main__':
  app.run(main)