# Copyright 2016 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.

"""Implements usefule CSV utilities."""
from __future__ import absolute_import
from __future__ import unicode_literals
from builtins import next
from builtins import str as newstr
from builtins import range
from builtins import object


import csv
import os
import pandas as pd
import random
import sys

try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
import tempfile
import datalab.storage
import datalab.utils


_MAX_CSV_BYTES = 10000000  # 10 MB


class Csv(object):
  """Represents a CSV file in GCS or locally with same schema.
  """
  def __init__(self, path, delimiter=b','):
    """Initializes an instance of Csv.

    Args:
      path: path of the CSV file; either a local path or a GCS path ("gs://...").
      delimiter: the separator used to parse a CSV line.
    """
    self._path = path
    # The csv module expects a native str delimiter: bytes on Python 2 and
    # text on Python 3, so decode a bytes delimiter under Python 3.
    if sys.version_info.major > 2 and isinstance(delimiter, bytes):
      delimiter = delimiter.decode('utf-8')
    self._delimiter = delimiter

  @property
  def path(self):
    return self._path

  @staticmethod
  def _read_gcs_lines(path, max_lines=None):
    """Reads up to max_lines lines from a GCS object via datalab.storage."""
    return datalab.storage.Item.from_url(path).read_lines(max_lines)

  @staticmethod
  def _read_local_lines(path, max_lines=None):
    """Reads up to max_lines newline-stripped lines from a local file."""
    lines = []
    with open(path) as f:
      for line in f:
        if max_lines is not None and len(lines) >= max_lines:
          break
        lines.append(line.rstrip('\n'))
    return lines

  def _is_probably_categorical(self, column):
    """Heuristically decides whether a string column should be treated as categorical."""
    if newstr(column.dtype) != 'object':
      # Only string types (represented in a DataFrame as object) can be categorical.
      return False
    if max(len(newstr(p)) for p in column) > 100:
      return False  # Value too long to be a category.
    if len(set(column)) > 100:
      return False  # Too many unique values to be a category.
    return True

  def browse(self, max_lines=None, headers=None):
    """Try reading specified number of lines from the CSV object.
    Args:
      max_lines: max number of lines to read. If None, the whole file is read
      headers: a list of strings as column names. If None, it will use "col0, col1..."
    Returns:
      A pandas DataFrame with the schema inferred from the data.
    Raises:
      Exception if the csv object cannot be read or not enough lines to read, or the
      headers size does not match columns size.
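
    Example (a minimal sketch; the path is hypothetical):

      df = Csv('./data.csv').browse(max_lines=100, headers=['id', 'label'])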
    """
    if self.path.startswith('gs://'):
      lines = Csv._read_gcs_lines(self.path, max_lines)
    else:
      lines = Csv._read_local_lines(self.path, max_lines)
    if len(lines) == 0:
      return pd.DataFrame(columns=headers)
    columns_size = len(next(csv.reader([lines[0]], delimiter=self._delimiter)))
    if headers is None:
      headers = ['col' + newstr(e) for e in range(columns_size)]
    if len(headers) != columns_size:
      raise Exception('Number of columns in the CSV does not match the number of headers')
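    # Reassemble the lines into one in-memory buffer so pandas can parse them in one pass.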
    buf = StringIO()
    for line in lines:
      buf.write(line)
      buf.write('\n')
    buf.seek(0)
    df = pd.read_csv(buf, names=headers, delimiter=self._delimiter)
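    # Convert string columns that look categorical to the pandas "category" dtype.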
    for key, col in df.iteritems():
      if self._is_probably_categorical(col):
        df[key] = df[key].astype('category')
    return df

  def _create_federated_table(self, skip_header_rows):
    """Creates a BigQuery federated (external) table backed by the GCS CSV file."""
    import datalab.bigquery as bq
    df = self.browse(1, None)
    # Read each column as STRING because we only want to sample rows.
    schema_train = bq.Schema([{'name': name, 'type': 'STRING'} for name in df.keys()])
    options = bq.CSVOptions(skip_leading_rows=(1 if skip_header_rows is True else 0))
    return bq.FederatedTable.from_storage(self.path,
                                          csv_options=options,
                                          schema=schema_train,
                                          max_bad_records=0)

  def _get_gcs_csv_row_count(self, federated_table):
    """Counts the rows in the GCS CSV via a BigQuery query over the federated table."""
    import datalab.bigquery as bq
    results = bq.Query('SELECT count(*) from data',
                       data_sources={'data': federated_table}).results()
    return results[0].values()[0]

  def sample_to(self, count, skip_header_rows, strategy, target):
    """Sample rows from GCS or local file and save results to target file.

    Args:
      count: number of rows to sample. If strategy is "BIGQUERY", it is used as approximate number.
      skip_header_rows: whether to skip first row when reading from source.
      strategy: can be "LOCAL" or "BIGQUERY". If local, the sampling happens in local memory,
          and number of resulting rows matches count. If BigQuery, sampling is done
          with BigQuery in cloud, and the number of resulting rows will be approximated to
          count.
      target: The target file path, can be GCS or local path.
    Raises:
      Exception if strategy is "BIGQUERY" but source is not a GCS path.
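
    Example (a minimal sketch; the paths are hypothetical):

      csv_data = Csv('gs://my-bucket/big.csv')
      csv_data.sample_to(1000, skip_header_rows=True, strategy='BIGQUERY',
                         target='gs://my-bucket/sample.csv')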
    """
    # TODO(qimingj) Add unit test
    # Read data from source into DataFrame.

    if strategy == 'BIGQUERY':
      import datalab.bigquery as bq
      if not self.path.startswith('gs://'):
        raise Exception('Cannot use BIGQUERY if data is not in GCS')
      federated_table = self._create_federated_table(skip_header_rows)
      row_count = self._get_gcs_csv_row_count(federated_table)
      query = bq.Query('SELECT * from data', data_sources={'data': federated_table})
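      # Request a sampling percentage so the expected number of returned rows is ~count.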
      sampling = bq.Sampling.random(count * 100 / float(row_count))
      sample = query.sample(sampling=sampling)
      df = sample.to_dataframe()
    elif strategy == 'LOCAL':
      local_file = self.path
      if self.path.startswith('gs://'):
        # Download to a securely created temp file (tempfile.mktemp is race-prone).
        fd, local_file = tempfile.mkstemp()
        os.close(fd)
        datalab.utils.gcs_copy_file(self.path, local_file)
      with open(local_file) as f:
        row_count = sum(1 for line in f)
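      # Pick rows to skip uniformly at random so that exactly `count` data rows
      # remain; the header row (if present) is never part of the skip population.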
      start_row = 1 if skip_header_rows is True else 0
      skip_count = row_count - count - 1 if skip_header_rows is True else row_count - count
      skip = sorted(random.sample(range(start_row, row_count), skip_count))
      header_row = 0 if skip_header_rows is True else None
      df = pd.read_csv(local_file, skiprows=skip, header=header_row, delimiter=self._delimiter)
      if self.path.startswith('gs://'):
        os.remove(local_file)
    else:
      raise Exception('strategy must be BIGQUERY or LOCAL')
    # Write to target.
    if target.startswith('gs://'):
      # Write locally first (in text mode, for Python 3), then copy the result to GCS.
      with tempfile.NamedTemporaryFile(mode='w') as f:
        df.to_csv(f, header=False, index=False, sep=self._delimiter)
        f.flush()
        datalab.utils.gcs_copy_file(f.name, target)
    else:
      with open(target, 'w') as f:
        df.to_csv(f, header=False, index=False, sep=self._delimiter)