Python google.cloud.storage.Client() Examples

The following code examples show how to use google.cloud.storage.Client(). They are drawn from open source Python projects.
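For reference, here is a minimal sketch of creating a client and writing and reading an object. It assumes Application Default Credentials are configured; the bucket and object names are hypothetical.

from google.cloud import storage

# Client() picks up Application Default Credentials and the default project
# from the environment (e.g. GOOGLE_APPLICATION_CREDENTIALS or gcloud config).
client = storage.Client()

# Hypothetical bucket and object names, for illustration only.
bucket = client.bucket('example-bucket')
blob = bucket.blob('path/to/object.txt')
blob.upload_from_string('hello')                 # write an object
text = blob.download_as_string().decode('utf8')  # read it back
print(text)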

Example 1
Project: Coulomb   Author: DynamoDS   File: storage_utils.py    MIT License
def move_blob(bucket_name, blob_name, new_bucket_name):
    """Moves a blob from one bucket to another with a new name."""
    storage_client = storage.Client()
    source_bucket = storage_client.get_bucket(bucket_name)
    source_blob = source_bucket.blob(blob_name)
    destination_bucket = storage_client.get_bucket(new_bucket_name)

    new_blob = source_bucket.copy_blob(
        source_blob, destination_bucket, blob_name)

    log('Blob {} in bucket {} copied to blob {} in bucket {}.'.format(
        source_blob.name, source_bucket.name, new_blob.name,
        destination_bucket.name))
    
    source_blob.delete()
    log('Blob {} deleted.'.format(source_blob.name))
Example 2
Project: Coulomb   Author: DynamoDS   File: data_files_to_sessions.py    MIT License
def move_blob(bucket_name, blob_name, new_bucket_name):
    """Moves a blob from one bucket to another with a new name."""
    storage_client = storage.Client()
    source_bucket = storage_client.get_bucket(bucket_name)
    source_blob = source_bucket.blob(blob_name)
    destination_bucket = storage_client.get_bucket(new_bucket_name)

    new_blob = source_bucket.copy_blob(
        source_blob, destination_bucket, blob_name)

    log('Blob {} in bucket {} copied to blob {} in bucket {}.'.format(
        source_blob.name, source_bucket.name, new_blob.name,
        destination_bucket.name))
    
    source_blob.delete()
    log('Blob {} deleted.'.format(source_blob.name))
Example 3
Project: gcp-variant-transforms   Author: googlegenomics   File: vcf_file_composer.py    Apache License 2.0
def _compose_files(project, bucket_name, blob_names, composite_name):
  # type: (str, str, List[str], str) -> None
  """Composes multiple files (up to 32 objects) in GCS to one.

  Args:
    project: The project name.
    bucket_name: The name of the bucket where the `components` and the new
      composite are saved.
    blob_names: A list of blob object names.
    composite_name: Name of the new composite.
  """
  bucket = storage.Client(project).get_bucket(bucket_name)
  output_file_blob = bucket.blob(composite_name)
  output_file_blob.content_type = 'text/plain'
  blobs = [bucket.get_blob(blob_name) for blob_name in blob_names]
  output_file_blob.compose(blobs) 
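Note that compose() accepts at most 32 source objects per call, so larger shard sets are typically merged in rounds. The sketch below only illustrates that idea; it is not the project's actual composer logic, and the intermediate object names are made up.

def compose_in_rounds(bucket, blob_names, composite_name, batch_size=32):
    """Illustrative only: repeatedly compose batches of blobs until one object remains."""
    # bucket = storage.Client(project).get_bucket(bucket_name), as in the example above.
    round_num = 0
    while len(blob_names) > 1:
        next_names = []
        for i in range(0, len(blob_names), batch_size):
            batch = [bucket.get_blob(name) for name in blob_names[i:i + batch_size]]
            target_name = '{}.round{}.{}'.format(composite_name, round_num, i // batch_size)
            target = bucket.blob(target_name)
            target.compose(batch)
            next_names.append(target_name)
        blob_names = next_names
        round_num += 1
    # Give the single remaining composite its final name; intermediate objects are
    # left in place and error handling is omitted in this sketch.
    bucket.rename_blob(bucket.get_blob(blob_names[0]), composite_name)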
Example 4
Project: gcp-variant-transforms   Author: googlegenomics   File: vcf_file_composer.py    Apache License 2.0
def __init__(self, project, bucket_name, blob_prefix):
    # type: (str, str, str) -> None
    """Initializes a `MultiProcessComposer`.

    This class composes all blobs that start with `blob_prefix` to one.

    Args:
      project: The project name.
      bucket_name: The name of the bucket where the blob components and the new
        composite are saved.
      blob_prefix: The prefix used to filter blobs. Only the blobs with this
        prefix will be composed.
    """
    self._project = project
    self._bucket_name = bucket_name
    self._blob_prefix = blob_prefix
    self._bucket = storage.Client(project).get_bucket(bucket_name) 
Example 5
Project: loaner   Author: google   File: storage.py    Apache License 2.0
def from_config(cls, config, creds=None):
    """Returns an initialized CloudStorageAPI object.

    Args:
      config: common.ProjectConfig, the project configuration.
      creds: auth.CloudCredentials, the credentials to use for client
          authentication.

    Returns:
      An authenticated CloudStorageAPI instance.
    """
    if creds is None:
      creds = auth.CloudCredentials(config, cls.SCOPES)
    client = storage.Client(
        project=config.project, credentials=creds.get_credentials(cls.SCOPES))
    return cls(config, client) 
Example 6
Project: genetic-curriculum   Author: Octavian-ai   File: file.py    Apache License 2.0
def copy_from_bucket(self):
    if 'google.cloud' in sys.modules and self.args.bucket is not None and self.args.gcs_dir is not None:
      client = storage.Client()
      bucket = client.get_bucket(self.args.bucket)
      blob = bucket.blob(self.gcs_path)
      os.makedirs(self.file_dir, exist_ok=True)
      with open(self.file_path, "wb" if self.binary else "w") as dest_file:
        try:
          blob.download_to_file(dest_file)
        except google.cloud.exceptions.NotFound:
          raise FileNotFoundError() 
Example 7
Project: analysis-py-utils   Author: verilylifesciences   File: bq.py    Apache License 2.0
def _run_async_query(self, query, job_config):
        # type: (str, QueryJobConfig) -> QueryJob
        """Run an asynchronous query with a given job config.

        This is a wrapper of google.cloud.bigquery.Client.query. It adds retry for polling after
        a QueryJob is created.

        Args:
            query: The query to run
            job_config: A QueryJobConfig for the job

        Returns:
            A QueryJob instance for the job to run the query
        """
        query_job = self.gclient.query(query, job_config=job_config,
                                       retry=self.default_retry_for_api_calls)
        # The above retry is for errors encountered in executing the jobs. The below retry is
        # for errors encountered in polling to see whether the job is done.
        query_job._retry = self.default_retry_for_async_jobs

        return query_job 
Example 8
Project: lookml-tools   Author: ww-tech   File: bq_writer.py    Apache License 2.0
def _upload_to_gcs(self, gcs_project_id, target_bucket_name, bucket_folder, filename):
        '''Upload a CSV file to GCS.

        Args:
            gcs_project_id (str): project name
            target_bucket_name (str): name of GCS bucket
            bucket_folder (str): name of GCS folder
            filename (str): filepath to upload

        Returns:
            nothing. Side effect is that data is uploaded to GCS

        '''
        storage_client = storage.Client(gcs_project_id)
        bucket = storage_client.get_bucket(target_bucket_name)
        path = bucket_folder + os.sep + filename
        logging.info("Loading to GCS: %s", path)
        blob = bucket.blob(path) #name in GCS
        blob.upload_from_filename(filename) 
Example 9
Project: koku   Author: project-koku   File: provider.py    GNU Affero General Public License v3.0
def cost_usage_source_is_reachable(self, credential_name, data_source):
        """
        Verify that the GCP bucket exists and is reachable.

        Args:
            credential_name (object): not used; only present for interface compatibility
            data_source (dict): dict containing name of GCP storage bucket

        """
        storage_client = storage.Client()
        bucket = data_source['bucket']
        try:
            bucket_info = storage_client.lookup_bucket(bucket)
            if not bucket_info:
                # if the lookup does not return anything, then this is a nonexistent bucket
                key = 'billing_source.bucket'
                message = f'The provided GCP bucket {bucket} does not exist'
                raise serializers.ValidationError(error_obj(key, message))

        except GoogleCloudError as e:
            key = 'billing_source.bucket'
            raise serializers.ValidationError(error_obj(key, e.message))

        return True 
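lookup_bucket() returns None rather than raising when the bucket does not exist, which is what makes the check above work. A minimal standalone sketch, with a hypothetical bucket name:

from google.cloud import storage

storage_client = storage.Client()
bucket_info = storage_client.lookup_bucket('example-billing-bucket')  # hypothetical name
if bucket_info is None:
    print('Bucket does not exist or is not visible to these credentials.')
else:
    print('Found bucket: {}'.format(bucket_info.name))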
Example 10
Project: vimss   Author: Veleslavia   File: Utils.py    GNU General Public License v3.0
def upload_to_gcs(filenames, gcs_bucket_path):
    """Upload wave file to GCS, at provided path."""

    path_parts = gcs_bucket_path[5:].split('/', 1)
    bucket_name = path_parts[0]
    if len(path_parts) == 1:
        key_prefix = ''
    elif path_parts[1].endswith('/'):
        key_prefix = path_parts[1]
    else:
        key_prefix = path_parts[1] + '/'

    client = storage.Client(project=os.environ["PROJECT_NAME"])
    bucket = client.get_bucket(bucket_name)

    def _upload_files(filenames):
        """Upload a list of files into a specifc subdirectory."""
        for i, filename in enumerate(filenames):
            blob = bucket.blob(key_prefix + os.path.basename(filename))
            blob.upload_from_filename(filename)
            if not i % 5:
                tf.logging.info('Finished uploading file: %s' % filename)

    _upload_files(filenames) 
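A usage sketch for the helper above, with a hypothetical bucket path; PROJECT_NAME must be set in the environment:

import os

# Hypothetical values, for illustration only.
os.environ.setdefault('PROJECT_NAME', 'my-gcp-project')
upload_to_gcs(['mix_0.wav', 'mix_1.wav'], 'gs://example-audio-bucket/waves/')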
Example 11
Project: prefect   Author: PrefectHQ   File: storage.py    Apache License 2.0
def _get_client(
        self, project: str, credentials: dict, credentials_secret: str = None
    ):
        """
        Creates and returns a GCS Client instance
        """
        if credentials_secret is not None:
            warnings.warn(
                "The `credentials_secret` argument is deprecated. Use a `Secret` task "
                "to pass the credentials value at runtime instead.",
                UserWarning,
            )
            creds = Secret(credentials_secret).get()
            credentials = Credentials.from_service_account_info(creds)
            project = project or credentials.project_id

        if credentials is not None:
            project = project or credentials.get("project")
            client = storage.Client(project=project, credentials=credentials)
        else:
            client = storage.Client(project=project)
        return client 
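A minimal sketch of the explicit-credentials path above, assuming the service account key is available as a dict; the file name is hypothetical:

import json

from google.cloud import storage
from google.oauth2.service_account import Credentials

# Hypothetical: load a service account key stored as JSON on disk.
with open('service_account.json') as f:
    creds_dict = json.load(f)

credentials = Credentials.from_service_account_info(creds_dict)
client = storage.Client(project=credentials.project_id, credentials=credentials)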
Example 12
Project: ViS   Author: aceew   File: getvids.py    MIT License
def __init__(
            self, 
            client: storage.Client,
            bucket_name: str,
            blob_name: str,
            chunk_size: int = 1 * 1024 * 1024
        ):
        self._client = client
        self._bucket = self._client.bucket(bucket_name)
        self._blob = self._bucket.blob(blob_name)

        self._buffer = b''
        self._buffer_size = 0
        self._chunk_size = chunk_size
        self._read = 0

        self._transport = AuthorizedSession(
            credentials=self._client._credentials
        )
        self._request = None  # type: requests.ResumableUpload 
Example 13
Project: ViS   Author: aceew   File: getvids.py    MIT License
def get_yt_video(request):
    client = storage.Client()
    if request.args and 'youtube_url' in request.args:
        yt_link = request.args.get('youtube_url')
        print('Got a request with youtube_url=', yt_link)
    else:
        print('ERROR: no URL was provided. exiting.')
        return
    yt_id = extract_video_id_from_url(yt_link)
    yt_object = YouTube(yt_link)
    yt_stream = yt_object.streams.filter(only_audio=True).first()
    data = yt_stream.stream_to_buffer()
    with GCSObjectStreamUpload(client=client, bucket_name='visumm-store', blob_name=yt_id + '/youtube-' + yt_stream.default_filename) as fh:
        fh.write(data.getbuffer())

# The lambda function is created from the code above
# The code below is to test locally.
#get_yt_video('request') 
Example 14
Project: docker-python   Author: Kaggle   File: kaggle_gcp.py    Apache License 2.0
def init_gcs():
    is_user_secrets_token_set = "KAGGLE_USER_SECRETS_TOKEN" in os.environ
    from google.cloud import storage
    if not is_user_secrets_token_set:
        return storage

    from kaggle_gcp import get_integrations
    if not get_integrations().has_gcs():
        return storage

    from kaggle_secrets import GcpTarget
    from kaggle_gcp import KaggleKernelCredentials
    monkeypatch_client(
        storage.Client,
        KaggleKernelCredentials(target=GcpTarget.GCS))
    return storage 
Example 15
Project: oculi   Author: google   File: gcs_read_helper.py    Apache License 2.0
def init_gcs(credentials_path, gcp_project):
  """Initializes the GCS API.

  Args:
    credentials_path: filepath to client_secrets.json
    gcp_project: for project holding GCS bucket

  Returns:
    GCS service object

  Raises:
    ValueError
  """

  try:
    credentials = service_account.Credentials.from_service_account_file(
        credentials_path)
  except IOError:
    msg = 'no or invalid credentials found at {}, '.format(credentials_path)
    msg += 'have you run setup_environment.sh?'
    raise ValueError(msg)

  service = storage.Client(project=gcp_project, credentials=credentials)

  return service 
Example 16
Project: Coulomb   Author: DynamoDS   File: storage_utils.py    MIT License
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    log('upload_blob: About to upload {}.'.format(source_file_name))

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    log('upload_blob: {} uploaded to {}.'.format(
        source_file_name,
        destination_blob_name)) 
Example 17
Project: Coulomb   Author: DynamoDS   File: data_files_to_sessions.py    MIT License
def get_blob_name_to_process():
    DATA_FILES_PREFIX = "DynamoData"
    bucket = storage_client.get_bucket(DATA_FILES_BUCKET)
    blobs = bucket.list_blobs(prefix=DATA_FILES_PREFIX, delimiter=None)
    for blob in blobs:
        blob_name = blob.name
        log ("get_blob_name_to_process: => {}".format(blob_name))
        return blob_name
        # This is clearly crazy as a way of getting an item out of the
        # psuedo-iterator that the Cloud Client library is returning 
Example 18
Project: Coulomb   Author: DynamoDS   File: data_files_to_sessions.py    MIT License
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    log('upload_blob: About to upload {}.'.format(source_file_name))

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    log('upload_blob: {} uploaded to {}.'.format(
        source_file_name,
        destination_blob_name)) 
Example 19
Project: cwavegan   Author: acheketa   File: backup.py    MIT License
def copy_blob(bucket_name, blob_name, new_bucket_name, new_blob_name):
    """Copies a blob from one bucket to another with a new name."""
    storage_client = storage.Client()
    source_bucket = storage_client.get_bucket(bucket_name)
    source_blob = source_bucket.blob(blob_name)
    destination_bucket = storage_client.get_bucket(new_bucket_name)

    new_blob = source_bucket.copy_blob(
        source_blob, destination_bucket, new_blob_name)

    print('Blob {} in bucket {} copied to blob {} in bucket {}.'.format(
        source_blob.name, source_bucket.name, new_blob.name,
        destination_bucket.name)) 
Example 20
Project: cwavegan   Author: acheketa   File: backup.py    MIT License
def list_blobs(bucket_name):
    """Lists all the blobs in the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    blobs = bucket.list_blobs()
    return blobs 
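list_blobs() returns a lazy iterator, so callers usually loop over it or filter by prefix. A small sketch with hypothetical names:

from google.cloud import storage

storage_client = storage.Client()
bucket = storage_client.get_bucket('example-bucket')  # hypothetical bucket
for blob in bucket.list_blobs(prefix='checkpoints/'):
    print(blob.name, blob.size)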
Example 21
Project: neural-fingerprinting   Author: StephanZheng   File: cloud_client.py    BSD 3-Clause "New" or "Revised" License
def __init__(self, project_id, bucket_name):
    """Initialize client with project id and name of the storage bucket."""
    self.project_id = project_id
    self.bucket_name = bucket_name
    self.client = storage.Client(project=project_id)
    self.bucket = self.client.get_bucket(bucket_name) 
Example 22
Project: neural-fingerprinting   Author: StephanZheng   File: cloud_client.py    BSD 3-Clause "New" or "Revised" License
def __init__(self, project_id, namespace=None):
    """Init this method with given project id and optional namespace."""
    self._client = datastore.Client(project=project_id, namespace=namespace) 
Example 23
Project: cloudygo   Author: sethtroisi   File: cloudygo.py    Apache License 2.0
def __get_gs_game(self, bucket, model_name, filename, view_type):
        assert 'full' in view_type, view_type


        # Maybe it's worth caching these for, now just globally rate limit
        now = time.time()
        if now - self.last_cloud_request < 1:
            return None
        self.last_cloud_request = now

        # NOTE: needs to be before cloud_bucket clears bucket.
        from google.cloud import storage
        cloud_bucket = CloudyGo.get_cloud_bucket(bucket)
        if bucket not in self.storage_clients:
            client = storage.Client(project="minigo-pub").bucket(cloud_bucket)
            self.storage_clients[bucket] = client

        # MINIGO-HACK
        if bucket in CloudyGo.MINIGO_TS:
            # Take a guess at based on timestamp
            hour_guess = CloudyGo.guess_hour_dir(filename)
            model_name = hour_guess

            path = os.path.join('sgf', 'full', hour_guess, filename)
            if cloud_bucket == CloudyGo.FULL_GAME_CLOUD_BUCKET:
                # MINIGO_PUB has an outer folder of the bucket name
                path = os.path.join(bucket, path)
        else:
            path = os.path.join(bucket, 'sgf', model_name, 'full', filename)

        blob = self.storage_clients[bucket].get_blob(path)
        print("Checking {}: {}".format(filename, blob is not None))
        print(self.storage_clients[bucket], path)
        if not isinstance(blob, storage.Blob):
            return None

        data = blob.download_as_string().decode('utf8')
        return data 
Example 24
Project: gcp-variant-transforms   Author: googlegenomics   File: vcf_file_composer.py    Apache License 2.0
def compose_gcs_vcf_shards(project,  # type: str
                           vcf_header_file_path,  # type: str
                           vcf_data_files_folder,  # type: str
                           output_file,  # type: str
                           delete=False,  # type: bool
                          ):
  # type: (...) -> None
  """Composes VCF shards in GCS to one VCF file.

  It composes VCF header and VCF data files to one VCF file, and deletes the
  original VCF shards if `delete` is True.

  Args:
    project: The project name.
    vcf_header_file_path: The path of the VCF header file, it contains the meta
      information, as well as the data header line with the call names.
    vcf_data_files_folder: The folder that contains all VCF data files.
    output_file: The final VCF file path.
    delete: If true, delete the original VCF shards.
  """
  header_bucket_name, header_blob = gcsio.parse_gcs_path(vcf_header_file_path)
  vcf_data_bucket_name, vcf_data_blob_prefix = gcsio.parse_gcs_path(
      vcf_data_files_folder)

  if vcf_data_bucket_name != header_bucket_name:
    raise ValueError('The VCF data files {} and header file {} are in '
                     'different buckets. '.format(vcf_data_files_folder,
                                                  vcf_header_file_path))

  composed_vcf_data_blob = _compose_vcf_data_files(project,
                                                   vcf_data_files_folder)
  client = storage.Client(project)
  bucket = client.get_bucket(vcf_data_bucket_name)
  output_file_blob = _create_blob(client, output_file)
  output_file_blob.compose([bucket.get_blob(header_blob),
                            composed_vcf_data_blob])
  if delete:
    bucket.delete_blobs(bucket.list_blobs(prefix=vcf_data_blob_prefix))
    bucket.delete_blobs(bucket.list_blobs(prefix=header_blob)) 
Example 25
Project: gcp-variant-transforms   Author: googlegenomics   File: vcf_file_composer.py    Apache License 2.0
def _create_blob(client, file_path):
  # type: (storage.Client, str) -> storage.Blob
  bucket_name, blob_name = gcsio.parse_gcs_path(file_path)
  file_blob = client.get_bucket(bucket_name).blob(blob_name)
  file_blob.content_type = 'text/plain'
  return file_blob 
Example 26
Project: gcp-variant-transforms   Author: googlegenomics   File: run_preprocessor_tests.py    Apache License 2.0
def validate_result(self):
    """Validates the results.

    - Checks that the report is generated.
    - Validates report's contents are the same as `expected_contents`.
    - Checks that the resolved headers are generated if `header_blob_name` is
      specified in the test.
    """
    client = storage.Client(self._project)
    bucket = client.get_bucket(_BUCKET_NAME)
    report_blob = bucket.get_blob(self._report_blob_name)
    if not report_blob:
      raise run_tests_common.TestCaseFailure(
          'Report is not generated in {} in test {}'.format(self._report_path,
                                                            self._name))
    contents = report_blob.download_as_string()
    expected_contents = '\n'.join(self._expected_contents)
    if expected_contents != contents:
      raise run_tests_common.TestCaseFailure(
          'Contents mismatch: expected {}, got {} in test {}'.format(
              expected_contents, contents, self._name))
    if not self._keep_reports:
      report_blob.delete()

    if self._header_blob_name:
      resolved_headers_blob = bucket.get_blob(self._header_blob_name)
      if not resolved_headers_blob:
        raise run_tests_common.TestCaseFailure(
            'The resolved header is not generated in {} in test {}'.format(
                self._header_path, self._name))
      if not self._keep_reports:
        resolved_headers_blob.delete() 
Example 27
Project: genetic-curriculum   Author: Octavian-ai   File: file.py    Apache License 2.0
def copy_to_bucket(self):
    if 'google.cloud' in sys.modules and self.args.bucket is not None and self.args.gcs_dir is not None:
      client = storage.Client()
      bucket = client.get_bucket(self.args.bucket)
      blob = bucket.blob(self.gcs_path)
      blob.upload_from_filename(filename=self.file_path) 
Example 28
Project: grover   Author: rowanz   File: prepare_lm_data.py    Apache License 2.0
def __init__(self, fn):
        self.fn = fn
        if fn.startswith('s3://'):
            from boto3.s3.transfer import TransferConfig
            import boto3
            self.gclient = None
            self.s3client = boto3.client('s3',
                                         )
            self.storage_dir = TemporaryDirectory()
            self.writer = tf.python_io.TFRecordWriter(os.path.join(self.storage_dir.name, 'temp.tfrecord'))
            self.bucket_name, self.file_name = self.fn.split('s3://', 1)[1].split('/', 1)
        elif fn.startswith('gs://'):
            from google.cloud import storage
            self.s3client = None
            self.gclient = storage.Client()
            self.storage_dir = TemporaryDirectory()
            self.writer = tf.python_io.TFRecordWriter(os.path.join(self.storage_dir.name, 'temp.tfrecord'))
            self.bucket_name, self.file_name = self.fn.split('gs://', 1)[1].split('/', 1)

        else:
            self.s3client = None
            self.gclient = None
            self.bucket_name = None
            self.file_name = None
            self.storage_dir = None
            self.writer = tf.python_io.TFRecordWriter(fn) 
Example 29
Project: grover   Author: rowanz   File: validate.py    Apache License 2.0
def __init__(self, gcloud_name):
        assert gcloud_name.startswith('gs://')
        self.gcloud_name = gcloud_name
        bucket_name, blob_name = gcloud_name.split('gs://')[1].split('/', 1)
        bucket = storage.Client().get_bucket(bucket_name)
        self.blob = bucket.blob(blob_name) 
Example 30
Project: cloudml-edge-automation   Author: GoogleCloudPlatform   File: mark_done.py    Apache License 2.0
def mark_done(gspath):
    """Uploads a file to the bucket to indicate comletion of training job.
    gspath is a path to the output directory of training such as

    gs://$PROJECT-model-output/$MODEL_NAME/$MODEL_VERSION/output

    """
    url = urlparse(gspath)
    if url.scheme != "gs":
        raise RuntimeError("not a Google Storage URL")
    bucket_name = url.netloc
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(url.path.strip("/") + "/TRAINER-DONE")
    blob.upload_from_string("done") 
Example 31
Project: inyourface   Author: yacomink   File: GStorageCacheProvider.py    MIT License
def __init__(self, project_id):
        self.storage_client = storage.Client()
        self.bucket = self.storage_client.get_bucket(project_id + ".appspot.com") 
Example 32
Project: icrawler   Author: hellock   File: google_storage.py    MIT License
def __init__(self, root_dir):
        try:
            from google.cloud import storage
        except ImportError:
            print('GoogleStorage backend requires the package '
                  '"google-cloud-storage", execute '
                  '"pip install google-cloud-storage" to install it.')

        self.client = storage.Client()
        bucket_str = root_dir[5:].split('/')[0]
        self.bucket = self.client.get_bucket(bucket_str)
        self.folder_str = root_dir[6 + len(bucket_str):]
        if self.folder_str[0] == '/':
            self.folder_str = self.folder_str[1:] 
Example 33
Project: asynconsumer   Author: rhoboro   File: gcs_downloader.py    Apache License 2.0
def fetch_gcs_objects(uris, directory, naming=None, concurrency=3, sleep=0):
    """GCSから取得したオブジェクトをdirectoryに取得し、そのファイルパスを返す。

    :param uris: uriの一覧
    :param directory: オブジェクトを格納するディレクトリのパス
    :param naming: uriを引数に取り、ファイル名を返すCallableオブジェクト。デフォルトはmd5化。
    :param concurrency: 並列実行の最大数。デフォルトは3。
    :param sleep: スリープ時間(秒)。デフォルトは0。
    :return: 渡したurisと同潤のファイルパスの一覧。取得できない場合はファイルパスはNoneにする。
    """
    from google.cloud import storage

    client = storage.Client()
    loop = asyncio.get_event_loop()
    naming = naming or to_md5

    async def _fetch(uri):
        local_path = directory + '/' + naming(uri)
        try:
            params = [client, uri, local_path]
            filename = await loop.run_in_executor(None, _get_gcs_object, *params)
        except Exception:
            filename = None
        return filename

    return async_run(uris, _fetch, concurrency=concurrency, sleep=sleep) 
Example 34
Project: lm-human-preferences   Author: openai   File: gcs.py    MIT License
def get_blob(url, client=None):
    if client is None:
        client = storage.Client()
    bucket_name, path = parse_url(url)
    bucket = client.get_bucket(bucket_name)
    return bucket.get_blob(path) 
Example 35
Project: lm-human-preferences   Author: openai   File: gcs.py    MIT License
def upload_contents(url, contents, client=None):
    """Given a gs:// path, returns contents of the corresponding blob."""
    if client is None:
        client = storage.Client()
    bucket_name, path = parse_url(url)
    bucket = client.get_bucket(bucket_name)
    blob = storage.Blob(path, bucket)
    blob.upload_from_string(contents) 
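The parse_url helper used by the two snippets above is not shown here. It presumably splits a gs:// URL into a bucket name and an object path, roughly like this hypothetical sketch:

def parse_url(url):
    """Hypothetical reconstruction: split 'gs://bucket/path/to/blob' into
    ('bucket', 'path/to/blob'). Not the project's actual implementation."""
    assert url.startswith('gs://'), url
    bucket_name, _, path = url[len('gs://'):].partition('/')
    return bucket_name, path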
Example 36
Project: analysis-py-utils   Author: verilylifesciences   File: bq.py    Apache License 2.0
def __init__(self, project_id, default_dataset=None, maximum_billing_tier=None,
                 max_wait_secs=DEFAULT_TIMEOUT_SEC, alternate_bq_client_class=None):
        self.gclient = (alternate_bq_client_class or bigquery.Client)(project=project_id)
        self.max_wait_secs = max_wait_secs
        # Retry object for errors encountered in making API calls (executing jobs, etc.)
        self.default_retry_for_api_calls = DEFAULT_RETRY_FOR_API_CALLS.with_deadline(max_wait_secs)
        # Retry object for errors encountered while polling jobs in progress.
        # See https://github.com/googleapis/google-cloud-python/issues/6301
        self.default_retry_for_async_jobs = DEFAULT_RETRY_FOR_ASYNC_JOBS.with_deadline(
            max_wait_secs)
        super(Client, self).__init__(project_id, default_dataset, maximum_billing_tier) 
Example 37
Project: analysis-py-utils   Author: verilylifesciences   File: bq_test.py    Apache License 2.0
def create_temp_bucket(cls):
        # type: () -> None
        """Create temporary bucket"""
        cls.temp_bucket_name = str(random.randint(1000000, 9999999))
        cls.bucket = storage.Client(cls.TEST_PROJECT).bucket(cls.temp_bucket_name)
        if not cls.bucket.exists():
            cls.bucket.create() 
Example 38
Project: analysis-py-utils   Author: verilylifesciences   File: bq_test.py    Apache License 2.0
def _test_bq_api_call_retries(self, method_to_test, exc, should_retry):
        """A helper function to test retries when calling backend API.

        Args:
            method_to_test: The method to test. It is a method called after an AsyncJob is created
                and takes an AsyncJob and a string of query as arguments.
            exc: The exception to raise.
            should_retry: Whether bq.Client should catch the exception and do a retry.
        """
        exceptions_to_raise = [
            None,  # No error to raise when called in self.client.query
            exc   # Method validate_query_job calls QueryJob.done, which raises an error.
                  # If exc is transient, trigger a retry; otherwise exit.
        ]
        side_effect_func = self._get_mock_api_request_side_effect(exceptions_to_raise)

        with patch('google.cloud._http.JSONConnection.api_request') as mock_api_request:
            mock_api_request.side_effect = side_effect_func

            dest_path = self.client.path('{}_test_{}'.format(method_to_test.__name__, self.test_id))

            query_to_run = 'SELECT 5'
            query_job = self.client.start_async_job(query_to_run, dest_path=dest_path)

            if should_retry:
                method_to_test(query_job, query_to_run)
            else:
                with self.assertRaises(type(exc)):
                    method_to_test(query_job, query_to_run) 
Example 39
Project: analysis-py-utils   Author: verilylifesciences   File: bq_test.py    Apache License 2.0
def test_is_job_done_retries(self, exc, should_retry):
        """Tests retries in is_job_done

        Args:
            exc: The exception to raise.
            should_retry: Whether bq.Client should catch the exception and do a retry.
        """
        self._test_bq_api_call_retries(is_job_done, exc, should_retry) 
Example 40
Project: analysis-py-utils   Author: verilylifesciences   File: bq_test.py    Apache License 2.0
def test_validate_query_job_retries(self, exc, should_retry):
        """Tests retries in validate_query_job

        Args:
            exc: The exception to raise.
            should_retry: Whether bq.Client should catch the exception and do a retry.
        """
        self._test_bq_api_call_retries(validate_query_job, exc, should_retry) 
Example 41
Project: ScoutSuite   Author: nccgroup   File: cloudstorage.py    GNU General Public License v2.0
def get_buckets(self, project_id: str):
        try:
            client = storage.Client(project=project_id)
            buckets = await run_concurrently(lambda: list(client.list_buckets()))
            await get_and_set_concurrently([self._get_and_set_bucket_logging, 
                self._get_and_set_bucket_iam_policy], buckets)
            return buckets
        except Exception as e:
            print_exception('Failed to retrieve storage buckets: {}'.format(e))
            return [] 
Example 42
Project: lookml-tools   Author: ww-tech   File: bq_writer.py    Apache License 2.0
def upload_to_bq(self, dataframe, gcs_project_id, project_id, dataset, tablename, target_bucket_name, bucket_folder, write_disposition=WriteDisposition.WRITE_APPEND):
        '''Write some dataframe to BigQuery via GCS storage. 
            
        Args:
            dataframe (pandas dataframe): data to be written
            gcs_project_id (str): GCS project ID
            target_bucket_name (str): GCS bucket_name
            bucket_folder (str): GCS bucket_folder
            project_id (str): BQ project ID
            dataset (str): BQ dataset
            tablename (str): BQ tablename

        Returns:
            Nothing; as a side effect, data is written to GCS and then loaded into BigQuery.

        '''
        filename = self._write_to_csv(dataframe, target_bucket_name, bucket_folder)

        self._upload_to_gcs(gcs_project_id, target_bucket_name, bucket_folder, filename)

        job_config = self._create_job_config(write_disposition)

        # copy from file to table
        bigquery_client = bigquery.Client(project=project_id)
        table_ref = bigquery_client.dataset(dataset).table(tablename)  # to target a partition concat a $date to the

        path = target_bucket_name + os.sep + bucket_folder + os.sep + filename
        logging.info("Loading to %s", path)

        load_job = bigquery_client.load_table_from_uri(
            'gs://{}'.format(path),  # need to make sure this is wildcarded
            table_ref,
            job_config=job_config)  # API request

        load_job.result()  # Waits for table load to complete. 
Example 43
Project: koku   Author: project-koku   File: gcp_report_downloader.py    GNU Affero General Public License v3.0
def __init__(self, task, customer_name, billing_source, **kwargs):
        """
        Constructor.

        Args:
            task           (Object) bound celery object
            customer_name  (str): Name of the customer
            billing_source (dict): dict containing name of GCP storage bucket

        """
        super().__init__(task, **kwargs)

        self.bucket_name = billing_source['bucket']
        self.report_prefix = billing_source.get('report_prefix', '')
        self.customer_name = customer_name.replace(' ', '_')
        self._provider_uuid = kwargs.get('provider_uuid')

        try:
            GCPProvider().cost_usage_source_is_reachable(None, billing_source)
            self._storage_client = storage.Client()
            self._bucket_info = self._storage_client.lookup_bucket(self.bucket_name)
        except ValidationError as ex:
            LOG.error(
                'GCP bucket %(bucket_name)s for customer %(customer_name)s is not reachable. '
                'Error: %(ex)s',
                {
                    'customer_name': customer_name,
                    'bucket_name': self.bucket_name,
                    'ex': str(ex),
                },
            )
            raise GCPReportDownloaderError(str(ex)) 
Example 44
Project: vimss   Author: Veleslavia   File: musdb_to_tfrecord.py    GNU General Public License v3.0
def upload_to_gcs(training_records, test_records):
    """Upload TF-Record files to GCS, at provided path."""

    # Find the GCS bucket_name and key_prefix for dataset files
    path_parts = FLAGS.gcs_output_path[5:].split('/', 1)
    bucket_name = path_parts[0]
    if len(path_parts) == 1:
        key_prefix = ''
    elif path_parts[1].endswith('/'):
        key_prefix = path_parts[1]
    else:
        key_prefix = path_parts[1] + '/'

    client = storage.Client(project=FLAGS.project)
    bucket = client.get_bucket(bucket_name)

    def _upload_files(filenames):
        """Upload a list of files into a specifc subdirectory."""
        for i, filename in enumerate(sorted(filenames)):
            blob = bucket.blob(key_prefix + os.path.basename(filename))
            blob.upload_from_filename(filename)
            if not i % 5:
                tf.logging.info('Finished uploading file: %s' % filename)

    # Upload training dataset
    tf.logging.info('Uploading the training data.')
    _upload_files(training_records)

    # Upload validation dataset
    tf.logging.info('Uploading the validation data.')
    _upload_files(test_records) 
Example 45
Project: vimss   Author: Veleslavia   File: urmp_to_tfrecords.py    GNU General Public License v3.0
def upload_to_gcs(training_records, test_records):
    """Upload TF-Record files to GCS, at provided path."""

    # Find the GCS bucket_name and key_prefix for dataset files
    path_parts = FLAGS.gcs_output_path[5:].split('/', 1)
    bucket_name = path_parts[0]
    if len(path_parts) == 1:
        key_prefix = ''
    elif path_parts[1].endswith('/'):
        key_prefix = path_parts[1]
    else:
        key_prefix = path_parts[1] + '/'

    client = storage.Client(project=FLAGS.project)
    bucket = client.get_bucket(bucket_name)

    def _upload_files(filenames):
        """Upload a list of files into a specifc subdirectory."""
        for i, filename in enumerate(filenames):
            blob = bucket.blob(key_prefix + os.path.basename(filename))
            blob.upload_from_filename(filename)
            if not i % 5:
                tf.logging.info('Finished uploading file: %s' % filename)

    # Upload training dataset
    tf.logging.info('Uploading the training data.')
    _upload_files(training_records)

    # Upload validation dataset
    tf.logging.info('Uploading the validation data.')
    _upload_files(test_records) 
Example 46
Project: airflow-ml-pipeline   Author: icoxfog417   File: gcs_storage.py    MIT License
def _get_client(self):
        if self.credential_path is not None:
            client = storage.Client.from_service_account_json(
                        self.credential_path)
        else:
            client = storage.Client()
        return client 
Example 47
Project: pyeo   Author: clcr   File: queries_and_downloads.py    GNU General Public License v3.0
def download_from_google_cloud(product_ids, out_folder, redownload = False):
    """Still experimental."""
    log = logging.getLogger(__name__)
    log.info("Downloading following products from Google Cloud:".format(product_ids))
    storage_client = storage.Client()
    bucket = storage_client.get_bucket("gcp-public-data-sentinel-2")
    for safe_id in product_ids:
        if not safe_id.endswith(".SAFE"):
            safe_id = safe_id+".SAFE"
        if check_for_invalid_l1_data(os.path.join(out_folder, safe_id)) and not redownload:
            log.info("File exists, skipping.")
            return
        if redownload:
            log.info("Removing {}".format(os.path.join(out_folder, safe_id)))
            shutil.rmtree(os.path.join(out_folder, safe_id))
        tile_id = get_sen_2_image_tile(safe_id)
        utm_zone = tile_id[1:3]
        lat_band = tile_id[3]
        grid_square = tile_id[4:6]
        object_prefix = r"tiles/{}/{}/{}/{}/".format(
            utm_zone, lat_band, grid_square, safe_id
        )
        object_iter = bucket.list_blobs(prefix=object_prefix, delimiter=None)
        if object_iter.num_results == 0:
            log.error("{} missing from Google Cloud, continuing".format(safe_id))
            continue
        for s2_object in object_iter:
            download_blob_from_google(bucket, object_prefix, out_folder, s2_object)
        # Need to make these two empty folders for sen2cor to work properly
        try:
            os.mkdir(os.path.join(os.path.abspath(out_folder), safe_id, "AUX_DATA"))
            os.mkdir(os.path.join(os.path.abspath(out_folder), safe_id, "HTML"))
        except FileExistsError:
            pass 
Example 48
Project: open-recipe   Author: dspray95   File: test.py    The Unlicense
def get_gcs_content_and_delete(bucket, path):
    from google.cloud import storage
    client = storage.Client(project=os.environ.get('GCS_PROJECT_ID'))
    bucket = client.get_bucket(bucket)
    blob = bucket.get_blob(path)
    content = blob.download_as_string()
    acl = list(blob.acl)  # loads acl before it will be deleted
    bucket.delete_blob(path)
    return content, acl, blob 
Example 49
Project: open-recipe   Author: dspray95   File: files.py    The Unlicense
def __init__(self, uri):
        from google.cloud import storage
        client = storage.Client(project=self.GCS_PROJECT_ID)
        bucket, prefix = uri[5:].split('/', 1)
        self.bucket = client.bucket(bucket)
        self.prefix = prefix 
Example 50
Project: clusterfuzz   Author: google   File: storage.py    Apache License 2.0
def _create_storage_client_new():
  """Create a storage client."""
  creds, project = credentials.get_default()
  if not project:
    project = utils.get_application_id()

  return gcs.Client(project=project, credentials=creds) 
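credentials.get_default() here is a project-local wrapper; the underlying call in the google-auth library looks roughly like this sketch:

import google.auth
from google.cloud import storage

# Application Default Credentials: a service account file pointed to by
# GOOGLE_APPLICATION_CREDENTIALS, gcloud user credentials, or the GCE metadata server.
creds, project = google.auth.default()
client = storage.Client(project=project, credentials=creds)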
Example 51
Project: OpenGPT-2   Author: agermanidis   File: prepare_lm_data.py    Apache License 2.0
def __init__(self, fn):
        self.fn = fn
        if fn.startswith('s3://'):
            from boto3.s3.transfer import TransferConfig
            import boto3
            self.gclient = None
            self.s3client = boto3.client('s3',
                                         )
            self.storage_dir = TemporaryDirectory()
            self.writer = tf.python_io.TFRecordWriter(os.path.join(self.storage_dir.name, 'temp.tfrecord'))
            self.bucket_name, self.file_name = self.fn.split('s3://', 1)[1].split('/', 1)
        elif fn.startswith('gs://'):
            from google.cloud import storage
            self.s3client = None
            self.gclient = storage.Client()
            self.storage_dir = TemporaryDirectory()
            self.writer = tf.python_io.TFRecordWriter(os.path.join(self.storage_dir.name, 'temp.tfrecord'))
            self.bucket_name, self.file_name = self.fn.split('gs://', 1)[1].split('/', 1)

        else:
            self.s3client = None
            self.gclient = None
            self.bucket_name = None
            self.file_name = None
            self.storage_dir = None
            self.writer = tf.python_io.TFRecordWriter(fn) 
Example 52
Project: OpenGPT-2   Author: agermanidis   File: validate.py    Apache License 2.0
def __init__(self, gcloud_name):
        assert gcloud_name.startswith('gs://')
        self.gcloud_name = gcloud_name
        bucket_name, blob_name = gcloud_name.split('gs://')[1].split('/', 1)
        bucket = storage.Client().get_bucket(bucket_name)
        self.blob = bucket.blob(blob_name) 
Example 53
Project: delta   Author: celskeggs   File: remote.py    MIT License
def get_ref():
    global paradox
    if paradox is None:
        paradox = storage.Client(project="backups-cela").get_bucket("paradox_backup")
    return paradox


# NOTE: THIS IS SOME HACKY MONKEYPATCHING 
Example 54
Project: sregistry-cli   Author: singularityhub   File: __init__.py    Mozilla Public License 2.0
def __init__(self, secrets=None, base=None, init=True, **kwargs):

        self._update_secrets()
        self._update_headers()

        # Do we need storage/compute client now?
        if init is True:
            self._init_client()

        super(Client, self).__init__(**kwargs) 
Example 55
Project: sregistry-cli   Author: singularityhub   File: __init__.py    Mozilla Public License 2.0
def _get_services(self, version="v1"):
        """get version 1 of the google compute and storage service

        Parameters
        ==========
        version: version to use (default is v1)
        """
        self._bucket_service = storage.Client()
        creds = GoogleCredentials.get_application_default()
        self._storage_service = discovery_build("storage", version, credentials=creds)
        self._compute_service = discovery_build("compute", version, credentials=creds) 
Example 56
Project: sregistry-cli   Author: singularityhub   File: __init__.py    Mozilla Public License 2.0
def __init__(self, secrets=None, base=None, init=True, **kwargs):

        self._update_secrets()
        self._update_headers()

        # Do we need storage client now?
        if init is True:
            self._init_client()

        super(Client, self).__init__(**kwargs) 
Example 57
Project: sregistry-cli   Author: singularityhub   File: __init__.py    Mozilla Public License 2.0
def _get_services(self, version="v1"):
        """get version 1 of the google compute and storage service

            Parameters
            ==========
            version: version to use (default is v1)
        """
        self._bucket_service = storage.Client()
        creds = GoogleCredentials.get_application_default()
        self._storage_service = discovery_build("storage", version, credentials=creds)
        self._build_service = discovery_build("cloudbuild", version, credentials=creds) 
Example 58
Project: prefect   Author: PrefectHQ   File: gcs.py    Apache License 2.0
def _gcs_client(self):  # type: ignore
        from google.cloud import storage

        return storage.Client(project=self.project) 
Example 59
Project: prefect   Author: PrefectHQ   File: gcs_result_handler.py    Apache License 2.0
def initialize_client(self) -> None:
        """
        Initializes GCS connections.
        """
        from google.oauth2.service_account import Credentials
        from google.cloud import storage

        creds = Secret(self.credentials_secret).get()
        credentials = Credentials.from_service_account_info(creds)
        project = credentials.project_id
        client = storage.Client(project=project, credentials=credentials)
        self.gcs_bucket = client.bucket(self.bucket) 
Example 60
Project: google-speech-to-text   Author: Naki21   File: upload_to_gcloud.py    MIT License
def upload_to_gcloud(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    print('File {} uploaded to {}.'.format(
        source_file_name,
        destination_blob_name)) 
Example 61
Project: ViS   Author: aceew   File: main.py    MIT License
def download_blob(source_blob_name, destination_file_name):
    """Downloads a blob from the bucket."""
    print('inside download_blob')
    
    bucket_name='visumm-store'
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)
    print('File {} has been downloaded to {}.'.format(source_blob_name, destination_file_name)) 
Example 62
Project: ViS   Author: aceew   File: main.py    MIT License
def upload_blob(source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    print('inside upload_blob')
    bucket_name='visumm-store'
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print('File {} has been uploaded to {}.'.format(source_file_name, destination_blob_name)) 
Example 63
Project: ViS   Author: aceew   File: getvids.py    MIT License
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    print('inside upload_blob')
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print('File {} uploaded to {}.'.format(source_file_name, destination_blob_name)) 
Example 64
Project: docker-python   Author: Kaggle   File: test_gcs.py    Apache License 2.0
def test_ctr(self):
        credentials = _make_credentials()
        env = EnvironmentVarGuard()
        env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar')
        env.set('KAGGLE_KERNEL_INTEGRATIONS', 'GCS')
        with env:
            init_gcs()
            client = storage.Client(project="xyz", credentials=credentials)
            self.assertEqual(client.project, "xyz")
            self.assertNotIsInstance(client._credentials, KaggleKernelCredentials)
            self.assertIsNotNone(client._credentials) 
Example 65
Project: docker-python   Author: Kaggle   File: test_gcs.py    Apache License 2.0
def test_annonymous_client(self):
        env = EnvironmentVarGuard()
        env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar')
        env.set('KAGGLE_KERNEL_INTEGRATIONS', 'GCS')
        with env:
            init_gcs()
            anonymous = storage.Client.create_anonymous_client()
            self.assertIsNotNone(anonymous) 
Example 66
Project: docker-python   Author: Kaggle   File: test_gcs.py    Apache License 2.0
def test_default_credentials_gcs_enabled(self):
        env = EnvironmentVarGuard()
        env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar')
        env.set('KAGGLE_KERNEL_INTEGRATIONS', 'GCS')
        with env:
            init_gcs()
            client = storage.Client(project="xyz")
            self.assertIsInstance(client._credentials, KaggleKernelCredentials)
            self.assertTrue(client._connection.user_agent.startswith("kaggle-gcp-client/1.0")) 
Example 67
Project: docker-python   Author: Kaggle   File: test_automl.py    Apache License 2.0
def test_tables_gcs_client(self):
        # The GcsClient can't currently be monkeypatched for default
        # credentials because it requires a project which can't be set.
        # Verify that creating an automl.GcsClient given an actual
        # storage.Client sets the client properly.
        gcs_client = storage.Client(project="xyz", credentials=_make_credentials())
        tables_gcs_client = automl.GcsClient(client=gcs_client)
        self.assertIs(tables_gcs_client.client, gcs_client) 
Example 68
Project: tensorboardX   Author: lanpa   File: record_writer.py    MIT License
def __init__(self, path):
        if not GCS_ENABLED:
            raise ImportError("`google-cloud-storage` must be installed in order to use "
                              "the 'gs://' protocol")

        self.path = path
        self.buffer = io.BytesIO()

        from google.cloud import storage
        client = storage.Client()

        bucket_name, filepath = self.bucket_and_path()
        bucket = storage.Bucket(client, bucket_name)
        self.blob = storage.Blob(filepath, bucket) 
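The writer keeps the Blob so that buffered records can later be pushed to GCS in one call. A plausible flush method, assuming this buffered design (not the library's actual code), could look like:

def flush(self):
        # Upload whatever has accumulated in the in-memory buffer;
        # upload_from_string accepts bytes as well as str.
        self.blob.upload_from_string(self.buffer.getvalue())

def close(self):
        self.flush()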
Example 69
Project: tensorboardX   Author: lanpa   File: record_writer.py    MIT License
def __init__(self, path):
        if not GCS_ENABLED:
            raise ImportError("`google-cloud-storage` must be installed in order to use "
                              "the 'gs://' protocol")

        self.path = path
        self.buffer = io.BytesIO()

        from google.cloud import storage
        client = storage.Client()

        bucket_name, filepath = self.bucket_and_path()
        bucket = storage.Bucket(client, bucket_name)
        self.blob = storage.Blob(filepath, bucket) 
Example 70
Project: airflow   Author: apache   File: gcs.py    Apache License 2.0
def get_conn(self):
        """
        Returns a Google Cloud Storage service object.
        """
        if not self._conn:
            self._conn = storage.Client(credentials=self._get_credentials(),
                                        client_info=self.client_info,
                                        project=self.project_id)

        return self._conn 
Example 71
Project: portal_client   Author: IGS   File: gcp.py    MIT License
def download_file(self, gs_remote_path, local_path):
        """
        Given a remote GCP object's URL, starting with gs://, download it and
        save it to the specified local path.
        """
        self.logger.debug("In download_file.")

        # Get the bucket from the gs_remote_path, which should be in the
        # form of gs://bucket_name/path
        if gs_remote_path.startswith('gs://'):
            gs_remote_path = gs_remote_path[5:]
        else:
            raise Exception("Invalid google storage path. Must start with gs://")

        bucket_name = gs_remote_path.split('/')[0]

        self.logger.debug("Bucket name: %s", bucket_name)

        bucket_length = len(bucket_name)
        obj_path = gs_remote_path[bucket_length + 1:]

        self.logger.debug("Object path: %s", obj_path)

        client = storage.Client(project=self.project_id, credentials=self.credentials)

        bucket = client.get_bucket(bucket_name)

        blob = bucket.blob(obj_path)

        self.logger.info("Downloading %s to %s.", obj_path, local_path)

        blob.download_to_filename(local_path) 
Example 72
Project: oculi   Author: google   File: gcs_copy_helper.py    Apache License 2.0
def wrapper_gcs_upload(self, gcp_project, gcs_bucket_name, job_name,
                         job_type, file_name,
                         advertiser_id, asset_byte_string, creative_id):
    storage_client = storage.Client(project=gcp_project)
    bucket = storage_client.get_bucket(gcs_bucket_name)
    gcs_file_name = "{0}/{1}/{2}/{3}".format(job_name, advertiser_id,
                                             job_type, file_name)
    blob = bucket.blob(gcs_file_name)
    # TODO(team): pick correct format based on filename extension
    content_type = "image/jpg" if job_type == "image" else "video/mp4"
    blob.upload_from_string(data=asset_byte_string, content_type=content_type)

    gcs_url = "{0}{1}/{2}".format("gs://", gcs_bucket_name, gcs_file_name)
    uploaded_asset_details = {"Creative_ID": creative_id, "GCS_URL": gcs_url}
    return uploaded_asset_details 
Example 73
Project: studio   Author: studioml   File: pyrebase.py    Apache License 2.0
def __init__(self, credentials, storage_bucket, requests):
        from google.cloud import storage
        self.storage_bucket = \
            "https://firebasestorage.googleapis.com/v0/b/" + storage_bucket

        self.credentials = credentials
        self.requests = requests
        self.path = ""
        if credentials:
            client = storage.Client(
                credentials=credentials,
                project=storage_bucket)
            self.bucket = client.get_bucket(storage_bucket) 
Example 74
Project: studio   Author: studioml   File: gcloud_artifact_store.py    Apache License 2.0
def get_client(self):
        if self._client is None or \
           self._client_timestamp is None or \
           time.time() - self._client_timestamp > STORAGE_CLIENT_EXPIRATION:

            from google.cloud import storage
            if 'credentials' in self.config.keys():
                self._client = storage.Client \
                    .from_service_account_json(self.config['serviceAccount'])
            else:
                self._client = storage.Client()
            self._client_timestamp = time.time()

        return self._client 
Example 75
Project: COP   Author: ZJULearning   File: imagenet_to_gcs.py    MIT License
def upload_to_gcs(training_records, validation_records):
  """Upload TF-Record files to GCS, at provided path."""

  # Find the GCS bucket_name and key_prefix for dataset files
  path_parts = FLAGS.gcs_output_path[5:].split('/', 1)
  bucket_name = path_parts[0]
  if len(path_parts) == 1:
    key_prefix = ''
  elif path_parts[1].endswith('/'):
    key_prefix = path_parts[1]
  else:
    key_prefix = path_parts[1] + '/'

  client = storage.Client(project=FLAGS.project)
  bucket = client.get_bucket(bucket_name)

  def _upload_files(filenames):
    """Upload a list of files into a specifc subdirectory."""
    for i, filename in enumerate(sorted(filenames)):
      blob = bucket.blob(key_prefix + os.path.basename(filename))
      blob.upload_from_filename(filename)
      if not i % 20:
        tf.logging.info('Finished uploading file: %s' % filename)

  # Upload training dataset
  tf.logging.info('Uploading the training data.')
  _upload_files(training_records)

  # Upload validation dataset
  tf.logging.info('Uploading the validation data.')
  _upload_files(validation_records) 
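upload_to_gcs parses FLAGS.gcs_output_path by hand: the [5:] slice drops the leading "gs://", and the single split separates the bucket from the key prefix. A standalone sketch of the same parsing, with a placeholder path:

def split_gcs_path(gcs_path):
    # Split "gs://bucket/prefix" into a bucket name and a key prefix that
    # always ends with "/" (or is empty when no prefix is given).
    path_parts = gcs_path[5:].split('/', 1)
    bucket_name = path_parts[0]
    if len(path_parts) == 1:
        key_prefix = ''
    elif path_parts[1].endswith('/'):
        key_prefix = path_parts[1]
    else:
        key_prefix = path_parts[1] + '/'
    return bucket_name, key_prefix


# split_gcs_path("gs://my-bucket/imagenet/train") -> ("my-bucket", "imagenet/train/")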
Example 76
Project: driblet   Author: google   File: gcs_to_bq_operator_test.py    Apache License 2.0 5 votes vote down vote up
def setUp(self):
    super(GCStoBQOperatorTest, self).setUp()
    # Mock a GCS client that returns mock bucket with mock blob.
    self.mock_gcs_client = mock.create_autospec(storage.Client)
    self.mock_bucket = mock.create_autospec(storage.bucket.Bucket)
    self.mock_bucket.name = 'test_bucket'
    self.mock_gcs_client.get_bucket.return_value = self.mock_bucket
    self.mock_blob = mock.create_autospec(storage.blob.Blob)
    self.mock_blob.name = _FILE_NAME
    self.mock_blob.size = 100
    self.mock_bucket.list_blobs.return_value = [self.mock_blob]
    self.context = mock.MagicMock()
    # Mock BigQuery client that returns mock dataset with mock table reference.
    self.mock_bq_client = mock.create_autospec(bigquery.Client)
    self.mock_dataset_ref = self.mock_bq_client.dataset(_DATASET_ID)
    self.mock_table_ref = self.mock_dataset_ref.table(_TABLE_ID)
    self.mock_bq_client.dataset.return_value = self.mock_dataset_ref
    self.job_config = bigquery.LoadJobConfig()
    self.operator = gcs_to_bq_operator.GCStoBQOperator(
        task_id='test_task_id',
        bq_client=self.mock_bq_client,
        gcs_client=self.mock_gcs_client,
        job_config=self.job_config,
        dataset_id=_DATASET_ID,
        table_id=_TABLE_ID,
        gcs_bucket=self.mock_bucket.name,
        gcs_location=_LOCATION,
        exclude_prefix=_EXCLUDE_PREFIX,
        dir_prefix=_DIR_PREFIX) 
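The setUp above builds its fakes with mock.create_autospec, so calls that do not match the real storage and BigQuery client signatures fail fast. A minimal, self-contained illustration of that pattern (independent of the driblet operator, with a placeholder file name):

from unittest import mock

from google.cloud import storage

mock_client = mock.create_autospec(storage.Client)
mock_bucket = mock.create_autospec(storage.bucket.Bucket)
mock_blob = mock.create_autospec(storage.blob.Blob)
mock_blob.name = 'some_file.csv'

# Stubbed return values chain just like the real objects would.
mock_client.get_bucket.return_value = mock_bucket
mock_bucket.list_blobs.return_value = [mock_blob]

bucket = mock_client.get_bucket('test_bucket')
assert [b.name for b in bucket.list_blobs()] == ['some_file.csv']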
Example 77
Project: driblet   Author: google   File: gcs_delete_blob_operator_test.py    Apache License 2.0 5 votes vote down vote up
def setUp(self):
    super(GCSDeleteBlobOperatorTest, self).setUp()
    # Mock a GCS client that returns mock bucket with mock blob.
    self.mock_client = mock.create_autospec(storage.Client)
    self.mock_bucket = mock.create_autospec(storage.bucket.Bucket)
    self.mock_blob = mock.create_autospec(storage.blob.Blob)
    self.mock_blob.name = _FILE_NAME
    self.mock_client.get_bucket.return_value = self.mock_bucket
    self.mock_bucket.list_blobs.return_value = [self.mock_blob]
    self.context = mock.MagicMock() 
Example 78
Project: analysis-py-utils   Author: verilylifesciences   File: bq.py    Apache License 2.0 4 votes vote down vote up
def export_schema_to_bucket(self,
                                table_path,  # type: str
                                bucket_name,  # type: str
                                dir_in_bucket='',  # type: Optional[str]
                                output_ext='',  # type: Optional[str]
                                explicit_filename=None,  # type: Optional[str]
                                ):
        # type: (...) -> str
        """
        Export a BigQuery table's schema to a json file in the given bucket. The output file's
        name is <BQ table name>-schema.json

        Args:
            table_path: Path of the table
            bucket_name: Name of the bucket to store the schema file. The bucket must be in project
                self.project_id
            dir_in_bucket: The directory in the bucket to store the output files
            output_ext: An optional suffix for the output file name, used to tell output files
                from different exports apart
            explicit_filename: Optional file name. If specified, it is used as the output file
                name; otherwise the table name (plus output_ext, if any) is used

        Returns:
            The name of the schema file exported.
        """
        table_project, dataset_id, table_name = self.parse_table_path(table_path)

        # Generate the destination of the table schema
        if explicit_filename:
            schema_filename = explicit_filename
        else:
            schema_filename = table_name
            if output_ext:
                schema_filename += '_' + output_ext
        schema_filename += '-schema.json'

        schema_path = os.path.join(dir_in_bucket, schema_filename).lstrip().lstrip('/')

        # Export schema as a json file to the bucket
        schema = [
            OrderedDict([('name', field.name), ('type', field.field_type)])
            for field in self.get_schema(dataset_id, table_name, table_project)
        ]

        schema_blob = storage.blob.Blob(schema_path,
                                        storage.Client(self.project_id).bucket(bucket_name))

        schema_blob.upload_from_string(json.dumps(schema, indent=2, separators=(',', ':')))
        return schema_filename 
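The exported file is a plain JSON list of {name, type} objects, so it can be read back with the same storage client. A small sketch of loading it again, assuming placeholder bucket and blob names:

import json

from google.cloud import storage

client = storage.Client()
# "analysis-bucket" and the blob path are placeholders for illustration.
blob = client.bucket("analysis-bucket").blob("exports/my_table-schema.json")
schema = json.loads(blob.download_as_string())
for field in schema:
    print(field["name"], field["type"])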
Example 79
Project: analysis-py-utils   Author: verilylifesciences   File: bq.py    Apache License 2.0 4 votes vote down vote up
def copy_dataset(source_project, source_dataset_name, destination_project,
                 destination_dataset_name):
    # type: (str, str, str, str) -> None
    """Copies a dataset and all its tables to a different location.

    If the destination dataset does not yet exist, creates it.

    This function uses subprocess to copy tables from one project to another, because the Python API
    does not support copying tables across different GCP projects.

    Args:
        source_project: The project containing the dataset to copy.
        source_dataset_name: The name of the dataset to copy.
        destination_project: The project to which to copy the dataset.
        destination_dataset_name: The name of the dataset to copy to.

    Raises:
        RuntimeError if the destination dataset already exists and contains any tables.
    """
    source_client = Client(source_project)
    destination_client = Client(destination_project)
    source_tables = source_client.tables(source_dataset_name)
    if destination_client.dataset_exists_with_name(destination_dataset_name):
        destination_tables = destination_client.tables(destination_dataset_name)
        if destination_tables:
            raise RuntimeError(
                'Cannot copy a dataset into a dataset that already contains tables. Destination '
                'dataset: {}. Found tables: {}'.format(destination_dataset_name, destination_tables)
            )
    else:
        destination_client.create_dataset_by_name(destination_dataset_name)

    for table_name in source_tables:
        try:
            # If this is the first time using BQ on this machine, bq cp will prompt for a default
            # project. Arbitrarily pick the first option by piping "1" to it. (It doesn't matter
            # which project is the default, since the project names are specified in the command.)
            echo_1_process = subprocess.Popen(['echo', '1'], stdout=subprocess.PIPE)
            subprocess.check_output(
                # --project_id is needed, in case the default project is set to something else.
                ['bq', 'cp', '--project_id', source_project,
                 '{}:{}.{}'.format(source_project, source_dataset_name, table_name),
                 '{}:{}.{}'.format(destination_project, destination_dataset_name, table_name)],
                stdin=echo_1_process.stdout
            )
        except subprocess.CalledProcessError as e:
            raise RuntimeError(
                "Command {} returned with error code {}: {}".format(e.cmd, e.returncode, e.output)) 
Example 80
Project: docker-python   Author: Kaggle   File: kaggle_gcp.py    Apache License 2.0 4 votes vote down vote up
def init_bigquery():
    from google.auth import environment_vars
    from google.cloud import bigquery

    is_proxy_token_set = "KAGGLE_DATA_PROXY_TOKEN" in os.environ
    is_user_secrets_token_set = "KAGGLE_USER_SECRETS_TOKEN" in os.environ
    if not (is_proxy_token_set or is_user_secrets_token_set):
        return bigquery

    # If this Notebook has bigquery integration on startup, preload the Kaggle Credentials
    # object for magics to work.
    if get_integrations().has_bigquery():
        from google.cloud.bigquery import magics
        magics.context.credentials = KaggleKernelCredentials()

    def monkeypatch_bq(bq_client, *args, **kwargs):
        from kaggle_gcp import get_integrations, PublicBigqueryClient, KaggleKernelCredentials
        specified_credentials = kwargs.get('credentials')
        has_bigquery = get_integrations().has_bigquery()
        # Prioritize passed in project id, but if it is missing look for env var.
        arg_project = kwargs.get('project')
        explicit_project_id = arg_project or os.environ.get(environment_vars.PROJECT)
        # This is a hack to get around the bug in google-cloud library.
        # Remove these two lines once this is resolved:
        # https://github.com/googleapis/google-cloud-python/issues/8108
        if explicit_project_id:
            Log.info(f"Explicit project set to {explicit_project_id}")
            kwargs['project'] = explicit_project_id
        if explicit_project_id is None and specified_credentials is None and not has_bigquery:
            msg = "Using Kaggle's public dataset BigQuery integration."
            Log.info(msg)
            print(msg)
            return PublicBigqueryClient(*args, **kwargs)
        else:
            if specified_credentials is None:
                Log.info("No credentials specified, using KaggleKernelCredentials.")
                kwargs['credentials'] = KaggleKernelCredentials()
                if (not has_bigquery):
                    Log.info("No bigquery integration found, creating client anyways.")
                    print('Please ensure you have selected a BigQuery '
                        'account in the Notebook Add-ons menu.')
            if explicit_project_id is None:
                Log.info("No project specified while using the unmodified client.")
                print('Please ensure you specify a project id when creating the client'
                    ' in order to use your BigQuery account.')
            kwargs['client_info'] = set_kaggle_user_agent(kwargs.get('client_info'))
            return bq_client(*args, **kwargs)

    # Monkey patches BigQuery client creation to use proxy or user-connected GCP account.
    # Deprecated in favor of Kaggle.DataProxyClient().
    # TODO: Remove this once users have migrated to that new interface.
    bq_client = bigquery.Client
    if (not has_been_monkeypatched(bigquery.Client)):
        bigquery.Client = lambda *args, **kwargs:  monkeypatch_bq(
            bq_client, *args, **kwargs)
    return bigquery
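Once init_bigquery() has swapped out bigquery.Client, notebook code does not change: the usual constructor call is routed through monkeypatch_bq, which picks between the public-dataset proxy and the user's own credentials. An illustrative usage with a trivial query (not Kaggle-specific):

from google.cloud import bigquery

client = bigquery.Client()  # intercepted by the patched constructor
rows = client.query("SELECT 1 AS x").result()
for row in rows:
    print(row.x)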