Python pandas.read_gbq() Examples

The following are 25 code examples of pandas.read_gbq(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas, or try the search function.
Example #1
Source File: utils.py    From cloudml-samples with Apache License 2.0 6 votes vote down vote up
def read_df_from_bigquery(full_table_path, project_id=None, num_samples=None):
  """Fetch training data from a BigQuery table as a pandas DataFrame.

  Args:
    full_table_path: (string) full path of the table containing training data
      in the format of [project_id.dataset_name.table_name].
    project_id: (string, Optional) Google BigQuery Account project ID.
    num_samples: (int, Optional) Number of data samples to read.

  Returns:
    pandas.DataFrame
  """

  query = metadata.BASE_QUERY.format(table=full_table_path)
  if num_samples:
    # Only cap the result set when a sample size was requested.
    query = '{} LIMIT {}'.format(query, num_samples)

  # Application default credentials are picked up automatically; the standard
  # SQL dialect is requested explicitly.
  return pd.read_gbq(query, project_id=project_id, dialect='standard')
Example #2
Source File: test_gbq.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_roundtrip(self):
        # Round-trip check: upload a DataFrame with to_gbq, then read the row
        # count back with read_gbq and confirm nothing was lost.
        destination_table = DESTINATION_TABLE + "1"

        test_size = 20001
        df = make_mixed_dataframe_v2(test_size)

        # chunksize=10000 forces the upload to be split across requests.
        df.to_gbq(destination_table, _get_project_id(), chunksize=10000,
                  private_key=_get_private_key_path())

        sleep(30)  # <- Curses Google!!!
        # (Uploaded rows may not be immediately visible to queries, so wait.)

        result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                             .format(destination_table),
                             project_id=_get_project_id(),
                             private_key=_get_private_key_path())
        assert result['num_rows'][0] == test_size 
Example #3
Source File: weatherhistoryapp.py    From spyre with MIT License 6 votes vote down vote up
def get_data(self, type, station_ids, n_years):
        # Pull the requested weather metric for the given stations from the
        # public GSOD sample, covering the n_years window ending before 2010.
        sql_template = """
            SELECT station_number, year, month, day, {type} as value, rain, snow
            FROM `publicdata.samples.gsod`
            WHERE station_number IN ({stns})
            AND year < 2010
            AND year >= {minyr}
        """
        query = sql_template.format(
            type=type,
            stns=','.join(station_ids),
            minyr=2010 - n_years,
        )

        df = pd.read_gbq(query, project_id=PROJECT_ID, dialect='standard')
        # Collapse the year/month/day columns into a single datetime column.
        df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

        # Attach a human-readable location name for each station number.
        lookup = pd.DataFrame({
            'location': list(stations.keys()),
            'station_number': [int(v) for v in stations.values()],
        })
        return pd.merge(df, lookup, on='station_number')
Example #4
Source File: io.py    From modin with Apache License 2.0 6 votes vote down vote up
def read_gbq(
        cls,
        query: str,
        project_id=None,
        index_col=None,
        col_order=None,
        reauth=False,
        auth_local_webserver=False,
        dialect=None,
        location=None,
        configuration=None,
        credentials=None,
        use_bqstorage_api=None,
        private_key=None,
        verbose=None,
        progress_bar_type=None,
    ):
        """Defer to ``pandas.read_gbq`` and wrap the result as a Modin frame."""
        # This path has no distributed implementation, so fall back to pandas.
        ErrorMessage.default_to_pandas("`read_gbq`")
        gbq_kwargs = {
            "project_id": project_id,
            "index_col": index_col,
            "col_order": col_order,
            "reauth": reauth,
            "auth_local_webserver": auth_local_webserver,
            "dialect": dialect,
            "location": location,
            "configuration": configuration,
            "credentials": credentials,
            "use_bqstorage_api": use_bqstorage_api,
            "private_key": private_key,
            "verbose": verbose,
            "progress_bar_type": progress_bar_type,
        }
        pandas_df = pandas.read_gbq(query, **gbq_kwargs)
        return cls.from_pandas(pandas_df)
Example #5
Source File: test_gbq.py    From elasticintel with GNU General Public License v3.0 6 votes vote down vote up
def test_roundtrip(self):
        # Write a mixed-dtype frame to BigQuery, then verify the uploaded
        # row count matches via a COUNT(*) query.
        destination_table = DESTINATION_TABLE + "1"

        test_size = 20001
        df = make_mixed_dataframe_v2(test_size)

        df.to_gbq(destination_table, _get_project_id(), chunksize=10000,
                  private_key=_get_private_key_path())

        sleep(30)  # <- Curses Google!!!

        count_sql = ("SELECT COUNT(*) AS num_rows FROM {0}"
                     .format(destination_table))
        result = pd.read_gbq(count_sql,
                             project_id=_get_project_id(),
                             private_key=_get_private_key_path())
        assert result['num_rows'][0] == test_size
Example #6
Source File: samples_test.py    From python-docs-samples with Apache License 2.0 6 votes vote down vote up
def test_pandas_gbq_query():
    # Documentation sample: run a standard-SQL query with pandas-gbq, first
    # against the environment's default project, then with an explicit one.
    # The [START]/[END] markers delimit the published snippet regions.
    # [START bigquery_migration_pandas_gbq_query]
    import pandas

    sql = """
        SELECT name
        FROM `bigquery-public-data.usa_names.usa_1910_current`
        WHERE state = 'TX'
        LIMIT 100
    """

    # Run a Standard SQL query using the environment's default project
    df = pandas.read_gbq(sql, dialect='standard')

    # Run a Standard SQL query with the project set explicitly
    project_id = 'your-project-id'
    # [END bigquery_migration_pandas_gbq_query]
    assert len(df) > 0
    # The placeholder project id above is only for the published snippet;
    # the actual test uses the project from the environment.
    project_id = os.environ['GOOGLE_CLOUD_PROJECT']
    # [START bigquery_migration_pandas_gbq_query]
    df = pandas.read_gbq(sql, project_id=project_id, dialect='standard')
    # [END bigquery_migration_pandas_gbq_query]
    assert len(df) > 0 
Example #7
Source File: compute_vessel_metrics.py    From vessel-classification with Apache License 2.0 6 votes vote down vote up
def load_class_weights(inference_table):
    """Return a {label: fraction} mapping over the inferred vessel labels."""
    query = '''
        with

        core as (
        select * from `{}*`
        where max_label is not null
        ),

        count as (
        select count(*) as total from core
        )
        select max_label as label, count(*) / total as fraction
        from core
        cross join count
        group by label, total
        order by fraction desc
    '''.format(inference_table)
    frame = pd.read_gbq(query, project_id='world-fishing-827', dialect='standard')
    # Pair the label and fraction columns into a plain dict.
    return dict(zip(frame['label'], frame['fraction']))
Example #8
Source File: compute_vessel_metrics.py    From vessel-classification with Apache License 2.0 6 votes vote down vote up
def load_inferred(inference_table, label_table, extractors):
    """Load inferred data and generate comparison data

    Joins labelled test-split vessels to the inference output tables
    (matched on ssvid) and feeds each resulting row to every extractor,
    then finalizes the extractors.  Results accumulate inside the
    *extractors* objects; nothing is returned.
    """
    query = """

    SELECT inference_table.* except (ssvid), ssvid as id FROM 
    `{}` label_table
    JOIN
   `{}*` inference_table
    ON (cast(label_table.id as string) = inference_table.ssvid)
    where split = "Test"
    """.format(label_table, inference_table)
    print(query)
    df = pd.read_gbq(query, project_id='world-fishing-827', dialect='standard')

    # Push every joined row through each extractor, then let the extractors
    # compute their summaries.
    for row in df.itertuples():
        for ext in extractors:
            ext.extract(row)
    for ext in extractors:
        ext.finalize() 
Example #9
Source File: samples_test.py    From python-docs-samples with Apache License 2.0 6 votes vote down vote up
def test_pandas_gbq_query_with_parameters():
    # Documentation sample: pass named query parameters to pandas-gbq via the
    # `configuration` dict instead of string-formatting them into the SQL.
    # [START bigquery_migration_pandas_gbq_query_parameters]
    import pandas

    sql = """
        SELECT name
        FROM `bigquery-public-data.usa_names.usa_1910_current`
        WHERE state = @state
        LIMIT @limit
    """
    query_config = {
        'query': {
            'parameterMode': 'NAMED',
            'queryParameters': [
                {
                    'name': 'state',
                    'parameterType': {'type': 'STRING'},
                    'parameterValue': {'value': 'TX'}
                },
                {
                    'name': 'limit',
                    'parameterType': {'type': 'INTEGER'},
                    'parameterValue': {'value': 100}
                }
            ]
        }
    }

    df = pandas.read_gbq(sql, configuration=query_config)
    # [END bigquery_migration_pandas_gbq_query_parameters]
    assert len(df) > 0 
Example #10
Source File: utils.py    From professional-services with Apache License 2.0 6 votes vote down vote up
def read_df_from_bigquery(full_table_path, project_id=None, num_samples=None):
  """Read data from BigQuery and split into train and validation sets.

  Args:
    full_table_path: (string) full path of the table containing training data
      in the format of [project_id.dataset_name.table_name].
    project_id: (string, Optional) Google BigQuery Account project ID.
    num_samples: (int, Optional) Number of data samples to read.

  Returns:
    pandas.DataFrame
  """

  query = metadata.BASE_QUERY.format(table=full_table_path)
  if num_samples:
    # Append a LIMIT clause only when a sample size was given.
    query += ' LIMIT {}'.format(num_samples)

  # Credentials come from the application default; use the standard SQL
  # dialect for the query.
  return pd.read_gbq(query, project_id=project_id, dialect='standard')
Example #11
Source File: bigquery.py    From openprescribing with MIT License 5 votes vote down vote up
def query_into_dataframe(self, sql, legacy=False):
        """Run *sql* against BigQuery and return the result as a DataFrame."""
        sql = interpolate_sql(sql)
        # Legacy SQL only on request; otherwise default to standard SQL.
        dialect = "legacy" if legacy else "standard"
        with exception_sql_printer(sql):
            return pd.read_gbq(sql, project_id=self.project, dialect=dialect)
Example #12
Source File: test_gbq.py    From recruit with Apache License 2.0 5 votes vote down vote up
def test_read_gbq_without_dialect_warns_future_change(monkeypatch):
    # The default SQL dialect is changing to standard SQL; calling read_gbq
    # without an explicit dialect should produce a FutureWarning.
    # https://github.com/pydata/pandas-gbq/issues/195

    # Stub pandas_gbq.read_gbq so no network request is made.
    def fake_read_gbq(*args, **kwargs):
        return DataFrame([[1.0]])

    monkeypatch.setattr(pandas_gbq, 'read_gbq', fake_read_gbq)
    with tm.assert_produces_warning(FutureWarning):
        pd.read_gbq("SELECT 1")
Example #13
Source File: compute_fishing_metrics.py    From vessel-classification with Apache License 2.0 5 votes vote down vote up
def load_inferred_fishing(table, id_list, project_id, threshold=True):
    """Load inferred data and generate comparison data

    Queries one year at a time (2012-2018) from the dated inference tables
    and collects, per vessel id, a list of FishingRange(score, start, end)
    entries.  With threshold=True the nnet score is binarised at 0.5.
    """
    query_template = """
    SELECT vessel_id as id, start_time, end_time, nnet_score FROM 
        TABLE_DATE_RANGE([{table}],
            TIMESTAMP('{year}-01-01'), TIMESTAMP('{year}-12-31'))
        WHERE vessel_id in ({ids})
    """
    # Quote each id for the SQL IN (...) clause.
    ids = ','.join('"{}"'.format(x) for x in id_list)
    ranges = defaultdict(list)
    for year in range(2012, 2019):
        query = query_template.format(table=table, year=year, ids=ids)
        try:
            df = pd.read_gbq(query, project_id=project_id, dialect='legacy')
        except pandas_gbq.gbq.GenericGBQException as err:
            # Some years have no inference table; skip those, but surface any
            # other BigQuery error together with the offending query.
            if 'matches no table' in err.args[0]:
                print('skipping', year)
                continue
            else:
                print(query)
                raise
        for x in df.itertuples():
            score = x.nnet_score
            if threshold:
                # Binarise: scores above 0.5 count as fishing.
                score = score > 0.5
            # Returned timestamps are made timezone-aware as UTC.
            start = x.start_time.replace(tzinfo=pytz.utc)
            end = x.end_time.replace(tzinfo=pytz.utc)
            ranges[x.id].append(FishingRange(score, start, end))
    return ranges 
Example #14
Source File: create_train_info.py    From vessel-classification with Apache License 2.0 5 votes vote down vote up
def read_vessel_database_for_char_mmsi(dbname, dataset):
    """Load characterization training data keyed by ssvid (MMSI).

    Selects per-vessel features (length, tonnage, engine power, crew, gear
    label) from *dbname*, excluding ssvids that appear more than once and
    rows with no usable feature at all.  Note `dataset` is not referenced
    by this query; the parameter matches the sibling reader signatures.
    """
    # NOTE: .format(**locals()) interpolates {dbname} into the SQL below.
    query = '''
      with multi_id as (
        select identity.ssvid as id
        from {dbname}
        group by id
        having count(identity.ssvid) > 1
      )
      
      select identity.ssvid as id, 
             feature.length_m as length, 
             feature.tonnage_gt as tonnage, 
             feature.engine_power_kw as engine_power, 
             feature.crew as crew_size,
             array_to_string(feature.geartype, '|') as label
      from {dbname} a
      where (feature.length_m is not null or 
            feature.tonnage_gt is not null or 
            feature.engine_power_kw is not null or 
            feature.crew is not null or
            (feature.geartype is not null and array_length(feature.geartype) > 0)) and
            identity.ssvid not in (select * from multi_id)
            order by id
    '''.format(**locals())
    try:
        return pd.read_gbq(query, dialect='standard', project_id='world-fishing-827')
    except:
        # Print the failing query before re-raising to aid debugging.
        print(query)
        raise 
Example #15
Source File: query.py    From subreddit-generator with MIT License 5 votes vote down vote up
def get_reddit_data(project_id, subreddits, start_month, end_month, max_posts):
    """Fetch the top-scored post titles per subreddit from BigQuery.

    Ranks posts by score within each subreddit over the table-suffix range
    [start_month, end_month] and keeps at most *max_posts* per subreddit.
    Returns a DataFrame with `title` and `context_label` (the subreddit).
    """
    query = '''
    # standardSQL
    SELECT
    title,
    subreddit AS context_label
    FROM (
    SELECT
        title,
        subreddit,
        ROW_NUMBER() OVER (PARTITION BY subreddit ORDER BY score DESC)
        AS rank_num
    FROM
        `fh-bigquery.reddit_posts.*`
    WHERE
        _TABLE_SUFFIX BETWEEN "{}" AND "{}"
        AND LOWER(subreddit) IN ({})
        )
    WHERE
    rank_num <= {}
    '''

    # str([...])[1:-1] renders the list as a comma-separated, quoted sequence
    # suitable for the SQL IN (...) clause.
    query = query.format(start_month,
                         end_month,
                         str([x.lower() for x in subreddits])[1:-1],
                         max_posts)

    df = pd.read_gbq(query, project_id, dialect='standard')
    return df 
Example #16
Source File: samples_test.py    From python-docs-samples with Apache License 2.0 5 votes vote down vote up
def test_pandas_gbq_legacy_query():
    # Documentation sample: legacy SQL uses [project:dataset.table] bracket
    # syntax together with dialect='legacy'.
    # [START bigquery_migration_pandas_gbq_query_legacy]
    import pandas

    sql = """
        SELECT name
        FROM [bigquery-public-data:usa_names.usa_1910_current]
        WHERE state = 'TX'
        LIMIT 100
    """

    df = pandas.read_gbq(sql, dialect='legacy')
    # [END bigquery_migration_pandas_gbq_query_legacy]
    assert len(df) > 0 
Example #17
Source File: samples_test.py    From python-docs-samples with Apache License 2.0 5 votes vote down vote up
def test_pandas_gbq_query_bqstorage():
    # Documentation sample: enable the BigQuery Storage API download path via
    # the use_bqstorage_api flag.
    # [START bigquery_migration_pandas_gbq_query_bqstorage]
    import pandas

    sql = "SELECT * FROM `bigquery-public-data.irs_990.irs_990_2012`"

    # Use the BigQuery Storage API to download results more quickly.
    df = pandas.read_gbq(sql, dialect='standard', use_bqstorage_api=True)
    # [END bigquery_migration_pandas_gbq_query_bqstorage]
    assert len(df) > 0 
Example #18
Source File: test_gbq.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_roundtrip(self):
        # Upload a mixed-dtype frame in a single chunk, then confirm the
        # destination table holds exactly the number of rows sent.
        destination_table = DESTINATION_TABLE + "1"

        test_size = 20001
        frame = make_mixed_dataframe_v2(test_size)

        frame.to_gbq(destination_table, _get_project_id(), chunksize=None,
                     credentials=_get_credentials())

        count_sql = ("SELECT COUNT(*) AS num_rows FROM {0}"
                     .format(destination_table))
        result = pd.read_gbq(count_sql,
                             project_id=_get_project_id(),
                             credentials=_get_credentials(),
                             dialect="standard")
        assert result['num_rows'][0] == test_size
Example #19
Source File: test_gbq.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_read_gbq_without_dialect_warns_future_change(monkeypatch):
    # Default dialect is changing to standard SQL. See:
    # https://github.com/pydata/pandas-gbq/issues/195

    # Replace pandas_gbq.read_gbq with a stub so no network call happens;
    # the test only checks that a FutureWarning is produced.
    def mock_read_gbq(*args, **kwargs):
        return DataFrame([[1.0]])

    monkeypatch.setattr(pandas_gbq, 'read_gbq', mock_read_gbq)
    with tm.assert_produces_warning(FutureWarning):
        pd.read_gbq("SELECT 1") 
Example #20
Source File: utils.py    From bigquery-bokeh-dashboard with Apache License 2.0 5 votes vote down vote up
def _run(query, dialect='legacy'):
    """Execute *query* with pandas-gbq using environment-provided credentials."""
    project = os.environ['GOOGLE_PROJECT_ID']
    key_path = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
    return pd.read_gbq(query,
                       project_id=project,
                       private_key=key_path,
                       dialect=dialect)
Example #21
Source File: test_gbq.py    From recruit with Apache License 2.0 5 votes vote down vote up
def test_roundtrip(self):
        # Round-trip: upload a frame with to_gbq, then verify the row count
        # with a COUNT(*) read_gbq query against the same table.
        destination_table = DESTINATION_TABLE + "1"

        test_size = 20001
        df = make_mixed_dataframe_v2(test_size)

        # chunksize=None uploads the frame in a single request.
        df.to_gbq(destination_table, _get_project_id(), chunksize=None,
                  credentials=_get_credentials())

        result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                             .format(destination_table),
                             project_id=_get_project_id(),
                             credentials=_get_credentials(),
                             dialect="standard")
        assert result['num_rows'][0] == test_size 
Example #22
Source File: create_train_info.py    From vessel-classification with Apache License 2.0 4 votes vote down vote up
def read_vessel_database_for_char_vessel_id(dbname, dataset):
    query = '''
        with 

        core as (
          select vessel_id as id, 
                 feature.length_m as length, 
                 feature.tonnage_gt as tonnage, 
                 feature.engine_power_kw as engine_power, 
                 feature.geartype as label, 
                 feature.crew as crew_size,
                 array_to_string(feature.geartype, '|') as lbl,
                 pos_count
          from (select * from {dbname} cross join unnest (activity)) a
          join `{dataset}.vessel_info` b
          on (a.identity.ssvid = b.ssvid)
          where 
           (not ((a.last_timestamp is not null and 
                  a.last_timestamp < b.first_timestamp) or 
                 (a.first_timestamp is not null and
                  a.first_timestamp > b.last_timestamp)))
            and (feature.length_m is not null or feature.tonnage_gt is not null or 
                feature.engine_power_kw is not null or feature.geartype is not null
                --or feature.crew is not null
                )
        ),

        counted as (
          select * except(pos_count, label, lbl), lbl as label, sum(pos_count) as count
          from core 
          group by id, length, tonnage, 
                 engine_power, label, crew_size
        ),

        ordered as (
          select *, 
                 row_number() over (partition by id 
                        order by count desc, label,
                        length, tonnage, engine_power, 
                        crew_size) as rk
          from counted
        )
            
        select * except(rk, count) from ordered
        where rk = 1
    '''.format(**locals())
    try:
        return pd.read_gbq(query, dialect='standard', project_id='world-fishing-827')
    except:
        print(query)
        raise 
Example #23
def read_vessel_database_for_detect_vessel_id(dbname, fishdbname, dataset):
    """Load detection training data keyed by vessel_id.

    Restricts database activity records to vessels that also appear in the
    fishing-range table (built by fishing_range_vessel_id) and whose valid
    times overlap both the segment info and the fishing ranges, then
    aggregates features per vessel id.
    """
    fishing_range_query=fishing_range_vessel_id(fishdbname, dataset)
    # NOTE: .format(**locals()) fills {fishing_range_query}, {dbname} and
    # {dataset} in the SQL below.
    query = '''
        with 

        fishing_range_vessel_id as {fishing_range_query},

        core as (
          select vessel_id as id, feature.length_m as length, feature.tonnage_gt as tonnage, 
                 feature.engine_power_kw as engine_power, 
                 array_to_string(feature.geartype, '|') as label,
                 feature.crew as crew_size,
                 pos_count, transit_only
          from (select * from {dbname} cross join unnest (activity)) a
         join `{dataset}.segment_info` b
            on (cast(a.identity.ssvid as string) = ssvid)
          join `fishing_range_vessel_id` c
          using (vessel_id)
          where -- valid times overlap with segments
           (not ((a.last_timestamp is not null and 
                 a.last_timestamp < b.first_timestamp) or 
                 (a.first_timestamp is not null and
                 a.first_timestamp > b.last_timestamp)))
            and -- valid times overlaps with fishing ranges
           (not ((a.last_timestamp is not null and 
                  a.last_timestamp < c.first_timestamp) or 
                 (a.first_timestamp is not null and 
                  a.first_timestamp > c.last_timestamp))) 
        ),

        counted as (
          select * except(pos_count, transit_only, length, tonnage, engine_power,
                          label, crew_size), 
                    sum(pos_count) as count,
                   min(transit_only) as transit_only,
                   avg(length) as length, avg(tonnage) as tonnage, avg(engine_power) as engine_power,
                   any_value(label) as label, avg(crew_size) as crew_size
          from core 
          group by id
        ),     

        ordered as (
          select *, 
                 row_number() over (partition by id 
                        order by count desc, label,
                        length, tonnage, engine_power, 
                        crew_size) as rk
          from counted
        )
            
        select * except(rk, count) from ordered
    '''.format(**locals())
    try:
        return pd.read_gbq(query, dialect='standard', project_id='world-fishing-827')
    except:
        # Print the failing query before re-raising to aid debugging.
        print(query)
        raise 
Example #24
Source File: create_train_info.py    From vessel-classification with Apache License 2.0 4 votes vote down vote up
def read_vessel_database_for_detect_mmsi(dbname, fishdbname, dataset):
    """Load detection training data keyed by ssvid (MMSI).

    Restricts registry records to mmsis that appear in the fishing-range
    table (built by fishing_range_mmsi) with overlapping activity periods,
    then aggregates features per ssvid.
    """
    fishing_range_query=fishing_range_mmsi(fishdbname, dataset)
    # NOTE: .format(**locals()) fills {fishing_range_query} and {dbname}
    # in the SQL below.
    query = '''
        with 

        fishing_range_mmsi as {fishing_range_query},

        core as (
          select identity.ssvid as id, length_m as length, tonnage_gt as tonnage, 
                 engine_power_kw as engine_power, 
                 geartype as label, crew as crew_size,
                 confidence, -- 1==low, 2==typical, 3==high
                 (select sum(messages) from unnest(activity)) as pos_count, transit_only
          from {dbname} a
          cross join unnest (registry)
          join `fishing_range_mmsi` c
          on (cast(mmsi as string) = identity.ssvid)
          where  -- valid times overlaps with fishing ranges
          -- TODO: should do each activity period separately here.
           (not (((select min(last_timestamp) from unnest(activity)) < c.first_timestamp) or 
                 ((select min(first_timestamp) from unnest(activity)) > c.last_timestamp))) 
        ),

        counted as (
          select * except(pos_count, confidence, transit_only, length, tonnage, engine_power,
                          label, crew_size), 
                    sum(pos_count) as count,
                   avg(confidence) as confidence, min(transit_only) as transit_only,
                   avg(length) as length, avg(tonnage) as tonnage, avg(engine_power) as engine_power,
                   any_value(label) as label, avg(crew_size) as crew_size
          from core 
          group by id
        ),     

        ordered as (
          select *, 
                 row_number() over (partition by id 
                        order by count desc, label,
                        length, tonnage, engine_power, 
                        crew_size, confidence) as rk
          from counted
        )
            
        select * except(rk, count) from ordered
    '''.format(**locals())
    try:
        return pd.read_gbq(query, dialect='standard', project_id='world-fishing-827')
    except:
        # Print the failing query before re-raising to aid debugging.
        print(query)
        raise 
Example #25
Source File: create_train_info.py    From vessel-classification with Apache License 2.0 4 votes vote down vote up
def read_fishing_ranges_vessel_id(fishdbname, dataset):
    """Load fishing ranges keyed by vessel_id.

    Maps mmsi-keyed fishing ranges in *fishdbname* to vessel ids via
    `{dataset}.vessel_info`, keeping only vessels whose info window
    overlaps the fishing ranges, and resolving duplicate ranges per id by
    the largest position count.
    """
    query = '''
        with 

        fishing_ranges as {fishing_ranges},

        core as (
          select mmsi as ssvid, vessel_id as id, 
                 start_time, end_time, is_fishing, pos_count
          from
          `{fishdbname}` a
          join
          `{dataset}.vessel_info` b
          on (a.mmsi = cast(ssvid as int64))
          join `fishing_ranges` c
          using (vessel_id)
          where 
            (not (b.last_timestamp <  c.first_timestamp or 
                  b.first_timestamp > c.last_timestamp))
        ),

        counted as (
          select id, start_time, end_time, is_fishing, sum(pos_count) as count
          from core 
          group by id, start_time, end_time, is_fishing
        ),
            
        ordered as (
          select *, 
                 row_number() over (partition by id, start_time, end_time
                                     order by count desc) as rk
          from counted
        )
            
        select * except(rk, count) from ordered
        where rk = 1
    '''.format(fishdbname=fishdbname,
               dataset=dataset, 
               fishing_ranges=fishing_range_vessel_id(fishdbname, dataset))
    try:
        return pd.read_gbq(query, dialect='standard', project_id='world-fishing-827')
    except:
        # Print the failing query before re-raising to aid debugging.
        print(query)
        raise