Python pandas.Int64Dtype() Examples

The following are 11 code examples of pandas.Int64Dtype(), collected from open source projects. The source file, originating project, and license are noted above each example. You may also want to check out all available functions and classes of the pandas module.
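Before the project examples, here is a minimal sketch (not taken from any of the projects below) of what pd.Int64Dtype() provides: a nullable integer column whose missing values are stored as pd.NA, whereas a plain NumPy int64 column would be silently upcast to float64.

import pandas as pd

# Nullable integer Series: the None becomes pd.NA instead of forcing
# the column to float64.
s = pd.Series([1, 2, None], dtype=pd.Int64Dtype())
print(s.dtype)         # Int64
print(s.isna().sum())  # 1

# The string alias "Int64" (capital "I") refers to the same dtype.
assert pd.Series([1, 2, None], dtype="Int64").dtype == pd.Int64Dtype()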
Example #1
Source File: converter.py    From PyAthena with MIT License
def _dtypes(self):
        if not hasattr(self, "__dtypes"):
            import pandas as pd

            self.__dtypes = {
                "tinyint": pd.Int64Dtype(),
                "smallint": pd.Int64Dtype(),
                "integer": pd.Int64Dtype(),
                "bigint": pd.Int64Dtype(),
                "float": float,
                "real": float,
                "double": float,
                "char": str,
                "varchar": str,
                "string": str,
                "array": str,
                "map": str,
                "row": str,
            }
        return self.__dtypes 
Example #2
Source File: filtering_fe_autotype.py    From dash-docs with MIT License
import sys

import pandas as pd


def table_type(df_column):
    # Note - this only works with Pandas >= 1.0.0

    if sys.version_info < (3, 0):  # Pandas 1.0.0 does not support Python 2
        return 'any'

    if isinstance(df_column.dtype, pd.DatetimeTZDtype):
        return 'datetime'
    elif (isinstance(df_column.dtype, pd.StringDtype) or
            isinstance(df_column.dtype, pd.BooleanDtype) or
            isinstance(df_column.dtype, pd.CategoricalDtype) or
            isinstance(df_column.dtype, pd.PeriodDtype)):
        return 'text'
    elif (isinstance(df_column.dtype, pd.SparseDtype) or
            isinstance(df_column.dtype, pd.IntervalDtype) or
            isinstance(df_column.dtype, pd.Int8Dtype) or
            isinstance(df_column.dtype, pd.Int16Dtype) or
            isinstance(df_column.dtype, pd.Int32Dtype) or
            isinstance(df_column.dtype, pd.Int64Dtype)):
        return 'numeric'
    else:
        return 'any' 
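A rough usage sketch for the helper above (my own illustration, not part of dash-docs; it assumes table_type and its imports are in scope): the function is applied per column when building the column definitions that dash_table.DataTable expects.

import pandas as pd

df = pd.DataFrame({
    "when": pd.to_datetime(["2020-01-01", "2020-01-02"]).tz_localize("UTC"),
    "count": pd.array([1, None], dtype="Int64"),
    "label": pd.array(["a", "b"], dtype="string"),
})

# One column definition per DataFrame column, with the detected type.
columns = [
    {"name": col, "id": col, "type": table_type(df[col])}
    for col in df.columns
]
# Detected types: 'datetime', 'numeric', 'text'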
Example #3
Source File: mysql_to_s3.py    From airflow with Apache License 2.0
def _fix_int_dtypes(self, df):
        """
        Mutate DataFrame to set dtypes for int columns containing NaN values.
        """
        for col in df:
            if "float" in df[col].dtype.name and df[col].hasnans:
                # inspect values to determine if dtype of non-null values is int or float
                notna_series = df[col].dropna().values
                if np.isclose(notna_series, notna_series.astype(int)).all():
                    # set to a dtype that retains integers and supports NaNs;
                    # note: .astype() needs an Int64Dtype *instance*, and must be
                    # called on the Series rather than on the ndarray that
                    # np.where returns
                    df[col] = np.where(df[col].isnull(), None, df[col])
                    df[col] = df[col].astype(pd.Int64Dtype())
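A standalone sketch of the same idea (my own illustration, not part of the Airflow source): an integer column that was read back as float64 because it contains NULLs can be converted to the nullable Int64 dtype without losing the missing values.

import numpy as np
import pandas as pd

# An integer column comes back from SQL as float64 because of a NULL.
df = pd.DataFrame({"order_id": [101.0, 102.0, np.nan]})
print(df["order_id"].dtype)  # float64

# Int64Dtype keeps the integer values and represents the NULL as <NA>.
df["order_id"] = df["order_id"].astype(pd.Int64Dtype())
print(df["order_id"].dtype)  # Int64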
Example #4
Source File: eia923.py    From pudl with MIT License
def get_dtypes(year, page):
        """Returns dtypes for plant id columns."""
        return {
            "Plant ID": pd.Int64Dtype(),
            "Plant Id": pd.Int64Dtype(),
        } 
Example #5
Source File: eia860.py    From pudl with MIT License
def get_dtypes(year, page):
        """Returns dtypes for plant id columns."""
        return {
            "Plant ID": pd.Int64Dtype(),
            "Plant Id": pd.Int64Dtype(),
        } 
Example #6
Source File: datasets.py    From lkpy with MIT License
def links(self):
        """
        The movie link table, connecting movie IDs to external identifiers.  It is indexed
        by movie ID.

        >>> mlsmall = MovieLens('data/ml-latest-small')
        >>> mlsmall.links
                 imdbId  tmdbId
        item
        1        114709     862
        2        113497    8844
        3        113228   15602
        4        114885   31357
        5        113041   11862
        ...
        [9125 rows x 2 columns]
        """

        fn = self.path / 'links.csv'
        links = pd.read_csv(fn, dtype={
            'movieId': np.int32,
            'imdbId': np.int64,
            'tmdbId': pd.Int64Dtype()
        })
        links.rename(columns={'movieId': 'item'}, inplace=True)
        links.set_index('item', inplace=True)
        _log.debug('loaded %s, takes %d bytes', fn, links.memory_usage().sum())
        return links 
Example #7
Source File: test_validate.py    From cjworkbench with GNU Affero General Public License v3.0
def test_nullable_int_unsupported(self):
        dataframe = pd.DataFrame(
            {
                # We don't support nullable integer columns ... yet
                "A": pd.Series([1, np.nan], dtype=pd.Int64Dtype())
            }
        )
        with self.assertRaisesRegex(ValueError, "unsupported dtype"):
            validate_dataframe(dataframe) 
Example #8
Source File: test_assets.py    From cognite-sdk-python with Apache License 2.0
def test_to_pandas_nullable_int(self):
        import pandas as pd

        for camel_case in [False, True]:
            assert (
                pd.Int64Dtype()
                == AssetList([Asset(parent_id=123), Asset(parent_id=None)]).to_pandas(camel_case=camel_case).dtypes[0]
            ) 
Example #9
Source File: test_dtypes.py    From pandera with MIT License
def test_pandas_extension_types():
    """Test pandas extension data type happy path."""
    # pylint: disable=no-member
    test_params = [
        (
            pd.CategoricalDtype(),
            pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"),
            None
        ),
        (
            pd.DatetimeTZDtype(tz='UTC'),
            pd.Series(
                pd.date_range(start="20200101", end="20200301"),
                dtype="datetime64[ns, utc]"
            ),
            None
        ),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None),
        (pd.StringDtype(), pd.Series(["foo", "bar", "baz"], dtype="string"), None),
        (
            pd.PeriodDtype(freq='D'),
            pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')),
            None
        ),
        (
            pd.SparseDtype("float"),
            pd.Series(range(100)).where(
                lambda s: s < 5, other=np.nan).astype("Sparse[float]"),
            {"nullable": True},
        ),
        (
            pd.BooleanDtype(),
            pd.Series([1, 0, 0, 1, 1], dtype="boolean"),
            None
        ),
        (
            pd.IntervalDtype(subtype="int64"),
            pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
            None,
        )
    ]
    for dtype, data, series_kwargs in test_params:
        series_kwargs = {} if series_kwargs is None else series_kwargs
        series_schema = SeriesSchema(pandas_dtype=dtype, **series_kwargs)
        assert isinstance(series_schema.validate(data), pd.Series) 
Example #10
Source File: ferc714.py    From pudl with MIT License
def electricity_planning_areas(pudl_settings):
    """Electric Planning Area geometries from HIFLD."""
    gdb_path = pathlib.Path(
        pudl_settings["data_dir"],
        "local/hifld/electric_planning_areas.gdb"
    )

    gdf = (
        geopandas.read_file(gdb_path)
        .assign(
            SOURCEDATE=lambda x: pd.to_datetime(x.SOURCEDATE),
            VAL_DATE=lambda x: pd.to_datetime(x.VAL_DATE),
            ID=lambda x: pd.to_numeric(x.ID),
            NAICS_CODE=lambda x: pd.to_numeric(x.NAICS_CODE),
            YEAR=lambda x: pd.to_numeric(x.YEAR),
        )
        # Hack to work around a geopandas issue fixed as of v0.8.0
        # https://github.com/geopandas/geopandas/issues/1366
        .assign(
            ID=lambda x: x.ID.astype(pd.Int64Dtype()),
            NAME=lambda x: x.NAME.astype(pd.StringDtype()),
            COUNTRY=lambda x: x.COUNTRY.astype(pd.StringDtype()),
            NAICS_CODE=lambda x: x.NAICS_CODE.astype(pd.Int64Dtype()),
            NAICS_DESC=lambda x: x.NAICS_DESC.astype(pd.StringDtype()),
            SOURCE=lambda x: x.SOURCE.astype(pd.StringDtype()),
            VAL_METHOD=lambda x: x.VAL_METHOD.astype(pd.StringDtype()),
            WEBSITE=lambda x: x.WEBSITE.astype(pd.StringDtype()),
            ABBRV=lambda x: x.ABBRV.astype(pd.StringDtype()),
            YEAR=lambda x: x.YEAR.astype(pd.Int64Dtype()),
            PEAK_LOAD=lambda x: x.PEAK_LOAD.astype(float),
            PEAK_RANGE=lambda x: x.PEAK_RANGE.astype(float),
            SHAPE_Length=lambda x: x.SHAPE_Length.astype(float),
            SHAPE_Area=lambda x: x.SHAPE_Area.astype(float),
        )
    )
    # Need to set these IDs b/c the HIFLD geometry uses EIA Balancing Authority
    # IDs (maybe?), while FERC 714 uses EIA Utility IDs. This isn't totally
    # resolved, and we need to figure out which set of IDs is used where.
    gdf.loc[gdf.ID == 2775, "ID"] = 229  # CAISO
    gdf.loc[gdf.ID == 59504, "ID"] = 17690  # Southwest Power Pool
    gdf.loc[gdf.ID == 14379, "ID"] = 14354  # PacifiCorp East + West
    gdf.loc[gdf.ID == 13670, "ID"] = 39347  # Northeast TX Electric Co-op
    return gdf 
Example #11
Source File: eia860.py    From pudl with MIT License
def ownership(eia860_dfs, eia860_transformed_dfs):
    """
    Pulls and transforms the ownership table.

    Args:
        eia860_dfs (dict): Each entry in this dictionary of DataFrame objects
            corresponds to a page from the EIA860 form, as reported in the
            Excel spreadsheets they distribute
        eia860_transformed_dfs (dict): A dictionary of DataFrame objects in
            which pages from EIA860 form (keys) correspond to normalized
            DataFrames of values from that page (values)

    Returns:
        dict: eia860_transformed_dfs, a dictionary of DataFrame objects in
        which pages from EIA860 form (keys) correspond to normalized
        DataFrames of values from that page (values)

    """
    o_df = (
        eia860_dfs['ownership'].copy()
        .pipe(pudl.helpers.fix_eia_na)
        .pipe(pudl.helpers.convert_to_date)
    )

    # The fix we're making here is only known to be valid for 2011 -- if we
    # get older data... then we need to revisit the cleaning function and
    # make sure it also applies to those earlier years.
    if min(o_df.report_date.dt.year) < min(pc.working_years["eia860"]):
        raise ValueError(
            f"EIA 860 transform step is only known to work for "
            f"year {min(pc.working_years['eia860'])} and later, but found data "
            f"from year {min(o_df.report_date.dt.year)}."
        )

    # Prior to 2012, ownership was reported as a percentage, rather than
    # as a proportion, so we need to divide those values by 100.
    o_df.loc[o_df.report_date.dt.year < 2012, 'fraction_owned'] = \
        o_df.loc[o_df.report_date.dt.year < 2012, 'fraction_owned'] / 100

    o_df = (
        o_df.astype({
            "owner_utility_id_eia": pd.Int64Dtype(),
            "utility_id_eia": pd.Int64Dtype(),
            "plant_id_eia": pd.Int64Dtype(),
            "owner_state": pd.StringDtype()
        })
    )

    eia860_transformed_dfs['ownership_eia860'] = o_df

    return eia860_transformed_dfs