Python pandas.NA Examples

The following are 28 code examples of pandas.NA, the scalar missing-value sentinel introduced in pandas 1.0. Each example is drawn from an open-source project; the source file and license are noted above each snippet.
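
A minimal sketch of the basics first (assuming pandas >= 1.0, where pd.NA was introduced). Unlike np.nan, pd.NA propagates through comparisons and refuses coercion to bool:

import pandas as pd

print(pd.NA + 1)       # <NA> -- NA propagates through arithmetic
print(pd.NA == pd.NA)  # <NA> -- comparisons return NA, not True/False
print(pd.isna(pd.NA))  # True -- test for missingness with pd.isna()

try:
    bool(pd.NA)
except TypeError as err:
    print(err)         # boolean value of NA is ambiguous
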
Example #1
Source File: pandas_transformer.py    From kgx with BSD 3-Clause "New" or "Revised" License
def is_null(item: Any) -> bool:
        """
        Checks if a given item is null or corresponds to null.

        This method checks for: None, numpy.nan, pandas.NA,
        pandas.NaT, "", and " "

        Parameters
        ----------
        item: Any
            The item to check

        Returns
        -------
        bool
            Whether the given item is null or not

        """
        null_values = {np.nan, pd.NA, pd.NaT, None, "", " "}
        return item in null_values 
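
Note that the set-membership check relies on hashing and identity: it raises TypeError for unhashable items and misses NaN objects other than the np.nan singleton. A more robust sketch (is_null_robust is a hypothetical helper, not part of kgx) leans on pd.isna:

import numpy as np
import pandas as pd
from typing import Any

def is_null_robust(item: Any) -> bool:
    """Hypothetical variant: treat None, NaN, pd.NA, pd.NaT and blank strings as null."""
    if isinstance(item, str):
        return item.strip() == ""
    # pd.isna handles None, float NaN, np.nan, pd.NA and pd.NaT;
    # for array-likes it returns an array, so require all elements null.
    return bool(np.all(pd.isna(item)))
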
Example #2
Source File: helpers.py    From pudl with MIT License
def add_fips_ids(df, state_col="state", county_col="county", vintage=2015):
    """Add State and County FIPS IDs to a dataframe."""
    af = addfips.AddFIPS(vintage=vintage)
    # Lookup the state and county FIPS IDs and add them to the dataframe:
    df["state_id_fips"] = df.apply(
        lambda x: af.get_state_fips(state=x.state), axis=1)
    logger.info(
        f"Assigned state FIPS codes for "
        f"{len(df[df.state_id_fips.notnull()])/len(df):.2%} of records."
    )
    df["county_id_fips"] = df.apply(
        lambda x: af.get_county_fips(state=x.state, county=x.county), axis=1)
    df["county_id_fips"] = df.county_id_fips.fillna(pd.NA)
    logger.info(
        f"Assigned county FIPS codes for "
        f"{len(df[df.county_id_fips.notnull()])/len(df):.2%} of records."
    )
    return df 
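
A quick usage sketch (assumes the addfips package is installed and the module's logger is configured; the rows are illustrative):

import pandas as pd

df = pd.DataFrame({"state": ["CA", "TX"], "county": ["Alameda", "Travis"]})
df = add_fips_ids(df)
print(df["state_id_fips"].tolist())  # e.g. ['06', '48']
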
Example #3
Source File: helpers.py    From pudl with MIT License
def fix_eia_na(df):
    """
    Replace common ill-posed EIA NA spreadsheet values with np.nan.

    Args:
        df (pandas.DataFrame): The DataFrame to clean.

    Returns:
        pandas.DataFrame: The cleaned DataFrame.

    Todo:
        Update docstring.

    """
    return df.replace(to_replace=[r'^\.$', r'^\s$', r'^$'],
                      value=np.nan, regex=True) 
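
The three regexes match a lone ".", a single whitespace character, and the empty string -- the ill-posed NA spellings found in EIA spreadsheets. A toy round trip, assuming the function above is in scope:

import pandas as pd

df = pd.DataFrame({"fuel_mmbtu": ["100", ".", " ", ""]})
print(fix_eia_na(df))
# fuel_mmbtu: "100", NaN, NaN, NaN
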
Example #4
Source File: base.py    From fletcher with MIT License
def __or__(self, other):
        """Compute vectorised or."""
        if not pa.types.is_boolean(self.dtype.arrow_dtype):
            raise NotImplementedError("__or__ is currently only supported for boolean arrays")

        if other is pd.NA or (pd.api.types.is_scalar(other) and pd.isna(other)):
            # All fields that are True stay True, all others get set to NA
            return type(self)(or_na(self.data))
        elif isinstance(other, bool):
            if other:
                # or with True yields all-True
                return type(self)(all_true(self.data))
            else:
                return self
        else:
            if isinstance(other, FletcherBaseArray):
                other = other.data
            return type(self)(or_vectorised(self.data, other)) 
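
This mirrors the Kleene logic used by pandas' own nullable BooleanArray: True | NA is True (the result cannot change), while False | NA is NA (unknown). The built-in equivalent:

import pandas as pd

arr = pd.array([True, False, None], dtype="boolean")
print(arr | pd.NA)   # [True, <NA>, <NA>]
print(arr | True)    # [True, True, True]
print(arr | False)   # [True, False, <NA>]
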
Example #5
Source File: test_boolean.py    From fletcher with MIT License
def test_or(fletcher_array):
    # Scalar versions
    # non-null versions
    result = fletcher_array([True, False]) | pd.NA
    expected = fletcher_array([True, None])
    pdt.assert_extension_array_equal(result, expected)

    result = fletcher_array([True, False, None]) | pd.NA
    expected = fletcher_array([True, None, None])
    pdt.assert_extension_array_equal(result, expected)

    result = fletcher_array([True, False, None]) | True
    expected = fletcher_array([True, True, True])
    pdt.assert_extension_array_equal(result, expected)

    result = fletcher_array([True, False, None]) | False
    expected = fletcher_array([True, False, None])
    pdt.assert_extension_array_equal(result, expected)

    # Array version
    # Non-null version
    result = fletcher_array([True, False, False]) | fletcher_array([False, True, False])
    expected = fletcher_array([True, True, False])
    pdt.assert_extension_array_equal(result, expected)
    # One has nulls, the other not
    result = fletcher_array([True, False, None, None]) | fletcher_array(
        [False, True, False, True]
    )
    expected = fletcher_array([True, True, None, True])
    pdt.assert_extension_array_equal(result, expected)
    # Both have nulls
    result = fletcher_array([True, False, None, None]) | fletcher_array(
        [None, True, False, True]
    )
    pdt.assert_extension_array_equal(result, expected)

    result = fletcher_array([True, False, None, None]) | np.array(
        [False, True, False, True]
    )
    pdt.assert_extension_array_equal(result, expected) 
Example #6
Source File: test_pandas_conversions.py    From rpy2 with GNU General Public License v2.0
def test_timeR2Pandas(self):
        tzone = robjects.vectors.get_timezone()
        dt = [datetime(1960, 5, 2),
              datetime(1970, 6, 3), 
              datetime(2012, 7, 1)]
        dt = [x.replace(tzinfo=tzone) for x in dt]
        # Convert to POSIX timestamps (seconds since the epoch).
        ts = [x.timestamp() for x in dt]
        # Create an R POSIXct vector.
        r_time = robjects.baseenv['as.POSIXct'](
            rinterface.FloatSexpVector(ts),
            origin=rinterface.StrSexpVector(('1970-01-01',))
        )

        # Convert R POSIXct vector to pandas-compatible vector
        with localconverter(default_converter + rpyp.converter) as cv:
            py_time = robjects.conversion.rpy2py(r_time)

        # Check that the round trip did not introduce changes
        for expected, obtained in zip(dt, py_time):
            assert expected == obtained.to_pydatetime()

        # Try with NA.
        r_time[1] = rinterface.na_values.NA_Real
        # Convert R POSIXct vector to pandas-compatible vector
        with localconverter(default_converter + rpyp.converter) as cv:
            py_time = robjects.conversion.rpy2py(r_time)

        assert py_time[1] is pandas.NaT 
Example #7
Source File: pandas2ri.py    From rpy2 with GNU General Public License v2.0
def _int_populate_r_vector(iterable, r_vector,
                           set_elt,
                           cast_value):
    for i, v in enumerate(iterable):
        if v is None or v is pandas.NA:
            v = math.nan
        set_elt(r_vector, i, cast_value(v)) 
Example #8
Source File: pandas2ri.py    From rpy2 with GNU General Public License v2.0
def _str_populate_r_vector(iterable, r_vector,
                           set_elt,
                           cast_value):
    for i, v in enumerate(iterable):
        if (
                v is None
                or
                v is pandas.NA
                or
                (isinstance(v, float) and math.isnan(v))
        ):
            v = na_values.NA_Character
        set_elt(r_vector, i, cast_value(v)) 
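
Both rpy2 helpers share one pattern: translate the Python-side missing markers (None, pandas.NA, float NaN) into the target NA representation before copying element-wise. Distilled into a standalone sketch (normalize_missing is an illustrative name):

import math
import pandas as pd

def normalize_missing(value, na_marker):
    """Map None, pd.NA and float NaN to na_marker; pass other values through."""
    if value is None or value is pd.NA:
        return na_marker
    if isinstance(value, float) and math.isnan(value):
        return na_marker
    return value

print(normalize_missing(pd.NA, "NA"))         # NA
print(normalize_missing(float("nan"), "NA"))  # NA
print(normalize_missing("x", "NA"))           # x
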
Example #9
Source File: series.py    From modin with Apache License 2.0
def update(self, other):
        """
        Modify Series in place using non-NA values from the passed
        Series. Aligns on index.

        Parameters
        ----------
        other : Series, or object coercible into Series
        """
        if not isinstance(other, Series):
            other = Series(other)
        query_compiler = self._query_compiler.series_update(other._query_compiler)
        self._update_inplace(new_query_compiler=query_compiler) 
Example #10
Source File: series.py    From modin with Apache License 2.0
def argmin(self, axis=None, skipna=True, *args, **kwargs):
        result = self.idxmin(axis=axis, skipna=skipna, *args, **kwargs)
        # Check against pandas.NA first: np.isnan(pd.NA) returns pd.NA,
        # which cannot be evaluated in a boolean context.
        if result is pandas.NA or np.isnan(result):
            result = -1
        return result 
Example #11
Source File: series.py    From modin with Apache License 2.0
def argmax(self, axis=None, skipna=True, *args, **kwargs):
        result = self.idxmax(axis=axis, skipna=skipna, *args, **kwargs)
        # Check against pandas.NA first: np.isnan(pd.NA) returns pd.NA,
        # which cannot be evaluated in a boolean context.
        if result is pandas.NA or np.isnan(result):
            result = -1
        return result 
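
The check order above matters because pd.NA propagates through NumPy ufuncs instead of returning False; a quick check:

import numpy as np
import pandas as pd

print(np.isnan(np.nan))  # True
print(np.isnan(pd.NA))   # <NA> -- the ufunc propagates NA

try:
    if np.isnan(pd.NA) or (pd.NA is pd.NA):
        pass
except TypeError as err:
    print(err)  # boolean value of NA is ambiguous
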
Example #12
Source File: util.py    From holoviews with BSD 3-Clause "New" or "Revised" License
def isfinite(val):
    """
    Helper function to determine if scalar or array value is finite extending
    np.isfinite with support for None, string, datetime types.
    """
    is_dask = is_dask_array(val)
    if not np.isscalar(val) and not is_dask:
        val = asarray(val, strict=False)

    if val is None:
        return False
    elif is_dask:
        import dask.array as da
        return da.isfinite(val)
    elif isinstance(val, np.ndarray):
        if val.dtype.kind == 'M':
            return ~isnat(val)
        elif val.dtype.kind == 'O':
            return np.array([isfinite(v) for v in val], dtype=bool)
        elif val.dtype.kind in 'US':
            return ~pd.isna(val) if pd else np.ones_like(val, dtype=bool)
        finite = np.isfinite(val)
        if pd and pandas_version >= '1.0.0':
            finite &= ~pd.isna(val)
        return finite
    elif isinstance(val, datetime_types+timedelta_types):
        return not isnat(val)
    elif isinstance(val, (basestring, bytes)):
        return True
    finite = np.isfinite(val)
    if pd and pandas_version >= '1.0.0':
        if finite is pd.NA:
            return False
        return finite & (~pd.isna(val))
    return finite 
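
The object- and string-dtype branches exist because np.isfinite raises on non-numeric input while pd.isna handles it, for example:

import numpy as np
import pandas as pd

obj = np.array(["a", None], dtype=object)
try:
    np.isfinite(obj)
except TypeError as err:
    print(err)        # isfinite is not supported for object arrays

print(pd.isna(obj))   # [False  True]
print(pd.isna(pd.NA)) # True
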
Example #13
Source File: test_s3.py    From aws-data-wrangler with Apache License 2.0
def test_to_parquet_file_dtype(path):
    df = pd.DataFrame({"c0": [1.0, None, 2.0], "c1": [pd.NA, pd.NA, pd.NA]})
    file_path = f"{path}0.parquet"
    wr.s3.to_parquet(df, file_path, dtype={"c0": "bigint", "c1": "string"})
    wr.s3.wait_objects_exist(paths=[file_path])
    df2 = wr.s3.read_parquet(file_path)
    assert df2.shape == df.shape
    assert df2.c0.sum() == 3
    assert str(df2.c0.dtype) == "Int64"
    assert str(df2.c1.dtype) == "string" 
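
The assertions rest on the fact that pandas' nullable "Int64" and "string" dtypes use pd.NA as their missing-value scalar and skip it in reductions. In plain pandas:

import pandas as pd

s = pd.Series([1, None, 2], dtype="Int64")
print(s.dtype)        # Int64
print(s[1] is pd.NA)  # True
print(s.sum())        # 3 -- NA is skipped by default
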
Example #14
Source File: test_boolean.py    From fletcher with MIT License
def test_np_any(fletcher_array):
    arr = fletcher_array([True, False, None])
    assert np.any(arr)

    arr = fletcher_array([True, False, True])
    assert np.any(arr)

    # TODO(pandas-0.26): Uncomment this when BooleanArray landed.
    #   Then we change the behaviour.
    # arr = fr.FletcherChunkedArray([False, False, None])
    # assert np.any(arr) is pd.NA

    arr = fletcher_array([False, False, False])
    assert not np.any(arr) 
Example #15
Source File: test_pandas_extension.py    From fletcher with MIT License
def data_missing_for_sorting(fletcher_type, fletcher_array):
    """Length-3 array with a known sort order.

    This should be three items [B, NA, A] with
    A < B and NA missing.
    """
    return fletcher_array(
        fletcher_type.data_missing_for_sorting, dtype=fletcher_type.dtype
    ) 
Example #16
Source File: test_pandas_extension.py    From fletcher with MIT License
def data_for_grouping(fletcher_type, fletcher_array):
    """Fixture with data for factorization, grouping, and unique tests.

    Expected to be like [B, B, NA, NA, A, A, B, C]

    Where A < B < C and NA is missing
    """
    return fletcher_array(fletcher_type.data_for_grouping, dtype=fletcher_type.dtype) 
Example #17
Source File: helpers.py    From pudl with MIT License
def fix_int_na(df, columns, float_na=np.nan, int_na=-1, str_na=''):
    """Convert NA containing integer columns from float to string.

    Numpy doesn't have a real NA value for integers. When pandas stores
    integer data that has NA values, it therefore upcasts the integers to
    floating point, using np.nan for NA. However, in order to dump some of our
    dataframes to CSV files for use in data packages, we need to write out
    integer formatted numbers, with empty strings as the NA value. This
    function replaces np.nan values with a sentinel value, converts the column
    to integers, and then to strings, finally replacing the sentinel value with
    the desired NA string.

    This is an interim solution -- now that pandas extension arrays have been
    implemented, we need to go back through and convert all of these integer
    columns that contain NA values to Nullable Integer types like Int64.

    Args:
        df (pandas.DataFrame): The dataframe to be fixed. This argument allows
            method chaining with the pipe() method.
        columns (iterable of strings): A list of DataFrame column labels
            indicating which columns need to be reformatted for output.
        float_na (float): The floating point value to be interpreted as NA and
            replaced in col.
        int_na (int): Sentinel value to substitute for float_na prior to
            conversion of the column to integers.
        str_na (str): String value to substitute for int_na after the column
            has been converted to strings.

    Returns:
        df (pandas.DataFrame): a new DataFrame, with the selected columns
        converted to strings that look like integers, compatible with
        the postgresql COPY FROM command.

    """
    return (
        df.replace({c: float_na for c in columns}, int_na)
          .astype({c: int for c in columns})
          .astype({c: str for c in columns})
          .replace({c: str(int_na) for c in columns}, str_na)
    ) 
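
A toy run of the replace/astype chain (the column values are illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({"plant_id": [42.0, np.nan, 7.0]})
print(fix_int_na(df, columns=["plant_id"])["plant_id"].tolist())
# ['42', '', '7']
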
Example #18
Source File: ferc714.py    From pudl with MIT License
def respondent_id(tfr_dfs):
    """
    Transform the FERC 714 respondent IDs, names, and EIA utility IDs.

    This consists primarily of dropping test respondents and manually
    assigning EIA utility IDs to a few FERC Form 714 respondents that report
    planning area demand, but which don't have their corresponding EIA utility
    IDs provided by FERC for some reason (including PacifiCorp).

    Args:
        tfr_dfs (dict): A dictionary of (partially) transformed dataframes,
            to be cleaned up.

    Returns:
        dict: The input dictionary of dataframes, but with a finished
        respondent_id_ferc714 dataframe.

    """
    df = (
        tfr_dfs["respondent_id_ferc714"].assign(
            utility_name_ferc714=lambda x: x.utility_name_ferc714.str.strip(),
            utility_id_eia=lambda x: x.utility_id_eia.replace(
                to_replace=0, value=pd.NA)
        )
        # This excludes fake test respondent IDs -- not real planning areas
        .query("utility_id_ferc714 not in @BAD_RESPONDENTS")
    )
    # There are a few utilities that seem mappable but are missing EIA IDs:
    for rid in MISSING_UTILITY_ID_EIA:
        df.loc[df.utility_id_ferc714 == rid,
               "utility_id_eia"] = MISSING_UTILITY_ID_EIA[rid]
    tfr_dfs["respondent_id_ferc714"] = df
    return tfr_dfs 
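
One subtlety in the assign above: replacing an integer sentinel with pd.NA changes the column's dtype, since a plain int64 column cannot hold NA. A quick check (behavior as of pandas 1.x; details may vary by version):

import pandas as pd

s = pd.Series([0, 111], dtype="int64")
out = s.replace(to_replace=0, value=pd.NA)
print(out.dtype)        # object -- int64 cannot hold pd.NA
print(out[0] is pd.NA)  # True
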
Example #19
Source File: epacems.py    From pudl with MIT License
def _load_plant_utc_offset(datapkg_dir):
    """Load the UTC offset each EIA plant.

    CEMS times don't change for DST, so we get get the UTC offset by using the
    offset for the plants' timezones in January.

    Args:
        datapkg_dir (path-like) : Path to the directory of the datapackage
            which is currently being assembled.

    Returns:
        pandas.DataFrame: With columns plant_id_eia and utc_offset

    """
    import pytz

    jan1 = datetime.datetime(2011, 1, 1)  # year doesn't matter
    timezones = (
        pd.read_csv(
            pathlib.Path(datapkg_dir, 'data/plants_entity_eia.csv'),
            usecols=["plant_id_eia", "timezone"],
            dtype={"plant_id_eia": "Int64", "timezone": pd.StringDtype()})
        .replace(to_replace="None", value=pd.NA)
        .dropna()
    )

    timezones["utc_offset"] = (
        timezones["timezone"]
        .apply(lambda tz: pytz.timezone(tz).localize(jan1).utcoffset())
    )
    del timezones["timezone"]
    return timezones 
Example #20
Source File: test_pandas_cursor.py    From PyAthena with MIT License
def test_integer_na_values(self, cursor):
        df = cursor.execute(
            """
            SELECT * FROM integer_na_values
            """
        ).as_pandas()
        rows = [tuple([row["a"], row["b"]]) for _, row in df.iterrows()]
        version = float(re.search(r"^([\d]+\.[\d]+)\..+", pd.__version__).group(1))
        if version >= 1.0:
            self.assertEqual(rows, [(1, 2), (1, pd.NA), (pd.NA, pd.NA)])
        else:
            self.assertEqual(rows, [(1, 2), (1, np.nan), (np.nan, np.nan)]) 
Example #21
Source File: base.py    From fletcher with MIT License
def fillna(self, value=None, method=None, limit=None):
        """Fill NA/NaN values using the specified method.

        Parameters
        ----------
        value : scalar, array-like
            If a scalar value is passed it is used to fill all missing values.
            Alternatively, an array-like 'value' can be given. It's expected
            that the array-like have the same length as 'self'.
        method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
            Method to use for filling holes in reindexed Series
            pad / ffill: propagate last valid observation forward to next valid
            backfill / bfill: use NEXT valid observation to fill gap
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill. In other words, if there is
            a gap with more than this number of consecutive NaNs, it will only
            be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled.

        Returns
        -------
        filled : ExtensionArray with NA/NaN filled
        """
        from pandas.api.types import is_array_like
        from pandas.util._validators import validate_fillna_kwargs
        from pandas.core.missing import pad_1d, backfill_1d

        value, method = validate_fillna_kwargs(value, method)

        mask = self.isna()

        if is_array_like(value):
            if len(value) != len(self):
                raise ValueError(
                    "Length of 'value' does not match. Got ({}) "
                    " expected {}".format(len(value), len(self))
                )
            value = value[mask]

        if mask.any():
            if method is not None:
                func = pad_1d if method == "pad" else backfill_1d
                new_values = func(self.astype(object), limit=limit, mask=mask)
                new_values = self._from_sequence(new_values, self._dtype.arrow_dtype)
            else:
                # fill with value
                new_values = self.copy()
                new_values[mask] = value
        else:
            new_values = self.copy()
        return new_values 
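
pandas' own nullable arrays implement the same contract; a minimal illustration of value- versus method-based filling:

import pandas as pd

arr = pd.array([1, None, 3], dtype="Int64")
print(arr.fillna(0))  # [1, 0, 3]

s = pd.Series(arr)
print(s.fillna(method="ffill").tolist())  # [1, 1, 3]
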
Example #22
Source File: base.py    From fletcher with MIT License
def take(
        self,
        indices: Union[Sequence[int], np.ndarray],
        allow_fill: bool = False,
        fill_value: Optional[Any] = None,
    ) -> ExtensionArray:
        """
        Take elements from an array.

        Parameters
        ----------
        indices : sequence of integers
            Indices to be taken.
        allow_fill : bool, default False
            How to handle negative values in `indices`.
            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.
            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any other
              negative values raise a ``ValueError``.
        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.
            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        ExtensionArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        Notes
        -----
        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.

        See Also
        --------
        numpy.take
        pandas.api.extensions.take
        """
        return self._take_array(self.data, indices, allow_fill, fill_value) 
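
pandas exposes the same take semantics as a top-level helper; a small sketch of what allow_fill does:

import numpy as np
from pandas.api.extensions import take

data = np.array([10.0, 20.0, 30.0])
print(take(data, [0, 2], allow_fill=False))                     # [10. 30.]
print(take(data, [0, -1], allow_fill=True, fill_value=np.nan))  # [10. nan]
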
Example #23
Source File: base.py    From fletcher with MIT License
def factorize(self, na_sentinel=-1):
        # type: (int) -> Tuple[np.ndarray, ExtensionArray]
        """Encode the extension array as an enumerated type.

        Parameters
        ----------
        na_sentinel : int, default -1
            Value to use in the `labels` array to indicate missing values.

        Returns
        -------
        labels : ndarray
            An integer NumPy array that's an indexer into the original
            ExtensionArray.
        uniques : ExtensionArray
            An ExtensionArray containing the unique values of `self`.
            .. note::
               uniques will *not* contain an entry for the NA value of
               the ExtensionArray if there are any missing values present
               in `self`.

        See Also
        --------
        pandas.factorize : Top-level factorize method that dispatches here.

        Notes
        -----
        :meth:`pandas.factorize` offers a `sort` keyword as well.
        """
        if pa.types.is_dictionary(self.data.type):
            raise NotImplementedError()
        elif self.data.num_chunks == 1:
            # Dictionary-encode, then split into integer indices and unique values
            encoded = self.data.chunk(0).dictionary_encode()
            indices = encoded.indices.to_pandas()
            if indices.dtype.kind == "f":
                indices[np.isnan(indices)] = na_sentinel
                indices = indices.astype(int)
            if not is_int64_dtype(indices):
                indices = indices.astype(np.int64)
            return indices.values, type(self)(encoded.dictionary)
        else:
            np_array = self.data.to_pandas().values
            return pd.factorize(np_array, na_sentinel=na_sentinel) 
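
The fallback path defers to pandas.factorize, which already encodes missing values as na_sentinel (-1 by default); for example:

import numpy as np
import pandas as pd

codes, uniques = pd.factorize(np.array(["b", None, "a", "b"], dtype=object))
print(codes)    # [ 0 -1  1  0] -- None encoded as the sentinel -1
print(uniques)  # ['b' 'a']
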
Example #24
Source File: helpers.py    From pudl with MIT License
def find_timezone(*, lng=None, lat=None, state=None, strict=True):
    """Find the timezone associated with the a specified input location.

    Note that this function requires named arguments. The names are lng, lat,
    and state.  lng and lat must be provided, but they may be NA. state isn't
    required, and isn't used unless lng/lat are NA or timezonefinder can't find
    a corresponding timezone.

    Timezones based on states are imprecise, so it's far better to use lng/lat
    if possible. If `strict` is True, state will not be used.
    More on state-to-timezone conversion here:
    https://en.wikipedia.org/wiki/List_of_time_offsets_by_U.S._state_and_territory

    Args:
        lng (int or float in [-180,180]): Longitude, in decimal degrees
        lat (int or float in [-90, 90]): Latitude, in decimal degrees
        state (str): Abbreviation for US state or Canadian province
        strict (bool): Raise an error if no timezone is found?

    Returns:
        str: The timezone (as an IANA string) for that location.

    Todo:
        Update docstring.

    """
    try:
        tz = tz_finder.timezone_at(lng=lng, lat=lat)
        if tz is None:  # Try harder
            # Could change the search radius as well
            tz = tz_finder.closest_timezone_at(lng=lng, lat=lat)
    # For some reason w/ Python 3.6 we get a ValueError here, but with
    # Python 3.7 we get an OverflowError...
    except (OverflowError, ValueError):
        # If we're being strict, only use lng/lat, not state
        if strict:
            raise ValueError(
                f"Can't find timezone for: lng={lng}, lat={lat}, state={state}"
            )
        # If, e.g., the coordinates are missing, try looking in the
        # state_tz_approx dictionary.
        try:
            tz = pudl.constants.state_tz_approx[state]
        except KeyError:
            tz = None
    return tz 
Example #25
Source File: eia923.py    From pudl with MIT License
def boiler_fuel(eia923_dfs, eia923_transformed_dfs):
    """Transforms the boiler_fuel_eia923 table.

    Args:
        eia923_dfs (dict): Each entry in this
            dictionary of DataFrame objects corresponds to a page from the
            EIA923 form, as reported in the Excel spreadsheets they distribute.
        eia923_transformed_dfs (dict): A dictionary of DataFrame objects in
            which pages from EIA923 form (keys) correspond to normalized
            DataFrames of values from that page (values)

    Returns:
        dict: eia923_transformed_dfs, a dictionary of DataFrame objects in
        which pages from EIA923 form (keys) correspond to normalized
        DataFrames of values from that page (values).

    """
    bf_df = eia923_dfs['boiler_fuel'].copy()

    # Drop fields we're not inserting into the boiler_fuel_eia923 table.
    cols_to_drop = ['combined_heat_power',
                    'plant_name_eia',
                    'operator_name',
                    'operator_id',
                    'plant_state',
                    'census_region',
                    'nerc_region',
                    'naics_code',
                    'eia_sector',
                    'sector_name',
                    'fuel_unit',
                    'total_fuel_consumption_quantity']
    bf_df.drop(cols_to_drop, axis=1, inplace=True)

    bf_df.dropna(subset=['boiler_id', 'plant_id_eia'], inplace=True)

    # Convert the EIA923 DataFrame from yearly to monthly records.
    bf_df = _yearly_to_monthly_records(
        bf_df, pc.month_dict_eia923)
    bf_df['fuel_type_code_pudl'] = pudl.helpers.cleanstrings_series(
        bf_df.fuel_type_code,
        pc.fuel_type_eia923_boiler_fuel_simple_map)
    # Replace the EIA923 NA value ('.') with a real NA value.
    bf_df = pudl.helpers.fix_eia_na(bf_df)

    # Convert Year/Month columns into a single Date column...
    bf_df = pudl.helpers.convert_to_date(bf_df)

    eia923_transformed_dfs['boiler_fuel_eia923'] = bf_df

    return eia923_transformed_dfs 
Example #26
Source File: eia923.py    From pudl with MIT License
def generation_fuel(eia923_dfs, eia923_transformed_dfs):
    """Transforms the generation_fuel_eia923 table.

    Args:
        eia923_dfs (dict): Each entry in this
            dictionary of DataFrame objects corresponds to a page from the
            EIA923 form, as reported in the Excel spreadsheets they distribute.
        eia923_transformed_dfs (dict): A dictionary of DataFrame objects in
            which pages from EIA923 form (keys) correspond to normalized
            DataFrames of values from that page (values)

    Returns:
        dict: eia923_transformed_dfs, a dictionary of DataFrame objects in
        which pages from EIA923 form (keys) correspond to normalized
        DataFrames of values from that page (values).

    """
    # This needs to be a copy of what we're passed in so we can edit it.
    gf_df = eia923_dfs['generation_fuel'].copy()

    # Drop fields we're not inserting into the generation_fuel_eia923 table.
    cols_to_drop = ['combined_heat_power',
                    'plant_name_eia',
                    'operator_name',
                    'operator_id',
                    'plant_state',
                    'census_region',
                    'nerc_region',
                    'naics_code',
                    'eia_sector',
                    'sector_name',
                    'fuel_unit',
                    'total_fuel_consumption_quantity',
                    'electric_fuel_consumption_quantity',
                    'total_fuel_consumption_mmbtu',
                    'elec_fuel_consumption_mmbtu',
                    'net_generation_megawatthours']
    gf_df.drop(cols_to_drop, axis=1, inplace=True)

    # Convert the EIA923 DataFrame from yearly to monthly records.
    gf_df = _yearly_to_monthly_records(gf_df, pc.month_dict_eia923)
    # Replace the EIA923 NA value ('.') with a real NA value.
    gf_df = pudl.helpers.fix_eia_na(gf_df)
    # Remove "State fuel-level increment" records... which don't pertain to
    # any particular plant (they have plant_id_eia == operator_id == 99999)
    gf_df = gf_df[gf_df.plant_id_eia != 99999]

    gf_df['fuel_type_code_pudl'] = pudl.helpers.cleanstrings_series(gf_df.fuel_type,
                                                                    pc.fuel_type_eia923_gen_fuel_simple_map)

    # Convert Year/Month columns into a single Date column...
    gf_df = pudl.helpers.convert_to_date(gf_df)

    eia923_transformed_dfs['generation_fuel_eia923'] = gf_df

    return eia923_transformed_dfs 
Example #27
Source File: series.py    From modin with Apache License 2.0
def value_counts(
        self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
    ):
        """
        Return a Series containing counts of unique values.

        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values by default.

        Parameters
        ----------
        normalize : bool, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        bins : int, optional
            Rather than count values, group them into half-open bins,
            a convenience for ``pd.cut``, only works with numeric data.
        dropna : bool, default True
            Don't include counts of NaN.

        Returns
        -------
        Series

        Notes
        -----
        The indices of the resulting object will be in descending
        (or ascending, if ascending=True) order for equal values.
        This differs slightly from pandas, where the indices of equal
        values appear in arbitrary order.
        """
        return self.__constructor__(
            query_compiler=self._query_compiler.value_counts(
                normalize=normalize,
                sort=sort,
                ascending=ascending,
                bins=bins,
                dropna=dropna,
            )
        ) 
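
With pandas' nullable dtypes, dropna=False surfaces pd.NA as its own row in the counts; for example:

import pandas as pd

s = pd.Series([1, 1, None], dtype="Int64")
print(s.value_counts())             # 1 -> 2 (NA excluded by default)
print(s.value_counts(dropna=False)) # adds a <NA> -> 1 row
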
Example #28
Source File: ferc714.py    From pudl with MIT License
def _standardize_offset_codes(df, offset_fixes):
    """
    Convert to standardized UTC offset abbreviations.

    This function ensures that all of the 3-4 letter abbreviations used to
    indicate a timestamp's localized offset from UTC are standardized, so that
    they can be used to make the timestamps timezone aware. The standard
    abbreviations we're using are:

    "HST": Hawaii Standard Time
    "AKST": Alaska Standard Time
    "AKDT": Alaska Daylight Time
    "PST": Pacific Standard Time
    "PDT": Pacific Daylight Time
    "MST": Mountain Standard Time
    "MDT": Mountain Daylight Time
    "CST": Central Standard Time
    "CDT": Central Daylight Time
    "EST": Eastern Standard Time
    "EDT": Eastern Daylight Time

    In some cases different respondents use the same non-standard abbreviations
    to indicate different offsets, and so the fixes are applied on a
    per-respondent basis, as defined by offset_fixes.

    UTC offset codes which are originally NA or the empty string are replaced
    with a temporary sentinel value, the string "XXX".

    Args:
        df (pandas.DataFrame): A DataFrame containing a utc_offset_code column
            that needs to be standardized.
        offset_fixes (dict): A dictionary with utility_id_ferc714 values as the
            keys, and a dictionary mapping non-standard UTC offset codes to
            the standardized UTC offset codes as the value.

    Returns:
        pandas.DataFrame: The same as the input DataFrame, but with only
        standardized UTC offset codes in the ``utc_offset_code`` column.

    """
    logger.info("Standardizing UTC offset codes.")
    df = df.copy()
    # Replace NaN and empty string values with a temporary placeholder "XXX"
    df["utc_offset_code"] = (
        df.utc_offset_code.replace(to_replace={np.nan: "XXX", "": "XXX"})
    )
    # Apply specific fixes on a per-respondent basis:
    for rid in offset_fixes:
        for orig_tz in offset_fixes[rid]:
            df.loc[(
                (df.utility_id_ferc714 == rid) &
                (df["utc_offset_code"] == orig_tz)), "utc_offset_code"] = offset_fixes[rid][orig_tz]

    return df
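
A toy invocation showing the per-respondent remapping (the codes and IDs are made up; assumes the module-level logger is configured):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "utility_id_ferc714": [1, 1, 2],
    "utc_offset_code": ["PPT", np.nan, "PPT"],
})
# Respondent 1 used "PPT" to mean PST; respondent 2 used it to mean PDT.
fixes = {1: {"PPT": "PST"}, 2: {"PPT": "PDT"}}
print(_standardize_offset_codes(df, fixes).utc_offset_code.tolist())
# ['PST', 'XXX', 'PDT']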