Python pandas.NA Examples
The following are 29 code examples that use pandas.NA (a singleton missing-value marker, not a callable).
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pandas, or try the search function.
Example #1
Source File: pandas_transformer.py From kgx with BSD 3-Clause "New" or "Revised" License | 6 votes |
def is_null(item: Any) -> bool:
    """
    Check whether a given item is null or corresponds to null.

    The following are treated as null: ``None``, any NaN value (including
    ``numpy.nan``), ``pandas.NA``, ``pandas.NaT``, the empty string ``""``
    and the single-space string ``" "``.

    Parameters
    ----------
    item: Any
        The item to check

    Returns
    -------
    bool
        Whether the given item is null or not. Non-scalar containers
        (lists, dicts, arrays, ...) are never considered null.

    """
    if isinstance(item, str):
        # Only these two exact strings stand in for "no value".
        return item in ("", " ")
    # NaN never compares equal to itself, so a set-membership test would only
    # recognise the ``numpy.nan`` object itself and miss NaNs created
    # elsewhere. ``pandas.isna`` detects every NaN as well as None, NA and
    # NaT. The scalar guard keeps ``isna`` from returning an array.
    return bool(pd.api.types.is_scalar(item) and pd.isna(item))
Example #2
Source File: helpers.py From pudl with MIT License | 6 votes |
def add_fips_ids(df, state_col="state", county_col="county", vintage=2015):
    """
    Add State and County FIPS IDs to a dataframe.

    Args:
        df (pandas.DataFrame): The dataframe to annotate. It is modified in
            place (two columns are added) and also returned.
        state_col (str): Label of the column in ``df`` holding the state.
        county_col (str): Label of the column in ``df`` holding the county.
        vintage (int): Passed through to ``addfips.AddFIPS``.

    Returns:
        pandas.DataFrame: ``df`` with the additional columns
        ``state_id_fips`` and ``county_id_fips``.

    """
    af = addfips.AddFIPS(vintage=vintage)
    # Lookup the state and county FIPS IDs and add them to the dataframe.
    # Rows are indexed with state_col / county_col (rather than fixed
    # attribute names) so that non-default column labels are honoured.
    df["state_id_fips"] = df.apply(
        lambda x: af.get_state_fips(state=x[state_col]), axis=1)

    logger.info(
        f"Assigned state FIPS codes for "
        f"{len(df[df.state_id_fips.notnull()])/len(df):.2%} of records."
    )
    df["county_id_fips"] = df.apply(
        lambda x: af.get_county_fips(state=x[state_col], county=x[county_col]),
        axis=1)
    # Normalise whatever "missing" marker the lookup produced to pd.NA.
    df["county_id_fips"] = df.county_id_fips.fillna(pd.NA)
    logger.info(
        f"Assigned county FIPS codes for "
        f"{len(df[df.county_id_fips.notnull()])/len(df):.2%} of records."
    )
    return df
Example #3
Source File: helpers.py From pudl with MIT License | 6 votes |
def fix_eia_na(df):
    """
    Replace common ill-posed EIA NA spreadsheet values with np.nan.

    A cell is treated as NA when its entire content is a single period, a
    single whitespace character, or the empty string.

    Args:
        df (pandas.DataFrame): The DataFrame to clean.

    Returns:
        pandas.DataFrame: A cleaned copy; the input is left untouched.

    """
    # Each pattern is anchored so only whole-cell matches are replaced.
    na_patterns = [r'^\.$', r'^\s$', r'^$']
    cleaned = df.replace(to_replace=na_patterns, value=np.nan, regex=True)
    return cleaned
Example #4
Source File: base.py From fletcher with MIT License | 6 votes |
def __or__(self, other):
    """
    Compute vectorised or.

    ``other`` may be a null scalar, a Python ``bool``, another fletcher array
    or any operand accepted by ``or_vectorised``. Raises
    ``NotImplementedError`` when this array is not of boolean type.
    """
    if not pa.types.is_boolean(self.dtype.arrow_dtype):
        raise NotImplementedError("__or__ is only supported for boolean arrays yet")

    cls = type(self)
    other_is_null = other is pd.NA or (
        pd.api.types.is_scalar(other) and pd.isna(other)
    )
    if other_is_null:
        # All fields that are True stay True, all others get set to NA
        return cls(or_na(self.data))

    if isinstance(other, bool):
        # or with True yields all-True; or with False changes nothing
        return cls(all_true(self.data)) if other else self

    # Unwrap fletcher arrays so the kernel receives the underlying data.
    operand = other.data if isinstance(other, FletcherBaseArray) else other
    return cls(or_vectorised(self.data, operand))
Example #5
Source File: test_boolean.py From fletcher with MIT License | 5 votes |
def test_or(fletcher_array):
    """Check ``|`` against null/bool scalars and against other arrays."""

    def check(left, right, want):
        # Compute the result first, then build the expected array.
        result = left | right
        pdt.assert_extension_array_equal(result, fletcher_array(want))

    # Scalar versions
    # non-null versions
    check(fletcher_array([True, False]), pd.NA, [True, None])
    check(fletcher_array([True, False, None]), pd.NA, [True, None, None])
    check(fletcher_array([True, False, None]), True, [True, True, True])
    check(fletcher_array([True, False, None]), False, [True, False, None])

    # Array version
    # Non-null version
    check(
        fletcher_array([True, False, False]),
        fletcher_array([False, True, False]),
        [True, True, False],
    )

    # The remaining three cases all share one expected outcome.
    with_nulls = [True, True, None, True]

    # One has nulls, the other not
    check(
        fletcher_array([True, False, None, None]),
        fletcher_array([False, True, False, True]),
        with_nulls,
    )

    # Both have nulls
    check(
        fletcher_array([True, False, None, None]),
        fletcher_array([None, True, False, True]),
        with_nulls,
    )

    # Right-hand side is a plain numpy array
    check(
        fletcher_array([True, False, None, None]),
        np.array([False, True, False, True]),
        with_nulls,
    )
Example #6
Source File: test_pandas_conversions.py From rpy2 with GNU General Public License v2.0 | 5 votes |
def test_timeR2Pandas(self):
    """Convert an R POSIXct vector to pandas, without and with an NA entry."""
    tzone = robjects.vectors.get_timezone()
    naive = (datetime(1960, 5, 2),
             datetime(1970, 6, 3),
             datetime(2012, 7, 1))
    aware = [moment.replace(tzinfo=tzone) for moment in naive]  # fix the time
    epoch_seconds = [moment.timestamp() for moment in aware]

    # Create an R POSIXct vector.
    r_time = robjects.baseenv['as.POSIXct'](
        rinterface.FloatSexpVector(epoch_seconds),
        origin=rinterface.StrSexpVector(('1970-01-01',))
    )

    def to_pandas(r_vector):
        # Convert R POSIXct vector to pandas-compatible vector
        with localconverter(default_converter + rpyp.converter):
            return robjects.conversion.rpy2py(r_vector)

    # Check that the round trip did not introduce changes
    py_time = to_pandas(r_time)
    for expected, obtained in zip(aware, py_time):
        assert expected == obtained.to_pydatetime()

    # Try with NA.
    r_time[1] = rinterface.na_values.NA_Real
    py_time = to_pandas(r_time)
    assert py_time[1] is pandas.NaT
Example #7
Source File: pandas2ri.py From rpy2 with GNU General Public License v2.0 | 5 votes |
def _int_populate_r_vector(iterable, r_vector, set_elt, cast_value):
    """
    Copy the items of ``iterable`` into ``r_vector``.

    Items that are ``None`` or ``pandas.NA`` are replaced by ``math.nan``
    before being passed through ``cast_value``; each cast result is stored
    with ``set_elt(r_vector, index, value)``.
    """
    for index, item in enumerate(iterable):
        is_missing = item is None or item is pandas.NA
        value = math.nan if is_missing else item
        set_elt(r_vector, index, cast_value(value))
Example #8
Source File: pandas2ri.py From rpy2 with GNU General Public License v2.0 | 5 votes |
def _str_populate_r_vector(iterable, r_vector, set_elt, cast_value):
    """
    Copy the items of ``iterable`` into ``r_vector``.

    Items that are ``None``, ``pandas.NA`` or a float NaN are replaced by
    ``na_values.NA_Character`` before being passed through ``cast_value``;
    each cast result is stored with ``set_elt(r_vector, index, value)``.
    """
    for position, entry in enumerate(iterable):
        if entry is None or entry is pandas.NA:
            missing = True
        else:
            # NaN is only meaningful for floats; other types are kept.
            missing = isinstance(entry, float) and math.isnan(entry)
        if missing:
            entry = na_values.NA_Character
        set_elt(r_vector, position, cast_value(entry))
Example #9
Source File: series.py From modin with Apache License 2.0 | 5 votes |
def update(self, other):
    """
    Modify Series in place using non-NA values from passed Series.

    Aligns on index.

    Parameters
    ----------
    other : Series, or object coercible into Series
        Anything that is not already a ``Series`` is wrapped in one first.
    """
    series = other if isinstance(other, Series) else Series(other)
    updated_compiler = self._query_compiler.series_update(series._query_compiler)
    self._update_inplace(new_query_compiler=updated_compiler)
Example #10
Source File: series.py From modin with Apache License 2.0 | 5 votes |
def argmin(self, axis=None, skipna=True, *args, **kwargs):
    """
    Return the result of ``idxmin``, or -1 when that result is missing.

    Parameters
    ----------
    axis, skipna, *args, **kwargs
        Forwarded unchanged to ``self.idxmin``.

    Returns
    -------
    Whatever ``self.idxmin`` returned, except that a missing scalar
    (NaN, None, ``pandas.NA``, ``pandas.NaT``) is mapped to ``-1``.
    """
    result = self.idxmin(axis=axis, skipna=skipna, *args, **kwargs)
    # ``np.isnan`` cannot be used here: it raises for non-numeric results
    # (e.g. string labels) and returns ``pandas.NA`` for NA, whose truth value
    # is ambiguous. ``pandas.isna`` handles every scalar; the scalar guard
    # keeps it from returning an array for container-like results.
    if pandas.api.types.is_scalar(result) and pandas.isna(result):
        result = -1
    return result
Example #11
Source File: series.py From modin with Apache License 2.0 | 5 votes |
def argmax(self, axis=None, skipna=True, *args, **kwargs):
    """
    Return the result of ``idxmax``, or -1 when that result is missing.

    Parameters
    ----------
    axis, skipna, *args, **kwargs
        Forwarded unchanged to ``self.idxmax``.

    Returns
    -------
    Whatever ``self.idxmax`` returned, except that a missing scalar
    (NaN, None, ``pandas.NA``, ``pandas.NaT``) is mapped to ``-1``.
    """
    result = self.idxmax(axis=axis, skipna=skipna, *args, **kwargs)
    # ``np.isnan`` cannot be used here: it raises for non-numeric results
    # (e.g. string labels) and returns ``pandas.NA`` for NA, whose truth value
    # is ambiguous. ``pandas.isna`` handles every scalar; the scalar guard
    # keeps it from returning an array for container-like results.
    if pandas.api.types.is_scalar(result) and pandas.isna(result):
        result = -1
    return result
Example #12
Source File: util.py From holoviews with BSD 3-Clause "New" or "Revised" License | 5 votes |
def isfinite(val):
    """
    Helper function to determine if scalar or array value is finite
    extending np.isfinite with support for None, string, datetime types.

    Returns ``False`` for ``None``, ``True`` for string/bytes scalars, and
    otherwise a boolean (scalar input) or boolean array (array input)
    computed from ``np.isfinite`` / ``isnat`` / ``pd.isna`` as described by
    the inline comments below. Dask arrays are delegated to
    ``dask.array.isfinite``.
    """
    is_dask = is_dask_array(val)
    if not np.isscalar(val) and not is_dask:
        # Coerce non-scalar, non-dask input so the dtype dispatch below can
        # be used.
        # NOTE(review): None also takes this path (np.isscalar(None) is
        # False); the ``val is None`` check below presumably relies on
        # asarray(..., strict=False) handing it back unchanged -- confirm.
        val = asarray(val, strict=False)

    if val is None:
        return False
    elif is_dask:
        # Imported lazily so dask is only required when a dask array is seen.
        import dask.array as da
        return da.isfinite(val)
    elif isinstance(val, np.ndarray):
        if val.dtype.kind == 'M':
            # datetime64 arrays: finite means "not NaT".
            return ~isnat(val)
        elif val.dtype.kind == 'O':
            # Object arrays: decide element by element via recursion.
            return np.array([isfinite(v) for v in val], dtype=bool)
        elif val.dtype.kind in 'US':
            # String arrays: only pandas-style missing values are non-finite;
            # when ``pd`` is falsy every entry counts as finite.
            return ~pd.isna(val) if pd else np.ones_like(val, dtype=bool)
        finite = np.isfinite(val)
        if pd and pandas_version >= '1.0.0':
            # Additionally mask out anything pandas considers missing.
            finite &= ~pd.isna(val)
        return finite
    elif isinstance(val, datetime_types+timedelta_types):
        return not isnat(val)
    elif isinstance(val, (basestring, bytes)):
        return True
    # Remaining scalars.
    finite = np.isfinite(val)
    if pd and pandas_version >= '1.0.0':
        # np.isfinite(pd.NA) yields pd.NA itself, which cannot be combined
        # with ``&`` into a plain boolean, so it is handled explicitly.
        if finite is pd.NA:
            return False
        return finite & (~pd.isna(val))
    return finite
Example #13
Source File: test_s3.py From aws-data-wrangler with Apache License 2.0 | 5 votes |
def test_to_parquet_file_dtype(path):
    """Write a frame with explicit dtypes to parquet and read it back."""
    source = pd.DataFrame(
        {"c0": [1.0, None, 2.0], "c1": [pd.NA, pd.NA, pd.NA]}
    )
    target = f"{path}0.parquet"
    requested_dtypes = {"c0": "bigint", "c1": "string"}

    wr.s3.to_parquet(source, target, dtype=requested_dtypes)
    wr.s3.wait_objects_exist(paths=[target])
    loaded = wr.s3.read_parquet(target)

    assert loaded.shape == source.shape
    assert loaded.c0.sum() == 3
    # The requested dtypes must come back as pandas nullable dtypes.
    assert str(loaded.c0.dtype) == "Int64"
    assert str(loaded.c1.dtype) == "string"
Example #14
Source File: test_boolean.py From fletcher with MIT License | 5 votes |
def test_np_any(fletcher_array):
    """Check ``np.any`` on boolean arrays with and without nulls."""
    # Arrays containing at least one True must evaluate truthy.
    for values in ([True, False, None], [True, False, True]):
        assert np.any(fletcher_array(values))

    # TODO(pandas-0.26): Uncomment this when BooleanArray landed.
    # Then we change the behaviour.
    # arr = fr.FletcherChunkedArray([False, False, None])
    # assert np.any(arr) is pd.NA

    all_false = fletcher_array([False, False, False])
    assert not np.any(all_false)
Example #15
Source File: test_pandas_extension.py From fletcher with MIT License | 5 votes |
def data_missing_for_sorting(fletcher_type, fletcher_array):
    """Length-3 array with a known sort order.

    This should be three items [B, NA, A] with A < B and NA missing.
    The values and dtype are taken from ``fletcher_type``.
    """
    values = fletcher_type.data_missing_for_sorting
    return fletcher_array(values, dtype=fletcher_type.dtype)
Example #16
Source File: test_pandas_extension.py From fletcher with MIT License | 5 votes |
def data_for_grouping(fletcher_type, fletcher_array):
    """Fixture with data for factorization, grouping, and unique tests.

    Expected to be like [B, B, NA, NA, A, A, B, C]

    Where A < B < C and NA is missing. The values and dtype are taken from
    ``fletcher_type``.
    """
    values = fletcher_type.data_for_grouping
    return fletcher_array(values, dtype=fletcher_type.dtype)
Example #17
Source File: helpers.py From pudl with MIT License | 5 votes |
def fix_int_na(df, columns, float_na=np.nan, int_na=-1, str_na=''):
    """Convert NA containing integer columns from float to string.

    Numpy has no NA value for integers, so pandas stores integer data with
    missing entries as floats, using np.nan as the NA marker. To write such
    columns out as integer-formatted text with a chosen NA string, this
    function swaps the float NA for an integer sentinel, casts the columns to
    integers and then to strings, and finally swaps the stringified sentinel
    for the desired NA string.

    This is an interim solution -- these columns should eventually use a
    Nullable Integer type like Int64 instead.

    Args:
        df (pandas.DataFrame): The dataframe to be fixed. This argument allows
            method chaining with the pipe() method.
        columns (iterable of strings): Labels of the columns to reformat.
        float_na (float): The floating point value to be interpreted as NA
            and replaced in the selected columns.
        int_na (int): Sentinel value substituted for float_na before the
            columns are converted to integers.
        str_na (str): String substituted for the sentinel after the columns
            have been converted to strings.

    Returns:
        pandas.DataFrame: a new DataFrame, with the selected columns
        converted to strings that look like integers.

    """
    # Step 1: float NA -> integer sentinel, so the integer cast cannot fail.
    with_sentinel = df.replace(dict.fromkeys(columns, float_na), int_na)
    # Step 2: float -> int -> str, giving "12" rather than "12.0".
    as_integers = with_sentinel.astype(dict.fromkeys(columns, int))
    as_strings = as_integers.astype(dict.fromkeys(columns, str))
    # Step 3: stringified sentinel -> requested NA string.
    return as_strings.replace(dict.fromkeys(columns, str(int_na)), str_na)
Example #18
Source File: ferc714.py From pudl with MIT License | 5 votes |
def respondent_id(tfr_dfs):
    """
    Transform the FERC 714 respondent IDs, names, and EIA utility IDs.

    Strips whitespace from the utility names, turns EIA utility IDs of 0 into
    NA, drops the respondents listed in ``BAD_RESPONDENTS`` and fills in the
    EIA utility IDs given by ``MISSING_UTILITY_ID_EIA`` for respondents whose
    IDs are not otherwise available.

    Args:
        tfr_dfs (dict): A dictionary of (partially) transformed dataframes,
            to be cleaned up.

    Returns:
        dict: The input dictionary of dataframes, but with a finished
        respondent_id_ferc714 dataframe.

    """
    raw = tfr_dfs["respondent_id_ferc714"]
    tidied = raw.assign(
        utility_name_ferc714=lambda x: x.utility_name_ferc714.str.strip(),
        utility_id_eia=lambda x: x.utility_id_eia.replace(
            to_replace=0, value=pd.NA),
    )
    # This excludes fake Test IDs -- not real planning areas
    df = tidied.query("utility_id_ferc714 not in @BAD_RESPONDENTS")

    # There are a few utilities that seem mappable, but missing:
    for rid in MISSING_UTILITY_ID_EIA:
        is_respondent = df.utility_id_ferc714 == rid
        df.loc[is_respondent, "utility_id_eia"] = MISSING_UTILITY_ID_EIA[rid]

    tfr_dfs["respondent_id_ferc714"] = df
    return tfr_dfs
Example #19
Source File: epacems.py From pudl with MIT License | 5 votes |
def _load_plant_utc_offset(datapkg_dir):
    """Load the UTC offset of each EIA plant.

    CEMS times don't change for DST, so the UTC offset is taken from the
    offset of each plant's timezone in January.

    Args:
        datapkg_dir (path-like) : Path to the directory of the datapackage
            which is currently being assembled.

    Returns:
        pandas.DataFrame: With columns plant_id_eia and utc_offset

    """
    import pytz

    jan1 = datetime.datetime(2011, 1, 1)  # year doesn't matter
    csv_path = pathlib.Path(datapkg_dir, 'data/plants_entity_eia.csv')
    plants = pd.read_csv(
        csv_path,
        usecols=["plant_id_eia", "timezone"],
        dtype={"plant_id_eia": "Int64", "timezone": pd.StringDtype()},
    )
    # The literal string "None" marks a missing value in the CSV; rows
    # with any missing value are dropped.
    timezones = plants.replace(to_replace="None", value=pd.NA).dropna()

    def january_offset(tz):
        return pytz.timezone(tz).localize(jan1).utcoffset()

    timezones["utc_offset"] = timezones["timezone"].apply(january_offset)
    return timezones.drop(columns=["timezone"])
Example #20
Source File: test_pandas_cursor.py From PyAthena with MIT License | 5 votes |
def test_integer_na_values(self, cursor):
    """Integer columns containing nulls come back with the right NA marker."""
    df = cursor.execute(""" SELECT * FROM integer_na_values """).as_pandas()
    rows = [(row["a"], row["b"]) for _, row in df.iterrows()]

    # The expected missing-value marker depends on the "major.minor" part
    # of the installed pandas version.
    major_minor = re.search(r"^([\d]+\.[\d]+)\..+", pd.__version__).group(1)
    missing = pd.NA if float(major_minor) >= 1.0 else np.nan

    self.assertEqual(rows, [(1, 2), (1, missing), (missing, missing)])
Example #21
Source File: base.py From fletcher with MIT License | 4 votes |
def fillna(self, value=None, method=None, limit=None):
    """Fill NA/NaN values using the specified method.

    Parameters
    ----------
    value : scalar, array-like
        A scalar is used to fill all missing values. An array-like must have
        the same length as 'self'; its entries at the missing positions are
        used.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes:
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap
    limit : int, default None
        Passed to the pad/backfill routine when ``method`` is given. It is
        not used when filling with ``value``.

    Returns
    -------
    filled : ExtensionArray with NA/NaN filled

    Raises
    ------
    ValueError
        If an array-like ``value`` does not have the same length as 'self'.
    """
    from pandas.api.types import is_array_like
    from pandas.util._validators import validate_fillna_kwargs
    from pandas.core.missing import pad_1d, backfill_1d

    value, method = validate_fillna_kwargs(value, method)
    mask = self.isna()

    if is_array_like(value):
        if len(value) != len(self):
            raise ValueError(
                "Length of 'value' does not match. Got ({}) "
                " expected {}".format(len(value), len(self))
            )
        # Keep only the replacements for the missing positions.
        value = value[mask]

    if not mask.any():
        # Nothing is missing: hand back an unmodified copy.
        return self.copy()

    if method is None:
        # fill with value
        filled = self.copy()
        filled[mask] = value
        return filled

    fill_func = pad_1d if method == "pad" else backfill_1d
    raw = fill_func(self.astype(object), limit=limit, mask=mask)
    return self._from_sequence(raw, self._dtype.arrow_dtype)
Example #22
Source File: base.py From fletcher with MIT License | 4 votes |
def take(
    self,
    indices: Union[Sequence[int], np.ndarray],
    allow_fill: bool = False,
    fill_value: Optional[Any] = None,
) -> ExtensionArray:
    """
    Take elements from an array.

    Parameters
    ----------
    indices : sequence of integers
        Indices to be taken.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to
          :func:`numpy.take`.
        * True: negative values in `indices` indicate missing values.
          These values are set to `fill_value`. Any other negative values
          raise a ``ValueError``.
    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type, ``self.dtype.na_value``, is used.

        `fill_value` should be the user-facing "boxed" scalar; translating
        it to a low-level physical NA value, if necessary, is left to the
        implementation.

    Returns
    -------
    ExtensionArray

    Raises
    ------
    IndexError
        When the indices are out of bounds for the array.
    ValueError
        When `indices` contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
    ``iloc``, when `indices` is a sequence of values. Additionally,
    it's called by :meth:`Series.reindex`, or any other method
    that causes realignment, with a `fill_value`.

    See Also
    --------
    numpy.take
    pandas.api.extensions.take
    """
    # All the work is delegated to the shared helper operating on the data.
    taken = self._take_array(self.data, indices, allow_fill, fill_value)
    return taken
Example #23
Source File: base.py From fletcher with MIT License | 4 votes |
def factorize(self, na_sentinel=-1):
    # type: (int) -> Tuple[np.ndarray, ExtensionArray]
    """Encode the extension array as an enumerated type.

    Parameters
    ----------
    na_sentinel : int, default -1
        Value to use in the `labels` array to indicate missing values.

    Returns
    -------
    labels : ndarray
        An integer NumPy array that's an indexer into the original
        ExtensionArray.
    uniques : ExtensionArray
        An ExtensionArray containing the unique values of `self`.

        .. note::

           uniques will *not* contain an entry for the NA value of
           the ExtensionArray if there are any missing values present
           in `self`.

    Raises
    ------
    NotImplementedError
        If the underlying data is already dictionary-typed.

    See Also
    --------
    pandas.factorize : Top-level factorize method that dispatches here.

    Notes
    -----
    :meth:`pandas.factorize` offers a `sort` keyword as well.
    """
    if pa.types.is_dictionary(self.data.type):
        raise NotImplementedError()

    if self.data.num_chunks != 1:
        # Multiple (or zero) chunks: defer to pandas on the converted values.
        np_array = self.data.to_pandas().values
        return pd.factorize(np_array, na_sentinel=na_sentinel)

    # Exactly one chunk: dictionary-encode it and use the resulting
    # indices / dictionary as labels / uniques.
    encoded = self.data.chunk(0).dictionary_encode()
    codes = encoded.indices.to_pandas()
    if codes.dtype.kind == "f":
        # Float codes carry NaN entries (presumably for nulls -- confirm);
        # replace them with the sentinel before casting back to integers.
        codes[np.isnan(codes)] = na_sentinel
        codes = codes.astype(int)
    if not is_int64_dtype(codes):
        codes = codes.astype(np.int64)
    return codes.values, type(self)(encoded.dictionary)
Example #24
Source File: base.py From fletcher with MIT License | 4 votes |
def fillna(self, value=None, method=None, limit=None):
    """Fill NA/NaN values using the specified method.

    Parameters
    ----------
    value : scalar, array-like
        A scalar is used to fill all missing values. An array-like must have
        the same length as 'self'; its entries at the missing positions are
        used.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes:
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap
    limit : int, default None
        Passed to the pad/backfill routine when ``method`` is given. It is
        not used when filling with ``value``.

    Returns
    -------
    filled : ExtensionArray with NA/NaN filled

    Raises
    ------
    ValueError
        If an array-like ``value`` does not have the same length as 'self'.
    """
    from pandas.api.types import is_array_like
    from pandas.util._validators import validate_fillna_kwargs
    from pandas.core.missing import pad_1d, backfill_1d

    value, method = validate_fillna_kwargs(value, method)
    mask = self.isna()

    if is_array_like(value):
        if len(value) != len(self):
            raise ValueError(
                "Length of 'value' does not match. Got ({}) "
                " expected {}".format(len(value), len(self))
            )
        # Keep only the replacements for the missing positions.
        value = value[mask]

    if not mask.any():
        # Nothing is missing: hand back an unmodified copy.
        return self.copy()

    if method is None:
        # fill with value
        filled = self.copy()
        filled[mask] = value
        return filled

    fill_func = pad_1d if method == "pad" else backfill_1d
    raw = fill_func(self.astype(object), limit=limit, mask=mask)
    return self._from_sequence(raw, self._dtype.arrow_dtype)
Example #25
Source File: helpers.py From pudl with MIT License | 4 votes |
def find_timezone(*, lng=None, lat=None, state=None, strict=True):
    """Find the timezone associated with a specified input location.

    Note that this function requires named arguments. The names are lng, lat,
    and state. The coordinates are tried first; the state is only consulted
    when the coordinate lookup raises and ``strict`` is False. Timezones
    based on states are imprecise, so it's far better to use lng/lat if
    possible.

    More on state-to-timezone conversion here:
    https://en.wikipedia.org/wiki/List_of_time_offsets_by_U.S._state_and_territory

    Args:
        lng (int or float in [-180,180]): Longitude, in decimal degrees
        lat (int or float in [-90, 90]): Latitude, in decimal degrees
        state (str): Abbreviation for US state or Canadian province
        strict (bool): If True, raise instead of falling back to the state
            when the coordinate lookup fails with an error.

    Returns:
        str: The timezone (as an IANA string) for that location, or None if
        neither lookup produced one.

    Raises:
        ValueError: If the coordinate lookup raises and ``strict`` is True.

    """
    try:
        found = tz_finder.timezone_at(lng=lng, lat=lat)
        if found is None:  # Try harder
            # Could change the search radius as well
            found = tz_finder.closest_timezone_at(lng=lng, lat=lat)
    # For some reason w/ Python 3.6 we get a ValueError here, but with
    # Python 3.7 we get an OverflowError...
    except (OverflowError, ValueError):
        # If we're being strict, only use lng/lat, not state
        if strict:
            raise ValueError(
                f"Can't find timezone for: lng={lng}, lat={lat}, state={state}"
            )
    else:
        return found

    # If, e.g., the coordinates are missing, try looking in the
    # state_tz_approx dictionary.
    try:
        return pudl.constants.state_tz_approx[state]
    except KeyError:
        return None
Example #26
Source File: eia923.py From pudl with MIT License | 4 votes |
def boiler_fuel(eia923_dfs, eia923_transformed_dfs):
    """Transforms the boiler_fuel_eia923 table.

    Args:
        eia923_dfs (dict): Each entry in this dictionary of DataFrame objects
            corresponds to a page from the EIA923 form, as reported in the
            Excel spreadsheets they distribute.
        eia923_transformed_dfs (dict): A dictionary of DataFrame objects in
            which pages from EIA923 form (keys) correspond to normalized
            DataFrames of values from that page (values)

    Returns:
        dict: eia923_transformed_dfs, with the transformed table stored under
        the key ``boiler_fuel_eia923``.

    """
    # Fields we're not inserting into the boiler_fuel_eia923 table.
    unused_cols = [
        'combined_heat_power',
        'plant_name_eia',
        'operator_name',
        'operator_id',
        'plant_state',
        'census_region',
        'nerc_region',
        'naics_code',
        'eia_sector',
        'sector_name',
        'fuel_unit',
        'total_fuel_consumption_quantity',
    ]
    # Work on a copy so the frame we were passed is left unmodified.
    bf_df = (
        eia923_dfs['boiler_fuel']
        .copy()
        .drop(columns=unused_cols)
        .dropna(subset=['boiler_id', 'plant_id_eia'])
    )

    # Convert the EIA923 DataFrame from yearly to monthly records.
    bf_df = _yearly_to_monthly_records(bf_df, pc.month_dict_eia923)

    simple_fuel_codes = pudl.helpers.cleanstrings_series(
        bf_df.fuel_type_code,
        pc.fuel_type_eia923_boiler_fuel_simple_map,
    )
    bf_df['fuel_type_code_pudl'] = simple_fuel_codes

    # Replace the EIA923 NA value ('.') with a real NA value.
    bf_df = pudl.helpers.fix_eia_na(bf_df)

    # Convert Year/Month columns into a single Date column...
    bf_df = pudl.helpers.convert_to_date(bf_df)

    eia923_transformed_dfs['boiler_fuel_eia923'] = bf_df
    return eia923_transformed_dfs
Example #27
Source File: eia923.py From pudl with MIT License | 4 votes |
def generation_fuel(eia923_dfs, eia923_transformed_dfs):
    """Transforms the generation_fuel_eia923 table.

    Args:
        eia923_dfs (dict): Each entry in this dictionary of DataFrame objects
            corresponds to a page from the EIA923 form, as reported in the
            Excel spreadsheets they distribute.
        eia923_transformed_dfs (dict): A dictionary of DataFrame objects in
            which pages from EIA923 form (keys) correspond to normalized
            DataFrames of values from that page (values)

    Returns:
        dict: eia923_transformed_dfs, with the transformed table stored under
        the key ``generation_fuel_eia923``.

    """
    # Fields we're not inserting into the generation_fuel_eia923 table.
    unused_cols = [
        'combined_heat_power',
        'plant_name_eia',
        'operator_name',
        'operator_id',
        'plant_state',
        'census_region',
        'nerc_region',
        'naics_code',
        'eia_sector',
        'sector_name',
        'fuel_unit',
        'total_fuel_consumption_quantity',
        'electric_fuel_consumption_quantity',
        'total_fuel_consumption_mmbtu',
        'elec_fuel_consumption_mmbtu',
        'net_generation_megawatthours',
    ]
    # This needs to be a copy of what we're passed in so we can edit it.
    gf_df = eia923_dfs['generation_fuel'].copy().drop(columns=unused_cols)

    # Convert the EIA923 DataFrame from yearly to monthly records.
    gf_df = _yearly_to_monthly_records(gf_df, pc.month_dict_eia923)

    # Replace the EIA923 NA value ('.') with a real NA value.
    gf_df = pudl.helpers.fix_eia_na(gf_df)

    # Remove "State fuel-level increment" records... which don't pertain to
    # any particular plant (they have plant_id_eia == operator_id == 99999)
    is_real_plant = gf_df.plant_id_eia != 99999
    gf_df = gf_df[is_real_plant]

    simple_fuel_codes = pudl.helpers.cleanstrings_series(
        gf_df.fuel_type,
        pc.fuel_type_eia923_gen_fuel_simple_map,
    )
    gf_df['fuel_type_code_pudl'] = simple_fuel_codes

    # Convert Year/Month columns into a single Date column...
    gf_df = pudl.helpers.convert_to_date(gf_df)

    eia923_transformed_dfs['generation_fuel_eia923'] = gf_df
    return eia923_transformed_dfs
Example #28
Source File: series.py From modin with Apache License 2.0 | 4 votes |
def value_counts(
    self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
):
    """
    Return a Series containing counts of unique values.

    The resulting object will be in descending order so that the
    first element is the most frequently-occurring element.
    Excludes NA values by default.

    Parameters
    ----------
    normalize : bool, default False
        If True then the object returned will contain the relative
        frequencies of the unique values.
    sort : bool, default True
        Sort by frequencies.
    ascending : bool, default False
        Sort in ascending order.
    bins : int, optional
        Rather than count values, group them into half-open bins,
        a convenience for ``pd.cut``, only works with numeric data.
    dropna : bool, default True
        Don't include counts of NaN.

    Returns
    -------
    Series

    Notes
    -----
    The indices of the resulting object will be in descending
    (ascending, if ascending=True) order for equal values.
    This differs slightly from pandas, where such indices are in
    random order.
    """
    # Every option is forwarded unchanged to the query compiler.
    options = {
        "normalize": normalize,
        "sort": sort,
        "ascending": ascending,
        "bins": bins,
        "dropna": dropna,
    }
    return self.__constructor__(
        query_compiler=self._query_compiler.value_counts(**options)
    )
Example #29
Source File: ferc714.py From pudl with MIT License | 4 votes |
def _standardize_offset_codes(df, offset_fixes):
    """
    Convert to standardized UTC offset abbreviations.

    This function ensures that all of the 3-4 letter abbreviations used to
    indicate a timestamp's localized offset from UTC are standardized, so
    that they can be used to make the timestamps timezone aware. The standard
    abbreviations we're using are:

    "HST": Hawaii Standard Time
    "AKST": Alaska Standard Time
    "AKDT": Alaska Daylight Time
    "PST": Pacific Standard Time
    "PDT": Pacific Daylight Time
    "MST": Mountain Standard Time
    "MDT": Mountain Daylight Time
    "CST": Central Standard Time
    "CDT": Central Daylight Time
    "EST": Eastern Standard Time
    "EDT": Eastern Daylight Time

    In some cases different respondents use the same non-standard
    abbreviations to indicate different offsets, and so the fixes are applied
    on a per-respondent basis, as defined by offset_fixes.

    UTC offset codes which are originally NA or the empty string are replaced
    with a temporary sentinel value, the string "XXX".

    Args:
        df (pandas.DataFrame): A DataFrame containing a utc_offset_code
            column that needs to be standardized.
        offset_fixes (dict): A dictionary with utility_id_ferc714 values as
            the keys, and a dictionary mapping non-standard UTC offset codes
            to the standardized UTC offset codes as the value.

    Returns:
        pandas.DataFrame: A copy of the input DataFrame in which the fixes
        have been applied to the ``utc_offset_code`` column.

    """
    logger.info("Standardizing UTC offset codes.")
    df = df.copy()

    # Replace NaN and empty string values with a temporary placeholder "XXX"
    placeholder_map = {np.nan: "XXX", "": "XXX"}
    df["utc_offset_code"] = df.utc_offset_code.replace(
        to_replace=placeholder_map
    )

    # Apply specific fixes on a per-respondent basis. The masks are recomputed
    # for every fix because earlier fixes may already have changed the column.
    for rid, fixes in offset_fixes.items():
        for orig_tz, fixed_tz in fixes.items():
            needs_fix = (
                (df.utility_id_ferc714 == rid)
                & (df["utc_offset_code"] == orig_tz)
            )
            df.loc[needs_fix, "utc_offset_code"] = fixed_tz

    return df