Python pandas.notnull() Examples

The following are 30 code examples of pandas.notnull(), collected from open-source projects. The source file, project, and license for each example are listed above it.
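As a quick refresher before the project examples: pandas.notnull() returns an element-wise boolean mask that is True wherever a value is present and False wherever it is missing (NaN, None, or NaT). The short, self-contained sketch below illustrates the two idioms that recur throughout the examples; the variable names are illustrative only.

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, "text", None])

# Element-wise mask: True where a value is present, False where it is missing
mask = pd.notnull(s)
print(mask.tolist())   # [True, False, True, False]

# Idiom 1: keep only the non-missing rows
print(s[mask])

# Idiom 2: replace missing values with None, e.g. before JSON/Avro/database output
df = pd.DataFrame({"a": [1.0, np.nan], "b": ["x", None]})
df = df.where(pd.notnull(df), None)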
Example #1
Source File: cbc_hb.py    From lifestyles with MIT License
def _create_observation_variable(individual_selections, choices, partsworth):
    """
    This function handles creating the PyMC3 observation variables.  It also gracefully handles missing observations in individual selections.

    `individual_selections` is a Series of the individual's selections, starting from 0. It can contain NaNs, which represent that no answer was provided.

    `choices` is a DataFrame with a hierarchical index: level=0 enumerates the choices, and level=1 displays the profile at a specific choice.
    Its size is (n_questions, n_choices_per_question).

    `partsworth` is a slice of a PyMC3 matrix. It represents the partsworth variables of an individual. Its size is (n_profiles,).

    This computes the values exp(partsworth * profile_j) / sum_k[ exp(partsworth * profile_k) ] for all j.
    """
    nan_mask = pd.notnull(individual_selections)
    return pm.Categorical("Obs_%s" % individual_selections.name,
                          tt.nnet.softmax(tt.stack([
                            tt.dot(choice.values, partsworth) for _, choice in choices[nan_mask.values].groupby(axis=1, level=0)
                          ], axis=0).T),
                          observed=individual_selections[nan_mask.values].values) 
Example #2
Source File: utils.py    From urbansprawl with MIT License
def load_geodataframe(geo_filename):
	""" 
	Load input GeoDataFrame

	Parameters
	----------
	geo_filename : string
		input GeoDataFrame filename

	Returns
	----------
	geopandas.GeoDataFrame
		loaded data

	"""
	# Load using geopandas
	df_osm_data = gpd.read_file(geo_filename)
	# Set None as NaN
	df_osm_data.fillna(value=np.nan, inplace=True)
	# Replace empty strings (JSON NULL is sometimes read as '') with NaN
	df_osm_data.replace('', np.nan, inplace=True)
	
	def list_int_from_string(x): # List of integers given input in string format
		return [ int(id_) for id_ in x.split(",") ]
	def list_str_from_string(x): # List of strings given input in string format
		return x.split(",")

	# Recover list
	if ( "activity_category" in df_osm_data.columns): 
		df_osm_data[ "activity_category" ] = df_osm_data.activity_category.apply(lambda x: list_str_from_string(x) if pd.notnull(x) else np.nan )
	if ( "containing_parts" in df_osm_data.columns): 
		df_osm_data[ "containing_parts" ] = df_osm_data.containing_parts.apply( lambda x: list_int_from_string(x) if pd.notnull(x) else np.nan )
	if ( "containing_poi" in df_osm_data.columns): 
		df_osm_data[ "containing_poi" ] = df_osm_data.containing_poi.apply( lambda x: list_int_from_string(x) if pd.notnull(x) else np.nan )
	
	# To UTM coordinates
	return ox.project_gdf( df_osm_data ) 
Example #3
Source File: utils.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License
def fillna(series_or_arr, missing_value=0.0):
    """Fill missing values in pandas objects and numpy arrays.

    Arguments
    ---------
    series_or_arr : pandas.Series, numpy.ndarray
        The numpy array or pandas series for which the missing values
        need to be replaced.
    missing_value : float, int, str
        The value to replace the missing value with. Default 0.0.

    Returns
    -------
    pandas.Series, numpy.ndarray
        The numpy array or pandas series with the missing values
        filled.
    """

    if pandas.notnull(missing_value):
        if isinstance(series_or_arr, (numpy.ndarray)):
            series_or_arr[numpy.isnan(series_or_arr)] = missing_value
        else:
            series_or_arr.fillna(missing_value, inplace=True)

    return series_or_arr 
Example #4
Source File: datasets.py    From deepchem with MIT License
def load_metadata(self):
    try:
      tasks_filename, metadata_filename = self._get_metadata_filename()
      with open(tasks_filename) as fin:
        tasks = json.load(fin)
      metadata_df = pd.read_csv(metadata_filename, compression='gzip')
      metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
      return tasks, metadata_df
    except Exception as e:
      pass

    # Load obsolete format -> save in new format
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
      tasks, metadata_df = load_from_disk(metadata_filename)
      del metadata_df['task_names']
      del metadata_df['basename']
      save_metadata(tasks, metadata_df, self.data_dir)
      return tasks, metadata_df
    raise ValueError("No Metadata Found On Disk") 
Example #5
Source File: plotter.py    From pygraphistry with BSD 3-Clause "New" or "Revised" License
def _make_json_dataset(self, edges, nodes, name):
        (elist, nlist) = self._bind_attributes_v1(edges, nodes)
        edict = elist.where((pandas.notnull(elist)), None).to_dict(orient='records')

        bindings = {'idField': self._node or Plotter._defaultNodeId,
                    'destinationField': self._destination, 'sourceField': self._source}
        dataset = {'name': PyGraphistry._config['dataset_prefix'] + name,
                   'bindings': bindings, 'type': 'edgelist', 'graph': edict}

        if nlist is not None:
            ndict = nlist.where((pandas.notnull(nlist)), None).to_dict(orient='records')
            dataset['labels'] = ndict
        return dataset


    # Main helper for creating ETL2 payload 
Example #6
Source File: vgraph.py    From pygraphistry with BSD 3-Clause "New" or "Revised" License
def objectEncoder(vg, series, dtype):
    series.where(pandas.notnull(series), '\0', inplace=True)
    # vec is a string[] submessage within a repeated
    vec = vg.string_vectors.add()
    str_series = None    
    try:
        str_series = series.astype('unicode')
    except UnicodeDecodeError:
        warnings.warn("Warning: escaping unicode")
        str_series = series.apply(lambda v: v.decode('utf-8'))
    for val in str_series:
        vec.values.append(val)
    return (vec, {'ctype': 'utf8'})


# NaN (as well as Infinity and undefined) are not valid JSON. Use this guard to filter
# them out when creating the JSON metadata.
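The guard mentioned in that comment is not included in the snippet above. A minimal sketch of such a filter is shown below; the helper name is hypothetical, not pygraphistry's actual function.

import math

def is_json_safe(value):
    # NaN and +/-Infinity have no JSON literal, so treat them as values to drop
    return not (isinstance(value, float) and (math.isnan(value) or math.isinf(value)))

# Usage: drop unsafe values before building the JSON metadata
metadata = {k: v for k, v in {"mean": float("nan"), "count": 10}.items() if is_json_safe(v)}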
Example #7
Source File: datasets.py    From PADME with MIT License
def load_metadata(self):
    try:
      tasks_filename, metadata_filename = self._get_metadata_filename()
      with open(tasks_filename) as fin:
        tasks = json.load(fin)
      metadata_df = pd.read_csv(metadata_filename, compression='gzip')
      metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
      return tasks, metadata_df
    except Exception as e:
      pass

    # Load obsolete format -> save in new format
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
      tasks, metadata_df = load_from_disk(metadata_filename)
      del metadata_df['task_names']
      del metadata_df['basename']
      save_metadata(tasks, metadata_df, self.data_dir)
      return tasks, metadata_df
    raise ValueError("No Metadata Found On Disk") 
Example #8
Source File: test_mice.py    From vnpy_crypto with MIT License
def test_pertmeth(self):
        # Test with specified perturbation method.

        df = gendat()
        orig = df.copy()
        mx = pd.notnull(df)
        nrow, ncol = df.shape

        for pert_meth in "gaussian", "boot":

            imp_data = mice.MICEData(df, perturbation_method=pert_meth)

            for k in range(2):
                imp_data.update_all()
                assert_equal(imp_data.data.shape[0], nrow)
                assert_equal(imp_data.data.shape[1], ncol)
                assert_allclose(orig[mx], imp_data.data[mx])

        assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1']) 
Example #9
Source File: uniprot.py    From ssbio with MIT License
def uniprot_reviewed_checker(uniprot_id):
    """Check if a single UniProt ID is reviewed or not.

    Args:
        uniprot_id: UniProt accession ID to check

    Returns:
        bool: If the entry is reviewed

    """

    query_string = 'id:' + uniprot_id

    uni_rev_raw = StringIO(bsup.search(query_string, columns='id,reviewed', frmt='tab'))
    uni_rev_df = pd.read_table(uni_rev_raw, sep='\t', index_col=0)
    uni_rev_df = uni_rev_df.fillna(False)
    uni_rev_df = uni_rev_df[pd.notnull(uni_rev_df.Status)]

    uni_rev_df = uni_rev_df.replace(to_replace="reviewed", value=True)
    uni_rev_df = uni_rev_df.replace(to_replace="unreviewed", value=False)
    uni_rev_dict_adder = uni_rev_df.to_dict()['Status']

    return uni_rev_dict_adder[uniprot_id] 
Example #10
Source File: quality.py    From ssbio with MIT License
def parse_psqs(psqs_results_file):
    """Parse a PSQS result file and returns a Pandas DataFrame of the results

    Args:
        psqs_results_file: Path to psqs results file

    Returns:
        Pandas DataFrame: Summary of PSQS results

    """

    # TODO: generalize column names for all results, save as dict instead

    psqs_results = pd.read_csv(psqs_results_file, sep='\t', header=None)
    psqs_results['pdb_file'] = psqs_results[0].apply(lambda x: str(x).strip('./').strip('.pdb'))
    psqs_results = psqs_results.rename(columns = {1:'psqs_local', 2:'psqs_burial', 3:'psqs_contact', 4:'psqs_total'}).drop(0, axis=1)
    psqs_results['u_pdb'] = psqs_results['pdb_file'].apply(lambda x: x.upper() if len(x)==4 else np.nan)
    psqs_results['i_entry_name'] = psqs_results['pdb_file'].apply(lambda x: x.split('_model1')[0] if len(x)>4 else np.nan)
    psqs_results = psqs_results[pd.notnull(psqs_results.psqs_total)]

    return psqs_results 
Example #11
Source File: movie_data.py    From parade with MIT License
def execute_internal(self, context, **kwargs):
        """
        the internal execution process to be implemented
        :param context:
        :param kwargs:
        :return:
        """
        df = pd.read_csv('https://raw.githubusercontent.com/bailaohe/parade/master/assets/movie_metadata.csv')

        # Project the dataset onto the attributes we are interested in
        df = df[['movie_title', 'genres', 'title_year', 'content_rating', 'budget', 'num_voted_users', 'imdb_score']]

        # Filter out records with *NAN* title_year and budget
        df = df[pd.notnull(df['title_year'])]
        df = df[df['budget'] > 0]

        # Extract the genres ROOT
        df['genres_root'] = df['genres'].apply(lambda g: g.split('|')[0])

        return df 
Example #12
Source File: test_logic.py    From ontask_b with MIT License
def test_df_equivalent_after_sql(self):

        # Parse the CSV
        df_source = services.load_df_from_csvfile(
            io.StringIO(self.csv1),
            0,
            0)

        # Store the DF in the DB
        pandas.store_table(df_source, self.table_name)

        # Load it from the DB
        df_dst = pandas.load_table(self.table_name)

        # NaNs in boolean columns are now None
        df_source['bool1'] = df_source['bool1'].where(
            pd.notnull(df_source['bool1']),
            None)
        df_source['bool2'] = df_source['bool2'].where(
            pd.notnull(df_source['bool2']),
            None)

        # Data frames must be identical
        assert df_source.equals(df_dst) 
Example #13
Source File: dataframe_utils.py    From fileflow with Apache License 2.0
def clean_and_write_dataframe_to_csv(data, filename):
    """
    Cleans a dataframe of np.NaNs and saves to file via pandas.to_csv

    :param data: data to write to CSV
    :type data: :class:`pandas.DataFrame`
    :param filename: Path to file to write CSV to. if None, string of data
        will be returned
    :type filename: str | None
    :return: If the filename is None, returns the string of data. Otherwise
        returns None.
    :rtype: str | None
    """
    # cleans np.NaN values
    data = data.where((pd.notnull(data)), None)
    # If filename=None, to_csv will return a string
    result = data.to_csv(path_or_buf=filename, encoding='utf-8', dtype=str, index=False, na_rep=None,
                         skipinitialspace=True, quoting=csv.QUOTE_ALL)
    logging.info("Dataframe of shape %s has been stored." % str(data.shape))

    return result 
Example #14
Source File: Trajectory.py    From TrajLib with Apache License 2.0
def pre_processing(self, labels):
        # removing NaN in lat and lon
        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lat), :]
        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lon), :]
        for label in labels:
            self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data[label]), :]
        """
        lat_= self.raw_data.lat.rolling(3, min_periods=1).median()
        self.raw_data.assign(lat=lat_)
        lon_ = self.raw_data.lon.rolling(3, min_periods=1).median()
        self.raw_data.assign(lot=lon_)

        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lat), :]
        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lon), :]
        """

        return None 
Example #15
Source File: generate_avro_file.py    From tfx with Apache License 2.0
def generate_avro(src_file: Text, output_file: Text):
  """Generates avro file based on src file.

  Args:
    src_file: path to Chicago taxi dataset.
    output_file: output path for avro file.
  """
  df = pd.read_csv(src_file)
  # Replace NaNs with None so the Avro writer interprets them as null values
  df = df.where((pd.notnull(df)), None)

  records = df.to_dict(orient='records')

  parsed_schema = fastavro.parse_schema(get_schema())
  with open(output_file, 'wb') as f:
    fastavro.writer(f, parsed_schema, records) 
Example #16
Source File: base.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License
def _make_index_names(self, name1, name2):

        if pandas.notnull(name1) and pandas.notnull(name2) and \
                (name1 == name2):
            return ["{}{}".format(name1, self.suffixes[0]),
                    "{}{}".format(name1, self.suffixes[1])]
        else:
            return [name1, name2] 
Example #17
Source File: MSVD.py    From RecNet with MIT License
def load_captions(self):
        df = pd.read_csv(self.caption_fpath)
        df = df[df['Language'] == 'English']
        df = df[pd.notnull(df['Description'])]
        captions = df['Description'].values
        return captions 
Example #18
Source File: MSVD.py    From RecNet with MIT License
def load_captions(self):
        df = pd.read_csv(self.caption_fpath)
        df = df[df['Language'] == 'English']
        df = df[[ 'VideoID', 'Start', 'End', 'Description' ]]
        df = df[pd.notnull(df['Description'])]

        for video_id, start, end, caption in df.values:
            vid = "{}_{}_{}".format(video_id, start, end)
            self.captions[vid].append(caption) 
Example #19
Source File: MSVD.py    From RecNet with MIT License
def load_metadata():
    df = pd.read_csv(C.caption_fpath)
    df = df[df['Language'] == 'English']
    df = df[pd.notnull(df['Description'])]
    df = df.reset_index(drop=True)
    return df 
Example #20
Source File: test_mice.py    From vnpy_crypto with MIT License
def test_default(self):
        # Test with all defaults.

        df = gendat()
        orig = df.copy()
        mx = pd.notnull(df)
        imp_data = mice.MICEData(df)
        nrow, ncol = df.shape

        assert_allclose(imp_data.ix_miss['x1'], np.arange(60))
        assert_allclose(imp_data.ix_obs['x1'], np.arange(60, 200))
        assert_allclose(imp_data.ix_miss['x2'], np.arange(40))
        assert_allclose(imp_data.ix_miss['x3'], np.arange(10, 30, 2))
        assert_allclose(imp_data.ix_obs['x3'],
                        np.concatenate((np.arange(10),
                                        np.arange(11, 30, 2),
                                        np.arange(30, 200))))

        for k in range(3):
            imp_data.update_all()
            assert_equal(imp_data.data.shape[0], nrow)
            assert_equal(imp_data.data.shape[1], ncol)
            assert_allclose(orig[mx], imp_data.data[mx])

        fml = 'x1 ~ x2 + x3 + x4 + x5 + y'
        assert_equal(imp_data.conditional_formula['x1'], fml)

        assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1'])

        # Should make a copy
        assert(not (df is imp_data.data))

        (endog_obs, exog_obs, exog_miss,
         predict_obs_kwds, predict_miss_kwds) = imp_data.get_split_data('x3')
        assert_equal(len(endog_obs), 190)
        assert_equal(exog_obs.shape, [190, 6])
        assert_equal(exog_miss.shape, [10, 6]) 
Example #21
Source File: test_mice.py    From vnpy_crypto with MIT License
def test_set_imputer(self):
        # Test with specified perturbation method.

        from statsmodels.regression.linear_model import RegressionResultsWrapper
        from statsmodels.genmod.generalized_linear_model import GLMResultsWrapper

        df = gendat()
        orig = df.copy()
        mx = pd.notnull(df)
        nrow, ncol = df.shape

        imp_data = mice.MICEData(df)
        imp_data.set_imputer('x1', 'x3 + x4 + x3*x4')
        imp_data.set_imputer('x2', 'x4 + I(x5**2)')
        imp_data.set_imputer('x3', model_class=sm.GLM,
                             init_kwds={"family": sm.families.Binomial()})

        imp_data.update_all()
        assert_equal(imp_data.data.shape[0], nrow)
        assert_equal(imp_data.data.shape[1], ncol)
        assert_allclose(orig[mx], imp_data.data[mx])
        for j in range(1, 6):
            if j == 3:
                assert_equal(isinstance(imp_data.models['x3'], sm.GLM), True)
                assert_equal(isinstance(imp_data.models['x3'].family, sm.families.Binomial), True)
                assert_equal(isinstance(imp_data.results['x3'], GLMResultsWrapper), True)
            else:
                assert_equal(isinstance(imp_data.models['x%d' % j], sm.OLS), True)
                assert_equal(isinstance(imp_data.results['x%d' % j], RegressionResultsWrapper), True)

        fml = 'x1 ~ x3 + x4 + x3*x4'
        assert_equal(imp_data.conditional_formula['x1'], fml)

        fml = 'x4 ~ x1 + x2 + x3 + x5 + y'
        assert_equal(imp_data.conditional_formula['x4'], fml)

        assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1']) 
Example #22
Source File: mixed_linear_model.py    From vnpy_crypto with MIT License
def _handle_missing(data, groups, formula, re_formula, vc_formula):

    tokens = set([])

    forms = [formula]
    if re_formula is not None:
        forms.append(re_formula)
    if vc_formula is not None:
        forms.extend(vc_formula.values())

    import tokenize
    from statsmodels.compat import PY3
    from statsmodels.compat.python import StringIO, asunicode
    skiptoks = {"(", ")", "*", ":", "+", "-", "**", "/"}

    for fml in forms:
        # Unicode conversion is for Py2 compatibility
        rl = StringIO(fml)

        def rlu():
            line = rl.readline()
            return asunicode(line, 'ascii')
        g = tokenize.generate_tokens(rlu)
        for tok in g:
            if tok not in skiptoks:
                if PY3:
                    tokens.add(tok.string)
                else:
                    tokens.add(tok[1])
    tokens = list(tokens & set(data.columns))
    tokens.sort()

    data = data[tokens]
    ii = pd.notnull(data).all(1)
    if type(groups) != "str":
        ii &= pd.notnull(groups)

    return data.loc[ii, :], groups[np.asarray(ii)] 
Example #23
Source File: xlsxpandasformatter.py    From xlsxpandasformatter with MIT License
def format_background_colormap(self, col, colormap, vmin, vmax):

        iCol, worksheetCol = self.convert_to_col_index(col)

        for index, row in self.df.iterrows():
            x = row.iloc[iCol]
            if pd.notnull(x):
                colorHex = convert_colormap_to_hex(colormap, x, vmin=vmin, vmax=vmax)
                rowIndex = self.df.index.get_loc(index)
                self.formatTable[rowIndex][iCol]['bg_color'] = colorHex 
Example #24
Source File: core.py    From meterstick with Apache License 2.0
def _merge_metrics(row):
  non_empty = [str(r) for r in row if pd.notnull(r)]
  if not non_empty:
    return None
  elif len(non_empty) == 1:
    return non_empty[0]
  else:
    return "::".join(non_empty)


# TODO(dlsun): Remove AnalysisParameters and incorporate the
#   attributes directly into the Analyze object. 
Example #25
Source File: test_resample.py    From Computable with MIT License
def test_custom_grouper(self):

        dti = DatetimeIndex(freq='Min', start=datetime(2005, 1, 1),
                            end=datetime(2005, 1, 10))

        s = Series(np.array([1] * len(dti)), index=dti, dtype='int64')

        b = TimeGrouper(Minute(5))
        g = s.groupby(b)

        # check all cython functions work
        funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
        for f in funcs:
            g._cython_agg_general(f)

        b = TimeGrouper(Minute(5), closed='right', label='right')
        g = s.groupby(b)
        # check all cython functions work
        funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
        for f in funcs:
            g._cython_agg_general(f)

        self.assertEquals(g.ngroups, 2593)
        self.assert_(notnull(g.mean()).all())

        # construct expected val
        arr = [1] + [5] * 2592
        idx = dti[0:-1:5]
        idx = idx.append(dti[-1:])
        expect = Series(arr, index=idx)

        # GH2763 - return input dtype if we can
        result = g.agg(np.sum)
        assert_series_equal(result, expect)

        df = DataFrame(np.random.rand(len(dti), 10), index=dti, dtype='float64')
        r = df.groupby(b).agg(np.sum)

        self.assertEquals(len(r.columns), 10)
        self.assertEquals(len(r.index), 2593) 
Example #26
Source File: test_moments.py    From Computable with MIT License
def _check_expanding_ndarray(self, func, static_comp, has_min_periods=True,
                                 has_time_rule=True, preserve_nan=True):
        result = func(self.arr)

        assert_almost_equal(result[10],
                            static_comp(self.arr[:11]))

        if preserve_nan:
            assert(np.isnan(result[self._nan_locs]).all())

        arr = randn(50)

        if has_min_periods:
            result = func(arr, min_periods=30)
            assert(np.isnan(result[:29]).all())
            assert_almost_equal(result[-1], static_comp(arr[:50]))

            # min_periods is working correctly
            result = func(arr, min_periods=15)
            self.assert_(np.isnan(result[13]))
            self.assert_(not np.isnan(result[14]))

            arr2 = randn(20)
            result = func(arr2, min_periods=5)
            self.assert_(isnull(result[3]))
            self.assert_(notnull(result[4]))

            # min_periods=0
            result0 = func(arr, min_periods=0)
            result1 = func(arr, min_periods=1)
            assert_almost_equal(result0, result1)
        else:
            result = func(arr)
            assert_almost_equal(result[-1], static_comp(arr[:50])) 
Example #27
Source File: test_ols.py    From Computable with MIT License
def test_longpanel_series_combo(self):
        wp = tm.makePanel()
        lp = wp.to_frame()

        y = lp.pop('ItemA')
        model = ols(y=y, x=lp, entity_effects=True, window=20)
        self.assert_(notnull(model.beta.values).all())
        tm.assert_isinstance(model, PanelOLS)
        model.summary 
Example #28
Source File: ec2_analyze.py    From isitfit with Apache License 2.0
def after_all(self, context_all):
    # add col for utilization in percentage
    def calc_usedPct(row):
      if row.capacity_usd==0: return 0
      o = row.used_usd / row.capacity_usd * 100
      return int(o)

    self.df_bins['used_pct'] = self.df_bins.apply(calc_usedPct, axis=1)

    # add column for regions as string
    self.df_bins['regions_str'] = self.df_bins['regions_set'].apply(lambda x: "0" if len(x)==0 else "%i (%s)"%(len(x), l2s(x)))

    # cases where dt_start > dt_end are those where there was no data and the initialization remained
    # so overwrite with na
    # Update 2019-12-11 Now that the df_bins timestamps are set with resample and dt_end is inclusive,
    # instead of setting to na, just swap the start/end fake timestamps which represent the end/start of the periods
    #import numpy as np
    #self.df_bins['dt_start'] = self.df_bins.apply(lambda row: np.nan if row.count_analyzed==0 else row.dt_start, axis=1)
    #self.df_bins['dt_end']   = self.df_bins.apply(lambda row: np.nan if row.count_analyzed==0 else row.dt_end  , axis=1)
    self.df_bins['dt_start_bkp']  = self.df_bins['dt_start']
    self.df_bins['dt_start'] = self.df_bins.apply(lambda row: row.dt_end       if row.count_analyzed==0 else row.dt_start, axis=1)
    self.df_bins['dt_end']   = self.df_bins.apply(lambda row: row.dt_start_bkp if row.count_analyzed==0 else row.dt_end  , axis=1)
    del self.df_bins['dt_start_bkp']

    # convert the dt_{start,end} back to dates again, given the nans
    for fx in ['dt_start', 'dt_end']: self.df_bins[fx] = pd.to_datetime(self.df_bins[fx])

    # Bugfix for cloudwatch:
    # When 90>=ndays>=64, cloudwatchman/metric.get_statistics returns data with max Timestamp on mainmanager.EndTime
    # Otherwise, the data max Timestamp is EndTime - 1 day
    # Here, get around this problem by incrementing by 1 day (or just set to mainmanager.EndTime)
    # This was tested on 2019-12-10 10:00 am UTC, and the last date was Dec 9 for ndays<64 and Dec 10 for ndays>=64
    if pd.notnull(self.df_bins['dt_end'].iloc[-1]):
      dt_max = context_all['mainManager'].EndTime.date()
      import datetime as dt
      dt_lastp1 = self.df_bins.dt_end.iloc[-1].date() + dt.timedelta(days=1)
      self.df_bins.iloc[-1, self.df_bins.columns=='dt_end'] = min(dt_max, dt_lastp1)

    # inject result for reporter access
    context_all['df_bins'] = self.df_bins
    return context_all 
Example #29
Source File: label.py    From lost with MIT License
def import_df(self, df):
        '''Import LabelTree from DataFrame
        
        Args:
            df (pandas.DataFrame): LabelTree in DataFrame style.

        Returns:
            :class:`lost.db.model.LabelLeaf` or None: 
                The created root leaf or None if a root leaf with same
                name is already present in database.
        '''
        df = df.where((pd.notnull(df)), None)
        root = df[df['parent_leaf_id'].isnull()]
        no_root = df[~df['parent_leaf_id'].isnull()]
        childs = {}

        if len(root) != 1:
            raise ValueError('''Can not import. There needs 
                to be exactly one root leaf for that tree! 
                Found: \n{}'''.format(root))
        else:
            try:
                root_leaf = self.create_root(root['name'].values[0])
                if root_leaf is None:
                    return None #A tree with the same name already exists.
                self._df_row_to_leaf(root.loc[0], root_leaf)

                #Create child dict
                for index, row in no_root.iterrows():
                    if not row['parent_leaf_id'] in childs:
                        childs[row['parent_leaf_id']] = []
                    childs[row['parent_leaf_id']].append(row)
                
                self.__create_childs_from_df(childs, root_leaf, root.loc[0])
                self.dbm.commit()
                return root_leaf
            except KeyError:
                self.logger.error('''At least the following columns 
                    need to be provided: *idx*, *name*, *parent_leaf_id*''')
                raise 
Example #30
Source File: data_cleaner.py    From data-cleaner with MIT License
def _split(value, separators):
        values = []
        for separator in separators:
            if separator in str(value):
                values = [str(split_value) for split_value in
                          value.split(separator)]
                break

        return pd.Series([str(value).strip() for value in values
                          if pd.notnull(value)])