Python pandas.notnull() Examples

The following are 30 code examples showing how to use pandas.notnull(). These examples are extracted from open source projects; the project, author, source file, and license are listed above each example.


You may also want to check out all available functions and classes of the pandas module.
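
Before working through the project examples, here is a minimal, self-contained sketch of what pandas.notnull() returns (a boolean mask with the same shape as its input) and of two patterns that recur throughout the examples below: keeping only rows where a column is not missing, and replacing missing values with None before serialization. The data and variable names here are illustrative only.

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, "text", None])

# notnull() gives an element-wise boolean mask of non-missing values
mask = pd.notnull(s)
print(mask.tolist())  # [True, False, True, False]

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": ["x", "y", None]})

# Keep only the rows where column "a" is not missing
df_filtered = df[pd.notnull(df["a"])]

# Replace every missing value with None (a common step before JSON or Avro export)
df_none = df.where(pd.notnull(df), None)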

Example 1
Project: lifestyles   Author: CamDavidsonPilon   File: cbc_hb.py    License: MIT License
def _create_observation_variable(individual_selections, choices, partsworth):
    """
    This function handles creating the PyMC3 observation variables.  It also gracefully handles missing observations in individual selections.

    `individual_selections` is a Series of the individual's selections, starting from 0. It can contain NaNs, which represent that no answer was provided.

    `choices` is a DataFrame with a hierarchical index: level=0 enumerates the choices, and level=1 displays the profile at a specific choice.
    Its size is (n_questions, n_choices_per_question).

    `partsworth` is a slice of a PyMC3 matrix. It represents the partsworth variables of an individual. Its size is (n_profiles,).

    This computes the values exp(partsworth * profile_j) / sum_k[ exp(partsworth * profile_k) ] for all j.
    """
    nan_mask = pd.notnull(individual_selections)
    return pm.Categorical("Obs_%s" % individual_selections.name,
                          tt.nnet.softmax(tt.stack([
                            tt.dot(choice.values, partsworth) for _, choice in choices[nan_mask.values].groupby(axis=1, level=0)
                          ], axis=0).T),
                          observed=individual_selections[nan_mask.values].values) 
Example 2
Project: recordlinkage   Author: J535D165   File: utils.py    License: BSD 3-Clause "New" or "Revised" License
def fillna(series_or_arr, missing_value=0.0):
    """Fill missing values in pandas objects and numpy arrays.

    Arguments
    ---------
    series_or_arr : pandas.Series, numpy.ndarray
        The numpy array or pandas series for which the missing values
        need to be replaced.
    missing_value : float, int, str
        The value to replace the missing value with. Default 0.0.

    Returns
    -------
    pandas.Series, numpy.ndarray
        The numpy array or pandas series with the missing values
        filled.
    """

    if pandas.notnull(missing_value):
        if isinstance(series_or_arr, (numpy.ndarray)):
            series_or_arr[numpy.isnan(series_or_arr)] = missing_value
        else:
            series_or_arr.fillna(missing_value, inplace=True)

    return series_or_arr 
Example 3
Project: deepchem   Author: deepchem   File: datasets.py    License: MIT License
def load_metadata(self):
    try:
      tasks_filename, metadata_filename = self._get_metadata_filename()
      with open(tasks_filename) as fin:
        tasks = json.load(fin)
      metadata_df = pd.read_csv(metadata_filename, compression='gzip')
      metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
      return tasks, metadata_df
    except Exception as e:
      pass

    # Load obsolete format -> save in new format
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
      tasks, metadata_df = load_from_disk(metadata_filename)
      del metadata_df['task_names']
      del metadata_df['basename']
      save_metadata(tasks, metadata_df, self.data_dir)
      return tasks, metadata_df
    raise ValueError("No Metadata Found On Disk") 
Example 4
Project: pygraphistry   Author: graphistry   File: plotter.py    License: BSD 3-Clause "New" or "Revised" License
def _make_json_dataset(self, edges, nodes, name):
        (elist, nlist) = self._bind_attributes_v1(edges, nodes)
        edict = elist.where((pandas.notnull(elist)), None).to_dict(orient='records')

        bindings = {'idField': self._node or Plotter._defaultNodeId,
                    'destinationField': self._destination, 'sourceField': self._source}
        dataset = {'name': PyGraphistry._config['dataset_prefix'] + name,
                   'bindings': bindings, 'type': 'edgelist', 'graph': edict}

        if nlist is not None:
            ndict = nlist.where((pandas.notnull(nlist)), None).to_dict(orient='records')
            dataset['labels'] = ndict
        return dataset


    # Main helper for creating ETL2 payload 
Example 5
Project: pygraphistry   Author: graphistry   File: vgraph.py    License: BSD 3-Clause "New" or "Revised" License
def objectEncoder(vg, series, dtype):
    series.where(pandas.notnull(series), '\0', inplace=True)
    # vec is a string[] submessage within a repeated
    vec = vg.string_vectors.add()
    str_series = None    
    try:
        str_series = series.astype('unicode')
    except UnicodeDecodeError:
        warnings.warn("Warning: escaping unicode")
        str_series = series.apply(lambda v: v.decode('utf-8'))
    for val in str_series:
        vec.values.append(val)
    return (vec, {'ctype': 'utf8'})


# NaN (as well as Infinity and undefined) are not valid JSON. Use this guard to filter
# them out when creating the json metadata.
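
The guard function referenced by this comment is not included in the excerpt above. As a hedged sketch of what such a check could look like (the function name is_json_safe and its exact behavior are assumptions, not the project's actual implementation):

import numpy as np
import pandas as pd

def is_json_safe(value):
    # Reject missing values (None, NaN, NaT), which pd.isnull detects
    if pd.isnull(value):
        return False
    # Reject non-finite floats (inf, -inf), which strict JSON cannot represent
    if isinstance(value, float) and not np.isfinite(value):
        return False
    return True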
Example 6
Project: PADME   Author: simonfqy   File: datasets.py    License: MIT License
def load_metadata(self):
    try:
      tasks_filename, metadata_filename = self._get_metadata_filename()
      with open(tasks_filename) as fin:
        tasks = json.load(fin)
      metadata_df = pd.read_csv(metadata_filename, compression='gzip')
      metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
      return tasks, metadata_df
    except Exception as e:
      pass

    # Load obsolete format -> save in new format
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
      tasks, metadata_df = load_from_disk(metadata_filename)
      del metadata_df['task_names']
      del metadata_df['basename']
      save_metadata(tasks, metadata_df, self.data_dir)
      return tasks, metadata_df
    raise ValueError("No Metadata Found On Disk") 
Example 7
Project: vnpy_crypto   Author: birforce   File: test_mice.py    License: MIT License
def test_pertmeth(self):
        # Test with specified perturbation method.

        df = gendat()
        orig = df.copy()
        mx = pd.notnull(df)
        nrow, ncol = df.shape

        for pert_meth in "gaussian", "boot":

            imp_data = mice.MICEData(df, perturbation_method=pert_meth)

            for k in range(2):
                imp_data.update_all()
                assert_equal(imp_data.data.shape[0], nrow)
                assert_equal(imp_data.data.shape[1], ncol)
                assert_allclose(orig[mx], imp_data.data[mx])

        assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1']) 
Example 8
Project: ssbio   Author: SBRG   File: uniprot.py    License: MIT License
def uniprot_reviewed_checker(uniprot_id):
    """Check if a single UniProt ID is reviewed or not.

    Args:
        uniprot_id:

    Returns:
        bool: If the entry is reviewed

    """

    query_string = 'id:' + uniprot_id

    uni_rev_raw = StringIO(bsup.search(query_string, columns='id,reviewed', frmt='tab'))
    uni_rev_df = pd.read_table(uni_rev_raw, sep='\t', index_col=0)
    uni_rev_df = uni_rev_df.fillna(False)
    uni_rev_df = uni_rev_df[pd.notnull(uni_rev_df.Status)]

    uni_rev_df = uni_rev_df.replace(to_replace="reviewed", value=True)
    uni_rev_df = uni_rev_df.replace(to_replace="unreviewed", value=False)
    uni_rev_dict_adder = uni_rev_df.to_dict()['Status']

    return uni_rev_dict_adder[uniprot_id] 
Example 9
Project: ssbio   Author: SBRG   File: quality.py    License: MIT License
def parse_psqs(psqs_results_file):
    """Parse a PSQS result file and returns a Pandas DataFrame of the results

    Args:
        psqs_results_file: Path to psqs results file

    Returns:
        Pandas DataFrame: Summary of PSQS results

    """

    # TODO: generalize column names for all results, save as dict instead

    psqs_results = pd.read_csv(psqs_results_file, sep='\t', header=None)
    psqs_results['pdb_file'] = psqs_results[0].apply(lambda x: str(x).strip('./').strip('.pdb'))
    psqs_results = psqs_results.rename(columns = {1:'psqs_local', 2:'psqs_burial', 3:'psqs_contact', 4:'psqs_total'}).drop(0, axis=1)
    psqs_results['u_pdb'] = psqs_results['pdb_file'].apply(lambda x: x.upper() if len(x)==4 else np.nan)
    psqs_results['i_entry_name'] = psqs_results['pdb_file'].apply(lambda x: x.split('_model1')[0] if len(x)>4 else np.nan)
    psqs_results = psqs_results[pd.notnull(psqs_results.psqs_total)]

    return psqs_results 
Example 10
Project: parade   Author: bailaohe   File: movie_data.py    License: MIT License
def execute_internal(self, context, **kwargs):
        """
        the internal execution process to be implemented
        :param context:
        :param kwargs:
        :return:
        """
        df = pd.read_csv('https://raw.githubusercontent.com/bailaohe/parade/master/assets/movie_metadata.csv')

        # Process projection on the dataset to get our interested attributes
        df = df[['movie_title', 'genres', 'title_year', 'content_rating', 'budget', 'num_voted_users', 'imdb_score']]

        # Filter out records with *NAN* title_year and budget
        df = df[pd.notnull(df['title_year'])]
        df = df[df['budget'] > 0]

        # Extract the genres ROOT
        df['genres_root'] = df['genres'].apply(lambda g: g.split('|')[0])

        return df 
Example 11
Project: ontask_b   Author: abelardopardo   File: test_logic.py    License: MIT License
def test_df_equivalent_after_sql(self):

        # Parse the CSV
        df_source = services.load_df_from_csvfile(
            io.StringIO(self.csv1),
            0,
            0)

        # Store the DF in the DB
        pandas.store_table(df_source, self.table_name)

        # Load it from the DB
        df_dst = pandas.load_table(self.table_name)

        # NaN in boolean columns are now None
        df_source['bool1'] = df_source['bool1'].where(
            pd.notnull(df_source['bool1']),
            None)
        df_source['bool2'] = df_source['bool2'].where(
            pd.notnull(df_source['bool2']),
            None)

        # Data frames must be identical
        assert df_source.equals(df_dst) 
Example 12
Project: fileflow   Author: industrydive   File: dataframe_utils.py    License: Apache License 2.0
def clean_and_write_dataframe_to_csv(data, filename):
    """
    Cleans a dataframe of np.NaNs and saves to file via pandas.to_csv

    :param data: data to write to CSV
    :type data: :class:`pandas.DataFrame`
    :param filename: Path to file to write CSV to. if None, string of data
        will be returned
    :type filename: str | None
    :return: If the filename is None, returns the string of data. Otherwise
        returns None.
    :rtype: str | None
    """
    # cleans np.NaN values
    data = data.where((pd.notnull(data)), None)
    # If filename=None, to_csv will return a string
    result = data.to_csv(path_or_buf=filename, encoding='utf-8', dtype=str, index=False, na_rep=None,
                         skipinitialspace=True, quoting=csv.QUOTE_ALL)
    logging.info("Dataframe of shape %s has been stored." % str(data.shape))

    return result 
Example 13
Project: TrajLib   Author: metemaad   File: Trajectory.py    License: Apache License 2.0
def pre_processing(self, labels):
        # removing NaN in lat and lon
        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lat), :]
        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lon), :]
        for label in labels:
            self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data[label]), :]
        """
        lat_= self.raw_data.lat.rolling(3, min_periods=1).median()
        self.raw_data.assign(lat=lat_)
        lon_ = self.raw_data.lon.rolling(3, min_periods=1).median()
        self.raw_data.assign(lot=lon_)

        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lat), :]
        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lon), :]
        """

        return None 
Example 14
Project: tfx   Author: tensorflow   File: generate_avro_file.py    License: Apache License 2.0
def generate_avro(src_file: Text, output_file: Text):
  """Generates avro file based on src file.

  Args:
    src_file: path to Chicago taxi dataset.
    output_file: output path for avro file.
  """
  df = pd.read_csv(src_file)
  # Replaces NaN's with None's for avroWriter to interpret null values
  df = df.where((pd.notnull(df)), None)

  records = df.to_dict(orient='records')

  parsed_schema = fastavro.parse_schema(get_schema())
  with open(output_file, 'wb') as f:
    fastavro.writer(f, parsed_schema, records) 
Example 15
Project: urbansprawl   Author: lgervasoni   File: utils.py    License: MIT License
def load_geodataframe(geo_filename):
	""" 
	Load input GeoDataFrame

	Parameters
	----------
	geo_filename : string
		input GeoDataFrame filename

	Returns
	----------
	geopandas.GeoDataFrame
		loaded data

	"""
	# Load using geopandas
	df_osm_data = gpd.read_file(geo_filename)
	# Set None as NaN
	df_osm_data.fillna(value=np.nan, inplace=True)
	# Replace empty string (Json NULL sometimes read as '') for NaN
	df_osm_data.replace('', np.nan, inplace=True)
	
	def list_int_from_string(x): # List of integers given input in string format
		return [ int(id_) for id_ in x.split(",") ]
	def list_str_from_string(x): # List of strings given input in string format
		return x.split(",")

	# Recover list
	if ( "activity_category" in df_osm_data.columns): 
		df_osm_data[ "activity_category" ] = df_osm_data.activity_category.apply(lambda x: list_str_from_string(x) if pd.notnull(x) else np.nan )
	if ( "containing_parts" in df_osm_data.columns): 
		df_osm_data[ "containing_parts" ] = df_osm_data.containing_parts.apply( lambda x: list_int_from_string(x) if pd.notnull(x) else np.nan )
	if ( "containing_poi" in df_osm_data.columns): 
		df_osm_data[ "containing_poi" ] = df_osm_data.containing_poi.apply( lambda x: list_int_from_string(x) if pd.notnull(x) else np.nan )
	
	# To UTM coordinates
	return ox.project_gdf( df_osm_data ) 
Example 16
Project: recordlinkage   Author: J535D165   File: base.py    License: BSD 3-Clause "New" or "Revised" License
def _make_index_names(self, name1, name2):

        if pandas.notnull(name1) and pandas.notnull(name2) and \
                (name1 == name2):
            return ["{}{}".format(name1, self.suffixes[0]),
                    "{}{}".format(name1, self.suffixes[1])]
        else:
            return [name1, name2] 
Example 17
Project: RecNet   Author: hobincar   File: MSVD.py    License: MIT License
def load_captions(self):
        df = pd.read_csv(self.caption_fpath)
        df = df[df['Language'] == 'English']
        df = df[pd.notnull(df['Description'])]
        captions = df['Description'].values
        return captions 
Example 18
Project: RecNet   Author: hobincar   File: MSVD.py    License: MIT License
def load_captions(self):
        df = pd.read_csv(self.caption_fpath)
        df = df[df['Language'] == 'English']
        df = df[[ 'VideoID', 'Start', 'End', 'Description' ]]
        df = df[pd.notnull(df['Description'])]

        for video_id, start, end, caption in df.values:
            vid = "{}_{}_{}".format(video_id, start, end)
            self.captions[vid].append(caption) 
Example 19
Project: RecNet   Author: hobincar   File: MSVD.py    License: MIT License
def load_metadata():
    df = pd.read_csv(C.caption_fpath)
    df = df[df['Language'] == 'English']
    df = df[pd.notnull(df['Description'])]
    df = df.reset_index(drop=True)
    return df 
Example 20
Project: vnpy_crypto   Author: birforce   File: test_mice.py    License: MIT License
def test_default(self):
        # Test with all defaults.

        df = gendat()
        orig = df.copy()
        mx = pd.notnull(df)
        imp_data = mice.MICEData(df)
        nrow, ncol = df.shape

        assert_allclose(imp_data.ix_miss['x1'], np.arange(60))
        assert_allclose(imp_data.ix_obs['x1'], np.arange(60, 200))
        assert_allclose(imp_data.ix_miss['x2'], np.arange(40))
        assert_allclose(imp_data.ix_miss['x3'], np.arange(10, 30, 2))
        assert_allclose(imp_data.ix_obs['x3'],
                        np.concatenate((np.arange(10),
                                        np.arange(11, 30, 2),
                                        np.arange(30, 200))))

        for k in range(3):
            imp_data.update_all()
            assert_equal(imp_data.data.shape[0], nrow)
            assert_equal(imp_data.data.shape[1], ncol)
            assert_allclose(orig[mx], imp_data.data[mx])

        fml = 'x1 ~ x2 + x3 + x4 + x5 + y'
        assert_equal(imp_data.conditional_formula['x1'], fml)

        assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1'])

        # Should make a copy
        assert(not (df is imp_data.data))

        (endog_obs, exog_obs, exog_miss,
         predict_obs_kwds, predict_miss_kwds) = imp_data.get_split_data('x3')
        assert_equal(len(endog_obs), 190)
        assert_equal(exog_obs.shape, [190, 6])
        assert_equal(exog_miss.shape, [10, 6]) 
Example 21
Project: vnpy_crypto   Author: birforce   File: test_mice.py    License: MIT License
def test_set_imputer(self):
        # Test with specified perturbation method.

        from statsmodels.regression.linear_model import RegressionResultsWrapper
        from statsmodels.genmod.generalized_linear_model import GLMResultsWrapper

        df = gendat()
        orig = df.copy()
        mx = pd.notnull(df)
        nrow, ncol = df.shape

        imp_data = mice.MICEData(df)
        imp_data.set_imputer('x1', 'x3 + x4 + x3*x4')
        imp_data.set_imputer('x2', 'x4 + I(x5**2)')
        imp_data.set_imputer('x3', model_class=sm.GLM,
                             init_kwds={"family": sm.families.Binomial()})

        imp_data.update_all()
        assert_equal(imp_data.data.shape[0], nrow)
        assert_equal(imp_data.data.shape[1], ncol)
        assert_allclose(orig[mx], imp_data.data[mx])
        for j in range(1, 6):
            if j == 3:
                assert_equal(isinstance(imp_data.models['x3'], sm.GLM), True)
                assert_equal(isinstance(imp_data.models['x3'].family, sm.families.Binomial), True)
                assert_equal(isinstance(imp_data.results['x3'], GLMResultsWrapper), True)
            else:
                assert_equal(isinstance(imp_data.models['x%d' % j], sm.OLS), True)
                assert_equal(isinstance(imp_data.results['x%d' % j], RegressionResultsWrapper), True)

        fml = 'x1 ~ x3 + x4 + x3*x4'
        assert_equal(imp_data.conditional_formula['x1'], fml)

        fml = 'x4 ~ x1 + x2 + x3 + x5 + y'
        assert_equal(imp_data.conditional_formula['x4'], fml)

        assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1']) 
Example 22
Project: vnpy_crypto   Author: birforce   File: mixed_linear_model.py    License: MIT License
def _handle_missing(data, groups, formula, re_formula, vc_formula):

    tokens = set([])

    forms = [formula]
    if re_formula is not None:
        forms.append(re_formula)
    if vc_formula is not None:
        forms.extend(vc_formula.values())

    import tokenize
    from statsmodels.compat import PY3
    from statsmodels.compat.python import StringIO, asunicode
    skiptoks = {"(", ")", "*", ":", "+", "-", "**", "/"}

    for fml in forms:
        # Unicode conversion is for Py2 compatibility
        rl = StringIO(fml)

        def rlu():
            line = rl.readline()
            return asunicode(line, 'ascii')
        g = tokenize.generate_tokens(rlu)
        for tok in g:
            if tok not in skiptoks:
                if PY3:
                    tokens.add(tok.string)
                else:
                    tokens.add(tok[1])
    tokens = list(tokens & set(data.columns))
    tokens.sort()

    data = data[tokens]
    ii = pd.notnull(data).all(1)
    if type(groups) != "str":
        ii &= pd.notnull(groups)

    return data.loc[ii, :], groups[np.asarray(ii)] 
Example 23
Project: xlsxpandasformatter   Author: webermarcolivier   File: xlsxpandasformatter.py    License: MIT License
def format_background_colormap(self, col, colormap, vmin, vmax):

        iCol, worksheetCol = self.convert_to_col_index(col)

        for index, row in self.df.iterrows():
            x = row.iloc[iCol]
            if pd.notnull(x):
                colorHex = convert_colormap_to_hex(colormap, x, vmin=vmin, vmax=vmax)
                rowIndex = self.df.index.get_loc(index)
                self.formatTable[rowIndex][iCol]['bg_color'] = colorHex 
Example 24
Project: meterstick   Author: google   File: core.py    License: Apache License 2.0
def _merge_metrics(row):
  non_empty = [str(r) for r in row if pd.notnull(r)]
  if not non_empty:
    return None
  elif len(non_empty) == 1:
    return non_empty[0]
  else:
    return "::".join(non_empty)


# TODO(dlsun): Remove AnalysisParameters and incorporate the
#   attributes directly into the Analyze object. 
Example 25
Project: Computable   Author: ktraunmueller   File: test_resample.py    License: MIT License
def test_custom_grouper(self):

        dti = DatetimeIndex(freq='Min', start=datetime(2005, 1, 1),
                            end=datetime(2005, 1, 10))

        s = Series(np.array([1] * len(dti)), index=dti, dtype='int64')

        b = TimeGrouper(Minute(5))
        g = s.groupby(b)

        # check all cython functions work
        funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
        for f in funcs:
            g._cython_agg_general(f)

        b = TimeGrouper(Minute(5), closed='right', label='right')
        g = s.groupby(b)
        # check all cython functions work
        funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
        for f in funcs:
            g._cython_agg_general(f)

        self.assertEquals(g.ngroups, 2593)
        self.assert_(notnull(g.mean()).all())

        # construct expected val
        arr = [1] + [5] * 2592
        idx = dti[0:-1:5]
        idx = idx.append(dti[-1:])
        expect = Series(arr, index=idx)

        # GH2763 - return input dtype if we can
        result = g.agg(np.sum)
        assert_series_equal(result, expect)

        df = DataFrame(np.random.rand(len(dti), 10), index=dti, dtype='float64')
        r = df.groupby(b).agg(np.sum)

        self.assertEquals(len(r.columns), 10)
        self.assertEquals(len(r.index), 2593) 
Example 26
Project: Computable   Author: ktraunmueller   File: test_moments.py    License: MIT License
def _check_expanding_ndarray(self, func, static_comp, has_min_periods=True,
                                 has_time_rule=True, preserve_nan=True):
        result = func(self.arr)

        assert_almost_equal(result[10],
                            static_comp(self.arr[:11]))

        if preserve_nan:
            assert(np.isnan(result[self._nan_locs]).all())

        arr = randn(50)

        if has_min_periods:
            result = func(arr, min_periods=30)
            assert(np.isnan(result[:29]).all())
            assert_almost_equal(result[-1], static_comp(arr[:50]))

            # min_periods is working correctly
            result = func(arr, min_periods=15)
            self.assert_(np.isnan(result[13]))
            self.assert_(not np.isnan(result[14]))

            arr2 = randn(20)
            result = func(arr2, min_periods=5)
            self.assert_(isnull(result[3]))
            self.assert_(notnull(result[4]))

            # min_periods=0
            result0 = func(arr, min_periods=0)
            result1 = func(arr, min_periods=1)
            assert_almost_equal(result0, result1)
        else:
            result = func(arr)
            assert_almost_equal(result[-1], static_comp(arr[:50])) 
Example 27
Project: Computable   Author: ktraunmueller   File: test_ols.py    License: MIT License
def test_longpanel_series_combo(self):
        wp = tm.makePanel()
        lp = wp.to_frame()

        y = lp.pop('ItemA')
        model = ols(y=y, x=lp, entity_effects=True, window=20)
        self.assert_(notnull(model.beta.values).all())
        tm.assert_isinstance(model, PanelOLS)
        model.summary 
Example 28
Project: isitfit   Author: autofitcloud   File: ec2_analyze.py    License: Apache License 2.0
def after_all(self, context_all):
    # add col for utilization in percentage
    def calc_usedPct(row):
      if row.capacity_usd==0: return 0
      o = row.used_usd / row.capacity_usd * 100
      return int(o)

    self.df_bins['used_pct'] = self.df_bins.apply(calc_usedPct, axis=1)

    # add column for regions as string
    self.df_bins['regions_str'] = self.df_bins['regions_set'].apply(lambda x: "0" if len(x)==0 else "%i (%s)"%(len(x), l2s(x)))

    # cases where dt_start > dt_end are those where there was no data and the initialization remained
    # so overwrite with na
    # Update 2019-12-11 Now that the df_bins timestamps are set with resample and dt_end is inclusive,
    # instead of setting to na, just swap the start/end fake timestamps which represent the end/start of the periods
    #import numpy as np
    #self.df_bins['dt_start'] = self.df_bins.apply(lambda row: np.nan if row.count_analyzed==0 else row.dt_start, axis=1)
    #self.df_bins['dt_end']   = self.df_bins.apply(lambda row: np.nan if row.count_analyzed==0 else row.dt_end  , axis=1)
    self.df_bins['dt_start_bkp']  = self.df_bins['dt_start']
    self.df_bins['dt_start'] = self.df_bins.apply(lambda row: row.dt_end       if row.count_analyzed==0 else row.dt_start, axis=1)
    self.df_bins['dt_end']   = self.df_bins.apply(lambda row: row.dt_start_bkp if row.count_analyzed==0 else row.dt_end  , axis=1)
    del self.df_bins['dt_start_bkp']

    # convert the dt_{start,end} back to dates again, given the nans
    for fx in ['dt_start', 'dt_end']: self.df_bins[fx] = pd.to_datetime(self.df_bins[fx])

    # Bugfix for cloudwatch:
    # When 90>=ndays>=64, cloudwatchman/metric.get_statistics returns data with max Timestamp on mainmanager.EndTime
    # Otherwise, the data max Timestamp is EndTime - 1 day
    # Here, get around this problem by incrementing by 1 day (or just set to mainmanager.EndTime)
    # This was tested on 2019-12-10 10:00 am UTC, and the last date was Dec 9 for ndays<64 and Dec 10 for ndays>=64
    if pd.notnull(self.df_bins['dt_end'].iloc[-1]):
      dt_max = context_all['mainManager'].EndTime.date()
      import datetime as dt
      dt_lastp1 = self.df_bins.dt_end.iloc[-1].date() + dt.timedelta(days=1)
      self.df_bins.iloc[-1, self.df_bins.columns=='dt_end'] = min(dt_max, dt_lastp1)

    # inject result for reporter access
    context_all['df_bins'] = self.df_bins
    return context_all 
Example 29
Project: lost   Author: l3p-cv   File: label.py    License: MIT License
def import_df(self, df):
        '''Import LabelTree from DataFrame
        
        Args:
            df (pandas.DataFrame): LabelTree in DataFrame style.

        Returns:
            :class:`lost.db.model.LabelLeaf` or None: 
                The created root leaf or None if a root leaf with same
                name is already present in database.
        '''
        df = df.where((pd.notnull(df)), None)
        root = df[df['parent_leaf_id'].isnull()]
        no_root = df[~df['parent_leaf_id'].isnull()]
        childs = {}

        if len(root) != 1:
            raise ValueError('''Can not import. There needs 
                to be exactly one root leaf for that tree! 
                Found: \n{}'''.format(root))
        else:
            try:
                root_leaf = self.create_root(root['name'].values[0])
                if root_leaf is None:
                    return None #A tree with the same name already exists.
                self._df_row_to_leaf(root.loc[0], root_leaf)

                #Create child dict
                for index, row in no_root.iterrows():
                    if not row['parent_leaf_id'] in childs:
                        childs[row['parent_leaf_id']] = []
                    childs[row['parent_leaf_id']].append(row)
                
                self.__create_childs_from_df(childs, root_leaf, root.loc[0])
                self.dbm.commit()
                return root_leaf
            except KeyError:
                self.logger.error('''At least the following columns 
                    need to be provided: *idx*, *name*, *parent_leaf_id*''')
                raise 
Example 30
Project: data-cleaner   Author: datosgobar   File: data_cleaner.py    License: MIT License
def _split(value, separators):
        values = []
        for separator in separators:
            if separator in str(value):
                values = [str(split_value) for split_value in
                          value.split(separator)]
                break

        return pd.Series([str(value).strip() for value in values
                          if pd.notnull(value)])