Python pandas.notnull() Examples
The following are 30 code examples of pandas.notnull(), taken from open-source projects.
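Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of what pandas.notnull() returns and how its boolean mask is typically used:

import numpy as np
import pandas as pd

# pandas.notnull (alias of pandas.notna) returns an element-wise boolean mask:
# True where a value is present, False for NaN/None/NaT.
s = pd.Series([1.0, np.nan, 3.0, None])
mask = pd.notnull(s)
print(mask.tolist())     # [True, False, True, False]

# The mask is commonly used to keep only the non-missing rows,
# as most of the examples below do.
print(s[mask].tolist())  # [1.0, 3.0]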

Example #1
Source File: cbc_hb.py From lifestyles with MIT License | 6 votes |
def _create_observation_variable(individual_selections, choices, partsworth):
    """
    This function handles creating the PyMC3 observation variables. It also gracefully handles
    missing observations in individual selections.

    `individual_selections` is a Series of the individuals selections made, starting from 0.
    It can contain NaNs which represent answer was not provided.

    `choices` is a DataFrame with a hierarchical index: level=0 enumerates the choices, and
    level=1 displays the profile at a specific choice. It's size is (n_questions, n_choices_per_question).

    `partsworth` is a slice of PyMC3 matrix. It represents the partsworth variables of a individual.
    Size is (n_profiles,)

    This computes the values exp(partsworth * profile_j) / sum[ exp(partsworth * profile_k ] for all j.
    """
    nan_mask = pd.notnull(individual_selections)
    return pm.Categorical("Obs_%s" % individual_selections.name,
                          tt.nnet.softmax(tt.stack([
                              tt.dot(choice.values, partsworth)
                              for _, choice in choices[nan_mask.values].groupby(axis=1, level=0)
                          ], axis=0).T),
                          observed=individual_selections[nan_mask.values].values)
Example #2
Source File: utils.py From urbansprawl with MIT License | 6 votes |
def load_geodataframe(geo_filename):
    """ Load input GeoDataFrame

    Parameters
    ----------
    geo_filename : string
        input GeoDataFrame filename

    Returns
    ----------
    geopandas.GeoDataFrame
        loaded data
    """
    # Load using geopandas
    df_osm_data = gpd.read_file(geo_filename)
    # Set None as NaN
    df_osm_data.fillna(value=np.nan, inplace=True)
    # Replace empty string (Json NULL sometimes read as '') for NaN
    df_osm_data.replace('', np.nan, inplace=True)

    def list_int_from_string(x):
        # List of integers given input in string format
        return [int(id_) for id_ in x.split(",")]

    def list_str_from_string(x):
        # List of strings given input in string format
        return x.split(",")

    # Recover list
    if ("activity_category" in df_osm_data.columns):
        df_osm_data["activity_category"] = df_osm_data.activity_category.apply(lambda x: list_str_from_string(x) if pd.notnull(x) else np.nan)
    if ("containing_parts" in df_osm_data.columns):
        df_osm_data["containing_parts"] = df_osm_data.containing_parts.apply(lambda x: list_int_from_string(x) if pd.notnull(x) else np.nan)
    if ("containing_poi" in df_osm_data.columns):
        df_osm_data["containing_poi"] = df_osm_data.containing_poi.apply(lambda x: list_int_from_string(x) if pd.notnull(x) else np.nan)

    # To UTM coordinates
    return ox.project_gdf(df_osm_data)
Example #3
Source File: utils.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 6 votes |
def fillna(series_or_arr, missing_value=0.0):
    """Fill missing values in pandas objects and numpy arrays.

    Arguments
    ---------
    series_or_arr : pandas.Series, numpy.ndarray
        The numpy array or pandas series for which the missing values
        need to be replaced.
    missing_value : float, int, str
        The value to replace the missing value with. Default 0.0.

    Returns
    -------
    pandas.Series, numpy.ndarray
        The numpy array or pandas series with the missing values filled.
    """
    if pandas.notnull(missing_value):
        if isinstance(series_or_arr, (numpy.ndarray)):
            series_or_arr[numpy.isnan(series_or_arr)] = missing_value
        else:
            series_or_arr.fillna(missing_value, inplace=True)

    return series_or_arr
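For orientation only, a small usage sketch of the fillna helper defined above; the calls below are illustrative and do not appear in the recordlinkage sources:

import numpy
import pandas

# Illustrative calls to the fillna helper above. Note that both branches
# modify their input in place and also return it.
s = pandas.Series([1.0, numpy.nan, 3.0])
print(fillna(s, missing_value=0.0).tolist())   # [1.0, 0.0, 3.0]

arr = numpy.array([1.0, numpy.nan, 3.0])
print(fillna(arr, missing_value=-1.0))         # [ 1. -1.  3.]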
Example #4
Source File: datasets.py From deepchem with MIT License | 6 votes |
def load_metadata(self):
    try:
        tasks_filename, metadata_filename = self._get_metadata_filename()
        with open(tasks_filename) as fin:
            tasks = json.load(fin)
        metadata_df = pd.read_csv(metadata_filename, compression='gzip')
        metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
        return tasks, metadata_df
    except Exception as e:
        pass

    # Load obsolete format -> save in new format
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
        tasks, metadata_df = load_from_disk(metadata_filename)
        del metadata_df['task_names']
        del metadata_df['basename']
        save_metadata(tasks, metadata_df, self.data_dir)
        return tasks, metadata_df

    raise ValueError("No Metadata Found On Disk")
Example #5
Source File: plotter.py From pygraphistry with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _make_json_dataset(self, edges, nodes, name):
    (elist, nlist) = self._bind_attributes_v1(edges, nodes)
    edict = elist.where((pandas.notnull(elist)), None).to_dict(orient='records')

    bindings = {'idField': self._node or Plotter._defaultNodeId,
                'destinationField': self._destination,
                'sourceField': self._source}
    dataset = {'name': PyGraphistry._config['dataset_prefix'] + name,
               'bindings': bindings, 'type': 'edgelist', 'graph': edict}

    if nlist is not None:
        ndict = nlist.where((pandas.notnull(nlist)), None).to_dict(orient='records')
        dataset['labels'] = ndict
    return dataset


# Main helper for creating ETL2 payload
Example #6
Source File: vgraph.py From pygraphistry with BSD 3-Clause "New" or "Revised" License | 6 votes |
def objectEncoder(vg, series, dtype):
    series.where(pandas.notnull(series), '\0', inplace=True)
    # vec is a string[] submessage within a repeated
    vec = vg.string_vectors.add()
    str_series = None
    try:
        str_series = series.astype('unicode')
    except UnicodeDecodeError:
        warnings.warn("Warning: escaping unicode")
        str_series = series.apply(lambda v: v.decode('utf-8'))
    for val in str_series:
        vec.values.append(val)
    return (vec, {'ctype': 'utf8'})


# NaN (as well as Infinity and undefined) are valid JSON. Use this guard to filter
# them out when creating the json metadata.
Example #7
Source File: datasets.py From PADME with MIT License | 6 votes |
def load_metadata(self):
    try:
        tasks_filename, metadata_filename = self._get_metadata_filename()
        with open(tasks_filename) as fin:
            tasks = json.load(fin)
        metadata_df = pd.read_csv(metadata_filename, compression='gzip')
        metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
        return tasks, metadata_df
    except Exception as e:
        pass

    # Load obsolete format -> save in new format
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
        tasks, metadata_df = load_from_disk(metadata_filename)
        del metadata_df['task_names']
        del metadata_df['basename']
        save_metadata(tasks, metadata_df, self.data_dir)
        return tasks, metadata_df

    raise ValueError("No Metadata Found On Disk")
Example #8
Source File: test_mice.py From vnpy_crypto with MIT License | 6 votes |
def test_pertmeth(self):
    # Test with specified perturbation method.

    df = gendat()
    orig = df.copy()
    mx = pd.notnull(df)
    nrow, ncol = df.shape

    for pert_meth in "gaussian", "boot":
        imp_data = mice.MICEData(df, perturbation_method=pert_meth)
        for k in range(2):
            imp_data.update_all()
            assert_equal(imp_data.data.shape[0], nrow)
            assert_equal(imp_data.data.shape[1], ncol)
            assert_allclose(orig[mx], imp_data.data[mx])

    assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1'])
Example #9
Source File: uniprot.py From ssbio with MIT License | 6 votes |
def uniprot_reviewed_checker(uniprot_id):
    """Check if a single UniProt ID is reviewed or not.

    Args:
        uniprot_id:

    Returns:
        bool: If the entry is reviewed
    """
    query_string = 'id:' + uniprot_id

    uni_rev_raw = StringIO(bsup.search(query_string, columns='id,reviewed', frmt='tab'))
    uni_rev_df = pd.read_table(uni_rev_raw, sep='\t', index_col=0)
    uni_rev_df = uni_rev_df.fillna(False)
    uni_rev_df = uni_rev_df[pd.notnull(uni_rev_df.Status)]
    uni_rev_df = uni_rev_df.replace(to_replace="reviewed", value=True)
    uni_rev_df = uni_rev_df.replace(to_replace="unreviewed", value=False)
    uni_rev_dict_adder = uni_rev_df.to_dict()['Status']

    return uni_rev_dict_adder[uniprot_id]
Example #10
Source File: quality.py From ssbio with MIT License | 6 votes |
def parse_psqs(psqs_results_file):
    """Parse a PSQS result file and returns a Pandas DataFrame of the results

    Args:
        psqs_results_file: Path to psqs results file

    Returns:
        Pandas DataFrame: Summary of PSQS results
    """
    # TODO: generalize column names for all results, save as dict instead
    psqs_results = pd.read_csv(psqs_results_file, sep='\t', header=None)
    psqs_results['pdb_file'] = psqs_results[0].apply(lambda x: str(x).strip('./').strip('.pdb'))
    psqs_results = psqs_results.rename(columns={1: 'psqs_local', 2: 'psqs_burial', 3: 'psqs_contact', 4: 'psqs_total'}).drop(0, axis=1)
    psqs_results['u_pdb'] = psqs_results['pdb_file'].apply(lambda x: x.upper() if len(x) == 4 else np.nan)
    psqs_results['i_entry_name'] = psqs_results['pdb_file'].apply(lambda x: x.split('_model1')[0] if len(x) > 4 else np.nan)
    psqs_results = psqs_results[pd.notnull(psqs_results.psqs_total)]

    return psqs_results
Example #11
Source File: movie_data.py From parade with MIT License | 6 votes |
def execute_internal(self, context, **kwargs):
    """
    the internal execution process to be implemented
    :param context:
    :param kwargs:
    :return:
    """
    df = pd.read_csv('https://raw.githubusercontent.com/bailaohe/parade/master/assets/movie_metadata.csv')

    # Process projection on the dataset to get our interested attributes
    df = df[['movie_title', 'genres', 'title_year', 'content_rating', 'budget', 'num_voted_users', 'imdb_score']]

    # Filter out records with *NAN* title_year and budget
    df = df[pd.notnull(df['title_year'])]
    df = df[df['budget'] > 0]

    # Extract the genres ROOT
    df['genres_root'] = df['genres'].apply(lambda g: g.split('|')[0])

    return df
Example #12
Source File: test_logic.py From ontask_b with MIT License | 6 votes |
def test_df_equivalent_after_sql(self):
    # Parse the CSV
    df_source = services.load_df_from_csvfile(
        io.StringIO(self.csv1),
        0,
        0)

    # Store the DF in the DB
    pandas.store_table(df_source, self.table_name)

    # Load it from the DB
    df_dst = pandas.load_table(self.table_name)

    # NaN in boolean columns are now None
    df_source['bool1'] = df_source['bool1'].where(
        pd.notnull(df_source['bool1']),
        None)
    df_source['bool2'] = df_source['bool2'].where(
        pd.notnull(df_source['bool2']),
        None)

    # Data frames mut be identical
    assert df_source.equals(df_dst)
Example #13
Source File: dataframe_utils.py From fileflow with Apache License 2.0 | 6 votes |
def clean_and_write_dataframe_to_csv(data, filename):
    """
    Cleans a dataframe of np.NaNs and saves to file via pandas.to_csv

    :param data: data to write to CSV
    :type data: :class:`pandas.DataFrame`
    :param filename: Path to file to write CSV to. if None, string of data
        will be returned
    :type filename: str | None
    :return: If the filename is None, returns the string of data. Otherwise
        returns None.
    :rtype: str | None
    """
    # cleans np.NaN values
    data = data.where((pd.notnull(data)), None)
    # If filename=None, to_csv will return a string
    result = data.to_csv(path_or_buf=filename, encoding='utf-8', dtype=str, index=False,
                         na_rep=None, skipinitialspace=True, quoting=csv.QUOTE_ALL)
    logging.info("Dataframe of shape %s has been stored." % str(data.shape))

    return result
Example #14
Source File: Trajectory.py From TrajLib with Apache License 2.0 | 6 votes |
def pre_processing(self, labels):
    # removing NaN in lat and lon
    self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lat), :]
    self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lon), :]
    for label in labels:
        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data[label]), :]
    """
    lat_= self.raw_data.lat.rolling(3, min_periods=1).median()
    self.raw_data.assign(lat=lat_)
    lon_ = self.raw_data.lon.rolling(3, min_periods=1).median()
    self.raw_data.assign(lot=lon_)
    self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lat), :]
    self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lon), :]
    """
    return None
Example #15
Source File: generate_avro_file.py From tfx with Apache License 2.0 | 6 votes |
def generate_avro(src_file: Text, output_file: Text):
    """Generates avro file based on src file.

    Args:
        src_file: path to Chicago taxi dataset.
        output_file: output path for avro file.
    """
    df = pd.read_csv(src_file)
    # Replaces NaN's with None's for avroWriter to interpret null values
    df = df.where((pd.notnull(df)), None)
    records = df.to_dict(orient='records')

    parsed_schema = fastavro.parse_schema(get_schema())
    with open(output_file, 'wb') as f:
        fastavro.writer(f, parsed_schema, records)
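Several snippets on this page (the deepchem, PADME, pygraphistry, fileflow, and tfx examples) rely on the same df.where(pd.notnull(df), None) idiom to turn NaN into Python None before serialization. Here is a minimal standalone sketch of that idiom, with the output as produced by recent pandas versions:

import numpy as np
import pandas as pd

# where(cond, other) keeps values where cond is True and substitutes `other`
# elsewhere, so this replaces every missing value with Python's None --
# useful before handing records to JSON, Avro, or database writers.
df = pd.DataFrame({"a": [1.0, np.nan], "b": ["x", None]})
cleaned = df.where(pd.notnull(df), None)
print(cleaned.to_dict(orient="records"))
# [{'a': 1.0, 'b': 'x'}, {'a': None, 'b': None}]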
Example #16
Source File: base.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _make_index_names(self, name1, name2):

    if pandas.notnull(name1) and pandas.notnull(name2) and \
            (name1 == name2):
        return ["{}{}".format(name1, self.suffixes[0]),
                "{}{}".format(name1, self.suffixes[1])]
    else:
        return [name1, name2]
Example #17
Source File: MSVD.py From RecNet with MIT License | 5 votes |
def load_captions(self):
    df = pd.read_csv(self.caption_fpath)
    df = df[df['Language'] == 'English']
    df = df[pd.notnull(df['Description'])]
    captions = df['Description'].values
    return captions
Example #18
Source File: MSVD.py From RecNet with MIT License | 5 votes |
def load_captions(self):
    df = pd.read_csv(self.caption_fpath)
    df = df[df['Language'] == 'English']
    df = df[['VideoID', 'Start', 'End', 'Description']]
    df = df[pd.notnull(df['Description'])]

    for video_id, start, end, caption in df.values:
        vid = "{}_{}_{}".format(video_id, start, end)
        self.captions[vid].append(caption)
Example #19
Source File: MSVD.py From RecNet with MIT License | 5 votes |
def load_metadata():
    df = pd.read_csv(C.caption_fpath)
    df = df[df['Language'] == 'English']
    df = df[pd.notnull(df['Description'])]
    df = df.reset_index(drop=True)
    return df
Example #20
Source File: test_mice.py From vnpy_crypto with MIT License | 5 votes |
def test_default(self):
    # Test with all defaults.

    df = gendat()
    orig = df.copy()
    mx = pd.notnull(df)
    imp_data = mice.MICEData(df)
    nrow, ncol = df.shape

    assert_allclose(imp_data.ix_miss['x1'], np.arange(60))
    assert_allclose(imp_data.ix_obs['x1'], np.arange(60, 200))
    assert_allclose(imp_data.ix_miss['x2'], np.arange(40))
    assert_allclose(imp_data.ix_miss['x3'], np.arange(10, 30, 2))
    assert_allclose(imp_data.ix_obs['x3'],
                    np.concatenate((np.arange(10), np.arange(11, 30, 2), np.arange(30, 200))))

    for k in range(3):
        imp_data.update_all()
        assert_equal(imp_data.data.shape[0], nrow)
        assert_equal(imp_data.data.shape[1], ncol)
        assert_allclose(orig[mx], imp_data.data[mx])

    fml = 'x1 ~ x2 + x3 + x4 + x5 + y'
    assert_equal(imp_data.conditional_formula['x1'], fml)

    assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1'])

    # Should make a copy
    assert(not (df is imp_data.data))

    (endog_obs, exog_obs, exog_miss,
     predict_obs_kwds, predict_miss_kwds) = imp_data.get_split_data('x3')
    assert_equal(len(endog_obs), 190)
    assert_equal(exog_obs.shape, [190, 6])
    assert_equal(exog_miss.shape, [10, 6])
Example #21
Source File: test_mice.py From vnpy_crypto with MIT License | 5 votes |
def test_set_imputer(self):
    # Test with specified perturbation method.
    from statsmodels.regression.linear_model import RegressionResultsWrapper
    from statsmodels.genmod.generalized_linear_model import GLMResultsWrapper

    df = gendat()
    orig = df.copy()
    mx = pd.notnull(df)
    nrow, ncol = df.shape

    imp_data = mice.MICEData(df)
    imp_data.set_imputer('x1', 'x3 + x4 + x3*x4')
    imp_data.set_imputer('x2', 'x4 + I(x5**2)')
    imp_data.set_imputer('x3', model_class=sm.GLM,
                         init_kwds={"family": sm.families.Binomial()})

    imp_data.update_all()
    assert_equal(imp_data.data.shape[0], nrow)
    assert_equal(imp_data.data.shape[1], ncol)
    assert_allclose(orig[mx], imp_data.data[mx])

    for j in range(1, 6):
        if j == 3:
            assert_equal(isinstance(imp_data.models['x3'], sm.GLM), True)
            assert_equal(isinstance(imp_data.models['x3'].family, sm.families.Binomial), True)
            assert_equal(isinstance(imp_data.results['x3'], GLMResultsWrapper), True)
        else:
            assert_equal(isinstance(imp_data.models['x%d' % j], sm.OLS), True)
            assert_equal(isinstance(imp_data.results['x%d' % j], RegressionResultsWrapper), True)

    fml = 'x1 ~ x3 + x4 + x3*x4'
    assert_equal(imp_data.conditional_formula['x1'], fml)

    fml = 'x4 ~ x1 + x2 + x3 + x5 + y'
    assert_equal(imp_data.conditional_formula['x4'], fml)

    assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1'])
Example #22
Source File: mixed_linear_model.py From vnpy_crypto with MIT License | 5 votes |
def _handle_missing(data, groups, formula, re_formula, vc_formula):

    tokens = set([])

    forms = [formula]
    if re_formula is not None:
        forms.append(re_formula)
    if vc_formula is not None:
        forms.extend(vc_formula.values())

    import tokenize
    from statsmodels.compat import PY3
    from statsmodels.compat.python import StringIO, asunicode

    skiptoks = {"(", ")", "*", ":", "+", "-", "**", "/"}

    for fml in forms:
        # Unicode conversion is for Py2 compatability
        rl = StringIO(fml)

        def rlu():
            line = rl.readline()
            return asunicode(line, 'ascii')

        g = tokenize.generate_tokens(rlu)
        for tok in g:
            if tok not in skiptoks:
                if PY3:
                    tokens.add(tok.string)
                else:
                    tokens.add(tok[1])

    tokens = list(tokens & set(data.columns))
    tokens.sort()

    data = data[tokens]

    ii = pd.notnull(data).all(1)
    if type(groups) != "str":
        ii &= pd.notnull(groups)

    return data.loc[ii, :], groups[np.asarray(ii)]
Example #23
Source File: xlsxpandasformatter.py From xlsxpandasformatter with MIT License | 5 votes |
def format_background_colormap(self, col, colormap, vmin, vmax):
    iCol, worksheetCol = self.convert_to_col_index(col)
    for index, row in self.df.iterrows():
        x = row.iloc[iCol]
        if pd.notnull(x):
            colorHex = convert_colormap_to_hex(colormap, x, vmin=vmin, vmax=vmax)
            rowIndex = self.df.index.get_loc(index)
            self.formatTable[rowIndex][iCol]['bg_color'] = colorHex
Example #24
Source File: core.py From meterstick with Apache License 2.0 | 5 votes |
def _merge_metrics(row):
    non_empty = [str(r) for r in row if pd.notnull(r)]
    if not non_empty:
        return None
    elif len(non_empty) == 1:
        return non_empty[0]
    else:
        return "::".join(non_empty)


# TODO(dlsun): Remove AnalysisParameters and incorporate the
# attributes directly into the Analyze object.
Example #25
Source File: test_resample.py From Computable with MIT License | 5 votes |
def test_custom_grouper(self):

    dti = DatetimeIndex(freq='Min', start=datetime(2005, 1, 1),
                        end=datetime(2005, 1, 10))

    s = Series(np.array([1] * len(dti)), index=dti, dtype='int64')

    b = TimeGrouper(Minute(5))
    g = s.groupby(b)

    # check all cython functions work
    funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
    for f in funcs:
        g._cython_agg_general(f)

    b = TimeGrouper(Minute(5), closed='right', label='right')
    g = s.groupby(b)
    # check all cython functions work
    funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
    for f in funcs:
        g._cython_agg_general(f)

    self.assertEquals(g.ngroups, 2593)
    self.assert_(notnull(g.mean()).all())

    # construct expected val
    arr = [1] + [5] * 2592
    idx = dti[0:-1:5]
    idx = idx.append(dti[-1:])
    expect = Series(arr, index=idx)

    # GH2763 - return in put dtype if we can
    result = g.agg(np.sum)
    assert_series_equal(result, expect)

    df = DataFrame(np.random.rand(len(dti), 10),
                   index=dti, dtype='float64')
    r = df.groupby(b).agg(np.sum)

    self.assertEquals(len(r.columns), 10)
    self.assertEquals(len(r.index), 2593)
Example #26
Source File: test_moments.py From Computable with MIT License | 5 votes |
def _check_expanding_ndarray(self, func, static_comp, has_min_periods=True,
                             has_time_rule=True, preserve_nan=True):
    result = func(self.arr)

    assert_almost_equal(result[10], static_comp(self.arr[:11]))

    if preserve_nan:
        assert(np.isnan(result[self._nan_locs]).all())

    arr = randn(50)

    if has_min_periods:
        result = func(arr, min_periods=30)
        assert(np.isnan(result[:29]).all())
        assert_almost_equal(result[-1], static_comp(arr[:50]))

        # min_periods is working correctly
        result = func(arr, min_periods=15)
        self.assert_(np.isnan(result[13]))
        self.assert_(not np.isnan(result[14]))

        arr2 = randn(20)
        result = func(arr2, min_periods=5)
        self.assert_(isnull(result[3]))
        self.assert_(notnull(result[4]))

        # min_periods=0
        result0 = func(arr, min_periods=0)
        result1 = func(arr, min_periods=1)
        assert_almost_equal(result0, result1)
    else:
        result = func(arr)
        assert_almost_equal(result[-1], static_comp(arr[:50]))
Example #27
Source File: test_ols.py From Computable with MIT License | 5 votes |
def test_longpanel_series_combo(self):
    wp = tm.makePanel()
    lp = wp.to_frame()

    y = lp.pop('ItemA')
    model = ols(y=y, x=lp, entity_effects=True, window=20)
    self.assert_(notnull(model.beta.values).all())
    tm.assert_isinstance(model, PanelOLS)
    model.summary
Example #28
Source File: ec2_analyze.py From isitfit with Apache License 2.0 | 5 votes |
def after_all(self, context_all):
    # add col for utilization in percentage
    def calc_usedPct(row):
        if row.capacity_usd == 0:
            return 0
        o = row.used_usd / row.capacity_usd * 100
        return int(o)

    self.df_bins['used_pct'] = self.df_bins.apply(calc_usedPct, axis=1)

    # add column for regions as string
    self.df_bins['regions_str'] = self.df_bins['regions_set'].apply(lambda x: "0" if len(x) == 0 else "%i (%s)" % (len(x), l2s(x)))

    # cases where dt_start > dt_end are those where there was no data and the initialization remained
    # so overwrite with na
    # Update 2019-12-11 Now that the df_bins timestamps are set with resample and dt_end is inclusive,
    # instead of setting to na, just swap the start/end fake timestamps which represent the end/start of the periods
    #import numpy as np
    #self.df_bins['dt_start'] = self.df_bins.apply(lambda row: np.nan if row.count_analyzed==0 else row.dt_start, axis=1)
    #self.df_bins['dt_end'] = self.df_bins.apply(lambda row: np.nan if row.count_analyzed==0 else row.dt_end, axis=1)
    self.df_bins['dt_start_bkp'] = self.df_bins['dt_start']
    self.df_bins['dt_start'] = self.df_bins.apply(lambda row: row.dt_end if row.count_analyzed == 0 else row.dt_start, axis=1)
    self.df_bins['dt_end'] = self.df_bins.apply(lambda row: row.dt_start_bkp if row.count_analyzed == 0 else row.dt_end, axis=1)
    del self.df_bins['dt_start_bkp']

    # convert the dt_{start,end} back to dates again, given the nans
    for fx in ['dt_start', 'dt_end']:
        self.df_bins[fx] = pd.to_datetime(self.df_bins[fx])

    # Bugfix for cloudwatch:
    # When 90>=ndays>=64, cloudwatchman/metric.get_statistics returns data with max Timestamp on mainmanager.EndTime
    # Otherwise, the data max Timestamp is EndTime - 1 day
    # Here, get around this problem by incrementing by 1 day (or just set to mainmanager.EndTime)
    # This was tested on 2019-12-10 10:00 am UTC, and the last date was Dec 9 for ndays<64 and Dec 10 for ndays>=64
    if pd.notnull(self.df_bins['dt_end'].iloc[-1]):
        dt_max = context_all['mainManager'].EndTime.date()
        import datetime as dt
        dt_lastp1 = self.df_bins.dt_end.iloc[-1].date() + dt.timedelta(days=1)
        self.df_bins.iloc[-1, self.df_bins.columns == 'dt_end'] = min(dt_max, dt_lastp1)

    # inject result for reporter access
    context_all['df_bins'] = self.df_bins

    return context_all
Example #29
Source File: label.py From lost with MIT License | 5 votes |
def import_df(self, df):
    '''Import LabelTree from DataFrame

    Args:
        df (pandas.DataFrame): LabelTree in DataFrame style.

    Retruns:
        :class:`lost.db.model.LabelLeaf` or None:
            The created root leaf or None if a root leaf with same name
            is already present in database.
    '''
    df = df.where((pd.notnull(df)), None)
    root = df[df['parent_leaf_id'].isnull()]
    no_root = df[~df['parent_leaf_id'].isnull()]
    childs = {}
    if len(root) != 1:
        raise ValueError('''Can not import. There needs to be exactly one root leaf for that tree! Found: \n{}'''.format(root))
    else:
        try:
            root_leaf = self.create_root(root['name'].values[0])
            if root_leaf is None:
                return None  # A tree with the same name already exists.
            self._df_row_to_leaf(root.loc[0], root_leaf)
            # Create child dict
            for index, row in no_root.iterrows():
                if not row['parent_leaf_id'] in childs:
                    childs[row['parent_leaf_id']] = []
                childs[row['parent_leaf_id']].append(row)
            self.__create_childs_from_df(childs, root_leaf, root.loc[0])
            self.dbm.commit()
            return root_leaf
        except KeyError:
            self.logger.error('''At least the following columns need to be provided: *idx*, *name*, *parent_leaf_id*''')
            raise
Example #30
Source File: data_cleaner.py From data-cleaner with MIT License | 5 votes |
def _split(value, separators):
    values = []
    for separator in separators:
        if separator in str(value):
            values = [str(split_value) for split_value in value.split(separator)]
            break

    return pd.Series([str(value).strip() for value in values if pd.notnull(value)])