Python pandas.MultiIndex() Examples
The following are 30 code examples of pandas.MultiIndex().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pandas, or try the search function.

Example #1
Source File: base.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _link_index(self, df_a, df_b): """Build an index for linking two datasets. Parameters ---------- df_a : (tuple of) pandas.Series The data of the left DataFrame to build the index with. df_b : (tuple of) pandas.Series The data of the right DataFrame to build the index with. Returns ------- pandas.MultiIndex A pandas.MultiIndex with record pairs. Each record pair contains the index values of two records. """ raise NotImplementedError( "Not possible to call index for the BaseEstimator" )
Example #2
Source File: base.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 6 votes |
def fit_predict(self, comparison_vectors, match_index=None):
    """Train the classifier, then classify the same comparison vectors.

    Parameters
    ----------
    comparison_vectors : pandas.DataFrame
        The comparison vectors.
    match_index : pandas.MultiIndex
        The true matches.

    Returns
    -------
    pandas.Series
        A pandas Series with the labels 1 (for the matches) and 0 (for the
        non-matches).

    Notes
    -----
    The former ``return_type`` argument is deprecated. Use the option
    `recordlinkage.set_option('classification.return_type', 'index')`
    instead.
    """
    # Fit first so that the prediction below uses the trained state.
    self.fit(comparison_vectors, match_index)
    return self.predict(comparison_vectors)
Example #3
Source File: febrl.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _febrl_links(df): """Get the links of a FEBRL dataset.""" index = df.index.to_series() keys = index.str.extract(r'rec-(\d+)', expand=True)[0] index_int = numpy.arange(len(df)) df_helper = pandas.DataFrame({ 'key': keys, 'index': index_int }) # merge the two frame and make MultiIndex. pairs_df = df_helper.merge( df_helper, on='key' )[['index_x', 'index_y']] pairs_df = pairs_df[pairs_df['index_x'] > pairs_df['index_y']] return pandas.MultiIndex( levels=[df.index.values, df.index.values], codes=[pairs_df['index_x'].values, pairs_df['index_y'].values], names=[None, None], verify_integrity=False )
Example #4
Source File: measures.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 6 votes |
def true_positives(links_true, links_pred):
    """Count the number of True Positives.

    Returns the number of correctly predicted links, also called the number
    of True Positives (TP).

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of correctly predicted links.
    """
    links_true = _get_multiindex(links_true)
    links_pred = _get_multiindex(links_pred)

    # Use the explicit set operation: the ``&`` operator on pandas indexes is
    # no longer a set intersection in recent pandas releases.
    return len(links_true.intersection(links_pred))
Example #5
Source File: measures.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 6 votes |
def false_positives(links_true, links_pred):
    """Count the number of False Positives.

    A false positive is a record pair that is predicted as a link while it
    is a true non-link. The count is known as the number of False Positives
    (FP).

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of false positives.
    """
    actual = _get_multiindex(links_true)
    predicted = _get_multiindex(links_pred)

    # Predicted pairs that do not occur among the true links.
    wrongly_predicted = predicted.difference(actual)
    return len(wrongly_predicted)
Example #6
Source File: measures.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 6 votes |
def false_negatives(links_true, links_pred):
    """Count the number of False Negatives.

    A false negative is a record pair that is a true link while it is
    predicted as a non-link. The count is known as the number of False
    Negatives (FN).

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of false negatives.
    """
    actual = _get_multiindex(links_true)
    predicted = _get_multiindex(links_pred)

    # True links that do not occur among the predicted pairs.
    missed = actual.difference(predicted)
    return len(missed)
Example #7
Source File: test_indexing.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_iterative(self):
    """Check that indexing in two steps gives the same pairs as one step."""

    def as_sorted_frame(pairs):
        # The MultiIndex is wrapped in a frame so that both results can be
        # sorted and compared with assert_frame_equal.
        return pd.DataFrame(index=pairs).sort_index()

    # Single step: index all of a against b at once.
    expected = as_sorted_frame(Full().index((self.a, self.b)))

    # Multi step: index two slices of a against b and append the results.
    indexer = Full()
    first_part = indexer.index((self.a[0:50], self.b))
    second_part = indexer.index((self.a[50:100], self.b))
    combined = as_sorted_frame(first_part.append(second_part))

    pdt.assert_frame_equal(expected, combined)
Example #8
Source File: test_indexing.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_index_names_pandas023(self, index_class):
    """Check that the two level names of the generated pairs differ.

    Pandas changes the behaviour of MultiIndex names.
    https://github.com/pandas-dev/pandas/pull/18882
    https://github.com/J535D165/recordlinkage/issues/55
    This test tests compatibility.
    """
    # make an index for each dataframe with a new index name
    index_a = pd.Index(self.a.index, name='index')
    df_a = pd.DataFrame(self.a, index=index_a)

    index_b = pd.Index(self.b.index, name='index')
    df_b = pd.DataFrame(self.b, index=index_b)

    # make the index for linking
    pairs_link = index_class._link_index(df_a, df_b)
    if pairs_link.names[0] is not None:
        assert pairs_link.names[0] != pairs_link.names[1]

    # make the index for deduplication; the guard must inspect the dedup
    # result itself, not the link result checked above
    pairs_dedup = index_class._dedup_index(df_a)
    if pairs_dedup.names[0] is not None:
        assert pairs_dedup.names[0] != pairs_dedup.names[1]
Example #9
Source File: test_indexing.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_lower_triangular(self, index_class):
    """Check that every generated pair lies in the lower triangle."""
    # make a dataframe with a new index name
    named_index = pd.Index(self.a.index, name='index')
    df_a = pd.DataFrame(self.a, index=named_index)

    pairs = index_class.index(df_a)

    # expected: all pairs strictly below the diagonal of the full matrix
    labels = df_a.index.values
    lower_codes = np.tril_indices(len(df_a.index), k=-1)
    full_pairs = pd.MultiIndex(
        levels=[labels, labels],
        codes=lower_codes,
        verify_integrity=False)

    # no generated pair may fall outside the lower triangle
    outside = pairs.difference(full_pairs)
    assert len(outside) == 0
Example #10
Source File: test_datasets.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_krebs_dataset_download():
    """Load the Krebs register dataset and check its files, types and sizes."""
    # remove downloaded datasets
    clear_data_home()

    krebs_data, krebs_matches = load_krebsregister()

    # the ten block archives must be present in the data home
    for i in range(1, 11):
        assert Path(get_data_home(), "krebsregister",
                    "block_{}.zip".format(i)).is_file()

    # check the returned types; ``assert type(x), T`` would always pass
    # because ``T`` is then only the assertion message
    assert isinstance(krebs_data, pandas.DataFrame)
    assert isinstance(krebs_matches, pandas.MultiIndex)

    # count the number of records
    assert len(krebs_data) == 5749132
    assert len(krebs_matches) == 20931
Example #11
Source File: model_processing.py From respy with MIT License | 6 votes |
def _infer_choices_with_experience(params, options): """Infer choices with experiences. Example ------- >>> options = {"covariates": {"a": "exp_white_collar + exp_a", "b": "exp_b >= 2"}} >>> index = pd.MultiIndex.from_product([["category"], ["a", "b"]]) >>> params = pd.Series(index=index, dtype="object") >>> _infer_choices_with_experience(params, options) ['a', 'b', 'white_collar'] """ covariates = options["covariates"] parameters = params.index.get_level_values(1) used_covariates = [cov for cov in covariates if cov in parameters] matches = [] for param in parameters: matches += re.findall(r"\bexp_([A-Za-z_]+)\b", str(param)) for cov in used_covariates: matches += re.findall(r"\bexp_([A-Za-z_]+)\b", covariates[cov]) return sorted(set(matches))
Example #12
Source File: multi_index.py From arctic with GNU Lesser General Public License v2.1 | 6 votes |
def multi_index_insert_row(df, index_row, values_row):
    """Return a new dataframe with a row inserted for a multi-index dataframe.

    This will sort the rows according to the ordered multi-index levels.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to insert into; it is not modified.
    index_row : sequence
        One index value per level for the new row.
    values_row
        The data of the new row, passed to ``pd.DataFrame`` together with
        ``df.columns``.

    Returns
    -------
    pandas.DataFrame
        The concatenation of ``df`` and the new row, sorted by index unless
        the new row already sorts after the previous last row.
    """
    levels = [[value] for value in index_row]
    codes = [[0] for _ in index_row]
    if PD_VER < '0.24.0':
        # this branch passes the codes under the keyword ``labels``
        row_index = pd.MultiIndex(levels=levels, labels=codes)
    else:
        row_index = pd.MultiIndex(levels=levels, codes=codes)
    row = pd.DataFrame(values_row, index=row_index, columns=df.columns)
    df = pd.concat((df, row))

    if len(df) < 2:
        # The frame was empty before the insert: a single row is trivially
        # sorted, and there is no df.index[-2] to compare against.
        return df

    if df.index.lexsort_depth == len(index_row) and df.index[-2] < df.index[-1]:
        # We've just appended a row to an already-sorted dataframe
        return df

    # The df wasn't sorted or the row has to be put in the middle somewhere
    return df.sort_index()
Example #13
Source File: test_pandas_store.py From arctic with GNU Lesser General Public License v2.1 | 6 votes |
def test_data_info_cols(library):
    """Write a two-level indexed frame and check the info the library reports."""
    index = MultiIndex.from_tuples([(1, "ab"), (2, "bb"), (3, "cb")])
    frame = DataFrame(data=[100, 200, 300], index=index)
    library.write('test_data', frame)
    md = library.get_info('test_data')
    # Example of the reported info:
    # {'dtype': [('level_0', '<i8'), ('level_1', 'S2'), ('0', '<i8')],
    #  'col_names': {u'index': [u'level_0', u'level_1'], u'columns': [u'0'], 'index_tz': [None, None]},
    #  'type': u'pandasdf',
    #  'handler': 'PandasDataFrameStore',
    #  'rows': 3,
    #  'segment_count': 1,
    #  'size': 50}
    assert 'size' in md

    scalar_expectations = (
        ('segment_count', 1),
        ('rows', 3),
        ('handler', 'PandasDataFrameStore'),
        ('type', 'pandasdf'),
    )
    for key, expected in scalar_expectations:
        assert md[key] == expected

    assert md['col_names'] == {'index': ['level_0', u'level_1'],
                               'columns': [u'0'],
                               'index_tz': [None, None]}

    dtype = md['dtype']
    assert len(dtype) == 3
    for position, field_name in enumerate(('level_0', 'level_1', '0')):
        assert dtype[position][0] == field_name
Example #14
Source File: test_base.py From recruit with Apache License 2.0 | 6 votes |
def setup_method(self, method):
    """Populate ``self.indices`` with sample indexes, then call ``setup_indices``."""
    self.indices = {
        'unicodeIndex': tm.makeUnicodeIndex(100),
        'strIndex': tm.makeStringIndex(100),
        'dateIndex': tm.makeDateIndex(100),
        'periodIndex': tm.makePeriodIndex(100),
        'tdIndex': tm.makeTimedeltaIndex(100),
        'intIndex': tm.makeIntIndex(100),
        'uintIndex': tm.makeUIntIndex(100),
        'rangeIndex': tm.makeRangeIndex(100),
        'floatIndex': tm.makeFloatIndex(100),
        'boolIndex': Index([True, False]),
        'catIndex': tm.makeCategoricalIndex(100),
        'empty': Index([]),
        'tuples': MultiIndex.from_tuples(
            lzip(['foo', 'bar', 'baz'], [1, 2, 3])),
        'repeats': Index([0, 0, 1, 1, 2, 2]),
    }
    self.setup_indices()
Example #15
Source File: test_common.py From recruit with Apache License 2.0 | 6 votes |
def test_droplevel(self, indices):
    """Check ``droplevel`` on flat indexes: no-op, own name, unknown name."""
    # GH 21115
    if isinstance(indices, MultiIndex):
        # Tested separately in test_multi.py
        return

    # dropping no level at all leaves the index unchanged
    assert indices.droplevel([]).equals(indices)

    name = indices.name
    for level in (name, [name]):
        if level is name and isinstance(name, tuple):
            # GH 21121 : droplevel with tuple name
            continue
        with pytest.raises(ValueError):
            indices.droplevel(level)

    for level in ('wrong', ['wrong']):
        with pytest.raises(KeyError):
            indices.droplevel(level)
Example #16
Source File: test_common.py From recruit with Apache License 2.0 | 6 votes |
def test_constructor_non_hashable_name(self, indices):
    """Check that renaming an index to an unhashable name raises TypeError."""
    # GH 20527
    if isinstance(indices, MultiIndex):
        pytest.skip("multiindex handled in test_multi.py")

    expected_error = "Index.name must be a hashable type"
    unhashable_name = [['1']]

    # With .rename()
    with pytest.raises(TypeError, match=expected_error):
        indices.rename(name=unhashable_name)

    # With .set_names()
    with pytest.raises(TypeError, match=expected_error):
        indices.set_names(names=unhashable_name)
Example #17
Source File: test_common.py From recruit with Apache License 2.0 | 6 votes |
def test_duplicated(self, indices, keep):
    """Compare ``Index.duplicated`` with ``Series.duplicated`` on a resample."""
    if isinstance(indices, (MultiIndex, RangeIndex)) or not len(indices):
        # MultiIndex tested separately in:
        # tests/indexes/multi/test_unique_and_duplicates
        pytest.skip('Skip check for empty Index, MultiIndex, RangeIndex')

    index_type = type(indices)
    base = index_type(indices)
    if base.has_duplicates:
        # We are testing the duplicated-method here, so we need to know
        # exactly which indices are duplicate and how (for the result).
        # This is not possible if the index has duplicates already, which we
        # therefore remove. This is seemingly circular, as drop_duplicates
        # invokes duplicated, but in the end, it all works out because we
        # cross-check with Series.duplicated, which is tested separately.
        base = base.drop_duplicates()

    size, repeat = len(base), 10
    positions = np.random.choice(size, repeat * size)

    # The drawn positions carry the same duplication pattern as the values
    # selected with them, because the base values are unique.
    expected = pd.Series(positions).duplicated(keep=keep).values
    resampled = index_type(base.values[positions])

    result = resampled.duplicated(keep=keep)
    tm.assert_numpy_array_equal(result, expected)
Example #18
Source File: index.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _link_index(self, df_a, df_b): return pandas.MultiIndex.from_product( [df_a.index.values, df_b.index.values])
Example #19
Source File: index.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _dedup_index(self, df_a): levels = [df_a.index.values, df_a.index.values] codes = numpy.tril_indices(len(df_a.index), k=-1) return pandas.MultiIndex( levels=levels, codes=codes, verify_integrity=False)
Example #20
Source File: index.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _link_index(self, df_a, df_b):
    """Return the pairs of df_a and df_b records that agree on all blocking keys.

    Rows with a missing value in any blocking key are dropped before the
    two sides are joined.
    """
    left_on, right_on = self._get_left_and_right_on()
    left_on = listify(left_on)
    right_on = listify(right_on)

    blocking_keys = ["blocking_key_%d" % pos for pos, _ in enumerate(left_on)]

    def keyed_positions(frame, columns, position_column):
        # 1. make a dataframe
        # 2. rename columns
        # 3. add a column with the row positions
        # 4. drop na (last step, so positions refer to the original rows)
        data = pandas.DataFrame(frame[columns], copy=False)
        data.columns = blocking_keys
        data[position_column] = numpy.arange(len(frame))
        data.dropna(axis=0, how='any', subset=blocking_keys, inplace=True)
        return data

    data_left = keyed_positions(df_a, left_on, 'index_x')
    data_right = keyed_positions(df_b, right_on, 'index_y')

    # merge the dataframes
    pairs_df = data_left.merge(data_right, how='inner', on=blocking_keys)

    return pandas.MultiIndex(
        levels=[df_a.index.values, df_b.index.values],
        codes=[pairs_df['index_x'].values, pairs_df['index_y'].values],
        verify_integrity=False)
Example #21
Source File: index.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _link_index(self, df_a, df_b):
    """Return ``self.n`` randomly sampled pairs of df_a and df_b records.

    Raises
    ------
    ValueError
        If ``self.n`` is not an integer, if sampling with replacement is
        requested while no pair can be formed, or if sampling without
        replacement is requested with ``self.n`` outside ``0<n<=n_max``.
    """
    shape = (len(df_a), len(df_b))
    n_max = full_index_size(shape)

    if not isinstance(self.n, int):
        raise ValueError('n must be an integer')

    if self.replace:
        # with replacement
        if n_max == 0:
            raise ValueError("one of the dataframes is empty")

        pairs = random_pairs_with_replacement(self.n, shape,
                                              self.random_state)
    else:
        # without replacement
        if not 0 < self.n <= n_max:
            raise ValueError(
                "n must be a integer satisfying 0<n<=%s" % n_max)

        # the fraction of pairs in the sample
        frac = self.n / n_max

        # Fewer than 1e6 possible pairs, or more than half of them sampled:
        # use the regular sampler; otherwise the low-memory variant.
        if n_max < 1e6 or frac > 0.5:
            sampler = random_pairs_without_replacement
        else:
            sampler = random_pairs_without_replacement_low_memory

        pairs = sampler(self.n, shape, self.random_state)

    return pandas.MultiIndex(
        levels=[df_a.index.values, df_b.index.values],
        codes=pairs,
        verify_integrity=False)
Example #22
Source File: index.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _dedup_index(self, df_a):
    """Return ``self.n`` randomly sampled pairs of records within df_a.

    Raises
    ------
    ValueError
        If sampling without replacement is requested and ``self.n`` is not
        an integer satisfying ``0<n<=n_max``.
    """
    shape = (len(df_a), )

    if self.replace:
        # with replacement
        pairs = random_pairs_with_replacement(self.n, shape,
                                              self.random_state)
    else:
        # without replacement
        n_max = full_index_size(shape)

        if not (isinstance(self.n, int) and 0 < self.n <= n_max):
            raise ValueError(
                "n must be a integer satisfying 0<n<=%s" % n_max)

        # Fewer than 1e6 possible pairs: use the regular sampler; otherwise
        # the low-memory variant.
        if n_max < 1e6:
            sampler = random_pairs_without_replacement
        else:
            sampler = random_pairs_without_replacement_low_memory

        pairs = sampler(self.n, shape, self.random_state)

    record_labels = df_a.index.values
    return pandas.MultiIndex(
        levels=[record_labels, record_labels],
        codes=pairs,
        verify_integrity=False)
Example #23
Source File: utils.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def index_split(index, chunks):
    """Split a pandas.Index or pandas.MultiIndex into consecutive chunks.

    This function is based on :func:`numpy.array_split`: when the length is
    not divisible by ``chunks``, the first chunks get one extra element.

    Parameters
    ----------
    index : pandas.Index, pandas.MultiIndex
        A pandas.Index or pandas.MultiIndex to split into chunks.
    chunks : int
        The number of parts to split the index into.

    Returns
    -------
    list
        A list with chunked pandas.Index or pandas.MultiIndex objects.

    Raises
    ------
    ValueError
        If ``chunks`` is not larger than 0.
    """
    n_total = index.shape[0]
    n_sections = int(chunks)
    if n_sections <= 0:
        raise ValueError('number sections must be larger than 0.')

    base_size, n_larger = divmod(n_total, n_sections)

    # Leading 0 so that the cumulative sum yields the chunk boundaries.
    sizes = ([0]
             + [base_size + 1] * n_larger
             + [base_size] * (n_sections - n_larger))
    bounds = numpy.array(sizes).cumsum()

    return [index[start:stop]
            for start, stop in zip(bounds[:-1], bounds[1:])]
Example #24
Source File: utils.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def frame_indexing(frame, multi_index, level_i, indexing_type='label'):
    """Select rows of a dataframe using one level of a MultiIndex.

    Arguments
    ---------
    frame : pandas.DataFrame
        The dataframe to select records from.
    multi_index : pandas.MultiIndex
        A pandas multiindex where one of the levels is used to sample the
        dataframe with.
    level_i : int, str
        The level of the multiIndex to index on.
    indexing_type : str
        The type of indexing. The value can be 'label' or 'position'.
        Default 'label'.

    Returns
    -------
    The selected rows, with ``multi_index`` assigned as their index.

    Raises
    ------
    ValueError
        If ``indexing_type`` is neither 'label' nor 'position'.
    """
    if indexing_type == "label":
        selector = frame.loc
    elif indexing_type == "position":
        selector = frame.iloc
    else:
        raise ValueError("indexing_type needs to be 'label' or 'position'")

    data = selector[multi_index.get_level_values(level_i)]
    data.index = multi_index
    return data
Example #25
Source File: base.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _verify_integrety(self, x): if isinstance(x.index, pandas.Index): if not x.index.is_unique: raise ValueError('index of DataFrame is not unique') elif isinstance(x.index, pandas.MultiIndex): raise ValueError( 'expected pandas.Index instead of pandas.MultiIndex' )
Example #26
Source File: base.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _dedup_index(self, df_a): """Build an index for duplicate detection in a dataset. This method can be used to implement an algorithm for duplicate detection. This method is optional if method :func:`~recordlinkage.base.BaseIndexAlgorithm._link_index` is implemented. Parameters ---------- df_a : (tuple of) pandas.Series The data of the DataFrame to build the index with. Returns ------- pandas.MultiIndex A pandas.MultiIndex with record pairs. Each record pair contains the index values of two records. The records are sampled from the lower triangular part of the matrix. """ pairs = self._link_index(df_a, df_a) # Remove all pairs not in the lower triangular part of the matrix. # This part can be inproved by not comparing the level values, but the # level itself. try: pairs = pairs[pairs.codes[0] > pairs.codes[1]] except AttributeError: # backwards compat pandas <24 pairs = pairs[pairs.labels[0] > pairs.labels[1]] return pairs
Example #27
Source File: base.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, features=[], n_jobs=1, indexing_type='label', **kwargs): logging.info("comparing - initialize {} class".format( self.__class__.__name__) ) self.features = [] self.add(features) # public if n_jobs == -1: self.n_jobs = cpu_count() else: self.n_jobs = n_jobs self.indexing_type = indexing_type # label of position # logging self._i = 1 self._i_max = None self._n = [] self._eta = [] self._output_log_total = True # private self._compare_functions = [] if isinstance(features, (pandas.MultiIndex, pandas.Index)): warnings.warn( "It seems you are using the older version of the Compare API, " "see the documentation about how to update to the new API. " "http://recordlinkage.readthedocs.io/" "en/latest/ref-compare.html", DeprecationWarning )
Example #28
Source File: types.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def is_pandas_multiindex(x):
    """Return True when ``x`` is an instance of pandas.MultiIndex."""
    multiindex_type = pandas.MultiIndex
    return isinstance(x, multiindex_type)
Example #29
Source File: febrl.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def load_febrl4(return_links=False):
    """Load the FEBRL 4 datasets.

    The Freely Extensible Biomedical Record Linkage (Febrl) package is
    distributed with a dataset generator and four datasets generated with
    the generator. This function returns the fourth Febrl dataset as a
    :class:`pandas.DataFrame`.

            *"Generated as one data set with 10000 records (5000
            originals and 5000 duplicates, with one duplicate per
            original), the originals have been split from the
            duplicates, into dataset4a.csv (containing the 5000
            original records) and dataset4b.csv (containing the
            5000 duplicate records) These two data sets can be
            used for testing linkage procedures."*

    Parameters
    ----------
    return_links: bool
        When True, the function returns also the true links.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        A :class:`pandas.DataFrame` with Febrl dataset4a.csv and a pandas
        dataframe with Febrl dataset4b.csv. When return_links is True,
        the function returns also the true links.
    """
    df_a = _febrl_load_data('dataset4a.csv')
    df_b = _febrl_load_data('dataset4b.csv')

    if not return_links:
        return df_a, df_b

    # The true links pair original i with its duplicate, for i in 0..4999.
    record_numbers = range(0, 5000)
    originals = ["rec-{}-org".format(i) for i in record_numbers]
    duplicates = ["rec-{}-dup-0".format(i) for i in record_numbers]
    links = pandas.MultiIndex.from_arrays([originals, duplicates])

    return df_a, df_b, links
Example #30
Source File: measures.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _get_multiindex(x): if isinstance(x, (pandas.DataFrame, pandas.Series)): return x.index elif isinstance(x, pandas.MultiIndex): return x else: raise ValueError("Expected one of: pandas.DataFrame, " "pandas.Series, pandas.MultiIndex")