Python pandas.MultiIndex() Examples

The following code examples show how to use pandas.MultiIndex(). They are taken from open source Python projects.
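
Before the examples, here is a minimal sketch of constructing a MultiIndex directly; the level names and values below are purely illustrative.

import pandas as pd

# Build a two-level index from explicit tuples ...
idx = pd.MultiIndex.from_tuples(
    [("a", 1), ("a", 2), ("b", 1)], names=["letter", "number"])

# ... or from the cartesian product of per-level values.
idx2 = pd.MultiIndex.from_product(
    [["a", "b"], [1, 2]], names=["letter", "number"])

df = pd.DataFrame({"value": [10, 20, 30]}, index=idx)
print(df.loc[("a", 2)])  # select a single row by its full key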

Example 1
Project: bigquerylayers   Author: smandaric   File: _pandas_helpers.py    GNU General Public License v3.0
def get_column_or_index(dataframe, name):
    """Return a column or index as a pandas series."""
    if name in dataframe.columns:
        return dataframe[name].reset_index(drop=True)

    if isinstance(dataframe.index, pandas.MultiIndex):
        if name in dataframe.index.names:
            return (
                dataframe.index.get_level_values(name)
                .to_series()
                .reset_index(drop=True)
            )
    else:
        if name == dataframe.index.name:
            return dataframe.index.to_series().reset_index(drop=True)

    raise ValueError("column or index '{}' not found.".format(name)) 
Example 2
Project: bigquerylayers   Author: smandaric   File: _pandas_helpers.py    GNU General Public License v3.0
def list_columns_and_indexes(dataframe):
    """Return all index and column names with dtypes.

    Returns:
        Sequence[Tuple[str, dtype]]:
            Returns a sorted list of indexes and column names with
            corresponding dtypes. If an index is missing a name or has the
            same name as a column, the index is omitted.
    """
    column_names = frozenset(dataframe.columns)
    columns_and_indexes = []
    if isinstance(dataframe.index, pandas.MultiIndex):
        for name in dataframe.index.names:
            if name and name not in column_names:
                values = dataframe.index.get_level_values(name)
                columns_and_indexes.append((name, values.dtype))
    else:
        if dataframe.index.name and dataframe.index.name not in column_names:
            columns_and_indexes.append((dataframe.index.name, dataframe.index.dtype))

    columns_and_indexes += zip(dataframe.columns, dataframe.dtypes)
    return columns_and_indexes 
Example 3
Project: recordlinkage   Author: J535D165   File: base.py    BSD 3-Clause "New" or "Revised" License
def fit_predict(self, comparison_vectors, match_index=None):
        """Train the classifier.

        Parameters
        ----------
        comparison_vectors : pandas.DataFrame
            The comparison vectors.
        match_index : pandas.MultiIndex
            The true matches.
        return_type : str
            Deprecated. Use the option
            `recordlinkage.set_option('classification.return_type', 'index')`
            instead.

        Returns
        -------
        pandas.Series
            A pandas Series with the labels 1 (for the matches) and 0 (for the
            non-matches).

        """
        self.fit(comparison_vectors, match_index)
        result = self.predict(comparison_vectors)

        return result 
Example 4
Project: recordlinkage   Author: J535D165   File: measures.py    BSD 3-Clause "New" or "Revised" License
def true_positives(links_true, links_pred):
    """Count the number of True Positives.

    Returns the number of correctly predicted links, also called the number of
    True Positives (TP).

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of correctly predicted links.
    """

    links_true = _get_multiindex(links_true)
    links_pred = _get_multiindex(links_pred)

    return len(links_true & links_pred) 
Example 5
Project: recordlinkage   Author: J535D165   File: measures.py    BSD 3-Clause "New" or "Revised" License
def false_positives(links_true, links_pred):
    """Count the number of False Positives.

    Returns the number of true non-links that were incorrectly predicted as
    links. This value is known as the number of False Positives (FP).

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of false positives.

    """

    links_true = _get_multiindex(links_true)
    links_pred = _get_multiindex(links_pred)

    return len(links_pred.difference(links_true)) 
Example 6
Project: recordlinkage   Author: J535D165   File: measures.py    BSD 3-Clause "New" or "Revised" License
def false_negatives(links_true, links_pred):
    """Count the number of False Negatives.

    Returns the number of true links that were incorrectly predicted as
    non-links. This value is known as the number of False Negatives (FN).

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.

    Returns
    -------
    int
        The number of false negatives.

    """

    links_true = _get_multiindex(links_true)
    links_pred = _get_multiindex(links_pred)

    return len(links_true.difference(links_pred)) 
Example 7
Project: recordlinkage   Author: J535D165   File: test_indexing.py    BSD 3-Clause "New" or "Revised" License
def test_iterative(self):
        """Test the iterative behaviour."""

        # SINGLE STEP
        index_class = Full()
        pairs = index_class.index((self.a, self.b))
        pairs = pd.DataFrame(index=pairs).sort_index()

        # MULTI STEP
        index_class = Full()

        pairs1 = index_class.index((self.a[0:50], self.b))
        pairs2 = index_class.index((self.a[50:100], self.b))

        pairs_split = pairs1.append(pairs2)
        pairs_split = pd.DataFrame(index=pairs_split).sort_index()

        ptm.assert_frame_equal(pairs, pairs_split)
        # note: not possible to sort a MultiIndex, so a frame was made out of it.
Example 8
Project: recordlinkage   Author: J535D165   File: test_indexing.py    BSD 3-Clause "New" or "Revised" License
def test_index_names_pandas023(self, index_class):
        # Pandas changes the behaviour of MultiIndex names.
        # https://github.com/pandas-dev/pandas/pull/18882
        # https://github.com/J535D165/recordlinkage/issues/55
        # This test checks compatibility.

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        index_b = pd.Index(self.b.index, name='index')
        df_b = pd.DataFrame(self.b, index=index_b)

        # make the index
        pairs_link = index_class._link_index(df_a, df_b)

        if pairs_link.names[0] is not None:
            assert pairs_link.names[0] != pairs_link.names[1]

        # make the index
        pairs_dedup = index_class._dedup_index(df_a)

        if pairs_dedup.names[0] is not None:
            assert pairs_dedup.names[0] != pairs_dedup.names[1] 
Example 9
Project: recordlinkage   Author: J535D165   File: test_datasets.py    BSD 3-Clause "New" or "Revised" License
def test_krebs_dataset_download():

    # remove downloaded datasets
    clear_data_home()

    krebs_data, krebs_matches = load_krebsregister()

    for i in range(1, 11):
        assert Path(get_data_home(), "krebsregister",
                    "block_{}.zip".format(i)).is_file()

    # count the number of records
    assert isinstance(krebs_data, pandas.DataFrame)
    assert isinstance(krebs_matches, pandas.MultiIndex)
    assert len(krebs_data) == 5749132
    assert len(krebs_matches) == 20931 
Example 10
Project: respy   Author: OpenSourceEconomics   File: model_processing.py    MIT License
def _infer_choices_with_experience(params, options):
    """Infer choices with experiences.

    Example
    -------
    >>> options = {"covariates": {"a": "exp_white_collar + exp_a", "b": "exp_b >= 2"}}
    >>> index = pd.MultiIndex.from_product([["category"], ["a", "b"]])
    >>> params = pd.Series(index=index)
    >>> _infer_choices_with_experience(params, options)
    ['a', 'b', 'white_collar']

    """
    covariates = options["covariates"]
    parameters = params.index.get_level_values(1)

    used_covariates = [cov for cov in covariates if cov in parameters]

    matches = []
    for param in parameters:
        matches += re.findall(r"\bexp_([A-Za-z_]+)\b", str(param))
    for cov in used_covariates:
        matches += re.findall(r"\bexp_([A-Za-z_]+)\b", covariates[cov])

    return sorted(set(matches)) 
Example 11
Project: arctic   Author: man-group   File: test_pandas_store.py    GNU Lesser General Public License v2.1
def test_data_info_cols(library):
    i = MultiIndex.from_tuples([(1, "ab"), (2, "bb"), (3, "cb")])
    s = DataFrame(data=[100, 200, 300], index=i)
    library.write('test_data', s)
    md = library.get_info('test_data')
    # {'dtype': [('level_0', '<i8'), ('level_1', 'S2'), ('0', '<i8')],
    #                  'col_names': {u'index': [u'level_0', u'level_1'], u'columns': [u'0'], 'index_tz': [None, None]},
    #                  'type': u'pandasdf',
    #                  'handler': 'PandasDataFrameStore',
    #                  'rows': 3,
    #                  'segment_count': 1,
    #                  'size': 50}
    assert 'size' in md
    assert md['segment_count'] == 1
    assert md['rows'] == 3
    assert md['handler'] == 'PandasDataFrameStore'
    assert md['type'] == 'pandasdf'
    assert md['col_names'] == {'index': ['level_0', 'level_1'], 'columns': ['0'], 'index_tz': [None, None]}
    assert len(md['dtype']) == 3
    assert md['dtype'][0][0] == 'level_0'
    assert md['dtype'][1][0] == 'level_1'
    assert md['dtype'][2][0] == '0' 
Example 12
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_setops.py    MIT License
def test_union_different_types(index_pair):
    # GH 23525
    idx1, idx2 = index_pair
    type_pair = tuple(sorted([type(idx1), type(idx2)], key=lambda x: str(x)))
    if type_pair in COMPATIBLE_INCONSISTENT_PAIRS:
        pytest.xfail("This test only considers non compatible indexes.")

    if any(isinstance(idx, pd.MultiIndex) for idx in index_pair):
        pytest.xfail("This test doesn't consider multiindixes.")

    if is_dtype_equal(idx1.dtype, idx2.dtype):
        pytest.xfail("This test only considers non matching dtypes.")

    # A union with a CategoricalIndex (even as dtype('O')) and a
    # non-CategoricalIndex can only be made if both indices are monotonic.
    # This is true before this PR as well.

    # Union with a non-unique, non-monotonic index raises error
    # This applies to the boolean index
    idx1 = idx1.sort_values()
    idx2 = idx2.sort_values()

    assert idx1.union(idx2).dtype == np.dtype("O")
    assert idx2.union(idx1).dtype == np.dtype("O") 
Example 13
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_base.py    MIT License
def setup_method(self, method):
        self.indices = dict(
            unicodeIndex=tm.makeUnicodeIndex(100),
            strIndex=tm.makeStringIndex(100),
            dateIndex=tm.makeDateIndex(100),
            periodIndex=tm.makePeriodIndex(100),
            tdIndex=tm.makeTimedeltaIndex(100),
            intIndex=tm.makeIntIndex(100),
            uintIndex=tm.makeUIntIndex(100),
            rangeIndex=tm.makeRangeIndex(100),
            floatIndex=tm.makeFloatIndex(100),
            boolIndex=Index([True, False]),
            catIndex=tm.makeCategoricalIndex(100),
            empty=Index([]),
            tuples=MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])),
            repeats=Index([0, 0, 1, 1, 2, 2]),
        )
        self.setup_indices() 
Example 14
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_common.py    MIT License
def test_droplevel(self, indices):
        # GH 21115
        if isinstance(indices, MultiIndex):
            # Tested separately in test_multi.py
            return

        assert indices.droplevel([]).equals(indices)

        for level in indices.name, [indices.name]:
            if isinstance(indices.name, tuple) and level is indices.name:
                # GH 21121 : droplevel with tuple name
                continue
            with pytest.raises(ValueError):
                indices.droplevel(level)

        for level in "wrong", ["wrong"]:
            with pytest.raises(
                KeyError,
                match=r"'Requested level \(wrong\) does not match index name \(None\)'",
            ):
                indices.droplevel(level) 
Example 15
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_common.py    MIT License
def test_duplicated(self, indices, keep):
        if not len(indices) or isinstance(indices, (MultiIndex, RangeIndex)):
            # MultiIndex tested separately in:
            # tests/indexes/multi/test_unique_and_duplicates
            pytest.skip("Skip check for empty Index, MultiIndex, RangeIndex")

        holder = type(indices)

        idx = holder(indices)
        if idx.has_duplicates:
            # We are testing the duplicated-method here, so we need to know
            # exactly which indices are duplicate and how (for the result).
            # This is not possible if "idx" has duplicates already, which we
            # therefore remove. This is seemingly circular, as drop_duplicates
            # invokes duplicated, but in the end, it all works out because we
            # cross-check with Series.duplicated, which is tested separately.
            idx = idx.drop_duplicates()

        n, k = len(idx), 10
        duplicated_selection = np.random.choice(n, k * n)
        expected = pd.Series(duplicated_selection).duplicated(keep=keep).values
        idx = holder(idx.values[duplicated_selection])

        result = idx.duplicated(keep=keep)
        tm.assert_numpy_array_equal(result, expected) 
Example 16
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_missing.py    MIT License
def test_nan_stays_float():

    # GH 7031
    idx0 = pd.MultiIndex(
        levels=[["A", "B"], []], codes=[[1, 0], [-1, -1]], names=[0, 1]
    )
    idx1 = pd.MultiIndex(levels=[["C"], ["D"]], codes=[[0], [0]], names=[0, 1])
    idxm = idx0.join(idx1, how="outer")
    assert pd.isna(idx0.get_level_values(1)).all()
    # the following failed in 0.14.1
    assert pd.isna(idxm.get_level_values(1)[:-1]).all()

    df0 = pd.DataFrame([[1, 2]], index=idx0)
    df1 = pd.DataFrame([[3, 4]], index=idx1)
    dfm = df0 - df1
    assert pd.isna(df0.index.get_level_values(1)).all()
    # the following failed in 0.14.1
    assert pd.isna(dfm.index.get_level_values(1)[:-1]).all() 
Example 17
Project: scicast   Author: iandriver   File: matrix_filter.py    MIT License
def index_to_label(self,index):
        """Convert a pandas index or multiindex to an axis label."""
        if isinstance(index, pd.MultiIndex):
            return "-".join(map(str, index.names))
        else:
            return index.name

    # given a gene list, make a new matrix from a gene subset
Example 18
Project: scicast   Author: iandriver   File: matrix_filter.py    MIT License
def _index_to_label(index):
    """Convert a pandas index or multiindex to an axis label."""
    if isinstance(index, pd.MultiIndex):
        return "-".join(map(str, index.names))
    else:
        return index.name 
Example 19
Project: recordlinkage   Author: J535D165   File: utils.py    BSD 3-Clause "New" or "Revised" License
def multi_index_to_frame(index):
    """
    Replicates MultiIndex.to_frame, which was introduced in pandas 0.21,
    for the sake of backwards compatibility.
    """
    return pandas.DataFrame(index.tolist(), index=index, columns=index.names) 
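On pandas 0.21 and newer the same conversion is available as a method; a minimal equivalent sketch:

import pandas

index = pandas.MultiIndex.from_tuples([(1, "a"), (2, "b")], names=["num", "letter"])

# Same result as the backwards-compatibility helper above on modern pandas.
frame = index.to_frame()
print(frame.columns.tolist())  # ['num', 'letter']
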
Example 20
Project: recordlinkage   Author: J535D165   File: utils.py    BSD 3-Clause "New" or "Revised" License
def index_split(index, chunks):
    """Function to split pandas.Index and pandas.MultiIndex objects.

    Split :class:`pandas.Index` and :class:`pandas.MultiIndex` objects
    into chunks. This function is based on :func:`numpy.array_split`.

    Parameters
    ----------
    index : pandas.Index, pandas.MultiIndex
        A pandas.Index or pandas.MultiIndex to split into chunks.
    chunks : int
        The number of parts to split the index into.

    Returns
    -------
    list
        A list with chunked pandas.Index or pandas.MultiIndex objects.

    """

    Ntotal = index.shape[0]
    Nsections = int(chunks)
    if Nsections <= 0:
        raise ValueError('number of sections must be larger than 0.')
    Neach_section, extras = divmod(Ntotal, Nsections)
    section_sizes = ([0] + extras * [Neach_section + 1] +
                     (Nsections - extras) * [Neach_section])
    div_points = numpy.array(section_sizes).cumsum()

    sub_ind = []
    for i in range(Nsections):
        st = div_points[i]
        end = div_points[i + 1]
        sub_ind.append(index[st:end])

    return sub_ind 
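A short usage sketch of the same chunking idea on a small MultiIndex; the simple slicing below is an illustrative stand-in for index_split, not the function itself.

import pandas

index = pandas.MultiIndex.from_product([[1, 2, 3], ["a", "b"]])

# Split the six pairs into three chunks of (roughly) equal size by slicing.
chunk_size = -(-len(index) // 3)  # ceiling division
chunks = [index[i:i + chunk_size] for i in range(0, len(index), chunk_size)]
print([len(c) for c in chunks])  # [2, 2, 2]
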
Example 21
Project: recordlinkage   Author: J535D165   File: utils.py    BSD 3-Clause "New" or "Revised" License
def frame_indexing(frame, multi_index, level_i, indexing_type='label'):
    """Index dataframe based on one level of MultiIndex.

    Arguments
    ---------
    frame : pandas.DataFrame
        The dataframe to select records from.
    multi_index : pandas.MultiIndex
        A pandas MultiIndex where one of the levels is used to sample the
        dataframe with.
    level_i : int, str
        The level of the MultiIndex to index on.
    indexing_type : str
        The type of indexing. The value can be 'label' or 'position'.
        Default 'label'.

    """

    if indexing_type == "label":
        data = frame.loc[multi_index.get_level_values(level_i)]
        data.index = multi_index
    elif indexing_type == "position":
        data = frame.iloc[multi_index.get_level_values(level_i)]
        data.index = multi_index
    else:
        raise ValueError("indexing_type needs to be 'label' or 'position'")

    return data 
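A minimal sketch of the label-based branch; the frame and the candidate pairs below are illustrative.

import pandas

frame = pandas.DataFrame({"name": ["ann", "bob", "cat"]}, index=[10, 11, 12])
pairs = pandas.MultiIndex.from_tuples([(10, 11), (10, 12), (11, 12)])

# Select the left-hand record of every candidate pair, then reattach the
# pairs as the index so each row lines up with its record pair.
data = frame.loc[pairs.get_level_values(0)]
data.index = pairs
print(data)
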
Example 22
Project: recordlinkage   Author: J535D165   File: utils.py    BSD 3-Clause "New" or "Revised" License
def construct_multiindex(levels, codes, *args, **kwargs):

    if is_min_pandas_version("0.24.0"):
        return pandas.MultiIndex(levels=levels, codes=codes, *args, **kwargs)
    else:
        return pandas.MultiIndex(levels=levels, labels=codes, *args, **kwargs) 
Example 23
Project: recordlinkage   Author: J535D165   File: base.py    BSD 3-Clause "New" or "Revised" License
def _verify_integrety(self, x):

        # check MultiIndex first, since it is a subclass of pandas.Index
        if isinstance(x.index, pandas.MultiIndex):
            raise ValueError(
                'expected pandas.Index instead of pandas.MultiIndex'
            )

        if not x.index.is_unique:
            raise ValueError('index of DataFrame is not unique') 
Example 24
Project: recordlinkage   Author: J535D165   File: base.py    BSD 3-Clause "New" or "Revised" License
def _dedup_index(self, df_a):
        """Build an index for duplicate detection in a dataset.

        This method can be used to implement an algorithm for
        duplicate detection. This method is optional if method
        :func:`~recordlinkage.base.BaseIndexAlgorithm._link_index`
        is implemented.

        Parameters
        ----------
        df_a : (tuple of) pandas.Series
            The data of the DataFrame to build the index with.

        Returns
        -------
        pandas.MultiIndex
            A pandas.MultiIndex with record pairs. Each record pair
            contains the index values of two records. The records are
            sampled from the lower triangular part of the matrix.
        """
        pairs = self._link_index(df_a, df_a)

        # Remove all pairs not in the lower triangular part of the matrix.
        # This part can be improved by not comparing the level values, but the
        # level itself.
        try:
            pairs = pairs[pairs.codes[0] > pairs.codes[1]]
        except AttributeError:
            # backwards compat pandas < 0.24
            pairs = pairs[pairs.labels[0] > pairs.labels[1]]

        return pairs 
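A small sketch of the lower-triangular filtering step on pandas 0.24+ (where the attribute is named codes); the record labels are illustrative.

import pandas

records = [10, 11, 12]
pairs = pandas.MultiIndex.from_product([records, records])  # all 9 pairs

# Keep only the lower triangular part: each unordered pair once, no self-pairs.
lower = pairs[pairs.codes[0] > pairs.codes[1]]
print(lower.tolist())  # [(11, 10), (12, 10), (12, 11)]
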
Example 25
Project: recordlinkage   Author: J535D165   File: base.py    BSD 3-Clause "New" or "Revised" License
def __init__(self, features=[], n_jobs=1, indexing_type='label',
                 **kwargs):

        logging.info("comparing - initialize {} class".format(
            self.__class__.__name__)
        )

        self.features = []
        self.add(features)

        # public
        if n_jobs == -1:
            self.n_jobs = cpu_count()
        else:
            self.n_jobs = n_jobs
        self.indexing_type = indexing_type  # label or position

        # logging
        self._i = 1
        self._i_max = None
        self._n = []
        self._eta = []
        self._output_log_total = True

        # private
        self._compare_functions = []

        if isinstance(features, (pandas.MultiIndex, pandas.Index)):
            warnings.warn(
                "It seems you are using the older version of the Compare API, "
                "see the documentation about how to update to the new API. "
                "http://recordlinkage.readthedocs.io/"
                "en/latest/ref-compare.html",
                DeprecationWarning
            ) 
Example 26
Project: recordlinkage   Author: J535D165   File: base.py    BSD 3-Clause "New" or "Revised" License
def _return_result(self, result, comparison_vectors=None):
        """Return different formatted classification results.

        """
        return_type = cf.get_option('classification.return_type')

        if type(result) != np.ndarray:
            raise ValueError("numpy.ndarray expected.")

        # return the pandas.MultiIndex
        if return_type == 'index':
            return comparison_vectors.index[result.astype(bool)]

        # return a pandas.Series
        elif return_type == 'series':
            return pandas.Series(
                result,
                index=comparison_vectors.index,
                name='classification')

        # return a numpy.ndarray
        elif return_type == 'array':
            return result

        # return_type not known
        else:
            raise ValueError(
                "return_type {} unknown. Choose 'index', 'series' or "
                "'array'".format(return_type)) 
Example 27
Project: recordlinkage   Author: J535D165   File: types.py    BSD 3-Clause "New" or "Revised" License
def is_pandas_multiindex(x):

    return isinstance(x, (pandas.MultiIndex)) 
Example 28
Project: recordlinkage   Author: J535D165   File: measures.py    BSD 3-Clause "New" or "Revised" License
def _get_multiindex(x):

    if isinstance(x, (pandas.DataFrame, pandas.Series)):
        return x.index
    elif isinstance(x, pandas.MultiIndex):
        return x
    else:
        raise ValueError("Expected one of: pandas.DataFrame, "
                         "pandas.Series, pandas.MultiIndex") 
Example 29
Project: recordlinkage   Author: J535D165   File: measures.py    BSD 3-Clause "New" or "Revised" License
def reduction_ratio(links_pred, *total):
    """Compute the reduction ratio.

    The reduction ratio is 1 minus the ratio of the number of candidate
    record pairs to the maximum number of pairs possible.

    Parameters
    ----------
    links_pred: int, pandas.MultiIndex
        The number of candidate record pairs or the pandas.MultiIndex with
        record pairs.
    *total: pandas.DataFrame object(s)
        The DataFrames are used to compute the full index size with the
        full_index_size function.

    Returns
    -------
    float
        The reduction ratio.

    """

    n_max = full_index_size(*total)

    if isinstance(links_pred, pandas.MultiIndex):
        links_pred = len(links_pred)

    if links_pred > n_max:
        raise ValueError("n has to be smaller of equal n_max")

    return 1 - links_pred / n_max 
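A worked toy calculation under made-up numbers: linking frames of 100 and 50 records gives a full index of 100 * 50 = 5000 pairs, so a blocking step that leaves 250 candidate pairs reduces the comparison space by 95%.

n_max = 100 * 50     # full index size when linking two frames (illustrative)
links_pred = 250     # candidate pairs left after blocking (illustrative)
print(1 - links_pred / n_max)  # 0.95
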
Example 30
Project: recordlinkage   Author: J535D165   File: measures.py    BSD 3-Clause "New" or "Revised" License
def true_negatives(links_true, links_pred, total):
    """Count the number of True Negatives.

    Returns the number of correctly predicted non-links, also called the
    number of True Negatives (TN).

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted links.
    total: int, pandas.MultiIndex
        The count of all record pairs (both links and non-links). When the
        argument is a pandas.MultiIndex, the length of the index is used.

    Returns
    -------
    int
        The number of correctly predicted non-links.

    """

    links_true = _get_multiindex(links_true)
    links_pred = _get_multiindex(links_pred)

    if isinstance(total, pandas.MultiIndex):
        total = len(total)

    return int(total) - len(links_true | links_pred) 
Example 31
Project: recordlinkage   Author: J535D165   File: measures.py    BSD 3-Clause "New" or "Revised" License
def recall(links_true, links_pred=None):
    """recall(links_true, links_pred)

    Compute the recall/sensitivity.

    The recall is given by TP/(TP+FN).

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) collection of links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted collection of links.

    Returns
    -------
    float
        The recall
    """

    if _isconfusionmatrix(links_true):

        confusion_matrix = links_true

        v = confusion_matrix[0, 0] \
            / (confusion_matrix[0, 0] + confusion_matrix[0, 1])
    else:

        tp = true_positives(links_true, links_pred)
        fn = false_negatives(links_true, links_pred)
        v = tp / (tp + fn)

    return float(v) 
Example 32
Project: recordlinkage   Author: J535D165   File: measures.py    BSD 3-Clause "New" or "Revised" License
def accuracy(links_true, links_pred=None, total=None):
    """accuracy(links_true, links_pred, total)

    Compute the accuracy.

    The accuracy is given by (TP+TN)/(TP+FP+TN+FN).

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) collection of links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted collection of links.
    total: int, pandas.MultiIndex
        The count of all record pairs (both links and non-links). When the
        argument is a pandas.MultiIndex, the length of the index is used.

    Returns
    -------
    float
        The accuracy
    """

    if isinstance(total, pandas.MultiIndex):
        total = len(total)

    if _isconfusionmatrix(links_true):

        confusion_matrix = links_true

        v = (confusion_matrix[0, 0] + confusion_matrix[1, 1]) \
            / numpy.sum(confusion_matrix)
    else:

        tp = true_positives(links_true, links_pred)
        tn = true_negatives(links_true, links_pred, total)

        v = (tp + tn) / total

    return float(v) 
Example 33
Project: recordlinkage   Author: J535D165   File: measures.py    BSD 3-Clause "New" or "Revised" License
def specificity(links_true, links_pred=None, total=None):
    """specificity(links_true, links_pred, total)

    Compute the specificity.

    The specificity is given by TN/(FP+TN).

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) collection of links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted collection of links.
    total: int, pandas.MultiIndex
        The count of all record pairs (both links and non-links). When the
        argument is a pandas.MultiIndex, the length of the index is used.

    Returns
    -------
    float
        The specificity

    """

    if _isconfusionmatrix(links_true):

        confusion_matrix = links_true

        v = confusion_matrix[1, 1] / \
            (confusion_matrix[1, 0] + confusion_matrix[1, 1])
    else:

        fp = false_positives(links_true, links_pred)

        if isinstance(total, pandas.MultiIndex):
            total = len(total)
        tn = true_negatives(links_true, links_pred, total)
        v = tn / (fp + tn)

    return float(v) 
Example 34
Project: recordlinkage   Author: J535D165   File: measures.py    BSD 3-Clause "New" or "Revised" License
def fscore(links_true, links_pred=None):
    """fscore(links_true, links_pred)

    Compute the F-score.

    The F-score is given by 2*(precision*recall)/(precision+recall).

    Parameters
    ----------
    links_true: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The true (or actual) collection of links.
    links_pred: pandas.MultiIndex, pandas.DataFrame, pandas.Series
        The predicted collection of links.

    Returns
    -------
    float
        The fscore

    Note
    ----
    If there are no pairs predicted as links, this measure will raise a
    ZeroDivisionError.

     """

    prec = precision(links_true, links_pred)
    rec = recall(links_true, links_pred)

    return float(2 * prec * rec / (prec + rec)) 
Example 35
Project: recordlinkage   Author: J535D165   File: test_indexing.py    BSD 3-Clause "New" or "Revised" License
def test_lower_triangular(self, index_class):

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)
        pairs = index_class.index(df_a)

        # expected
        levels = [df_a.index.values, df_a.index.values]
        codes = np.tril_indices(len(df_a.index), k=-1)

        if is_min_pandas_version("0.24.0"):
            full_pairs = pd.MultiIndex(
                levels=levels,
                codes=codes,
                verify_integrity=False
            )
        else:
            full_pairs = pd.MultiIndex(
                levels=levels,
                labels=codes,
                verify_integrity=False
            )            

        # all pairs are in the lower triangle of the matrix.
        assert len(pairs.difference(full_pairs)) == 0 
Example 36
Project: recordlinkage   Author: J535D165   File: test_indexing.py    BSD 3-Clause "New" or "Revised" License
def test_basic_dedup(self):
        """FULL: Test basic characteristics of full indexing (dedup)."""

        from recordlinkage.index import Full

        # finding duplicates
        index_cl = Full()
        pairs = index_cl.index(self.a)

        assert isinstance(pairs, pd.MultiIndex)
        assert len(pairs) == len(self.a) * (len(self.a) - 1) / 2
        assert pairs.is_unique 
Example 37
Project: recordlinkage   Author: J535D165   File: test_indexing.py    BSD 3-Clause "New" or "Revised" License
def test_basic_link(self):
        """FULL: Test basic characteristics of full indexing (link)."""

        from recordlinkage.index import Full

        # finding duplicates
        index_cl = Full()
        pairs = index_cl.index((self.a, self.b))

        assert isinstance(pairs, pd.MultiIndex)
        assert len(pairs) == len(self.a) * len(self.b)
        assert pairs.is_unique 
Example 38
Project: recordlinkage   Author: J535D165   File: test_datasets.py    BSD 3-Clause "New" or "Revised" License
def test_febrl_dedup_links(dataset, nrows, nlinks):

    df, links = dataset(return_links=True)
    assert isinstance(df, pandas.DataFrame)
    assert len(df) == nrows
    assert len(links) == nlinks
    assert isinstance(links, pandas.MultiIndex) 
Example 39
Project: recordlinkage   Author: J535D165   File: test_datasets.py    BSD 3-Clause "New" or "Revised" License
def test_random_comparison_vectors():
    # Test the generation of a random dataset

    n_record_pairs = 10000
    n_matches = 500

    df = binary_vectors(
        n_record_pairs, n_matches, m=[0.8] * 8, u=[0.2] * 8, random_state=535)

    # Check the result is a DataFrame with MultiIndex
    assert isinstance(df, pandas.DataFrame)
    assert isinstance(df.index, pandas.MultiIndex)

    # Test the length of the dataframe
    assert len(df) == n_record_pairs 
Example 40
Project: recordlinkage   Author: J535D165   File: test_classify.py    BSD 3-Clause "New" or "Revised" License
def test_return_result_options(self, classifier):

        cl = classifier()
        if isinstance(cl, tuple(UNSUPERVISED_CLASSIFIERS)):
            cl.fit(self.X_train)
        else:
            cl.fit(self.X_train, self.y_train)

        prediction_default = cl.predict(self.X_test)
        assert isinstance(prediction_default, pd.MultiIndex)

        with rl.option_context('classification.return_type', 'index'):
            prediction_multiindex = cl.predict(comparison_vectors=self.X_train)
            assert isinstance(prediction_multiindex, pd.MultiIndex)

        with rl.option_context('classification.return_type', 'array'):
            prediction_ndarray = cl.predict(comparison_vectors=self.X_train)
            assert isinstance(prediction_ndarray, np.ndarray)

        with rl.option_context('classification.return_type', 'series'):
            prediction_series = cl.predict(comparison_vectors=self.X_train)
            assert isinstance(prediction_series, pd.Series)

        with pytest.raises(ValueError):
            with rl.option_context('classification.return_type',
                                   'unknown_return_type'):
                cl.predict(
                    comparison_vectors=self.X_train
                ) 
Example 41
Project: recordlinkage   Author: J535D165   File: test_classify.py    BSD 3-Clause "New" or "Revised" License
def test_return_result_options_depr(self, classifier):

        cl = classifier()
        cl.fit(self.X_train, self.y_train)

        prediction_default = cl.predict(self.X_test)
        assert isinstance(prediction_default, pd.MultiIndex)

        with pytest.deprecated_call():
            prediction_multiindex = cl.predict(
                comparison_vectors=self.X_train, return_type='index')
            assert isinstance(prediction_multiindex, pd.MultiIndex)

        with pytest.deprecated_call():
            prediction_ndarray = cl.predict(
                comparison_vectors=self.X_train, return_type='array')
            assert isinstance(prediction_ndarray, np.ndarray)

        with pytest.deprecated_call():
            prediction_series = cl.predict(
                comparison_vectors=self.X_train,
                return_type='series')
            assert isinstance(prediction_series, pd.Series)

        with pytest.deprecated_call():
            with pytest.raises(ValueError):
                cl.predict(
                    comparison_vectors=self.X_train,
                    return_type='unknown_return_type'
                ) 
Example 42
Project: recordlinkage   Author: J535D165   File: test_classify.py    BSD 3-Clause "New" or "Revised" License
def test_fit_predict_unsupervised(self, classifier):

        cl = classifier()
        cl.fit(self.X_train)
        result = cl.predict(self.X_train)

        assert isinstance(result, pd.MultiIndex)

        cl2 = classifier()
        expected = cl2.fit_predict(self.X_train)

        assert isinstance(expected, pd.MultiIndex)
        assert result.values.shape == expected.values.shape

        ptm.assert_index_equal(result, expected) 
Example 43
Project: recordlinkage   Author: J535D165   File: test_classify.py    BSD 3-Clause "New" or "Revised" License
def test_kmeans(self):

        kmeans = rl.KMeansClassifier()
        kmeans.fit(self.X_train)
        result = kmeans.predict(self.X_test)

        assert isinstance(result, pd.MultiIndex)
        assert result.shape[0] == 11670 
Example 44
Project: pore-c   Author: nanoporetech   File: model.py    Mozilla Public License 2.0
def _validate(self, obj):
        for col in ["chrom", "start", "end"]:
            if col not in obj.columns:
                raise AttributeError("Must have columns 'chrom', 'start' and 'end'.")
        self.index_name = "index" if obj.index.name is None else obj.index.name
        assert obj.index.is_unique, "Must have a unique index"
        assert not isinstance(obj.index, pd.MultiIndex), "Can't be a MultiIndex"
        assert np.issubdtype(obj.index.dtype, np.integer), "Must have integer index: {}".format(obj.index.dtype) 
Example 45
Project: pore-c   Author: nanoporetech   File: model.py    Mozilla Public License 2.0
def _validate(self, obj):
        for col in ["chrom", "start", "end"]:
            if col not in obj.columns:
                raise AttributeError("Must have columns 'chrom', 'start' and 'end'.")
        self.index_name = "index" if obj.index.name is None else obj.index.name
        dupes = obj.index.duplicated(keep=False)
        if dupes.any():
            raise ValueError("Must have a unique index: {}".format(obj[dupes]))
        assert not isinstance(obj.index, pd.MultiIndex), "Can't be a MultiIndex"
        if not np.issubdtype(obj.index.dtype, np.integer):
            raise ValueError("Must have integer index: {}\n{}".format(obj.index.dtype, obj)) 
Example 46
Project: arctic   Author: man-group   File: multi_index.py    GNU Lesser General Public License v2.1
def groupby_asof(df, as_of=None, dt_col='sample_dt', asof_col='observed_dt'):
    ''' Common use case for selecting the latest rows from a bitemporal dataframe as-of a certain date.

    Parameters
    ----------
    df: ``pd.DataFrame``
        Dataframe with a MultiIndex index
    as_of: ``datetime``
        Return a timeseries with values observed <= this as-of date. By default, the latest observed
        values will be returned.
    dt_col: ``str`` or ``int``
        Name or index of the column in the MultiIndex that is the sample date
    asof_col: ``str`` or ``int``
        Name or index of the column in the MultiIndex that is the observed date
    '''
    if as_of:
        if as_of.tzinfo is None and df.index.get_level_values(asof_col).tz is not None:
            as_of = as_of.replace(tzinfo=mktz())
    return fancy_group_by(df,
                          grouping_level=dt_col,
                          aggregate_level=asof_col,
                          method='last',
                          max_=as_of)


# ----------------------- Insert/Append ---------------------------- # 
Example 47
Project: arctic   Author: man-group   File: multi_index.py    GNU Lesser General Public License v2.1
def multi_index_insert_row(df, index_row, values_row):
    """ Return a new dataframe with a row inserted for a multi-index dataframe.
        This will sort the rows according to the ordered multi-index levels.
    """
    row_index = pd.MultiIndex(levels=[[i] for i in index_row],
                              labels=[[0] for i in index_row])
    row = pd.DataFrame(values_row, index=row_index, columns=df.columns)
    df = pd.concat((df, row))
    if df.index.lexsort_depth == len(index_row) and df.index[-2] < df.index[-1]:
        # We've just appended a row to an already-sorted dataframe
        return df
    # The df wasn't sorted or the row has to be put in the middle somewhere
    return df.sort_index() 
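A usage sketch of the same insert-and-sort idea with plain pandas calls; the index entries and values are illustrative (note that modern pandas spells the MultiIndex constructor's labels argument codes).

import pandas as pd

idx = pd.MultiIndex.from_tuples([("2020-01-01", "obs1"), ("2020-01-02", "obs1")])
df = pd.DataFrame({"price": [1.0, 2.0]}, index=idx)

# Build a one-row frame keyed by the new multi-index entry, concatenate it,
# then sort so the index stays ordered.
row_index = pd.MultiIndex.from_tuples([("2020-01-03", "obs1")])
row = pd.DataFrame({"price": [3.0]}, index=row_index)
df = pd.concat((df, row)).sort_index()
print(len(df))  # 3
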
Example 48
Project: arctic   Author: man-group   File: test_pandas_store.py    GNU Lesser General Public License v2.1
def test_save_read_pandas_series_with_multiindex(library):
    df = Series(data=['A', 'BC', 'DEF'], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)]))
    library.write('pandas', df)
    saved_df = library.read('pandas').data
    assert np.all(df.values == saved_df.values) 
Example 49
Project: arctic   Author: man-group   File: test_pandas_store.py    GNU Lesser General Public License v2.1
def test_save_read_pandas_series_with_multiindex_and_name(library):
    df = Series(data=['A', 'BC', 'DEF'],
                index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)]),
                name='Foo')
    library.write('pandas', df)
    saved_df = library.read('pandas').data
    assert np.all(df.values == saved_df.values)
    assert df.name == 'Foo' 
Example 50
Project: arctic   Author: man-group   File: test_pandas_store.py    GNU Lesser General Public License v2.1
def test_save_read_pandas_dataframe_with_multiindex(library):
    df = DataFrame(data=['A', 'BC', 'DEF'], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)]))
    library.write('pandas', df)
    saved_df = library.read('pandas').data
    assert np.all(df.values == saved_df.values)