# Python source code of the compare module.

from functools import partial

import numpy

import pandas

from recordlinkage.algorithms.distance import _1d_distance
from recordlinkage.algorithms.distance import _haversine_distance
from recordlinkage.algorithms.numeric import _exp_sim
from recordlinkage.algorithms.numeric import _gauss_sim
from recordlinkage.algorithms.numeric import _linear_sim
from recordlinkage.algorithms.numeric import _squared_sim
from recordlinkage.algorithms.numeric import _step_sim
from recordlinkage.algorithms.string import cosine_similarity
from recordlinkage.algorithms.string import damerau_levenshtein_similarity
from recordlinkage.algorithms.string import jaro_similarity
from recordlinkage.algorithms.string import jarowinkler_similarity
from recordlinkage.algorithms.string import levenshtein_similarity
from recordlinkage.algorithms.string import longest_common_substring_similarity
from recordlinkage.algorithms.string import qgram_similarity
from recordlinkage.algorithms.string import smith_waterman_similarity
from recordlinkage.base import BaseCompareFeature
from recordlinkage.utils import fillna as _fillna


class Exact(BaseCompareFeature):
    """Compare the record pairs exactly.

    This class is used to compare records in an exact way. With the default
    settings the similarity is 1 in case of agreement and 0 otherwise.

    Parameters
    ----------

    left_on : str or int
        Field name to compare in left DataFrame.
    right_on : str or int
        Field name to compare in right DataFrame.
    agree_value : float, str, numpy.dtype
        The value when two records are identical. Default 1. If 'value'
        (or its alias 'values') is passed, the compared value of the left
        record is used as the result for agreeing record pairs.
    disagree_value : float, str, numpy.dtype
        The value when two records are not identical. Default 0.
    missing_value : float, str, numpy.dtype
        The value for a comparison with a missing value. Default 0.
    label : optional
        Forwarded unchanged to the ``BaseCompareFeature`` constructor.
        Default None.

    """

    name = "exact"
    description = "Compare attributes of record pairs."

    def __init__(self,
                 left_on,
                 right_on,
                 agree_value=1,
                 disagree_value=0,
                 missing_value=0,
                 label=None):
        super(Exact, self).__init__(left_on, right_on, label=label)

        self.agree_value = agree_value
        self.disagree_value = disagree_value
        self.missing_value = missing_value

    def _compute_vectorized(self, s_left, s_right):
        """Return the exact-comparison result for two aligned Series.

        Parameters
        ----------
        s_left : pandas.Series
            Values of the left records.
        s_right : pandas.Series
            Values of the right records.

        Returns
        -------
        pandas.Series
            One entry per record pair, holding the agree, disagree or
            missing value.
        """

        # Values or agree/disagree. Both spellings are accepted because the
        # documentation historically advertised 'values' while the code
        # only recognised 'value'.
        if self.agree_value in ('value', 'values'):
            # Agreeing pairs keep the left value; the rest is overwritten.
            compare = s_left.copy()
            compare[s_left != s_right] = self.disagree_value

        else:
            compare = pandas.Series(self.disagree_value, index=s_left.index)
            compare[s_left == s_right] = self.agree_value

        # Only when disagree value is not identical with the missing value.
        # A missing value never compares equal, so those pairs already hold
        # the disagree value at this point.
        if self.disagree_value != self.missing_value:
            compare[(s_left.isnull() | s_right.isnull())] = self.missing_value

        return compare


class String(BaseCompareFeature):
    """Compute the (partial) similarity between string values.

    Two string columns are compared with an approximate string matching
    algorithm selected through ``method``. Agreement gives a similarity
    of 1 and complete disagreement gives 0. The Python Record Linkage
    Toolkit uses the `jellyfish` package for the Jaro, Jaro-Winkler,
    Levenshtein and Damerau-Levenshtein algorithms.

    Parameters
    ----------
    left_on : str or int
        The name or position of the column in the left DataFrame.
    right_on : str or int
        The name or position of the column in the right DataFrame.
    method : str, default 'levenshtein'
        The approximate string comparison method. Accepted names are
        'jaro'; 'jarowinkler', 'jaro_winkler' or 'jw'; 'levenshtein';
        'dameraulevenshtein', 'damerau_levenshtein' or 'dl'; 'q_gram' or
        'qgram'; 'cosine'; 'smith_waterman', 'smithwaterman' or 'sw';
        'longest_common_substring' or 'lcs'.
    threshold : float, optional
        When given, similarities greater than or equal to the threshold
        become 1 and all other non-missing similarities become 0.
        Default None, which leaves the similarities untouched.
    missing_value : numpy.dtype
        The value for a comparison with a missing value. Default 0.
    label : optional
        Forwarded unchanged to the ``BaseCompareFeature`` constructor.
        Default None.
    """

    name = "string"
    description = "Compare string attributes of record pairs."

    def __init__(self,
                 left_on,
                 right_on,
                 method='levenshtein',
                 threshold=None,
                 missing_value=0.0,
                 label=None):
        super(String, self).__init__(left_on, right_on, label=label)

        self.missing_value = missing_value
        self.threshold = threshold
        self.method = method

    def _compute_vectorized(self, s_left, s_right):
        """Return the string similarity of two aligned Series.

        Raises
        ------
        ValueError
            If ``method`` is not one of the known algorithm names.
        """

        # Each entry pairs the accepted spellings with the implementation.
        algorithms = (
            (['jaro'], jaro_similarity),
            (['jarowinkler', 'jaro_winkler', 'jw'], jarowinkler_similarity),
            (['levenshtein'], levenshtein_similarity),
            (['dameraulevenshtein', 'damerau_levenshtein', 'dl'],
             damerau_levenshtein_similarity),
            (['q_gram', 'qgram'], qgram_similarity),
            (['cosine'], cosine_similarity),
            (['smith_waterman', 'smithwaterman', 'sw'],
             smith_waterman_similarity),
            (['longest_common_substring', 'lcs'],
             longest_common_substring_similarity),
        )

        for aliases, candidate in algorithms:
            if self.method in aliases:
                similarity_func = candidate
                break
        else:
            raise ValueError("The algorithm '{}' is not known.".format(
                self.method))

        similarity = similarity_func(s_left, s_right)

        limit = self.threshold
        if limit is not None:
            # Missing similarities are kept as they are in both passes so
            # that they can be replaced by the missing value afterwards.
            missing = pandas.isnull(similarity)
            similarity = similarity.where(
                (similarity < limit) | missing, other=1.0)
            similarity = similarity.where(
                (similarity >= limit) | missing, other=0.0)

        return _fillna(similarity, self.missing_value)

class Numeric(BaseCompareFeature):
    """Compute the (partial) similarity between numeric values.

    The distance between two numeric columns is turned into a similarity
    with one of the algorithms 'step', 'linear', 'exp', 'gauss' or
    'squared'. Agreement gives a similarity of 1 and complete disagreement
    gives 0. The implementation is similar with numeric comparing in
    ElasticSearch, a full-text search tool. The parameters are explained in
    the image below (source ElasticSearch, The Definitive Guide)

    .. image:: /images/elas_1705.png
        :width: 100%
        :target: https://www.elastic.co/guide/en/elasticsearch/
            guide/current/decay-functions.html
        :alt: Decay functions, like in ElasticSearch

    Parameters
    ----------
    left_on : str or int
        The name or position of the column in the left DataFrame.
    right_on : str or int
        The name or position of the column in the right DataFrame.
    method : str
        The metric used. Options 'step', 'linear' (or 'lin'), 'exp' (or
        'exponential'), 'gauss' (or 'gaussian') or 'squared'. Default
        'linear'.
    offset : float
        The offset. See image above. Default 0.0.
    scale : float
        The scale of the numeric comparison method. See the image above.
        This argument is not available for the 'step' algorithm.
        Default 1.0.
    origin : float
        The shift of bias between the values. See image above.
        Default 0.0.
    missing_value : numpy.dtype
        The value if one or both records have a missing value on the
        compared field. Default 0.
    label : optional
        Forwarded unchanged to the ``BaseCompareFeature`` constructor.
        Default None.

    Note
    ----
    Numeric comparing can be an efficient way to compare date/time
    variables. This can be done by comparing the timestamps.

    """

    name = "numeric"
    description = "Compare numeric attributes of record pairs."

    def __init__(self,
                 left_on,
                 right_on,
                 method='linear',
                 offset=0.0,
                 scale=1.0,
                 origin=0.0,
                 missing_value=0.0,
                 label=None):
        super(Numeric, self).__init__(left_on, right_on, label=label)

        self.method = method
        self.offset, self.scale, self.origin = offset, scale, origin
        self.missing_value = missing_value

    def _compute_vectorized(self, s_left, s_right):
        """Return the numeric similarity of two aligned Series.

        Raises
        ------
        ValueError
            If ``method`` is not one of the known algorithm names.
        """

        distance = _1d_distance(s_left, s_right)
        chosen = self.method

        if chosen == 'step':
            # The step function is the only one that takes no scale.
            similarity = _step_sim(distance, self.offset, self.origin)
        else:
            if chosen in ['linear', 'lin']:
                decay = _linear_sim
            elif chosen == 'squared':
                decay = _squared_sim
            elif chosen in ['exp', 'exponential']:
                decay = _exp_sim
            elif chosen in ['gauss', 'gaussian']:
                decay = _gauss_sim
            else:
                raise ValueError("The algorithm '{}' is not known.".format(
                    self.method))

            similarity = decay(distance, self.scale, self.offset,
                               self.origin)

        return _fillna(similarity, self.missing_value)


class Geographic(BaseCompareFeature):
    """Compute the (partial) similarity between WGS84 coordinate values.

    Compare the geometric (haversine) distance between two WGS-
    coordinates. The similarity algorithms are 'step', 'linear', 'exp',
    'gauss' or 'squared'. The similarity functions are the same as in
    :meth:`recordlinkage.comparing.Compare.numeric`

    Parameters
    ----------
    left_on_lat : str or int
        The name or position of the latitude in the
        left DataFrame.
    left_on_lng : str or int
        The name or position of the longitude in the
        left DataFrame.
    right_on_lat : str or int
        The name or position of the latitude in the
        right DataFrame.
    right_on_lng : str or int
        The name or position of the longitude in the
        right DataFrame.
    method : str, optional
        The metric used. Options 'step', 'linear' (or 'lin'), 'exp' (or
        'exponential'), 'gauss' (or 'gaussian') or 'squared'. The default
        None selects 'linear'.
    offset : float
        The offset. See Compare.numeric. Default 0.0.
    scale : float
        The scale of the numeric comparison method. See Compare.numeric.
        This argument is not available for the 'step' algorithm.
        Default 1.0.
    origin : float
        The shift of bias between the values. See Compare.numeric.
        Default 0.0.
    missing_value : numpy.dtype
        The value for a comparison with a missing value. Default 0.
    label : optional
        Forwarded unchanged to the ``BaseCompareFeature`` constructor.
        Default None.
    """
    name = "geographic"
    description = "Compare geographic attributes of record pairs."

    def __init__(self,
                 left_on_lat,
                 left_on_lng,
                 right_on_lat,
                 right_on_lng,
                 method=None,
                 offset=0.0,
                 scale=1.0,
                 origin=0.0,
                 missing_value=0.0,
                 label=None):
        # Latitude and longitude are handed to the base class as one
        # (lat, lng) pair per DataFrame.
        super(Geographic, self).__init__(
            (left_on_lat, left_on_lng), (right_on_lat, right_on_lng),
            label=label)

        self.method = method
        self.offset = offset
        self.scale = scale
        self.origin = origin
        self.missing_value = missing_value

    def _compute_vectorized(self, lat1, lng1, lat2, lng2):
        """Return the similarity derived from the haversine distance.

        Parameters
        ----------
        lat1, lng1 : pandas.Series
            Latitude and longitude of the left records.
        lat2, lng2 : pandas.Series
            Latitude and longitude of the right records.

        Raises
        ------
        ValueError
            If ``method`` is not one of the known algorithm names.
        """

        d = _haversine_distance(lat1, lng1, lat2, lng2)

        # The constructor default is None. Fall back to the documented
        # default algorithm instead of failing with "'None' is not known".
        method = 'linear' if self.method is None else self.method

        if method == 'step':
            num_sim_alg = partial(_step_sim, d, self.offset, self.origin)
        elif method in ['linear', 'lin']:
            num_sim_alg = partial(_linear_sim, d, self.scale, self.offset,
                                  self.origin)
        elif method == 'squared':
            num_sim_alg = partial(_squared_sim, d, self.scale, self.offset,
                                  self.origin)
        elif method in ['exp', 'exponential']:
            num_sim_alg = partial(_exp_sim, d, self.scale, self.offset,
                                  self.origin)
        elif method in ['gauss', 'gaussian']:
            num_sim_alg = partial(_gauss_sim, d, self.scale, self.offset,
                                  self.origin)
        else:
            raise ValueError("The algorithm '{}' is not known.".format(
                self.method))

        c = num_sim_alg()
        c = _fillna(c, self.missing_value)

        return c


class Date(BaseCompareFeature):
    """Compute the (partial) similarity between date values.

    Identical dates get a similarity of 1. Dates that differ only by a
    swapped month and day, or by one of the configured month confusions,
    get a partial similarity. All other pairs get 0.

    Parameters
    ----------
    left_on : str or int
        The name or position of the column in the left DataFrame.
    right_on : str or int
        The name or position of the column in the right DataFrame.
    swap_month_day : float
        The value if the month and day are swapped. Default 0.5. A falsy
        value (such as 0) disables this check.
    swap_months : list of tuples
        A list of tuples with common errors caused by the translating of
        months into numbers, i.e. October is month 10. The format of the
        tuples is (month_good, month_bad, value). Default : swap_months =
        [(6, 7, 0.5), (7, 6, 0.5), (9, 10, 0.5), (10, 9, 0.5)]. A falsy
        value disables this check.
    errors : str
        Stored on the instance as ``errors``. The comparison implemented
        in this class does not read it. Default 'coerce'.
    missing_value : numpy.dtype
        The value for a comparison with a missing value. Default 0.0.
    label : optional
        Forwarded unchanged to the ``BaseCompareFeature`` constructor.
        Default None.

    """

    name = "date"
    description = "Compare date attributes of record pairs."

    def __init__(self,
                 left_on,
                 right_on,
                 swap_month_day=0.5,
                 swap_months='default',
                 errors='coerce',
                 missing_value=0.0,
                 label=None):
        super(Date, self).__init__(left_on, right_on, label=label)

        self.missing_value = missing_value
        self.swap_months = swap_months
        self.swap_month_day = swap_month_day
        self.errors = errors

    def _compute_vectorized(self, s_left, s_right):
        """Return the date similarity of two aligned datetime Series.

        Returns
        -------
        pandas.Series
            A float64 Series with one similarity per record pair.

        Raises
        ------
        ValueError
            If a column is not of dtype ``datetime64[ns]`` or if
            ``swap_months`` is not a sequence of 3-item entries.
        """

        # validate datatypes
        if str(s_left.dtype) != 'datetime64[ns]':
            raise ValueError('Left column is not of type datetime64[ns]')

        if str(s_right.dtype) != 'datetime64[ns]':
            raise ValueError('Right column is not of type datetime64[ns]')

        # Start with float64: the partial similarities assigned below are
        # fractional, and writing them into an integer Series would make
        # the result dtype depend on the data (and is an incompatible-dtype
        # assignment in recent pandas versions).
        c = (s_left == s_right).astype(numpy.float64)

        same_year = s_left.dt.year == s_right.dt.year

        # The case in which there is a swap_month_day value given.
        if self.swap_month_day:

            c[same_year &
              (s_left.dt.month == s_right.dt.day) &
              (s_left.dt.day == s_right.dt.month) &
              (c != 1)] = self.swap_month_day

        # Work on a local so that computing never rewrites the configured
        # ``self.swap_months``.
        swap_months = self.swap_months

        if swap_months:

            if swap_months == 'default':
                swap_months = [(6, 7, 0.5), (7, 6, 0.5), (9, 10, 0.5),
                               (10, 9, 0.5)]
            else:
                try:
                    well_formed = all(len(x) == 3 for x in swap_months)
                except TypeError:
                    # Not iterable, or an entry without a length.
                    well_formed = False

                if not well_formed:
                    raise ValueError(
                        'swap_months must be a list of (first month, '
                        'second month, value) tuples or lists.')

            same_day = s_left.dt.day == s_right.dt.day

            for month1, month2, value in swap_months:

                c[same_year &
                  (s_left.dt.month == month1) &
                  (s_right.dt.month == month2) &
                  same_day & (c != 1)] = value

        c[s_left.isnull() | s_right.isnull()] = self.missing_value
        return c


class Variable(BaseCompareFeature):
    """Add a variable of the dataframe as feature.

    The selected columns are passed through as features; only their
    missing values are replaced.

    Parameters
    ----------
    left_on : str or int
        The name or position of the column in the left DataFrame.
        Default None.
    right_on : str or int
        The name or position of the column in the right DataFrame.
        Default None.
    missing_value : numpy.dtype
        The value for a comparison with a missing value. Default 0.0.
    label : optional
        Forwarded unchanged to the ``BaseCompareFeature`` constructor.
        Default None.

    """

    name = "variable"
    description = "Add a variable of the dataframe to the features."

    def __init__(self,
                 left_on=None,
                 right_on=None,
                 missing_value=0.0,
                 label=None):
        super(Variable, self).__init__(left_on, right_on, label=label)

        self.missing_value = missing_value

    def _compute_vectorized(self, *data):
        """Return every given column with its missing values filled.

        Parameters
        ----------
        *data
            The columns to pass through as features.

        Returns
        -------
        tuple
            One filled column per input column, in the same order.
        """

        # ``data`` is always a tuple because it is collected through
        # ``*data``, so every column is handled the same way.
        return tuple(_fillna(col, self.missing_value) for col in data)


class VariableA(Variable):
    """Add a variable of the left dataframe as feature.

    Convenience subclass of ``Variable`` that selects a column of the
    left DataFrame only.

    Parameters
    ----------
    on : str or int
        The name or position of the column in the left DataFrame.
        Default None.
    missing_value : numpy.dtype
        The value for a comparison with a missing value. Default 0.0.
    label : optional
        Forwarded unchanged to the ``Variable`` constructor. Default None.

    """

    name = "variable"
    description = "Add a variable of the left dataframe to the features."

    def __init__(self, on=None, missing_value=0.0, label=None):
        # Only the left side is set; the right side stays empty.
        super(VariableA, self).__init__(
            left_on=on,
            right_on=None,
            missing_value=missing_value,
            label=label)


class VariableB(Variable):
    """Add a variable of the right dataframe as feature.

    Convenience subclass of ``Variable`` that selects a column of the
    right DataFrame only.

    Parameters
    ----------
    on : str or int
        The name or position of the column in the right DataFrame.
        Default None.
    missing_value : numpy.dtype
        The value for a comparison with a missing value. Default 0.0.
    label : optional
        Forwarded unchanged to the ``Variable`` constructor. Default None.

    """

    name = "variable"
    description = "Add a variable of the right dataframe to the features."

    def __init__(self, on=None, missing_value=0.0, label=None):
        # Only the right side is set; the left side stays empty.
        super(VariableB, self).__init__(
            left_on=None,
            right_on=on,
            missing_value=missing_value,
            label=label)


class Frequency(BaseCompareFeature):
    """Compute the (relative) frequency of each variable.

    Every value of a column is replaced by the number of times it occurs
    in that column, optionally divided by the length of the column.

    Parameters
    ----------
    left_on : str or int
        The name or position of the column in the left DataFrame.
        Default None.
    right_on : str or int
        The name or position of the column in the right DataFrame.
        Default None.
    normalise : bool
        Normalise the outcome. This is needed for good result in many
        classification models. Default True.
    missing_value : numpy.dtype
        The value for a comparison with a missing value. Default 0.0.
    label : optional
        Forwarded unchanged to the ``BaseCompareFeature`` constructor.
        Default None.

    """

    name = "frequency"
    description = "Compute the frequency."

    def __init__(self,
                 left_on=None,
                 right_on=None,
                 normalise=True,
                 missing_value=0.0,
                 label=None):
        super(Frequency, self).__init__(left_on, right_on, label=label)

        self.normalise = normalise
        self.missing_value = missing_value

    def _compute_frequency(self, col):
        """Return the frequency of each value of ``col``.

        Parameters
        ----------
        col : pandas.Series
            The column whose values are counted.

        Returns
        -------
        pandas.Series
            A float64 Series aligned with ``col``. Missing entries of
            ``col`` hold ``missing_value``.
        """

        # Count the non-missing values and look the count up per row.
        # Missing values are not counted, so they map to NaN here. No
        # sentinel string is needed that could collide with a real value.
        counts = col.value_counts(sort=False)
        c = col.map(counts).astype(numpy.float64)

        if self.normalise:
            # The denominator is the full column length, including rows
            # with a missing value.
            c = c / len(col)

        # replace missing values
        c[col.isnull()] = self.missing_value

        return c

    def _compute_vectorized(self, *data):
        """Return the frequencies of every given column.

        Returns
        -------
        tuple
            One frequency Series per input column, in the same order.
        """

        # ``data`` is always a tuple because it is collected through
        # ``*data``, so every column is handled the same way.
        return tuple(self._compute_frequency(col) for col in data)


class FrequencyA(Frequency):
    """Compute the frequency of a variable in the left dataframe.

    Convenience subclass of ``Frequency`` that selects a column of the
    left DataFrame only.

    Parameters
    ----------
    on : str or int
        The name or position of the column in the left DataFrame.
        Default None.
    normalise : bool
        Normalise the outcome. This is needed for good result in many
        classification models. Default True.
    missing_value : numpy.dtype
        The value for a comparison with a missing value. Default 0.0.
    label : optional
        Forwarded unchanged to the ``Frequency`` constructor. Default None.

    """

    name = "frequency"
    description = "Compute the frequency."

    def __init__(self, on=None, normalise=True, missing_value=0.0, label=None):
        # Only the left side is set; the right side stays empty.
        super(FrequencyA, self).__init__(
            left_on=on,
            right_on=None,
            normalise=normalise,
            missing_value=missing_value,
            label=label)


class FrequencyB(Frequency):
    """Compute the frequency of a variable in the right dataframe.

    Convenience subclass of ``Frequency`` that selects a column of the
    right DataFrame only.

    Parameters
    ----------
    on : str or int
        The name or position of the column in the right DataFrame.
        Default None.
    normalise : bool
        Normalise the outcome. This is needed for good result in many
        classification models. Default True.
    missing_value : numpy.dtype
        The value for a comparison with a missing value. Default 0.0.
    label : optional
        Forwarded unchanged to the ``Frequency`` constructor. Default None.

    """

    name = "frequency"
    description = "Compute the frequency."

    def __init__(self, on=None, normalise=True, missing_value=0.0, label=None):
        # Only the right side is set; the left side stays empty.
        super(FrequencyB, self).__init__(
            left_on=None,
            right_on=on,
            normalise=normalise,
            missing_value=missing_value,
            label=label)