```# measures.py

import numpy

import pandas

def _get_multiindex(x):
    """Return the index object associated with ``x``.

    Parameters
    ----------
    x : pandas.DataFrame, pandas.Series or pandas.MultiIndex
        A ``pandas.MultiIndex`` is handed back unchanged; for a DataFrame
        or Series the ``index`` attribute is returned as-is.

    Returns
    -------
    pandas.Index
        ``x`` itself, or ``x.index``.

    Raises
    ------
    ValueError
        If ``x`` is none of the supported pandas types.
    """
    # Guard clauses: the supported types are mutually exclusive, so the
    # order of the checks does not matter.
    if isinstance(x, pandas.MultiIndex):
        return x

    if isinstance(x, (pandas.DataFrame, pandas.Series)):
        return x.index

    raise ValueError("Expected one of: pandas.DataFrame, "
                     "pandas.Series, pandas.MultiIndex")

def _isconfusionmatrix(x):
    """Tell whether ``x`` has the layout of a 2x2 confusion matrix.

    Parameters
    ----------
    x : object
        Candidate value. Only ``numpy.ndarray`` instances and lists are
        considered; everything else is rejected.

    Returns
    -------
    bool
        True when ``x`` is a ``numpy.ndarray`` of shape (2, 2) or a list
        that converts to an array of shape (2, 2); False otherwise.
    """
    if isinstance(x, numpy.ndarray):
        return x.shape == (2, 2)

    if isinstance(x, list):
        # numpy.array converts the list *contents*. The low-level
        # numpy.ndarray constructor would interpret the list as a shape,
        # which fails on nested lists and wrongly accepts ``[2, 2]``.
        try:
            return numpy.array(x).shape == (2, 2)
        except ValueError:
            # Ragged nested lists cannot form a rectangular array.
            return False

    return False

"""Compute the reduction ratio.

The reduction ratio is 1 minus the ratio of candidate matches to the maximum
number of pairs possible.

Parameters
----------
The number of candidate record pairs or the pandas.MultiIndex with
record pairs.
*total: pandas.DataFrame object(s)
The DataFrames are used to compute the full index size with the
full_index_size function.

Returns
-------
float
The reduction ratio.

"""

n_max = full_index_size(*total)

raise ValueError("n has to be smaller of equal n_max")

return 1 - links_pred / n_max

def max_pairs(shape):
    """[DEPRECATED] Compute the maximum number of record pairs possible.

    Parameters
    ----------
    shape : object, or tuple/list of objects
        Either a single dataset descriptor or a tuple/list of them. Each
        descriptor is turned into a size with ``get_length``.

    Returns
    -------
    int
        For a single descriptor (passed bare or as a one-element
        tuple/list) with size ``x``: ``x * (x - 1) / 2``. Otherwise the
        product of all sizes as returned by ``numpy.prod``.
    """
    # Normalise the input so a bare descriptor and a one-element sequence
    # share the same code path.
    if isinstance(shape, (tuple, list)):
        datasets = shape
    else:
        datasets = [shape]

    if len(datasets) == 1:
        x = get_length(datasets[0])
        # Floor division keeps the arithmetic in integers. x * (x - 1) is
        # always even, and true division would round through a float and
        # lose exactness for very large datasets.
        return int(x * (x - 1) // 2)

    return numpy.prod([get_length(xi) for xi in datasets])

def full_index_size(*args):
    """Compute the number of record pairs in a full index.

    Compute the size of a full index without building the index itself.
    The result is the maximum number of record pairs possible. This
    function is especially useful in measures like the `reduction_ratio`.

    Deduplication: Given a DataFrame A with length N, the full index size is
    N*(N-1)/2. Linking: Given a DataFrame A with length N and a DataFrame B
    with length M, the full index size is N*M.

    Parameters
    ----------
    *args: int, pandas.MultiIndex, pandas.Series, pandas.DataFrame
        A pandas object or an int representing the length of a dataset to
        link. When there is one argument, it is assumed that the record
        linkage is a deduplication process. A single list or tuple
        argument is unpacked and treated as if its items had been passed
        as separate arguments.

    Returns
    -------
    int
        The number of record pairs in the full index. For two or more
        datasets this is the value returned by ``numpy.prod``.

    Examples
    --------

    Use integers:
    >>> full_index_size(10)  # deduplication: 45 pairs
    >>> full_index_size(10, 10)  # linking: 100 pairs

    or pandas objects
    >>> full_index_size(DF)  # deduplication: len(DF)*(len(DF)-1)/2 pairs
    >>> full_index_size(DF, DF)  # linking: len(DF)*len(DF) pairs

    """

    # Accept full_index_size([a, b]) as a synonym for full_index_size(a, b).
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = tuple(args[0])

    if len(args) == 1:
        n = get_length(args[0])
        # n * (n - 1) is always even, so floor division is exact. True
        # division would go through a float and lose precision once the
        # product exceeds 2**53.
        return int(n * (n - 1) // 2)

    return numpy.prod([get_length(arg) for arg in args])

"""Count the number of True Positives.

Returns the number of correctly predicted links, also called the number of
True Positives (TP).

Parameters
----------

Returns
-------
int
The number of correctly predicted links.
"""

"""Count the number of True Negatives.

Returns the number of correctly predicted non-links, also called the
number of True Negatives (TN).

Parameters
----------
total: int, pandas.MultiIndex
The count of all record pairs (both links and non-links). When the
argument is a pandas.MultiIndex, the length of the index is used.

Returns
-------
int
The number of correctly predicted non-links.

"""

if isinstance(total, pandas.MultiIndex):
total = len(total)

"""Count the number of False Positives.

Returns the number of incorrect predictions of true non-links. (true non-
links, but predicted as links). This value is known as the number of False
Positives (FP).

Parameters
----------

Returns
-------
int
The number of false positives.

"""

"""Count the number of False Negatives.

but predicted as non-links). This value is known as the number of False
Negatives (FN).

Parameters
----------

Returns
-------
int
The number of false negatives.

"""

"""Compute the confusion matrix.

The confusion matrix is of the following form:

+----------------------+-----------------------+----------------------+
|                      |  Predicted Positives  | Predicted Negatives  |
+======================+=======================+======================+
| **True Positives**   | True Positives (TP)   | False Negatives (FN) |
+----------------------+-----------------------+----------------------+
| **True Negatives**   | False Positives (FP)  | True Negatives (TN)  |
+----------------------+-----------------------+----------------------+

The confusion matrix is an informative way to analyse a prediction. The
matrix can be used to compute measures like precision and recall. The count
of true positives is [0,0], false negatives is [0,1], true negatives
is [1,1] and false positives is [1,0].

Parameters
----------
total: int, pandas.MultiIndex
The count of all record pairs (both links and non-links). When the
argument is a pandas.MultiIndex, the length of the index is used. If
the total is None, the number of True Negatives is not computed.
Default None.

Returns
-------
numpy.array
The confusion matrix with TP, TN, FN, FP values.

Note
----
The number of True Negatives is computed based on the total argument.
This argument is the number of record pairs of the entire matrix.

"""

if total is None:
tn = numpy.nan
else:

if isinstance(total, pandas.MultiIndex):
total = len(total)

return numpy.array([[tp, fn], [fp, tn]])

Compute the precision.

The precision is given by TP/(TP+FP).

Parameters
----------
The true (or actual) collection of links.

Returns
-------
float
The precision
"""

v = confusion_matrix[0, 0] \
/ (confusion_matrix[0, 0] + confusion_matrix[1, 0])
else:

v = tp / (tp + fp)

return float(v)

Compute the recall/sensitivity.

The recall is given by TP/(TP+FN).

Parameters
----------
The true (or actual) collection of links.

Returns
-------
float
The recall
"""

v = confusion_matrix[0, 0] \
/ (confusion_matrix[0, 0] + confusion_matrix[0, 1])
else:

v = tp / (tp + fn)

return float(v)

Compute the accuracy.

The accuracy is given by (TP+TN)/(TP+FP+TN+FN).

Parameters
----------
The true (or actual) collection of links.
total: int, pandas.MultiIndex
The count of all record pairs (both links and non-links). When the
argument is a pandas.MultiIndex, the length of the index is used.

Returns
-------
float
The accuracy
"""

if isinstance(total, pandas.MultiIndex):
total = len(total)

v = (confusion_matrix[0, 0] + confusion_matrix[1, 1]) \
/ numpy.sum(confusion_matrix)
else:

v = (tp + tn) / total

return float(v)

Compute the specificity.

The specificity is given by TN/(FP+TN).

Parameters
----------
The true (or actual) collection of links.
total: int, pandas.MultiIndex
The count of all record pairs (both links and non-links). When the
argument is a pandas.MultiIndex, the length of the index is used.

Returns
-------
float
The specificity

"""

v = confusion_matrix[1, 1] / \
(confusion_matrix[1, 0] + confusion_matrix[1, 1])
else:

if isinstance(total, pandas.MultiIndex):
total = len(total)
v = tn / (fp + tn)

return float(v)

Compute the F-score.

The F-score is given by 2*(precision*recall)/(precision+recall).

Parameters
----------
The true (or actual) collection of links.

Returns
-------
float
The fscore

Note
----
If there are no pairs predicted as links, this measure will raise a
ZeroDivisionError.

"""