Source code for faster.evaluation
import numpy as np
import matplotlib.pyplot as plt
[docs]
class Evaluation():
    """
    A class for evaluating the accuracy and uncertainty inherent in the estimates of the Fellegi-Sunter model.

    :param Lambda: Unconditional match probability.
    :type Lambda: float
    :param Ksi: Array containing the conditional match probabilities for each pattern of discrete similarity levels across variables.
    :type Ksi: numpy.ndarray
    :param Counts: Array containing the observed counts for each pattern of discrete similarity levels across the compared variables.
    :type Counts: numpy.ndarray
    """

    def __init__(self, Lambda: float, Ksi: np.ndarray, Counts: np.ndarray):
        # Inputs are stored untouched; the rate methods coerce them with
        # ``np.asarray`` at call time, so plain sequences are accepted too.
        self.Lambda = Lambda
        self.Ksi = Ksi
        self.Counts = Counts

    def FDR(self, S: float):
        """
        Compute the False Discovery Rate (FDR) at threshold ``S``.

        :param S: Threshold value used to calculate the False Discovery Rate (FDR).
        :type S: float
        :return: The False Discovery Rate (FDR), defined as the expected proportion of false matches among all pairs with a conditional match probability greater than or equal to the threshold ``S``. ``nan`` when no pair reaches the threshold; ``None`` when the stored inputs cannot be combined numerically (non-numeric values or incompatible shapes).
        :rtype: float or None
        """
        try:
            ksi = np.asarray(self.Ksi)
            counts = np.asarray(self.Counts)
            # Number of pairs in each pattern that are declared a match.
            selected = (ksi >= S) * counts
            expected_false = np.sum((1 - ksi) * selected)
            # With nothing selected this is 0/0: NumPy yields nan, which
            # Optimal_Threshold handles, so the warning is just noise.
            with np.errstate(divide="ignore", invalid="ignore"):
                return expected_false / np.sum(selected)
        except (TypeError, ValueError):
            return None

    def FNR(self, S: float):
        """
        Compute the False Negative Rate (FNR) at threshold ``S``.

        :param S: Threshold value used to calculate the False Negative Rate (FNR).
        :type S: float
        :return: The False Negative Rate (FNR), defined as the expected number of true matches with a conditional match probability less than the threshold ``S``, divided by the expected total number of true matches (``Lambda * sum(Counts)``). ``None`` when the stored inputs cannot be combined numerically (non-numeric values or incompatible shapes).
        :rtype: float or None
        """
        try:
            ksi = np.asarray(self.Ksi)
            counts = np.asarray(self.Counts)
            expected_missed = np.sum(ksi * (ksi < S) * counts)
            # The whole product is the denominator; without the parentheses
            # the numerator would be divided by Lambda and then *multiplied*
            # by the number of pairs.
            expected_matches = self.Lambda * np.sum(counts)
            with np.errstate(divide="ignore", invalid="ignore"):
                return expected_missed / expected_matches
        except (TypeError, ValueError):
            return None

    def Frontier(self):
        """
        Calculates the False Discovery Rate (FDR) and False Negative Rate (FNR) for all thresholds between 0 and 1 with increments of 1e-3, and displays the resulting frontier curve.
        """
        thresholds = [s / 1000 for s in range(1001)]
        fdr_values = [self.FDR(threshold) for threshold in thresholds]
        fnr_values = [self.FNR(threshold) for threshold in thresholds]
        plt.plot(fdr_values, fnr_values, ".-")
        plt.xlabel("False Discovery Rate (FDR)")
        plt.ylabel("False Negative Rate (FNR)")
        plt.show()

    def Optimal_Threshold(self, Alpha: float):
        """
        Computes the threshold value that minimizes a linear combination of the False Discovery Rate (FDR) and the False Negative Rate (FNR).

        The search runs over the thresholds 0, 0.001, ..., 1. When several thresholds attain the minimum, the smallest one is returned.

        :param Alpha: Weight assigned to the False Negative Rate (FNR) in the linear combination.
        :type Alpha: float
        :return: Threshold value that minimizes the weighted sum of the FDR and FNR.
        :rtype: float
        """
        costs = [self.FDR(s / 1000) + Alpha * self.FNR(s / 1000) for s in range(1001)]
        # Thresholds with an undefined rate (nan, e.g. no pair selected) are
        # scored as 1 + Alpha so they are not picked over a defined cost.
        penalised = np.nan_to_num(costs, nan=1 + Alpha)
        return np.argmin(penalised) / 1000