Source code for faster.evaluation

import numpy as np
import matplotlib.pyplot as plt

class Evaluation:
    """
    A class for evaluating the accuracy and uncertainty inherent in the
    estimates of the Fellegi-Sunter model.

    :param Lambda: Unconditional match probability.
    :type Lambda: float
    :param Ksi: Array containing the conditional match probabilities for each
        pattern of discrete similarity levels across variables.
    :type Ksi: numpy.ndarray
    :param Counts: Array containing the observed counts for each pattern of
        discrete similarity levels across the compared variables.
    :type Counts: numpy.ndarray
    """

    # Number of equal steps used to sweep the threshold over [0, 1].
    _GRID_STEPS = 1000

    def __init__(self, Lambda: float, Ksi: np.ndarray, Counts: np.ndarray) -> None:
        self.Lambda = Lambda
        self.Ksi = Ksi
        self.Counts = Counts

    def _threshold_grid(self):
        """Return the thresholds 0, 1e-3, 2e-3, ..., 1 as a list of floats."""
        return [s / self._GRID_STEPS for s in range(self._GRID_STEPS + 1)]

    def FDR(self, S: float):
        """
        :param S: Threshold value used to calculate the False Discovery Rate (FDR).
        :type S: float
        :return: The False Discovery Rate (FDR), defined as the proportion of
            false matches among all pairs with a conditional match probability
            greater than or equal to the threshold ``S``. When no pattern
            reaches the threshold the ratio is 0/0, which NumPy (under its
            default error settings) reports as ``nan`` with a warning rather
            than raising. ``None`` is returned if the computation raises
            (e.g. inputs that do not support array arithmetic).
        :rtype: float
        """
        try:
            selected = (self.Ksi >= S) * self.Counts
            return np.sum((1 - self.Ksi) * selected) / np.sum(selected)
        except Exception:
            # Best-effort contract kept from the original implementation:
            # callers receive None instead of an exception.
            return None

    def FNR(self, S: float):
        """
        :param S: Threshold value used to calculate the False Negative Rate (FNR).
        :type S: float
        :return: The False Negative Rate (FNR): the expected number of true
            matches falling on patterns whose conditional match probability is
            less than the threshold ``S``, divided by the expected total number
            of true matches ``Lambda * sum(Counts)``. ``None`` is returned if
            the computation raises.
        :rtype: float
        """
        try:
            missed = np.sum(self.Ksi * (self.Ksi < S) * self.Counts)
            # The whole product is the denominator; without the parentheses
            # the result would be multiplied by the total count instead.
            return missed / (self.Lambda * np.sum(self.Counts))
        except Exception:
            # Best-effort contract kept from the original implementation.
            return None

    def Frontier(self) -> None:
        """
        Calculates the False Discovery Rate (FDR) and False Negative Rate (FNR)
        for all thresholds between 0 and 1 with increments of 1e-3, and
        displays the resulting frontier curve.
        """
        thresholds = self._threshold_grid()
        fdr_values = [self.FDR(s) for s in thresholds]
        fnr_values = [self.FNR(s) for s in thresholds]
        plt.plot(fdr_values, fnr_values, ".-")
        plt.xlabel("False Discovery Rate (FDR)")
        plt.ylabel("False Negative Rate (FNR)")
        plt.show()

    def Optimal_Threshold(self, Alpha: float):
        """
        Computes the threshold value that minimizes a linear combination of the
        False Discovery Rate (FDR) and the False Negative Rate (FNR), searching
        the thresholds between 0 and 1 with increments of 1e-3.

        :param Alpha: Weight assigned to the False Negative Rate (FNR) in the
            linear combination.
        :type Alpha: float
        :return: Threshold value that minimizes the weighted sum of the FDR and
            FNR. If several thresholds tie, the smallest one is returned.
        :rtype: float
        """
        costs = [self.FDR(s) + Alpha * self.FNR(s) for s in self._threshold_grid()]
        # Thresholds whose cost is undefined (NaN, e.g. FDR with no pattern at
        # or above the threshold) are given the cost 1 + Alpha.
        best_index = np.argmin(np.nan_to_num(costs, nan=1 + Alpha))
        return best_index / self._GRID_STEPS