Source code for faster.linkage

import cupy as cp
import numpy as np
import pandas as pd


[docs]
class Linkage:
  """
  Link records between two Pandas DataFrames based on previously estimated conditional match probabilities.

  :param df_A: First DataFrame to be linked.
  :type df_A: pandas.DataFrame
  :param df_B: Second DataFrame to be linked.
  :type df_B: pandas.DataFrame
  :param Indices: List of arrays, where each array contains the indices of record pairs from ``df_A`` and ``df_B`` corresponding to a specific pattern of discrete similarity levels across variables. Each pair index ``p`` is decoded by :meth:`transform` as row ``p // len(df_B)`` of ``df_A`` and row ``p % len(df_B)`` of ``df_B``.
  :type Indices: list[cupy.ndarray]
  :param Ksi: Array of conditional match probabilities for all combinations of discrete similarity levels across variables.
  :type Ksi: numpy.ndarray
  """

  def __init__(self, df_A: pd.DataFrame, df_B: pd.DataFrame, Indices, Ksi: np.ndarray):
    # The inputs are stored by reference; transform() works on suffixed
    # copies and does not modify them.
    self.df_A = df_A
    self.df_B = df_B
    self.Indices = Indices
    self.Ksi = Ksi

  def transform(self, Threshold: float = 0.85) -> pd.DataFrame:
    """
    Return a DataFrame containing all pairs of records from ``df_A`` and ``df_B`` whose conditional match probabilities are at or above a specified threshold.

    Columns coming from ``df_A`` receive the suffix ``_A`` and columns coming from ``df_B`` the suffix ``_B``. The positional row numbers of the linked records are reported in the ``Index_A`` and ``Index_B`` columns.

    :param Threshold: Threshold value at or above which pairs of records from ``df_A`` and ``df_B`` are considered matches. Defaults to 0.85.
    :type Threshold: float, optional
    :return: DataFrame linking all pairs of records from ``df_A`` and ``df_B`` with conditional match probabilities greater than or equal to the specified threshold.
    :rtype: pandas.DataFrame
    :raises ValueError: If no pairs of records have conditional match probabilities reaching the threshold. ``ValueError`` is a subclass of ``Exception``, so callers catching ``Exception`` are unaffected.
    """

    # Positions in Ksi whose conditional match probability reaches the threshold.
    patterns = np.flatnonzero(self.Ksi >= Threshold)

    # Pattern k of Ksi is looked up at Indices[k - 1].
    # NOTE(review): if pattern 0 were ever selected, this would wrap around to
    # Indices[-1] -- presumably pattern 0 can never reach the threshold; confirm
    # against the code that builds Indices and Ksi.
    selected = [self.Indices[i - 1] for i in patterns]

    # Fail early, before any GPU work or DataFrame copies, when nothing matches.
    if not any(block.size for block in selected):
      raise ValueError("No pair of observations has a conditional match probability exceeding the threshold.")

    n_B = len(self.df_B)

    # cupy.concatenate is documented for a sequence of arrays, hence a list
    # rather than a generator expression.
    pair_ids = cp.concatenate(selected)

    # Decode each pair index into row positions and copy them to host memory.
    rows_A = (pair_ids // n_B).get()
    rows_B = (pair_ids % n_B).get()

    # Release the temporary device buffers back to the driver.
    del pair_ids
    cp.get_default_memory_pool().free_all_blocks()

    # Suffixed copies carrying their positional index.
    left = self.df_A.add_suffix("_A").assign(Index_A = range(len(self.df_A)))
    right = self.df_B.add_suffix("_B").assign(Index_B = range(n_B))

    # One row of df_A per matched pair, tagged with the df_B row it links to.
    # assign() returns a new frame, which avoids writing into an iloc selection.
    linked = left.iloc[rows_A].assign(Index_B = rows_B)

    return linked.merge(right, on = "Index_B")