Source code for faster.deduplication

import cupy as cp
import math
import numpy as np
import pandas as pd
from .search import intersect, setdiff, reduce

jaro_winkler_dedup_code = r"""
extern "C"{

  __device__ float jaro_winkler(const char *str1,
                                const int len1,
                                bool *hash_str1,
                                const char *str2,
                                const int len2,
                                bool *hash_str2,
                                float p) {

    // This function computes the Jaro-Winkler similarity between two strings
    // Inputs:
    // - str1: First string
    // - len1: Length of str1
    // - hash_str1: Working memory to keep track of which characters in str1 are
    //              matching to corresponding characters in str2
    // - str2: Second string
    // - len2: Length of str2
    // - hash_str2: Working memory to keep track of which characters in str2 are
    //              matching to corresponding characters in str1
    // - p: Scaling factor applied to the common prefix
    // Output:
    // - dist: Jaro-Winkler similarity between str1 and str2


    if (len1 == 0 || len2 == 0) {

        // If either string is null, the Jaro-Winkler similarity between str1 and str2 is 0
        return 0.0;

    } else {

        // We compute the number of matching characters between str1 and str2

        // We consider the characters max(len1, len2) / 2 - 1 away from each other
        int max_dist = max(len1, len2) / 2 - 1;

        float match = 0;

        for (int i = 0; i < len1; i++) {

            for (int j = max(0, i - max_dist); j < min(len2, i + max_dist + 1); j++) {

                if (str1[i] == str2[j] && hash_str2[j] == false) {

                    // Two characters are matching if they appear in both strings at most max_dist characters away from each other
                    hash_str1[i] = true;
                    hash_str2[j] = true;
                    match++;
                    break;

                }

            }

        }

        if (match == 0) {

            // If there is no matching characters between both strings, the Jaro-Winkler similarity between them is 0
            return 0.0;

        } else {

            float t = 0;

            int point = 0;

            // If a positive number of matching characters is found, we need to compute the number of transpositions
            // that is, the number of matching characters that are not in the right order divided by two
            for (int i = 0; i < len1; i++) {

                if (hash_str1[i] == true) {

                    while (hash_str2[point] == false) {

                        point++;

                    }

                    if (str1[i] != str2[point++]) {

                        t++;

                    }

                }

            }

            t /= 2;

            // The Jaro similarity between str1 and str2 is defined as follows:
            float dist = ((match / (float)len1) + (match / (float)len2) + ((match - t) / match)) / 3.0;

            // To go from the Jaro similarity to the Jaro-Winkler similarity, we need
            // to compute the length of the common prefix between both strings
            float prefix = 0;

            for (int i = 0; i < min(min(len1, len2), 4); i++) {

                if (str1[i] == str2[i]) {

                    prefix++;

                } else {

                    break;

                }

            }

            // To obtain the Jaro-Winkler similarity, we adjust the Jaro similarity for the length of the common prefix between both strings
            dist += p * prefix * (1 - dist);

            return dist;

        }

    }

  }

  __global__ void jaro_winkler_kernel(char *str,
                                      int *length,
                                      long long *offsets,
                                      int n,
                                      bool *buffer1,
                                      long long *offsets1,
                                      bool *buffer2,
                                      long long *offsets2,
                                      float p,
                                      float *output,
                                      int n_output,
                                      int start_row,
                                      int end_row) {

    const long long id = threadIdx.x + blockDim.x * blockIdx.x;

    if (id < n_output) {

      const int row = id / n + start_row; // Index of the string processed in str1

      const int col = id % n; // Index of the string processed in str2

      // Only computes Jaro-Winkler similarity if row >= col (preventing redundant comparisons)
      if (row >= col) {

        if (row != col) {

          long long off_row = (row == 0 ? 0 : offsets[row - 1]);

          // Move the pointer to the first character of the string we are processing
          char *string1 = str + off_row;

          // Computing the length of the string we are processing
          int len1 = length[row];

          // Move the pointer to the first element of the working memory
          long long off1 = (row == start_row ? 0 : offsets1[row - start_row - 1]);

          bool *hash_str1 = buffer1 + off1 + len1 * col;

          long long off_col = (col == 0 ? 0 : offsets[col - 1]);

          char *string2 = str + off_col;

          int len2 = length[col];

          long long off2 = (col == 0 ? 0 : offsets2[col - 1]);

          bool *hash_str2 = buffer2 + off2 + len2 * (end_row - 1 - row);

          // Compute the Jaro-Winkler similarity between string1 and string2
          output[id] = jaro_winkler(string1, len1, hash_str1, string2, len2, hash_str2, p);

        } else {

          // A string is identical to itself, so its Jaro-Winkler similarity with itself is 1
          output[id] = 1;

        }

      }

    }

  }

}
"""

jaro_winkler_dedup_kernel = cp.RawKernel(jaro_winkler_dedup_code, 'jaro_winkler_kernel')

output_count_dedup_code = r"""
extern "C" {

  __global__ void output_count(long long *input_A,
                               long long *input_B,
                               int n_input,
                               int *unique_count,
                               int *output) {

    // Element of indices being processed
    const long long id = threadIdx.x + blockDim.x * blockIdx.x;

    if (id < n_input) {

      // First input
      long long id_A = input_A[id];

      // Second input
      long long id_B = input_B[id];

      // Number of observations with id_A in df
      int len_A = unique_count[id_A];

      // Number of observations with id_B in df
      int len_B = unique_count[id_B];

      if (id_A != id_B) {

        // Computes the number of pairs of values with id_A and id_B
        output[id] = len_A * len_B;

      } else {

        // If id_A = id_B, we disregard pairs formed by identical elements and those where the row index is less than the column index
        output[id] = len_A * (len_B - 1) / 2;

      }

    }

  }

}
"""

output_count_dedup_kernel = cp.RawKernel(output_count_dedup_code, 'output_count')

indices_inverse_dedup_code = r"""
extern "C" {

  __global__ void indices_inverse(long long *input_A,
                                  long long *input_B,
                                  int n_input,
                                  int n,
                                  long long *unique_argwhere,
                                  int *unique_argwhere_offsets,
                                  int *unique_count,
                                  long long *output,
                                  long long *output_offsets) {

    // Element of indices being processed
    const long long id = threadIdx.x + blockDim.x * blockIdx.x;

    if (id < n_input) {

      long long id_A = input_A[id];

      long long id_B = input_B[id];

      int len_A = unique_count[id_A]; // Number of observations with id_A in df_A

      int len_B = unique_count[id_B]; // Number of observations with id_B in df_B

      // Where observations with id_A in df_A start in unique_A_argwhere
      long long unique_A_off = (id_A == 0 ? 0 : unique_argwhere_offsets[id_A - 1]);

      // Where observations with id_B in df_B start in unique_B_argwhere
      long long unique_B_off = (id_B == 0 ? 0 : unique_argwhere_offsets[id_B - 1]);

      // Offset unique_A_argwhere appropriately
      long long *unique_A_argwhere_off = unique_argwhere + unique_A_off;

      // Offset unique_B_argwhere appropriately
      long long *unique_B_argwhere_off = unique_argwhere + unique_B_off;

      // Where the output starts in output
      long long output_off = (id == 0 ? 0 : output_offsets[id - 1]);

      if (id_A != id_B) {

        int k = 0;

        for (int i = 0; i < len_A ; i++) {

          for (int j = 0; j < len_B; j++) {

            // Considers only pairs with the row index greater than the column index

            if (unique_A_argwhere_off[i] > unique_B_argwhere_off[j]) {

              // Transpose indices of pairs in df_A and df_B in output
              output[output_off + k++] = unique_A_argwhere_off[i] * n + unique_B_argwhere_off[j];

            } else {

              // Transpose indices of pairs in df_A and df_B in output
              output[output_off + k++] = unique_B_argwhere_off[j] * n + unique_A_argwhere_off[i];

            }

          }

        }

      } else {

        int k = 0;

        for (int i = 1; i < len_A; i++) {

          // Considers only pairs with the row index greater than the column index

          for (int j = 0; j < i; j++) {

            // Transpose indices of pairs in df_A and df_B in output
            output[output_off + k++] = unique_A_argwhere_off[i] * n + unique_B_argwhere_off[j];

          }

        }

      }

    }

  }

}
"""

indices_inverse_dedup_kernel = cp.RawKernel(indices_inverse_dedup_code, 'indices_inverse')

indices_inverse_exact_dedup_code = r"""
extern "C" {

  __global__ void indices_inverse(long long *input,
                                  int n,
                                  long long *unique_argwhere,
                                  int *unique_argwhere_offsets,
                                  long long *output,
                                  int *output_mask,
                                  int *output_offsets,
                                  int n_output) {

    const long long id = threadIdx.x + blockDim.x * blockIdx.x; // Element of indices being processed

    if (id < n_output) {

      // Input element to which the processed output element refers
      long long mask = output_mask[id];

      // Move pointer to where the output begins in output
      long long output_off = (mask == 0 ? 0 : output_offsets[mask - 1]);

      long long i = id - output_off;

      long long in = input[mask];

      // Row index
      long long row = floorf((sqrtf(8 * i + 1) - 1) / 2);

      // Column index: consider only those lower than row index
      long long col = i - row * (row + 1) / 2;

      long long unique_off = (in == 0 ? 0 : unique_argwhere_offsets[in - 1]);

      long long *unique_argwhere_off = unique_argwhere + unique_off;

      // Transpose indices of pairs in df_A and df_B in output
      output[id] = unique_argwhere_off[row + 1] * n + unique_argwhere_off[col];

    }

  }

}
"""

indices_inverse_exact_dedup_kernel = cp.RawKernel(indices_inverse_exact_dedup_code, 'indices_inverse')

[docs] def jaro_winkler_dedup_gpu(string, p = 0.1, lower_thr = 0.88, upper_thr = 0.94, num_threads = 256, max_chunk_size = 2.0): """ Computes the Jaro-Winkler similarity between all pairs of strings in an array and returns the indices corresponding to pairs of strings whose Jaro-Winkler similarity falls within specified thresholds. :param string: Array of strings. :type string: numpy.ndarray :param p: Scaling factor applied to the common prefix in the Jaro-Winkler similarity. Defaults to 0.1. :type p: float, optional :param lower_thr: Lower threshold for discretizing the Jaro-Winkler distance. Defaults to 0.88. :type lower_thr: float, optional :param upper_thr: Upper threshold for discretizing the Jaro-Winkler distance. Defaults to 0.94. :type upper_thr: float, optional :param num_threads: Number of threads per block. Defaults to 256. :type num_threads: int, optional :param max_chunk_size: Maximum memory allocation per processing chunk, in gigabytes (GB). Defaults to 2.0. :type max_chunk_size: float, optional :return: List containing two arrays of indices: 1. Indices with Jaro-Winkler distance between ``lower_thr`` and ``upper_thr``. 2. Indices with Jaro-Winkler distance above ``upper_thr``. Indices represent ``i * len(str_B) + j``, where ``i`` is the element's index in ``str_A`` and ``j`` is the element's index in ``str_B``. :rtype: list[cupy.ndarray] """ mempool = cp.get_default_memory_pool() # Extract unique values of string (with inverse and counts) unique, unique_inverse, unique_counts = np.unique(string, return_inverse = True, return_counts = True) n_unique = len(unique) # Array containing the indices corresponding to each unique value of string (stored as an arrow) unique_inverse_gpu = cp.array(unique_inverse, dtype = np.int32) unique_inverse_sorted = cp.argsort(unique_inverse_gpu) del unique_inverse_gpu mempool.free_all_blocks() # Array containing the number of observations in string associated with each unique value unique_counts_gpu = cp.array(unique_counts, dtype = np.int32) # Array containing the offsets necessary to read the indices corresponding to each unique value in string unique_offsets_gpu = cp.cumsum(unique_counts_gpu, dtype = np.int32) unique_arrow = np.frombuffer(''.join(unique).encode(), dtype = np.int8) len_arrow = len(unique_arrow) # Array containing the unique values stored as an arrow unique_arrow_gpu = cp.array(unique_arrow, dtype = np.int8) # Array containing the length of unique values unique_len = np.fromiter((len(row) for row in unique), dtype = np.int32, count = len(unique)) unique_len_gpu = cp.array(unique_len, dtype = np.int32) # Array containing the offsets necessary to read the unique values in arrow offsets_gpu = cp.cumsum(unique_len_gpu, dtype = np.int64) # Approximate the number of chunks required to meet max_chunk_size total_comp = len(unique) * (len(unique) + 1) / 2 chunks = math.ceil((len(unique) * (len(unique) + 1) * 8 + len_arrow * (1 + 2 * len(unique)) + (len(unique) + 1) * 8) / (max_chunk_size * 1024 ** 3 - len_arrow - (len(unique) + 1) * 8)) # Create partitions accordingly chunk_size_row = math.ceil(len(unique) / chunks) indices = [] # Compute the Jaro-Winkler similarity metric by chunk for i in range(chunks): start_row = i * chunk_size_row offset = start_row * len(unique) end_row = min((i + 1) * chunk_size_row, len(unique)) num_comp = end_row * len(unique) - offset rows = cp.arange(start_row, end_row, dtype = np.int32) # Create working memory for the compute kernel (only for comparisons below the diagonal) buffer1_len = unique_len_gpu[rows] * (rows + 1) buffer1_offsets = cp.cumsum(buffer1_len, dtype = np.int64) del buffer1_len mempool.free_all_blocks() buffer1 = cp.zeros(int(buffer1_offsets[-1]), dtype = bool) if start_row > 0: buffer2_len = cp.concatenate((unique_len_gpu[:start_row] * (end_row - start_row), unique_len_gpu[rows] * (end_row - rows))) else: buffer2_len = unique_len_gpu[rows] * (end_row - rows) del rows mempool.free_all_blocks() buffer2_offsets = cp.cumsum(buffer2_len, dtype = np.int64) del buffer2_len mempool.free_all_blocks() buffer2 = cp.zeros(int(buffer2_offsets[-1]), dtype = bool) # Create output vector output_gpu = cp.zeros(int(num_comp), dtype = cp.float32) # Call the compute kernel on GPU num_blocks = math.ceil(num_comp / num_threads) jaro_winkler_dedup_kernel((num_blocks,), (num_threads,), (unique_arrow_gpu, unique_len_gpu, offsets_gpu, len(unique), buffer1, buffer1_offsets, buffer2, buffer2_offsets, cp.float32(p), output_gpu, cp.int32(num_comp), cp.int32(start_row), cp.int32(end_row))) del buffer1, buffer1_offsets, buffer2, buffer2_offsets mempool.free_all_blocks() # Extract the indices with Jaro-Winkler similarity between lower_thr and upper_thr indices1 = cp.bitwise_and(output_gpu >= lower_thr, output_gpu < upper_thr) argwhere1 = cp.argwhere(indices1) del indices1 mempool.free_all_blocks() # Extract the indices with Jaro-Winkler similarity above upper_thr argwhere2 = cp.argwhere(output_gpu >= upper_thr) del output_gpu mempool.free_all_blocks() # Adjust indices relative to the starting row output1 = cp.ravel(argwhere1) + int(offset) output2 = cp.ravel(argwhere2) + int(offset) del argwhere1, argwhere2 mempool.free_all_blocks() indices.append([output1, output2]) del output1, output2 mempool.free_all_blocks() del unique_arrow_gpu, unique_len_gpu, offsets_gpu mempool.free_all_blocks() # Concatenate indices from all chunks indices1 = cp.concatenate((x[0] for x in indices), dtype = np.int64) indices2 = cp.concatenate((x[1] for x in indices), dtype = np.int64) del indices mempool.free_all_blocks() if indices1.size > 0: # Invert indices1, i.e., translate into indices from the original data frame indices1_A = indices1 // len(unique) indices1_B = indices1 % len(unique) del indices1 mempool.free_all_blocks() # Calculate the output count for each input element output1_count = cp.zeros(indices1_A.size, dtype = np.int32) num_blocks = math.ceil(indices1_A.size / num_threads) output_count_dedup_kernel((num_blocks,), (num_threads,), (indices1_A, indices1_B, indices1_A.size, unique_counts_gpu, output1_count)) output1_offsets = cp.cumsum(output1_count, dtype = np.int64) output1_gpu = cp.zeros(int(output1_offsets[-1]), dtype = np.int64) indices_inverse_dedup_kernel((num_blocks,), (num_threads,), (indices1_A, indices1_B, indices1_A.size, len(string), unique_inverse_sorted, unique_offsets_gpu, unique_counts_gpu, output1_gpu, output1_offsets)) del indices1_A, indices1_B, output1_count, output1_offsets mempool.free_all_blocks() # Sort output vectors output1_sorted = cp.sort(output1_gpu) del output1_gpu mempool.free_all_blocks() else: output1_sorted = cp.zeros(0, dtype = np.int64) if indices2.size > 0: # Invert indices2 indices2_A = indices2 // len(unique) indices2_B = indices2 % len(unique) del indices2 mempool.free_all_blocks() output2_count = cp.zeros(indices2_A.size, dtype = np.int32) num_blocks = math.ceil(indices2_A.size / num_threads) output_count_dedup_kernel((num_blocks,), (num_threads,), (indices2_A, indices2_B, indices2_A.size, unique_counts_gpu, output2_count)) output2_offsets = cp.cumsum(output2_count, dtype = np.int64) del output2_count mempool.free_all_blocks() output2_gpu = cp.zeros(int(output2_offsets[-1]), dtype = np.int64) indices_inverse_dedup_kernel((num_blocks,), (num_threads,), (indices2_A, indices2_B, indices2_A.size, len(string), unique_inverse_sorted, unique_offsets_gpu, unique_counts_gpu, output2_gpu, output2_offsets)) del indices2_A, indices2_B, output2_offsets, unique_inverse_sorted, unique_counts_gpu, unique_offsets_gpu mempool.free_all_blocks() output2_sorted = cp.sort(output2_gpu) del output2_gpu mempool.free_all_blocks() else: output2_sorted = cp.zeros(0, dtype = np.int64) del unique_inverse_sorted, unique_counts_gpu, unique_offsets_gpu mempool.free_all_blocks() return [output1_sorted, output2_sorted]
[docs] def exact_dedup_gpu(string, num_threads = 256): """ Compares all pairs of strings in an array and returns the indices of exact matches. :param string: Array of strings. :type string: numpy.ndarray :param num_threads: Number of threads per block. Defaults to 256. :type num_threads: int, optional :return: Array of indices corresponding to pairs with an exact match. Indices represent ``i * len(str_B) + j``, where ``i`` is the element's index in ``str_A`` and ``j`` is the element's index in ``str_B``. :rtype: list[cupy.ndarray] """ mempool = cp.get_default_memory_pool() # Extract unique values of string (with inverse and counts) unique, unique_inverse, unique_counts = np.unique(string, return_inverse = True, return_counts = True) # Array containing the indices corresponding to each unique value of string (stored as an arrow) unique_inverse_gpu = cp.array(unique_inverse, dtype = np.int64) unique_inverse_sorted = cp.argsort(unique_inverse_gpu) del unique_inverse_gpu mempool.free_all_blocks() # Array containing the number of observations in string associated with each unique value unique_counts_gpu = cp.array(unique_counts, dtype = np.int32) # Array containing the offsets necessary to read the indices corresponding to each unique value in str_A unique_offsets_gpu = cp.cumsum(unique_counts_gpu, dtype = np.int32) # Extract unique values with at least two frequencies indices = cp.argwhere(unique_counts_gpu > 1) indices_ravel = cp.ravel(indices) del indices mempool.free_all_blocks() if indices_ravel.size > 0: # Invert indices, i.e., translating into indices from original data frame output_count = unique_counts_gpu[indices_ravel] * (unique_counts_gpu[indices_ravel] - 1) / 2 output_offsets = cp.cumsum(output_count, dtype = np.int32) # Array indicating for the element of indices to which each element of the output is referring to output_mask = cp.repeat(cp.arange(0, indices_ravel.size, dtype = np.int32), repeats = output_count.astype(int).get().tolist()) output_gpu = cp.zeros(int(output_offsets[-1]), dtype = np.int64) num_blocks = math.ceil(output_gpu.size / num_threads) indices_inverse_exact_dedup_kernel((num_blocks,), (num_threads,), (indices_ravel, len(string), unique_inverse_sorted, unique_offsets_gpu, output_gpu, output_mask, output_offsets, output_gpu.size)) del unique_inverse_sorted, unique_counts_gpu, unique_offsets_gpu, indices_ravel, output_count, output_mask, output_offsets mempool.free_all_blocks() # Sort the output vector output_sorted = cp.sort(output_gpu) del output_gpu mempool.free_all_blocks() else: output_sorted = cp.zeros(0, dtype = np.int64) del unique_inverse_sorted, unique_counts_gpu, unique_offsets_gpu, indices_ravel mempool.free_all_blocks() return [output_sorted]
[docs] class Deduplication(): """ A class for comparing the values of selected variables in one pandas DataFrame. :param df: DataFrame to deduplicate. :type df: pandas.DataFrame :param Vars_Fuzzy: List of variable names to be compared using fuzzy matching. :type Vars_Fuzzy: list[str] :param Vars_Exact: List of variable names to be compared using exact matching. Defaults to an empty list. :type Vars_Exact: list[str], optional :raises Exception: If any name in ``Vars_Fuzzy`` or ``Vars_Exact`` is not found in ``df``. """ def __init__(self, df: pd.DataFrame, Vars_Fuzzy, Vars_Exact = []): # Check that inputs are valid if any(var not in df.columns for var in Vars_Fuzzy) or any(var not in df.columns for var in Vars_Exact): raise Exception("The variable names in Vars_Fuzzy and Vars_Exact must match variable names in df.") self.df = df self.Vars_Fuzzy = Vars_Fuzzy self.Vars_Exact = Vars_Exact self.Indices = None """ This attribute stores a list of index arrays representing pairs of records from ``df_A`` and ``df_B`` that correspond to each combination of discrete similarity levels across all compared variables. :return: List of arrays, where each array contains indices of record pairs associated with a specific combination of discrete similarity levels. Indices represent ``i * len(str_B) + j``, where ``i`` is the element's index in ``str_A`` and ``j`` is the element's index in ``str_B``. Similarity patterns are defined iteratively across variables (both fuzzy and exact), following the order specified by the user. Variables listed later in the sequence define faster-changing discrete levels of similarity. The pattern representing no similarity between records is omitted. :rtype: list[cupy.ndarray] """ self._Fit_flag = False
[docs] def fit(self, p = 0.1,Lower_Thr = 0.88, Upper_Thr = 0.94, Num_Threads = 256, Max_Chunk_Size = 2.0): """ Compares all pairs of observations across the selected variables in the dataframe. The result is stored in the Indices attribute. :param p: Scaling factor applied to the common prefix in the Jaro-Winkler similarity. Defaults to 0.1. :type p: float, optional :param Lower_Thr: Lower threshold for discretizing the Jaro-Winkler distance. Defaults to 0.88. :type Lower_Thr: float, optional :param Upper_Thr: Upper threshold for discretizing the Jaro-Winkler distance. Defaults to 0.94. :type Upper_Thr: float, optional :param Num_Threads: Number of threads per block. Defaults to 256. :type Num_Threads: int, optional :param Max_Chunk_Size: Maximum memory allocation per processing chunk, in gigabytes (GB). Defaults to 2.0. :type Max_Chunk_Size: float, optional :raises Exception: If the model has already been fitted, it cannot be fitted again. """ if self._Fit_flag: raise Exception("If the model has already been fitted, it cannot be fitted again.") mempool = cp.get_default_memory_pool() indices = [] # Loop over variables and compute the Jaro-Winkler similarity between all pairs of values for i in range(len(self.Vars_Fuzzy)): indices.append(jaro_winkler_dedup_gpu(self.df[self.Vars_Fuzzy[i]].to_numpy(), p, Lower_Thr, Upper_Thr, Num_Threads, Max_Chunk_Size)) mempool.free_all_blocks() # Loop over variables and compare all pairs of values for exact matching for i in range(len(self.Vars_Exact)): indices.append(exact_dedup_gpu(self.df[self.Vars_Exact[i]].to_numpy(), Num_Threads)) mempool.free_all_blocks() # Merge discrete levels of similarity over all variables self.Indices = indices[0] del indices[0] mempool.free_all_blocks() while len(indices) > 0: output = [] for j in range(len(indices[0])): output.append(reduce(setdiff, self.Indices, indices[0][j])) mempool.free_all_blocks() while len(self.Indices) > 0: output.append(reduce(setdiff, indices[0], self.Indices[0])) mempool.free_all_blocks() for j in range(len(indices[0])): output.append(intersect(self.Indices[0], indices[0][j])) mempool.free_all_blocks() del self.Indices[0] mempool.free_all_blocks() self.Indices = output del indices[0], output mempool.free_all_blocks() self._Fit_flag = True del indices mempool.free_all_blocks()
@property def Counts(self): """ This property stores the count of record pairs corresponding to each combination of discrete similarity levels across all compared variables. :return: Array containing the number of pairs for each pattern of discrete similarity levels across variables. :rtype: numpy.ndarray """ if not self._Fit_flag: raise Exception("The model must be fitted first.") try: return self._Counts except: counts = [x.size for x in self.Indices] # Number of pairs for each pattern of discrete levels of similarity self._Counts = np.concatenate([[int(len(self.df) * (len(self.df) + 1) / 2) - np.sum(counts)], counts]) # Add count of omitted pattern return self._Counts