ChuanyuXue / MVTest

A Distribution-Free Test of Independence Based on Mean Variance Index.

The code is a little slow for large datasets, and here is code intended to accelerate it #3

Open xiaoshijian opened 2 years ago

xiaoshijian commented 2 years ago

Hello Chuanyu, thanks for introducing this method for testing dependence. I am working on the H&M recommendation competition on Kaggle and want to use this code to check feature importance. There are 800,000 samples in the training data (x and y are arrays of size 800000). I am not sure whether I am using the code correctly, but I found it too slow once the sample size reaches the 10^5 level. So I read your code, and I think the slowness comes from the iterative comparison in line 67:

'''
for s in x:
    result += pr * numpy.square(self._fr(s, t, x, y) - self._f(s, x))
'''
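
For context, here is a minimal standalone sketch of what I assume `self._f` and `self._fr` roughly do (the actual implementation may differ): each call scans the full sample, so the whole test costs roughly O(m * n^2).

'''
import numpy

def f(s, x):
    # empirical CDF of x at s: one O(n) scan over the sample
    return numpy.mean(x <= s)

def fr(s, t, x, y):
    # empirical CDF of x at s restricted to the samples with y == t: another O(n) scan
    return numpy.mean(x[y == t] <= s)

# the test calls these for every s in x and every category t, which is
# roughly O(m * n^2), i.e. about 6.4e11 comparisons per category for n = 800000
'''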

After that, I tried accelerating mvtest by avoiding a large number of comparisons. Here is my idea:

"""

def test_accelerate(self, x: numpy.ndarray, y: numpy.ndarray):
    try:
        x = numpy.array(x)
    except Exception:
        raise Exception("The expected type of this argument is Array or List,"
                        " however \"" + str(type(x)) + "\" is gotten.")
    try:
        y = numpy.array(y, dtype=int)
    except Exception:
        raise Exception("The expected type of this argument is Array or List,"
                        " however \"" + str(type(y)) + "\" is gotten.")
    if len(x) == 0 or len(y) == 0:
        raise Exception("The input vectors cannot be empty.")
    if x.dtype.kind in ('U', 'S') or y.dtype.kind in ('U', 'S'):
        raise Exception("The element type of input vectors cannot be \"str\".")

    self._n = len(x)
    self._m = len(numpy.unique(y))
    if self._m > 10:
        self._m = 10

    if self._n == len(y):
        result = 0
        xy = numpy.concatenate([x.reshape(-1, 1), y.reshape(-1, 1)], axis=-1)
        values_info_in_xy = get_values_info(xy)
        for t in numpy.unique(y):
            # rows belonging to the category y == t and their empirical probability
            xyr = xy[xy[:, 1] == t]
            pr = len(xyr) / len(xy)
            values_info_in_xyr = get_values_info(xyr)
            partial_result = cal_partial_result(pr, values_info_in_xy, values_info_in_xyr)
            result += partial_result

        return {'Tn': round(result, 2), 'p-value': self._quantiles_transformer(result)}
    else:
        raise Exception("Two vectors must be equal to the same dimension vector.")

def get_values_info(xy):
    '''
    Get information for each distinct value of x in xy.
    Each entry is: (value,
                    last position of the value as a ratio of the sample size,
                    last position of the value,
                    total count of the value)
    '''
    xy = xy[xy[:, 0].argsort()]
    x = xy[:, 0]
    size = len(x)
    value_set = set()
    output = []
    for i in range(len(x) - 1, -1, -1):
        value = x[i]
        if value not in value_set:
            value_set.add(value)
            # (value, last position of the value as a ratio, last position of the value)
            output.append([value, (i + 1) / size, i + 1])
    output = output[::-1]  # restore ascending order of values

    for i in range(len(output)):
        # append the total count of this value
        if i == 0:
            output[i].append(output[i][2])
        else:
            # difference of the last positions of consecutive values, since x is sorted
            output[i].append(output[i][2] - output[i - 1][2])

    return output

def cal_partial_result(pr, values_info_in_xy, values_info_in_xyr):
    # values_info_in_xy and values_info_in_xyr are lists whose elements are
    # (value, last position ratio, last position, count)
    l1, l2 = len(values_info_in_xy), len(values_info_in_xyr)
    p1, p2 = 0, 0
    partial_result = 0

    while p2 < l2 and p1 < l1:
        # for each element of values_info_in_xy, find where it would fall in
        # values_info_in_xyr and take the ratio of the preceding element
        s = values_info_in_xy[p1][0]
        ratio1 = values_info_in_xy[p1][1]
        value_amount_in_xy = values_info_in_xy[p1][3]

        if s >= values_info_in_xyr[p2][0]:
            while p2 < l2 and s >= values_info_in_xyr[p2][0]:
                p2 += 1
            ratio2 = values_info_in_xyr[p2 - 1][1]
        else:
            if p2 == 0:
                ratio2 = 0
            else:
                ratio2 = values_info_in_xyr[p2 - 1][1]
        partial_result += value_amount_in_xy * pr * numpy.square(ratio2 - ratio1)
        p1 += 1

    # any remaining values in xy are larger than every value in xyr,
    # so their conditional CDF value is 1
    for p in range(p1, l1):
        ratio1 = values_info_in_xy[p][1]
        value_amount_in_xy = values_info_in_xy[p][3]
        ratio2 = 1
        partial_result += value_amount_in_xy * pr * numpy.square(ratio2 - ratio1)
    return partial_result

"""

ChuanyuXue commented 2 years ago

Hi @xiaoshijian,

Thank you for your idea! I like the way you accelerate the test by reducing the number of comparisons dynamically; it is very smart. If you are still interested in improving this repository, you might want to reproduce the results and the time overhead on your training data with and without your updates. If the improvements are significant, we can merge your solution into the main branch. Thanks!
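
For example, a rough timing comparison along these lines would be enough (a sketch only, assuming the `mvtest` class from this repository and a sample size where the original version still finishes in reasonable time):

"""
import time
import numpy
from mvtest import mvtest  # assumed import path

rng = numpy.random.default_rng(0)
n = 20000
x = rng.normal(size=n)
y = rng.integers(0, 3, size=n)

mv = mvtest()
for name, fn in [("test", mv.test), ("test_accelerate", mv.test_accelerate)]:
    start = time.perf_counter()
    result = fn(x, y)
    print(name, result, round(time.perf_counter() - start, 3), "s")
"""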