octopize / saiph

A projection package
https://saiph.readthedocs.io
Apache License 2.0
0 stars 0 forks source link

feat(stats): improve cos2 computation speed for categorical #123

Closed mguillaudeux closed 4 months ago

mguillaudeux commented 4 months ago

The computation of cos2 for categorical data with a high number of modalities could take some time. This change improves the computation speed while keeping the same results.

Here is a speed comparison with results:

class Model:
    """Minimal stand-in for the saiph model object used by the cos2 helpers.

    Carries only the two attributes the benchmark functions read:
    per-row weights and per-modality proportions.
    """

    def __init__(self, row_weights, prop):
        # Stored as-is; no copies or validation, matching the benchmark's needs.
        self.row_weights, self.prop = row_weights, prop

# Build a large synthetic dataset to benchmark against.
n_rows = 10000
n_cols = 100
n_dims = 50

np.random.seed(0)  # reproducible fixtures

# NOTE: the four RNG draws below keep the exact call order of the original
# script so the seeded values are identical.
dummy_large = pd.DataFrame(
    np.random.randint(0, 2, size=(n_rows, n_cols)),
    columns=[f'category_{i}' for i in range(n_cols)],
)
coords_large = pd.DataFrame(
    np.random.randn(n_rows, n_dims),
    columns=[f'Dim.{i}' for i in range(n_dims)],
)
model_prop_large = pd.Series(np.random.rand(n_cols), index=dummy_large.columns)
model_row_weights_large = np.random.rand(n_rows)

model_large = Model(row_weights=model_row_weights_large, prop=model_prop_large)

def original_compute_cos2_single_category(single_category_df, model, coords):
    """Compute the cos2 (squared cosine) of one categorical variable per dimension.

    Args:
        single_category_df: DataFrame of dummy-encoded modalities (one column
            per modality) for a single categorical variable.
        model: object exposing ``row_weights`` (per-row weights, length matches
            the DataFrames) and ``prop`` (per-modality proportions indexed by
            column name, or None).
        coords: DataFrame of row coordinates, one column per dimension.

    Returns:
        np.ndarray of cos2 values, one per column of ``coords``. All zeros
        when ``model.prop`` is None (preserves the original behavior).
    """
    cos2 = []
    for coord_col in coords.columns:
        p = 0
        # Bug fix: the weighted coordinates and the `prop is not None` check
        # are invariant over the modality loop — hoist them so they are not
        # recomputed/retested once per column.
        weighted_coords = coords[coord_col] * model.row_weights
        if model.prop is not None:
            for col in single_category_df.columns:
                dummy_values = single_category_df[col].values
                p += (dummy_values * weighted_coords).sum() ** 2 / model.prop[col]
        cos2.append(p)
    # Denominator: per-dimension sum of weighted squared coordinates.
    all_weighted_coords = (coords.values**2).T * model.row_weights
    summed_weights = all_weighted_coords.sum(axis=1)
    single_category_cos2 = np.array(cos2) / summed_weights
    return single_category_cos2

def optimized_compute_cos2_single_category(single_category_df, model, coords):
    """Compute the cos2 (squared cosine) of one categorical variable per dimension.

    Faster variant: the weighted coordinate vector is built once per dimension
    (as a NumPy array) instead of once per modality.

    Args:
        single_category_df: DataFrame of dummy-encoded modalities (one column
            per modality) for a single categorical variable.
        model: object exposing ``row_weights`` and ``prop`` (per-modality
            proportions indexed by column name; assumed not None here).
        coords: DataFrame of row coordinates, one column per dimension.

    Returns:
        np.ndarray of cos2 values, one per column of ``coords``.
    """
    dim_totals = []
    for dim in coords.columns:
        # Loop-invariant for the modality loop below.
        weighted = coords[dim].values * model.row_weights
        per_modality = [
            (single_category_df[col].values * weighted).sum() ** 2 / model.prop[col]
            for col in single_category_df.columns
        ]
        dim_totals.append(sum(per_modality))
    # Denominator: per-dimension sum of weighted squared coordinates.
    squared_weighted = (coords.values**2).T * model.row_weights
    return np.array(dim_totals) / squared_weighted.sum(axis=1)

# Time both implementations on the large fixtures (10 runs each); statements
# are evaluated as strings so timeit resolves the fixtures via globals().
_benchmarks = {
    "Original": "original_compute_cos2_single_category(dummy_large, model_large, coords_large)",
    "Optimized": "optimized_compute_cos2_single_category(dummy_large, model_large, coords_large)",
}
_timings = {
    label: timeit.timeit(stmt=stmt, globals=globals(), number=10)
    for label, stmt in _benchmarks.items()
}
original_time = _timings["Original"]
optimized_time = _timings["Optimized"]

print(f"Original function time: {original_time:.4f} seconds")
print(f"Optimized function time: {optimized_time:.4f} seconds")

Results are the following: Original function time: 20.0161 seconds; Optimized function time: 2.9835 seconds.