Computing cos2 for categorical data with a high number of modalities can take some time.
This change improves the computation speed while keeping the same results.
Here is a speed comparison, with results:
class Model:
    """Minimal stand-in for a fitted model used by the cos2 benchmark.

    It only stores the two attributes the cos2 functions read.
    """

    def __init__(self, row_weights, prop):
        # row_weights: per-row weights multiplied against the coordinates.
        # prop: per-category divisors, looked up by dummy column name
        #       (may be None).
        self.row_weights, self.prop = row_weights, prop
# The snippet uses NumPy and pandas but never imported them; import them here
# so the script runs on its own.
import numpy as np
import pandas as pd

# Generate a larger dataset
n_rows = 10000
n_cols = 100
n_dims = 50

# Fixed seed: both timed functions are fed exactly the same random inputs on
# every run. The order of the random draws below must not change, otherwise
# the generated data changes.
np.random.seed(0)

# 0/1 indicator matrix standing in for the dummy-encoded categories.
dummy_large = pd.DataFrame(
    np.random.randint(0, 2, size=(n_rows, n_cols)),
    columns=[f'category_{i}' for i in range(n_cols)],
)

# Random row coordinates, one column per dimension.
coords_large = pd.DataFrame(
    np.random.randn(n_rows, n_dims),
    columns=[f'Dim.{i}' for i in range(n_dims)],
)

# One divisor per category, indexed by the dummy column names.
model_prop_large = pd.Series(
    np.random.rand(n_cols),
    index=dummy_large.columns,
)

# One weight per row.
model_row_weights_large = np.random.rand(n_rows)

model_large = Model(row_weights=model_row_weights_large, prop=model_prop_large)
def original_compute_cos2_single_category(single_category_df, model, coords):
    """Baseline (slow) cos2 computation for one categorical variable.

    For every column of ``coords`` the function accumulates, over every
    column of ``single_category_df``,
    ``sum(dummy * coord * row_weights) ** 2 / model.prop[column]`` and then
    divides by ``sum(coord ** 2 * row_weights)``.

    Parameters
    ----------
    single_category_df : DataFrame of dummy columns for one variable.
    model : object exposing ``row_weights`` and ``prop`` (``prop`` may be
        ``None``, in which case every numerator stays 0).
    coords : DataFrame of row coordinates, one column per dimension.

    Returns
    -------
    NumPy array with one value per column of ``coords``.
    """
    numerators = []
    for dim_name in coords.columns:
        total = 0
        for category in single_category_df.columns:
            # Recomputed for every category on purpose: this function is the
            # reference point of the timing comparison and must stay as-is.
            weighted = coords[dim_name] * model.row_weights
            indicator = single_category_df[category].values
            if model.prop is None:
                continue
            projection = (indicator * weighted).sum()
            total += projection ** 2 / model.prop[category]
        numerators.append(total)

    # Weighted sum of squared coordinates, one entry per dimension.
    squared_weighted = (coords.values ** 2).T * model.row_weights
    denominators = squared_weighted.sum(axis=1)
    return np.array(numerators) / denominators
def optimized_compute_cos2_single_category(single_category_df, model, coords):
    """Vectorized cos2 computation for one categorical variable.

    Computes, for every dimension ``d`` (column of ``coords``)::

        sum_c (sum_i dummy[i, c] * coords[i, d] * w[i]) ** 2 / prop[c]
        -------------------------------------------------------------
                      sum_i coords[i, d] ** 2 * w[i]

    where ``c`` runs over the columns of ``single_category_df`` and ``w`` is
    ``model.row_weights``. The inner sums for all categories and dimensions
    are obtained with a single matrix product instead of nested Python
    loops; the result matches the loop version up to floating-point rounding.

    Parameters
    ----------
    single_category_df : DataFrame of dummy columns for one variable.
    model : object exposing ``row_weights`` (one weight per row) and ``prop``
        (per-category divisors, indexable by dummy column name, or ``None``).
    coords : DataFrame of row coordinates, one column per dimension.

    Returns
    -------
    NumPy array with one value per column of ``coords``. When ``model.prop``
    is ``None`` the numerators are all zero, which is what
    ``original_compute_cos2_single_category`` yields in that case.
    """
    row_weights = np.asarray(model.row_weights, dtype=float)
    coord_values = np.asarray(coords.values, dtype=float)

    if model.prop is None:
        # Same outcome as the baseline's guard: nothing is accumulated.
        numerators = np.zeros(coord_values.shape[1])
    else:
        indicators = np.asarray(single_category_df.values, dtype=float)
        # Look divisors up by column label so any mapping-like ``prop``
        # (Series, dict, ...) keeps working, whatever its ordering.
        divisors = np.array(
            [model.prop[col] for col in single_category_df.columns],
            dtype=float,
        )
        # (n_rows, n_dims): each coordinate scaled by its row weight.
        weighted_coords = coord_values * row_weights[:, None]
        # (n_categories, n_dims): weighted coordinate sum per category.
        projections = indicators.T @ weighted_coords
        numerators = (projections ** 2 / divisors[:, None]).sum(axis=0)

    # Weighted sum of squared coordinates, one entry per dimension.
    summed_weights = ((coord_values ** 2).T * row_weights).sum(axis=1)
    return numerators / summed_weights
# The snippet calls timeit but never imported it; import it here so the
# benchmark runs on its own.
import timeit

# Measure the execution time of the original function
# (total wall time of 10 calls, not the per-call average).
original_time = timeit.timeit(
    stmt="original_compute_cos2_single_category(dummy_large, model_large, coords_large)",
    globals=globals(),
    number=10,
)

# Measure the execution time of the optimized function
# (same inputs and same number of calls, so the totals are comparable).
optimized_time = timeit.timeit(
    stmt="optimized_compute_cos2_single_category(dummy_large, model_large, coords_large)",
    globals=globals(),
    number=10,
)

print(f"Original function time: {original_time:.4f} seconds")
print(f"Optimized function time: {optimized_time:.4f} seconds")
The results are as follows:
Original function time: 20.0161 seconds
Optimized function time: 2.9835 seconds
Computing cos2 for categorical data with a high number of modalities can take some time. This change improves the computation speed while keeping the same results.
Here is a speed comparison, with results:
The results are as follows: original function time: 20.0161 seconds; optimized function time: 2.9835 seconds.