david26694 / cluster-experiments

Simulation-based power analysis library
https://david26694.github.io/cluster-experiments/
MIT License

feat: add experiment analysis functionalities #185

Open ludovico-lanni opened 3 months ago

ludovico-lanni commented 3 months ago
import pandas as pd
import numpy as np

# Constants
NUM_ORDERS = 10000
NUM_CUSTOMERS = 3000
EXPERIMENT_GROUPS = ['control', 'treatment_1', 'treatment_2']
GROUP_SIZE = NUM_CUSTOMERS // len(EXPERIMENT_GROUPS)

# Seed for reproducibility
np.random.seed(42)

# Generate customer_ids
customer_ids = np.arange(1, NUM_CUSTOMERS + 1)

# Shuffle and split customer_ids into experiment groups
np.random.shuffle(customer_ids)
experiment_group = np.repeat(EXPERIMENT_GROUPS, GROUP_SIZE)
experiment_group = np.concatenate((experiment_group, np.random.choice(EXPERIMENT_GROUPS, NUM_CUSTOMERS - len(experiment_group))))

# Assign customers to groups
customer_group_mapping = dict(zip(customer_ids, experiment_group))

# Generate orders
order_ids = np.arange(1, NUM_ORDERS + 1)
customers = np.random.choice(customer_ids, NUM_ORDERS)
order_values = np.abs(np.random.normal(loc=10, scale=2, size=NUM_ORDERS))  # Normally distributed around 10 and positive
order_delivery_times = np.abs(np.random.normal(loc=30, scale=5, size=NUM_ORDERS))  # Normally distributed around 30 minutes and positive
order_city_codes = np.random.randint(1, 3, NUM_ORDERS)  # Random city codes between 1 and 2

# Create DataFrame
data = {
    'order_id': order_ids,
    'customer_id': customers,
    'experiment_group': [customer_group_mapping[customer_id] for customer_id in customers],
    'order_value': order_values,
    'order_delivery_time_in_minutes': order_delivery_times,
    'order_city_code': order_city_codes
}

df = pd.DataFrame(data)
df.order_city_code = df.order_city_code.astype(str)

# Show the first few rows of the DataFrame
print(df.head())

from cluster_experiments.analysis_plan import AnalysisPlan
from cluster_experiments.metric import SimpleMetric
from cluster_experiments.dimension import Dimension
from cluster_experiments.variant import Variant
from cluster_experiments.hypothesis_test import HypothesisTest

dimension__city_code = Dimension(
    name='order_city_code',
    values=['1','2']
)

metric__order_value = SimpleMetric(
    alias='AOV',
    name='order_value'
)

metric__delivery_time = SimpleMetric(
    alias='AVG DT',
    name='order_delivery_time_in_minutes'
)

test__order_value = HypothesisTest(
    metric=metric__order_value,
    analysis_type="clustered_ols",
    analysis_config={"cluster_cols":["customer_id"]},
    dimensions=[dimension__city_code]
)

test__delivery_time = HypothesisTest(
    metric=metric__delivery_time,
    analysis_type="gee",
    analysis_config={"cluster_cols":["customer_id"]}
)

variants = [
    Variant('control', is_control=True),
    Variant('treatment_1', is_control=False),
    Variant('treatment_2', is_control=False)
]

analysis_plan = AnalysisPlan(
    tests=[test__order_value, test__delivery_time],
    variants=variants,
    variant_col='experiment_group',
    alpha=0.01
)

results = analysis_plan.analyze(exp_data=df)

print(results)
codecov-commenter commented 3 months ago

:warning: Please install the Codecov app to ensure uploads and comments are reliably processed by Codecov.

Codecov Report

Attention: Patch coverage is 32.70440% with 214 lines in your changes missing coverage. Please review.

Project coverage is 83.16%. Comparing base (d4ea384) to head (0cc21d3).

| Files with missing lines | Patch % | Lines |
|---|---|---|
| cluster_experiments/inference/analysis_plan.py | 0.00% | 71 Missing :warning: |
| cluster_experiments/experiment_analysis.py | 28.57% | 55 Missing :warning: |
| cluster_experiments/inference/hypothesis_test.py | 29.85% | 47 Missing :warning: |
| cluster_experiments/inference/metric.py | 51.16% | 21 Missing :warning: |
| cluster_experiments/inference/dimension.py | 52.17% | 11 Missing :warning: |
| cluster_experiments/inference/variant.py | 58.33% | 5 Missing :warning: |
| cluster_experiments/inference/analysis_results.py | 84.00% | 4 Missing :warning: |


Additional details and impacted files

```diff
@@            Coverage Diff             @@
##             main     #185       +/-   ##
===========================================
- Coverage   96.77%   83.16%   -13.62%
===========================================
  Files          10       16        +6
  Lines        1179     1497      +318
===========================================
+ Hits         1141     1245      +104
- Misses         38      252      +214
```


david26694 commented 3 months ago

hey, I like the flexibility here, but I can also think of many cases where we use the same dimensions and analysis type for the whole analysis plan. Therefore, I would also allow for this API:

import pandas as pd
import numpy as np

# Constants
NUM_ORDERS = 10000
NUM_CUSTOMERS = 3000
EXPERIMENT_GROUPS = ['control', 'treatment_1', 'treatment_2']
GROUP_SIZE = NUM_CUSTOMERS // len(EXPERIMENT_GROUPS)

# Seed for reproducibility
np.random.seed(42)

# Generate customer_ids
customer_ids = np.arange(1, NUM_CUSTOMERS + 1)

# Shuffle and split customer_ids into experiment groups
np.random.shuffle(customer_ids)
experiment_group = np.repeat(EXPERIMENT_GROUPS, GROUP_SIZE)
experiment_group = np.concatenate((experiment_group, np.random.choice(EXPERIMENT_GROUPS, NUM_CUSTOMERS - len(experiment_group))))

# Assign customers to groups
customer_group_mapping = dict(zip(customer_ids, experiment_group))

# Generate orders
order_ids = np.arange(1, NUM_ORDERS + 1)
customers = np.random.choice(customer_ids, NUM_ORDERS)
order_values = np.abs(np.random.normal(loc=10, scale=2, size=NUM_ORDERS))  # Normally distributed around 10 and positive
order_delivery_times = np.abs(np.random.normal(loc=30, scale=5, size=NUM_ORDERS))  # Normally distributed around 30 minutes and positive
order_city_codes = np.random.randint(1, 3, NUM_ORDERS)  # Random city codes between 1 and 2

# Create DataFrame
data = {
    'order_id': order_ids,
    'customer_id': customers,
    'experiment_group': [customer_group_mapping[customer_id] for customer_id in customers],
    'order_value': order_values,
    'order_delivery_time_in_minutes': order_delivery_times,
    'order_city_code': order_city_codes
}

df = pd.DataFrame(data)
df.order_city_code = df.order_city_code.astype(str)

# Show the first few rows of the DataFrame
print(df.head())

from cluster_experiments.analysis_plan import AnalysisPlan
from cluster_experiments.metric import SimpleMetric
from cluster_experiments.dimension import Dimension
from cluster_experiments.variant import Variant
from cluster_experiments.hypothesis_test import HypothesisTest

dimension__city_code = Dimension(
    name='order_city_code',
    values=['1','2']
)

metric__order_value = SimpleMetric(
    alias='AOV',
    name='order_value'
)

metric__delivery_time = SimpleMetric(
    alias='AVG DT',
    name='order_delivery_time_in_minutes'
)

variants = [
    Variant('control', is_control=True),
    Variant('treatment_1', is_control=False),
    Variant('treatment_2', is_control=False)
]

# this next line or
# analysis_plan = SimpleAnalysisPlan(
# or
# analysis_plan = AnalysisPlan.from_raw(
analysis_plan = AnalysisPlan.from_metrics(
    metrics=[metric__delivery_time, metric__order_value],
    variants=variants,
    variant_col='experiment_group',
    alpha=0.01,
    dimensions=[dimension__city_code],
    analysis_type="clustered_ols",
    analysis_config={"cluster_cols":["customer_id"]},
)

results = analysis_plan.analyze(exp_data=df)

print(results)

wdyt?

david26694 commented 3 months ago

I guess we're also missing:

ludovico-lanni commented 3 months ago

Addressed some of the proposed changes and added cupac support in the analysis flow:

import pandas as pd
import numpy as np

# Constants
NUM_ORDERS = 10000
NUM_CUSTOMERS = 3000
EXPERIMENT_GROUPS = ['control', 'treatment_1', 'treatment_2']
GROUP_SIZE = NUM_CUSTOMERS // len(EXPERIMENT_GROUPS)

# Seed for reproducibility
np.random.seed(42)

# Generate customer_ids
customer_ids = np.arange(1, NUM_CUSTOMERS + 1)

# Shuffle and split customer_ids into experiment groups
np.random.shuffle(customer_ids)
experiment_group = np.repeat(EXPERIMENT_GROUPS, GROUP_SIZE)
experiment_group = np.concatenate((experiment_group, np.random.choice(EXPERIMENT_GROUPS, NUM_CUSTOMERS - len(experiment_group))))

# Assign customers to groups
customer_group_mapping = dict(zip(customer_ids, experiment_group))

# Generate orders
order_ids = np.arange(1, NUM_ORDERS + 1)
customers = np.random.choice(customer_ids, NUM_ORDERS)
order_values = np.abs(np.random.normal(loc=10, scale=2, size=NUM_ORDERS))  # Normally distributed around 10 and positive
order_delivery_times = np.abs(np.random.normal(loc=30, scale=5, size=NUM_ORDERS))  # Normally distributed around 30 minutes and positive
order_city_codes = np.random.randint(1, 3, NUM_ORDERS)  # Random city codes between 1 and 2

# Create DataFrame
data = {
    'order_id': order_ids,
    'customer_id': customers,
    'experiment_group': [customer_group_mapping[customer_id] for customer_id in customers],
    'order_value': order_values,
    'order_delivery_time_in_minutes': order_delivery_times,
    'order_city_code': order_city_codes
}

df = pd.DataFrame(data)
df.order_city_code = df.order_city_code.astype(str)

pre_exp_df = df.assign(
    order_value = lambda df: df['order_value'] + np.random.normal(loc=0, scale=1, size=NUM_ORDERS),
    order_delivery_time_in_minutes = lambda df: df['order_delivery_time_in_minutes'] + np.random.normal(loc=0, scale=2, size=NUM_ORDERS)
).sample(int(NUM_ORDERS/3))

# Show the first few rows of the DataFrame
print(df.head())
print(pre_exp_df.head())

from cluster_experiments.analytics.analysis_plan import AnalysisPlan
from cluster_experiments.analytics.metric import SimpleMetric
from cluster_experiments.analytics.dimension import Dimension
from cluster_experiments.analytics.variant import Variant
from cluster_experiments.analytics.hypothesis_test import HypothesisTest
from cluster_experiments import TargetAggregation

dimension__city_code = Dimension(
    name='order_city_code',
    values=['1','2']
)

metric__order_value = SimpleMetric(
    alias='AOV',
    name='order_value'
)

metric__delivery_time = SimpleMetric(
    alias='AVG DT',
    name='order_delivery_time_in_minutes'
)

test__order_value = HypothesisTest(
    metric=metric__order_value,
    analysis_type="clustered_ols",
    analysis_config={"cluster_cols":["customer_id"]},
    dimensions=[dimension__city_code]
)

cupac__model = TargetAggregation(agg_col="customer_id", target_col="order_delivery_time_in_minutes")

test__delivery_time = HypothesisTest(
    metric=metric__delivery_time,
    analysis_type="gee",
    analysis_config={"cluster_cols":["customer_id"]},
    cupac_config={"cupac_model":cupac__model,
                  "target_col":"order_delivery_time_in_minutes"}
)

variants = [
    Variant('control', is_control=True),
    Variant('treatment_1', is_control=False),
    Variant('treatment_2', is_control=False)
]

analysis_plan = AnalysisPlan(
    tests=[test__order_value, test__delivery_time],
    variants=variants,
    variant_col='experiment_group',
    alpha=0.01
)

results = analysis_plan.analyze(exp_data=df, pre_exp_data=pre_exp_df)

print(results)

results_df = pd.DataFrame(results)
ludovico-lanni commented 3 months ago

I covered all of the points we discussed :)

The only big thing still missing now is unit tests. About initialising everything from config, I believe we can do it in a second iteration, as it is not a hard requirement and the interface already looks quite simple to use... wdyt?
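
To make that second iteration concrete, here is a minimal sketch of how a config-based entry point could simply wrap the existing from_metrics constructor. The helper name analysis_plan_from_config and the config dict layout are illustrative assumptions, not part of this PR:

# Hypothetical sketch, not part of this PR: build an AnalysisPlan from a
# plain config dict by delegating to the existing from_metrics constructor.
from cluster_experiments.inference.analysis_plan import AnalysisPlan
from cluster_experiments.inference.metric import SimpleMetric
from cluster_experiments.inference.dimension import Dimension
from cluster_experiments.inference.variant import Variant

def analysis_plan_from_config(config: dict) -> AnalysisPlan:
    metrics = [SimpleMetric(**m) for m in config["metrics"]]  # {"alias": ..., "name": ...}
    variants = [Variant(**v) for v in config["variants"]]  # assumes {"name": ..., "is_control": ...}
    dimensions = [Dimension(**d) for d in config.get("dimensions", [])]
    return AnalysisPlan.from_metrics(
        metrics=metrics,
        variants=variants,
        variant_col=config["variant_col"],
        alpha=config.get("alpha", 0.05),  # 0.05 is an assumed default
        dimensions=dimensions,
        analysis_type=config["analysis_type"],
        analysis_config=config.get("analysis_config", {}),
    )

A user could then keep the whole plan in a YAML or JSON file and load it into that dict.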

Check this interface now:

#%%
import pandas as pd
import numpy as np

# Constants
NUM_ORDERS = 10000
NUM_CUSTOMERS = 3000
EXPERIMENT_GROUPS = ['control', 'treatment_1', 'treatment_2']
GROUP_SIZE = NUM_CUSTOMERS // len(EXPERIMENT_GROUPS)

# Seed for reproducibility
np.random.seed(42)

# Generate customer_ids
customer_ids = np.arange(1, NUM_CUSTOMERS + 1)

# Shuffle and split customer_ids into experiment groups
np.random.shuffle(customer_ids)
experiment_group = np.repeat(EXPERIMENT_GROUPS, GROUP_SIZE)
experiment_group = np.concatenate((experiment_group, np.random.choice(EXPERIMENT_GROUPS, NUM_CUSTOMERS - len(experiment_group))))

# Assign customers to groups
customer_group_mapping = dict(zip(customer_ids, experiment_group))

# Generate orders
order_ids = np.arange(1, NUM_ORDERS + 1)
customers = np.random.choice(customer_ids, NUM_ORDERS)
order_values = np.abs(np.random.normal(loc=10, scale=2, size=NUM_ORDERS))  # Normally distributed around 10 and positive
order_delivery_times = np.abs(np.random.normal(loc=30, scale=5, size=NUM_ORDERS))  # Normally distributed around 30 minutes and positive
order_city_codes = np.random.randint(1, 3, NUM_ORDERS)  # Random city codes between 1 and 2

# Create DataFrame
data = {
    'order_id': order_ids,
    'customer_id': customers,
    'experiment_group': [customer_group_mapping[customer_id] for customer_id in customers],
    'order_value': order_values,
    'order_delivery_time_in_minutes': order_delivery_times,
    'order_city_code': order_city_codes
}

df = pd.DataFrame(data)
df.order_city_code = df.order_city_code.astype(str)

pre_exp_df = df.assign(
    order_value = lambda df: df['order_value'] + np.random.normal(loc=0, scale=1, size=NUM_ORDERS),
    order_delivery_time_in_minutes = lambda df: df['order_delivery_time_in_minutes'] + np.random.normal(loc=0, scale=2, size=NUM_ORDERS)
).sample(int(NUM_ORDERS/3))

# Show the first few rows of the DataFrame
print(df.head())
print(pre_exp_df.head())

from cluster_experiments.inference.analysis_plan import AnalysisPlan
from cluster_experiments.inference.metric import SimpleMetric
from cluster_experiments.inference.dimension import Dimension
from cluster_experiments.inference.variant import Variant
from cluster_experiments.inference.hypothesis_test import HypothesisTest
from cluster_experiments import TargetAggregation

dimension__city_code = Dimension(
    name='order_city_code',
    values=['1','2']
)

metric__order_value = SimpleMetric(
    alias='AOV',
    name='order_value'
)

metric__delivery_time = SimpleMetric(
    alias='AVG DT',
    name='order_delivery_time_in_minutes'
)

test__order_value = HypothesisTest(
    metric=metric__order_value,
    analysis_type="clustered_ols",
    analysis_config={"cluster_cols":["customer_id"]},
    dimensions=[dimension__city_code]
)

cupac__model = TargetAggregation(agg_col="customer_id", target_col="order_delivery_time_in_minutes")

test__delivery_time = HypothesisTest(
    metric=metric__delivery_time,
    analysis_type="gee",
    analysis_config={"cluster_cols":["customer_id"], "covariates":["estimate_order_delivery_time_in_minutes"]},
    cupac_config={"cupac_model":cupac__model,
                  "target_col":"order_delivery_time_in_minutes"}
)

variants = [
    Variant('control', is_control=True),
    Variant('treatment_1', is_control=False),
    Variant('treatment_2', is_control=False)
]

analysis_plan = AnalysisPlan(
    tests=[test__order_value, test__delivery_time],
    variants=variants,
    variant_col='experiment_group',
    alpha=0.01
)

results = analysis_plan.analyze(exp_data=df, pre_exp_data=pre_exp_df)

print(results)

results_df = results.to_dataframe()

#%%

simple_analysis_plan = AnalysisPlan.from_metrics(
    metrics=[metric__delivery_time, metric__order_value],
    variants=variants,
    variant_col='experiment_group',
    alpha=0.01,
    dimensions=[dimension__city_code],
    analysis_type="clustered_ols",
    analysis_config={"cluster_cols":["customer_id"]},
)

simple_results = simple_analysis_plan.analyze(exp_data=df, verbose=True)

simple_results_df = simple_results.to_dataframe()
david26694 commented 3 months ago

> About initialising everything from config, I believe we can do it in a second iteration, as it is not a hard requirement and the interface already looks quite simple to use... wdyt?

agree! but have a look at the ExperimentAnalysis comment then

ludovico-lanni commented 3 weeks ago

Addressed all the points you reviewed. To keep the handling of the analysis class flexible and extensible, I proposed a simple enough solution: an optional custom mapper the user can provide. Check it out at the end of this script.

#%%
import pandas as pd
import numpy as np

# Constants
NUM_ORDERS = 10000
NUM_CUSTOMERS = 3000
EXPERIMENT_GROUPS = ['control', 'treatment_1', 'treatment_2']
GROUP_SIZE = NUM_CUSTOMERS // len(EXPERIMENT_GROUPS)

# Seed for reproducibility
np.random.seed(42)

# Generate customer_ids
customer_ids = np.arange(1, NUM_CUSTOMERS + 1)

# Shuffle and split customer_ids into experiment groups
np.random.shuffle(customer_ids)
experiment_group = np.repeat(EXPERIMENT_GROUPS, GROUP_SIZE)
experiment_group = np.concatenate((experiment_group, np.random.choice(EXPERIMENT_GROUPS, NUM_CUSTOMERS - len(experiment_group))))

# Assign customers to groups
customer_group_mapping = dict(zip(customer_ids, experiment_group))

# Generate orders
order_ids = np.arange(1, NUM_ORDERS + 1)
customers = np.random.choice(customer_ids, NUM_ORDERS)
order_values = np.abs(np.random.normal(loc=10, scale=2, size=NUM_ORDERS))  # Normally distributed around 10 and positive
order_delivery_times = np.abs(np.random.normal(loc=30, scale=5, size=NUM_ORDERS))  # Normally distributed around 30 minutes and positive
order_city_codes = np.random.randint(1, 3, NUM_ORDERS)  # Random city codes between 1 and 2

# Create DataFrame
data = {
    'order_id': order_ids,
    'customer_id': customers,
    'experiment_group': [customer_group_mapping[customer_id] for customer_id in customers],
    'order_value': order_values,
    'order_delivery_time_in_minutes': order_delivery_times,
    'order_city_code': order_city_codes
}

df = pd.DataFrame(data)
df.order_city_code = df.order_city_code.astype(str)

pre_exp_df = df.assign(
    order_value = lambda df: df['order_value'] + np.random.normal(loc=0, scale=1, size=NUM_ORDERS),
    order_delivery_time_in_minutes = lambda df: df['order_delivery_time_in_minutes'] + np.random.normal(loc=0, scale=2, size=NUM_ORDERS)
).sample(int(NUM_ORDERS/3))

# Show the first few rows of the DataFrame
print(df.head())
print(pre_exp_df.head())

from cluster_experiments.inference.analysis_plan import AnalysisPlan
from cluster_experiments.inference.metric import SimpleMetric
from cluster_experiments.inference.dimension import Dimension
from cluster_experiments.inference.variant import Variant
from cluster_experiments.inference.hypothesis_test import HypothesisTest
from cluster_experiments import TargetAggregation

dimension__city_code = Dimension(
    name='order_city_code',
    values=['1','2']
)

metric__order_value = SimpleMetric(
    alias='AOV',
    name='order_value'
)

metric__delivery_time = SimpleMetric(
    alias='AVG DT',
    name='order_delivery_time_in_minutes'
)

test__order_value = HypothesisTest(
    metric=metric__order_value,
    analysis_type="clustered_ols",
    analysis_config={"cluster_cols":["customer_id"]},
    dimensions=[dimension__city_code]
)

cupac__model = TargetAggregation(agg_col="customer_id", target_col="order_delivery_time_in_minutes")

test__delivery_time = HypothesisTest(
    metric=metric__delivery_time,
    analysis_type="gee",
    analysis_config={"cluster_cols":["customer_id"], "covariates":["estimate_order_delivery_time_in_minutes"]},
    cupac_config={"cupac_model":cupac__model,
                  "target_col":"order_delivery_time_in_minutes"}
)

variants = [
    Variant('control', is_control=True),
    Variant('treatment_1', is_control=False),
    Variant('treatment_2', is_control=False)
]

analysis_plan = AnalysisPlan(
    tests=[test__order_value, test__delivery_time],
    variants=variants,
    variant_col='experiment_group',
    alpha=0.01
)

results = analysis_plan.analyze(exp_data=df, pre_exp_data=pre_exp_df)

print(results)

results_df = results.to_dataframe()

#%% Run a simple analysis plan with two metrics and one dimension

simple_analysis_plan = AnalysisPlan.from_metrics(
    metrics=[metric__delivery_time, metric__order_value],
    variants=variants,
    variant_col='experiment_group',
    alpha=0.01,
    dimensions=[dimension__city_code],
    analysis_type="clustered_ols",
    analysis_config={"cluster_cols":["customer_id"]},
)

simple_results = simple_analysis_plan.analyze(exp_data=df, verbose=True)

simple_results_df = simple_results.to_dataframe()

#%% Run a simple analysis plan with one metric and one dimension and by using a custom ExperimentAnalysis class

from cluster_experiments.experiment_analysis import ClusteredOLSAnalysis

class CustomExperimentAnalysis(ClusteredOLSAnalysis):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

custom_simple_analysis_plan = AnalysisPlan.from_metrics(
    metrics=[metric__order_value],
    variants=variants,
    variant_col='experiment_group',
    alpha=0.01,
    dimensions=[dimension__city_code],
    analysis_type="custom_clustered_ols",
    analysis_config={"cluster_cols":["customer_id"]},
    custom_analysis_type_mapper={"custom_clustered_ols": CustomExperimentAnalysis}
)

custom_simple_results = custom_simple_analysis_plan.analyze(exp_data=df, verbose=True)

custom_simple_results_df = custom_simple_results.to_dataframe()
ludovico-lanni commented 3 weeks ago

We are still missing the unit tests. Should we proceed? Are we happy with the interface? @david26694

david26694 commented 1 week ago

> We are still missing the unit tests. Should we proceed? Are we happy with the interface? @david26694

happy with the interface! also, feel free to remove Python 3.8 from the GitHub workflow :)