ludovico-lanni opened this 3 months ago
:warning: Please install the Codecov GitHub app to ensure uploads and comments are reliably processed by Codecov.
Attention: Patch coverage is 32.70440% with 214 lines in your changes missing coverage. Please review.
Project coverage is 83.16%. Comparing base (d4ea384) to head (0cc21d3).
Hey, I like the flexibility here, but I can also think of many cases where we use the same dimensions and analysis type for the whole analysis plan. Therefore, I would also allow for this API:
import pandas as pd
import numpy as np
# Constants
NUM_ORDERS = 10000
NUM_CUSTOMERS = 3000
EXPERIMENT_GROUPS = ['control', 'treatment_1', 'treatment_2']
GROUP_SIZE = NUM_CUSTOMERS // len(EXPERIMENT_GROUPS)
# Seed for reproducibility
np.random.seed(42)
# Generate customer_ids
customer_ids = np.arange(1, NUM_CUSTOMERS + 1)
# Shuffle and split customer_ids into experiment groups
np.random.shuffle(customer_ids)
experiment_group = np.repeat(EXPERIMENT_GROUPS, GROUP_SIZE)
experiment_group = np.concatenate((experiment_group, np.random.choice(EXPERIMENT_GROUPS, NUM_CUSTOMERS - len(experiment_group))))
# Assign customers to groups
customer_group_mapping = dict(zip(customer_ids, experiment_group))
# Generate orders
order_ids = np.arange(1, NUM_ORDERS + 1)
customers = np.random.choice(customer_ids, NUM_ORDERS)
order_values = np.abs(np.random.normal(loc=10, scale=2, size=NUM_ORDERS)) # Normally distributed around 10 and positive
order_delivery_times = np.abs(np.random.normal(loc=30, scale=5, size=NUM_ORDERS)) # Normally distributed around 30 minutes and positive
order_city_codes = np.random.randint(1, 3, NUM_ORDERS) # Random city codes between 1 and 2
# Create DataFrame
data = {
'order_id': order_ids,
'customer_id': customers,
'experiment_group': [customer_group_mapping[customer_id] for customer_id in customers],
'order_value': order_values,
'order_delivery_time_in_minutes': order_delivery_times,
'order_city_code': order_city_codes
}
df = pd.DataFrame(data)
df.order_city_code = df.order_city_code.astype(str)
# Show the first few rows of the DataFrame
print(df.head())
from cluster_experiments.analysis_plan import AnalysisPlan
from cluster_experiments.metric import SimpleMetric
from cluster_experiments.dimension import Dimension
from cluster_experiments.variant import Variant
from cluster_experiments.hypothesis_test import HypothesisTest
dimension__city_code = Dimension(
name='order_city_code',
values=['1','2']
)
metric__order_value = SimpleMetric(
alias='AOV',
name='order_value'
)
metric__delivery_time = SimpleMetric(
alias='AVG DT',
name='order_delivery_time_in_minutes'
)
variants = [
Variant('control', is_control=True),
Variant('treatment_1', is_control=False),
Variant('treatment_2', is_control=False)
]
# The next line could also be spelled as, e.g.,
# analysis_plan = SimpleAnalysisPlan(
# or
# analysis_plan = AnalysisPlan.from_raw(
analysis_plan = AnalysisPlan.from_metrics(
metrics=[metric__delivery_time, metric__order_value],
variants=variants,
variant_col='experiment_group',
alpha=0.01,
dimensions=[dimension__city_code],
analysis_type="clustered_ols",
analysis_config={"cluster_cols":["customer_id"]},
)
results = analysis_plan.analyze(exp_data=df)
print(results)
wdyt?
I guess we're also missing:
Addressed some of the changes that were proposed and added CUPAC support in the analysis flow:
import pandas as pd
import numpy as np
# Constants
NUM_ORDERS = 10000
NUM_CUSTOMERS = 3000
EXPERIMENT_GROUPS = ['control', 'treatment_1', 'treatment_2']
GROUP_SIZE = NUM_CUSTOMERS // len(EXPERIMENT_GROUPS)
# Seed for reproducibility
np.random.seed(42)
# Generate customer_ids
customer_ids = np.arange(1, NUM_CUSTOMERS + 1)
# Shuffle and split customer_ids into experiment groups
np.random.shuffle(customer_ids)
experiment_group = np.repeat(EXPERIMENT_GROUPS, GROUP_SIZE)
experiment_group = np.concatenate((experiment_group, np.random.choice(EXPERIMENT_GROUPS, NUM_CUSTOMERS - len(experiment_group))))
# Assign customers to groups
customer_group_mapping = dict(zip(customer_ids, experiment_group))
# Generate orders
order_ids = np.arange(1, NUM_ORDERS + 1)
customers = np.random.choice(customer_ids, NUM_ORDERS)
order_values = np.abs(np.random.normal(loc=10, scale=2, size=NUM_ORDERS)) # Normally distributed around 10 and positive
order_delivery_times = np.abs(np.random.normal(loc=30, scale=5, size=NUM_ORDERS)) # Normally distributed around 30 minutes and positive
order_city_codes = np.random.randint(1, 3, NUM_ORDERS) # Random city codes between 1 and 2
# Create DataFrame
data = {
'order_id': order_ids,
'customer_id': customers,
'experiment_group': [customer_group_mapping[customer_id] for customer_id in customers],
'order_value': order_values,
'order_delivery_time_in_minutes': order_delivery_times,
'order_city_code': order_city_codes
}
df = pd.DataFrame(data)
df.order_city_code = df.order_city_code.astype(str)
pre_exp_df = df.assign(
order_value = lambda df: df['order_value'] + np.random.normal(loc=0, scale=1, size=NUM_ORDERS),
order_delivery_time_in_minutes = lambda df: df['order_delivery_time_in_minutes'] + np.random.normal(loc=0, scale=2, size=NUM_ORDERS)
).sample(int(NUM_ORDERS/3))
# Show the first few rows of the DataFrame
print(df.head())
print(pre_exp_df.head())
from cluster_experiments.analytics.analysis_plan import AnalysisPlan
from cluster_experiments.analytics.metric import SimpleMetric
from cluster_experiments.analytics.dimension import Dimension
from cluster_experiments.analytics.variant import Variant
from cluster_experiments.analytics.hypothesis_test import HypothesisTest
from cluster_experiments import TargetAggregation
dimension__city_code = Dimension(
name='order_city_code',
values=['1','2']
)
metric__order_value = SimpleMetric(
alias='AOV',
name='order_value'
)
metric__delivery_time = SimpleMetric(
alias='AVG DT',
name='order_delivery_time_in_minutes'
)
test__order_value = HypothesisTest(
metric=metric__order_value,
analysis_type="clustered_ols",
analysis_config={"cluster_cols":["customer_id"]},
dimensions=[dimension__city_code]
)
cupac__model = TargetAggregation(agg_col="customer_id", target_col="order_delivery_time_in_minutes")
test__delivery_time = HypothesisTest(
metric=metric__delivery_time,
analysis_type="gee",
analysis_config={"cluster_cols":["customer_id"]},
cupac_config={"cupac_model":cupac__model,
"target_col":"order_delivery_time_in_minutes"}
)
variants = [
Variant('control', is_control=True),
Variant('treatment_1', is_control=False),
Variant('treatment_2', is_control=False)
]
analysis_plan = AnalysisPlan(
tests=[test__order_value, test__delivery_time],
variants=variants,
variant_col='experiment_group',
alpha=0.01
)
results = analysis_plan.analyze(exp_data=df, pre_exp_data=pre_exp_df)
print(results)
results_df = pd.DataFrame(results)
I covered all of the points we discussed :)
The only big thing still missing now is the unit tests. About initialising everything from config, I believe we can do it in a second iteration, as it is not a hard requirement and the interface already looks quite simple to use. wdyt?
Check this interface now:
#%%
import pandas as pd
import numpy as np
# Constants
NUM_ORDERS = 10000
NUM_CUSTOMERS = 3000
EXPERIMENT_GROUPS = ['control', 'treatment_1', 'treatment_2']
GROUP_SIZE = NUM_CUSTOMERS // len(EXPERIMENT_GROUPS)
# Seed for reproducibility
np.random.seed(42)
# Generate customer_ids
customer_ids = np.arange(1, NUM_CUSTOMERS + 1)
# Shuffle and split customer_ids into experiment groups
np.random.shuffle(customer_ids)
experiment_group = np.repeat(EXPERIMENT_GROUPS, GROUP_SIZE)
experiment_group = np.concatenate((experiment_group, np.random.choice(EXPERIMENT_GROUPS, NUM_CUSTOMERS - len(experiment_group))))
# Assign customers to groups
customer_group_mapping = dict(zip(customer_ids, experiment_group))
# Generate orders
order_ids = np.arange(1, NUM_ORDERS + 1)
customers = np.random.choice(customer_ids, NUM_ORDERS)
order_values = np.abs(np.random.normal(loc=10, scale=2, size=NUM_ORDERS)) # Normally distributed around 10 and positive
order_delivery_times = np.abs(np.random.normal(loc=30, scale=5, size=NUM_ORDERS)) # Normally distributed around 30 minutes and positive
order_city_codes = np.random.randint(1, 3, NUM_ORDERS) # Random city codes between 1 and 2
# Create DataFrame
data = {
'order_id': order_ids,
'customer_id': customers,
'experiment_group': [customer_group_mapping[customer_id] for customer_id in customers],
'order_value': order_values,
'order_delivery_time_in_minutes': order_delivery_times,
'order_city_code': order_city_codes
}
df = pd.DataFrame(data)
df.order_city_code = df.order_city_code.astype(str)
pre_exp_df = df.assign(
order_value = lambda df: df['order_value'] + np.random.normal(loc=0, scale=1, size=NUM_ORDERS),
order_delivery_time_in_minutes = lambda df: df['order_delivery_time_in_minutes'] + np.random.normal(loc=0, scale=2, size=NUM_ORDERS)
).sample(int(NUM_ORDERS/3))
# Show the first few rows of the DataFrame
print(df.head())
print(pre_exp_df.head())
from cluster_experiments.inference.analysis_plan import AnalysisPlan
from cluster_experiments.inference.metric import SimpleMetric
from cluster_experiments.inference.dimension import Dimension
from cluster_experiments.inference.variant import Variant
from cluster_experiments.inference.hypothesis_test import HypothesisTest
from cluster_experiments import TargetAggregation
dimension__city_code = Dimension(
name='order_city_code',
values=['1','2']
)
metric__order_value = SimpleMetric(
alias='AOV',
name='order_value'
)
metric__delivery_time = SimpleMetric(
alias='AVG DT',
name='order_delivery_time_in_minutes'
)
test__order_value = HypothesisTest(
metric=metric__order_value,
analysis_type="clustered_ols",
analysis_config={"cluster_cols":["customer_id"]},
dimensions=[dimension__city_code]
)
cupac__model = TargetAggregation(agg_col="customer_id", target_col="order_delivery_time_in_minutes")
test__delivery_time = HypothesisTest(
metric=metric__delivery_time,
analysis_type="gee",
analysis_config={"cluster_cols":["customer_id"], "covariates":["estimate_order_delivery_time_in_minutes"]},
cupac_config={"cupac_model":cupac__model,
"target_col":"order_delivery_time_in_minutes"}
)
variants = [
Variant('control', is_control=True),
Variant('treatment_1', is_control=False),
Variant('treatment_2', is_control=False)
]
analysis_plan = AnalysisPlan(
tests=[test__order_value, test__delivery_time],
variants=variants,
variant_col='experiment_group',
alpha=0.01
)
results = analysis_plan.analyze(exp_data=df, pre_exp_data=pre_exp_df)
print(results)
results_df = results.to_dataframe()
#%%
simple_analysis_plan = AnalysisPlan.from_metrics(
metrics=[metric__delivery_time, metric__order_value],
variants=variants,
variant_col='experiment_group',
alpha=0.01,
dimensions=[dimension__city_code],
analysis_type="clustered_ols",
analysis_config={"cluster_cols":["customer_id"]},
)
simple_results = simple_analysis_plan.analyze(exp_data=df, verbose=True)
simple_results_df = simple_results.to_dataframe()
About initialising everything from config, I believe we can do it in a second iteration, as it is not a hard requirement and the interface already looks quite simple to use. wdyt?
Agree! But have a look at the ExperimentAnalysis comment then.
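Just to make the deferred idea concrete, a rough sketch of what initialising everything from config could look like (note: AnalysisPlan.from_dict and the dict layout below are hypothetical, not part of this PR; they simply mirror the from_metrics arguments shown above):
# Hypothetical sketch only: from_dict and this dict layout are not part of the
# current interface; they just mirror the AnalysisPlan.from_metrics arguments above.
plan_config = {
    "metrics": [{"alias": "AOV", "name": "order_value"}],
    "variants": [
        {"name": "control", "is_control": True},
        {"name": "treatment_1", "is_control": False},
        {"name": "treatment_2", "is_control": False},
    ],
    "variant_col": "experiment_group",
    "alpha": 0.01,
    "dimensions": [{"name": "order_city_code", "values": ["1", "2"]}],
    "analysis_type": "clustered_ols",
    "analysis_config": {"cluster_cols": ["customer_id"]},
}
# analysis_plan = AnalysisPlan.from_dict(plan_config)  # hypothetical classmethod
# results_df = analysis_plan.analyze(exp_data=df).to_dataframe()
A dict-based entry point like this would also make loading the same structure from YAML or JSON later on fairly trivial.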
Addressed all the points you reviewed. To keep the handling of the analysis class flexible and extensible, I proposed a simple enough solution: an optional custom mapper from analysis type to ExperimentAnalysis class, in case the user wants to use it. Check it out at the end of this script.
#%%
import pandas as pd
import numpy as np
# Constants
NUM_ORDERS = 10000
NUM_CUSTOMERS = 3000
EXPERIMENT_GROUPS = ['control', 'treatment_1', 'treatment_2']
GROUP_SIZE = NUM_CUSTOMERS // len(EXPERIMENT_GROUPS)
# Seed for reproducibility
np.random.seed(42)
# Generate customer_ids
customer_ids = np.arange(1, NUM_CUSTOMERS + 1)
# Shuffle and split customer_ids into experiment groups
np.random.shuffle(customer_ids)
experiment_group = np.repeat(EXPERIMENT_GROUPS, GROUP_SIZE)
experiment_group = np.concatenate((experiment_group, np.random.choice(EXPERIMENT_GROUPS, NUM_CUSTOMERS - len(experiment_group))))
# Assign customers to groups
customer_group_mapping = dict(zip(customer_ids, experiment_group))
# Generate orders
order_ids = np.arange(1, NUM_ORDERS + 1)
customers = np.random.choice(customer_ids, NUM_ORDERS)
order_values = np.abs(np.random.normal(loc=10, scale=2, size=NUM_ORDERS)) # Normally distributed around 10 and positive
order_delivery_times = np.abs(np.random.normal(loc=30, scale=5, size=NUM_ORDERS)) # Normally distributed around 30 minutes and positive
order_city_codes = np.random.randint(1, 3, NUM_ORDERS) # Random city codes between 1 and 2
# Create DataFrame
data = {
'order_id': order_ids,
'customer_id': customers,
'experiment_group': [customer_group_mapping[customer_id] for customer_id in customers],
'order_value': order_values,
'order_delivery_time_in_minutes': order_delivery_times,
'order_city_code': order_city_codes
}
df = pd.DataFrame(data)
df.order_city_code = df.order_city_code.astype(str)
pre_exp_df = df.assign(
order_value = lambda df: df['order_value'] + np.random.normal(loc=0, scale=1, size=NUM_ORDERS),
order_delivery_time_in_minutes = lambda df: df['order_delivery_time_in_minutes'] + np.random.normal(loc=0, scale=2, size=NUM_ORDERS)
).sample(int(NUM_ORDERS/3))
# Show the first few rows of the DataFrame
print(df.head())
print(pre_exp_df.head())
from cluster_experiments.inference.analysis_plan import AnalysisPlan
from cluster_experiments.inference.metric import SimpleMetric
from cluster_experiments.inference.dimension import Dimension
from cluster_experiments.inference.variant import Variant
from cluster_experiments.inference.hypothesis_test import HypothesisTest
from cluster_experiments import TargetAggregation
dimension__city_code = Dimension(
name='order_city_code',
values=['1','2']
)
metric__order_value = SimpleMetric(
alias='AOV',
name='order_value'
)
metric__delivery_time = SimpleMetric(
alias='AVG DT',
name='order_delivery_time_in_minutes'
)
test__order_value = HypothesisTest(
metric=metric__order_value,
analysis_type="clustered_ols",
analysis_config={"cluster_cols":["customer_id"]},
dimensions=[dimension__city_code]
)
cupac__model = TargetAggregation(agg_col="customer_id", target_col="order_delivery_time_in_minutes")
test__delivery_time = HypothesisTest(
metric=metric__delivery_time,
analysis_type="gee",
analysis_config={"cluster_cols":["customer_id"], "covariates":["estimate_order_delivery_time_in_minutes"]},
cupac_config={"cupac_model":cupac__model,
"target_col":"order_delivery_time_in_minutes"}
)
variants = [
Variant('control', is_control=True),
Variant('treatment_1', is_control=False),
Variant('treatment_2', is_control=False)
]
analysis_plan = AnalysisPlan(
tests=[test__order_value, test__delivery_time],
variants=variants,
variant_col='experiment_group',
alpha=0.01
)
results = analysis_plan.analyze(exp_data=df, pre_exp_data=pre_exp_df)
print(results)
results_df = results.to_dataframe()
#%% Run a simple analysis plan with two metrics and one dimension
simple_analysis_plan = AnalysisPlan.from_metrics(
metrics=[metric__delivery_time, metric__order_value],
variants=variants,
variant_col='experiment_group',
alpha=0.01,
dimensions=[dimension__city_code],
analysis_type="clustered_ols",
analysis_config={"cluster_cols":["customer_id"]},
)
simple_results = simple_analysis_plan.analyze(exp_data=df, verbose=True)
simple_results_df = simple_results.to_dataframe()
#%% Run a simple analysis plan with one metric and one dimension and by using a custom ExperimentAnalysis class
from cluster_experiments.experiment_analysis import ClusteredOLSAnalysis
class CustomExperimentAnalysis(ClusteredOLSAnalysis):
    # Minimal subclass just to demonstrate plugging a custom ExperimentAnalysis
    # class into the plan via custom_analysis_type_mapper
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
custom_simple_analysis_plan = AnalysisPlan.from_metrics(
metrics=[metric__order_value],
variants=variants,
variant_col='experiment_group',
alpha=0.01,
dimensions=[dimension__city_code],
analysis_type="custom_clustered_ols",
analysis_config={"cluster_cols":["customer_id"]},
custom_analysis_type_mapper={"custom_clustered_ols": CustomExperimentAnalysis}
)
custom_simple_results = custom_simple_analysis_plan.analyze(exp_data=df, verbose=True)
custom_simple_results_df = custom_simple_results.to_dataframe()
We are still missing the unit tests. Should we proceed? Are we happy with the interface? @david26694
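For the unit tests, a minimal pytest sketch against the interface above could be a starting point (assumptions: pytest as the runner, a small synthetic frame instead of the full generator above, and deliberately loose assertions since the exact results schema is not pinned down here):
# Sketch of a unit test for the interface above (assumptions: pytest as the test
# runner, and that to_dataframe() returns a non-empty pandas DataFrame).
import numpy as np
import pandas as pd
import pytest

from cluster_experiments.inference.analysis_plan import AnalysisPlan
from cluster_experiments.inference.metric import SimpleMetric
from cluster_experiments.inference.dimension import Dimension
from cluster_experiments.inference.variant import Variant


@pytest.fixture
def exp_data():
    # Small deterministic frame with the same columns used in the examples above
    rng = np.random.default_rng(42)
    n = 600
    return pd.DataFrame(
        {
            "customer_id": rng.integers(1, 100, n),
            "experiment_group": rng.choice(["control", "treatment_1", "treatment_2"], n),
            "order_value": np.abs(rng.normal(10, 2, n)),
            "order_city_code": rng.integers(1, 3, n).astype(str),
        }
    )


def test_from_metrics_plan_returns_results(exp_data):
    plan = AnalysisPlan.from_metrics(
        metrics=[SimpleMetric(alias="AOV", name="order_value")],
        variants=[
            Variant("control", is_control=True),
            Variant("treatment_1", is_control=False),
            Variant("treatment_2", is_control=False),
        ],
        variant_col="experiment_group",
        alpha=0.01,
        dimensions=[Dimension(name="order_city_code", values=["1", "2"])],
        analysis_type="clustered_ols",
        analysis_config={"cluster_cols": ["customer_id"]},
    )
    results_df = plan.analyze(exp_data=exp_data).to_dataframe()
    assert isinstance(results_df, pd.DataFrame)
    assert not results_df.empty
This only covers the happy path of from_metrics; the CUPAC flow and the custom_analysis_type_mapper would need their own cases.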
Happy with the interface! Also, feel free to remove Python 3.8 from the GitHub workflow :)