I've noticed there are quite a few hyperparameters whose best values can vary drastically with the problem and dataset. Can anyone provide intuition or tips on which values to choose? In particular, I'm asking about these hyperparameters:
```python
class ElucidatedImagen(nn.Module):
    def __init__(
        self,
        unets,
        *,
        image_sizes, # for cascading ddpm, image size at each stage
        text_encoder_name = DEFAULT_T5_NAME,
        text_embed_dim = None,
        channels = 3,
        cond_drop_prob = 0.1,
        random_crop_sizes = None,
        lowres_sample_noise_level = 0.2, # in the paper, they present a new trick where they noise the lowres conditioning image, and at sample time, fix it to a certain level (0.1 or 0.3) - the unets are also made to be conditioned on this noise level
        per_sample_random_aug_noise_level = False, # unclear, when conditioning on the augmentation noise level, whether each batch element should receive a random aug noise value - turned off due to @marunine's find
        condition_on_text = True,
        auto_normalize_img = True, # whether to take care of normalizing the image from [0, 1] to [-1, 1] and back automatically - you can turn this off if you want to pass in the [-1, 1] ranged image yourself from the dataloader
        dynamic_thresholding = True,
        dynamic_thresholding_percentile = 0.95, # unsure where this value comes from, based on a perusal of the paper
        only_train_unet_number = None,
        lowres_noise_schedule = 'linear',
        num_sample_steps = 32, # number of sampling steps
        sigma_min = 0.002, # min noise level
        sigma_max = 80, # max noise level
        sigma_data = 0.5, # standard deviation of data distribution
        rho = 7, # controls the sampling schedule
        P_mean = -1.2, # mean of log-normal distribution from which noise is drawn for training
        P_std = 1.2, # standard deviation of log-normal distribution from which noise is drawn for training
        S_churn = 80, # parameters for stochastic sampling - depends on dataset, Table 5 in the paper
        S_tmin = 0.05,
        S_tmax = 50,
        S_noise = 1.003,
    ):
```
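Not a full answer, but for the sampler-related knobs it helps to see what they actually parameterize. `num_sample_steps`, `sigma_min`, `sigma_max` and `rho` come straight from Karras et al. 2022 ("Elucidating the Design Space of Diffusion-Based Generative Models"), eq. (5): the schedule interpolates between `sigma_max` and `sigma_min` in `sigma^(1/rho)` space, and `rho = 7` warps it so that most sampling steps are spent at low noise levels. A standalone sketch of that schedule (my own paraphrase of the paper's formula, not the library's code):

```python
import torch

def edm_sample_schedule(num_sample_steps = 32, sigma_min = 0.002, sigma_max = 80., rho = 7.):
    N = num_sample_steps
    inv_rho = 1. / rho
    steps = torch.arange(N, dtype = torch.float32)
    # eq. (5) of Karras et al. 2022: linear interpolation in sigma^(1/rho) space
    sigmas = (sigma_max ** inv_rho + steps / (N - 1) * (sigma_min ** inv_rho - sigma_max ** inv_rho)) ** rho
    # the final step goes to sigma = 0, i.e. the clean image
    return torch.cat([sigmas, torch.zeros(1)])

schedule = edm_sample_schedule()
print(schedule[:3], schedule[-3:])  # starts at sigma_max = 80, decays quickly, lingers near sigma_min
```

Relatedly, `P_mean` and `P_std` define the log-normal distribution of training noise levels, `ln(sigma) ~ N(P_mean, P_std^2)`, and the `S_*` values are the stochastic sampling parameters from Table 5 of the same paper, which the authors tuned per dataset.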
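As for `dynamic_thresholding_percentile`, that comes from the Imagen paper (section 2.3) rather than the elucidated diffusion paper: at each sampling step, `s` is set to that percentile of the absolute pixel values of the predicted `x0`, and if `s > 1`, `x0` is clamped to `[-s, s]` and divided by `s`. A rough sketch of the idea (mine, not the library's exact implementation):

```python
import torch

def dynamic_threshold(x0, percentile = 0.95):
    # per batch element, take the given percentile of absolute pixel values
    s = torch.quantile(x0.flatten(1).abs(), percentile, dim = -1)
    # never shrink the clamp range below the static threshold of 1
    s = s.clamp(min = 1.).view(-1, *((1,) * (x0.ndim - 1)))
    # clamp to [-s, s] and rescale back into [-1, 1]
    return x0.clamp(-s, s) / s

x0 = torch.randn(4, 3, 64, 64) * 2  # fake over-saturated x0 prediction
print(dynamic_threshold(x0).abs().max())  # <= 1
```

With `0.95`, roughly the top 5% of pixel magnitudes get clamped; this mainly matters when sampling with large classifier-free guidance scales, where `x0` predictions overshoot `[-1, 1]` and would otherwise produce over-saturated images.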