Hello,

Outside of the training function, I set:

from transformers import AdamW, get_linear_schedule_with_warmup

# define the hyperparameters for running the train function.
optimizer_ch2 = AdamW(model_ch2.parameters(), lr=lr, correct_bias=True)
scheduler_ch2 = get_linear_schedule_with_warmup(optimizer=optimizer_ch2,
                                                num_warmup_steps=200,
                                                num_training_steps=1000,
                                                last_epoch=-1)
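For reference, this is the learning-rate behaviour I am expecting from that schedule (my own sketch of the multiplier, i.e. linear warmup followed by linear decay, not the actual library source; the base_lr values below are just illustrative):

def expected_lr(step, base_lr, num_warmup_steps=200, num_training_steps=1000):
    # linear warmup from 0 up to base_lr over the first num_warmup_steps updates,
    # then linear decay from base_lr back down to 0 at num_training_steps
    if step < num_warmup_steps:
        return base_lr * step / max(1, num_warmup_steps)
    return base_lr * max(0.0, (num_training_steps - step) / max(1, num_training_steps - num_warmup_steps))

# e.g. with an illustrative base_lr of 5e-5:
# expected_lr(0, 5e-5)    -> 0.0
# expected_lr(100, 5e-5)  -> 2.5e-05  (halfway through warmup)
# expected_lr(200, 5e-5)  -> 5e-05    (warmup finished)
# expected_lr(600, 5e-5)  -> 2.5e-05  (halfway through the decay)

So I expect the value to move a little after every scheduler.step() call.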
And here is my train function:
import torch

def train_lm_head(model, train_iter, optimizer, scheduler, log_interval, pad_index):
    # turn on training mode
    model.train()
    # initialize total_loss to 0
    total_loss = 0
    for batch_index, batch in enumerate(train_iter):
        input_ids = [instance for instance in batch.text]
        ## NOTE: position embeddings are created automatically by GPT2DoubleHeadsModel as (0, 1, ..., N)
        # set the gradients back to 0 (necessary step)
        optimizer.zero_grad()
        # notice that we only pass lm_labels here,
        # as mc_labels are unnecessary for the language-modelling objective.
        lm_labels = [-1] + input_ids[:len(input_ids) - 1]
        lm_labels = torch.tensor([lm_labels], dtype=torch.long)
        input_ids = torch.tensor([input_ids], dtype=torch.long)
        output = model(input_ids, lm_labels=lm_labels)
        # 'loss' here is the cross-entropy of the language-modelling head,
        # computed from 'input_ids' defined above.
        loss = output[0]
        # calculate the gradient of the loss w.r.t. the weights
        loss.backward()
        # clip the norm of the gradients of all parameters.
        # The norm is computed over all gradients together, as if they were
        # concatenated into a single vector, and the gradients (not the weights)
        # are rescaled in place if their total norm exceeds 0.5.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()   # update the weights using the current learning rate
        scheduler.step()   # update the learning rate (linear warmup schedule)
        # update the running total with the loss of this batch
        total_loss = total_loss + loss.item()
        # python format: 's' for strings, 'd' for decimal (base-10) integers, and 'f' for floats.
        # e.g. print("Sammy ate {0:.3f} percent of a pizza!".format(75.765367))
        # >> Sammy ate 75.765 percent of a pizza!
        # print("Sammy ate {0:f} percent of a {1}!".format(75, "pizza"))
        # >> Sammy ate 75.000000 percent of a pizza!
        #
        # Logging per batch below is good enough since we are doing stochastic
        # gradient descent (i.e. 1 batch = 1 sample).
        if batch_index % log_interval == 0 and batch_index > 0:
            # 'epoch' comes from the enclosing training loop
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} |'.format(
                epoch, batch_index, len(train_iter), scheduler.get_lr()[0]))
            total_loss = 0
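The outer loop that calls this function looks roughly like the sketch below (the log_interval and pad_index values are placeholders, and train_iter stands in for my actual data iterator; note this loop is also where 'epoch' gets defined):

for epoch in range(1, 6):  # 5 epochs
    train_lm_head(model_ch2, train_iter, optimizer_ch2, scheduler_ch2,
                  log_interval=100, pad_index=0)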
When I iterate the train function above for 5 epochs, I am getting the following output:

I am a bit concerned about this output because the learning rate does not seem to be changing, although I have specified scheduler.step() in my train function, right underneath optimizer.step(). What am I doing wrong here?

Thank you,
What's in the get_linear_schedule_with_warmup function?
BTW, an issue opened on pytorch/pytorch would be better answered, since you are asking about the scheduler.