Hi there, I'm getting a strange error when trying to use autocasting and gradient caching at the same time, and I was wondering if you had any insight. For context, I'm trying to train an audio-visual CLIP model as follows:
Model:
```python
import torch
from gradcache.functional import cached  # module path per the traceback below
from torch.cuda.amp import autocast      # assuming torch's autocast decorator

@cached
@autocast
def call_audio_model(model, input):
    return model.forward_audio(input)

@cached
@autocast
def call_vision_model(model, input):
    return model.forward_visual(input)
```
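For reference, this is my understanding of the calling convention `@cached` introduces, which is why the training loop below unpacks each call into a representation and a closure:

```python
# Sketch of the @cached calling convention, using names from the loop below.
rx, cx = call_audio_model(avsimclr, audio)  # rx: detached reps that require grad
# ...accumulate reps across sub-batches, compute the loss, call backward()...
cx(rx)  # closure replays the sub-batch forward pass and backprops rx.grad
```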
Training:
```python
import gc

import numpy as np
import torch
from tqdm import tqdm

if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    batch_size = 256
    n_epochs = np.inf  # 200
    batches_per_backward = 8
    save_every_n = 10
    max_gradient_norm = 1.0

    dataset = MMDataset(videos, audio)
    audio_encoder_cfg = {"name": "resnet18", "audio_shape": dataset[0]['audio'].shape, "num_microphones": 4}
    visual_encoder_cfg = {"img_shape": dataset[0]['img'].shape}
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)  # , pin_memory=True, num_workers=10

    avsimclr = AVSimCLR(audio_encoder_cfg=audio_encoder_cfg,
                        visual_encoder_cfg=visual_encoder_cfg,
                        projector_cfg=projector_cfg,
                        loss_cfg=loss_cfg).to(device)
    optimizer = Lars(avsimclr.parameters(), **optimizer_cfg)
    lr_scheduler_cfg["name"] = "ReduceLROnPlateau"
    scaler = torch.cuda.amp.GradScaler()

    cache_x = []
    cache_y = []
    closures_x = []
    closures_y = []
    losses = []
    global_loss = np.inf
    epoch = 1

    while True:
        losses = []
        with tqdm(enumerate(dataloader), desc="step", leave=True, position=0, unit="batch") as batches:
            for step, sub_batch in batches:
                audio = sub_batch['audio'].to(device)
                img = sub_batch['img'].to(device)
                rx, cx = call_audio_model(avsimclr, audio)
                ry, cy = call_vision_model(avsimclr, img)
                cache_x.append(rx)
                cache_y.append(ry)
                closures_x.append(cx)
                closures_y.append(cy)
                if (step + 1) % batches_per_backward == 0:
                    batches.set_description(f"Calculating backwards pass on step {step} of {len(dataloader)}!")
                    loss = cached_loss.cav_loss(cache_x, cache_y)
                    scaler.scale(loss).backward()
                    for f, r in zip(closures_x, cache_x):
                        f(r)
                    for f, r in zip(closures_y, cache_y):
                        f(r)
                    cache_x = []
                    cache_y = []
                    closures_x = []
                    closures_y = []
                    # Unscales the gradients of optimizer's assigned params in-place
                    scaler.unscale_(optimizer)
                    # Since the gradients of optimizer's assigned params are unscaled, clips as usual:
                    torch.nn.utils.clip_grad_norm_(avsimclr.parameters(), max_gradient_norm)
                    # optimizer's gradients are already unscaled, so scaler.step does not unscale them,
                    # although it still skips optimizer.step() if the gradients contain infs or NaNs.
                    scaler.step(optimizer)
                    # Updates the scale for next iteration.
                    scaler.update()
                    optimizer.zero_grad()
                    losses.append(loss.item())
                    batches.set_postfix_str(f"train_loss_step: {loss.item():.3f}")
                else:
                    batches.set_description(f"Epoch {epoch} step")
        train_loss_epoch = np.mean(losses)
        lr_scheduler.step(train_loss_epoch)
        epoch += 1
        torch.cuda.empty_cache()  # Call empty_cache whenever we delete something from GPU
        gc.collect()
```
And the error I receive is:
File "gradcache/functional.py", line 22, in <module>
reps_no_grad = func(*args, **kwargs)
TypeError: __call__() takes 2 positional arguments but 3 were given
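From what I can tell, the failing line sits inside GradCache's `cached` wrapper, which first runs the wrapped function under `torch.no_grad()`. Here is a rough sketch of what I think it does, reconstructed from the traceback and the functional API (the installed version may differ):

```python
def cached(func):
    def wrapper(*args, **kwargs):
        # First pass: compute representations without building a graph.
        with torch.no_grad():
            reps_no_grad = func(*args, **kwargs)  # <- line 22 in the traceback
        leaf_reps = reps_no_grad.detach().requires_grad_()

        def closure(cached_reps):
            # Second pass: replay the forward with grad enabled and backprop
            # the gradients accumulated on the cached representations.
            reps = func(*args, **kwargs)
            surrogate = torch.dot(reps.flatten(), cached_reps.grad.flatten())
            surrogate.backward()

        return leaf_reps, closure
    return wrapper
```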
And I believe the cause is that the `@autocast`-wrapped callable is what gets invoked at the `reps_no_grad` line, because removing `@autocast` fixes it.
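If that's right, the arity in the `TypeError` matches what happens when `torch.cuda.amp.autocast` is applied bare: `@autocast` (no parentheses) replaces the function with an `autocast` *instance*, and that instance's `__call__(self, func)` accepts a single function to decorate, so invoking it as `func(model, input)` inside `cached` supplies three positional arguments. A minimal sketch of the fix I suspect, assuming `autocast` here is `torch.cuda.amp.autocast`:

```python
from torch.cuda.amp import autocast
from gradcache.functional import cached  # module path per the traceback above

@cached
@autocast()  # instantiate first; the instance then decorates the function
def call_audio_model(model, input):
    return model.forward_audio(input)
```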