File "[...]/utils.py", line 835, in train_fn
scaler.step(optimizer)
File "/home/jovyan/.imgenv-batyr-hpo-0/lib/python3.7/site-packages/torch/cuda/amp/grad_scaler.py", line 338, in step
retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)
File "/home/jovyan/.imgenv-batyr-hpo-0/lib/python3.7/site-packages/torch/cuda/amp/grad_scaler.py", line 285, in _maybe_opt_step
retval = optimizer.step(*args, **kwargs)
File "/home/jovyan/.imgenv-batyr-hpo-0/lib/python3.7/site-packages/torch/optim/lr_scheduler.py", line 65, in wrapper
return wrapped(*args, **kwargs)
File "/home/jovyan/.imgenv-batyr-hpo-0/lib/python3.7/site-packages/torch/optim/optimizer.py", line 88, in wrapper
return func(*args, **kwargs)
File "/home/user/conda/lib/python3.7/site-packages/torch_optimizer/adafactor.py", line 192, in step
exp_avg_sq_row, exp_avg_sq_col, update
File "/home/user/conda/lib/python3.7/site-packages/torch_optimizer/adafactor.py", line 116, in _approx_sq_grad
(exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1))
RuntimeError: The size of tensor a (7) must match the size of tensor b (3) at non-singleton dimension 2
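Something's wrong with Adafactor: `optimizer.step()` fails inside torch_optimizer's `_approx_sq_grad` (adafactor.py, line 116) with a shape mismatch (7 vs. 3 at dimension 2) while dividing `exp_avg_sq_row` by `exp_avg_sq_row.mean(dim=-1)`. The mean removes the last dimension, so the two operands presumably no longer broadcast when the parameter has more than two dimensions (e.g. a convolution weight) — to be confirmed against the shapes of the model's parameters.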