Training is not done on GPU/CUDA

Describe the bug When choosing device_name="cuda", the training is not done on GPU

What is the current behavior? Training is carried out on CPU

If the current behavior is a bug, please provide the steps to reproduce.

beta = 0.5
u = 1/3

reward_matrix = np.array([
    [0.041, 0, -0.041],
    [0, 0.0041, 0],
    [-0.041, 0, 0.041]
])

reward_tensor = torch.as_tensor(reward_matrix).to("cuda")

def neg_reward(y_pred, y_true):
  y_t = y_true.type(torch.int32)
  p = torch.nn.Softmax(dim=1)(y_pred)
  r_selected = torch.index_select(reward_tensor, dim=1, index=y_t)
  r = torch.transpose(r_selected, dim0=0, dim1=1)
  reward = torch.sum(torch.multiply(r, torch.log(p/(1-p+1e-20))), dim=1)
  kld = torch.sum(p*torch.log(p/u), dim=1)
  losses = -reward+beta*kld
  return torch.mean(losses)

def neg_reward_regularized(y_pred, y_true):
  pass

class Reward(Metric):
  def __init__(self):
    self._name = "reward"
    self._maximize = True

  def __call__(self, y_true, y_score):
    #p = softmax(y_score, axis=1)
    reward_matrix = np.array([
        [0.041, 0, -0.041],
        [0, 0.0041, 0],
        [-0.041, 0, 0.041]
    ])
    rewards = np.sum(np.transpose(reward_matrix[:,y_true])*np.log(y_score/(1-y_score+1e-20)), axis=1)
    #kld = np.sum(y_score*np.log(y_score/u), axis=1)
    #losses = rewards-beta*kld
    return np.mean(rewards)

X_num = np.random.random((2000, 171))
X_cat = np.random.choice([0,1], size=2000).reshape((-1, 1))
X = np.concatenate([X_num, X_cat], axis=1)
y = np.random.choice([0,1,2], size=2000)

cat_idxs = [171]
cat_dims = [2]

model = TabNetClassifier(
              device_name="cuda",
              cat_idxs=cat_idxs,
              cat_dims=cat_dims,
              cat_emb_dim=1,
              optimizer_fn=torch.optim.Adam, # Any optimizer works here
              optimizer_params=dict(lr=2e-2),
              scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
              scheduler_params={"is_batch_level":True,
                                "max_lr":5e-2,
                                "steps_per_epoch":int(train_set.shape[0] / batch_size)+1,
                                "epochs":10_000
                                },
              mask_type='entmax', # "sparsemax",
              lambda_sparse=0
          )

model.fit(
    X, y,
    eval_set=[
        (X, y)],
    eval_name=["train_set"],
    max_epochs=100,
    patience=10,
    loss_fn=neg_reward,
    eval_metric=[Reward],
    batch_size=16
)

Expected behavior When choosing CUDA, I expect the GPU RAM to be used instead of the CPU RAM. Also when a custom loss and eval metric is used.

Screenshots During training:

Specs:

dreamquark-ai / tabnet

Training is not done on GPU/CUDA #484