microsoft / CodeXGLUE

Fine-tuning CONCODE error: "RuntimeError: Could not infer dtype of NoneType" #92

Closed · songyang-dev closed this issue 2 months ago

songyang-dev commented 2 years ago

I'm trying to fine-tune CONCODE according to the instructions for text-code. I am using the default dataset and following the exact same steps. At the fine-tuning stage, I get an error: `RuntimeError: Could not infer dtype of NoneType`.

Here is the error log on the console.

11/21/2021 00:31:38 - INFO - __main__ -   GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50259,
  "do_sample": false,
  "embd_pdrop": 0.1,
  "eos_token_id": 50258,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_epsilon": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "num_beams": 1,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 50257,
  "pruned_heads": {},
  "repetition_penalty": 1.0,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "torchscript": false,
  "use_bfloat16": false,
  "vocab_size": 50260
}

11/21/2021 00:31:38 - INFO - __main__ -   Model has a total of 124442112 trainable parameters
11/21/2021 00:31:40 - INFO - __main__ -   Training/evaluation parameters Namespace(adam_epsilon=1e-08, block_size=512, cache_dir='', config_dir=None, data_dir='../dataset/concode', device=device(type='cuda', index=0), do_eval=False, do_infer=False, do_lower_case=False, do_train=True, eval_all_checkpoints=False, evaluate_during_training=True, fp16=False, fp16_opt_level='O1', gpu_per_node=1, gradient_accumulation_steps=2, langs='java', learning_rate=5e-05, load_name='pretrained', local_rank=0, log_file='text2code_concode.log', logging_steps=100, max_grad_norm=1.0, max_steps=-1, mlm=False, mlm_probability=0.15, model_type='gpt2', n_gpu=1, no_cuda=False, node_index=0, num_train_epochs=30.0, output_dir='../save/concode', overwrite_cache=False, overwrite_output_dir=True, per_gpu_eval_batch_size=12, per_gpu_train_batch_size=6, pretrain_dir='microsoft/CodeGPT-small-java-adaptedGPT2', save_steps=5000, save_total_limit=None, seed=42, server_ip='', server_port='', start_epoch=0, start_step=0, tensorboard_dir=None, tokenizer_dir=None, warmup_steps=0, weight_decay=0.01)
11/21/2021 00:31:40 - WARNING - __main__ -   Creating features from dataset file at ../dataset/concode/train.json
11/21/2021 00:31:42 - INFO - __main__ -   Data size: 100000
11/21/2021 00:31:42 - WARNING - __main__ -   Rank 0, load 0
11/21/2021 00:32:05 - WARNING - __main__ -   Rank 0, load 10
11/21/2021 00:32:25 - WARNING - __main__ -   Rank 0, load 20
11/21/2021 00:32:42 - WARNING - __main__ -   Rank 0, load 30
11/21/2021 00:32:58 - WARNING - __main__ -   Rank 0, load 40
11/21/2021 00:33:13 - WARNING - __main__ -   Rank 0, load 50
11/21/2021 00:33:28 - WARNING - __main__ -   Rank 0, load 60
11/21/2021 00:33:43 - WARNING - __main__ -   Rank 0, load 70
11/21/2021 00:33:58 - WARNING - __main__ -   Rank 0, load 80
11/21/2021 00:34:12 - WARNING - __main__ -   Rank 0, load 90
11/21/2021 00:34:25 - WARNING - __main__ -   Rank 0 Training 100000 token, 100000 samples
11/21/2021 00:34:25 - WARNING - __main__ -   Saving features into cached file ../save/concode/train_blocksize_512_wordsize_1_rank_0
11/21/2021 00:34:29 - INFO - __main__ -   ***** Running training *****
11/21/2021 00:34:29 - INFO - __main__ -     Num examples = 100000
11/21/2021 00:34:29 - INFO - __main__ -     Num epoch = 29
11/21/2021 00:34:29 - INFO - __main__ -     Instantaneous batch size per GPU = 6
11/21/2021 00:34:29 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 12
11/21/2021 00:34:29 - INFO - __main__ -     Gradient Accumulation steps = 2
11/21/2021 00:34:29 - INFO - __main__ -     Total optimization steps = 249990
Traceback (most recent call last):
  File "run.py", line 653, in <module>
    main()
  File "run.py", line 640, in main
    global_step, tr_loss = train(args, train_dataset, model, tokenizer, fh, pool)
  File "run.py", line 165, in train
    for step, (batch, token_labels) in enumerate(train_dataloader):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 345, in __next__
    data = self._next_data()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 385, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/root/CodeXGLUE/Text-Code/text-to-code/code/dataset.py", line 116, in __getitem__
    return torch.tensor(self.inputs[item]), torch.tensor(self.token_labels[item])
RuntimeError: Could not infer dtype of NoneType
Traceback (most recent call last):
  File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.7/dist-packages/torch/distributed/launch.py", line 263, in <module>
    main()
  File "/usr/local/lib/python3.7/dist-packages/torch/distributed/launch.py", line 259, in main
    cmd=cmd)
subprocess.CalledProcessError: Command '['/usr/bin/python3', '-u', 'run.py', '--local_rank=0', '--data_dir=../dataset/concode', '--langs=java', '--output_dir=../save/concode', '--pretrain_dir=microsoft/CodeGPT-small-java-adaptedGPT2', '--log_file=text2code_concode.log', '--model_type=gpt2', '--block_size=512', '--do_train', '--node_index', '0', '--gpu_per_node', '1', '--learning_rate=5e-5', '--weight_decay=0.01', '--evaluate_during_training', '--per_gpu_train_batch_size=6', '--per_gpu_eval_batch_size=12', '--gradient_accumulation_steps=2', '--num_train_epochs=30', '--logging_steps=100', '--save_steps=5000', '--overwrite_output_dir', '--seed=42']' returned non-zero exit status 1
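
The failure is raised inside `concodeDataset.__getitem__`, so at least one entry of `self.inputs` or `self.token_labels` is already `None` by the time it reaches `torch.tensor`. The message itself is just what `torch.tensor` raises for a `None` input; a minimal reproduction of the error type (not code from the repo):

```python
import torch

# torch.tensor cannot infer a dtype for None, which is exactly the message above.
torch.tensor(None)
# RuntimeError: Could not infer dtype of NoneType
```

So the problem is in how the cached features are built (tokenization), not in the training loop itself.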
celbree commented 2 years ago

This is the same error as in #72. You can try updating transformers from 2.5.0 to 4.0.0.
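
A quick way to confirm which version is active before re-running (a minimal sketch; the two version strings are just the ones mentioned above):

```python
# Check the installed transformers version; per the comment above, the NoneType
# error reproduces on 2.5.0 and should go away after upgrading to 4.0.0.
import transformers
print(transformers.__version__)
```

After upgrading with `pip install transformers==4.0.0`, it may also be necessary to rebuild the cached feature file saved earlier (`train_blocksize_512_wordsize_1_rank_0`), for example by deleting it or passing `--overwrite_cache`, since it was written with the old tokenizer; this is an assumption based on the log above, not something stated in the thread.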

songyang-dev commented 2 years ago

I tried your approach and I got a new error.

Traceback (most recent call last):
  File "CodeXGLUE/Text-Code/text-to-code/code/run.py", line 653, in <module>
    main()
  File "CodeXGLUE/Text-Code/text-to-code/code/run.py", line 640, in main
    global_step, tr_loss = train(args, train_dataset, model, tokenizer, fh, pool)
  File "CodeXGLUE/Text-Code/text-to-code/code/run.py", line 172, in train
    outputs = model(inputs, attention_mask=attn_mask)
  File "/u/yangsong/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/u/yangsong/.local/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 447, in forward
    output = self.module(*inputs[0], **kwargs[0])
  File "/u/yangsong/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/u/yangsong/.local/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 760, in forward
    transformer_outputs = self.transformer(
  File "/u/yangsong/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/u/yangsong/.local/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 641, in forward
    outputs = block(
  File "/u/yangsong/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/u/yangsong/.local/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 288, in forward
    attn_outputs = self.attn(
  File "/u/yangsong/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/u/yangsong/.local/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 223, in forward
    query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
  File "/u/yangsong/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/u/yangsong/.local/lib/python3.8/site-packages/transformers/modeling_utils.py", line 1119, in forward
    x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`
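
A `CUBLAS_STATUS_EXECUTION_FAILED` raised from `torch.addmm` is often just where an earlier asynchronous CUDA failure finally surfaces, so the line in the traceback may not be the real culprit. A general debugging step (not something suggested in this thread) is to force synchronous kernel launches so the traceback points at the operation that actually failed:

```python
# General CUDA-debugging sketch, not code from the repo: synchronous launches make
# the Python traceback point at the kernel that actually failed instead of a later
# cuBLAS call. The variable must be in the environment before CUDA is initialised,
# so set it before importing torch (or export it in the shell that launches run.py).
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch
```

If the underlying failure turns out to be an out-of-range token id (the config above extends the GPT-2 vocabulary to 50260, so an embedding/vocab-size mismatch is a plausible suspect, though only an assumption here), running a single batch on CPU usually produces a much more readable index error.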