salesforce / CodeT5

Home of CodeT5: Open Code LLMs for Code Understanding and Generation
https://arxiv.org/abs/2305.07922
BSD 3-Clause "New" or "Revised" License

TypeError: object of type 'NoneType' has no len() #171

Open SerenaDeAntoni opened 1 month ago

SerenaDeAntoni commented 1 month ago

Hi! I'm trying to fine-tune the CodeT5p-770m model for text-to-Java code generation. I'm working in Google Colab: I copied the tune_codet5p_seq2seq.py file into a script.py file and modified the script as follows:

import os
import pprint
import argparse
from datasets import load_dataset, load_from_disk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import DatasetDict
def run_training(args, model, train_data):
    print(f"Starting main loop")
    print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
    print(args)
    print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
    training_args = TrainingArguments(
        report_to='tensorboard',
        output_dir=args.save_dir,
        overwrite_output_dir=False,

        do_train=True,
        save_strategy='epoch',

        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size_per_replica,
        gradient_accumulation_steps=args.grad_acc_steps,

        learning_rate=args.lr,
        weight_decay=0.05,
        warmup_steps=args.lr_warmup_steps,

        logging_dir=args.save_dir,
        logging_first_step=True,
        logging_steps=args.log_freq,
        save_total_limit=1,

        dataloader_drop_last=True,
        dataloader_num_workers=4,

        local_rank=args.local_rank,
        deepspeed=args.deepspeed,
        fp16=args.fp16,
    )

    print('££££££££££££££££££££££££££££££££')
    print(training_args)
    print('££££££££££££££££££££££££££££££££') 

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
    )

    trainer.train()

    if args.local_rank in [0, -1]:
        final_checkpoint_dir = os.path.join(args.save_dir, "final_checkpoint")
        model.save_pretrained(final_checkpoint_dir)
        print(f'  ==> Finish training and save to {final_checkpoint_dir}')

def load_tokenize_data(args):
    # Load and tokenize data
    if os.path.exists(args.cache_data):
        train_data = load_from_disk(args.cache_data)
        print(f'  ==> Loaded {len(train_data)} samples')
        return train_data
    else:
        # Load my own dataset from Google Drive (instead of the original code_x_glue_ct_code_to_text example)
        datasets = load_from_disk('/content/drive/MyDrive/prova/Dataset')
        tokenizer = AutoTokenizer.from_pretrained(args.load)

        def preprocess_function(examples):
            source = [' '.join(ex) for ex in examples["code_tokens"]]
            target = [' '.join(ex) for ex in examples["docstring_tokens"]]

            model_inputs = tokenizer(source, max_length=args.max_source_len, padding="max_length", truncation=True)
            labels = tokenizer(target, max_length=args.max_target_len, padding="max_length", truncation=True)

            model_inputs["labels"] = labels["input_ids"].copy()
            model_inputs["labels"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in model_inputs["labels"]
            ]
            return model_inputs

        train_data = datasets.map(
            preprocess_function,
            batched=True,
            remove_columns=datasets.column_names,
            num_proc=64,
            load_from_cache_file=False,
        )
        print(f'  ==> Loaded {len(train_data)} samples')
        train_data.save_to_disk(args.cache_data)
        print(f'  ==> Saved to {args.cache_data}')
        return train_data

def main(args={}):
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%')
    print(args)
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%')
    argsdict = vars(args)
    print('^^^^^^^^^^^^^^^^^^^^^^^^^^')
    print(pprint.pformat(argsdict))
    print('^^^^^^^^^^^^^^^^^^^^^^^^^^')
    # Save command to file
    with open(os.path.join(args.save_dir, "command.txt"), 'w') as f:
        f.write(pprint.pformat(argsdict))

    # Load and tokenize data using the tokenizer from `args.load`. If the data is already cached, load it from there.
    # You can customize this function to load your own data for any Seq2Seq LM tasks.
    train_data = load_tokenize_data(args)

    if args.data_num != -1:
        train_data = train_data.select([i for i in range(args.data_num)])

    # Load model from `args.load`
    model = AutoModelForSeq2SeqLM.from_pretrained(args.load)
    print(f"  ==> Loaded model from {args.load}, model size {model.num_parameters()}")

    print('@@@@@@@@@@@@@@@@@@@@@@@')
    print(args)
    print('@@@@@@@@@@@@@@@@@@@@@@@')
    run_training(args, model, train_data)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="CodeT5+ finetuning on Text-2-Code LM task")
    parser.add_argument('--data-num', default=-1, type=int)
    parser.add_argument('--max-source-len', default=320, type=int)
    parser.add_argument('--max-target-len', default=128, type=int)
    parser.add_argument('--cache-data', default= '/content/drive/MyDrive/prova/Dataset', type=str)
    parser.add_argument('--load', default='Salesforce/codet5p-770m', type=str)

    # Training
    parser.add_argument('--epochs', default=10, type=int)
    parser.add_argument('--lr', default=5e-5, type=float)
    parser.add_argument('--lr-warmup-steps', default=200, type=int)
    parser.add_argument('--batch-size-per-replica', default=8, type=int)
    parser.add_argument('--grad-acc-steps', default=4, type=int)
    parser.add_argument('--local_rank', default=-1, type=int)
    parser.add_argument('--deepspeed', default=None, type=str)
    parser.add_argument('--fp16', default=False, action='store_true')

    # Logging and stuff
    parser.add_argument('--save-dir', default="saved_models/prova1", type=str)
    parser.add_argument('--log-freq', default=10, type=int)
    parser.add_argument('--save-freq', default=500, type=int)

    args = parser.parse_args()
    print('**************************')
    print(args)
    print('**************************')
    os.makedirs(args.save_dir, exist_ok=True)
    print('#################################')
    print(args)
    print('#################################')
    main(args)

I added all the prints for debugging my code :) BUT when I run '!python3 script.py' on Colab I receive:

2024-05-21 12:07:32.240561: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-21 12:07:32.240620: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-21 12:07:32.242264: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-21 12:07:33.539100: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
**************************
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
**************************
#################################
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
#################################
%%%%%%%%%%%%%%%%%%%%%%%%%%
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
%%%%%%%%%%%%%%%%%%%%%%%%%%
^^^^^^^^^^^^^^^^^^^^^^^^^^
{'batch_size_per_replica': 8,
 'cache_data': '/content/drive/MyDrive/prova/Dataset',
 'data_num': -1,
 'deepspeed': None,
 'epochs': 10,
 'fp16': False,
 'grad_acc_steps': 4,
 'load': 'Salesforce/codet5p-770m',
 'local_rank': -1,
 'log_freq': 10,
 'lr': 5e-05,
 'lr_warmup_steps': 200,
 'max_source_len': 320,
 'max_target_len': 128,
 'save_dir': 'saved_models/prova1',
 'save_freq': 500}
^^^^^^^^^^^^^^^^^^^^^^^^^^
  ==> Loaded 3 samples
  ==> Loaded model from Salesforce/codet5p-770m, model size 737639424
@@@@@@@@@@@@@@@@@@@@@@@
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
@@@@@@@@@@@@@@@@@@@@@@@
Starting main loop
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
££££££££££££££££££££££££££££££££
TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=True,
dataloader_num_workers=4,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
eval_strategy=no,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=4,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=saved_models/prova1,
logging_first_step=True,
logging_nan_inf_filter=True,
logging_steps=10,
logging_strategy=steps,
lr_scheduler_kwargs={},
lr_scheduler_type=linear,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=10,
optim=adamw_torch,
optim_args=None,
optim_target_modules=None,
output_dir=saved_models/prova1,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=8,
per_device_train_batch_size=8,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=True,
report_to=['tensorboard'],
restore_callback_states_from_checkpoint=False,
resume_from_checkpoint=None,
run_name=saved_models/prova1,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=500,
save_strategy=epoch,
save_total_limit=1,
seed=42,
skip_memory_metrics=True,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
warmup_ratio=0.0,
warmup_steps=200,
weight_decay=0.05,
)
££££££££££££££££££££££££££££££££
/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:558: UserWarning: This DataLoader will create 4 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
  warnings.warn(_create_warning_msg(
  0% 0/10 [00:00<?, ?it/s]/usr/lib/python3.10/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.
  self.pid = os.fork()
Traceback (most recent call last):
  File "/content/drive/MyDrive/prova/script.py", line 168, in <module>
    main(args)
  File "/content/drive/MyDrive/prova/script.py", line 135, in main
    run_training(args, model, train_data)
  File "/content/drive/MyDrive/prova/script.py", line 61, in run_training
    trainer.train()
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1885, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2216, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3231, in training_step
    inputs = self._prepare_inputs(inputs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3184, in _prepare_inputs
    if len(inputs) == 0:
TypeError: object of type 'NoneType' has no len()
  0% 0/10 [00:00<?, ?it/s]

Can someone help me?
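
For what it's worth, the '==> Loaded 3 samples' line looks suspicious to me: since --cache-data defaults to the same path as my dataset, load_tokenize_data() takes the "already cached" branch, and my guess (not verified) is that load_from_disk() returns a DatasetDict there, whose len() is the number of splits rather than the number of samples, and that handing a DatasetDict to the Trainer is what breaks the dataloader. A minimal check I could run in Colab, assuming the saved data has named splits (the 'train' split name below is a placeholder, to be replaced with the real one):

from datasets import load_from_disk, DatasetDict

data = load_from_disk('/content/drive/MyDrive/prova/Dataset')
print(type(data))  # Dataset or DatasetDict?

if isinstance(data, DatasetDict):
    # len(DatasetDict) counts splits, not samples -- this would explain "Loaded 3 samples"
    print(list(data.keys()))
    train_split = data['train']  # 'train' is a guess; replace with the actual split name
    print(len(train_split), train_split.column_names)
else:
    print(len(data), data.column_names)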