xiaofengstudent opened this issue 12 months ago
Hi xiaofengstudent,
Please download the CheXbert checkpoint from https://github.com/stanfordmlgroup/CheXbert for the CE metrics.
Place the checkpoint at checkpoints/stanford/chexbert/chexbert.pth.
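To double-check the placement before launching training, something like this should find it (a minimal sketch, assuming it is run from the repository root, since the path is relative):

```python
import os

# Expected location of the CheXbert checkpoint, relative to the working
# directory ('checkpoints' is the ckpt_zoo_dir in the run configuration).
ckpt = os.path.join("checkpoints", "stanford", "chexbert", "chexbert.pth")

if not os.path.isfile(ckpt):
    raise FileNotFoundError(f"CheXbert checkpoint not found at {ckpt}")
print(f"Found CheXbert checkpoint ({os.path.getsize(ckpt) / 1e6:.1f} MB)")
```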
Thanks, Aaron
Thank you!
Hi, I ran into this issue; the full log is below. Thank you.
```
(venv) root@autodl-container-8a50119a52-f09cc96a:~/autodl-tmp/cvt2distilgpt2# dlhpcstarter -t iu_x-ray -c config/train_iu_x_ray_chen_cvt2distilgpt2.yaml --stages_module stages --train --test
args: {'task': 'iu_x-ray', 'config': 'config/train_iu_x_ray_chen_cvt2distilgpt2', 'exp_dir': 'experiments', 'work_dir': '/root/autodl-tmp/cvt2distilgpt2', 'dataset_dir': 'datasets', 'ckpt_zoo_dir': 'checkpoints', 'definition': 'CvT2DistilGPT2IUXRayChen', 'module': 'cvt2distilgpt2_iu_x_ray_chen', 'stages_definition': 'stages', 'stages_module': 'stages', 'train': True, 'trial': 0, 'resume_last': True, 'resume_epoch': None, 'resume_ckpt_path': None, 'warm_start_ckpt_path': None, 'monitor': 'val_chen_cider', 'monitor_mode': 'max', 'test': True, 'test_epoch': None, 'test_ckpt_path': None, 'fast_dev_run': None, 'num_workers': 5, 'devices': 1, 'num_nodes': 1, 'memory': None, 'time_limit': None, 'submit': None, 'qos': None, 'begin': None, 'slurm_cmd_path': None, 'email': None, 'cuda_visible_devices': None, 'venv_path': None, 'config_file_name': 'config/train_iu_x_ray_chen_cvt2distilgpt2.yaml', 'config_name': 'train_iu_x_ray_chen_cvt2distilgpt2', 'config_dir': '/root/autodl-tmp/cvt2distilgpt2/config', 'config_full_path': '/root/autodl-tmp/cvt2distilgpt2/config/train_iu_x_ray_chen_cvt2distilgpt2.yaml', 'strategy': 'ddp_find_unused_parameters_true', 'encoder_lr': 5e-05, 'decoder_lr': 0.0005, 'mbatch_size': 4, 'every_n_epochs': 1, 'precision': 16, 'decoder_max_len': 128, 'num_test_beams': 4, 'enable_progress_bar': True, 'weights_summary': 'full', 'early_stopping': True, 'patience': 10, 'min_delta': 0.0001, 'deterministic': False, 'exp_dir_trial': 'experiments/iu_x-ray/train_iu_x_ray_chen_cvt2distilgpt2/trial_0'}
Seed set to 0
/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/fabric/connector.py:565: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
=> merge config from tools/ext/cvt/experiments/imagenet/cvt/cvt-21-384x384.yaml
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at checkpoints/distilgpt2 and are newly initialized: ['transformer.h.2.crossattention.q_attn.weight', 'transformer.h.5.crossattention.c_proj.bias', 'transformer.h.0.crossattention.masked_bias', 'transformer.h.4.crossattention.masked_bias', 'transformer.h.2.crossattention.masked_bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.3.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.3.crossattention.c_proj.weight', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.2.crossattention.c_proj.bias', 'transformer.h.2.crossattention.bias', 'transformer.h.5.crossattention.q_attn.weight', 'transformer.h.3.crossattention.bias', 'transformer.h.2.crossattention.c_proj.weight', 'transformer.h.5.ln_cross_attn.weight', 'transformer.h.4.ln_cross_attn.weight', 'transformer.h.0.crossattention.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.3.crossattention.q_attn.weight', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.5.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.2.crossattention.c_attn.weight', 'transformer.h.3.crossattention.masked_bias', 'transformer.h.4.crossattention.q_attn.weight', 'transformer.h.1.crossattention.bias', 'transformer.h.4.crossattention.c_attn.weight', 'transformer.h.1.crossattention.masked_bias', 'transformer.h.3.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.4.crossattention.c_proj.bias', 'transformer.h.3.crossattention.c_attn.weight', 'transformer.h.4.crossattention.c_proj.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.5.crossattention.c_attn.weight', 'transformer.h.2.ln_cross_attn.weight', 'transformer.h.5.crossattention.bias', 'transformer.h.5.crossattention.masked_bias', 'transformer.h.4.crossattention.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Description, Special token, Index
bos_token, [BOS], 50257
eos_token, <|endoftext|>, 50256
unk_token, <|endoftext|>, 50256
pad_token, [PAD], 50258
Config of the encoder: <class 'cvt2distilgpt2_iu_x_ray_chen.CvT2DistilGPT2IUXRayChen.__init__.
/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/dlhpcstarter/utils.py:260: UserWarning: last.ckpt does not exist, starting training from epoch 0.
  warnings.warn('last.ckpt does not exist, starting training from epoch 0.')
/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[rank: 0] Seed set to 0
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/1
distributed_backend=nccl
All distributed processes registered. Starting with 1 processes
You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: experiments/iu_x-ray/train_iu_x_ray_chen_cvt2distilgpt2/trial_0/lightning_logs
/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory experiments/iu_x-ray/train_iu_x_ray_chen_cvt2distilgpt2/trial_0 exists and is not empty.
Training set #images: 4138, #studies: 2069
Validation set #images: 592, #studies: 296
Test set #images: 1180, #studies: 590
No. of training & validation examples: 2069 & 296.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                  | Type                  | Params
-----------------------------------------------------------------
0  | val_coco_metrics      | COCOCaptionMetrics    | 0
1  | test_coco_metrics     | COCOCaptionMetrics    | 0
2  | val_chexbert_metrics  | CheXbertMetrics       | 0
3  | test_chexbert_metrics | CheXbertMetrics       | 0
4  | val_report_logger     | ReportLogger          | 0
5  | test_report_logger    | ReportLogger          | 0
6  | encoder               | CvT                   | 31.6 M
7  | encoder_projection    | EncoderPermuteProject | 294 K
8  | multi_input           | MultiImageInput       | 0
9  | multi_output          | MultiImageOutput      | 0
10 | decoder               | Decoder               | 96.1 M
-----------------------------------------------------------------
128 M     Trainable params
0         Non-trainable params
128 M     Total params
512.061   Total estimated model params size (MB)
Epoch 0:   0%|          | 0/518 [00:00<?, ?it/s]
/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/torch/autograd/__init__.py:251: UserWarning: Grad strides do not match bucket view strides. This may indicate grad was not created according to the gradient layout contract, or that the param's strides changed since DDP was constructed. This is not an error, but may impair performance. grad.sizes() = [384, 1, 3, 3], strides() = [9, 1, 3, 1] bucket_view.sizes() = [384, 1, 3, 3], strides() = [9, 9, 3, 1] (Triggered internally at ../torch/csrc/distributed/c10d/reducer.cpp:320.)
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
Epoch 0: 100%|██████████| 518/518 [02:58<00:00, 2.91it/s, v_num=0_1]
100%|██████████| 74/74 [00:49<00:00, 1.48it/s]
Traceback (most recent call last):
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/trainer/call.py", line 43, in _call_and_handle_interrupt
    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 102, in launch
    return function(*args, **kwargs)
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 580, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 989, in _run
    results = self._run_stage()
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1035, in _run_stage
    self.fit_loop.run()
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/loops/fit_loop.py", line 202, in run
    self.advance()
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/loops/fit_loop.py", line 359, in advance
    self.epoch_loop.run(self._data_fetcher)
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 137, in run
    self.on_advance_end(data_fetcher)
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 285, in on_advance_end
    self.val_loop.run()
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/loops/utilities.py", line 182, in _decorator
    return loop_run(self, *args, **kwargs)
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/loops/evaluation_loop.py", line 141, in run
    return self.on_run_end()
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/loops/evaluation_loop.py", line 253, in on_run_end
    self._on_evaluation_epoch_end()
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/loops/evaluation_loop.py", line 329, in _on_evaluation_epoch_end
    call._call_lightning_module_hook(trainer, hook_name)
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/trainer/call.py", line 157, in _call_lightning_module_hook
    output = fn(*args, **kwargs)
  File "/root/autodl-tmp/cvt2distilgpt2/cvt2distilgpt2_mimic_cxr_chen.py", line 456, in on_validation_epoch_end
    output = self.val_chexbert_metrics.compute()
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/torchmetrics/metric.py", line 607, in wrapped_func
    value = _squeeze_if_scalar(compute(*args, **kwargs))
  File "/root/autodl-tmp/cvt2distilgpt2/tools/metrics/chexbert.py", line 65, in compute
    chexbert = CheXbert(
  File "/root/autodl-tmp/cvt2distilgpt2/tools/chexbert.py", line 32, in __init__
    state_dict = torch.load(os.path.join(ckpt_dir, checkpoint_path), map_location=device)['model_state_dict']
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/torch/serialization.py", line 986, in load
    with _open_file_like(f, 'rb') as opened_file:
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/torch/serialization.py", line 435, in _open_file_like
    return _open_file(name_or_buffer, mode)
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/torch/serialization.py", line 416, in __init__
    super().__init__(open(name, mode))
FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints/stanford/chexbert/chexbert.pth'
```
```
During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/root/autodl-tmp/cvt2distilgpt2/venv/bin/dlhpcstarter", line 8, in <module>
    sys.exit(main())
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/dlhpcstarter/__main__.py", line 54, in main
    submit(args=args, stages_fnc=stages_fnc)
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/dlhpcstarter/__main__.py", line 69, in submit
    stages_fnc(args)
  File "/root/autodl-tmp/cvt2distilgpt2/stages.py", line 72, in stages
    trainer.fit(model, ckpt_path=ckpt_path)
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 544, in fit
    call._call_and_handle_interrupt(
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/trainer/call.py", line 68, in _call_and_handle_interrupt
    trainer._teardown()
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1012, in _teardown
    self.strategy.teardown()
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/strategies/ddp.py", line 405, in teardown
    super().teardown()
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/strategies/parallel.py", line 127, in teardown
    super().teardown()
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/pytorch/strategies/strategy.py", line 528, in teardown
    self.lightning_module.cpu()
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/fabric/utilities/device_dtype_mixin.py", line 78, in cpu
    self.__update_properties(device=torch.device("cpu"))
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/lightning/fabric/utilities/device_dtype_mixin.py", line 112, in __update_properties
    self.apply(apply_fn)
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 896, in apply
    for module in self.children():
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 2284, in children
    for name, module in self.named_children():
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 2304, in named_children
    if module is not None and module not in memo:
  File "/root/autodl-tmp/cvt2distilgpt2/venv/lib/python3.8/site-packages/torchmetrics/metric.py", line 918, in __hash__
    return hash(tuple(hash_vals))
TypeError: unhashable type: 'list'
Exception ignored in: <function tqdm.__del__ at 0x7f6ed2e93a60>
Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/tqdm/std.py", line 1152, in __del__
  File "/root/miniconda3/lib/python3.8/site-packages/tqdm/std.py", line 1306, in close
  File "/root/miniconda3/lib/python3.8/site-packages/tqdm/std.py", line 1499, in display
  File "/root/miniconda3/lib/python3.8/site-packages/tqdm/std.py", line 1155, in __str__
  File "/root/miniconda3/lib/python3.8/site-packages/tqdm/std.py", line 1457, in format_dict
TypeError: cannot unpack non-iterable NoneType object
```
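This second traceback is only a side effect: during teardown, Lightning moves the module to CPU and iterates its children, and the `module not in memo` check calls `Metric.__hash__` on the torchmetrics objects. As the last frame shows, that hash is built as `hash(tuple(hash_vals))` over the metric's state values, so a metric whose list state contains nested lists is unhashable. A minimal sketch of that failure mode, using a hypothetical `ListStateMetric` and assuming a torchmetrics version with this `__hash__` behaviour:

```python
from torchmetrics import Metric

class ListStateMetric(Metric):
    """Hypothetical metric that accumulates batches of strings in a list state."""

    def __init__(self):
        super().__init__()
        self.add_state("reports", default=[], dist_reduce_fx=None)

    def update(self, batch_reports):
        # Appending a list nests a list inside the list state.
        self.reports.append(batch_reports)

    def compute(self):
        return sum(self.reports, [])

m = ListStateMetric()
m.update(["report one", "report two"])

# Metric.__hash__ collects the state values into a tuple and hashes it;
# the nested list makes that tuple unhashable on affected versions:
hash(m)  # TypeError: unhashable type: 'list'
```

So the `TypeError` merely masks the real problem; fixing the missing CheXbert checkpoint first should let validation complete normally.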