When running the training script, I get the following stack trace:
Traceback (most recent call last):
  File "/gpfs/alpine/csc499/scratch/efennell/gpt-neox/train.py", line 27, in <module>
    pretrain(neox_args=neox_args)
  File "/gpfs/alpine/csc499/scratch/efennell/gpt-neox/megatron/training.py", line 193, in pretrain
    model, optimizer, lr_scheduler = setup_model_and_optimizer(
  File "/gpfs/alpine/csc499/scratch/efennell/gpt-neox/megatron/training.py", line 643, in setup_model_and_optimizer
    model = get_model(neox_args=neox_args, use_cache=use_cache)
  File "/gpfs/alpine/csc499/scratch/efennell/gpt-neox/megatron/training.py", line 411, in get_model
    model = GPT2ModelPipe(
  File "/gpfs/alpine/csc499/scratch/efennell/gpt-neox/megatron/model/gpt2_model.py", line 123, in __init__
    super().__init__(
  File "/gpfs/alpine/csc499/scratch/efennell/DeeperSpeed/deepspeed/runtime/pipe/module.py", line 205, in __init__
    self._build()
  File "/gpfs/alpine/csc499/scratch/efennell/DeeperSpeed/deepspeed/runtime/pipe/module.py", line 254, in _build
    module = layer.build()
  File "/gpfs/alpine/csc499/scratch/efennell/DeeperSpeed/deepspeed/runtime/pipe/module.py", line 71, in build
    return self.typename(*self.module_args, **self.module_kwargs)
  File "/gpfs/alpine/csc499/scratch/efennell/gpt-neox/megatron/model/word_embeddings.py", line 172, in __init__
    self.image_prefix = ImagePrefix(
  File "/gpfs/alpine/csc499/scratch/efennell/gpt-neox/megatron/model/image_prefix.py", line 196, in __init__
    self.enc = get_image_encoder(
  File "/gpfs/alpine/csc499/scratch/efennell/gpt-neox/megatron/model/image_prefix.py", line 147, in get_image_encoder
    encoder = clip_encoder(device=device, name=name, pretrain=pretrained, cache_path=cache_path)
  File "/gpfs/alpine/csc499/scratch/efennell/gpt-neox/megatron/model/image_prefix.py", line 85, in clip_encoder
    encoder = open_clip.create_model(
  File "/ccs/home/efennell/.conda/envs/gpt-neox/lib/python3.9/site-packages/open_clip/factory.py", line 224, in create_model
    checkpoint_path = download_pretrained(pretrained_cfg, cache_dir=cache_dir)
  File "/ccs/home/efennell/.conda/envs/gpt-neox/lib/python3.9/site-packages/open_clip/pretrained.py", line 425, in download_pretrained
    target = download_pretrained_from_hf(model_id, cache_dir=cache_dir)
  File "/ccs/home/efennell/.conda/envs/gpt-neox/lib/python3.9/site-packages/open_clip/pretrained.py", line 395, in download_pretrained_from_hf
    cached_file = hf_hub_download(model_id, filename, revision=revision, cache_dir=cache_dir)
  File "/ccs/home/efennell/.conda/envs/gpt-neox/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 120, in _inner_fn
    return fn(*args, **kwargs)
  File "/ccs/home/efennell/.conda/envs/gpt-neox/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1291, in hf_hub_download
    raise LocalEntryNotFoundError(
huggingface_hub.utils._errors.LocalEntryNotFoundError: Connection error, and we cannot find the requested files in the disk cache. Please try again or make sure your Internet connection is on.
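Reading the trace, the failure comes out of hf_hub_download: as far as I can tell, without network access it only succeeds when the requested file is already in the local Hugging Face Hub cache, and otherwise raises the LocalEntryNotFoundError above. A minimal sketch of how one could check whether the checkpoint is actually cached (the repo id and filename below are placeholders I picked for illustration, not values from my config):

import os
os.environ["HF_HUB_OFFLINE"] = "1"  # must be set before importing huggingface_hub; forces cache-only lookups

from huggingface_hub import hf_hub_download

try:
    # Placeholder repo/filename: substitute whatever open_clip is actually requesting.
    path = hf_hub_download(
        repo_id="laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
        filename="open_clip_pytorch_model.bin",
    )
    print("found in local cache:", path)
except Exception as e:
    # With offline mode on, a missing cache entry surfaces as LocalEntryNotFoundError.
    print("not in local cache:", e)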
I have a local copy of the CLIP weights and have pointed to them in the config, but somewhere along the call path this setting does not take effect: open_clip still tries to download the checkpoint from the Hugging Face Hub, and the download fails with the connection error above.
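If I understand the open_clip factory correctly (this is an assumption on my part, and the architecture name and path below are placeholders for whatever my config points at), the Hub download branch in the trace is only taken when the pretrained argument resolves to a known pretrained tag; passing the path to a local checkpoint file should make create_model load it directly instead:

import open_clip

# Placeholder architecture name and checkpoint path, for illustration only.
encoder = open_clip.create_model(
    "ViT-H-14",
    pretrained="/path/to/open_clip_pytorch_model.bin",  # local file, not a pretrained tag
    device="cpu",
)

So part of what I want to verify is whether the config value actually reaches open_clip.create_model as a filesystem path rather than as a tag name.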
It would be great if I could compare print statements or debug output with someone who has a working setup.