Hi, I'm trying to finetune the BioMedLM for Medical Question Answering using our custom dataset using Hugging Face's transformer's library. Since we're looking to optimize the memory usage, we're using Low Rank Adaptation as well. I'm unsure of the format of the dataset that I need to use. Below is the one I'm using currently: { 'instruction': 'xyz', 'output': 'test'}, where instruction is the question and output is the answer. Below is my code - ```

import logging import torch from datasets import Dataset import pandas as pd import gc from transformers import DataCollatorForLanguageModeling from transformers import TrainingArguments, Trainer, AutoModelForCausalLM, AutoTokenizer from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

logging.basicConfig(level=logging.DEBUG)

--------------------------------------------------------------------------------------------------------------

print("creating tokenizer from model") model_name="stanford-crfm/BioMedLM"

tokenizer = AutoTokenizer.from_pretrained(model_name,add_eos_token=True) tokenizer.pad_token_id = 0
tokenizer.add_special_tokens({'eos_token':''}) print('eos_token_id:',tokenizer.eos_token_id)

device_type = "cuda" if torch.cuda.is_available() else "cpu"

device = torch.device(device_type) model = AutoModelForCausalLM.from_pretrained( model_name, ).to(device) model.tie_weights() # todo: understand we dont know what this doing

--------------------------------------------------------------------------------------------------------------

peft_name = 'output/biomedLM-lora' CUTOFF_LEN = 512

def tokenize(prompt, tokenizer, add_eos_token=True): result = tokenizer( prompt+"", # add the end-of-stream token truncation=True, max_length=CUTOFF_LEN, padding="max_length", ) return { "input_ids": result["input_ids"], "attention_mask": result["attention_mask"], }

print("loading data from csv") df = pd.read_csv("dataset.csv") dataset = Dataset.from_pandas(df) dataset = dataset.select_columns(['instruction', 'output'])

print("splitting dataset") dataset = dataset.train_test_split(test_size = 0.33) train_data = dataset["train"] val_data = dataset["test"]

def generate_prompt(data_point): return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

Instruction: {data_point["instruction"]}

Response: {data_point["output"]}"""

print("tokenizing train and val ds") train_data = train_data.shuffle().map(lambda x: tokenize(generate_prompt(x), tokenizer)) val_data = val_data.shuffle().map(lambda x: tokenize(generate_prompt(x), tokenizer))

lora_config = LoraConfig( r = 8, lora_alpha=16, target_modules=["c_attn"], lora_dropout=0.05, bias="none", task_type=TaskType.CAUSAL_LM )

eval_steps = 50 save_steps = 50 logging_steps = 20

trainer = Trainer( model=model, train_dataset=train_data, eval_dataset=val_data, args=TrainingArguments( num_train_epochs=1, learning_rate=1e-5, logging_steps=logging_steps, evaluation_strategy="steps", save_strategy="steps", eval_steps=eval_steps, save_steps=save_steps, output_dir="./models", # where model is saved report_to="none", save_total_limit=3, load_best_model_at_end=True, push_to_hub=False, per_device_train_batch_size=1, # defines per batch size per_device_eval_batch_size=1 ), data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), # wtf is this )

model.config.use_cache = False # silence the warnings. Please re-enable for inference!

print("training") trainer.train()

print("saving model") trainer.model.save_pretrained(peft_name) tokenizer.save_pretrained(peft_name)

--------------------------------------------------------------------------------------------------------------../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [472,0,0], thread: [126,0,0] Assertion `srcIndex < srcSelectDimSize` failed.

../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [472,0,0], thread: [127,0,0] Assertion srcIndex < srcSelectDimSize failed.

print("cleanup") model = None tokenizer=None trainer=None gc.collect() torch.cuda.empty_cache()

--------------------------------------------------------------------------------------------------------------



When I run the above code, I get the following error: 

│ ❱ 118 trainer.train()                                                        │
│   119                                                                        │
│   120 print("saving model")                                                  │
│   121 trainer.model.save_pretrained(peft_name)                               │
│                                                                              │
│ /home/ubuntu/.local/lib/python3.10/site-packages/transformers/trainer.py:164 │
│ 5 in train                                                                   │
│                                                                              │
│   1642 │   │   inner_training_loop = find_executable_batch_size(             │
│   1643 │   │   │   self._inner_training_loop, self._train_batch_size, args.a │
│   1644 │   │   )                                                             │
│ ❱ 1645 │   │   return inner_training_loop(                                   │
│   1646 │   │   │   args=args,                                                │
│   1647 │   │   │   resume_from_checkpoint=resume_from_checkpoint,            │
│   1648 │   │   │   trial=trial,                                              │
│                                                                              │
│ /home/ubuntu/.local/lib/python3.10/site-packages/transformers/trainer.py:193 │
│ 8 in _inner_training_loop                                                    │
│                                                                              │
│   1935 │   │   │   │   │   self.control = self.callback_handler.on_step_begi │
│   1936 │   │   │   │                                                         │
│   1937 │   │   │   │   with self.accelerator.accumulate(model):              │
│ ❱ 1938 │   │   │   │   │   tr_loss_step = self.training_step(model, inputs)  │
│   1939 │   │   │   │                                                         │
│   1940 │   │   │   │   if (                                                  │
│   1941 │   │   │   │   │   args.logging_nan_inf_filter                       │
│                                                                              │
│ /home/ubuntu/.local/lib/python3.10/site-packages/transformers/trainer.py:275 │
│ 9 in training_step                                                           │
│                                                                              │
│   2756 │   │   │   return loss_mb.reduce_mean().detach().to(self.args.device │
│   2757 │   │                                                                 │
│   2758 │   │   with self.compute_loss_context_manager():                     │
│ ❱ 2759 │   │   │   loss = self.compute_loss(model, inputs)                   │
│   2760 │   │                                                                 │
│   2761 │   │   if self.args.n_gpu > 1:                                       │
│   2762 │   │   │   loss = loss.mean()  # mean() to average on multi-gpu para │
│                                                                              │
│ /home/ubuntu/.local/lib/python3.10/site-packages/transformers/trainer.py:278 │
│ 4 in compute_loss                                                            │
│                                                                              │
│   2781 │   │   │   labels = inputs.pop("labels")                             │
│   2782 │   │   else:                                                         │
│   2783 │   │   │   labels = None                                             │
│ ❱ 2784 │   │   outputs = model(**inputs)                                     │
│   2785 │   │   # Save past state if it exists                                │
│   2786 │   │   # TODO: this needs to be fixed and made cleaner later.        │
│   2787 │   │   if self.args.past_index >= 0:                                 │
│                                                                              │
│ /home/ubuntu/.local/lib/python3.10/site-packages/torch/nn/modules/module.py: │
│ 1501 in _call_impl                                                           │
│                                                                              │
│   1498 │   │   if not (self._backward_hooks or self._backward_pre_hooks or s │
│   1499 │   │   │   │   or _global_backward_pre_hooks or _global_backward_hoo │
│   1500 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1501 │   │   │   return forward_call(*args, **kwargs)                      │
│   1502 │   │   # Do not call functions when jit is used                      │
│   1503 │   │   full_backward_hooks, non_full_backward_hooks = [], []         │
│   1504 │   │   backward_pre_hooks = []                                       │
│                                                                              │
│ /home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/gpt2/mo │
│ deling_gpt2.py:1080 in forward                                               │
│                                                                              │
│   1077 │   │   """                                                           │
│   1078 │   │   return_dict = return_dict if return_dict is not None else sel │
│   1079 │   │                                                                 │
│ ❱ 1080 │   │   transformer_outputs = self.transformer(                       │
│   1081 │   │   │   input_ids,                                                │
│   1082 │   │   │   past_key_values=past_key_values,                          │
│   1083 │   │   │   attention_mask=attention_mask,                            │
│                                                                              │
│ /home/ubuntu/.local/lib/python3.10/site-packages/torch/nn/modules/module.py: │
│ 1501 in _call_impl                                                           │
│                                                                              │
│   1498 │   │   if not (self._backward_hooks or self._backward_pre_hooks or s │
│   1499 │   │   │   │   or _global_backward_pre_hooks or _global_backward_hoo │
│   1500 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1501 │   │   │   return forward_call(*args, **kwargs)                      │
│   1502 │   │   # Do not call functions when jit is used                      │
│   1503 │   │   full_backward_hooks, non_full_backward_hooks = [], []         │
│   1504 │   │   backward_pre_hooks = []                                       │
│                                                                              │
│ /home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/gpt2/mo │
│ deling_gpt2.py:903 in forward                                                │
│                                                                              │
│    900 │   │   │   │   │   encoder_attention_mask,                           │
│    901 │   │   │   │   )                                                     │
│    902 │   │   │   else:                                                     │
│ ❱  903 │   │   │   │   outputs = block(                                      │
│    904 │   │   │   │   │   hidden_states,                                    │
│    905 │   │   │   │   │   layer_past=layer_past,                            │
│    906 │   │   │   │   │   attention_mask=attention_mask,                    │
│                                                                              │
│ /home/ubuntu/.local/lib/python3.10/site-packages/torch/nn/modules/module.py: │
│ 1501 in _call_impl                                                           │
│                                                                              │
│   1498 │   │   if not (self._backward_hooks or self._backward_pre_hooks or s │
│   1499 │   │   │   │   or _global_backward_pre_hooks or _global_backward_hoo │
│   1500 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1501 │   │   │   return forward_call(*args, **kwargs)                      │
│   1502 │   │   # Do not call functions when jit is used                      │
│   1503 │   │   full_backward_hooks, non_full_backward_hooks = [], []         │
│   1504 │   │   backward_pre_hooks = []                                       │
│                                                                              │
│ /home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/gpt2/mo │
│ deling_gpt2.py:391 in forward                                                │
│                                                                              │
│    388 │   ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tupl │
│    389 │   │   residual = hidden_states                                      │
│    390 │   │   hidden_states = self.ln_1(hidden_states)                      │
│ ❱  391 │   │   attn_outputs = self.attn(                                     │
│    392 │   │   │   hidden_states,                                            │
│    393 │   │   │   layer_past=layer_past,                                    │
│    394 │   │   │   attention_mask=attention_mask,                            │
│                                                                              │
│ /home/ubuntu/.local/lib/python3.10/site-packages/torch/nn/modules/module.py: │
│ 1501 in _call_impl                                                           │
│                                                                              │
│   1498 │   │   if not (self._backward_hooks or self._backward_pre_hooks or s │
│   1499 │   │   │   │   or _global_backward_pre_hooks or _global_backward_hoo │
│   1500 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1501 │   │   │   return forward_call(*args, **kwargs)                      │
│   1502 │   │   # Do not call functions when jit is used                      │
│   1503 │   │   full_backward_hooks, non_full_backward_hooks = [], []         │
│   1504 │   │   backward_pre_hooks = []                                       │
│                                                                              │
│ /home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/gpt2/mo │
│ deling_gpt2.py:313 in forward                                                │
│                                                                              │
│    310 │   │   │   key, value = self.c_attn(encoder_hidden_states).split(sel │
│    311 │   │   │   attention_mask = encoder_attention_mask                   │
│    312 │   │   else:                                                         │
│ ❱  313 │   │   │   query, key, value = self.c_attn(hidden_states).split(self │
│    314 │   │                                                                 │
│    315 │   │   query = self._split_heads(query, self.num_heads, self.head_di │
│    316 │   │   key = self._split_heads(key, self.num_heads, self.head_dim)   │
│                                                                              │
│ /home/ubuntu/.local/lib/python3.10/site-packages/torch/nn/modules/module.py: │
│ 1501 in _call_impl                                                           │
│                                                                              │
│   1498 │   │   if not (self._backward_hooks or self._backward_pre_hooks or s │
│   1499 │   │   │   │   or _global_backward_pre_hooks or _global_backward_hoo │
│   1500 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1501 │   │   │   return forward_call(*args, **kwargs)                      │
│   1502 │   │   # Do not call functions when jit is used                      │
│   1503 │   │   full_backward_hooks, non_full_backward_hooks = [], []         │
│   1504 │   │   backward_pre_hooks = []                                       │
│                                                                              │
│ /home/ubuntu/.local/lib/python3.10/site-packages/transformers/pytorch_utils. │
│ py:103 in forward                                                            │
│                                                                              │
│   100 │                                                                      │
│   101 │   def forward(self, x):                                              │
│   102 │   │   size_out = x.size()[:-1] + (self.nf,)                          │
│ ❱ 103 │   │   x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight │
│   104 │   │   x = x.view(size_out)                                           │
│   105 │   │   return x                                                       │
│   106                                                                        │
╰──────────────────────────────────────────────────────────────────────────────╯
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling 
`cublasCreate(handle)`

I also get this warning: 
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [472,0,0], thread: [126,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [472,0,0], thread: [127,0,0] Assertion `srcIndex < srcSelectDimSize` failed.

How do I proceed?

Hi, @J38 . Actually, I encountered the same problem. I used the code in ./finetune/textgen for text generation, and I processed my data into the expected format. I trained it on 4*A100 40G, and it was running fine. However, at a certain iteration, the training terminated, and the following error occurred:


  0%|          | 97/30152 [06:44<37:45:09,  4.52s/it]/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [38,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [39,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [40,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [41,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [42,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [43,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [44,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [45,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [46,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [47,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [48,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [49,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [50,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [51,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [52,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [53,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [54,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [55,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [56,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [57,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [58,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [59,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [60,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [61,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [62,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [63,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [4,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [5,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [6,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [7,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [8,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [9,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [10,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [11,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [12,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [13,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [14,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [15,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [16,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [17,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [18,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [19,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [20,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [21,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [22,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [23,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [24,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [25,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [26,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [27,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [28,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [29,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [30,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [31,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [96,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [97,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [98,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [99,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [100,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [101,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [102,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [103,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [104,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [105,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [106,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [107,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [108,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [109,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [110,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [111,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [112,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [113,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [114,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [115,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [116,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [117,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [118,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [119,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [120,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [121,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [122,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [123,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [124,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [125,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [126,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [127,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [68,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [69,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [70,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [71,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [72,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [73,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [74,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [75,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [76,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [77,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [78,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [79,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [80,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [81,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [82,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [83,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [84,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [85,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [86,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [87,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [88,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [89,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [90,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [91,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [92,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [93,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [94,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1659484810403/work/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [609,0,0], thread: [95,0,0] Assertion `srcIndex < srcSelectDimSize` failed.

Traceback (most recent call last):
  File "gpt2/finetune_for_summarization.py", line 162, in <module>
    finetune()
  File "gpt2/finetune_for_summarization.py", line 156, in finetune
    trainer.train()
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/transformers/trainer.py", line 1543, in train
    return inner_training_loop(
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/transformers/trainer.py", line 1791, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/transformers/trainer.py", line 2539, in training_step
    loss = self.compute_loss(model, inputs)
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/transformers/trainer.py", line 2571, in compute_loss
    outputs = model(**inputs)
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
    ret_val = func(*args, **kwargs)
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1724, in forward
    loss = self.module(*inputs, **kwargs)
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 1043, in forward
    transformer_outputs = self.transformer(
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 877, in forward
    outputs = torch.utils.checkpoint.checkpoint(
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/torch/utils/checkpoint.py", line 235, in checkpoint
    return CheckpointFunction.apply(function, preserve, *args)
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/torch/utils/checkpoint.py", line 96, in forward
    outputs = run_function(*args)
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 873, in custom_forward
    return module(*inputs, use_cache, output_attentions)
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 388, in forward
    attn_outputs = self.attn(
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 329, in forward
    attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
  File "/home/.conda/envs/pubmedgpt/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 199, in _attn
    mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.```

stanford-crfm / BioMedLM

Finetuning BioMedLM for Medical QA #20

--------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [472,0,0], thread: [126,0,0] Assertion `srcIndex < srcSelectDimSize` failed.

--------------------------------------------------------------------------------------------------------------

stanford-crfm / BioMedLM

Finetuning BioMedLM for Medical QA #20

--------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [472,0,0], thread: [126,0,0] Assertion srcIndex < srcSelectDimSize failed.

--------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [472,0,0], thread: [126,0,0] Assertion `srcIndex < srcSelectDimSize` failed.