RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Float

I was successful in getting your code to work on my 2060 laptop after a few tweeks. I just got a tesla M40 card in and am looking at running GPT-J-6 on it using this method. To start though, I thought I'd use the same code with the GPT-NEO-2.7B model to verify that it's working OK. I got the error in the title though when I tried to run it.
Any ideas as to what's going on?
Full error log:
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<timed exec> in <module>

~\AppData\Roaming\Python\Python37\site-packages\torch\autograd\grad_mode.py in decorate_context(*args, **kwargs)
     26         def decorate_context(*args, **kwargs):
     27             with self.__class__():
---> 28                 return func(*args, **kwargs)
     29         return cast(F, decorate_context)
     30 

~\AppData\Roaming\Python\Python37\site-packages\transformers\generation_utils.py in generate(self, input_ids, max_length, min_length, do_sample, early_stopping, num_beams, temperature, top_k, top_p, repetition_penalty, bad_words_ids, bos_token_id, pad_token_id, eos_token_id, length_penalty, no_repeat_ngram_size, encoder_no_repeat_ngram_size, num_return_sequences, max_time, max_new_tokens, decoder_start_token_id, use_cache, num_beam_groups, diversity_penalty, prefix_allowed_tokens_fn, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, forced_bos_token_id, forced_eos_token_id, remove_invalid_values, synced_gpus, **model_kwargs)
   1024                 return_dict_in_generate=return_dict_in_generate,
   1025                 synced_gpus=synced_gpus,
-> 1026                 **model_kwargs,
   1027             )
   1028 

~\AppData\Roaming\Python\Python37\site-packages\transformers\generation_utils.py in sample(self, input_ids, logits_processor, stopping_criteria, logits_warper, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, **model_kwargs)
   1533                 return_dict=True,
   1534                 output_attentions=output_attentions,
-> 1535                 output_hidden_states=output_hidden_states,
   1536             )
   1537 

~\AppData\Roaming\Python\Python37\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~\AppData\Roaming\Python\Python37\site-packages\transformers\models\gpt_neo\modeling_gpt_neo.py in forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
    983             output_attentions=output_attentions,
    984             output_hidden_states=output_hidden_states,
--> 985             return_dict=return_dict,
    986         )
    987         hidden_states = transformer_outputs[0]

~\AppData\Roaming\Python\Python37\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~\AppData\Local\Temp/ipykernel_8288/2499053029.py in new_forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
    219                     head_mask=head_mask[i],
    220                     use_cache=use_cache,
--> 221                     output_attentions=output_attentions,
    222                 )
    223 

~\AppData\Roaming\Python\Python37\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~\AppData\Roaming\Python\Python37\site-packages\transformers\models\gpt_neo\modeling_gpt_neo.py in forward(self, hidden_states, layer_past, attention_mask, head_mask, use_cache, output_attentions)
    559             head_mask=head_mask,
    560             use_cache=use_cache,
--> 561             output_attentions=output_attentions,
    562         )
    563         attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)

~\AppData\Roaming\Python\Python37\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~\AppData\Roaming\Python\Python37\site-packages\transformers\models\gpt_neo\modeling_gpt_neo.py in forward(self, hidden_states, layer_past, attention_mask, head_mask, use_cache, output_attentions)
    501             head_mask=head_mask,
    502             use_cache=use_cache,
--> 503             output_attentions=output_attentions,
    504         )
    505 

~\AppData\Roaming\Python\Python37\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~\AppData\Roaming\Python\Python37\site-packages\transformers\models\gpt_neo\modeling_gpt_neo.py in forward(self, hidden_states, attention_mask, layer_past, head_mask, use_cache, output_attentions)
    453             masked_bias=self.masked_bias,
    454             attn_dropout=self.attn_dropout,
--> 455             head_mask=head_mask,
    456         )
    457 

~\AppData\Roaming\Python\Python37\site-packages\transformers\models\gpt_neo\modeling_gpt_neo.py in _attn(self, query, key, value, causal_mask, masked_bias, attn_dropout, attention_mask, head_mask)
    276 
    277         attn_weights = torch.matmul(query, key.transpose(-1, -2))
--> 278         attn_weights = torch.where(causal_mask, attn_weights, masked_bias.to(attn_weights.dtype))
    279 
    280         if attention_mask is not None:

RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Float
arrmansa / Basic-UI-for-GPT-J-6B-with-low-vram

RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Float #4