IndexError: tensors used as indices must be long, byte or bool tensors

Running into this issue when trying to generate with over around 130 tokens in context on my M40. Generation works fine for small contexts, but errors out at larger contexts than around 130 or so. max_length for generation defaults to 150 if that affects things. My 2080 ti does not have this issue, and will happily generate larger prompts.
Running the cuda branch because M40 is too old for triton branch to run. Model: https://huggingface.co/anon8231489123/gpt4-x-alpaca-13b-native-4bit-128g

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /Storage3/gpt/dbot/llamatest.py:9 in <module>                                                    │
│                                                                                                  │
│    6 while True:                                                                                 │
│    7 │   inp = input("Enter text: ")                                                             │
│    8 │   start = time.time()                                                                     │
│ ❱  9 │   out = llama.generate_sync(inp)                                                          │
│   10 │   print(out)                                                                              │
│   11 │   print("Time taken: ", time.time() - start)                                              │
│   12 │   print("Time per token: ", (time.time() - start) / llama.token_len(out))                 │
│                                                                                                  │
│ /Storage3/gpt/dbot/llamaQClientOriginal.py:130 in generate_sync                                  │
│                                                                                                  │
│   127 │   │   │   stop_strings=[],                                                               │
│   128 │   │   │   override_stop_count = 0                                                        │
│   129 │   │   ):                                                                                 │
│ ❱ 130 │   │   return asyncio.run(self.generate(prompt,                                           │
│   131 │   │   │   │   │   │   │   │   │   │   batch=batch,                                       │
│   132 │   │   │   │   │   │   │   │   │   │   min_length=min_length,                             │
│   133 │   │   │   │   │   │   │   │   │   │   max_length=max_length,                             │
│                                                                                                  │
│ /usr/lib/python3.10/asyncio/runners.py:44 in run                                                 │
│                                                                                                  │
│   41 │   │   events.set_event_loop(loop)                                                         │
│   42 │   │   if debug is not None:                                                               │
│   43 │   │   │   loop.set_debug(debug)                                                           │
│ ❱ 44 │   │   return loop.run_until_complete(main)                                                │
│   45 │   finally:                                                                                │
│   46 │   │   try:                                                                                │
│   47 │   │   │   _cancel_all_tasks(loop)                                                         │
│                                                                                                  │
│ /usr/lib/python3.10/asyncio/base_events.py:646 in run_until_complete                             │
│                                                                                                  │
│    643 │   │   if not future.done():                                                             │
│    644 │   │   │   raise RuntimeError('Event loop stopped before Future completed.')             │
│    645 │   │                                                                                     │
│ ❱  646 │   │   return future.result()                                                            │
│    647 │                                                                                         │
│    648 │   def stop(self):                                                                       │
│    649 │   │   """Stop running the event loop.                                                   │
│                                                                                                  │
│ /Storage3/gpt/dbot/llamaQClientOriginal.py:90 in generate                                        │
│                                                                                                  │
│    87 │   │   │   input_ids = input_ids.to(self.device)                                          │
│    88 │   │   │   for p in trange(batch):                                                        │
│    89 │   │   │   │   with torch.no_grad():                                                      │
│ ❱  90 │   │   │   │   │   generated_ids = self.model.generate(                                   │
│    91 │   │   │   │   │   │   input_ids,                                                         │
│    92 │   │   │   │   │   │   do_sample=True,                                                    │
│    93 │   │   │   │   │   │   min_new_tokens=min_length,                                         │
│                                                                                                  │
│ /home/pathos/.local/lib/python3.10/site-packages/torch/autograd/grad_mode.py:27 in               │
│ decorate_context                                                                                 │
│                                                                                                  │
│    24 │   │   @functools.wraps(func)                                                             │
│    25 │   │   def decorate_context(*args, **kwargs):                                             │
│    26 │   │   │   with self.clone():                                                             │
│ ❱  27 │   │   │   │   return func(*args, **kwargs)                                               │
│    28 │   │   return cast(F, decorate_context)                                                   │
│    29 │                                                                                          │
│    30 │   def _wrap_generator(self, func):                                                       │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1485 in generate        │
│                                                                                                  │
│   1482 │   │   │   )                                                                             │
│   1483 │   │   │                                                                                 │
│   1484 │   │   │   # 13. run sample                                                              │
│ ❱ 1485 │   │   │   return self.sample(                                                           │
│   1486 │   │   │   │   input_ids,                                                                │
│   1487 │   │   │   │   logits_processor=logits_processor,                                        │
│   1488 │   │   │   │   logits_warper=logits_warper,                                              │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:2524 in sample          │
│                                                                                                  │
│   2521 │   │   │   model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)  │
│   2522 │   │   │                                                                                 │
│   2523 │   │   │   # forward pass to get next token                                              │
│ ❱ 2524 │   │   │   outputs = self(                                                               │
│   2525 │   │   │   │   **model_inputs,                                                           │
│   2526 │   │   │   │   return_dict=True,                                                         │
│   2527 │   │   │   │   output_attentions=output_attentions,                                      │
│                                                                                                  │
│ /home/pathos/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl   │
│                                                                                                  │
│   1191 │   │   # this function, and just call forward.                                           │
│   1192 │   │   if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o  │
│   1193 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks):                   │
│ ❱ 1194 │   │   │   return forward_call(*input, **kwargs)                                         │
│   1195 │   │   # Do not call functions when jit is used                                          │
│   1196 │   │   full_backward_hooks, non_full_backward_hooks = [], []                             │
│   1197 │   │   if self._backward_hooks or _global_backward_hooks:                                │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py:687 in       │
│ forward                                                                                          │
│                                                                                                  │
│   684 │   │   return_dict = return_dict if return_dict is not None else self.config.use_return   │
│   685 │   │                                                                                      │
│   686 │   │   # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)    │
│ ❱ 687 │   │   outputs = self.model(                                                              │
│   688 │   │   │   input_ids=input_ids,                                                           │
│   689 │   │   │   attention_mask=attention_mask,                                                 │
│   690 │   │   │   position_ids=position_ids,                                                     │
│                                                                                                  │
│ /home/pathos/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl   │
│                                                                                                  │
│   1191 │   │   # this function, and just call forward.                                           │
│   1192 │   │   if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o  │
│   1193 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks):                   │
│ ❱ 1194 │   │   │   return forward_call(*input, **kwargs)                                         │
│   1195 │   │   # Do not call functions when jit is used                                          │
│   1196 │   │   full_backward_hooks, non_full_backward_hooks = [], []                             │
│   1197 │   │   if self._backward_hooks or _global_backward_hooks:                                │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py:577 in       │
│ forward                                                                                          │
│                                                                                                  │
│   574 │   │   │   │   │   None,                                                                  │
│   575 │   │   │   │   )                                                                          │
│   576 │   │   │   else:                                                                          │
│ ❱ 577 │   │   │   │   layer_outputs = decoder_layer(                                             │
│   578 │   │   │   │   │   hidden_states,                                                         │
│   579 │   │   │   │   │   attention_mask=attention_mask,                                         │
│   580 │   │   │   │   │   position_ids=position_ids,                                             │
│                                                                                                  │
│ /home/pathos/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl   │
│                                                                                                  │
│   1191 │   │   # this function, and just call forward.                                           │
│   1192 │   │   if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o  │
│   1193 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks):                   │
│ ❱ 1194 │   │   │   return forward_call(*input, **kwargs)                                         │
│   1195 │   │   # Do not call functions when jit is used                                          │
│   1196 │   │   full_backward_hooks, non_full_backward_hooks = [], []                             │
│   1197 │   │   if self._backward_hooks or _global_backward_hooks:                                │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py:292 in       │
│ forward                                                                                          │
│                                                                                                  │
│   289 │   │   hidden_states = self.input_layernorm(hidden_states)                                │
│   290 │   │                                                                                      │
│   291 │   │   # Self Attention                                                                   │
│ ❱ 292 │   │   hidden_states, self_attn_weights, present_key_value = self.self_attn(              │
│   293 │   │   │   hidden_states=hidden_states,                                                   │
│   294 │   │   │   attention_mask=attention_mask,                                                 │
│   295 │   │   │   position_ids=position_ids,                                                     │
│                                                                                                  │
│ /home/pathos/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl   │
│                                                                                                  │
│   1191 │   │   # this function, and just call forward.                                           │
│   1192 │   │   if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o  │
│   1193 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks):                   │
│ ❱ 1194 │   │   │   return forward_call(*input, **kwargs)                                         │
│   1195 │   │   # Do not call functions when jit is used                                          │
│   1196 │   │   full_backward_hooks, non_full_backward_hooks = [], []                             │
│   1197 │   │   if self._backward_hooks or _global_backward_hooks:                                │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py:196 in       │
│ forward                                                                                          │
│                                                                                                  │
│   193 │   ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:       │
│   194 │   │   bsz, q_len, _ = hidden_states.size()                                               │
│   195 │   │                                                                                      │
│ ❱ 196 │   │   query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.   │
│   197 │   │   key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.he   │
│   198 │   │   value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.   │
│   199                                                                                            │
│                                                                                                  │
│ /home/pathos/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1194 in _call_impl   │
│                                                                                                  │
│   1191 │   │   # this function, and just call forward.                                           │
│   1192 │   │   if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks o  │
│   1193 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks):                   │
│ ❱ 1194 │   │   │   return forward_call(*input, **kwargs)                                         │
│   1195 │   │   # Do not call functions when jit is used                                          │
│   1196 │   │   full_backward_hooks, non_full_backward_hooks = [], []                             │
│   1197 │   │   if self._backward_hooks or _global_backward_hooks:                                │
│                                                                                                  │
│ /Storage3/gpt/dbot/./GPTQ-for-LLaMa/quant.py:313 in forward                                      │
│                                                                                                  │
│   310 │   │   │                                                                                  │
│   311 │   │   │    weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])   │
│   312 │   │   │                                                                                  │
│ ❱ 313 │   │   │    weights = (self.scales[self.g_idx] * (weight - zeros[self.g_idx]))            │
│   314 │   │   │    out = torch.matmul(x.half(), weights)                                         │
│   315 │   │   out = out.reshape(out_shape)                                                       │
│   316 │   │   out = out + self.bias if self.bias is not None else out                            │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
IndexError: tensors used as indices must be long, byte or bool tensors```
qwopqwop200 / GPTQ-for-LLaMa

IndexError: tensors used as indices must be long, byte or bool tensors #210