llm.generate issue on CPU machines

Bug description

There is another issue with the `llm.generate` function, which appears to have been introduced in recent commits (I am surprised that CI didn't catch this):

from litgpt import LLM

llm = LLM.load("EleutherAI/pythia-160m")
llm.generate("What do Llamas eat?")

results in:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[1], line 4
      1 from litgpt import LLM
      3 llm = LLM.load("EleutherAI/pythia-160m")
----> 4 llm.generate("What do Llamas eat?")

File [~/miniforge3/envs/litgpt/lib/python3.9/site-packages/torch/utils/_contextlib.py:116](http://localhost:8888/lab/workspaces/~/miniforge3/envs/litgpt/lib/python3.9/site-packages/torch/utils/_contextlib.py#line=115), in context_decorator.<locals>.decorate_context(*args, **kwargs)
    113 @functools.wraps(func)
    114 def decorate_context(*args, **kwargs):
    115     with ctx_factory():
--> 116         return func(*args, **kwargs)

File [~/Desktop/litgpt/litgpt/api.py:534](http://localhost:8888/lab/workspaces/~/Desktop/litgpt/litgpt/api.py#line=533), in LLM.generate(self, prompt, max_new_tokens, temperature, top_k, top_p, return_as_token_ids, stream)
    532     outputs = iterator()
    533 else:
--> 534     outputs = generate_fn(
    535         model=self.model,
    536         prompt=input_ids,
    537         max_returned_tokens=max_returned_tokens,
    538         temperature=temperature,
    539         top_k=top_k,
    540         top_p=top_p,
    541         eos_id=self.preprocessor.tokenizer.eos_id,
    542         include_prompt=False,
    543     )
    545 if stream:
    546     return outputs

File [~/miniforge3/envs/litgpt/lib/python3.9/site-packages/torch/utils/_contextlib.py:116](http://localhost:8888/lab/workspaces/~/miniforge3/envs/litgpt/lib/python3.9/site-packages/torch/utils/_contextlib.py#line=115), in context_decorator.<locals>.decorate_context(*args, **kwargs)
    113 @functools.wraps(func)
    114 def decorate_context(*args, **kwargs):
    115     with ctx_factory():
--> 116         return func(*args, **kwargs)

File [~/Desktop/litgpt/litgpt/generate/base.py:383](http://localhost:8888/lab/workspaces/~/Desktop/litgpt/litgpt/generate/base.py#line=382), in generate(model, prompt, max_returned_tokens, temperature, top_k, top_p, eos_id, include_prompt)
    343 @torch.inference_mode()
    344 def generate(
    345     model: GPT,
   (...)
    353     include_prompt: bool = True,
    354 ) -> torch.Tensor:
    355     """
    356     Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
    357     The implementation of this function is modified from A. Karpathy's nanoGPT.
   (...)
    380         include_prompt: If true (default) prepends the prompt (after applying the prompt style) to the output.
    381     """
--> 383     token_list = list(generate_fn(
    384         include_prompt=include_prompt,
    385         include_eos=True,
    386         model=model,
    387         prompt=prompt,
    388         max_returned_tokens=max_returned_tokens,
    389         temperature=temperature,
    390         top_k=top_k,
    391         top_p=top_p,
    392         stop_tokens=(([eos_id],) if eos_id is not None else ())
    393     ))
    395     return torch.cat(token_list) if not len(token_list) == 0 else torch.Tensor()

File [~/miniforge3/envs/litgpt/lib/python3.9/site-packages/torch/utils/_contextlib.py:36](http://localhost:8888/lab/workspaces/~/miniforge3/envs/litgpt/lib/python3.9/site-packages/torch/utils/_contextlib.py#line=35), in _wrap_generator.<locals>.generator_context(*args, **kwargs)
     33 try:
     34     # Issuing `None` to a generator fires it up
     35     with ctx_factory():
---> 36         response = gen.send(None)
     38     while True:
     39         try:
     40             # Forward the response to our caller and get its next request

File [~/Desktop/litgpt/litgpt/generate/base.py:172](http://localhost:8888/lab/workspaces/~/Desktop/litgpt/litgpt/generate/base.py#line=171), in generate_fn(model, prompt, max_returned_tokens, temperature, top_k, top_p, stop_tokens, include_prompt, include_eos)
    168 input_pos = torch.arange(0, prompt_size, device=device, dtype=torch.int64)
    169 for current_idx in range(max_returned_tokens - prompt_size):
    170 
    171     # Generate the token
--> 172     token = next_token(model, input_pos, token.view(1, -1), temperature=temperature, top_k=top_k, top_p=top_p)
    173     tokens.append(token)
    174     int_token = token.item()

File [~/Desktop/litgpt/litgpt/generate/base.py:78](http://localhost:8888/lab/workspaces/~/Desktop/litgpt/litgpt/generate/base.py#line=77), in next_token(model, input_pos, x, **kwargs)
     76 def next_token(model: GPT, input_pos: torch.Tensor, x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
     77     logits = model(x, input_pos)
---> 78     _next = sample(logits, **kwargs).to(dtype=torch.int64)
     79     return _next

File [~/Desktop/litgpt/litgpt/generate/base.py:72](http://localhost:8888/lab/workspaces/~/Desktop/litgpt/litgpt/generate/base.py#line=71), in sample(logits, temperature, top_k, top_p)
     70         logits = sample_top_p(logits, top_p)
     71     probs = torch.nn.functional.softmax(logits, dim=-1)
---> 72     return multinomial_num_samples_1(probs)
     73 return torch.argmax(logits, dim=-1, keepdim=True)

File [~/Desktop/litgpt/litgpt/generate/base.py:35](http://localhost:8888/lab/workspaces/~/Desktop/litgpt/litgpt/generate/base.py#line=34), in multinomial_num_samples_1(probs)
     33     distribution = torch.empty_like(probs).exponential_(1)
     34     return torch.argmax(probs / distribution, dim=-1, keepdim=True)
---> 35 return torch.multinomial(probs, num_samples=1)

RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

This works fine in previous versions such as 0.4.9.

What operating system are you using?

macOS

LitGPT Version

Version: 0.4.11

I narrowed it down a bit further: the problem is somewhere in the model's forward pass. From roughly the 7th block onward, the printed values turn into NaN:

https://github.com/Lightning-AI/litgpt/blob/3d36b6b26aea56317774ffb65769e74cb1d8db5a/litgpt/model.py#L98

/Users/sebastian/Desktop/litgpt/litgpt/api.py:222: UserWarning: MPS is currently not supported. Using CPU instead.
  warnings.warn("MPS is currently not supported. Using CPU instead.", UserWarning)
block 1 tensor([[[-0.1057,  0.2296,  0.0062,  ...,  0.4619,  0.3906,  0.6367],
         [-0.4836,  0.2103,  0.6401,  ...,  0.5747,  0.6416,  0.7041],
         [-0.3235,  0.0849,  0.9512,  ...,  0.1890,  0.2151,  0.1394],
         ...,
         [-0.1047,  0.2368, -0.9492,  ..., -0.0238, -0.1179, -0.2322],
         [-0.3896,  0.2751, -0.2380,  ..., -0.2274,  0.1450,  0.3435],
         [-0.6011, -0.2581,  0.1309,  ...,  0.4829, -0.1338, -0.0518]]])
block 2 tensor([[[-0.0986, -0.1464, -0.2467,  ...,  0.4736,  0.4595,  0.4951],
         [-0.1748, -0.1700,  0.1436,  ...,  0.4585,  0.8359,  0.5918],
         [-0.2993, -0.5112,  0.5020,  ...,  0.1832,  0.3770,  0.0740],
         ...,
         [-0.1707,  0.2238, -1.0098,  ...,  0.2377, -0.2566, -0.1475],
         [-0.2678,  0.6162, -0.7803,  ...,  0.0831,  0.0305,  0.3169],
         [-0.3025, -0.1704, -0.3274,  ...,  0.3608, -0.1277, -0.2117]]])
block 3 tensor([[[ 0.1680, -0.1973,  0.2661,  ..., -0.8584,  1.4062, -0.4258],
         [-0.0076, -0.9214, -0.4199,  ..., -0.2085,  0.3550,  0.6611],
         [-0.2158, -0.6768, -0.1826,  ...,  0.3328,  0.1467,  0.3203],
         ...,
         [-0.6362,  0.3423, -1.6582,  ...,  0.2013, -0.6396, -0.3462],
         [-0.0599,  0.3320, -1.4980,  ...,  0.0963,  0.3542,  0.3433],
         [-0.4653, -0.4614, -0.9268,  ...,  0.5674, -0.1849, -0.0605]]])
block 4 tensor([[[ 1.7744, -1.4297,  1.4746,  ..., -1.5049,  2.2109, -0.3230],
         [-0.5703, -1.1035, -1.2637,  ...,  0.1472,  0.9717,  0.3552],
         [-0.3464, -0.8906, -0.9473,  ..., -0.1326, -0.0806,  0.3298],
         ...,
         [-0.5708,  0.1072, -2.0820,  ..., -0.1400, -0.2275, -0.5664],
         [-1.0576, -0.2246, -2.3242,  ..., -0.3274,  0.3459,  0.1765],
         [-0.9800, -1.0176, -1.3828,  ...,  0.3643, -0.6680, -0.0145]]])
block 5 tensor([[[ 1.3242, -1.4248,  1.2607,  ..., -1.5957,  1.8232, -0.3926],
         [-0.8477, -0.7812, -1.1465,  ...,  0.5068,  0.7959,  0.4487],
         [ 0.1035, -1.0010, -0.7876,  ..., -0.0477,  0.0704,  0.3572],
         ...,
         [-0.3098, -0.0284, -2.2227,  ...,  0.5464,  0.1379, -0.5723],
         [-0.9932, -0.2793, -2.6914,  ...,  0.0000,  0.5757,  0.3267],
         [-0.9204, -0.7842, -1.6943,  ...,  0.4355, -0.4875,  0.1433]]])
block 6 tensor([[[ 1.1211, -1.9609,  0.9072,  ..., -1.3203,  1.3613, -0.0569],
         [-0.2979, -0.8257, -1.3096,  ...,  0.7959,  0.4268,  0.8403],
         [ 0.0416, -0.4849, -0.7119,  ..., -0.1052,  0.2598,  0.3496],
         ...,
         [-0.4631,  0.3843, -2.2461,  ...,  0.2756,  0.1716, -0.2839],
         [-0.8379,  0.1685, -2.9551,  ...,  0.0771,  0.3660,  0.3999],
         [-0.7383, -0.2847, -1.5391,  ...,  0.2377, -0.2969,  0.4036]]])
block 7 tensor([[[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]]])
block 8 tensor([[[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],

Lightning-AI / litgpt

llm.generate issue on CPU machines #1715

Bug description

What operating system are you using?

LitGPT Version