I get this exception after typing anything at the prompt. Does anyone know what it means and how I can fix it?
Traceback (most recent call last):
  File "/home/.../scripts/llama-chat/example-chat.py", line 118, in <module>
    fire.Fire(main)
  File "/home/.../.local/lib/python3.10/site-packages/fire/core.py", line 141, in Fire
    component_trace = _Fire(component, args, parsed_flag_args, context, name)
  File "/home/.../.local/lib/python3.10/site-packages/fire/core.py", line 475, in _Fire
    component, remaining_args = _CallAndUpdateTrace(
  File "/home/.../.local/lib/python3.10/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
    component = fn(*varargs, **kwargs)
  File "/home/.../scripts/llama-chat/example-chat.py", line 111, in main
    results = generator.generate(
  File "/home/.../scripts/llama-chat/llama/generation.py", line 60, in generate
    logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
  File "/home/.../.local/lib/python3.10/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
    return func(*args, **kwargs)
  File "/home/.../scripts/llama-chat/llama/model.py", line 264, in forward
    h = layer(h, start_pos, freqs_cis, mask)
  File "/home/.../.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/.../scripts/llama-chat/llama/model.py", line 189, in forward
    h = x + self.attention.forward(self.attention_norm(x), start_pos, freqs_cis, mask)
  File "/home/.../scripts/llama-chat/llama/model.py", line 111, in forward
    xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
  File "/home/.../.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/.../.local/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: at::cuda::blas::gemm: not implemented for N3c108BFloat16E
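For reference: `N3c108BFloat16E` is the C++-mangled name of `c10::BFloat16`, so the error appears to say that cuBLAS has no bfloat16 matrix-multiply kernel for this GPU/PyTorch combination (bf16 GEMM generally requires an Ampere-class card, compute capability 8.0 or newer). Below is a minimal diagnostic sketch plus the float16 fallback I would try; it is not part of example-chat.py, and it assumes the script sets a default CUDA tensor type before loading the checkpoint, the way the upstream LLaMA example does:

```python
import torch

# Hypothetical diagnostic, not from the original script: on pre-Ampere GPUs
# (compute capability < 8.0) cuBLAS lacks bf16 GEMM, so F.linear on bfloat16
# tensors fails exactly like the traceback above.
print(torch.cuda.get_device_capability())  # e.g. (7, 5) on a Turing card
print(torch.cuda.is_bf16_supported())      # False -> bf16 matmuls will raise

# Possible workaround: load the model in float16 instead of bfloat16 by
# setting the default CUDA tensor type before the checkpoint is loaded;
# fp16 GEMM is implemented on much older GPUs.
torch.set_default_tensor_type(torch.cuda.HalfTensor)
```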