Open · leojames opened this issue 1 year ago
What args are you adding to the command? I see you're trying to run this on 16 GB of VRAM, which means you'll only be able to fit the Pythia model in 8-bit, or you'd have to offload parts of the model to CPU/disk.
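For reference, a minimal sketch of loading the model in 8-bit so it fits in 16 GB (this is not OpenChatKit's own loading code; it assumes `bitsandbytes` is installed, and uses `togethercomputer/Pythia-Chat-Base-7B` as the checkpoint name, adjust if you're running a different model):

```python
# Sketch: load the OpenChatKit Pythia checkpoint quantized to 8-bit.
# Assumes bitsandbytes is installed; the model name is an example.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "togethercomputer/Pythia-Chat-Base-7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",   # let accelerate place layers on GPU/CPU/disk
    load_in_8bit=True,   # quantize weights to 8-bit via bitsandbytes
)
```

With `device_map="auto"`, any layers that don't fit on the GPU are offloaded to CPU RAM, which is slower but avoids the hard OOM.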
When I use this model and ask a question, a moment later it runs out of memory:
```
Traceback (most recent call last):
  File "/export/openChatKit/openChatKit/inference/bot.py", line 285, in <module>
    main()
  File "/export/openChatKit/openChatKit/inference/bot.py", line 281, in main
    ).cmdloop()
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/cmd.py", line 138, in cmdloop
    stop = self.onecmd(line)
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/cmd.py", line 217, in onecmd
    return func(arg)
  File "/export/openChatKit/openChatKit/inference/bot.py", line 150, in do_say
    output = self._model.do_inference(
  File "/export/openChatKit/openChatKit/inference/bot.py", line 92, in do_inference
    outputs = self._model.generate(
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
    return func(*args, **kwargs)
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/transformers/generation_utils.py", line 1326, in generate
    return self.sample(
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/transformers/generation_utils.py", line 1944, in sample
    outputs = self(
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 619, in forward
    outputs = self.gpt_neox(
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 511, in forward
    outputs = layer(
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 319, in forward
    attention_layer_outputs = self.attention(
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/export/dolly/anaconda3/envs/OpenChatKit/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 148, in forward
    key = torch.cat((past_key, key), dim=-2)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 18.00 MiB (GPU 0; 15.89 GiB total capacity; 14.98 GiB already allocated; 15.88 MiB free; 15.16 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
```
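Note that the failure is in `key = torch.cat((past_key, key), dim=-2)`, i.e. while the KV cache grows: every generated token adds memory, so a model that barely fits will OOM partway through a response. Shortening the prompt or the maximum number of generated tokens lowers the peak. As the error message itself suggests, you can also try capping the allocator's split size to reduce fragmentation. A minimal sketch of doing that from Python (the 128 MB value is an arbitrary example, and the variable must be set before the CUDA allocator is first used, so alternatively export it in your shell before launching bot.py):

```python
# Sketch: configure the PyTorch CUDA caching allocator before any CUDA work.
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch  # the allocator reads the variable on first CUDA allocation
```

This only helps when reserved memory far exceeds allocated memory; with 14.98 GiB already allocated out of 15.89 GiB, 8-bit loading or CPU offload is the more likely fix.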