from transformers import AutoTokenizer, AutoModel
# path = "/home/soft/glm2/int4"
# path = "THUDM/chatglm2-6b-int4"
path = "THUDM/chatglm2-6b-32k-int4"
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(path, trust_remote_code=True).half().cuda()
model = model.eval()
response, history = model.chat(tokenizer, "你好", history=[])  # "Hello"
print(response)
response, history = model.chat(tokenizer, "晚上睡不着应该怎么办", history=history)  # "What should I do if I can't sleep at night?"
print(response)
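Before the traceback below, a quick environment check can show whether the GPU's compute capability is the likely culprit. This is a diagnostic sketch separate from 1.py, using only standard torch APIs:

import torch

# "no kernel image is available" usually means the precompiled CUDA kernels
# (here: the int4 kernels shipped by cpm_kernels) do not include a binary
# for this GPU's compute capability.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print("Compute capability:", torch.cuda.get_device_capability(0))
    print("PyTorch CUDA version:", torch.version.cuda)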
Running 1.py produces the following error:
Traceback (most recent call last):
  File "/home/soft/glm2/glm2/1.py", line 8, in <module>
    response, history = model.chat(tokenizer, "你好", history=[])
  File "/home/soft/glm2/v/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/THUDM/chatglm2-6b-32k-int4/bac45edda11b4f88961e054b67bd3547099e84c5/modeling_chatglm.py", line 1042, in chat
    outputs = self.generate(**inputs, **gen_kwargs)
  File "/home/soft/glm2/v/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/soft/glm2/v/lib/python3.10/site-packages/transformers/generation/utils.py", line 1572, in generate
    return self.sample(
  File "/home/soft/glm2/v/lib/python3.10/site-packages/transformers/generation/utils.py", line 2619, in sample
    outputs = self(
  File "/home/soft/glm2/v/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/THUDM/chatglm2-6b-32k-int4/bac45edda11b4f88961e054b67bd3547099e84c5/modeling_chatglm.py", line 946, in forward
    transformer_outputs = self.transformer(
  File "/home/soft/glm2/v/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/THUDM/chatglm2-6b-32k-int4/bac45edda11b4f88961e054b67bd3547099e84c5/modeling_chatglm.py", line 836, in forward
    hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
  File "/home/soft/glm2/v/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/THUDM/chatglm2-6b-32k-int4/bac45edda11b4f88961e054b67bd3547099e84c5/modeling_chatglm.py", line 638, in forward
    layer_ret = layer(
  File "/home/soft/glm2/v/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/THUDM/chatglm2-6b-32k-int4/bac45edda11b4f88961e054b67bd3547099e84c5/modeling_chatglm.py", line 546, in forward
    attention_output, kv_cache = self.self_attention(
  File "/home/soft/glm2/v/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/THUDM/chatglm2-6b-32k-int4/bac45edda11b4f88961e054b67bd3547099e84c5/modeling_chatglm.py", line 375, in forward
    mixed_x_layer = self.query_key_value(hidden_states)
  File "/home/soft/glm2/v/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/THUDM/chatglm2-6b-32k-int4/bac45edda11b4f88961e054b67bd3547099e84c5/quantization.py", line 502, in forward
    output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
  File "/home/soft/glm2/v/lib/python3.10/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs) # type: ignore[misc]
  File "/root/.cache/huggingface/modules/transformers_modules/THUDM/chatglm2-6b-32k-int4/bac45edda11b4f88961e054b67bd3547099e84c5/quantization.py", line 75, in forward
    weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width)
  File "/root/.cache/huggingface/modules/transformers_modules/THUDM/chatglm2-6b-32k-int4/bac45edda11b4f88961e054b67bd3547099e84c5/quantization.py", line 299, in extract_weight_to_half
    func(
  File "/home/soft/glm2/v/lib/python3.10/site-packages/cpm_kernels/kernels/base.py", line 48, in __call__
    func = self._prepare_func()
  File "/home/soft/glm2/v/lib/python3.10/site-packages/cpm_kernels/kernels/base.py", line 40, in _prepare_func
    self._module.get_module(), self._func_name
  File "/home/soft/glm2/v/lib/python3.10/site-packages/cpm_kernels/kernels/base.py", line 24, in get_module
    self._module[curr_device] = cuda.cuModuleLoadData(self._code)
  File "/home/soft/glm2/v/lib/python3.10/site-packages/cpm_kernels/library/base.py", line 94, in wrapper
    return f(*args, **kwargs)
  File "/home/soft/glm2/v/lib/python3.10/site-packages/cpm_kernels/library/cuda.py", line 233, in cuModuleLoadData
    checkCUStatus(cuda.cuModuleLoadData(ctypes.byref(module), data))
  File "/home/soft/glm2/v/lib/python3.10/site-packages/cpm_kernels/library/cuda.py", line 216, in checkCUStatus
    raise RuntimeError("CUDA Error: %s" % cuGetErrorString(error))
RuntimeError: CUDA Error: no kernel image is available for execution on the device
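The failure comes from cpm_kernels, which ships CUDA kernels precompiled for a fixed set of GPU architectures; when none of the embedded kernel images matches the running GPU's compute capability, cuModuleLoadData fails with exactly this message. A possible CPU-only fallback, sketched under the assumption that there is enough system RAM for the full-precision weights (the checkpoint name is an assumption; this follows the .float() loading pattern from the THUDM model cards):

from transformers import AutoTokenizer, AutoModel

# CPU-only fallback sketch: load the full-precision checkpoint in fp32,
# bypassing the cpm_kernels CUDA path entirely. Slow, but architecture-independent.
path = "THUDM/chatglm2-6b-32k"  # assumption: full-precision counterpart of the int4 model
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(path, trust_remote_code=True).float()
model = model.eval()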
Is there an existing issue for this?

Current Behavior

Running 1.py with the int4 checkpoint fails with "RuntimeError: CUDA Error: no kernel image is available for execution on the device", as shown in the traceback above.

Expected Behavior
In 1.py, changing THUDM/chatglm2-6b-int4 to THUDM/chatglm2-6b and removing .half() when loading the model makes the program run completely normally; a sketch of that configuration follows.
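A minimal sketch of the working configuration described above, assuming the full-precision checkpoint fits in GPU memory (the dtype is whatever from_pretrained defaults to for this checkpoint):

from transformers import AutoTokenizer, AutoModel

path = "THUDM/chatglm2-6b"  # full-precision checkpoint instead of the int4 one
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(path, trust_remote_code=True).cuda()  # no .half()
model = model.eval()
response, history = model.chat(tokenizer, "你好", history=[])  # "Hello"
print(response)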
Steps To Reproduce

Run 1.py as shown at the top of this report.

Environment

Python 3.10 virtualenv at /home/soft/glm2/v with torch, transformers, and cpm_kernels (per the traceback paths).

Anything else?

None