During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python3.9/dist-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.9/dist-packages/transformers/generation/utils.py", line 1571, in generate
return self.sample(
File "/usr/local/lib/python3.9/dist-packages/transformers/generation/utils.py", line 2534, in sample
outputs = self(
File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/modeling_moss.py", line 674, in forward
transformer_outputs = self.transformer(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/modeling_moss.py", line 545, in forward
outputs = block(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/modeling_moss.py", line 270, in forward
attn_outputs = self.attn(
File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/modeling_moss.py", line 164, in forward
qkv = self.qkv_proj(hidden_states)
File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/quantization.py", line 371, in forward
out = QuantLinearFunction.apply(x.reshape(-1, x.shape[-1]), self.qweight, self.scales,
File "/usr/local/lib/python3.9/dist-packages/torch/cuda/amp/autocast_mode.py", line 105, in decorate_fwd
return fwd(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/quantization.py", line 283, in forward
output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
File "/root/.cache/huggingface/modules/transformers_modules/local/quantization.py", line 254, in matmul248
matmul_248_kernel[grid](input, qweight, output,
File "/root/.cache/huggingface/modules/transformers_modules/local/custom_autotune.py", line 89, in run
timings = {config: self._bench(*args, config=config, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/custom_autotune.py", line 89, in <dictcomp>
timings = {config: self._bench(*args, config=config, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/custom_autotune.py", line 71, in _bench
return triton.testing.do_bench(kernel_call, rep=40)
File "/usr/local/lib/python3.9/dist-packages/triton/testing.py", line 143, in do_bench
fn()
File "/root/.cache/huggingface/modules/transformers_modules/local/custom_autotune.py", line 67, in kernel_call
self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current)
File "<string>", line 41, in matmul_248_kernel
File "/usr/local/lib/python3.9/dist-packages/triton/compiler.py", line 1588, in compile
so_path = make_stub(name, signature, constants)
File "/usr/local/lib/python3.9/dist-packages/triton/compiler.py", line 1477, in make_stub
so = _build(name, src_path, tmpdir)
File "/usr/local/lib/python3.9/dist-packages/triton/compiler.py", line 1392, in _build
ret = subprocess.check_call(cc_cmd)
File "/usr/lib/python3.9/subprocess.py", line 373, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmpmb2st6kj/main.c', '-O3', '-I/usr/local/cuda/include', '-I/usr/include/python3.10', '-I/tmp/tmpmb2st6kj', '-shared', '-fPIC', '-lcuda', '-o', '/tmp/tmpmb2st6kj/matmul_248_kernel.cpython-310-x86_64-linux-gnu.so', '-L/usr/lib/x86_64-linux-gnu']' returned non-zero exit status 1.
Version: moss-moon-003-sft-int8 — the following error occurred:
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "<stdin>", line 1, in <module>
File "/usr/local/lib/python3.9/dist-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.9/dist-packages/transformers/generation/utils.py", line 1571, in generate
return self.sample(
File "/usr/local/lib/python3.9/dist-packages/transformers/generation/utils.py", line 2534, in sample
outputs = self(
File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/modeling_moss.py", line 674, in forward
transformer_outputs = self.transformer(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/modeling_moss.py", line 545, in forward
outputs = block(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/modeling_moss.py", line 270, in forward
attn_outputs = self.attn(
File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/modeling_moss.py", line 164, in forward
qkv = self.qkv_proj(hidden_states)
File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/quantization.py", line 371, in forward
out = QuantLinearFunction.apply(x.reshape(-1, x.shape[-1]), self.qweight, self.scales,
File "/usr/local/lib/python3.9/dist-packages/torch/cuda/amp/autocast_mode.py", line 105, in decorate_fwd
return fwd(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/quantization.py", line 283, in forward
output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
File "/root/.cache/huggingface/modules/transformers_modules/local/quantization.py", line 254, in matmul248
matmul_248_kernel[grid](input, qweight, output,
File "/root/.cache/huggingface/modules/transformers_modules/local/custom_autotune.py", line 89, in run
timings = {config: self._bench(*args, config=config, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/custom_autotune.py", line 89, in <dictcomp>
timings = {config: self._bench(*args, config=config, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/local/custom_autotune.py", line 71, in _bench
return triton.testing.do_bench(kernel_call, rep=40)
File "/usr/local/lib/python3.9/dist-packages/triton/testing.py", line 143, in do_bench
fn()
File "/root/.cache/huggingface/modules/transformers_modules/local/custom_autotune.py", line 67, in kernel_call
self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current)
File "<string>", line 41, in matmul_248_kernel
File "/usr/local/lib/python3.9/dist-packages/triton/compiler.py", line 1588, in compile
so_path = make_stub(name, signature, constants)
File "/usr/local/lib/python3.9/dist-packages/triton/compiler.py", line 1477, in make_stub
so = _build(name, src_path, tmpdir)
File "/usr/local/lib/python3.9/dist-packages/triton/compiler.py", line 1392, in _build
ret = subprocess.check_call(cc_cmd)
File "/usr/lib/python3.9/subprocess.py", line 373, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmpmb2st6kj/main.c', '-O3', '-I/usr/local/cuda/include', '-I/usr/include/python3.10', '-I/tmp/tmpmb2st6kj', '-shared', '-fPIC', '-lcuda', '-o', '/tmp/tmpmb2st6kj/matmul_248_kernel.cpython-310-x86_64-linux-gnu.so', '-L/usr/lib/x86_64-linux-gnu']' returned non-zero exit status 1.