Explicitly passing a revision is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a revision is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a revision is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Setting pad_token_id to eos_token_id:106068 for open-end generation.
/usr/bin/ld: cannot find -lcuda
collect2: error: ld returned 1 exit status
Traceback (most recent call last):
File "<string>", line 21, in matmul_248_kernel
KeyError: ('2-.-0-.-0-d82511111ad128294e9d31a6ac684238-d6252949da17ceb5f3a278a70250af13-3b85c7bef5f0a641282f3b73af50f599-14de7de5c4da5794c8ca14e7e41a122d-3498c340fd4b6ee7805fd54b882a04f5-e1f133f98d04093da2078dfc51c36b72-b26258bf01f839199e39d64851821f26-d7c06e3b46e708006c15224aac7a1378-f585402118c8a136948ce0a49cfe122c', (torch.float16, torch.int32, torch.float16, torch.float16, torch.int32, torch.int32, 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32'), (256, 64, 32, 8), (True, True, True, True, True, True, (False, False), (True, False), (True, False), (False, False), (False, False), (True, False), (False, True), (True, False), (False, True), (True, False), (False, True), (True, False), (True, False)))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "moss_cli_int8.py", line 10, in <module>
outputs = model.generate(inputs, do_sample=True, temperature=0.7, top_p=0.8, repetition_penalty=1.02, max_new_tokens=256)
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/transformers/generation/utils.py", line 1571, in generate
return self.sample(
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/transformers/generation/utils.py", line 2534, in sample
outputs = self(
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/modeling_moss.py", line 674, in forward
transformer_outputs = self.transformer(
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/modeling_moss.py", line 545, in forward
outputs = block(
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/modeling_moss.py", line 270, in forward
attn_outputs = self.attn(
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/modeling_moss.py", line 164, in forward
qkv = self.qkv_proj(hidden_states)
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/quantization.py", line 371, in forward
out = QuantLinearFunction.apply(x.reshape(-1, x.shape[-1]), self.qweight, self.scales,
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/autograd/function.py", line 506, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/cuda/amp/autocast_mode.py", line 106, in decorate_fwd
return fwd(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/quantization.py", line 283, in forward
output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/quantization.py", line 254, in matmul248
matmul_248_kernel[grid](input, qweight, output,
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/custom_autotune.py", line 89, in run
timings = {config: self._bench(*args, config=config, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/custom_autotune.py", line 89, in <dictcomp>
timings = {config: self._bench(*args, config=config, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/custom_autotune.py", line 71, in _bench
return triton.testing.do_bench(kernel_call, rep=40)
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/triton/testing.py", line 143, in do_bench
fn()
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/custom_autotune.py", line 67, in kernel_call
self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current)
File "<string>", line 41, in matmul_248_kernel
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/triton/compiler.py", line 1588, in compile
so_path = make_stub(name, signature, constants)
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/triton/compiler.py", line 1477, in make_stub
so = _build(name, src_path, tmpdir)
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/triton/compiler.py", line 1392, in _build
ret = subprocess.check_call(cc_cmd)
File "/opt/miniconda3/envs/moss/lib/python3.8/subprocess.py", line 364, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmp3o3awxeu/main.c', '-O3', '-I/usr/local/cuda/include', '-I/opt/miniconda3/envs/moss/include/python3.8', '-I/tmp/tmp3o3awxeu', '-shared', '-fPIC', '-lcuda', '-o', '/tmp/tmp3o3awxeu/matmul_248_kernel.cpython-38-x86_64-linux-gnu.so']' returned non-zero exit status 1.
Explicitly passing a revision is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a revision is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a revision is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Setting pad_token_id to eos_token_id:106068 for open-end generation.
/usr/bin/ld: cannot find -lcuda
collect2: error: ld returned 1 exit status
Traceback (most recent call last):
File "<string>", line 21, in matmul_248_kernel
KeyError: ('2-.-0-.-0-d82511111ad128294e9d31a6ac684238-d6252949da17ceb5f3a278a70250af13-3b85c7bef5f0a641282f3b73af50f599-14de7de5c4da5794c8ca14e7e41a122d-3498c340fd4b6ee7805fd54b882a04f5-e1f133f98d04093da2078dfc51c36b72-b26258bf01f839199e39d64851821f26-d7c06e3b46e708006c15224aac7a1378-f585402118c8a136948ce0a49cfe122c', (torch.float16, torch.int32, torch.float16, torch.float16, torch.int32, torch.int32, 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32'), (256, 64, 32, 8), (True, True, True, True, True, True, (False, False), (True, False), (True, False), (False, False), (False, False), (True, False), (False, True), (True, False), (False, True), (True, False), (False, True), (True, False), (True, False)))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "moss_cli_int8.py", line 10, in <module>
outputs = model.generate(inputs, do_sample=True, temperature=0.7, top_p=0.8, repetition_penalty=1.02, max_new_tokens=256)
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/transformers/generation/utils.py", line 1571, in generate
return self.sample(
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/transformers/generation/utils.py", line 2534, in sample
outputs = self(
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/modeling_moss.py", line 674, in forward
transformer_outputs = self.transformer(
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/modeling_moss.py", line 545, in forward
outputs = block(
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/modeling_moss.py", line 270, in forward
attn_outputs = self.attn(
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/modeling_moss.py", line 164, in forward
qkv = self.qkv_proj(hidden_states)
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/quantization.py", line 371, in forward
out = QuantLinearFunction.apply(x.reshape(-1, x.shape[-1]), self.qweight, self.scales,
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/autograd/function.py", line 506, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/torch/cuda/amp/autocast_mode.py", line 106, in decorate_fwd
return fwd(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/quantization.py", line 283, in forward
output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/quantization.py", line 254, in matmul248
matmul_248_kernel[grid](input, qweight, output,
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/custom_autotune.py", line 89, in run
timings = {config: self._bench(*args, config=config, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/custom_autotune.py", line 89, in <dictcomp>
timings = {config: self._bench(*args, config=config, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/custom_autotune.py", line 71, in _bench
return triton.testing.do_bench(kernel_call, rep=40)
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/triton/testing.py", line 143, in do_bench
fn()
File "/root/.cache/huggingface/modules/transformers_modules/fnlp/moss-moon-003-sft-int4/30d8a4be19ce413bb2a5bab4bfb75f125010ec06/custom_autotune.py", line 67, in kernel_call
self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current)
File "<string>", line 41, in matmul_248_kernel
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/triton/compiler.py", line 1588, in compile
so_path = make_stub(name, signature, constants)
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/triton/compiler.py", line 1477, in make_stub
so = _build(name, src_path, tmpdir)
File "/opt/miniconda3/envs/moss/lib/python3.8/site-packages/triton/compiler.py", line 1392, in _build
ret = subprocess.check_call(cc_cmd)
File "/opt/miniconda3/envs/moss/lib/python3.8/subprocess.py", line 364, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmp3o3awxeu/main.c', '-O3', '-I/usr/local/cuda/include', '-I/opt/miniconda3/envs/moss/include/python3.8', '-I/tmp/tmp3o3awxeu', '-shared', '-fPIC', '-lcuda', '-o', '/tmp/tmp3o3awxeu/matmul_248_kernel.cpython-38-x86_64-linux-gnu.so']' returned non-zero exit status 1.