OpenMOSS / MOSS

An open-source tool-augmented conversational language model from Fudan University
https://txsun1997.github.io/blogs/moss.html
Apache License 2.0

int4 quantized model: error after running for a while #325

Open · davefork opened this issue 1 year ago

davefork commented 1 year ago

OS: Ubuntu
GPU: V100

```
ERROR: Exception in ASGI application
Traceback (most recent call last):
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/uvicorn/protocols/websockets/websockets_impl.py", line 254, in run_asgi
    result = await self.app(self.scope, self.asgi_receive, self.asgi_send)
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/uvicorn/middleware/proxy_headers.py", line 78, in __call__
    return await self.app(scope, receive, send)
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/fastapi/applications.py", line 276, in __call__
    await super().__call__(scope, receive, send)
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/starlette/applications.py", line 122, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/starlette/middleware/errors.py", line 149, in __call__
    await self.app(scope, receive, send)
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/starlette/middleware/exceptions.py", line 79, in __call__
    raise exc
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/starlette/middleware/exceptions.py", line 68, in __call__
    await self.app(scope, receive, sender)
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/fastapi/middleware/asyncexitstack.py", line 21, in __call__
    raise e
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/fastapi/middleware/asyncexitstack.py", line 18, in __call__
    await self.app(scope, receive, send)
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/starlette/routing.py", line 718, in __call__
    await route.handle(scope, receive, send)
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/starlette/routing.py", line 341, in handle
    await self.app(scope, receive, send)
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/starlette/routing.py", line 82, in app
    await func(session)
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/fastapi/routing.py", line 289, in app
    await dependant.call(**values)
  File "/app/api.py", line 51, in stream_chat
    for response in infer.stream_forward(messages, paras):
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 43, in generator_context
    response = gen.send(None)
  File "/app/moss_inference_extend.py", line 18, in stream_forward
    for output_ids in self.stream_sample(
  File "/app/moss_inference_extend.py", line 62, in stream_sample
    logits, past_key_values = self.infer(input_ids if i == 0 else new_generated_id, attention_mask, past_key_values)
  File "/app/moss_inference.py", line 338, in infer
    outputs: BaseModelOutputWithPast = self.model(**inputs)
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/app/models/modeling_moss.py", line 678, in forward
    transformer_outputs = self.transformer(
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/app/models/modeling_moss.py", line 545, in forward
    outputs = block(
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/app/models/modeling_moss.py", line 270, in forward
    attn_outputs = self.attn(
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/app/models/modeling_moss.py", line 164, in forward
    qkv = self.qkv_proj(hidden_states)
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/app/models/quantization.py", line 367, in forward
    out = QuantLinearFunction.apply(x.reshape(-1, x.shape[-1]), self.qweight, self.scales,
  File "/opt/conda/envs/py38/lib/python3.8/site-packages/torch/cuda/amp/autocast_mode.py", line 105, in decorate_fwd
    return fwd(*args, **kwargs)
  File "/app/models/quantization.py", line 279, in forward
    output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
  File "/app/models/quantization.py", line 250, in matmul248
    matmul_248_kernel[grid](input, qweight, output,
  File "/app/models/custom_autotune.py", line 109, in run
    return self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs)
  File "<string>", line 23, in matmul_248_kernel
RuntimeError: Triton Error [CUDA]: device-side assert triggered
```
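
Note for anyone debugging the same error: CUDA device-side asserts are reported asynchronously, so the `matmul_248_kernel` frame in `quantization.py` is where the failure surfaces, not necessarily where it originates. A minimal sketch of how one might relaunch the service with synchronous kernel launches to get a more precise error location (this launcher is hypothetical, not part of the MOSS repo; `"api:app"`, host, and port are assumptions based on the `/app/api.py` entry point in the traceback and must match the actual deployment):

```python
# Hypothetical debug launcher: forcing synchronous CUDA kernel launches makes
# a device-side assert point at the operation that actually triggered it.
import os

# Must be set before torch initializes CUDA, i.e. before importing the app.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import uvicorn

if __name__ == "__main__":
    # Assumes the FastAPI instance in /app/api.py is exposed as `app`;
    # adjust the import string, host, and port to the real service config.
    uvicorn.run("api:app", host="0.0.0.0", port=8000)
```

With blocking launches, the new traceback usually ends at the operation that tripped the assert (often an out-of-range index), which helps tell apart a problem in the int4 quantization kernels from one in the generation loop itself.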