When adjusting the max-input-length and max-total-tokens parameters, the server launches successfully, but errors are reported during the CUDA graph warmup phase:
2024-04-02T17:31:23.524470Z ERROR text_generation_launcher: Decode cuda graph warmup failed
Traceback (most recent call last):
File "/opt/conda/bin/text-generation-server", line 8, in <module>
sys.exit(app())
File "/opt/conda/lib/python3.10/site-packages/typer/main.py", line 311, in __call__
return get_command(self)(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/typer/core.py", line 778, in main
return _main(
File "/opt/conda/lib/python3.10/site-packages/typer/core.py", line 216, in _main
rv = self.invoke(ctx)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1688, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/typer/main.py", line 683, in wrapper
return callback(**use_params) # type: ignore
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/cli.py", line 89, in serve
server.serve(
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/server.py", line 235, in serve
asyncio.run(
File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 636, in run_until_complete
self.run_forever()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
self._run_once()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
handle._run()
File "/opt/conda/lib/python3.10/asyncio/events.py", line 80, in _run
self._context.run(self._callback, *self._args)
File "/opt/conda/lib/python3.10/site-packages/grpc_interceptor/server.py", line 165, in invoke_intercept_method
return await self.intercept(
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/interceptor.py", line 21, in intercept
return await response
File "/opt/conda/lib/python3.10/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py", line 73, in _unary_interceptor
return await behavior(request_or_iterator, context)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/server.py", line 95, in Warmup
max_supported_total_tokens = self.model.warmup(batch)
> File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/flash_causal_lm.py", line 807, in warmup
self.cuda_graph_warmup(bs, max_s, max_bt)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/flash_causal_lm.py", line 715, in cuda_graph_warmup
self.model.forward(
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_llama_modeling.py", line 431, in forward
hidden_states = self.model(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_llama_modeling.py", line 390, in forward
hidden_states, residual = layer(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_llama_modeling.py", line 317, in forward
attn_output = self.self_attn(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/custom_modeling/flash_llama_modeling.py", line 242, in forward
return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/utils/layers.py", line 592, in forward
out = super().forward(input)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/utils/layers.py", line 380, in forward
return self.linear.forward(x)
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/utils/layers.py", line 166, in forward
return F.linear(input, self.weight, self.bias)
RuntimeError: CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
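Because CUDA reports kernel failures asynchronously, the frame blamed above (F.linear in layers.py) is not necessarily the call that actually failed. As the message itself suggests, one way to pin down the faulting kernel is to relaunch with synchronous launches enabled. A minimal sketch, assuming the launcher invocation from the reproduction steps below; the flag values are illustrative, not the reporter's exact configuration:

# Debugging only: synchronize every CUDA kernel launch so the Python
# traceback points at the real faulting call (this slows inference down).
CUDA_LAUNCH_BLOCKING=1 text-generation-launcher \
    --model-id /model --num-shard 4 \
    --max-input-length 8192 --max-total-tokens 16384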
System Info
Information
Tasks
Reproduction
Run a docker container with four H100s, assuming the codellama/CodeLlama-70b-hf model is present on the local system at /model:

Launch a shell into the container:

From the shell in the container, launch TGI successfully with this command:

Observe that when adjusting the max-input-length and max-total-tokens parameters, the server still launches successfully, but errors are reported during the CUDA graph warmup phase. The error is the traceback shown above; a hedged sketch of the elided commands follows below.
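The exact docker and launcher commands were not captured above, so the following is a minimal sketch of what the steps might look like. The image tag, container name, port mapping, and parameter values are assumptions for illustration; the flags themselves (--model-id, --num-shard, --max-input-length, --max-total-tokens) are standard text-generation-launcher options.

# Hypothetical reconstruction of the reproduction steps; names and values are assumed.

# 1. Start a container on all four H100s with the model directory mounted
#    (entrypoint overridden so TGI can be launched manually from a shell).
docker run -d --name tgi --gpus all --shm-size 1g \
    -v /model:/model -p 8080:80 \
    --entrypoint sleep \
    ghcr.io/huggingface/text-generation-inference:2.0 infinity

# 2. Launch a shell into the running container.
docker exec -it tgi /bin/bash

# 3. From the shell, launch TGI; with the default context limits this works.
text-generation-launcher --model-id /model --num-shard 4

# 4. Relaunch with larger context limits; the server starts, but the
#    CUDA graph warmup then fails with the error shown above.
text-generation-launcher --model-id /model --num-shard 4 \
    --max-input-length 8192 --max-total-tokens 16384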
Expected behavior
The warmup phase should complete without reporting any errors.
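Since the failure occurs inside cuda_graph_warmup, a quick way to check whether it is isolated to graph capture might be to disable CUDA graphs and see whether the server warms up and serves normally. Depending on the TGI version this is exposed via the CUDA_GRAPHS environment variable or the --cuda-graphs launcher flag, so treat this as a hedged sketch rather than a confirmed workaround, and verify the exact spelling against text-generation-launcher --help:

# Hedged workaround check: disable CUDA graph capture entirely.
# The flag name and value are assumptions; verify against --help.
text-generation-launcher --model-id /model --num-shard 4 \
    --max-input-length 8192 --max-total-tokens 16384 \
    --cuda-graphs 0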