> File "/opt/conda/lib/python3.9/site-packages/text_generation_server/interceptor.py", line 20, in intercept
return await response
File "/opt/conda/lib/python3.9/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py", line 82, in _unary_interceptor
raise error
File "/opt/conda/lib/python3.9/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py", line 73, in _unary_interceptor
return await behavior(request_or_iterator, context)
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/server.py", line 60, in Warmup
self.model.warmup(batch, request.max_total_tokens)
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/flash_causal_lm.py", line 735, in warmup
raise e
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/flash_causal_lm.py", line 728, in warmup
_, batch = self.generate_token(batch)
File "/opt/conda/lib/python3.9/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/flash_causal_lm.py", line 795, in generate_token
raise e
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/flash_causal_lm.py", line 782, in generate_token
out = self.forward(
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/flash_causal_lm.py", line 758, in forward
return self.model.forward(
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_llama_modeling.py", line 385, in forward
hidden_states = self.model(
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_llama_modeling.py", line 344, in forward
hidden_states, residual = layer(
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_llama_modeling.py", line 271, in forward
attn_output = self.self_attn(
File "/opt/conda/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_llama_modeling.py", line 146, in forward
qkv = qkv.view(-1, 3, self.num_heads, self.head_size)
RuntimeError: shape '[-1, 3, 6, 128]' is invalid for input of size 6359808
rank=0
2023-07-10T00:50:36.698059Z ERROR warmup{max_input_length=500 max_prefill_tokens=2548 max_total_tokens=50960}:warmup{max_input_length=500 max_prefill_tokens=2548 max_total_tokens=50960}: text_generation_client: router/client/src/lib.rs:33: Server error: shape '[-1, 3, 6, 128]' is invalid for input of size 6359808
thread 'main' panicked at 'Unable to warmup model: Generation("shape '[-1, 3, 6, 128]' is invalid for input of size 6359808")', router/src/main.rs:216:18
```
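The failing call is the `qkv.view(-1, 3, self.num_heads, self.head_size)` reshape in the traceback above. A quick check of the numbers (a sketch using only values from the log, not code from the report) shows why it cannot succeed: with max_prefill_tokens=2548, the tensor holds 2496 elements per token, which is 6.5 heads of size 128 per shard. That is consistent with a 52-head checkpoint split 8 ways; 52 divides evenly by 4 but not by 8, which would also explain why the 4-GPU config works. This is an inference from the arithmetic, not something confirmed in the report.

```python
# Sanity check of the shape error, using only numbers from the log above.
numel = 6359808          # "input of size 6359808" from the RuntimeError
tokens = 2548            # max_prefill_tokens from the warmup span
head_size = 128          # from the failing view shape [-1, 3, 6, 128]
num_heads = 6            # per-shard head count from the same shape

per_token = numel // tokens                 # 2496 elements per token
print(per_token / (3 * head_size))          # 6.5 -> fractional heads per shard
print(6.5 * 8)                              # 52.0 -> plausible total head count
print(numel % (3 * num_heads * head_size))  # 768 != 0, so view() must raise
```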
Expected behavior

Running on a 4 GPU config on the same machine works, i.e., either

Or

works. I have tested that it works with the following:

```
curl http://127.0.0.1:8080/generate -X POST -H 'Content-Type: application/json' -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":2048}}'
```
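For reference, the same request can be issued from Python rather than curl; a minimal sketch, assuming a TGI server listening on the same address and port as in the curl command above:

```python
import requests

# Same generate request as the curl above, against a local TGI instance.
resp = requests.post(
    "http://127.0.0.1:8080/generate",
    json={
        "inputs": "What is Deep Learning?",
        "parameters": {"max_new_tokens": 2048},
    },
    timeout=600,  # generating 2048 tokens can take a while
)
resp.raise_for_status()
print(resp.json()["generated_text"])
```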