saramcallister closed this issue 2 years ago.
Can you try the nccl test https://github.com/NVIDIA/nccl-tests to make sure there is no problem in the network environment?
Running the NCCL test with ./build/all_reduce_perf -b 8 -e 128M -f -g 4
works without any errors or warnings, so it doesn't seem to be a problem with the networking environment.
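(For reference, a typical way to build and run that test looks like the following; this is a sketch based on the nccl-tests README, and the -f 2 step factor and 4-GPU count are illustrative for this setup:)

  git clone https://github.com/NVIDIA/nccl-tests.git
  cd nccl-tests
  make                                              # add MPI=1 MPI_HOME=... for an MPI build
  ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4    # all-reduce from 8 B to 128 MB across 4 GPUs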
Can you provide the config.pbtxt
you use? Let's try to reproduce the problem with the same configuration and random weights.
Here's the config.pbtxt. It's essentially the provided config.pbtxt, modified to use 4 GPUs. (The config.pbtxt that works for the 2-GPU configuration just has model_checkpoint_path pointing to the 2-GPU checkpoint instead of the 4-GPU one, and 2 for tensor_para_size; the 2-GPU variant is sketched after the config below.)
name: "fastertransformer"
backend: "fastertransformer"
default_model_filename: "gpt3_345M"
max_batch_size: 1024
input [
{
name: "input_ids"
data_type: TYPE_UINT32
dims: [ -1 ]
},
{
name: "input_lengths"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
},
{
name: "request_output_len"
data_type: TYPE_UINT32
dims: [ -1 ]
},
{
name: "runtime_top_k"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "runtime_top_p"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "beam_search_diversity_rate"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "temperature"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "len_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "repetition_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "random_seed"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "is_return_log_probs"
data_type: TYPE_BOOL
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "beam_width"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "start_id"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "end_id"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "stop_words_list"
data_type: TYPE_INT32
dims: [ 2, -1 ]
optional: true
},
{
name: "bad_words_list"
data_type: TYPE_INT32
dims: [ 2, -1 ]
optional: true
}
]
output [
{
name: "output_ids"
data_type: TYPE_UINT32
dims: [ -1, -1 ]
},
{
name: "sequence_length"
data_type: TYPE_UINT32
dims: [ -1 ]
},
{
name: "cum_log_probs"
data_type: TYPE_FP32
dims: [ -1 ]
},
{
name: "output_log_probs"
data_type: TYPE_FP32
dims: [ -1, -1 ]
}
]
instance_group [
{
count: 1
kind : KIND_CPU
}
]
parameters {
key: "tensor_para_size"
value: {
string_value: "4"
}
}
parameters {
key: "pipeline_para_size"
value: {
string_value: "1"
}
}
parameters {
key: "max_seq_len"
value: {
string_value: "528"
}
}
parameters {
key: "data_type"
value: {
string_value: "fp16"
}
}
parameters {
key: "head_num"
value: {
string_value: "16"
}
}
parameters {
key: "size_per_head"
value: {
string_value: "64"
}
}
parameters {
key: "inter_size"
value: {
string_value: "4096"
}
}
parameters {
key: "vocab_size"
value: {
string_value: "50304"
}
}
parameters {
key: "start_id"
value: {
string_value: "50256"
}
}
parameters {
key: "end_id"
value: {
string_value: "50256"
}
}
parameters {
key: "decoder_layers"
value: {
string_value: "24"
}
}
parameters {
key: "model_name"
value: {
string_value: "gpt3_345M"
}
}
parameters {
key: "model_type"
value: {
string_value: "GPT"
}
}
parameters {
key: "model_checkpoint_path"
value: {
string_value: "/workspace/fastertransformer_backend/all_models/gpt/fastertransformer/1/4-gpu"
}
}
parameters {
key: "int8_mode"
value: {
string_value: "0"
}
}
parameters {
key: "enable_custom_all_reduce"
value: {
string_value: "0"
}
}
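(For comparison, the 2-GPU variant mentioned above differs only in these two parameters; this is a sketch, and the 2-gpu checkpoint directory name is an assumption based on the 4-gpu path above:)

  parameters {
    key: "tensor_para_size"
    value: {
      string_value: "2"
    }
  }
  parameters {
    key: "model_checkpoint_path"
    value: {
      # assumed directory name, mirroring the 4-gpu path
      string_value: "/workspace/fastertransformer_backend/all_models/gpt/fastertransformer/1/2-gpu"
    }
  }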
@saramcallister Can you:
1. gdb attach to the triton server process, and then do thread apply all bt.
2. Set NCCL_DEBUG=INFO to generate logs to the screen, and share them with us.
3. Try NCCL_SET_STACK_SIZE=1, which will avoid a CUDA memory reconfiguration on load.
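(For reference, those requests correspond to commands along these lines; the PID placeholder and the server launch line are illustrative, not the exact commands used here:)

  # Attach to the running server and dump every thread's backtrace
  gdb -p <tritonserver_pid> -batch -ex "thread apply all bt" > backtraces.txt

  # Relaunch the server with NCCL debug logging and the smaller NCCL kernel stack size
  NCCL_DEBUG=INFO NCCL_SET_STACK_SIZE=1 tritonserver --model-repository=<model_repo>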
CUDA version: 11.7, driver version: 515.48.07
The most relevant-looking gdb thread backtrace is as follows (full output):
Thread 126 (Thread 0x7fc974ffd000 (LWP 517)):
#0 0x00007ffc165236cb in ?? ()
#1 0x00007ffc16523954 in clock_gettime ()
#2 0x00007fcc7d5850b5 in __GI___clock_gettime (clock_id=4, tp=0x7fc974ff8270) at ../sysdeps/unix/sysv/linux/clock_gettime.c:38
#3 0x00007fcc7b06412f in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#4 0x00007fcc7af5cdcb in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#5 0x00007fcc7b04785c in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#6 0x00007fcc7b27b0d6 in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#7 0x00007fcc7b0d6b99 in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#8 0x00007fcc7d9fac90 in ?? () from /usr/local/cuda/targets/x86_64-linux/lib/libcudart.so.11.0
#9 0x00007fcc7da522e8 in cudaStreamSynchronize () from /usr/local/cuda/targets/x86_64-linux/lib/libcudart.so.11.0
#10 0x00007fcb29f2443e in fastertransformer::ParallelGpt<__half>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*, fastertransformer::ParallelGptWeight<__half> const*) () from /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
#11 0x00007fcb29f64958 in ParallelGptTritonModelInstance<__half>::forward(std::shared_ptr<std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, triton::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, triton::Tensor> > > >) () from /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
#12 0x00007fcbf804340a in triton::backend::fastertransformer_backend::ThreadForward(std::unique_ptr<AbstractTransformerModelInstance, std::default_delete<AbstractTransformerModelInstance> >*, std::shared_ptr<std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, triton::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, triton::Tensor> > > >*, std::shared_ptr<std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, triton::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, triton::Tensor> > > >*, int) () from /opt/tritonserver/backends/fastertransformer/libtriton_fastertransformer.so
#13 0x00007fcc7d8dcde4 in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#14 0x00007fcc7eaea609 in start_thread (arg=<optimized out>) at pthread_create.c:477
#15 0x00007fcc7d5c7133 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Here's the NCCL_DEBUG=INFO output.
The same problem still occurs with NCCL_SET_STACK_SIZE=1; here's that output.
Thanks for the information. It looks like all threads hang in the cudaStreamSynchronize. Can you try commenting out the streamSync here? If you are returning the cum log probs, please also comment out the streamSync here. If it still hangs, please share the backtraces with us again.
I was able to get a different server, and the same setup works for 4 GPUs there. NCCL communication between two of the GPUs on the old server was failing intermittently.
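(For anyone hitting something similar: one way to isolate an intermittently failing GPU pair is to run the same nccl-test on each pair in isolation. This is a sketch; the GPU indices and the -f 2 step factor are illustrative:)

  # A hang or error on one pair points at that link
  CUDA_VISIBLE_DEVICES=0,1 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 2
  CUDA_VISIBLE_DEVICES=0,2 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 2
  CUDA_VISIBLE_DEVICES=0,3 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 2
  CUDA_VISIBLE_DEVICES=1,2 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 2
  CUDA_VISIBLE_DEVICES=1,3 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 2
  CUDA_VISIBLE_DEVICES=2,3 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 2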