triton-inference-server / fastertransformer_backend

BSD 3-Clause "New" or "Revised" License
411 stars 133 forks source link

Ragged Batching on Megatron Fast Transformer Backend #85

Open mshuffett opened 1 year ago

mshuffett commented 1 year ago

I followed the tutorial to deploy NeMo Megatron on Triton, and it was working well. I then wanted to add ragged batching, so I added `allow_ragged_batch: true` to the config file, resulting in the entry for `input_ids` shown below. This caused the model to crash. Is there something else I need to do to enable ragged batching? If this is not supported, is there a recommended approach for achieving something similar to ragged batching?

input {
  name: "input_ids"
  data_type: TYPE_UINT32
  dims: -1
  allow_ragged_batch: true
}

Here's the error:

terminate called after throwing an instance of 'std::runtime_error'
  what():  [FT][ERROR] CUDA runtime error: an illegal memory access was encountered /opt/fastertransformer_backend/build/_deps/repo-ft-src/src/fastertransformer/utils/memory_utils.cu:96

Signal (6) received.
 0# 0x000056459FF0AC19 in tritonserver
 1# 0x00007F7F6B298090 in /usr/lib/x86_64-linux-gnu/libc.so.6
 2# gsignal in /usr/lib/x86_64-linux-gnu/libc.so.6
 3# abort in /usr/lib/x86_64-linux-gnu/libc.so.6
 4# 0x00007F7F6B651911 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 5# 0x00007F7F6B65D38C in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 6# 0x00007F7F6B65D3F7 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 7# 0x00007F7F6B65D6A9 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 8# 0x00007F7EFD57E065 in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
 9# fastertransformer::invokeLengthCriterion(bool*, bool*, int*, unsigned int const*, int, int, int, CUstream_st*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
10# fastertransformer::DynamicDecodeLayer<float>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
11# fastertransformer::ParallelGpt<__nv_bfloat16>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*, fastertransformer::ParallelGptWeight<__nv_bfloat16> const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
12# ParallelGptTritonModelInstance<__nv_bfloat16>::forward(std::shared_ptr<std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, triton::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, triton::Tensor> > > >) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
13# 0x00007F7F6076D08A in /opt/tritonserver/backends/fastertransformer/libtriton_fastertransformer.so
14# 0x00007F7F6B689DE4 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
15# 0x00007F7F6C89E609 in /usr/lib/x86_64-linux-gnu/libpthread.so.0
16# clone in /usr/lib/x86_64-linux-gnu/libc.so.6

Signal (11) received.
 0# 0x000056459FF0AC19 in tritonserver
 1# 0x00007F7F6B298090 in /usr/lib/x86_64-linux-gnu/libc.so.6
 2# abort in /usr/lib/x86_64-linux-gnu/libc.so.6
 3# 0x00007F7F6B651911 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 4# 0x00007F7F6B65D38C in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 5# 0x00007F7F6B65D3F7 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 6# 0x00007F7F6B65D6A9 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 7# 0x00007F7EFD57E065 in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
 8# fastertransformer::invokeLengthCriterion(bool*, bool*, int*, unsigned int const*, int, int, int, CUstream_st*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
 9# fastertransformer::DynamicDecodeLayer<float>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
10# fastertransformer::ParallelGpt<__nv_bfloat16>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*, fastertransformer::ParallelGptWeight<__nv_bfloat16> const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
11# ParallelGptTritonModelInstance<__nv_bfloat16>::forward(std::shared_ptr<std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, triton::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, triton::Tensor> > > >) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
12# 0x00007F7F6076D08A in /opt/tritonserver/backends/fastertransformer/libtriton_fastertransformer.so
13# 0x00007F7F6B689DE4 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
14# 0x00007F7F6C89E609 in /usr/lib/x86_64-linux-gnu/libpthread.so.0
15# clone in /usr/lib/x86_64-linux-gnu/libc.so.6
PerkzZheng commented 1 year ago

Have you added the following parameters? Alternatively, you can share your config.pbtxt. You can also take a look at the config example.

batch_input [
  {
    kind: BATCH_ITEM_SHAPE
    target_name: "input_ids_item_shape"
    data_type: TYPE_INT32
    source_input: "input_ids"
  }
]
mshuffett commented 1 year ago

I'll double check thanks

mshuffett commented 1 year ago

@PerkzZheng after adding that I am getting a different error:

terminate called after throwing an instance of 'std::runtime_error'
  what():  [FT][ERROR] CUDA runtime error: an illegal memory access was encountered /opt/fastertransformer_backend/build/_deps/repo-ft-src/src/fastertransformer/utils/memory_utils.cu:96

Signal (6) received.
 0# 0x0000558D09030C19 in tritonserver
 1# 0x00007FA6B2275090 in /usr/lib/x86_64-linux-gnu/libc.so.6
 2# gsignal in /usr/lib/x86_64-linux-gnu/libc.so.6
 3# abort in /usr/lib/x86_64-linux-gnu/libc.so.6
 4# 0x00007FA6B262E911 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 5# 0x00007FA6B263A38C in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 6# 0x00007FA6B263A3F7 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 7# 0x00007FA6B263A6A9 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 8# 0x00007FA63557E065 in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
 9# fastertransformer::invokeLengthCriterion(bool*, bool*, int*, unsigned int const*, int, int, int, CUstream_st*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
10# fastertransformer::DynamicDecodeLayer<float>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
11# fastertransformer::ParallelGpt<__nv_bfloat16>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*, fastertransformer::ParallelGptWeight<__nv_bfloat16> const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
12# ParallelGptTritonModelInstance<__nv_bfloat16>::forward(std::shared_ptr<std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, triton::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, triton::Tensor> > > >) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
13# 0x00007FA6A009F08A in /opt/tritonserver/backends/fastertransformer/libtriton_fastertransformer.so
14# 0x00007FA6B2666DE4 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
15# 0x00007FA6B387B609 in /usr/lib/x86_64-linux-gnu/libpthread.so.0
16# clone in /usr/lib/x86_64-linux-gnu/libc.so.6

Signal (11) received.
 0# 0x0000558D09030C19 in tritonserver
 1# 0x00007FA6B2275090 in /usr/lib/x86_64-linux-gnu/libc.so.6
 2# abort in /usr/lib/x86_64-linux-gnu/libc.so.6
 3# 0x00007FA6B262E911 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 4# 0x00007FA6B263A38C in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 5# 0x00007FA6B263A3F7 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 6# 0x00007FA6B263A6A9 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
 7# 0x00007FA63557E065 in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
 8# fastertransformer::invokeLengthCriterion(bool*, bool*, int*, unsigned int const*, int, int, int, CUstream_st*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
 9# fastertransformer::DynamicDecodeLayer<float>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
10# fastertransformer::ParallelGpt<__nv_bfloat16>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*, fastertransformer::ParallelGptWeight<__nv_bfloat16> const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
11# ParallelGptTritonModelInstance<__nv_bfloat16>::forward(std::shared_ptr<std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, triton::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, triton::Tensor> > > >) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
12# 0x00007FA6A009F08A in /opt/tritonserver/backends/fastertransformer/libtriton_fastertransformer.so
13# 0x00007FA6B2666DE4 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
14# 0x00007FA6B387B609 in /usr/lib/x86_64-linux-gnu/libpthread.so.0
15# clone in /usr/lib/x86_64-linux-gnu/libc.so.6

Here is the full config:

name: "gpt3_1.3b"
max_batch_size: 256

batch_input [
  {
    kind: BATCH_ITEM_SHAPE
    target_name: "input_ids_item_shape"
    data_type: TYPE_INT32
    source_input: "input_ids"
  }
]

input {
  name: "input_ids"
  data_type: TYPE_UINT32
  dims: -1
  allow_ragged_batch: true

}
input {
  name: "input_lengths"
  data_type: TYPE_UINT32
  dims: 1
  reshape {
  }
}
input {
  name: "request_output_len"
  data_type: TYPE_UINT32
  dims: -1
}
input {
  name: "runtime_top_k"
  data_type: TYPE_UINT32
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "runtime_top_p"
  data_type: TYPE_FP32
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "beam_search_diversity_rate"
  data_type: TYPE_FP32
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "temperature"
  data_type: TYPE_FP32
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "len_penalty"
  data_type: TYPE_FP32
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "repetition_penalty"
  data_type: TYPE_FP32
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "random_seed"
  data_type: TYPE_UINT64
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "is_return_log_probs"
  data_type: TYPE_BOOL
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "beam_width"
  data_type: TYPE_UINT32
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "start_id"
  data_type: TYPE_UINT32
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "end_id"
  data_type: TYPE_UINT32
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "stop_words_list"
  data_type: TYPE_INT32
  dims: 2
  dims: -1
  optional: true
}
input {
  name: "bad_words_list"
  data_type: TYPE_INT32
  dims: 2
  dims: -1
  optional: true
}
input {
  name: "prompt_learning_task_name_ids"
  data_type: TYPE_UINT32
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "request_prompt_embedding"
  data_type: TYPE_FP16
  dims: -1
  dims: -1
  optional: true
}
  data_type: TYPE_UINT32
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "stop_words_list"
  data_type: TYPE_INT32
  dims: 2
  dims: -1
  optional: true
}
input {
  name: "bad_words_list"
  data_type: TYPE_INT32
  dims: 2
  dims: -1
  optional: true
}
input {
  name: "prompt_learning_task_name_ids"
  data_type: TYPE_UINT32
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "request_prompt_embedding"
  data_type: TYPE_FP16
  dims: -1
  dims: -1
  optional: true
}
input {
  name: "request_prompt_lengths"
  data_type: TYPE_UINT32
  dims: 1
  reshape {
  }
  optional: true
}
input {
  name: "request_prompt_type"
  data_type: TYPE_UINT32
  dims: 1
  reshape {
  }
  optional: true
}
output {
  name: "output_ids"
  data_type: TYPE_UINT32
  dims: -1
  dims: -1
}
output {
  name: "sequence_length"
  data_type: TYPE_UINT32
  dims: -1
}
output {
  name: "cum_log_probs"
  data_type: TYPE_FP32
  dims: -1
}
output {
  name: "output_log_probs"
  data_type: TYPE_FP32
  dims: -1
  dims: -1
}
instance_group {
  count: 1
  kind: KIND_CPU
}
default_model_filename: "1-gpu"
parameters {
  key: "data_type"
  value {
    string_value: "bf16"
  }
}
parameters {
  key: "enable_custom_all_reduce"
  value {
    string_value: "0"
  }
}
parameters {
  key: "int8_mode"
  value {
    string_value: "0"
  }
}
parameters {
  key: "model_checkpoint_path"
  value {
    string_value: "/model_repository/gpt3_1.3b/1-gpu"
  }
}
parameters {
  key: "model_type"
  value {
    string_value: "GPT"
  }
}
parameters {
  key: "pipeline_para_size"
  value {
    string_value: "1"
  }
}
parameters {
  key: "tensor_para_size"
  value {
    string_value: "1"
  }
}
backend: "fastertransformer"
model_transaction_policy {
}
dynamic_batching {
    max_queue_delay_microseconds: 50000
}
PerkzZheng commented 1 year ago

It looks like you are not using the latest FT. Try the latest main branch (v5.3), and set FT_DEBUG_LEVEL=DEBUG when running it again.