triton-inference-server / fastertransformer_backend

BSD 3-Clause "New" or "Revised" License
411 stars 133 forks source link

Poll failed for model directory 'ensemble': output 'OUTPUT_0' for ensemble 'ensemble' is not written #144

Open songkq opened 1 year ago

songkq commented 1 year ago

Hi, when I ensemble a fastertransformer_backend GPT model, it loaded the ensemble model failed with the error when starting the server. Could you please give some advice? Thanks.

CUDA_VISIBLE_DEVICES="0,1"  /opt/tritonserver/bin/tritonserver --model-store=fastertransformer_backend/all_models/nemo-megatron-gpt-5B > log 2>&1 &

...
E0613 07:51:38.390571 142822 model_repository_manager.cc:1002] Poll failed for model directory 'ensemble': output 'OUTPUT_0' for ensemble 'ensemble' is not written
...

I0613 07:51:48.643366 142822 server.cc:264] Waiting for in-flight requests to complete.
I0613 07:51:48.643407 142822 server.cc:280] Timeout 30: Found 0 model versions that have in-flight inferences
I0613 07:51:48.643479 142822 server.cc:295] All models are stopped, unloading models
I0613 07:51:48.643495 142822 server.cc:302] Timeout 30: Found 1 live models and 0 in-flight non-inference requests
I0613 07:51:48.643572 142822 libfastertransformer.cc:1965] TRITONBACKEND_ModelInstanceFinalize: delete instance state
I0613 07:51:48.643651 142822 libfastertransformer.cc:1899] TRITONBACKEND_ModelFinalize: delete model state
I0613 07:51:48.643688 142822 libfastertransformer.cc:1904] TRITONBACKEND_ModelFinalize: MPI Finalize
I0613 07:51:48.730007 142822 model_lifecycle.cc:579] successfully unloaded 'fastertransformer' version 1
I0613 07:51:49.643671 142822 server.cc:302] Timeout 29: Found 0 live models and 0 in-flight non-inference requests
error: creating server: Internal - failed to load all models

Reproduce: model weight: https://huggingface.co/nvidia/nemo-megatron-gpt-5B ensemble example: https://github.com/triton-inference-server/fastertransformer_backend/tree/main/all_models/gpt

In my case, I only ensemble the fastertransformer model. My model resposity fastertransformer_backend/all_models/nemo-megatron-gpt-5B includes ensemble and fastertransformer folders.

# fastertransformer_backend/all_models/nemo-megatron-gpt-5B/ensemble/config.pbtxt
name: "ensemble"
platform: "ensemble"
max_batch_size: 1024
input [
  {
    name: "INPUT_0"
    data_type: TYPE_UINT32
    dims: [ -1 ]
    allow_ragged_batch: true
  },
  {
    name: "INPUT_1"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  },
  {
   name: "INPUT_2"
   data_type: TYPE_INT32
   dims: [ -1 ]
  },
  {
   name: "INPUT_3"
   data_type: TYPE_INT32
   dims: [ -1 ]
  },
  {
    name: "runtime_top_k"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "runtime_top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "beam_search_diversity_rate"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "len_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  {
    name: "is_return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "start_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "end_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "prompt_learning_task_name_ids"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "request_prompt_embedding"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
  },
  {
    name: "request_prompt_lengths"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "request_prompt_type"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p_decay"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p_min"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p_reset_ids"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  }
]
output [
  {
    name: "OUTPUT_0"
    data_type: TYPE_UINT32
    dims: [ -1, -1 ]
  },
  {
    name: "sequence_length"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  },
  {
    name: "response_input_lengths"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "cum_log_probs"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "output_log_probs"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  }
]
ensemble_scheduling {
  step [
    {
      model_name: "fastertransformer"
      model_version: -1
      input_map {
        key: "input_ids"
        value: "INPUT_0"
      }
      input_map {
        key: "input_lengths"
        value: "input_lengths"
      }
      input_map {
        key: "request_output_len"
        value: "INPUT_1"
      }
      input_map {
        key: "prompt_learning_task_name_ids"
        value: "prompt_learning_task_name_ids"
      }
      input_map {
        key: "request_prompt_embedding"
        value: "request_prompt_embedding"
      }
      input_map {
        key: "request_prompt_lengths"
        value: "request_prompt_lengths"
      }
      input_map {
        key: "request_prompt_type"
        value: "request_prompt_type"
      }
      input_map {
          key: "runtime_top_k"
          value: "runtime_top_k"
      }
      input_map {
          key: "runtime_top_p"
          value: "runtime_top_p"
      }
      input_map {
          key: "beam_search_diversity_rate"
          value: "beam_search_diversity_rate"
      }
      input_map {
          key: "temperature"
          value: "temperature"
      }
      input_map {
          key: "len_penalty"
          value: "len_penalty"
      }
      input_map {
          key: "repetition_penalty"
          value: "repetition_penalty"
      }
      input_map {
          key: "random_seed"
          value: "random_seed"
      }
      input_map {
          key: "is_return_log_probs"
          value: "is_return_log_probs"
      }
      input_map {
          key: "beam_width"
          value: "beam_width"
      }
      input_map {
          key: "start_id"
          value: "start_id"
      }
      input_map {
          key: "end_id"
          value: "end_id"
      }
      input_map {
          key: "stop_words_list"
          value: "INPUT_2"
      }
      input_map {
          key: "bad_words_list"
          value: "INPUT_3"
      }
      input_map {
        key: "top_p_decay"
        value: "top_p_decay"
      }
      input_map {
        key: "top_p_min"
        value: "top_p_min"
      }
      input_map {
        key: "top_p_reset_ids"
        value: "top_p_reset_ids"
      }
      output_map {
        key: "output_ids"
        value: "OUTPUT_0"
      }
      output_map {
        key: "sequence_length"
        value: "sequence_length"
      }
      output_map {
        key: "response_input_lengths"
        value: "response_input_lengths"
      }
      output_map {
        key: "cum_log_probs"
        value: "cum_log_probs"
      }
      output_map {
        key: "output_log_probs"
        value: "output_log_probs"
      }
    }
  ]
}

# fastertransformer_backend/all_models/nemo-megatron-gpt-5B/fastertransformer/config.pbtxt

name: "fastertransformer"
backend: "fastertransformer"
default_model_filename: "gpt3_345M"
max_batch_size: 1024

model_transaction_policy {
  decoupled: False
}

dynamic_batching {
   max_queue_delay_microseconds: 50000
}

batch_input [
  {
    kind: BATCH_ITEM_SHAPE
    target_name: "input_ids_item_shape"
    data_type: TYPE_INT32
    source_input: "input_ids"
  }
]

input [
  {
    name: "input_ids"
    data_type: TYPE_UINT32
    dims: [ -1 ]
    allow_ragged_batch: true
  },
  {
    name: "input_lengths"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
  },
  {
    name: "request_output_len"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  },
  {
    name: "runtime_top_k"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_search_diversity_rate"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "len_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "is_return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "is_return_context_embeddings"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "start_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "end_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "stop_words_list"
    data_type: TYPE_INT32
    dims: [ 2, -1 ]
    optional: true
  },
  {
    name: "bad_words_list"
    data_type: TYPE_INT32
    dims: [ 2, -1 ]
    optional: true
  },
  {
    name: "prompt_learning_task_name_ids"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "request_prompt_embedding"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
    allow_ragged_batch: false
  },
  {
    name: "request_prompt_lengths"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "request_prompt_type"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "top_p_decay"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "top_p_min"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "top_p_reset_ids"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  }
]
output [
  {
    name: "output_ids"
    data_type: TYPE_UINT32
    dims: [ -1, -1 ]
  },
  {
    name: "sequence_length"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  },
  {
    name: "response_input_lengths"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "cum_log_probs"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "output_log_probs"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "context_embeddings"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  }
]
instance_group [
  {
    count: 1
    kind : KIND_CPU
  }
]
parameters {
  key: "tensor_para_size"
  value: {
    string_value: "2"
  }
}
parameters {
  key: "pipeline_para_size"
  value: {
    string_value: "1"
  }
}
parameters {
  key: "data_type"
  value: {
    string_value: "fp16"
  }
}
parameters {
  key: "model_type"
  value: {
    string_value: "GPT"
  }
}
parameters {
  key: "model_checkpoint_path"
  value: {
    string_value: "models/nemo-megatron-gpt-5B/2-gpu/"
  }
}
parameters {
  key: "int8_mode"
  value: {
    string_value: "0"
  }
}
parameters {
  key: "enable_custom_all_reduce"
  value: {
    string_value: "0"
  }
}