zhangjun opened this issue 1 year ago
/// Create a new inference request object.
///
/// \param inference_request Returns the new request object.
/// \param server The inference server object.
/// \param model_name The name of the model to use for the request.
/// \param model_version The version of the model to use for the
/// request. If -1 then the server will choose a version based on the
/// model's policy.
/// \return a TRITONSERVER_Error indicating success or failure.
struct TRITONSERVER_Error*
TRITONSERVER_InferenceRequestNew(
struct TRITONSERVER_InferenceRequest** inference_request,
struct TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version);
/// Delete an inference request object.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
struct TRITONSERVER_Error*
TRITONSERVER_InferenceRequestDelete(
struct TRITONSERVER_InferenceRequest* inference_request);
/// Get the ID for a request. The returned ID is owned by
/// 'inference_request' and must not be modified or freed by the
/// caller.
///
/// \param inference_request The request object.
/// \param id Returns the ID.
/// \return a TRITONSERVER_Error indicating success or failure.
struct TRITONSERVER_Error*
TRITONSERVER_InferenceRequestId(
struct TRITONSERVER_InferenceRequest* inference_request, const char** id);
/// Set the ID for a request.
///
/// \param inference_request The request object.
/// \param id The ID.
/// \return a TRITONSERVER_Error indicating success or failure.
struct TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetId(
struct TRITONSERVER_InferenceRequest* inference_request, const char* id);
/// Set the priority for a request. The default is 0 indicating that
/// the request does not specify a priority and so will use the
/// model's default priority.
///
/// \param inference_request The request object.
/// \param priority The priority level.
/// \return a TRITONSERVER_Error indicating success or failure.
struct TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetPriorityUInt64(
struct TRITONSERVER_InferenceRequest* inference_request, uint64_t priority);
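Taken together these calls cover request construction. Below is a minimal usage sketch, assuming 'server' is a live TRITONSERVER_Server* created elsewhere (server creation is not quoted in this issue) and that TRITONSERVER_ErrorDelete from the same header is available; error returns from the setters are ignored for brevity and the helper name is hypothetical.

#include <cstdint>

#include "tritonserver.h"  // install path may differ, e.g. triton/core/tritonserver.h

// Hypothetical helper: create a request for a model, let the server pick the
// version (-1), and tag it with an ID and a non-default priority
// (0 would mean "use the model's default priority").
TRITONSERVER_InferenceRequest*
MakeTaggedRequest(TRITONSERVER_Server* server, const char* model_name)
{
  TRITONSERVER_InferenceRequest* request = nullptr;
  TRITONSERVER_Error* err = TRITONSERVER_InferenceRequestNew(
      &request, server, model_name, -1 /* model_version */);
  if (err != nullptr) {
    TRITONSERVER_ErrorDelete(err);  // real code would log the error first
    return nullptr;
  }
  TRITONSERVER_InferenceRequestSetId(request, "request-0");
  TRITONSERVER_InferenceRequestSetPriorityUInt64(request, 1);
  return request;
}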
/// Get the timeout for a request, in microseconds. The default is 0
/// which indicates that the request has no timeout.
///
/// \param inference_request The request object.
/// \param timeout_us Returns the timeout, in microseconds.
/// \return a TRITONSERVER_Error indicating success or failure.
struct TRITONSERVER_Error*
TRITONSERVER_InferenceRequestTimeoutMicroseconds(
struct TRITONSERVER_InferenceRequest* inference_request,
uint64_t* timeout_us);
/// Get the model used to produce a response. The caller does not own the
/// returned model name value and must not modify or delete it. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// \param inference_response The response object.
/// \param model_name Returns the name of the model.
/// \param model_version Returns the version of the model used to produce
/// this response.
/// \return a TRITONSERVER_Error indicating success or failure.
struct TRITONSERVER_Error*
TRITONSERVER_InferenceResponseModel(
struct TRITONSERVER_InferenceResponse* inference_response,
const char** model_name, int64_t* model_version);
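And the read side: a sketch that queries a request's ID and timeout and asks which model produced a response, for example from inside a response-complete callback. Only the accessors declared above are used; 'request' and 'response' are assumed to be valid handles and error returns are again ignored for brevity.

#include <cinttypes>
#include <cstdint>
#include <cstdio>

#include "tritonserver.h"

void
DumpRequestAndResponseInfo(
    TRITONSERVER_InferenceRequest* request,
    TRITONSERVER_InferenceResponse* response)
{
  // The returned ID is owned by the request and must not be freed.
  const char* id = nullptr;
  TRITONSERVER_InferenceRequestId(request, &id);

  uint64_t timeout_us = 0;  // 0 means the request has no timeout
  TRITONSERVER_InferenceRequestTimeoutMicroseconds(request, &timeout_us);

  // 'model_name' remains valid until the response is deleted.
  const char* model_name = nullptr;
  int64_t model_version = 0;
  TRITONSERVER_InferenceResponseModel(response, &model_name, &model_version);

  std::printf(
      "request '%s' (timeout %" PRIu64 " us) served by %s version %" PRId64 "\n",
      id, timeout_us, model_name, model_version);
}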
class TritonServerOptions {
private:
std::string server_id_;
std::set<std::string> repo_paths_;
tc::ModelControlMode model_control_mode_;
std::set<std::string> models_;
bool exit_on_error_;
bool strict_model_config_;
bool strict_readiness_;
tc::RateLimitMode rate_limit_mode_;
tc::RateLimiter::ResourceMap rate_limit_resource_map_;
bool metrics_;
bool gpu_metrics_;
bool cpu_metrics_;
uint64_t metrics_interval_;
unsigned int exit_timeout_;
uint64_t pinned_memory_pool_size_;
unsigned int buffer_manager_thread_count_;
unsigned int model_load_thread_count_;
bool enable_model_namespacing_;
std::map<int, uint64_t> cuda_memory_pool_size_;
double min_compute_capability_;
std::string backend_dir_;
std::string repoagent_dir_;
std::string cache_dir_;
tc::CacheConfigMap cache_config_map_;
triton::common::BackendCmdlineConfigMap backend_cmdline_config_map_;
triton::common::HostPolicyCmdlineConfigMap host_policy_map_;
#ifdef TRITON_ENABLE_METRICS
tc::MetricsConfigMap metrics_config_map_;
#endif // TRITON_ENABLE_METRICS
};
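TritonServerOptions is the in-process mirror of what a caller configures through the TRITONSERVER_ServerOptions* C API before creating the server. As a rough sketch of that correspondence, the following populates a few of the fields above (repo_paths_, backend_dir_, strict_model_config_); the setter functions come from the wider tritonserver.h API and are not quoted in this issue, and the helper name is hypothetical.

#include "tritonserver.h"

TRITONSERVER_Server*
CreateServer()
{
  // Error returns are ignored for brevity; real code must check and delete
  // each TRITONSERVER_Error*.
  TRITONSERVER_ServerOptions* options = nullptr;
  TRITONSERVER_ServerOptionsNew(&options);
  TRITONSERVER_ServerOptionsSetModelRepositoryPath(options, "/models");
  TRITONSERVER_ServerOptionsSetBackendDirectory(
      options, "/opt/tritonserver/backends");
  TRITONSERVER_ServerOptionsSetStrictModelConfig(options, true);

  TRITONSERVER_Server* server = nullptr;
  TRITONSERVER_ServerNew(&server, options);

  // The options object is no longer needed once the server exists.
  TRITONSERVER_ServerOptionsDelete(options);
  return server;
}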
InferenceServer
// Inference server information.
class InferenceServer {
public:
// Construct an inference server.
InferenceServer();
~InferenceServer();
// Initialize the server. Returns Status::Success on success, an error
// status otherwise.
Status Init();
// Stop the server. Returns an error status if the exit timeout expires
// before all models are unloaded. If 'force' is true, attempt to stop the
// server even if it is not in a ready state.
Status Stop(const bool force = false);
// Check the model repository for changes and update server state
// based on those changes.
Status PollModelRepository();
// Server health
Status IsLive(bool* live);
Status IsReady(bool* ready);
// Model health
Status ModelIsReady(
const std::string& model_name, const int64_t model_version, bool* ready);
// Return the ready versions of a specific model.
Status ModelReadyVersions(
const std::string& model_name, std::vector<int64_t>* versions);
// Return the ready versions of all models
Status ModelReadyVersions(
std::map<std::string, std::vector<int64_t>>* model_versions);
/// Get the index of all models in all repositories.
/// \param ready_only If true return only index of models that are ready.
/// \param index Returns the index.
/// \return error status.
Status RepositoryIndex(
const bool ready_only,
std::vector<ModelRepositoryManager::ModelIndex>* index);
// Inference. If Status::Success is returned then this function has
// taken ownership of the request object and so 'request' will be
// nullptr. If non-success is returned then the caller still retains
// ownership of 'request'.
Status InferAsync(std::unique_ptr<InferenceRequest>& request);
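The ownership rule for InferAsync() deserves emphasis: on success the unique_ptr is moved into the server and becomes nullptr, on failure the caller keeps it. A minimal caller-side sketch, assuming the request has already been built and that Status exposes the IsOk() helper used throughout the core:

#include <memory>

void
Submit(InferenceServer* server, std::unique_ptr<InferenceRequest> request)
{
  Status status = server->InferAsync(request);
  if (status.IsOk()) {
    // The server now owns the request ('request' is nullptr); results are
    // delivered asynchronously via the request's response callback.
  } else {
    // Ownership stays with the caller, who must report the error and let
    // 'request' be destroyed (or retry).
  }
}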
// Load the specified models. A model that has already been loaded is reloaded.
Status LoadModel(
const std::unordered_map<
std::string, std::vector<const InferenceParameter*>>& models);
// Unload the corresponding model.
Status UnloadModel(
const std::string& model_name, const bool unload_dependents);
// Print backends and models summary
Status PrintBackendAndModelSummary();
// Register model repository path and associated mappings
Status RegisterModelRepository(
const std::string& repository,
const std::unordered_map<std::string, std::string>& model_mapping);
// Unregister model repository path.
Status UnregisterModelRepository(const std::string& repository);
// Return the server version.
const std::string& Version() const { return version_; }
// Return the server extensions.
const std::vector<const char*>& Extensions() const { return extensions_; }
// Get / set the ID of the server.
const std::string& Id() const { return id_; }
void SetId(const std::string& id) { id_ = id; }
// Get / set the model repository path
const std::set<std::string>& ModelRepositoryPaths() const
{
return model_repository_paths_;
}
void SetModelRepositoryPaths(const std::set<std::string>& p)
{
model_repository_paths_ = p;
}
// Get / set model control mode.
ModelControlMode GetModelControlMode() const { return model_control_mode_; }
void SetModelControlMode(ModelControlMode m) { model_control_mode_ = m; }
// Get / set the startup models
const std::set<std::string>& StartupModels() const { return startup_models_; }
void SetStartupModels(const std::set<std::string>& m) { startup_models_ = m; }
// Get / set strict model configuration enable.
bool StrictModelConfigEnabled() const { return strict_model_config_; }
void SetStrictModelConfigEnabled(bool e) { strict_model_config_ = e; }
// Get / set rate limiter mode.
RateLimitMode RateLimiterMode() const { return rate_limit_mode_; }
void SetRateLimiterMode(RateLimitMode m) { rate_limit_mode_ = m; }
// Get / set rate limit resource counts
const RateLimiter::ResourceMap& RateLimiterResources() const
{
return rate_limit_resource_map_;
}
void SetRateLimiterResources(const RateLimiter::ResourceMap& rm)
{
rate_limit_resource_map_ = rm;
}
// Get / set the pinned memory pool byte size.
int64_t PinnedMemoryPoolByteSize() const { return pinned_memory_pool_size_; }
void SetPinnedMemoryPoolByteSize(int64_t s)
{
pinned_memory_pool_size_ = std::max((int64_t)0, s);
}
// Get / set whether response cache will be enabled server-wide.
// NOTE: Models still need caching enabled in individual model configs.
bool ResponseCacheEnabled()
{
// Only return true if cache was enabled, and has been initialized
return response_cache_enabled_ && CacheManager() && CacheManager()->Cache();
}
void SetResponseCacheEnabled(bool e) { response_cache_enabled_ = e; }
void SetCacheConfig(CacheConfigMap cfg) { cache_config_map_ = cfg; }
std::string CacheDir() const { return cache_dir_; }
void SetCacheDir(std::string dir) { cache_dir_ = dir; }
// Get / set CUDA memory pool size
const std::map<int, uint64_t>& CudaMemoryPoolByteSize() const
{
return cuda_memory_pool_size_;
}
void SetCudaMemoryPoolByteSize(const std::map<int, uint64_t>& s)
{
cuda_memory_pool_size_ = s;
}
// Get / set the minimum supported CUDA compute capability.
double MinSupportedComputeCapability() const
{
return min_supported_compute_capability_;
}
void SetMinSupportedComputeCapability(double c)
{
min_supported_compute_capability_ = c;
}
// Get / set strict readiness enable.
bool StrictReadinessEnabled() const { return strict_readiness_; }
void SetStrictReadinessEnabled(bool e) { strict_readiness_ = e; }
// Get / set the server exit timeout, in seconds.
int32_t ExitTimeoutSeconds() const { return exit_timeout_secs_; }
void SetExitTimeoutSeconds(int32_t s) { exit_timeout_secs_ = std::max(0, s); }
void SetBufferManagerThreadCount(unsigned int c)
{
buffer_manager_thread_count_ = c;
}
void SetModelLoadThreadCount(unsigned int c) { model_load_thread_count_ = c; }
void SetModelNamespacingEnabled(const bool e)
{
enable_model_namespacing_ = e;
}
// Set a backend command-line configuration
void SetBackendCmdlineConfig(
const triton::common::BackendCmdlineConfigMap& bc)
{
backend_cmdline_config_map_ = bc;
}
void SetHostPolicyCmdlineConfig(
const triton::common::HostPolicyCmdlineConfigMap& hp)
{
host_policy_map_ = hp;
}
void SetRepoAgentDir(const std::string& d) { repoagent_dir_ = d; }
// Return the requested model object.
Status GetModel(
const std::string& model_name, const int64_t model_version,
std::shared_ptr<Model>* model)
{
// Allow model retrieval while server exiting to provide graceful
// completion of inference sequence that spans multiple requests.
if ((ready_state_ != ServerReadyState::SERVER_READY) &&
(ready_state_ != ServerReadyState::SERVER_EXITING)) {
return Status(Status::Code::UNAVAILABLE, "Server not ready");
}
return model_repository_manager_->GetModel(
model_name, model_version, model);
}
// Return the requested model object.
Status GetModel(
const ModelIdentifier& model_id, const int64_t model_version,
std::shared_ptr<Model>* model)
{
// Allow model retrieval while server exiting to provide graceful
// completion of inference sequence that spans multiple requests.
if ((ready_state_ != ServerReadyState::SERVER_READY) &&
(ready_state_ != ServerReadyState::SERVER_EXITING)) {
return Status(Status::Code::UNAVAILABLE, "Server not ready");
}
return model_repository_manager_->GetModel(model_id, model_version, model);
}
// Get the Backend Manager
const std::shared_ptr<TritonBackendManager>& BackendManager()
{
return backend_manager_;
}
// Return the pointer to RateLimiter object.
std::shared_ptr<RateLimiter> GetRateLimiter() { return rate_limiter_; }
// Get the Cache Manager
const std::shared_ptr<TritonCacheManager>& CacheManager()
{
return cache_manager_;
}
private:
const std::string version_;
std::string id_;
std::vector<const char*> extensions_;
std::set<std::string> model_repository_paths_;
std::set<std::string> startup_models_;
ModelControlMode model_control_mode_;
bool strict_model_config_;
bool strict_readiness_;
uint32_t exit_timeout_secs_;
uint32_t buffer_manager_thread_count_;
uint32_t model_load_thread_count_;
bool enable_model_namespacing_;
uint64_t pinned_memory_pool_size_;
bool response_cache_enabled_;
CacheConfigMap cache_config_map_;
std::string cache_dir_;
std::map<int, uint64_t> cuda_memory_pool_size_;
double min_supported_compute_capability_;
triton::common::BackendCmdlineConfigMap backend_cmdline_config_map_;
triton::common::HostPolicyCmdlineConfigMap host_policy_map_;
std::string repoagent_dir_;
RateLimitMode rate_limit_mode_;
RateLimiter::ResourceMap rate_limit_resource_map_;
// Current state of the inference server.
ServerReadyState ready_state_;
// Number of in-flight, non-inference requests. During shutdown we
// attempt to wait for all in-flight non-inference requests to
// complete before exiting (also wait for in-flight inference
// requests but that is determined by model shared_ptr).
std::atomic<uint64_t> inflight_request_counter_;
std::shared_ptr<RateLimiter> rate_limiter_;
std::unique_ptr<ModelRepositoryManager> model_repository_manager_;
std::shared_ptr<TritonBackendManager> backend_manager_;
std::shared_ptr<TritonCacheManager> cache_manager_;
};
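Putting the public interface together, the in-process lifecycle is: configure the server through the setters, Init(), submit work with InferAsync(), and Stop() on shutdown. A rough sketch using only methods declared above; request construction is elided and the exact ModelControlMode enumerator is an assumption.

#include <memory>
#include <set>
#include <string>

Status
RunOnce(std::unique_ptr<InferenceRequest> request)
{
  InferenceServer server;
  server.SetId("triton");
  server.SetModelRepositoryPaths({"/models"});
  server.SetModelControlMode(ModelControlMode::MODE_POLL);  // assumed enumerator

  Status status = server.Init();
  if (!status.IsOk()) {
    return status;
  }

  bool ready = false;
  server.IsReady(&ready);
  if (ready) {
    // On success the server takes ownership of 'request'.
    status = server.InferAsync(request);
  }

  // Stop waits (up to the exit timeout) for in-flight work to drain.
  server.Stop();
  return status;
}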
server
InferHandler->Start() => Process() => StartNewRequest(), Execute()