zhangjun / zhangjun.github.io

https://zhangjun.github.io

TensorFlow #7

Open · zhangjun opened this issue 2 years ago

zhangjun commented 2 years ago

grappler

zhangjun commented 1 year ago

autotune: per-op-family result caches built on AutotuneSingleton (gpu_utils.h)

https://github.com/tensorflow/tensorflow/blob/746b51e568f74919f3e2d3504ac883254385d305/tensorflow/core/kernels/gpu_utils.h#L240-L253

// A Singleton helper that manages the global autotune results by group.
// The caller specifies an arbitrary Group type that can distinguish between
// different autotune results, even if their Parameters and Configs are the
// same.
template <class Group, typename Parameters, typename Config,
          typename Hasher = internal::AutotuneMapHasher<Parameters>>
class AutotuneSingleton {
 public:
  typedef AutotuneMap<Parameters, Config, Hasher> AutotuneType;
  static AutotuneType* GetInstance() {
    static AutotuneType* instance = new AutotuneType(Group::name());
    return instance;
  }
};
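
The Group type contributes only its name(); the pattern gives each op family its own lazily created, process-wide cache. Below is a minimal standalone sketch of the same idea, with a toy SimpleCache standing in for AutotuneMap (all names here are illustrative, not TensorFlow's):

#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>

// Toy stand-in for AutotuneMap: maps a parameter key to a chosen config.
template <typename Parameters, typename Config>
class SimpleCache {
 public:
  explicit SimpleCache(std::string name) : name_(std::move(name)) {}
  bool Find(const Parameters& p, Config* c) const {
    auto it = map_.find(p);
    if (it == map_.end()) return false;
    *c = it->second;
    return true;
  }
  void Insert(const Parameters& p, const Config& c) { map_[p] = c; }
  const std::string& name() const { return name_; }

 private:
  std::string name_;
  std::unordered_map<Parameters, Config> map_;
};

// Same shape as AutotuneSingleton: one global cache per Group tag type.
template <class Group, typename Parameters, typename Config>
class CacheSingleton {
 public:
  using CacheType = SimpleCache<Parameters, Config>;
  static CacheType* GetInstance() {
    static CacheType* instance = new CacheType(Group::name());
    return instance;
  }
};

struct ConvGroup { static std::string name() { return "Conv"; } };
struct MatmulGroup { static std::string name() { return "Matmul"; } };

int main() {
  // Distinct Group types yield distinct global caches, even though the
  // Parameters/Config types (here string -> int) are identical.
  auto* conv_cache = CacheSingleton<ConvGroup, std::string, int>::GetInstance();
  auto* matmul_cache =
      CacheSingleton<MatmulGroup, std::string, int>::GetInstance();
  conv_cache->Insert("shape=1x64x56x56", 7);

  int algo = -1;
  std::cout << conv_cache->name() << " hit: "
            << conv_cache->Find("shape=1x64x56x56", &algo) << " algo=" << algo
            << "\n";
  std::cout << matmul_cache->name() << " hit: "
            << matmul_cache->Find("shape=1x64x56x56", &algo) << "\n";
  return 0;
}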

bias / conv / matmul autotune groups

https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/autotune_maps/conv_parameters.h

conv

struct ConvAutotuneGroup {
  static string name() { return "Conv"; }
};

// Uniquely identifies a convolution operation that runs on a particular device
// model.
//
// This can serve as a hashtable key, where the value might be the autotuned
// algorithm we choose for the conv.
//
// All of the data in this class other than the device_id is stored in the
// ConvParametersProto, so it can be easily serialized (for the purposes of
// ahead-of-time autotuning).
//
// When using the cudnn frontend API, two autotuning results for two different
// GPUs of the same model are not interchangeable, because an autotuning result
// includes a cudnn execution plan, which is tied to the GPU.  As a result, we
// need to create separate ConvParameters objects for them.
class ConvParameters {
 public:
  struct FusionInfo {
    // For some implementations (e.g. the cuDNN new backend) these scales are
    // part of the algorithm, not part of the parameters an algorithm takes.
    // They are therefore needed to distinguish different algorithms.
    double conv_scale;
    double side_input_scale;
    double leakyrelu_alpha;
    stream_executor::dnn::ActivationMode activation_mode;
    bool is_contrib;
  };

  // LINT.IfChange(conv_parameters_version)
  // A positive number that denotes the version of this class. Should be
  // incremented every time this class or ConvParametersProto is updated in a
  // way that may invalidate autotune results.
  static constexpr int kVersion = 2;
  // LINT.ThenChange()

  // We have three kinds of convolutions today.  Vanilla unfused convolutions,
  // fused convolutions, and fused convolutions as implemented in the `contrib`
  // directory.  The two fused convolutions ultimately correspond to the same
  // cudnn calls, but have slightly different semantics (e.g. they interpret
  // padding differently).
  ConvParameters(
      int64_t batch, int64_t in_depths, absl::Span<const int64_t> in,
      int data_format, int64_t out_depths, absl::Span<const int64_t> filter,
      absl::Span<const int64_t> dilation, absl::Span<const int64_t> stride,
      absl::Span<const int64_t> padding, DataType dtype, int device_id,
      int group_count,
      absl::optional<FusionInfo> fusion_info = absl::optional<FusionInfo>(),
      // This argument should be set only for test use.
      int version = kVersion);

  ConvParameters(int device_id, const ConvParametersProto& proto);

  bool operator==(const ConvParameters& other) const;

  bool operator!=(const ConvParameters& other) const {
    return !(*this == other);
  }
  uint64 hash() const { return hash_code_; }

  string ToString() const;

  const ConvParametersProto& proto() const { return proto_; }

 private:
  int device_id_;
  ConvParametersProto proto_;
  uint64 hash_code_;
};
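
For illustration, here is a hypothetical sketch of building such a key for a vanilla (unfused) float conv, following the constructor declared above. The shape values, the helper name MakeExampleConvKey, and the FORMAT_NHWC-to-int conversion are assumptions for the example, not code from TensorFlow:

// Hypothetical example, not TensorFlow code: shapes and the helper name are
// made up; the constructor arguments follow the declaration above.
#include <cstdint>
#include <vector>

#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {

// Builds a key for a vanilla (unfused) 3x3 float NHWC conv on `device_id`.
ConvParameters MakeExampleConvKey(int device_id) {
  const std::vector<int64_t> in = {56, 56};    // input spatial dims (H, W)
  const std::vector<int64_t> filter = {3, 3};  // filter spatial dims
  const std::vector<int64_t> dilation = {1, 1};
  const std::vector<int64_t> stride = {1, 1};
  const std::vector<int64_t> padding = {1, 1};
  return ConvParameters(/*batch=*/32, /*in_depths=*/64, in,
                        /*data_format=*/FORMAT_NHWC,  // unscoped enum -> int
                        /*out_depths=*/128, filter, dilation, stride, padding,
                        /*dtype=*/DT_FLOAT, device_id, /*group_count=*/1);
  // fusion_info is omitted: a vanilla conv carries no FusionInfo.
}

}  // namespace tensorflow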

// Autotuning map entry for cuDNN-frontend-capable APIs.
//
// The longer-term intent is to remove the AlgorithmConfig variant and make this
// contain only the two LazyOpRunners, but for the time being ROCm is stuck on
// the legacy API and requires an AlgorithmConfig.
template <typename Op>
class AutotuneEntry {
 public:
  AutotuneEntry() : is_algorithm_config_(true) {}

  // Initialize with legacy-API AlgorithmConfig; used for the ROCm backend only.
  explicit AutotuneEntry(se::dnn::AlgorithmConfig config)
      : is_algorithm_config_(true), algorithm_config_(std::move(config)) {}

  AutotuneEntry(std::shared_ptr<se::dnn::LazyOpRunner<Op>> primary,
                std::shared_ptr<se::dnn::LazyOpRunner<Op>> no_scratch_fallback)
      : is_algorithm_config_(false),
        op_runners_{std::move(primary), std::move(no_scratch_fallback)} {}

  // Initialize from config data, without pre-cached runners, such as when
  // loading AoT autotuning maps.
  AutotuneEntry(se::dnn::AlgorithmDesc primary,
                absl::optional<se::dnn::AlgorithmDesc> no_scratch_fallback)
      : AutotuneEntry(std::make_shared<se::dnn::LazyOpRunner<Op>>(primary),
                      no_scratch_fallback
                          ? std::make_shared<se::dnn::LazyOpRunner<Op>>(
                                *no_scratch_fallback)
                          : nullptr) {}

  // Initialize with pre-cached OpRunners, such as during autotuning.
  static StatusOr<AutotuneEntry> FromOpRunners(
      std::unique_ptr<const se::dnn::OpRunner<typename Op::Signature>> primary,
      std::unique_ptr<const se::dnn::OpRunner<typename Op::Signature>>
          no_cache_fallback) {
    TF_ASSIGN_OR_RETURN(
        auto primary_cache,
        se::dnn::LazyOpRunner<Op>::FromOpRunner(std::move(primary)));

    if (no_cache_fallback) {
      TF_ASSIGN_OR_RETURN(auto fallback_cache,
                          se::dnn::LazyOpRunner<Op>::FromOpRunner(
                              std::move(no_cache_fallback)));
      return AutotuneEntry(std::move(primary_cache), std::move(fallback_cache));

    } else {
      return AutotuneEntry(std::move(primary_cache), nullptr);
    }
  }

  struct OpRunners {
    OpRunners() = default;

    OpRunners(std::shared_ptr<se::dnn::LazyOpRunner<Op>> primary_,
              std::shared_ptr<se::dnn::LazyOpRunner<Op>> no_scratch_fallback_)
        : primary(std::move(primary_)),
          no_scratch_fallback(std::move(no_scratch_fallback_)) {}

    // Null iff this 'OpRunners' is default-constructed as part of the
    // fake-variant in AutotuneEntry; users outside gpu_utils.h itself should
    // never see primary = nullptr.
    std::shared_ptr<se::dnn::LazyOpRunner<Op>> primary;
    std::shared_ptr<se::dnn::LazyOpRunner<Op>> no_scratch_fallback;  // Nullable

    bool operator==(const OpRunners& other) const {
      return *primary == *other.primary &&
             ((!no_scratch_fallback && !other.no_scratch_fallback) ||
              (no_scratch_fallback && other.no_scratch_fallback &&
               *no_scratch_fallback == *other.no_scratch_fallback));
    }
  };

  bool is_algorithm_config() const { return is_algorithm_config_; }

  const se::dnn::AlgorithmConfig& GetAlgorithmConfig() const {
    DCHECK(is_algorithm_config_);
    return algorithm_config_;
  }

  const OpRunners& GetOpRunners() const {
    DCHECK(!is_algorithm_config_);
    return op_runners_;
  }

  // AutotuneMap needs to test equality to keep track of the number of times an
  // algorithm has won autotuning; for this purpose, we can use ToString to
  // determine whether runners are equal.
  bool operator==(const AutotuneEntry<Op>& other) const {
    if (is_algorithm_config_) {
      return other.is_algorithm_config_ &&
             algorithm_config_ == other.algorithm_config_;
    }

    return !other.is_algorithm_config_ && op_runners_ == other.op_runners_;
  }

  bool operator!=(const AutotuneEntry<Op>& other) const {
    return !(*this == other);
  }

  std::string ToString() const {
    if (is_algorithm_config_) {
      return algorithm_config_.ToString();
    }
    return absl::StrCat("{", op_runners_.primary->ToString(), ", ",
                        (op_runners_.no_scratch_fallback
                             ? op_runners_.no_scratch_fallback->ToString()
                             : "(op_runners have no fallback)"),
                        "}");
  }

 private:
  // NVCC is broken, so we can't use absl::variant here.  Just fake it with a
  // bool and both fields.
  bool is_algorithm_config_;
  se::dnn::AlgorithmConfig algorithm_config_;
  OpRunners op_runners_;
};
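
Because NVCC rules out absl::variant, the class hand-rolls a two-state variant: a bool discriminant plus both payload members, only one of which is meaningful at a time, and call sites branch on is_algorithm_config(). A standalone toy sketch of that design (all types and names below are illustrative, not TensorFlow's):

#include <iostream>
#include <memory>
#include <string>
#include <utility>

// Illustrative stand-ins for AlgorithmConfig and LazyOpRunner.
struct LegacyConfig { std::string algo = "legacy-algo-0"; };
struct Runner { std::string plan = "frontend-plan-3"; };

// Mirrors the "fake variant": a bool discriminant plus both fields.
class Entry {
 public:
  explicit Entry(LegacyConfig cfg)
      : is_legacy_(true), legacy_(std::move(cfg)) {}
  explicit Entry(std::shared_ptr<Runner> primary)
      : is_legacy_(false), primary_(std::move(primary)) {}

  bool is_legacy() const { return is_legacy_; }
  const LegacyConfig& legacy() const { return legacy_; }
  const Runner& runner() const { return *primary_; }

 private:
  bool is_legacy_;                    // discriminant
  LegacyConfig legacy_;               // meaningful iff is_legacy_
  std::shared_ptr<Runner> primary_;   // meaningful iff !is_legacy_
};

void Launch(const Entry& e) {
  // Call sites branch on the discriminant, mirroring
  // is_algorithm_config() / GetAlgorithmConfig() / GetOpRunners().
  if (e.is_legacy()) {
    std::cout << "launch with legacy config " << e.legacy().algo << "\n";
  } else {
    std::cout << "launch with cached runner " << e.runner().plan << "\n";
  }
}

int main() {
  Launch(Entry(LegacyConfig{}));
  Launch(Entry(std::make_shared<Runner>()));
  return 0;
}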

using ConvAutotuneMap = AutotuneSingleton<ConvAutotuneGroup, ConvParameters,
                                          AutotuneEntry<se::dnn::ConvOp>>;
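
Putting the pieces together, the typical flow is: build a ConvParameters key for the incoming conv, consult the per-group singleton cache, and on a miss run the measurement loop and insert the winning AutotuneEntry. A hedged sketch of that flow, assuming AutotuneMap's Find/Insert lookup methods from gpu_utils.h; GetOrAutotune and RunAutotuning are hypothetical names, not TensorFlow functions:

// Hypothetical sketch, not TensorFlow code. Assumes AutotuneMap exposes
// bool Find(const Parameters&, Config*) and
// void Insert(const Parameters&, const Config&), as in gpu_utils.h.
#include "tensorflow/core/kernels/gpu_utils.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.h"

namespace tensorflow {

// Placeholder for the real measurement loop in the conv kernels, which
// benchmarks candidate cuDNN runners and returns the winner.
StatusOr<AutotuneEntry<se::dnn::ConvOp>> RunAutotuning(
    const ConvParameters& key);

StatusOr<AutotuneEntry<se::dnn::ConvOp>> GetOrAutotune(
    const ConvParameters& key) {
  AutotuneEntry<se::dnn::ConvOp> entry;
  // Cache hit: reuse the algorithm previously chosen for this exact key
  // (same shapes, dtype, fusion info, and device_id).
  if (ConvAutotuneMap::GetInstance()->Find(key, &entry)) {
    return entry;
  }
  // Cache miss: benchmark candidates, then record the winner so later ops
  // with an identical ConvParameters key skip the search.
  TF_ASSIGN_OR_RETURN(entry, RunAutotuning(key));
  ConvAutotuneMap::GetInstance()->Insert(key, entry);
  return entry;
}

}  // namespace tensorflow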