NVIDIA / cutlass

CUDA Templates for Linear Algebra Subroutines

[BUG] DefaultConv2dFpropWithBroadcast 7x slower than DefaultConv2dFprop with FewChannel mode #884

Closed ShuaiShao93 closed 1 year ago

ShuaiShao93 commented 1 year ago

**Describe the bug**
For a problem with `--n=7 --h=1208 --w=1920 --c=4 --k=16 --r=4 --s=4 --stride_h=4 --stride_w=4`, DefaultConv2dFprop takes only 0.77 ms, but DefaultConv2dFpropWithBroadcast takes 5.2 ms.
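
For reference, the 0.77 ms DefaultConv2dFprop baseline comes from the CUTLASS profiler (hwu36 refers to "your profiler kernel" below). A possible invocation to reproduce it, assuming the standard cutlass_profiler conv2d flag spellings:

$ cutlass_profiler --operation=Conv2d --conv_kind=fprop --n=7 --h=1208 --w=1920 --c=4 --k=16 --r=4 --s=4 --stride_h=4 --stride_w=4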

**Steps/Code to reproduce bug**

include "cutlass/conv/conv2d_problem_size.h"

include "cutlass/conv/device/implicit_gemm_convolution.h"

include "cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h"

include "cutlass/cutlass.h"

include "cutlass/epilogue/thread/linear_combination_bias_elementwise.h"

include "cutlass/util/command_line.h"

include "cutlass/util/host_tensor.h"

include "cutlass/util/reference/device/gemm.h"

include "cutlass/util/reference/host/convolution.h"

include "cutlass/util/reference/host/tensor_compare.h"

include "cutlass/util/reference/host/tensor_copy.h"

include "cutlass/util/reference/host/tensor_fill.h"

include "cutlass/util/tensor_view_io.h"


/////////////////////////////////////////////////////////////////////////////////////////////////

/// Computes:
///
///   Z = GEMM + Bias + ReLu
///   T = ReLu conditional
///
template <typename Gemm>
struct GemmWithBiasReluReferenceOp {
  using OutputOp = typename Gemm::UnderlyingKernel::Epilogue::OutputOp;

  using ElementCompute = typename OutputOp::ElementCompute;
  using ElementZ = typename OutputOp::ElementZ;
  using ElementT = typename OutputOp::ElementT;

  typename OutputOp::BinaryOp binary_op;
  typename OutputOp::ElementwiseOp elementwise_op;

  GemmWithBiasReluReferenceOp() {}

  void operator()(ElementZ& Z, ElementCompute gemm, ElementCompute bias) {
    ElementZ kThreshold = ElementZ();

    ElementCompute z_full = binary_op(gemm, bias);

    // Saturate and round when the output type is int8.
    if (std::is_same<ElementZ, int8_t>::value) {
      z_full = std::max(float(-128.), z_full);
      z_full = std::min(float(127.), z_full);
      z_full = round(z_full);
    }
    Z = static_cast<ElementZ>(z_full);

    // ReLu: clamp Z at the threshold (zero).
    bool conditional = (Z >= kThreshold);

    if (!conditional) {
      Z = kThreshold;
    }
  }
};

// The code section below describes the datatypes for the input and output
// tensors and the computation between elements
using ElementAccumulator = float;      // Data type of accumulator
using ElementComputeEpilogue = float;  // Data type of epilogue computation (alpha, beta)
using ElementInputA = cutlass::half_t; // Data type of elements in input tensor
using ElementInputB = cutlass::half_t; // Data type of elements in input tensor
using ElementInputV = ElementComputeEpilogue;
using ElementOutputT = ElementComputeEpilogue;
using ElementOutputZ = cutlass::half_t;
using ElementInputC = ElementOutputZ;

// using LayoutInputA = cutlass::layout::TensorNCxHWx<32>;
// using LayoutInputB = cutlass::layout::TensorCxRSKx<32>;
// using LayoutOutput = cutlass::layout::TensorNCxHWx<32>;
using LayoutInputA = cutlass::layout::TensorNHWC;
using LayoutInputB = cutlass::layout::TensorNHWC;
using LayoutOutput = cutlass::layout::TensorNHWC;

// This code section describes whether you want to use tensor cores or regular
// SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassTensorOp;

// This code section describes the CUDA SM architecture number
using SmArch = cutlass::arch::Sm75;

// This code section describes the tile size a thread block will compute
using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 32>;  // Threadblock tile shape

// This code section describes the tile size a warp will compute
using WarpShape = cutlass::gemm::GemmShape<64, 16, 32>;  // Warp tile shape

// This code section describes the size of the MMA op
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;  // TensorCore instruction shape

// This code section describes how threadblocks are scheduled on the GPU
using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>;

// Number of pipeline stages you want to use
constexpr int NumStages = 2;

// Epilogue: Z = ReLu(GEMM + bias), T not stored
using EpilogueOutputOp = cutlass::epilogue::thread::LinearCombinationBiasElementwise<
    ElementInputC, ElementAccumulator, ElementComputeEpilogue, ElementOutputZ,
    ElementOutputT, 4, cutlass::epilogue::thread::ReLu<ElementComputeEpilogue>,
    cutlass::plus<ElementComputeEpilogue>, false, ElementComputeEpilogue>;

using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast<
    ElementInputA, LayoutInputA, ElementInputB, LayoutInputB, ElementOutputZ,
    LayoutOutput, ElementAccumulator, MMAOp, SmArch, ThreadblockShape,
    WarpShape, InstructionShape, EpilogueOutputOp, SwizzleThreadBlock,
    NumStages, cutlass::arch::OpMultiplyAdd,
    cutlass::conv::IteratorAlgorithm::kFewChannels,
    cutlass::conv::StrideSupport::kStrided, 4, 4>::Kernel;

using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;

/////////////////////////////////////////////////////////////////////////////////////////////////

// Command line options parsing
struct Options {
  cutlass::Tensor4DCoord input_size;
  cutlass::Tensor4DCoord filter_size;
  cutlass::Tensor4DCoord padding;
  cutlass::MatrixCoord conv_stride;
  cutlass::MatrixCoord dilation;
  bool reference_check;
  bool measure_performance;
  int iterations;
  bool save_workspace;
  ElementComputeEpilogue alpha;
  ElementComputeEpilogue beta;
  bool benchmark;

  Options()
      : input_size(1, 32, 32, 32),
        filter_size(32, 3, 3, 32),
        padding(1, 1, 1, 1),
        conv_stride(4, 4),
        dilation(1, 1),
        reference_check(false),
        measure_performance(true),
        iterations(20),
        save_workspace(false),
        alpha(1),
        beta(0),
        benchmark(false) {}

  // Verify the problem size is compatible with the CUTLASS Convolution
  // implementation.
  bool valid() {
    //
    // CUTLASS attempts to load 128b vectors of int4b_t elements. Consequently,
    // all pointers, strides, and tensor extents must be divisible by 32
    // elements.
    //
    int const kAlignment = 32;

    if ((input_size.c() % kAlignment) || (filter_size.n() % kAlignment)) {
      // misaligned tensors
      return false;
    }

    // Invalid padding
    if ((padding.h() != filter_size.h() / 2) ||
        (padding.w() != filter_size.w() / 2)) {
      return false;
    }

    return true;
  }

  /// Updates input and filter sizes
  void update(cutlass::Tensor4DCoord input_size,
              cutlass::Tensor4DCoord filter_size) {
    this->input_size = input_size;
    this->filter_size = filter_size;

    padding.n() = filter_size.h() / 2;
    padding.h() = filter_size.h() / 2;
    padding.w() = filter_size.w() / 2;
    padding.c() = filter_size.w() / 2;
  }

  // Parses the command line
  void parse(int argc, char const** args) {
    cutlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("ref-check")) {
      reference_check = true;
    }

    if (cmd.check_cmd_line_flag("perf-check")) {
      measure_performance = true;
    }

    if (cmd.check_cmd_line_flag("save-workspace")) {
      save_workspace = true;
    }

    if (cmd.check_cmd_line_flag("benchmark")) {
      benchmark = true;
    }

    cmd.get_cmd_line_argument("n", input_size.n());
    cmd.get_cmd_line_argument("h", input_size.h());
    cmd.get_cmd_line_argument("w", input_size.w());
    cmd.get_cmd_line_argument("c", input_size.c());

    cmd.get_cmd_line_argument("k", filter_size.n());
    cmd.get_cmd_line_argument("r", filter_size.h());
    cmd.get_cmd_line_argument("s", filter_size.w());
    filter_size.c() = input_size.c();

    cmd.get_cmd_line_argument("alpha", alpha);
    cmd.get_cmd_line_argument("beta", beta);

    cmd.get_cmd_line_argument("stride_h", conv_stride.row());
    cmd.get_cmd_line_argument("stride_w", conv_stride.column());

    cmd.get_cmd_line_argument("iterations", iterations);

    if (filter_size.h() == 3 && filter_size.w() == 3) {
      padding = {1, 1, 1, 1};
    } else {
      padding = {0, 0, 0, 0};
    }
  }

  /// Computes the output tensor size (NPQK)
  cutlass::Tensor4DCoord output_size() const {
    return cutlass::Tensor4DCoord(
        input_size.n(),
        (input_size.h() + padding.n() + padding.h() - filter_size.h()) /
                conv_stride.row() + 1,
        (input_size.w() + padding.w() + padding.c() - filter_size.w()) /
                conv_stride.column() + 1,
        filter_size.n());
  }

  /// Compute performance in GFLOP/s
  double gflops(double runtime_s) const {
    // Number of multiply-adds = NPQK * CRS
    int64_t fmas =
        output_size().product() *
        int64_t(filter_size.h() * filter_size.w() * filter_size.c());

    // Two flops per multiply-add
    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

struct Result {
  double runtime_ms;
  double gflops;
  cutlass::Status status;
  cutlass::Status reference_check;
  cudaError_t error;

  Result()
      : runtime_ms(0),
        gflops(0),
        status(cutlass::Status::kSuccess),
        reference_check(cutlass::Status::kInvalid),
        error(cudaSuccess) {}

  static std::ostream& print_header(std::ostream& out, Options const& options) {
    out << "Layer,N,H,W,C,K,R,S,stride_h,stride_w,Runtime,GFLOPs";

    return out;
  }

  std::ostream& print(std::ostream& out, int idx, Options const& options) {
    out << "conv_" << idx << "," << options.input_size.n() << ","
        << options.input_size.h() << "," << options.input_size.w() << ","
        << options.input_size.c() << "," << options.filter_size.n() << ","
        << options.filter_size.h() << "," << options.filter_size.w() << ","
        << options.conv_stride.row() << "," << options.conv_stride.column()
        << "," << runtime_ms << "," << gflops;

    return out;
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Runs one benchmark
Result profile_convolution(Options const& options) {
  Result result;

  //
  // Allocate host-device tensors using the CUTLASS Utilities.
  //

  cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(options.input_size);
  cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(options.filter_size);
  cutlass::HostTensor<ElementComputeEpilogue, LayoutOutput> tensor_ref_y(
      options.output_size());
  cutlass::HostTensor<ElementOutputZ, LayoutOutput> tensor_z(
      options.output_size());
  cutlass::HostTensor<ElementOutputZ, LayoutOutput> tensor_ref_z(
      options.output_size());
  cutlass::HostTensor<ElementInputV, cutlass::layout::RowMajor> tensor_v(
      {1, options.output_size().c()});

  // Fill input and output matrices on host using CUTLASS helper functions
  cutlass::reference::host::TensorFillRandomUniform(
      tensor_a.host_view(), 1, ElementInputA(4), ElementInputA(-4),
      0);  // <- Fill tensor A on host with uniform-distribution random data
  cutlass::reference::host::TensorFillRandomUniform(
      tensor_b.host_view(), 1, ElementInputB(4), ElementInputB(-4),
      0);  // <- Fill tensor B on host with uniform-distribution random data
  cutlass::reference::host::TensorFillRandomUniform(
      tensor_v.host_view(), 1, ElementInputV(4), ElementInputV(-4));
  cutlass::reference::host::TensorFill(
      tensor_ref_y.host_view());  // <- fill on host with zeros
  cutlass::reference::host::TensorFill(
      tensor_z.host_view());  // <- fill on host with zeros
  cutlass::reference::host::TensorFill(
      tensor_ref_z.host_view());  // <- fill on host with zeros

  // cutlass::reference::host::TensorFill(
  //     tensor_v.host_view());  // <- fill bias vector on host with zeros

  // Copy data from host to GPU
  tensor_a.sync_device();
  tensor_b.sync_device();
  tensor_v.sync_device();
  tensor_z.sync_device();

  //
  // Define arguments for CUTLASS Convolution
  //

  // mode (kCrossCorrelation or kConvolution)
  cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;

  // Split K dimension into 1 partition
  int split_k_slices = 1;

  // Construct Conv2dProblemSize with user defined output size
  cutlass::conv::Conv2dProblemSize problem_size(
      options.input_size, options.filter_size, options.padding,
      options.conv_stride, options.dilation, options.output_size(), mode,
      split_k_slices);

  // Construct ImplicitGemm::Arguments structure with the conv2d problem
  typename ImplicitGemm::Arguments arguments{problem_size,
                                             tensor_a.device_ref(),
                                             tensor_b.device_ref(),
                                             {},
                                             tensor_z.device_ref(),
                                             {options.alpha, options.beta},
                                             cutlass::conv::SplitKMode::kSerial,
                                             tensor_v.device_data(),
                                             0,
                                             0};

  //
  // Initialize CUTLASS Convolution
  //

  ImplicitGemm implicit_gemm_op;

  size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);

  // Allocate workspace memory
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  result.status = implicit_gemm_op.can_implement(arguments);
  CUTLASS_CHECK(result.status);

  result.status = implicit_gemm_op.initialize(arguments, workspace.get());
  CUTLASS_CHECK(result.status);

  //
  // Launch initialized CUTLASS kernel
  //
  result.status = implicit_gemm_op();

  CUTLASS_CHECK(result.status);

  if (options.reference_check) {
    std::cout << "Verification on host...\n";

    // Compute with reference implementation
    cutlass::reference::host::Conv2dFprop<
        ElementInputA, LayoutInputA, ElementInputB, LayoutInputB,
        ElementComputeEpilogue, LayoutOutput, ElementComputeEpilogue,
        ElementAccumulator>(problem_size, tensor_a.host_ref(),
                            tensor_b.host_ref(), {}, tensor_ref_y.host_ref(),
                            options.alpha, options.beta);

    GemmWithBiasReluReferenceOp<ImplicitGemm> reference_op;
    // compute tensor Z and tensor T
    for (int n = 0; n < problem_size.N; ++n) {
      for (int p = 0; p < problem_size.P; ++p) {
        for (int q = 0; q < problem_size.Q; ++q) {
          for (int k = 0; k < problem_size.K; ++k) {
            ElementOutputZ z;

            ElementComputeEpilogue accum = tensor_ref_y.at({n, p, q, k});
            ElementComputeEpilogue bias = tensor_v.at({0, k});

            reference_op(z, accum, bias);

            tensor_ref_z.at({n, p, q, k}) = z;
          }
        }
      }
    }

    // Check if output from CUTLASS kernel and reference kernel are equal or not
    tensor_z.sync_host();

    bool passed = cutlass::reference::host::TensorEquals(
        tensor_z.host_view(), tensor_ref_z.host_view());

    // for (int i = 0; i < 302 * 480 * 96; i++) {
    //   if (tensor_z.host_data()[i] != tensor_ref_z.host_data()[i]) {
    //     std::cout << i << " " << static_cast<int>(tensor_z.host_data()[i])
    //               << " " << static_cast<int>(tensor_ref_z.host_data()[i])
    //               << std::endl;
    //   }
    // }

    if (!passed) {
      result.reference_check = cutlass::Status::kErrorInternal;
      std::cout << "ERROR - results miscompared.\n";
    } else {
      result.reference_check = cutlass::Status::kSuccess;
      std::cout << "Passed.\n";
    }
  }

  //
  // Performance measurement
  //

  if (options.measure_performance) {
    cudaEvent_t events[2];

    for (auto& event : events) {
      result.error = cudaEventCreate(&event);
      if (result.error != cudaSuccess) {
        std::cerr << "cudaEventCreate() failed: "
                  << cudaGetErrorString(result.error) << std::endl;
        return result;
      }
    }

    // Record an event at the start of a series of convolution operations.
    result.error = cudaEventRecord(events[0]);
    if (result.error != cudaSuccess) {
      std::cerr << "cudaEventRecord() failed: "
                << cudaGetErrorString(result.error) << std::endl;
      return result;
    }

    // Launch a sequence of implicit GEMM operations on the device
    for (int iteration = 0; iteration < options.iterations; ++iteration) {
      result.status = implicit_gemm_op();
      CUTLASS_CHECK(result.status);
    }

    // Record an event when the convolutions have been launched.
    result.error = cudaEventRecord(events[1]);
    if (result.error != cudaSuccess) {
      std::cerr << "cudaEventRecord() failed: "
                << cudaGetErrorString(result.error) << std::endl;
      return result;
    }

    // Wait for work on the device to complete.
    result.error = cudaEventSynchronize(events[1]);
    if (result.error != cudaSuccess) {
      std::cerr << "cudaEventSynchronize() failed: "
                << cudaGetErrorString(result.error) << std::endl;
      return result;
    }

    // Measure elapsed runtime
    float runtime_ms = 0;
    result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
    if (result.error != cudaSuccess) {
      std::cerr << "cudaEventElapsedTime() failed: "
                << cudaGetErrorString(result.error) << std::endl;
      return result;
    }

    // Print average runtime and GFLOPs.
    result.runtime_ms = double(runtime_ms) / double(options.iterations);
    result.gflops = options.gflops(result.runtime_ms / 1000.0);

    // Cleanup
    for (auto event : events) {
      (void)cudaEventDestroy(event);
    }
  }

  return result;
}

/////////////////////////////////////////////////////////////////////////////////////////////////

int main(int argc, char const** args) {
  // Turing Tensor Core operations exposed with mma.sync are first available in
  // CUDA 10.2.
  //
  // CUTLASS must be compiled with the CUDA 10.2 Toolkit to run these examples.
  if (!(__CUDACC_VER_MAJOR__ > 10 ||
        (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
    std::cerr << "Turing Tensor Core operations must be compiled with CUDA "
                 "10.2 Toolkit or later."
              << std::endl;
    return 0;
  }

  cudaDeviceProp props;
  CUDA_CHECK(cudaGetDeviceProperties(&props, 0));

  if (!(props.major > 7 || (props.major == 7 && props.minor >= 5))) {
    std::cerr << "Turing Tensor Ops must be run on a machine with compute "
                 "capability at least 75."
              << std::endl;
    return 0;
  }

  Options options;

  options.parse(argc, args);

  if (options.benchmark) {
    // Benchmark several layers

    int batch_sizes[] = {1, 32, 64, 128, 256, 512};

    struct Benchmark {
      int h, w, c, k, r, s;
    } layers[] = {
        {56, 56, 64, 256, 1, 1},    {56, 56, 64, 64, 1, 1},
        {56, 56, 64, 64, 3, 3},     {56, 56, 256, 64, 1, 1},
        {56, 56, 256, 512, 1, 1},   {56, 56, 256, 128, 1, 1},
        {28, 28, 128, 128, 3, 3},   {28, 28, 128, 512, 1, 1},
        {28, 28, 512, 128, 1, 1},   {28, 28, 512, 1024, 1, 1},
        {28, 28, 512, 256, 1, 1},   {14, 14, 256, 256, 3, 3},
        {14, 14, 256, 1024, 1, 1},  {14, 14, 1024, 256, 1, 1},
        {14, 14, 1024, 2048, 1, 1}, {14, 14, 1024, 512, 1, 1},
        {7, 7, 512, 512, 3, 3},
    };

    Result::print_header(std::cout, options) << std::endl;

    int idx = 1;

    for (auto const& layer : layers) {
      for (auto N : batch_sizes) {
        options.update({N, layer.h, layer.w, layer.c},
                       {layer.k, layer.r, layer.s, layer.c});

        Result result = profile_convolution(options);
        result.print(std::cout, idx, options) << std::endl;
      }

      ++idx;
    }
  } else {
    // Execute one problem size
    // if (!options.valid()) {
    //   std::cerr << "Invalid problem." << std::endl;
    //   return -1;
    // }

    Result result = profile_convolution(options);

    Result::print_header(std::cout, options) << std::endl;
    result.print(std::cout, 1, options) << std::endl;
  }

  return 0;
}

/////////////////////////////////////////////////////////////////////////////////////////////////



**Expected behavior**
DefaultConv2dFpropWithBroadcast should be almost as fast as DefaultConv2dFprop.

**Environment details (please complete the following information):**
CUDA/NVCC 11.8, GPU: RTX 4000

**Additional context**
It seems this only happens with FewChannel mode.
hwu36 commented 1 year ago

sorry for the late response. i was delayed by vacation and gtc.

Your profiler kernel is FixedChannel, your broadcast kernel is FewChannel.

To exactly match the profiler kernel, the Z output type needs to be fp32.

seems warp tile size 64x32 is better than 64x16 for the fixed channel profiler kernel on A100. you can try it on your ada card.

If you still see a big perf gap, would you please post the REG and STACK usage of the profiler kernel and your customized kernel. You can get it from `cuobjdump --dump-resource-usage xxx.o`.
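
For illustration, a minimal sketch of those suggestions applied to the reproducer above, replacing the corresponding lines (untested here; the fp32 Z output, 64x32 warp tile, and FixedChannels iterator are the three suggested changes, everything else is assumed unchanged):

// Z output in fp32 to exactly match the profiler kernel
using ElementOutputZ = float;
using ElementInputC = ElementOutputZ;

// Warp tile of 64x32x32 instead of 64x16x32
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;

// FixedChannels iterator algorithm instead of kFewChannels
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast<
    ElementInputA, LayoutInputA, ElementInputB, LayoutInputB, ElementOutputZ,
    LayoutOutput, ElementAccumulator, MMAOp, SmArch, ThreadblockShape,
    WarpShape, InstructionShape, EpilogueOutputOp, SwizzleThreadBlock,
    NumStages, cutlass::arch::OpMultiplyAdd,
    cutlass::conv::IteratorAlgorithm::kFixedChannels,
    cutlass::conv::StrideSupport::kStrided, 4, 4>::Kernel;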

github-actions[bot] commented 1 year ago

This issue has been labeled inactive-30d due to no recent activity in the past 30 days. Please close this issue if no further response or action is needed. Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. This issue will be labeled inactive-90d if there is no activity in the next 60 days.

mnicely commented 1 year ago

@ShuaiShao93 is your issue resolved?

ShuaiShao93 commented 1 year ago

> Your profiler kernel is FixedChannel, your broadcast kernel is FewChannel. To exactly match the profiler kernel, the Z output type needs to be fp32.

By changing these two, it's 4.6 ms now, but still much slower.

> seems warp tile size 64x32 is better than 64x16 for the fixed channel profiler kernel on A100. you can try it on your ada card.

Sorry, I only have an RTX 4000 now. This should be fine since I'm comparing DefaultConv2dFpropWithBroadcast and DefaultConv2dFprop with the same tile sizes.

> If you still see a big perf gap, would you please post the REG and STACK usage of the profiler kernel and your customized kernel. You can get it from `cuobjdump --dump-resource-usage xxx.o`.

REG:96 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:872 TEXTURE:0 SURFACE:0 SAMPLER:0

hwu36 commented 1 year ago

> REG:96 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:872 TEXTURE:0 SURFACE:0 SAMPLER:0

stack of 0 means no spill. which kernel has this value? the one with broadcast or not?

ShuaiShao93 commented 1 year ago

> cuobjdump --dump-resource-usage

The one with broadcast. This is the full command; not sure if there is anything wrong:

$ make 09_turing_tensorop_conv2dfprop
Built target 09_turing_tensorop_conv2dfprop

$ cuobjdump --dump-resource-usage examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop

Fatbin ptx code:
================
arch = sm_75
code version = [7,8]
producer = <unknown>
host = linux
compile_size = 64bit
compressed

Fatbin elf code:
================
arch = sm_75
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

Resource usage:
 Common:
  GLOBAL:2
 Function _ZN7cutlass6KernelINS_4conv6kernel40ImplicitGemmConvolutionWithFusedEpilogueINS1_11threadblock21ImplicitGemmPipelinedINS_4gemm9GemmShapeILi128ELi64ELi32EEENS4_12TileIteratorINS4_52Conv2dFpropActivationTileAccessIteratorFixedChannelsINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout10TensorNHWCENS_9transform29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi256ENSI_ILi4ELi8EEELi8EEENS_12AlignedArrayISD_Li4ELi8EEEEEEENSG_11threadblock19RegularTileIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESL_Li16EEENS9_INS4_48Conv2dFpropFilterTileAccessIteratorFixedChannelsINSB_ILi32ELi64EEESD_SF_NSH_INSI_ILi32ELi64EEELi256ESK_Li8EEESN_EEEENSR_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESY_Li16EEEfSF_NS6_11threadblock9MmaPolicyINS6_4warp11MmaTensorOpINS7_ILi64ELi16ELi32EEESD_ST_SD_S12_fNSE_8RowMajorENS16_17MmaTensorOpPolicyINS_4arch3MmaINS7_ILi16ELi8ELi8EEELi32ESD_S19_SD_NSE_11ColumnMajorEfS19_NS1B_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1K_Li1EEENS_21NumericArrayConverterISD_SD_Li16ELNS_15FloatRoundStyleE2ENSG_6thread14UnaryTransform8IdentityEEENS1M_ISD_SD_Li8ELS1N_2ES1Q_EEbEENS_8epilogue11threadblock21EpilogueWithBroadcastIS8_S1J_Li1ENS1V_22PredicatedTileIteratorINS1V_26OutputTileOptimalThreadMapINS1V_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1Z_ILi1ELi8ELi1ELi1ELi8EEELi256ELi4ELi32EEEfLb0ENSE_9NoPermuteELb0EEES24_fNS1U_4warp24FragmentIteratorTensorOpIS18_S1D_fNS_5ArrayIfLi4ELb1EEES19_EENS25_20TileIteratorTensorOpIS18_S1D_fS19_EENS1V_18SharedLoadIteratorINS22_18CompactedThreadMapEfLi16EEENS1U_6thread32LinearCombinationBiasElementwiseIfffffLi4ENS2F_4ReLuIfEENS_4plusIfEELb0EfEENSB_ILi0ELi8EEELi2ELi1ELb1EEENS14_30GemmIdentityThreadblockSwizzleILi4EEELNS1_8OperatorE0ENS1_17Conv2dProblemSizeEEEEEvNT_6ParamsE:
  REG:96 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:872 TEXTURE:0 SURFACE:0 SAMPLER:0
hwu36 commented 1 year ago

what about the one without broadcast?

ShuaiShao93 commented 1 year ago
> --n=7 --h=1208 --w=1920 --c=4 --k=16 --r=4 --s=4 --stride_h=4 --stride_w=4

I think it's the same:

Fatbin ptx code:
================
arch = sm_75
code version = [7,8]
producer = <unknown>
host = linux
compile_size = 64bit
compressed

Fatbin elf code:
================
arch = sm_75
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

Resource usage:
 Common:
  GLOBAL:2
 Function _ZN7cutlass6KernelINS_4conv6kernel23ImplicitGemmConvolutionINS1_11threadblock21ImplicitGemmPipelinedINS_4gemm9GemmShapeILi128ELi64ELi32EEENS4_12TileIteratorINS4_52Conv2dFpropActivationTileAccessIteratorFixedChannelsINS_11MatrixShapeILi128ELi32EEENS_6half_tENS_6layout10TensorNHWCENS_9transform29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi32ELi128EEELi256ENSI_ILi4ELi8EEELi8EEENS_12AlignedArrayISD_Li4ELi8EEEEEEENSG_11threadblock19RegularTileIteratorISC_SD_NSE_37RowMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi0ESL_Li16EEENS9_INS4_48Conv2dFpropFilterTileAccessIteratorFixedChannelsINSB_ILi32ELi64EEESD_SF_NSH_INSI_ILi32ELi64EEELi256ESK_Li8EEESN_EEEENSR_ISW_SD_NSE_40ColumnMajorTensorOpMultiplicandCrosswiseILi16ELi32EEELi1ESY_Li16EEEfSF_NS6_11threadblock9MmaPolicyINS6_4warp11MmaTensorOpINS7_ILi64ELi16ELi32EEESD_ST_SD_S12_fNSE_8RowMajorENS16_17MmaTensorOpPolicyINS_4arch3MmaINS7_ILi16ELi8ELi8EEELi32ESD_S19_SD_NSE_11ColumnMajorEfS19_NS1B_13OpMultiplyAddEEENSB_ILi1ELi1EEEEELi1ELb0EbEENSB_ILi0ELi0EEES1K_Li1EEENS_21NumericArrayConverterISD_SD_Li16ELNS_15FloatRoundStyleE2ENSG_6thread14UnaryTransform8IdentityEEENS1M_ISD_SD_Li8ELS1N_2ES1Q_EEbEENS_8epilogue11threadblock8EpilogueIS8_S1J_Li1ENS1V_22PredicatedTileIteratorINS1V_26OutputTileOptimalThreadMapINS1V_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1Z_ILi1ELi8ELi1ELi1ELi8EEELi256ELi4ELi32EEEfLb0ENSE_9NoPermuteELb0EEENS1U_4warp24FragmentIteratorTensorOpIS18_S1D_fNS_5ArrayIfLi4ELb1EEES19_EENS25_20TileIteratorTensorOpIS18_S1D_fS19_EENS1V_18SharedLoadIteratorINS22_18CompactedThreadMapEfLi16EEENS1U_6thread17LinearCombinationIfLi4EffLNS2F_9ScaleType4KindE0ELS1N_2EfEENSB_ILi0ELi8EEELi2ELi1EEENS14_30GemmIdentityThreadblockSwizzleILi4EEELNS1_8OperatorE0ENS1_17Conv2dProblemSizeELNS1_9GroupModeE0EEEEEvNT_6ParamsE:
  REG:96 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:784 TEXTURE:0 SURFACE:0 SAMPLER:0
hwu36 commented 1 year ago

then i am out of ideas. what does nsight say?

ShuaiShao93 commented 1 year ago

> then i am out of ideas. what does nsight say?

On nsys it seems everything is identical:

no_broadcast

Begins: 2.01018s
Ends: 2.01128s (+1.104 ms)
grid:  <<<7928, 1, 1>>>
block: <<<256, 1, 1>>>
Launch Type: Regular
Static Shared Memory: 0 bytes
Dynamic Shared Memory: 24,576 bytes
Registers Per Thread: 96
Local Memory Per Thread: 0 bytes
Local Memory Total: 21,233,664 bytes
Shared Memory executed: 65,536 bytes
Shared Memory Bank Size: 4 B
Theoretical occupancy: 50 %
Launched from thread: 685260
Latency: ←3.336 ms
Correlation ID: 126
Stream: Default stream 7

broadcast

Begins: 1.64072s
Ends: 1.64535s (+4.631 ms)
grid:  <<<7928, 1, 1>>>
block: <<<256, 1, 1>>>
Launch Type: Regular
Static Shared Memory: 0 bytes
Dynamic Shared Memory: 24,576 bytes
Registers Per Thread: 96
Local Memory Per Thread: 0 bytes
Local Memory Total: 21,233,664 bytes
Shared Memory executed: 65,536 bytes
Shared Memory Bank Size: 4 B
Theoretical occupancy: 50 %
Launched from thread: 685849
Latency: ←23.171 ms
Correlation ID: 130
Stream: Default stream 7
hwu36 commented 1 year ago

any difference in the metrics, especially memory-related ones such as global load latency?

another way to dig is to comment out the broadcast code piece by piece to see which line causes the problem.

anyway, it is weird.
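
For reference, one way to collect such per-kernel memory metrics from the command line is Nsight Compute; a possible invocation, assuming `ncu` is installed (the `--section` and `--kernel-name` flag spellings vary slightly across ncu versions):

$ ncu --section MemoryWorkloadAnalysis --kernel-name regex:ImplicitGemm \
      examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop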

ShuaiShao93 commented 1 year ago

> any difference in the metrics, especially memory-related ones such as global load latency?
>
> another way to dig is to comment out the broadcast code piece by piece to see which line causes the problem.
>
> anyway, it is weird.

Is this available in Nsight Systems?

hwu36 commented 1 year ago

yes.

someone posted a screenshot in a github issue before.

mnicely commented 1 year ago

@ShuaiShao93 here are some tutorials on Nsight Systems https://developer.nvidia.com/nsight-systems/get-started#tutorials

mnicely commented 1 year ago

@ShuaiShao93 have you resolved your issues?

ShuaiShao93 commented 1 year ago

Sorry, I don't have the bandwidth to experiment with this now. Feel free to close the ticket, and I can reopen it if I get a chance to look into this.