Closed: ezhulenev closed this issue 4 months ago
My end goal is to be able to compile both the host-side and device-side code for this H100 GEMM:
```cpp
///////////////////////////////////////////////////////////////////////////////////////////////////

#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"

#include "library_internal.h"
#include "gemm_operation.h"
#include "gemm_operation_3x.hpp"

#include "cutlass/arch/wmma.h"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"

///////////////////////////////////////////////////////////////////////////////////////////////////

using cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_256x128x64_1x2x1_0_ttn_align8_stream_k_warpspecialized_cooperative_epi_nosmem_epilogue =
  typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
    cute::Shape<cute::_256, cute::_128, cute::_64>,
    cute::Shape<cute::_1, cute::_2, cute::_1>,
    cutlass::epilogue::collective::EpilogueTileAuto,
    float, float,
    cutlass::bfloat16_t, cutlass::layout::ColumnMajor, 8,
    cutlass::bfloat16_t, cutlass::layout::ColumnMajor, 8,
    cutlass::epilogue::NoSmemWarpSpecialized
  >::CollectiveOp;

using cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_256x128x64_1x2x1_0_ttn_align8_stream_k_warpspecialized_cooperative_epi_nosmem_mainloop =
  typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
    cutlass::bfloat16_t, cutlass::layout::RowMajor, 8,
    cutlass::bfloat16_t, cutlass::layout::RowMajor, 8,
    float,
    cute::Shape<cute::_256, cute::_128, cute::_64>,
    cute::Shape<cute::_1, cute::_2, cute::_1>,
    cutlass::gemm::collective::StageCountAutoCarveout<sizeof(typename cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_256x128x64_1x2x1_0_ttn_align8_stream_k_warpspecialized_cooperative_epi_nosmem_epilogue::SharedStorage)>,
    cutlass::gemm::KernelTmaWarpSpecializedCooperative
  >::CollectiveOp;

// Gemm operator cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_256x128x64_1x2x1_0_ttn_align8_stream_k_warpspecialized_cooperative_epi_nosmem
using cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_256x128x64_1x2x1_0_ttn_align8_stream_k_warpspecialized_cooperative_epi_nosmem_base =
  cutlass::gemm::kernel::GemmUniversal<
    cute::Shape<int, int, int, int>,
    cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_256x128x64_1x2x1_0_ttn_align8_stream_k_warpspecialized_cooperative_epi_nosmem_mainloop,
    cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_256x128x64_1x2x1_0_ttn_align8_stream_k_warpspecialized_cooperative_epi_nosmem_epilogue,
    cutlass::gemm::StreamKScheduler>;

// Define named type
struct cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_256x128x64_1x2x1_0_ttn_align8_stream_k_warpspecialized_cooperative_epi_nosmem :
  public cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_256x128x64_1x2x1_0_ttn_align8_stream_k_warpspecialized_cooperative_epi_nosmem_base { };

///////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace library {

///////////////////////////////////////////////////////////////////////////////////////////////////

void initialize_cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_256x128x64_1x2x1_0_ttn_align8_stream_k_warpspecialized_cooperative_epi_nosmem(Manifest &manifest) {
  using GemmKernel = cutlass::gemm::device::GemmUniversalAdapter<
      cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_256x128x64_1x2x1_0_ttn_align8_stream_k_warpspecialized_cooperative_epi_nosmem>;

  manifest.append(new GemmUniversal3xOperation<GemmKernel>(
      "cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_256x128x64_1x2x1_0_ttn_align8_stream_k_warpspecialized_cooperative_epi_nosmem"));
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace library
} // namespace cutlass
```
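On the host side, the launch I'm aiming for is the standard CUTLASS 3.x adapter flow. A minimal sketch, assuming the pattern from the Hopper examples (the `run_gemm` wrapper, problem sizes, and device pointers below are placeholders, not part of the generated file):

```cpp
// Host-side launch sketch, following the usual CUTLASS 3.x adapter flow.
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/util/device_memory.h"
#include "cutlass/util/packed_stride.hpp"

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<
    cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_256x128x64_1x2x1_0_ttn_align8_stream_k_warpspecialized_cooperative_epi_nosmem>;

cutlass::Status run_gemm(int M, int N, int K, int L,
                         cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B,
                         cutlass::bfloat16_t const* C, cutlass::bfloat16_t* D,
                         cudaStream_t stream = nullptr) {
  // Packed strides matching the kernel's ttn layouts
  // (A and B row-major, C/D column-major).
  auto stride_A = cutlass::make_cute_packed_stride(
      typename Gemm::GemmKernel::StrideA{}, cute::make_shape(M, K, L));
  auto stride_B = cutlass::make_cute_packed_stride(
      typename Gemm::GemmKernel::StrideB{}, cute::make_shape(N, K, L));
  auto stride_C = cutlass::make_cute_packed_stride(
      typename Gemm::GemmKernel::StrideC{}, cute::make_shape(M, N, L));
  auto stride_D = cutlass::make_cute_packed_stride(
      typename Gemm::GemmKernel::StrideD{}, cute::make_shape(M, N, L));

  typename Gemm::Arguments args{
      cutlass::gemm::GemmUniversalMode::kGemm,
      {M, N, K, L},                                // problem shape
      {A, stride_A, B, stride_B},                  // mainloop arguments
      {{1.0f, 0.0f}, C, stride_C, D, stride_D}};   // epilogue: alpha=1, beta=0

  Gemm gemm;
  if (Gemm::can_implement(args) != cutlass::Status::kSuccess) {
    return cutlass::Status::kErrorNotSupported;
  }

  // The stream-k scheduler needs device workspace.
  cutlass::device_memory::allocation<uint8_t> workspace(
      Gemm::get_workspace_size(args));

  cutlass::Status status = gemm.initialize(args, workspace.get(), stream);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }
  return gemm.run(stream);
}
```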
Once you get past the `sizeof_bits_v` error and merge https://github.com/NVIDIA/cutlass/pull/1200, it hits many more new errors, because Clang is stricter about how templates are defined.
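To give a flavor of that strictness, here is a classic pitfall that Clang diagnoses eagerly. This is only my guess at the kind of failure involved, not the actual CUTLASS code or diagnostic:

```cpp
// Hypothetical stand-in for cutlass::sizeof_bits_v -- not the real definition.
template <class T>
inline constexpr int sizeof_bits_v = sizeof(T) * 8;

// This use implicitly instantiates sizeof_bits_v<bool>.
inline constexpr int bool_bits = sizeof_bits_v<bool>;

// An explicit specialization after that instantiation is ill-formed
// (no diagnostic required). Clang rejects it as a specialization
// appearing after instantiation; other front ends may accept it silently.
template <>
inline constexpr int sizeof_bits_v<bool> = 1;
```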
@mhoemmen
@ezhulenev can you provide the following versions?
- Clang: clang version 18.0.0git (https://github.com/llvm/llvm-project.git a855b2c894444419c3689aff6fd0381fdeb02491)
- CUDA: 12.2
- CUTLASS: top of main branch
@ezhulenev Thanks for the report! I've built (not necessarily that commit) with Clang 17, which is the latest released version, as well as 16 and 14. Do you have build issues with other versions of Clang, or just 18.0.0?
This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days. Please close this issue if no further response or action is needed. Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. This issue will be labeled `inactive-90d` if there is no activity in the next 60 days.
Closing due to inactivity
Clang built from source: https://clang.llvm.org/get_started.html
main.cpp
build command
Error:
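For anyone trying to reproduce the setup, a representative direct-Clang invocation for a translation unit like the one above would look roughly like this. Every path here and the flag choices are my assumptions, not the command actually used:

```sh
# Illustrative only: $CUTLASS_DIR is assumed to be a CUTLASS checkout and
# CUDA 12.2 is assumed under /usr/local/cuda-12.2.
# CUTLASS 3.x requires C++17; the Hopper TMA/WGMMA kernels need sm_90a,
# and the TMA host-side setup uses the CUDA driver API (hence -lcuda).
clang++ -x cuda main.cpp -o gemm_sm90 \
  -std=c++17 \
  --cuda-path=/usr/local/cuda-12.2 \
  --cuda-gpu-arch=sm_90a \
  -I"$CUTLASS_DIR/include" \
  -I"$CUTLASS_DIR/tools/library/include" \
  -I"$CUTLASS_DIR/tools/library/src" \
  -I"$CUTLASS_DIR/tools/util/include" \
  -L/usr/local/cuda-12.2/lib64 -lcuda -lcudart
```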