Cuda runtime error in trt-llm gemm

环境

If applicable, please include the following: CPU architecture: x86_64 GPU properties GPU name: NVIDIA A10 Clock frequencies used: None Libraries TensorRT branch: 9.0.0 TensorRT LLM: 0.1.3 Cuda: 12.1.66 Cudnn: unknown Container: registry.cn-hangzhou.aliyuncs.com%2ftrt-hackathon%2ftrt-hackathon%3afinal_v1 NVIDIA driver version: 525.105.17 OS: Ubuntu 5.15.0-73

复现步骤：进入trtllm根目录，cd tests/quantization 执行： python -m unittest test_smooth_quant_gemm.py TestSmoothQuantGemm.test_matmul 即可看到cuda runtime error：what(): [TensorRT-LLM Error][int8gemm Runner] Failed to run cutlass int8 gemm. Error: Error Internal 报错代码位于 ./3rdparty/cutlass/include/cutlass/gemm/device/gemm_universal_base.h 的initialize函数，cudaFuncSetAttribute返回的cudaerror_t为1, 没有继续检察后续代码，更改一些参数重新编译仍然报错


Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {

    CUTLASS_TRACE_HOST("GemmUniversalBase::initialize() - workspace " 
      << workspace << ", stream: " << (stream ? "non-null" : "null"));

    size_t workspace_bytes = get_workspace_size(args);

    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);

    if (workspace_bytes) {

      if (!workspace) {
        CUTLASS_TRACE_HOST("  error: device workspace must not be null");

        return Status::kErrorWorkspaceNull;
      }

      if (args.mode == GemmUniversalMode::kGemm) {
        CUTLASS_TRACE_HOST("  clearing device workspace");
        cudaError_t result = cudaMemsetAsync(workspace, 0, workspace_bytes, stream);

        if (result != cudaSuccess) {
          CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(result));

          return Status::kErrorInternal;
        }
      }
    }

    // Get CUDA grid shape
    cutlass::gemm::GemmCoord grid_tiled_shape;
    int gemm_k_size = 0;

    get_grid_shape_(grid_tiled_shape, gemm_k_size, args);

    // Initialize the Params structure
    params_ = typename GemmKernel::Params(
      args,
      grid_tiled_shape,
      gemm_k_size,
      static_cast<int *>(workspace)
    );

    // Specify shared memory capacity for kernel. 
    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));

    if (smem_size >= (48 << 10)) {
      cudaError_t result = cudaFuncSetAttribute(Kernel<GemmKernel>,
                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
                                    smem_size);

      if (result != cudaSuccess) {
        return Status::kErrorInternal;
      }
    }

    return Status::kSuccess;
  }

### Tasks

NVIDIA / trt-samples-for-hackathon-cn

Cuda runtime error in trt-llm gemm #91