flashinfer-ai / flashinfer

FlashInfer: Kernel Library for LLM Serving
https://flashinfer.ai
Apache License 2.0

[Install] Build error on main branch #195

Closed · esmeetu closed this issue 4 months ago

esmeetu commented 5 months ago

After #183, I can't build the main branch successfully.

Here is the error log:

 /home/roy/flashinfer/python/csrc/batch_decode.cu(86): warning #174-D: expression has no effect
          [&]() -> bool { switch (q.scalar_type()) { case at::ScalarType::Half: { using c_type = nv_half; return [&] { c_type* tmp = nullptr; return [&]() -> bool { switch (num_qo_heads / num_kv_heads) { case 1: { constexpr auto GROUP_SIZE = 1; return GROUP_SIZE, [&] { return [&]() -> bool { switch (head_dim) { case 64: { constexpr auto HEAD_DIM = 64; return HEAD_DIM, [&] { return [&]() -> bool { switch (PosEncodingMode(pos_encoding_mode)) { case PosEncodingMode::kNone: { constexpr auto POS_ENCODING_MODE = PosEncodingMode::kNone; return POS_ENCODING_MODE, [&] { return [&]() -> bool { switch (kv_layout) { case QKVLayout::kNHD: { constexpr auto KV_LAYOUT = QKVLayout::kNHD; return KV_LAYOUT, [&] { cudaError_t status = BatchDecodeWithPaddedKVCacheDispatched< GROUP_SIZE, HEAD_DIM, KV_LAYOUT, POS_ENCODING_MODE, c_type, c_type>( static_cast<c_type*>(q.data_ptr()), static_cast<c_type*>(k_padded.data_ptr()), static_cast<c_type*>(v_padded.data_ptr()), static_cast<c_type*>(o.data_ptr()), tmp, return_lse ? static_cast<float*>(lse.data_ptr()) : nullptr, batch_size, padded_kv_len, num_qo_heads, sm_scale, rope_scale, rope_theta, torch_current_stream); if (!(status == cudaSuccess)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(101), (::c10::detail::torchCheckMsgImpl( "Expected " "status == cudaSuccess" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", "BatchDecodeWithPaddedKVCache failed with error code ", status))); }; return true; }(); } case QKVLayout::kHND: { constexpr auto KV_LAYOUT = QKVLayout::kHND; return KV_LAYOUT, [&] { cudaError_t status = BatchDecodeWithPaddedKVCacheDispatched< GROUP_SIZE, HEAD_DIM, KV_LAYOUT, POS_ENCODING_MODE, c_type, c_type>( static_cast<c_type*>(q.data_ptr()), static_cast<c_type*>(k_padded.data_ptr()), static_cast<c_type*>(v_padded.data_ptr()), static_cast<c_type*>(o.data_ptr()), tmp, return_lse ? static_cast<float*>(lse.data_ptr()) : nullptr, batch_size, padded_kv_len, num_qo_heads, sm_scale, rope_scale, rope_theta, torch_current_stream); if (!(status == cudaSuccess)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(101), (::c10::detail::torchCheckMsgImpl( "Expected " "status == cudaSuccess" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", "BatchDecodeWithPaddedKVCache failed with error code ", status))); }; return true; }(); } default: std::ostringstream oss; oss << __PRETTY_FUNCTION__ << " failed to dispatch " "kv layout" " " << int(kv_layout); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(92), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false.  " "(Could this error message be improved?  
If so, " "please report an enhancement request to PyTorch.)", oss.str()))); }; return false; } }(); }(); } case PosEncodingMode::kRoPELlama: { constexpr auto POS_ENCODING_MODE = PosEncodingMode::kRoPELlama; return POS_ENCODING_MODE, [&] { return [&]() -> bool { switch (kv_layout) { case QKVLayout::kNHD: { constexpr auto KV_LAYOUT = QKVLayout::kNHD; return KV_LAYOUT, [&] { cudaError_t status = BatchDecodeWithPaddedKVCacheDispatched< GROUP_SIZE, HEAD_DIM, KV_LAYOUT, POS_ENCODING_MODE, c_type, c_type>( static_cast<c_type*>(q.data_ptr()), static_cast<c_type*>(k_padded.data_ptr()), static_cast<c_type*>(v_padded.data_ptr()), static_cast<c_type*>(o.data_ptr()), tmp, return_lse ? static_cast<float*>(lse.data_ptr()) : nullptr, batch_size, padded_kv_len, num_qo_heads, sm_scale, rope_scale, rope_theta, torch_current_stream); if (!(status == cudaSuccess)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(101), (::c10::detail::torchCheckMsgImpl( "Expected " "status == cudaSuccess" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", "BatchDecodeWithPaddedKVCache failed with error code ", status))); }; return true; }(); } case QKVLayout::kHND: { constexpr auto KV_LAYOUT = QKVLayout::kHND; return KV_LAYOUT, [&] { cudaError_t status = BatchDecodeWithPaddedKVCacheDispatched< GROUP_SIZE, HEAD_DIM, KV_LAYOUT, POS_ENCODING_MODE, c_type, c_type>( static_cast<c_type*>(q.data_ptr()), static_cast<c_type*>(k_padded.data_ptr()), static_cast<c_type*>(v_padded.data_ptr()), static_cast<c_type*>(o.data_ptr()), tmp, return_lse ? static_cast<float*>(lse.data_ptr()) : nullptr, batch_size, padded_kv_len, num_qo_heads, sm_scale, rope_scale, rope_theta, torch_current_stream); if (!(status == cudaSuccess)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(101), (::c10::detail::torchCheckMsgImpl( "Expected " "status == cudaSuccess" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", "BatchDecodeWithPaddedKVCache failed with error code ", status))); }; return true; }(); } default: std::ostringstream oss; oss << __PRETTY_FUNCTION__ << " failed to dispatch " "kv layout" " " << int(kv_layout); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(92), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", oss.str()))); }; return false; } }(); }(); } case PosEncodingMode::kALiBi: { constexpr auto POS_ENCODING_MODE = PosEncodingMode::kALiBi; return POS_ENCODING_MODE, [&] { return [&]() -> bool { switch (kv_layout) { case QKVLayout::kNHD: { constexpr auto KV_LAYOUT = QKVLayout::kNHD; return KV_LAYOUT, [&] { cudaError_t status = BatchDecodeWithPaddedKVCacheDispatched< GROUP_SIZE, HEAD_DIM, KV_LAYOUT, POS_ENCODING_MODE, c_type, c_type>( static_cast<c_type*>(q.data_ptr()), static_cast<c_type*>(k_padded.data_ptr()), static_cast<c_type*>(v_padded.data_ptr()), static_cast<c_type*>(o.data_ptr()), tmp, return_lse ? 
static_cast<float*>(lse.data_ptr()) : nullptr, batch_size, padded_kv_len, num_qo_heads, sm_scale, rope_scale, rope_theta, torch_current_stream); if (!(status == cudaSuccess)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(101), (::c10::detail::torchCheckMsgImpl( "Expected " "status == cudaSuccess" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", "BatchDecodeWithPaddedKVCache failed with error code ", status))); }; return true; }(); } case QKVLayout::kHND: { constexpr auto KV_LAYOUT = QKVLayout::kHND; return KV_LAYOUT, [&] { cudaError_t status = BatchDecodeWithPaddedKVCacheDispatched< GROUP_SIZE, HEAD_DIM, KV_LAYOUT, POS_ENCODING_MODE, c_type, c_type>( static_cast<c_type*>(q.data_ptr()), static_cast<c_type*>(k_padded.data_ptr()), static_cast<c_type*>(v_padded.data_ptr()), static_cast<c_type*>(o.data_ptr()), tmp, return_lse ? static_cast<float*>(lse.data_ptr()) : nullptr, batch_size, padded_kv_len, num_qo_heads, sm_scale, rope_scale, rope_theta, torch_current_stream); if (!(status == cudaSuccess)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(101), (::c10::detail::torchCheckMsgImpl( "Expected " "status == cudaSuccess" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", "BatchDecodeWithPaddedKVCache failed with error code ", status))); }; return true; }(); } default: std::ostringstream oss; oss << __PRETTY_FUNCTION__ << " failed to dispatch " "kv layout" " " << int(kv_layout); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(92), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", oss.str()))); }; return false; } }(); }(); } default: std::ostringstream oss; oss << __PRETTY_FUNCTION__ << " failed to dispatch " "positional encoding mode" " " << int(PosEncodingMode(pos_encoding_mode)); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(90), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", oss.str()))); }; return false; } }(); }(); } case 128: { constexpr auto HEAD_DIM = 128; return HEAD_DIM, [&] { return [&]() -> bool { switch (PosEncodingMode(pos_encoding_mode)) { case PosEncodingMode::kNone: { constexpr auto POS_ENCODING_MODE = PosEncodingMode::kNone; return POS_ENCODING_MODE, [&] { return [&]() -> bool { switch (kv_layout) { case QKVLayout::kNHD: { constexpr auto KV_LAYOUT = QKVLayout::kNHD; return KV_LAYOUT, [&] { cudaError_t status = BatchDecodeWithPaddedKVCacheDispatched< GROUP_SIZE, HEAD_DIM, KV_LAYOUT, POS_ENCODING_MODE, c_type, c_type>( static_cast<c_type*>(q.data_ptr()), static_cast<c_type*>(k_padded.data_ptr()), static_cast<c_type*>(v_padded.data_ptr()), static_cast<c_type*>(o.data_ptr()), tmp, return_lse ? 
static_cast<float*>(lse.data_ptr()) : nullptr, batch_size, padded_kv_len, num_qo_heads, sm_scale, rope_scale, rope_theta, torch_current_stream); if (!(status == cudaSuccess)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(101), (::c10::detail::torchCheckMsgImpl( "Expected " "status == cudaSuccess" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", "BatchDecodeWithPaddedKVCache failed with error code ", status))); }; return true; }(); } case QKVLayout::kHND: { constexpr auto KV_LAYOUT = QKVLayout::kHND; return KV_LAYOUT, [&] { cudaError_t status = BatchDecodeWithPaddedKVCacheDispatched< GROUP_SIZE, HEAD_DIM, KV_LAYOUT, POS_ENCODING_MODE, c_type, c_type>( static_cast<c_type*>(q.data_ptr()), static_cast<c_type*>(k_padded.data_ptr()), static_cast<c_type*>(v_padded.data_ptr()), static_cast<c_type*>(o.data_ptr()), tmp, return_lse ? static_cast<float*>(lse.data_ptr()) : nullptr, batch_size, padded_kv_len, num_qo_heads, sm_scale, rope_scale, rope_theta, torch_current_stream); if (!(status == cudaSuccess)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(101), (::c10::detail::torchCheckMsgImpl( "Expected " "status == cudaSuccess" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", "BatchDecodeWithPaddedKVCache failed with error code ", status))); }; return true; }(); } default: std::ostringstream oss; oss << __PRETTY_FUNCTION__ << " failed to dispatch " "kv layout" " " << int(kv_layout); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(92), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", oss.str()))); }; return false; } }(); }(); } case PosEncodingMode::kRoPELlama: { constexpr auto POS_ENCODING_MODE = PosEncodingMode::kRoPELlama; return POS_ENCODING_MODE, [&] { return [&]() -> bool { switch (kv_layout) { case QKVLayout::kNHD: { constexpr auto KV_LAYOUT = QKVLayout::kNHD; return KV_LAYOUT, [&] { cudaError_t status = BatchDecodeWithPaddedKVCacheDispatched< GROUP_SIZE, HEAD_DIM, KV_LAYOUT, POS_ENCODING_MODE, c_type, c_type>( static_cast<c_type*>(q.data_ptr()), static_cast<c_type*>(k_padded.data_ptr()), static_cast<c_type*>(v_padded.data_ptr()), static_cast<c_type*>(o.data_ptr()), tmp, return_lse ? static_cast<float*>(lse.data_ptr()) : nullptr, batch_size, padded_kv_len, num_qo_heads, sm_scale, rope_scale, rope_theta, torch_current_stream); if (!(status == cudaSuccess)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(101), (::c10::detail::torchCheckMsgImpl( "Expected " "status == cudaSuccess" " to be true, but got false.  " "(Could this error message be improved?  
If so, " "please report an enhancement request to PyTorch.)", "BatchDecodeWithPaddedKVCache failed with error code ", status))); }; return true; }(); } case QKVLayout::kHND: { constexpr auto KV_LAYOUT = QKVLayout::kHND; return KV_LAYOUT, [&] { cudaError_t status = BatchDecodeWithPaddedKVCacheDispatched< GROUP_SIZE, HEAD_DIM, KV_LAYOUT, POS_ENCODING_MODE, c_type, c_type>( static_cast<c_type*>(q.data_ptr()), static_cast<c_type*>(k_padded.data_ptr()), static_cast<c_type*>(v_padded.data_ptr()), static_cast<c_type*>(o.data_ptr()), tmp, return_lse ? static_cast<float*>(lse.data_ptr()) : nullptr, batch_size, padded_kv_len, num_qo_heads, sm_scale, rope_scale, rope_theta, torch_current_stream); if (!(status == cudaSuccess)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(101), (::c10::detail::torchCheckMsgImpl( "Expected " "status == cudaSuccess" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", "BatchDecodeWithPaddedKVCache failed with error code ", status))); }; return true; }(); } default: std::ostringstream oss; oss << __PRETTY_FUNCTION__ << " failed to dispatch " "kv layout" " " << int(kv_layout); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(92), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", oss.str()))); }; return false; } }(); }(); } case PosEncodingMode::kALiBi: { constexpr auto POS_ENCODING_MODE = PosEncodingMode::kALiBi; return POS_ENCODING_MODE, [&] { return [&]() -> bool { switch (kv_layout) { case QKVLayout::kNHD: { constexpr auto KV_LAYOUT = QKVLayout::kNHD; return KV_LAYOUT, [&] { cudaError_t status = BatchDecodeWithPaddedKVCacheDispatched< GROUP_SIZE, HEAD_DIM, KV_LAYOUT, POS_ENCODING_MODE, c_type, c_type>( static_cast<c_type*>(q.data_ptr()), static_cast<c_type*>(k_padded.data_ptr()), static_cast<c_type*>(v_padded.data_ptr()), static_cast<c_type*>(o.data_ptr()), tmp, return_lse ? static_cast<float*>(lse.data_ptr()) : nullptr, batch_size, padded_kv_len, num_qo_heads, sm_scale, rope_scale, rope_theta, torch_current_stream); if (!(status == cudaSuccess)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(101), (::c10::detail::torchCheckMsgImpl( "Expected " "status == cudaSuccess" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", "BatchDecodeWithPaddedKVCache failed with error code ", status))); }; return true; }(); } case QKVLayout::kHND: { constexpr auto KV_LAYOUT = QKVLayout::kHND; return KV_LAYOUT, [&] { cudaError_t status = BatchDecodeWithPaddedKVCacheDispatched< GROUP_SIZE, HEAD_DIM, KV_LAYOUT, POS_ENCODING_MODE, c_type, c_type>( static_cast<c_type*>(q.data_ptr()), static_cast<c_type*>(k_padded.data_ptr()), static_cast<c_type*>(v_padded.data_ptr()), static_cast<c_type*>(o.data_ptr()), tmp, return_lse ? 
static_cast<float*>(lse.data_ptr()) : nullptr, batch_size, padded_kv_len, num_qo_heads, sm_scale, rope_scale, rope_theta, torch_current_stream); if (!(status == cudaSuccess)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(101), (::c10::detail::torchCheckMsgImpl( "Expected " "status == cudaSuccess" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", "BatchDecodeWithPaddedKVCache failed with error code ", status))); }; return true; }(); } default: std::ostringstream oss; oss << __PRETTY_FUNCTION__ << " failed to dispatch " "kv layout" " " << int(kv_layout); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/home/roy/flashinfer/python/csrc/batch_decode.cu", static_cast<uint32_t>(92), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false.  " "(Could this error message be improved?  If so, " "please report an enhancement request to PyTorch.)", oss.str()))); }; return false; } }(); }(); } default: std::ostringstream oss; oss << __PRETTY_FUNCTION__ << " failed to dispatch " "positional encoding mode" " " << 
...
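For context, the `warning #174-D: expression has no effect` lines appear to come from the macro-expanded dispatch code shown above: each `switch` case defines a `constexpr` constant and then returns a comma expression whose left operand is that constant and whose right operand is an immediately invoked lambda, so the left operand is discarded. Below is a minimal sketch of that pattern (hypothetical names such as `launch_stub` and `dispatch`; not the actual FlashInfer macros), just to illustrate what nvcc is flagging:

```cpp
#include <cstdio>

template <int GROUP_SIZE>
bool launch_stub() {
  // Stand-in for the real kernel launch, e.g.
  // BatchDecodeWithPaddedKVCacheDispatched<GROUP_SIZE, ...>(...).
  std::printf("dispatched with GROUP_SIZE=%d\n", GROUP_SIZE);
  return true;
}

bool dispatch(int group_size) {
  switch (group_size) {
    case 1: {
      constexpr auto GROUP_SIZE = 1;
      // Comma expression: GROUP_SIZE is evaluated and discarded, then the
      // lambda runs. This mirrors the "return GROUP_SIZE, [&] { ... }();"
      // statements in the expanded log and is what triggers the
      // "expression has no effect" warning.
      return GROUP_SIZE, [&] { return launch_stub<GROUP_SIZE>(); }();
    }
    default:
      return false;
  }
}

int main() { return dispatch(1) ? 0 : 1; }
```

The warning by itself is benign (the discarded constant is intentional), so the actual build failure should be elsewhere in the log.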