Closed yudi0201 closed 8 months ago
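Describe the bug hidet.ops.conv2d with fp16 fails to compile for CUDA: ptxas (invoked by nvcc) aborts because the generated kernel uses features ('cp.async', '.m16n8k16') that require .target sm_80 or higher, while the build targets sm_75.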
Compiling cuda task conv_gemm_fp16_pk(img=float16(1, 224, 224, 3), weight=float16(392, 64), c=float16(3, 1, 112, 112, 64), stride=(2, 2), padding=[3, 3, 5], dilations=(1, 1), orig_weight_shape=[64, 8, 7, 7], groups=1, parallel_k_parts=3, disable_cp_async=False)... Traceback (most recent call last): File "python/hidet/cuda/cudnn/temp.py", line 14, in <module> hidet_conv2d('float16', 1, 3, 224, 224, 64, 7, 7, (3, 3), (2, 2), (1, 1)) File "python/hidet/cuda/cudnn/temp.py", line 11, in hidet_conv2d graph = graph.cuda_graph() File "/home/yudi/hidet/python/hidet/graph/flow_graph.py", line 429, in cuda_graph return CudaGraph(f_create_inputs, f_run, ref_objs=[self]) File "/home/yudi/hidet/python/hidet/cuda/graph.py", line 127, in __init__ f_run(self._inputs) File "/home/yudi/hidet/python/hidet/graph/flow_graph.py", line 427, in f_run return self.forward(inputs) File "/home/yudi/hidet/python/hidet/graph/flow_graph.py", line 239, in forward self._build_nodes() File "/home/yudi/hidet/python/hidet/graph/flow_graph.py", line 207, in _build_nodes hidet.drivers.build_task_batch(tasks) File "/home/yudi/hidet/python/hidet/drivers/build_task.py", line 322, in build_task_batch raise RuntimeError('\n'.join(msg)) RuntimeError: Failed to build 1 tasks: [cuda] conv_gemm_fp16_pk(img=float16(1, 224, 224, 3), weight=float16(392, 64), c=float16(3, 1, 112, 112, 64), stride=(2, 2), padding=[3, 3, 5], dilations=(1, 1), orig_weight_shape=[64, 8, 7, 7], groups=1, parallel_k_parts=3, disable_cp_async=False) Traceback (most recent call last): File "/home/yudi/hidet/python/hidet/drivers/build_task.py", line 298, in build_job task.build(target, load=False) File "/home/yudi/hidet/python/hidet/ir/task.py", line 273, in build return build_task(self, target=target, load=load) File "/home/yudi/hidet/python/hidet/drivers/build_task.py", line 283, in build_task build_task_module(task, candidates, task_dir, target) File "/home/yudi/hidet/python/hidet/drivers/build_task.py", line 160, in build_task_module 
build_ir_module(ir_module=task_ir_module, output_dir=task_dir, output_kind='.so', target=target) File "/home/yudi/hidet/python/hidet/drivers/build_module.py", line 156, in build_ir_module compile_source( File "/home/yudi/hidet/python/hidet/backend/build.py", line 313, in compile_source compiler.compile( File "/home/yudi/hidet/python/hidet/backend/build.py", line 193, in compile self.run_compile_command(" ".join(command), src_path, out_lib_path) File "/home/yudi/hidet/python/hidet/backend/build.py", line 77, in run_compile_command raise CompilationFailed(src_path, message) hidet.backend.build.CompilationFailed: failed to compile file:///home/yudi/.cache/hidet/ops/cuda_space_0/conv_gemm_fp16_pk/66d7ae2b4070ca6f/source.cu Command: /usr/local/cuda/bin/nvcc -I/home/yudi/hidet/include -L/home/yudi/hidet/build/lib -O3 -Xcompiler -fPIC,-m64,-march=znver1,-O3,-funroll-loops,-ffast-math -std=c++11 -gencode arch=compute_75,code=sm_75 --ptxas-options=-v -lineinfo -ftz=true -prec-div=false -lhidet_runtime --cudart shared --diag-suppress 177 --diag-suppress 179 --diag-suppress 39 --shared /home/yudi/.cache/hidet/ops/cuda_space_0/conv_gemm_fp16_pk/66d7ae2b4070ca6f/source.cu -o /home/yudi/.cache/hidet/ops/cuda_space_0/conv_gemm_fp16_pk/66d7ae2b4070ca6f/lib.so ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 754; error : Feature 'cp.async' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 774; error : Feature 'cp.async' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 780; error : Feature 'cp.async.wait_all' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1300; error : Feature 'cp.async' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1322; error : Feature 'cp.async' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1437; error : Feature '.m16n8k16' requires .target sm_80 or higher 
ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1440; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1443; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1446; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1449; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1452; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1455; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1458; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1461; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1464; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1467; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1470; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1473; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1476; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1479; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1482; error : Feature '.m16n8k16' requires .target sm_80 or higher ptxas /tmp/tmpxft_003a2977_00000000-6_source.ptx, line 1488; error : Feature 'cp.async.wait_all' requires .target sm_80 or higher ptxas fatal : Ptx assembly aborted due 
to errors
To Reproduce The following script reproduces this compile error:
import hidet from hidet import ops def hidet_conv2d(dtype, n, c, h, w, k, r, s, padding, stride, dilations): tensor_x = hidet.symbol((n, c, h, w), device='cuda', dtype=dtype) tensor_w = hidet.randn((k, c, r, s), device='cuda', dtype=dtype) output = ops.conv2d(tensor_x, tensor_w, stride=stride, dilations=dilations, padding=padding) graph = hidet.trace_from(output, inputs=[tensor_x, tensor_w]) graph = hidet.graph.optimize(graph) graph = graph.cuda_graph() if __name__ == '__main__': hidet_conv2d('float16', 1, 3, 224, 224, 64, 7, 7, (3, 3), (2, 2), (1, 1))
Expected behavior The compilation should pass and I should be able to run conv2d with fp16.
Environment
It seems this schedule does not support the 2080 Ti (Turing architecture, sm_75): it emits instructions that require sm_80. You should run this op on a GPU with the Ampere architecture (sm_80) or newer.
I see. Thanks - I'll give that a try.
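Describe the bug hidet.ops.conv2d with fp16 fails to compile for CUDA: ptxas (invoked by nvcc) aborts because the generated kernel uses features that require .target sm_80 or higher, while the build targets sm_75.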
To Reproduce The following script reproduces this compile error:
Expected behavior The compilation should pass and I should be able to run conv2d with fp16.
Environment