## Nexfort

### How to use Nexfort

Set the environment variables, then launch ComfyUI:

```bash
cd ComfyUI

# For CUDA Graphs
export NEXFORT_FX_CUDAGRAPHS=1
# For best performance
export TORCHINDUCTOR_MAX_AUTOTUNE=1
# Enable cuDNN benchmark
export NEXFORT_FX_CONV_BENCHMARK=1
# Faster float32 matmul
export NEXFORT_FX_MATMUL_ALLOW_TF32=1
# For graph cache to speed up compilation
export TORCHINDUCTOR_FX_GRAPH_CACHE=1
# For a persistent cache dir
export TORCHINDUCTOR_CACHE_DIR=~/.torchinductor

# Debugging
# export TORCH_LOGS="+dynamo"
# export TORCHDYNAMO_VERBOSE=1
# export NEXFORT_DEBUG=1 NEXFORT_FX_DUMP_GRAPH=1 TORCH_COMPILE_DEBUG=1

python main.py --gpu-only --disable-cuda-malloc --port 8188 --cuda-device 6
```
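For reference, the conv and matmul toggles above correspond to standard PyTorch backend switches. Below is a minimal sketch of what they enable, assuming nexfort forwards these `NEXFORT_FX_*` variables to the usual `torch.backends` flags (that mapping is an assumption, not documented behavior):

```python
import torch

# Presumed effect of NEXFORT_FX_MATMUL_ALLOW_TF32=1: allow TF32 tensor cores
# for float32 matmuls (faster, with slightly reduced precision).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Presumed effect of NEXFORT_FX_CONV_BENCHMARK=1: let cuDNN benchmark
# candidate convolution algorithms per input shape and cache the fastest.
torch.backends.cudnn.benchmark = True
```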
#### Case 1

Compile an arbitrary model (`torch.nn.Module`):

```python
import torch
import onediff.infer_compiler as infer_compiler


class MyModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.lin = torch.nn.Linear(100, 10)

    def forward(self, x):
        return torch.nn.functional.relu(self.lin(x))


mod = MyModule().to("cuda").half()
with torch.inference_mode():
    compiled_mod = infer_compiler.compile(
        mod,
        backend="nexfort",
        options={"mode": "max-autotune:cudagraphs", "dynamic": True, "fullgraph": True},
    )
    print(compiled_mod(torch.randn(10, 100, device="cuda").half()))
```
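The "first compilation time" numbers reported below can be reproduced by timing the first call (which triggers compilation) against a steady-state call. A minimal sketch reusing `compiled_mod` from Case 1; the `timed` helper here is ours, not part of onediff:

```python
import time

import torch


def timed(fn):
    # Synchronize before and after so the measurement covers the full
    # GPU-side cost, not just the asynchronous kernel launch.
    torch.cuda.synchronize()
    start = time.perf_counter()
    out = fn()
    torch.cuda.synchronize()
    return out, time.perf_counter() - start


with torch.inference_mode():
    x = torch.randn(10, 100, device="cuda").half()
    _, first = timed(lambda: compiled_mod(x))   # pays the compilation cost
    _, steady = timed(lambda: compiled_mod(x))  # hits the compiled graph
print(f"first call: {first:.2f}s, steady state: {steady:.4f}s")
```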
#### Case 2

Compile an arbitrary function with the decorator form:

```python
import torch
import onediff.infer_compiler as infer_compiler


@infer_compiler.compile(
    backend="nexfort",
    options={"mode": "max-autotune:cudagraphs", "dynamic": True, "fullgraph": True},
)
def foo(x):
    return torch.sin(x) + torch.cos(x)


print(foo(torch.randn(10, 10, device="cuda").half()))
```
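Since both cases pass `"dynamic": True`, the compiled function should tolerate varying input shapes without recompiling for each one. A small sketch; how aggressively nexfort specializes on shapes is its own implementation detail, so treat this as an assumption:

```python
# With "dynamic": True, calls that differ only in shape should reuse the
# same compiled artifact instead of each triggering a fresh compilation.
print(foo(torch.randn(10, 10, device="cuda").half()))
print(foo(torch.randn(32, 32, device="cuda").half()))
print(foo(torch.randn(7, 128, device="cuda").half()))
```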
### VAE

#### ComfyUI Workflow

#### Result

First compilation time: 321.92 seconds
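This first-compilation cost is paid once per graph; with `TORCHINDUCTOR_FX_GRAPH_CACHE=1` and the persistent `TORCHINDUCTOR_CACHE_DIR` set in the launch script, later runs should reuse the compiled artifacts. A quick sanity check that the cache is being written; the cache layout is an inductor implementation detail, so this only confirms the directory is non-empty:

```python
from pathlib import Path

cache_dir = Path("~/.torchinductor").expanduser()
# After the first compilation this directory should exist and be non-empty;
# if it stays empty, the env vars were probably not exported in this shell.
entries = list(cache_dir.rglob("*")) if cache_dir.exists() else []
print(f"{cache_dir}: {len(entries)} cached entries")
```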
### LoRA

#### ComfyUI Workflow

#### Result

First compilation time: 878.19 seconds
### ControlNet

#### ComfyUI Workflow

#### Result

First compilation time: 437.84 seconds
### IPAdapter