xiezhipeng-git opened 1 day ago
I'd guess something is up with your version of Triton. Did you perhaps install a very old version on top of the one provided by PyTorch?
That seems to be the case. I'll try to update it.
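Since `import triton` itself can crash in a broken setup, one way to check which Triton version is installed is to read the package metadata without importing it (a minimal sketch, not from the original thread):

```python
# Read the installed Triton version from distribution metadata only;
# this works even when "import triton" itself raises, as it does
# later in this thread.
from importlib.metadata import version

print(version("triton"))
```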
File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\model_executor\layers\fused_moe\fused_moe.py:8
      5 from typing import Any, Callable, Dict, Optional, Tuple
      7 import torch
----> 8 import triton
      9 import triton.language as tl
     11 import vllm.envs as envs

File d:\my\env\python3.10.10\lib\site-packages\triton\__init__.py:8
      2 __version__ = '3.0.0'
      4 # ---------------------------------------
      5 # Note: import order is significant here.
      6
      7 # submodules
----> 8 from .runtime import (
      9     autotune,
     10     Config,
     11     heuristics,
     12     JITFunction,
     13     KernelInterface,
     14     reinterpret,
     15     TensorWrapper,
     16     OutOfResources,
     17     InterpreterError,
     18     MockTensor,
     19 )
     20 from .runtime.jit import jit
     21 from .compiler import compile, CompilationError

File d:\my\env\python3.10.10\lib\site-packages\triton\runtime\__init__.py:1
----> 1 from .autotuner import (Autotuner, Config, Heuristics, autotune, heuristics)
      2 from .cache import RedisRemoteCacheBackend, RemoteCacheBackend
      3 from .driver import driver

File d:\my\env\python3.10.10\lib\site-packages\triton\runtime\autotuner.py:9
      6 import inspect
      7 from typing import Dict
----> 9 from ..testing import do_bench, do_bench_cudagraph
     10 from .jit import KernelInterface
     11 from .errors import OutOfResources

File d:\my\env\python3.10.10\lib\site-packages\triton\testing.py:7
      5 from contextlib import contextmanager
      6 from typing import Any, Dict, List
----> 7 from . import language as tl
     10 def nvsmi(attrs):
     11     attrs = ','.join(attrs)

File d:\my\env\python3.10.10\lib\site-packages\triton\language\__init__.py:4
      1 """isort:skip_file"""
      2 # Import order is significant here.
----> 4 from . import math
      5 from . import extra
      6 from .standard import (
      7     argmax,
      8     argmin,
    (...)
     24     zeros_like,
     25 )

File d:\my\env\python3.10.10\lib\site-packages\triton\language\math.py:1
----> 1 from . import core
      2 from . import semantic
      3 from functools import wraps

File d:\my\env\python3.10.10\lib\site-packages\triton\language\core.py:10
      8 from typing import Union, Callable, List, Sequence, TypeVar, Optional
      9 import builtins
---> 10 from ..runtime.jit import jit
     11 import inspect
     12 import os

File d:\my\env\python3.10.10\lib\site-packages\triton\runtime\jit.py:12
     10 from functools import cached_property
     11 from typing import Callable, Generic, Iterable, Optional, TypeVar, Union, overload, Dict, Any, Tuple
---> 12 from ..runtime.driver import driver
     13 from types import ModuleType
     15 TRITON_MODULE = __name__[:-len(".runtime.jit")]

File d:\my\env\python3.10.10\lib\site-packages\triton\runtime\driver.py:1
----> 1 from ..backends import backends
      2 from ..backends import DriverBase
      5 def _create_driver():

File d:\my\env\python3.10.10\lib\site-packages\triton\backends\__init__.py:50
     45 backends[name] = Backend(_find_concrete_subclasses(compiler, BaseBackend),
     46                          _find_concrete_subclasses(driver, DriverBase))
     47 return backends
---> 50 backends = _discover_backends()

File d:\my\env\python3.10.10\lib\site-packages\triton\backends\__init__.py:43, in _discover_backends()
     41 if name.startswith('__'):
     42     continue
---> 43 compiler = _load_module(name, os.path.join(root, name, 'compiler.py'))
     44 driver = _load_module(name, os.path.join(root, name, 'driver.py'))
     45 backends[name] = Backend(_find_concrete_subclasses(compiler, BaseBackend),
     46                          _find_concrete_subclasses(driver, DriverBase))

File d:\my\env\python3.10.10\lib\site-packages\triton\backends\__init__.py:12, in _load_module(name, path)
     10 spec = importlib.util.spec_from_file_location(name[:-3], path)
     11 module = importlib.util.module_from_spec(spec)
---> 12 spec.loader.exec_module(module)
     13 return module

File d:\my\env\python3.10.10\lib\site-packages\triton\backends\amd\compiler.py:2
      1 from triton.backends.compiler import BaseBackend, GPUTarget
----> 2 from triton._C.libtriton import ir, passes, llvm, amd
      3 from dataclasses import dataclass
      4 from typing import Any, Tuple
ImportError: DLL load failed while importing libtriton: A dynamic link library (DLL) initialization routine failed.
There is a new error now. And on Windows, only triton==3.0.0 is available.
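The `ImportError` above surfaces while Triton discovers its backends, so one can check whether the failure lies in Triton itself, independent of vLLM, by running the exact import shown failing in the `triton\backends\amd\compiler.py` frame (a minimal repro sketch, not from the original thread):

```python
# The AMD backend's native import, copied from the failing frame above.
# If this line alone raises "DLL load failed while importing libtriton",
# the problem is the Triton build, not vLLM or xformers.
from triton._C.libtriton import ir, passes, llvm, amd
```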
You seem to be using AMD on Windows, which are two setups we don't fully support because we're unable to test them ourselves.
The issue still seems to come from your installation of Triton, though; I'd suggest you check with the Triton project.
🐛 Bug
from vllm import LLM, SamplingParams
llm = LLM(model=model_dir, enforce_eager=True)
then the error below is raised (see the full traceback under Additional context).
Command
To Reproduce
Steps to reproduce the behavior:
1. pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu124
2. git clone https://github.com/vllm-project/vllm.git
   cd vllm
   python use_existing_torch.py
   pip install -r requirements-common.txt
   python setup.py install
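After the build, it may help to record the versions that end up installed together, since the TypeError below points to a mismatch between them (a hedged sanity check, not part of the original report):

```python
# Print the co-installed versions of the three packages that appear
# in the traceback; the failing frame is xformers calling triton.
import torch
import triton
import xformers

print("torch:", torch.__version__)
print("triton:", triton.__version__)
print("xformers:", xformers.__version__)
```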
Expected behavior
Environment
Please copy and paste the output from the environment collection script from PyTorch (or fill out the checklist below manually).
You can run the script with: python -m torch.utils.collect_env
How you installed PyTorch (conda, pip, source): pip

Additional context
TypeError Traceback (most recent call last)
Cell In[2], line 5
      1 from vllm import LLM, SamplingParams
      3 # model_dir='Qwen2.5-14B-Instruct-GPTQ-Int4'
----> 5 llm = LLM(model=model_dir, enforce_eager=True)
      6 sampling_params = SamplingParams(top_p=0.9, max_tokens=512, top_k=10)
      8 prompt = "1+1等于几"

File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\utils.py:1023, in deprecate_args.<locals>.wrapper.<locals>.inner(*args, **kwargs)
   1016     msg += f" {additional_message}"
   1018 warnings.warn(
   1019     DeprecationWarning(msg),
   1020     stacklevel=3,  # The inner function takes up one level
   1021 )
-> 1023 return fn(*args, **kwargs)

File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\entrypoints\llm.py:198, in LLM.__init__(self, model, tokenizer, tokenizer_mode, skip_tokenizer_init, trust_remote_code, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, cpu_offload_gb, enforce_eager, max_context_len_to_capture, max_seq_len_to_capture, disable_custom_all_reduce, disable_async_output_proc, mm_processor_kwargs, task, **kwargs)
    172 kwargs["disable_log_stats"] = True
    174 engine_args = EngineArgs(
    175     model=model,
    176     task=task,
   (...)
    196     **kwargs,
    197 )
--> 198 self.llm_engine = LLMEngine.from_engine_args(
    199     engine_args, usage_context=UsageContext.LLM_CLASS)
    200 self.request_counter = Counter()

File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\engine\llm_engine.py:582, in LLMEngine.from_engine_args(cls, engine_args, usage_context, stat_loggers)
    580 executor_class = cls._get_executor_cls(engine_config)
    581 # Create the LLM engine.
--> 582 engine = cls(
    583     **engine_config.to_dict(),
    584     executor_class=executor_class,
    585     log_stats=not engine_args.disable_log_stats,
    586     usage_context=usage_context,
    587     stat_loggers=stat_loggers,
    588 )
    590 return engine

File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\engine\llm_engine.py:341, in LLMEngine.__init__(self, model_config, cache_config, parallel_config, scheduler_config, device_config, load_config, lora_config, speculative_config, decoding_config, observability_config, prompt_adapter_config, executor_class, log_stats, usage_context, stat_loggers, input_registry, use_cached_outputs)
    337 self.input_registry = input_registry
    338 self.input_processor = input_registry.create_input_processor(
    339     model_config)
--> 341 self.model_executor = executor_class(
    342     model_config=model_config,
    343     cache_config=cache_config,
    344     parallel_config=parallel_config,
    345     scheduler_config=scheduler_config,
    346     device_config=device_config,
    347     lora_config=lora_config,
    348     speculative_config=speculative_config,
    349     load_config=load_config,
    350     prompt_adapter_config=prompt_adapter_config,
    351     observability_config=self.observability_config,
    352 )
    354 if self.model_config.task != "embedding":
    355     self._initialize_kv_caches()

File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\executor\executor_base.py:47, in ExecutorBase.__init__(self, model_config, cache_config, parallel_config, scheduler_config, device_config, load_config, lora_config, speculative_config, prompt_adapter_config, observability_config)
     45 self.prompt_adapter_config = prompt_adapter_config
     46 self.observability_config = observability_config
---> 47 self._init_executor()

File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\executor\gpu_executor.py:38, in GPUExecutor._init_executor(self)
     33 """Initialize the worker and load the model.
     34 """
     35 assert self.parallel_config.world_size == 1, (
     36     "GPUExecutor only supports single GPU.")
---> 38 self.driver_worker = self._create_worker()
     39 self.driver_worker.init_device()
     40 self.driver_worker.load_model()

File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\executor\gpu_executor.py:105, in GPUExecutor._create_worker(self, local_rank, rank, distributed_init_method)
    101 def _create_worker(self,
    102                    local_rank: int = 0,
    103                    rank: int = 0,
    104                    distributed_init_method: Optional[str] = None):
--> 105     return create_worker(**self._get_create_worker_kwargs(
    106         local_rank=local_rank,
    107         rank=rank,
    108         distributed_init_method=distributed_init_method))

File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\executor\gpu_executor.py:24, in create_worker(worker_module_name, worker_class_name, worker_class_fn, **kwargs)
     16 def create_worker(worker_module_name: str, worker_class_name: str,
     17                   worker_class_fn: Optional[Callable[[], Type[WorkerBase]]],
     18                   **kwargs):
     19     wrapper = WorkerWrapperBase(
     20         worker_module_name=worker_module_name,
     21         worker_class_name=worker_class_name,
     22         worker_class_fn=worker_class_fn,
     23     )
---> 24     wrapper.init_worker(**kwargs)
     25     return wrapper.worker

File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\worker\worker_base.py:449, in WorkerWrapperBase.init_worker(self, *args, **kwargs)
    446 mod = importlib.import_module(self.worker_module_name)
    447 worker_class = getattr(mod, self.worker_class_name)
--> 449 self.worker = worker_class(*args, **kwargs)
    450 assert self.worker is not None

File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\worker\worker.py:99, in Worker.__init__(self, model_config, parallel_config, scheduler_config, device_config, cache_config, load_config, local_rank, rank, distributed_init_method, lora_config, speculative_config, prompt_adapter_config, is_driver_worker, model_runner_cls, observability_config)
     97 elif self._is_encoder_decoder_model():
     98     ModelRunnerClass = EncoderDecoderModelRunner
---> 99 self.model_runner: GPUModelRunnerBase = ModelRunnerClass(
    100     model_config,
    101     parallel_config,
    102     scheduler_config,
    103     device_config,
    104     cache_config,
    105     load_config=load_config,
    106     lora_config=self.lora_config,
    107     kv_cache_dtype=self.cache_config.cache_dtype,
    108     is_driver_worker=is_driver_worker,
    109     prompt_adapter_config=prompt_adapter_config,
    110     observability_config=observability_config,
    111     **speculative_args,
    112 )
    113 # Uninitialized cache engine. Will be initialized by
    114 # initialize_cache.
    115 self.cache_engine: List[CacheEngine]

File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\worker\model_runner.py:1013, in GPUModelRunnerBase.__init__(self, model_config, parallel_config, scheduler_config, device_config, cache_config, load_config, lora_config, kv_cache_dtype, is_driver_worker, prompt_adapter_config, return_hidden_states, observability_config, input_registry, mm_registry)
   1008 num_attn_heads = self.model_config.get_num_attention_heads(
   1009     self.parallel_config)
   1010 needs_attn_backend = (num_attn_heads != 0
   1011                       or self.model_config.is_attention_free)
-> 1013 self.attn_backend = get_attn_backend(
   1014     self.model_config.get_head_size(),
   1015     self.model_config.dtype,
   1016     self.kv_cache_dtype,
   1017     self.block_size,
   1018     self.model_config.is_attention_free,
   1019 ) if needs_attn_backend else None
   1020 if self.attn_backend:
   1021     self.attn_state = self.attn_backend.get_state_cls()(
   1022         weakref.proxy(self))

File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\attention\selector.py:120, in get_attn_backend(head_size, dtype, kv_cache_dtype, block_size, is_attention_free, is_blocksparse)
    118 if backend == _Backend.XFORMERS:
    119     logger.info("Using XFormers backend.")
--> 120     from vllm.attention.backends.xformers import (  # noqa: F401
    121         XFormersBackend)
    122     return XFormersBackend
    123 elif backend == _Backend.ROCM_FLASH:

File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\attention\backends\xformers.py:6
      3 from typing import Any, Dict, List, Optional, Tuple, Type
      5 import torch
----> 6 from xformers import ops as xops
      7 from xformers.ops.fmha.attn_bias import (AttentionBias,
      8                                          BlockDiagonalCausalMask,
      9                                          BlockDiagonalMask,
     10                                          LowerTriangularMaskWithTensorBias)
     12 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
     13                                               AttentionMetadata, AttentionType)

File d:\my\env\python3.10.10\lib\site-packages\xformers\ops\__init__.py:8
      1 # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
      2 #
      3 # This source code is licensed under the BSD license found in the
      4 # LICENSE file in the root directory of this source tree.
      6 import torch
----> 8 from .fmha import (
      9     AttentionBias,
     10     AttentionOp,
     11     AttentionOpBase,
     12     LowerTriangularMask,
     13     MemoryEfficientAttentionCkOp,
     14     MemoryEfficientAttentionCutlassFwdFlashBwOp,
     15     MemoryEfficientAttentionCutlassOp,
     16     MemoryEfficientAttentionFlashAttentionOp,
     17     MemoryEfficientAttentionSplitKCkOp,
     18     memory_efficient_attention,
     19     memory_efficient_attention_backward,
     20     memory_efficient_attention_forward,
     21     memory_efficient_attention_forward_requires_grad,
     22 )
     23 from .indexing import index_select_cat, scaled_index_add
     24 from .ipc import init_ipc

File d:\my\env\python3.10.10\lib\site-packages\xformers\ops\fmha\__init__.py:10
      6 from typing import Any, List, Optional, Sequence, Tuple, Type, Union, cast
      8 import torch
---> 10 from . import (
     11     attn_bias,
     12     ck,
     13     ck_decoder,
     14     ck_splitk,
     15     cutlass,
     16     flash,
     17     flash3,
     18     triton_splitk,
     19 )
     20 from .attn_bias import VARLEN_BIASES, AttentionBias, LowerTriangularMask
     21 from .common import (
     22     AttentionBwOpBase,
     23     AttentionFwOpBase,
    (...)
     29     bmk2bmhk,
     30 )

File d:\my\env\python3.10.10\lib\site-packages\xformers\ops\fmha\triton_splitk.py:110
     94     return (
     95         super(InputsFp8, self).nbytes
     96         + (
    (...)
    105         )
    106     )
    109 if TYPE_CHECKING or _is_triton_available():
--> 110     from ._triton.splitk_kernels import _fwd_kernel_splitK, _splitK_reduce
    111 else:
    112     _fwd_kernel_splitK = None

File d:\my\env\python3.10.10\lib\site-packages\xformers\ops\fmha\_triton\splitk_kernels.py:632
    629 if sys.version_info >= (3, 9):
    630     # unroll_varargs requires Python 3.9+
    631     for num_groups in [1, 2, 4, 8]:
--> 632         _fwd_kernel_splitK_autotune[num_groups] = autotune_kernel(
    633             _get_splitk_kernel(num_groups)
    634         )
    636 def get_autotuner_cache(
    637     num_groups: int,
    638 ) -> Dict[Tuple[Union[int, str]], triton.Config]:
    639     """Returns a triton.runtime.autotuner.AutoTuner.cache object, which
    640     represents mappings from kernel autotune keys (tuples describing kernel inputs)
    641     to triton.Config
    642     """

File d:\my\env\python3.10.10\lib\site-packages\xformers\ops\fmha\_triton\splitk_kernels.py:614, in autotune_kernel(kernel)
    604 WARPS_VALUES = [1, 2, 4]
    606 TRITON_CONFIGS = [
    607     gen_config(block_m, block_n, stages, warps)
    608     for block_m in BLOCK_M_VALUES
    (...)
    611     for warps in WARPS_VALUES
    612 ]
--> 614 kernel = triton.autotune(
    615     configs=TRITON_CONFIGS,
    616     key=AUTOTUNER_KEY,
    617     use_cuda_graph=True,
    618 )(kernel)
    619 return kernel

TypeError: autotune() got an unexpected keyword argument 'use_cuda_graph'
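The final frame shows xformers passing use_cuda_graph=True to triton.autotune, while the installed Triton's autotune() does not declare that parameter. A version-tolerant caller could probe the signature before forwarding the kwarg; the sketch below is a hypothetical shim under that assumption, not xformers' actual fix (autotune_compat is an invented name):

```python
import inspect

import triton


def autotune_compat(configs, key, use_cuda_graph=True):
    """Hypothetical shim: forward use_cuda_graph to triton.autotune only
    when the installed Triton declares that parameter, so mismatched
    xformers/Triton versions don't raise TypeError."""
    kwargs = {"configs": configs, "key": key}
    if "use_cuda_graph" in inspect.signature(triton.autotune).parameters:
        kwargs["use_cuda_graph"] = use_cuda_graph
    # Returns the autotune decorator to apply to a Triton JIT kernel.
    return triton.autotune(**kwargs)
```

In practice, installing an xformers and Triton pair that were released together is likely the simpler fix, since xformers tracks specific Triton APIs.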