RonanKMcGovern opened this issue 5 days ago
Note that I have tried:
!pip install outlines -qU
then:
import outlines

model = outlines.models.transformers("/kaggle/input/llama-3.1/transformers/8b-instruct/2",
                                     device="cuda")

prompt = "What is the IP address of the Google DNS servers? "

generator = outlines.generate.text(model)
unstructured = generator(prompt, max_tokens=30)

generator = outlines.generate.regex(
    model,
    r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
)
structured = generator(prompt, max_tokens=30)

print(unstructured)
which gives:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[13], line 3
1 import outlines
----> 3 model = outlines.models.transformers("/kaggle/input/llama-3.1/transformers/8b-instruct/2",
4 device="cuda")
6 prompt = "What is the IP address of the Google DNS servers? "
8 generator = outlines.generate.text(model)
File /opt/conda/lib/python3.10/site-packages/outlines/models/transformers.py:231, in transformers(model_name, device, model_kwargs, tokenizer_kwargs)
228 if device is not None:
229 model_kwargs["device_map"] = device
--> 231 model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
233 tokenizer_kwargs.setdefault("padding_side", "left")
234 tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)
File /opt/conda/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py:523, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
520 if kwargs.get("quantization_config", None) is not None:
521 _ = kwargs.pop("quantization_config")
--> 523 config, kwargs = AutoConfig.from_pretrained(
524 pretrained_model_name_or_path,
525 return_unused_kwargs=True,
526 trust_remote_code=trust_remote_code,
527 code_revision=code_revision,
528 _commit_hash=commit_hash,
529 **hub_kwargs,
530 **kwargs,
531 )
533 # if torch_dtype=auto was passed here, ensure to pass it on
534 if kwargs_orig.get("torch_dtype", None) == "auto":
File /opt/conda/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py:958, in AutoConfig.from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
952 except KeyError:
953 raise ValueError(
954 f"The checkpoint you are trying to load has model type `{config_dict['model_type']}` "
955 "but Transformers does not recognize this architecture. This could be because of an "
956 "issue with the checkpoint, or because your version of Transformers is out of date."
957 )
--> 958 return config_class.from_dict(config_dict, **unused_kwargs)
959 else:
960 # Fallback: use pattern matching on the string.
961 # We go from longer names to shorter names to catch roberta before bert (for instance)
962 for pattern in sorted(CONFIG_MAPPING.keys(), key=len, reverse=True):
File /opt/conda/lib/python3.10/site-packages/transformers/configuration_utils.py:768, in PretrainedConfig.from_dict(cls, config_dict, **kwargs)
765 # We remove it from kwargs so that it does not appear in `return_unused_kwargs`.
766 config_dict["attn_implementation"] = kwargs.pop("attn_implementation", None)
--> 768 config = cls(**config_dict)
770 if hasattr(config, "pruned_heads"):
771 config.pruned_heads = {int(key): value for key, value in config.pruned_heads.items()}
File /opt/conda/lib/python3.10/site-packages/transformers/models/llama/configuration_llama.py:161, in LlamaConfig.__init__(self, vocab_size, hidden_size, intermediate_size, num_hidden_layers, num_attention_heads, num_key_value_heads, hidden_act, max_position_embeddings, initializer_range, rms_norm_eps, use_cache, pad_token_id, bos_token_id, eos_token_id, pretraining_tp, tie_word_embeddings, rope_theta, rope_scaling, attention_bias, attention_dropout, mlp_bias, **kwargs)
159 self.rope_theta = rope_theta
160 self.rope_scaling = rope_scaling
--> 161 self._rope_scaling_validation()
162 self.attention_bias = attention_bias
163 self.attention_dropout = attention_dropout
File /opt/conda/lib/python3.10/site-packages/transformers/models/llama/configuration_llama.py:182, in LlamaConfig._rope_scaling_validation(self)
179 return
181 if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
--> 182 raise ValueError(
183 "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
184 )
185 rope_scaling_type = self.rope_scaling.get("type", None)
186 rope_scaling_factor = self.rope_scaling.get("factor", None)
ValueError: `rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {'factor': 8.0, 'low_freq_factor': 1.0, 'high_freq_factor': 4.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
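(Side note: this llama3-style rope_scaling dict is only understood by fairly recent transformers releases, so my guess is the notebook kernel is somehow picking up an older build than the one reported further down. A quick sanity check in the same kernel, assuming that's the cause:)

import transformers

# Llama 3.1 ships a llama3-style rope_scaling config that older transformers
# versions reject with exactly this ValueError.
print(transformers.__version__)

# If this prints something older than ~4.43, upgrading and restarting the kernel
# should let the config parse:
# !pip install -U "transformers>=4.43"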
I suspect that even if this worked, it wouldn't necessarily dispatch to both GPUs, but I'm not sure.
I'm also unclear on where I would need to pass in device_map="auto".
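Looking at the outlines/models/transformers.py frame in the traceback, model_kwargs seems to be forwarded straight to AutoModelForCausalLM.from_pretrained, so presumably something like this would pass device_map through (just my guess, untested):

import outlines

# assumption on my part: model_kwargs is forwarded verbatim to from_pretrained,
# so device_map="auto" would spread the model across both T4s
model = outlines.models.transformers(
    "/kaggle/input/llama-3.1/transformers/8b-instruct/2",
    model_kwargs={"device_map": "auto"},
)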
I see there is a separate way to load as follows:
import outlines
from transformers import AutoModelForCausalLM, AutoTokenizer
from outlines import models

llm = AutoModelForCausalLM.from_pretrained("/kaggle/input/llama-3.1/transformers/8b-instruct/2", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/llama-3.1/transformers/8b-instruct/2")
model = models.Transformers(llm, tokenizer)

# model = outlines.models.transformers("/kaggle/input/llama-3.1/transformers/8b-instruct/2",
#                                      device_map="auto")

prompt = "What is the IP address of the Google DNS servers? "

generator = outlines.generate.text(model)
unstructured = generator(prompt, max_tokens=30)

generator = outlines.generate.regex(
    model,
    r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
)
structured = generator(prompt, max_tokens=30)

print(unstructured)
but this gives the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[10], line 5
2 from transformers import AutoModelForCausalLM, AutoTokenizer
3 from outlines import models
----> 5 model = AutoModelForCausalLM.from_pretrained("/kaggle/input/llama-3.1/transformers/8b-instruct/2", device_map="auto")
7 tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/llama-3.1/transformers/8b-instruct/2")
9 model = models.Transformers(llm, tokenizer)
File /opt/conda/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py:523, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
520 if kwargs.get("quantization_config", None) is not None:
521 _ = kwargs.pop("quantization_config")
--> 523 config, kwargs = AutoConfig.from_pretrained(
524 pretrained_model_name_or_path,
525 return_unused_kwargs=True,
526 trust_remote_code=trust_remote_code,
527 code_revision=code_revision,
528 _commit_hash=commit_hash,
529 **hub_kwargs,
530 **kwargs,
531 )
533 # if torch_dtype=auto was passed here, ensure to pass it on
534 if kwargs_orig.get("torch_dtype", None) == "auto":
File /opt/conda/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py:958, in AutoConfig.from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
952 except KeyError:
953 raise ValueError(
954 f"The checkpoint you are trying to load has model type `{config_dict['model_type']}` "
955 "but Transformers does not recognize this architecture. This could be because of an "
956 "issue with the checkpoint, or because your version of Transformers is out of date."
957 )
--> 958 return config_class.from_dict(config_dict, **unused_kwargs)
959 else:
960 # Fallback: use pattern matching on the string.
961 # We go from longer names to shorter names to catch roberta before bert (for instance)
962 for pattern in sorted(CONFIG_MAPPING.keys(), key=len, reverse=True):
File /opt/conda/lib/python3.10/site-packages/transformers/configuration_utils.py:768, in PretrainedConfig.from_dict(cls, config_dict, **kwargs)
765 # We remove it from kwargs so that it does not appear in `return_unused_kwargs`.
766 config_dict["attn_implementation"] = kwargs.pop("attn_implementation", None)
--> 768 config = cls(**config_dict)
770 if hasattr(config, "pruned_heads"):
771 config.pruned_heads = {int(key): value for key, value in config.pruned_heads.items()}
File /opt/conda/lib/python3.10/site-packages/transformers/models/llama/configuration_llama.py:161, in LlamaConfig.__init__(self, vocab_size, hidden_size, intermediate_size, num_hidden_layers, num_attention_heads, num_key_value_heads, hidden_act, max_position_embeddings, initializer_range, rms_norm_eps, use_cache, pad_token_id, bos_token_id, eos_token_id, pretraining_tp, tie_word_embeddings, rope_theta, rope_scaling, attention_bias, attention_dropout, mlp_bias, **kwargs)
159 self.rope_theta = rope_theta
160 self.rope_scaling = rope_scaling
--> 161 self._rope_scaling_validation()
162 self.attention_bias = attention_bias
163 self.attention_dropout = attention_dropout
File /opt/conda/lib/python3.10/site-packages/transformers/models/llama/configuration_llama.py:182, in LlamaConfig._rope_scaling_validation(self)
179 return
181 if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
--> 182 raise ValueError(
183 "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
184 )
185 rope_scaling_type = self.rope_scaling.get("type", None)
186 rope_scaling_factor = self.rope_scaling.get("factor", None)
ValueError: `rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {'factor': 8.0, 'low_freq_factor': 1.0, 'high_freq_factor': 4.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
btw, running !transformers-cli env gives:
- `transformers` version: 4.45.1
- Platform: Linux-5.15.154+-x86_64-with-glibc2.31
- Python version: 3.10.13
- Huggingface_hub version: 0.23.2
- Safetensors version: 0.4.3
- Accelerate version: 0.34.2
- Accelerate config: not found
- PyTorch version (GPU?): 2.1.2 (True)
- Tensorflow version (GPU?): 2.15.0 (True)
- Flax version (CPU?/GPU?/TPU?): 0.8.4 (gpu)
- Jax version: 0.4.26
- JaxLib version: 0.4.26.dev20240504
- Using distributed or parallel set-up in script?: <fill in>
- Using GPU in script?: <fill in>
- GPU type: Tesla T4
and I'm on Kaggle with 2x T4s (I want to run this Llama 3.1 8B model on two GPUs).
@RonanKMcGovern this will be addressed by https://github.com/vllm-project/vllm/pull/8252 which will be merged very soon.
Thanks @njhill that would be absolutely excellent!
btw, @njhill - in the meantime - can you recommend a way to install your branch of the repo?
I'm trying git clone followed by pip install -e ., but I'm getting:
Building editable for vllm (pyproject.toml) ... error
error: subprocess-exited-with-error
× Building editable for vllm (pyproject.toml) did not run successfully.
│ exit code: 1
╰─> [600 lines of output]
/tmp/pip-build-env-wa44wzxf/overlay/local/lib/python3.10/dist-packages/torch/_subclasses/functional_tensor.py:258: UserWarning: Failed to initialize NumPy: No module named 'numpy' (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:84.)
cpu = _conversion_method_template(device=torch.device("cpu"))
running editable_wheel
creating /tmp/pip-wheel-3r4env2q/.tmp-tdu7kt0g/vllm.egg-info
writing /tmp/pip-wheel-3r4env2q/.tmp-tdu7kt0g/vllm.egg-info/PKG-INFO
writing dependency_links to /tmp/pip-wheel-3r4env2q/.tmp-tdu7kt0g/vllm.egg-info/dependency_links.txt
writing entry points to /tmp/pip-wheel-3r4env2q/.tmp-tdu7kt0g/vllm.egg-info/entry_points.txt
writing requirements to /tmp/pip-wheel-3r4env2q/.tmp-tdu7kt0g/vllm.egg-info/requires.txt
writing top-level names to /tmp/pip-wheel-3r4env2q/.tmp-tdu7kt0g/vllm.egg-info/top_level.txt
writing manifest file '/tmp/pip-wheel-3r4env2q/.tmp-tdu7kt0g/vllm.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
adding license file 'LICENSE'
writing manifest file '/tmp/pip-wheel-3r4env2q/.tmp-tdu7kt0g/vllm.egg-info/SOURCES.txt'
creating '/tmp/pip-wheel-3r4env2q/.tmp-tdu7kt0g/vllm-0.1.dev2848+g8acc98e.dist-info'
creating /tmp/pip-wheel-3r4env2q/.tmp-tdu7kt0g/vllm-0.1.dev2848+g8acc98e.dist-info/WHEEL
running build_py
running build_ext
-- The CXX compiler identification is GNU 11.4.0
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Build type: RelWithDebInfo
-- Target device: cuda
-- Found Python: /usr/bin/python (found version "3.10.12") found components: Interpreter Development.Module Development.SABIModule
-- Found python matching: /usr/bin/python.
-- Found CUDA: /usr/local/cuda (found version "12.1")
-- The CUDA compiler identification is NVIDIA 12.1.105
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: /usr/local/cuda/bin/nvcc - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- Found CUDAToolkit: /usr/local/cuda/include (found version "12.1.105")
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- Caffe2: CUDA detected: 12.1
-- Caffe2: CUDA nvcc is: /usr/local/cuda/bin/nvcc
-- Caffe2: CUDA toolkit directory: /usr/local/cuda
-- Caffe2: Header version is: 12.1
-- /usr/local/cuda/lib64/libnvrtc.so shorthash is b51b459d
-- USE_CUDNN is set to 0. Compiling without cuDNN support
-- USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support
-- Autodetected CUDA architecture(s): 8.6
-- Added CUDA NVCC flags for: -gencode;arch=compute_86,code=sm_86
CMake Warning at /tmp/pip-build-env-wa44wzxf/overlay/local/lib/python3.10/dist-packages/torch/share/cmake/Torch/TorchConfig.cmake:22 (message):
static library kineto_LIBRARY-NOTFOUND not found.
Call Stack (most recent call first):
/tmp/pip-build-env-wa44wzxf/overlay/local/lib/python3.10/dist-packages/torch/share/cmake/Torch/TorchConfig.cmake:120 (append_torchlib_if_found)
CMakeLists.txt:84 (find_package)
-- Found Torch: /tmp/pip-build-env-wa44wzxf/overlay/local/lib/python3.10/dist-packages/torch/lib/libtorch.so
-- Enabling core extension.
-- CUDA supported arches: 7.0;7.5;8.0;8.6;8.9;9.0
-- CUDA target arches: 86-real
-- CMake Version: 3.30.4
-- CUTLASS 3.5.1
-- CUDART: /usr/local/cuda/lib64/libcudart.so
-- CUDA Driver: /usr/local/cuda/lib64/stubs/libcuda.so
-- NVRTC: /usr/local/cuda/lib64/libnvrtc.so
-- Default Install Location: install
-- Found Python3: /usr/bin/python3.10 (found suitable version "3.10.12", minimum required is "3.5") found components: Interpreter
-- Make cute::tuple be the new standard-layout tuple type
-- CUDA Compilation Architectures: 70;72;75;80;86;87;89;90;90a
-- Enable caching of reference results in conv unit tests
-- Enable rigorous conv problem sizes in conv unit tests
-- Using NVCC flags: --expt-relaxed-constexpr;-DCUTE_USE_PACKED_TUPLE=1;-DCUTLASS_TEST_LEVEL=0;-DCUTLASS_TEST_ENABLE_CACHED_RESULTS=1;-DCUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED=1;-DCUTLASS_DEBUG_TRACE_LEVEL=0;-Xcompiler=-Wconversion;-Xcompiler=-fno-strict-aliasing;-lineinfo
-- Configuring cublas ...
-- cuBLAS Disabled.
-- Configuring cuBLAS ... done.
-- Machete generation completed successfully.
-- Machete generated sources: /workspace/vllm/csrc/quantization/machete/generated/machete_mm_bf16u4.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_bf16u4_impl_part0.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_bf16u4_impl_part1.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_bf16u4b8.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_bf16u4b8_impl_part0.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_bf16u4b8_impl_part1.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_bf16u8.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_bf16u8_impl_part0.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_bf16u8_impl_part1.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_bf16u8b128.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_bf16u8b128_impl_part0.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_bf16u8b128_impl_part1.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_f16u4.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_f16u4_impl_part0.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_f16u4_impl_part1.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_f16u4b8.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_f16u4b8_impl_part0.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_f16u4b8_impl_part1.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_f16u8.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_f16u8_impl_part0.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_f16u8_impl_part1.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_f16u8b128.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_f16u8b128_impl_part0.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_mm_f16u8b128_impl_part1.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_prepack_bf16u4.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_prepack_bf16u4b8.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_prepack_bf16u8.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_prepack_bf16u8b128.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_prepack_f16u4.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_prepack_f16u4b8.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_prepack_f16u8.cu;/workspace/vllm/csrc/quantization/machete/generated/machete_prepack_f16u8b128.cu
-- Enabling C extension.
-- Enabling moe extension.
-- Build type: RelWithDebInfo
-- Target device: cuda
-- Building vllm-flash-attn inside vLLM. Skipping flag detection and relying on parent build.
-- vllm-flash-attn is available at /tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src
-- Configuring done (15.5s)
-- Generating done (0.1s)
-- Build files have been written to: /tmp/tmp_p9yzzdo.build-temp
[1/137] Building CUDA object CMakeFiles/_C.dir/csrc/cuda_utils_kernels.cu.o
[2/137] Building CXX object CMakeFiles/_core_C.dir/csrc/core/torch_bindings.cpp.o
[3/137] Linking CXX shared module _core_C.abi3.so
[4/137] Building CUDA object _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu.o
FAILED: _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu.o
/usr/local/cuda/bin/nvcc -forward-unknown-to-host-compiler -DFLASHATTENTION_DISABLE_DROPOUT -DFLASHATTENTION_DISABLE_UNEVEN_K -DPy_LIMITED_API=3 -DTORCH_EXTENSION_NAME=vllm_flash_attn_c -DUSE_C10D_GLOO -DUSE_C10D_NCCL -DUSE_DISTRIBUTED -DUSE_RPC -DUSE_TENSORPIPE -Dvllm_flash_attn_c_EXPORTS -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/flash_attn -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/flash_attn/src -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/cutlass/include -isystem /usr/include/python3.10 -isystem /tmp/pip-build-env-wa44wzxf/overlay/local/lib/python3.10/dist-packages/torch/include -isystem /tmp/pip-build-env-wa44wzxf/overlay/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/cuda/include -DONNX_NAMESPACE=onnx_c2 -Xcudafe --diag_suppress=cc_clobber_ignored,--diag_suppress=field_without_dll_interface,--diag_suppress=base_class_has_different_dll_interface,--diag_suppress=dll_interface_conflict_none_assumed,--diag_suppress=dll_interface_conflict_dllexport_assumed,--diag_suppress=bad_friend_decl --expt-relaxed-constexpr --expt-extended-lambda -O3 -g -DNDEBUG -std=c++17 "--generate-code=arch=compute_86,code=[sm_86]" -Xcompiler=-fPIC --expt-relaxed-constexpr -DENABLE_FP8 --threads=1 --expt-extended-lambda --use_fast_math -D_GLIBCXX_USE_CXX11_ABI=0 -MD -MT _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu.o -MF _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu.o.d -x cu -c /tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu -o _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu.o
Killed
[5/137] Building CUDA object _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu.o
FAILED: _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu.o
/usr/local/cuda/bin/nvcc -forward-unknown-to-host-compiler -DFLASHATTENTION_DISABLE_DROPOUT -DFLASHATTENTION_DISABLE_UNEVEN_K -DPy_LIMITED_API=3 -DTORCH_EXTENSION_NAME=vllm_flash_attn_c -DUSE_C10D_GLOO -DUSE_C10D_NCCL -DUSE_DISTRIBUTED -DUSE_RPC -DUSE_TENSORPIPE -Dvllm_flash_attn_c_EXPORTS -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/flash_attn -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/flash_attn/src -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/cutlass/include -isystem /usr/include/python3.10 -isystem /tmp/pip-build-env-wa44wzxf/overlay/local/lib/python3.10/dist-packages/torch/include -isystem /tmp/pip-build-env-wa44wzxf/overlay/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/cuda/include -DONNX_NAMESPACE=onnx_c2 -Xcudafe --diag_suppress=cc_clobber_ignored,--diag_suppress=field_without_dll_interface,--diag_suppress=base_class_has_different_dll_interface,--diag_suppress=dll_interface_conflict_none_assumed,--diag_suppress=dll_interface_conflict_dllexport_assumed,--diag_suppress=bad_friend_decl --expt-relaxed-constexpr --expt-extended-lambda -O3 -g -DNDEBUG -std=c++17 "--generate-code=arch=compute_86,code=[sm_86]" -Xcompiler=-fPIC --expt-relaxed-constexpr -DENABLE_FP8 --threads=1 --expt-extended-lambda --use_fast_math -D_GLIBCXX_USE_CXX11_ABI=0 -MD -MT _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu.o -MF _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu.o.d -x cu -c /tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu -o _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu.o
Killed
[6/137] Building CUDA object _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu.o
FAILED: _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu.o
/usr/local/cuda/bin/nvcc -forward-unknown-to-host-compiler -DFLASHATTENTION_DISABLE_DROPOUT -DFLASHATTENTION_DISABLE_UNEVEN_K -DPy_LIMITED_API=3 -DTORCH_EXTENSION_NAME=vllm_flash_attn_c -DUSE_C10D_GLOO -DUSE_C10D_NCCL -DUSE_DISTRIBUTED -DUSE_RPC -DUSE_TENSORPIPE -Dvllm_flash_attn_c_EXPORTS -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/flash_attn -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/flash_attn/src -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/cutlass/include -isystem /usr/include/python3.10 -isystem /tmp/pip-build-env-wa44wzxf/overlay/local/lib/python3.10/dist-packages/torch/include -isystem /tmp/pip-build-env-wa44wzxf/overlay/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/cuda/include -DONNX_NAMESPACE=onnx_c2 -Xcudafe --diag_suppress=cc_clobber_ignored,--diag_suppress=field_without_dll_interface,--diag_suppress=base_class_has_different_dll_interface,--diag_suppress=dll_interface_conflict_none_assumed,--diag_suppress=dll_interface_conflict_dllexport_assumed,--diag_suppress=bad_friend_decl --expt-relaxed-constexpr --expt-extended-lambda -O3 -g -DNDEBUG -std=c++17 "--generate-code=arch=compute_86,code=[sm_86]" -Xcompiler=-fPIC --expt-relaxed-constexpr -DENABLE_FP8 --threads=1 --expt-extended-lambda --use_fast_math -D_GLIBCXX_USE_CXX11_ABI=0 -MD -MT _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu.o -MF _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu.o.d -x cu -c /tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu -o _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu.o
Killed
[7/137] Building CUDA object _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu.o
FAILED: _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu.o
/usr/local/cuda/bin/nvcc -forward-unknown-to-host-compiler -DFLASHATTENTION_DISABLE_DROPOUT -DFLASHATTENTION_DISABLE_UNEVEN_K -DPy_LIMITED_API=3 -DTORCH_EXTENSION_NAME=vllm_flash_attn_c -DUSE_C10D_GLOO -DUSE_C10D_NCCL -DUSE_DISTRIBUTED -DUSE_RPC -DUSE_TENSORPIPE -Dvllm_flash_attn_c_EXPORTS -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/flash_attn -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/flash_attn/src -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/cutlass/include -isystem /usr/include/python3.10 -isystem /tmp/pip-build-env-wa44wzxf/overlay/local/lib/python3.10/dist-packages/torch/include -isystem /tmp/pip-build-env-wa44wzxf/overlay/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/cuda/include -DONNX_NAMESPACE=onnx_c2 -Xcudafe --diag_suppress=cc_clobber_ignored,--diag_suppress=field_without_dll_interface,--diag_suppress=base_class_has_different_dll_interface,--diag_suppress=dll_interface_conflict_none_assumed,--diag_suppress=dll_interface_conflict_dllexport_assumed,--diag_suppress=bad_friend_decl --expt-relaxed-constexpr --expt-extended-lambda -O3 -g -DNDEBUG -std=c++17 "--generate-code=arch=compute_86,code=[sm_86]" -Xcompiler=-fPIC --expt-relaxed-constexpr -DENABLE_FP8 --threads=1 --expt-extended-lambda --use_fast_math -D_GLIBCXX_USE_CXX11_ABI=0 -MD -MT _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu.o -MF _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu.o.d -x cu -c /tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu -o _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu.o
Killed
[8/137] Building CUDA object _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu.o
FAILED: _deps/vllm-flash-attn-build/CMakeFiles/vllm_flash_attn_c.dir/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu.o
/usr/local/cuda/bin/nvcc -forward-unknown-to-host-compiler -DFLASHATTENTION_DISABLE_DROPOUT -DFLASHATTENTION_DISABLE_UNEVEN_K -DPy_LIMITED_API=3 -DTORCH_EXTENSION_NAME=vllm_flash_attn_c -DUSE_C10D_GLOO -DUSE_C10D_NCCL -DUSE_DISTRIBUTED -DUSE_RPC -DUSE_TENSORPIPE -Dvllm_flash_attn_c_EXPORTS -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc -I/tmp/tmp_p9yzzdo.build-temp/_deps/vllm-flash-attn-src/csrc/flash_attn -I/tmp........
.......
File "/tmp/pip-build-env-wa44wzxf/overlay/local/lib/python3.10/dist-packages/setuptools/command/build_ext.py", line 98, in run
_build_ext.run(self)
File "/tmp/pip-build-env-wa44wzxf/overlay/local/lib/python3.10/dist-packages/setuptools/_distutils/command/build_ext.py", line 359, in run
self.build_extensions()
File "<string>", line 209, in build_extensions
File "/usr/lib/python3.10/subprocess.py", line 369, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['cmake', '--build', '.', '-j=128', '--target=_core_C', '--target=_moe_C', '--target=vllm_flash_attn_c', '--target=_C']' returned non-zero exit status 1.
[end of output]
note: This error originates from a subprocess, and is likely not a problem with pip.
ERROR: Failed building editable for vllm
Failed to build vllm
ERROR: Could not build wheels for vllm, which is required to install pyproject.toml-based projects
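(The Killed lines look like the nvcc processes being OOM-killed - the build is running with -j=128, which is a lot of parallel compiles for this box. If I retry the source build I'll try capping the parallelism, e.g. with MAX_JOBS, which I believe the vLLM install docs mention for exactly this:)

MAX_JOBS=2 pip install -e .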
@RonanKMcGovern this section of the docs might be helpful: https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source-without-compilation
The TL;DR is that you can copy in the Python code changes from a branch after installing the latest published wheel.
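Roughly, the flow is (paths and branch name below are just illustrative; the linked docs have the exact steps):

# install the latest published wheel so the compiled kernels come prebuilt
pip install -U vllm

# check out the branch with the Python-only changes
git clone https://github.com/vllm-project/vllm.git
cd vllm
git checkout <branch-with-the-fix>   # placeholder

# copy the changed Python files over the installed package
SITE=$(python -c "import vllm, os; print(os.path.dirname(vllm.__file__))")
cp -r vllm/* "$SITE"/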
Your current environment
N/A
How would you like to use vllm
I want to use guided_regex but in offline mode.
I had thought maybe I could pass guided_regex via sampling_params or when calling llm.chat() or llm.generate(), but I don't see where that can be done.
I know I can do this via a server, but I'd like to do it locally in a notebook instead, without a server. Thanks
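For reference, the kind of thing I'm hoping to be able to write is roughly this (a made-up sketch; the guided_regex kwarg is hypothetical and doesn't exist today as far as I can tell):

from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct", tensor_parallel_size=2)

# hypothetical kwarg: some way to attach a regex constraint to offline generation
params = SamplingParams(
    max_tokens=30,
    guided_regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
)

outputs = llm.generate(["What is the IP address of the Google DNS servers? "], params)
print(outputs[0].outputs[0].text)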