brando90 opened this issue 2 months ago
https://github.com/unslothai/unsloth/wiki#saving-models-to-16bit-for-vllm
@danielhanchen
Sadly, these are the 2 functions that are giving me issues.
I was hoping I could do something like
hf_mdl = to_hf_model(unsloth_mdl)
or
hf_mdl = from_unsloth_ckpt(unsloth_ckpt_path)
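For reference, the wiki's merged 16-bit save is roughly what I'd want such a function to do. A minimal sketch, assuming the save_pretrained_merged API described on that wiki page (the paths and directory names here are placeholders, untested):
# Hedged sketch: merge the LoRA weights into the base model and write a plain
# HF-style folder locally, assuming Unsloth's save_pretrained_merged API.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="/path/to/unsloth/checkpoint",  # placeholder path
)
model.save_pretrained_merged(
    "/path/to/checkpoint_merged_16bit",  # placeholder output dir
    tokenizer,
    save_method="merged_16bit",  # per the wiki: merged fp16 weights + config.json
)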
OK, even if one uses the HF interface to save the Unsloth models, it doesn't work:
(AI4Lean) root@miranebr-math-p4de-math-test-eval:~# python ~/AI4Lean/py_src/ailean_evals/chat_template_qwen2.py
-- Start
/data/miranebr-sandbox/data/runs/09192024_12h35m27s_run/train/checkpoint-820_hf
Traceback (most recent call last):
File "/data/miranebr-sandbox/AI4Lean/py_src/ailean_evals/chat_template_qwen2.py", line 107, in <module>
fire.Fire(load_unsloth_ckpt_with_hf_code)
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/fire/core.py", line 143, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/fire/core.py", line 477, in _Fire
component, remaining_args = _CallAndUpdateTrace(
^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/fire/core.py", line 693, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/AI4Lean/py_src/ailean_evals/chat_template_qwen2.py", line 96, in load_unsloth_ckpt_with_hf_code
llm = LLM(model=model_name + '_hf')
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/entrypoints/llm.py", line 118, in __init__
self.llm_engine = LLMEngine.from_engine_args(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/engine/llm_engine.py", line 257, in from_engine_args
engine_config = engine_args.create_engine_config()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/engine/arg_utils.py", line 464, in create_engine_config
model_config = ModelConfig(
^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/config.py", line 107, in __init__
self.hf_config = get_config(self.model, trust_remote_code, revision,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/transformers_utils/config.py", line 23, in get_config
config = AutoConfig.from_pretrained(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/transformers/models/auto/configuration_auto.py", line 972, in from_pretrained
config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/transformers/configuration_utils.py", line 632, in get_config_dict
config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/transformers/configuration_utils.py", line 689, in _get_config_dict
resolved_config_file = cached_file(
^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/transformers/utils/hub.py", line 373, in cached_file
raise EnvironmentError(
OSError: /data/miranebr-sandbox/data/runs/09192024_12h35m27s_run/train/checkpoint-820_hf does not appear to have a file named config.json. Checkout 'https://huggingface.co//data/miranebr-sandbox/data/runs/09192024_12h35m27s_run/train/checkpoint-820_hf/tree/None' for available files.
(AI4Lean) root@miranebr-math-p4de-math-test-eval:~# cd /data/miranebr-sandbox/data/runs/09192024_12h35m27s_run/train/checkpoint-820_hf
(AI4Lean) root@miranebr-math-p4de-math-test-eval:~/data/runs/09192024_12h35m27s_run/train/checkpoint-820_hf# ls
adapter_config.json adapter_model.safetensors added_tokens.json generation_config.json merges.txt special_tokens_map.json tokenizer.json tokenizer_config.json vocab.json
code
def load_unsloth_ckpt_with_hf_code():
    import os
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from vllm import LLM, SamplingParams
    model_name = os.path.expanduser('~/data/runs/09192024_12h35m27s_run/train/checkpoint-820')
    # model: AutoModelForCausalLM = AutoModelForCausalLM.from_pretrained(model_name)
    # tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_name)
    # model.save_pretrained(model_name + '_hf')
    # tokenizer.save_pretrained(model_name + '_hf')
    print(model_name + '_hf')
    llm = LLM(model=model_name + '_hf')
    sampling_params = SamplingParams()  # was undefined in the original snippet; defaults used here
    outputs = llm.generate('hi', sampling_params)
    print(f'{outputs=}')

if __name__ == "__main__":
    import fire
    import time
    print('\n-- Start')
    start_time = time.time()
    # fire.Fire(test_unsloth_vllm)
    # fire.Fire(test_unsloth_inference_efficient)
    fire.Fire(load_unsloth_ckpt_with_hf_code)
    print(f"Time taken: {time.time() - start_time:.2f} seconds, or {(time.time() - start_time) / 60:.2f} minutes, or {(time.time() - start_time) / 3600:.2f} hours.\a")
darn this was close:
(AI4Lean) root@miranebr-math-p4de-math-test-eval:~/data/runs/09192024_12h35m27s_run/train/checkpoint-820_hf# python ~/AI4Lean/py_src/ailean_evals/chat_template_qwen2.py
-- Start
INFO 09-27 23:33:11 llm_engine.py:98] Initializing an LLM engine (v0.4.1) with config: model='/data/miranebr-sandbox/data/runs/09192024_12h35m27s_run/train/checkpoint-820_hf', speculative_config=None, tokenizer='/data/miranebr-sandbox/data/runs/09192024_12h35m27s_run/train/checkpoint-820_hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0)
INFO 09-27 23:33:11 utils.py:608] Found nccl from library /data/miranebr-sandbox/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 09-27 23:33:15 selector.py:77] Cannot use FlashAttention backend because the flash_attn package is not found. Please install it for better performance.
INFO 09-27 23:33:15 selector.py:33] Using XFormers backend.
Traceback (most recent call last):
File "/data/miranebr-sandbox/AI4Lean/py_src/ailean_evals/chat_template_qwen2.py", line 108, in <module>
fire.Fire(load_unsloth_ckpt_with_hf_code)
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/fire/core.py", line 143, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/fire/core.py", line 477, in _Fire
component, remaining_args = _CallAndUpdateTrace(
^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/fire/core.py", line 693, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/AI4Lean/py_src/ailean_evals/chat_template_qwen2.py", line 97, in load_unsloth_ckpt_with_hf_code
llm = LLM(model=model_name + '_hf')
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/entrypoints/llm.py", line 118, in __init__
self.llm_engine = LLMEngine.from_engine_args(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/engine/llm_engine.py", line 277, in from_engine_args
engine = cls(
^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/engine/llm_engine.py", line 148, in __init__
self.model_executor = executor_class(
^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/executor/executor_base.py", line 41, in __init__
self._init_executor()
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/executor/gpu_executor.py", line 22, in _init_executor
self._init_non_spec_worker()
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/executor/gpu_executor.py", line 51, in _init_non_spec_worker
self.driver_worker.load_model()
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/worker/worker.py", line 117, in load_model
self.model_runner.load_model()
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 162, in load_model
self.model = get_model(
^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/model_executor/model_loader/__init__.py", line 19, in get_model
return loader.load_model(model_config=model_config,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/model_executor/model_loader/loader.py", line 224, in load_model
model.load_weights(
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/model_executor/models/qwen2.py", line 363, in load_weights
param = params_dict[name]
~~~~~~~~~~~^^^^^^
KeyError: 'base_model.model.model.layers.0.mlp.down_proj.lora_A.weight'
def load_unsloth_ckpt_with_hf_code():
    import os
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from vllm import LLM, SamplingParams
    model_name = os.path.expanduser('~/data/runs/09192024_12h35m27s_run/train/checkpoint-820')
    # model: AutoModelForCausalLM = AutoModelForCausalLM.from_pretrained(model_name)
    # tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_name)
    # model.save_pretrained(model_name + '_hf')
    # tokenizer.save_pretrained(model_name + '_hf')
    # model.config.to_json_file(model_name + '_hf/config.json')
    # print(model_name + '_hf')
    llm = LLM(model=model_name + '_hf')
    sampling_params = SamplingParams()  # was undefined in the original snippet; defaults used here
    outputs = llm.generate('hi', sampling_params)
    print(f'{outputs=}')
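Since the checkpoint folder only contains adapter files (adapter_config.json, adapter_model.safetensors) and no config.json or base weights, the KeyError on the lora_A weight makes sense: vLLM is trying to load a LoRA-only directory as a full model. One possible workaround is to merge the adapter into the base model first. A minimal sketch, assuming peft's AutoPeftModelForCausalLM and merge_and_unload (paths are placeholders, untested):
# Hedged sketch: load base + adapter via peft, fold the LoRA weights into the
# base model, and save a full HF checkpoint (with config.json) for vLLM.
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

adapter_dir = "/path/to/checkpoint-820"   # adapter-only directory (placeholder)
merged_dir = adapter_dir + "_merged"

model = AutoPeftModelForCausalLM.from_pretrained(adapter_dir)
model = model.merge_and_unload()          # merge LoRA weights into the base model
model.save_pretrained(merged_dir)         # writes config.json + full model weights
AutoTokenizer.from_pretrained(adapter_dir).save_pretrained(merged_dir)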
Oh for vLLM LoRAs - https://docs.vllm.ai/en/latest/models/lora.html might be helpful - sorry on the delay!
I will try that.
Note crucially that I cannot upload any of my models to HF under any circumstances, and all the examples assume this. Perhaps it will work anyway if I give the path to the checkpoint, but my current checkpoints were saved with the merged 16-bit API Unsloth gives, so I'm not even sure if this will work.
If the merged 16-bit API saved the models correctly with the right config file, as I noted in the other issue, I think things would work. The trick would be to use whatever code you're already using to push it to HF to do that. That works for me in my personal projects, so the functionality is there.
This is the code if the adapters are in an HF repo:
# Import necessary libraries
from huggingface_hub import snapshot_download  # For downloading the LoRA adapter from Hugging Face
from vllm import LLM, SamplingParams  # For initializing the vLLM model and sampling parameters
from vllm.lora.request import LoRARequest  # For creating a LoRA request to use in text generation

# Step 1: Download the LoRA Adapter
# Replace 'yard1/llama-2-7b-sql-lora-test' with the specific LoRA adapter you want to use from Hugging Face
lora_adapter_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")

# Step 2: Instantiate the Base Model with LoRA Enabled
# Replace 'meta-llama/Llama-2-7b-hf' with your desired base model from Hugging Face
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)

# Step 3: Define Sampling Parameters
# Adjust these parameters as needed for your specific use case
sampling_params = SamplingParams(
    temperature=0,  # Temperature controls randomness in generation. 0 means deterministic output.
    max_tokens=256,  # Maximum number of tokens to generate
    stop=["[/assistant]"]  # Stop generation at this token
)

# Step 4: Define Prompts for Generation
# Create a list of prompts you want to pass to the model for text generation
prompts = [
    "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
    "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
]

# Step 5: Create a LoRA Request
# First parameter: a human-readable name for the adapter
# Second parameter: a globally unique ID for the adapter
# Third parameter: the path to the downloaded LoRA adapter
lora_request = LoRARequest("sql_adapter", 1, lora_adapter_path)

# Step 6: Generate Text with the LoRA Adapter
# Call the `generate` method on the model, passing the prompts, sampling parameters, and LoRA request
outputs = llm.generate(
    prompts,
    sampling_params,
    lora_request=lora_request
)

# Step 7: Display the Outputs
# Print each generated output
for i, output in enumerate(outputs):
    print(f"Output {i + 1}:\n{output}\n")
And if the adapters are local:
# Import necessary libraries
from vllm import LLM, SamplingParams  # For initializing the vLLM model and sampling parameters
from vllm.lora.request import LoRARequest  # For creating a LoRA request to use in text generation

# Step 1: Specify the Path to the LoRA Adapter Checkpoint
# Replace this with the path to your local LoRA adapter checkpoint
lora_adapter_path = "/path/to/your/lora_adapter_checkpoint"

# Step 2: Instantiate the Base Model with LoRA Enabled
# Replace 'meta-llama/Llama-2-7b-hf' with your desired base model from Hugging Face
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)

# Step 3: Define Sampling Parameters
# Adjust these parameters as needed for your specific use case
sampling_params = SamplingParams(
    temperature=0,  # Temperature controls randomness in generation. 0 means deterministic output.
    max_tokens=256,  # Maximum number of tokens to generate
    stop=["[/assistant]"]  # Stop generation at this token
)

# Step 4: Define Prompts for Generation
# Create a list of prompts you want to pass to the model for text generation
prompts = [
    "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
    "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
]

# Step 5: Create a LoRA Request
# First parameter: a human-readable name for the adapter
# Second parameter: a globally unique ID for the adapter
# Third parameter: the path to the local LoRA adapter checkpoint
lora_request = LoRARequest("sql_adapter", 1, lora_adapter_path)

# Step 6: Generate Text with the LoRA Adapter
# Call the `generate` method on the model, passing the prompts, sampling parameters, and LoRA request
outputs = llm.generate(
    prompts,
    sampling_params,
    lora_request=lora_request
)

# Step 7: Display the Outputs
# Print each generated output
for i, output in enumerate(outputs):
    print(f"Output {i + 1}:\n{output}\n")
I do need to test this.
ref: ChatGPT with web search helped me write it as I was commuting: https://chatgpt.com/c/66fc3703-66a0-8001-9ef6-9d6426b390fc
@brando90 Hopefully that code you provided works! You'll have to change the model name to the correct Llama model, and the LoRA adapter should be in a local folder
error @danielhanchen
(AI4Lean) root@miranebr-math-p4de-math-aif-sft:~# python ~/AI4Lean/experiments/october/vllm_lora_adpoter_test.py
Traceback (most recent call last):
File "/data/miranebr-sandbox/AI4Lean/experiments/october/vllm_lora_adpoter_test.py", line 15, in <module>
llm = LLM(model="Qwen/Qwen2-1.5B", enable_lora=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/entrypoints/llm.py", line 118, in __init__
self.llm_engine = LLMEngine.from_engine_args(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/engine/llm_engine.py", line 257, in from_engine_args
engine_config = engine_args.create_engine_config()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/engine/arg_utils.py", line 542, in create_engine_config
return EngineConfig(model_config=model_config,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "<string>", line 13, in __init__
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/config.py", line 1127, in __post_init__
self.lora_config.verify_with_scheduler_config(
File "/data/miranebr-sandbox/.virtualenvs/AI4Lean/lib/python3.11/site-packages/vllm/config.py", line 906, in verify_with_scheduler_config
raise ValueError(
ValueError: Due to limitations of the custom LoRA CUDA kernel, max_num_batched_tokens must be <= 65528 when LoRA is enabled.
script
"""
ref: https://github.com/unslothai/unsloth/issues/1039
"""
# Import necessary libraries
from vllm import LLM, SamplingParams  # For initializing the vLLM model and sampling parameters
from vllm.lora.request import LoRARequest  # For creating a LoRA request to use in text generation

# Step 1: Specify the Path to the LoRA Adapter Checkpoint
# Replace this with the path to your local LoRA adapter checkpoint
lora_adapter_path = "/data/miranebr-sandbox/data/runs/09302024_11h37m55s_run/train/checkpoint-2594"

# Step 2: Instantiate the Base Model with LoRA Enabled
# Replace 'meta-llama/Llama-2-7b-hf' with your desired base model from Hugging Face
# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)
llm = LLM(model="Qwen/Qwen2-1.5B", enable_lora=True)

# Step 3: Define Sampling Parameters
# Adjust these parameters as needed for your specific use case
sampling_params = SamplingParams(
    temperature=0,  # Temperature controls randomness in generation. 0 means deterministic output.
    max_tokens=256,  # Maximum number of tokens to generate
    stop=["[/assistant]"]  # Stop generation at this token
)

# Step 4: Define Prompts for Generation
# Create a list of prompts you want to pass to the model for text generation
prompts = [
    "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
    "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
]

# Step 5: Create a LoRA Request
# First parameter: a human-readable name for the adapter
# Second parameter: a globally unique ID for the adapter
# Third parameter: the path to the local LoRA adapter checkpoint
lora_request = LoRARequest("sql_adapter", 1, lora_adapter_path)

# Step 6: Generate Text with the LoRA Adapter
# Call the `generate` method on the model, passing the prompts, sampling parameters, and LoRA request
outputs = llm.generate(
    prompts,
    sampling_params,
    lora_request=lora_request
)

# Step 7: Display the Outputs
# Print each generated output
for i, output in enumerate(outputs):
    print(f"Output {i + 1}:\n{output}\n")
Do you have cycles to test it? 🙏
OK, it worked for me. I had to make sure I had vLLM 0.5.5 and torch 2.4.0 on an A100 with 80GB.
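(A quick sanity check that the environment matches, using the standard version attributes:)
import torch, transformers, vllm
print(f"vllm={vllm.__version__} torch={torch.__version__} transformers={transformers.__version__}")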
requirements.txt
absl-py==2.1.0
accelerate==0.34.2
aiohappyeyeballs==2.4.3
aiohttp==3.10.8
aiosignal==1.3.1
alembic==1.13.3
annotated-types==0.7.0
anthropic==0.34.2
anthropic-bedrock==0.8.0
anyio==4.6.0
attrs==24.2.0
audioread==3.0.1
backoff==2.2.1
backports.tarfile==1.2.0
bitsandbytes==0.44.1
boto3==1.35.31
botocore==1.35.31
certifi==2024.8.30
cffi==1.17.1
charset-normalizer==3.3.2
click==8.1.7
cloudpickle==3.0.0
colorlog==6.8.2
contourpy==1.3.0
cryptography==43.0.1
cycler==0.12.1
datasets==3.0.1
decorator==5.1.1
dill==0.3.8
diskcache==5.6.3
distro==1.9.0
docker-pycreds==0.4.0
docstring_parser==0.16
docutils==0.21.2
dspy-ai==2.5.3
einops==0.8.0
evaluate==0.4.3
fastapi==0.115.0
filelock==3.16.1
fire==0.7.0
fonttools==4.54.1
frozenlist==1.4.1
fsspec==2024.6.1
gguf==0.9.1
gitdb==4.0.11
GitPython==3.1.43
greenlet==3.1.1
grpcio==1.66.2
h11==0.14.0
httpcore==1.0.6
httptools==0.6.1
httpx==0.27.2
huggingface-hub==0.25.1
idna==3.10
importlib_metadata==8.5.0
interegular==0.3.3
jaraco.classes==3.4.0
jaraco.context==6.0.1
jaraco.functools==4.1.0
jeepney==0.8.0
Jinja2==3.1.4
jiter==0.5.0
jmespath==1.0.1
joblib==1.4.2
jsonlines==4.0.0
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
keyring==25.4.1
kiwisolver==1.4.7
lark==1.2.2
lark-parser==0.12.0
lazy_loader==0.4
librosa==0.10.2.post1
litellm==1.48.9
llvmlite==0.43.0
lm-format-enforcer==0.10.6
magicattr==0.1.6
Mako==1.3.5
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.2
mdurl==0.1.2
more-itertools==10.5.0
mpmath==1.3.0
msgpack==1.1.0
msgspec==0.18.6
multidict==6.1.0
multiprocess==0.70.16
nest-asyncio==1.6.0
networkx==3.3
nh3==0.2.18
nltk==3.9.1
numba==0.60.0
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-htop==1.2.0
nvidia-ml-py==12.560.30
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.77
nvidia-nvtx-cu12==12.1.105
openai==1.51.0
optuna==4.0.0
outlines==0.0.46
packaging==24.1
pandas==2.2.3
peft==0.13.0
pillow==10.4.0
pkginfo==1.10.0
platformdirs==4.3.6
plotly==5.24.1
pooch==1.8.2
prometheus-fastapi-instrumentator==7.0.0
prometheus_client==0.21.0
protobuf==5.28.2
psutil==6.0.0
py-cpuinfo==9.0.0
pyairports==2.1.1
pyarrow==17.0.0
pycountry==24.6.1
pycparser==2.22
pydantic==2.9.2
pydantic_core==2.23.4
Pygments==2.18.0
pyparsing==3.1.4
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
pytz==2024.2
PyYAML==6.0.2
pyzmq==26.2.0
ray==2.37.0
readme_renderer==44.0
referencing==0.35.1
regex==2024.9.11
requests==2.32.3
requests-toolbelt==1.0.0
rfc3986==2.0.0
rich==13.9.1
rpds-py==0.20.0
s3transfer==0.10.2
safetensors==0.4.5
scikit-learn==1.5.2
scipy==1.14.1
seaborn==0.13.2
SecretStorage==3.3.3
sentencepiece==0.2.0
sentry-sdk==2.15.0
setproctitle==1.3.3
shtab==1.7.1
six==1.16.0
smmap==5.0.1
sniffio==1.3.1
soundfile==0.12.1
soxr==0.5.0.post1
SQLAlchemy==2.0.35
starlette==0.38.6
structlog==24.4.0
sympy==1.13.3
tenacity==9.0.0
tensorboard==2.18.0
tensorboard-data-server==0.7.2
termcolor==2.4.0
threadpoolctl==3.5.0
tiktoken==0.7.0
tokenizers==0.20.0
torch==2.4.0
torchvision==0.19.0
tqdm==4.66.5
transformers==4.45.1
triton==3.0.0
trl==0.11.1
twine==5.1.1
typing_extensions==4.12.2
tyro==0.8.11
tzdata==2024.2
ujson==5.10.0
urllib3==2.2.3
uvicorn==0.31.0
uvloop==0.20.0
vllm==0.5.5
vllm-flash-attn==2.6.1
wandb==0.18.3
watchfiles==0.24.0
websockets==13.1
Werkzeug==3.0.4
xformers==0.0.27.post2
xxhash==3.5.0
yarl==1.13.1
zipp==3.20.2
For flash attention with vLLM 0.5.5, see https://github.com/vllm-project/vllm/discussions/9031.
It fails with 0.5.1:
# pip install torch==2.2.1 vllm==0.5.1
# fails to install:
# pip install flash-attn==2.6.3
@brando90 Oh, set max_num_batched_tokens <= 65528, as mentioned in the error message.
https://github.com/unslothai/unsloth/wiki#saving-models-to-16bit-for-vllm should help!
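For completeness, a rough sketch of what that might look like when constructing the engine, assuming LLM() forwards these keywords to vLLM's EngineArgs (the values are illustrative):
# Hedged sketch: cap the context/batched-token limits so the LoRA kernel
# constraint (max_num_batched_tokens <= 65528) is satisfied; Qwen2's default
# max_seq_len of 131072 otherwise exceeds it.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2-1.5B",
    enable_lora=True,
    max_model_len=32768,            # illustrative context cap
    max_num_batched_tokens=32768,   # must be <= 65528 when LoRA is enabled
)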