Open werruww opened 3 hours ago
%%capture
output = quantized_model.generate(tokenizer("", return_tensors="pt")["input_ids"].cuda(), max_new_tokens=10)
AttributeError Traceback (most recent call last)
21 frames
/usr/local/lib/python3.10/dist-packages/torch/__init__.py in __getattr__(name)
   2560         return importlib.import_module(f".{name}", __name__)
   2561
-> 2562     raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
   2563
   2564
AttributeError: module 'torch' has no attribute 'Any'
AttributeError: module 'torch' has no attribute 'Any' is raised by the %%capture warm-up cell shown above.
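For context, the error seems to come from aqlm annotating arguments with torch.Any, an attribute that the installed PyTorch build no longer exposes (older builds happened to re-export typing names such as Any). A minimal workaround sketch, assuming torch.Any is only ever used as a type annotation, is to restore the attribute before the quantized forward pass runs:

# Workaround sketch (assumption: aqlm only uses torch.Any as an annotation).
# Restore the attribute that older PyTorch builds exposed, before running generate().
import typing

import torch

if not hasattr(torch, "Any"):
    torch.Any = typing.Any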
Efficiently serving Large Language Models in 2bit with aqlm and transformers compiled into a CUDA graph

Welcome to this notebook that goes through the recent aqlm integration that introduces efficient GPU utilization when serving LLMs quantized to 2bit.

In this notebook, we will learn how to load a large model in 2bit (Llama-2-7b) and compile a CUDA graph of it, to circumvent Python overhead when serving the model.

Install the aqlm library
- The only extra dependency to run AQLM models.
- Add [gpu] to install the required CUDA-specific dependencies.
- To use nice features like device_map you'll need to install accelerate. To properly support AQLM, you'd have to install the latest version straight from their GitHub (to catch PR#2376).

%%capture
!pip install aqlm[gpu]>=1.1.0
!pip install accelerate>=0.27.0
!pip install transformers>=4.41.0
https://github.com/Vahe1994/AQLM/issues/13
!pip install git+https://github.com/huggingface/accelerate.git@main
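Before loading the model, it may be worth confirming which versions actually ended up installed, since this error looks like a torch/aqlm version mismatch. A quick check (sketch):

# Environment check sketch: print the versions of the packages installed above.
from importlib.metadata import version

import torch

for pkg in ("aqlm", "accelerate", "transformers"):
    print(pkg, version(pkg))
print("torch", torch.__version__)
print("CUDA available:", torch.cuda.is_available())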
Load the model as usual
The tokenizer is just a normal Llama 2 tokenizer. (It works.)
from transformers import AutoTokenizer, AutoModelForCausalLM
quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Llama-2-70b-AQLM-2Bit-2x8-hf",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Llama-2-70b-AQLM-2Bit-2x8-hf")
CUDA_VISIBLE_DEVICES=7 RUN_SLOW=1
https://github.com/Vahe1994/AQLM/issues/13
https://github.com/huggingface/accelerate/tree/main/examples
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

quantized_model = AutoModelForCausalLM.from_pretrained(
    "BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x15-hf",
    trust_remote_code=True, torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True
)
Do a few forward passes to load CUDA and automatically compile the kernels. This is done separately here so that it does not affect the generation speed benchmark below.
%%capture
output = quantized_model.generate(tokenizer("", return_tensors="pt")["input_ids"].cuda(), max_new_tokens=10)
import torch

# Prepare the inputs, specifying the attention_mask
inputs = tokenizer("", return_tensors="pt")

# Make sure attention_mask and pad_token_id are set
inputs["attention_mask"] = torch.ones(inputs["input_ids"].shape, device=inputs["input_ids"].device)
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id

# Generate text with max_new_tokens set
output = quantized_model.generate(
    inputs["input_ids"].cuda(),
    max_new_tokens=10,
    attention_mask=inputs["attention_mask"].cuda(),
    pad_token_id=pad_token_id,
)
import torch

# Make sure inputs is created correctly
inputs = tokenizer("", return_tensors="pt")

# Make sure attention_mask and pad_token_id are set
inputs["attention_mask"] = torch.ones(inputs["input_ids"].shape, device=inputs["input_ids"].device)
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id

# Use model instead of quantized_model if that is the model that was loaded
output = quantized_model.generate(
    inputs["input_ids"].cuda(),
    max_new_tokens=10,
    attention_mask=inputs["attention_mask"].cuda(),
    pad_token_id=pad_token_id,
)
!pip install torch
Measure generation speed
import time
start = time.perf_counter()
output = quantized_model.generate(tokenizer("I'm AQLM, ", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=128, max_new_tokens=128)
end = time.perf_counter()
print(f"Generating at {128 / (end - start):.1f} tok/s")
Check that the output is what one would expect from Llama
print(tokenizer.decode(output[0]))
Compile a CUDA graph
Note that transformers generation itself is not the fastest implementation and it's heavily influenced by the CPU capabilities of Google Colab. We'll deal with that by using static caches and compiling the model's forward pass into a homogeneous CUDA graph, effectively removing Python's overhead.

We'll have to implement the logic around forward passes on our own, since CUDA graphs are not yet integrated into transformers.
import torch

def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_values):
    logits = model(
        cur_token,
        position_ids=input_pos,
        cache_position=cache_position,
        past_key_values=past_key_values,
        return_dict=False,
        use_cache=True
    )[0]
    new_token = torch.argmax(logits[:, -1], dim=-1)[:, None]
    return new_token

MAX_NEW_TOKENS = 128
Set up a static KV cache for generation
from transformers import StaticCache

input_ids = tokenizer("I'm AQLM, ", return_tensors="pt").to("cuda")["input_ids"]
seq_length = input_ids.shape[1]

past_key_values = StaticCache(
    quantized_model.config,
    1,
    seq_length + MAX_NEW_TOKENS * 2 + 1,
    quantized_model.device,
    quantized_model.dtype
)
Allocate token ids to be generated and copy prefix ids
cache_position = torch.arange(seq_length, device="cuda")
generated_ids = torch.zeros(1, seq_length + MAX_NEW_TOKENS * 2, dtype=torch.int, device="cuda")
generated_ids[:, cache_position] = input_ids.to("cuda").to(torch.int)
Do a forward pass to fill the prefix cache and compile the kernels if necessary
import torch

logits = quantized_model(
    input_ids, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True
)[0]
next_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)
generated_ids[:, [seq_length]] = next_token
Compile the CUDA graph with torch.compile and apply the forward pass repeatedly to generate text.

!pip install typing_extensions
# Patch for aqlm/inference.py: replace torch.Any with Any from typing_extensions
from typing_extensions import Any as AnyType

# ... (rest of your aqlm/inference.py code)

def _get_autograd_matmul_op(forward_pass_kernel, backward_pass_kernel):
    class _QuantizedMatmul(torch.autograd.Function):
        @staticmethod
        def forward(
            ctx: AnyType,  # Replace torch.Any with AnyType
            input: torch.Tensor,
            codes: torch.IntTensor,
            # ... (rest of the function)
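If you do edit the installed package as above, this snippet (the aqlm.inference module path is an assumption; adjust if the package layout differs) prints where aqlm/inference.py lives in the current environment:

# Locate the installed aqlm inference module so the torch.Any annotations can be patched.
import aqlm.inference

print(aqlm.inference.__file__)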
with torch.no_grad():
    # Compile the CUDA graph
    decode_one_tokens = torch.compile(decode_one_tokens, mode="reduce-overhead", fullgraph=True)

    # Generate tokens one by one
    cache_position = torch.tensor([seq_length + 1], device="cuda")
    for _ in range(1, MAX_NEW_TOKENS):
        with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
            next_token = decode_one_tokens(quantized_model, next_token.clone(), None, cache_position, past_key_values)
        generated_ids[:, cache_position] = next_token.int()
        cache_position += 1
print(tokenizer.decode(generated_ids[0]))
Continue the generation, measuring the speed
start = time.perf_counter()
with torch.no_grad():
    for _ in range(MAX_NEW_TOKENS):
        with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
            next_token = decode_one_tokens(quantized_model, next_token.clone(), None, cache_position, past_key_values)
        generated_ids[:, cache_position] = next_token.int()
        cache_position += 1
end = time.perf_counter()
print(f"Generating at {128 / (end - start):.1f} tok/s")
We achieved a 3x speedup over normal generation using CUDA graphs, and the generated text is almost identical, as it should be.
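As an optional sanity check (not part of the original notebook), the plain generate() output and the CUDA-graph output can be compared directly; under greedy decoding they should agree almost token for token:

# Sanity check sketch: compare the plain generate() result with the CUDA-graph result.
# Assumes `output` from the earlier benchmark cell is still in scope.
plain_ids = output[0].to("cuda")
graph_ids = generated_ids[0, : plain_ids.shape[0]].to(plain_ids.dtype)
agreement = (plain_ids == graph_ids).float().mean().item()
print(f"token agreement: {agreement:.1%}")
print(tokenizer.decode(graph_ids, skip_special_tokens=True))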
{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "w-_k4j9wm5GD" }, "source": [ "# Efficiently serving Large Language Models in 2bit with
aqlm
andtransformers
compiled into a CUDA graph\n", "\n", "<a target=\"_blank\" href=\"https://colab.research.google.com/github/Vahe1994/AQLM/blob/main/notebooks/aqlm_cuda_graph.ipynb\">\n", " <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n", "\n", "\n", "Welcome to this notebook that goes through the recentaqlm
integration that introduces efficient GPU utilization when serving LLMs quantized to 2bit.\n", "\n", "In this notebook, we will learn how to load a large model in 2bit (Llama-2-7b
) and comile a CUDA graph of it, to circumvent Python overhead whem serving the model.\n" ] }, { "cell_type": "markdown", "metadata": { "id": "6egoxPVyckBF" }, "source": [ "Install theaqlm
library\n", "- The only extra dependency to run AQLM models.\n", "- Add[gpu]
to install the required CUDA specific dependencies.\n", "- To use nice features likedevice_map
you'll need to install accelerate. To properly support AQLM, you'd have to install the latest version straight from their GitHub (to catch PR#2376)." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "A584OAwRWGks" }, "outputs": [], "source": [ "%%capture\n", "!pip install aqlm[gpu]>=1.1.0\n", "!pip install accelerate>=0.27.0\n", "!pip install transformers>=4.41.0" ] }, { "cell_type": "markdown", "metadata": { "id": "hTfcs4lrc1x4" }, "source": [ "Load the model as usual\n", "\n", "The tokenizer is just a normalLlama 2
tokenizer." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "lecaItWkVpIC" }, "outputs": [], "source": [ "from transformers import AutoTokenizer, AutoModelForCausalLM\n", "\n", "quantized_model = AutoModelForCausalLM.from_pretrained(\n", " \"ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf\",\n", " torch_dtype=\"auto\", device_map=\"auto\", low_cpu_mem_usage=True,\n", ")\n", "tokenizer = AutoTokenizer.from_pretrained(\"ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf\")" ] }, { "cell_type": "markdown", "metadata": { "id": "39QpRiPbcBYa" }, "source": [ "Do a few forward passes to load CUDA and automatically compile the kernels. It's done separately here for it not to affect the generation speed benchmark below." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Ii-mWRdQZCOF" }, "outputs": [], "source": [ "%%capture\n", "output = quantized_model.generate(tokenizer(\"\", return_tensors=\"pt\")[\"input_ids\"].cuda(), max_new_tokens=10)" ] }, { "cell_type": "markdown", "metadata": { "id": "zOQfeb_ScIyb" }, "source": [ "Measure generation speed" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Q2CZ9QrA1S0P" }, "outputs": [], "source": [ "import time" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "hyl4uCxTdmKi" }, "outputs": [], "source": [ "start = time.perf_counter()\n", "output = quantized_model.generate(tokenizer(\"I'm AQLM, \", return_tensors=\"pt\")[\"input_ids\"].cuda(), min_new_tokens=128, max_new_tokens=128)\n", "end = time.perf_counter()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-yaWulHS1eqa", "outputId": "e940864a-0639-4113-9071-4659b32939fe" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Generating at 8.5 tok/s\n" ] } ], "source": [ "print(f\"Generating at {128 / (end - start):.1f} tok/s\")" ] }, { "cell_type": "markdown", "metadata": { "id": "nvShqlguccep" }, "source": [ "Check that the output is what one would expect from Llama" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SsOmDVBvXobJ", "outputId": "b225a155-28bb-462e-dcae-fb1527bd78a6" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "I'm AQLM, 20, and I'm from the UK. I'm a student at the University of Nottingham, studying English and Creative Writing. I'm a huge fan of the Harry Potter series, and I'm also a huge fan of the Marvel Cinematic Universe. I'm also a huge fan of the DC Extended Universe, and I'm also a huge fan of the Star Wars franchise. I'm also a huge fan of the Marvel Cinematic Universe, and I'm also a huge fan of the DC Extended Universe, and I'm also a huge fan\n" ] } ], "source": [ "print(tokenizer.decode(output[0]))" ] }, { "cell_type": "markdown", "metadata": { "id": "p4ON1sMP2c2P" }, "source": [ "### Compile a CUDA graph\n", "\n", "Note thattransformers
generation itself is not the fastest implementation and it's heavily influenced by CPU capabilities of Google Colab. We'll deal with it by using static caches and compiling the model's forward pass into a homogeneous CUDA graph, effectively removing python's overhead." ] }, { "cell_type": "markdown", "metadata": { "id": "y70ZZqSi27uM" }, "source": [ "We'll have to implement the logic around forward passes on our own since CUDA graphs are not yet integrated into transformers" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "SoNpCyT72ffp" }, "outputs": [], "source": [ "import torch\n", "\n", "def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_values):\n", " logits = model(\n", " cur_token,\n", " position_ids=input_pos,\n", " cache_position=cache_position,\n", " past_key_values=past_key_values,\n", " return_dict=False,\n", " use_cache=True\n", " )[0]\n", " new_token = torch.argmax(logits[:, -1], dim=-1)[:, None]\n", " return new_token\n", "\n", "MAX_NEW_TOKENS = 128" ] }, { "cell_type": "markdown", "metadata": { "id": "Aly3cUrw3rvv" }, "source": [ "Setup static KV cache for generation" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "a9nlMa9S3S8h" }, "outputs": [], "source": [ "from transformers import StaticCache\n", "\n", "input_ids = tokenizer(\"I'm AQLM, \", return_tensors=\"pt\").to(\"cuda\")[\"input_ids\"]\n", "seq_length = input_ids.shape[1]\n", "\n", "past_key_values = StaticCache(\n", " quantized_model.config,\n", " 1,\n", " seq_length + MAX_NEW_TOKENS 2 + 1,\n", " quantized_model.device,\n", " quantized_model.dtype\n", ")" ] }, { "cell_type": "markdown", "metadata": { "id": "aPSedHaQ2gdN" }, "source": [ "Allocate token ids to be generated and copy prefix ids" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "LSbQq6As343w" }, "outputs": [], "source": [ "cache_position = torch.arange(seq_length, device=\"cuda\")\n", "generated_ids = torch.zeros(1, seq_length + MAX_NEW_TOKENS 2, dtype=torch.int, device=\"cuda\")\n", "generated_ids[:, cache_position] = input_ids.to(\"cuda\").to(torch.int)" ] }, { "cell_type": "markdown", "metadata": { "id": "xQu-Ppwo4Geu" }, "source": [ "Do a forward pass to fill the prefix cache and compile the kernels if necessary" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Spi5_rXb3_IP" }, "outputs": [], "source": [ "logits = quantized_model(\n", " input_ids, cache_position=cache_position, past_key_values=past_key_values,return_dict=False, use_cache=True\n", ")[0]\n", "next_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)\n", "generated_ids[:, [seq_length]] = next_token" ] }, { "cell_type": "markdown", "metadata": { "id": "j06LyMyo4SE-" }, "source": [ "Compile the CUDA graph withtorch.compile
and appply the forward pass repeatedly to generate text" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mRb_1-N-4KsA" }, "outputs": [], "source": [ "with torch.no_grad():\n", " # Compile the CUDA graph\n", " decode_one_tokens = torch.compile(decode_one_tokens, mode=\"reduce-overhead\", fullgraph=True)\n", "\n", " # Generate tokens one by one\n", " cache_position = torch.tensor([seqlength + 1], device=\"cuda\")\n", " for in range(1, MAX_NEW_TOKENS):\n", " with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):\n", " next_token = decode_one_tokens(quantized_model, next_token.clone(), None, cache_position, past_key_values)\n", " generated_ids[:, cache_position] = next_token.int()\n", " cache_position += 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RYX1klVj-u7-", "outputId": "ed96681b-da97-4886-ffb4-d3a050ddcc5e" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "I'm AQLM, 20, and I'm from the UK. I'm a student at the University of Nottingham, studying English and Creative Writing. I'm a huge fan of the Harry Potter series, and I'm also a huge fan of the Marvel Cinematic Universe. I'm also a huge fan of the DC Extended Universe, and I'm also a huge fan of the Star Wars franchise. I'm also a huge fan of the Marvel Cinematic Universe, and I'm also a huge fan of the DC Extended Universe, and I'm also a huge fan\n"
]
}
],
"source": [
"print(tokenizer.decode(generated_ids[0]))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "p60kWdwy5BLF"
},
"source": [
"Continue the generation mesuring the speed"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "mXGckNom4jBy"
},
"outputs": [],
"source": [
"start = time.perf_counter()\n",
"with torch.nograd():\n",
" for in range(MAX_NEW_TOKENS):\n",
" with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):\n",
" next_token = decode_one_tokens(quantized_model, next_token.clone(), None, cache_position, past_key_values)\n",
" generated_ids[:, cache_position] = next_token.int()\n",
" cache_position += 1\n",
"end = time.perf_counter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bUNO9J4-5UOa",
"outputId": "dda822ba-e7fb-483d-9f3a-215d62175b14"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Generating at 24.0 tok/s\n"
]
}
],
"source": [
"print(f\"Generating at {128 / (end - start):.1f} tok/s\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AXWMpf897N0H"
},
"source": [
"We achieved a 3x speedup over normal generation using CUDA graphs, and the generated text is almost identical, as it should be."
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
%%capture
!pip install aqlm[gpu]
!pip install accelerate
!pip install transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Llama-2-70b-AQLM-2Bit-2x8-hf",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf")
!pip uninstall torch torchvision torchaudio -y
!pip show torch
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
%%capture
from typing import Any as AnyType

output = quantized_model.generate(tokenizer("", return_tensors="pt")["input_ids"].cuda(), max_new_tokens=10)
import time

start = time.perf_counter()
output = quantized_model.generate(tokenizer("I'm AQLM, ", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=128, max_new_tokens=128)
end = time.perf_counter()
print(f"Generating at {128 / (end - start):.1f} tok/s")
print(tokenizer.decode(output[0]))
!pip install --upgrade torch

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from typing import Any as AnyType  # Add this line if not already present

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Llama-2-70b-AQLM-2Bit-2x8-hf",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf")
input_ids = tokenizer("", return_tensors="pt")["input_ids"].cuda() attention_mask = torch.ones_like(input_ids).cuda() # Create attention mask
output = quantized_model.generate( input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=10 )
output = quantized_model.generate( input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=10, pad_token_id=tokenizer.pad_token_id )
from transformers import StaticCache
import torch

# Set MAX_NEW_TOKENS if it is not already defined
MAX_NEW_TOKENS = 50  # example value, adjust to your needs

# Convert the text to tokens and store them on CUDA
input_ids = tokenizer("I'm AQLM, ", return_tensors="pt").to("cuda")["input_ids"]
seq_length = input_ids.shape[1]

# Create the StaticCache, taking the required size into account
past_key_values = StaticCache(
    quantized_model.config,
    1,
    seq_length + MAX_NEW_TOKENS * 2 + 1,
    quantized_model.device,
    quantized_model.dtype
)

# Set the cache positions
cache_position = torch.arange(seq_length, device="cuda")

# Create an empty tensor to store the generated ids
generated_ids = torch.zeros(1, seq_length + MAX_NEW_TOKENS * 2, dtype=torch.int, device="cuda")

# Write the input_ids values into the appropriate positions
generated_ids[:, cache_position] = input_ids.to("cuda").to(torch.int)

# Make sure the model is called correctly
logits = quantized_model(
    input_ids, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True
)[0]

# Extract the next token based on the logits
next_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)

# Add the next token to generated_ids
generated_ids[:, [seq_length]] = next_token

# Make sure input_ids has the correct dtype
input_ids = input_ids.to(torch.int64)

# Make sure cache_position has the correct dtype
cache_position = cache_position.to(torch.int64)

# Make sure past_key_values has the correct type (if such a conversion is needed)
past_key_values = past_key_values.to(torch.int64)  # if appropriate

# Make sure all inputs are on CUDA
input_ids = input_ids.to("cuda")
cache_position = cache_position.to("cuda")
past_key_values = past_key_values.to("cuda")

logits = quantized_model(input_ids, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True)
!git clone https://github.com/Vahe1994/AQLM.git
%cd AQLM
!pip install -r requirements.txt
from transformers import AutoTokenizer, AutoModelForCausalLM
quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Llama-2-70b-AQLM-2Bit-2x8-hf",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Llama-2-70b-AQLM-2Bit-2x8-hf")
%%capture
output = quantized_model.generate(tokenizer("", return_tensors="pt")["input_ids"].cuda(), max_new_tokens=10)
import time
start = time.perf_counter()
output = quantized_model.generate(tokenizer("I'm AQLM, ", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=128, max_new_tokens=128)
end = time.perf_counter()
print(f"Generating at {128 / (end - start):.1f} tok/s")
print(tokenizer.decode(output[0]))
!pip install git+https://github.com/huggingface/accelerate.git@main
from transformers import AutoTokenizer, AutoModelForCausalLM
quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf")
output = quantized_model.generate(tokenizer("", return_tensors="pt")["input_ids"].cuda(), max_new_tokens=10)
import time
start = time.perf_counter()
output = quantized_model.generate(tokenizer("I'm AQLM, ", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=128, max_new_tokens=128)
end = time.perf_counter()
print(f"Generating at {128 / (end - start):.1f} tok/s")
print(tokenizer.decode(output[0]))
ISTA-DASLab/Llama-2-70b-AQLM-2Bit-2x8-hf
from transformers import StaticCache
import torch

input_ids = tokenizer("I'm AQLM, ", return_tensors="pt").to("cuda")["input_ids"]
seq_length = input_ids.shape[1]

# Define MAX_NEW_TOKENS here
MAX_NEW_TOKENS = 128  # You can adjust this value as needed

past_key_values = StaticCache(
    quantized_model.config,
    1,
    seq_length + MAX_NEW_TOKENS * 2 + 1,
    quantized_model.device,
    quantized_model.dtype
)

cache_position = torch.arange(seq_length, device="cuda")
generated_ids = torch.zeros(1, seq_length + MAX_NEW_TOKENS * 2, dtype=torch.int, device="cuda")
generated_ids[:, cache_position] = input_ids.to("cuda").to(torch.int)

logits = quantized_model(
    input_ids, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True
)[0]
next_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)
generated_ids[:, [seq_length]] = next_token
# You need to define the decode_one_tokens function.
# Assuming it takes the arguments as shown in the loop below
# (see the full definition earlier in the notebook).
def decode_one_tokens(model, next_token, _, cache_position, past_key_values):
    # Replace this with the actual logic of your decode_one_tokens function
    ...

with torch.no_grad():
    # Compile the CUDA graph
    decode_one_tokens = torch.compile(decode_one_tokens, mode="reduce-overhead", fullgraph=True)

print(tokenizer.decode(generated_ids[0]))

start = time.perf_counter()
with torch.no_grad():
    for _ in range(MAX_NEW_TOKENS):
        with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
            next_token = decode_one_tokens(quantized_model, next_token.clone(), None, cache_position, past_key_values)
        generated_ids[:, cache_position] = next_token.int()
        cache_position += 1
end = time.perf_counter()

print(f"Generating at {128 / (end - start):.1f} tok/s")