vllm-project / llm-compressor

Transformers-compatible library for applying various compression algorithms to LLMs for optimized deployment with vLLM
Apache License 2.0

Why do gpu0 and gpu1 have high utilization rates while the remaining eight cards have low utilization rates? #827

Closed: hadoop2xu closed this issue 1 week ago

hadoop2xu commented 2 weeks ago

My code is:

import torch
from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

MODEL_ID = "/mydata/Meta-Llama-3.1-70B-Instruct"

# adjust based on the number of desired GPUs
device_map = calculate_offload_device_map(
    MODEL_ID, reserve_for_hessians=True, num_gpus=8, torch_dtype="auto"
)

model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map=device_map, torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 8
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }

ds = ds.map(preprocess)

# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )

ds = ds.map(tokenize, remove_columns=ds.column_names)

# define a llmcompressor recipe for W8A8 quantization
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(
        targets="Linear", scheme="W8A8", ignore=["lm_head"], sequential_update=True
    ),
]

SAVE_DIR = MODEL_ID.split("/")[-1] + "-INT8"  # use the final path component as the output directory name

oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    save_compressed=True,
    output_dir=SAVE_DIR,
)

Log: [screenshot of per-GPU utilization attached]

kylesayrs commented 1 week ago

@hadoop2xu Thank you for your issue! As you can see from inspecting the device_map, the entire Meta-Llama-3.1-70B-Instruct model fits on just three 80 GB GPUs, with most of the layers landing on the first two.

Greedily assigning modules to devices in this way minimizes the amount of inter-device data movement and leads to faster compression.
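
For intuition on why only three devices are needed, here is a rough back-of-envelope estimate of the bf16 weight footprint (this ignores the extra memory that calculate_offload_device_map reserves for Hessians and activations):

# Rough estimate of the bf16/fp16 weight footprint of a ~70B-parameter model.
num_params = 70e9          # ~70 billion parameters
bytes_per_param = 2        # 2 bytes per parameter in bf16/fp16
weight_gb = num_params * bytes_per_param / 1e9
print(f"~{weight_gb:.0f} GB of weights")   # ~140 GB, which packs into roughly 2-3 x 80 GB GPUs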

OrderedDict(
    [
        ("model.embed_tokens", 0),
        ("model.layers.0", 0),
        ("model.layers.1", 0),
        ("model.layers.2", 0),
        ("model.layers.3", 0),
        ("model.layers.4", 0),
        ("model.layers.5", 0),
        ("model.layers.6", 0),
        ("model.layers.7", 0),
        ("model.layers.8", 0),
        ("model.layers.9", 0),
        ("model.layers.10", 0),
        ("model.layers.11", 0),
        ("model.layers.12", 0),
        ("model.layers.13", 0),
        ("model.layers.14", 0),
        ("model.layers.15", 0),
        ("model.layers.16", 0),
        ("model.layers.17", 0),
        ("model.layers.18", 0),
        ("model.layers.19", 0),
        ("model.layers.20", 0),
        ("model.layers.21", 0),
        ("model.layers.22", 0),
        ("model.layers.23", 0),
        ("model.layers.24", 0),
        ("model.layers.25", 0),
        ("model.layers.26", 0),
        ("model.layers.27", 0),
        ("model.layers.28", 0),
        ("model.layers.29", 0),
        ("model.layers.30", 0),
        ("model.layers.31", 0),
        ("model.layers.32", 0),
        ("model.layers.33", 0),
        ("model.layers.34", 0),
        ("model.layers.35", 0),
        ("model.layers.36", 0),
        ("model.layers.37", 0),
        ("model.layers.38", 0),
        ("model.layers.39", 1),
        ("model.layers.40", 1),
        ("model.layers.41", 1),
        ("model.layers.42", 1),
        ("model.layers.43", 1),
        ("model.layers.44", 1),
        ("model.layers.45", 1),
        ("model.layers.46", 1),
        ("model.layers.47", 1),
        ("model.layers.48", 1),
        ("model.layers.49", 1),
        ("model.layers.50", 1),
        ("model.layers.51", 1),
        ("model.layers.52", 1),
        ("model.layers.53", 1),
        ("model.layers.54", 1),
        ("model.layers.55", 1),
        ("model.layers.56", 1),
        ("model.layers.57", 1),
        ("model.layers.58", 1),
        ("model.layers.59", 1),
        ("model.layers.60", 1),
        ("model.layers.61", 1),
        ("model.layers.62", 1),
        ("model.layers.63", 1),
        ("model.layers.64", 1),
        ("model.layers.65", 1),
        ("model.layers.66", 1),
        ("model.layers.67", 1),
        ("model.layers.68", 1),
        ("model.layers.69", 1),
        ("model.layers.70", 1),
        ("model.layers.71", 1),
        ("model.layers.72", 1),
        ("model.layers.73", 1),
        ("model.layers.74", 1),
        ("model.layers.75", 1),
        ("model.layers.76", 1),
        ("model.layers.77", 1),
        ("model.layers.78", 1),
        ("model.layers.79", 1),
        ("model.norm", 1),
        ("model.rotary_emb", 1),
        ("lm_head", 2),
    ]
)
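
If you would rather spread the layers more evenly across all of your GPUs (for example to leave headroom on the first devices), one option is to load the model with one of accelerate's balanced device-map strategies instead of the greedy offload map. A minimal sketch, assuming the standard transformers/accelerate device_map options pass through SparseAutoModelForCausalLM unchanged (not verified against every llm-compressor version):

from llmcompressor.transformers import SparseAutoModelForCausalLM

MODEL_ID = "/mydata/Meta-Llama-3.1-70B-Instruct"  # same model path as in the script above

# "balanced" asks accelerate to even out the per-GPU memory allocation
# rather than greedily filling the first devices.
model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="balanced",
    torch_dtype="auto",
)

Note that the greedy map shown above is usually the faster choice for one-shot compression, since it keeps adjacent layers on the same device and minimizes inter-device traffic.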

Feel free to reopen if you have any further questions!