vllm-project / llm-compressor

Transformers-compatible library for applying various compression algorithms to LLMs for optimized deployment with vLLM
Apache License 2.0

Why do gpu0 and gpu1 have high utilization rates while the remaining eight cards have low utilization rates? #827

Closed: hadoop2xu closed this issue 1 week ago

hadoop2xu commented 2 weeks ago

My code is:

import torch
from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

MODEL_ID = "/mydata/Meta-Llama-3.1-70B-Instruct"

# adjust based on the number of desired GPUs
device_map = calculate_offload_device_map(
    MODEL_ID, reserve_for_hessians=True, num_gpus=8, torch_dtype="auto"
)

model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map=device_map, torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 8
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }

ds = ds.map(preprocess)

# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )

ds = ds.map(tokenize, remove_columns=ds.column_names)

# define a llmcompressor recipe for W8A8 quantization
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(
        targets="Linear", scheme="W8A8", ignore=["lm_head"], sequential_update=True
    ),
]

SAVE_DIR = MODEL_ID.split("/")[-1] + "-INT8"  # use the final path component as the output directory name

oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    save_compressed=True,
    output_dir=SAVE_DIR,
)

Log: [screenshot of per-GPU utilization attached]

kylesayrs commented 1 week ago

@hadoop2xu Thank you for your issue! As you can see from inspecting the device_map, the entire Meta-Llama-3.1-70B-Instruct model fits on just three 80 GB GPUs, with most of the layers landing on the first two.

Greedily assigning modules to devices in this way minimizes the amount of inter-device data movement and leads to faster compression.
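
For intuition on why only three devices are needed, here is a rough back-of-envelope estimate of the bf16 weight footprint (this ignores the extra memory that calculate_offload_device_map reserves for Hessians and activations):

# Rough estimate of the bf16/fp16 weight footprint of a ~70B-parameter model.
num_params = 70e9          # ~70 billion parameters
bytes_per_param = 2        # 2 bytes per parameter in bf16/fp16
weight_gb = num_params * bytes_per_param / 1e9
print(f"~{weight_gb:.0f} GB of weights")   # ~140 GB, which packs into roughly 2-3 x 80 GB GPUs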

OrderedDict(
    [
        ("model.embed_tokens", 0),
        ("model.layers.0", 0),
        ("model.layers.1", 0),
        ("model.layers.2", 0),
        ("model.layers.3", 0),
        ("model.layers.4", 0),
        ("model.layers.5", 0),
        ("model.layers.6", 0),
        ("model.layers.7", 0),
        ("model.layers.8", 0),
        ("model.layers.9", 0),
        ("model.layers.10", 0),
        ("model.layers.11", 0),
        ("model.layers.12", 0),
        ("model.layers.13", 0),
        ("model.layers.14", 0),
        ("model.layers.15", 0),
        ("model.layers.16", 0),
        ("model.layers.17", 0),
        ("model.layers.18", 0),
        ("model.layers.19", 0),
        ("model.layers.20", 0),
        ("model.layers.21", 0),
        ("model.layers.22", 0),
        ("model.layers.23", 0),
        ("model.layers.24", 0),
        ("model.layers.25", 0),
        ("model.layers.26", 0),
        ("model.layers.27", 0),
        ("model.layers.28", 0),
        ("model.layers.29", 0),
        ("model.layers.30", 0),
        ("model.layers.31", 0),
        ("model.layers.32", 0),
        ("model.layers.33", 0),
        ("model.layers.34", 0),
        ("model.layers.35", 0),
        ("model.layers.36", 0),
        ("model.layers.37", 0),
        ("model.layers.38", 0),
        ("model.layers.39", 1),
        ("model.layers.40", 1),
        ("model.layers.41", 1),
        ("model.layers.42", 1),
        ("model.layers.43", 1),
        ("model.layers.44", 1),
        ("model.layers.45", 1),
        ("model.layers.46", 1),
        ("model.layers.47", 1),
        ("model.layers.48", 1),
        ("model.layers.49", 1),
        ("model.layers.50", 1),
        ("model.layers.51", 1),
        ("model.layers.52", 1),
        ("model.layers.53", 1),
        ("model.layers.54", 1),
        ("model.layers.55", 1),
        ("model.layers.56", 1),
        ("model.layers.57", 1),
        ("model.layers.58", 1),
        ("model.layers.59", 1),
        ("model.layers.60", 1),
        ("model.layers.61", 1),
        ("model.layers.62", 1),
        ("model.layers.63", 1),
        ("model.layers.64", 1),
        ("model.layers.65", 1),
        ("model.layers.66", 1),
        ("model.layers.67", 1),
        ("model.layers.68", 1),
        ("model.layers.69", 1),
        ("model.layers.70", 1),
        ("model.layers.71", 1),
        ("model.layers.72", 1),
        ("model.layers.73", 1),
        ("model.layers.74", 1),
        ("model.layers.75", 1),
        ("model.layers.76", 1),
        ("model.layers.77", 1),
        ("model.layers.78", 1),
        ("model.layers.79", 1),
        ("model.norm", 1),
        ("model.rotary_emb", 1),
        ("lm_head", 2),
    ]
)
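
If you would rather spread the layers more evenly across all of your GPUs (for example to leave headroom on the first devices), one option is to load the model with one of accelerate's balanced device-map strategies instead of the greedy offload map. A minimal sketch, assuming the standard transformers/accelerate device_map options pass through SparseAutoModelForCausalLM unchanged (not verified against every llm-compressor version):

from llmcompressor.transformers import SparseAutoModelForCausalLM

MODEL_ID = "/mydata/Meta-Llama-3.1-70B-Instruct"  # same model path as in the script above

# "balanced" asks accelerate to even out the per-GPU memory allocation
# rather than greedily filling the first devices.
model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="balanced",
    torch_dtype="auto",
)

Note that the greedy map shown above is usually the faster choice for one-shot compression, since it keeps adjacent layers on the same device and minimizes inter-device traffic.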

Feel free to reopen if you have any further questions!