sfc-gh-zhwang opened 3 months ago
The code is below:
```python
import argparse

from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig


def main():
    parser = argparse.ArgumentParser(description='Quantize a pre-trained model using AutoFP8')
    parser.add_argument('--model', type=str, required=True, help='Directory of the pre-trained model')
    parser.add_argument('--output', type=str, required=True, help='Directory to save the quantized model')
    args = parser.parse_args()

    # For dynamic activation scales, there is no need for calibration examples
    examples = []

    quantize_config = BaseQuantizeConfig(
        quant_method="fp8",
        activation_scheme="dynamic",
    )

    model = AutoFP8ForCausalLM.from_pretrained(args.model, quantize_config=quantize_config)
    model.quantize(examples)
    model.save_quantized(args.output)


if __name__ == "__main__":
    main()
```
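(For context: the empty `examples` list only works because `activation_scheme="dynamic"` skips calibration. With `activation_scheme="static"`, tokenized calibration data is needed; a minimal sketch following the pattern in the AutoFP8 README, where the model name and sample text are just placeholders:)

```python
from transformers import AutoTokenizer
from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

pretrained = "meta-llama/Meta-Llama-3-8B-Instruct"  # placeholder model, not the 405B one

tokenizer = AutoTokenizer.from_pretrained(pretrained, use_fast=True)
# Static activation scales are computed from calibration data, so tokenize a few samples.
examples = tokenizer(["auto_fp8 calibration sample text"], return_tensors="pt").to("cuda")

quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static")
model = AutoFP8ForCausalLM.from_pretrained(pretrained, quantize_config=quantize_config)
model.quantize(examples)
model.save_quantized("Meta-Llama-3-8B-Instruct-FP8")
```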
Hey @sfc-gh-zhwang, AutoFP8 doesn't support quantizing a model that large when the whole model can't fit in a single node's memory. We recommend using llm-compressor for this; here is an example: https://huggingface.co/neuralmagic/Meta-Llama-3.1-405B-Instruct-FP8-dynamic#creation
```python
import torch
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import (  # noqa
    calculate_offload_device_map,
    custom_offload_device_map,
)

recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: float
                        strategy: channel
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: float
                        strategy: token
                        dynamic: true
                        symmetric: true
                    targets: ["Linear"]
"""

model_stub = "meta-llama/Meta-Llama-3.1-405B-Instruct"
model_name = model_stub.split("/")[-1]

device_map = calculate_offload_device_map(
    model_stub, reserve_for_hessians=False, num_gpus=8, torch_dtype=torch.float16
)

model = SparseAutoModelForCausalLM.from_pretrained(
    model_stub, torch_dtype=torch.float16, device_map=device_map
)

output_dir = f"./{model_name}-FP8-dynamic"

oneshot(
    model=model,
    recipe=recipe,
    output_dir=output_dir,
    save_compressed=True,
    tokenizer=AutoTokenizer.from_pretrained(model_stub),
)
```
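Once `oneshot` finishes, the compressed FP8 checkpoint in `output_dir` can be loaded directly for inference. A minimal sketch with vLLM (the `tensor_parallel_size`, prompt, and sampling settings are just examples; pick values that match your hardware):

```python
from vllm import LLM, SamplingParams

# Load the compressed checkpoint written by oneshot() above.
# tensor_parallel_size=8 is only an example; match it to your GPU count.
llm = LLM(model="./Meta-Llama-3.1-405B-Instruct-FP8-dynamic", tensor_parallel_size=8)

sampling_params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["What does FP8 dynamic quantization change at inference time?"], sampling_params)
print(outputs[0].outputs[0].text)
```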