Closed. werruww closed this issue 1 week ago.
from huggingface_hub import snapshot_download
import torch
from accelerate import infer_auto_device_map
from transformers import AutoModelForCausalLM, AutoConfig

checkpoint = "marcsun13/gpt2-xl-linear-sharded"
weights_location = snapshot_download(repo_id=checkpoint)

# Instead of loading directly from the checkpoint, use 'gpt2-xl' as the base
# config and load the sharded weights into it.
config = AutoConfig.from_pretrained("gpt2-xl")  # config for gpt2-xl

# Load the model using the gpt2-xl configuration and the downloaded sharded weights.
model = AutoModelForCausalLM.from_pretrained(
    weights_location, config=config, torch_dtype=torch.float16, ignore_mismatched_sizes=True
)

# Now pass the model object to infer_auto_device_map.
device_map = infer_auto_device_map(
    model, max_memory={0: "10GiB", "cpu": "10GiB"}
)

from accelerate import init_empty_weights
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = 'gpt2-xl'
model_config.vocab_size = 50257
model_config.block_size = 1024
with init_empty_weights():
    model = GPT(model_config)

from accelerate import load_checkpoint_and_dispatch

model = load_checkpoint_and_dispatch(
    model, checkpoint=weights_location, device_map="auto", no_split_module_classes=['Block']
)
model.hf_device_map

from mingpt.bpe import BPETokenizer

tokenizer = BPETokenizer()
inputs = tokenizer("Who is Napoleon Bonaparte?").to(0)
# Use 'inputs' instead of 'x1' here.
outputs = model.generate(inputs, max_new_tokens=512, do_sample=False)[0]
tokenizer.decode(outputs.cpu().squeeze())
Fetching 9 files: 100% 9/9 [00:00<00:00, 370.91it/s]
Loading checkpoint shards: 100% 7/7 [00:01<00:00, 4.30it/s]
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at /root/.cache/huggingface/hub/models--marcsun13--gpt2-xl-linear-sharded/snapshots/aeb281f0cd2bfc947d4702b27aecd9194c322c7e and are newly initialized because the shapes did not match:
- transformer.h.0.attn.c_attn.weight: found shape torch.Size([4800, 1600]) in the checkpoint and torch.Size([1600, 4800]) in the model instantiated
- transformer.h.0.mlp.c_fc.weight: found shape torch.Size([6400, 1600]) in the checkpoint and torch.Size([1600, 6400]) in the model instantiated
- transformer.h.0.mlp.c_proj.weight: found shape torch.Size([1600, 6400]) in the checkpoint and torch.Size([6400, 1600]) in the model instantiated
[... the same transposed-shape mismatch is reported for attn.c_attn, mlp.c_fc, and mlp.c_proj in every remaining block, transformer.h.1 through transformer.h.47 ...]
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
number of parameters: 1557.61M
Who is Napoleon Bonaparte?

Napoleon Bonaparte was a French general who led the French army during the French Revolution. He was the first to use the term "Napoleon" to describe himself.

What is the name of the French Revolution?

The French Revolution was a period of political and social upheaval in France that began in 1789. It was the first of the French revolutions, and was the first to be led by a man.

[... the same question-and-answer pair repeats twice more before the output is cut off mid-word ...]
If I have a single 16 GB Vega GPU and a CPU, how do I run a model larger than 16 GB split across the Vega and the CPU, so that I still benefit from the Vega's acceleration? Is the code I ran correct, or can it be modified to get good results?
What are the steps, from A to Z, to run a model larger than the 16 GB Vega across the GPU and the CPU: downloading the model, creating an empty model, loading the weights into it, and then running it on a prompt or completing text?
@werruww please do not spam this with nearly the same result. It makes us think that this is an LLM instead of a real problem, and bloats our notifications as well
In general, use device_map="auto";
accelerate will fit as much of the model as it can on the GPU, offload the rest to the CPU or hard drive, and run the model from there.
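For reference, a minimal sketch of that advice with a stock transformers model (the repo id, dtype, and offload folder here are placeholders, not from this thread):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# device_map="auto" lets accelerate place as many layers as fit on the GPU,
# spill the remainder to CPU RAM, and then to disk if offload_folder is set.
model = AutoModelForCausalLM.from_pretrained(
    "gpt2-xl",                 # placeholder repo id
    device_map="auto",
    torch_dtype=torch.float16,
    offload_folder="offload",  # only used if CPU RAM also runs out
)
tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")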
from huggingface_hub import snapshot_download

checkpoint = "marcsun13/gpt2-xl-linear-sharded"
weights_location = snapshot_download(repo_id=checkpoint)

from accelerate import init_empty_weights
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = 'gpt2-xl'
model_config.vocab_size = 50257
model_config.block_size = 1024

with init_empty_weights():
    model = GPT(model_config)

from accelerate import load_checkpoint_and_dispatch

model = load_checkpoint_and_dispatch(
    model, checkpoint=weights_location, device_map="auto", no_split_module_classes=['Block']
)

from mingpt.bpe import BPETokenizer

tokenizer = BPETokenizer()
inputs = tokenizer("who is python?").to(0)

outputs = model.generate(inputs, max_new_tokens=512, do_sample=False)[0]
tokenizer.decode(outputs.cpu().squeeze())
device_map="auto" instead? Like this?

model = load_checkpoint_and_dispatch(
    model, checkpoint=weights_location, device_map="auto", no_split_module_classes=['Block']
)

This is the code; what is the modification?
Remove the

device_map = infer_auto_device_map(
    model, max_memory={0: "10GiB", "cpu": "10GiB"}
)

call, and just pass device_map="auto".
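With that change, the dispatch call from the snippet above would read (a sketch, same variables as before):

# No infer_auto_device_map needed; accelerate computes the placement itself.
model = load_checkpoint_and_dispatch(
    model,
    checkpoint=weights_location,
    device_map="auto",
    no_split_module_classes=['Block'],
)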
If I may ask: could you write one complete piece of code that I can trust? I copied the snippets from the site somewhat at random. It would reassure me to have complete code that builds an empty model, fills it with the weights, and runs it on the Vega and then on the CPU, regardless of the model's size.
If possible, a Colab page with a 24 GB TPU running a model larger than 24 GB, to make things concrete. Thank you.
ValueError                                Traceback (most recent call last)
2 frames
/usr/local/lib/python3.10/dist-packages/accelerate/utils/modeling.py in set_module_tensor_to_device(module, tensor_name, device, value, dtype, fp16_statistics, tied_params_map)
    371     # In other cases, we want to make sure we're not loading checkpoints that do not match the config.
    372     if old_value.shape != value.shape and param_cls.__name__ != "Params4bit":
--> 373         raise ValueError(
    374             f'Trying to set a tensor of shape {value.shape} in "{tensor_name}" (which has shape {old_value.shape}), this looks incorrect.'
    375         )

ValueError: Trying to set a tensor of shape torch.Size([32768, 4096]) in "weight" (which has shape torch.Size([32768, 768])), this looks incorrect.
The code:
from huggingface_hub import snapshot_download

checkpoint = "mistralai/Mistral-7B-Instruct-v0.3"
weights_location = snapshot_download(repo_id=checkpoint)

from accelerate import init_empty_weights
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = 'gpt2'
model_config.vocab_size = 32768
model_config.block_size = 768
model_config.hidden_size = 768
with init_empty_weights():
    model = GPT(model_config)

from accelerate import init_empty_weights
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = 'mistral'
model_config.vocab_size = 32000  # vocabulary size for Mistral
model_config.block_size = 4096   # maximum context length
model_config.n_layer = 32        # number of layers
model_config.n_head = 32         # number of attention heads
model_config.n_embd = 4096       # hidden embedding size

with init_empty_weights():
    model = GPT(model_config)

from accelerate import load_checkpoint_and_dispatch

model = load_checkpoint_and_dispatch(
    model, checkpoint=weights_location, device_map="auto", no_split_module_classes=['Block']
)

model.hf_device_map

from mingpt.bpe import BPETokenizer

tokenizer = BPETokenizer()
inputs = tokenizer("Who is Napoleon Bonaparte?").to(0)

outputs = model.generate(inputs, max_new_tokens=1024, do_sample=False)[0]
tokenizer.decode(outputs.cpu().squeeze())
Extended vocabulary to 32768
I ran the code on a Colab T4 with 12 GB of RAM.
ValueError                                Traceback (most recent call last)
2 frames
/usr/local/lib/python3.10/dist-packages/accelerate/utils/modeling.py in set_module_tensor_to_device(module, tensor_name, device, value, dtype, fp16_statistics, tied_params_map)
    371     # In other cases, we want to make sure we're not loading checkpoints that do not match the config.
    372     if old_value.shape != value.shape and param_cls.__name__ != "Params4bit":
--> 373         raise ValueError(
    374             f'Trying to set a tensor of shape {value.shape} in "{tensor_name}" (which has shape {old_value.shape}), this looks incorrect.'
    375         )

ValueError: Trying to set a tensor of shape torch.Size([32768, 4096]) in "weight" (which has shape torch.Size([32768, 768])), this looks incorrect.
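Both tracebacks point at the same root cause: the empty model was built with minGPT's GPT-2 shapes (hidden size 768), while the Mistral checkpoint stores, for example, a [32768, 4096] embedding, and minGPT's GPT class cannot represent Mistral's architecture at all. A quick way to inspect what a checkpoint actually contains before dispatching (a sketch, assuming the shards are *.safetensors files):

import glob
import os
from safetensors import safe_open

# Print the first few tensor names and shapes from the downloaded shards.
shards = sorted(glob.glob(os.path.join(weights_location, "*.safetensors")))
with safe_open(shards[0], framework="pt") as f:
    for name in list(f.keys())[:5]:
        print(name, f.get_slice(name).get_shape())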
from huggingface_hub import snapshot_download

checkpoint = "openai-community/gpt2"
weights_location = snapshot_download(repo_id=checkpoint)

import torch.nn as nn  # import the torch.nn module and alias it as nn
from accelerate import init_empty_weights

with init_empty_weights():
    model = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])

import torch
import torch.nn as nn
from huggingface_hub import snapshot_download
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from transformers import GPT2LMHeadModel

checkpoint = "openai-community/gpt2"
weights_location = snapshot_download(repo_id=checkpoint)

model = GPT2LMHeadModel.from_pretrained(checkpoint, torch_dtype=torch.float16)

model = load_checkpoint_and_dispatch(
    model,
    checkpoint=weights_location,
    device_map="auto",
    offload_folder="offload_folder",  # use a folder name, not "True"
    no_split_module_classes=['Block']
)

import torch.nn as nn
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoModelForCausalLM
from huggingface_hub import snapshot_download

checkpoint = "openai-community/gpt2"
weights_location = snapshot_download(repo_id=checkpoint)

model = AutoModelForCausalLM.from_pretrained(checkpoint)

model = load_checkpoint_and_dispatch(
    model,
    checkpoint=weights_location,
    device_map="auto",
    offload_folder="offload_folder"  # a real folder name, not the string "True"
)

import torch
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

inputs = tokenizer("Hello, my name is", return_tensors="pt").input_ids.to("cpu")

outputs = model.generate(inputs, max_new_tokens=10, do_sample=False)[0]
decoded_output = tokenizer.decode(outputs.cpu().squeeze().tolist())
print(decoded_output)
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884
  warnings.warn(
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Hello, my name is John. I'm a writer, and I'm
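A sketch addressing those two warnings: pass the attention mask and a pad token id explicitly so generation is well defined (same model and tokenizer as above):

# Encode once and reuse both input_ids and attention_mask.
enc = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(
    enc.input_ids,
    attention_mask=enc.attention_mask,
    pad_token_id=tokenizer.eos_token_id,  # silences the open-end generation note
    max_new_tokens=10,
    do_sample=False,
)
print(tokenizer.decode(outputs[0]))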
Colab, with no T4 and no TPU.
How do I create a model outside the GPT family and without minGPT,
like Mistral, Phi-3.5, Llama-3.1, or Qwen?
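One possible pattern (a sketch, not from this thread): let transformers build the matching architecture on the meta device and let accelerate dispatch it; the same code works for Mistral, Phi-3.5, Llama-3.1, or Qwen by changing the repo id:

import torch
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

model_id = "mistralai/Mistral-7B-Instruct-v0.3"  # any causal-LM repo id; gated repos need a login
weights_location = snapshot_download(repo_id=model_id)

# Build the correct architecture from its own config, with no memory allocated.
config = AutoConfig.from_pretrained(model_id)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.float16)

# Fill it shard by shard, splitting across GPU, CPU RAM, and disk automatically.
model = load_checkpoint_and_dispatch(
    model,
    checkpoint=weights_location,
    device_map="auto",
    offload_folder="offload",
    no_split_module_classes=model._no_split_modules,  # architecture-specific (private attribute)
)

tokenizer = AutoTokenizer.from_pretrained(model_id)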
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.
Please note that issues that do not follow the contributing guidelines are likely to be ignored.
System Info
Information
Tasks
no_trainer script in the examples folder of the transformers repo (such as run_no_trainer_glue.py)
Reproduction
!git clone https://github.com/karpathy/minGPT.git
!pip install minGPT/
!pip install huggingface_hub
!pip install accelerate --upgrade
Expected behavior
The code runs well:

python is a popular open source Python library for data analysis. It is used by many Python developers to perform data analysis tasks. Python is a very popular programming language. It is used by many people to do many things. [... "Python is a very popular programming language. It is used by many people to do many things." repeats for the rest of the generation ...]
who is python?
I'm not sure. I'm not sure. I'm not sure. [... "I'm not sure." repeated for the remainder of the generation ...]