angelovAlex opened 1 year ago
You can play with the code. I have no plan to implement it myself, but I know that this will work:
def load_model_and_offload(llm_config, checkpoint, half=False, backend='triton', lora_path=None, max_memory=None):
    # Default split: keep roughly 13 GiB on GPU 0 and offload the rest to CPU RAM
    if max_memory is None:
        max_memory = {0: '13GiB', 'cpu': '25GiB'}

    config = RWConfig.from_pretrained(llm_config.hf_config_name)
    config.max_seq_len = llm_config.max_seq_len

    assert config.alibi is False

    if half:
        torch.set_default_dtype(torch.half)

    # Build the model skeleton without allocating real weights
    with accelerate.init_empty_weights():
        ql = importlib.import_module(f'falcontune.backend.{backend}.quantlinear')

        model = RWForCausalLM(config)
        model = model.eval()
        layers = find_layers(model)

        for name in ['lm_head']:
            if name in layers:
                del layers[name]

        replace_4bit_linear(
            model,
            layers,
            llm_config.bits,
            llm_config.groupsize,
            quantlinear_class=ql.QuantLinear
        )

    # Load the quantized checkpoint entirely on CPU first
    accelerate.load_checkpoint_in_model(model, checkpoint=checkpoint, device_map={'': 'cpu'})
    model.loaded_in_4bit = True

    if lora_path is not None:
        model = PeftModel.from_pretrained(
            model, lora_path,
            device_map={'': 'cpu'},
            torch_dtype=torch.float32,
            is_trainable=True)
        logger.info('{} Lora Applied.'.format(lora_path))

    model.seqlen = llm_config.max_seq_len

    for n, m in model.named_modules():
        if isinstance(m, ql.QuantLinear) or isinstance(m, Linear4bitLt):
            m.scales = m.scales.half()
            m.bias = m.bias.half()

    # Split the layers between GPU 0 and CPU according to max_memory,
    # keeping each DecoderLayer on a single device
    device_map = accelerate.infer_auto_device_map(
        model, max_memory=max_memory,
        no_split_module_classes=["DecoderLayer"])

    model = accelerate.dispatch_model(
        model, device_map=device_map,
        offload_buffers=True, main_device=0)

    torch.cuda.empty_cache()
    logger.info('Total {:.2f} GiB VRAM used.'.format(torch.cuda.memory_allocated() / 1024 / 1024 / 1024))

    tokenizer = transformers.AutoTokenizer.from_pretrained(llm_config.hf_config_name)
    tokenizer.truncation_side = 'left'

    tokenizer.bos_token_id = config.bos_token_id
    tokenizer.eos_token_id = config.eos_token_id
    tokenizer.pad_token_id = config.eos_token_id

    return model, tokenizer
Also, some minor changes have to be made (sketched after the list below):
1) don't register inv_freq
2) don't register wf
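Concretely, "don't register" means turning the register_buffer() calls into plain attribute assignments, so accelerate's device map and offload hooks leave those tensors alone. A minimal sketch, assuming the upstream layout (inv_freq lives in RotaryEmbedding in modelling_RW.py, wf in the backend's QuantLinear; exact names may differ in your checkout):

import torch

class RotaryEmbedding(torch.nn.Module):
    """Sketch of the inv_freq change in modelling_RW.py."""
    def __init__(self, head_dim: int, base: int = 10000):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
        # was: self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.inv_freq = inv_freq  # plain attribute, ignored by accelerate's hooks

# The same idea applies to the bit-unpacking tensor in QuantLinear.__init__:
#   was: self.register_buffer('wf', wf_tensor)
#   now: self.wf = wf_tensor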
With those changes, Falcon-40B-GPTQ can be loaded and run inference on a 16 GB T4 GPU, although it is a lot slower.
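For reference, a minimal usage sketch under those assumptions; the checkpoint filename is a placeholder, and llm_config is whatever falcontune normally builds for the model (hf_config_name, max_seq_len, bits, groupsize):

model, tokenizer = load_model_and_offload(
    llm_config,
    checkpoint='gptq_model-4bit--1g.safetensors',  # placeholder path to the GPTQ weights
    half=True,
    backend='triton',
    max_memory={0: '13GiB', 'cpu': '25GiB'},  # ~13 GiB on the T4, rest offloaded to RAM
)

prompt = 'Write a haiku about GPUs.'
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(0)
with torch.no_grad():
    output = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))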
Sorry, I have not touched it for a few weeks. I think inference works fine with CPU offloading, but tuning crashes with an error that all tensors are expected to be on the same device. For the same reason, it does not seem possible to finetune it on multiple GPUs either.
Is it possible to reduce the required VRAM by offloading to RAM? From my experiments with LoRA, models are able to pick up logic from a custom dataset with very little data. It would be great to be able to finetune it slowly but with less VRAM.