johnsmith0031 / alpaca_lora_4bit


Problem with inference #119

Closed leexinyu1204 closed 1 year ago

leexinyu1204 commented 1 year ago

Using inference.py to test:

import os
import sys
import time
import torch
from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
from monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_gptq_lora_model
replace_peft_model_with_gptq_lora_model()

config_path = './llama7b-4bit-v2/'
model_path = './llama7b-4bit-v2/llama7b-4bit-ts-ao-g128-v2.safetensors'
model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=-1)

print('Fitting 4bit scales and zeros to half')
model.half()
for n, m in model.named_modules():
    if isinstance(m, Autograd4bitQuantLinear):
        if m.is_v1_model:
            m.zeros = m.zeros.half()
        m.scales = m.scales.half()
        m.bias = m.bias.half()

print('Apply AMP Wrapper ...')

prompt = 'I think the meaning of life is'
batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
batch = {k: v.cuda() for k, v in batch.items()}

start = time.time()
with torch.no_grad():
    generated = model.generate(inputs=batch["input_ids"],
                               do_sample=True, use_cache=True,
                               repetition_penalty=1.1,
                               max_new_tokens=20,
                               temperature=0.9,
                               top_p=0.95,
                               top_k=40,
                               return_dict_in_generate=True,
                               output_attentions=False,
                               output_hidden_states=False,
                               output_scores=False)
result_text = tokenizer.decode(generated['sequences'].cpu().tolist()[0])
end = time.time()
print(result_text)
print(end - start)

The error is:

Loading Model ...
The model weights are not tied. Please use the tie_weights method before using the infer_auto_device function.
The model weights are not tied. Please use the tie_weights method before using the infer_auto_device function.
The safetensors archive passed at ./llama7b-4bit-v2/llama7b-4bit-ts-ao-g128-v2.safetensors does not contain metadata. Make sure to save your model with the save_pretrained method. Defaulting to 'pt' metadata.
Loaded the model in 18.04 seconds.
Fitting 4bit scales and zeros to half
Apply AMP Wrapper ...
Traceback (most recent call last):
  File "inference.py", line 30, in <module>
    generated = model.generate(inputs=batch["input_ids"],
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/transformers/generation/utils.py", line 1572, in generate
    return self.sample(
  File "/usr/local/lib/python3.8/dist-packages/transformers/generation/utils.py", line 2655, in sample
    next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
RuntimeError: probability tensor contains either inf, nan or element < 0

I'm using Python 3.8 + CUDA 11.7.

rakovskij-stanislav commented 1 year ago

Hi!

I haven't tried your code yet (I'm fine-tuning a model right now), but I noticed an issue:

config_path = './llama7b-4bit-v2/'
model_path = './llama7b-4bit-v2/llama7b-4bit-ts-ao-g128-v2.safetensors'
model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=-1)

You are using groupsize=-1, but this model was quantized with groupsize=128 (note the -g128- in the filename). Set the correct value and try again.
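For example, a minimal sketch of the corrected call (groupsize=128 is my reading of the -g128- suffix in the checkpoint name, not something I have tested):

from autograd_4bit import load_llama_model_4bit_low_ram

config_path = './llama7b-4bit-v2/'
model_path = './llama7b-4bit-v2/llama7b-4bit-ts-ao-g128-v2.safetensors'
# groupsize must match the value the checkpoint was quantized with (128 here)
model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=128)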

johnsmith0031 commented 1 year ago

Also try this:

import matmul_utils_4bit
matmul_utils_4bit.act_order = True
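A minimal placement sketch of where those two lines go (my reading: the -ts-ao- part of the v2 checkpoint name marks an act-order quantization, which is why the flag matters; setting it right after the imports, before loading the model, is an assumption on my part):

import matmul_utils_4bit
from autograd_4bit import load_llama_model_4bit_low_ram

# Enable the act-order matmul path before the model is loaded or used
matmul_utils_4bit.act_order = True

config_path = './llama7b-4bit-v2/'
model_path = './llama7b-4bit-v2/llama7b-4bit-ts-ao-g128-v2.safetensors'
model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=128)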

leexinyu1204 commented 1 year ago

import os
import sys
import time
import torch
from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
from monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_gptq_lora_model
replace_peft_model_with_gptq_lora_model()

import matmul_utils_4bit
matmul_utils_4bit.act_order = True

config_path = './LLaMA-7B-4bit-128g/'
model_path = './LLaMA-7B-4bit-128g/llama-7b-4bit-128g.safetensors'
model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=128)

print('Fitting 4bit scales and zeros to half')
model.half()
for n, m in model.named_modules():
    if isinstance(m, Autograd4bitQuantLinear):
        if m.is_v1_model:
            m.zeros = m.zeros.half()
        m.scales = m.scales.half()
        m.bias = m.bias.half()

print('Apply AMP Wrapper ...')
from amp_wrapper import AMPWrapper
wrapper = AMPWrapper(model)
wrapper.apply_generate()

prompt = '''I think the meaning of life is'''
batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
batch = {k: v.cuda() for k, v in batch.items()}

start = time.time()
with torch.no_grad():
    generated = model.generate(inputs=batch["input_ids"],
                               do_sample=True, use_cache=True,
                               repetition_penalty=1.1,
                               max_new_tokens=20,
                               temperature=0.9,
                               top_p=0.95,
                               top_k=40,
                               return_dict_in_generate=True,
                               output_attentions=False,
                               output_hidden_states=False,
                               output_scores=False)
result_text = tokenizer.decode(generated['sequences'].cpu().tolist()[0])
end = time.time()
print(result_text)
print(end - start)

Still getting the same problem.

leexinyu1204 commented 1 year ago

> Hi!
>
> I haven't tried your code yet (I'm fine-tuning a model right now), but I noticed an issue:
>
> config_path = './llama7b-4bit-v2/'
> model_path = './llama7b-4bit-v2/llama7b-4bit-ts-ao-g128-v2.safetensors'
> model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=-1)
>
> You are using groupsize=-1, but this model was quantized with groupsize=128 (note the -g128- in the filename). Set the correct value and try again.

Still the same problem, and I don't know why. Here is my inference.py:

import os
import sys
import time
import torch
from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
from monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_gptq_lora_model
replace_peft_model_with_gptq_lora_model()

import matmul_utils_4bit
matmul_utils_4bit.act_order = True

config_path = './LLaMA-7B-4bit-128g/'
model_path = './LLaMA-7B-4bit-128g/llama-7b-4bit-128g.safetensors'
model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=128)

print('Fitting 4bit scales and zeros to half')
model.half()
for n, m in model.named_modules():
    if isinstance(m, Autograd4bitQuantLinear):
        if m.is_v1_model:
            m.zeros = m.zeros.half()
        m.scales = m.scales.half()
        m.bias = m.bias.half()

print('Apply AMP Wrapper ...')
from amp_wrapper import AMPWrapper
wrapper = AMPWrapper(model)
wrapper.apply_generate()

prompt = '''I think the meaning of life is'''
batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
batch = {k: v.cuda() for k, v in batch.items()}

start = time.time()
with torch.no_grad():
    generated = model.generate(inputs=batch["input_ids"],
                               do_sample=True, use_cache=True,
                               repetition_penalty=1.1,
                               max_new_tokens=20,
                               temperature=0.9,
                               top_p=0.95,
                               top_k=40,
                               return_dict_in_generate=True,
                               output_attentions=False,
                               output_hidden_states=False,
                               output_scores=False)
result_text = tokenizer.decode(generated['sequences'].cpu().tolist()[0])
end = time.time()
print(result_text)
print(end - start)

johnsmith0031 commented 1 year ago

Try recompiling the CUDA kernel:

pip uninstall gptq_llama
pip install git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
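
After reinstalling, a quick sanity check from the repo root (just a sketch; matmul_utils_4bit should pull in the recompiled CUDA extension when it is imported):

python -c "import matmul_utils_4bit; print('4-bit matmul kernel loaded')"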

leexinyu1204 commented 1 year ago

thanks, solved

leexinyu1204 commented 1 year ago

Recompiling the CUDA kernel solved the problem.