Open aliseyfi opened 1 year ago
Hello @aliseyfi,
Are you able to provide a reproduction or more details? Does the sample code run correctly outside of a Jupyter context? The code below ran successfully for me on an inf2.24xlarge instance.
import os
from transformers_neuronx.gpt2.model import GPT2ForSampling
from transformers_neuronx.generation_utils import HuggingFaceGenerationModelAdapter
from transformers_neuronx.module import save_pretrained_split
from transformers import AutoModelForCausalLM, AutoTokenizer
os.environ['NEURON_CC_FLAGS'] = '--model-type=transformer-inference'
# Load and save the CPU model
model_cpu = AutoModelForCausalLM.from_pretrained('gpt2')
save_pretrained_split(model_cpu, 'gpt2-split')
# Create and compile the Neuron model
model_neuron = GPT2ForSampling.from_pretrained('gpt2-split', batch_size=2, tp_degree=2, n_positions=256, amp='f32', unroll=None)
model_neuron.to_neuron()
# Use the `HuggingFaceGenerationModelAdapter` to access the generate API
model = HuggingFaceGenerationModelAdapter(model_cpu.config, model_neuron)
# Get a tokenizer and example input
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'
text = "Hello, I'm a language model,"
encoded_input = tokenizer([text,text], return_tensors='pt', padding=True, truncation=True)
# Run inference using temperature
model.reset_generation()
sample_output = model.generate(
    input_ids=encoded_input.input_ids,
    attention_mask=encoded_input.attention_mask,
    do_sample=True,
    max_length=256,
    temperature=0.7,
)
print([tokenizer.decode(tok) for tok in sample_output])
Have you tried that snippet using inputs with different lengths?
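For example, a minimal variation of the snippet above with two prompts of different lengths (the prompt strings are arbitrary placeholders); with left padding, the shorter prompt is simply padded out to the length of the longer one:
texts = [
    "Hello, I'm a language model,",
    "The quick brown fox jumps over the lazy dog because",
]
encoded_input = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
model.reset_generation()
sample_output = model.generate(
    input_ids=encoded_input.input_ids,
    attention_mask=encoded_input.attention_mask,
    do_sample=True,
    max_length=256,
    temperature=0.7,
)
print([tokenizer.decode(tok) for tok in sample_output])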
Please see my code below, which kills the kernel. I'm using batch_size=8.
PS: Using neuron_model.sample() also kills the kernel if the number of input queries is anything other than the batch size (e.g. 2).
import os
import torch
from transformers_neuronx.gptneox.model import GPTNeoXForSampling
from transformers_neuronx.generation_utils import HuggingFaceGenerationModelAdapter
from transformers_neuronx.module import save_pretrained_split
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import GPTNeoXConfig, GPTNeoXForCausalLM, GPTNeoXTokenizerFast
os.environ['NEURON_CC_FLAGS'] = '--model-type=transformer-inference'
model_name = "EleutherAI/gpt-neox-20b"
splits_path = './gpt-neox-split'
config = GPTNeoXConfig.from_pretrained(model_name)
hf_model = GPTNeoXForCausalLM.from_pretrained(model_name, config=config)
def amp_callback(model, dtype):
    for block in model.gpt_neox.layers:
        block.attention.to(dtype)
        block.mlp.to(dtype)
    model.embed_out.to(dtype)

amp_callback(hf_model, torch.float16)
save_pretrained_split(hf_model, splits_path)
neuron_model = GPTNeoXForSampling.from_pretrained(splits_path, activation_function='gelu_new', batch_size=8, tp_degree=4, amp='f16')
neuron_model.to_neuron()
# Use the `HuggingFaceGenerationModelAdapter` to access the generate API
model = HuggingFaceGenerationModelAdapter(hf_model.config, neuron_model)
# Get a tokenizer and example input
tokenizer = GPTNeoXTokenizerFast.from_pretrained(model_name, padding_side='left')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
sentences = [
    "The quick brown fox jumps.",
    "A journey of a thousand miles definitely begins with a single step for the time being.",
    "There was a thunderstorm.",
    "Hello, I'm a language model, What day is today dude?",
    "This is a sample.",
    "This is another sample.",
    "This is an extra sample.",
    "This is the last sample.",
]
text = "Hello, I'm a language model,"
texts= [text, text]
encoded_input = tokenizer(sentences,
                          max_length=32,
                          padding='max_length',
                          truncation=True,
                          return_tensors='pt')
# Run inference using temperature
model.reset_generation()
sample_output = model.generate(
    input_ids=encoded_input.input_ids,
    attention_mask=encoded_input.attention_mask,
    do_sample=True,
    max_length=64,
    temperature=0.1,
)
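(For reference, this is the kind of batch padding I would expect to need so that the number of input sequences matches the compiled batch_size=8; this is only a sketch based on my assumption, not confirmed adapter behavior. It fills the batch with copies of the last prompt and drops the extra generations afterwards.)
compiled_batch_size = 8  # must match the batch_size passed to from_pretrained above
num_real = len(sentences)
# Assumption: pad the batch with copies of the last prompt so the input count
# matches the compiled batch size, then discard the padded outputs.
padded_sentences = sentences + [sentences[-1]] * (compiled_batch_size - num_real)
encoded_input = tokenizer(padded_sentences,
                          max_length=32,
                          padding='max_length',
                          truncation=True,
                          return_tensors='pt')
model.reset_generation()
sample_output = model.generate(
    input_ids=encoded_input.input_ids,
    attention_mask=encoded_input.attention_mask,
    do_sample=True,
    max_length=64,
    temperature=0.1,
)
outputs = [tokenizer.decode(tok) for tok in sample_output[:num_real]]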
I'm running this sample code [https://github.com/aws-neuron/transformers-neuronx#hugging-face-generate-api-support] with GPT-NeoX on an inf2.24xlarge instance, but the model.generate method kills the kernel in Jupyter. I am using padding and truncation in the tokenizer, and it fails whether I pass one input sequence or two. The batch size is 2.