Open aliseyfi opened 1 year ago
Hello @aliseyfi,
Are you able to provide a reproduction or more details? Does the sample code run correctly outside of a Jupyter context? The code below ran successfully for me on an inf2.24xlarge instance.
import os
from transformers_neuronx.gpt2.model import GPT2ForSampling
from transformers_neuronx.generation_utils import HuggingFaceGenerationModelAdapter
from transformers_neuronx.module import save_pretrained_split
from transformers import AutoModelForCausalLM, AutoTokenizer
os.environ['NEURON_CC_FLAGS'] = '--model-type=transformer-inference'
# Load and save the CPU model
model_cpu = AutoModelForCausalLM.from_pretrained('gpt2')
save_pretrained_split(model_cpu, 'gpt2-split')
# Create and compile the Neuron model
model_neuron = GPT2ForSampling.from_pretrained('gpt2-split', batch_size=2, tp_degree=2, n_positions=256, amp='f32', unroll=None)
model_neuron.to_neuron()
# Use the `HuggingFaceGenerationModelAdapter` to access the generate API
model = HuggingFaceGenerationModelAdapter(model_cpu.config, model_neuron)
# Get a tokenizer and example input
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'
text = "Hello, I'm a language model,"
encoded_input = tokenizer([text,text], return_tensors='pt', padding=True, truncation=True)
# Run inference using temperature
model.reset_generation()
sample_output = model.generate(
    input_ids=encoded_input.input_ids,
    attention_mask=encoded_input.attention_mask,
    do_sample=True,
    max_length=256,
    temperature=0.7,
)
print([tokenizer.decode(tok) for tok in sample_output])
Have you tried that snippet using inputs with different lengths?
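For example, a minimal variation of the snippet above with two prompts of different lengths (the prompt strings are arbitrary placeholders); with left padding, the shorter prompt is simply padded out to the length of the longer one:
texts = [
    "Hello, I'm a language model,",
    "The quick brown fox jumps over the lazy dog because",
]
encoded_input = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
model.reset_generation()
sample_output = model.generate(
    input_ids=encoded_input.input_ids,
    attention_mask=encoded_input.attention_mask,
    do_sample=True,
    max_length=256,
    temperature=0.7,
)
print([tokenizer.decode(tok) for tok in sample_output])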
Please see my code below, which kills the kernel. I'm using batch_size=8.
PS: Using neuron_model.sample() also kills the kernel if the number of input queries is anything other than the batch size (e.g. 2).
import os
import torch
from transformers_neuronx.gptneox.model import GPTNeoXForSampling
from transformers_neuronx.generation_utils import HuggingFaceGenerationModelAdapter
from transformers_neuronx.module import save_pretrained_split
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import GPTNeoXConfig, GPTNeoXForCausalLM, GPTNeoXTokenizerFast
os.environ['NEURON_CC_FLAGS'] = '--model-type=transformer-inference'
model_name = "EleutherAI/gpt-neox-20b"
splits_path = './gpt-neox-split'
config = GPTNeoXConfig.from_pretrained(model_name)
hf_model = GPTNeoXForCausalLM.from_pretrained(model_name, config=config)
def amp_callback(model, dtype):
    for block in model.gpt_neox.layers:
        block.attention.to(dtype)
        block.mlp.to(dtype)
    model.embed_out.to(dtype)

amp_callback(hf_model, torch.float16)
save_pretrained_split(hf_model, splits_path)
neuron_model = GPTNeoXForSampling.from_pretrained(splits_path, activation_function='gelu_new', batch_size=8, tp_degree=4, amp='f16')
neuron_model.to_neuron()
# Use the `HuggingFaceGenerationModelAdapter` to access the generate API
model = HuggingFaceGenerationModelAdapter(hf_model.config, neuron_model)
# Get a tokenizer and example input
tokenizer = GPTNeoXTokenizerFast.from_pretrained(model_name, padding_side='left')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
sentences = [
    "The quick brown fox jumps.",
    "A journey of a thousand miles definitely begins with a single step for the time being.",
    "There was a thunderstorm.",
    "Hello, I'm a language model, What day is today dude?",
    "This is a sample.",
    "This is another sample.",
    "This is an extra sample.",
    "This is the last sample.",
]
text = "Hello, I'm a language model,"
texts= [text, text]
encoded_input = tokenizer(sentences,
                          max_length=32,
                          padding='max_length',
                          truncation=True,
                          return_tensors='pt')
# Run inference using temperature
model.reset_generation()
sample_output = model.generate(
    input_ids=encoded_input.input_ids,
    attention_mask=encoded_input.attention_mask,
    do_sample=True,
    max_length=64,
    temperature=0.1,
)
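(For reference, this is the kind of batch padding I would expect to need so that the number of input sequences matches the compiled batch_size=8; this is only a sketch based on my assumption, not confirmed adapter behavior. It fills the batch with copies of the last prompt and drops the extra generations afterwards.)
compiled_batch_size = 8  # must match the batch_size passed to from_pretrained above
num_real = len(sentences)
# Assumption: pad the batch with copies of the last prompt so the input count
# matches the compiled batch size, then discard the padded outputs.
padded_sentences = sentences + [sentences[-1]] * (compiled_batch_size - num_real)
encoded_input = tokenizer(padded_sentences,
                          max_length=32,
                          padding='max_length',
                          truncation=True,
                          return_tensors='pt')
model.reset_generation()
sample_output = model.generate(
    input_ids=encoded_input.input_ids,
    attention_mask=encoded_input.attention_mask,
    do_sample=True,
    max_length=64,
    temperature=0.1,
)
outputs = [tokenizer.decode(tok) for tok in sample_output[:num_real]]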
I'm running this sample code [https://github.com/aws-neuron/transformers-neuronx#hugging-face-generate-api-support] with GPT-NeoX on an inf2.24xlarge instance, but the model.generate method kills the kernel in Jupyter. I am using padding and truncation in the tokenizer, and it fails whether I pass one input sequence or two. The batch size is 2.