Install the required libraries and import everything the fine-tuning run needs
!pip install transformers datasets
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
Load the tokenizer and the base model to fine-tune
tokenizer = GPT2Tokenizer.from_pretrained("recbygus/gpt2_wikitext_3ep_infinitypreference_1ep")
model = GPT2LMHeadModel.from_pretrained("recbygus/gpt2_wikitext_3ep_infinitypreference_1ep")
# GPT-2 tokenizers usually lack a padding token; reuse the EOS token so padding='max_length' below does not error
tokenizer.pad_token = tokenizer.eos_token
Load the PersonaHub instruction dataset
dataset = load_dataset("proj-persona/PersonaHub", "instruction")
Inspect the dataset
print(dataset)
Define a function to preprocess the dataset
def preprocess_function(examples):
    return tokenizer(examples['prompt'], truncation=True, padding='max_length', max_length=128)
Split off a small test set and apply the preprocessing function to the dataset
# The instruction config ships a single train split, so carve out a held-out set for the evaluation step below
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
tokenized_datasets = dataset.map(preprocess_function, batched=True)
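A quick look at one tokenized record helps confirm the fields the data collator will consume later (optional check):
example = tokenized_datasets["train"][0]
print(list(example.keys()))        # should include input_ids and attention_mask alongside the original columns
print(len(example["input_ids"]))   # 128, because of padding='max_length'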
Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save results
    num_train_epochs=1,              # Number of training epochs
    per_device_train_batch_size=4,   # Batch size per device
    evaluation_strategy="epoch",     # Evaluate the model at the end of each epoch
    logging_dir='./logs',            # Directory to save logs
    save_steps=500,                  # Save a checkpoint every 500 steps
    fp16=True                        # Use mixed precision for faster training (requires a GPU)
)
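If the GPU runs out of memory with this batch size, gradient accumulation keeps the same effective batch size while lowering per-step memory; the variant below is an illustrative alternative, not a tuned configuration:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=2,   # smaller per-step batch to fit in memory
    gradient_accumulation_steps=2,   # 2 x 2 = effective batch size of 4
    evaluation_strategy="epoch",
    logging_dir='./logs',
    save_steps=500,
    fp16=True
)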
Create a data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # No masked language modeling; GPT-2 is trained with a causal (next-token) objective
)
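As an optional sanity check, the collator can be called directly on a couple of tokenized examples; with mlm=False it should stack them into tensors and add a labels field that mirrors input_ids (field names match the tokenizer output above):
features = [
    {"input_ids": tokenized_datasets["train"][i]["input_ids"],
     "attention_mask": tokenized_datasets["train"][i]["attention_mask"]}
    for i in range(2)
]
batch = data_collator(features)
print(batch["input_ids"].shape, batch["labels"].shape)  # expect (2, 128) for both, given max_length=128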
Initialize the Trainer with model, arguments, datasets, and data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],  # Training dataset
    eval_dataset=tokenized_datasets["test"],    # Held-out evaluation dataset
    data_collator=data_collator,                # Data collator for batching and label creation
)
Start the training process
trainer.train()
Evaluate the model's performance on the validation dataset
trainer.evaluate()
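The returned metrics include eval_loss, which can be converted into perplexity for a more interpretable number (small optional addition, assuming the default metric key name):
import math

eval_results = trainer.evaluate()
print(f"Eval loss: {eval_results['eval_loss']:.4f} | Perplexity: {math.exp(eval_results['eval_loss']):.2f}")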
Save the trained model and tokenizer locally
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")
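To confirm the export worked, the local directory can be loaded back exactly like a Hub checkpoint (optional check):
reloaded_model = GPT2LMHeadModel.from_pretrained("./trained_model")
reloaded_tokenizer = GPT2Tokenizer.from_pretrained("./trained_model")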
Log in and push the fine-tuned model to the Hugging Face Hub
from huggingface_hub import login

login("hf_...")  # Replace with your own Hugging Face access token; never hard-code real tokens
model.push_to_hub("gpt2_recbygus_ver3")
tokenizer.push_to_hub("gpt2_recbygus_ver3")
from google.colab import drive
drive.mount('/content/drive')
Generate text with the fine-tuned model
from transformers import pipeline
from huggingface_hub import login

login("hf_...")  # Replace with your own Hugging Face access token
Load the fine-tuned model from Hugging Face
generator = pipeline("text-generation", model="recbygus/gpt2_recbygus_ver3")
Generate text with the model
input_text = "How can one ride a motorcycle?"
output = generator(input_text, max_length=200, num_return_sequences=1, truncation=True)
Print the generated text
print(output[0]['generated_text'])
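Greedy-style defaults can produce repetitive text; sampling parameters usually give more varied output (the values below are illustrative starting points, not tuned):
output = generator(
    input_text,
    max_length=200,
    num_return_sequences=1,
    truncation=True,
    do_sample=True,    # sample from the distribution instead of always taking the most likely token
    temperature=0.8,   # values below 1 sharpen the distribution slightly
    top_p=0.95,        # nucleus sampling: keep the smallest token set covering 95% of the probability mass
)
print(output[0]['generated_text'])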