I have noticed that my model's performance metrics (precision, recall, accuracy) differ between the in-memory model (just after training, before serialization) and the same model after it has been saved to disk and loaded again.
To Reproduce
$ cat train.py
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from apex import amp
from transformers import AdamW, XLNetTokenizer, XLNetForSequenceClassification
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import classification_report
from keras_preprocessing.sequence import pad_sequences
# --- Experiment configuration ---------------------------------------------
categories = ['rec.sport.hockey', 'soc.religion.christian', 'rec.motorcycles']
remove = ('headers', 'footers')  # passed to fetch_20newsgroups(remove=...)
random_state = 42
model_class = 'xlnet-base-cased'
max_len = 64        # fixed length every tokenised document is cut/padded to
batch_size = 16
lr = 1e-4
fp16_opt_level = 'O1'  # opt_level handed to apex.amp.initialize

# --- Reproducibility ------------------------------------------------------
torch.manual_seed(random_state)
np.random.seed(random_state)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Data -----------------------------------------------------------------
train = fetch_20newsgroups(subset='train', categories=categories, remove=remove, shuffle=True, random_state=random_state)
test = fetch_20newsgroups(subset='test', categories=categories, remove=remove, shuffle=True, random_state=random_state)

# --- Tokenizer and model --------------------------------------------------
tokenizer = XLNetTokenizer.from_pretrained(model_class)
model = XLNetForSequenceClassification.from_pretrained(model_class, num_labels=len(categories))
# Put the model on the same device the batches are moved to below, instead of
# an unconditional .cuda() that contradicts the CPU fallback in `device`.
model.to(device)
def make_inputs(data):
    """Turn a 20-newsgroups bunch into a ``TensorDataset`` for the classifier.

    Each document in ``data['data']`` is tokenised, cut to ``max_len - 2``
    ids, and terminated with the tokenizer's separator and classifier tokens.
    Sequences are then right-padded to ``max_len`` with the pad token id.

    Args:
        data: Mapping with a ``'data'`` entry (iterable of texts) and a
            ``'target'`` entry (integer class labels, one per text).

    Returns:
        TensorDataset of ``(input_ids, attention_masks, targets)``, all
        ``torch.int64``. The attention mask is 1 wherever the id is not the
        pad id and 0 elsewhere.
    """
    # NOTE(review): the original built these ids with tokenizer.encode(''),
    # which looks like markup-stripped '<sep><cls>' / '<pad>' literals (the
    # `max_len - 2` slice reserves exactly two slots) -- confirm against the
    # author's script. The explicit id attributes avoid relying on encode()
    # of a literal.
    sep_id = tokenizer.sep_token_id
    cls_id = tokenizer.cls_token_id
    pad_id = tokenizer.pad_token_id

    unpadded = []
    for text in data['data']:
        # Special tokens are appended by hand below, so keep encode() from
        # adding its own (which truncation could otherwise cut off).
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        unpadded.append(token_ids[:max_len - 2] + [sep_id, cls_id])

    input_ids = pad_sequences(unpadded, maxlen=max_len, padding='post', truncating='post', value=pad_id)
    attention_masks = (input_ids != pad_id).astype('int')
    return TensorDataset(
        torch.tensor(input_ids, dtype=torch.int64),
        torch.tensor(attention_masks, dtype=torch.int64),
        torch.tensor(data['target'], dtype=torch.int64),
    )
# Tokenise both splits up front.
train_tensor = make_inputs(train)
test_tensor = make_inputs(test)

# Training uses a RandomSampler; evaluation uses a SequentialSampler.
train_sampler = RandomSampler(train_tensor)
test_sampler = SequentialSampler(test_tensor)

train_dl = DataLoader(dataset=train_tensor, batch_size=batch_size, sampler=train_sampler)
test_dl = DataLoader(dataset=test_tensor, batch_size=batch_size, sampler=test_sampler)
# Split parameters into two groups: names containing any `no_decay` fragment
# get no weight decay, everything else gets 0.01.
named_params = list(model.named_parameters())
# NOTE(review): 'gamma'/'beta' are kept from the original list; whether they
# match any parameter name of this model is not visible here -- confirm.
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {
        'params': [param for name, param in named_params if not any(nd in name for nd in no_decay)],
        # The optimizer reads the per-group key 'weight_decay'; the previous
        # 'weight_decay_rate' key was silently ignored, so no decay was applied.
        'weight_decay': 0.01,
    },
    {
        'params': [param for name, param in named_params if any(nd in name for nd in no_decay)],
        'weight_decay': 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
# Wrap model and optimizer for mixed precision at the configured opt level.
model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)
#training loop
epochs = 1
for _ in range(epochs):
    model.train()
    for batch in tqdm(train_dl):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # With labels supplied the loss is the first output; index it rather
        # than unpacking, so extra outputs do not break the loop and the
        # epoch variable `_` is not overwritten.
        loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)[0]
        # The model/optimizer were wrapped by amp.initialize, so backward must
        # run on the scaled loss; calling loss.backward() directly bypasses
        # amp's loss scaling.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()
#evaluate
model.eval()
all_preds, all_labels = [], []
# Gradients are not needed anywhere in the evaluation pass.
with torch.no_grad():
    for batch in tqdm(test_dl):
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0].detach().cpu().numpy()
        # Predicted class = index of the largest logit in each row.
        all_preds.append(np.argmax(logits, axis=1).flatten())
        all_labels.append(b_labels.to('cpu').numpy().flatten())

# Stitch the per-batch arrays together and report per-class metrics.
all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)
print(classification_report(all_labels, all_preds))
#save model
# Only the model weights are stored, under the 'model_state' key.
checkpoint = {'model_state': model.state_dict()}
torch.save(checkpoint, 'model.pt')
$ cat test.py
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from apex import amp
from transformers import AdamW, XLNetTokenizer, XLNetForSequenceClassification
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import classification_report
from keras_preprocessing.sequence import pad_sequences
# --- Experiment configuration ---------------------------------------------
random_state = 42
model_class = 'xlnet-base-cased'
categories = ['rec.sport.hockey', 'soc.religion.christian', 'rec.motorcycles']
remove = ('headers', 'footers')  # passed to fetch_20newsgroups(remove=...)
max_len = 64
batch_size = 16
lr = 1e-4
fp16_opt_level = 'O1'  # opt_level handed to apex.amp.initialize

# --- Reproducibility ------------------------------------------------------
torch.manual_seed(random_state)
np.random.seed(random_state)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Evaluation data and tokenizer ----------------------------------------
test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=remove,
    shuffle=True,
    random_state=random_state,
)
tokenizer = XLNetTokenizer.from_pretrained(model_class)
def make_inputs(data):
    """Turn a 20-newsgroups bunch into a ``TensorDataset`` for the classifier.

    Each document in ``data['data']`` is tokenised, cut to ``max_len - 2``
    ids, and terminated with the tokenizer's separator and classifier tokens.
    Sequences are then right-padded to ``max_len`` with the pad token id.

    Args:
        data: Mapping with a ``'data'`` entry (iterable of texts) and a
            ``'target'`` entry (integer class labels, one per text).

    Returns:
        TensorDataset of ``(input_ids, attention_masks, targets)``, all
        ``torch.int64``. The attention mask is 1 wherever the id is not the
        pad id and 0 elsewhere.
    """
    # NOTE(review): the original built these ids with tokenizer.encode(''),
    # which looks like markup-stripped '<sep><cls>' / '<pad>' literals (the
    # `max_len - 2` slice reserves exactly two slots) -- confirm against the
    # author's script. The explicit id attributes avoid relying on encode()
    # of a literal.
    sep_id = tokenizer.sep_token_id
    cls_id = tokenizer.cls_token_id
    pad_id = tokenizer.pad_token_id

    unpadded = []
    for text in data['data']:
        # Special tokens are appended by hand below, so keep encode() from
        # adding its own (which truncation could otherwise cut off).
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        unpadded.append(token_ids[:max_len - 2] + [sep_id, cls_id])

    input_ids = pad_sequences(unpadded, maxlen=max_len, padding='post', truncating='post', value=pad_id)
    attention_masks = (input_ids != pad_id).astype('int')
    return TensorDataset(
        torch.tensor(input_ids, dtype=torch.int64),
        torch.tensor(attention_masks, dtype=torch.int64),
        torch.tensor(data['target'], dtype=torch.int64),
    )
# Tokenise the test split and iterate it with a SequentialSampler.
test_tensor = make_inputs(test)
test_sampler = SequentialSampler(test_tensor)
test_dl = DataLoader(dataset=test_tensor, batch_size=batch_size, sampler=test_sampler)
#load
# Rebuild the architecture from the pretrained checkpoint, then overwrite its
# weights with the fine-tuned ones saved under 'model_state' in model.pt.
model = XLNetForSequenceClassification.from_pretrained(model_class, num_labels=len(categories))
# Use the same `device` the batches are moved to, not an unconditional .cuda().
model.to(device)
model = amp.initialize(model, opt_level=fp16_opt_level)
# map_location keeps the load working when the checkpoint was written from a
# different device than the one available now.
checkpoint = torch.load('model.pt', map_location=device)
model.load_state_dict(checkpoint['model_state'])
# NOTE(review): the cause of the reported metric mismatch is not visible in
# this script; the changes above only make device handling consistent.
#evaluate
model.eval()
all_preds, all_labels = [], []
# Gradients are not needed anywhere in the evaluation pass.
with torch.no_grad():
    for batch in tqdm(test_dl):
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0].detach().cpu().numpy()
        # Predicted class = index of the largest logit in each row.
        all_preds.append(np.argmax(logits, axis=1).flatten())
        all_labels.append(b_labels.to('cpu').numpy().flatten())

# Stitch the per-batch arrays together and report per-class metrics.
all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)
print(classification_report(all_labels, all_preds))
Run train.py to train the model, evaluate it and save it to disk.
Description
I have noticed that my model's performance metrics (precision, recall, accuracy) differ between the in-memory model (just after training, before serialization) and the same model after it has been saved to disk and loaded again.
To Reproduce
$ cat train.py
$ cat test.py
Run `train.py` to train the model, evaluate it, and save it to disk.
Run `test.py` to load the saved model and evaluate it.