NVIDIA / apex

A PyTorch Extension: Tools for easy mixed precision and distributed training in Pytorch

Performance metrics are different after loading saved model #632

Open 0x7A31C7 opened 4 years ago

0x7A31C7 commented 4 years ago

Description

I have noticed that my model's performance metrics (precision, recall, accuracy) differ between the model in memory (evaluated just after training, before serialization) and the same model after it has been saved and reloaded.
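
In other words (a minimal sketch of the comparison; evaluate() and build_model() are hypothetical stand-ins for the full scripts below):

# sketch only: evaluate() and build_model() are hypothetical helpers
metrics_in_memory = evaluate(model, test_dl)    # right after training
torch.save({'model_state': model.state_dict()}, 'model.pt')

fresh = build_model()                           # new model, then amp.initialize(...)
fresh.load_state_dict(torch.load('model.pt')['model_state'])
metrics_reloaded = evaluate(fresh, test_dl)     # noticeably different from metrics_in_memory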

To Reproduce

$ cat train.py


import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from apex import amp
from transformers import AdamW, XLNetTokenizer, XLNetForSequenceClassification
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import classification_report
from keras_preprocessing.sequence import pad_sequences

categories = ['rec.sport.hockey', 'soc.religion.christian', 'rec.motorcycles']
remove = ('headers', 'footers')
random_state = 42
model_class = 'xlnet-base-cased'
max_len = 64
batch_size = 16
lr = 1e-4
fp16_opt_level = 'O1'

torch.manual_seed(random_state)
np.random.seed(random_state)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train = fetch_20newsgroups(subset='train', categories=categories, remove=remove, shuffle=True, random_state=random_state)
test = fetch_20newsgroups(subset='test', categories=categories, remove=remove, shuffle=True, random_state=random_state)
tokenizer = XLNetTokenizer.from_pretrained(model_class)
model = XLNetForSequenceClassification.from_pretrained(model_class, num_labels=len(categories))
model.cuda()

def make_inputs(data):
    # tokenize, truncate to max_len (re-appending the special-token tail), pad, and build attention masks
    unpadded = []
    for text in data['data']:
        token_ids = tokenizer.encode(text)
        truncated = token_ids[:max_len-2] + tokenizer.encode(' ')
        unpadded.append(truncated)
    # pad with the first id produced by encoding the empty string
    input_ids = pad_sequences(unpadded, maxlen=max_len, padding='post', truncating='post', value=tokenizer.encode('')[0])
    attention_masks = (input_ids != tokenizer.encode('')[0]).astype('int')
    return TensorDataset(torch.tensor(input_ids, dtype=torch.int64),
                         torch.tensor(attention_masks, dtype=torch.int64),
                         torch.tensor(data['target'], dtype=torch.int64))

train_tensor = make_inputs(train)
train_sampler = RandomSampler(train_tensor)
train_dl = DataLoader(train_tensor, sampler=train_sampler, batch_size=batch_size)

test_tensor = make_inputs(test)
test_sampler = SequentialSampler(test_tensor)
test_dl = DataLoader(test_tensor, sampler=test_sampler, batch_size=batch_size)

named_params = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
# AdamW applies decay from the 'weight_decay' key in each param group ('weight_decay_rate' is silently ignored)
optimizer_grouped_parameters = [
    {'params': [param for name, param in named_params if all(nd not in name for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [param for name, param in named_params if any(nd in name for nd in no_decay)], 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
# patch the model and optimizer for mixed precision (O1 keeps fp32 weights and casts ops to fp16 where safe)
model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

# training loop (note: backward() is called on the raw loss; apex's amp.scale_loss context is not used here)
epochs = 1
for _ in range(epochs):
    model.train()
    for batch in tqdm(train_dl):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        loss, _ = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss.backward()
        optimizer.step()

# evaluate the in-memory model right after training
model.eval()
all_preds = []
all_labels = []
for batch in tqdm(test_dl):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
    logits = logits.detach().cpu().numpy()
    preds = np.argmax(logits, axis=1).flatten()
    labels = b_labels.to('cpu').numpy().flatten()
    all_preds.append(preds)
    all_labels.append(labels)
all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)
print(classification_report(all_labels, all_preds))

# save the model weights only (no optimizer or amp state)
torch.save({'model_state': model.state_dict()}, 'model.pt')

$ cat test.py


import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from apex import amp
from transformers import AdamW, XLNetTokenizer, XLNetForSequenceClassification
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import classification_report
from keras_preprocessing.sequence import pad_sequences

categories = ['rec.sport.hockey', 'soc.religion.christian', 'rec.motorcycles']
remove = ('headers', 'footers')
random_state = 42
model_class = 'xlnet-base-cased'
max_len = 64
batch_size = 16
lr = 1e-4
fp16_opt_level = 'O1'

torch.manual_seed(random_state)
np.random.seed(random_state)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
test = fetch_20newsgroups(subset='test', categories=categories, remove=remove, shuffle=True, random_state=random_state)
tokenizer = XLNetTokenizer.from_pretrained(model_class)

def make_inputs(data):
    unpadded = []
    for text in data['data']:
        token_ids = tokenizer.encode(text)
        truncated = token_ids[:max_len-2] + tokenizer.encode(' ')
        unpadded.append(truncated)
    input_ids = pad_sequences(unpadded, maxlen=max_len, padding='post', truncating='post', value=tokenizer.encode('')[0])
    attention_masks = (input_ids != tokenizer.encode('')[0]).astype('int')
    return TensorDataset(torch.tensor(input_ids, dtype=torch.int64), 
                         torch.tensor(attention_masks, dtype=torch.int64), torch.tensor(data['target'], dtype=torch.int64))

test_tensor = make_inputs(test)
test_sampler = SequentialSampler(test_tensor)
test_dl = DataLoader(test_tensor, sampler=test_sampler, batch_size=batch_size)

# load: rebuild the model, re-initialize amp with the same opt_level, then restore the weights
model = XLNetForSequenceClassification.from_pretrained(model_class, num_labels=len(categories))
model.cuda()
model = amp.initialize(model, opt_level=fp16_opt_level)  # no optimizer needed for inference
model.load_state_dict(torch.load('model.pt')['model_state'])

# evaluate the reloaded model on the same test set
model.eval()
all_preds = []
all_labels = []
for batch in tqdm(test_dl):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
    logits = logits.detach().cpu().numpy()
    preds = np.argmax(logits, axis=1).flatten()
    labels = b_labels.to('cpu').numpy().flatten()
    all_preds.append(preds)
    all_labels.append(labels)
all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)
print(classification_report(all_labels, all_preds))
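
For reference, the apex docs describe a checkpointing pattern that saves and restores the amp state alongside the model and optimizer, which my scripts above do not do. A minimal sketch of that pattern:

# sketch following the apex docs' checkpointing recipe, not my repro above:
# save amp state together with the model and optimizer state
checkpoint = {
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'amp': amp.state_dict()
}
torch.save(checkpoint, 'amp_checkpoint.pt')

# to restore: run amp.initialize first, with the same opt_level, then load all three
model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
checkpoint = torch.load('amp_checkpoint.pt')
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
amp.load_state_dict(checkpoint['amp'])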

lbin commented 4 years ago

I'm also running into this problem.

Shijihao commented 4 years ago

I'm seeing this as well.