Open Pin-Jiun opened 1 year ago
We have three kinds of datasets:
train
: for trainingdev
: for validationtest
: for testing (w/o target value)The COVID19Dataset
below does:
.csv
filescovid.train.csv
into train/dev setsFinishing TODO
below might make you pass medium baseline.
class COVID19Dataset(Dataset):
''' Dataset for loading and preprocessing the COVID19 dataset '''
def __init__(self,
path,
mode='train',
target_only=False):
self.mode = mode
# Read data into numpy arrays
with open(path, 'r') as fp:
#先將全部data讀取
data = list(csv.reader(fp))
#np.array(data[1:]) 先捨棄掉第一列不是數據的部分,並轉化成np.array
#[:, 1:].astype(float) 捨棄第一行, 因為此行是sample index, 並將型態轉成float
data = np.array(data[1:])[:, 1:].astype(float)
if not target_only:
#共有40+18+18+18=94個features
feats = list(range(93))
else:
# TODO: Using 40 states & 2 tested_positive features (indices = 57 & 75)
pass
if mode == 'test':
# Testing data
# data: 893 x 93 (40 states + day 1 (18) + day 2 (18) + day 3 (17))
data = data[:, feats]
self.data = torch.FloatTensor(data)
else:
# Training data (train/dev sets)
# data: 2700 x 94 (40 states + day 1 (18) + day 2 (18) + day 3 (18))
target = data[:, -1]
data = data[:, feats]
# Splitting training data into train & dev sets
#將sample index = 10的倍數設定為dev sets
if mode == 'train':
indices = [i for i in range(len(data)) if i % 10 != 0]
elif mode == 'dev':
indices = [i for i in range(len(data)) if i % 10 == 0]
# Convert data into PyTorch tensors
self.data = torch.FloatTensor(data[indices])
self.target = torch.FloatTensor(target[indices])
# Normalize features (you may remove this part to see what will happen)
self.data[:, 40:] = \
(self.data[:, 40:] - self.data[:, 40:].mean(dim=0, keepdim=True)) \
/ self.data[:, 40:].std(dim=0, keepdim=True)
self.dim = self.data.shape[1]
print('Finished reading the {} set of COVID19 Dataset ({} samples found, each dim = {})'
.format(mode, len(self.data), self.dim))
def __getitem__(self, index):
# Returns one sample at a time
if self.mode in ['train', 'dev']:
# For training
return self.data[index], self.target[index]
else:
# For testing (no target)
return self.data[index]
def __len__(self):
# Returns the size of the dataset
return len(self.data)
A DataLoader
loads data from a given Dataset
into batches.
def prep_dataloader(path, mode, batch_size, n_jobs=0, target_only=False):
''' Generates a dataset, then is put into a dataloader. '''
dataset = COVID19Dataset(path, mode=mode, target_only=target_only) # Construct dataset
dataloader = DataLoader(
dataset, batch_size,
shuffle=(mode == 'train'), drop_last=False,
num_workers=n_jobs, pin_memory=True) # Construct dataloader
return dataloader
NeuralNet
is an nn.Module
designed for regression.
The DNN consists of 2 fully-connected layers with ReLU activation.
This module also included a function cal_loss
for calculating loss.
class NeuralNet(nn.Module):
''' A simple fully-connected deep neural network '''
def __init__(self, input_dim):
super(NeuralNet, self).__init__()
# Define your neural network here
# TODO: How to modify this model to achieve better performance?
self.net = nn.Sequential(
nn.Linear(input_dim, 64),
nn.ReLU(),
nn.Linear(64, 1)
)
# Mean squared error loss
self.criterion = nn.MSELoss(reduction='mean')
def forward(self, x):
''' Given input of size (batch_size x input_dim), compute output of the network '''
return self.net(x).squeeze(1)
def cal_loss(self, pred, target):
''' Calculate loss '''
# TODO: you may implement L1/L2 regularization here
return self.criterion(pred, target)
squeeze() 能夠去除維度、unsqueeze() 則能增加維度,正如 squeeze(擠出)和 unsqueeze(鬆開)的含意。
首先來看能夠去除維度的 squeeze()。
import torch
data = torch.tensor([
[[0, 1, 2],
[3, 4, 5],
[6, 7, 8],]
])
print('Shape:', data.shape)
# squeeze()
squeeze_data = data.squeeze(0)
print('squeeze data:', squeeze_data)
print('squeeze(0) shape:', squeeze_data.shape)
#squeeze(0) shape: torch.Size([3, 3])
import torch
data = torch.tensor([
[[0, 1, 2],
[3, 4, 5],
[6, 7, 8],]
])
print('Shape:', data.shape)
# unsqueeze()
unsqueeze_data = data.unsqueeze(0)
print('unsqueeze data:', unsqueeze_data)
print('unsqueeze(0) shape:', unsqueeze_data.shape)
Shape: torch.Size([1, 3, 3])
unsqueeze data: tensor([[[
[0, 1, 2],
[3, 4, 5],
[6, 7, 8]]]])
unsqueeze(0) shape: torch.Size([1, 1, 3, 3])
https://clay-atlas.com/blog/2020/09/02/pytorch-cn-squeeze-unsqueeze-usage/
config
contains hyper-parameters for training and the path to save your model.
device = get_device() # get the current available device ('cpu' or 'cuda')
os.makedirs('models', exist_ok=True) # The trained model will be saved to ./models/
target_only = False # TODO: Using 40 states & 2 tested_positive features
# TODO: How to tune these hyper-parameters to improve your model's performance?
config = {
'n_epochs': 3000, # maximum number of epochs
'batch_size': 270, # mini-batch size for dataloader
'optimizer': 'SGD', # optimization algorithm (optimizer in torch.optim)
'optim_hparas': { # hyper-parameters for the optimizer (depends on which optimizer you are using)
'lr': 0.001, # learning rate of SGD
'momentum': 0.9 # momentum for SGD
},
'early_stop': 200, # early stopping epochs (the number epochs since your model's last improvement)
'save_path': 'models/model.pth' # your model will be saved here
}
pytorch.detach() https://pytorch.org/docs/stable/generated/torch.Tensor.detach.html
def train(tr_set, dv_set, model, config, device):
''' DNN training '''
n_epochs = config['n_epochs'] # Maximum number of epochs
# Setup optimizer
optimizer = getattr(torch.optim, config['optimizer'])(
model.parameters(), **config['optim_hparas'])
min_mse = 1000.
loss_record = {'train': [], 'dev': []} # for recording training loss
early_stop_cnt = 0
epoch = 0
while epoch < n_epochs:
model.train() # set model to training mode
for x, y in tr_set: # iterate through the dataloader
optimizer.zero_grad() # set gradient to zero
x, y = x.to(device), y.to(device) # move data to device (cpu/cuda)
pred = model(x) # forward pass (compute output)
mse_loss = model.cal_loss(pred, y) # compute loss model已經存在在GPU所以不用移動
mse_loss.backward() # compute gradient (backpropagation)
optimizer.step() # update model with optimizer
loss_record['train'].append(mse_loss.detach().cpu().item())
# After each epoch, test your model on the validation (development) set.
dev_mse = dev(dv_set, model, device)
if dev_mse < min_mse:
# Save model if your model improved
min_mse = dev_mse
print('Saving model (epoch = {:4d}, loss = {:.4f})'
.format(epoch + 1, min_mse))
torch.save(model.state_dict(), config['save_path']) # Save model to specified path
early_stop_cnt = 0
else:
early_stop_cnt += 1
epoch += 1
loss_record['dev'].append(dev_mse)
if early_stop_cnt > config['early_stop']:
# Stop training if your model stops improving for "config['early_stop']" epochs.
break
print('Finished training after {} epochs'.format(epoch))
return min_mse, loss_record
def dev(dv_set, model, device):
model.eval() # set model to evalutation mode
total_loss = 0
for x, y in dv_set: # iterate through the dataloader
x, y = x.to(device), y.to(device) # move data to device (cpu/cuda)
with torch.no_grad(): # disable gradient calculation
pred = model(x) # forward pass (compute output)
mse_loss = model.cal_loss(pred, y) # compute loss
total_loss += mse_loss.detach().cpu().item() * len(x) # accumulate loss
total_loss = total_loss / len(dv_set.dataset) # compute averaged loss
return total_loss
def test(tt_set, model, device):
model.eval() # set model to evalutation mode
preds = []
for x in tt_set: # iterate through the dataloader
x = x.to(device) # move data to device (cpu/cuda)
with torch.no_grad(): # disable gradient calculation
pred = model(x) # forward pass (compute output)
preds.append(pred.detach().cpu()) # collect prediction
preds = torch.cat(preds, dim=0).numpy() # concatenate all predictions and convert to a numpy array
return preds
tr_set = prep_dataloader(tr_path, 'train', config['batch_size'], target_only=target_only)
dv_set = prep_dataloader(tr_path, 'dev', config['batch_size'], target_only=target_only)
tt_set = prep_dataloader(tt_path, 'test', config['batch_size'], target_only=target_only)
model = NeuralNet(tr_set.dataset.dim).to(device) # Construct model and move to device
model_loss, model_loss_record = train(tr_set, dv_set, model, config, device)
plot_learning_curve(model_loss_record, title='deep model')
del model
model = NeuralNet(tr_set.dataset.dim).to(device)
ckpt = torch.load(config['save_path'], map_location='cpu') # Load your best model
model.load_state_dict(ckpt)
plot_pred(dv_set, model, device) # Show prediction on the validation set
The predictions of your model on testing set will be stored at pred.csv
.
def save_pred(preds, file):
''' Save predictions to specified file '''
print('Saving results to {}'.format(file))
with open(file, 'w') as fp:
writer = csv.writer(fp)
writer.writerow(['id', 'tested_positive'])
for i, p in enumerate(preds):
writer.writerow([i, p])
preds = test(tt_set, model, device) # predict COVID-19 cases with your model
save_pred(preds, 'pred.csv') # save prediction file to pred.csv
Download Data
If the Google drive links are dead, you can download data from kaggle, and upload data manually to the workspace.
Import Some Packages
固定隨機數種子
為什麽使用相同的網絡結構,跑出來的效果完全不同,儘管用的學習率,叠代次數,batch size 都是一樣? 如果你用了cuda,別忘了cuda的隨機數種子。
區別在於:使用cpu時不需要設置GPU的額外參數,使用gpu時需要設置
在cpu中,我們只需要保證:
所有參數的初始化相同 學習率、叠代次數、batch size相同 那麽,參數的更新都是由梯度計算得到的,快慢、次數都相同,所以最後得到的模型的參數也相同,輸出結果就相同。
而當使用gpu訓練模型時,可能引入額外的隨機源,使得結果不能準確再現(gpu提供了多核並行計算的基礎結構)
torch.backends.cudnn.deterministic是啥?顧名思義,將這個 flag 置為True的話,每次返回的卷積算法將是確定的,即默認算法。 如果配合上設置 Torch 的隨機種子為固定值的話,應該可以保證每次運行網絡的時候相同輸入的輸出是固定的
https://pytorch.org/docs/stable/notes/randomness.html
Some Utilities
You do not need to modify this part.