maxjcohen / transformer

Implementation of the Transformer model (originally from "Attention is All You Need") applied to time series.
https://timeseriestransformer.readthedocs.io/en/latest/
GNU General Public License v3.0

**RuntimeError: The size of tensor a (896) must match the size of tensor b (14) at non-singleton dimension 0** #43

Closed: rotameraklisi closed this 3 years ago

rotameraklisi commented 3 years ago

Hello, I'm working on time series classification with a Transformer. My data has 14 features and 1 categorical label (encoded with LabelEncoder) covering 10 classes, so d_input = 14, d_output = 10, and window_size = 16; what should d_model be? I am getting the error below. What should I do, and why does y come out with only one dimension?

```
[Epoch 1/2]: 0%| | 0/31514 [00:00<?, ?it/s]
torch.Size([31514, 16, 14]) torch.Size([31514])
torch.Size([6092, 16, 14]) torch.Size([6092])
31514 6092
Running on the GPU
Using device cuda:0
torch.Size([64, 16, 14]) torch.Size([64])
[Epoch 1/2]: 0%| | 0/31514 [00:00<?, ?it/s]
torch.Size([896, 16, 14]) torch.Size([896, 16, 14])
```

Error:

```
RuntimeError                              Traceback (most recent call last)
<ipython-input> in <module>()
    149     print(y.shape)
    150     optimizer.zero_grad()
--> 151     netout = net(x)
    152     loss = loss_function(netout, y)
    153     loss.backward()

5 frames
/content/multiHeadAttention.py in forward(self, query, key, value, mask)
     92         print(queries)
     93         print(keys)
---> 94         self._scores = (queries@keys.T) / np.sqrt(K)
     95
     96         # Compute local map mask

RuntimeError: The size of tensor a (896) must match the size of tensor b (14) at non-singleton dimension 0
```

Here is the code:

```python
def create_datasetX(dataset, look_back):
    dataX = []
    row = 0
    while (row + look_back) < len(dataset):
        dataX.append(dataset[row:(row + look_back)])
        row = row + 3
    return np.array(dataX)


def create_datasetY(dataset, look_back):
    dataY = []
    col = 0
    while (col + look_back) < len(dataset):
        dataY.append(dataset[(col + look_back)])
        col = col + 3
    return np.array(dataY)


# Accuracy : 0.110
from numpy import vstack, argmax
from pandas import read_csv
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch import Tensor
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn import *
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from loss import OZELoss
from transformer import Transformer
import seaborn as sns
from tqdm import tqdm
import datetime
from utils_ import compute_loss
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import zscore
# from plot_functions import map_plot_function, plot_values_distribution, plot_error_distribution, plot_errors_threshold, plot_visual_sampl


class CSVDataset(Dataset):
    def __init__(self):
        KINEMATICS_USECOLS = [c - 1 for c in [39, 40, 41, 51, 52, 53, 57, 58, 59, 60, 70, 71, 72, 76, 77]]
        trainX = []
        trainY = []
        filenamesTrainX = ['C001.txt', 'C002.txt', 'C003.txt', 'C004.txt', 'C005.txt',
                           'D001.txt', 'D002.txt', 'D003.txt', 'D004.txt', 'D005.txt',
                           'E001.txt', 'E002.txt', 'E003.txt', 'E004.txt', 'E005.txt',
                           'F001.txt', 'F002.txt', 'F003.txt', 'F004.txt', 'F005.txt',
                           'G001.txt', 'G002.txt', 'G003.txt', 'G004.txt', 'G005.txt',
                           'I001.txt', 'I002.txt', 'I003.txt', 'I004.txt', 'I005.txt']
        for fname in filenamesTrainX:
            trainXdata = pd.read_csv(fname, sep=',', usecols=KINEMATICS_USECOLS)
            self.X, self.y = trainXdata.values[:, :-1], trainXdata.values[:, -1]
            self.X = self.X.astype(np.float)
            mean = np.mean(self.X, axis=(0, 1))
            std = np.std(self.X, axis=(0, 1))
            self.X = (self.X - mean) / (std + np.finfo(float).eps)
            self.X = self.X.astype(np.float32)
            # M = np.max(self.X, axis=(0, 1))
            # m = np.min(self.X, axis=(0, 1))
            self.X, self.y = self.X.astype('float32'), LabelEncoder().fit_transform(self.y)
            self.X = create_datasetX(self.X, look_back)
            self.y = create_datasetY(self.y, look_back)
            trainX.extend(self.X)
            trainY.extend(self.y)
        trainX = np.array(trainX)
        trainY = np.array(trainY)
        self.X = trainX
        self.y = trainY
        self.X = torch.Tensor(self.X)
        self.y = torch.Tensor(self.y)
        print(self.X.shape)
        print(self.y.shape)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # if torch.is_tensor(idx):
        #     idx = idx.tolist()
        return [self.X[idx], self.y[idx]]


class TestDataset(Dataset):
    def __init__(self):
        KINEMATICS_USECOLS = [c - 1 for c in [39, 40, 41, 51, 52, 53, 57, 58, 59, 60, 70, 71, 72, 76, 77]]
        trainX = []
        trainY = []
        filenamesTrainX = ['B001.txt', 'B002.txt', 'B003.txt', 'B004.txt', 'B005.txt']
        for fname in filenamesTrainX:
            trainXdata = pd.read_csv(fname, sep=',', usecols=KINEMATICS_USECOLS)
            self.X, self.y = trainXdata.values[:, :-1], trainXdata.values[:, -1]
            self.X = self.X.astype(np.float)
            mean = np.mean(self.X, axis=(0, 1))
            std = np.std(self.X, axis=(0, 1))
            self.X = (self.X - mean) / (std + np.finfo(float).eps)
            # M = np.max(self.X, axis=(0, 1))
            # m = np.min(self.X, axis=(0, 1))
            # self.X = (self.X - m) / (M - m + np.finfo(float).eps)
            look_back = 16
            self.X, self.y = self.X.astype('float32'), LabelEncoder().fit_transform(self.y)
            self.X = self.X.astype(np.float32)
            self.X = create_datasetX(self.X, look_back)
            self.y = create_datasetY(self.y, look_back)
            trainX.extend(self.X)
            trainY.extend(self.y)
        trainX = np.array(trainX)
        trainY = np.array(trainY)
        self.X = trainX
        self.y = trainY
        self.X = torch.Tensor(self.X)
        self.y = torch.Tensor(self.y)
        print(self.X.shape)
        print(self.y.shape)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return [self.X[idx], self.y[idx]]


def prepare_data():
    dataset = CSVDataset()
    train_dl = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS, pin_memory=False)
    return train_dl


BATCH_SIZE = 64
NUM_WORKERS = 0
LR = 0.01
EPOCHS = 2

d_model = 16
q = 14
v = 14
h = 14
N = 7
attention_size = None
dropout = 0.5
pe = None
chunk_mode = None
d_input = 14
d_output = 10
look_back = 16

train_dl = prepare_data()
dataset_test = TestDataset()
test_dl = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)
print(len(train_dl.dataset), len(test_dl.dataset))

sns.set()

if torch.cuda.is_available():
    device = torch.device("cuda:0")  # you can continue going on here, like cuda:1 cuda:2....etc.
    print("Running on the GPU")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device {device}")

net = Transformer(d_input, d_model, d_output, q, v, h, N,
                  attention_size=attention_size, dropout=dropout,
                  chunk_mode=chunk_mode, pe=pe)
optimizer = optim.Adam(net.parameters(), lr=LR)
# optimizer = optim.SGD(net.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)
# optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)
loss_function = nn.CrossEntropyLoss()

model_save_path = f'models/model_{datetime.datetime.now().strftime("%Y_%m_%d__%H%M%S")}.pth'

for idx_epoch in range(EPOCHS):
    running_loss = 0
    with tqdm(total=len(train_dl.dataset), desc=f"[Epoch {idx_epoch+1:3d}/{EPOCHS}]") as pbar:
        for idx_batch, (x, y) in enumerate(train_dl):
            print(x.shape)
            print(y.shape)
            optimizer.zero_grad()
            netout = net(x)
            loss = loss_function(netout, y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            pbar.set_postfix({'loss': running_loss / (idx_batch + 1)})
            pbar.update(x.shape[0])


# evaluate the model
def evaluate_model(test_dl, model):
    predictions, actuals = list(), list()
    for i, (inputs, targets) in enumerate(test_dl):
        yhat = model(inputs)
        yhat = yhat.detach().numpy()
        actual = targets.numpy()
        yhat = argmax(yhat, axis=1)
        actual = actual.reshape((len(actual), 1))
        yhat = yhat.reshape((len(yhat), 1))
        predictions.append(yhat)
        actuals.append(actual)
    predictions, actuals = vstack(predictions), vstack(actuals)
    print(predictions)
    print(actuals)
    acc = accuracy_score(actuals, predictions)
    return acc


acc = evaluate_model(test_dl, net)
print('Accuracy: %.3f' % acc)
```
maxjcohen commented 3 years ago

Hi, I'm sorry but I can't really help you with these dimension errors; it would require running the entire code with your dataset, which would take more time than I can afford. Especially considering that you're using a slightly modified version of my code, such as:

```python
self._scores = (queries@keys.T) / np.sqrt(K)
```

This does make the code much more readable, and follows PEP 465 (the `@` matrix multiplication operator), but I haven't had a chance to test it thoroughly.
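For what it's worth, here is a minimal sketch of how that line can produce exactly this error, assuming the `(896, 16, 14)` shapes shown in your log (896 = batch 64 × h = 14 heads after chunking; the `d_k` value and scaling factor are illustrative, not taken from the repo). On a 3-D tensor, `.T` reverses *all* dimensions rather than transposing only the last two (recent PyTorch versions deprecate `.T` on non-2D tensors for this reason), so the batch dimensions no longer line up when `@` tries to broadcast them:

```python
import torch

# Shapes from the log above: 896 chunks (batch 64 * 14 heads),
# window of 16 time steps, 14 values per step (illustrative d_k).
queries = torch.randn(896, 16, 14)
keys = torch.randn(896, 16, 14)

# keys.T reverses every dimension: (896, 16, 14) -> (14, 16, 896).
# queries @ keys.T then tries to broadcast batch dims 896 and 14, raising:
# "The size of tensor a (896) must match the size of tensor b (14)
#  at non-singleton dimension 0"
# scores = (queries @ keys.T) / 14 ** 0.5  # RuntimeError

# Transposing only the last two dimensions keeps the batch dimension aligned:
scores = (queries @ keys.transpose(-2, -1)) / 14 ** 0.5
print(scores.shape)  # torch.Size([896, 16, 16])
```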

I would encourage you to continue debugging to understand why the batch matrix multiplication raises an error. You may find it helpful to start from the original code and a fresh dataset to get a working example, and then work back to your particular case. I would also encourage you to take a look at the equations behind Multi-Head Attention, to get a better understanding of how things should be multiplied.
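For reference (the thread doesn't spell it out, but this is the computation that line implements), the scaled dot-product attention from "Attention is All You Need" is

$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^\top}{\sqrt{d_k}}\right) V,$$

where $d_k$ is the key dimension. In the batched multi-head setting, $Q K^\top$ is an independent window × window score matrix per batch element and per head, which is why the leading (batch) dimensions of the two tensors must match.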