❓ Questions and Help

Closed: vanbasten23 closed this issue 2 months ago.
I have an NN model, trained with a backward pass, whose input is dynamically shaped:
```python
import torch
import torch_xla.core.xla_model as xm

xla_dev = xm.xla_device()  # XLA device implied (but not defined) in the original snippet

class Feedforward(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.fc1.weight.data.fill_(0.01)
        self.fc1.bias.data.fill_(0.01)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(self.hidden_size, 1)
        self.fc2.weight.data.fill_(0.01)
        self.fc2.bias.data.fill_(0.01)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        hidden = self.fc1(x)
        relu = self.relu(hidden)
        output = self.fc2(relu)
        output = self.sigmoid(output)
        return output

def create_dynamic_test_data(num_samples, num_features, device):
    x_test = torch.ones(num_samples, num_features)
    x_test[0][0] = 0
    y_test = torch.ones(num_samples * 2)
    y_test[0] = 0
    x_test_xla = x_test.to(device)
    # torch.nonzero yields a dynamically shaped result: its size depends on
    # the tensor's values, not just its static shape.
    x_test_nonzero_dev = torch.nonzero(x_test_xla.int()).float()
    y_test_xla = y_test.to(device)
    y_test_nonzero_dev = torch.nonzero(y_test_xla.int()).float().squeeze()
    return x_test_nonzero_dev, y_test_nonzero_dev

num_features = 2
num_test_samples = 5

model = Feedforward(num_features, hidden_size=10).to(xla_dev)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# code is from https://colab.sandbox.google.com/github/pytorch/xla/blob/master/contrib/colab/resnet18-training.ipynb
def train(model, loss_fn, optimizer):
    model.train()
    # x_train, y_train = make_blobs(n_samples=40, n_features=num_features, cluster_std=1.5, shuffle=True)
    # x_train = torch.Tensor(x_train)
    # y_train = torch.Tensor(y_train)
    # x_train_xla = x_train.to(xla_dev)
    # y_train_xla = y_train.to(xla_dev)
    x_train_xla, y_train_xla = create_dynamic_test_data(num_samples=40, num_features=2, device=xla_dev)
    optimizer.zero_grad()

    # Compute prediction error
    pred = model(x_train_xla)
    print('pred.size()=', pred.size())
    loss = loss_fn(pred.squeeze(), y_train_xla)

    # Backpropagation
    xm.mark_step()
    loss.backward()
    xm.optimizer_step(optimizer)

train(model, loss_fn=criterion, optimizer=optimizer)
```
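For context on where the dynamism comes from: `torch.nonzero` returns a tensor whose first dimension depends on the *values* of its input, so the shapes flowing into the model above are not known until runtime. A minimal standalone sketch of that behavior (my own illustration, assuming a working torch_xla install; not part of the original repro):

```python
import torch
import torch_xla.core.xla_model as xm

dev = xm.xla_device()
t = torch.tensor([[0.0, 1.0], [1.0, 1.0]], device=dev)
# The output size of nonzero depends on the data, not just the input shape:
# three of the four entries are nonzero, so idx has shape [3, 2].
idx = torch.nonzero(t.int())
print(idx.size())  # torch.Size([3, 2])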
Notice that without `xm.mark_step()` before `loss.backward()`, training fails in PyTorch with an error. But if I add `xm.mark_step()` before `loss.backward()`, as in:

```python
xm.mark_step()
loss.backward()
```

then training fails at a much earlier point, inside XLA, with an error. I wonder what this XLA error means.
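For comparison, the resnet18 colab linked above (and torch_xla training loops in general) cut and execute the lazy graph once per iteration, *after* the optimizer step, rather than between the forward and backward passes. A sketch of that conventional ordering, reusing the names from the code above (this is the standard pattern, not the failing repro):

```python
optimizer.zero_grad()
pred = model(x_train_xla)
loss = loss_fn(pred.squeeze(), y_train_xla)
loss.backward()                # extends the lazy graph with the backward pass
xm.optimizer_step(optimizer)   # reduces gradients and applies the update
xm.mark_step()                 # materialize the whole step's graph at once
```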
Is it OK to assign this issue to you, @vanbasten23?