sooftware / conformer

[Unofficial] PyTorch implementation of "Conformer: Convolution-augmented Transformer for Speech Recognition" (INTERSPEECH 2020)
Apache License 2.0

NaN output and loss value #56

Open afmsaif opened 1 year ago

afmsaif commented 1 year ago

I am using the following training function with the LibriSpeech dataset. During training, the output of the model becomes NaN, and as a result the loss is also NaN. What could be the possible issue? (A small diagnostic sketch is included after the code below.)

import os

import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils import data

from conformer import Conformer
from ctcdecode import CTCBeamDecoder

# data_processing, numtoword, cer and wer are helper functions defined elsewhere (not shown).


class IterMeter(object):
    """keeps track of total iterations"""
    def __init__(self):
        self.val = 0

    def step(self):
        self.val += 1

    def get(self):
        return self.val

def train(model, device, train_loader, criterion, optimizer, scheduler, epoch):
    model.train()
    train_loss = 0
    data_len = len(train_loader.dataset)

    for batch_idx, _data in enumerate(train_loader):
        spectrograms, labels, input_lengths, label_lengths = _data

        spectrograms = torch.squeeze(spectrograms, dim=1)
        spectrograms = spectrograms.transpose(1, 2)

        labels = torch.LongTensor(labels.long())
        input_lengths = torch.LongTensor(input_lengths)
        label_lengths = torch.LongTensor(label_lengths)

        input_lengths = input_lengths.to(device)
        label_lengths = label_lengths.to(device)
        spectrograms, labels = spectrograms.to(device), labels.to(device)
        print(spectrograms.type())

        optimizer.zero_grad()

        output, output_lengths = model(spectrograms, input_lengths)  # (batch, time, n_class)
        output = output.transpose(0, 1)  # (time, batch, n_class)
        loss = criterion(output, labels, output_lengths, label_lengths)

        train_loss += loss.item() / len(train_loader)

        loss.backward()
        optimizer.step()
        scheduler.step()

        if batch_idx % 100 == 0 or batch_idx == data_len:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(spectrograms), data_len,
                100. * batch_idx / len(train_loader), loss.item()))

    return train_loss
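
One property of nn.CTCLoss that is relevant here: when an utterance's output_lengths (after the encoder's subsampling) is shorter than its label_lengths, the loss for that sample is infinite, and the backward pass then produces NaN gradients that poison the weights. nn.CTCLoss also expects log-probabilities; whether the Conformer forward already applies log_softmax is not visible in this snippet. Below is a minimal, hedged sketch of a more defensive loss step, reusing the names from the loop above; the blank id and the clipping threshold are illustrative assumptions, not part of the original code.

    # Hedged sketch of a more defensive CTC step; names mirror the training loop above.
    # Assumption: blank id 28, matching the decoder in test() below
    # (note that nn.CTCLoss defaults to blank=0).
    criterion = nn.CTCLoss(blank=28, zero_infinity=True)

    loss = criterion(output, labels, output_lengths, label_lengths)  # output: (time, batch, n_class) log-probs
    if torch.isfinite(loss):
        loss.backward()
        # illustrative clipping threshold, not from the original code
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()
    scheduler.step()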

def test(model, device, test_loader, criterion, epoch, batch_size=20):
    print('\nevaluating...')
    model.eval()
    test_loss = 0
    test_cer, test_wer = [], []
    n_classes = 29

    if epoch % 5 == 0:
        with torch.no_grad():
            for i, _data in enumerate(test_loader):
                spectrograms, labels, input_lengths, label_lengths = _data
                spectrograms = torch.squeeze(spectrograms)
                spectrograms = spectrograms.transpose(1, 2)

                labels = labels.long()

                input_lengths = torch.LongTensor(input_lengths)
                label_lengths = torch.LongTensor(label_lengths)

                spectrograms, labels = spectrograms.to(device), labels.to(device)

                output, output_lengths = model(spectrograms, input_lengths)  # (batch, time, n_class)
                soft_max = torch.nn.functional.softmax(output, dim=2)
                output = output.transpose(0, 1)  # (time, batch, n_class)
                loss = criterion(output, labels, output_lengths, label_lengths)
                test_loss += loss.item() / len(test_loader)

                decoder = CTCBeamDecoder(
                    [''] * (n_classes - 1) + [' '],
                    model_path=None,
                    alpha=0,
                    beta=0,
                    cutoff_top_n=40,
                    cutoff_prob=1.0,
                    beam_width=1000,
                    num_processes=4,
                    blank_id=28,
                    log_probs_input=False
                )
                beam_results, beam_scores, timesteps, out_lens = decoder.decode(soft_max, output_lengths)

                b = []
                for j in range(batch_size):
                    b.append(beam_results[j][0][:out_lens[j][0]])
                decoded_preds, decoded_targets = numtoword(b, out_lens, labels, label_lengths)

                for j in range(len(decoded_preds)):
                    test_cer.append(cer(decoded_targets[j], decoded_preds[j]))
                    test_wer.append(wer(decoded_targets[j], decoded_preds[j]))

        avg_cer = sum(test_cer) / len(test_cer)
        avg_wer = sum(test_wer) / len(test_wer)

        print('Test set: Average loss: {:.4f}, Average CER: {:.4f} Average WER: {:.4f}\n'.format(test_loss, avg_cer, avg_wer))
        return test_loss, avg_cer, avg_wer
    else:
        with torch.no_grad():
            for i, _data in enumerate(test_loader):
                spectrograms, labels, input_lengths, label_lengths = _data
                spectrograms = torch.squeeze(spectrograms)
                spectrograms = spectrograms.transpose(1, 2)

                labels = labels.long()

                input_lengths = torch.LongTensor(input_lengths)
                label_lengths = torch.LongTensor(label_lengths)
                input_lengths = input_lengths.to(device)
                label_lengths = label_lengths.to(device)

                spectrograms, labels = spectrograms.to(device), labels.to(device)

                output, output_lengths = model(spectrograms, input_lengths)  # (batch, time, n_class)
                soft_max = torch.nn.functional.softmax(output, dim=2)
                output = output.transpose(0, 1)  # (time, batch, n_class)
                loss = criterion(output, labels, output_lengths, label_lengths)
                test_loss += loss.item() / len(test_loader)

        print('Test set: Average loss: {:.4f}\n'.format(test_loss))
        return test_loss, 0, 0
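
A side note on the decoding path, separate from the NaN question: CTCBeamDecoder is constructed with log_probs_input=False, so it expects probabilities, while soft_max is a softmax of the model output. If that output is already log-probabilities (which nn.CTCLoss requires), the extra softmax double-normalizes it. A hedged alternative sketch, where log_probs stands for the (batch, time, n_class) model output before the transpose above:

    # Hedged sketch: declare the input as log-probabilities instead of re-applying softmax.
    decoder = CTCBeamDecoder(
        [''] * (n_classes - 1) + [' '],
        beam_width=1000,
        blank_id=28,
        log_probs_input=True,
    )
    beam_results, beam_scores, timesteps, out_lens = decoder.decode(log_probs, output_lengths)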

def main(learning_rate=5e-4, batch_size=20, epochs=10, train_url="train-clean-100", test_url="test-clean"):

    hparams = {
        "n_class": 29,
        "n_feats": 80,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs
    }

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(7)
    device = torch.device("cuda" if use_cuda else "cpu")

    if not os.path.isdir("./data"):
        os.makedirs("./data")

    train_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=train_url, download=True)
    test_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=test_url, download=True)

    kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}
    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=hparams['batch_size'],
                                   shuffle=True,
                                   collate_fn=lambda x: data_processing(x, 'train'),
                                   **kwargs)
    test_loader = data.DataLoader(dataset=test_dataset,
                                  batch_size=hparams['batch_size'],
                                  shuffle=False,
                                  collate_fn=lambda x: data_processing(x, 'valid'),
                                  **kwargs)

    model = Conformer(num_classes=hparams['n_class'],
                      input_dim=hparams['n_feats'],
                      encoder_dim=512,
                      num_encoder_layers=1)
    model = nn.DataParallel(model)
    model.to(device)

    print('Num Model Parameters', sum([param.nelement() for param in model.parameters()]))

    optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
    criterion = nn.CTCLoss().to(device)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'],
                                              steps_per_epoch=int(len(train_loader)),
                                              epochs=hparams['epochs'],
                                              anneal_strategy='linear')

    train_loss = []
    test_loss = []
    cer = []
    wer = []
    for epoch in range(1, epochs + 1):
        tra_loss = train(model, device, train_loader, criterion, optimizer, scheduler, epoch)
        tes_loss, c, w = test(model, device, test_loader, criterion, epoch)
        train_loss.append(tra_loss)
        test_loss.append(tes_loss)
        cer.append(c)
        wer.append(w)
    return train_loss, test_loss, cer, wer
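
As mentioned at the top, a small diagnostic of this kind, dropped into the training loop (variable names mirror that loop), can narrow down whether the NaNs originate in the input features, in the model output, or only in the loss, and whether any batch violates the CTC length constraint:

    # Hedged diagnostic sketch; names mirror the training loop above.
    if torch.isnan(spectrograms).any():
        print(f'NaN in input spectrograms at batch {batch_idx}')
    if torch.isnan(output).any():
        print(f'NaN in model output at batch {batch_idx}')
    if (output_lengths.cpu() < label_lengths.cpu()).any():
        print(f'CTC length violation (encoder output shorter than target) at batch {batch_idx}')
    if not torch.isfinite(loss):
        print(f'Non-finite loss at batch {batch_idx}: {loss.item()}')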