pkyoung opened this issue 6 years ago
@silnos If you do manage to fix this, please post an update/solution!
I managed to execute train.py, but I haven't yet confirmed whether training was successful. The quick (and dirty) remedy to the above error was:
diff --git a/speech/models/seq2seq.py b/speech/models/seq2seq.py
index b2881e3..65e3a38 100644
--- a/speech/models/seq2seq.py
+++ b/speech/models/seq2seq.py
@@ -87,7 +87,7 @@ class Seq2Seq(model.Model):
hx = torch.zeros((x.shape[0], x.shape[2]), requires_grad=False)
if self.is_cuda:
- hx.cuda()
+ hx = hx.cuda()
ax = None; sx = None;
for t in range(y.size()[1] - 1):
sample = (out and self.scheduled_sampling)
@@ -119,7 +119,7 @@ class Seq2Seq(model.Model):
if state is None:
hx = torch.zeros((x.shape[0], x.shape[2]), requires_grad=False)
if self.is_cuda:
- hx.cuda()
+ hx = hx.cuda()
ax = None; sx = None;
else:
hx, ax, sx = state
@@ -164,7 +164,7 @@ class Seq2Seq(model.Model):
Infer a likely output. No beam search yet.
"""
x, y = self.collate(*batch)
- end_tok = y.data[0, -1] # TODO
+ end_tok = y.data[0, -1].cuda() # TODO
t = y
if self.is_cuda:
x = x.cuda()
@@ -172,7 +172,7 @@ class Seq2Seq(model.Model):
x = self.encode(x)
# needs to be the start token, TODO
- y = t[:, 0:1]
+ y = t[:, 0:1].cuda()
_, argmaxs = self.infer_decode(x, y, end_tok, max_len)
argmaxs = argmaxs.cpu().data.numpy()
return [seq.tolist() for seq in argmaxs]
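For context on why the one-character change matters: Tensor.cuda() returns a new tensor on the GPU rather than moving the tensor in place, so the result has to be assigned back. A minimal sketch (assuming a CUDA-capable machine):

import torch

hx = torch.zeros(4, 8)
hx.cuda()            # returns a GPU copy that is immediately discarded
print(hx.is_cuda)    # False: hx itself still lives on the CPU
hx = hx.cuda()       # reassign to actually keep the GPU tensor
print(hx.is_cuda)    # True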
And there was also an error in train.py:
diff --git a/train.py b/train.py
index a04eb6c..6141ba0 100644
--- a/train.py
+++ b/train.py
@@ -10,6 +10,7 @@ import torch
import torch.nn as nn
import torch.optim
import tqdm
+import copy
import speech
import speech.loader as loader
@@ -30,7 +31,7 @@ def run_epoch(model, optimizer, train_ldr, it, avg_loss):
loss.backward()
grad_norm = nn.utils.clip_grad_norm(model.parameters(), 200)
- loss = loss.data[0]
+ loss = loss.item()
optimizer.step()
prev_end_t = end_t
@@ -54,11 +55,13 @@ def eval_dev(model, ldr, preproc):
model.set_eval()
for batch in tqdm.tqdm(ldr):
- preds = model.infer(batch)
- loss = model.loss(batch)
- losses.append(loss.data[0])
+ batch_ = copy.deepcopy(batch)
+ preds = model.infer(batch_)
+ batch_ = copy.deepcopy(batch)
+ loss = model.loss(batch_)
+ losses.append(loss.item())
all_preds.extend(preds)
- all_labels.extend(batch[1])
+ all_labels.extend(list(batch)[1])
model.set_train()
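Two notes on this patch: loss.data[0] stopped working in PyTorch 0.4 because a reduced loss is a zero-dimensional tensor, and item() is the supported way to extract a Python scalar from it; the deepcopy presumably guards against infer() consuming or mutating the batch before loss() can use it. A minimal sketch of the item() change:

import torch

loss = torch.tensor(1.5, requires_grad=True)  # 0-dim tensor, like a reduced loss
print(loss.item())  # 1.5, a plain Python float
# loss.data[0]      # raises IndexError on PyTorch >= 0.4: cannot index a 0-dim tensor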
Thanks, I implemented this on my copy as well. I'm still getting another error:
RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes' failed.
I'm wondering if you have encountered something similar at all?
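That assertion usually means a loss such as NLLLoss/CrossEntropyLoss received a target index outside [0, n_classes), e.g. when the configured vocabulary size does not match the preprocessor's label set. A hypothetical sanity check (check_label_range is not part of this repo):

import torch

def check_label_range(labels, n_classes):
    # Hypothetical helper: every target index must lie in [0, n_classes).
    labels = torch.as_tensor(labels)
    assert labels.min() >= 0 and labels.max() < n_classes, \
        "labels must lie in [0, {})".format(n_classes)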
I'm wondering if you've run this on LibriSpeech?
@silnos, could you share your config file? Are you using the default included example?
@arattari Yes, I am using the default JSON file in examples/timit. I haven't tried LibriSpeech.
With the TIMIT example, I got an error rate around 2x.x%, and I am not sure yet whether that is reasonable. I am going to look into the code soon.
With python3.6, pytorch0.4.1, cuda9.0, I got the following error when I ran train.py with the timit example:
$ python train.py examples/timit/seq2seq_config.json
Traceback (most recent call last):
  File "train.py", line 146, in <module>
    run(config)
  File "train.py", line 104, in run
    run_state = run_epoch(model, optimizer, train_ldr, *run_state)
  File "train.py", line 29, in run_epoch
    loss = model.loss(batch)
  File "/path/to/speech/models/seq2seq.py", line 57, in loss
    out, alis = self.forward_impl(x, y)
  File "/path/to/speech/models/seq2seq.py", line 68, in forward_impl
    out, alis = self.decode(x, y)
  File "/path/to/speech/models/seq2seq.py", line 103, in decode
    hx = self.dec_rnn(ix.squeeze(dim=1), hx)
  File "/path/to/lib64/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)
  File "/path/to/lib64/python3.6/site-packages/torch/nn/modules/rnn.py", line 794, in forward
    self.bias_ih, self.bias_hh,
  File "/path/to/lib64/python3.6/site-packages/torch/nn/_functions/rnn.py", line 53, in GRUCell
    gh = F.linear(hidden, w_hh)
  File "/path/to/lib64/python3.6/site-packages/torch/nn/functional.py", line 1026, in linear
    output = input.matmul(weight.t())
RuntimeError: Expected object of type torch.FloatTensor but found type torch.cuda.FloatTensor for argument #2 'mat2'
Modifying hx = self.dec_rnn(ix.squeeze(dim=1), hx) to hx = self.dec_rnn(ix.squeeze(dim=1), hx.cuda()) fixed this.
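An alternative sketch (not the repo's code) is to create the hidden state on the input's device in the first place, so no later .cuda() call is needed:

# Allocate the initial hidden state on the same device as the input x.
hx = torch.zeros((x.shape[0], x.shape[2]), device=x.device)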
With python3.6, pytorch0.4.1, cuda9.0:
Traceback (most recent call last):
  File "train.py", line 148, in <module>
    run(config)
  File "train.py", line 110, in run
    dev_loss, dev_cer = eval_dev(model, dev_ldr, preproc)
  File "train.py", line 57, in eval_dev
    preds = model.infer(batch)
  File "/path/to/speech/models/seq2seq.py", line 176, in infer
    _, argmaxs = self.infer_decode(x, y, end_tok, max_len)
  File "/path/to/speech/models/seq2seq.py", line 155, in infer_decode
    if torch.sum(y.data == end_tok) == y.numel():
RuntimeError: Expected object of type torch.cuda.LongTensor but found type torch.LongTensor for argument #2 'other'
Do you have an idea how to solve this?
Modifying if torch.sum(y.data == end_tok) == y.numel(): to if torch.sum(y.cpu() == end_tok).tolist() == y.numel(): fixed this.
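Comparisons need both operands on the same device; instead of pulling y back to the CPU on every decode step, one could move end_tok once (a sketch, assuming PyTorch >= 0.4):

# Move the end token to y's device; a no-op if they already match.
end_tok = end_tok.to(y.device)
done = torch.sum(y.data == end_tok) == y.numel()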
For anyone using the "transducer" loss: after converting the check_type() function in libs/transducer/functions/transducer.py to the code below, I was able to get past this error.
def check_type(var, t, name):
    # Compare type strings so CUDA tensor types are matched correctly.
    #if type(var) is not t:
    if var.type() != str(t).split("'")[1]:
        raise TypeError("{} must be {}, got {}".format(name, t, var.type()))
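The string comparison works because str() of a tensor class embeds the same name that Tensor.type() returns. A quick check (not repo code):

import torch

x = torch.zeros(3)
print(x.type())                              # torch.FloatTensor
print(str(torch.FloatTensor).split("'")[1])  # torch.FloatTensor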
After this, I also modified certify_inputs() and forward():
def certify_inputs(log_probs, labels, lengths, label_lengths):
    if log_probs.is_cuda:
        check_type(log_probs, torch.cuda.FloatTensor, "log_probs")
    else:
        check_type(log_probs, torch.FloatTensor, "log_probs")
    if labels.is_cuda:
        check_type(labels, torch.cuda.IntTensor, "labels")
    else:
        check_type(labels, torch.IntTensor, "labels")
    if label_lengths.is_cuda:
        check_type(label_lengths, torch.cuda.IntTensor, "label_lengths")
    else:
        check_type(label_lengths, torch.IntTensor, "label_lengths")
    if lengths.is_cuda:
        check_type(lengths, torch.cuda.IntTensor, "lengths")
    else:
        check_type(lengths, torch.IntTensor, "lengths")
..........
def forward(self, log_probs, labels, lengths, label_lengths):
    """
    Computes the Transducer cost for a minibatch of examples.
    Arguments:
        log_probs (FloatTensor): The log probabilities should
            be of shape
            (minibatch, input len, output len, vocab size).
        labels (IntTensor): 1D tensor of labels for each example
            consecutively.
        lengths (IntTensor): 1D tensor of number of activation time-steps
            for each example.
        label_lengths (IntTensor): 1D tensor of label lengths for
            each example.
    Returns:
        costs (FloatTensor): .
    """
    is_cuda = log_probs.is_cuda
    certify_inputs(log_probs, labels, lengths, label_lengths)
    # Move everything to the CPU for the transducer cost computation.
    log_probs = log_probs.cpu()
    labels = labels.cpu()
    lengths = lengths.cpu()
    label_lengths = label_lengths.cpu()
...............
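The four if/else branches in certify_inputs() could also be collapsed into one device-aware helper (a sketch under the same assumptions, not the repo's code):

def certify_inputs(log_probs, labels, lengths, label_lengths):
    # Pick the torch.cuda variant of each expected type when the tensor is on the GPU.
    def expected(var, t):
        return getattr(torch.cuda, t.__name__) if var.is_cuda else t
    check_type(log_probs, expected(log_probs, torch.FloatTensor), "log_probs")
    check_type(labels, expected(labels, torch.IntTensor), "labels")
    check_type(label_lengths, expected(label_lengths, torch.IntTensor), "label_lengths")
    check_type(lengths, expected(lengths, torch.IntTensor), "lengths")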
After these changes, I got the following warnings while training:
WARNING: Forward backward likelihood mismatch 0.000084
WARNING: Forward backward likelihood mismatch 0.000092
WARNING: Forward backward likelihood mismatch 0.000046
With python3.6, pytorch0.4.1, cuda9.0, I got the error above when running train.py with the timit example.
If I add
torch.set_default_tensor_type('torch.cuda.FloatTensor')
in the main function, the error becomes a different one. Does anyone have an idea how to solve this?
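For reference, torch.set_default_tensor_type only changes the type newly created tensors default to, so it tends to move a device mismatch elsewhere rather than fix it: tensors built with explicit types or coming from the data loader stay on the CPU. A minimal illustration (assuming CUDA is available):

import torch

torch.set_default_tensor_type('torch.cuda.FloatTensor')
x = torch.zeros(3)            # now allocated on the GPU by default
print(x.is_cuda)              # True
y = torch.LongTensor([1, 2])  # explicit CPU type: still on the CPU
print(y.is_cuda)              # False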