Tencent / TurboTransformers

A fast and user-friendly runtime for transformer inference (Bert, Albert, GPT2, Decoders, etc.) on CPU and GPU.

Does not work on GPUs other than gpu:0 #190

Closed: auspicious3000 closed this issue 4 years ago

auspicious3000 commented 4 years ago

Transformer decoder inference only works on cuda:0. If the model is placed on any other GPU, it raises the following error or causes the kernel to restart. Both the data and the model are placed on the same GPU.

/usr/local/lib/python3.7/dist-packages/turbo_transformers/layers/modeling_decoder.py in __call__(self, input_tensor, return_type, is_trans_weight, output)
    285         super(PositionwiseFeedForward, self).__call__(input_tensor, output,
    286                                                       is_trans_weight)
--> 287         return convert_returns_as_type(output, return_type)
    288 
    289     @staticmethod

/usr/local/lib/python3.7/dist-packages/turbo_transformers/layers/return_type.py in convert_returns_as_type(tensor, rtype)
     39         return tensor
     40     elif rtype == ReturnType.TORCH:
---> 41         return dlpack.from_dlpack(tensor.to_dlpack())
     42     else:
     43         raise NotImplementedError()

RuntimeError: Specified device cuda:1 does not match device of data cuda:0
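
For context, the failing frame converts a Turbo tensor back to a torch tensor through DLPack, and DLPack preserves the device a tensor was actually allocated on. A minimal sketch in plain PyTorch (assuming a machine with at least two GPUs) shows the round-trip itself keeps the device intact, so the error above suggests Turbo allocated its output on cuda:0 even though the model was moved to cuda:1:

import torch
from torch.utils import dlpack

# A DLPack round-trip does not move data between devices:
# the reimported tensor lives wherever the source was allocated.
x = torch.rand(2, 3, device='cuda:1')
y = dlpack.from_dlpack(dlpack.to_dlpack(x))
assert y.device == torch.device('cuda:1')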

Here is a minimal example that reproduces the problem. Running it simply causes the kernel to restart without printing any error message.

from onmt.decoders.transformer import TransformerDecoder as OnmtDecoder
from turbo_transformers import TransformerDecoder as TurboDecoder
import torch

device = 'cuda:1'

# Minimal stub standing in for onmt's embedding layer: the decoder only
# needs word_padding_idx and a callable that returns the embedded input.
class Embeddings(object):
    def __init__(self):
        super().__init__()
        self.word_padding_idx = 1

    def __call__(self, source, step=None):
        return source

embeddings = Embeddings()

onmt_decoder = OnmtDecoder(num_layers=4,
                           d_model=256,
                           heads=8,
                           d_ff=1024,
                           copy_attn=False,
                           self_attn_type="scaled-dot",
                           dropout=0.1,
                           attention_dropout=0.1,
                           embeddings=embeddings,
                           max_relative_positions=0,
                           aan_useffn=False,
                           full_context_alignment=False,
                           alignment_layer=0,
                           alignment_heads=0).to(device)

# Dummy inputs, all placed on the same device as the model.
src = torch.rand(10, 4, 256).to(device)
tgt = torch.rand(10, 4, 256).to(device)
memory = torch.rand(10, 4, 256).to(device)
memory_lengths = torch.full((4,), 10, dtype=torch.long).to(device)

# The onmt decoder runs fine on cuda:1 ...
with torch.no_grad():
    onmt_decoder.init_state(src, None, None)
    _ = onmt_decoder(tgt, memory, step=None, memory_lengths=memory_lengths)

# ... but the converted Turbo decoder crashes on any device other than cuda:0.
with torch.no_grad():
    turbo_decoder = TurboDecoder.from_onmt(onmt_decoder)
    turbo_decoder.init_state(src, None, None)
    _ = turbo_decoder(tgt, memory, step=None, memory_lengths=memory_lengths)

Thanks in advance for your help!

feifeibear commented 4 years ago

No. Turbo currently only works on GPU 0. The required modifications are not difficult; see #70.
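
Until that lands, a common workaround (a sketch, not a change to Turbo itself) is to hide the other devices with CUDA_VISIBLE_DEVICES before CUDA is initialized, so the physical GPU you want appears to the process as cuda:0:

import os

# Must be set before torch or turbo_transformers initialize CUDA.
# Physical GPU 1 is then the only visible device, exposed as cuda:0.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import torch
device = 'cuda:0'  # actually runs on physical GPU 1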