drawbridge / keras-mmoe

A TensorFlow Keras implementation of "Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts" (KDD 2018)
MIT License

Have you implemented this in pytorch #13

Closed tomtang110 closed 3 years ago

tomtang110 commented 3 years ago

Hi,

I reimplemented your code in PyTorch and noticed a strange phenomenon: the AUC for income is only 0.56, while the AUC for marital is 0.96. I checked the details but can't figure out why it doesn't work.

Here is the model code:

import torch
import torch.nn as nn
import torch.nn.functional as F


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()

        # accept_unit = config.field_size * config.embed_size
        accept_unit = config.num_feature

        # Expert weights: (input_dim, expert_units, num_experts).
        # Note: torch.randn samples from N(0, 1), a much larger initial
        # scale than the VarianceScaling initializer the Keras version uses.
        self.expert_kernels = nn.Parameter(
            torch.randn(accept_unit, config.units, config.num_experts, device=config.device))
        self.expert_kernels_bias = nn.Parameter(
            torch.randn(config.units, config.num_experts, device=config.device))

        # One gate per task: (input_dim, num_experts).
        self.gate_kernels = nn.ParameterList(
            [nn.Parameter(torch.randn(accept_unit, config.num_experts, device=config.device))
             for _ in range(config.num_tasks)])
        self.gate_kernels_bias = nn.ParameterList(
            [nn.Parameter(torch.randn(config.num_experts, device=config.device))
             for _ in range(config.num_tasks)])

        # One tower per task; config.label_dict holds each task's output size.
        self.output_layer = nn.ModuleList([
            nn.Sequential(
                nn.Linear(config.units, config.hidden_units),
                nn.ReLU(),
                nn.Linear(config.hidden_units, unit),
            )
            for unit in config.label_dict
        ])

        self.expert_activation = nn.ReLU()

        # self.embedding_layer = nn.Embedding(config.num_feature, config.embed_size)

    def forward(self, x):
        # Experts: (batch, input_dim) x (input_dim, units, num_experts)
        # -> (batch, units, num_experts)
        expert_outputs = torch.einsum("ab,bcd->acd", x, self.expert_kernels)
        expert_outputs = expert_outputs + self.expert_kernels_bias
        expert_outputs = self.expert_activation(expert_outputs)

        # Gates: one softmax distribution over experts per task.
        gate_outputs = []
        for index, gate_kernel in enumerate(self.gate_kernels):
            gate_output = torch.einsum("ab,bc->ac", x, gate_kernel)
            gate_output = gate_output + self.gate_kernels_bias[index]
            gate_outputs.append(F.softmax(gate_output, dim=-1))

        # Gate-weighted sum of expert outputs: (batch, units) per task.
        final_outputs = []
        for gate_output in gate_outputs:
            expanded_gate_output = torch.unsqueeze(gate_output, 1)
            weighted_expert_output = expert_outputs * expanded_gate_output.expand_as(expert_outputs)
            final_outputs.append(torch.sum(weighted_expert_output, 2))

        # Task towers with sigmoid outputs.
        return [torch.sigmoid(self.output_layer[i](output))
                for i, output in enumerate(final_outputs)]
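
For reference, here is a minimal smoke test for the model above. The config values are made up for illustration (they are not from the original post); it only checks that the forward pass yields one (batch, 1) sigmoid output per task:

import types

import torch

# Hypothetical config; the attribute values below are arbitrary.
config = types.SimpleNamespace(
    num_feature=100,    # input feature dimension
    units=4,            # hidden size of each expert
    num_experts=8,
    num_tasks=2,
    hidden_units=8,     # hidden size of each task tower
    label_dict=[1, 1],  # one sigmoid unit per binary task
    device="cpu",
)

model = Model(config)
x = torch.randn(32, config.num_feature)  # random batch of 32 samples
outputs = model(x)
for task, out in enumerate(outputs):
    print(f"task {task}: output shape {tuple(out.shape)}")  # (32, 1) each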
alvin319 commented 3 years ago

Sorry for the delayed response! I haven't had a chance to port the code over to PyTorch, and unfortunately I don't have much time on my end to investigate this. Sorry! I'll leave this issue open in case other people want to chime in (contributions are always welcome).