udacity / deep-learning-v2-pytorch

Projects and exercises for the latest Deep Learning ND program https://www.udacity.com/course/deep-learning-nanodegree--nd101

Negative Sampling #199

Closed · puhach closed this issue 3 years ago

puhach commented 5 years ago

I am wondering why we need two embedding layers for negative sampling in the word2vec algorithm. This wasn't explained. Using the same embedding layer for both positive and negative samples seems to produce comparable results.
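
For reference, with the two-table version (the commented-out in_embed / out_embed lines below), the lookups would go roughly like this: one table for a word acting as the center word and a separate table for a word acting as a context or noise word. A minimal sketch with made-up sizes and word indices:

import torch
from torch import nn

# Hypothetical sizes and word-index tensors, just to illustrate the two lookups
n_vocab, n_embed, batch_size, n_samples = 1000, 300, 4, 5
center_words = torch.randint(n_vocab, (batch_size,))
context_words = torch.randint(n_vocab, (batch_size,))
noise_words = torch.randint(n_vocab, (batch_size * n_samples,))

in_embed = nn.Embedding(n_vocab, n_embed)   # used when a word is the center word
out_embed = nn.Embedding(n_vocab, n_embed)  # used when a word appears as context or noise

input_vectors = in_embed(center_words)       # (batch_size, n_embed)
output_vectors = out_embed(context_words)    # (batch_size, n_embed)
noise_vectors = out_embed(noise_words).view(batch_size, n_samples, n_embed)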

Apart from that, the model seems to be prone to numerical overflow when calculating the loss. Uniform weight initialization mitigates this somewhat, but I would also suggest clamping the results of the matrix multiplication (a numerically stable alternative is sketched after the code below).

import torch
from torch import nn

class SkipGramNeg(nn.Module):
    def __init__(self, n_vocab, n_embed, noise_dist=None):
        super().__init__()       
        self.n_vocab = n_vocab
        self.n_embed = n_embed
        self.noise_dist = noise_dist

        # define a single embedding layer shared by the input and output words
        #self.in_embed = nn.Embedding(num_embeddings=n_vocab, embedding_dim=n_embed)
        #self.out_embed = nn.Embedding(num_embeddings=n_vocab, embedding_dim=n_embed)
        self.embed = nn.Embedding(num_embeddings=n_vocab, embedding_dim=n_embed)

        # Initialize the shared embedding table with a uniform distribution
        #self.in_embed.weight.data.uniform_(-1, 1)
        #self.out_embed.weight.data.uniform_(-1, 1)
        #self.in_embed.weight.data.normal_(-1, 1)
        self.embed.weight.data.uniform_(-1, 1)

    def forward_input(self, input_words):
        # return input vector embeddings
        #x = self.in_embed(input_words)
        x = self.embed(input_words)
        return x

    def forward_output(self, output_words):
        # return output vector embeddings
        #x = self.out_embed(output_words)
        x = self.embed(output_words)
        return x

    def forward_noise(self, batch_size, n_samples):
        """ Generate noise vectors with shape (batch_size, n_samples, n_embed)"""
        if self.noise_dist is None:
            # Sample words uniformly
            noise_dist = torch.ones(self.n_vocab)
        else:
            noise_dist = self.noise_dist

        # Sample words from our noise distribution
        noise_words = torch.multinomial(noise_dist,
                                        batch_size * n_samples,
                                        replacement=True)

        #device = "cuda" if model.out_embed.weight.is_cuda else "cpu"
        device = "cuda" if model.embed.weight.is_cuda else "cpu"
        noise_words = noise_words.to(device)

        # reshape the embeddings so that they have dims (batch_size, n_samples, n_embed)
        #noise_embeddings = self.out_embed(noise_words)
        noise_embeddings = self.embed(noise_words)
        #print("noise_embeddings:", noise_embeddings.shape)
        noise_embeddings = noise_embeddings.view(batch_size, n_samples, self.n_embed)

        return noise_embeddings

class NegativeSamplingLoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input_vectors, output_vectors, noise_vectors):

        batch_size, embed_size = input_vectors.shape

        # Input vectors should be a batch of column vectors
        input_vectors = input_vectors.view(batch_size, embed_size, 1)

        # Output vectors should be a batch of row vectors
        output_vectors = output_vectors.view(batch_size, 1, embed_size)        
        # bmm = batch matrix multiplication
        # log-sigmoid loss for the correct (target context) pairs
        out_loss = torch.bmm(output_vectors, input_vectors)
        # clamp the dot products so that sigmoid().log() cannot underflow to -inf
        out_loss = torch.clamp(out_loss, -10, 10)
        out_loss = out_loss.sigmoid().log()
        out_loss = out_loss.squeeze()

        # log-sigmoid loss for the incorrect (noise) pairs
        noise_loss = torch.bmm(noise_vectors.neg(), input_vectors)
        noise_loss = torch.clamp(noise_loss, -10, 10)
        noise_loss = noise_loss.sigmoid().log()
        noise_loss = noise_loss.squeeze().sum(1)  # sum the losses over the sample of noise vectors

        # negate and sum correct and noisy log-sigmoid losses
        # return average batch loss

        return -(out_loss + noise_loss).mean()
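
Alternatively, the clamp could be avoided altogether by computing the log-sigmoid with F.logsigmoid, which PyTorch evaluates in a numerically stable way, so extreme dot products no longer drive sigmoid().log() to -inf. A minimal sketch of the same loss rewritten that way (not the notebook's implementation):

import torch
import torch.nn.functional as F
from torch import nn

class StableNegativeSamplingLoss(nn.Module):
    """Same math as above, but logsigmoid replaces clamp + sigmoid().log()."""
    def forward(self, input_vectors, output_vectors, noise_vectors):
        batch_size, embed_size = input_vectors.shape

        # batch of column vectors for the input (center) words
        input_vectors = input_vectors.view(batch_size, embed_size, 1)
        # batch of row vectors for the output (context) words
        output_vectors = output_vectors.view(batch_size, 1, embed_size)

        # log sigmoid of the dot products for the correct (context) pairs
        out_loss = F.logsigmoid(torch.bmm(output_vectors, input_vectors)).squeeze()

        # log sigmoid of the negated dot products, summed over the noise words per center word
        noise_loss = F.logsigmoid(torch.bmm(noise_vectors.neg(), input_vectors)).squeeze().sum(1)

        # negate and average over the batch
        return -(out_loss + noise_loss).mean()

# Hypothetical usage with the model above:
# criterion = StableNegativeSamplingLoss()
# loss = criterion(model.forward_input(inputs),
#                  model.forward_output(targets),
#                  model.forward_noise(inputs.shape[0], n_samples=5))
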
abhiojha8 commented 3 years ago

Though we are a little late to respond, we encourage user-specific issues like this to be raised on the Knowledge hub, so that our experienced mentors can chime in and new students can also learn from the discussion thread. Thank you.