facebookresearch / InferSent

InferSent sentence embeddings

Same sentence, different encoding! #141

Open MarcusNerva opened 4 years ago

MarcusNerva commented 4 years ago

```python
import sys
sys.path.append('../')
import os
import math
import numpy as np
import torch

from infersent_model import InferSent

EPS = 1e-4

def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model_version = 1
    MODEL_PATH = './encoder'
    assert MODEL_PATH is not None, '--infersent_model_path is None!'
    MODEL_PATH = os.path.join(MODEL_PATH, 'infersent%s.pkl' % model_version)
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    infersent_model = InferSent(params_model)
    infersent_model.load_state_dict(torch.load(MODEL_PATH))
    infersent_model = infersent_model.to(device)

    W2V_PATH = './Glove/glove.840B.300d.txt'
    assert W2V_PATH is not None, '--w2v_path is None!'
    infersent_model.set_w2v_path(W2V_PATH)
    infersent_model.build_vocab_k_words(K=100000)

    store = ['a man is talking about a movie pictures of a movie pictures',
             'a person is folding paper',
             'a man is singing',
             'people are dancing and dancing',
             'a man and woman are talking about something',
             'a woman is applying makeup',
             'a person is cooking a dish and adding ingredients into a pot',
             'a man is talking',
             'a man is talking about the weather on the screen',
             'cartoon characters are interacting']

    # Encode all sentences together in one batch.
    embeddings = infersent_model.encode(store, bsize=128, tokenize=True)

    for i in range(len(store)):
        # Encode the i-th sentence alone.
        temp = infersent_model.encode([store[i]], bsize=128, tokenize=True)[0]
        # Compare the i-th sentence encoded alone against the same
        # sentence encoded together with the other sentences.
        if math.fabs(1 - cosine(temp, embeddings[i])) > EPS:
            print(cosine(temp, embeddings[i]))
```

and here is the output:

```
Vocab size : 100000
0.9066778
0.87379414
0.89509517
0.9344797
0.9010086
0.8247624
0.9670602
0.9080478
```

Really weird, isn't it? Since all the parameters are frozen, how can the same sentence get different encodings?

fedorn commented 3 years ago

It seems that the issue is caused by zero-padding during batched max-pooling. The V1 model sets `self.max_pad = True`, so the all-zero LSTM outputs at padded timesteps take part in the max. The V2 model sets `self.max_pad = False`, which masks padded positions with a large negative value before pooling, so it doesn't have this issue.
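A minimal sketch of the mechanism with toy tensors (not the real model weights), assuming the V1/V2 pooling behaviour described above: the zero rows that batch padding appends to the LSTM output participate in the max, so any dimension whose true maximum is negative gets clamped up to 0 when the sentence shares a batch with longer sentences.

```python
import torch

# Hypothetical bi-LSTM outputs for one sentence: 3 timesteps x 4 dims.
out = torch.tensor([[-0.5, 0.2, -0.1, 0.3],
                    [-0.7, 0.1, -0.4, 0.6],
                    [-0.2, 0.4, -0.3, 0.1]])

# Encoded alone: max-pool over the 3 real timesteps only.
alone = out.max(dim=0).values            # [-0.2, 0.4, -0.1, 0.6]

# Encoded in a batch padded to length 5: two all-zero rows are appended.
padded = torch.cat([out, torch.zeros(2, 4)])

# V1 behaviour (max_pad=True): the zero rows take part in the max, so
# dimensions whose real maximum is negative get clamped up to 0.
batched_v1 = padded.max(dim=0).values    # [0.0, 0.4, 0.0, 0.6]

# V2 behaviour (max_pad=False): zeros are replaced with -1e9 before
# pooling, which recovers the same embedding as the lone encoding.
masked = padded.clone()
masked[masked == 0] = -1e9
batched_v2 = masked.max(dim=0).values    # [-0.2, 0.4, -0.1, 0.6]

print(torch.allclose(alone, batched_v2))  # True
print(torch.allclose(alone, batched_v1))  # False
```

This also explains why the deviation only appears when sentences of different lengths share a batch: encoding one sentence at a time adds no padding. A possible workaround for V1 (untested here) is to set `infersent_model.max_pad = False` after loading, which masks padded positions the way V2 does, though the pooling then differs slightly from how V1 was trained.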