beir-cellar / beir

A Heterogeneous Benchmark for Information Retrieval. Easy to use, evaluate your models across 15+ diverse IR datasets.
http://beir.ai
Apache License 2.0

Reproducing msmarco-MiniLM-L-6-v3 results #87

Closed · matprst closed this issue 2 years ago

matprst commented 2 years ago

Hi! :smiley: I am trying to reproduce the evaluation of msmarco-MiniLM-L-6-v3 on BEIR. When I load the model using SentenceBERT I get results similar to the ones on the leaderboard, except for ArguAna. However, when I load the same model using Hugging Face, the results are quite different.

| | scidocs | nfcorpus | scifact | trec-covid | arguana |
| --- | --- | --- | --- | --- | --- |
| official leaderboard | 0.116 | 0.255 | 0.495 | 0.479 | 0.394 (typo?) |
| reproduction using SentenceBERT | 0.1164 | 0.2547 | 0.4947 | 0.4794 | 0.2938 |
| reproduction using Hugging Face (no pooling) | 0.1104 | 0.2373 | 0.4482 | 0.3221 | 0.285 |
| reproduction using Hugging Face (mean pooling) | 0.0742 | 0.2392 | 0.4268 | 0.2306 | 0.2891 |

Here is my current setup to evaluate the SentenceBERT model (mostly taken from evaluate_faiss_dense.py):

import numpy as np
import torch
from beir import util
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import FlatIPFaissSearch
from transformers import AutoTokenizer, AutoModel

import pathlib, os
from typing import List, Dict

class BEIRBenchmark:
    def __init__(self, k_values) -> None:
        self.k_values = k_values

    def _dataset_to_url(self, dataset):
        return "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)

    @property
    def output_dir(self):
        return os.path.join(pathlib.Path(__file__).parent.absolute(), "datasets")

    @property
    def index_dir(self):
        return os.path.join(pathlib.Path(__file__).parent.absolute(), "faiss-index")

    def evaluate_one(self, model, dataset, batch_size=2):
        url = self._dataset_to_url(dataset)
        data_path = util.download_and_unzip(url, self.output_dir)

        corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")
        faiss_search = FlatIPFaissSearch(model, batch_size=batch_size)

        prefix = dataset
        ext = "flat"
        if os.path.exists(os.path.join(self.index_dir, "{}.{}.faiss".format(prefix, ext))):
            faiss_search.load(input_dir=self.index_dir, prefix=prefix, ext=ext)

        retriever = EvaluateRetrieval(faiss_search, score_function="cos_sim", k_values=self.k_values)
        results = retriever.retrieve(corpus, queries)

        os.makedirs(self.index_dir, exist_ok=True)

        if not os.path.exists(os.path.join(self.index_dir, "{}.{}.faiss".format(prefix, ext))):
            faiss_search.save(output_dir=self.index_dir, prefix=prefix, ext=ext)

        ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
        return ndcg, _map, recall, precision

    def evaluate_all(self, model, datasets, batch_size=2):
        for dataset in datasets:
            self.evaluate_one(model, dataset, batch_size=batch_size)

benchmark = BEIRBenchmark(k_values=[10])
datasets = ["scidocs", "nfcorpus", "scifact", "trec-covid", "arguana"]

model = models.SentenceBERT("sentence-transformers/msmarco-MiniLM-L-6-v3")
benchmark.evaluate_all(model, datasets)

When I load the model from Hugging Face, I need to implement encode_queries and encode_corpus. I also encode the text with mean pooling as shown in the Hugging Face model card:

class HFmodel:
    def __init__(self, model_path=None, pooling=False) -> None:
        self.device = torch.device('cuda')
        self.model = AutoModel.from_pretrained(model_path).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.pooling = pooling

    def encode_queries(self, queries: List[str], batch_size: int, **kwargs) -> np.ndarray:
        embeddings = []
        for i in range(0, len(queries), batch_size):
            # batch queries
            texts = queries[i:i+batch_size]

            # preprocess the input
            inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512).to(self.device)
            with torch.no_grad():
                result = self.model(**inputs)

            if self.pooling:
                pooled_result = self.mean_pooling(result, inputs['attention_mask'])
                embeddings.append(pooled_result.cpu().detach().numpy())
            else:
                embeddings.append(result[0][:, 0, :].cpu().detach().numpy())

        return np.vstack(embeddings)

    def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int, **kwargs) -> np.ndarray:
        embeddings = []
        for i in range(0, len(corpus), batch_size):
            # concatenate title and text of the documents in the batch
            title_abs = [f"{doc.get('title', '')} {doc.get('text', '')}".strip() for doc in corpus[i:i+batch_size]]

            # preprocess the input
            inputs = self.tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512).to(self.device)
            with torch.no_grad():
                result = self.model(**inputs)

            if self.pooling:
                pooled_result = self.mean_pooling(result, inputs['attention_mask'])
                embeddings.append(pooled_result.cpu().detach().numpy())
            else:
                embeddings.append(result[0][:, 0, :].cpu().detach().numpy())

        return np.vstack(embeddings)

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

benchmark = BEIRBenchmark(k_values=[10])
datasets = ["scidocs", "nfcorpus", "scifact", "trec-covid", "arguana"]

model = HFmodel("sentence-transformers/msmarco-MiniLM-L-6-v3", pooling=True)
benchmark.evaluate_all(model, datasets)

I tested both with mean pooling (as the leaderboard specifies) and without. The results without pooling are better than the ones with mean pooling, but still not as good as the leaderboard numbers. To me, the difference between Hugging Face and SentenceBERT could come from either my implementation of the encoding functions being wrong or from the way I do the pooling.
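One way to narrow this down is to compare the manual Hugging Face pipeline directly against SentenceTransformer.encode on a few sentences; if the two agree, the pooling itself is not the culprit. A minimal sketch (the example sentences are made up):

import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer

model_name = "sentence-transformers/msmarco-MiniLM-L-6-v3"
sentences = ["what is the capital of france?", "paris is the capital of france."]

# reference embeddings: sentence-transformers applies the pooling stored with the model
st_embeddings = SentenceTransformer(model_name).encode(sentences, convert_to_numpy=True)

# manual Hugging Face pipeline with mean pooling over the attention mask
tokenizer = AutoTokenizer.from_pretrained(model_name)
hf_model = AutoModel.from_pretrained(model_name)
inputs = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")
with torch.no_grad():
    token_embeddings = hf_model(**inputs)[0]
mask = inputs["attention_mask"].unsqueeze(-1).float()
hf_embeddings = (token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)

# if this prints True, the manual mean pooling matches sentence-transformers
print(np.allclose(st_embeddings, hf_embeddings.numpy(), atol=1e-5))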

I have two questions:

PS: thanks for the great library, it's awesome!

cadurosar commented 2 years ago

Hi @matprst, I'm not sure about the problems on the Hugging Face side, but for ArguAna it seems there's a bug in FaissDenseRetrieval: the line that removes the query id from the result list is missing. This is needed because in ArguAna (and Quora) the queries are part of the corpus, but a query is not part of its own qrels, which is an unusual setting and can lead to problems like this. The same bug was present in the ColBERT evaluation (#67).
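As a quick workaround until that is fixed in the library, the self-match can be dropped from the retrieved results before calling evaluate. A minimal sketch, assuming the usual BEIR results format (a dict mapping query_id to {doc_id: score}) and the results, qrels and retriever names from the evaluate_one method above:

# remove the query's own document from its result list
# (ArguAna/Quora-style corpora where every query id also appears as a corpus id)
for query_id in results:
    results[query_id].pop(query_id, None)

ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)

Retrieving a couple of extra candidates per query keeps a full top-k after the self-match is dropped.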

matprst commented 2 years ago

Thanks for the answer @cadurosar !

Concerning my other problem, I figured out the issue. In encode_corpus() you are supposed to normalize the embeddings yourself when the normalize_embeddings argument is set to True. I find this a bit counterintuitive, since this line suggests the normalization happens under the hood when using a cosine similarity score function. Note that this is only an issue when the search uses a subclass of DenseRetrievalFaissSearch, since the other retrievers do not pass the normalize_embeddings argument when calling encode_corpus.

Here is my updated encode_corpus function:

def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int, **kwargs) -> np.ndarray:
        embeddings = []
        for i in range(0, len(corpus), batch_size):
            # concatenate title and text of the documents in the batch
            title_abs = [(doc["title"] + " " + doc["text"]).strip() if "title" in doc else doc["text"].strip() for doc in corpus[i:i+batch_size]]

            # preprocess the input
            inputs = self.tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512).to(self.device)
            with torch.no_grad():
                result = self.model(**inputs)

            if self.pooling:
                embedding = self.mean_pooling(result, inputs['attention_mask'])
            else: # use [cls] token
                embedding = result[0][:, 0, :]

            if kwargs.get('normalize_embeddings', False):
                embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)

            embeddings.append(embedding.cpu().detach().numpy())

        return np.vstack(embeddings)
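For context on why the normalization matters here: FlatIPFaissSearch scores with a flat inner-product index, and an inner product only equals cosine similarity once both sides are L2-normalized. A minimal sketch with made-up vectors:

import numpy as np

rng = np.random.default_rng(0)
queries = rng.random((4, 384)).astype("float32")
docs = rng.random((8, 384)).astype("float32")

# cosine similarity of the raw vectors
cos = (queries @ docs.T) / (
    np.linalg.norm(queries, axis=1, keepdims=True) * np.linalg.norm(docs, axis=1)
)

# inner product after L2-normalizing both sides, which is what a flat IP index computes
q_norm = queries / np.linalg.norm(queries, axis=1, keepdims=True)
d_norm = docs / np.linalg.norm(docs, axis=1, keepdims=True)
ip = q_norm @ d_norm.T

# True: the normalized inner product reproduces the cosine scores
print(np.allclose(cos, ip, atol=1e-5))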