cdpierse / transformers-interpret

Model explainability that works seamlessly with 🤗 transformers. Explain your transformers model in just 2 lines of code.

Prediction differs from non-explainable evaluation #127

Open creisle opened 1 year ago

creisle commented 1 year ago

I think I may be doing something wrong, but I can't seem to get this working. I can get the explainer to produce the diagram and attribution scores, but the label it predicts doesn't match the one I expect. For example, I am using a custom fine-tuned model which I evaluate like so:

import torch

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

# args, batch_df, device, and max_length are defined earlier in the script
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_model)
config = AutoConfig.from_pretrained(args.model_path)
model = AutoModelForSequenceClassification.from_pretrained(args.model_path, config=config)
gpu_model = model.to(device)  # same model, moved to the device used for the inputs

tokenized_input_seq_pair = tokenizer(
    batch_df.sentence1.tolist(),
    text_pair=batch_df.sentence2.tolist(),
    return_token_type_ids=True,
    truncation=True,
    padding=True,
    max_length=max_length,
)
tokenized = {
    'input_ids': (torch.Tensor(tokenized_input_seq_pair['input_ids']).long().to(device)),
    'token_type_ids': (
        torch.Tensor(tokenized_input_seq_pair['token_type_ids']).long().to(device)
    ),
    'attention_mask': (
        torch.Tensor(tokenized_input_seq_pair['attention_mask']).long().to(device)
    ),
    'labels': None,
}

# only pass args compatible with the model forward signature
# https://github.com/huggingface/transformers/issues/2702#issuecomment-581019468
tokenized = {
    key: value
    for (key, value) in tokenized.items()
    if key in model.forward.__code__.co_varnames
}
outputs = gpu_model(**tokenized)

predicted_probability = torch.softmax(outputs.logits, dim=1)
predicted_index = torch.argmax(predicted_probability, dim=1)

batch_df['predicted_probability'] = predicted_probability.tolist()
batch_df['predicted_label'] = predicted_index.tolist()

Whereas when I use the same model and the same inputs with the explainer, I do not get the same label prediction:

import pandas as pd

from transformers_interpret import PairwiseSequenceClassificationExplainer

# input_df contains the same sentence pairs scored above
explainer = PairwiseSequenceClassificationExplainer(model, tokenizer)
viz = []

for _, row in input_df.iterrows():
    attrs = explainer(row.sentence1, row.sentence2)
    viz.append(
        (
            row.id,
            row.sentence1,
            row.sentence2,
            explainer.predicted_class_name,  # this is the wrong label
            attrs,
        )
    )

df = pd.DataFrame(
    viz, columns=['id', 'sentence1', 'sentence2', 'attr_predicted_label', 'attributions']
)

Even when I subclass the explainer class to ensure the tokenizer is operating in the same way, it still predicts incorrectly. The model and tokenizer were set up identically to the above:

from typing import List, Tuple, Union

from torch.nn import Embedding


class CustomPairwiseClassificationExplainer(PairwiseSequenceClassificationExplainer):
    def _make_input_reference_pair(
        self, text1: Union[List, str], text2: Union[List, str]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
        tokenized_input_seq_pair = self.tokenizer(
            text1,
            text_pair=text2,
            return_token_type_ids=True,
            truncation=True,
            padding=True,
            max_length=self.max_length,
        )

        return (
            torch.tensor([tokenized_input_seq_pair['input_ids']], device=self.device),
            torch.tensor([tokenized_input_seq_pair['token_type_ids']], device=self.device),
            torch.tensor([tokenized_input_seq_pair['attention_mask']], device=self.device),
            len(tokenized_input_seq_pair['input_ids']),
        )

    def _calculate_attributions(
        self,
        embeddings: Embedding,
        index: int = None,
        class_name: str = None,
        flip_sign: bool = False,
    ):  # type: ignore
        (
            self.input_ids,
            self.ref_input_ids,
            self.attention_mask,
            self.sep_idx,
        ) = self._make_input_reference_pair(self.text1, self.text2)

        (
            self.position_ids,
            self.ref_position_ids,
        ) = self._make_input_reference_position_id_pair(self.input_ids)

        (
            self.token_type_ids,
            self.ref_token_type_ids,
        ) = self._make_input_reference_token_type_pair(self.input_ids, self.sep_idx)

        # self.attention_mask = self._make_attention_mask(self.input_ids)
preetammnaik commented 5 months ago

Hi there,

I hope you're doing well. I noticed your GitHub issue regarding the prediction differences in the transformers-interpret library (#127), and I'm experiencing a similar issue myself.

I've been trying to use the library with a custom fine-tuned model, but like you, I'm finding that the predicted label doesn't match my expectations when using the explainer.

Have you had any luck resolving this issue since you posted it? If so, I would greatly appreciate any insights or tips you could share.
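
In case it helps in the meantime, this is the minimal single-pair check I've been using to reproduce the mismatch (a sketch only: the model path and sentences are placeholders, and predicted_class_index is assumed to be set on the explainer after it has been called):

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers_interpret import PairwiseSequenceClassificationExplainer

model_path = "path/to/fine-tuned-model"  # placeholder for the custom checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()

sentence1 = "A man is playing a guitar."  # placeholder sentence pair
sentence2 = "A person is playing an instrument."

# Direct forward pass, no batching or padding
inputs = tokenizer(sentence1, text_pair=sentence2, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
direct_index = int(torch.argmax(logits, dim=1))

# Same pair through the explainer
explainer = PairwiseSequenceClassificationExplainer(model, tokenizer)
explainer(sentence1, sentence2)
print("direct prediction:", direct_index)
print("explainer prediction:", explainer.predicted_class_index, explainer.predicted_class_name)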