pytorch / captum

Model interpretability and understanding for PyTorch
https://captum.ai
BSD 3-Clause "New" or "Revised" License

One of the differentiated Tensors appears to not have been used in the graph. Set allow_unused=True if this is the desired behavior. #1141

Closed. francescopatane96 closed this issue 1 year ago

francescopatane96 commented 1 year ago

Hi! I'm using Captum to study input-output correlations with transformer-based protein language models (sequence classification). When I create an IntegratedGradients instance and call attribute, I get this error: One of the differentiated Tensors appears to not have been used in the graph. Set allow_unused=True if this is the desired behavior.

Here is the salient code:

class xAInet(nn.Module):
  def __init__(self):
    super().__init__()

    self.hidden_dim = 25
    self.batch_size = 32
    self.embedding_dim = 512

    self.embedding_layer = nn.Embedding(24, self.embedding_dim, padding_idx=0)
    self.encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)

    self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
    self.gru = nn.GRU(self.embedding_dim, self.hidden_dim, num_layers=2,
                      bidirectional=True, dropout=.2)

    self.block_seq = nn.Sequential(nn.Linear(15050, 2048),
                                   nn.BatchNorm1d(2048),
                                   nn.LeakyReLU(),
                                   nn.Linear(2048, 1024),
                                   nn.BatchNorm1d(1024),
                                   nn.LeakyReLU(),
                                   nn.Linear(1024, 256),
                                   nn.BatchNorm1d(256),
                                   nn.ReLU(),
                                   nn.Linear(256, 8),
                                   nn.Linear(8, 2),
                                   nn.Softmax(dim=1))

  def forward(self, seq):
        seq = seq.long()
        embeddings = self.embedding_layer(seq)
        output = self.transformer_encoder(embeddings).permute(1, 0, 2)
        output, hn = self.gru(output)
        output = output.permute(1, 0, 2)
        hn = hn.permute(1, 0, 2)

        output = output.reshape(output.shape[0], -1)
        hn = hn.reshape(output.shape[0], -1)

        output = torch.cat([output, hn], 1)
        output = self.block_seq(output)
       # output = torch.argmax(output, dim=1)

        return output

  def train_model(self, seq):
    #with torch.no_grad():
        output = self.forward(seq)

        return output

def model_output(inputs):

    #inputs = inputs[0].unsqueeze(0)

    out = model(inputs)
    # Apply softmax to convert prediction scores to probabilities
    probabilities = torch.softmax(out, dim=1)

    # Get the predicted classes by selecting the class with the highest probability
    predicted_classes = torch.argmax(probabilities, dim=1)  
    return predicted_classes

def construct_input_and_baseline(text):

    max_length = 512
    #baseline_token_id = rnn_utils.pad_sequence()

    input_ids = []
    token_list = []

    aa_dict = {'A': 1, 'R': 2, 'N': 3, 'D': 4, 'C': 5, 'Q': 6, 'E': 7, 'G': 8, 'H': 9, 'I': 10,
               'L': 11, 'K': 12, 'M': 13, 'F': 14, 'P': 15, 'O': 16, 'S': 17, 'U': 18, 'T': 19,
               'W': 20, 'Y': 21, 'V': 22, 'X': 23}

    for char in text:
      if char in aa_dict:
        input_ids.append(aa_dict[char])
        token_list.append(char)

    baseline_token_id = 23
    baseline_input_ids = [baseline_token_id] * len(input_ids)

    input_ids_tensor = torch.tensor([input_ids], device='cpu')
    baseline_input_ids_tensor = torch.tensor([baseline_input_ids], device='cpu')

    return input_ids_tensor, baseline_input_ids_tensor, token_list

text = 'MSKSKMLVFKSKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKMSKSKMLVFKMSKSKMLVFKMSKSKMLVFKMSKSKMLVFK'

input_ids, baseline_input_ids, all_tokens = construct_input_and_baseline(text)

desired_length = 299
padded_sequences = [seq[:desired_length] if len(seq) >= desired_length else torch.cat((seq, torch.zeros(desired_length - len(seq)))) for seq in input_ids]

input_ids = rnn_utils.pad_sequence(padded_sequences, batch_first=True)

padded_sequences = [seq[:desired_length] if len(seq) >= desired_length else torch.cat((seq, torch.zeros(desired_length - len(seq)))) for seq in baseline_input_ids]

baseline_input_ids = rnn_utils.pad_sequence(padded_sequences, batch_first=True)

ig = IntegratedGradients(model, model.embedding_layer)

attribution = ig.attribute(inputs=input_ids, baselines=baseline_input_ids, target=model_output(input_ids))   

Complete error:

RuntimeError                              Traceback (most recent call last)

[<ipython-input-71-9c05c85df27b>](https://localhost:8080/#) in <cell line: 4>()
      2 
      3 
----> 4 attribution = ig.attribute(inputs=input_ids, baselines=baseline_input_ids, target=model_output(input_ids))           #(, baselines = baseline, target=0)

4 frames

[/usr/local/lib/python3.10/dist-packages/captum/log/__init__.py](https://localhost:8080/#) in wrapper(*args, **kwargs)
     40             @wraps(func)
     41             def wrapper(*args, **kwargs):
---> 42                 return func(*args, **kwargs)
     43 
     44             return wrapper

[/usr/local/lib/python3.10/dist-packages/captum/attr/_core/integrated_gradients.py](https://localhost:8080/#) in attribute(self, inputs, baselines, target, additional_forward_args, n_steps, method, internal_batch_size, return_convergence_delta)
    284             )
    285         else:
--> 286             attributions = self._attribute(
    287                 inputs=inputs,
    288                 baselines=baselines,

[/usr/local/lib/python3.10/dist-packages/captum/attr/_core/integrated_gradients.py](https://localhost:8080/#) in _attribute(self, inputs, baselines, target, additional_forward_args, n_steps, method, step_sizes_and_alphas)
    349 
    350         # grads: dim -> (bsz * #steps x inputs[0].shape[1:], ...)
--> 351         grads = self.gradient_func(
    352             forward_fn=self.forward_func,
    353             inputs=scaled_features_tpl,

[/usr/local/lib/python3.10/dist-packages/captum/_utils/gradient.py](https://localhost:8080/#) in compute_gradients(forward_fn, inputs, target_ind, additional_forward_args)
    117         # torch.unbind(forward_out) is a list of scalar tensor tuples and
    118         # contains batch_size * #steps elements
--> 119         grads = torch.autograd.grad(torch.unbind(outputs), inputs)
    120     return grads
    121 

[/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py](https://localhost:8080/#) in grad(outputs, inputs, grad_outputs, retain_graph, create_graph, only_inputs, allow_unused, is_grads_batched)
    301         return _vmap_internals._vmap(vjp, 0, 0, allow_none_pass_through=True)(grad_outputs_)
    302     else:
--> 303         return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    304             t_outputs, grad_outputs_, retain_graph, create_graph, t_inputs,
    305             allow_unused, accumulate_grad=False)  # Calls into the C++ engine to run the backward pass

RuntimeError: One of the differentiated Tensors appears to not have been used in the graph. Set allow_unused=True if this is the desired behavior.
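For context, this error means torch.autograd.grad was asked for gradients with respect to a tensor that never enters the differentiable graph. As the traceback shows, IntegratedGradients interpolates float tensors between the baseline and the input and differentiates the output with respect to them; in this model the seq.long() cast and the nn.Embedding lookup are non-differentiable, so those interpolated inputs are never "used" in the graph. A minimal, Captum-free sketch of the same failure:

import torch
import torch.nn as nn

emb = nn.Embedding(24, 8)

# A float tensor that requires grad, standing in for Captum's interpolated inputs.
x = torch.tensor([[1.0, 2.0, 3.0]], requires_grad=True)

# Casting to long and looking up embeddings breaks the graph back to x.
out = emb(x.long()).sum()

torch.autograd.grad(out, x)
# RuntimeError: One of the differentiated Tensors appears to not have been
# used in the graph. Set allow_unused=True if this is the desired behavior.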
francescopatane96 commented 1 year ago

Here is a data-dependency-free snippet:

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data
import torch.nn.utils.rnn as rnn_utils
from captum.attr import IntegratedGradients

class xAInet(nn.Module):
  def __init__(self):
    super().__init__()

    self.hidden_dim = 25
    self.batch_size = 32
    self.embedding_dim = 512

    self.embedding_layer = nn.Embedding(24, self.embedding_dim, padding_idx=0)
    self.encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)

    self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
    self.gru = nn.GRU(self.embedding_dim, self.hidden_dim, num_layers=2,
                      bidirectional=True, dropout=.2)

    self.block_seq = nn.Sequential(nn.Linear(15050, 2048),
                                   nn.BatchNorm1d(2048),
                                   nn.LeakyReLU(),
                                   nn.Linear(2048, 1024),
                                   nn.BatchNorm1d(1024),
                                   nn.LeakyReLU(),
                                   nn.Linear(1024, 256),
                                   nn.BatchNorm1d(256),
                                   nn.ReLU(),
                                   nn.Linear(256, 8),
                                   nn.Linear(8, 2),
                                   nn.Softmax(dim=1))

  def forward(self, seq):
        seq = seq.long()
        embeddings = self.embedding_layer(seq)
        output = self.transformer_encoder(embeddings).permute(1, 0, 2)
        output, hn = self.gru(output)
        output = output.permute(1, 0, 2)
        hn = hn.permute(1, 0, 2)

        output = output.reshape(output.shape[0], -1)
        hn = hn.reshape(output.shape[0], -1)

        output = torch.cat([output, hn], 1)
        output = self.block_seq(output)

        return output

  def train_model(self, seq):
    #with torch.no_grad():
        output = self.forward(seq)

        return output

def model_output(inputs):

    #inputs = inputs[0].unsqueeze(0)

    out = model(inputs)
    # Apply softmax to convert prediction scores to probabilities
    probabilities = torch.softmax(out, dim=1)

    # Get the predicted classes by selecting the class with the highest probability
    predicted_classes = torch.argmax(probabilities, dim=1)  
    return predicted_classes

def construct_input_and_baseline(text):

    max_length = 512
    #baseline_token_id = rnn_utils.pad_sequence()

    input_ids = []
    token_list = []

    aa_dict = {'A': 1, 'R': 2, 'N': 3, 'D': 4, 'C': 5, 'Q': 6, 'E': 7, 'G': 8, 'H': 9, 'I': 10,
               'L': 11, 'K': 12, 'M': 13, 'F': 14, 'P': 15, 'O': 16, 'S': 17, 'U': 18, 'T': 19,
               'W': 20, 'Y': 21, 'V': 22, 'X': 23}

    for char in text:
      if char in aa_dict:
        input_ids.append(aa_dict[char])
        token_list.append(char)

    baseline_token_id = 23
    baseline_input_ids = [baseline_token_id] * len(input_ids)

    input_ids_tensor = torch.tensor([input_ids], device='cpu')
    baseline_input_ids_tensor = torch.tensor([baseline_input_ids], device='cpu')

    return input_ids_tensor, baseline_input_ids_tensor, token_list

text = 'MSKSKMLVFKSKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKMSKSKMLVFKMSKSKMLVFKMSKSKMLVFKMSKSKMLVFK'

input_ids, baseline_input_ids, all_tokens = construct_input_and_baseline(text)

desired_length = 299
padded_sequences = [seq[:desired_length] if len(seq) >= desired_length else torch.cat((seq, torch.zeros(desired_length - len(seq)))) for seq in input_ids]

input_ids = rnn_utils.pad_sequence(padded_sequences, batch_first=True)

padded_sequences = [seq[:desired_length] if len(seq) >= desired_length else torch.cat((seq, torch.zeros(desired_length - len(seq)))) for seq in baseline_input_ids]

baseline_input_ids = rnn_utils.pad_sequence(padded_sequences, batch_first=True)

model = xAInet()
model.eval()

ig = IntegratedGradients(model, model.embedding_layer)

attribution = ig.attribute(inputs=input_ids, baselines=baseline_input_ids, target=model_output(input_ids))
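A commonly suggested workaround for integer token-ID inputs (an assumption here, not something confirmed in this thread) is to attribute with respect to the embedding layer via LayerIntegratedGradients, so the interpolation happens on the embedding output rather than on the non-differentiable IDs. A rough sketch reusing the model and tensors above:

from captum.attr import LayerIntegratedGradients

# Sketch only: attribute at the embedding output instead of the raw token IDs.
lig = LayerIntegratedGradients(model, model.embedding_layer)
attribution = lig.attribute(
    inputs=input_ids.long(),              # (1, 299) integer token IDs
    baselines=baseline_input_ids.long(),  # all-'X' baseline of the same shape
    target=model_output(input_ids),       # predicted class per example
)

# Sum over the embedding dimension to get one attribution score per token.
token_attribution = attribution.sum(dim=-1)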
ptrblck commented 1 year ago

Double post from here with a follow-up.