Multiple features throw batchnorm exception, problem at size

estathop commented 6 years ago

with open('/home/estathop/Desktop/ordered_word2vec_22_sentences.pickle', 'r') as f42:
    ordered_word_feats = pickle.load(f42)
with open('/home/estathop/Desktop/22featvec.pickle', 'r') as f66:
    listoffeats = pickle.load(f66)

Here I load the pre-extracted features by me, ordered_word_feats is a list of 22 X N X 300 where N is the number of words in text. listoffeatsis a list of 22 X N X 2048 where N is the number of frames per video. Apparently, 22 is the number of video-text pair, and 300 and 2048 is the feature size. The ordered_word_feats[0] is the text from listoffeats[0] image features extracted from video. [1] for [1], [2] for [2] etc.

''' Load and initiate the pre-trained model from Miech Journal for similarity among modalities score'''
video_modality_dim = {'face': (128,128), 'audio': (128*16,128),
'visual': (2048,2048), 'motion': (1024,1024)}
the_model = Net(video_modality_dim, 300, audio_cluster=16)
the_model.load_state_dict(torch.load('/home/estathop/Desktop/journalmodel/msrvttjournal.pt'))
the_model.eval()

Here I load the model and put it in eval mode.

''' create indices for the last model'''
face_ind=np.zeros(1)
audio_ind=np.zeros(1)
motion_ind=np.zeros(1)

'''create the tensors for the last model'''

face_data = torch.from_numpy(np.zeros([1,128]))
audio_data = torch.from_numpy(np.zeros([1,1,128]))
motion_data =torch.from_numpy(np.zeros([1,1024]))

audio_data = audio_data.type(torch.FloatTensor)
audio_data = Variable(audio_data, requires_grad=False)

face_data = face_data.type(torch.FloatTensor)
face_data = Variable(face_data, requires_grad = False)

motion_data = motion_data.type(torch.FloatTensor)
motion_data = Variable(motion_data, requires_grad = False)

Here I create the indices and tensors for data I don't have, it's missing

pred_true=list()
pred_false=list()
for enum in ordered_word_feats:
    for enum2 in listoffeats:
         word_tensor_to_be = enum.reshape(1,len(enum),300)
         word_tensor = torch.from_numpy(np.array(word_tensor_to_be))
         visual_data = torch.from_numpy(np.array([enum2]))

         visual_data = visual_data.type(torch.FloatTensor)
         visual_data = Variable(visual_data, requires_grad = False)
         visual_ind= np.ones(len(enum2))

         ind = {'face': face_ind, 'audio': audio_ind, 'visual': visual_ind, 'motion': motion_ind}
         videos = {'face': face_data, 'audio': audio_data, 'visual': visual_data, 'motion': motion_data}
         ypreds = the_model(word_tensor, videos, ind)
         ypreds2 = the_model(word_tensor, videos, ind, False)
         pred_true.append(ypreds)
         pred_false.append(ypreds2)

And here is my problem word_tensor_to_be is supposed to be a (1,N,300) word feature vector with N being the number of words in that text, 1 is the batch size and 300 the feature size. enum and enum2 are the equivalent numpy arrays iterating through the parent lists for image and word feature vectors described above. My goal is to take the final similarity score among all pairs between 22 texts and 22 videos. But there is an error again which follows:

Traceback (most recent call last):

  File "<ipython-input-67-106513ee0f47>", line 3, in <module>
    ypreds = the_model(word_tensor, videos, ind)

  File "/home/estathop/anaconda2/envs/tensorflow/lib/python2.7/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)

  File "model.py", line 46, in forward
    return self.mee(text, aggregated_video, ind, conf)

  File "/home/estathop/anaconda2/envs/tensorflow/lib/python2.7/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)

  File "model.py", line 71, in forward
    video[self.m[i]] = l(video[self.m[i]])

  File "/home/estathop/anaconda2/envs/tensorflow/lib/python2.7/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)

  File "model.py", line 128, in forward
    x = self.cg(x)

  File "/home/estathop/anaconda2/envs/tensorflow/lib/python2.7/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)

  File "model.py", line 145, in forward
    x1 = self.batch_norm(x1)

  File "/home/estathop/anaconda2/envs/tensorflow/lib/python2.7/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)

  File "/home/estathop/anaconda2/envs/tensorflow/lib/python2.7/site-packages/torch/nn/modules/batchnorm.py", line 66, in forward
    exponential_average_factor, self.eps)

  File "/home/estathop/anaconda2/envs/tensorflow/lib/python2.7/site-packages/torch/nn/functional.py", line 1254, in batch_norm
    training, momentum, eps, torch.backends.cudnn.enabled

RuntimeError: running_mean should contain 16889 elements not 2048

As read in the journal, the array for visual_data should be Nx2048 with visual_ind being np.ones(len(N))

enum2.shape
Out[68]: (16889, 2048)

len(enum2)
Out[69]: 16889


enum.shape
Out[70]: (411, 300)

len(enum)
Out[71]: 411

Any ideas ? What am I missing again here ?

estathop commented 6 years ago

class Net(nn.Module):
    def __init__(self, video_modality_dim, text_dim, audio_cluster=8,  text_cluster=32):
        super(Net, self).__init__()

        self.audio_pooling = NetVLAD(feature_size=video_modality_dim['audio'][1],
                cluster_size=audio_cluster)
        self.text_pooling = NetVLAD(feature_size=text_dim,
                cluster_size=text_cluster)

        self.mee = MEE(video_modality_dim, self.text_pooling.out_dim)

    def forward(self, text, video, ind, conf=True):

        aggregated_video = {}

        aggregated_video['audio'] = self.audio_pooling(video['audio'])
        aggregated_video['face'] = video['face'] 
        aggregated_video['motion'] = video['motion']
        aggregated_video['visual'] = video['visual']

        text = self.text_pooling(text)

return self.mee(text, aggregated_video, ind, conf)

where exactly does the max pooling on appearance happen ? because the model when inserted with only 1 feature vector (1 x 2048) , it seems to work, it seems to miss the dimensions when inserted with multiple feature vectors concerning appearance

estathop commented 6 years ago

How do you perform exactly the max pooling ? I guess I have to do it myself (after trying all day to see what's wrong with the model )

antoine77340 commented 6 years ago

Yes, I have fed the aggregated video representation. Which means I took the list of features for each video and performed a maximum pooling on those features. For example if you have a representation of size N x T x 2048 where N is the number of videos and T the maximum number of features, you would just perform a max operation on the dimension of sized T. The final tensor would thus be of size N x 2048 (no temporal dimension)

estathop commented 6 years ago

the MEE block accepts input of size N X M , where N is the batch size and M is the product of (feature size X cluster size) derived from NetVlad. Before that, when passing to text variable a N X W X Y matrix to Netvlad, where N is the batch size, W is the the number of word features (W is the number of words in sentence to be precise) derived from word2vec and Y is the feature size which is 300 in this occasion. I created a _numpy.object with 22 X W X 300 where W varies depending on the total words of the sentence, but there isn't a valid way to cast to numpy.array or to tensor because of the arbitrary dimension of W. So is it assumed that the W is a fixed number of words ? for example 10 ? Or is it a way I am missing. For that reason, I created an external loop with batch size 1 and gave only 1 aggregated video representation and 1 array of W X 300 which is the matrix from word2vec.

for enum in ordered_word_feats:
    word_tensor_to_be = enum.reshape(1,len(enum),300)
    word_tensor = torch.from_numpy(np.array(word_tensor_to_be))
    for enum2 in aggr_vis_feats:

        sess = K.get_session()
        array = sess.run(enum2)

        visual_data = torch.from_numpy(array)
        list2d.append(array)

        #visual_data = visual_data.type(torch.FloatTensor)
        #visual_data = Variable(visual_data, requires_grad = False)
        visual_data= visual_data.type(torch.FloatTensor)
        visual_data = Variable(visual_data, requires_grad = False)
        visual_ind= np.ones(1)

        ind = {'face': face_ind, 'audio': audio_ind, 'visual': visual_ind, 'motion': motion_ind}
        videos = {'face': face_data, 'audio': audio_data, 'visual': visual_data, 'motion': motion_data}
        ypreds = the_model(word_tensor, videos, ind)
        ypreds2 = the_model(word_tensor, videos, ind, False)
        pred_true.append(ypreds)
        pred_false.append(ypreds2)

In your personal github repository and in the version of tensorflow of loupe.py you mention there that:

class NetVLAD(PoolingBaseModel):
def forward(self, reshaped_input):
        """Forward pass of a NetVLAD block.
        Args:
        reshaped_input: If your input is in that form:
        'batch_size' x 'max_samples' x 'feature_size'
        It should be reshaped in the following form:
        'batch_size*max_samples' x 'feature_size'
        by performing:
        reshaped_input = tf.reshape(input, [-1, features_size])

In your pytorch code in current project you don't expect the same shape as you perform it internally

def forward(self,x):
        max_sample = x.size()[1]
x = x.view(-1,self.feature_size)

What's the case in this ? What am I missing again ?

antoine77340 / Mixture-of-Embedding-Experts

Multiple features throw batchnorm exception, problem at size #10