facebookresearch / mmf

A modular framework for vision & language multimodal research from Facebook AI Research (FAIR)
https://mmf.sh/

Fisher vector implementation over the phoc features #1198

Open sarthak-sg opened 2 years ago

sarthak-sg commented 2 years ago

I am getting the error `size mismatch for linear_ocr_feat_to_mmt_in.weight: copying a param with shape torch.Size([768, 3002]) from checkpoint, the shape in current model is torch.Size([768, 3464])`. I have implemented Fisher vectors over the PHOC features by modifying the code in `pythia/utils/phoc/build_phoc.py`. After that change the feature dimension became (38400,) instead of the previous (604,) (with K = 64 GMM components and D = 300 PCA dimensions, the Fisher vector has 2 * K * D = 38400 entries), so I added a fully connected layer on top of it to reduce the dimension back to 604. After making these changes, how do I make my model work with the new OCR features?

Instructions To Reproduce the Issue:

I am getting a size mismatch error when loading the checkpoint, and I am not able to figure out how to change the corresponding dimension in the checkpoint.
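For context, one generic PyTorch workaround (a minimal sketch, not the official mmf checkpoint API; the checkpoint path and the nesting of the state dict under a `model` key are assumptions) is to drop the mismatched keys from the checkpoint and load the rest non-strictly, so the new (768, 3464) projection is trained from scratch:

```python
import torch

# Sketch only: remove the OCR projection weights whose shape changed, keep
# everything else. 'save/m4c/best.ckpt' is a hypothetical path.
ckpt = torch.load('save/m4c/best.ckpt', map_location='cpu')
state_dict = ckpt.get('model', ckpt)  # some checkpoints nest the state dict

for key in ('linear_ocr_feat_to_mmt_in.weight', 'linear_ocr_feat_to_mmt_in.bias'):
    state_dict.pop(key, None)

# Load the surviving weights, ignoring the removed keys:
# model.load_state_dict(state_dict, strict=False)
```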

  1. full code you wrote or full changes you made (git diff)

```python
import scipy.io
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import logging
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture as GMM
import time
import json
import os

from .cphoc import build_phoc as _build_phoc_raw

_alphabet = {"a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","0","1","2","3","4","5","6","7","8","9"} # NoQA

def build_phocs(words, phoc_unigrams, unigram_levels, bigram_levels=None,
                phoc_bigrams=None, split_character=None, on_unknown_unigram='error'):
    '''
    Calculate Pyramidal Histogram of Characters (PHOC) descriptors (see Almazan 2014).

    Args:
        words (list of str): words to calculate the descriptors for
        phoc_unigrams (str): string of all unigrams to use in the PHOC
        unigram_levels (list of int): the levels for the unigrams in the PHOC
        phoc_bigrams (list of str): list of bigrams to be used in the PHOC
        bigram_levels (list of int): the levels of the bigrams in the PHOC
        split_character (str): special character to split the word strings into characters
        on_unknown_unigram (str): what to do if a unigram appearing in a word
            is not among the supplied phoc_unigrams. Possible: 'warn', 'error'
    Returns:
        the PHOCs for the given words
    '''
    # prepare output matrix
    logger = logging.getLogger('PHOCGenerator')
    if on_unknown_unigram not in ['error', 'warn']:
        raise ValueError("I don't know the on_unknown_unigram parameter '%s'" % on_unknown_unigram)
    phoc_size = len(phoc_unigrams) * np.sum(unigram_levels)
    if phoc_bigrams is not None:
        phoc_size += len(phoc_bigrams) * np.sum(bigram_levels)
    phocs = np.zeros((len(words), phoc_size))

    # prepare some lambda functions
    occupancy = lambda k, n: [float(k) / n, float(k + 1) / n]
    overlap = lambda a, b: [max(a[0], b[0]), min(a[1], b[1])]
    size = lambda region: region[1] - region[0]

    # map from character to alphabet position
    char_indices = {d: i for i, d in enumerate(phoc_unigrams)}

    # iterate through all the words
    for word_index, word in enumerate(words):
        if split_character is not None:
            word = word.split(split_character)
        n = len(word)
        for index, char in enumerate(word):
            char_occ = occupancy(index, n)
            if char not in char_indices:
                if on_unknown_unigram == 'warn':
                    logger.warning("The unigram '%s' is unknown, skipping this character", char)
                    continue
                else:
                    logger.fatal("The unigram '%s' is unknown", char)
                    raise ValueError()
            char_index = char_indices[char]
            for level in unigram_levels:
                for region in range(level):
                    region_occ = occupancy(region, level)
                    if size(overlap(char_occ, region_occ)) / size(char_occ) >= 0.5:
                        feat_vec_index = (sum(l for l in unigram_levels if l < level) * len(phoc_unigrams)
                                          + region * len(phoc_unigrams) + char_index)
                        phocs[word_index, feat_vec_index] = 1

        # add bigrams
        if phoc_bigrams is not None:
            ngram_features = np.zeros(len(phoc_bigrams) * np.sum(bigram_levels))
            ngram_occupancy = lambda k, n: [float(k) / n, float(k + 2) / n]
            for i in range(n - 1):
                ngram = word[i:i + 2]
                phoc_dict = {k: v for v, k in enumerate(phoc_bigrams)}
                if ngram not in phoc_dict:
                    continue
                occ = ngram_occupancy(i, n)
                for level in bigram_levels:
                    for region in range(level):
                        region_occ = occupancy(region, level)
                        overlap_size = size(overlap(occ, region_occ)) / size(occ)
                        if overlap_size >= 0.5:
                            ngram_features[region * len(phoc_bigrams) + phoc_dict[ngram]] = 1
            phocs[word_index, -ngram_features.shape[0]:] = ngram_features

    return phocs

def phoc(raw_word):
    '''
    :param raw_word: string of the word to be converted
    :return: PHOC representation as an np.array of shape (1, 604)
    '''
    word = [raw_word.lower()]
    phoc_unigrams = 'abcdefghijklmnopqrstuvwxyz0123456789'
    unigram_levels = [2, 3, 4, 5]
    bigram_levels = [2]

    # read the 50 most frequent bigrams
    phoc_bigrams = []
    i = 0
    with open('/pythia/utils/phoc/bigrams_new.txt', 'r') as f:
        for line in f:
            a = line.split()
            phoc_bigrams.append(a[0].lower())
            # phoc_bigrams.append(list(a[0])[0])
            # phoc_bigrams.append(list(a[0])[1])
            i = i + 1
            if i >= 50:
                break

    qry_phocs = build_phocs(words=word, phoc_unigrams=phoc_unigrams,
                            unigram_levels=unigram_levels,
                            bigram_levels=bigram_levels, phoc_bigrams=phoc_bigrams)
    return qry_phocs
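# Note (added for illustration, not part of the original diff): the descriptor
# length is 36 unigrams * (2 + 3 + 4 + 5) levels + 50 bigrams * 2 levels = 604,
# which matches the (1, 604) shape documented above.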

def text_cleaner(dirty_text):
    '''Removes unwanted characters and returns the cleaned string.'''
    clean_text = ''.join(c for c in dirty_text if c not in '(){}<>;:!@#$%^&*_-=+-*/[]\' \"?>.<,')
    return clean_text

def fisher_vector(xx, gmm):
    """Computes the Fisher vector on a set of descriptors.

    Parameters
    ----------
    xx: array_like, shape (N, D) or (D, )
        The set of descriptors.
    gmm: instance of sklearn GaussianMixture object
        Gaussian mixture model of the descriptors.

    Returns
    -------
    fv: array_like, shape (K + 2 * D * K, )
        Fisher vector (derivatives with respect to the mixing weights, means
        and variances) of the given descriptors.

    Reference
    ---------
    J. Krapac, J. Verbeek, F. Jurie. Modeling Spatial Layout with Fisher
    Vectors for Image Categorization. In ICCV, 2011.
    http://hal.inria.fr/docs/00/61/94/03/PDF/final.r1.pdf
    """
    xx = np.atleast_2d(xx)
    N = xx.shape[0]

    # Compute posterior probabilities.
    Q = gmm.predict_proba(xx)  # NxK

    # Compute the sufficient statistics of descriptors.
    Q_sum = np.sum(Q, 0)[:, np.newaxis] / N
    Q_xx = np.dot(Q.T, xx) / N
    Q_xx_2 = np.dot(Q.T, xx ** 2) / N

    # Compute derivatives with respect to mixing weights, means and variances.
    d_pi = Q_sum.squeeze() - gmm.weights_
    d_mu = Q_xx - Q_sum * gmm.means_
    d_sigma = (
        - Q_xx_2
        - Q_sum * gmm.means_ ** 2
        + Q_sum * gmm.covariances_
        + 2 * Q_xx * gmm.means_)

    # Merge derivatives into a vector (the mixing-weight derivatives d_pi are
    # dropped here, so the actual output shape is (2 * D * K, )).
    # return np.hstack((d_pi, d_mu.flatten(), d_sigma.flatten()))
    return np.hstack((d_mu.flatten(), d_sigma.flatten()))
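# Note (illustrative comment, not in the original code): with K = 64 mixture
# components and D = 300 PCA dimensions this returns 2 * K * D = 38400 values,
# since the K mixing-weight derivatives (d_pi) are dropped above.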

# build the PHOC matrix over the 90K Jaderberg dictionary
dictionary = '/pythia/utils/phoc/90K_dictionary_Jaderberg.txt'

file = open(dictionary, 'r')
lines = file.readlines()
phoc_matrix = np.zeros((len(lines), 604))
for i, line in tqdm(enumerate(lines)):
    phoc_matrix[i] = phoc(text_cleaner(line.replace('\n', '')))
print('The shape of the data is: ', np.shape(phoc_matrix))

# first, L2-normalize the PHOCs
norm_phoc_matrix = preprocessing.normalize(phoc_matrix, norm='l2')

# scale the PHOCs
scaler = StandardScaler()
scaler.fit(norm_phoc_matrix)
data = scaler.transform(norm_phoc_matrix)

# PCA
pca = PCA(n_components=300)
pca.fit(data)
pca_data = pca.transform(data)
print(np.shape(pca_data))
print('PCA Complete!')

# train the GMM model (original: on raw PHOCs with PCA)
start = time.time()
gmm = GMM(n_components=64, covariance_type='diag')
print(np.shape(pca_data))
gmm.fit(pca_data)


def build_phoc(token):
    token = token.lower().strip()
    token = ''.join([c for c in token if c in _alphabet])
    phoc = _build_phoc_raw(token)
    phoc = np.array(phoc, dtype=np.float32)
    phocs = np.resize(phoc, (1, np.shape(phoc)[0]))
    phoc_normalized = preprocessing.normalize(phocs, norm='l2')
    scaler_phoc = scaler.transform(phoc_normalized)
    phoc_PCA = pca.transform(scaler_phoc)
    phoc_FV = fisher_vector(phoc_PCA, gmm)
    phoc_FV = preprocessing.normalize(phoc_FV.reshape(1, -1), norm='l2')
    phoc_FV = phoc_FV.astype(np.float32)
    phoc_FV = torch.from_numpy(phoc_FV)
    # project the (1, 38400) Fisher vector back to the original PHOC size;
    # NOTE: this Linear layer is randomly re-initialized on every call
    m = nn.Linear(38400, 604)
    phoc_FV = m(phoc_FV)
    phoc_FV = phoc_FV.cpu().detach().numpy()
    phoc_FV = phoc_FV.reshape((-1))
    return phoc_FV
```
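A quick sanity check of the modified pipeline (a sketch that assumes the script above has executed, so `scaler`, `pca`, and `gmm` are fitted; not part of the original diff):

```python
# End-to-end shape check for the modified build_phoc.
fv = build_phoc('hello')
print(fv.shape, fv.dtype)  # expected: (604,) float32 after the Linear projection
```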

2. what exact command you run:

```bash
python -m torch.distributed.launch --nproc_per_node 4 tools/run.py --tasks vqa --datasets m4c_textvqa --model m4c \
    --config configs/vqa/m4c_textvqa/m4c.yml \
    --save_dir save/m4c \
    training_parameters.distributed True
```

3. __full logs__ you observed:

```
size mismatch for linear_ocr_feat_to_mmt_in.weight: copying a param with shape torch.Size([768, 3002]) from checkpoint, the shape in current model is torch.Size([768, 3464])
```


## Problem motive:

How do I extract the new features? The PHOC and Faster R-CNN features were already extracted and provided; can someone guide me on how to extract the Fisher vector features? It is nowhere mentioned in the paper how to extract the features, or where and how to store and use them.
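For reference, since the paper leaves the extraction unspecified, one plausible scheme (everything here is an assumption; the helper and the output layout are hypothetical) is to precompute the Fisher vector feature for every OCR token offline and store one `.npy` file of stacked per-token features per image, mirroring how the Faster R-CNN features are shipped:

```python
import os
import numpy as np

def extract_fv_features(tokens_per_image, out_dir):
    """Hypothetical offline extraction loop: tokens_per_image maps an image id
    to its list of OCR token strings; build_phoc is the modified function above."""
    os.makedirs(out_dir, exist_ok=True)
    for image_id, tokens in tokens_per_image.items():
        if tokens:
            feats = np.stack([build_phoc(t) for t in tokens]).astype(np.float32)
        else:
            feats = np.zeros((0, 604), dtype=np.float32)
        np.save(os.path.join(out_dir, f'{image_id}.npy'), feats)
```

A dataset reader could then load these files per sample in place of the on-the-fly PHOC computation.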

## Environment:

Provide your environment information using the following command:

```
python -m torch.utils.collect_env
```



If your issue looks like an installation issue / environment issue,
please first try to solve it with the instructions in
https://mmf.sh/docs/getting_started/installation