This repository consists of modules for pre-processing textual data. Examples are also given for training deep models (DNN, CNN, RNN, LSTM). There are many additional functionalities, which are as follows:
$ pip install -r requirements.txt
# Toy sentiment corpus: noisy sentences paired with binary labels.
raw_data = [
    'this,,, is$$ a positive ..sentence',
    'this is a ((*negative ,,@sentence',
    'yet another..'' positive$$ sentence',
    'the last one is ...,negative',
]
labels = [1, 0, 1, 0]
This type of data is commonly used in sentiment analysis type problems. The first step is to clean the data:
from dl_text import dl
# `data` is meant to hold the cleaned counterpart of every sentence in `raw_data`.
data = []
for sent in raw_data:
# NOTE(review): the loop body is missing here — presumably it cleans `sent` with a
# dl_text helper and appends the result to `data`; restore it from the package docs.
print data
['this is a positive sentence', 'this is a negative sentence',
'yet another positive sentence', 'the last one is negative']
Once the raw data is cleaned, the next step is to prepare the data so that it can be passed to the deep models. Use the following function:
# Single-channel usage: only `sent_l` is supplied and sentences are limited to `dimx` = 10 words.
data_inp = dl.process_data(sent_l = data, dimx = 10)
The process_data function preprocesses the data so that it can be used with deep models. It has the following parameters:
sent_l : data to be sent to the training model (if you are using only one channel, as in the case of sentiment analysis, then use this parameter)
sent_r : data for the second channel (discussed later)
wordVec_model : pre-trained word vector embeddings (either GloVe or Word2vec)
dimx and dimy : number of words to be included (if a sentence has fewer words than this value, it will be padded with 0; otherwise the extra words will be truncated)
vocab_size : number of unique words to be included in the vocabulary
embedding_dim : size of the embeddings for wordVec_models
from dl_text import dl
import gensim
# for 50-dim glove embeddings use:
wordVec_model = dl.loadGloveModel('path_of_the_embeddings/glove.6B.50d.txt')

# for 300 dim word2vec embeddings use:
# (the GoogleNews file is in the binary word2vec format, so binary=True is required)
wordVec_model = gensim.models.KeyedVectors.load_word2vec_format(
    "path/GoogleNews-vectors-negative300.bin.gz", binary=True)

# When a word-vector model is supplied, process_data also returns the embedding matrix.
data_inp, embedding_matrix = dl.process_data(sent_l = data, wordVec_model = wordVec_model, dimx = 10)
from dl_text import dl
from keras.layers import Input, Dense, Dropout, Merge, Conv1D, Lambda, Flatten, MaxPooling1D
from keras.layers import concatenate
from keras.models import Model
def model_dnn(dimx, embedding_matrix):
    """Build a small fully connected network over embedded word indices.

    Args:
        dimx: length of the integer word-index sequence fed to the model.
        embedding_matrix: matrix passed to ``dl.word2vec_embedding_layer``
            to create the embedding layer.

    Returns:
        A Keras ``Model`` (not compiled here) mapping the ``inpx`` input to a
        2-unit sigmoid output.
    """
    word_ids = Input(name='inpx', dtype='int32', shape=(dimx,))
    embedding_layer = dl.word2vec_embedding_layer(embedding_matrix)
    # Flatten the (words x embedding) output so Dense layers can consume it.
    flattened = Flatten()(embedding_layer(word_ids))
    hidden = Dense(units=10, activation='sigmoid')(flattened)
    scores = Dense(units=2, activation='sigmoid')(hidden)
    return Model([word_ids], scores)
def model_cnn(dimx, embedding_matrix):
    """Build a single-channel 1-D convolutional network over embedded word indices.

    Args:
        dimx: length of the integer word-index sequence fed to the model.
        embedding_matrix: matrix passed to ``dl.word2vec_embedding_layer``
            to create the embedding layer.

    Returns:
        A Keras ``Model`` (not compiled here) mapping the ``inpx`` input to a
        2-unit sigmoid output.
    """
    inpx = Input(shape=(dimx,), dtype='int32', name='inpx')
    embed = dl.word2vec_embedding_layer(embedding_matrix)(inpx)
    # Keras 2 argument names (filters / kernel_size) replace the deprecated
    # Keras 1 spellings nb_filter / filter_length.
    sent = Conv1D(filters=3, kernel_size=2, activation='relu')(embed)
    pool = MaxPooling1D()(sent)
    flat_embed = Flatten()(pool)
    nnet_h = Dense(units=10, activation='sigmoid')(flat_embed)
    nnet_out = Dense(units=2, activation='sigmoid')(nnet_h)
    model = Model([inpx], nnet_out)
    return model
data = ['this is a positive sentence', 'this is a negative sentence', 'yet another positive sentence', 'the last one is negative']
labels = [1,0,1,0]
data_inp, embedding_matrix = dl.process_data(sent_l = data, wordVec_model = wordVec_model, dimx = 10)

# NOTE(review): no compile step is shown in this example — confirm the model is
# compiled (loss/optimizer) before fit is called.

# Building the model and fitting it are two separate statements.
model = model_dnn(dimx = 10, embedding_matrix = embedding_matrix)
model.fit(data_inp, labels)

model = model_cnn(dimx = 10, embedding_matrix = embedding_matrix)
model.fit(data_inp, labels)
These types of models use two data streams. They can be used for NLP tasks such as question answering, sentence similarity computation, etc. The data looks like this:
# Two parallel text channels; entry k of data_l pairs with entry k of data_r and labels.
data_l = [
    'this is a positive sentence',
    'this is a negative sentence',
    'yet another positive sentence',
    'the last one is negative',
]
data_r = [
    'positive words are good, better, best, etc.',
    'negative words are bad, sad, etc.',
    'feeling good',
    'sooo depressed.',
]
labels = [1, 0, 1, 0]
Here, data_l and data_r can be two sentences for computing sentence similarity, question-answer pairs for a question answering problem, etc.
Let's define a model for these types of tasks:
def model_cnn2(dimx, dimy, embedding_matrix):
    """Build a two-channel 1-D convolutional network (e.g. for sentence pairs).

    Args:
        dimx: length of the word-index sequence of the first channel.
        dimy: length of the word-index sequence of the second channel.
        embedding_matrix: matrix passed to ``dl.word2vec_embedding_layer``
            to create the embedding layer of each channel.

    Returns:
        A Keras ``Model`` (not compiled here) taking ``[inpx, inpy]`` and
        producing a 2-unit sigmoid output.
    """
    inpx = Input(shape=(dimx,), dtype='int32', name='inpx')
    embedx = dl.word2vec_embedding_layer(embedding_matrix)(inpx)
    # The second channel is sized by its own length parameter, dimy.
    inpy = Input(shape=(dimy,), dtype='int32', name='inpy')
    embedy = dl.word2vec_embedding_layer(embedding_matrix)(inpy)

    sent_l = Conv1D(filters=3, kernel_size=2, activation='relu')(embedx)
    sent_r = Conv1D(filters=3, kernel_size=2, activation='relu')(embedy)
    pool_l = MaxPooling1D()(sent_l)
    pool_r = MaxPooling1D()(sent_r)

    # Flatten each branch before joining so the channels may differ in length.
    flat_l = Flatten()(pool_l)
    flat_r = Flatten()(pool_r)
    combine = concatenate([flat_l, flat_r])

    nnet_h = Dense(units=10, activation='sigmoid')(combine)
    nnet_out = Dense(units=2, activation='sigmoid')(nnet_h)
    # Both inputs must be declared, since the model is fed two arrays.
    model = Model([inpx, inpy], nnet_out)
    return model
# With two channels, process_data returns one input array per channel plus the embedding matrix.
data_inp_l, data_inp_r, embedding_matrix = dl.process_data(sent_l = data_l, sent_r = data_r,
                                                           wordVec_model = wordVec_model, dimx = 10, dimy = 10)

# NOTE(review): no compile step is shown in this example — confirm the model is
# compiled (loss/optimizer) before fit is called.
model = model_cnn2(dimx = 10, dimy = 10, embedding_matrix = embedding_matrix)
model.fit([data_inp_l, data_inp_r], labels)
>>> from dl_text import lex_sem_ft
>>> sent1 = 'i like natural language processing'
>>> sent2 = 'i like deep learning'
>>> lex_sem_ft.tokenize(sent1) # tokenizing a sentence
['i', 'like', 'natural', 'language', 'processing']
>>> lex_sem_ft.overlap(sent1,sent2) # number of words common
Functions currently present in the lex_sem_ft
>>> from dl_text import rd_ft
>>> sent1 = 'i like natural language processing'
>>> rd_ft.CPW(sent1) # average characters per word
>>> rd_ft.ED('good','great') # edit distance between two words
Functions currently present in the rd_ft
from dl_text import dl
from dl_text import lex_sem_ft
from dl_text import rd_ft
# Two parallel text channels; entry k of data_l pairs with entry k of data_r and labels.
data_l = [
    'this is a positive sentence',
    'this is a negative sentence',
    'yet another positive sentence',
    'the last one is negative',
]
data_r = [
    'positive words are good, better, best, etc.',
    'negative words are bad, sad, etc.',
    'feeling good',
    'sooo depressed.',
]
labels = [1, 0, 1, 0]
wordVec_model = dl.loadGloveModel('path_of_the_embeddings/glove.6B.50d.txt')

# Hand-crafted features: one row per (data_l, data_r) sentence pair.
# NOTE(review): assumes every helper below returns a single number — confirm
# against lex_sem_ft / rd_ft before relying on the row length.
all_feat = []
for i,j in zip(data_l, data_r):
    feat1 = lex_sem_ft.overlap(i, j)
    feat2 = lex_sem_ft.W2V_Vec(i, j, wordVec_model)
    feat3 = rd_ft.ED(i, j)
    feat4 = rd_ft.LCW(i, j)
    # Store the row; otherwise all_feat stays empty and nothing reaches the model.
    all_feat.append([feat1, feat2, feat3, feat4])

data_inp_l, data_inp_r, embedding_matrix = dl.process_data(sent_l = data_l, sent_r = data_r,
                                                           wordVec_model = wordVec_model, dimx = 10, dimy = 10)

def model_cnn_ft(dimx, dimy, dimft, embedding_matrix):
    """Build a two-channel CNN that also consumes external per-pair features.

    Args:
        dimx: length of the word-index sequence of the first channel.
        dimy: length of the word-index sequence of the second channel.
        dimft: number of external features supplied per sample.
        embedding_matrix: matrix passed to ``dl.word2vec_embedding_layer``
            to create the embedding layer of each channel.

    Returns:
        A Keras ``Model`` (not compiled here) taking ``[inpx, inpy, inpz]``
        and producing a 2-unit sigmoid output.
    """
    inpx = Input(shape=(dimx,), dtype='int32', name='inpx')
    embedx = dl.word2vec_embedding_layer(embedding_matrix)(inpx)
    # The second channel is sized by its own length parameter, dimy.
    inpy = Input(shape=(dimy,), dtype='int32', name='inpy')
    embedy = dl.word2vec_embedding_layer(embedding_matrix)(inpy)
    # Feature input is float so it can be concatenated with the convolutional features.
    inpz = Input(shape=(dimft,), dtype='float32', name='inpz')

    sent_l = Conv1D(filters=3, kernel_size=2, activation='relu')(embedx)
    sent_r = Conv1D(filters=3, kernel_size=2, activation='relu')(embedy)
    pool_l = MaxPooling1D()(sent_l)
    pool_r = MaxPooling1D()(sent_r)

    # The pooled outputs are 3-D while inpz is 2-D, so flatten before concatenating.
    flat_l = Flatten()(pool_l)
    flat_r = Flatten()(pool_r)
    combine = concatenate([flat_l, flat_r, inpz])

    nnet_h = Dense(units=10, activation='sigmoid')(combine)
    nnet_out = Dense(units=2, activation='sigmoid')(nnet_h)
    # All three inputs must be declared, since the model is fed three arrays.
    model = Model([inpx, inpy, inpz], nnet_out)
    return model

# NOTE(review): no compile step is shown in this example — confirm the model is
# compiled (loss/optimizer) before fit is called.
# dimft is the number of features per sample (row length), not the number of samples.
model = model_cnn_ft(dimx = 10, dimy = 10, dimft = len(all_feat[0]), embedding_matrix = embedding_matrix)
model.fit([data_inp_l, data_inp_r, all_feat], labels)
The mean average precision (MAP) and mean reciprocal rank (MRR) are computed as:
In our implementation we assume that the ground truth is arranged starting with the true labels and is/are followed by false labels.
>>> from dl_text import metrics
>>> pred = [[0,0,1],[0,0,1]] # we have two queries with 3 answers for each; 1 - relevant, 0 - irrelevant
'''Converting the prediction list to dictionary'''
>>> dict1 = {}
>>> for i,j in enumerate(pred):
...     dict1[i] = j
>>> metrics.Map(dict1)
>>> metrics.Mrr(dict1)
>>> pred = [[0,1,1],[0,1,0]]
>>> for i,j in enumerate(pred):
...     dict1[i] = j
>>> metrics.Map(dict1)
>>> metrics.Mrr(dict1)