import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
# Load only the headline text and its category code from the news dataset.
data = pd.read_csv('news_data.csv', usecols=['TITLE', 'CATEGORY'])

# The 'm' class has far fewer rows than the others, so the classes are
# unbalanced.
print("Count of Data ", data.CATEGORY.value_counts())

# Balance the classes by capping every category at the same number of rows.
# A category with fewer rows than the cap simply keeps all of its rows.
num_of_categories = 45000
shuffled = data.reindex(np.random.permutation(data.index))
e, b, t, m = (
    shuffled[shuffled['CATEGORY'] == category].head(num_of_categories)
    for category in ('e', 'b', 't', 'm')
)
concated = pd.concat([e, b, t, m], ignore_index=True)

# Shuffle the combined dataset so the categories are interleaved.
concated = concated.reindex(np.random.permutation(concated.index))
# Placeholder integer label column; initialised to 0 for every row.
concated['LABEL'] = 0
print("Count of Data After balancing", concated.CATEGORY.value_counts())
# Sample headlines to classify. The second assignment overwrites the first,
# so only the second headline is actually used below.
txt = ["Regular fast food eating linked to fertility issues in women"]
txt = ["increase of mobiles are dangerous for society"]
# NOTE(review): `tokenizer`, `max_len` and `model` are not bound anywhere above
# this point in the visible file, so these lines would raise NameError if run
# here. Presumably this fragment is a pasted excerpt of the prediction step
# that belongs after training; confirm before relying on it.
seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq, maxlen=max_len)
pred = model.predict(padded)
import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D from keras.models import Sequential from sklearn.feature_extraction.text import CountVectorizer from keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences from sklearn.model_selection import train_test_split from keras.utils.np_utils import to_categorical from keras.callbacks import EarlyStopping
# Read the two columns used by the classifier: the headline and its category.
data = pd.read_csv('news_data.csv', usecols=['TITLE', 'CATEGORY'])

# The 'm' class has far fewer rows than the others, so the classes are
# unbalanced.
print("Count of Data ", data.CATEGORY.value_counts())

# Balance the classes: shuffle the rows, then keep at most
# `num_of_categories` rows of each category.
num_of_categories = 45000
shuffled = data.reindex(np.random.permutation(data.index))
e = shuffled[shuffled['CATEGORY'] == 'e'][:num_of_categories]
b = shuffled[shuffled['CATEGORY'] == 'b'][:num_of_categories]
t = shuffled[shuffled['CATEGORY'] == 't'][:num_of_categories]
m = shuffled[shuffled['CATEGORY'] == 'm'][:num_of_categories]
concated = pd.concat([e, b, t, m], ignore_index=True)

# Shuffle the dataset again so rows are no longer grouped by category.
concated = concated.reindex(np.random.permutation(concated.index))
# Integer label column, initialised to 0 for every row.
concated['LABEL'] = 0
print("Count of Data After balancing", concated.CATEGORY.value_counts())
# Encode each category code as an integer class id in the LABEL column.
category_to_label = {'e': 0, 'b': 1, 't': 2, 'm': 3}
for category, label in category_to_label.items():
    concated.loc[concated['CATEGORY'] == category, 'LABEL'] = label

print(concated['LABEL'][:10])

# One-hot encode the integer labels into 4 columns (one per class).
labels = to_categorical(concated['LABEL'], num_classes=4)
print(labels[:10])

# DataFrame.drop returns a new frame rather than modifying in place, so the
# result must be re-bound for the CATEGORY column to actually be removed.
if 'CATEGORY' in concated.keys():
    concated = concated.drop(['CATEGORY'], axis=1)
print(concated)
# Vocabulary size passed to the tokenizer and the fixed length every headline
# sequence is padded/truncated to.
n_most_common_words = 8000
max_len = 130

# Build the vocabulary from the headlines and convert each headline to a
# sequence of integer word ids. (The original referenced the misspelled name
# `n_most_commonwords`, which is never defined.)
tokenizer = Tokenizer(
    num_words=n_most_common_words,
    filters='!"#$%&()*+,-./:;<=>?@[]^`{|}~',
    lower=True,
)
tokenizer.fit_on_texts(concated['TITLE'].values)
sequences = tokenizer.texts_to_sequences(concated['TITLE'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad to a rectangular matrix and hold out 25% of the rows for testing.
X = pad_sequences(sequences, maxlen=max_len)
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.25, random_state=42
)

# Training hyper-parameters.
epochs = 10
emb_dim = 128
batch_size = 256
# Bare expression: has no effect when run as a script (it only displays a
# value when it is the last expression of an interactive/notebook cell).
labels[:2]
# Print the shapes of the train/test feature and label arrays.
print((X_train.shape, y_train.shape, X_test.shape, y_test.shape))

# Build the model: embedding -> spatial dropout -> LSTM -> 4-way softmax.
model = Sequential()
model.add(Embedding(n_most_common_words, emb_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.7))
model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
model.add(Dense(4, activation='softmax'))
# categorical_crossentropy is used because the targets are one-hot vectors.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

# Print the model summary. summary() prints the table itself and returns
# None, so wrapping it in print() would emit a stray "None" line.
model.summary()
# Fit the model on the training data, using 20% of it for validation and
# stopping early if val_loss fails to improve by min_delta for 7 epochs.
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=7, min_delta=0.0001)],
)

# Evaluate on the held-out test set. Despite its name, `accuracy` is indexed
# below as [loss, accuracy].
accuracy = model.evaluate(X_test, y_test)

# Report test loss and accuracy.
print('Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(accuracy[0], accuracy[1]))
# Sample headlines to classify. Only the last assignment takes effect; swap
# the two lines to try the other headline.
txt = ["Regular fast food eating linked to fertility issues in women"]
txt = ["increase of mobiles are dangerous for society"]

# Apply the same tokenisation and padding that was used for training, then
# predict the class probabilities.
seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq, maxlen=max_len)
pred = model.predict(padded)

# Human-readable class names, listed in the order of the integer ids assigned
# to the category codes e, b, t, m (0..3).
# NOTE: this rebinds `labels`, which previously held the one-hot label array.
labels = ['entertainment', 'bussiness', 'science/tech', 'health']

# Print the raw probabilities and the name of the most likely class.
# np.argmax over the whole array is a valid class index here because `txt`
# holds a single headline.
print(pred, labels[np.argmax(pred)])
# Reference: https://www.kaggle.com/ngyptr/multi-class-classification-with-lstm/data