import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D from keras.models import Sequential from sklearn.feature_extraction.text import CountVectorizer from keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences from sklearn.model_selection import train_test_split from keras.utils.np_utils import to_categorical from keras.callbacks import EarlyStopping

# Load only the TITLE and CATEGORY columns from the news CSV.
data = pd.read_csv(
    'news_data.csv',
    usecols=['TITLE', 'CATEGORY'],
)

The 'm' class has far fewer samples than the other classes, so the dataset is imbalanced.

# Show how many rows each CATEGORY value has before balancing.
print("Count of Data ", data['CATEGORY'].value_counts())

Balancing the classes

# Upper bound on the number of rows kept per category.
num_of_categories = 45000

# Randomly reorder the rows so the per-category slices below are random samples.
shuffled = data.reindex(np.random.permutation(data.index))

# Take at most `num_of_categories` rows for each category code. Slicing does
# not pad, so a category with fewer rows than the cap simply stays smaller.
e, b, t, m = (
    shuffled[shuffled['CATEGORY'] == code][:num_of_categories]
    for code in ('e', 'b', 't', 'm')
)

# Stack the per-category samples into one frame with a fresh 0..n-1 index.
concated = pd.concat([e, b, t, m], ignore_index=True)

Shuffle the dataset

# The concatenated frame is grouped by category; permute the rows to mix them.
concated = concated.reindex(index=np.random.permutation(concated.index))

# Add an integer LABEL column, initialised to 0 for every row.
concated = concated.assign(LABEL=0)

print("Count of Data After balancing", concated['CATEGORY'].value_counts())

Map each category to an integer label, then one-hot encode the label

# Write the integer class id for each category code into LABEL.
for category_code, label_id in (('e', 0), ('b', 1), ('t', 2), ('m', 3)):
    concated.loc[concated['CATEGORY'] == category_code, 'LABEL'] = label_id

# Preview the first ten integer labels.
print(concated['LABEL'].head(10))

# One-hot encode the integer LABEL column into 4 class columns.
labels = to_categorical(
    concated['LABEL'],
    num_classes=4,
)
print(labels[:10])

# Remove the CATEGORY column now that LABEL carries the class id.
# DataFrame.drop returns a new frame rather than modifying in place, so the
# result has to be assigned back for the column to actually disappear.
# errors='ignore' makes this a no-op when CATEGORY is already absent.
concated = concated.drop(columns=['CATEGORY'], errors='ignore')
print(concated)

# Vocabulary cap for the tokenizer and fixed length used when padding sequences.
n_most_common_words = 8000
max_len = 130

# Build the tokenizer with the vocabulary cap defined above.
# NOTE(review): compared with Keras' default filter set this string lacks the
# backslash and underscore characters (and tab/newline); they may have been
# lost when the notebook was exported - confirm against the original.
tokenizer = Tokenizer(
    num_words=n_most_common_words,
    filters='!"#$%&()*+,-./:;<=>?@[]^`{|}~',
    lower=True,
)

# Learn the vocabulary from the titles, then encode each title as a list of
# integer word ids.
tokenizer.fit_on_texts(concated['TITLE'].values)
sequences = tokenizer.texts_to_sequences(concated['TITLE'].values)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Bring every encoded title to exactly `max_len` entries.
X = pad_sequences(sequences, maxlen=max_len)

# Hold out 25% of the samples as a test set; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.25,
    random_state=42,
)

# Training hyperparameters.
epochs = 10
emb_dim = 128
batch_size = 256

# Notebook display expression; it has no visible effect when run as a script.
labels[:2]

Printing the shapes of the train/test splits

# Report the shapes of the training and test arrays as a single tuple.
print((
    X_train.shape,
    y_train.shape,
    X_test.shape,
    y_test.shape,
))

Building Model

# Layer stack: embedding -> spatial dropout -> LSTM -> 4-way softmax output.
model = Sequential([
    Embedding(n_most_common_words, emb_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.7),
    LSTM(64, dropout=0.7, recurrent_dropout=0.7),
    Dense(4, activation='softmax'),
])

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

Printing the model summary

# Model.summary() prints the layer table itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
model.summary()

Fitting the model to the training data

# Train on the training split, holding out 20% of it for validation.
# Early stopping watches val_loss and halts once it has failed to improve by
# at least 0.0001 for 7 epochs in a row.
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[
        EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=7),
    ],
)

Evaluating the model on the test set

# Despite its name, this holds both values reported below: index 0 is used as
# the loss and index 1 as the accuracy.
accuracy = model.evaluate(X_test, y_test)

Printing the test loss and accuracy

# Report test loss and accuracy, each rounded to three decimal places.
print(
    'Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(
        accuracy[0],
        accuracy[1],
    )
)

# Sample headline. NOTE: this value is replaced by the assignment right below,
# so only the second headline is actually classified.
txt = ["Regular fast food eating linked to fertility issues in women"]

txt = ["increase of mobiles are dangerous for society"]

# Encode and pad the headline exactly like the training data, then predict
# the class probabilities.
seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq, maxlen=max_len)
pred = model.predict(padded)

simplysameer333 / MachineLearning

news2 - uci-news-aggregator.csv.gz #13