[Closed] chay-muvva closed this issue 3 years ago
Could you make a smaller example? It's going to take a while to debug a script this big.
Faced the same problem. Could it be that your dataset is empty? I think in that case ProgbarLogger.on_epoch_end might be called before ProgbarLogger.on_batch_begin (which is what sets the attribute self.log_values) is ever called.
My problem was with the fit_generator function, where I had accidentally set steps_per_epoch=0, which leads to the very same error. In any case, this exception is misleading. I would suggest catching these cases and raising a different, more descriptive exception that tells the user about the invalid input.
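For reference, a minimal snippet along these lines should reproduce the same error (a sketch only: the toy model and generator here are made up for illustration):

import numpy as np
from keras.models import Sequential
from keras.layers import Dense

def emptyGenerator():
    # Never actually consumed when steps_per_epoch=0
    while True:
        yield np.zeros((1, 4)), np.zeros((1, 1))

model = Sequential()
model.add(Dense(1, input_dim=4))
model.compile(optimizer='sgd', loss='mse')

# No batch ever runs, so ProgbarLogger.on_batch_begin never sets
# self.log_values before on_epoch_end tries to read it:
# AttributeError: 'ProgbarLogger' object has no attribute 'log_values'
model.fit_generator(emptyGenerator(), steps_per_epoch=0, epochs=1)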
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras import optimizers
from keras.models import load_model
import json, argparse, os
import re
import io
import sys

# Path to training and testing data file. This data can be downloaded from a link, details of which will be provided.
trainDataPath = "train.txt"
testDataPath = "dev.txt"

# Output file that will be generated. This file can be directly submitted.
solutionPath = "test.txt"
# Path to directory where GloVe file is saved.
gloveDir = "./"
NUM_FOLDS = 1              # Value of K in K-fold Cross Validation
NUM_CLASSES = 4            # Number of classes - Happy, Sad, Angry, Others
MAX_NB_WORDS = 20000       # Upper limit on the number of tokens extracted using keras.preprocessing.text.Tokenizer
MAX_SEQUENCE_LENGTH = 100  # All sentences with fewer words than this will be padded
EMBEDDING_DIM = 100        # The dimension of the word embeddings
BATCH_SIZE = 200           # The batch size to be chosen for training the model
LSTM_DIM = 128             # The dimension of the representations learnt by the LSTM model
DROPOUT = 0.2              # Fraction of the units to drop for the linear transformation of the inputs. Ref - https://keras.io/layers/recurrent/
NUM_EPOCHS = 15            # Number of epochs to train a model for

label2emotion = {0: "others", 1: "happy", 2: "sad", 3: "angry"}
emotion2label = {"others": 0, "happy": 1, "sad": 2, "angry": 3}

def preprocessData(dataFilePath, mode):
    """Load data from a file, process and return indices, conversations and labels in separate lists
    Input:
        dataFilePath : Path to train/test file to be processed
        mode : "train" mode returns labels. "test" mode doesn't return labels.
    Output:
        indices : Unique conversation ID list
        conversations : List of 3 turn conversations, processed and each turn separated by the <eos> tag
        labels : [Only available in "train" mode] List of labels
    """
    indices = []
    conversations = []
    labels = []
    with io.open(dataFilePath, encoding="utf8") as finput:
        # Skip the header line
        finput.readline()
        for line in finput:
            # Convert multiple instances of . ? ! , to single instance
            line = re.sub(r'([.?!,])\1+', r'\1', line)
            line = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and a label
                labels.append(emotion2label[line[4]])
            indices.append(int(line[0]))
            # Join the 3 turns with the <eos> tag and lowercase the result
            conversations.append(' <eos> '.join(line[1:4]).lower())
    if mode == "train":
        return indices, conversations, labels
    return indices, conversations

def getMetrics(predictions, ground):
    """Given predicted labels and the respective ground truth labels, display some metrics
    Input: shape [# of samples, NUM_CLASSES]
        predictions : Model output. Every row has 4 decimal values, with the highest belonging to the predicted class
        ground : Ground truth labels, converted to one-hot encodings. A sample belonging to Happy class will be [0, 1, 0, 0]
    Output:
        accuracy : Average accuracy
        microPrecision : Precision calculated on a micro level. Ref - https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001
        microRecall : Recall calculated on a micro level
        microF1 : Harmonic mean of microPrecision and microRecall. Higher value implies better classification
    """
    # Discretise the predictions, e.g. [0.1, 0.3, 0.2, 0.1] -> [0, 1, 0, 0]
    discretePredictions = to_categorical(predictions.argmax(axis=1), num_classes=NUM_CLASSES)

    truePositives = np.sum(discretePredictions * ground, axis=0)
    falsePositives = np.sum(np.clip(discretePredictions - ground, 0, 1), axis=0)
    falseNegatives = np.sum(np.clip(ground - discretePredictions, 0, 1), axis=0)

    accuracy = np.mean(discretePredictions.argmax(axis=1) == ground.argmax(axis=1))
    # Micro-averaged over the three emotion classes only, i.e. excluding "others"
    microPrecision = truePositives[1:].sum() / (truePositives[1:].sum() + falsePositives[1:].sum() + 1e-10)
    microRecall = truePositives[1:].sum() / (truePositives[1:].sum() + falseNegatives[1:].sum() + 1e-10)
    microF1 = (2 * microPrecision * microRecall) / (microPrecision + microRecall + 1e-10)

    print("Accuracy : %.4f, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f"
          % (accuracy, microPrecision, microRecall, microF1))
    return accuracy, microPrecision, microRecall, microF1

def writeNormalisedData(dataFilePath, texts):
    """Write normalised data to a file
    Input:
        dataFilePath : Path to original train/test file that has been processed
        texts : List containing the normalised 3 turn conversations, separated by the <eos> tag.
    """
    normalisedDataFilePath = dataFilePath.replace(".txt", "_normalised.txt")
    with io.open(normalisedDataFilePath, 'w', encoding='utf8') as fout:
        with io.open(dataFilePath, encoding='utf8') as fin:
            # Skip the header line
            fin.readline()
            for lineNum, line in enumerate(fin):
                line = line.strip().split('\t')
                normalisedLine = texts[lineNum].strip().split('<eos>')
                fout.write(line[0] + '\t')
                # Write the original turn, followed by the normalised version of the same turn
                for i in range(3):
                    fout.write(line[i + 1] + '\t' + normalisedLine[i].strip() + '\t')
                # Train files carry the label in a fifth column
                if len(line) == 5:
                    fout.write(line[4])
                fout.write('\n')

def getEmbeddingMatrix(wordIndex):
    """Populate an embedding matrix using a word-index. If the word "happy" has an index 19,
    the 19th row in the embedding matrix should contain the embedding vector for the word "happy".
    Input:
        wordIndex : A dictionary of (word : index) pairs, extracted using a tokeniser
    Output:
        embeddingMatrix : A matrix where every row has 100 dimensional GloVe embedding
    """
    embeddingsIndex = {}
    # Load the embedding vectors from the GloVe file
    # (assumes the 100-dimensional glove.6B file; adjust the filename to whatever is in gloveDir)
    with io.open(os.path.join(gloveDir, 'glove.6B.100d.txt'), encoding="utf8") as f:
        for line in f:
            values = line.split()
            embeddingsIndex[values[0]] = np.asarray(values[1:], dtype='float32')
    # Word indices start at 1, so reserve row 0 for padding
    embeddingMatrix = np.zeros((len(wordIndex) + 1, EMBEDDING_DIM))
    for word, i in wordIndex.items():
        embeddingVector = embeddingsIndex.get(word)
        if embeddingVector is not None:
            # Words not found in the embedding index stay all-zeros
            embeddingMatrix[i] = embeddingVector
    return embeddingMatrix

def buildModel(embeddingMatrix):
    """Constructs the architecture of the model
    Input:
        embeddingMatrix : The embedding matrix to be loaded in the embedding layer.
    Output:
        model : A basic LSTM model
    """
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                               EMBEDDING_DIM,
                               weights=[embeddingMatrix],
                               input_length=MAX_SEQUENCE_LENGTH,
                               trainable=False)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(LSTM(LSTM_DIM, dropout=DROPOUT))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.RMSprop(lr=0.003),
                  metrics=['acc'])
    return model

def main():
    '''parser = argparse.ArgumentParser(description="Baseline Script for SemEval")
    parser.add_argument('-config', help='Config to read details', required=True)
    args = parser.parse_args()'''


if __name__ == '__main__':
    main()
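Note that the paste cuts off inside main(), which above is left with only the commented-out argparse block. For anyone trying to run this, here is a rough sketch of the training pipeline the helper functions imply (my reconstruction, not the author's exact code; it only covers training on train.txt and skips prediction and writing solutionPath):

def main():
    print("Processing training data...")
    trainIndices, trainTexts, labels = preprocessData(trainDataPath, mode="train")
    writeNormalisedData(trainDataPath, trainTexts)

    print("Extracting tokens...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(trainTexts)
    trainSequences = tokenizer.texts_to_sequences(trainTexts)

    data = pad_sequences(trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(labels), num_classes=NUM_CLASSES)

    embeddingMatrix = getEmbeddingMatrix(tokenizer.word_index)
    model = buildModel(embeddingMatrix)
    # An empty data array here (e.g. an empty train.txt) would trigger the
    # ProgbarLogger error discussed above.
    model.fit(data, labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)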