anuragmishracse / caption_generator

A modular library built on top of Keras and TensorFlow to generate a caption in natural language for any input image.

Low accuracy on MSCOCO #7

Open ReneeZD opened 7 years ago

ReneeZD commented 7 years ago

Hi, I'm following your code and trying to train the network on MSCOCO. Here is my code:

class Caption_Model:

def __init__(self, char_to_int, int_to_char, vocab_size=26688, max_caption_len=20, folder_path=path, epochs=10, batch_size=64):
    self.img_model = Sequential()
    self.text_model = Sequential()
    self.model = Sequential()
    self.vocab_size = vocab_size
    self.max_caption_len = max_caption_len
    self.folder_path = folder_path
    self.data = {}
    self.char_to_int = char_to_int
    self.int_to_char = int_to_char
    self.batch_size = batch_size
    self.epochs = epochs

def get_image_model(self):
    self.img_model.add(Dense(Embedding_dim,input_dim=4096,activation='relu'))
    self.img_model.add(RepeatVector(self.max_caption_len+1))
    # self.img_model.summary()
    return self.img_model

def get_text_model(self):
    self.text_model.add(Embedding(self.vocab_size,256,input_length=self.max_caption_len+1))
    self.text_model.add(LSTM(512,return_sequences=True))
    #self.text_model.add(Dropout(0.2))
    self.text_model.add(TimeDistributed(Dense(Embedding_dim,activation='relu')))
    # self.text_model.summary()
    return self.text_model

def get_caption_model(self,predict=False):
    self.get_image_model()
    self.get_text_model()
    self.model.add(Merge([self.img_model,self.text_model],mode='concat'))
    self.model.add(LSTM(1000,return_sequences=False))
    self.model.add(Dense(self.vocab_size))
    self.model.add(Activation('softmax'))
    print "Now model.model"
    sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.99, nesterov=True)
    rms = RMSprop(lr=0.005)
    if predict:
        return
    else:
        # weight='/home/paperspace/Document/DeepLearning/ImageCaption/code/Models/checkpoint/weights-improvement-02-5.2473.hdf5'
        # self.model.load_weights(weight)
        self.model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])

def load_data(self,set_type='train'):
    data={}
    with open(self.folder_path+set_type+'.processed_img.2.pkl') as f:
        data['imgs']=pickle.load(f)
    with open(os.path.join(self.folder_path,'all%spartial_sentences_0.pkl'%set_type)) as f:
        data['partial_sentences']=pickle.load(f)
    return data

def data_generator(self,set_type='train'):
    data=self.load_data(set_type)
    j=0
    temp=data['partial_sentences'].keys()
    partial_sentences,images=[],[]
    next_words=np.zeros((self.batch_size,self.vocab_size)).astype(float)
    count=0
    round_count=0
    while True:
        round_count+=1
        random.shuffle(temp)
        print "the %d round!" %round_count
        for key in temp:
            image=data['imgs'][key]
            for sen in data['partial_sentences'][key]:
                for k in range(len(sen)):
                    count+=1
                    partial=sen[:k+1]
                    partial_sentences.append(partial)
                    images.append(image)
                    # print "index is: ",count-1
                    if k==len(sen)-1:
                        next_words[count-1][self.char_to_int['<end>']]=1
                    else:
                        next_words[count-1][sen[k+1]]=1
                    if count>=self.batch_size:
                        partial_sentences=sequence.pad_sequences(partial_sentences, maxlen=self.max_caption_len+1, padding='post')
                        partial_sentences=np.asarray(partial_sentences)
                        images=np.asarray(images)
                        # partial_sentences=partial_sentences/float(self.vocab_size)
                        # print partial_sentences
                        count=0
                        yield [images,partial_sentences],next_words
                        partial_sentences,images=[],[]
                        next_words=np.zeros((self.batch_size,self.vocab_size)).astype(float)

        j+=1

def train(self):
    self.get_caption_model()
    filepath="Models/checkpoint/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]

    self.model.fit_generator(self.data_generator('train'),steps_per_epoch=step_size/self.batch_size,epochs=self.epochs,validation_data=self.data_generator('val'),validation_steps=v_step_size/self.batch_size,callbacks=callbacks_list)
    # self.model.fit_generator(self.data_generator('train'),steps_per_epoch=step_size/self.batch_size,epochs=self.epochs,callbacks=callbacks_list)

    try:
        self.model.save('Models/WholeModel.h5',overwrite=True)
        self.model.save_weights('Models/Weights.h5',overwrite=True)
    except:
        print "Error in saving model."
    print "After training model...\n"

Accuracy stays at about 35% in the end and the training loss is about 3.xxx. I just cannot figure out what's wrong with the code. Could you please offer some help? Thank you so much!
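For reference, here is a minimal sketch of the same merge architecture rewritten with the Keras functional API, since the `Merge` container used above was removed in later Keras releases and `Concatenate` is its closest replacement. `Embedding_dim` is not defined in my snippet, so 128 is assumed here; the other sizes are copied from the code above.

from tensorflow.keras.layers import (Input, Dense, RepeatVector, Embedding,
                                     LSTM, TimeDistributed, Concatenate)
from tensorflow.keras.models import Model

EMBEDDING_DIM = 128      # assumed; Embedding_dim is not defined in the snippet above
VOCAB_SIZE = 26688
MAX_CAPTION_LEN = 20

# Image branch: 4096-d CNN feature, projected and repeated for every time step.
img_input = Input(shape=(4096,))
img_feat = Dense(EMBEDDING_DIM, activation='relu')(img_input)
img_feat = RepeatVector(MAX_CAPTION_LEN + 1)(img_feat)

# Text branch: partial caption -> embedding -> LSTM -> per-step projection.
txt_input = Input(shape=(MAX_CAPTION_LEN + 1,))
txt_feat = Embedding(VOCAB_SIZE, 256)(txt_input)
txt_feat = LSTM(512, return_sequences=True)(txt_feat)
txt_feat = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(txt_feat)

# Concatenate the two sequences along the feature axis (what Merge(mode='concat')
# did) and predict the next word from the merged sequence.
merged = Concatenate()([img_feat, txt_feat])
merged = LSTM(1000, return_sequences=False)(merged)
output = Dense(VOCAB_SIZE, activation='softmax')(merged)

model = Model(inputs=[img_input, txt_input], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

The layer sizes and hyperparameters are unchanged from the class above; only the deprecated `Merge` layer is swapped for `Concatenate`.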

MikhailovSergei commented 6 years ago

Hi, did you manage to get around this problem?

ShixiangWan commented 6 years ago

@ReneeZD @MikhailovSergei Hi, did you fix this problem? I've tried many ways but still cannot get proper captions. :(

ajay9022 commented 5 years ago

@ReneeZD Did you get the required results? If yes, then how?