TheAILearner / A-CRNN-model-for-Text-Recognition-in-Keras

64 stars 33 forks source link

Out value is empty #8

Open yosraammar opened 4 years ago

yosraammar commented 4 years ago

Hi, I am facing a problem when I run this model: in the last step I don't get any CTC output and I don't get any predicted text. Any help, please? This is the result: original_text = Enuresis, predicted text = []. And this is my code:

char_list = string.ascii_letters + string.digits
print("Character List: ", char_list)

Function to encode the text into indices of the character list

def encode_to_labels(text):
    """Encode a word as a list of indices into the module-level ``char_list``.

    Args:
        text: The string to encode.

    Returns:
        A list with one integer per character of ``text`` that is present in
        ``char_list``. Characters that are not present are reported on stdout
        and left out of the result, so the list can be shorter than ``text``.
    """
    digit_list = []
    for character in text:
        try:
            digit_list.append(char_list.index(character))
        except ValueError:
            # str.index raises ValueError for a character outside char_list;
            # report it and keep encoding the rest (best effort).
            print("Error in finding index for character ", character)
    return digit_list

# Quick sanity check of the encoder on a sample word.
a = encode_to_labels('hola')
print(a)

preprocess the data

read the image from IAM Dataset

# Number of entries in the image directory.
n_samples = len(os.listdir('/home/yosra/Desktop/imagetest'))

# Number of samples in the XML file.
# NOTE(review): `dic` is not defined in the visible part of the script —
# confirm it is built before this point.
xml_samples = len(dic)

# Lists for the training set.
training_img = []
training_txt = []
train_input_length = []
train_label_length = []
orig_txt = []

# Lists for the validation set.
valid_img = []
valid_txt = []
valid_input_length = []
valid_label_length = []
valid_orig_txt = []

# Length of the longest label seen so far.
max_label_len = 0

# Running counter of accepted images (starts at 1).
k = 1

# Load every image, pad it to 32x128, and route it (together with its label)
# to the training or validation lists.
image_dir = '/home/yosra/Desktop/imagetest'
for pic in os.listdir(image_dir):
    # Read image as grayscale
    img = cv2.imread(os.path.join(image_dir, pic), cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # decoded (e.g. a non-image file sitting in the directory).
        print("Skipping unreadable image ", pic)
        continue

    # A grayscale image has shape (rows, columns), i.e. (height, width).
    height, width = img.shape

    # Images larger than the 32x128 network input are skipped, not resized.
    if width > 128 or height > 32:
        continue

    # The label is the second '_'-separated field of the file name, with the
    # extension removed (splitext also copes with extensions such as .jpeg).
    pic_target = os.path.splitext(pic)[0]
    name_fields = pic_target.split('_')
    if len(name_fields) < 2:
        print("Skipping file without a '_<text>' label ", pic)
        continue
    txt = name_fields[1]

    # Pad at the bottom and on the right with white (255) up to 32x128,
    # then add the channel axis: shape becomes (32, 128, 1).
    img = np.pad(img, ((0, 32 - height), (0, 128 - width)),
                 mode='constant', constant_values=255)
    img = np.expand_dims(img, axis=2)

    # Normalise the image
    img = img / 255.

    # compute maximum length of the text
    max_label_len = max(max_label_len, len(txt))

    # encode_to_labels drops characters that are not in char_list, so the
    # label length is taken from the encoded sequence to keep both in sync.
    encoded_txt = encode_to_labels(txt)

    # Every 10th accepted image goes to the validation set.
    if k % 10 == 0:
        valid_orig_txt.append(txt)
        valid_label_length.append(len(encoded_txt))
        # 31 is presumably the number of time steps the CRNN emits for a
        # 128-pixel-wide input — confirm against the model summary.
        valid_input_length.append(31)
        valid_img.append(img)
        valid_txt.append(encoded_txt)
    else:
        orig_txt.append(txt)
        train_label_length.append(len(encoded_txt))
        train_input_length.append(31)
        training_img.append(img)
        training_txt.append(encoded_txt)
    k += 1

pad each output label to maximum text length

# Labels are padded at the end ('post') with len(char_list), an index that no
# real character uses.
train_padded_txt = pad_sequences(training_txt, maxlen=max_label_len, padding='post', value = len(char_list))
valid_padded_txt = pad_sequences(valid_txt, maxlen=max_label_len, padding='post', value = len(char_list))

input with shape of height=32 and width=128

input with shape of height=32 and width=128

# Input image: height 32, width 128, one (grayscale) channel.
inputs = Input(shape=(32,128,1))

# convolution layer with kernel size (3,3)
conv_1 = Conv2D(64, (3,3), activation = 'relu', padding='same')(inputs)
# pooling layer with kernel size (2,2)
pool_1 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_1)

conv_2 = Conv2D(128, (3,3), activation = 'relu', padding='same')(pool_1)
pool_2 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_2)

conv_3 = Conv2D(256, (3,3), activation = 'relu', padding='same')(pool_2)

conv_4 = Conv2D(256, (3,3), activation = 'relu', padding='same')(conv_3)
# pooling layer with kernel size (2,1): halves the height only, so the
# width (the time axis for the LSTMs) is preserved.
pool_4 = MaxPool2D(pool_size=(2, 1))(conv_4)

conv_5 = Conv2D(512, (3,3), activation = 'relu', padding='same')(pool_4)
# Batch normalization layer
batch_norm_5 = BatchNormalization()(conv_5)

conv_6 = Conv2D(512, (3,3), activation = 'relu', padding='same')(batch_norm_5)
batch_norm_6 = BatchNormalization()(conv_6)
pool_6 = MaxPool2D(pool_size=(2, 1))(batch_norm_6)

# No padding here: with a (2,2) kernel the remaining height of 2 collapses
# to 1 and the width shrinks from 32 to 31 (assuming channels_last).
conv_7 = Conv2D(512, (2,2), activation = 'relu')(pool_6)

# Drop the size-1 height axis so the LSTMs receive a (time, features) sequence.
squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv_7)

# bidirectional LSTM layers with units=128
blstm_1 = Bidirectional(LSTM(128, return_sequences=True, dropout = 0.2))(squeezed)
blstm_2 = Bidirectional(LSTM(128, return_sequences=True, dropout = 0.2))(blstm_1)

# One class per character plus one extra class (the CTC blank).
outputs = Dense(len(char_list)+1, activation = 'softmax')(blstm_2)

# model to be used at test time
act_model = Model(inputs, outputs)

act_model.summary()

The CTC loss function is used to train the model to predict the output text; it is very helpful for text recognition.

# Extra inputs that are only needed to compute the CTC loss during training.
labels = Input(name='the_labels', shape=[max_label_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

def ctc_lambda_func(args):
    """Compute the CTC batch cost; meant to be wrapped in a Lambda layer.

    Args:
        args: A sequence ``(y_pred, labels, input_length, label_length)``.

    Returns:
        The result of ``K.ctc_batch_cost(labels, y_pred, input_length,
        label_length)``.
    """
    y_pred, labels, input_length, label_length = args
    # Note the argument order: ctc_batch_cost takes the labels first.
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

# Wrap the CTC cost in a Lambda layer so that the loss is computed inside
# the graph from the network output and the three auxiliary inputs.
ctc_layer = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')
loss_out = ctc_layer([outputs, labels, input_length, label_length])

# model to be used at training time
training_inputs = [inputs, labels, input_length, label_length]
model = Model(inputs=training_inputs, outputs=loss_out)

# The 'ctc' layer already outputs the loss value, so the compiled loss
# simply passes y_pred through and ignores y_true.
model.compile(
    loss={'ctc': lambda y_true, y_pred: y_pred},
    optimizer='adam',
)

# Keep only the model with the best validation loss on disk.
filepath = "/home/yosra/Downloads/best_model.hdf5"
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
callbacks_list = [checkpoint]

# train the model
# Convert the Python lists to arrays before passing them to fit().
training_img = np.array(training_img)
train_input_length = np.array(train_input_length)
train_label_length = np.array(train_label_length)

valid_img = np.array(valid_img)
valid_input_length = np.array(valid_input_length)
valid_label_length = np.array(valid_label_length)

batch_size = 256
epochs = 10

# The targets are dummy zeros: the model's output already is the CTC loss.
model.fit(x=[training_img, train_padded_txt, train_input_length, train_label_length], y=np.zeros(len(training_img)), batch_size=batch_size, epochs = epochs, validation_data = ([valid_img, valid_padded_txt, valid_input_length, valid_label_length], [np.zeros(len(valid_img))]), verbose = 1, callbacks = callbacks_list)

# NOTE(review): this writes act_model to the same path ModelCheckpoint uses,
# replacing the best-val_loss checkpoint with the final-epoch weights —
# confirm this is intended, or save to a different file.
act_model.save(filepath)

test the model

from keras.models import load_model

load the saved best model weights

load the saved best model weights

# Restore the weights stored at `filepath` into the inference model.
act_model.load_weights(filepath)

# Run the network on the first ten validation images.
image = valid_img[:10]
prediction = act_model.predict(image)

# Greedy CTC decoding: every sample is decoded over the full number of
# time steps produced by the network.
batch_count = prediction.shape[0]
time_steps = prediction.shape[1]
sequence_lengths = np.ones(batch_count) * time_steps
decode_result = K.ctc_decode(prediction, input_length=sequence_lengths, greedy=True)
out = K.get_value(decode_result[0][0])

see the results

# Print the ground-truth text next to the decoded prediction of each sample.
for original, decoded in zip(valid_orig_txt, out):
    print("original_text = ", original)
    print("predicted text = ", end = '')
    # Raw decoded indices.
    # NOTE(review): an empty row here presumably means the decoder emitted
    # only blanks for that sample — confirm against the training loss.
    print(decoded)
    # Entries equal to -1 are skipped (presumably the decoder's padding
    # value — confirm in the Keras backend docs); the rest index char_list.
    predicted = ''.join(char_list[int(p)] for p in decoded if int(p) != -1)
    print(predicted, end = '')
    print('\n')

febefebrita commented 4 years ago

how many images did u use for training?