Hi, I am facing a problem when I execute this model: at the last step I don't get any CTC value or any predicted output. Any help please?
This is the result:
original_text = Enuresis
predicted text = []
and this is my code:
import os
import string
import cv2
import numpy as np
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Conv2D, MaxPool2D, BatchNormalization, Lambda, Bidirectional, LSTM, Dense
from keras.models import Model
from keras.callbacks import ModelCheckpoint

char_list = string.ascii_letters + string.digits
print("Character List: ", char_list)
# function to encode each character of a word into its index in char_list
def encode_to_labels(text):
    # encode each output word into a list of digits
    digit_list = []
    for index, character in enumerate(text):
        try:
            digit_list.append(char_list.index(character))
        except ValueError:
            print("Error in finding index for character ", character)
    return digit_list

# quick sanity check of the encoding
a = encode_to_labels('hola')
print(a)

# preprocess the data: read the images from the IAM dataset
n_samples = len(os.listdir('/home/yosra/Desktop/imagetest'))
# number of samples in the xml file (dic is built earlier from the IAM annotations, not shown here)
xml_samples = len(dic)

# lists for the training set
training_img = []
training_txt = []
train_input_length = []
train_label_length = []
orig_txt = []

# lists for the validation set
valid_img = []
valid_txt = []
valid_input_length = []
valid_label_length = []
valid_orig_txt = []

max_label_len = 0

# training variables
k = 1
for i, pic in enumerate(os.listdir('/home/yosra/Desktop/imagetest')):
    # read the image as grayscale
    img = cv2.imread(os.path.join('/home/yosra/Desktop/imagetest', pic), cv2.IMREAD_GRAYSCALE)
    # file name without the extension; the label is the part after the underscore
    pic_target = pic[:-4]
    # bring each image to shape (32, 128, 1); cv2 returns (height, width)
    h, w = img.shape
    if w > 128 or h > 32:
        continue
    # pad the images with white pixels to scale them to 32 x 128
    if h < 32:
        add_zeros = np.ones((32 - h, w)) * 255
        img = np.concatenate((img, add_zeros))
    if w < 128:
        add_zeros = np.ones((32, 128 - w)) * 255
        img = np.concatenate((img, add_zeros), axis=1)
    img = np.expand_dims(img, axis=2)
    # normalise the image
    img = img / 255.
    # get the text for the image from the file name
    txt = pic_target.split('_')[1]
    # keep track of the maximum label length
    if len(txt) > max_label_len:
        max_label_len = len(txt)
    # send every 10th sample to the validation set
    if k % 10 == 0:
        valid_orig_txt.append(txt)
        valid_label_length.append(len(txt))
        valid_input_length.append(31)   # 31 time steps produced by the CNN (see model below)
        valid_img.append(img)
        valid_txt.append(encode_to_labels(txt))
    else:
        orig_txt.append(txt)
        train_label_length.append(len(txt))
        train_input_length.append(31)
        training_img.append(img)
        training_txt.append(encode_to_labels(txt))
    k += 1
# pad each output label to the maximum text length
train_padded_txt = pad_sequences(training_txt, maxlen=max_label_len, padding='post', value = len(char_list))
valid_padded_txt = pad_sequences(valid_txt, maxlen=max_label_len, padding='post', value = len(char_list))
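To make the encoding and padding concrete, here is a small check one could run at this point (it assumes char_list, encode_to_labels and max_label_len from above; 'Enuresis' is the word from my output):

sample = pad_sequences([encode_to_labels('Enuresis')], maxlen=max_label_len,
                       padding='post', value=len(char_list))
print(sample)  # character indices, padded at the end with the blank index len(char_list)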
# input with shape of height=32 and width=128
inputs = Input(shape=(32, 128, 1))

# convolution layer with kernel size (3,3)
conv_1 = Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
# pooling layer with kernel size (2,2)
pool_1 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_1)

conv_2 = Conv2D(128, (3, 3), activation='relu', padding='same')(pool_1)
pool_2 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_2)

conv_3 = Conv2D(256, (3, 3), activation='relu', padding='same')(pool_2)
conv_4 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv_3)
# pooling layer with kernel size (2,1)
pool_4 = MaxPool2D(pool_size=(2, 1))(conv_4)

conv_5 = Conv2D(512, (3, 3), activation='relu', padding='same')(pool_4)
# batch normalization layer
batch_norm_5 = BatchNormalization()(conv_5)

conv_6 = Conv2D(512, (3, 3), activation='relu', padding='same')(batch_norm_5)
batch_norm_6 = BatchNormalization()(conv_6)
pool_6 = MaxPool2D(pool_size=(2, 1))(batch_norm_6)

conv_7 = Conv2D(512, (2, 2), activation='relu')(pool_6)
# drop the height dimension (now 1) to get a sequence of 31 feature vectors
squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv_7)

# bidirectional LSTM layers with units=128
blstm_1 = Bidirectional(LSTM(128, return_sequences=True, dropout=0.2))(squeezed)
blstm_2 = Bidirectional(LSTM(128, return_sequences=True, dropout=0.2))(blstm_1)

# softmax over len(char_list)+1 classes (the extra class is the CTC blank)
outputs = Dense(len(char_list) + 1, activation='softmax')(blstm_2)

# model to be used at test time
act_model = Model(inputs, outputs)
act_model.summary()
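As a sanity check on the 31 used for input_length above, one could print the model's output shape here (assuming the architecture exactly as defined):

# expected output shape: (None, 31, len(char_list) + 1) = (None, 31, 63)
print(act_model.output_shape)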
# the CTC loss function is used to train the network to predict the output text;
# it is very helpful for text recognition
labels = Input(name='the_labels', shape=[max_label_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([outputs, labels, input_length, label_length])
# model to be used at training time
model = Model(inputs=[inputs, labels, input_length, label_length], outputs=loss_out)
# the Lambda layer above already computes the CTC loss, so the loss function here just passes it through
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='adam')
filepath = "/home/yosra/Downloads/best_model.hdf5"
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='auto')
callbacks_list = [checkpoint]
# train the model
training_img = np.array(training_img)
train_input_length = np.array(train_input_length)
train_label_length = np.array(train_label_length)

valid_img = np.array(valid_img)
valid_input_length = np.array(valid_input_length)
valid_label_length = np.array(valid_label_length)

batch_size = 256
epochs = 10
model.fit(x=[training_img, train_padded_txt, train_input_length, train_label_length],
          y=np.zeros(len(training_img)),
          batch_size=batch_size, epochs=epochs,
          validation_data=([valid_img, valid_padded_txt, valid_input_length, valid_label_length],
                           [np.zeros(len(valid_img))]),
          verbose=1, callbacks=callbacks_list)
# note: this overwrites the best checkpoint saved by ModelCheckpoint above with the final weights
act_model.save(filepath)
# test the model
from keras.models import load_model
# load the saved best model weights
act_model.load_weights(filepath)
# predict outputs on the first 10 validation images
image = valid_img[:10]
prediction = act_model.predict(image)
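Since the decoded output comes back empty, one quick check (a sketch, assuming prediction from the line above) is whether the network ever predicts anything other than the CTC blank class at any time step:

blank_index = len(char_list)       # the last class is the CTC blank
raw = prediction.argmax(axis=-1)   # most likely class at each of the 31 time steps
print(raw[0])                      # if every entry equals blank_index, greedy decoding returns []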
# use the CTC greedy decoder
out = K.get_value(K.ctc_decode(prediction, input_length=np.ones(prediction.shape[0]) * prediction.shape[1],
                               greedy=True)[0][0])
# see the results
i = 0
for x in out:
    print("original_text = ", valid_orig_txt[i])
    print("predicted text = ", end='')
    print(x)
    for p in x:
        if int(p) != -1:
            print(char_list[int(p)], end='')
    print('\n')
    i += 1