pierluigiferrari / ssd_keras

A Keras port of Single Shot MultiBox Detector
Apache License 2.0

Bounding box predictions are concentrated in the top-left corner #382

Open toschilt opened 3 years ago

toschilt commented 3 years ago

Hi! I have a problem with the SSD300 implementation. I'm using a dataset of 1000 images, 750 of them for training and 250 for validation. My dataset has only 1 positive class.

My training code is the following:

```python
from math import ceil
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, CSVLogger, LearningRateScheduler, TerminateOnNaN

from models.keras_ssd300 import ssd_300
from keras_loss_function.keras_ssd_loss import SSDLoss
from ssd_encoder_decoder.ssd_input_encoder import SSDInputEncoder
from data_generator.object_detection_2d_data_generator import DataGenerator
from data_generator.object_detection_2d_geometric_ops import Resize
from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels
from data_generator.data_augmentation_chain_original_ssd import SSDDataAugmentation

img_height = 300
img_width = 300
img_channels = 3
mean_color = [123, 117, 104]
swap_channels = [2, 1, 0]
n_classes = 1
scales_pascal = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]
scales_coco = [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05]
scales = scales_pascal
aspect_ratios = [[1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5]]
two_boxes_for_ar1 = True
steps = [8, 16, 32, 64, 100, 300]
offsets = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
clip_boxes = False
variances = [0.1, 0.1, 0.2, 0.2]
normalize_coords = True

K.clear_session()

model = ssd_300(image_size=(img_height, img_width, img_channels),
                n_classes=n_classes, mode='training', l2_regularization=0.0005,
                scales=scales, aspect_ratios_per_layer=aspect_ratios,
                two_boxes_for_ar1=two_boxes_for_ar1, steps=steps, offsets=offsets,
                clip_boxes=clip_boxes, variances=variances,
                normalize_coords=normalize_coords, subtract_mean=mean_color,
                swap_channels=swap_channels)

weights_path = 'VGG_weights/VGG_ILSVRC_16_layers_fc_reduced.h5'
model.load_weights(weights_path, by_name=True)

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)
model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

train_dataset = DataGenerator(load_images_into_memory=True, hdf5_dataset_path=None)
val_dataset = DataGenerator(load_images_into_memory=True, hdf5_dataset_path=None)

myDataSet_train_images_dir = 'myDatasets/Training/JPEGImages/'
myDataSet_train_annotations_dir = 'myDatasets/Training/Annotations/'
myDataSet_trainval_image_set_filename = 'myDatasets/Training/ImageSets/Main/default.txt'

myDataSet_test_images_dir = 'myDatasets/Testing/JPEGImages/'
myDataSet_test_annotations_dir = 'myDatasets/Testing/Annotations/'
myDataSet_test_image_set_filename = 'myDatasets/Testing/ImageSets/Main/default.txt'

classes = ['background', 'Plant']

train_dataset.parse_xml(images_dirs=[myDataSet_train_images_dir],
                        image_set_filenames=[myDataSet_trainval_image_set_filename],
                        annotations_dirs=[myDataSet_train_annotations_dir],
                        classes=classes, include_classes='all',
                        exclude_truncated=False, exclude_difficult=False, ret=False)

val_dataset.parse_xml(images_dirs=[myDataSet_test_images_dir],
                      image_set_filenames=[myDataSet_test_image_set_filename],
                      annotations_dirs=[myDataSet_test_annotations_dir],
                      classes=classes, include_classes='all',
                      exclude_truncated=False, exclude_difficult=True, ret=False)

batch_size = 5

ssd_data_augmentation = SSDDataAugmentation(img_height=img_height,
                                            img_width=img_width,
                                            background=mean_color)

convert_to_3_channels = ConvertTo3Channels()
resize = Resize(height=img_height, width=img_width)

predictor_sizes = [model.get_layer('conv4_3_norm_mbox_conf').output_shape[1:3],
                   model.get_layer('fc7_mbox_conf').output_shape[1:3],
                   model.get_layer('conv6_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv7_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv8_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv9_2_mbox_conf').output_shape[1:3]]

ssd_input_encoder = SSDInputEncoder(img_height=img_height, img_width=img_width,
                                    n_classes=n_classes, predictor_sizes=predictor_sizes,
                                    scales=scales, aspect_ratios_per_layer=aspect_ratios,
                                    two_boxes_for_ar1=two_boxes_for_ar1, steps=steps,
                                    offsets=offsets, clip_boxes=clip_boxes,
                                    variances=variances, matching_type='multi',
                                    pos_iou_threshold=0.5, neg_iou_limit=0.5,
                                    normalize_coords=normalize_coords)

train_generator = train_dataset.generate(batch_size=batch_size, shuffle=True,
                                         transformations=[ssd_data_augmentation],
                                         label_encoder=ssd_input_encoder,
                                         returns={'processed_images', 'encoded_labels'},
                                         keep_images_without_gt=False)

val_generator = val_dataset.generate(batch_size=batch_size, shuffle=False,
                                     transformations=[convert_to_3_channels, resize],
                                     label_encoder=ssd_input_encoder,
                                     returns={'processed_images', 'encoded_labels'},
                                     keep_images_without_gt=False)

train_dataset_size = train_dataset.get_dataset_size()
val_dataset_size = val_dataset.get_dataset_size()

print("Number of images in the training dataset:\t{:>6}".format(train_dataset_size))
print("Number of images in the validation dataset:\t{:>6}".format(val_dataset_size))

def lr_schedule(epoch):
    if epoch < 300:
        return 0.0001
    elif epoch < 450:
        return 0.00001
    else:
        return 0.000001

model_checkpoint = ModelCheckpoint(filepath='ssd300_pascal_07+12_epoch-{epoch:02d}_loss-{loss:.4f}_val_loss-{val_loss:.4f}.h5',
                                   monitor='val_loss', verbose=1, save_best_only=True,
                                   save_weights_only=False, mode='auto', period=1)

csv_logger = CSVLogger(filename='ssd300_pascal_07+12_training_log.csv', separator=',', append=True)

learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule, verbose=1)

terminate_on_nan = TerminateOnNaN()

callbacks = [model_checkpoint, csv_logger, learning_rate_scheduler, terminate_on_nan]

initial_epoch = 0
final_epoch = 1000
steps_per_epoch = 1000

history = model.fit_generator(generator=train_generator,
                              steps_per_epoch=steps_per_epoch,
                              epochs=final_epoch,
                              callbacks=callbacks,
                              validation_data=val_generator,
                              validation_steps=ceil(val_dataset_size/batch_size),
                              initial_epoch=initial_epoch)
```


The inference code is the following:

```python
import sys
import numpy as np
import matplotlib.pyplot as plt
from imageio import imread
from keras import backend as K
from keras.models import load_model
from keras.preprocessing import image

from keras_loss_function.keras_ssd_loss import SSDLoss
from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes
from keras_layers.keras_layer_L2Normalization import L2Normalization
from keras_layers.keras_layer_DecodeDetections import DecodeDetections

img_height = 300
img_width = 300

model_path = 'ssd300_pascal_07+12_epoch-180_loss-3.5966_val_loss-3.3306.h5'
ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)

K.clear_session()

model = load_model(model_path, custom_objects={'AnchorBoxes': AnchorBoxes,
                                               'L2Normalization': L2Normalization,
                                               'DecodeDetections': DecodeDetections,
                                               'compute_loss': ssd_loss.compute_loss})

orig_images = []   # the original, full-size images
input_images = []  # the resized 300x300 network inputs

img_path = 'myDatasets/Testing/JPEGImages/scene00371.png'

orig_images.append(imread(img_path))
img = image.load_img(img_path, target_size=(img_height, img_width))
img = image.img_to_array(img)
input_images.append(img)
input_images = np.array(input_images)

y_pred = model.predict(input_images)

confidence_threshold = 0.25

y_pred_thresh = [y_pred[k][y_pred[k, :, 1] > confidence_threshold] for k in range(y_pred.shape[0])]

np.set_printoptions(precision=2, suppress=True, linewidth=90, threshold=sys.maxsize)
print("Predicted boxes:\n")
print('class   conf xmin   ymin   xmax   ymax')
print(y_pred_thresh[0])

colors = plt.cm.hsv(np.linspace(0, 1, 2)).tolist()
classes = ['background', 'Plant']

plt.figure(figsize=(20, 12))
plt.imshow(orig_images[0])

current_axis = plt.gca()

for box in y_pred_thresh[0]:
    xmin = box[2] * orig_images[0].shape[1] / img_width
    ymin = box[3] * orig_images[0].shape[0] / img_height
    xmax = box[4] * orig_images[0].shape[1] / img_width
    ymax = box[5] * orig_images[0].shape[0] / img_height
    color = colors[round(box[0])]
    label = '{}: {:.2f}'.format(classes[round(box[0])], box[1])
    current_axis.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                         color=color, fill=False, linewidth=2))
    current_axis.text(xmin, ymin, label, size='x-large', color='white',
                      bbox={'facecolor': color, 'alpha': 1.0})

plt.show()
```


That's the output from the network (attached image "Figure_1", showing all predicted boxes crowded into the top-left corner of the image).

An example from the image annotation XML file:

```xml
<annotation>
    <folder/>
    <filename>scene00058.png</filename>
    <source>
        <database>Unknown</database>
        <annotation>Unknown</annotation>
        <image>Unknown</image>
    </source>
    <size>
        <width>1280</width>
        <height>720</height>
        <depth/>
    </size>
    <segmented>0</segmented>
    <object>
        <name>Plant</name>
        <occluded>0</occluded>
        <bndbox>
            <xmin>33.02</xmin>
            <ymin>13.54</ymin>
            <xmax>105.77000000000001</xmax>
            <ymax>610.24</ymax>
        </bndbox>
        <attributes>
            <attribute>
                <name>track_id</name>
                <value>1</value>
            </attribute>
            <attribute>
                <name>keyframe</name>
                <value>True</value>
            </attribute>
        </attributes>
    </object>
    <object>
        <name>Plant</name>
        <occluded>0</occluded>
        <bndbox>
            <xmin>189.63</xmin>
            <ymin>10.2</ymin>
            <xmax>262.43</xmax>
            <ymax>545.78</ymax>
        </bndbox>
        <attributes>
            <attribute>
                <name>track_id</name>
                <value>2</value>
            </attribute>
            <attribute>
                <name>keyframe</name>
                <value>True</value>
            </attribute>
        </attributes>
    </object>
    <object>
```
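
As a sanity check on these annotations, here is a minimal sketch (standard library only; the file path is an assumption based on the training config above) that verifies every box lies inside the declared image size:

```python
import xml.etree.ElementTree as ET

# Parse the annotation shown above (path assumed from the training config).
tree = ET.parse('myDatasets/Training/Annotations/scene00058.xml')
root = tree.getroot()

# Image size declared in the annotation.
width = float(root.find('size/width').text)    # 1280
height = float(root.find('size/height').text)  # 720

# Every box should fit inside the declared image size.
for obj in root.iter('object'):
    name = obj.find('name').text
    bndbox = obj.find('bndbox')
    xmin, ymin, xmax, ymax = (float(bndbox.find(tag).text)
                              for tag in ('xmin', 'ymin', 'xmax', 'ymax'))
    assert 0 <= xmin < xmax <= width and 0 <= ymin < ymax <= height, name
    print(name, xmin, ymin, xmax, ymax)
```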

I really don't know why this is not working; I only made parameter-tuning changes to the original code.

mtchibozo commented 3 years ago

I think this is related to a difference between training vs. inference modes.

The predictions are correct, but they are scaled between 0 and 1 on both the x and y axes, while the image has kept its original shape. As a result, all the predictions end up close to the point (0, 0), which is the top-left corner of the image. You should make sure the predictions and the image are on the same scale.
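
For example, here is a minimal sketch of that rescaling, using a hypothetical decoded box laid out as (class, conf, xmin, ymin, xmax, ymax) with coordinates normalized to [0, 1]:

```python
import numpy as np

# Hypothetical decoded box: (class, conf, xmin, ymin, xmax, ymax),
# with coordinates normalized to the [0, 1] range.
box = np.array([1.0, 0.87, 0.10, 0.05, 0.35, 0.90])

# Original image size (matching the annotation XML above).
orig_height, orig_width = 720, 1280

# Multiply the normalized coordinates by the original image size before
# plotting, so the boxes and the image share the same scale.
xmin = box[2] * orig_width   # 128.0
ymin = box[3] * orig_height  # 36.0
xmax = box[4] * orig_width   # 448.0
ymax = box[5] * orig_height  # 648.0
```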

As a workaround, you can decode the predictions from y_pred with decode_detections instead of doing it manually. Make sure to import decode_detections first:

```python
from ssd_encoder_decoder.ssd_output_decoder import decode_detections
```

Here's how you can do this:

```python
y_pred = model.predict(input_images)

confidence_threshold = 0.25

# decode_detections converts the raw training-mode predictions into
# (class, conf, xmin, ymin, xmax, ymax) rows in original-image pixels.
y_pred_decoded = decode_detections(y_pred,
                                   confidence_thresh=confidence_threshold,
                                   iou_threshold=0.5,
                                   top_k=1000,
                                   normalize_coords=True,
                                   img_height=orig_images[0].shape[0],
                                   img_width=orig_images[0].shape[1])

np.set_printoptions(precision=2, suppress=True, linewidth=90, threshold=sys.maxsize)
print("Predicted boxes:\n")
print('class   conf xmin   ymin   xmax   ymax')
print(y_pred_decoded[0])

colors = plt.cm.hsv(np.linspace(0, 1, 2)).tolist()
classes = ['background', 'Plant']

plt.figure(figsize=(20, 12))
plt.imshow(orig_images[0])

current_axis = plt.gca()

for box in y_pred_decoded[0]:
    # The decoded coordinates are already in original-image pixels,
    # so no manual rescaling is needed here.
    xmin, ymin, xmax, ymax = box[2], box[3], box[4], box[5]
    color = colors[round(box[0])]
    label = '{}: {:.2f}'.format(classes[round(box[0])], box[1])
    current_axis.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                         color=color, fill=False, linewidth=2))
    current_axis.text(xmin, ymin, label, size='x-large', color='white',
                      bbox={'facecolor': color, 'alpha': 1.0})

plt.show()
```
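
If you prefer not to decode manually at all, here is a sketch of the longer-term fix (untested on your data, and assuming the same anchor configuration as your training script): rebuild the network with mode='inference', which appends the DecodeDetections layer to the graph, then load your trained weights into it. model.predict then returns already-decoded boxes:

```python
from keras import backend as K
from models.keras_ssd300 import ssd_300

K.clear_session()

# Same architecture and anchor configuration as the training script above,
# but mode='inference' appends a DecodeDetections layer to the model.
model = ssd_300(image_size=(300, 300, 3),
                n_classes=1,
                mode='inference',
                scales=[0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05],
                aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                         [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                         [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                                         [1.0, 2.0, 0.5],
                                         [1.0, 2.0, 0.5]],
                two_boxes_for_ar1=True,
                steps=[8, 16, 32, 64, 100, 300],
                offsets=[0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
                clip_boxes=False,
                variances=[0.1, 0.1, 0.2, 0.2],
                normalize_coords=True,
                subtract_mean=[123, 117, 104],
                swap_channels=[2, 1, 0],
                confidence_thresh=0.25,
                iou_threshold=0.45,
                top_k=200,
                nms_max_output_size=400)

# Load the trained weights; the layer names match the training-mode model.
model.load_weights('ssd300_pascal_07+12_epoch-180_loss-3.5966_val_loss-3.3306.h5',
                   by_name=True)

# Rows of y_pred[0]: (class_id, confidence, xmin, ymin, xmax, ymax) in pixels
# of the 300x300 network input; scale them to the original image for plotting.
y_pred = model.predict(input_images)
```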