TannerGilbert / Tensorflow-Object-Detection-API-Train-Model

Train a object detection model with the Tensorflow Object Detection API and Tensorflow 2.
https://gilberttanner.com/blog/creating-your-own-objectdetector
MIT License
192 stars 103 forks source link

extract the images inside the bounding boxes #23

Closed shivprasad94 closed 3 years ago

shivprasad94 commented 3 years ago

I am still confused about extracting Bounding boxes as a separate image, where do I exactly need to use your code from commonly asked questions can you please elaborate?

Below are the function I am using and testing it on test images folder.

def load_custom_model(model_name):
  model_file = model_name
  model_dir = pathlib.Path(model_file)/"saved_model"
  model = tf.saved_model.load(str(model_dir))
  return model

model_name = 'exported-models/my_model'
detection_model = load_custom_model(model_name)

PATH_TO_LABELS` = 'training/label_map.pbtxt'
category_index = label_map_util.create_category_index_from_labelmap(PATH_TO_LABELS, use_display_name=True)
PATH_TO_TEST_IMAGES_DIR = pathlib.Path('test_images')
TEST_IMAGE_PATHS = sorted(list(PATH_TO_TEST_IMAGES_DIR.glob("*.jpg")))

def run_inference_for_single_image(model, image):
  image = np.asarray(image)
  # The input needs to be a tensor, convert it using `tf.convert_to_tensor`.
  input_tensor = tf.convert_to_tensor(image)
  # The model expects a batch of images, so add an axis with `tf.newaxis`.
  input_tensor = input_tensor[tf.newaxis,...]

  # Run inference
  model_fn = model.signatures['serving_default']
  output_dict = model_fn(input_tensor)

  # All outputs are batches tensors.
  # Convert to numpy arrays, and take index [0] to remove the batch dimension.
  # We're only interested in the first num_detections.
  num_detections = int(output_dict.pop('num_detections'))
  output_dict = {key:value[0, :num_detections].numpy() 
                 for key,value in output_dict.items()}
  output_dict['num_detections'] = num_detections

  # detection_classes should be ints.
  output_dict['detection_classes'] = output_dict['detection_classes'].astype(np.int64)

  # Handle models with masks:
  if 'detection_masks' in output_dict:
    # Reframe the the bbox mask to the image size.
    detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
              output_dict['detection_masks'], output_dict['detection_boxes'],
               image.shape[0], image.shape[1])      
    detection_masks_reframed = tf.cast(detection_masks_reframed > 0.5,
                                       tf.uint8)
    output_dict['detection_masks_reframed'] = detection_masks_reframed.numpy()

  return output_dict

def show_inference(model, image_path):
  # the array based representation of the image will be used later in order to prepare the
  # result image with boxes and labels on it.
  image_np = np.array(Image.open(image_path))
  # Actual detection.
  output_dict = run_inference_for_single_image(model, image_np)
  # Visualization of the results of a detection.
  vis_util.visualize_boxes_and_labels_on_image_array(
      image_np,
      output_dict['detection_boxes'],
      output_dict['detection_classes'],
      output_dict['detection_scores'],
      category_index,
      instance_masks=output_dict.get('detection_masks_reframed', None),
      use_normalized_coordinates=True,
      line_thickness=8)

  display(Image.fromarray(image_np))

for image_path in TEST_IMAGE_PATHS:
  show_inference(detection_model, image_path)
TannerGilbert commented 3 years ago

The following should work:

def load_custom_model(model_name):
    model_file = model_name
    model_dir = pathlib.Path(model_file)/"saved_model"
    model = tf.saved_model.load(str(model_dir))
    return model

model_name = 'exported-models/my_model'
detection_model = load_custom_model(model_name)

PATH_TO_LABELS` = 'training/label_map.pbtxt'
category_index = label_map_util.create_category_index_from_labelmap(
    PATH_TO_LABELS, use_display_name=True)
PATH_TO_TEST_IMAGES_DIR = pathlib.Path('test_images')
TEST_IMAGE_PATHS = sorted(list(PATH_TO_TEST_IMAGES_DIR.glob("*.jpg")))

def run_inference_for_single_image(model, image):
    if os.path.exists('results.csv'):
        df = pd.read_csv('results.csv')
    else:
        df = pd.DataFrame(columns=['timestamp', 'img_path'])

    image = np.asarray(image)
    # The input needs to be a tensor, convert it using `tf.convert_to_tensor`.
    input_tensor = tf.convert_to_tensor(image)
    # The model expects a batch of images, so add an axis with `tf.newaxis`.
    input_tensor = input_tensor[tf.newaxis, ...]

    # Run inference
    model_fn = model.signatures['serving_default']
    output_dict = model_fn(input_tensor)

    # All outputs are batches tensors.
    # Convert to numpy arrays, and take index [0] to remove the batch dimension.
    # We're only interested in the first num_detections.
    num_detections = int(output_dict.pop('num_detections'))
    output_dict = {key: value[0, :num_detections].numpy()
                   for key, value in output_dict.items()}
    output_dict['num_detections'] = num_detections

    # detection_classes should be ints.
    output_dict['detection_classes'] = output_dict['detection_classes'].astype(
        np.int64)

    # Handle models with masks:
    if 'detection_masks' in output_dict:
        # Reframe the the bbox mask to the image size.
        detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
            output_dict['detection_masks'], output_dict['detection_boxes'],
            image.shape[0], image.shape[1])
        detection_masks_reframed = tf.cast(detection_masks_reframed > 0.5,
                                           tf.uint8)
        output_dict['detection_masks_reframed'] = detection_masks_reframed.numpy()

    # Get data(label, xmin, ymin, xmax, ymax)
    output = []
    for index, score in enumerate(output_dict['detection_scores']):
        if score < threshold:
            continue
        label = category_index[output_dict['detection_classes'][index]]['name']
        ymin, xmin, ymax, xmax = output_dict['detection_boxes'][index]
        output.append((label, int(xmin * image_width), int(ymin * image_height), int(xmax * image_width), int(ymax * image_height)))

    # Save incident (could be extended to send a email or something)
    for l, x_min, y_min, x_max, y_max in output:
        if l == label_to_look_for:
            array = cv2.cvtColor(np.array(image_show), cv2.COLOR_RGB2BGR)
            image = Image.fromarray(array)
            cropped_img = image.crop((x_min, y_min, x_max, y_max))
            file_path = output_directory+'/images/'+str(len(df))+'.jpg'
            cropped_img.save(file_path, "JPEG", icc_profile=cropped_img.info.get('icc_profile'))
            df.loc[len(df)] = [datetime.datetime.now(), file_path]
            df.to_csv(output_directory+'/results.csv', index=None)

    return output_dict

def show_inference(model, image_path):
    # the array based representation of the image will be used later in order to prepare the
    # result image with boxes and labels on it.
    image_np = np.array(Image.open(image_path))
    # Actual detection.
    output_dict = run_inference_for_single_image(model, image_np)
    # Visualization of the results of a detection.
    vis_util.visualize_boxes_and_labels_on_image_array(
        image_np,
        output_dict['detection_boxes'],
        output_dict['detection_classes'],
        output_dict['detection_scores'],
        category_index,
        instance_masks=output_dict.get('detection_masks_reframed', None),
        use_normalized_coordinates=True,
        line_thickness=8)

    display(Image.fromarray(image_np))

for image_path in TEST_IMAGE_PATHS:
    show_inference(detection_model, image_path)

Don't forget to import the needed libraries. If it doesn't work let me know.

shivprasad94 commented 3 years ago

hey, @TannerGilbert thanks. I tried the above code snippet, but it's not working as expected. I even tried printing the output list with labels and coordinates but looks like it's an empty list because the code doesn't go on to excute if score < threshold:

But most importantly, I am able to see bounding boxes on my test images from your code snippet. looks like the index values - ymin, xmin, ymax, xmax are not get extracted and stored in output[]

  # Get data(label, xmin, ymin, xmax, ymax)
    output = []
    threshold=20  # I have set my threshold as 20
    output_directory = 'cropped' #a folder directory to save result
    label_to_look_for = 'num' #label name of the box

    for index, score in enumerate(output_dict['detection_scores']):
        if score < threshold:
            continue
        print('extracting index and label')    # this part itself is not getting printed*
        label = category_index[output_dict['detection_classes'][index]]['name']
        ymin, xmin, ymax, xmax = output_dict['detection_boxes'][index]
        output.append((label, int(xmin * image_width), int(ymin * image_height), int(xmax * image_width), int(ymax * image_height)))
        print(output)

so because of this, i tried removing the if condition of checking score< threshold and replaced with below snippet

output = []
for index, score in enumerate(output_dict['detection_scores']):
    label = category_index[output_dict['detection_classes'][index]]['name']
    ymin, xmin, ymax, xmax = output_dict['detection_boxes'][index]
    output.append((label, int(xmin * image_width), int(ymin * image_height), int(xmax * image_width), int(ymax * image_height)))

and ended up with the below error NameError: name 'image_width' is not defined

so I defined values for both variable based on my input test image size 1920x1080

image_width = 1920
image_height = 1080

and then ended up with the below error

51     # Save incident (could be extended to send a email or something)
52     for l, x_min, y_min, x_max, y_max in output:
53      --->       array = cv2.cvtColor(np.array(image_show), cv2.COLOR_RGB2BGR)
54             image = Image.fromarray(array)
55             cropped_img = image.crop((x_min, y_min, x_max, y_max))

NameError: name 'image_show' is not defined

So had to replace it with 'image' and fixed the above errors.

So finally updated my code to below , by extracting coordinates of output[0]

 output = []
    image_width = 1920
    image_height = 1080
    label_to_look_for ='num'
    output_directory = 'cropped'

    for index, score in enumerate(output_dict['detection_scores']):
        label = category_index[output_dict['detection_classes'][index]]['name']
        ymin, xmin, ymax, xmax = output_dict['detection_boxes'][index]
        output.append((label, int(xmin * image_width), int(ymin * image_height), int(xmax * image_width), int(ymax * image_height)))

    # Save incident (could be extended to send a email or something)
    #for l, x_min, y_min, x_max, y_max in output:
    if label == label_to_look_for:
            print(output[0])
            x_min=output[0][1]
            y_min=output[0][2]
            x_max=output[0][3]
            y_max=output[0][4]
            array = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
            image = Image.fromarray(array)
            cropped_img = image.crop((x_min, y_min, x_max, y_max))
            file_path = output_directory+'/images/'+str(len(df))+'.jpg'
            cropped_img.save(file_path, "JPEG", icc_profile=cropped_img.info.get('icc_profile'))
            df.loc[len(df)] = [datetime.datetime.now(), file_path]
            df.to_csv(output_directory+'/results.csv', index=None)
            print(output[0])

    return output_dict

is output[0] is best coordinate?

TannerGilbert commented 3 years ago

Sorry for not replying for such a long time. I finally had the time to write a working example. The example works with a webcam, but it should be simple to rewrite it to work with images instead if you want to.

I'll close this issue for now. If you have any further questions feel free to ask.