extract the images inside the bounding boxes

shivprasad94 commented 3 years ago

I am still confused about extracting Bounding boxes as a separate image, where do I exactly need to use your code from commonly asked questions can you please elaborate?

Below are the function I am using and testing it on test images folder.

def load_custom_model(model_name):
  model_file = model_name
  model_dir = pathlib.Path(model_file)/"saved_model"
  model = tf.saved_model.load(str(model_dir))
  return model

model_name = 'exported-models/my_model'
detection_model = load_custom_model(model_name)

PATH_TO_LABELS` = 'training/label_map.pbtxt'
category_index = label_map_util.create_category_index_from_labelmap(PATH_TO_LABELS, use_display_name=True)
PATH_TO_TEST_IMAGES_DIR = pathlib.Path('test_images')
TEST_IMAGE_PATHS = sorted(list(PATH_TO_TEST_IMAGES_DIR.glob("*.jpg")))

def run_inference_for_single_image(model, image):
  image = np.asarray(image)
  # The input needs to be a tensor, convert it using `tf.convert_to_tensor`.
  input_tensor = tf.convert_to_tensor(image)
  # The model expects a batch of images, so add an axis with `tf.newaxis`.
  input_tensor = input_tensor[tf.newaxis,...]

  # Run inference
  model_fn = model.signatures['serving_default']
  output_dict = model_fn(input_tensor)

  # All outputs are batches tensors.
  # Convert to numpy arrays, and take index [0] to remove the batch dimension.
  # We're only interested in the first num_detections.
  num_detections = int(output_dict.pop('num_detections'))
  output_dict = {key:value[0, :num_detections].numpy() 
                 for key,value in output_dict.items()}
  output_dict['num_detections'] = num_detections

  # detection_classes should be ints.
  output_dict['detection_classes'] = output_dict['detection_classes'].astype(np.int64)

  # Handle models with masks:
  if 'detection_masks' in output_dict:
    # Reframe the the bbox mask to the image size.
    detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
              output_dict['detection_masks'], output_dict['detection_boxes'],
               image.shape[0], image.shape[1])      
    detection_masks_reframed = tf.cast(detection_masks_reframed > 0.5,
                                       tf.uint8)
    output_dict['detection_masks_reframed'] = detection_masks_reframed.numpy()

  return output_dict

def show_inference(model, image_path):
  # the array based representation of the image will be used later in order to prepare the
  # result image with boxes and labels on it.
  image_np = np.array(Image.open(image_path))
  # Actual detection.
  output_dict = run_inference_for_single_image(model, image_np)
  # Visualization of the results of a detection.
  vis_util.visualize_boxes_and_labels_on_image_array(
      image_np,
      output_dict['detection_boxes'],
      output_dict['detection_classes'],
      output_dict['detection_scores'],
      category_index,
      instance_masks=output_dict.get('detection_masks_reframed', None),
      use_normalized_coordinates=True,
      line_thickness=8)

  display(Image.fromarray(image_np))

for image_path in TEST_IMAGE_PATHS:
  show_inference(detection_model, image_path)

TannerGilbert commented 3 years ago

The following should work:

def load_custom_model(model_name):
    model_file = model_name
    model_dir = pathlib.Path(model_file)/"saved_model"
    model = tf.saved_model.load(str(model_dir))
    return model

model_name = 'exported-models/my_model'
detection_model = load_custom_model(model_name)

PATH_TO_LABELS` = 'training/label_map.pbtxt'
category_index = label_map_util.create_category_index_from_labelmap(
    PATH_TO_LABELS, use_display_name=True)
PATH_TO_TEST_IMAGES_DIR = pathlib.Path('test_images')
TEST_IMAGE_PATHS = sorted(list(PATH_TO_TEST_IMAGES_DIR.glob("*.jpg")))

def run_inference_for_single_image(model, image):
    if os.path.exists('results.csv'):
        df = pd.read_csv('results.csv')
    else:
        df = pd.DataFrame(columns=['timestamp', 'img_path'])

    image = np.asarray(image)
    # The input needs to be a tensor, convert it using `tf.convert_to_tensor`.
    input_tensor = tf.convert_to_tensor(image)
    # The model expects a batch of images, so add an axis with `tf.newaxis`.
    input_tensor = input_tensor[tf.newaxis, ...]

    # Run inference
    model_fn = model.signatures['serving_default']
    output_dict = model_fn(input_tensor)

    # All outputs are batches tensors.
    # Convert to numpy arrays, and take index [0] to remove the batch dimension.
    # We're only interested in the first num_detections.
    num_detections = int(output_dict.pop('num_detections'))
    output_dict = {key: value[0, :num_detections].numpy()
                   for key, value in output_dict.items()}
    output_dict['num_detections'] = num_detections

    # detection_classes should be ints.
    output_dict['detection_classes'] = output_dict['detection_classes'].astype(
        np.int64)

    # Handle models with masks:
    if 'detection_masks' in output_dict:
        # Reframe the the bbox mask to the image size.
        detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
            output_dict['detection_masks'], output_dict['detection_boxes'],
            image.shape[0], image.shape[1])
        detection_masks_reframed = tf.cast(detection_masks_reframed > 0.5,
                                           tf.uint8)
        output_dict['detection_masks_reframed'] = detection_masks_reframed.numpy()

    # Get data(label, xmin, ymin, xmax, ymax)
    output = []
    for index, score in enumerate(output_dict['detection_scores']):
        if score < threshold:
            continue
        label = category_index[output_dict['detection_classes'][index]]['name']
        ymin, xmin, ymax, xmax = output_dict['detection_boxes'][index]
        output.append((label, int(xmin * image_width), int(ymin * image_height), int(xmax * image_width), int(ymax * image_height)))

    # Save incident (could be extended to send a email or something)
    for l, x_min, y_min, x_max, y_max in output:
        if l == label_to_look_for:
            array = cv2.cvtColor(np.array(image_show), cv2.COLOR_RGB2BGR)
            image = Image.fromarray(array)
            cropped_img = image.crop((x_min, y_min, x_max, y_max))
            file_path = output_directory+'/images/'+str(len(df))+'.jpg'
            cropped_img.save(file_path, "JPEG", icc_profile=cropped_img.info.get('icc_profile'))
            df.loc[len(df)] = [datetime.datetime.now(), file_path]
            df.to_csv(output_directory+'/results.csv', index=None)

    return output_dict

def show_inference(model, image_path):
    # the array based representation of the image will be used later in order to prepare the
    # result image with boxes and labels on it.
    image_np = np.array(Image.open(image_path))
    # Actual detection.
    output_dict = run_inference_for_single_image(model, image_np)
    # Visualization of the results of a detection.
    vis_util.visualize_boxes_and_labels_on_image_array(
        image_np,
        output_dict['detection_boxes'],
        output_dict['detection_classes'],
        output_dict['detection_scores'],
        category_index,
        instance_masks=output_dict.get('detection_masks_reframed', None),
        use_normalized_coordinates=True,
        line_thickness=8)

    display(Image.fromarray(image_np))

for image_path in TEST_IMAGE_PATHS:
    show_inference(detection_model, image_path)

Don't forget to import the needed libraries. If it doesn't work let me know.

shivprasad94 commented 3 years ago

hey, @TannerGilbert thanks. I tried the above code snippet, but it's not working as expected. I even tried printing the output list with labels and coordinates but looks like it's an empty list because the code doesn't go on to excute if score < threshold:

But most importantly, I am able to see bounding boxes on my test images from your code snippet. looks like the index values - ymin, xmin, ymax, xmax are not get extracted and stored in output[]

  # Get data(label, xmin, ymin, xmax, ymax)
    output = []
    threshold=20  # I have set my threshold as 20
    output_directory = 'cropped' #a folder directory to save result
    label_to_look_for = 'num' #label name of the box

    for index, score in enumerate(output_dict['detection_scores']):
        if score < threshold:
            continue
        print('extracting index and label')    # this part itself is not getting printed*
        label = category_index[output_dict['detection_classes'][index]]['name']
        ymin, xmin, ymax, xmax = output_dict['detection_boxes'][index]
        output.append((label, int(xmin * image_width), int(ymin * image_height), int(xmax * image_width), int(ymax * image_height)))
        print(output)

so because of this, i tried removing the if condition of checking score< threshold and replaced with below snippet

output = []
for index, score in enumerate(output_dict['detection_scores']):
    label = category_index[output_dict['detection_classes'][index]]['name']
    ymin, xmin, ymax, xmax = output_dict['detection_boxes'][index]
    output.append((label, int(xmin * image_width), int(ymin * image_height), int(xmax * image_width), int(ymax * image_height)))

and ended up with the below error NameError: name 'image_width' is not defined

so I defined values for both variable based on my input test image size 1920x1080

image_width = 1920
image_height = 1080

and then ended up with the below error

51     # Save incident (could be extended to send a email or something)
52     for l, x_min, y_min, x_max, y_max in output:
53      --->       array = cv2.cvtColor(np.array(image_show), cv2.COLOR_RGB2BGR)
54             image = Image.fromarray(array)
55             cropped_img = image.crop((x_min, y_min, x_max, y_max))

NameError: name 'image_show' is not defined

So had to replace it with 'image' and fixed the above errors.

by doing all this change it ran successfully but I ended up with almost 99 cropped images in my file directory.
as I understand the output.append is appended with 0 to 99 coordinates because of which there are 0-99 different cropped images are stored in the directory how I can get only the best one with the highest confidence score?
I am not so sure, but is output[0] is best coordinate? instead of looping the overall list, its good to pass on output[0]?
I just want 1 best image shown in the bounding box or maybe an image stored in some variable so that I can pass it to opensource OCR tools like PyTesseract and extract text data out of it

So finally updated my code to below , by extracting coordinates of output[0]

 output = []
    image_width = 1920
    image_height = 1080
    label_to_look_for ='num'
    output_directory = 'cropped'

    for index, score in enumerate(output_dict['detection_scores']):
        label = category_index[output_dict['detection_classes'][index]]['name']
        ymin, xmin, ymax, xmax = output_dict['detection_boxes'][index]
        output.append((label, int(xmin * image_width), int(ymin * image_height), int(xmax * image_width), int(ymax * image_height)))

    # Save incident (could be extended to send a email or something)
    #for l, x_min, y_min, x_max, y_max in output:
    if label == label_to_look_for:
            print(output[0])
            x_min=output[0][1]
            y_min=output[0][2]
            x_max=output[0][3]
            y_max=output[0][4]
            array = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
            image = Image.fromarray(array)
            cropped_img = image.crop((x_min, y_min, x_max, y_max))
            file_path = output_directory+'/images/'+str(len(df))+'.jpg'
            cropped_img.save(file_path, "JPEG", icc_profile=cropped_img.info.get('icc_profile'))
            df.loc[len(df)] = [datetime.datetime.now(), file_path]
            df.to_csv(output_directory+'/results.csv', index=None)
            print(output[0])

    return output_dict

is output[0] is best coordinate?

TannerGilbert commented 3 years ago

Sorry for not replying for such a long time. I finally had the time to write a working example. The example works with a webcam, but it should be simple to rewrite it to work with images instead if you want to.

I'll close this issue for now. If you have any further questions feel free to ask.

TannerGilbert / Tensorflow-Object-Detection-API-Train-Model

extract the images inside the bounding boxes #23