google / automl

Google Brain AutoML
Apache License 2.0
6.21k stars 1.45k forks source link

Training efficientDet on custom dataset - parser/GatherNd_1 Error #1028

Closed philptt closed 3 years ago

philptt commented 3 years ago

Hi.

I'm trying to fine-tune EfficientDet on a custom dataset using Google Colab (free tier). I'm new to TensorFlow, so I tried to reproduce/modify an existing notebook (sorry if this question is stupid; I checked other issues concerning GatherNd here and tried a few things, but nothing worked). When training, I get the following error:

(0) Invalid argument: indices[2] = [2] does not index into param shape [1,1], node name: parser/GatherNd_1
     [[{{node parser/GatherNd_1}}]]
     [[IteratorGetNext]]
     [[IteratorGetNext/_4303]]

I can't figure out where that comes from, even though I'm aware that it could come from the TFRecord files. Here's how I generate them:

def create_tf_example(filepath, df_label):
    """Build a tf.train.Example for the PNG image stored at `filepath`.

    Args:
        filepath: Path of the image file. Its raw bytes are stored under
            "image/encoded" and hashed for "image/key/sha256".
        df_label: Table indexed with `.loc`/`.iloc` (presumably a pandas
            DataFrame) with columns "id", "height0", "width0", "xmins0",
            "xmaxs0", "ymins0" and "ymaxs0". The first row whose "id" equals
            the file name without ".png" is used; each box column cell is
            iterated as a sequence of coordinates.

    Returns:
        The assembled tf.train.Example.
    """

    # NOTE(review): the file object is never explicitly closed.
    encoded_image_data = open(filepath, "rb").read()
    key = hashlib.sha256(encoded_image_data).hexdigest()
    filename = os.path.basename(filepath)
    # Labels are keyed by the file name without its ".png" suffix.
    image_name = filename.replace(".png", "")
    # Size used to normalize the box coordinates below.
    height0 = df_label["height0"].loc[df_label["id"]==image_name].iloc[0]
    width0 = df_label["width0"].loc[df_label["id"]==image_name].iloc[0]
    # NOTE(review): image_format is not used anywhere else in this function.
    image_format = b'png'
    # Size written to the record; fixed at 256 regardless of height0/width0.
    width = 256
    height = 256

    # Divide each coordinate by width0/height0 to get fractions of the image.
    xmins = [x / width0 for x in df_label["xmins0"].loc[df_label["id"]==image_name].iloc[0]]
    xmaxs = [x / width0 for x in df_label["xmaxs0"].loc[df_label["id"]==image_name].iloc[0]]
    ymins = [x / height0 for x in df_label["ymins0"].loc[df_label["id"]==image_name].iloc[0]]
    ymaxs = [x / height0 for x in df_label["ymaxs0"].loc[df_label["id"]==image_name].iloc[0]]
    # NOTE(review): both label lists always have exactly one entry, while the
    # four bbox lists above have one entry per box; the label lists probably
    # need to be as long as the bbox lists -- confirm against the parser.
    classes_text = ["opacity".encode("utf-8")]
    classes = [1]

    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': tf.train.Feature(int64_list=tf.train.Int64List(value=[height])),
        'image/width': tf.train.Feature(int64_list=tf.train.Int64List(value=[width])),
        "image/filename": tf.train.Feature(bytes_list=tf.train.BytesList(value=[filename.encode("utf-8")])),
        "image/source_id": tf.train.Feature(bytes_list=tf.train.BytesList(value=['0'.encode("utf-8")])), # Problem with image names worked around with this hack: every record gets the constant id '0'
        "image/key/sha256": tf.train.Feature(bytes_list=tf.train.BytesList(value=[key.encode("utf-8")])),
        "image/encoded": tf.train.Feature(bytes_list=tf.train.BytesList(value=[encoded_image_data])),
        "image/format": tf.train.Feature(bytes_list=tf.train.BytesList(value=["png".encode("utf-8")])),
        "image/object/bbox/xmin": tf.train.Feature(float_list=tf.train.FloatList(value=xmins)),
        "image/object/bbox/xmax": tf.train.Feature(float_list=tf.train.FloatList(value=xmaxs)),
        "image/object/bbox/ymin": tf.train.Feature(float_list=tf.train.FloatList(value=ymins)),
        "image/object/bbox/ymax": tf.train.Feature(float_list=tf.train.FloatList(value=ymaxs)),
        "image/object/class/text": tf.train.Feature(bytes_list=tf.train.BytesList(value=classes_text)),
        "image/object/class/label": tf.train.Feature(int64_list=tf.train.Int64List(value=classes)),
        }))
    return tf_example

# Serialize one tf.train.Example per training image into a single TFRecord
# file. The context manager closes (and thereby flushes) the writer even if
# building or writing an example raises part-way through the loop.
with tf.io.TFRecordWriter(
    '/content/drive/MyDrive/siim-covid19-detection/TFRecords/train/train.tfrecord'
) as writer_train:
    for filepath in train_filepaths:
        tf_example = create_tf_example(filepath, df_train)
        writer_train.write(tf_example.SerializeToString())

Same code for val.tfrecord.

I downloaded the model with this:

if not os.path.isdir("automl"):

    !git clone --depth 1 https://github.com/google/automl
    %cd automl
    !git checkout f2b4480703278250fb05abe38a2f4ecbb16ba463

    %cd efficientdet

    %pip install -r requirements.txt
    %pip install -U "git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI"

MODEL = "efficientdet-d0"
if not os.path.exists(f"{MODEL}.tar.gz"):
    !curl -O https://storage.googleapis.com/cloud-tpu-checkpoints/efficientdet/coco2/{MODEL}.tar.gz
    !tar xvzf {MODEL}.tar.gz

Config is this:

# Write the hparams override file and define the paths and sizes that are
# substituted into the training command.
PROJ_DIR = "/content/MODEL"
CONFIG_DIR = os.path.join(PROJ_DIR, "configs")
CONFIG_FILE = os.path.join(CONFIG_DIR, "default.yaml")
# makedirs also creates PROJ_DIR when it is missing (os.mkdir would raise
# FileNotFoundError in that case) and is a no-op when the directory exists.
os.makedirs(CONFIG_DIR, exist_ok=True)

# YAML overrides; class id 1 is the only class.
config_text = \
"""image_size: 256x256 # this is the size of my images
num_classes: 1
label_map: {1: opacity}
input_rand_hflip: true
jitter_min: 0.8
jitter_max: 1.2
"""

with open(CONFIG_FILE, "w") as fwrite:
    fwrite.write(config_text)

TFRECORD_DIR = "/content/drive/MyDrive/siim-covid19-detection/TFRecords"

# The checkpoint path is the model name, i.e. relative to the current directory.
CKPT = MODEL

TRAIN_SET = os.path.join(TFRECORD_DIR, "train/train.tfrecord")
VAL_SET = os.path.join(TFRECORD_DIR, "val/val.tfrecord")
MODEL_DIR_TMP = os.path.join(PROJ_DIR, "tmp", f"{MODEL}-finetune")
TRAIN_NUM_EXAMPLES = len(train_filepaths)
EVAL_NUM_EXAMPLES = len(val_filepaths)
EPOCHS = 2
BATCH_SIZE = 16

And here's how I start the training:

# Run the `main` module in train_and_eval mode. Each {NAME} below is
# substituted by IPython with the value of the notebook variable NAME before
# the command is handed to the shell.
# NOTE(review): `-m main` is resolved from the current directory, presumably
# automl/efficientdet as left by the setup cell -- confirm before running.
!python -m main \
    --mode=train_and_eval \
    --train_file_pattern={TRAIN_SET} \
    --val_file_pattern={VAL_SET} \
    --model_name={MODEL} \
    --model_dir={MODEL_DIR_TMP} \
    --ckpt={CKPT} \
    --train_batch_size={BATCH_SIZE} \
    --eval_batch_size={BATCH_SIZE} \
    --num_epochs={EPOCHS} \
    --num_examples_per_epoch={TRAIN_NUM_EXAMPLES} \
    --eval_samples={EVAL_NUM_EXAMPLES} \
    --hparams={CONFIG_FILE}

Thanks in advance for your help !

philptt commented 3 years ago

All right, I found my error: for each sample in the TFRecord file, the number of class labels and class texts did not match the number of bounding boxes. I changed the code like this and it did the trick:

def create_tf_example(filepath, df_label):
    """Build a tf.train.Example for one PNG image and its bounding boxes.

    Args:
        filepath: Path of the PNG file. Its raw bytes are stored under
            "image/encoded" and hashed for "image/key/sha256".
        df_label: pandas DataFrame with columns "id", "height0", "width0",
            "xmins0", "xmaxs0", "ymins0" and "ymaxs0". The first row whose
            "id" equals the file name without ".png" is used; each box column
            cell must be a sequence with one coordinate per box.

    Returns:
        A tf.train.Example with the encoded image, box coordinates normalized
        by width0/height0, and one class text/label per box.

    Raises:
        IndexError: If no row of `df_label` matches the image.
        ValueError: If the four box coordinate lists differ in length.
    """

    def bytes_feature(values):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))

    def float_feature(values):
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    def int64_feature(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    # Close the file handle deterministically instead of leaking it.
    with open(filepath, "rb") as image_file:
        encoded_image_data = image_file.read()
    key = hashlib.sha256(encoded_image_data).hexdigest()
    filename = os.path.basename(filepath)
    image_name = filename.replace(".png", "")

    # Select the image's label row once instead of re-filtering per column.
    matches = df_label.loc[df_label["id"] == image_name]
    if matches.empty:
        raise IndexError(f"no label row with id {image_name!r} in df_label")
    row = matches.iloc[0]
    width0 = row["width0"]
    height0 = row["height0"]

    # Size written to the record; fixed at 256x256, independent of the
    # width0/height0 used for normalization.
    width = 256
    height = 256

    # Express every coordinate as a fraction of the image size.
    xmins = [x / width0 for x in row["xmins0"]]
    xmaxs = [x / width0 for x in row["xmaxs0"]]
    ymins = [y / height0 for y in row["ymins0"]]
    ymaxs = [y / height0 for y in row["ymaxs0"]]

    num_boxes = len(xmins)
    if not (len(xmaxs) == len(ymins) == len(ymaxs) == num_boxes):
        raise ValueError(
            f"bounding box lists for {image_name!r} have different lengths"
        )

    # Per-box lists: the label features must have exactly one entry per box,
    # otherwise the record cannot be parsed into aligned boxes and classes.
    classes_text = ["opacity".encode("utf-8")] * num_boxes
    # Class ids are 1-based to match `label_map: {1: opacity}` in the training
    # config; id 0 is not a foreground class there.
    classes = [1] * num_boxes

    tf_example = tf.train.Example(features=tf.train.Features(feature={
        "image/height": int64_feature([height]),
        "image/width": int64_feature([width]),
        "image/filename": bytes_feature([filename.encode("utf-8")]),
        # HACK: constant source id works around a problem with image names.
        "image/source_id": bytes_feature(["0".encode("utf-8")]),
        "image/key/sha256": bytes_feature([key.encode("utf-8")]),
        "image/encoded": bytes_feature([encoded_image_data]),
        "image/format": bytes_feature(["png".encode("utf-8")]),
        "image/object/bbox/xmin": float_feature(xmins),
        "image/object/bbox/xmax": float_feature(xmaxs),
        "image/object/bbox/ymin": float_feature(ymins),
        "image/object/bbox/ymax": float_feature(ymaxs),
        "image/object/class/text": bytes_feature(classes_text),
        "image/object/class/label": int64_feature(classes),
    }))
    return tf_example