Bug: Invalid PNG data, size 196673

What happened?

looks I have a problem in my dataset, the training stopped in between. I have checked my images with Pillow, Scikit-learn, opencv, Tf.io.decode_jpeg but no corrupted images are detected

Relevant code

class WarmUpCosine(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(
        self, learning_rate_base, total_steps, warmup_learning_rate, warmup_steps
    ):
        super(WarmUpCosine, self).__init__()

        self.learning_rate_base = learning_rate_base
        self.total_steps = total_steps
        self.warmup_learning_rate = warmup_learning_rate
        self.warmup_steps = warmup_steps
        self.pi = tf.constant(np.pi)

    def __call__(self, step):
        if self.total_steps < self.warmup_steps:
            raise ValueError("Total_steps must be larger or equal to warmup_steps.")
        learning_rate = (
            0.5
            * self.learning_rate_base
            * (
                1
                + tf.cos(
                    self.pi
                    * (tf.cast(step, tf.float32) - self.warmup_steps)
                    / float(self.total_steps - self.warmup_steps)
                )
            )
        )

        if self.warmup_steps > 0:
            if self.learning_rate_base < self.warmup_learning_rate:
                raise ValueError(
                    "Learning_rate_base must be larger or equal to "
                    "warmup_learning_rate."
                )
            slope = (
                self.learning_rate_base - self.warmup_learning_rate
            ) / self.warmup_steps
            warmup_rate = slope * tf.cast(step, tf.float32) + self.warmup_learning_rate
            learning_rate = tf.where(
                step < self.warmup_steps, warmup_rate, learning_rate
            )
        return tf.where(
            step > self.total_steps, 0.0, learning_rate, name="learning_rate"
        )
    def get_config(self):
        config = {
            'warmup_steps': self.warmup_steps,
            'total_steps': self.total_steps}
        return config

def dataset(path, batch_size):

    def configure_for_performance(dataset, batch_size):
        """
        examples.prefetch(2) will prefetch two elements (2 examples),
        while examples.batch(20).prefetch(2) will prefetch 2 elements
        (2 batches, of 20 examples each)

        train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
        """
        # dataset = dataset.shuffle(buffer_size=1025)
        # dataset = dataset.cache('/tmp/dump.tfcache_eff')
        dataset = dataset.repeat()
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
        return dataset

    def read_filename(filename):
        image = tf.io.read_file(filename)
        image = tf.io.decode_jpeg(image, channels=3)
        # image = tf.image.random_flip_up_down(image)
        image = tf.image.resize(image, [224, 224])
        return image

    def custom_tf_dataset(path, batch_size):
        classes = os.listdir(path)
        num_classes = len(classes)
        print("number of classes are", num_classes)
        filenames = glob(path + '/*/*')
        num_examples = len(filenames)
        print("number of examples are", num_examples)
        random.shuffle(filenames)
        labels = [classes.index(name.split('/')[-2]) for name in filenames]
        # labels = tf.one_hot(labels, len(classes))
        labels = tf.data.Dataset.from_tensor_slices(labels)
        image_data = tf.data.Dataset.from_tensor_slices(filenames)
        image_data = image_data.map(read_filename)
        image_label_dataset = tf.data.Dataset.zip((image_data, labels))
        image_label_dataset = configure_for_performance(image_label_dataset, batch_size)
        return image_label_dataset, num_classes, num_examples

    return custom_tf_dataset(path, batch_size)

train_ds, num_train_class, num_train_examples = dataset(train_dir,train_batch_size)
val_ds, num_val_class, num_val_examples = dataset(val_dir, val_batch_size)
TOTAL_STEPS = int((num_train_examples / train_batch_size) * EPOCHS)

def load_model(model_path, num_classes: int):
    inputs = tf.keras.Input((224,224,3))
    norm_layer = tf.keras.layers.Rescaling(scale=1.0 / 127.5, offset=-1)(inputs)
    hub_module = hub.KerasLayer(model_path, trainable=True)

    # x,_,_ = hub_module(norm_layer, training=True)
    x = hub_module(norm_layer, training=True)
    # dropout = tf.keras.layers.Dropout(0.2)(x)
    if num_classes > 1:
        outputs = tf.keras.layers.Dense(num_classes, activation="softmax")(x)
    else:
        outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    model = tf.keras.Model(inputs, outputs)
    return model

 model = load_model('https://tfhub.dev/google/imagenet/mobilenet_v2_050_224/feature_vector/5',params["classes"])
model.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])
model.summary()
callback = tf.keras.callbacks.ModelCheckpoint(params["save_model"] + '_{epoch:02d}-{val_loss:5f}', save_best_only=False, monitor='val_loss', mode='min')
history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS,steps_per_epoch=num_train_examples/train_batch_size,validation_steps=num_val_examples,callbacks=[callback])
model.save(params["save_model"])
result = pd.DataFrame(history.history)
fig, ax = plt.subplots(2, 1, figsize=(10, 10))
result[["accuracy", "val_accuracy"]].plot(xlabel="epoch", ylabel="score", ax=ax[0])
result[["loss", "val_loss"]].plot(xlabel="epoch", ylabel="score", ax=ax[1])

Relevant log output

Traceback (most recent call last):
  File "/data-mount/vit-tfhub/vit-train.py", line 185, in <module>
    history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS,steps_per_epoch=num_train_examples/train_batch_size,validation_steps=num_val_examples,callbacks=[callback])
  File "/home/keertika/miniconda3/envs/cartzy/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/home/keertika/miniconda3/envs/cartzy/lib/python3.9/site-packages/tensorflow/python/eager/execute.py", line 54, in quick_execute
    tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InvalidArgumentError: Graph execution error:

tensorflow_hub Version

0.12.0 (latest stable release)

TensorFlow Version

other (please specify)

Other libraries

tensorflow==2.10.0

Python Version

3.x

OS

Linux

tensorflow / hub

Bug: Invalid PNG data, size 196673 #874