google-deepmind / dmvr


How to prepare Multi Modal Dataset #5

Open sainivedh19pt opened 2 years ago

sainivedh19pt commented 2 years ago

Thanks for your amazing work.

Error

         [[{{node ParseSingleSequenceExample/ParseSequenceExample/ParseSequenceExampleV2}}]] [Op:IteratorGetNext]
2022-04-15 13:19:55.258721: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0.  Number of values != expected.  values size: 100 but output shape: []
2022-04-15 13:19:55.259783: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0.  Number of values != expected.  values size: 100 but output shape: []
2022-04-15 13:19:55.261186: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0.  Number of values != expected.  values size: 100 but output shape: []
2022-04-15 13:19:55.263445: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0.  Number of values != expected.  values size: 100 but output shape: []

Using the HMDB example as a template, I was creating a DMVR multimodal dataset with images, audio, text, and labels. However, I am facing the same errors raised by other users earlier (https://github.com/google-research/google-research/issues/925#issuecomment-1020675579). I can also run the provided video-only HMDB example, but I run into trouble once audio and text are added.

I generated a single TFRecord using the generate_from_csv.py script provided in the examples, from 4 videos with captions.
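The parse failure ("values size: 100 but output shape: []" for image/encoded) suggests a mismatch between how the frames were written into the SequenceExample and how the parser expects to read them (a per-frame feature list vs. a scalar context feature). Below is a minimal inspection sketch, assuming the shard path used in the factory further down and TensorFlow 2.x, to check where image/encoded actually ends up in the generated record:

import tensorflow as tf

# Path assumed from the factory below; adjust to wherever your shard was written.
path = 'generated_dataset/kinetics400_val-00000-of-00001'

raw_record = next(iter(tf.data.TFRecordDataset(path)))
seq = tf.train.SequenceExample.FromString(raw_record.numpy())

print('context keys:', sorted(seq.context.feature.keys()))
for key, feature_list in seq.feature_lists.feature_list.items():
  print(key, '->', len(feature_list.feature), 'values')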

Attaching the code for the dataset factory class:

import os

from dmvr import modalities
from dmvr import tokenizers
from dmvr import video_dataset

class ToyFactory2(video_dataset.BaseVideoDatasetFactory):
  _SUBSETS = ('train', 'test', 'valid')
  _SPLITS = (1, 2, 3)
  _NUM_CLASSES = 4
  _NUM_SHARDS = {'train': 59, 'test': 39, 'valid': 1}

  def __init__(
      self,
      base_dir: str = 'generated_dataset',
      subset: str = 'valid',
      split: int = 1):
    """Constructor of ToyFactory2."""

    if subset not in ToyFactory2._SUBSETS:
      raise ValueError('Invalid subset "{}". The available subsets are: {}'
                       .format(subset, ToyFactory2._SUBSETS))

    if split not in ToyFactory2._SPLITS:
      raise ValueError('Invalid split "{}". The available splits are: {}'
                       .format(split, ToyFactory2._SPLITS))

    # num_shards is currently unused: a single generated shard is hard-coded
    # below (see the shard-list sketch after this class).
    num_shards = self._NUM_SHARDS[subset]
    super().__init__(
        shards=[os.path.join(base_dir, 'kinetics400_val-00000-of-00001')])
  def _build(self,
             is_training=True,
             # Video related parameters.
             num_frames=32,
             stride=1,
             num_test_clips=1,
             min_resize=256,
             crop_size=224,
             multi_crop=False,
             crop_resize_style='Inception',
             min_aspect_ratio=0.5,
             max_aspect_ratio=2,
             min_area_ratio=0.08,
             max_area_ratio=1.0,
             zero_centering_image=False,
             color_augmentation=True,
             # Text related parameters.
             max_num_words=16,
             max_context_sentences=1,
             tokenizer='howto100m_en',
             prepend_bos=False,
             append_eos=False,
             keep_raw_string=False,
             # Audio related parameters.
             num_samples=153600,  # 48000 (Hz) * 32 / 10 (fps)
             audio_stride=1,
             sync_audio_and_image=True,
             # Label related parameters.
             one_hot_label=True,
             output_label_string=False,
             add_label_name=False,
             **kwargs):
    """Default build for this dataset.

    Args:
      is_training: Whether or not in training mode.
      num_frames: Number of frames per subclip. For single images, use 1.
      stride: Temporal stride to sample frames.
      num_test_clips: Number of test clips (1 by default). If more than 1,
        this will sample multiple linearly spaced clips within each video at
        test time. If 1, then a single clip in the middle of the video is
        sampled. The clips are aggregated in the batch dimension.
      min_resize: Frames are resized so that `min(height, width)` is
        `min_resize`.
      crop_size: Final size of the frame after cropping the resized frames.
        Both height and width are the same.
      zero_centering_image: If `True`, frames are normalized to values in
        [-1, 1]. If `False`, values in [0, 1].
      one_hot_label: Return labels as one hot tensors.
      add_label_name: Also return the name of the label.
    """
    modalities.add_image(
        parser_builder=self.parser_builder,
        sampler_builder=self.sampler_builder,
        decoder_builder=self.decoder_builder,
        preprocessor_builder=self.preprocessor_builder,
        postprocessor_builder=self.postprocessor_builder,
        is_training=is_training,
        num_frames=num_frames, stride=stride,
        num_test_clips=num_test_clips,
        min_resize=min_resize, crop_size=crop_size,
        zero_centering_image=zero_centering_image,
        input_feature_name="image/encoded")

    modalities.add_audio(
        parser_builder=self.parser_builder,
        sampler_builder=self.sampler_builder,
        postprocessor_builder=self.postprocessor_builder,
        # preprocessor_builder=self.preprocessor_builder,
        input_feature_name="WAVEFORM/feature/floats",
        output_feature_name='audio',
        is_training=is_training,
        num_samples=num_samples,
        stride=stride,  # Note: the `audio_stride` argument above is unused here.
        num_test_clips=num_test_clips,
        sync_random_state=False)

    self.tokenizer = tokenizers.WordTokenizer(
        os.path.join('./misc', 'howto100m_en' + '.txt'))
    self.tokenizer.initialize()

    modalities.add_text(
        parser_builder=self.parser_builder,
        decoder_builder=self.decoder_builder,
        preprocessor_builder=self.preprocessor_builder,
        tokenizer=self.tokenizer,
        is_training=is_training,
        input_feature_name="caption/string",
        output_raw_string_name='text_string',
        output_feature_name='text',
        prepend_bos=prepend_bos,
        append_eos=append_eos,
        keep_raw_string=keep_raw_string,
        max_num_captions=1,
        max_num_tokens=16,
        sync_random_state=False)

    modalities.add_label(
        parser_builder=self.parser_builder,
        decoder_builder=self.decoder_builder,
        preprocessor_builder=self.preprocessor_builder,
        one_hot_label=one_hot_label,
        num_classes=ToyFactory2._NUM_CLASSES,
        add_label_name=add_label_name)
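For reference, the hard-coded single shard in __init__ could later be replaced by a list built from the otherwise unused num_shards. A rough sketch, assuming a hypothetical shard naming pattern (the actual pattern depends on how the TFRecords were generated):

    shard_paths = [
        os.path.join(base_dir, f'{subset}-{index:05d}-of-{num_shards:05d}')
        for index in range(num_shards)
    ]
    super().__init__(shards=shard_paths)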

Loading the DMVR dataset:

import tensorflow as tf
factory = ToyFactory2().configure(is_training=True)
ds = factory.make_dataset(batch_size=1)
data = next(iter(ds))
print(data.keys(), data)
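Once parsing succeeds, a quick per-key shape check is a simple sanity test (a sketch only; the key names depend on the output_feature_name values used above):

for name, tensor in data.items():
  print(name, tensor.shape, tensor.dtype)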

Since VATT and google-scenic use dmvr for their data loading, it would be very helpful for everyone if a multimodal example were added to the docs.

Thanks

wuxianjun666 commented 2 years ago

Can you finish the multimodal example?

wuxianjun666 commented 2 years ago

(Quotes the full original issue above.)

Can you finish it?

cooper12121 commented 1 year ago


Have you fixed it? I also met the same problem, and it must be a dataset problem.