sainivedh19pt opened this issue 2 years ago
Can you finish the multimodal example?
Thanks for your amazing work.
Error
```
[[{{node ParseSingleSequenceExample/ParseSequenceExample/ParseSequenceExampleV2}}]] [Op:IteratorGetNext]
2022-04-15 13:19:55.258721: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0.  Number of values != expected.  values size: 100 but output shape: []
2022-04-15 13:19:55.259783: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0.  Number of values != expected.  values size: 100 but output shape: []
2022-04-15 13:19:55.261186: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0.  Number of values != expected.  values size: 100 but output shape: []
2022-04-15 13:19:55.263445: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0.  Number of values != expected.  values size: 100 but output shape: []
```
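The error is raised while parsing the `image/encoded` key of the serialized `tf.train.SequenceExample`. A minimal debugging sketch to inspect the generated shard and see how many values that key actually holds, and where it was written (the shard path is the one passed to the factory constructor below; adjust it if yours differs):

```python
import tensorflow as tf

# Path of the single generated shard, as used in the factory constructor below.
SHARD = 'generated_dataset/kinetics400_val-00000-of-00001'

raw = next(iter(tf.data.TFRecordDataset(SHARD)))
seq = tf.train.SequenceExample.FromString(raw.numpy())

# `image/encoded` may end up either in the context or in the feature lists;
# print the number of values found in each place.
if 'image/encoded' in seq.feature_lists.feature_list:
  print('feature_lists values:',
        len(seq.feature_lists.feature_list['image/encoded'].feature))
if 'image/encoded' in seq.context.feature:
  print('context values:',
        len(seq.context.feature['image/encoded'].bytes_list.value))
```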
Using the hmdb example, I was creating a DMVR multimodal dataset with images, audio, text, and labels. However, I am facing the same errors raised by other users earlier (https://github.com/google-research/google-research/issues/925#issuecomment-1020675579). I can run the provided video-only hmdb example, but I run into trouble once audio and text are added.
I generated a single TFRecord using the `generate_from_csv.py` script provided in the examples, using 4 videos with captions. Attaching the code for the dataset constructor class:
```python
import os

from dmvr import modalities
from dmvr import tokenizers
from dmvr import video_dataset


class ToyFactory2(video_dataset.BaseVideoDatasetFactory):

  _SUBSETS = ('train', 'test', 'valid')
  _SPLITS = (1, 2, 3)
  _NUM_CLASSES = 4
  _NUM_SHARDS = {'train': 59, 'test': 39, 'valid': 1}

  def __init__(self,
               base_dir: str = 'generated_dataset',
               subset: str = 'valid',
               split: int = 1):
    """Constructor of ToyFactory2."""
    if subset not in ToyFactory2._SUBSETS:
      raise ValueError('Invalid subset "{}". The available subsets are: {}'
                       .format(subset, ToyFactory2._SUBSETS))
    if split not in ToyFactory2._SPLITS:
      raise ValueError('Invalid split "{}". The available splits are: {}'
                       .format(split, ToyFactory2._SPLITS))
    num_shards = self._NUM_SHARDS[subset]
    # The single generated shard is loaded directly.
    super().__init__(
        shards=[os.path.join(base_dir, 'kinetics400_val-00000-of-00001')])

  def _build(self,
             is_training=True,
             # Video related parameters.
             num_frames=32,
             stride=1,
             num_test_clips=1,
             min_resize=256,
             crop_size=224,
             multi_crop=False,
             crop_resize_style='Inception',
             min_aspect_ratio=0.5,
             max_aspect_ratio=2,
             min_area_ratio=0.08,
             max_area_ratio=1.0,
             zero_centering_image=False,
             color_augmentation=True,
             # Text related parameters.
             max_num_words=16,
             max_context_sentences=1,
             tokenizer='howto100m_en',
             prepend_bos=False,
             append_eos=False,
             keep_raw_string=False,
             # Audio related parameters.
             num_samples=153600,  # 48000 (Hz) * 32 / 10 (fps)
             audio_stride=1,
             sync_audio_and_image=True,
             # Label related parameters.
             one_hot_label=True,
             output_label_string=False,
             add_label_name=False,
             **kwargs):
    """Default build for this dataset.

    Args:
      is_training: Whether or not in training mode.
      num_frames: Number of frames per subclip. For single images, use 1.
      stride: Temporal stride to sample frames.
      num_test_clips: Number of test clips (1 by default). If more than 1, this
        will sample multiple linearly spaced clips within each video at test
        time. If 1, then a single clip in the middle of the video is sampled.
        The clips are aggregated in the batch dimension.
      min_resize: Frames are resized so that `min(height, width)` is
        `min_resize`.
      crop_size: Final size of the frame after cropping the resized frames.
        Both height and width are the same.
      zero_centering_image: If `True`, frames are normalized to values in
        [-1, 1]. If `False`, values in [0, 1].
      one_hot_label: Return labels as one hot tensors.
      add_label_name: Also return the name of the label.
    """
    modalities.add_image(
        parser_builder=self.parser_builder,
        sampler_builder=self.sampler_builder,
        decoder_builder=self.decoder_builder,
        preprocessor_builder=self.preprocessor_builder,
        postprocessor_builder=self.postprocessor_builder,
        is_training=is_training,
        num_frames=num_frames,
        stride=stride,
        num_test_clips=num_test_clips,
        min_resize=min_resize,
        crop_size=crop_size,
        zero_centering_image=zero_centering_image,
        input_feature_name='image/encoded')

    modalities.add_audio(
        parser_builder=self.parser_builder,
        sampler_builder=self.sampler_builder,
        postprocessor_builder=self.postprocessor_builder,
        # preprocessor_builder=preprocessor_builder,
        input_feature_name='WAVEFORM/feature/floats',
        output_feature_name='audio',
        is_training=is_training,
        num_samples=num_samples,
        stride=stride,
        num_test_clips=num_test_clips,
        sync_random_state=False)

    self.tokenizer = tokenizers.WordTokenizer(
        os.path.join('./misc', 'howto100m_en' + '.txt'))
    self.tokenizer.initialize()

    modalities.add_text(
        parser_builder=self.parser_builder,
        decoder_builder=self.decoder_builder,
        preprocessor_builder=self.preprocessor_builder,
        tokenizer=self.tokenizer,
        is_training=is_training,
        input_feature_name='caption/string',
        output_raw_string_name='text_string',
        output_feature_name='text',
        prepend_bos=prepend_bos,
        append_eos=append_eos,
        keep_raw_string=keep_raw_string,
        max_num_captions=1,
        max_num_tokens=16,
        sync_random_state=False)

    modalities.add_label(
        parser_builder=self.parser_builder,
        decoder_builder=self.decoder_builder,
        preprocessor_builder=self.preprocessor_builder,
        one_hot_label=one_hot_label,
        num_classes=ToyFactory2._NUM_CLASSES,
        add_label_name=add_label_name)
```
Loading the DMVR dataset:
```python
import tensorflow as tf

factory = ToyFactory2().configure(is_training=True)
ds = factory.make_dataset(batch_size=1)
data = next(iter(ds))
print(data.keys(), data)
```
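If parsing succeeds, a small follow-up check (assuming the pipeline above) can print the shape and dtype of every parsed modality in one batch, so that image, audio, text and label outputs can be verified at a glance:

```python
# Print shape/dtype of each output for one batch.
for batch in ds.take(1):
  for name, tensor in batch.items():
    print(name, tuple(tensor.shape), tensor.dtype)
```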
Since VATT and google-scenic use DMVR as the data-loading step, it would be very helpful for everyone if a multimodal example were added to the docs.
Thanks
Have you fixed it? I also met the same problem, and it must be a dataset problem.