google-research / t5x


ValueError: None values not supported #728

Closed StephennFernandes closed 2 years ago

StephennFernandes commented 2 years ago

Upon running a seqio mixture on mT5 and ByT5, I get an error stating: ValueError: None values not supported

I am currently using a seqio mixture that I define in my task.py file, with the default mT5 tokenizer gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model and extra_ids=0.

This is how my task.py file looks:

import functools
import seqio
import tensorflow as tf
import t5.data
from datasets import load_from_disk, load_dataset
from t5.data import postprocessors
from t5.data import preprocessors
from t5.evaluation import metrics
from seqio import FunctionDataSource, utils

TaskRegistry = seqio.TaskRegistry
vocabulary = seqio.SentencePieceVocabulary('gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model', extra_ids=0)

DEFAULT_OUTPUT_FEATURES = {
    "inputs": seqio.Feature(
        vocabulary=vocabulary, add_eos=True,
        required=False),
    "targets": seqio.Feature(
        vocabulary=vocabulary, add_eos=True)
}

def gen_dataset(split, shuffle=False, seed=None, column="text", dataset_path=None):
    dataset = load_dataset(dataset_path, streaming=True, use_auth_token=True)
    if shuffle:
        if seed:
            dataset = dataset.shuffle(seed=seed)
        else:
            dataset = dataset.shuffle()
    while True:
        for item in dataset[str(split)]:
            yield item[column]

def dataset_fn(split, shuffle_files, seed=None, dataset_path=None):
    return tf.data.Dataset.from_generator(
        functools.partial(gen_dataset, split, shuffle_files, seed, dataset_path=dataset_path),
        output_signature=tf.TensorSpec(shape=(), dtype=tf.string, name=dataset_path)
    )

@utils.map_over_dataset
def target_to_key(x, key_map, target_key):
    """Assign the value from the dataset to target_key in key_map"""
    return {**key_map, target_key: x}

# link to the mt5 sentencepiece tokenizer vocabulary
vocabulary = seqio.SentencePieceVocabulary('gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model', extra_ids=0)

TaskRegistry.add(
    "hindi_span_curruption",
    source=seqio.FunctionDataSource(
        dataset_fn=functools.partial(dataset_fn, dataset_path='StephennFernandes/ciil_mega_corpus_hindi'),
        splits=("train", "validation"),
        caching_permitted=False),
    preprocessors=[
        functools.partial(
            target_to_key, key_map={
                "inputs": None,
                "targets": None,
            }, target_key="targets"),
        seqio.preprocessors.tokenize,
        # seqio.CacheDatasetPlaceholder(),
        preprocessors.span_corruption, 
        seqio.preprocessors.append_eos_after_trim,
    ],
    output_features={"targets": DEFAULT_OUTPUT_FEATURES["targets"],"inputs": seqio.Feature(vocabulary=vocabulary,add_eos=True)},
    metric_fns=[]
)
### Similar tasks are registered for multiple other languages. ###

seqio.MixtureRegistry.add(
  "ciil_mix_3",
  ["assamese_span_curruption", "bengali_span_curruption", 
  "bhisnupuriya_span_curruption", "bodo_span_curruption", 
  "divehi_span_curruption", "dogri_span_curruption", 
  "english_span_curruption", "gujarati_span_curruption",
  "hindi_span_curruption", "kannada_span_curruption", 
  "kashmiri_span_curruption", "konkani_span_curruption", 
  "maithili_span_curruption", "malayalam_span_curruption",
  "manipuri_span_curruption", "marathi_span_curruption",
  "nepali_span_curruption", "odia_span_curruption",
  "panjabi_span_curruption", "sanskrit_span_curruption",
  "tamil_span_curruption", "telugu_span_curruption",
   "urdu_span_curruption" ],
  default_rate=3
)

I use the ciil_mix_3 mixture in my .gin file. This is how my .gin file looks:

from __gin__ import dynamic_registration
import t5.data.mixtures
import __main__ as train_script

include 't5x/examples/t5/mt5/base.gin'
include 't5x/configs/runs/pretrain.gin'

import task 

MIXTURE_OR_TASK_NAME = "ciil_mix_3"
TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 114}
TRAIN_STEPS = 100000
DROPOUT_RATE = 0.0
BATCH_SIZE = 32

train_script.train:
  eval_period = 2000

The following is the entire stack trace:

Traceback (most recent call last):
  File "/home/stephen/anaconda3/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/stephen/anaconda3/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/stephen/Desktop/t5x_final_test/t5x/t5x/train.py", line 748, in <module>
    gin_utils.run(main)
  File "/home/stephen/Desktop/t5x_final_test/t5x/t5x/gin_utils.py", line 107, in run
    app.run(
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/absl/app.py", line 308, in run
    _run_main(main, args)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/absl/app.py", line 254, in _run_main
    sys.exit(main(argv))
  File "/home/stephen/Desktop/t5x_final_test/t5x/t5x/train.py", line 708, in main
    _main(argv)
  File "/home/stephen/Desktop/t5x_final_test/t5x/t5x/train.py", line 744, in _main
    train_using_gin()
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/gin/config.py", line 1605, in gin_wrapper
    utils.augment_exception_message_and_reraise(e, err_str)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/gin/utils.py", line 41, in augment_exception_message_and_reraise
    raise proxy.with_traceback(exception.__traceback__) from None
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/gin/config.py", line 1582, in gin_wrapper
    return fn(*new_args, **new_kwargs)
  File "/home/stephen/Desktop/t5x_final_test/t5x/t5x/train.py", line 249, in train
    train_ds = get_dataset_fn(train_dataset_cfg, ds_shard_id, num_ds_shards,
  File "/home/stephen/Desktop/t5x_final_test/t5x/t5x/utils.py", line 1366, in get_dataset
    return get_dataset_inner(cfg, shard_info, feature_converter_cls, seed,
  File "/home/stephen/Desktop/t5x_final_test/t5x/t5x/utils.py", line 1387, in get_dataset_inner
    ds = seqio.get_dataset(
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/dataset_providers.py", line 1671, in get_dataset
    ds = mixture_or_task.get_dataset(
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/dataset_providers.py", line 1457, in get_dataset
    datasets = [
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/dataset_providers.py", line 1458, in <listcomp>
    task.get_dataset(  # pylint:disable=g-complex-comprehension
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/dataset_providers.py", line 1209, in get_dataset
    ds = self.preprocess_postcache(
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/dataset_providers.py", line 1044, in preprocess_postcache
    dataset = self._preprocess_dataset(
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/dataset_providers.py", line 965, in _preprocess_dataset
    dataset = prep_fn(dataset, **kwargs)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/preprocessors.py", line 83, in tokenize
    return utils.map_over_dataset(fn=tokenize_fn)(dataset)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/utils.py", line 778, in wrapped_fn
    return ds.map(
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 2050, in map
    return ParallelMapDataset(
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 5284, in __init__
    self._map_func = structured_function.StructuredFunctionWrapper(
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/data/ops/structured_function.py", line 271, in __init__
    self._function = fn_factory()
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/eager/function.py", line 2567, in get_concrete_function
    graph_function = self._get_concrete_function_garbage_collected(
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/eager/function.py", line 2533, in _get_concrete_function_garbage_collected
    graph_function, _ = self._maybe_define_function(args, kwargs)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/eager/function.py", line 2711, in _maybe_define_function
    graph_function = self._create_graph_function(args, kwargs)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/eager/function.py", line 2627, in _create_graph_function
    func_graph_module.func_graph_from_py_func(
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py", line 1141, in func_graph_from_py_func
    func_outputs = python_func(*func_args, **func_kwargs)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/data/ops/structured_function.py", line 248, in wrapped_fn
    ret = wrapper_helper(*args)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/data/ops/structured_function.py", line 177, in wrapper_helper
    ret = autograph.tf_convert(self._func, ag_ctx)(*nested_args)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py", line 692, in wrapper
    raise e.ag_error_metadata.to_exception(e)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py", line 689, in wrapper
    return converted_call(f, args, kwargs, options=options)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py", line 439, in converted_call
    result = converted_f(*effective_args, **kwargs)
  File "/tmp/__autograph_generated_fileu9gu1w4n.py", line 8, in <lambda>
    tf__lam = lambda arg: ag__.with_function_scope(lambda lscope: ag__.converted_call(fn, (arg,) + tuple(args), dict(**kargs), lscope), 'lscope', ag__.STD)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/core/function_wrappers.py", line 113, in with_function_scope
    return thunk(scope)
  File "/tmp/__autograph_generated_fileu9gu1w4n.py", line 8, in <lambda>
    tf__lam = lambda arg: ag__.with_function_scope(lambda lscope: ag__.converted_call(fn, (arg,) + tuple(args), dict(**kargs), lscope), 'lscope', ag__.STD)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py", line 352, in converted_call
    return converted_call(
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py", line 439, in converted_call
    result = converted_f(*effective_args, **kwargs)
  File "/tmp/__autograph_generated_filezbhafqmt.py", line 113, in tf__tokenize_impl
    ag__.for_stmt(ag__.converted_call(ag__.ld(features).items, (), None, fscope), None, loop_body, get_state_4, set_state_4, (), {'iterate_names': '(k, v)'})
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/operators/control_flow.py", line 449, in for_stmt
    _py_for_stmt(iter_, extra_test, body, None, None)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/operators/control_flow.py", line 498, in _py_for_stmt
    body(target)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/operators/control_flow.py", line 464, in protected_body
    original_body(protected_iter)
  File "/tmp/__autograph_generated_filezbhafqmt.py", line 105, in loop_body
    ag__.if_stmt(ag__.ld(k) in ag__.ld(output_features), if_body_3, else_body_3, get_state_3, set_state_3, ('v',), 1)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/operators/control_flow.py", line 1341, in if_stmt
    _py_if_stmt(cond, body, orelse)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/operators/control_flow.py", line 1394, in _py_if_stmt
    return body() if cond else orelse()
  File "/tmp/__autograph_generated_filezbhafqmt.py", line 63, in if_body_3
    v = ag__.converted_call(ag__.ld(vocab).encode_tf, (ag__.ld(v),), None, fscope)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py", line 441, in converted_call
    result = converted_f(*effective_args)
  File "/tmp/__autograph_generated_filef9jwq2ra.py", line 13, in tf__encode_tf
    retval_ = ag__.converted_call(ag__.ld(self)._encode_tf, (ag__.ld(s),), None, fscope)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py", line 441, in converted_call
    result = converted_f(*effective_args)
  File "/tmp/__autograph_generated_filezpl5g8b_.py", line 21, in tf___encode_tf
    retval_ = ag__.converted_call(ag__.ld(self).tf_tokenizer.tokenize, (ag__.ld(s),), None, fscope)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py", line 441, in converted_call
    result = converted_f(*effective_args)
  File "/tmp/__autograph_generated_filet9vre1mq.py", line 22, in tf__tokenize
    input_tensor = ag__.converted_call(ag__.ld(ragged_tensor).convert_to_tensor_or_ragged_tensor, (ag__.ld(input),), None, fscope)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py", line 377, in converted_call
    return _call_unconverted(f, args, kwargs, options)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py", line 459, in _call_unconverted
    return f(*args)
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/ops/ragged/ragged_tensor.py", line 2683, in convert_to_tensor_or_ragged_tensor
    return ops.convert_to_tensor_v2_with_dispatch(
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow/python/framework/tensor_util.py", line 441, in make_tensor_proto
    raise ValueError("None values not supported.")
ValueError: in user code:

    File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/utils.py", line 779, in None  *
        lambda arg: fn(arg, *args, **kargs)
    File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/preprocessors.py", line 116, in tokenize_impl  *
        v = vocab.encode_tf(v)
    File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/vocabularies.py", line 114, in encode_tf  *
        return self._encode_tf(s)
    File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/vocabularies.py", line 413, in _encode_tf  *
        return self.tf_tokenizer.tokenize(s)
    File "/home/stephen/anaconda3/lib/python3.9/site-packages/tensorflow_text/python/ops/sentencepiece_tokenizer.py", line 133, in tokenize  *
        input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)

    ValueError: None values not supported.

  In call to configurable 'train' (<function train at 0x7f79d8db2280>)

I further tried the same with ByT5, and the same error occurs. The following is the error produced using ByT5:

ValueError: in user code:

    File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/utils.py", line 779, in None  *
        lambda arg: fn(arg, *args, **kargs)
    File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/preprocessors.py", line 116, in tokenize_impl  *
        v = vocab.encode_tf(v)
    File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/vocabularies.py", line 114, in encode_tf  *
        return self._encode_tf(s)
    File "/home/stephen/anaconda3/lib/python3.9/site-packages/seqio/vocabularies.py", line 555, in _encode_tf  *
        tf_ids = tf.io.decode_raw(s, tf.uint8) + self._num_special_tokens

    ValueError: Tried to convert 'bytes' to a tensor and failed. Error: None values not supported.
StephennFernandes commented 2 years ago

@hwchung27 could you please help me out here?

StephennFernandes commented 2 years ago

@hwchung27 do you think this is occurring because some of my samples are very short, so span corruption masks the entire sentence?

Or is it a case where my samples have blank lines in them?

StephennFernandes commented 2 years ago

@craffel @adarob hey guys, sorry to bother you given your busy schedules, but could you please tell me what the cause of this error might be?

I initially thought this error could be because my dataset has blank-line samples, or perhaps samples with only 5-6 words that the span corruption task couldn't cover. But even after filtering my dataset to keep only sentences with more than 10 words, I still face the same error.

Could you please help me out on this?
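For reference, a minimal sketch of how such filtering could be done inside the seqio pipeline itself (hypothetical; filter_empty_text is not part of my registered task), assuming the pre-tokenization text sits under the "targets" key as a scalar string:

import tensorflow as tf

def filter_empty_text(dataset, min_words=10, text_key="targets"):
    """Drop examples whose text is empty or has fewer than `min_words` words."""
    def _keep(ex):
        words = tf.strings.split(tf.strings.strip(ex[text_key]))
        return tf.size(words) >= min_words
    return dataset.filter(_keep)

Such a function could then be placed in the task's preprocessors list after target_to_key and before seqio.preprocessors.tokenize.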

adarob commented 2 years ago

I don't have time to go through your code, but this appears to be saying that the value of 'input' is None. I would suggest running the steps of the pipeline before tokenize in a notebook (e.g., Colab) and seeing whether your features look as you expect.

Closing this since it's not an actual T5X bug. Feel free to reopen a Discussion topic to solicit help in debugging your issue.
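For example, a rough, untested sketch of such a check, assuming your task.py above is importable as task (it applies only the preprocessors registered before tokenize and prints the raw feature dicts):

import seqio
import task  # registers "hindi_span_curruption" and the other tasks

t = seqio.get_mixture_or_task("hindi_span_curruption")
ds = t.source.get_dataset(split="train", shuffle=False)
# Apply only the preprocessors that run before tokenize.
for prep in t.preprocessors:
    if prep is seqio.preprocessors.tokenize:
        break
    ds = prep(ds)
for ex in ds.take(2):
    print(ex)  # check whether any feature (e.g. 'inputs') is None at this point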

StephennFernandes commented 2 years ago

I have raised a discussion here. @adarob @hwchung27 it would really mean a lot if you could help me out on it.