Open cdeepali opened 3 years ago
Skipping these tests for now in the Feedstock test script.
@cdeepali Is this still an issue?
I am checking these once more.
Yes, I am seeing a number of failures like:
=================================== FAILURES ===================================
__________ TestTokenizationBart.test_batch_encode_dynamic_overflowing __________
self = <tests.test_tokenization_bart.TestTokenizationBart testMethod=test_batch_encode_dynamic_overflowing>
def test_batch_encode_dynamic_overflowing(self):
"""
When calling batch_encode with multiple sequence it can returns different number of
overflowing encoding for each sequence:
[
Sequence 1: [Encoding 1, Encoding 2],
Sequence 2: [Encoding 1],
Sequence 3: [Encoding 1, Encoding 2, ... Encoding N]
]
This needs to be padded so that it can represented as a tensor
"""
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
with self.subTest(
"{} ({}, {})".format(tokenizer.__class__.__name__, pretrained_name, tokenizer.__class__.__name__)
):
if is_torch_available():
returned_tensor = "pt"
elif is_tf_available():
returned_tensor = "tf"
else:
returned_tensor = "jax"
if not tokenizer.pad_token or tokenizer.pad_token_id < 0:
return
tokens = tokenizer.encode_plus(
"HuggingFace is solving NLP one commit at a time",
max_length=6,
padding=True,
truncation=True,
return_tensors=returned_tensor,
> return_overflowing_tokens=True,
)
tests/test_tokenization_common.py:2308:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
src/transformers/tokenization_utils_base.py:2357: in encode_plus
**kwargs,
src/transformers/models/gpt2/tokenization_gpt2_fast.py:173: in _encode_plus
return super()._encode_plus(*args, **kwargs)
src/transformers/tokenization_utils_fast.py:468: in _encode_plus
**kwargs,
src/transformers/models/gpt2/tokenization_gpt2_fast.py:163: in _batch_encode_plus
return super()._batch_encode_plus(*args, **kwargs)
src/transformers/tokenization_utils_fast.py:426: in _batch_encode_plus
return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
src/transformers/tokenization_utils_base.py:203: in __init__
self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = {'input_ids': [[0, 40710, 3923, 34892, 16, 2], [0, 15582, 234, 21992, 65, 2], [0, 6225, 23, 10, 86, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], 'overflow_to_sample_mapping': [0, 0, 0]}
tensor_type = <TensorType.JAX: 'jax'>, prepend_batch_axis = False
def convert_to_tensors(
self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
):
"""
Convert the inner content to tensors.
Args:
tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
The type of tensors to use. If :obj:`str`, should be one of the values of the enum
:class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done.
prepend_batch_axis (:obj:`int`, `optional`, defaults to :obj:`False`):
Whether or not to add the batch dimension during the conversion.
"""
if tensor_type is None:
return self
# Convert to TensorType
if not isinstance(tensor_type, TensorType):
tensor_type = TensorType(tensor_type)
# Get a function reference for the correct framework
if tensor_type == TensorType.TENSORFLOW:
if not is_tf_available():
raise ImportError(
"Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
)
import tensorflow as tf
as_tensor = tf.constant
is_tensor = tf.is_tensor
elif tensor_type == TensorType.PYTORCH:
if not is_torch_available():
raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
import torch
as_tensor = torch.tensor
is_tensor = torch.is_tensor
elif tensor_type == TensorType.JAX:
if not is_flax_available():
> raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.")
E ImportError: Unable to convert output to JAX tensors format, JAX is not installed.
src/transformers/tokenization_utils_base.py:677: ImportError
I have updated https://github.com/open-ce/transformers-feedstock/pull/22/ to skip tests with the name test_batch_encode_dynamic_overflowing
.
test_tokenization tests fail with the following error: