tensorflow / io

Dataset, streaming, and file system extensions maintained by TensorFlow SIG-IO
Apache License 2.0
706 stars 287 forks source link

tfio.IOTensor.from_json cannot decode strings #1610

Open perretv opened 2 years ago

perretv commented 2 years ago

It seems that tfio.IOTensor.from_json is currently unable to decode JSON files that contain string values. The error can be reproduced with the following code:

import json
import sys
import tempfile
import tensorflow as tf
import tensorflow_io as tfio

# Report the versions in use so the failure can be matched to a release.
print(f"tensorflow={tf.__version__}")
print(f"tensorflow-io={tfio.__version__}")
print(sys.version)

# Build a JSON fixture holding one int, one float, one bool and one string.
# NOTE: this variable shadows the `tempfile` module; the name is kept as-is
# because the traceback reported for this script quotes it.
tempfile = tempfile.NamedTemporaryFile(suffix=".json")
data = {"key1": 1, "key2": 2.0, "key3": True, "key4": "text"}
# Close the write handle explicitly so the JSON is flushed to disk before
# it is read back, instead of relying on garbage collection to close it.
with open(tempfile.name, "w") as json_file:
    json.dump(data, json_file)
# Open the file as an IOTensor and read each column back; the read of the
# string column ("key4") is the call that raises in the report.
io_tensor = tfio.IOTensor.from_json(tempfile.name)
for key in data:
    key_tensor = io_tensor(key).to_tensor()
    assert key_tensor.numpy() == data[key]
    print(f"{key} with dtype {key_tensor.dtype} parsed successfully")

The raised error looks like the following:

tensorflow=2.7.0
tensorflow-io=0.23.1
3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:20:46)
[GCC 9.4.0]
key1 with dtype <dtype: 'int64'> parsed successfully
key2 with dtype <dtype: 'float64'> parsed successfully
key3 with dtype <dtype: 'bool'> parsed successfully
2022-01-14 13:40:33.650567: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at io_interface.h:252 : INVALID_ARGUMENT: data type is not supported: string
---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
<ipython-input-2-cf9e9158dd2f> in <module>
     15 io_tensor = tfio.IOTensor.from_json(tempfile.name)
     16 for key in data:
---> 17     key_tensor = io_tensor(key).to_tensor()
     18     assert key_tensor.numpy() == data[key]
     19     print(f"{key} with dtype {key_tensor.dtype} parsed successfully")

~/miniforge3/lib/python3.9/site-packages/tensorflow_io/python/ops/io_tensor_ops.py in to_tensor(self, **kwargs)
    296         """
    297         with tf.name_scope(kwargs.get("name", "IOToTensor")):
--> 298             return self.__getitem__(slice(None, None))
    299
    300

~/miniforge3/lib/python3.9/site-packages/tensorflow_io/python/ops/io_tensor_ops.py in __getitem__(self, key)
    231         # based on python slice()'s indices method:
    232         index = key if isinstance(key, slice) else slice(key, key + 1)
--> 233         items = self._function(start=index.start, stop=index.stop)
    234         return tf.squeeze(items, axis=[0]) if items.shape[0] == 1 else items
    235

~/miniforge3/lib/python3.9/site-packages/tensorflow_io/python/ops/io_tensor_ops.py in __call__(self, start, stop)
     35     def __call__(self, start, stop):
     36         start, stop, _ = slice(start, stop).indices(self._length)
---> 37         return self._function(
     38             self._resource,
     39             start=start,

<string> in io_json_readable_read(input, start, stop, component, shape, dtype, name)

~/miniforge3/lib/python3.9/site-packages/tensorflow/python/framework/ops.py in raise_from_not_ok_status(e, name)
   7105 def raise_from_not_ok_status(e, name):
   7106   e.message += (" name: " + name if name is not None else "")
-> 7107   raise core._status_to_exception(e) from None  # pylint: disable=protected-access
   7108
   7109

InvalidArgumentError: data type is not supported: string [Op:IO>JSONReadableRead]

We can see that the int, float and bool dtypes are decoded successfully; however, tfio fails when it encounters a str value in the JSON file.

perretv commented 2 years ago

@yongtang you seem to be knowledgeable on the subject :)

perretv commented 2 years ago

Note that using tfio.experimental.serialization.decode_json works around the problem:

import json
import sys
import tempfile
import tensorflow as tf
import tensorflow_io as tfio

# Report the versions in use so the result can be matched to a release.
print(f"tensorflow={tf.__version__}")
print(f"tensorflow-io={tfio.__version__}")
print(sys.version)

# Build a JSON fixture holding one int, one float, one bool and one string.
# The handle gets its own name so the `tempfile` module is not shadowed.
tmp_file = tempfile.NamedTemporaryFile(suffix=".json")
data = {"key1": 1, "key2": 2.0, "key3": True, "key4": "text"}
# Close the write handle explicitly so the JSON is flushed to disk before
# it is read back.
with open(tmp_file.name, "w") as json_file:
    json.dump(data, json_file)

# IOTensor.from_json is used here only for its column names and specs;
# the actual decoding goes through decode_json on the raw file contents.
io_tensor = tfio.IOTensor.from_json(tmp_file.name)
spec_dict = dict(zip(io_tensor.columns, io_tensor.spec))
json_tensor = tfio.experimental.serialization.decode_json(
    tf.io.read_file(tmp_file.name), spec_dict
)

for key, value in data.items():
    # String values are compared against their encoded bytes. Computing the
    # expected value first keeps the conditional out of the assert: written
    # inline, `assert a == b if cond else c` parses as
    # `assert ((a == b) if cond else c)`, which for non-string values only
    # checked that the value was truthy.
    expected = value.encode() if isinstance(value, str) else value
    assert json_tensor[key].numpy() == expected
    print(f"{key} with dtype {json_tensor[key].dtype} parsed successfully")

returns:

tensorflow=2.7.0
tensorflow-io=0.23.1
3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:20:46)
[GCC 9.4.0]
key1 with dtype <dtype: 'int64'> parsed successfully
key2 with dtype <dtype: 'float64'> parsed successfully
key3 with dtype <dtype: 'bool'> parsed successfully
key4 with dtype <dtype: 'string'> parsed successfully