In [1]: from datashape import discover
In [2]: import numpy as np
In [3]: x = np.array([('Alice', 1), ('Bob', 2)], dtype=[('name', 'O'), ('amt', 'i4')])
# Before
In [4]: discover(x)
Out[4]: dshape("2 * {name: object, amt: int32}")
# After
In [4]: discover(x)
Out[4]: dshape("2 * {name: string, amt: int32}")
This checks the first five values of every array identified as object type. If all five are strings, the column is reported as a string column. This is less conservative than the previous behavior, because only the first five values are inspected.
This checks the first five values of every array identified as `object` type. If all five are strings, the column is reported as a string column. This is less conservative than the previous behavior, because only the first five values are inspected.