JuliaData / JSONTables.jl

JSON3.jl + Tables.jl
MIT License
67 stars 10 forks source link

Recovering the input table from a JSONified table #6

Open JockLawrie opened 5 years ago

JockLawrie commented 5 years ago

Hi there,

I have been using some code for transforming a DataFrame to JSON and back again, with the requirement that the de-JSONified DataFrame is an exact copy of the input DataFrame, eltypes and all. I'd like to make this code public, and see that this package has the same purpose but doesn't preserve types. Can we combine our efforts?

My code is below (...credit where it's due, this was written by Josh Bode).

Cheers, Jock

#=
Given `data::DataFrame`:
- Convert it to JSON:      `x = JSON.json(data)`
- Parse it back out again: `data2 = convert(DataFrame, JSON.parse(x))`
- data2 is element-wise equal to data
=#

################################################################################
# Convert a DataFrame to JSON

JSON.lower(x::Enum) = string(x)
JSON.lower(::Missing) = Vector{Union{Missing,Any}}()
JSON.lower(x::Complex) = [real(x), imag(x)]
JSON.lower(x::Set) = collect(x)

JSON.lower(x::DataFrames.DataFrame) = Dict{String, Vector{Any}}(
    "names" => DataFrames.names(x),
    "types" => DataFrames.eltypes(x),
    "columns" => DataFrames.columns(x)
)
JSON.lower(x::DataFrames.SubDataFrame) = JSON.lower(x[:])

################################################################################
# Convert data to a DataFrame, where data is parsed from JSON.
# Some data types need an explicit converter
function Base.convert(::Type{T}, x::AbstractString) where {T <: Union{Date, DateTime}}
    T(x)
end

Base.convert(::Type{Char}, x::AbstractString) = x[1]

function Base.convert(::Type{Set{T}}, x::AbstractVector) where T
    Set{T}(x)
end

function Base.convert(::Type{DataFrame}, x::Dict{String, Any})
    names, types, columns = try
        x["names"], x["types"], x["columns"]
    catch e
        error("Missing data: $(e.key)")
    end
    result = DataFrame()
    for (name, typename, coldata) in zip(names, types, columns)
        T1 = eval(Meta.parse(typename))  # E.g., Union{Missing, Int64}.
        T2 = Missings.T(T1)              # E.g., Int64
        @assert isconcretetype(T2) || T2 === Any "Not a concrete type"
        n = length(coldata)
        colname = Symbol(name)
        result[colname] = Vector{T1}(undef, n)
        for i = 1:n
            val = coldata[i]
            result[i, colname] = val == nothing ? missing : convert(T2, val)
        end
    end
    result
end

Parsers for custom types can be added. For example, here's one for ZonedDateTime.

using TimeZones

function Base.convert(::Type{TimeZones.ZonedDateTime}, x::AbstractString)
    x, tz = x[1:end-6], x[end-5:end]
    ZonedDateTime(DateTime(x), TimeZones.FixedTimeZone(tz))
end
JockLawrie commented 5 years ago

...and the test code

using Utils.jsontables  # the code above

using DataFrames
using Dates
using JSON
using TimeZones

d  = DataFrame(
               a = [1,2,3,missing],
               b = ['a', missing, 'c', 'd'],
               c = rand(4),
               d = Date.(["2018-10-01", "2018-10-02", "2018-10-03", "2010-10-04"])
              )

d[:zdt] = [ZonedDateTime(DateTime(x), TimeZone("Australia/Melbourne")) for x in d[:d]]

s  = JSON.json(d)
j  = JSON.parse(s)
d2 = convert(DataFrame, j)

@test size(d2)    == size(d)
@test names(d2)   == names(d)
@test eltypes(d2) == eltypes(d)

function cell_values_are_all_equal(d1::DataFrame, d2::DataFrame)
    good = true
    n    = size(d1, 1)
    for colname in names(d1)
        for i = 1:n
            val1 = d1[i, colname]
            val2 = d2[i, colname]
            ismissing(val1)  && ismissing(val2)  && continue
            bad  = (ismissing(val1)  && !ismissing(val2)) || (!ismissing(val1) && ismissing(val2)) || (val1 != val2)
            if bad == true
                good = false
                break
            end
        end
        good == false && break
    end
    good
end

@test cell_values_are_all_equal(d, d2)