JuliaData / JuliaDB.jl

Parallel analytical database in pure Julia
http://juliadb.org/
Other
768 stars 62 forks source link

CustomParser and StrRange problem #271

Open Drvi opened 5 years ago

Drvi commented 5 years ago

Hi, I'm trying to figure out how to make a custom parser in JuliaDB/TextParse and have been stuck for several hours. My goal is to parse UUIDs into UInt128 (my data is quite big and this saves a lot of space). Here is my approach:

using TextParse: Nullable, CustomParser
using JuliaDB
using TextParse
using UUIDs

"""
    _tryparsenext_uuid_digit(str, i)

Classify the character of `str` at iteration state `i` for UUID parsing.

Returns a tuple `(code, next)` where `code` is one of:

- `0:15` — the value of a hexadecimal digit (`0-9`, `a-f` or `A-F`);
- `-1`   — the character is a dash (`-`), which callers skip;
- `-2`   — any other character;
- `-3`   — `iterate(str, i)` returned `nothing` (end of string).

`next` is the state returned by `iterate(str, i)`; at end of string it is `i`
itself, unchanged.
"""
@inline function _tryparsenext_uuid_digit(str, i)
    y = iterate(str, i)
    y === nothing && return (-3, i)  # end of string
    c, ii = y
    if '0' <= c <= '9'
        return c - '0', ii
    elseif 'a' <= c <= 'f'
        return c - 'a' + 10, ii
    elseif 'A' <= c <= 'F'
        # Uppercase hex digits carry the same value as their lowercase forms.
        return c - 'A' + 10, ii
    elseif c == '-'
        return -1, ii  # dash: no digit value, caller skips it
    else
        return -2, ii  # invalid char
    end
end

"""
    _tryparsenext_uuid(str, i)

Try to parse a UUID from `str` starting at index `i` into a `UInt128`.

Characters are read with `_tryparsenext_uuid_digit`. Dashes are skipped
wherever they occur, and hexadecimal digits are accumulated most-significant
first. Parsing stops at the end of the string or at the first character that
is neither a hex digit nor a dash.

Returns `(Nullable{UInt128}(value), next)` when exactly 32 hex digits were
read, where `next` is the index at which parsing stopped. Otherwise returns
`(Nullable{UInt128}(), i)`, i.e. a null result and the original start index.
Empty input and input starting with a dash yield the null result instead of
throwing.
"""
@inline function _tryparsenext_uuid(str, i)::Tuple{Nullable{UInt128}, Int64}
    R = Nullable{UInt128}
    i0 = i
    r = zero(UInt128)
    ndigits = 0

    while true
        d, inext = _tryparsenext_uuid_digit(str, i)
        if d == -3 || d == -2
            # End of string or a non-UUID character terminates the token.
            # `i` is left on the terminating character.
            # NOTE(review): assumes the caller (e.g. TextParse's field
            # handling) validates whatever follows the token - confirm.
            return ndigits == 32 ? (R(r), i) : (R(), i0)
        end
        i = inext
        d == -1 && continue  # skip dashes

        ndigits += 1
        # A 33rd digit cannot fit in 128 bits; reject instead of wrapping.
        ndigits > 32 && return R(), i0
        r = (r << 4) | UInt128(d)
    end
end

julia> _tryparsenext_uuid("wrong!", 1)
(Nullable{UInt128}(), 1)

julia> _tryparsenext_uuid("00000000-0000-0000-0000-000000000001", 1)
(Nullable{UInt128}(0x00000000000000000000000000000001), 37)

MWE:

# Minimal reproduction: write a two-row, tab-separated file and load it with
# a custom UInt128 parser assigned to column 1.
nm1 = tempname()

# TextParse passes `len` and `opts` to the callback; this parser only uses
# the string and the start index.
uuid_parser = CustomParser(UInt128) do str, i, len, opts
    _tryparsenext_uuid(str, i)
end

# `loadtable` throws in this reproduction, so the cleanup lives in `finally`
# to make sure the temporary file is removed either way.
try
    open(nm1, "w") do io
        write(io, "$(UUID(1))\tabc\nabc\t$(UUID(1))")
    end

    JuliaDB.loadtable(nm1,
                      delim='\t',
                      header_exists=false,
                      colparsers = Dict(1=>uuid_parser))

    # MethodError: no method matching TextParse.StrRange(::Int64, ::Int64)
    # pointing at https://github.com/JuliaComputing/TextParse.jl/blob/master/src/csv.jl#L592
    # I tried Changing the StrRange(1, 0) to StrRange(1, 0, 0), then the broadcasting didn't work
    # so I changed that into a forloop and I got the StrRanges in the output table...
finally
    # `force=true` keeps the cleanup from erroring if the file was never created.
    rm(nm1, force=true, recursive=true)
end

I'm on Julia 1.1, JuliaDB 0.12.0, TextParse 0.9.0.

I'd really appreciate some help with this. Let me know if I should provide more information.

davidanthoff commented 5 years ago

@joshday could you move this issue over to TextParse.jl? Clearly a problem there, and that way it won't get lost.