svilupp / PromptingTools.jl

Streamline your life using PromptingTools.jl, the Julia package that simplifies interacting with large language models.
https://svilupp.github.io/PromptingTools.jl/dev/

Rewrite of parser: Julia type<->JSON and back #143

Open svilupp opened 2 months ago

svilupp commented 2 months ago

It would be excellent if anyone wanted to look into improving the current parsing engine for structured extraction!

Currently:

Desired state:

Why a typed JSON schema (i.e., the JSON structure with type names instead of values, all pretty-printed)? For open-source / smaller models, it seems to be a much better option - read more here.
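
For illustration (a sketch of the idea, not a final format), a classic JSON Schema spells out the structure verbosely:

{
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"}
    },
    "required": ["name", "age"]
}

whereas the typed-JSON representation shows the model only the shape it should fill in:

{
    "name": "string",
    "age": "integer"
}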

Why the rewrite? Because the current code is not very reliable and we're desperately missing something like Pydantic in Julia. Moreover, it's WAY TOO COMPLICATED NOW... Look how simple producing a JSON schema can be in Python, and Julia has way better reflection and analysis tools! [image: Python JSON-schema one-liner] (Source: https://github.com/fastai/lm-hackers/blob/main/lm-hackers.ipynb)
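
For comparison, a minimal Julia sketch of the same one-liner using the built-in reflection functions; json_type and schema here are hypothetical helpers, not the package's current API:

# Hypothetical helpers -- not PromptingTools' current API.
json_type(::Type{<:AbstractString}) = "string"
json_type(::Type{<:Integer}) = "integer"
json_type(::Type{<:Real}) = "number"

# Julia's reflection (fieldnames/fieldtypes) makes the flat case a two-liner.
schema(T::Type) = Dict(string(f) => json_type(FT)
    for (f, FT) in zip(fieldnames(T), fieldtypes(T)))

struct Person
    name::String
    age::Int
end

schema(Person)  # Dict("age" => "integer", "name" => "string")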

I'd propose splitting it into a few PRs:

cpfiffer commented 2 months ago

Here's an extremely hacky way to do this. I think I'm going to re-do it, because it relies on some weird metaprogramming shit and there has to be a much cleaner way.

using JSON3

primitive_types = Union{
    Integer, Real, AbstractString, Bool, Nothing, Missing, AbstractArray}

function Base.dump(arg; maxdepth = 10)
    # this is typically used interactively, so default to being in Main (or current active module)
    # mod = get(io, :module, PromptingTools)
    dump(IOContext(stdout), arg; maxdepth = maxdepth)
end

# Monster <: Any
#   name::String
#   age::Int64
#   height::Float64
#   friends::Vector{String}

"""
Convert a Julia type to a typed JSON schema. 

https://github.com/svilupp/PromptingTools.jl/issues/143
https://www.boundaryml.com/blog/type-definition-prompting-baml
"""
function typed_json_schema(x::Type{T}) where {T}
    @info "Typed JSON schema for $T" propertynames(T) fieldnames(T) T.types

    # If there are no fields, return the type
    if isempty(fieldnames(T))
        return to_json_schema(T)
    end

    dump(T)

    # Preallocate a mapping
    mapping = Dict()
    for (type, field) in zip(T.types, fieldnames(T))
        mapping[field] = typed_json_schema(type)
    end

    # Get property names
    return mapping
end

function type2dict(T)
    buffer = IOBuffer()
    # Base.dump only accepts an IOContext, so wrap the buffer
    dump(IOContext(buffer), T)
    dumpstring = String(take!(buffer))

    lines = filter(!isempty, split(dumpstring, "\n"))
    main_type = string(T)
    mapping = Dict()
    for line in lines
        # The index of the first non-space character encodes the nesting depth
        first_nonspace_index = findfirst(!=(' '), line)

        if first_nonspace_index == 1
            # This is the main type, skip it.
            continue
        elseif first_nonspace_index == 3
            # This is a field, add it to the dict. These are formatted as name::Type
            splitted = split(strip(line), "::")
            field_name = splitted[1]
            field_type = splitted[2] |> String
            field_type_expr = Meta.parse(:($field_type)) |> eval

            # Lastly, check if the type is a non-primitive. If so, recursively call type2dict.
            if !(field_type_expr <: primitive_types)
                mapping[field_name] = type2dict(field_type_expr)
            else
                mapping[field_name] = to_json_type(field_type_expr)
            end
        end
    end

    return mapping
end

struct SimpleSingleton
    singleton_value::Int
end
struct Nested
    inside_element::SimpleSingleton
end
struct IntFloatFlat
    int_value::Int
    float_value::Float64
end
struct Monster
    name::String
    age::Int
    height::Float64
    friends::Vector{String}
    nested::Nested
    flat::IntFloatFlat
end
type2dict(Monster) |> JSON3.write |> JSON3.read |> println

which gets you

{
    "nested": {
                 "inside_element": {
                                      "singleton_value": "integer"
                                   }
              },
      "name": "string",
    "height": "number",
      "flat": {
                 "float_value": "number",
                   "int_value": "integer"
              },
       "age": "integer",
   "friends": "string[]"
}

This also required changing the to_json_type stuff:

to_json_type(s::Type{<:AbstractString}) = "string"
to_json_type(n::Type{<:Real}) = "number"
to_json_type(n::Type{<:Integer}) = "integer"
to_json_type(b::Type{Bool}) = "boolean"
to_json_type(t::Type{<:Union{Missing, Nothing}}) = "null"
to_json_type(t::Type{T}) where {T <: AbstractArray} = to_json_type(eltype(t)) * "[]"

Note here that I removed the catch-all fallback of to_json_type(t::Type{<:Any}), since we should be handling these cases strictly. I'm not sure what effect that will have on the rest of the ecosystem here.
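
If a catch-all is still wanted, one option (just a sketch, not current package behavior) is a fallback that fails loudly instead of guessing:

# Hypothetical strict fallback: error on unmapped types rather than guess.
to_json_type(::Type{T}) where {T} = throw(ArgumentError(
    "No JSON type mapping for $T; define `to_json_type` for it or flatten the struct."))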

I'm not sure how we handle array types for custom structs, however. I couldn't find a quick answer in the document linked above.

I.e.

struct ABunchOfVectors
    strings::Vector{String}
    ints::Vector{Int}
    floats::Vector{Float64}
    nested_vector::Vector{Nested}
end

does not have a well-defined type definition. Should this be

{
  "strings": "string[]",
  "ints": "number[]",
  "floats": "number[]",
  "nested_vector": "??????????????????????"
}

Perhaps this should be something like

{
  "nested_vector": [
    {
      "inside_element": {
        "singleton_value": "integer"
      }
    }
  ]
}

hellovai commented 2 months ago

By chance, would you like to get a Julia integration with BAML? Then you would be able to take advantage of not just the type schemas, but also the deserializer we have. It's built in Rust, and we are just about to add support for Ruby (and soon Java).

We found it can be a bit more tricky, especially with nested types. For example, there are scenarios where one wants inline type definitions, and other times when one wants types defined up top.

https://www.github.com/boundaryml/baml

For some context, I am one of the authors of BAML.

cpfiffer commented 2 months ago

Honestly, yes. That sounds cool to me.

svilupp commented 2 months ago

Hi @hellovai , that sounds cool!

@cpfiffer , I had to handle the nested schemas in the Anthropic XML case (it's very similar to the "typed JSON"). I passed it as a List[Object] (or something like that), because in JSON every struct is an object. It did work with strong enough models.

So we can probably just indicate the object structure and that it's a list of them - I'd try brackets around it rather than behind it, because that's how it would look if there was actually data.
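
That is, purely illustratively, something like "friends": ["string"] rather than "friends": "string[]".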

In general, we'll need to give guidance to people to keep their structs as flat as possible.

Btw I think we need to have first-class support for parsing:

  • simple struct (with singletons and/or simple native Julia types)
  • vector of simple structs (ItemsExtract wrapper)
  • maybe wrapper for uncertain extractions (MaybeExtract wrapper)

That should be sufficient for 99% of cases. WDYT?
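
For reference, a sketch of what those wrappers could look like; the field names are illustrative assumptions, not necessarily the package's actual definitions:

# Illustrative sketches of the wrapper types (assumed field names).
struct ItemsExtract{T}
    items::Vector{T}
end

struct MaybeExtract{T}
    result::Union{Nothing, T}        # the extraction, if successful
    error::Bool                      # whether the model flagged a problem
    message::Union{Nothing, String}  # optional explanation from the model
end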

svilupp commented 2 months ago

Shall we create some parsing flavors? struct JSONTypeFlavor <: AbstractParserFlavor end?

to_json_type(::JSONTypeFlavor, s::Type{<:AbstractString}) = "string" ...

So we can support different schemas/parsing engines in parallel?
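
A minimal sketch of that dispatch pattern (the flavor types are as proposed above; schema_for is a hypothetical entry point):

abstract type AbstractParserFlavor end
struct JSONTypeFlavor <: AbstractParserFlavor end

# Each flavor owns its own type mappings...
to_json_type(::JSONTypeFlavor, ::Type{<:AbstractString}) = "string"
to_json_type(::JSONTypeFlavor, ::Type{<:Integer}) = "integer"

# ...and callers thread the flavor through, so engines can coexist.
schema_for(flavor::AbstractParserFlavor, T::Type) =
    Dict(string(f) => to_json_type(flavor, FT)
        for (f, FT) in zip(fieldnames(T), fieldtypes(T)))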

svilupp commented 2 months ago

Btw. clever to use dump! But stuff like this scares me: field_type_expr = Meta.parse(:($field_type)) |> eval

Is there a public interface to the functionality that dump uses? Maybe we could hook into that and just recurse through the object without changing it into a string and back?
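
(Base's fieldnames, fieldtypes, and isstructtype are public reflection functions that operate directly on the type, e.g.:)

fieldnames(Monster)   # (:name, :age, :height, :friends, :nested, :flat)
fieldtypes(Monster)   # (String, Int64, Float64, Vector{String}, Nested, IntFloatFlat)
isstructtype(Nested)  # true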

cpfiffer commented 2 months ago

> So we can probably just indicate the object structure and that it's a list of them - I'd try brackets around it rather than behind it, because that's how it would look if there was actually data.

Alright, sounds reasonable!

> Btw I think we need to have first-class support for parsing:
>
>   • simple struct (with singletons and/or simple native Julia types)
>   • vector of simple structs (ItemsExtract wrapper)
>   • maybe wrapper for uncertain extractions (MaybeExtract wrapper)
>
> That should be sufficient for 99% of cases. WDYT?

Yeah -- agreed. For MaybeExtract, it might be a little complicated to determine whether something is ambiguous, but I suppose this could be what is returned by a fallback function? Unclear to me at the moment.

> Shall we create some parsing flavors? struct JSONTypeFlavor <: AbstractParserFlavor end?

Yes! Great idea.

> Btw. clever to use dump! But stuff like this scares me: field_type_expr = Meta.parse(:($field_type)) |> eval
>
> Is there a public interface to the functionality that dump uses? Maybe we could hook into that and just recurse through the object without changing it into a string and back?

Yeah, I'm not the biggest fan of using dump here. I think there's a way to do this without any of the metaprogramming shit. Let me take a second pass, which should be easier if we restrict to the cases above.

cpfiffer commented 2 months ago

Here is another option for the parser that does not use metaprogramming:

function typed_json_schema(x::Type{T}) where {T}
    @info "Typed JSON schema for $T" propertynames(T) fieldnames(T) T.types

    # If there are no fields, return the type
    if isempty(fieldnames(T))
        # Check if this is a vector type. If so, return the type of the elements.
        if T <: AbstractArray
            # Now check if the element type is a non-primitive. If so, recursively call typed_json_schema.
            if eltype(T) <: primitive_types
                return to_json_type(eltype(T))
            else
                return "List[" * JSON3.write(typed_json_schema(eltype(T))) * "]"
            end
        end

        # Check if the type is a non-primitive.
        if T <: primitive_types
            @info "Type is a primitive: $T"
            return to_json_type(T)
        else
            return typed_json_schema(T)
        end
    end

    # Preallocate a mapping
    mapping = Dict()
    for (type, field) in zip(T.types, fieldnames(T))
        mapping[field] = typed_json_schema(type)
    end

    # Get property names
    return mapping
end

This works pretty well:

struct ABunchOfVectors
    strings::Vector{String}
    ints::Vector{Int}
    floats::Vector{Float64}
    nested_vector::Vector{Nested}
end

res = typed_json_schema(ABunchOfVectors) |> JSON3.write
res |> JSON3.pretty

which prints

{
    "strings": "string",
    "ints": "integer",
    "nested_vector": "List[{\"inside_element\":{\"singleton_value\":\"integer\"}}]",
    "floats": "number"
}

Here I have opted to put the object type inside the list type, but I'm not sure how well that works empirically. An alternative is to put the type on the value side:

function typed_json_schema(x::Type{T}) where {T}
    @info "Typed JSON schema for $T" propertynames(T) fieldnames(T) T.types

    # If there are no fields, return the type
    if isempty(fieldnames(T))
        # Check if this is a vector type. If so, return the type of the elements.
        if T <: AbstractArray
            # Now check if the element type is a non-primitive. If so, recursively call typed_json_schema.
            if eltype(T) <: primitive_types
                return to_json_type(eltype(T))
            else
                return Dict("list[Object]" => JSON3.write(typed_json_schema(eltype(T))))
                # return "List[" * JSON3.write(typed_json_schema(eltype(T))) * "]"
            end
        end

        # Check if the type is a non-primitive.
        if T <: primitive_types
            @info "Type is a primitive: $T"
            return to_json_type(T)
        else
            return typed_json_schema(T)
        end
    end

    # Preallocate a mapping
    mapping = Dict()
    for (type, field) in zip(T.types, fieldnames(T))
        mapping[field] = typed_json_schema(type)
    end

    # Get property names
    return mapping
end

which gives you

{
    "strings": "string",
    "ints": "integer",
    "nested_vector": {
        "list[Object]": "{\"inside_element\":{\"singleton_value\":\"integer\"}}"
    },
    "floats": "number"
}