TidierOrg / Tidier.jl

Meta-package for data analysis in Julia, modeled after the R tidyverse.
MIT License
515 stars 14 forks source link

Initial Implementation of TidierPlots.jl #97

Closed rdboyes closed 1 year ago

rdboyes commented 1 year ago

Here's a potential way to implement ggplot syntax and translate to a plot using AlgebraOfGraphics.jl. The general way it works is to define object types (geom and ggplot) that hold the information required by AlgebraOfGraphics, as well as define an operation that adds them together. The macros translate ggplot code into the appropriate object, and the draw_ggplot function does the final translation to AoG syntax.

Looking for feedback on the general structure - happy to set up the initial repo if this is generally the format we want to use!

using Makie, CairoMakie, AlgebraOfGraphics
using PalmerPenguins, DataFrames

# Load the Palmer Penguins dataset into a DataFrame and drop the rows that
# contain missing values. The macros below refer to this table by name.
penguins = dropmissing(DataFrame(PalmerPenguins.load()))

# Description of a single plot layer, built by the `@geom_*` macros and turned
# into an AlgebraOfGraphics layer by `geom_to_layer`.
struct geom
    # Name of the visual that `geom_to_layer` evaluates and passes to
    # `visual(...)` (e.g. `:Scatter`), or `nothing` when no visual is applied.
    visual::Union{Symbol, Nothing}
    # Aesthetic name (e.g. "x", "y", "color") => column symbol.
    aes::Dict
    # Remaining non-aesthetic arguments by name (e.g. "data", "method").
    args::Dict
    # Zero-argument callable producing an analysis layer (e.g.
    # `AlgebraOfGraphics.linear`), or `nothing` when no analysis is applied.
    analysis::Any
    # Aesthetic names, in the order they are passed positionally to `mapping`.
    required_aes::AbstractArray
end

# Description of a whole plot: its layers plus the defaults that layers fall
# back on when they do not specify their own data or aesthetics.
struct ggplot
    # Layers added with `+`; starts empty when created by `@ggplot`.
    geoms::AbstractArray
    # Aesthetic name => column symbol, used for any aesthetic a geom omits.
    default_aes::Dict
    # Name of the variable holding the dataset; used for geoms without "data".
    data::Symbol
    # Passed as the `axis` keyword to `draw`; `@ggplot` fills in `height` and
    # `width`.
    axis::NamedTuple
end

"""
    +(x::ggplot, y...) -> ggplot

Return a new `ggplot` whose `geoms` are those of `x` followed by every operand
in `y`. The default aesthetics, data and axis settings are carried over from
`x` unchanged, and `x` itself is not modified.

Julia parses `a + b + c` as the single call `+(a, b, c)`, which is why the
right-hand operands are accepted as varargs.
"""
function Base.:+(x::ggplot, y...)::ggplot
    appended = [layer for layer in y]
    combined = vcat(x.geoms, appended)

    return ggplot(combined, x.default_aes, x.data, x.axis)
end

"""
    @ggplot(data = name, aes(...), [height = 400], [width = 600])

Build a `ggplot` with no geoms from the macro arguments.

The `aes(...)` call supplies the default aesthetics, `data` names the default
dataset, and `height` / `width` (defaulting to 400 and 600) are stored in the
`axis` named tuple.

Throws an `ArgumentError` when no `data` argument is given.
"""
macro ggplot(exprs...)
    aes_dict, args_dict = extract_aes(exprs)

    # The `ggplot` struct needs a dataset name; fail with an explicit message
    # rather than a bare KeyError on the lookup below.
    haskey(args_dict, "data") ||
        throw(ArgumentError("@ggplot requires a `data = ...` argument"))

    height = get(args_dict, "height", 400)
    width = get(args_dict, "width", 600)

    return ggplot([], aes_dict, args_dict["data"], (height = height, width = width))
end

"""
    extract_aes(geom) -> (aes_dict, args_dict)

Split the expressions passed to a plotting macro into aesthetics and ordinary
arguments.

`geom` is an iterable of macro arguments. Items that are not `Expr`s are
ignored. An `aes(...)` call contributes one `name => value` entry to `aes_dict`
for each expression argument it contains; any other expression is stored in
`args_dict` under the name of its first argument, with its second argument as
the value.
"""
function extract_aes(geom)
    aes_dict = Dict{String, Symbol}()
    args_dict = Dict{String, Any}()

    for section in geom
        section isa Expr || continue

        head = section.args[1]
        if head == :aes
            # every `name = column` pair inside aes(...) is itself an Expr
            for pair_ex in Iterators.filter(item -> item isa Expr, section.args)
                aes_dict[String(pair_ex.args[1])] = pair_ex.args[2]
            end
        else
            # anything else is a generic `name = value` argument
            args_dict[String(head)] = section.args[2]
        end
    end

    return (aes_dict, args_dict)
end

"""
    check_aes(required_aes, aes_dict)

Raise an error naming the first entry of `required_aes` that is not a key of
`aes_dict`. Returns `nothing` when every required aesthetic is present.
"""
function check_aes(required_aes, aes_dict)
    for name in required_aes
        haskey(aes_dict, name) || error("missing required aesthetic: $name")
    end

    return nothing
end

"""
    @geom_point(aes(...), args...)

Build a scatter-plot `geom` (visual `:Scatter`, no analysis) from the macro
arguments.

The required aesthetics `"x"` and `"y"` are recorded on the geom but are not
checked here: a geom may omit them and inherit them from the default
aesthetics of the `ggplot` it is added to, so validating at macro-expansion
time would reject valid plots.
"""
macro geom_point(exprs...)
    aes_dict, args_dict = extract_aes(exprs)

    return geom(:Scatter, aes_dict, args_dict, nothing, ["x", "y"])
end

"""
    @geom_smooth(aes(...), [method = "lm"], args...)

Build a smoothing `geom` with no visual. The analysis is
`AlgebraOfGraphics.linear` when `method = "lm"` is given and
`AlgebraOfGraphics.smooth` otherwise.

The required aesthetics `"x"` and `"y"` are recorded on the geom but are not
checked here: a geom may omit them and inherit them from the default
aesthetics of the `ggplot` it is added to, so validating at macro-expansion
time would reject valid plots.
"""
macro geom_smooth(exprs...)
    aes_dict, args_dict = extract_aes(exprs)

    analysis = if get(args_dict, "method", nothing) == "lm"
        AlgebraOfGraphics.linear
    else
        AlgebraOfGraphics.smooth
    end

    return geom(nothing, aes_dict, args_dict, analysis, ["x", "y"])
end

"""
    geom_to_layer(geom)

Translate a `geom` into an AlgebraOfGraphics layer.

The layer is the product of the geom's data, a positional `mapping` of its
required aesthetics (in `required_aes` order), its analysis and visual when
present, and a `color` mapping when a `"color"` aesthetic is set.

Raises an error when a required aesthetic is missing, and an `ArgumentError`
when the geom has no `"data"` argument.
"""
function geom_to_layer(geom)
    # Validate up front so a missing aesthetic or dataset is reported by name
    # instead of as a bare KeyError from the lookups below.
    check_aes(geom.required_aes, geom.aes)
    haskey(geom.args, "data") ||
        throw(ArgumentError("geom has no \"data\" argument to plot"))

    positional = [geom.aes[key] for key in geom.required_aes]

    # NOTE(review): `eval` resolves the stored symbol in the global scope of
    # the module that defines this function, so the dataset (and the visual
    # below) must be reachable from there.
    layer = data(eval(geom.args["data"])) * mapping(positional...)

    if !isnothing(geom.analysis)
        layer *= geom.analysis()
    end

    if !isnothing(geom.visual)
        layer *= visual(eval(geom.visual))
    end

    if haskey(geom.aes, "color")
        layer *= mapping(color = geom.aes["color"])
    end

    return layer
end

"""
    draw_ggplot(plot::ggplot)

Convert every geom of `plot` to an AlgebraOfGraphics layer, sum the layers and
draw them with `plot.axis` as the axis settings. Returns whatever `draw`
returns.

A geom without a `"data"` argument uses `plot.data`, and any aesthetic a geom
does not set is taken from `plot.default_aes`. The defaults are merged into
copies: the geoms stored in `plot` are left untouched, so a geom that is reused
in another plot does not keep the data or aesthetics of the first plot it was
drawn with.

Raises an error when `plot` has no geoms.
"""
function draw_ggplot(plot::ggplot)
    isempty(plot.geoms) && error("No geoms supplied")

    layers = map(plot.geoms) do g
        # if data is not specified at the geom level, use the ggplot default
        args = copy(g.args)
        get!(args, "data", plot.data)

        # geom-level aesthetics take precedence over the ggplot defaults
        aes = merge(plot.default_aes, g.aes)

        geom_to_layer(geom(g.visual, aes, args, g.analysis, g.required_aes))
    end

    # reduce handles the single-layer case without needing a unary `+`
    return draw(reduce(+, layers); axis = plot.axis)
end

# Example: describe a plot of the penguins data coloured by species, with a
# scatter layer and a linear-fit layer that both map bill length to x and bill
# depth to y. Nothing is drawn yet; this only builds the `ggplot` description.
test_plot = @ggplot(data = penguins, aes(color = species)) + 
    @geom_point(aes(x = bill_length_mm, y = bill_depth_mm)) + 
    @geom_smooth(aes(x = bill_length_mm, y = bill_depth_mm), method = "lm")

# Translate the description to AlgebraOfGraphics layers and draw it.
draw_ggplot(test_plot)

download

kdpsingh commented 1 year ago

This is some top-notch wizardry!!! I have a slightly different implementation drafted, but let me look through this as I love the final product. My draft implementation is super early so I'm not wedded to one approach. I'm mainly thinking through what's the lowest level of complexity we can adopt to make this work and behave like ggplot. I also have a "solution" around the need for a draw_plot() function.

Let me play with your code and will plan to start a repo later in May (am traveling for conferences the next 2 weeks).

rdboyes commented 1 year ago

Awesome! I did attempt some hacks to get around the draw_ggplot function as well, but none of them seemed to work quite right so I'm curious what you came up with :)

kdpsingh commented 1 year ago

@rdboyes, having looked through this, you've done an amazing job! Your version is already more functional than what I had drafted. Your use of dictionaries is really clever and is making me want to go back to Tidier.jl to look at if there are design changes I should consider there.

If you've got bandwidth, I would suggest you commit this repo to TidierOrg as our official starting point for TidierPlots.

Here are a few suggestions I would have us think about which we can address in future updates once this is on GitHub.

Thoughts?

Thanks again, and excited to see this move forward!

kdpsingh commented 1 year ago

Two additional things we should consider:

rdboyes commented 1 year ago

Thanks for all the feedback! I should have some time this weekend to set up the repo. In the meantime, some responses:

Propagation of aesthetics from the ggplot call does already happen, yes. It happens in the draw_ggplot function right now - here:

    # Excerpt from draw_ggplot: fill in each geom's missing data and
    # aesthetics from the enclosing ggplot before the layers are built.
    for geom in plot.geoms
        # if data is not specified at the geom level, use the ggplot default
        if !haskey(geom.args, "data")
            geom.args["data"] = plot.data
        end

        # if an aes isn't given in the geom, use the ggplot aes
        for aes in keys(plot.default_aes)
            if !haskey(geom.aes, aes)
                geom.aes[aes] = plot.default_aes[aes]
            end
        end
    end

However, as written, the code will throw an error if a required aes is not provided directly in the geom_x macro call. This should be a quick fix - I just have to change where the required aes checks are done (when we draw the plot instead of when the geom is called).

Auto-draw I think should be a toggle-able option if implemented the way you're suggesting - more complex ggplots could result in a lot of unnecessary plotting, especially because you'd have to also redraw for calls to scale_*, coords, labs, theme, etc. But I think this works for simple ggplots, which is where you want it to work anyway!

Plus vs. Pipe - One thing I like about the plus-based approach with a geom struct is how closely it matches the AoG implementation. It's close enough that we could potentially expose a geom_to_layer() function that would allow you to write something like:

# Hypothetical usage: convert a geom (with its own data) to a layer and add it
# to another layer expression.
# NOTE(review): `penguin_bill` is not defined in this snippet - presumably an
# existing AlgebraOfGraphics layer; confirm.
plt = geom_to_layer(@geom_smooth(aes(x = bill_length_mm, y = bill_depth_mm), method = "lm", data = penguins)) +
    penguin_bill * mapping(color = :species)

The other thing I like about the plus being the primary syntax is that it allows you to define geoms independently of a ggplot call and use them wherever you want. Also, I get that Hadley loves pipes, but a ggplot definition doesn't make immediate sense to me as an "order of operations" like I usually think of pipes representing. It's much more "things combined together" which the plus represents well ... maybe I'm just used to it!

Maybe we can support the pipe syntax using multiple dispatch? For each geom macro, have an alternate signature that looks like:

# Sketch of a pipe-friendly method for a geom macro: take an existing plot as
# the first argument and return it with the new geom added.
# NOTE(review): macro methods are selected by the types of the syntax they
# receive (Expr, Symbol, literals), not by the runtime values those expressions
# evaluate to, so the `plt::ggplot` annotation may never match a plot passed
# by variable name - confirm before relying on this.
macro geom_x(plt::ggplot, exprs...)
    return plt + @geom_x(exprs...)
end

I'm not 100% sure if this works or not - I'd have to test it!

More ggplot fields - yes, definitely. My original plan was to define structs for at least theme and scale, maybe more? And have spots for them in the ggplot struct similar to how the geoms field is meant to hold geoms. Then edit the :+ function to put them in the right spot conditionally, something like:

# Proposed `+` for a ggplot that also carries scales: each right-hand operand
# is routed to the field matching its type. Operands that are neither a `geom`
# nor a `scale` are picked up by neither filter and so are not added.
# NOTE(review): relies on a `scale` type and a `scales` field on `ggplot`
# that are not defined in this snippet.
function Base.:+(x::ggplot, y...)::ggplot
    added_geoms = [item for item in y if item isa geom]
    added_scales = [item for item in y if item isa scale]

    return ggplot(
        vcat(x.geoms, added_geoms),
        vcat(x.scales, added_scales),
        x.default_aes,
        x.data,
        x.axis,
    )
end
kdpsingh commented 1 year ago

I agree with everything you said. Let me know if you run into any barriers as you try to create the repo or commit to it in terms of GitHub org/repo permissions.

rdboyes commented 1 year ago

It exists! I implemented a slightly dubious hack for automatic plotting as well - I think it works, and it only plots once, but it sure feels dicey