Initial Implementation of TidierPlots.jl

Here's a potential way to implement ggplot syntax and translate to a plot using AlgebraOfGraphics.jl. The general way it works is to define object types (geom and ggplot) that hold the information required by AlgebraOfGraphics, as well as define an operation that adds them together. The macros translate ggplot code into the appropriate object, and the draw_ggplot function does the final translation to AoG syntax.

Looking for feedback on the general structure - happy to set up the initial repo if this is generally the format we want to use!

using Makie, CairoMakie, AlgebraOfGraphics
using PalmerPenguins, DataFrames

# Load the Palmer Penguins data into a DataFrame, then drop rows that contain missing values.
penguins = PalmerPenguins.load() |> DataFrame |> dropmissing

# A single plot layer in ggplot terms. Instances are produced by the `@geom_*`
# macros and converted to an AlgebraOfGraphics layer by `geom_to_layer`.
struct geom
    # Name of the plot type handed to `visual` (e.g. `:Scatter`); `nothing` adds no visual.
    visual::Union{Symbol, Nothing}
    # Aesthetic name (e.g. "x", "color") => column, as collected from `aes(...)`.
    aes::Dict
    # Every other `name = value` argument of the macro call (e.g. "data", "method").
    args::Dict
    # Callable invoked with no arguments to obtain an analysis layer; `nothing` for none.
    analysis::Any
    # Aesthetic names passed positionally to `mapping`, in this order.
    required_aes::AbstractArray
end

# A whole plot: the layers added so far plus the plot-level defaults they fall back on.
struct ggplot
    # Layers accumulated through `+`, in the order they were added.
    geoms::AbstractArray
    # Aesthetics given to `@ggplot`; used for any aesthetic a geom does not set itself.
    default_aes::Dict
    # Name of the variable holding the dataset; used by geoms that carry no "data" argument.
    data::Symbol
    # Passed as the `axis` keyword of `draw`; `@ggplot` fills it with `height` and `width`.
    axis::NamedTuple
end

"""
    +(x::ggplot, y...)

Return a new `ggplot` whose `geoms` are those of `x` followed by every element of
`y`, in order. The default aesthetics, data and axis settings are taken from `x`
as they are. The `geoms` array of `x` is not modified.
"""
function Base.:+(x::ggplot, y...)::ggplot
    added = [layer for layer in y]
    return ggplot(vcat(x.geoms, added), x.default_aes, x.data, x.axis)
end

"""
    @ggplot(exprs...)

Build the base `ggplot` (with no geoms yet) from the macro arguments.

Aesthetics inside `aes(...)` become the plot-level defaults. `data = name` is
required (a `KeyError` is thrown when it is absent). `height` and `width` are
optional and default to 400 and 600.
"""
macro ggplot(exprs...)
    aes_dict, args_dict = extract_aes(exprs)

    axis_size = (
        height = get(args_dict, "height", 400),
        width = get(args_dict, "width", 600),
    )

    return ggplot([], aes_dict, args_dict["data"], axis_size)
end

# True for `name = value` written either as a macro argument (head `:(=)`) or as a
# keyword argument inside a call (head `:kw`).
_is_assignment(ex) = ex isa Expr && (ex.head === :(=) || ex.head === :kw)

# True only for an actual call of the form `aes(...)`.
_is_aes_call(ex) =
    ex isa Expr && ex.head === :call && !isempty(ex.args) && ex.args[1] === :aes

"""
    extract_aes(geom)

Split the argument expressions of a `@ggplot` / `@geom_*` macro call into
aesthetic mappings and ordinary arguments.

`geom` is an iterable of the expressions passed to the macro. Each `name = column`
pair inside an `aes(...)` call is stored in the first dictionary; every other
top-level `name = value` expression is stored in the second. Anything else
(bare symbols, literals, other calls, non-assignment expressions inside `aes`)
is ignored.

Expressions are classified by their head, so an argument that merely happens to
be named `aes` (`aes = value`) is kept as an ordinary argument, and `aes(log(y))`
does not create a bogus `"log"` aesthetic.

Returns the tuple `(aes_dict, args_dict)` where `aes_dict` is a
`Dict{String, Symbol}` and `args_dict` is a `Dict{String, Any}`.
"""
function extract_aes(geom)
    aes_dict = Dict{String, Symbol}()
    args_dict = Dict{String, Any}()

    for section in geom
        if _is_aes_call(section)
            # args[1] is the function name `aes`; the mappings follow it
            for aes_ex in section.args[2:end]
                if _is_assignment(aes_ex)
                    aes_dict[String(aes_ex.args[1])] = aes_ex.args[2]
                end
            end
        elseif _is_assignment(section)
            args_dict[String(section.args[1])] = section.args[2]
        end
    end

    return (aes_dict, args_dict)
end

"""
    check_aes(required_aes, aes_dict)

Raise an error naming the first entry of `required_aes` that is not a key of
`aes_dict`. Returns `nothing` when every required aesthetic is present.
"""
function check_aes(required_aes, aes_dict)
    absent = Iterators.filter(name -> !haskey(aes_dict, name), required_aes)
    first_absent = iterate(absent)

    if first_absent !== nothing
        aes = first_absent[1]
        error("missing required aesthetic: $aes")
    end

    return nothing
end

"""
    @geom_point(exprs...)

Create a scatter-plot `geom` from the macro arguments: aesthetics come from
`aes(...)`, everything else (for example `data = name`) is stored as an argument.

The geom records `"x"` and `"y"` as required aesthetics but does not enforce them
here, because an aesthetic missing from this call may still be supplied by the
defaults of the `@ggplot` the geom is later added to.
"""
macro geom_point(exprs...)
    aes_dict, args_dict = extract_aes(exprs)
    required_aes = ["x", "y"]

    # visual = :Scatter, no analysis step
    return geom(:Scatter, aes_dict, args_dict, nothing, required_aes)
end

"""
    @geom_smooth(exprs...)

Create a smoothing `geom` from the macro arguments. The analysis is
`AlgebraOfGraphics.linear` when `method = "lm"` is given and
`AlgebraOfGraphics.smooth` otherwise; no visual is attached.

The geom records `"x"` and `"y"` as required aesthetics but does not enforce them
here, because an aesthetic missing from this call may still be supplied by the
defaults of the `@ggplot` the geom is later added to.
"""
macro geom_smooth(exprs...)
    aes_dict, args_dict = extract_aes(exprs)
    required_aes = ["x", "y"]

    uses_lm = get(args_dict, "method", nothing) == "lm"
    analysis = uses_lm ? AlgebraOfGraphics.linear : AlgebraOfGraphics.smooth

    return geom(nothing, aes_dict, args_dict, analysis, required_aes)
end

"""
    geom_to_layer(geom)

Translate one `geom` into an AlgebraOfGraphics layer.

The dataset named by `geom.args["data"]` is looked up with `eval`, and the
required aesthetics are passed positionally to `mapping`. The analysis, the
visual and a `color` mapping are multiplied in, in that order, each only when
the geom provides it.
"""
function geom_to_layer(geom)
    source = data(eval(geom.args["data"]))
    positional = (geom.aes[name] for name in geom.required_aes)
    layer = source * mapping(positional...)

    isnothing(geom.analysis) || (layer *= geom.analysis())
    isnothing(geom.visual) || (layer *= visual(eval(geom.visual)))
    haskey(geom.aes, "color") && (layer *= mapping(color = geom.aes["color"]))

    return layer
end

"""
    draw_ggplot(plot::ggplot)

Translate `plot` to AlgebraOfGraphics and draw it, returning whatever `draw`
returns.

For every geom, the plot-level data and aesthetics are used wherever the geom
does not specify its own. The geoms stored in `plot` are left unmodified, so the
same geom can be reused in another plot with different defaults. Required
aesthetics are checked after the defaults have been applied.

Throws an error when the plot has no geoms, or when a geom is still missing a
required aesthetic after the defaults have been applied.
"""
function draw_ggplot(plot::ggplot)
    isempty(plot.geoms) && error("No geoms supplied")

    layers = map(plot.geoms) do g
        # Later arguments of `merge` win, so geom-level settings override the plot
        # defaults. `merge` allocates new dictionaries instead of writing into `g`.
        merged_aes = merge(plot.default_aes, g.aes)
        merged_args = merge(Dict{String, Any}("data" => plot.data), g.args)

        check_aes(g.required_aes, merged_aes)

        resolved = geom(g.visual, merged_aes, merged_args, g.analysis, g.required_aes)
        return geom_to_layer(resolved)
    end

    # reduce(+, ...) returns the single layer unchanged when there is only one
    return draw(reduce(+, layers); axis = plot.axis)
end

# Example: `color = species` is given once in @ggplot; neither geom sets a color,
# so both fall back to that plot-level default when the plot is drawn.
test_plot = @ggplot(data = penguins, aes(color = species)) + 
    @geom_point(aes(x = bill_length_mm, y = bill_depth_mm)) + 
    @geom_smooth(aes(x = bill_length_mm, y = bill_depth_mm), method = "lm")

# Translate the assembled plot to AlgebraOfGraphics and draw it.
draw_ggplot(test_plot)

download

This is some top-notch wizardry!!! I have a slightly different implementation drafted, but let me look through this as I love the final product. My draft implementation is super early so I'm not wedded to one approach. I'm mainly thinking through what's the lowest level of complexity we can adopt to make this work and behave like ggplot. I also have a "solution" around the need for a draw_plot() function.

Let me play with your code and will plan to start a repo later in May (am traveling for conferences the next 2 weeks).

Awesome! I did attempt some hacks to get around the draw_ggplot function as well, but none of them seemed to work quite right so I'm curious what you came up with :)

@rdboyes, having looked through this, you've done an amazing job! Your version is already more functional than what I had drafted. Your use of dictionaries is really clever and is making me want to go back to Tidier.jl to look at if there are design changes I should consider there.

If you've got bandwidth, I would suggest you commit this repo to TidierOrg as our official starting point for TidierPlots.

Here are a few suggestions I would have us think about which we can address in future updates once this is on GitHub.

Instead of using + as the primary way of adding plots together, we may want to support both pipe and + based approaches. This is because it would make it easier to combine data analysis and plotting workflows all inside of a @chain macro. Hadley Wickham has also said that if he were to go back and re-make ggplot2, he would do it with pipes instead of +. I think it's possible for us to support both workflows. One way we could do this is to add a ggplot struct as the first argument for all geoms, which would allow us to make all the geoms pipeable as long as we sequentially build the ggplot as we go. Then, we could write a +(x::ggplot...) function that loops over the ggplots and sequentially pipes them as a thin wrapper around piping.
Auto-drawing: I think your approach of using draw_ggplot() makes a lot of sense. The way I automated this was to automatically draw the resulting plot as each geom_* function was called. Because my implementation uses a pipe-based workflow, each geom_* call takes the input ggplot as its first argument, which means it has all the information needed to plot the resulting ggplot. So if you have 5 geoms, my approach would result in 5 draw calls. I'm not sure how this would show up in a notebook, but in VSCode, each new plot replaces the older plot so you can't really tell. There are obviously pros and cons to the multiple draw calls (instead of just one at the very end).
Question: do the aesthetics currently propagate? For example, if we specify x and y aesthetics in the @ggplot() call, will they automatically propagate to @geom_point() and @geom_smooth()? I haven't tested this, so I'm not 100% sure. If not, we should try to find a way to fix this. Any given geom should be able to overwrite the aesthetics supplied to @ggplot(), but we should ensure that top-level aesthetics propagate to individual geoms.
We should add support for a top-level @aes() macro which can be used outside of @ggplot() just like in ggplot2 in R.
Support for interpolation: I can help with this later on, but eventually we should support the !! interpolation of columns in TidierPlots using the same underlying mechanism as Tidier.jl. It's not a major priority right now but would be nice to support later on.

Thoughts?

Thanks again, and excited to see this move forward!

Two additional things we should consider:

If we move to a pipe-based approach, we may not need a geom struct at all. This is because each geom_*() call would return a ggplot struct as its return type.
Also, if we move to a pipe-based approach, we may want to add a draw_options element (a NamedTuple?) to the ggplot struct. This is because certain theme elements and facet options need to go into the draw() call. This would allow us to keep track of those elements in the ggplot struct itself.

Thanks for all the feedback! I should have some time this weekend to set up the repo. In the meantime, some responses:

Propagation of aesthetics from the ggplot call does already happen, yes. It happens in the draw_ggplot function right now - here:

    # Excerpt from draw_ggplot: fill each geom's gaps from the plot-level defaults.
    # The geom's own dicts are written to directly, so the inherited values remain
    # on the geom after this loop.
    for geom in plot.geoms
        # if data is not specified at the geom level, use the ggplot default
        if !haskey(geom.args, "data")
            geom.args["data"] = plot.data
        end

        # if an aes isn't given in the geom, use the ggplot aes
        for aes in keys(plot.default_aes)
            if !haskey(geom.aes, aes)
                geom.aes[aes] = plot.default_aes[aes]
            end
        end
    end

However, as written, the code will throw an error if a required aes is not provided directly in the geom_x macro call. This should be a quick fix - I just have to change where the required aes checks are done (when we draw the plot instead of when the geom is called).

Auto-draw I think should be a toggle-able option if implemented the way you're suggesting - more complex ggplots could result in a lot of unnecessary plotting, especially because you'd have to also redraw for calls to scale_*, coords, labs, theme, etc. But I think this works for simple ggplots, which is where you want it to work anyway!

Plus vs. Pipe One thing I like about the plus-based approach with a geom struct is how closely it matches the AoG implementation. It's close enough that we could potentially expose a geom_to_layer() function that would allow you to write something like:

# Sketch: convert a single geom to an AoG layer and combine it with hand-written AoG code.
# NOTE(review): `penguin_bill` is not defined anywhere in this thread - presumably an
# existing AoG layer built from the penguins data; confirm before running.
plt = geom_to_layer(@geom_smooth(aes(x = bill_length_mm, y = bill_depth_mm), method = "lm", data = penguins)) +
    penguin_bill * mapping(color = :species)

The other thing I like about the plus being the primary syntax is that it allows you to define geoms independent of a ggplot call and use them wherever you want. Also, I get that Hadley loves pipes, but a ggplot definition doesn't make immediate sense to me as an "order of operations", which is what I usually think of pipes as representing. It's much more "things combined together", which the plus represents well ... maybe I'm just used to it!

Maybe we can support the pipe syntax using multiple dispatch? For each geom macro, have an alternate signature that looks like:

# Pipe-friendly method for the placeholder `@geom_x` macro.
#
# Macro methods are selected by the type of the *syntax* they receive, not by the
# type of the value it evaluates to, so a `plt::ggplot` annotation can never match a
# piped variable. A bare variable name arrives as a `Symbol`, which distinguishes it
# from `aes(...)` / `name = value` arguments (those arrive as `Expr`).
#
# The macro returns an expression rather than adding at expansion time: `plt` is
# escaped so it refers to the caller's variable, and the remaining arguments are
# forwarded to the ordinary `@geom_x(exprs...)` method.
macro geom_x(plt::Symbol, exprs...)
    return :($(esc(plt)) + @geom_x($(exprs...)))
end

I'm not 100% sure if this works or not - I'd have to test it!

More ggplot fields - yes, definitely. My original plan was to define structs for at least theme and scale, maybe more? And have spots for them in the ggplot struct similar to how the geoms field is meant to hold geoms. Then edit the + function to put them in the right spot conditionally, something like:

# Sketch of `+` for a `ggplot` that also has a `scales` field: each addend is routed
# to the matching collection by its type, and the remaining fields are taken from `x`.
function Base.:+(x::ggplot, y...)::ggplot
    added_geoms = [item for item in y if item isa geom]
    added_scales = [item for item in y if item isa scale]

    return ggplot(
        vcat(x.geoms, added_geoms),
        vcat(x.scales, added_scales),
        x.default_aes,
        x.data,
        x.axis,
    )
end

I agree with everything you said. Let me know if you run into any barriers as you try to create the repo or commit to it in terms of GitHub org/repo permissions.

It exists! I implemented a slightly dubious hack for automatic plotting as well - I think it works, and it only plots once, but it sure feels dicey

TidierOrg / Tidier.jl

Initial Implementation of TidierPlots.jl #97