absinthe-graphql / absinthe_ecto

DEPRECATED: Use dataloader
MIT License
130 stars 36 forks source link

Ecto.Schema resolution proposal #4

Open redrabbit opened 8 years ago

redrabbit commented 8 years ago

Hi, I've been using Absinthe and Absinthe.Relay intensively for a private project and wrote a few helper functions in order to resolve GraphQL queries into Ecto queries.

I've put the code here: https://gist.github.com/redrabbit/be3528a4e4479886acbe648a693e65c0

I don't know if Absinthe.Ecto is going to be implemented in a similar manner. If there is interest, I can write a PR and add some tests to it.

Basically, it uses Ecto.Schema and Absinthe.Resolution to resolve pretty much every kind of GraphQL query I'd came across so far.

There are two distinct functions, build_query/3 and batch_query/5.

By default, build_query/3 automatically resolves :join, :preload and :select clauses. It tries to be as performant as possible and selects only required fields.

Setup

Let's say we have schemas representing a music library:

schema "artists" do
  field :name, :string
  has_many :albums, Album
  has_many :tracks, Track
  timestamps
end

schema "albums" do
  field :title, :string
  field :release_date, Ecto.Date
  belongs_to :artist, Artist
  many_to_many :genres, Genre, join_through: "albums_genres"
  has_many :tracks, Track
  timestamps
end

schema "genres" do
  field :name, :string
  many_to_many :albums, Album, join_through: "albums_genres"
  timestamps
end

schema "tracks" do
  field :title, :string
  field :track_nr, :integer
  field :duration, :integer
  field :popularity, :integer
  belongs_to :artist, Artist
  belongs_to :album, Album
  timestamps
end

And the following matching GraphQL types:

object :artist do
  field :id, :id
  field :name, :string
  field :albums, list_of(:album)
  field :tracks, list_of(:track) do
    arg :top, :integer, default_value: 10
    resolve &resolve_artist_top_tracks/3
  end
end

object :album do
  field :id, :id
  field :title, :string
  field :release_date, :date
  field :artist, :artist
  field :tracks, list_of(:track)
  field :genres, list_of(:genre)
end

object :genre do
  field :id, :id
  field :name, :string
  field :albums, list_of(:album)
end

object :track do
  field :id, :id
  field :title, :string
  field :artist, :artist
  field :album, :album
  field :track_nr, :integer
  field :duration, :integer
  field :popularity, :integer
end

Usage

Following GraphQL query:

{
  artists {
    name
    albums {
      title
      tracks {
        title
      }
      genres {
        name
      }
    }
  }
}

Returns a single Ecto.Query:

#Ecto.Query<from a0 in Artist,
           join: a1 in assoc(a0, :albums),
           join: g in assoc(a1, :genres),
           join: t in assoc(a1, :tracks),
         select: [:id, :name, {:albums, [:id, :title, {:genres, [:id, :name]}, {:tracks, [:id, :title]}]}],
        preload: [albums: {a1, [genres: g, tracks: t]}]>

It handles :belongs_to, :has_one, :has_many and :many_to_many associations and will try to join and preload as much as possible. It also support resolving cyclic graphs.

Writing resolvers is straight forward:

 def resolve_artists(_args, info) do
  query = build_query(Artist, info, &order_by(&1, [asc: :name]))
  {:ok, Repo.all(query)}
end

def resolve_artist_by_id(%{id: id}, info) do
  id = String.to_integer(id)
  query = build_query(Artist, info, &where(&1, [id: ^id]))
  case Repo.one(query) do
    nil ->
      {:error, "Cannot find artist with id #{id}."}
    artist ->
      {:ok, artist}
  end
end

In order to support batching and avoid N+1 queries, batch_query/5 provides similar functionalities and resolve the given associations automatically. For example:

  object :artist do
    field :id, :id
    field :name, :string
    field :albums, list_of(:album)
    field :tracks, list_of(:track) do
      arg :top, :integer, default_value: 10
      resolve fn artist, args, info ->
        batch_query(Track, artist, info, &Repo.all/1, fn query ->
            query
            |> order_by([desc: :popularity])
            |> limit(^min(Map.get(args, :top, 10), 100))
        end)
      end
    end
  end

In the above object, the :albums field is resolved automatically within the query using a :join. The :tracks field will require a 2nd query (using where: [artist_id: id] or where: a1.artist_id in ^ids).

Resulting in executing two SQL queries for the following GraphQL:

{
  artists {
    name
    tracks(top:5) {
      title,
      duration
    }
    albums {
      title
      tracks {
        title
      }
      genres {
        name
      }
    }
  }
}
2016-11-26 02:57:28.093 [debug] QUERY OK source="artists" db=2.1ms
SELECT a0."id", a0."name", a1."id", a1."title", t2."id", t2."title", g3."id", g3."name" FROM "artists" AS a0 INNER JOIN "albums" AS a1 ON a1."artist_id" = a0."id" INNER JOIN "tracks" AS t2 ON t2."album_id" = a1."id" INNER JOIN "albums_genres" AS a4 ON a4."album_id" = a1."id" INNER JOIN "genres" AS g3 ON a4."genre_id" = g3."id" ORDER BY a0."name" []
2016-11-26 02:57:28.095 [debug] QUERY OK source="tracks" db=1.6ms
SELECT t0."artist_id", t0."id", t0."title", t0."duration" FROM "tracks" AS t0 WHERE (t0."artist_id" = $1) ORDER BY t0."popularity" DESC LIMIT $2 [1, 5]

Customization

You can customize your queries using the optional transform parameter both functions provide.

For example, to fetch albums sorted by :release_date and album tracks sorted by :track_nr, one can write two new resolver functions:

def resolve_artist_albums(artist, _args, info) do
  batch_query(Album, artist, info, &order_by(&1, [desc: :release_date]))
end

def resolve_album_tracks(album, _args, info) do
  batch_query(Track, album, info, &order_by(&1, [asc: :track_nr]))
end

And update the schema types like this:

object :artist do
  field :albums, list_of(:album), do: resolve &resolve_artist_albums/3
end

object :album do
  field :tracks, list_of(:track), do: resolve &resolve_album_tracks/3  
end
bruce commented 8 years ago

Thanks for this -- very thorough, great work! I love that you dug into field selection as well. @benwilson512 and I will give this a better look-over soon. My guess is that we'll push you to PR, with some feedback.

redrabbit commented 8 years ago

As I already wrote the code in a very generic way for my own needs, I thought it my help this project as well. Even if it just brings some ideas.

There a certainly a few concerns which might be relevant thought.

  1. Obviously, it uses Ecto.Schema to resolve fields. This means that the GraphQL objects must use the exact same field names as well. It would be nice if we could alias fields in order to differ from the original Ecto field names. Maybe this is something we can build on top of this.

  2. The batch_query/5 function resolves the association field using :has_one/:has_many (see here). For example, if we want to batch over the artist's albums or tracks, it currently requires the Artist schema to provide following fields: has_many(:albums, Album) and has_many(:tracks, Track). It might be a better solution to use :belongs_to to resolve those associations. Note that the batch_query/5 function also provide a way to resolve the assoc manually (see here), passing the 2nd as a tuple:

def resolve_artist_albums(artist, _args, info) do
  # this:
  batch_query(Album, artist, info, &order_by(&1, [desc: :release_date]))
  # equals that:
  batch_query(Album, {:artist_id, artist.id}, info, &order_by(&1, [desc: :release_date]))
end

I'm note sure how 1. is relevant. Maybe it's ok to use the same Ecto and Absinthe schema and field names. I can imagine Absinthe.Ecto providing a very high-level integration will Ecto:

defmodule MyApp.GraphQL.Schema.Types do
  use Absinthe.Ecto.Schema.Notation

  alias MyApp.Ecto.Schema.{Artist, Album, Track}

  # create an object from the given Ecto.Schema
  object :artist, ecto_schema: Artist

  # same but exporting only specific fields
  object :artist, ecto_schema: Artist, only: [:id, :name, :albums]

  # same but using except instead of only
  object :artist, ecto_schema: Artist, except: [:tracks]

  # same but allows to define specific resolvers
  object :artist, ecto_schema: Artist, except: [:tracks] do
    # defines resolver for :albums
    field :albums, do: resolve &resolve_artist_albums/3
    # other fields are resolved automatically like in the above examples 
  end
end
benwilson512 commented 8 years ago

This is a seriously impressive amount of work. There's a lot going on here and I'd like the chance to discuss it with you in a slightly more flexible medium. Do you have ScreenHero or Skype? You can find me on SH with my github handle [at] gmail [dot] com, and skype with just the same name as my github handle.

There's some philosophical differences between this approach and the one I had been targeting, but this is a ton of valuable capability, so I want to see where we can go with this.

tlvenn commented 8 years ago

ya great work @redrabbit ! Projecting only selected fields is great but it also has some challenges. There are 2 use cases where we will probably need something to load some fields even though they are not directly required by the graphql query:

@benwilson512 did you have a chance to talk with @redrabbit already ?

tlvenn commented 8 years ago

Both above points could probably be easily solved using the new meta facility.

redrabbit commented 8 years ago

I had the pleasure to have a very interesting discussion with @benwilson512 a few days ago.

I think the proposed code is working quiet well for what it is meant to do, mapping GraphQL queries into Ecto queries if their schemas match exactly. The problem is that currently, it is a all-or-nothing solution.

If your GraphQL schema deviates from it Ecto.Schema counterpart, the functions will not be able to resolve the query correctly.

For example, let's say your Ecto.Schema has a :first_name and :last_name field. It would be nice to have the possibility to define a :name field on the GraphQL object which internally will fetch both first and last name and concat them. Doing so would also provide a way to define aliases, for example when the name of the GraphQL object field derives from the Ecto.Schema field.

@tlvenn, adding :select fields should be quiet easy. If you have a look at the code, you will see:

  def build_query(schema, info, transform \\ nil) do
    # Resolves fields using info,
    # basically, a shortcut for:
    # fields = info.definition.fields
    fields = resolve_fields(info)

    # Resolves select statement using Ecto.Schema and fields,
    # this is the :select statement from Ecto.Query.
    # It also fetches fields required foreign keys
    # in order to resolve associations.
    select = resolve_fields(schema, fields)

    # Resolves associations using Ecto.Schema and fields,
    # this is a bit more complicated than the :select statement above.
    # It handles :has_one, :has_many, :belongs_to and :many_to_many.
    assocs = resolve_assocs(schema, fields)

    # Creates the Ecto.Query using the given Ecto.Schema
    # and the above :select and :assoc statements.
    # The optional transform parameter is a function
    # used to transform/compose the final query.
    new_query(schema, select, assocs, transform)
  end

We could simply pass an extra select parameter for additional select statements...

I really like how Ecto is really explicit in contrast to other ORMs (say Django or ActiveRecord) which do a lot of magic behind the scene. Having Ecto.Schema and Ecto.Changeset decoupled from the database schema itself is also a great thing. I also really like how preloading and joining associations is a task left to the developer. My code has some of this magic stuff going on behind the scene, I think the Absinthe core team will have to find the right amount of things that can be handled automatically without sacrificing flexibility.

I've posted the code in order to bring some ideas to the table. Maybe Absinthe.Ecto will build on top of it or only extract some of it. Maybe none of it will be used and they will take a completely different approach. Maybe Absinthe.Ecto will go the more explicit way and this code should live in a different library though. Personally, I'm ok will all of this 😄.

tlvenn commented 8 years ago

For example, let's say your Ecto.Schema has a :first_name and :last_name field. It would be nice to have the possibility to define a :name field on the GraphQL object which internally will fetch both first and last name and concat them

Ya that's pretty much my point 2 and can easily be solved by adding meta at the field level listing the Ecto schema fields needed to properly resolve this graphql field. Then as you pointed out, it's fairly trivial to update the resolve_fields/2 to introspect requested objects and fields and build a list of additional fields that should be fetched out of the meta.

Thanks again for your contribution @redrabbit , very much appreciated.

hellomika commented 7 years ago

@redrabbit This is great! I've been using it in a project where my Absinthe schema almost perfectly matches my Ecto schema. Better than writing resolution batchers. And 1 query with 4 joins beats 5 queries.

That said I think I've found a bug. Using this query:

query {
  visits {
    id
    patient {
      id
      clients {
        id
      }
    }
    staff {
      id
    }
  }
}

I get this error:

** (Ecto.QueryError) could not find association `staff` on schema App.Patient in query:
from v in App.Visit,
  join: p in assoc(v, :patient),
  join: c in assoc(p, :clients),
  join: s in assoc(p, :staff),
  select: [:id, :patient_id, {:patient, [:id, {:clients, [:id]}]}, :staff_id, {:staff, [:id]}],
  preload: [patient: {p, [clients: c]}, staff: c]

It looks like staff got joined to patient instead of visits. I've been able to replicate it with multiple combinations of different fields, including arguments.

benwilson512 commented 7 years ago

What we have here at the moment are two distinct ways of trying to efficiently load database records.

Whole Tree Up Front

This is the approach of the linked code. When resolving a top level field some form of introspection happens on the remaining tree producing a Big Fancy Query (BFQ). This query will effectively try to load the rest of the sub tree from the database in a single query. Resolver functions further down the tree don't really need to do anything since the data is already loaded.

Main Advantage: Load multiple tables worth of data in a single database query

Lazy Batching

This is the current approach of this library. Nothing is done up front. Resolution functions that map to a database association register the desire to load that association when run, and then groups of such associations are executed altogether.

Major Advantage: Extremely flexible.

A new scenario

Here's where things get interesting. My work with Apollo / Relay transport batching along with the recent work on subscriptions has put me in a situation where in each case I want to execute a SET of documents. That is instead of simply having:

{ posts(category: "a") { creator { name } } }

I have:

[
  {"query": "{ posts(category: \"a\") { creator { name } } }", "id": "A"},
  {"query": "{ posts(category: \"b\") { creator { name } } }", "id": "B"},
  {"query": "{ posts(category: \"c\") { creator { name } } }", "id": "C"},
  ...
]

This happens with transport batching when the front end groups together many graphql queries on the front end and then sends them back in one HTTP request. It happens in subscriptions because we track groups of documents that belong to particular subscription fields.

Batching here turns out to be super useful, because not only can you batch the creator lookup inside doc "A", you can batch all lookups for the creator field across ALL documents "A", "B", and so on.

No matter how many documents are there, you'll have just one database query for the users table. I'm not sure how this would be possible with the Whole Tree approach. You could try doing a tree merge but this starts to be an issue when the document shape varies.

Consider a document set:

[
  {"query": "{ posts(category: \"a\") { creator { name } } }", "id": "A"},
  {"query": "{ posts(category: \"b\") { creator { name } } }", "id": "B"},
  {"query": "{ posts(category: \"c\") { title }", "id": "C"},
  ...
]

For some posts we load the creator, for some we do not. If we tree merge we're going to load the creator for ALL posts whether or not the document actually asked us to. Obviously this will get filtered out prior to going to the client but it's database work we don't need to do and data loaded into memory that serves no purpose.

Things to think about:

tlvenn commented 7 years ago

@benwilson512 I always though that it would be best to leverage both approaches.

There are scenarios where the BFQ will win and others where batching queries is more efficient. The resolver from the batching strategy already check if the data is there already and if it's the case, simply return the data directly.

The missing piece would be a piece of middleware that could flag each query in the document set so the BFQ do not load any association at all for those flagged query and defer the work to the batching resolver.

The other scenario when it would be useful to fallback to the batching strategy is once we deal with the @defer @stream and @live

tlvenn commented 7 years ago

I was thinking that for that piece of middleware we could do something as simple as to reduce the document set by grouping queries by the root query and then counting them and computing the max or average nesting level.

Then if the number of queries is bigger than the nesting level, it's a good indication that batching will probably yield better query time than BFQ otherwise, it probably mean that BFQ will be more performant.

Now as you pointed out in Slack, this focus solely on the response time for a given query and not on overall how your workload / concurrency impacts your system where depending on what you do, a given approach might be overall more performant even though on a query basis it might be slower.

tlvenn commented 7 years ago

One caveat that is not necessarily obvious with the whole tree approach because Ecto hides it, is while it minimises round trip to the database, in case on traversing one to many associations, it will increase substantially the size of the payload that has to be downloaded from the DB.

In the example above, in case of artist <-> albums, artist data will be duplicated for each album and it get even worse if you nest them, for example: artist <-> albums <-> tracks.

If a given artist has 10 albums each with 20 tracks, the payload with have:

xtagon commented 7 years ago

I'm really impressed and following this development closely. One issue I have with the "all or nothing" concept is that I don't immediately see how this would fit together with other real-world requirements. What I mean is it looks like it works great if you just want to expose your Ecto schemas as a GraphQL graph, but how would one implement something per-resolver permissions on top of this? Different objects in my API have different permissions logic.

It seems to me that a batching solution could be used to help solve the issue of permitting resources based on their associations, in much the same way that it solves N+1 queries, but I haven't come up with the details of how yet.

Any thoughts on this? @redrabbit, you say you use this for your own use case. I'm curious to know if your use case relies on permissions/authorization per object?

tlvenn commented 7 years ago

Hi @xtagon, just because the data is already resolved, it does not mean that some piece of middleware cannot proxy the data and depending of some custom logic decide to let the data go through or not.

One tricky area with the "all or nothing" current approach is that it will fetch only the fields required by the Graphql query while your permission logic might need the value of other fields as well. This can be solved using the meta facility just like how you would need to address any issue when your Graphql object does not match 1 for 1 with your Ecto schema.

The batching approach is definitely a lot simpler and generally more flexible in that regard.

xtagon commented 7 years ago

Thanks @tlvenn. I am still a beginner to all of this but when I think about it there are basically two ways to authorize resources:

  1. Load the resource, and then logic checks whether it is permitted (which may load other resources to implement this check). If it's not permitted, it is not sent back to the client, but it was still loaded regardless. This must rely on whatever batching solution is present, or there will be N+1 queries for the permission checks just like for normal resource loading.
  2. Scope the database queries by a policy scope: If my user is only allowed to see posts they own, this is implicit because the graph schema would just be implemented to only get posts "belonging to" my user. For more complex relationships it seems like this would be hard to do if the batching solution is a black box or heavy on convention over configuration or "all or nothing"ism.
tslater commented 7 years ago

I hope it's ok to comment as an newcomer to this conversation. It looks like this issue/conversation has gone a little stale. Is the problem right now that there is no consensus on whole tree vs batching so we can't move forward with the code? My understanding is that @redrabbit has a basic implementation that is whole-tree, but that we'd probably prefer batching? If that's the case, should we shoot for a batching implementation first, and then maybe work toward getting in the whole-tree as an option later?

tslater commented 7 years ago

I've been reading this conversation and the documentation on the Absinthe homepage on Ecto Best Practices. After looking at the README.md for this repo, it looks like there might be some ecto schema directed batching implemented and the docs might just be out of date? Is that correct?

benwilson512 commented 7 years ago

The Ecto Best Practices page is indeed out of date, as it doesn't reflect the existence of this library. I'll make sure to update it.

Batching is something that we the core team took on because it required changes to how resolution operates, which wasn't going to be something achievable by third parties.

As far as I see though nothing should be blocking community members implementing this full document SQL query generation, other than perhaps education on the tools Absinthe provides. You can tag values on fields with https://hexdocs.pm/absinthe/Absinthe.Schema.Notation.html#meta/2 you can use https://hexdocs.pm/absinthe/Absinthe.Pipeline.html and the pipeline: option on Absinthe.Plug to setup custom phases, and that's really all you need to go on.

It will require getting familiar with Absinthe's internal representation of a document called a Blueprint to build the preloading phase but this is true for most any contribution.

benwilson512 commented 7 years ago

This is a long way of saying: I like this proposal, I think it has value, and if people have specific questions related to how they might make progress on it I'm happy to answer. However, it just isn't on the core team's TODO list for the time being.

tslater commented 7 years ago

@benwilson512 I just wanted to check my understanding here. The new assoc macro is a batching implementation that is complete, this issue is only for the whole-tree solution? If so, that's awesome.

benwilson512 commented 7 years ago

Correct. assoc/1 macro really just expands to a https://hexdocs.pm/absinthe_ecto/Absinthe.Ecto.html#ecto_batch/4 which uses the schema association information to call the built in absinthe batching mechanism to batch retrieve the association.

After some discussions with Chris McCord about Phoenix 1.3 conventions there will also soon be a belongs_to and has_many helper for when you want to do batching across contexts where there will not be a pre-existing ecto association.

tslater commented 7 years ago

That's cool. I assumed from the name of this repo "absinthe_ecto", that it was just a dependency of absinthe that takes care of the Ecto code. I skimmed over the first line of the README: "Provides some helper functions for easy batching of Ecto assocations." I'll have to try it out ASAP. Long-term, is there a plan to separate the rest of the Ecto-related code in this repo? Or move this code into the main repo? Or is this going to stay its own thing?

benwilson512 commented 7 years ago

Long-term, is there a plan to separate the rest of the Ecto-related code in this repo

I'm not sure what this means.

tslater commented 7 years ago

I also wanted to add something to this discussion. Anyone considering the big-fat-query/whole-tree needs might want to consider some inefficiencies with how sql joins work returning back data. You are going to save in terms of latency, but take a hit in bandwidth and data processing. Note this comment here: https://github.com/elixir-ecto/ecto/issues/659#issuecomment-121513411

benwilson512 commented 7 years ago

To perhaps clarify what features belong in what project: Absinthe has resolution middleware. It includes a couple of different middleware to serve as a common base of functionality. One of those is the Batching middleware, which can be used to batch any field resolution functionality, there is nothing ecto specific about it.

Absinthe.Ecto exists as a package to provide specific ecto related functionality that anyone using Absinthe as well as Ecto probably wants, and might be difficult to do on one's own. There is no intention for this library to get rolled up into Absinthe. Absinthe has no reason to know about any particular backend store.

tslater commented 7 years ago

Sorry, yeah. I was confused. I got the impression that there was Ecto-aware code inside the Absinthe core repo based on comments and other things, but there isn't. That's all done manually in resolvers. I should probably have spent more time with Absinthe before commenting on here. Just got working through things yesterday. I should like to help with the guides and documentation to help make things clearer, once I have it all straight in my head. I'm glad that is decoupled like it is.

benwilson512 commented 7 years ago

No problem! Further guide help is always appreciated!

jfrolich commented 7 years ago

I wrote an extended version of the assoc function that allows for callbacks and extra preloads (that also allow for scoping etc.). It might be handy for other people as well so I thought to share it here. Any feedback is welcome.

  defp default_callback(result) do
    {:ok, result}
  end

  def add_preloads_to_ultra_batch(structs, nil, field, caller)  do
    Repo.preload(structs, [field], caller: caller)
  end
  def add_preloads_to_ultra_batch(structs, preloads, field, caller) do
    Repo.preload(structs, [{field, preloads}], caller: caller)
  end

  def perform_ultra_batch({owner, owner_key, field, preloads, caller}, ids) do
    unique_ids = ids |> MapSet.new |> MapSet.to_list

    unique_ids
    |> Enum.map(&Map.put(struct(owner), owner_key, &1))
    |> add_preloads_to_ultra_batch(preloads, field, caller)
    |> Enum.map(&{Map.get(&1, owner_key), Map.get(&1, field)})
    |> Map.new
  end

  # this is a slightly different version than ecto_batch, this allows to specify
  # extra preloads for the association to be loaded efficiently
  def ultra_batch(%schema{} = parent, association, opts \\ []) do
    assoc = schema.__schema__(:association, association)
    callback = Keyword.get(opts, :callback) || &default_callback/1
    preloads = Keyword.get(opts, :preloads)

    %{owner: owner,
      owner_key: owner_key,
      field: field} = assoc

    id = Map.fetch!(parent, owner_key)

    meta = {owner, owner_key, field, preloads, self()}

    batch({__MODULE__, :perform_ultra_batch, meta}, id, fn results ->
      results
      |> Map.get(id)
      |> callback.()
    end)
  end

This allows you to do something like this:

    @visible_content_attachments from(a in Schema.Content.Attachment, where: a.deleted == false, preload: ^Schema.Content.Attachment.authorization_preloads())

    field :supporting_content_attachments, list_of(:content_attachment), resolve: fn parent, _, _ ->
      ultra_batch(
        parent,
        :content_attachments,
        preloads: @visible_content_attachments,
        callback: fn content_attachments ->
          {:ok, remove_primary_content_attachment(content_attachments, parent.primary_content_attachment_id)}
        end)
    end
user20230119 commented 1 year ago

Does dataloader.ecto do whole tree loads or do we need to do solution in https://github.com/absinthe-graphql/dataloader/issues/127?