discourse / mini_sql

a minimal, fast, safe sql executor
MIT License
395 stars 18 forks source link

Marshallable serializer #25

Closed xronos-i-am closed 3 years ago

xronos-i-am commented 3 years ago

The PR has 2 goals: 1) Simplify usage of serializer with Rails.cache 2) Speed up deserialization

To fetch cached MiniSql results we have to do something like this:

  recent_topics_json =
    Rails.cache.fetch('recent-topics') do
      MiniSql::Serializer.to_json(DB.query('SELECT * FROM topics ORDER BY id DESC LIMIT 10'))
    end
  recent_topics = DB.from_json(recent_topics_json)

With the proposed changes we can able to simplify code:

recent_topics =
  Rails.cache.fetch('recent-topics') do
    MiniSql::Serializer.marshallable(DB.query('SELECT * FROM topics ORDER BY id DESC LIMIT 10'))
  end
xronos-i-am commented 3 years ago

to_json/from_json suggested to be replaced with Marshal methods _dump/self._load. Perfomance of Serializing/deserializing is decreased a little bit. But usage is become easier. It allows us to override serializer driver (see below) also

# frozen_string_literal: true

require 'bundler/inline'

gemfile do
  gem 'mini_sql', github: 'discourse/mini_sql'
  gem 'benchmark-ips'
end

require 'benchmark/ips'
require 'mini_sql'
require 'json'

class Serializer < Array
  MAX_CACHE_SIZE = 500

  def initialize(result)
    replace(result)
  end

  def self.marshallable(result)
    new(result)
  end

  def _dump(level)
    JSON.generate(serialize)
  end

  def self._load(dump)
    materialize(JSON.parse(dump))
  end

  private

  def serialize
    if length == 0
      {}
    else
      {
        "decorator" => first.class.decorator.to_s,
        "fields" => first.to_h.keys,
        "data" => map(&:values),
      }
    end
  end

  def self.materialize(wrapper)
    if !wrapper["data"]
      []
    else
      materializer = cached_materializer(wrapper['fields'], wrapper['decorator'])
      wrapper["data"].map do |row|
        materializer.materialize(row)
      end
    end
  end

  def self.cached_materializer(fields, decorator_module = nil)
    @cache ||= {}
    key = fields
    m = @cache.delete(key)
    if m
      @cache[key] = m
    else
      m = @cache[key] = materializer(fields)
      @cache.shift if @cache.length > MAX_CACHE_SIZE
    end

    if decorator_module && decorator_module.length > 0
      decorator = Kernel.const_get(decorator_module)
      m = m.decorated(decorator)
    end

    m
  end

  def self.materializer(fields)
    Class.new do
      extend MiniSql::Decoratable
      include MiniSql::Result

      attr_accessor(*fields)

      instance_eval <<~RUBY
        def materialize(values)
          r = self.new
          #{col = -1; fields.map { |f| "r.#{f} = values[#{col += 1}]" }.join("; ")}
          r
        end
      RUBY
    end
  end
end

PgResult =
  Struct.new(:rows) do
    def ntuples
      rows.size
    end

    def fields
      @fields ||= rows[0].keys
    end

    def getvalue(row_num, col_num)
      rows[row_num][fields[col_num]]
    end
  end

module TopicDecorator
  def permalink
    "#{id}-#{title}"
  end
end

topics =
  10.times.map do |i|
    id = rand(1..9999)
    {
      id: id,
      title: "topic #{id}",
    }
  end
result = PgResult.new(topics)

materialized = MiniSql::Postgres::Connection.default_deserializer_cache.materialize(result, TopicDecorator)

# Benchmark.ips do |r|
#   r.report("to_json") do
#     Marshal.dump(MiniSql::Serializer.to_json(materialized))
#   end
#   r.report("marshal") do
#     Marshal.dump(Serializer.marshallable(materialized))
#   end
#   r.compare!
# end

# Warming up --------------------------------------
#              to_json     5.349k i/100ms
#              marshal     4.632k i/100ms
# Calculating -------------------------------------
#              to_json     51.900k (± 2.8%) i/s -    262.101k in   5.054218s
#              marshal     47.426k (± 3.1%) i/s -    240.864k in   5.083753s

# Comparison:
#              to_json:    51900.1 i/s
#              marshal:    47426.2 i/s - 1.09x  (± 0.00) slower

json = Marshal.dump(MiniSql::Serializer.to_json(materialized))
marshal = Marshal.dump(Serializer.marshallable(materialized))

Benchmark.ips do |r|
  r.report("from_json") do
    MiniSql::Serializer.from_json(Marshal.load(json))
  end
  r.report("marshal") do
    Marshal.load(marshal)
  end
  r.compare!
end

# Warming up --------------------------------------
#            from_json     5.586k i/100ms
#              marshal     5.137k i/100ms
# Calculating -------------------------------------
#            from_json     53.648k (± 2.3%) i/s -    273.714k in   5.104828s
#              marshal     50.210k (± 2.1%) i/s -    251.713k in   5.015367s

# Comparison:
#            from_json:    53648.0 i/s
#              marshal:    50209.9 i/s - 1.07x  (± 0.00) slower
xronos-i-am commented 3 years ago

Comparsion JSON.parse/MessagePack.unpack/Oj.load/Marshal.load/FastJsonparser.parse:

# frozen_string_literal: true

require 'bundler/inline'

gemfile do
  gem 'mini_sql', github: 'discourse/mini_sql'
  gem 'oj'
  gem 'msgpack'
  gem 'fast_jsonparser'
  gem 'benchmark-ips'
end

require 'benchmark/ips'
require 'mini_sql'
require 'json'
require 'msgpack'
require 'oj'

topics =
  10.times.map do |i|
    id = rand(1..9999)
    {
      'id' => id,
      'title' => "topic #{id}",
    }
  end
wrapper = { 'fields' => topics[0].keys, 'data' => topics.map(&:values) }

Benchmark.ips do |r|
  r.report("JSON.generate") do
    JSON.generate(wrapper)
  end
  r.report("JSON.fast_generate") do
    JSON.fast_generate(wrapper)
  end
  r.report("Marshal.dump") do
    Marshal.dump(wrapper)
  end
  r.report("Oj.dump") do
    Oj.dump(wrapper)
  end
  r.report("MessagePack.pack") do
    MessagePack.pack(wrapper)
  end
  r.compare!
end

# Warming up --------------------------------------
#        JSON.generate    14.948k i/100ms
#   JSON.fast_generate    14.886k i/100ms
#         Marshal.dump     8.701k i/100ms
#              Oj.dump    38.364k i/100ms
#     MessagePack.pack    51.212k i/100ms
# Calculating -------------------------------------
#        JSON.generate    143.838k (± 3.8%) i/s -    732.452k in   5.099622s
#   JSON.fast_generate    143.269k (± 2.1%) i/s -    729.414k in   5.093571s
#         Marshal.dump     83.986k (± 2.3%) i/s -    426.349k in   5.079210s
#              Oj.dump    443.358k (± 2.9%) i/s -      2.225M in   5.023052s
#     MessagePack.pack    497.658k (± 2.5%) i/s -      2.509M in   5.045690s

# Comparison:
#     MessagePack.pack:   497657.7 i/s
#              Oj.dump:   443358.0 i/s - 1.12x  (± 0.00) slower
#        JSON.generate:   143838.4 i/s - 3.46x  (± 0.00) slower
#   JSON.fast_generate:   143269.3 i/s - 3.47x  (± 0.00) slower
#         Marshal.dump:    83986.0 i/s - 5.93x  (± 0.00) slower

json = JSON.generate(wrapper)
marshal = Marshal.dump(wrapper)
oj = Oj.dump(wrapper)
msgpack = MessagePack.pack(wrapper)

Benchmark.ips do |r|
  r.report("JSON.parse") do
    JSON.parse(json)
  end
  r.report("Marshal.load") do
    Marshal.load(marshal)
  end
  r.report("Oj.load") do
    Oj.load(oj)
  end
  r.report("MessagePack.unpack") do
    MessagePack.unpack(msgpack)
  end
  r.report("FastJsonparser.parse") do
    FastJsonparser.parse(json)
  end
  r.compare!
end

# Warming up --------------------------------------
#           JSON.parse    13.582k i/100ms
#         Marshal.load    14.713k i/100ms
#              Oj.load    19.709k i/100ms
#   MessagePack.unpack    20.941k i/100ms
# FastJsonparser.parse    25.466k i/100ms
# Calculating -------------------------------------
#           JSON.parse    139.957k (± 3.5%) i/s -    706.264k in   5.052675s
#         Marshal.load    148.234k (± 2.6%) i/s -    750.363k in   5.065308s
#              Oj.load    188.609k (± 3.8%) i/s -    946.032k in   5.023421s
#   MessagePack.unpack    209.431k (± 3.5%) i/s -      1.047M in   5.005751s
# FastJsonparser.parse    266.068k (± 3.8%) i/s -      1.350M in   5.080188s

# Comparison:
# FastJsonparser.parse:   266068.2 i/s
#   MessagePack.unpack:   209431.2 i/s - 1.27x  (± 0.00) slower
#              Oj.load:   188608.8 i/s - 1.41x  (± 0.00) slower
#         Marshal.load:   148234.4 i/s - 1.79x  (± 0.00) slower
#           JSON.parse:   139957.0 i/s - 1.90x  (± 0.00) slower
xronos-i-am commented 3 years ago

Overridable Serializer interface allows us to use MessagePack (for example) instead of standard marshalling and achieve performance increase:

class MarshalSerializer < MiniSql::Serializer
  def _dump(level)
    Marshal.dump(serialize)
  end

  def self._load(dump)
    materialize(Marshal.load(dump))
  end
end

class MessagePackSerializer < MiniSql::Serializer
  def _dump(level)
    MessagePack.pack(serialize)
  end

  def self._load(dump)
    materialize(MessagePack.unpack(dump))
  end
end

class OjSerializer < MiniSql::Serializer
  def _dump(level)
    Oj.dump(serialize)
  end

  def self._load(dump)
    materialize(Oj.load(dump))
  end
end

class FastJsonparserOjSerializer < MiniSql::Serializer
  def _dump(level)
    Oj.dump(serialize)
  end

  def self._load(dump)
    materialize(FastJsonparser.parse(dump, symbolize_keys: false))
  end
end

PgResult =
  Struct.new(:rows) do
    def ntuples
      rows.size
    end

    def fields
      @fields ||= rows[0].keys
    end

    def getvalue(row_num, col_num)
      rows[row_num][fields[col_num]]
    end
  end

module TopicDecorator
  def permalink
    "#{id}-#{title}"
  end
end

topics =
  10.times.map do |i|
    id = rand(1..9999)
    {
      id: id,
      title: "topic #{id}",
    }
  end
result = PgResult.new(topics)

materialized = MiniSql::Postgres::Connection.default_deserializer_cache.materialize(result, TopicDecorator)

# Benchmark.ips do |r|
#   r.report("json") do
#     Marshal.dump(MiniSql::Serializer.marshallable(materialized))
#   end
#   r.report("marshal") do
#     Marshal.dump(MarshalSerializer.marshallable(materialized))
#   end
#   r.report("msgpack") do
#     Marshal.dump(MessagePackSerializer.marshallable(materialized))
#   end
#   r.report("oj") do
#     Marshal.dump(OjSerializer.marshallable(materialized))
#   end
#   r.report("fastjson") do
#     Marshal.dump(FastJsonparserOjSerializer.marshallable(materialized))
#   end
#   r.compare!
# end

# Warming up --------------------------------------
#                 json     4.422k i/100ms
#              marshal     3.894k i/100ms
#              msgpack     6.467k i/100ms
#                   oj     6.276k i/100ms
#             fastjson     6.192k i/100ms
# Calculating -------------------------------------
#                 json     43.298k (± 3.2%) i/s -    216.678k in   5.009511s
#              marshal     39.456k (± 3.5%) i/s -    198.594k in   5.039348s
#              msgpack     65.048k (± 3.7%) i/s -    329.817k in   5.077144s
#                   oj     60.845k (± 2.2%) i/s -    307.524k in   5.056674s
#             fastjson     60.353k (± 2.4%) i/s -    303.408k in   5.030024s

# Comparison:
#              msgpack:    65047.5 i/s
#                   oj:    60845.3 i/s - 1.07x  (± 0.00) slower
#             fastjson:    60353.4 i/s - 1.08x  (± 0.00) slower
#                 json:    43297.5 i/s - 1.50x  (± 0.00) slower
#              marshal:    39456.2 i/s - 1.65x  (± 0.00) slower

json = Marshal.dump(MiniSql::Serializer.marshallable(materialized))
marshal = Marshal.dump(MarshalSerializer.marshallable(materialized))
msgpack = Marshal.dump(MessagePackSerializer.marshallable(materialized))
oj = Marshal.dump(OjSerializer.marshallable(materialized))
fastjson = Marshal.dump(FastJsonparserOjSerializer.marshallable(materialized))

Benchmark.ips do |r|
  r.report("json") do
    Marshal.load(json)
  end
  r.report("marshal") do
    Marshal.load(marshal)
  end
  r.report("msgpack") do
    Marshal.load(msgpack)
  end
  r.report("oj") do
    Marshal.load(oj)
  end
  r.report("fastjson") do
    Marshal.load(fastjson)
  end
  r.compare!
end

# Warming up --------------------------------------
#                 json     5.030k i/100ms
#              marshal     5.323k i/100ms
#              msgpack     5.768k i/100ms
#                   oj     5.602k i/100ms
#             fastjson     6.090k i/100ms
# Calculating -------------------------------------
#                 json     47.365k (± 1.7%) i/s -    241.440k in   5.098920s
#              marshal     50.814k (± 1.8%) i/s -    255.504k in   5.029873s
#              msgpack     56.586k (± 1.7%) i/s -    288.400k in   5.098067s
#                   oj     54.719k (± 1.7%) i/s -    274.498k in   5.018006s
#             fastjson     58.535k (± 2.6%) i/s -    298.410k in   5.101610s

# Comparison:
#             fastjson:    58534.8 i/s
#              msgpack:    56585.8 i/s - same-ish: difference falls within error
#                   oj:    54719.0 i/s - 1.07x  (± 0.00) slower
#              marshal:    50813.8 i/s - 1.15x  (± 0.00) slower
#                 json:    47364.5 i/s - 1.24x  (± 0.00) slower
xronos-i-am commented 3 years ago

@SamSaffron can you take a look please

eviltrout commented 3 years ago

Sam is unfortunately not around right now so this might have to wait a couple weeks, sorry.

xronos-i-am commented 3 years ago

ok, sure

SamSaffron commented 3 years ago

I worry about having a deep dependency on marshal, this automatically poisons cache on major ruby upgrades, it is not all win.

Can we make this an optional behavior?

ermolaev commented 3 years ago

I think that _load _dump has low dependency on marshal, because in this case marshaling a simple string

class A
  def initialize(b)
     @b = [b, b+2, {b: b+3}]
     @c = 'str'
  end
end

class AA
  def initialize(b)
    @b = [b, b+2, {b: b+3}]
    @c = 'str'
  end

  def _dump(level)
    JSON.generate(b: @b, c: @c)
  end
end

puts Marshal.dump(A.new(2000)) # o:A:@bi�i�{:bi�:@cIstr:ET
puts Marshal.dump(AA.new(2000)) # Iu:AA+{"b":[2000,2002,{"b":2003}],"c":"str"}:ET

Are you aware of ruby versions that break Marshal's algorithm compatibility?

xronos-i-am commented 3 years ago

Can we make this an optional behavior?

No problem, but there is a problem with JSON serializing. Dates or custom types can not be automatically restored from JSON dump:

# frozen_string_literal: true

require 'bundler/inline'

gemfile do
  gem 'mini_sql', github: 'discourse/mini_sql'
end

require 'mini_sql'
require 'json'

PgResult =
  Struct.new(:rows) do
    def ntuples
      rows.size
    end

    def fields
      @fields ||= rows[0].keys
    end

    def getvalue(row_num, col_num)
      rows[row_num][fields[col_num]]
    end
  end

result = PgResult.new([{id: 1, date: Date.today}])

materialized = MiniSql::Postgres::Connection.default_deserializer_cache.materialize(result)
p materialized
# [#<#<Class:0x00007f0eabd4b370>:0x00007f0eabd4aa38 @id=1, @date=#<Date: 2021-02-01 ((2459247j,0s,0n),+0s,2299161j)>>]

json = MiniSql::Serializer.to_json(materialized)
puts json
# {"decorator":"","fields":["id","date"],"data":[[1,"2021-02-01"]]}

deserialized = MiniSql::Serializer.from_json(json)
p deserialized
# [#<#<Class:0x00007f0eabd49f48>:0x00007f0eabd496d8 @id=1, @date="2021-02-01">]
SamSaffron commented 3 years ago

Yeah that is a problem, I guess we just eat it up and use Marshal, will have think about this and get something merged in.

On Mon, Feb 1, 2021 at 5:30 PM Sergey Malykh notifications@github.com wrote:

Can we make this an optional behavior?

No problem, but there is a problem with JSON serializing. Dates or custom types can not be automatically restored from JSON dump:

frozen_string_literal: true

require 'bundler/inline' gemfile do gem 'mini_sql', github: 'discourse/mini_sql'end require 'mini_sql'require 'json' PgResult = Struct.new(:rows) do def ntuples rows.size end

def fields
  @fields ||= rows[0].keys
end

def getvalue(row_num, col_num)
  rows[row_num][fields[col_num]]
end

end result = PgResult.new([{id: 1, date: Date.today}]) materialized = MiniSql::Postgres::Connection.default_deserializer_cache.materialize(result)p materialized# [#<#:0x00007f0eabd4aa38 @id=1, @date=#<Date: 2021-02-01 ((2459247j,0s,0n),+0s,2299161j)>>] json = MiniSql::Serializer.to_json(materialized)puts json# {"decorator":"","fields":["id","date"],"data":[[1,"2021-02-01"]]} deserialized = MiniSql::Serializer.from_json(json)p deserialized# [#<#:0x00007f0eabd496d8 @id=1, @date="2021-02-01">]

— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/discourse/mini_sql/pull/25#issuecomment-770606907, or unsubscribe https://github.com/notifications/unsubscribe-auth/AAABIXK65LUKJIZ62WEKO7DS4ZDBRANCNFSM4VOCQMQQ .

SamSaffron commented 3 years ago

OK lets try this then ... weird error on 2.6, seems unrelated.