Open retorquere opened 1 year ago
First issue I see is that you're creating a BlinkDB index on firstmail instead of firstname.
But I'll take a look at the query performance without indexes, BlinkDB shouldn't be 18x as slow as Loki when you search for a single key.
Oopsie.
Both indexed:
loki x 1,501,305 ops/sec ±1.24% (90 runs sampled)
blink x 1,290,563 ops/sec ±0.93% (92 runs sampled)
Fastest is loki
Both unindexed:
loki x 17,738 ops/sec ±2.96% (91 runs sampled)
blink x 1,158 ops/sec ±6.18% (82 runs sampled)
Fastest is loki
I'm also not seeing the performance in the sample from the front-page; unless I mucked it up again, I'm getting this (script below):
search should find 52 users
scan x 4,240 ops/sec ±4.25% (83 runs sampled)
sharding x 1,693,713 ops/sec ±1.56% (81 runs sampled)
loki x 42,372 ops/sec ±1.31% (86 runs sampled)
blinkdb x 933 ops/sec ±1.84% (80 runs sampled)
Fastest is sharding
script:
var Benchmark = require('benchmark');
const Loki = require('lokijs')
const { createDB, createTable, insertMany, many } = require("blinkdb")

// data at https://gist.github.com/retorquere/c71d72f63a6a79ad3d0f6cd217e71b03
const items = require('./mock.json')

async function main() {
  const blinkdb = createDB()
  const blinktable = createTable(blinkdb, 'users')({
    primary: "id",
    indexes: ["age", "name"],
  });
  await insertMany(blinktable, items)

  const DB = new Loki('better-bibtex', {})
  const coll = DB.addCollection('citekeys', {
    indices: [ 'id', 'age', 'name' ],
    unique: [ 'id' ],
  })
  coll.insert(items)

  var suite = new Benchmark.Suite()

  const names = ['Davies', 'Gosling']
  const age = 24
  const n = items.filter(item => item.age > age && names.includes(item.name)).length
  console.log('> search should find', n, 'users')

  const shards = new Map
  for (const user of items) {
    if (!shards.has(user.name)) {
      shards.set(user.name, [ user ])
    }
    else {
      shards.get(user.name).push(user)
    }
  }

  suite
    .add('scan', function() {
      if (items.filter(item => item.age > age && names.includes(item.name)).length !== n) throw new Error('')
    })
    .add('sharding', function() {
      let found = []
      for (const name of names) {
        found = found.concat((shards.get(name) || []).filter(user => user.age > age))
      }
      if (found.length !== n) throw new Error('')
    })
    .add('loki', function() {
      if (coll.find({ name: { $in: names }, age: { $gt: age } }).length !== n) throw new Error('')
    })
    .add('blinkdb', {
      defer: true,
      fn: async function(deferred) {
        if ((await many(blinktable, { where: { name: { in: names }, age: { gt: age } } })).length !== n) throw new Error('')
        deferred.resolve()
      }
    })
    .on('cycle', function(event) { console.log('> ' + String(event.target)) })
    .on('complete', function() {
      console.log('> Fastest is ' + this.filter('fastest').map('name'));
    })
    .run({ async: true });
}

main()
And if I'm looking for names that are not in the dataset:
search should find 0 users
scan x 5,207 ops/sec ±4.94% (87 runs sampled)
sharding x 13,567,067 ops/sec ±1.86% (87 runs sampled)
loki x 1,055,781 ops/sec ±1.19% (87 runs sampled)
blinkdb x 1,261 ops/sec ±1.72% (77 runs sampled)
Fastest is sharding
but not only is loki faster in both cases, a table scan is also faster.
And for 100.000 items (as per the case on the front page):
search should find 3 users
scan x 1,139,156 ops/sec ±1.15% (88 runs sampled)
sharding x 3,938,455 ops/sec ±3.28% (84 runs sampled)
loki x 255,922 ops/sec ±0.82% (90 runs sampled)
blinkdb x 119,386 ops/sec ±1.09% (79 runs sampled)
Fastest is sharding
Is the benchmark from the front page in the benchmark suite? I can't find it but I'd love to tinker around with it. My numbers don't align with that graph at all, so there must be something relevantly different between how these tests are set up.
| (index) | Task Name | ops/sec | Average Time (ns) | Margin | Samples |
| --- | --- | --- | --- | --- | --- |
| 0 | 'scan' | '505' | 1978472.635444445 | '±1.45%' | 253 |
| 1 | 'partitions' | '386,787' | 2585.400346700469 | '±1.11%' | 193440 |
| 2 | 'loki' | '8,665' | 115399.97140832458 | '±0.63%' | 4333 |
| 3 | 'blinkdb' | '140' | 7106147.1843383685 | '±3.94%' | 71 |
Thanks for taking a look at this :)
If you check the official benchmarks for BlinkDB, do you get similar results there (with BlinkDB consistently being slower than alternatives)? The blinkdb benchmarks only compare my DB to LokiJS, but in my test runs BlinkDB generally reaches 0.8x - 1.2x of the performance of LokiJS, depending on the test case.
I'll try implementing your testcase within the BlinkDB benchmarks from scratch, then we can see if there are any issues on either side.
Is the benchmark from the front page in the benchmark suite?
Not really, I either did that manually a few versions ago or deleted it from the benchmarks in the last refactor. In either case, I should add it back in and update the values on the front page.
I added a benchmark myself, available on the feature/add-front-page-test branch here: https://github.com/blinkdb-js/blinkdb/blob/feature/add-front-page-test/packages/benchmarks/src/benchmarks/blinkdb/front-page.ts
You can run it by cloning the repo, navigating to the benchmark package, and running npm run start -- "blinkdb/front-page.ts".
My results look like this for 100.000 items in the DB:
blinkdb/front-page.ts --- map is 1.43x faster than scan
┌─────────┬───────────┬────────────────────┬────────────────────┬───────────┬─────────┐
│ (index) │ name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
├─────────┼───────────┼────────────────────┼────────────────────┼───────────┼─────────┤
│ 0 │ 'map' │ 318.43057225700716 │ 3140401.981229661 │ '±2.76%' │ 160 │
│ 1 │ 'scan' │ 221.94537258402815 │ 4505613.198226972 │ '±2.49%' │ 111 │
│ 2 │ 'blinkdb' │ 105.78578801070266 │ 9453065.660378002 │ '±10.83%' │ 53 │
│ 3 │ 'lokijs' │ 79.3651495976927 │ 12599988.849880174 │ '±9.05%' │ 40 │
└─────────┴───────────┴────────────────────┴────────────────────┴───────────┴─────────┘
Comparing our respective implementations, a few things I noticed:
- Both BlinkDB and LokiJS have the option to clone returned objects. In BlinkDB this is turned on by default, in LokiJS off by default. You can turn it off with createDB({ clone: false }). This doubles the performance of BlinkDB in this specific benchmark for me.
- Your scan is particularly slow, and your map/partitions particularly fast compared to mine. I haven't found a specific reason for this yet.

Maybe you can gain other insights or discover issues looking through the code of my benchmark?
Testing with your dataset gets me a benchmark more comparable to yours, but BlinkDB is somehow still faster on my side.
blinkdb/front-page.ts --- map is 42.64x faster than lokijs
┌─────────┬───────────┬────────────────────┬────────────────────┬──────────┬─────────┐
│ (index) │ name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
├─────────┼───────────┼────────────────────┼────────────────────┼──────────┼─────────┤
│ 0 │ 'map' │ 1489335.8240912817 │ 671.4402378725764 │ '±1.76%' │ 744668 │
│ 1 │ 'lokijs' │ 34925.248617732905 │ 28632.580713892505 │ '±1.07%' │ 17463 │
│ 2 │ 'blinkdb' │ 29151.137594377647 │ 34303.978593029904 │ '±3.88%' │ 14576 │
│ 3 │ 'scan' │ 2969.203635917305 │ 336790.6424144804 │ '±0.64%' │ 1485 │
└─────────┴───────────┴────────────────────┴────────────────────┴──────────┴─────────┘
How do you run this? I'm getting Cannot use import statement outside a module when I start it with npx ts-node.
If you check the official benchmarks for BlinkDB, do you get similar results there (with BlinkDB consistently being slower than alternatives) ?
I haven't been able to get them to work yet.
and running npm run start -- "blinkdb/front-page.ts".
That gets me
Unknown file extension ".ts" for /Users/emile/github/blinkdb/packages/benchmarks/src/index.ts
- Both BlinkDB and LokiJS have the option to clone returned objects. In BlinkDB this is turned on by default, in LokiJS off by default.
That is true. But when I turn clone off for blinkdb, my tests still show everything else coming out faster than blinkdb.
- Your scan is particularly slow,
It's supposed to be a benchmark for the worst case. Nothing should be slower than a full scan.
and your map/partitions particularly fast compared to mine. I haven't found a specific reason for this yet.
JavaScript's Map is just ridiculously fast, and after selection of the partition it's all integer comparisons. It's doing half the work, and the fast half at that.
I was working on a lokijs partition command where you could select one partition and then filter on the much smaller resultset as usual. But lokijs has not been active for a long while, and I'd much prefer to move to blinkdb if we can get this apparent disparity sorted out.
Testing with your dataset gets me a benchmark more comparable to yours, but BlinkDB is somehow still faster on my side.
Well now... I've gotten this script to run and I'm now getting
| (index) | Task Name | ops/sec | Average Time (ns) | Margin | Samples |
| --- | --- | --- | --- | --- | --- |
| 0 | 'scan' | '503' | 1987200.3532576538 | '±0.44%' | 252 |
| 1 | 'map' | '306,251' | 3265.293322546093 | '±2.15%' | 153126 |
| 2 | 'lokijs' | '9,114' | 109709.38964793888 | '±1.07%' | 4558 |
| 3 | 'blinkdb' | '20,498' | 48784.63455380463 | '±1.24%' | 10250 |
And this is the same script with clone turned on for both:
| (index) | Task Name | ops/sec | Average Time (ns) | Margin | Samples |
| --- | --- | --- | --- | --- | --- |
| 0 | 'scan' | '483' | 2066123.4032791948 | '±0.46%' | 243 |
| 1 | 'map' | '511,727' | 1954.164901172087 | '±2.96%' | 255864 |
| 2 | 'lokijs' | '1,874' | 533549.3187751692 | '±0.59%' | 938 |
| 3 | 'blinkdb' | '18,850' | 53049.23583965835 | '±0.84%' | 9426 |
How does cloning make such a big difference when a search returns so few records? There is no cloning during the search right, just for the hits that are returned from the search?
and your map/partitions particularly fast compared to mine. I haven't found a specific reason for this yet.
I would imagine that partitions could be implemented as hooks on insert/update/create/delete that would maintain T[] records in a Map keyed on the partition field values, and that there would be an equivalent for get that would return that partition's T[]. These are not as flexible as general query methods (you can use them only at the start of the query chain), but given the results in these benchmarks they would provide a massive performance benefit where you can use them (and I have such a use case).
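As a rough illustration of the idea in plain JavaScript (no BlinkDB involved; onInsert/onRemove/getPartition are hypothetical names, not an existing API), maintaining such a partition map only touches one bucket per mutation:

```javascript
// Hypothetical sketch of maintaining partitions via insert/remove hooks.
// The Map is keyed on the partition field ("name"); each mutation only
// touches the one bucket for that key, instead of rebuilding the map.
const partitions = new Map();

function onInsert(user) {
  const bucket = partitions.get(user.name);
  if (bucket) bucket.push(user);
  else partitions.set(user.name, [user]);
}

function onRemove(user) {
  const bucket = partitions.get(user.name) || [];
  const i = bucket.indexOf(user);
  if (i >= 0) bucket.splice(i, 1);
  if (bucket.length === 0) partitions.delete(user.name);
}

// "get" equivalent: return the partition for one key.
function getPartition(name) {
  return partitions.get(name) || [];
}

const alice = { name: 'Alice', age: 30 };
onInsert(alice);
onInsert({ name: 'Alice', age: 20 });
onInsert({ name: 'Bob', age: 40 });

// The query shape from the benchmarks: one partition, then filter on age.
const found = getPartition('Alice').filter(u => u.age > 24);
```

After selecting the partition, only the small bucket is filtered, which is the "half the work" effect described above.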
and running npm run start -- "blinkdb/front-page.ts".
I can just run "npm run test" on the toplevel -- that works.
| (index) | Task Name | ops/sec | Average Time (ns) | Margin | Samples |
| --- | --- | --- | --- | --- | --- |
| 0 | 'scan' | '456' | 2189178.0046982015 | '±1.19%' | 229 |
| 1 | 'partition' | '26,279' | 38052.23829942207 | '±1.35%' | 13140 |
| 2 | 'lokijs' | '1,697' | 589064.4982071029 | '±0.76%' | 849 |
| 3 | 'blinkdb' | '14,259' | 70128.07589656506 | '±0.93%' | 7130 |
edit: I had previously overlooked that I still needed to clone. With the clone overhead partitioning still comes out ahead, but the performance boost varies between 85% and 1300% depending on the size of the result set (smaller being faster regardless of the size of the DB being searched) rather than the consistent several orders of magnitude I was seeing. I'm not sure how I would make dynamic partition maintenance work without cloning. Loki seems to take a really hard hit from cloning. AAMOF I can substantially improve lokijs performance just by replacing its clone with rfdc. Still not as fast as blinkdb, but usually twice as fast as with lokijs's own clone.
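The clone swap described here can be sketched like this (not Loki's actual internals; `fastClone` is a minimal stand-in for what `require('rfdc')()` returns, so the sketch runs without the package):

```javascript
// Sketch: swapping the deep-clone implementation. A JSON parse/stringify
// round-trip (Loki's default style of cloning) vs. a direct recursive
// clone in the spirit of rfdc. Both produce equal but distinct objects;
// only the speed differs.
const jsonClone = (obj) => JSON.parse(JSON.stringify(obj));

// Stand-in for rfdc's generated clone function (plain data only).
const fastClone = (obj) => {
  if (obj === null || typeof obj !== 'object') return obj;
  if (Array.isArray(obj)) return obj.map(fastClone);
  const out = {};
  for (const key of Object.keys(obj)) out[key] = fastClone(obj[key]);
  return out;
};

const user = { id: 1, name: 'Davies', tags: ['a', 'b'] };
const a = jsonClone(user);
const b = fastClone(user);
```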
Is there a general hook for "each deleted(/inserted/updated) entry" regardless of whether it was called through removeMany/remove/removeWhere?
I must admit your use of typescript is a lot more advanced than mine; what I had envisioned was something along the lines of
const items = await many(userTable, {
  where: {
    name: { partition: ["Alice", "Charlie"] },
    age: { gt: 24 },
  },
});
but I've run aground in adding this to the query evaluation.
Is it correct that get is synchronous? I could write my own wrapper to implement one and many to get sync reads?
If you need an array of items from the DB that is regenerated whenever items are inserted/updated/deleted, you don't need to implement partitions, you can just use watch. This gets you the fastest possible speed on read operations with small performance hits on every write.
How does cloning make such a big difference when a search returns so few records? There is no cloning during the search right, just for the hits that are returned from the search?
Can't speak for LokiJS, but BlinkDB only clones items returned from a call to get.
Is there a general hook for "each deleted(/inserted/updated) entry" regardless of whether it was called through removeMany/remove/removeWhere?
watch(...) doesn't give you the "changed" items (yet), but it does give you all items that match a given query. It's basically a simplified use(...) hook.
Is it correct that get is synchronous? I could write my own wrapper to implement one and many to get sync reads?
Yes. The entire query engine is synchronous. BlinkDB is just async in order to support async middleware.
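That pattern — a synchronous engine behind an async surface so middleware can be awaited — can be sketched like this (hypothetical names, not BlinkDB's actual internals):

```javascript
// Sketch: a fully synchronous query engine exposed through an async API,
// purely so that (possibly async) middleware can run around each query.
const middleware = [];
const use = (fn) => middleware.push(fn);

// The engine itself is synchronous.
const syncQuery = (table, filter) => table.filter(filter);

// The public API is async only to accommodate async middleware.
async function many(table, filter) {
  let next = () => Promise.resolve(syncQuery(table, filter));
  for (const mw of [...middleware].reverse()) {
    const prev = next;
    next = () => mw({ next: prev });
  }
  return next();
}

const users = [{ name: 'Alice', age: 30 }, { name: 'Bob', age: 20 }];
use(async (ctx) => {
  const result = await ctx.next(); // could await I/O here
  return result;
});

// Resolves immediately when no middleware actually awaits anything,
// because the underlying engine is sync.
const adultsPromise = many(users, (u) => u.age > 24);
```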
If you need an array of items from the DB that is regenerated whenever items are inserted/updated/deleted, you don't need to implement partitions, you can just use watch. This gets you the fastest possible speed on read operations with small performance hits on every write.
That's not entirely the same though. With the partition solution, you get the speedup without knowing the names in advance. The watch as used here only works if you know up front which names you're going to select. I need something where the names can be arbitrary. As far as I can tell, this watch setup would only work if I rebuilt the partitions for every mutation, which is going to be prohibitive for large tables. The map approach I posted only makes one or two changes to the map for each mutation, rather than rebuilding it.
watch(...) doesn't give you the "changed" items (yet), but it does give you all items that match a given query. It's basically a simplified use(...) hook.
I do need the changed/deleted/added items though, as I use them in my own event handling. How would I implement my use on a removeWhere? I would have to run the query again before the call to ctx.next, I suppose?
Yes. The entire query engine is synchronous. BlinkDB is just async in order to support async middleware.
Gotcha -- that is not a problem for me, I just need read/search to be sync, so I can start working on blink integration.
I'm trying to understand how the query infrastructure works, is there any documentation on the design of it?
I do need the changed/deleted/added items though, as I use them in my own event handling. How would I implement my use on a removeWhere? I would have to run the query again before the call to ctx.next I suppose?
I think it best to implement this in watch already. The callback could simply provide a diff argument which contains the elements that were added, updated, or deleted that triggered the watch hook.
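What such a diff argument could contain can be illustrated by computing it from the previous and current result sets (a sketch with hypothetical shapes, keyed on the primary key):

```javascript
// Sketch: computing the diff a watch callback could receive, given the
// previous and current query results, keyed on the primary key "id".
function computeDiff(before, after) {
  const prev = new Map(before.map((item) => [item.id, item]));
  const next = new Map(after.map((item) => [item.id, item]));
  const diff = { added: [], updated: [], removed: [] };
  for (const [id, item] of next) {
    if (!prev.has(id)) diff.added.push(item);
    else if (prev.get(id) !== item) diff.updated.push(item);
  }
  for (const [id, item] of prev) {
    if (!next.has(id)) diff.removed.push(item);
  }
  return diff;
}

const before = [{ id: 1, age: 24 }, { id: 2, age: 30 }];
const after = [{ id: 2, age: 31 }, { id: 3, age: 40 }];
const diff = computeDiff(before, after);
```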
If you want to implement this using removeWhere, you could use the query engine yourself (the get, although I don't know if that API is publicly exposed yet) to retrieve all items that match the where filter, then perform some operation on them, then pass them to ctx.next.
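The removeWhere route would follow this general middleware shape (a generic sketch over a plain array, not BlinkDB's actual use(...) API):

```javascript
// Sketch: in a removeWhere-style hook, re-run the query first to capture
// the items about to be removed, hand them to event handlers, then let
// the actual operation proceed (the ctx.next() equivalent).
function removeWhere(table, predicate, hooks) {
  // "query engine": find the matching items before they disappear.
  const matching = table.filter(predicate);
  for (const hook of hooks) hook(matching); // e.g. own event handling
  // ctx.next() equivalent: perform the actual removal.
  for (const item of matching) {
    table.splice(table.indexOf(item), 1);
  }
  return matching;
}

const table = [{ id: 1, m: 'math' }, { id: 2, m: 'text' }, { id: 3, m: 'math' }];
const seen = [];
const removed = removeWhere(table, (row) => row.m === 'math', [
  (items) => seen.push(...items),
]);
```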
I think it best to implement this in watch already. The callback could simply provide a diff argument which contains the elements that were added, updated, or deleted that triggered the watch hook.
That would work.
If you want to implement this using removeWhere, you could use the query engine yourself (the get, although I don't know if that API is publicly exposed yet)
Sort of -- I can fetch it using import { get } from 'blinkdb/src/query' -- edit: but not without a host of errors. I really do need to be able to use get.
to retrieve all items that match the where filter, then perform some operation on them, then pass them to ctx.next.
I'd much prefer the former option :)
Looks like in the interim I could just subscribe to the dispatcher using
table[BlinkKey].events.onInsert.register((changes) => console.log(changes))
table[BlinkKey].events.onUpdate.register((changes) => console.log(changes))
table[BlinkKey].events.onRemove.register((changes) => console.log(changes))
table[BlinkKey].events.onClear.register((changes) => console.log(changes))
correct?
Me again -- I took your last script and modified it (to avoid screwups like before) for a new case I'm working on, script + data are here, but I'm getting this for single-field indexed lookup:
┌─────────┬───────────┬──────────┬───────────────────┬──────────┬─────────┐
│ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
├─────────┼───────────┼──────────┼───────────────────┼──────────┼─────────┤
│ 0 │ 'lokijs' │ '18,257' │ 54772.43624664317 │ '±0.51%' │ 9129 │
│ 1 │ 'blinkdb' │ '13,989' │ 71482.33448462115 │ '±0.25%' │ 6995 │
└─────────┴───────────┴──────────┴───────────────────┴──────────┴─────────┘
For m: { in: ['math', 'text'] } the difference is even more pronounced:
┌─────────┬───────────┬──────────┬────────────────────┬──────────┬─────────┐
│ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
├─────────┼───────────┼──────────┼────────────────────┼──────────┼─────────┤
│ 0 │ 'lokijs' │ '20,296' │ 49269.873422405995 │ '±0.51%' │ 10149 │
│ 1 │ 'blinkdb' │ '11,252' │ 88871.98824261357 │ '±0.38%' │ 5627 │
└─────────┴───────────┴──────────┴────────────────────┴──────────┴─────────┘
I've managed to get the benchmarks to run; I've taken the front-page.ts benchmark and modified it to this, so I'm pretty sure I got it right this time:
import { createDB, createTable, insertMany, many } from "blinkdb";
import loki from "lokijs";
import { Bench } from "tinybench";

interface Char {
  id: number;
  u: string;
  s: string;
  t: string;
  m: string;
  p: string;
}

const blinkdb = createDB({ clone: false });
let blinkUserTable = createTable<Char>(blinkdb, "users")({
  primary: "id",
  indexes: [ 'u', 's', 't', 'm', 'p' ],
});

const lokidb = new loki("users.db");
let lokiUserTable = lokidb.addCollection<Char>("users", {
  unique: ["id"],
  indices: [ 'u', 's', 't', 'm', 'p' ],
});

let chars = require('./config2.json')
chars.forEach((char: Char, i: number) => char.id = i)

lokiUserTable.insert(chars);
insertMany(blinkUserTable, chars);

export const bench = new Bench()
  .add("lokijs", () => {
    lokiUserTable.find({ m: { $in: ["m", "t"] } });
  })
  .add("blinkdb", async () => {
    await many(blinkUserTable, { where: {
      m: { in: ["m", "t"] },
    }});
  });
and there lokijs gets over twice the performance of blinkdb:
┌─────────┬───────────┬────────────────────┬────────────────────┬───────────┬─────────┐
│ (index) │ name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
├─────────┼───────────┼────────────────────┼────────────────────┼───────────┼─────────┤
│ 0 │ 'lokijs' │ 4495.033687905337 │ 222467.74316523422 │ '±4.68%' │ 2248 │
│ 1 │ 'blinkdb' │ 1990.6891247726794 │ 502338.6060413586 │ '±12.03%' │ 996 │
└─────────┴───────────┴────────────────────┴────────────────────┴───────────┴─────────┘
with front-page.ts I get the expected
┌─────────┬───────────┬────────────────────┬────────────────────┬───────────┬─────────┐
│ (index) │ name │ ops/sec │ Average Time (ns) │ Margin │ Samples │
├─────────┼───────────┼────────────────────┼────────────────────┼───────────┼─────────┤
│ 0 │ 'map' │ 237.99901467945804 │ 4201698.067308473 │ '±1.89%' │ 119 │
│ 1 │ 'scan' │ 179.23980238634888 │ 5579117.956426408 │ '±3.32%' │ 90 │
│ 2 │ 'blinkdb' │ 86.9096681550792 │ 11506199.727004224 │ '±15.91%' │ 44 │
│ 3 │ 'lokijs' │ 47.70440453076051 │ 20962424.95502035 │ '±12.22%' │ 24 │
└─────────┴───────────┴────────────────────┴────────────────────┴───────────┴─────────┘
with blinkdb getting almost twice the performance of lokijs.
I'm looking to switch to blinkdb for its typescript support, but when I benchmark indexed search, blinkdb comes out substantially slower than lokijs:
(where mock.json has this content)
returns
and even when I turn off indexes in loki, it still comes out ahead:
I haven't tested more complex searches but search by single indexed field is something I'd be doing a fair bit of. Is this just not a good use-case for blinkdb?