couchbase / goforestdb

Go bindings for ForestDB
Apache License 2.0

Panic when read/written concurrently #9

Open dim opened 10 years ago

dim commented 10 years ago

Here's a test script I've been using:

package main

import (
    "bytes"
    "encoding/binary"
    "fmt"
    "log"
    "os"
    "sync"

    fdb "github.com/couchbaselabs/goforestdb"
)

var testFile = "/tmp/test.fdb"
var testValue = bytes.Repeat([]byte{120}, 256)
var total = int(1e5)

func abortOn(err error) {
    if err != nil {
        log.Fatal(err.Error())
    }
}

func main() {
    os.RemoveAll(testFile)

    config := fdb.DefaultConfig()
    config.SetWalFlushBeforeCommit(true)

    wg := sync.WaitGroup{}
    db, err := fdb.Open(testFile, config)
    abortOn(err)
    defer db.Close()

    abortOn(write(db, 0))
    abortOn(read(db, 0))

    wg.Add(1)
    go func() {
        defer wg.Done()
        abortOn(write(db, 1))
    }()
    wg.Add(1)
    go func() {
        defer wg.Done()
        abortOn(read(db, 0))
    }()
    wg.Wait()
}

func write(db *fdb.Database, base int) error {
    fmt.Println("* writing")
    for n := 0; n < total; n++ {
        key := make([]byte, 4)
        binary.BigEndian.PutUint32(key, uint32(base*total+n))
        if err := db.SetKV(key, testValue); err != nil {
            return err
        }
    }

    return db.Commit(fdb.COMMIT_NORMAL)
}

func read(db *fdb.Database, base int) error {
    fmt.Println("* reading")
    for n := 0; n < total; n++ {
        key := make([]byte, 4)
        binary.BigEndian.PutUint32(key, uint32(base*total+n))
        if _, err := db.GetKV(key); err != nil {
            return err
        }
    }
    return nil
}

Causes:

$ go run fdb-bug.go 
* writing
* reading
* writing
* reading
*** Error in `/tmp/go-build726603444/command-line-arguments/_obj/exe/fdb-bug': double free or corruption (fasttop): 0x00000000097f6e00 ***
SIGABRT: abort
PC=0x7f1172a0ad27
signal arrived during cgo execution

goroutine 21 [syscall]:
runtime.cgocall(0x402410, 0x7f1173239e08)
  $GOROOT/src/pkg/runtime/cgocall.c:143 +0xe5 fp=0x7f1173239df0 sp=0x7f1173239da8
github.com/couchbaselabs/goforestdb._Cfunc_fdb_get_kv(0x13b62c0, 0xc208073040, 0x4, 0xc20803c758, 0xc208073048, 0x1)
  github.com/couchbaselabs/goforestdb/_obj/_cgo_defun.c:179 +0x31 fp=0x7f1173239e08 sp=0x7f1173239df0
github.com/couchbaselabs/goforestdb.(*Database).GetKV(0xc20803c028, 0xc208073040, 0x4, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0)
  $GOPATH/src/github.com/couchbaselabs/goforestdb/kv.go:32 +0xf2 fp=0x7f1173239e88 sp=0x7f1173239e08
main.read(0xc20803c028, 0x0, 0x0, 0x0)
  fdb-bug.go:68 +0x1ee fp=0x7f1173239f68 sp=0x7f1173239e88
main.func·002()
  fdb-bug.go:45 +0x58 fp=0x7f1173239fa8 sp=0x7f1173239f68
runtime.goexit()
  $GOROOT/src/pkg/runtime/proc.c:1445 fp=0x7f1173239fb0 sp=0x7f1173239fa8
created by main.main
  fdb-bug.go:46 +0x23d

goroutine 16 [semacquire]:
sync.runtime_Semacquire(0xc208096e50)
  $GOROOT/src/pkg/runtime/sema.goc:199 +0x30
sync.(*WaitGroup).Wait(0xc20805a000)
  $GOROOT/src/pkg/sync/waitgroup.go:129 +0x14b
main.main()
  fdb-bug.go:47 +0x24d

goroutine 19 [finalizer wait]:
runtime.park(0x4153f0, 0x77bbd8, 0x770de9)
  $GOROOT/src/pkg/runtime/proc.c:1369 +0x89
runtime.parkunlock(0x77bbd8, 0x770de9)
  $GOROOT/src/pkg/runtime/proc.c:1385 +0x3b
runfinq()
  $GOROOT/src/pkg/runtime/mgc0.c:2644 +0xcf
runtime.goexit()
  $GOROOT/src/pkg/runtime/proc.c:1445

goroutine 17 [syscall]:
runtime.goexit()
  $GOROOT/src/pkg/runtime/proc.c:1445

goroutine 20 [syscall]:
github.com/couchbaselabs/goforestdb._Cfunc_fdb_set_kv(0x13b62c0, 0xc208073020, 0x4, 0xc208040000, 0x100, 0x425de3)
  github.com/couchbaselabs/goforestdb/_obj/_cgo_defun.c:305 +0x31
github.com/couchbaselabs/goforestdb.(*Database).SetKV(0xc20803c028, 0xc208073020, 0x4, 0x4, 0xc208040000, 0x100, 0x100, 0x0, 0x0)
  $GOPATH/src/github.com/couchbaselabs/goforestdb/kv.go:58 +0xa4
main.write(0xc20803c028, 0x1, 0x0, 0x0)
  fdb-bug.go:55 +0x215
main.func·001()
  fdb-bug.go:40 +0x58
created by main.main
  fdb-bug.go:41 +0x1f0

rax     0x0
rbx     0x8b
rcx     0xffffffffffffffff
rdx     0x6
rdi     0x74ef
rsi     0x74f2
rbp     0x7f1170e9ebe0
rsp     0x7f1170e9e848
r8      0x3030653666373930
r9      0x6f6974707572726f
r10     0x8
r11     0x202
r12     0x7f1170e9e9f0
r13     0x7
r14     0x8b
r15     0x7
rip     0x7f1172a0ad27
rflags  0x202
cs      0x33
fs      0x0
gs      0x0
exit status 2
godep: go exit status 1
dim commented 10 years ago

Here's another variation:

$ go run fdb-bug.go 
* writing
* reading
* writing
* reading
fatal error: unexpected signal during runtime execution
[signal 0xb code=0x1 addr=0x0 pc=0x7f81833f1e49]

runtime stack:
runtime: unexpected return pc for runtime.sigpanic called from 0x7f81833f1e49
runtime.throw(0x76e205)
  $GOROOT/src/pkg/runtime/panic.c:520 +0x69
runtime: unexpected return pc for runtime.sigpanic called from 0x7f81833f1e49
runtime.sigpanic()
  $GOROOT/src/pkg/runtime/os_linux.c:222 +0x3d

goroutine 21 [syscall]:
runtime.cgocall(0x402410, 0x7f818365de08)
  $GOROOT/src/pkg/runtime/cgocall.c:143 +0xe5 fp=0x7f818365ddf0 sp=0x7f818365dda8
github.com/couchbaselabs/goforestdb._Cfunc_fdb_get_kv(0x22bb2c0, 0xc20806d028, 0x4, 0xc20803c768, 0xc20806d030, 0x7f8100000001)
  github.com/couchbaselabs/goforestdb/_obj/_cgo_defun.c:179 +0x31 fp=0x7f818365de08 sp=0x7f818365ddf0
github.com/couchbaselabs/goforestdb.(*Database).GetKV(0xc20803c028, 0xc20806d028, 0x4, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0)
  $GOPATH/src/github.com/couchbaselabs/goforestdb/kv.go:32 +0xf2 fp=0x7f818365de88 sp=0x7f818365de08
main.read(0xc20803c028, 0x0, 0x0, 0x0)
  fdb-bug.go:69 +0x1ee fp=0x7f818365df68 sp=0x7f818365de88
main.func·002()
  fdb-bug.go:46 +0x58 fp=0x7f818365dfa8 sp=0x7f818365df68
runtime.goexit()
  $GOROOT/src/pkg/runtime/proc.c:1445 fp=0x7f818365dfb0 sp=0x7f818365dfa8
created by main.main
  fdb-bug.go:47 +0x23d

goroutine 16 [semacquire]:
sync.runtime_Semacquire(0xc208000fa0)
  $GOROOT/src/pkg/runtime/sema.goc:199 +0x30
sync.(*WaitGroup).Wait(0xc20805a000)
  $GOROOT/src/pkg/sync/waitgroup.go:129 +0x14b
main.main()
  fdb-bug.go:48 +0x24d

goroutine 19 [finalizer wait]:
runtime.park(0x4153f0, 0x77bbd8, 0x770de9)
  $GOROOT/src/pkg/runtime/proc.c:1369 +0x89
runtime.parkunlock(0x77bbd8, 0x770de9)
  $GOROOT/src/pkg/runtime/proc.c:1385 +0x3b
runfinq()
  $GOROOT/src/pkg/runtime/mgc0.c:2644 +0xcf
runtime.goexit()
  $GOROOT/src/pkg/runtime/proc.c:1445

goroutine 17 [syscall]:
runtime.goexit()
  $GOROOT/src/pkg/runtime/proc.c:1445

goroutine 20 [syscall]:
github.com/couchbaselabs/goforestdb._Cfunc_fdb_set_kv(0x22bb2c0, 0xc20806cfd4, 0x4, 0xc208040000, 0x100, 0x425de3)
  github.com/couchbaselabs/goforestdb/_obj/_cgo_defun.c:305 +0x31
github.com/couchbaselabs/goforestdb.(*Database).SetKV(0xc20803c028, 0xc20806cfd4, 0x4, 0x4, 0xc208040000, 0x100, 0x100, 0x0, 0x0)
  $GOPATH/src/github.com/couchbaselabs/goforestdb/kv.go:58 +0xa4
main.write(0xc20803c028, 0x1, 0x0, 0x0)
  fdb-bug.go:56 +0x215
main.func·001()
  fdb-bug.go:41 +0x58
created by main.main
  fdb-bug.go:42 +0x1f0
exit status 2
godep: go exit status 1
mschoch commented 10 years ago

Unfortunately, the ForestDB handle cannot be used by multiple threads. I did not agree with this decision, and I'm still lobbying for it to change, but for now it means that you cannot share a ForestDB instance across goroutines.

See http://www.couchbase.com/issues/browse/MB-11635

dim commented 10 years ago

Really? Are you serious? How useful is that, then? So I basically need to add locking at the application level?

Sorry, I understand it's not your choice, but that's pretty essential functionality for any key/value engine. This will rule out ForestDB as a contender for most basic use cases.

mschoch commented 10 years ago

You can safely work on the database concurrently from multiple threads, but they each must have their own ForestDB handle. For languages/platforms where you manually control threads this is not ideal, but it is manageable. In Go and other languages where the actual OS threads are opaque, this is a significant problem.

dim commented 10 years ago

Sorry again, I think I see now what you mean: I would need to open a new handle in every goroutine. Unfortunately, this is highly impractical in the case of, e.g., a Go HTTP server, which spawns a new goroutine for each connection. It would require some sort of handle pooling, right?

mschoch commented 10 years ago

Yeah, one idea, which I haven't recommended yet because I haven't tried it, is the following.

Create a buffered channel of ForestDB connections, with capacity set to GOMAXPROCS. Then when you need a connection, read one off the channel, and when you're done with it, write it back to the channel.

This should prevent two threads from using the same handle at the same time. But it won't ensure that the same thread always uses the same connection. Hopefully the first property is enough, but I haven't yet tested it.
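
A minimal sketch of the idea (assuming the fdb.Open / *fdb.Database API from these bindings, plus the runtime and log packages; variable names are made up):

handles := make(chan *fdb.Database, runtime.GOMAXPROCS(0))
for i := 0; i < cap(handles); i++ {
    db, err := fdb.Open("/tmp/test.fdb", fdb.DefaultConfig())
    if err != nil {
        log.Fatal(err)
    }
    handles <- db
}

// Check out a handle, and put it back when done:
db := <-handles
defer func() { handles <- db }()
// ... db.GetKV / db.SetKV from this goroutine only ...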

dim commented 10 years ago

Would you consider adding something like the below to the package?

type Pool struct {
  handles chan *Database
  size    int
}

func NewPool(filename string, config *Config) (*Pool, error) {
  size, _ := strconv.Atoi(os.Getenv("GOMAXPROCS"))
  if size < 1 {
    size = runtime.NumCPU()
  }
  return NewPoolWithSize(filename, config, size)
}

func NewPoolWithSize(filename string, config *Config, size int) (*Pool, error) {
  pool := &Pool{
    handles: make(chan *Database, size),
    size:    size,
  }
  for i := 0; i < size; i++ {
    handle, err := Open(filename, config)
    if err != nil {
      pool.size = i // only i handles were checked in; Close must not wait for more
      pool.Close()
      return nil, err
    }
    pool.Checkin(handle)
  }
  return pool, nil
}

func (p *Pool) Checkout() *Database  { return <-p.handles }
func (p *Pool) Checkin(db *Database) { p.handles <- db }

func (p *Pool) With(cb func(*Database)) {
  db := p.Checkout()
  defer p.Checkin(db)
  cb(db)
}

func (p *Pool) Close() (err error) {
  for i := 0; i < p.size; i++ {
    if e := p.Checkout().Close(); e != nil {
      err = e
    }
  }
  return err
}
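
For illustration, usage from an HTTP server might then look something like this (a sketch assuming the Pool above is merged into the package; the handler code is hypothetical):

pool, err := fdb.NewPool("/tmp/test.fdb", fdb.DefaultConfig())
if err != nil {
  log.Fatal(err)
}
defer pool.Close()

http.HandleFunc("/get", func(w http.ResponseWriter, r *http.Request) {
  pool.With(func(db *fdb.Database) {
    value, err := db.GetKV([]byte(r.URL.Query().Get("key")))
    if err != nil {
      http.Error(w, err.Error(), http.StatusNotFound)
      return
    }
    w.Write(value)
  })
})
log.Fatal(http.ListenAndServe(":8080", nil))
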
mschoch commented 10 years ago

Yeah, if we test it and it works, I'm fine with adding this. I'm still trying to get this fixed in ForestDB but we cannot wait for that.

tleyden commented 10 years ago

@dim what about sending a PR with that change?

dim commented 10 years ago

I would, just wanted to check if this is a valid/preferred approach in the first place. I actually tried to stress-test the code above and ran into problems. Will investigate.

tleyden commented 9 years ago

I actually tried to stress-test the code above and ran into problems. Will investigate.

@dim did you ever figure out the issues?

mschoch commented 9 years ago

I have heard from the 2i team that forestdb has concurrency issues right now. They are not using the code above, but a similar approach. Without seeing how you "ran into problems" I cannot comment further.

tleyden commented 9 years ago

Oops, I meant to quote @dim in that comment. @dim can you elaborate on the problems you ran into?

dim commented 9 years ago

The problems are 'by design': ForestDB handles are simply not thread-safe, and the authors recommend using a separate handle per thread. Thread scheduling cannot be directly controlled in Go, so there is no trivial solution. I initially suggested a rather hacky fix above, which involves using a pool of handles, but it's not something I would recommend in any production environment.

In the end I gave up on ForestDB, as I don't agree with the overall design, plus there are some very nasty code bits.

hisundar commented 9 years ago

A number of concurrency issues have been addressed in forestdb recently (April 3rd); with these fixes, the highly concurrent 2i test cases run fine. So could you please confirm whether the goforestdb repo has been refreshed to pick up these fixes?

Sharing of handles among concurrent threads makes implementing MVCC very tricky (read: slow). However, a handle itself is a very lightweight data structure of only a few bytes, so why are hacks like a pool of handles being attempted instead of cleanly creating one handle per goroutine?

Thanks and with warm regards, Sundar

mschoch commented 9 years ago

@hisundar because goroutines are not threads. You might organize your code to have a hundred thousand goroutines. Fdb handles may be lightweight enough to have one per thread, but I doubt they are lightweight enough to create a hundred thousand of them.

Further, there is no "thread-local" equivalent for easily knowing whether a particular goroutine already has its own fdb handle. This means you have to organize your code in such a way that it knows this, or else create even more than one handle per goroutine just in case.

Sharing handles among concurrent threads is what every other KV store allows you to do. Because ForestDB chooses to work differently from every other KV store in this regard, you can't just plug it in. I understand that the ForestDB team is OK with this, and that is unfortunate.

mschoch commented 9 years ago

Also, given the design of ForestDB, I don't think having pools of handles in Go is hacky at all. It's actually one of the cleaner ways to safely use ForestDB handles without creating a ridiculous number of them.

Further details about why people find this hacky are welcome...

dim commented 9 years ago

I tried it but ran into a few edge cases with go1.3 where it still panicked. I then wrote an LRU pool (based on sync.Pool) with idle-time-based expiration, which worked fine, but it was not exactly an elegant solution for something that should be relatively trivial.
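
Roughly the shape of it, for the record (a sketch only, assuming it lives inside the package next to the Pool code above and imports sync and time; handlePool, idleHandle and maxIdle are made-up names, not the code I actually ran):

type idleHandle struct {
    db       *Database
    lastUsed time.Time
}

type handlePool struct {
    pool    sync.Pool
    maxIdle time.Duration
    open    func() (*Database, error)
}

// Get returns a handle, closing and skipping any pooled handle that has
// been idle longer than maxIdle, and opening a fresh one if the pool is
// empty. Note: sync.Pool may also drop entries on GC without closing them,
// which is part of what makes this inelegant.
func (p *handlePool) Get() (*Database, error) {
    for {
        v := p.pool.Get()
        if v == nil {
            return p.open()
        }
        h := v.(*idleHandle)
        if time.Since(h.lastUsed) > p.maxIdle {
            h.db.Close() // expired: discard and try the next pooled handle
            continue
        }
        return h.db, nil
    }
}

// Put stamps the handle's last-use time and returns it to the pool.
func (p *handlePool) Put(db *Database) {
    p.pool.Put(&idleHandle{db: db, lastUsed: time.Now()})
}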

hisundar commented 9 years ago

@mschoch - To support sharing of handles among concurrent threads, forestdb would have to simply add "big locks" on nearly every API operation, as opposed to the fine-grained shared locks that allow linear scaling with an increasing number of threads. This is not a particularly big change and, depending on how much interest there is, I suppose we can enable it as a config parameter just for users like you who call it from goroutines. But given that only golang users, whose routines are inherently concurrency-unaware, will ever need such locks, it looks to me like the goforestdb wrapper is a better place to implement this than the forestdb repo itself. This is exactly what the 2i team has successfully done, and I don't see why it won't work just as well for others.

As a side note, we did not intentionally set out to be different from other KV stores; it just seemed like a natural decision not to have big locks. (Please check out the leveldb code and you can easily notice how almost every single API call begins with a mutex_.Lock(), which may even explain why they aren't even half as fast as forestdb on benchmark runs.)

mschoch commented 9 years ago

@hisundar I disagree that having "big locks" is the only approach, and I think if you look beyond just LevelDB you'll find other approaches. However, this issue is not about changing ForestDB; it's about making the goforestdb bindings work in a way that Go programmers expect.

As currently implemented, most users will do the wrong thing, because the library doesn't take care of significant details. Instead it's left up to the application (which is what 2i is doing).

I haven't recently reviewed the 2i code, but the last time I discussed it with them they agreed it was functionally equivalent to the channel based pooling above.

So what exactly are you adding to this issue?

hisundar commented 9 years ago

I did not mean to say the channel-based pooling is bad; I am just a bit concerned that someone running benchmarks via goforestdb might actually end up stress-testing the Go scheduler rather than the underlying KV store, that's all. Also, the forestdb team is definitely considering a better way of sharing handles among threads, and we have an internal ticket tracking this at https://issues.couchbase.com/browse/MB-12538.

mschoch commented 9 years ago

Obviously I'm aware of that issue, as it was filed only after I raised concerns (the body of the issue description is largely copy/pasted from an email of mine).

In my opinion we're suffering from a much worse problem, which is that people use the library, find that it crashes when they use it in what they consider "normal" ways, and then throw it away calling it garbage.

hisundar commented 9 years ago

@mschoch I understand your concern, and all I am proposing is that, until forestdb natively supports sharing file handles among concurrent callers, why not include the channel-based locking (I assume the advlock in 2i is the same thing) by default for the APIs in goforestdb? This is just to prevent the accidental misuse and abandonment you mention. I did not see this natively enabled in the goforestdb code, but please correct me if I am wrong.
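
For reference, the channel-based locking itself is only a few lines; a sketch of what I mean (made-up names, not the actual 2i code):

type advLock chan struct{}

func newAdvLock() advLock {
    l := make(advLock, 1)
    l <- struct{}{} // one token: one caller holds the handle at a time
    return l
}

func (l advLock) Lock()   { <-l }
func (l advLock) Unlock() { l <- struct{}{} }

Each goforestdb method on a shared handle would then Lock()/Unlock() around the cgo call, serializing access at the cost of concurrency on that handle.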

mschoch commented 9 years ago

At the moment the main reason was that the user who coded it up also said it didn't work. At the time that was reported, forestdb had lots of concurrency issues, so it's entirely possible that it does work now that many of those issues are fixed.

If someone wrote some test cases to at least demonstrate that it works as intended for basic cases I think we would go ahead and merge it.

hisundar commented 9 years ago

As I mentioned earlier, the secondary indexing team has a suite of functional tests in Go which, as of the April 3rd forestdb commit d78b41806b1a55b8fb6fc3361aa6f4609513ccc2, work fine. These include several goroutines that concurrently perform reads, writes, range scans, and point queries against the forestdb database. If you are looking for specific test cases to demonstrate correctness and functionality, I would recommend looking at largedatatests in the secondary indexing repo.

tleyden commented 9 years ago

@hisundar can you link to the secondary indexing code you mentioned above?

hisundar commented 9 years ago

The repo is located at https://github.com/couchbase/indexing. The high-concurrency test cases are located at https://github.com/couchbase/indexing/tree/master/secondary/tests/largedatatests.

mschoch commented 9 years ago

I'm talking about tests that exercise the code we actually plan to merge into goforestdb. Last I checked, the 2i code handling this was embedded in their application and not easily extracted. If we want to port it, fine; if we want to use the approach above, fine. But either way, we should have some tests of that code that show it does what it says...

hisundar commented 9 years ago

I assume that you are referring to the code with the channel-based locking implementation; I believe it is at https://github.com/couchbase/indexing/tree/master/secondary/fdb in the file advlock.go

mschoch commented 9 years ago

Lol, actually I didn't realize they have their own fork of the code. By itself I can't see how the one file you referenced is a working solution.

mschoch commented 9 years ago

I've finally pushed a commit to address this: https://github.com/couchbase/goforestdb/commit/a758b1d390ed8d8cf52ccd28de4cf7f3fb2b4767

If you're using a single KV store inside the ForestDB file, there is a new KVPool that you can use to create/share a pool of these instances. I chose not to use sync.Pool because we lose direct control over cleaning up resources. We could try to use runtime.SetFinalizer(), but that too comes with its own baggage.

I have a very basic test of the pool, and one test which creates concurrent reads/writes. That test failed before wiring it up to the pool, and passes after. It is loosely based on the examples above.

This may not be the final solution here, but it is a practical step forward, and one that we will be using in Bleve shortly.

mschoch commented 9 years ago

I don't seem to have permissions to close this issue; will work on that...