google / trillian

A transparent, highly scalable and cryptographically verifiable data store.
Apache License 2.0
3.54k stars 379 forks source link

Possible race condition in storage Rollback() v begin() #617

Closed Martin2112 closed 6 years ago

Martin2112 commented 7 years ago

Travis log here:

https://travis-ci.org/google/trillian/jobs/232450746

Extract of error below. Happens inside the Go driver but I wonder if there's a path through our code that doesn't clean up properly on error as it happens when a transaction subsequently tries to start. We should examine both pieces of code that were involved to see if there's anything suspicious.

Also watch to see if it reproduces.

==================
WARNING: DATA RACE
Write at 0x00c4203b9dc6 by goroutine 71:
  github.com/go-sql-driver/mysql.(*mysqlConn).writeCommandPacketStr()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/packets.go:425 +0x57
  github.com/go-sql-driver/mysql.(*mysqlConn).exec()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/connection.go:292 +0x6f
  github.com/go-sql-driver/mysql.(*mysqlTx).Rollback()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/transaction.go:28 +0xfd
  database/sql.(*Tx).rollback.func1()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1526 +0x6e
  database/sql.withLock()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2545 +0x76
  database/sql.(*Tx).rollback()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1527 +0x10a
  database/sql.(*Tx).awaitDone()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1446 +0x8d
Previous write at 0x00c4203b9dc6 by goroutine 59:
  github.com/go-sql-driver/mysql.(*mysqlConn).readPacket()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/packets.go:48 +0x1c8
  github.com/go-sql-driver/mysql.(*binaryRows).readRow()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/packets.go:1108 +0x7d
  github.com/go-sql-driver/mysql.(*binaryRows).Next()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/rows.go:153 +0x9c
  database/sql.(*Rows).nextLocked()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2149 +0xd1
  database/sql.(*Rows).Next.func1()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2134 +0x57
  database/sql.withLock()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2545 +0x76
  database/sql.(*Rows).Next()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2135 +0xe8
  github.com/google/trillian/storage/mysql.(*logTreeTX).DequeueLeaves()
      /home/travis/gopath/src/github.com/google/trillian/storage/mysql/log_storage.go:277 +0x64e
  github.com/google/trillian/log.Sequencer.SequenceBatch()
      /home/travis/gopath/src/github.com/google/trillian/log/sequencer.go:180 +0x36c
  github.com/google/trillian/server.(*SequencerManager).ExecutePass()
      /home/travis/gopath/src/github.com/google/trillian/server/sequencer_manager.go:81 +0x991
  github.com/google/trillian/server.(*LogOperationManager).getLogsAndExecutePass.func1()
      /home/travis/gopath/src/github.com/google/trillian/server/log_operation_manager.go:314 +0x137
Goroutine 71 (running) created at:
  database/sql.(*DB).begin()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1383 +0x38f
  database/sql.(*DB).BeginTx()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1342 +0x89
  github.com/google/trillian/storage/mysql.(*mySQLTreeStorage).beginTreeTx()
      /home/travis/gopath/src/github.com/google/trillian/storage/mysql/tree_storage.go:144 +0xaf
  github.com/google/trillian/storage/mysql.(*mySQLLogStorage).beginInternal()
      /home/travis/gopath/src/github.com/google/trillian/storage/mysql/log_storage.go:206 +0x23f
  github.com/google/trillian/storage/mysql.(*mySQLLogStorage).BeginForTree()
      /home/travis/gopath/src/github.com/google/trillian/storage/mysql/log_storage.go:227 +0x68
  github.com/google/trillian/log.Sequencer.SequenceBatch()
      /home/travis/gopath/src/github.com/google/trillian/log/sequencer.go:171 +0xa7
  github.com/google/trillian/server.(*SequencerManager).ExecutePass()
      /home/travis/gopath/src/github.com/google/trillian/server/sequencer_manager.go:81 +0x991
  github.com/google/trillian/server.(*LogOperationManager).getLogsAndExecutePass.func1()
      /home/travis/gopath/src/github.com/google/trillian/server/log_operation_manager.go:314 +0x137
Goroutine 59 (finished) created at:
  github.com/google/trillian/server.(*LogOperationManager).getLogsAndExecutePass()
      /home/travis/gopath/src/github.com/google/trillian/server/log_operation_manager.go:329 +0x6a6
  github.com/google/trillian/server.(*LogOperationManager).OperationLoop()
      /home/travis/gopath/src/github.com/google/trillian/server/log_operation_manager.go:358 +0xc4
  github.com/google/trillian/testonly/integration.NewLogEnv.func1()
      /home/travis/gopath/src/github.com/google/trillian/testonly/integration/clientserver.go:120 +0x76
==================
--- FAIL: TestInProcessCTIntegration (3.14s)
    testing.go:610: race detected during execution of test
==================
WARNING: DATA RACE
Read at 0x00c4203b9d70 by goroutine 71:
  github.com/go-sql-driver/mysql.(*mysqlConn).writeCommandPacketStr()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/packets.go:428 +0x75
  github.com/go-sql-driver/mysql.(*mysqlConn).exec()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/connection.go:292 +0x6f
  github.com/go-sql-driver/mysql.(*mysqlTx).Rollback()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/transaction.go:28 +0xfd
  database/sql.(*Tx).rollback.func1()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1526 +0x6e
  database/sql.withLock()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2545 +0x76
  database/sql.(*Tx).rollback()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1527 +0x10a
  database/sql.(*Tx).awaitDone()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1446 +0x8d
Previous write at 0x00c4203b9d70 by goroutine 59:
  github.com/go-sql-driver/mysql.(*buffer).readNext()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/buffer.go:104 +0xf5
  github.com/go-sql-driver/mysql.(*mysqlConn).readPacket()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/packets.go:31 +0x74
  github.com/go-sql-driver/mysql.(*binaryRows).readRow()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/packets.go:1108 +0x7d
  github.com/go-sql-driver/mysql.(*binaryRows).Next()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/rows.go:153 +0x9c
  database/sql.(*Rows).nextLocked()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2149 +0xd1
  database/sql.(*Rows).Next.func1()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2134 +0x57
  database/sql.withLock()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2545 +0x76
  database/sql.(*Rows).Next()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2135 +0xe8
  github.com/google/trillian/storage/mysql.(*logTreeTX).DequeueLeaves()
      /home/travis/gopath/src/github.com/google/trillian/storage/mysql/log_storage.go:277 +0x64e
  github.com/google/trillian/log.Sequencer.SequenceBatch()
      /home/travis/gopath/src/github.com/google/trillian/log/sequencer.go:180 +0x36c
  github.com/google/trillian/server.(*SequencerManager).ExecutePass()
      /home/travis/gopath/src/github.com/google/trillian/server/sequencer_manager.go:81 +0x991
  github.com/google/trillian/server.(*LogOperationManager).getLogsAndExecutePass.func1()
      /home/travis/gopath/src/github.com/google/trillian/server/log_operation_manager.go:314 +0x137
Goroutine 71 (running) created at:
  database/sql.(*DB).begin()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1383 +0x38f
  database/sql.(*DB).BeginTx()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1342 +0x89
  github.com/google/trillian/storage/mysql.(*mySQLTreeStorage).beginTreeTx()
      /home/travis/gopath/src/github.com/google/trillian/storage/mysql/tree_storage.go:144 +0xaf
  github.com/google/trillian/storage/mysql.(*mySQLLogStorage).beginInternal()
      /home/travis/gopath/src/github.com/google/trillian/storage/mysql/log_storage.go:206 +0x23f
  github.com/google/trillian/storage/mysql.(*mySQLLogStorage).BeginForTree()
      /home/travis/gopath/src/github.com/google/trillian/storage/mysql/log_storage.go:227 +0x68
  github.com/google/trillian/log.Sequencer.SequenceBatch()
      /home/travis/gopath/src/github.com/google/trillian/log/sequencer.go:171 +0xa7
  github.com/google/trillian/server.(*SequencerManager).ExecutePass()
      /home/travis/gopath/src/github.com/google/trillian/server/sequencer_manager.go:81 +0x991
  github.com/google/trillian/server.(*LogOperationManager).getLogsAndExecutePass.func1()
      /home/travis/gopath/src/github.com/google/trillian/server/log_operation_manager.go:314 +0x137
Goroutine 59 (finished) created at:
  github.com/google/trillian/server.(*LogOperationManager).getLogsAndExecutePass()
      /home/travis/gopath/src/github.com/google/trillian/server/log_operation_manager.go:329 +0x6a6
  github.com/google/trillian/server.(*LogOperationManager).OperationLoop()
      /home/travis/gopath/src/github.com/google/trillian/server/log_operation_manager.go:358 +0xc4
  github.com/google/trillian/testonly/integration.NewLogEnv.func1()
      /home/travis/gopath/src/github.com/google/trillian/testonly/integration/clientserver.go:120 +0x76
==================
--- FAIL: TestLiveLogIntegration (0.00s)
    log_integration_test.go:46: Log integration test skipped as no tree ID provided
    testing.go:610: race detected during execution of test
==================
WARNING: DATA RACE
Write at 0x00c4203b9d68 by goroutine 71:
  github.com/go-sql-driver/mysql.(*buffer).fill()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/buffer.go:59 +0xaf
  github.com/go-sql-driver/mysql.(*buffer).readNext()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/buffer.go:97 +0x1dd
  github.com/go-sql-driver/mysql.(*mysqlConn).readPacket()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/packets.go:31 +0x74
  github.com/go-sql-driver/mysql.(*mysqlConn).readResultSetHeaderPacket()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/packets.go:517 +0x50
  github.com/go-sql-driver/mysql.(*mysqlConn).exec()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/connection.go:297 +0x9d
  github.com/go-sql-driver/mysql.(*mysqlTx).Rollback()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/transaction.go:28 +0xfd
  database/sql.(*Tx).rollback.func1()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1526 +0x6e
  database/sql.withLock()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2545 +0x76
  database/sql.(*Tx).rollback()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1527 +0x10a
  database/sql.(*Tx).awaitDone()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1446 +0x8d
Previous write at 0x00c4203b9d68 by goroutine 59:
  github.com/go-sql-driver/mysql.(*buffer).readNext()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/buffer.go:103 +0xb2
  github.com/go-sql-driver/mysql.(*mysqlConn).readPacket()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/packets.go:31 +0x74
  github.com/go-sql-driver/mysql.(*binaryRows).readRow()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/packets.go:1108 +0x7d
  github.com/go-sql-driver/mysql.(*binaryRows).Next()
      /home/travis/gopath/src/github.com/go-sql-driver/mysql/rows.go:153 +0x9c
  database/sql.(*Rows).nextLocked()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2149 +0xd1
  database/sql.(*Rows).Next.func1()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2134 +0x57
  database/sql.withLock()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2545 +0x76
  database/sql.(*Rows).Next()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:2135 +0xe8
  github.com/google/trillian/storage/mysql.(*logTreeTX).DequeueLeaves()
      /home/travis/gopath/src/github.com/google/trillian/storage/mysql/log_storage.go:277 +0x64e
  github.com/google/trillian/log.Sequencer.SequenceBatch()
      /home/travis/gopath/src/github.com/google/trillian/log/sequencer.go:180 +0x36c
  github.com/google/trillian/server.(*SequencerManager).ExecutePass()
      /home/travis/gopath/src/github.com/google/trillian/server/sequencer_manager.go:81 +0x991
  github.com/google/trillian/server.(*LogOperationManager).getLogsAndExecutePass.func1()
      /home/travis/gopath/src/github.com/google/trillian/server/log_operation_manager.go:314 +0x137
Goroutine 71 (running) created at:
  database/sql.(*DB).begin()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1383 +0x38f
  database/sql.(*DB).BeginTx()
      /home/travis/.gimme/versions/go1.8.linux.amd64/src/database/sql/sql.go:1342 +0x89
  github.com/google/trillian/storage/mysql.(*mySQLTreeStorage).beginTreeTx()
      /home/travis/gopath/src/github.com/google/trillian/storage/mysql/tree_storage.go:144 +0xaf
  github.com/google/trillian/storage/mysql.(*mySQLLogStorage).beginInternal()
      /home/travis/gopath/src/github.com/google/trillian/storage/mysql/log_storage.go:206 +0x23f
  github.com/google/trillian/storage/mysql.(*mySQLLogStorage).BeginForTree()
      /home/travis/gopath/src/github.com/google/trillian/storage/mysql/log_storage.go:227 +0x68
  github.com/google/trillian/log.Sequencer.SequenceBatch()
      /home/travis/gopath/src/github.com/google/trillian/log/sequencer.go:171 +0xa7
  github.com/google/trillian/server.(*SequencerManager).ExecutePass()
      /home/travis/gopath/src/github.com/google/trillian/server/sequencer_manager.go:81 +0x991
  github.com/google/trillian/server.(*LogOperationManager).getLogsAndExecutePass.func1()
      /home/travis/gopath/src/github.com/google/trillian/server/log_operation_manager.go:314 +0x137
Goroutine 59 (finished) created at:
  github.com/google/trillian/server.(*LogOperationManager).getLogsAndExecutePass()
      /home/travis/gopath/src/github.com/google/trillian/server/log_operation_manager.go:329 +0x6a6
  github.com/google/trillian/server.(*LogOperationManager).OperationLoop()
      /home/travis/gopath/src/github.com/google/trillian/server/log_operation_manager.go:358 +0xc4
  github.com/google/trillian/testonly/integration.NewLogEnv.func1()
      /home/travis/gopath/src/github.com/google/trillian/testonly/integration/clientserver.go:120 +0x76
==================
Martin2112 commented 7 years ago

@RJPercival were you working on a fix or shall I take another look at it? We just saw it again.

RJPercival commented 7 years ago

I looked into go-sql-driver/mysql and database/sql and, from what I could tell, the rollback and read share a connection buffer but use different locks, which is the root of why this race occurs. I haven't looked at what we could do in Trillian to avoid reads and rollbacks happening concurrently though. The rollback occurs in a goroutine created in database/sql.go, so there's no controlling when that executes.

daviddrysdale commented 7 years ago

I can repro this race with a much simpler program, and (as @RJPercival says) it looks like it might be incorrect locking in the Golang database/sql code. My guesses:

So the underlying driver transaction is used with 2 different locks -- the driverConn lock and the Rows.closemu lock -- hence the race. It's comparatively rare because it needs the context to get cancelled out from underneath the transaction.

Utterly speculative change would be something like the following:

diff --git a/src/database/sql/sql.go b/src/database/sql/sql.go
index c016681fca13..08b0a415a433 100644
--- a/src/database/sql/sql.go
+++ b/src/database/sql/sql.go
@@ -2146,7 +2146,9 @@ func (rs *Rows) nextLocked() (doClose, ok bool) {
        if rs.lastcols == nil {
                rs.lastcols = make([]driver.Value, len(rs.rowsi.Columns()))
        }
-       rs.lasterr = rs.rowsi.Next(rs.lastcols)
+       withLock(rs.dc, func() {
+               rs.lasterr = rs.rowsi.Next(rs.lastcols)
+       })
        if rs.lasterr != nil {
                // Close the connection if there is a driver error.
                if rs.lasterr != io.EOF {

(but that might have lock ordering issues).

Recent golang commit 729685c1d1bb looks related but not exactly the same issue.

Smaller test program to repro the issue:

/*
   mysql -u root -e 'DROP DATABASE IF EXISTS racetest;'
   mysql -u root -e 'CREATE DATABASE racetest;'
   mysql -u root -e "GRANT ALL ON racetest.* TO 'racetest'@'localhost' IDENTIFIED BY 'zaphod';"
   mysql -u racetest -pzaphod -D racetest -e "CREATE TABLE IF NOT EXISTS MyTable( val1 VARCHAR(200), val2 BIGINT NOT NULL, val3 BIGINT NOT NULL, PRIMARY KEY(val1) );"
   mysql -u racetest -pzaphod -D racetest -e "INSERT INTO MyTable VALUES ('one', 1, -1), ('two', 2, -2), ('three', 3, -3);"
   go run -race ./dbtest.go --logtostderr
*/
package main

import (
    "context"
    "database/sql"
    "flag"
    "fmt"
    "time"

    _ "github.com/go-sql-driver/mysql"
    "github.com/golang/glog"
)

var (
    dbFlag = flag.String("db", "racetest:zaphod@tcp(127.0.0.1:3306)/racetest", "Database")
)

func main() {
    flag.Parse()
    ctx, cancel := context.WithCancel(context.Background())
    db, err := sql.Open("mysql", *dbFlag)
    if err != nil {
        glog.Exitf("failed to Open(%q): %v", *dbFlag, err)
    }
    defer db.Close()

    tx, err := db.BeginTx(ctx, nil)
    if err != nil {
        glog.Exitf("failed to BeginTx(): %v", err)
    }

    rows, err := tx.Query("SELECT val1, val2, val3 FROM MyTable")
    if err != nil {
        glog.Exitf("failed to Query(): %v", err)
    }

    count := 0
    for rows.Next() {
        count++
        var val1 string
        var val2, val3 int64
        if err := rows.Scan(&val1, &val2, &val3); err != nil {
            glog.Exitf("failed to Scan(): %v", err)
        }
        fmt.Printf("%s %d %d\n", val1, val2, val3)

        if count == 1 {
            glog.Errorf("cancel the context out from under the transaction!")
            cancel()
        }
        time.Sleep(100 * time.Millisecond)
    }
    rows.Close()
    tx.Commit()
}
daviddrysdale commented 7 years ago

cc @kardianos just in case he's interested

kardianos commented 7 years ago

Thanks. I'll look into this.

RJPercival commented 7 years ago

The fix for this is being considered for inclusion in Go 1.9, otherwise we'll get it by Go 1.10 (https://github.com/golang/go/issues/21117).

Martin2112 commented 6 years ago

Looks like this was fixed upstream.