golang / go

The Go programming language
https://go.dev
BSD 3-Clause "New" or "Revised" License
124.08k stars 17.68k forks source link

database/sql: possible deadlock on Rows.awaitDone()/Rows.Err()/Rows.Close() after RawBytes change #64498

Open fernando-az-alpaca opened 11 months ago

fernando-az-alpaca commented 11 months ago

Go version

go version go1.21.4 linux/arm64

What operating system and processor architecture are you using (go env)?

GO111MODULE=''
GOARCH='arm64'
GOBIN=''
GOCACHE='/root/.cache/go-build'
GOENV='/root/.config/go/env'
GOEXE=''
GOEXPERIMENT=''
GOFLAGS=''
GOHOSTARCH='arm64'
GOHOSTOS='linux'
GOINSECURE=''
GOMODCACHE='/go/pkg/mod'
GONOPROXY=''
GONOSUMDB=''
GOOS='linux'
GOPATH='/go'
GOPRIVATE=''
GOPROXY='https://proxy.golang.org,direct'
GOROOT='/usr/local/go'
GOSUMDB='sum.golang.org'
GOTMPDIR=''
GOTOOLCHAIN='local'
GOTOOLDIR='/usr/local/go/pkg/tool/linux_arm64'
GOVCS=''
GOVERSION='go1.21.4'
GCCGO='gccgo'
AR='ar'
CC='gcc'
CXX='g++'
CGO_ENABLED='1'
GOMOD='/go/src/app/go.mod'
GOWORK=''
CGO_CFLAGS='-O2 -g'
CGO_CPPFLAGS=''
CGO_CXXFLAGS='-O2 -g'
CGO_FFLAGS='-O2 -g'
CGO_LDFLAGS='-O2 -g'
PKG_CONFIG='pkg-config'
GOGCCFLAGS='-fPIC -pthread -Wl,--no-gc-sections -fmessage-length=0 -
ffile-prefix-map=/tmp/go-build2162660666=/tmp/go-build -gno-record-g
cc-switches'

What did you do?

Note: whether or not this is legitimate bug or a usage error depends on the answer to the question: "after this change https://github.com/golang/go/issues/60304#issuecomment-1560205463, is it an error to call Rows.Err() after a context cancellation but before calling Rows.Close()? " Thanks in advance for taking a look and I apologize if I've made any mistkaes.

Note: a full docker-compose setup for the below code and additional required components is available at https://github.com/martonw/go-sql-deadlock.

package main

import (
    "context"
    "database/sql"
    "fmt"
    "time"

    _ "github.com/jackc/pgx/v5/stdlib"
)

func main() {

    var uuid sql.RawBytes
    ctx, ctxCancel := context.WithCancel(context.Background())
    defer ctxCancel()

    db, err := sql.Open("pgx", "postgres://test:test@localhost:5432/test")
    if err != nil {
        panic(err.Error())
    }
    defer db.Close()

    rows, err := db.QueryContext(ctx, "select id /* id is an uuid */ from test limit 1")
    if err != nil {
        panic(err.Error())

    }
    defer rows.Close()

    if rows.Next() {
        rows.Scan(&uuid)
    }

    fmt.Printf("uuid %v\n", uuid)

    ctxCancel()

    fmt.Println("sleep begin")
    time.Sleep(1 * time.Second)
    fmt.Println("sleep end")

    fmt.Printf("err %v\n", rows.Err())

    fmt.Println("successful exit")

}

What did you expect to see?

No deadlocks (assuming this pattern

defer rows.Close()
rows.Next()
rows.Scan()
ctxCancel()
rows.Err()

is legitimate - and I'll add here that this pattern is used by the gorm package - and so potentially affects a wide audience - and is not as hard to hit upon as it might seem at first glance).

What did you see instead?

The code above will sometimes (but not always) hang/deadlock.

$ ./test
uuid [100 98 97 100 48 49 55 57 45 101 49 98 52 45 52 54 98 55 45 97 99 57 53 45 99 55 49 53 57 97 102 51 98 55 57 55]
sleep begin
sleep end

(... just hangs here ...)

SIGABRT: abort
PC=0x78a3c m=0 sigcode=0

(...)

goroutine 1 [sync.RWMutex.RLock]:
runtime.gopark(0x7518a0?, 0x0?, 0xc0?, 0x60?, 0x7fdc0?)
        /usr/local/go/src/runtime/proc.go:398 +0xc8 fp=0x40001d3d20 sp=0x40001d3d00 pc=0x47cb8
runtime.goparkunlock(...)
        /usr/local/go/src/runtime/proc.go:404
runtime.semacquire1(0x4000128164, 0x10?, 0x3, 0x0, 0xa?)
        /usr/local/go/src/runtime/sema.go:160 +0x208 fp=0x40001d3d70 sp=0x40001d3d20 pc=0x59298
sync.runtime_SemacquireRWMutexR(0x40001d3e08?, 0xac?, 0x400003e030?)
        /usr/local/go/src/runtime/sema.go:82 +0x28 fp=0x40001d3db0 sp=0x40001d3d70 pc=0x73a08
sync.(*RWMutex).RLock(...)
        /usr/local/go/src/sync/rwmutex.go:71
database/sql.(*Rows).Err(0x4000128120)
        /usr/local/go/src/database/sql/sql.go:3100 +0x88 fp=0x40001d3e10 sp=0x40001d3db0 pc=0xdbbe8
main.main()
        /app/main.go:49 +0x224 fp=0x40001d3f30 sp=0x40001d3e10 pc=0x33deb4
runtime.main()
        /usr/local/go/src/runtime/proc.go:267 +0x2bc fp=0x40001d3fd0 sp=0x40001d3f30 pc=0x4788c
runtime.goexit()
        /usr/local/go/src/runtime/asm_arm64.s:1197 +0x4 fp=0x40001d3fd0 sp=0x40001d3fd0 pc=0x775c4

(...)

goroutine 13 [sync.RWMutex.Lock]:
runtime.gopark(0x7518a0?, 0x58bd0?, 0xe0?, 0x61?, 0x1c77c?)
        /usr/local/go/src/runtime/proc.go:398 +0xc8 fp=0x400003dd90 sp=0x400003dd70 pc=0x47cb8
runtime.goparkunlock(...)
        /usr/local/go/src/runtime/proc.go:404
runtime.semacquire1(0x4000128160, 0x8?, 0x3, 0x0, 0x80?)
        /usr/local/go/src/runtime/sema.go:160 +0x208 fp=0x400003dde0 sp=0x400003dd90 pc=0x59298
sync.runtime_SemacquireRWMutex(0x10?, 0xa0?, 0x1?)
        /usr/local/go/src/runtime/sema.go:87 +0x28 fp=0x400003de20 sp=0x400003dde0 pc=0x73a68
sync.(*RWMutex).Lock(0x400003dfa8?)
        /usr/local/go/src/sync/rwmutex.go:152 +0xf8 fp=0x400003de50 sp=0x400003de20 pc=0x80fc8
database/sql.(*Rows).close(0x4000128120, {0x49b960, 0x739520})
        /usr/local/go/src/database/sql/sql.go:3371 +0x88 fp=0x400003dee0 sp=0x400003de50 pc=0xdc3e8
database/sql.(*Rows).awaitDone(0x4000128120, {0x4a0df0, 0x4000068140}, {0x0, 0x0}, {0x4a0df0, 0x4000068410})
        /usr/local/go/src/database/sql/sql.go:2973 +0x1a4 fp=0x400003df80 sp=0x400003dee0 pc=0xdb694
database/sql.(*Rows).initContextClose.func1()
        /usr/local/go/src/database/sql/sql.go:2949 +0x40 fp=0x400003dfd0 sp=0x400003df80 pc=0xdb4b0
runtime.goexit()
        /usr/local/go/src/runtime/asm_arm64.s:1197 +0x4 fp=0x400003dfd0 sp=0x400003dfd0 pc=0x775c4
created by database/sql.(*Rows).initContextClose in goroutine 1
        /usr/local/go/src/database/sql/sql.go:2949 +0x180

dmitshur commented 11 months ago

CC @bradfitz, @kardianos, @kevinburke.

bradfitz commented 5 months ago

We just hit this too. 🤦‍♂️

We had a fun bug recently where some code got refactored and introduced this bug, moving the Err check to the wrong spot:

func QueryJSONRow[T any](tx *Tx, query string, args ...any) (*T, error) {
    rows, err := tx.Query(query, args...)
    if err != nil {
        return nil, err
    }
    defer rows.Close()
    if !rows.Next() {
        return nil, ErrNotFound
    }
    var j sql.RawBytes
    if err := rows.Scan(&j); err != nil {
        return nil, err
    }
    var ret T
    if err := json.Unmarshal(j, &ret); err != nil {
        return nil, err
    }
    if err := rows.Err(); err != nil {   // <------ the bug
        return nil, err
    }
    return &ret, nil
}

Note the rows.Err call while the rows are still open (no explicit Close call yet and Next has not reported false).

Unfortunately that used to always work (prior to the fix for #60304) and also mostly worked with our driver for a few days until we got unlucky goroutine timing wise, and it deadlocked:

1 @ 0x44396e 0x4568e5 0x4568b4 0x477845 0x97ad85 0x97ad60 0x19e943e 0x19a767a 0x19ca65e 0x19caba5 0x190aed0 0x190adf8 0x190af38 0x197951a 0x1979fc5 0x18e9c85 0x1e4b4da 0x1e4d66d 0x1e38aca 0x1e40405 0x1e36a0c 0x1e392e5 0x1e3569f 0x1d10cfa 0x13e2ac9 0x13e3d44 0x13e2ff4 0x12cd262 0x1d0e347 0x7a6149 0x1b591d5 0x1cc8f7b
#   0x477844    sync.runtime_SemacquireRWMutexR+0x24                                runtime/sema.go:82
#   0x97ad84    sync.(*RWMutex).RLock+0x64                                  sync/rwmutex.go:70
#   0x97ad5f    database/sql.(*Rows).Err+0x3f                                   database/sql/sql.go:3122

/cc @maisem @andrew-d