SlyMarbo / spdy

[deprecated] A full-featured SPDY library for the Go language.
BSD 2-Clause "Simplified" License
116 stars 13 forks source link

Crashes and lockups on network error #96

Open rnapier opened 9 years ago

rnapier commented 9 years ago

If there is a network error (corrupted data at a minimum, but I believe I'm also encountering it when servers suddenly become unavailable), there are data races in shutdown(). This can lead to panics or lockups in the client. In some cases, it panics with:

panic: assignment to entry in nil map

goroutine 1598 [running]:
github.com/SlyMarbo/spdy/spdy3.(*Conn).Request(0xc823460480, 0xc8228c41c0, 0x3001630, 0xc82347c240, 0x0, 0x0, 0x0, 0x0, 0x0)
    /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/requests.go:129 +0x18d9
github.com/SlyMarbo/spdy/spdy3.(*Conn).RequestResponse(0xc823460480, 0xc8228c41c0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0)
    /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/requests.go:139 +0x294
github.com/SlyMarbo/spdy.(*Transport).RoundTrip(0xc8228cf080, 0xc8228c41c0, 0x61e635, 0x0, 0x0)
    /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/transport.go:190 +0x539
net/http.send(0xc8228c41c0, 0x1564738, 0xc8228cf080, 0xf, 0x0, 0x0)
    /usr/local/Cellar/go/1.5/libexec/src/net/http/client.go:220 +0x73e
net/http.(*Client).send(0xc8232285a0, 0xc8228c41c0, 0xf, 0x0, 0x0)
    /usr/local/Cellar/go/1.5/libexec/src/net/http/client.go:143 +0x1f8
net/http.(*Client).doFollowingRedirects(0xc8232285a0, 0xc8228c41c0, 0x621a88, 0x0, 0x0, 0x0)
    /usr/local/Cellar/go/1.5/libexec/src/net/http/client.go:380 +0x105a
net/http.(*Client).Get(0xc8232285a0, 0x56b340, 0xf, 0x0, 0x0, 0x0)
    /usr/local/Cellar/go/1.5/libexec/src/net/http/client.go:306 +0xc9
main.main.func2()
    /Users/rnapier/work/agent/src/kace/konea/cmd/kontap/client.go:29 +0x5c
created by main.main
    /Users/rnapier/work/agent/src/kace/konea/cmd/kontap/client.go:42 +0x142

In other cases, the panic is caught by a recover, but this leads to a deadlock because the send goroutine is no longer processing packets. So when the error handler tries to send a RST, the whole connection locks up. This can lead to further symptoms like Ping() never returning (which is how I discovered this issue originally).

This block in shutdown seems the primary cause of data races:

s.output = nil
s.Request = nil
s.Receiver = nil
s.header = nil
s.stop = nil

Here are a few of the conflicts that popped up in my tests:

  github.com/SlyMarbo/spdy/spdy3.(*Conn).send()
    Attempt 26:   /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/io.go:129 +0x5cc

  github.com/SlyMarbo/spdy/spdy3.(*RequestStream).ReceiveFrame.func2()
      /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/request_stream.go:193 +0x44

  github.com/SlyMarbo/spdy/spdy3.(*Conn).send()
      /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/io.go:142 +0xac6

  github.com/SlyMarbo/spdy/spdy3.(*RequestStream).ReceiveFrame.func1()
      /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/request_stream.go:183 +0x7a

To demonstrate the problem, I use the following programs:

The client and servers generally show data races immediately, and the client will generally crash within a few seconds. If you try to reuse the client for every iteration (which better matches my real use case), the Get call will hang.

// server.go
// openssl req -x509 -newkey rsa:2048 -keyout key.pem -out cert.pem -days 365 -nodes -batch

package main

import (
    "fmt"
    "log"
    "net/http"

    "github.com/SlyMarbo/spdy"
)

func httpHandler(w http.ResponseWriter, req *http.Request) {
    fmt.Fprintf(w, "SUCCESS")
}

// main registers the root handler and serves it over SPDY with TLS on
// port 10443, terminating the process if the listener fails.
func main() {
	http.HandleFunc("/", httpHandler)
	log.Printf("About to listen on 10443. Go to https://127.0.0.1:34567/")
	if err := spdy.ListenAndServeTLS(":10443", "cert.pem", "key.pem", nil); err != nil {
		log.Fatal(err)
	}
}
// client.go
package main

import (
    "fmt"
    "io/ioutil"
    "time"

    "github.com/SlyMarbo/spdy" // This adds SPDY support to net/http
)

// main hammers the SPDY endpoint with one GET every 10ms, each on its
// own goroutine, to reproduce the shutdown data races. The shared
// client matches the real use case; see the commented line for the
// per-iteration variant that crashes instead of hanging.
func main() {
	n := 0
	client := spdy.NewClient(true) // In this configuration, the Get hangs w/o providing an error

	for {
		n++
		go func(iter int) {
			// client := spdy.NewClient(true) // Using a new client every time avoids the hang in Get, making crashes more common
			fmt.Printf("Attempt %d: ", iter)
			res, err := client.Get("https://:34567/") // 10443 for direct
			fmt.Printf("...")
			if err != nil {
				fmt.Println(err)
				return
			}
			// Close the body on every exit path: the original only
			// closed it after a successful ReadAll, leaking the stream
			// whenever the read failed mid-response.
			defer res.Body.Close()

			body, err := ioutil.ReadAll(res.Body)
			if err != nil {
				fmt.Println("ERROR")
				return
			}
			fmt.Println(string(body))
		}(n)
		time.Sleep(10 * time.Millisecond)
	}
}
// proxy.go

package main

import (
    "fmt"
    "io"
    "log"
    "math/rand"
    "net"
    "os"
)

// main accepts TCP connections on :34567 and relays each one to :10443
// through proxy, corrupting roughly 1% of copied chunks.
func main() {
	const (
		port = 34567
		dest = 10443
		rate = 0.01
	)

	listener, err := net.Listen("tcp", fmt.Sprintf(":%d", port))
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Printf("http://%v -> :%d\n", listener.Addr(), dest)

	for {
		c, err := listener.Accept()
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("Connection from %s\n", c.RemoteAddr())
		go proxy(c, dest, rate)
	}
}

// proxy dials the local destination port and shuttles bytes in both
// directions through unreliableCopy, returning when the in->out
// direction finishes. Dial failure aborts the whole process, matching
// the tool's fail-fast setup code.
func proxy(in net.Conn, dest int, rate float64) {
	out, err := net.Dial("tcp", fmt.Sprintf(":%d", dest))
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	// Close both ends on return: the original leaked both connections,
	// and also left the reverse-direction goroutine blocked on a read
	// until the remote peer happened to hang up.
	defer in.Close()
	defer out.Close()

	go unreliableCopy(out, in, rate)
	unreliableCopy(in, out, rate)
	fmt.Printf("Disconnect from %s\n", in.RemoteAddr())
}

// unreliableCopy streams bytes from src to dst in 32 KiB chunks,
// passing every chunk through maybeCorrupt before writing it. It
// returns on EOF, on any read or write error, or on a short write.
func unreliableCopy(dst io.Writer, src io.Reader, rate float64) {
	chunk := make([]byte, 32*1024)
	for {
		n, readErr := src.Read(chunk)
		if n > 0 {
			data := chunk[:n]
			maybeCorrupt(data, rate)
			written, writeErr := dst.Write(data)
			if writeErr != nil || written != n {
				return
			}
		}
		// io.EOF and real errors both end the copy; the distinction
		// doesn't matter to this best-effort fuzzing relay.
		if readErr != nil {
			return
		}
	}
}

// maybeCorrupt overwrites one randomly chosen byte of buf with a random
// value, with probability rate. Zero-length buffers are left untouched:
// the original called rand.Intn(0) on an empty slice, which panics.
func maybeCorrupt(buf []byte, rate float64) {
	if len(buf) == 0 {
		return
	}
	if rand.Float64() < rate {
		fmt.Println("Corrupting")
		loc := rand.Intn(len(buf))
		val := byte(rand.Intn(0x100))
		buf[loc] = val
	}
}

I'm currently seeing this very often in the field, and my infrastructure is locking up usually within a few hours of usage. I have one client, and many servers that come and go, so network errors are reasonably common.

SlyMarbo commented 9 years ago

Are you sure you're on master? The block you mention was mostly removed recently for this reason. I'm just about to make a change that should also help.

rnapier commented 9 years ago

I'm back a few commits from HEAD. I see baf750dfc0e85d3d73672dbef6ff735f010daf5a, which should help, too. I've got test cases for my full system (not just the simplified version posted above), so I'll let you know if it fixes it.

rnapier commented 9 years ago

Still seeing data races and lockups with 7e7ce93:

==================
WARNING: DATA RACE
Write by goroutine 13:
  github.com/SlyMarbo/spdy/spdy3.(*Conn).shutdown()
      /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/shutdown.go:100 +0xa59
  github.com/SlyMarbo/spdy/spdy3.(*Conn).(github.com/SlyMarbo/spdy/spdy3.shutdown)-fm()
      /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/shutdown.go:13 +0x2d
  sync.(*Once).Do()
      /usr/local/Cellar/go/1.5/libexec/src/sync/once.go:44 +0xf6
  github.com/SlyMarbo/spdy/spdy3.(*Conn).Close()
      /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/shutdown.go:13 +0x8d
  github.com/SlyMarbo/spdy/spdy3.(*Conn).handleReadWriteError()
      /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/error_handling.go:68 +0x303
  github.com/SlyMarbo/spdy/spdy3.(*Conn).readFrames()
      /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/io.go:36 +0x1d9

Previous read by goroutine 12:
  github.com/SlyMarbo/spdy/spdy3.(*Conn).send()
      /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/io.go:142 +0xac6

Goroutine 13 (running) created at:
  github.com/SlyMarbo/spdy/spdy3.(*Conn).Run()
      /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/conn.go:199 +0xe0

Goroutine 12 (running) created at:
  github.com/SlyMarbo/spdy/spdy3.(*Conn).Run()
      /Users/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/conn.go:195 +0x73
==================

I believe I'm still getting my deadlock after receiving this error:

(spdy) 2015/08/26 13:29:58 response_stream.go:267: Encountered stream error: runtime error: invalid memory address or nil pointer dereference (runtime.errorString)
rnapier commented 9 years ago

Another related data race. I believe this is a regression from earlier versions. Since I upgraded to the latest spdy, my Linux 64-bit system is reliably deadlocking with this panic right after this data race (I don't get a deadlock on any of my other systems, but still get the race):

(spdy) 2015/08/27 11:39:17 response_stream.go:267: Encountered stream error: runtime error: invalid memory address or nil pointer dereference (runtime.errorString)

And the race on c.shutdownError:

WARNING: DATA RACE
Write by goroutine 67:
  github.com/SlyMarbo/spdy/spdy3.(*Conn).processFrame()
      /home/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/processing.go:111 +0xef6
  github.com/SlyMarbo/spdy/spdy3.(*Conn).readFrames()
      /home/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/io.go:50 +0x63e

Previous read by goroutine 68:
  github.com/SlyMarbo/spdy/spdy3.(*Conn).RequestResponse()
      /home/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/requests.go:147 +0x172
  kace/konea/pkg/koneas.(*AgentConn).RoundTrip()
      /home/rnapier/work/agent/src/kace/konea/pkg/koneas/agentconn.go:75 +0x9c
  kace/konea/pkg/koneas.(*PushListener).Run()
      /home/rnapier/work/agent/src/kace/konea/pkg/koneas/pushlistener.go:72 +0x4f4
  runtime.goexit()
      /usr/local/go/src/runtime/asm_amd64.s:2232 +0x0
  sort.doPivot()
      /usr/local/go/src/sort/sort.go:127 +0x1e1
  sort.quickSort()
      /usr/local/go/src/sort/sort.go:173 +0xbb
  sort.quickSort()
      /usr/local/go/src/sort/sort.go:177 +0x116
  sort.Sort()
      /usr/local/go/src/sort/sort.go:200 +0x88
  compress/flate.sortByFreq()
      /usr/local/go/src/compress/flate/huffman_code.go:317 +0x184
  compress/flate.(*huffmanEncoder).generate()
      /usr/local/go/src/compress/flate/huffman_code.go:289 +0x50b
  compress/flate.(*huffmanBitWriter).writeBlock()
      /usr/local/go/src/compress/flate/huffman_bit_writer.go:414 +0x7c1
  compress/flate.(*compressor).writeBlock()
      /usr/local/go/src/compress/flate/deflate.go:142 +0x1f3
  compress/flate.(*compressor).deflate()
      /usr/local/go/src/compress/flate/deflate.go:259 +0x7fc
  compress/flate.(*compressor).syncFlush()
      /usr/local/go/src/compress/flate/deflate.go:387 +0x7c
  compress/flate.(*Writer).Flush()
      /usr/local/go/src/compress/flate/deflate.go:547 +0x4e
  compress/flate.(*Writer).Reset()
      /usr/local/go/src/compress/flate/deflate.go:565 +0x185
  compress/zlib.(*Writer).Reset()
      /usr/local/go/src/compress/zlib/writer.go:80 +0xb9
  github.com/SlyMarbo/spdy/common.(*compressor).Compress()
      /home/rnapier/work/agent/src/github.com/SlyMarbo/spdy/common/compression.go:206 +0x18fa
  github.com/SlyMarbo/spdy/spdy3/frames.(*SYN_STREAM).Compress()
      /home/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/frames/syn_stream.go:32 +0xb0
  github.com/SlyMarbo/spdy/spdy3.(*Conn).send()
      /home/rnapier/work/agent/src/github.com/SlyMarbo/spdy/spdy3/io.go:129 +0x6ed