Here is a CloudWatch screenshot showing (green) IncomingRecords, and (blue & orange) OutgoingRecords for shard0 and shard1 of our stream.
This is the same stacktrace as above, but using debug/pprof/goroutine?debug=2
goroutine 5535597 [running]:
runtime/pprof.writeGoroutineStacks(0xf34060, 0xc4200f0460, 0x30, 0xc427b22bd0)
/usr/local/go/src/runtime/pprof/pprof.go:603 +0x79
runtime/pprof.writeGoroutine(0xf34060, 0xc4200f0460, 0x2, 0xc420030a90, 0x411748)
/usr/local/go/src/runtime/pprof/pprof.go:592 +0x44
runtime/pprof.(*Profile).WriteTo(0xf649a0, 0xf34060, 0xc4200f0460, 0x2, 0xc4200f0460, 0xc420030cc0)
/usr/local/go/src/runtime/pprof/pprof.go:302 +0x3b5
net/http/pprof.handler.ServeHTTP(0xc427b22a91, 0x9, 0xf39ee0, 0xc4200f0460, 0xc421310000)
/usr/local/go/src/net/http/pprof/pprof.go:209 +0x1d1
net/http/pprof.Index(0xf39ee0, 0xc4200f0460, 0xc421310000)
/usr/local/go/src/net/http/pprof/pprof.go:221 +0x1e3
net/http.HandlerFunc.ServeHTTP(0xbb26e8, 0xf39ee0, 0xc4200f0460, 0xc421310000)
/usr/local/go/src/net/http/server.go:1942 +0x44
net/http.(*ServeMux).ServeHTTP(0xf71cc0, 0xf39ee0, 0xc4200f0460, 0xc421310000)
/usr/local/go/src/net/http/server.go:2238 +0x130
net/http.serverHandler.ServeHTTP(0xc420157970, 0xf39ee0, 0xc4200f0460, 0xc421310000)
/usr/local/go/src/net/http/server.go:2568 +0x92
net/http.(*conn).serve(0xc4204a2500, 0xf3a960, 0xc4253e9b00)
/usr/local/go/src/net/http/server.go:1825 +0x612
created by net/http.(*Server).Serve
/usr/local/go/src/net/http/server.go:2668 +0x2ce
goroutine 1 [select, 91 minutes]:
github.com/rewardStyle/jester/vendor/github.com/rewardStyle/kinetic.(*Listener).Listen(0xc4202e41b0, 0xc42049a0e0)
/go/src/github.com/rewardStyle/jester/vendor/github.com/rewardStyle/kinetic/listener.go:177 +0x304
github.com/rewardStyle/jester.(*Jester).Start(0xc420074900, 0xc4200ec000, 0xf3abe0)
/go/src/github.com/rewardStyle/jester/jester.go:88 +0x4fd
main.processArgs(0xc420190140)
/go/src/github.com/rewardStyle/jester/jester_main/main.go:59 +0x89
reflect.Value.call(0xa800a0, 0xbb22f8, 0x13, 0xb878e5, 0x4, 0xc4201b3d00, 0x1, 0x1, 0xc4201b3c90, 0xb76160, ...)
/usr/local/go/src/reflect/value.go:434 +0x91f
reflect.Value.Call(0xa800a0, 0xbb22f8, 0x13, 0xc4201b3d00, 0x1, 0x1, 0x2, 0x2, 0xc420151640)
/usr/local/go/src/reflect/value.go:302 +0xa4
github.com/rewardStyle/jester/vendor/github.com/urfave/cli.HandleAction(0xa800a0, 0xbb22f8, 0xc420190140, 0x0, 0x0)
/go/src/github.com/rewardStyle/jester/vendor/github.com/urfave/cli/app.go:482 +0x18f
github.com/rewardStyle/jester/vendor/github.com/urfave/cli.(*App).Run(0xc420158480, 0xc420010190, 0x1, 0x1, 0x0, 0x0)
/go/src/github.com/rewardStyle/jester/vendor/github.com/urfave/cli/app.go:240 +0x560
main.main()
/go/src/github.com/rewardStyle/jester/jester_main/main.go:54 +0x1f7
goroutine 17 [syscall, 150 minutes, locked to thread]:
runtime.goexit()
/usr/local/go/src/runtime/asm_amd64.s:2197 +0x1
goroutine 6 [syscall, 150 minutes]:
os/signal.signal_recv(0x69d8ab)
/usr/local/go/src/runtime/sigqueue.go:116 +0x104
os/signal.loop()
/usr/local/go/src/os/signal/signal_unix.go:22 +0x22
created by os/signal.init.1
/usr/local/go/src/os/signal/signal_unix.go:28 +0x41
goroutine 7 [sleep]:
time.Sleep(0x3b9aca00)
/usr/local/go/src/runtime/time.go:59 +0xf9
github.com/rewardStyle/jester/vendor/github.com/valyala/fasthttp.init.1.func1()
/go/src/github.com/rewardStyle/jester/vendor/github.com/valyala/fasthttp/header.go:1372 +0x2a
created by github.com/rewardStyle/jester/vendor/github.com/valyala/fasthttp.init.1
/go/src/github.com/rewardStyle/jester/vendor/github.com/valyala/fasthttp/header.go:1375 +0x3a
goroutine 9 [IO wait]:
net.runtime_pollWait(0x7fe907ddcf58, 0x72, 0x0)
/usr/local/go/src/runtime/netpoll.go:164 +0x59
net.(*pollDesc).wait(0xc4201839c8, 0x72, 0x0, 0xc4255589a0)
/usr/local/go/src/net/fd_poll_runtime.go:75 +0x38
net.(*pollDesc).waitRead(0xc4201839c8, 0xffffffffffffffff, 0x0)
/usr/local/go/src/net/fd_poll_runtime.go:80 +0x34
net.(*netFD).accept(0xc420183960, 0x0, 0xf34320, 0xc4255589a0)
/usr/local/go/src/net/fd_unix.go:430 +0x1e5
net.(*TCPListener).accept(0xc42000e230, 0xc4204a2580, 0xab69c0, 0xffffffffffffffff)
/usr/local/go/src/net/tcpsock_posix.go:136 +0x2e
net.(*TCPListener).AcceptTCP(0xc42000e230, 0xc420034db8, 0xc420034dc0, 0xc420034db0)
/usr/local/go/src/net/tcpsock.go:215 +0x49
net/http.tcpKeepAliveListener.Accept(0xc42000e230, 0xbb2508, 0xc4204a2500, 0xf3aa20, 0xc4201ec120)
/usr/local/go/src/net/http/server.go:3044 +0x2f
net/http.(*Server).Serve(0xc420157970, 0xf3a360, 0xc42000e230, 0x0, 0x0)
/usr/local/go/src/net/http/server.go:2643 +0x228
net/http.(*Server).ListenAndServe(0xc420157970, 0xc420157970, 0x0)
/usr/local/go/src/net/http/server.go:2585 +0xb0
net/http.ListenAndServe(0xb90964, 0xe, 0x0, 0x0, 0x0, 0x0)
/usr/local/go/src/net/http/server.go:2787 +0x7f
main.main.func1()
/go/src/github.com/rewardStyle/jester/jester_main/main.go:35 +0x5a
created by main.main
/go/src/github.com/rewardStyle/jester/jester_main/main.go:38 +0x8f
goroutine 29 [select, 150 minutes]:
github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.(*eventDeouncer).flusher(0xc420015270)
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/events.go:40 +0x136
created by github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.newEventDeouncer
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/events.go:28 +0x10e
goroutine 35 [IO wait]:
net.runtime_pollWait(0x7fe907ddd018, 0x72, 0x7)
/usr/local/go/src/runtime/netpoll.go:164 +0x59
net.(*pollDesc).wait(0xc4200506f8, 0x72, 0xf360e0, 0xf30708)
/usr/local/go/src/net/fd_poll_runtime.go:75 +0x38
net.(*pollDesc).waitRead(0xc4200506f8, 0xc420218000, 0x2000)
/usr/local/go/src/net/fd_poll_runtime.go:80 +0x34
net.(*netFD).Read(0xc420050690, 0xc420218000, 0x2000, 0x2000, 0x0, 0xf360e0, 0xf30708)
/usr/local/go/src/net/fd_unix.go:250 +0x1b7
net.(*conn).Read(0xc42000e0f8, 0xc420218000, 0x2000, 0x2000, 0x0, 0x0, 0x0)
/usr/local/go/src/net/net.go:181 +0x70
crypto/tls.(*block).readFromUntil(0xc4201bb140, 0x7fe907ddd440, 0xc42000e0f8, 0x5, 0xc42000e0f8, 0xc420016050)
/usr/local/go/src/crypto/tls/conn.go:488 +0x98
crypto/tls.(*Conn).readRecord(0xc4201c8a80, 0xbb2c17, 0xc4201c8ba0, 0x5)
/usr/local/go/src/crypto/tls/conn.go:590 +0xc4
crypto/tls.(*Conn).Read(0xc4201c8a80, 0xc4201b0000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
/usr/local/go/src/crypto/tls/conn.go:1134 +0x11d
bufio.(*Reader).Read(0xc4201f4300, 0xc4200f0108, 0x1, 0x9, 0xc42020fc48, 0x568b17, 0xc420050690)
/usr/local/go/src/bufio/bufio.go:213 +0x312
io.ReadAtLeast(0xf320e0, 0xc4201f4300, 0xc4200f0108, 0x1, 0x9, 0x1, 0x30, 0x28, 0xb471a0)
/usr/local/go/src/io/io.go:307 +0xa9
io.ReadFull(0xf320e0, 0xc4201f4300, 0xc4200f0108, 0x1, 0x9, 0xc42023a000, 0x7fe907e76000, 0x0)
/usr/local/go/src/io/io.go:325 +0x58
github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.readHeader(0xf320e0, 0xc4201f4300, 0xc4200f0108, 0x9, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, ...)
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/frame.go:360 +0xb6
github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.(*Conn).recv(0xc4200f00e0, 0x0, 0x0)
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/conn.go:443 +0xba
github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.(*Conn).serve(0xc4200f00e0)
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/conn.go:416 +0x2b
created by github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.Connect
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/conn.go:247 +0x7df
goroutine 30 [select, 150 minutes]:
github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.(*eventDeouncer).flusher(0xc4200152c0)
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/events.go:40 +0x136
created by github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.newEventDeouncer
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/events.go:28 +0x10e
goroutine 40 [IO wait, 91 minutes]:
net.runtime_pollWait(0x7fe907ddce98, 0x72, 0x8)
/usr/local/go/src/runtime/netpoll.go:164 +0x59
net.(*pollDesc).wait(0xc420050a78, 0x72, 0xf360e0, 0xf30708)
/usr/local/go/src/net/fd_poll_runtime.go:75 +0x38
net.(*pollDesc).waitRead(0xc420050a78, 0xc42018e400, 0x400)
/usr/local/go/src/net/fd_poll_runtime.go:80 +0x34
net.(*netFD).Read(0xc420050a10, 0xc42018e400, 0x400, 0x400, 0x0, 0xf360e0, 0xf30708)
/usr/local/go/src/net/fd_unix.go:250 +0x1b7
net.(*conn).Read(0xc42000e110, 0xc42018e400, 0x400, 0x400, 0x0, 0x0, 0x0)
/usr/local/go/src/net/net.go:181 +0x70
crypto/tls.(*block).readFromUntil(0xc4201bbaa0, 0x7fe907ddd440, 0xc42000e110, 0x5, 0xc42000e110, 0xc420016050)
/usr/local/go/src/crypto/tls/conn.go:488 +0x98
crypto/tls.(*Conn).readRecord(0xc42033c380, 0xbb2c17, 0xc42033c4a0, 0x7d)
/usr/local/go/src/crypto/tls/conn.go:590 +0xc4
crypto/tls.(*Conn).Read(0xc42033c380, 0xc4201fe000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
/usr/local/go/src/crypto/tls/conn.go:1134 +0x11d
bufio.(*Reader).Read(0xc4201f4ea0, 0xc4200f02c8, 0x1, 0x9, 0xc42004bc48, 0x568b17, 0xc420050a10)
/usr/local/go/src/bufio/bufio.go:213 +0x312
io.ReadAtLeast(0xf320e0, 0xc4201f4ea0, 0xc4200f02c8, 0x1, 0x9, 0x1, 0x30, 0x28, 0xb471a0)
/usr/local/go/src/io/io.go:307 +0xa9
io.ReadFull(0xf320e0, 0xc4201f4ea0, 0xc4200f02c8, 0x1, 0x9, 0xc420119800, 0x7fe907e76000, 0x0)
/usr/local/go/src/io/io.go:325 +0x58
github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.readHeader(0xf320e0, 0xc4201f4ea0, 0xc4200f02c8, 0x9, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, ...)
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/frame.go:360 +0xb6
github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.(*Conn).recv(0xc4200f02a0, 0x0, 0x0)
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/conn.go:443 +0xba
github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.(*Conn).serve(0xc4200f02a0)
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/conn.go:416 +0x2b
created by github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.Connect
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/conn.go:247 +0x7df
goroutine 59 [IO wait]:
net.runtime_pollWait(0x7fe907ddcdd8, 0x72, 0x3)
/usr/local/go/src/runtime/netpoll.go:164 +0x59
net.(*pollDesc).wait(0xc421e351e8, 0x72, 0xf360e0, 0xf30708)
/usr/local/go/src/net/fd_poll_runtime.go:75 +0x38
net.(*pollDesc).waitRead(0xc421e351e8, 0xc4287a4fde, 0x5022)
/usr/local/go/src/net/fd_poll_runtime.go:80 +0x34
net.(*netFD).Read(0xc421e35180, 0xc4287a4fde, 0x5022, 0x5022, 0x0, 0xf360e0, 0xf30708)
/usr/local/go/src/net/fd_unix.go:250 +0x1b7
net.(*conn).Read(0xc426c39498, 0xc4287a4fde, 0x5022, 0x5022, 0x0, 0x0, 0x0)
/usr/local/go/src/net/net.go:181 +0x70
crypto/tls.(*block).readFromUntil(0xc4277518c0, 0x7fe907ddd440, 0xc426c39498, 0x4035, 0xc426c39498, 0x0)
/usr/local/go/src/crypto/tls/conn.go:488 +0x98
crypto/tls.(*Conn).readRecord(0xc4201b2380, 0xbb2c17, 0xc4201b24a0, 0x0)
/usr/local/go/src/crypto/tls/conn.go:635 +0x1fa
crypto/tls.(*Conn).Read(0xc4201b2380, 0xc42bb5cec4, 0x1a1b96, 0x216f3c, 0x0, 0x0, 0x0)
/usr/local/go/src/crypto/tls/conn.go:1134 +0x11d
net/http.(*persistConn).Read(0xc425fe9560, 0xc42bb5cec4, 0x1a1b96, 0x216f3c, 0x400, 0xc4207e19a8, 0x42b4e6)
/usr/local/go/src/net/http/transport.go:1316 +0x14b
bufio.(*Reader).Read(0xc425b561e0, 0xc42bb5cec4, 0x1a1b96, 0x216f3c, 0x0, 0x0, 0x0)
/usr/local/go/src/bufio/bufio.go:199 +0x198
io.(*LimitedReader).Read(0xc426dc6300, 0xc42bb5cec4, 0x216f3c, 0x216f3c, 0xc42811da00, 0x0, 0x67bb7d)
/usr/local/go/src/io/io.go:436 +0x6c
net/http.(*body).readLocked(0xc4285e2c40, 0xc42bb5cec4, 0x216f3c, 0x216f3c, 0xc420001a00, 0x0, 0xc4221a9900)
/usr/local/go/src/net/http/transfer.go:761 +0x61
net/http.(*body).Read(0xc4285e2c40, 0xc42bb5cec4, 0x216f3c, 0x216f3c, 0x0, 0x0, 0x0)
/usr/local/go/src/net/http/transfer.go:753 +0xfd
net/http.(*bodyEOFSignal).Read(0xc4285e2c80, 0xc42bb5cec4, 0x216f3c, 0x216f3c, 0x0, 0x0, 0x0)
/usr/local/go/src/net/http/transport.go:2035 +0xe9
bytes.(*Buffer).ReadFrom(0xc4207e1b58, 0xf33d20, 0xc4285e2c80, 0xc423dd6000, 0x0, 0x200)
/usr/local/go/src/bytes/buffer.go:179 +0x160
io/ioutil.readAll(0xf33d20, 0xc4285e2c80, 0x200, 0x0, 0x0, 0x0, 0x0, 0x0)
/usr/local/go/src/io/ioutil/ioutil.go:33 +0x150
io/ioutil.ReadAll(0xf33d20, 0xc4285e2c80, 0xc4207e1c70, 0x82ca65, 0xc4207e1c70, 0x40efc8, 0xad4ba0)
/usr/local/go/src/io/ioutil/ioutil.go:42 +0x3e
github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/private/protocol/json/jsonutil.UnmarshalJSON(0xafb9a0, 0xc427e1c0f0, 0xf33d20, 0xc4285e2c80, 0xc4285e2c80, 0x839e91)
/go/src/github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/private/protocol/json/jsonutil/unmarshal.go:17 +0x62
github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/private/protocol/jsonrpc.Unmarshal(0xc4201c8000)
/go/src/github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/private/protocol/jsonrpc/jsonrpc.go:65 +0x10f
github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/aws/request.(*HandlerList).Run(0xc4201c81c8, 0xc4201c8000)
/go/src/github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/aws/request/handlers.go:136 +0x87
github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/aws/request.(*Request).Send(0xc4201c8000, 0xc42a4c11e0, 0xc4201c8000)
/go/src/github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/aws/request/request.go:405 +0x435
github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/service/kinesis.(*Kinesis).GetRecords(0xc42000e040, 0xc42a4c11e0, 0xc4201f7f98, 0x0, 0x0)
/go/src/github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/service/kinesis/api.go:861 +0x4d
github.com/rewardStyle/jester/vendor/github.com/rewardStyle/kinetic.(*Listener).consume(0xc4202e41b0)
/go/src/github.com/rewardStyle/jester/vendor/github.com/rewardStyle/kinetic/listener.go:218 +0x1ec
created by github.com/rewardStyle/jester/vendor/github.com/rewardStyle/kinetic.(*Listener).init
/go/src/github.com/rewardStyle/jester/vendor/github.com/rewardStyle/kinetic/listener.go:86 +0x438
goroutine 42 [select]:
github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.(*controlConn).heartBeat(0xc420122ac0)
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/control.go:61 +0x2bb
created by github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.(*controlConn).connect
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/control.go:166 +0x268
goroutine 60 [chan receive]:
github.com/rewardStyle/jester/vendor/github.com/rcrowley/go-metrics.(*meterArbiter).tick(0xf71ec0)
/go/src/github.com/rewardStyle/jester/vendor/github.com/rcrowley/go-metrics/meter.go:221 +0x4c
created by github.com/rewardStyle/jester/vendor/github.com/rcrowley/go-metrics.NewMeter
/go/src/github.com/rewardStyle/jester/vendor/github.com/rcrowley/go-metrics/meter.go:40 +0x127
goroutine 58 [IO wait, 91 minutes]:
net.runtime_pollWait(0x7fe907ddcd18, 0x72, 0x9)
/usr/local/go/src/runtime/netpoll.go:164 +0x59
net.(*pollDesc).wait(0xc420050d88, 0x72, 0xf360e0, 0xf30708)
/usr/local/go/src/net/fd_poll_runtime.go:75 +0x38
net.(*pollDesc).waitRead(0xc420050d88, 0xc42018f000, 0x400)
/usr/local/go/src/net/fd_poll_runtime.go:80 +0x34
net.(*netFD).Read(0xc420050d20, 0xc42018f000, 0x400, 0x400, 0x0, 0xf360e0, 0xf30708)
/usr/local/go/src/net/fd_unix.go:250 +0x1b7
net.(*conn).Read(0xc42000e130, 0xc42018f000, 0x400, 0x400, 0x0, 0x0, 0x0)
/usr/local/go/src/net/net.go:181 +0x70
crypto/tls.(*block).readFromUntil(0xc4201bbe30, 0x7fe907ddd440, 0xc42000e130, 0x5, 0xc42000e130, 0xc420016050)
/usr/local/go/src/crypto/tls/conn.go:488 +0x98
crypto/tls.(*Conn).readRecord(0xc4201b2a80, 0xbb2c17, 0xc4201b2ba0, 0x7e)
/usr/local/go/src/crypto/tls/conn.go:590 +0xc4
crypto/tls.(*Conn).Read(0xc4201b2a80, 0xc420208000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
/usr/local/go/src/crypto/tls/conn.go:1134 +0x11d
bufio.(*Reader).Read(0xc4201f5a40, 0xc4200f03a8, 0x1, 0x9, 0xc420047c48, 0x568b17, 0xc420050d20)
/usr/local/go/src/bufio/bufio.go:213 +0x312
io.ReadAtLeast(0xf320e0, 0xc4201f5a40, 0xc4200f03a8, 0x1, 0x9, 0x1, 0x30, 0x28, 0xb471a0)
/usr/local/go/src/io/io.go:307 +0xa9
io.ReadFull(0xf320e0, 0xc4201f5a40, 0xc4200f03a8, 0x1, 0x9, 0xc420119800, 0x7fe907e76000, 0x0)
/usr/local/go/src/io/io.go:325 +0x58
github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.readHeader(0xf320e0, 0xc4201f5a40, 0xc4200f03a8, 0x9, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, ...)
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/frame.go:360 +0xb6
github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.(*Conn).recv(0xc4200f0380, 0x0, 0x0)
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/conn.go:443 +0xba
github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.(*Conn).serve(0xc4200f0380)
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/conn.go:416 +0x2b
created by github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.Connect
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/conn.go:247 +0x7df
goroutine 61 [IO wait, 150 minutes]:
net.runtime_pollWait(0x7fe907ddcb98, 0x72, 0x0)
/usr/local/go/src/runtime/netpoll.go:164 +0x59
net.(*pollDesc).wait(0xc4200f33a8, 0x72, 0x0, 0xc42049c940)
/usr/local/go/src/net/fd_poll_runtime.go:75 +0x38
net.(*pollDesc).waitRead(0xc4200f33a8, 0xffffffffffffffff, 0x0)
/usr/local/go/src/net/fd_poll_runtime.go:80 +0x34
net.(*netFD).accept(0xc4200f3340, 0x0, 0xf34320, 0xc42049c940)
/usr/local/go/src/net/fd_unix.go:430 +0x1e5
net.(*TCPListener).accept(0xc42000e1f0, 0xc4201bb7a0, 0xc4201fde78, 0x55256d)
/usr/local/go/src/net/tcpsock_posix.go:136 +0x2e
net.(*TCPListener).AcceptTCP(0xc42000e1f0, 0x675293, 0xc4201fde90, 0xc4201fde88)
/usr/local/go/src/net/tcpsock.go:215 +0x49
net/http.tcpKeepAliveListener.Accept(0xc42000e1f0, 0xc4201bb770, 0xabd9a0, 0xf61330, 0xb1f7e0)
/usr/local/go/src/net/http/server.go:3044 +0x2f
net/http.(*Server).Serve(0xc4202834a0, 0xf3a360, 0xc42000e1f0, 0x0, 0x0)
/usr/local/go/src/net/http/server.go:2643 +0x228
net/http.(*Server).ListenAndServe(0xc4202834a0, 0xc4202834a0, 0x0)
/usr/local/go/src/net/http/server.go:2585 +0xb0
net/http.ListenAndServe(0xc42049a0c0, 0x5, 0xf33120, 0xc420494230, 0x5, 0xae02c0)
/usr/local/go/src/net/http/server.go:2787 +0x7f
created by github.com/rewardStyle/jester.(*Jester).Start
/go/src/github.com/rewardStyle/jester/jester.go:81 +0x48e
goroutine 48 [select]:
github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.(*Session).reconnectDownedHosts(0xc420100280, 0xdf8475800)
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/session.go:220 +0x259
created by github.com/rewardStyle/jester/vendor/github.com/gocql/gocql.NewSession
/go/src/github.com/rewardStyle/jester/vendor/github.com/gocql/gocql/session.go:190 +0xb23
goroutine 49 [select, 150 minutes, locked to thread]:
runtime.gopark(0xbb2b00, 0x0, 0xb89ecd, 0x6, 0x18, 0x2)
/usr/local/go/src/runtime/proc.go:271 +0x13a
runtime.selectgoImpl(0xc4201fcf50, 0x0, 0x18)
/usr/local/go/src/runtime/select.go:423 +0x1364
runtime.selectgo(0xc4201fcf50)
/usr/local/go/src/runtime/select.go:238 +0x1c
runtime.ensureSigM.func1()
/usr/local/go/src/runtime/signal_unix.go:434 +0x2dd
runtime.goexit()
/usr/local/go/src/runtime/asm_amd64.s:2197 +0x1
goroutine 5535598 [runnable]:
net/http.(*connReader).backgroundRead(0xc4253e9b40)
/usr/local/go/src/net/http/server.go:655
created by net/http.(*connReader).startBackgroundRead
/usr/local/go/src/net/http/server.go:652 +0xdf
goroutine 5425964 [select, 92 minutes]:
net/http.(*persistConn).readLoop(0xc425fe9560)
/usr/local/go/src/net/http/transport.go:1599 +0x9ec
created by net/http.(*Transport).dialConn
/usr/local/go/src/net/http/transport.go:1117 +0xa35
goroutine 5425965 [select, 92 minutes]:
net/http.(*persistConn).writeLoop(0xc425fe9560)
/usr/local/go/src/net/http/transport.go:1704 +0x43a
created by net/http.(*Transport).dialConn
/usr/local/go/src/net/http/transport.go:1118 +0xa5a
You can tell that our Listen function is waiting in a select call for further messages to arrive on the channels. Focusing in on the goroutine that contains the GetRecords call:
1 @ 0x42e0aa 0x4291c7 0x428809 0x568678 0x5686e4 0x569e37 0x57d210 0x5c3498 0x5c3b3a 0x5c770d 0x684b8b 0x51e8a8 0x4635ac 0x67bca1 0x67bbed 0x688989 0x46e210 0x4d0d90 0x4d0e6e 0x8969c2 0x89954f 0x82b387 0x82e045 0x89dffd 0x8b4c6c 0x45afe1
# 0x428808 net.runtime_pollWait+0x58 /usr/local/go/src/runtime/netpoll.go:164
# 0x568677 net.(*pollDesc).wait+0x37 /usr/local/go/src/net/fd_poll_runtime.go:75
# 0x5686e3 net.(*pollDesc).waitRead+0x33 /usr/local/go/src/net/fd_poll_runtime.go:80
# 0x569e36 net.(*netFD).Read+0x1b6 /usr/local/go/src/net/fd_unix.go:250
# 0x57d20f net.(*conn).Read+0x6f /usr/local/go/src/net/net.go:181
# 0x5c3497 crypto/tls.(*block).readFromUntil+0x97 /usr/local/go/src/crypto/tls/conn.go:488
# 0x5c3b39 crypto/tls.(*Conn).readRecord+0x1f9 /usr/local/go/src/crypto/tls/conn.go:635
# 0x5c770c crypto/tls.(*Conn).Read+0x11c /usr/local/go/src/crypto/tls/conn.go:1134
# 0x684b8a net/http.(*persistConn).Read+0x14a /usr/local/go/src/net/http/transport.go:1316
# 0x51e8a7 bufio.(*Reader).Read+0x197 /usr/local/go/src/bufio/bufio.go:199
# 0x4635ab io.(*LimitedReader).Read+0x6b /usr/local/go/src/io/io.go:436
# 0x67bca0 net/http.(*body).readLocked+0x60 /usr/local/go/src/net/http/transfer.go:761
# 0x67bbec net/http.(*body).Read+0xfc /usr/local/go/src/net/http/transfer.go:753
# 0x688988 net/http.(*bodyEOFSignal).Read+0xe8 /usr/local/go/src/net/http/transport.go:2035
# 0x46e20f bytes.(*Buffer).ReadFrom+0x15f /usr/local/go/src/bytes/buffer.go:179
# 0x4d0d8f io/ioutil.readAll+0x14f /usr/local/go/src/io/ioutil/ioutil.go:33
# 0x4d0e6d io/ioutil.ReadAll+0x3d /usr/local/go/src/io/ioutil/ioutil.go:42
# 0x8969c1 github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/private/protocol/json/jsonutil.UnmarshalJSON+0x61 /go/src/github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/private/protocol/json/jsonutil/unmarshal.go:17
# 0x89954e github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/private/protocol/jsonrpc.Unmarshal+0x10e /go/src/github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/private/protocol/jsonrpc/jsonrpc.go:65
# 0x82b386 github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/aws/request.(*HandlerList).Run+0x86 /go/src/github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/aws/request/handlers.go:136
# 0x82e044 github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/aws/request.(*Request).Send+0x434 /go/src/github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/aws/request/request.go:405
# 0x89dffc github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/service/kinesis.(*Kinesis).GetRecords+0x4c /go/src/github.com/rewardStyle/jester/vendor/github.com/aws/aws-sdk-go/service/kinesis/api.go:861
# 0x8b4c6b github.com/rewardStyle/jester/vendor/github.com/rewardStyle/kinetic.(*Listener).consume+0x1eb /go/src/github.com/rewardStyle/jester/vendor/github.com/rewardStyle/kinetic/listener.go:218
We can see that the trace through the SDK code is:
https://github.com/aws/aws-sdk-go/blob/5b99715ae2945a2434a2371f4e6c5542e839a32d/aws/request/request.go#L405
https://github.com/aws/aws-sdk-go/blob/5b99715ae2945a2434a2371f4e6c5542e839a32d/aws/request/handlers.go#L136
https://github.com/aws/aws-sdk-go/blob/5b99715ae2945a2434a2371f4e6c5542e839a32d/private/protocol/jsonrpc/jsonrpc.go#L65
https://github.com/aws/aws-sdk-go/blob/5b99715ae2945a2434a2371f4e6c5542e839a32d/private/protocol/json/jsonutil/unmarshal.go#L17
From the trace, we can see that we are blocked in IO wait on an ioutil.ReadAll(stream) -- prior to the actual json.Unmarshal() call.
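In simplified form, the unmarshal step we are parked in looks roughly like the following (a loose paraphrase of the jsonutil code linked above, not a verbatim copy): the whole body is buffered before any decoding happens, so a stalled connection blocks inside ReadAll rather than in the JSON decoder.

    // Loose paraphrase of the SDK's jsonutil.UnmarshalJSON linked above.
    // ioutil.ReadAll keeps reading until EOF or an error, so if the server
    // stops sending mid-body, the goroutine parks here in IO wait.
    func unmarshalJSON(v interface{}, stream io.Reader) error {
        b, err := ioutil.ReadAll(stream) // <- this is where we are blocked
        if err != nil {
            return err
        }
        if len(b) == 0 {
            return nil
        }
        return json.Unmarshal(b, v) // decoding only starts once the body is fully read
    }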
Note that a netstat shows the connection to Kinesis as ESTABLISHED:
/opt/jester # netstat -an
Active Internet connections (servers and established)
Proto Recv-Q Send-Q Local Address Foreign Address State
tcp 0 0 127.0.0.1:6060 0.0.0.0:* LISTEN
tcp 0 0 172.17.0.2:36490 172.16.10.124:9042 ESTABLISHED
tcp 0 0 172.17.0.2:45670 54.240.251.200:443 ESTABLISHED
tcp 0 0 172.17.0.2:36486 172.16.10.124:9042 ESTABLISHED
tcp 0 0 172.17.0.2:36488 172.16.10.124:9042 ESTABLISHED
tcp 0 0 :::8090 :::* LISTEN
Active UNIX domain sockets (servers and established)
Proto RefCnt Flags Type State I-Node Path
Furthermore, if I tcpkill this socket (from the ECS host, since I can't find tcpkill for alpine), the process immediately becomes unblocked.
% sudo tcpkill -i eth0 host 54.240.251.200
Below is a sample payload that we're sending through Kinesis:
{
"Data": {
"data_type": "xxxxxxxxxxx",
"data": {
"user_agent": "Go-http-client/1.1",
"ts": "2017-03-21T20:34:13.57810677Z",
"ip": "xxxxxxxxxxxxxxxxxx",
"referer": "",
"fp": "stresstest",
"cookie_val": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"ingestion_id": "bef4fab8-0e75-11e7-8061-db9afaebe7e0",
"extra": {
"height": "xxxx",
"title": "xxxxxxxx",
"ts": "2016-11-15T20:34:31.990481Z",
"width": "xxxx",
"account_id": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"document_referrer": "https%3A%2F%2Fxxxxxxxxxxxxxx%2F",
"lang": "xxxxxx",
"orientation": "xxxxx",
"xxxxxxxxxx": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
}
},
"version": "0.0.1",
"ts": "2017-03-21T20:34:13.57810677Z"
},
"PartitionKey": "2be849cae2eaa6dbf949d615b65bf33a",
"ApproximateArrivalTimestamp": 1490128453.699,
"SequenceNumber": "49571551843860662170063845371697159861318589635654320130"
}
% wc foo
31 55 1037 foo
We are in the process of instrumenting the code as recommended here: https://github.com/aws/aws-sdk-go/issues/301#issuecomment-231491380
Will report more data when I have it.
As best as I can tell, we're experiencing an unexpected hang-up (or otherwise a lack of response) from the Kinesis service. The implementations in the SDK (using ioutil.ReadAll()) and in sendgridlabs/go-kinesis (using http.Client.Do()) do not provide any timeouts, and it is unclear to me (especially now that that one locked reader has unwound) whether TCP keepalives are still going back and forth.
Related reading: https://medium.com/@nate510/don-t-use-go-s-default-http-client-4804cb19f779#.5y9m4d68c
I tried a tcpdump to capture zero-length packets. Not sure how accurate this is:
% tcpdump -pni eth0 -v "tcp port 443 and ( tcp[tcpflags] & tcp-ack != 0 and ( (ip[2:2] - ((ip[0]&0xf)<<2) ) - ((tcp[12]&0xf0)>>2) ) == 0 ) "
tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes
10:29:35.243707 IP (tos 0x0, ttl 255, id 61938, offset 0, flags [DF], proto TCP (6), length 40)
172.17.0.2.45670 > 54.240.251.200.443: Flags [.], cksum 0xdee6 (incorrect -> 0xc333), ack 1567741584, win 8646, length 0
10:29:35.244699 IP (tos 0x0, ttl 230, id 44126, offset 0, flags [DF], proto TCP (6), length 40)
54.240.251.200.443 > 172.17.0.2.45670: Flags [.], cksum 0x0c66 (correct), ack 1, win 432, length 0
10:30:05.323713 IP (tos 0x0, ttl 255, id 61939, offset 0, flags [DF], proto TCP (6), length 40)
172.17.0.2.45670 > 54.240.251.200.443: Flags [.], cksum 0xdee6 (incorrect -> 0xc333), ack 1, win 8646, length 0
10:30:05.324720 IP (tos 0x0, ttl 230, id 44127, offset 0, flags [DF], proto TCP (6), length 40)
54.240.251.200.443 > 172.17.0.2.45670: Flags [.], cksum 0x0c66 (correct), ack 1, win 432, length 0
10:30:35.403718 IP (tos 0x0, ttl 255, id 61940, offset 0, flags [DF], proto TCP (6), length 40)
172.17.0.2.45670 > 54.240.251.200.443: Flags [.], cksum 0xdee6 (incorrect -> 0xc333), ack 1, win 8646, length 0
10:30:35.404711 IP (tos 0x0, ttl 230, id 44128, offset 0, flags [DF], proto TCP (6), length 40)
54.240.251.200.443 > 172.17.0.2.45670: Flags [.], cksum 0x0c66 (correct), ack 1, win 432, length 0
10:30:35.423818 IP (tos 0x0, ttl 255, id 61941, offset 0, flags [DF], proto TCP (6), length 40)
172.17.0.2.45670 > 54.240.251.200.443: Flags [.], cksum 0xdee6 (incorrect -> 0xbd67), ack 1461, win 8669, length 0
10:30:35.424860 IP (tos 0x0, ttl 255, id 61942, offset 0, flags [DF], proto TCP (6), length 40)
172.17.0.2.45670 > 54.240.251.200.443: Flags [.], cksum 0xdee6 (incorrect -> 0xbd50), ack 1461, win 8692, length 0
10:30:35.424989 IP (tos 0x0, ttl 255, id 61943, offset 0, flags [DF], proto TCP (6), length 40)
172.17.0.2.45670 > 54.240.251.200.443: Flags [.], cksum 0xdee6 (incorrect -> 0xbd39), ack 1461, win 8715, length 0
10:31:05.483720 IP (tos 0x0, ttl 255, id 61944, offset 0, flags [DF], proto TCP (6), length 40)
172.17.0.2.45670 > 54.240.251.200.443: Flags [.], cksum 0xdee6 (incorrect -> 0xbd3a), ack 1461, win 8715, length 0
10:31:05.484715 IP (tos 0x0, ttl 230, id 44132, offset 0, flags [DF], proto TCP (6), length 40)
54.240.251.200.443 > 172.17.0.2.45670: Flags [.], cksum 0x00fe (correct), ack 1, win 432, length 0
10:31:35.563707 IP (tos 0x0, ttl 255, id 61945, offset 0, flags [DF], proto TCP (6), length 40)
172.17.0.2.45670 > 54.240.251.200.443: Flags [.], cksum 0xdee6 (incorrect -> 0xbd3a), ack 1461, win 8715, length 0
10:31:35.564718 IP (tos 0x0, ttl 230, id 44133, offset 0, flags [DF], proto TCP (6), length 40)
54.240.251.200.443 > 172.17.0.2.45670: Flags [.], cksum 0x00fe (correct), ack 1, win 432, length 0
10:32:05.643760 IP (tos 0x0, ttl 255, id 61946, offset 0, flags [DF], proto TCP (6), length 40)
172.17.0.2.45670 > 54.240.251.200.443: Flags [.], cksum 0xdee6 (incorrect -> 0xbd3a), ack 1461, win 8715, length 0
10:32:05.644797 IP (tos 0x0, ttl 230, id 44134, offset 0, flags [DF], proto TCP (6), length 40)
54.240.251.200.443 > 172.17.0.2.45670: Flags [.], cksum 0x00fe (correct), ack 1, win 432, length 0
10:32:35.723710 IP (tos 0x0, ttl 255, id 61947, offset 0, flags [DF], proto TCP (6), length 40)
172.17.0.2.45670 > 54.240.251.200.443: Flags [.], cksum 0xdee6 (incorrect -> 0xbd3a), ack 1461, win 8715, length 0
10:32:35.724718 IP (tos 0x0, ttl 230, id 44135, offset 0, flags [DF], proto TCP (6), length 40)
54.240.251.200.443 > 172.17.0.2.45670: Flags [.], cksum 0x00fe (correct), ack 1, win 432, length 0
10:32:35.743690 IP (tos 0x0, ttl 255, id 61948, offset 0, flags [DF], proto TCP (6), length 40)
172.17.0.2.45670 > 54.240.251.200.443: Flags [.], cksum 0xdee6 (incorrect -> 0xb76e), ack 2921, win 8738, length 0
10:32:35.744676 IP (tos 0x0, ttl 255, id 61949, offset 0, flags [DF], proto TCP (6), length 40)
172.17.0.2.45670 > 54.240.251.200.443: Flags [.], cksum 0xdee6 (incorrect -> 0xb758), ack 2921, win 8760, length 0
10:32:35.744765 IP (tos 0x0, ttl 255, id 61950, offset 0, flags [DF], proto TCP (6), length 40)
172.17.0.2.45670 > 54.240.251.200.443: Flags [.], cksum 0xdee6 (incorrect -> 0xb741), ack 2921, win 8783, length 0
10:33:05.803713 IP (tos 0x0, ttl 255, id 61951, offset 0, flags [DF], proto TCP (6), length 40)
172.17.0.2.45670 > 54.240.251.200.443: Flags [.], cksum 0xdee6 (incorrect -> 0xb742), ack 2921, win 8783, length 0
10:33:05.804711 IP (tos 0x0, ttl 230, id 44139, offset 0, flags [DF], proto TCP (6), length 40)
54.240.251.200.443 > 172.17.0.2.45670: Flags [.], cksum 0xf595 (correct), ack 1, win 432, length 0
^C
22 packets captured
22 packets received by filter
0 packets dropped by kernel
Since it was suggested that high CPU utilization may have something to do with the hang (I'm not entirely sure how), here is the Service Utilization:
These are running on machines with 1024 CPU units, and each container has a 340-unit reservation.
One of the applications runs on an ECS container instance by itself; the other shares with another (idle) application.
We see that the process goes up to about 250% (2.5 x 340 = 850 units).
Hello @twang-rs, thank you for reaching out to us and providing all this information. I've relayed this information to the service team. If you find anything else that may be helpful, please reach out.
Thank you very much for the in-depth information. This is very helpful for us working with the service team to address this. We've forwarded this information on to the Kinesis service team to help address the issue.
A potential fix for this issue is for the SDK not to wait forever for a complete response from the server, failing the connection if the server takes too long to send the complete response. In that case we should consider whether the request can be retried without user intervention, especially in cases like Kinesis where the token may no longer be valid or may have expired. We'd need to make sure the requests are idempotent.
In the worst case the SDK could fail the request, and not retry. This would require user applications to determine if the request could be retried safely or not.
We'll need to investigate the best way to implement a read timeout on the response body without introducing significant latency. I think we can prototype a few methods of this timeout and go with the best.
Simply doing this has stabilized our client for about two days now:
    config := aws.NewConfig()
    config = config.WithHTTPClient(&http.Client{
        Timeout: l.httpClientTimeout,
    })
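For completeness, here is a minimal sketch of how such a config might be wired into a Kinesis client. This is my illustration rather than code from the issue; it assumes the usual aws, session, kinesis, net/http, and time imports, with a fixed one-minute value standing in for l.httpClientTimeout.

    // Hypothetical wiring of the timeout-bearing config into a Kinesis client.
    // http.Client.Timeout bounds the entire exchange, including reading the
    // response body, so a stalled GetRecords download is eventually cancelled.
    sess := session.Must(session.NewSession())
    cfg := aws.NewConfig().WithHTTPClient(&http.Client{
        Timeout: 1 * time.Minute, // stands in for l.httpClientTimeout
    })
    svc := kinesis.New(sess, cfg)
    // GetRecords calls made through svc now inherit the client-wide timeout.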
Here is a plot of our GetRecords:
In particular, notice initially that the two GetRecords graphs (orange & blue) fluctuated wildly and that the sum of the two values was significantly greater than the IncomingRecords (green). At this point, we were most likely catching up from TRIM_HORIZON, and presumably each GetRecords request was returning a full 10k records. The download time for the payload was significant, causing the high fluctuation in the graph.
At some point, this settled down. Since the records themselves did not change (same payload with a different timestamp), I presume that the number of records returned by the GetRecords call was less than 10k, which also implies that we have "caught up" to the LATEST on the stream.
What is concerning is that at this point in the graph, the sum of the GetRecords for both shards is greater than the IncomingRecords. One possible explanation of this is that our timeout (set to 1 minute) is cancelling a certain number (nearly half!) of otherwise healthy GetRecords requests that were in the process of downloading the payload. This might cause CloudWatch to count the records returned by that particular request; however, since we cancelled that request, we would effectively retry that batch. The second attempt to fetch the batch may be faster, perhaps due to variations on load on the Kinesis server or even perhaps caching.
Similarly concerning is the fact that at some point, the GetRecords drops to where the sum of the GetRecords of the two shards are again roughly equal to the IncomingRecords. I have no explanation of how suddenly this would occur, other than variations on load (and therefore the response time of the Kinesis server).
Here is the same graph above, this time including GetRecords.IteratorAge. This may provide more insight:
It is a little hard to interpret IteratorAge, as we have turned our message producer on and off at various points. It does surprise me a bit that the IteratorAge is lower when our consumer starts initially, then jumps before making a slow decay to 0.
The retention on our stream is 96 hours (4 days), and I would have expected the iterator age to be highest initially, as the consumer starts from TRIM_HORIZON and reads our oldest records (those written prior to this graph).
In any case, we're currently working on a solution which does not simply set a global timeout for the entire client. Instead, we are setting an incremental timeout in the ioutil.ReadAll() of the GetRecords HTTPResponse.Body.
This solution builds on top of the debugging code that @jasdel provided in #301, where we use a DebugReadCloser to wrap the HTTPResponse.Body ReadCloser stream.
While I don't have any code I can share just yet, these snippets should give you a general idea of how we're approaching this:
    type DebugReadCloser struct {
        io.ReadCloser
        ReadFn  func(io.ReadCloser, []byte) (int, error)
        CloseFn func()
    }

    func (r *DebugReadCloser) Read(b []byte) (int, error) {
        return r.ReadFn(r.ReadCloser, b)
    }

    func (r *DebugReadCloser) Close() error {
        r.CloseFn()
        return r.ReadCloser.Close()
    }

    // in the consumer loop
    req.Handlers.Unmarshal.PushFront(func(r *request.Request) {
        timer := time.NewTimer(2 * time.Second)
        r.HTTPResponse.Body = &DebugReadCloser{
            ReadCloser: r.HTTPResponse.Body,
            ReadFn: func(stream io.ReadCloser, b []byte) (n int, err error) {
                type Result struct {
                    n   int
                    err error
                }
                c := make(chan Result, 1)
                go func() {
                    var result Result
                    result.n, result.err = stream.Read(b)
                    c <- result
                }()
                select {
                case result := <-c:
                    timer.Reset(2 * time.Second)
                    n, err = result.n, result.err
                case <-timer.C:
                    err = errors.New("Timeout while reading GetRecords response body")
                }
                return
            },
            CloseFn: func() { /* ... */ }, // Unimportant
        }
    })
Admittedly, we were looking for a solution that does not modify the SDK code. If we were willing to rewrite the UnmarshalJSON function to not use ioutil.ReadAll, a cleaner approach may be available.
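For illustration only (this is not how the SDK is implemented, just one possible shape of such a rewrite), decoding straight off the stream with a json.Decoder keeps each underlying Read small, so a per-read timeout would not have to guard one large ReadAll. Assumes encoding/json and io.

    // Hypothetical streaming variant of the unmarshal step; the real
    // jsonutil.UnmarshalJSON buffers the entire body with ioutil.ReadAll first.
    func unmarshalJSONStream(v interface{}, stream io.Reader) error {
        if err := json.NewDecoder(stream).Decode(v); err != nil && err != io.EOF {
            return err
        }
        return nil // io.EOF here means an empty body, which we treat as success
    }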
One more screenshot showing (on top) sum metrics and (below) average metrics:
I believe using sum for GetRecords.IteratorAge provided a skewed graph. An average of the IteratorAge makes far more sense.
Thank you very much for the additional information @twang-rs. We've forwarded this on to the Kinesis team.
In addition, I think the snippet you provided is similar to the idea we'd like to implement. I think we need to profile the impact of using channels/goroutines in a Read method. Since these methods are called many times in tight loops, we need to be careful about the overhead of the timer logic.
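A rough way to measure that overhead would be an ordinary Go benchmark comparing a plain Read loop against one that routes every Read through a goroutine and channel, as the wrapper above does. This is purely illustrative, not SDK code; it assumes a standalone _test.go file with a made-up package name.

    package readbench

    import (
        "bytes"
        "io"
        "testing"
    )

    // payload stands in for a large GetRecords response body.
    var payload = bytes.Repeat([]byte("x"), 1<<20)

    func BenchmarkDirectRead(b *testing.B) {
        buf := make([]byte, 32*1024)
        for i := 0; i < b.N; i++ {
            r := bytes.NewReader(payload)
            for {
                if _, err := r.Read(buf); err == io.EOF {
                    break
                }
            }
        }
    }

    func BenchmarkChannelRead(b *testing.B) {
        type result struct {
            n   int
            err error
        }
        buf := make([]byte, 32*1024)
        for i := 0; i < b.N; i++ {
            r := bytes.NewReader(payload)
            for {
                // One goroutine and one channel send per Read, mirroring the
                // timeout wrapper's per-call overhead (timer logic omitted).
                c := make(chan result, 1)
                go func() {
                    n, err := r.Read(buf)
                    c <- result{n, err}
                }()
                if res := <-c; res.err == io.EOF {
                    break
                }
            }
        }
    }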
@twang-rs, I have a PR, #1166, that fixes this for GetRecords. Please let us know if any other methods need this or if the duration for the read is too small. We can always expose readDuration, but I felt 2 minutes was more than enough time for a Read.
@twang-rs - We just merged in the fix for this. If you have any additional questions or concerns, please don't hesitate to reach out. Feedback is always welcome.
Similar to #301 (and maybe even #1037).
We have a Kinesis application that randomly hangs on the GetRecords request (seemingly indefinitely; as I type this, one of them unwound after being blocked from 3:35 AM to 4:39 AM). Our library code can be found in our rewardStyle/kinetic library. Both the master branch (based on sendgridlabs/go-kinesis) and the version3 branch (using aws-sdk-go) hang on the GetRecords call. There have been many, many iterations of code, but for the purposes of this ticket we'll use this one:
https://github.com/rewardStyle/kinetic/tree/8f2a3f09581c6a9ed9a79983468a06f784ea1857
The version of aws-sdk-go we are using is hash 5b99715ae2945a2434a2371f4e6c5542e839a32d.
Below is a pprof of an application that is hung: