Can you reproduce it? Can you start it with the environment variable SERVER_PROFILER_ENABLED=true, and then, when it hangs, capture a thread dump from GET /debug/pprof/goroutine?debug=1
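For reference, that endpoint is just Go's standard net/http/pprof handler; the sketch below shows roughly how such a debug endpoint is typically exposed behind an env-var flag (illustrative only, not the exact doods wiring):

```go
package main

import (
	"log"
	"net/http"
	"net/http/pprof"
	"os"
)

func main() {
	mux := http.NewServeMux()

	// Illustrative only: gate the pprof handlers behind an env var, in the
	// spirit of SERVER_PROFILER_ENABLED. GET /debug/pprof/goroutine?debug=1
	// then returns a plain-text dump of every goroutine's stack.
	if os.Getenv("SERVER_PROFILER_ENABLED") == "true" {
		mux.HandleFunc("/debug/pprof/", pprof.Index)
	}

	log.Fatal(http.ListenAndServe(":8080", mux))
}
```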
I will monitor this a few days and report back. Right now everything looks stable.
OK, I get the same error. Unfortunately the server was not running with the profiler; I have added the corresponding env var now.
This is strace output of the process:
strace: Process 3356 attached
epoll_pwait(11, [{EPOLLIN, {u32=745610920, u64=140020974427816}}], 128, -1, NULL, 3) = 1
futex(0x13d74b0, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x13d73b0, FUTEX_WAKE_PRIVATE, 1) = 0
accept4(13, {sa_family=AF_INET6, sin6_port=htons(37448), inet_pton(AF_INET6, "::ffff:10.42.0.1", &sin6_addr), sin6_flowinfo=htonl(0), sin6_scope_id=0}, [112->28], SOCK_CLOEXEC|SOCK_NONBLOCK) = 19
epoll_ctl(11, EPOLL_CTL_ADD, 19, {EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, {u32=745609672, u64=140020974426568}}) = 0
getsockname(19, {sa_family=AF_INET6, sin6_port=htons(8080), inet_pton(AF_INET6, "::ffff:10.42.0.178", &sin6_addr), sin6_flowinfo=htonl(0), sin6_scope_id=0}, [112->28]) = 0
setsockopt(19, SOL_TCP, TCP_NODELAY, [1], 4) = 0
futex(0xc000262148, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(13, 0xc000066b38, [112], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
nanosleep({tv_sec=0, tv_nsec=3000}, NULL) = 0
futex(0xc000136148, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0xc000136148, FUTEX_WAKE_PRIVATE, 1) = 1
write(17, "\0\0\4\10\0\0\0\0\0\0\0\0\5", 13) = 13
write(17, "\0\0\5\1\4\0\0022\327\210\302\301\300\277", 14) = 14
futex(0xc000136148, FUTEX_WAKE_PRIVATE, 1) = 1
write(17, "\0\6B\0\0\0\0022\327\0\0\0\6=\n\235\6\n\7default\22\16tfli"..., 1611) = 1611
epoll_pwait(11, [], 128, 0, NULL, 824634273920) = 0
futex(0x13d8108, FUTEX_WAIT_PRIVATE, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x13d8108, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x13d74a0, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x13d8108, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
epoll_pwait(11, [], 128, 0, NULL, 0) = 0
epoll_pwait(11, [], 128, 0, NULL, 16) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 4) = 0
epoll_pwait(11, [], 128, 0, NULL, 4) = 0
epoll_pwait(11, [], 128, 0, NULL, 64) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 128) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 64) = 0
epoll_pwait(11, [], 128, 0, NULL, 128) = 0
epoll_pwait(11, [], 128, 0, NULL, 64) = 0
epoll_pwait(11, [], 128, 0, NULL, 128) = 0
epoll_pwait(11, [], 128, 0, NULL, 16) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 128) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 20806184) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 1) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 20806184) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 128) = 0
epoll_pwait(11, [], 128, 0, NULL, 20806184) = 0
epoll_pwait(11, [], 128, 0, NULL, 4) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 4) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 32) = 0
epoll_pwait(11, [], 128, 0, NULL, 20806184) = 0
epoll_pwait(11, [], 128, 0, NULL, 8) = 0
epoll_pwait(11, [], 128, 0, NULL, 824638486528) = 0
epoll_pwait(11, [], 128, 0, NULL, 824700288576) = 0
epoll_pwait(11, [], 128, 0, NULL, 824696070848) = 0
epoll_pwait(11, [], 128, 0, NULL, 824666689312) = 0
epoll_pwait(11, [], 128, 0, NULL, 18446744073709551615) = 0
futex(0x13d8108, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
nanosleep({tv_sec=0, tv_nsec=3000}, NULL) = 0
futex(0xc0000572c8, FUTEX_WAKE_PRIVATE, 1) = 1
sched_yield() = 0
futex(0x13d73b0, FUTEX_WAKE_PRIVATE, 1) = 0
write(15, "\0\0\4\10\0\0\0\0\0\0\0\6B\0\0\10\6\0\0\0\0\0\2\4\20\20\t\16\7\7", 30) = 30
epoll_pwait(11, [{EPOLLOUT, {u32=745610296, u64=140020974427192}}, {EPOLLIN|EPOLLOUT, {u32=745610088, u64=140020974426984}}, {EPOLLIN|EPOLLOUT|EPOLLRDHUP, {u32=745609672, u64=140020974426568}}], 128, 0, NULL, 2109442) = 3
read(17, "\0\0\4\10\0\0\0\0\0\0\0\6B\0\0\10\6\0\0\0\0\0\2\4\20\20\t\16\7\7", 4096) = 30
read(17, 0xc0002e2000, 4096) = -1 EAGAIN (Resource temporarily unavailable)
write(17, "\0\0\10\6\1\0\0\0\0\2\4\20\20\t\16\7\7", 17) = 17
epoll_pwait(11, [{EPOLLIN|EPOLLOUT, {u32=745610296, u64=140020974427192}}], 128, 0, NULL, 824634603342) = 1
read(15, "\0\0\10\6\1\0\0\0\0\2\4\20\20\t\16\7\7", 32768) = 17
read(15, 0xc0002c0000, 32768) = -1 EAGAIN (Resource temporarily unavailable)
epoll_pwait(11, [], 128, 0, NULL, 0) = 0
epoll_pwait(11,
The corresponding log messages look just fine:
2019-09-09T18:46:57.753Z INFO server/server.go:88 API Request {"status": 408, "took": 30.029689656, "remote": "10.42.0.1:50076", "request": "/detect", "method": "POST", "package": "server.request", "request-id": "doods-674d79b94b-xfwbq/I6mlLbelrp-072076"}
2019-09-09T18:46:58.857Z INFO server/server.go:88 API Request {"status": 200, "took": 0.001051048, "remote": "10.42.0.1:45932", "request": "/detectors", "method": "GET", "package": "server.request", "request-id": "doods-674d79b94b-xfwbq/I6mlLbelrp-072083"}
2019-09-09T18:47:00.558Z INFO server/server.go:88 API Request {"status": 200, "took": 0.001099912, "remote": "10.42.0.1:46110", "request": "/detectors", "method": "GET", "package": "server.request", "request-id": "doods-674d79b94b-xfwbq/I6mlLbelrp-072085"}
OK, the server got stuck after a few days:
hass/appdaemon-96b4cd557-5cxhh[appdaemon]: 2019-10-04 19:52:08.016304 WARNING camera_image_scanner_flur_unten: Coroutine (<coroutine object Services.call_service at 0x7f325f0893b0>) took too long (10 seconds), cancelling the task...
hass/appdaemon-96b4cd557-5cxhh[appdaemon]: 2019-10-04 19:52:08.019590 WARNING HASS: Timeout in call_service(default/image_processing/scan, {'entity_id': 'image_processing.doods_flur_unten'})
This is the profile output:
curl "https://doods.local/debug/pprof/goroutine?debug=1"
goroutine profile: total 1523
749 @ 0x436fdf 0x447549 0x44751f 0x4471c9 0x47bca5 0x9fcadb 0x9fcc4f 0xb2ec4d 0x75ec04 0x9973b9 0x4644e1
# 0x4471c8 sync.runtime_Semacquire+0x38 /usr/local/go/src/runtime/sema.go:56
# 0x47bca4 sync.(*WaitGroup).Wait+0x64 /usr/local/go/src/sync/waitgroup.go:130
# 0x9fcada google.golang.org/grpc.(*Server).serveStreams+0xea /go/pkg/mod/google.golang.org/grpc@v1.23.0/server.go:719
# 0x9fcc4e google.golang.org/grpc.(*Server).ServeHTTP+0xfe /go/pkg/mod/google.golang.org/grpc@v1.23.0/server.go:758
# 0xb2ec4c github.com/snowzach/doods/server.New.func2+0x5c /build/server/server.go:121
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0x9973b8 golang.org/x/net/http2.(*serverConn).runHandler+0x88 /go/pkg/mod/golang.org/x/net@v0.0.0-20190813141303-74dc4d7220e7/http2/server.go:2149
746 @ 0x436fdf 0x40dafa 0x40dad0 0x40d7bb 0xad5b39 0xade408 0xa814c6 0xb20458 0xa75f38 0x9fe006 0xa01e66 0xa0e85f 0x4644e1
# 0xad5b38 github.com/snowzach/doods/detector/tflite.(*detector).Detect+0x108 /build/detector/tflite/detector.go:184
# 0xade407 github.com/snowzach/doods/detector.(*Mux).Detect+0x1d7 /build/detector/detector.go:121
# 0xa814c5 github.com/snowzach/doods/odrpc._Odrpc_Detect_Handler.func1+0x85 /build/odrpc/rpc.pb.go:855
# 0xb20457 github.com/grpc-ecosystem/go-grpc-middleware/auth.UnaryServerInterceptor.func1+0x107 /go/pkg/mod/github.com/grpc-ecosystem/go-grpc-middleware@v1.0.0/auth/auth.go:46
# 0xa75f37 github.com/snowzach/doods/odrpc._Odrpc_Detect_Handler+0x157 /build/odrpc/rpc.pb.go:857
# 0x9fe005 google.golang.org/grpc.(*Server).processUnaryRPC+0x465 /go/pkg/mod/google.golang.org/grpc@v1.23.0/server.go:995
# 0xa01e65 google.golang.org/grpc.(*Server).handleStream+0xda5 /go/pkg/mod/google.golang.org/grpc@v1.23.0/server.go:1275
# 0xa0e85e google.golang.org/grpc.(*Server).serveStreams.func1.1+0x9e /go/pkg/mod/google.golang.org/grpc@v1.23.0/server.go:710
4 @ 0x40adeb 0xad7db9 0xadb74e 0xad976b 0xad5c30 0xade408 0xa814c6 0xb20458 0xa75f38 0x9fe006 0xa01e66 0xa0e85f 0x4644e1
# 0xad7db8 github.com/snowzach/doods/detector/tflite._Cfunc_TFL_InterpreterInvoke+0x48 _cgo_gotypes.go:590
# 0xadb74d github.com/snowzach/doods/detector/tflite.(*Interpreter).Invoke.func1+0x5d /build/detector/tflite/tflite.go:155
# 0xad976a github.com/snowzach/doods/detector/tflite.(*Interpreter).Invoke+0x2a /build/detector/tflite/tflite.go:155
# 0xad5c2f github.com/snowzach/doods/detector/tflite.(*detector).Detect+0x1ff /build/detector/tflite/detector.go:195
# 0xade407 github.com/snowzach/doods/detector.(*Mux).Detect+0x1d7 /build/detector/detector.go:121
# 0xa814c5 github.com/snowzach/doods/odrpc._Odrpc_Detect_Handler.func1+0x85 /build/odrpc/rpc.pb.go:855
# 0xb20457 github.com/grpc-ecosystem/go-grpc-middleware/auth.UnaryServerInterceptor.func1+0x107 /go/pkg/mod/github.com/grpc-ecosystem/go-grpc-middleware@v1.0.0/auth/auth.go:46
# 0xa75f37 github.com/snowzach/doods/odrpc._Odrpc_Detect_Handler+0x157 /build/odrpc/rpc.pb.go:857
# 0x9fe005 google.golang.org/grpc.(*Server).processUnaryRPC+0x465 /go/pkg/mod/google.golang.org/grpc@v1.23.0/server.go:995
# 0xa01e65 google.golang.org/grpc.(*Server).handleStream+0xda5 /go/pkg/mod/google.golang.org/grpc@v1.23.0/server.go:1275
# 0xa0e85e google.golang.org/grpc.(*Server).serveStreams.func1.1+0x9e /go/pkg/mod/google.golang.org/grpc@v1.23.0/server.go:710
2 @ 0x436fdf 0x40dafa 0x40dad0 0x40d7bb 0x9f1a81 0x4644e1
# 0x9f1a80 google.golang.org/grpc.(*addrConn).resetTransport+0x5d0 /go/pkg/mod/google.golang.org/grpc@v1.23.0/clientconn.go:1077
2 @ 0x436fdf 0x4326ea 0x431cd6 0x4c7edb 0x4c8ceb 0x4c8ccc 0x5296ff 0x53cb29 0x55975e 0x4a7a68 0x984418 0x9843d9 0x984c71 0x9b8800 0x4644e1
# 0x431cd5 internal/poll.runtime_pollWait+0x55 /usr/local/go/src/runtime/netpoll.go:182
# 0x4c7eda internal/poll.(*pollDesc).wait+0x9a /usr/local/go/src/internal/poll/fd_poll_runtime.go:87
# 0x4c8cea internal/poll.(*pollDesc).waitRead+0x19a /usr/local/go/src/internal/poll/fd_poll_runtime.go:92
# 0x4c8ccb internal/poll.(*FD).Read+0x17b /usr/local/go/src/internal/poll/fd_unix.go:169
# 0x5296fe net.(*netFD).Read+0x4e /usr/local/go/src/net/fd_unix.go:202
# 0x53cb28 net.(*conn).Read+0x68 /usr/local/go/src/net/net.go:177
# 0x55975d bufio.(*Reader).Read+0x23d /usr/local/go/src/bufio/bufio.go:223
# 0x4a7a67 io.ReadAtLeast+0x87 /usr/local/go/src/io/io.go:310
# 0x984417 io.ReadFull+0x87 /usr/local/go/src/io/io.go:329
# 0x9843d8 golang.org/x/net/http2.readFrameHeader+0x48 /go/pkg/mod/golang.org/x/net@v0.0.0-20190813141303-74dc4d7220e7/http2/frame.go:237
# 0x984c70 golang.org/x/net/http2.(*Framer).ReadFrame+0xa0 /go/pkg/mod/golang.org/x/net@v0.0.0-20190813141303-74dc4d7220e7/http2/frame.go:492
# 0x9b87ff google.golang.org/grpc/internal/transport.(*http2Client).reader+0x17f /go/pkg/mod/google.golang.org/grpc@v1.23.0/internal/transport/http2_client.go:1249
2 @ 0x436fdf 0x4326ea 0x431cd6 0x4c7edb 0x4c8ceb 0x4c8ccc 0x5296ff 0x53cb29 0x757d38 0x4644e1
# 0x431cd5 internal/poll.runtime_pollWait+0x55 /usr/local/go/src/runtime/netpoll.go:182
# 0x4c7eda internal/poll.(*pollDesc).wait+0x9a /usr/local/go/src/internal/poll/fd_poll_runtime.go:87
# 0x4c8cea internal/poll.(*pollDesc).waitRead+0x19a /usr/local/go/src/internal/poll/fd_poll_runtime.go:92
# 0x4c8ccb internal/poll.(*FD).Read+0x17b /usr/local/go/src/internal/poll/fd_unix.go:169
# 0x5296fe net.(*netFD).Read+0x4e /usr/local/go/src/net/fd_unix.go:202
# 0x53cb28 net.(*conn).Read+0x68 /usr/local/go/src/net/net.go:177
# 0x757d37 net/http.(*connReader).backgroundRead+0x57 /usr/local/go/src/net/http/server.go:677
2 @ 0x436fdf 0x4326ea 0x431cd6 0x4c7edb 0x4c8ceb 0x4c8ccc 0x5296ff 0x53cb29 0x758187 0x55975e 0x4a856c 0xaff0c2 0x4a7a68 0x984418 0x9843d9 0x984c71 0x98ef64 0x4644e1
# 0x431cd5 internal/poll.runtime_pollWait+0x55 /usr/local/go/src/runtime/netpoll.go:182
# 0x4c7eda internal/poll.(*pollDesc).wait+0x9a /usr/local/go/src/internal/poll/fd_poll_runtime.go:87
# 0x4c8cea internal/poll.(*pollDesc).waitRead+0x19a /usr/local/go/src/internal/poll/fd_poll_runtime.go:92
# 0x4c8ccb internal/poll.(*FD).Read+0x17b /usr/local/go/src/internal/poll/fd_unix.go:169
# 0x5296fe net.(*netFD).Read+0x4e /usr/local/go/src/net/fd_unix.go:202
# 0x53cb28 net.(*conn).Read+0x68 /usr/local/go/src/net/net.go:177
# 0x758186 net/http.(*connReader).Read+0x106 /usr/local/go/src/net/http/server.go:787
# 0x55975d bufio.(*Reader).Read+0x23d /usr/local/go/src/bufio/bufio.go:223
# 0x4a856b io.(*multiReader).Read+0xab /usr/local/go/src/io/multi.go:26
# 0xaff0c1 golang.org/x/net/http2/h2c.(*rwConn).Read+0x51 /go/pkg/mod/golang.org/x/net@v0.0.0-20190813141303-74dc4d7220e7/http2/h2c/h2c.go:287
# 0x4a7a67 io.ReadAtLeast+0x87 /usr/local/go/src/io/io.go:310
# 0x984417 io.ReadFull+0x87 /usr/local/go/src/io/io.go:329
# 0x9843d8 golang.org/x/net/http2.readFrameHeader+0x48 /go/pkg/mod/golang.org/x/net@v0.0.0-20190813141303-74dc4d7220e7/http2/frame.go:237
# 0x984c70 golang.org/x/net/http2.(*Framer).ReadFrame+0xa0 /go/pkg/mod/golang.org/x/net@v0.0.0-20190813141303-74dc4d7220e7/http2/frame.go:492
# 0x98ef63 golang.org/x/net/http2.(*serverConn).readFrames+0xa3 /go/pkg/mod/golang.org/x/net@v0.0.0-20190813141303-74dc4d7220e7/http2/server.go:748
2 @ 0x436fdf 0x446a68 0x98fa44 0x98ddf6 0xafd83e 0x761ed8 0x75dbc1 0x4644e1
# 0x98fa43 golang.org/x/net/http2.(*serverConn).serve+0x503 /go/pkg/mod/golang.org/x/net@v0.0.0-20190813141303-74dc4d7220e7/http2/server.go:861
# 0x98ddf5 golang.org/x/net/http2.(*Server).ServeConn+0x715 /go/pkg/mod/golang.org/x/net@v0.0.0-20190813141303-74dc4d7220e7/http2/server.go:472
# 0xafd83d golang.org/x/net/http2/h2c.h2cHandler.ServeHTTP+0x22d /go/pkg/mod/golang.org/x/net@v0.0.0-20190813141303-74dc4d7220e7/http2/h2c/h2c.go:87
# 0x761ed7 net/http.serverHandler.ServeHTTP+0xa7 /usr/local/go/src/net/http/server.go:2774
# 0x75dbc0 net/http.(*conn).serve+0x850 /usr/local/go/src/net/http/server.go:1878
2 @ 0x436fdf 0x446a68 0x9a8747 0x9a8f86 0x9ca13b 0x4644e1
# 0x9a8746 google.golang.org/grpc/internal/transport.(*controlBuffer).get+0x106 /go/pkg/mod/google.golang.org/grpc@v1.23.0/internal/transport/controlbuf.go:395
# 0x9a8f85 google.golang.org/grpc/internal/transport.(*loopyWriter).run+0x1b5 /go/pkg/mod/google.golang.org/grpc@v1.23.0/internal/transport/controlbuf.go:513
# 0x9ca13a google.golang.org/grpc/internal/transport.newHTTP2Client.func3+0x7a /go/pkg/mod/google.golang.org/grpc@v1.23.0/internal/transport/http2_client.go:333
2 @ 0x436fdf 0x446a68 0x9ea59d 0x4644e1
# 0x9ea59c google.golang.org/grpc.(*ccBalancerWrapper).watcher+0x12c /go/pkg/mod/google.golang.org/grpc@v1.23.0/balancer_conn_wrappers.go:115
1 @ 0x412804 0x44b5ec 0x881922 0x4644e1
# 0x44b5eb os/signal.signal_recv+0x9b /usr/local/go/src/runtime/sigqueue.go:139
# 0x881921 os/signal.loop+0x21 /usr/local/go/src/os/signal/signal_unix.go:23
1 @ 0x436fdf 0x40dafa 0x40dad0 0x40d7bb 0xb306f9 0x5cd5fe 0x5ce25c 0xb2f712 0xb2f702 0xb31640 0x436bec 0x4644e1
# 0xb306f8 github.com/snowzach/doods/cmd.glob..func1+0x238 /build/cmd/api.go:46
# 0x5cd5fd github.com/spf13/cobra.(*Command).execute+0x2ad /go/pkg/mod/github.com/spf13/cobra@v0.0.5/command.go:830
# 0x5ce25b github.com/spf13/cobra.(*Command).ExecuteC+0x2fb /go/pkg/mod/github.com/spf13/cobra@v0.0.5/command.go:914
# 0xb2f711 github.com/spf13/cobra.(*Command).Execute+0x31 /go/pkg/mod/github.com/spf13/cobra@v0.0.5/command.go:864
# 0xb2f701 github.com/snowzach/doods/cmd.Execute+0x21 /build/cmd/root.go:58
# 0xb3163f main.main+0x1f /build/main.go:8
# 0x436beb runtime.main+0x20b /usr/local/go/src/runtime/proc.go:200
1 @ 0x436fdf 0x40dafa 0x40dad0 0x40d7fb 0x882491 0x4644e1
# 0x882490 github.com/snowzach/doods/conf.init.1.func1+0x40 /build/conf/signal.go:42
1 @ 0x436fdf 0x40de9e 0x40d7bb 0xa8157c 0x4644e1
# 0xa8157b github.com/snowzach/doods/odrpc.RegisterOdrpcHandlerFromEndpoint.func1.1+0x4b /build/odrpc/rpc.pb.gw.go:100
1 @ 0x436fdf 0x40de9e 0x40d7bb 0xafc83c 0x4644e1
# 0xafc83b github.com/snowzach/doods/server/rpc.RegisterVersionRPCHandlerFromEndpoint.func1.1+0x4b /build/server/rpc/version.pb.gw.go:56
1 @ 0x436fdf 0x4326ea 0x431cd6 0x4c7edb 0x4ca4ea 0x4ca4cb 0x52a012 0x546ee2 0x5459f8 0x7622bd 0xb2ee8e 0x4644e1
# 0x431cd5 internal/poll.runtime_pollWait+0x55 /usr/local/go/src/runtime/netpoll.go:182
# 0x4c7eda internal/poll.(*pollDesc).wait+0x9a /usr/local/go/src/internal/poll/fd_poll_runtime.go:87
# 0x4ca4e9 internal/poll.(*pollDesc).waitRead+0x1b9 /usr/local/go/src/internal/poll/fd_poll_runtime.go:92
# 0x4ca4ca internal/poll.(*FD).Accept+0x19a /usr/local/go/src/internal/poll/fd_unix.go:384
# 0x52a011 net.(*netFD).accept+0x41 /usr/local/go/src/net/fd_unix.go:238
# 0x546ee1 net.(*TCPListener).accept+0x31 /usr/local/go/src/net/tcpsock_posix.go:139
# 0x5459f7 net.(*TCPListener).Accept+0x47 /usr/local/go/src/net/tcpsock.go:260
# 0x7622bc net/http.(*Server).Serve+0x22c /usr/local/go/src/net/http/server.go:2859
# 0xb2ee8d github.com/snowzach/doods/server.(*Server).ListenAndServe.func3+0x5d /build/server/server.go:218
1 @ 0x436fdf 0x446a68 0x9ae4cf 0x9adf8d 0x9fcacd 0x9fcc4f 0xb2ec4d 0x75ec04 0x9973b9 0x4644e1
# 0x9ae4ce google.golang.org/grpc/internal/transport.(*serverHandlerTransport).runStream+0xbe /go/pkg/mod/google.golang.org/grpc@v1.23.0/internal/transport/handler_server.go:394
# 0x9adf8c google.golang.org/grpc/internal/transport.(*serverHandlerTransport).HandleStreams+0x52c /go/pkg/mod/google.golang.org/grpc@v1.23.0/internal/transport/handler_server.go:384
# 0x9fcacc google.golang.org/grpc.(*Server).serveStreams+0xdc /go/pkg/mod/google.golang.org/grpc@v1.23.0/server.go:706
# 0x9fcc4e google.golang.org/grpc.(*Server).ServeHTTP+0xfe /go/pkg/mod/google.golang.org/grpc@v1.23.0/server.go:758
# 0xb2ec4c github.com/snowzach/doods/server.New.func2+0x5c /build/server/server.go:121
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0x9973b8 golang.org/x/net/http2.(*serverConn).runHandler+0x88 /go/pkg/mod/golang.org/x/net@v0.0.0-20190813141303-74dc4d7220e7/http2/server.go:2149
1 @ 0x436fdf 0x446a68 0x9c750c 0xa092bc 0xa092b3 0xa103e6 0xa0727c 0xa08090 0x9ebbd1 0x9eb904 0xa75812 0xa7fdb4 0xa821df 0xa6ba6a 0xb2ee18 0xb03ee6 0xb03ebb 0x75ec04 0xb2df55 0x75ec04 0xaf9bcb 0x75ec04 0xb1d94f 0x75ec04 0xb1db48 0x75ec04 0xb01bd4 0xb2ec85 0x75ec04 0xafda01 0x761ed8 0x75dbc1
# 0x9c750b google.golang.org/grpc/internal/transport.(*Stream).waitOnHeader+0xcb /go/pkg/mod/google.golang.org/grpc@v1.23.0/internal/transport/transport.go:312
# 0xa092bb google.golang.org/grpc/internal/transport.(*Stream).RecvCompress+0x70b /go/pkg/mod/google.golang.org/grpc@v1.23.0/internal/transport/transport.go:331
# 0xa092b2 google.golang.org/grpc.(*csAttempt).recvMsg+0x702 /go/pkg/mod/google.golang.org/grpc@v1.23.0/stream.go:871
# 0xa103e5 google.golang.org/grpc.(*clientStream).RecvMsg.func1+0x45 /go/pkg/mod/google.golang.org/grpc@v1.23.0/stream.go:736
# 0xa0727b google.golang.org/grpc.(*clientStream).withRetry+0x7b /go/pkg/mod/google.golang.org/grpc@v1.23.0/stream.go:594
# 0xa0808f google.golang.org/grpc.(*clientStream).RecvMsg+0xff /go/pkg/mod/google.golang.org/grpc@v1.23.0/stream.go:735
# 0x9ebbd0 google.golang.org/grpc.invoke+0x140 /go/pkg/mod/google.golang.org/grpc@v1.23.0/call.go:73
# 0x9eb903 google.golang.org/grpc.(*ClientConn).Invoke+0x1b3 /go/pkg/mod/google.golang.org/grpc@v1.23.0/call.go:37
# 0xa75811 github.com/snowzach/doods/odrpc.(*odrpcClient).Detect+0xd1 /build/odrpc/rpc.pb.go:772
# 0xa7fdb3 github.com/snowzach/doods/odrpc.request_Odrpc_Detect_0+0x273 /build/odrpc/rpc.pb.gw.go:49
# 0xa821de github.com/snowzach/doods/odrpc.RegisterOdrpcHandlerClient.func2+0x20e /build/odrpc/rpc.pb.gw.go:170
# 0xa6ba69 github.com/grpc-ecosystem/grpc-gateway/runtime.(*ServeMux).ServeHTTP+0xbe9 /go/pkg/mod/github.com/grpc-ecosystem/grpc-gateway@v1.9.6/runtime/mux.go:243
# 0xb2ee17 github.com/snowzach/doods/server.(*Server).ListenAndServe.func2+0x47 /build/server/server.go:206
# 0xb03ee5 net/http.HandlerFunc.ServeHTTP+0x245 /usr/local/go/src/net/http/server.go:1995
# 0xb03eba github.com/go-chi/chi.(*Mux).routeHTTP+0x21a /go/pkg/mod/github.com/go-chi/chi@v4.0.2+incompatible/mux.go:431
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xb2df54 github.com/snowzach/doods/server.New.func1.1+0x264 /build/server/server.go:73
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xaf9bca github.com/go-chi/render.SetContentType.func1.1+0x18a /go/pkg/mod/github.com/go-chi/render@v1.0.1/content_type.go:52
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xb1d94e github.com/go-chi/chi/middleware.Recoverer.func1+0x9e /go/pkg/mod/github.com/go-chi/chi@v4.0.2+incompatible/middleware/recoverer.go:35
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xb1db47 github.com/go-chi/chi/middleware.RequestID.func1+0x1c7 /go/pkg/mod/github.com/go-chi/chi@v4.0.2+incompatible/middleware/request_id.go:72
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xb01bd3 github.com/go-chi/chi.(*Mux).ServeHTTP+0x293 /go/pkg/mod/github.com/go-chi/chi@v4.0.2+incompatible/mux.go:82
# 0xb2ec84 github.com/snowzach/doods/server.New.func2+0x94 /build/server/server.go:123
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xafda00 golang.org/x/net/http2/h2c.h2cHandler.ServeHTTP+0x3f0 /go/pkg/mod/golang.org/x/net@v0.0.0-20190813141303-74dc4d7220e7/http2/h2c/h2c.go:98
# 0x761ed7 net/http.serverHandler.ServeHTTP+0xa7 /usr/local/go/src/net/http/server.go:2774
# 0x75dbc0 net/http.(*conn).serve+0x850 /usr/local/go/src/net/http/server.go:1878
1 @ 0x436fdf 0x446a68 0x9c9d4d 0x4644e1
# 0x9c9d4c google.golang.org/grpc/internal/transport.(*serverHandlerTransport).HandleStreams.func1+0xfc /go/pkg/mod/google.golang.org/grpc@v1.23.0/internal/transport/handler_server.go:312
1 @ 0xb0ef70 0xb0ed90 0xb0b960 0xb18706 0xb19167 0x75ec04 0xb03f1f 0x75ec04 0xb1d342 0x75ec04 0xb01d91 0xb07a47 0x75ec04 0xb03f1f 0x75ec04 0xb2df55 0x75ec04 0xaf9bcb 0x75ec04 0xb1d94f 0x75ec04 0xb1db48 0x75ec04 0xb01bd4 0xb2ec85 0x75ec04 0xafda01 0x761ed8 0x75dbc1 0x4644e1
# 0xb0ef6f runtime/pprof.writeRuntimeProfile+0x8f /usr/local/go/src/runtime/pprof/pprof.go:708
# 0xb0ed8f runtime/pprof.writeGoroutine+0x9f /usr/local/go/src/runtime/pprof/pprof.go:670
# 0xb0b95f runtime/pprof.(*Profile).WriteTo+0x38f /usr/local/go/src/runtime/pprof/pprof.go:329
# 0xb18705 net/http/pprof.handler.ServeHTTP+0x355 /usr/local/go/src/net/http/pprof/pprof.go:245
# 0xb19166 net/http/pprof.Index+0x6f6 /usr/local/go/src/net/http/pprof/pprof.go:268
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xb03f1e github.com/go-chi/chi.(*Mux).routeHTTP+0x27e /go/pkg/mod/github.com/go-chi/chi@v4.0.2+incompatible/mux.go:425
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xb1d341 github.com/go-chi/chi/middleware.NoCache.func1+0x2a1 /go/pkg/mod/github.com/go-chi/chi@v4.0.2+incompatible/middleware/nocache.go:54
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xb01d90 github.com/go-chi/chi.(*Mux).ServeHTTP+0x450 /go/pkg/mod/github.com/go-chi/chi@v4.0.2+incompatible/mux.go:70
# 0xb07a46 github.com/go-chi/chi.(*Mux).Mount.func1+0x126 /go/pkg/mod/github.com/go-chi/chi@v4.0.2+incompatible/mux.go:292
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xb03f1e github.com/go-chi/chi.(*Mux).routeHTTP+0x27e /go/pkg/mod/github.com/go-chi/chi@v4.0.2+incompatible/mux.go:425
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xb2df54 github.com/snowzach/doods/server.New.func1.1+0x264 /build/server/server.go:73
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xaf9bca github.com/go-chi/render.SetContentType.func1.1+0x18a /go/pkg/mod/github.com/go-chi/render@v1.0.1/content_type.go:52
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xb1d94e github.com/go-chi/chi/middleware.Recoverer.func1+0x9e /go/pkg/mod/github.com/go-chi/chi@v4.0.2+incompatible/middleware/recoverer.go:35
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xb1db47 github.com/go-chi/chi/middleware.RequestID.func1+0x1c7 /go/pkg/mod/github.com/go-chi/chi@v4.0.2+incompatible/middleware/request_id.go:72
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xb01bd3 github.com/go-chi/chi.(*Mux).ServeHTTP+0x293 /go/pkg/mod/github.com/go-chi/chi@v4.0.2+incompatible/mux.go:82
# 0xb2ec84 github.com/snowzach/doods/server.New.func2+0x94 /build/server/server.go:123
# 0x75ec03 net/http.HandlerFunc.ServeHTTP+0x43 /usr/local/go/src/net/http/server.go:1995
# 0xafda00 golang.org/x/net/http2/h2c.h2cHandler.ServeHTTP+0x3f0 /go/pkg/mod/golang.org/x/net@v0.0.0-20190813141303-74dc4d7220e7/http2/h2c/h2c.go:98
# 0x761ed7 net/http.serverHandler.ServeHTTP+0xa7 /usr/local/go/src/net/http/server.go:2774
# 0x75dbc0 net/http.(*conn).serve+0x850 /usr/local/go/src/net/http/server.go:1878
The server log looks fine:
2019-10-04T17:51:41.014Z INFO server/server.go:88 API Request {"status": 200, "took": 0.0023002, "remote": "10.42.0.1:46792", "request": "/detectors", "method": "GET", "package": "server.request", "request-id": "doods-7dd57d9757-vxn2p/95Yz3vHPZJ-369071"}
2019-10-04T17:51:43.717Z INFO server/server.go:88 API Request {"status": 200, "took": 0.001087206, "remote": "10.42.0.1:46912", "request": "/detectors", "method": "GET", "package": "server.request", "request-id": "doods-7dd57d9757-vxn2p/95Yz3vHPZJ-369072"}
2019-10-04T17:51:45.524Z INFO server/server.go:88 API Request {"status": 200, "took": 0.039879463, "remote": "10.42.0.200:52254", "request": "/debug/pprof/goroutine?debug=1", "method": "GET", "package": "server.request", "request-id": "doods-7dd57d9757-vxn2p/95Yz3vHPZJ-369073"}
2019-10-04T17:51:51.015Z INFO server/server.go:88 API Request {"status": 200, "took": 0.001214559, "remote": "10.42.0.1:47238", "request": "/detectors", "method": "GET", "package": "server.request", "request-id": "doods-7dd57d9757-vxn2p/95Yz3vHPZJ-369074"}
2019-10-04T17:51:53.107Z INFO server/server.go:88 API Request {"status": 408, "took": 30.038410282, "remote": "10.42.0.1:52338", "request": "/detect", "method": "POST", "package": "server.request", "request-id": "doods-7dd57d9757-vxn2p/95Yz3vHPZJ-369067"
@snowzach do you have any idea how to fix this issue?
I need to update to the latest version of TensorFlow; I know a couple of other issues were fixed, and there is now a C API for TensorFlow Lite. Sorry, I am still trying to get the build process ironed out. I will try to get at least a timeout implemented soonish. It's likely going to cause a slow leak, but that's better than it hanging.
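For a rough idea of what that timeout could look like, here is a minimal sketch (my own illustration, not the actual doods code): the blocking invoke runs in its own goroutine and the caller gives up after a deadline; if the cgo call never returns, that goroutine stays stuck, which is the slow leak I mean.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// invokeWithTimeout is a hypothetical helper, not code from doods: it runs a
// blocking call in its own goroutine and gives up after a deadline. If the
// underlying cgo call never returns, that goroutine stays stuck forever (the
// "slow leak"), but the caller no longer hangs.
func invokeWithTimeout(ctx context.Context, timeout time.Duration, invoke func() error) error {
	done := make(chan error, 1) // buffered so a late result never blocks the sender
	go func() { done <- invoke() }()

	select {
	case err := <-done:
		return err
	case <-time.After(timeout):
		return errors.New("detection timed out")
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	err := invokeWithTimeout(context.Background(), 2*time.Second, func() error {
		time.Sleep(10 * time.Second) // stand-in for a hung TFL_InterpreterInvoke
		return nil
	})
	fmt.Println(err) // prints "detection timed out" after ~2s
}
```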
Okay, if you want to pull the latest image: it's been updated to TensorFlow r1.15, and the way the Edge TPU is pulled in is a lot cleaner. Give it a shot and let me know.
I have tried the latest container. Unfortunately it crashes without any error message:
Containers:
doods:
Container ID: containerd://72c62fff6b664a70948d005e326d53090eed429a0a37f88603c059941a7f18f4
Image: snowzach/doods:latest
Image ID: docker.io/snowzach/doods@sha256:649aa1be5432958fc8150bc65528bb085ca0167a78168d22bde4b26e63fc15fc
Port: 8080/TCP
Host Port: 0/TCP
State: Waiting
Reason: CrashLoopBackOff
Last State: Terminated
Reason: Error
Exit Code: 132
Started: Sun, 27 Oct 2019 13:49:25 +0100
Finished: Sun, 27 Oct 2019 13:49:25 +0100
Ready: False
Restart Count: 8
Does the binary need AVX or something like that? I run it on a low-power machine:
cat /proc/cpuinfo
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 122
model name : Intel(R) Pentium(R) Silver N5000 CPU @ 1.10GHz
stepping : 1
microcode : 0x2e
cpu MHz : 1428.545
cache size : 4096 KB
physical id : 0
siblings : 4
core id : 0
cpu cores : 4
apicid : 0
initial apicid : 0
fpu : yes
fpu_exception : yes
cpuid level : 24
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg cx16 xtpr pdcm sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave rdrand lahf_lm 3dnowprefetch cpuid_fault cat_l2 pti cdp_l2 ssbd ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust smep erms mpx rdt_a rdseed smap clflushopt intel_pt sha_ni xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts umip rdpid md_clear arch_capabilities
bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass
bogomips : 2188.80
clflush size : 64
cache_alignment : 64
address sizes : 39 bits physical, 48 bits virtual
power management:
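For what it's worth, AVX support can also be double-checked programmatically; a small verification sketch using golang.org/x/sys/cpu (nothing doods-specific):

```go
package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

func main() {
	// Prints whether the CPU advertises AVX/AVX2; the Pentium Silver N5000
	// above lists no "avx" flag in /proc/cpuinfo, so both should be false.
	fmt.Println("AVX: ", cpu.X86.HasAVX)
	fmt.Println("AVX2:", cpu.X86.HasAVX2)
}
```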
Hey, it does require AVX... However, just for you, use the docker tag noavx instead of amd64.
haha.. I have an Atomic Pi as well without AVX.
Great, the noavx version seems to work. I will report back if it gets stuck in the future...
I am going to close this. Feel free to re-open if it occurs again.
Try pulling the latest image and let me know.
I will test that soon. Would it also be possible to get a health check URL which just runs detection on a static image and verifies that the TPU is working? In my case I only scan during motion, which means a broken TPU would get noticed before it is actually needed...
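Something along these lines is what I have in mind, e.g. as a Kubernetes exec/liveness probe. This is only a sketch; the /detect payload below is a placeholder that would have to match the real doods API:

```go
package main

import (
	"bytes"
	"context"
	"fmt"
	"net/http"
	"os"
	"time"
)

func main() {
	// Give the whole probe a hard deadline; a hung detector fails the probe.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Placeholder payload: the real body must match the doods /detect API
	// (detector name plus a base64-encoded static test image).
	body := bytes.NewBufferString(`{"detector_name":"default","data":"<base64 test image>"}`)

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, "http://localhost:8080/detect", body)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		fmt.Fprintln(os.Stderr, "detect probe failed:", err)
		os.Exit(1) // non-zero exit marks the pod unhealthy in an exec probe
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Fprintln(os.Stderr, "detect probe returned status", resp.StatusCode)
		os.Exit(1)
	}
	fmt.Println("ok")
}
```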
The Coral stick can freeze after some time if the power supply is not providing enough current. I recommend using a power supply that can deliver 3 A. Note that the official RPi supply delivers only 2.5 A.
My Coral stick is attached to a USB 3 port on my MSI Cubi N. Even that box suffered from this problem. Lately the system seems to work fine; I will monitor it with the latest release. Kubernetes will log each failure and pod restart...
If you have pulled the latest image, I switched to using the throttled version of the EdgeTPU library. This could be the reason it's working fine as it's presumably using less power.
I am using the latest image, but it does not crash as expected...
I get these errors in Home Assistant:
Update of image_processing.doods_flur_unten is taking over 10 seconds
22:00 __main__.py (WARNING) - message first occured at 17. November 2019, 8:46 and shows up 1416 times
That's not necessarily a problem. It just means it takes a while to process images. Does it appear to be working?
Normally the processing takes less than 1 second... restarting doods solved the issue.
Are you certain you've pulled the most recent Docker image? Are you using the TensorFlow or the TensorFlow Lite module? If you are using TensorFlow Lite, it should crash out after 10 seconds when it hangs, and Docker should be set to restart it. I am nearly certain at this point that the crashes are related to poor USB power or not being plugged into a USB 3.0 port.
Do you have some version output in the logs so I can verify which version is running?
I am using tflite models only.
The stick is attached to a USB 3 port on my MSI Cubi N box.
Okay, I added the version string to the API startup log: API Listening {"package": "server", "address": ":8080", "tls": false, "version": "v0.1.3-0-gb0fceac"}
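In case it helps, version strings like that are typically stamped in at link time with -ldflags; a minimal, hypothetical sketch (not the exact doods wiring):

```go
// A minimal, hypothetical sketch of how a version string such as
// "v0.1.3-0-gb0fceac" is typically injected into a Go binary.
package main

import "fmt"

// Version defaults to "dev" and is overridden at build time, e.g.:
//   go build -ldflags "-X main.Version=v0.1.3-0-gb0fceac"
var Version = "dev"

func main() {
	fmt.Println("version:", Version)
}
```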
Any luck?
I have been able to look into it...
I have just checked my kubernetes pod history. It looks like doods crashed 11 times within the last 14 days. The automatic recovery seems to work! :)
That's good, as good as it can be right now. I am working on trying to reproduce the crashes with the stick. I think it's just with the C API.
I'm going to close this ticket now. If it pops up again, open an issue.
My doods server, using the Coral stick and running in Docker/k8s, stops working after some time: I can still get the detectors output, but the detect method times out.
Killing/restarting the container fixed the problem. There was no corresponding log entry for the failed requests.
It would be cool to have a health check URL which does an actual image detection in order to see if it is working end-to-end. I will enable debug logs to help debug this issue...