dotmesh-io / dotmesh

dotmesh (dm) is like git for your data volumes (databases, files etc) in Docker and Kubernetes
https://dotmesh.com
Apache License 2.0
539 stars 29 forks source link

SIGSEGV when firing up Jupyter #594

Open alaric-dotmesh opened 6 years ago

alaric-dotmesh commented 6 years ago

When running the clone command as part of firing up Jupyter, I got a lovely message that seems to be a SIGSEGV from inside the dm client:

error from running dm clone,--stash-on-divergence,dotscience-hub,test/combined-houses - 2
Failed to set up dots: 2
2
fatal error: unexpected signal during runtime execution
[signal SIGSEGV: segmentation violation code=0x1 addr=0x63 pc=0x7fe669922448]

runtime stack:
runtime.throw(0xe12b25, 0x2a)
    GOROOT/src/runtime/panic.go:616 +0x81
runtime.sigpanic()
    GOROOT/src/runtime/signal_unix.go:372 +0x28e

goroutine 9 [syscall]:
runtime.cgocall(0xb70fee, 0xc420042df8, 0x29)
    GOROOT/src/runtime/cgocall.go:128 +0x64 fp=0xc420042db8 sp=0xc420042d80 pc=0x402384
net._C2func_getaddrinfo(0xc4201b8d30, 0x0, 0xc4201be9c0, 0xc42000e560, 0x0, 0x0, 0x0)
    _cgo_gotypes.go:92 +0x55 fp=0xc420042df8 sp=0xc420042db8 pc=0x5b6255
net.cgoLookupIPCNAME.func1(0xc4201b8d30, 0x0, 0xc4201be9c0, 0xc42000e560, 0xa, 0xa, 0xc4200700c0)
    GOROOT/src/net/cgo_unix.go:149 +0x13b fp=0xc420042e40 sp=0xc420042df8 pc=0x5bcf4b
net.cgoLookupIPCNAME(0xc420220f70, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
    GOROOT/src/net/cgo_unix.go:149 +0x174 fp=0xc420042f38 sp=0xc420042e40 pc=0x5b78c4
net.cgoIPLookup(0xc420074ba0, 0xc420220f70, 0x9)
    GOROOT/src/net/cgo_unix.go:201 +0x4d fp=0xc420042fc8 sp=0xc420042f38 pc=0x5b7f8d
runtime.goexit()
    bazel-out/k8-fastbuild/bin/external/io_bazel_rules_go/linux_amd64_static_stripped/stdlib~/src/runtime/asm_amd64.s:2361 +0x1 fp=0xc420042fd0 sp=0xc420042fc8 pc=0x458521
created by net.cgoLookupIP
    GOROOT/src/net/cgo_unix.go:211 +0xaf

goroutine 1 [runnable]:
net/http.(*Transport).getIdleConnCh(0x12c0e20, 0x0, 0xc42023a980, 0x4, 0xc420220f70, 0xf, 0xc4202da2a0)
    GOROOT/src/net/http/transport.go:770 +0x268
net/http.(*Transport).getConn(0x12c0e20, 0xc4202f7170, 0x0, 0xc42023a980, 0x4, 0xc420220f70, 0xf, 0x0, 0x0, 0xc420198930)
    GOROOT/src/net/http/transport.go:961 +0x3a7
net/http.(*Transport).RoundTrip(0x12c0e20, 0xc42024e400, 0x12c0e20, 0x0, 0x0)
    GOROOT/src/net/http/transport.go:409 +0x632
net/http.send(0xc42024e400, 0xed70c0, 0x12c0e20, 0x0, 0x0, 0x0, 0xc4201343a8, 0x0, 0xc420198c90, 0x1)
    GOROOT/src/net/http/client.go:252 +0x185
net/http.(*Client).send(0x12fd040, 0xc42024e400, 0x0, 0x0, 0x0, 0xc4201343a8, 0x0, 0x1, 0xc420198d60)
    GOROOT/src/net/http/client.go:176 +0xfa
net/http.(*Client).Do(0x12fd040, 0xc42024e400, 0x5, 0xc420220b20, 0x8)
    GOROOT/src/net/http/client.go:615 +0x28d
github.com/dotmesh-io/dotmesh/pkg/client.(*JsonRpcClient).reallyCallRemote(0xc420199230, 0xedd320, 0xc4202f6f00, 0xe00205, 0xf, 0x0, 0x0, 0xc84880, 0xc420220e58, 0xc42023a980, ...)
    pkg/client/client.go:112 +0x505
github.com/dotmesh-io/dotmesh/pkg/client.DeduceUrl(0xedd2a0, 0xc420024038, 0xc420219400, 0x1, 0x1, 0xdfbdac, 0x8, 0xc420220af5, 0x5, 0xc420220b20, ...)
    pkg/client/client.go:167 +0x231
github.com/dotmesh-io/dotmesh/pkg/client.(*JsonRpcClient).CallRemote(0xc42021c980, 0xedd2a0, 0xc420024038, 0xe02ee6, 0x13, 0xdb21c0, 0xc4202640c0, 0xc9fb80, 0xc4202193f0, 0x7ffcc1eeec99, ...)
    pkg/client/client.go:65 +0xef
github.com/dotmesh-io/dotmesh/pkg/client.(*DotmeshAPI).RequestTransfer(0xc4202f69f0, 0xdf85b2, 0x4, 0x7ffcc1eeec85, 0xe, 0x0, 0x0, 0xdf9f45, 0x6, 0x7ffcc1eeec94, ...)
    pkg/client/api.go:1076 +0x83b
github.com/dotmesh-io/dotmesh/cmd/dm/pkg/commands.NewCmdClone.func1.1(0xedf8a0, 0xc4202f5380)
    cmd/dm/pkg/commands/clone.go:42 +0x155
github.com/dotmesh-io/dotmesh/cmd/dm/pkg/commands.runHandlingError(0xc420199ca0)
    cmd/dm/pkg/commands/utils.go:89 +0x2b
github.com/dotmesh-io/dotmesh/cmd/dm/pkg/commands.NewCmdClone.func1(0xc4202fd200, 0xc4202f6690, 0x2, 0x3)
    cmd/dm/pkg/commands/clone.go:31 +0x79
github.com/dotmesh-io/dotmesh/cmd/dm/vendor/github.com/spf13/cobra.(*Command).execute(0xc4202fd200, 0xc4202f65d0, 0x3, 0x3, 0xc4202fd200, 0xc4202f65d0)
    cmd/dm/vendor/github.com/spf13/cobra/command.go:636 +0x234
github.com/dotmesh-io/dotmesh/cmd/dm/vendor/github.com/spf13/cobra.(*Command).ExecuteC(0x12c17a0, 0xc4202fc6c0, 0xc4202fc480, 0xc4202f2000)
    cmd/dm/vendor/github.com/spf13/cobra/command.go:722 +0x2d4
github.com/dotmesh-io/dotmesh/cmd/dm/vendor/github.com/spf13/cobra.(*Command).Execute(0x12c17a0, 0xc420130e80, 0xc420199f58)
    cmd/dm/vendor/github.com/spf13/cobra/command.go:681 +0x2b
main.main()
    cmd/dm/dotmesh.go:40 +0x1f9

goroutine 5 [chan receive]:
github.com/dotmesh-io/dotmesh/vendor/github.com/golang/glog.(*loggingT).flushDaemon(0x12fd6e0)
    vendor/github.com/golang/glog/glog.go:882 +0x8b
created by github.com/dotmesh-io/dotmesh/vendor/github.com/golang/glog.init.0
    vendor/github.com/golang/glog/glog.go:410 +0x203

goroutine 19 [syscall]:
os/signal.signal_recv(0xc4200427c8)
    GOROOT/src/runtime/sigqueue.go:139 +0xa6
os/signal.loop()
    GOROOT/src/os/signal/signal_unix.go:22 +0x22
created by os/signal.init.0
    GOROOT/src/os/signal/signal_unix.go:28 +0x41

goroutine 20 [select]:
net.(*Resolver).LookupIPAddr(0x12fbe00, 0xedd2e0, 0xc420074a20, 0xc420220f70, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0)
    GOROOT/src/net/lookup.go:212 +0x50d
net.(*Resolver).internetAddrList(0x12fbe00, 0xedd2e0, 0xc420074a20, 0xdf7db5, 0x3, 0xc420220f70, 0xf, 0x0, 0x0, 0x0, ...)
    GOROOT/src/net/ipsock.go:293 +0x5c4
net.(*Resolver).resolveAddrList(0x12fbe00, 0xedd2e0, 0xc420074a20, 0xdf8326, 0x4, 0xdf7db5, 0x3, 0xc420220f70, 0xf, 0x0, ...)
    GOROOT/src/net/dial.go:193 +0x50c
net.(*Dialer).DialContext(0xc4200740c0, 0xedd320, 0xc4202f6f00, 0xdf7db5, 0x3, 0xc420220f70, 0xf, 0x0, 0x0, 0x0, ...)
    GOROOT/src/net/dial.go:375 +0x22b
net.(*Dialer).DialContext-fm(0xedd320, 0xc4202f6f00, 0xdf7db5, 0x3, 0xc420220f70, 0xf, 0xc420053ae8, 0x63a7f9, 0xc4202f6f00, 0xd01e80)
    GOROOT/src/net/http/transport.go:46 +0x73
net/http.(*Transport).dial(0x12c0e20, 0xedd320, 0xc4202f6f00, 0xdf7db5, 0x3, 0xc420220f70, 0xf, 0x0, 0x0, 0x0, ...)
    GOROOT/src/net/http/transport.go:898 +0x20d
net/http.(*Transport).dialConn(0x12c0e20, 0xedd320, 0xc4202f6f00, 0x0, 0xc42023a980, 0x4, 0xc420220f70, 0xf, 0x0, 0x0, ...)
    GOROOT/src/net/http/transport.go:1143 +0x317
net/http.(*Transport).getConn.func4(0x12c0e20, 0xedd320, 0xc4202f6f00, 0xc4202f71a0, 0xc4202da2a0)
    GOROOT/src/net/http/transport.go:957 +0x78
created by net/http.(*Transport).getConn
    GOROOT/src/net/http/transport.go:956 +0x363

goroutine 8 [select]:
net.cgoLookupIP(0xedd260, 0xc420071a80, 0xc420220f70, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
    GOROOT/src/net/cgo_unix.go:212 +0x19f
net.(*Resolver).lookupIP(0x12fbe00, 0xedd260, 0xc420071a80, 0xc420220f70, 0x9, 0x0, 0xc4201e0480, 0xc4200700c0, 0x0, 0x0)
    GOROOT/src/net/lookup_unix.go:95 +0x12d
net.(*Resolver).(net.lookupIP)-fm(0xedd260, 0xc420071a80, 0xc420220f70, 0x9, 0x42a239, 0x8, 0xc4200700c0, 0x0, 0xc420042ea0)
    GOROOT/src/net/lookup.go:192 +0x56
net.glob..func10(0xedd260, 0xc420071a80, 0xc4201b2f80, 0xc420220f70, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0)
    GOROOT/src/net/hook.go:19 +0x52
net.(*Resolver).LookupIPAddr.func1(0x0, 0x0, 0x0, 0x0)
    GOROOT/src/net/lookup.go:206 +0xd8
internal/singleflight.(*Group).doCall(0x12fbdf0, 0xc42001eb90, 0xc420220f70, 0x9, 0xc4201be660)
    GOROOT/src/internal/singleflight/singleflight.go:95 +0x2e
created by internal/singleflight.(*Group).DoChan
    GOROOT/src/internal/singleflight/singleflight.go:88 +0x2d0

error from running dm clone,--stash-on-divergence,dotscience-hub,test/combined-houses - 2
Failed to set up dots: 2
2

It retried several times, with the same effect each time.

alaric-dotmesh commented 6 years ago

Killing the running containers (docker rm -f dotscience-app-aa5c8742 dotscience-webhookrelay-aa5c8742 dotscience-committer-aa5c8742 ds-aa5c8742-26ca-4fdc-a0fd-278e588ed82d) and then retrying to start Jupyter fixed it.

alaric-dotmesh commented 5 years ago

On further investigation, this seems to happen when #595 also happens, suggesting that the underlying problem is that the dotmesh server container is unavailable when dm tries to look it up in DNS to connect to it.

That may be the underlying problem, but dm crashing when a DNS lookup fails is still a bug.