firecracker-microvm / firecracker-containerd

firecracker-containerd enables containerd to manage containers as Firecracker microVMs
Apache License 2.0
2.1k stars 180 forks source link

TestMultipleVMs_Isolated is still flaky #581

Open kzys opened 2 years ago

kzys commented 2 years ago

https://buildkite.com/firecracker-microvm/firecracker-containerd/builds/2535#8267eaab-d952-4276-a956-c304790b50b0

=== RUN   TestMultipleVMs_Isolated
    service_integ_test.go:263: TestMultipleVMs_Isolated: will run 100 vm's
    service_integ_test.go:450: 
            Error Trace:    service_integ_test.go:450
            Error:          Received unexpected error:
                            unexpected errors from VM 76: failed to create a VM while there are 0 Firecracker processes: rpc error: code = Unknown desc = failed to create VM: failed to dial the VM over vsock: non-temporary vsock dial failure: failed to dial "rootfs/firecracker.vsock" within 100ms: dial unix rootfs/firecracker.vsock: connect: resource temporarily unavailable
            Test:           TestMultipleVMs_Isolated
--- FAIL: TestMultipleVMs_Isolated (132.93s)

The whole error message doesn't sound right.

non-temporary vsock dial failure: failed to dial "rootfs/firecracker.vsock" within 100ms: dial unix rootfs/firecracker.vsock: connect: resource temporarily unavailable

kzys commented 2 years ago

We are hitting one hour timeout even. Something is wrong.

=== RUN   TestMultipleVMs_Isolated

    service_integ_test.go:269: TestMultipleVMs_Isolated: will run 100 vm's

panic: test timed out after 1h0m0s

goroutine 7359 [running]:

testing.(*M).startAlarm.func1()

    /usr/local/go/src/testing/testing.go:1710 +0x11f

created by time.goFunc

    /usr/local/go/src/time/sleep.go:180 +0x52

goroutine 1 [chan receive, 60 minutes]:

testing.(*T).Run(0xc00057c480, 0x1f48a06, 0x18, 0x1f8a3d8, 0x1)

    /usr/local/go/src/testing/testing.go:1249 +0x610

testing.runTests.func1(0xc00057c480)

    /usr/local/go/src/testing/testing.go:1521 +0xa7

testing.tRunner(0xc00057c480, 0xc0006bfce0)

    /usr/local/go/src/testing/testing.go:1203 +0x203

testing.runTests(0xc000312000, 0x2a85360, 0x2e, 0x2e, 0xc09240127ac50e15, 0x34635650755, 0x2af7be0, 0xc0006bfda0)

    /usr/local/go/src/testing/testing.go:1519 +0x613

testing.(*M).Run(0xc000310000, 0x0)

    /usr/local/go/src/testing/testing.go:1427 +0x3b4

main.main()

    _testmain.go:133 +0x237

goroutine 120 [semacquire, 59 minutes]:

sync.runtime_Semacquire(0xc00061ab20)

    /usr/local/go/src/runtime/sema.go:56 +0x45

sync.(*WaitGroup).Wait(0xc00061ab18)

    /usr/local/go/src/sync/waitgroup.go:130 +0xe5

golang.org/x/sync/errgroup.(*Group).Wait(0xc00061ab10, 0xc0006d26f0, 0x212e860)

    /go/pkg/mod/golang.org/x/sync@v0.0.0-20210220032951-036812b2e83c/errgroup/errgroup.go:40 +0x45

github.com/firecracker-microvm/firecracker-containerd/runtime.TestMultipleVMs_Isolated(0xc00057c600)

    /src/runtime/service_integ_test.go:450 +0xe45

testing.tRunner(0xc00057c600, 0x1f8a3d8)

    /usr/local/go/src/testing/testing.go:1203 +0x203

created by testing.(*T).Run

    /usr/local/go/src/testing/testing.go:1248 +0x5d8

goroutine 121 [select, 59 minutes]:

google.golang.org/grpc.(*ccBalancerWrapper).watcher(0xc000232900)

    /go/pkg/mod/google.golang.org/grpc@v1.38.1/balancer_conn_wrappers.go:69 +0x197

created by google.golang.org/grpc.newCCBalancerWrapper

    /go/pkg/mod/google.golang.org/grpc@v1.38.1/balancer_conn_wrappers.go:60 +0x2fc

goroutine 122 [chan receive, 59 minutes]:

google.golang.org/grpc.(*addrConn).resetTransport(0xc000550dc0)

    /go/pkg/mod/google.golang.org/grpc@v1.38.1/clientconn.go:1214 +0x8fd

created by google.golang.org/grpc.(*addrConn).connect

    /go/pkg/mod/google.golang.org/grpc@v1.38.1/clientconn.go:844 +0x107

goroutine 135 [IO wait, 59 minutes]:

internal/poll.runtime_pollWait(0x7ff85bad2658, 0x72, 0x2a976d8)

    /usr/local/go/src/runtime/netpoll.go:227 +0x55

internal/poll.(*pollDesc).wait(0xc0003edf18, 0x72, 0x8000, 0x8000, 0xffffffffffffffff)

    /usr/local/go/src/internal/poll/fd_poll_runtime.go:87 +0xe6

internal/poll.(*pollDesc).waitRead(...)

    /usr/local/go/src/internal/poll/fd_poll_runtime.go:92

internal/poll.(*FD).Read(0xc0003edf00, 0xc0004d8000, 0x8000, 0x8000, 0x0, 0x0, 0x0)

    /usr/local/go/src/internal/poll/fd_unix.go:166 +0x28f

net.(*netFD).Read(0xc0003edf00, 0xc0004d8000, 0x8000, 0x8000, 0x11, 0x0, 0x0)

    /usr/local/go/src/net/fd_posix.go:55 +0x69

net.(*conn).Read(0xc00057e000, 0xc0004d8000, 0x8000, 0x8000, 0xc0004d8000, 0x9, 0x9047c0)

    /usr/local/go/src/net/net.go:183 +0xec

bufio.(*Reader).Read(0xc0001920c0, 0xc000250120, 0x9, 0x9, 0x800010601, 0x1060100000000, 0x8)

    /usr/local/go/src/bufio/bufio.go:227 +0x7ba

io.ReadAtLeast(0x210ce60, 0xc0001920c0, 0xc000250120, 0x9, 0x9, 0x9, 0x1c9c710, 0x7ff82aee0008, 0xc0004ac0b8)

    /usr/local/go/src/io/io.go:328 +0x9d

io.ReadFull(...)

    /usr/local/go/src/io/io.go:347

golang.org/x/net/http2.readFrameHeader(0xc000250120, 0x9, 0x9, 0x210ce60, 0xc0001920c0, 0x0, 0xc000000000, 0x7ff85bacf378, 0x1)

    /go/pkg/mod/golang.org/x/net@v0.0.0-20220127200216-cd36cc0744dd/http2/frame.go:237 +0xb1

golang.org/x/net/http2.(*Framer).ReadFrame(0xc0002500e0, 0xc0007f4d68, 0xc0007f4d68, 0x0, 0x0)

    /go/pkg/mod/golang.org/x/net@v0.0.0-20220127200216-cd36cc0744dd/http2/frame.go:498 +0xff

google.golang.org/grpc/internal/transport.(*http2Client).reader(0xc00000c1e0)

    /go/pkg/mod/google.golang.org/grpc@v1.38.1/internal/transport/http2_client.go:1353 +0x2c5

created by google.golang.org/grpc/internal/transport.newHTTP2Client

    /go/pkg/mod/google.golang.org/grpc@v1.38.1/internal/transport/http2_client.go:346 +0x1489

goroutine 136 [select, 59 minutes]:

google.golang.org/grpc/internal/transport.(*controlBuffer).get(0xc0004ac050, 0x1, 0x0, 0x0, 0x0, 0x0)

    /go/pkg/mod/google.golang.org/grpc@v1.38.1/internal/transport/controlbuf.go:407 +0x1ea

google.golang.org/grpc/internal/transport.(*loopyWriter).run(0xc00060e0c0, 0x0, 0x0)

    /go/pkg/mod/google.golang.org/grpc@v1.38.1/internal/transport/controlbuf.go:527 +0x2df

google.golang.org/grpc/internal/transport.newHTTP2Client.func3(0xc00000c1e0)

    /go/pkg/mod/google.golang.org/grpc@v1.38.1/internal/transport/http2_client.go:396 +0x12e

created by google.golang.org/grpc/internal/transport.newHTTP2Client

    /go/pkg/mod/google.golang.org/grpc@v1.38.1/internal/transport/http2_client.go:394 +0x1b27

goroutine 138 [select, 59 minutes]:

github.com/containerd/ttrpc.(*Client).run.func1(0xc00024cc80, 0xc0005ba0e0)

    /go/pkg/mod/github.com/containerd/ttrpc@v1.1.0/client.go:265 +0x153

created by github.com/containerd/ttrpc.(*Client).run

    /go/pkg/mod/github.com/containerd/ttrpc@v1.1.0/client.go:262 +0x148

goroutine 24 [select, 59 minutes]:

github.com/containerd/ttrpc.(*Client).run(0xc00024cc80)

    /go/pkg/mod/github.com/containerd/ttrpc@v1.1.0/client.go:330 +0x2a5

created by github.com/containerd/ttrpc.NewClient

    /go/pkg/mod/github.com/containerd/ttrpc@v1.1.0/client.go:94 +0x35b

goroutine 139 [IO wait, 59 minutes]:

internal/poll.runtime_pollWait(0x7ff85bad2570, 0x72, 0x2a976d8)

    /usr/local/go/src/runtime/netpoll.go:227 +0x55

internal/poll.(*pollDesc).wait(0xc00024ca98, 0x72, 0x1000, 0x1000, 0xffffffffffffffff)

    /usr/local/go/src/internal/poll/fd_poll_runtime.go:87 +0xe6

internal/poll.(*pollDesc).waitRead(...)

    /usr/local/go/src/internal/poll/fd_poll_runtime.go:92

internal/poll.(*FD).Read(0xc00024ca80, 0xc0006fb000, 0x1000, 0x1000, 0x0, 0x0, 0x0)

    /usr/local/go/src/internal/poll/fd_unix.go:166 +0x28f

net.(*netFD).Read(0xc00024ca80, 0xc0006fb000, 0x1000, 0x1000, 0x1f184a0, 0x8fc9bc, 0x1c9c710)

    /usr/local/go/src/net/fd_posix.go:55 +0x69

net.(*conn).Read(0xc00057e258, 0xc0006fb000, 0x1000, 0x1000, 0x8, 0xc001a05bd0, 0x8b800f)

    /usr/local/go/src/net/net.go:183 +0xec

bufio.(*Reader).Read(0xc00003d0e0, 0xc0004bcee0, 0xa, 0xa, 0x80, 0xc0005dd000, 0xc001a05d30)

    /usr/local/go/src/bufio/bufio.go:227 +0x7ba

io.ReadAtLeast(0x210ce60, 0xc00003d0e0, 0xc0004bcee0, 0xa, 0xa, 0xa, 0xc001a05d70, 0x8b69b5, 0x8ec7fc)

    /usr/local/go/src/io/io.go:328 +0x9d

io.ReadFull(...)

    /usr/local/go/src/io/io.go:347

github.com/containerd/ttrpc.readMessageHeader(0xc0004bcee0, 0xa, 0xa, 0x210ce60, 0xc00003d0e0, 0x0, 0xc0004bce50, 0xc001a05df8, 0x8fbbaa)

    /go/pkg/mod/github.com/containerd/ttrpc@v1.1.0/channel.go:53 +0x99

github.com/containerd/ttrpc.(*channel).recv(0xc0004bcec0, 0xc0004c4780, 0xc001a05f00, 0x0, 0x0, 0x0, 0x0, 0x0)

    /go/pkg/mod/github.com/containerd/ttrpc@v1.1.0/channel.go:101 +0xa5

github.com/containerd/ttrpc.(*Client).run.func2(0xc0001c2180, 0xc00024cc80, 0xc0005ba0e0)

    /go/pkg/mod/github.com/containerd/ttrpc@v1.1.0/client.go:294 +0xfa

created by github.com/containerd/ttrpc.(*Client).run

    /go/pkg/mod/github.com/containerd/ttrpc@v1.1.0/client.go:286 +0x187

goroutine 327 [select, 59 minutes]:

github.com/containerd/ttrpc.(*Client).dispatch(0xc00024cc80, 0x212e860, 0xc0004bcf80, 0xc00043c8a0, 0xc000c12240, 0x7ff85c83fa68, 0x10)

    /go/pkg/mod/github.com/containerd/ttrpc@v1.1.0/client.go:163 +0x505

github.com/containerd/ttrpc.defaultClientInterceptor(0x212e860, 0xc0004bcf80, 0xc00043c8a0, 0xc000c12240, 0xc0006200c0, 0xc0006200d0, 0x15, 0x10)

    /go/pkg/mod/github.com/containerd/ttrpc@v1.1.0/interceptor.go:49 +0x62

github.com/containerd/ttrpc.(*Client).Call(0xc00024cc80, 0x212e860, 0xc0004bcf80, 0x1f3855c, 0xb, 0x1f34967, 0x8, 0x1ef77a0, 0xc00067e820, 0x1eca380, ...)

    /go/pkg/mod/github.com/containerd/ttrpc@v1.1.0/client.go:132 +0x495

github.com/firecracker-microvm/firecracker-containerd/proto/service/fccontrol/ttrpc.(*firecrackerClient).CreateVM(0xc00057e260, 0x212e860, 0xc0004bcf80, 0xc00067e820, 0x0, 0x0, 0x5)

    /src/proto/service/fccontrol/ttrpc/fccontrol.pb.go:172 +0x137

github.com/firecracker-microvm/firecracker-containerd/runtime.TestMultipleVMs_Isolated.func1(0x212e860, 0xc0004bcf80, 0x0, 0x0)

    /src/runtime/service_integ_test.go:360 +0x7f9

github.com/firecracker-microvm/firecracker-containerd/runtime.TestMultipleVMs_Isolated.func2(0xc000ba7750, 0x0)

    /src/runtime/service_integ_test.go:442 +0x5f

golang.org/x/sync/errgroup.(*Group).Go.func1(0xc00061ab10, 0xc000b84120)

    /go/pkg/mod/golang.org/x/sync@v0.0.0-20210220032951-036812b2e83c/errgroup/errgroup.go:57 +0x95

created by golang.org/x/sync/errgroup.(*Group).Go

    /go/pkg/mod/golang.org/x/sync@v0.0.0-20210220032951-036812b2e83c/errgroup/errgroup.go:54 +0x74

exit status 2

FAIL    github.com/firecracker-microvm/firecracker-containerd/runtime   3600.109s

make[1]: *** [Makefile:56: integ-test-TestMultipleVMs_Isolated] Error 1

make[1]: Leaving directory '/local/home/buildkite-agent/builds/ip-10-0-0-199-36/firecracker-microvm/firecracker-containerd/runtime'

make: *** [Makefile:52: integ-test] Error 2
kzys commented 2 years ago

There were Firecracker processes that couldn't progress much. I attached strace and then it made them move forward somehow. Maybe there is a timing issue?

fc-strace.zip