grafana / mimir

Grafana Mimir provides horizontally scalable, highly available, multi-tenant, long-term storage for Prometheus.
https://grafana.com/oss/mimir/
GNU Affero General Public License v3.0

Flaky TestAlertmanager_ReplicasPosition data race #197

Open bboreham opened 3 years ago

bboreham commented 3 years ago

The test reports a data race; it also runs for four minutes and consumes more than 500 MB of RAM.

$ make test

>>>> Entering build container: protos
time docker run --rm --tty -i -v ~/.ssh/:/root/.ssh:delegated,z -v /home/vagrant/src/github.com/grafana/mimir/.cache:/go/cache:delegated,z -v /home/vagrant/src/github.com/grafana/mimir/.pkg:/go/pkg:delegated,z -v /home/vagrant/src/github.com/grafana/mimir:/go/src/github.com/grafana/mimir:delegated,z us.gcr.io/kubernetes-dev/mimir-build-image protos;
make: Entering directory '/go/src/github.com/grafana/mimir'
make: Nothing to be done for 'protos'.
make: Leaving directory '/go/src/github.com/grafana/mimir'
0.04user 0.01system 0:01.08elapsed 5%CPU (0avgtext+0avgdata 61264maxresident)k
0inputs+0outputs (0major+7619minor)pagefaults 0swaps

>>>> Entering build container: test
time docker run --rm --tty -i -v ~/.ssh/:/root/.ssh:delegated,z -v /home/vagrant/src/github.com/grafana/mimir/.cache:/go/cache:delegated,z -v /home/vagrant/src/github.com/grafana/mimir/.pkg:/go/pkg:delegated,z -v /home/vagrant/src/github.com/grafana/mimir:/go/src/github.com/grafana/mimir:delegated,z us.gcr.io/kubernetes-dev/mimir-build-image test;
make: Entering directory '/go/src/github.com/grafana/mimir'
go test -tags netgo -timeout 30m -race -count 1 ./...
?       github.com/grafana/mimir/cmd/blocksconvert      [no test files]
ok      github.com/grafana/mimir/cmd/mimir      1.035s
ok      github.com/grafana/mimir/cmd/proftest   2.554s
ok      github.com/grafana/mimir/cmd/query-tee  0.071s
?       github.com/grafana/mimir/cmd/test-exporter      [no test files]
?       github.com/grafana/mimir/cmd/thanosconvert      [no test files]
?       github.com/grafana/mimir/integration    [no test files]
?       github.com/grafana/mimir/integration/ca [no test files]
ok      github.com/grafana/mimir/integration/e2e        0.037s
?       github.com/grafana/mimir/integration/e2e/cache  [no test files]
?       github.com/grafana/mimir/integration/e2e/db     [no test files]
?       github.com/grafana/mimir/integration/e2e/images [no test files]
?       github.com/grafana/mimir/integration/e2emimir   [no test files]
level=info component=MultiTenantAlertmanager msg="instance not found in the ring" instance=instance-1 ring=alertmanager
level=info component=MultiTenantAlertmanager waitinguntilalertmanagerisJOININGinthering=(MISSING)
level=info component=MultiTenantAlertmanager msg="alertmanager is JOINING in the ring"
level=info component=MultiTenantAlertmanager msg="synchronizing alertmanager configs for users"
level=debug component=MultiTenantAlertmanager msg="adding configurations" num_configs=0
level=info component=MultiTenantAlertmanager msg="waiting until initial state sync is complete for all users"
level=info component=MultiTenantAlertmanager msg="initial state sync is complete"
level=info component=MultiTenantAlertmanager msg="waiting until alertmanager is ACTIVE in the ring"
level=info component=MultiTenantAlertmanager msg="alertmanager is ACTIVE in the ring"
level=info component=MultiTenantAlertmanager msg="instance not found in the ring" instance=instance-2 ring=alertmanager
level=info component=MultiTenantAlertmanager waitinguntilalertmanagerisJOININGinthering=(MISSING)
level=info component=MultiTenantAlertmanager msg="alertmanager is JOINING in the ring"
level=info component=MultiTenantAlertmanager msg="synchronizing alertmanager configs for users"
level=debug component=MultiTenantAlertmanager msg="adding configurations" num_configs=0
level=info component=MultiTenantAlertmanager msg="waiting until initial state sync is complete for all users"
level=info component=MultiTenantAlertmanager msg="initial state sync is complete"
level=info component=MultiTenantAlertmanager msg="waiting until alertmanager is ACTIVE in the ring"
level=info component=MultiTenantAlertmanager msg="alertmanager is ACTIVE in the ring"
level=info component=MultiTenantAlertmanager msg="instance not found in the ring" instance=instance-3 ring=alertmanager
level=info component=MultiTenantAlertmanager waitinguntilalertmanagerisJOININGinthering=(MISSING)
level=info component=MultiTenantAlertmanager msg="alertmanager is JOINING in the ring"
level=info component=MultiTenantAlertmanager msg="synchronizing alertmanager configs for users"
level=debug component=MultiTenantAlertmanager msg="adding configurations" num_configs=0
level=info component=MultiTenantAlertmanager msg="waiting until initial state sync is complete for all users"
level=info component=MultiTenantAlertmanager msg="initial state sync is complete"
level=info component=MultiTenantAlertmanager msg="waiting until alertmanager is ACTIVE in the ring"
level=info component=MultiTenantAlertmanager msg="alertmanager is ACTIVE in the ring"
level=info component=MultiTenantAlertmanager msg="synchronizing alertmanager configs for users"
level=debug component=MultiTenantAlertmanager msg="adding configurations" num_configs=1
level=debug component=MultiTenantAlertmanager msg="setting config" user=user1
level=debug component=MultiTenantAlertmanager msg="initializing new per-tenant alertmanager" user=user1
level=debug component=MultiTenantAlertmanager user=user1 msg="starting tenant alertmanager with ring-based replication"
level=info component=MultiTenantAlertmanager user=user1 msg="Waiting for notification and silences to settle..."
level=debug component=MultiTenantAlertmanager msg="contacting replica for full state" user=user1 addr=127.0.0.1-3:0
level=error component=MultiTenantAlertmanager msg="rpc reading state from replica failed" addr=127.0.0.1-3:0 user=user1 err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp: lookup 127.0.0.1-3 on 192.168.0.1:53: no such host\""
level=info component=MultiTenantAlertmanager user=user1 msg="state not settled; trying to read from storage" err="failed to read state from any replica"
level=info component=MultiTenantAlertmanager user=user1 msg="no state for user in storage; proceeding" user=user1
level=info component=MultiTenantAlertmanager msg="synchronizing alertmanager configs for users"
level=debug component=MultiTenantAlertmanager msg="adding configurations" num_configs=2
level=debug component=MultiTenantAlertmanager msg="setting config" user=user3
level=debug component=MultiTenantAlertmanager msg="initializing new per-tenant alertmanager" user=user3
level=debug component=MultiTenantAlertmanager user=user3 msg="starting tenant alertmanager with ring-based replication"
level=info component=MultiTenantAlertmanager user=user3 msg="Waiting for notification and silences to settle..."
level=debug component=MultiTenantAlertmanager msg="contacting replica for full state" user=user3 addr=127.0.0.1-3:0
level=error component=MultiTenantAlertmanager msg="rpc reading state from replica failed" addr=127.0.0.1-3:0 user=user3 err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp: lookup 127.0.0.1-3 on 192.168.0.1:53: no such host\""
level=info component=MultiTenantAlertmanager user=user3 msg="state not settled; trying to read from storage" err="failed to read state from any replica"
level=info component=MultiTenantAlertmanager user=user3 msg="no state for user in storage; proceeding" user=user3
level=debug component=MultiTenantAlertmanager msg="setting config" user=user2
level=debug component=MultiTenantAlertmanager msg="initializing new per-tenant alertmanager" user=user2
level=debug component=MultiTenantAlertmanager user=user2 msg="starting tenant alertmanager with ring-based replication"
level=info component=MultiTenantAlertmanager user=user2 msg="Waiting for notification and silences to settle..."
level=debug component=MultiTenantAlertmanager msg="contacting replica for full state" user=user2 addr=127.0.0.1-3:0
level=error component=MultiTenantAlertmanager msg="rpc reading state from replica failed" addr=127.0.0.1-3:0 user=user2 err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp: lookup 127.0.0.1-3 on 192.168.0.1:53: no such host\""
level=info component=MultiTenantAlertmanager user=user2 msg="state not settled; trying to read from storage" err="failed to read state from any replica"
level=info component=MultiTenantAlertmanager user=user2 msg="no state for user in storage; proceeding" user=user2
level=info component=MultiTenantAlertmanager msg="synchronizing alertmanager configs for users"
level=debug component=MultiTenantAlertmanager msg="adding configurations" num_configs=3
level=debug component=MultiTenantAlertmanager msg="setting config" user=user1
level=debug component=MultiTenantAlertmanager msg="initializing new per-tenant alertmanager" user=user1
level=debug component=MultiTenantAlertmanager user=user1 msg="starting tenant alertmanager with ring-based replication"
level=info component=MultiTenantAlertmanager user=user1 msg="Waiting for notification and silences to settle..."
level=debug component=MultiTenantAlertmanager msg="contacting replica for full state" user=user1 addr=127.0.0.1-1:0
level=error component=MultiTenantAlertmanager msg="rpc reading state from replica failed" addr=127.0.0.1-1:0 user=user1 err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp: lookup 127.0.0.1-1 on 192.168.0.1:53: no such host\""
level=info component=MultiTenantAlertmanager user=user1 msg="state not settled; trying to read from storage" err="failed to read state from any replica"
level=info component=MultiTenantAlertmanager user=user1 msg="no state for user in storage; proceeding" user=user1
level=debug component=MultiTenantAlertmanager msg="setting config" user=user2
level=debug component=MultiTenantAlertmanager msg="initializing new per-tenant alertmanager" user=user2
level=debug component=MultiTenantAlertmanager user=user2 msg="starting tenant alertmanager with ring-based replication"
level=info component=MultiTenantAlertmanager user=user2 msg="Waiting for notification and silences to settle..."
level=debug component=MultiTenantAlertmanager msg="contacting replica for full state" user=user2 addr=127.0.0.1-2:0
level=error component=MultiTenantAlertmanager msg="rpc reading state from replica failed" addr=127.0.0.1-2:0 user=user2 err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp: lookup 127.0.0.1-2 on 192.168.0.1:53: no such host\""
level=info component=MultiTenantAlertmanager user=user2 msg="state not settled; trying to read from storage" err="failed to read state from any replica"
level=info component=MultiTenantAlertmanager user=user2 msg="no state for user in storage; proceeding" user=user2
level=debug component=MultiTenantAlertmanager msg="setting config" user=user3
level=debug component=MultiTenantAlertmanager msg="initializing new per-tenant alertmanager" user=user3
level=debug component=MultiTenantAlertmanager user=user3 msg="starting tenant alertmanager with ring-based replication"
level=info component=MultiTenantAlertmanager user=user3 msg="Waiting for notification and silences to settle..."
level=debug component=MultiTenantAlertmanager msg="contacting replica for full state" user=user3 addr=127.0.0.1-2:0
level=error component=MultiTenantAlertmanager msg="rpc reading state from replica failed" addr=127.0.0.1-2:0 user=user3 err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp: lookup 127.0.0.1-2 on 192.168.0.1:53: no such host\""
level=info component=MultiTenantAlertmanager user=user3 msg="state not settled; trying to read from storage" err="failed to read state from any replica"
level=info component=MultiTenantAlertmanager user=user3 msg="no state for user in storage; proceeding" user=user3
level=info component=MultiTenantAlertmanager msg="unregistering instance from ring" ring=alertmanager
level=debug component=MultiTenantAlertmanager user=user1 component=nflog msg="Running maintenance"
level=debug component=MultiTenantAlertmanager user=user1 component=silences msg="Running maintenance"
level=info component=MultiTenantAlertmanager msg="instance removed from the ring" ring=alertmanager
level=debug component=MultiTenantAlertmanager user=user1 component=nflog msg="Maintenance done" duration=3.068597ms size=0
level=debug component=MultiTenantAlertmanager user=user1 component=silences msg="Maintenance done" duration=5.063864ms size=0
level=debug component=MultiTenantAlertmanager user=user2 component=nflog msg="Running maintenance"
level=debug component=MultiTenantAlertmanager user=user2 component=silences msg="Running maintenance"
level=debug component=MultiTenantAlertmanager user=user2 component=nflog msg="Maintenance done" duration=1.25843ms size=0
level=debug component=MultiTenantAlertmanager user=user2 component=silences msg="Maintenance done" duration=1.427035ms size=0
level=debug component=MultiTenantAlertmanager user=user3 component=nflog msg="Running maintenance"
level=debug component=MultiTenantAlertmanager user=user3 component=silences msg="Running maintenance"
level=debug component=MultiTenantAlertmanager user=user3 component=nflog msg="Maintenance done" duration=769.344µs size=0
level=debug component=MultiTenantAlertmanager user=user3 component=silences msg="Maintenance done" duration=1.056838ms size=0
level=debug component=MultiTenantAlertmanager user=user3 component=silences msg="Running maintenance"
level=debug component=MultiTenantAlertmanager user=user3 component=nflog msg="Running maintenance"
level=info component=MultiTenantAlertmanager msg="unregistering instance from ring" ring=alertmanager
level=debug component=MultiTenantAlertmanager user=user3 component=silences msg="Maintenance done" duration=1.148904ms size=0
level=info component=MultiTenantAlertmanager msg="instance removed from the ring" ring=alertmanager
level=debug component=MultiTenantAlertmanager user=user3 component=nflog msg="Maintenance done" duration=1.775341ms size=0
level=debug component=MultiTenantAlertmanager user=user2 component=silences msg="Running maintenance"
level=debug component=MultiTenantAlertmanager user=user2 component=nflog msg="Running maintenance"
level=debug component=MultiTenantAlertmanager user=user2 component=silences msg="Maintenance done" duration=923.19µs size=0
level=debug component=MultiTenantAlertmanager user=user2 component=nflog msg="Maintenance done" duration=1.177849ms size=0
level=info component=MultiTenantAlertmanager msg="unregistering instance from ring" ring=alertmanager
level=info component=MultiTenantAlertmanager msg="instance removed from the ring" ring=alertmanager
level=debug component=MultiTenantAlertmanager user=user1 component=nflog msg="Running maintenance"
level=debug component=MultiTenantAlertmanager user=user1 component=silences msg="Running maintenance"
level=debug component=MultiTenantAlertmanager user=user1 component=nflog msg="Maintenance done" duration=792.585µs size=0
level=debug component=MultiTenantAlertmanager user=user1 component=silences msg="Maintenance done" duration=938.907µs size=0
level=info component=MultiTenantAlertmanager msg="instance not found in the ring" instance=instance-1 ring=alertmanager
level=info component=MultiTenantAlertmanager waitinguntilalertmanagerisJOININGinthering=(MISSING)
level=info component=MultiTenantAlertmanager msg="alertmanager is JOINING in the ring"
level=info component=MultiTenantAlertmanager msg="synchronizing alertmanager configs for users"
level=debug component=MultiTenantAlertmanager msg="adding configurations" num_configs=0
level=info component=MultiTenantAlertmanager msg="waiting until initial state sync is complete for all users"
level=info component=MultiTenantAlertmanager msg="initial state sync is complete"
level=info component=MultiTenantAlertmanager msg="waiting until alertmanager is ACTIVE in the ring"
level=info component=MultiTenantAlertmanager msg="alertmanager is ACTIVE in the ring"
level=info component=MultiTenantAlertmanager msg="instance not found in the ring" instance=instance-2 ring=alertmanager
level=info component=MultiTenantAlertmanager waitinguntilalertmanagerisJOININGinthering=(MISSING)
level=info component=MultiTenantAlertmanager msg="alertmanager is JOINING in the ring"
level=info component=MultiTenantAlertmanager msg="synchronizing alertmanager configs for users"
level=debug component=MultiTenantAlertmanager msg="adding configurations" num_configs=0
level=info component=MultiTenantAlertmanager msg="waiting until initial state sync is complete for all users"
level=info component=MultiTenantAlertmanager msg="initial state sync is complete"
level=info component=MultiTenantAlertmanager msg="waiting until alertmanager is ACTIVE in the ring"
level=info component=MultiTenantAlertmanager msg="alertmanager is ACTIVE in the ring"
level=info component=MultiTenantAlertmanager msg="synchronizing alertmanager configs for users"
level=debug component=MultiTenantAlertmanager msg="adding configurations" num_configs=2
level=debug component=MultiTenantAlertmanager msg="setting config" user=user2
level=debug component=MultiTenantAlertmanager msg="initializing new per-tenant alertmanager" user=user2
level=debug component=MultiTenantAlertmanager user=user2 msg="starting tenant alertmanager with ring-based replication"
level=info component=MultiTenantAlertmanager user=user2 msg="Waiting for notification and silences to settle..."
level=info component=MultiTenantAlertmanager user=user2 msg="skipping settling (no replicas)"
level=debug component=MultiTenantAlertmanager msg="setting config" user=user1
level=debug component=MultiTenantAlertmanager msg="initializing new per-tenant alertmanager" user=user1
level=debug component=MultiTenantAlertmanager user=user1 msg="starting tenant alertmanager with ring-based replication"
level=info component=MultiTenantAlertmanager user=user1 msg="Waiting for notification and silences to settle..."
level=info component=MultiTenantAlertmanager user=user1 msg="skipping settling (no replicas)"
level=debug component=MultiTenantAlertmanager user=user2 msg="persisting state" user=user2
level=info component=MultiTenantAlertmanager msg="synchronizing alertmanager configs for users"
level=debug component=MultiTenantAlertmanager msg="adding configurations" num_configs=0
level=debug component=MultiTenantAlertmanager user=user1 msg="persisting state" user=user1
level=info component=MultiTenantAlertmanager msg="synchronizing alertmanager configs for users"
level=debug component=MultiTenantAlertmanager msg="adding configurations" num_configs=2
level=debug component=MultiTenantAlertmanager msg="setting config" user=user1
level=debug component=MultiTenantAlertmanager msg="setting config" user=user2
level=info component=MultiTenantAlertmanager msg="synchronizing alertmanager configs for users"
level=debug component=MultiTenantAlertmanager msg="adding configurations" num_configs=0
level=info component=MultiTenantAlertmanager msg="synchronizing alertmanager configs for users"
level=debug component=MultiTenantAlertmanager msg="adding configurations" num_configs=1
level=debug component=MultiTenantAlertmanager msg="setting config" user=user2
level=info component=MultiTenantAlertmanager msg="deactivating per-tenant alertmanager" user=user1
level=debug component=MultiTenantAlertmanager user=user1 component=silences msg="Running maintenance"
level=debug component=MultiTenantAlertmanager user=user1 component=nflog msg="Running maintenance"
level=debug component=MultiTenantAlertmanager user=user1 component=silences msg="Maintenance done" duration=1.192147ms size=0
level=debug component=MultiTenantAlertmanager user=user1 component=nflog msg="Maintenance done" duration=1.435949ms size=0
level=info component=MultiTenantAlertmanager msg="deactivated per-tenant alertmanager" user=user1
level=info component=MultiTenantAlertmanager msg="deleted local directory for user" dir=/tmp/alertmanager640832338/user1 user=user1
level=info component=MultiTenantAlertmanager msg="deleted remote state for user" user=user1
level=info component=MultiTenantAlertmanager msg="synchronizing alertmanager configs for users"
level=debug component=MultiTenantAlertmanager msg="adding configurations" num_configs=0
level=info component=MultiTenantAlertmanager msg="unregistering instance from ring" ring=alertmanager
level=info component=MultiTenantAlertmanager msg="instance removed from the ring" ring=alertmanager
level=info component=MultiTenantAlertmanager msg="unregistering instance from ring" ring=alertmanager
level=info component=MultiTenantAlertmanager msg="instance removed from the ring" ring=alertmanager
level=debug component=MultiTenantAlertmanager user=user2 component=nflog msg="Running maintenance"
level=debug component=MultiTenantAlertmanager user=user2 component=silences msg="Running maintenance"
level=debug component=MultiTenantAlertmanager user=user2 component=nflog msg="Maintenance done" duration=959.531µs size=0
level=debug component=MultiTenantAlertmanager user=user2 component=silences msg="Maintenance done" duration=1.075525ms size=0
==================
WARNING: DATA RACE
Write at 0x00c0000ea510 by goroutine 97:
  github.com/hashicorp/memberlist.(*Memberlist).aliveNode()
      /go/src/github.com/grafana/mimir/vendor/github.com/hashicorp/memberlist/state.go:1108 +0x6fd
  github.com/hashicorp/memberlist.(*Memberlist).handleAlive()
      /go/src/github.com/grafana/mimir/vendor/github.com/hashicorp/memberlist/net.go:673 +0x58e
  github.com/hashicorp/memberlist.(*Memberlist).packetHandler()
      /go/src/github.com/grafana/mimir/vendor/github.com/hashicorp/memberlist/net.go:448 +0x236

Previous read at 0x00c0000ea510 by goroutine 103:
  github.com/hashicorp/memberlist.(*Node).Address()
      /go/src/github.com/grafana/mimir/vendor/github.com/hashicorp/memberlist/state.go:43 +0x44
  github.com/prometheus/alertmanager/cluster.(*Peer).refresh()
      /go/src/github.com/grafana/mimir/vendor/github.com/prometheus/alertmanager/cluster/cluster.go:452 +0x364
  github.com/prometheus/alertmanager/cluster.(*Peer).refresh-fm()
      /go/src/github.com/grafana/mimir/vendor/github.com/prometheus/alertmanager/cluster/cluster.go:439 +0x4a
  github.com/prometheus/alertmanager/cluster.(*Peer).runPeriodicTask()
      /go/src/github.com/grafana/mimir/vendor/github.com/prometheus/alertmanager/cluster/cluster.go:395 +0xa3

Goroutine 97 (running) created at:
  github.com/hashicorp/memberlist.newMemberlist()
      /go/src/github.com/grafana/mimir/vendor/github.com/hashicorp/memberlist/memberlist.go:218 +0xa29
  github.com/hashicorp/memberlist.Create()
      /go/src/github.com/grafana/mimir/vendor/github.com/hashicorp/memberlist/memberlist.go:228 +0x3c
  github.com/prometheus/alertmanager/cluster.Create()
      /go/src/github.com/grafana/mimir/vendor/github.com/prometheus/alertmanager/cluster/cluster.go:238 +0x1504
  github.com/grafana/mimir/pkg/alertmanager.TestMultitenantAlertmanager_GetStatusHandler.func1()
      /go/src/github.com/grafana/mimir/pkg/alertmanager/alertmanager_http_test.go:28 +0x27c
  github.com/grafana/mimir/pkg/alertmanager.TestMultitenantAlertmanager_GetStatusHandler()
      /go/src/github.com/grafana/mimir/pkg/alertmanager/alertmanager_http_test.go:52 +0x421
  testing.tRunner()
      /usr/local/go/src/testing/testing.go:1193 +0x202

Goroutine 103 (running) created at:
  github.com/prometheus/alertmanager/cluster.(*Peer).Join()
      /go/src/github.com/grafana/mimir/vendor/github.com/prometheus/alertmanager/cluster/cluster.go:271 +0x459
  github.com/grafana/mimir/pkg/alertmanager.TestMultitenantAlertmanager_GetStatusHandler()
      /go/src/github.com/grafana/mimir/pkg/alertmanager/alertmanager_http_test.go:55 +0x4e6
  testing.tRunner()
      /usr/local/go/src/testing/testing.go:1193 +0x202
==================
==================
WARNING: DATA RACE
Write at 0x00c0000ea528 by goroutine 97:
  github.com/hashicorp/memberlist.(*Memberlist).aliveNode()
      /go/src/github.com/grafana/mimir/vendor/github.com/hashicorp/memberlist/state.go:1109 +0x771
  github.com/hashicorp/memberlist.(*Memberlist).handleAlive()
      /go/src/github.com/grafana/mimir/vendor/github.com/hashicorp/memberlist/net.go:673 +0x58e
  github.com/hashicorp/memberlist.(*Memberlist).packetHandler()
      /go/src/github.com/grafana/mimir/vendor/github.com/hashicorp/memberlist/net.go:448 +0x236

Previous read at 0x00c0000ea528 by goroutine 103:
  github.com/hashicorp/memberlist.(*Node).Address()
      /go/src/github.com/grafana/mimir/vendor/github.com/hashicorp/memberlist/state.go:43 +0x8e
  github.com/prometheus/alertmanager/cluster.(*Peer).refresh()
      /go/src/github.com/grafana/mimir/vendor/github.com/prometheus/alertmanager/cluster/cluster.go:452 +0x364
  github.com/prometheus/alertmanager/cluster.(*Peer).refresh-fm()
      /go/src/github.com/grafana/mimir/vendor/github.com/prometheus/alertmanager/cluster/cluster.go:439 +0x4a
  github.com/prometheus/alertmanager/cluster.(*Peer).runPeriodicTask()
      /go/src/github.com/grafana/mimir/vendor/github.com/prometheus/alertmanager/cluster/cluster.go:395 +0xa3

Goroutine 97 (running) created at:
  github.com/hashicorp/memberlist.newMemberlist()
      /go/src/github.com/grafana/mimir/vendor/github.com/hashicorp/memberlist/memberlist.go:218 +0xa29
  github.com/hashicorp/memberlist.Create()
      /go/src/github.com/grafana/mimir/vendor/github.com/hashicorp/memberlist/memberlist.go:228 +0x3c
  github.com/prometheus/alertmanager/cluster.Create()
      /go/src/github.com/grafana/mimir/vendor/github.com/prometheus/alertmanager/cluster/cluster.go:238 +0x1504
  github.com/grafana/mimir/pkg/alertmanager.TestMultitenantAlertmanager_GetStatusHandler.func1()
      /go/src/github.com/grafana/mimir/pkg/alertmanager/alertmanager_http_test.go:28 +0x27c
  github.com/grafana/mimir/pkg/alertmanager.TestMultitenantAlertmanager_GetStatusHandler()
      /go/src/github.com/grafana/mimir/pkg/alertmanager/alertmanager_http_test.go:52 +0x421
  testing.tRunner()
      /usr/local/go/src/testing/testing.go:1193 +0x202

Goroutine 103 (running) created at:
  github.com/prometheus/alertmanager/cluster.(*Peer).Join()
      /go/src/github.com/grafana/mimir/vendor/github.com/prometheus/alertmanager/cluster/cluster.go:271 +0x459
  github.com/grafana/mimir/pkg/alertmanager.TestMultitenantAlertmanager_GetStatusHandler()
      /go/src/github.com/grafana/mimir/pkg/alertmanager/alertmanager_http_test.go:55 +0x4e6
  testing.tRunner()
      /usr/local/go/src/testing/testing.go:1193 +0x202
==================
--- FAIL: TestAlertmanager_ReplicasPosition (113.26s)
    testing.go:1092: race detected during execution of test
FAIL
FAIL    github.com/grafana/mimir/pkg/alertmanager       245.909s
krajorama commented 3 years ago

I'm new here, but let me take a look. Where can I see the test results, or did you see this on your machine? How frequently does it happen? Can I run "make test" multiple times in parallel to speed up reproduction? @bboreham

pracucci commented 3 years ago

Can I run "make test" multiple times in parallel to speed up reproduction?

Run the specific go test with -count=100 -race.
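
For example, from the repository root (the test name and package path are taken from the failure output above, and -tags netgo mirrors the Makefile's go test invocation):

go test -tags netgo -race -count=100 -run TestAlertmanager_ReplicasPosition ./pkg/alertmanager/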

krajorama commented 3 years ago

Investigation so far into the race condition:

The func (m *Memberlist) Members() []*Node function returns a list of pointers to the member Nodes, and the Prometheus alertmanager/cluster.go uses data from the returned list. A comment above this function says that callers must not write through the pointers, since the data is protected by the Memberlist mutex. The issue is that there is at least one situation in which even reading becomes inconsistent: if a node rejoins a memberlist with the same Name but a different Address or Port, after being dead or after having left, then its entry may be reclaimed and its new data is written through the same pointer.

I don't know how serious this issue is. To be triggered, the memberlist has to be configured with DeadNodeReclaimTime > 0, and a node with a different address needs to leave (or be marked dead) and then rejoin.

Strictly speaking, the Members function is theoretically broken: it should return a copy of the data, not pointers to data that is only protected by the Memberlist mutex.

Proof: the below test yields this output (even in the latest version):

Node address = 127.0.0.11 *Node = 0xc00007a180
Node address = 127.0.0.11 *Node = 0xc00007a180
127.0.0.112021/11/11 14:29:31 [ERR] memberlist: Failed to send ping: write udp 127.0.0.11:37259->127.0.0.10:37259: use of closed network connection
127.0.0.112021/11/11 14:29:31 [ERR] memberlist: Failed to send gossip to 127.0.0.10:37259: write udp 127.0.0.11:37259->127.0.0.10:37259: use of closed network connection
127.0.0.102021/11/11 14:29:31 [DEBUG] memberlist: Stream connection from=127.0.0.1:49450
127.0.0.122021/11/11 14:29:31 [DEBUG] memberlist: Initiating push/pull sync with: 127.0.0.10 127.0.0.10:37259
127.0.0.102021/11/11 14:29:31 [DEBUG] memberlist: Stream connection from=127.0.0.1:49452
127.0.0.102021/11/11 14:29:31 [INFO] memberlist: Updating address for left or failed node 127.0.0.11 from 127.0.0.11:37259 to 127.0.0.12:37259
Node address = 127.0.0.12 *Node = 0xc00007a180
Node address = 127.0.0.12 *Node = 0xc00007a180
Test code:

func TestMemberlist_JoinDataRace(t *testing.T) {
    c1 := testConfig(t)
    c1.DeadNodeReclaimTime = 1 * time.Nanosecond
    m1, err := Create(c1)
    require.NoError(t, err)
    defer m1.Shutdown()

    bindPort := m1.config.BindPort

    // Create a second node
    c2 := testConfig(t)
    c2.BindPort = bindPort

    m2, err := Create(c2)
    require.NoError(t, err)
    //defer m2.Shutdown()

    fmt.Println("KRAJO1: ", c1.Name)
    fmt.Println("KRAJO2: ", c2.Name)

    shutdownChannel := make(chan int)
    // lister keeps reading through the shared *Node pointers returned by m1.Members().
    lister := func() int {
        for {
            select {
            case x := <-shutdownChannel:
                return x * 0
            default:
                members := m1.Members()
                for _, v := range members {
                    if v.Name != "127.0.0.10" {
                        fmt.Printf("Node address = %v *Node = %p\n", v.Addr, v)
                    }
                }
            }
        }
    }
    shutdownLister := func() {
        shutdownChannel <- 0
    }
    defer shutdownLister()
    go lister()

    num, err := m2.Join([]string{m1.config.Name + "/" + m1.config.BindAddr})
    if num != 1 {
        t.Fatalf("unexpected 1: %d", num)
    }
    if err != nil {
        t.Fatalf("unexpected err: %s", err)
    }

    // Check the hosts
    if len(m2.Members()) != 2 {
        t.Fatalf("should have 2 nodes! %v", m2.Members())
    }
    if m2.estNumNodes() != 2 {
        t.Fatalf("should have 2 nodes! %v", m2.Members())
    }
    yield()
    <-time.After(1 * time.Second)
    m2.Leave(1 * time.Second)
    m2.Shutdown()
    // Re-create the second node with a different address
    c3 := testConfig(t)
    c3.Name = c2.Name  // force reclaim
    c3.BindPort = bindPort

    m3, err := Create(c3)
    require.NoError(t, err)
    defer m3.Shutdown()

    numB, errB := m3.Join([]string{m1.config.Name + "/" + m1.config.BindAddr})
    if numB != 1 {
        t.Fatalf("unexpected 1: %d", numB)
    }
    if errB != nil {
        t.Fatalf("unexpected err: %s", errB)
    }

    <-time.After(5 * time.Second)
}
krajorama commented 3 years ago

As far as I can tell, the worst that can happen is that the information the Prometheus Alertmanager cluster.go is using will be newer than expected. Even if the data is garbled, it's not used directly. https://github.com/prometheus/alertmanager/blob/fad796931b792fc30f35bb18a580ae7323ef0241/cluster/cluster.go#L448
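
Roughly, the read side follows this pattern (a paraphrased sketch, not the upstream source; it assumes the hashicorp/memberlist import): the address read through the shared *Node is only compared against the known peer addresses, and a mismatch merely triggers another Join attempt.

func refreshSketch(ml *memberlist.Memberlist, knownPeers []string) {
    // Members returns shared *Node pointers; memberlist may rewrite Addr/Port
    // in place when a dead or left node is reclaimed under a new address.
    members := ml.Members()
    for _, peer := range knownPeers {
        found := false
        for _, m := range members {
            if m.Address() == peer { // the read flagged by the race detector
                found = true
                break
            }
        }
        if !found {
            // Worst case under the race: a redundant join attempt, not a crash.
            ml.Join([]string{peer})
        }
    }
}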

krajorama commented 3 years ago

I did reproduce the race, but with a slightly more direct method:

WARNING: DATA RACE
Write at 0x00c0000a07a8 by goroutine 48:
  github.com/hashicorp/memberlist.(*Memberlist).aliveNode()
      /home/krajo/go/github.com/hashicorp/memberlist/state.go:1114 +0x2666
  github.com/hashicorp/memberlist.(*Memberlist).mergeState()
      /home/krajo/go/github.com/hashicorp/memberlist/state.go:1305 +0x284
  github.com/hashicorp/memberlist.(*Memberlist).mergeRemoteState()
      /home/krajo/go/github.com/hashicorp/memberlist/net.go:1174 +0x194
  github.com/hashicorp/memberlist.(*Memberlist).handleConn()
      /home/krajo/go/github.com/hashicorp/memberlist/net.go:277 +0x94a
  github.com/hashicorp/memberlist.(*Memberlist).streamListen·dwrap·19()
      /home/krajo/go/github.com/hashicorp/memberlist/net.go:213 +0x58

Previous read at 0x00c0000a07a8 by goroutine 30:
  github.com/hashicorp/memberlist.(*Node).Address()
      /home/krajo/go/github.com/hashicorp/memberlist/state.go:43 +0x6a
  github.com/hashicorp/memberlist.TestMemberlist_JoinDataRace.func1()
      /home/krajo/go/github.com/hashicorp/memberlist/memberlist_test.go:2122 +0xb5
  github.com/hashicorp/memberlist.TestMemberlist_JoinDataRace·dwrap·123()
      /home/krajo/go/github.com/hashicorp/memberlist/memberlist_test.go:2133 +0x39

Goroutine 48 (running) created at:
  github.com/hashicorp/memberlist.(*Memberlist).streamListen()
      /home/krajo/go/github.com/hashicorp/memberlist/net.go:213 +0x44
  github.com/hashicorp/memberlist.newMemberlist·dwrap·4()
      /home/krajo/go/github.com/hashicorp/memberlist/memberlist.go:216 +0x39

Goroutine 30 (running) created at:
  github.com/hashicorp/memberlist.TestMemberlist_JoinDataRace()
      /home/krajo/go/github.com/hashicorp/memberlist/memberlist_test.go:2133 +0x68b
  testing.tRunner()
      /usr/local/go/src/testing/testing.go:1259 +0x22f
  testing.(*T).Run·dwrap·21()
      /usr/local/go/src/testing/testing.go:1306 +0x47
==================
--- FAIL: TestMemberlist_JoinDataRace (1.26s)
    testing.go:1152: race detected during execution of test
FAIL

Test code:

func TestMemberlist_JoinDataRace(t *testing.T) {
    c1 := testConfig(t)
    c1.DeadNodeReclaimTime = 1 * time.Nanosecond
    m1, err := Create(c1)
    require.NoError(t, err)
    defer m1.Shutdown()

    bindPort := m1.config.BindPort

    // Create a second node
    c2 := testConfig(t)
    c2.BindPort = bindPort

    m2, err := Create(c2)
    require.NoError(t, err)
    //defer m2.Shutdown()

    num, err := m2.Join([]string{m1.config.Name + "/" + m1.config.BindAddr})
    if num != 1 {
        t.Fatalf("unexpected 1: %d", num)
    }
    if err != nil {
        t.Fatalf("unexpected err: %s", err)
    }

    // Check the hosts
    if len(m2.Members()) != 2 {
        t.Fatalf("should have 2 nodes! %v", m2.Members())
    }
    if m2.estNumNodes() != 2 {
        t.Fatalf("should have 2 nodes! %v", m2.Members())
    }
    yield()
    <-time.After(1 * time.Second)

    shutdownChannel := make(chan int)
    // Take the shared *Node slice once, before the second node leaves and is reclaimed.
    members := m1.Members()
    lister := func() int {
        for {
            select {
            case x := <-shutdownChannel:
                return x * 0
            default:
                // members := m1.Members()
                for _, v := range members {
                    yield()
                    if v.Address() == "" {
                        panic("cannot be")
                    }
                }
            }
        }
    }
    shutdownLister := func() {
        shutdownChannel <- 0
    }
    defer shutdownLister()
    go lister()

    m2.Leave(1 * time.Second)
    m2.Shutdown()
    // Re-create the second node with a different address
    c3 := testConfig(t)
    c3.Name = c2.Name  // force reclaim
    c3.BindPort = bindPort

    m3, err := Create(c3)
    require.NoError(t, err)
    defer m3.Shutdown()

    numB, errB := m3.Join([]string{m1.config.Name + "/" + m1.config.BindAddr})
    if numB != 1 {
        t.Fatalf("unexpected 1: %d", numB)
    }
    if errB != nil {
        t.Fatalf("unexpected err: %s", errB)
    }

    <-time.After(5 * time.Second)
}
krajorama commented 3 years ago

The above race condition can be circumvented in our case with something like:

// MemberAddresses returns a snapshot of the addresses of all live members,
// copied while the node lock is held, so callers never touch shared *Node values.
func (m *Memberlist) MemberAddresses() []string {
    m.nodeLock.RLock()
    defer m.nodeLock.RUnlock()

    addresses := make([]string, 0, len(m.nodes))
    for _, n := range m.nodes {
        if !n.DeadOrLeft() {
            addresses = append(addresses, n.Node.Address())
        }
    }

    return addresses
}
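
A hypothetical call site (illustrative only, written as if it lived in the same package; peerPresent is not code from either repository) would then iterate over plain string copies, so nothing reads through a shared *Node after the lock is released:

func peerPresent(m *Memberlist, peerAddr string) bool {
    for _, addr := range m.MemberAddresses() {
        if addr == peerAddr {
            return true
        }
    }
    return false
}

The design choice is simply to copy the data while m.nodeLock is held instead of handing out pointers into the node table.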
krajorama commented 2 years ago

Opened https://github.com/hashicorp/memberlist/issues/250 to get some feedback on this issue.