Closed agardiman closed 1 year ago
From the logs, the panic looks happening here:
[signal SIGSEGV: segmentation violation code=0x1 addr=0x7fcdfdb9ec0a pc=0x4024b5]
goroutine 38915 [running]:
runtime.throw({0x1e04d0c, 0x4c})
/usr/local/go/src/runtime/panic.go:1198 +0x71 fp=0xc011ff4f80 sp=0xc011ff4f50 pc=0x437131
runtime.sigpanic()
/usr/local/go/src/runtime/signal_unix.go:742 +0x2f6 fp=0xc011ff4fd0 sp=0xc011ff4f80 pc=0x44d676
cmpbody()
/usr/local/go/src/internal/bytealg/compare_amd64.s:230 +0x1d5 fp=0xc011ff4fd8 sp=0xc011ff4fd0 pc=0x4024b5
strings.Compare(...)
/usr/local/go/src/strings/compare.go:24
github.com/thanos-io/thanos/pkg/strutil.mergeTwoStringSlices({0xc0519ba000, 0x9ff81, 0xa07f3}, {0xc0667a4000, 0x1df208, 0x209400})
/__w/mimir/mimir/vendor/github.com/thanos-io/thanos/pkg/strutil/merge.go:43 +0x1ab fp=0xc011ff50a8 sp=0xc011ff4fd8 pc=0x17af12b
github.com/thanos-io/thanos/pkg/strutil.MergeSlices({0xc00b760000, 0xc0224f19b0, 0x10})
/__w/mimir/mimir/vendor/github.com/thanos-io/thanos/pkg/strutil/merge.go:21 +0xd8 fp=0xc011ff5100 sp=0xc011ff50a8 pc=0x17aeef8
github.com/grafana/mimir/pkg/storegateway.(*BucketStore).LabelValues(0xc0002ca700, {0x23dea48, 0xc01ac2b980}, 0xc02399d8b0)
/__w/mimir/mimir/pkg/storegateway/bucket.go:1379 +0xa3b fp=0xc011ff52b8 sp=0xc011ff5100 pc=0x17c3f3b
github.com/grafana/mimir/pkg/storegateway.(*BucketStores).LabelValues(0xc001926800, {0x23dea48, 0xc01ac2b980}, 0x1a72780)
/__w/mimir/mimir/pkg/storegateway/bucket_stores.go:351 +0x145 fp=0xc011ff5368 sp=0xc011ff52b8 pc=0x17d5bc5
github.com/grafana/mimir/pkg/storegateway.(*StoreGateway).LabelValues.func2()
/__w/mimir/mimir/pkg/storegateway/gateway.go:344 +0x30 fp=0xc011ff5398 sp=0xc011ff5368 pc=0x17dbe10
github.com/grafana/mimir/pkg/storegateway/threadpool.(*Threadpool).Execute(0xc000384690, 0xc00010f4f8)
/__w/mimir/mimir/pkg/storegateway/threadpool/threadpool.go:115 +0x27d fp=0xc011ff54a0 sp=0xc011ff5398 pc=0x17b1d9d
github.com/grafana/mimir/pkg/storegateway.(*StoreGateway).LabelValues(0xc000422000, {0x23dea48, 0xc01ac2b980}, 0xc02399d8b0)
/__w/mimir/mimir/pkg/storegateway/gateway.go:343 +0x188 fp=0xc011ff5530 sp=0xc011ff54a0 pc=0x17dbce8
github.com/grafana/mimir/pkg/storegateway/storegatewaypb._StoreGateway_LabelValues_Handler.func1({0x23dea48, 0xc01ac2b980}, {0x1d3b5a0, 0xc02399d8b0})
/__w/mimir/mimir/pkg/storegateway/storegatewaypb/gateway.pb.go:221 +0x78 fp=0xc011ff5570 sp=0xc011ff5530 pc=0x17b0258
github.com/grafana/mimir/pkg/mimir.ThanosTracerUnaryInterceptor({0x23dea48, 0xc01ac2b950}, {0x1d3b5a0, 0xc02399d8b0}, 0xc0002c1220, 0xc0224f1908)
/__w/mimir/mimir/pkg/mimir/tracing.go:19 +0x7a fp=0xc011ff55b0 sp=0xc011ff5570 pc=0x1934dfa
github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1({0x23dea48, 0xc01ac2b950}, {0x1d3b5a0, 0xc02399d8b0})
/__w/mimir/mimir/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:25 +0x3a fp=0xc011ff55f0 sp=0xc011ff55b0 pc=0x103bc5a
github.com/weaveworks/common/middleware.ServerUserHeaderInterceptor({0x23dea48, 0xc01ac2b8f0}, {0x1d3b5a0, 0xc02399d8b0}, 0x40d234, 0xc01d3a8c00)
/__w/mimir/mimir/vendor/github.com/weaveworks/common/middleware/grpc_auth.go:38 +0x65 fp=0xc011ff5620 sp=0xc011ff55f0 pc=0x1081145
github.com/grafana/mimir/pkg/util/noauth.SetupAuthMiddleware.func1({0x23dea48, 0xc01ac2b8f0}, {0x1d3b5a0, 0xc02399d8b0}, 0xc01d3a8be0, 0xc01d3a8c00)
/__w/mimir/mimir/pkg/util/noauth/no_auth.go:32 +0xa7 fp=0xc011ff5660 sp=0xc011ff5620 pc=0x1924407
github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1({0x23dea48, 0xc01ac2b8f0}, {0x1d3b5a0, 0xc02399d8b0})
/__w/mimir/mimir/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:25 +0x3a fp=0xc011ff56a0 sp=0xc011ff5660 pc=0x103bc5a
github.com/weaveworks/common/middleware.UnaryServerInstrumentInterceptor.func1({0x23dea48, 0xc01ac2b8f0}, {0x1d3b5a0, 0xc02399d8b0}, 0xc01d3a8be0, 0xc01d3a8c20)
/__w/mimir/mimir/vendor/github.com/weaveworks/common/middleware/grpc_instrumentation.go:33 +0xa2 fp=0xc011ff5730 sp=0xc011ff56a0 pc=0x1081662
github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1({0x23dea48, 0xc01ac2b8f0}, {0x1d3b5a0, 0xc02399d8b0})
/__w/mimir/mimir/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:25 +0x3a fp=0xc011ff5770 sp=0xc011ff5730 pc=0x103bc5a
github.com/opentracing-contrib/go-grpc.OpenTracingServerInterceptor.func1({0x23dea48, 0xc01ac2b860}, {0x1d3b5a0, 0xc02399d8b0}, 0xc01d3a8be0, 0xc01d3a8c40)
/__w/mimir/mimir/vendor/github.com/opentracing-contrib/go-grpc/server.go:57 +0x40f fp=0xc011ff59a0 sp=0xc011ff5770 pc=0x103f08f
github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1({0x23dea48, 0xc01ac2b860}, {0x1d3b5a0, 0xc02399d8b0})
/__w/mimir/mimir/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:25 +0x3a fp=0xc011ff59e0 sp=0xc011ff59a0 pc=0x103bc5a
github.com/weaveworks/common/middleware.GRPCServerLog.UnaryServerInterceptor({{0x24156d8, 0xc000b5e280}, 0x20}, {0x23dea48, 0xc01ac2b860}, {0x1d3b5a0, 0xc02399d8b0}, 0xc01d3a8be0, 0xc01d3a8c60)
/__w/mimir/mimir/vendor/github.com/weaveworks/common/middleware/grpc_logging.go:29 +0xbe fp=0xc011ff5a98 sp=0xc011ff59e0 pc=0x10819fe
github.com/weaveworks/common/middleware.GRPCServerLog.UnaryServerInterceptor-fm({0x23dea48, 0xc01ac2b860}, {0x1d3b5a0, 0xc02399d8b0}, 0x18, 0xc01d3a8be0)
/__w/mimir/mimir/vendor/github.com/weaveworks/common/middleware/grpc_logging.go:27 +0x68 fp=0xc011ff5af0 sp=0xc011ff5a98 pc=0x108df08
github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1({0x23dea48, 0xc01ac2b860}, {0x1d3b5a0, 0xc02399d8b0})
/__w/mimir/mimir/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:25 +0x3a fp=0xc011ff5b30 sp=0xc011ff5af0 pc=0x103bc5a
github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1({0x23dea48, 0xc01ac2b860}, {0x1d3b5a0, 0xc02399d8b0}, 0xc019ad9bd0, 0x1b2eec0)
/__w/mimir/mimir/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:34 +0xbf fp=0xc011ff5b88 sp=0xc011ff5b30 pc=0x103baff
github.com/grafana/mimir/pkg/storegateway/storegatewaypb._StoreGateway_LabelValues_Handler({0x1d6a4a0, 0xc000422000}, {0x23dea48, 0xc01ac2b860}, 0xc01dbdeba0, 0xc00033e3f0)
/__w/mimir/mimir/pkg/storegateway/storegatewaypb/gateway.pb.go:223 +0x138 fp=0xc011ff5be0 sp=0xc011ff5b88 pc=0x17b0118
google.golang.org/grpc.(*Server).processUnaryRPC(0xc0000ef880, {0x240bad0, 0xc00f0c7ba0}, 0xc0013d8480, 0xc00036ddd0, 0x33342f8, 0x0)
/__w/mimir/mimir/vendor/google.golang.org/grpc/server.go:1282 +0xccf fp=0xc011ff5e48 sp=0xc011ff5be0 pc=0xa2ae8f
google.golang.org/grpc.(*Server).handleStream(0xc0000ef880, {0x240bad0, 0xc00f0c7ba0}, 0xc0013d8480, 0x0)
/__w/mimir/mimir/vendor/google.golang.org/grpc/server.go:1619 +0xa2a fp=0xc011ff5f68 sp=0xc011ff5e48 pc=0xa2eb4a
google.golang.org/grpc.(*Server).serveStreams.func1.2()
/__w/mimir/mimir/vendor/google.golang.org/grpc/server.go:921 +0x98 fp=0xc011ff5fe0 sp=0xc011ff5f68 pc=0xa289b8
runtime.goexit()
/usr/local/go/src/runtime/asm_amd64.s:1581 +0x1 fp=0xc011ff5fe8 sp=0xc011ff5fe0 pc=0x46b241
created by google.golang.org/grpc.(*Server).serveStreams.func1
/__w/mimir/mimir/vendor/google.golang.org/grpc/server.go:919 +0x294
Mimir version: 2.2.0
Could you upgrade to 2.4.0 and let us know if this happens there too?
Yeah sure! I'll upgrade and update here if this happens again with the new version.
Mimir version: 2.2.0
Could you upgrade to 2.4.0 and let us know if this happens there too?
According to https://github.com/grafana/mimir/issues/3759, it's happening in Mimir 2.5.0 too. The stack trace from #3759 is:
unexpected fault address 0x7f49341ae5fc
fatal error: fault
[signal SIGSEGV: segmentation violation code=0x1 addr=0x7f49341ae5fc pc=0x4032f1]
goroutine 25642886 [running]:
runtime.throw({0x1faba5c?, 0xc0500d6bb0?})
/usr/local/go/src/runtime/panic.go:1047 +0x5d fp=0xc0bbb9ec98 sp=0xc0bbb9ec68 pc=0x43a07d
runtime.sigpanic()
/usr/local/go/src/runtime/signal_unix.go:842 +0x2c5 fp=0xc0bbb9ece8 sp=0xc0bbb9ec98 pc=0x450a25
memeqbody()
/usr/local/go/src/internal/bytealg/equal_amd64.s:108 +0xd1 fp=0xc0bbb9ecf0 sp=0xc0bbb9ece8 pc=0x4032f1
github.com/grafana/mimir/pkg/util.mergeTwoStringSlices({0xc003fa7400, 0xa, 0x40}, {0xc0500d6580, 0xa, 0xb})
/__w/mimir/mimir/pkg/util/merger.go:74 +0x155 fp=0xc0bbb9eda0 sp=0xc0bbb9ecf0 pc=0x167da75
github.com/grafana/mimir/pkg/util.MergeSlices({0xc0cbfa9570, 0x606?, 0x9c6})
/__w/mimir/mimir/pkg/util/merger.go:63 +0xd8 fp=0xc0bbb9edf8 sp=0xc0bbb9eda0 pc=0x167d898
...
github.com/grafana/mimir/pkg/storegateway.(*BucketStore).LabelValues(0xc000336600, {0x25f3bd0, 0xc05d9ac4e0}, 0xc0a797bd40)
/__w/mimir/mimir/pkg/storegateway/bucket.go:1288 +0xa3b fp=0xc0bbb9f320 sp=0xc0bbb9f168 pc=0x18e9e1b
...
I have a theory (just a theory so far). The label values returned by BinaryReader.LabelValues()
are yolo strings, which means they directly reference the underlying mmap area and they're not a copy of it:
https://github.com/grafana/mimir/blob/main/pkg/storegateway/indexheader/binary_reader.go#L887
If the mmap area becomes invalid (because the mmap file descriptor is closed) after the call to BinaryReader.LabelValues()
but before the call to BucketStore.Labels()
ends, then we could access an invalid memory area when we merge the label values.
If the mmap area becomes invalid
How could this happen? It shouldn't be caused by the lazy index-header loader hitting the idle timeout, given we just accessed it. However, what if the block is removed from the store-gateway at the same time we call LabelValues()
? We have a protection for that based on tracking the pending reader, but I think we release it too early here:
https://github.com/grafana/mimir/blob/main/pkg/storegateway/bucket.go#L1426
I was thinking to change this: https://github.com/grafana/mimir/blob/f85191002cede7fa5c0da91fb447b676465bbee5/pkg/storegateway/bucket.go#L1423-L1431
With something like this, but it's still not safe (see the TODO comment):
indexr := b.indexReader()
// Close the index reader only once the function returned, because the label values may
// reference the index-header mmap-ed area.
// TODO This is still not enough, because the memory area is still referenced in the
// storepb.LabelValuesResponse{} returned in this response.
defer runutil.CloseWithLogOnErr(s.logger, indexr, "label values")
g.Go(func() error {
result, err := blockLabelValues(gctx, indexr, req.Label, reqSeriesMatchers, s.logger, newSafeQueryStats())
if err != nil {
return errors.Wrapf(err, "block %s", b.meta.ULID)
}
An alternative is to copy the label strings instead of using a yoloString()
here. We should run a benchmark to see the impact on performance, as it could have an impact on Series()
API calls too:
https://github.com/grafana/mimir/blob/main/pkg/storegateway/indexheader/binary_reader.go#L887
This issue shouldn't occur in the new experimental implementation of the index-header reader based on file reading syscalls instead of mmap.
Since the mmap reader is no longer in Mimir as of version 2.7.1 (#4280), can we close this issue?
Describe the bug
Some store-gateway recently crashed because of a segmentation fault.
To Reproduce
Few instances crashed but they stopped. I don't know what event made them fail.
Expected behavior
To not crash for segmentation fault.
Environment
Additional Context
store-gateway pod
Config (only values that differ from default)
Runtime config (diff from default)
store-gateway-47.log