m3db / m3

M3 monorepo - Distributed TSDB, Aggregator and Query Engine, Prometheus Sidecar, Graphite Compatible, Metrics Platform
https://m3db.io/
Apache License 2.0
4.75k stars 453 forks source link

Writing data leads to panic #4080

Open ruiz-code opened 2 years ago

ruiz-code commented 2 years ago

I deployed a cluster with the following node configs:

m3_1.4.2_linux_amd64.tar.gz

node1

"coordinator":
  "listenAddress": "0.0.0.0:7201"
  "local":
    "namespaces":
    - "namespace": "default"
      "retention": "48h"
      "type": "unaggregated"
  "logging":
    "level": "info"
  "metrics":
    "extended": "none"
    "prometheus":
      "handlerPath": "/metrics"
      "listenAddress": "0.0.0.0:7203"
    "samplingRate": 1
    "sanitization": "prometheus"
    "scope":
      "prefix": "coordinator"
  "tagOptions":
    "idScheme": "quoted"
"db":
  "cache":
    "postingsList":
      "size": 262144
    "series":
      "policy": "lru"
  "client":
    "readConsistencyLevel": "one"
    "writeConsistencyLevel": "one"
  "clusterListenAddress": "0.0.0.0:9001"
  "commitlog":
    "flushEvery": "1s"
    "flushMaxBytes": 524288
    "queue":
      "calculationType": "fixed"
      "size": 2097152
  discovery:
    "config":
      "service":
        "cacheDir": "/var/lib/m3kv"
        "env": "default_env"
        "etcdClusters":
        - "endpoints":
          - "http://127.0.0.1:2379"
          "zone": "embedded"
        "service": "m3db"
        "zone": "embedded"
  "debugListenAddress": "0.0.0.0:9004"
  "filesystem":
    "filePathPrefix": "/var/lib/m3db"
  "gcPercentage": 100
  "httpClusterListenAddress": "0.0.0.0:9003"
  "httpNodeListenAddress": "0.0.0.0:9002"
  "listenAddress": "0.0.0.0:9000"
  "logging":
    "level": "info"
  "hostID":
    "resolver": "hostname"
  "metrics":
    "extended": "detailed"
    "prometheus":
      "handlerPath": "/metrics"
    "samplingRate": 1
    "sanitization": "prometheus"
  "writeNewSeriesAsync": true
  "writeNewSeriesBackoffDuration": "2ms"

node2

"coordinator":
  "listenAddress": "0.0.0.0:7201"
  "local":
    "namespaces":
    - "namespace": "default"
      "retention": "48h"
      "type": "unaggregated"
  "logging":
    "level": "info"
  "metrics":
    "extended": "none"
    "prometheus":
      "handlerPath": "/metrics"
      "listenAddress": "0.0.0.0:7203"
    "samplingRate": 1
    "sanitization": "prometheus"
    "scope":
      "prefix": "coordinator"
  "tagOptions":
    "idScheme": "quoted"
"db":
  "cache":
    "postingsList":
      "size": 262144
    "series":
      "policy": "lru"
  "client":
    "readConsistencyLevel": "one"
    "writeConsistencyLevel": "one"
  "clusterListenAddress": "0.0.0.0:9001"
  "commitlog":
    "flushEvery": "1s"
    "flushMaxBytes": 524288
    "queue":
      "calculationType": "fixed"
      "size": 2097152
  discovery:
    "config":
      "service":
        "cacheDir": "/var/lib/m3kv"
        "env": "default_env"
        "etcdClusters":
        - "endpoints":
          - "http://127.0.0.1:2379"
          "zone": "embedded"
        "service": "m3db"
        "zone": "embedded"
  "debugListenAddress": "0.0.0.0:9004"
  "filesystem":
    "filePathPrefix": "/var/lib/m3db"
  "gcPercentage": 100
  "httpClusterListenAddress": "0.0.0.0:9003"
  "httpNodeListenAddress": "0.0.0.0:9002"
  "listenAddress": "0.0.0.0:9000"
  "hostID":
    "resolver": "hostname"
  "logging":
    "level": "info"
  "writeNewSeriesAsync": true
  "writeNewSeriesBackoffDuration": "20ms"

node3

"coordinator":
  "listenAddress": "0.0.0.0:7201"
  "local":
    "namespaces":
    - "namespace": "default"
      "retention": "48h"
      "type": "unaggregated"
  "logging":
    "level": "info"
  "metrics":
    "extended": "none"
    "prometheus":
      "handlerPath": "/metrics"
      "listenAddress": "0.0.0.0:7203"
    "samplingRate": 1
    "sanitization": "prometheus"
    "scope":
      "prefix": "coordinator"
  "tagOptions":
    "idScheme": "quoted"
"db":
  "cache":
    "postingsList":
      "size": 262144
    "series":
      "policy": "lru"
  "client":
    "readConsistencyLevel": "one"
    "writeConsistencyLevel": "one"
  "clusterListenAddress": "0.0.0.0:9001"
  "commitlog":
    "flushEvery": "1s"
    "flushMaxBytes": 524288
    "queue":
      "calculationType": "fixed"
      "size": 2097152
  discovery:
    "config":
      "service":
        "cacheDir": "/var/lib/m3kv"
        "env": "default_env"
        "etcdClusters":
        - "endpoints":
          - "http://127.0.0.1:2379"
          "zone": "embedded"
        "service": "m3db"
        "zone": "embedded"
  "debugListenAddress": "0.0.0.0:9004"
  "filesystem":
    "filePathPrefix": "/var/lib/m3db"
  "gcPercentage": 100
  "httpClusterListenAddress": "0.0.0.0:9003"
  "httpNodeListenAddress": "0.0.0.0:9002"
  "listenAddress": "0.0.0.0:9000"
  "hostID":
    "resolver": "hostname"
  "logging":
    "level": "info"
  "writeNewSeriesAsync": true
  "writeNewSeriesBackoffDuration": "20ms"

start node cmd:

M3DB_HOST_ID=node1 ./m3dbnode -f config.yml
M3DB_HOST_ID=node2 ./m3dbnode -f config.yml
M3DB_HOST_ID=node3 ./m3dbnode -f config.yml

create database

curl -X POST http://localhost:7201/api/v1/database/create -d '{
  "type": "cluster",
  "namespaceName": "default",
  "retentionTime": "48h",
  "numShards": "6",
  "replicationFactor": "2",
  "hosts": [
        {
            "id": "node1",
            "isolationGroup": "test1",
            "zone": "embedded",
            "weight": 100,
            "address": "10.10.10.1",
            "port": 9000
        },
        {
            "id": "node1",
            "isolationGroup": "test2",
            "zone": "embedded",
            "weight": 100,
            "address": "10.10.10.2",
            "port": 9000
        },
        {
            "id": "node3",
            "isolationGroup": "test3",
            "zone": "embedded",
            "weight": 100,
            "address": "10.10.10.3",
            "port": 9000
        }
    ]
}'

initialize the namespace

curl -X POST http://localhost:7201/api/v1/services/m3db/namespace/ready -d '{
  "name": "default"
}'

When writing to the db:

curl -X POST http://localhost:7201/api/v1/json/write -d '{
  "tags":
    {
      "__name__": "third_avenue",
      "city": "new_york",
      "checkout": "1"
    },
    "timestamp": '\"$(date "+%s")\"',
    "value": 5347.26
}'

I get the panic below. Because there is too much output, I have only posted the start and the end.

SIGILL: illegal instruction
PC=0xdcdc60 m=5 sigcode=2
instruction bytes: 0xf 0x1 0xf9 0x48 0x81 0xe1 0xff 0x0 0x0 0x0 0x48 0x89 0x4c 0x24 0x8 0xc3

goroutine 6123 [running]:
github.com/m3db/m3/src/x/sync.getCore(0x36, 0xc0030dc160, 0x12df07d, 0x3, 0x0, 0x0, 0x1, 0x2389d60, 0xc000728900, 0x234e8c0, ...)
        /go/src/github.com/m3db/m3/src/x/sync/cpu_linux_amd64.s:9 fp=0xc0030dc0e8 sp=0xc0030dc0e0 pc=0xdcdc60
github.com/m3db/m3/src/x/sync.CPUCore(0x3)
        /go/src/github.com/m3db/m3/src/x/sync/index_cpu.go:49 +0x3f fp=0xc0030dc100 sp=0xc0030dc0e8 pc=0xdcb9df
github.com/m3db/m3/src/dbnode/storage.(*dbShardInsertQueue).Insert(0xc04e021080, 0xc05908f980, 0x0, 0x16d8bfdd4cbc4200, 0x40b4e3428f5c28f6, 0x2, 0x0, 0x0, 0x0, 0x0, ...)
        /go/src/github.com/m3db/m3/src/dbnode/storage/shard_insert_queue.go:293 +0x3d fp=0xc0030dc170 sp=0xc0030dc100 pc=0x12df07d
github.com/m3db/m3/src/dbnode/storage.(*dbShard).insertSeriesAsyncBatched(0xc000728900, 0x2396c50, 0xc002dd2c30, 0x0, 0xc058a16900, 0x39, 0x40, 0x0, 0x0, 0x0, ...)
        /go/src/github.com/m3db/m3/src/dbnode/storage/shard.go:1315 +0x178 fp=0xc0030dc3c0 sp=0xc0030dc170 pc=0x12d1e38
github.com/m3db/m3/src/dbnode/storage.(*dbShard).writeAndIndex(0xc000728900, 0x23a5b00, 0xc026269570, 0x2396c50, 0xc002dd2c30, 0x0, 0xc058a16900, 0x39, 0x40, 0x0, ...)
        /go/src/github.com/m3db/m3/src/dbnode/storage/shard.go:986 +0x258 fp=0xc0030dc950 sp=0xc0030dc3c0 pc=0x12cf818
github.com/m3db/m3/src/dbnode/storage.(*dbShard).WriteTagged(0xc000728900, 0x23a5b00, 0xc026269570, 0x2396c50, 0xc002dd2c30, 0x0, 0xc058a16900, 0x39, 0x40, 0x0, ...)
        /go/src/github.com/m3db/m3/src/dbnode/storage/shard.go:873 +0x19d fp=0xc0030dcd18 sp=0xc0030dc950 pc=0x12cf23d
github.com/m3db/m3/src/dbnode/storage.(*dbNamespace).WriteTagged(0xc0500e3500, 0x23a5b00, 0xc026269570, 0x2396c50, 0xc002dd2c30, 0x0, 0xc058a16900, 0x39, 0x40, 0x0, ...)
        /go/src/github.com/m3db/m3/src/dbnode/storage/namespace.go:768 +0x2e7 fp=0xc0030dd150 sp=0xc0030dcd18 pc=0x12ad8a7
github.com/m3db/m3/src/dbnode/storage.(*db).writeBatch(0xc000865dc0, 0x23a5b00, 0xc026269570, 0x2396c50, 0xc002dd2c00, 0x7f3f525efc38, 0xc00fa71140, 0x234e660, 0xc0139b62a0, 0xc058a16901, ...)
        /go/src/github.com/m3db/m3/src/dbnode/storage/database.go:932 +0x3cb fp=0xc0030dd7c8 sp=0xc0030dd150 pc=0x1279ccb
github.com/m3db/m3/src/dbnode/storage.(*db).WriteTaggedBatch(0xc000865dc0, 0x23a5b00, 0xc026269570, 0x2396c50, 0xc002dd2c00, 0x7f3f525efc38, 0xc00fa71140, 0x234e660, 0xc0139b62a0, 0x0, ...)
        /go/src/github.com/m3db/m3/src/dbnode/storage/database.go:890 +0x99 fp=0xc0030dd838 sp=0xc0030dd7c8 pc=0x12798b9
github.com/m3db/m3/src/dbnode/storage/cluster.(*clusterDB).WriteTaggedBatch(0xc04ff16d20, 0x23a5b00, 0xc026269570, 0x2396c50, 0xc002dd2c00, 0x7f3f525efc38, 0xc00fa71140, 0x234e660, 0xc0139b62a0, 0xc033d50002, ...)
        <autogenerated>:1 +0xa9 fp=0xc0030dd8a0 sp=0xc0030dd838 pc=0x1913709
github.com/m3db/m3/src/dbnode/network/server/tchannelthrift/node.(*service).WriteTaggedBatchRaw(0xc0075c4a00, 0x7f3f517a9998, 0xc058cdb930, 0xc058cfb9b0, 0x0, 0x0)
        /go/src/github.com/m3db/m3/src/dbnode/network/server/tchannelthrift/node/service.go:2031 +0x7fb fp=0xc0030dda18 sp=0xc0030dd8a0 pc=0x18bc83b
github.com/m3db/m3/src/dbnode/generated/thrift/rpc.(*tchanNodeServer).handleWriteTaggedBatchRaw(0xc013a2e7a0, 0x7f3f517a9998, 0xc058cdb930, 0x23b3900, 0xc03b910090, 0xc03d315ae0, 0x48b7f4, 0x30dd620, 0xc03b1d2400, 0x0)
        /go/src/github.com/m3db/m3/src/dbnode/generated/thrift/rpc/tchan-rpc.go:2015 +0xc7 fp=0xc0030dda70 sp=0xc0030dda18 pc=0xfe6ec7
github.com/m3db/m3/src/dbnode/generated/thrift/rpc.(*tchanNodeServer).Handle(0xc013a2e7a0, 0x7f3f517a9998, 0xc058cdb930, 0xc058c7c5a6, 0x13, 0x23b3900, 0xc03b910090, 0xc058cfb920, 0xc03d315b88, 0x418df3, ...)
        /go/src/github.com/m3db/m3/src/dbnode/generated/thrift/rpc/tchan-rpc.go:1162 +0x112a fp=0xc0030ddaf0 sp=0xc0030dda70 pc=0xfe1d6a
github.com/uber/tchannel-go/thrift.(*Server).handle(0xc013a20690, 0x238b960, 0xc058cdb8a0, 0x237a870, 0xc013a2e7a0, 

...

goroutine 6073 [runnable]:
internal/poll.runtime_pollWait(0x7f3f50502f78, 0x72, 0xffffffffffffffff)
        /usr/local/go/src/runtime/netpoll.go:222 +0x55
internal/poll.(*pollDesc).wait(0xc0586f5398, 0x72, 0x0, 0x10, 0xffffffffffffffff)
        /usr/local/go/src/internal/poll/fd_poll_runtime.go:87 +0x45
internal/poll.(*pollDesc).waitRead(...)
        /usr/local/go/src/internal/poll/fd_poll_runtime.go:92
internal/poll.(*FD).Read(0xc0586f5380, 0xc0557e87f0, 0x10, 0x10, 0x0, 0x0, 0x0)
        /usr/local/go/src/internal/poll/fd_unix.go:166 +0x1d5
net.(*netFD).Read(0xc0586f5380, 0xc0557e87f0, 0x10, 0x10, 0xc05914b7f0, 0xc058a35140, 0xc0590ad4a0)
        /usr/local/go/src/net/fd_posix.go:55 +0x4f
net.(*conn).Read(0xc054f08a58, 0xc0557e87f0, 0x10, 0x10, 0x0, 0x0, 0x0)
        /usr/local/go/src/net/net.go:183 +0x91
io.ReadAtLeast(0x2352400, 0xc054f08a58, 0xc0557e87f0, 0x10, 0x10, 0x10, 0x0, 0x0, 0x0)
        /usr/local/go/src/io/io.go:328 +0x87
io.ReadFull(...)
        /usr/local/go/src/io/io.go:347
github.com/uber/tchannel-go.(*Connection).readFrames(0xc03d1d9340, 0xc00000009a)
        /go/src/github.com/m3db/m3/vendor/github.com/uber/tchannel-go/connection.go:660 +0xd9
created by github.com/uber/tchannel-go.(*Channel).newConnection
        /go/src/github.com/m3db/m3/vendor/github.com/uber/tchannel-go/connection.go:374 +0xf7c

goroutine 5226 [runnable]:
internal/poll.runtime_pollWait(0x7f3f4fd56dd0, 0x72, 0xffffffffffffffff)
        /usr/local/go/src/runtime/netpoll.go:222 +0x55
internal/poll.(*pollDesc).wait(0xc0586f5718, 0x72, 0x0, 0x10, 0xffffffffffffffff)
        /usr/local/go/src/internal/poll/fd_poll_runtime.go:87 +0x45
internal/poll.(*pollDesc).waitRead(...)
        /usr/local/go/src/internal/poll/fd_poll_runtime.go:92
internal/poll.(*FD).Read(0xc0586f5700, 0xc055ab00f0, 0x10, 0x10, 0x0, 0x0, 0x0)
        /usr/local/go/src/internal/poll/fd_unix.go:166 +0x1d5
net.(*netFD).Read(0xc0586f5700, 0xc055ab00f0, 0x10, 0x10, 0xc05914b820, 0xc058a351a0, 0xc0590ad550)
        /usr/local/go/src/net/fd_posix.go:55 +0x4f
net.(*conn).Read(0xc054f08aa0, 0xc055ab00f0, 0x10, 0x10, 0x0, 0x0, 0x0)
        /usr/local/go/src/net/net.go:183 +0x91
io.ReadAtLeast(0x2352400, 0xc054f08aa0, 0xc055ab00f0, 0x10, 0x10, 0x10, 0x0, 0x0, 0x0)
        /usr/local/go/src/io/io.go:328 +0x87
io.ReadFull(...)
        /usr/local/go/src/io/io.go:347
github.com/uber/tchannel-go.(*Connection).readFrames(0xc03e08bb80, 0xc00000009d)
        /go/src/github.com/m3db/m3/vendor/github.com/uber/tchannel-go/connection.go:660 +0xd9
created by github.com/uber/tchannel-go.(*Channel).newConnection
        /go/src/github.com/m3db/m3/vendor/github.com/uber/tchannel-go/connection.go:374 +0xf7c

goroutine 6086 [select]:
github.com/uber/tchannel-go.(*Connection).writeFrames(0xc035af7b80, 0xc000000098)
        /go/src/github.com/m3db/m3/vendor/github.com/uber/tchannel-go/connection.go:737 +0xa5
created by github.com/uber/tchannel-go.(*Channel).newConnection
        /go/src/github.com/m3db/m3/vendor/github.com/uber/tchannel-go/connection.go:375 +0xfa9

goroutine 6100 [runnable]:
internal/poll.runtime_pollWait(0x7f3f4fd56778, 0x72, 0xffffffffffffffff)
        /usr/local/go/src/runtime/netpoll.go:222 +0x55
internal/poll.(*pollDesc).wait(0xc0586f5a98, 0x72, 0x0, 0x10, 0xffffffffffffffff)
        /usr/local/go/src/internal/poll/fd_poll_runtime.go:87 +0x45
internal/poll.(*pollDesc).waitRead(...)
        /usr/local/go/src/internal/poll/fd_poll_runtime.go:92
internal/poll.(*FD).Read(0xc0586f5a80, 0xc055ab0110, 0x10, 0x10, 0x0, 0x0, 0x0)
        /usr/local/go/src/internal/poll/fd_unix.go:166 +0x1d5
net.(*netFD).Read(0xc0586f5a80, 0xc055ab0110, 0x10, 0x10, 0xc059380010, 0xc058eda5a0, 0xc03b9d8580)
        /usr/local/go/src/net/fd_posix.go:55 +0x4f
net.(*conn).Read(0xc054f08ad8, 0xc055ab0110, 0x10, 0x10, 0x0, 0x0, 0x0)
        /usr/local/go/src/net/net.go:183 +0x91
io.ReadAtLeast(0x2352400, 0xc054f08ad8, 0xc055ab0110, 0x10, 0x10, 0x10, 0x0, 0x0, 0x0)
        /usr/local/go/src/io/io.go:328 +0x87
io.ReadFull(...)
        /usr/loca
gs     0x0

I have already set the kernel parameters with sysctl:

sysctl -w vm.max_map_count=3000000
sysctl -w vm.swappiness=1
sysctl -w fs.file-max=3000000
sysctl -w fs.nr_open=3000000
ruiz-code commented 2 years ago

Is there anyone who could help me? Thank you so much.

ruiz-code commented 2 years ago

@robskillington help, please! Maybe the instruction set on my machine is the problem:

// func getCore() int
TEXT ·getCore(SB), NOSPLIT, $0
    // RDTSCP
    BYTE $0x0f; BYTE $0x01; BYTE $0xf9

    // Linux puts core ID in the bottom byte.
    ANDQ $0xff, CX
    MOVQ CX, ret+0(FP)
    RET
kendrickclark commented 2 years ago

I too have this issue

amritanshu-pandey commented 1 year ago

I am also facing this issue! Will add more details tomorrow.