panic on restart with new version

motyla commented 6 years ago

Just upgraded from 0.7.4_419_gef90f826-1 to 0.7.4_435_ge8334309-1 and got this:


Dec 18 13:45:56 metrictank[10397]: [Macaron] 2017-12-18 13:45:56: Completed /render/ 503 Service Unavailable in 122.248µs
Dec 18 13:45:59 metrictank[10397]: 2017/12/18 13:45:59 [DEBUG] memberlist: Initiating push/pull sync with: x.x.x.x:7946
Dec 18 13:46:00 metrictank[10397]: panic: aggmetric 1.a7542fb0979415e203015b4a0bf68d84 queried for chunk 0 out of 0 chunks
Dec 18 13:46:00 metrictank[10397]: goroutine 446 [running]:
Dec 18 13:46:00 metrictank[10397]: github.com/grafana/metrictank/mdata.(*AggMetric).getChunk(0xc704285ef0, 0x0, 0xc704285f10)
Dec 18 13:46:00 metrictank[10397]: /home/ubuntu/.go_workspace/src/github.com/grafana/metrictank/mdata/aggmetric.go:135 +0x1a9
Dec 18 13:46:00 metrictank[10397]: github.com/grafana/metrictank/mdata.(*AggMetric).GC(0xc704285ef0, 0x5a3772385a37b888, 0xc704154700)
Dec 18 13:46:00 metrictank[10397]: /home/ubuntu/.go_workspace/src/github.com/grafana/metrictank/mdata/aggmetric.go:567 +0x183
Dec 18 13:46:00 metrictank[10397]: github.com/grafana/metrictank/mdata.(*AggMetrics).GC(0xc420a0e240)
Dec 18 13:46:00 metrictank[10397]: /home/ubuntu/.go_workspace/src/github.com/grafana/metrictank/mdata/aggmetrics.go:65 +0x241
Dec 18 13:46:00 metrictank[10397]: created by github.com/grafana/metrictank/mdata.NewAggMetrics
Dec 18 13:46:00 metrictank[10397]: /home/ubuntu/.go_workspace/src/github.com/grafana/metrictank/mdata/aggmetrics.go:35 +0x17c

Dieterbe commented 6 years ago

Please share your metrictank.ini and storage*conf files. Do you get this straight after restart? Consistently?

motyla commented 6 years ago

Not straight after. It takes a while until it panics.

storage-aggregation.conf

[default]
pattern = .*
xFilesFactor = 0.1
aggregationMethod = avg,min,max

storage-schemas.conf

chunkspan = 2h
numchunks = 1

[kkk]
pattern = kkk
retentions = 1h:1y:24h:1,1d:3y:24h:1

[mmm]
pattern = ^mmm.
retentions = 5m:7d:12h:1,1h:1y:24h:1,1d:3y:24h:1

[vvv30m]
pattern = vvv_.*_30_min
retentions = 30m:30d:24h:1,1h:1y:24h:1,1d:3y:24h:1

[zzz]
pattern = ^zzz.
retentions = 1m:7d:2h:2,5m:30d:12h:1,1h:1y:24h:1,1d:3y:24h:1
reorderBuffer = 60

[my_queue]
pattern = ^application.production.from_db.technical_alerts.my_queue.per_code.
retentions = 15m:30d:24h:1,6h:1y:24h:1,1d:3y:24h:1

[application]
pattern = ^application.
retentions = 1m:1d:2h:1,5m:7d:12h:1,15m:30d:12h:1,6h:1y:24h:1,1d:3y:24h:1

[bbb]
pattern = ^aaa.production..*.ccc.
retentions = 1h:1y:24h:1,1d:3y:24h:1

[aaa]
pattern = ^aaa.
retentions = 1m:7d:2h:2,5m:30d:12h:1,1h:1y:24h:1,1d:3y:24h:1
reorderBuffer = 60

[metrictank]
pattern = ^metrictank.
retentions = 1s:35d:10min:2

[carbon]
pattern = ^service_is_carbon-relay-ng\.
retentions = 1s:35d:10min:2

[default]
pattern = .*
retentions = 60s:1d:2h:1,1h:3d:24h:1

metrictank.ini:

instance = mt

accounting-period = 5min

drop-first-chunk = false

chunk-max-stale = 1h

metric-max-stale = 6h

gc-interval = 15m

warm-up-period = 1h

cassandra-addrs = "cluster addresses"

cassandra-keyspace = metrictank

cassandra-consistency = one

cassandra-host-selection-policy = tokenaware,hostpool-epsilon-greedy

cassandra-timeout = 10000

cassandra-read-concurrency = 100

cassandra-write-concurrency = 10

cassandra-read-queue-size = 200000

cassandra-write-queue-size = 100000

cassandra-retries = 10

cql-protocol-version = 4

cassandra-create-keyspace = false

cassandra-ssl = false

cassandra-ca-path = /etc/metrictank/ca.pem

cassandra-host-verification = true

cassandra-auth = false

cassandra-username = cassandra

cassandra-password = cassandra

block-profile-rate = 0

mem-profile-rate = 524288

proftrigger-freq = 60s

proftrigger-path = /tmp

proftrigger-min-diff = 1h

proftrigger-heap-thresh = 25000000000

log-level = 3

tracing-enabled = false

timeout = 10s

[retention]

schemas-file = /etc/metrictank/storage-schemas.conf

aggregations-file = /etc/metrictank/storage-aggregation.conf

[stats]

enabled = true

prefix = metrictank.stats.default.$instance

addr = localhost:20003

interval = 1

buffer-size = 20000

[chunk-cache]

max-size = 4294967296

[http]

listen = :6060

gzip = true

ssl = false

cert-file = /etc/ssl/certs/ssl-cert-snakeoil.pem

key-file = /etc/ssl/private/ssl-cert-snakeoil.key

max-points-per-req-soft = 1000000

max-points-per-req-hard = 20000000

multi-tenant = true

fallback-graphite-addr = http://graphite_addr

log-min-dur = 5min

time-zone = local

get-targets-concurrency = 20

[carbon-in]

enabled = false

addr = :2003

partition = 1

[kafka-mdm-in]

enabled = true

brokers = "kafka servers addresses"

topics = "topic lists"

offset = last

partitions = "partition set list"

offset-commit-interval = 5s

data-dir = /var/lib/metrictank

channel-buffer-size = 100000

consumer-fetch-min = 1

consumer-fetch-default = 32768

consumer-max-wait-time = 1s

consumer-max-processing-time = 1s

net-max-open-requests = 100

[cluster]

name = metrictank

primary-node = true

max-priority = 10

peers = "list of other metrictank servers"

mode = multi

http-timeout = 60s

min-available-shards = 0

[swim]

use-config = default-lan

bind-addr = x.x.x.x:7946

tcp-timeout = 10s

indirect-checks = 3

retransmit-mult = 4

suspicion-multi = 4

suspicion-max-timeout-mult = 6

push-pull-interval = 30s

probe-interval = 1s

probe-timeout = 500ms

disable-tcp-pings = false

awareness-max-multiplier = 8

gossip-nodes = 3

gossip-interval = 200ms

gossip-to-the-dead-time = 30s

enable-compression = true

dns-config-path = /etc/resolv.conf

[kafka-cluster]

enabled = true

brokers = "list of kafka brokers addresses"

topic = metricpersist

partitions = "partition set list"

partition-scheme = bySeries

offset = last

offset-commit-interval = 5s

backlog-process-timeout = 60s

data-dir = /var/lib/metrictank

[nsq-cluster]

enabled = false

[cassandra-idx]

enabled = true

keyspace = metrictank

hosts = "list of cassandra addresses"

protocol-version = 4

consistency = one

timeout = 10s

num-conns = 10

write-queue-size = 100000

max-stale = 0

prune-interval = 1h

update-cassandra-index = true

update-interval = 2h

ssl = false

ca-path = /etc/metrictank/ca.pem

host-verification = true

auth = false

username = cassandra

password = cassandra

create-keyspace = false

[memory-idx]
enabled = false
tag-support = false
match-cache-size = 1000

woodsaj commented 6 years ago

This is a bug introduced in https://github.com/grafana/metrictank/commit/fc64bb55e3eb0335b8bc57666100fcd0ad8f9f49

The problem is in: https://github.com/grafana/metrictank/blob/fc64bb55e3eb0335b8bc57666100fcd0ad8f9f49/mdata/aggmetric.go#L562-L567

If len(a.Chunks) is 0, and the reorderBuffer has points, then the call to a.getChunk() will panic.

this needs to be changed to

        if len(a.Chunks) == 0 {
            if (a.rob == nil || !a.rob.HasData()) {
        return true
            } else {
                return false
            }
    }

shanson7 commented 6 years ago

Seems like it could be

    if len(a.Chunks) == 0 {
        return a.rob == nil || a.rob.IsEmpty()
    }

Dieterbe commented 6 years ago

Hi @motyla, the fix is now in master and a new build is being built.

motyla commented 6 years ago

Thanks, will test it soon.

On Tue, Dec 19, 2017, 20:30 Dieter Plaetinck notifications@github.com wrote:

hi @motyla https://github.com/motyla the fix is now in master and building a new build.

— You are receiving this because you were mentioned.

Reply to this email directly, view it on GitHub https://github.com/grafana/metrictank/issues/788#issuecomment-352845788, or mute the thread https://github.com/notifications/unsubscribe-auth/AAGugOAYaak_BqG1FdLdpw7cPXAzvU2fks5tCACzgaJpZM4RFiYU .

motyla commented 6 years ago

The latest build, 0.7.4-556, solves that issue.

grafana / metrictank

panic on restart with new version #788