peter-mount / nre-feeds

go library & Application for handling the NRE DarwinD3 feeds
Apache License 2.0
8 stars 0 forks source link

LDB out of memory error #8

Closed peter-mount closed 5 years ago

peter-mount commented 5 years ago

Whilst the v16 feed was running for the first time tonight I've noticed this:

2019/04/05 19:10:59 StationMessage 83726 []
2019/04/05 19:12:17 StationMessage 83734 [GLC SOA CBN LSK PLY WKF NTA EXD LMS YRK ALM LDS WIL BWK NCL BPW BSK DBY CDF WNE BUT BEE SOU PAR PNZ TOT SPO HYM EDB RED BAN DAR CNM ATB OXF LGE TRU DON TAM UNI SAU GCR MTH SER TAU TVP BOD DHM SHF BHM CPW RDG NWP WIN BRI]
fatal error: runtime: out of memory

runtime stack:
runtime.throw(0x8dd4ce, 0x16)
    /usr/local/go/src/runtime/panic.go:617 +0x72
runtime.sysMap(0xc0ec000000, 0x4000000, 0xcdf578)
    /usr/local/go/src/runtime/mem_linux.go:170 +0xc7
runtime.(*mheap).sysAlloc(0xcc6fc0, 0x2000, 0xcc6fd0, 0x1)
    /usr/local/go/src/runtime/malloc.go:633 +0x1cd
runtime.(*mheap).grow(0xcc6fc0, 0x1, 0x0)
    /usr/local/go/src/runtime/mheap.go:1232 +0x42
runtime.(*mheap).allocSpanLocked(0xcc6fc0, 0x1, 0xcdf588, 0x7f92cfc0d1e0)
    /usr/local/go/src/runtime/mheap.go:1150 +0x3a7
runtime.(*mheap).alloc_m(0xcc6fc0, 0x1, 0xc000000036, 0x7f92cfc0d1e0)
    /usr/local/go/src/runtime/mheap.go:977 +0xc2
runtime.(*mheap).alloc.func1()
    /usr/local/go/src/runtime/mheap.go:1048 +0x4c
runtime.systemstack(0x1c)
    /usr/local/go/src/runtime/asm_amd64.s:351 +0x66
runtime.mstart()
    /usr/local/go/src/runtime/proc.go:1153

goroutine 25 [running]:
runtime.systemstack_switch()
    /usr/local/go/src/runtime/asm_amd64.s:311 fp=0xc0e4774ab8 sp=0xc0e4774ab0 pc=0x455d30
runtime.(*mheap).alloc(0xcc6fc0, 0x1, 0x10036, 0x0)
    /usr/local/go/src/runtime/mheap.go:1047 +0x8a fp=0xc0e4774b08 sp=0xc0e4774ab8 pc=0x42407a
runtime.(*mcentral).grow(0xcc80c0, 0x0)
    /usr/local/go/src/runtime/mcentral.go:256 +0x95 fp=0xc0e4774b50 sp=0xc0e4774b08 pc=0x417135
runtime.(*mcentral).cacheSpan(0xcc80c0, 0x848840)
    /usr/local/go/src/runtime/mcentral.go:106 +0x2ff fp=0xc0e4774bb0 sp=0xc0e4774b50 pc=0x416c3f
runtime.(*mcache).refill(0x7f92d6b2a008, 0x36)
    /usr/local/go/src/runtime/mcache.go:135 +0x86 fp=0xc0e4774bd0 sp=0xc0e4774bb0 pc=0x4166d6
runtime.(*mcache).nextFree(0x7f92d6b2a008, 0x43b136, 0x884e20, 0x4032e0, 0xc0e4774d18)
    /usr/local/go/src/runtime/malloc.go:786 +0x88 fp=0xc0e4774c08 sp=0xc0e4774bd0 pc=0x40b078
runtime.mallocgc(0x280, 0x8b8620, 0x848001, 0x5)
    /usr/local/go/src/runtime/malloc.go:939 +0x76e fp=0xc0e4774ca8 sp=0xc0e4774c08 pc=0x40b98e
reflect.unsafe_New(0x8b8620, 0x100000084ca7d)
    /usr/local/go/src/runtime/malloc.go:1073 +0x38 fp=0xc0e4774cd8 sp=0xc0e4774ca8 pc=0x40bdf8
reflect.New(0x9eb880, 0x8b8620, 0x8b8620, 0x9eb880, 0x89c4a0)
    /usr/local/go/src/reflect/value.go:2290 +0x45 fp=0xc0e4774d08 sp=0xc0e4774cd8 pc=0x482ca5
encoding/json.indirect(0x89c4a0, 0xc0ebffa5b0, 0x196, 0x100, 0x0, 0x0, 0x182, 0x0, 0x884e20, 0xc0ebffa590, ...)
    /usr/local/go/src/encoding/json/decode.go:474 +0x3f1 fp=0xc0e4774d88 sp=0xc0e4774d08 pc=0x4e46d1
encoding/json.(*decodeState).object(0xc0e7881180, 0x89c4a0, 0xc0ebffa5b0, 0x196, 0xc0e78811a8, 0x7b)
    /usr/local/go/src/encoding/json/decode.go:607 +0x62 fp=0xc0e4775048 sp=0xc0e4774d88 pc=0x4e5792
encoding/json.(*decodeState).value(0xc0e7881180, 0x89c4a0, 0xc0ebffa5b0, 0x196, 0x89c4a0, 0xc0ebffa5b0)
    /usr/local/go/src/encoding/json/decode.go:381 +0x6e fp=0xc0e47750b0 sp=0xc0e4775048 pc=0x4e401e
encoding/json.(*decodeState).object(0xc0e7881180, 0x8825a0, 0xc0df394a00, 0x196, 0xc0e78811a8, 0x7b)
    /usr/local/go/src/encoding/json/decode.go:763 +0x18bc fp=0xc0e4775370 sp=0xc0e47750b0 pc=0x4e6fec
encoding/json.(*decodeState).value(0xc0e7881180, 0x8825a0, 0xc0df394a00, 0x196, 0x0, 0x16)
    /usr/local/go/src/encoding/json/decode.go:381 +0x6e fp=0xc0e47753d8 sp=0xc0e4775370 pc=0x4e401e
encoding/json.(*decodeState).object(0xc0e7881180, 0x848c00, 0xc0e6b43568, 0x195, 0xc0e78811a8, 0x7b)
    /usr/local/go/src/encoding/json/decode.go:763 +0x18bc fp=0xc0e4775698 sp=0xc0e47753d8 pc=0x4e6fec
encoding/json.(*decodeState).value(0xc0e7881180, 0x848c00, 0xc0e6b43568, 0x195, 0x848c00, 0xc0e6b43568)
    /usr/local/go/src/encoding/json/decode.go:381 +0x6e fp=0xc0e4775700 sp=0xc0e4775698 pc=0x4e401e
encoding/json.(*decodeState).object(0xc0e7881180, 0x8992c0, 0xc0e6b43540, 0x16, 0xc0e78811a8, 0xc0eb79cb7b)
    /usr/local/go/src/encoding/json/decode.go:763 +0x18bc fp=0xc0e47759c0 sp=0xc0e4775700 pc=0x4e6fec
encoding/json.(*decodeState).value(0xc0e7881180, 0x8992c0, 0xc0e6b43540, 0x16, 0xc0e4775a70, 0x4f6e39)
    /usr/local/go/src/encoding/json/decode.go:381 +0x6e fp=0xc0e4775a28 sp=0xc0e47759c0 pc=0x4e401e
encoding/json.(*decodeState).unmarshal(0xc0e7881180, 0x8992c0, 0xc0e6b43540, 0xc0e78811a8, 0x0)
    /usr/local/go/src/encoding/json/decode.go:179 +0x209 fp=0xc0e4775ab0 sp=0xc0e4775a28 pc=0x4e3839
encoding/json.Unmarshal(0x7f92b7ffdc01, 0xbe791, 0xbe791, 0x8992c0, 0xc0e6b43540, 0xc0ebf43200, 0x3)
    /usr/local/go/src/encoding/json/decode.go:106 +0x123 fp=0xc0e4775af8 sp=0xc0e4775ab0 pc=0x4e3213
github.com/peter-mount/nre-feeds/ldb.StationFromBytes(0x7f92b7ffdc01, 0xbe791, 0xbe791, 0x3)
    /work/ldb/station.go:50 +0x78 fp=0xc0e4775b48 sp=0xc0e4775af8 pc=0x7e9408
github.com/peter-mount/nre-feeds/ldb.(*LDB).GetStationTiploc.func1(0xc0ebe801c0, 0x8f3418, 0xc0ebe801c0)
    /work/ldb/stations.go:59 +0x171 fp=0xc0e4775bd0 sp=0xc0e4775b48 pc=0x7ebdb1
github.com/etcd-io/bbolt.(*DB).View(0xc00000a1e0, 0xc0e4775c78, 0x0, 0x0)
    /go/pkg/mod/github.com/etcd-io/bbolt@v1.3.1-etcd.8.0.20181208010242-22d122a8b910/db.go:701 +0x90 fp=0xc0e4775c20 sp=0xc0e4775bd0 pc=0x7071d0
github.com/peter-mount/nre-feeds/ldb.(*LDB).View(...)
    /work/ldb/departures.go:96
github.com/peter-mount/nre-feeds/ldb.(*LDB).GetStationTiploc(0xc00008a6e0, 0xc0ebee6172, 0x7, 0xc0ebed3500)
    /work/ldb/stations.go:54 +0xca fp=0xc0e4775cb0 sp=0xc0e4775c20 pc=0x7ea28a
github.com/peter-mount/nre-feeds/ldb.(*LDB).locationListener(0xc00008a6e0, 0xc0eb89caf0)
    /work/ldb/locationListener.go:15 +0x91 fp=0xc0e4775d00 sp=0xc0e4775cb0 pc=0x7e89f1
github.com/peter-mount/nre-feeds/ldb.(*LDB).locationListener-fm(0xc0eb89caf0)
    /work/ldb/locationListener.go:9 +0x34 fp=0xc0e4775d20 sp=0xc0e4775d00 pc=0x7ec044
github.com/peter-mount/nre-feeds/darwind3.(*DarwinEventManager).ListenToEvents.func1(0xc0001e2a80, 0x8da19f, 0xf, 0xc000026240)
    /work/darwind3/event.go:114 +0x16c fp=0xc0e4775fc0 sp=0xc0e4775d20 pc=0x7e26bc
runtime.goexit()
    /usr/local/go/src/runtime/asm_amd64.s:1337 +0x1 fp=0xc0e4775fc8 sp=0xc0e4775fc0 pc=0x457c81
created by github.com/peter-mount/nre-feeds/darwind3.(*DarwinEventManager).ListenToEvents
    /work/darwind3/event.go:103 +0x4c3

goroutine 1 [IO wait, 46 minutes]:
internal/poll.runtime_pollWait(0x7f92d48c5f38, 0x72, 0x0)
    /usr/local/go/src/runtime/netpoll.go:182 +0x56
internal/poll.(*pollDesc).wait(0xc002c2b318, 0x72, 0x0, 0x0, 0x8d7017)
    /usr/local/go/src/internal/poll/fd_poll_runtime.go:87 +0x9b
internal/poll.(*pollDesc).waitRead(...)
    /usr/local/go/src/internal/poll/fd_poll_runtime.go:92
internal/poll.(*FD).Accept(0xc002c2b300, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
    /usr/local/go/src/internal/poll/fd_unix.go:384 +0x1ba
net.(*netFD).accept(0xc002c2b300, 0xc000028070, 0xc000028000, 0x40b509)
    /usr/local/go/src/net/fd_unix.go:238 +0x42
net.(*TCPListener).accept(0xc0001eb118, 0xc00020dba0, 0x72888940, 0xf0b4a67c226ebbbb)
    /usr/local/go/src/net/tcpsock_posix.go:139 +0x32
net.(*TCPListener).AcceptTCP(0xc0001eb118, 0xc00020dbc8, 0x4a23c6, 0x5ca79e85)
    /usr/local/go/src/net/tcpsock.go:247 +0x48
net/http.tcpKeepAliveListener.Accept(0xc0001eb118, 0xc00020dc18, 0x18, 0xc000000180, 0x67e1c4)
    /usr/local/go/src/net/http/server.go:3264 +0x2f
net/http.(*Server).Serve(0xc0001e00d0, 0x9e2200, 0xc0001eb118, 0x0, 0x0)
    /usr/local/go/src/net/http/server.go:2859 +0x22d
net/http.(*Server).ListenAndServe(0xc0001e00d0, 0x16, 0xc00020de30)
    /usr/local/go/src/net/http/server.go:2797 +0xe4
github.com/peter-mount/golib/rest.(*Server).Run(0xc0001854d0, 0x9da5c0, 0xc0001854d0)
    /go/pkg/mod/github.com/peter-mount/golib@v0.0.0-20181214134500-eb1182399956/rest/server.go:174 +0x56f
github.com/peter-mount/golib/kernel.(*Kernel).run(0xc00008cb00, 0x0, 0x0)
    /go/pkg/mod/github.com/peter-mount/golib@v0.0.0-20181214134500-eb1182399956/kernel/kernel.go:200 +0xb4
github.com/peter-mount/golib/kernel.Launch(0xc0001b1f78, 0x1, 0x1, 0x0, 0x0)
    /go/pkg/mod/github.com/peter-mount/golib@v0.0.0-20181214134500-eb1182399956/kernel/kernel.go:122 +0x27e
main.main()
    /work/ldb/bin/main.go:10 +0x6f

goroutine 18 [syscall, 129 minutes]:
os/signal.signal_recv(0x0)
    /usr/local/go/src/runtime/sigqueue.go:139 +0x9c
os/signal.loop()
    /usr/local/go/src/os/signal/signal_unix.go:23 +0x22
created by os/signal.init.0
    /usr/local/go/src/os/signal/signal_unix.go:29 +0x41

goroutine 4 [chan receive, 129 minutes]:
github.com/peter-mount/golib/kernel.Launch.func1(0xc000086a80, 0xc00008cb00)
    /go/pkg/mod/github.com/peter-mount/golib@v0.0.0-20181214134500-eb1182399956/kernel/kernel.go:106 +0x41
created by github.com/peter-mount/golib/kernel.Launch
    /go/pkg/mod/github.com/peter-mount/golib@v0.0.0-20181214134500-eb1182399956/kernel/kernel.go:105 +0x20a

goroutine 5 [select, 129 minutes]:
gopkg.in/robfig/cron%2ev2.(*Cron).run(0xc00008a780)
    /go/pkg/mod/gopkg.in/robfig/cron.v2@v2.0.0-20150107220207-be2e0b0deed5/cron.go:182 +0x561
created by gopkg.in/robfig/cron%2ev2.(*Cron).Start
    /go/pkg/mod/gopkg.in/robfig/cron.v2@v2.0.0-20150107220207-be2e0b0deed5/cron.go:157 +0x43

goroutine 23 [runnable]:
syscall.Syscall(0x0, 0x3, 0xc000221000, 0x1000, 0x1000, 0x1000, 0x0)
    /usr/local/go/src/syscall/asm_linux_amd64.s:18 +0x5
syscall.read(0x3, 0xc000221000, 0x1000, 0x1000, 0x0, 0x9db8a0, 0xc7e348)
    /usr/local/go/src/syscall/zsyscall_linux_amd64.go:732 +0x5a
syscall.Read(...)
    /usr/local/go/src/syscall/syscall_unix.go:172
internal/poll.(*FD).Read(0xc00010a600, 0xc000221000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
    /usr/local/go/src/internal/poll/fd_unix.go:165 +0x131
net.(*netFD).Read(0xc00010a600, 0xc000221000, 0x1000, 0x1000, 0xd80, 0xd4d, 0xc02ca88380)
    /usr/local/go/src/net/fd_unix.go:202 +0x4f
net.(*conn).Read(0xc000084100, 0xc000221000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
    /usr/local/go/src/net/net.go:177 +0x69
bufio.(*Reader).Read(0xc000087080, 0xc0e23f8657, 0x7, 0x7, 0xc000267e60, 0x40bd98, 0x7)
    /usr/local/go/src/bufio/bufio.go:223 +0x23e
io.ReadAtLeast(0x9da240, 0xc000087080, 0xc0e23f8657, 0x7, 0x7, 0x7, 0xc000267ed8, 0x404d75, 0xc000087020)
    /usr/local/go/src/io/io.go:310 +0x88
io.ReadFull(...)
    /usr/local/go/src/io/io.go:329
github.com/streadway/amqp.(*reader).ReadFrame(0xc000267f40, 0xc000267f50, 0xc0e40cf840, 0x0, 0x0)
    /go/pkg/mod/github.com/streadway/amqp@v0.0.0-20181205114330-a314942b2fd9/read.go:49 +0x76
github.com/streadway/amqp.(*Connection).reader(0xc0000ddb80, 0x9da800, 0xc000084100)
    /go/pkg/mod/github.com/streadway/amqp@v0.0.0-20181205114330-a314942b2fd9/connection.go:516 +0xd6
created by github.com/streadway/amqp.Open
    /go/pkg/mod/github.com/streadway/amqp@v0.0.0-20181205114330-a314942b2fd9/connection.go:230 +0x261

goroutine 24 [select]:
github.com/streadway/amqp.(*Connection).heartbeater(0xc0000ddb80, 0x2540be400, 0xc0000870e0)
    /go/pkg/mod/github.com/streadway/amqp@v0.0.0-20181205114330-a314942b2fd9/connection.go:546 +0x187
created by github.com/streadway/amqp.(*Connection).openTune
    /go/pkg/mod/github.com/streadway/amqp@v0.0.0-20181205114330-a314942b2fd9/connection.go:777 +0x482

goroutine 10 [select]:
github.com/streadway/amqp.(*consumers).buffer(0xc00001e8a0, 0xc0001e2ae0, 0xc0001e2a80)
    /go/pkg/mod/github.com/streadway/amqp@v0.0.0-20181205114330-a314942b2fd9/consumers.go:65 +0x1da
created by github.com/streadway/amqp.(*consumers).add
    /go/pkg/mod/github.com/streadway/amqp@v0.0.0-20181205114330-a314942b2fd9/consumers.go:97 +0x163

goroutine 26 [chan receive]:
github.com/streadway/amqp.(*consumers).buffer(0xc00017fc20, 0xc0000669c0, 0xc000066960)
    /go/pkg/mod/github.com/streadway/amqp@v0.0.0-20181205114330-a314942b2fd9/consumers.go:61 +0x351
created by github.com/streadway/amqp.(*consumers).add
    /go/pkg/mod/github.com/streadway/amqp@v0.0.0-20181205114330-a314942b2fd9/consumers.go:97 +0x163

goroutine 11 [chan receive]:
github.com/peter-mount/nre-feeds/darwind3.(*DarwinEventManager).ListenToEvents.func1(0xc000066960, 0x8d8715, 0xb, 0xc0001c2800)
    /work/darwind3/event.go:105 +0x7e
created by github.com/peter-mount/nre-feeds/darwind3.(*DarwinEventManager).ListenToEvents
    /work/darwind3/event.go:103 +0x4c3

goroutine 12 [chan receive, 3 minutes]:
github.com/streadway/amqp.(*consumers).buffer(0xc00001eae0, 0xc0001e2c60, 0xc0001e2c00)
    /go/pkg/mod/github.com/streadway/amqp@v0.0.0-20181205114330-a314942b2fd9/consumers.go:61 +0x351
created by github.com/streadway/amqp.(*consumers).add
    /go/pkg/mod/github.com/streadway/amqp@v0.0.0-20181205114330-a314942b2fd9/consumers.go:97 +0x163

goroutine 27 [chan receive, 3 minutes]:
github.com/peter-mount/nre-feeds/darwind3.(*DarwinEventManager).ListenToEvents.func1(0xc0001e2c00, 0x8d9b29, 0xe, 0xc0000262f0)
    /work/darwind3/event.go:105 +0x7e
created by github.com/peter-mount/nre-feeds/darwind3.(*DarwinEventManager).ListenToEvents
    /work/darwind3/event.go:103 +0x4c3

So although ldb was running, something isn't right

peter-mount commented 5 years ago

It's definitely the fault of LDB but it could be an issue with memory fragmentation & how go's GC operates.

I've got both the d3 & ldb services recording memory use into Graphite now & it's clear that it's LDB's fault.

  1. d3 can be seen at D3 Grafana
  2. ldb can be seen at LDB Grafana

Even after a full resync from darwin the D3 service only hits 469MB of memory use, but LDB hits 3.7GB!

The problem seems to be where, in LDB when things settle down it's only using just under 25MB of heap, but the RSS is still at 3.7GB and the OS (Linux/Docker in this instance) doesn't claim it back so the server memory use is still high, as is swap as pages get pushed to swap.

Screenshot_2019-04-12_23-53-31 Screenshot_2019-04-12_23-53-16

peter-mount commented 5 years ago

This issue had a big learning curve. Due to how golang handles it's heap, specifically structs with pointers, the heap grew at times when the system was underload, e.g. a full resync or during the morning snapshot updates. Also changed what is stored per service at each station so that it's storing only what it needs & getting the full data at request time reduced the memory footprint even more.