influxdata / influxdb

Scalable datastore for metrics, events, and real-time analytics
https://influxdata.com
Apache License 2.0
28.59k stars 3.54k forks source link

Neverending compaction #14426

Open alxchk opened 5 years ago

alxchk commented 5 years ago

Steps to reproduce: List the minimal actions needed to reproduce the behavior.

  1. Db work for some time (weeks maybe)
  2. ...
  3. Infinite loop with failures

Expected behavior: Compaction should finish successfully.

Actual behavior: Compaction crashed (?)

Environment info:

/dev/mmcblk0 on /containers type f2fs (rw,relatime,lazytime,background_gc=on,no_heap,user_xattr,inline_xattr,acl,inline_data,inline_dentry,flush_merge,extent_cache,mode=adaptive,active_logs=6)

free -h

          total        used        free      shared  buff/cache   available

Mem: 2.0Gi 602Mi 99Mi 1.0Mi 1.3Gi 1.3Gi Swap: 1.0Gi 227Mi 778Mi

ps aux | grep influxdb

influxdb 23991 5.0 9.5 2834780 195952 ? Ssl 21:54 1:36 /usr/bin/influxd -config /etc/influxdb/influxdb.conf


__Config:__

[meta] dir = "/containers/system/monitoring/influxdb/meta" retention-autocreate = true logging-enabled = true pprof-enabled = false lease-duration = "30m0s" bind-address = "127.0.0.1:8088" enabled = false

[data] enabled = true dir = "/containers/system/monitoring/influxdb/data" wal-dir = "/containers/system/monitoring/influxdb/wal" wal-logging-enabled = true query-log-enabled = false cache-snapshot-memory-size = 512000 compact-min-file-count = 10 compact-full-write-cold-duration = "24h" max-points-per-block = 256 max-concurrent-compactions = 4 max-series-per-database = 4096


__Logs:__
Include snippet of errors in log.

ts=2019-07-23T19:20:43.238047Z lvl=info msg="Aborted compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4I~sl000 op_name=tsm1_compact_group error="compaction in progress: open /containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000002.tsm.tmp: file exists" ts=2019-07-23T19:20:44.239585Z lvl=info msg="TSM compaction (end)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4I~sl000 op_name=tsm1_compact_group op_event=end op_elapsed=1007.711ms ts=2019-07-23T19:20:45.233311Z lvl=info msg="TSM compaction (start)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group op_event=start ts=2019-07-23T19:20:45.234088Z lvl=info msg="Beginning compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group tsm1_files_n=4 ts=2019-07-23T19:20:45.234606Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group tsm1_index=0 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000003-000000002.tsm ts=2019-07-23T19:20:45.235190Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group tsm1_index=1 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000012-000000002.tsm ts=2019-07-23T19:20:45.235759Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group tsm1_index=2 tsm1_file=/containers/system/monitoring/influ xdb/data/collectd/autogen/216/000000021-000000002.tsm ts=2019-07-23T19:20:45.236324Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group tsm1_index=3 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000001.tsm ts=2019-07-23T19:20:45.238585Z lvl=info msg="Aborted compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group error="compaction in progress: open /containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000002.tsm.tmp: file exists" ts=2019-07-23T19:20:46.241812Z lvl=info msg="TSM compaction (end)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group op_event=end op_elapsed=1010.110ms ts=2019-07-23T19:20:47.233691Z lvl=info msg="TSM compaction (start)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group op_event=start ts=2019-07-23T19:20:47.234447Z lvl=info msg="Beginning compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group tsm1_files_n=4 ts=2019-07-23T19:20:47.234995Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group tsm1_index=0 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000003-000000002.tsm ts=2019-07-23T19:20:47.235585Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group tsm1_index=1 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000012-000000002.tsm ts=2019-07-23T19:20:47.236159Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group tsm1_index=2 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000021-000000002.tsm ts=2019-07-23T19:20:47.236735Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group tsm1_index=3 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000001.tsm ts=2019-07-23T19:20:47.238685Z lvl=info msg="Aborted compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group error="compaction in progress: open /containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000002.tsm.tmp: file exists" ts=2019-07-23T19:20:48.240944Z lvl=info msg="TSM compaction (end)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group op_event=end op_elapsed=1007.408ms ts=2019-07-23T19:20:49.233927Z lvl=info msg="TSM compaction (start)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group op_event=start ts=2019-07-23T19:20:49.242853Z lvl=info msg="Beginning compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group tsm1_files_n=4 ts=2019-07-23T19:20:49.243677Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group tsm1_index=0 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000003-000000002.tsm ts=2019-07-23T19:20:49.244268Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group tsm1_index=1 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000012-000000002.tsm ts=2019-07-23T19:20:49.244826Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group tsm1_index=2 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000021-000000002.tsm ts=2019-07-23T19:20:49.245365Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group tsm1_index=3 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000001.tsm ts=2019-07-23T19:20:49.253546Z lvl=info msg="Aborted compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group error="compaction in progress: open /containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000002.tsm.tmp: file exists" ts=2019-07-23T19:20:50.255163Z lvl=info msg="TSM compaction (end)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group op_event=end op_elapsed=1021.392ms ts=2019-07-23T19:20:51.232560Z lvl=info msg="TSM compaction (start)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JV7l000 op_name=tsm1_compact_group op_event=start ts=2019-07-23T19:20:51.233319Z lvl=info msg="Beginning compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JV7l000 op_name=tsm1_compact_group tsm1_files_n=4 ts=2019-07-23T19:20:51.233842Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JV7l000 op_name=tsm1_compact_group tsm1_index=0 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000003-000000002.tsm ts=2019-07-23T19:20:51.234417Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JV7l000 op_name=tsm1_compact_group tsm1_index=1 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000012-000000002.tsm ts=2019-07-23T19:20:51.234986Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JV7l000 op_name=tsm1_compact_group tsm1_index=2 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000021-000000002.tsm ts=2019-07-23T19:20:51.235551Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JV7l000 op_name=tsm1_compact_group tsm1_index=3 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000001.tsm ts=2019-07-23T19:20:51.237402Z lvl=info msg="Aborted compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JV7l000 op_name=tsm1_compact_group error="compaction in progress: open /containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000002.tsm.tmp: file exists"

.....



[vars.txt](https://github.com/influxdata/influxdb/files/3423784/vars.txt)
yellowpattern commented 5 years ago

I've also seen the "Aborted compaction" a lot. See: https://github.com/influxdata/influxdb/issues/14608

stale[bot] commented 4 years ago

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.

yellowpattern commented 4 years ago

What does "needs repro" mean? Needs reproducing?

zeinsteinz commented 4 years ago

I have the same issue, and this behavior will explode 'syslog' and 'daemon.log' files

mrMoe commented 4 years ago

as am I

aemondis commented 4 years ago

Is this on a 32-bit platform such as an RPi3? If so, look at #11339.

Your options are move to a 64-bit kernel (and try your luck with the 2GB of RAM you have) or move to a larger device entirely. An experimental 64-bit kernel is available for Raspbian, which I am using successfully for many months now on an RPi 4 with 4GB RAM, where InfluxDB has not had any issues. RPi3 is a dead end for InfluxDB as the devs have indicated there will be no 32-bit "niceties" for the compaction (a fix was actually provided as a prior branch that will fix this issue, but will not be merged it appears). The compaction fails simply because it tries to load everything into RAM, which once the database is large enough - simply cannot be allocated in a 32-bit environment.

alxchk commented 4 years ago

It's more about virtual address space, not /physical/ memory consumption.

As for me - I stuck with aggressive retention policy to keep DB small. Each tech may have it's own limitation, but situations like this are sick.

Okay, dear influxdb, if I can not have so many data, please inform me about this with the message I can understand or hint me how I can fix|workaround this. Just don't destroy my logs/device/storage with trying to do something which can not be done in never ending loop. ..

gshif commented 4 years ago

Setup

Result

Introduce tmp file

little-arhat commented 2 years ago
ts=2021-10-12T11:36:54.985996Z lvl=info msg="Compacting file" log_id=0X94JF8W000 engine=tsm1 tsm1_level=2 tsm1_strategy=level trace_id=0X94VknG000 op_name=tsm1_compact_group db_shard_id=2595 tsm1_index=30 tsm1_file=/data/influxdb/data/db/default/2595/000042198-000000032.tsm
ts=2021-10-12T11:36:55.236326Z lvl=info msg="Aborted compaction" log_id=0X94JF8W000 engine=tsm1 tsm1_level=2 tsm1_strategy=level trace_id=0X94NzwG004 op_name=tsm1_compact_group db_shard_id=2604 error="compactions disabled"

What does above mean?

torsj commented 5 months ago

I have the same problem on an RPi4, InfluxDB v1.6.4. It has been running for 11 months, and lately I've got some timeouts when writing, and also one crash. I have tried to delete the .tmp files, but to no avail.

The syslog is overrun with these messages:

Apr 09 09:49:29 raspi4 influxd[15829]: ts=2024-04-09T07:49:29.311705Z lvl=info msg="TSM compaction (start)" log_id=0oSSrZl0000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0oSTB0Nl000 op_name=tsm1_compact_group op_event=start Apr 09 09:49:29 raspi4 influxd[15829]: ts=2024-04-09T07:49:29.311794Z lvl=info msg="Beginning compaction" log_id=0oSSrZl0000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0oSTB0Nl000 op_name=tsm1_compact_group tsm1_files_n=3 Apr 09 09:49:29 raspi4 influxd[15829]: ts=2024-04-09T07:49:29.311846Z lvl=info msg="Compacting file" log_id=0oSSrZl0000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0oSTB0Nl000 op_name=tsm1_compact_group tsm1_index=0 tsm1_file=/var/lib/infl Apr 09 09:49:29 raspi4 influxd[15829]: ts=2024-04-09T07:49:29.311899Z lvl=info msg="Compacting file" log_id=0oSSrZl0000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0oSTB0Nl000 op_name=tsm1_compact_group tsm1_index=1 tsm1_file=/var/lib/infl Apr 09 09:49:29 raspi4 influxd[15829]: ts=2024-04-09T07:49:29.311951Z lvl=info msg="Compacting file" log_id=0oSSrZl0000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0oSTB0Nl000 op_name=tsm1_compact_group tsm1_index=2 tsm1_file=/var/lib/infl Apr 09 09:49:29 raspi4 influxd[15829]: ts=2024-04-09T07:49:29.312172Z lvl=info msg="Aborted compaction" log_id=0oSSrZl0000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0oSTB0Nl000 op_name=tsm1_compact_group error="compaction in progress: op Apr 09 09:49:29 raspi4 influxd[15829]: ts=2024-04-09T07:49:29.315844Z lvl=info msg="TSM compaction (end)" log_id=0oSSrZl0000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0oSTAxUl000 op_name=tsm1_compact_group op_event=end op_elapsed=1000.60

Around the time of the crash today, I found this in syslog:

Apr 9 00:06:16 raspi4 influxd[534]: [httpd] 10.10.10.237 - tor [09/Apr/2024:00:06:16 +0200] "POST /write?db=nexahack_test HTTP/1.1" 204 0 "-" "python-requests/2.25.1" 38717363-f5f4-11ee-bc76-000000000000 5234 Apr 9 00:06:17 raspi4 influxd[534]: [httpd] 10.10.10.237 - tor [09/Apr/2024:00:06:16 +0200] "POST /write?db=nexahack_test HTTP/1.1" 204 0 "-" "python-requests/2.25.1" 38a63393-f5f4-11ee-bc77-000000000000 419511 Apr 9 00:06:17 raspi4 influxd[534]: ts=2024-04-08T22:06:17.050267Z lvl=info msg="Error adding new TSM files from snapshot. Removing temp files." log_id=0nY00NNG000 engine=tsm1 trace_id=0oRwo6Sl000 op_name=tsm1_cache_snapshot error="cannot allocate memory " Apr 9 00:06:17 raspi4 influxd[534]: ts=2024-04-08T22:06:17.050353Z lvl=info msg="Cache snapshot (end)" log_id=0nY00NNG000 engine=tsm1 trace_id=0oRwo6Sl000 op_name=tsm1_cache_snapshot op_event=end op_elapsed=2406.519ms Apr 9 00:06:17 raspi4 influxd[534]: ts=2024-04-08T22:06:17.050396Z lvl=info msg="Error writing snapshot" log_id=0nY00NNG000 engine=tsm1 error="cannot allocate memory" Apr 9 00:06:17 raspi4 influxd[534]: ts=2024-04-08T22:06:17.050446Z lvl=info msg="Cache snapshot (start)" log_id=0nY00NNG000 engine=tsm1 trace_id=0oRwoFrW000 op_name=tsm1_cache_snapshot op_event=start Apr 9 00:06:17 raspi4 influxd[534]: [httpd] 10.10.10.237 - tor [09/Apr/2024:00:06:17 +0200] "POST /write?db=nexahack_test HTTP/1.1" 204 0 "-" "python-requests/2.25.1" 38e6df74-f5f4-11ee-bc78-000000000000 6409 Apr 9 00:06:17 raspi4 influxd[534]: ts=2024-04-08T22:06:17.089525Z lvl=info msg="TSM compaction (start)" log_id=0nY00NNG000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0oRwoG0G000 op_name=tsm1_compact_group op_event=start Apr 9 00:06:17 raspi4 influxd[534]: ts=2024-04-08T22:06:17.089578Z lvl=info msg="Beginning compaction" log_id=0nY00NNG000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0oRwoG0G000 op_name=tsm1_compact_group tsm1_files_n=2 Apr 9 00:06:17 raspi4 influxd[534]: ts=2024-04-08T22:06:17.089599Z lvl=info msg="Compacting file" log_id=0nY00NNG000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0oRwoG0G000 op_name=tsm1_compact_group tsm1_index=0 tsm1_file=/var/lib/influx db/data/nexahack_test/autogen/367/000000002-000000002.tsm Apr 9 00:06:17 raspi4 influxd[534]: ts=2024-04-08T22:06:17.089622Z lvl=info msg="Compacting file" log_id=0nY00NNG000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0oRwoG0G000 op_name=tsm1_compact_group tsm1_index=1 tsm1_file=/var/lib/influx