elastic / logstash

Logstash - transport and process your logs, events, or other data
https://www.elastic.co/products/logstash

All workers are dead #6401

Closed: joshuaspence closed this issue 7 years ago

joshuaspence commented 7 years ago

We are seeing severe performance issues with Logstash 2.4.1. Basically, after some amount of time (anywhere from a few hours to a few days), Logstash throughput drops off dramatically. The service is still running and still listening on port 5044 (we are using the Beats input), but very few messages seem to make it to the outputs. Running service logstash restart dumps the following JSON data to the log file:

{
    "timestamp": "2016-12-12T23:06:15.481000-0500",
    "message": {
        "events_filtered": 3376805,
        "events_consumed": 3373750,
        "worker_count": 4,
        "inflight_count": 0,
        "worker_states": [
            {
                "status": "dead",
                "alive": false,
                "index": 0,
                "inflight_count": 0
            },
            {
                "status": "dead",
                "alive": false,
                "index": 1,
                "inflight_count": 0
            },
            {
                "status": "dead",
                "alive": false,
                "index": 2,
                "inflight_count": 0
            },
            {
                "status": "dead",
                "alive": false,
                "index": 3,
                "inflight_count": 0
            }
        ],
        "output_info": [
            {
                "type": "http",
                "config": {
                    "http_method": "get",
                    "url": "http://127.0.0.1:8500/v1/agent/check/pass/service:beats:1",
                    "__ALLOW_ENV__": false
                },
                "is_multi_worker": false,
                "events_received": 1539,
                "workers": [
                    "LogStash::Outputs::Http: {\"http_method\"=>\"get\", \"url\"=>\"http://127.0.0.1:8500/v1/agent/check/pass/service:beats:1\", \"codec\"=><LogStash::Codecs::Plain charset=>\"UTF-8\">, \"workers\"=>1, \"request_timeout\"=>60, \"socket_timeout\"=>10, \"connect_timeout\"=>10, \"follow_redirects\"=>true, \"pool_max\"=>50, \"pool_max_per_route\"=>25, \"keepalive\"=>true, \"automatic_retries\"=>1, \"retry_non_idempotent\"=>false, \"validate_after_inactivity\"=>200, \"ssl_certificate_validation\"=>true, \"keystore_type\"=>\"JKS\", \"truststore_type\"=>\"JKS\", \"cookies\"=>true, \"verify_ssl\"=>true, \"format\"=>\"json\"}"
                ],
                "busy_workers": 1
            },
            {
                "type": "sqs",
                "config": {
                    "queue": "logstash",
                    "region": "us-east-1",
                    "workers": 4,
                    "__ALLOW_ENV__": false
                },
                "is_multi_worker": true,
                "events_received": 3376805,
                "workers": [
                    "LogStash::Outputs::SQS: {\"queue\"=>\"logstash\", \"region\"=>\"us-east-1\", \"workers\"=>4, \"codec\"=><LogStash::Codecs::Plain charset=>\"UTF-8\">, \"use_ssl\"=>true, \"batch\"=>true, \"batch_events\"=>10, \"batch_timeout\"=>5, \"message_max_size\"=>262144, \"stop_on_fail\"=>false}",
                    "LogStash::Outputs::SQS: {\"queue\"=>\"logstash\", \"region\"=>\"us-east-1\", \"workers\"=>4, \"codec\"=><LogStash::Codecs::Plain charset=>\"UTF-8\">, \"use_ssl\"=>true, \"batch\"=>true, \"batch_events\"=>10, \"batch_timeout\"=>5, \"message_max_size\"=>262144, \"stop_on_fail\"=>false}",
                    "LogStash::Outputs::SQS: {\"queue\"=>\"logstash\", \"region\"=>\"us-east-1\", \"workers\"=>4, \"codec\"=><LogStash::Codecs::Plain charset=>\"UTF-8\">, \"use_ssl\"=>true, \"batch\"=>true, \"batch_events\"=>10, \"batch_timeout\"=>5, \"message_max_size\"=>262144, \"stop_on_fail\"=>false}",
                    "LogStash::Outputs::SQS: {\"queue\"=>\"logstash\", \"region\"=>\"us-east-1\", \"workers\"=>4, \"codec\"=><LogStash::Codecs::Plain charset=>\"UTF-8\">, \"use_ssl\"=>true, \"batch\"=>true, \"batch_events\"=>10, \"batch_timeout\"=>5, \"message_max_size\"=>262144, \"stop_on_fail\"=>false}"
                ],
                "busy_workers": 3
            },
            {
                "type": "statsd",
                "config": {
                    "count": {
                        "logstash_shipper.logstash_events": "%{[events][count]}"
                    },
                    "gauge": {
                        "logstash_shipper.logstash_event_rate_1m": "%{[events][rate_1m]}",
                        "logstash_shipper.logstash_event_rate_5m": "%{[events][rate_5m]}",
                        "logstash_shipper.logstash_event_rate_15m": "%{[events][rate_15m]}"
                    },
                    "host": "localhost",
                    "port": 9125,
                    "sender": "i-8da97a9a",
                    "__ALLOW_ENV__": false
                },
                "is_multi_worker": false,
                "events_received": 3055,
                "workers": [
                    "LogStash::Outputs::Statsd: {\"count\"=>{\"logstash_shipper.logstash_events\"=>\"%{[events][count]}\"}, \"gauge\"=>{\"logstash_shipper.logstash_event_rate_1m\"=>\"%{[events][rate_1m]}\", \"logstash_shipper.logstash_event_rate_5m\"=>\"%{[events][rate_5m]}\", \"logstash_shipper.logstash_event_rate_15m\"=>\"%{[events][rate_15m]}\"}, \"host\"=>\"localhost\", \"port\"=>9125, \"sender\"=>\"i-8da97a9a\", \"codec\"=><LogStash::Codecs::Plain charset=>\"UTF-8\">, \"workers\"=>1, \"namespace\"=>\"logstash\", \"increment\"=>[], \"decrement\"=>[], \"timing\"=>{}, \"set\"=>{}, \"sample_rate\"=>1, \"debug\"=>false}"
                ],
                "busy_workers": 0
            }
        ],
        "thread_info": [],
        "stalling_threads_info": []
    },
    "level": "warn"
}
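
For context, the pipeline behind this dump looks roughly like the sketch below. This is a reconstruction from the output_info and thread_info sections (plus the Beats port mentioned above), not the exact configuration file; the rest of the filter section, and the conditionals that presumably route only the metric events to the http and statsd outputs, are assumptions.

input {
  beats {
    port => 5044
  }
}

filter {
  # Other filters omitted; only the metrics filter is visible in the dumps.
  metrics {
    meter     => ["events"]
    add_tag   => "metric"
    add_field => { "[@metadata][self]" => "true" "type" => "metric" }
  }
}

output {
  # Main event stream: every event goes to SQS (events_received matches
  # events_filtered in the dump).
  sqs {
    queue   => "logstash"
    region  => "us-east-1"
    workers => 4
  }

  # Presumably guarded by conditionals on the metric events (the dump only
  # shows the plugin options, not the routing): a Consul health-check ping
  # and event-rate gauges.
  http {
    http_method => "get"
    url         => "http://127.0.0.1:8500/v1/agent/check/pass/service:beats:1"
  }
  statsd {
    host   => "localhost"
    port   => 9125
    sender => "i-8da97a9a"
    count  => { "logstash_shipper.logstash_events" => "%{[events][count]}" }
    gauge  => {
      "logstash_shipper.logstash_event_rate_1m"  => "%{[events][rate_1m]}"
      "logstash_shipper.logstash_event_rate_5m"  => "%{[events][rate_5m]}"
      "logstash_shipper.logstash_event_rate_15m" => "%{[events][rate_15m]}"
    }
  }
}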
joshuaspence commented 7 years ago

These are the plugins that we have installed:

> /opt/logstash/bin/logstash-plugin list --verbose
logstash-codec-collectd (2.0.4)
logstash-codec-dots (2.0.4)
logstash-codec-edn (2.0.4)
logstash-codec-edn_lines (2.0.4)
logstash-codec-es_bulk (2.0.4)
logstash-codec-fluent (2.0.4)
logstash-codec-graphite (2.0.4)
logstash-codec-json (2.1.4)
logstash-codec-json_lines (2.1.3)
logstash-codec-line (2.1.2)
logstash-codec-msgpack (2.0.4)
logstash-codec-multiline (2.0.11)
logstash-codec-netflow (2.1.1)
logstash-codec-oldlogstashjson (2.0.4)
logstash-codec-plain (2.0.4)
logstash-codec-rubydebug (2.0.7)
logstash-filter-anonymize (2.0.4)
logstash-filter-checksum (2.0.4)
logstash-filter-clone (2.0.6)
logstash-filter-csv (2.1.3)
logstash-filter-date (2.1.6)
logstash-filter-dns (2.1.3)
logstash-filter-drop (2.0.4)
logstash-filter-fingerprint (2.0.5)
logstash-filter-geoip (2.0.7)
logstash-filter-grok (2.0.5)
logstash-filter-json (2.0.6)
logstash-filter-kv (2.1.0)
logstash-filter-metrics (3.0.2)
logstash-filter-multiline (2.0.5)
logstash-filter-mutate (2.0.6)
logstash-filter-ruby (2.0.5)
logstash-filter-sleep (2.0.4)
logstash-filter-split (2.0.5)
logstash-filter-syslog_pri (2.0.4)
logstash-filter-throttle (2.0.4)
logstash-filter-urldecode (2.0.4)
logstash-filter-useragent (2.0.8)
logstash-filter-uuid (2.0.5)
logstash-filter-xml (2.2.0)
logstash-input-beats (3.1.12)
logstash-input-couchdb_changes (2.0.4)
logstash-input-elasticsearch (2.0.5)
logstash-input-eventlog (3.0.3)
logstash-input-exec (2.0.6)
logstash-input-file (2.2.5)
logstash-input-ganglia (2.0.6)
logstash-input-gelf (2.0.7)
logstash-input-generator (2.0.4)
logstash-input-graphite (2.0.7)
logstash-input-heartbeat (2.0.4)
logstash-input-http (2.2.3)
logstash-input-http_poller (2.1.0)
logstash-input-imap (2.0.5)
logstash-input-irc (2.0.5)
logstash-input-jdbc (3.1.0)
logstash-input-kafka (2.0.9)
logstash-input-log4j (2.0.7)
logstash-input-lumberjack (2.0.7)
logstash-input-pipe (2.0.4)
logstash-input-rabbitmq (4.1.0)
logstash-input-redis (2.0.6)
logstash-input-s3 (2.0.6)
logstash-input-snmptrap (2.0.4)
logstash-input-sqs (2.0.5)
logstash-input-stdin (2.0.4)
logstash-input-syslog (2.0.5)
logstash-input-tcp (3.0.6)
logstash-input-twitter (2.2.2)
logstash-input-udp (2.0.5)
logstash-input-unix (2.0.6)
logstash-input-xmpp (2.0.5)
logstash-input-zeromq (2.0.4)
logstash-output-cloudwatch (2.0.4)
logstash-output-csv (2.0.5)
logstash-output-elasticsearch (2.7.1)
logstash-output-email (3.0.5)
logstash-output-exec (2.0.5)
logstash-output-file (2.2.5)
logstash-output-ganglia (2.0.4)
logstash-output-gelf (2.0.5)
logstash-output-graphite (2.0.5)
logstash-output-hipchat (3.0.4)
logstash-output-http (2.1.3)
logstash-output-irc (2.0.4)
logstash-output-juggernaut (2.0.4)
logstash-output-kafka (2.0.5)
logstash-output-lumberjack (2.0.6)
logstash-output-nagios (2.0.4)
logstash-output-nagios_nsca (2.0.5)
logstash-output-null (2.0.4)
logstash-output-opentsdb (2.0.4)
logstash-output-pagerduty (2.0.4)
logstash-output-pipe (2.0.4)
logstash-output-rabbitmq (3.1.2)
logstash-output-redis (2.0.5)
logstash-output-s3 (2.0.7)
logstash-output-sns (3.0.4)
logstash-output-sqs (2.0.5)
logstash-output-statsd (2.0.7)
logstash-output-stdout (2.0.6)
logstash-output-tcp (2.0.4)
logstash-output-udp (2.0.4)
logstash-output-xmpp (2.0.4)
logstash-output-zeromq (2.1.0)
logstash-patterns-core (2.0.5)
joshuaspence commented 7 years ago

Here's another stack trace; this one has some additional information (the thread_info and stalling_threads_info sections are populated this time).

{
    "timestamp": "2016-12-13T01:16:31.197000-0500",
    "message": {
        "events_filtered": 2564057,
        "events_consumed": 2562562,
        "worker_count": 4,
        "inflight_count": 125,
        "worker_states": [
            {
                "status": "dead",
                "alive": false,
                "index": 0,
                "inflight_count": 0
            },
            {
                "status": "dead",
                "alive": false,
                "index": 1,
                "inflight_count": 0
            },
            {
                "status": "dead",
                "alive": false,
                "index": 2,
                "inflight_count": 0
            },
            {
                "status": "sleep",
                "alive": true,
                "index": 3,
                "inflight_count": 125
            }
        ],
        "output_info": [
            {
                "type": "http",
                "config": {
                    "http_method": "get",
                    "url": "http://127.0.0.1:8500/v1/agent/check/pass/service:beats:1",
                    "__ALLOW_ENV__": false
                },
                "is_multi_worker": false,
                "events_received": 767,
                "workers": [
                    "LogStash::Outputs::Http: {\"http_method\"=>\"get\", \"url\"=>\"http://127.0.0.1:8500/v1/agent/check/pass/service:beats:1\", \"codec\"=><LogStash::Codecs::Plain charset=>\"UTF-8\">, \"workers\"=>1, \"request_timeout\"=>60, \"socket_timeout\"=>10, \"connect_timeout\"=>10, \"follow_redirects\"=>true, \"pool_max\"=>50, \"pool_max_per_route\"=>25, \"keepalive\"=>true, \"automatic_retries\"=>1, \"retry_non_idempotent\"=>false, \"validate_after_inactivity\"=>200, \"ssl_certificate_validation\"=>true, \"keystore_type\"=>\"JKS\", \"truststore_type\"=>\"JKS\", \"cookies\"=>true, \"verify_ssl\"=>true, \"format\"=>\"json\"}"
                ],
                "busy_workers": 0
            },
            {
                "type": "sqs",
                "config": {
                    "queue": "logstash",
                    "region": "us-east-1",
                    "workers": 4,
                    "__ALLOW_ENV__": false
                },
                "is_multi_worker": true,
                "events_received": 2564057,
                "workers": [
                    "LogStash::Outputs::SQS: {\"queue\"=>\"logstash\", \"region\"=>\"us-east-1\", \"workers\"=>4, \"codec\"=><LogStash::Codecs::Plain charset=>\"UTF-8\">, \"use_ssl\"=>true, \"batch\"=>true, \"batch_events\"=>10, \"batch_timeout\"=>5, \"message_max_size\"=>262144, \"stop_on_fail\"=>false}",
                    "LogStash::Outputs::SQS: {\"queue\"=>\"logstash\", \"region\"=>\"us-east-1\", \"workers\"=>4, \"codec\"=><LogStash::Codecs::Plain charset=>\"UTF-8\">, \"use_ssl\"=>true, \"batch\"=>true, \"batch_events\"=>10, \"batch_timeout\"=>5, \"message_max_size\"=>262144, \"stop_on_fail\"=>false}",
                    "LogStash::Outputs::SQS: {\"queue\"=>\"logstash\", \"region\"=>\"us-east-1\", \"workers\"=>4, \"codec\"=><LogStash::Codecs::Plain charset=>\"UTF-8\">, \"use_ssl\"=>true, \"batch\"=>true, \"batch_events\"=>10, \"batch_timeout\"=>5, \"message_max_size\"=>262144, \"stop_on_fail\"=>false}",
                    "LogStash::Outputs::SQS: {\"queue\"=>\"logstash\", \"region\"=>\"us-east-1\", \"workers\"=>4, \"codec\"=><LogStash::Codecs::Plain charset=>\"UTF-8\">, \"use_ssl\"=>true, \"batch\"=>true, \"batch_events\"=>10, \"batch_timeout\"=>5, \"message_max_size\"=>262144, \"stop_on_fail\"=>false}"
                ],
                "busy_workers": 1
            },
            {
                "type": "statsd",
                "config": {
                    "count": {
                        "logstash_shipper.logstash_events": "%{[events][count]}"
                    },
                    "gauge": {
                        "logstash_shipper.logstash_event_rate_1m": "%{[events][rate_1m]}",
                        "logstash_shipper.logstash_event_rate_5m": "%{[events][rate_5m]}",
                        "logstash_shipper.logstash_event_rate_15m": "%{[events][rate_15m]}"
                    },
                    "host": "localhost",
                    "port": 9125,
                    "sender": "i-00957992",
                    "__ALLOW_ENV__": false
                },
                "is_multi_worker": false,
                "events_received": 1495,
                "workers": [
                    "LogStash::Outputs::Statsd: {\"count\"=>{\"logstash_shipper.logstash_events\"=>\"%{[events][count]}\"}, \"gauge\"=>{\"logstash_shipper.logstash_event_rate_1m\"=>\"%{[events][rate_1m]}\", \"logstash_shipper.logstash_event_rate_5m\"=>\"%{[events][rate_5m]}\", \"logstash_shipper.logstash_event_rate_15m\"=>\"%{[events][rate_15m]}\"}, \"host\"=>\"localhost\", \"port\"=>9125, \"sender\"=>\"i-00957992\", \"codec\"=><LogStash::Codecs::Plain charset=>\"UTF-8\">, \"workers\"=>1, \"namespace\"=>\"logstash\", \"increment\"=>[], \"decrement\"=>[], \"timing\"=>{}, \"set\"=>{}, \"sample_rate\"=>1, \"debug\"=>false}"
                ],
                "busy_workers": 0
            }
        ],
        "thread_info": [
            {
                "thread_id": 38,
                "name": "[main]>worker3",
                "plugin": [
                    "LogStash::Filters::Metrics",
                    {
                        "add_field": {
                            "[@metadata][self]": "true",
                            "type": "metric"
                        },
                        "add_tag": "metric",
                        "meter": [
                            "events"
                        ]
                    }
                ],
                "backtrace": [
                    "[...]/vendor/bundle/jruby/1.9/gems/stud-0.0.22/lib/stud/buffer.rb:151:in `sleep'",
                    "[...]/vendor/bundle/jruby/1.9/gems/stud-0.0.22/lib/stud/buffer.rb:151:in `buffer_receive'",
                    "[...]/vendor/local_gems/99462def/logstash-output-sqs/lib/logstash/outputs/sqs.rb:134:in `receive'",
                    "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.4.1-java/lib/logstash/outputs/base.rb:109:in `multi_receive'",
                    "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.4.1-java/lib/logstash/outputs/base.rb:109:in `each'",
                    "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.4.1-java/lib/logstash/outputs/base.rb:109:in `multi_receive'",
                    "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.4.1-java/lib/logstash/output_delegator.rb:130:in `worker_multi_receive'",
                    "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.4.1-java/lib/logstash/output_delegator.rb:129:in `worker_multi_receive'",
                    "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.4.1-java/lib/logstash/output_delegator.rb:114:in `multi_receive'",
                    "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.4.1-java/lib/logstash/pipeline.rb:301:in `output_batch'",
                    "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.4.1-java/lib/logstash/pipeline.rb:301:in `each'",
                    "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.4.1-java/lib/logstash/pipeline.rb:301:in `output_batch'",
                    "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.4.1-java/lib/logstash/pipeline.rb:232:in `worker_loop'",
                    "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.4.1-java/lib/logstash/pipeline.rb:201:in `start_workers'"
                ],
                "blocked_on": null,
                "status": "sleep",
                "current_call": "[...]/vendor/bundle/jruby/1.9/gems/stud-0.0.22/lib/stud/buffer.rb:151:in `sleep'"
            }
        ],
        "stalling_threads_info": [
            {
                "thread_id": 38,
                "name": "[main]>worker3",
                "plugin": [
                    "LogStash::Filters::Metrics",
                    {
                        "add_field": {
                            "[@metadata][self]": "true",
                            "type": "metric"
                        },
                        "add_tag": "metric",
                        "meter": [
                            "events"
                        ]
                    }
                ],
                "current_call": "[...]/vendor/bundle/jruby/1.9/gems/stud-0.0.22/lib/stud/buffer.rb:151:in `sleep'"
            }
        ]
    },
    "level": "warn"
}
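
The thread_info section seems to narrow it down: the one worker that is still alive ([main]>worker3) is sleeping inside Stud::Buffer#buffer_receive, called from the SQS output, so it appears to be blocked waiting for that output's internal batch buffer to drain. The batching options involved are the defaults visible in the dump, restated below with comments; this is not new configuration.

  sqs {
    queue         => "logstash"
    region        => "us-east-1"
    workers       => 4
    # Defaults from the worker dump above: events are buffered and flushed
    # to SQS in batches of 10, or every 5 seconds.
    batch         => true
    batch_events  => 10
    batch_timeout => 5
    # If flushes to SQS stall, buffer_receive blocks the pipeline worker
    # feeding it (my reading of Stud::Buffer's back-pressure behaviour,
    # based on the backtrace, not something shown directly in the dump).
  }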
colinsurprenant commented 7 years ago

@joshuaspence thanks for the report. Can you retrieve the Logstash error logs related to this? They will probably tell us why the workers are dying.

ph commented 7 years ago

A thread dump would also help in this case, so we can see where the workers are actually stuck. Your configuration file would help too, so we can see which plugins you are using.

joshuaspence commented 7 years ago

I'm not seeing this issue anymore.