Closed laurentluce closed 6 years ago
I tried reproducing this by taking the effective config, replacing TLS certificate paths, removing all data directory and log path keys as well as HiPE and backing queue module settings (which are all defaults):
[{amqp_client,[{prefer_ipv6,false},{ssl_options,[]}]},
{asn1,[]},
{autocluster,
[{autocluster_failure,ignore},
{autocluster_log_level,info},
{aws_autoscaling,true},
{backend,aws},
{cleanup_warn_only,false},
{cluster_cleanup,true}]},
{compiler,[]},
{cowboy,[]},
{cowlib,[]},
{crypto,[]},
{inets,[]},
{kernel,
[{error_logger,tty},
{inet_default_connect_options,[{nodelay,true}]},
{inet_dist_listen_max,25672},
{inet_dist_listen_min,25672}]},
{os_mon,
[{start_cpu_sup,false},
{start_disksup,false},
{start_memsup,false},
{start_os_sup,false}]},
{public_key,[]},
{rabbit,
[{auth_backends,[rabbit_auth_backend_internal]},
{auth_mechanisms,['PLAIN','AMQPLAIN']},
{background_gc_enabled,false},
{background_gc_target_interval,60000},
{channel_max,0},
{channel_operation_timeout,15000},
{cluster_keepalive_interval,10000},
{cluster_nodes,{[],disc}},
{cluster_partition_handling,autoheal},
{collect_statistics,coarse},
{collect_statistics_interval,60000},
{config_entry_decoder,
[{cipher,aes_cbc256},
{hash,sha512},
{iterations,1000},
{passphrase,undefined}]},
{credit_flow_default_credit,{400,200}},
{default_permissions,[<<".*">>,<<".*">>,<<".*">>]},
{default_user,<<"guest">>},
{default_user_tags,[administrator]},
{default_vhost,<<"/">>},
{delegate_count,16},
{disk_free_limit,50000000},
{disk_monitor_failure_retries,10},
{disk_monitor_failure_retry_interval,120000},
{fhc_read_buffering,false},
{fhc_write_buffering,true},
{frame_max,131072},
{halt_on_upgrade_failure,true},
{handshake_timeout,10000},
{heartbeat,60},
{hipe_compile,false},
{lazy_queue_explicit_gc_run_operation_threshold,1000},
{log_levels,[{connection,info},{autocluster,info}]},
{loopback_users,[<<"guest">>]},
{memory_monitor_interval,2500},
{mirroring_flow_control,true},
{mirroring_sync_batch_size,4096},
{mnesia_table_loading_retry_limit,10},
{mnesia_table_loading_retry_timeout,30000},
{msg_store_credit_disc_bound,{4000,800}},
{msg_store_file_size_limit,16777216},
{msg_store_index_module,rabbit_msg_store_ets_index},
{msg_store_io_batch_size,4096},
{num_ssl_acceptors,1},
{num_tcp_acceptors,10},
{password_hashing_module,rabbit_password_hashing_sha256},
{queue_explicit_gc_run_operation_threshold,1000},
{queue_index_embed_msgs_below,4096},
{queue_index_max_journal_entries,32768},
{reverse_dns_lookups,false},
{server_properties,[]},
{ssl_allow_poodle_attack,false},
{ssl_apps,[asn1,crypto,public_key,ssl]},
{ssl_cert_login_from,distinguished_name},
{ssl_handshake_timeout,5000},
{ssl_listeners,[]},
{ssl_options,[]},
{tcp_listen_options,
[{backlog,128},
{nodelay,true},
{linger,{true,0}},
{exit_on_close,false}]},
{tcp_listeners,[5672]},
{trace_vhosts,[]},
{vm_memory_high_watermark,0.75},
{vm_memory_high_watermark_paging_ratio,0.9}]},
{rabbit_common,[]},
{rabbitmq_management,
[{cors_allow_origins,[]},
{cors_max_age,1800},
{http_log_dir,none},
{listener,
[{port,15672},
{ssl,false},
{ssl_opts,
[
{cacertfile, "/Users/antares/Development/Opensource/tls-gen.git/basic/result/ca_certificate.pem"},
{certfile, "/Users/antares/Development/Opensource/tls-gen.git/basic/result/server_certificate.pem"},
{keyfile, "/Users/antares/Development/Opensource/tls-gen.git/basic/result/server_key.pem"}
]}]},
{load_definitions,none},
{management_db_cache_multiplier,5},
{process_stats_gc_timeout,300000},
{rates_mode,none},
{sample_retention_policies,
[{global,[{60,5}]},{basic,[{60,20}]},{detailed,[{5,5}]}]},
{stats_event_max_backlog,250}]},
{rabbitmq_management_agent,
[{rates_mode,basic},
{sample_retention_policies,
[{global,[{605,5},{3660,60},{29400,600},{86400,1800}]},
{basic,[{605,5},{3600,60}]},
{detailed,[{605,5}]}]}]},
{rabbitmq_stomp,
[{default_user,[{login,<<"guest">>},{passcode,<<"guest">>}]},
{default_vhost,<<"/">>},
{hide_server_info,false},
{implicit_connect,false},
{num_ssl_acceptors,1},
{num_tcp_acceptors,10},
{ssl_cert_login,false},
{ssl_listeners,[]},
{tcp_listen_options,[{backlog,128},{nodelay,true}]},
{tcp_listeners,[61613]},
{trailing_lf,true}]},
{rabbitmq_web_dispatch,[]},
{rabbitmq_web_stomp,
[{cowboy_opts,[]},
{num_ssl_acceptors,1},
{num_tcp_acceptors,10},
{port,15671},
{sockjs_opts,[]},
{ssl_config,
[{port,15674},
{backlog,1024},
{recbuf,32768},
{sndbuf,32768},
{cacertfile, "/Users/antares/Development/Opensource/tls-gen.git/basic/result/ca_certificate.pem"},
{certfile, "/Users/antares/Development/Opensource/tls-gen.git/basic/result/server_certificate.pem"},
{keyfile, "/Users/antares/Development/Opensource/tls-gen.git/basic/result/server_key.pem"}]},
{tcp_config,[]},
{use_http_auth,true},
{ws_frame,text}]},
{ranch,[]},
{sasl,[{errlog_type,error},{sasl_error_logger,false}]},
{sockjs,[]},
{ssl,[{protocol_version,['tlsv1.2','tlsv1.1',tlsv1]}]},
{stdlib,[]},
{syntax_tools,[]},
{xmerl,[]}].
I cannot reproduce the issue.
@laurentluce can you share a Dockerfile or a VM image that reproduces this, or at least your config file? Does the problem happen if you don't use rabbitmq-autocluster 0.6.1
(that plugin hijacks RabbitMQ boot process, which can lead to plugin initialization issues such as missing ETS tables).
@michaelklishin So far, I did not see the issue without the autocluster plugin, running one node only. In clustering mode, I am using the latest autocluster plugin from https://github.com/rhoegg/rabbitmq-autocluster/releases. Do you recommend using a different version?
@laurentluce see https://github.com/rabbitmq/rabbitmq-autocluster#current-maintainers. 3 members of our team have worked on rabbitmq-autocluster since 3.6.7 and I don't remember this coming up.
@dcorbacho @Gsantomaggio do you? ^^^
@michaelklishin I haven't seen that with the latest releases
me neither
@michaelklishin I am going to try with rabbitmq-autocluster 0.7.0 and see how it works.
@laurentluce does this have any observable effect on the management plugin or is it just log noise?
@michaelklishin I started seeing issues with connections not reported by the management plugin (/api/connections). I would bring up a new web-stomp client and its connection would not always be reported even when the connection was alive. This is when I tried to reset the mgmt DB and you pointed me to Erlang 20 not being supported. After that, I switched to erlang-rpm 19.3.4 and continued seeing the error log messages described above.
@michaelklishin I ran some tests with 2 to 4 nodes + rabbitmq-autocluster 0.7.0 and I see the badarg error on about 50% of the nodes. The first error usually happens after the autocluster random startup pause so it seems to match your hypothesis.
=INFO REPORT==== 9-Jun-2017::00:17:55 ===
node : rabbit@ip-10-0-90-119
home dir : /var/lib/rabbitmq
config file(s) : /etc/rabbitmq/rabbitmq.config
cookie hash : svshrPKw+J4rCUxya03Kog==
log : /var/log/rabbitmq/rabbit@ip-10-0-90-119.log
sasl log : /var/log/rabbitmq/rabbit@ip-10-0-90-119-sasl.log
database dir : /var/lib/rabbitmq/mnesia/rabbit@ip-10-0-90-119
=INFO REPORT==== 9-Jun-2017::00:17:55 ===
autocluster: Delaying startup for 1173ms.
=ERROR REPORT==== 9-Jun-2017::00:17:56 ===
** Generic server channel_exchange_metrics_metrics_collector terminating
** Last message in was collect_metrics
** When Server state == {state,channel_exchange_metrics,5000,
{[{60,20}],[{5,5}],[{60,5}]},
none,
#Fun<rabbit_mgmt_metrics_collector.0.6133563>,
#Fun<rabbit_mgmt_metrics_collector.1.6133563>,
{dict,0,16,16,8,80,48,
{[],[],[],[],[],[],[],[],[],[],[],[],[],
[],[],[]},
{{[],[],[],[],[],[],[],[],[],[],[],[],[],
[],[],[]}}}}
** Reason for termination ==
** {badarg,
[{ets,safe_fixtable,[channel_exchange_metrics,true],[]},
{ets,foldl,3,[{file,"ets.erl"},{line,571}]},
{rabbit_mgmt_metrics_collector,aggregate_metrics,2,
[{file,"src/rabbit_mgmt_metrics_collector.erl"},{line,158}]},
{rabbit_mgmt_metrics_collector,handle_info,2,
[{file,"src/rabbit_mgmt_metrics_collector.erl"},{line,102}]},
{gen_server,try_dispatch,4,[{file,"gen_server.erl"},{line,601}]},
{gen_server,handle_msg,5,[{file,"gen_server.erl"},{line,667}]},
{proc_lib,init_p_do_apply,3,[{file,"proc_lib.erl"},{line,247}]}]}
@michaelklishin I ran more tests, this time with 3.6.6 (instead of 3.6.10) and rabbitmq-autocluster 0.7.0. I see fewer errors: one instead of 5-6. The error is seen when the autocluster plugin starts:
=INFO REPORT==== 11-Jun-2017::19:02:16 ===
autocluster: Delaying startup for 2163ms.
=ERROR REPORT==== 11-Jun-2017::19:02:18 ===
** Generic server rabbit_mgmt_external_stats terminating
** Last message in was emit_update
** When Server state == {state,65536,
[{{io_file_handle_open_attempt,count},10},
{{io_file_handle_open_attempt,time},468},
{{io_read,bytes},1},
{{io_read,count},1},
{{io_read,time},344},
{{io_reopen,count},0},
{{io_seek,count},0},
{{io_seek,time},0},
{{io_sync,count},0},
{{io_sync,time},0},
{{io_write,bytes},0},
{{io_write,count},0},
{{io_write,time},0},
{{mnesia_disk_tx,count},12},
{{mnesia_ram_tx,count},41},
{{msg_store_read,count},0},
{{msg_store_write,count},0},
{{queue_index_journal_write,count},0},
{{queue_index_read,count},0},
{{queue_index_write,count},0}],
{set,1,16,16,8,80,48,
{[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],
[]},
{{[],[],[],[],[],[],[],[],[],[],[],
[{'rabbitmq-cli-45@ip-10-0-26-122',
<0.583.0>}],
[],[],[],[]}}},
undefined}
** Reason for termination ==
** {badarg,
[{ets,match_object,[file_handle_cache_stats,'_'],[]},
{ets,tab2list,1,[{file,"ets.erl"},{line,710}]},
{file_handle_cache_stats,get,0,
[{file,"src/file_handle_cache_stats.erl"},{line,59}]},
{rabbit_mgmt_external_stats,update_state,1,
[{file,"src/rabbit_mgmt_external_stats.erl"},{line,381}]},
{rabbit_mgmt_external_stats,emit_update,1,
[{file,"src/rabbit_mgmt_external_stats.erl"},{line,359}]},
{rabbit_mgmt_external_stats,handle_info,2,
[{file,"src/rabbit_mgmt_external_stats.erl"},{line,347}]},
{gen_server,try_dispatch,4,[{file,"gen_server.erl"},{line,601}]},
{gen_server,handle_msg,5,[{file,"gen_server.erl"},{line,667}]}]}
3.6.6 uses a different data model for stats, so this is not really an indication of anything.
rabbitmq-autocluster hijacks the boot process in a way that creates a race condition in plugin initialization.
On Sun, Jun 11, 2017 at 10:13 PM, Laurent Luce notifications@github.com wrote:
@michaelklishin https://github.com/michaelklishin I ran more tests this time with 3.6.6 and rabbitmq-autocluster 0.7.0. I see less errors. I see one instead of 5-6. It is again around the time the autocluster plugin starts:
=INFO REPORT==== 11-Jun-2017::19:02:16 === autocluster: Delaying startup for 2163ms.
=ERROR REPORT==== 11-Jun-2017::19:02:18 === Generic server rabbit_mgmt_external_stats terminating Last message in was emit_update When Server state == {state,65536, [{{io_file_handle_open_attempt,count},10}, {{io_file_handle_open_attempt,time},468}, {{io_read,bytes},1}, {{io_read,count},1}, {{io_read,time},344}, {{io_reopen,count},0}, {{io_seek,count},0}, {{io_seek,time},0}, {{io_sync,count},0}, {{io_sync,time},0}, {{io_write,bytes},0}, {{io_write,count},0}, {{io_write,time},0}, {{mnesia_disk_tx,count},12}, {{mnesia_ram_tx,count},41}, {{msg_store_read,count},0}, {{msg_store_write,count},0}, {{queue_index_journal_write,count},0}, {{queue_index_read,count},0}, {{queue_index_write,count},0}], {set,1,16,16,8,80,48, {[],[],[],[],[],[],[],[],[],[],[],[],[],[],[], []}, {{[],[],[],[],[],[],[],[],[],[],[], [{'rabbitmq-cli-45@ip-10-0-26-122', <0.583.0>}], [],[],[],[]}}}, undefined} Reason for termination == ** {badarg, [{ets,match_object,[file_handle_cachestats,''],[]}, {ets,tab2list,1,[{file,"ets.erl"},{line,710}]}, {file_handle_cache_stats,get,0, [{file,"src/file_handle_cache_stats.erl"},{line,59}]}, {rabbit_mgmt_external_stats,update_state,1, [{file,"src/rabbit_mgmt_external_stats.erl"},{line,381}]}, {rabbit_mgmt_external_stats,emit_update,1, [{file,"src/rabbit_mgmt_external_stats.erl"},{line,359}]}, {rabbit_mgmt_external_stats,handle_info,2, [{file,"src/rabbit_mgmt_external_stats.erl"},{line,347}]}, {gen_server,try_dispatch,4,[{file,"gen_server.erl"},{line,601}]}, {gen_server,handle_msg,5,[{file,"gen_server.erl"},{line,667}]}]}
— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/rabbitmq/rabbitmq-management/issues/411#issuecomment-307650060, or mute the thread https://github.com/notifications/unsubscribe-auth/AAAEQlWKpWtMa40ZtpwG4OM1861rro_xks5sDDxZgaJpZM4NwB26 .
-- MK
Staff Software Engineer, Pivotal/RabbitMQ
@michaelklishin Do you recommend delaying the rabbitmq-autocluster initialization even further to avoid this issue?
rabbitmq-autocluster won't start RabbitMQ until after the delay has passed, so it shouldn't matter. You can definitely try.
On Sat, Jun 17, 2017 at 1:15 AM, Laurent Luce notifications@github.com wrote:
@michaelklishin https://github.com/michaelklishin Do you recommend delaying even more the rabbitmq-autocluster initialization to avoid this issue?
— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/rabbitmq/rabbitmq-management/issues/411#issuecomment-309146785, or mute the thread https://github.com/notifications/unsubscribe-auth/AAAEQj-iXZzagO7QSGMnzSlzVGm8daMQks5sEv6GgaJpZM4NwB26 .
-- MK
Staff Software Engineer, Pivotal/RabbitMQ
This is no longer an issue we see mentioned, possibly because peer discovery is now a core feature and therefore plugin initialisation order/race conditions no longer apply.
RMQ 3.6.10 Erlang 19.3.4