Residue under large specifications

HW-lj commented 5 years ago

# keepalived -v
Keepalived v2.0.6 (02/03,2019), git commit +

Copyright(C) 2001-2019 Alexandre Cassen, <acassen@gmail.com>

Built with kernel headers for Linux 4.19.25
Running on Linux 4.19.25-vhulk1902.3.1.h113.aarch64 #1 SMP Tue Feb 12 00:00:00 UTC 2019

configure options: 

Config options:  LVS VRRP VRRP_AUTH OLD_CHKSUM_COMPAT FIB_ROUTING

System options:  PIPE2 SIGNALFD INOTIFY_INIT1 VSYSLOG IPV4_DEVCONF LIBNL3 RTA_ENCAP RTA_EXPIRES RTA_NEWDST RTA_PREF FRA_SUPPRESS_PREFIXLEN FRA_SUPPRESS_IFGROUP FRA_TUN_ID RTAX_CC_ALGO RTAX_QUICKACK RTEXT_FILTER_SKIP_STATS FRA_L3MDEV FRA_UID_RANGE RTAX_FASTOPEN_NO_COOKIE RTA_VIA FRA_OIFNAME FRA_PROTOCOL FRA_IP_PROTO FRA_SPORT_RANGE FRA_DPORT_RANGE IFA_FLAGS IP_MULTICAST_ALL LWTUNNEL_ENCAP_MPLS LWTUNNEL_ENCAP_ILA LIBIPTC_LINUX_NET_IF_H_COLLISION LIBIPVS_NETLINK IPVS_DEST_ATTR_ADDR_FAMILY IPVS_SYNCD_ATTRIBUTES IPVS_64BIT_STATS VRRP_VMAC SOCK_NONBLOCK SOCK_CLOEXEC O_PATH GLOB_BRACE INET6_ADDR_GEN_MODE SO_MARK SCHED_RT SCHED_RESET_ON_FORK

1. Configure 300 virtual servers, each with 100 real servers.
2. Start keepalived and wait until all the real servers are online.
3. Stop keepalived and check `ipvsadm -ln` or `cat /proc/net/ip_vs`; you will see that some virtual servers (some of them still with real servers) remain.

As far as I can see, the reason is that the checker process cannot shut down its services in time, so it is killed by the main keepalived process in the function sigend in keepalived/core/main.c.

The following script can be used to generate a large configuration:

#!/usr/bin/env bash
# NOTE: this script uses bash-only features ([[ ]], arrays, local, RANDOM),
# so it has to be interpreted by bash, not by a strict POSIX sh.

# num_to_ip NUM NET
#   Print the address that has index NUM inside the network prefix NET.
#   NET containing a dot (e.g. "22.22") is treated as an IPv4 prefix:
#     third octet  = NUM / 256 + 1, fourth octet = NUM % 256
#   Any other NET (e.g. "33") is treated as an IPv6 prefix:
#     "NET::<hex(NUM / 65536 + 1)>:<hex(NUM % 65536)>"
#   No range checking is done on NUM.
num_to_ip()
{
    local test_num=$1
    local test_net=$2
    local part_3 part_4

    if [[ ${test_net} == *.* ]]; then
        part_3=$(( test_num / 256 + 1 ))
        part_4=$(( test_num % 256 ))
        echo "${test_net}.${part_3}.${part_4}"
    else
        # The last two groups of the IPv6 address are printed in hex.
        printf -v part_3 '%x' "$(( test_num / 65536 + 1 ))"
        printf -v part_4 '%x' "$(( test_num % 65536 ))"
        echo "${test_net}::${part_3}:${part_4}"
    fi
}

# ip_to_num ADDR
#   Inverse of num_to_ip: print the index encoded in the last two fields of
#   ADDR.  Fields are separated by '.', ':' or '|' (the same separator set the
#   original awk expression used).
#     IPv6 (ADDR contains ':'): (hex(f3) - 1) * 65536 + hex(f4)
#     IPv4                    : (f3 - 1) * 256 + f4
#   Implemented with shell arithmetic only, so neither awk nor bc is needed.
#   Returns 1 (and prints a message on stderr, nothing on stdout) when the
#   last two fields are not valid numbers for the address family.
ip_to_num()
{
    local test_ip=$1
    local head part_3 part_4
    local hex_re='^[0-9a-fA-F]+$'
    local dec_re='^[0-9]+$'

    # Last field, then the last field of what remains once it is removed.
    part_4=${test_ip##*[|.:]}
    head=${test_ip%[|.:]*}
    part_3=${head##*[|.:]}

    if [[ ${test_ip} == *:* ]]; then
        if ! [[ ${part_3} =~ ${hex_re} && ${part_4} =~ ${hex_re} ]]; then
            echo "ip_to_num: cannot parse address '${test_ip}'" >&2
            return 1
        fi
        echo $(( (16#${part_3} - 1) * 65536 + 16#${part_4} ))
    else
        if ! [[ ${part_3} =~ ${dec_re} && ${part_4} =~ ${dec_re} ]]; then
            echo "ip_to_num: cannot parse address '${test_ip}'" >&2
            return 1
        fi
        # 10# forces decimal so that octets with leading zeros are not
        # interpreted as octal.
        echo $(( (10#${part_3} - 1) * 256 + 10#${part_4} ))
    fi
}

# ak_read_key KEY
#   Report whether KEY occurs anywhere (plain substring match) in the global
#   OTHER_ARGS string.
#   Found:     prints "KEY"  and returns 0.
#   Not found: prints "#KEY" and returns 1.
ak_read_key()
{
    local wanted=$1

    if [[ "${OTHER_ARGS}" == *"${wanted}"* ]]; then
        echo "${wanted}"
        return 0
    fi

    echo "#${wanted}"
    return 1
}
# ak_read_pair KEY [DEFAULT]
#   Look for a "KEY=VALUE" word in the global, whitespace-separated
#   OTHER_ARGS list and print the setting as "KEY VALUE".
#     - explicitly assigned            -> prints "KEY VALUE",    returns 0
#     - not assigned, DEFAULT given    -> prints "KEY DEFAULT",  returns 0
#     - not assigned, no DEFAULT       -> prints "#KEY XXX",     returns 1
#   The word must START with "KEY=", so e.g. key "retry" no longer picks up
#   "delay_before_retry=3".  Only the first '=' separates key from value.
ak_read_pair()
{
    local tmp_key=${1}
    local def_val=${2}
    local tmp_pair

    # Word splitting of OTHER_ARGS is intentional: it is a plain
    # space-separated list of key=value words.
    for tmp_pair in ${OTHER_ARGS}
    do
        ## explicitly assigned
        if [[ ${tmp_pair} == "${tmp_key}="* ]]; then
            echo "${tmp_key} ${tmp_pair#*=}"
            return 0
        fi
    done

    ## not explicitly assigned but has default value
    if [[ -n ${def_val} ]]; then
        echo "${tmp_key} ${def_val}"
        return 0
    fi

    ## not explicitly assigned and no default value
    echo "#${tmp_key} XXX"
    return 1
}

# ak_add_checker RIP
#   Print the health-checker stanza for one real server, selected by the
#   global HC_TYPE (tcp | http | ssl | misc); any other type prints nothing.
#   tcp/http/ssl stanzas carry connect_port ${HC_PORT} plus the
#   connect_timeout, delay_before_retry, retry and bindto settings obtained
#   from ak_read_pair (random defaults for the first three).
#   RIP is only used by the misc checker's command line.
#   The emitted indentation matches the nesting inside a real_server block.
ak_add_checker()
{
    local target_ip=$1
    local pad='            '
    local closing='        }'
    local shared_opts

    # Settings shared by the TCP, HTTP and SSL checkers.
    shared_opts="connect_port ${HC_PORT}"
    shared_opts+=$'\n'"${pad}$(ak_read_pair connect_timeout $(( RANDOM % 1000 + 1 )))"
    shared_opts+=$'\n'"${pad}$(ak_read_pair delay_before_retry $(( RANDOM % 10 + 1 )))"
    shared_opts+=$'\n'"${pad}$(ak_read_pair retry $(( RANDOM % 3 + 1 )))"
    shared_opts+=$'\n'"${pad}$(ak_read_pair bindto)"

    case ${HC_TYPE} in
    tcp)
        printf '%s\n' \
            'TCP_CHECK {' \
            "${pad}${shared_opts}" \
            "${closing}"
        ;;
    http|ssl)
        # HTTP_GET and SSL_GET share the same body; only the keyword differs.
        local keyword=HTTP_GET
        if [[ ${HC_TYPE} == ssl ]]; then
            keyword=SSL_GET
        fi
        printf '%s\n' \
            "${keyword} {" \
            "${pad}url {" \
            "${pad}    path /" \
            "${pad}    status_code 200" \
            "${pad}}" \
            "${pad}${shared_opts}" \
            "${closing}"
        ;;
    misc)
        printf '%s\n' \
            'MISC_CHECK {' \
            "${pad}misc_path \"/root/misc_check.sh ${target_ip} ${HC_PORT} 10 5 3 3 > /dev/null && exit 0 || exit 1\"" \
            "${pad}misc_timeout 15" \
            "${pad}misc_dynamic" \
            "${closing}"
        ;;
    esac
}
# ak_add_rs RIP
#   Print one "real_server RIP ${RIP_PORT} { ... }" stanza containing a
#   weight setting (from ak_read_pair) and the checker from ak_add_checker.
#   The random default weight is drawn from 1..10; per the keepalived
#   maintainer's comment on this script, 0 is not a valid weight here.
ak_add_rs()
{
    local rip=${1}
    echo "
    real_server ${rip} ${RIP_PORT} {
        $(ak_read_pair weight $(( RANDOM % 10 + 1 )))
        $(ak_add_checker "${rip}")
    }"
}

# ak_add_vs VIP
#   Print one complete "virtual_server VIP ${VIP_PORT} { ... }" block on
#   stdout, with a real_server stanza (via ak_add_rs) for every index from
#   RIP_START to RIP_END inside RIP_NET.
#   Reads globals: VIP_PORT PROTO LB_KIND RIP_START RIP_END RIP_NET.
#   The quorum_up command uses /usr/bin/dpip when /run/dpvs.pid exists and
#   /usr/sbin/ip otherwise.
#   The random default persistence_timeout is drawn from 1..120; per the
#   keepalived maintainer's comment on this script, 0 is not valid.
ak_add_vs()
{
    local vip=${1}
    local ip_tool=/usr/sbin/ip
    local nr rip

    if [ -f /run/dpvs.pid ]; then
        ip_tool=/usr/bin/dpip
    fi

    echo "
virtual_server ${vip} ${VIP_PORT} {
    protocol ${PROTO}
    lb_kind ${LB_KIND}
    lb_algo rr
    $(ak_read_pair delay_loop $(( RANDOM % 10 + 1 )))
    $(ak_read_pair persistence_timeout $(( RANDOM % 120 + 1 )))
    alpha
    hysteresis 0
    quorum 1
    quorum_up \"${ip_tool} addr add ${vip} $(ak_read_pair dev lo)\"
    #quorum_down \"${ip_tool} addr del ${vip} $(ak_read_pair dev lo)\"
    $(for (( nr = RIP_START; nr <= RIP_END; nr++ )); do
        rip=$(num_to_ip "${nr}" "${RIP_NET}")
        ak_add_rs "${rip}"
    done)
}"
}

# ak_sts_conf
#   Generate one configuration file per virtual server index
#   VIP_START..VIP_END into the directory CONF_DIR (created if missing).
#   File name:
#     <5-digit index>_<PROTO>_<vip>_<VIP_PORT>_<LB_KIND>_<HC_TYPE>_<HC_PORT>.conf
#   Each file is produced by a background ak_add_vs job; the function
#   returns once all background jobs have finished.
#   Returns 1 without generating anything if CONF_DIR cannot be created.
ak_sts_conf()
{
    local nv vip tmp_conf

    # Quoted so that directory names containing spaces work.
    if ! mkdir -p -- "${CONF_DIR}"; then
        echo "ak_sts_conf: cannot create directory '${CONF_DIR}'" >&2
        return 1
    fi

    for (( nv = VIP_START; nv <= VIP_END; nv++ )); do
        vip=$(num_to_ip "${nv}" "${VIP_NET}")
        printf -v tmp_conf '%s/%.5d_%s_%s_%s_%s_%s_%s.conf' \
            "${CONF_DIR}" "${nv}" "${PROTO}" "${vip}" "${VIP_PORT}" \
            "${LB_KIND}" "${HC_TYPE}" "${HC_PORT}"
        # One background job per virtual server (no limit on parallelism).
        ak_add_vs "${vip}" > "${tmp_conf}" &
    done
    wait
}

## default args
CONF_DIR=sts

VIP_START=1
VIP_END=1
VIP_NET=44
VIP_PORT=22

PROTO=TCP
LB_KIND=DR
# LG_NAME and CIP_NET are not referenced by the generator functions of this
# script; they are kept so the set of defaults stays unchanged.
LG_NAME=lg_v6
CIP_NET=22

RIP_START=1
RIP_END=1
RIP_NET=33
RIP_PORT=22

HC_TYPE=tcp
HC_PORT=22

# _ak_set_range PREFIX SPEC
#   Helper for -v / -r.  SPEC has the form <first-address>_<port>_<count>.
#   Sets the globals PREFIX_NET, PREFIX_PORT, PREFIX_START and PREFIX_END
#   (PREFIX is VIP or RIP):
#     NET   = address without its last two groups
#     START = ip_to_num <first-address>
#     END   = START + count - 1
_ak_set_range()
{
    local prefix=$1
    local spec=$2
    local -a fields
    local addr net first count

    IFS=_ read -r -a fields <<< "${spec}"
    addr=${fields[0]}
    count=${fields[2]}

    if [[ ${addr} == *:* ]]; then
        net=${addr%::*}
    else
        net=${addr%.*.*}
    fi
    first=$(ip_to_num "${addr}")

    printf -v "${prefix}_NET" '%s' "${net}"
    printf -v "${prefix}_PORT" '%s' "${fields[1]}"
    printf -v "${prefix}_START" '%s' "${first}"
    printf -v "${prefix}_END" '%s' "$(( first + count - 1 ))"
}

# ak_parse_args ARGS...
#   Update the defaults above from the command line:
#     -d | --dir DIR                       output directory
#     -v | --vip_port_num ADDR_PORT_COUNT  virtual servers
#     -r | --rip_port_num ADDR_PORT_COUNT  real servers per virtual server
#     -h | --hck_port TYPE_PORT            health checker type and port
#     -p | --protocol PROTO
#     -k | --lb_kind KIND
#   Parsing stops at the first word that is not one of these options; that
#   word and everything after it is stored, space-joined, in OTHER_ARGS.
#   Returns 2 if an option is given without its value (the original loop
#   never terminated in that case because "shift 2" failed).
ak_parse_args()
{
    local -a arr

    while (( $# > 0 )); do
        # First make sure this is a known option and that it has a value.
        case "$1" in
            -d | --dir | -v | --vip_port_num | -r | --rip_port_num | \
            -h | --hck_port | -p | --protocol | -k | --lb_kind )
                if (( $# < 2 )); then
                    echo "option '$1' requires a value" >&2
                    return 2
                fi
                ;;
            * )
                break ;;
        esac

        case "$1" in
            -d | --dir )
                CONF_DIR=$2 ;;
            -v | --vip_port_num )
                _ak_set_range VIP "$2" ;;
            -r | --rip_port_num )
                _ak_set_range RIP "$2" ;;
            -h | --hck_port )
                IFS=_ read -r -a arr <<< "$2"
                HC_TYPE=${arr[0]}
                HC_PORT=${arr[1]}
                ;;
            -p | --protocol )
                PROTO=$2 ;;
            -k | --lb_kind )
                LB_KIND=$2 ;;
        esac
        shift 2
    done

    OTHER_ARGS="$*"
}

## update args by input
ak_parse_args "$@" || exit $?
# OTHER_ARGS="$* alpha omega dev=eth1 bindto=33::5"

ak_sts_conf

Usage: sh ak.sh -d /etc/keepalived/conf -v 22.22.1.1_80_300 -r 22.22.221.1_80_100 -h tcp_80. Please contact me right away if you have any questions about it : )

HW-lj commented 5 years ago

It seems something is wrong: the version I am using apparently does not support such large configurations. If we just wait for a while the process does die, but the rules are still not cleared. I will try the newest version to see whether the problem still occurs.

pqarmitage commented 5 years ago

The range of values for weight and persistence_timeout are not correct in the script, since 0 is not valid for either of them; the minimum values are 1 in both cases.

By default, keepalived will wait 5 seconds for each of the child processes to terminate. If they have not terminated within that time, they are sent a SIGKILL signal, and that would then cause the checker process not to remove any remaining virtual/real servers. The default 5 seconds can be overridden by specifying in the configuration child_wait_time 20 for example. This is specified at the outermost level of the config, e.g.

child_wait_time 20

global_defs {
   ...
}

I have tested your configuration with the latest commit of keepalived, and with 300 virtual servers, each with 100 real servers, the checker process terminates on my system in 2 seconds, and so does not get sent a SIGKILL by the parent process, and consequently all the virtual and real servers are removed.

HW-lj commented 5 years ago

> ... configuration with the latest commit of keepalived, and with 300 virtual servers, each with 100 real servers, the checker process terminates on my system in 2 seconds, and so does not get sent a SIGKILL by the parent process, and consequently all the virtual and real servers are removed.

Yes, when I use the newest version it works well, so I have closed the issue.

pqarmitage commented 5 years ago

@HW-lj Are you experiencing a problem with trying to run 30,000 TCP checkers at the same time? On my system, I run out of file descriptors.

HW-lj commented 5 years ago

@pqarmitage Indeed; when I tested it, I had raised the default epoll fd limit to 500,000.

HW-lj commented 5 years ago

Hi, I have recently encountered a tricky problem. Using the latest commit of keepalived, with 500 virtual servers, each with 100 real servers, we start keepalived and wait until all the real servers are online. We then reload it and stop it immediately. Maybe that sounds crazy, but I'm sure some bad guys will do this : ) I found that the checker process is busy in migrate_checkers, so I think we should improve the function migrate_checkers.

HW-lj commented 5 years ago

> @HW-lj Are you experiencing a problem with trying to run 30,000 TCP checkers at the same time? On my system, I run out of file descriptors.

Hello, I have now updated keepalived to version 2.0.15, and I have no idea how to change the epoll fd limit. I tried changing the rlimit in the code, but it does not seem to work. Could you give me some advice?

pqarmitage commented 5 years ago

@HW-lj It isn't only the epoll fd limit that needs changing; the max open files limit needs to be changed too. The vrrp process handles this for large numbers of vrrp instances - see set_vrrp_max_fds() in vrrp_daemon.c.

You will need to work out the maximum number of tcp_checkers that can be active at any one time (or keep a count) and set/increase the RLIMIT_NOFILE value accordingly for the checker process.

HW-lj commented 5 years ago

> @HW-lj It isn't only the epoll fd limit that needs changing; the max open files limit needs to be changed too. The vrrp process handles this for large numbers of vrrp instances - see set_vrrp_max_fds() in vrrp_daemon.c.
>
> You will need to work out the maximum number of tcp_checkers that can be active at any one time (or keep a count) and set/increase the RLIMIT_NOFILE value accordingly for the checker process.

OK, thank you very much. I will try it right now.

HW-lj commented 5 years ago

> @HW-lj It isn't only the epoll fd limit that needs changing; the max open files limit needs to be changed too. The vrrp process handles this for large numbers of vrrp instances - see set_vrrp_max_fds() in vrrp_daemon.c.
>
> You will need to work out the maximum number of tcp_checkers that can be active at any one time (or keep a count) and set/increase the RLIMIT_NOFILE value accordingly for the checker process.

Correct me if I am wrong, but with the following patch it still does not seem to work well. Although the sockets for tcp_check are created, the 10,000 TCP checkers are still not available.

diff --git a/keepalived/check/check_daemon.c b/keepalived/check/check_daemon.c
index 49c49b3..dd7715b 100644
--- a/keepalived/check/check_daemon.c
+++ b/keepalived/check/check_daemon.c
@@ -70,6 +70,7 @@
 #ifdef _WITH_CN_PROC_
 #include "track_process.h"
 #endif
+#define EPOLL_MAX_FD 500016

 /* Global variables */
 bool using_ha_suspend;
@@ -520,6 +521,7 @@ start_check_child(void)
 #ifndef _DEBUG_
        pid_t pid;
        char *syslog_ident;
+       struct rlimit limit;

        /* Initialize child process */
 #ifdef ENABLE_LOG_TO_FILE
@@ -604,6 +606,13 @@ start_check_child(void)
                log_message(LOG_INFO, "Healthcheck child process: cannot write pidfile");
                exit(KEEPALIVED_EXIT_FATAL);
        }
+       /* set fd ulimits  */
+       limit.rlim_cur = EPOLL_MAX_FD;
+       limit.rlim_max = EPOLL_MAX_FD;
+       if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
+               log_message(LOG_INFO, "epoll_init: set limit fd to %d failed.", EPOLL_MAX_FD);
+                               exit(KEEPALIVED_EXIT_FATAL);
+       }

        /* Create the new master thread */
        thread_destroy_master(master);  /* This destroys any residual settings from the parent */

the following is the keepalived -v:

# keepalived -v
Keepalived v2.0.6 (07/23,2018)

Copyright(C) 2001-2018 Alexandre Cassen, <acassen@gmail.com>

Built with kernel headers for Linux 4.18.16
Running on Linux 4.19.36-vhulk1904.3.1.h226.eulerosv2r8.aarch64 #1 SMP Mon Apr 1 00:00:00 UTC 2019

configure options: --build=aarch64-Huawei-linux-gnu --host=aarch64-Huawei-linux-gnu --program-prefix= --disable-dependency-tracking --prefix=/usr --exec-prefix=/usr --bindir=/usr/bin --sbindir=/usr/sbin --sysconfdir=/etc --datadir=/usr/share --includedir=/usr/include --libdir=/usr/lib64 --libexecdir=/usr/libexec --localstatedir=/var --sharedstatedir=/var/lib --mandir=/usr/share/man --infodir=/usr/share/info --enable-snmp --enable-snmp-rfc --enable-sha1 --with-init=systemd build_alias=aarch64-Huawei-linux-gnu host_alias=aarch64-Huawei-linux-gnu PKG_CONFIG_PATH=:/usr/lib64/pkgconfig:/usr/share/pkgconfig CFLAGS=-O2 -g -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fexceptions -fstack-protector-strong -grecord-gcc-switches -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -fasynchronous-unwind-tables -fstack-clash-protection LDFLAGS=-Wl,-z,relro -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld

Config options:  LIBIPSET_DYNAMIC LVS VRRP VRRP_AUTH OLD_CHKSUM_COMPAT FIB_ROUTING SNMP_V3_FOR_V2 SNMP_VRRP SNMP_CHECKER SNMP_RFCV2 SNMP_RFCV3

System options:  PIPE2 SIGNALFD INOTIFY_INIT1 VSYSLOG IPV4_DEVCONF LIBNL3 RTA_ENCAP RTA_EXPIRES RTA_NEWDST RTA_PREF FRA_SUPPRESS_PREFIXLEN FRA_SUPPRESS_IFGROUP FRA_TUN_ID RTAX_CC_ALGO RTAX_QUICKACK RTEXT_FILTER_SKIP_STATS FRA_L3MDEV FRA_UID_RANGE RTAX_FASTOPEN_NO_COOKIE RTA_VIA FRA_OIFNAME FRA_PROTOCOL FRA_IP_PROTO FRA_SPORT_RANGE FRA_DPORT_RANGE IFA_FLAGS IP_MULTICAST_ALL LWTUNNEL_ENCAP_MPLS LWTUNNEL_ENCAP_ILA LIBIPTC LIBIPVS_NETLINK IPVS_DEST_ATTR_ADDR_FAMILY IPVS_SYNCD_ATTRIBUTES IPVS_64BIT_STATS VRRP_VMAC SOCK_NONBLOCK SOCK_CLOEXEC O_PATH GLOB_BRACE INET6_ADDR_GEN_MODE SO_MARK SCHED_RT SCHED_RESET_ON_FORK

pqarmitage commented 5 years ago

I am a bit confused about the numbers here. You refer to having 500 virtual servers, each with 100 real servers, and each real server has 1 TCP_CHECKer, based on the output of the ak.sh script above. However, you seem to be increasing the number of available file descriptors to 500,000, whereas 500 * 100 is only 50,000. You then refer to 10,000 TCP_CHECKers.

Could you please clarify exactly how many virtual servers, real servers and tcp checkers you are configuring. If you are still using the ak.sh script above to generate your configuration, then providing the parameters you give to the script would be sufficient (and indeed preferable).

HW-lj commented 5 years ago

Emmm, now my configuration is 100 virtual servers, each with 100 real servers, and each real server has 1 TCP_CHECKer. I increased the number of available file descriptors to 500,000. In this situation the real servers cannot all be online together; it seems that when the number online reaches about 2,000, some checkers fail and are then removed.

HW-lj commented 5 years ago

You can refer to the command: sh ak.sh -d /etc/keepalived/conf -v 22.22.1.1_80_100 -r 22.22.221.1_80_100 -h tcp_80 -k NAT

pqarmitage commented 5 years ago

I will test this and see what I observe.

Could you please post a copy of your top level configuration file. I assume it has a line like: include /etc/keepalived/conf/*.conf but I need to see what else you have in it.

HW-lj commented 5 years ago

[2019-06-28 09:35:37] root@localhost ~ 
# cat /etc/keepalived/keepalived.conf

! Configuration File for keepalived

include /etc/keepalived/conf/*.conf

Except for this, there is nothing.

pqarmitage commented 5 years ago

I have applied your patch, but rather than using 500016 as the max file descriptor, I have used 10240, and that works for me. I can see (I have applied a patch to log the fd number after connect if it is successfully opened) that the maximum fd that is used is 9959 (some tcp_checks completed before it had completed starting all the checks).

With the patch applied, I no longer get `TCP connect fail to create socket. Rescheduling.` error messages, which I was getting after fd 1023 had been opened.

The patch may not be working for you if the value in /proc/sys/fs/nr_open is less than 500016, since in that case the setrlimit() call will fail, and you should see the log message `epoll_init: set limit fd to 500016 failed.`

pqarmitage commented 5 years ago

I think we probably need to try and estimate the max open fds that the checker process could need (probably also need to do so for the vrrp process), and then set RLIMIT_NOFILE appropriately.

For the checker process, we need to count at least the number of tcp, http, dns and smtp checkers and allow one fd for each of those, plus a little overhead.

For the vrrp process, it already allows 2 fds per vrrp instance. With the overhead allowed, that may be sufficient (I think only 1 file for track_files can be open at a time for example).

pqarmitage commented 5 years ago

Commit b7e66c4 makes the checker process increase the open file limit to allow for one open file for each TCP, HTTP/SSL, DNS and SMTP checker, and also for each smtp_alert. Commits fd456e4 and 31f6a28 tidy up the original commit, and commit cdfff2a adds an allowance of 1 open file for each smtp_alert in the vrrp process.

This means that your configurations with 10,000, 30,000 and 50,000 tcp checkers should no longer have problems running all the checkers.

HW-lj commented 5 years ago

Thank you for your timely help. I have now found the cause of my problem: the httpd service on my real servers was not working well. After I replaced it with an nginx server, everything works.

acassen / keepalived

Residue under large specifications #1173