@vedarthk are you sure it's not segfaulting before the new processes are created? I don't see any configuration that would reload workers on a 500.
@xrmx the segmentation faults start when more requests hit the process; there was no segmentation fault in the strace output itself. Here is some more strace output from just before the clone:
socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 13
connect(13, {sa_family=AF_INET, sin_port=htons(3306), sin_addr=inet_addr("10.210.91.53")}, 16) = 0
fcntl(13, F_SETFL, O_RDONLY) = 0
fcntl(13, F_GETFL) = 0x2 (flags O_RDWR)
setsockopt(13, SOL_SOCKET, SO_RCVTIMEO, "\2003\341\1\0\0\0\0\0\0\0\0\0\0\0\0", 16) = 0
setsockopt(13, SOL_SOCKET, SO_SNDTIMEO, "\2003\341\1\0\0\0\0\0\0\0\0\0\0\0\0", 16) = 0
setsockopt(13, SOL_IP, IP_TOS, [8], 4) = 0
setsockopt(13, SOL_TCP, TCP_NODELAY, [1], 4) = 0
setsockopt(13, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
read(13, "N\0\0\0\n5.6.34-log\0v\24&\222VSz.s|5/\0\377\377\10"..., 16384) = 82
stat("/usr/share/mysql/charsets/Index.xml", 0x7ffe656c71d0) = -1 ENOENT (No such file or directory)
futex(0x7fb102d3eb68, FUTEX_WAKE_PRIVATE, 2147483647) = 0
write(13, "^\0\0\1\17\242\17\0\0\0\0@\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 98) = 98
read(13, "\7\0\0\2\0\0\0\2\0\0\0", 16384) = 11
poll([{fd=13, events=POLLIN|POLLPRI}], 1, 0) = 0 (Timeout)
write(13, "\17\0\0\0\3SET NAMES utf8", 19) = 19
read(13, "\7\0\0\1\0\0\0\2\0\0\0", 16384) = 11
poll([{fd=13, events=POLLIN|POLLPRI}], 1, 0) = 0 (Timeout)
write(13, "\21\0\0\0\3set autocommit=0", 21) = 21
read(13, "\7\0\0\1\0\0\0\0\0\0\0", 16384) = 11
poll([{fd=13, events=POLLIN|POLLPRI}], 1, 0) = 0 (Timeout)
write(13, "\21\0\0\0\3set autocommit=1", 21) = 21
read(13, "\7\0\0\1\0\0\0\2\0\0\0", 16384) = 11
gettimeofday({1551102471, 681536}, NULL) = 0
poll([{fd=13, events=POLLIN|POLLPRI}], 1, 0) = 0 (Timeout)
write(13, "\32\0\0\0\3SELECT @@SQL_AUTO_IS_NULL", 30) = 30
read(13, "\1\0\0\1\1(\0\0\2\3def\0\0\0\22@@SQL_AUTO_IS_N"..., 16384) = 73
stat("/opt/envs/app/local/lib/python2.7/site-packages/MySQLdb/warnings", 0x7ffe656c6110) = -1 ENOENT (No such file or directory)
open("/opt/envs/app/local/lib/python2.7/site-packages/MySQLdb/warnings.x86_64-linux-gnu.so", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/opt/envs/app/local/lib/python2.7/site-packages/MySQLdb/warnings.so", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/opt/envs/app/local/lib/python2.7/site-packages/MySQLdb/warningsmodule.so", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/opt/envs/app/local/lib/python2.7/site-packages/MySQLdb/warnings.py", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/opt/envs/app/local/lib/python2.7/site-packages/MySQLdb/warnings.pyc", O_RDONLY) = -1 ENOENT (No such file or directory)
gettimeofday({1551102471, 682573}, NULL) = 0
gettimeofday({1551102471, 682654}, NULL) = 0
gettimeofday({1551102471, 682806}, NULL) = 0
poll([{fd=13, events=POLLIN|POLLPRI}], 1, 0) = 0 (Timeout)
write(13, "\21\0\0\0\3SELECT VERSION()", 21) = 21
read(13, "\1\0\0\1\1\37\0\0\2\3def\0\0\0\tVERSION()\0\f!\0\36\0"..., 16384) = 73
gettimeofday({1551102471, 683330}, NULL) = 0
gettimeofday({1551102471, 683408}, NULL) = 0
write(13, "\1\0\0\0\1", 5) = 5
shutdown(13, SHUT_RDWR) = 0
close(13) = 0
gettimeofday({1551102471, 683941}, NULL) = 0
stat("/etc/resolv.conf", {st_mode=S_IFREG|0644, st_size=173, ...}) = 0
open("/etc/hosts", O_RDONLY|O_CLOEXEC) = 13
fstat(13, {st_mode=S_IFREG|0644, st_size=274, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fb108b45000
read(13, "127.0.0.1 appserver-ip-10-210-71"..., 4096) = 274
read(13, "", 4096) = 0
close(13) = 0
munmap(0x7fb108b45000, 4096) = 0
stat("/etc/resolv.conf", {st_mode=S_IFREG|0644, st_size=173, ...}) = 0
socket(PF_INET, SOCK_DGRAM|SOCK_NONBLOCK, IPPROTO_IP) = 13
connect(13, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.210.0.2")}, 16) = 0
gettimeofday({1551102471, 685002}, NULL) = 0
poll([{fd=13, events=POLLOUT}], 1, 0) = 1 ([{fd=13, revents=POLLOUT}])
sendmmsg(13, {{{msg_name(0)=NULL, msg_iov(1)=[{"\v\321\1\0\0\1\0\0\0\0\0\0\23mysqldb"..., 78}], msg_controllen=0, msg_flags=MSG_TRUNC|MSG_WAITALL|MSG_SYN|MSG_MORE|MSG_FASTOPEN|MSG_CMSG_CLOEXEC|0x56c0010}, 78}, {{msg_name(0)=NULL, msg_iov(1)=[{"\302X\1\0\0\1\0\0\0\0\0\0\23mysqldb"..., 78}], msg_controllen=0, msg_flags=MSG_CTRUNC|MSG_TRUNC|MSG_WAITALL|MSG_FIN|MSG_CONFIRM|MSG_RST|MSG_ERRQUEUE|MSG_NOSIGNAL|0x5bc0010}, 78}}, 2, MSG_NOSIGNAL) = 2
poll([{fd=13, events=POLLIN}], 1, 5000) = 1 ([{fd=13, revents=POLLIN}])
ioctl(13, FIONREAD, [94]) = 0
recvfrom(13, "\v\321\201\200\0\1\0\1\0\0\0\0\23mysqldb"..., 2048, 0, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.210.0.2")}, [16]) = 94
gettimeofday({1551102471, 685659}, NULL) = 0
poll([{fd=13, events=POLLIN}], 1, 4999) = 1 ([{fd=13, revents=POLLIN}])
ioctl(13, FIONREAD, [160]) = 0
recvfrom(13, "\302X\201\200\0\1\0\0\0\1\0\0\23mysqldb"..., 65536, 0, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.210.0.2")}, [16]) = 160
close(13) = 0
socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 13
connect(13, {sa_family=AF_INET, sin_port=htons(3306), sin_addr=inet_addr("10.210.91.53")}, 16) = 0
fcntl(13, F_SETFL, O_RDONLY) = 0
fcntl(13, F_GETFL) = 0x2 (flags O_RDWR)
setsockopt(13, SOL_SOCKET, SO_RCVTIMEO, "\2003\341\1\0\0\0\0\0\0\0\0\0\0\0\0", 16) = 0
setsockopt(13, SOL_SOCKET, SO_SNDTIMEO, "\2003\341\1\0\0\0\0\0\0\0\0\0\0\0\0", 16) = 0
setsockopt(13, SOL_IP, IP_TOS, [8], 4) = 0
setsockopt(13, SOL_TCP, TCP_NODELAY, [1], 4) = 0
setsockopt(13, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
read(13, "N\0\0\0\n5.6.34-log\0w\24&\222Vf~hnpf1\0\377\377\10"..., 16384) = 82
write(13, "^\0\0\1\17\242\17\0\0\0\0@\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 98) = 98
read(13, "\7\0\0\2\0\0\0\2\0\0\0", 16384) = 11
poll([{fd=13, events=POLLIN|POLLPRI}], 1, 0) = 0 (Timeout)
write(13, "\17\0\0\0\3SET NAMES utf8", 19) = 19
read(13, "\7\0\0\1\0\0\0\2\0\0\0", 16384) = 11
poll([{fd=13, events=POLLIN|POLLPRI}], 1, 0) = 0 (Timeout)
write(13, "\21\0\0\0\3set autocommit=0", 21) = 21
read(13, "\7\0\0\1\0\0\0\0\0\0\0", 16384) = 11
poll([{fd=13, events=POLLIN|POLLPRI}], 1, 0) = 0 (Timeout)
write(13, "\21\0\0\0\3set autocommit=1", 21) = 21
read(13, "\7\0\0\1\0\0\0\2\0\0\0", 16384) = 11
gettimeofday({1551102471, 688454}, NULL) = 0
poll([{fd=13, events=POLLIN|POLLPRI}], 1, 0) = 0 (Timeout)
write(13, "x\0\0\0\3SELECT `django_site`.`id`, "..., 124) = 124
read(13, "\1\0\0\1\3<\0\0\2\3def\fapp\vdjang"..., 16384) = 263
gettimeofday({1551102471, 689037}, NULL) = 0
gettimeofday({1551102471, 689119}, NULL) = 0
brk(0x5c0a000) = 0x5c0a000
brk(0x5c3a000) = 0x5c3a000
gettimeofday({1551102471, 689993}, NULL) = 0
gettimeofday({1551102471, 690109}, NULL) = 0
clone(child_stack=0x7fb0f452afb0, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0x7fb0f452b9d0, tls=0x7fb0f452b700, child_tidptr=0x7fb0f452b9d0) = 1094
futex(0x23489d0, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x452a460, FUTEX_WAIT_PRIVATE, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7fb0ec0013e0, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x23489d0, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0x23489d0, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x23489d0, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x23489d0, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x23489d0, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x23489d0, FUTEX_WAIT_PRIVATE, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x23489d0, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x23489d0, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x23489d0, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x23489d0, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x23489d0, FUTEX_WAIT_PRIVATE, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable)
gettimeofday({1551102471, 691884}, NULL) = 0
gettimeofday({1551102471, 691999}, NULL) = 0
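For context, the database traffic in the trace above (the SET NAMES utf8, the autocommit toggles, SELECT @@SQL_AUTO_IS_NULL and SELECT VERSION()) looks like the normal connection setup MySQLdb/Django perform per request; a minimal sketch that produces roughly the same sequence, with placeholder credentials not taken from this issue:

# Sketch only: placeholder credentials, assumes the MySQLdb package
# seen in the trace is installed.
import MySQLdb

conn = MySQLdb.connect(
    host="10.210.91.53",            # DB host seen in the trace
    user="app", passwd="secret",    # hypothetical credentials
    db="app",
    charset="utf8",                 # triggers the "SET NAMES utf8" write
    connect_timeout=10,
)
conn.autocommit(False)              # "set autocommit=0"
conn.autocommit(True)               # "set autocommit=1"
cur = conn.cursor()
cur.execute("SELECT @@SQL_AUTO_IS_NULL")
cur.execute("SELECT VERSION()")
print(cur.fetchone())
conn.close()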
Adding htop screenshots for more information, in case that helps:
Before any exception in the Django application:
After an exception is thrown in Django:
From the first ps output, the new processes were children of the master, not of a previous worker. Please check /proc directly, e.g. grep Pid /proc/PIDNUMBER/status
@xrmx from the status of the processes it looks like the parent process ID is that of the master; here is the output:
$ grep Pid /proc/10167/status
Pid: 10167
PPid: 10134
TracerPid: 0
$ grep Pid /proc/10137/status
Pid: 10137
PPid: 10134
TracerPid: 0
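For reference, the same check can be scripted; a small sketch, assuming the Linux /proc layout, that prints Pid, PPid and Tgid for each given ID (a Tgid different from Pid would indicate a thread of an existing process rather than a forked child):

# Sketch: read /proc/<id>/status and print the fields relevant to
# telling a forked child apart from a thread of an existing worker.
import sys

def proc_status(pid):
    fields = {}
    with open("/proc/%s/status" % pid) as f:
        for line in f:
            key, _, value = line.partition(":")
            fields[key] = value.strip()
    return fields

for pid in sys.argv[1:]:            # e.g. python check_pid.py 10167 10137
    s = proc_status(pid)
    print("%s: Pid=%s PPid=%s Tgid=%s Threads=%s" % (
        pid, s.get("Pid"), s.get("PPid"), s.get("Tgid"), s.get("Threads")))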
So I am not sure what to suggest except to update your distribution and make your application not segfault.
@xrmx there are no segmentation faults in the Django application itself. Whenever there is a 500 internal server error in the Django application, new processes are created, and over time uWSGI segfaults because it is not able to allocate the required amount of memory.
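One way to confirm that growth over time is to sample the workers' memory and thread counts from /proc; a rough sketch (worker PIDs passed on the command line, Linux /proc assumed):

# Sketch: periodically print RSS and thread count for the given uWSGI
# worker PIDs, to correlate growth with 500 responses.
import os, sys, time

def sample(pid):
    rss_kb = 0
    with open("/proc/%d/status" % pid) as f:
        for line in f:
            if line.startswith("VmRSS:"):
                rss_kb = int(line.split()[1])    # value is in kB
    threads = len(os.listdir("/proc/%d/task" % pid))
    return rss_kb, threads

pids = [int(p) for p in sys.argv[1:]]
while True:
    for pid in pids:
        rss_kb, threads = sample(pid)
        print("%s pid=%d rss=%d kB threads=%d" % (
            time.strftime("%H:%M:%S"), pid, rss_kb, threads))
    time.sleep(5)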
@vedarthk did you try to check this problem on uwsgi==2.0.17.1?
Versions:
UWSGI commandline options:
Whenever any exception is thrown in Django, the uWSGI worker process creates another child process. I was able to verify this after 2 requests were made and an exception was thrown in the Python code:
output:
master process ID: 1055
child process IDs: 1057, 1058
processes forked/cloned after 500 exceptions: 1094, 1098
I ran strace on the child process and was able to verify that the process/thread with PID 1094 was cloned after a 500 exception from the Django app. This behaviour leads to growing memory consumption; eventually the machine runs out of memory, and after that point there are segmentation faults because of insufficient memory.
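For reproduction, any view that raises an unhandled exception is enough to produce the 500 described above; a hypothetical example, not taken from the application in this issue:

# Hypothetical view used only to trigger a 500 response; any unhandled
# exception in a Django view has the same effect.
from django.http import HttpResponse

def ok(request):
    return HttpResponse("ok")

def boom(request):
    raise RuntimeError("intentional 500 for reproduction")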