michael-arion opened this issue 10 years ago
Could you

- Run the node in debug mode, then try to collapse, and provide that log output
- Run arakoon --dump-store on head.db.clone after it was created
- Calculate the MD5 checksum of head.db and head.db.clone and check whether these are the same

root@CCAUMAN3:~# vi /opt/qbase5/cfg/qconfig/arakoon/marketplace/marketplace.cfg
root@CCAUMAN3:~# grep -C3 github /opt/qbase5/cfg/qconfig/arakoon/marketplace/marketplace.cfg
ip = 10.74.8.132
client_port = 7081
messaging_port = 10001
log_level = debug
log_dir = /opt/qbase5/var/log/arakoon/Marketplace
home = /opt/ssd/arakoon/Marketplace
root@CCAUMAN3:~# ps -ef | grep daemonize
root 18780 5105 0 15:31 pts/14 00:00:00 grep --color=auto daemonize
root 21738 1 0 May07 ? 00:16:45 arakoon --node arakoon_3 -config /opt/qbase5/cfg/qconfig/arakoon/arakoon/arakoon.cfg -daemonize
root 23138 1 0 May07 ? 00:49:33 arakoon --node Marketplace_3 -config /opt/qbase5/cfg/qconfig/arakoon/marketplace/marketplace.cfg -daemonize
root 23999 1 0 May07 ? 00:11:35 arakoon --node Marketplacefe_3 -config /opt/qbase5/cfg/qconfig/arakoon/marketplacefe/marketplacefe.cfg -daemonize
root@CCAUMAN3:~# kill -USR1 23138

=> now in debug mode. Dump-store from before collapse:

root@CCAUMAN3:~# arakoon --dump-store /opt/tlogs/Marketplace/head.db
i: Some ("99279767")
master: Some(Marketplace2,0)
routing : --
interval: {(,),(,_)}
root@CCAUMAN3:~# ls -ltrh /opt/tlogs/Marketplace/
total 3.7G
-rw-r--r-- 1 root root 7.4M 2014-05-13 03:20 992.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-13 12:31 993.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-13 21:37 994.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-14 06:48 995.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-14 15:54 996.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-15 01:04 997.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-15 10:11 998.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-15 19:17 999.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-16 04:27 1000.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-16 13:34 1001.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-16 22:40 1002.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-17 07:51 1003.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-17 16:56 1004.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-18 02:07 1005.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-18 11:17 1006.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-18 20:23 1007.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-19 05:31 1008.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-19 14:36 1009.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-19 23:46 1010.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-20 08:56 1011.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-20 18:02 1012.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-21 03:10 1013.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-21 12:20 1014.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-21 21:26 1015.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-22 06:36 1016.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-22 15:42 1017.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-23 00:58 1018.tlf
-rw-r--r-- 1 root root 1.8G 2014-05-23 03:03 head.db.clone
-rw-r--r-- 1 root root 7.4M 2014-05-23 10:09 1019.tlf
-rw-r--r-- 1 root root 1.8G 2014-05-23 15:33 head.db
-rw-r--r-- 1 root root 27M 2014-05-23 15:34 1020.tlog
root@CCAUMAN3:~#
=> I run the remote-collapse on the node itself (ip 10.74.8.132):

root@CCAUMAN3:~# arakoon --collapse-remote marketplace 10.74.8.132 7081 3
arakoon: main: remote_collapsing_failed: Failure("unknown failure (EC: 255)")
root@CCAUMAN3:~# ls -ltrh /opt/tlogs/Marketplace/
total 3.7G
-rw-r--r-- 1 root root 7.4M 2014-05-13 03:20 992.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-13 12:31 993.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-13 21:37 994.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-14 06:48 995.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-14 15:54 996.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-15 01:04 997.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-15 10:11 998.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-15 19:17 999.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-16 04:27 1000.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-16 13:34 1001.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-16 22:40 1002.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-17 07:51 1003.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-17 16:56 1004.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-18 02:07 1005.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-18 11:17 1006.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-18 20:23 1007.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-19 05:31 1008.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-19 14:36 1009.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-19 23:46 1010.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-20 08:56 1011.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-20 18:02 1012.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-21 03:10 1013.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-21 12:20 1014.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-21 21:26 1015.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-22 06:36 1016.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-22 15:42 1017.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-23 00:58 1018.tlf
-rw-r--r-- 1 root root 7.4M 2014-05-23 10:09 1019.tlf
-rw-r--r-- 1 root root 1.8G 2014-05-23 15:33 head.db
-rw-r--r-- 1 root root 1.8G 2014-05-23 15:37 head.db.clone
-rw-r--r-- 1 root root 27M 2014-05-23 15:38 1020.tlog
=> now turning off debug mode again:

root@CCAUMAN3:~# vi /opt/qbase5/cfg/qconfig/arakoon/marketplace/marketplace.cfg
root@CCAUMAN3:~# ps -ef | grep daemonize
root 21738 1 0 May07 ? 00:16:45 arakoon --node arakoon_3 -config /opt/qbase5/cfg/qconfig/arakoon/arakoon/arakoon.cfg -daemonize
root 23138 1 0 May07 ? 00:49:42 arakoon --node Marketplace_3 -config /opt/qbase5/cfg/qconfig/arakoon/marketplace/marketplace.cfg -daemonize
root 23835 5105 0 15:38 pts/14 00:00:00 grep --color=auto daemonize
root 23999 1 0 May07 ? 00:11:35 arakoon --node Marketplacefe_3 -config /opt/qbase5/cfg/qconfig/arakoon/marketplacefe/marketplacefe.cfg -daemonize
root@CCAUMAN3:~# kill -USR1 23138
root@CCAUMAN3:~# grep "May 23 15" /opt/qbase5/var/log/arakoon/Marketplace/Marketplace_3.log > arakoon_Marketplace_debug.log
root@CCAUMAN3:~# less arakoon_Marketplace_debug.log
root@CCAUMAN3:~# arakoon --dump-store /opt/tlogs/Marketplace/head.db
i: Some ("99279767")
master: Some(Marketplace2,0)
routing : --
interval: {(,),(,_)}
root@CCAUMAN3:~# arakoon --dump-store /opt/tlogs/Marketplace/head.db.clone
i: Some ("99279767")
master: Some(Marketplace2,0)
routing : --
interval: {(,),(,_)}
root@CCAUMAN3:~# md5sum /opt/tlogs/Marketplace/head.db
d125bdf8ebc5d651e97d47b7ad753fba  /opt/tlogs/Marketplace/head.db
root@CCAUMAN3:~# md5sum /opt/tlogs/Marketplace/head.db.clone
d125bdf8ebc5d651e97d47b7ad753fba  /opt/tlogs/Marketplace/head.db.clone
root@CCAUMAN3:~#
Hi Nicolas,

Attached is the debug logfile from during the collapse-remote.

Regards,
Michael Van Wesenbeeck
Belgacom Cloud Services Operations
On 23 May 2014, at 12:10, Nicolas Trangez notifications@github.com wrote:
Could you

- Run the node in debug mode, then try to collapse, and provide that log output
- Run arakoon --dump-store on head.db.clone after it was created
- Calculate the MD5 checksum of head.db and head.db.clone and check whether these are the same
GitHub Issues doesn't keep e-mail attachments. Could you post it as a Gist?
Excuse my noobness: https://gist.github.com/michael-arion/d14c51cc1bc27e3f1d70
Could you provide the output of arakoon --version and tcbmgr inform /path/to/head.db?
The version we had before the upgrade:

root@CCAUMAN3:~# arakoon --version
version: 1.6.5
git_revision: "1.6.5-0-ga879267-dirty"
compiled: "01/10/2013 15:47:27 UTC"
machine: "ubuntu 3.0.0-32-generic x86_64 x86_64 x86_64 GNU/Linux"
tlogEntriesPerFile: 100000
dependencies:
camltc 0.7.0 OCaml bindings for tokyo cabinet
lwt 2.4.3 A cooperative threads library for OCaml
ounit 2.0.0 Unit testing framework loosely based on HUnit. It is similar to JUnit, and other XUnit testing frameworks
The current version:

root@CCAUMAN3:~# arakoon --version
version: 1.6.10
git_revision: "tags/1.6.10-0-ged8263b"
compiled: "04/04/2014 11:46:38 UTC"
machine: "jenkins-arakoon-git-build-and-package-18 3.5.0-42-generic x86_64 x86_64 x86_64 GNU/Linux"
tlogEntriesPerFile: 100000
dependencies:
camltc 0.9.0 Tokyo Cabinet bindings for OCaml.
lwt 2.4.4 A cooperative threads library for OCaml
ounit 2.0.0 Unit testing framework loosely based on HUnit. It is similar to JUnit, and other XUnit testing frameworks
and the tcbmgr inform:

root@CCAUMAN3:~# /opt/qbase5/apps/arakoon/bin/tcbmgr inform /opt/tlogs/Marketplace/head.db
path: /opt/tlogs/Marketplace/head.db
database type: btree
additional flags:
comparison function: lexical
max leaf member: 128
max node member: 256
leaf number: 151647
node number: 954
bucket number: 229373
alignment: 256
free block pool: 1024
inode number: 917749
modified time: 2014-05-23T14:46:00+01:00
options: large
record number: 218798
file size: 1827966976
For reference: should look into https://github.com/toolslive/camltc/commit/a28c7033ae819f38bed44db4ad27987b1754dc40 and check whether tcbdbsetmutex could fail without setting an error code on the bdb reference for some reason.
Since camltc 0.8.1, tcbdbsetmutex is called whenever a database is created (https://github.com/toolslive/camltc/commit/a28c7033ae819f38bed44db4ad27987b1754dc40). This call, as implemented in TokyoCabinet, can fail without setting any error code, see https://github.com/Incubaid/tokyocabinet/blob/2eed757b82bd2854c729ad5f4c8cc82169f4bc51/tcbdb.c#L212: when pthread_rwlock_init or pthread_mutex_init fails, the procedure returns false without calling tcbdbsetecode.
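To illustrate why this failure mode is so hard to see from the outside, here is a minimal, hypothetical C sketch (not Arakoon or camltc code, just an assumed standalone program against the TokyoCabinet API): when tcbdbsetmutex returns false because one of the pthread_*_init calls failed, tcbdbsetecode was never invoked, so the error code still reads as "success".

/* Hypothetical standalone example; build with: gcc demo.c -ltokyocabinet */
#include <tcbdb.h>
#include <stdio.h>

int main(void) {
    TCBDB *bdb = tcbdbnew();

    /* Enable the internal mutex, as camltc >= 0.8.1 does when a database is created. */
    if (!tcbdbsetmutex(bdb)) {
        /* If pthread_rwlock_init/pthread_mutex_init failed, tcbdbsetecode was never
           called, so the code reported here can still be the "success" code. */
        int ecode = tcbdbecode(bdb);
        fprintf(stderr, "tcbdbsetmutex failed, ecode=%d (%s)\n",
                ecode, tcbdberrmsg(ecode));
    }

    tcbdbdel(bdb);
    return 0;
}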
Whilst this might explain what you're experiencing, it's hard to know why these calls would fail on your system. According to the man pages, EAGAIN, ENOMEM or EPERM are most likely (EBUSY and EINVAL are not applicable).
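If you want a quick, independent check of the machine itself, a tiny standalone test (hypothetical, not part of Arakoon) that initialises the same primitives tcbdbsetmutex uses would at least tell you whether they fail in a fresh process, and with which error value:

/* Hypothetical standalone check; build with: gcc check.c -lpthread */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    pthread_rwlock_t rwlock;
    pthread_mutex_t mutex;
    int rc;

    /* pthread_*_init return the error number directly (0 on success). */
    rc = pthread_rwlock_init(&rwlock, NULL);
    if (rc != 0)
        printf("pthread_rwlock_init failed: %d (%s)\n", rc, strerror(rc));
    else
        pthread_rwlock_destroy(&rwlock);

    rc = pthread_mutex_init(&mutex, NULL);
    if (rc != 0)
        printf("pthread_mutex_init failed: %d (%s)\n", rc, strerror(rc));
    else
        pthread_mutex_destroy(&mutex);

    return 0;
}

Note that a clean run only shows the primitives work in a trivial process; a failure such as ENOMEM could still be specific to the state of the long-running node.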
What do you suggest to debug this further? Run a node under strace? Thx.
It's not necessarily a system call going wrong, so strace might not help. I'd rather run the node under gdb, break in tcbdbsetmutex, then step through it to check the value of err after the 2 pthread_*_init calls. Note you might enter tcbdbsetmutex a couple of times (at startup and after sending the collapse command).
Something like
$ gdb ./arakoon.native
...
Reading symbols from ./arakoon.native...done.
(gdb) break tcbdbsetmutex
Breakpoint 1 at 0x71d950: file tcbdb.c, line 212.
(gdb) run --node arakoon_0
Starting program: /home/nicolas/Projects/arakoon/arakoon.native --node arakoon_0
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[New Thread 0x7ffff1451700 (LWP 2789)]
[New Thread 0x7ffff0c50700 (LWP 2790)]
Breakpoint 1, tcbdbsetmutex (bdb=0xb1fe20) at tcbdb.c:212
212 bool tcbdbsetmutex(TCBDB *bdb){
...
(gdb) next
215 if(bdb->mmtx || bdb->open){
(gdb)
219 TCMALLOC(bdb->mmtx, sizeof(pthread_rwlock_t));
(gdb)
220 TCMALLOC(bdb->cmtx, sizeof(pthread_mutex_t));
(gdb)
222 if(pthread_rwlock_init(bdb->mmtx, NULL) != 0) err = true;
(gdb)
223 if(pthread_mutex_init(bdb->cmtx, NULL) != 0) err = true;
(gdb) p err
$1 = false
(gdb) next
222 if(pthread_rwlock_init(bdb->mmtx, NULL) != 0) err = true;
(gdb)
223 if(pthread_mutex_init(bdb->cmtx, NULL) != 0) err = true;
(gdb)
224 if(err){
(gdb) next
231 return tchdbsetmutex(bdb->hdb);
(gdb) next
232 }
(gdb) continue
Continuing.
Note you should adjust the arguments passed to run (but don't pass -daemonize) according to the way you launch Arakoon. You might get slightly different output than above, so it's better to print err more often.
Hi,

We have a 3-node Arakoon cluster that was recently upgraded to version 1.6.10. Remote collapsing is not working on it.

Output of dump-store of the head.db:

I have all tlogs starting from 992.tlf:

Still, I get an error when trying a collapse-remote call:

This is the relevant info in the arakoon server log during the collapse-remote client call:

When checking the files in my tlog dir, I see that the copy of head.db to head.db.clone is done, but that the head.db is untouched.

Thx for looking into this.
Michael Van Wesenbeeck