ICGC-TCGA-PanCancer / store-and-forward-workflow

A workflow that transfers data.
0 stars 0 forks source link

Workflow use of gt-download-upload-wrapper hangs indefinitely #12

Open denis-yuen opened 9 years ago

denis-yuen commented 9 years ago

The 1.0.9 version of the workflow hangs indefinitely on the s28_GNOS_download_4.sh step.

This appears to be a bug with the gt-download or vcf-uploader components for this version.

To duplicate:

1) Build and then unzip the workflow on a worker (or a development machine with seqware installed and valid gnos keys).
2) Run the workflow with sample data using `seqware bundle launch --dir <your workflow directory> --no-metadata --ini <an ini with the new parameter defined ... or not, it hangs either way>`.
3) In a different terminal, go to the shared_workspace/downloads folder. You'll find that the test data (177MB) is downloaded successfully. However, the call to s28_GNOS_download_4.sh hangs indefinitely.
4) Use strace to see that the process waits forever:

dyuen@odl-dyuen:/usr/tmp/oozie/oozie-6bd4cb54-a4ed-4f6d-ac93-fc97c66586ef/shared_workspace/downloads$ ps aux | grep perl
dyuen     9568  0.0  0.0 116292  8088 pts/13   Sl+  14:13   0:00 docker run -v /usr/tmp/oozie/oozie-6bd4cb54-a4ed-4f6d-ac93-fc97c66586ef/shared_workspace/downloads:/workflow_data -v /home/dyuen/.gnos/gnos.pem:/gnos_icgc_keyfile.pem pancancer/pancancer_upload_download:1.5 /bin/bash -c cd /workflow_data/ && perl -I /opt/gt-download-upload-wrapper/gt-download-upload-wrapper-2.0.13/lib /opt/vcf-uploader/vcf-uploader-2.0.7/gnos_download_file.pl --url https://gtrepo-dkfz.annailabs.com//cghub/data/analysis/download/a90ba420-1c47-11e5-bd41-63b64df37a3f .  --retries 3 --timeout-min 20  --file /gnos_icgc_keyfile.pem --pem /gnos_icgc_keyfile.pem
root      9575  0.0  0.0  17960  1440 ?        Ss   14:13   0:00 /bin/bash -c cd /workflow_data/ && perl -I /opt/gt-download-upload-wrapper/gt-download-upload-wrapper-2.0.13/lib /opt/vcf-uploader/vcf-uploader-2.0.7/gnos_download_file.pl --url https://gtrepo-dkfz.annailabs.com//cghub/data/analysis/download/a90ba420-1c47-11e5-bd41-63b64df37a3f .  --retries 3 --timeout-min 20  --file /gnos_icgc_keyfile.pem --pem /gnos_icgc_keyfile.pem
root      9601  0.0  0.0  31728 10336 ?        S    14:13   0:00 perl -I /opt/gt-download-upload-wrapper/gt-download-upload-wrapper-2.0.13/lib /opt/vcf-uploader/vcf-uploader-2.0.7/gnos_download_file.pl --url https://gtrepo-dkfz.annailabs.com//cghub/data/analysis/download/a90ba420-1c47-11e5-bd41-63b64df37a3f . --retries 3 --timeout-min 20 --file /gnos_icgc_keyfile.pem --pem /gnos_icgc_keyfile.pem
dyuen    12466  0.0  0.0  11748   932 pts/0    S+   14:54   0:00 grep --color=auto perl
dyuen@odl-dyuen:/usr/tmp/oozie/oozie-6bd4cb54-a4ed-4f6d-ac93-fc97c66586ef/shared_workspace/downloads$ sudo strace -p 9575
Process 9575 attached
wait4(-1, ^CProcess 9575 detached
 <detached ...>
dyuen@odl-dyuen:/usr/tmp/oozie/oozie-6bd4cb54-a4ed-4f6d-ac93-fc97c66586ef/shared_workspace/downloads$ sudo strace -p 9601
Process 9601 attached
restart_syscall(<... resuming interrupted call ...>) = 0
pipe([3, 4])                            = 0
pipe([5, 6])                            = 0
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fd22544aa10) = 1023
close(6)                                = 0
close(4)                                = 0
read(5, "", 4)                          = 0
close(5)                                = 0
ioctl(3, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, 0x7ffd35c606c0) = -1 ENOTTY (Inappropriate ioctl for device)
lseek(3, 0, SEEK_CUR)                   = -1 ESPIPE (Illegal seek)
fstat(3, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
read(3, "root      1023  0.0  0.0   4440 "..., 8192) = 280
read(3, "", 8192)                       = 0
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=1023, si_status=0, si_utime=0, si_stime=0} ---
fstat(3, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
close(3)                                = 0
rt_sigaction(SIGHUP, {SIG_IGN, [], SA_RESTORER, 0x7fd224b1cd40}, {SIG_DFL, [], SA_RESTORER, 0x7fd224b1cd40}, 8) = 0
rt_sigaction(SIGINT, {SIG_IGN, [], SA_RESTORER, 0x7fd224b1cd40}, {SIG_DFL, [], SA_RESTORER, 0x7fd224b1cd40}, 8) = 0
rt_sigaction(SIGQUIT, {SIG_IGN, [], SA_RESTORER, 0x7fd224b1cd40}, {SIG_DFL, [], SA_RESTORER, 0x7fd224b1cd40}, 8) = 0
wait4(1023, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 1023
rt_sigaction(SIGHUP, {SIG_DFL, [], SA_RESTORER, 0x7fd224b1cd40}, NULL, 8) = 0
rt_sigaction(SIGINT, {SIG_DFL, [], SA_RESTORER, 0x7fd224b1cd40}, NULL, 8) = 0
rt_sigaction(SIGQUIT, {SIG_DFL, [], SA_RESTORER, 0x7fd224b1cd40}, NULL, 8) = 0
pipe([3, 4])                            = 0
pipe([5, 6])                            = 0
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fd22544aa10) = 1026
close(6)                                = 0
close(4)                                = 0
read(5, "", 4)                          = 0
close(5)                                = 0
ioctl(3, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, 0x7ffd35c606c0) = -1 ENOTTY (Inappropriate ioctl for device)
lseek(3, 0, SEEK_CUR)                   = -1 ESPIPE (Illegal seek)
fstat(3, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
read(3, "================================"..., 8192) = 1366
read(3, "", 8192)                       = 0
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=1026, si_status=0, si_utime=0, si_stime=0} ---
fstat(3, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0
close(3)                                = 0
rt_sigaction(SIGHUP, {SIG_IGN, [], SA_RESTORER, 0x7fd224b1cd40}, {SIG_DFL, [], SA_RESTORER, 0x7fd224b1cd40}, 8) = 0
rt_sigaction(SIGINT, {SIG_IGN, [], SA_RESTORER, 0x7fd224b1cd40}, {SIG_DFL, [], SA_RESTORER, 0x7fd224b1cd40}, 8) = 0
rt_sigaction(SIGQUIT, {SIG_IGN, [], SA_RESTORER, 0x7fd224b1cd40}, {SIG_DFL, [], SA_RESTORER, 0x7fd224b1cd40}, 8) = 0
wait4(1026, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 1026
rt_sigaction(SIGHUP, {SIG_DFL, [], SA_RESTORER, 0x7fd224b1cd40}, NULL, 8) = 0
rt_sigaction(SIGINT, {SIG_DFL, [], SA_RESTORER, 0x7fd224b1cd40}, NULL, 8) = 0
rt_sigaction(SIGQUIT, {SIG_DFL, [], SA_RESTORER, 0x7fd224b1cd40}, NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
rt_sigaction(SIGCHLD, NULL, {SIG_DFL, [], 0}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
nanosleep({10, 0}, ^CProcess 9601 detached
 <detached ...>

`

Also see #1

briandoconnor commented 9 years ago

I think I know how to fix this. I'll be in around 4pm today; grab me and I'll walk you through the workaround.