epi2me-labs / wf-human-variation

Support with custom profile #56

Closed. Phillip-a-richmond closed this issue 12 months ago.

Phillip-a-richmond commented 12 months ago

Ask away!

I'm running the wf-human-variation workflow on an HPC cluster running PBS Pro, with compute nodes that don't have internet access.

A couple of other nuances: singularity is loaded via an Lmod module command, and the cluster has separate CPU and GPU nodes.

I have singularity version 3.8.5 (apptainer 1.1.9 is also available, but I haven't tried it yet).

Based on the error I was getting, it seems like the container isn't getting recognized. I've been through several iterations and this is my current nextflow config:

I pulled the images and stored them in a local directory, since I'm worried about them being fetched at runtime. For some reason, though, the error suggests they are not being read. I've run the pipeline successfully on a different cluster using the local profile (but that cluster had open access to the internet).
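
In outline, what I'm trying to achieve is a profile that points every process label at a pre-pulled image instead of a registry, roughly like this (the profile name and paths below are just placeholders):

profiles {
    offline_example {
        singularity {
            enabled = true
            autoMounts = true
        }
        process {
            // default image for unlabelled processes, pre-pulled to local disk
            container = 'file:///path/to/sifs/wf-human-variation_<tag>.sif'
            // per-label overrides follow the same pattern
            withLabel:wf_human_snp {
                container = 'file:///path/to/sifs/wf-human-variation-snp_<tag>.sif'
            }
        }
    }
}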

I kept the other profiles intact:

// used by default for "standard" (docker) and singularity profiles,
// other profiles may override.
process {
    container = "ontresearch/${params.wf.name}:${params.wf.e2l_base_tag}"
    withLabel:wf_human_snp {
        container = "ontresearch/wf-human-variation-snp:${params.wf.e2l_snp_tag}"
    }
    withLabel:wf_human_sv {
        container = "ontresearch/wf-human-variation-sv:${params.wf.e2l_sv_tag}"
    }
    withLabel:wf_human_methyl {
        container = "ontresearch/wf-human-variation-methyl:${params.wf.e2l_methyl_tag}"
    }
    withLabel:wf_basecalling {
        container = "nanoporetech/${params.wf.basecaller_container}"
    }
    withLabel:wf_cnv {
        container = "ontresearch/wf-cnv:${params.wf.cnv_tag}"
    }
    withLabel:wf_human_str {
        container = "ontresearch/wf-human-variation-str:${params.wf.str_tag}"
    }
    withLabel:snpeff_annotation {
        container = "ontresearch/snpeff:${params.wf.snpeff_tag}"
    }
    shell = ['/bin/bash', '-euo', 'pipefail']

    // by default GPU tasks will run in serial to avoid GPU management.
    // cluster and cloud users can remove this with -profile discrete_gpus.
    // we use profiles to handle this as maxForks cannot be set dynamically
    // see https://github.com/nextflow-io/nextflow/discussions/3806 and CW-1857
    withLabel:gpu {
        maxForks = 1
    }
}

profiles {
    // the "standard" profile is used implicitely by nextflow
    // if no other profile is given on the CLI
    standard {
        docker {
            enabled = true
            // this ensures container is run as host user and group, but
            //    also adds host user to the within-container group
            runOptions = "--user \$(id -u):\$(id -g) --group-add 100"
        }
        process."withLabel:gpu".containerOptions = "--gpus all"
    }

    // using singularity instead of docker
    singularity {
        singularity {
            enabled = true
            autoMounts = true
            //envWhitelist = "" // if your cluster sets a variable to indicate which GPU has been assigned you will want to allow it here
        }
        process."withLabel:gpu".containerOptions = "--nv"
    }

    // keep stub conda profile to prevent unknown profile warning so users get a better error
    conda {
        conda {
            enabled = true
        }
    }

    // Using AWS batch.
    // May need to set aws.region and aws.batch.cliPath
    awsbatch {
        process {
            executor = 'awsbatch'
            queue = { "${params.aws_queue}" }
            memory = '8G'
            withLabel:wfdefault {
                container = { "${params.aws_image_prefix}-${params.wf.name}:${params.wf.e2l_base_tag}-root" }
            }
            withLabel:wf_human_snp {
                container = { "${params.aws_image_prefix}-wf-human-variation-snp:${params.wf.e2l_snp_tag}-root" }
            }
            withLabel:wf_human_sv {
                container = { "${params.aws_image_prefix}-wf-human-variation-sv:${params.wf.e2l_sv_tag}-root" }
            }
            withLabel:wf_human_methyl {
                container = { "${params.aws_image_prefix}-wf-human-variation-methyl:${params.wf.e2l_methyl_tag}-root" }
            }
            withLabel:wf_basecalling {
                container = { "${params.aws_image_prefix}-${params.wf.basecaller_container}-root" }
            }
            withLabel:wf_cnv {
                container = { "${params.aws_image_prefix}-wf-cnv:${params.wf.cnv_tag}-root" }
            }
            withLabel:wf_human_str {
                container = { "${params.aws_image_prefix}-wf-human-variation-str:${params.wf.str_tag}-root" }
            }
            withLabel:snpeff_annotation {
                container = "${params.aws_image_prefix}-snpeff:${params.wf.snpeff_tag}-root"
            }
            shell = ['/bin/bash', '-euo', 'pipefail']
        }
    }

    // local profile for simplified development testing
    local {
        process.executor = 'local'
    }

And I made a new profile called "sockeye". I've tried with and without commenting out the container line below.

    sockeye {
        process {
            executor = 'pbspro'
            // container = 'file:///project/st-sturvey-3/Tools/Singularity/wf-human-variation_sha95dd6fedf10c10fc65bdff3c2c48a1af2adcc8c0.sif'
            beforeScript = 'module load singularity'
            clusterOptions = '-A st-sturvey-3 -l walltime=24:00:00,select=1:ncpus=2:mem=8gb'
            cpus = '2'
            memory = '8GB'

            withLabel:wf_human_snp {
                container = 'file:///project/st-sturvey-3/Tools/Singularity/wf-human-variation-snp_sha8276a92fc99a60a740b64812843943e357730844.sif'
                clusterOptions = '-A st-sturvey-3 -l walltime=24:00:00,select=1:ncpus=4:mem=16gb'
                cpus = '4'
                memory = '16GB'
            }
            withLabel:wf_human_sv {
                container = 'file:///project/st-sturvey-3/Tools/Singularity/wf-human-variation-sv_shabc3ac908a14705f248cdf49f218956ec33e93ef9.sif'
                clusterOptions = '-A st-sturvey-3 -l walltime=24:00:00,select=1:ncpus=4:mem=16gb'
                cpus = '4'
                memory = '16GB'
            }
            withLabel:wf_human_methyl {
                container = 'file:///project/st-sturvey-3/Tools/Singularity/wf-human-variation-methyl_sha44a13bcf48db332b2277bb9f95b56d64e393a1d5.sif'
                clusterOptions = '-A st-sturvey-3 -l walltime=24:00:00,select=1:ncpus=4:mem=16gb'
                cpus = '4'
                memory = '16GB'
            }
            withLabel:wf_basecalling {
                container = 'file:///project/st-sturvey-3/Tools/Singularity/dorado_sha1433bfc3146fd0dc94ad9648452364f2327cf1b0.sif'
                clusterOptions = '-A st-sturvey-3-gpu -l walltime=24:00:00,select=1:ncpus=4:ngpus=1:gpu_mem=16gb:mem=24gb'
                memory = '16GB'
                beforeScript = 'module load singularity cuda'
            }
            withLabel:wf_cnv {
                container = 'file:///project/st-sturvey-3/Tools/Singularity/wf-cnv_sha428cb19e51370020ccf29ec2af4eead44c6a17c2.sif'
                clusterOptions = '-A st-sturvey-3 -l walltime=24:00:00,select=1:ncpus=4:mem=16gb'
                cpus = '4'
                memory = '16GB'
            }
            withLabel:wf_human_str {
                container = 'file:///project/st-sturvey-3/Tools/Singularity/wf-human-variation-str_sha28799bc3058fa256c01c1f07c87f04e4ade1fcc1.sif'
                clusterOptions = '-A st-sturvey-3 -l walltime=24:00:00,select=1:ncpus=4:mem=16gb'
                cpus = '4'
                memory = '16GB'
            }
            withLabel:snpeff_annotation {
                container = "file:///project/st-sturvey-3/Tools/Singularity/snpeff_sha2fa4cecff842fda832e54ef243862506abb6f3a6.sif"
                clusterOptions = '-A st-sturvey-3 -l walltime=24:00:00,select=1:ncpus=4:mem=16gb'
                cpus = '4'
                memory = '16GB'
            }

            shell = ['/bin/bash', '-euo', 'pipefail']
        }
    }

This is how I execute the pipeline, pointing to a local installation of the pipeline version 1.6.1:

#!/bin/bash

# Run this script from the command line on the head node of sockeye, so that singularity images can be pulled and conda environments can be created 

##########
# Set up #
##########

# Load singularity and java
module load singularity
module load openjdk/11.0.8_10

# Where is nextflow
Nextflow=/project/st-sturvey-1/PrecisionHealthVirtualEnvironment/Software/Nextflow/nextflow

# Where is the singularity image?
singularity_image_wf_human_variation=/scratch/st-sturvey-3/Process/DH5557/wf-human-variation_latest.sif

# Where is the wf-human-variation github clone
WorkflowDir=/project/st-sturvey-3/Tools/wf-human-variation/

# Set working dir where input data comes from
WorkingDir=/project/st-sturvey-3/Processed/DH5557/
BamDir=$WorkingDir

# Final dir where output data goes to 
FinalDir=/scratch/st-sturvey-3/Process/DH5557/
mkdir -p $FinalDir

# GenomeDir
GenomeVer=GRCh38
GenomeDir=/project/st-sturvey-3/Databases/References/GRCh38/Genome/
GenomeFasta=GRCh38.primary_assembly.genome.fa

SINGULARITYENV_TMPDIR=$WorkingDir/singularity/tmp
mkdir -p $SINGULARITYENV_TMPDIR

SINGULARITYENV_NXF_DEBUG=$WorkingDir/singularity/debug
mkdir -p $SINGULARITYENV_NXF_DEBUG

NXF_SINGULARITY_CACHEDIR=$WorkingDir/singularity/cache
mkdir -p $NXF_SINGULARITY_CACHEDIR

############
# Proband  #
############

SampleID=DH555701
FAST5_DIR=/project/st-sturvey-3/Raw/DH5557/DH555701_fast5/
OUTPUT=/scratch/st-sturvey-3/Process/DH5557/${SampleID}_EPI2ME/
$Nextflow run $WorkflowDir \
    -w ${OUTPUT}/workspace \
    -profile singularity,sockeye,discrete_gpus \
    --snp --sv --methyl \
    --fast5_dir $FAST5_DIR \
    --basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0'  \
    --remora_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0_5mCG_5hmCG@v2' \
    --sample_name $SampleID \
    --ref $GenomeDir/$GenomeFasta \
    --out_dir ${OUTPUT}

And this is the error I get:

ERROR ~ Error executing process > 'sv:runReport:getVersions'

Caused by:
  Process `sv:runReport:getVersions` terminated with an error exit status (127)

Command executed:

  trap '' PIPE # suppress SIGPIPE without interfering with pipefail
  python -c "import pysam; print(f'pysam,{pysam.__version__}')" >> versions.txt
  truvari version | sed 's/ /,/' >> versions.txt
  fastcat --version | sed 's/^/fastcat,/' >> versions.txt
  sniffles --version | head -n 1 | sed 's/ Version //' >> versions.txt
  bcftools --version | head -n 1 | sed 's/ /,/' >> versions.txt
  samtools --version | head -n 1 | sed 's/ /,/' >> versions.txt
  minimap2 --version | head -n 1 | sed 's/^/minimap2,/' >> versions.txt
  echo `seqtk 2>&1 | head -n 3 | tail -n 1 | cut -d ':' -f 2 | sed 's/ /seqtk,/'` >> versions.txt

Command exit status:
  127

Command output:
  (empty)

Command error:
  INFO:    Converting SIF file to temporary sandbox...
  WARNING: underlay of /etc/localtime required more than 50 (78) bind mounts
  .command.sh: line 3: python: command not found
  INFO:    Cleaning up image...

Work dir:
  /scratch/st-sturvey-3/Process/DH5557/DH555701_EPI2ME/workspace/b4/a90b29feaae3b1999e8fd89a42e9b6

Tip: view the complete command output by changing to the process work dir and entering the command `cat .command.out`

 -- Check '.nextflow.log' file for details
WARN: Killing running tasks (65)

I pulled these containers with:

 singularity pull docker://ontresearch/wf-human-variation-sv:shabc3ac908a14705f248cdf49f218956ec33e93ef9
 singularity pull docker://nanoporetech/dorado:sha1433bfc3146fd0dc94ad9648452364f2327cf1b0
 singularity pull docker://ontresearch/wf-cnv:sha428cb19e51370020ccf29ec2af4eead44c6a17c2
 singularity pull docker://ontresearch/wf-human-variation-str:sha28799bc3058fa256c01c1f07c87f04e4ade1fcc1
 singularity pull docker://ontresearch/snpeff:sha2fa4cecff842fda832e54ef243862506abb6f3a6
 singularity pull docker://ontresearch/wf-human-variation-methyl:sha44a13bcf48db332b2277bb9f95b56d64e393a1d5
 singularity pull docker://ontresearch/wf-human-variation:sha95dd6fedf10c10fc65bdff3c2c48a1af2adcc8c0

I'm open to any suggestions, or other examples of config profiles that work on clusters with "offline" compute nodes.
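
For concreteness, the sort of profile I have in mind looks roughly like this (paths are placeholders, and CUDA_VISIBLE_DEVICES is only an example of a scheduler-set GPU variable to whitelist, as the comment in the stock config suggests):

profiles {
    offline_cluster {
        singularity {
            enabled = true
            autoMounts = true
            cacheDir = '/path/to/local/sif/cache'
            // let the scheduler's GPU assignment through to the container
            envWhitelist = 'CUDA_VISIBLE_DEVICES'
        }
        process {
            executor = 'pbspro'
            beforeScript = 'module load singularity'
            withLabel:gpu {
                containerOptions = '--nv'
            }
        }
    }
}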

Thanks, Phil

cjw85 commented 12 months ago

Unfortunately I do not think we are going to be able to help you much here. We don't have experience running nextflow in an offline setting. I think your question might be better directed to the wider audience in the Nextflow Slack community.

Phillip-a-richmond commented 12 months ago

Okay, thanks. Can I keep this issue open while I ask the NF community? It may be useful to others who try to run in offline mode.

Phillip-a-richmond commented 12 months ago

Perhaps the issue is with how apptainer behaves with these images:

apptainer pull docker://ontresearch/wf-human-variation:latest

That works fine.

apptainer exec  wf-human-variation_latest.sif mosdepth

Which gives an error:

FATAL:   "mosdepth": executable file not found in $PATH
Phillip-a-richmond commented 12 months ago

Okay, I see the issue now. By default apptainer wasn't containing the filesystem, and you've built the tools inside a /home/epi2melabs/ directory in the container, which conflicted with my own /home/ mount on the shared system.

By adding -C and -W $PWD, I'm now able to get the tool to run.

apptainer shell -W $PWD -C wf-human-variation_latest.sif
INFO:    Converting SIF file to temporary sandbox...
WARNING: underlay of /etc/localtime required more than 50 (80) bind mounts
Apptainer> mosdepth
Usage: mosdepth [options] <prefix> <BAM-or-CRAM>
error parsing arguments
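
Presumably the same flags can be carried into the workflow by setting them as the container engine's runOptions in the config, along these lines (an untested sketch at this point):

apptainer {
    enabled = true
    autoMounts = true
    // contain the image (-C) and use the task directory as the working dir (-W)
    runOptions = '-C -W $PWD'
}
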
Phillip-a-richmond commented 12 months ago

Ooooh, I figured it out. Here is an HPC config that works for execution across a shared cluster with apptainer.

Runscript:

#!/bin/bash

# Run this script from the command line on the head node of sockeye, so that container images can be pulled and conda environments can be created

##########
# Set up #
##########

# Load apptainer and java
module load apptainer
module load openjdk/11.0.8_10

# Where is nextflow
Nextflow=/project/st-sturvey-1/PrecisionHealthVirtualEnvironment/Software/Nextflow/nextflow 

# Where is the apptainer image cache?
ApptainerDir=/scratch/st-sturvey-3/Cache/

# Where is the wf-human-variation github clone
WorkflowDir=/project/st-sturvey-3/Tools/wf-human-variation/

# Set working dir where input data comes from
WorkingDir=/project/st-sturvey-3/Processed/DH5557/
BamDir=$WorkingDir

# Final dir where output data goes to 
FinalDir=/scratch/st-sturvey-3/Process/DH5557/
mkdir -p $FinalDir

# GenomeDir
GenomeVer=GRCh38
GenomeDir=/project/st-sturvey-3/Databases/References/GRCh38/Genome/
GenomeFasta=GRCh38.primary_assembly.genome.fa

export APPTAINERENV_TMPDIR=$ApptainerDir/tmp
mkdir -p $APPTAINERENV_TMPDIR

export NXF_APPTAINER_DEBUG=$ApptainerDir/debug
mkdir -p $NXF_APPTAINER_DEBUG

export NXF_APPTAINER_CACHEDIR=$ApptainerDir
mkdir -p $NXF_APPTAINER_CACHEDIR

export NXF_OFFLINE='TRUE'

############
# Proband  #
############

SampleID=DH555701
FAST5_DIR=/project/st-sturvey-3/Raw/DH5557/DH555701_fast5/
OUTPUT=/scratch/st-sturvey-3/Process/DH5557/${SampleID}_EPI2ME/
$Nextflow run $WorkflowDir \
    -w ${OUTPUT}/workspace \
    -profile sockeye,discrete_gpus \
    -with-apptainer \
    --snp --sv --methyl \
    --fast5_dir $FAST5_DIR \
    --basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0'  \
    --remora_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0_5mCG_5hmCG@v2' \
    --sample_name $SampleID \
    --ref $GenomeDir/$GenomeFasta \
    --out_dir ${OUTPUT}

The profile in nextflow.config, using locally stored images:

    sockeye {
        //apptainer stuff
        apptainer.enabled = true
        apptainer.autoMounts = true
        apptainer.cacheDir = '/scratch/st-sturvey-3/Cache/'
        apptainer.runOptions = '-C -W $PWD'
        process {
            executor = 'pbspro'
            //apptainer stuff
            apptainer.enabled = true
            apptainer.autoMounts = true
            apptainer.cacheDir = '/scratch/st-sturvey-3/Cache/'
            apptainer.runOptions = '-C -W $PWD'
            process."withLabel:gpu".containerOptions = "--nv"
            beforeScript = 'module load apptainer'

            container = "file:///project/st-sturvey-3/Tools/Singularity/wf-human-variation_sha95dd6fedf10c10fc65bdff3c2c48a1af2adcc8c0.sif"

            // cluster stuff
            clusterOptions =  '-A st-sturvey-3 -l walltime=24:00:00,select=1:ncpus=2:mem=8gb'
            cpus = '2'
            memory = '8GB'

            withLabel:wf_human_snp {
                container = "file:///project/st-sturvey-3/Tools/Singularity/Apptainer/wf-human-variation-snp_sha8276a92fc99a60a740b64812843943e357730844.sif"
                clusterOptions = '-A st-sturvey-3 -l walltime=24:00:00,select=1:ncpus=4:mem=16gb'
                cpus = '4'
                memory = '16GB'
            }
            withLabel:wf_human_sv {
                container = "file:///project/st-sturvey-3/Tools/Singularity/Apptainer/wf-human-variation-sv_shabc3ac908a14705f248cdf49f218956ec33e93ef9.sif"
                clusterOptions = '-A st-sturvey-3 -l walltime=24:00:00,select=1:ncpus=4:mem=16gb'
                cpus = '4'
                memory = '16GB'
            }
            withLabel:wf_human_methyl {
                container = "file:///project/st-sturvey-3/Tools/Singularity/Apptainer/wf-human-variation-methyl_sha44a13bcf48db332b2277bb9f95b56d64e393a1d5.sif"
                clusterOptions = '-A st-sturvey-3 -l walltime=24:00:00,select=1:ncpus=4:mem=16gb'
                cpus = '4'
                memory = '16GB'
            }
            withLabel:wf_basecalling {
                container = "file:///project/st-sturvey-3/Tools/Singularity/Apptainer/dorado_sha1433bfc3146fd0dc94ad9648452364f2327cf1b0.sif"
                clusterOptions = '-A st-sturvey-3-gpu -l walltime=24:00:00,select=1:ncpus=4:ngpus=1:gpu_mem=16gb:mem=24gb'
                memory = '16GB'
                beforeScript = 'module load apptainer cuda'
            }
            withLabel:wf_cnv {
                container = "file:///project/st-sturvey-3/Tools/Singularity/Apptainer/wf-cnv_sha428cb19e51370020ccf29ec2af4eead44c6a17c2.sif"
                clusterOptions = '-A st-sturvey-3 -l walltime=24:00:00,select=1:ncpus=4:mem=16gb'
                cpus = '4'
                memory = '16GB'
            }
            withLabel:wf_human_str {
                container = "file:///project/st-sturvey-3/Tools/Singularity/Apptainer/wf-human-variation-str_sha28799bc3058fa256c01c1f07c87f04e4ade1fcc1.sif"
                clusterOptions = '-A st-sturvey-3 -l walltime=24:00:00,select=1:ncpus=4:mem=16gb'
                cpus = '4'
                memory = '16GB'
            }
            withLabel:snpeff_annotation {
                container = "file:///project/st-sturvey-3/Tools/Singularity/Apptainer/snpeff_sha2fa4cecff842fda832e54ef243862506abb6f3a6.sif"
                clusterOptions = '-A st-sturvey-3 -l walltime=24:00:00,select=1:ncpus=4:mem=16gb'
                cpus = '4'
                memory = '16GB'
            }
        }
        shell = ['/bin/bash', '-euo', 'pipefail']
    }

The apptainer configuration wasn't being applied to all processes until I pulled it out of the process definition and set it at the profile level.
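
Distilled down, what seems to matter is that the engine scope sits at the profile level while per-process containers stay inside the process scope. A trimmed skeleton of the above (paths abbreviated):

    sockeye {
        // engine settings live at the profile level, not inside process { }
        apptainer {
            enabled = true
            autoMounts = true
            cacheDir = '/scratch/st-sturvey-3/Cache/'
            runOptions = '-C -W $PWD'
        }
        process {
            executor = 'pbspro'
            beforeScript = 'module load apptainer'
            container = 'file:///path/to/wf-human-variation_<tag>.sif'
            withLabel:gpu {
                containerOptions = '--nv'
            }
        }
    }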

Cheers, Phil