nf-core / modules

Repository to host tool-specific module files for the Nextflow DSL2 community!
https://nf-co.re/modules
MIT License
279 stars 695 forks source link

new module: xengsort/classify #5411

Open diegomscoelho opened 6 months ago

diegomscoelho commented 6 months ago

Is there an existing module for this?

Is there an open PR for this?

Is there an open issue for this?

Are you going to work on this?

tamuanand commented 4 months ago

Hi

I have the following process, which works for me. It is adapted from https://gitlab.com/genomeinformatics/xengsort#how-to-classify

Note: I explicitly rename the graft/host output files to human/mouse. I also concatenate all remaining output files (everything that is neither graft nor host) into a single "other" file per mate. Feel free to adapt this.

/*
 * XENGSORT: classify xenograft reads with `xengsort classify` and re-package the results.
 *
 * Inputs
 *   xengsort_reference : staged xengsort index; the script locates it by searching for a
 *                        "*.hash" file and stripping the ".hash" extension.
 *   sample_id, reads   : sample identifier and its FASTQ file(s); reads[0] is mate 1 and,
 *                        unless params.single_end is set, reads[1] is mate 2.
 *
 * Outputs
 *   graft reads are renamed to "<sample_id>_human_R*.fastq.gz", host reads to
 *   "<sample_id>_mouse_R*.fastq.gz", and every remaining class (ambiguous / both /
 *   neither / unclassified) is concatenated into "<sample_id>_other_R*.fastq.gz".
 *   The combined stdout/stderr of xengsort is captured in "*.xengsort_stats.txt".
 *
 * Both single-end and paired-end layouts share one script; only the xengsort input
 * arguments and the list of mates to post-process differ.
 */
process XENGSORT {
    tag "$sample_id Attempt_${task.attempt}_cpus_${task.cpus}_mem_${task.memory}"
    debug true

    publishDir "${params.outdir}/1_QC/PreAlignment/2_${task.process}",
        mode: 'copy', pattern: "*_stats.txt"
    publishDir "${params.outdir}/1_QC/PreAlignment/2_${task.process}",
        mode: 'copy', saveAs: { it == '.command.sh' ? "${sample_id}.${task.process}.commands.txt" : null }

    input:
    path(xengsort_reference)
    tuple val(sample_id), path(reads)

    output:
    tuple val(sample_id), path('*_human*.fastq.gz')  , emit: human_reads
    tuple val(sample_id), path('*_mouse*.fastq.gz')  , emit: mouse_reads
    tuple val(sample_id), path('*_other*.fastq.gz')  , emit: other_reads
    tuple val(sample_id), path('*_human*.fastq.gz'), path('*_other*.fastq.gz')  , emit: human_and_other_reads
    tuple val(sample_id), path('*.xengsort_stats.txt') , emit: xengsort_stats
    path(".command.sh")
    path "versions.yml"                              , emit: versions

    script:
    def single_end = params.single_end
    def read1      = "${sample_id}_1.trim.fastq.gz"
    def read2      = "${sample_id}_2.trim.fastq.gz"
    def fastq_args = single_end ? "--fastq ${read1}" : "--fastq ${read1} --pairs ${read2}"
    // Mate 2 is only linked for paired-end data (reads[1] is never evaluated otherwise).
    def link_read2 = single_end ? '' : "[ -f ${read2} ] || ln -s ${reads[1]} ${read2}"
    // Space-separated "<xengsort mate suffix>:<output read label>" pairs.
    // NOTE(review): the xengsort README states that single-end outputs carry no mate
    // number ("<prefix>-<class>.fq.gz") -- confirm against the installed version.
    def mates      = single_end ? ':R1' : '.1:R1 .2:R2'
    """
    # Give the inputs predictable names unless they are already staged under them.
    [ -f ${read1} ] || ln -s ${reads[0]} ${read1}
    ${link_read2}

    # %Z prints EST or EDT as appropriate instead of a hard-coded label.
    now="\$(TZ=EST5EDT date +'%d-%b-%Y_%H%M %Z')"
    echo "======== Starting xengsort classify ===============" \$now
    echo -e "xengsort classify started:\t\$now" >> ${sample_id}.job_stats.txt

    # The index prefix is the path of the staged "*.hash" file without its extension.
    INDEX=\$(find -L ./ -name "*.hash" | sed 's/\\.hash\$//')
    if [ -z "\$INDEX" ]; then
        echo "ERROR: no xengsort index (*.hash) found in the staged reference" >&2
        exit 1
    fi

    echo "INDEX = \$INDEX" \\
        | tee -a "${sample_id}.job_stats.txt"

    xengsort -DD classify \\
        --index "\$INDEX" \\
        ${fastq_args} \\
        --prefix "${sample_id}" \\
        --classification count \\
        --threads "${task.cpus}" \\
        --compression gz \\
        --progress \\
        >& "${sample_id}.${task.process}.xengsort_stats.txt"

    now="\$(TZ=EST5EDT date +'%d-%b-%Y_%H%M %Z')"
    echo "======== Finished xengsort classify ===============" \$now
    echo -e "xengsort classify ended, merge fastq files:\t\$now" \\
        | tee -a "${sample_id}.job_stats.txt"

    for mate in ${mates}; do
        suffix="\${mate%%:*}"
        label="\${mate##*:}"

        mv "${sample_id}-graft\${suffix}.fq.gz" "${sample_id}_human_\${label}.fastq.gz"
        mv "${sample_id}-host\${suffix}.fq.gz" "${sample_id}_mouse_\${label}.fastq.gz"

        # Collect every non-graft/non-host class that xengsort actually wrote for this mate.
        others=""
        n_others=0
        for category in ambiguous both neither unclassified; do
            candidate="${sample_id}-\${category}\${suffix}.fq.gz"
            if [ -f "\$candidate" ]; then
                others="\$others \$candidate"
                n_others=\$((n_others + 1))
            fi
        done

        # Without this guard an empty list would leave `cat` reading from stdin.
        if [ "\$n_others" -eq 0 ]; then
            echo "ERROR: no ambiguous/both/neither/unclassified output found for \${label}" >&2
            exit 1
        fi

        echo "OTHER_\${label} =\$others" \\
            | tee -a "${sample_id}.job_stats.txt"

        # Concatenated gzip members form a valid gzip stream, so plain cat is sufficient.
        cat \$others > "${sample_id}_other_\${label}.fastq.gz"

        echo "Deleting OTHER_\${label} --\$others" \\
            | tee -a "${sample_id}.job_stats.txt"

        rm -f \$others
    done

    now="\$(TZ=EST5EDT date +'%d-%b-%Y_%H%M %Z')"
    echo "======== Merge fastq xengsort done ===============" \$now

    echo -e "Merge fastq xengsort done:\t\$now" >> ${sample_id}.job_stats.txt
    echo -e "ALL DONE:\t\$now" >> ${sample_id}.job_stats.txt

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        xengsort: \$( xengsort --version )
        container: "${task.container}"
    END_VERSIONS
    """
}