new module: xengsort/classify #5411

Open diegomscoelho opened 6 months ago

diegomscoelho commented 6 months ago

tamuanand commented 4 months ago


I have the following process, which works for me:

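Note: I explicitly rename the graft/host output files to human/mouse. I also concatenate all remaining output files that are neither graft nor host (ambiguous, both, neither, unclassified) into a single "other" file per read (R1/R2). Feel free to adapt this.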

process XENGSORT {
    tag "$sample_id Attempt_${task.attempt}_cpus_${task.cpus}_mem_${task.memory}"
    debug true

    publishDir "${params.outdir}/1_QC/PreAlignment/2_${task.process}",
        mode: 'copy', pattern: "*_stats.txt"
    publishDir "${params.outdir}/1_QC/PreAlignment/2_${task.process}",
        mode: 'copy', saveAs: { it == '' ? "${sample_id}.${task.process}.commands.txt" : null }

    tuple val(sample_id), path(reads)

    tuple val(sample_id), path('*_human*.fastq.gz')  , emit: human_reads
    tuple val(sample_id), path('*_mouse*.fastq.gz')  , emit: mouse_reads
    tuple val(sample_id), path('*_other*.fastq.gz')  , emit: other_reads
    tuple val(sample_id), path('*_human*.fastq.gz'), path('*_other*.fastq.gz')  , emit: human_and_other_reads
    tuple val(sample_id), path('*.xengsort_stats.txt') , emit: xengsort_stats
    path "versions.yml"                              , emit: versions

    if (params.single_end) {
        [ ! -f  ${sample_id}_1.trim.fastq.gz ] && ln -s ${reads[0]} ${sample_id}_1.trim.fastq.gz

         xengsort -DD classify \\
            --index \$INDEX \\
            --fastq ${sample_id}_1.trim.fastq.gz \\
            --prefix "${sample_id}" \\
            --classification count \\
            --threads "${task.cpus}" \\
            --compression gz \\
            --progress \\
            >& "${sample_id}.${task.process}.xengsort_stats.txt"

       // Do other stuff like in the paired-end section as below
    } else {

        [ ! -f  ${sample_id}_1.trim.fastq.gz ] && ln -s ${reads[0]} ${sample_id}_1.trim.fastq.gz
        [ ! -f  ${sample_id}_2.trim.fastq.gz ] && ln -s ${reads[1]} ${sample_id}_2.trim.fastq.gz

        now="\$(TZ=EST5EDT date +'%d-%b-%Y_%H%M') EDT"
        echo "======== Starting xengsort classify ===============" \$now
        echo -e "xengsort classify started:\t\$now" >> ${sample_id}.job_stats.txt

        INDEX=`find -L ./ -name "*.hash" | sed 's/\\.hash\$//'`

        echo "INDEX = \$INDEX" \\
            | tee -a "${sample_id}.job_stats.txt"

        xengsort -DD classify \\
            --index \$INDEX \\
            --fastq ${sample_id}_1.trim.fastq.gz --pairs ${sample_id}_2.trim.fastq.gz \\
            --prefix "${sample_id}" \\
            --classification count \\
            --threads "${task.cpus}" \\
            --compression gz \\
            --progress \\
            >& "${sample_id}.${task.process}.xengsort_stats.txt"

        now="\$(TZ=EST5EDT date +'%d-%b-%Y_%H%M') EDT"
        echo "======== Finished xengsort classify ===============" \$now
        echo -e "xengsort classify ended, merge fastq files:\t\$now" \\
            | tee -a "${sample_id}.job_stats.txt"

        mv "${sample_id}-graft.1.fq.gz" "${sample_id}_human_R1.fastq.gz"
        mv "${sample_id}-graft.2.fq.gz" "${sample_id}_human_R2.fastq.gz"

        mv "${sample_id}-host.1.fq.gz" "${sample_id}_mouse_R1.fastq.gz"
        mv "${sample_id}-host.2.fq.gz" "${sample_id}_mouse_R2.fastq.gz"

        OTHER_1=`ls -1v | grep -E "(ambiguous|both|neither|unclassified).1.fq.gz" | tr '\\n' ' '`
        echo "OTHER_1 = \$OTHER_1" \\
            | tee -a "${sample_id}.job_stats.txt"

        OTHER_2=`ls -1v | grep -E "(ambiguous|both|neither|unclassified).2.fq.gz" | tr '\\n' ' '`
        echo "OTHER_2 = \$OTHER_2" \\
            | tee -a "${sample_id}.job_stats.txt"

        cat \$OTHER_1 > "${sample_id}_other_R1.fastq.gz"
        cat \$OTHER_2 > "${sample_id}_other_R2.fastq.gz"

        now="\$(TZ=EST5EDT date +'%d-%b-%Y_%H%M') EDT"
        echo "======== Merge fastq xengsort done ===============" \$now

        echo "Deleting OTHER_1 and OTHER_2 -- \$OTHER_1 \$OTHER_2" \\
            | tee -a "${sample_id}.job_stats.txt"

        rm -f \$OTHER_1 \$OTHER_2 

        echo -e "Merge fastq xengsort done:\t\$now" >> ${sample_id}.job_stats.txt
        echo -e "ALL DONE:\t\$now" >> ${sample_id}.job_stats.txt

        cat <<-END_VERSIONS > versions.yml
            xengsort: \$( xengsort --version )
            container: "${task.container}"