Open diegomscoelho opened 6 months ago
Hi
I have a process that works for me, based on https://gitlab.com/genomeinformatics/xengsort#how-to-classify
Note: I explicitly rename my graft/host files as human/mouse. I also cat/merge all files that are not graft/host into "other".
Feel free to adapt this.
// Classify xenograft reads with `xengsort classify` and repackage the result.
//
// Inputs:
//   xengsort_reference - staged index files; the index prefix is discovered by
//                        locating the single "*.hash" file among them.
//   sample_id, reads   - sample name and its FASTQ file(s); one file when
//                        params.single_end is set, otherwise R1 and R2.
//
// Outputs (per mate): graft reads renamed to "<sample>_human_R<n>.fastq.gz",
// host reads renamed to "<sample>_mouse_R<n>.fastq.gz", and every remaining
// class concatenated into "<sample>_other_R<n>.fastq.gz". The xengsort log is
// captured in "<sample>.<process>.xengsort_stats.txt" and tool versions are
// written to versions.yml.
process XENGSORT {
    tag "$sample_id Attempt_${task.attempt}_cpus_${task.cpus}_mem_${task.memory}"
    debug true

    publishDir "${params.outdir}/1_QC/PreAlignment/2_${task.process}",
        mode: 'copy', pattern: "*_stats.txt"
    // Keep a copy of the rendered command script next to the stats for provenance.
    publishDir "${params.outdir}/1_QC/PreAlignment/2_${task.process}",
        mode: 'copy', saveAs: { it == '.command.sh' ? "${sample_id}.${task.process}.commands.txt" : null }

    input:
    path(xengsort_reference)
    tuple val(sample_id), path(reads)

    output:
    tuple val(sample_id), path('*_human*.fastq.gz')                            , emit: human_reads
    tuple val(sample_id), path('*_mouse*.fastq.gz')                            , emit: mouse_reads
    tuple val(sample_id), path('*_other*.fastq.gz')                            , emit: other_reads
    tuple val(sample_id), path('*_human*.fastq.gz'), path('*_other*.fastq.gz') , emit: human_and_other_reads
    tuple val(sample_id), path('*.xengsort_stats.txt')                         , emit: xengsort_stats
    path(".command.sh")
    path "versions.yml"                                                        , emit: versions

    script:
    // A single staged file arrives as a bare path rather than a list; normalise
    // so that indexing behaves the same in both modes.
    def read_files = reads instanceof List ? reads : [reads]
    def paired     = !params.single_end
    def r1         = "${sample_id}_1.trim.fastq.gz"
    def r2         = "${sample_id}_2.trim.fastq.gz"
    def job_stats  = "${sample_id}.job_stats.txt"
    def link_r2    = paired ? "[ -e \"${r2}\" ] || ln -s \"${read_files[1]}\" \"${r2}\"" : ''
    def fastq_args = paired ? "--fastq ${r1} --pairs ${r2}" : "--fastq ${r1}"
    def mates      = paired ? '1 2' : '1'
    """
    # %Z prints EST or EDT as appropriate instead of a hard-coded "EDT" label.
    stamp() { TZ=EST5EDT date +'%d-%b-%Y_%H%M %Z'; }

    # Give the inputs predictable names unless they are already staged that way.
    [ -e "${r1}" ] || ln -s "${read_files[0]}" "${r1}"
    ${link_r2}

    now="\$(stamp)"
    echo "======== Starting xengsort classify ===============" \$now
    echo -e "xengsort classify started:\t\$now" >> "${job_stats}"

    # The index prefix is the path of the staged .hash file minus its extension.
    INDEX=\$(find -L ./ -name "*.hash" | sed 's/\\.hash\$//')
    if [ -z "\$INDEX" ] || [ "\$(echo "\$INDEX" | wc -l)" -ne 1 ]; then
        echo "ERROR: expected exactly one *.hash index file, found: '\$INDEX'" >&2
        exit 1
    fi
    echo "INDEX = \$INDEX" | tee -a "${job_stats}"

    xengsort -DD classify \\
        --index "\$INDEX" \\
        ${fastq_args} \\
        --prefix "${sample_id}" \\
        --classification count \\
        --threads "${task.cpus}" \\
        --compression gz \\
        --progress \\
        >& "${sample_id}.${task.process}.xengsort_stats.txt"

    now="\$(stamp)"
    echo "======== Finished xengsort classify ===============" \$now
    echo -e "xengsort classify ended, merge fastq files:\t\$now" | tee -a "${job_stats}"

    merged=()
    for mate in ${mates}; do
        # Paired output is named <prefix>-<class>.<mate>.fq.gz.
        # NOTE(review): single-end output is assumed to be <prefix>-<class>.fq.gz
        # per the xengsort README -- confirm against the installed version.
        if [ "${paired}" = "true" ]; then src=".\${mate}.fq.gz"; else src=".fq.gz"; fi

        mv "${sample_id}-graft\${src}" "${sample_id}_human_R\${mate}.fastq.gz"
        mv "${sample_id}-host\${src}"  "${sample_id}_mouse_R\${mate}.fastq.gz"

        # Collect only this sample's own non-graft/non-host outputs, by exact
        # name, so staged inputs with similar names are never merged or deleted.
        others=()
        for cls in ambiguous both neither unclassified; do
            if [ -f "${sample_id}-\${cls}\${src}" ]; then
                others+=("${sample_id}-\${cls}\${src}")
            fi
        done
        if [ "\${#others[@]}" -eq 0 ]; then
            echo "ERROR: no ambiguous/both/neither/unclassified files found for mate \${mate}" >&2
            exit 1
        fi
        echo "OTHER_\${mate} = \${others[*]}" | tee -a "${job_stats}"

        # Concatenated gzip members form a valid gzip stream, so plain cat suffices.
        cat "\${others[@]}" > "${sample_id}_other_R\${mate}.fastq.gz"
        merged+=("\${others[@]}")
    done

    now="\$(stamp)"
    echo "======== Merge fastq xengsort done ===============" \$now
    echo "Deleting merged inputs -- \${merged[*]}" | tee -a "${job_stats}"
    rm -f "\${merged[@]}"
    echo -e "Merge fastq xengsort done:\t\$now" >> "${job_stats}"
    echo -e "ALL DONE:\t\$now" >> "${job_stats}"

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        xengsort: \$( xengsort --version )
        container: "${task.container}"
    END_VERSIONS
    """
}
Is there an existing module for this?
Is there an open PR for this?
Is there an open issue for this?
Are you going to work on this?
Assignees
to facilitate tracking who is working on the module