find_isoform error #33

Open nick-youngblut opened 1 month ago

nick-youngblut commented 1 month ago

As a test, I'm using just 3 input FASTQ files with 10k reads each.

My workflow:

config_file = FLAMES::create_config(outdir, type = "sc_3end", do_barcode_demultiplex = TRUE)

sce = sc_long_pipeline(
    fastq = fastq_dir, 
    annotation = ref_gtf_file, 
    genome_fa = ref_fasta_file,
    outdir = outdir, 
    config_file = config_file, 
    expect_cell_number = 8000
)

The error:


1. sc_long_pipeline(fastq = fastq_dir, annotation = ref_gtf_file, 
 .     genome_fa = ref_fasta_file, outdir = outdir, config_file = config_file, 
 .     expect_cell_number = 8000)
2. find_isoform(annotation, genome_fa, genome_bam, outdir, config)
3. find_isoform_flames(annotation, genome_fa, genome_bam, outdir, 
 .     config)
4. basiliskRun(env = flames_env, fun = function(gff3, genome, iso, 
 .     tss, fa, tran, ds, conf, raw) {
 .     python_path <- system.file("python", package = "FLAMES")
 .     find <- reticulate::import_from_path("find_isoform", python_path)
 .     ret <- find$find_isoform(gff3, genome, iso, tss, fa, tran, 
 .         ds, conf, raw)
 .     ret
 . }, gff3 = annotation, genome = genome_bam, iso = file.path(outdir, 
 .     "isoform_annotated.gff3"), tss = file.path(outdir, "tss_tes.bedgraph"), 
 .     fa = genome_fa, tran = file.path(outdir, "transcript_assembly.fa"), 
 .     ds = config$isoform_parameters$downsample_ratio, conf = config, 
 .     raw = ifelse(config$isoform_parameters$generate_raw_isoform, 
 .         file.path(outdir, "splice_raw.gff3"), FALSE))
5. fun(...)
6. find$find_isoform(gff3, genome, iso, tss, fa, tran, ds, conf, 
 .     raw)
7. py_call_impl(callable, call_args$unnamed, call_args$named)

My references:


nick-youngblut commented 1 month ago

If I use 3 samples (fastq files) of 500k reads each, BLAZE dies:

    "name": "ERROR",
    "message": "generator raised StopIteration",
    "stack": "generator raised StopIterationTraceback:

1. sc_long_pipeline(fastq = fastq_dir, annotation = ref_gtf_file, 
 .     genome_fa = ref_fasta_file, outdir = outdir, config_file = config_file, 
 .     expect_cell_number = 8000)
2. blaze(expect_cell_number, fastq, `output-prefix` = paste0(outdir, 
 .     \"/\"), `output-fastq` = \"matched_reads.fastq\", threads = config$pipeline_parameters$threads, 
 .     `max-edit-distance` = config$barcode_parameters$max_bc_editdistance, 
 .     overwrite = TRUE)
3. basiliskRun(env = flames_env, fun = function(blaze_argv) {
 .     cat(\"Running BLAZE...\\n\")
 .     cat(\"Argument: \", blaze_argv, \"\\n\")
 .     blaze <- reticulate::import(\"blaze\")
 .     ret <- blaze$blaze(blaze_argv)
 .     ret
 . }, blaze_argv = blaze_argv)
4. fun(...)
5. blaze$blaze(blaze_argv)
6. py_call_impl(callable, call_args$unnamed, call_args$named)"

I'm using a machine with 8 threads and 64 GB of memory, so I'm guessing that the issue is not due to a lack of memory.

The lack of a full stack trace for the BLAZE subprocess makes this issue hard to troubleshoot. This is a downside of calling Python via reticulate::py_call_impl() rather than keeping the Python and R code separate (e.g., as different processes in a Nextflow pipeline).