grailbio / reflow

A language and runtime for distributed, incremental data processing in the cloud
Apache License 2.0
965 stars 52 forks source link

Issues with file-ifying strings #110

Closed olgabot closed 5 years ago

olgabot commented 5 years ago

Hello, I'm writing a workflow to compare all samples from an S3 bucket, where multiple runs of the same sample are separated by semicolons, and samples are separated by pipes. There's some issue with parsing the string to create the separate files, such that the output of `SplitbyPipeAndMergeR1R2` appears to be of type `[string]` instead of `[file]`.

Reflow workflow

```golang
param (
    // Samples are pipe-separated, multiple sequencing runs per
    // samples are semicolon-separated, e.g.:
    // s1_run1_R1.fastq.gz;s1_run2_R1.fastq.gz|s2_run1_R1.fastq.gz;s2_run2_R1.fastq.gz
    read1s string

    // Samples are pipe-separated, multiple sequencing runs per
    // samples are semicolon-separated, e.g.:
    // s1_run1_R1.fastq.gz;s1_run2_R1.fastq.gz|s2_run1_R1.fastq.gz;s2_run2_R1.fastq.gz
    read2s string

    // Semicolon-separated names of samples in read1s, read2s
    names string

    // Full s3 file location to put the output comparison
    output string

    // Either "minhash" (sourmash program), "hyperloglog" (dashing program),
    // or "truejaccard" for absolute number of overlapping kmers
    method = "minhash"

    // Whether to compare the sequences on DNA or protein
    molecule = "dna"

    // sketch size = 2**log2_sketch_size
    log2_sketch_size = 10

    // Size of kmer to use. For HyperLogLog, only ksizes<=32 are valid
    ksize = 21

    // GiB of memory for compare
    // For > 100 samples, increase
    memory = 8

    // Number of processes to use for comparison
    threads = 16
)

val files = make("$/files")
val strings = make("$/strings")
val dirs = make("$/dirs")
val dashing = make("./../tools/dashing.rf")
val sourmash = make("./../tools/sourmash.rf")

func Cat(files [file]) =
    exec(image := "ubuntu") (catted file) {"
        cat {{files}} > {{catted}}
    "}

read1s_split := strings.Split(read1s, "|")
read2s_split := strings.Split(read2s, "|")

func SplitBySemicolon(reads string) (read_files [file]) = {
    reads_split := strings.Split(reads, ";")
    read_files := [file(r) | r <- reads_split]
    read_files
}

func SplitbyPipeAndMergeR1R2(read1s, read2s string) = {
    // Samples are separated by pipes
    read1s_split_by_samples := strings.Split(read1s, "|")
    read2s_split_by_samples := strings.Split(read2s, "|")

    // Samples may have multiple runs, which are merged
    read1s_split_by_runs := [SplitBySemicolon(r) | r <- read1s_split_by_samples]
    read2s_split_by_runs := [SplitBySemicolon(r) | r <- read2s_split_by_samples]

    // Each sample may have multiple reads, separated by semicolons
    r1_r2_flattened := [flatten([r1, r2]) | (r1, r2) <- zip(read1s_split_by_runs, read2s_split_by_runs)]
    r1_r2_flattened := trace(r1_r2_flattened)

    // Output a list of files
    [Cat(r1_r2) | r1_r2 <- r1_r2_flattened]
}

// Make a single concatenated reads file for each sample, ignoring R1 R2
reads := SplitbyPipeAndMergeR1R2(read1s, read2s)

val matrix = if method == "minhash" {
    sourmash.CompareFastqs(reads, names, molecule, log2_sketch_size, ksize, threads)
} else {
    distance := "dist"
    dashing.CompareFastqs(reads, names, log2_sketch_size, ksize, threads, distance)
}

val Main = files.Copy(matrix, output)
```

reflow doc output

(base)
 ✘  Mon 25 Feb - 13:03  ~/code/reflow-workflows/workflows   origin ☊ olgabot/kmer-similarity 1● 
  reflow doc kmer_similarity.rf
kmer_similarity.rf:85:11: cannot use type string as type [string] in argument to function (type func(fastqs [file], names [string], molecule string, log2_sketch_size, ksize, threads int) (csv file))
kmer_similarity.rf:88:12: cannot use type string as type [string] in argument to function (type func(fastqs [file], names [string], log2_sketch_size, ksize, threads int, distance string) (matrix file))
kmer_similarity.rf:109:29: identifier "matrix" not defined

Do you know what may be happening? Thank you! Warmest, Olga

prasadgopal commented 5 years ago

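param "names" is a string, but the function you are calling (CompareFastqs) is expecting a list of strings ([string]). Since names is semicolon-separated, splitting it first, e.g. strings.Split(names, ";"), yields the [string] that CompareFastqs expects.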

On Mon, Feb 25, 2019 at 1:10 PM Olga Botvinnik notifications@github.com wrote:

Hello, I'm writing a workflow to compare all samples from an S3 bucket, where multiple runs of the same sample are separated by semicolons, and samples are separated by pipes. There's some issue with parsing the string to create the separate files such that the output of SplitbyPipeAndMergeR1R2 is of type [string] instead of [file].

Reflow workflow

param (

// Samples are pipe-separated, multiple sequencing runs per

// samples are semicolon-separated, e.g.:

// s1_run1_R1.fastq.gz;s1_run2_R1.fastq.gz|s2_run1_R1.fastq.gz;s2_run2_R1.fastq.gz

read1s string

// Samples are pipe-separated, multiple sequencing runs per

// samples are semicolon-separated, e.g.:

// s1_run1_R1.fastq.gz;s1_run2_R1.fastq.gz|s2_run1_R1.fastq.gz;s2_run2_R1.fastq.gz

read2s string

// Semicolon-separated names of samples in read1s, read2s

names string

// Full s3 file location to put the output comparison

output string

// Either "minhash" (sourmash program), "hyperloglog" (dashing program),

// or "truejaccard" for absolute number of overlapping kmers

method = "minhash"

// Whether to compare the sequences on DNA or protein

molecule = "dna"

// sketch size = 2**log2_sketch_size

log2_sketch_size = 10

// Size of kmer to use. For HyperLogLog, only ksizes<=32 are valid

ksize = 21

// GiB of memory for compare

// For > 100 samples, increase

memory = 8

// Number of processes to use for comparison

threads = 16

)

val files = make("$/files")

val strings = make("$/strings")

val dirs = make("$/dirs")

val dashing = make("./../tools/dashing.rf")

val sourmash = make("./../tools/sourmash.rf")

func Cat(files [file]) =

exec(image := "ubuntu") (catted file) {"
    cat {{files}} > {{catted}}
"}

read1s_split := strings.Split(read1s, "|")

read2s_split := strings.Split(read2s, "|")

func SplitBySemicolon(reads string) (read_files [file]) = {

reads_split := strings.Split(reads, ";")

read_files := [file(r) | r <- reads_split]

read_files

}

func SplitbyPipeAndMergeR1R2(read1s, read2s string) = {

// Samples are separated by pipes

read1s_split_by_samples := strings.Split(read1s, "|")

read2s_split_by_samples := strings.Split(read2s, "|")

// Samples may have multiple runs, which are merged

read1s_split_by_runs := [SplitBySemicolon(r) | r <- read1s_split_by_samples]

read2s_split_by_runs := [SplitBySemicolon(r) | r <- read2s_split_by_samples]

// Each sample may have multiple reads, separated by semicolons

r1_r2_flattened := [flatten([r1, r2]) | (r1, r2) <- zip(read1s_split_by_runs, read2s_split_by_runs)]

r1_r2_flattened := trace(r1_r2_flattened)

// Output a list of files

[Cat(r1_r2) | r1_r2 <- r1_r2_flattened]

}

// Make a single concatenated reads file for each sample, ignoring R1 R2

reads := SplitbyPipeAndMergeR1R2(read1s, read2s)

val matrix = if method == "minhash" {

sourmash.CompareFastqs(reads, names, molecule, log2_sketch_size, ksize, threads)

} else {

distance := "dist"

dashing.CompareFastqs(reads, names, log2_sketch_size, ksize, threads, distance)

}

val Main = files.Copy(matrix, output)

reflow doc output

(base)

✘  Mon 25 Feb - 13:03  ~/code/reflow-workflows/workflows   origin ☊ olgabot/kmer-similarity 1● 

 reflow doc kmer_similarity.rf

kmer_similarity.rf:85:11: cannot use type string as type [string] in argument to function (type func(fastqs [file], names [string], molecule string, log2_sketch_size, ksize, threads int) (csv file))

kmer_similarity.rf:88:12: cannot use type string as type [string] in argument to function (type func(fastqs [file], names [string], log2_sketch_size, ksize, threads int, distance string) (matrix file))

kmer_similarity.rf:109:29: identifier "matrix" not defined

Do you know what may be happening? Thank you! Warmest, Olga

— You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub https://github.com/grailbio/reflow/issues/110, or mute the thread https://github.com/notifications/unsubscribe-auth/AfC0QwIlWn5cfBve_fdp-7dOdEg5zB8Fks5vRFEygaJpZM4bQyx1 .

--

This email message, including attachments, may contain private, proprietary, or privileged information and is the confidential information and/or property of GRAIL, Inc., and is for the sole use of the intended recipient(s). Any unauthorized review, use, disclosure or distribution is strictly prohibited. If you are not the intended recipient, please contact the sender by reply email and destroy all copies of the original message.

olgabot commented 5 years ago

Great eye! Thank you.