grailbio / reflow

A language and runtime for distributed, incremental data processing in the cloud
Apache License 2.0
965 stars 52 forks source link

Renaming lots of files? #55

Closed olgabot closed 6 years ago

olgabot commented 6 years ago

Hello, I think I'm doing something very naive and there's a much better way to do it. I'm working on the following workflow for FastQC which outputs two files, stdin_fastqc.html and stdin_fastqc.zip. The name is determined from the input files and as there could be multiple input FASTQ files, I use zcat to unzip them all and pipe into fastqc. But then they're all named stdin_fastqc.html instead of as the ID name. So I'm using dirs.Pick to get each output, and then wrote this Copy function to operate on each extension, but I keep getting a syntax error:

 ✘  Tue 10 Jul - 18:09  ~/code/tick-genome/reflow   origin ☊ master 3☀ 3● 
  reflow run fastqc.rf -reads=s3://czbiohub-seqbot/fastqs/180628_A00111_0168_AHFVJVDMXX/rawdata/Undetermined_S0_R1_001.fastq.gz -output=s3://tick-genome/dna/2018-06-28 -id=Undetermined_S0_R1
/Users/olgabot/code/tick-genome/reflow/fastqc.rf:52:11: syntax error: unexpected tokAssign

Here is the workflow:

param (
    // S3 path to read1 of the fastq/fasta file. If multiple files, 
    // can be pipe-separated e.g. sample1_01.fastq.gz|sample1_02.fastq.gz
    reads string

    // Full s3 path location to put the FastQC reports
    output string

    // name of the sample
    id string
)

val fastqc = "quay.io/biocontainers/fastqc:0.11.7--4"

func FastQC(reads [file]) =
    // Use kmer-hashing image which has latest khmer to avoid bug with basenames in reflow
    exec(image := fastqc, mem := 4*GiB, cpu := 8) (outdir dir) {"
        zcat {{reads}} | /usr/local/bin/fastqc -o {{outdir}} --threads 8 --format fastq stdin
"}

// Instantiate the system modules "dirs" (system modules begin
// with $), assigning its instance to the "dirs" identifier. To
// view the documentation for this module, run "reflow doc
// $/dirs".
val dirs = make("$/dirs")
val files = make("$/files")
val path = make("$/path")

// Instantiate Go system module "strings"
val strings = make("$/strings")

// Split each read string by the pipe "|" to get individual s3 paths
val reads_split = strings.Split(reads, "|")

// Create a file for each element in the `reads_split` string array
// Now `reads_files` is an array of files
val reads_files = [file(read) | read <- reads_split]

val outdir = FastQC(reads_files)

// Two files are output: stdin_fastqc.html and stdin_fastqc.zip
val (html, _) = dirs.Pick(outdir, "*html")

// "zip" is a keyword in Reflow so say "zipfile" instead
val (zipfile, _) = dirs.Pick(outdir, "*zip")

outputs := ["html": html, "zip": zipfile]

func Copy(extension string, item file){
    suffix := "_fastqc." + extension
    basename := strings.Join(id, suffix)
    s3_output := path.Join(output, basename)
    files.Copy(item, s3_output)
}

val Main = {
    [Copy(extension, item) | extension, item <- outputs]
}

Can you help me understand what I'm missing here? Thanks! Olga

prasadgopal commented 6 years ago

On Tue, Jul 10, 2018 at 6:28 PM Olga Botvinnik notifications@github.com wrote:

Hello, I think I'm doing something very naive and there's a much better way to do it. I'm working on the following workflow for FastQC which outputs two files, stdin_fastqc.html and stdin_fastqc.zip. The name is determined from the input files and as there could be multiple input FASTQ files, I use zcat to unzip them all and pipe into fastqc. But then they're all named stdin_fastqc.html instead of as the ID name. So I'm using dirs.Pick to get each output, and then wrote this Copy function to operate on each extension, but I keep getting a syntax error:

✘  Tue 10 Jul - 18:09  ~/code/tick-genome/reflow   origin ☊ master 3☀ 3●   reflow run fastqc.rf -reads=s3://czbiohub-seqbot/fastqs/180628_A00111_0168_AHFVJVDMXX/rawdata/Undetermined_S0_R1_001.fastq.gz -output=s3://tick-genome/dna/2018-06-28 -id=Undetermined_S0_R1 /Users/olgabot/code/tick-genome/reflow/fastqc.rf:52:11: syntax error: unexpected tokAssign

Here is the workflow:

param ( // S3 path to read1 of the fastq/fasta file. If multiple files, // can be pipe-separated e.g. sample1_01.fastq.gz|sample1_02.fastq.gz reads string

// Full s3 path location to put the FastQC reports
output string

// name of the sample
id string

)

val fastqc = "quay.io/biocontainers/fastqc:0.11.7--4" func FastQC(reads [file]) = // Use kmer-hashing image which has latest khmer to avoid bug with basenames in reflow exec(image := fastqc, mem := 4*GiB, cpu := 8) (outdir dir) {" zcat {{reads}} | /usr/local/bin/fastqc -o {{outdir}} --threads 8 --format fastq stdin"}

// Instantiate the system modules "dirs" (system modules begin// with $), assigning its instance to the "dirs" identifier. To// view the documentation for this module, run "reflow doc// $/dirs". val dirs = make("$/dirs") val files = make("$/files") val path = make("$/path")

// Instantiate Go system module "strings" val strings = make("$/strings") // Split each read string by the pipe "|" to get individual s3 paths val reads_split = strings.Split(reads, "|") // Create a file for each element in the read1s, read2s string array// Now r1, r1 are arrays of files val reads_files = [file(read) | read <- reads_split]

val outdir = FastQC(reads_files) // Two files are output: stdin_fastqc.html and stdinfastq.zipval (html, ) = dirs.Pick(outdir, "html") // "zip" is a keyword in Reflow so say "zipfile" insteadval (zipfile, _) = dirs.Pick(outdir, "zip") outputs := ["html": html, "zip": zipfile] func Copy(extension string, item file){ suffix := "_fastqc." + extension basename := strings.Join(id, suffix) s3_output := path.Join(output, basename) files.Copy(item, s3_output) }

val Main = { [Copy(extension, item) | extension, item <- outputs] }

Can you try something like the following?

func Copy(extension string, item file) = {

    suffix := "_fastqc." + extension

    basename := strings.Join([id], suffix)

    s3_output := path.Join([output, basename])

    files.Copy(item, s3_output) ~> item

}

val Main =

    [Copy(extension, item) | (extension, item) <- outputs]

Can you help me understand what I'm missing here?

Thanks! Olga

— You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub https://github.com/grailbio/reflow/issues/55, or mute the thread https://github.com/notifications/unsubscribe-auth/AfC0Q6K4wkLHwSo-LqDq9GbWADhniIH_ks5uFVTDgaJpZM4VKWe0 .

olgabot commented 6 years ago

Thanks! It's running now and I'll let you know how it goes. This works well for one or two files at a time, but what about the generic case? The workflow below outputs three files which begin with kat.hist and I'd like to rename all of those to {{id}}_kat.hist.

``` 2018/07/11 13:24:35 <- kat_hist.KatHist e864461e ok exec 26m17s 16.3GiB 2018/07/11 13:24:35 kat_hist.KatHist e864461e /Users/olgabot/code/tick-genome/reflow/kat_hist.rf:18:9: resources: {mem:64.0GiB cpu:8 disk:0B} sha256:762c035d11b2c52c592592d007ced53444c918ff891f2347566b4be0ddbd0ea5 sha256:e864461e3af81c4809062ef5a2dcbddf4f9b0027d3a5c8cd6ae63228b38a8da7 ec2-54-200-198-199.us-west-2.compute.amazonaws.com:9000/645f2b42a4c6f396/e864461e3af81c4809062ef5a2dcbddf4f9b0027d3a5c8cd6ae63228b38a8da7 quay.io/biocontainers/kat:2.4.0--py36h355e19c_3 command: cd {{outdir}} mv {{fastq}} Undetermined_S0_R1.fastq.gz kat hist --threads 8 Undetermined_S0_R1.fastq.gz pwd ls -lha where: {{fastq}} = . sha256:2de8237ab8745755761cb164d5674447a97b7f72f7a1e711e715a67c92d4121f 16.3GiB result: {{outdir}} = Undetermined_S0_R1.fastq.gz sha256:2de8237ab8745755761cb164d5674447a97b7f72f7a1e711e715a67c92d4121f 16.3GiB kat.hist sha256:99b663e7c0d3f4beb0342b5ea50fd5a223cfc9c2aa9940ad31cf5f8e857d6813 77.5KiB kat.hist.dist_analysis.json sha256:84749050434d442a51eb1c5c5f25ca5e5aece85d4ec3c0b8ddafb8c44197c62c 663B kat.hist.png sha256:9ccbe4fc4969b3aa91bcaef0f1e0de1820c3a1888687829e386437d26c21f6a0 111.0KiB profile: cpu mean=7.4 max=8.0 mem mean=23.1GiB max=56.1GiB disk mean=15.7GiB max=16.3GiB tmp mean=0B max=0B 2018/07/11 13:24:35 -> kat_hist.Main 9787cdc0 run extern s3://tick-genome/dna/2018-06-28 77.5KiB 2018/07/11 13:24:39 <- kat_hist.Main 9787cdc0 ok extern 0s 0B 2018/07/11 13:24:39 total n=3 time=29m32s ident n ncache transfer runtime(m) cpu mem(GiB) disk(GiB) tmp(GiB) kat_hist.Main 1 0 0B kat_hist.KatHist 1 0 16.3GiB 26/26/26 7.4/7.4/7.4 56.1/56.1/56.1 16.3/16.3/16.3 0.0/0.0/0.0 kat_hist.reads_file 1 1 0B ```

How can one rename all files in a directory with a particular prefix? I'm trying the below and am getting stuck on converting the output of dirs.Files into strings to manipulate. It's clear to me how to convert string --> file with the file() command but unclear how to convert from file --> string as path.Base requires a string type.

```golang param ( // S3 path to a single fastq file reads string // Full s3 file location to put the FastQC report output string // name of the sample id string ) // K-mer analysis toolkit (KAT) // https://github.com/TGAC/KAT val kat = "quay.io/biocontainers/kat:2.4.0--py36h355e19c_3" func KatHist(reads [file]) = // Use kmer-hashing image which has latest khmer to avoid bug with basenames in reflow exec(image := kat, cpu := 8, mem := 64*GiB) (outdir dir) {" cd {{outdir}} kat hist --threads 8 {{reads}} "} // Instantiate the system modules "dirs" (system modules begin // with $), assigning its instance to the "dirs" identifier. To // view the documentation for this module, run "reflow doc // $/dirs". val dirs = make("$/dirs") val files = make("$/files") val path = make("$/path") // Instantiate Go system module "strings" val strings = make("$/strings") // Split each read string by the pipe "|" to get individual s3 paths val reads_split = strings.Split(reads, "|") // Create a file for each element in the `read1s`, `read2s` string array // Now `r1`, `r1` are arrays of files val reads_files = [file(read) | read <- reads_split] val outdir = KatHist(reads_files) func CopyRenamed(results dir, id, output string) = { // Prefix "id" to all files in "results" directory and copy to s3 location in "output" filenames := dirs.Files(results) filenames_strings := [string(filename) | filename <- filenames] basenames := [path.Base(filename) | filename <- filenames_strings] renamed := [path.Join([output, id + "_" + basename]) | basename <- basenames] file_to_destination := zip(filenames, renamed) [files.Copy(filename, destination) | (filename, destination) <- file_to_destination] } val Main = CopyRenamed(outdir, id, output) ```

Running the workflow gives me this error on the filenames_strings := [string(filename) | filename <- filenames] line:

(tick-genome-env) 
 ✘  Wed 11 Jul - 13:49  ~/code/tick-genome/reflow   origin ☊ master 2☀ 2● 
  reflow run kat_hist.rf -reads=s3://czbiohub-seqbot/fastqs/180628_A00111_0168_AHFVJVDMXX/rawdata/Undetermined_S0_R1_001.fastq.gz -output=s3://tick-genome/dna/2018-06-28 -id=Undetermined_S0_R1
/Users/olgabot/code/tick-genome/reflow/kat_hist.rf:47:30: syntax error: unexpected tokString, expecting tokEllipsis or ']'
prasadgopal commented 6 years ago

Not sure I understood your question. Does something like below work for you?

cat /tmp/list.rf

id := "12345_"

l := ["kat1.hist", "kat2.hist", "kat3.hist"]

Main := [id + v | v <- l]

reflow run /tmp/list.rf

["12345_kat1.hist", "12345_kat2.hist", "12345_kat3.hist"]

On Wed, Jul 11, 2018 at 1:56 PM Olga Botvinnik notifications@github.com wrote:

Thanks! It's running now and I'll let you know how it goes. This works well for one or two files at a time, but what about the generic case? The workflow below outputs three files which begin with kat.hist and I'd like to rename all of those to {{id}}_kat.hist.

2018/07/11 13:24:35 <- kat_hist.KatHist e864461e ok exec 26m17s 16.3GiB 2018/07/11 13:24:35 kat_hist.KatHist e864461e /Users/olgabot/code/tick-genome/reflow/kat_hist.rf:18:9: resources: {mem:64.0GiB cpu:8 disk:0B} sha256:762c035d11b2c52c592592d007ced53444c918ff891f2347566b4be0ddbd0ea5 sha256:e864461e3af81c4809062ef5a2dcbddf4f9b0027d3a5c8cd6ae63228b38a8da7 ec2-54-200-198-199.us-west-2.compute.amazonaws.com:9000/645f2b42a4c6f396/e864461e3af81c4809062ef5a2dcbddf4f9b0027d3a5c8cd6ae63228b38a8da7 quay.io/biocontainers/kat:2.4.0--py36h355e19c_3 command: cd {{outdir}} mv {{fastq}} Undetermined_S0_R1.fastq.gz kat hist --threads 8 Undetermined_S0_R1.fastq.gz pwd ls -lha where: {{fastq}} = . sha256:2de8237ab8745755761cb164d5674447a97b7f72f7a1e711e715a67c92d4121f 16.3GiB result: {{outdir}} = Undetermined_S0_R1.fastq.gz sha256:2de8237ab8745755761cb164d5674447a97b7f72f7a1e711e715a67c92d4121f 16.3GiB kat.hist sha256:99b663e7c0d3f4beb0342b5ea50fd5a223cfc9c2aa9940ad31cf5f8e857d6813 77.5KiB kat.hist.dist_analysis.json sha256:84749050434d442a51eb1c5c5f25ca5e5aece85d4ec3c0b8ddafb8c44197c62c 663B kat.hist.png sha256:9ccbe4fc4969b3aa91bcaef0f1e0de1820c3a1888687829e386437d26c21f6a0 111.0KiB profile: cpu mean=7.4 max=8.0 mem mean=23.1GiB max=56.1GiB disk mean=15.7GiB max=16.3GiB tmp mean=0B max=0B 2018/07/11 13:24:35 -> kat_hist.Main 9787cdc0 run extern s3://tick-genome/dna/2018-06-28 77.5KiB 2018/07/11 13:24:39 <- kat_hist.Main 9787cdc0 ok extern 0s 0B 2018/07/11 13:24:39 total n=3 time=29m32s ident n ncache transfer runtime(m) cpu mem(GiB) disk(GiB) tmp(GiB) kat_hist.Main 1 0 0B kat_hist.KatHist 1 0 16.3GiB 26/26/26 7.4/7.4/7.4 56.1/56.1/56.1 16.3/16.3/16.3 0.0/0.0/0.0 kat_hist.reads_file 1 1 0B

How can one rename all files in a directory with a particular prefix? I'm trying the below and am getting stuck on converting the output of dirs.Files into strings to manipulate. It's clear to me how to convert string --> file with the file() command but unclear how to convert from file --> string as path.Base requires a string type.

param ( // S3 path to a single fastq file reads string

// Full s3 file location to put the FastQC report
output string

// name of the sample
id string

) // K-mer analysis toolkit (KAT)// https://github.com/TGAC/KAT val kat = "quay.io/biocontainers/kat:2.4.0--py36h355e19c_3" func KatHist(reads [file]) = // Use kmer-hashing image which has latest khmer to avoid bug with basenames in reflow exec(image := kat, cpu := 8, mem := 64*GiB) (outdir dir) {" cd {{outdir}} kat hist --threads 8 {{reads}}"}

// Instantiate the system modules "dirs" (system modules begin// with $), assigning its instance to the "dirs" identifier. To// view the documentation for this module, run "reflow doc// $/dirs". val dirs = make("$/dirs") val files = make("$/files") val path = make("$/path") // Instantiate Go system module "strings" val strings = make("$/strings") // Split each read string by the pipe "|" to get individual s3 paths val reads_split = strings.Split(reads, "|") // Create a file for each element in the read1s, read2s string array// Now r1, r1 are arrays of files val reads_files = [file(read) | read <- reads_split]

val outdir = KatHist(reads_files) func CopyRenamed(results dir, id, output string) = { // Prefix "id" to all files in "results" directory and copy to s3 location in "output" filenames := dirs.Files(results) filenames_strings := [string(filename) | filename <- filenames] basenames := [path.Base(filename) | filename <- filenamesstrings] renamed := [path.Join([output, id + "" + basename]) | basename <- basenames]

file_to_destination := zip(filenames, renamed)

[files.Copy(filename, destination) | (filename, destination) <- file_to_destination] }

val Main = CopyRenamed(outdir, id, output)

— You are receiving this because you commented. Reply to this email directly, view it on GitHub https://github.com/grailbio/reflow/issues/55#issuecomment-404306598, or mute the thread https://github.com/notifications/unsubscribe-auth/AfC0Q0uG0GNJKcxstujskEnu4voRANARks5uFmaCgaJpZM4VKWe0 .

olgabot commented 6 years ago

Thanks for your response. My issue is that while for some programs I may know all the output file names, I'd rather write a function that automatically renames everything with my sample id as a prefix.

The core issue is that I'm getting the filenames using dirs.Files but then I can't use $/strings or $/path to rename the files. Here's an example:

func CopyRenamed(results dir, id, output string) = {
    // Prefix "id" to all files in "results" directory and copy to s3 location in "output"
    filenames := dirs.Files(results)
    filenames_strings := [string(filename) | filename <- filenames]
    basenames := [path.Base(filename) | filename <- filenames_strings]
    renamed := [path.Join([output, id + "_" + basename]) | basename <- basenames]

    file_to_destination := zip(filenames, renamed)

    [files.Copy(filename, destination) | (filename, destination) <- file_to_destination]
}
prasadgopal commented 6 years ago
dirs := make("$/dirs")                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
func CopyRenamed(results dir, id, output string) = {                                                                                                                                                                                                                          
  r := map([(id + "_" + n, f)| (n, f) <- map(results)])                                                                                                                                                                                                                       
  d := dirs.Make(r)                                                                                                                                                                                                                                                           
  dirs.Copy(d, output)                                                                                                                                                                                                                                                        
}                                                                                                                                                                                                                                                      
d := dir("s3://mybucket/src/")                                                                                                                                                                                                                                                
id := "prefix"                                                                                                                                                                                                                                                                
@requires(cpu := 1)                                                                                                                                                                                                                                                           
val Main = CopyRenamed(d, id, "s3://mybucket/dst/")  
olgabot commented 6 years ago

Thank you, that worked!!!