broadinstitute / cromwell

Scientific workflow engine designed for simplicity & scalability. Trivially transition from one-off use cases to massive-scale production environments
http://cromwell.readthedocs.io/
BSD 3-Clause "New" or "Revised" License

Help configuring system portion config file to optimally use google cloud #5352

Closed: njbernstein closed this issue 4 years ago

njbernstein commented 4 years ago

IMPORTANT: Please file new issues over in our Jira issue tracker!

https://broadworkbench.atlassian.net/projects/BA/issues

You may need to create an account before you can view/create issues.

Hi there,

I'm using Cromwell to kick off 1000 Mutect2 jobs at a time, but right now the workflow doesn't launch all 1000 sub-workflows simultaneously. When I look at my Google Cloud quotas, none are close to being full. I'm wondering what settings I might need to change in the system section, or some other section of the Google conf, to better use all the compute I have available.

I'm kicking off google cloud jobs using cromwell from a local VM.
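
For context, these are the system-level settings I've found in the Cromwell docs that look relevant to how quickly submitted workflows get picked up and launched. This is only a sketch of what I understand them to do; the exact keys and defaults may vary by Cromwell version:

system {
  // Maximum number of root workflows Cromwell will run at the same time
  max-concurrent-workflows = 5000
  // How many newly submitted workflows Cromwell picks up on each poll
  max-workflow-launch-count = 50
  // How often (in seconds) Cromwell polls for newly submitted workflows
  new-workflow-poll-rate = 20
}

My wrapper WDL and conf file are below.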

version 1.0

#
#  Description of inputs
#  intervals: genomic intervals
#  ref_fasta, ref_fai, ref_dict: reference genome, index, and dictionary
#  file_normal_bams, file_normal_bais: files listing the normal bams and their indices, one path per line
#  scatter_count: number of parallel jobs when scattering over intervals
#  pon_name: the resulting panel of normals is {pon_name}.vcf
#  m2_extra_args: additional command line parameters for Mutect2.  This should not include --max-mnp-distance,
#  which the WDL hard-codes to 0 because GenomicsDBImport can't handle MNPs

#import "mutect2.wdl" as m2

import "https://raw.githubusercontent.com/gatk-workflows/gatk4-somatic-snvs-indels/2.6.0/mutect2.wdl" as m2

workflow Mutect2_multisample {
  input {
    File? intervals
    File ref_fasta
    File ref_fai
    File ref_dict
    Int scatter_count
    File file_normal_bams
    Array[File] normal_bams = read_lines(file_normal_bams)
    File file_normal_bais
    Array[File] normal_bais = read_lines(file_normal_bais)
    File? pon
    File? pon_idx
    File? variants_for_contamination
    File? variants_for_contamination_idx
    File? realignment_index_bundle
    String? realignment_extra_args
    File gnomad
    File gnomad_idx
    String? m2_extra_args
    String? create_pon_extra_args
    Boolean? compress

    Int? min_contig_size
    Int? num_contigs

    # runtime
    String gatk_docker
    File? gatk_override
    String basic_bash_docker = "ubuntu:16.04"
    Boolean? filter_funcotations

    Int? preemptible
    Int? max_retries
    Int small_task_cpu = 2
    Int small_task_mem = 8
    Int small_task_disk = 100
    Int boot_disk_size = 12

    # Funcotator inputs
    Boolean? run_funcotator
    String? sequencing_center
    String? sequence_source
    String? funco_reference_version
    String? funco_output_format
    Boolean? funco_compress
    Boolean? funco_use_gnomad_AF
    File? funco_data_sources_tar_gz
    String? funco_transcript_selection_mode
    File? funco_transcript_selection_list
    Array[String]? funco_annotation_defaults
    Array[String]? funco_annotation_overrides
    Array[String]? funcotator_excluded_fields
    Boolean? funco_filter_funcotations
    String? funcotator_extra_args

    String funco_default_output_format = "MAF"

    # Use as a last resort to increase the disk given to every task in case of ill behaving data
    Int? emergency_extra_disk
  }

  Int contig_size = select_first([min_contig_size, 1000000])
  Int preemptible_or_default = select_first([preemptible, 2])
  Int max_retries_or_default = select_first([max_retries, 2])

  Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override,
            "max_retries": max_retries_or_default, "preemptible": preemptible_or_default, "cpu": small_task_cpu,
            "machine_mem": small_task_mem * 1000, "command_mem": small_task_mem * 1000 - 500,
            "disk": small_task_disk, "boot_disk_size": boot_disk_size}

  scatter (normal_bam in zip(normal_bams, normal_bais)) {
    call m2.Mutect2 {
      input:
        intervals = intervals,
        ref_fasta = ref_fasta,
        ref_fai = ref_fai,
        ref_dict = ref_dict,
        tumor_reads = normal_bam.left,
        tumor_reads_index = normal_bam.right,
        scatter_count = scatter_count,
        m2_extra_args = select_first([m2_extra_args, ""]) + " --max-mnp-distance 0",
        gatk_override = gatk_override,
        gatk_docker = gatk_docker,
        preemptible = preemptible,
        max_retries = max_retries,
        pon = pon,
        pon_idx = pon_idx,
        gnomad = gnomad,
        gnomad_idx = gnomad_idx
    }
  }

  output {
    Array[File] normal_calls = Mutect2.filtered_vcf
    Array[File] normal_calls_idx = Mutect2.filtered_vcf_idx
  }
}

include required(classpath("application"))

google {
  application-name = "cromwell"
  auths = [
    {
      name = "application-default"
      scheme = "application_default"
    }
  ]
}

engine {
  filesystems {
    gcs {
      auth = "application-default"
    }
  }
}

backend {
  default = "JES"
  providers {
    JES {
      actor-factory = "cromwell.backend.impl.jes.JesBackendLifecycleActorFactory"
      config {
        // Google project
        project = "calico-uk-biobank"
        compute-service-account = "default"

        // Base bucket for workflow executions
        root = "nicholas-b-test"

        // Polling for completion backs off gradually for slower-running jobs.
        // This is the maximum polling interval (in seconds):
        maximum-polling-interval = 600

        // Optional Docker Hub credentials. Can be used to access private docker images.
        dockerhub {
          // account = ""
          // token = ""
        }

        genomics {
          // A reference to an auth defined in the `google` stanza at the top. This auth is used to create
          // Pipelines and manipulate auth JSONs.
          auth = "application-default"
          // Endpoint for APIs; no reason to change this unless directed by Google.
          endpoint-url = "https://genomics.googleapis.com/"
        }

        filesystems {
          gcs {
            // A reference to a potentially different auth for manipulating files via engine functions.
            auth = "application-default"
          }
        }
      }
    }
  }
}

system {
  input-read-limits {
    lines = 12800000
    bool = 7
    int = 19
    float = 50
    string = 12800000
    json = 12800000
    tsv = 12800000
    map = 12800000
    object = 12800000
  }
}
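
One thing I notice is that my JES provider config doesn't set a per-backend job limit. From the backend documentation I believe a concurrent-job-limit key can be set inside the provider's config stanza to cap how many jobs the backend runs at once. A sketch (the value shown is illustrative, not a recommendation):

backend {
  providers {
    JES {
      config {
        // Upper bound on the number of jobs this backend will run concurrently
        concurrent-job-limit = 5000
      }
    }
  }
}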
illusional commented 4 years ago

Not necessarily the problem, but even non-obvious GCP quotas can limit how many workers are scheduled, so a few quotas beyond the obvious ones are worth checking.

I thought mine were high enough, but from this page (replace $region with your region) you can click "Current Usage" to sort by in-demand resources:

[screenshot: GCP quotas page sorted by current usage]

njbernstein commented 4 years ago

@illusional Thanks for your response!

I should have posted this with my original question. None of my quotas are close to being full, so I think it must be something on the Cromwell side of things. However, I'm not sure how to troubleshoot what it might be.

[screenshot: GCP quota usage, 2020-01-13]
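
If the limit is on the Cromwell side, the other knob I've seen mentioned is the job start rate control, which caps how quickly Cromwell starts new jobs regardless of quota headroom. A sketch, assuming job-rate-control is supported in my Cromwell version (the values shown are the documented defaults as I understand them):

system {
  // Start at most this many new jobs per time window
  job-rate-control {
    jobs = 50
    per = 1 second
  }
}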

illusional commented 4 years ago

Hey @njbernstein, did you work out what it was? If so, mind sharing here?