mllg / batchtools

Tools for computation on batch systems
https://mllg.github.io/batchtools/
GNU Lesser General Public License v3.0
169 stars 51 forks source link

submitJob - can't find qsub function #296

Closed MislavSag closed 9 months ago

MislavSag commented 9 months ago

I have tried to use batchtools with mlr3batchmark to benchmark my (task,resampling,learners) grid.

The cluster I am trying to run the script on uses PBS Pro.

I have developed a simple template:

#!/bin/bash

## All #PBS directives must come before the first executable line:
## qsub stops scanning for directives once it reaches a command.
#PBS -N <%= job.hash %>
#PBS -o <%= log.file %>
#PBS -q cpu
#PBS -j oe

## Resources needed: a single chunk specification, built only from the
## resources that were actually passed to submitJobs(). The number of
## chunks comes from resources$select (defaulting to 1), not from ncpus.
<%
  chunks <- if (is.null(resources$select)) 1 else resources$select
  spec <- sprintf("select=%s", chunks)
  if (!is.null(resources$ncpus)) spec <- sprintf("%s:ncpus=%s", spec, resources$ncpus)
  if (!is.null(resources$memory)) spec <- sprintf("%s:mem=%s", spec, resources$memory)
%>
#PBS -l <%= spec %>
<% if (!is.null(resources$walltime)) { %>
#PBS -l walltime=<%= resources$walltime %>
<% } %>

<%= if (array.jobs) sprintf("#PBS -J 1-%i", nrow(jobs)) else "" %>

cd ${PBS_O_WORKDIR}

## Launch R and evaluate the batchtools R job
Rscript -e 'batchtools::doJobCollection("<%= uri %>")'

and here is the R code (only the relevant snippet):

# create registry in a time-stamped directory
print("Create registry")
packages = c(
  "data.table", "gausscov", "paradox", "mlr3", "mlr3pipelines",
  "mlr3tuning", "mlr3misc", "future", "future.apply",
  "mlr3extralearners"
)
# time stamp (YYYYmmddHHMMSS) that becomes part of the registry directory name
time = format(Sys.time(), "%Y%m%d%H%M%S")
reg = makeExperimentRegistry(
  file.dir = sprintf("./experiments-%s", time),
  seed = 1,
  packages = packages
)

# populate registry with problems and algorithms to form the jobs
print("Batchmark")
batchmark(designs_sample, reg = reg)

# create cluster function
# Build batchtools cluster functions that submit and kill jobs through the
# PBS Pro command line tools `qsub` and `qdel`.
#
# Arguments:
#   template          - name or path of the brew template. It is resolved with
#                       findTemplateFile() and read with cfReadBrewTemplate(),
#                       which is given "##" as its comment string.
#   scheduler.latency - forwarded unchanged to makeClusterFunctions().
#   fs.latency        - forwarded unchanged to makeClusterFunctions().
#
# Returns the value of makeClusterFunctions() for a backend named "PBSPro".
# Job listing is disabled: listJobsQueued and listJobsRunning are both NULL.
#
# Stops with an error if the template lookup yields NA.
makeClusterFunctionsPBSPro = function(template = "padobran.tmpl", scheduler.latency = 1, fs.latency = 65) {
  # Keep the value the caller asked for: when the lookup fails the resolved
  # value is NA, and reporting NA in the error message would hide which
  # template could not be found.
  requested = template
  template = findTemplateFile(template)
  if (testScalarNA(template)) {
    stopf("Argument 'template' (=\"%s\") must point to a readable template", requested)
  }
  template = cfReadBrewTemplate(template, "##")

  # Render the template for one job collection and submit it with qsub.
  submitJob = function(reg, jc) {
    assertRegistry(reg, writeable = TRUE)
    assertClass(jc, "JobCollection")

    outfile = cfBrewTemplate(reg, template, jc)
    res = runOSCommand("qsub", shQuote(outfile))

    # Any non-zero exit code from qsub is reported as an unknown submit error
    if (res$exit.code > 0L) {
      return(cfHandleUnknownSubmitError("qsub", res$exit.code, res$output))
    }

    # The trimmed qsub output, joined into one string, serves as batch id
    output = stri_flatten(stri_trim_both(res$output), "\n")

    if (jc$array.jobs) {
      # One batch id and one log file per job: the first "[]" in the qsub
      # output is replaced by "[i]", and "-i" is appended to the log file name.
      logs = sprintf("%s-%i", fs::path_file(jc$log.file), seq_row(jc$jobs))
      makeSubmitJobResult(status = 0L, batch.id = stri_replace_first_fixed(output, "[]", stri_paste("[", seq_row(jc$jobs), "]")), log.file = logs)
    } else {
      makeSubmitJobResult(status = 0L, batch.id = output)
    }
  }

  # Kill a single job, identified by its batch id, with qdel.
  killJob = function(reg, batch.id) {
    assertRegistry(reg, writeable = TRUE)
    assertString(batch.id)
    cfKillJob(reg, "qdel", batch.id)
  }

  # Job listing via qstat is switched off; the previous implementation is kept
  # below for reference.
  # listJobs = function(reg, args) {
  #   assertRegistry(reg, writeable = FALSE)
  #   res = runOSCommand("qstat", args)
  #   if (res$exit.code > 0L)
  #     OSError("Listing of jobs failed", res)
  #   res$output
  # }
  #
  # listJobsQueued = function(reg) {
  #   args = c("-u $USER")
  #   listJobs(reg, args)
  # }
  #
  # listJobsRunning = function(reg) {
  #   args = c("-u $USER")
  #   listJobs(reg, args)
  # }
  listJobsQueued = NULL
  listJobsRunning = NULL

  makeClusterFunctions(name = "PBSPro", submitJob = submitJob, killJob = killJob, listJobsQueued = listJobsQueued,
                       listJobsRunning = listJobsRunning, array.var = "PBS_ARRAY_INDEX", store.job.collection = TRUE,
                       scheduler.latency = scheduler.latency, fs.latency = fs.latency)
}

# create cluster template
print("Cluster template")
cf = makeClusterFunctionsPBSPro("padobran.tmpl")
reg$cluster.functions = cf
saveRegistry(reg = reg)

# define resources
# NOTE(review): the template also reads resources$memory, which is not set here - confirm
print("Set resources")
resources = list(ncpus = 128, select = 2, walltime = 3600 * 24)

# submit job!
# seq_len() is used instead of 1:n so that an empty design yields no ids
# rather than c(1, 0).
# NOTE(review): assumes the job ids are 1..nrow(designs_sample) - verify against the registry
print("Submit job !")
submitJobs(ids = seq_len(nrow(designs_sample)), resources = resources, reg = reg)

After the last command (submitJobs) I got an error: the qsub command doesn't exist (exit code 127).

It seems to me the problem appears because I run the R script inside an apptainer image. Probably, the line

Rscript -e 'batchtools::doJobCollection("<%= uri %>")'

should be apptainer exec image.sif Rscript -e 'batchtools::doJobCollection("<%= uri %>")', but it doesn't work that way either.

What would you recommend?

MislavSag commented 9 months ago

I am closing this issue. This is not a package problem. R is installed inside the apptainer image, and there is a problem with binding the paths that contain the qsub command.

Admins installed R on the cluster, so I will try to call R inside apptainer with qsub...