gboris / blkbox

Data exploration with multiple machine learning algorithms
14 stars 4 forks source link

pot include mic/mine #39

Closed gboris closed 8 years ago

gboris commented 8 years ago

http://www.sciencemag.org/cgi/content/full/334/6062/1518?ijkey=cRCIlh2G7AjiA&keytype=ref&siteid=sci

How to run MINE in Java First, make sure Java is installed. Then download MINE.jar, open a command prompt, and type

java -jar MINE.jar

followed by the necessary command-line parameters.

How to run MINE in R First, make sure both R and Java are installed. Next, download and save both MINE.jar and MINE.r into the same directory and make that directory the working directory in R. The example below then runs MINE on all the variable pairs in the file example.csv.

install.packages("rJava") # 1-time initialization step source("MINE.r") MINE("example.csv","all.pairs")

The library exposes two functions: MINE, and rMINE. The difference between them is that MINE takes as input a path to a csv file to be analyzed, while rMINE takes as input an R matrix containing the data to be analyzed. Both functions take parameters identical to those in the Java version. For additional information, see the comments in MINE.r.

library("rJava") .jinit(classpath="MINE.jar")

###########################################
# MINE - runs MINE on a comma-separated
#  values (CSV) file. All parameters are as
#  in the Java version. The parameters
#  var1.id and var2.id are for use when
#  the analysis style specified requires
#  more information. For instance, since
#  style="master.variable" requires an
#  additional variable id (the id of the
#  master variable, you will need to
#  specify var1.id in this case.
#
#  As in the Java version, if you do not
#  specify a value for style, and you
#  specify a value for var1.id and not
#  var2.id, then style="master.variable"
#  will be assumed. If you do not specify a
#  value for style, and you specify values
#  for both var1.id and var2.id, then
#  style="one.pair" will be assumed.
#
# EXAMPLES:
#  MINE("Spellman.csv","all.pairs",0)
#  MINE("Spellman.csv",0)
#   will both run MINE on "Spellman.csv"
#   and have it analyze each variable only
#   against the 0-th variable.
#
#  MINE("Spellman.csv","one.pair",0,5)
#  MINE("Spellman.csv",0,5)
#   will both run MINE on "Spellman.csv"
#   and have it analyze only the 0-th
#   variable against the 5-th variable.
###########################################
MINE <- function (
        input.filename,
        style=c("master.variable", "all.pairs", "adjacent.pairs", "pairs.between", "one.pair"),
        var1.id=NA,
        var2.id=NA,
        required.common.vals.fraction=0,
        max.num.boxes.exponent=0.6,
        notify.wait=100,
        num.clumps.factor=15,
        debug.level=0,
        gc.wait=Inf,
        job.id
        ) {

    printHeader()

    params <- getParams(input.filename, style, var1.id, var2.id, required.common.vals.fraction, max.num.boxes.exponent, notify.wait, num.clumps.factor, debug.level, gc.wait, job.id)

    # run the analysis
    cat("reading in dataset...\n")
    flush.console()

    dataset <- .jnew("data/Dataset",
                params$inputfile,
                params$analysisParams$mineParams$debug)

    cat("done.\n")
    flush.console()

    doAnalysis(dataset, params)
}

###########################################
# rMINE - runs MINE on an R matrix.
#  all parameters are as in MINE, except
#  that the name of the results file will
#  begin with output.prefix rather than
#  the name of the input file (since there
#  is no input file).
#
#  MINE assumes that each row of the
#  supplied matrix is a variable, and each
#  column is a record.
#
# EXAMPLE:
#  rMINE(matrix(1:10,2),"matrix",0)
#   will run MINE on matrix(1:10,2),
#   assuming that each of the two rows
#   in the matrix is a variable.
###########################################
rMINE <- function (
        data,
        output.prefix,
        style=c("master.variable", "all.pairs", "adjacent.pairs", "pairs.between", "one.pair"),
        var1.id=NA,
        var2.id=NA,
        required.common.vals.fraction=0,
        max.num.boxes.exponent=0.6,
        notify.wait=100,
        num.clumps.factor=15,
        debug.level=0,
        gc.wait=Inf,
        job.id
        ) {
    printHeader()

    if(missing(output.prefix))
        stop("you must specify output.prefix so that I'll know what to name the output file!")

    params <- getParams(output.prefix, style, var1.id, var2.id, required.common.vals.fraction, max.num.boxes.exponent, notify.wait, num.clumps.factor, debug.level, gc.wait, job.id)

    # run the analysis
    cat("reading in dataset...\n")
    flush.console()

    data <- .jarray(data, dispatch=TRUE)

    dataset <- .jnew("data/Dataset",
                data, params$analysisParams$mineParams$debug)
    cat("done.\n")
    flush.console()

    doAnalysis(dataset, params)
}

printHeader <- function () {
    # print header
    cat(J("main/Analyze")$header())
    cat("\n\n")
    flush.console()
}

getParams <- function(
        input.filename,
        style=c("master.variable", "all.pairs", "adjacent.pairs", "pairs.between", "one.pair"),
        var1.id=NA,
        var2.id=NA,
        required.common.vals.fraction=0,
        max.num.boxes.exponent=0.6,
        notify.wait=100,
        num.clumps.factor=15,
        debug.level=0,
        gc.wait=Inf,
        job.id
        ) {
    if (gc.wait==Inf) gc.wait <- J("java.lang.Integer")$MAX_VALUE
    else gc.wait <- as.integer(gc.wait)

    # create parameters object  
    if(missing(job.id)) {
        args <- c(
                input.filename,
                style,
                var1.id,
                var2.id,
                paste("cv=", required.common.vals.fraction, sep = ""),
                paste("exp=", max.num.boxes.exponent, sep = ""),
                paste("notify=", notify.wait, sep = ""),
                paste("c=", num.clumps.factor, sep = ""),
                paste("d=", debug.level, sep = ""),
                paste("gc=", gc.wait, sep = "")
        )
    } else {
        args <- c(
                input.filename,
                style,
                var1.id,
                var2.id,
                paste("cv=", required.common.vals.fraction, sep = ""),
                paste("exp=", max.num.boxes.exponent, sep = ""),
                paste("notify=", notify.wait, sep = ""),
                paste("c=", num.clumps.factor, sep = ""),
                paste("d=", debug.level, sep = ""),
                paste("gc=", gc.wait, sep = ""),
                paste("id=", job.id, sep = "")
        )
    }

    # this removes NA entries from args, so that if var1.id and var2.id aren't defined
    # then we won't pass the NA's on to Java.
    args <- args[!is.na(args)]

    params <- .jnew("main/JobParameters", .jarray(args))
    flush.console()

    #confirm parameters for user
    cat(params$toString())
    cat("\n")
    flush.console()

    params
}

doAnalysis <- function (dataset, params) {
    toAnalyze <- .jnew("analysis/VarPairQueue", dataset)

    params$analysisStyle$addVarPairsTo(toAnalyze, dataset$numVariables())

    a <- .jnew("analysis/Analysis", dataset, toAnalyze)

    cat("Analyzing...\n")
    flush.console()

    while(! a$varPairQueue()$isEmpty()) {
        # print a status update
        statusUpdate <- paste(a$numResults() + 1, " calculating: ", a$varPairQueue()$peek()$var1$name(), " vs ", a$varPairQueue()$peek()$var2$name(), "...\n", sep="")
        cat(statusUpdate)
        flush.console()

        # create a file containing the status update (for use when running on a cluster)
        write(statusUpdate, file=params$statusFileName())

        # analyze some more pairs
        a$analyzePairs(J("analysis.results/BriefResult")$class,
                    params$analysisParams,
                    params$notifyWait)
    }

    cat(paste(a$numResults(), " variable pairs analyzed.\n", "Sorting results in descending order...\n", sep=""))
    flush.console()

    results <- a$getSortedResults()

    cat("done. printing results\n")
    flush.console()

    #print the results
    repeat {
        if(J("main/Analyze")$printResults(results, params)) {
            break
        }
        else {
            n <- readline("writing results to output file failed. Perhaps it is locked in some way. Enter 1 to try again, 0 otherwise: ")
            if(n == 0) break
        }           
    }

    cat("Analysis finished. See file \"")
    cat(params$resultsFileName())
    cat("\" for output\n")
}